aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/ABI/obsolete/sysfs-block-zram119
-rw-r--r--Documentation/ABI/testing/configfs-usb-gadget-midi12
-rw-r--r--Documentation/ABI/testing/sysfs-block-zram25
-rw-r--r--Documentation/ABI/testing/sysfs-class-dual-role-usb71
-rw-r--r--Documentation/ABI/testing/sysfs-kernel-wakeup_reasons16
-rw-r--r--Documentation/android.txt121
-rw-r--r--Documentation/arm/memory.txt2
-rw-r--r--Documentation/block/00-INDEX6
-rw-r--r--Documentation/block/mmc-max-speed.txt38
-rw-r--r--Documentation/blockdev/zram.txt87
-rw-r--r--Documentation/cpu-freq/governors.txt85
-rw-r--r--Documentation/device-mapper/boot.txt42
-rw-r--r--Documentation/device-mapper/verity.txt52
-rw-r--r--Documentation/devicetree/bindings/misc/memory-state-time.txt8
-rw-r--r--Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt360
-rw-r--r--Documentation/filesystems/proc.txt28
-rw-r--r--Documentation/filesystems/squashfs.txt8
-rw-r--r--Documentation/kcov.txt111
-rw-r--r--Documentation/kernel-parameters.txt10
-rw-r--r--Documentation/networking/ip-sysctl.txt51
-rw-r--r--Documentation/scheduler/sched-energy.txt362
-rw-r--r--Documentation/scheduler/sched-tune.txt367
-rw-r--r--Documentation/sync.txt75
-rw-r--r--Documentation/sysctl/kernel.txt43
-rw-r--r--Documentation/sysctl/vm.txt45
-rw-r--r--Documentation/trace/events-power.txt1
-rw-r--r--Documentation/trace/ftrace.txt29
-rw-r--r--Documentation/vm/zsmalloc.txt70
-rw-r--r--MAINTAINERS1
-rw-r--r--Makefile21
-rw-r--r--android/configs/README15
-rw-r--r--android/configs/android-base-arm64.cfg5
-rw-r--r--android/configs/android-base.cfg169
-rw-r--r--android/configs/android-recommended.cfg140
-rw-r--r--arch/Kconfig77
-rw-r--r--arch/alpha/include/asm/thread_info.h5
-rw-r--r--arch/alpha/kernel/signal.c2
-rw-r--r--arch/arc/include/asm/thread_info.h4
-rw-r--r--arch/arc/kernel/signal.c2
-rw-r--r--arch/arm/Kconfig49
-rw-r--r--arch/arm/Kconfig.debug8
-rwxr-xr-x[-rw-r--r--]arch/arm/Makefile13
-rw-r--r--arch/arm/boot/.gitignore1
-rw-r--r--arch/arm/boot/Makefile13
-rw-r--r--arch/arm/boot/compressed/head.S2
-rw-r--r--arch/arm/boot/dts/Makefile9
-rw-r--r--arch/arm/boot/dts/am335x-evmsk.dts11
-rw-r--r--arch/arm/boot/dts/omap3-cm-t3517.dts10
-rw-r--r--arch/arm/boot/dts/omap3-cm-t3730.dts10
-rw-r--r--arch/arm/boot/dts/omap3-evm-common.dtsi10
-rw-r--r--arch/arm/boot/dts/omap3-zoom3.dts10
-rw-r--r--arch/arm/boot/dts/omap4-panda-common.dtsi10
-rw-r--r--arch/arm/boot/dts/omap4-sdp.dts11
-rw-r--r--arch/arm/boot/dts/omap4-var-som-om44-wlan.dtsi10
-rw-r--r--arch/arm/common/Kconfig4
-rw-r--r--arch/arm/common/Makefile1
-rw-r--r--arch/arm/common/fiq_glue.S118
-rw-r--r--arch/arm/common/fiq_glue_setup.c147
-rw-r--r--arch/arm/crypto/.gitignore2
-rw-r--r--arch/arm/crypto/Makefile8
-rw-r--r--arch/arm/crypto/sha256-armv4.pl716
-rw-r--r--arch/arm/crypto/sha256-core.S_shipped2808
-rw-r--r--arch/arm/crypto/sha256_glue.c246
-rw-r--r--arch/arm/crypto/sha256_glue.h23
-rw-r--r--arch/arm/crypto/sha256_neon_glue.c172
-rw-r--r--arch/arm/include/asm/Kbuild1
-rw-r--r--arch/arm/include/asm/assembler.h47
-rw-r--r--arch/arm/include/asm/auxvec.h1
-rw-r--r--arch/arm/include/asm/bug.h1
-rw-r--r--arch/arm/include/asm/cacheflush.h9
-rw-r--r--arch/arm/include/asm/domain.h57
-rw-r--r--arch/arm/include/asm/elf.h9
-rw-r--r--arch/arm/include/asm/fiq_glue.h33
-rw-r--r--arch/arm/include/asm/fixmap.h31
-rw-r--r--arch/arm/include/asm/futex.h32
-rw-r--r--arch/arm/include/asm/hardirq.h2
-rw-r--r--arch/arm/include/asm/hardware/coresight.h50
-rw-r--r--arch/arm/include/asm/irq.h3
-rw-r--r--arch/arm/include/asm/kvm_emulate.h10
-rw-r--r--arch/arm/include/asm/kvm_host.h3
-rw-r--r--arch/arm/include/asm/kvm_mmu.h3
-rw-r--r--arch/arm/include/asm/mach/arch.h2
-rw-r--r--arch/arm/include/asm/mach/mmc.h28
-rw-r--r--arch/arm/include/asm/memory.h22
-rw-r--r--arch/arm/include/asm/mmu.h3
-rw-r--r--arch/arm/include/asm/perf_event.h7
-rw-r--r--arch/arm/include/asm/pgalloc.h10
-rw-r--r--arch/arm/include/asm/pgtable-2level-hwdef.h3
-rw-r--r--arch/arm/include/asm/pgtable-3level-hwdef.h1
-rw-r--r--arch/arm/include/asm/proc-fns.h7
-rw-r--r--arch/arm/include/asm/smp.h4
-rw-r--r--arch/arm/include/asm/thread_info.h13
-rw-r--r--arch/arm/include/asm/topology.h7
-rw-r--r--arch/arm/include/asm/uaccess.h96
-rw-r--r--arch/arm/include/asm/vdso.h32
-rw-r--r--arch/arm/include/asm/vdso_datapage.h60
-rw-r--r--arch/arm/include/uapi/asm/Kbuild1
-rw-r--r--arch/arm/include/uapi/asm/auxvec.h7
-rw-r--r--arch/arm/kernel/Makefile3
-rw-r--r--arch/arm/kernel/armksyms.c6
-rw-r--r--arch/arm/kernel/asm-offsets.c17
-rw-r--r--arch/arm/kernel/entry-armv.S32
-rw-r--r--arch/arm/kernel/entry-common.S2
-rw-r--r--arch/arm/kernel/entry-header.S123
-rw-r--r--arch/arm/kernel/etm.c678
-rw-r--r--arch/arm/kernel/ftrace.c19
-rw-r--r--arch/arm/kernel/head-nommu.S2
-rw-r--r--arch/arm/kernel/head.S64
-rw-r--r--arch/arm/kernel/jump_label.c2
-rw-r--r--arch/arm/kernel/kgdb.c35
-rw-r--r--arch/arm/kernel/machine_kexec.c9
-rw-r--r--arch/arm/kernel/patch.c92
-rw-r--r--arch/arm/kernel/patch.h12
-rw-r--r--arch/arm/kernel/process.c196
-rw-r--r--arch/arm/kernel/setup.c2
-rw-r--r--arch/arm/kernel/signal.c4
-rw-r--r--arch/arm/kernel/smp.c68
-rw-r--r--arch/arm/kernel/swp_emulate.c3
-rw-r--r--arch/arm/kernel/topology.c149
-rw-r--r--arch/arm/kernel/traps.c1
-rw-r--r--arch/arm/kernel/vdso.c337
-rw-r--r--arch/arm/kernel/vmlinux.lds.S25
-rw-r--r--arch/arm/kvm/arm.c10
-rw-r--r--arch/arm/kvm/coproc.c70
-rw-r--r--arch/arm/kvm/coproc.h6
-rw-r--r--arch/arm/kvm/coproc_a15.c2
-rw-r--r--arch/arm/kvm/coproc_a7.c2
-rw-r--r--arch/arm/kvm/mmu.c70
-rw-r--r--arch/arm/kvm/trace.h39
-rw-r--r--arch/arm/lib/Makefile15
-rw-r--r--arch/arm/lib/clear_user.S6
-rw-r--r--arch/arm/lib/copy_from_user.S6
-rw-r--r--arch/arm/lib/copy_to_user.S6
-rw-r--r--arch/arm/lib/csumpartialcopyuser.S14
-rw-r--r--arch/arm/lib/uaccess.S564
-rw-r--r--arch/arm/lib/uaccess_with_memcpy.c33
-rw-r--r--arch/arm/mach-davinci/Kconfig11
-rw-r--r--arch/arm/mach-davinci/board-da850-evm.c112
-rw-r--r--arch/arm/mach-keystone/keystone.c27
-rw-r--r--arch/arm/mach-keystone/platsmp.c13
-rw-r--r--arch/arm/mach-omap2/pdata-quirks.c77
-rw-r--r--arch/arm/mm/Kconfig39
-rw-r--r--arch/arm/mm/Makefile1
-rw-r--r--arch/arm/mm/abort-ev4.S1
-rw-r--r--arch/arm/mm/abort-ev5t.S4
-rw-r--r--arch/arm/mm/abort-ev5tj.S4
-rw-r--r--arch/arm/mm/abort-ev6.S8
-rw-r--r--arch/arm/mm/abort-ev7.S1
-rw-r--r--arch/arm/mm/abort-lv4t.S2
-rw-r--r--arch/arm/mm/abort-macro.S14
-rw-r--r--arch/arm/mm/alignment.c30
-rw-r--r--arch/arm/mm/cache-v6.S17
-rw-r--r--arch/arm/mm/fault.c4
-rw-r--r--arch/arm/mm/highmem.c15
-rw-r--r--arch/arm/mm/init.c149
-rw-r--r--arch/arm/mm/mmap.c3
-rw-r--r--arch/arm/mm/mmu.c213
-rw-r--r--arch/arm/mm/pgd.c10
-rw-r--r--arch/arm/mm/proc-arm1020.S4
-rw-r--r--arch/arm/mm/proc-arm1020e.S4
-rw-r--r--arch/arm/mm/proc-arm1022.S4
-rw-r--r--arch/arm/mm/proc-arm1026.S4
-rw-r--r--arch/arm/mm/proc-arm720.S4
-rw-r--r--arch/arm/mm/proc-arm740.S4
-rw-r--r--arch/arm/mm/proc-arm7tdmi.S4
-rw-r--r--arch/arm/mm/proc-arm920.S4
-rw-r--r--arch/arm/mm/proc-arm922.S4
-rw-r--r--arch/arm/mm/proc-arm925.S4
-rw-r--r--arch/arm/mm/proc-arm926.S4
-rw-r--r--arch/arm/mm/proc-arm940.S4
-rw-r--r--arch/arm/mm/proc-arm946.S4
-rw-r--r--arch/arm/mm/proc-arm9tdmi.S4
-rw-r--r--arch/arm/mm/proc-fa526.S4
-rw-r--r--arch/arm/mm/proc-feroceon.S4
-rw-r--r--arch/arm/mm/proc-macros.S4
-rw-r--r--arch/arm/mm/proc-mohawk.S4
-rw-r--r--arch/arm/mm/proc-sa110.S4
-rw-r--r--arch/arm/mm/proc-sa1100.S4
-rw-r--r--arch/arm/mm/proc-v6.S4
-rw-r--r--arch/arm/mm/proc-v7-2level.S6
-rw-r--r--arch/arm/mm/proc-v7-3level.S14
-rw-r--r--arch/arm/mm/proc-v7.S57
-rw-r--r--arch/arm/mm/proc-v7m.S4
-rw-r--r--arch/arm/mm/proc-xsc3.S4
-rw-r--r--arch/arm/mm/proc-xscale.S4
-rw-r--r--arch/arm/mm/pv-fixup-asm.S88
-rw-r--r--arch/arm/nwfpe/entry.S3
-rw-r--r--arch/arm/vdso/.gitignore3
-rw-r--r--arch/arm/vdso/Makefile78
-rw-r--r--arch/arm/vdso/datapage.S15
-rw-r--r--arch/arm/vdso/vdso.S (renamed from arch/arm64/kernel/cputable.c)30
-rw-r--r--arch/arm/vdso/vdso.lds.S87
-rw-r--r--arch/arm/vdso/vdsomunge.c201
-rw-r--r--arch/arm/vdso/vgettimeofday.c282
-rw-r--r--arch/arm64/Kconfig191
-rw-r--r--arch/arm64/Kconfig.debug35
-rw-r--r--arch/arm64/Makefile19
-rw-r--r--arch/arm64/boot/.gitignore2
-rw-r--r--arch/arm64/boot/Makefile16
-rw-r--r--arch/arm64/configs/defconfig33
-rw-r--r--arch/arm64/include/asm/alternative.h146
-rw-r--r--arch/arm64/include/asm/assembler.h109
-rw-r--r--arch/arm64/include/asm/barrier.h28
-rw-r--r--arch/arm64/include/asm/cacheflush.h13
-rw-r--r--arch/arm64/include/asm/compat.h7
-rw-r--r--arch/arm64/include/asm/cpu.h12
-rw-r--r--arch/arm64/include/asm/cpufeature.h121
-rw-r--r--arch/arm64/include/asm/cputable.h30
-rw-r--r--arch/arm64/include/asm/cputype.h35
-rw-r--r--arch/arm64/include/asm/dmi.h31
-rw-r--r--arch/arm64/include/asm/efi.h31
-rw-r--r--arch/arm64/include/asm/esr.h86
-rw-r--r--arch/arm64/include/asm/fixmap.h9
-rw-r--r--arch/arm64/include/asm/futex.h15
-rw-r--r--arch/arm64/include/asm/hardirq.h4
-rw-r--r--arch/arm64/include/asm/hw_breakpoint.h6
-rw-r--r--arch/arm64/include/asm/hwcap.h8
-rw-r--r--arch/arm64/include/asm/insn.h10
-rw-r--r--arch/arm64/include/asm/irq.h1
-rw-r--r--arch/arm64/include/asm/irq_work.h11
-rw-r--r--arch/arm64/include/asm/kernel-pgtable.h74
-rw-r--r--arch/arm64/include/asm/kvm_emulate.h10
-rw-r--r--arch/arm64/include/asm/kvm_host.h3
-rw-r--r--arch/arm64/include/asm/kvm_mmu.h7
-rw-r--r--arch/arm64/include/asm/memory.h12
-rw-r--r--arch/arm64/include/asm/mmu.h21
-rw-r--r--arch/arm64/include/asm/mmu_context.h149
-rw-r--r--arch/arm64/include/asm/page.h22
-rw-r--r--arch/arm64/include/asm/percpu.h8
-rw-r--r--arch/arm64/include/asm/perf_event.h7
-rw-r--r--arch/arm64/include/asm/pgalloc.h8
-rw-r--r--arch/arm64/include/asm/pgtable-hwdef.h13
-rw-r--r--arch/arm64/include/asm/pgtable-types.h12
-rw-r--r--arch/arm64/include/asm/pgtable.h40
-rw-r--r--arch/arm64/include/asm/proc-fns.h10
-rw-r--r--arch/arm64/include/asm/processor.h3
-rw-r--r--arch/arm64/include/asm/ptrace.h6
-rw-r--r--arch/arm64/include/asm/seccomp.h25
-rw-r--r--arch/arm64/include/asm/smp.h4
-rw-r--r--arch/arm64/include/asm/sysreg.h163
-rw-r--r--arch/arm64/include/asm/system_misc.h1
-rw-r--r--arch/arm64/include/asm/thread_info.h18
-rw-r--r--arch/arm64/include/asm/tlb.h25
-rw-r--r--arch/arm64/include/asm/tlbflush.h101
-rw-r--r--arch/arm64/include/asm/topology.h14
-rw-r--r--arch/arm64/include/asm/uaccess.h248
-rw-r--r--arch/arm64/include/asm/unistd.h3
-rw-r--r--arch/arm64/include/asm/unistd32.h3
-rw-r--r--arch/arm64/include/asm/vdso_datapage.h8
-rw-r--r--arch/arm64/include/uapi/asm/ptrace.h1
-rw-r--r--arch/arm64/kernel/Makefile13
-rw-r--r--arch/arm64/kernel/alternative.c119
-rw-r--r--arch/arm64/kernel/arm64ksyms.c4
-rw-r--r--arch/arm64/kernel/armv8_deprecated.c17
-rw-r--r--arch/arm64/kernel/asm-offsets.c16
-rw-r--r--arch/arm64/kernel/cpu_errata.c2
-rw-r--r--arch/arm64/kernel/cpu_ops.c2
-rw-r--r--arch/arm64/kernel/cpufeature.c892
-rw-r--r--arch/arm64/kernel/cpuinfo.c291
-rw-r--r--arch/arm64/kernel/debug-monitors.c8
-rw-r--r--arch/arm64/kernel/efi-entry.S3
-rw-r--r--arch/arm64/kernel/efi.c409
-rw-r--r--arch/arm64/kernel/entry.S341
-rw-r--r--arch/arm64/kernel/fpsimd.c48
-rw-r--r--arch/arm64/kernel/head.S357
-rw-r--r--arch/arm64/kernel/hw_breakpoint.c165
-rw-r--r--arch/arm64/kernel/insn.c190
-rw-r--r--arch/arm64/kernel/irq.c4
-rw-r--r--arch/arm64/kernel/jump_label.c23
-rw-r--r--arch/arm64/kernel/module.c18
-rw-r--r--arch/arm64/kernel/perf_event.c4
-rw-r--r--arch/arm64/kernel/process.c97
-rw-r--r--arch/arm64/kernel/psci.c5
-rw-r--r--arch/arm64/kernel/ptrace.c47
-rw-r--r--arch/arm64/kernel/setup.c282
-rw-r--r--arch/arm64/kernel/signal.c2
-rw-r--r--arch/arm64/kernel/signal32.c12
-rw-r--r--arch/arm64/kernel/sleep.S17
-rw-r--r--arch/arm64/kernel/smp.c24
-rw-r--r--arch/arm64/kernel/suspend.c2
-rw-r--r--arch/arm64/kernel/time.c2
-rw-r--r--arch/arm64/kernel/topology.c81
-rw-r--r--arch/arm64/kernel/traps.c80
-rw-r--r--arch/arm64/kernel/vdso.c20
-rw-r--r--arch/arm64/kernel/vdso/Makefile7
-rw-r--r--arch/arm64/kernel/vdso/gettimeofday.S331
-rw-r--r--arch/arm64/kernel/vdso/vdso.S3
-rw-r--r--arch/arm64/kernel/vmlinux.lds.S43
-rw-r--r--arch/arm64/kvm/reset.c2
-rw-r--r--arch/arm64/kvm/sys_regs.c87
-rw-r--r--arch/arm64/lib/clear_user.S19
-rw-r--r--arch/arm64/lib/copy_from_user.S93
-rw-r--r--arch/arm64/lib/copy_in_user.S78
-rw-r--r--arch/arm64/lib/copy_to_user.S82
-rw-r--r--arch/arm64/mm/Makefile1
-rw-r--r--arch/arm64/mm/cache.S85
-rw-r--r--arch/arm64/mm/context.c251
-rw-r--r--arch/arm64/mm/dma-mapping.c4
-rw-r--r--arch/arm64/mm/dump.c332
-rw-r--r--arch/arm64/mm/fault.c68
-rw-r--r--arch/arm64/mm/flush.c5
-rw-r--r--arch/arm64/mm/init.c11
-rw-r--r--arch/arm64/mm/ioremap.c93
-rw-r--r--arch/arm64/mm/mm.h3
-rw-r--r--arch/arm64/mm/mmap.c11
-rw-r--r--arch/arm64/mm/mmu.c455
-rw-r--r--arch/arm64/mm/proc-macros.S54
-rw-r--r--arch/arm64/mm/proc.S66
-rw-r--r--arch/arm64/xen/hypercall.S15
-rw-r--r--arch/avr32/include/asm/thread_info.h4
-rw-r--r--arch/avr32/kernel/asm-offsets.c1
-rw-r--r--arch/avr32/kernel/signal.c2
-rw-r--r--arch/blackfin/include/asm/thread_info.h4
-rw-r--r--arch/blackfin/kernel/signal.c2
-rw-r--r--arch/c6x/include/asm/thread_info.h4
-rw-r--r--arch/c6x/kernel/signal.c2
-rw-r--r--arch/cris/arch-v10/kernel/signal.c2
-rw-r--r--arch/cris/arch-v32/kernel/signal.c2
-rw-r--r--arch/cris/include/asm/thread_info.h4
-rw-r--r--arch/frv/include/asm/thread_info.h4
-rw-r--r--arch/frv/kernel/asm-offsets.c1
-rw-r--r--arch/frv/kernel/signal.c2
-rw-r--r--arch/hexagon/include/asm/thread_info.h4
-rw-r--r--arch/hexagon/kernel/signal.c2
-rw-r--r--arch/ia64/Makefile4
-rw-r--r--arch/ia64/include/asm/thread_info.h4
-rw-r--r--arch/ia64/kernel/signal.c2
-rw-r--r--arch/m32r/include/asm/thread_info.h5
-rw-r--r--arch/m32r/kernel/signal.c2
-rw-r--r--arch/m68k/include/asm/thread_info.h4
-rw-r--r--arch/m68k/kernel/signal.c4
-rw-r--r--arch/metag/include/asm/thread_info.h6
-rw-r--r--arch/metag/kernel/signal.c2
-rw-r--r--arch/microblaze/include/asm/thread_info.h4
-rw-r--r--arch/microblaze/kernel/signal.c2
-rw-r--r--arch/mips/include/asm/thread_info.h4
-rw-r--r--arch/mips/kernel/asm-offsets.c1
-rw-r--r--arch/mips/kernel/signal.c2
-rw-r--r--arch/mips/kernel/signal32.c2
-rw-r--r--arch/mips/mm/mmap.c4
-rw-r--r--arch/mn10300/include/asm/thread_info.h4
-rw-r--r--arch/mn10300/kernel/asm-offsets.c1
-rw-r--r--arch/mn10300/kernel/signal.c2
-rw-r--r--arch/openrisc/include/asm/thread_info.h4
-rw-r--r--arch/openrisc/kernel/signal.c2
-rw-r--r--arch/parisc/include/asm/cache.h3
-rw-r--r--arch/parisc/include/asm/cacheflush.h4
-rw-r--r--arch/parisc/include/asm/thread_info.h4
-rw-r--r--arch/parisc/kernel/signal.c2
-rw-r--r--arch/powerpc/include/asm/thread_info.h4
-rw-r--r--arch/powerpc/kernel/process.c4
-rw-r--r--arch/powerpc/kernel/signal_32.c4
-rw-r--r--arch/powerpc/kernel/signal_64.c2
-rw-r--r--arch/powerpc/mm/mmap.c4
-rw-r--r--arch/s390/include/asm/thread_info.h4
-rw-r--r--arch/s390/kernel/compat_signal.c2
-rw-r--r--arch/s390/kernel/signal.c2
-rw-r--r--arch/s390/kernel/time.c18
-rw-r--r--arch/score/include/asm/thread_info.h4
-rw-r--r--arch/score/kernel/asm-offsets.c1
-rw-r--r--arch/score/kernel/signal.c2
-rw-r--r--arch/sh/include/asm/thread_info.h4
-rw-r--r--arch/sh/kernel/asm-offsets.c1
-rw-r--r--arch/sh/kernel/signal_32.c4
-rw-r--r--arch/sh/kernel/signal_64.c4
-rw-r--r--arch/sparc/include/asm/thread_info_32.h6
-rw-r--r--arch/sparc/include/asm/thread_info_64.h12
-rw-r--r--arch/sparc/kernel/signal32.c4
-rw-r--r--arch/sparc/kernel/signal_32.c2
-rw-r--r--arch/sparc/kernel/signal_64.c2
-rw-r--r--arch/sparc/kernel/sys_sparc_64.c2
-rw-r--r--arch/sparc/kernel/traps_64.c2
-rw-r--r--arch/tile/include/asm/thread_info.h4
-rw-r--r--arch/tile/kernel/signal.c2
-rw-r--r--arch/tile/kernel/time.c24
-rw-r--r--arch/um/include/asm/thread_info.h4
-rw-r--r--arch/unicore32/include/asm/memory.h6
-rw-r--r--arch/unicore32/include/asm/thread_info.h4
-rw-r--r--arch/unicore32/kernel/signal.c2
-rw-r--r--arch/x86/Kconfig27
-rw-r--r--arch/x86/Kconfig.debug17
-rw-r--r--arch/x86/boot/Makefile10
-rw-r--r--arch/x86/boot/compressed/Makefile3
-rw-r--r--arch/x86/ia32/ia32_signal.c2
-rw-r--r--arch/x86/include/asm/cacheflush.h6
-rw-r--r--arch/x86/include/asm/idle.h7
-rw-r--r--arch/x86/include/asm/kasan.h8
-rw-r--r--arch/x86/include/asm/kvm_para.h7
-rw-r--r--arch/x86/include/asm/sections.h2
-rw-r--r--arch/x86/include/asm/thread_info.h48
-rw-r--r--arch/x86/include/asm/uaccess.h102
-rw-r--r--arch/x86/include/asm/uaccess_32.h26
-rw-r--r--arch/x86/include/asm/uaccess_64.h96
-rw-r--r--arch/x86/kernel/Makefile6
-rw-r--r--arch/x86/kernel/apic/Makefile4
-rw-r--r--arch/x86/kernel/cpu/Makefile4
-rw-r--r--arch/x86/kernel/ftrace.c6
-rw-r--r--arch/x86/kernel/head64.c7
-rw-r--r--arch/x86/kernel/head_64.S29
-rw-r--r--arch/x86/kernel/kgdb.c8
-rw-r--r--arch/x86/kernel/process.c17
-rw-r--r--arch/x86/kernel/signal.c2
-rw-r--r--arch/x86/kernel/test_nx.c2
-rw-r--r--arch/x86/kernel/test_rodata.c2
-rw-r--r--arch/x86/kernel/vmlinux.lds.S25
-rw-r--r--arch/x86/kernel/vsyscall_gtod.c24
-rw-r--r--arch/x86/kvm/x86.c14
-rw-r--r--arch/x86/lib/Makefile3
-rw-r--r--arch/x86/mm/Makefile3
-rw-r--r--arch/x86/mm/init_32.c3
-rw-r--r--arch/x86/mm/init_64.c3
-rw-r--r--arch/x86/mm/kasan_init_64.c124
-rw-r--r--arch/x86/mm/mmap.c12
-rw-r--r--arch/x86/mm/pageattr.c2
-rw-r--r--arch/x86/realmode/rm/Makefile3
-rw-r--r--arch/x86/um/signal.c2
-rw-r--r--arch/x86/vdso/Makefile4
-rw-r--r--arch/x86/vdso/vdso2c.h2
-rw-r--r--arch/xtensa/include/asm/thread_info.h5
-rw-r--r--arch/xtensa/kernel/signal.c2
-rw-r--r--block/blk-core.c84
-rw-r--r--block/genhd.c17
-rw-r--r--block/partition-generic.c11
-rw-r--r--crypto/Kconfig8
-rw-r--r--drivers/Kconfig2
-rw-r--r--drivers/Makefile1
-rw-r--r--drivers/base/power/main.c5
-rw-r--r--drivers/base/power/wakeup.c36
-rw-r--r--drivers/base/syscore.c3
-rw-r--r--drivers/block/zram/zram_drv.c405
-rw-r--r--drivers/block/zram/zram_drv.h26
-rw-r--r--drivers/char/Kconfig13
-rw-r--r--drivers/char/mem.c17
-rw-r--r--drivers/clk/Kconfig18
-rw-r--r--drivers/clk/clk.c273
-rw-r--r--drivers/cpufreq/Kconfig46
-rw-r--r--drivers/cpufreq/Makefile1
-rw-r--r--drivers/cpufreq/cpufreq.c65
-rw-r--r--drivers/cpufreq/cpufreq_interactive.c1338
-rw-r--r--drivers/cpufreq/cpufreq_stats.c405
-rw-r--r--drivers/cpuidle/cpuidle.c55
-rw-r--r--drivers/cpuidle/governors/menu.c7
-rw-r--r--drivers/dma-buf/fence.c6
-rw-r--r--drivers/firmware/dmi_scan.c79
-rw-r--r--drivers/firmware/efi/efi.c60
-rw-r--r--drivers/firmware/efi/libstub/Makefile3
-rw-r--r--drivers/firmware/efi/libstub/arm-stub.c70
-rw-r--r--drivers/firmware/efi/libstub/efi-stub-helper.c25
-rw-r--r--drivers/firmware/efi/libstub/efistub.h4
-rw-r--r--drivers/firmware/efi/libstub/fdt.c62
-rw-r--r--drivers/hid/hid-appleir.c4
-rw-r--r--drivers/hid/hid-elo.c4
-rw-r--r--drivers/hid/hid-input.c10
-rw-r--r--drivers/hid/hid-logitech-hidpp.c4
-rw-r--r--drivers/hid/hid-magicmouse.c8
-rw-r--r--drivers/hid/hid-multitouch.c30
-rw-r--r--drivers/hid/hid-ntrig.c6
-rw-r--r--drivers/hid/hid-rmi.c9
-rw-r--r--drivers/hid/hid-sony.c13
-rw-r--r--drivers/hid/hid-steelseries.c5
-rw-r--r--drivers/hid/uhid.c17
-rw-r--r--drivers/iio/industrialio-event.c2
-rw-r--r--drivers/input/Kconfig13
-rw-r--r--drivers/input/Makefile3
-rw-r--r--drivers/input/evdev.c67
-rw-r--r--drivers/input/gameport/gameport.c4
-rw-r--r--drivers/input/joystick/xpad.c600
-rw-r--r--drivers/input/keycombo.c261
-rw-r--r--drivers/input/keyreset.c145
-rw-r--r--drivers/input/misc/Kconfig16
-rw-r--r--drivers/input/misc/Makefile2
-rw-r--r--drivers/input/misc/gpio_axis.c192
-rw-r--r--drivers/input/misc/gpio_event.c228
-rw-r--r--drivers/input/misc/gpio_input.c390
-rw-r--r--drivers/input/misc/gpio_matrix.c441
-rw-r--r--drivers/input/misc/gpio_output.c97
-rw-r--r--drivers/input/misc/ims-pcu.c4
-rw-r--r--drivers/input/misc/keychord.c391
-rw-r--r--drivers/input/serio/serio.c4
-rw-r--r--drivers/input/serio/serio_raw.c4
-rw-r--r--drivers/md/Kconfig61
-rw-r--r--drivers/md/Makefile6
-rw-r--r--drivers/md/dm-android-verity.c946
-rw-r--r--drivers/md/dm-android-verity.h123
-rw-r--r--drivers/md/dm-crypt.c9
-rw-r--r--drivers/md/dm-ioctl.c39
-rw-r--r--drivers/md/dm-linear.c35
-rw-r--r--drivers/md/dm-table.c1
-rw-r--r--drivers/md/dm-verity-fec.c870
-rw-r--r--drivers/md/dm-verity-fec.h159
-rw-r--r--drivers/md/dm-verity-target.c (renamed from drivers/md/dm-verity.c)699
-rw-r--r--drivers/md/dm-verity.h142
-rw-r--r--drivers/md/dm.c1
-rw-r--r--drivers/misc/Kconfig17
-rw-r--r--drivers/misc/Makefile3
-rw-r--r--drivers/misc/lkdtm.c29
-rw-r--r--drivers/misc/memory_state_time.c462
-rw-r--r--drivers/misc/uid_stat.c152
-rw-r--r--drivers/misc/uid_sys_stats.c496
-rw-r--r--drivers/mmc/card/Kconfig12
-rw-r--r--drivers/mmc/card/block.c321
-rw-r--r--drivers/mmc/card/queue.c6
-rw-r--r--drivers/mmc/card/queue.h8
-rw-r--r--drivers/mmc/core/Kconfig15
-rw-r--r--drivers/mmc/core/bus.c4
-rw-r--r--drivers/mmc/core/core.c138
-rw-r--r--drivers/mmc/core/core.h3
-rw-r--r--drivers/mmc/core/host.c16
-rw-r--r--drivers/mmc/core/host.h7
-rw-r--r--drivers/mmc/core/mmc.c16
-rw-r--r--drivers/mmc/core/sd.c83
-rw-r--r--drivers/mmc/core/sdio.c112
-rw-r--r--drivers/mmc/core/sdio_bus.c24
-rwxr-xr-x[-rw-r--r--]drivers/mmc/core/sdio_io.c33
-rw-r--r--drivers/mtd/nand/Kconfig10
-rw-r--r--drivers/net/ppp/Kconfig17
-rw-r--r--drivers/net/ppp/Makefile2
-rw-r--r--drivers/net/ppp/pppolac.c449
-rw-r--r--drivers/net/ppp/pppopns.c428
-rw-r--r--drivers/net/tun.c6
-rw-r--r--drivers/net/wireless/ti/wilink_platform_data.c25
-rw-r--r--drivers/net/wireless/ti/wl12xx/main.c63
-rw-r--r--drivers/net/wireless/ti/wl12xx/scan.c6
-rw-r--r--drivers/net/wireless/ti/wl12xx/wl12xx.h28
-rw-r--r--drivers/net/wireless/ti/wl18xx/cmd.c31
-rw-r--r--drivers/net/wireless/ti/wl18xx/cmd.h11
-rw-r--r--drivers/net/wireless/ti/wl18xx/event.c21
-rw-r--r--drivers/net/wireless/ti/wl18xx/event.h14
-rw-r--r--drivers/net/wireless/ti/wl18xx/main.c97
-rw-r--r--drivers/net/wireless/ti/wl18xx/scan.c6
-rw-r--r--drivers/net/wireless/ti/wl18xx/wl18xx.h4
-rw-r--r--drivers/net/wireless/ti/wlcore/boot.c1
-rw-r--r--drivers/net/wireless/ti/wlcore/cmd.c29
-rw-r--r--drivers/net/wireless/ti/wlcore/cmd.h6
-rw-r--r--drivers/net/wireless/ti/wlcore/debugfs.c2
-rw-r--r--drivers/net/wireless/ti/wlcore/hw_ops.h9
-rw-r--r--drivers/net/wireless/ti/wlcore/init.c2
-rw-r--r--drivers/net/wireless/ti/wlcore/init.h1
-rw-r--r--drivers/net/wireless/ti/wlcore/main.c259
-rw-r--r--drivers/net/wireless/ti/wlcore/sdio.c64
-rw-r--r--drivers/net/wireless/ti/wlcore/spi.c6
-rw-r--r--drivers/net/wireless/ti/wlcore/wlcore.h7
-rw-r--r--drivers/net/wireless/ti/wlcore/wlcore_i.h13
-rw-r--r--drivers/of/fdt.c68
-rw-r--r--drivers/pci/host/pcie-xilinx.c2
-rw-r--r--drivers/platform/Kconfig2
-rw-r--r--drivers/platform/goldfish/Kconfig18
-rw-r--r--drivers/platform/goldfish/Makefile2
-rw-r--r--drivers/platform/goldfish/goldfish_pipe.c178
-rw-r--r--drivers/power/power_supply_sysfs.c11
-rw-r--r--drivers/rtc/rtc-palmas.c44
-rw-r--r--drivers/scsi/ufs/ufshcd.c80
-rw-r--r--drivers/scsi/ufs/ufshcd.h3
-rw-r--r--drivers/staging/android/Kconfig45
-rw-r--r--drivers/staging/android/Makefile3
-rw-r--r--drivers/staging/android/TODO10
-rw-r--r--drivers/staging/android/alarm-dev.c446
-rw-r--r--drivers/staging/android/android_alarm.h41
-rw-r--r--drivers/staging/android/ashmem.c26
-rw-r--r--drivers/staging/android/binder.c1355
-rw-r--r--drivers/staging/android/fiq_debugger/Kconfig49
-rw-r--r--drivers/staging/android/fiq_debugger/Makefile4
-rw-r--r--drivers/staging/android/fiq_debugger/fiq_debugger.c1212
-rw-r--r--drivers/staging/android/fiq_debugger/fiq_debugger.h64
-rw-r--r--drivers/staging/android/fiq_debugger/fiq_debugger_arm.c240
-rw-r--r--drivers/staging/android/fiq_debugger/fiq_debugger_arm64.c202
-rw-r--r--drivers/staging/android/fiq_debugger/fiq_debugger_priv.h37
-rw-r--r--drivers/staging/android/fiq_debugger/fiq_debugger_ringbuf.h94
-rw-r--r--drivers/staging/android/fiq_debugger/fiq_watchdog.c56
-rw-r--r--drivers/staging/android/fiq_debugger/fiq_watchdog.h20
-rw-r--r--drivers/staging/android/ion/Kconfig7
-rw-r--r--drivers/staging/android/ion/ion.c41
-rw-r--r--drivers/staging/android/ion/ion.h2
-rwxr-xr-x[-rw-r--r--]drivers/staging/android/ion/ion_carveout_heap.c2
-rw-r--r--drivers/staging/android/ion/ion_page_pool.c10
-rw-r--r--drivers/staging/android/ion/ion_priv.h38
-rw-r--r--drivers/staging/android/ion/ion_system_heap.c8
-rw-r--r--drivers/staging/android/logger.c808
-rw-r--r--drivers/staging/android/logger.h89
-rw-r--r--drivers/staging/android/lowmemorykiller.c115
-rw-r--r--drivers/staging/android/sync.c14
-rw-r--r--drivers/staging/android/trace/lowmemorykiller.h41
-rw-r--r--drivers/staging/android/uapi/android_alarm.h62
-rw-r--r--drivers/staging/android/uapi/ashmem.h1
-rw-r--r--drivers/staging/android/uapi/binder.h106
-rw-r--r--drivers/switch/Kconfig15
-rw-r--r--drivers/switch/Makefile4
-rw-r--r--drivers/switch/switch_class.c174
-rw-r--r--drivers/switch/switch_gpio.c172
-rw-r--r--drivers/tty/n_tty.c58
-rw-r--r--drivers/tty/serial/serial_core.c3
-rw-r--r--drivers/usb/core/devio.c9
-rw-r--r--drivers/usb/gadget/Kconfig68
-rw-r--r--drivers/usb/gadget/composite.c6
-rw-r--r--drivers/usb/gadget/configfs.c259
-rw-r--r--drivers/usb/gadget/function/Makefile12
-rw-r--r--drivers/usb/gadget/function/f_accessory.c1330
-rw-r--r--drivers/usb/gadget/function/f_audio_source.c1060
-rw-r--r--drivers/usb/gadget/function/f_fs.c87
-rw-r--r--drivers/usb/gadget/function/f_hid.c5
-rw-r--r--drivers/usb/gadget/function/f_midi.c436
-rw-r--r--drivers/usb/gadget/function/f_mtp.c1583
-rw-r--r--drivers/usb/gadget/function/f_mtp.h18
-rw-r--r--drivers/usb/gadget/function/f_ptp.c38
-rw-r--r--drivers/usb/gadget/function/f_rndis.c85
-rw-r--r--drivers/usb/gadget/function/rndis.c111
-rw-r--r--drivers/usb/gadget/function/rndis.h2
-rw-r--r--drivers/usb/gadget/function/u_ether.c312
-rw-r--r--drivers/usb/gadget/function/u_ether.h3
-rw-r--r--drivers/usb/gadget/function/u_midi.h40
-rw-r--r--drivers/usb/gadget/function/u_serial.c1
-rw-r--r--drivers/usb/gadget/legacy/Kconfig21
-rw-r--r--drivers/usb/gadget/legacy/Makefile2
-rw-r--r--drivers/usb/gadget/legacy/android.c1568
-rw-r--r--drivers/usb/gadget/legacy/gmidi.c43
-rw-r--r--drivers/usb/phy/Kconfig17
-rw-r--r--drivers/usb/phy/Makefile2
-rw-r--r--drivers/usb/phy/class-dual-role.c529
-rw-r--r--drivers/usb/phy/otg-wakelock.c173
-rw-r--r--drivers/video/Kconfig1
-rw-r--r--drivers/video/Makefile1
-rw-r--r--drivers/video/adf/Kconfig14
-rw-r--r--drivers/video/adf/Makefile17
-rw-r--r--drivers/video/adf/adf.c1188
-rw-r--r--drivers/video/adf/adf.h71
-rw-r--r--drivers/video/adf/adf_client.c811
-rw-r--r--drivers/video/adf/adf_fbdev.c665
-rw-r--r--drivers/video/adf/adf_fops.c946
-rw-r--r--drivers/video/adf/adf_fops.h37
-rw-r--r--drivers/video/adf/adf_fops32.c217
-rw-r--r--drivers/video/adf/adf_fops32.h78
-rw-r--r--drivers/video/adf/adf_format.c280
-rw-r--r--drivers/video/adf/adf_memblock.c160
-rw-r--r--drivers/video/adf/adf_sysfs.c296
-rw-r--r--drivers/video/adf/adf_sysfs.h33
-rw-r--r--drivers/video/adf/adf_trace.h93
-rw-r--r--drivers/w1/masters/ds2482.c47
-rw-r--r--drivers/xen/efi.c1
-rw-r--r--fs/Kconfig3
-rw-r--r--fs/Makefile7
-rw-r--r--fs/attr.c12
-rw-r--r--fs/coredump.c2
-rw-r--r--fs/crypto/Kconfig18
-rw-r--r--fs/crypto/Makefile4
-rw-r--r--fs/crypto/bio.c143
-rw-r--r--fs/crypto/crypto.c483
-rw-r--r--fs/crypto/fname.c464
-rw-r--r--fs/crypto/fscrypt_private.h115
-rw-r--r--fs/crypto/keyinfo.c297
-rw-r--r--fs/crypto/policy.c269
-rw-r--r--fs/dcache.c144
-rw-r--r--fs/debugfs/inode.c251
-rw-r--r--fs/ecryptfs/file.c71
-rw-r--r--fs/eventpoll.c4
-rw-r--r--fs/exec.c2
-rw-r--r--fs/ext3/acl.c12
-rw-r--r--fs/ext4/Kconfig23
-rw-r--r--fs/ext4/Makefile4
-rw-r--r--fs/ext4/crypto.c536
-rw-r--r--fs/ext4/crypto_fname.c470
-rw-r--r--fs/ext4/crypto_key.c262
-rw-r--r--fs/ext4/crypto_policy.c268
-rw-r--r--fs/ext4/dir.c88
-rw-r--r--fs/ext4/ext4.h234
-rw-r--r--fs/ext4/ext4_crypto.h158
-rw-r--r--fs/ext4/extents.c23
-rw-r--r--fs/ext4/file.c19
-rw-r--r--fs/ext4/ialloc.c22
-rw-r--r--fs/ext4/inline.c55
-rw-r--r--fs/ext4/inode.c220
-rw-r--r--fs/ext4/ioctl.c101
-rw-r--r--fs/ext4/mballoc.c28
-rw-r--r--fs/ext4/move_extent.c8
-rw-r--r--fs/ext4/namei.c585
-rw-r--r--fs/ext4/page-io.c60
-rw-r--r--fs/ext4/readpage.c367
-rw-r--r--fs/ext4/super.c57
-rw-r--r--fs/ext4/symlink.c92
-rw-r--r--fs/ext4/xattr.h3
-rw-r--r--fs/f2fs/Kconfig35
-rw-r--r--fs/f2fs/Makefile2
-rw-r--r--fs/f2fs/acl.c170
-rw-r--r--fs/f2fs/acl.h8
-rw-r--r--fs/f2fs/checkpoint.c981
-rw-r--r--fs/f2fs/data.c2187
-rw-r--r--fs/f2fs/debug.c239
-rw-r--r--fs/f2fs/dir.c726
-rw-r--r--fs/f2fs/extent_cache.c809
-rw-r--r--fs/f2fs/f2fs.h2104
-rw-r--r--fs/f2fs/file.c1937
-rw-r--r--fs/f2fs/gc.c664
-rw-r--r--fs/f2fs/gc.h42
-rw-r--r--fs/f2fs/hash.c10
-rw-r--r--fs/f2fs/inline.c684
-rw-r--r--fs/f2fs/inode.c246
-rw-r--r--fs/f2fs/namei.c753
-rw-r--r--fs/f2fs/node.c1809
-rw-r--r--fs/f2fs/node.h246
-rw-r--r--fs/f2fs/recovery.c396
-rw-r--r--fs/f2fs/segment.c2089
-rw-r--r--fs/f2fs/segment.h322
-rw-r--r--fs/f2fs/shrinker.c143
-rw-r--r--fs/f2fs/super.c1315
-rw-r--r--fs/f2fs/trace.c162
-rw-r--r--fs/f2fs/trace.h46
-rw-r--r--fs/f2fs/xattr.c261
-rw-r--r--fs/f2fs/xattr.h24
-rw-r--r--fs/fs-writeback.c2
-rw-r--r--fs/fs_struct.c3
-rw-r--r--fs/fuse/dev.c11
-rw-r--r--fs/fuse/dir.c45
-rw-r--r--fs/fuse/fuse_i.h3
-rw-r--r--fs/inode.c6
-rw-r--r--fs/internal.h4
-rw-r--r--fs/mpage.c36
-rw-r--r--fs/namei.c178
-rw-r--r--fs/namespace.c32
-rw-r--r--fs/notify/fanotify/fanotify_user.c2
-rw-r--r--fs/notify/inotify/inotify_user.c17
-rw-r--r--fs/open.c37
-rw-r--r--fs/pnode.c34
-rw-r--r--fs/pnode.h1
-rw-r--r--fs/proc/array.c5
-rw-r--r--fs/proc/base.c50
-rw-r--r--fs/proc/fd.c14
-rw-r--r--fs/proc/kcore.c31
-rw-r--r--fs/proc/proc_sysctl.c3
-rw-r--r--fs/proc/task_mmu.c94
-rw-r--r--fs/proc_namespace.c8
-rw-r--r--fs/pstore/Kconfig10
-rw-r--r--fs/pstore/Makefile2
-rw-r--r--fs/pstore/inode.c26
-rw-r--r--fs/pstore/internal.h6
-rw-r--r--fs/pstore/platform.c37
-rw-r--r--fs/pstore/pmsg.c93
-rw-r--r--fs/pstore/ram.c79
-rw-r--r--fs/pstore/ram_core.c47
-rw-r--r--fs/sdcardfs/Kconfig13
-rw-r--r--fs/sdcardfs/Makefile7
-rw-r--r--fs/sdcardfs/dentry.c194
-rw-r--r--fs/sdcardfs/derived_perm.c471
-rw-r--r--fs/sdcardfs/file.c433
-rw-r--r--fs/sdcardfs/inode.c914
-rw-r--r--fs/sdcardfs/lookup.c466
-rw-r--r--fs/sdcardfs/main.c486
-rw-r--r--fs/sdcardfs/mmap.c89
-rw-r--r--fs/sdcardfs/multiuser.h53
-rw-r--r--fs/sdcardfs/packagelist.c885
-rw-r--r--fs/sdcardfs/sdcardfs.h664
-rw-r--r--fs/sdcardfs/super.c324
-rw-r--r--fs/select.c2
-rw-r--r--fs/squashfs/Kconfig43
-rw-r--r--fs/squashfs/Makefile4
-rw-r--r--fs/squashfs/block.c546
-rw-r--r--fs/squashfs/cache.c73
-rw-r--r--fs/squashfs/decompressor.c62
-rw-r--r--fs/squashfs/decompressor.h4
-rw-r--r--fs/squashfs/file.c140
-rw-r--r--fs/squashfs/file_cache.c38
-rw-r--r--fs/squashfs/file_direct.c243
-rw-r--r--fs/squashfs/lz4_wrapper.c120
-rw-r--r--fs/squashfs/lzo_wrapper.c40
-rw-r--r--fs/squashfs/page_actor.c175
-rw-r--r--fs/squashfs/page_actor.h82
-rw-r--r--fs/squashfs/squashfs.h11
-rw-r--r--fs/squashfs/squashfs_fs.h1
-rw-r--r--fs/squashfs/squashfs_fs_sb.h2
-rw-r--r--fs/squashfs/super.c7
-rw-r--r--fs/squashfs/xz_wrapper.c19
-rw-r--r--fs/squashfs/zlib_wrapper.c18
-rw-r--r--fs/super.c30
-rw-r--r--fs/sync.c1
-rw-r--r--fs/tracefs/Makefile4
-rw-r--r--fs/tracefs/inode.c654
-rw-r--r--fs/utimes.c4
-rw-r--r--include/asm-generic/memory_model.h8
-rw-r--r--include/asm-generic/seccomp.h30
-rw-r--r--include/asm-generic/vmlinux.lds.h9
-rw-r--r--include/linux/Kbuild2
-rw-r--r--include/linux/amba/mmci.h13
-rw-r--r--include/linux/android_aid.h28
-rw-r--r--include/linux/atomic.h323
-rw-r--r--include/linux/blkdev.h76
-rw-r--r--include/linux/cache.h14
-rw-r--r--include/linux/cgroup_subsys.h12
-rw-r--r--include/linux/clk-private.h17
-rw-r--r--include/linux/cpu.h7
-rw-r--r--include/linux/cpufreq.h27
-rw-r--r--include/linux/cpuidle.h7
-rw-r--r--include/linux/cpumask.h8
-rw-r--r--include/linux/cred.h8
-rw-r--r--include/linux/dcache.h5
-rw-r--r--include/linux/debugfs.h5
-rw-r--r--include/linux/device-mapper.h6
-rw-r--r--include/linux/efi.h8
-rw-r--r--include/linux/f2fs_fs.h120
-rw-r--r--include/linux/fence.h2
-rw-r--r--include/linux/fs.h31
-rw-r--r--include/linux/fscrypt_common.h145
-rw-r--r--include/linux/fscrypt_notsupp.h177
-rw-r--r--include/linux/fscrypt_supp.h144
-rw-r--r--include/linux/ftrace.h6
-rw-r--r--include/linux/gpio_event.h170
-rw-r--r--include/linux/hid.h4
-rw-r--r--include/linux/if_pppolac.h23
-rw-r--r--include/linux/if_pppopns.h23
-rw-r--r--include/linux/if_pppox.h21
-rw-r--r--include/linux/inet_diag.h11
-rw-r--r--include/linux/init.h4
-rw-r--r--include/linux/init_task.h3
-rw-r--r--include/linux/initramfs.h32
-rw-r--r--include/linux/ipv6.h9
-rw-r--r--include/linux/kcov.h29
-rw-r--r--include/linux/keychord.h23
-rw-r--r--include/linux/keycombo.h36
-rw-r--r--include/linux/keyreset.h29
-rw-r--r--include/linux/kthread.h1
-rw-r--r--include/linux/lsm_audit.h7
-rw-r--r--include/linux/memory-state-time.h42
-rw-r--r--include/linux/mm.h19
-rw-r--r--include/linux/mm_types.h14
-rw-r--r--include/linux/mmc/card.h3
-rw-r--r--include/linux/mmc/core.h4
-rw-r--r--include/linux/mmc/host.h23
-rw-r--r--include/linux/mmc/mmc.h3
-rw-r--r--include/linux/mmc/pm.h1
-rwxr-xr-x[-rw-r--r--]include/linux/mmc/sdio_func.h10
-rw-r--r--include/linux/mmzone.h2
-rw-r--r--include/linux/mount.h3
-rw-r--r--include/linux/namei.h3
-rw-r--r--include/linux/netfilter/xt_qtaguid.h14
-rw-r--r--include/linux/netfilter/xt_quota2.h25
-rw-r--r--include/linux/nmi.h15
-rw-r--r--include/linux/nodemask.h8
-rw-r--r--include/linux/of_fdt.h21
-rw-r--r--include/linux/perf_event.h5
-rw-r--r--include/linux/pfn.h1
-rw-r--r--include/linux/platform_data/ds2482.h21
-rw-r--r--include/linux/power_supply.h8
-rw-r--r--include/linux/pstore.h12
-rw-r--r--include/linux/pstore_ram.h10
-rw-r--r--include/linux/sched.h169
-rw-r--r--include/linux/sched/sysctl.h26
-rw-r--r--include/linux/sched_energy.h44
-rw-r--r--include/linux/security.h29
-rw-r--r--include/linux/serial_core.h1
-rw-r--r--include/linux/slab.h12
-rw-r--r--include/linux/slub_def.h1
-rw-r--r--include/linux/sock_diag.h2
-rw-r--r--include/linux/string.h1
-rw-r--r--include/linux/suspend.h17
-rw-r--r--include/linux/swap.h6
-rw-r--r--include/linux/switch.h53
-rw-r--r--include/linux/task_io_accounting.h2
-rw-r--r--include/linux/task_io_accounting_ops.h1
-rw-r--r--include/linux/thread_info.h25
-rw-r--r--include/linux/timekeeper_internal.h16
-rw-r--r--include/linux/tracefs.h45
-rw-r--r--include/linux/uaccess.h7
-rw-r--r--include/linux/uid_stat.h29
-rw-r--r--include/linux/usb/class-dual-role.h129
-rw-r--r--include/linux/usb/composite.h1
-rw-r--r--include/linux/usb/f_accessory.h23
-rw-r--r--include/linux/usb/f_mtp.h23
-rw-r--r--include/linux/vmstat.h2
-rw-r--r--include/linux/wakelock.h67
-rw-r--r--include/linux/wakeup_reason.h32
-rw-r--r--include/linux/wifi_tiwlan.h27
-rw-r--r--include/linux/wl12xx.h49
-rw-r--r--include/linux/wlan_plat.h30
-rw-r--r--include/linux/zpool.h5
-rw-r--r--include/linux/zsmalloc.h3
-rw-r--r--include/net/activity_stats.h25
-rw-r--r--include/net/addrconf.h8
-rw-r--r--include/net/cfg80211.h3
-rw-r--r--include/net/fib_rules.h9
-rw-r--r--include/net/flow.h9
-rw-r--r--include/net/if_inet6.h1
-rw-r--r--include/net/ip.h1
-rw-r--r--include/net/ip6_route.h5
-rw-r--r--include/net/route.h5
-rw-r--r--include/net/sock.h13
-rw-r--r--include/net/tcp.h3
-rw-r--r--include/net/udp.h1
-rw-r--r--include/trace/events/android_fs.h65
-rw-r--r--include/trace/events/android_fs_template.h64
-rw-r--r--include/trace/events/cpufreq_interactive.h112
-rw-r--r--include/trace/events/cpufreq_sched.h87
-rw-r--r--include/trace/events/f2fs.h533
-rw-r--r--include/trace/events/gpu.h143
-rw-r--r--include/trace/events/mmc.h91
-rw-r--r--include/trace/events/net.h8
-rw-r--r--include/trace/events/power.h51
-rw-r--r--include/trace/events/sched.h559
-rw-r--r--include/uapi/linux/elf.h1
-rw-r--r--include/uapi/linux/fib_rules.h8
-rw-r--r--include/uapi/linux/fs.h48
-rw-r--r--include/uapi/linux/fuse.h1
-rw-r--r--include/uapi/linux/hw_breakpoint.h4
-rw-r--r--include/uapi/linux/if_pppolac.h33
-rw-r--r--include/uapi/linux/if_pppopns.h32
-rw-r--r--include/uapi/linux/if_pppox.h6
-rw-r--r--include/uapi/linux/inet_diag.h15
-rw-r--r--include/uapi/linux/input.h7
-rw-r--r--include/uapi/linux/ipv6.h17
-rw-r--r--include/uapi/linux/kcov.h10
-rw-r--r--include/uapi/linux/keychord.h52
-rw-r--r--include/uapi/linux/magic.h4
-rw-r--r--include/uapi/linux/netfilter/xt_IDLETIMER.h8
-rw-r--r--include/uapi/linux/netfilter/xt_socket.h5
-rw-r--r--include/uapi/linux/nl80211.h5
-rw-r--r--include/uapi/linux/prctl.h16
-rw-r--r--include/uapi/linux/rtnetlink.h8
-rw-r--r--include/uapi/linux/sock_diag.h1
-rw-r--r--include/uapi/linux/sysctl.h1
-rw-r--r--include/uapi/linux/usb/f_accessory.h146
-rw-r--r--include/uapi/linux/usb/f_mtp.h61
-rw-r--r--include/uapi/video/adf.h321
-rw-r--r--include/video/adf.h502
-rw-r--r--include/video/adf_client.h61
-rw-r--r--include/video/adf_fbdev.h124
-rw-r--r--include/video/adf_format.h26
-rw-r--r--include/video/adf_memblock.h20
-rw-r--r--init/Kconfig55
-rw-r--r--init/Makefile4
-rw-r--r--init/do_mounts.c4
-rw-r--r--init/do_mounts.h10
-rw-r--r--init/do_mounts_dm.c426
-rw-r--r--init/initramfs.c19
-rw-r--r--init/main.c31
-rw-r--r--init/noinitramfs.c9
-rw-r--r--ipc/mqueue.c10
-rw-r--r--kernel/Makefile12
-rw-r--r--kernel/audit.c8
-rw-r--r--kernel/auditsc.c12
-rw-r--r--kernel/cgroup.c3
-rw-r--r--kernel/compat.c5
-rw-r--r--kernel/cpu.c31
-rw-r--r--kernel/cpuset.c33
-rw-r--r--kernel/debug/debug_core.c12
-rw-r--r--kernel/debug/kdb/kdb_bp.c4
-rw-r--r--kernel/debug/kdb/kdb_io.c12
-rw-r--r--kernel/events/core.c8
-rw-r--r--kernel/exec_domain.c9
-rw-r--r--kernel/exit.c7
-rw-r--r--kernel/fork.c25
-rw-r--r--kernel/futex.c2
-rw-r--r--kernel/kcov.c274
-rw-r--r--kernel/kthread.c20
-rw-r--r--kernel/locking/Makefile3
-rw-r--r--kernel/power/Kconfig16
-rw-r--r--kernel/power/Makefile2
-rw-r--r--kernel/power/hibernate.c20
-rw-r--r--kernel/power/main.c11
-rw-r--r--kernel/power/power.h2
-rw-r--r--kernel/power/process.c22
-rw-r--r--kernel/power/suspend.c90
-rw-r--r--kernel/power/user.c14
-rw-r--r--kernel/power/wakeup_reason.c288
-rw-r--r--kernel/printk/printk.c8
-rw-r--r--kernel/rcu/Makefile4
-rw-r--r--kernel/sched/Makefile11
-rw-r--r--kernel/sched/auto_group.c2
-rw-r--r--kernel/sched/auto_group.h2
-rw-r--r--kernel/sched/core.c887
-rw-r--r--kernel/sched/cpufreq_sched.c500
-rw-r--r--kernel/sched/cputime.c29
-rw-r--r--kernel/sched/deadline.c270
-rw-r--r--kernel/sched/debug.c42
-rw-r--r--kernel/sched/energy.c124
-rw-r--r--kernel/sched/fair.c2722
-rw-r--r--kernel/sched/features.h13
-rw-r--r--kernel/sched/idle.c56
-rw-r--r--kernel/sched/loadavg.c (renamed from kernel/sched/proc.c)247
-rw-r--r--kernel/sched/rt.c115
-rw-r--r--kernel/sched/sched.h356
-rw-r--r--kernel/sched/stop_task.c3
-rw-r--r--kernel/sched/tune.c950
-rw-r--r--kernel/sched/tune.h54
-rw-r--r--kernel/sched/wait.c4
-rw-r--r--kernel/sched/walt.c1171
-rw-r--r--kernel/sched/walt.h62
-rw-r--r--kernel/seccomp.c4
-rw-r--r--kernel/signal.c2
-rw-r--r--kernel/sys.c174
-rw-r--r--kernel/sysctl.c112
-rw-r--r--kernel/time/alarmtimer.c2
-rw-r--r--kernel/time/hrtimer.c2
-rw-r--r--kernel/time/posix-cpu-timers.c11
-rw-r--r--kernel/time/timekeeping.c180
-rw-r--r--kernel/trace/Kconfig3
-rw-r--r--kernel/trace/Makefile1
-rw-r--r--kernel/trace/ftrace.c24
-rw-r--r--kernel/trace/gpu-traces.c23
-rw-r--r--kernel/trace/trace.c375
-rw-r--r--kernel/trace/trace.h18
-rw-r--r--kernel/trace/trace_events.c44
-rw-r--r--kernel/trace/trace_functions_graph.c52
-rw-r--r--kernel/trace/trace_kprobe.c12
-rw-r--r--kernel/trace/trace_output.c182
-rw-r--r--kernel/trace/trace_printk.c2
-rw-r--r--kernel/trace/trace_probe.h2
-rw-r--r--kernel/trace/trace_stack.c2
-rw-r--r--kernel/trace/trace_stat.c12
-rw-r--r--kernel/trace/trace_syscalls.c7
-rw-r--r--kernel/trace/trace_uprobe.c2
-rw-r--r--kernel/user_namespace.c1
-rw-r--r--kernel/watchdog.c123
-rw-r--r--kernel/workqueue.c6
-rw-r--r--lib/Kconfig.debug44
-rw-r--r--lib/Makefile12
-rw-r--r--lib/lz4/lz4_decompress.c30
-rw-r--r--lib/string.c17
-rw-r--r--lib/strncpy_from_user.c17
-rw-r--r--lib/strnlen_user.c21
-rw-r--r--mm/Kconfig10
-rw-r--r--mm/Makefile19
-rw-r--r--mm/kasan/Makefile2
-rw-r--r--mm/kasan/kasan_init.c4
-rw-r--r--mm/madvise.c3
-rw-r--r--mm/mempolicy.c2
-rw-r--r--mm/mlock.c7
-rw-r--r--mm/mmap.c63
-rw-r--r--mm/mprotect.c3
-rw-r--r--mm/page_alloc.c107
-rw-r--r--mm/shmem.c13
-rw-r--r--mm/slab.c30
-rw-r--r--mm/slub.c140
-rw-r--r--mm/swapfile.c42
-rw-r--r--mm/usercopy.c278
-rw-r--r--mm/vmscan.c43
-rw-r--r--mm/vmstat.c73
-rw-r--r--mm/zbud.c3
-rw-r--r--mm/zpool.c6
-rw-r--r--mm/zsmalloc.c1221
-rw-r--r--mm/zswap.c5
-rw-r--r--net/Kconfig16
-rw-r--r--net/Makefile1
-rw-r--r--net/activity_stats.c119
-rw-r--r--net/bluetooth/af_bluetooth.c29
-rw-r--r--net/bridge/br_device.c11
-rw-r--r--net/core/fib_rules.c74
-rw-r--r--net/core/neighbour.c3
-rw-r--r--net/core/sock.c5
-rw-r--r--net/core/sock_diag.c23
-rw-r--r--net/dccp/ipv6.c7
-rw-r--r--net/ipv4/Kconfig13
-rw-r--r--net/ipv4/Makefile1
-rw-r--r--net/ipv4/af_inet.c23
-rw-r--r--net/ipv4/fib_frontend.c1
-rw-r--r--net/ipv4/icmp.c2
-rw-r--r--net/ipv4/inet_connection_sock.c6
-rw-r--r--net/ipv4/inet_diag.c195
-rw-r--r--net/ipv4/ip_output.c3
-rw-r--r--net/ipv4/netfilter/Kconfig3
-rw-r--r--net/ipv4/ping.c7
-rw-r--r--net/ipv4/raw.c2
-rw-r--r--net/ipv4/route.c39
-rw-r--r--net/ipv4/syncookies.c2
-rw-r--r--net/ipv4/sysctl_net_ipv4.c22
-rw-r--r--net/ipv4/sysfs_net_ipv4.c88
-rw-r--r--net/ipv4/tcp.c51
-rw-r--r--net/ipv4/tcp_diag.c19
-rw-r--r--net/ipv4/tcp_input.c4
-rw-r--r--net/ipv4/tcp_ipv4.c12
-rw-r--r--net/ipv4/tcp_output.c2
-rw-r--r--net/ipv4/udp.c18
-rw-r--r--net/ipv4/udp_diag.c89
-rw-r--r--net/ipv6/addrconf.c389
-rw-r--r--net/ipv6/af_inet6.c21
-rw-r--r--net/ipv6/ah6.c5
-rw-r--r--net/ipv6/datagram.c1
-rw-r--r--net/ipv6/esp6.c5
-rw-r--r--net/ipv6/exthdrs_core.c13
-rw-r--r--net/ipv6/icmp.c9
-rw-r--r--net/ipv6/inet6_connection_sock.c2
-rw-r--r--net/ipv6/ip6_gre.c4
-rw-r--r--net/ipv6/ip6_tunnel.c3
-rw-r--r--net/ipv6/ip6_vti.c5
-rw-r--r--net/ipv6/ipcomp6.c5
-rw-r--r--net/ipv6/ndisc.c6
-rw-r--r--net/ipv6/netfilter.c1
-rw-r--r--net/ipv6/netfilter/Kconfig3
-rw-r--r--net/ipv6/ping.c39
-rw-r--r--net/ipv6/raw.c1
-rw-r--r--net/ipv6/route.c92
-rw-r--r--net/ipv6/syncookies.c1
-rw-r--r--net/ipv6/tcp_ipv6.c5
-rw-r--r--net/ipv6/udp.c2
-rw-r--r--net/l2tp/l2tp_ip6.c1
-rw-r--r--net/netfilter/Kconfig55
-rw-r--r--net/netfilter/Makefile2
-rw-r--r--net/netfilter/nf_conntrack_core.c16
-rw-r--r--net/netfilter/nfnetlink.c10
-rw-r--r--net/netfilter/xt_IDLETIMER.c244
-rw-r--r--net/netfilter/xt_qtaguid.c3038
-rw-r--r--net/netfilter/xt_qtaguid_internal.h350
-rw-r--r--net/netfilter/xt_qtaguid_print.c566
-rw-r--r--net/netfilter/xt_qtaguid_print.h120
-rw-r--r--net/netfilter/xt_quota2.c401
-rw-r--r--net/netfilter/xt_socket.c84
-rw-r--r--net/packet/af_packet.c26
-rw-r--r--net/rfkill/Kconfig5
-rw-r--r--net/rfkill/core.c9
-rw-r--r--net/socket.c18
-rw-r--r--net/wireless/core.c16
-rw-r--r--net/wireless/core.h2
-rw-r--r--net/wireless/nl80211.c16
-rw-r--r--net/wireless/scan.c2
-rw-r--r--net/xfrm/xfrm_algo.c2
-rw-r--r--scripts/Makefile.clean2
-rw-r--r--scripts/Makefile.lib20
-rw-r--r--scripts/Makefile.modinst2
-rwxr-xr-xscripts/checkpatch.pl1
-rw-r--r--security/Kconfig49
-rw-r--r--security/capability.c24
-rw-r--r--security/commoncap.c113
-rw-r--r--security/inode.c2
-rw-r--r--security/keys/process_keys.c1
-rw-r--r--security/lsm_audit.c19
-rw-r--r--security/security.c21
-rw-r--r--security/selinux/avc.c421
-rw-r--r--security/selinux/hooks.c215
-rw-r--r--security/selinux/include/avc.h5
-rw-r--r--security/selinux/include/classmap.h3
-rw-r--r--security/selinux/include/security.h33
-rw-r--r--security/selinux/nlmsgtab.c7
-rw-r--r--security/selinux/ss/avtab.c159
-rw-r--r--security/selinux/ss/avtab.h34
-rw-r--r--security/selinux/ss/conditional.c30
-rw-r--r--security/selinux/ss/conditional.h6
-rw-r--r--security/selinux/ss/policydb.c5
-rw-r--r--security/selinux/ss/services.c213
-rw-r--r--security/selinux/ss/services.h6
-rw-r--r--tools/perf/util/evsel.c15
1135 files changed, 96026 insertions, 15925 deletions
diff --git a/Documentation/ABI/obsolete/sysfs-block-zram b/Documentation/ABI/obsolete/sysfs-block-zram
new file mode 100644
index 000000000000..720ea92cfb2e
--- /dev/null
+++ b/Documentation/ABI/obsolete/sysfs-block-zram
@@ -0,0 +1,119 @@
+What: /sys/block/zram<id>/num_reads
+Date: August 2015
+Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Description:
+ The num_reads file is read-only and specifies the number of
+ reads (failed or successful) done on this device.
+ Now accessible via zram<id>/stat node.
+
+What: /sys/block/zram<id>/num_writes
+Date: August 2015
+Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Description:
+ The num_writes file is read-only and specifies the number of
+ writes (failed or successful) done on this device.
+ Now accessible via zram<id>/stat node.
+
+What: /sys/block/zram<id>/invalid_io
+Date: August 2015
+Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Description:
+ The invalid_io file is read-only and specifies the number of
+ non-page-size-aligned I/O requests issued to this device.
+ Now accessible via zram<id>/io_stat node.
+
+What: /sys/block/zram<id>/failed_reads
+Date: August 2015
+Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Description:
+ The failed_reads file is read-only and specifies the number of
+ failed reads happened on this device.
+ Now accessible via zram<id>/io_stat node.
+
+What: /sys/block/zram<id>/failed_writes
+Date: August 2015
+Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Description:
+ The failed_writes file is read-only and specifies the number of
+ failed writes happened on this device.
+ Now accessible via zram<id>/io_stat node.
+
+What: /sys/block/zram<id>/notify_free
+Date: August 2015
+Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Description:
+ The notify_free file is read-only. Depending on device usage
+ scenario it may account a) the number of pages freed because
+ of swap slot free notifications or b) the number of pages freed
+ because of REQ_DISCARD requests sent by bio. The former ones
+ are sent to a swap block device when a swap slot is freed, which
+ implies that this disk is being used as a swap disk. The latter
+ ones are sent by filesystem mounted with discard option,
+ whenever some data blocks are getting discarded.
+ Now accessible via zram<id>/io_stat node.
+
+What: /sys/block/zram<id>/zero_pages
+Date: August 2015
+Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Description:
+ The zero_pages file is read-only and specifies number of zero
+ filled pages written to this disk. No memory is allocated for
+ such pages.
+ Now accessible via zram<id>/mm_stat node.
+
+What: /sys/block/zram<id>/orig_data_size
+Date: August 2015
+Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Description:
+ The orig_data_size file is read-only and specifies uncompressed
+ size of data stored in this disk. This excludes zero-filled
+ pages (zero_pages) since no memory is allocated for them.
+ Unit: bytes
+ Now accessible via zram<id>/mm_stat node.
+
+What: /sys/block/zram<id>/compr_data_size
+Date: August 2015
+Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Description:
+ The compr_data_size file is read-only and specifies compressed
+ size of data stored in this disk. So, compression ratio can be
+ calculated using orig_data_size and this statistic.
+ Unit: bytes
+ Now accessible via zram<id>/mm_stat node.
+
+What: /sys/block/zram<id>/mem_used_total
+Date: August 2015
+Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Description:
+ The mem_used_total file is read-only and specifies the amount
+ of memory, including allocator fragmentation and metadata
+ overhead, allocated for this disk. So, allocator space
+ efficiency can be calculated using compr_data_size and this
+ statistic.
+ Unit: bytes
+ Now accessible via zram<id>/mm_stat node.
+
+What: /sys/block/zram<id>/mem_used_max
+Date: August 2015
+Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Description:
+ The mem_used_max file is read/write and specifies the amount
+ of maximum memory zram have consumed to store compressed data.
+ For resetting the value, you should write "0". Otherwise,
+ you could see -EINVAL.
+ Unit: bytes
+ Downgraded to write-only node: so it's possible to set new
+ value only; its current value is stored in zram<id>/mm_stat
+ node.
+
+What: /sys/block/zram<id>/mem_limit
+Date: August 2015
+Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Description:
+ The mem_limit file is read/write and specifies the maximum
+ amount of memory ZRAM can use to store the compressed data.
+ The limit could be changed in run time and "0" means disable
+ the limit. No limit is the initial state. Unit: bytes
+ Downgraded to write-only node: so it's possible to set new
+ value only; its current value is stored in zram<id>/mm_stat
+ node.
diff --git a/Documentation/ABI/testing/configfs-usb-gadget-midi b/Documentation/ABI/testing/configfs-usb-gadget-midi
new file mode 100644
index 000000000000..6b341df7249c
--- /dev/null
+++ b/Documentation/ABI/testing/configfs-usb-gadget-midi
@@ -0,0 +1,12 @@
+What: /config/usb-gadget/gadget/functions/midi.name
+Date: Nov 2014
+KernelVersion: 3.19
+Description:
+ The attributes:
+
+ index - index value for the USB MIDI adapter
+ id - ID string for the USB MIDI adapter
+ buflen - MIDI buffer length
+ qlen - USB read request queue length
+ in_ports - number of MIDI input ports
+ out_ports - number of MIDI output ports
diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram
index a6148eaf91e5..2e69e83bf510 100644
--- a/Documentation/ABI/testing/sysfs-block-zram
+++ b/Documentation/ABI/testing/sysfs-block-zram
@@ -141,3 +141,28 @@ Description:
amount of memory ZRAM can use to store the compressed data. The
limit could be changed in run time and "0" means disable the
limit. No limit is the initial state. Unit: bytes
+
+What: /sys/block/zram<id>/compact
+Date: August 2015
+Contact: Minchan Kim <minchan@kernel.org>
+Description:
+ The compact file is write-only and trigger compaction for
+ allocator zrm uses. The allocator moves some objects so that
+ it could free fragment space.
+
+What: /sys/block/zram<id>/io_stat
+Date: August 2015
+Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Description:
+ The io_stat file is read-only and accumulates device's I/O
+ statistics not accounted by block layer. For example,
+ failed_reads, failed_writes, etc. File format is similar to
+ block layer statistics file format.
+
+What: /sys/block/zram<id>/mm_stat
+Date: August 2015
+Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Description:
+ The mm_stat file is read-only and represents device's mm
+ statistics (orig_data_size, compr_data_size, etc.) in a format
+ similar to block layer statistics file format.
diff --git a/Documentation/ABI/testing/sysfs-class-dual-role-usb b/Documentation/ABI/testing/sysfs-class-dual-role-usb
new file mode 100644
index 000000000000..a900fd75430c
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-dual-role-usb
@@ -0,0 +1,71 @@
+What: /sys/class/dual_role_usb/.../
+Date: June 2015
+Contact: Badhri Jagan Sridharan<badhri@google.com>
+Description:
+ Provide a generic interface to monitor and change
+ the state of dual role usb ports. The name here
+ refers to the name mentioned in the
+ dual_role_phy_desc that is passed while registering
+ the dual_role_phy_intstance through
+ devm_dual_role_instance_register.
+
+What: /sys/class/dual_role_usb/.../supported_modes
+Date: June 2015
+Contact: Badhri Jagan Sridharan<badhri@google.com>
+Description:
+ This is a static node, once initialized this
+ is not expected to change during runtime. "dfp"
+ refers to "downstream facing port" i.e. port can
+ only act as host. "ufp" refers to "upstream
+ facing port" i.e. port can only act as device.
+ "dfp ufp" refers to "dual role port" i.e. the port
+ can either be a host port or a device port.
+
+What: /sys/class/dual_role_usb/.../mode
+Date: June 2015
+Contact: Badhri Jagan Sridharan<badhri@google.com>
+Description:
+ The mode node refers to the current mode in which the
+ port is operating. "dfp" for host ports. "ufp" for device
+ ports and "none" when cable is not connected.
+
+ On devices where the USB mode is software-controllable,
+ userspace can change the mode by writing "dfp" or "ufp".
+ On devices where the USB mode is fixed in hardware,
+ this attribute is read-only.
+
+What: /sys/class/dual_role_usb/.../power_role
+Date: June 2015
+Contact: Badhri Jagan Sridharan<badhri@google.com>
+Description:
+ The power_role node mentions whether the port
+ is "sink"ing or "source"ing power. "none" if
+ they are not connected.
+
+ On devices implementing USB Power Delivery,
+ userspace can control the power role by writing "sink" or
+ "source". On devices without USB-PD, this attribute is
+ read-only.
+
+What: /sys/class/dual_role_usb/.../data_role
+Date: June 2015
+Contact: Badhri Jagan Sridharan<badhri@google.com>
+Description:
+ The data_role node mentions whether the port
+ is acting as "host" or "device" for USB data connection.
+ "none" if there is no active data link.
+
+ On devices implementing USB Power Delivery, userspace
+ can control the data role by writing "host" or "device".
+ On devices without USB-PD, this attribute is read-only
+
+What: /sys/class/dual_role_usb/.../powers_vconn
+Date: June 2015
+Contact: Badhri Jagan Sridharan<badhri@google.com>
+Description:
+ The powers_vconn node mentions whether the port
+ is supplying power for VCONN pin.
+
+ On devices with software control of VCONN,
+ userspace can disable the power supply to VCONN by writing "n",
+ or enable the power supply by writing "y".
diff --git a/Documentation/ABI/testing/sysfs-kernel-wakeup_reasons b/Documentation/ABI/testing/sysfs-kernel-wakeup_reasons
new file mode 100644
index 000000000000..acb19b91c192
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-wakeup_reasons
@@ -0,0 +1,16 @@
+What: /sys/kernel/wakeup_reasons/last_resume_reason
+Date: February 2014
+Contact: Ruchi Kandoi <kandoiruchi@google.com>
+Description:
+ The /sys/kernel/wakeup_reasons/last_resume_reason is
+ used to report wakeup reasons after system exited suspend.
+
+What: /sys/kernel/wakeup_reasons/last_suspend_time
+Date: March 2015
+Contact: jinqian <jinqian@google.com>
+Description:
+ The /sys/kernel/wakeup_reasons/last_suspend_time is
+ used to report time spent in last suspend cycle. It contains
+ two numbers (in seconds) separated by space. First number is
+ the time spent in suspend and resume processes. Second number
+ is the time spent in sleep state. \ No newline at end of file
diff --git a/Documentation/android.txt b/Documentation/android.txt
new file mode 100644
index 000000000000..0f40a78b045f
--- /dev/null
+++ b/Documentation/android.txt
@@ -0,0 +1,121 @@
+ =============
+ A N D R O I D
+ =============
+
+Copyright (C) 2009 Google, Inc.
+Written by Mike Chan <mike@android.com>
+
+CONTENTS:
+---------
+
+1. Android
+ 1.1 Required enabled config options
+ 1.2 Required disabled config options
+ 1.3 Recommended enabled config options
+2. Contact
+
+
+1. Android
+==========
+
+Android (www.android.com) is an open source operating system for mobile devices.
+This document describes configurations needed to run the Android framework on
+top of the Linux kernel.
+
+To see a working defconfig look at msm_defconfig or goldfish_defconfig
+which can be found at http://android.git.kernel.org in kernel/common.git
+and kernel/msm.git
+
+
+1.1 Required enabled config options
+-----------------------------------
+After building a standard defconfig, ensure that these options are enabled in
+your .config or defconfig if they are not already. Based off the msm_defconfig.
+You should keep the rest of the default options enabled in the defconfig
+unless you know what you are doing.
+
+ANDROID_PARANOID_NETWORK
+ASHMEM
+CONFIG_FB_MODE_HELPERS
+CONFIG_FONT_8x16
+CONFIG_FONT_8x8
+CONFIG_YAFFS_SHORT_NAMES_IN_RAM
+DAB
+EARLYSUSPEND
+FB
+FB_CFB_COPYAREA
+FB_CFB_FILLRECT
+FB_CFB_IMAGEBLIT
+FB_DEFERRED_IO
+FB_TILEBLITTING
+HIGH_RES_TIMERS
+INOTIFY
+INOTIFY_USER
+INPUT_EVDEV
+INPUT_GPIO
+INPUT_MISC
+LEDS_CLASS
+LEDS_GPIO
+LOCK_KERNEL
+LkOGGER
+LOW_MEMORY_KILLER
+MISC_DEVICES
+NEW_LEDS
+NO_HZ
+POWER_SUPPLY
+PREEMPT
+RAMFS
+RTC_CLASS
+RTC_LIB
+SWITCH
+SWITCH_GPIO
+TMPFS
+UID_STAT
+UID16
+USB_FUNCTION
+USB_FUNCTION_ADB
+USER_WAKELOCK
+VIDEO_OUTPUT_CONTROL
+WAKELOCK
+YAFFS_AUTO_YAFFS2
+YAFFS_FS
+YAFFS_YAFFS1
+YAFFS_YAFFS2
+
+
+1.2 Required disabled config options
+------------------------------------
+CONFIG_YAFFS_DISABLE_LAZY_LOAD
+DNOTIFY
+
+
+1.3 Recommended enabled config options
+------------------------------
+ANDROID_PMEM
+PSTORE_CONSOLE
+PSTORE_RAM
+SCHEDSTATS
+DEBUG_PREEMPT
+DEBUG_MUTEXES
+DEBUG_SPINLOCK_SLEEP
+DEBUG_INFO
+FRAME_POINTER
+CPU_FREQ
+CPU_FREQ_TABLE
+CPU_FREQ_DEFAULT_GOV_ONDEMAND
+CPU_FREQ_GOV_ONDEMAND
+CRC_CCITT
+EMBEDDED
+INPUT_TOUCHSCREEN
+I2C
+I2C_BOARDINFO
+LOG_BUF_SHIFT=17
+SERIAL_CORE
+SERIAL_CORE_CONSOLE
+
+
+2. Contact
+==========
+website: http://android.git.kernel.org
+
+mailing-lists: android-kernel@googlegroups.com
diff --git a/Documentation/arm/memory.txt b/Documentation/arm/memory.txt
index 38dc06d0a791..4178ebda6e66 100644
--- a/Documentation/arm/memory.txt
+++ b/Documentation/arm/memory.txt
@@ -41,7 +41,7 @@ fffe8000 fffeffff DTCM mapping area for platforms with
fffe0000 fffe7fff ITCM mapping area for platforms with
ITCM mounted inside the CPU.
-ffc00000 ffdfffff Fixmap mapping region. Addresses provided
+ffc00000 ffefffff Fixmap mapping region. Addresses provided
by fix_to_virt() will be located here.
fee00000 feffffff Mapping of PCI I/O space. This is a static
diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX
index e840b47613f7..bc5148757edb 100644
--- a/Documentation/block/00-INDEX
+++ b/Documentation/block/00-INDEX
@@ -26,3 +26,9 @@ switching-sched.txt
- Switching I/O schedulers at runtime
writeback_cache_control.txt
- Control of volatile write back caches
+mmc-max-speed.txt
+ - eMMC layer speed simulation, related to /sys/block/mmcblk*/
+ attributes:
+ max_read_speed
+ max_write_speed
+ cache_size
diff --git a/Documentation/block/mmc-max-speed.txt b/Documentation/block/mmc-max-speed.txt
new file mode 100644
index 000000000000..3f052b9fb999
--- /dev/null
+++ b/Documentation/block/mmc-max-speed.txt
@@ -0,0 +1,38 @@
+eMMC Block layer simulation speed controls in /sys/block/mmcblk*/
+===============================================
+
+Turned on with CONFIG_MMC_SIMULATE_MAX_SPEED which enables MMC device speed
+limiting. Used to test and simulate the behavior of the system when
+confronted with a slow MMC.
+
+Enables max_read_speed, max_write_speed and cache_size attributes and module
+default parameters to control the write or read maximum KB/second speed
+behaviors.
+
+NB: There is room for improving the algorithm for aspects tied directly to
+eMMC specific behavior. For instance, wear leveling and stalls from an
+exhausted erase pool. We would expect that if there was a need to provide
+similar speed simulation controls to other types of block devices, aspects of
+their behavior are modelled separately (e.g. head seek times, heat assist,
+shingling and rotational latency).
+
+/sys/block/mmcblk0/max_read_speed:
+
+Number of KB/second reads allowed to the block device. Used to test and
+simulate the behavior of the system when confronted with a slow reading MMC.
+Set to 0 or "off" to place no speed limit.
+
+/sys/block/mmcblk0/max_write_speed:
+
+Number of KB/second writes allowed to the block device. Used to test and
+simulate the behavior of the system when confronted with a slow writing MMC.
+Set to 0 or "off" to place no speed limit.
+
+/sys/block/mmcblk0/cache_size:
+
+Number of MB of high speed memory or high speed SLC cache expected on the
+eMMC device being simulated. Used to help simulate the write-back behavior
+more accurately. The assumption is the cache has no delay, but draws down
+in the background to the MLC/TLC primary store at the max_write_speed rate.
+Any write speed delays will show up when the cache is full, or when an I/O
+request to flush is issued.
diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
index 7fcf9c6592ec..48a183e29988 100644
--- a/Documentation/blockdev/zram.txt
+++ b/Documentation/blockdev/zram.txt
@@ -98,20 +98,79 @@ size of the disk when not in use so a huge zram is wasteful.
mount /dev/zram1 /tmp
7) Stats:
- Per-device statistics are exported as various nodes under
- /sys/block/zram<id>/
- disksize
- num_reads
- num_writes
- failed_reads
- failed_writes
- invalid_io
- notify_free
- zero_pages
- orig_data_size
- compr_data_size
- mem_used_total
- mem_used_max
+Per-device statistics are exported as various nodes under /sys/block/zram<id>/
+
+A brief description of exported device attritbutes. For more details please
+read Documentation/ABI/testing/sysfs-block-zram.
+
+Name access description
+---- ------ -----------
+disksize RW show and set the device's disk size
+initstate RO shows the initialization state of the device
+reset WO trigger device reset
+num_reads RO the number of reads
+failed_reads RO the number of failed reads
+num_write RO the number of writes
+failed_writes RO the number of failed writes
+invalid_io RO the number of non-page-size-aligned I/O requests
+max_comp_streams RW the number of possible concurrent compress operations
+comp_algorithm RW show and change the compression algorithm
+notify_free RO the number of notifications to free pages (either
+ slot free notifications or REQ_DISCARD requests)
+zero_pages RO the number of zero filled pages written to this disk
+orig_data_size RO uncompressed size of data stored in this disk
+compr_data_size RO compressed size of data stored in this disk
+mem_used_total RO the amount of memory allocated for this disk
+mem_used_max RW the maximum amount memory zram have consumed to
+ store compressed data
+mem_limit RW the maximum amount of memory ZRAM can use to store
+ the compressed data
+num_migrated RO the number of objects migrated migrated by compaction
+
+
+WARNING
+=======
+per-stat sysfs attributes are considered to be deprecated.
+The basic strategy is:
+-- the existing RW nodes will be downgraded to WO nodes (in linux 4.11)
+-- deprecated RO sysfs nodes will eventually be removed (in linux 4.11)
+
+The list of deprecated attributes can be found here:
+Documentation/ABI/obsolete/sysfs-block-zram
+
+Basically, every attribute that has its own read accessible sysfs node
+(e.g. num_reads) *AND* is accessible via one of the stat files (zram<id>/stat
+or zram<id>/io_stat or zram<id>/mm_stat) is considered to be deprecated.
+
+User space is advised to use the following files to read the device statistics.
+
+File /sys/block/zram<id>/stat
+
+Represents block layer statistics. Read Documentation/block/stat.txt for
+details.
+
+File /sys/block/zram<id>/io_stat
+
+The stat file represents device's I/O statistics not accounted by block
+layer and, thus, not available in zram<id>/stat file. It consists of a
+single line of text and contains the following stats separated by
+whitespace:
+ failed_reads
+ failed_writes
+ invalid_io
+ notify_free
+
+File /sys/block/zram<id>/mm_stat
+
+The stat file represents device's mm statistics. It consists of a single
+line of text and contains the following stats separated by whitespace:
+ orig_data_size
+ compr_data_size
+ mem_used_total
+ mem_limit
+ mem_used_max
+ zero_pages
+ num_migrated
8) Deactivate:
swapoff /dev/zram0
diff --git a/Documentation/cpu-freq/governors.txt b/Documentation/cpu-freq/governors.txt
index 77ec21574fb1..2ed766ec75fc 100644
--- a/Documentation/cpu-freq/governors.txt
+++ b/Documentation/cpu-freq/governors.txt
@@ -28,6 +28,7 @@ Contents:
2.3 Userspace
2.4 Ondemand
2.5 Conservative
+2.6 Interactive
3. The Governor Interface in the CPUfreq Core
@@ -218,6 +219,90 @@ a decision on when to decrease the frequency while running in any
speed. Load for frequency increase is still evaluated every
sampling rate.
+2.6 Interactive
+---------------
+
+The CPUfreq governor "interactive" is designed for latency-sensitive,
+interactive workloads. This governor sets the CPU speed depending on
+usage, similar to "ondemand" and "conservative" governors, but with a
+different set of configurable behaviors.
+
+The tuneable values for this governor are:
+
+target_loads: CPU load values used to adjust speed to influence the
+current CPU load toward that value. In general, the lower the target
+load, the more often the governor will raise CPU speeds to bring load
+below the target. The format is a single target load, optionally
+followed by pairs of CPU speeds and CPU loads to target at or above
+those speeds. Colons can be used between the speeds and associated
+target loads for readability. For example:
+
+ 85 1000000:90 1700000:99
+
+targets CPU load 85% below speed 1GHz, 90% at or above 1GHz, until
+1.7GHz and above, at which load 99% is targeted. If speeds are
+specified these must appear in ascending order. Higher target load
+values are typically specified for higher speeds, that is, target load
+values also usually appear in an ascending order. The default is
+target load 90% for all speeds.
+
+min_sample_time: The minimum amount of time to spend at the current
+frequency before ramping down. Default is 80000 uS.
+
+hispeed_freq: An intermediate "hi speed" at which to initially ramp
+when CPU load hits the value specified in go_hispeed_load. If load
+stays high for the amount of time specified in above_hispeed_delay,
+then speed may be bumped higher. Default is the maximum speed
+allowed by the policy at governor initialization time.
+
+go_hispeed_load: The CPU load at which to ramp to hispeed_freq.
+Default is 99%.
+
+above_hispeed_delay: When speed is at or above hispeed_freq, wait for
+this long before raising speed in response to continued high load.
+The format is a single delay value, optionally followed by pairs of
+CPU speeds and the delay to use at or above those speeds. Colons can
+be used between the speeds and associated delays for readability. For
+example:
+
+ 80000 1300000:200000 1500000:40000
+
+uses delay 80000 uS until CPU speed 1.3 GHz, at which speed delay
+200000 uS is used until speed 1.5 GHz, at which speed (and above)
+delay 40000 uS is used. If speeds are specified these must appear in
+ascending order. Default is 20000 uS.
+
+timer_rate: Sample rate for reevaluating CPU load when the CPU is not
+idle. A deferrable timer is used, such that the CPU will not be woken
+from idle to service this timer until something else needs to run.
+(The maximum time to allow deferring this timer when not running at
+minimum speed is configurable via timer_slack.) Default is 20000 uS.
+
+timer_slack: Maximum additional time to defer handling the governor
+sampling timer beyond timer_rate when running at speeds above the
+minimum. For platforms that consume additional power at idle when
+CPUs are running at speeds greater than minimum, this places an upper
+bound on how long the timer will be deferred prior to re-evaluating
+load and dropping speed. For example, if timer_rate is 20000uS and
+timer_slack is 10000uS then timers will be deferred for up to 30msec
+when not at lowest speed. A value of -1 means defer timers
+indefinitely at all speeds. Default is 80000 uS.
+
+boost: If non-zero, immediately boost speed of all CPUs to at least
+hispeed_freq until zero is written to this attribute. If zero, allow
+CPU speeds to drop below hispeed_freq according to load as usual.
+Default is zero.
+
+boostpulse: On each write, immediately boost speed of all CPUs to
+hispeed_freq for at least the period of time specified by
+boostpulse_duration, after which speeds are allowed to drop below
+hispeed_freq according to load as usual.
+
+boostpulse_duration: Length of time to hold CPU speed at hispeed_freq
+on a write to boostpulse, before allowing speed to drop according to
+load as usual. Default is 80000 uS.
+
+
3. The Governor Interface in the CPUfreq Core
=============================================
diff --git a/Documentation/device-mapper/boot.txt b/Documentation/device-mapper/boot.txt
new file mode 100644
index 000000000000..adcaad5e5e32
--- /dev/null
+++ b/Documentation/device-mapper/boot.txt
@@ -0,0 +1,42 @@
+Boot time creation of mapped devices
+===================================
+
+It is possible to configure a device mapper device to act as the root
+device for your system in two ways.
+
+The first is to build an initial ramdisk which boots to a minimal
+userspace which configures the device, then pivot_root(8) in to it.
+
+For simple device mapper configurations, it is possible to boot directly
+using the following kernel command line:
+
+dm="<name> <uuid> <ro>,table line 1,...,table line n"
+
+name = the name to associate with the device
+ after boot, udev, if used, will use that name to label
+ the device node.
+uuid = may be 'none' or the UUID desired for the device.
+ro = may be "ro" or "rw". If "ro", the device and device table will be
+ marked read-only.
+
+Each table line may be as normal when using the dmsetup tool except for
+two variations:
+1. Any use of commas will be interpreted as a newline
+2. Quotation marks cannot be escaped and cannot be used without
+ terminating the dm= argument.
+
+Unless renamed by udev, the device node created will be dm-0 as the
+first minor number for the device-mapper is used during early creation.
+
+Example
+=======
+
+- Booting to a linear array made up of user-mode linux block devices:
+
+ dm="lroot none 0, 0 4096 linear 98:16 0, 4096 4096 linear 98:32 0" \
+ root=/dev/dm-0
+
+Will boot to a rw dm-linear target of 8192 sectors split across two
+block devices identified by their major:minor numbers. After boot, udev
+will rename this target to /dev/mapper/lroot (depending on the rules).
+No uuid was assigned.
diff --git a/Documentation/device-mapper/verity.txt b/Documentation/device-mapper/verity.txt
index 0075f70cd3f9..d1655d2b4419 100644
--- a/Documentation/device-mapper/verity.txt
+++ b/Documentation/device-mapper/verity.txt
@@ -10,18 +10,18 @@ Construction Parameters
<version> <dev> <hash_dev>
<data_block_size> <hash_block_size>
<num_data_blocks> <hash_start_block>
- <algorithm> <digest> <salt>
+ <algorithm> <digest> <salt> <mode>
<version>
This is the type of the on-disk hash format.
0 is the original format used in the Chromium OS.
The salt is appended when hashing, digests are stored continuously and
- the rest of the block is padded with zeros.
+ the rest of the block is padded with zeroes.
1 is the current format that should be used for new devices.
The salt is prepended when hashing and each digest is
- padded with zeros to the power of two.
+ padded with zeroes to the power of two.
<dev>
This is the device containing data, the integrity of which needs to be
@@ -62,6 +62,47 @@ Construction Parameters
<salt>
The hexadecimal encoding of the salt value.
+<mode>
+ Optional. The mode of operation.
+
+ 0 is the normal mode of operation where a corrupted block will result in an
+ I/O error.
+
+ 1 is logging mode where corrupted blocks are logged and a uevent is sent to
+ notify user space.
+
+
+ignore_zero_blocks
+ Do not verify blocks that are expected to contain zeroes and always return
+ zeroes instead. This may be useful if the partition contains unused blocks
+ that are not guaranteed to contain zeroes.
+
+use_fec_from_device <fec_dev>
+ Use forward error correction (FEC) to recover from corruption if hash
+ verification fails. Use encoding data from the specified device. This
+ may be the same device where data and hash blocks reside, in which case
+ fec_start must be outside data and hash areas.
+
+ If the encoding data covers additional metadata, it must be accessible
+ on the hash device after the hash blocks.
+
+ Note: block sizes for data and hash devices must match. Also, if the
+ verity <dev> is encrypted the <fec_dev> should be too.
+
+fec_roots <num>
+ Number of generator roots. This equals to the number of parity bytes in
+ the encoding data. For example, in RS(M, N) encoding, the number of roots
+ is M-N.
+
+fec_blocks <num>
+ The number of encoding data blocks on the FEC device. The block size for
+ the FEC device is <data_block_size>.
+
+fec_start <offset>
+ This is the offset, in <data_block_size> blocks, from the start of the
+ FEC device to the beginning of the encoding data.
+
+
Theory of operation
===================
@@ -81,6 +122,11 @@ per-block basis. This allows for a lightweight hash computation on first read
into the page cache. Block hashes are stored linearly, aligned to the nearest
block size.
+If forward error correction (FEC) support is enabled any recovery of
+corrupted data will be verified using the cryptographic hash of the
+corresponding data. This is why combining error correction with
+integrity checking is essential.
+
Hash Tree
---------
diff --git a/Documentation/devicetree/bindings/misc/memory-state-time.txt b/Documentation/devicetree/bindings/misc/memory-state-time.txt
new file mode 100644
index 000000000000..c99a506c030d
--- /dev/null
+++ b/Documentation/devicetree/bindings/misc/memory-state-time.txt
@@ -0,0 +1,8 @@
+Memory bandwidth and frequency state tracking
+
+Required properties:
+- compatible : should be:
+ "memory-state-time"
+- freq-tbl: Should contain entries with each frequency in Hz.
+- bw-buckets: Should contain upper-bound limits for each bandwidth bucket in Mbps.
+ Must match the framework power_profile.xml for the device.
diff --git a/Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt b/Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt
new file mode 100644
index 000000000000..11216f09e596
--- /dev/null
+++ b/Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt
@@ -0,0 +1,360 @@
+===========================================================
+Energy cost bindings for Energy Aware Scheduling
+===========================================================
+
+===========================================================
+1 - Introduction
+===========================================================
+
+This note specifies bindings required for energy-aware scheduling
+(EAS)[1]. Historically, the scheduler's primary objective has been
+performance. EAS aims to provide an alternative objective - energy
+efficiency. EAS relies on a simple platform energy cost model to
+guide scheduling decisions. The model only considers the CPU
+subsystem.
+
+This note is aligned with the definition of the layout of physical
+CPUs in the system as described in the ARM topology binding
+description [2]. The concept is applicable to any system so long as
+the cost model data is provided for those processing elements in
+that system's topology that EAS is required to service.
+
+Processing elements refer to hardware threads, CPUs and clusters of
+related CPUs in increasing order of hierarchy.
+
+EAS requires two key cost metrics - busy costs and idle costs. Busy
+costs comprise of a list of compute capacities for the processing
+element in question and the corresponding power consumption at that
+capacity. Idle costs comprise of a list of power consumption values
+for each idle state [C-state] that the processing element supports.
+For a detailed description of these metrics, their derivation and
+their use see [3].
+
+These cost metrics are required for processing elements in all
+scheduling domain levels that EAS is required to service.
+
+===========================================================
+2 - energy-costs node
+===========================================================
+
+Energy costs for the processing elements in scheduling domains that
+EAS is required to service are defined in the energy-costs node
+which acts as a container for the actual per processing element cost
+nodes. A single energy-costs node is required for a given system.
+
+- energy-costs node
+
+ Usage: Required
+
+ Description: The energy-costs node is a container node and
+ it's sub-nodes describe costs for each processing element at
+ all scheduling domain levels that EAS is required to
+ service.
+
+ Node name must be "energy-costs".
+
+ The energy-costs node's parent node must be the cpus node.
+
+ The energy-costs node's child nodes can be:
+
+ - one or more cost nodes.
+
+ Any other configuration is considered invalid.
+
+The energy-costs node can only contain a single type of child node
+whose bindings are described in paragraph 4.
+
+===========================================================
+3 - energy-costs node child nodes naming convention
+===========================================================
+
+energy-costs child nodes must follow a naming convention where the
+node name must be "thread-costN", "core-costN", "cluster-costN"
+depending on whether the costs in the node are for a thread, core or
+cluster. N (where N = {0, 1, ...}) is the node number and has no
+bearing to the OS' logical thread, core or cluster index.
+
+===========================================================
+4 - cost node bindings
+===========================================================
+
+Bindings for cost nodes are defined as follows:
+
+- cluster-cost node
+
+ Description: must be declared within an energy-costs node. A
+ system can contain multiple clusters and each cluster
+ serviced by EAS must have a corresponding cluster-costs
+ node.
+
+ The cluster-cost node name must be "cluster-costN" as
+ described in 3 above.
+
+ A cluster-cost node must be a leaf node with no children.
+
+ Properties for cluster-cost nodes are described in paragraph
+ 5 below.
+
+ Any other configuration is considered invalid.
+
+- core-cost node
+
+ Description: must be declared within an energy-costs node. A
+ system can contain multiple cores and each core serviced by
+ EAS must have a corresponding core-cost node.
+
+ The core-cost node name must be "core-costN" as described in
+ 3 above.
+
+ A core-cost node must be a leaf node with no children.
+
+ Properties for core-cost nodes are described in paragraph
+ 5 below.
+
+ Any other configuration is considered invalid.
+
+- thread-cost node
+
+ Description: must be declared within an energy-costs node. A
+ system can contain cores with multiple hardware threads and
+ each thread serviced by EAS must have a corresponding
+ thread-cost node.
+
+ The core-cost node name must be "core-costN" as described in
+ 3 above.
+
+ A core-cost node must be a leaf node with no children.
+
+ Properties for thread-cost nodes are described in paragraph
+ 5 below.
+
+ Any other configuration is considered invalid.
+
+===========================================================
+5 - Cost node properties
+==========================================================
+
+All cost node types must have only the following properties:
+
+- busy-cost-data
+
+ Usage: required
+ Value type: An array of 2-item tuples. Each item is of type
+ u32.
+ Definition: The first item in the tuple is the capacity
+ value as described in [3]. The second item in the tuple is
+ the energy cost value as described in [3].
+
+- idle-cost-data
+
+ Usage: required
+ Value type: An array of 1-item tuples. The item is of type
+ u32.
+ Definition: The item in the tuple is the energy cost value
+ as described in [3].
+
+===========================================================
+4 - Extensions to the cpu node
+===========================================================
+
+The cpu node is extended with a property that establishes the
+connection between the processing element represented by the cpu
+node and the cost-nodes associated with this processing element.
+
+The connection is expressed in line with the topological hierarchy
+that this processing element belongs to starting with the level in
+the hierarchy that this processing element itself belongs to through
+to the highest level that EAS is required to service. The
+connection cannot be sparse and must be contiguous from the
+processing element's level through to the highest desired level. The
+highest desired level must be the same for all processing elements.
+
+Example: Given that a cpu node may represent a thread that is a part
+of a core, this property may contain multiple elements which
+associate the thread with cost nodes describing the costs for the
+thread itself, the core the thread belongs to, the cluster the core
+belongs to and so on. The elements must be ordered from the lowest
+level nodes to the highest desired level that EAS must service. The
+highest desired level must be the same for all cpu nodes. The
+elements must not be sparse: there must be elements for the current
+thread, the next level of hierarchy (core) and so on without any
+'holes'.
+
+Example: Given that a cpu node may represent a core that is a part
+of a cluster of related cpus this property may contain multiple
+elements which associate the core with cost nodes describing the
+costs for the core itself, the cluster the core belongs to and so
+on. The elements must be ordered from the lowest level nodes to the
+highest desired level that EAS must service. The highest desired
+level must be the same for all cpu nodes. The elements must not be
+sparse: there must be elements for the current thread, the next
+level of hierarchy (core) and so on without any 'holes'.
+
+If the system comprises of hierarchical clusters of clusters, this
+property will contain multiple associations with the relevant number
+of cluster elements in hierarchical order.
+
+Property added to the cpu node:
+
+- sched-energy-costs
+
+ Usage: required
+ Value type: List of phandles
+ Definition: a list of phandles to specific cost nodes in the
+ energy-costs parent node that correspond to the processing
+ element represented by this cpu node in hierarchical order
+ of topology.
+
+ The order of phandles in the list is significant. The first
+ phandle is to the current processing element's own cost
+ node. Subsequent phandles are to higher hierarchical level
+ cost nodes up until the maximum level that EAS is to
+ service.
+
+ All cpu nodes must have the same highest level cost node.
+
+ The phandle list must not be sparsely populated with handles
+ to non-contiguous hierarchical levels. See commentary above
+ for clarity.
+
+ Any other configuration is invalid.
+
+===========================================================
+5 - Example dts
+===========================================================
+
+Example 1 (ARM 64-bit, 6-cpu system, two clusters of cpus, one
+cluster of 2 Cortex-A57 cpus, one cluster of 4 Cortex-A53 cpus):
+
+cpus {
+ #address-cells = <2>;
+ #size-cells = <0>;
+ .
+ .
+ .
+ A57_0: cpu@0 {
+ compatible = "arm,cortex-a57","arm,armv8";
+ reg = <0x0 0x0>;
+ device_type = "cpu";
+ enable-method = "psci";
+ next-level-cache = <&A57_L2>;
+ clocks = <&scpi_dvfs 0>;
+ cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+ sched-energy-costs = <&CPU_COST_0 &CLUSTER_COST_0>;
+ };
+
+ A57_1: cpu@1 {
+ compatible = "arm,cortex-a57","arm,armv8";
+ reg = <0x0 0x1>;
+ device_type = "cpu";
+ enable-method = "psci";
+ next-level-cache = <&A57_L2>;
+ clocks = <&scpi_dvfs 0>;
+ cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+ sched-energy-costs = <&CPU_COST_0 &CLUSTER_COST_0>;
+ };
+
+ A53_0: cpu@100 {
+ compatible = "arm,cortex-a53","arm,armv8";
+ reg = <0x0 0x100>;
+ device_type = "cpu";
+ enable-method = "psci";
+ next-level-cache = <&A53_L2>;
+ clocks = <&scpi_dvfs 1>;
+ cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+ sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>;
+ };
+
+ A53_1: cpu@101 {
+ compatible = "arm,cortex-a53","arm,armv8";
+ reg = <0x0 0x101>;
+ device_type = "cpu";
+ enable-method = "psci";
+ next-level-cache = <&A53_L2>;
+ clocks = <&scpi_dvfs 1>;
+ cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+ sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>;
+ };
+
+ A53_2: cpu@102 {
+ compatible = "arm,cortex-a53","arm,armv8";
+ reg = <0x0 0x102>;
+ device_type = "cpu";
+ enable-method = "psci";
+ next-level-cache = <&A53_L2>;
+ clocks = <&scpi_dvfs 1>;
+ cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+ sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>;
+ };
+
+ A53_3: cpu@103 {
+ compatible = "arm,cortex-a53","arm,armv8";
+ reg = <0x0 0x103>;
+ device_type = "cpu";
+ enable-method = "psci";
+ next-level-cache = <&A53_L2>;
+ clocks = <&scpi_dvfs 1>;
+ cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+ sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>;
+ };
+
+ energy-costs {
+ CPU_COST_0: core-cost0 {
+ busy-cost-data = <
+ 417 168
+ 579 251
+ 744 359
+ 883 479
+ 1024 616
+ >;
+ idle-cost-data = <
+ 15
+ 0
+ >;
+ };
+ CPU_COST_1: core-cost1 {
+ busy-cost-data = <
+ 235 33
+ 302 46
+ 368 61
+ 406 76
+ 447 93
+ >;
+ idle-cost-data = <
+ 6
+ 0
+ >;
+ };
+ CLUSTER_COST_0: cluster-cost0 {
+ busy-cost-data = <
+ 417 24
+ 579 32
+ 744 43
+ 883 49
+ 1024 64
+ >;
+ idle-cost-data = <
+ 65
+ 24
+ >;
+ };
+ CLUSTER_COST_1: cluster-cost1 {
+ busy-cost-data = <
+ 235 26
+ 303 30
+ 368 39
+ 406 47
+ 447 57
+ >;
+ idle-cost-data = <
+ 56
+ 17
+ >;
+ };
+ };
+};
+
+===============================================================================
+[1] https://lkml.org/lkml/2015/5/12/728
+[2] Documentation/devicetree/bindings/topology.txt
+[3] Documentation/scheduler/sched-energy.txt
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index eb8a10e22f7c..cf6b64a65179 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -369,6 +369,8 @@ is not associated with a file:
[stack:1001] = the stack of the thread with tid 1001
[vdso] = the "virtual dynamic shared object",
the kernel system call handler
+ [anon:<name>] = an anonymous mapping that has been
+ named by userspace
or if empty, the mapping is anonymous.
@@ -415,25 +417,34 @@ Private_Dirty: 0 kB
Referenced: 892 kB
Anonymous: 0 kB
Swap: 0 kB
+SwapPss: 0 kB
KernelPageSize: 4 kB
MMUPageSize: 4 kB
Locked: 374 kB
VmFlags: rd ex mr mw me de
+Name: name from userspace
the first of these lines shows the same information as is displayed for the
mapping in /proc/PID/maps. The remaining lines show the size of the mapping
(size), the amount of the mapping that is currently resident in RAM (RSS), the
process' proportional share of this mapping (PSS), the number of clean and
-dirty private pages in the mapping. Note that even a page which is part of a
-MAP_SHARED mapping, but has only a single pte mapped, i.e. is currently used
-by only one process, is accounted as private and not as shared. "Referenced"
-indicates the amount of memory currently marked as referenced or accessed.
+dirty private pages in the mapping.
+
+The "proportional set size" (PSS) of a process is the count of pages it has
+in memory, where each page is divided by the number of processes sharing it.
+So if a process has 1000 pages all to itself, and 1000 shared with one other
+process, its PSS will be 1500.
+Note that even a page which is part of a MAP_SHARED mapping, but has only
+a single pte mapped, i.e. is currently used by only one process, is accounted
+as private and not as shared.
+"Referenced" indicates the amount of memory currently marked as referenced or
+accessed.
"Anonymous" shows the amount of memory that does not belong to any file. Even
a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE
and a page is modified, the file page is replaced by a private anonymous copy.
"Swap" shows how much would-be-anonymous memory is also used, but out on
swap.
-
+"SwapPss" shows proportional swap share of this mapping.
"VmFlags" field deserves a separate description. This member represents the kernel
flags associated with the particular virtual memory area in two letter encoded
manner. The codes are the following:
@@ -470,6 +481,9 @@ Note that there is no guarantee that every flag and associated mnemonic will
be present in all further kernel releases. Things get changed, the flags may
be vanished or the reverse -- new added.
+The "Name" field will only be present on a mapping that has been named by
+userspace, and will show the name passed in by userspace.
+
This file is only present if the CONFIG_MMU kernel configuration option is
enabled.
@@ -488,6 +502,10 @@ To clear the bits for the file mapped pages associated with the process
To clear the soft-dirty bit
> echo 4 > /proc/PID/clear_refs
+To reset the peak resident set size ("high water mark") to the process's
+current value:
+ > echo 5 > /proc/PID/clear_refs
+
Any other value written to /proc/PID/clear_refs will have no effect.
The /proc/pid/pagemap gives the PFN, which can be used to find the pageflags
diff --git a/Documentation/filesystems/squashfs.txt b/Documentation/filesystems/squashfs.txt
index 403c090aca39..e5274f84dc56 100644
--- a/Documentation/filesystems/squashfs.txt
+++ b/Documentation/filesystems/squashfs.txt
@@ -2,10 +2,10 @@ SQUASHFS 4.0 FILESYSTEM
=======================
Squashfs is a compressed read-only filesystem for Linux.
-It uses zlib/lzo/xz compression to compress files, inodes and directories.
-Inodes in the system are very small and all blocks are packed to minimise
-data overhead. Block sizes greater than 4K are supported up to a maximum
-of 1Mbytes (default block size 128K).
+It uses zlib, lz4, lzo, or xz compression to compress files, inodes and
+directories. Inodes in the system are very small and all blocks are packed to
+minimise data overhead. Block sizes greater than 4K are supported up to a
+maximum of 1Mbytes (default block size 128K).
Squashfs is intended for general read-only filesystem use, for archival
use (i.e. in cases where a .tar.gz file may be used), and in constrained
diff --git a/Documentation/kcov.txt b/Documentation/kcov.txt
new file mode 100644
index 000000000000..779ff4ab1c1d
--- /dev/null
+++ b/Documentation/kcov.txt
@@ -0,0 +1,111 @@
+kcov: code coverage for fuzzing
+===============================
+
+kcov exposes kernel code coverage information in a form suitable for coverage-
+guided fuzzing (randomized testing). Coverage data of a running kernel is
+exported via the "kcov" debugfs file. Coverage collection is enabled on a task
+basis, and thus it can capture precise coverage of a single system call.
+
+Note that kcov does not aim to collect as much coverage as possible. It aims
+to collect more or less stable coverage that is function of syscall inputs.
+To achieve this goal it does not collect coverage in soft/hard interrupts
+and instrumentation of some inherently non-deterministic parts of kernel is
+disbled (e.g. scheduler, locking).
+
+Usage:
+======
+
+Configure kernel with:
+
+ CONFIG_KCOV=y
+
+CONFIG_KCOV requires gcc built on revision 231296 or later.
+Profiling data will only become accessible once debugfs has been mounted:
+
+ mount -t debugfs none /sys/kernel/debug
+
+The following program demonstrates kcov usage from within a test program:
+
+#include <stdio.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#define KCOV_INIT_TRACE _IOR('c', 1, unsigned long)
+#define KCOV_ENABLE _IO('c', 100)
+#define KCOV_DISABLE _IO('c', 101)
+#define COVER_SIZE (64<<10)
+
+int main(int argc, char **argv)
+{
+ int fd;
+ unsigned long *cover, n, i;
+
+ /* A single fd descriptor allows coverage collection on a single
+ * thread.
+ */
+ fd = open("/sys/kernel/debug/kcov", O_RDWR);
+ if (fd == -1)
+ perror("open"), exit(1);
+ /* Setup trace mode and trace size. */
+ if (ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE))
+ perror("ioctl"), exit(1);
+ /* Mmap buffer shared between kernel- and user-space. */
+ cover = (unsigned long*)mmap(NULL, COVER_SIZE * sizeof(unsigned long),
+ PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ if ((void*)cover == MAP_FAILED)
+ perror("mmap"), exit(1);
+ /* Enable coverage collection on the current thread. */
+ if (ioctl(fd, KCOV_ENABLE, 0))
+ perror("ioctl"), exit(1);
+ /* Reset coverage from the tail of the ioctl() call. */
+ __atomic_store_n(&cover[0], 0, __ATOMIC_RELAXED);
+ /* That's the target syscal call. */
+ read(-1, NULL, 0);
+ /* Read number of PCs collected. */
+ n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED);
+ for (i = 0; i < n; i++)
+ printf("0x%lx\n", cover[i + 1]);
+ /* Disable coverage collection for the current thread. After this call
+ * coverage can be enabled for a different thread.
+ */
+ if (ioctl(fd, KCOV_DISABLE, 0))
+ perror("ioctl"), exit(1);
+ /* Free resources. */
+ if (munmap(cover, COVER_SIZE * sizeof(unsigned long)))
+ perror("munmap"), exit(1);
+ if (close(fd))
+ perror("close"), exit(1);
+ return 0;
+}
+
+After piping through addr2line output of the program looks as follows:
+
+SyS_read
+fs/read_write.c:562
+__fdget_pos
+fs/file.c:774
+__fget_light
+fs/file.c:746
+__fget_light
+fs/file.c:750
+__fget_light
+fs/file.c:760
+__fdget_pos
+fs/file.c:784
+SyS_read
+fs/read_write.c:562
+
+If a program needs to collect coverage from several threads (independently),
+it needs to open /sys/kernel/debug/kcov in each thread separately.
+
+The interface is fine-grained to allow efficient forking of test processes.
+That is, a parent process opens /sys/kernel/debug/kcov, enables trace mode,
+mmaps coverage buffer and then forks child processes in a loop. Child processes
+only need to enable coverage (disable happens automatically on thread end).
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 9abe55280cf6..37aca1383323 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -56,6 +56,7 @@ parameter is applicable:
BLACKFIN Blackfin architecture is enabled.
CLK Common clock infrastructure is enabled.
CMA Contiguous Memory Area support is enabled.
+ DM Device mapper support is enabled.
DRM Direct Rendering Management support is enabled.
DYNAMIC_DEBUG Build in debug messages and enable them at runtime
EDD BIOS Enhanced Disk Drive Services (EDD) is enabled
@@ -880,6 +881,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
Disable PIN 1 of APIC timer
Can be useful to work around chipset bugs.
+ dm= [DM] Allows early creation of a device-mapper device.
+ See Documentation/device-mapper/boot.txt.
+
+ dmasound= [HW,OSS] Sound subsystem buffers
+
dma_debug=off If the kernel is compiled with DMA_API_DEBUG support,
this option disables the debugging code at boot.
@@ -3164,6 +3170,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
ro [KNL] Mount root device read-only on boot
+ rodata= [KNL]
+ on Mark read-only kernel memory as read-only (default).
+ off Leave read-only kernel memory writable for debugging.
+
root= [KNL] Root filesystem
See name_to_dev_t comment in init/do_mounts.c.
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index a476b08a43e0..909d822a9622 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -60,7 +60,9 @@ fwmark_reflect - BOOLEAN
Controls the fwmark of kernel-generated IPv4 reply packets that are not
associated with a socket for example, TCP RSTs or ICMP echo replies).
If unset, these packets have a fwmark of zero. If set, they have the
- fwmark of the packet they are replying to.
+ fwmark of the packet they are replying to. Similarly affects the fwmark
+ used by internal routing lookups triggered by incoming packets, such as
+ the ones used for Path MTU Discovery.
Default: 0
route/max_size - INTEGER
@@ -516,6 +518,16 @@ tcp_fastopen - INTEGER
See include/net/tcp.h and the code for more details.
+tcp_fwmark_accept - BOOLEAN
+ If set, incoming connections to listening sockets that do not have a
+ socket mark will set the mark of the accepting socket to the fwmark of
+ the incoming SYN packet. This will cause all packets on that connection
+ (starting from the first SYNACK) to be sent with that fwmark. The
+ listening socket's mark is unchanged. Listening sockets that already
+ have a fwmark set via setsockopt(SOL_SOCKET, SO_MARK, ...) are
+ unaffected.
+ Default: 0
+
tcp_syn_retries - INTEGER
Number of times initial SYNs for an active TCP connection attempt
will be retransmitted. Should not be higher than 255. Default value
@@ -1212,7 +1224,9 @@ fwmark_reflect - BOOLEAN
Controls the fwmark of kernel-generated IPv6 reply packets that are not
associated with a socket for example, TCP RSTs or ICMPv6 echo replies).
If unset, these packets have a fwmark of zero. If set, they have the
- fwmark of the packet they are replying to.
+ fwmark of the packet they are replying to. Similarly affects the fwmark
+ used by internal routing lookups triggered by incoming packets, such as
+ the ones used for Path MTU Discovery.
Default: 0
conf/interface/*:
@@ -1262,11 +1276,20 @@ accept_ra_pinfo - BOOLEAN
Functional default: enabled if accept_ra is enabled.
disabled if accept_ra is disabled.
+accept_ra_rt_info_min_plen - INTEGER
+ Minimum prefix length of Route Information in RA.
+
+ Route Information w/ prefix smaller than this variable shall
+ be ignored.
+
+ Functional default: 0 if accept_ra_rtr_pref is enabled.
+ -1 if accept_ra_rtr_pref is disabled.
+
accept_ra_rt_info_max_plen - INTEGER
Maximum prefix length of Route Information in RA.
- Route Information w/ prefix larger than or equal to this
- variable shall be ignored.
+ Route Information w/ prefix larger than this variable shall
+ be ignored.
Functional default: 0 if accept_ra_rtr_pref is enabled.
-1 if accept_ra_rtr_pref is disabled.
@@ -1364,6 +1387,13 @@ router_solicitations - INTEGER
routers are present.
Default: 3
+use_oif_addrs_only - BOOLEAN
+ When enabled, the candidate source addresses for destinations
+ routed via this interface are restricted to the set of addresses
+ configured on this interface (vis. RFC 6724, section 4).
+
+ Default: false
+
use_tempaddr - INTEGER
Preference for Privacy Extensions (RFC3041).
<= 0 : disable Privacy Extensions
@@ -1466,6 +1496,19 @@ suppress_frag_ndisc - INTEGER
1 - (default) discard fragmented neighbor discovery packets
0 - allow fragmented neighbor discovery packets
+optimistic_dad - BOOLEAN
+ Whether to perform Optimistic Duplicate Address Detection (RFC 4429).
+ 0: disabled (default)
+ 1: enabled
+
+use_optimistic - BOOLEAN
+ If enabled, do not classify optimistic addresses as deprecated during
+ source address selection. Preferred addresses will still be chosen
+ before optimistic addresses, subject to other ranking in the source
+ address selection algorithm.
+ 0: disabled (default)
+ 1: enabled
+
icmp/*:
ratelimit - INTEGER
Limit the maximal rates for sending ICMPv6 packets.
diff --git a/Documentation/scheduler/sched-energy.txt b/Documentation/scheduler/sched-energy.txt
new file mode 100644
index 000000000000..dab2f9088b33
--- /dev/null
+++ b/Documentation/scheduler/sched-energy.txt
@@ -0,0 +1,362 @@
+Energy cost model for energy-aware scheduling (EXPERIMENTAL)
+
+Introduction
+=============
+
+The basic energy model uses platform energy data stored in sched_group_energy
+data structures attached to the sched_groups in the sched_domain hierarchy. The
+energy cost model offers two functions that can be used to guide scheduling
+decisions:
+
+1. static unsigned int sched_group_energy(struct energy_env *eenv)
+2. static int energy_diff(struct energy_env *eenv)
+
+sched_group_energy() estimates the energy consumed by all cpus in a specific
+sched_group including any shared resources owned exclusively by this group of
+cpus. Resources shared with other cpus are excluded (e.g. later level caches).
+
+energy_diff() estimates the total energy impact of a utilization change. That
+is, adding, removing, or migrating utilization (tasks).
+
+Both functions use a struct energy_env to specify the scenario to be evaluated:
+
+ struct energy_env {
+ struct sched_group *sg_top;
+ struct sched_group *sg_cap;
+ int cap_idx;
+ int util_delta;
+ int src_cpu;
+ int dst_cpu;
+ int energy;
+ };
+
+sg_top: sched_group to be evaluated. Not used by energy_diff().
+
+sg_cap: sched_group covering the cpus in the same frequency domain. Set by
+sched_group_energy().
+
+cap_idx: Capacity state to be used for energy calculations. Set by
+find_new_capacity().
+
+util_delta: Amount of utilization to be added, removed, or migrated.
+
+src_cpu: Source cpu from where 'util_delta' utilization is removed. Should be
+-1 if no source (e.g. task wake-up).
+
+dst_cpu: Destination cpu where 'util_delta' utilization is added. Should be -1
+if utilization is removed (e.g. terminating tasks).
+
+energy: Result of sched_group_energy().
+
+The metric used to represent utilization is the actual per-entity running time
+averaged over time using a geometric series. Very similar to the existing
+per-entity load-tracking, but _not_ scaled by task priority and capped by the
+capacity of the cpu. The latter property does mean that utilization may
+underestimate the compute requirements for task on fully/over utilized cpus.
+The greatest potential for energy savings without affecting performance too much
+is scenarios where the system isn't fully utilized. If the system is deemed
+fully utilized load-balancing should be done with task load (includes task
+priority) instead in the interest of fairness and performance.
+
+
+Background and Terminology
+===========================
+
+To make it clear from the start:
+
+energy = [joule] (resource like a battery on powered devices)
+power = energy/time = [joule/second] = [watt]
+
+The goal of energy-aware scheduling is to minimize energy, while still getting
+the job done. That is, we want to maximize:
+
+ performance [inst/s]
+ --------------------
+ power [W]
+
+which is equivalent to minimizing:
+
+ energy [J]
+ -----------
+ instruction
+
+while still getting 'good' performance. It is essentially an alternative
+optimization objective to the current performance-only objective for the
+scheduler. This alternative considers two objectives: energy-efficiency and
+performance. Hence, there needs to be a user controllable knob to switch the
+objective. Since it is early days, this is currently a sched_feature
+(ENERGY_AWARE).
+
+The idea behind introducing an energy cost model is to allow the scheduler to
+evaluate the implications of its decisions rather than applying energy-saving
+techniques blindly that may only have positive effects on some platforms. At
+the same time, the energy cost model must be as simple as possible to minimize
+the scheduler latency impact.
+
+Platform topology
+------------------
+
+The system topology (cpus, caches, and NUMA information, not peripherals) is
+represented in the scheduler by the sched_domain hierarchy which has
+sched_groups attached at each level that covers one or more cpus (see
+sched-domains.txt for more details). To add energy awareness to the scheduler
+we need to consider power and frequency domains.
+
+Power domain:
+
+A power domain is a part of the system that can be powered on/off
+independently. Power domains are typically organized in a hierarchy where you
+may be able to power down just a cpu or a group of cpus along with any
+associated resources (e.g. shared caches). Powering up a cpu means that all
+power domains it is a part of in the hierarchy must be powered up. Hence, it is
+more expensive to power up the first cpu that belongs to a higher level power
+domain than powering up additional cpus in the same high level domain. Two
+level power domain hierarchy example:
+
+ Power source
+ +-------------------------------+----...
+per group PD G G
+ | +----------+ |
+ +--------+-------| Shared | (other groups)
+per-cpu PD G G | resource |
+ | | +----------+
+ +-------+ +-------+
+ | CPU 0 | | CPU 1 |
+ +-------+ +-------+
+
+Frequency domain:
+
+Frequency domains (P-states) typically cover the same group of cpus as one of
+the power domain levels. That is, there might be several smaller power domains
+sharing the same frequency (P-state) or there might be a power domain spanning
+multiple frequency domains.
+
+From a scheduling point of view there is no need to know the actual frequencies
+[Hz]. All the scheduler cares about is the compute capacity available at the
+current state (P-state) the cpu is in and any other available states. For that
+reason, and to also factor in any cpu micro-architecture differences, compute
+capacity scaling states are called 'capacity states' in this document. For SMP
+systems this is equivalent to P-states. For mixed micro-architecture systems
+(like ARM big.LITTLE) it is P-states scaled according to the micro-architecture
+performance relative to the other cpus in the system.
+
+Energy modelling:
+------------------
+
+Due to the hierarchical nature of the power domains, the most obvious way to
+model energy costs is therefore to associate power and energy costs with
+domains (groups of cpus). Energy costs of shared resources are associated with
+the group of cpus that share the resources, only the cost of powering the
+cpu itself and any private resources (e.g. private L1 caches) is associated
+with the per-cpu groups (lowest level).
+
+For example, for an SMP system with per-cpu power domains and a cluster level
+(group of cpus) power domain we get the overall energy costs to be:
+
+ energy = energy_cluster + n * energy_cpu
+
+where 'n' is the number of cpus powered up and energy_cluster is the cost paid
+as soon as any cpu in the cluster is powered up.
+
+The power and frequency domains can naturally be mapped onto the existing
+sched_domain hierarchy and sched_groups by adding the necessary data to the
+existing data structures.
+
+The energy model considers energy consumption from two contributors (shown in
+the illustration below):
+
+1. Busy energy: Energy consumed while a cpu and the higher level groups that it
+belongs to are busy running tasks. Busy energy is associated with the state of
+the cpu, not an event. The time the cpu spends in this state varies. Thus, the
+most obvious platform parameter for this contribution is busy power
+(energy/time).
+
+2. Idle energy: Energy consumed while a cpu and higher level groups that it
+belongs to are idle (in a C-state). Like busy energy, idle energy is associated
+with the state of the cpu. Thus, the platform parameter for this contribution
+is idle power (energy/time).
+
+Energy consumed during transitions from an idle-state (C-state) to a busy state
+(P-state) or going the other way is ignored by the model to simplify the energy
+model calculations.
+
+
+ Power
+ ^
+ | busy->idle idle->busy
+ | transition transition
+ |
+ | _ __
+ | / \ / \__________________
+ |______________/ \ /
+ | \ /
+ | Busy \ Idle / Busy
+ | low P-state \____________/ high P-state
+ |
+ +------------------------------------------------------------> time
+
+Busy |--------------| |-----------------|
+
+Wakeup |------| |------|
+
+Idle |------------|
+
+
+The basic algorithm
+====================
+
+The basic idea is to determine the total energy impact when utilization is
+added or removed by estimating the impact at each level in the sched_domain
+hierarchy starting from the bottom (sched_group contains just a single cpu).
+The energy cost comes from busy time (sched_group is awake because one or more
+cpus are busy) and idle time (in an idle-state). Energy model numbers account
+for energy costs associated with all cpus in the sched_group as a group.
+
+ for_each_domain(cpu, sd) {
+ sg = sched_group_of(cpu)
+ energy_before = curr_util(sg) * busy_power(sg)
+ + (1-curr_util(sg)) * idle_power(sg)
+ energy_after = new_util(sg) * busy_power(sg)
+ + (1-new_util(sg)) * idle_power(sg)
+ energy_diff += energy_before - energy_after
+
+ }
+
+ return energy_diff
+
+{curr, new}_util: The cpu utilization at the lowest level and the overall
+non-idle time for the entire group for higher levels. Utilization is in the
+range 0.0 to 1.0 in the pseudo-code.
+
+busy_power: The power consumption of the sched_group.
+
+idle_power: The power consumption of the sched_group when idle.
+
+Note: It is a fundamental assumption that the utilization is (roughly) scale
+invariant. Task utilization tracking factors in any frequency scaling and
+performance scaling differences due to difference cpu microarchitectures such
+that task utilization can be used across the entire system.
+
+
+Platform energy data
+=====================
+
+struct sched_group_energy can be attached to sched_groups in the sched_domain
+hierarchy and has the following members:
+
+cap_states:
+ List of struct capacity_state representing the supported capacity states
+ (P-states). struct capacity_state has two members: cap and power, which
+ represents the compute capacity and the busy_power of the state. The
+ list must be ordered by capacity low->high.
+
+nr_cap_states:
+ Number of capacity states in cap_states list.
+
+idle_states:
+ List of struct idle_state containing idle_state power cost for each
+ idle-state supported by the system orderd by shallowest state first.
+ All states must be included at all level in the hierarchy, i.e. a
+ sched_group spanning just a single cpu must also include coupled
+ idle-states (cluster states). In addition to the cpuidle idle-states,
+ the list must also contain an entry for the idling using the arch
+ default idle (arch_idle_cpu()). Despite this state may not be a true
+ hardware idle-state it is considered the shallowest idle-state in the
+ energy model and must be the first entry. cpus may enter this state
+ (possibly 'active idling') if cpuidle decides not enter a cpuidle
+ idle-state. Default idle may not be used when cpuidle is enabled.
+ In this case, it should just be a copy of the first cpuidle idle-state.
+
+nr_idle_states:
+ Number of idle states in idle_states list.
+
+There are no unit requirements for the energy cost data. Data can be normalized
+with any reference, however, the normalization must be consistent across all
+energy cost data. That is, one bogo-joule/watt must be the same quantity for
+data, but we don't care what it is.
+
+A recipe for platform characterization
+=======================================
+
+Obtaining the actual model data for a particular platform requires some way of
+measuring power/energy. There isn't a tool to help with this (yet). This
+section provides a recipe for use as reference. It covers the steps used to
+characterize the ARM TC2 development platform. This sort of measurements is
+expected to be done anyway when tuning cpuidle and cpufreq for a given
+platform.
+
+The energy model needs two types of data (struct sched_group_energy holds
+these) for each sched_group where energy costs should be taken into account:
+
+1. Capacity state information
+
+A list containing the compute capacity and power consumption when fully
+utilized attributed to the group as a whole for each available capacity state.
+At the lowest level (group contains just a single cpu) this is the power of the
+cpu alone without including power consumed by resources shared with other cpus.
+It basically needs to fit the basic modelling approach described in "Background
+and Terminology" section:
+
+ energy_system = energy_shared + n * energy_cpu
+
+for a system containing 'n' busy cpus. Only 'energy_cpu' should be included at
+the lowest level. 'energy_shared' is included at the next level which
+represents the group of cpus among which the resources are shared.
+
+This model is, of course, a simplification of reality. Thus, power/energy
+attributions might not always exactly represent how the hardware is designed.
+Also, busy power is likely to depend on the workload. It is therefore
+recommended to use a representative mix of workloads when characterizing the
+capacity states.
+
+If the group has no capacity scaling support, the list will contain a single
+state where power is the busy power attributed to the group. The capacity
+should be set to a default value (1024).
+
+When frequency domains include multiple power domains, the group representing
+the frequency domain and all child groups share capacity states. This must be
+indicated by setting the SD_SHARE_CAP_STATES sched_domain flag. All groups at
+all levels that share the capacity state must have the list of capacity states
+with the power set to the contribution of the individual group.
+
+2. Idle power information
+
+Stored in the idle_states list. The power number is the group idle power
+consumption in each idle state as well when the group is idle but has not
+entered an idle-state ('active idle' as mentioned earlier). Due to the way the
+energy model is defined, the idle power of the deepest group idle state can
+alternatively be accounted for in the parent group busy power. In that case the
+group idle state power values are offset such that the idle power of the
+deepest state is zero. It is less intuitive, but it is easier to measure as
+idle power consumed by the group and the busy/idle power of the parent group
+cannot be distinguished without per group measurement points.
+
+Measuring capacity states and idle power:
+
+The capacity states' capacity and power can be estimated by running a benchmark
+workload at each available capacity state. By restricting the benchmark to run
+on subsets of cpus it is possible to extrapolate the power consumption of
+shared resources.
+
+ARM TC2 has two clusters of two and three cpus respectively. Each cluster has a
+shared L2 cache. TC2 has on-chip energy counters per cluster. Running a
+benchmark workload on just one cpu in a cluster means that power is consumed in
+the cluster (higher level group) and a single cpu (lowest level group). Adding
+another benchmark task to another cpu increases the power consumption by the
+amount consumed by the additional cpu. Hence, it is possible to extrapolate the
+cluster busy power.
+
+For platforms that don't have energy counters or equivalent instrumentation
+built-in, it may be possible to use an external DAQ to acquire similar data.
+
+If the benchmark includes some performance score (for example sysbench cpu
+benchmark), this can be used to record the compute capacity.
+
+Measuring idle power requires insight into the idle state implementation on the
+particular platform. Specifically, if the platform has coupled idle-states (or
+package states). To measure non-coupled per-cpu idle-states it is necessary to
+keep one cpu busy to keep any shared resources alive to isolate the idle power
+of the cpu from idle/busy power of the shared resources. The cpu can be tricked
+into different per-cpu idle states by disabling the other states. Based on
+various combinations of measurements with specific cpus busy and disabling
+idle-states it is possible to extrapolate the idle-state power.
diff --git a/Documentation/scheduler/sched-tune.txt b/Documentation/scheduler/sched-tune.txt
new file mode 100644
index 000000000000..cb795e643eba
--- /dev/null
+++ b/Documentation/scheduler/sched-tune.txt
@@ -0,0 +1,367 @@
+ Central, scheduler-driven, power-performance control
+ (EXPERIMENTAL)
+
+Abstract
+========
+
+The topic of a single simple power-performance tunable, that is wholly
+scheduler centric, and has well defined and predictable properties has come up
+on several occasions in the past [1,2]. With techniques such as a scheduler
+driven DVFS [3], we now have a good framework for implementing such a tunable.
+This document describes the overall ideas behind its design and implementation.
+
+
+Table of Contents
+=================
+
+1. Motivation
+2. Introduction
+3. Signal Boosting Strategy
+4. OPP selection using boosted CPU utilization
+5. Per task group boosting
+6. Question and Answers
+ - What about "auto" mode?
+ - What about boosting on a congested system?
+ - How CPUs are boosted when we have tasks with multiple boost values?
+7. References
+
+
+1. Motivation
+=============
+
+Sched-DVFS [3] is a new event-driven cpufreq governor which allows the
+scheduler to select the optimal DVFS operating point (OPP) for running a task
+allocated to a CPU. The introduction of sched-DVFS enables running workloads at
+the most energy efficient OPPs.
+
+However, sometimes it may be desired to intentionally boost the performance of
+a workload even if that could imply a reasonable increase in energy
+consumption. For example, in order to reduce the response time of a task, we
+may want to run the task at a higher OPP than the one that is actually required
+by it's CPU bandwidth demand.
+
+This last requirement is especially important if we consider that one of the
+main goals of the sched-DVFS component is to replace all currently available
+CPUFreq policies. Since sched-DVFS is event based, as opposed to the sampling
+driven governors we currently have, it is already more responsive at selecting
+the optimal OPP to run tasks allocated to a CPU. However, just tracking the
+actual task load demand may not be enough from a performance standpoint. For
+example, it is not possible to get behaviors similar to those provided by the
+"performance" and "interactive" CPUFreq governors.
+
+This document describes an implementation of a tunable, stacked on top of the
+sched-DVFS which extends its functionality to support task performance
+boosting.
+
+By "performance boosting" we mean the reduction of the time required to
+complete a task activation, i.e. the time elapsed from a task wakeup to its
+next deactivation (e.g. because it goes back to sleep or it terminates). For
+example, if we consider a simple periodic task which executes the same workload
+for 5[s] every 20[s] while running at a certain OPP, a boosted execution of
+that task must complete each of its activations in less than 5[s].
+
+A previous attempt [5] to introduce such a boosting feature has not been
+successful mainly because of the complexity of the proposed solution. The
+approach described in this document exposes a single simple interface to
+user-space. This single tunable knob allows the tuning of system wide
+scheduler behaviours ranging from energy efficiency at one end through to
+incremental performance boosting at the other end. This first tunable affects
+all tasks. However, a more advanced extension of the concept is also provided
+which uses CGroups to boost the performance of only selected tasks while using
+the energy efficient default for all others.
+
+The rest of this document introduces in more details the proposed solution
+which has been named SchedTune.
+
+
+2. Introduction
+===============
+
+SchedTune exposes a simple user-space interface with a single power-performance
+tunable:
+
+ /proc/sys/kernel/sched_cfs_boost
+
+This permits expressing a boost value as an integer in the range [0..100].
+
+A value of 0 (default) configures the CFS scheduler for maximum energy
+efficiency. This means that sched-DVFS runs the tasks at the minimum OPP
+required to satisfy their workload demand.
+A value of 100 configures scheduler for maximum performance, which translates
+to the selection of the maximum OPP on that CPU.
+
+The range between 0 and 100 can be set to satisfy other scenarios suitably. For
+example to satisfy interactive response or depending on other system events
+(battery level etc).
+
+A CGroup based extension is also provided, which permits further user-space
+defined task classification to tune the scheduler for different goals depending
+on the specific nature of the task, e.g. background vs interactive vs
+low-priority.
+
+The overall design of the SchedTune module is built on top of "Per-Entity Load
+Tracking" (PELT) signals and sched-DVFS by introducing a bias on the Operating
+Performance Point (OPP) selection.
+Each time a task is allocated on a CPU, sched-DVFS has the opportunity to tune
+the operating frequency of that CPU to better match the workload demand. The
+selection of the actual OPP being activated is influenced by the global boost
+value, or the boost value for the task CGroup when in use.
+
+This simple biasing approach leverages existing frameworks, which means minimal
+modifications to the scheduler, and yet it allows to achieve a range of
+different behaviours all from a single simple tunable knob.
+The only new concept introduced is that of signal boosting.
+
+
+3. Signal Boosting Strategy
+===========================
+
+The whole PELT machinery works based on the value of a few load tracking signals
+which basically track the CPU bandwidth requirements for tasks and the capacity
+of CPUs. The basic idea behind the SchedTune knob is to artificially inflate
+some of these load tracking signals to make a task or RQ appears more demanding
+that it actually is.
+
+Which signals have to be inflated depends on the specific "consumer". However,
+independently from the specific (signal, consumer) pair, it is important to
+define a simple and possibly consistent strategy for the concept of boosting a
+signal.
+
+A boosting strategy defines how the "abstract" user-space defined
+sched_cfs_boost value is translated into an internal "margin" value to be added
+to a signal to get its inflated value:
+
+ margin := boosting_strategy(sched_cfs_boost, signal)
+ boosted_signal := signal + margin
+
+Different boosting strategies were identified and analyzed before selecting the
+one found to be most effective.
+
+Signal Proportional Compensation (SPC)
+--------------------------------------
+
+In this boosting strategy the sched_cfs_boost value is used to compute a
+margin which is proportional to the complement of the original signal.
+When a signal has a maximum possible value, its complement is defined as
+the delta from the actual value and its possible maximum.
+
+Since the tunable implementation uses signals which have SCHED_LOAD_SCALE as
+the maximum possible value, the margin becomes:
+
+ margin := sched_cfs_boost * (SCHED_LOAD_SCALE - signal)
+
+Using this boosting strategy:
+- a 100% sched_cfs_boost means that the signal is scaled to the maximum value
+- each value in the range of sched_cfs_boost effectively inflates the signal in
+ question by a quantity which is proportional to the maximum value.
+
+For example, by applying the SPC boosting strategy to the selection of the OPP
+to run a task it is possible to achieve these behaviors:
+
+- 0% boosting: run the task at the minimum OPP required by its workload
+- 100% boosting: run the task at the maximum OPP available for the CPU
+- 50% boosting: run at the half-way OPP between minimum and maximum
+
+Which means that, at 50% boosting, a task will be scheduled to run at half of
+the maximum theoretically achievable performance on the specific target
+platform.
+
+A graphical representation of an SPC boosted signal is represented in the
+following figure where:
+ a) "-" represents the original signal
+ b) "b" represents a 50% boosted signal
+ c) "p" represents a 100% boosted signal
+
+
+ ^
+ | SCHED_LOAD_SCALE
+ +-----------------------------------------------------------------+
+ |pppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp
+ |
+ | boosted_signal
+ | bbbbbbbbbbbbbbbbbbbbbbbb
+ |
+ | original signal
+ | bbbbbbbbbbbbbbbbbbbbbbbb+----------------------+
+ | |
+ |bbbbbbbbbbbbbbbbbb |
+ | |
+ | |
+ | |
+ | +-----------------------+
+ | |
+ | |
+ | |
+ |------------------+
+ |
+ |
+ +----------------------------------------------------------------------->
+
+The plot above shows a ramped load signal (titled 'original_signal') and it's
+boosted equivalent. For each step of the original signal the boosted signal
+corresponding to a 50% boost is midway from the original signal and the upper
+bound. Boosting by 100% generates a boosted signal which is always saturated to
+the upper bound.
+
+
+4. OPP selection using boosted CPU utilization
+==============================================
+
+It is worth calling out that the implementation does not introduce any new load
+signals. Instead, it provides an API to tune existing signals. This tuning is
+done on demand and only in scheduler code paths where it is sensible to do so.
+The new API calls are defined to return either the default signal or a boosted
+one, depending on the value of sched_cfs_boost. This is a clean an non invasive
+modification of the existing existing code paths.
+
+The signal representing a CPU's utilization is boosted according to the
+previously described SPC boosting strategy. To sched-DVFS, this allows a CPU
+(ie CFS run-queue) to appear more used then it actually is.
+
+Thus, with the sched_cfs_boost enabled we have the following main functions to
+get the current utilization of a CPU:
+
+ cpu_util()
+ boosted_cpu_util()
+
+The new boosted_cpu_util() is similar to the first but returns a boosted
+utilization signal which is a function of the sched_cfs_boost value.
+
+This function is used in the CFS scheduler code paths where sched-DVFS needs to
+decide the OPP to run a CPU at.
+For example, this allows selecting the highest OPP for a CPU which has
+the boost value set to 100%.
+
+
+5. Per task group boosting
+==========================
+
+The availability of a single knob which is used to boost all tasks in the
+system is certainly a simple solution but it quite likely doesn't fit many
+utilization scenarios, especially in the mobile device space.
+
+For example, on battery powered devices there usually are many background
+services which are long running and need energy efficient scheduling. On the
+other hand, some applications are more performance sensitive and require an
+interactive response and/or maximum performance, regardless of the energy cost.
+To better service such scenarios, the SchedTune implementation has an extension
+that provides a more fine grained boosting interface.
+
+A new CGroup controller, namely "schedtune", could be enabled which allows to
+defined and configure task groups with different boosting values.
+Tasks that require special performance can be put into separate CGroups.
+The value of the boost associated with the tasks in this group can be specified
+using a single knob exposed by the CGroup controller:
+
+ schedtune.boost
+
+This knob allows the definition of a boost value that is to be used for
+SPC boosting of all tasks attached to this group.
+
+The current schedtune controller implementation is really simple and has these
+main characteristics:
+
+ 1) It is only possible to create 1 level depth hierarchies
+
+ The root control groups define the system-wide boost value to be applied
+ by default to all tasks. Its direct subgroups are named "boost groups" and
+ they define the boost value for specific set of tasks.
+ Further nested subgroups are not allowed since they do not have a sensible
+ meaning from a user-space standpoint.
+
+ 2) It is possible to define only a limited number of "boost groups"
+
+ This number is defined at compile time and by default configured to 16.
+ This is a design decision motivated by two main reasons:
+ a) In a real system we do not expect utilization scenarios with more then few
+ boost groups. For example, a reasonable collection of groups could be
+ just "background", "interactive" and "performance".
+ b) It simplifies the implementation considerably, especially for the code
+ which has to compute the per CPU boosting once there are multiple
+ RUNNABLE tasks with different boost values.
+
+Such a simple design should allow servicing the main utilization scenarios identified
+so far. It provides a simple interface which can be used to manage the
+power-performance of all tasks or only selected tasks.
+Moreover, this interface can be easily integrated by user-space run-times (e.g.
+Android, ChromeOS) to implement a QoS solution for task boosting based on tasks
+classification, which has been a long standing requirement.
+
+Setup and usage
+---------------
+
+0. Use a kernel with CGROUP_SCHEDTUNE support enabled
+
+1. Check that the "schedtune" CGroup controller is available:
+
+ root@linaro-nano:~# cat /proc/cgroups
+ #subsys_name hierarchy num_cgroups enabled
+ cpuset 0 1 1
+ cpu 0 1 1
+ schedtune 0 1 1
+
+2. Mount a tmpfs to create the CGroups mount point (Optional)
+
+ root@linaro-nano:~# sudo mount -t tmpfs cgroups /sys/fs/cgroup
+
+3. Mount the "schedtune" controller
+
+ root@linaro-nano:~# mkdir /sys/fs/cgroup/stune
+ root@linaro-nano:~# sudo mount -t cgroup -o schedtune stune /sys/fs/cgroup/stune
+
+4. Setup the system-wide boost value (Optional)
+
+ If not configured the root control group has a 0% boost value, which
+ basically disables boosting for all tasks in the system thus running in
+ an energy-efficient mode.
+
+ root@linaro-nano:~# echo $SYSBOOST > /sys/fs/cgroup/stune/schedtune.boost
+
+5. Create task groups and configure their specific boost value (Optional)
+
+ For example here we create a "performance" boost group configure to boost
+ all its tasks to 100%
+
+ root@linaro-nano:~# mkdir /sys/fs/cgroup/stune/performance
+ root@linaro-nano:~# echo 100 > /sys/fs/cgroup/stune/performance/schedtune.boost
+
+6. Move tasks into the boost group
+
+ For example, the following moves the tasks with PID $TASKPID (and all its
+ threads) into the "performance" boost group.
+
+ root@linaro-nano:~# echo "TASKPID > /sys/fs/cgroup/stune/performance/cgroup.procs
+
+This simple configuration allows only the threads of the $TASKPID task to run,
+when needed, at the highest OPP in the most capable CPU of the system.
+
+
+6. Question and Answers
+=======================
+
+What about "auto" mode?
+-----------------------
+
+The 'auto' mode as described in [5] can be implemented by interfacing SchedTune
+with some suitable user-space element. This element could use the exposed
+system-wide or cgroup based interface.
+
+How are multiple groups of tasks with different boost values managed?
+---------------------------------------------------------------------
+
+The current SchedTune implementation keeps track of the boosted RUNNABLE tasks
+on a CPU. Once sched-DVFS selects the OPP to run a CPU at, the CPU utilization
+is boosted with a value which is the maximum of the boost values of the
+currently RUNNABLE tasks in its RQ.
+
+This allows sched-DVFS to boost a CPU only while there are boosted tasks ready
+to run and switch back to the energy efficient mode as soon as the last boosted
+task is dequeued.
+
+
+7. References
+=============
+[1] http://lwn.net/Articles/552889
+[2] http://lkml.org/lkml/2012/5/18/91
+[3] http://lkml.org/lkml/2015/6/26/620
+
diff --git a/Documentation/sync.txt b/Documentation/sync.txt
new file mode 100644
index 000000000000..a2d05e7fa193
--- /dev/null
+++ b/Documentation/sync.txt
@@ -0,0 +1,75 @@
+Motivation:
+
+In complicated DMA pipelines such as graphics (multimedia, camera, gpu, display)
+a consumer of a buffer needs to know when the producer has finished producing
+it. Likewise the producer needs to know when the consumer is finished with the
+buffer so it can reuse it. A particular buffer may be consumed by multiple
+consumers which will retain the buffer for different amounts of time. In
+addition, a consumer may consume multiple buffers atomically.
+The sync framework adds an API which allows synchronization between the
+producers and consumers in a generic way while also allowing platforms which
+have shared hardware synchronization primitives to exploit them.
+
+Goals:
+ * provide a generic API for expressing synchronization dependencies
+ * allow drivers to exploit hardware synchronization between hardware
+ blocks
+ * provide a userspace API that allows a compositor to manage
+ dependencies.
+ * provide rich telemetry data to allow debugging slowdowns and stalls of
+ the graphics pipeline.
+
+Objects:
+ * sync_timeline
+ * sync_pt
+ * sync_fence
+
+sync_timeline:
+
+A sync_timeline is an abstract monotonically increasing counter. In general,
+each driver/hardware block context will have one of these. They can be backed
+by the appropriate hardware or rely on the generic sw_sync implementation.
+Timelines are only ever created through their specific implementations
+(i.e. sw_sync.)
+
+sync_pt:
+
+A sync_pt is an abstract value which marks a point on a sync_timeline. Sync_pts
+have a single timeline parent. They have 3 states: active, signaled, and error.
+They start in active state and transition, once, to either signaled (when the
+timeline counter advances beyond the sync_pt’s value) or error state.
+
+sync_fence:
+
+Sync_fences are the primary primitives used by drivers to coordinate
+synchronization of their buffers. They are a collection of sync_pts which may
+or may not have the same timeline parent. A sync_pt can only exist in one fence
+and the fence's list of sync_pts is immutable once created. Fences can be
+waited on synchronously or asynchronously. Two fences can also be merged to
+create a third fence containing a copy of the two fences’ sync_pts. Fences are
+backed by file descriptors to allow userspace to coordinate the display pipeline
+dependencies.
+
+Use:
+
+A driver implementing sync support should have a work submission function which:
+ * takes a fence argument specifying when to begin work
+ * asynchronously queues that work to kick off when the fence is signaled
+ * returns a fence to indicate when its work will be done.
+ * signals the returned fence once the work is completed.
+
+Consider an imaginary display driver that has the following API:
+/*
+ * assumes buf is ready to be displayed.
+ * blocks until the buffer is on screen.
+ */
+ void display_buffer(struct dma_buf *buf);
+
+The new API will become:
+/*
+ * will display buf when fence is signaled.
+ * returns immediately with a fence that will signal when buf
+ * is no longer displayed.
+ */
+struct sync_fence* display_buffer(struct dma_buf *buf,
+ struct sync_fence *fence);
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 260c39206ac2..64dd24bc3a16 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -54,8 +54,10 @@ show up in /proc/sys/kernel:
- overflowuid
- panic
- panic_on_oops
-- panic_on_unrecovered_nmi
- panic_on_stackoverflow
+- panic_on_unrecovered_nmi
+- perf_cpu_time_max_percent
+- perf_event_paranoid
- pid_max
- powersave-nap [ PPC only ]
- printk
@@ -527,19 +529,6 @@ the recommended setting is 60.
==============================================================
-panic_on_unrecovered_nmi:
-
-The default Linux behaviour on an NMI of either memory or unknown is
-to continue operation. For many environments such as scientific
-computing it is preferable that the box is taken out and the error
-dealt with than an uncorrected parity/ECC error get propagated.
-
-A small number of systems do generate NMI's for bizarre random reasons
-such as power management so the default is off. That sysctl works like
-the existing panic controls already in that directory.
-
-==============================================================
-
panic_on_oops:
Controls the kernel's behaviour when an oops or BUG is encountered.
@@ -563,6 +552,19 @@ This file shows up if CONFIG_DEBUG_STACKOVERFLOW is enabled.
==============================================================
+panic_on_unrecovered_nmi:
+
+The default Linux behaviour on an NMI of either memory or unknown is
+to continue operation. For many environments such as scientific
+computing it is preferable that the box is taken out and the error
+dealt with than an uncorrected parity/ECC error get propagated.
+
+A small number of systems do generate NMI's for bizarre random reasons
+such as power management so the default is off. That sysctl works like
+the existing panic controls already in that directory.
+
+==============================================================
+
perf_cpu_time_max_percent:
Hints to the kernel how much CPU time it should be allowed to
@@ -589,6 +591,19 @@ allowed to execute.
==============================================================
+perf_event_paranoid:
+
+Controls use of the performance events system by unprivileged
+users (without CAP_SYS_ADMIN). The default value is 3 if
+CONFIG_SECURITY_PERF_EVENTS_RESTRICT is set, or 1 otherwise.
+
+ -1: Allow use of (almost) all events by all users
+>=0: Disallow raw tracepoint access by users without CAP_IOC_LOCK
+>=1: Disallow CPU event access by users without CAP_SYS_ADMIN
+>=2: Disallow kernel profiling by users without CAP_SYS_ADMIN
+>=3: Disallow all event access by users without CAP_SYS_ADMIN
+
+==============================================================
pid_max:
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 4415aa915681..ccc50a53c452 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -29,6 +29,7 @@ Currently, these files are in /proc/sys/vm:
- dirty_writeback_centisecs
- drop_caches
- extfrag_threshold
+- extra_free_kbytes
- hugepages_treat_as_movable
- hugetlb_shm_group
- laptop_mode
@@ -41,6 +42,8 @@ Currently, these files are in /proc/sys/vm:
- min_slab_ratio
- min_unmapped_ratio
- mmap_min_addr
+- mmap_rnd_bits
+- mmap_rnd_compat_bits
- nr_hugepages
- nr_overcommit_hugepages
- nr_trim_pages (only if CONFIG_MMU=n)
@@ -225,6 +228,21 @@ fragmentation index is <= extfrag_threshold. The default value is 500.
==============================================================
+extra_free_kbytes
+
+This parameter tells the VM to keep extra free memory between the threshold
+where background reclaim (kswapd) kicks in, and the threshold where direct
+reclaim (by allocating processes) kicks in.
+
+This is useful for workloads that require low latency memory allocations
+and have a bounded burstiness in memory allocations, for example a
+realtime application that receives and transmits network traffic
+(causing in-kernel memory allocations) with a maximum total message burst
+size of 200MB may need 200MB of extra free memory to avoid direct reclaim
+related latencies.
+
+==============================================================
+
hugepages_treat_as_movable
This parameter controls whether we can allocate hugepages from ZONE_MOVABLE
@@ -474,6 +492,33 @@ against future potential kernel bugs.
==============================================================
+mmap_rnd_bits:
+
+This value can be used to select the number of bits to use to
+determine the random offset to the base address of vma regions
+resulting from mmap allocations on architectures which support
+tuning address space randomization. This value will be bounded
+by the architecture's minimum and maximum supported values.
+
+This value can be changed after boot using the
+/proc/sys/vm/mmap_rnd_bits tunable
+
+==============================================================
+
+mmap_rnd_compat_bits:
+
+This value can be used to select the number of bits to use to
+determine the random offset to the base address of vma regions
+resulting from mmap allocations for applications run in
+compatibility mode on architectures which support tuning address
+space randomization. This value will be bounded by the
+architecture's minimum and maximum supported values.
+
+This value can be changed after boot using the
+/proc/sys/vm/mmap_rnd_compat_bits tunable
+
+==============================================================
+
nr_hugepages
Change the minimum size of the hugepage pool.
diff --git a/Documentation/trace/events-power.txt b/Documentation/trace/events-power.txt
index 21d514ced212..4d817d5acc40 100644
--- a/Documentation/trace/events-power.txt
+++ b/Documentation/trace/events-power.txt
@@ -25,6 +25,7 @@ cpufreq.
cpu_idle "state=%lu cpu_id=%lu"
cpu_frequency "state=%lu cpu_id=%lu"
+cpu_frequency_limits "min=%lu max=%lu cpu_id=%lu"
A suspend event is used to indicate the system going in and out of the
suspend mode:
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index 4da42616939f..0344f76da571 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -2043,6 +2043,35 @@ will produce:
1) 1.449 us | }
+You can disable the hierarchical function call formatting and instead print a
+flat list of function entry and return events. This uses the format described
+in the Output Formatting section and respects all the trace options that
+control that formatting. Hierarchical formatting is the default.
+
+ hierachical: echo nofuncgraph-flat > trace_options
+ flat: echo funcgraph-flat > trace_options
+
+ ie:
+
+ # tracer: function_graph
+ #
+ # entries-in-buffer/entries-written: 68355/68355 #P:2
+ #
+ # _-----=> irqs-off
+ # / _----=> need-resched
+ # | / _---=> hardirq/softirq
+ # || / _--=> preempt-depth
+ # ||| / delay
+ # TASK-PID CPU# |||| TIMESTAMP FUNCTION
+ # | | | |||| | |
+ sh-1806 [001] d... 198.843443: graph_ent: func=_raw_spin_lock
+ sh-1806 [001] d... 198.843445: graph_ent: func=__raw_spin_lock
+ sh-1806 [001] d..1 198.843447: graph_ret: func=__raw_spin_lock
+ sh-1806 [001] d..1 198.843449: graph_ret: func=_raw_spin_lock
+ sh-1806 [001] d..1 198.843451: graph_ent: func=_raw_spin_unlock_irqrestore
+ sh-1806 [001] d... 198.843453: graph_ret: func=_raw_spin_unlock_irqrestore
+
+
You might find other useful features for this tracer in the
following "dynamic ftrace" section such as tracing only specific
functions or tasks.
diff --git a/Documentation/vm/zsmalloc.txt b/Documentation/vm/zsmalloc.txt
new file mode 100644
index 000000000000..64ed63c4f69d
--- /dev/null
+++ b/Documentation/vm/zsmalloc.txt
@@ -0,0 +1,70 @@
+zsmalloc
+--------
+
+This allocator is designed for use with zram. Thus, the allocator is
+supposed to work well under low memory conditions. In particular, it
+never attempts higher order page allocation which is very likely to
+fail under memory pressure. On the other hand, if we just use single
+(0-order) pages, it would suffer from very high fragmentation --
+any object of size PAGE_SIZE/2 or larger would occupy an entire page.
+This was one of the major issues with its predecessor (xvmalloc).
+
+To overcome these issues, zsmalloc allocates a bunch of 0-order pages
+and links them together using various 'struct page' fields. These linked
+pages act as a single higher-order page i.e. an object can span 0-order
+page boundaries. The code refers to these linked pages as a single entity
+called zspage.
+
+For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE
+since this satisfies the requirements of all its current users (in the
+worst case, page is incompressible and is thus stored "as-is" i.e. in
+uncompressed form). For allocation requests larger than this size, failure
+is returned (see zs_malloc).
+
+Additionally, zs_malloc() does not return a dereferenceable pointer.
+Instead, it returns an opaque handle (unsigned long) which encodes actual
+location of the allocated object. The reason for this indirection is that
+zsmalloc does not keep zspages permanently mapped since that would cause
+issues on 32-bit systems where the VA region for kernel space mappings
+is very small. So, before using the allocating memory, the object has to
+be mapped using zs_map_object() to get a usable pointer and subsequently
+unmapped using zs_unmap_object().
+
+stat
+----
+
+With CONFIG_ZSMALLOC_STAT, we could see zsmalloc internal information via
+/sys/kernel/debug/zsmalloc/<user name>. Here is a sample of stat output:
+
+# cat /sys/kernel/debug/zsmalloc/zram0/classes
+
+ class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage
+ ..
+ ..
+ 9 176 0 1 186 129 8 4
+ 10 192 1 0 2880 2872 135 3
+ 11 208 0 1 819 795 42 2
+ 12 224 0 1 219 159 12 4
+ ..
+ ..
+
+
+class: index
+size: object size zspage stores
+almost_empty: the number of ZS_ALMOST_EMPTY zspages(see below)
+almost_full: the number of ZS_ALMOST_FULL zspages(see below)
+obj_allocated: the number of objects allocated
+obj_used: the number of objects allocated to the user
+pages_used: the number of pages allocated for the class
+pages_per_zspage: the number of 0-order pages to make a zspage
+
+We assign a zspage to ZS_ALMOST_EMPTY fullness group when:
+ n <= N / f, where
+n = number of allocated objects
+N = total number of objects zspage can store
+f = fullness_threshold_frac(ie, 4 at the moment)
+
+Similarly, we assign zspage to:
+ ZS_ALMOST_FULL when n > N / f
+ ZS_EMPTY when n == 0
+ ZS_FULL when n == N
diff --git a/MAINTAINERS b/MAINTAINERS
index 0b00192244bf..599ed28e5b63 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10511,6 +10511,7 @@ L: linux-mm@kvack.org
S: Maintained
F: mm/zsmalloc.c
F: include/linux/zsmalloc.h
+F: Documentation/vm/zsmalloc.txt
ZSWAP COMPRESSED SWAP CACHING
M: Seth Jennings <sjennings@variantweb.net>
diff --git a/Makefile b/Makefile
index 30f1369b7960..0d588e6e4b75 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@ VERSION = 3
PATCHLEVEL = 18
SUBLEVEL = 63
EXTRAVERSION =
-NAME = Diseased Newt
+NAME = Shuffling Zombie Juror
# *DOCUMENTATION*
# To see a list of typical targets execute "make help"
@@ -141,7 +141,7 @@ PHONY += $(MAKECMDGOALS) sub-make
$(filter-out _all sub-make $(CURDIR)/Makefile, $(MAKECMDGOALS)) _all: sub-make
@:
-sub-make: FORCE
+sub-make:
$(Q)$(MAKE) -C $(KBUILD_OUTPUT) KBUILD_SRC=$(CURDIR) \
-f $(CURDIR)/Makefile $(filter-out _all sub-make,$(MAKECMDGOALS))
@@ -377,6 +377,7 @@ LDFLAGS_MODULE =
CFLAGS_KERNEL =
AFLAGS_KERNEL =
CFLAGS_GCOV = -fprofile-arcs -ftest-coverage -fno-tree-loop-im
+CFLAGS_KCOV = -fsanitize-coverage=trace-pc
# Use USERINCLUDE when you must reference the UAPI directories only.
@@ -422,7 +423,7 @@ export MAKE AWK GENKSYMS INSTALLKERNEL PERL PYTHON UTS_MACHINE
export HOSTCXX HOSTCXXFLAGS LDFLAGS_MODULE CHECK CHECKFLAGS
export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS LDFLAGS
-export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_GCOV CFLAGS_KASAN
+export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_GCOV CFLAGS_KCOV CFLAGS_KASAN CFLAGS_UBSAN
export KBUILD_AFLAGS AFLAGS_KERNEL AFLAGS_MODULE
export KBUILD_AFLAGS_MODULE KBUILD_CFLAGS_MODULE KBUILD_LDFLAGS_MODULE
export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL
@@ -677,6 +678,14 @@ endif
endif
KBUILD_CFLAGS += $(stackp-flag)
+ifdef CONFIG_KCOV
+ ifeq ($(call cc-option, $(CFLAGS_KCOV)),)
+ $(warning Cannot use CONFIG_KCOV: \
+ -fsanitize-coverage=trace-pc is not supported by compiler)
+ CFLAGS_KCOV =
+ endif
+endif
+
ifeq ($(COMPILER),clang)
KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,)
KBUILD_CPPFLAGS += $(call cc-option,-Wno-unknown-warning-option,)
@@ -984,7 +993,7 @@ prepare1: prepare2 $(version_h) include/generated/utsrelease.h \
archprepare: archheaders archscripts prepare1 scripts_basic
-prepare0: archprepare FORCE
+prepare0: archprepare
$(Q)$(MAKE) $(build)=.
# All the preparing..
@@ -1034,7 +1043,7 @@ INSTALL_FW_PATH=$(INSTALL_MOD_PATH)/lib/firmware
export INSTALL_FW_PATH
PHONY += firmware_install
-firmware_install: FORCE
+firmware_install:
@mkdir -p $(objtree)/firmware
$(Q)$(MAKE) -f $(srctree)/scripts/Makefile.fwinst obj=firmware __fw_install
@@ -1056,7 +1065,7 @@ PHONY += archscripts
archscripts:
PHONY += __headers
-__headers: $(version_h) scripts_basic asm-generic archheaders archscripts FORCE
+__headers: $(version_h) scripts_basic asm-generic archheaders archscripts
$(Q)$(MAKE) $(build)=scripts build_unifdef
PHONY += headers_install_all
diff --git a/android/configs/README b/android/configs/README
new file mode 100644
index 000000000000..8798731f8904
--- /dev/null
+++ b/android/configs/README
@@ -0,0 +1,15 @@
+The files in this directory are meant to be used as a base for an Android
+kernel config. All devices should have the options in android-base.cfg enabled.
+While not mandatory, the options in android-recommended.cfg enable advanced
+Android features.
+
+Assuming you already have a minimalist defconfig for your device, a possible
+way to enable these options would be:
+
+ ARCH=<arch> scripts/kconfig/merge_config.sh <path_to>/<device>_defconfig android/configs/android-base.cfg android/configs/android-recommended.cfg
+
+This will generate a .config that can then be used to save a new defconfig or
+compile a new kernel with Android features enabled.
+
+Because there is no tool to consistently generate these config fragments,
+lets keep them alphabetically sorted instead of random.
diff --git a/android/configs/android-base-arm64.cfg b/android/configs/android-base-arm64.cfg
new file mode 100644
index 000000000000..43f23d6b5391
--- /dev/null
+++ b/android/configs/android-base-arm64.cfg
@@ -0,0 +1,5 @@
+# KEEP ALPHABETICALLY SORTED
+CONFIG_ARMV8_DEPRECATED=y
+CONFIG_CP15_BARRIER_EMULATION=y
+CONFIG_SETEND_EMULATION=y
+CONFIG_SWP_EMULATION=y
diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg
new file mode 100644
index 000000000000..e12488a6140d
--- /dev/null
+++ b/android/configs/android-base.cfg
@@ -0,0 +1,169 @@
+# KEEP ALPHABETICALLY SORTED
+# CONFIG_DEVKMEM is not set
+# CONFIG_DEVMEM is not set
+# CONFIG_FHANDLE is not set
+# CONFIG_INET_LRO is not set
+# CONFIG_NFSD is not set
+# CONFIG_NFS_FS is not set
+# CONFIG_OABI_COMPAT is not set
+# CONFIG_SYSVIPC is not set
+# CONFIG_USELIB is not set
+CONFIG_ANDROID=y
+CONFIG_ANDROID_BINDER_DEVICES=binder,hwbinder,vndbinder
+CONFIG_ANDROID_BINDER_IPC=y
+CONFIG_ANDROID_LOW_MEMORY_KILLER=y
+CONFIG_ASHMEM=y
+CONFIG_AUDIT=y
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_DEBUG=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_DEFAULT_SECURITY_SELINUX=y
+CONFIG_EMBEDDED=y
+CONFIG_FB=y
+CONFIG_HARDENED_USERCOPY=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_IKCONFIG=y
+CONFIG_IKCONFIG_PROC=y
+CONFIG_INET6_AH=y
+CONFIG_INET6_ESP=y
+CONFIG_INET6_IPCOMP=y
+CONFIG_INET=y
+CONFIG_INET_DIAG_DESTROY=y
+CONFIG_INET_ESP=y
+CONFIG_INET_XFRM_MODE_TUNNEL=y
+CONFIG_IP6_NF_FILTER=y
+CONFIG_IP6_NF_IPTABLES=y
+CONFIG_IP6_NF_MANGLE=y
+CONFIG_IP6_NF_RAW=y
+CONFIG_IP6_NF_TARGET_REJECT=y
+CONFIG_IPV6=y
+CONFIG_IPV6_MIP6=y
+CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_IPV6_OPTIMISTIC_DAD=y
+CONFIG_IPV6_ROUTER_PREF=y
+CONFIG_IPV6_ROUTE_INFO=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_NF_ARPFILTER=y
+CONFIG_IP_NF_ARPTABLES=y
+CONFIG_IP_NF_ARP_MANGLE=y
+CONFIG_IP_NF_FILTER=y
+CONFIG_IP_NF_IPTABLES=y
+CONFIG_IP_NF_MANGLE=y
+CONFIG_IP_NF_MATCH_AH=y
+CONFIG_IP_NF_MATCH_ECN=y
+CONFIG_IP_NF_MATCH_TTL=y
+CONFIG_IP_NF_NAT=y
+CONFIG_IP_NF_RAW=y
+CONFIG_IP_NF_SECURITY=y
+CONFIG_IP_NF_TARGET_MASQUERADE=y
+CONFIG_IP_NF_TARGET_NETMAP=y
+CONFIG_IP_NF_TARGET_REDIRECT=y
+CONFIG_IP_NF_TARGET_REJECT=y
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+CONFIG_MODVERSIONS=y
+CONFIG_NET=y
+CONFIG_NETDEVICES=y
+CONFIG_NETFILTER=y
+CONFIG_NETFILTER_XT_MATCH_COMMENT=y
+CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_CONNMARK=y
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
+CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_HELPER=y
+CONFIG_NETFILTER_XT_MATCH_IPRANGE=y
+CONFIG_NETFILTER_XT_MATCH_LENGTH=y
+CONFIG_NETFILTER_XT_MATCH_LIMIT=y
+CONFIG_NETFILTER_XT_MATCH_MAC=y
+CONFIG_NETFILTER_XT_MATCH_MARK=y
+CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y
+CONFIG_NETFILTER_XT_MATCH_POLICY=y
+CONFIG_NETFILTER_XT_MATCH_QTAGUID=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA2=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA=y
+CONFIG_NETFILTER_XT_MATCH_SOCKET=y
+CONFIG_NETFILTER_XT_MATCH_STATE=y
+CONFIG_NETFILTER_XT_MATCH_STATISTIC=y
+CONFIG_NETFILTER_XT_MATCH_STRING=y
+CONFIG_NETFILTER_XT_MATCH_TIME=y
+CONFIG_NETFILTER_XT_MATCH_U32=y
+CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y
+CONFIG_NETFILTER_XT_TARGET_CONNMARK=y
+CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
+CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y
+CONFIG_NETFILTER_XT_TARGET_MARK=y
+CONFIG_NETFILTER_XT_TARGET_NFLOG=y
+CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y
+CONFIG_NETFILTER_XT_TARGET_SECMARK=y
+CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
+CONFIG_NETFILTER_XT_TARGET_TPROXY=y
+CONFIG_NETFILTER_XT_TARGET_TRACE=y
+CONFIG_NET_CLS_ACT=y
+CONFIG_NET_CLS_U32=y
+CONFIG_NET_EMATCH=y
+CONFIG_NET_EMATCH_U32=y
+CONFIG_NET_KEY=y
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_HTB=y
+CONFIG_NF_CONNTRACK=y
+CONFIG_NF_CONNTRACK_AMANDA=y
+CONFIG_NF_CONNTRACK_EVENTS=y
+CONFIG_NF_CONNTRACK_FTP=y
+CONFIG_NF_CONNTRACK_H323=y
+CONFIG_NF_CONNTRACK_IPV4=y
+CONFIG_NF_CONNTRACK_IPV6=y
+CONFIG_NF_CONNTRACK_IRC=y
+CONFIG_NF_CONNTRACK_NETBIOS_NS=y
+CONFIG_NF_CONNTRACK_PPTP=y
+CONFIG_NF_CONNTRACK_SANE=y
+CONFIG_NF_CONNTRACK_SECMARK=y
+CONFIG_NF_CONNTRACK_TFTP=y
+CONFIG_NF_CT_NETLINK=y
+CONFIG_NF_CT_PROTO_DCCP=y
+CONFIG_NF_CT_PROTO_SCTP=y
+CONFIG_NF_CT_PROTO_UDPLITE=y
+CONFIG_NF_NAT=y
+CONFIG_NO_HZ=y
+CONFIG_PACKET=y
+CONFIG_PM_AUTOSLEEP=y
+CONFIG_PM_WAKELOCKS=y
+CONFIG_PPP=y
+CONFIG_PPPOLAC=y
+CONFIG_PPPOPNS=y
+CONFIG_PPP_BSDCOMP=y
+CONFIG_PPP_DEFLATE=y
+CONFIG_PPP_MPPE=y
+CONFIG_PREEMPT=y
+CONFIG_QFMT_V2=y
+CONFIG_QUOTA=y
+CONFIG_QUOTACTL=y
+CONFIG_QUOTA_NETLINK_INTERFACE=y
+CONFIG_QUOTA_TREE=y
+CONFIG_RESOURCE_COUNTERS=y
+CONFIG_RTC_CLASS=y
+CONFIG_RT_GROUP_SCHED=y
+CONFIG_SECCOMP=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y
+CONFIG_SECURITY_SELINUX=y
+CONFIG_STAGING=y
+CONFIG_SWITCH=y
+CONFIG_SYNC=y
+CONFIG_TUN=y
+CONFIG_UID_SYS_STATS=y
+CONFIG_UNIX=y
+CONFIG_USB_GADGET=y
+CONFIG_USB_CONFIGFS=y
+CONFIG_USB_CONFIGFS_F_FS=y
+CONFIG_USB_CONFIGFS_F_MTP=y
+CONFIG_USB_CONFIGFS_F_PTP=y
+CONFIG_USB_CONFIGFS_F_ACC=y
+CONFIG_USB_CONFIGFS_F_AUDIO_SRC=y
+CONFIG_USB_CONFIGFS_UEVENT=y
+CONFIG_XFRM_USER=y
diff --git a/android/configs/android-recommended.cfg b/android/configs/android-recommended.cfg
new file mode 100644
index 000000000000..a1c27c83cb60
--- /dev/null
+++ b/android/configs/android-recommended.cfg
@@ -0,0 +1,140 @@
+# KEEP ALPHABETICALLY SORTED
+# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
+# CONFIG_INPUT_MOUSE is not set
+# CONFIG_LEGACY_PTYS is not set
+# CONFIG_NF_CONNTRACK_SIP is not set
+# CONFIG_PM_WAKELOCKS_GC is not set
+# CONFIG_VT is not set
+CONFIG_ANDROID_TIMED_GPIO=y
+CONFIG_ARM_KERNMEM_PERMS=y
+CONFIG_ARM64_SW_TTBR0_PAN=y
+CONFIG_ASYMMETRIC_KEY_TYPE=y
+CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y
+CONFIG_BACKLIGHT_LCD_SUPPORT=y
+CONFIG_BLK_DEV_DM=y
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=8192
+CONFIG_CC_STACKPROTECTOR_STRONG=y
+CONFIG_COMPACTION=y
+CONFIG_CPU_SW_DOMAIN_PAN=y
+CONFIG_DEBUG_RODATA=y
+CONFIG_DM_CRYPT=y
+CONFIG_DM_UEVENT=y
+CONFIG_DM_VERITY=y
+CONFIG_DM_VERITY_FEC=y
+CONFIG_DRAGONRISE_FF=y
+CONFIG_ENABLE_DEFAULT_TRACERS=y
+CONFIG_EXT4_FS=y
+CONFIG_EXT4_FS_SECURITY=y
+CONFIG_FUSE_FS=y
+CONFIG_GREENASIA_FF=y
+CONFIG_HIDRAW=y
+CONFIG_HID_A4TECH=y
+CONFIG_HID_ACRUX=y
+CONFIG_HID_ACRUX_FF=y
+CONFIG_HID_APPLE=y
+CONFIG_HID_BELKIN=y
+CONFIG_HID_CHERRY=y
+CONFIG_HID_CHICONY=y
+CONFIG_HID_CYPRESS=y
+CONFIG_HID_DRAGONRISE=y
+CONFIG_HID_ELECOM=y
+CONFIG_HID_EMS_FF=y
+CONFIG_HID_EZKEY=y
+CONFIG_HID_GREENASIA=y
+CONFIG_HID_GYRATION=y
+CONFIG_HID_HOLTEK=y
+CONFIG_HID_KENSINGTON=y
+CONFIG_HID_KEYTOUCH=y
+CONFIG_HID_KYE=y
+CONFIG_HID_LCPOWER=y
+CONFIG_HID_LOGITECH=y
+CONFIG_HID_LOGITECH_DJ=y
+CONFIG_HID_MAGICMOUSE=y
+CONFIG_HID_MICROSOFT=y
+CONFIG_HID_MONTEREY=y
+CONFIG_HID_MULTITOUCH=y
+CONFIG_HID_NTRIG=y
+CONFIG_HID_ORTEK=y
+CONFIG_HID_PANTHERLORD=y
+CONFIG_HID_PETALYNX=y
+CONFIG_HID_PICOLCD=y
+CONFIG_HID_PRIMAX=y
+CONFIG_HID_PRODIKEYS=y
+CONFIG_HID_ROCCAT=y
+CONFIG_HID_SAITEK=y
+CONFIG_HID_SAMSUNG=y
+CONFIG_HID_SMARTJOYPLUS=y
+CONFIG_HID_SONY=y
+CONFIG_HID_SPEEDLINK=y
+CONFIG_HID_SUNPLUS=y
+CONFIG_HID_THRUSTMASTER=y
+CONFIG_HID_TIVO=y
+CONFIG_HID_TOPSEED=y
+CONFIG_HID_TWINHAN=y
+CONFIG_HID_UCLOGIC=y
+CONFIG_HID_WACOM=y
+CONFIG_HID_WALTOP=y
+CONFIG_HID_WIIMOTE=y
+CONFIG_HID_ZEROPLUS=y
+CONFIG_HID_ZYDACRON=y
+CONFIG_INPUT_EVDEV=y
+CONFIG_INPUT_GPIO=y
+CONFIG_INPUT_JOYSTICK=y
+CONFIG_INPUT_KEYCHORD=y
+CONFIG_INPUT_KEYRESET=y
+CONFIG_INPUT_MISC=y
+CONFIG_INPUT_TABLET=y
+CONFIG_INPUT_UINPUT=y
+CONFIG_ION=y
+CONFIG_JOYSTICK_XPAD=y
+CONFIG_JOYSTICK_XPAD_FF=y
+CONFIG_JOYSTICK_XPAD_LEDS=y
+CONFIG_KALLSYMS_ALL=y
+CONFIG_KSM=y
+CONFIG_LOGIG940_FF=y
+CONFIG_LOGIRUMBLEPAD2_FF=y
+CONFIG_LOGITECH_FF=y
+CONFIG_MD=y
+CONFIG_MEDIA_SUPPORT=y
+CONFIG_MSDOS_FS=y
+CONFIG_PANIC_TIMEOUT=5
+CONFIG_PANTHERLORD_FF=y
+CONFIG_PERF_EVENTS=y
+CONFIG_PKCS7_MESSAGE_PARSER=y
+CONFIG_PKCS7_TEST_KEY=y
+CONFIG_PM_DEBUG=y
+CONFIG_PM_RUNTIME=y
+CONFIG_PM_WAKELOCKS_LIMIT=0
+CONFIG_POWER_SUPPLY=y
+CONFIG_PSTORE=y
+CONFIG_PSTORE_CONSOLE=y
+CONFIG_PSTORE_RAM=y
+CONFIG_SCHEDSTATS=y
+CONFIG_SMARTJOYPLUS_FF=y
+CONFIG_SND=y
+CONFIG_SOUND=y
+CONFIG_SUSPEND_TIME=y
+CONFIG_TABLET_USB_ACECAD=y
+CONFIG_TABLET_USB_AIPTEK=y
+CONFIG_TABLET_USB_GTCO=y
+CONFIG_TABLET_USB_HANWANG=y
+CONFIG_TABLET_USB_KBTAB=y
+CONFIG_TABLET_USB_WACOM=y
+CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
+CONFIG_TASK_IO_ACCOUNTING=y
+CONFIG_TASK_XACCT=y
+CONFIG_TIMER_STATS=y
+CONFIG_TMPFS=y
+CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_UHID=y
+CONFIG_UID_STAT=y
+CONFIG_MEMORY_STATE_TIME=y
+CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
+CONFIG_USB_EHCI_HCD=y
+CONFIG_USB_HIDDEV=y
+CONFIG_USB_USBNET=y
+CONFIG_VFAT_FS=y
+CONFIG_X509_CERTIFICATE_PARSER=y
diff --git a/arch/Kconfig b/arch/Kconfig
index 05d7a8a458d5..c484676a9b8b 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -411,6 +411,15 @@ config CC_STACKPROTECTOR_STRONG
endchoice
+config HAVE_ARCH_WITHIN_STACK_FRAMES
+ bool
+ help
+ An architecture should select this if it can walk the kernel stack
+ frames to determine if an object is part of either the arguments
+ or local variables (i.e. that it excludes saved return addresses,
+ and similar) by implementing an inline arch_within_stack_frames(),
+ which is used by CONFIG_HARDENED_USERCOPY.
+
config HAVE_CONTEXT_TRACKING
bool
help
@@ -484,6 +493,74 @@ config HAVE_IRQ_EXIT_ON_IRQ_STACK
This spares a stack switch and improves cache usage on softirq
processing.
+config HAVE_ARCH_MMAP_RND_BITS
+ bool
+ help
+ An arch should select this symbol if it supports setting a variable
+ number of bits for use in establishing the base address for mmap
+ allocations, has MMU enabled and provides values for both:
+ - ARCH_MMAP_RND_BITS_MIN
+ - ARCH_MMAP_RND_BITS_MAX
+
+config ARCH_MMAP_RND_BITS_MIN
+ int
+
+config ARCH_MMAP_RND_BITS_MAX
+ int
+
+config ARCH_MMAP_RND_BITS_DEFAULT
+ int
+
+config ARCH_MMAP_RND_BITS
+ int "Number of bits to use for ASLR of mmap base address" if EXPERT
+ range ARCH_MMAP_RND_BITS_MIN ARCH_MMAP_RND_BITS_MAX
+ default ARCH_MMAP_RND_BITS_DEFAULT if ARCH_MMAP_RND_BITS_DEFAULT
+ default ARCH_MMAP_RND_BITS_MIN
+ depends on HAVE_ARCH_MMAP_RND_BITS
+ help
+ This value can be used to select the number of bits to use to
+ determine the random offset to the base address of vma regions
+ resulting from mmap allocations. This value will be bounded
+ by the architecture's minimum and maximum supported values.
+
+ This value can be changed after boot using the
+ /proc/sys/vm/mmap_rnd_bits tunable
+
+config HAVE_ARCH_MMAP_RND_COMPAT_BITS
+ bool
+ help
+ An arch should select this symbol if it supports running applications
+ in compatibility mode, supports setting a variable number of bits for
+ use in establishing the base address for mmap allocations, has MMU
+ enabled and provides values for both:
+ - ARCH_MMAP_RND_COMPAT_BITS_MIN
+ - ARCH_MMAP_RND_COMPAT_BITS_MAX
+
+config ARCH_MMAP_RND_COMPAT_BITS_MIN
+ int
+
+config ARCH_MMAP_RND_COMPAT_BITS_MAX
+ int
+
+config ARCH_MMAP_RND_COMPAT_BITS_DEFAULT
+ int
+
+config ARCH_MMAP_RND_COMPAT_BITS
+ int "Number of bits to use for ASLR of mmap base address for compatible applications" if EXPERT
+ range ARCH_MMAP_RND_COMPAT_BITS_MIN ARCH_MMAP_RND_COMPAT_BITS_MAX
+ default ARCH_MMAP_RND_COMPAT_BITS_DEFAULT if ARCH_MMAP_RND_COMPAT_BITS_DEFAULT
+ default ARCH_MMAP_RND_COMPAT_BITS_MIN
+ depends on HAVE_ARCH_MMAP_RND_COMPAT_BITS
+ help
+ This value can be used to select the number of bits to use to
+ determine the random offset to the base address of vma regions
+ resulting from mmap allocations for compatible applications This
+ value will be bounded by the architecture's minimum and maximum
+ supported values.
+
+ This value can be changed after boot using the
+ /proc/sys/vm/mmap_rnd_compat_bits tunable
+
#
# ABI hall of shame
#
diff --git a/arch/alpha/include/asm/thread_info.h b/arch/alpha/include/asm/thread_info.h
index 48bbea6898b3..d5b98ab514bb 100644
--- a/arch/alpha/include/asm/thread_info.h
+++ b/arch/alpha/include/asm/thread_info.h
@@ -27,8 +27,6 @@ struct thread_info {
int bpt_nsaved;
unsigned long bpt_addr[2]; /* breakpoint handling */
unsigned int bpt_insn[2];
-
- struct restart_block restart_block;
};
/*
@@ -40,9 +38,6 @@ struct thread_info {
.exec_domain = &default_exec_domain, \
.addr_limit = KERNEL_DS, \
.preempt_count = INIT_PREEMPT_COUNT, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c
index 6cec2881acbf..8dbfb15f1745 100644
--- a/arch/alpha/kernel/signal.c
+++ b/arch/alpha/kernel/signal.c
@@ -150,7 +150,7 @@ restore_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs)
struct switch_stack *sw = (struct switch_stack *)regs - 1;
long i, err = __get_user(regs->pc, &sc->sc_pc);
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
sw->r26 = (unsigned long) ret_from_sys_call;
diff --git a/arch/arc/include/asm/thread_info.h b/arch/arc/include/asm/thread_info.h
index 02bc5ec0fb2e..1163a1838ac1 100644
--- a/arch/arc/include/asm/thread_info.h
+++ b/arch/arc/include/asm/thread_info.h
@@ -46,7 +46,6 @@ struct thread_info {
struct exec_domain *exec_domain;/* execution domain */
__u32 cpu; /* current CPU */
unsigned long thr_ptr; /* TLS ptr */
- struct restart_block restart_block;
};
/*
@@ -62,9 +61,6 @@ struct thread_info {
.cpu = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/arc/kernel/signal.c b/arch/arc/kernel/signal.c
index a86d567f6c70..edda76fae83f 100644
--- a/arch/arc/kernel/signal.c
+++ b/arch/arc/kernel/signal.c
@@ -104,7 +104,7 @@ SYSCALL_DEFINE0(rt_sigreturn)
struct pt_regs *regs = current_pt_regs();
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
/* Since we stacked the signal on a word boundary,
* then 'sp' should be word aligned here. If it's
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 89c4b5ccc68d..31eb05c356ae 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -30,6 +30,8 @@ config ARM
select HAVE_ARCH_AUDITSYSCALL if (AEABI && !OABI_COMPAT)
select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL
select HAVE_ARCH_KGDB
+ select HAVE_ARCH_HARDENED_USERCOPY
+ select HAVE_ARCH_MMAP_RND_BITS if MMU
select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT)
select HAVE_ARCH_TRACEHOOK
select HAVE_BPF_JIT
@@ -296,6 +298,14 @@ config MMU
Select if you want MMU-based virtualised addressing space
support by paged memory management. If unsure, say 'Y'.
+config ARCH_MMAP_RND_BITS_MIN
+ default 8
+
+config ARCH_MMAP_RND_BITS_MAX
+ default 14 if PAGE_OFFSET=0x40000000
+ default 15 if PAGE_OFFSET=0x80000000
+ default 16
+
#
# The "ARM system type" choice list is ordered alphabetically by option
# text. Please add new entries in the option alphabetic order.
@@ -1686,6 +1696,21 @@ config HIGHPTE
bool "Allocate 2nd-level pagetables from highmem"
depends on HIGHMEM
+config CPU_SW_DOMAIN_PAN
+ bool "Enable use of CPU domains to implement privileged no-access"
+ depends on MMU && !ARM_LPAE
+ default y
+ help
+ Increase kernel security by ensuring that normal kernel accesses
+ are unable to access userspace addresses. This can help prevent
+ use-after-free bugs becoming an exploitable privilege escalation
+ by ensuring that magic values (such as LIST_POISON) will always
+ fault when dereferenced.
+
+ CPUs with low-vector mappings use a best-efforts implementation.
+ Their lower 1MB needs to remain accessible for the vectors, but
+ the remainder of userspace will become appropriately inaccessible.
+
config HW_PERF_EVENTS
bool "Enable hardware performance counter support for perf events"
depends on PERF_EVENTS
@@ -1790,6 +1815,15 @@ config XEN
help
Say Y if you want to run Linux in a Virtual Machine on Xen on ARM.
+config ARM_FLUSH_CONSOLE_ON_RESTART
+ bool "Force flush the console on restart"
+ help
+ If the console is locked while the system is rebooted, the messages
+ in the temporary logbuffer would not have propogated to all the
+ console drivers. This option forces the console lock to be
+ released if it failed to be acquired, which will cause all the
+ pending messages to be flushed.
+
endmenu
menu "Boot options"
@@ -1820,6 +1854,21 @@ config DEPRECATED_PARAM_STRUCT
This was deprecated in 2001 and announced to live on for 5 years.
Some old boot loaders still use this way.
+config BUILD_ARM_APPENDED_DTB_IMAGE
+ bool "Build a concatenated zImage/dtb by default"
+ depends on OF
+ help
+ Enabling this option will cause a concatenated zImage and list of
+ DTBs to be built by default (instead of a standalone zImage.)
+ The image will built in arch/arm/boot/zImage-dtb
+
+config BUILD_ARM_APPENDED_DTB_IMAGE_NAMES
+ string "Default dtb names"
+ depends on BUILD_ARM_APPENDED_DTB_IMAGE
+ help
+ Space separated list of names of dtbs to append when
+ building a concatenated zImage-dtb.
+
# Compressed boot loader in ROM. Yes, we really want to ask about
# TEXT and BSS so we preserve their values in the config files.
config ZBOOT_ROM_TEXT
diff --git a/arch/arm/Kconfig.debug b/arch/arm/Kconfig.debug
index 21c031fe76d8..02a0527fd10d 100644
--- a/arch/arm/Kconfig.debug
+++ b/arch/arm/Kconfig.debug
@@ -1297,6 +1297,14 @@ config EARLY_PRINTK
kernel low-level debugging functions. Add earlyprintk to your
kernel parameters to enable this console.
+config EARLY_PRINTK_DIRECT
+ bool "Early printk direct"
+ depends on DEBUG_LL
+ help
+ Say Y here if you want to have an early console using the
+ kernel low-level debugging functions and EARLY_PRINTK is
+ not early enough.
+
config OC_ETM
bool "On-chip ETM and ETB"
depends on ARM_AMBA && !CORESIGHT
diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index 93a30a285ad2..21eccd14c9b9 100644..100755
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -271,6 +271,7 @@ core-$(CONFIG_FPE_FASTFPE) += $(FASTFPE_OBJ)
core-$(CONFIG_VFP) += arch/arm/vfp/
core-$(CONFIG_XEN) += arch/arm/xen/
core-$(CONFIG_KVM_ARM_HOST) += arch/arm/kvm/
+core-$(CONFIG_VDSO) += arch/arm/vdso/
# If we have a machine-specific directory, then include it in the build.
core-y += arch/arm/kernel/ arch/arm/mm/ arch/arm/common/
@@ -286,6 +287,8 @@ libs-y := arch/arm/lib/ $(libs-y)
# Default target when executing plain make
ifeq ($(CONFIG_XIP_KERNEL),y)
KBUILD_IMAGE := xipImage
+else ifeq ($(CONFIG_BUILD_ARM_APPENDED_DTB_IMAGE),y)
+KBUILD_IMAGE := zImage-dtb
else
KBUILD_IMAGE := zImage
endif
@@ -327,6 +330,15 @@ dtbs: prepare scripts
dtbs_install:
$(Q)$(MAKE) $(dtbinst)=$(boot)/dts MACHINE=$(MACHINE)
+zImage-dtb: vmlinux scripts dtbs
+ $(Q)$(MAKE) $(build)=$(boot) MACHINE=$(MACHINE) $(boot)/$@
+
+PHONY += vdso_install
+vdso_install:
+ifeq ($(CONFIG_VDSO),y)
+ $(Q)$(MAKE) $(build)=arch/arm/vdso $@
+endif
+
# We use MRPROPER_FILES and CLEAN_FILES now
archclean:
$(Q)$(MAKE) $(clean)=$(boot)
@@ -351,4 +363,5 @@ define archhelp
echo ' Install using (your) ~/bin/$(INSTALLKERNEL) or'
echo ' (distribution) /sbin/$(INSTALLKERNEL) or'
echo ' install to $$(INSTALL_PATH) and run lilo'
+ echo ' vdso_install - Install unstripped vdso.so to $$(INSTALL_MOD_PATH)/vdso'
endef
diff --git a/arch/arm/boot/.gitignore b/arch/arm/boot/.gitignore
index 3c79f85975aa..ad7a0253ea96 100644
--- a/arch/arm/boot/.gitignore
+++ b/arch/arm/boot/.gitignore
@@ -4,3 +4,4 @@ xipImage
bootpImage
uImage
*.dtb
+zImage-dtb \ No newline at end of file
diff --git a/arch/arm/boot/Makefile b/arch/arm/boot/Makefile
index ec2f8065f955..d09b00ad5fb4 100644
--- a/arch/arm/boot/Makefile
+++ b/arch/arm/boot/Makefile
@@ -14,6 +14,7 @@
ifneq ($(MACHINE),)
include $(srctree)/$(MACHINE)/Makefile.boot
endif
+include $(srctree)/arch/arm/boot/dts/Makefile
# Note: the following conditions must always be true:
# ZRELADDR == virt_to_phys(PAGE_OFFSET + TEXT_OFFSET)
@@ -27,6 +28,14 @@ export ZRELADDR INITRD_PHYS PARAMS_PHYS
targets := Image zImage xipImage bootpImage uImage
+DTB_NAMES := $(subst $\",,$(CONFIG_BUILD_ARM_APPENDED_DTB_IMAGE_NAMES))
+ifneq ($(DTB_NAMES),)
+DTB_LIST := $(addsuffix .dtb,$(DTB_NAMES))
+else
+DTB_LIST := $(dtb-y)
+endif
+DTB_OBJS := $(addprefix $(obj)/dts/,$(DTB_LIST))
+
ifeq ($(CONFIG_XIP_KERNEL),y)
$(obj)/xipImage: vmlinux FORCE
@@ -55,6 +64,10 @@ $(obj)/zImage: $(obj)/compressed/vmlinux FORCE
$(call if_changed,objcopy)
@$(kecho) ' Kernel: $@ is ready'
+$(obj)/zImage-dtb: $(obj)/zImage $(DTB_OBJS) FORCE
+ $(call if_changed,cat)
+ @echo ' Kernel: $@ is ready'
+
endif
ifneq ($(LOADADDR),)
diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S
index 68be9017593d..3f25fac7a84c 100644
--- a/arch/arm/boot/compressed/head.S
+++ b/arch/arm/boot/compressed/head.S
@@ -730,6 +730,8 @@ __armv7_mmu_cache_on:
bic r6, r6, #1 << 31 @ 32-bit translation system
bic r6, r6, #3 << 0 @ use only ttbr0
mcrne p15, 0, r3, c2, c0, 0 @ load page table pointer
+ mcrne p15, 0, r0, c8, c7, 0 @ flush I,D TLBs
+ mcr p15, 0, r0, c7, c5, 4 @ ISB
mcrne p15, 0, r1, c3, c0, 0 @ load domain access control
mcrne p15, 0, r6, c2, c0, 2 @ load ttb control
#endif
diff --git a/arch/arm/boot/dts/Makefile b/arch/arm/boot/dts/Makefile
index 6e784fac5798..7fdb71cbca5c 100644
--- a/arch/arm/boot/dts/Makefile
+++ b/arch/arm/boot/dts/Makefile
@@ -517,7 +517,14 @@ dtb-$(CONFIG_MACH_DOVE) += dove-cm-a510.dtb \
dove-dove-db.dtb
dtb-$(CONFIG_ARCH_MEDIATEK) += mt6589-aquaris5.dtb
+DTB_NAMES := $(subst $\",,$(CONFIG_BUILD_ARM_APPENDED_DTB_IMAGE_NAMES))
+ifneq ($(DTB_NAMES),)
+DTB_LIST := $(addsuffix .dtb,$(DTB_NAMES))
+else
+DTB_LIST := $(dtb-y)
endif
-always := $(dtb-y)
+endif
+
+always := $(DTB_LIST)
clean-files := *.dtb
diff --git a/arch/arm/boot/dts/am335x-evmsk.dts b/arch/arm/boot/dts/am335x-evmsk.dts
index df5fee6b6b4b..87fc7a35e802 100644
--- a/arch/arm/boot/dts/am335x-evmsk.dts
+++ b/arch/arm/boot/dts/am335x-evmsk.dts
@@ -15,6 +15,7 @@
#include "am33xx.dtsi"
#include <dt-bindings/pwm/pwm.h>
+#include <dt-bindings/interrupt-controller/irq.h>
/ {
model = "TI AM335x EVM-SK";
@@ -647,6 +648,16 @@
cap-power-off-card;
pinctrl-names = "default";
pinctrl-0 = <&mmc2_pins>;
+
+ #address-cells = <1>;
+ #size-cells = <0>;
+ wlcore: wlcore@2 {
+ compatible = "ti,wl1271";
+ reg = <2>;
+ interrupt-parent = <&gpio1>;
+ interrupts = <31 IRQ_TYPE_LEVEL_HIGH>; /* gpio 31 */
+ ref-clock-frequency = <38400000>;
+ };
};
&mcasp1 {
diff --git a/arch/arm/boot/dts/omap3-cm-t3517.dts b/arch/arm/boot/dts/omap3-cm-t3517.dts
index d00502f4fd9b..0d7466d8a79b 100644
--- a/arch/arm/boot/dts/omap3-cm-t3517.dts
+++ b/arch/arm/boot/dts/omap3-cm-t3517.dts
@@ -133,4 +133,14 @@
non-removable;
bus-width = <4>;
cap-power-off-card;
+
+ #address-cells = <1>;
+ #size-cells = <0>;
+ wlcore: wlcore@2 {
+ compatible = "ti,wl1271";
+ reg = <2>;
+ interrupt-parent = <&gpio5>;
+ interrupts = <17 IRQ_TYPE_LEVEL_HIGH>; /* gpio 145 */
+ ref-clock-frequency = <38400000>;
+ };
};
diff --git a/arch/arm/boot/dts/omap3-cm-t3730.dts b/arch/arm/boot/dts/omap3-cm-t3730.dts
index b3f9a50b3bc8..ea88c628d770 100644
--- a/arch/arm/boot/dts/omap3-cm-t3730.dts
+++ b/arch/arm/boot/dts/omap3-cm-t3730.dts
@@ -60,4 +60,14 @@
non-removable;
bus-width = <4>;
cap-power-off-card;
+
+ #address-cells = <1>;
+ #size-cells = <0>;
+ wlcore: wlcore@2 {
+ compatible = "ti,wl1271";
+ reg = <2>;
+ interrupt-parent = <&gpio5>;
+ interrupts = <8 IRQ_TYPE_LEVEL_HIGH>; /* gpio 136 */
+ ref-clock-frequency = <38400000>;
+ };
};
diff --git a/arch/arm/boot/dts/omap3-evm-common.dtsi b/arch/arm/boot/dts/omap3-evm-common.dtsi
index c8747c7f1cc8..fa10b4c16b36 100644
--- a/arch/arm/boot/dts/omap3-evm-common.dtsi
+++ b/arch/arm/boot/dts/omap3-evm-common.dtsi
@@ -105,6 +105,16 @@
non-removable;
bus-width = <4>;
cap-power-off-card;
+
+ #address-cells = <1>;
+ #size-cells = <0>;
+ wlcore: wlcore@2 {
+ compatible = "ti,wl1271";
+ reg = <2>;
+ interrupt-parent = <&gpio5>;
+ interrupts = <21 IRQ_TYPE_LEVEL_HIGH>; /* gpio 149 */
+ ref-clock-frequency = <38400000>;
+ };
};
&twl_gpio {
diff --git a/arch/arm/boot/dts/omap3-zoom3.dts b/arch/arm/boot/dts/omap3-zoom3.dts
index 6644f516a42b..131448d86e67 100644
--- a/arch/arm/boot/dts/omap3-zoom3.dts
+++ b/arch/arm/boot/dts/omap3-zoom3.dts
@@ -195,6 +195,16 @@
cap-power-off-card;
pinctrl-names = "default";
pinctrl-0 = <&mmc3_pins &mmc3_2_pins>;
+
+ #address-cells = <1>;
+ #size-cells = <0>;
+ wlcore: wlcore@2 {
+ compatible = "ti,wl1271";
+ reg = <2>;
+ interrupt-parent = <&gpio6>;
+ interrupts = <2 IRQ_TYPE_LEVEL_HIGH>; /* gpio 162 */
+ ref-clock-frequency = <26000000>;
+ };
};
&uart1 {
diff --git a/arch/arm/boot/dts/omap4-panda-common.dtsi b/arch/arm/boot/dts/omap4-panda-common.dtsi
index 150513506c19..1228de5a05ed 100644
--- a/arch/arm/boot/dts/omap4-panda-common.dtsi
+++ b/arch/arm/boot/dts/omap4-panda-common.dtsi
@@ -450,6 +450,16 @@
non-removable;
bus-width = <4>;
cap-power-off-card;
+
+ #address-cells = <1>;
+ #size-cells = <0>;
+ wlcore: wlcore@2 {
+ compatible = "ti,wl1271";
+ reg = <2>;
+ interrupt-parent = <&gpio2>;
+ interrupts = <21 IRQ_TYPE_LEVEL_HIGH>; /* gpio 53 */
+ ref-clock-frequency = <38400000>;
+ };
};
&emif1 {
diff --git a/arch/arm/boot/dts/omap4-sdp.dts b/arch/arm/boot/dts/omap4-sdp.dts
index 3e1da43068f6..8c36f51f5780 100644
--- a/arch/arm/boot/dts/omap4-sdp.dts
+++ b/arch/arm/boot/dts/omap4-sdp.dts
@@ -487,6 +487,17 @@
non-removable;
bus-width = <4>;
cap-power-off-card;
+
+ #address-cells = <1>;
+ #size-cells = <0>;
+ wlcore: wlcore@2 {
+ compatible = "ti,wl1281";
+ reg = <2>;
+ interrupt-parent = <&gpio1>;
+ interrupts = <21 IRQ_TYPE_LEVEL_HIGH>; /* gpio 53 */
+ ref-clock-frequency = <26000000>;
+ tcxo-clock-frequency = <26000000>;
+ };
};
&emif1 {
diff --git a/arch/arm/boot/dts/omap4-var-som-om44-wlan.dtsi b/arch/arm/boot/dts/omap4-var-som-om44-wlan.dtsi
index cc66af419236..9bceeb7e1f03 100644
--- a/arch/arm/boot/dts/omap4-var-som-om44-wlan.dtsi
+++ b/arch/arm/boot/dts/omap4-var-som-om44-wlan.dtsi
@@ -65,4 +65,14 @@
bus-width = <4>;
cap-power-off-card;
status = "okay";
+
+ #address-cells = <1>;
+ #size-cells = <0>;
+ wlcore: wlcore@2 {
+ compatible = "ti,wl1271";
+ reg = <2>;
+ interrupt-parent = <&gpio2>;
+ interrupts = <9 IRQ_TYPE_LEVEL_HIGH>; /* gpio 41 */
+ ref-clock-frequency = <38400000>;
+ };
};
diff --git a/arch/arm/common/Kconfig b/arch/arm/common/Kconfig
index c3a4e9ceba34..f4a38a76db64 100644
--- a/arch/arm/common/Kconfig
+++ b/arch/arm/common/Kconfig
@@ -20,3 +20,7 @@ config SHARP_SCOOP
config TI_PRIV_EDMA
bool
+
+config FIQ_GLUE
+ bool
+ select FIQ
diff --git a/arch/arm/common/Makefile b/arch/arm/common/Makefile
index 70b1eff477b3..5748afda0681 100644
--- a/arch/arm/common/Makefile
+++ b/arch/arm/common/Makefile
@@ -4,6 +4,7 @@
obj-y += firmware.o
+obj-$(CONFIG_FIQ_GLUE) += fiq_glue.o fiq_glue_setup.o
obj-$(CONFIG_ICST) += icst.o
obj-$(CONFIG_SA1111) += sa1111.o
obj-$(CONFIG_DMABOUNCE) += dmabounce.o
diff --git a/arch/arm/common/fiq_glue.S b/arch/arm/common/fiq_glue.S
new file mode 100644
index 000000000000..24b42cec4813
--- /dev/null
+++ b/arch/arm/common/fiq_glue.S
@@ -0,0 +1,118 @@
+/*
+ * Copyright (C) 2008 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .text
+
+ .global fiq_glue_end
+
+ /* fiq stack: r0-r15,cpsr,spsr of interrupted mode */
+
+ENTRY(fiq_glue)
+ /* store pc, cpsr from previous mode, reserve space for spsr */
+ mrs r12, spsr
+ sub lr, lr, #4
+ subs r10, #1
+ bne nested_fiq
+
+ str r12, [sp, #-8]!
+ str lr, [sp, #-4]!
+
+ /* store r8-r14 from previous mode */
+ sub sp, sp, #(7 * 4)
+ stmia sp, {r8-r14}^
+ nop
+
+ /* store r0-r7 from previous mode */
+ stmfd sp!, {r0-r7}
+
+ /* setup func(data,regs) arguments */
+ mov r0, r9
+ mov r1, sp
+ mov r3, r8
+
+ mov r7, sp
+
+ /* Get sp and lr from non-user modes */
+ and r4, r12, #MODE_MASK
+ cmp r4, #USR_MODE
+ beq fiq_from_usr_mode
+
+ mov r7, sp
+ orr r4, r4, #(PSR_I_BIT | PSR_F_BIT)
+ msr cpsr_c, r4
+ str sp, [r7, #(4 * 13)]
+ str lr, [r7, #(4 * 14)]
+ mrs r5, spsr
+ str r5, [r7, #(4 * 17)]
+
+ cmp r4, #(SVC_MODE | PSR_I_BIT | PSR_F_BIT)
+ /* use fiq stack if we reenter this mode */
+ subne sp, r7, #(4 * 3)
+
+fiq_from_usr_mode:
+ msr cpsr_c, #(SVC_MODE | PSR_I_BIT | PSR_F_BIT)
+ mov r2, sp
+ sub sp, r7, #12
+ stmfd sp!, {r2, ip, lr}
+ /* call func(data,regs) */
+ blx r3
+ ldmfd sp, {r2, ip, lr}
+ mov sp, r2
+
+ /* restore/discard saved state */
+ cmp r4, #USR_MODE
+ beq fiq_from_usr_mode_exit
+
+ msr cpsr_c, r4
+ ldr sp, [r7, #(4 * 13)]
+ ldr lr, [r7, #(4 * 14)]
+ msr spsr_cxsf, r5
+
+fiq_from_usr_mode_exit:
+ msr cpsr_c, #(FIQ_MODE | PSR_I_BIT | PSR_F_BIT)
+
+ ldmfd sp!, {r0-r7}
+ ldr lr, [sp, #(4 * 7)]
+ ldr r12, [sp, #(4 * 8)]
+ add sp, sp, #(10 * 4)
+exit_fiq:
+ msr spsr_cxsf, r12
+ add r10, #1
+ cmp r11, #0
+ moveqs pc, lr
+ bx r11 /* jump to custom fiq return function */
+
+nested_fiq:
+ orr r12, r12, #(PSR_F_BIT)
+ b exit_fiq
+
+fiq_glue_end:
+
+ENTRY(fiq_glue_setup) /* func, data, sp, smc call number */
+ stmfd sp!, {r4}
+ mrs r4, cpsr
+ msr cpsr_c, #(FIQ_MODE | PSR_I_BIT | PSR_F_BIT)
+ movs r8, r0
+ mov r9, r1
+ mov sp, r2
+ mov r11, r3
+ moveq r10, #0
+ movne r10, #1
+ msr cpsr_c, r4
+ ldmfd sp!, {r4}
+ bx lr
+
diff --git a/arch/arm/common/fiq_glue_setup.c b/arch/arm/common/fiq_glue_setup.c
new file mode 100644
index 000000000000..8cb1b611c6d5
--- /dev/null
+++ b/arch/arm/common/fiq_glue_setup.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (C) 2010 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <asm/fiq.h>
+#include <asm/fiq_glue.h>
+
+extern unsigned char fiq_glue, fiq_glue_end;
+extern void fiq_glue_setup(void *func, void *data, void *sp,
+ fiq_return_handler_t fiq_return_handler);
+
+static struct fiq_handler fiq_debbuger_fiq_handler = {
+ .name = "fiq_glue",
+};
+DEFINE_PER_CPU(void *, fiq_stack);
+static struct fiq_glue_handler *current_handler;
+static fiq_return_handler_t fiq_return_handler;
+static DEFINE_MUTEX(fiq_glue_lock);
+
+static void fiq_glue_setup_helper(void *info)
+{
+ struct fiq_glue_handler *handler = info;
+ fiq_glue_setup(handler->fiq, handler,
+ __get_cpu_var(fiq_stack) + THREAD_START_SP,
+ fiq_return_handler);
+}
+
+int fiq_glue_register_handler(struct fiq_glue_handler *handler)
+{
+ int ret;
+ int cpu;
+
+ if (!handler || !handler->fiq)
+ return -EINVAL;
+
+ mutex_lock(&fiq_glue_lock);
+ if (fiq_stack) {
+ ret = -EBUSY;
+ goto err_busy;
+ }
+
+ for_each_possible_cpu(cpu) {
+ void *stack;
+ stack = (void *)__get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER);
+ if (WARN_ON(!stack)) {
+ ret = -ENOMEM;
+ goto err_alloc_fiq_stack;
+ }
+ per_cpu(fiq_stack, cpu) = stack;
+ }
+
+ ret = claim_fiq(&fiq_debbuger_fiq_handler);
+ if (WARN_ON(ret))
+ goto err_claim_fiq;
+
+ current_handler = handler;
+ on_each_cpu(fiq_glue_setup_helper, handler, true);
+ set_fiq_handler(&fiq_glue, &fiq_glue_end - &fiq_glue);
+
+ mutex_unlock(&fiq_glue_lock);
+ return 0;
+
+err_claim_fiq:
+err_alloc_fiq_stack:
+ for_each_possible_cpu(cpu) {
+ __free_pages(per_cpu(fiq_stack, cpu), THREAD_SIZE_ORDER);
+ per_cpu(fiq_stack, cpu) = NULL;
+ }
+err_busy:
+ mutex_unlock(&fiq_glue_lock);
+ return ret;
+}
+
+static void fiq_glue_update_return_handler(void (*fiq_return)(void))
+{
+ fiq_return_handler = fiq_return;
+ if (current_handler)
+ on_each_cpu(fiq_glue_setup_helper, current_handler, true);
+}
+
+int fiq_glue_set_return_handler(void (*fiq_return)(void))
+{
+ int ret;
+
+ mutex_lock(&fiq_glue_lock);
+ if (fiq_return_handler) {
+ ret = -EBUSY;
+ goto err_busy;
+ }
+ fiq_glue_update_return_handler(fiq_return);
+ ret = 0;
+err_busy:
+ mutex_unlock(&fiq_glue_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL(fiq_glue_set_return_handler);
+
+int fiq_glue_clear_return_handler(void (*fiq_return)(void))
+{
+ int ret;
+
+ mutex_lock(&fiq_glue_lock);
+ if (WARN_ON(fiq_return_handler != fiq_return)) {
+ ret = -EINVAL;
+ goto err_inval;
+ }
+ fiq_glue_update_return_handler(NULL);
+ ret = 0;
+err_inval:
+ mutex_unlock(&fiq_glue_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL(fiq_glue_clear_return_handler);
+
+/**
+ * fiq_glue_resume - Restore fiqs after suspend or low power idle states
+ *
+ * This must be called before calling local_fiq_enable after returning from a
+ * power state where the fiq mode registers were lost. If a driver provided
+ * a resume hook when it registered the handler it will be called.
+ */
+
+void fiq_glue_resume(void)
+{
+ if (!current_handler)
+ return;
+ fiq_glue_setup(current_handler->fiq, current_handler,
+ __get_cpu_var(fiq_stack) + THREAD_START_SP,
+ fiq_return_handler);
+ if (current_handler->resume)
+ current_handler->resume(current_handler);
+}
+
diff --git a/arch/arm/crypto/.gitignore b/arch/arm/crypto/.gitignore
index 6231d36b3635..31e1f538df7d 100644
--- a/arch/arm/crypto/.gitignore
+++ b/arch/arm/crypto/.gitignore
@@ -1 +1,3 @@
aesbs-core.S
+sha256-core.S
+sha512-core.S
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index b48fa341648d..2cee53b27238 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -6,12 +6,15 @@ obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
+obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
obj-$(CONFIG_CRYPTO_SHA512_ARM_NEON) += sha512-arm-neon.o
aes-arm-y := aes-armv4.o aes_glue.o
aes-arm-bs-y := aesbs-core.o aesbs-glue.o
sha1-arm-y := sha1-armv4-large.o sha1_glue.o
sha1-arm-neon-y := sha1-armv7-neon.o sha1_neon_glue.o
+sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
+sha256-arm-y := sha256-core.o sha256_glue.o $(sha256-arm-neon-y)
sha512-arm-neon-y := sha512-armv7-neon.o sha512_neon_glue.o
quiet_cmd_perl = PERL $@
@@ -20,4 +23,7 @@ quiet_cmd_perl = PERL $@
$(src)/aesbs-core.S_shipped: $(src)/bsaes-armv7.pl
$(call cmd,perl)
-.PRECIOUS: $(obj)/aesbs-core.S
+$(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
+ $(call cmd,perl)
+
+.PRECIOUS: $(obj)/aesbs-core.S $(obj)/sha256-core.S
diff --git a/arch/arm/crypto/sha256-armv4.pl b/arch/arm/crypto/sha256-armv4.pl
new file mode 100644
index 000000000000..fac0533ea633
--- /dev/null
+++ b/arch/arm/crypto/sha256-armv4.pl
@@ -0,0 +1,716 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Permission to use under GPL terms is granted.
+# ====================================================================
+
+# SHA256 block procedure for ARMv4. May 2007.
+
+# Performance is ~2x better than gcc 3.4 generated code and in "abso-
+# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
+# byte [on single-issue Xscale PXA250 core].
+
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 22% improvement on
+# Cortex A8 core and ~20 cycles per processed byte.
+
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 16%
+# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+# September 2013.
+#
+# Add NEON implementation. On Cortex A8 it was measured to process one
+# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+# code (meaning that latter performs sub-optimally, nothing was done
+# about it).
+
+# May 2014.
+#
+# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$ctx="r0"; $t0="r0";
+$inp="r1"; $t4="r1";
+$len="r2"; $t1="r2";
+$T1="r3"; $t3="r3";
+$A="r4";
+$B="r5";
+$C="r6";
+$D="r7";
+$E="r8";
+$F="r9";
+$G="r10";
+$H="r11";
+@V=($A,$B,$C,$D,$E,$F,$G,$H);
+$t2="r12";
+$Ktbl="r14";
+
+@Sigma0=( 2,13,22);
+@Sigma1=( 6,11,25);
+@sigma0=( 7,18, 3);
+@sigma1=(17,19,10);
+
+sub BODY_00_15 {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+
+$code.=<<___ if ($i<16);
+#if __ARM_ARCH__>=7
+ @ ldr $t1,[$inp],#4 @ $i
+# if $i==15
+ str $inp,[sp,#17*4] @ make room for $t4
+# endif
+ eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+ add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
+ eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
+# ifndef __ARMEB__
+ rev $t1,$t1
+# endif
+#else
+ @ ldrb $t1,[$inp,#3] @ $i
+ add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
+ ldrb $t2,[$inp,#2]
+ ldrb $t0,[$inp,#1]
+ orr $t1,$t1,$t2,lsl#8
+ ldrb $t2,[$inp],#4
+ orr $t1,$t1,$t0,lsl#16
+# if $i==15
+ str $inp,[sp,#17*4] @ make room for $t4
+# endif
+ eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+ orr $t1,$t1,$t2,lsl#24
+ eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
+#endif
+___
+$code.=<<___;
+ ldr $t2,[$Ktbl],#4 @ *K256++
+ add $h,$h,$t1 @ h+=X[i]
+ str $t1,[sp,#`$i%16`*4]
+ eor $t1,$f,$g
+ add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
+ and $t1,$t1,$e
+ add $h,$h,$t2 @ h+=K256[i]
+ eor $t1,$t1,$g @ Ch(e,f,g)
+ eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
+ add $h,$h,$t1 @ h+=Ch(e,f,g)
+#if $i==31
+ and $t2,$t2,#0xff
+ cmp $t2,#0xf2 @ done?
+#endif
+#if $i<15
+# if __ARM_ARCH__>=7
+ ldr $t1,[$inp],#4 @ prefetch
+# else
+ ldrb $t1,[$inp,#3]
+# endif
+ eor $t2,$a,$b @ a^b, b^c in next round
+#else
+ ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
+ eor $t2,$a,$b @ a^b, b^c in next round
+ ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
+#endif
+ eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
+ and $t3,$t3,$t2 @ (b^c)&=(a^b)
+ add $d,$d,$h @ d+=h
+ eor $t3,$t3,$b @ Maj(a,b,c)
+ add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
+ @ add $h,$h,$t3 @ h+=Maj(a,b,c)
+___
+ ($t2,$t3)=($t3,$t2);
+}
+
+sub BODY_16_XX {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+
+$code.=<<___;
+ @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
+ @ ldr $t4,[sp,#`($i+14)%16`*4]
+ mov $t0,$t1,ror#$sigma0[0]
+ add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
+ mov $t2,$t4,ror#$sigma1[0]
+ eor $t0,$t0,$t1,ror#$sigma0[1]
+ eor $t2,$t2,$t4,ror#$sigma1[1]
+ eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
+ ldr $t1,[sp,#`($i+0)%16`*4]
+ eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
+ ldr $t4,[sp,#`($i+9)%16`*4]
+
+ add $t2,$t2,$t0
+ eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
+ add $t1,$t1,$t2
+ eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
+ add $t1,$t1,$t4 @ X[i]
+___
+ &BODY_00_15(@_);
+}
+
+$code=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+.text
+#if __ARM_ARCH__<7
+.code 32
+#else
+.syntax unified
+# ifdef __thumb2__
+# define adrl adr
+.thumb
+# else
+.code 32
+# endif
+#endif
+
+.type K256,%object
+.align 5
+K256:
+.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size K256,.-K256
+.word 0 @ terminator
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-sha256_block_data_order
+#endif
+.align 5
+
+.global sha256_block_data_order
+.type sha256_block_data_order,%function
+sha256_block_data_order:
+#if __ARM_ARCH__<7
+ sub r3,pc,#8 @ sha256_block_data_order
+#else
+ adr r3,sha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+ ldr r12,.LOPENSSL_armcap
+ ldr r12,[r3,r12] @ OPENSSL_armcap_P
+ tst r12,#ARMV8_SHA256
+ bne .LARMv8
+ tst r12,#ARMV7_NEON
+ bne .LNEON
+#endif
+ add $len,$inp,$len,lsl#6 @ len to point at the end of inp
+ stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
+ ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
+ sub $Ktbl,r3,#256+32 @ K256
+ sub sp,sp,#16*4 @ alloca(X[16])
+.Loop:
+# if __ARM_ARCH__>=7
+ ldr $t1,[$inp],#4
+# else
+ ldrb $t1,[$inp,#3]
+# endif
+ eor $t3,$B,$C @ magic
+ eor $t2,$t2,$t2
+___
+for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
+$code.=".Lrounds_16_xx:\n";
+for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+#if __ARM_ARCH__>=7
+ ite eq @ Thumb2 thing, sanity check in ARM
+#endif
+ ldreq $t3,[sp,#16*4] @ pull ctx
+ bne .Lrounds_16_xx
+
+ add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
+ ldr $t0,[$t3,#0]
+ ldr $t1,[$t3,#4]
+ ldr $t2,[$t3,#8]
+ add $A,$A,$t0
+ ldr $t0,[$t3,#12]
+ add $B,$B,$t1
+ ldr $t1,[$t3,#16]
+ add $C,$C,$t2
+ ldr $t2,[$t3,#20]
+ add $D,$D,$t0
+ ldr $t0,[$t3,#24]
+ add $E,$E,$t1
+ ldr $t1,[$t3,#28]
+ add $F,$F,$t2
+ ldr $inp,[sp,#17*4] @ pull inp
+ ldr $t2,[sp,#18*4] @ pull inp+len
+ add $G,$G,$t0
+ add $H,$H,$t1
+ stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
+ cmp $inp,$t2
+ sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
+ bne .Loop
+
+ add sp,sp,#`16+3`*4 @ destroy frame
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r11,pc}
+#else
+ ldmia sp!,{r4-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size sha256_block_data_order,.-sha256_block_data_order
+___
+######################################################################
+# NEON stuff
+#
+{{{
+my @X=map("q$_",(0..3));
+my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
+my $Xfer=$t4;
+my $j=0;
+
+sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
+sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
+
+sub AUTOLOAD() # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+ my $arg = pop;
+ $arg = "#$arg" if ($arg*1 eq $arg);
+ $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
+}
+
+sub Xupdate()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body);
+ my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+ &vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T2,$T0,$sigma0[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T1,$T0,$sigma0[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T2,$T0,32-$sigma0[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T3,$T0,$sigma0[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T1,$T1,$T2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T3,$T0,32-$sigma0[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T1,$T1,$T3); # sigma0(X[1..4])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4); # sigma1(X[14..15])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vld1_32 ("{$T0}","[$Ktbl,:128]!");
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4); # sigma1(X[16..17])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 ($T0,$T0,@X[0]);
+ while($#insns>=2) { eval(shift(@insns)); }
+ &vst1_32 ("{$T0}","[$Xfer,:128]!");
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xpreload()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body);
+ my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vld1_32 ("{$T0}","[$Ktbl,:128]!");
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vrev32_8 (@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 ($T0,$T0,@X[0]);
+ foreach (@insns) { eval; } # remaining instructions
+ &vst1_32 ("{$T0}","[$Xfer,:128]!");
+
+ push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub body_00_15 () {
+ (
+ '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
+ '&add ($h,$h,$t1)', # h+=X[i]+K[i]
+ '&eor ($t1,$f,$g)',
+ '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
+ '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
+ '&and ($t1,$t1,$e)',
+ '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
+ '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
+ '&eor ($t1,$t1,$g)', # Ch(e,f,g)
+ '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
+ '&eor ($t2,$a,$b)', # a^b, b^c in next round
+ '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
+ '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
+ '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
+ '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
+ '&ldr ($t1,"[sp,#64]") if ($j==31)',
+ '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
+ '&add ($d,$d,$h)', # d+=h
+ '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
+ '&eor ($t3,$t3,$b)', # Maj(a,b,c)
+ '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
+ )
+}
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.global sha256_block_data_order_neon
+.type sha256_block_data_order_neon,%function
+.align 4
+sha256_block_data_order_neon:
+.LNEON:
+ stmdb sp!,{r4-r12,lr}
+
+ sub $H,sp,#16*4+16
+ adrl $Ktbl,K256
+ bic $H,$H,#15 @ align for 128-bit stores
+ mov $t2,sp
+ mov sp,$H @ alloca
+ add $len,$inp,$len,lsl#6 @ len to point at the end of inp
+
+ vld1.8 {@X[0]},[$inp]!
+ vld1.8 {@X[1]},[$inp]!
+ vld1.8 {@X[2]},[$inp]!
+ vld1.8 {@X[3]},[$inp]!
+ vld1.32 {$T0},[$Ktbl,:128]!
+ vld1.32 {$T1},[$Ktbl,:128]!
+ vld1.32 {$T2},[$Ktbl,:128]!
+ vld1.32 {$T3},[$Ktbl,:128]!
+ vrev32.8 @X[0],@X[0] @ yes, even on
+ str $ctx,[sp,#64]
+ vrev32.8 @X[1],@X[1] @ big-endian
+ str $inp,[sp,#68]
+ mov $Xfer,sp
+ vrev32.8 @X[2],@X[2]
+ str $len,[sp,#72]
+ vrev32.8 @X[3],@X[3]
+ str $t2,[sp,#76] @ save original sp
+ vadd.i32 $T0,$T0,@X[0]
+ vadd.i32 $T1,$T1,@X[1]
+ vst1.32 {$T0},[$Xfer,:128]!
+ vadd.i32 $T2,$T2,@X[2]
+ vst1.32 {$T1},[$Xfer,:128]!
+ vadd.i32 $T3,$T3,@X[3]
+ vst1.32 {$T2},[$Xfer,:128]!
+ vst1.32 {$T3},[$Xfer,:128]!
+
+ ldmia $ctx,{$A-$H}
+ sub $Xfer,$Xfer,#64
+ ldr $t1,[sp,#0]
+ eor $t2,$t2,$t2
+ eor $t3,$B,$C
+ b .L_00_48
+
+.align 4
+.L_00_48:
+___
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+$code.=<<___;
+ teq $t1,#0 @ check for K256 terminator
+ ldr $t1,[sp,#0]
+ sub $Xfer,$Xfer,#64
+ bne .L_00_48
+
+ ldr $inp,[sp,#68]
+ ldr $t0,[sp,#72]
+ sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
+ teq $inp,$t0
+ it eq
+ subeq $inp,$inp,#64 @ avoid SEGV
+ vld1.8 {@X[0]},[$inp]! @ load next input block
+ vld1.8 {@X[1]},[$inp]!
+ vld1.8 {@X[2]},[$inp]!
+ vld1.8 {@X[3]},[$inp]!
+ it ne
+ strne $inp,[sp,#68]
+ mov $Xfer,sp
+___
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+$code.=<<___;
+ ldr $t0,[$t1,#0]
+ add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
+ ldr $t2,[$t1,#4]
+ ldr $t3,[$t1,#8]
+ ldr $t4,[$t1,#12]
+ add $A,$A,$t0 @ accumulate
+ ldr $t0,[$t1,#16]
+ add $B,$B,$t2
+ ldr $t2,[$t1,#20]
+ add $C,$C,$t3
+ ldr $t3,[$t1,#24]
+ add $D,$D,$t4
+ ldr $t4,[$t1,#28]
+ add $E,$E,$t0
+ str $A,[$t1],#4
+ add $F,$F,$t2
+ str $B,[$t1],#4
+ add $G,$G,$t3
+ str $C,[$t1],#4
+ add $H,$H,$t4
+ str $D,[$t1],#4
+ stmia $t1,{$E-$H}
+
+ ittte ne
+ movne $Xfer,sp
+ ldrne $t1,[sp,#0]
+ eorne $t2,$t2,$t2
+ ldreq sp,[sp,#76] @ restore original sp
+ itt ne
+ eorne $t3,$B,$C
+ bne .L_00_48
+
+ ldmia sp!,{r4-r12,pc}
+.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
+#endif
+___
+}}}
+######################################################################
+# ARMv8 stuff
+#
+{{{
+my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
+my @MSG=map("q$_",(8..11));
+my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
+my $Ktbl="r3";
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# ifdef __thumb2__
+# define INST(a,b,c,d) .byte c,d|0xc,a,b
+# else
+# define INST(a,b,c,d) .byte a,b,c,d
+# endif
+
+.type sha256_block_data_order_armv8,%function
+.align 5
+sha256_block_data_order_armv8:
+.LARMv8:
+ vld1.32 {$ABCD,$EFGH},[$ctx]
+# ifdef __thumb2__
+ adr $Ktbl,.LARMv8
+ sub $Ktbl,$Ktbl,#.LARMv8-K256
+# else
+ adrl $Ktbl,K256
+# endif
+ add $len,$inp,$len,lsl#6 @ len to point at the end of inp
+
+.Loop_v8:
+ vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
+ vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
+ vld1.32 {$W0},[$Ktbl]!
+ vrev32.8 @MSG[0],@MSG[0]
+ vrev32.8 @MSG[1],@MSG[1]
+ vrev32.8 @MSG[2],@MSG[2]
+ vrev32.8 @MSG[3],@MSG[3]
+ vmov $ABCD_SAVE,$ABCD @ offload
+ vmov $EFGH_SAVE,$EFGH
+ teq $inp,$len
+___
+for($i=0;$i<12;$i++) {
+$code.=<<___;
+ vld1.32 {$W1},[$Ktbl]!
+ vadd.i32 $W0,$W0,@MSG[0]
+ sha256su0 @MSG[0],@MSG[1]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+ sha256su1 @MSG[0],@MSG[2],@MSG[3]
+___
+ ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+ vld1.32 {$W1},[$Ktbl]!
+ vadd.i32 $W0,$W0,@MSG[0]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+
+ vld1.32 {$W0},[$Ktbl]!
+ vadd.i32 $W1,$W1,@MSG[1]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W1
+ sha256h2 $EFGH,$abcd,$W1
+
+ vld1.32 {$W1},[$Ktbl]
+ vadd.i32 $W0,$W0,@MSG[2]
+ sub $Ktbl,$Ktbl,#256-16 @ rewind
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+
+ vadd.i32 $W1,$W1,@MSG[3]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W1
+ sha256h2 $EFGH,$abcd,$W1
+
+ vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
+ vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
+ it ne
+ bne .Loop_v8
+
+ vst1.32 {$ABCD,$EFGH},[$ctx]
+
+ ret @ bx lr
+.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
+#endif
+___
+}}}
+$code.=<<___;
+.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm OPENSSL_armcap_P,4,4
+#endif
+___
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/@/ and !/^$/);
+ print;
+}
+close SELF;
+
+{ my %opcode = (
+ "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
+ "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );
+
+ sub unsha256 {
+ my ($mnemonic,$arg)=@_;
+
+ if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
+ my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+ |(($2&7)<<17)|(($2&8)<<4)
+ |(($3&7)<<1) |(($3&8)<<2);
+ # since ARMv7 instructions are always encoded little-endian.
+ # correct solution is to use .inst directive, but older
+ # assemblers don't implement it:-(
+ sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
+ $word&0xff,($word>>8)&0xff,
+ ($word>>16)&0xff,($word>>24)&0xff,
+ $mnemonic,$arg;
+ }
+ }
+}
+
+foreach (split($/,$code)) {
+
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
+
+ s/\bret\b/bx lr/go or
+ s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
+
+ print $_,"\n";
+}
+
+close STDOUT; # enforce flush
diff --git a/arch/arm/crypto/sha256-core.S_shipped b/arch/arm/crypto/sha256-core.S_shipped
new file mode 100644
index 000000000000..555a1a8eec90
--- /dev/null
+++ b/arch/arm/crypto/sha256-core.S_shipped
@@ -0,0 +1,2808 @@
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Permission to use under GPL terms is granted.
+@ ====================================================================
+
+@ SHA256 block procedure for ARMv4. May 2007.
+
+@ Performance is ~2x better than gcc 3.4 generated code and in "abso-
+@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
+@ byte [on single-issue Xscale PXA250 core].
+
+@ July 2010.
+@
+@ Rescheduling for dual-issue pipeline resulted in 22% improvement on
+@ Cortex A8 core and ~20 cycles per processed byte.
+
+@ February 2011.
+@
+@ Profiler-assisted and platform-specific optimization resulted in 16%
+@ improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+@ September 2013.
+@
+@ Add NEON implementation. On Cortex A8 it was measured to process one
+@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+@ code (meaning that latter performs sub-optimally, nothing was done
+@ about it).
+
+@ May 2014.
+@
+@ Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+.text
+#if __ARM_ARCH__<7
+.code 32
+#else
+.syntax unified
+# ifdef __thumb2__
+# define adrl adr
+.thumb
+# else
+.code 32
+# endif
+#endif
+
+.type K256,%object
+.align 5
+K256:
+.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size K256,.-K256
+.word 0 @ terminator
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-sha256_block_data_order
+#endif
+.align 5
+
+.global sha256_block_data_order
+.type sha256_block_data_order,%function
+sha256_block_data_order:
+#if __ARM_ARCH__<7
+ sub r3,pc,#8 @ sha256_block_data_order
+#else
+ adr r3,sha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+ ldr r12,.LOPENSSL_armcap
+ ldr r12,[r3,r12] @ OPENSSL_armcap_P
+ tst r12,#ARMV8_SHA256
+ bne .LARMv8
+ tst r12,#ARMV7_NEON
+ bne .LNEON
+#endif
+ add r2,r1,r2,lsl#6 @ len to point at the end of inp
+ stmdb sp!,{r0,r1,r2,r4-r11,lr}
+ ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11}
+ sub r14,r3,#256+32 @ K256
+ sub sp,sp,#16*4 @ alloca(X[16])
+.Loop:
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ magic
+ eor r12,r12,r12
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 0
+# if 0==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 0
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 0==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r8,r8,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r11,r11,r2 @ h+=X[i]
+ str r2,[sp,#0*4]
+ eor r2,r9,r10
+ add r11,r11,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r8
+ add r11,r11,r12 @ h+=K256[i]
+ eor r2,r2,r10 @ Ch(e,f,g)
+ eor r0,r4,r4,ror#11
+ add r11,r11,r2 @ h+=Ch(e,f,g)
+#if 0==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 0<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r4,r5 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#2*4] @ from future BODY_16_xx
+ eor r12,r4,r5 @ a^b, b^c in next round
+ ldr r1,[sp,#15*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r4,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r7,r7,r11 @ d+=h
+ eor r3,r3,r5 @ Maj(a,b,c)
+ add r11,r11,r0,ror#2 @ h+=Sigma0(a)
+ @ add r11,r11,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 1
+# if 1==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 1
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 1==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r7,r7,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r10,r10,r2 @ h+=X[i]
+ str r2,[sp,#1*4]
+ eor r2,r8,r9
+ add r10,r10,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r7
+ add r10,r10,r3 @ h+=K256[i]
+ eor r2,r2,r9 @ Ch(e,f,g)
+ eor r0,r11,r11,ror#11
+ add r10,r10,r2 @ h+=Ch(e,f,g)
+#if 1==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 1<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r11,r4 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#3*4] @ from future BODY_16_xx
+ eor r3,r11,r4 @ a^b, b^c in next round
+ ldr r1,[sp,#0*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r11,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r6,r6,r10 @ d+=h
+ eor r12,r12,r4 @ Maj(a,b,c)
+ add r10,r10,r0,ror#2 @ h+=Sigma0(a)
+ @ add r10,r10,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 2
+# if 2==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 2
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 2==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r6,r6,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r9,r9,r2 @ h+=X[i]
+ str r2,[sp,#2*4]
+ eor r2,r7,r8
+ add r9,r9,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r6
+ add r9,r9,r12 @ h+=K256[i]
+ eor r2,r2,r8 @ Ch(e,f,g)
+ eor r0,r10,r10,ror#11
+ add r9,r9,r2 @ h+=Ch(e,f,g)
+#if 2==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 2<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r10,r11 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#4*4] @ from future BODY_16_xx
+ eor r12,r10,r11 @ a^b, b^c in next round
+ ldr r1,[sp,#1*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r10,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r5,r5,r9 @ d+=h
+ eor r3,r3,r11 @ Maj(a,b,c)
+ add r9,r9,r0,ror#2 @ h+=Sigma0(a)
+ @ add r9,r9,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 3
+# if 3==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 3
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 3==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r5,r5,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r8,r8,r2 @ h+=X[i]
+ str r2,[sp,#3*4]
+ eor r2,r6,r7
+ add r8,r8,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r5
+ add r8,r8,r3 @ h+=K256[i]
+ eor r2,r2,r7 @ Ch(e,f,g)
+ eor r0,r9,r9,ror#11
+ add r8,r8,r2 @ h+=Ch(e,f,g)
+#if 3==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 3<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r9,r10 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#5*4] @ from future BODY_16_xx
+ eor r3,r9,r10 @ a^b, b^c in next round
+ ldr r1,[sp,#2*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r9,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r4,r4,r8 @ d+=h
+ eor r12,r12,r10 @ Maj(a,b,c)
+ add r8,r8,r0,ror#2 @ h+=Sigma0(a)
+ @ add r8,r8,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 4
+# if 4==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 4
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 4==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r4,r4,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r7,r7,r2 @ h+=X[i]
+ str r2,[sp,#4*4]
+ eor r2,r5,r6
+ add r7,r7,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r4
+ add r7,r7,r12 @ h+=K256[i]
+ eor r2,r2,r6 @ Ch(e,f,g)
+ eor r0,r8,r8,ror#11
+ add r7,r7,r2 @ h+=Ch(e,f,g)
+#if 4==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 4<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r8,r9 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#6*4] @ from future BODY_16_xx
+ eor r12,r8,r9 @ a^b, b^c in next round
+ ldr r1,[sp,#3*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r8,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r11,r11,r7 @ d+=h
+ eor r3,r3,r9 @ Maj(a,b,c)
+ add r7,r7,r0,ror#2 @ h+=Sigma0(a)
+ @ add r7,r7,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 5
+# if 5==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 5
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 5==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r11,r11,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r6,r6,r2 @ h+=X[i]
+ str r2,[sp,#5*4]
+ eor r2,r4,r5
+ add r6,r6,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r11
+ add r6,r6,r3 @ h+=K256[i]
+ eor r2,r2,r5 @ Ch(e,f,g)
+ eor r0,r7,r7,ror#11
+ add r6,r6,r2 @ h+=Ch(e,f,g)
+#if 5==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 5<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r7,r8 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#7*4] @ from future BODY_16_xx
+ eor r3,r7,r8 @ a^b, b^c in next round
+ ldr r1,[sp,#4*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r7,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r10,r10,r6 @ d+=h
+ eor r12,r12,r8 @ Maj(a,b,c)
+ add r6,r6,r0,ror#2 @ h+=Sigma0(a)
+ @ add r6,r6,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 6
+# if 6==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 6
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 6==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r10,r10,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r5,r5,r2 @ h+=X[i]
+ str r2,[sp,#6*4]
+ eor r2,r11,r4
+ add r5,r5,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r10
+ add r5,r5,r12 @ h+=K256[i]
+ eor r2,r2,r4 @ Ch(e,f,g)
+ eor r0,r6,r6,ror#11
+ add r5,r5,r2 @ h+=Ch(e,f,g)
+#if 6==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 6<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r6,r7 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#8*4] @ from future BODY_16_xx
+ eor r12,r6,r7 @ a^b, b^c in next round
+ ldr r1,[sp,#5*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r6,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r9,r9,r5 @ d+=h
+ eor r3,r3,r7 @ Maj(a,b,c)
+ add r5,r5,r0,ror#2 @ h+=Sigma0(a)
+ @ add r5,r5,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 7
+# if 7==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 7
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 7==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r9,r9,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r4,r4,r2 @ h+=X[i]
+ str r2,[sp,#7*4]
+ eor r2,r10,r11
+ add r4,r4,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r9
+ add r4,r4,r3 @ h+=K256[i]
+ eor r2,r2,r11 @ Ch(e,f,g)
+ eor r0,r5,r5,ror#11
+ add r4,r4,r2 @ h+=Ch(e,f,g)
+#if 7==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 7<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#9*4] @ from future BODY_16_xx
+ eor r3,r5,r6 @ a^b, b^c in next round
+ ldr r1,[sp,#6*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r5,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r8,r8,r4 @ d+=h
+ eor r12,r12,r6 @ Maj(a,b,c)
+ add r4,r4,r0,ror#2 @ h+=Sigma0(a)
+ @ add r4,r4,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 8
+# if 8==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 8
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 8==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r8,r8,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r11,r11,r2 @ h+=X[i]
+ str r2,[sp,#8*4]
+ eor r2,r9,r10
+ add r11,r11,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r8
+ add r11,r11,r12 @ h+=K256[i]
+ eor r2,r2,r10 @ Ch(e,f,g)
+ eor r0,r4,r4,ror#11
+ add r11,r11,r2 @ h+=Ch(e,f,g)
+#if 8==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 8<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r4,r5 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#10*4] @ from future BODY_16_xx
+ eor r12,r4,r5 @ a^b, b^c in next round
+ ldr r1,[sp,#7*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r4,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r7,r7,r11 @ d+=h
+ eor r3,r3,r5 @ Maj(a,b,c)
+ add r11,r11,r0,ror#2 @ h+=Sigma0(a)
+ @ add r11,r11,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 9
+# if 9==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 9
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 9==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r7,r7,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r10,r10,r2 @ h+=X[i]
+ str r2,[sp,#9*4]
+ eor r2,r8,r9
+ add r10,r10,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r7
+ add r10,r10,r3 @ h+=K256[i]
+ eor r2,r2,r9 @ Ch(e,f,g)
+ eor r0,r11,r11,ror#11
+ add r10,r10,r2 @ h+=Ch(e,f,g)
+#if 9==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 9<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r11,r4 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#11*4] @ from future BODY_16_xx
+ eor r3,r11,r4 @ a^b, b^c in next round
+ ldr r1,[sp,#8*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r11,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r6,r6,r10 @ d+=h
+ eor r12,r12,r4 @ Maj(a,b,c)
+ add r10,r10,r0,ror#2 @ h+=Sigma0(a)
+ @ add r10,r10,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 10
+# if 10==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 10
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 10==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r6,r6,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r9,r9,r2 @ h+=X[i]
+ str r2,[sp,#10*4]
+ eor r2,r7,r8
+ add r9,r9,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r6
+ add r9,r9,r12 @ h+=K256[i]
+ eor r2,r2,r8 @ Ch(e,f,g)
+ eor r0,r10,r10,ror#11
+ add r9,r9,r2 @ h+=Ch(e,f,g)
+#if 10==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 10<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r10,r11 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#12*4] @ from future BODY_16_xx
+ eor r12,r10,r11 @ a^b, b^c in next round
+ ldr r1,[sp,#9*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r10,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r5,r5,r9 @ d+=h
+ eor r3,r3,r11 @ Maj(a,b,c)
+ add r9,r9,r0,ror#2 @ h+=Sigma0(a)
+ @ add r9,r9,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 11
+# if 11==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 11
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 11==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r5,r5,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r8,r8,r2 @ h+=X[i]
+ str r2,[sp,#11*4]
+ eor r2,r6,r7
+ add r8,r8,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r5
+ add r8,r8,r3 @ h+=K256[i]
+ eor r2,r2,r7 @ Ch(e,f,g)
+ eor r0,r9,r9,ror#11
+ add r8,r8,r2 @ h+=Ch(e,f,g)
+#if 11==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 11<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r9,r10 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#13*4] @ from future BODY_16_xx
+ eor r3,r9,r10 @ a^b, b^c in next round
+ ldr r1,[sp,#10*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r9,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r4,r4,r8 @ d+=h
+ eor r12,r12,r10 @ Maj(a,b,c)
+ add r8,r8,r0,ror#2 @ h+=Sigma0(a)
+ @ add r8,r8,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 12
+# if 12==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 12
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 12==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r4,r4,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r7,r7,r2 @ h+=X[i]
+ str r2,[sp,#12*4]
+ eor r2,r5,r6
+ add r7,r7,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r4
+ add r7,r7,r12 @ h+=K256[i]
+ eor r2,r2,r6 @ Ch(e,f,g)
+ eor r0,r8,r8,ror#11
+ add r7,r7,r2 @ h+=Ch(e,f,g)
+#if 12==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 12<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r8,r9 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#14*4] @ from future BODY_16_xx
+ eor r12,r8,r9 @ a^b, b^c in next round
+ ldr r1,[sp,#11*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r8,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r11,r11,r7 @ d+=h
+ eor r3,r3,r9 @ Maj(a,b,c)
+ add r7,r7,r0,ror#2 @ h+=Sigma0(a)
+ @ add r7,r7,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 13
+# if 13==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 13
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 13==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r11,r11,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r6,r6,r2 @ h+=X[i]
+ str r2,[sp,#13*4]
+ eor r2,r4,r5
+ add r6,r6,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r11
+ add r6,r6,r3 @ h+=K256[i]
+ eor r2,r2,r5 @ Ch(e,f,g)
+ eor r0,r7,r7,ror#11
+ add r6,r6,r2 @ h+=Ch(e,f,g)
+#if 13==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 13<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r7,r8 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#15*4] @ from future BODY_16_xx
+ eor r3,r7,r8 @ a^b, b^c in next round
+ ldr r1,[sp,#12*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r7,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r10,r10,r6 @ d+=h
+ eor r12,r12,r8 @ Maj(a,b,c)
+ add r6,r6,r0,ror#2 @ h+=Sigma0(a)
+ @ add r6,r6,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 14
+# if 14==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 14
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 14==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r10,r10,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r5,r5,r2 @ h+=X[i]
+ str r2,[sp,#14*4]
+ eor r2,r11,r4
+ add r5,r5,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r10
+ add r5,r5,r12 @ h+=K256[i]
+ eor r2,r2,r4 @ Ch(e,f,g)
+ eor r0,r6,r6,ror#11
+ add r5,r5,r2 @ h+=Ch(e,f,g)
+#if 14==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 14<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r6,r7 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#0*4] @ from future BODY_16_xx
+ eor r12,r6,r7 @ a^b, b^c in next round
+ ldr r1,[sp,#13*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r6,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r9,r9,r5 @ d+=h
+ eor r3,r3,r7 @ Maj(a,b,c)
+ add r5,r5,r0,ror#2 @ h+=Sigma0(a)
+ @ add r5,r5,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 15
+# if 15==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 15
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 15==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r9,r9,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r4,r4,r2 @ h+=X[i]
+ str r2,[sp,#15*4]
+ eor r2,r10,r11
+ add r4,r4,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r9
+ add r4,r4,r3 @ h+=K256[i]
+ eor r2,r2,r11 @ Ch(e,f,g)
+ eor r0,r5,r5,ror#11
+ add r4,r4,r2 @ h+=Ch(e,f,g)
+#if 15==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 15<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#1*4] @ from future BODY_16_xx
+ eor r3,r5,r6 @ a^b, b^c in next round
+ ldr r1,[sp,#14*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r5,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r8,r8,r4 @ d+=h
+ eor r12,r12,r6 @ Maj(a,b,c)
+ add r4,r4,r0,ror#2 @ h+=Sigma0(a)
+ @ add r4,r4,r12 @ h+=Maj(a,b,c)
+.Lrounds_16_xx:
+ @ ldr r2,[sp,#1*4] @ 16
+ @ ldr r1,[sp,#14*4]
+ mov r0,r2,ror#7
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#0*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#9*4]
+
+ add r12,r12,r0
+ eor r0,r8,r8,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r11,r11,r2 @ h+=X[i]
+ str r2,[sp,#0*4]
+ eor r2,r9,r10
+ add r11,r11,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r8
+ add r11,r11,r12 @ h+=K256[i]
+ eor r2,r2,r10 @ Ch(e,f,g)
+ eor r0,r4,r4,ror#11
+ add r11,r11,r2 @ h+=Ch(e,f,g)
+#if 16==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 16<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r4,r5 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#2*4] @ from future BODY_16_xx
+ eor r12,r4,r5 @ a^b, b^c in next round
+ ldr r1,[sp,#15*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r4,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r7,r7,r11 @ d+=h
+ eor r3,r3,r5 @ Maj(a,b,c)
+ add r11,r11,r0,ror#2 @ h+=Sigma0(a)
+ @ add r11,r11,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#2*4] @ 17
+ @ ldr r1,[sp,#15*4]
+ mov r0,r2,ror#7
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#1*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#10*4]
+
+ add r3,r3,r0
+ eor r0,r7,r7,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r10,r10,r2 @ h+=X[i]
+ str r2,[sp,#1*4]
+ eor r2,r8,r9
+ add r10,r10,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r7
+ add r10,r10,r3 @ h+=K256[i]
+ eor r2,r2,r9 @ Ch(e,f,g)
+ eor r0,r11,r11,ror#11
+ add r10,r10,r2 @ h+=Ch(e,f,g)
+#if 17==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 17<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r11,r4 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#3*4] @ from future BODY_16_xx
+ eor r3,r11,r4 @ a^b, b^c in next round
+ ldr r1,[sp,#0*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r11,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r6,r6,r10 @ d+=h
+ eor r12,r12,r4 @ Maj(a,b,c)
+ add r10,r10,r0,ror#2 @ h+=Sigma0(a)
+ @ add r10,r10,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#3*4] @ 18
+ @ ldr r1,[sp,#0*4]
+ mov r0,r2,ror#7
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#2*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#11*4]
+
+ add r12,r12,r0
+ eor r0,r6,r6,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r9,r9,r2 @ h+=X[i]
+ str r2,[sp,#2*4]
+ eor r2,r7,r8
+ add r9,r9,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r6
+ add r9,r9,r12 @ h+=K256[i]
+ eor r2,r2,r8 @ Ch(e,f,g)
+ eor r0,r10,r10,ror#11
+ add r9,r9,r2 @ h+=Ch(e,f,g)
+#if 18==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 18<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r10,r11 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#4*4] @ from future BODY_16_xx
+ eor r12,r10,r11 @ a^b, b^c in next round
+ ldr r1,[sp,#1*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r10,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r5,r5,r9 @ d+=h
+ eor r3,r3,r11 @ Maj(a,b,c)
+ add r9,r9,r0,ror#2 @ h+=Sigma0(a)
+ @ add r9,r9,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#4*4] @ 19
+ @ ldr r1,[sp,#1*4]
+ mov r0,r2,ror#7
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#3*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#12*4]
+
+ add r3,r3,r0
+ eor r0,r5,r5,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r8,r8,r2 @ h+=X[i]
+ str r2,[sp,#3*4]
+ eor r2,r6,r7
+ add r8,r8,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r5
+ add r8,r8,r3 @ h+=K256[i]
+ eor r2,r2,r7 @ Ch(e,f,g)
+ eor r0,r9,r9,ror#11
+ add r8,r8,r2 @ h+=Ch(e,f,g)
+#if 19==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 19<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r9,r10 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#5*4] @ from future BODY_16_xx
+ eor r3,r9,r10 @ a^b, b^c in next round
+ ldr r1,[sp,#2*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r9,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r4,r4,r8 @ d+=h
+ eor r12,r12,r10 @ Maj(a,b,c)
+ add r8,r8,r0,ror#2 @ h+=Sigma0(a)
+ @ add r8,r8,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#5*4] @ 20
+ @ ldr r1,[sp,#2*4]
+ mov r0,r2,ror#7
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#4*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#13*4]
+
+ add r12,r12,r0
+ eor r0,r4,r4,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r7,r7,r2 @ h+=X[i]
+ str r2,[sp,#4*4]
+ eor r2,r5,r6
+ add r7,r7,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r4
+ add r7,r7,r12 @ h+=K256[i]
+ eor r2,r2,r6 @ Ch(e,f,g)
+ eor r0,r8,r8,ror#11
+ add r7,r7,r2 @ h+=Ch(e,f,g)
+#if 20==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 20<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r8,r9 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#6*4] @ from future BODY_16_xx
+ eor r12,r8,r9 @ a^b, b^c in next round
+ ldr r1,[sp,#3*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r8,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r11,r11,r7 @ d+=h
+ eor r3,r3,r9 @ Maj(a,b,c)
+ add r7,r7,r0,ror#2 @ h+=Sigma0(a)
+ @ add r7,r7,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#6*4] @ 21
+ @ ldr r1,[sp,#3*4]
+ mov r0,r2,ror#7
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#5*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#14*4]
+
+ add r3,r3,r0
+ eor r0,r11,r11,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r6,r6,r2 @ h+=X[i]
+ str r2,[sp,#5*4]
+ eor r2,r4,r5
+ add r6,r6,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r11
+ add r6,r6,r3 @ h+=K256[i]
+ eor r2,r2,r5 @ Ch(e,f,g)
+ eor r0,r7,r7,ror#11
+ add r6,r6,r2 @ h+=Ch(e,f,g)
+#if 21==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 21<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r7,r8 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#7*4] @ from future BODY_16_xx
+ eor r3,r7,r8 @ a^b, b^c in next round
+ ldr r1,[sp,#4*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r7,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r10,r10,r6 @ d+=h
+ eor r12,r12,r8 @ Maj(a,b,c)
+ add r6,r6,r0,ror#2 @ h+=Sigma0(a)
+ @ add r6,r6,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#7*4] @ 22
+ @ ldr r1,[sp,#4*4]
+ mov r0,r2,ror#7
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#6*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#15*4]
+
+ add r12,r12,r0
+ eor r0,r10,r10,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r5,r5,r2 @ h+=X[i]
+ str r2,[sp,#6*4]
+ eor r2,r11,r4
+ add r5,r5,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r10
+ add r5,r5,r12 @ h+=K256[i]
+ eor r2,r2,r4 @ Ch(e,f,g)
+ eor r0,r6,r6,ror#11
+ add r5,r5,r2 @ h+=Ch(e,f,g)
+#if 22==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 22<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r6,r7 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#8*4] @ from future BODY_16_xx
+ eor r12,r6,r7 @ a^b, b^c in next round
+ ldr r1,[sp,#5*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r6,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r9,r9,r5 @ d+=h
+ eor r3,r3,r7 @ Maj(a,b,c)
+ add r5,r5,r0,ror#2 @ h+=Sigma0(a)
+ @ add r5,r5,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#8*4] @ 23
+ @ ldr r1,[sp,#5*4]
+ mov r0,r2,ror#7
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#7*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#0*4]
+
+ add r3,r3,r0
+ eor r0,r9,r9,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r4,r4,r2 @ h+=X[i]
+ str r2,[sp,#7*4]
+ eor r2,r10,r11
+ add r4,r4,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r9
+ add r4,r4,r3 @ h+=K256[i]
+ eor r2,r2,r11 @ Ch(e,f,g)
+ eor r0,r5,r5,ror#11
+ add r4,r4,r2 @ h+=Ch(e,f,g)
+#if 23==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 23<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#9*4] @ from future BODY_16_xx
+ eor r3,r5,r6 @ a^b, b^c in next round
+ ldr r1,[sp,#6*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r5,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r8,r8,r4 @ d+=h
+ eor r12,r12,r6 @ Maj(a,b,c)
+ add r4,r4,r0,ror#2 @ h+=Sigma0(a)
+ @ add r4,r4,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#9*4] @ 24
+ @ ldr r1,[sp,#6*4]
+ mov r0,r2,ror#7
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#8*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#1*4]
+
+ add r12,r12,r0
+ eor r0,r8,r8,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r11,r11,r2 @ h+=X[i]
+ str r2,[sp,#8*4]
+ eor r2,r9,r10
+ add r11,r11,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r8
+ add r11,r11,r12 @ h+=K256[i]
+ eor r2,r2,r10 @ Ch(e,f,g)
+ eor r0,r4,r4,ror#11
+ add r11,r11,r2 @ h+=Ch(e,f,g)
+#if 24==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 24<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r4,r5 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#10*4] @ from future BODY_16_xx
+ eor r12,r4,r5 @ a^b, b^c in next round
+ ldr r1,[sp,#7*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r4,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r7,r7,r11 @ d+=h
+ eor r3,r3,r5 @ Maj(a,b,c)
+ add r11,r11,r0,ror#2 @ h+=Sigma0(a)
+ @ add r11,r11,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#10*4] @ 25
+ @ ldr r1,[sp,#7*4]
+ mov r0,r2,ror#7
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#9*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#2*4]
+
+ add r3,r3,r0
+ eor r0,r7,r7,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r10,r10,r2 @ h+=X[i]
+ str r2,[sp,#9*4]
+ eor r2,r8,r9
+ add r10,r10,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r7
+ add r10,r10,r3 @ h+=K256[i]
+ eor r2,r2,r9 @ Ch(e,f,g)
+ eor r0,r11,r11,ror#11
+ add r10,r10,r2 @ h+=Ch(e,f,g)
+#if 25==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 25<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r11,r4 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#11*4] @ from future BODY_16_xx
+ eor r3,r11,r4 @ a^b, b^c in next round
+ ldr r1,[sp,#8*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r11,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r6,r6,r10 @ d+=h
+ eor r12,r12,r4 @ Maj(a,b,c)
+ add r10,r10,r0,ror#2 @ h+=Sigma0(a)
+ @ add r10,r10,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#11*4] @ 26
+ @ ldr r1,[sp,#8*4]
+ mov r0,r2,ror#7
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#10*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#3*4]
+
+ add r12,r12,r0
+ eor r0,r6,r6,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r9,r9,r2 @ h+=X[i]
+ str r2,[sp,#10*4]
+ eor r2,r7,r8
+ add r9,r9,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r6
+ add r9,r9,r12 @ h+=K256[i]
+ eor r2,r2,r8 @ Ch(e,f,g)
+ eor r0,r10,r10,ror#11
+ add r9,r9,r2 @ h+=Ch(e,f,g)
+#if 26==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 26<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r10,r11 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#12*4] @ from future BODY_16_xx
+ eor r12,r10,r11 @ a^b, b^c in next round
+ ldr r1,[sp,#9*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r10,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r5,r5,r9 @ d+=h
+ eor r3,r3,r11 @ Maj(a,b,c)
+ add r9,r9,r0,ror#2 @ h+=Sigma0(a)
+ @ add r9,r9,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#12*4] @ 27
+ @ ldr r1,[sp,#9*4]
+ mov r0,r2,ror#7
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#11*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#4*4]
+
+ add r3,r3,r0
+ eor r0,r5,r5,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r8,r8,r2 @ h+=X[i]
+ str r2,[sp,#11*4]
+ eor r2,r6,r7
+ add r8,r8,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r5
+ add r8,r8,r3 @ h+=K256[i]
+ eor r2,r2,r7 @ Ch(e,f,g)
+ eor r0,r9,r9,ror#11
+ add r8,r8,r2 @ h+=Ch(e,f,g)
+#if 27==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 27<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r9,r10 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#13*4] @ from future BODY_16_xx
+ eor r3,r9,r10 @ a^b, b^c in next round
+ ldr r1,[sp,#10*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r9,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r4,r4,r8 @ d+=h
+ eor r12,r12,r10 @ Maj(a,b,c)
+ add r8,r8,r0,ror#2 @ h+=Sigma0(a)
+ @ add r8,r8,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#13*4] @ 28
+ @ ldr r1,[sp,#10*4]
+ mov r0,r2,ror#7
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#12*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#5*4]
+
+ add r12,r12,r0
+ eor r0,r4,r4,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r7,r7,r2 @ h+=X[i]
+ str r2,[sp,#12*4]
+ eor r2,r5,r6
+ add r7,r7,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r4
+ add r7,r7,r12 @ h+=K256[i]
+ eor r2,r2,r6 @ Ch(e,f,g)
+ eor r0,r8,r8,ror#11
+ add r7,r7,r2 @ h+=Ch(e,f,g)
+#if 28==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 28<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r8,r9 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#14*4] @ from future BODY_16_xx
+ eor r12,r8,r9 @ a^b, b^c in next round
+ ldr r1,[sp,#11*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r8,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r11,r11,r7 @ d+=h
+ eor r3,r3,r9 @ Maj(a,b,c)
+ add r7,r7,r0,ror#2 @ h+=Sigma0(a)
+ @ add r7,r7,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#14*4] @ 29
+ @ ldr r1,[sp,#11*4]
+ mov r0,r2,ror#7
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#13*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#6*4]
+
+ add r3,r3,r0
+ eor r0,r11,r11,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r6,r6,r2 @ h+=X[i]
+ str r2,[sp,#13*4]
+ eor r2,r4,r5
+ add r6,r6,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r11
+ add r6,r6,r3 @ h+=K256[i]
+ eor r2,r2,r5 @ Ch(e,f,g)
+ eor r0,r7,r7,ror#11
+ add r6,r6,r2 @ h+=Ch(e,f,g)
+#if 29==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 29<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r7,r8 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#15*4] @ from future BODY_16_xx
+ eor r3,r7,r8 @ a^b, b^c in next round
+ ldr r1,[sp,#12*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r7,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r10,r10,r6 @ d+=h
+ eor r12,r12,r8 @ Maj(a,b,c)
+ add r6,r6,r0,ror#2 @ h+=Sigma0(a)
+ @ add r6,r6,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#15*4] @ 30
+ @ ldr r1,[sp,#12*4]
+ mov r0,r2,ror#7
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#14*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#7*4]
+
+ add r12,r12,r0
+ eor r0,r10,r10,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r5,r5,r2 @ h+=X[i]
+ str r2,[sp,#14*4]
+ eor r2,r11,r4
+ add r5,r5,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r10
+ add r5,r5,r12 @ h+=K256[i]
+ eor r2,r2,r4 @ Ch(e,f,g)
+ eor r0,r6,r6,ror#11
+ add r5,r5,r2 @ h+=Ch(e,f,g)
+#if 30==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 30<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r6,r7 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#0*4] @ from future BODY_16_xx
+ eor r12,r6,r7 @ a^b, b^c in next round
+ ldr r1,[sp,#13*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r6,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r9,r9,r5 @ d+=h
+ eor r3,r3,r7 @ Maj(a,b,c)
+ add r5,r5,r0,ror#2 @ h+=Sigma0(a)
+ @ add r5,r5,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#0*4] @ 31
+ @ ldr r1,[sp,#13*4]
+ mov r0,r2,ror#7
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#15*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#8*4]
+
+ add r3,r3,r0
+ eor r0,r9,r9,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r4,r4,r2 @ h+=X[i]
+ str r2,[sp,#15*4]
+ eor r2,r10,r11
+ add r4,r4,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r9
+ add r4,r4,r3 @ h+=K256[i]
+ eor r2,r2,r11 @ Ch(e,f,g)
+ eor r0,r5,r5,ror#11
+ add r4,r4,r2 @ h+=Ch(e,f,g)
+#if 31==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 31<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#1*4] @ from future BODY_16_xx
+ eor r3,r5,r6 @ a^b, b^c in next round
+ ldr r1,[sp,#14*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r5,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r8,r8,r4 @ d+=h
+ eor r12,r12,r6 @ Maj(a,b,c)
+ add r4,r4,r0,ror#2 @ h+=Sigma0(a)
+ @ add r4,r4,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ ite eq @ Thumb2 thing, sanity check in ARM
+#endif
+ ldreq r3,[sp,#16*4] @ pull ctx
+ bne .Lrounds_16_xx
+
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldr r0,[r3,#0]
+ ldr r2,[r3,#4]
+ ldr r12,[r3,#8]
+ add r4,r4,r0
+ ldr r0,[r3,#12]
+ add r5,r5,r2
+ ldr r2,[r3,#16]
+ add r6,r6,r12
+ ldr r12,[r3,#20]
+ add r7,r7,r0
+ ldr r0,[r3,#24]
+ add r8,r8,r2
+ ldr r2,[r3,#28]
+ add r9,r9,r12
+ ldr r1,[sp,#17*4] @ pull inp
+ ldr r12,[sp,#18*4] @ pull inp+len
+ add r10,r10,r0
+ add r11,r11,r2
+ stmia r3,{r4,r5,r6,r7,r8,r9,r10,r11}
+ cmp r1,r12
+ sub r14,r14,#256 @ rewind Ktbl
+ bne .Loop
+
+ add sp,sp,#19*4 @ destroy frame
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r11,pc}
+#else
+ ldmia sp!,{r4-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ .word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+.size sha256_block_data_order,.-sha256_block_data_order
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.global sha256_block_data_order_neon
+.type sha256_block_data_order_neon,%function
+.align 4
+sha256_block_data_order_neon:
+.LNEON:
+ stmdb sp!,{r4-r12,lr}
+
+ sub r11,sp,#16*4+16
+ adrl r14,K256
+ bic r11,r11,#15 @ align for 128-bit stores
+ mov r12,sp
+ mov sp,r11 @ alloca
+ add r2,r1,r2,lsl#6 @ len to point at the end of inp
+
+ vld1.8 {q0},[r1]!
+ vld1.8 {q1},[r1]!
+ vld1.8 {q2},[r1]!
+ vld1.8 {q3},[r1]!
+ vld1.32 {q8},[r14,:128]!
+ vld1.32 {q9},[r14,:128]!
+ vld1.32 {q10},[r14,:128]!
+ vld1.32 {q11},[r14,:128]!
+ vrev32.8 q0,q0 @ yes, even on
+ str r0,[sp,#64]
+ vrev32.8 q1,q1 @ big-endian
+ str r1,[sp,#68]
+ mov r1,sp
+ vrev32.8 q2,q2
+ str r2,[sp,#72]
+ vrev32.8 q3,q3
+ str r12,[sp,#76] @ save original sp
+ vadd.i32 q8,q8,q0
+ vadd.i32 q9,q9,q1
+ vst1.32 {q8},[r1,:128]!
+ vadd.i32 q10,q10,q2
+ vst1.32 {q9},[r1,:128]!
+ vadd.i32 q11,q11,q3
+ vst1.32 {q10},[r1,:128]!
+ vst1.32 {q11},[r1,:128]!
+
+ ldmia r0,{r4-r11}
+ sub r1,r1,#64
+ ldr r2,[sp,#0]
+ eor r12,r12,r12
+ eor r3,r5,r6
+ b .L_00_48
+
+.align 4
+.L_00_48:
+ vext.8 q8,q0,q1,#4
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ vext.8 q9,q2,q3,#4
+ add r4,r4,r12
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vadd.i32 q0,q0,q9
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#4]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ veor q9,q9,q10
+ add r10,r10,r2
+ vsli.32 q11,q8,#14
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ vshr.u32 d24,d7,#17
+ add r11,r11,r3
+ and r2,r2,r7
+ veor q9,q9,q11
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ vsli.32 d24,d7,#15
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ vshr.u32 d25,d7,#10
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ vadd.i32 q0,q0,q9
+ add r10,r10,r2
+ ldr r2,[sp,#8]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r6,r6,r10
+ vshr.u32 d24,d7,#19
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ vsli.32 d24,d7,#13
+ add r9,r9,r2
+ eor r2,r7,r8
+ veor d25,d25,d24
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ vadd.i32 d0,d0,d25
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ vshr.u32 d24,d0,#17
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ vsli.32 d24,d0,#15
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ vshr.u32 d25,d0,#10
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#12]
+ and r3,r3,r12
+ vshr.u32 d24,d0,#19
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ vld1.32 {q8},[r14,:128]!
+ add r8,r8,r2
+ vsli.32 d24,d0,#13
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ veor d25,d25,d24
+ add r9,r9,r3
+ and r2,r2,r5
+ vadd.i32 d1,d1,d25
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ vadd.i32 q8,q8,q0
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#16]
+ and r12,r12,r3
+ add r4,r4,r8
+ vst1.32 {q8},[r1,:128]!
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vext.8 q8,q1,q2,#4
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ vext.8 q9,q3,q0,#4
+ add r8,r8,r12
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vadd.i32 q1,q1,q9
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#20]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ veor q9,q9,q10
+ add r6,r6,r2
+ vsli.32 q11,q8,#14
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ vshr.u32 d24,d1,#17
+ add r7,r7,r3
+ and r2,r2,r11
+ veor q9,q9,q11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ vsli.32 d24,d1,#15
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ vshr.u32 d25,d1,#10
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ vadd.i32 q1,q1,q9
+ add r6,r6,r2
+ ldr r2,[sp,#24]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r10,r10,r6
+ vshr.u32 d24,d1,#19
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ vsli.32 d24,d1,#13
+ add r5,r5,r2
+ eor r2,r11,r4
+ veor d25,d25,d24
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ vadd.i32 d2,d2,d25
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ vshr.u32 d24,d2,#17
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ vsli.32 d24,d2,#15
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ vshr.u32 d25,d2,#10
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#28]
+ and r3,r3,r12
+ vshr.u32 d24,d2,#19
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ vld1.32 {q8},[r14,:128]!
+ add r4,r4,r2
+ vsli.32 d24,d2,#13
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ veor d25,d25,d24
+ add r5,r5,r3
+ and r2,r2,r9
+ vadd.i32 d3,d3,d25
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ vadd.i32 q8,q8,q1
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[sp,#32]
+ and r12,r12,r3
+ add r8,r8,r4
+ vst1.32 {q8},[r1,:128]!
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ vext.8 q8,q2,q3,#4
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ vext.8 q9,q0,q1,#4
+ add r4,r4,r12
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vadd.i32 q2,q2,q9
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#36]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ veor q9,q9,q10
+ add r10,r10,r2
+ vsli.32 q11,q8,#14
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ vshr.u32 d24,d3,#17
+ add r11,r11,r3
+ and r2,r2,r7
+ veor q9,q9,q11
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ vsli.32 d24,d3,#15
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ vshr.u32 d25,d3,#10
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ vadd.i32 q2,q2,q9
+ add r10,r10,r2
+ ldr r2,[sp,#40]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r6,r6,r10
+ vshr.u32 d24,d3,#19
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ vsli.32 d24,d3,#13
+ add r9,r9,r2
+ eor r2,r7,r8
+ veor d25,d25,d24
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ vadd.i32 d4,d4,d25
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ vshr.u32 d24,d4,#17
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ vsli.32 d24,d4,#15
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ vshr.u32 d25,d4,#10
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#44]
+ and r3,r3,r12
+ vshr.u32 d24,d4,#19
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ vld1.32 {q8},[r14,:128]!
+ add r8,r8,r2
+ vsli.32 d24,d4,#13
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ veor d25,d25,d24
+ add r9,r9,r3
+ and r2,r2,r5
+ vadd.i32 d5,d5,d25
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ vadd.i32 q8,q8,q2
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#48]
+ and r12,r12,r3
+ add r4,r4,r8
+ vst1.32 {q8},[r1,:128]!
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vext.8 q8,q3,q0,#4
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ vext.8 q9,q1,q2,#4
+ add r8,r8,r12
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vadd.i32 q3,q3,q9
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#52]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ veor q9,q9,q10
+ add r6,r6,r2
+ vsli.32 q11,q8,#14
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ vshr.u32 d24,d5,#17
+ add r7,r7,r3
+ and r2,r2,r11
+ veor q9,q9,q11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ vsli.32 d24,d5,#15
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ vshr.u32 d25,d5,#10
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ vadd.i32 q3,q3,q9
+ add r6,r6,r2
+ ldr r2,[sp,#56]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r10,r10,r6
+ vshr.u32 d24,d5,#19
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ vsli.32 d24,d5,#13
+ add r5,r5,r2
+ eor r2,r11,r4
+ veor d25,d25,d24
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ vadd.i32 d6,d6,d25
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ vshr.u32 d24,d6,#17
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ vsli.32 d24,d6,#15
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ vshr.u32 d25,d6,#10
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#60]
+ and r3,r3,r12
+ vshr.u32 d24,d6,#19
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ vld1.32 {q8},[r14,:128]!
+ add r4,r4,r2
+ vsli.32 d24,d6,#13
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ veor d25,d25,d24
+ add r5,r5,r3
+ and r2,r2,r9
+ vadd.i32 d7,d7,d25
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ vadd.i32 q8,q8,q3
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[r14]
+ and r12,r12,r3
+ add r8,r8,r4
+ vst1.32 {q8},[r1,:128]!
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ teq r2,#0 @ check for K256 terminator
+ ldr r2,[sp,#0]
+ sub r1,r1,#64
+ bne .L_00_48
+
+ ldr r1,[sp,#68]
+ ldr r0,[sp,#72]
+ sub r14,r14,#256 @ rewind r14
+ teq r1,r0
+ it eq
+ subeq r1,r1,#64 @ avoid SEGV
+ vld1.8 {q0},[r1]! @ load next input block
+ vld1.8 {q1},[r1]!
+ vld1.8 {q2},[r1]!
+ vld1.8 {q3},[r1]!
+ it ne
+ strne r1,[sp,#68]
+ mov r1,sp
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vrev32.8 q0,q0
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vadd.i32 q8,q8,q0
+ ldr r2,[sp,#4]
+ and r3,r3,r12
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ add r10,r10,r2
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3
+ and r2,r2,r7
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ add r10,r10,r2
+ ldr r2,[sp,#8]
+ and r12,r12,r3
+ add r6,r6,r10
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ add r9,r9,r2
+ eor r2,r7,r8
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ ldr r2,[sp,#12]
+ and r3,r3,r12
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ add r8,r8,r2
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3
+ and r2,r2,r5
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#16]
+ and r12,r12,r3
+ add r4,r4,r8
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vst1.32 {q8},[r1,:128]!
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vrev32.8 q1,q1
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vadd.i32 q8,q8,q1
+ ldr r2,[sp,#20]
+ and r3,r3,r12
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ add r6,r6,r2
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3
+ and r2,r2,r11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ add r6,r6,r2
+ ldr r2,[sp,#24]
+ and r12,r12,r3
+ add r10,r10,r6
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ add r5,r5,r2
+ eor r2,r11,r4
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ ldr r2,[sp,#28]
+ and r3,r3,r12
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ add r4,r4,r2
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3
+ and r2,r2,r9
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[sp,#32]
+ and r12,r12,r3
+ add r8,r8,r4
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ vst1.32 {q8},[r1,:128]!
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vrev32.8 q2,q2
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vadd.i32 q8,q8,q2
+ ldr r2,[sp,#36]
+ and r3,r3,r12
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ add r10,r10,r2
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3
+ and r2,r2,r7
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ add r10,r10,r2
+ ldr r2,[sp,#40]
+ and r12,r12,r3
+ add r6,r6,r10
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ add r9,r9,r2
+ eor r2,r7,r8
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ ldr r2,[sp,#44]
+ and r3,r3,r12
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ add r8,r8,r2
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3
+ and r2,r2,r5
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#48]
+ and r12,r12,r3
+ add r4,r4,r8
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vst1.32 {q8},[r1,:128]!
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vrev32.8 q3,q3
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vadd.i32 q8,q8,q3
+ ldr r2,[sp,#52]
+ and r3,r3,r12
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ add r6,r6,r2
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3
+ and r2,r2,r11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ add r6,r6,r2
+ ldr r2,[sp,#56]
+ and r12,r12,r3
+ add r10,r10,r6
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ add r5,r5,r2
+ eor r2,r11,r4
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ ldr r2,[sp,#60]
+ and r3,r3,r12
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ add r4,r4,r2
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3
+ and r2,r2,r9
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[sp,#64]
+ and r12,r12,r3
+ add r8,r8,r4
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ vst1.32 {q8},[r1,:128]!
+ ldr r0,[r2,#0]
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldr r12,[r2,#4]
+ ldr r3,[r2,#8]
+ ldr r1,[r2,#12]
+ add r4,r4,r0 @ accumulate
+ ldr r0,[r2,#16]
+ add r5,r5,r12
+ ldr r12,[r2,#20]
+ add r6,r6,r3
+ ldr r3,[r2,#24]
+ add r7,r7,r1
+ ldr r1,[r2,#28]
+ add r8,r8,r0
+ str r4,[r2],#4
+ add r9,r9,r12
+ str r5,[r2],#4
+ add r10,r10,r3
+ str r6,[r2],#4
+ add r11,r11,r1
+ str r7,[r2],#4
+ stmia r2,{r8-r11}
+
+ ittte ne
+ movne r1,sp
+ ldrne r2,[sp,#0]
+ eorne r12,r12,r12
+ ldreq sp,[sp,#76] @ restore original sp
+ itt ne
+ eorne r3,r5,r6
+ bne .L_00_48
+
+ ldmia sp!,{r4-r12,pc}
+.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# ifdef __thumb2__
+# define INST(a,b,c,d) .byte c,d|0xc,a,b
+# else
+# define INST(a,b,c,d) .byte a,b,c,d
+# endif
+
+.type sha256_block_data_order_armv8,%function
+.align 5
+sha256_block_data_order_armv8:
+.LARMv8:
+ vld1.32 {q0,q1},[r0]
+# ifdef __thumb2__
+ adr r3,.LARMv8
+ sub r3,r3,#.LARMv8-K256
+# else
+ adrl r3,K256
+# endif
+ add r2,r1,r2,lsl#6 @ len to point at the end of inp
+
+.Loop_v8:
+ vld1.8 {q8-q9},[r1]!
+ vld1.8 {q10-q11},[r1]!
+ vld1.32 {q12},[r3]!
+ vrev32.8 q8,q8
+ vrev32.8 q9,q9
+ vrev32.8 q10,q10
+ vrev32.8 q11,q11
+ vmov q14,q0 @ offload
+ vmov q15,q1
+ teq r1,r2
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q8
+ INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q9
+ INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q10
+ INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q11
+ INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q8
+ INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q9
+ INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q10
+ INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q11
+ INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q8
+ INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q9
+ INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q10
+ INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q11
+ INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q8
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q9
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+
+ vld1.32 {q13},[r3]
+ vadd.i32 q12,q12,q10
+ sub r3,r3,#256-16 @ rewind
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+
+ vadd.i32 q13,q13,q11
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+
+ vadd.i32 q0,q0,q14
+ vadd.i32 q1,q1,q15
+ it ne
+ bne .Loop_v8
+
+ vst1.32 {q0,q1},[r0]
+
+ bx lr @ bx lr
+.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
+#endif
+.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro@openssl.org>"
+.align 2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm OPENSSL_armcap_P,4,4
+#endif
diff --git a/arch/arm/crypto/sha256_glue.c b/arch/arm/crypto/sha256_glue.c
new file mode 100644
index 000000000000..ff90ced528a1
--- /dev/null
+++ b/arch/arm/crypto/sha256_glue.c
@@ -0,0 +1,246 @@
+/*
+ * Glue code for the SHA256 Secure Hash Algorithm assembly implementation
+ * using optimized ARM assembler and NEON instructions.
+ *
+ * Copyright © 2015 Google Inc.
+ *
+ * This file is based on sha256_ssse3_glue.c:
+ * Copyright (C) 2013 Intel Corporation
+ * Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/simd.h>
+#include <asm/neon.h>
+#include "sha256_glue.h"
+
+asmlinkage void sha256_block_data_order(u32 *digest, const void *data,
+ unsigned int num_blks);
+
+
+int sha256_init(struct shash_desc *desc)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+
+ sctx->state[0] = SHA256_H0;
+ sctx->state[1] = SHA256_H1;
+ sctx->state[2] = SHA256_H2;
+ sctx->state[3] = SHA256_H3;
+ sctx->state[4] = SHA256_H4;
+ sctx->state[5] = SHA256_H5;
+ sctx->state[6] = SHA256_H6;
+ sctx->state[7] = SHA256_H7;
+ sctx->count = 0;
+
+ return 0;
+}
+
+int sha224_init(struct shash_desc *desc)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+
+ sctx->state[0] = SHA224_H0;
+ sctx->state[1] = SHA224_H1;
+ sctx->state[2] = SHA224_H2;
+ sctx->state[3] = SHA224_H3;
+ sctx->state[4] = SHA224_H4;
+ sctx->state[5] = SHA224_H5;
+ sctx->state[6] = SHA224_H6;
+ sctx->state[7] = SHA224_H7;
+ sctx->count = 0;
+
+ return 0;
+}
+
+int __sha256_update(struct shash_desc *desc, const u8 *data, unsigned int len,
+ unsigned int partial)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+ unsigned int done = 0;
+
+ sctx->count += len;
+
+ if (partial) {
+ done = SHA256_BLOCK_SIZE - partial;
+ memcpy(sctx->buf + partial, data, done);
+ sha256_block_data_order(sctx->state, sctx->buf, 1);
+ }
+
+ if (len - done >= SHA256_BLOCK_SIZE) {
+ const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
+
+ sha256_block_data_order(sctx->state, data + done, rounds);
+ done += rounds * SHA256_BLOCK_SIZE;
+ }
+
+ memcpy(sctx->buf, data + done, len - done);
+
+ return 0;
+}
+
+int sha256_update(struct shash_desc *desc, const u8 *data, unsigned int len)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+ unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
+
+ /* Handle the fast case right here */
+ if (partial + len < SHA256_BLOCK_SIZE) {
+ sctx->count += len;
+ memcpy(sctx->buf + partial, data, len);
+
+ return 0;
+ }
+
+ return __sha256_update(desc, data, len, partial);
+}
+
+/* Add padding and return the message digest. */
+static int sha256_final(struct shash_desc *desc, u8 *out)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+ unsigned int i, index, padlen;
+ __be32 *dst = (__be32 *)out;
+ __be64 bits;
+ static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
+
+ /* save number of bits */
+ bits = cpu_to_be64(sctx->count << 3);
+
+ /* Pad out to 56 mod 64 and append length */
+ index = sctx->count % SHA256_BLOCK_SIZE;
+ padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index);
+
+ /* We need to fill a whole block for __sha256_update */
+ if (padlen <= 56) {
+ sctx->count += padlen;
+ memcpy(sctx->buf + index, padding, padlen);
+ } else {
+ __sha256_update(desc, padding, padlen, index);
+ }
+ __sha256_update(desc, (const u8 *)&bits, sizeof(bits), 56);
+
+ /* Store state in digest */
+ for (i = 0; i < 8; i++)
+ dst[i] = cpu_to_be32(sctx->state[i]);
+
+ /* Wipe context */
+ memset(sctx, 0, sizeof(*sctx));
+
+ return 0;
+}
+
+static int sha224_final(struct shash_desc *desc, u8 *out)
+{
+ u8 D[SHA256_DIGEST_SIZE];
+
+ sha256_final(desc, D);
+
+ memcpy(out, D, SHA224_DIGEST_SIZE);
+ memzero_explicit(D, SHA256_DIGEST_SIZE);
+
+ return 0;
+}
+
+int sha256_export(struct shash_desc *desc, void *out)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(out, sctx, sizeof(*sctx));
+
+ return 0;
+}
+
+int sha256_import(struct shash_desc *desc, const void *in)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(sctx, in, sizeof(*sctx));
+
+ return 0;
+}
+
+static struct shash_alg algs[] = { {
+ .digestsize = SHA256_DIGEST_SIZE,
+ .init = sha256_init,
+ .update = sha256_update,
+ .final = sha256_final,
+ .export = sha256_export,
+ .import = sha256_import,
+ .descsize = sizeof(struct sha256_state),
+ .statesize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha256",
+ .cra_driver_name = "sha256-asm",
+ .cra_priority = 150,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA256_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+}, {
+ .digestsize = SHA224_DIGEST_SIZE,
+ .init = sha224_init,
+ .update = sha256_update,
+ .final = sha224_final,
+ .export = sha256_export,
+ .import = sha256_import,
+ .descsize = sizeof(struct sha256_state),
+ .statesize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha224",
+ .cra_driver_name = "sha224-asm",
+ .cra_priority = 150,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA224_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
+
+static int __init sha256_mod_init(void)
+{
+ int res = crypto_register_shashes(algs, ARRAY_SIZE(algs));
+
+ if (res < 0)
+ return res;
+
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon()) {
+ res = crypto_register_shashes(sha256_neon_algs,
+ ARRAY_SIZE(sha256_neon_algs));
+
+ if (res < 0)
+ crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+ }
+
+ return res;
+}
+
+static void __exit sha256_mod_fini(void)
+{
+ crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon())
+ crypto_unregister_shashes(sha256_neon_algs,
+ ARRAY_SIZE(sha256_neon_algs));
+}
+
+module_init(sha256_mod_init);
+module_exit(sha256_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm (ARM), including NEON");
+
+MODULE_ALIAS("sha256");
diff --git a/arch/arm/crypto/sha256_glue.h b/arch/arm/crypto/sha256_glue.h
new file mode 100644
index 000000000000..0312f4ffe8cc
--- /dev/null
+++ b/arch/arm/crypto/sha256_glue.h
@@ -0,0 +1,23 @@
+#ifndef _CRYPTO_SHA256_GLUE_H
+#define _CRYPTO_SHA256_GLUE_H
+
+#include <linux/crypto.h>
+#include <crypto/sha.h>
+
+extern struct shash_alg sha256_neon_algs[2];
+
+extern int sha256_init(struct shash_desc *desc);
+
+extern int sha224_init(struct shash_desc *desc);
+
+extern int __sha256_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len, unsigned int partial);
+
+extern int sha256_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len);
+
+extern int sha256_export(struct shash_desc *desc, void *out);
+
+extern int sha256_import(struct shash_desc *desc, const void *in);
+
+#endif /* _CRYPTO_SHA256_GLUE_H */
diff --git a/arch/arm/crypto/sha256_neon_glue.c b/arch/arm/crypto/sha256_neon_glue.c
new file mode 100644
index 000000000000..c4da10090eee
--- /dev/null
+++ b/arch/arm/crypto/sha256_neon_glue.c
@@ -0,0 +1,172 @@
+/*
+ * Glue code for the SHA256 Secure Hash Algorithm assembly implementation
+ * using NEON instructions.
+ *
+ * Copyright © 2015 Google Inc.
+ *
+ * This file is based on sha512_neon_glue.c:
+ * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/simd.h>
+#include <asm/neon.h>
+#include "sha256_glue.h"
+
+asmlinkage void sha256_block_data_order_neon(u32 *digest, const void *data,
+ unsigned int num_blks);
+
+
+static int __sha256_neon_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len, unsigned int partial)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+ unsigned int done = 0;
+
+ sctx->count += len;
+
+ if (partial) {
+ done = SHA256_BLOCK_SIZE - partial;
+ memcpy(sctx->buf + partial, data, done);
+ sha256_block_data_order_neon(sctx->state, sctx->buf, 1);
+ }
+
+ if (len - done >= SHA256_BLOCK_SIZE) {
+ const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
+
+ sha256_block_data_order_neon(sctx->state, data + done, rounds);
+ done += rounds * SHA256_BLOCK_SIZE;
+ }
+
+ memcpy(sctx->buf, data + done, len - done);
+
+ return 0;
+}
+
+static int sha256_neon_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+ unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
+ int res;
+
+ /* Handle the fast case right here */
+ if (partial + len < SHA256_BLOCK_SIZE) {
+ sctx->count += len;
+ memcpy(sctx->buf + partial, data, len);
+
+ return 0;
+ }
+
+ if (!may_use_simd()) {
+ res = __sha256_update(desc, data, len, partial);
+ } else {
+ kernel_neon_begin();
+ res = __sha256_neon_update(desc, data, len, partial);
+ kernel_neon_end();
+ }
+
+ return res;
+}
+
+/* Add padding and return the message digest. */
+static int sha256_neon_final(struct shash_desc *desc, u8 *out)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+ unsigned int i, index, padlen;
+ __be32 *dst = (__be32 *)out;
+ __be64 bits;
+ static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
+
+ /* save number of bits */
+ bits = cpu_to_be64(sctx->count << 3);
+
+ /* Pad out to 56 mod 64 and append length */
+ index = sctx->count % SHA256_BLOCK_SIZE;
+ padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index);
+
+ if (!may_use_simd()) {
+ sha256_update(desc, padding, padlen);
+ sha256_update(desc, (const u8 *)&bits, sizeof(bits));
+ } else {
+ kernel_neon_begin();
+ /* We need to fill a whole block for __sha256_neon_update() */
+ if (padlen <= 56) {
+ sctx->count += padlen;
+ memcpy(sctx->buf + index, padding, padlen);
+ } else {
+ __sha256_neon_update(desc, padding, padlen, index);
+ }
+ __sha256_neon_update(desc, (const u8 *)&bits,
+ sizeof(bits), 56);
+ kernel_neon_end();
+ }
+
+ /* Store state in digest */
+ for (i = 0; i < 8; i++)
+ dst[i] = cpu_to_be32(sctx->state[i]);
+
+ /* Wipe context */
+ memzero_explicit(sctx, sizeof(*sctx));
+
+ return 0;
+}
+
+static int sha224_neon_final(struct shash_desc *desc, u8 *out)
+{
+ u8 D[SHA256_DIGEST_SIZE];
+
+ sha256_neon_final(desc, D);
+
+ memcpy(out, D, SHA224_DIGEST_SIZE);
+ memzero_explicit(D, SHA256_DIGEST_SIZE);
+
+ return 0;
+}
+
+struct shash_alg sha256_neon_algs[] = { {
+ .digestsize = SHA256_DIGEST_SIZE,
+ .init = sha256_init,
+ .update = sha256_neon_update,
+ .final = sha256_neon_final,
+ .export = sha256_export,
+ .import = sha256_import,
+ .descsize = sizeof(struct sha256_state),
+ .statesize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha256",
+ .cra_driver_name = "sha256-neon",
+ .cra_priority = 250,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA256_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+}, {
+ .digestsize = SHA224_DIGEST_SIZE,
+ .init = sha224_init,
+ .update = sha256_neon_update,
+ .final = sha224_neon_final,
+ .export = sha256_export,
+ .import = sha256_import,
+ .descsize = sizeof(struct sha256_state),
+ .statesize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha224",
+ .cra_driver_name = "sha224-neon",
+ .cra_priority = 250,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA224_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild
index 70cd84eb7fda..9fb902425606 100644
--- a/arch/arm/include/asm/Kbuild
+++ b/arch/arm/include/asm/Kbuild
@@ -1,6 +1,5 @@
-generic-y += auxvec.h
generic-y += bitsperlong.h
generic-y += cputime.h
generic-y += current.h
diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
index f67fd3afebdf..700369a54b41 100644
--- a/arch/arm/include/asm/assembler.h
+++ b/arch/arm/include/asm/assembler.h
@@ -427,6 +427,53 @@ THUMB( orr \reg , \reg , #PSR_T_BIT )
#endif
.endm
+ .macro uaccess_disable, tmp, isb=1
+#ifdef CONFIG_CPU_SW_DOMAIN_PAN
+ /*
+ * Whenever we re-enter userspace, the domains should always be
+ * set appropriately.
+ */
+ mov \tmp, #DACR_UACCESS_DISABLE
+ mcr p15, 0, \tmp, c3, c0, 0 @ Set domain register
+ .if \isb
+ instr_sync
+ .endif
+#endif
+ .endm
+
+ .macro uaccess_enable, tmp, isb=1
+#ifdef CONFIG_CPU_SW_DOMAIN_PAN
+ /*
+ * Whenever we re-enter userspace, the domains should always be
+ * set appropriately.
+ */
+ mov \tmp, #DACR_UACCESS_ENABLE
+ mcr p15, 0, \tmp, c3, c0, 0
+ .if \isb
+ instr_sync
+ .endif
+#endif
+ .endm
+
+ .macro uaccess_save, tmp
+#ifdef CONFIG_CPU_SW_DOMAIN_PAN
+ mrc p15, 0, \tmp, c3, c0, 0
+ str \tmp, [sp, #S_FRAME_SIZE]
+#endif
+ .endm
+
+ .macro uaccess_restore
+#ifdef CONFIG_CPU_SW_DOMAIN_PAN
+ ldr r0, [sp, #S_FRAME_SIZE]
+ mcr p15, 0, r0, c3, c0, 0
+#endif
+ .endm
+
+ .macro uaccess_save_and_disable, tmp
+ uaccess_save \tmp
+ uaccess_disable \tmp
+ .endm
+
.irp c,,eq,ne,cs,cc,mi,pl,vs,vc,hi,ls,ge,lt,gt,le,hs,lo
.macro ret\c, reg
#if __LINUX_ARM_ARCH__ < 6
diff --git a/arch/arm/include/asm/auxvec.h b/arch/arm/include/asm/auxvec.h
new file mode 100644
index 000000000000..fbd388c46299
--- /dev/null
+++ b/arch/arm/include/asm/auxvec.h
@@ -0,0 +1 @@
+#include <uapi/asm/auxvec.h>
diff --git a/arch/arm/include/asm/bug.h b/arch/arm/include/asm/bug.h
index b274bde24905..e7335a92144e 100644
--- a/arch/arm/include/asm/bug.h
+++ b/arch/arm/include/asm/bug.h
@@ -40,6 +40,7 @@ do { \
"2:\t.asciz " #__file "\n" \
".popsection\n" \
".pushsection __bug_table,\"a\"\n" \
+ ".align 2\n" \
"3:\t.word 1b, 2b\n" \
"\t.hword " #__line ", 0\n" \
".popsection"); \
diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h
index 10e78d00a0bb..5797815727fe 100644
--- a/arch/arm/include/asm/cacheflush.h
+++ b/arch/arm/include/asm/cacheflush.h
@@ -487,6 +487,15 @@ int set_memory_rw(unsigned long addr, int numpages);
int set_memory_x(unsigned long addr, int numpages);
int set_memory_nx(unsigned long addr, int numpages);
+#ifdef CONFIG_DEBUG_RODATA
+void set_kernel_text_rw(void);
+void set_kernel_text_ro(void);
+#else
+static inline void set_kernel_text_rw(void) { }
+static inline void set_kernel_text_ro(void) { }
+#endif
+
void flush_uprobe_xol_access(struct page *page, unsigned long uaddr,
void *kaddr, unsigned long len);
+
#endif
diff --git a/arch/arm/include/asm/domain.h b/arch/arm/include/asm/domain.h
index 6ddbe446425e..fc8ba1663601 100644
--- a/arch/arm/include/asm/domain.h
+++ b/arch/arm/include/asm/domain.h
@@ -12,6 +12,7 @@
#ifndef __ASSEMBLY__
#include <asm/barrier.h>
+#include <asm/thread_info.h>
#endif
/*
@@ -34,15 +35,14 @@
*/
#ifndef CONFIG_IO_36
#define DOMAIN_KERNEL 0
-#define DOMAIN_TABLE 0
#define DOMAIN_USER 1
#define DOMAIN_IO 2
#else
#define DOMAIN_KERNEL 2
-#define DOMAIN_TABLE 2
#define DOMAIN_USER 1
#define DOMAIN_IO 0
#endif
+#define DOMAIN_VECTORS 3
/*
* Domain types
@@ -55,30 +55,65 @@
#define DOMAIN_MANAGER 1
#endif
-#define domain_val(dom,type) ((type) << (2*(dom)))
+#define domain_mask(dom) ((3) << (2 * (dom)))
+#define domain_val(dom,type) ((type) << (2 * (dom)))
+
+#ifdef CONFIG_CPU_SW_DOMAIN_PAN
+#define DACR_INIT \
+ (domain_val(DOMAIN_USER, DOMAIN_NOACCESS) | \
+ domain_val(DOMAIN_KERNEL, DOMAIN_MANAGER) | \
+ domain_val(DOMAIN_IO, DOMAIN_CLIENT) | \
+ domain_val(DOMAIN_VECTORS, DOMAIN_CLIENT))
+#else
+#define DACR_INIT \
+ (domain_val(DOMAIN_USER, DOMAIN_CLIENT) | \
+ domain_val(DOMAIN_KERNEL, DOMAIN_MANAGER) | \
+ domain_val(DOMAIN_IO, DOMAIN_CLIENT) | \
+ domain_val(DOMAIN_VECTORS, DOMAIN_CLIENT))
+#endif
+
+#define __DACR_DEFAULT \
+ domain_val(DOMAIN_KERNEL, DOMAIN_CLIENT) | \
+ domain_val(DOMAIN_IO, DOMAIN_CLIENT) | \
+ domain_val(DOMAIN_VECTORS, DOMAIN_CLIENT)
+
+#define DACR_UACCESS_DISABLE \
+ (__DACR_DEFAULT | domain_val(DOMAIN_USER, DOMAIN_NOACCESS))
+#define DACR_UACCESS_ENABLE \
+ (__DACR_DEFAULT | domain_val(DOMAIN_USER, DOMAIN_CLIENT))
#ifndef __ASSEMBLY__
-#ifdef CONFIG_CPU_USE_DOMAINS
+static inline unsigned int get_domain(void)
+{
+ unsigned int domain;
+
+ asm(
+ "mrc p15, 0, %0, c3, c0 @ get domain"
+ : "=r" (domain)
+ : "m" (current_thread_info()->cpu_domain));
+
+ return domain;
+}
+
static inline void set_domain(unsigned val)
{
asm volatile(
"mcr p15, 0, %0, c3, c0 @ set domain"
- : : "r" (val));
+ : : "r" (val) : "memory");
isb();
}
+#ifdef CONFIG_CPU_USE_DOMAINS
#define modify_domain(dom,type) \
do { \
- struct thread_info *thread = current_thread_info(); \
- unsigned int domain = thread->cpu_domain; \
- domain &= ~domain_val(dom, DOMAIN_MANAGER); \
- thread->cpu_domain = domain | domain_val(dom, type); \
- set_domain(thread->cpu_domain); \
+ unsigned int domain = get_domain(); \
+ domain &= ~domain_mask(dom); \
+ domain = domain | domain_val(dom, type); \
+ set_domain(domain); \
} while (0)
#else
-static inline void set_domain(unsigned val) { }
static inline void modify_domain(unsigned dom, unsigned type) { }
#endif
diff --git a/arch/arm/include/asm/elf.h b/arch/arm/include/asm/elf.h
index 674d03f4ba15..e2f4781630f6 100644
--- a/arch/arm/include/asm/elf.h
+++ b/arch/arm/include/asm/elf.h
@@ -1,7 +1,9 @@
#ifndef __ASMARM_ELF_H
#define __ASMARM_ELF_H
+#include <asm/auxvec.h>
#include <asm/hwcap.h>
+#include <asm/vdso_datapage.h>
/*
* ELF register definitions..
@@ -130,6 +132,13 @@ extern unsigned long arch_randomize_brk(struct mm_struct *mm);
#define arch_randomize_brk arch_randomize_brk
#ifdef CONFIG_MMU
+#ifdef CONFIG_VDSO
+#define ARCH_DLINFO \
+do { \
+ NEW_AUX_ENT(AT_SYSINFO_EHDR, \
+ (elf_addr_t)current->mm->context.vdso); \
+} while (0)
+#endif
#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
struct linux_binprm;
int arch_setup_additional_pages(struct linux_binprm *, int);
diff --git a/arch/arm/include/asm/fiq_glue.h b/arch/arm/include/asm/fiq_glue.h
new file mode 100644
index 000000000000..a9e244f9f197
--- /dev/null
+++ b/arch/arm/include/asm/fiq_glue.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2010 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __ASM_FIQ_GLUE_H
+#define __ASM_FIQ_GLUE_H
+
+struct fiq_glue_handler {
+ void (*fiq)(struct fiq_glue_handler *h, void *regs, void *svc_sp);
+ void (*resume)(struct fiq_glue_handler *h);
+};
+typedef void (*fiq_return_handler_t)(void);
+
+int fiq_glue_register_handler(struct fiq_glue_handler *handler);
+int fiq_glue_set_return_handler(fiq_return_handler_t fiq_return);
+int fiq_glue_clear_return_handler(fiq_return_handler_t fiq_return);
+
+#ifdef CONFIG_FIQ_GLUE
+void fiq_glue_resume(void);
+#else
+static inline void fiq_glue_resume(void) {}
+#endif
+
+#endif
diff --git a/arch/arm/include/asm/fixmap.h b/arch/arm/include/asm/fixmap.h
index 74124b0d0d79..0415eae1df27 100644
--- a/arch/arm/include/asm/fixmap.h
+++ b/arch/arm/include/asm/fixmap.h
@@ -2,27 +2,24 @@
#define _ASM_FIXMAP_H
#define FIXADDR_START 0xffc00000UL
-#define FIXADDR_TOP 0xffe00000UL
-#define FIXADDR_SIZE (FIXADDR_TOP - FIXADDR_START)
+#define FIXADDR_END 0xfff00000UL
+#define FIXADDR_TOP (FIXADDR_END - PAGE_SIZE)
-#define FIX_KMAP_NR_PTES (FIXADDR_SIZE >> PAGE_SHIFT)
+#include <asm/kmap_types.h>
-#define __fix_to_virt(x) (FIXADDR_START + ((x) << PAGE_SHIFT))
-#define __virt_to_fix(x) (((x) - FIXADDR_START) >> PAGE_SHIFT)
+enum fixed_addresses {
+ FIX_KMAP_BEGIN,
+ FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_TYPE_NR * NR_CPUS) - 1,
-extern void __this_fixmap_does_not_exist(void);
+ /* Support writing RO kernel text via kprobes, jump labels, etc. */
+ FIX_TEXT_POKE0,
+ FIX_TEXT_POKE1,
-static inline unsigned long fix_to_virt(const unsigned int idx)
-{
- if (idx >= FIX_KMAP_NR_PTES)
- __this_fixmap_does_not_exist();
- return __fix_to_virt(idx);
-}
+ __end_of_fixed_addresses
+};
-static inline unsigned int virt_to_fix(const unsigned long vaddr)
-{
- BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
- return __virt_to_fix(vaddr);
-}
+void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot);
+
+#include <asm-generic/fixmap.h>
#endif
diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h
index 53e69dae796f..616db1c0b61d 100644
--- a/arch/arm/include/asm/futex.h
+++ b/arch/arm/include/asm/futex.h
@@ -22,8 +22,11 @@
#ifdef CONFIG_SMP
#define __futex_atomic_op(insn, ret, oldval, tmp, uaddr, oparg) \
+({ \
+ unsigned int __ua_flags; \
smp_mb(); \
prefetchw(uaddr); \
+ __ua_flags = uaccess_save_and_enable(); \
__asm__ __volatile__( \
"1: ldrex %1, [%3]\n" \
" " insn "\n" \
@@ -34,12 +37,15 @@
__futex_atomic_ex_table("%5") \
: "=&r" (ret), "=&r" (oldval), "=&r" (tmp) \
: "r" (uaddr), "r" (oparg), "Ir" (-EFAULT) \
- : "cc", "memory")
+ : "cc", "memory"); \
+ uaccess_restore(__ua_flags); \
+})
static inline int
futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
u32 oldval, u32 newval)
{
+ unsigned int __ua_flags;
int ret;
u32 val;
@@ -49,6 +55,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
smp_mb();
/* Prefetching cannot fault */
prefetchw(uaddr);
+ __ua_flags = uaccess_save_and_enable();
__asm__ __volatile__("@futex_atomic_cmpxchg_inatomic\n"
"1: ldrex %1, [%4]\n"
" teq %1, %2\n"
@@ -61,6 +68,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
: "=&r" (ret), "=&r" (val)
: "r" (oldval), "r" (newval), "r" (uaddr), "Ir" (-EFAULT)
: "cc", "memory");
+ uaccess_restore(__ua_flags);
smp_mb();
*uval = val;
@@ -73,6 +81,8 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
#include <asm/domain.h>
#define __futex_atomic_op(insn, ret, oldval, tmp, uaddr, oparg) \
+({ \
+ unsigned int __ua_flags = uaccess_save_and_enable(); \
__asm__ __volatile__( \
"1: " TUSER(ldr) " %1, [%3]\n" \
" " insn "\n" \
@@ -81,18 +91,23 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
__futex_atomic_ex_table("%5") \
: "=&r" (ret), "=&r" (oldval), "=&r" (tmp) \
: "r" (uaddr), "r" (oparg), "Ir" (-EFAULT) \
- : "cc", "memory")
+ : "cc", "memory"); \
+ uaccess_restore(__ua_flags); \
+})
static inline int
futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
u32 oldval, u32 newval)
{
+ unsigned int __ua_flags;
int ret = 0;
u32 val;
if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
return -EFAULT;
+ preempt_disable();
+ __ua_flags = uaccess_save_and_enable();
__asm__ __volatile__("@futex_atomic_cmpxchg_inatomic\n"
"1: " TUSER(ldr) " %1, [%4]\n"
" teq %1, %2\n"
@@ -102,8 +117,11 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
: "+r" (ret), "=&r" (val)
: "r" (oldval), "r" (newval), "r" (uaddr), "Ir" (-EFAULT)
: "cc", "memory");
+ uaccess_restore(__ua_flags);
*uval = val;
+ preempt_enable();
+
return ret;
}
@@ -124,7 +142,10 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
return -EFAULT;
- pagefault_disable(); /* implies preempt_disable() */
+#ifndef CONFIG_SMP
+ preempt_disable();
+#endif
+ pagefault_disable();
switch (op) {
case FUTEX_OP_SET:
@@ -146,7 +167,10 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
ret = -ENOSYS;
}
- pagefault_enable(); /* subsumes preempt_enable() */
+ pagefault_enable();
+#ifndef CONFIG_SMP
+ preempt_enable();
+#endif
if (!ret) {
switch (cmp) {
diff --git a/arch/arm/include/asm/hardirq.h b/arch/arm/include/asm/hardirq.h
index fe3ea776dc34..5df33e30ae1b 100644
--- a/arch/arm/include/asm/hardirq.h
+++ b/arch/arm/include/asm/hardirq.h
@@ -5,7 +5,7 @@
#include <linux/threads.h>
#include <asm/irq.h>
-#define NR_IPI 8
+#define NR_IPI 9
typedef struct {
unsigned int __softirq_pending;
diff --git a/arch/arm/include/asm/hardware/coresight.h b/arch/arm/include/asm/hardware/coresight.h
index ad774f37c47c..b3507650b41c 100644
--- a/arch/arm/include/asm/hardware/coresight.h
+++ b/arch/arm/include/asm/hardware/coresight.h
@@ -17,15 +17,23 @@
#define TRACER_ACCESSED_BIT 0
#define TRACER_RUNNING_BIT 1
#define TRACER_CYCLE_ACC_BIT 2
+#define TRACER_TRACE_DATA_BIT 3
+#define TRACER_TIMESTAMP_BIT 4
+#define TRACER_BRANCHOUTPUT_BIT 5
+#define TRACER_RETURN_STACK_BIT 6
#define TRACER_ACCESSED BIT(TRACER_ACCESSED_BIT)
#define TRACER_RUNNING BIT(TRACER_RUNNING_BIT)
#define TRACER_CYCLE_ACC BIT(TRACER_CYCLE_ACC_BIT)
+#define TRACER_TRACE_DATA BIT(TRACER_TRACE_DATA_BIT)
+#define TRACER_TIMESTAMP BIT(TRACER_TIMESTAMP_BIT)
+#define TRACER_BRANCHOUTPUT BIT(TRACER_BRANCHOUTPUT_BIT)
+#define TRACER_RETURN_STACK BIT(TRACER_RETURN_STACK_BIT)
#define TRACER_TIMEOUT 10000
-#define etm_writel(t, v, x) \
- (writel_relaxed((v), (t)->etm_regs + (x)))
-#define etm_readl(t, x) (readl_relaxed((t)->etm_regs + (x)))
+#define etm_writel(t, id, v, x) \
+ (writel_relaxed((v), (t)->etm_regs[(id)] + (x)))
+#define etm_readl(t, id, x) (readl_relaxed((t)->etm_regs[(id)] + (x)))
/* CoreSight Management Registers */
#define CSMR_LOCKACCESS 0xfb0
@@ -43,7 +51,7 @@
#define ETMCTRL_POWERDOWN 1
#define ETMCTRL_PROGRAM (1 << 10)
#define ETMCTRL_PORTSEL (1 << 11)
-#define ETMCTRL_DO_CONTEXTID (3 << 14)
+#define ETMCTRL_CONTEXTIDSIZE(x) (((x) & 3) << 14)
#define ETMCTRL_PORTMASK1 (7 << 4)
#define ETMCTRL_PORTMASK2 (1 << 21)
#define ETMCTRL_PORTMASK (ETMCTRL_PORTMASK1 | ETMCTRL_PORTMASK2)
@@ -55,9 +63,12 @@
#define ETMCTRL_DATA_DO_BOTH (ETMCTRL_DATA_DO_DATA | ETMCTRL_DATA_DO_ADDR)
#define ETMCTRL_BRANCH_OUTPUT (1 << 8)
#define ETMCTRL_CYCLEACCURATE (1 << 12)
+#define ETMCTRL_TIMESTAMP_EN (1 << 28)
+#define ETMCTRL_RETURN_STACK_EN (1 << 29)
/* ETM configuration code register */
#define ETMR_CONFCODE (0x04)
+#define ETMCCR_ETMIDR_PRESENT BIT(31)
/* ETM trace start/stop resource control register */
#define ETMR_TRACESSCTRL (0x18)
@@ -113,10 +124,25 @@
#define ETMR_TRACEENCTRL 0x24
#define ETMTE_INCLEXCL BIT(24)
#define ETMR_TRACEENEVT 0x20
-#define ETMCTRL_OPTS (ETMCTRL_DO_CPRT | \
- ETMCTRL_DATA_DO_ADDR | \
- ETMCTRL_BRANCH_OUTPUT | \
- ETMCTRL_DO_CONTEXTID)
+
+#define ETMR_VIEWDATAEVT 0x30
+#define ETMR_VIEWDATACTRL1 0x34
+#define ETMR_VIEWDATACTRL2 0x38
+#define ETMR_VIEWDATACTRL3 0x3c
+#define ETMVDC3_EXCLONLY BIT(16)
+
+#define ETMCTRL_OPTS (ETMCTRL_DO_CPRT)
+
+#define ETMR_ID 0x1e4
+#define ETMIDR_VERSION(x) (((x) >> 4) & 0xff)
+#define ETMIDR_VERSION_3_1 0x21
+#define ETMIDR_VERSION_PFT_1_0 0x30
+
+#define ETMR_CCE 0x1e8
+#define ETMCCER_RETURN_STACK_IMPLEMENTED BIT(23)
+#define ETMCCER_TIMESTAMPING_IMPLEMENTED BIT(22)
+
+#define ETMR_TRACEIDR 0x200
/* ETM management registers, "ETM Architecture", 3.5.24 */
#define ETMMR_OSLAR 0x300
@@ -140,14 +166,16 @@
#define ETBFF_TRIGIN BIT(8)
#define ETBFF_TRIGEVT BIT(9)
#define ETBFF_TRIGFL BIT(10)
+#define ETBFF_STOPFL BIT(12)
#define etb_writel(t, v, x) \
(writel_relaxed((v), (t)->etb_regs + (x)))
#define etb_readl(t, x) (readl_relaxed((t)->etb_regs + (x)))
-#define etm_lock(t) do { etm_writel((t), 0, CSMR_LOCKACCESS); } while (0)
-#define etm_unlock(t) \
- do { etm_writel((t), CS_LAR_KEY, CSMR_LOCKACCESS); } while (0)
+#define etm_lock(t, id) \
+ do { etm_writel((t), (id), 0, CSMR_LOCKACCESS); } while (0)
+#define etm_unlock(t, id) \
+ do { etm_writel((t), (id), CS_LAR_KEY, CSMR_LOCKACCESS); } while (0)
#define etb_lock(t) do { etb_writel((t), 0, CSMR_LOCKACCESS); } while (0)
#define etb_unlock(t) \
diff --git a/arch/arm/include/asm/irq.h b/arch/arm/include/asm/irq.h
index 53c15dec7af6..809203a4b71b 100644
--- a/arch/arm/include/asm/irq.h
+++ b/arch/arm/include/asm/irq.h
@@ -35,6 +35,9 @@ extern void (*handle_arch_irq)(struct pt_regs *);
extern void set_handle_irq(void (*handle_irq)(struct pt_regs *));
#endif
+void arch_trigger_all_cpu_backtrace(void);
+#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
+
#endif
#endif
diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h
index 66ce17655bb9..7b0152321b20 100644
--- a/arch/arm/include/asm/kvm_emulate.h
+++ b/arch/arm/include/asm/kvm_emulate.h
@@ -38,6 +38,16 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
vcpu->arch.hcr = HCR_GUEST_MASK;
}
+static inline unsigned long vcpu_get_hcr(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hcr;
+}
+
+static inline void vcpu_set_hcr(struct kvm_vcpu *vcpu, unsigned long hcr)
+{
+ vcpu->arch.hcr = hcr;
+}
+
static inline bool vcpu_mode_is_32bit(struct kvm_vcpu *vcpu)
{
return 1;
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 53036e21756b..dc7a8f0ce9ee 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -125,9 +125,6 @@ struct kvm_vcpu_arch {
* Anything that is not used directly from assembly code goes
* here.
*/
- /* dcache set/way operation pending */
- int last_pcpu;
- cpumask_t require_dcache_flush;
/* Don't run the guest on this vcpu */
bool pause;
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 16d9d788d0b8..5de64c0e477d 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -245,7 +245,8 @@ static inline void __kvm_flush_dcache_pud(pud_t pud)
#define kvm_virt_to_phys(x) virt_to_idmap((unsigned long)(x))
-void stage2_flush_vm(struct kvm *kvm);
+void kvm_set_way_flush(struct kvm_vcpu *vcpu);
+void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled);
#endif /* !__ASSEMBLY__ */
diff --git a/arch/arm/include/asm/mach/arch.h b/arch/arm/include/asm/mach/arch.h
index 0406cb3f1af7..e881913f7c3e 100644
--- a/arch/arm/include/asm/mach/arch.h
+++ b/arch/arm/include/asm/mach/arch.h
@@ -51,7 +51,7 @@ struct machine_desc {
bool (*smp_init)(void);
void (*fixup)(struct tag *, char **);
void (*dt_fixup)(void);
- void (*init_meminfo)(void);
+ long long (*init_meminfo)(void);
void (*reserve)(void);/* reserve mem blocks */
void (*map_io)(void);/* IO mapping function */
void (*init_early)(void);
diff --git a/arch/arm/include/asm/mach/mmc.h b/arch/arm/include/asm/mach/mmc.h
new file mode 100644
index 000000000000..bca864ac945f
--- /dev/null
+++ b/arch/arm/include/asm/mach/mmc.h
@@ -0,0 +1,28 @@
+/*
+ * arch/arm/include/asm/mach/mmc.h
+ */
+#ifndef ASMARM_MACH_MMC_H
+#define ASMARM_MACH_MMC_H
+
+#include <linux/mmc/host.h>
+#include <linux/mmc/card.h>
+#include <linux/mmc/sdio_func.h>
+
+struct embedded_sdio_data {
+ struct sdio_cis cis;
+ struct sdio_cccr cccr;
+ struct sdio_embedded_func *funcs;
+ int num_funcs;
+};
+
+struct mmc_platform_data {
+ unsigned int ocr_mask; /* available voltages */
+ int built_in; /* built-in device flag */
+ int card_present; /* card detect state */
+ u32 (*translate_vdd)(struct device *, unsigned int);
+ unsigned int (*status)(struct device *);
+ struct embedded_sdio_data *embedded_sdio;
+ int (*register_status_notify)(void (*callback)(int card_present, void *dev_id), void *dev_id);
+};
+
+#endif
diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h
index e731018869a7..976dc09dee9c 100644
--- a/arch/arm/include/asm/memory.h
+++ b/arch/arm/include/asm/memory.h
@@ -18,8 +18,6 @@
#include <linux/types.h>
#include <linux/sizes.h>
-#include <asm/cache.h>
-
#ifdef CONFIG_NEED_MACH_MEMORY_H
#include <mach/memory.h>
#endif
@@ -121,32 +119,12 @@
#endif
/*
- * Convert a physical address to a Page Frame Number and back
- */
-#define __phys_to_pfn(paddr) ((unsigned long)((paddr) >> PAGE_SHIFT))
-#define __pfn_to_phys(pfn) ((phys_addr_t)(pfn) << PAGE_SHIFT)
-
-/*
* Convert a page to/from a physical address
*/
#define page_to_phys(page) (__pfn_to_phys(page_to_pfn(page)))
#define phys_to_page(phys) (pfn_to_page(__phys_to_pfn(phys)))
/*
- * Minimum guaranted alignment in pgd_alloc(). The page table pointers passed
- * around in head.S and proc-*.S are shifted by this amount, in order to
- * leave spare high bits for systems with physical address extension. This
- * does not fully accomodate the 40-bit addressing capability of ARM LPAE, but
- * gives us about 38-bits or so.
- */
-#ifdef CONFIG_ARM_LPAE
-#define ARCH_PGD_SHIFT L1_CACHE_SHIFT
-#else
-#define ARCH_PGD_SHIFT 0
-#endif
-#define ARCH_PGD_MASK ((1 << ARCH_PGD_SHIFT) - 1)
-
-/*
* PLAT_PHYS_OFFSET is the offset (from zero) of the start of physical
* memory. This is used for XIP and NoMMU kernels, and on platforms that don't
* have CONFIG_ARM_PATCH_PHYS_VIRT. Assembly code must always use
diff --git a/arch/arm/include/asm/mmu.h b/arch/arm/include/asm/mmu.h
index 64fd15159b7d..a5b47421059d 100644
--- a/arch/arm/include/asm/mmu.h
+++ b/arch/arm/include/asm/mmu.h
@@ -11,6 +11,9 @@ typedef struct {
#endif
unsigned int vmalloc_seq;
unsigned long sigpage;
+#ifdef CONFIG_VDSO
+ unsigned long vdso;
+#endif
} mm_context_t;
#ifdef CONFIG_CPU_HAS_ASID
diff --git a/arch/arm/include/asm/perf_event.h b/arch/arm/include/asm/perf_event.h
index c3a83691af8e..6f3bfce213d0 100644
--- a/arch/arm/include/asm/perf_event.h
+++ b/arch/arm/include/asm/perf_event.h
@@ -19,4 +19,11 @@ extern unsigned long perf_misc_flags(struct pt_regs *regs);
#define perf_misc_flags(regs) perf_misc_flags(regs)
#endif
+#define perf_arch_fetch_caller_regs(regs, __ip) { \
+ (regs)->ARM_pc = (__ip); \
+ (regs)->ARM_fp = (unsigned long) __builtin_frame_address(0); \
+ (regs)->ARM_sp = current_stack_pointer; \
+ (regs)->ARM_cpsr = SVC_MODE; \
+}
+
#endif /* __ARM_PERF_EVENT_H__ */
diff --git a/arch/arm/include/asm/pgalloc.h b/arch/arm/include/asm/pgalloc.h
index 78a779361682..19cfab526d13 100644
--- a/arch/arm/include/asm/pgalloc.h
+++ b/arch/arm/include/asm/pgalloc.h
@@ -157,7 +157,15 @@ pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, pte_t *ptep)
static inline void
pmd_populate(struct mm_struct *mm, pmd_t *pmdp, pgtable_t ptep)
{
- __pmd_populate(pmdp, page_to_phys(ptep), _PAGE_USER_TABLE);
+ extern pmdval_t user_pmd_table;
+ pmdval_t prot;
+
+ if (__LINUX_ARM_ARCH__ >= 6 && !IS_ENABLED(CONFIG_ARM_LPAE))
+ prot = user_pmd_table;
+ else
+ prot = _PAGE_USER_TABLE;
+
+ __pmd_populate(pmdp, page_to_phys(ptep), prot);
}
#define pmd_pgtable(pmd) pmd_page(pmd)
diff --git a/arch/arm/include/asm/pgtable-2level-hwdef.h b/arch/arm/include/asm/pgtable-2level-hwdef.h
index 5cfba15cb401..d0131ee6f6af 100644
--- a/arch/arm/include/asm/pgtable-2level-hwdef.h
+++ b/arch/arm/include/asm/pgtable-2level-hwdef.h
@@ -20,12 +20,15 @@
#define PMD_TYPE_FAULT (_AT(pmdval_t, 0) << 0)
#define PMD_TYPE_TABLE (_AT(pmdval_t, 1) << 0)
#define PMD_TYPE_SECT (_AT(pmdval_t, 2) << 0)
+#define PMD_PXNTABLE (_AT(pmdval_t, 1) << 2) /* v7 */
#define PMD_BIT4 (_AT(pmdval_t, 1) << 4)
#define PMD_DOMAIN(x) (_AT(pmdval_t, (x)) << 5)
+#define PMD_DOMAIN_MASK PMD_DOMAIN(0x0f)
#define PMD_PROTECTION (_AT(pmdval_t, 1) << 9) /* v5 */
/*
* - section
*/
+#define PMD_SECT_PXN (_AT(pmdval_t, 1) << 0) /* v7 */
#define PMD_SECT_BUFFERABLE (_AT(pmdval_t, 1) << 2)
#define PMD_SECT_CACHEABLE (_AT(pmdval_t, 1) << 3)
#define PMD_SECT_XN (_AT(pmdval_t, 1) << 4) /* v6 */
diff --git a/arch/arm/include/asm/pgtable-3level-hwdef.h b/arch/arm/include/asm/pgtable-3level-hwdef.h
index 9fd61c72a33a..f8f1cff62065 100644
--- a/arch/arm/include/asm/pgtable-3level-hwdef.h
+++ b/arch/arm/include/asm/pgtable-3level-hwdef.h
@@ -76,6 +76,7 @@
#define PTE_EXT_SHARED (_AT(pteval_t, 3) << 8) /* SH[1:0], inner shareable */
#define PTE_EXT_AF (_AT(pteval_t, 1) << 10) /* Access Flag */
#define PTE_EXT_NG (_AT(pteval_t, 1) << 11) /* nG */
+#define PTE_EXT_PXN (_AT(pteval_t, 1) << 53) /* PXN */
#define PTE_EXT_XN (_AT(pteval_t, 1) << 54) /* XN */
/*
diff --git a/arch/arm/include/asm/proc-fns.h b/arch/arm/include/asm/proc-fns.h
index 5324c1112f3a..8877ad5ffe10 100644
--- a/arch/arm/include/asm/proc-fns.h
+++ b/arch/arm/include/asm/proc-fns.h
@@ -125,13 +125,6 @@ extern void cpu_resume(void);
ttbr; \
})
-#define cpu_set_ttbr(nr, val) \
- do { \
- u64 ttbr = val; \
- __asm__("mcrr p15, " #nr ", %Q0, %R0, c2" \
- : : "r" (ttbr)); \
- } while (0)
-
#define cpu_get_pgd() \
({ \
u64 pg = cpu_get_ttbr(0); \
diff --git a/arch/arm/include/asm/smp.h b/arch/arm/include/asm/smp.h
index 18f5a554134f..9c647f5b7e5a 100644
--- a/arch/arm/include/asm/smp.h
+++ b/arch/arm/include/asm/smp.h
@@ -61,7 +61,7 @@ asmlinkage void secondary_start_kernel(void);
struct secondary_data {
union {
unsigned long mpu_rgn_szr;
- unsigned long pgdir;
+ u64 pgdir;
};
unsigned long swapper_pg_dir;
void *stack;
@@ -81,6 +81,8 @@ extern void arch_send_wakeup_ipi_mask(const struct cpumask *mask);
extern int register_ipi_completion(struct completion *completion, int cpu);
+extern void smp_send_all_cpu_backtrace(void);
+
struct smp_operations {
#ifdef CONFIG_SMP
/*
diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
index ce73ab635414..2db0105605b7 100644
--- a/arch/arm/include/asm/thread_info.h
+++ b/arch/arm/include/asm/thread_info.h
@@ -26,7 +26,6 @@ struct task_struct;
struct exec_domain;
#include <asm/types.h>
-#include <asm/domain.h>
typedef unsigned long mm_segment_t;
@@ -68,7 +67,6 @@ struct thread_info {
#ifdef CONFIG_ARM_THUMBEE
unsigned long thumbee_state; /* ThumbEE Handler Base register */
#endif
- struct restart_block restart_block;
};
#define INIT_THREAD_INFO(tsk) \
@@ -78,18 +76,17 @@ struct thread_info {
.flags = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
- .cpu_domain = domain_val(DOMAIN_USER, DOMAIN_MANAGER) | \
- domain_val(DOMAIN_KERNEL, DOMAIN_MANAGER) | \
- domain_val(DOMAIN_IO, DOMAIN_CLIENT), \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
#define init_stack (init_thread_union.stack)
/*
+ * how to get the current stack pointer in C
+ */
+register unsigned long current_stack_pointer asm ("sp");
+
+/*
* how to get the thread information struct from C
*/
static inline struct thread_info *current_thread_info(void) __attribute_const__;
diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index 2fe85fff5cca..7c8fc955e0df 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -3,6 +3,7 @@
#ifdef CONFIG_ARM_CPU_TOPOLOGY
+#include <linux/cpufreq.h>
#include <linux/cpumask.h>
struct cputopo_arm {
@@ -24,6 +25,12 @@ void init_cpu_topology(void);
void store_cpu_topology(unsigned int cpuid);
const struct cpumask *cpu_coregroup_mask(int cpu);
+#ifdef CONFIG_CPU_FREQ
+#define arch_scale_freq_capacity cpufreq_scale_freq_capacity
+#endif
+#define arch_scale_cpu_capacity scale_cpu_capacity
+extern unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu);
+
#else
static inline void init_cpu_topology(void) { }
diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h
index 4767eb9caa78..c5a51e79e74f 100644
--- a/arch/arm/include/asm/uaccess.h
+++ b/arch/arm/include/asm/uaccess.h
@@ -50,6 +50,35 @@ struct exception_table_entry
extern int fixup_exception(struct pt_regs *regs);
/*
+ * These two functions allow hooking accesses to userspace to increase
+ * system integrity by ensuring that the kernel can not inadvertantly
+ * perform such accesses (eg, via list poison values) which could then
+ * be exploited for priviledge escalation.
+ */
+static inline unsigned int uaccess_save_and_enable(void)
+{
+#ifdef CONFIG_CPU_SW_DOMAIN_PAN
+ unsigned int old_domain = get_domain();
+
+ /* Set the current domain access to permit user accesses */
+ set_domain((old_domain & ~domain_mask(DOMAIN_USER)) |
+ domain_val(DOMAIN_USER, DOMAIN_CLIENT));
+
+ return old_domain;
+#else
+ return 0;
+#endif
+}
+
+static inline void uaccess_restore(unsigned int flags)
+{
+#ifdef CONFIG_CPU_SW_DOMAIN_PAN
+ /* Restore the user access mask */
+ set_domain(flags);
+#endif
+}
+
+/*
* These two are intentionally not defined anywhere - if the kernel
* code generates any references to them, that's a bug.
*/
@@ -165,6 +194,7 @@ extern int __get_user_64t_4(void *);
register typeof(x) __r2 asm("r2"); \
register unsigned long __l asm("r1") = __limit; \
register int __e asm("r0"); \
+ unsigned int __ua_flags = uaccess_save_and_enable(); \
switch (sizeof(*(__p))) { \
case 1: \
if (sizeof((x)) >= 8) \
@@ -192,6 +222,7 @@ extern int __get_user_64t_4(void *);
break; \
default: __e = __get_user_bad(); break; \
} \
+ uaccess_restore(__ua_flags); \
x = (typeof(*(p))) __r2; \
__e; \
})
@@ -224,6 +255,7 @@ extern int __put_user_8(void *, unsigned long long);
register const typeof(*(p)) __user *__p asm("r0") = __tmp_p; \
register unsigned long __l asm("r1") = __limit; \
register int __e asm("r0"); \
+ unsigned int __ua_flags = uaccess_save_and_enable(); \
switch (sizeof(*(__p))) { \
case 1: \
__put_user_x(__r2, __p, __e, __l, 1); \
@@ -239,6 +271,7 @@ extern int __put_user_8(void *, unsigned long long);
break; \
default: __e = __put_user_bad(); break; \
} \
+ uaccess_restore(__ua_flags); \
__e; \
})
@@ -300,14 +333,17 @@ static inline void set_fs(mm_segment_t fs)
do { \
unsigned long __gu_addr = (unsigned long)(ptr); \
unsigned long __gu_val; \
+ unsigned int __ua_flags; \
__chk_user_ptr(ptr); \
might_fault(); \
+ __ua_flags = uaccess_save_and_enable(); \
switch (sizeof(*(ptr))) { \
case 1: __get_user_asm_byte(__gu_val,__gu_addr,err); break; \
case 2: __get_user_asm_half(__gu_val,__gu_addr,err); break; \
case 4: __get_user_asm_word(__gu_val,__gu_addr,err); break; \
default: (__gu_val) = __get_user_bad(); \
} \
+ uaccess_restore(__ua_flags); \
(x) = (__typeof__(*(ptr)))__gu_val; \
} while (0)
@@ -381,9 +417,11 @@ do { \
#define __put_user_err(x,ptr,err) \
do { \
unsigned long __pu_addr = (unsigned long)(ptr); \
+ unsigned int __ua_flags; \
__typeof__(*(ptr)) __pu_val = (x); \
__chk_user_ptr(ptr); \
might_fault(); \
+ __ua_flags = uaccess_save_and_enable(); \
switch (sizeof(*(ptr))) { \
case 1: __put_user_asm_byte(__pu_val,__pu_addr,err); break; \
case 2: __put_user_asm_half(__pu_val,__pu_addr,err); break; \
@@ -391,6 +429,7 @@ do { \
case 8: __put_user_asm_dword(__pu_val,__pu_addr,err); break; \
default: __put_user_bad(); \
} \
+ uaccess_restore(__ua_flags); \
} while (0)
#define __put_user_asm_byte(x,__pu_addr,err) \
@@ -474,11 +513,57 @@ do { \
#ifdef CONFIG_MMU
-extern unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n);
-extern unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n);
-extern unsigned long __must_check __copy_to_user_std(void __user *to, const void *from, unsigned long n);
-extern unsigned long __must_check __clear_user(void __user *addr, unsigned long n);
-extern unsigned long __must_check __clear_user_std(void __user *addr, unsigned long n);
+extern unsigned long __must_check
+arm_copy_from_user(void *to, const void __user *from, unsigned long n);
+
+static inline unsigned long __must_check
+__copy_from_user(void *to, const void __user *from, unsigned long n)
+{
+ unsigned int __ua_flags;
+
+ check_object_size(to, n, false);
+ __ua_flags = uaccess_save_and_enable();
+ n = arm_copy_from_user(to, from, n);
+ uaccess_restore(__ua_flags);
+ return n;
+}
+
+extern unsigned long __must_check
+arm_copy_to_user(void __user *to, const void *from, unsigned long n);
+extern unsigned long __must_check
+__copy_to_user_std(void __user *to, const void *from, unsigned long n);
+
+static inline unsigned long __must_check
+__copy_to_user(void __user *to, const void *from, unsigned long n)
+{
+#ifndef CONFIG_UACCESS_WITH_MEMCPY
+ unsigned int __ua_flags;
+
+ check_object_size(from, n, true);
+ __ua_flags = uaccess_save_and_enable();
+ n = arm_copy_to_user(to, from, n);
+ uaccess_restore(__ua_flags);
+ return n;
+#else
+ check_object_size(from, n, true);
+ return arm_copy_to_user(to, from, n);
+#endif
+}
+
+extern unsigned long __must_check
+arm_clear_user(void __user *addr, unsigned long n);
+extern unsigned long __must_check
+__clear_user_std(void __user *addr, unsigned long n);
+
+static inline unsigned long __must_check
+__clear_user(void __user *addr, unsigned long n)
+{
+ unsigned int __ua_flags = uaccess_save_and_enable();
+ n = arm_clear_user(addr, n);
+ uaccess_restore(__ua_flags);
+ return n;
+}
+
#else
#define __copy_from_user(to,from,n) (memcpy(to, (void __force *)from, n), 0)
#define __copy_to_user(to,from,n) (memcpy((void __force *)to, from, n), 0)
@@ -511,6 +596,7 @@ static inline unsigned long __must_check clear_user(void __user *to, unsigned lo
return n;
}
+/* These are from lib/ code, and use __get_user() and friends */
extern long strncpy_from_user(char *dest, const char __user *src, long count);
extern __must_check long strlen_user(const char __user *str);
diff --git a/arch/arm/include/asm/vdso.h b/arch/arm/include/asm/vdso.h
new file mode 100644
index 000000000000..d0295f1dd1a3
--- /dev/null
+++ b/arch/arm/include/asm/vdso.h
@@ -0,0 +1,32 @@
+#ifndef __ASM_VDSO_H
+#define __ASM_VDSO_H
+
+#ifdef __KERNEL__
+
+#ifndef __ASSEMBLY__
+
+struct mm_struct;
+
+#ifdef CONFIG_VDSO
+
+void arm_install_vdso(struct mm_struct *mm, unsigned long addr);
+
+extern char vdso_start, vdso_end;
+
+extern unsigned int vdso_total_pages;
+
+#else /* CONFIG_VDSO */
+
+static inline void arm_install_vdso(struct mm_struct *mm, unsigned long addr)
+{
+}
+
+#define vdso_total_pages 0
+
+#endif /* CONFIG_VDSO */
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* __KERNEL__ */
+
+#endif /* __ASM_VDSO_H */
diff --git a/arch/arm/include/asm/vdso_datapage.h b/arch/arm/include/asm/vdso_datapage.h
new file mode 100644
index 000000000000..9be259442fca
--- /dev/null
+++ b/arch/arm/include/asm/vdso_datapage.h
@@ -0,0 +1,60 @@
+/*
+ * Adapted from arm64 version.
+ *
+ * Copyright (C) 2012 ARM Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef __ASM_VDSO_DATAPAGE_H
+#define __ASM_VDSO_DATAPAGE_H
+
+#ifdef __KERNEL__
+
+#ifndef __ASSEMBLY__
+
+#include <asm/page.h>
+
+/* Try to be cache-friendly on systems that don't implement the
+ * generic timer: fit the unconditionally updated fields in the first
+ * 32 bytes.
+ */
+struct vdso_data {
+ u32 seq_count; /* sequence count - odd during updates */
+ u16 tk_is_cntvct; /* fall back to syscall if false */
+ u16 cs_shift; /* clocksource shift */
+ u32 xtime_coarse_sec; /* coarse time */
+ u32 xtime_coarse_nsec;
+
+ u32 wtm_clock_sec; /* wall to monotonic offset */
+ u32 wtm_clock_nsec;
+ u32 xtime_clock_sec; /* CLOCK_REALTIME - seconds */
+ u32 cs_mult; /* clocksource multiplier */
+
+ u64 cs_cycle_last; /* last cycle value */
+ u64 cs_mask; /* clocksource mask */
+
+ u64 xtime_clock_snsec; /* CLOCK_REALTIME sub-ns base */
+ u32 tz_minuteswest; /* timezone info for gettimeofday(2) */
+ u32 tz_dsttime;
+};
+
+union vdso_data_store {
+ struct vdso_data data;
+ u8 page[PAGE_SIZE];
+};
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* __KERNEL__ */
+
+#endif /* __ASM_VDSO_DATAPAGE_H */
diff --git a/arch/arm/include/uapi/asm/Kbuild b/arch/arm/include/uapi/asm/Kbuild
index 70a1c9da30ca..a1c05f93d920 100644
--- a/arch/arm/include/uapi/asm/Kbuild
+++ b/arch/arm/include/uapi/asm/Kbuild
@@ -1,6 +1,7 @@
# UAPI Header export list
include include/uapi/asm-generic/Kbuild.asm
+header-y += auxvec.h
header-y += byteorder.h
header-y += fcntl.h
header-y += hwcap.h
diff --git a/arch/arm/include/uapi/asm/auxvec.h b/arch/arm/include/uapi/asm/auxvec.h
new file mode 100644
index 000000000000..cb02a767a500
--- /dev/null
+++ b/arch/arm/include/uapi/asm/auxvec.h
@@ -0,0 +1,7 @@
+#ifndef __ASM_AUXVEC_H
+#define __ASM_AUXVEC_H
+
+/* VDSO location */
+#define AT_SYSINFO_EHDR 33
+
+#endif
diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
index 03120e656aea..5f042d45f190 100644
--- a/arch/arm/kernel/Makefile
+++ b/arch/arm/kernel/Makefile
@@ -67,7 +67,7 @@ test-kprobes-objs += kprobes-test-arm.o
endif
obj-$(CONFIG_OABI_COMPAT) += sys_oabi-compat.o
obj-$(CONFIG_ARM_THUMBEE) += thumbee.o
-obj-$(CONFIG_KGDB) += kgdb.o
+obj-$(CONFIG_KGDB) += kgdb.o patch.o
obj-$(CONFIG_ARM_UNWIND) += unwind.o
obj-$(CONFIG_HAVE_TCM) += tcm.o
obj-$(CONFIG_OF) += devtree.o
@@ -86,6 +86,7 @@ obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
obj-$(CONFIG_HW_PERF_EVENTS) += perf_event.o perf_event_cpu.o
AFLAGS_iwmmxt.o := -Wa,-mcpu=iwmmxt
obj-$(CONFIG_ARM_CPU_TOPOLOGY) += topology.o
+obj-$(CONFIG_VDSO) += vdso.o
ifneq ($(CONFIG_ARCH_EBSA110),y)
obj-y += io.o
diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c
index a88671cfe1ff..a35d72d30b56 100644
--- a/arch/arm/kernel/armksyms.c
+++ b/arch/arm/kernel/armksyms.c
@@ -91,9 +91,9 @@ EXPORT_SYMBOL(__memzero);
#ifdef CONFIG_MMU
EXPORT_SYMBOL(copy_page);
-EXPORT_SYMBOL(__copy_from_user);
-EXPORT_SYMBOL(__copy_to_user);
-EXPORT_SYMBOL(__clear_user);
+EXPORT_SYMBOL(arm_copy_from_user);
+EXPORT_SYMBOL(arm_copy_to_user);
+EXPORT_SYMBOL(arm_clear_user);
EXPORT_SYMBOL(__get_user_1);
EXPORT_SYMBOL(__get_user_2);
diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
index 2d2d6087b9b1..72b4c3873dc8 100644
--- a/arch/arm/kernel/asm-offsets.c
+++ b/arch/arm/kernel/asm-offsets.c
@@ -10,7 +10,6 @@
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
-#include <linux/compiler.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/dma-mapping.h>
@@ -25,6 +24,7 @@
#include <asm/memory.h>
#include <asm/procinfo.h>
#include <asm/suspend.h>
+#include <asm/vdso_datapage.h>
#include <asm/hardware/cache-l2x0.h>
#include <linux/kbuild.h>
@@ -40,19 +40,10 @@
* GCC 3.2.x: miscompiles NEW_AUX_ENT in fs/binfmt_elf.c
* (http://gcc.gnu.org/PR8896) and incorrect structure
* initialisation in fs/jffs2/erase.c
- * GCC 4.8.0-4.8.2: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58854
- * miscompiles find_get_entry(), and can result in EXT3 and EXT4
- * filesystem corruption (possibly other FS too).
*/
-#ifdef __GNUC__
#if (__GNUC__ == 3 && __GNUC_MINOR__ < 3)
#error Your compiler is too buggy; it is known to miscompile kernels.
-#error Known good compilers: 3.3, 4.x
-#endif
-#if GCC_VERSION >= 40800 && GCC_VERSION < 40803
-#error Your compiler is too buggy; it is known to miscompile kernels
-#error and result in filesystem corruption and oopses.
-#endif
+#error Known good compilers: 3.3
#endif
int main(void)
@@ -210,5 +201,9 @@ int main(void)
#endif
DEFINE(KVM_VTTBR, offsetof(struct kvm, arch.vttbr));
#endif
+ BLANK();
+#ifdef CONFIG_VDSO
+ DEFINE(VDSO_DATA_SIZE, sizeof(union vdso_data_store));
+#endif
return 0;
}
diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
index 2f5555d307b3..40d7c6e23480 100644
--- a/arch/arm/kernel/entry-armv.S
+++ b/arch/arm/kernel/entry-armv.S
@@ -146,10 +146,10 @@ ENDPROC(__und_invalid)
#define SPFIX(code...)
#endif
- .macro svc_entry, stack_hole=0, trace=1
+ .macro svc_entry, stack_hole=0, trace=1, uaccess=1
UNWIND(.fnstart )
UNWIND(.save {r0 - pc} )
- sub sp, sp, #(S_FRAME_SIZE + \stack_hole - 4)
+ sub sp, sp, #(S_FRAME_SIZE + 8 + \stack_hole - 4)
#ifdef CONFIG_THUMB2_KERNEL
SPFIX( str r0, [sp] ) @ temporarily saved
SPFIX( mov r0, sp )
@@ -164,7 +164,7 @@ ENDPROC(__und_invalid)
ldmia r0, {r3 - r5}
add r7, sp, #S_SP - 4 @ here for interlock avoidance
mov r6, #-1 @ "" "" "" ""
- add r2, sp, #(S_FRAME_SIZE + \stack_hole - 4)
+ add r2, sp, #(S_FRAME_SIZE + 8 + \stack_hole - 4)
SPFIX( addeq r2, r2, #4 )
str r3, [sp, #-4]! @ save the "real" r0 copied
@ from the exception stack
@@ -182,6 +182,11 @@ ENDPROC(__und_invalid)
@
stmia r7, {r2 - r6}
+ uaccess_save r0
+ .if \uaccess
+ uaccess_disable r0
+ .endif
+
.if \trace
#ifdef CONFIG_TRACE_IRQFLAGS
bl trace_hardirqs_off
@@ -191,7 +196,7 @@ ENDPROC(__und_invalid)
.align 5
__dabt_svc:
- svc_entry
+ svc_entry uaccess=0
mov r2, sp
dabt_helper
THUMB( ldr r5, [sp, #S_PSR] ) @ potentially updated CPSR
@@ -365,7 +370,7 @@ ENDPROC(__fiq_abt)
#error "sizeof(struct pt_regs) must be a multiple of 8"
#endif
- .macro usr_entry, trace=1
+ .macro usr_entry, trace=1, uaccess=1
UNWIND(.fnstart )
UNWIND(.cantunwind ) @ don't unwind the user space
sub sp, sp, #S_FRAME_SIZE
@@ -397,6 +402,10 @@ ENDPROC(__fiq_abt)
ARM( stmdb r0, {sp, lr}^ )
THUMB( store_user_sp_lr r0, r1, S_SP - S_PC )
+ .if \uaccess
+ uaccess_disable ip
+ .endif
+
@ Enable the alignment trap while in kernel mode
ATRAP( teq r8, r7)
ATRAP( mcrne p15, 0, r8, c1, c0, 0)
@@ -432,7 +441,7 @@ ENDPROC(__fiq_abt)
.align 5
__dabt_usr:
- usr_entry
+ usr_entry uaccess=0
kuser_cmpxchg_check
mov r2, sp
dabt_helper
@@ -455,7 +464,7 @@ ENDPROC(__irq_usr)
.align 5
__und_usr:
- usr_entry
+ usr_entry uaccess=0
mov r2, r4
mov r3, r5
@@ -481,6 +490,8 @@ __und_usr:
1: ldrt r0, [r4]
ARM_BE8(rev r0, r0) @ little endian instruction
+ uaccess_disable ip
+
@ r0 = 32-bit ARM instruction which caused the exception
@ r2 = PC value for the following instruction (:= regs->ARM_pc)
@ r4 = PC value for the faulting instruction
@@ -515,9 +526,10 @@ __und_usr_thumb:
2: ldrht r5, [r4]
ARM_BE8(rev16 r5, r5) @ little endian instruction
cmp r5, #0xe800 @ 32bit instruction if xx != 0
- blo __und_usr_fault_16 @ 16bit undefined instruction
+ blo __und_usr_fault_16_pan @ 16bit undefined instruction
3: ldrht r0, [r2]
ARM_BE8(rev16 r0, r0) @ little endian instruction
+ uaccess_disable ip
add r2, r2, #2 @ r2 is PC + 2, make it PC + 4
str r2, [sp, #S_PC] @ it's a 2x16bit instr, update
orr r0, r0, r5, lsl #16
@@ -712,6 +724,8 @@ ENDPROC(no_fp)
__und_usr_fault_32:
mov r1, #4
b 1f
+__und_usr_fault_16_pan:
+ uaccess_disable ip
__und_usr_fault_16:
mov r1, #2
1: mov r0, sp
@@ -767,6 +781,8 @@ ENTRY(__switch_to)
ldr r4, [r2, #TI_TP_VALUE]
ldr r5, [r2, #TI_TP_VALUE + 4]
#ifdef CONFIG_CPU_USE_DOMAINS
+ mrc p15, 0, r6, c3, c0, 0 @ Get domain register
+ str r6, [r1, #TI_CPU_DOMAIN] @ Save old domain register
ldr r6, [r2, #TI_CPU_DOMAIN]
#endif
switch_tls r1, r4, r5, r3, r7
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index eb6f1927b2c7..0edf561dd55d 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
@@ -408,6 +408,8 @@ ENTRY(vector_swi)
USER( ldr scno, [lr, #-4] ) @ get SWI instruction
#endif
+ uaccess_disable tbl
+
adr tbl, sys_call_table @ load syscall table pointer
#if defined(CONFIG_OABI_COMPAT)
diff --git a/arch/arm/kernel/entry-header.S b/arch/arm/kernel/entry-header.S
index 4176df721bf0..0d22ad206d52 100644
--- a/arch/arm/kernel/entry-header.S
+++ b/arch/arm/kernel/entry-header.S
@@ -196,7 +196,7 @@
msr cpsr_c, \rtemp @ switch back to the SVC mode
.endm
-#ifndef CONFIG_THUMB2_KERNEL
+
.macro svc_exit, rpsr, irq = 0
.if \irq != 0
@ IRQs already off
@@ -215,6 +215,10 @@
blne trace_hardirqs_off
#endif
.endif
+ uaccess_restore
+
+#ifndef CONFIG_THUMB2_KERNEL
+ @ ARM mode SVC restore
msr spsr_cxsf, \rpsr
#if defined(CONFIG_CPU_V6) || defined(CONFIG_CPU_32v6K)
@ We must avoid clrex due to Cortex-A15 erratum #830321
@@ -222,6 +226,20 @@
strex r1, r2, [r0] @ clear the exclusive monitor
#endif
ldmia sp, {r0 - pc}^ @ load r0 - pc, cpsr
+#else
+ @ Thumb mode SVC restore
+ ldr lr, [sp, #S_SP] @ top of the stack
+ ldrd r0, r1, [sp, #S_LR] @ calling lr and pc
+
+ @ We must avoid clrex due to Cortex-A15 erratum #830321
+ strex r2, r1, [sp, #S_LR] @ clear the exclusive monitor
+
+ stmdb lr!, {r0, r1, \rpsr} @ calling lr and rfe context
+ ldmia sp, {r0 - r12}
+ mov sp, lr
+ ldr lr, [sp], #4
+ rfeia sp!
+#endif
.endm
@
@@ -241,6 +259,9 @@
@ on the stack remains correct).
@
.macro svc_exit_via_fiq
+ uaccess_restore
+#ifndef CONFIG_THUMB2_KERNEL
+ @ ARM mode restore
mov r0, sp
ldmib r0, {r1 - r14} @ abort is deadly from here onward (it will
@ clobber state restored below)
@@ -250,65 +271,8 @@
msr spsr_cxsf, r9
ldr r0, [r0, #S_R0]
ldmia r8, {pc}^
- .endm
-
- .macro restore_user_regs, fast = 0, offset = 0
- ldr r1, [sp, #\offset + S_PSR] @ get calling cpsr
- ldr lr, [sp, #\offset + S_PC]! @ get pc
- msr spsr_cxsf, r1 @ save in spsr_svc
-#if defined(CONFIG_CPU_V6) || defined(CONFIG_CPU_32v6K)
- @ We must avoid clrex due to Cortex-A15 erratum #830321
- strex r1, r2, [sp] @ clear the exclusive monitor
-#endif
- .if \fast
- ldmdb sp, {r1 - lr}^ @ get calling r1 - lr
- .else
- ldmdb sp, {r0 - lr}^ @ get calling r0 - lr
- .endif
- mov r0, r0 @ ARMv5T and earlier require a nop
- @ after ldm {}^
- add sp, sp, #S_FRAME_SIZE - S_PC
- movs pc, lr @ return & move spsr_svc into cpsr
- .endm
-
-#else /* CONFIG_THUMB2_KERNEL */
- .macro svc_exit, rpsr, irq = 0
- .if \irq != 0
- @ IRQs already off
-#ifdef CONFIG_TRACE_IRQFLAGS
- @ The parent context IRQs must have been enabled to get here in
- @ the first place, so there's no point checking the PSR I bit.
- bl trace_hardirqs_on
-#endif
- .else
- @ IRQs off again before pulling preserved data off the stack
- disable_irq_notrace
-#ifdef CONFIG_TRACE_IRQFLAGS
- tst \rpsr, #PSR_I_BIT
- bleq trace_hardirqs_on
- tst \rpsr, #PSR_I_BIT
- blne trace_hardirqs_off
-#endif
- .endif
- ldr lr, [sp, #S_SP] @ top of the stack
- ldrd r0, r1, [sp, #S_LR] @ calling lr and pc
-
- @ We must avoid clrex due to Cortex-A15 erratum #830321
- strex r2, r1, [sp, #S_LR] @ clear the exclusive monitor
-
- stmdb lr!, {r0, r1, \rpsr} @ calling lr and rfe context
- ldmia sp, {r0 - r12}
- mov sp, lr
- ldr lr, [sp], #4
- rfeia sp!
- .endm
-
- @
- @ svc_exit_via_fiq - like svc_exit but switches to FIQ mode before exit
- @
- @ For full details see non-Thumb implementation above.
- @
- .macro svc_exit_via_fiq
+#else
+ @ Thumb mode restore
add r0, sp, #S_R2
ldr lr, [sp, #S_LR]
ldr sp, [sp, #S_SP] @ abort is deadly from here onward (it will
@@ -320,21 +284,41 @@
add r8, r0, #S_PC
ldmia r0, {r0 - r1}
rfeia r8
+#endif
.endm
-#ifdef CONFIG_CPU_V7M
- /*
- * Note we don't need to do clrex here as clearing the local monitor is
- * part of each exception entry and exit sequence.
- */
+
.macro restore_user_regs, fast = 0, offset = 0
+ uaccess_enable r1, isb=0
+#ifndef CONFIG_THUMB2_KERNEL
+ @ ARM mode restore
+ mov r2, sp
+ ldr r1, [r2, #\offset + S_PSR] @ get calling cpsr
+ ldr lr, [r2, #\offset + S_PC]! @ get pc
+ msr spsr_cxsf, r1 @ save in spsr_svc
+#if defined(CONFIG_CPU_V6) || defined(CONFIG_CPU_32v6K)
+ @ We must avoid clrex due to Cortex-A15 erratum #830321
+ strex r1, r2, [r2] @ clear the exclusive monitor
+#endif
+ .if \fast
+ ldmdb r2, {r1 - lr}^ @ get calling r1 - lr
+ .else
+ ldmdb r2, {r0 - lr}^ @ get calling r0 - lr
+ .endif
+ mov r0, r0 @ ARMv5T and earlier require a nop
+ @ after ldm {}^
+ add sp, sp, #\offset + S_FRAME_SIZE
+ movs pc, lr @ return & move spsr_svc into cpsr
+#elif defined(CONFIG_CPU_V7M)
+ @ V7M restore.
+ @ Note that we don't need to do clrex here as clearing the local
+ @ monitor is part of the exception entry and exit sequence.
.if \offset
add sp, #\offset
.endif
v7m_exception_slow_exit ret_r0 = \fast
- .endm
-#else /* ifdef CONFIG_CPU_V7M */
- .macro restore_user_regs, fast = 0, offset = 0
+#else
+ @ Thumb mode restore
mov r2, sp
load_user_sp_lr r2, r3, \offset + S_SP @ calling sp, lr
ldr r1, [sp, #\offset + S_PSR] @ get calling cpsr
@@ -352,9 +336,8 @@
.endif
add sp, sp, #S_FRAME_SIZE - S_SP
movs pc, lr @ return & move spsr_svc into cpsr
- .endm
-#endif /* ifdef CONFIG_CPU_V7M / else */
#endif /* !CONFIG_THUMB2_KERNEL */
+ .endm
/*
* Context tracking subsystem. Used to instrument transitions
diff --git a/arch/arm/kernel/etm.c b/arch/arm/kernel/etm.c
index 131a6ab5f355..a634e0ef6d1f 100644
--- a/arch/arm/kernel/etm.c
+++ b/arch/arm/kernel/etm.c
@@ -15,6 +15,7 @@
#include <linux/init.h>
#include <linux/types.h>
#include <linux/io.h>
+#include <linux/slab.h>
#include <linux/sysrq.h>
#include <linux/device.h>
#include <linux/clk.h>
@@ -37,26 +38,37 @@ MODULE_AUTHOR("Alexander Shishkin");
struct tracectx {
unsigned int etb_bufsz;
void __iomem *etb_regs;
- void __iomem *etm_regs;
+ void __iomem **etm_regs;
+ int etm_regs_count;
unsigned long flags;
int ncmppairs;
int etm_portsz;
+ int etm_contextid_size;
+ u32 etb_fc;
+ unsigned long range_start;
+ unsigned long range_end;
+ unsigned long data_range_start;
+ unsigned long data_range_end;
+ bool dump_initial_etb;
struct device *dev;
struct clk *emu_clk;
struct mutex mutex;
};
-static struct tracectx tracer;
+static struct tracectx tracer = {
+ .range_start = (unsigned long)_stext,
+ .range_end = (unsigned long)_etext,
+};
static inline bool trace_isrunning(struct tracectx *t)
{
return !!(t->flags & TRACER_RUNNING);
}
-static int etm_setup_address_range(struct tracectx *t, int n,
+static int etm_setup_address_range(struct tracectx *t, int id, int n,
unsigned long start, unsigned long end, int exclude, int data)
{
- u32 flags = ETMAAT_ARM | ETMAAT_IGNCONTEXTID | ETMAAT_NSONLY | \
+ u32 flags = ETMAAT_ARM | ETMAAT_IGNCONTEXTID | ETMAAT_IGNSECURITY |
ETMAAT_NOVALCMP;
if (n < 1 || n > t->ncmppairs)
@@ -72,95 +84,185 @@ static int etm_setup_address_range(struct tracectx *t, int n,
flags |= ETMAAT_IEXEC;
/* first comparator for the range */
- etm_writel(t, flags, ETMR_COMP_ACC_TYPE(n * 2));
- etm_writel(t, start, ETMR_COMP_VAL(n * 2));
+ etm_writel(t, id, flags, ETMR_COMP_ACC_TYPE(n * 2));
+ etm_writel(t, id, start, ETMR_COMP_VAL(n * 2));
/* second comparator is right next to it */
- etm_writel(t, flags, ETMR_COMP_ACC_TYPE(n * 2 + 1));
- etm_writel(t, end, ETMR_COMP_VAL(n * 2 + 1));
-
- flags = exclude ? ETMTE_INCLEXCL : 0;
- etm_writel(t, flags | (1 << n), ETMR_TRACEENCTRL);
+ etm_writel(t, id, flags, ETMR_COMP_ACC_TYPE(n * 2 + 1));
+ etm_writel(t, id, end, ETMR_COMP_VAL(n * 2 + 1));
+
+ if (data) {
+ flags = exclude ? ETMVDC3_EXCLONLY : 0;
+ if (exclude)
+ n += 8;
+ etm_writel(t, id, flags | BIT(n), ETMR_VIEWDATACTRL3);
+ } else {
+ flags = exclude ? ETMTE_INCLEXCL : 0;
+ etm_writel(t, id, flags | (1 << n), ETMR_TRACEENCTRL);
+ }
return 0;
}
-static int trace_start(struct tracectx *t)
+static int trace_start_etm(struct tracectx *t, int id)
{
u32 v;
unsigned long timeout = TRACER_TIMEOUT;
- etb_unlock(t);
-
- etb_writel(t, 0, ETBR_FORMATTERCTRL);
- etb_writel(t, 1, ETBR_CTRL);
-
- etb_lock(t);
-
- /* configure etm */
v = ETMCTRL_OPTS | ETMCTRL_PROGRAM | ETMCTRL_PORTSIZE(t->etm_portsz);
+ v |= ETMCTRL_CONTEXTIDSIZE(t->etm_contextid_size);
if (t->flags & TRACER_CYCLE_ACC)
v |= ETMCTRL_CYCLEACCURATE;
- etm_unlock(t);
+ if (t->flags & TRACER_BRANCHOUTPUT)
+ v |= ETMCTRL_BRANCH_OUTPUT;
+
+ if (t->flags & TRACER_TRACE_DATA)
+ v |= ETMCTRL_DATA_DO_ADDR;
+
+ if (t->flags & TRACER_TIMESTAMP)
+ v |= ETMCTRL_TIMESTAMP_EN;
+
+ if (t->flags & TRACER_RETURN_STACK)
+ v |= ETMCTRL_RETURN_STACK_EN;
- etm_writel(t, v, ETMR_CTRL);
+ etm_unlock(t, id);
- while (!(etm_readl(t, ETMR_CTRL) & ETMCTRL_PROGRAM) && --timeout)
+ etm_writel(t, id, v, ETMR_CTRL);
+
+ while (!(etm_readl(t, id, ETMR_CTRL) & ETMCTRL_PROGRAM) && --timeout)
;
if (!timeout) {
dev_dbg(t->dev, "Waiting for progbit to assert timed out\n");
- etm_lock(t);
+ etm_lock(t, id);
return -EFAULT;
}
- etm_setup_address_range(t, 1, (unsigned long)_stext,
- (unsigned long)_etext, 0, 0);
- etm_writel(t, 0, ETMR_TRACEENCTRL2);
- etm_writel(t, 0, ETMR_TRACESSCTRL);
- etm_writel(t, 0x6f, ETMR_TRACEENEVT);
+ if (t->range_start || t->range_end)
+ etm_setup_address_range(t, id, 1,
+ t->range_start, t->range_end, 0, 0);
+ else
+ etm_writel(t, id, ETMTE_INCLEXCL, ETMR_TRACEENCTRL);
+
+ etm_writel(t, id, 0, ETMR_TRACEENCTRL2);
+ etm_writel(t, id, 0, ETMR_TRACESSCTRL);
+ etm_writel(t, id, 0x6f, ETMR_TRACEENEVT);
+
+ etm_writel(t, id, 0, ETMR_VIEWDATACTRL1);
+ etm_writel(t, id, 0, ETMR_VIEWDATACTRL2);
+
+ if (t->data_range_start || t->data_range_end)
+ etm_setup_address_range(t, id, 2, t->data_range_start,
+ t->data_range_end, 0, 1);
+ else
+ etm_writel(t, id, ETMVDC3_EXCLONLY, ETMR_VIEWDATACTRL3);
+
+ etm_writel(t, id, 0x6f, ETMR_VIEWDATAEVT);
v &= ~ETMCTRL_PROGRAM;
v |= ETMCTRL_PORTSEL;
- etm_writel(t, v, ETMR_CTRL);
+ etm_writel(t, id, v, ETMR_CTRL);
timeout = TRACER_TIMEOUT;
- while (etm_readl(t, ETMR_CTRL) & ETMCTRL_PROGRAM && --timeout)
+ while (etm_readl(t, id, ETMR_CTRL) & ETMCTRL_PROGRAM && --timeout)
;
if (!timeout) {
dev_dbg(t->dev, "Waiting for progbit to deassert timed out\n");
- etm_lock(t);
+ etm_lock(t, id);
return -EFAULT;
}
- etm_lock(t);
+ etm_lock(t, id);
+ return 0;
+}
+
+static int trace_start(struct tracectx *t)
+{
+ int ret;
+ int id;
+ u32 etb_fc = t->etb_fc;
+
+ etb_unlock(t);
+
+ t->dump_initial_etb = false;
+ etb_writel(t, 0, ETBR_WRITEADDR);
+ etb_writel(t, etb_fc, ETBR_FORMATTERCTRL);
+ etb_writel(t, 1, ETBR_CTRL);
+
+ etb_lock(t);
+
+ /* configure etm(s) */
+ for (id = 0; id < t->etm_regs_count; id++) {
+ ret = trace_start_etm(t, id);
+ if (ret)
+ return ret;
+ }
t->flags |= TRACER_RUNNING;
return 0;
}
-static int trace_stop(struct tracectx *t)
+static int trace_stop_etm(struct tracectx *t, int id)
{
unsigned long timeout = TRACER_TIMEOUT;
- etm_unlock(t);
+ etm_unlock(t, id);
- etm_writel(t, 0x440, ETMR_CTRL);
- while (!(etm_readl(t, ETMR_CTRL) & ETMCTRL_PROGRAM) && --timeout)
+ etm_writel(t, id, 0x440, ETMR_CTRL);
+ while (!(etm_readl(t, id, ETMR_CTRL) & ETMCTRL_PROGRAM) && --timeout)
;
if (!timeout) {
- dev_dbg(t->dev, "Waiting for progbit to assert timed out\n");
- etm_lock(t);
+ dev_err(t->dev,
+ "etm%d: Waiting for progbit to assert timed out\n",
+ id);
+ etm_lock(t, id);
return -EFAULT;
}
- etm_lock(t);
+ etm_lock(t, id);
+ return 0;
+}
+
+static int trace_power_down_etm(struct tracectx *t, int id)
+{
+ unsigned long timeout = TRACER_TIMEOUT;
+ etm_unlock(t, id);
+ while (!(etm_readl(t, id, ETMR_STATUS) & ETMST_PROGBIT) && --timeout)
+ ;
+ if (!timeout) {
+ dev_err(t->dev, "etm%d: Waiting for status progbit to assert timed out\n",
+ id);
+ etm_lock(t, id);
+ return -EFAULT;
+ }
+
+ etm_writel(t, id, 0x441, ETMR_CTRL);
+
+ etm_lock(t, id);
+ return 0;
+}
+
+static int trace_stop(struct tracectx *t)
+{
+ int id;
+ unsigned long timeout = TRACER_TIMEOUT;
+ u32 etb_fc = t->etb_fc;
+
+ for (id = 0; id < t->etm_regs_count; id++)
+ trace_stop_etm(t, id);
+
+ for (id = 0; id < t->etm_regs_count; id++)
+ trace_power_down_etm(t, id);
etb_unlock(t);
- etb_writel(t, ETBFF_MANUAL_FLUSH, ETBR_FORMATTERCTRL);
+ if (etb_fc) {
+ etb_fc |= ETBFF_STOPFL;
+ etb_writel(t, t->etb_fc, ETBR_FORMATTERCTRL);
+ }
+ etb_writel(t, etb_fc | ETBFF_MANUAL_FLUSH, ETBR_FORMATTERCTRL);
timeout = TRACER_TIMEOUT;
while (etb_readl(t, ETBR_FORMATTERCTRL) &
@@ -185,24 +287,15 @@ static int trace_stop(struct tracectx *t)
static int etb_getdatalen(struct tracectx *t)
{
u32 v;
- int rp, wp;
+ int wp;
v = etb_readl(t, ETBR_STATUS);
if (v & 1)
return t->etb_bufsz;
- rp = etb_readl(t, ETBR_READADDR);
wp = etb_readl(t, ETBR_WRITEADDR);
-
- if (rp > wp) {
- etb_writel(t, 0, ETBR_READADDR);
- etb_writel(t, 0, ETBR_WRITEADDR);
-
- return 0;
- }
-
- return wp - rp;
+ return wp;
}
/* sysrq+v will always stop the running trace and leave it at that */
@@ -235,21 +328,18 @@ static void etm_dump(void)
printk("%08x", cpu_to_be32(etb_readl(t, ETBR_READMEM)));
printk(KERN_INFO "\n--- ETB buffer end ---\n");
- /* deassert the overflow bit */
- etb_writel(t, 1, ETBR_CTRL);
- etb_writel(t, 0, ETBR_CTRL);
-
- etb_writel(t, 0, ETBR_TRIGGERCOUNT);
- etb_writel(t, 0, ETBR_READADDR);
- etb_writel(t, 0, ETBR_WRITEADDR);
-
etb_lock(t);
}
static void sysrq_etm_dump(int key)
{
+ if (!mutex_trylock(&tracer.mutex)) {
+ printk(KERN_INFO "Tracing hardware busy\n");
+ return;
+ }
dev_dbg(tracer.dev, "Dumping ETB buffer\n");
etm_dump();
+ mutex_unlock(&tracer.mutex);
}
static struct sysrq_key_op sysrq_etm_op = {
@@ -276,6 +366,10 @@ static ssize_t etb_read(struct file *file, char __user *data,
struct tracectx *t = file->private_data;
u32 first = 0;
u32 *buf;
+ int wpos;
+ int skip;
+ long wlength;
+ loff_t pos = *ppos;
mutex_lock(&t->mutex);
@@ -287,31 +381,39 @@ static ssize_t etb_read(struct file *file, char __user *data,
etb_unlock(t);
total = etb_getdatalen(t);
+ if (total == 0 && t->dump_initial_etb)
+ total = t->etb_bufsz;
if (total == t->etb_bufsz)
first = etb_readl(t, ETBR_WRITEADDR);
+ if (pos > total * 4) {
+ skip = 0;
+ wpos = total;
+ } else {
+ skip = (int)pos % 4;
+ wpos = (int)pos / 4;
+ }
+ total -= wpos;
+ first = (first + wpos) % t->etb_bufsz;
+
etb_writel(t, first, ETBR_READADDR);
- length = min(total * 4, (int)len);
- buf = vmalloc(length);
+ wlength = min(total, DIV_ROUND_UP(skip + (int)len, 4));
+ length = min(total * 4 - skip, (int)len);
+ buf = vmalloc(wlength * 4);
- dev_dbg(t->dev, "ETB buffer length: %d\n", total);
+ dev_dbg(t->dev, "ETB read %ld bytes to %lld from %ld words at %d\n",
+ length, pos, wlength, first);
+ dev_dbg(t->dev, "ETB buffer length: %d\n", total + wpos);
dev_dbg(t->dev, "ETB status reg: %x\n", etb_readl(t, ETBR_STATUS));
- for (i = 0; i < length / 4; i++)
+ for (i = 0; i < wlength; i++)
buf[i] = etb_readl(t, ETBR_READMEM);
- /* the only way to deassert overflow bit in ETB status is this */
- etb_writel(t, 1, ETBR_CTRL);
- etb_writel(t, 0, ETBR_CTRL);
-
- etb_writel(t, 0, ETBR_WRITEADDR);
- etb_writel(t, 0, ETBR_READADDR);
- etb_writel(t, 0, ETBR_TRIGGERCOUNT);
-
etb_lock(t);
- length -= copy_to_user(data, buf, length);
+ length -= copy_to_user(data, (u8 *)buf + skip, length);
vfree(buf);
+ *ppos = pos + length;
out:
mutex_unlock(&t->mutex);
@@ -348,28 +450,17 @@ static int etb_probe(struct amba_device *dev, const struct amba_id *id)
if (ret)
goto out;
+ mutex_lock(&t->mutex);
t->etb_regs = ioremap_nocache(dev->res.start, resource_size(&dev->res));
if (!t->etb_regs) {
ret = -ENOMEM;
goto out_release;
}
+ t->dev = &dev->dev;
+ t->dump_initial_etb = true;
amba_set_drvdata(dev, t);
- etb_miscdev.parent = &dev->dev;
-
- ret = misc_register(&etb_miscdev);
- if (ret)
- goto out_unmap;
-
- t->emu_clk = clk_get(&dev->dev, "emu_src_ck");
- if (IS_ERR(t->emu_clk)) {
- dev_dbg(&dev->dev, "Failed to obtain emu_src_ck.\n");
- return -EFAULT;
- }
-
- clk_enable(t->emu_clk);
-
etb_unlock(t);
t->etb_bufsz = etb_readl(t, ETBR_DEPTH);
dev_dbg(&dev->dev, "Size: %x\n", t->etb_bufsz);
@@ -378,6 +469,20 @@ static int etb_probe(struct amba_device *dev, const struct amba_id *id)
etb_writel(t, 0, ETBR_CTRL);
etb_writel(t, 0x1000, ETBR_FORMATTERCTRL);
etb_lock(t);
+ mutex_unlock(&t->mutex);
+
+ etb_miscdev.parent = &dev->dev;
+
+ ret = misc_register(&etb_miscdev);
+ if (ret)
+ goto out_unmap;
+
+ /* Get optional clock. Currently used to select clock source on omap3 */
+ t->emu_clk = clk_get(&dev->dev, "emu_src_ck");
+ if (IS_ERR(t->emu_clk))
+ dev_dbg(&dev->dev, "Failed to obtain emu_src_ck.\n");
+ else
+ clk_enable(t->emu_clk);
dev_dbg(&dev->dev, "ETB AMBA driver initialized.\n");
@@ -385,9 +490,12 @@ out:
return ret;
out_unmap:
+ mutex_lock(&t->mutex);
iounmap(t->etb_regs);
+ t->etb_regs = NULL;
out_release:
+ mutex_unlock(&t->mutex);
amba_release_regions(dev);
return ret;
@@ -400,8 +508,10 @@ static int etb_remove(struct amba_device *dev)
iounmap(t->etb_regs);
t->etb_regs = NULL;
- clk_disable(t->emu_clk);
- clk_put(t->emu_clk);
+ if (!IS_ERR(t->emu_clk)) {
+ clk_disable(t->emu_clk);
+ clk_put(t->emu_clk);
+ }
amba_release_regions(dev);
@@ -445,7 +555,10 @@ static ssize_t trace_running_store(struct kobject *kobj,
return -EINVAL;
mutex_lock(&tracer.mutex);
- ret = value ? trace_start(&tracer) : trace_stop(&tracer);
+ if (!tracer.etb_regs)
+ ret = -ENODEV;
+ else
+ ret = value ? trace_start(&tracer) : trace_stop(&tracer);
mutex_unlock(&tracer.mutex);
return ret ? : n;
@@ -460,36 +573,50 @@ static ssize_t trace_info_show(struct kobject *kobj,
{
u32 etb_wa, etb_ra, etb_st, etb_fc, etm_ctrl, etm_st;
int datalen;
+ int id;
+ int ret;
- etb_unlock(&tracer);
- datalen = etb_getdatalen(&tracer);
- etb_wa = etb_readl(&tracer, ETBR_WRITEADDR);
- etb_ra = etb_readl(&tracer, ETBR_READADDR);
- etb_st = etb_readl(&tracer, ETBR_STATUS);
- etb_fc = etb_readl(&tracer, ETBR_FORMATTERCTRL);
- etb_lock(&tracer);
-
- etm_unlock(&tracer);
- etm_ctrl = etm_readl(&tracer, ETMR_CTRL);
- etm_st = etm_readl(&tracer, ETMR_STATUS);
- etm_lock(&tracer);
+ mutex_lock(&tracer.mutex);
+ if (tracer.etb_regs) {
+ etb_unlock(&tracer);
+ datalen = etb_getdatalen(&tracer);
+ etb_wa = etb_readl(&tracer, ETBR_WRITEADDR);
+ etb_ra = etb_readl(&tracer, ETBR_READADDR);
+ etb_st = etb_readl(&tracer, ETBR_STATUS);
+ etb_fc = etb_readl(&tracer, ETBR_FORMATTERCTRL);
+ etb_lock(&tracer);
+ } else {
+ etb_wa = etb_ra = etb_st = etb_fc = ~0;
+ datalen = -1;
+ }
- return sprintf(buf, "Trace buffer len: %d\nComparator pairs: %d\n"
+ ret = sprintf(buf, "Trace buffer len: %d\nComparator pairs: %d\n"
"ETBR_WRITEADDR:\t%08x\n"
"ETBR_READADDR:\t%08x\n"
"ETBR_STATUS:\t%08x\n"
- "ETBR_FORMATTERCTRL:\t%08x\n"
- "ETMR_CTRL:\t%08x\n"
- "ETMR_STATUS:\t%08x\n",
+ "ETBR_FORMATTERCTRL:\t%08x\n",
datalen,
tracer.ncmppairs,
etb_wa,
etb_ra,
etb_st,
- etb_fc,
+ etb_fc
+ );
+
+ for (id = 0; id < tracer.etm_regs_count; id++) {
+ etm_unlock(&tracer, id);
+ etm_ctrl = etm_readl(&tracer, id, ETMR_CTRL);
+ etm_st = etm_readl(&tracer, id, ETMR_STATUS);
+ etm_lock(&tracer, id);
+ ret += sprintf(buf + ret, "ETMR_CTRL:\t%08x\n"
+ "ETMR_STATUS:\t%08x\n",
etm_ctrl,
etm_st
);
+ }
+ mutex_unlock(&tracer.mutex);
+
+ return ret;
}
static struct kobj_attribute trace_info_attr =
@@ -528,42 +655,260 @@ static ssize_t trace_mode_store(struct kobject *kobj,
static struct kobj_attribute trace_mode_attr =
__ATTR(trace_mode, 0644, trace_mode_show, trace_mode_store);
+static ssize_t trace_contextid_size_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ /* 0: No context id tracing, 1: One byte, 2: Two bytes, 3: Four bytes */
+ return sprintf(buf, "%d\n", (1 << tracer.etm_contextid_size) >> 1);
+}
+
+static ssize_t trace_contextid_size_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned int contextid_size;
+
+ if (sscanf(buf, "%u", &contextid_size) != 1)
+ return -EINVAL;
+
+ if (contextid_size == 3 || contextid_size > 4)
+ return -EINVAL;
+
+ mutex_lock(&tracer.mutex);
+ tracer.etm_contextid_size = fls(contextid_size);
+ mutex_unlock(&tracer.mutex);
+
+ return n;
+}
+
+static struct kobj_attribute trace_contextid_size_attr =
+ __ATTR(trace_contextid_size, 0644,
+ trace_contextid_size_show, trace_contextid_size_store);
+
+static ssize_t trace_branch_output_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%d\n", !!(tracer.flags & TRACER_BRANCHOUTPUT));
+}
+
+static ssize_t trace_branch_output_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned int branch_output;
+
+ if (sscanf(buf, "%u", &branch_output) != 1)
+ return -EINVAL;
+
+ mutex_lock(&tracer.mutex);
+ if (branch_output) {
+ tracer.flags |= TRACER_BRANCHOUTPUT;
+ /* Branch broadcasting is incompatible with the return stack */
+ tracer.flags &= ~TRACER_RETURN_STACK;
+ } else {
+ tracer.flags &= ~TRACER_BRANCHOUTPUT;
+ }
+ mutex_unlock(&tracer.mutex);
+
+ return n;
+}
+
+static struct kobj_attribute trace_branch_output_attr =
+ __ATTR(trace_branch_output, 0644,
+ trace_branch_output_show, trace_branch_output_store);
+
+static ssize_t trace_return_stack_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%d\n", !!(tracer.flags & TRACER_RETURN_STACK));
+}
+
+static ssize_t trace_return_stack_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned int return_stack;
+
+ if (sscanf(buf, "%u", &return_stack) != 1)
+ return -EINVAL;
+
+ mutex_lock(&tracer.mutex);
+ if (return_stack) {
+ tracer.flags |= TRACER_RETURN_STACK;
+ /* Return stack is incompatible with branch broadcasting */
+ tracer.flags &= ~TRACER_BRANCHOUTPUT;
+ } else {
+ tracer.flags &= ~TRACER_RETURN_STACK;
+ }
+ mutex_unlock(&tracer.mutex);
+
+ return n;
+}
+
+static struct kobj_attribute trace_return_stack_attr =
+ __ATTR(trace_return_stack, 0644,
+ trace_return_stack_show, trace_return_stack_store);
+
+static ssize_t trace_timestamp_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%d\n", !!(tracer.flags & TRACER_TIMESTAMP));
+}
+
+static ssize_t trace_timestamp_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned int timestamp;
+
+ if (sscanf(buf, "%u", &timestamp) != 1)
+ return -EINVAL;
+
+ mutex_lock(&tracer.mutex);
+ if (timestamp)
+ tracer.flags |= TRACER_TIMESTAMP;
+ else
+ tracer.flags &= ~TRACER_TIMESTAMP;
+ mutex_unlock(&tracer.mutex);
+
+ return n;
+}
+
+static struct kobj_attribute trace_timestamp_attr =
+ __ATTR(trace_timestamp, 0644,
+ trace_timestamp_show, trace_timestamp_store);
+
+static ssize_t trace_range_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%08lx %08lx\n",
+ tracer.range_start, tracer.range_end);
+}
+
+static ssize_t trace_range_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned long range_start, range_end;
+
+ if (sscanf(buf, "%lx %lx", &range_start, &range_end) != 2)
+ return -EINVAL;
+
+ mutex_lock(&tracer.mutex);
+ tracer.range_start = range_start;
+ tracer.range_end = range_end;
+ mutex_unlock(&tracer.mutex);
+
+ return n;
+}
+
+
+static struct kobj_attribute trace_range_attr =
+ __ATTR(trace_range, 0644, trace_range_show, trace_range_store);
+
+static ssize_t trace_data_range_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ unsigned long range_start;
+ u64 range_end;
+ mutex_lock(&tracer.mutex);
+ range_start = tracer.data_range_start;
+ range_end = tracer.data_range_end;
+ if (!range_end && (tracer.flags & TRACER_TRACE_DATA))
+ range_end = 0x100000000ULL;
+ mutex_unlock(&tracer.mutex);
+ return sprintf(buf, "%08lx %08llx\n", range_start, range_end);
+}
+
+static ssize_t trace_data_range_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned long range_start;
+ u64 range_end;
+
+ if (sscanf(buf, "%lx %llx", &range_start, &range_end) != 2)
+ return -EINVAL;
+
+ mutex_lock(&tracer.mutex);
+ tracer.data_range_start = range_start;
+ tracer.data_range_end = (unsigned long)range_end;
+ if (range_end)
+ tracer.flags |= TRACER_TRACE_DATA;
+ else
+ tracer.flags &= ~TRACER_TRACE_DATA;
+ mutex_unlock(&tracer.mutex);
+
+ return n;
+}
+
+
+static struct kobj_attribute trace_data_range_attr =
+ __ATTR(trace_data_range, 0644,
+ trace_data_range_show, trace_data_range_store);
+
static int etm_probe(struct amba_device *dev, const struct amba_id *id)
{
struct tracectx *t = &tracer;
int ret = 0;
+ void __iomem **new_regs;
+ int new_count;
+ u32 etmccr;
+ u32 etmidr;
+ u32 etmccer = 0;
+ u8 etm_version = 0;
+
+ mutex_lock(&t->mutex);
+ new_count = t->etm_regs_count + 1;
+ new_regs = krealloc(t->etm_regs,
+ sizeof(t->etm_regs[0]) * new_count, GFP_KERNEL);
- if (t->etm_regs) {
- dev_dbg(&dev->dev, "ETM already initialized\n");
- ret = -EBUSY;
+ if (!new_regs) {
+ dev_dbg(&dev->dev, "Failed to allocate ETM register array\n");
+ ret = -ENOMEM;
goto out;
}
+ t->etm_regs = new_regs;
ret = amba_request_regions(dev, NULL);
if (ret)
goto out;
- t->etm_regs = ioremap_nocache(dev->res.start, resource_size(&dev->res));
- if (!t->etm_regs) {
+ t->etm_regs[t->etm_regs_count] =
+ ioremap_nocache(dev->res.start, resource_size(&dev->res));
+ if (!t->etm_regs[t->etm_regs_count]) {
ret = -ENOMEM;
goto out_release;
}
- amba_set_drvdata(dev, t);
+ amba_set_drvdata(dev, t->etm_regs[t->etm_regs_count]);
- mutex_init(&t->mutex);
- t->dev = &dev->dev;
- t->flags = TRACER_CYCLE_ACC;
+ t->flags = TRACER_CYCLE_ACC | TRACER_TRACE_DATA | TRACER_BRANCHOUTPUT;
t->etm_portsz = 1;
+ t->etm_contextid_size = 3;
- etm_unlock(t);
- (void)etm_readl(t, ETMMR_PDSR);
+ etm_unlock(t, t->etm_regs_count);
+ (void)etm_readl(t, t->etm_regs_count, ETMMR_PDSR);
/* dummy first read */
- (void)etm_readl(&tracer, ETMMR_OSSRR);
-
- t->ncmppairs = etm_readl(t, ETMR_CONFCODE) & 0xf;
- etm_writel(t, 0x440, ETMR_CTRL);
- etm_lock(t);
+ (void)etm_readl(&tracer, t->etm_regs_count, ETMMR_OSSRR);
+
+ etmccr = etm_readl(t, t->etm_regs_count, ETMR_CONFCODE);
+ t->ncmppairs = etmccr & 0xf;
+ if (etmccr & ETMCCR_ETMIDR_PRESENT) {
+ etmidr = etm_readl(t, t->etm_regs_count, ETMR_ID);
+ etm_version = ETMIDR_VERSION(etmidr);
+ if (etm_version >= ETMIDR_VERSION_3_1)
+ etmccer = etm_readl(t, t->etm_regs_count, ETMR_CCE);
+ }
+ etm_writel(t, t->etm_regs_count, 0x441, ETMR_CTRL);
+ etm_writel(t, t->etm_regs_count, new_count, ETMR_TRACEIDR);
+ etm_lock(t, t->etm_regs_count);
ret = sysfs_create_file(&dev->dev.kobj,
&trace_running_attr.attr);
@@ -579,32 +924,97 @@ static int etm_probe(struct amba_device *dev, const struct amba_id *id)
if (ret)
dev_dbg(&dev->dev, "Failed to create trace_mode in sysfs\n");
- dev_dbg(t->dev, "ETM AMBA driver initialized.\n");
+ ret = sysfs_create_file(&dev->dev.kobj,
+ &trace_contextid_size_attr.attr);
+ if (ret)
+ dev_dbg(&dev->dev,
+ "Failed to create trace_contextid_size in sysfs\n");
+
+ ret = sysfs_create_file(&dev->dev.kobj,
+ &trace_branch_output_attr.attr);
+ if (ret)
+ dev_dbg(&dev->dev,
+ "Failed to create trace_branch_output in sysfs\n");
+
+ if (etmccer & ETMCCER_RETURN_STACK_IMPLEMENTED) {
+ ret = sysfs_create_file(&dev->dev.kobj,
+ &trace_return_stack_attr.attr);
+ if (ret)
+ dev_dbg(&dev->dev,
+ "Failed to create trace_return_stack in sysfs\n");
+ }
+
+ if (etmccer & ETMCCER_TIMESTAMPING_IMPLEMENTED) {
+ ret = sysfs_create_file(&dev->dev.kobj,
+ &trace_timestamp_attr.attr);
+ if (ret)
+ dev_dbg(&dev->dev,
+ "Failed to create trace_timestamp in sysfs\n");
+ }
+
+ ret = sysfs_create_file(&dev->dev.kobj, &trace_range_attr.attr);
+ if (ret)
+ dev_dbg(&dev->dev, "Failed to create trace_range in sysfs\n");
+
+ if (etm_version < ETMIDR_VERSION_PFT_1_0) {
+ ret = sysfs_create_file(&dev->dev.kobj,
+ &trace_data_range_attr.attr);
+ if (ret)
+ dev_dbg(&dev->dev,
+ "Failed to create trace_data_range in sysfs\n");
+ } else {
+ tracer.flags &= ~TRACER_TRACE_DATA;
+ }
+
+ dev_dbg(&dev->dev, "ETM AMBA driver initialized.\n");
+
+ /* Enable formatter if there are multiple trace sources */
+ if (new_count > 1)
+ t->etb_fc = ETBFF_ENFCONT | ETBFF_ENFTC;
+
+ t->etm_regs_count = new_count;
out:
+ mutex_unlock(&t->mutex);
return ret;
out_unmap:
- iounmap(t->etm_regs);
+ iounmap(t->etm_regs[t->etm_regs_count]);
out_release:
amba_release_regions(dev);
+ mutex_unlock(&t->mutex);
return ret;
}
static int etm_remove(struct amba_device *dev)
{
- struct tracectx *t = amba_get_drvdata(dev);
-
- iounmap(t->etm_regs);
- t->etm_regs = NULL;
-
- amba_release_regions(dev);
+ int i;
+ struct tracectx *t = &tracer;
+ void __iomem *etm_regs = amba_get_drvdata(dev);
sysfs_remove_file(&dev->dev.kobj, &trace_running_attr.attr);
sysfs_remove_file(&dev->dev.kobj, &trace_info_attr.attr);
sysfs_remove_file(&dev->dev.kobj, &trace_mode_attr.attr);
+ sysfs_remove_file(&dev->dev.kobj, &trace_range_attr.attr);
+ sysfs_remove_file(&dev->dev.kobj, &trace_data_range_attr.attr);
+
+ mutex_lock(&t->mutex);
+ for (i = 0; i < t->etm_regs_count; i++)
+ if (t->etm_regs[i] == etm_regs)
+ break;
+ for (; i < t->etm_regs_count - 1; i++)
+ t->etm_regs[i] = t->etm_regs[i + 1];
+ t->etm_regs_count--;
+ if (!t->etm_regs_count) {
+ kfree(t->etm_regs);
+ t->etm_regs = NULL;
+ }
+ mutex_unlock(&t->mutex);
+
+ iounmap(etm_regs);
+ amba_release_regions(dev);
return 0;
}
@@ -614,6 +1024,10 @@ static struct amba_id etm_ids[] = {
.id = 0x0003b921,
.mask = 0x0007ffff,
},
+ {
+ .id = 0x0003b950,
+ .mask = 0x0007ffff,
+ },
{ 0, 0 },
};
@@ -631,6 +1045,8 @@ static int __init etm_init(void)
{
int retval;
+ mutex_init(&tracer.mutex);
+
retval = amba_driver_register(&etb_driver);
if (retval) {
printk(KERN_ERR "Failed to register etb\n");
diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c
index af9a8a927a4e..b8c75e45a950 100644
--- a/arch/arm/kernel/ftrace.c
+++ b/arch/arm/kernel/ftrace.c
@@ -15,6 +15,7 @@
#include <linux/ftrace.h>
#include <linux/uaccess.h>
#include <linux/module.h>
+#include <linux/stop_machine.h>
#include <asm/cacheflush.h>
#include <asm/opcodes.h>
@@ -35,6 +36,22 @@
#define OLD_NOP 0xe1a00000 /* mov r0, r0 */
+static int __ftrace_modify_code(void *data)
+{
+ int *command = data;
+
+ set_kernel_text_rw();
+ ftrace_modify_all_code(*command);
+ set_kernel_text_ro();
+
+ return 0;
+}
+
+void arch_ftrace_update_code(int command)
+{
+ stop_machine(__ftrace_modify_code, &command, NULL);
+}
+
static unsigned long ftrace_nop_replace(struct dyn_ftrace *rec)
{
return rec->arch.old_mcount ? OLD_NOP : NOP;
@@ -73,6 +90,8 @@ int ftrace_arch_code_modify_prepare(void)
int ftrace_arch_code_modify_post_process(void)
{
set_all_modules_text_ro();
+ /* Make sure any TLB misses during machine stop are cleared. */
+ flush_tlb_all();
return 0;
}
diff --git a/arch/arm/kernel/head-nommu.S b/arch/arm/kernel/head-nommu.S
index c6c66dd4be89..aefa2d91b755 100644
--- a/arch/arm/kernel/head-nommu.S
+++ b/arch/arm/kernel/head-nommu.S
@@ -118,7 +118,7 @@ ENTRY(secondary_startup)
THUMB( add r12, r10, #PROCINFO_INITFUNC )
THUMB( ret r12 )
1: bl __after_proc_init
- ldr sp, [r7, #8] @ set up the stack pointer
+ ldr sp, [r7, #12] @ set up the stack pointer
mov fp, #0
b secondary_start_kernel
ENDPROC(secondary_startup)
diff --git a/arch/arm/kernel/head.S b/arch/arm/kernel/head.S
index 664eee8c4a26..05e38931dc62 100644
--- a/arch/arm/kernel/head.S
+++ b/arch/arm/kernel/head.S
@@ -131,16 +131,33 @@ ENTRY(stext)
* The following calls CPU specific code in a position independent
* manner. See arch/arm/mm/proc-*.S for details. r10 = base of
* xxx_proc_info structure selected by __lookup_processor_type
- * above. On return, the CPU will be ready for the MMU to be
- * turned on, and r0 will hold the CPU control register value.
+ * above.
+ *
+ * The processor init function will be called with:
+ * r1 - machine type
+ * r2 - boot data (atags/dt) pointer
+ * r4 - translation table base (low word)
+ * r5 - translation table base (high word, if LPAE)
+ * r8 - translation table base 1 (pfn if LPAE)
+ * r9 - cpuid
+ * r13 - virtual address for __enable_mmu -> __turn_mmu_on
+ *
+ * On return, the CPU will be ready for the MMU to be turned on,
+ * r0 will hold the CPU control register value, r1, r2, r4, and
+ * r9 will be preserved. r5 will also be preserved if LPAE.
*/
ldr r13, =__mmap_switched @ address to jump to after
@ mmu has been enabled
adr lr, BSYM(1f) @ return (PIC) address
+#ifdef CONFIG_ARM_LPAE
+ mov r5, #0 @ high TTBR0
+ mov r8, r4, lsr #12 @ TTBR1 is swapper_pg_dir pfn
+#else
mov r8, r4 @ set TTBR1 to swapper_pg_dir
- ARM( add pc, r10, #PROCINFO_INITFUNC )
- THUMB( add r12, r10, #PROCINFO_INITFUNC )
- THUMB( ret r12 )
+#endif
+ ldr r12, [r10, #PROCINFO_INITFUNC]
+ add r12, r12, r10
+ ret r12
1: b __enable_mmu
ENDPROC(stext)
.ltorg
@@ -158,7 +175,7 @@ ENDPROC(stext)
*
* Returns:
* r0, r3, r5-r7 corrupted
- * r4 = page table (see ARCH_PGD_SHIFT in asm/memory.h)
+ * r4 = physical page table address
*/
__create_page_tables:
pgtbl r4, r8 @ page table address
@@ -333,7 +350,6 @@ __create_page_tables:
#endif
#ifdef CONFIG_ARM_LPAE
sub r4, r4, #0x1000 @ point to the PGD table
- mov r4, r4, lsr #ARCH_PGD_SHIFT
#endif
ret lr
ENDPROC(__create_page_tables)
@@ -375,22 +391,25 @@ ENTRY(secondary_startup)
adr r4, __secondary_data
ldmia r4, {r5, r7, r12} @ address to jump to after
sub lr, r4, r5 @ mmu has been enabled
- ldr r4, [r7, lr] @ get secondary_data.pgdir
- add r7, r7, #4
- ldr r8, [r7, lr] @ get secondary_data.swapper_pg_dir
+ add r3, r7, lr
+ ldrd r4, [r3, #0] @ get secondary_data.pgdir
+ARM_BE8(eor r4, r4, r5) @ Swap r5 and r4 in BE:
+ARM_BE8(eor r5, r4, r5) @ it can be done in 3 steps
+ARM_BE8(eor r4, r4, r5) @ without using a temp reg.
+ ldr r8, [r3, #8] @ get secondary_data.swapper_pg_dir
adr lr, BSYM(__enable_mmu) @ return address
mov r13, r12 @ __secondary_switched address
- ARM( add pc, r10, #PROCINFO_INITFUNC ) @ initialise processor
- @ (return control reg)
- THUMB( add r12, r10, #PROCINFO_INITFUNC )
- THUMB( ret r12 )
+ ldr r12, [r10, #PROCINFO_INITFUNC]
+ add r12, r12, r10 @ initialise processor
+ @ (return control reg)
+ ret r12
ENDPROC(secondary_startup)
/*
* r6 = &secondary_data
*/
ENTRY(__secondary_switched)
- ldr sp, [r7, #4] @ get secondary_data.stack
+ ldr sp, [r7, #12] @ get secondary_data.stack
mov fp, #0
b secondary_start_kernel
ENDPROC(__secondary_switched)
@@ -409,12 +428,14 @@ __secondary_data:
/*
* Setup common bits before finally enabling the MMU. Essentially
* this is just loading the page table pointer and domain access
- * registers.
+ * registers. All these registers need to be preserved by the
+ * processor setup function (or set in the case of r0)
*
* r0 = cp#15 control register
* r1 = machine ID
* r2 = atags or dtb pointer
- * r4 = page table (see ARCH_PGD_SHIFT in asm/memory.h)
+ * r4 = TTBR pointer (low word)
+ * r5 = TTBR pointer (high word if LPAE)
* r9 = processor ID
* r13 = *virtual* address to jump to upon completion
*/
@@ -433,11 +454,10 @@ __enable_mmu:
#ifdef CONFIG_CPU_ICACHE_DISABLE
bic r0, r0, #CR_I
#endif
-#ifndef CONFIG_ARM_LPAE
- mov r5, #(domain_val(DOMAIN_USER, DOMAIN_MANAGER) | \
- domain_val(DOMAIN_KERNEL, DOMAIN_MANAGER) | \
- domain_val(DOMAIN_TABLE, DOMAIN_MANAGER) | \
- domain_val(DOMAIN_IO, DOMAIN_CLIENT))
+#ifdef CONFIG_ARM_LPAE
+ mcrr p15, 0, r4, r5, c2 @ load TTBR0
+#else
+ mov r5, #DACR_INIT
mcr p15, 0, r5, c3, c0, 0 @ load domain access register
mcr p15, 0, r4, c2, c0, 0 @ load page table pointer
#endif
diff --git a/arch/arm/kernel/jump_label.c b/arch/arm/kernel/jump_label.c
index 4ce4f789446d..afeeb9ea6f43 100644
--- a/arch/arm/kernel/jump_label.c
+++ b/arch/arm/kernel/jump_label.c
@@ -19,7 +19,7 @@ static void __arch_jump_label_transform(struct jump_entry *entry,
insn = arm_gen_nop();
if (is_static)
- __patch_text(addr, insn);
+ __patch_text_early(addr, insn);
else
patch_text(addr, insn);
}
diff --git a/arch/arm/kernel/kgdb.c b/arch/arm/kernel/kgdb.c
index a74b53c1b7df..07f5059dac5e 100644
--- a/arch/arm/kernel/kgdb.c
+++ b/arch/arm/kernel/kgdb.c
@@ -12,8 +12,12 @@
#include <linux/irq.h>
#include <linux/kdebug.h>
#include <linux/kgdb.h>
+#include <linux/uaccess.h>
+
#include <asm/traps.h>
+#include "patch.h"
+
struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
{
{ "r0", 4, offsetof(struct pt_regs, ARM_r0)},
@@ -144,6 +148,8 @@ int kgdb_arch_handle_exception(int exception_vector, int signo,
static int kgdb_brk_fn(struct pt_regs *regs, unsigned int instr)
{
+ if (user_mode(regs))
+ return -1;
kgdb_handle_exception(1, SIGTRAP, 0, regs);
return 0;
@@ -151,6 +157,8 @@ static int kgdb_brk_fn(struct pt_regs *regs, unsigned int instr)
static int kgdb_compiled_brk_fn(struct pt_regs *regs, unsigned int instr)
{
+ if (user_mode(regs))
+ return -1;
compiled_break = 1;
kgdb_handle_exception(1, SIGTRAP, 0, regs);
@@ -244,6 +252,33 @@ void kgdb_arch_exit(void)
unregister_die_notifier(&kgdb_notifier);
}
+int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
+{
+ int err;
+
+ /* patch_text() only supports int-sized breakpoints */
+ BUILD_BUG_ON(sizeof(int) != BREAK_INSTR_SIZE);
+
+ err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
+ BREAK_INSTR_SIZE);
+ if (err)
+ return err;
+
+ /* Machine is already stopped, so we can use __patch_text() directly */
+ __patch_text((void *)bpt->bpt_addr,
+ *(unsigned int *)arch_kgdb_ops.gdb_bpt_instr);
+
+ return err;
+}
+
+int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
+{
+ /* Machine is already stopped, so we can use __patch_text() directly */
+ __patch_text((void *)bpt->bpt_addr, *(unsigned int *)bpt->saved_instr);
+
+ return 0;
+}
+
/*
* Register our undef instruction hooks with ARM undef core.
* We regsiter a hook specifically looking for the KGB break inst
diff --git a/arch/arm/kernel/machine_kexec.c b/arch/arm/kernel/machine_kexec.c
index 8cf0996aa1a8..4423a565ef6f 100644
--- a/arch/arm/kernel/machine_kexec.c
+++ b/arch/arm/kernel/machine_kexec.c
@@ -29,6 +29,7 @@ extern unsigned long kexec_boot_atags;
static atomic_t waiting_for_crash_ipi;
+static unsigned long dt_mem;
/*
* Provide a dummy crash_notes definition while crash dump arrives to arm.
* This prevents breakage of crash_notes attribute in kernel/ksysfs.c.
@@ -64,7 +65,7 @@ int machine_kexec_prepare(struct kimage *image)
return err;
if (be32_to_cpu(header) == OF_DT_HEADER)
- kexec_boot_atags = current_segment->mem;
+ dt_mem = current_segment->mem;
}
return 0;
}
@@ -163,12 +164,12 @@ void machine_kexec(struct kimage *image)
reboot_code_buffer = page_address(image->control_code_page);
/* Prepare parameters for reboot_code_buffer*/
+ set_kernel_text_rw();
kexec_start_address = image->start;
kexec_indirection_page = page_list;
kexec_mach_type = machine_arch_type;
- if (!kexec_boot_atags)
- kexec_boot_atags = image->start - KEXEC_ARM_ZIMAGE_OFFSET + KEXEC_ARM_ATAGS_OFFSET;
-
+ kexec_boot_atags = dt_mem ?: image->start - KEXEC_ARM_ZIMAGE_OFFSET
+ + KEXEC_ARM_ATAGS_OFFSET;
/* copy our kernel relocation code to the control code page */
reboot_entry = fncpy(reboot_code_buffer,
diff --git a/arch/arm/kernel/patch.c b/arch/arm/kernel/patch.c
index 07314af47733..5038960e3c55 100644
--- a/arch/arm/kernel/patch.c
+++ b/arch/arm/kernel/patch.c
@@ -1,8 +1,11 @@
#include <linux/kernel.h>
+#include <linux/spinlock.h>
#include <linux/kprobes.h>
+#include <linux/mm.h>
#include <linux/stop_machine.h>
#include <asm/cacheflush.h>
+#include <asm/fixmap.h>
#include <asm/smp_plat.h>
#include <asm/opcodes.h>
@@ -13,21 +16,77 @@ struct patch {
unsigned int insn;
};
-void __kprobes __patch_text(void *addr, unsigned int insn)
+static DEFINE_SPINLOCK(patch_lock);
+
+static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags)
+ __acquires(&patch_lock)
+{
+ unsigned int uintaddr = (uintptr_t) addr;
+ bool module = !core_kernel_text(uintaddr);
+ struct page *page;
+
+ if (module && IS_ENABLED(CONFIG_DEBUG_SET_MODULE_RONX))
+ page = vmalloc_to_page(addr);
+ else if (!module && IS_ENABLED(CONFIG_DEBUG_RODATA))
+ page = virt_to_page(addr);
+ else
+ return addr;
+
+ if (flags)
+ spin_lock_irqsave(&patch_lock, *flags);
+ else
+ __acquire(&patch_lock);
+
+ set_fixmap(fixmap, page_to_phys(page));
+
+ return (void *) (__fix_to_virt(fixmap) + (uintaddr & ~PAGE_MASK));
+}
+
+static void __kprobes patch_unmap(int fixmap, unsigned long *flags)
+ __releases(&patch_lock)
+{
+ clear_fixmap(fixmap);
+
+ if (flags)
+ spin_unlock_irqrestore(&patch_lock, *flags);
+ else
+ __release(&patch_lock);
+}
+
+void __kprobes __patch_text_real(void *addr, unsigned int insn, bool remap)
{
bool thumb2 = IS_ENABLED(CONFIG_THUMB2_KERNEL);
+ unsigned int uintaddr = (uintptr_t) addr;
+ bool twopage = false;
+ unsigned long flags;
+ void *waddr = addr;
int size;
+ if (remap)
+ waddr = patch_map(addr, FIX_TEXT_POKE0, &flags);
+ else
+ __acquire(&patch_lock);
+
if (thumb2 && __opcode_is_thumb16(insn)) {
- *(u16 *)addr = __opcode_to_mem_thumb16(insn);
+ *(u16 *)waddr = __opcode_to_mem_thumb16(insn);
size = sizeof(u16);
- } else if (thumb2 && ((uintptr_t)addr & 2)) {
+ } else if (thumb2 && (uintaddr & 2)) {
u16 first = __opcode_thumb32_first(insn);
u16 second = __opcode_thumb32_second(insn);
- u16 *addrh = addr;
+ u16 *addrh0 = waddr;
+ u16 *addrh1 = waddr + 2;
+
+ twopage = (uintaddr & ~PAGE_MASK) == PAGE_SIZE - 2;
+ if (twopage && remap)
+ addrh1 = patch_map(addr + 2, FIX_TEXT_POKE1, NULL);
+
+ *addrh0 = __opcode_to_mem_thumb16(first);
+ *addrh1 = __opcode_to_mem_thumb16(second);
- addrh[0] = __opcode_to_mem_thumb16(first);
- addrh[1] = __opcode_to_mem_thumb16(second);
+ if (twopage && addrh1 != addr + 2) {
+ flush_kernel_vmap_range(addrh1, 2);
+ patch_unmap(FIX_TEXT_POKE1, NULL);
+ }
size = sizeof(u32);
} else {
@@ -36,10 +95,16 @@ void __kprobes __patch_text(void *addr, unsigned int insn)
else
insn = __opcode_to_mem_arm(insn);
- *(u32 *)addr = insn;
+ *(u32 *)waddr = insn;
size = sizeof(u32);
}
+ if (waddr != addr) {
+ flush_kernel_vmap_range(waddr, twopage ? size / 2 : size);
+ patch_unmap(FIX_TEXT_POKE0, &flags);
+ } else
+ __release(&patch_lock);
+
flush_icache_range((uintptr_t)(addr),
(uintptr_t)(addr) + size);
}
@@ -60,16 +125,5 @@ void __kprobes patch_text(void *addr, unsigned int insn)
.insn = insn,
};
- if (cache_ops_need_broadcast()) {
- stop_machine(patch_text_stop_machine, &patch, cpu_online_mask);
- } else {
- bool straddles_word = IS_ENABLED(CONFIG_THUMB2_KERNEL)
- && __opcode_is_thumb32(insn)
- && ((uintptr_t)addr & 2);
-
- if (straddles_word)
- stop_machine(patch_text_stop_machine, &patch, NULL);
- else
- __patch_text(addr, insn);
- }
+ stop_machine(patch_text_stop_machine, &patch, NULL);
}
diff --git a/arch/arm/kernel/patch.h b/arch/arm/kernel/patch.h
index b4731f2dac38..77e054c2f6cd 100644
--- a/arch/arm/kernel/patch.h
+++ b/arch/arm/kernel/patch.h
@@ -2,6 +2,16 @@
#define _ARM_KERNEL_PATCH_H
void patch_text(void *addr, unsigned int insn);
-void __patch_text(void *addr, unsigned int insn);
+void __patch_text_real(void *addr, unsigned int insn, bool remap);
+
+static inline void __patch_text(void *addr, unsigned int insn)
+{
+ __patch_text_real(addr, insn, true);
+}
+
+static inline void __patch_text_early(void *addr, unsigned int insn)
+{
+ __patch_text_real(addr, insn, false);
+}
#endif
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index ecefea4e2929..e9d2276a3056 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -32,6 +32,7 @@
#include <linux/hw_breakpoint.h>
#include <linux/leds.h>
#include <linux/reboot.h>
+#include <linux/console.h>
#include <asm/cacheflush.h>
#include <asm/idmap.h>
@@ -41,6 +42,7 @@
#include <asm/system_misc.h>
#include <asm/mach/time.h>
#include <asm/tls.h>
+#include <asm/vdso.h>
#include "reboot.h"
#ifdef CONFIG_CC_STACKPROTECTOR
@@ -60,9 +62,46 @@ static const char *isa_modes[] __maybe_unused = {
"ARM" , "Thumb" , "Jazelle", "ThumbEE"
};
+#ifdef CONFIG_SMP
+void arch_trigger_all_cpu_backtrace(void)
+{
+ smp_send_all_cpu_backtrace();
+}
+#else
+void arch_trigger_all_cpu_backtrace(void)
+{
+ dump_stack();
+}
+#endif
+
extern void call_with_stack(void (*fn)(void *), void *arg, void *sp);
typedef void (*phys_reset_t)(unsigned long);
+#ifdef CONFIG_ARM_FLUSH_CONSOLE_ON_RESTART
+void arm_machine_flush_console(void)
+{
+ printk("\n");
+ pr_emerg("Restarting %s\n", linux_banner);
+ if (console_trylock()) {
+ console_unlock();
+ return;
+ }
+
+ mdelay(50);
+
+ local_irq_disable();
+ if (!console_trylock())
+ pr_emerg("arm_restart: Console was locked! Busting\n");
+ else
+ pr_emerg("arm_restart: Console was locked!\n");
+ console_unlock();
+}
+#else
+void arm_machine_flush_console(void)
+{
+}
+#endif
+
/*
* A temporary stack to use for CPU reset. This is static so that we
* don't clobber it with the identity mapping. When running with this
@@ -154,6 +193,7 @@ void arch_cpu_idle_prepare(void)
void arch_cpu_idle_enter(void)
{
+ idle_notifier_call_chain(IDLE_START);
ledtrig_cpu(CPU_LED_IDLE_START);
#ifdef CONFIG_PL310_ERRATA_769419
wmb();
@@ -163,6 +203,7 @@ void arch_cpu_idle_enter(void)
void arch_cpu_idle_exit(void)
{
ledtrig_cpu(CPU_LED_IDLE_END);
+ idle_notifier_call_chain(IDLE_END);
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -183,6 +224,16 @@ void arch_cpu_idle_dead(void)
*/
void machine_shutdown(void)
{
+#ifdef CONFIG_SMP
+ /*
+ * Disable preemption so we're guaranteed to
+ * run to power off or reboot and prevent
+ * the possibility of switching to another
+ * thread that might wind up blocking on
+ * one of the stopped CPUs.
+ */
+ preempt_disable();
+#endif
disable_nonboot_cpus();
}
@@ -231,6 +282,11 @@ void machine_restart(char *cmd)
local_irq_disable();
smp_send_stop();
+
+ /* Flush the console to make sure all the relevant messages make it
+ * out to the console drivers */
+ arm_machine_flush_console();
+
if (arm_pm_restart)
arm_pm_restart(reboot_mode, cmd);
else
@@ -245,6 +301,77 @@ void machine_restart(char *cmd)
while (1);
}
+/*
+ * dump a block of kernel memory from around the given address
+ */
+static void show_data(unsigned long addr, int nbytes, const char *name)
+{
+ int i, j;
+ int nlines;
+ u32 *p;
+
+ /*
+ * don't attempt to dump non-kernel addresses or
+ * values that are probably just small negative numbers
+ */
+ if (addr < PAGE_OFFSET || addr > -256UL)
+ return;
+
+ printk("\n%s: %#lx:\n", name, addr);
+
+ /*
+ * round address down to a 32 bit boundary
+ * and always dump a multiple of 32 bytes
+ */
+ p = (u32 *)(addr & ~(sizeof(u32) - 1));
+ nbytes += (addr & (sizeof(u32) - 1));
+ nlines = (nbytes + 31) / 32;
+
+
+ for (i = 0; i < nlines; i++) {
+ /*
+ * just display low 16 bits of address to keep
+ * each line of the dump < 80 characters
+ */
+ printk("%04lx ", (unsigned long)p & 0xffff);
+ for (j = 0; j < 8; j++) {
+ u32 data;
+ if (probe_kernel_address(p, data)) {
+ printk(" ********");
+ } else {
+ printk(" %08x", data);
+ }
+ ++p;
+ }
+ printk("\n");
+ }
+}
+
+static void show_extra_register_data(struct pt_regs *regs, int nbytes)
+{
+ mm_segment_t fs;
+
+ fs = get_fs();
+ set_fs(KERNEL_DS);
+ show_data(regs->ARM_pc - nbytes, nbytes * 2, "PC");
+ show_data(regs->ARM_lr - nbytes, nbytes * 2, "LR");
+ show_data(regs->ARM_sp - nbytes, nbytes * 2, "SP");
+ show_data(regs->ARM_ip - nbytes, nbytes * 2, "IP");
+ show_data(regs->ARM_fp - nbytes, nbytes * 2, "FP");
+ show_data(regs->ARM_r0 - nbytes, nbytes * 2, "R0");
+ show_data(regs->ARM_r1 - nbytes, nbytes * 2, "R1");
+ show_data(regs->ARM_r2 - nbytes, nbytes * 2, "R2");
+ show_data(regs->ARM_r3 - nbytes, nbytes * 2, "R3");
+ show_data(regs->ARM_r4 - nbytes, nbytes * 2, "R4");
+ show_data(regs->ARM_r5 - nbytes, nbytes * 2, "R5");
+ show_data(regs->ARM_r6 - nbytes, nbytes * 2, "R6");
+ show_data(regs->ARM_r7 - nbytes, nbytes * 2, "R7");
+ show_data(regs->ARM_r8 - nbytes, nbytes * 2, "R8");
+ show_data(regs->ARM_r9 - nbytes, nbytes * 2, "R9");
+ show_data(regs->ARM_r10 - nbytes, nbytes * 2, "R10");
+ set_fs(fs);
+}
+
void __show_regs(struct pt_regs *regs)
{
unsigned long flags;
@@ -276,12 +403,36 @@ void __show_regs(struct pt_regs *regs)
buf[4] = '\0';
#ifndef CONFIG_CPU_V7M
- printk("Flags: %s IRQs o%s FIQs o%s Mode %s ISA %s Segment %s\n",
- buf, interrupts_enabled(regs) ? "n" : "ff",
- fast_interrupts_enabled(regs) ? "n" : "ff",
- processor_modes[processor_mode(regs)],
- isa_modes[isa_mode(regs)],
- get_fs() == get_ds() ? "kernel" : "user");
+ {
+ unsigned int domain = get_domain();
+ const char *segment;
+
+#ifdef CONFIG_CPU_SW_DOMAIN_PAN
+ /*
+ * Get the domain register for the parent context. In user
+ * mode, we don't save the DACR, so lets use what it should
+ * be. For other modes, we place it after the pt_regs struct.
+ */
+ if (user_mode(regs))
+ domain = DACR_UACCESS_ENABLE;
+ else
+ domain = *(unsigned int *)(regs + 1);
+#endif
+
+ if ((domain & domain_mask(DOMAIN_USER)) ==
+ domain_val(DOMAIN_USER, DOMAIN_NOACCESS))
+ segment = "none";
+ else if (get_fs() == get_ds())
+ segment = "kernel";
+ else
+ segment = "user";
+
+ printk("Flags: %s IRQs o%s FIQs o%s Mode %s ISA %s Segment %s\n",
+ buf, interrupts_enabled(regs) ? "n" : "ff",
+ fast_interrupts_enabled(regs) ? "n" : "ff",
+ processor_modes[processor_mode(regs)],
+ isa_modes[isa_mode(regs)], segment);
+ }
#else
printk("xPSR: %08lx\n", regs->ARM_cpsr);
#endif
@@ -293,10 +444,9 @@ void __show_regs(struct pt_regs *regs)
buf[0] = '\0';
#ifdef CONFIG_CPU_CP15_MMU
{
- unsigned int transbase, dac;
+ unsigned int transbase, dac = get_domain();
asm("mrc p15, 0, %0, c2, c0\n\t"
- "mrc p15, 0, %1, c3, c0\n"
- : "=r" (transbase), "=r" (dac));
+ : "=r" (transbase));
snprintf(buf, sizeof(buf), " Table: %08x DAC: %08x",
transbase, dac);
}
@@ -306,6 +456,8 @@ void __show_regs(struct pt_regs *regs)
printk("Control: %08x%s\n", ctrl, buf);
}
#endif
+
+ show_extra_register_data(regs, 128);
}
void show_regs(struct pt_regs * regs)
@@ -357,6 +509,16 @@ copy_thread(unsigned long clone_flags, unsigned long stack_start,
memset(&thread->cpu_context, 0, sizeof(struct cpu_context_save));
+#ifdef CONFIG_CPU_USE_DOMAINS
+ /*
+ * Copy the initial value of the domain access control register
+ * from the current thread: thread->addr_limit will have been
+ * copied from the current thread via setup_thread_stack() in
+ * kernel/fork.c
+ */
+ thread->cpu_domain = get_domain();
+#endif
+
if (likely(!(p->flags & PF_KTHREAD))) {
*childregs = *current_pt_regs();
childregs->ARM_r0 = 0;
@@ -481,7 +643,7 @@ const char *arch_vma_name(struct vm_area_struct *vma)
}
/* If possible, provide a placement hint at a random offset from the
- * stack for the signal page.
+ * stack for the sigpage and vdso pages.
*/
static unsigned long sigpage_addr(const struct mm_struct *mm,
unsigned int npages)
@@ -525,6 +687,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
+ unsigned long npages;
unsigned long addr;
unsigned long hint;
int ret = 0;
@@ -534,9 +697,12 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
if (!signal_page)
return -ENOMEM;
+ npages = 1; /* for sigpage */
+ npages += vdso_total_pages;
+
down_write(&mm->mmap_sem);
- hint = sigpage_addr(mm, 1);
- addr = get_unmapped_area(NULL, hint, PAGE_SIZE, 0, 0);
+ hint = sigpage_addr(mm, npages);
+ addr = get_unmapped_area(NULL, hint, npages << PAGE_SHIFT, 0, 0);
if (IS_ERR_VALUE(addr)) {
ret = addr;
goto up_fail;
@@ -553,6 +719,12 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
mm->context.sigpage = addr;
+ /* Unlike the sigpage, failure to install the vdso is unlikely
+ * to be fatal to the process, so no error check needed
+ * here.
+ */
+ arm_install_vdso(mm, addr + PAGE_SIZE);
+
up_fail:
up_write(&mm->mmap_sem);
return ret;
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 306e1ac2c8e3..1ca80a3d30e4 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -748,7 +748,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc)
struct resource *res;
kernel_code.start = virt_to_phys(_text);
- kernel_code.end = virt_to_phys(_etext - 1);
+ kernel_code.end = virt_to_phys(__init_begin - 1);
kernel_data.start = virt_to_phys(_sdata);
kernel_data.end = virt_to_phys(_end - 1);
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index ea6d69125dde..81d104cd58b7 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -191,7 +191,7 @@ asmlinkage int sys_sigreturn(struct pt_regs *regs)
struct sigframe __user *frame;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
/*
* Since we stacked the signal on a 64-bit boundary,
@@ -221,7 +221,7 @@ asmlinkage int sys_rt_sigreturn(struct pt_regs *regs)
struct rt_sigframe __user *frame;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
/*
* Since we stacked the signal on a 64-bit boundary,
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index a8e32aaf0383..d6e6b5bfd100 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -72,6 +72,7 @@ enum ipi_msg_type {
IPI_CPU_STOP,
IPI_IRQ_WORK,
IPI_COMPLETION,
+ IPI_CPU_BACKTRACE,
};
static DECLARE_COMPLETION(cpu_running);
@@ -86,9 +87,11 @@ void __init smp_set_ops(struct smp_operations *ops)
static unsigned long get_arch_pgd(pgd_t *pgd)
{
- phys_addr_t pgdir = virt_to_idmap(pgd);
- BUG_ON(pgdir & ARCH_PGD_MASK);
- return pgdir >> ARCH_PGD_SHIFT;
+#ifdef CONFIG_ARM_LPAE
+ return __phys_to_pfn(virt_to_phys(pgd));
+#else
+ return virt_to_phys(pgd);
+#endif
}
int __cpu_up(unsigned int cpu, struct task_struct *idle)
@@ -108,7 +111,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle)
#endif
#ifdef CONFIG_MMU
- secondary_data.pgdir = get_arch_pgd(idmap_pgd);
+ secondary_data.pgdir = virt_to_phys(idmap_pgd);
secondary_data.swapper_pg_dir = get_arch_pgd(swapper_pg_dir);
#endif
sync_cache_w(&secondary_data);
@@ -456,6 +459,7 @@ static const char *ipi_types[NR_IPI] __tracepoint_string = {
S(IPI_CPU_STOP, "CPU stop interrupts"),
S(IPI_IRQ_WORK, "IRQ work interrupts"),
S(IPI_COMPLETION, "completion interrupts"),
+ S(IPI_CPU_BACKTRACE, "CPU backtrace"),
};
static void smp_cross_call(const struct cpumask *target, unsigned int ipinr)
@@ -557,6 +561,58 @@ static void ipi_complete(unsigned int cpu)
complete(per_cpu(cpu_completion, cpu));
}
+static cpumask_t backtrace_mask;
+static DEFINE_RAW_SPINLOCK(backtrace_lock);
+
+/* "in progress" flag of arch_trigger_all_cpu_backtrace */
+static unsigned long backtrace_flag;
+
+void smp_send_all_cpu_backtrace(void)
+{
+ unsigned int this_cpu = smp_processor_id();
+ int i;
+
+ if (test_and_set_bit(0, &backtrace_flag))
+ /*
+ * If there is already a trigger_all_cpu_backtrace() in progress
+ * (backtrace_flag == 1), don't output double cpu dump infos.
+ */
+ return;
+
+ cpumask_copy(&backtrace_mask, cpu_online_mask);
+ cpu_clear(this_cpu, backtrace_mask);
+
+ pr_info("Backtrace for cpu %d (current):\n", this_cpu);
+ dump_stack();
+
+ pr_info("\nsending IPI to all other CPUs:\n");
+ smp_cross_call(&backtrace_mask, IPI_CPU_BACKTRACE);
+
+ /* Wait for up to 10 seconds for all other CPUs to do the backtrace */
+ for (i = 0; i < 10 * 1000; i++) {
+ if (cpumask_empty(&backtrace_mask))
+ break;
+ mdelay(1);
+ }
+
+ clear_bit(0, &backtrace_flag);
+ smp_mb__after_atomic();
+}
+
+/*
+ * ipi_cpu_backtrace - handle IPI from smp_send_all_cpu_backtrace()
+ */
+static void ipi_cpu_backtrace(unsigned int cpu, struct pt_regs *regs)
+{
+ if (cpu_isset(cpu, backtrace_mask)) {
+ raw_spin_lock(&backtrace_lock);
+ pr_warning("IPI backtrace for cpu %d\n", cpu);
+ show_regs(regs);
+ raw_spin_unlock(&backtrace_lock);
+ cpu_clear(cpu, backtrace_mask);
+ }
+}
+
/*
* Main handler for inter-processor interrupts
*/
@@ -623,6 +679,10 @@ void handle_IPI(int ipinr, struct pt_regs *regs)
irq_exit();
break;
+ case IPI_CPU_BACKTRACE:
+ ipi_cpu_backtrace(cpu, regs);
+ break;
+
default:
printk(KERN_CRIT "CPU%u: Unknown IPI message 0x%x\n",
cpu, ipinr);
diff --git a/arch/arm/kernel/swp_emulate.c b/arch/arm/kernel/swp_emulate.c
index 587fdfe1a72c..265b4da0597e 100644
--- a/arch/arm/kernel/swp_emulate.c
+++ b/arch/arm/kernel/swp_emulate.c
@@ -141,11 +141,14 @@ static int emulate_swpX(unsigned int address, unsigned int *data,
while (1) {
unsigned long temp;
+ unsigned int __ua_flags;
+ __ua_flags = uaccess_save_and_enable();
if (type == TYPE_SWPB)
__user_swpb_asm(*data, address, res, temp);
else
__user_swp_asm(*data, address, res, temp);
+ uaccess_restore(__ua_flags);
if (likely(res != -EAGAIN) || signal_pending(current))
break;
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 89cfdd6e50cb..7ee74a026582 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -42,9 +42,15 @@
*/
static DEFINE_PER_CPU(unsigned long, cpu_scale);
-unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
+unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu)
{
+#ifdef CONFIG_CPU_FREQ
+ unsigned long max_freq_scale = cpufreq_scale_max_freq_capacity(cpu);
+
+ return per_cpu(cpu_scale, cpu) * max_freq_scale >> SCHED_CAPACITY_SHIFT;
+#else
return per_cpu(cpu_scale, cpu);
+#endif
}
static void set_capacity_scale(unsigned int cpu, unsigned long capacity)
@@ -153,6 +159,8 @@ static void __init parse_dt_topology(void)
}
+static const struct sched_group_energy * const cpu_core_energy(int cpu);
+
/*
* Look for a customed capacity of a CPU in the cpu_capacity table during the
* boot. The update of all CPUs is in O(n^2) for heteregeneous system but the
@@ -160,10 +168,14 @@ static void __init parse_dt_topology(void)
*/
static void update_cpu_capacity(unsigned int cpu)
{
- if (!cpu_capacity(cpu))
- return;
+ unsigned long capacity = SCHED_CAPACITY_SCALE;
+
+ if (cpu_core_energy(cpu)) {
+ int max_cap_idx = cpu_core_energy(cpu)->nr_cap_states - 1;
+ capacity = cpu_core_energy(cpu)->cap_states[max_cap_idx].cap;
+ }
- set_capacity_scale(cpu, cpu_capacity(cpu) / middle_capacity);
+ set_capacity_scale(cpu, capacity);
printk(KERN_INFO "CPU%u: update cpu_capacity %lu\n",
cpu, arch_scale_cpu_capacity(NULL, cpu));
@@ -275,17 +287,138 @@ void store_cpu_topology(unsigned int cpuid)
cpu_topology[cpuid].socket_id, mpidr);
}
+/*
+ * ARM TC2 specific energy cost model data. There are no unit requirements for
+ * the data. Data can be normalized to any reference point, but the
+ * normalization must be consistent. That is, one bogo-joule/watt must be the
+ * same quantity for all data, but we don't care what it is.
+ */
+static struct idle_state idle_states_cluster_a7[] = {
+ { .power = 25 }, /* arch_cpu_idle() (active idle) = WFI */
+ { .power = 25 }, /* WFI */
+ { .power = 10 }, /* cluster-sleep-l */
+ };
+
+static struct idle_state idle_states_cluster_a15[] = {
+ { .power = 70 }, /* arch_cpu_idle() (active idle) = WFI */
+ { .power = 70 }, /* WFI */
+ { .power = 25 }, /* cluster-sleep-b */
+ };
+
+static struct capacity_state cap_states_cluster_a7[] = {
+ /* Cluster only power */
+ { .cap = 150, .power = 2967, }, /* 350 MHz */
+ { .cap = 172, .power = 2792, }, /* 400 MHz */
+ { .cap = 215, .power = 2810, }, /* 500 MHz */
+ { .cap = 258, .power = 2815, }, /* 600 MHz */
+ { .cap = 301, .power = 2919, }, /* 700 MHz */
+ { .cap = 344, .power = 2847, }, /* 800 MHz */
+ { .cap = 387, .power = 3917, }, /* 900 MHz */
+ { .cap = 430, .power = 4905, }, /* 1000 MHz */
+ };
+
+static struct capacity_state cap_states_cluster_a15[] = {
+ /* Cluster only power */
+ { .cap = 426, .power = 7920, }, /* 500 MHz */
+ { .cap = 512, .power = 8165, }, /* 600 MHz */
+ { .cap = 597, .power = 8172, }, /* 700 MHz */
+ { .cap = 682, .power = 8195, }, /* 800 MHz */
+ { .cap = 768, .power = 8265, }, /* 900 MHz */
+ { .cap = 853, .power = 8446, }, /* 1000 MHz */
+ { .cap = 938, .power = 11426, }, /* 1100 MHz */
+ { .cap = 1024, .power = 15200, }, /* 1200 MHz */
+ };
+
+static struct sched_group_energy energy_cluster_a7 = {
+ .nr_idle_states = ARRAY_SIZE(idle_states_cluster_a7),
+ .idle_states = idle_states_cluster_a7,
+ .nr_cap_states = ARRAY_SIZE(cap_states_cluster_a7),
+ .cap_states = cap_states_cluster_a7,
+};
+
+static struct sched_group_energy energy_cluster_a15 = {
+ .nr_idle_states = ARRAY_SIZE(idle_states_cluster_a15),
+ .idle_states = idle_states_cluster_a15,
+ .nr_cap_states = ARRAY_SIZE(cap_states_cluster_a15),
+ .cap_states = cap_states_cluster_a15,
+};
+
+static struct idle_state idle_states_core_a7[] = {
+ { .power = 0 }, /* arch_cpu_idle (active idle) = WFI */
+ { .power = 0 }, /* WFI */
+ { .power = 0 }, /* cluster-sleep-l */
+ };
+
+static struct idle_state idle_states_core_a15[] = {
+ { .power = 0 }, /* arch_cpu_idle (active idle) = WFI */
+ { .power = 0 }, /* WFI */
+ { .power = 0 }, /* cluster-sleep-b */
+ };
+
+static struct capacity_state cap_states_core_a7[] = {
+ /* Power per cpu */
+ { .cap = 150, .power = 187, }, /* 350 MHz */
+ { .cap = 172, .power = 275, }, /* 400 MHz */
+ { .cap = 215, .power = 334, }, /* 500 MHz */
+ { .cap = 258, .power = 407, }, /* 600 MHz */
+ { .cap = 301, .power = 447, }, /* 700 MHz */
+ { .cap = 344, .power = 549, }, /* 800 MHz */
+ { .cap = 387, .power = 761, }, /* 900 MHz */
+ { .cap = 430, .power = 1024, }, /* 1000 MHz */
+ };
+
+static struct capacity_state cap_states_core_a15[] = {
+ /* Power per cpu */
+ { .cap = 426, .power = 2021, }, /* 500 MHz */
+ { .cap = 512, .power = 2312, }, /* 600 MHz */
+ { .cap = 597, .power = 2756, }, /* 700 MHz */
+ { .cap = 682, .power = 3125, }, /* 800 MHz */
+ { .cap = 768, .power = 3524, }, /* 900 MHz */
+ { .cap = 853, .power = 3846, }, /* 1000 MHz */
+ { .cap = 938, .power = 5177, }, /* 1100 MHz */
+ { .cap = 1024, .power = 6997, }, /* 1200 MHz */
+ };
+
+static struct sched_group_energy energy_core_a7 = {
+ .nr_idle_states = ARRAY_SIZE(idle_states_core_a7),
+ .idle_states = idle_states_core_a7,
+ .nr_cap_states = ARRAY_SIZE(cap_states_core_a7),
+ .cap_states = cap_states_core_a7,
+};
+
+static struct sched_group_energy energy_core_a15 = {
+ .nr_idle_states = ARRAY_SIZE(idle_states_core_a15),
+ .idle_states = idle_states_core_a15,
+ .nr_cap_states = ARRAY_SIZE(cap_states_core_a15),
+ .cap_states = cap_states_core_a15,
+};
+
+/* sd energy functions */
+static inline
+const struct sched_group_energy * const cpu_cluster_energy(int cpu)
+{
+ return cpu_topology[cpu].socket_id ? &energy_cluster_a7 :
+ &energy_cluster_a15;
+}
+
+static inline
+const struct sched_group_energy * const cpu_core_energy(int cpu)
+{
+ return cpu_topology[cpu].socket_id ? &energy_core_a7 :
+ &energy_core_a15;
+}
+
static inline int cpu_corepower_flags(void)
{
- return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN;
+ return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN | \
+ SD_SHARE_CAP_STATES;
}
static struct sched_domain_topology_level arm_topology[] = {
#ifdef CONFIG_SCHED_MC
- { cpu_corepower_mask, cpu_corepower_flags, SD_INIT_NAME(GMC) },
- { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+ { cpu_coregroup_mask, cpu_corepower_flags, cpu_core_energy, SD_INIT_NAME(MC) },
#endif
- { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+ { cpu_cpu_mask, NULL, cpu_cluster_energy, SD_INIT_NAME(DIE) },
{ NULL, },
};
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index 9f5d81881eb6..73a5cdd54777 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c
@@ -880,7 +880,6 @@ void __init early_trap_init(void *vectors_base)
kuser_init(vectors_base);
flush_icache_range(vectors, vectors + PAGE_SIZE * 2);
- modify_domain(DOMAIN_USER, DOMAIN_CLIENT);
#else /* ifndef CONFIG_CPU_V7M */
/*
* on V7-M there is no need to copy the vector table to a dedicated
diff --git a/arch/arm/kernel/vdso.c b/arch/arm/kernel/vdso.c
new file mode 100644
index 000000000000..bbbffe946122
--- /dev/null
+++ b/arch/arm/kernel/vdso.c
@@ -0,0 +1,337 @@
+/*
+ * Adapted from arm64 version.
+ *
+ * Copyright (C) 2012 ARM Limited
+ * Copyright (C) 2015 Mentor Graphics Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/cache.h>
+#include <linux/elf.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/of.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/timekeeper_internal.h>
+#include <linux/vmalloc.h>
+#include <asm/arch_timer.h>
+#include <asm/barrier.h>
+#include <asm/cacheflush.h>
+#include <asm/page.h>
+#include <asm/vdso.h>
+#include <asm/vdso_datapage.h>
+#include <clocksource/arm_arch_timer.h>
+
+#define MAX_SYMNAME 64
+
+static struct page **vdso_text_pagelist;
+
+/* Total number of pages needed for the data and text portions of the VDSO. */
+unsigned int vdso_total_pages __ro_after_init;
+
+/*
+ * The VDSO data page.
+ */
+static union vdso_data_store vdso_data_store __page_aligned_data;
+static struct vdso_data *vdso_data = &vdso_data_store.data;
+
+static struct page *vdso_data_page __ro_after_init;
+static const struct vm_special_mapping vdso_data_mapping = {
+ .name = "[vvar]",
+ .pages = &vdso_data_page,
+};
+
+static struct vm_special_mapping vdso_text_mapping __ro_after_init = {
+ .name = "[vdso]",
+};
+
+struct elfinfo {
+ Elf32_Ehdr *hdr; /* ptr to ELF */
+ Elf32_Sym *dynsym; /* ptr to .dynsym section */
+ unsigned long dynsymsize; /* size of .dynsym section */
+ char *dynstr; /* ptr to .dynstr section */
+};
+
+/* Cached result of boot-time check for whether the arch timer exists,
+ * and if so, whether the virtual counter is useable.
+ */
+static bool cntvct_ok __ro_after_init;
+
+static bool __init cntvct_functional(void)
+{
+ struct device_node *np;
+ bool ret = false;
+
+ if (!IS_ENABLED(CONFIG_ARM_ARCH_TIMER))
+ goto out;
+
+ /* The arm_arch_timer core should export
+ * arch_timer_use_virtual or similar so we don't have to do
+ * this.
+ */
+ np = of_find_compatible_node(NULL, NULL, "arm,armv7-timer");
+ if (!np)
+ goto out_put;
+
+ if (of_property_read_bool(np, "arm,cpu-registers-not-fw-configured"))
+ goto out_put;
+
+ ret = true;
+
+out_put:
+ of_node_put(np);
+out:
+ return ret;
+}
+
+static void * __init find_section(Elf32_Ehdr *ehdr, const char *name,
+ unsigned long *size)
+{
+ Elf32_Shdr *sechdrs;
+ unsigned int i;
+ char *secnames;
+
+ /* Grab section headers and strings so we can tell who is who */
+ sechdrs = (void *)ehdr + ehdr->e_shoff;
+ secnames = (void *)ehdr + sechdrs[ehdr->e_shstrndx].sh_offset;
+
+ /* Find the section they want */
+ for (i = 1; i < ehdr->e_shnum; i++) {
+ if (strcmp(secnames + sechdrs[i].sh_name, name) == 0) {
+ if (size)
+ *size = sechdrs[i].sh_size;
+ return (void *)ehdr + sechdrs[i].sh_offset;
+ }
+ }
+
+ if (size)
+ *size = 0;
+ return NULL;
+}
+
+static Elf32_Sym * __init find_symbol(struct elfinfo *lib, const char *symname)
+{
+ unsigned int i;
+
+ for (i = 0; i < (lib->dynsymsize / sizeof(Elf32_Sym)); i++) {
+ char name[MAX_SYMNAME], *c;
+
+ if (lib->dynsym[i].st_name == 0)
+ continue;
+ strlcpy(name, lib->dynstr + lib->dynsym[i].st_name,
+ MAX_SYMNAME);
+ c = strchr(name, '@');
+ if (c)
+ *c = 0;
+ if (strcmp(symname, name) == 0)
+ return &lib->dynsym[i];
+ }
+ return NULL;
+}
+
+static void __init vdso_nullpatch_one(struct elfinfo *lib, const char *symname)
+{
+ Elf32_Sym *sym;
+
+ sym = find_symbol(lib, symname);
+ if (!sym)
+ return;
+
+ sym->st_name = 0;
+}
+
+static void __init patch_vdso(void *ehdr)
+{
+ struct elfinfo einfo;
+
+ einfo = (struct elfinfo) {
+ .hdr = ehdr,
+ };
+
+ einfo.dynsym = find_section(einfo.hdr, ".dynsym", &einfo.dynsymsize);
+ einfo.dynstr = find_section(einfo.hdr, ".dynstr", NULL);
+
+ /* If the virtual counter is absent or non-functional we don't
+ * want programs to incur the slight additional overhead of
+ * dispatching through the VDSO only to fall back to syscalls.
+ */
+ if (!cntvct_ok) {
+ vdso_nullpatch_one(&einfo, "__vdso_gettimeofday");
+ vdso_nullpatch_one(&einfo, "__vdso_clock_gettime");
+ }
+}
+
+static int __init vdso_init(void)
+{
+ unsigned int text_pages;
+ int i;
+
+ if (memcmp(&vdso_start, "\177ELF", 4)) {
+ pr_err("VDSO is not a valid ELF object!\n");
+ return -ENOEXEC;
+ }
+
+ text_pages = (&vdso_end - &vdso_start) >> PAGE_SHIFT;
+ pr_debug("vdso: %i text pages at base %p\n", text_pages, &vdso_start);
+
+ /* Allocate the VDSO text pagelist */
+ vdso_text_pagelist = kcalloc(text_pages, sizeof(struct page *),
+ GFP_KERNEL);
+ if (vdso_text_pagelist == NULL)
+ return -ENOMEM;
+
+ /* Grab the VDSO data page. */
+ vdso_data_page = virt_to_page(vdso_data);
+
+ /* Grab the VDSO text pages. */
+ for (i = 0; i < text_pages; i++) {
+ struct page *page;
+
+ page = virt_to_page(&vdso_start + i * PAGE_SIZE);
+ vdso_text_pagelist[i] = page;
+ }
+
+ vdso_text_mapping.pages = vdso_text_pagelist;
+
+ vdso_total_pages = 1; /* for the data/vvar page */
+ vdso_total_pages += text_pages;
+
+ cntvct_ok = cntvct_functional();
+
+ patch_vdso(&vdso_start);
+
+ return 0;
+}
+arch_initcall(vdso_init);
+
+static int install_vvar(struct mm_struct *mm, unsigned long addr)
+{
+ struct vm_area_struct *vma;
+
+ vma = _install_special_mapping(mm, addr, PAGE_SIZE,
+ VM_READ | VM_MAYREAD,
+ &vdso_data_mapping);
+
+ return PTR_ERR_OR_ZERO(vma);
+}
+
+/* assumes mmap_sem is write-locked */
+void arm_install_vdso(struct mm_struct *mm, unsigned long addr)
+{
+ struct vm_area_struct *vma;
+ unsigned long len;
+
+ mm->context.vdso = 0;
+
+ if (vdso_text_pagelist == NULL)
+ return;
+
+ if (install_vvar(mm, addr))
+ return;
+
+ /* Account for vvar page. */
+ addr += PAGE_SIZE;
+ len = (vdso_total_pages - 1) << PAGE_SHIFT;
+
+ vma = _install_special_mapping(mm, addr, len,
+ VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC,
+ &vdso_text_mapping);
+
+ if (!IS_ERR(vma))
+ mm->context.vdso = addr;
+}
+
+static void vdso_write_begin(struct vdso_data *vdata)
+{
+ ++vdso_data->seq_count;
+ smp_wmb(); /* Pairs with smp_rmb in vdso_read_retry */
+}
+
+static void vdso_write_end(struct vdso_data *vdata)
+{
+ smp_wmb(); /* Pairs with smp_rmb in vdso_read_begin */
+ ++vdso_data->seq_count;
+}
+
+static bool tk_is_cntvct(const struct timekeeper *tk)
+{
+ if (!IS_ENABLED(CONFIG_ARM_ARCH_TIMER))
+ return false;
+
+ if (strcmp(tk->tkr_mono.clock->name, "arch_sys_counter") != 0)
+ return false;
+
+ return true;
+}
+
+/**
+ * update_vsyscall - update the vdso data page
+ *
+ * Increment the sequence counter, making it odd, indicating to
+ * userspace that an update is in progress. Update the fields used
+ * for coarse clocks and, if the architected system timer is in use,
+ * the fields used for high precision clocks. Increment the sequence
+ * counter again, making it even, indicating to userspace that the
+ * update is finished.
+ *
+ * Userspace is expected to sample seq_count before reading any other
+ * fields from the data page. If seq_count is odd, userspace is
+ * expected to wait until it becomes even. After copying data from
+ * the page, userspace must sample seq_count again; if it has changed
+ * from its previous value, userspace must retry the whole sequence.
+ *
+ * Calls to update_vsyscall are serialized by the timekeeping core.
+ */
+void update_vsyscall(struct timekeeper *tk)
+{
+ struct timespec64 *wtm = &tk->wall_to_monotonic;
+
+ if (!cntvct_ok) {
+ /* The entry points have been zeroed, so there is no
+ * point in updating the data page.
+ */
+ return;
+ }
+
+ vdso_write_begin(vdso_data);
+
+ vdso_data->tk_is_cntvct = tk_is_cntvct(tk);
+ vdso_data->xtime_coarse_sec = tk->xtime_sec;
+ vdso_data->xtime_coarse_nsec = (u32)(tk->tkr_mono.xtime_nsec >>
+ tk->tkr_mono.shift);
+ vdso_data->wtm_clock_sec = wtm->tv_sec;
+ vdso_data->wtm_clock_nsec = wtm->tv_nsec;
+
+ if (vdso_data->tk_is_cntvct) {
+ vdso_data->cs_cycle_last = tk->tkr_mono.cycle_last;
+ vdso_data->xtime_clock_sec = tk->xtime_sec;
+ vdso_data->xtime_clock_snsec = tk->tkr_mono.xtime_nsec;
+ vdso_data->cs_mult = tk->tkr_mono.mult;
+ vdso_data->cs_shift = tk->tkr_mono.shift;
+ vdso_data->cs_mask = tk->tkr_mono.mask;
+ }
+
+ vdso_write_end(vdso_data);
+
+ flush_dcache_page(virt_to_page(vdso_data));
+}
+
+void update_vsyscall_tz(void)
+{
+ vdso_data->tz_minuteswest = sys_tz.tz_minuteswest;
+ vdso_data->tz_dsttime = sys_tz.tz_dsttime;
+ flush_dcache_page(virt_to_page(vdso_data));
+}
diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S
index 8e95aa47457a..2af0005e2548 100644
--- a/arch/arm/kernel/vmlinux.lds.S
+++ b/arch/arm/kernel/vmlinux.lds.S
@@ -8,6 +8,9 @@
#include <asm/thread_info.h>
#include <asm/memory.h>
#include <asm/page.h>
+#ifdef CONFIG_ARM_KERNMEM_PERMS
+#include <asm/pgtable.h>
+#endif
#define PROC_INFO \
. = ALIGN(4); \
@@ -90,6 +93,11 @@ SECTIONS
_text = .;
HEAD_TEXT
}
+
+#ifdef CONFIG_ARM_KERNMEM_PERMS
+ . = ALIGN(1<<SECTION_SHIFT);
+#endif
+
.text : { /* Real text segment */
_stext = .; /* Text and read-only data */
__exception_text_start = .;
@@ -112,6 +120,11 @@ SECTIONS
ARM_CPU_KEEP(PROC_INFO)
}
+#ifdef CONFIG_DEBUG_RODATA
+ . = ALIGN(1<<SECTION_SHIFT);
+#endif
+ _etext = .; /* End of text section */
+
RO_DATA(PAGE_SIZE)
. = ALIGN(4);
@@ -142,10 +155,12 @@ SECTIONS
NOTES
- _etext = .; /* End of text and rodata section */
-
#ifndef CONFIG_XIP_KERNEL
+# ifdef CONFIG_ARM_KERNMEM_PERMS
+ . = ALIGN(1<<SECTION_SHIFT);
+# else
. = ALIGN(PAGE_SIZE);
+# endif
__init_begin = .;
#endif
/*
@@ -219,8 +234,12 @@ SECTIONS
__data_loc = ALIGN(4); /* location in binary */
. = PAGE_OFFSET + TEXT_OFFSET;
#else
- . = ALIGN(THREAD_SIZE);
__init_end = .;
+#ifdef CONFIG_ARM_KERNMEM_PERMS
+ . = ALIGN(1<<SECTION_SHIFT);
+#else
+ . = ALIGN(THREAD_SIZE);
+#endif
__data_loc = .;
#endif
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 6c3dc428a881..151813f4889a 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -279,15 +279,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
vcpu->cpu = cpu;
vcpu->arch.host_cpu_context = this_cpu_ptr(kvm_host_cpu_state);
- /*
- * Check whether this vcpu requires the cache to be flushed on
- * this physical CPU. This is a consequence of doing dcache
- * operations by set/way on this vcpu. We do it here to be in
- * a non-preemptible section.
- */
- if (cpumask_test_and_clear_cpu(cpu, &vcpu->arch.require_dcache_flush))
- flush_cache_all(); /* We'd really want v7_flush_dcache_all() */
-
kvm_arm_set_running_vcpu(vcpu);
}
@@ -539,7 +530,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
ret = kvm_call_hyp(__kvm_vcpu_run, vcpu);
vcpu->mode = OUTSIDE_GUEST_MODE;
- vcpu->arch.last_pcpu = smp_processor_id();
kvm_guest_exit();
trace_kvm_exit(*vcpu_pc(vcpu));
/*
diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c
index 7928dbdf2102..f3d88dc388bc 100644
--- a/arch/arm/kvm/coproc.c
+++ b/arch/arm/kvm/coproc.c
@@ -189,82 +189,40 @@ static bool access_l2ectlr(struct kvm_vcpu *vcpu,
return true;
}
-/* See note at ARM ARM B1.14.4 */
+/*
+ * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
+ */
static bool access_dcsw(struct kvm_vcpu *vcpu,
const struct coproc_params *p,
const struct coproc_reg *r)
{
- unsigned long val;
- int cpu;
-
if (!p->is_write)
return read_from_write_only(vcpu, p);
- cpu = get_cpu();
-
- cpumask_setall(&vcpu->arch.require_dcache_flush);
- cpumask_clear_cpu(cpu, &vcpu->arch.require_dcache_flush);
-
- /* If we were already preempted, take the long way around */
- if (cpu != vcpu->arch.last_pcpu) {
- flush_cache_all();
- goto done;
- }
-
- val = *vcpu_reg(vcpu, p->Rt1);
-
- switch (p->CRm) {
- case 6: /* Upgrade DCISW to DCCISW, as per HCR.SWIO */
- case 14: /* DCCISW */
- asm volatile("mcr p15, 0, %0, c7, c14, 2" : : "r" (val));
- break;
-
- case 10: /* DCCSW */
- asm volatile("mcr p15, 0, %0, c7, c10, 2" : : "r" (val));
- break;
- }
-
-done:
- put_cpu();
-
+ kvm_set_way_flush(vcpu);
return true;
}
/*
* Generic accessor for VM registers. Only called as long as HCR_TVM
- * is set.
+ * is set. If the guest enables the MMU, we stop trapping the VM
+ * sys_regs and leave it in complete control of the caches.
+ *
+ * Used by the cpu-specific code.
*/
-static bool access_vm_reg(struct kvm_vcpu *vcpu,
- const struct coproc_params *p,
- const struct coproc_reg *r)
+bool access_vm_reg(struct kvm_vcpu *vcpu,
+ const struct coproc_params *p,
+ const struct coproc_reg *r)
{
+ bool was_enabled = vcpu_has_cache_enabled(vcpu);
+
BUG_ON(!p->is_write);
vcpu->arch.cp15[r->reg] = *vcpu_reg(vcpu, p->Rt1);
if (p->is_64bit)
vcpu->arch.cp15[r->reg + 1] = *vcpu_reg(vcpu, p->Rt2);
- return true;
-}
-
-/*
- * SCTLR accessor. Only called as long as HCR_TVM is set. If the
- * guest enables the MMU, we stop trapping the VM sys_regs and leave
- * it in complete control of the caches.
- *
- * Used by the cpu-specific code.
- */
-bool access_sctlr(struct kvm_vcpu *vcpu,
- const struct coproc_params *p,
- const struct coproc_reg *r)
-{
- access_vm_reg(vcpu, p, r);
-
- if (vcpu_has_cache_enabled(vcpu)) { /* MMU+Caches enabled? */
- vcpu->arch.hcr &= ~HCR_TVM;
- stage2_flush_vm(vcpu->kvm);
- }
-
+ kvm_toggle_cache(vcpu, was_enabled);
return true;
}
diff --git a/arch/arm/kvm/coproc.h b/arch/arm/kvm/coproc.h
index 1a44bbe39643..88d24a3a9778 100644
--- a/arch/arm/kvm/coproc.h
+++ b/arch/arm/kvm/coproc.h
@@ -153,8 +153,8 @@ static inline int cmp_reg(const struct coproc_reg *i1,
#define is64 .is_64 = true
#define is32 .is_64 = false
-bool access_sctlr(struct kvm_vcpu *vcpu,
- const struct coproc_params *p,
- const struct coproc_reg *r);
+bool access_vm_reg(struct kvm_vcpu *vcpu,
+ const struct coproc_params *p,
+ const struct coproc_reg *r);
#endif /* __ARM_KVM_COPROC_LOCAL_H__ */
diff --git a/arch/arm/kvm/coproc_a15.c b/arch/arm/kvm/coproc_a15.c
index e6f4ae48bda9..a7136757d373 100644
--- a/arch/arm/kvm/coproc_a15.c
+++ b/arch/arm/kvm/coproc_a15.c
@@ -34,7 +34,7 @@
static const struct coproc_reg a15_regs[] = {
/* SCTLR: swapped by interrupt.S. */
{ CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32,
- access_sctlr, reset_val, c1_SCTLR, 0x00C50078 },
+ access_vm_reg, reset_val, c1_SCTLR, 0x00C50078 },
};
static struct kvm_coproc_target_table a15_target_table = {
diff --git a/arch/arm/kvm/coproc_a7.c b/arch/arm/kvm/coproc_a7.c
index 17fc7cd479d3..b19e46d1b2c0 100644
--- a/arch/arm/kvm/coproc_a7.c
+++ b/arch/arm/kvm/coproc_a7.c
@@ -37,7 +37,7 @@
static const struct coproc_reg a7_regs[] = {
/* SCTLR: swapped by interrupt.S. */
{ CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32,
- access_sctlr, reset_val, c1_SCTLR, 0x00C50878 },
+ access_vm_reg, reset_val, c1_SCTLR, 0x00C50878 },
};
static struct kvm_coproc_target_table a7_target_table = {
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 3ffbce7277f7..70ccd1ea5ba1 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -338,7 +338,7 @@ static void stage2_flush_memslot(struct kvm *kvm,
* Go through the stage 2 page tables and invalidate any cache lines
* backing memory already mapped to the VM.
*/
-void stage2_flush_vm(struct kvm *kvm)
+static void stage2_flush_vm(struct kvm *kvm)
{
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
@@ -1521,3 +1521,71 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
unmap_stage2_range(kvm, gpa, size);
spin_unlock(&kvm->mmu_lock);
}
+
+/*
+ * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
+ *
+ * Main problems:
+ * - S/W ops are local to a CPU (not broadcast)
+ * - We have line migration behind our back (speculation)
+ * - System caches don't support S/W at all (damn!)
+ *
+ * In the face of the above, the best we can do is to try and convert
+ * S/W ops to VA ops. Because the guest is not allowed to infer the
+ * S/W to PA mapping, it can only use S/W to nuke the whole cache,
+ * which is a rather good thing for us.
+ *
+ * Also, it is only used when turning caches on/off ("The expected
+ * usage of the cache maintenance instructions that operate by set/way
+ * is associated with the cache maintenance instructions associated
+ * with the powerdown and powerup of caches, if this is required by
+ * the implementation.").
+ *
+ * We use the following policy:
+ *
+ * - If we trap a S/W operation, we enable VM trapping to detect
+ * caches being turned on/off, and do a full clean.
+ *
+ * - We flush the caches on both caches being turned on and off.
+ *
+ * - Once the caches are enabled, we stop trapping VM ops.
+ */
+void kvm_set_way_flush(struct kvm_vcpu *vcpu)
+{
+ unsigned long hcr = vcpu_get_hcr(vcpu);
+
+ /*
+ * If this is the first time we do a S/W operation
+ * (i.e. HCR_TVM not set) flush the whole memory, and set the
+ * VM trapping.
+ *
+ * Otherwise, rely on the VM trapping to wait for the MMU +
+ * Caches to be turned off. At that point, we'll be able to
+ * clean the caches again.
+ */
+ if (!(hcr & HCR_TVM)) {
+ trace_kvm_set_way_flush(*vcpu_pc(vcpu),
+ vcpu_has_cache_enabled(vcpu));
+ stage2_flush_vm(vcpu->kvm);
+ vcpu_set_hcr(vcpu, hcr | HCR_TVM);
+ }
+}
+
+void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
+{
+ bool now_enabled = vcpu_has_cache_enabled(vcpu);
+
+ /*
+ * If switching the MMU+caches on, need to invalidate the caches.
+ * If switching it off, need to clean the caches.
+ * Clean + invalidate does the trick always.
+ */
+ if (now_enabled != was_enabled)
+ stage2_flush_vm(vcpu->kvm);
+
+ /* Caches are now on, stop trapping VM ops (until a S/W op) */
+ if (now_enabled)
+ vcpu_set_hcr(vcpu, vcpu_get_hcr(vcpu) & ~HCR_TVM);
+
+ trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
+}
diff --git a/arch/arm/kvm/trace.h b/arch/arm/kvm/trace.h
index b1d640f78623..b6a6e7102201 100644
--- a/arch/arm/kvm/trace.h
+++ b/arch/arm/kvm/trace.h
@@ -223,6 +223,45 @@ TRACE_EVENT(kvm_hvc,
__entry->vcpu_pc, __entry->r0, __entry->imm)
);
+TRACE_EVENT(kvm_set_way_flush,
+ TP_PROTO(unsigned long vcpu_pc, bool cache),
+ TP_ARGS(vcpu_pc, cache),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, vcpu_pc )
+ __field( bool, cache )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_pc = vcpu_pc;
+ __entry->cache = cache;
+ ),
+
+ TP_printk("S/W flush at 0x%016lx (cache %s)",
+ __entry->vcpu_pc, __entry->cache ? "on" : "off")
+);
+
+TRACE_EVENT(kvm_toggle_cache,
+ TP_PROTO(unsigned long vcpu_pc, bool was, bool now),
+ TP_ARGS(vcpu_pc, was, now),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, vcpu_pc )
+ __field( bool, was )
+ __field( bool, now )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_pc = vcpu_pc;
+ __entry->was = was;
+ __entry->now = now;
+ ),
+
+ TP_printk("VM op at 0x%016lx (cache was %s, now %s)",
+ __entry->vcpu_pc, __entry->was ? "on" : "off",
+ __entry->now ? "on" : "off")
+);
+
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
index 0573faab96ad..d8a780799506 100644
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -15,19 +15,8 @@ lib-y := backtrace.o changebit.o csumipv6.o csumpartial.o \
io-readsb.o io-writesb.o io-readsl.o io-writesl.o \
call_with_stack.o bswapsdi2.o
-mmu-y := clear_user.o copy_page.o getuser.o putuser.o
-
-# the code in uaccess.S is not preemption safe and
-# probably faster on ARMv3 only
-ifeq ($(CONFIG_PREEMPT),y)
- mmu-y += copy_from_user.o copy_to_user.o
-else
-ifneq ($(CONFIG_CPU_32v3),y)
- mmu-y += copy_from_user.o copy_to_user.o
-else
- mmu-y += uaccess.o
-endif
-endif
+mmu-y := clear_user.o copy_page.o getuser.o putuser.o \
+ copy_from_user.o copy_to_user.o
# using lib_ here won't override already available weak symbols
obj-$(CONFIG_UACCESS_WITH_MEMCPY) += uaccess_with_memcpy.o
diff --git a/arch/arm/lib/clear_user.S b/arch/arm/lib/clear_user.S
index 14a0d988c82c..a6d7bb973bf1 100644
--- a/arch/arm/lib/clear_user.S
+++ b/arch/arm/lib/clear_user.S
@@ -12,14 +12,14 @@
.text
-/* Prototype: int __clear_user(void *addr, size_t sz)
+/* Prototype: unsigned long arm_clear_user(void *addr, size_t sz)
* Purpose : clear some user memory
* Params : addr - user memory address to clear
* : sz - number of bytes to clear
* Returns : number of bytes NOT cleared
*/
ENTRY(__clear_user_std)
-WEAK(__clear_user)
+WEAK(arm_clear_user)
stmfd sp!, {r1, lr}
mov r2, #0
cmp r1, #4
@@ -44,7 +44,7 @@ WEAK(__clear_user)
USER( strnebt r2, [r0])
mov r0, #0
ldmfd sp!, {r1, pc}
-ENDPROC(__clear_user)
+ENDPROC(arm_clear_user)
ENDPROC(__clear_user_std)
.pushsection .fixup,"ax"
diff --git a/arch/arm/lib/copy_from_user.S b/arch/arm/lib/copy_from_user.S
index 66a477a3e3cc..70bf0e90da4d 100644
--- a/arch/arm/lib/copy_from_user.S
+++ b/arch/arm/lib/copy_from_user.S
@@ -16,7 +16,7 @@
/*
* Prototype:
*
- * size_t __copy_from_user(void *to, const void *from, size_t n)
+ * size_t arm_copy_from_user(void *to, const void *from, size_t n)
*
* Purpose:
*
@@ -84,11 +84,11 @@
.text
-ENTRY(__copy_from_user)
+ENTRY(arm_copy_from_user)
#include "copy_template.S"
-ENDPROC(__copy_from_user)
+ENDPROC(arm_copy_from_user)
.pushsection .fixup,"ax"
.align 0
diff --git a/arch/arm/lib/copy_to_user.S b/arch/arm/lib/copy_to_user.S
index d066df686e17..a610ffaa165d 100644
--- a/arch/arm/lib/copy_to_user.S
+++ b/arch/arm/lib/copy_to_user.S
@@ -16,7 +16,7 @@
/*
* Prototype:
*
- * size_t __copy_to_user(void *to, const void *from, size_t n)
+ * size_t arm_copy_to_user(void *to, const void *from, size_t n)
*
* Purpose:
*
@@ -88,11 +88,11 @@
.text
ENTRY(__copy_to_user_std)
-WEAK(__copy_to_user)
+WEAK(arm_copy_to_user)
#include "copy_template.S"
-ENDPROC(__copy_to_user)
+ENDPROC(arm_copy_to_user)
ENDPROC(__copy_to_user_std)
.pushsection .fixup,"ax"
diff --git a/arch/arm/lib/csumpartialcopyuser.S b/arch/arm/lib/csumpartialcopyuser.S
index 7d08b43d2c0e..2c8e3b6dead0 100644
--- a/arch/arm/lib/csumpartialcopyuser.S
+++ b/arch/arm/lib/csumpartialcopyuser.S
@@ -17,6 +17,19 @@
.text
+#ifdef CONFIG_CPU_SW_DOMAIN_PAN
+ .macro save_regs
+ mrc p15, 0, ip, c3, c0, 0
+ stmfd sp!, {r1, r2, r4 - r8, ip, lr}
+ uaccess_enable ip
+ .endm
+
+ .macro load_regs
+ ldmfd sp!, {r1, r2, r4 - r8, ip, lr}
+ mcr p15, 0, ip, c3, c0, 0
+ ret lr
+ .endm
+#else
.macro save_regs
stmfd sp!, {r1, r2, r4 - r8, lr}
.endm
@@ -24,6 +37,7 @@
.macro load_regs
ldmfd sp!, {r1, r2, r4 - r8, pc}
.endm
+#endif
.macro load1b, reg1
ldrusr \reg1, r0, 1
diff --git a/arch/arm/lib/uaccess.S b/arch/arm/lib/uaccess.S
deleted file mode 100644
index e50520904b76..000000000000
--- a/arch/arm/lib/uaccess.S
+++ /dev/null
@@ -1,564 +0,0 @@
-/*
- * linux/arch/arm/lib/uaccess.S
- *
- * Copyright (C) 1995, 1996,1997,1998 Russell King
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Routines to block copy data to/from user memory
- * These are highly optimised both for the 4k page size
- * and for various alignments.
- */
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-#include <asm/errno.h>
-#include <asm/domain.h>
-
- .text
-
-#define PAGE_SHIFT 12
-
-/* Prototype: int __copy_to_user(void *to, const char *from, size_t n)
- * Purpose : copy a block to user memory from kernel memory
- * Params : to - user memory
- * : from - kernel memory
- * : n - number of bytes to copy
- * Returns : Number of bytes NOT copied.
- */
-
-.Lc2u_dest_not_aligned:
- rsb ip, ip, #4
- cmp ip, #2
- ldrb r3, [r1], #1
-USER( TUSER( strb) r3, [r0], #1) @ May fault
- ldrgeb r3, [r1], #1
-USER( TUSER( strgeb) r3, [r0], #1) @ May fault
- ldrgtb r3, [r1], #1
-USER( TUSER( strgtb) r3, [r0], #1) @ May fault
- sub r2, r2, ip
- b .Lc2u_dest_aligned
-
-ENTRY(__copy_to_user)
- stmfd sp!, {r2, r4 - r7, lr}
- cmp r2, #4
- blt .Lc2u_not_enough
- ands ip, r0, #3
- bne .Lc2u_dest_not_aligned
-.Lc2u_dest_aligned:
-
- ands ip, r1, #3
- bne .Lc2u_src_not_aligned
-/*
- * Seeing as there has to be at least 8 bytes to copy, we can
- * copy one word, and force a user-mode page fault...
- */
-
-.Lc2u_0fupi: subs r2, r2, #4
- addmi ip, r2, #4
- bmi .Lc2u_0nowords
- ldr r3, [r1], #4
-USER( TUSER( str) r3, [r0], #4) @ May fault
- mov ip, r0, lsl #32 - PAGE_SHIFT @ On each page, use a ld/st??t instruction
- rsb ip, ip, #0
- movs ip, ip, lsr #32 - PAGE_SHIFT
- beq .Lc2u_0fupi
-/*
- * ip = max no. of bytes to copy before needing another "strt" insn
- */
- cmp r2, ip
- movlt ip, r2
- sub r2, r2, ip
- subs ip, ip, #32
- blt .Lc2u_0rem8lp
-
-.Lc2u_0cpy8lp: ldmia r1!, {r3 - r6}
- stmia r0!, {r3 - r6} @ Shouldnt fault
- ldmia r1!, {r3 - r6}
- subs ip, ip, #32
- stmia r0!, {r3 - r6} @ Shouldnt fault
- bpl .Lc2u_0cpy8lp
-
-.Lc2u_0rem8lp: cmn ip, #16
- ldmgeia r1!, {r3 - r6}
- stmgeia r0!, {r3 - r6} @ Shouldnt fault
- tst ip, #8
- ldmneia r1!, {r3 - r4}
- stmneia r0!, {r3 - r4} @ Shouldnt fault
- tst ip, #4
- ldrne r3, [r1], #4
- TUSER( strne) r3, [r0], #4 @ Shouldnt fault
- ands ip, ip, #3
- beq .Lc2u_0fupi
-.Lc2u_0nowords: teq ip, #0
- beq .Lc2u_finished
-.Lc2u_nowords: cmp ip, #2
- ldrb r3, [r1], #1
-USER( TUSER( strb) r3, [r0], #1) @ May fault
- ldrgeb r3, [r1], #1
-USER( TUSER( strgeb) r3, [r0], #1) @ May fault
- ldrgtb r3, [r1], #1
-USER( TUSER( strgtb) r3, [r0], #1) @ May fault
- b .Lc2u_finished
-
-.Lc2u_not_enough:
- movs ip, r2
- bne .Lc2u_nowords
-.Lc2u_finished: mov r0, #0
- ldmfd sp!, {r2, r4 - r7, pc}
-
-.Lc2u_src_not_aligned:
- bic r1, r1, #3
- ldr r7, [r1], #4
- cmp ip, #2
- bgt .Lc2u_3fupi
- beq .Lc2u_2fupi
-.Lc2u_1fupi: subs r2, r2, #4
- addmi ip, r2, #4
- bmi .Lc2u_1nowords
- mov r3, r7, lspull #8
- ldr r7, [r1], #4
- orr r3, r3, r7, lspush #24
-USER( TUSER( str) r3, [r0], #4) @ May fault
- mov ip, r0, lsl #32 - PAGE_SHIFT
- rsb ip, ip, #0
- movs ip, ip, lsr #32 - PAGE_SHIFT
- beq .Lc2u_1fupi
- cmp r2, ip
- movlt ip, r2
- sub r2, r2, ip
- subs ip, ip, #16
- blt .Lc2u_1rem8lp
-
-.Lc2u_1cpy8lp: mov r3, r7, lspull #8
- ldmia r1!, {r4 - r7}
- subs ip, ip, #16
- orr r3, r3, r4, lspush #24
- mov r4, r4, lspull #8
- orr r4, r4, r5, lspush #24
- mov r5, r5, lspull #8
- orr r5, r5, r6, lspush #24
- mov r6, r6, lspull #8
- orr r6, r6, r7, lspush #24
- stmia r0!, {r3 - r6} @ Shouldnt fault
- bpl .Lc2u_1cpy8lp
-
-.Lc2u_1rem8lp: tst ip, #8
- movne r3, r7, lspull #8
- ldmneia r1!, {r4, r7}
- orrne r3, r3, r4, lspush #24
- movne r4, r4, lspull #8
- orrne r4, r4, r7, lspush #24
- stmneia r0!, {r3 - r4} @ Shouldnt fault
- tst ip, #4
- movne r3, r7, lspull #8
- ldrne r7, [r1], #4
- orrne r3, r3, r7, lspush #24
- TUSER( strne) r3, [r0], #4 @ Shouldnt fault
- ands ip, ip, #3
- beq .Lc2u_1fupi
-.Lc2u_1nowords: mov r3, r7, get_byte_1
- teq ip, #0
- beq .Lc2u_finished
- cmp ip, #2
-USER( TUSER( strb) r3, [r0], #1) @ May fault
- movge r3, r7, get_byte_2
-USER( TUSER( strgeb) r3, [r0], #1) @ May fault
- movgt r3, r7, get_byte_3
-USER( TUSER( strgtb) r3, [r0], #1) @ May fault
- b .Lc2u_finished
-
-.Lc2u_2fupi: subs r2, r2, #4
- addmi ip, r2, #4
- bmi .Lc2u_2nowords
- mov r3, r7, lspull #16
- ldr r7, [r1], #4
- orr r3, r3, r7, lspush #16
-USER( TUSER( str) r3, [r0], #4) @ May fault
- mov ip, r0, lsl #32 - PAGE_SHIFT
- rsb ip, ip, #0
- movs ip, ip, lsr #32 - PAGE_SHIFT
- beq .Lc2u_2fupi
- cmp r2, ip
- movlt ip, r2
- sub r2, r2, ip
- subs ip, ip, #16
- blt .Lc2u_2rem8lp
-
-.Lc2u_2cpy8lp: mov r3, r7, lspull #16
- ldmia r1!, {r4 - r7}
- subs ip, ip, #16
- orr r3, r3, r4, lspush #16
- mov r4, r4, lspull #16
- orr r4, r4, r5, lspush #16
- mov r5, r5, lspull #16
- orr r5, r5, r6, lspush #16
- mov r6, r6, lspull #16
- orr r6, r6, r7, lspush #16
- stmia r0!, {r3 - r6} @ Shouldnt fault
- bpl .Lc2u_2cpy8lp
-
-.Lc2u_2rem8lp: tst ip, #8
- movne r3, r7, lspull #16
- ldmneia r1!, {r4, r7}
- orrne r3, r3, r4, lspush #16
- movne r4, r4, lspull #16
- orrne r4, r4, r7, lspush #16
- stmneia r0!, {r3 - r4} @ Shouldnt fault
- tst ip, #4
- movne r3, r7, lspull #16
- ldrne r7, [r1], #4
- orrne r3, r3, r7, lspush #16
- TUSER( strne) r3, [r0], #4 @ Shouldnt fault
- ands ip, ip, #3
- beq .Lc2u_2fupi
-.Lc2u_2nowords: mov r3, r7, get_byte_2
- teq ip, #0
- beq .Lc2u_finished
- cmp ip, #2
-USER( TUSER( strb) r3, [r0], #1) @ May fault
- movge r3, r7, get_byte_3
-USER( TUSER( strgeb) r3, [r0], #1) @ May fault
- ldrgtb r3, [r1], #0
-USER( TUSER( strgtb) r3, [r0], #1) @ May fault
- b .Lc2u_finished
-
-.Lc2u_3fupi: subs r2, r2, #4
- addmi ip, r2, #4
- bmi .Lc2u_3nowords
- mov r3, r7, lspull #24
- ldr r7, [r1], #4
- orr r3, r3, r7, lspush #8
-USER( TUSER( str) r3, [r0], #4) @ May fault
- mov ip, r0, lsl #32 - PAGE_SHIFT
- rsb ip, ip, #0
- movs ip, ip, lsr #32 - PAGE_SHIFT
- beq .Lc2u_3fupi
- cmp r2, ip
- movlt ip, r2
- sub r2, r2, ip
- subs ip, ip, #16
- blt .Lc2u_3rem8lp
-
-.Lc2u_3cpy8lp: mov r3, r7, lspull #24
- ldmia r1!, {r4 - r7}
- subs ip, ip, #16
- orr r3, r3, r4, lspush #8
- mov r4, r4, lspull #24
- orr r4, r4, r5, lspush #8
- mov r5, r5, lspull #24
- orr r5, r5, r6, lspush #8
- mov r6, r6, lspull #24
- orr r6, r6, r7, lspush #8
- stmia r0!, {r3 - r6} @ Shouldnt fault
- bpl .Lc2u_3cpy8lp
-
-.Lc2u_3rem8lp: tst ip, #8
- movne r3, r7, lspull #24
- ldmneia r1!, {r4, r7}
- orrne r3, r3, r4, lspush #8
- movne r4, r4, lspull #24
- orrne r4, r4, r7, lspush #8
- stmneia r0!, {r3 - r4} @ Shouldnt fault
- tst ip, #4
- movne r3, r7, lspull #24
- ldrne r7, [r1], #4
- orrne r3, r3, r7, lspush #8
- TUSER( strne) r3, [r0], #4 @ Shouldnt fault
- ands ip, ip, #3
- beq .Lc2u_3fupi
-.Lc2u_3nowords: mov r3, r7, get_byte_3
- teq ip, #0
- beq .Lc2u_finished
- cmp ip, #2
-USER( TUSER( strb) r3, [r0], #1) @ May fault
- ldrgeb r3, [r1], #1
-USER( TUSER( strgeb) r3, [r0], #1) @ May fault
- ldrgtb r3, [r1], #0
-USER( TUSER( strgtb) r3, [r0], #1) @ May fault
- b .Lc2u_finished
-ENDPROC(__copy_to_user)
-
- .pushsection .fixup,"ax"
- .align 0
-9001: ldmfd sp!, {r0, r4 - r7, pc}
- .popsection
-
-/* Prototype: unsigned long __copy_from_user(void *to,const void *from,unsigned long n);
- * Purpose : copy a block from user memory to kernel memory
- * Params : to - kernel memory
- * : from - user memory
- * : n - number of bytes to copy
- * Returns : Number of bytes NOT copied.
- */
-.Lcfu_dest_not_aligned:
- rsb ip, ip, #4
- cmp ip, #2
-USER( TUSER( ldrb) r3, [r1], #1) @ May fault
- strb r3, [r0], #1
-USER( TUSER( ldrgeb) r3, [r1], #1) @ May fault
- strgeb r3, [r0], #1
-USER( TUSER( ldrgtb) r3, [r1], #1) @ May fault
- strgtb r3, [r0], #1
- sub r2, r2, ip
- b .Lcfu_dest_aligned
-
-ENTRY(__copy_from_user)
- stmfd sp!, {r0, r2, r4 - r7, lr}
- cmp r2, #4
- blt .Lcfu_not_enough
- ands ip, r0, #3
- bne .Lcfu_dest_not_aligned
-.Lcfu_dest_aligned:
- ands ip, r1, #3
- bne .Lcfu_src_not_aligned
-
-/*
- * Seeing as there has to be at least 8 bytes to copy, we can
- * copy one word, and force a user-mode page fault...
- */
-
-.Lcfu_0fupi: subs r2, r2, #4
- addmi ip, r2, #4
- bmi .Lcfu_0nowords
-USER( TUSER( ldr) r3, [r1], #4)
- str r3, [r0], #4
- mov ip, r1, lsl #32 - PAGE_SHIFT @ On each page, use a ld/st??t instruction
- rsb ip, ip, #0
- movs ip, ip, lsr #32 - PAGE_SHIFT
- beq .Lcfu_0fupi
-/*
- * ip = max no. of bytes to copy before needing another "strt" insn
- */
- cmp r2, ip
- movlt ip, r2
- sub r2, r2, ip
- subs ip, ip, #32
- blt .Lcfu_0rem8lp
-
-.Lcfu_0cpy8lp: ldmia r1!, {r3 - r6} @ Shouldnt fault
- stmia r0!, {r3 - r6}
- ldmia r1!, {r3 - r6} @ Shouldnt fault
- subs ip, ip, #32
- stmia r0!, {r3 - r6}
- bpl .Lcfu_0cpy8lp
-
-.Lcfu_0rem8lp: cmn ip, #16
- ldmgeia r1!, {r3 - r6} @ Shouldnt fault
- stmgeia r0!, {r3 - r6}
- tst ip, #8
- ldmneia r1!, {r3 - r4} @ Shouldnt fault
- stmneia r0!, {r3 - r4}
- tst ip, #4
- TUSER( ldrne) r3, [r1], #4 @ Shouldnt fault
- strne r3, [r0], #4
- ands ip, ip, #3
- beq .Lcfu_0fupi
-.Lcfu_0nowords: teq ip, #0
- beq .Lcfu_finished
-.Lcfu_nowords: cmp ip, #2
-USER( TUSER( ldrb) r3, [r1], #1) @ May fault
- strb r3, [r0], #1
-USER( TUSER( ldrgeb) r3, [r1], #1) @ May fault
- strgeb r3, [r0], #1
-USER( TUSER( ldrgtb) r3, [r1], #1) @ May fault
- strgtb r3, [r0], #1
- b .Lcfu_finished
-
-.Lcfu_not_enough:
- movs ip, r2
- bne .Lcfu_nowords
-.Lcfu_finished: mov r0, #0
- add sp, sp, #8
- ldmfd sp!, {r4 - r7, pc}
-
-.Lcfu_src_not_aligned:
- bic r1, r1, #3
-USER( TUSER( ldr) r7, [r1], #4) @ May fault
- cmp ip, #2
- bgt .Lcfu_3fupi
- beq .Lcfu_2fupi
-.Lcfu_1fupi: subs r2, r2, #4
- addmi ip, r2, #4
- bmi .Lcfu_1nowords
- mov r3, r7, lspull #8
-USER( TUSER( ldr) r7, [r1], #4) @ May fault
- orr r3, r3, r7, lspush #24
- str r3, [r0], #4
- mov ip, r1, lsl #32 - PAGE_SHIFT
- rsb ip, ip, #0
- movs ip, ip, lsr #32 - PAGE_SHIFT
- beq .Lcfu_1fupi
- cmp r2, ip
- movlt ip, r2
- sub r2, r2, ip
- subs ip, ip, #16
- blt .Lcfu_1rem8lp
-
-.Lcfu_1cpy8lp: mov r3, r7, lspull #8
- ldmia r1!, {r4 - r7} @ Shouldnt fault
- subs ip, ip, #16
- orr r3, r3, r4, lspush #24
- mov r4, r4, lspull #8
- orr r4, r4, r5, lspush #24
- mov r5, r5, lspull #8
- orr r5, r5, r6, lspush #24
- mov r6, r6, lspull #8
- orr r6, r6, r7, lspush #24
- stmia r0!, {r3 - r6}
- bpl .Lcfu_1cpy8lp
-
-.Lcfu_1rem8lp: tst ip, #8
- movne r3, r7, lspull #8
- ldmneia r1!, {r4, r7} @ Shouldnt fault
- orrne r3, r3, r4, lspush #24
- movne r4, r4, lspull #8
- orrne r4, r4, r7, lspush #24
- stmneia r0!, {r3 - r4}
- tst ip, #4
- movne r3, r7, lspull #8
-USER( TUSER( ldrne) r7, [r1], #4) @ May fault
- orrne r3, r3, r7, lspush #24
- strne r3, [r0], #4
- ands ip, ip, #3
- beq .Lcfu_1fupi
-.Lcfu_1nowords: mov r3, r7, get_byte_1
- teq ip, #0
- beq .Lcfu_finished
- cmp ip, #2
- strb r3, [r0], #1
- movge r3, r7, get_byte_2
- strgeb r3, [r0], #1
- movgt r3, r7, get_byte_3
- strgtb r3, [r0], #1
- b .Lcfu_finished
-
-.Lcfu_2fupi: subs r2, r2, #4
- addmi ip, r2, #4
- bmi .Lcfu_2nowords
- mov r3, r7, lspull #16
-USER( TUSER( ldr) r7, [r1], #4) @ May fault
- orr r3, r3, r7, lspush #16
- str r3, [r0], #4
- mov ip, r1, lsl #32 - PAGE_SHIFT
- rsb ip, ip, #0
- movs ip, ip, lsr #32 - PAGE_SHIFT
- beq .Lcfu_2fupi
- cmp r2, ip
- movlt ip, r2
- sub r2, r2, ip
- subs ip, ip, #16
- blt .Lcfu_2rem8lp
-
-
-.Lcfu_2cpy8lp: mov r3, r7, lspull #16
- ldmia r1!, {r4 - r7} @ Shouldnt fault
- subs ip, ip, #16
- orr r3, r3, r4, lspush #16
- mov r4, r4, lspull #16
- orr r4, r4, r5, lspush #16
- mov r5, r5, lspull #16
- orr r5, r5, r6, lspush #16
- mov r6, r6, lspull #16
- orr r6, r6, r7, lspush #16
- stmia r0!, {r3 - r6}
- bpl .Lcfu_2cpy8lp
-
-.Lcfu_2rem8lp: tst ip, #8
- movne r3, r7, lspull #16
- ldmneia r1!, {r4, r7} @ Shouldnt fault
- orrne r3, r3, r4, lspush #16
- movne r4, r4, lspull #16
- orrne r4, r4, r7, lspush #16
- stmneia r0!, {r3 - r4}
- tst ip, #4
- movne r3, r7, lspull #16
-USER( TUSER( ldrne) r7, [r1], #4) @ May fault
- orrne r3, r3, r7, lspush #16
- strne r3, [r0], #4
- ands ip, ip, #3
- beq .Lcfu_2fupi
-.Lcfu_2nowords: mov r3, r7, get_byte_2
- teq ip, #0
- beq .Lcfu_finished
- cmp ip, #2
- strb r3, [r0], #1
- movge r3, r7, get_byte_3
- strgeb r3, [r0], #1
-USER( TUSER( ldrgtb) r3, [r1], #0) @ May fault
- strgtb r3, [r0], #1
- b .Lcfu_finished
-
-.Lcfu_3fupi: subs r2, r2, #4
- addmi ip, r2, #4
- bmi .Lcfu_3nowords
- mov r3, r7, lspull #24
-USER( TUSER( ldr) r7, [r1], #4) @ May fault
- orr r3, r3, r7, lspush #8
- str r3, [r0], #4
- mov ip, r1, lsl #32 - PAGE_SHIFT
- rsb ip, ip, #0
- movs ip, ip, lsr #32 - PAGE_SHIFT
- beq .Lcfu_3fupi
- cmp r2, ip
- movlt ip, r2
- sub r2, r2, ip
- subs ip, ip, #16
- blt .Lcfu_3rem8lp
-
-.Lcfu_3cpy8lp: mov r3, r7, lspull #24
- ldmia r1!, {r4 - r7} @ Shouldnt fault
- orr r3, r3, r4, lspush #8
- mov r4, r4, lspull #24
- orr r4, r4, r5, lspush #8
- mov r5, r5, lspull #24
- orr r5, r5, r6, lspush #8
- mov r6, r6, lspull #24
- orr r6, r6, r7, lspush #8
- stmia r0!, {r3 - r6}
- subs ip, ip, #16
- bpl .Lcfu_3cpy8lp
-
-.Lcfu_3rem8lp: tst ip, #8
- movne r3, r7, lspull #24
- ldmneia r1!, {r4, r7} @ Shouldnt fault
- orrne r3, r3, r4, lspush #8
- movne r4, r4, lspull #24
- orrne r4, r4, r7, lspush #8
- stmneia r0!, {r3 - r4}
- tst ip, #4
- movne r3, r7, lspull #24
-USER( TUSER( ldrne) r7, [r1], #4) @ May fault
- orrne r3, r3, r7, lspush #8
- strne r3, [r0], #4
- ands ip, ip, #3
- beq .Lcfu_3fupi
-.Lcfu_3nowords: mov r3, r7, get_byte_3
- teq ip, #0
- beq .Lcfu_finished
- cmp ip, #2
- strb r3, [r0], #1
-USER( TUSER( ldrgeb) r3, [r1], #1) @ May fault
- strgeb r3, [r0], #1
-USER( TUSER( ldrgtb) r3, [r1], #1) @ May fault
- strgtb r3, [r0], #1
- b .Lcfu_finished
-ENDPROC(__copy_from_user)
-
- .pushsection .fixup,"ax"
- .align 0
- /*
- * We took an exception. r0 contains a pointer to
- * the byte not copied.
- */
-9001: ldr r2, [sp], #4 @ void *to
- sub r2, r0, r2 @ bytes copied
- ldr r1, [sp], #4 @ unsigned long count
- subs r4, r1, r2 @ bytes left to copy
- movne r1, r4
- blne __memzero
- mov r0, r4
- ldmfd sp!, {r4 - r7, pc}
- .popsection
-
diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c
index 3e58d710013c..b3e68086b347 100644
--- a/arch/arm/lib/uaccess_with_memcpy.c
+++ b/arch/arm/lib/uaccess_with_memcpy.c
@@ -88,6 +88,7 @@ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
static unsigned long noinline
__copy_to_user_memcpy(void __user *to, const void *from, unsigned long n)
{
+ unsigned long ua_flags;
int atomic;
if (unlikely(segment_eq(get_fs(), KERNEL_DS))) {
@@ -118,7 +119,9 @@ __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n)
if (tocopy > n)
tocopy = n;
+ ua_flags = uaccess_save_and_enable();
memcpy((void *)to, from, tocopy);
+ uaccess_restore(ua_flags);
to += tocopy;
from += tocopy;
n -= tocopy;
@@ -136,7 +139,7 @@ out:
}
unsigned long
-__copy_to_user(void __user *to, const void *from, unsigned long n)
+arm_copy_to_user(void __user *to, const void *from, unsigned long n)
{
/*
* This test is stubbed out of the main function above to keep
@@ -145,14 +148,21 @@ __copy_to_user(void __user *to, const void *from, unsigned long n)
* With frame pointer disabled, tail call optimization kicks in
* as well making this test almost invisible.
*/
- if (n < 64)
- return __copy_to_user_std(to, from, n);
- return __copy_to_user_memcpy(to, from, n);
+ if (n < 64) {
+ unsigned long ua_flags = uaccess_save_and_enable();
+ n = __copy_to_user_std(to, from, n);
+ uaccess_restore(ua_flags);
+ } else {
+ n = __copy_to_user_memcpy(to, from, n);
+ }
+ return n;
}
static unsigned long noinline
__clear_user_memset(void __user *addr, unsigned long n)
{
+ unsigned long ua_flags;
+
if (unlikely(segment_eq(get_fs(), KERNEL_DS))) {
memset((void *)addr, 0, n);
return 0;
@@ -175,7 +185,9 @@ __clear_user_memset(void __user *addr, unsigned long n)
if (tocopy > n)
tocopy = n;
+ ua_flags = uaccess_save_and_enable();
memset((void *)addr, 0, tocopy);
+ uaccess_restore(ua_flags);
addr += tocopy;
n -= tocopy;
@@ -190,12 +202,17 @@ out:
return n;
}
-unsigned long __clear_user(void __user *addr, unsigned long n)
+unsigned long arm_clear_user(void __user *addr, unsigned long n)
{
/* See rational for this in __copy_to_user() above. */
- if (n < 64)
- return __clear_user_std(addr, n);
- return __clear_user_memset(addr, n);
+ if (n < 64) {
+ unsigned long ua_flags = uaccess_save_and_enable();
+ n = __clear_user_std(addr, n);
+ uaccess_restore(ua_flags);
+ } else {
+ n = __clear_user_memset(addr, n);
+ }
+ return n;
}
#if 0
diff --git a/arch/arm/mach-davinci/Kconfig b/arch/arm/mach-davinci/Kconfig
index 584e8d4e2892..6bb1049329c9 100644
--- a/arch/arm/mach-davinci/Kconfig
+++ b/arch/arm/mach-davinci/Kconfig
@@ -198,17 +198,6 @@ config DA850_UI_SD_VIDEO_PORT
endchoice
-config DA850_WL12XX
- bool "AM18x wl1271 daughter board"
- depends on MACH_DAVINCI_DA850_EVM
- help
- The wl1271 daughter card for AM18x EVMs is a combo wireless
- connectivity add-on card, based on the LS Research TiWi module with
- Texas Instruments' wl1271 solution.
- Say Y if you want to use a wl1271 expansion card connected to the
- AM18x EVM.
-
-
config MACH_MITYOMAPL138
bool "Critical Link MityDSP-L138/MityARM-1808 SoM"
depends on ARCH_DAVINCI_DA850
diff --git a/arch/arm/mach-davinci/board-da850-evm.c b/arch/arm/mach-davinci/board-da850-evm.c
index fa11415e906a..83f579add9e4 100644
--- a/arch/arm/mach-davinci/board-da850-evm.c
+++ b/arch/arm/mach-davinci/board-da850-evm.c
@@ -38,7 +38,6 @@
#include <linux/regulator/fixed.h>
#include <linux/spi/spi.h>
#include <linux/spi/flash.h>
-#include <linux/wl12xx.h>
#include <mach/common.h>
#include <mach/cp_intc.h>
@@ -60,9 +59,6 @@
#define DA850_MMCSD_CD_PIN GPIO_TO_PIN(4, 0)
#define DA850_MMCSD_WP_PIN GPIO_TO_PIN(4, 1)
-#define DA850_WLAN_EN GPIO_TO_PIN(6, 9)
-#define DA850_WLAN_IRQ GPIO_TO_PIN(6, 10)
-
#define DA850_MII_MDIO_CLKEN_PIN GPIO_TO_PIN(2, 6)
static struct mtd_partition da850evm_spiflash_part[] = {
@@ -1346,109 +1342,6 @@ static __init void da850_vpif_init(void)
static __init void da850_vpif_init(void) {}
#endif
-#ifdef CONFIG_DA850_WL12XX
-
-static void wl12xx_set_power(int index, bool power_on)
-{
- static bool power_state;
-
- pr_debug("Powering %s wl12xx", power_on ? "on" : "off");
-
- if (power_on == power_state)
- return;
- power_state = power_on;
-
- if (power_on) {
- /* Power up sequence required for wl127x devices */
- gpio_set_value(DA850_WLAN_EN, 1);
- usleep_range(15000, 15000);
- gpio_set_value(DA850_WLAN_EN, 0);
- usleep_range(1000, 1000);
- gpio_set_value(DA850_WLAN_EN, 1);
- msleep(70);
- } else {
- gpio_set_value(DA850_WLAN_EN, 0);
- }
-}
-
-static struct davinci_mmc_config da850_wl12xx_mmc_config = {
- .set_power = wl12xx_set_power,
- .wires = 4,
- .max_freq = 25000000,
- .caps = MMC_CAP_4_BIT_DATA | MMC_CAP_NONREMOVABLE |
- MMC_CAP_POWER_OFF_CARD,
-};
-
-static const short da850_wl12xx_pins[] __initconst = {
- DA850_MMCSD1_DAT_0, DA850_MMCSD1_DAT_1, DA850_MMCSD1_DAT_2,
- DA850_MMCSD1_DAT_3, DA850_MMCSD1_CLK, DA850_MMCSD1_CMD,
- DA850_GPIO6_9, DA850_GPIO6_10,
- -1
-};
-
-static struct wl12xx_platform_data da850_wl12xx_wlan_data __initdata = {
- .irq = -1,
- .board_ref_clock = WL12XX_REFCLOCK_38,
- .platform_quirks = WL12XX_PLATFORM_QUIRK_EDGE_IRQ,
-};
-
-static __init int da850_wl12xx_init(void)
-{
- int ret;
-
- ret = davinci_cfg_reg_list(da850_wl12xx_pins);
- if (ret) {
- pr_err("wl12xx/mmc mux setup failed: %d\n", ret);
- goto exit;
- }
-
- ret = da850_register_mmcsd1(&da850_wl12xx_mmc_config);
- if (ret) {
- pr_err("wl12xx/mmc registration failed: %d\n", ret);
- goto exit;
- }
-
- ret = gpio_request_one(DA850_WLAN_EN, GPIOF_OUT_INIT_LOW, "wl12xx_en");
- if (ret) {
- pr_err("Could not request wl12xx enable gpio: %d\n", ret);
- goto exit;
- }
-
- ret = gpio_request_one(DA850_WLAN_IRQ, GPIOF_IN, "wl12xx_irq");
- if (ret) {
- pr_err("Could not request wl12xx irq gpio: %d\n", ret);
- goto free_wlan_en;
- }
-
- da850_wl12xx_wlan_data.irq = gpio_to_irq(DA850_WLAN_IRQ);
-
- ret = wl12xx_set_platform_data(&da850_wl12xx_wlan_data);
- if (ret) {
- pr_err("Could not set wl12xx data: %d\n", ret);
- goto free_wlan_irq;
- }
-
- return 0;
-
-free_wlan_irq:
- gpio_free(DA850_WLAN_IRQ);
-
-free_wlan_en:
- gpio_free(DA850_WLAN_EN);
-
-exit:
- return ret;
-}
-
-#else /* CONFIG_DA850_WL12XX */
-
-static __init int da850_wl12xx_init(void)
-{
- return 0;
-}
-
-#endif /* CONFIG_DA850_WL12XX */
-
#define DA850EVM_SATA_REFCLKPN_RATE (100 * 1000 * 1000)
static __init void da850_evm_init(void)
@@ -1505,11 +1398,6 @@ static __init void da850_evm_init(void)
if (ret)
pr_warn("%s: MMCSD0 registration failed: %d\n",
__func__, ret);
-
- ret = da850_wl12xx_init();
- if (ret)
- pr_warn("%s: WL12xx initialization failed: %d\n",
- __func__, ret);
}
davinci_serial_init(da8xx_serial_device);
diff --git a/arch/arm/mach-keystone/keystone.c b/arch/arm/mach-keystone/keystone.c
index 7f352de26099..b9cb738895b0 100644
--- a/arch/arm/mach-keystone/keystone.c
+++ b/arch/arm/mach-keystone/keystone.c
@@ -62,11 +62,9 @@ static phys_addr_t keystone_virt_to_idmap(unsigned long x)
return (phys_addr_t)(x) - CONFIG_PAGE_OFFSET + KEYSTONE_LOW_PHYS_START;
}
-static void __init keystone_init_meminfo(void)
+static long long __init keystone_init_meminfo(void)
{
- bool lpae = IS_ENABLED(CONFIG_ARM_LPAE);
- bool pvpatch = IS_ENABLED(CONFIG_ARM_PATCH_PHYS_VIRT);
- phys_addr_t offset = PHYS_OFFSET - KEYSTONE_LOW_PHYS_START;
+ long long offset;
phys_addr_t mem_start, mem_end;
mem_start = memblock_start_of_DRAM();
@@ -75,24 +73,16 @@ static void __init keystone_init_meminfo(void)
/* nothing to do if we are running out of the <32-bit space */
if (mem_start >= KEYSTONE_LOW_PHYS_START &&
mem_end <= KEYSTONE_LOW_PHYS_END)
- return;
-
- if (!lpae || !pvpatch) {
- pr_crit("Enable %s%s%s to run outside 32-bit space\n",
- !lpae ? __stringify(CONFIG_ARM_LPAE) : "",
- (!lpae && !pvpatch) ? " and " : "",
- !pvpatch ? __stringify(CONFIG_ARM_PATCH_PHYS_VIRT) : "");
- }
+ return 0;
if (mem_start < KEYSTONE_HIGH_PHYS_START ||
mem_end > KEYSTONE_HIGH_PHYS_END) {
pr_crit("Invalid address space for memory (%08llx-%08llx)\n",
- (u64)mem_start, (u64)mem_end);
+ (u64)mem_start, (u64)mem_end);
+ return 0;
}
- offset += KEYSTONE_HIGH_PHYS_START;
- __pv_phys_pfn_offset = PFN_DOWN(offset);
- __pv_offset = (offset - PAGE_OFFSET);
+ offset = KEYSTONE_HIGH_PHYS_START - KEYSTONE_LOW_PHYS_START;
/* Populate the arch idmap hook */
arch_virt_to_idmap = keystone_virt_to_idmap;
@@ -100,7 +90,10 @@ static void __init keystone_init_meminfo(void)
keystone_dma_pfn_offset = PFN_DOWN(KEYSTONE_HIGH_PHYS_START -
KEYSTONE_LOW_PHYS_START);
- pr_info("Switching to high address space at 0x%llx\n", (u64)offset);
+ pr_info("Switching to high address space at 0x%llx\n",
+ (u64)PHYS_OFFSET + (u64)offset);
+
+ return offset;
}
static const char *keystone_match[] __initconst = {
diff --git a/arch/arm/mach-keystone/platsmp.c b/arch/arm/mach-keystone/platsmp.c
index 5f46a7cf907b..4bbb18463bfd 100644
--- a/arch/arm/mach-keystone/platsmp.c
+++ b/arch/arm/mach-keystone/platsmp.c
@@ -39,19 +39,6 @@ static int keystone_smp_boot_secondary(unsigned int cpu,
return error;
}
-#ifdef CONFIG_ARM_LPAE
-static void __cpuinit keystone_smp_secondary_initmem(unsigned int cpu)
-{
- pgd_t *pgd0 = pgd_offset_k(0);
- cpu_set_ttbr(1, __pa(pgd0) + TTBR1_OFFSET);
- local_flush_tlb_all();
-}
-#else
-static inline void __cpuinit keystone_smp_secondary_initmem(unsigned int cpu)
-{}
-#endif
-
struct smp_operations keystone_smp_ops __initdata = {
.smp_boot_secondary = keystone_smp_boot_secondary,
- .smp_secondary_init = keystone_smp_secondary_initmem,
};
diff --git a/arch/arm/mach-omap2/pdata-quirks.c b/arch/arm/mach-omap2/pdata-quirks.c
index cec9d6c6442c..cb034dd67545 100644
--- a/arch/arm/mach-omap2/pdata-quirks.c
+++ b/arch/arm/mach-omap2/pdata-quirks.c
@@ -13,7 +13,6 @@
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/of_platform.h>
-#include <linux/wl12xx.h>
#include <linux/platform_data/pinctrl-single.h>
#include <linux/platform_data/iommu-omap.h>
@@ -35,34 +34,6 @@ struct pdata_init {
struct of_dev_auxdata omap_auxdata_lookup[];
static struct twl4030_gpio_platform_data twl_gpio_auxdata;
-#if IS_ENABLED(CONFIG_WL12XX)
-
-static struct wl12xx_platform_data wl12xx __initdata;
-
-static void __init __used legacy_init_wl12xx(unsigned ref_clock,
- unsigned tcxo_clock,
- int gpio)
-{
- int res;
-
- wl12xx.board_ref_clock = ref_clock;
- wl12xx.board_tcxo_clock = tcxo_clock;
- wl12xx.irq = gpio_to_irq(gpio);
-
- res = wl12xx_set_platform_data(&wl12xx);
- if (res) {
- pr_err("error setting wl12xx data: %d\n", res);
- return;
- }
-}
-#else
-static inline void legacy_init_wl12xx(unsigned ref_clock,
- unsigned tcxo_clock,
- int gpio)
-{
-}
-#endif
-
#ifdef CONFIG_MACH_NOKIA_N8X0
static void __init omap2420_n8x0_legacy_init(void)
{
@@ -129,7 +100,6 @@ static void __init omap3_sbc_t3730_twl_init(void)
static void __init omap3_sbc_t3730_legacy_init(void)
{
omap3_sbc_t3x_usb_hub_init(167, "sb-t35 usb hub");
- legacy_init_wl12xx(WL12XX_REFCLOCK_38, 0, 136);
omap_ads7846_init(1, 57, 0, NULL);
}
@@ -139,18 +109,8 @@ static void __init omap3_sbc_t3530_legacy_init(void)
omap_ads7846_init(1, 57, 0, NULL);
}
-static void __init omap3_igep0020_legacy_init(void)
-{
-}
-
static void __init omap3_evm_legacy_init(void)
{
- legacy_init_wl12xx(WL12XX_REFCLOCK_38, 0, 149);
-}
-
-static void __init omap3_zoom_legacy_init(void)
-{
- legacy_init_wl12xx(WL12XX_REFCLOCK_26, 0, 162);
}
static void am35xx_enable_emac_int(void)
@@ -217,7 +177,6 @@ static void __init omap3_sbc_t3517_legacy_init(void)
am35xx_emac_reset();
hsmmc2_internal_input_clk();
omap3_sbc_t3517_wifi_init();
- legacy_init_wl12xx(WL12XX_REFCLOCK_38, 0, 145);
omap_ads7846_init(1, 57, 0, NULL);
}
@@ -260,24 +219,6 @@ static void __init omap3_tao3530_legacy_init(void)
}
#endif /* CONFIG_ARCH_OMAP3 */
-#ifdef CONFIG_ARCH_OMAP4
-static void __init omap4_sdp_legacy_init(void)
-{
- legacy_init_wl12xx(WL12XX_REFCLOCK_26,
- WL12XX_TCXOCLOCK_26, 53);
-}
-
-static void __init omap4_panda_legacy_init(void)
-{
- legacy_init_wl12xx(WL12XX_REFCLOCK_38, 0, 53);
-}
-
-static void __init var_som_om44_legacy_init(void)
-{
- legacy_init_wl12xx(WL12XX_REFCLOCK_38, 0, 41);
-}
-#endif
-
#if defined(CONFIG_ARCH_OMAP4) || defined(CONFIG_SOC_OMAP5)
static struct iommu_platform_data omap4_iommu_pdata = {
.reset_name = "mmu_cache",
@@ -286,13 +227,6 @@ static struct iommu_platform_data omap4_iommu_pdata = {
};
#endif
-#ifdef CONFIG_SOC_AM33XX
-static void __init am335x_evmsk_legacy_init(void)
-{
- legacy_init_wl12xx(WL12XX_REFCLOCK_38, 0, 31);
-}
-#endif
-
#ifdef CONFIG_SOC_OMAP5
static void __init omap5_uevm_legacy_init(void)
{
@@ -390,21 +324,10 @@ static struct pdata_init pdata_quirks[] __initdata = {
{ "nokia,omap3-n900", nokia_n900_legacy_init, },
{ "nokia,omap3-n9", hsmmc2_internal_input_clk, },
{ "nokia,omap3-n950", hsmmc2_internal_input_clk, },
- { "isee,omap3-igep0020", omap3_igep0020_legacy_init, },
{ "ti,omap3-evm-37xx", omap3_evm_legacy_init, },
- { "ti,omap3-zoom3", omap3_zoom_legacy_init, },
{ "ti,am3517-evm", am3517_evm_legacy_init, },
{ "technexion,omap3-tao3530", omap3_tao3530_legacy_init, },
#endif
-#ifdef CONFIG_ARCH_OMAP4
- { "ti,omap4-sdp", omap4_sdp_legacy_init, },
- { "ti,omap4-panda", omap4_panda_legacy_init, },
- { "variscite,var-dvk-om44", var_som_om44_legacy_init, },
- { "variscite,var-stk-om44", var_som_om44_legacy_init, },
-#endif
-#ifdef CONFIG_SOC_AM33XX
- { "ti,am335x-evmsk", am335x_evmsk_legacy_init, },
-#endif
#ifdef CONFIG_SOC_OMAP5
{ "ti,omap5-uevm", omap5_uevm_legacy_init, },
#endif
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index 7eb94e6fc376..f222ea1a8779 100644
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -624,6 +624,10 @@ config ARM_LPAE
If unsure, say N.
+config ARM_PV_FIXUP
+ def_bool y
+ depends on ARM_LPAE && ARM_PATCH_PHYS_VIRT && ARCH_KEYSTONE
+
config ARCH_PHYS_ADDR_T_64BIT
def_bool ARM_LPAE
@@ -825,6 +829,20 @@ config KUSER_HELPERS
Say N here only if you are absolutely certain that you do not
need these helpers; otherwise, the safe option is to say Y.
+config VDSO
+ bool "Enable VDSO for acceleration of some system calls"
+ depends on AEABI && MMU && CPU_V7
+ default y if ARM_ARCH_TIMER
+ select GENERIC_TIME_VSYSCALL
+ help
+ Place in the process address space an ELF shared object
+ providing fast implementations of gettimeofday and
+ clock_gettime. Systems that implement the ARM architected
+ timer will receive maximum benefit.
+
+ You must have glibc 2.22 or later for programs to seamlessly
+ take advantage of this.
+
config DMA_CACHE_RWFO
bool "Enable read/write for ownership DMA cache maintenance"
depends on CPU_V6K && SMP
@@ -1009,3 +1027,24 @@ config ARCH_SUPPORTS_BIG_ENDIAN
help
This option specifies the architecture can support big endian
operation.
+
+config ARM_KERNMEM_PERMS
+ bool "Restrict kernel memory permissions"
+ help
+ If this is set, kernel memory other than kernel text (and rodata)
+ will be made non-executable. The tradeoff is that each region is
+ padded to section-size (1MiB) boundaries (because their permissions
+ are different and splitting the 1M pages into 4K ones causes TLB
+ performance problems), wasting memory.
+
+config DEBUG_RODATA
+ bool "Make kernel text and rodata read-only"
+ depends on ARM_KERNMEM_PERMS
+ default y
+ help
+ If this is set, kernel text and rodata will be made read-only. This
+ is to help catch accidental or malicious attempts to change the
+ kernel's executable code. Additionally splits rodata from kernel
+ text so it can be made explicitly non-executable. This creates
+ another section-size padded region, so it can waste more memory
+ space while gaining the read-only protections.
diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile
index 91da64de440f..f9a13f144266 100644
--- a/arch/arm/mm/Makefile
+++ b/arch/arm/mm/Makefile
@@ -18,6 +18,7 @@ obj-$(CONFIG_MODULES) += proc-syms.o
obj-$(CONFIG_ALIGNMENT_TRAP) += alignment.o
obj-$(CONFIG_HIGHMEM) += highmem.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
+obj-$(CONFIG_ARM_PV_FIXUP) += pv-fixup-asm.o
obj-$(CONFIG_CPU_ABRT_NOMMU) += abort-nommu.o
obj-$(CONFIG_CPU_ABRT_EV4) += abort-ev4.o
diff --git a/arch/arm/mm/abort-ev4.S b/arch/arm/mm/abort-ev4.S
index 54473cd4aba9..b3b31e30cadd 100644
--- a/arch/arm/mm/abort-ev4.S
+++ b/arch/arm/mm/abort-ev4.S
@@ -19,6 +19,7 @@ ENTRY(v4_early_abort)
mrc p15, 0, r1, c5, c0, 0 @ get FSR
mrc p15, 0, r0, c6, c0, 0 @ get FAR
ldr r3, [r4] @ read aborted ARM instruction
+ uaccess_disable ip @ disable userspace access
bic r1, r1, #1 << 11 | 1 << 10 @ clear bits 11 and 10 of FSR
tst r3, #1 << 20 @ L = 1 -> write?
orreq r1, r1, #1 << 11 @ yes.
diff --git a/arch/arm/mm/abort-ev5t.S b/arch/arm/mm/abort-ev5t.S
index a0908d4653a3..a6a381a6caa5 100644
--- a/arch/arm/mm/abort-ev5t.S
+++ b/arch/arm/mm/abort-ev5t.S
@@ -21,8 +21,10 @@ ENTRY(v5t_early_abort)
mrc p15, 0, r0, c6, c0, 0 @ get FAR
do_thumb_abort fsr=r1, pc=r4, psr=r5, tmp=r3
ldreq r3, [r4] @ read aborted ARM instruction
+ uaccess_disable ip @ disable user access
bic r1, r1, #1 << 11 @ clear bits 11 of FSR
- do_ldrd_abort tmp=ip, insn=r3
+ teq_ldrd tmp=ip, insn=r3 @ insn was LDRD?
+ beq do_DataAbort @ yes
tst r3, #1 << 20 @ check write
orreq r1, r1, #1 << 11
b do_DataAbort
diff --git a/arch/arm/mm/abort-ev5tj.S b/arch/arm/mm/abort-ev5tj.S
index 4006b7a61264..00ab011bef58 100644
--- a/arch/arm/mm/abort-ev5tj.S
+++ b/arch/arm/mm/abort-ev5tj.S
@@ -24,7 +24,9 @@ ENTRY(v5tj_early_abort)
bne do_DataAbort
do_thumb_abort fsr=r1, pc=r4, psr=r5, tmp=r3
ldreq r3, [r4] @ read aborted ARM instruction
- do_ldrd_abort tmp=ip, insn=r3
+ uaccess_disable ip @ disable userspace access
+ teq_ldrd tmp=ip, insn=r3 @ insn was LDRD?
+ beq do_DataAbort @ yes
tst r3, #1 << 20 @ L = 0 -> write
orreq r1, r1, #1 << 11 @ yes.
b do_DataAbort
diff --git a/arch/arm/mm/abort-ev6.S b/arch/arm/mm/abort-ev6.S
index 8c48c5c22a33..8801a15aa105 100644
--- a/arch/arm/mm/abort-ev6.S
+++ b/arch/arm/mm/abort-ev6.S
@@ -26,16 +26,18 @@ ENTRY(v6_early_abort)
ldr ip, =0x4107b36
mrc p15, 0, r3, c0, c0, 0 @ get processor id
teq ip, r3, lsr #4 @ r0 ARM1136?
- bne do_DataAbort
+ bne 1f
tst r5, #PSR_J_BIT @ Java?
tsteq r5, #PSR_T_BIT @ Thumb?
- bne do_DataAbort
+ bne 1f
bic r1, r1, #1 << 11 @ clear bit 11 of FSR
ldr r3, [r4] @ read aborted ARM instruction
ARM_BE8(rev r3, r3)
- do_ldrd_abort tmp=ip, insn=r3
+ teq_ldrd tmp=ip, insn=r3 @ insn was LDRD?
+ beq 1f @ yes
tst r3, #1 << 20 @ L = 0 -> write
orreq r1, r1, #1 << 11 @ yes.
#endif
+1: uaccess_disable ip @ disable userspace access
b do_DataAbort
diff --git a/arch/arm/mm/abort-ev7.S b/arch/arm/mm/abort-ev7.S
index 4812ad054214..e8d0e08c227f 100644
--- a/arch/arm/mm/abort-ev7.S
+++ b/arch/arm/mm/abort-ev7.S
@@ -15,6 +15,7 @@
ENTRY(v7_early_abort)
mrc p15, 0, r1, c5, c0, 0 @ get FSR
mrc p15, 0, r0, c6, c0, 0 @ get FAR
+ uaccess_disable ip @ disable userspace access
/*
* V6 code adjusts the returned DFSR.
diff --git a/arch/arm/mm/abort-lv4t.S b/arch/arm/mm/abort-lv4t.S
index f3982580c273..6d8e8e3365d1 100644
--- a/arch/arm/mm/abort-lv4t.S
+++ b/arch/arm/mm/abort-lv4t.S
@@ -26,6 +26,7 @@ ENTRY(v4t_late_abort)
#endif
bne .data_thumb_abort
ldr r8, [r4] @ read arm instruction
+ uaccess_disable ip @ disable userspace access
tst r8, #1 << 20 @ L = 1 -> write?
orreq r1, r1, #1 << 11 @ yes.
and r7, r8, #15 << 24
@@ -155,6 +156,7 @@ ENTRY(v4t_late_abort)
.data_thumb_abort:
ldrh r8, [r4] @ read instruction
+ uaccess_disable ip @ disable userspace access
tst r8, #1 << 11 @ L = 1 -> write?
orreq r1, r1, #1 << 8 @ yes
and r7, r8, #15 << 12
diff --git a/arch/arm/mm/abort-macro.S b/arch/arm/mm/abort-macro.S
index 2cbf68ef0e83..4509bee4e081 100644
--- a/arch/arm/mm/abort-macro.S
+++ b/arch/arm/mm/abort-macro.S
@@ -13,6 +13,7 @@
tst \psr, #PSR_T_BIT
beq not_thumb
ldrh \tmp, [\pc] @ Read aborted Thumb instruction
+ uaccess_disable ip @ disable userspace access
and \tmp, \tmp, # 0xfe00 @ Mask opcode field
cmp \tmp, # 0x5600 @ Is it ldrsb?
orreq \tmp, \tmp, #1 << 11 @ Set L-bit if yes
@@ -29,12 +30,9 @@ not_thumb:
* [7:4] == 1101
* [20] == 0
*/
- .macro do_ldrd_abort, tmp, insn
- tst \insn, #0x0e100000 @ [27:25,20] == 0
- bne not_ldrd
- and \tmp, \insn, #0x000000f0 @ [7:4] == 1101
- cmp \tmp, #0x000000d0
- beq do_DataAbort
-not_ldrd:
+ .macro teq_ldrd, tmp, insn
+ mov \tmp, #0x0e100000
+ orr \tmp, #0x000000f0
+ and \tmp, \insn, \tmp
+ teq \tmp, #0x000000d0
.endm
-
diff --git a/arch/arm/mm/alignment.c b/arch/arm/mm/alignment.c
index 83792f4324ea..3e1a1652455f 100644
--- a/arch/arm/mm/alignment.c
+++ b/arch/arm/mm/alignment.c
@@ -365,15 +365,21 @@ do_alignment_ldrhstrh(unsigned long addr, unsigned long instr, struct pt_regs *r
user:
if (LDST_L_BIT(instr)) {
unsigned long val;
+ unsigned int __ua_flags = uaccess_save_and_enable();
+
get16t_unaligned_check(val, addr);
+ uaccess_restore(__ua_flags);
/* signed half-word? */
if (instr & 0x40)
val = (signed long)((signed short) val);
regs->uregs[rd] = val;
- } else
+ } else {
+ unsigned int __ua_flags = uaccess_save_and_enable();
put16t_unaligned_check(regs->uregs[rd], addr);
+ uaccess_restore(__ua_flags);
+ }
return TYPE_LDST;
@@ -420,14 +426,21 @@ do_alignment_ldrdstrd(unsigned long addr, unsigned long instr,
user:
if (load) {
- unsigned long val;
+ unsigned long val, val2;
+ unsigned int __ua_flags = uaccess_save_and_enable();
+
get32t_unaligned_check(val, addr);
+ get32t_unaligned_check(val2, addr + 4);
+
+ uaccess_restore(__ua_flags);
+
regs->uregs[rd] = val;
- get32t_unaligned_check(val, addr + 4);
- regs->uregs[rd2] = val;
+ regs->uregs[rd2] = val2;
} else {
+ unsigned int __ua_flags = uaccess_save_and_enable();
put32t_unaligned_check(regs->uregs[rd], addr);
put32t_unaligned_check(regs->uregs[rd2], addr + 4);
+ uaccess_restore(__ua_flags);
}
return TYPE_LDST;
@@ -458,10 +471,15 @@ do_alignment_ldrstr(unsigned long addr, unsigned long instr, struct pt_regs *reg
trans:
if (LDST_L_BIT(instr)) {
unsigned int val;
+ unsigned int __ua_flags = uaccess_save_and_enable();
get32t_unaligned_check(val, addr);
+ uaccess_restore(__ua_flags);
regs->uregs[rd] = val;
- } else
+ } else {
+ unsigned int __ua_flags = uaccess_save_and_enable();
put32t_unaligned_check(regs->uregs[rd], addr);
+ uaccess_restore(__ua_flags);
+ }
return TYPE_LDST;
fault:
@@ -531,6 +549,7 @@ do_alignment_ldmstm(unsigned long addr, unsigned long instr, struct pt_regs *reg
#endif
if (user_mode(regs)) {
+ unsigned int __ua_flags = uaccess_save_and_enable();
for (regbits = REGMASK_BITS(instr), rd = 0; regbits;
regbits >>= 1, rd += 1)
if (regbits & 1) {
@@ -542,6 +561,7 @@ do_alignment_ldmstm(unsigned long addr, unsigned long instr, struct pt_regs *reg
put32t_unaligned_check(regs->uregs[rd], eaddr);
eaddr += 4;
}
+ uaccess_restore(__ua_flags);
} else {
for (regbits = REGMASK_BITS(instr), rd = 0; regbits;
regbits >>= 1, rd += 1)
diff --git a/arch/arm/mm/cache-v6.S b/arch/arm/mm/cache-v6.S
index 24659952c278..11da0f50a1fe 100644
--- a/arch/arm/mm/cache-v6.S
+++ b/arch/arm/mm/cache-v6.S
@@ -270,6 +270,11 @@ v6_dma_clean_range:
* - end - virtual end address of region
*/
ENTRY(v6_dma_flush_range)
+#ifdef CONFIG_CACHE_FLUSH_RANGE_LIMIT
+ sub r2, r1, r0
+ cmp r2, #CONFIG_CACHE_FLUSH_RANGE_LIMIT
+ bhi v6_dma_flush_dcache_all
+#endif
#ifdef CONFIG_DMA_CACHE_RWFO
ldrb r2, [r0] @ read for ownership
strb r2, [r0] @ write for ownership
@@ -292,6 +297,18 @@ ENTRY(v6_dma_flush_range)
mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
ret lr
+#ifdef CONFIG_CACHE_FLUSH_RANGE_LIMIT
+v6_dma_flush_dcache_all:
+ mov r0, #0
+#ifdef HARVARD_CACHE
+ mcr p15, 0, r0, c7, c14, 0 @ D cache clean+invalidate
+#else
+ mcr p15, 0, r0, c7, c15, 0 @ Cache clean+invalidate
+#endif
+ mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
+ mov pc, lr
+#endif
+
/*
* dma_map_area(start, size, dir)
* - start - kernel virtual start address
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index eb8830a4c5ed..b784579545e8 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -274,10 +274,10 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
local_irq_enable();
/*
- * If we're in an interrupt or have no user
+ * If we're in an interrupt, or have no irqs, or have no user
* context, we must not take the fault..
*/
- if (in_atomic() || !mm)
+ if (in_atomic() || irqs_disabled() || !mm)
goto no_context;
if (user_mode(regs))
diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c
index e17ed00828d7..b98895d9fe57 100644
--- a/arch/arm/mm/highmem.c
+++ b/arch/arm/mm/highmem.c
@@ -18,19 +18,20 @@
#include <asm/tlbflush.h>
#include "mm.h"
-pte_t *fixmap_page_table;
-
static inline void set_fixmap_pte(int idx, pte_t pte)
{
unsigned long vaddr = __fix_to_virt(idx);
- set_pte_ext(fixmap_page_table + idx, pte, 0);
+ pte_t *ptep = pte_offset_kernel(pmd_off_k(vaddr), vaddr);
+
+ set_pte_ext(ptep, pte, 0);
local_flush_tlb_kernel_page(vaddr);
}
static inline pte_t get_fixmap_pte(unsigned long vaddr)
{
- unsigned long idx = __virt_to_fix(vaddr);
- return *(fixmap_page_table + idx);
+ pte_t *ptep = pte_offset_kernel(pmd_off_k(vaddr), vaddr);
+
+ return *ptep;
}
void *kmap(struct page *page)
@@ -84,7 +85,7 @@ void *kmap_atomic(struct page *page)
* With debugging enabled, kunmap_atomic forces that entry to 0.
* Make sure it was indeed properly unmapped.
*/
- BUG_ON(!pte_none(*(fixmap_page_table + idx)));
+ BUG_ON(!pte_none(get_fixmap_pte(vaddr)));
#endif
/*
* When debugging is off, kunmap_atomic leaves the previous mapping
@@ -137,7 +138,7 @@ void *kmap_atomic_pfn(unsigned long pfn)
idx = type + KM_TYPE_NR * smp_processor_id();
vaddr = __fix_to_virt(idx);
#ifdef CONFIG_DEBUG_HIGHMEM
- BUG_ON(!pte_none(*(fixmap_page_table + idx)));
+ BUG_ON(!pte_none(get_fixmap_pte(vaddr)));
#endif
set_fixmap_pte(idx, pfn_pte(pfn, kmap_prot));
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index 9481f85c56e6..4fe8ecf6a520 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -29,6 +29,7 @@
#include <asm/prom.h>
#include <asm/sections.h>
#include <asm/setup.h>
+#include <asm/system_info.h>
#include <asm/tlb.h>
#include <asm/fixmap.h>
@@ -570,7 +571,7 @@ void __init mem_init(void)
MLK(DTCM_OFFSET, (unsigned long) dtcm_end),
MLK(ITCM_OFFSET, (unsigned long) itcm_end),
#endif
- MLK(FIXADDR_START, FIXADDR_TOP),
+ MLK(FIXADDR_START, FIXADDR_END),
MLM(VMALLOC_START, VMALLOC_END),
MLM(PAGE_OFFSET, (unsigned long)high_memory),
#ifdef CONFIG_HIGHMEM
@@ -615,7 +616,145 @@ void __init mem_init(void)
}
}
-void free_initmem(void)
+#ifdef CONFIG_ARM_KERNMEM_PERMS
+struct section_perm {
+ unsigned long start;
+ unsigned long end;
+ pmdval_t mask;
+ pmdval_t prot;
+ pmdval_t clear;
+};
+
+static struct section_perm nx_perms[] = {
+ /* Make pages tables, etc before _stext RW (set NX). */
+ {
+ .start = PAGE_OFFSET,
+ .end = (unsigned long)_stext,
+ .mask = ~PMD_SECT_XN,
+ .prot = PMD_SECT_XN,
+ },
+ /* Make init RW (set NX). */
+ {
+ .start = (unsigned long)__init_begin,
+ .end = (unsigned long)_sdata,
+ .mask = ~PMD_SECT_XN,
+ .prot = PMD_SECT_XN,
+ },
+#ifdef CONFIG_DEBUG_RODATA
+ /* Make rodata NX (set RO in ro_perms below). */
+ {
+ .start = (unsigned long)__start_rodata,
+ .end = (unsigned long)__init_begin,
+ .mask = ~PMD_SECT_XN,
+ .prot = PMD_SECT_XN,
+ },
+#endif
+};
+
+#ifdef CONFIG_DEBUG_RODATA
+static struct section_perm ro_perms[] = {
+ /* Make kernel code and rodata RX (set RO). */
+ {
+ .start = (unsigned long)_stext,
+ .end = (unsigned long)__init_begin,
+#ifdef CONFIG_ARM_LPAE
+ .mask = ~PMD_SECT_RDONLY,
+ .prot = PMD_SECT_RDONLY,
+#else
+ .mask = ~(PMD_SECT_APX | PMD_SECT_AP_WRITE),
+ .prot = PMD_SECT_APX | PMD_SECT_AP_WRITE,
+ .clear = PMD_SECT_AP_WRITE,
+#endif
+ },
+};
+#endif
+
+/*
+ * Updates section permissions only for the current mm (sections are
+ * copied into each mm). During startup, this is the init_mm. Is only
+ * safe to be called with preemption disabled, as under stop_machine().
+ */
+static inline void section_update(unsigned long addr, pmdval_t mask,
+ pmdval_t prot)
+{
+ struct mm_struct *mm;
+ pmd_t *pmd;
+
+ mm = current->active_mm;
+ pmd = pmd_offset(pud_offset(pgd_offset(mm, addr), addr), addr);
+
+#ifdef CONFIG_ARM_LPAE
+ pmd[0] = __pmd((pmd_val(pmd[0]) & mask) | prot);
+#else
+ if (addr & SECTION_SIZE)
+ pmd[1] = __pmd((pmd_val(pmd[1]) & mask) | prot);
+ else
+ pmd[0] = __pmd((pmd_val(pmd[0]) & mask) | prot);
+#endif
+ flush_pmd_entry(pmd);
+ local_flush_tlb_kernel_range(addr, addr + SECTION_SIZE);
+}
+
+/* Make sure extended page tables are in use. */
+static inline bool arch_has_strict_perms(void)
+{
+ if (cpu_architecture() < CPU_ARCH_ARMv6)
+ return false;
+
+ return !!(get_cr() & CR_XP);
+}
+
+#define set_section_perms(perms, field) { \
+ size_t i; \
+ unsigned long addr; \
+ \
+ if (!arch_has_strict_perms()) \
+ return; \
+ \
+ for (i = 0; i < ARRAY_SIZE(perms); i++) { \
+ if (!IS_ALIGNED(perms[i].start, SECTION_SIZE) || \
+ !IS_ALIGNED(perms[i].end, SECTION_SIZE)) { \
+ pr_err("BUG: section %lx-%lx not aligned to %lx\n", \
+ perms[i].start, perms[i].end, \
+ SECTION_SIZE); \
+ continue; \
+ } \
+ \
+ for (addr = perms[i].start; \
+ addr < perms[i].end; \
+ addr += SECTION_SIZE) \
+ section_update(addr, perms[i].mask, \
+ perms[i].field); \
+ } \
+}
+
+static inline void fix_kernmem_perms(void)
+{
+ set_section_perms(nx_perms, prot);
+}
+
+#ifdef CONFIG_DEBUG_RODATA
+void mark_rodata_ro(void)
+{
+ set_section_perms(ro_perms, prot);
+}
+
+void set_kernel_text_rw(void)
+{
+ set_section_perms(ro_perms, clear);
+}
+
+void set_kernel_text_ro(void)
+{
+ set_section_perms(ro_perms, prot);
+}
+#endif /* CONFIG_DEBUG_RODATA */
+
+#else
+static inline void fix_kernmem_perms(void) { }
+#endif /* CONFIG_ARM_KERNMEM_PERMS */
+
+void free_tcmmem(void)
{
#ifdef CONFIG_HAVE_TCM
extern char __tcm_start, __tcm_end;
@@ -623,6 +762,12 @@ void free_initmem(void)
poison_init_mem(&__tcm_start, &__tcm_end - &__tcm_start);
free_reserved_area(&__tcm_start, &__tcm_end, -1, "TCM link");
#endif
+}
+
+void free_initmem(void)
+{
+ fix_kernmem_perms();
+ free_tcmmem();
poison_init_mem(__init_begin, __init_end - __init_begin);
if (!machine_is_integrator() && !machine_is_cintegrator())
diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c
index 8f9d1cf505dd..dae47dfc8dae 100644
--- a/arch/arm/mm/mmap.c
+++ b/arch/arm/mm/mmap.c
@@ -173,10 +173,9 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
{
unsigned long random_factor = 0UL;
- /* 8 bits of randomness in 20 address space bits */
if ((current->flags & PF_RANDOMIZE) &&
!(current->personality & ADDR_NO_RANDOMIZE))
- random_factor = (get_random_int() % (1 << 8)) << PAGE_SHIFT;
+ random_factor = (get_random_long() & ((1UL << mmap_rnd_bits) - 1)) << PAGE_SHIFT;
if (mmap_is_legacy()) {
mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index 64b12f136f5d..ff61335cbade 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -22,6 +22,7 @@
#include <asm/cputype.h>
#include <asm/sections.h>
#include <asm/cachetype.h>
+#include <asm/fixmap.h>
#include <asm/sections.h>
#include <asm/setup.h>
#include <asm/smp_plat.h>
@@ -52,6 +53,8 @@ EXPORT_SYMBOL(empty_zero_page);
*/
pmd_t *top_pmd;
+pmdval_t user_pmd_table = _PAGE_USER_TABLE;
+
#define CPOLICY_UNCACHED 0
#define CPOLICY_BUFFERED 1
#define CPOLICY_WRITETHROUGH 2
@@ -288,13 +291,13 @@ static struct mem_type mem_types[] = {
.prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
L_PTE_RDONLY,
.prot_l1 = PMD_TYPE_TABLE,
- .domain = DOMAIN_USER,
+ .domain = DOMAIN_VECTORS,
},
[MT_HIGH_VECTORS] = {
.prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
L_PTE_USER | L_PTE_RDONLY,
.prot_l1 = PMD_TYPE_TABLE,
- .domain = DOMAIN_USER,
+ .domain = DOMAIN_VECTORS,
},
[MT_MEMORY_RWX] = {
.prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY,
@@ -393,6 +396,29 @@ SET_MEMORY_FN(x, pte_set_x)
SET_MEMORY_FN(nx, pte_set_nx)
/*
+ * To avoid TLB flush broadcasts, this uses local_flush_tlb_kernel_range().
+ * As a result, this can only be called with preemption disabled, as under
+ * stop_machine().
+ */
+void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot)
+{
+ unsigned long vaddr = __fix_to_virt(idx);
+ pte_t *pte = pte_offset_kernel(pmd_off_k(vaddr), vaddr);
+
+ /* Make sure fixmap region does not exceed available allocation. */
+ BUILD_BUG_ON(FIXADDR_START + (__end_of_fixed_addresses * PAGE_SIZE) >
+ FIXADDR_END);
+ BUG_ON(idx >= __end_of_fixed_addresses);
+
+ if (pgprot_val(prot))
+ set_pte_at(NULL, vaddr, pte,
+ pfn_pte(phys >> PAGE_SHIFT, prot));
+ else
+ pte_clear(NULL, vaddr, pte);
+ local_flush_tlb_kernel_range(vaddr, vaddr + PAGE_SIZE);
+}
+
+/*
* Adjust the PMD section entries according to the CPU in use.
*/
static void __init build_mem_type_table(void)
@@ -538,6 +564,25 @@ static void __init build_mem_type_table(void)
vecs_pgprot |= L_PTE_MT_VECTORS;
#endif
+#ifndef CONFIG_ARM_LPAE
+ /*
+ * We don't use domains on ARMv6 (since this causes problems with
+ * v6/v7 kernels), so we must use a separate memory type for user
+ * r/o, kernel r/w to map the vectors page.
+ */
+ if (cpu_arch == CPU_ARCH_ARMv6)
+ vecs_pgprot |= L_PTE_MT_VECTORS;
+
+ /*
+ * Check is it with support for the PXN bit
+ * in the Short-descriptor translation table format descriptors.
+ */
+ if (cpu_arch == CPU_ARCH_ARMv7 &&
+ (read_cpuid_ext(CPUID_EXT_MMFR0) & 0xF) >= 4) {
+ user_pmd_table |= PMD_PXNTABLE;
+ }
+#endif
+
/*
* ARMv6 and above have extended page tables.
*/
@@ -605,6 +650,11 @@ static void __init build_mem_type_table(void)
}
kern_pgprot |= PTE_EXT_AF;
vecs_pgprot |= PTE_EXT_AF;
+
+ /*
+ * Set PXN for user mappings
+ */
+ user_pgprot |= PTE_EXT_PXN;
#endif
for (i = 0; i < 16; i++) {
@@ -1326,10 +1376,10 @@ static void __init kmap_init(void)
#ifdef CONFIG_HIGHMEM
pkmap_page_table = early_pte_alloc(pmd_off_k(PKMAP_BASE),
PKMAP_BASE, _PAGE_KERNEL_TABLE);
-
- fixmap_page_table = early_pte_alloc(pmd_off_k(FIXADDR_START),
- FIXADDR_START, _PAGE_KERNEL_TABLE);
#endif
+
+ early_pte_alloc(pmd_off_k(FIXADDR_START), FIXADDR_START,
+ _PAGE_KERNEL_TABLE);
}
static void __init map_lowmem(void)
@@ -1349,13 +1399,20 @@ static void __init map_lowmem(void)
if (start >= end)
break;
- if (end < kernel_x_start || start >= kernel_x_end) {
+ if (end < kernel_x_start) {
map.pfn = __phys_to_pfn(start);
map.virtual = __phys_to_virt(start);
map.length = end - start;
map.type = MT_MEMORY_RWX;
create_mapping(&map);
+ } else if (start >= kernel_x_end) {
+ map.pfn = __phys_to_pfn(start);
+ map.virtual = __phys_to_virt(start);
+ map.length = end - start;
+ map.type = MT_MEMORY_RW;
+
+ create_mapping(&map);
} else {
/* This better cover the entire kernel */
if (start < kernel_x_start) {
@@ -1386,7 +1443,11 @@ static void __init map_lowmem(void)
}
}
-#ifdef CONFIG_ARM_LPAE
+#ifdef CONFIG_ARM_PV_FIXUP
+extern unsigned long __atags_pointer;
+typedef void pgtables_remap(long long offset, unsigned long pgd, void *bdata);
+pgtables_remap lpae_pgtables_remap_asm;
+
/*
* early_paging_init() recreates boot time page table setup, allowing machines
* to switch over to a high (>4G) address space on LPAE systems
@@ -1394,106 +1455,65 @@ static void __init map_lowmem(void)
void __init early_paging_init(const struct machine_desc *mdesc,
struct proc_info_list *procinfo)
{
- pmdval_t pmdprot = procinfo->__cpu_mm_mmu_flags;
- unsigned long map_start, map_end;
- pgd_t *pgd0, *pgdk;
- pud_t *pud0, *pudk, *pud_start;
- pmd_t *pmd0, *pmdk;
- phys_addr_t phys;
- int i;
+ pgtables_remap *lpae_pgtables_remap;
+ unsigned long pa_pgd;
+ unsigned int cr, ttbcr;
+ long long offset;
+ void *boot_data;
if (!(mdesc->init_meminfo))
return;
- /* remap kernel code and data */
- map_start = init_mm.start_code & PMD_MASK;
- map_end = ALIGN(init_mm.brk, PMD_SIZE);
-
- /* get a handle on things... */
- pgd0 = pgd_offset_k(0);
- pud_start = pud0 = pud_offset(pgd0, 0);
- pmd0 = pmd_offset(pud0, 0);
+ offset = mdesc->init_meminfo();
+ if (offset == 0)
+ return;
- pgdk = pgd_offset_k(map_start);
- pudk = pud_offset(pgdk, map_start);
- pmdk = pmd_offset(pudk, map_start);
+ /* Re-set the phys pfn offset, and the pv offset */
+ __pv_offset += offset;
+ __pv_phys_pfn_offset += PFN_DOWN(offset);
- mdesc->init_meminfo();
+ /*
+ * Get the address of the remap function in the 1:1 identity
+ * mapping setup by the early page table assembly code. We
+ * must get this prior to the pv update. The following barrier
+ * ensures that this is complete before we fixup any P:V offsets.
+ */
+ lpae_pgtables_remap = (pgtables_remap *)(unsigned long)__pa(lpae_pgtables_remap_asm);
+ pa_pgd = __pa(swapper_pg_dir);
+ boot_data = __va(__atags_pointer);
+ barrier();
/* Run the patch stub to update the constants */
fixup_pv_table(&__pv_table_begin,
(&__pv_table_end - &__pv_table_begin) << 2);
/*
- * Cache cleaning operations for self-modifying code
- * We should clean the entries by MVA but running a
- * for loop over every pv_table entry pointer would
- * just complicate the code.
- */
- flush_cache_louis();
- dsb(ishst);
- isb();
-
- /*
- * FIXME: This code is not architecturally compliant: we modify
- * the mappings in-place, indeed while they are in use by this
- * very same code. This may lead to unpredictable behaviour of
- * the CPU.
- *
- * Even modifying the mappings in a separate page table does
- * not resolve this.
- *
- * The architecture strongly recommends that when a mapping is
- * changed, that it is changed by first going via an invalid
- * mapping and back to the new mapping. This is to ensure that
- * no TLB conflicts (caused by the TLB having more than one TLB
- * entry match a translation) can occur. However, doing that
- * here will result in unmapping the code we are running.
- */
- pr_warn("WARNING: unsafe modification of in-place page tables - tainting kernel\n");
- add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
-
- /*
- * Remap level 1 table. This changes the physical addresses
- * used to refer to the level 2 page tables to the high
- * physical address alias, leaving everything else the same.
- */
- for (i = 0; i < PTRS_PER_PGD; pud0++, i++) {
- set_pud(pud0,
- __pud(__pa(pmd0) | PMD_TYPE_TABLE | L_PGD_SWAPPER));
- pmd0 += PTRS_PER_PMD;
- }
-
- /*
- * Remap the level 2 table, pointing the mappings at the high
- * physical address alias of these pages.
- */
- phys = __pa(map_start);
- do {
- *pmdk++ = __pmd(phys | pmdprot);
- phys += PMD_SIZE;
- } while (phys < map_end);
-
- /*
- * Ensure that the above updates are flushed out of the cache.
- * This is not strictly correct; on a system where the caches
- * are coherent with each other, but the MMU page table walks
- * may not be coherent, flush_cache_all() may be a no-op, and
- * this will fail.
+ * We changing not only the virtual to physical mapping, but also
+ * the physical addresses used to access memory. We need to flush
+ * all levels of cache in the system with caching disabled to
+ * ensure that all data is written back, and nothing is prefetched
+ * into the caches. We also need to prevent the TLB walkers
+ * allocating into the caches too. Note that this is ARMv7 LPAE
+ * specific.
*/
+ cr = get_cr();
+ set_cr(cr & ~(CR_I | CR_C));
+ asm("mrc p15, 0, %0, c2, c0, 2" : "=r" (ttbcr));
+ asm volatile("mcr p15, 0, %0, c2, c0, 2"
+ : : "r" (ttbcr & ~(3 << 8 | 3 << 10)));
flush_cache_all();
/*
- * Re-write the TTBR values to point them at the high physical
- * alias of the page tables. We expect __va() will work on
- * cpu_get_pgd(), which returns the value of TTBR0.
+ * Fixup the page tables - this must be in the idmap region as
+ * we need to disable the MMU to do this safely, and hence it
+ * needs to be assembly. It's fairly simple, as we're using the
+ * temporary tables setup by the initial assembly code.
*/
- cpu_switch_mm(pgd0, &init_mm);
- cpu_set_ttbr(1, __pa(pgd0) + TTBR1_OFFSET);
+ lpae_pgtables_remap(offset, pa_pgd, boot_data);
- /* Finally flush any stale TLB values. */
- local_flush_bp_all();
- local_flush_tlb_all();
+ /* Re-enable the caches and cacheable TLB walks */
+ asm volatile("mcr p15, 0, %0, c2, c0, 2" : : "r" (ttbcr));
+ set_cr(cr);
}
#else
@@ -1501,8 +1521,19 @@ void __init early_paging_init(const struct machine_desc *mdesc,
void __init early_paging_init(const struct machine_desc *mdesc,
struct proc_info_list *procinfo)
{
- if (mdesc->init_meminfo)
- mdesc->init_meminfo();
+ long long offset;
+
+ if (!mdesc->init_meminfo)
+ return;
+
+ offset = mdesc->init_meminfo();
+ if (offset == 0)
+ return;
+
+ pr_crit("Physical address space modification is only to support Keystone2.\n");
+ pr_crit("Please enable ARM_LPAE and ARM_PATCH_PHYS_VIRT support to use this\n");
+ pr_crit("feature. Your kernel may crash now, have a good day.\n");
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
}
#endif
diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c
index 249379535be2..611f23e1d435 100644
--- a/arch/arm/mm/pgd.c
+++ b/arch/arm/mm/pgd.c
@@ -84,6 +84,16 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
if (!new_pte)
goto no_pte;
+#ifndef CONFIG_ARM_LPAE
+ /*
+ * Modify the PTE pointer to have the correct domain. This
+ * needs to be the vectors domain to avoid the low vectors
+ * being unmapped.
+ */
+ pmd_val(*new_pmd) &= ~PMD_DOMAIN_MASK;
+ pmd_val(*new_pmd) |= PMD_DOMAIN(DOMAIN_VECTORS);
+#endif
+
init_pud = pud_offset(init_pgd, 0);
init_pmd = pmd_offset(init_pud, 0);
init_pte = pte_offset_map(init_pmd, 0);
diff --git a/arch/arm/mm/proc-arm1020.S b/arch/arm/mm/proc-arm1020.S
index 86ee5d47ce3c..aa0519eed698 100644
--- a/arch/arm/mm/proc-arm1020.S
+++ b/arch/arm/mm/proc-arm1020.S
@@ -507,7 +507,7 @@ cpu_arm1020_name:
.align
- .section ".proc.info.init", #alloc, #execinstr
+ .section ".proc.info.init", #alloc
.type __arm1020_proc_info,#object
__arm1020_proc_info:
@@ -519,7 +519,7 @@ __arm1020_proc_info:
.long PMD_TYPE_SECT | \
PMD_SECT_AP_WRITE | \
PMD_SECT_AP_READ
- b __arm1020_setup
+ initfn __arm1020_setup, __arm1020_proc_info
.long cpu_arch_name
.long cpu_elf_name
.long HWCAP_SWP | HWCAP_HALF | HWCAP_THUMB
diff --git a/arch/arm/mm/proc-arm1020e.S b/arch/arm/mm/proc-arm1020e.S
index a6331d78601f..bff4c7f70fd6 100644
--- a/arch/arm/mm/proc-arm1020e.S
+++ b/arch/arm/mm/proc-arm1020e.S
@@ -465,7 +465,7 @@ arm1020e_crval:
.align
- .section ".proc.info.init", #alloc, #execinstr
+ .section ".proc.info.init", #alloc
.type __arm1020e_proc_info,#object
__arm1020e_proc_info:
@@ -479,7 +479,7 @@ __arm1020e_proc_info:
PMD_BIT4 | \
PMD_SECT_AP_WRITE | \
PMD_SECT_AP_READ
- b __arm1020e_setup
+ initfn __arm1020e_setup, __arm1020e_proc_info
.long cpu_arch_name
.long cpu_elf_name
.long HWCAP_SWP | HWCAP_HALF | HWCAP_THUMB | HWCAP_EDSP
diff --git a/arch/arm/mm/proc-arm1022.S b/arch/arm/mm/proc-arm1022.S
index a126b7a59928..dbb2413fe04d 100644
--- a/arch/arm/mm/proc-arm1022.S
+++ b/arch/arm/mm/proc-arm1022.S
@@ -448,7 +448,7 @@ arm1022_crval:
.align
- .section ".proc.info.init", #alloc, #execinstr
+ .section ".proc.info.init", #alloc
.type __arm1022_proc_info,#object
__arm1022_proc_info:
@@ -462,7 +462,7 @@ __arm1022_proc_info:
PMD_BIT4 | \
PMD_SECT_AP_WRITE | \
PMD_SECT_AP_READ
- b __arm1022_setup
+ initfn __arm1022_setup, __arm1022_proc_info
.long cpu_arch_name
.long cpu_elf_name
.long HWCAP_SWP | HWCAP_HALF | HWCAP_THUMB | HWCAP_EDSP
diff --git a/arch/arm/mm/proc-arm1026.S b/arch/arm/mm/proc-arm1026.S
index fc294067e977..0b37b2cef9d3 100644
--- a/arch/arm/mm/proc-arm1026.S
+++ b/arch/arm/mm/proc-arm1026.S
@@ -442,7 +442,7 @@ arm1026_crval:
string cpu_arm1026_name, "ARM1026EJ-S"
.align
- .section ".proc.info.init", #alloc, #execinstr
+ .section ".proc.info.init", #alloc
.type __arm1026_proc_info,#object
__arm1026_proc_info:
@@ -456,7 +456,7 @@ __arm1026_proc_info:
PMD_BIT4 | \
PMD_SECT_AP_WRITE | \
PMD_SECT_AP_READ
- b __arm1026_setup
+ initfn __arm1026_setup, __arm1026_proc_info
.long cpu_arch_name
.long cpu_elf_name
.long HWCAP_SWP|HWCAP_HALF|HWCAP_THUMB|HWCAP_FAST_MULT|HWCAP_EDSP|HWCAP_JAVA
diff --git a/arch/arm/mm/proc-arm720.S b/arch/arm/mm/proc-arm720.S
index 2baa66b3ac9b..3651cd70e418 100644
--- a/arch/arm/mm/proc-arm720.S
+++ b/arch/arm/mm/proc-arm720.S
@@ -186,7 +186,7 @@ arm720_crval:
* See <asm/procinfo.h> for a definition of this structure.
*/
- .section ".proc.info.init", #alloc, #execinstr
+ .section ".proc.info.init", #alloc
.macro arm720_proc_info name:req, cpu_val:req, cpu_mask:req, cpu_name:req, cpu_flush:req
.type __\name\()_proc_info,#object
@@ -203,7 +203,7 @@ __\name\()_proc_info:
PMD_BIT4 | \
PMD_SECT_AP_WRITE | \
PMD_SECT_AP_READ
- b \cpu_flush @ cpu_flush
+ initfn \cpu_flush, __\name\()_proc_info @ cpu_flush
.long cpu_arch_name @ arch_name
.long cpu_elf_name @ elf_name
.long HWCAP_SWP | HWCAP_HALF | HWCAP_THUMB @ elf_hwcap
diff --git a/arch/arm/mm/proc-arm740.S b/arch/arm/mm/proc-arm740.S
index ac1ea6b3bce4..024fb7732407 100644
--- a/arch/arm/mm/proc-arm740.S
+++ b/arch/arm/mm/proc-arm740.S
@@ -132,14 +132,14 @@ __arm740_setup:
.align
- .section ".proc.info.init", #alloc, #execinstr
+ .section ".proc.info.init", #alloc
.type __arm740_proc_info,#object
__arm740_proc_info:
.long 0x41807400
.long 0xfffffff0
.long 0
.long 0
- b __arm740_setup
+ initfn __arm740_setup, __arm740_proc_info
.long cpu_arch_name
.long cpu_elf_name
.long HWCAP_SWP | HWCAP_HALF | HWCAP_THUMB | HWCAP_26BIT
diff --git a/arch/arm/mm/proc-arm7tdmi.S b/arch/arm/mm/proc-arm7tdmi.S
index bf6ba4bc30ff..25472d94426d 100644
--- a/arch/arm/mm/proc-arm7tdmi.S
+++ b/arch/arm/mm/proc-arm7tdmi.S
@@ -76,7 +76,7 @@ __arm7tdmi_setup:
.align
- .section ".proc.info.init", #alloc, #execinstr
+ .section ".proc.info.init", #alloc
.macro arm7tdmi_proc_info name:req, cpu_val:req, cpu_mask:req, cpu_name:req, \
extra_hwcaps=0
@@ -86,7 +86,7 @@ __\name\()_proc_info:
.long \cpu_mask
.long 0
.long 0
- b __arm7tdmi_setup
+ initfn __arm7tdmi_setup, __\name\()_proc_info
.long cpu_arch_name
.long cpu_elf_name
.long HWCAP_SWP | HWCAP_26BIT | ( \extra_hwcaps )
diff --git a/arch/arm/mm/proc-arm920.S b/arch/arm/mm/proc-arm920.S
index 22bf8dde4f84..7a14bd4414c9 100644
--- a/arch/arm/mm/proc-arm920.S
+++ b/arch/arm/mm/proc-arm920.S
@@ -448,7 +448,7 @@ arm920_crval:
.align
- .section ".proc.info.init", #alloc, #execinstr
+ .section ".proc.info.init", #alloc
.type __arm920_proc_info,#object
__arm920_proc_info:
@@ -464,7 +464,7 @@ __arm920_proc_info:
PMD_BIT4 | \
PMD_SECT_AP_WRITE | \
PMD_SECT_AP_READ
- b __arm920_setup
+ initfn __arm920_setup, __arm920_proc_info
.long cpu_arch_name
.long cpu_elf_name
.long HWCAP_SWP | HWCAP_HALF | HWCAP_THUMB
diff --git a/arch/arm/mm/proc-arm922.S b/arch/arm/mm/proc-arm922.S
index 0c6d5ac5a6d4..edccfcdcd551 100644
--- a/arch/arm/mm/proc-arm922.S
+++ b/arch/arm/mm/proc-arm922.S
@@ -426,7 +426,7 @@ arm922_crval:
.align
- .section ".proc.info.init", #alloc, #execinstr
+ .section ".proc.info.init", #alloc
.type __arm922_proc_info,#object
__arm922_proc_info:
@@ -442,7 +442,7 @@ __arm922_proc_info:
PMD_BIT4 | \
PMD_SECT_AP_WRITE | \
PMD_SECT_AP_READ
- b __arm922_setup
+ initfn __arm922_setup, __arm922_proc_info
.long cpu_arch_name
.long cpu_elf_name
.long HWCAP_SWP | HWCAP_HALF | HWCAP_THUMB
diff --git a/arch/arm/mm/proc-arm925.S b/arch/arm/mm/proc-arm925.S
index c32d073282ea..ede8c54ab4aa 100644
--- a/arch/arm/mm/proc-arm925.S
+++ b/arch/arm/mm/proc-arm925.S
@@ -494,7 +494,7 @@ arm925_crval:
.align
- .section ".proc.info.init", #alloc, #execinstr
+ .section ".proc.info.init", #alloc
.macro arm925_proc_info name:req, cpu_val:req, cpu_mask:req, cpu_name:req, cache
.type __\name\()_proc_info,#object
@@ -510,7 +510,7 @@ __\name\()_proc_info:
PMD_BIT4 | \
PMD_SECT_AP_WRITE | \
PMD_SECT_AP_READ
- b __arm925_setup
+ initfn __arm925_setup, __\name\()_proc_info
.long cpu_arch_name
.long cpu_elf_name
.long HWCAP_SWP | HWCAP_HALF | HWCAP_THUMB
diff --git a/arch/arm/mm/proc-arm926.S b/arch/arm/mm/proc-arm926.S
index 252b2503038d..fb827c633693 100644
--- a/arch/arm/mm/proc-arm926.S
+++ b/arch/arm/mm/proc-arm926.S
@@ -474,7 +474,7 @@ arm926_crval:
.align
- .section ".proc.info.init", #alloc, #execinstr
+ .section ".proc.info.init", #alloc
.type __arm926_proc_info,#object
__arm926_proc_info:
@@ -490,7 +490,7 @@ __arm926_proc_info:
PMD_BIT4 | \
PMD_SECT_AP_WRITE | \
PMD_SECT_AP_READ
- b __arm926_setup
+ initfn __arm926_setup, __arm926_proc_info
.long cpu_arch_name
.long cpu_elf_name
.long HWCAP_SWP|HWCAP_HALF|HWCAP_THUMB|HWCAP_FAST_MULT|HWCAP_EDSP|HWCAP_JAVA
diff --git a/arch/arm/mm/proc-arm940.S b/arch/arm/mm/proc-arm940.S
index e5212d489377..0a0b7a9167b6 100644
--- a/arch/arm/mm/proc-arm940.S
+++ b/arch/arm/mm/proc-arm940.S
@@ -354,14 +354,14 @@ __arm940_setup:
.align
- .section ".proc.info.init", #alloc, #execinstr
+ .section ".proc.info.init", #alloc
.type __arm940_proc_info,#object
__arm940_proc_info:
.long 0x41009400
.long 0xff00fff0
.long 0
- b __arm940_setup
+ initfn __arm940_setup, __arm940_proc_info
.long cpu_arch_name
.long cpu_elf_name
.long HWCAP_SWP | HWCAP_HALF | HWCAP_THUMB
diff --git a/arch/arm/mm/proc-arm946.S b/arch/arm/mm/proc-arm946.S
index b3dd9b2d0b8e..c85b40d2117e 100644
--- a/arch/arm/mm/proc-arm946.S
+++ b/arch/arm/mm/proc-arm946.S
@@ -409,14 +409,14 @@ __arm946_setup:
.align
- .section ".proc.info.init", #alloc, #execinstr
+ .section ".proc.info.init", #alloc
.type __arm946_proc_info,#object
__arm946_proc_info:
.long 0x41009460
.long 0xff00fff0
.long 0
.long 0
- b __arm946_setup
+ initfn __arm946_setup, __arm946_proc_info
.long cpu_arch_name
.long cpu_elf_name
.long HWCAP_SWP | HWCAP_HALF | HWCAP_THUMB
diff --git a/arch/arm/mm/proc-arm9tdmi.S b/arch/arm/mm/proc-arm9tdmi.S
index 8227322bbb8f..7fac8c612134 100644
--- a/arch/arm/mm/proc-arm9tdmi.S
+++ b/arch/arm/mm/proc-arm9tdmi.S
@@ -70,7 +70,7 @@ __arm9tdmi_setup:
.align
- .section ".proc.info.init", #alloc, #execinstr
+ .section ".proc.info.init", #alloc
.macro arm9tdmi_proc_info name:req, cpu_val:req, cpu_mask:req, cpu_name:req
.type __\name\()_proc_info, #object
@@ -79,7 +79,7 @@ __\name\()_proc_info:
.long \cpu_mask
.long 0
.long 0
- b __arm9tdmi_setup
+ initfn __arm9tdmi_setup, __\name\()_proc_info
.long cpu_arch_name
.long cpu_elf_name
.long HWCAP_SWP | HWCAP_THUMB | HWCAP_26BIT
diff --git a/arch/arm/mm/proc-fa526.S b/arch/arm/mm/proc-fa526.S
index c494886892ba..4001b73af4ee 100644
--- a/arch/arm/mm/proc-fa526.S
+++ b/arch/arm/mm/proc-fa526.S
@@ -190,7 +190,7 @@ fa526_cr1_set:
.align
- .section ".proc.info.init", #alloc, #execinstr
+ .section ".proc.info.init", #alloc
.type __fa526_proc_info,#object
__fa526_proc_info:
@@ -206,7 +206,7 @@ __fa526_proc_info:
PMD_BIT4 | \
PMD_SECT_AP_WRITE | \
PMD_SECT_AP_READ
- b __fa526_setup
+ initfn __fa526_setup, __fa526_proc_info
.long cpu_arch_name
.long cpu_elf_name
.long HWCAP_SWP | HWCAP_HALF
diff --git a/arch/arm/mm/proc-feroceon.S b/arch/arm/mm/proc-feroceon.S
index 03a1b75f2e16..92e08bf37aad 100644
--- a/arch/arm/mm/proc-feroceon.S
+++ b/arch/arm/mm/proc-feroceon.S
@@ -584,7 +584,7 @@ feroceon_crval:
.align
- .section ".proc.info.init", #alloc, #execinstr
+ .section ".proc.info.init", #alloc
.macro feroceon_proc_info name:req, cpu_val:req, cpu_mask:req, cpu_name:req, cache:req
.type __\name\()_proc_info,#object
@@ -601,7 +601,7 @@ __\name\()_proc_info:
PMD_BIT4 | \
PMD_SECT_AP_WRITE | \
PMD_SECT_AP_READ
- b __feroceon_setup
+ initfn __feroceon_setup, __\name\()_proc_info
.long cpu_arch_name
.long cpu_elf_name
.long HWCAP_SWP|HWCAP_HALF|HWCAP_THUMB|HWCAP_FAST_MULT|HWCAP_EDSP
diff --git a/arch/arm/mm/proc-macros.S b/arch/arm/mm/proc-macros.S
index ba1196c968d8..5c455468f320 100644
--- a/arch/arm/mm/proc-macros.S
+++ b/arch/arm/mm/proc-macros.S
@@ -331,3 +331,7 @@ ENTRY(\name\()_tlb_fns)
.globl \x
.equ \x, \y
.endm
+
+.macro initfn, func, base
+ .long \func - \base
+.endm
diff --git a/arch/arm/mm/proc-mohawk.S b/arch/arm/mm/proc-mohawk.S
index 53d393455f13..d65edf717bf7 100644
--- a/arch/arm/mm/proc-mohawk.S
+++ b/arch/arm/mm/proc-mohawk.S
@@ -427,7 +427,7 @@ mohawk_crval:
.align
- .section ".proc.info.init", #alloc, #execinstr
+ .section ".proc.info.init", #alloc
.type __88sv331x_proc_info,#object
__88sv331x_proc_info:
@@ -443,7 +443,7 @@ __88sv331x_proc_info:
PMD_BIT4 | \
PMD_SECT_AP_WRITE | \
PMD_SECT_AP_READ
- b __mohawk_setup
+ initfn __mohawk_setup, __88sv331x_proc_info
.long cpu_arch_name
.long cpu_elf_name
.long HWCAP_SWP|HWCAP_HALF|HWCAP_THUMB|HWCAP_FAST_MULT|HWCAP_EDSP
diff --git a/arch/arm/mm/proc-sa110.S b/arch/arm/mm/proc-sa110.S
index 8008a0461cf5..ee2ce496239f 100644
--- a/arch/arm/mm/proc-sa110.S
+++ b/arch/arm/mm/proc-sa110.S
@@ -199,7 +199,7 @@ sa110_crval:
.align
- .section ".proc.info.init", #alloc, #execinstr
+ .section ".proc.info.init", #alloc
.type __sa110_proc_info,#object
__sa110_proc_info:
@@ -213,7 +213,7 @@ __sa110_proc_info:
.long PMD_TYPE_SECT | \
PMD_SECT_AP_WRITE | \
PMD_SECT_AP_READ
- b __sa110_setup
+ initfn __sa110_setup, __sa110_proc_info
.long cpu_arch_name
.long cpu_elf_name
.long HWCAP_SWP | HWCAP_HALF | HWCAP_26BIT | HWCAP_FAST_MULT
diff --git a/arch/arm/mm/proc-sa1100.S b/arch/arm/mm/proc-sa1100.S
index 89f97ac648a9..222d5836f666 100644
--- a/arch/arm/mm/proc-sa1100.S
+++ b/arch/arm/mm/proc-sa1100.S
@@ -242,7 +242,7 @@ sa1100_crval:
.align
- .section ".proc.info.init", #alloc, #execinstr
+ .section ".proc.info.init", #alloc
.macro sa1100_proc_info name:req, cpu_val:req, cpu_mask:req, cpu_name:req
.type __\name\()_proc_info,#object
@@ -257,7 +257,7 @@ __\name\()_proc_info:
.long PMD_TYPE_SECT | \
PMD_SECT_AP_WRITE | \
PMD_SECT_AP_READ
- b __sa1100_setup
+ initfn __sa1100_setup, __\name\()_proc_info
.long cpu_arch_name
.long cpu_elf_name
.long HWCAP_SWP | HWCAP_HALF | HWCAP_26BIT | HWCAP_FAST_MULT
diff --git a/arch/arm/mm/proc-v6.S b/arch/arm/mm/proc-v6.S
index d0390f4b3f18..06d890a2342b 100644
--- a/arch/arm/mm/proc-v6.S
+++ b/arch/arm/mm/proc-v6.S
@@ -264,7 +264,7 @@ v6_crval:
string cpu_elf_name, "v6"
.align
- .section ".proc.info.init", #alloc, #execinstr
+ .section ".proc.info.init", #alloc
/*
* Match any ARMv6 processor core.
@@ -287,7 +287,7 @@ __v6_proc_info:
PMD_SECT_XN | \
PMD_SECT_AP_WRITE | \
PMD_SECT_AP_READ
- b __v6_setup
+ initfn __v6_setup, __v6_proc_info
.long cpu_arch_name
.long cpu_elf_name
/* See also feat_v6_fixup() for HWCAP_TLS */
diff --git a/arch/arm/mm/proc-v7-2level.S b/arch/arm/mm/proc-v7-2level.S
index ed448d8a596b..bc86be205c04 100644
--- a/arch/arm/mm/proc-v7-2level.S
+++ b/arch/arm/mm/proc-v7-2level.S
@@ -144,10 +144,10 @@ ENDPROC(cpu_v7_set_pte_ext)
* Macro for setting up the TTBRx and TTBCR registers.
* - \ttb0 and \ttb1 updated with the corresponding flags.
*/
- .macro v7_ttb_setup, zero, ttbr0, ttbr1, tmp
+ .macro v7_ttb_setup, zero, ttbr0l, ttbr0h, ttbr1, tmp
mcr p15, 0, \zero, c2, c0, 2 @ TTB control register
- ALT_SMP(orr \ttbr0, \ttbr0, #TTB_FLAGS_SMP)
- ALT_UP(orr \ttbr0, \ttbr0, #TTB_FLAGS_UP)
+ ALT_SMP(orr \ttbr0l, \ttbr0l, #TTB_FLAGS_SMP)
+ ALT_UP(orr \ttbr0l, \ttbr0l, #TTB_FLAGS_UP)
ALT_SMP(orr \ttbr1, \ttbr1, #TTB_FLAGS_SMP)
ALT_UP(orr \ttbr1, \ttbr1, #TTB_FLAGS_UP)
mcr p15, 0, \ttbr1, c2, c0, 1 @ load TTB1
diff --git a/arch/arm/mm/proc-v7-3level.S b/arch/arm/mm/proc-v7-3level.S
index d3daed0ae0ad..5e5720e8bc5f 100644
--- a/arch/arm/mm/proc-v7-3level.S
+++ b/arch/arm/mm/proc-v7-3level.S
@@ -126,11 +126,10 @@ ENDPROC(cpu_v7_set_pte_ext)
* Macro for setting up the TTBRx and TTBCR registers.
* - \ttbr1 updated.
*/
- .macro v7_ttb_setup, zero, ttbr0, ttbr1, tmp
+ .macro v7_ttb_setup, zero, ttbr0l, ttbr0h, ttbr1, tmp
ldr \tmp, =swapper_pg_dir @ swapper_pg_dir virtual address
- mov \tmp, \tmp, lsr #ARCH_PGD_SHIFT
- cmp \ttbr1, \tmp @ PHYS_OFFSET > PAGE_OFFSET?
- mrc p15, 0, \tmp, c2, c0, 2 @ TTB control register
+ cmp \ttbr1, \tmp, lsr #12 @ PHYS_OFFSET > PAGE_OFFSET?
+ mrc p15, 0, \tmp, c2, c0, 2 @ TTB control egister
orr \tmp, \tmp, #TTB_EAE
ALT_SMP(orr \tmp, \tmp, #TTB_FLAGS_SMP)
ALT_UP(orr \tmp, \tmp, #TTB_FLAGS_UP)
@@ -143,13 +142,10 @@ ENDPROC(cpu_v7_set_pte_ext)
*/
orrls \tmp, \tmp, #TTBR1_SIZE @ TTBCR.T1SZ
mcr p15, 0, \tmp, c2, c0, 2 @ TTBCR
- mov \tmp, \ttbr1, lsr #(32 - ARCH_PGD_SHIFT) @ upper bits
- mov \ttbr1, \ttbr1, lsl #ARCH_PGD_SHIFT @ lower bits
+ mov \tmp, \ttbr1, lsr #20
+ mov \ttbr1, \ttbr1, lsl #12
addls \ttbr1, \ttbr1, #TTBR1_OFFSET
mcrr p15, 1, \ttbr1, \tmp, c2 @ load TTBR1
- mov \tmp, \ttbr0, lsr #(32 - ARCH_PGD_SHIFT) @ upper bits
- mov \ttbr0, \ttbr0, lsl #ARCH_PGD_SHIFT @ lower bits
- mcrr p15, 0, \ttbr0, \tmp, c2 @ load TTBR0
.endm
/*
diff --git a/arch/arm/mm/proc-v7.S b/arch/arm/mm/proc-v7.S
index 22ac2a6fbfe3..984da460c758 100644
--- a/arch/arm/mm/proc-v7.S
+++ b/arch/arm/mm/proc-v7.S
@@ -328,9 +328,9 @@ __v7_setup:
and r10, r0, #0xff000000 @ ARM?
teq r10, #0x41000000
bne 3f
- and r5, r0, #0x00f00000 @ variant
+ and r3, r0, #0x00f00000 @ variant
and r6, r0, #0x0000000f @ revision
- orr r6, r6, r5, lsr #20-4 @ combine variant and revision
+ orr r6, r6, r3, lsr #20-4 @ combine variant and revision
ubfx r0, r0, #4, #12 @ primary part number
/* Cortex-A8 Errata */
@@ -339,7 +339,7 @@ __v7_setup:
bne 2f
#if defined(CONFIG_ARM_ERRATA_430973) && !defined(CONFIG_ARCH_MULTIPLATFORM)
- teq r5, #0x00100000 @ only present in r1p*
+ teq r3, #0x00100000 @ only present in r1p*
mrceq p15, 0, r10, c1, c0, 1 @ read aux control register
orreq r10, r10, #(1 << 6) @ set IBE to 1
mcreq p15, 0, r10, c1, c0, 1 @ write aux control register
@@ -380,7 +380,7 @@ __v7_setup:
mcreq p15, 0, r10, c15, c0, 1 @ write diagnostic register
#endif
#ifdef CONFIG_ARM_ERRATA_743622
- teq r5, #0x00200000 @ only present in r2p*
+ teq r3, #0x00200000 @ only present in r2p*
mrceq p15, 0, r10, c15, c0, 1 @ read diagnostic register
orreq r10, r10, #1 << 6 @ set bit #6
mcreq p15, 0, r10, c15, c0, 1 @ write diagnostic register
@@ -410,10 +410,10 @@ __v7_setup:
mcr p15, 0, r10, c7, c5, 0 @ I+BTB cache invalidate
#ifdef CONFIG_MMU
mcr p15, 0, r10, c8, c7, 0 @ invalidate I + D TLBs
- v7_ttb_setup r10, r4, r8, r5 @ TTBCR, TTBRx setup
- ldr r5, =PRRR @ PRRR
+ v7_ttb_setup r10, r4, r5, r8, r3 @ TTBCR, TTBRx setup
+ ldr r3, =PRRR @ PRRR
ldr r6, =NMRR @ NMRR
- mcr p15, 0, r5, c10, c2, 0 @ write PRRR
+ mcr p15, 0, r3, c10, c2, 0 @ write PRRR
mcr p15, 0, r6, c10, c2, 1 @ write NMRR
#endif
dsb @ Complete invalidations
@@ -422,22 +422,22 @@ __v7_setup:
and r0, r0, #(0xf << 12) @ ThumbEE enabled field
teq r0, #(1 << 12) @ check if ThumbEE is present
bne 1f
- mov r5, #0
- mcr p14, 6, r5, c1, c0, 0 @ Initialize TEEHBR to 0
+ mov r3, #0
+ mcr p14, 6, r3, c1, c0, 0 @ Initialize TEEHBR to 0
mrc p14, 6, r0, c0, c0, 0 @ load TEECR
orr r0, r0, #1 @ set the 1st bit in order to
mcr p14, 6, r0, c0, c0, 0 @ stop userspace TEEHBR access
1:
#endif
- adr r5, v7_crval
- ldmia r5, {r5, r6}
+ adr r3, v7_crval
+ ldmia r3, {r3, r6}
ARM_BE8(orr r6, r6, #1 << 25) @ big-endian page tables
#ifdef CONFIG_SWP_EMULATE
- orr r5, r5, #(1 << 10) @ set SW bit in "clear"
+ orr r3, r3, #(1 << 10) @ set SW bit in "clear"
bic r6, r6, #(1 << 10) @ clear it in "mmuset"
#endif
mrc p15, 0, r0, c1, c0, 0 @ read control register
- bic r0, r0, r5 @ clear bits them
+ bic r0, r0, r3 @ clear bits them
orr r0, r0, r6 @ set them
THUMB( orr r0, r0, #1 << 30 ) @ Thumb exceptions
ret lr @ return to head.S:__ret
@@ -462,19 +462,19 @@ __v7_setup_stack:
string cpu_elf_name, "v7"
.align
- .section ".proc.info.init", #alloc, #execinstr
+ .section ".proc.info.init", #alloc
/*
* Standard v7 proc info content
*/
-.macro __v7_proc initfunc, mm_mmuflags = 0, io_mmuflags = 0, hwcaps = 0, proc_fns = v7_processor_functions
+.macro __v7_proc name, initfunc, mm_mmuflags = 0, io_mmuflags = 0, hwcaps = 0, proc_fns = v7_processor_functions
ALT_SMP(.long PMD_TYPE_SECT | PMD_SECT_AP_WRITE | PMD_SECT_AP_READ | \
PMD_SECT_AF | PMD_FLAGS_SMP | \mm_mmuflags)
ALT_UP(.long PMD_TYPE_SECT | PMD_SECT_AP_WRITE | PMD_SECT_AP_READ | \
PMD_SECT_AF | PMD_FLAGS_UP | \mm_mmuflags)
.long PMD_TYPE_SECT | PMD_SECT_AP_WRITE | \
PMD_SECT_AP_READ | PMD_SECT_AF | \io_mmuflags
- W(b) \initfunc
+ initfn \initfunc, \name
.long cpu_arch_name
.long cpu_elf_name
.long HWCAP_SWP | HWCAP_HALF | HWCAP_THUMB | HWCAP_FAST_MULT | \
@@ -494,7 +494,7 @@ __v7_setup_stack:
__v7_ca5mp_proc_info:
.long 0x410fc050
.long 0xff0ffff0
- __v7_proc __v7_ca5mp_setup
+ __v7_proc __v7_ca5mp_proc_info, __v7_ca5mp_setup
.size __v7_ca5mp_proc_info, . - __v7_ca5mp_proc_info
/*
@@ -504,7 +504,7 @@ __v7_ca5mp_proc_info:
__v7_ca9mp_proc_info:
.long 0x410fc090
.long 0xff0ffff0
- __v7_proc __v7_ca9mp_setup, proc_fns = ca9mp_processor_functions
+ __v7_proc __v7_ca9mp_proc_info, __v7_ca9mp_setup, proc_fns = ca9mp_processor_functions
.size __v7_ca9mp_proc_info, . - __v7_ca9mp_proc_info
#endif /* CONFIG_ARM_LPAE */
@@ -517,7 +517,7 @@ __v7_ca9mp_proc_info:
__v7_pj4b_proc_info:
.long 0x560f5800
.long 0xff0fff00
- __v7_proc __v7_pj4b_setup, proc_fns = pj4b_processor_functions
+ __v7_proc __v7_pj4b_proc_info, __v7_pj4b_setup, proc_fns = pj4b_processor_functions
.size __v7_pj4b_proc_info, . - __v7_pj4b_proc_info
#endif
@@ -528,7 +528,7 @@ __v7_pj4b_proc_info:
__v7_cr7mp_proc_info:
.long 0x410fc170
.long 0xff0ffff0
- __v7_proc __v7_cr7mp_setup
+ __v7_proc __v7_cr7mp_proc_info, __v7_cr7mp_setup
.size __v7_cr7mp_proc_info, . - __v7_cr7mp_proc_info
/*
@@ -538,7 +538,7 @@ __v7_cr7mp_proc_info:
__v7_ca7mp_proc_info:
.long 0x410fc070
.long 0xff0ffff0
- __v7_proc __v7_ca7mp_setup
+ __v7_proc __v7_ca7mp_proc_info, __v7_ca7mp_setup
.size __v7_ca7mp_proc_info, . - __v7_ca7mp_proc_info
/*
@@ -548,7 +548,7 @@ __v7_ca7mp_proc_info:
__v7_ca12mp_proc_info:
.long 0x410fc0d0
.long 0xff0ffff0
- __v7_proc __v7_ca12mp_setup
+ __v7_proc __v7_ca12mp_proc_info, __v7_ca12mp_setup
.size __v7_ca12mp_proc_info, . - __v7_ca12mp_proc_info
/*
@@ -558,7 +558,7 @@ __v7_ca12mp_proc_info:
__v7_ca15mp_proc_info:
.long 0x410fc0f0
.long 0xff0ffff0
- __v7_proc __v7_ca15mp_setup
+ __v7_proc __v7_ca15mp_proc_info, __v7_ca15mp_setup
.size __v7_ca15mp_proc_info, . - __v7_ca15mp_proc_info
/*
@@ -568,7 +568,7 @@ __v7_ca15mp_proc_info:
__v7_b15mp_proc_info:
.long 0x420f00f0
.long 0xff0ffff0
- __v7_proc __v7_b15mp_setup
+ __v7_proc __v7_b15mp_proc_info, __v7_b15mp_setup
.size __v7_b15mp_proc_info, . - __v7_b15mp_proc_info
/*
@@ -578,7 +578,7 @@ __v7_b15mp_proc_info:
__v7_ca17mp_proc_info:
.long 0x410fc0e0
.long 0xff0ffff0
- __v7_proc __v7_ca17mp_setup
+ __v7_proc __v7_ca17mp_proc_info, __v7_ca17mp_setup
.size __v7_ca17mp_proc_info, . - __v7_ca17mp_proc_info
/*
@@ -591,9 +591,10 @@ __krait_proc_info:
/*
* Some Krait processors don't indicate support for SDIV and UDIV
* instructions in the ARM instruction set, even though they actually
- * do support them.
+ * do support them. They also don't indicate support for fused multiply
+ * instructions even though they actually do support them.
*/
- __v7_proc __v7_setup, hwcaps = HWCAP_IDIV
+ __v7_proc __krait_proc_info, __v7_setup, hwcaps = HWCAP_IDIV | HWCAP_VFPv4
.size __krait_proc_info, . - __krait_proc_info
/*
@@ -603,5 +604,5 @@ __krait_proc_info:
__v7_proc_info:
.long 0x000f0000 @ Required ID value
.long 0x000f0000 @ Mask for ID
- __v7_proc __v7_setup
+ __v7_proc __v7_proc_info, __v7_setup
.size __v7_proc_info, . - __v7_proc_info
diff --git a/arch/arm/mm/proc-v7m.S b/arch/arm/mm/proc-v7m.S
index d1e68b553d3b..e08e1f2bab76 100644
--- a/arch/arm/mm/proc-v7m.S
+++ b/arch/arm/mm/proc-v7m.S
@@ -135,7 +135,7 @@ __v7m_setup_stack_top:
string cpu_elf_name "v7m"
string cpu_v7m_name "ARMv7-M"
- .section ".proc.info.init", #alloc, #execinstr
+ .section ".proc.info.init", #alloc
/*
* Match any ARMv7-M processor core.
@@ -146,7 +146,7 @@ __v7m_proc_info:
.long 0x000f0000 @ Mask for ID
.long 0 @ proc_info_list.__cpu_mm_mmu_flags
.long 0 @ proc_info_list.__cpu_io_mmu_flags
- b __v7m_setup @ proc_info_list.__cpu_flush
+ initfn __v7m_setup, __v7m_proc_info @ proc_info_list.__cpu_flush
.long cpu_arch_name
.long cpu_elf_name
.long HWCAP_HALF|HWCAP_THUMB|HWCAP_FAST_MULT
diff --git a/arch/arm/mm/proc-xsc3.S b/arch/arm/mm/proc-xsc3.S
index f8acdfece036..293dcc2c441f 100644
--- a/arch/arm/mm/proc-xsc3.S
+++ b/arch/arm/mm/proc-xsc3.S
@@ -499,7 +499,7 @@ xsc3_crval:
.align
- .section ".proc.info.init", #alloc, #execinstr
+ .section ".proc.info.init", #alloc
.macro xsc3_proc_info name:req, cpu_val:req, cpu_mask:req
.type __\name\()_proc_info,#object
@@ -514,7 +514,7 @@ __\name\()_proc_info:
.long PMD_TYPE_SECT | \
PMD_SECT_AP_WRITE | \
PMD_SECT_AP_READ
- b __xsc3_setup
+ initfn __xsc3_setup, __\name\()_proc_info
.long cpu_arch_name
.long cpu_elf_name
.long HWCAP_SWP|HWCAP_HALF|HWCAP_THUMB|HWCAP_FAST_MULT|HWCAP_EDSP
diff --git a/arch/arm/mm/proc-xscale.S b/arch/arm/mm/proc-xscale.S
index afa2b3c4df4a..b6bbfdb6dfdc 100644
--- a/arch/arm/mm/proc-xscale.S
+++ b/arch/arm/mm/proc-xscale.S
@@ -612,7 +612,7 @@ xscale_crval:
.align
- .section ".proc.info.init", #alloc, #execinstr
+ .section ".proc.info.init", #alloc
.macro xscale_proc_info name:req, cpu_val:req, cpu_mask:req, cpu_name:req, cache
.type __\name\()_proc_info,#object
@@ -627,7 +627,7 @@ __\name\()_proc_info:
.long PMD_TYPE_SECT | \
PMD_SECT_AP_WRITE | \
PMD_SECT_AP_READ
- b __xscale_setup
+ initfn __xscale_setup, __\name\()_proc_info
.long cpu_arch_name
.long cpu_elf_name
.long HWCAP_SWP|HWCAP_HALF|HWCAP_THUMB|HWCAP_FAST_MULT|HWCAP_EDSP
diff --git a/arch/arm/mm/pv-fixup-asm.S b/arch/arm/mm/pv-fixup-asm.S
new file mode 100644
index 000000000000..1867f3e43016
--- /dev/null
+++ b/arch/arm/mm/pv-fixup-asm.S
@@ -0,0 +1,88 @@
+/*
+ * Copyright (C) 2015 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This assembly is required to safely remap the physical address space
+ * for Keystone 2
+ */
+#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
+#include <asm/cp15.h>
+#include <asm/memory.h>
+#include <asm/pgtable.h>
+
+ .section ".idmap.text", "ax"
+
+#define L1_ORDER 3
+#define L2_ORDER 3
+
+ENTRY(lpae_pgtables_remap_asm)
+ stmfd sp!, {r4-r8, lr}
+
+ mrc p15, 0, r8, c1, c0, 0 @ read control reg
+ bic ip, r8, #CR_M @ disable caches and MMU
+ mcr p15, 0, ip, c1, c0, 0
+ dsb
+ isb
+
+ /* Update level 2 entries covering the kernel */
+ ldr r6, =(_end - 1)
+ add r7, r2, #0x1000
+ add r6, r7, r6, lsr #SECTION_SHIFT - L2_ORDER
+ add r7, r7, #PAGE_OFFSET >> (SECTION_SHIFT - L2_ORDER)
+1: ldrd r4, [r7]
+ adds r4, r4, r0
+ adc r5, r5, r1
+ strd r4, [r7], #1 << L2_ORDER
+ cmp r7, r6
+ bls 1b
+
+ /* Update level 2 entries for the boot data */
+ add r7, r2, #0x1000
+ add r7, r7, r3, lsr #SECTION_SHIFT - L2_ORDER
+ bic r7, r7, #(1 << L2_ORDER) - 1
+ ldrd r4, [r7]
+ adds r4, r4, r0
+ adc r5, r5, r1
+ strd r4, [r7], #1 << L2_ORDER
+ ldrd r4, [r7]
+ adds r4, r4, r0
+ adc r5, r5, r1
+ strd r4, [r7]
+
+ /* Update level 1 entries */
+ mov r6, #4
+ mov r7, r2
+2: ldrd r4, [r7]
+ adds r4, r4, r0
+ adc r5, r5, r1
+ strd r4, [r7], #1 << L1_ORDER
+ subs r6, r6, #1
+ bne 2b
+
+ mrrc p15, 0, r4, r5, c2 @ read TTBR0
+ adds r4, r4, r0 @ update physical address
+ adc r5, r5, r1
+ mcrr p15, 0, r4, r5, c2 @ write back TTBR0
+ mrrc p15, 1, r4, r5, c2 @ read TTBR1
+ adds r4, r4, r0 @ update physical address
+ adc r5, r5, r1
+ mcrr p15, 1, r4, r5, c2 @ write back TTBR1
+
+ dsb
+
+ mov ip, #0
+ mcr p15, 0, ip, c7, c5, 0 @ I+BTB cache invalidate
+ mcr p15, 0, ip, c8, c7, 0 @ local_flush_tlb_all()
+ dsb
+ isb
+
+ mcr p15, 0, r8, c1, c0, 0 @ re-enable MMU
+ dsb
+ isb
+
+ ldmfd sp!, {r4-r8, pc}
+ENDPROC(lpae_pgtables_remap_asm)
diff --git a/arch/arm/nwfpe/entry.S b/arch/arm/nwfpe/entry.S
index 5d65be1f1e8a..457e82bd69d1 100644
--- a/arch/arm/nwfpe/entry.S
+++ b/arch/arm/nwfpe/entry.S
@@ -95,9 +95,10 @@ emulate:
reteq r4 @ no, return failure
next:
+ uaccess_enable r3
.Lx1: ldrt r6, [r5], #4 @ get the next instruction and
@ increment PC
-
+ uaccess_disable r3
and r2, r6, #0x0F000000 @ test for FP insns
teq r2, #0x0C000000
teqne r2, #0x0D000000
diff --git a/arch/arm/vdso/.gitignore b/arch/arm/vdso/.gitignore
new file mode 100644
index 000000000000..6b47f6e0b032
--- /dev/null
+++ b/arch/arm/vdso/.gitignore
@@ -0,0 +1,3 @@
+vdso.lds
+vdso.so.raw
+vdsomunge
diff --git a/arch/arm/vdso/Makefile b/arch/arm/vdso/Makefile
new file mode 100644
index 000000000000..59a8fa7b8a3b
--- /dev/null
+++ b/arch/arm/vdso/Makefile
@@ -0,0 +1,78 @@
+hostprogs-y := vdsomunge
+
+obj-vdso := vgettimeofday.o datapage.o
+
+# Build rules
+targets := $(obj-vdso) vdso.so vdso.so.dbg vdso.so.raw vdso.lds
+obj-vdso := $(addprefix $(obj)/, $(obj-vdso))
+
+ccflags-y := -fPIC -fno-common -fno-builtin -fno-stack-protector
+ccflags-y += -DDISABLE_BRANCH_PROFILING
+
+VDSO_LDFLAGS := -Wl,-Bsymbolic -Wl,--no-undefined -Wl,-soname=linux-vdso.so.1
+VDSO_LDFLAGS += -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
+VDSO_LDFLAGS += -nostdlib -shared
+VDSO_LDFLAGS += $(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
+VDSO_LDFLAGS += $(call cc-ldoption, -Wl$(comma)--build-id)
+VDSO_LDFLAGS += $(call cc-ldoption, -fuse-ld=bfd)
+
+obj-$(CONFIG_VDSO) += vdso.o
+extra-$(CONFIG_VDSO) += vdso.lds
+CPPFLAGS_vdso.lds += -P -C -U$(ARCH)
+
+CFLAGS_REMOVE_vdso.o = -pg
+
+# Force -O2 to avoid libgcc dependencies
+CFLAGS_REMOVE_vgettimeofday.o = -pg -Os
+CFLAGS_vgettimeofday.o = -O2
+
+# Disable gcov profiling for VDSO code
+GCOV_PROFILE := n
+
+# Force dependency
+$(obj)/vdso.o : $(obj)/vdso.so
+
+# Link rule for the .so file
+$(obj)/vdso.so.raw: $(src)/vdso.lds $(obj-vdso) FORCE
+ $(call if_changed,vdsold)
+
+$(obj)/vdso.so.dbg: $(obj)/vdso.so.raw $(obj)/vdsomunge FORCE
+ $(call if_changed,vdsomunge)
+
+# Strip rule for the .so file
+$(obj)/%.so: OBJCOPYFLAGS := -S
+$(obj)/%.so: $(obj)/%.so.dbg FORCE
+ $(call if_changed,objcopy)
+
+# Actual build commands
+quiet_cmd_vdsold = VDSO $@
+ cmd_vdsold = $(CC) $(c_flags) $(VDSO_LDFLAGS) \
+ -Wl,-T $(filter %.lds,$^) $(filter %.o,$^) -o $@
+
+quiet_cmd_vdsomunge = MUNGE $@
+ cmd_vdsomunge = $(objtree)/$(obj)/vdsomunge $< $@
+
+#
+# Install the unstripped copy of vdso.so.dbg. If our toolchain
+# supports build-id, install .build-id links as well.
+#
+# Cribbed from arch/x86/vdso/Makefile.
+#
+quiet_cmd_vdso_install = INSTALL $<
+define cmd_vdso_install
+ cp $< "$(MODLIB)/vdso/vdso.so"; \
+ if readelf -n $< | grep -q 'Build ID'; then \
+ buildid=`readelf -n $< |grep 'Build ID' |sed -e 's/^.*Build ID: \(.*\)$$/\1/'`; \
+ first=`echo $$buildid | cut -b-2`; \
+ last=`echo $$buildid | cut -b3-`; \
+ mkdir -p "$(MODLIB)/vdso/.build-id/$$first"; \
+ ln -sf "../../vdso.so" "$(MODLIB)/vdso/.build-id/$$first/$$last.debug"; \
+ fi
+endef
+
+$(MODLIB)/vdso: FORCE
+ @mkdir -p $(MODLIB)/vdso
+
+PHONY += vdso_install
+vdso_install: $(obj)/vdso.so.dbg $(MODLIB)/vdso
+ $(call cmd,vdso_install)
diff --git a/arch/arm/vdso/datapage.S b/arch/arm/vdso/datapage.S
new file mode 100644
index 000000000000..a2e60367931b
--- /dev/null
+++ b/arch/arm/vdso/datapage.S
@@ -0,0 +1,15 @@
+#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
+
+ .align 2
+.L_vdso_data_ptr:
+ .long _start - . - VDSO_DATA_SIZE
+
+ENTRY(__get_datapage)
+ .fnstart
+ adr r0, .L_vdso_data_ptr
+ ldr r1, [r0]
+ add r0, r0, r1
+ bx lr
+ .fnend
+ENDPROC(__get_datapage)
diff --git a/arch/arm64/kernel/cputable.c b/arch/arm/vdso/vdso.S
index fd3993cb060f..b2b97e3e7bab 100644
--- a/arch/arm64/kernel/cputable.c
+++ b/arch/arm/vdso/vdso.S
@@ -1,9 +1,9 @@
/*
- * arch/arm64/kernel/cputable.c
+ * Adapted from arm64 version.
*
- * Copyright (C) 2012 ARM Ltd.
+ * Copyright (C) 2012 ARM Limited
*
- * This program is free software: you can redistribute it and/or modify
+ * This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
@@ -14,20 +14,22 @@
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Will Deacon <will.deacon@arm.com>
*/
#include <linux/init.h>
+#include <linux/linkage.h>
+#include <linux/const.h>
+#include <asm/page.h>
-#include <asm/cputable.h>
+ __PAGE_ALIGNED_DATA
-extern unsigned long __cpu_setup(void);
+ .globl vdso_start, vdso_end
+ .balign PAGE_SIZE
+vdso_start:
+ .incbin "arch/arm/vdso/vdso.so"
+ .balign PAGE_SIZE
+vdso_end:
-struct cpu_info cpu_table[] = {
- {
- .cpu_id_val = 0x000f0000,
- .cpu_id_mask = 0x000f0000,
- .cpu_name = "AArch64 Processor",
- .cpu_setup = __cpu_setup,
- },
- { /* Empty */ },
-};
+ .previous
diff --git a/arch/arm/vdso/vdso.lds.S b/arch/arm/vdso/vdso.lds.S
new file mode 100644
index 000000000000..89ca89f12d23
--- /dev/null
+++ b/arch/arm/vdso/vdso.lds.S
@@ -0,0 +1,87 @@
+/*
+ * Adapted from arm64 version.
+ *
+ * GNU linker script for the VDSO library.
+ *
+ * Copyright (C) 2012 ARM Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Will Deacon <will.deacon@arm.com>
+ * Heavily based on the vDSO linker scripts for other archs.
+ */
+
+#include <linux/const.h>
+#include <asm/page.h>
+#include <asm/vdso.h>
+
+OUTPUT_FORMAT("elf32-littlearm", "elf32-bigarm", "elf32-littlearm")
+OUTPUT_ARCH(arm)
+
+SECTIONS
+{
+ PROVIDE(_start = .);
+
+ . = SIZEOF_HEADERS;
+
+ .hash : { *(.hash) } :text
+ .gnu.hash : { *(.gnu.hash) }
+ .dynsym : { *(.dynsym) }
+ .dynstr : { *(.dynstr) }
+ .gnu.version : { *(.gnu.version) }
+ .gnu.version_d : { *(.gnu.version_d) }
+ .gnu.version_r : { *(.gnu.version_r) }
+
+ .note : { *(.note.*) } :text :note
+
+
+ .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
+ .eh_frame : { KEEP (*(.eh_frame)) } :text
+
+ .dynamic : { *(.dynamic) } :text :dynamic
+
+ .rodata : { *(.rodata*) } :text
+
+ .text : { *(.text*) } :text =0xe7f001f2
+
+ .got : { *(.got) }
+ .rel.plt : { *(.rel.plt) }
+
+ /DISCARD/ : {
+ *(.note.GNU-stack)
+ *(.data .data.* .gnu.linkonce.d.* .sdata*)
+ *(.bss .sbss .dynbss .dynsbss)
+ }
+}
+
+/*
+ * We must supply the ELF program headers explicitly to get just one
+ * PT_LOAD segment, and set the flags explicitly to make segments read-only.
+ */
+PHDRS
+{
+ text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */
+ dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
+ note PT_NOTE FLAGS(4); /* PF_R */
+ eh_frame_hdr PT_GNU_EH_FRAME;
+}
+
+VERSION
+{
+ LINUX_2.6 {
+ global:
+ __vdso_clock_gettime;
+ __vdso_gettimeofday;
+ local: *;
+ };
+}
diff --git a/arch/arm/vdso/vdsomunge.c b/arch/arm/vdso/vdsomunge.c
new file mode 100644
index 000000000000..9005b07296c8
--- /dev/null
+++ b/arch/arm/vdso/vdsomunge.c
@@ -0,0 +1,201 @@
+/*
+ * Copyright 2015 Mentor Graphics Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * vdsomunge - Host program which produces a shared object
+ * architecturally specified to be usable by both soft- and hard-float
+ * programs.
+ *
+ * The Procedure Call Standard for the ARM Architecture (ARM IHI
+ * 0042E) says:
+ *
+ * 6.4.1 VFP and Base Standard Compatibility
+ *
+ * Code compiled for the VFP calling standard is compatible with
+ * the base standard (and vice-versa) if no floating-point or
+ * containerized vector arguments or results are used.
+ *
+ * And ELF for the ARM Architecture (ARM IHI 0044E) (Table 4-2) says:
+ *
+ * If both EF_ARM_ABI_FLOAT_XXXX bits are clear, conformance to the
+ * base procedure-call standard is implied.
+ *
+ * The VDSO is built with -msoft-float, as with the rest of the ARM
+ * kernel, and uses no floating point arguments or results. The build
+ * process will produce a shared object that may or may not have the
+ * EF_ARM_ABI_FLOAT_SOFT flag set (it seems to depend on the binutils
+ * version; binutils starting with 2.24 appears to set it). The
+ * EF_ARM_ABI_FLOAT_HARD flag should definitely not be set, and this
+ * program will error out if it is.
+ *
+ * If the soft-float flag is set, this program clears it. That's all
+ * it does.
+ */
+
+#define _GNU_SOURCE
+
+#include <byteswap.h>
+#include <elf.h>
+#include <errno.h>
+#include <error.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define HOST_ORDER ELFDATA2LSB
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define HOST_ORDER ELFDATA2MSB
+#endif
+
+/* Some of the ELF constants we'd like to use were added to <elf.h>
+ * relatively recently.
+ */
+#ifndef EF_ARM_EABI_VER5
+#define EF_ARM_EABI_VER5 0x05000000
+#endif
+
+#ifndef EF_ARM_ABI_FLOAT_SOFT
+#define EF_ARM_ABI_FLOAT_SOFT 0x200
+#endif
+
+#ifndef EF_ARM_ABI_FLOAT_HARD
+#define EF_ARM_ABI_FLOAT_HARD 0x400
+#endif
+
+static const char *outfile;
+
+static void cleanup(void)
+{
+ if (error_message_count > 0 && outfile != NULL)
+ unlink(outfile);
+}
+
+static Elf32_Word read_elf_word(Elf32_Word word, bool swap)
+{
+ return swap ? bswap_32(word) : word;
+}
+
+static Elf32_Half read_elf_half(Elf32_Half half, bool swap)
+{
+ return swap ? bswap_16(half) : half;
+}
+
+static void write_elf_word(Elf32_Word val, Elf32_Word *dst, bool swap)
+{
+ *dst = swap ? bswap_32(val) : val;
+}
+
+int main(int argc, char **argv)
+{
+ const Elf32_Ehdr *inhdr;
+ bool clear_soft_float;
+ const char *infile;
+ Elf32_Word e_flags;
+ const void *inbuf;
+ struct stat stat;
+ void *outbuf;
+ bool swap;
+ int outfd;
+ int infd;
+
+ atexit(cleanup);
+
+ if (argc != 3)
+ error(EXIT_FAILURE, 0, "Usage: %s [infile] [outfile]", argv[0]);
+
+ infile = argv[1];
+ outfile = argv[2];
+
+ infd = open(infile, O_RDONLY);
+ if (infd < 0)
+ error(EXIT_FAILURE, errno, "Cannot open %s", infile);
+
+ if (fstat(infd, &stat) != 0)
+ error(EXIT_FAILURE, errno, "Failed stat for %s", infile);
+
+ inbuf = mmap(NULL, stat.st_size, PROT_READ, MAP_PRIVATE, infd, 0);
+ if (inbuf == MAP_FAILED)
+ error(EXIT_FAILURE, errno, "Failed to map %s", infile);
+
+ close(infd);
+
+ inhdr = inbuf;
+
+ if (memcmp(&inhdr->e_ident, ELFMAG, SELFMAG) != 0)
+ error(EXIT_FAILURE, 0, "Not an ELF file");
+
+ if (inhdr->e_ident[EI_CLASS] != ELFCLASS32)
+ error(EXIT_FAILURE, 0, "Unsupported ELF class");
+
+ swap = inhdr->e_ident[EI_DATA] != HOST_ORDER;
+
+ if (read_elf_half(inhdr->e_type, swap) != ET_DYN)
+ error(EXIT_FAILURE, 0, "Not a shared object");
+
+ if (read_elf_half(inhdr->e_machine, swap) != EM_ARM) {
+ error(EXIT_FAILURE, 0, "Unsupported architecture %#x",
+ inhdr->e_machine);
+ }
+
+ e_flags = read_elf_word(inhdr->e_flags, swap);
+
+ if (EF_ARM_EABI_VERSION(e_flags) != EF_ARM_EABI_VER5) {
+ error(EXIT_FAILURE, 0, "Unsupported EABI version %#x",
+ EF_ARM_EABI_VERSION(e_flags));
+ }
+
+ if (e_flags & EF_ARM_ABI_FLOAT_HARD)
+ error(EXIT_FAILURE, 0,
+ "Unexpected hard-float flag set in e_flags");
+
+ clear_soft_float = !!(e_flags & EF_ARM_ABI_FLOAT_SOFT);
+
+ outfd = open(outfile, O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR);
+ if (outfd < 0)
+ error(EXIT_FAILURE, errno, "Cannot open %s", outfile);
+
+ if (ftruncate(outfd, stat.st_size) != 0)
+ error(EXIT_FAILURE, errno, "Cannot truncate %s", outfile);
+
+ outbuf = mmap(NULL, stat.st_size, PROT_READ | PROT_WRITE, MAP_SHARED,
+ outfd, 0);
+ if (outbuf == MAP_FAILED)
+ error(EXIT_FAILURE, errno, "Failed to map %s", outfile);
+
+ close(outfd);
+
+ memcpy(outbuf, inbuf, stat.st_size);
+
+ if (clear_soft_float) {
+ Elf32_Ehdr *outhdr;
+
+ outhdr = outbuf;
+ e_flags &= ~EF_ARM_ABI_FLOAT_SOFT;
+ write_elf_word(e_flags, &outhdr->e_flags, swap);
+ }
+
+ if (msync(outbuf, stat.st_size, MS_SYNC) != 0)
+ error(EXIT_FAILURE, errno, "Failed to sync %s", outfile);
+
+ return EXIT_SUCCESS;
+}
diff --git a/arch/arm/vdso/vgettimeofday.c b/arch/arm/vdso/vgettimeofday.c
new file mode 100644
index 000000000000..79214d5ff097
--- /dev/null
+++ b/arch/arm/vdso/vgettimeofday.c
@@ -0,0 +1,282 @@
+/*
+ * Copyright 2015 Mentor Graphics Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/compiler.h>
+#include <linux/hrtimer.h>
+#include <linux/time.h>
+#include <asm/arch_timer.h>
+#include <asm/barrier.h>
+#include <asm/bug.h>
+#include <asm/page.h>
+#include <asm/unistd.h>
+#include <asm/vdso_datapage.h>
+
+#ifndef CONFIG_AEABI
+#error This code depends on AEABI system call conventions
+#endif
+
+extern struct vdso_data *__get_datapage(void);
+
+static notrace u32 __vdso_read_begin(const struct vdso_data *vdata)
+{
+ u32 seq;
+repeat:
+ seq = ACCESS_ONCE(vdata->seq_count);
+ if (seq & 1) {
+ cpu_relax();
+ goto repeat;
+ }
+ return seq;
+}
+
+static notrace u32 vdso_read_begin(const struct vdso_data *vdata)
+{
+ u32 seq;
+
+ seq = __vdso_read_begin(vdata);
+
+ smp_rmb(); /* Pairs with smp_wmb in vdso_write_end */
+ return seq;
+}
+
+static notrace int vdso_read_retry(const struct vdso_data *vdata, u32 start)
+{
+ smp_rmb(); /* Pairs with smp_wmb in vdso_write_begin */
+ return vdata->seq_count != start;
+}
+
+static notrace long clock_gettime_fallback(clockid_t _clkid,
+ struct timespec *_ts)
+{
+ register struct timespec *ts asm("r1") = _ts;
+ register clockid_t clkid asm("r0") = _clkid;
+ register long ret asm ("r0");
+ register long nr asm("r7") = __NR_clock_gettime;
+
+ asm volatile(
+ " swi #0\n"
+ : "=r" (ret)
+ : "r" (clkid), "r" (ts), "r" (nr)
+ : "memory");
+
+ return ret;
+}
+
+static notrace int do_realtime_coarse(struct timespec *ts,
+ struct vdso_data *vdata)
+{
+ u32 seq;
+
+ do {
+ seq = vdso_read_begin(vdata);
+
+ ts->tv_sec = vdata->xtime_coarse_sec;
+ ts->tv_nsec = vdata->xtime_coarse_nsec;
+
+ } while (vdso_read_retry(vdata, seq));
+
+ return 0;
+}
+
+static notrace int do_monotonic_coarse(struct timespec *ts,
+ struct vdso_data *vdata)
+{
+ struct timespec tomono;
+ u32 seq;
+
+ do {
+ seq = vdso_read_begin(vdata);
+
+ ts->tv_sec = vdata->xtime_coarse_sec;
+ ts->tv_nsec = vdata->xtime_coarse_nsec;
+
+ tomono.tv_sec = vdata->wtm_clock_sec;
+ tomono.tv_nsec = vdata->wtm_clock_nsec;
+
+ } while (vdso_read_retry(vdata, seq));
+
+ ts->tv_sec += tomono.tv_sec;
+ timespec_add_ns(ts, tomono.tv_nsec);
+
+ return 0;
+}
+
+#ifdef CONFIG_ARM_ARCH_TIMER
+
+static notrace u64 get_ns(struct vdso_data *vdata)
+{
+ u64 cycle_delta;
+ u64 cycle_now;
+ u64 nsec;
+
+ cycle_now = arch_counter_get_cntvct();
+
+ cycle_delta = (cycle_now - vdata->cs_cycle_last) & vdata->cs_mask;
+
+ nsec = (cycle_delta * vdata->cs_mult) + vdata->xtime_clock_snsec;
+ nsec >>= vdata->cs_shift;
+
+ return nsec;
+}
+
+static notrace int do_realtime(struct timespec *ts, struct vdso_data *vdata)
+{
+ u64 nsecs;
+ u32 seq;
+
+ do {
+ seq = vdso_read_begin(vdata);
+
+ if (!vdata->tk_is_cntvct)
+ return -1;
+
+ ts->tv_sec = vdata->xtime_clock_sec;
+ nsecs = get_ns(vdata);
+
+ } while (vdso_read_retry(vdata, seq));
+
+ ts->tv_nsec = 0;
+ timespec_add_ns(ts, nsecs);
+
+ return 0;
+}
+
+static notrace int do_monotonic(struct timespec *ts, struct vdso_data *vdata)
+{
+ struct timespec tomono;
+ u64 nsecs;
+ u32 seq;
+
+ do {
+ seq = vdso_read_begin(vdata);
+
+ if (!vdata->tk_is_cntvct)
+ return -1;
+
+ ts->tv_sec = vdata->xtime_clock_sec;
+ nsecs = get_ns(vdata);
+
+ tomono.tv_sec = vdata->wtm_clock_sec;
+ tomono.tv_nsec = vdata->wtm_clock_nsec;
+
+ } while (vdso_read_retry(vdata, seq));
+
+ ts->tv_sec += tomono.tv_sec;
+ ts->tv_nsec = 0;
+ timespec_add_ns(ts, nsecs + tomono.tv_nsec);
+
+ return 0;
+}
+
+#else /* CONFIG_ARM_ARCH_TIMER */
+
+static notrace int do_realtime(struct timespec *ts, struct vdso_data *vdata)
+{
+ return -1;
+}
+
+static notrace int do_monotonic(struct timespec *ts, struct vdso_data *vdata)
+{
+ return -1;
+}
+
+#endif /* CONFIG_ARM_ARCH_TIMER */
+
+notrace int __vdso_clock_gettime(clockid_t clkid, struct timespec *ts)
+{
+ struct vdso_data *vdata;
+ int ret = -1;
+
+ vdata = __get_datapage();
+
+ switch (clkid) {
+ case CLOCK_REALTIME_COARSE:
+ ret = do_realtime_coarse(ts, vdata);
+ break;
+ case CLOCK_MONOTONIC_COARSE:
+ ret = do_monotonic_coarse(ts, vdata);
+ break;
+ case CLOCK_REALTIME:
+ ret = do_realtime(ts, vdata);
+ break;
+ case CLOCK_MONOTONIC:
+ ret = do_monotonic(ts, vdata);
+ break;
+ default:
+ break;
+ }
+
+ if (ret)
+ ret = clock_gettime_fallback(clkid, ts);
+
+ return ret;
+}
+
+static notrace long gettimeofday_fallback(struct timeval *_tv,
+ struct timezone *_tz)
+{
+ register struct timezone *tz asm("r1") = _tz;
+ register struct timeval *tv asm("r0") = _tv;
+ register long ret asm ("r0");
+ register long nr asm("r7") = __NR_gettimeofday;
+
+ asm volatile(
+ " swi #0\n"
+ : "=r" (ret)
+ : "r" (tv), "r" (tz), "r" (nr)
+ : "memory");
+
+ return ret;
+}
+
+notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
+{
+ struct timespec ts;
+ struct vdso_data *vdata;
+ int ret;
+
+ vdata = __get_datapage();
+
+ ret = do_realtime(&ts, vdata);
+ if (ret)
+ return gettimeofday_fallback(tv, tz);
+
+ if (tv) {
+ tv->tv_sec = ts.tv_sec;
+ tv->tv_usec = ts.tv_nsec / 1000;
+ }
+ if (tz) {
+ tz->tz_minuteswest = vdata->tz_minuteswest;
+ tz->tz_dsttime = vdata->tz_dsttime;
+ }
+
+ return ret;
+}
+
+/* Avoid unresolved references emitted by GCC */
+
+void __aeabi_unwind_cpp_pr0(void)
+{
+}
+
+void __aeabi_unwind_cpp_pr1(void)
+{
+}
+
+void __aeabi_unwind_cpp_pr2(void)
+{
+}
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 1d5e13f7a298..a11b5550bd68 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -23,7 +23,7 @@ config ARM64
select DCACHE_WORD_ACCESS
select GENERIC_ALLOCATOR
select GENERIC_CLOCKEVENTS
- select GENERIC_CLOCKEVENTS_BROADCAST if SMP
+ select GENERIC_CLOCKEVENTS_BROADCAST
select GENERIC_CPU_AUTOPROBE
select GENERIC_EARLY_IOREMAP
select GENERIC_IOMAP
@@ -37,9 +37,13 @@ config ARM64
select HANDLE_DOMAIN_IRQ
select HARDIRQS_SW_RESEND
select HAVE_ARCH_AUDITSYSCALL
+ select HAVE_ARCH_HARDENED_USERCOPY
select HAVE_ARCH_JUMP_LABEL
select HAVE_ARCH_KASAN if SPARSEMEM_VMEMMAP && !(ARM64_16K_PAGES && ARM64_VA_BITS_48)
select HAVE_ARCH_KGDB
+ select HAVE_ARCH_MMAP_RND_BITS
+ select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
+ select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_TRACEHOOK
select HAVE_BPF_JIT
select HAVE_C_RECORDMCOUNT
@@ -88,9 +92,40 @@ config ARCH_PHYS_ADDR_T_64BIT
config MMU
def_bool y
+config ARCH_MMAP_RND_BITS_MIN
+ default 14 if ARM64_64K_PAGES
+ default 16 if ARM64_16K_PAGES
+ default 18
+
+# max bits determined by the following formula:
+# VA_BITS - PAGE_SHIFT - 3
+config ARCH_MMAP_RND_BITS_MAX
+ default 19 if ARM64_VA_BITS=36
+ default 24 if ARM64_VA_BITS=39
+ default 27 if ARM64_VA_BITS=42
+ default 30 if ARM64_VA_BITS=47
+ default 29 if ARM64_VA_BITS=48 && ARM64_64K_PAGES
+ default 31 if ARM64_VA_BITS=48 && ARM64_16K_PAGES
+ default 33 if ARM64_VA_BITS=48
+ default 14 if ARM64_64K_PAGES
+ default 16 if ARM64_16K_PAGES
+ default 18
+
+config ARCH_MMAP_RND_COMPAT_BITS_MIN
+ default 7 if ARM64_64K_PAGES
+ default 9 if ARM64_16K_PAGES
+ default 11
+
+config ARCH_MMAP_RND_COMPAT_BITS_MAX
+ default 16
+
config NO_IOPORT_MAP
def_bool y if !PCI
+config ILLEGAL_POINTER_VALUE
+ hex
+ default 0xdead000000000000
+
config STACKTRACE_SUPPORT
def_bool y
@@ -131,6 +166,9 @@ config NEED_DMA_MAP_STATE
config NEED_SG_DMA_LENGTH
def_bool y
+config SMP
+ def_bool y
+
config SWIOTLB
def_bool y
@@ -143,6 +181,13 @@ config KERNEL_MODE_NEON
config FIX_EARLYCON_MEM
def_bool y
+config PGTABLE_LEVELS
+ int
+ default 2 if ARM64_64K_PAGES && ARM64_VA_BITS_42
+ default 3 if ARM64_64K_PAGES && ARM64_VA_BITS_48
+ default 3 if ARM64_4K_PAGES && ARM64_VA_BITS_39
+ default 4 if ARM64_4K_PAGES && ARM64_VA_BITS_48
+
source "init/Kconfig"
source "kernel/Kconfig.freezer"
@@ -379,34 +424,13 @@ config ARM64_VA_BITS
default 42 if ARM64_VA_BITS_42
default 48 if ARM64_VA_BITS_48
-config ARM64_PGTABLE_LEVELS
- int
- default 2 if ARM64_64K_PAGES && ARM64_VA_BITS_42
- default 3 if ARM64_64K_PAGES && ARM64_VA_BITS_48
- default 3 if ARM64_4K_PAGES && ARM64_VA_BITS_39
- default 4 if ARM64_4K_PAGES && ARM64_VA_BITS_48
-
config CPU_BIG_ENDIAN
bool "Build big-endian kernel"
help
Say Y if you plan on running a kernel in big-endian mode.
-config SMP
- bool "Symmetric Multi-Processing"
- help
- This enables support for systems with more than one CPU. If
- you say N here, the kernel will run on single and
- multiprocessor machines, but will use only one CPU of a
- multiprocessor machine. If you say Y here, the kernel will run
- on many, but not all, single processor machines. On a single
- processor machine, the kernel will run faster if you say N
- here.
-
- If you don't know what to do here, say N.
-
config SCHED_MC
bool "Multi-core scheduler support"
- depends on SMP
help
Multi-core scheduler support improves the CPU scheduler's decision
making when dealing with multi-core CPU chips at a cost of slightly
@@ -414,7 +438,6 @@ config SCHED_MC
config SCHED_SMT
bool "SMT scheduler support"
- depends on SMP
help
Improves the CPU scheduler's decision making when dealing with
MultiThreading at a cost of slightly increased overhead in some
@@ -422,14 +445,11 @@ config SCHED_SMT
config NR_CPUS
int "Maximum number of CPUs (2-64)"
- range 2 64
- depends on SMP
# These have to remain sorted largest to smallest
default "64"
config HOTPLUG_CPU
bool "Support for hot-pluggable CPUs"
- depends on SMP
help
Say Y here to experiment with turning CPUs off and on. CPUs
can be controlled through /sys/devices/system/cpu.
@@ -481,6 +501,19 @@ config ARCH_HAS_CACHE_LINE_SIZE
source "mm/Kconfig"
+config SECCOMP
+ bool "Enable seccomp to safely compute untrusted bytecode"
+ ---help---
+ This kernel feature is useful for number crunching applications
+ that may need to compute untrusted bytecode during their
+ execution. By using pipes or other transports made available to
+ the process as file descriptors supporting the read/write
+ syscalls, it's possible to isolate those applications in
+ their own address space using seccomp. Once seccomp is
+ enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
+ and the task is only allowed to execute a few safe syscalls
+ defined by each seccomp mode.
+
config XEN_DOM0
def_bool y
depends on XEN
@@ -497,20 +530,6 @@ config FORCE_MAX_ZONEORDER
default "14" if (ARM64_64K_PAGES && TRANSPARENT_HUGEPAGE)
default "11"
-config ARM64_PAN
- bool "Enable support for Privileged Access Never (PAN)"
- default y
- help
- Privileged Access Never (PAN; part of the ARMv8.1 Extensions)
- prevents the kernel or hypervisor from accessing user-space (EL0)
- memory directly.
-
- Choosing this option will cause any unprotected (not using
- copy_to_user et al) memory access to fail with a permission fault.
-
- The feature is detected at runtime, and will remain as a 'nop'
- instruction if the cpu does not implement the feature.
-
menuconfig ARMV8_DEPRECATED
bool "Emulate deprecated/obsolete ARMv8 instructions"
depends on COMPAT
@@ -576,8 +595,56 @@ config SETEND_EMULATION
be unexpected results in the applications.
If unsure, say Y
+
endif
+config ARM64_SW_TTBR0_PAN
+ bool "Emulate Privileged Access Never using TTBR0_EL1 switching"
+ help
+ Enabling this option prevents the kernel from accessing
+ user-space memory directly by pointing TTBR0_EL1 to a reserved
+ zeroed area and reserved ASID. The user access routines
+ restore the valid TTBR0_EL1 temporarily.
+
+menu "ARMv8.1 architectural features"
+
+config ARM64_PAN
+ bool "Enable support for Privileged Access Never (PAN)"
+ default y
+ help
+ Privileged Access Never (PAN; part of the ARMv8.1 Extensions)
+ prevents the kernel or hypervisor from accessing user-space (EL0)
+ memory directly.
+
+ Choosing this option will cause any unprotected (not using
+ copy_to_user et al) memory access to fail with a permission fault.
+
+ The feature is detected at runtime, and will remain as a 'nop'
+ instruction if the cpu does not implement the feature.
+
+endmenu
+
+config ARM64_UAO
+ bool "Enable support for User Access Override (UAO)"
+ default y
+ help
+ User Access Override (UAO; part of the ARMv8.2 Extensions)
+ causes the 'unprivileged' variant of the load/store instructions to
+ be overriden to be privileged.
+
+ This option changes get_user() and friends to use the 'unprivileged'
+ variant of the load/store instructions. This ensures that user-space
+ really did have access to the supplied memory. When addr_limit is
+ set to kernel memory the UAO bit will be set, allowing privileged
+ access to kernel memory.
+
+ Choosing this option will cause copy_to_user() et al to use user-space
+ memory permissions.
+
+ The feature is detected at runtime, the kernel will use the
+ regular load/store instructions if the cpu does not implement the
+ feature.
+
endmenu
menu "Boot options"
@@ -590,6 +657,23 @@ config CMDLINE
entering them here. As a minimum, you should specify the the
root device (e.g. root=/dev/nfs).
+choice
+ prompt "Kernel command line type" if CMDLINE != ""
+ default CMDLINE_FROM_BOOTLOADER
+
+config CMDLINE_FROM_BOOTLOADER
+ bool "Use bootloader kernel arguments if available"
+ help
+ Uses the command-line options passed by the boot loader. If
+ the boot loader doesn't provide any, the default kernel command
+ string provided in CMDLINE will be used.
+
+config CMDLINE_EXTEND
+ bool "Extend bootloader kernel arguments"
+ help
+ The command-line arguments provided by the boot loader will be
+ appended to the default kernel command string.
+
config CMDLINE_FORCE
bool "Always use the default kernel command string"
help
@@ -597,6 +681,7 @@ config CMDLINE_FORCE
loader passes other arguments to the kernel.
This is useful if you cannot or don't want to change the
command-line options your boot loader passes to the kernel.
+endchoice
config EFI_STUB
bool
@@ -618,6 +703,32 @@ config EFI
allow the kernel to be booted as an EFI application. This
is only useful on systems that have UEFI firmware.
+config BUILD_ARM64_APPENDED_DTB_IMAGE
+ bool "Build a concatenated Image.gz/dtb by default"
+ depends on OF
+ help
+ Enabling this option will cause a concatenated Image.gz and list of
+ DTBs to be built by default (instead of a standalone Image.gz.)
+ The image will built in arch/arm64/boot/Image.gz-dtb
+
+config BUILD_ARM64_APPENDED_DTB_IMAGE_NAMES
+ string "Default dtb names"
+ depends on BUILD_ARM64_APPENDED_DTB_IMAGE
+ help
+ Space separated list of names of dtbs to append when
+ building a concatenated Image.gz-dtb.
+
+config DMI
+ bool "Enable support for SMBIOS (DMI) tables"
+ depends on EFI
+ default y
+ help
+ This enables SMBIOS/DMI feature for systems.
+
+ This option is only useful on systems that have UEFI firmware.
+ However, even with this option, the resultant kernel should
+ continue to boot on existing non-UEFI platforms.
+
endmenu
menu "Userspace binary formats"
diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug
index 8dd3a551c170..d6285ef9b5f9 100644
--- a/arch/arm64/Kconfig.debug
+++ b/arch/arm64/Kconfig.debug
@@ -6,6 +6,18 @@ config FRAME_POINTER
bool
default y
+config ARM64_PTDUMP
+ bool "Export kernel pagetable layout to userspace via debugfs"
+ depends on DEBUG_KERNEL
+ select DEBUG_FS
+ help
+ Say Y here if you want to show the kernel pagetable layout in a
+ debugfs file. This information is only useful for kernel developers
+ who are working in architecture specific areas of the kernel.
+ It is probably not a good idea to enable this feature in a production
+ kernel.
+ If in doubt, say "N"
+
config STRICT_DEVMEM
bool "Filter access to /dev/mem"
depends on MMU
@@ -54,6 +66,29 @@ config DEBUG_SET_MODULE_RONX
against certain classes of kernel exploits.
If in doubt, say "N".
+config DEBUG_RODATA
+ bool "Make kernel text and rodata read-only"
+ help
+ If this is set, kernel text and rodata will be made read-only. This
+ is to help catch accidental or malicious attempts to change the
+ kernel's executable code. Additionally splits rodata from kernel
+ text so it can be made explicitly non-executable.
+
+ If in doubt, say Y
+
+config DEBUG_ALIGN_RODATA
+ depends on DEBUG_RODATA && !ARM64_64K_PAGES
+ bool "Align linker sections up to SECTION_SIZE"
+ help
+ If this option is enabled, sections that may potentially be marked as
+ read only or non-executable will be aligned up to the section size of
+ the kernel. This prevents sections from being split into pages and
+ avoids a potential TLB penalty. The downside is an increase in
+ alignment and potentially wasted space. Turn on this option if
+ performance is more important than memory pressure.
+
+ If in doubt, say N
+
source "drivers/hwtracing/coresight/Kconfig"
endmenu
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index fa985ad3defc..e30eaed439d9 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -18,6 +18,7 @@ GZFLAGS :=-9
KBUILD_DEFCONFIG := defconfig
KBUILD_CFLAGS += -mgeneral-regs-only
+KBUILD_CFLAGS += -fno-pic
KBUILD_CFLAGS += $(call cc-option, -mpc-relative-literal-loads)
ifeq ($(CONFIG_CPU_BIG_ENDIAN), y)
@@ -64,7 +65,12 @@ libs-y := arch/arm64/lib/ $(libs-y)
libs-$(CONFIG_EFI_STUB) += drivers/firmware/efi/libstub/
# Default target when executing plain make
+ifeq ($(CONFIG_BUILD_ARM64_APPENDED_DTB_IMAGE),y)
+KBUILD_IMAGE := Image.gz-dtb
+else
KBUILD_IMAGE := Image.gz
+endif
+
KBUILD_DTBS := dtbs
all: $(KBUILD_IMAGE) $(KBUILD_DTBS)
@@ -88,6 +94,9 @@ dtbs: prepare scripts
dtbs_install:
$(Q)$(MAKE) $(dtbinst)=$(boot)/dts
+Image-dtb Image.gz-dtb: vmlinux scripts dtbs
+ $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@
+
PHONY += vdso_install
vdso_install:
$(Q)$(MAKE) $(build)=arch/arm64/kernel/vdso $@
@@ -97,6 +106,16 @@ archclean:
$(Q)$(MAKE) $(clean)=$(boot)
$(Q)$(MAKE) $(clean)=$(boot)/dts
+# We need to generate vdso-offsets.h before compiling certain files in kernel/.
+# In order to do that, we should use the archprepare target, but we can't since
+# asm-offsets.h is included in some files used to generate vdso-offsets.h, and
+# asm-offsets.h is built in prepare0, for which archprepare is a dependency.
+# Therefore we need to generate the header after prepare0 has been made, hence
+# this hack.
+prepare: vdso_prepare
+vdso_prepare: prepare0
+ $(Q)$(MAKE) $(build)=arch/arm64/kernel/vdso include/generated/vdso-offsets.h
+
define archhelp
echo '* Image.gz - Compressed kernel image (arch/$(ARCH)/boot/Image.gz)'
echo ' Image - Uncompressed kernel image (arch/$(ARCH)/boot/Image)'
diff --git a/arch/arm64/boot/.gitignore b/arch/arm64/boot/.gitignore
index 8dab0bb6ae66..34e35209fc2e 100644
--- a/arch/arm64/boot/.gitignore
+++ b/arch/arm64/boot/.gitignore
@@ -1,2 +1,4 @@
Image
+Image-dtb
Image.gz
+Image.gz-dtb
diff --git a/arch/arm64/boot/Makefile b/arch/arm64/boot/Makefile
index 5a0e3ab854a5..aa24ed42b42b 100644
--- a/arch/arm64/boot/Makefile
+++ b/arch/arm64/boot/Makefile
@@ -14,14 +14,30 @@
# Based on the ia64 boot/Makefile.
#
+include $(srctree)/arch/arm64/boot/dts/Makefile
+
targets := Image Image.gz
+DTB_NAMES := $(subst $\",,$(CONFIG_BUILD_ARM64_APPENDED_DTB_IMAGE_NAMES))
+ifneq ($(DTB_NAMES),)
+DTB_LIST := $(addsuffix .dtb,$(DTB_NAMES))
+DTB_OBJS := $(addprefix $(obj)/dts/,$(DTB_LIST))
+else
+DTB_OBJS := $(shell find $(obj)/dts/ -name \*.dtb)
+endif
+
$(obj)/Image: vmlinux FORCE
$(call if_changed,objcopy)
+$(obj)/Image-dtb: $(obj)/Image $(DTB_OBJS) FORCE
+ $(call if_changed,cat)
+
$(obj)/Image.gz: $(obj)/Image FORCE
$(call if_changed,gzip)
+$(obj)/Image.gz-dtb: $(obj)/Image.gz $(DTB_OBJS) FORCE
+ $(call if_changed,cat)
+
install: $(obj)/Image
$(CONFIG_SHELL) $(srctree)/$(src)/install.sh $(KERNELRELEASE) \
$(obj)/Image System.map "$(INSTALL_PATH)"
diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index 6738cb24c058..b65419693e49 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -47,6 +47,22 @@ CONFIG_CMA=y
CONFIG_CMDLINE="console=ttyAMA0"
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
CONFIG_COMPAT=y
+CONFIG_PM_AUTOSLEEP=y
+CONFIG_PM_WAKELOCKS=y
+CONFIG_PM_DEBUG=y
+CONFIG_PM_ADVANCED_DEBUG=y
+CONFIG_WQ_POWER_EFFICIENT_DEFAULT=y
+CONFIG_CPU_IDLE=y
+CONFIG_ARM_CPUIDLE=y
+CONFIG_CPU_FREQ=y
+CONFIG_CPU_FREQ_STAT_DETAILS=y
+CONFIG_CPU_FREQ_GOV_POWERSAVE=y
+CONFIG_CPU_FREQ_GOV_USERSPACE=y
+CONFIG_CPU_FREQ_GOV_ONDEMAND=y
+CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y
+CONFIG_CPUFREQ_DT=y
+CONFIG_ARM_BIG_LITTLE_CPUFREQ=y
+CONFIG_ARM_SCPI_CPUFREQ=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_UNIX=y
@@ -66,6 +82,7 @@ CONFIG_DEVTMPFS_MOUNT=y
CONFIG_DMA_CMA=y
CONFIG_BLK_DEV_LOOP=y
CONFIG_VIRTIO_BLK=y
+CONFIG_SRAM=y
# CONFIG_SCSI_PROC_FS is not set
CONFIG_BLK_DEV_SD=y
# CONFIG_SCSI_LOWLEVEL is not set
@@ -99,6 +116,8 @@ CONFIG_SPI_PL022=y
CONFIG_GPIO_PL061=y
CONFIG_GPIO_XGENE=y
# CONFIG_HWMON is not set
+CONFIG_SENSORS_ARM_SCPI=y
+CONFIG_SENSORS_V2M_JUNO=y
CONFIG_REGULATOR=y
CONFIG_REGULATOR_FIXED_VOLTAGE=y
CONFIG_FB=y
@@ -122,11 +141,20 @@ CONFIG_MMC_SDHCI_PLTFM=y
CONFIG_MMC_SPI=y
CONFIG_RTC_CLASS=y
CONFIG_RTC_DRV_EFI=y
+CONFIG_RTC_DRV_PL030=y
+CONFIG_RTC_DRV_PL031=y
CONFIG_RTC_DRV_XGENE=y
CONFIG_VIRTIO_BALLOON=y
CONFIG_VIRTIO_MMIO=y
+CONFIG_COMMON_CLK_SCPI=y
+CONFIG_ARM_TIMER_SP804=y
+CONFIG_MAILBOX=y
+CONFIG_ARM_MHU=y
# CONFIG_IOMMU_SUPPORT is not set
CONFIG_PHY_XGENE=y
+CONFIG_ARM_SCPI_PROTOCOL=y
+CONFIG_DMI_SYSFS=y
+CONFIG_EFI_VARS=y
CONFIG_EXT2_FS=y
CONFIG_EXT3_FS=y
# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
@@ -147,8 +175,13 @@ CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
CONFIG_VIRTUALIZATION=y
CONFIG_KVM=y
+CONFIG_PRINTK_TIME=y
+CONFIG_DYNAMIC_DEBUG=y
CONFIG_DEBUG_INFO=y
+CONFIG_DEBUG_INFO_REDUCED=y
CONFIG_DEBUG_FS=y
+CONFIG_HEADERS_CHECK=y
+CONFIG_DEBUG_SECTION_MISMATCH=y
CONFIG_MAGIC_SYSRQ=y
CONFIG_DEBUG_KERNEL=y
CONFIG_LOCKUP_DETECTOR=y
diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h
index 4e3d4c8b50d1..fe34ae0bed91 100644
--- a/arch/arm64/include/asm/alternative.h
+++ b/arch/arm64/include/asm/alternative.h
@@ -1,8 +1,12 @@
#ifndef __ASM_ALTERNATIVE_H
#define __ASM_ALTERNATIVE_H
+#include <asm/cpufeature.h>
+#include <asm/insn.h>
+
#ifndef __ASSEMBLY__
+#include <linux/init.h>
#include <linux/kconfig.h>
#include <linux/types.h>
#include <linux/stddef.h>
@@ -16,7 +20,8 @@ struct alt_instr {
u8 alt_len; /* size of new instruction(s), <= orig_len */
};
-void apply_alternatives(void);
+void __init apply_alternatives_all(void);
+void apply_alternatives(void *start, size_t length);
void free_alternatives_memory(void);
#define ALTINSTR_ENTRY(feature) \
@@ -62,6 +67,8 @@ void free_alternatives_memory(void);
#else
+#include <asm/assembler.h>
+
.macro altinstruction_entry orig_offset alt_offset feature orig_len alt_len
.word \orig_offset - .
.word \alt_offset - .
@@ -85,34 +92,55 @@ void free_alternatives_memory(void);
.endm
/*
- * Begin an alternative code sequence.
+ * Alternative sequences
*
- * The code that follows this macro will be assembled and linked as
- * normal. There are no restrictions on this code.
+ * The code for the case where the capability is not present will be
+ * assembled and linked as normal. There are no restrictions on this
+ * code.
+ *
+ * The code for the case where the capability is present will be
+ * assembled into a special section to be used for dynamic patching.
+ * Code for that case must:
+ *
+ * 1. Be exactly the same length (in bytes) as the default code
+ * sequence.
+ *
+ * 2. Not contain a branch target that is used outside of the
+ * alternative sequence it is defined in (branches into an
+ * alternative sequence are not fixed up).
+ */
+
+/*
+ * Begin an alternative code sequence.
*/
.macro alternative_if_not cap
+ .set .Lasm_alt_mode, 0
.pushsection .altinstructions, "a"
altinstruction_entry 661f, 663f, \cap, 662f-661f, 664f-663f
.popsection
661:
.endm
+.macro alternative_if cap
+ .set .Lasm_alt_mode, 1
+ .pushsection .altinstructions, "a"
+ altinstruction_entry 663f, 661f, \cap, 664f-663f, 662f-661f
+ .popsection
+ .pushsection .altinstr_replacement, "ax"
+ .align 2 /* So GAS knows label 661 is suitably aligned */
+661:
+.endm
+
/*
- * Provide the alternative code sequence.
- *
- * The code that follows this macro is assembled into a special
- * section to be used for dynamic patching. Code that follows this
- * macro must:
- *
- * 1. Be exactly the same length (in bytes) as the default code
- * sequence.
- *
- * 2. Not contain a branch target that is used outside of the
- * alternative sequence it is defined in (branches into an
- * alternative sequence are not fixed up).
+ * Provide the other half of the alternative code sequence.
*/
.macro alternative_else
-662: .pushsection .altinstr_replacement, "ax"
+662:
+ .if .Lasm_alt_mode==0
+ .pushsection .altinstr_replacement, "ax"
+ .else
+ .popsection
+ .endif
663:
.endm
@@ -120,15 +148,97 @@ void free_alternatives_memory(void);
* Complete an alternative code sequence.
*/
.macro alternative_endif
-664: .popsection
+664:
+ .if .Lasm_alt_mode==0
+ .popsection
+ .endif
.org . - (664b-663b) + (662b-661b)
.org . - (662b-661b) + (664b-663b)
.endm
+/*
+ * Provides a trivial alternative or default sequence consisting solely
+ * of NOPs. The number of NOPs is chosen automatically to match the
+ * previous case.
+ */
+.macro alternative_else_nop_endif
+alternative_else
+ nops (662b-661b) / AARCH64_INSN_SIZE
+alternative_endif
+.endm
+
#define _ALTERNATIVE_CFG(insn1, insn2, cap, cfg, ...) \
alternative_insn insn1, insn2, cap, IS_ENABLED(cfg)
+/*
+ * Generate the assembly for UAO alternatives with exception table entries.
+ * This is complicated as there is no post-increment or pair versions of the
+ * unprivileged instructions, and USER() only works for single instructions.
+ */
+#ifdef CONFIG_ARM64_UAO
+ .macro uao_ldp l, reg1, reg2, addr, post_inc
+ alternative_if_not ARM64_HAS_UAO
+8888: ldp \reg1, \reg2, [\addr], \post_inc;
+8889: nop;
+ nop;
+ alternative_else
+ ldtr \reg1, [\addr];
+ ldtr \reg2, [\addr, #8];
+ add \addr, \addr, \post_inc;
+ alternative_endif
+
+ .section __ex_table,"a";
+ .align 3;
+ .quad 8888b,\l;
+ .quad 8889b,\l;
+ .previous;
+ .endm
+
+ .macro uao_stp l, reg1, reg2, addr, post_inc
+ alternative_if_not ARM64_HAS_UAO
+8888: stp \reg1, \reg2, [\addr], \post_inc;
+8889: nop;
+ nop;
+ alternative_else
+ sttr \reg1, [\addr];
+ sttr \reg2, [\addr, #8];
+ add \addr, \addr, \post_inc;
+ alternative_endif
+
+ .section __ex_table,"a";
+ .align 3;
+ .quad 8888b,\l;
+ .quad 8889b,\l;
+ .previous
+ .endm
+
+ .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc
+ alternative_if_not ARM64_HAS_UAO
+8888: \inst \reg, [\addr], \post_inc;
+ nop;
+ alternative_else
+ \alt_inst \reg, [\addr];
+ add \addr, \addr, \post_inc;
+ alternative_endif
+
+ .section __ex_table,"a";
+ .align 3;
+ .quad 8888b,\l;
+ .previous
+ .endm
+#else
+ .macro uao_ldp l, reg1, reg2, addr, post_inc
+ USER(\l, ldp \reg1, \reg2, [\addr], \post_inc)
+ .endm
+ .macro uao_stp l, reg1, reg2, addr, post_inc
+ USER(\l, stp \reg1, \reg2, [\addr], \post_inc)
+ .endm
+ .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc
+ USER(\l, \inst \reg, [\addr], \post_inc)
+ .endm
+#endif
+
#endif /* __ASSEMBLY__ */
/*
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index 3579988b23f9..deb27cf40254 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -1,5 +1,5 @@
/*
- * Based on arch/arm/include/asm/assembler.h
+ * Based on arch/arm/include/asm/assembler.h, arch/arm/mm/proc-macros.S
*
* Copyright (C) 1996-2000 Russell King
* Copyright (C) 2012 ARM Ltd.
@@ -23,6 +23,8 @@
#ifndef __ASM_ASSEMBLER_H
#define __ASM_ASSEMBLER_H
+#include <asm/asm-offsets.h>
+#include <asm/pgtable-hwdef.h>
#include <asm/ptrace.h>
#include <asm/thread_info.h>
@@ -49,6 +51,15 @@
msr daifclr, #2
.endm
+ .macro save_and_disable_irq, flags
+ mrs \flags, daif
+ msr daifset, #2
+ .endm
+
+ .macro restore_irq, flags
+ msr daif, \flags
+ .endm
+
/*
* Save/disable and restore interrupts.
*/
@@ -103,9 +114,16 @@
* SMP data memory barrier
*/
.macro smp_dmb, opt
-#ifdef CONFIG_SMP
dmb \opt
-#endif
+ .endm
+
+/*
+ * NOP sequence
+ */
+ .macro nops, num
+ .rept \num
+ nop
+ .endr
.endm
#define USER(l, x...) \
@@ -218,4 +236,89 @@ lr .req x30 // link register
.size __pi_##x, . - x; \
ENDPROC(x)
+/*
+ * vma_vm_mm - get mm pointer from vma pointer (vma->vm_mm)
+ */
+ .macro vma_vm_mm, rd, rn
+ ldr \rd, [\rn, #VMA_VM_MM]
+ .endm
+
+/*
+ * mmid - get context id from mm pointer (mm->context.id)
+ */
+ .macro mmid, rd, rn
+ ldr \rd, [\rn, #MM_CONTEXT_ID]
+ .endm
+
+/*
+ * dcache_line_size - get the minimum D-cache line size from the CTR register.
+ */
+ .macro dcache_line_size, reg, tmp
+ mrs \tmp, ctr_el0 // read CTR
+ ubfm \tmp, \tmp, #16, #19 // cache line size encoding
+ mov \reg, #4 // bytes per word
+ lsl \reg, \reg, \tmp // actual cache line size
+ .endm
+
+/*
+ * icache_line_size - get the minimum I-cache line size from the CTR register.
+ */
+ .macro icache_line_size, reg, tmp
+ mrs \tmp, ctr_el0 // read CTR
+ and \tmp, \tmp, #0xf // cache line size encoding
+ mov \reg, #4 // bytes per word
+ lsl \reg, \reg, \tmp // actual cache line size
+ .endm
+
+/*
+ * tcr_set_idmap_t0sz - update TCR.T0SZ so that we can load the ID map
+ */
+ .macro tcr_set_idmap_t0sz, valreg, tmpreg
+#ifndef CONFIG_ARM64_VA_BITS_48
+ ldr_l \tmpreg, idmap_t0sz
+ bfi \valreg, \tmpreg, #TCR_T0SZ_OFFSET, #TCR_TxSZ_WIDTH
+#endif
+ .endm
+
+/*
+ * Macro to perform a data cache maintenance for the interval
+ * [kaddr, kaddr + size)
+ *
+ * op: operation passed to dc instruction
+ * domain: domain used in dsb instruciton
+ * kaddr: starting virtual address of the region
+ * size: size of the region
+ * Corrupts: kaddr, size, tmp1, tmp2
+ */
+ .macro dcache_by_line_op op, domain, kaddr, size, tmp1, tmp2
+ dcache_line_size \tmp1, \tmp2
+ add \size, \kaddr, \size
+ sub \tmp2, \tmp1, #1
+ bic \kaddr, \kaddr, \tmp2
+9998: dc \op, \kaddr
+ add \kaddr, \kaddr, \tmp1
+ cmp \kaddr, \size
+ b.lo 9998b
+ dsb \domain
+ .endm
+
+/*
+ * reset_pmuserenr_el0 - reset PMUSERENR_EL0 if PMUv3 present
+ */
+ .macro reset_pmuserenr_el0, tmpreg
+ mrs \tmpreg, id_aa64dfr0_el1 // Check ID_AA64DFR0_EL1 PMUVer
+ sbfx \tmpreg, \tmpreg, #8, #4
+ cmp \tmpreg, #1 // Skip if no PMU present
+ b.lt 9000f
+ msr pmuserenr_el0, xzr // Disable PMU access from EL0
+9000:
+ .endm
+
+/*
+ * Return the current thread_info.
+ */
+ .macro get_thread_info, rd
+ mrs \rd, sp_el0
+ .endm
+
#endif /* __ASM_ASSEMBLER_H */
diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h
index d3d49531ce45..01df231d6220 100644
--- a/arch/arm64/include/asm/barrier.h
+++ b/arch/arm64/include/asm/barrier.h
@@ -20,6 +20,9 @@
#ifndef __ASSEMBLY__
+#define __nops(n) ".rept " #n "\nnop\n.endr\n"
+#define nops(n) asm volatile(__nops(n))
+
#define sev() asm volatile("sev" : : : "memory")
#define wfe() asm volatile("wfe" : : : "memory")
#define wfi() asm volatile("wfi" : : : "memory")
@@ -32,27 +35,8 @@
#define rmb() dsb(ld)
#define wmb() dsb(st)
-#ifndef CONFIG_SMP
-#define smp_mb() barrier()
-#define smp_rmb() barrier()
-#define smp_wmb() barrier()
-
-#define smp_store_release(p, v) \
-do { \
- compiletime_assert_atomic_type(*p); \
- barrier(); \
- ACCESS_ONCE(*p) = (v); \
-} while (0)
-
-#define smp_load_acquire(p) \
-({ \
- typeof(*p) ___p1 = ACCESS_ONCE(*p); \
- compiletime_assert_atomic_type(*p); \
- barrier(); \
- ___p1; \
-})
-
-#else
+#define dma_rmb() dmb(oshld)
+#define dma_wmb() dmb(oshst)
#define smp_mb() dmb(ish)
#define smp_rmb() dmb(ishld)
@@ -96,8 +80,6 @@ do { \
___p1; \
})
-#endif
-
#define read_barrier_depends() do { } while(0)
#define smp_read_barrier_depends() do { } while(0)
diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h
index 689b6379188c..5d46c694e097 100644
--- a/arch/arm64/include/asm/cacheflush.h
+++ b/arch/arm64/include/asm/cacheflush.h
@@ -40,10 +40,6 @@
* the implementation assumes non-aliasing VIPT D-cache and (aliasing)
* VIPT or ASID-tagged VIVT I-cache.
*
- * flush_cache_all()
- *
- * Unconditionally clean and invalidate the entire cache.
- *
* flush_cache_mm(mm)
*
* Clean and invalidate all user space cache entries
@@ -69,7 +65,6 @@
* - kaddr - page address
* - size - region size
*/
-extern void flush_cache_all(void);
extern void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end);
extern void flush_icache_range(unsigned long start, unsigned long end);
extern void __flush_dcache_area(void *addr, size_t len);
@@ -120,6 +115,13 @@ extern void copy_to_user_page(struct vm_area_struct *, struct page *,
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
extern void flush_dcache_page(struct page *);
+static inline void __local_flush_icache_all(void)
+{
+ asm("ic iallu");
+ dsb(nsh);
+ isb();
+}
+
static inline void __flush_icache_all(void)
{
asm("ic ialluis");
@@ -152,4 +154,5 @@ int set_memory_ro(unsigned long addr, int numpages);
int set_memory_rw(unsigned long addr, int numpages);
int set_memory_x(unsigned long addr, int numpages);
int set_memory_nx(unsigned long addr, int numpages);
+
#endif
diff --git a/arch/arm64/include/asm/compat.h b/arch/arm64/include/asm/compat.h
index 56de5aadede2..3fb053fa6e98 100644
--- a/arch/arm64/include/asm/compat.h
+++ b/arch/arm64/include/asm/compat.h
@@ -205,6 +205,13 @@ typedef struct compat_siginfo {
compat_long_t _band; /* POLL_IN, POLL_OUT, POLL_MSG */
int _fd;
} _sigpoll;
+
+ /* SIGSYS */
+ struct {
+ compat_uptr_t _call_addr; /* calling user insn */
+ int _syscall; /* triggering system call number */
+ compat_uint_t _arch; /* AUDIT_ARCH_* of syscall */
+ } _sigsys;
} _sifields;
} compat_siginfo_t;
diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h
index 056443086019..13a6103130cd 100644
--- a/arch/arm64/include/asm/cpu.h
+++ b/arch/arm64/include/asm/cpu.h
@@ -30,13 +30,17 @@ struct cpuinfo_arm64 {
u32 reg_dczid;
u32 reg_midr;
+ u64 reg_id_aa64dfr0;
+ u64 reg_id_aa64dfr1;
u64 reg_id_aa64isar0;
u64 reg_id_aa64isar1;
u64 reg_id_aa64mmfr0;
u64 reg_id_aa64mmfr1;
+ u64 reg_id_aa64mmfr2;
u64 reg_id_aa64pfr0;
u64 reg_id_aa64pfr1;
+ u32 reg_id_dfr0;
u32 reg_id_isar0;
u32 reg_id_isar1;
u32 reg_id_isar2;
@@ -49,6 +53,10 @@ struct cpuinfo_arm64 {
u32 reg_id_mmfr3;
u32 reg_id_pfr0;
u32 reg_id_pfr1;
+
+ u32 reg_mvfr0;
+ u32 reg_mvfr1;
+ u32 reg_mvfr2;
};
DECLARE_PER_CPU(struct cpuinfo_arm64, cpu_data);
@@ -56,4 +64,8 @@ DECLARE_PER_CPU(struct cpuinfo_arm64, cpu_data);
void cpuinfo_store_cpu(void);
void __init cpuinfo_store_boot_cpu(void);
+void __init init_cpu_features(struct cpuinfo_arm64 *info);
+void update_cpu_features(int cpu, struct cpuinfo_arm64 *info,
+ struct cpuinfo_arm64 *boot);
+
#endif /* __ASM_CPU_H */
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index d71140b76773..a0789bfc4ac6 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -10,6 +10,7 @@
#define __ASM_CPUFEATURE_H
#include <asm/hwcap.h>
+#include <asm/sysreg.h>
/*
* In the arm64 world (as in the ARM world), elf_hwcap is used both internally
@@ -26,16 +27,55 @@
#define ARM64_WORKAROUND_845719 2
#define ARM64_HAS_SYSREG_GIC_CPUIF 3
#define ARM64_HAS_PAN 4
+#define ARM64_HAS_UAO 5
+#define ARM64_ALT_PAN_NOT_UAO 6
-#define ARM64_NCAPS 5
+#define ARM64_NCAPS 7
#ifndef __ASSEMBLY__
+#include <linux/kernel.h>
+
+/* CPU feature register tracking */
+enum ftr_type {
+ FTR_EXACT, /* Use a predefined safe value */
+ FTR_LOWER_SAFE, /* Smaller value is safe */
+ FTR_HIGHER_SAFE,/* Bigger value is safe */
+};
+
+#define FTR_STRICT true /* SANITY check strict matching required */
+#define FTR_NONSTRICT false /* SANITY check ignored */
+
+#define FTR_SIGNED true /* Value should be treated as signed */
+#define FTR_UNSIGNED false /* Value should be treated as unsigned */
+
+struct arm64_ftr_bits {
+ bool sign; /* Value is signed ? */
+ bool strict; /* CPU Sanity check: strict matching required ? */
+ enum ftr_type type;
+ u8 shift;
+ u8 width;
+ s64 safe_val; /* safe value for discrete features */
+};
+
+/*
+ * @arm64_ftr_reg - Feature register
+ * @strict_mask Bits which should match across all CPUs for sanity.
+ * @sys_val Safe value across the CPUs (system view)
+ */
+struct arm64_ftr_reg {
+ u32 sys_id;
+ const char *name;
+ u64 strict_mask;
+ u64 sys_val;
+ struct arm64_ftr_bits *ftr_bits;
+};
+
struct arm64_cpu_capabilities {
const char *desc;
u16 capability;
bool (*matches)(const struct arm64_cpu_capabilities *);
- void (*enable)(void);
+ void (*enable)(void *); /* Called on all active CPUs */
union {
struct { /* To be used for erratum handling only */
u32 midr_model;
@@ -43,8 +83,11 @@ struct arm64_cpu_capabilities {
};
struct { /* Feature register checking */
+ u32 sys_reg;
int field_pos;
int min_field_value;
+ int hwcap_type;
+ unsigned long hwcap;
};
};
};
@@ -72,19 +115,79 @@ static inline void cpus_set_cap(unsigned int num)
__set_bit(num, cpu_hwcaps);
}
-static inline int __attribute_const__ cpuid_feature_extract_field(u64 features,
- int field)
+static inline int __attribute_const__
+cpuid_feature_extract_field_width(u64 features, int field, int width)
{
- return (s64)(features << (64 - 4 - field)) >> (64 - 4);
+ return (s64)(features << (64 - width - field)) >> (64 - width);
}
+static inline int __attribute_const__
+cpuid_feature_extract_field(u64 features, int field)
+{
+ return cpuid_feature_extract_field_width(features, field, 4);
+}
+
+static inline unsigned int __attribute_const__
+cpuid_feature_extract_unsigned_field_width(u64 features, int field, int width)
+{
+ return (u64)(features << (64 - width - field)) >> (64 - width);
+}
+
+static inline unsigned int __attribute_const__
+cpuid_feature_extract_unsigned_field(u64 features, int field)
+{
+ return cpuid_feature_extract_unsigned_field_width(features, field, 4);
+}
+
+static inline u64 arm64_ftr_mask(struct arm64_ftr_bits *ftrp)
+{
+ return (u64)GENMASK(ftrp->shift + ftrp->width - 1, ftrp->shift);
+}
-void check_cpu_capabilities(const struct arm64_cpu_capabilities *caps,
+static inline s64 arm64_ftr_value(struct arm64_ftr_bits *ftrp, u64 val)
+{
+ return ftrp->sign ?
+ cpuid_feature_extract_field_width(val, ftrp->shift, ftrp->width) :
+ cpuid_feature_extract_unsigned_field_width(val, ftrp->shift, ftrp->width);
+}
+
+static inline bool id_aa64mmfr0_mixed_endian_el0(u64 mmfr0)
+{
+ return cpuid_feature_extract_field(mmfr0, ID_AA64MMFR0_BIGENDEL_SHIFT) == 0x1 ||
+ cpuid_feature_extract_field(mmfr0, ID_AA64MMFR0_BIGENDEL0_SHIFT) == 0x1;
+}
+
+void __init setup_cpu_features(void);
+
+void update_cpu_capabilities(const struct arm64_cpu_capabilities *caps,
const char *info);
void check_local_cpu_errata(void);
-void check_local_cpu_features(void);
-bool cpu_supports_mixed_endian_el0(void);
-bool system_supports_mixed_endian_el0(void);
+
+#ifdef CONFIG_HOTPLUG_CPU
+void verify_local_cpu_capabilities(void);
+#else
+static inline void verify_local_cpu_capabilities(void)
+{
+}
+#endif
+
+u64 read_system_reg(u32 id);
+
+static inline bool cpu_supports_mixed_endian_el0(void)
+{
+ return id_aa64mmfr0_mixed_endian_el0(read_cpuid(SYS_ID_AA64MMFR0_EL1));
+}
+
+static inline bool system_supports_mixed_endian_el0(void)
+{
+ return id_aa64mmfr0_mixed_endian_el0(read_system_reg(SYS_ID_AA64MMFR0_EL1));
+}
+
+static inline bool system_uses_ttbr0_pan(void)
+{
+ return IS_ENABLED(CONFIG_ARM64_SW_TTBR0_PAN) &&
+ !cpus_have_cap(ARM64_HAS_PAN);
+}
#endif /* __ASSEMBLY__ */
diff --git a/arch/arm64/include/asm/cputable.h b/arch/arm64/include/asm/cputable.h
deleted file mode 100644
index e3bd983d3661..000000000000
--- a/arch/arm64/include/asm/cputable.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * arch/arm64/include/asm/cputable.h
- *
- * Copyright (C) 2012 ARM Ltd.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-#ifndef __ASM_CPUTABLE_H
-#define __ASM_CPUTABLE_H
-
-struct cpu_info {
- unsigned int cpu_id_val;
- unsigned int cpu_id_mask;
- const char *cpu_name;
- unsigned long (*cpu_setup)(void);
-};
-
-extern struct cpu_info *lookup_processor_type(unsigned int);
-
-#endif
diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h
index ee6403df9fe4..29dd9c7bc721 100644
--- a/arch/arm64/include/asm/cputype.h
+++ b/arch/arm64/include/asm/cputype.h
@@ -32,12 +32,6 @@
#define MPIDR_AFFINITY_LEVEL(mpidr, level) \
((mpidr >> MPIDR_LEVEL_SHIFT(level)) & MPIDR_LEVEL_MASK)
-#define read_cpuid(reg) ({ \
- u64 __val; \
- asm("mrs %0, " #reg : "=r" (__val)); \
- __val; \
-})
-
#define MIDR_REVISION_MASK 0xf
#define MIDR_REVISION(midr) ((midr) & MIDR_REVISION_MASK)
#define MIDR_PARTNUM_SHIFT 4
@@ -72,17 +66,16 @@
#define APM_CPU_PART_POTENZA 0x000
-#define ID_AA64MMFR0_BIGENDEL0_SHIFT 16
-#define ID_AA64MMFR0_BIGENDEL0_MASK (0xf << ID_AA64MMFR0_BIGENDEL0_SHIFT)
-#define ID_AA64MMFR0_BIGENDEL0(mmfr0) \
- (((mmfr0) & ID_AA64MMFR0_BIGENDEL0_MASK) >> ID_AA64MMFR0_BIGENDEL0_SHIFT)
-#define ID_AA64MMFR0_BIGEND_SHIFT 8
-#define ID_AA64MMFR0_BIGEND_MASK (0xf << ID_AA64MMFR0_BIGEND_SHIFT)
-#define ID_AA64MMFR0_BIGEND(mmfr0) \
- (((mmfr0) & ID_AA64MMFR0_BIGEND_MASK) >> ID_AA64MMFR0_BIGEND_SHIFT)
-
#ifndef __ASSEMBLY__
+#include <asm/sysreg.h>
+
+#define read_cpuid(reg) ({ \
+ u64 __val; \
+ asm("mrs_s %0, " __stringify(reg) : "=r" (__val)); \
+ __val; \
+})
+
/*
* The CPU ID never changes at run time, so we might as well tell the
* compiler that it's constant. Use this function to read the CPU ID
@@ -90,12 +83,12 @@
*/
static inline u32 __attribute_const__ read_cpuid_id(void)
{
- return read_cpuid(MIDR_EL1);
+ return read_cpuid(SYS_MIDR_EL1);
}
static inline u64 __attribute_const__ read_cpuid_mpidr(void)
{
- return read_cpuid(MPIDR_EL1);
+ return read_cpuid(SYS_MPIDR_EL1);
}
static inline unsigned int __attribute_const__ read_cpuid_implementor(void)
@@ -110,13 +103,7 @@ static inline unsigned int __attribute_const__ read_cpuid_part_number(void)
static inline u32 __attribute_const__ read_cpuid_cachetype(void)
{
- return read_cpuid(CTR_EL0);
-}
-
-static inline bool id_aa64mmfr0_mixed_endian_el0(u64 mmfr0)
-{
- return (ID_AA64MMFR0_BIGEND(mmfr0) == 0x1) ||
- (ID_AA64MMFR0_BIGENDEL0(mmfr0) == 0x1);
+ return read_cpuid(SYS_CTR_EL0);
}
#endif /* __ASSEMBLY__ */
diff --git a/arch/arm64/include/asm/dmi.h b/arch/arm64/include/asm/dmi.h
new file mode 100644
index 000000000000..69d37d87b159
--- /dev/null
+++ b/arch/arm64/include/asm/dmi.h
@@ -0,0 +1,31 @@
+/*
+ * arch/arm64/include/asm/dmi.h
+ *
+ * Copyright (C) 2013 Linaro Limited.
+ * Written by: Yi Li (yi.li@linaro.org)
+ *
+ * based on arch/ia64/include/asm/dmi.h
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+
+#ifndef __ASM_DMI_H
+#define __ASM_DMI_H
+
+#include <linux/io.h>
+#include <linux/slab.h>
+
+/*
+ * According to section 2.3.6 of the UEFI spec, the firmware should not
+ * request a virtual mapping for configuration tables such as SMBIOS.
+ * This means we have to map them before use.
+ */
+#define dmi_early_remap(x, l) ioremap_cache(x, l)
+#define dmi_early_unmap(x, l) iounmap(x)
+#define dmi_remap(x, l) ioremap_cache(x, l)
+#define dmi_unmap(x) iounmap(x)
+#define dmi_alloc(l) kzalloc(l, GFP_KERNEL)
+
+#endif
diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h
index a34fd3b12e2b..180fe12bb334 100644
--- a/arch/arm64/include/asm/efi.h
+++ b/arch/arm64/include/asm/efi.h
@@ -1,34 +1,39 @@
#ifndef _ASM_EFI_H
#define _ASM_EFI_H
+#include <asm/cpufeature.h>
#include <asm/io.h>
#include <asm/neon.h>
#ifdef CONFIG_EFI
extern void efi_init(void);
-extern void efi_idmap_init(void);
#else
#define efi_init()
-#define efi_idmap_init()
#endif
#define efi_call_virt(f, ...) \
({ \
- efi_##f##_t *__f = efi.systab->runtime->f; \
+ efi_##f##_t *__f; \
efi_status_t __s; \
\
kernel_neon_begin(); \
+ efi_virtmap_load(); \
+ __f = efi.systab->runtime->f; \
__s = __f(__VA_ARGS__); \
+ efi_virtmap_unload(); \
kernel_neon_end(); \
__s; \
})
#define __efi_call_virt(f, ...) \
({ \
- efi_##f##_t *__f = efi.systab->runtime->f; \
+ efi_##f##_t *__f; \
\
kernel_neon_begin(); \
+ efi_virtmap_load(); \
+ __f = efi.systab->runtime->f; \
__f(__VA_ARGS__); \
+ efi_virtmap_unload(); \
kernel_neon_end(); \
})
@@ -44,4 +49,22 @@ extern void efi_idmap_init(void);
#define efi_call_early(f, ...) sys_table_arg->boottime->f(__VA_ARGS__)
+#define EFI_ALLOC_ALIGN SZ_64K
+
+/*
+ * On ARM systems, virtually remapped UEFI runtime services are set up in two
+ * distinct stages:
+ * - The stub retrieves the final version of the memory map from UEFI, populates
+ * the virt_addr fields and calls the SetVirtualAddressMap() [SVAM] runtime
+ * service to communicate the new mapping to the firmware (Note that the new
+ * mapping is not live at this time)
+ * - During an early initcall(), the EFI system table is permanently remapped
+ * and the virtual remapping of the UEFI Runtime Services regions is loaded
+ * into a private set of page tables. If this all succeeds, the Runtime
+ * Services are enabled and the EFI_RUNTIME_SERVICES bit set.
+ */
+
+void efi_virtmap_load(void);
+void efi_virtmap_unload(void);
+
#endif /* _ASM_EFI_H */
diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h
index 72674f4c3871..7cff572082d0 100644
--- a/arch/arm64/include/asm/esr.h
+++ b/arch/arm64/include/asm/esr.h
@@ -54,4 +54,90 @@
#define ESR_EL1_EC_BKPT32 (0x38)
#define ESR_EL1_EC_BRK64 (0x3C)
+#define ESR_ELx_EC_UNKNOWN (0x00)
+#define ESR_ELx_EC_WFx (0x01)
+/* Unallocated EC: 0x02 */
+#define ESR_ELx_EC_CP15_32 (0x03)
+#define ESR_ELx_EC_CP15_64 (0x04)
+#define ESR_ELx_EC_CP14_MR (0x05)
+#define ESR_ELx_EC_CP14_LS (0x06)
+#define ESR_ELx_EC_FP_ASIMD (0x07)
+#define ESR_ELx_EC_CP10_ID (0x08)
+/* Unallocated EC: 0x09 - 0x0B */
+#define ESR_ELx_EC_CP14_64 (0x0C)
+/* Unallocated EC: 0x0d */
+#define ESR_ELx_EC_ILL (0x0E)
+/* Unallocated EC: 0x0F - 0x10 */
+#define ESR_ELx_EC_SVC32 (0x11)
+#define ESR_ELx_EC_HVC32 (0x12)
+#define ESR_ELx_EC_SMC32 (0x13)
+/* Unallocated EC: 0x14 */
+#define ESR_ELx_EC_SVC64 (0x15)
+#define ESR_ELx_EC_HVC64 (0x16)
+#define ESR_ELx_EC_SMC64 (0x17)
+#define ESR_ELx_EC_SYS64 (0x18)
+/* Unallocated EC: 0x19 - 0x1E */
+#define ESR_ELx_EC_IMP_DEF (0x1f)
+#define ESR_ELx_EC_IABT_LOW (0x20)
+#define ESR_ELx_EC_IABT_CUR (0x21)
+#define ESR_ELx_EC_PC_ALIGN (0x22)
+/* Unallocated EC: 0x23 */
+#define ESR_ELx_EC_DABT_LOW (0x24)
+#define ESR_ELx_EC_DABT_CUR (0x25)
+#define ESR_ELx_EC_SP_ALIGN (0x26)
+/* Unallocated EC: 0x27 */
+#define ESR_ELx_EC_FP_EXC32 (0x28)
+/* Unallocated EC: 0x29 - 0x2B */
+#define ESR_ELx_EC_FP_EXC64 (0x2C)
+/* Unallocated EC: 0x2D - 0x2E */
+#define ESR_ELx_EC_SERROR (0x2F)
+#define ESR_ELx_EC_BREAKPT_LOW (0x30)
+#define ESR_ELx_EC_BREAKPT_CUR (0x31)
+#define ESR_ELx_EC_SOFTSTP_LOW (0x32)
+#define ESR_ELx_EC_SOFTSTP_CUR (0x33)
+#define ESR_ELx_EC_WATCHPT_LOW (0x34)
+#define ESR_ELx_EC_WATCHPT_CUR (0x35)
+/* Unallocated EC: 0x36 - 0x37 */
+#define ESR_ELx_EC_BKPT32 (0x38)
+/* Unallocated EC: 0x39 */
+#define ESR_ELx_EC_VECTOR32 (0x3A)
+/* Unallocted EC: 0x3B */
+#define ESR_ELx_EC_BRK64 (0x3C)
+/* Unallocated EC: 0x3D - 0x3F */
+#define ESR_ELx_EC_MAX (0x3F)
+
+#define ESR_ELx_EC_SHIFT (26)
+#define ESR_ELx_EC_MASK (UL(0x3F) << ESR_ELx_EC_SHIFT)
+#define ESR_ELx_EC(esr) (((esr) & ESR_ELx_EC_MASK) >> ESR_ELx_EC_SHIFT)
+
+#define ESR_ELx_IL (UL(1) << 25)
+#define ESR_ELx_ISS_MASK (ESR_ELx_IL - 1)
+#define ESR_ELx_ISV (UL(1) << 24)
+#define ESR_ELx_SAS_SHIFT (22)
+#define ESR_ELx_SAS (UL(3) << ESR_ELx_SAS_SHIFT)
+#define ESR_ELx_SSE (UL(1) << 21)
+#define ESR_ELx_SRT_SHIFT (16)
+#define ESR_ELx_SRT_MASK (UL(0x1F) << ESR_ELx_SRT_SHIFT)
+#define ESR_ELx_SF (UL(1) << 15)
+#define ESR_ELx_AR (UL(1) << 14)
+#define ESR_ELx_EA (UL(1) << 9)
+#define ESR_ELx_CM (UL(1) << 8)
+#define ESR_ELx_S1PTW (UL(1) << 7)
+#define ESR_ELx_WNR (UL(1) << 6)
+#define ESR_ELx_FSC (0x3F)
+#define ESR_ELx_FSC_TYPE (0x3C)
+#define ESR_ELx_FSC_EXTABT (0x10)
+#define ESR_ELx_FSC_FAULT (0x04)
+#define ESR_ELx_FSC_PERM (0x0C)
+#define ESR_ELx_CV (UL(1) << 24)
+#define ESR_ELx_COND_SHIFT (20)
+#define ESR_ELx_COND_MASK (UL(0xF) << ESR_ELx_COND_SHIFT)
+#define ESR_ELx_WFx_ISS_WFE (UL(1) << 0)
+
+#ifndef __ASSEMBLY__
+#include <asm/types.h>
+
+const char *esr_get_class_string(u32 esr);
+#endif /* __ASSEMBLY */
+
#endif /* __ASM_ESR_H */
diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h
index 5f7bfe6df723..defa0ff98250 100644
--- a/arch/arm64/include/asm/fixmap.h
+++ b/arch/arm64/include/asm/fixmap.h
@@ -31,6 +31,7 @@
*
*/
enum fixed_addresses {
+ FIX_HOLE,
FIX_EARLYCON_MEM_BASE,
__end_of_permanent_fixed_addresses,
@@ -48,6 +49,7 @@ enum fixed_addresses {
FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1,
+ FIX_TEXT_POKE0,
__end_of_fixed_addresses
};
@@ -56,10 +58,11 @@ enum fixed_addresses {
#define FIXMAP_PAGE_IO __pgprot(PROT_DEVICE_nGnRE)
-extern void __early_set_fixmap(enum fixed_addresses idx,
- phys_addr_t phys, pgprot_t flags);
+void __init early_fixmap_init(void);
-#define __set_fixmap __early_set_fixmap
+#define __early_set_fixmap __set_fixmap
+
+extern void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot);
#include <asm-generic/fixmap.h>
diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
index 667346273d9b..e8272c475bb5 100644
--- a/arch/arm64/include/asm/futex.h
+++ b/arch/arm64/include/asm/futex.h
@@ -21,15 +21,12 @@
#include <linux/futex.h>
#include <linux/uaccess.h>
-#include <asm/alternative.h>
-#include <asm/cpufeature.h>
#include <asm/errno.h>
-#include <asm/sysreg.h>
#define __futex_atomic_op(insn, ret, oldval, uaddr, tmp, oparg) \
+do { \
+ uaccess_enable(); \
asm volatile( \
- ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \
- CONFIG_ARM64_PAN) \
"1: ldxr %w1, %2\n" \
insn "\n" \
"2: stlxr %w3, %w0, %2\n" \
@@ -45,11 +42,11 @@
" .align 3\n" \
" .quad 1b, 4b, 2b, 4b\n" \
" .popsection\n" \
- ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \
- CONFIG_ARM64_PAN) \
: "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp) \
: "r" (oparg), "Ir" (-EFAULT) \
- : "memory")
+ : "memory"); \
+ uaccess_disable(); \
+} while (0)
static inline int
futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
@@ -119,6 +116,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
return -EFAULT;
+ uaccess_enable();
asm volatile("// futex_atomic_cmpxchg_inatomic\n"
"1: ldxr %w1, %2\n"
" sub %w3, %w1, %w4\n"
@@ -138,6 +136,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
: "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp)
: "r" (oldval), "r" (newval), "Ir" (-EFAULT)
: "memory");
+ uaccess_disable();
*uval = val;
return ret;
diff --git a/arch/arm64/include/asm/hardirq.h b/arch/arm64/include/asm/hardirq.h
index e8a3268a891c..0af4cdb4b5a9 100644
--- a/arch/arm64/include/asm/hardirq.h
+++ b/arch/arm64/include/asm/hardirq.h
@@ -24,9 +24,7 @@
typedef struct {
unsigned int __softirq_pending;
-#ifdef CONFIG_SMP
unsigned int ipi_irqs[NR_IPI];
-#endif
} ____cacheline_aligned irq_cpustat_t;
#include <linux/irq_cpustat.h> /* Standard mappings for irq_cpustat_t above */
@@ -34,10 +32,8 @@ typedef struct {
#define __inc_irq_stat(cpu, member) __IRQ_STAT(cpu, member)++
#define __get_irq_stat(cpu, member) __IRQ_STAT(cpu, member)
-#ifdef CONFIG_SMP
u64 smp_irq_stat_cpu(unsigned int cpu);
#define arch_irq_stat_cpu smp_irq_stat_cpu
-#endif
#define __ARCH_IRQ_EXIT_IRQS_DISABLED 1
diff --git a/arch/arm64/include/asm/hw_breakpoint.h b/arch/arm64/include/asm/hw_breakpoint.h
index 52b484b6aa1a..77667c30a9ad 100644
--- a/arch/arm64/include/asm/hw_breakpoint.h
+++ b/arch/arm64/include/asm/hw_breakpoint.h
@@ -65,7 +65,11 @@ static inline void decode_ctrl_reg(u32 reg,
/* Lengths */
#define ARM_BREAKPOINT_LEN_1 0x1
#define ARM_BREAKPOINT_LEN_2 0x3
+#define ARM_BREAKPOINT_LEN_3 0x7
#define ARM_BREAKPOINT_LEN_4 0xf
+#define ARM_BREAKPOINT_LEN_5 0x1f
+#define ARM_BREAKPOINT_LEN_6 0x3f
+#define ARM_BREAKPOINT_LEN_7 0x7f
#define ARM_BREAKPOINT_LEN_8 0xff
/* Kernel stepping */
@@ -107,7 +111,7 @@ struct perf_event;
struct pmu;
extern int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl,
- int *gen_len, int *gen_type);
+ int *gen_len, int *gen_type, int *offset);
extern int arch_check_bp_in_kernelspace(struct perf_event *bp);
extern int arch_validate_hwbkpt_settings(struct perf_event *bp);
extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused,
diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h
index 0ad735166d9f..400b80b49595 100644
--- a/arch/arm64/include/asm/hwcap.h
+++ b/arch/arm64/include/asm/hwcap.h
@@ -52,6 +52,14 @@
extern unsigned int compat_elf_hwcap, compat_elf_hwcap2;
#endif
+enum {
+ CAP_HWCAP = 1,
+#ifdef CONFIG_COMPAT
+ CAP_COMPAT_HWCAP,
+ CAP_COMPAT_HWCAP2,
+#endif
+};
+
extern unsigned long elf_hwcap;
#endif
#endif
diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h
index e2ff32a93b5c..30e50eb54a67 100644
--- a/arch/arm64/include/asm/insn.h
+++ b/arch/arm64/include/asm/insn.h
@@ -264,8 +264,10 @@ __AARCH64_INSN_FUNCS(ands, 0x7F200000, 0x6A000000)
__AARCH64_INSN_FUNCS(bics, 0x7F200000, 0x6A200000)
__AARCH64_INSN_FUNCS(b, 0xFC000000, 0x14000000)
__AARCH64_INSN_FUNCS(bl, 0xFC000000, 0x94000000)
-__AARCH64_INSN_FUNCS(cbz, 0xFE000000, 0x34000000)
-__AARCH64_INSN_FUNCS(cbnz, 0xFE000000, 0x35000000)
+__AARCH64_INSN_FUNCS(cbz, 0x7F000000, 0x34000000)
+__AARCH64_INSN_FUNCS(cbnz, 0x7F000000, 0x35000000)
+__AARCH64_INSN_FUNCS(tbz, 0x7F000000, 0x36000000)
+__AARCH64_INSN_FUNCS(tbnz, 0x7F000000, 0x37000000)
__AARCH64_INSN_FUNCS(bcond, 0xFF000010, 0x54000000)
__AARCH64_INSN_FUNCS(svc, 0xFFE0001F, 0xD4000001)
__AARCH64_INSN_FUNCS(hvc, 0xFFE0001F, 0xD4000002)
@@ -279,10 +281,12 @@ __AARCH64_INSN_FUNCS(ret, 0xFFFFFC1F, 0xD65F0000)
#undef __AARCH64_INSN_FUNCS
bool aarch64_insn_is_nop(u32 insn);
+bool aarch64_insn_is_branch_imm(u32 insn);
int aarch64_insn_read(void *addr, u32 *insnp);
int aarch64_insn_write(void *addr, u32 insn);
enum aarch64_insn_encoding_class aarch64_get_insn_class(u32 insn);
+u64 aarch64_insn_decode_immediate(enum aarch64_insn_imm_type type, u32 insn);
u32 aarch64_insn_encode_immediate(enum aarch64_insn_imm_type type,
u32 insn, u64 imm);
u32 aarch64_insn_gen_branch_imm(unsigned long pc, unsigned long addr,
@@ -348,6 +352,8 @@ u32 aarch64_insn_gen_logical_shifted_reg(enum aarch64_insn_register dst,
int shift,
enum aarch64_insn_variant variant,
enum aarch64_insn_logic_type type);
+s32 aarch64_get_branch_offset(u32 insn);
+u32 aarch64_set_branch_offset(u32 insn, s32 offset);
bool aarch64_insn_hotpatch_safe(u32 old_insn, u32 new_insn);
diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h
index e1f7ecdde11f..1eebf5bb0b58 100644
--- a/arch/arm64/include/asm/irq.h
+++ b/arch/arm64/include/asm/irq.h
@@ -3,7 +3,6 @@
#include <asm-generic/irq.h>
-extern void (*handle_arch_irq)(struct pt_regs *);
extern void migrate_irqs(void);
extern void set_handle_irq(void (*handle_irq)(struct pt_regs *));
diff --git a/arch/arm64/include/asm/irq_work.h b/arch/arm64/include/asm/irq_work.h
index b4f6b19a8a68..8e24ef3f7c82 100644
--- a/arch/arm64/include/asm/irq_work.h
+++ b/arch/arm64/include/asm/irq_work.h
@@ -1,8 +1,6 @@
#ifndef __ASM_IRQ_WORK_H
#define __ASM_IRQ_WORK_H
-#ifdef CONFIG_SMP
-
#include <asm/smp.h>
static inline bool arch_irq_work_has_interrupt(void)
@@ -10,13 +8,4 @@ static inline bool arch_irq_work_has_interrupt(void)
return !!__smp_cross_call;
}
-#else
-
-static inline bool arch_irq_work_has_interrupt(void)
-{
- return false;
-}
-
-#endif
-
#endif /* __ASM_IRQ_WORK_H */
diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h
new file mode 100644
index 000000000000..d06d7f06a583
--- /dev/null
+++ b/arch/arm64/include/asm/kernel-pgtable.h
@@ -0,0 +1,74 @@
+/*
+ * Kernel page table mapping
+ *
+ * Copyright (C) 2015 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __ASM_KERNEL_PGTABLE_H
+#define __ASM_KERNEL_PGTABLE_H
+
+#include <asm/page.h>
+#include <asm/pgtable.h>
+
+/*
+ * The idmap and swapper page tables need some space reserved in the kernel
+ * image. Both require pgd, pud (4 levels only) and pmd tables to (section)
+ * map the kernel. With the 64K page configuration, swapper and idmap need to
+ * map to pte level. The swapper also maps the FDT (see __create_page_tables
+ * for more information). Note that the number of ID map translation levels
+ * could be increased on the fly if system RAM is out of reach for the default
+ * VA range, so 3 pages are reserved in all cases.
+ */
+#ifdef CONFIG_ARM64_64K_PAGES
+#define SWAPPER_PGTABLE_LEVELS (CONFIG_PGTABLE_LEVELS)
+#else
+#define SWAPPER_PGTABLE_LEVELS (CONFIG_PGTABLE_LEVELS - 1)
+#endif
+
+#define SWAPPER_DIR_SIZE (SWAPPER_PGTABLE_LEVELS * PAGE_SIZE)
+#define IDMAP_DIR_SIZE (3 * PAGE_SIZE)
+
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+#define RESERVED_TTBR0_SIZE (PAGE_SIZE)
+#else
+#define RESERVED_TTBR0_SIZE (0)
+#endif
+
+/* Initial memory map size */
+#ifdef CONFIG_ARM64_64K_PAGES
+#define SWAPPER_BLOCK_SHIFT PAGE_SHIFT
+#define SWAPPER_BLOCK_SIZE PAGE_SIZE
+#define SWAPPER_TABLE_SHIFT PMD_SHIFT
+#else
+#define SWAPPER_BLOCK_SHIFT SECTION_SHIFT
+#define SWAPPER_BLOCK_SIZE SECTION_SIZE
+#define SWAPPER_TABLE_SHIFT PUD_SHIFT
+#endif
+
+
+/*
+ * Initial memory map attributes.
+ */
+#define SWAPPER_PTE_FLAGS (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED)
+#define SWAPPER_PMD_FLAGS (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S)
+
+#ifdef CONFIG_ARM64_64K_PAGES
+#define SWAPPER_MM_MMUFLAGS (PTE_ATTRINDX(MT_NORMAL) | SWAPPER_PTE_FLAGS)
+#else
+#define SWAPPER_MM_MMUFLAGS (PMD_ATTRINDX(MT_NORMAL) | SWAPPER_PMD_FLAGS)
+#endif
+
+
+#endif /* __ASM_KERNEL_PGTABLE_H */
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 865a7e28ea2d..3cb4c856b10d 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -45,6 +45,16 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
vcpu->arch.hcr_el2 &= ~HCR_RW;
}
+static inline unsigned long vcpu_get_hcr(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hcr_el2;
+}
+
+static inline void vcpu_set_hcr(struct kvm_vcpu *vcpu, unsigned long hcr)
+{
+ vcpu->arch.hcr_el2 = hcr;
+}
+
static inline unsigned long *vcpu_pc(const struct kvm_vcpu *vcpu)
{
return (unsigned long *)&vcpu_gp_regs(vcpu)->regs.pc;
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index dbd32127dbb6..806a12128193 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -116,9 +116,6 @@ struct kvm_vcpu_arch {
* Anything that is not used directly from assembly code goes
* here.
*/
- /* dcache set/way operation pending */
- int last_pcpu;
- cpumask_t require_dcache_flush;
/* Don't run the guest */
bool pause;
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index a205e957d5c4..390bf1230c69 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -142,12 +142,12 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
/*
* If we are concatenating first level stage-2 page tables, we would have less
* than or equal to 16 pointers in the fake PGD, because that's what the
- * architecture allows. In this case, (4 - CONFIG_ARM64_PGTABLE_LEVELS)
+ * architecture allows. In this case, (4 - CONFIG_PGTABLE_LEVELS)
* represents the first level for the host, and we add 1 to go to the next
* level (which uses contatenation) for the stage-2 tables.
*/
#if PTRS_PER_S2_PGD <= 16
-#define KVM_PREALLOC_LEVEL (4 - CONFIG_ARM64_PGTABLE_LEVELS + 1)
+#define KVM_PREALLOC_LEVEL (4 - CONFIG_PGTABLE_LEVELS + 1)
#else
#define KVM_PREALLOC_LEVEL (0)
#endif
@@ -245,7 +245,8 @@ static inline void __kvm_flush_dcache_pud(pud_t pud)
#define kvm_virt_to_phys(x) __virt_to_phys((unsigned long)(x))
-void stage2_flush_vm(struct kvm *kvm);
+void kvm_set_way_flush(struct kvm_vcpu *vcpu);
+void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled);
#endif /* __ASSEMBLY__ */
#endif /* __ARM64_KVM_MMU_H__ */
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index e4a2ef9cf998..fe3d97595c1e 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -75,12 +75,6 @@
#define __phys_to_virt(x) ((unsigned long)((x) - PHYS_OFFSET + PAGE_OFFSET))
/*
- * Convert a physical address to a Page Frame Number and back
- */
-#define __phys_to_pfn(paddr) ((unsigned long)((paddr) >> PAGE_SHIFT))
-#define __pfn_to_phys(pfn) ((phys_addr_t)(pfn) << PAGE_SHIFT)
-
-/*
* Convert a page to/from a physical address
*/
#define page_to_phys(page) (__pfn_to_phys(page_to_pfn(page)))
@@ -147,7 +141,11 @@ static inline void *phys_to_virt(phys_addr_t x)
#define ARCH_PFN_OFFSET ((unsigned long)PHYS_PFN_OFFSET)
#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
-#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
+#define _virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
+
+#define _virt_addr_is_linear(kaddr) (((u64)(kaddr)) >= PAGE_OFFSET)
+#define virt_addr_valid(kaddr) (_virt_addr_is_linear(kaddr) && \
+ _virt_addr_valid(kaddr))
#endif
diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index c2f006c48bdb..77c3851deae1 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -17,21 +17,22 @@
#define __ASM_MMU_H
typedef struct {
- unsigned int id;
- raw_spinlock_t id_lock;
- void *vdso;
+ atomic64_t id;
+ void *vdso;
} mm_context_t;
-#define INIT_MM_CONTEXT(name) \
- .context.id_lock = __RAW_SPIN_LOCK_UNLOCKED(name.context.id_lock),
-
-#define ASID(mm) ((mm)->context.id & 0xffff)
+/*
+ * This macro is only used by the TLBI code, which cannot race with an
+ * ASID change and therefore doesn't need to reload the counter using
+ * atomic64_read.
+ */
+#define ASID(mm) ((mm)->context.id.counter & 0xffff)
extern void paging_init(void);
-extern void setup_mm_for_reboot(void);
extern void __iomem *early_io_map(phys_addr_t phys, unsigned long virt);
extern void init_mem_pgprot(void);
-/* create an identity mapping for memory (or io if map_io is true) */
-extern void create_id_mapping(phys_addr_t addr, phys_addr_t size, int map_io);
+extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
+ unsigned long virt, phys_addr_t size,
+ pgprot_t prot);
#endif
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index 101a42bde728..b61e47aea3a1 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -23,18 +23,12 @@
#include <linux/sched.h>
#include <asm/cacheflush.h>
+#include <asm/cpufeature.h>
#include <asm/proc-fns.h>
#include <asm-generic/mm_hooks.h>
#include <asm/cputype.h>
#include <asm/pgtable.h>
-#define MAX_ASID_BITS 16
-
-extern unsigned int cpu_last_asid;
-
-void __init_new_context(struct task_struct *tsk, struct mm_struct *mm);
-void __new_context(struct mm_struct *mm);
-
#ifdef CONFIG_PID_IN_CONTEXTIDR
static inline void contextidr_thread_switch(struct task_struct *next)
{
@@ -55,7 +49,7 @@ static inline void contextidr_thread_switch(struct task_struct *next)
*/
static inline void cpu_set_reserved_ttbr0(void)
{
- unsigned long ttbr = page_to_phys(empty_zero_page);
+ unsigned long ttbr = virt_to_phys(empty_zero_page);
asm(
" msr ttbr0_el1, %0 // set TTBR0\n"
@@ -64,68 +58,53 @@ static inline void cpu_set_reserved_ttbr0(void)
: "r" (ttbr));
}
-static inline void switch_new_context(struct mm_struct *mm)
-{
- unsigned long flags;
-
- __new_context(mm);
-
- local_irq_save(flags);
- cpu_switch_mm(mm->pgd, mm);
- local_irq_restore(flags);
-}
+/*
+ * TCR.T0SZ value to use when the ID map is active. Usually equals
+ * TCR_T0SZ(VA_BITS), unless system RAM is positioned very high in
+ * physical memory, in which case it will be smaller.
+ */
+extern u64 idmap_t0sz;
-static inline void check_and_switch_context(struct mm_struct *mm,
- struct task_struct *tsk)
+static inline bool __cpu_uses_extended_idmap(void)
{
- /*
- * Required during context switch to avoid speculative page table
- * walking with the wrong TTBR.
- */
- cpu_set_reserved_ttbr0();
-
- if (!((mm->context.id ^ cpu_last_asid) >> MAX_ASID_BITS))
- /*
- * The ASID is from the current generation, just switch to the
- * new pgd. This condition is only true for calls from
- * context_switch() and interrupts are already disabled.
- */
- cpu_switch_mm(mm->pgd, mm);
- else if (irqs_disabled())
- /*
- * Defer the new ASID allocation until after the context
- * switch critical region since __new_context() cannot be
- * called with interrupts disabled.
- */
- set_ti_thread_flag(task_thread_info(tsk), TIF_SWITCH_MM);
- else
- /*
- * That is a direct call to switch_mm() or activate_mm() with
- * interrupts enabled and a new context.
- */
- switch_new_context(mm);
+ return (!IS_ENABLED(CONFIG_ARM64_VA_BITS_48) &&
+ unlikely(idmap_t0sz != TCR_T0SZ(VA_BITS)));
}
-#define init_new_context(tsk,mm) (__init_new_context(tsk,mm),0)
-#define destroy_context(mm) do { } while(0)
-
-#define finish_arch_post_lock_switch \
- finish_arch_post_lock_switch
-static inline void finish_arch_post_lock_switch(void)
+/*
+ * Set TCR.T0SZ to its default value (based on VA_BITS)
+ */
+static inline void cpu_set_default_tcr_t0sz(void)
{
- if (test_and_clear_thread_flag(TIF_SWITCH_MM)) {
- struct mm_struct *mm = current->mm;
- unsigned long flags;
+ unsigned long tcr;
- __new_context(mm);
+ if (!__cpu_uses_extended_idmap())
+ return;
- local_irq_save(flags);
- cpu_switch_mm(mm->pgd, mm);
- local_irq_restore(flags);
- }
+ asm volatile (
+ " mrs %0, tcr_el1 ;"
+ " bfi %0, %1, %2, %3 ;"
+ " msr tcr_el1, %0 ;"
+ " isb"
+ : "=&r" (tcr)
+ : "r"(TCR_T0SZ(VA_BITS)), "I"(TCR_T0SZ_OFFSET), "I"(TCR_TxSZ_WIDTH));
}
/*
+ * It would be nice to return ASIDs back to the allocator, but unfortunately
+ * that introduces a race with a generation rollover where we could erroneously
+ * free an ASID allocated in a future generation. We could workaround this by
+ * freeing the ASID from the context of the dying mm (e.g. in arch_exit_mmap),
+ * but we'd then need to make sure that we didn't dirty any TLBs afterwards.
+ * Setting a reserved TTBR0 or EPD0 would work, but it all gets ugly when you
+ * take CPU migration into account.
+ */
+#define destroy_context(mm) do { } while(0)
+void check_and_switch_context(struct mm_struct *mm, unsigned int cpu);
+
+#define init_new_context(tsk,mm) ({ atomic64_set(&(mm)->context.id, 0); 0; })
+
+/*
* This is called when "tsk" is about to enter lazy TLB mode.
*
* mm: describes the currently active mm context
@@ -139,15 +118,24 @@ enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
}
-/*
- * This is the actual mm switch as far as the scheduler
- * is concerned. No registers are touched. We avoid
- * calling the CPU specific function when the mm hasn't
- * actually changed.
- */
-static inline void
-switch_mm(struct mm_struct *prev, struct mm_struct *next,
- struct task_struct *tsk)
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+static inline void update_saved_ttbr0(struct task_struct *tsk,
+ struct mm_struct *mm)
+{
+ if (system_uses_ttbr0_pan()) {
+ BUG_ON(mm->pgd == swapper_pg_dir);
+ task_thread_info(tsk)->ttbr0 =
+ virt_to_phys(mm->pgd) | ASID(mm) << 48;
+ }
+}
+#else
+static inline void update_saved_ttbr0(struct task_struct *tsk,
+ struct mm_struct *mm)
+{
+}
+#endif
+
+static inline void __switch_mm(struct mm_struct *next)
{
unsigned int cpu = smp_processor_id();
@@ -160,11 +148,28 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next,
return;
}
- if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next)) || prev != next)
- check_and_switch_context(next, tsk);
+ check_and_switch_context(next, cpu);
+}
+
+static inline void
+switch_mm(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk)
+{
+ if (prev != next)
+ __switch_mm(next);
+
+ /*
+ * Update the saved TTBR0_EL1 of the scheduled-in task as the previous
+ * value may have not been initialised yet (activate_mm caller) or the
+ * ASID has changed since the last run (following the context switch
+ * of another thread of the same process). Avoid setting the reserved
+ * TTBR0_EL1 to swapper_pg_dir (init_mm; e.g. via idle_task_exit).
+ */
+ if (next != &init_mm)
+ update_saved_ttbr0(tsk, next);
}
#define deactivate_mm(tsk,mm) do { } while (0)
-#define activate_mm(prev,next) switch_mm(prev, next, NULL)
+#define activate_mm(prev,next) switch_mm(prev, next, current)
#endif
diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h
index 22b16232bd60..da3235494ffd 100644
--- a/arch/arm64/include/asm/page.h
+++ b/arch/arm64/include/asm/page.h
@@ -20,29 +20,19 @@
#define __ASM_PAGE_H
/* PAGE_SHIFT determines the page size */
+/* CONT_SHIFT determines the number of pages which can be tracked together */
#ifdef CONFIG_ARM64_64K_PAGES
#define PAGE_SHIFT 16
+#define CONT_SHIFT 5
#else
#define PAGE_SHIFT 12
+#define CONT_SHIFT 4
#endif
-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
+#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
#define PAGE_MASK (~(PAGE_SIZE-1))
-/*
- * The idmap and swapper page tables need some space reserved in the kernel
- * image. Both require pgd, pud (4 levels only) and pmd tables to (section)
- * map the kernel. With the 64K page configuration, swapper and idmap need to
- * map to pte level. The swapper also maps the FDT (see __create_page_tables
- * for more information).
- */
-#ifdef CONFIG_ARM64_64K_PAGES
-#define SWAPPER_PGTABLE_LEVELS (CONFIG_ARM64_PGTABLE_LEVELS)
-#else
-#define SWAPPER_PGTABLE_LEVELS (CONFIG_ARM64_PGTABLE_LEVELS - 1)
-#endif
-
-#define SWAPPER_DIR_SIZE (SWAPPER_PGTABLE_LEVELS * PAGE_SIZE)
-#define IDMAP_DIR_SIZE (SWAPPER_DIR_SIZE)
+#define CONT_SIZE (_AC(1, UL) << (CONT_SHIFT + PAGE_SHIFT))
+#define CONT_MASK (~(CONT_SIZE-1))
#ifndef __ASSEMBLY__
diff --git a/arch/arm64/include/asm/percpu.h b/arch/arm64/include/asm/percpu.h
index 5279e5733386..63e2c49793e7 100644
--- a/arch/arm64/include/asm/percpu.h
+++ b/arch/arm64/include/asm/percpu.h
@@ -16,8 +16,6 @@
#ifndef __ASM_PERCPU_H
#define __ASM_PERCPU_H
-#ifdef CONFIG_SMP
-
static inline void set_my_cpu_offset(unsigned long off)
{
asm volatile("msr tpidr_el1, %0" :: "r" (off) : "memory");
@@ -38,12 +36,6 @@ static inline unsigned long __my_cpu_offset(void)
}
#define __my_cpu_offset __my_cpu_offset()
-#else /* !CONFIG_SMP */
-
-#define set_my_cpu_offset(x) do { } while (0)
-
-#endif /* CONFIG_SMP */
-
#include <asm-generic/percpu.h>
#endif /* __ASM_PERCPU_H */
diff --git a/arch/arm64/include/asm/perf_event.h b/arch/arm64/include/asm/perf_event.h
index d26d1d53c0d7..6471773db6fd 100644
--- a/arch/arm64/include/asm/perf_event.h
+++ b/arch/arm64/include/asm/perf_event.h
@@ -24,4 +24,11 @@ extern unsigned long perf_misc_flags(struct pt_regs *regs);
#define perf_misc_flags(regs) perf_misc_flags(regs)
#endif
+#define perf_arch_fetch_caller_regs(regs, __ip) { \
+ (regs)->pc = (__ip); \
+ (regs)->regs[29] = (unsigned long) __builtin_frame_address(0); \
+ (regs)->sp = current_stack_pointer; \
+ (regs)->pstate = PSR_MODE_EL1h; \
+}
+
#endif
diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h
index e838b9adc4d6..c15053902942 100644
--- a/arch/arm64/include/asm/pgalloc.h
+++ b/arch/arm64/include/asm/pgalloc.h
@@ -29,7 +29,7 @@
#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
#define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t))
-#if CONFIG_ARM64_PGTABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
{
@@ -47,9 +47,9 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
set_pud(pud, __pud(__pa(pmd) | PMD_TYPE_TABLE));
}
-#endif /* CONFIG_ARM64_PGTABLE_LEVELS > 2 */
+#endif /* CONFIG_PGTABLE_LEVELS > 2 */
-#if CONFIG_ARM64_PGTABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
{
@@ -67,7 +67,7 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
set_pgd(pgd, __pgd(__pa(pud) | PUD_TYPE_TABLE));
}
-#endif /* CONFIG_ARM64_PGTABLE_LEVELS > 3 */
+#endif /* CONFIG_PGTABLE_LEVELS > 3 */
extern pgd_t *pgd_alloc(struct mm_struct *mm);
extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index 31e6b0477e60..5e50782f625d 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -21,7 +21,7 @@
/*
* PMD_SHIFT determines the size a level 2 page table entry can map.
*/
-#if CONFIG_ARM64_PGTABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
#define PMD_SHIFT ((PAGE_SHIFT - 3) * 2 + 3)
#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT)
#define PMD_MASK (~(PMD_SIZE-1))
@@ -31,7 +31,7 @@
/*
* PUD_SHIFT determines the size a level 1 page table entry can map.
*/
-#if CONFIG_ARM64_PGTABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
#define PUD_SHIFT ((PAGE_SHIFT - 3) * 3 + 3)
#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT)
#define PUD_MASK (~(PUD_SIZE-1))
@@ -42,7 +42,7 @@
* PGDIR_SHIFT determines the size a top-level page table entry can map
* (depending on the configuration, this level can be 0, 1 or 2).
*/
-#define PGDIR_SHIFT ((PAGE_SHIFT - 3) * CONFIG_ARM64_PGTABLE_LEVELS + 3)
+#define PGDIR_SHIFT ((PAGE_SHIFT - 3) * CONFIG_PGTABLE_LEVELS + 3)
#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
#define PGDIR_MASK (~(PGDIR_SIZE-1))
#define PTRS_PER_PGD (1 << (VA_BITS - PGDIR_SHIFT))
@@ -141,7 +141,12 @@
/*
* TCR flags.
*/
-#define TCR_TxSZ(x) (((UL(64) - (x)) << 16) | ((UL(64) - (x)) << 0))
+#define TCR_T0SZ_OFFSET 0
+#define TCR_T1SZ_OFFSET 16
+#define TCR_T0SZ(x) ((UL(64) - (x)) << TCR_T0SZ_OFFSET)
+#define TCR_T1SZ(x) ((UL(64) - (x)) << TCR_T1SZ_OFFSET)
+#define TCR_TxSZ(x) (TCR_T0SZ(x) | TCR_T1SZ(x))
+#define TCR_TxSZ_WIDTH 6
#define TCR_IRGN_NC ((UL(0) << 8) | (UL(0) << 24))
#define TCR_IRGN_WBWA ((UL(1) << 8) | (UL(1) << 24))
#define TCR_IRGN_WT ((UL(2) << 8) | (UL(2) << 24))
diff --git a/arch/arm64/include/asm/pgtable-types.h b/arch/arm64/include/asm/pgtable-types.h
index ca9df80af896..2b1bd7e52c3b 100644
--- a/arch/arm64/include/asm/pgtable-types.h
+++ b/arch/arm64/include/asm/pgtable-types.h
@@ -38,13 +38,13 @@ typedef struct { pteval_t pte; } pte_t;
#define pte_val(x) ((x).pte)
#define __pte(x) ((pte_t) { (x) } )
-#if CONFIG_ARM64_PGTABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
typedef struct { pmdval_t pmd; } pmd_t;
#define pmd_val(x) ((x).pmd)
#define __pmd(x) ((pmd_t) { (x) } )
#endif
-#if CONFIG_ARM64_PGTABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
typedef struct { pudval_t pud; } pud_t;
#define pud_val(x) ((x).pud)
#define __pud(x) ((pud_t) { (x) } )
@@ -64,13 +64,13 @@ typedef pteval_t pte_t;
#define pte_val(x) (x)
#define __pte(x) (x)
-#if CONFIG_ARM64_PGTABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
typedef pmdval_t pmd_t;
#define pmd_val(x) (x)
#define __pmd(x) (x)
#endif
-#if CONFIG_ARM64_PGTABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
typedef pudval_t pud_t;
#define pud_val(x) (x)
#define __pud(x) (x)
@@ -86,9 +86,9 @@ typedef pteval_t pgprot_t;
#endif /* STRICT_MM_TYPECHECKS */
-#if CONFIG_ARM64_PGTABLE_LEVELS == 2
+#if CONFIG_PGTABLE_LEVELS == 2
#include <asm-generic/pgtable-nopmd.h>
-#elif CONFIG_ARM64_PGTABLE_LEVELS == 3
+#elif CONFIG_PGTABLE_LEVELS == 3
#include <asm-generic/pgtable-nopud.h>
#endif
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 239192d72a7b..abe6186dc975 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -63,13 +63,8 @@ extern void __pmd_error(const char *file, int line, unsigned long val);
extern void __pud_error(const char *file, int line, unsigned long val);
extern void __pgd_error(const char *file, int line, unsigned long val);
-#ifdef CONFIG_SMP
#define PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED)
#define PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S)
-#else
-#define PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF)
-#define PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF)
-#endif
#define PROT_DEVICE_nGnRE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_DEVICE_nGnRE))
#define PROT_NORMAL_NC (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_NORMAL_NC))
@@ -120,8 +115,8 @@ extern void __pgd_error(const char *file, int line, unsigned long val);
* ZERO_PAGE is a global shared page that is always zero: used
* for zero-mapped memory areas etc..
*/
-extern struct page *empty_zero_page;
-#define ZERO_PAGE(vaddr) (empty_zero_page)
+extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
+#define ZERO_PAGE(vaddr) virt_to_page(empty_zero_page)
#define pte_ERROR(pte) __pte_error(__FILE__, __LINE__, pte_val(pte))
@@ -273,6 +268,11 @@ static inline pmd_t pte_pmd(pte_t pte)
return __pmd(pte_val(pte));
}
+static inline pgprot_t mk_sect_prot(pgprot_t prot)
+{
+ return __pgprot(pgprot_val(prot) & ~PTE_TABLE_BIT);
+}
+
/*
* THP definitions.
*/
@@ -347,9 +347,12 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
#ifdef CONFIG_ARM64_64K_PAGES
#define pud_sect(pud) (0)
+#define pud_table(pud) (1)
#else
#define pud_sect(pud) ((pud_val(pud) & PUD_TYPE_MASK) == \
PUD_TYPE_SECT)
+#define pud_table(pud) ((pud_val(pud) & PUD_TYPE_MASK) == \
+ PUD_TYPE_TABLE)
#endif
static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
@@ -377,7 +380,7 @@ static inline pte_t *pmd_page_vaddr(pmd_t pmd)
*/
#define mk_pte(page,prot) pfn_pte(page_to_pfn(page),prot)
-#if CONFIG_ARM64_PGTABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
#define pmd_ERROR(pmd) __pmd_error(__FILE__, __LINE__, pmd_val(pmd))
@@ -412,9 +415,9 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
#define pud_page(pud) pmd_page(pud_pmd(pud))
-#endif /* CONFIG_ARM64_PGTABLE_LEVELS > 2 */
+#endif /* CONFIG_PGTABLE_LEVELS > 2 */
-#if CONFIG_ARM64_PGTABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
#define pud_ERROR(pud) __pud_error(__FILE__, __LINE__, pud_val(pud))
@@ -446,7 +449,7 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long addr)
return (pud_t *)pgd_page_vaddr(*pgd) + pud_index(addr);
}
-#endif /* CONFIG_ARM64_PGTABLE_LEVELS > 3 */
+#endif /* CONFIG_PGTABLE_LEVELS > 3 */
#define pgd_ERROR(pgd) __pgd_error(__FILE__, __LINE__, pgd_val(pgd))
@@ -519,6 +522,21 @@ extern int kern_addr_valid(unsigned long addr);
#define pgtable_cache_init() do { } while (0)
+/*
+ * On AArch64, the cache coherency is handled via the set_pte_at() function.
+ */
+static inline void update_mmu_cache(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ /*
+ * We don't do anything here, so there's a very small chance of
+ * us retaking a user fault which we just fixed up. The alternative
+ * is doing a dsb(ishst), but that penalises the fastpath.
+ */
+}
+
+#define update_mmu_cache_pmd(vma, address, pmd) do { } while (0)
+
#endif /* !__ASSEMBLY__ */
#endif /* __ASM_PGTABLE_H */
diff --git a/arch/arm64/include/asm/proc-fns.h b/arch/arm64/include/asm/proc-fns.h
index 9a8fd84f8fb2..b5240e868784 100644
--- a/arch/arm64/include/asm/proc-fns.h
+++ b/arch/arm64/include/asm/proc-fns.h
@@ -28,18 +28,18 @@
struct mm_struct;
struct cpu_suspend_ctx;
-extern void cpu_cache_off(void);
extern void cpu_do_idle(void);
extern void cpu_do_switch_mm(unsigned long pgd_phys, struct mm_struct *mm);
-extern void cpu_reset(unsigned long addr) __attribute__((noreturn));
-void cpu_soft_restart(phys_addr_t cpu_reset,
- unsigned long addr) __attribute__((noreturn));
extern void cpu_do_suspend(struct cpu_suspend_ctx *ptr);
extern u64 cpu_do_resume(phys_addr_t ptr, u64 idmap_ttbr);
#include <asm/memory.h>
-#define cpu_switch_mm(pgd,mm) cpu_do_switch_mm(virt_to_phys(pgd),mm)
+#define cpu_switch_mm(pgd,mm) \
+do { \
+ BUG_ON(pgd == swapper_pg_dir); \
+ cpu_do_switch_mm(virt_to_phys(pgd),mm); \
+} while (0)
#define cpu_get_pgd() \
({ \
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index f3a965ed5259..f0854ea2ff04 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -166,6 +166,7 @@ static inline void spin_lock_prefetch(const void *x)
#endif
-void cpu_enable_pan(void);
+void cpu_enable_pan(void *__unused);
+void cpu_enable_uao(void *__unused);
#endif /* __ASM_PROCESSOR_H */
diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h
index d6dd9fdbc3be..bc358b525695 100644
--- a/arch/arm64/include/asm/ptrace.h
+++ b/arch/arm64/include/asm/ptrace.h
@@ -116,6 +116,8 @@ struct pt_regs {
};
u64 orig_x0;
u64 syscallno;
+ u64 orig_addr_limit;
+ u64 unused; // maintain 16 byte alignment
};
#define arch_has_single_step() (1)
@@ -183,11 +185,7 @@ static inline int valid_user_regs(struct user_pt_regs *regs)
#define instruction_pointer(regs) ((unsigned long)(regs)->pc)
-#ifdef CONFIG_SMP
extern unsigned long profile_pc(struct pt_regs *regs);
-#else
-#define profile_pc(regs) instruction_pointer(regs)
-#endif
#endif /* __ASSEMBLY__ */
#endif
diff --git a/arch/arm64/include/asm/seccomp.h b/arch/arm64/include/asm/seccomp.h
new file mode 100644
index 000000000000..c76fac979629
--- /dev/null
+++ b/arch/arm64/include/asm/seccomp.h
@@ -0,0 +1,25 @@
+/*
+ * arch/arm64/include/asm/seccomp.h
+ *
+ * Copyright (C) 2014 Linaro Limited
+ * Author: AKASHI Takahiro <takahiro.akashi@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef _ASM_SECCOMP_H
+#define _ASM_SECCOMP_H
+
+#include <asm/unistd.h>
+
+#ifdef CONFIG_COMPAT
+#define __NR_seccomp_read_32 __NR_compat_read
+#define __NR_seccomp_write_32 __NR_compat_write
+#define __NR_seccomp_exit_32 __NR_compat_exit
+#define __NR_seccomp_sigreturn_32 __NR_compat_rt_sigreturn
+#endif /* CONFIG_COMPAT */
+
+#include <asm-generic/seccomp.h>
+
+#endif /* _ASM_SECCOMP_H */
diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
index 780f82c827b6..295cd8b0ef06 100644
--- a/arch/arm64/include/asm/smp.h
+++ b/arch/arm64/include/asm/smp.h
@@ -20,10 +20,6 @@
#include <linux/cpumask.h>
#include <linux/thread_info.h>
-#ifndef CONFIG_SMP
-# error "<asm/smp.h> included in non-SMP build"
-#endif
-
#define raw_smp_processor_id() (current_thread_info()->cpu)
struct seq_file;
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index a7f3d4b2514d..0cc436bf726b 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -20,10 +20,9 @@
#ifndef __ASM_SYSREG_H
#define __ASM_SYSREG_H
-#include <asm/opcodes.h>
+#include <linux/stringify.h>
-#define SCTLR_EL1_CP15BEN (0x1 << 5)
-#define SCTLR_EL1_SED (0x1 << 8)
+#include <asm/opcodes.h>
/*
* ARMv8 ARM reserves the following encoding for system registers:
@@ -38,11 +37,146 @@
#define sys_reg(op0, op1, crn, crm, op2) \
((((op0)&3)<<19)|((op1)<<16)|((crn)<<12)|((crm)<<8)|((op2)<<5))
-#define REG_PSTATE_PAN_IMM sys_reg(0, 0, 4, 0, 4)
-#define SCTLR_EL1_SPAN (1 << 23)
+#define SYS_MIDR_EL1 sys_reg(3, 0, 0, 0, 0)
+#define SYS_MPIDR_EL1 sys_reg(3, 0, 0, 0, 5)
+#define SYS_REVIDR_EL1 sys_reg(3, 0, 0, 0, 6)
+
+#define SYS_ID_PFR0_EL1 sys_reg(3, 0, 0, 1, 0)
+#define SYS_ID_PFR1_EL1 sys_reg(3, 0, 0, 1, 1)
+#define SYS_ID_DFR0_EL1 sys_reg(3, 0, 0, 1, 2)
+#define SYS_ID_MMFR0_EL1 sys_reg(3, 0, 0, 1, 4)
+#define SYS_ID_MMFR1_EL1 sys_reg(3, 0, 0, 1, 5)
+#define SYS_ID_MMFR2_EL1 sys_reg(3, 0, 0, 1, 6)
+#define SYS_ID_MMFR3_EL1 sys_reg(3, 0, 0, 1, 7)
+
+#define SYS_ID_ISAR0_EL1 sys_reg(3, 0, 0, 2, 0)
+#define SYS_ID_ISAR1_EL1 sys_reg(3, 0, 0, 2, 1)
+#define SYS_ID_ISAR2_EL1 sys_reg(3, 0, 0, 2, 2)
+#define SYS_ID_ISAR3_EL1 sys_reg(3, 0, 0, 2, 3)
+#define SYS_ID_ISAR4_EL1 sys_reg(3, 0, 0, 2, 4)
+#define SYS_ID_ISAR5_EL1 sys_reg(3, 0, 0, 2, 5)
+#define SYS_ID_MMFR4_EL1 sys_reg(3, 0, 0, 2, 6)
+
+#define SYS_MVFR0_EL1 sys_reg(3, 0, 0, 3, 0)
+#define SYS_MVFR1_EL1 sys_reg(3, 0, 0, 3, 1)
+#define SYS_MVFR2_EL1 sys_reg(3, 0, 0, 3, 2)
+
+#define SYS_ID_AA64PFR0_EL1 sys_reg(3, 0, 0, 4, 0)
+#define SYS_ID_AA64PFR1_EL1 sys_reg(3, 0, 0, 4, 1)
+
+#define SYS_ID_AA64DFR0_EL1 sys_reg(3, 0, 0, 5, 0)
+#define SYS_ID_AA64DFR1_EL1 sys_reg(3, 0, 0, 5, 1)
+
+#define SYS_ID_AA64ISAR0_EL1 sys_reg(3, 0, 0, 6, 0)
+#define SYS_ID_AA64ISAR1_EL1 sys_reg(3, 0, 0, 6, 1)
+
+#define SYS_ID_AA64MMFR0_EL1 sys_reg(3, 0, 0, 7, 0)
+#define SYS_ID_AA64MMFR1_EL1 sys_reg(3, 0, 0, 7, 1)
+#define SYS_ID_AA64MMFR2_EL1 sys_reg(3, 0, 0, 7, 2)
+
+#define SYS_CNTFRQ_EL0 sys_reg(3, 3, 14, 0, 0)
+#define SYS_CTR_EL0 sys_reg(3, 3, 0, 0, 1)
+#define SYS_DCZID_EL0 sys_reg(3, 3, 0, 0, 7)
+
+#define REG_PSTATE_PAN_IMM sys_reg(0, 0, 4, 0, 4)
+#define REG_PSTATE_UAO_IMM sys_reg(0, 0, 4, 0, 3)
#define SET_PSTATE_PAN(x) __inst_arm(0xd5000000 | REG_PSTATE_PAN_IMM |\
(!!x)<<8 | 0x1f)
+#define SET_PSTATE_UAO(x) __inst_arm(0xd5000000 | REG_PSTATE_UAO_IMM |\
+ (!!x)<<8 | 0x1f)
+
+/* SCTLR_EL1 */
+#define SCTLR_EL1_CP15BEN (0x1 << 5)
+#define SCTLR_EL1_SED (0x1 << 8)
+#define SCTLR_EL1_SPAN (0x1 << 23)
+
+
+/* id_aa64isar0 */
+#define ID_AA64ISAR0_RDM_SHIFT 28
+#define ID_AA64ISAR0_ATOMICS_SHIFT 20
+#define ID_AA64ISAR0_CRC32_SHIFT 16
+#define ID_AA64ISAR0_SHA2_SHIFT 12
+#define ID_AA64ISAR0_SHA1_SHIFT 8
+#define ID_AA64ISAR0_AES_SHIFT 4
+
+/* id_aa64pfr0 */
+#define ID_AA64PFR0_GIC_SHIFT 24
+#define ID_AA64PFR0_ASIMD_SHIFT 20
+#define ID_AA64PFR0_FP_SHIFT 16
+#define ID_AA64PFR0_EL3_SHIFT 12
+#define ID_AA64PFR0_EL2_SHIFT 8
+#define ID_AA64PFR0_EL1_SHIFT 4
+#define ID_AA64PFR0_EL0_SHIFT 0
+
+#define ID_AA64PFR0_FP_NI 0xf
+#define ID_AA64PFR0_FP_SUPPORTED 0x0
+#define ID_AA64PFR0_ASIMD_NI 0xf
+#define ID_AA64PFR0_ASIMD_SUPPORTED 0x0
+#define ID_AA64PFR0_EL1_64BIT_ONLY 0x1
+#define ID_AA64PFR0_EL0_64BIT_ONLY 0x1
+
+/* id_aa64mmfr0 */
+#define ID_AA64MMFR0_TGRAN4_SHIFT 28
+#define ID_AA64MMFR0_TGRAN64_SHIFT 24
+#define ID_AA64MMFR0_TGRAN16_SHIFT 20
+#define ID_AA64MMFR0_BIGENDEL0_SHIFT 16
+#define ID_AA64MMFR0_SNSMEM_SHIFT 12
+#define ID_AA64MMFR0_BIGENDEL_SHIFT 8
+#define ID_AA64MMFR0_ASID_SHIFT 4
+#define ID_AA64MMFR0_PARANGE_SHIFT 0
+
+#define ID_AA64MMFR0_TGRAN4_NI 0xf
+#define ID_AA64MMFR0_TGRAN4_SUPPORTED 0x0
+#define ID_AA64MMFR0_TGRAN64_NI 0xf
+#define ID_AA64MMFR0_TGRAN64_SUPPORTED 0x0
+#define ID_AA64MMFR0_TGRAN16_NI 0x0
+#define ID_AA64MMFR0_TGRAN16_SUPPORTED 0x1
+
+/* id_aa64mmfr1 */
+#define ID_AA64MMFR1_PAN_SHIFT 20
+#define ID_AA64MMFR1_LOR_SHIFT 16
+#define ID_AA64MMFR1_HPD_SHIFT 12
+#define ID_AA64MMFR1_VHE_SHIFT 8
+#define ID_AA64MMFR1_VMIDBITS_SHIFT 4
+#define ID_AA64MMFR1_HADBS_SHIFT 0
+
+/* id_aa64mmfr2 */
+#define ID_AA64MMFR2_UAO_SHIFT 4
+
+/* id_aa64dfr0 */
+#define ID_AA64DFR0_CTX_CMPS_SHIFT 28
+#define ID_AA64DFR0_WRPS_SHIFT 20
+#define ID_AA64DFR0_BRPS_SHIFT 12
+#define ID_AA64DFR0_PMUVER_SHIFT 8
+#define ID_AA64DFR0_TRACEVER_SHIFT 4
+#define ID_AA64DFR0_DEBUGVER_SHIFT 0
+
+#define ID_ISAR5_RDM_SHIFT 24
+#define ID_ISAR5_CRC32_SHIFT 16
+#define ID_ISAR5_SHA2_SHIFT 12
+#define ID_ISAR5_SHA1_SHIFT 8
+#define ID_ISAR5_AES_SHIFT 4
+#define ID_ISAR5_SEVL_SHIFT 0
+
+#define MVFR0_FPROUND_SHIFT 28
+#define MVFR0_FPSHVEC_SHIFT 24
+#define MVFR0_FPSQRT_SHIFT 20
+#define MVFR0_FPDIVIDE_SHIFT 16
+#define MVFR0_FPTRAP_SHIFT 12
+#define MVFR0_FPDP_SHIFT 8
+#define MVFR0_FPSP_SHIFT 4
+#define MVFR0_SIMD_SHIFT 0
+
+#define MVFR1_SIMDFMAC_SHIFT 28
+#define MVFR1_FPHP_SHIFT 24
+#define MVFR1_SIMDHP_SHIFT 20
+#define MVFR1_SIMDSP_SHIFT 16
+#define MVFR1_SIMDINT_SHIFT 12
+#define MVFR1_SIMDLS_SHIFT 8
+#define MVFR1_FPDNAN_SHIFT 4
+#define MVFR1_FPFTZ_SHIFT 0
+
#ifdef __ASSEMBLY__
@@ -61,6 +195,8 @@
#else
+#include <linux/types.h>
+
asm(
" .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30\n"
" .equ __reg_num_x\\num, \\num\n"
@@ -85,6 +221,23 @@ static inline void config_sctlr_el1(u32 clear, u32 set)
val |= set;
asm volatile("msr sctlr_el1, %0" : : "r" (val));
}
+
+/*
+ * Unlike read_cpuid, calls to read_sysreg are never expected to be
+ * optimized away or replaced with synthetic values.
+ */
+#define read_sysreg(r) ({ \
+ u64 __val; \
+ asm volatile("mrs %0, " __stringify(r) : "=r" (__val)); \
+ __val; \
+})
+
+#define write_sysreg(v, r) do { \
+ u64 __val = (u64)v; \
+ asm volatile("msr " __stringify(r) ", %0" \
+ : : "r" (__val)); \
+} while (0)
+
#endif
#endif /* __ASM_SYSREG_H */
diff --git a/arch/arm64/include/asm/system_misc.h b/arch/arm64/include/asm/system_misc.h
index 7a18fabbe0f6..659fbf5925de 100644
--- a/arch/arm64/include/asm/system_misc.h
+++ b/arch/arm64/include/asm/system_misc.h
@@ -41,7 +41,6 @@ struct mm_struct;
extern void show_pte(struct mm_struct *mm, unsigned long addr);
extern void __show_regs(struct pt_regs *);
-void soft_restart(unsigned long);
extern void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd);
#define UDBG_UNDEFINED (1 << 0)
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index 459bf8e53208..1b0c8e12e79f 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -48,7 +48,9 @@ struct thread_info {
mm_segment_t addr_limit; /* address limit */
struct task_struct *task; /* main task structure */
struct exec_domain *exec_domain; /* execution domain */
- struct restart_block restart_block;
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+ u64 ttbr0; /* saved TTBR0_EL1 */
+#endif
int preempt_count; /* 0 => preemptable, <0 => bug */
int cpu; /* cpu */
};
@@ -60,9 +62,6 @@ struct thread_info {
.flags = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
@@ -78,10 +77,16 @@ register unsigned long current_stack_pointer asm ("sp");
*/
static inline struct thread_info *current_thread_info(void) __attribute_const__;
+/*
+ * struct thread_info can be accessed directly via sp_el0.
+ */
static inline struct thread_info *current_thread_info(void)
{
- return (struct thread_info *)
- (current_stack_pointer & ~(THREAD_SIZE - 1));
+ unsigned long sp_el0;
+
+ asm ("mrs %0, sp_el0" : "=r" (sp_el0));
+
+ return (struct thread_info *)sp_el0;
}
#define thread_saved_pc(tsk) \
@@ -118,7 +123,6 @@ static inline struct thread_info *current_thread_info(void)
#define TIF_RESTORE_SIGMASK 20
#define TIF_SINGLESTEP 21
#define TIF_32BIT 22 /* 32bit process */
-#define TIF_SWITCH_MM 23 /* deferred switch_mm */
#define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h
index 53d9c354219f..ffdaea7954bb 100644
--- a/arch/arm64/include/asm/tlb.h
+++ b/arch/arm64/include/asm/tlb.h
@@ -37,12 +37,21 @@ static inline void __tlb_remove_table(void *_table)
static inline void tlb_flush(struct mmu_gather *tlb)
{
- if (tlb->fullmm) {
- flush_tlb_mm(tlb->mm);
- } else {
- struct vm_area_struct vma = { .vm_mm = tlb->mm, };
- flush_tlb_range(&vma, tlb->start, tlb->end);
- }
+ struct vm_area_struct vma = { .vm_mm = tlb->mm, };
+
+ /*
+ * The ASID allocator will either invalidate the ASID or mark
+ * it as used.
+ */
+ if (tlb->fullmm)
+ return;
+
+ /*
+ * The intermediate page table levels are already handled by
+ * the __(pte|pmd|pud)_free_tlb() functions, so last level
+ * TLBI is sufficient here.
+ */
+ __flush_tlb_range(&vma, tlb->start, tlb->end, true);
}
static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
@@ -53,7 +62,7 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
tlb_remove_entry(tlb, pte);
}
-#if CONFIG_ARM64_PGTABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp,
unsigned long addr)
{
@@ -62,7 +71,7 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp,
}
#endif
-#if CONFIG_ARM64_PGTABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pudp,
unsigned long addr)
{
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index 8b8d8cb46e01..b460ae28e346 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -24,17 +24,10 @@
#include <linux/sched.h>
#include <asm/cputype.h>
-extern void __cpu_flush_user_tlb_range(unsigned long, unsigned long, struct vm_area_struct *);
-extern void __cpu_flush_kern_tlb_range(unsigned long, unsigned long);
-
-extern struct cpu_tlb_fns cpu_tlb;
-
/*
* TLB Management
* ==============
*
- * The arch/arm64/mm/tlb.S files implement these methods.
- *
* The TLB specific code is expected to perform whatever tests it needs
* to determine if it should invalidate the TLB for each call. Start
* addresses are inclusive and end addresses are exclusive; it is safe to
@@ -70,6 +63,14 @@ extern struct cpu_tlb_fns cpu_tlb;
* only require the D-TLB to be invalidated.
* - kaddr - Kernel virtual memory address
*/
+static inline void local_flush_tlb_all(void)
+{
+ dsb(nshst);
+ asm("tlbi vmalle1");
+ dsb(nsh);
+ isb();
+}
+
static inline void flush_tlb_all(void)
{
dsb(ishst);
@@ -80,7 +81,7 @@ static inline void flush_tlb_all(void)
static inline void flush_tlb_mm(struct mm_struct *mm)
{
- unsigned long asid = (unsigned long)ASID(mm) << 48;
+ unsigned long asid = ASID(mm) << 48;
dsb(ishst);
asm("tlbi aside1is, %0" : : "r" (asid));
@@ -90,31 +91,59 @@ static inline void flush_tlb_mm(struct mm_struct *mm)
static inline void flush_tlb_page(struct vm_area_struct *vma,
unsigned long uaddr)
{
- unsigned long addr = uaddr >> 12 |
- ((unsigned long)ASID(vma->vm_mm) << 48);
+ unsigned long addr = uaddr >> 12 | (ASID(vma->vm_mm) << 48);
dsb(ishst);
- asm("tlbi vae1is, %0" : : "r" (addr));
+ asm("tlbi vale1is, %0" : : "r" (addr));
dsb(ish);
}
+/*
+ * This is meant to avoid soft lock-ups on large TLB flushing ranges and not
+ * necessarily a performance improvement.
+ */
+#define MAX_TLB_RANGE (1024UL << PAGE_SHIFT)
+
static inline void __flush_tlb_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
+ unsigned long start, unsigned long end,
+ bool last_level)
{
- unsigned long asid = (unsigned long)ASID(vma->vm_mm) << 48;
+ unsigned long asid = ASID(vma->vm_mm) << 48;
unsigned long addr;
+
+ if ((end - start) > MAX_TLB_RANGE) {
+ flush_tlb_mm(vma->vm_mm);
+ return;
+ }
+
start = asid | (start >> 12);
end = asid | (end >> 12);
dsb(ishst);
- for (addr = start; addr < end; addr += 1 << (PAGE_SHIFT - 12))
- asm("tlbi vae1is, %0" : : "r"(addr));
+ for (addr = start; addr < end; addr += 1 << (PAGE_SHIFT - 12)) {
+ if (last_level)
+ asm("tlbi vale1is, %0" : : "r"(addr));
+ else
+ asm("tlbi vae1is, %0" : : "r"(addr));
+ }
dsb(ish);
}
-static inline void __flush_tlb_kernel_range(unsigned long start, unsigned long end)
+static inline void flush_tlb_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ __flush_tlb_range(vma, start, end, false);
+}
+
+static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
unsigned long addr;
+
+ if ((end - start) > MAX_TLB_RANGE) {
+ flush_tlb_all();
+ return;
+ }
+
start >>= 12;
end >>= 12;
@@ -126,55 +155,17 @@ static inline void __flush_tlb_kernel_range(unsigned long start, unsigned long e
}
/*
- * This is meant to avoid soft lock-ups on large TLB flushing ranges and not
- * necessarily a performance improvement.
- */
-#define MAX_TLB_RANGE (1024UL << PAGE_SHIFT)
-
-static inline void flush_tlb_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
-{
- if ((end - start) <= MAX_TLB_RANGE)
- __flush_tlb_range(vma, start, end);
- else
- flush_tlb_mm(vma->vm_mm);
-}
-
-static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
-{
- if ((end - start) <= MAX_TLB_RANGE)
- __flush_tlb_kernel_range(start, end);
- else
- flush_tlb_all();
-}
-
-/*
* Used to invalidate the TLB (walk caches) corresponding to intermediate page
* table levels (pgd/pud/pmd).
*/
static inline void __flush_tlb_pgtable(struct mm_struct *mm,
unsigned long uaddr)
{
- unsigned long addr = uaddr >> 12 | ((unsigned long)ASID(mm) << 48);
+ unsigned long addr = uaddr >> 12 | (ASID(mm) << 48);
- dsb(ishst);
asm("tlbi vae1is, %0" : : "r" (addr));
dsb(ish);
}
-/*
- * On AArch64, the cache coherency is handled via the set_pte_at() function.
- */
-static inline void update_mmu_cache(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
-{
- /*
- * set_pte() does not have a DSB for user mappings, so make sure that
- * the page table write is visible.
- */
- dsb(ishst);
-}
-
-#define update_mmu_cache_pmd(vma, address, pmd) do { } while (0)
#endif
diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
index 7ebcd31ce51c..3a5bbb45537f 100644
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -1,8 +1,6 @@
#ifndef __ASM_TOPOLOGY_H
#define __ASM_TOPOLOGY_H
-#ifdef CONFIG_SMP
-
#include <linux/cpumask.h>
struct cpu_topology {
@@ -24,12 +22,14 @@ void init_cpu_topology(void);
void store_cpu_topology(unsigned int cpuid);
const struct cpumask *cpu_coregroup_mask(int cpu);
-#else
-
-static inline void init_cpu_topology(void) { }
-static inline void store_cpu_topology(unsigned int cpuid) { }
-
+struct sched_domain;
+#ifdef CONFIG_CPU_FREQ
+#define arch_scale_freq_capacity cpufreq_scale_freq_capacity
+extern unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu);
+extern unsigned long cpufreq_scale_max_freq_capacity(int cpu);
#endif
+#define arch_scale_cpu_capacity scale_cpu_capacity
+extern unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu);
#include <asm-generic/topology.h>
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index 14450f742554..21963026efd0 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -18,6 +18,12 @@
#ifndef __ASM_UACCESS_H
#define __ASM_UACCESS_H
+#include <asm/alternative.h>
+#include <asm/kernel-pgtable.h>
+#include <asm/sysreg.h>
+
+#ifndef __ASSEMBLY__
+
/*
* User space memory access functions
*/
@@ -25,10 +31,8 @@
#include <linux/string.h>
#include <linux/thread_info.h>
-#include <asm/alternative.h>
#include <asm/cpufeature.h>
#include <asm/ptrace.h>
-#include <asm/sysreg.h>
#include <asm/errno.h>
#include <asm/memory.h>
#include <asm/compiler.h>
@@ -65,6 +69,16 @@ extern int fixup_exception(struct pt_regs *regs);
static inline void set_fs(mm_segment_t fs)
{
current_thread_info()->addr_limit = fs;
+
+ /*
+ * Enable/disable UAO so that copy_to_user() etc can access
+ * kernel memory with the unprivileged instructions.
+ */
+ if (IS_ENABLED(CONFIG_ARM64_UAO) && fs == KERNEL_DS)
+ asm(ALTERNATIVE("nop", SET_PSTATE_UAO(1), ARM64_HAS_UAO));
+ else
+ asm(ALTERNATIVE("nop", SET_PSTATE_UAO(0), ARM64_HAS_UAO,
+ CONFIG_ARM64_UAO));
}
#define segment_eq(a,b) ((a) == (b))
@@ -115,6 +129,99 @@ static inline void set_fs(mm_segment_t fs)
#define user_addr_max get_fs
/*
+ * User access enabling/disabling.
+ */
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+static inline void __uaccess_ttbr0_disable(void)
+{
+ unsigned long ttbr;
+
+ /* reserved_ttbr0 placed at the end of swapper_pg_dir */
+ ttbr = read_sysreg(ttbr1_el1) + SWAPPER_DIR_SIZE;
+ write_sysreg(ttbr, ttbr0_el1);
+ isb();
+}
+
+static inline void __uaccess_ttbr0_enable(void)
+{
+ unsigned long flags;
+
+ /*
+ * Disable interrupts to avoid preemption between reading the 'ttbr0'
+ * variable and the MSR. A context switch could trigger an ASID
+ * roll-over and an update of 'ttbr0'.
+ */
+ local_irq_save(flags);
+ write_sysreg(current_thread_info()->ttbr0, ttbr0_el1);
+ isb();
+ local_irq_restore(flags);
+}
+
+static inline bool uaccess_ttbr0_disable(void)
+{
+ if (!system_uses_ttbr0_pan())
+ return false;
+ __uaccess_ttbr0_disable();
+ return true;
+}
+
+static inline bool uaccess_ttbr0_enable(void)
+{
+ if (!system_uses_ttbr0_pan())
+ return false;
+ __uaccess_ttbr0_enable();
+ return true;
+}
+#else
+static inline bool uaccess_ttbr0_disable(void)
+{
+ return false;
+}
+
+static inline bool uaccess_ttbr0_enable(void)
+{
+ return false;
+}
+#endif
+
+#define __uaccess_disable(alt) \
+do { \
+ if (!uaccess_ttbr0_disable()) \
+ asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), alt, \
+ CONFIG_ARM64_PAN)); \
+} while (0)
+
+#define __uaccess_enable(alt) \
+do { \
+ if (!uaccess_ttbr0_enable()) \
+ asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), alt, \
+ CONFIG_ARM64_PAN)); \
+} while (0)
+
+static inline void uaccess_disable(void)
+{
+ __uaccess_disable(ARM64_HAS_PAN);
+}
+
+static inline void uaccess_enable(void)
+{
+ __uaccess_enable(ARM64_HAS_PAN);
+}
+
+/*
+ * These functions are no-ops when UAO is present.
+ */
+static inline void uaccess_disable_not_uao(void)
+{
+ __uaccess_disable(ARM64_ALT_PAN_NOT_UAO);
+}
+
+static inline void uaccess_enable_not_uao(void)
+{
+ __uaccess_enable(ARM64_ALT_PAN_NOT_UAO);
+}
+
+/*
* The "__xxx" versions of the user access functions do not verify the address
* space - it must have been done previously with a separate "access_ok()"
* call.
@@ -122,9 +229,10 @@ static inline void set_fs(mm_segment_t fs)
* The "__xxx_error" versions set the third argument to -EFAULT if an error
* occurs, and leave it unchanged on success.
*/
-#define __get_user_asm(instr, reg, x, addr, err) \
+#define __get_user_asm(instr, alt_instr, reg, x, addr, err, feature) \
asm volatile( \
- "1: " instr " " reg "1, [%2]\n" \
+ "1:"ALTERNATIVE(instr " " reg "1, [%2]\n", \
+ alt_instr " " reg "1, [%2]\n", feature) \
"2:\n" \
" .section .fixup, \"ax\"\n" \
" .align 2\n" \
@@ -143,27 +251,29 @@ static inline void set_fs(mm_segment_t fs)
do { \
unsigned long __gu_val; \
__chk_user_ptr(ptr); \
- asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \
- CONFIG_ARM64_PAN)); \
+ uaccess_enable_not_uao(); \
switch (sizeof(*(ptr))) { \
case 1: \
- __get_user_asm("ldrb", "%w", __gu_val, (ptr), (err)); \
+ __get_user_asm("ldrb", "ldtrb", "%w", __gu_val, (ptr), \
+ (err), ARM64_HAS_UAO); \
break; \
case 2: \
- __get_user_asm("ldrh", "%w", __gu_val, (ptr), (err)); \
+ __get_user_asm("ldrh", "ldtrh", "%w", __gu_val, (ptr), \
+ (err), ARM64_HAS_UAO); \
break; \
case 4: \
- __get_user_asm("ldr", "%w", __gu_val, (ptr), (err)); \
+ __get_user_asm("ldr", "ldtr", "%w", __gu_val, (ptr), \
+ (err), ARM64_HAS_UAO); \
break; \
case 8: \
- __get_user_asm("ldr", "%", __gu_val, (ptr), (err)); \
+ __get_user_asm("ldr", "ldtr", "%", __gu_val, (ptr), \
+ (err), ARM64_HAS_UAO); \
break; \
default: \
BUILD_BUG(); \
} \
+ uaccess_disable_not_uao(); \
(x) = (__force __typeof__(*(ptr)))__gu_val; \
- asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \
- CONFIG_ARM64_PAN)); \
} while (0)
#define __get_user(x, ptr) \
@@ -190,9 +300,10 @@ do { \
((x) = 0, -EFAULT); \
})
-#define __put_user_asm(instr, reg, x, addr, err) \
+#define __put_user_asm(instr, alt_instr, reg, x, addr, err, feature) \
asm volatile( \
- "1: " instr " " reg "1, [%2]\n" \
+ "1:"ALTERNATIVE(instr " " reg "1, [%2]\n", \
+ alt_instr " " reg "1, [%2]\n", feature) \
"2:\n" \
" .section .fixup,\"ax\"\n" \
" .align 2\n" \
@@ -210,26 +321,28 @@ do { \
do { \
__typeof__(*(ptr)) __pu_val = (x); \
__chk_user_ptr(ptr); \
- asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \
- CONFIG_ARM64_PAN)); \
+ uaccess_enable_not_uao(); \
switch (sizeof(*(ptr))) { \
case 1: \
- __put_user_asm("strb", "%w", __pu_val, (ptr), (err)); \
+ __put_user_asm("strb", "sttrb", "%w", __pu_val, (ptr), \
+ (err), ARM64_HAS_UAO); \
break; \
case 2: \
- __put_user_asm("strh", "%w", __pu_val, (ptr), (err)); \
+ __put_user_asm("strh", "sttrh", "%w", __pu_val, (ptr), \
+ (err), ARM64_HAS_UAO); \
break; \
case 4: \
- __put_user_asm("str", "%w", __pu_val, (ptr), (err)); \
+ __put_user_asm("str", "sttr", "%w", __pu_val, (ptr), \
+ (err), ARM64_HAS_UAO); \
break; \
case 8: \
- __put_user_asm("str", "%", __pu_val, (ptr), (err)); \
+ __put_user_asm("str", "sttr", "%", __pu_val, (ptr), \
+ (err), ARM64_HAS_UAO); \
break; \
default: \
BUILD_BUG(); \
} \
- asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \
- CONFIG_ARM64_PAN)); \
+ uaccess_disable_not_uao(); \
} while (0)
#define __put_user(x, ptr) \
@@ -256,24 +369,39 @@ do { \
-EFAULT; \
})
-extern unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n);
-extern unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n);
+extern unsigned long __must_check __arch_copy_from_user(void *to, const void __user *from, unsigned long n);
+extern unsigned long __must_check __arch_copy_to_user(void __user *to, const void *from, unsigned long n);
extern unsigned long __must_check __copy_in_user(void __user *to, const void __user *from, unsigned long n);
extern unsigned long __must_check __clear_user(void __user *addr, unsigned long n);
+static inline unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n)
+{
+ check_object_size(to, n, false);
+ return __arch_copy_from_user(to, from, n);
+}
+
+static inline unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n)
+{
+ check_object_size(from, n, true);
+ return __arch_copy_to_user(to, from, n);
+}
+
static inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n)
{
- if (access_ok(VERIFY_READ, from, n))
- n = __copy_from_user(to, from, n);
- else /* security hole - plug it */
+ if (access_ok(VERIFY_READ, from, n)) {
+ check_object_size(to, n, false);
+ n = __arch_copy_from_user(to, from, n);
+ } else /* security hole - plug it */
memset(to, 0, n);
return n;
}
static inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n)
{
- if (access_ok(VERIFY_WRITE, to, n))
- n = __copy_to_user(to, from, n);
+ if (access_ok(VERIFY_WRITE, to, n)) {
+ check_object_size(from, n, true);
+ n = __arch_copy_to_user(to, from, n);
+ }
return n;
}
@@ -299,4 +427,66 @@ extern long strncpy_from_user(char *dest, const char __user *src, long count);
extern __must_check long strlen_user(const char __user *str);
extern __must_check long strnlen_user(const char __user *str, long n);
+#else /* __ASSEMBLY__ */
+
+#include <asm/assembler.h>
+
+/*
+ * User access enabling/disabling macros.
+ */
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+ .macro __uaccess_ttbr0_disable, tmp1
+ mrs \tmp1, ttbr1_el1 // swapper_pg_dir
+ add \tmp1, \tmp1, #SWAPPER_DIR_SIZE // reserved_ttbr0 at the end of swapper_pg_dir
+ msr ttbr0_el1, \tmp1 // set reserved TTBR0_EL1
+ isb
+ .endm
+
+ .macro __uaccess_ttbr0_enable, tmp1
+ get_thread_info \tmp1
+ ldr \tmp1, [\tmp1, #TSK_TI_TTBR0] // load saved TTBR0_EL1
+ msr ttbr0_el1, \tmp1 // set the non-PAN TTBR0_EL1
+ isb
+ .endm
+
+ .macro uaccess_ttbr0_disable, tmp1
+alternative_if_not ARM64_HAS_PAN
+ __uaccess_ttbr0_disable \tmp1
+alternative_else_nop_endif
+ .endm
+
+ .macro uaccess_ttbr0_enable, tmp1, tmp2
+alternative_if_not ARM64_HAS_PAN
+ save_and_disable_irq \tmp2 // avoid preemption
+ __uaccess_ttbr0_enable \tmp1
+ restore_irq \tmp2
+alternative_else_nop_endif
+ .endm
+#else
+ .macro uaccess_ttbr0_disable, tmp1
+ .endm
+
+ .macro uaccess_ttbr0_enable, tmp1, tmp2
+ .endm
+#endif
+
+/*
+ * These macros are no-ops when UAO is present.
+ */
+ .macro uaccess_disable_not_uao, tmp1
+ uaccess_ttbr0_disable \tmp1
+alternative_if ARM64_ALT_PAN_NOT_UAO
+ SET_PSTATE_PAN(1)
+alternative_else_nop_endif
+ .endm
+
+ .macro uaccess_enable_not_uao, tmp1, tmp2
+ uaccess_ttbr0_enable \tmp1, \tmp2
+alternative_if ARM64_ALT_PAN_NOT_UAO
+ SET_PSTATE_PAN(0)
+alternative_else_nop_endif
+ .endm
+
+#endif /* __ASSEMBLY__ */
+
#endif /* __ASM_UACCESS_H */
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 6d2bf419431d..49c9aefd24a5 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -31,6 +31,9 @@
* Compat syscall numbers used by the AArch64 kernel.
*/
#define __NR_compat_restart_syscall 0
+#define __NR_compat_exit 1
+#define __NR_compat_read 3
+#define __NR_compat_write 4
#define __NR_compat_sigreturn 119
#define __NR_compat_rt_sigreturn 173
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 9dfdac4a74a1..8893cebcea5b 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -787,7 +787,8 @@ __SYSCALL(__NR_sched_setattr, sys_sched_setattr)
__SYSCALL(__NR_sched_getattr, sys_sched_getattr)
#define __NR_renameat2 382
__SYSCALL(__NR_renameat2, sys_renameat2)
- /* 383 for seccomp */
+#define __NR_seccomp 383
+__SYSCALL(__NR_seccomp, sys_seccomp)
#define __NR_getrandom 384
__SYSCALL(__NR_getrandom, sys_getrandom)
#define __NR_memfd_create 385
diff --git a/arch/arm64/include/asm/vdso_datapage.h b/arch/arm64/include/asm/vdso_datapage.h
index de66199673d7..2b9a63771eda 100644
--- a/arch/arm64/include/asm/vdso_datapage.h
+++ b/arch/arm64/include/asm/vdso_datapage.h
@@ -22,6 +22,8 @@
struct vdso_data {
__u64 cs_cycle_last; /* Timebase at clocksource init */
+ __u64 raw_time_sec; /* Raw time */
+ __u64 raw_time_nsec;
__u64 xtime_clock_sec; /* Kernel time */
__u64 xtime_clock_nsec;
__u64 xtime_coarse_sec; /* Coarse time */
@@ -29,8 +31,10 @@ struct vdso_data {
__u64 wtm_clock_sec; /* Wall to monotonic time */
__u64 wtm_clock_nsec;
__u32 tb_seq_count; /* Timebase sequence counter */
- __u32 cs_mult; /* Clocksource multiplier */
- __u32 cs_shift; /* Clocksource shift */
+ /* cs_* members must be adjacent and in this order (ldp accesses) */
+ __u32 cs_mono_mult; /* NTP-adjusted clocksource multiplier */
+ __u32 cs_shift; /* Clocksource shift (mono = raw) */
+ __u32 cs_raw_mult; /* Raw clocksource multiplier */
__u32 tz_minuteswest; /* Whacky timezone stuff */
__u32 tz_dsttime;
__u32 use_syscall;
diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h
index 208db3df135a..b5c3933ed441 100644
--- a/arch/arm64/include/uapi/asm/ptrace.h
+++ b/arch/arm64/include/uapi/asm/ptrace.h
@@ -45,6 +45,7 @@
#define PSR_A_BIT 0x00000100
#define PSR_D_BIT 0x00000200
#define PSR_PAN_BIT 0x00400000
+#define PSR_UAO_BIT 0x00800000
#define PSR_Q_BIT 0x08000000
#define PSR_V_BIT 0x10000000
#define PSR_C_BIT 0x20000000
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index fd65134d1053..29065295fbdd 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -14,18 +14,19 @@ CFLAGS_REMOVE_insn.o = -pg
CFLAGS_REMOVE_return_address.o = -pg
# Object file lists.
-arm64-obj-y := cputable.o debug-monitors.o entry.o irq.o fpsimd.o \
+arm64-obj-y := debug-monitors.o entry.o irq.o fpsimd.o \
entry-fpsimd.o process.o ptrace.o setup.o signal.o \
sys.o stacktrace.o time.o traps.o io.o vdso.o \
hyp-stub.o psci.o psci-call.o cpu_ops.o insn.o \
- return_address.o cpuinfo.o cpu_errata.o cpufeature.o alternative.o
+ return_address.o cpuinfo.o cpu_errata.o \
+ cpufeature.o alternative.o \
+ smp.o smp_spin_table.o topology.o
arm64-obj-$(CONFIG_COMPAT) += sys32.o kuser32.o signal32.o \
- sys_compat.o \
+ sys_compat.o \
../../arm/kernel/opcodes.o
arm64-obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o entry-ftrace.o
arm64-obj-$(CONFIG_MODULES) += arm64ksyms.o module.o
-arm64-obj-$(CONFIG_SMP) += smp.o smp_spin_table.o topology.o
arm64-obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
arm64-obj-$(CONFIG_HW_PERF_EVENTS) += perf_event.o
arm64-obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
@@ -41,7 +42,3 @@ obj-y += $(arm64-obj-y) vdso/
obj-m += $(arm64-obj-m)
head-y := head.o
extra-y := $(head-y) vmlinux.lds
-
-# vDSO - this must be built first to generate the symbol offsets
-$(call objectify,$(arm64-obj-y)): $(obj)/vdso/vdso-offsets.h
-$(obj)/vdso/vdso-offsets.h: $(obj)/vdso
diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c
index 1a3badab800a..392c9193da54 100644
--- a/arch/arm64/kernel/alternative.c
+++ b/arch/arm64/kernel/alternative.c
@@ -24,16 +24,77 @@
#include <asm/cacheflush.h>
#include <asm/alternative.h>
#include <asm/cpufeature.h>
+#include <asm/insn.h>
#include <linux/stop_machine.h>
+#define __ALT_PTR(a,f) (u32 *)((void *)&(a)->f + (a)->f)
+#define ALT_ORIG_PTR(a) __ALT_PTR(a, orig_offset)
+#define ALT_REPL_PTR(a) __ALT_PTR(a, alt_offset)
+
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
-static int __apply_alternatives(void *dummy)
+struct alt_region {
+ struct alt_instr *begin;
+ struct alt_instr *end;
+};
+
+/*
+ * Check if the target PC is within an alternative block.
+ */
+static bool branch_insn_requires_update(struct alt_instr *alt, unsigned long pc)
+{
+ unsigned long replptr;
+
+ if (kernel_text_address(pc))
+ return 1;
+
+ replptr = (unsigned long)ALT_REPL_PTR(alt);
+ if (pc >= replptr && pc <= (replptr + alt->alt_len))
+ return 0;
+
+ /*
+ * Branching into *another* alternate sequence is doomed, and
+ * we're not even trying to fix it up.
+ */
+ BUG();
+}
+
+static u32 get_alt_insn(struct alt_instr *alt, u32 *insnptr, u32 *altinsnptr)
+{
+ u32 insn;
+
+ insn = le32_to_cpu(*altinsnptr);
+
+ if (aarch64_insn_is_branch_imm(insn)) {
+ s32 offset = aarch64_get_branch_offset(insn);
+ unsigned long target;
+
+ target = (unsigned long)altinsnptr + offset;
+
+ /*
+ * If we're branching inside the alternate sequence,
+ * do not rewrite the instruction, as it is already
+ * correct. Otherwise, generate the new instruction.
+ */
+ if (branch_insn_requires_update(alt, target)) {
+ offset = target - (unsigned long)insnptr;
+ insn = aarch64_set_branch_offset(insn, offset);
+ }
+ }
+
+ return insn;
+}
+
+static void __apply_alternatives(void *alt_region)
{
struct alt_instr *alt;
- u8 *origptr, *replptr;
+ struct alt_region *region = alt_region;
+ u32 *origptr, *replptr;
+
+ for (alt = region->begin; alt < region->end; alt++) {
+ u32 insn;
+ int i, nr_inst;
- for (alt = __alt_instructions; alt < __alt_instructions_end; alt++) {
if (!cpus_have_cap(alt->cpufeature))
continue;
@@ -41,20 +102,60 @@ static int __apply_alternatives(void *dummy)
pr_info_once("patching kernel code\n");
- origptr = (u8 *)&alt->orig_offset + alt->orig_offset;
- replptr = (u8 *)&alt->alt_offset + alt->alt_offset;
- memcpy(origptr, replptr, alt->alt_len);
+ origptr = ALT_ORIG_PTR(alt);
+ replptr = ALT_REPL_PTR(alt);
+ nr_inst = alt->alt_len / sizeof(insn);
+
+ for (i = 0; i < nr_inst; i++) {
+ insn = get_alt_insn(alt, origptr + i, replptr + i);
+ *(origptr + i) = cpu_to_le32(insn);
+ }
+
flush_icache_range((uintptr_t)origptr,
- (uintptr_t)(origptr + alt->alt_len));
+ (uintptr_t)(origptr + nr_inst));
+ }
+}
+
+/*
+ * We might be patching the stop_machine state machine, so implement a
+ * really simple polling protocol here.
+ */
+static int __apply_alternatives_multi_stop(void *unused)
+{
+ static int patched = 0;
+ struct alt_region region = {
+ .begin = __alt_instructions,
+ .end = __alt_instructions_end,
+ };
+
+ /* We always have a CPU 0 at this point (__init) */
+ if (smp_processor_id()) {
+ while (!READ_ONCE(patched))
+ cpu_relax();
+ } else {
+ BUG_ON(patched);
+ __apply_alternatives(&region);
+ /* Barriers provided by the cache flushing */
+ WRITE_ONCE(patched, 1);
}
return 0;
}
-void apply_alternatives(void)
+void __init apply_alternatives_all(void)
{
/* better not try code patching on a live SMP system */
- stop_machine(__apply_alternatives, NULL, NULL);
+ stop_machine(__apply_alternatives_multi_stop, NULL, cpu_online_mask);
+}
+
+void apply_alternatives(void *start, size_t length)
+{
+ struct alt_region region = {
+ .begin = start,
+ .end = start + length,
+ };
+
+ __apply_alternatives(&region);
}
void free_alternatives_memory(void)
diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c
index 3b6d8cc9dfe0..c654df05b7d7 100644
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -33,8 +33,8 @@ EXPORT_SYMBOL(copy_page);
EXPORT_SYMBOL(clear_page);
/* user mem (segment) */
-EXPORT_SYMBOL(__copy_from_user);
-EXPORT_SYMBOL(__copy_to_user);
+EXPORT_SYMBOL(__arch_copy_from_user);
+EXPORT_SYMBOL(__arch_copy_to_user);
EXPORT_SYMBOL(__clear_user);
EXPORT_SYMBOL(__copy_in_user);
diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c
index bcee7abac68e..61668a44666d 100644
--- a/arch/arm64/kernel/armv8_deprecated.c
+++ b/arch/arm64/kernel/armv8_deprecated.c
@@ -14,7 +14,6 @@
#include <linux/slab.h>
#include <linux/sysctl.h>
-#include <asm/alternative.h>
#include <asm/cpufeature.h>
#include <asm/insn.h>
#include <asm/opcodes.h>
@@ -62,7 +61,7 @@ struct insn_emulation {
};
static LIST_HEAD(insn_emulation);
-static int nr_insn_emulated;
+static int nr_insn_emulated __initdata;
static DEFINE_RAW_SPINLOCK(insn_emulation_lock);
static void register_emulation_hooks(struct insn_emulation_ops *ops)
@@ -173,7 +172,7 @@ static int update_insn_emulation_mode(struct insn_emulation *insn,
return ret;
}
-static void register_insn_emulation(struct insn_emulation_ops *ops)
+static void __init register_insn_emulation(struct insn_emulation_ops *ops)
{
unsigned long flags;
struct insn_emulation *insn;
@@ -237,7 +236,7 @@ static struct ctl_table ctl_abi[] = {
{ }
};
-static void register_insn_emulation_sysctl(struct ctl_table *table)
+static void __init register_insn_emulation_sysctl(struct ctl_table *table)
{
unsigned long flags;
int i = 0;
@@ -281,9 +280,9 @@ static void register_insn_emulation_sysctl(struct ctl_table *table)
* Error-checking SWP macros implemented using ldxr{b}/stxr{b}
*/
#define __user_swpX_asm(data, addr, res, temp, B) \
+do { \
+ uaccess_enable(); \
__asm__ __volatile__( \
- ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \
- CONFIG_ARM64_PAN) \
" mov %w2, %w1\n" \
"0: ldxr"B" %w1, [%3]\n" \
"1: stxr"B" %w0, %w2, [%3]\n" \
@@ -300,11 +299,11 @@ static void register_insn_emulation_sysctl(struct ctl_table *table)
" .quad 0b, 3b\n" \
" .quad 1b, 3b\n" \
" .popsection\n" \
- ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \
- CONFIG_ARM64_PAN) \
: "=&r" (res), "+r" (data), "=&r" (temp) \
: "r" (addr), "i" (-EAGAIN), "i" (-EFAULT) \
- : "memory")
+ : "memory"); \
+ uaccess_disable(); \
+} while (0)
#define __user_swp_asm(data, addr, res, temp) \
__user_swpX_asm(data, addr, res, temp, "")
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index 9a9fce090d58..92a5d25bff1a 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -24,7 +24,6 @@
#include <linux/kvm_host.h>
#include <asm/thread_info.h>
#include <asm/memory.h>
-#include <asm/cputable.h>
#include <asm/smp_plat.h>
#include <asm/suspend.h>
#include <asm/vdso_datapage.h>
@@ -40,6 +39,9 @@ int main(void)
DEFINE(TI_TASK, offsetof(struct thread_info, task));
DEFINE(TI_EXEC_DOMAIN, offsetof(struct thread_info, exec_domain));
DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+ DEFINE(TSK_TI_TTBR0, offsetof(struct thread_info, ttbr0));
+#endif
BLANK();
DEFINE(THREAD_CPU_CONTEXT, offsetof(struct task_struct, thread.cpu_context));
BLANK();
@@ -60,9 +62,10 @@ int main(void)
DEFINE(S_PC, offsetof(struct pt_regs, pc));
DEFINE(S_ORIG_X0, offsetof(struct pt_regs, orig_x0));
DEFINE(S_SYSCALLNO, offsetof(struct pt_regs, syscallno));
+ DEFINE(S_ORIG_ADDR_LIMIT, offsetof(struct pt_regs, orig_addr_limit));
DEFINE(S_FRAME_SIZE, sizeof(struct pt_regs));
BLANK();
- DEFINE(MM_CONTEXT_ID, offsetof(struct mm_struct, context.id));
+ DEFINE(MM_CONTEXT_ID, offsetof(struct mm_struct, context.id.counter));
BLANK();
DEFINE(VMA_VM_MM, offsetof(struct vm_area_struct, vm_mm));
DEFINE(VMA_VM_FLAGS, offsetof(struct vm_area_struct, vm_flags));
@@ -71,15 +74,13 @@ int main(void)
BLANK();
DEFINE(PAGE_SZ, PAGE_SIZE);
BLANK();
- DEFINE(CPU_INFO_SZ, sizeof(struct cpu_info));
- DEFINE(CPU_INFO_SETUP, offsetof(struct cpu_info, cpu_setup));
- BLANK();
DEFINE(DMA_BIDIRECTIONAL, DMA_BIDIRECTIONAL);
DEFINE(DMA_TO_DEVICE, DMA_TO_DEVICE);
DEFINE(DMA_FROM_DEVICE, DMA_FROM_DEVICE);
BLANK();
DEFINE(CLOCK_REALTIME, CLOCK_REALTIME);
DEFINE(CLOCK_MONOTONIC, CLOCK_MONOTONIC);
+ DEFINE(CLOCK_MONOTONIC_RAW, CLOCK_MONOTONIC_RAW);
DEFINE(CLOCK_REALTIME_RES, MONOTONIC_RES_NSEC);
DEFINE(CLOCK_REALTIME_COARSE, CLOCK_REALTIME_COARSE);
DEFINE(CLOCK_MONOTONIC_COARSE,CLOCK_MONOTONIC_COARSE);
@@ -87,6 +88,8 @@ int main(void)
DEFINE(NSEC_PER_SEC, NSEC_PER_SEC);
BLANK();
DEFINE(VDSO_CS_CYCLE_LAST, offsetof(struct vdso_data, cs_cycle_last));
+ DEFINE(VDSO_RAW_TIME_SEC, offsetof(struct vdso_data, raw_time_sec));
+ DEFINE(VDSO_RAW_TIME_NSEC, offsetof(struct vdso_data, raw_time_nsec));
DEFINE(VDSO_XTIME_CLK_SEC, offsetof(struct vdso_data, xtime_clock_sec));
DEFINE(VDSO_XTIME_CLK_NSEC, offsetof(struct vdso_data, xtime_clock_nsec));
DEFINE(VDSO_XTIME_CRS_SEC, offsetof(struct vdso_data, xtime_coarse_sec));
@@ -94,7 +97,8 @@ int main(void)
DEFINE(VDSO_WTM_CLK_SEC, offsetof(struct vdso_data, wtm_clock_sec));
DEFINE(VDSO_WTM_CLK_NSEC, offsetof(struct vdso_data, wtm_clock_nsec));
DEFINE(VDSO_TB_SEQ_COUNT, offsetof(struct vdso_data, tb_seq_count));
- DEFINE(VDSO_CS_MULT, offsetof(struct vdso_data, cs_mult));
+ DEFINE(VDSO_CS_MONO_MULT, offsetof(struct vdso_data, cs_mono_mult));
+ DEFINE(VDSO_CS_RAW_MULT, offsetof(struct vdso_data, cs_raw_mult));
DEFINE(VDSO_CS_SHIFT, offsetof(struct vdso_data, cs_shift));
DEFINE(VDSO_TZ_MINWEST, offsetof(struct vdso_data, tz_minuteswest));
DEFINE(VDSO_TZ_DSTTIME, offsetof(struct vdso_data, tz_dsttime));
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 6ffd91438560..09eab326ef93 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -88,5 +88,5 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
void check_local_cpu_errata(void)
{
- check_cpu_capabilities(arm64_errata, "enabling workaround for");
+ update_cpu_capabilities(arm64_errata, "enabling workaround for");
}
diff --git a/arch/arm64/kernel/cpu_ops.c b/arch/arm64/kernel/cpu_ops.c
index cce952440c64..d63576c0a17a 100644
--- a/arch/arm64/kernel/cpu_ops.c
+++ b/arch/arm64/kernel/cpu_ops.c
@@ -28,9 +28,7 @@ extern const struct cpu_operations cpu_psci_ops;
const struct cpu_operations *cpu_ops[NR_CPUS];
static const struct cpu_operations *supported_cpu_ops[] __initconst = {
-#ifdef CONFIG_SMP
&smp_spin_table_ops,
-#endif
&cpu_psci_ops,
NULL,
};
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 978fa169d3c3..58347534d765 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -16,36 +16,604 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-#define pr_fmt(fmt) "alternatives: " fmt
+#define pr_fmt(fmt) "CPU features: " fmt
+#include <linux/bsearch.h>
+#include <linux/sort.h>
#include <linux/types.h>
#include <asm/cpu.h>
#include <asm/cpufeature.h>
+#include <asm/cpu_ops.h>
#include <asm/processor.h>
+#include <asm/sysreg.h>
-static bool
-feature_matches(u64 reg, const struct arm64_cpu_capabilities *entry)
+unsigned long elf_hwcap __read_mostly;
+EXPORT_SYMBOL_GPL(elf_hwcap);
+
+#ifdef CONFIG_COMPAT
+#define COMPAT_ELF_HWCAP_DEFAULT \
+ (COMPAT_HWCAP_HALF|COMPAT_HWCAP_THUMB|\
+ COMPAT_HWCAP_FAST_MULT|COMPAT_HWCAP_EDSP|\
+ COMPAT_HWCAP_TLS|COMPAT_HWCAP_VFP|\
+ COMPAT_HWCAP_VFPv3|COMPAT_HWCAP_VFPv4|\
+ COMPAT_HWCAP_NEON|COMPAT_HWCAP_IDIV|\
+ COMPAT_HWCAP_LPAE)
+unsigned int compat_elf_hwcap __read_mostly = COMPAT_ELF_HWCAP_DEFAULT;
+unsigned int compat_elf_hwcap2 __read_mostly;
+#endif
+
+DECLARE_BITMAP(cpu_hwcaps, ARM64_NCAPS);
+EXPORT_SYMBOL(cpu_hwcaps);
+
+#define __ARM64_FTR_BITS(SIGNED, STRICT, TYPE, SHIFT, WIDTH, SAFE_VAL) \
+ { \
+ .sign = SIGNED, \
+ .strict = STRICT, \
+ .type = TYPE, \
+ .shift = SHIFT, \
+ .width = WIDTH, \
+ .safe_val = SAFE_VAL, \
+ }
+
+/* Define a feature with signed values */
+#define ARM64_FTR_BITS(STRICT, TYPE, SHIFT, WIDTH, SAFE_VAL) \
+ __ARM64_FTR_BITS(FTR_SIGNED, STRICT, TYPE, SHIFT, WIDTH, SAFE_VAL)
+
+/* Define a feature with unsigned value */
+#define U_ARM64_FTR_BITS(STRICT, TYPE, SHIFT, WIDTH, SAFE_VAL) \
+ __ARM64_FTR_BITS(FTR_UNSIGNED, STRICT, TYPE, SHIFT, WIDTH, SAFE_VAL)
+
+#define ARM64_FTR_END \
+ { \
+ .width = 0, \
+ }
+
+/* meta feature for alternatives */
+static bool __maybe_unused
+cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry);
+
+static struct arm64_ftr_bits ftr_id_aa64isar0[] = {
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 32, 32, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64ISAR0_RDM_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 24, 4, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_CRC32_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_SHA2_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_SHA1_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_AES_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 0, 4, 0), /* RAZ */
+ ARM64_FTR_END,
+};
+
+static struct arm64_ftr_bits ftr_id_aa64pfr0[] = {
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 32, 32, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 28, 4, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64PFR0_GIC_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_ASIMD_SHIFT, 4, ID_AA64PFR0_ASIMD_NI),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_FP_SHIFT, 4, ID_AA64PFR0_FP_NI),
+ /* Linux doesn't care about the EL3 */
+ ARM64_FTR_BITS(FTR_NONSTRICT, FTR_EXACT, ID_AA64PFR0_EL3_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64PFR0_EL2_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64PFR0_EL1_SHIFT, 4, ID_AA64PFR0_EL1_64BIT_ONLY),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64PFR0_EL0_SHIFT, 4, ID_AA64PFR0_EL0_64BIT_ONLY),
+ ARM64_FTR_END,
+};
+
+static struct arm64_ftr_bits ftr_id_aa64mmfr0[] = {
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 32, 32, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64MMFR0_TGRAN4_SHIFT, 4, ID_AA64MMFR0_TGRAN4_NI),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64MMFR0_TGRAN64_SHIFT, 4, ID_AA64MMFR0_TGRAN64_NI),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64MMFR0_TGRAN16_SHIFT, 4, ID_AA64MMFR0_TGRAN16_NI),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64MMFR0_BIGENDEL0_SHIFT, 4, 0),
+ /* Linux shouldn't care about secure memory */
+ ARM64_FTR_BITS(FTR_NONSTRICT, FTR_EXACT, ID_AA64MMFR0_SNSMEM_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64MMFR0_BIGENDEL_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64MMFR0_ASID_SHIFT, 4, 0),
+ /*
+ * Differing PARange is fine as long as all peripherals and memory are mapped
+ * within the minimum PARange of all CPUs
+ */
+ U_ARM64_FTR_BITS(FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_PARANGE_SHIFT, 4, 0),
+ ARM64_FTR_END,
+};
+
+static struct arm64_ftr_bits ftr_id_aa64mmfr1[] = {
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 32, 32, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_PAN_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64MMFR1_LOR_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64MMFR1_HPD_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64MMFR1_VHE_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64MMFR1_VMIDBITS_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64MMFR1_HADBS_SHIFT, 4, 0),
+ ARM64_FTR_END,
+};
+
+static struct arm64_ftr_bits ftr_id_aa64mmfr2[] = {
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64MMFR2_UAO_SHIFT, 4, 0),
+ ARM64_FTR_END,
+};
+
+static struct arm64_ftr_bits ftr_ctr[] = {
+ U_ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 31, 1, 1), /* RAO */
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 28, 3, 0),
+ U_ARM64_FTR_BITS(FTR_STRICT, FTR_HIGHER_SAFE, 24, 4, 0), /* CWG */
+ U_ARM64_FTR_BITS(FTR_STRICT, FTR_LOWER_SAFE, 20, 4, 0), /* ERG */
+ U_ARM64_FTR_BITS(FTR_STRICT, FTR_LOWER_SAFE, 16, 4, 1), /* DminLine */
+ /*
+ * Linux can handle differing I-cache policies. Userspace JITs will
+ * make use of *minLine
+ */
+ U_ARM64_FTR_BITS(FTR_NONSTRICT, FTR_EXACT, 14, 2, 0), /* L1Ip */
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 4, 10, 0), /* RAZ */
+ U_ARM64_FTR_BITS(FTR_STRICT, FTR_LOWER_SAFE, 0, 4, 0), /* IminLine */
+ ARM64_FTR_END,
+};
+
+static struct arm64_ftr_bits ftr_id_mmfr0[] = {
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 28, 4, 0), /* InnerShr */
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 24, 4, 0), /* FCSE */
+ ARM64_FTR_BITS(FTR_NONSTRICT, FTR_LOWER_SAFE, 20, 4, 0), /* AuxReg */
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 16, 4, 0), /* TCM */
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 12, 4, 0), /* ShareLvl */
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 8, 4, 0), /* OuterShr */
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 4, 4, 0), /* PMSA */
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 0, 4, 0), /* VMSA */
+ ARM64_FTR_END,
+};
+
+static struct arm64_ftr_bits ftr_id_aa64dfr0[] = {
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 32, 32, 0),
+ U_ARM64_FTR_BITS(FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_CTX_CMPS_SHIFT, 4, 0),
+ U_ARM64_FTR_BITS(FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_WRPS_SHIFT, 4, 0),
+ U_ARM64_FTR_BITS(FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_BRPS_SHIFT, 4, 0),
+ U_ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64DFR0_PMUVER_SHIFT, 4, 0),
+ U_ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64DFR0_TRACEVER_SHIFT, 4, 0),
+ U_ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64DFR0_DEBUGVER_SHIFT, 4, 0x6),
+ ARM64_FTR_END,
+};
+
+static struct arm64_ftr_bits ftr_mvfr2[] = {
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 8, 24, 0), /* RAZ */
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 4, 4, 0), /* FPMisc */
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 0, 4, 0), /* SIMDMisc */
+ ARM64_FTR_END,
+};
+
+static struct arm64_ftr_bits ftr_dczid[] = {
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 5, 27, 0), /* RAZ */
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 4, 1, 1), /* DZP */
+ ARM64_FTR_BITS(FTR_STRICT, FTR_LOWER_SAFE, 0, 4, 0), /* BS */
+ ARM64_FTR_END,
+};
+
+
+static struct arm64_ftr_bits ftr_id_isar5[] = {
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_ISAR5_RDM_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 20, 4, 0), /* RAZ */
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_ISAR5_CRC32_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_ISAR5_SHA2_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_ISAR5_SHA1_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_ISAR5_AES_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_ISAR5_SEVL_SHIFT, 4, 0),
+ ARM64_FTR_END,
+};
+
+static struct arm64_ftr_bits ftr_id_mmfr4[] = {
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 8, 24, 0), /* RAZ */
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 4, 4, 0), /* ac2 */
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 0, 4, 0), /* RAZ */
+ ARM64_FTR_END,
+};
+
+static struct arm64_ftr_bits ftr_id_pfr0[] = {
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 16, 16, 0), /* RAZ */
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 12, 4, 0), /* State3 */
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 8, 4, 0), /* State2 */
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 4, 4, 0), /* State1 */
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 0, 4, 0), /* State0 */
+ ARM64_FTR_END,
+};
+
+/*
+ * Common ftr bits for a 32bit register with all hidden, strict
+ * attributes, with 4bit feature fields and a default safe value of
+ * 0. Covers the following 32bit registers:
+ * id_isar[0-4], id_mmfr[1-3], id_pfr1, mvfr[0-1]
+ */
+static struct arm64_ftr_bits ftr_generic_32bits[] = {
+ ARM64_FTR_BITS(FTR_STRICT, FTR_LOWER_SAFE, 28, 4, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_LOWER_SAFE, 24, 4, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_LOWER_SAFE, 20, 4, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_LOWER_SAFE, 16, 4, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_LOWER_SAFE, 12, 4, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_LOWER_SAFE, 8, 4, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_LOWER_SAFE, 4, 4, 0),
+ ARM64_FTR_BITS(FTR_STRICT, FTR_LOWER_SAFE, 0, 4, 0),
+ ARM64_FTR_END,
+};
+
+static struct arm64_ftr_bits ftr_generic[] = {
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 0, 64, 0),
+ ARM64_FTR_END,
+};
+
+static struct arm64_ftr_bits ftr_generic32[] = {
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 0, 32, 0),
+ ARM64_FTR_END,
+};
+
+static struct arm64_ftr_bits ftr_aa64raz[] = {
+ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 0, 64, 0),
+ ARM64_FTR_END,
+};
+
+#define ARM64_FTR_REG(id, table) \
+ { \
+ .sys_id = id, \
+ .name = #id, \
+ .ftr_bits = &((table)[0]), \
+ }
+
+static struct arm64_ftr_reg arm64_ftr_regs[] = {
+
+ /* Op1 = 0, CRn = 0, CRm = 1 */
+ ARM64_FTR_REG(SYS_ID_PFR0_EL1, ftr_id_pfr0),
+ ARM64_FTR_REG(SYS_ID_PFR1_EL1, ftr_generic_32bits),
+ ARM64_FTR_REG(SYS_ID_DFR0_EL1, ftr_generic_32bits),
+ ARM64_FTR_REG(SYS_ID_MMFR0_EL1, ftr_id_mmfr0),
+ ARM64_FTR_REG(SYS_ID_MMFR1_EL1, ftr_generic_32bits),
+ ARM64_FTR_REG(SYS_ID_MMFR2_EL1, ftr_generic_32bits),
+ ARM64_FTR_REG(SYS_ID_MMFR3_EL1, ftr_generic_32bits),
+
+ /* Op1 = 0, CRn = 0, CRm = 2 */
+ ARM64_FTR_REG(SYS_ID_ISAR0_EL1, ftr_generic_32bits),
+ ARM64_FTR_REG(SYS_ID_ISAR1_EL1, ftr_generic_32bits),
+ ARM64_FTR_REG(SYS_ID_ISAR2_EL1, ftr_generic_32bits),
+ ARM64_FTR_REG(SYS_ID_ISAR3_EL1, ftr_generic_32bits),
+ ARM64_FTR_REG(SYS_ID_ISAR4_EL1, ftr_generic_32bits),
+ ARM64_FTR_REG(SYS_ID_ISAR5_EL1, ftr_id_isar5),
+ ARM64_FTR_REG(SYS_ID_MMFR4_EL1, ftr_id_mmfr4),
+
+ /* Op1 = 0, CRn = 0, CRm = 3 */
+ ARM64_FTR_REG(SYS_MVFR0_EL1, ftr_generic_32bits),
+ ARM64_FTR_REG(SYS_MVFR1_EL1, ftr_generic_32bits),
+ ARM64_FTR_REG(SYS_MVFR2_EL1, ftr_mvfr2),
+
+ /* Op1 = 0, CRn = 0, CRm = 4 */
+ ARM64_FTR_REG(SYS_ID_AA64PFR0_EL1, ftr_id_aa64pfr0),
+ ARM64_FTR_REG(SYS_ID_AA64PFR1_EL1, ftr_aa64raz),
+
+ /* Op1 = 0, CRn = 0, CRm = 5 */
+ ARM64_FTR_REG(SYS_ID_AA64DFR0_EL1, ftr_id_aa64dfr0),
+ ARM64_FTR_REG(SYS_ID_AA64DFR1_EL1, ftr_generic),
+
+ /* Op1 = 0, CRn = 0, CRm = 6 */
+ ARM64_FTR_REG(SYS_ID_AA64ISAR0_EL1, ftr_id_aa64isar0),
+ ARM64_FTR_REG(SYS_ID_AA64ISAR1_EL1, ftr_aa64raz),
+
+ /* Op1 = 0, CRn = 0, CRm = 7 */
+ ARM64_FTR_REG(SYS_ID_AA64MMFR0_EL1, ftr_id_aa64mmfr0),
+ ARM64_FTR_REG(SYS_ID_AA64MMFR1_EL1, ftr_id_aa64mmfr1),
+ ARM64_FTR_REG(SYS_ID_AA64MMFR2_EL1, ftr_id_aa64mmfr2),
+
+ /* Op1 = 3, CRn = 0, CRm = 0 */
+ ARM64_FTR_REG(SYS_CTR_EL0, ftr_ctr),
+ ARM64_FTR_REG(SYS_DCZID_EL0, ftr_dczid),
+
+ /* Op1 = 3, CRn = 14, CRm = 0 */
+ ARM64_FTR_REG(SYS_CNTFRQ_EL0, ftr_generic32),
+};
+
+static int search_cmp_ftr_reg(const void *id, const void *regp)
{
- int val = cpuid_feature_extract_field(reg, entry->field_pos);
+ return (int)(unsigned long)id - (int)((const struct arm64_ftr_reg *)regp)->sys_id;
+}
- return val >= entry->min_field_value;
+/*
+ * get_arm64_ftr_reg - Lookup a feature register entry using its
+ * sys_reg() encoding. With the array arm64_ftr_regs sorted in the
+ * ascending order of sys_id , we use binary search to find a matching
+ * entry.
+ *
+ * returns - Upon success, matching ftr_reg entry for id.
+ * - NULL on failure. It is upto the caller to decide
+ * the impact of a failure.
+ */
+static struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id)
+{
+ return bsearch((const void *)(unsigned long)sys_id,
+ arm64_ftr_regs,
+ ARRAY_SIZE(arm64_ftr_regs),
+ sizeof(arm64_ftr_regs[0]),
+ search_cmp_ftr_reg);
+}
+
+static u64 arm64_ftr_set_value(struct arm64_ftr_bits *ftrp, s64 reg, s64 ftr_val)
+{
+ u64 mask = arm64_ftr_mask(ftrp);
+
+ reg &= ~mask;
+ reg |= (ftr_val << ftrp->shift) & mask;
+ return reg;
+}
+
+static s64 arm64_ftr_safe_value(struct arm64_ftr_bits *ftrp, s64 new, s64 cur)
+{
+ s64 ret = 0;
+
+ switch (ftrp->type) {
+ case FTR_EXACT:
+ ret = ftrp->safe_val;
+ break;
+ case FTR_LOWER_SAFE:
+ ret = new < cur ? new : cur;
+ break;
+ case FTR_HIGHER_SAFE:
+ ret = new > cur ? new : cur;
+ break;
+ default:
+ BUG();
+ }
+
+ return ret;
+}
+
+static int __init sort_cmp_ftr_regs(const void *a, const void *b)
+{
+ return ((const struct arm64_ftr_reg *)a)->sys_id -
+ ((const struct arm64_ftr_reg *)b)->sys_id;
+}
+
+static void __init swap_ftr_regs(void *a, void *b, int size)
+{
+ struct arm64_ftr_reg tmp = *(struct arm64_ftr_reg *)a;
+ *(struct arm64_ftr_reg *)a = *(struct arm64_ftr_reg *)b;
+ *(struct arm64_ftr_reg *)b = tmp;
+}
+
+static void __init sort_ftr_regs(void)
+{
+ /* Keep the array sorted so that we can do the binary search */
+ sort(arm64_ftr_regs,
+ ARRAY_SIZE(arm64_ftr_regs),
+ sizeof(arm64_ftr_regs[0]),
+ sort_cmp_ftr_regs,
+ swap_ftr_regs);
+}
+
+/*
+ * Initialise the CPU feature register from Boot CPU values.
+ * Also initiliases the strict_mask for the register.
+ */
+static void __init init_cpu_ftr_reg(u32 sys_reg, u64 new)
+{
+ u64 val = 0;
+ u64 strict_mask = ~0x0ULL;
+ struct arm64_ftr_bits *ftrp;
+ struct arm64_ftr_reg *reg = get_arm64_ftr_reg(sys_reg);
+
+ BUG_ON(!reg);
+
+ for (ftrp = reg->ftr_bits; ftrp->width; ftrp++) {
+ s64 ftr_new = arm64_ftr_value(ftrp, new);
+
+ val = arm64_ftr_set_value(ftrp, val, ftr_new);
+ if (!ftrp->strict)
+ strict_mask &= ~arm64_ftr_mask(ftrp);
+ }
+ reg->sys_val = val;
+ reg->strict_mask = strict_mask;
+}
+
+void __init init_cpu_features(struct cpuinfo_arm64 *info)
+{
+ /* Before we start using the tables, make sure it is sorted */
+ sort_ftr_regs();
+
+ init_cpu_ftr_reg(SYS_CTR_EL0, info->reg_ctr);
+ init_cpu_ftr_reg(SYS_DCZID_EL0, info->reg_dczid);
+ init_cpu_ftr_reg(SYS_CNTFRQ_EL0, info->reg_cntfrq);
+ init_cpu_ftr_reg(SYS_ID_AA64DFR0_EL1, info->reg_id_aa64dfr0);
+ init_cpu_ftr_reg(SYS_ID_AA64DFR1_EL1, info->reg_id_aa64dfr1);
+ init_cpu_ftr_reg(SYS_ID_AA64ISAR0_EL1, info->reg_id_aa64isar0);
+ init_cpu_ftr_reg(SYS_ID_AA64ISAR1_EL1, info->reg_id_aa64isar1);
+ init_cpu_ftr_reg(SYS_ID_AA64MMFR0_EL1, info->reg_id_aa64mmfr0);
+ init_cpu_ftr_reg(SYS_ID_AA64MMFR1_EL1, info->reg_id_aa64mmfr1);
+ init_cpu_ftr_reg(SYS_ID_AA64MMFR2_EL1, info->reg_id_aa64mmfr2);
+ init_cpu_ftr_reg(SYS_ID_AA64PFR0_EL1, info->reg_id_aa64pfr0);
+ init_cpu_ftr_reg(SYS_ID_AA64PFR1_EL1, info->reg_id_aa64pfr1);
+ init_cpu_ftr_reg(SYS_ID_DFR0_EL1, info->reg_id_dfr0);
+ init_cpu_ftr_reg(SYS_ID_ISAR0_EL1, info->reg_id_isar0);
+ init_cpu_ftr_reg(SYS_ID_ISAR1_EL1, info->reg_id_isar1);
+ init_cpu_ftr_reg(SYS_ID_ISAR2_EL1, info->reg_id_isar2);
+ init_cpu_ftr_reg(SYS_ID_ISAR3_EL1, info->reg_id_isar3);
+ init_cpu_ftr_reg(SYS_ID_ISAR4_EL1, info->reg_id_isar4);
+ init_cpu_ftr_reg(SYS_ID_ISAR5_EL1, info->reg_id_isar5);
+ init_cpu_ftr_reg(SYS_ID_MMFR0_EL1, info->reg_id_mmfr0);
+ init_cpu_ftr_reg(SYS_ID_MMFR1_EL1, info->reg_id_mmfr1);
+ init_cpu_ftr_reg(SYS_ID_MMFR2_EL1, info->reg_id_mmfr2);
+ init_cpu_ftr_reg(SYS_ID_MMFR3_EL1, info->reg_id_mmfr3);
+ init_cpu_ftr_reg(SYS_ID_PFR0_EL1, info->reg_id_pfr0);
+ init_cpu_ftr_reg(SYS_ID_PFR1_EL1, info->reg_id_pfr1);
+ init_cpu_ftr_reg(SYS_MVFR0_EL1, info->reg_mvfr0);
+ init_cpu_ftr_reg(SYS_MVFR1_EL1, info->reg_mvfr1);
+ init_cpu_ftr_reg(SYS_MVFR2_EL1, info->reg_mvfr2);
+}
+
+static void update_cpu_ftr_reg(struct arm64_ftr_reg *reg, u64 new)
+{
+ struct arm64_ftr_bits *ftrp;
+
+ for (ftrp = reg->ftr_bits; ftrp->width; ftrp++) {
+ s64 ftr_cur = arm64_ftr_value(ftrp, reg->sys_val);
+ s64 ftr_new = arm64_ftr_value(ftrp, new);
+
+ if (ftr_cur == ftr_new)
+ continue;
+ /* Find a safe value */
+ ftr_new = arm64_ftr_safe_value(ftrp, ftr_new, ftr_cur);
+ reg->sys_val = arm64_ftr_set_value(ftrp, reg->sys_val, ftr_new);
+ }
+
+}
+
+static int check_update_ftr_reg(u32 sys_id, int cpu, u64 val, u64 boot)
+{
+ struct arm64_ftr_reg *regp = get_arm64_ftr_reg(sys_id);
+
+ BUG_ON(!regp);
+ update_cpu_ftr_reg(regp, val);
+ if ((boot & regp->strict_mask) == (val & regp->strict_mask))
+ return 0;
+ pr_warn("SANITY CHECK: Unexpected variation in %s. Boot CPU: %#016llx, CPU%d: %#016llx\n",
+ regp->name, boot, cpu, val);
+ return 1;
+}
+
+/*
+ * Update system wide CPU feature registers with the values from a
+ * non-boot CPU. Also performs SANITY checks to make sure that there
+ * aren't any insane variations from that of the boot CPU.
+ */
+void update_cpu_features(int cpu,
+ struct cpuinfo_arm64 *info,
+ struct cpuinfo_arm64 *boot)
+{
+ int taint = 0;
+
+ /*
+ * The kernel can handle differing I-cache policies, but otherwise
+ * caches should look identical. Userspace JITs will make use of
+ * *minLine.
+ */
+ taint |= check_update_ftr_reg(SYS_CTR_EL0, cpu,
+ info->reg_ctr, boot->reg_ctr);
+
+ /*
+ * Userspace may perform DC ZVA instructions. Mismatched block sizes
+ * could result in too much or too little memory being zeroed if a
+ * process is preempted and migrated between CPUs.
+ */
+ taint |= check_update_ftr_reg(SYS_DCZID_EL0, cpu,
+ info->reg_dczid, boot->reg_dczid);
+
+ /* If different, timekeeping will be broken (especially with KVM) */
+ taint |= check_update_ftr_reg(SYS_CNTFRQ_EL0, cpu,
+ info->reg_cntfrq, boot->reg_cntfrq);
+
+ /*
+ * The kernel uses self-hosted debug features and expects CPUs to
+ * support identical debug features. We presently need CTX_CMPs, WRPs,
+ * and BRPs to be identical.
+ * ID_AA64DFR1 is currently RES0.
+ */
+ taint |= check_update_ftr_reg(SYS_ID_AA64DFR0_EL1, cpu,
+ info->reg_id_aa64dfr0, boot->reg_id_aa64dfr0);
+ taint |= check_update_ftr_reg(SYS_ID_AA64DFR1_EL1, cpu,
+ info->reg_id_aa64dfr1, boot->reg_id_aa64dfr1);
+ /*
+ * Even in big.LITTLE, processors should be identical instruction-set
+ * wise.
+ */
+ taint |= check_update_ftr_reg(SYS_ID_AA64ISAR0_EL1, cpu,
+ info->reg_id_aa64isar0, boot->reg_id_aa64isar0);
+ taint |= check_update_ftr_reg(SYS_ID_AA64ISAR1_EL1, cpu,
+ info->reg_id_aa64isar1, boot->reg_id_aa64isar1);
+
+ /*
+ * Differing PARange support is fine as long as all peripherals and
+ * memory are mapped within the minimum PARange of all CPUs.
+ * Linux should not care about secure memory.
+ */
+ taint |= check_update_ftr_reg(SYS_ID_AA64MMFR0_EL1, cpu,
+ info->reg_id_aa64mmfr0, boot->reg_id_aa64mmfr0);
+ taint |= check_update_ftr_reg(SYS_ID_AA64MMFR1_EL1, cpu,
+ info->reg_id_aa64mmfr1, boot->reg_id_aa64mmfr1);
+ taint |= check_update_ftr_reg(SYS_ID_AA64MMFR2_EL1, cpu,
+ info->reg_id_aa64mmfr2, boot->reg_id_aa64mmfr2);
+
+ /*
+ * EL3 is not our concern.
+ * ID_AA64PFR1 is currently RES0.
+ */
+ taint |= check_update_ftr_reg(SYS_ID_AA64PFR0_EL1, cpu,
+ info->reg_id_aa64pfr0, boot->reg_id_aa64pfr0);
+ taint |= check_update_ftr_reg(SYS_ID_AA64PFR1_EL1, cpu,
+ info->reg_id_aa64pfr1, boot->reg_id_aa64pfr1);
+
+ /*
+ * If we have AArch32, we care about 32-bit features for compat. These
+ * registers should be RES0 otherwise.
+ */
+ taint |= check_update_ftr_reg(SYS_ID_DFR0_EL1, cpu,
+ info->reg_id_dfr0, boot->reg_id_dfr0);
+ taint |= check_update_ftr_reg(SYS_ID_ISAR0_EL1, cpu,
+ info->reg_id_isar0, boot->reg_id_isar0);
+ taint |= check_update_ftr_reg(SYS_ID_ISAR1_EL1, cpu,
+ info->reg_id_isar1, boot->reg_id_isar1);
+ taint |= check_update_ftr_reg(SYS_ID_ISAR2_EL1, cpu,
+ info->reg_id_isar2, boot->reg_id_isar2);
+ taint |= check_update_ftr_reg(SYS_ID_ISAR3_EL1, cpu,
+ info->reg_id_isar3, boot->reg_id_isar3);
+ taint |= check_update_ftr_reg(SYS_ID_ISAR4_EL1, cpu,
+ info->reg_id_isar4, boot->reg_id_isar4);
+ taint |= check_update_ftr_reg(SYS_ID_ISAR5_EL1, cpu,
+ info->reg_id_isar5, boot->reg_id_isar5);
+
+ /*
+ * Regardless of the value of the AuxReg field, the AIFSR, ADFSR, and
+ * ACTLR formats could differ across CPUs and therefore would have to
+ * be trapped for virtualization anyway.
+ */
+ taint |= check_update_ftr_reg(SYS_ID_MMFR0_EL1, cpu,
+ info->reg_id_mmfr0, boot->reg_id_mmfr0);
+ taint |= check_update_ftr_reg(SYS_ID_MMFR1_EL1, cpu,
+ info->reg_id_mmfr1, boot->reg_id_mmfr1);
+ taint |= check_update_ftr_reg(SYS_ID_MMFR2_EL1, cpu,
+ info->reg_id_mmfr2, boot->reg_id_mmfr2);
+ taint |= check_update_ftr_reg(SYS_ID_MMFR3_EL1, cpu,
+ info->reg_id_mmfr3, boot->reg_id_mmfr3);
+ taint |= check_update_ftr_reg(SYS_ID_PFR0_EL1, cpu,
+ info->reg_id_pfr0, boot->reg_id_pfr0);
+ taint |= check_update_ftr_reg(SYS_ID_PFR1_EL1, cpu,
+ info->reg_id_pfr1, boot->reg_id_pfr1);
+ taint |= check_update_ftr_reg(SYS_MVFR0_EL1, cpu,
+ info->reg_mvfr0, boot->reg_mvfr0);
+ taint |= check_update_ftr_reg(SYS_MVFR1_EL1, cpu,
+ info->reg_mvfr1, boot->reg_mvfr1);
+ taint |= check_update_ftr_reg(SYS_MVFR2_EL1, cpu,
+ info->reg_mvfr2, boot->reg_mvfr2);
+
+ /*
+ * Mismatched CPU features are a recipe for disaster. Don't even
+ * pretend to support them.
+ */
+ WARN_TAINT_ONCE(taint, TAINT_CPU_OUT_OF_SPEC,
+ "Unsupported CPU feature variation.\n");
+}
+
+u64 read_system_reg(u32 id)
+{
+ struct arm64_ftr_reg *regp = get_arm64_ftr_reg(id);
+
+ /* We shouldn't get a request for an unsupported register */
+ BUG_ON(!regp);
+ return regp->sys_val;
}
static bool
-has_id_aa64pfr0_feature(const struct arm64_cpu_capabilities *entry)
+feature_matches(u64 reg, const struct arm64_cpu_capabilities *entry)
{
- u64 val;
+ int val = cpuid_feature_extract_field(reg, entry->field_pos);
- val = read_cpuid(id_aa64pfr0_el1);
- return feature_matches(val, entry);
+ return val >= entry->min_field_value;
}
-static bool __maybe_unused
-has_id_aa64mmfr1_feature(const struct arm64_cpu_capabilities *entry)
+static bool
+has_cpuid_feature(const struct arm64_cpu_capabilities *entry)
{
u64 val;
- val = read_cpuid(id_aa64mmfr1_el1);
+ val = read_system_reg(entry->sys_reg);
return feature_matches(val, entry);
}
@@ -53,45 +621,319 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
{
.desc = "GIC system register CPU interface",
.capability = ARM64_HAS_SYSREG_GIC_CPUIF,
- .matches = has_id_aa64pfr0_feature,
- .field_pos = 24,
+ .matches = has_cpuid_feature,
+ .sys_reg = SYS_ID_AA64PFR0_EL1,
+ .field_pos = ID_AA64PFR0_GIC_SHIFT,
.min_field_value = 1,
},
#ifdef CONFIG_ARM64_PAN
{
.desc = "Privileged Access Never",
.capability = ARM64_HAS_PAN,
- .matches = has_id_aa64mmfr1_feature,
- .field_pos = 20,
+ .matches = has_cpuid_feature,
+ .sys_reg = SYS_ID_AA64MMFR1_EL1,
+ .field_pos = ID_AA64MMFR1_PAN_SHIFT,
.min_field_value = 1,
.enable = cpu_enable_pan,
},
#endif /* CONFIG_ARM64_PAN */
+#ifdef CONFIG_ARM64_UAO
+ {
+ .desc = "User Access Override",
+ .capability = ARM64_HAS_UAO,
+ .matches = has_cpuid_feature,
+ .sys_reg = SYS_ID_AA64MMFR2_EL1,
+ .field_pos = ID_AA64MMFR2_UAO_SHIFT,
+ .min_field_value = 1,
+ .enable = cpu_enable_uao,
+ },
+#endif /* CONFIG_ARM64_UAO */
+#ifdef CONFIG_ARM64_PAN
+ {
+ .capability = ARM64_ALT_PAN_NOT_UAO,
+ .matches = cpufeature_pan_not_uao,
+ },
+#endif /* CONFIG_ARM64_PAN */
{},
};
-void check_cpu_capabilities(const struct arm64_cpu_capabilities *caps,
+#define HWCAP_CAP(reg, field, min_value, type, cap) \
+ { \
+ .desc = #cap, \
+ .matches = has_cpuid_feature, \
+ .sys_reg = reg, \
+ .field_pos = field, \
+ .min_field_value = min_value, \
+ .hwcap_type = type, \
+ .hwcap = cap, \
+ }
+
+static const struct arm64_cpu_capabilities arm64_hwcaps[] = {
+ HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_AES_SHIFT, 2, CAP_HWCAP, HWCAP_PMULL),
+ HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_AES_SHIFT, 1, CAP_HWCAP, HWCAP_AES),
+ HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA1_SHIFT, 1, CAP_HWCAP, HWCAP_SHA1),
+ HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA2_SHIFT, 1, CAP_HWCAP, HWCAP_SHA2),
+ HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_CRC32_SHIFT, 1, CAP_HWCAP, HWCAP_CRC32),
+ HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, 0, CAP_HWCAP, HWCAP_FP),
+ HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, 0, CAP_HWCAP, HWCAP_ASIMD),
+#ifdef CONFIG_COMPAT
+ HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_AES_SHIFT, 2, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_PMULL),
+ HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_AES_SHIFT, 1, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_AES),
+ HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_SHA1_SHIFT, 1, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_SHA1),
+ HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_SHA2_SHIFT, 1, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_SHA2),
+ HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_CRC32_SHIFT, 1, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_CRC32),
+#endif
+ {},
+};
+
+static void __init cap_set_hwcap(const struct arm64_cpu_capabilities *cap)
+{
+ switch (cap->hwcap_type) {
+ case CAP_HWCAP:
+ elf_hwcap |= cap->hwcap;
+ break;
+#ifdef CONFIG_COMPAT
+ case CAP_COMPAT_HWCAP:
+ compat_elf_hwcap |= (u32)cap->hwcap;
+ break;
+ case CAP_COMPAT_HWCAP2:
+ compat_elf_hwcap2 |= (u32)cap->hwcap;
+ break;
+#endif
+ default:
+ WARN_ON(1);
+ break;
+ }
+}
+
+/* Check if we have a particular HWCAP enabled */
+static bool __maybe_unused cpus_have_hwcap(const struct arm64_cpu_capabilities *cap)
+{
+ bool rc;
+
+ switch (cap->hwcap_type) {
+ case CAP_HWCAP:
+ rc = (elf_hwcap & cap->hwcap) != 0;
+ break;
+#ifdef CONFIG_COMPAT
+ case CAP_COMPAT_HWCAP:
+ rc = (compat_elf_hwcap & (u32)cap->hwcap) != 0;
+ break;
+ case CAP_COMPAT_HWCAP2:
+ rc = (compat_elf_hwcap2 & (u32)cap->hwcap) != 0;
+ break;
+#endif
+ default:
+ WARN_ON(1);
+ rc = false;
+ }
+
+ return rc;
+}
+
+static void __init setup_cpu_hwcaps(void)
+{
+ int i;
+ const struct arm64_cpu_capabilities *hwcaps = arm64_hwcaps;
+
+ for (i = 0; hwcaps[i].matches; i++)
+ if (hwcaps[i].matches(&hwcaps[i]))
+ cap_set_hwcap(&hwcaps[i]);
+}
+
+void update_cpu_capabilities(const struct arm64_cpu_capabilities *caps,
const char *info)
{
int i;
- for (i = 0; caps[i].desc; i++) {
+ for (i = 0; caps[i].matches; i++) {
if (!caps[i].matches(&caps[i]))
continue;
- if (!cpus_have_cap(caps[i].capability))
+ if (!cpus_have_cap(caps[i].capability) && caps[i].desc)
pr_info("%s %s\n", info, caps[i].desc);
cpus_set_cap(caps[i].capability);
}
+}
+
+/*
+ * Run through the enabled capabilities and enable() it on all active
+ * CPUs
+ */
+static void __init
+enable_cpu_capabilities(const struct arm64_cpu_capabilities *caps)
+{
+ int i;
+
+ for (i = 0; caps[i].matches; i++)
+ if (caps[i].enable && cpus_have_cap(caps[i].capability))
+ on_each_cpu(caps[i].enable, NULL, true);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Flag to indicate if we have computed the system wide
+ * capabilities based on the boot time active CPUs. This
+ * will be used to determine if a new booting CPU should
+ * go through the verification process to make sure that it
+ * supports the system capabilities, without using a hotplug
+ * notifier.
+ */
+static bool sys_caps_initialised;
+
+static inline void set_sys_caps_initialised(void)
+{
+ sys_caps_initialised = true;
+}
- /* second pass allows enable() to consider interacting capabilities */
- for (i = 0; caps[i].desc; i++) {
- if (cpus_have_cap(caps[i].capability) && caps[i].enable)
- caps[i].enable();
+/*
+ * __raw_read_system_reg() - Used by a STARTING cpu before cpuinfo is populated.
+ */
+static u64 __raw_read_system_reg(u32 sys_id)
+{
+ switch (sys_id) {
+ case SYS_ID_PFR0_EL1: return read_cpuid(SYS_ID_PFR0_EL1);
+ case SYS_ID_PFR1_EL1: return read_cpuid(SYS_ID_PFR1_EL1);
+ case SYS_ID_DFR0_EL1: return read_cpuid(SYS_ID_DFR0_EL1);
+ case SYS_ID_MMFR0_EL1: return read_cpuid(SYS_ID_MMFR0_EL1);
+ case SYS_ID_MMFR1_EL1: return read_cpuid(SYS_ID_MMFR1_EL1);
+ case SYS_ID_MMFR2_EL1: return read_cpuid(SYS_ID_MMFR2_EL1);
+ case SYS_ID_MMFR3_EL1: return read_cpuid(SYS_ID_MMFR3_EL1);
+ case SYS_ID_ISAR0_EL1: return read_cpuid(SYS_ID_ISAR0_EL1);
+ case SYS_ID_ISAR1_EL1: return read_cpuid(SYS_ID_ISAR1_EL1);
+ case SYS_ID_ISAR2_EL1: return read_cpuid(SYS_ID_ISAR2_EL1);
+ case SYS_ID_ISAR3_EL1: return read_cpuid(SYS_ID_ISAR3_EL1);
+ case SYS_ID_ISAR4_EL1: return read_cpuid(SYS_ID_ISAR4_EL1);
+ case SYS_ID_ISAR5_EL1: return read_cpuid(SYS_ID_ISAR4_EL1);
+ case SYS_MVFR0_EL1: return read_cpuid(SYS_MVFR0_EL1);
+ case SYS_MVFR1_EL1: return read_cpuid(SYS_MVFR1_EL1);
+ case SYS_MVFR2_EL1: return read_cpuid(SYS_MVFR2_EL1);
+
+ case SYS_ID_AA64PFR0_EL1: return read_cpuid(SYS_ID_AA64PFR0_EL1);
+ case SYS_ID_AA64PFR1_EL1: return read_cpuid(SYS_ID_AA64PFR0_EL1);
+ case SYS_ID_AA64DFR0_EL1: return read_cpuid(SYS_ID_AA64DFR0_EL1);
+ case SYS_ID_AA64DFR1_EL1: return read_cpuid(SYS_ID_AA64DFR0_EL1);
+ case SYS_ID_AA64MMFR0_EL1: return read_cpuid(SYS_ID_AA64MMFR0_EL1);
+ case SYS_ID_AA64MMFR1_EL1: return read_cpuid(SYS_ID_AA64MMFR1_EL1);
+ case SYS_ID_AA64MMFR2_EL1: return read_cpuid(SYS_ID_AA64MMFR2_EL1);
+ case SYS_ID_AA64ISAR0_EL1: return read_cpuid(SYS_ID_AA64ISAR0_EL1);
+ case SYS_ID_AA64ISAR1_EL1: return read_cpuid(SYS_ID_AA64ISAR1_EL1);
+
+ case SYS_CNTFRQ_EL0: return read_cpuid(SYS_CNTFRQ_EL0);
+ case SYS_CTR_EL0: return read_cpuid(SYS_CTR_EL0);
+ case SYS_DCZID_EL0: return read_cpuid(SYS_DCZID_EL0);
+ default:
+ BUG();
+ return 0;
}
}
-void check_local_cpu_features(void)
+/*
+ * Park the CPU which doesn't have the capability as advertised
+ * by the system.
+ */
+static void fail_incapable_cpu(char *cap_type,
+ const struct arm64_cpu_capabilities *cap)
+{
+ int cpu = smp_processor_id();
+
+ pr_crit("CPU%d: missing %s : %s\n", cpu, cap_type, cap->desc);
+ /* Mark this CPU absent */
+ set_cpu_present(cpu, 0);
+
+ /* Check if we can park ourselves */
+ if (cpu_ops[cpu] && cpu_ops[cpu]->cpu_die)
+ cpu_ops[cpu]->cpu_die(cpu);
+ asm(
+ "1: wfe\n"
+ " wfi\n"
+ " b 1b");
+}
+
+/*
+ * Run through the enabled system capabilities and enable() it on this CPU.
+ * The capabilities were decided based on the available CPUs at the boot time.
+ * Any new CPU should match the system wide status of the capability. If the
+ * new CPU doesn't have a capability which the system now has enabled, we
+ * cannot do anything to fix it up and could cause unexpected failures. So
+ * we park the CPU.
+ */
+void verify_local_cpu_capabilities(void)
+{
+ int i;
+ const struct arm64_cpu_capabilities *caps;
+
+ /*
+ * If we haven't computed the system capabilities, there is nothing
+ * to verify.
+ */
+ if (!sys_caps_initialised)
+ return;
+
+ caps = arm64_features;
+ for (i = 0; caps[i].matches; i++) {
+ if (!cpus_have_cap(caps[i].capability) || !caps[i].sys_reg)
+ continue;
+ /*
+ * If the new CPU misses an advertised feature, we cannot proceed
+ * further, park the cpu.
+ */
+ if (!feature_matches(__raw_read_system_reg(caps[i].sys_reg), &caps[i]))
+ fail_incapable_cpu("arm64_features", &caps[i]);
+ if (caps[i].enable)
+ caps[i].enable(NULL);
+ }
+
+ for (i = 0, caps = arm64_hwcaps; caps[i].matches; i++) {
+ if (!cpus_have_hwcap(&caps[i]))
+ continue;
+ if (!feature_matches(__raw_read_system_reg(caps[i].sys_reg), &caps[i]))
+ fail_incapable_cpu("arm64_hwcaps", &caps[i]);
+ }
+}
+
+#else /* !CONFIG_HOTPLUG_CPU */
+
+static inline void set_sys_caps_initialised(void)
+{
+}
+
+#endif /* CONFIG_HOTPLUG_CPU */
+
+static void __init setup_feature_capabilities(void)
+{
+ update_cpu_capabilities(arm64_features, "detected feature:");
+ enable_cpu_capabilities(arm64_features);
+}
+
+void __init setup_cpu_features(void)
+{
+ u32 cwg;
+ int cls;
+
+ /* Set the CPU feature capabilies */
+ setup_feature_capabilities();
+ setup_cpu_hwcaps();
+
+ /* Advertise that we have computed the system capabilities */
+ set_sys_caps_initialised();
+
+ /*
+ * Check for sane CTR_EL0.CWG value.
+ */
+ cwg = cache_type_cwg();
+ cls = cache_line_size();
+ if (!cwg)
+ pr_warn("No Cache Writeback Granule information, assuming cache line size %d\n",
+ cls);
+ if (L1_CACHE_BYTES < cls)
+ pr_warn("L1_CACHE_BYTES smaller than the Cache Writeback Granule (%d < %d)\n",
+ L1_CACHE_BYTES, cls);
+}
+
+static bool __maybe_unused
+cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry)
{
- check_cpu_capabilities(arm64_features, "detected feature");
+ return (cpus_have_cap(ARM64_HAS_PAN) && !cpus_have_cap(ARM64_HAS_UAO));
}
diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
index faf5cadbd391..ac60eae5d8d6 100644
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -22,10 +22,14 @@
#include <linux/bitops.h>
#include <linux/bug.h>
+#include <linux/delay.h>
#include <linux/init.h>
#include <linux/kernel.h>
+#include <linux/personality.h>
#include <linux/preempt.h>
#include <linux/printk.h>
+#include <linux/seq_file.h>
+#include <linux/sched.h>
#include <linux/smp.h>
/*
@@ -35,7 +39,6 @@
*/
DEFINE_PER_CPU(struct cpuinfo_arm64, cpu_data);
static struct cpuinfo_arm64 boot_cpu_data;
-static bool mixed_endian_el0 = true;
static char *icache_policy_str[] = {
[ICACHE_POLICY_RESERVED] = "RESERVED/UNKNOWN",
@@ -46,179 +49,200 @@ static char *icache_policy_str[] = {
unsigned long __icache_flags;
-static void cpuinfo_detect_icache_policy(struct cpuinfo_arm64 *info)
+static const char *hwcap_str[] = {
+ "fp",
+ "asimd",
+ "evtstrm",
+ "aes",
+ "pmull",
+ "sha1",
+ "sha2",
+ "crc32",
+ NULL
+};
+
+#ifdef CONFIG_COMPAT
+static const char *compat_hwcap_str[] = {
+ "swp",
+ "half",
+ "thumb",
+ "26bit",
+ "fastmult",
+ "fpa",
+ "vfp",
+ "edsp",
+ "java",
+ "iwmmxt",
+ "crunch",
+ "thumbee",
+ "neon",
+ "vfpv3",
+ "vfpv3d16",
+ "tls",
+ "vfpv4",
+ "idiva",
+ "idivt",
+ "vfpd32",
+ "lpae",
+ "evtstrm"
+};
+
+static const char *compat_hwcap2_str[] = {
+ "aes",
+ "pmull",
+ "sha1",
+ "sha2",
+ "crc32",
+ NULL
+};
+#endif /* CONFIG_COMPAT */
+
+static int c_show(struct seq_file *m, void *v)
{
- unsigned int cpu = smp_processor_id();
- u32 l1ip = CTR_L1IP(info->reg_ctr);
+ int i, j;
+
+ for_each_online_cpu(i) {
+ struct cpuinfo_arm64 *cpuinfo = &per_cpu(cpu_data, i);
+ u32 midr = cpuinfo->reg_midr;
- if (l1ip != ICACHE_POLICY_PIPT) {
/*
- * VIPT caches are non-aliasing if the VA always equals the PA
- * in all bit positions that are covered by the index. This is
- * the case if the size of a way (# of sets * line size) does
- * not exceed PAGE_SIZE.
+ * glibc reads /proc/cpuinfo to determine the number of
+ * online processors, looking for lines beginning with
+ * "processor". Give glibc what it expects.
*/
- u32 waysize = icache_get_numsets() * icache_get_linesize();
+#ifdef CONFIG_SMP
+ seq_printf(m, "processor\t: %d\n", i);
+#endif
- if (l1ip != ICACHE_POLICY_VIPT || waysize > PAGE_SIZE)
- set_bit(ICACHEF_ALIASING, &__icache_flags);
+ seq_printf(m, "BogoMIPS\t: %lu.%02lu\n",
+ loops_per_jiffy / (500000UL/HZ),
+ loops_per_jiffy / (5000UL/HZ) % 100);
+
+ /*
+ * Dump out the common processor features in a single line.
+ * Userspace should read the hwcaps with getauxval(AT_HWCAP)
+ * rather than attempting to parse this, but there's a body of
+ * software which does already (at least for 32-bit).
+ */
+ seq_puts(m, "Features\t:");
+ if (personality(current->personality) == PER_LINUX32) {
+#ifdef CONFIG_COMPAT
+ for (j = 0; compat_hwcap_str[j]; j++)
+ if (compat_elf_hwcap & (1 << j))
+ seq_printf(m, " %s", compat_hwcap_str[j]);
+
+ for (j = 0; compat_hwcap2_str[j]; j++)
+ if (compat_elf_hwcap2 & (1 << j))
+ seq_printf(m, " %s", compat_hwcap2_str[j]);
+#endif /* CONFIG_COMPAT */
+ } else {
+ for (j = 0; hwcap_str[j]; j++)
+ if (elf_hwcap & (1 << j))
+ seq_printf(m, " %s", hwcap_str[j]);
+ }
+ seq_puts(m, "\n");
+
+ seq_printf(m, "CPU implementer\t: 0x%02x\n",
+ MIDR_IMPLEMENTOR(midr));
+ seq_printf(m, "CPU architecture: 8\n");
+ seq_printf(m, "CPU variant\t: 0x%x\n", MIDR_VARIANT(midr));
+ seq_printf(m, "CPU part\t: 0x%03x\n", MIDR_PARTNUM(midr));
+ seq_printf(m, "CPU revision\t: %d\n\n", MIDR_REVISION(midr));
}
- if (l1ip == ICACHE_POLICY_AIVIVT)
- set_bit(ICACHEF_AIVIVT, &__icache_flags);
- pr_info("Detected %s I-cache on CPU%d\n", icache_policy_str[l1ip], cpu);
+ return 0;
}
-bool cpu_supports_mixed_endian_el0(void)
+static void *c_start(struct seq_file *m, loff_t *pos)
{
- return id_aa64mmfr0_mixed_endian_el0(read_cpuid(ID_AA64MMFR0_EL1));
+ return *pos < 1 ? (void *)1 : NULL;
}
-bool system_supports_mixed_endian_el0(void)
+static void *c_next(struct seq_file *m, void *v, loff_t *pos)
{
- return mixed_endian_el0;
+ ++*pos;
+ return NULL;
}
-static void update_mixed_endian_el0_support(struct cpuinfo_arm64 *info)
+static void c_stop(struct seq_file *m, void *v)
{
- mixed_endian_el0 &= id_aa64mmfr0_mixed_endian_el0(info->reg_id_aa64mmfr0);
}
-static void update_cpu_features(struct cpuinfo_arm64 *info)
-{
- update_mixed_endian_el0_support(info);
-}
+const struct seq_operations cpuinfo_op = {
+ .start = c_start,
+ .next = c_next,
+ .stop = c_stop,
+ .show = c_show
+};
-static int check_reg_mask(char *name, u64 mask, u64 boot, u64 cur, int cpu)
+static void cpuinfo_detect_icache_policy(struct cpuinfo_arm64 *info)
{
- if ((boot & mask) == (cur & mask))
- return 0;
-
- pr_warn("SANITY CHECK: Unexpected variation in %s. Boot CPU: %#016lx, CPU%d: %#016lx\n",
- name, (unsigned long)boot, cpu, (unsigned long)cur);
-
- return 1;
-}
+ unsigned int cpu = smp_processor_id();
+ u32 l1ip = CTR_L1IP(info->reg_ctr);
-#define CHECK_MASK(field, mask, boot, cur, cpu) \
- check_reg_mask(#field, mask, (boot)->reg_ ## field, (cur)->reg_ ## field, cpu)
+ if (l1ip != ICACHE_POLICY_PIPT) {
+ /*
+ * VIPT caches are non-aliasing if the VA always equals the PA
+ * in all bit positions that are covered by the index. This is
+ * the case if the size of a way (# of sets * line size) does
+ * not exceed PAGE_SIZE.
+ */
+ u32 waysize = icache_get_numsets() * icache_get_linesize();
-#define CHECK(field, boot, cur, cpu) \
- CHECK_MASK(field, ~0ULL, boot, cur, cpu)
+ if (l1ip != ICACHE_POLICY_VIPT || waysize > PAGE_SIZE)
+ set_bit(ICACHEF_ALIASING, &__icache_flags);
+ }
+ if (l1ip == ICACHE_POLICY_AIVIVT)
+ set_bit(ICACHEF_AIVIVT, &__icache_flags);
-/*
- * Verify that CPUs don't have unexpected differences that will cause problems.
- */
-static void cpuinfo_sanity_check(struct cpuinfo_arm64 *cur)
-{
- unsigned int cpu = smp_processor_id();
- struct cpuinfo_arm64 *boot = &boot_cpu_data;
- unsigned int diff = 0;
-
- /*
- * The kernel can handle differing I-cache policies, but otherwise
- * caches should look identical. Userspace JITs will make use of
- * *minLine.
- */
- diff |= CHECK_MASK(ctr, 0xffff3fff, boot, cur, cpu);
-
- /*
- * Userspace may perform DC ZVA instructions. Mismatched block sizes
- * could result in too much or too little memory being zeroed if a
- * process is preempted and migrated between CPUs.
- */
- diff |= CHECK(dczid, boot, cur, cpu);
-
- /* If different, timekeeping will be broken (especially with KVM) */
- diff |= CHECK(cntfrq, boot, cur, cpu);
-
- /*
- * Even in big.LITTLE, processors should be identical instruction-set
- * wise.
- */
- diff |= CHECK(id_aa64isar0, boot, cur, cpu);
- diff |= CHECK(id_aa64isar1, boot, cur, cpu);
-
- /*
- * Differing PARange support is fine as long as all peripherals and
- * memory are mapped within the minimum PARange of all CPUs.
- * Linux should not care about secure memory.
- * ID_AA64MMFR1 is currently RES0.
- */
- diff |= CHECK_MASK(id_aa64mmfr0, 0xffffffffffff0ff0, boot, cur, cpu);
- diff |= CHECK(id_aa64mmfr1, boot, cur, cpu);
-
- /*
- * EL3 is not our concern.
- * ID_AA64PFR1 is currently RES0.
- */
- diff |= CHECK_MASK(id_aa64pfr0, 0xffffffffffff0fff, boot, cur, cpu);
- diff |= CHECK(id_aa64pfr1, boot, cur, cpu);
-
- /*
- * If we have AArch32, we care about 32-bit features for compat. These
- * registers should be RES0 otherwise.
- */
- diff |= CHECK(id_isar0, boot, cur, cpu);
- diff |= CHECK(id_isar1, boot, cur, cpu);
- diff |= CHECK(id_isar2, boot, cur, cpu);
- diff |= CHECK(id_isar3, boot, cur, cpu);
- diff |= CHECK(id_isar4, boot, cur, cpu);
- diff |= CHECK(id_isar5, boot, cur, cpu);
- diff |= CHECK(id_mmfr0, boot, cur, cpu);
- diff |= CHECK(id_mmfr1, boot, cur, cpu);
- diff |= CHECK(id_mmfr2, boot, cur, cpu);
- diff |= CHECK(id_mmfr3, boot, cur, cpu);
- diff |= CHECK(id_pfr0, boot, cur, cpu);
- diff |= CHECK(id_pfr1, boot, cur, cpu);
-
- /*
- * Mismatched CPU features are a recipe for disaster. Don't even
- * pretend to support them.
- */
- WARN_TAINT_ONCE(diff, TAINT_CPU_OUT_OF_SPEC,
- "Unsupported CPU feature variation.");
+ pr_info("Detected %s I-cache on CPU%d\n", icache_policy_str[l1ip], cpu);
}
static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info)
{
info->reg_cntfrq = arch_timer_get_cntfrq();
info->reg_ctr = read_cpuid_cachetype();
- info->reg_dczid = read_cpuid(DCZID_EL0);
+ info->reg_dczid = read_cpuid(SYS_DCZID_EL0);
info->reg_midr = read_cpuid_id();
- info->reg_id_aa64isar0 = read_cpuid(ID_AA64ISAR0_EL1);
- info->reg_id_aa64isar1 = read_cpuid(ID_AA64ISAR1_EL1);
- info->reg_id_aa64mmfr0 = read_cpuid(ID_AA64MMFR0_EL1);
- info->reg_id_aa64mmfr1 = read_cpuid(ID_AA64MMFR1_EL1);
- info->reg_id_aa64pfr0 = read_cpuid(ID_AA64PFR0_EL1);
- info->reg_id_aa64pfr1 = read_cpuid(ID_AA64PFR1_EL1);
-
- info->reg_id_isar0 = read_cpuid(ID_ISAR0_EL1);
- info->reg_id_isar1 = read_cpuid(ID_ISAR1_EL1);
- info->reg_id_isar2 = read_cpuid(ID_ISAR2_EL1);
- info->reg_id_isar3 = read_cpuid(ID_ISAR3_EL1);
- info->reg_id_isar4 = read_cpuid(ID_ISAR4_EL1);
- info->reg_id_isar5 = read_cpuid(ID_ISAR5_EL1);
- info->reg_id_mmfr0 = read_cpuid(ID_MMFR0_EL1);
- info->reg_id_mmfr1 = read_cpuid(ID_MMFR1_EL1);
- info->reg_id_mmfr2 = read_cpuid(ID_MMFR2_EL1);
- info->reg_id_mmfr3 = read_cpuid(ID_MMFR3_EL1);
- info->reg_id_pfr0 = read_cpuid(ID_PFR0_EL1);
- info->reg_id_pfr1 = read_cpuid(ID_PFR1_EL1);
+ info->reg_id_aa64dfr0 = read_cpuid(SYS_ID_AA64DFR0_EL1);
+ info->reg_id_aa64dfr1 = read_cpuid(SYS_ID_AA64DFR1_EL1);
+ info->reg_id_aa64isar0 = read_cpuid(SYS_ID_AA64ISAR0_EL1);
+ info->reg_id_aa64isar1 = read_cpuid(SYS_ID_AA64ISAR1_EL1);
+ info->reg_id_aa64mmfr0 = read_cpuid(SYS_ID_AA64MMFR0_EL1);
+ info->reg_id_aa64mmfr1 = read_cpuid(SYS_ID_AA64MMFR1_EL1);
+ info->reg_id_aa64mmfr2 = read_cpuid(SYS_ID_AA64MMFR2_EL1);
+ info->reg_id_aa64pfr0 = read_cpuid(SYS_ID_AA64PFR0_EL1);
+ info->reg_id_aa64pfr1 = read_cpuid(SYS_ID_AA64PFR1_EL1);
+
+ info->reg_id_dfr0 = read_cpuid(SYS_ID_DFR0_EL1);
+ info->reg_id_isar0 = read_cpuid(SYS_ID_ISAR0_EL1);
+ info->reg_id_isar1 = read_cpuid(SYS_ID_ISAR1_EL1);
+ info->reg_id_isar2 = read_cpuid(SYS_ID_ISAR2_EL1);
+ info->reg_id_isar3 = read_cpuid(SYS_ID_ISAR3_EL1);
+ info->reg_id_isar4 = read_cpuid(SYS_ID_ISAR4_EL1);
+ info->reg_id_isar5 = read_cpuid(SYS_ID_ISAR5_EL1);
+ info->reg_id_mmfr0 = read_cpuid(SYS_ID_MMFR0_EL1);
+ info->reg_id_mmfr1 = read_cpuid(SYS_ID_MMFR1_EL1);
+ info->reg_id_mmfr2 = read_cpuid(SYS_ID_MMFR2_EL1);
+ info->reg_id_mmfr3 = read_cpuid(SYS_ID_MMFR3_EL1);
+ info->reg_id_pfr0 = read_cpuid(SYS_ID_PFR0_EL1);
+ info->reg_id_pfr1 = read_cpuid(SYS_ID_PFR1_EL1);
+
+ info->reg_mvfr0 = read_cpuid(SYS_MVFR0_EL1);
+ info->reg_mvfr1 = read_cpuid(SYS_MVFR1_EL1);
+ info->reg_mvfr2 = read_cpuid(SYS_MVFR2_EL1);
cpuinfo_detect_icache_policy(info);
check_local_cpu_errata();
- check_local_cpu_features();
- update_cpu_features(info);
}
void cpuinfo_store_cpu(void)
{
struct cpuinfo_arm64 *info = this_cpu_ptr(&cpu_data);
__cpuinfo_store_cpu(info);
- cpuinfo_sanity_check(info);
+ update_cpu_features(smp_processor_id(), info, &boot_cpu_data);
}
void __init cpuinfo_store_boot_cpu(void)
@@ -227,6 +251,7 @@ void __init cpuinfo_store_boot_cpu(void)
__cpuinfo_store_cpu(info);
boot_cpu_data = *info;
+ init_cpu_features(&boot_cpu_data);
}
u64 __attribute_const__ icache_get_ccsidr(void)
diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c
index 62c91b3b42e8..d35057c09297 100644
--- a/arch/arm64/kernel/debug-monitors.c
+++ b/arch/arm64/kernel/debug-monitors.c
@@ -26,14 +26,16 @@
#include <linux/stat.h>
#include <linux/uaccess.h>
-#include <asm/debug-monitors.h>
+#include <asm/cpufeature.h>
#include <asm/cputype.h>
+#include <asm/debug-monitors.h>
#include <asm/system_misc.h>
/* Determine debug architecture. */
u8 debug_monitors_arch(void)
{
- return read_cpuid(ID_AA64DFR0_EL1) & 0xf;
+ return cpuid_feature_extract_field(read_system_reg(SYS_ID_AA64DFR0_EL1),
+ ID_AA64DFR0_DEBUGVER_SHIFT);
}
/*
@@ -134,7 +136,7 @@ static int os_lock_notify(struct notifier_block *self,
unsigned long action, void *data)
{
int cpu = (unsigned long)data;
- if (action == CPU_ONLINE)
+ if ((action & ~CPU_TASKS_FROZEN) == CPU_ONLINE)
smp_call_function_single(cpu, clear_os_lock, NULL, 1);
return NOTIFY_OK;
}
diff --git a/arch/arm64/kernel/efi-entry.S b/arch/arm64/kernel/efi-entry.S
index d18a44940968..8ce9b0577442 100644
--- a/arch/arm64/kernel/efi-entry.S
+++ b/arch/arm64/kernel/efi-entry.S
@@ -61,7 +61,8 @@ ENTRY(efi_stub_entry)
*/
mov x20, x0 // DTB address
ldr x0, [sp, #16] // relocated _text address
- mov x21, x0
+ ldr x21, =stext_offset
+ add x21, x0, x21
/*
* Calculate size of the kernel Image (same for original and copy).
diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c
index 1d85a7c5a850..7d13642092d1 100644
--- a/arch/arm64/kernel/efi.c
+++ b/arch/arm64/kernel/efi.c
@@ -11,26 +11,45 @@
*
*/
+#include <linux/atomic.h>
+#include <linux/dmi.h>
#include <linux/efi.h>
#include <linux/export.h>
#include <linux/memblock.h>
+#include <linux/mm_types.h>
#include <linux/bootmem.h>
#include <linux/of.h>
#include <linux/of_fdt.h>
+#include <linux/preempt.h>
+#include <linux/rbtree.h>
+#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/spinlock.h>
#include <asm/cacheflush.h>
#include <asm/efi.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
+#include <asm/mmu.h>
+#include <asm/pgtable.h>
struct efi_memory_map memmap;
-static efi_runtime_services_t *runtime;
-
static u64 efi_system_table;
+static pgd_t efi_pgd[PTRS_PER_PGD] __page_aligned_bss;
+
+static struct mm_struct efi_mm = {
+ .mm_rb = RB_ROOT,
+ .pgd = efi_pgd,
+ .mm_users = ATOMIC_INIT(2),
+ .mm_count = ATOMIC_INIT(1),
+ .mmap_sem = __RWSEM_INITIALIZER(efi_mm.mmap_sem),
+ .page_table_lock = __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock),
+ .mmlist = LIST_HEAD_INIT(efi_mm.mmlist),
+};
+
static int uefi_debug __initdata;
static int __init uefi_debug_setup(char *str)
{
@@ -47,30 +66,33 @@ static int __init is_normal_ram(efi_memory_desc_t *md)
return 0;
}
-static void __init efi_setup_idmap(void)
+/*
+ * Translate a EFI virtual address into a physical address: this is necessary,
+ * as some data members of the EFI system table are virtually remapped after
+ * SetVirtualAddressMap() has been called.
+ */
+static phys_addr_t efi_to_phys(unsigned long addr)
{
- struct memblock_region *r;
efi_memory_desc_t *md;
- u64 paddr, npages, size;
-
- for_each_memblock(memory, r)
- create_id_mapping(r->base, r->size, 0);
- /* map runtime io spaces */
for_each_efi_memory_desc(&memmap, md) {
- if (!(md->attribute & EFI_MEMORY_RUNTIME) || is_normal_ram(md))
+ if (!(md->attribute & EFI_MEMORY_RUNTIME))
continue;
- paddr = md->phys_addr;
- npages = md->num_pages;
- memrange_efi_to_native(&paddr, &npages);
- size = npages << PAGE_SHIFT;
- create_id_mapping(paddr, size, 1);
+ if (md->virt_addr == 0)
+ /* no virtual mapping has been installed by the stub */
+ break;
+ if (md->virt_addr <= addr &&
+ (addr - md->virt_addr) < (md->num_pages << EFI_PAGE_SHIFT))
+ return md->phys_addr + addr - md->virt_addr;
}
+ return addr;
}
static int __init uefi_init(void)
{
efi_char16_t *c16;
+ void *config_tables;
+ u64 table_size;
char vendor[100] = "unknown";
int i, retval;
@@ -98,7 +120,7 @@ static int __init uefi_init(void)
efi.systab->hdr.revision & 0xffff);
/* Show what we know for posterity */
- c16 = early_memremap(efi.systab->fw_vendor,
+ c16 = early_memremap(efi_to_phys(efi.systab->fw_vendor),
sizeof(vendor));
if (c16) {
for (i = 0; i < (int) sizeof(vendor) - 1 && *c16; ++i)
@@ -111,10 +133,14 @@ static int __init uefi_init(void)
efi.systab->hdr.revision >> 16,
efi.systab->hdr.revision & 0xffff, vendor);
- retval = efi_config_init(NULL);
- if (retval == 0)
- set_bit(EFI_CONFIG_TABLES, &efi.flags);
+ table_size = sizeof(efi_config_table_64_t) * efi.systab->nr_tables;
+ config_tables = early_memremap(efi_to_phys(efi.systab->tables),
+ table_size);
+
+ retval = efi_config_parse_tables(config_tables, efi.systab->nr_tables,
+ sizeof(efi_config_table_64_t), NULL);
+ early_memunmap(config_tables, table_size);
out:
early_memunmap(efi.systab, sizeof(efi_system_table_t));
return retval;
@@ -125,17 +151,17 @@ out:
*/
static __init int is_reserve_region(efi_memory_desc_t *md)
{
- if (!is_normal_ram(md))
+ switch (md->type) {
+ case EFI_LOADER_CODE:
+ case EFI_LOADER_DATA:
+ case EFI_BOOT_SERVICES_CODE:
+ case EFI_BOOT_SERVICES_DATA:
+ case EFI_CONVENTIONAL_MEMORY:
return 0;
-
- if (md->attribute & EFI_MEMORY_RUNTIME)
- return 1;
-
- if (md->type == EFI_ACPI_RECLAIM_MEMORY ||
- md->type == EFI_RESERVED_TYPE)
- return 1;
-
- return 0;
+ default:
+ break;
+ }
+ return is_normal_ram(md);
}
static __init void reserve_regions(void)
@@ -164,9 +190,7 @@ static __init void reserve_regions(void)
if (is_normal_ram(md))
early_init_dt_add_memory_arch(paddr, size);
- if (is_reserve_region(md) ||
- md->type == EFI_BOOT_SERVICES_CODE ||
- md->type == EFI_BOOT_SERVICES_DATA) {
+ if (is_reserve_region(md)) {
memblock_reserve(paddr, size);
if (uefi_debug)
pr_cont("*");
@@ -179,123 +203,6 @@ static __init void reserve_regions(void)
set_bit(EFI_MEMMAP, &efi.flags);
}
-
-static u64 __init free_one_region(u64 start, u64 end)
-{
- u64 size = end - start;
-
- if (uefi_debug)
- pr_info(" EFI freeing: 0x%012llx-0x%012llx\n", start, end - 1);
-
- free_bootmem_late(start, size);
- return size;
-}
-
-static u64 __init free_region(u64 start, u64 end)
-{
- u64 map_start, map_end, total = 0;
-
- if (end <= start)
- return total;
-
- map_start = (u64)memmap.phys_map;
- map_end = PAGE_ALIGN(map_start + (memmap.map_end - memmap.map));
- map_start &= PAGE_MASK;
-
- if (start < map_end && end > map_start) {
- /* region overlaps UEFI memmap */
- if (start < map_start)
- total += free_one_region(start, map_start);
-
- if (map_end < end)
- total += free_one_region(map_end, end);
- } else
- total += free_one_region(start, end);
-
- return total;
-}
-
-static void __init free_boot_services(void)
-{
- u64 total_freed = 0;
- u64 keep_end, free_start, free_end;
- efi_memory_desc_t *md;
-
- /*
- * If kernel uses larger pages than UEFI, we have to be careful
- * not to inadvertantly free memory we want to keep if there is
- * overlap at the kernel page size alignment. We do not want to
- * free is_reserve_region() memory nor the UEFI memmap itself.
- *
- * The memory map is sorted, so we keep track of the end of
- * any previous region we want to keep, remember any region
- * we want to free and defer freeing it until we encounter
- * the next region we want to keep. This way, before freeing
- * it, we can clip it as needed to avoid freeing memory we
- * want to keep for UEFI.
- */
-
- keep_end = 0;
- free_start = 0;
-
- for_each_efi_memory_desc(&memmap, md) {
- u64 paddr, npages, size;
-
- if (is_reserve_region(md)) {
- /*
- * We don't want to free any memory from this region.
- */
- if (free_start) {
- /* adjust free_end then free region */
- if (free_end > md->phys_addr)
- free_end -= PAGE_SIZE;
- total_freed += free_region(free_start, free_end);
- free_start = 0;
- }
- keep_end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
- continue;
- }
-
- if (md->type != EFI_BOOT_SERVICES_CODE &&
- md->type != EFI_BOOT_SERVICES_DATA) {
- /* no need to free this region */
- continue;
- }
-
- /*
- * We want to free memory from this region.
- */
- paddr = md->phys_addr;
- npages = md->num_pages;
- memrange_efi_to_native(&paddr, &npages);
- size = npages << PAGE_SHIFT;
-
- if (free_start) {
- if (paddr <= free_end)
- free_end = paddr + size;
- else {
- total_freed += free_region(free_start, free_end);
- free_start = paddr;
- free_end = paddr + size;
- }
- } else {
- free_start = paddr;
- free_end = paddr + size;
- }
- if (free_start < keep_end) {
- free_start += PAGE_SIZE;
- if (free_start >= free_end)
- free_start = 0;
- }
- }
- if (free_start)
- total_freed += free_region(free_start, free_end);
-
- if (total_freed)
- pr_info("Freed 0x%llx bytes of EFI boot services memory",
- total_freed);
-}
-
void __init efi_init(void)
{
struct efi_fdt_params params;
@@ -318,156 +225,154 @@ void __init efi_init(void)
return;
reserve_regions();
+ early_memunmap(memmap.map, params.mmap_size);
}
-void __init efi_idmap_init(void)
+static bool __init efi_virtmap_init(void)
{
- if (!efi_enabled(EFI_BOOT))
- return;
-
- /* boot time idmap_pg_dir is incomplete, so fill in missing parts */
- efi_setup_idmap();
- early_memunmap(memmap.map, memmap.map_end - memmap.map);
-}
-
-static int __init remap_region(efi_memory_desc_t *md, void **new)
-{
- u64 paddr, vaddr, npages, size;
+ efi_memory_desc_t *md;
- paddr = md->phys_addr;
- npages = md->num_pages;
- memrange_efi_to_native(&paddr, &npages);
- size = npages << PAGE_SHIFT;
+ init_new_context(NULL, &efi_mm);
- if (is_normal_ram(md))
- vaddr = (__force u64)ioremap_cache(paddr, size);
- else
- vaddr = (__force u64)ioremap(paddr, size);
+ for_each_efi_memory_desc(&memmap, md) {
+ u64 paddr, npages, size;
+ pgprot_t prot;
- if (!vaddr) {
- pr_err("Unable to remap 0x%llx pages @ %p\n",
- npages, (void *)paddr);
- return 0;
- }
+ if (!(md->attribute & EFI_MEMORY_RUNTIME))
+ continue;
+ if (md->virt_addr == 0)
+ return false;
- /* adjust for any rounding when EFI and system pagesize differs */
- md->virt_addr = vaddr + (md->phys_addr - paddr);
+ paddr = md->phys_addr;
+ npages = md->num_pages;
+ memrange_efi_to_native(&paddr, &npages);
+ size = npages << PAGE_SHIFT;
- if (uefi_debug)
- pr_info(" EFI remap 0x%012llx => %p\n",
+ pr_info(" EFI remap 0x%016llx => %p\n",
md->phys_addr, (void *)md->virt_addr);
- memcpy(*new, md, memmap.desc_size);
- *new += memmap.desc_size;
-
- return 1;
+ /*
+ * Only regions of type EFI_RUNTIME_SERVICES_CODE need to be
+ * executable, everything else can be mapped with the XN bits
+ * set.
+ */
+ if (!is_normal_ram(md))
+ prot = __pgprot(PROT_DEVICE_nGnRE);
+ else if (md->type == EFI_RUNTIME_SERVICES_CODE)
+ prot = PAGE_KERNEL_EXEC;
+ else
+ prot = PAGE_KERNEL;
+
+ create_pgd_mapping(&efi_mm, paddr, md->virt_addr, size,
+ __pgprot(pgprot_val(prot) | PTE_NG));
+ }
+ return true;
}
/*
- * Switch UEFI from an identity map to a kernel virtual map
+ * Enable the UEFI Runtime Services if all prerequisites are in place, i.e.,
+ * non-early mapping of the UEFI system table and virtual mappings for all
+ * EFI_MEMORY_RUNTIME regions.
*/
-static int __init arm64_enter_virtual_mode(void)
+static int __init arm64_enable_runtime_services(void)
{
- efi_memory_desc_t *md;
- phys_addr_t virtmap_phys;
- void *virtmap, *virt_md;
- efi_status_t status;
u64 mapsize;
- int count = 0;
- unsigned long flags;
if (!efi_enabled(EFI_BOOT)) {
pr_info("EFI services will not be available.\n");
return -1;
}
- mapsize = memmap.map_end - memmap.map;
-
if (efi_runtime_disabled()) {
pr_info("EFI runtime services will be disabled.\n");
return -1;
}
pr_info("Remapping and enabling EFI services.\n");
- /* replace early memmap mapping with permanent mapping */
+
+ mapsize = memmap.map_end - memmap.map;
memmap.map = (__force void *)ioremap_cache((phys_addr_t)memmap.phys_map,
mapsize);
- memmap.map_end = memmap.map + mapsize;
-
- efi.memmap = &memmap;
-
- /* Map the runtime regions */
- virtmap = kmalloc(mapsize, GFP_KERNEL);
- if (!virtmap) {
- pr_err("Failed to allocate EFI virtual memmap\n");
+ if (!memmap.map) {
+ pr_err("Failed to remap EFI memory map\n");
return -1;
}
- virtmap_phys = virt_to_phys(virtmap);
- virt_md = virtmap;
-
- for_each_efi_memory_desc(&memmap, md) {
- if (!(md->attribute & EFI_MEMORY_RUNTIME))
- continue;
- if (!remap_region(md, &virt_md))
- goto err_unmap;
- ++count;
- }
+ memmap.map_end = memmap.map + mapsize;
+ efi.memmap = &memmap;
- efi.systab = (__force void *)efi_lookup_mapped_addr(efi_system_table);
+ efi.systab = (__force void *)ioremap_cache(efi_system_table,
+ sizeof(efi_system_table_t));
if (!efi.systab) {
- /*
- * If we have no virtual mapping for the System Table at this
- * point, the memory map doesn't cover the physical offset where
- * it resides. This means the System Table will be inaccessible
- * to Runtime Services themselves once the virtual mapping is
- * installed.
- */
- pr_err("Failed to remap EFI System Table -- buggy firmware?\n");
- goto err_unmap;
+ pr_err("Failed to remap EFI System Table\n");
+ return -1;
}
set_bit(EFI_SYSTEM_TABLES, &efi.flags);
- local_irq_save(flags);
- cpu_switch_mm(idmap_pg_dir, &init_mm);
-
- /* Call SetVirtualAddressMap with the physical address of the map */
- runtime = efi.systab->runtime;
- efi.set_virtual_address_map = runtime->set_virtual_address_map;
-
- status = efi.set_virtual_address_map(count * memmap.desc_size,
- memmap.desc_size,
- memmap.desc_version,
- (efi_memory_desc_t *)virtmap_phys);
- cpu_set_reserved_ttbr0();
- flush_tlb_all();
- local_irq_restore(flags);
-
- kfree(virtmap);
-
- free_boot_services();
-
- if (status != EFI_SUCCESS) {
- pr_err("Failed to set EFI virtual address map! [%lx]\n",
- status);
+ if (!efi_virtmap_init()) {
+ pr_err("No UEFI virtual mapping was installed -- runtime services will not be available\n");
return -1;
}
/* Set up runtime services function pointers */
- runtime = efi.systab->runtime;
efi_native_runtime_setup();
set_bit(EFI_RUNTIME_SERVICES, &efi.flags);
efi.runtime_version = efi.systab->hdr.revision;
return 0;
+}
+early_initcall(arm64_enable_runtime_services);
-err_unmap:
- /* unmap all mappings that succeeded: there are 'count' of those */
- for (virt_md = virtmap; count--; virt_md += memmap.desc_size) {
- md = virt_md;
- iounmap((__force void __iomem *)md->virt_addr);
+static int __init arm64_dmi_init(void)
+{
+ /*
+ * On arm64, DMI depends on UEFI, and dmi_scan_machine() needs to
+ * be called early because dmi_id_init(), which is an arch_initcall
+ * itself, depends on dmi_scan_machine() having been called already.
+ */
+ dmi_scan_machine();
+ if (dmi_available)
+ dmi_set_dump_stack_arch_desc();
+ return 0;
+}
+core_initcall(arm64_dmi_init);
+
+static void efi_set_pgd(struct mm_struct *mm)
+{
+ __switch_mm(mm);
+
+ if (system_uses_ttbr0_pan()) {
+ if (mm != current->active_mm) {
+ /*
+ * Update the current thread's saved ttbr0 since it is
+ * restored as part of a return from exception. Set
+ * the hardware TTBR0_EL1 using cpu_switch_mm()
+ * directly to enable potential errata workarounds.
+ */
+ update_saved_ttbr0(current, mm);
+ cpu_switch_mm(mm->pgd, mm);
+ } else {
+ /*
+ * Defer the switch to the current thread's TTBR0_EL1
+ * until uaccess_enable(). Restore the current
+ * thread's saved ttbr0 corresponding to its active_mm
+ * (if different from init_mm).
+ */
+ cpu_set_reserved_ttbr0();
+ if (current->active_mm != &init_mm)
+ update_saved_ttbr0(current, current->active_mm);
+ }
}
- kfree(virtmap);
- return -1;
}
-early_initcall(arm64_enter_virtual_mode);
+
+void efi_virtmap_load(void)
+{
+ preempt_disable();
+ efi_set_pgd(&efi_mm);
+}
+
+void efi_virtmap_unload(void)
+{
+ efi_set_pgd(current->active_mm);
+ preempt_enable();
+}
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 075e42e74cc9..d16c670e83ff 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -27,7 +27,10 @@
#include <asm/cpufeature.h>
#include <asm/errno.h>
#include <asm/esr.h>
+#include <asm/memory.h>
+#include <asm/ptrace.h>
#include <asm/thread_info.h>
+#include <asm/uaccess.h>
#include <asm/asm-uaccess.h>
#include <asm/unistd.h>
@@ -67,36 +70,71 @@
#define BAD_ERROR 3
.macro kernel_entry, el, regsize = 64
- sub sp, sp, #S_FRAME_SIZE - S_LR // room for LR, SP, SPSR, ELR
+ sub sp, sp, #S_FRAME_SIZE
.if \regsize == 32
mov w0, w0 // zero upper 32 bits of x0
.endif
- push x28, x29
- push x26, x27
- push x24, x25
- push x22, x23
- push x20, x21
- push x18, x19
- push x16, x17
- push x14, x15
- push x12, x13
- push x10, x11
- push x8, x9
- push x6, x7
- push x4, x5
- push x2, x3
- push x0, x1
+ stp x0, x1, [sp, #16 * 0]
+ stp x2, x3, [sp, #16 * 1]
+ stp x4, x5, [sp, #16 * 2]
+ stp x6, x7, [sp, #16 * 3]
+ stp x8, x9, [sp, #16 * 4]
+ stp x10, x11, [sp, #16 * 5]
+ stp x12, x13, [sp, #16 * 6]
+ stp x14, x15, [sp, #16 * 7]
+ stp x16, x17, [sp, #16 * 8]
+ stp x18, x19, [sp, #16 * 9]
+ stp x20, x21, [sp, #16 * 10]
+ stp x22, x23, [sp, #16 * 11]
+ stp x24, x25, [sp, #16 * 12]
+ stp x26, x27, [sp, #16 * 13]
+ stp x28, x29, [sp, #16 * 14]
+
.if \el == 0
mrs x21, sp_el0
- get_thread_info tsk // Ensure MDSCR_EL1.SS is clear,
+ mov tsk, sp
+ and tsk, tsk, #~(THREAD_SIZE - 1) // Ensure MDSCR_EL1.SS is clear,
ldr x19, [tsk, #TI_FLAGS] // since we can unmask debug
disable_step_tsk x19, x20 // exceptions when scheduling.
.else
add x21, sp, #S_FRAME_SIZE
- .endif
+ get_thread_info tsk
+ /* Save the task's original addr_limit and set USER_DS (TASK_SIZE_64) */
+ ldr x20, [tsk, #TI_ADDR_LIMIT]
+ str x20, [sp, #S_ORIG_ADDR_LIMIT]
+ mov x20, #TASK_SIZE_64
+ str x20, [tsk, #TI_ADDR_LIMIT]
+ ALTERNATIVE(nop, SET_PSTATE_UAO(0), ARM64_HAS_UAO, CONFIG_ARM64_UAO)
+ .endif /* \el == 0 */
mrs x22, elr_el1
mrs x23, spsr_el1
stp lr, x21, [sp, #S_LR]
+
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+ /*
+ * Set the TTBR0 PAN bit in SPSR. When the exception is taken from
+ * EL0, there is no need to check the state of TTBR0_EL1 since
+ * accesses are always enabled.
+ * Note that the meaning of this bit differs from the ARMv8.1 PAN
+ * feature as all TTBR0_EL1 accesses are disabled, not just those to
+ * user mappings.
+ */
+alternative_if ARM64_HAS_PAN
+ b 1f // skip TTBR0 PAN
+alternative_else_nop_endif
+
+ .if \el != 0
+ mrs x21, ttbr0_el1
+ tst x21, #0xffff << 48 // Check for the reserved ASID
+ orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR
+ b.eq 1f // TTBR0 access already disabled
+ and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR
+ .endif
+
+ __uaccess_ttbr0_disable x21
+1:
+#endif
+
stp x22, x23, [sp, #S_PC]
/*
@@ -108,6 +146,13 @@
.endif
/*
+ * Set sp_el0 to current thread_info.
+ */
+ .if \el == 0
+ msr sp_el0, tsk
+ .endif
+
+ /*
* Registers that may be useful after this macro is invoked:
*
* x21 - aborted SP
@@ -116,64 +161,86 @@
*/
.endm
- .macro kernel_exit, el, ret = 0
+ .macro kernel_exit, el
+ .if \el != 0
+ /* Restore the task's original addr_limit. */
+ ldr x20, [sp, #S_ORIG_ADDR_LIMIT]
+ str x20, [tsk, #TI_ADDR_LIMIT]
+
+ /* No need to restore UAO, it will be restored from SPSR_EL1 */
+ .endif
+
ldp x21, x22, [sp, #S_PC] // load ELR, SPSR
.if \el == 0
ct_user_enter
- ldr x23, [sp, #S_SP] // load return stack pointer
+ .endif
+
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+ /*
+ * Restore access to TTBR0_EL1. If returning to EL0, no need for SPSR
+ * PAN bit checking.
+ */
+alternative_if ARM64_HAS_PAN
+ b 2f // skip TTBR0 PAN
+alternative_else_nop_endif
+
+ .if \el != 0
+ tbnz x22, #22, 1f // Skip re-enabling TTBR0 access if the PSR_PAN_BIT is set
+ .endif
+ __uaccess_ttbr0_enable x0
+1:
+ .if \el != 0
+ and x22, x22, #~PSR_PAN_BIT // ARMv8.0 CPUs do not understand this bit
+ .endif
+2:
+#endif
+
+ .if \el == 0
+ ldr x23, [sp, #S_SP] // load return stack pointer
+ msr sp_el0, x23
#ifdef CONFIG_ARM64_ERRATUM_845719
- alternative_insn \
- "nop", \
- "tbz x22, #4, 1f", \
- ARM64_WORKAROUND_845719
+alternative_if_not ARM64_WORKAROUND_845719
+ nop
+ nop
#ifdef CONFIG_PID_IN_CONTEXTIDR
- alternative_insn \
- "nop; nop", \
- "mrs x29, contextidr_el1; msr contextidr_el1, x29; 1:", \
- ARM64_WORKAROUND_845719
+ nop
+#endif
+alternative_else
+ tbz x22, #4, 1f
+#ifdef CONFIG_PID_IN_CONTEXTIDR
+ mrs x29, contextidr_el1
+ msr contextidr_el1, x29
#else
- alternative_insn \
- "nop", \
- "msr contextidr_el1, xzr; 1:", \
- ARM64_WORKAROUND_845719
+ msr contextidr_el1, xzr
#endif
+1:
+alternative_endif
#endif
.endif
- .if \ret
- ldr x1, [sp, #S_X1] // preserve x0 (syscall return)
- add sp, sp, S_X2
- .else
- pop x0, x1
- .endif
- pop x2, x3 // load the rest of the registers
- pop x4, x5
- pop x6, x7
- pop x8, x9
+
msr elr_el1, x21 // set up the return data
msr spsr_el1, x22
- .if \el == 0
- msr sp_el0, x23
- .endif
- pop x10, x11
- pop x12, x13
- pop x14, x15
- pop x16, x17
- pop x18, x19
- pop x20, x21
- pop x22, x23
- pop x24, x25
- pop x26, x27
- pop x28, x29
- ldr lr, [sp], #S_FRAME_SIZE - S_LR // load LR and restore SP
+ ldp x0, x1, [sp, #16 * 0]
+ ldp x2, x3, [sp, #16 * 1]
+ ldp x4, x5, [sp, #16 * 2]
+ ldp x6, x7, [sp, #16 * 3]
+ ldp x8, x9, [sp, #16 * 4]
+ ldp x10, x11, [sp, #16 * 5]
+ ldp x12, x13, [sp, #16 * 6]
+ ldp x14, x15, [sp, #16 * 7]
+ ldp x16, x17, [sp, #16 * 8]
+ ldp x18, x19, [sp, #16 * 9]
+ ldp x20, x21, [sp, #16 * 10]
+ ldp x22, x23, [sp, #16 * 11]
+ ldp x24, x25, [sp, #16 * 12]
+ ldp x26, x27, [sp, #16 * 13]
+ ldp x28, x29, [sp, #16 * 14]
+ ldr lr, [sp, #S_LR]
+ add sp, sp, #S_FRAME_SIZE // restore sp
eret // return to kernel
.endm
- .macro get_thread_info, rd
- mov \rd, sp
- and \rd, \rd, #~(THREAD_SIZE - 1) // top of stack
- .endm
-
/*
* These are the registers used in the syscall handler, and allow us to
* have in theory up to 7 arguments to a function - x0 to x6.
@@ -189,7 +256,8 @@ tsk .req x28 // current thread_info
* Interrupt handling.
*/
.macro irq_handler
- ldr x1, handle_arch_irq
+ adrp x1, handle_arch_irq
+ ldr x1, [x1, #:lo12:handle_arch_irq]
mov x0, sp
blr x1
.endm
@@ -234,7 +302,7 @@ END(vectors)
* Invalid mode handlers
*/
.macro inv_entry, el, reason, regsize = 64
- kernel_entry el, \regsize
+ kernel_entry \el, \regsize
mov x0, sp
mov x1, #\reason
mrs x2, esr_el1
@@ -290,20 +358,27 @@ ENDPROC(el1_error_invalid)
el1_sync:
kernel_entry 1
mrs x1, esr_el1 // read the syndrome register
- lsr x24, x1, #ESR_EL1_EC_SHIFT // exception class
- cmp x24, #ESR_EL1_EC_DABT_EL1 // data abort in EL1
+ lsr x24, x1, #ESR_ELx_EC_SHIFT // exception class
+ cmp x24, #ESR_ELx_EC_DABT_CUR // data abort in EL1
b.eq el1_da
- cmp x24, #ESR_EL1_EC_SYS64 // configurable trap
+ cmp x24, #ESR_ELx_EC_IABT_CUR // instruction abort in EL1
+ b.eq el1_ia
+ cmp x24, #ESR_ELx_EC_SYS64 // configurable trap
b.eq el1_undef
- cmp x24, #ESR_EL1_EC_SP_ALIGN // stack alignment exception
+ cmp x24, #ESR_ELx_EC_SP_ALIGN // stack alignment exception
b.eq el1_sp_pc
- cmp x24, #ESR_EL1_EC_PC_ALIGN // pc alignment exception
+ cmp x24, #ESR_ELx_EC_PC_ALIGN // pc alignment exception
b.eq el1_sp_pc
- cmp x24, #ESR_EL1_EC_UNKNOWN // unknown exception in EL1
+ cmp x24, #ESR_ELx_EC_UNKNOWN // unknown exception in EL1
b.eq el1_undef
- cmp x24, #ESR_EL1_EC_BREAKPT_EL1 // debug exception in EL1
+ cmp x24, #ESR_ELx_EC_BREAKPT_CUR // debug exception in EL1
b.ge el1_dbg
b el1_inv
+
+el1_ia:
+ /*
+ * Fall through to the Data abort case
+ */
el1_da:
/*
* Data abort handling
@@ -340,7 +415,7 @@ el1_dbg:
/*
* Debug exception handling
*/
- cmp x24, #ESR_EL1_EC_BRK64 // if BRK64
+ cmp x24, #ESR_ELx_EC_BRK64 // if BRK64
cinc x24, x24, eq // set bit '0'
tbz x24, #0, el1_inv // EL1 only
mrs x0, far_el1
@@ -397,26 +472,26 @@ el1_preempt:
el0_sync:
kernel_entry 0
mrs x25, esr_el1 // read the syndrome register
- lsr x24, x25, #ESR_EL1_EC_SHIFT // exception class
- cmp x24, #ESR_EL1_EC_SVC64 // SVC in 64-bit state
+ lsr x24, x25, #ESR_ELx_EC_SHIFT // exception class
+ cmp x24, #ESR_ELx_EC_SVC64 // SVC in 64-bit state
b.eq el0_svc
- cmp x24, #ESR_EL1_EC_DABT_EL0 // data abort in EL0
+ cmp x24, #ESR_ELx_EC_DABT_LOW // data abort in EL0
b.eq el0_da
- cmp x24, #ESR_EL1_EC_IABT_EL0 // instruction abort in EL0
+ cmp x24, #ESR_ELx_EC_IABT_LOW // instruction abort in EL0
b.eq el0_ia
- cmp x24, #ESR_EL1_EC_FP_ASIMD // FP/ASIMD access
+ cmp x24, #ESR_ELx_EC_FP_ASIMD // FP/ASIMD access
b.eq el0_fpsimd_acc
- cmp x24, #ESR_EL1_EC_FP_EXC64 // FP/ASIMD exception
+ cmp x24, #ESR_ELx_EC_FP_EXC64 // FP/ASIMD exception
b.eq el0_fpsimd_exc
- cmp x24, #ESR_EL1_EC_SYS64 // configurable trap
+ cmp x24, #ESR_ELx_EC_SYS64 // configurable trap
b.eq el0_undef
- cmp x24, #ESR_EL1_EC_SP_ALIGN // stack alignment exception
+ cmp x24, #ESR_ELx_EC_SP_ALIGN // stack alignment exception
b.eq el0_sp_pc
- cmp x24, #ESR_EL1_EC_PC_ALIGN // pc alignment exception
+ cmp x24, #ESR_ELx_EC_PC_ALIGN // pc alignment exception
b.eq el0_sp_pc
- cmp x24, #ESR_EL1_EC_UNKNOWN // unknown exception in EL0
+ cmp x24, #ESR_ELx_EC_UNKNOWN // unknown exception in EL0
b.eq el0_undef
- cmp x24, #ESR_EL1_EC_BREAKPT_EL0 // debug exception in EL0
+ cmp x24, #ESR_ELx_EC_BREAKPT_LOW // debug exception in EL0
b.ge el0_dbg
b el0_inv
@@ -425,30 +500,30 @@ el0_sync:
el0_sync_compat:
kernel_entry 0, 32
mrs x25, esr_el1 // read the syndrome register
- lsr x24, x25, #ESR_EL1_EC_SHIFT // exception class
- cmp x24, #ESR_EL1_EC_SVC32 // SVC in 32-bit state
+ lsr x24, x25, #ESR_ELx_EC_SHIFT // exception class
+ cmp x24, #ESR_ELx_EC_SVC32 // SVC in 32-bit state
b.eq el0_svc_compat
- cmp x24, #ESR_EL1_EC_DABT_EL0 // data abort in EL0
+ cmp x24, #ESR_ELx_EC_DABT_LOW // data abort in EL0
b.eq el0_da
- cmp x24, #ESR_EL1_EC_IABT_EL0 // instruction abort in EL0
+ cmp x24, #ESR_ELx_EC_IABT_LOW // instruction abort in EL0
b.eq el0_ia
- cmp x24, #ESR_EL1_EC_FP_ASIMD // FP/ASIMD access
+ cmp x24, #ESR_ELx_EC_FP_ASIMD // FP/ASIMD access
b.eq el0_fpsimd_acc
- cmp x24, #ESR_EL1_EC_FP_EXC32 // FP/ASIMD exception
+ cmp x24, #ESR_ELx_EC_FP_EXC32 // FP/ASIMD exception
b.eq el0_fpsimd_exc
- cmp x24, #ESR_EL1_EC_UNKNOWN // unknown exception in EL0
+ cmp x24, #ESR_ELx_EC_UNKNOWN // unknown exception in EL0
b.eq el0_undef
- cmp x24, #ESR_EL1_EC_CP15_32 // CP15 MRC/MCR trap
+ cmp x24, #ESR_ELx_EC_CP15_32 // CP15 MRC/MCR trap
b.eq el0_undef
- cmp x24, #ESR_EL1_EC_CP15_64 // CP15 MRRC/MCRR trap
+ cmp x24, #ESR_ELx_EC_CP15_64 // CP15 MRRC/MCRR trap
b.eq el0_undef
- cmp x24, #ESR_EL1_EC_CP14_MR // CP14 MRC/MCR trap
+ cmp x24, #ESR_ELx_EC_CP14_MR // CP14 MRC/MCR trap
b.eq el0_undef
- cmp x24, #ESR_EL1_EC_CP14_LS // CP14 LDC/STC trap
+ cmp x24, #ESR_ELx_EC_CP14_LS // CP14 LDC/STC trap
b.eq el0_undef
- cmp x24, #ESR_EL1_EC_CP14_64 // CP14 MRRC/MCRR trap
+ cmp x24, #ESR_ELx_EC_CP14_64 // CP14 MRRC/MCRR trap
b.eq el0_undef
- cmp x24, #ESR_EL1_EC_BREAKPT_EL0 // debug exception in EL0
+ cmp x24, #ESR_ELx_EC_BREAKPT_LOW // debug exception in EL0
b.ge el0_dbg
b el0_inv
el0_svc_compat:
@@ -477,8 +552,8 @@ el0_da:
clear_address_tag x0, x26
mov x1, x25
mov x2, sp
- adr lr, ret_to_user
- b do_mem_abort
+ bl do_mem_abort
+ b ret_to_user
el0_ia:
/*
* Instruction abort handling
@@ -488,10 +563,10 @@ el0_ia:
enable_dbg_and_irq
ct_user_exit
mov x0, x26
- orr x1, x25, #1 << 24 // use reserved ISS bit for instruction aborts
+ mov x1, x25
mov x2, sp
- adr lr, ret_to_user
- b do_mem_abort
+ bl do_mem_abort
+ b ret_to_user
el0_fpsimd_acc:
/*
* Floating Point or Advanced SIMD access
@@ -500,8 +575,8 @@ el0_fpsimd_acc:
ct_user_exit
mov x0, x25
mov x1, sp
- adr lr, ret_to_user
- b do_fpsimd_acc
+ bl do_fpsimd_acc
+ b ret_to_user
el0_fpsimd_exc:
/*
* Floating Point or Advanced SIMD exception
@@ -510,8 +585,8 @@ el0_fpsimd_exc:
ct_user_exit
mov x0, x25
mov x1, sp
- adr lr, ret_to_user
- b do_fpsimd_exc
+ bl do_fpsimd_exc
+ b ret_to_user
el0_sp_pc:
/*
* Stack or PC alignment exception handling
@@ -523,8 +598,8 @@ el0_sp_pc:
mov x0, x26
mov x1, x25
mov x2, sp
- adr lr, ret_to_user
- b do_sp_pc_abort
+ bl do_sp_pc_abort
+ b ret_to_user
el0_undef:
/*
* Undefined instruction
@@ -533,8 +608,8 @@ el0_undef:
enable_dbg_and_irq
ct_user_exit
mov x0, sp
- adr lr, ret_to_user
- b do_undefinstr
+ bl do_undefinstr
+ b ret_to_user
el0_dbg:
/*
* Debug exception handling
@@ -602,6 +677,8 @@ ENTRY(cpu_switch_to)
ldp x29, x9, [x8], #16
ldr lr, [x8]
mov sp, x9
+ and x9, x9, #~(THREAD_SIZE - 1)
+ msr sp_el0, x9
ret
ENDPROC(cpu_switch_to)
@@ -611,17 +688,21 @@ ENDPROC(cpu_switch_to)
*/
ret_fast_syscall:
disable_irq // disable interrupts
- ldr x1, [tsk, #TI_FLAGS]
+ str x0, [sp, #S_X0] // returned x0
+ ldr x1, [tsk, #TI_FLAGS] // re-check for syscall tracing
+ and x2, x1, #_TIF_SYSCALL_WORK
+ cbnz x2, ret_fast_syscall_trace
and x2, x1, #_TIF_WORK_MASK
- cbnz x2, fast_work_pending
+ cbnz x2, work_pending
enable_step_tsk x1, x2
- kernel_exit 0, ret = 1
+ kernel_exit 0
+ret_fast_syscall_trace:
+ enable_irq // enable interrupts
+ b __sys_trace_return_skipped // we already saved x0
/*
* Ok, we need to do extra processing, enter the slow path.
*/
-fast_work_pending:
- str x0, [sp, #S_X0] // returned x0
work_pending:
tbnz x1, #TIF_NEED_RESCHED, work_resched
/* TIF_SIGPENDING, TIF_NOTIFY_RESUME or TIF_FOREIGN_FPSTATE case */
@@ -645,7 +726,7 @@ ret_to_user:
cbnz x2, work_pending
enable_step_tsk x1, x2
no_work_pending:
- kernel_exit 0, ret = 0
+ kernel_exit 0
ENDPROC(ret_to_user)
/*
@@ -676,14 +757,15 @@ el0_svc_naked: // compat entry point
ldr x16, [tsk, #TI_FLAGS] // check for syscall hooks
tst x16, #_TIF_SYSCALL_WORK
b.ne __sys_trace
- adr lr, ret_fast_syscall // return address
cmp scno, sc_nr // check upper syscall limit
b.hs ni_sys
ldr x16, [stbl, scno, lsl #3] // address in the syscall table
- br x16 // call sys_* routine
+ blr x16 // call sys_* routine
+ b ret_fast_syscall
ni_sys:
mov x0, sp
- b do_ni_syscall
+ bl do_ni_syscall
+ b ret_fast_syscall
ENDPROC(el0_svc)
/*
@@ -691,26 +773,38 @@ ENDPROC(el0_svc)
* switches, and waiting for our parent to respond.
*/
__sys_trace:
- mov x0, sp
+ mov w0, #-1 // set default errno for
+ cmp scno, x0 // user-issued syscall(-1)
+ b.ne 1f
+ mov x0, #-ENOSYS
+ str x0, [sp, #S_X0]
+1: mov x0, sp
bl syscall_trace_enter
- adr lr, __sys_trace_return // return address
+ cmp w0, #-1 // skip the syscall?
+ b.eq __sys_trace_return_skipped
uxtw scno, w0 // syscall number (possibly new)
mov x1, sp // pointer to regs
cmp scno, sc_nr // check upper syscall limit
- b.hs ni_sys
+ b.hs __ni_sys_trace
ldp x0, x1, [sp] // restore the syscall args
ldp x2, x3, [sp, #S_X2]
ldp x4, x5, [sp, #S_X4]
ldp x6, x7, [sp, #S_X6]
ldr x16, [stbl, scno, lsl #3] // address in the syscall table
- br x16 // call sys_* routine
+ blr x16 // call sys_* routine
__sys_trace_return:
- str x0, [sp] // save returned x0
+ str x0, [sp, #S_X0] // save returned x0
+__sys_trace_return_skipped:
mov x0, sp
bl syscall_trace_exit
b ret_to_user
+__ni_sys_trace:
+ mov x0, sp
+ bl do_ni_syscall
+ b __sys_trace_return
+
/*
* Special system call wrappers.
*/
@@ -718,6 +812,3 @@ ENTRY(sys_rt_sigreturn_wrapper)
mov x0, sp
b sys_rt_sigreturn
ENDPROC(sys_rt_sigreturn_wrapper)
-
-ENTRY(handle_arch_irq)
- .quad 0
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 3dca15634e69..acc1afd5c749 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -17,6 +17,7 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+#include <linux/cpu.h>
#include <linux/cpu_pm.h>
#include <linux/kernel.h>
#include <linux/init.h>
@@ -157,6 +158,7 @@ void fpsimd_thread_switch(struct task_struct *next)
void fpsimd_flush_thread(void)
{
memset(&current->thread.fpsimd_state, 0, sizeof(struct fpsimd_state));
+ fpsimd_flush_task_state(current);
set_thread_flag(TIF_FOREIGN_FPSTATE);
}
@@ -287,7 +289,7 @@ static struct notifier_block fpsimd_cpu_pm_notifier_block = {
.notifier_call = fpsimd_cpu_pm_notifier,
};
-static void fpsimd_pm_init(void)
+static void __init fpsimd_pm_init(void)
{
cpu_pm_register_notifier(&fpsimd_cpu_pm_notifier_block);
}
@@ -296,25 +298,49 @@ static void fpsimd_pm_init(void)
static inline void fpsimd_pm_init(void) { }
#endif /* CONFIG_CPU_PM */
+#ifdef CONFIG_HOTPLUG_CPU
+static int fpsimd_cpu_hotplug_notifier(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ unsigned int cpu = (long)hcpu;
+
+ switch (action) {
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ per_cpu(fpsimd_last_state, cpu) = NULL;
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block fpsimd_cpu_hotplug_notifier_block = {
+ .notifier_call = fpsimd_cpu_hotplug_notifier,
+};
+
+static inline void fpsimd_hotplug_init(void)
+{
+ register_cpu_notifier(&fpsimd_cpu_hotplug_notifier_block);
+}
+
+#else
+static inline void fpsimd_hotplug_init(void) { }
+#endif
+
/*
* FP/SIMD support code initialisation.
*/
static int __init fpsimd_init(void)
{
- u64 pfr = read_cpuid(ID_AA64PFR0_EL1);
-
- if (pfr & (0xf << 16)) {
+ if (elf_hwcap & HWCAP_FP) {
+ fpsimd_pm_init();
+ fpsimd_hotplug_init();
+ } else {
pr_notice("Floating-point is not implemented\n");
- return 0;
}
- elf_hwcap |= HWCAP_FP;
- if (pfr & (0xf << 20))
+ if (!(elf_hwcap & HWCAP_ASIMD))
pr_notice("Advanced SIMD is not implemented\n");
- else
- elf_hwcap |= HWCAP_ASIMD;
-
- fpsimd_pm_init();
return 0;
}
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 77bfa3470ca0..f3c9d89c2c1e 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -29,6 +29,7 @@
#include <asm/asm-offsets.h>
#include <asm/cache.h>
#include <asm/cputype.h>
+#include <asm/kernel-pgtable.h>
#include <asm/memory.h>
#include <asm/thread_info.h>
#include <asm/pgtable-hwdef.h>
@@ -36,7 +37,7 @@
#include <asm/page.h>
#include <asm/virt.h>
-#define KERNEL_RAM_VADDR (PAGE_OFFSET + TEXT_OFFSET)
+#define __PHYS_OFFSET (KERNEL_START - TEXT_OFFSET)
#if (TEXT_OFFSET & 0xfff) != 0
#error TEXT_OFFSET must be at least 4KB aligned
@@ -46,44 +47,10 @@
#error TEXT_OFFSET must be less than 2MB
#endif
- .macro pgtbl, ttb0, ttb1, virt_to_phys
- ldr \ttb1, =swapper_pg_dir
- ldr \ttb0, =idmap_pg_dir
- add \ttb1, \ttb1, \virt_to_phys
- add \ttb0, \ttb0, \virt_to_phys
- .endm
-
-#ifdef CONFIG_ARM64_64K_PAGES
-#define BLOCK_SHIFT PAGE_SHIFT
-#define BLOCK_SIZE PAGE_SIZE
-#define TABLE_SHIFT PMD_SHIFT
-#else
-#define BLOCK_SHIFT SECTION_SHIFT
-#define BLOCK_SIZE SECTION_SIZE
-#define TABLE_SHIFT PUD_SHIFT
-#endif
-
-#define KERNEL_START KERNEL_RAM_VADDR
+#define KERNEL_START _text
#define KERNEL_END _end
/*
- * Initial memory map attributes.
- */
-#ifndef CONFIG_SMP
-#define PTE_FLAGS PTE_TYPE_PAGE | PTE_AF
-#define PMD_FLAGS PMD_TYPE_SECT | PMD_SECT_AF
-#else
-#define PTE_FLAGS PTE_TYPE_PAGE | PTE_AF | PTE_SHARED
-#define PMD_FLAGS PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S
-#endif
-
-#ifdef CONFIG_ARM64_64K_PAGES
-#define MM_MMUFLAGS PTE_ATTRINDX(MT_NORMAL) | PTE_FLAGS
-#else
-#define MM_MMUFLAGS PMD_ATTRINDX(MT_NORMAL) | PMD_FLAGS
-#endif
-
-/*
* Kernel startup entry point.
* ---------------------------
*
@@ -132,6 +99,8 @@ efi_head:
#endif
#ifdef CONFIG_EFI
+ .globl stext_offset
+ .set stext_offset, stext - efi_head
.align 3
pe_header:
.ascii "PE"
@@ -155,12 +124,12 @@ optional_header:
.long 0 // SizeOfInitializedData
.long 0 // SizeOfUninitializedData
.long efi_stub_entry - efi_head // AddressOfEntryPoint
- .long stext - efi_head // BaseOfCode
+ .long stext_offset // BaseOfCode
extra_header_fields:
.quad 0 // ImageBase
- .long 0x20 // SectionAlignment
- .long 0x8 // FileAlignment
+ .long 0x1000 // SectionAlignment
+ .long PECOFF_FILE_ALIGNMENT // FileAlignment
.short 0 // MajorOperatingSystemVersion
.short 0 // MinorOperatingSystemVersion
.short 0 // MajorImageVersion
@@ -172,7 +141,7 @@ extra_header_fields:
.long _end - efi_head // SizeOfImage
// Everything before the kernel image is considered part of the header
- .long stext - efi_head // SizeOfHeaders
+ .long stext_offset // SizeOfHeaders
.long 0 // CheckSum
.short 0xa // Subsystem (EFI application)
.short 0 // DllCharacteristics
@@ -217,53 +186,64 @@ section_table:
.byte 0
.byte 0 // end of 0 padding of section name
.long _end - stext // VirtualSize
- .long stext - efi_head // VirtualAddress
+ .long stext_offset // VirtualAddress
.long _edata - stext // SizeOfRawData
- .long stext - efi_head // PointerToRawData
+ .long stext_offset // PointerToRawData
.long 0 // PointerToRelocations (0 for executables)
.long 0 // PointerToLineNumbers (0 for executables)
.short 0 // NumberOfRelocations (0 for executables)
.short 0 // NumberOfLineNumbers (0 for executables)
.long 0xe0500020 // Characteristics (section flags)
- .align 5
+
+ /*
+ * EFI will load stext onwards at the 4k section alignment
+ * described in the PE/COFF header. To ensure that instruction
+ * sequences using an adrp and a :lo12: immediate will function
+ * correctly at this alignment, we must ensure that stext is
+ * placed at a 4k boundary in the Image to begin with.
+ */
+ .align 12
#endif
ENTRY(stext)
- mov x21, x0 // x21=FDT
+ bl preserve_boot_args
bl el2_setup // Drop to EL1, w20=cpu_boot_mode
- bl __calc_phys_offset // x24=PHYS_OFFSET, x28=PHYS_OFFSET-PAGE_OFFSET
+ adrp x24, __PHYS_OFFSET
bl set_cpu_boot_mode_flag
- mrs x22, midr_el1 // x22=cpuid
- mov x0, x22
- bl lookup_processor_type
- mov x23, x0 // x23=current cpu_table
- /*
- * __error_p may end up out of range for cbz if text areas are
- * aligned up to section sizes.
- */
- cbnz x23, 1f // invalid processor (x23=0)?
- b __error_p
-1:
+
bl __vet_fdt
bl __create_page_tables // x25=TTBR0, x26=TTBR1
/*
- * The following calls CPU specific code in a position independent
- * manner. See arch/arm64/mm/proc.S for details. x23 = base of
- * cpu_info structure selected by lookup_processor_type above.
+ * The following calls CPU setup code, see arch/arm64/mm/proc.S for
+ * details.
* On return, the CPU will be ready for the MMU to be turned on and
* the TCR will have been set.
*/
- ldr x27, __switch_data // address to jump to after
+ ldr x27, =__mmap_switched // address to jump to after
// MMU has been enabled
- adrp lr, __enable_mmu // return (PIC) address
- add lr, lr, #:lo12:__enable_mmu
- ldr x12, [x23, #CPU_INFO_SETUP]
- add x12, x12, x28 // __virt_to_phys
- br x12 // initialise processor
+ adr_l lr, __enable_mmu // return (PIC) address
+ b __cpu_setup // initialise processor
ENDPROC(stext)
/*
+ * Preserve the arguments passed by the bootloader in x0 .. x3
+ */
+preserve_boot_args:
+ mov x21, x0 // x21=FDT
+
+ adr_l x0, boot_args // record the contents of
+ stp x21, x1, [x0] // x0 .. x3 at kernel entry
+ stp x2, x3, [x0, #16]
+
+ dmb sy // needed before dc ivac with
+ // MMU off
+
+ add x1, x0, #0x20 // 4 x 8 bytes
+ b __inval_cache_range // tail call
+ENDPROC(preserve_boot_args)
+
+/*
* Determine validity of the x21 FDT pointer.
* The dtb must be 8-byte aligned and live in the first 512M of memory.
*/
@@ -312,7 +292,7 @@ ENDPROC(__vet_fdt)
.macro create_pgd_entry, tbl, virt, tmp1, tmp2
create_table_entry \tbl, \virt, PGDIR_SHIFT, PTRS_PER_PGD, \tmp1, \tmp2
#if SWAPPER_PGTABLE_LEVELS == 3
- create_table_entry \tbl, \virt, TABLE_SHIFT, PTRS_PER_PTE, \tmp1, \tmp2
+ create_table_entry \tbl, \virt, SWAPPER_TABLE_SHIFT, PTRS_PER_PTE, \tmp1, \tmp2
#endif
.endm
@@ -324,15 +304,15 @@ ENDPROC(__vet_fdt)
* Corrupts: phys, start, end, pstate
*/
.macro create_block_map, tbl, flags, phys, start, end
- lsr \phys, \phys, #BLOCK_SHIFT
- lsr \start, \start, #BLOCK_SHIFT
+ lsr \phys, \phys, #SWAPPER_BLOCK_SHIFT
+ lsr \start, \start, #SWAPPER_BLOCK_SHIFT
and \start, \start, #PTRS_PER_PTE - 1 // table index
- orr \phys, \flags, \phys, lsl #BLOCK_SHIFT // table entry
- lsr \end, \end, #BLOCK_SHIFT
+ orr \phys, \flags, \phys, lsl #SWAPPER_BLOCK_SHIFT // table entry
+ lsr \end, \end, #SWAPPER_BLOCK_SHIFT
and \end, \end, #PTRS_PER_PTE - 1 // table end index
9999: str \phys, [\tbl, \start, lsl #3] // store the entry
add \start, \start, #1 // next entry
- add \phys, \phys, #BLOCK_SIZE // next block
+ add \phys, \phys, #SWAPPER_BLOCK_SIZE // next block
cmp \start, \end
b.ls 9999b
.endm
@@ -346,7 +326,8 @@ ENDPROC(__vet_fdt)
* - pgd entry for fixed mappings (TTBR1)
*/
__create_page_tables:
- pgtbl x25, x26, x28 // idmap_pg_dir and swapper_pg_dir addresses
+ adrp x25, idmap_pg_dir
+ adrp x26, swapper_pg_dir
mov x27, lr
/*
@@ -354,14 +335,14 @@ __create_page_tables:
* dirty cache lines being evicted.
*/
mov x0, x25
- add x1, x26, #SWAPPER_DIR_SIZE
+ add x1, x26, #SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE
bl __inval_cache_range
/*
* Clear the idmap and swapper page tables.
*/
mov x0, x25
- add x6, x26, #SWAPPER_DIR_SIZE
+ add x6, x26, #SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE
1: stp xzr, xzr, [x0], #16
stp xzr, xzr, [x0], #16
stp xzr, xzr, [x0], #16
@@ -369,18 +350,56 @@ __create_page_tables:
cmp x0, x6
b.lo 1b
- ldr x7, =MM_MMUFLAGS
+ ldr x7, =SWAPPER_MM_MMUFLAGS
/*
* Create the identity mapping.
*/
mov x0, x25 // idmap_pg_dir
- ldr x3, =KERNEL_START
- add x3, x3, x28 // __pa(KERNEL_START)
+ adrp x3, KERNEL_START // __pa(KERNEL_START)
+
+#ifndef CONFIG_ARM64_VA_BITS_48
+#define EXTRA_SHIFT (PGDIR_SHIFT + PAGE_SHIFT - 3)
+#define EXTRA_PTRS (1 << (48 - EXTRA_SHIFT))
+
+ /*
+ * If VA_BITS < 48, it may be too small to allow for an ID mapping to be
+ * created that covers system RAM if that is located sufficiently high
+ * in the physical address space. So for the ID map, use an extended
+ * virtual range in that case, by configuring an additional translation
+ * level.
+ * First, we have to verify our assumption that the current value of
+ * VA_BITS was chosen such that all translation levels are fully
+ * utilised, and that lowering T0SZ will always result in an additional
+ * translation level to be configured.
+ */
+#if VA_BITS != EXTRA_SHIFT
+#error "Mismatch between VA_BITS and page size/number of translation levels"
+#endif
+
+ /*
+ * Calculate the maximum allowed value for TCR_EL1.T0SZ so that the
+ * entire kernel image can be ID mapped. As T0SZ == (64 - #bits used),
+ * this number conveniently equals the number of leading zeroes in
+ * the physical address of KERNEL_END.
+ */
+ adrp x5, KERNEL_END
+ clz x5, x5
+ cmp x5, TCR_T0SZ(VA_BITS) // default T0SZ small enough?
+ b.ge 1f // .. then skip additional level
+
+ adr_l x6, idmap_t0sz
+ str x5, [x6]
+ dmb sy
+ dc ivac, x6 // Invalidate potentially stale cache line
+
+ create_table_entry x0, x3, EXTRA_SHIFT, EXTRA_PTRS, x5, x6
+1:
+#endif
+
create_pgd_entry x0, x3, x5, x6
- ldr x6, =KERNEL_END
mov x5, x3 // __pa(KERNEL_START)
- add x6, x6, x28 // __pa(KERNEL_END)
+ adr_l x6, KERNEL_END // __pa(KERNEL_END)
create_block_map x0, x7, x3, x5, x6
/*
@@ -389,7 +408,7 @@ __create_page_tables:
mov x0, x26 // swapper_pg_dir
mov x5, #PAGE_OFFSET
create_pgd_entry x0, x5, x3, x6
- ldr x6, =KERNEL_END
+ ldr x6, =KERNEL_END // __va(KERNEL_END)
mov x3, x24 // phys offset
create_block_map x0, x7, x3, x5, x6
@@ -415,7 +434,8 @@ __create_page_tables:
* tables again to remove any speculatively loaded cache lines.
*/
mov x0, x25
- add x1, x26, #SWAPPER_DIR_SIZE
+ add x1, x26, #SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE
+ dmb sy
bl __inval_cache_range
mov lr, x27
@@ -423,37 +443,25 @@ __create_page_tables:
ENDPROC(__create_page_tables)
.ltorg
- .align 3
- .type __switch_data, %object
-__switch_data:
- .quad __mmap_switched
- .quad __bss_start // x6
- .quad __bss_stop // x7
- .quad processor_id // x4
- .quad __fdt_pointer // x5
- .quad memstart_addr // x6
- .quad init_thread_union + THREAD_START_SP // sp
-
/*
- * The following fragment of code is executed with the MMU on in MMU mode, and
- * uses absolute addresses; this is not position independent.
+ * The following fragment of code is executed with the MMU enabled.
*/
+ .set initial_sp, init_thread_union + THREAD_START_SP
__mmap_switched:
- adr x3, __switch_data + 8
-
- ldp x6, x7, [x3], #16
-1: cmp x6, x7
- b.hs 2f
- str xzr, [x6], #8 // Clear BSS
- b 1b
-2:
- ldp x4, x5, [x3], #16
- ldr x6, [x3], #8
- ldr x16, [x3]
- mov sp, x16
- str x22, [x4] // Save processor ID
- str x21, [x5] // Save FDT pointer
- str x24, [x6] // Save PHYS_OFFSET
+ // Clear BSS
+ adr_l x0, __bss_start
+ mov x1, xzr
+ adr_l x2, __bss_stop
+ sub x2, x2, x0
+ bl __pi_memset
+ dsb ishst // Make zero page visible to PTW
+
+ adr_l sp, initial_sp, x4
+ mov x4, sp
+ and x4, x4, #~(THREAD_SIZE - 1)
+ msr sp_el0, x4 // Save thread_info
+ str_l x21, __fdt_pointer, x5 // Save FDT pointer
+ str_l x24, memstart_addr, x6 // Save PHYS_OFFSET
mov x29, #0
#ifdef CONFIG_KASAN
bl kasan_early_init
@@ -546,7 +554,8 @@ CPU_LE( movk x0, #0x30d0, lsl #16 ) // Clear EE and E0E on LE systems
msr vttbr_el2, xzr
/* Hypervisor stub */
- adr x0, __hyp_stub_vectors
+ adrp x0, __hyp_stub_vectors
+ add x0, x0, #:lo12:__hyp_stub_vectors
msr vbar_el2, x0
/* spsr */
@@ -563,8 +572,7 @@ ENDPROC(el2_setup)
* in x20. See arch/arm64/include/asm/virt.h for more info.
*/
ENTRY(set_cpu_boot_mode_flag)
- ldr x1, =__boot_cpu_mode // Compute __boot_cpu_mode
- add x1, x1, x28
+ adr_l x1, __boot_cpu_mode
cmp w20, #BOOT_CPU_MODE_EL2
b.ne 1f
add x1, x1, #4
@@ -588,7 +596,6 @@ ENTRY(__boot_cpu_mode)
.long 0
.popsection
-#ifdef CONFIG_SMP
.align 3
1: .quad .
.quad secondary_holding_pen_release
@@ -599,15 +606,11 @@ ENTRY(__boot_cpu_mode)
*/
ENTRY(secondary_holding_pen)
bl el2_setup // Drop to EL1, w20=cpu_boot_mode
- bl __calc_phys_offset // x24=PHYS_OFFSET, x28=PHYS_OFFSET-PAGE_OFFSET
bl set_cpu_boot_mode_flag
mrs x0, mpidr_el1
ldr x1, =MPIDR_HWID_BITMASK
and x0, x0, x1
- adr x1, 1b
- ldp x2, x3, [x1]
- sub x1, x1, x2
- add x3, x3, x1
+ adr_l x3, secondary_holding_pen_release
pen: ldr x4, [x3]
cmp x4, x0
b.eq secondary_startup
@@ -621,7 +624,6 @@ ENDPROC(secondary_holding_pen)
*/
ENTRY(secondary_entry)
bl el2_setup // Drop to EL1
- bl __calc_phys_offset // x24=PHYS_OFFSET, x28=PHYS_OFFSET-PAGE_OFFSET
bl set_cpu_boot_mode_flag
b secondary_startup
ENDPROC(secondary_entry)
@@ -630,16 +632,9 @@ ENTRY(secondary_startup)
/*
* Common entry point for secondary CPUs.
*/
- mrs x22, midr_el1 // x22=cpuid
- mov x0, x22
- bl lookup_processor_type
- mov x23, x0 // x23=current cpu_table
- cbz x23, __error_p // invalid processor (x23=0)?
-
- pgtbl x25, x26, x28 // x25=TTBR0, x26=TTBR1
- ldr x12, [x23, #CPU_INFO_SETUP]
- add x12, x12, x28 // __virt_to_phys
- blr x12 // initialise processor
+ adrp x25, idmap_pg_dir
+ adrp x26, swapper_pg_dir
+ bl __cpu_setup // initialise processor
ldr x21, =secondary_data
ldr x27, =__secondary_switched // address to jump to after enabling the MMU
@@ -649,17 +644,19 @@ ENDPROC(secondary_startup)
ENTRY(__secondary_switched)
ldr x0, [x21] // get secondary_data.stack
mov sp, x0
+ and x0, x0, #~(THREAD_SIZE - 1)
+ msr sp_el0, x0 // save thread_info
mov x29, #0
b secondary_start_kernel
ENDPROC(__secondary_switched)
-#endif /* CONFIG_SMP */
/*
- * Setup common bits before finally enabling the MMU. Essentially this is just
- * loading the page table pointer and vector base registers.
+ * Enable the MMU.
*
- * On entry to this code, x0 must contain the SCTLR_EL1 value for turning on
- * the MMU.
+ * x0 = SCTLR_EL1 value for turning on the MMU.
+ * x27 = *virtual* address to jump to upon completion
+ *
+ * other registers depend on the function called upon completion
*/
__enable_mmu:
ldr x5, =vectors
@@ -667,89 +664,15 @@ __enable_mmu:
msr ttbr0_el1, x25 // load TTBR0
msr ttbr1_el1, x26 // load TTBR1
isb
- b __turn_mmu_on
-ENDPROC(__enable_mmu)
-
-/*
- * Enable the MMU. This completely changes the structure of the visible memory
- * space. You will not be able to trace execution through this.
- *
- * x0 = system control register
- * x27 = *virtual* address to jump to upon completion
- *
- * other registers depend on the function called upon completion
- *
- * We align the entire function to the smallest power of two larger than it to
- * ensure it fits within a single block map entry. Otherwise were PHYS_OFFSET
- * close to the end of a 512MB or 1GB block we might require an additional
- * table to map the entire function.
- */
- .align 4
-__turn_mmu_on:
msr sctlr_el1, x0
isb
+ /*
+ * Invalidate the local I-cache so that any instructions fetched
+ * speculatively from the PoC are discarded, since they may have
+ * been dynamically patched at the PoU.
+ */
+ ic iallu
+ dsb nsh
+ isb
br x27
-ENDPROC(__turn_mmu_on)
-
-/*
- * Calculate the start of physical memory.
- */
-__calc_phys_offset:
- adr x0, 1f
- ldp x1, x2, [x0]
- sub x28, x0, x1 // x28 = PHYS_OFFSET - PAGE_OFFSET
- add x24, x2, x28 // x24 = PHYS_OFFSET
- ret
-ENDPROC(__calc_phys_offset)
-
- .align 3
-1: .quad .
- .quad PAGE_OFFSET
-
-/*
- * Exception handling. Something went wrong and we can't proceed. We ought to
- * tell the user, but since we don't have any guarantee that we're even
- * running on the right architecture, we do virtually nothing.
- */
-__error_p:
-ENDPROC(__error_p)
-
-__error:
-1: nop
- b 1b
-ENDPROC(__error)
-
-/*
- * This function gets the processor ID in w0 and searches the cpu_table[] for
- * a match. It returns a pointer to the struct cpu_info it found. The
- * cpu_table[] must end with an empty (all zeros) structure.
- *
- * This routine can be called via C code and it needs to work with the MMU
- * both disabled and enabled (the offset is calculated automatically).
- */
-ENTRY(lookup_processor_type)
- adr x1, __lookup_processor_type_data
- ldp x2, x3, [x1]
- sub x1, x1, x2 // get offset between VA and PA
- add x3, x3, x1 // convert VA to PA
-1:
- ldp w5, w6, [x3] // load cpu_id_val and cpu_id_mask
- cbz w5, 2f // end of list?
- and w6, w6, w0
- cmp w5, w6
- b.eq 3f
- add x3, x3, #CPU_INFO_SZ
- b 1b
-2:
- mov x3, #0 // unknown processor
-3:
- mov x0, x3
- ret
-ENDPROC(lookup_processor_type)
-
- .align 3
- .type __lookup_processor_type_data, %object
-__lookup_processor_type_data:
- .quad .
- .quad cpu_table
- .size __lookup_processor_type_data, . - __lookup_processor_type_data
+ENDPROC(__enable_mmu)
diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c
index 8e20edfc5d13..b2a5ac76db76 100644
--- a/arch/arm64/kernel/hw_breakpoint.c
+++ b/arch/arm64/kernel/hw_breakpoint.c
@@ -33,7 +33,9 @@
#include <asm/hw_breakpoint.h>
#include <asm/kdebug.h>
#include <asm/traps.h>
+#include <asm/cpufeature.h>
#include <asm/cputype.h>
+#include <asm/sysreg.h>
#include <asm/system_misc.h>
#include <asm/uaccess.h>
@@ -53,13 +55,17 @@ static int core_num_wrps;
/* Determine number of BRP registers available. */
static int get_num_brps(void)
{
- return ((read_cpuid(ID_AA64DFR0_EL1) >> 12) & 0xf) + 1;
+ return 1 +
+ cpuid_feature_extract_field(read_system_reg(SYS_ID_AA64DFR0_EL1),
+ ID_AA64DFR0_BRPS_SHIFT);
}
/* Determine number of WRP registers available. */
static int get_num_wrps(void)
{
- return ((read_cpuid(ID_AA64DFR0_EL1) >> 20) & 0xf) + 1;
+ return 1 +
+ cpuid_feature_extract_field(read_system_reg(SYS_ID_AA64DFR0_EL1),
+ ID_AA64DFR0_WRPS_SHIFT);
}
int hw_breakpoint_slots(int type)
@@ -312,9 +318,21 @@ static int get_hbp_len(u8 hbp_len)
case ARM_BREAKPOINT_LEN_2:
len_in_bytes = 2;
break;
+ case ARM_BREAKPOINT_LEN_3:
+ len_in_bytes = 3;
+ break;
case ARM_BREAKPOINT_LEN_4:
len_in_bytes = 4;
break;
+ case ARM_BREAKPOINT_LEN_5:
+ len_in_bytes = 5;
+ break;
+ case ARM_BREAKPOINT_LEN_6:
+ len_in_bytes = 6;
+ break;
+ case ARM_BREAKPOINT_LEN_7:
+ len_in_bytes = 7;
+ break;
case ARM_BREAKPOINT_LEN_8:
len_in_bytes = 8;
break;
@@ -344,7 +362,7 @@ int arch_check_bp_in_kernelspace(struct perf_event *bp)
* to generic breakpoint descriptions.
*/
int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl,
- int *gen_len, int *gen_type)
+ int *gen_len, int *gen_type, int *offset)
{
/* Type */
switch (ctrl.type) {
@@ -364,17 +382,33 @@ int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl,
return -EINVAL;
}
+ if (!ctrl.len)
+ return -EINVAL;
+ *offset = __ffs(ctrl.len);
+
/* Len */
- switch (ctrl.len) {
+ switch (ctrl.len >> *offset) {
case ARM_BREAKPOINT_LEN_1:
*gen_len = HW_BREAKPOINT_LEN_1;
break;
case ARM_BREAKPOINT_LEN_2:
*gen_len = HW_BREAKPOINT_LEN_2;
break;
+ case ARM_BREAKPOINT_LEN_3:
+ *gen_len = HW_BREAKPOINT_LEN_3;
+ break;
case ARM_BREAKPOINT_LEN_4:
*gen_len = HW_BREAKPOINT_LEN_4;
break;
+ case ARM_BREAKPOINT_LEN_5:
+ *gen_len = HW_BREAKPOINT_LEN_5;
+ break;
+ case ARM_BREAKPOINT_LEN_6:
+ *gen_len = HW_BREAKPOINT_LEN_6;
+ break;
+ case ARM_BREAKPOINT_LEN_7:
+ *gen_len = HW_BREAKPOINT_LEN_7;
+ break;
case ARM_BREAKPOINT_LEN_8:
*gen_len = HW_BREAKPOINT_LEN_8;
break;
@@ -418,9 +452,21 @@ static int arch_build_bp_info(struct perf_event *bp)
case HW_BREAKPOINT_LEN_2:
info->ctrl.len = ARM_BREAKPOINT_LEN_2;
break;
+ case HW_BREAKPOINT_LEN_3:
+ info->ctrl.len = ARM_BREAKPOINT_LEN_3;
+ break;
case HW_BREAKPOINT_LEN_4:
info->ctrl.len = ARM_BREAKPOINT_LEN_4;
break;
+ case HW_BREAKPOINT_LEN_5:
+ info->ctrl.len = ARM_BREAKPOINT_LEN_5;
+ break;
+ case HW_BREAKPOINT_LEN_6:
+ info->ctrl.len = ARM_BREAKPOINT_LEN_6;
+ break;
+ case HW_BREAKPOINT_LEN_7:
+ info->ctrl.len = ARM_BREAKPOINT_LEN_7;
+ break;
case HW_BREAKPOINT_LEN_8:
info->ctrl.len = ARM_BREAKPOINT_LEN_8;
break;
@@ -512,18 +558,17 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
default:
return -EINVAL;
}
-
- info->address &= ~alignment_mask;
- info->ctrl.len <<= offset;
} else {
if (info->ctrl.type == ARM_BREAKPOINT_EXECUTE)
alignment_mask = 0x3;
else
alignment_mask = 0x7;
- if (info->address & alignment_mask)
- return -EINVAL;
+ offset = info->address & alignment_mask;
}
+ info->address &= ~alignment_mask;
+ info->ctrl.len <<= offset;
+
/*
* Disallow per-task kernel breakpoints since these would
* complicate the stepping code.
@@ -654,12 +699,47 @@ unlock:
return 0;
}
+/*
+ * Arm64 hardware does not always report a watchpoint hit address that matches
+ * one of the watchpoints set. It can also report an address "near" the
+ * watchpoint if a single instruction access both watched and unwatched
+ * addresses. There is no straight-forward way, short of disassembling the
+ * offending instruction, to map that address back to the watchpoint. This
+ * function computes the distance of the memory access from the watchpoint as a
+ * heuristic for the likelyhood that a given access triggered the watchpoint.
+ *
+ * See Section D2.10.5 "Determining the memory location that caused a Watchpoint
+ * exception" of ARMv8 Architecture Reference Manual for details.
+ *
+ * The function returns the distance of the address from the bytes watched by
+ * the watchpoint. In case of an exact match, it returns 0.
+ */
+static u64 get_distance_from_watchpoint(unsigned long addr, u64 val,
+ struct arch_hw_breakpoint_ctrl *ctrl)
+{
+ u64 wp_low, wp_high;
+ u32 lens, lene;
+
+ lens = __ffs(ctrl->len);
+ lene = __fls(ctrl->len);
+
+ wp_low = val + lens;
+ wp_high = val + lene;
+ if (addr < wp_low)
+ return wp_low - addr;
+ else if (addr > wp_high)
+ return addr - wp_high;
+ else
+ return 0;
+}
+
static int watchpoint_handler(unsigned long addr, unsigned int esr,
struct pt_regs *regs)
{
- int i, step = 0, *kernel_step, access;
+ int i, step = 0, *kernel_step, access, closest_match = 0;
+ u64 min_dist = -1, dist;
u32 ctrl_reg;
- u64 val, alignment_mask;
+ u64 val;
struct perf_event *wp, **slots;
struct debug_info *debug_info;
struct arch_hw_breakpoint *info;
@@ -668,35 +748,15 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr,
slots = this_cpu_ptr(wp_on_reg);
debug_info = &current->thread.debug;
+ /*
+ * Find all watchpoints that match the reported address. If no exact
+ * match is found. Attribute the hit to the closest watchpoint.
+ */
+ rcu_read_lock();
for (i = 0; i < core_num_wrps; ++i) {
- rcu_read_lock();
-
wp = slots[i];
-
if (wp == NULL)
- goto unlock;
-
- info = counter_arch_bp(wp);
- /* AArch32 watchpoints are either 4 or 8 bytes aligned. */
- if (is_compat_task()) {
- if (info->ctrl.len == ARM_BREAKPOINT_LEN_8)
- alignment_mask = 0x7;
- else
- alignment_mask = 0x3;
- } else {
- alignment_mask = 0x7;
- }
-
- /* Check if the watchpoint value matches. */
- val = read_wb_reg(AARCH64_DBG_REG_WVR, i);
- if (val != (untagged_addr(addr) & ~alignment_mask))
- goto unlock;
-
- /* Possible match, check the byte address select to confirm. */
- ctrl_reg = read_wb_reg(AARCH64_DBG_REG_WCR, i);
- decode_ctrl_reg(ctrl_reg, &ctrl);
- if (!((1 << (addr & alignment_mask)) & ctrl.len))
- goto unlock;
+ continue;
/*
* Check that the access type matches.
@@ -705,18 +765,41 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr,
access = (esr & AARCH64_ESR_ACCESS_MASK) ? HW_BREAKPOINT_W :
HW_BREAKPOINT_R;
if (!(access & hw_breakpoint_type(wp)))
- goto unlock;
+ continue;
+ /* Check if the watchpoint value and byte select match. */
+ val = read_wb_reg(AARCH64_DBG_REG_WVR, i);
+ ctrl_reg = read_wb_reg(AARCH64_DBG_REG_WCR, i);
+ decode_ctrl_reg(ctrl_reg, &ctrl);
+ dist = get_distance_from_watchpoint(addr, val, &ctrl);
+ if (dist < min_dist) {
+ min_dist = dist;
+ closest_match = i;
+ }
+ /* Is this an exact match? */
+ if (dist != 0)
+ continue;
+
+ info = counter_arch_bp(wp);
info->trigger = addr;
perf_bp_event(wp, regs);
/* Do we need to handle the stepping? */
if (!wp->overflow_handler)
step = 1;
+ }
+ if (min_dist > 0 && min_dist != -1) {
+ /* No exact match found. */
+ wp = slots[closest_match];
+ info = counter_arch_bp(wp);
+ info->trigger = addr;
+ perf_bp_event(wp, regs);
-unlock:
- rcu_read_unlock();
+ /* Do we need to handle the stepping? */
+ if (!wp->overflow_handler)
+ step = 1;
}
+ rcu_read_unlock();
if (!step)
return 0;
@@ -886,7 +969,7 @@ static int hw_breakpoint_reset_notify(struct notifier_block *self,
void *hcpu)
{
int cpu = (long)hcpu;
- if (action == CPU_ONLINE)
+ if ((action & ~CPU_TASKS_FROZEN) == CPU_ONLINE)
smp_call_function_single(cpu, hw_breakpoint_reset, NULL, 1);
return NOTIFY_OK;
}
diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c
index 7e9327a0986d..dd9671cd0bb2 100644
--- a/arch/arm64/kernel/insn.c
+++ b/arch/arm64/kernel/insn.c
@@ -17,14 +17,19 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <linux/bitops.h>
+#include <linux/bug.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
+#include <linux/mm.h>
#include <linux/smp.h>
+#include <linux/spinlock.h>
#include <linux/stop_machine.h>
+#include <linux/types.h>
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/debug-monitors.h>
+#include <asm/fixmap.h>
#include <asm/insn.h>
#define AARCH64_INSN_SF_BIT BIT(31)
@@ -72,6 +77,39 @@ bool __kprobes aarch64_insn_is_nop(u32 insn)
}
}
+bool aarch64_insn_is_branch_imm(u32 insn)
+{
+ return (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn) ||
+ aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn) ||
+ aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) ||
+ aarch64_insn_is_bcond(insn));
+}
+
+static DEFINE_SPINLOCK(patch_lock);
+
+static void __kprobes *patch_map(void *addr, int fixmap)
+{
+ unsigned long uintaddr = (uintptr_t) addr;
+ bool module = !core_kernel_text(uintaddr);
+ struct page *page;
+
+ if (module && IS_ENABLED(CONFIG_DEBUG_SET_MODULE_RONX))
+ page = vmalloc_to_page(addr);
+ else if (!module && IS_ENABLED(CONFIG_DEBUG_RODATA))
+ page = virt_to_page(addr);
+ else
+ return addr;
+
+ BUG_ON(!page);
+ set_fixmap(fixmap, page_to_phys(page));
+
+ return (void *) (__fix_to_virt(fixmap) + (uintaddr & ~PAGE_MASK));
+}
+
+static void __kprobes patch_unmap(int fixmap)
+{
+ clear_fixmap(fixmap);
+}
/*
* In ARMv8-A, A64 instructions have a fixed length of 32 bits and are always
* little-endian.
@@ -88,10 +126,27 @@ int __kprobes aarch64_insn_read(void *addr, u32 *insnp)
return ret;
}
+static int __kprobes __aarch64_insn_write(void *addr, u32 insn)
+{
+ void *waddr = addr;
+ unsigned long flags = 0;
+ int ret;
+
+ spin_lock_irqsave(&patch_lock, flags);
+ waddr = patch_map(addr, FIX_TEXT_POKE0);
+
+ ret = probe_kernel_write(waddr, &insn, AARCH64_INSN_SIZE);
+
+ patch_unmap(FIX_TEXT_POKE0);
+ spin_unlock_irqrestore(&patch_lock, flags);
+
+ return ret;
+}
+
int __kprobes aarch64_insn_write(void *addr, u32 insn)
{
insn = cpu_to_le32(insn);
- return probe_kernel_write(addr, &insn, AARCH64_INSN_SIZE);
+ return __aarch64_insn_write(addr, insn);
}
static bool __kprobes __aarch64_insn_hotpatch_safe(u32 insn)
@@ -218,23 +273,13 @@ int __kprobes aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt)
return aarch64_insn_patch_text_sync(addrs, insns, cnt);
}
-u32 __kprobes aarch64_insn_encode_immediate(enum aarch64_insn_imm_type type,
- u32 insn, u64 imm)
+static int __kprobes aarch64_get_imm_shift_mask(enum aarch64_insn_imm_type type,
+ u32 *maskp, int *shiftp)
{
- u32 immlo, immhi, lomask, himask, mask;
+ u32 mask;
int shift;
switch (type) {
- case AARCH64_INSN_IMM_ADR:
- lomask = 0x3;
- himask = 0x7ffff;
- immlo = imm & lomask;
- imm >>= 2;
- immhi = imm & himask;
- imm = (immlo << 24) | (immhi);
- mask = (lomask << 24) | (himask);
- shift = 5;
- break;
case AARCH64_INSN_IMM_26:
mask = BIT(26) - 1;
shift = 0;
@@ -273,9 +318,68 @@ u32 __kprobes aarch64_insn_encode_immediate(enum aarch64_insn_imm_type type,
shift = 16;
break;
default:
- pr_err("aarch64_insn_encode_immediate: unknown immediate encoding %d\n",
- type);
- return 0;
+ return -EINVAL;
+ }
+
+ *maskp = mask;
+ *shiftp = shift;
+
+ return 0;
+}
+
+#define ADR_IMM_HILOSPLIT 2
+#define ADR_IMM_SIZE SZ_2M
+#define ADR_IMM_LOMASK ((1 << ADR_IMM_HILOSPLIT) - 1)
+#define ADR_IMM_HIMASK ((ADR_IMM_SIZE >> ADR_IMM_HILOSPLIT) - 1)
+#define ADR_IMM_LOSHIFT 29
+#define ADR_IMM_HISHIFT 5
+
+u64 aarch64_insn_decode_immediate(enum aarch64_insn_imm_type type, u32 insn)
+{
+ u32 immlo, immhi, mask;
+ int shift;
+
+ switch (type) {
+ case AARCH64_INSN_IMM_ADR:
+ shift = 0;
+ immlo = (insn >> ADR_IMM_LOSHIFT) & ADR_IMM_LOMASK;
+ immhi = (insn >> ADR_IMM_HISHIFT) & ADR_IMM_HIMASK;
+ insn = (immhi << ADR_IMM_HILOSPLIT) | immlo;
+ mask = ADR_IMM_SIZE - 1;
+ break;
+ default:
+ if (aarch64_get_imm_shift_mask(type, &mask, &shift) < 0) {
+ pr_err("aarch64_insn_decode_immediate: unknown immediate encoding %d\n",
+ type);
+ return 0;
+ }
+ }
+
+ return (insn >> shift) & mask;
+}
+
+u32 __kprobes aarch64_insn_encode_immediate(enum aarch64_insn_imm_type type,
+ u32 insn, u64 imm)
+{
+ u32 immlo, immhi, mask;
+ int shift;
+
+ switch (type) {
+ case AARCH64_INSN_IMM_ADR:
+ shift = 0;
+ immlo = (imm & ADR_IMM_LOMASK) << ADR_IMM_LOSHIFT;
+ imm >>= ADR_IMM_HILOSPLIT;
+ immhi = (imm & ADR_IMM_HIMASK) << ADR_IMM_HISHIFT;
+ imm = immlo | immhi;
+ mask = ((ADR_IMM_LOMASK << ADR_IMM_LOSHIFT) |
+ (ADR_IMM_HIMASK << ADR_IMM_HISHIFT));
+ break;
+ default:
+ if (aarch64_get_imm_shift_mask(type, &mask, &shift) < 0) {
+ pr_err("aarch64_insn_encode_immediate: unknown immediate encoding %d\n",
+ type);
+ return 0;
+ }
}
/* Update the immediate field. */
@@ -961,6 +1065,58 @@ u32 aarch64_insn_gen_logical_shifted_reg(enum aarch64_insn_register dst,
return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_6, insn, shift);
}
+/*
+ * Decode the imm field of a branch, and return the byte offset as a
+ * signed value (so it can be used when computing a new branch
+ * target).
+ */
+s32 aarch64_get_branch_offset(u32 insn)
+{
+ s32 imm;
+
+ if (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn)) {
+ imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_26, insn);
+ return (imm << 6) >> 4;
+ }
+
+ if (aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) ||
+ aarch64_insn_is_bcond(insn)) {
+ imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_19, insn);
+ return (imm << 13) >> 11;
+ }
+
+ if (aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn)) {
+ imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_14, insn);
+ return (imm << 18) >> 16;
+ }
+
+ /* Unhandled instruction */
+ BUG();
+}
+
+/*
+ * Encode the displacement of a branch in the imm field and return the
+ * updated instruction.
+ */
+u32 aarch64_set_branch_offset(u32 insn, s32 offset)
+{
+ if (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn))
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_26, insn,
+ offset >> 2);
+
+ if (aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) ||
+ aarch64_insn_is_bcond(insn))
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn,
+ offset >> 2);
+
+ if (aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn))
+ return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_14, insn,
+ offset >> 2);
+
+ /* Unhandled instruction */
+ BUG();
+}
+
bool aarch32_insn_is_wide(u32 insn)
{
return insn >= 0xe800;
diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c
index 071a6ec13bd8..de99d4bd31bb 100644
--- a/arch/arm64/kernel/irq.c
+++ b/arch/arm64/kernel/irq.c
@@ -33,13 +33,13 @@ unsigned long irq_err_count;
int arch_show_interrupts(struct seq_file *p, int prec)
{
-#ifdef CONFIG_SMP
show_ipi_list(p, prec);
-#endif
seq_printf(p, "%*s: %10lu\n", prec, "Err", irq_err_count);
return 0;
}
+void (*handle_arch_irq)(struct pt_regs *) = NULL;
+
void __init set_handle_irq(void (*handle_irq)(struct pt_regs *))
{
if (handle_arch_irq)
diff --git a/arch/arm64/kernel/jump_label.c b/arch/arm64/kernel/jump_label.c
index 263a166291fb..4f1fec7a46db 100644
--- a/arch/arm64/kernel/jump_label.c
+++ b/arch/arm64/kernel/jump_label.c
@@ -22,9 +22,8 @@
#ifdef HAVE_JUMP_LABEL
-static void __arch_jump_label_transform(struct jump_entry *entry,
- enum jump_label_type type,
- bool is_static)
+void arch_jump_label_transform(struct jump_entry *entry,
+ enum jump_label_type type)
{
void *addr = (void *)entry->code;
u32 insn;
@@ -37,22 +36,18 @@ static void __arch_jump_label_transform(struct jump_entry *entry,
insn = aarch64_insn_gen_nop();
}
- if (is_static)
- aarch64_insn_patch_text_nosync(addr, insn);
- else
- aarch64_insn_patch_text(&addr, &insn, 1);
-}
-
-void arch_jump_label_transform(struct jump_entry *entry,
- enum jump_label_type type)
-{
- __arch_jump_label_transform(entry, type, false);
+ aarch64_insn_patch_text(&addr, &insn, 1);
}
void arch_jump_label_transform_static(struct jump_entry *entry,
enum jump_label_type type)
{
- __arch_jump_label_transform(entry, type, true);
+ /*
+ * We use the architected A64 NOP in arch_static_branch, so there's no
+ * need to patch an identical A64 NOP over the top of it here. The core
+ * will call arch_jump_label_transform from a module notifier if the
+ * NOP needs to be replaced by a branch.
+ */
}
#endif /* HAVE_JUMP_LABEL */
diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
index 51128018b907..ca450c42b83a 100644
--- a/arch/arm64/kernel/module.c
+++ b/arch/arm64/kernel/module.c
@@ -27,6 +27,7 @@
#include <linux/moduleloader.h>
#include <linux/vmalloc.h>
#include <asm/insn.h>
+#include <asm/sections.h>
#define AARCH64_INSN_IMM_MOVNZ AARCH64_INSN_IMM_MAX
#define AARCH64_INSN_IMM_MOVK AARCH64_INSN_IMM_16
@@ -406,3 +407,20 @@ overflow:
me->name, (int)ELF64_R_TYPE(rel[i].r_info), val);
return -ENOEXEC;
}
+
+int module_finalize(const Elf_Ehdr *hdr,
+ const Elf_Shdr *sechdrs,
+ struct module *me)
+{
+ const Elf_Shdr *s, *se;
+ const char *secstrs = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
+
+ for (s = sechdrs, se = sechdrs + hdr->e_shnum; s < se; s++) {
+ if (strcmp(".altinstructions", secstrs + s->sh_name) == 0) {
+ apply_alternatives((void *)s->sh_addr, s->sh_size);
+ return 0;
+ }
+ }
+
+ return 0;
+}
diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
index 78a5894b1621..dfbd178f198e 100644
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -1332,9 +1332,9 @@ static void __init cpu_pmu_init(struct arm_pmu *armpmu)
static int __init init_hw_perf_events(void)
{
- u64 dfr = read_cpuid(ID_AA64DFR0_EL1);
+ u64 dfr = read_system_reg(SYS_ID_AA64DFR0_EL1);
- switch ((dfr >> 8) & 0xf) {
+ switch (cpuid_feature_extract_field(dfr, ID_AA64DFR0_PMUVER_SHIFT)) {
case 0x1: /* PMUv3 */
cpu_pmu = armv8_pmuv3_pmu_init();
break;
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index fde9923af859..365dc554594b 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -44,6 +44,7 @@
#include <linux/personality.h>
#include <linux/notifier.h>
+#include <asm/alternative.h>
#include <asm/compat.h>
#include <asm/cacheflush.h>
#include <asm/fpsimd.h>
@@ -57,14 +58,6 @@ unsigned long __stack_chk_guard __read_mostly;
EXPORT_SYMBOL(__stack_chk_guard);
#endif
-void soft_restart(unsigned long addr)
-{
- setup_mm_for_reboot();
- cpu_soft_restart(virt_to_phys(cpu_reset), addr);
- /* Should never get here */
- BUG();
-}
-
/*
* Function pointers to optional machine specific functions
*/
@@ -135,9 +128,7 @@ void machine_power_off(void)
/*
* Restart requires that the secondary CPUs stop performing any activity
- * while the primary CPU resets the system. Systems with a single CPU can
- * use soft_restart() as their machine descriptor's .restart hook, since that
- * will cause the only available CPU to reset. Systems with multiple CPUs must
+ * while the primary CPU resets the system. Systems with multiple CPUs must
* provide a HW restart implementation, to ensure that all CPUs reset at once.
* This is required so that any code running after reset on the primary CPU
* doesn't have to co-ordinate with other CPUs to ensure they aren't still
@@ -163,6 +154,70 @@ void machine_restart(char *cmd)
while (1);
}
+/*
+ * dump a block of kernel memory from around the given address
+ */
+static void show_data(unsigned long addr, int nbytes, const char *name)
+{
+ int i, j;
+ int nlines;
+ u32 *p;
+
+ /*
+ * don't attempt to dump non-kernel addresses or
+ * values that are probably just small negative numbers
+ */
+ if (addr < PAGE_OFFSET || addr > -256UL)
+ return;
+
+ printk("\n%s: %#lx:\n", name, addr);
+
+ /*
+ * round address down to a 32 bit boundary
+ * and always dump a multiple of 32 bytes
+ */
+ p = (u32 *)(addr & ~(sizeof(u32) - 1));
+ nbytes += (addr & (sizeof(u32) - 1));
+ nlines = (nbytes + 31) / 32;
+
+
+ for (i = 0; i < nlines; i++) {
+ /*
+ * just display low 16 bits of address to keep
+ * each line of the dump < 80 characters
+ */
+ printk("%04lx ", (unsigned long)p & 0xffff);
+ for (j = 0; j < 8; j++) {
+ u32 data;
+ if (probe_kernel_address(p, data)) {
+ printk(" ********");
+ } else {
+ printk(" %08x", data);
+ }
+ ++p;
+ }
+ printk("\n");
+ }
+}
+
+static void show_extra_register_data(struct pt_regs *regs, int nbytes)
+{
+ mm_segment_t fs;
+ unsigned int i;
+
+ fs = get_fs();
+ set_fs(KERNEL_DS);
+ show_data(regs->pc - nbytes, nbytes * 2, "PC");
+ show_data(regs->regs[30] - nbytes, nbytes * 2, "LR");
+ show_data(regs->sp - nbytes, nbytes * 2, "SP");
+ for (i = 0; i < 30; i++) {
+ char name[4];
+ snprintf(name, sizeof(name), "X%u", i);
+ show_data(regs->regs[i] - nbytes, nbytes * 2, name);
+ }
+ set_fs(fs);
+}
+
void __show_regs(struct pt_regs *regs)
{
int i, top_reg;
@@ -189,6 +244,8 @@ void __show_regs(struct pt_regs *regs)
if (i % 2 == 0)
printk("\n");
}
+ if (!user_mode(regs))
+ show_extra_register_data(regs, 128);
printk("\n");
}
@@ -235,7 +292,8 @@ void release_thread(struct task_struct *dead_task)
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
- fpsimd_preserve_current_state();
+ if (current->mm)
+ fpsimd_preserve_current_state();
*dst = *src;
return 0;
}
@@ -278,6 +336,9 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start,
} else {
memset(childregs, 0, sizeof(struct pt_regs));
childregs->pstate = PSR_MODE_EL1h;
+ if (IS_ENABLED(CONFIG_ARM64_UAO) &&
+ cpus_have_cap(ARM64_HAS_UAO))
+ childregs->pstate |= PSR_UAO_BIT;
p->thread.cpu_context.x19 = stack_start;
p->thread.cpu_context.x20 = stk_sz;
}
@@ -313,6 +374,17 @@ static void tls_thread_switch(struct task_struct *next)
: : "r" (tpidr), "r" (tpidrro));
}
+/* Restore the UAO state depending on next's addr_limit */
+static void uao_thread_switch(struct task_struct *next)
+{
+ if (IS_ENABLED(CONFIG_ARM64_UAO)) {
+ if (task_thread_info(next)->addr_limit == KERNEL_DS)
+ asm(ALTERNATIVE("nop", SET_PSTATE_UAO(1), ARM64_HAS_UAO));
+ else
+ asm(ALTERNATIVE("nop", SET_PSTATE_UAO(0), ARM64_HAS_UAO));
+ }
+}
+
/*
* Thread switching.
*/
@@ -325,6 +397,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
tls_thread_switch(next);
hw_breakpoint_thread_switch(next);
contextidr_thread_switch(next);
+ uao_thread_switch(next);
/*
* Complete any pending TLB or cache maintenance on this CPU in case
diff --git a/arch/arm64/kernel/psci.c b/arch/arm64/kernel/psci.c
index 81c081eaca42..7a57cf5c3441 100644
--- a/arch/arm64/kernel/psci.c
+++ b/arch/arm64/kernel/psci.c
@@ -396,8 +396,6 @@ int __init psci_init(void)
return init_fn(np);
}
-#ifdef CONFIG_SMP
-
static int __init cpu_psci_cpu_init(struct device_node *dn, unsigned int cpu)
{
return 0;
@@ -476,7 +474,6 @@ static int cpu_psci_cpu_kill(unsigned int cpu)
return 0;
}
#endif
-#endif
static int psci_suspend_finisher(unsigned long index)
{
@@ -511,7 +508,6 @@ const struct cpu_operations cpu_psci_ops = {
.cpu_init_idle = cpu_psci_cpu_init_idle,
.cpu_suspend = cpu_psci_cpu_suspend,
#endif
-#ifdef CONFIG_SMP
.cpu_init = cpu_psci_cpu_init,
.cpu_prepare = cpu_psci_cpu_prepare,
.cpu_boot = cpu_psci_cpu_boot,
@@ -520,6 +516,5 @@ const struct cpu_operations cpu_psci_ops = {
.cpu_die = cpu_psci_cpu_die,
.cpu_kill = cpu_psci_cpu_kill,
#endif
-#endif
};
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 8a4ae8e73213..95fcbd53802d 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -27,6 +27,7 @@
#include <linux/smp.h>
#include <linux/ptrace.h>
#include <linux/user.h>
+#include <linux/seccomp.h>
#include <linux/security.h>
#include <linux/init.h>
#include <linux/signal.h>
@@ -219,13 +220,13 @@ static int ptrace_hbp_fill_attr_ctrl(unsigned int note_type,
struct arch_hw_breakpoint_ctrl ctrl,
struct perf_event_attr *attr)
{
- int err, len, type, disabled = !ctrl.enabled;
+ int err, len, type, offset, disabled = !ctrl.enabled;
attr->disabled = disabled;
if (disabled)
return 0;
- err = arch_bp_generic_fields(ctrl, &len, &type);
+ err = arch_bp_generic_fields(ctrl, &len, &type, &offset);
if (err)
return err;
@@ -244,6 +245,7 @@ static int ptrace_hbp_fill_attr_ctrl(unsigned int note_type,
attr->bp_len = len;
attr->bp_type = type;
+ attr->bp_addr += offset;
return 0;
}
@@ -296,7 +298,7 @@ static int ptrace_hbp_get_addr(unsigned int note_type,
if (IS_ERR(bp))
return PTR_ERR(bp);
- *addr = bp ? bp->attr.bp_addr : 0;
+ *addr = bp ? counter_arch_bp(bp)->address : 0;
return 0;
}
@@ -551,6 +553,32 @@ static int tls_set(struct task_struct *target, const struct user_regset *regset,
return ret;
}
+static int system_call_get(struct task_struct *target,
+ const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ void *kbuf, void __user *ubuf)
+{
+ int syscallno = task_pt_regs(target)->syscallno;
+
+ return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+ &syscallno, 0, -1);
+}
+
+static int system_call_set(struct task_struct *target,
+ const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ const void *kbuf, const void __user *ubuf)
+{
+ int syscallno, ret;
+
+ ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &syscallno, 0, -1);
+ if (ret)
+ return ret;
+
+ task_pt_regs(target)->syscallno = syscallno;
+ return ret;
+}
+
enum aarch64_regset {
REGSET_GPR,
REGSET_FPR,
@@ -559,6 +587,7 @@ enum aarch64_regset {
REGSET_HW_BREAK,
REGSET_HW_WATCH,
#endif
+ REGSET_SYSTEM_CALL,
};
static const struct user_regset aarch64_regsets[] = {
@@ -608,6 +637,14 @@ static const struct user_regset aarch64_regsets[] = {
.set = hw_break_set,
},
#endif
+ [REGSET_SYSTEM_CALL] = {
+ .core_note_type = NT_ARM_SYSTEM_CALL,
+ .n = 1,
+ .size = sizeof(int),
+ .align = sizeof(int),
+ .get = system_call_get,
+ .set = system_call_set,
+ },
};
static const struct user_regset_view user_aarch64_view = {
@@ -1114,6 +1151,10 @@ static void tracehook_report_syscall(struct pt_regs *regs,
asmlinkage int syscall_trace_enter(struct pt_regs *regs)
{
+ /* Do the secure computing check first; failures should be fast. */
+ if (secure_computing() == -1)
+ return -1;
+
if (test_thread_flag(TIF_SYSCALL_TRACE))
tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER);
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index 3843e546ab3d..c31d8f706cb5 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -27,7 +27,6 @@
#include <linux/console.h>
#include <linux/cache.h>
#include <linux/bootmem.h>
-#include <linux/seq_file.h>
#include <linux/screen_info.h>
#include <linux/init.h>
#include <linux/kexec.h>
@@ -43,13 +42,11 @@
#include <linux/of_fdt.h>
#include <linux/of_platform.h>
#include <linux/efi.h>
-#include <linux/personality.h>
#include <asm/fixmap.h>
#include <asm/cpu.h>
#include <asm/cputype.h>
#include <asm/elf.h>
-#include <asm/cputable.h>
#include <asm/cpufeature.h>
#include <asm/cpu_ops.h>
#include <asm/kasan.h>
@@ -63,27 +60,6 @@
#include <asm/psci.h>
#include <asm/efi.h>
-unsigned int processor_id;
-EXPORT_SYMBOL(processor_id);
-
-unsigned long elf_hwcap __read_mostly;
-EXPORT_SYMBOL_GPL(elf_hwcap);
-
-#ifdef CONFIG_COMPAT
-#define COMPAT_ELF_HWCAP_DEFAULT \
- (COMPAT_HWCAP_HALF|COMPAT_HWCAP_THUMB|\
- COMPAT_HWCAP_FAST_MULT|COMPAT_HWCAP_EDSP|\
- COMPAT_HWCAP_TLS|COMPAT_HWCAP_VFP|\
- COMPAT_HWCAP_VFPv3|COMPAT_HWCAP_VFPv4|\
- COMPAT_HWCAP_NEON|COMPAT_HWCAP_IDIV|\
- COMPAT_HWCAP_LPAE)
-unsigned int compat_elf_hwcap __read_mostly = COMPAT_ELF_HWCAP_DEFAULT;
-unsigned int compat_elf_hwcap2 __read_mostly;
-#endif
-
-DECLARE_BITMAP(cpu_hwcaps, ARM64_NCAPS);
-
-static const char *cpu_name;
phys_addr_t __fdt_pointer __initdata;
/*
@@ -119,6 +95,11 @@ void __init early_print(const char *str, ...)
printk("%s", buf);
}
+/*
+ * The recorded values of x0 .. x3 upon kernel entry.
+ */
+u64 __cacheline_aligned boot_args[4];
+
void __init smp_setup_processor_id(void)
{
/*
@@ -135,7 +116,6 @@ bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
}
struct mpidr_hash mpidr_hash;
-#ifdef CONFIG_SMP
/**
* smp_build_mpidr_hash - Pre-compute shifts required at each affinity
* level in order to build a linear index from an
@@ -201,107 +181,6 @@ static void __init smp_build_mpidr_hash(void)
pr_warn("Large number of MPIDR hash buckets detected\n");
__flush_dcache_area(&mpidr_hash, sizeof(struct mpidr_hash));
}
-#endif
-
-static void __init setup_processor(void)
-{
- struct cpu_info *cpu_info;
- u64 features, block;
- u32 cwg;
- int cls;
-
- cpu_info = lookup_processor_type(read_cpuid_id());
- if (!cpu_info) {
- printk("CPU configuration botched (ID %08x), unable to continue.\n",
- read_cpuid_id());
- while (1);
- }
-
- cpu_name = cpu_info->cpu_name;
-
- printk("CPU: %s [%08x] revision %d\n",
- cpu_name, read_cpuid_id(), read_cpuid_id() & 15);
-
- sprintf(init_utsname()->machine, ELF_PLATFORM);
- elf_hwcap = 0;
-
- cpuinfo_store_boot_cpu();
-
- /*
- * Check for sane CTR_EL0.CWG value.
- */
- cwg = cache_type_cwg();
- cls = cache_line_size();
- if (!cwg)
- pr_warn("No Cache Writeback Granule information, assuming cache line size %d\n",
- cls);
- if (L1_CACHE_BYTES < cls)
- pr_warn("L1_CACHE_BYTES smaller than the Cache Writeback Granule (%d < %d)\n",
- L1_CACHE_BYTES, cls);
-
- /*
- * ID_AA64ISAR0_EL1 contains 4-bit wide signed feature blocks.
- * The blocks we test below represent incremental functionality
- * for non-negative values. Negative values are reserved.
- */
- features = read_cpuid(ID_AA64ISAR0_EL1);
- block = (features >> 4) & 0xf;
- if (!(block & 0x8)) {
- switch (block) {
- default:
- case 2:
- elf_hwcap |= HWCAP_PMULL;
- case 1:
- elf_hwcap |= HWCAP_AES;
- case 0:
- break;
- }
- }
-
- block = (features >> 8) & 0xf;
- if (block && !(block & 0x8))
- elf_hwcap |= HWCAP_SHA1;
-
- block = (features >> 12) & 0xf;
- if (block && !(block & 0x8))
- elf_hwcap |= HWCAP_SHA2;
-
- block = (features >> 16) & 0xf;
- if (block && !(block & 0x8))
- elf_hwcap |= HWCAP_CRC32;
-
-#ifdef CONFIG_COMPAT
- /*
- * ID_ISAR5_EL1 carries similar information as above, but pertaining to
- * the Aarch32 32-bit execution state.
- */
- features = read_cpuid(ID_ISAR5_EL1);
- block = (features >> 4) & 0xf;
- if (!(block & 0x8)) {
- switch (block) {
- default:
- case 2:
- compat_elf_hwcap2 |= COMPAT_HWCAP2_PMULL;
- case 1:
- compat_elf_hwcap2 |= COMPAT_HWCAP2_AES;
- case 0:
- break;
- }
- }
-
- block = (features >> 8) & 0xf;
- if (block && !(block & 0x8))
- compat_elf_hwcap2 |= COMPAT_HWCAP2_SHA1;
-
- block = (features >> 12) & 0xf;
- if (block && !(block & 0x8))
- compat_elf_hwcap2 |= COMPAT_HWCAP2_SHA2;
-
- block = (features >> 16) & 0xf;
- if (block && !(block & 0x8))
- compat_elf_hwcap2 |= COMPAT_HWCAP2_CRC32;
-#endif
-}
static void __init setup_machine_fdt(phys_addr_t dt_phys)
{
@@ -315,6 +194,8 @@ static void __init setup_machine_fdt(phys_addr_t dt_phys)
while (true)
cpu_relax();
}
+
+ dump_stack_set_arch_desc("%s (DT)", of_flat_dt_get_machine_name());
}
static void __init request_standard_resources(void)
@@ -323,7 +204,7 @@ static void __init request_standard_resources(void)
struct resource *res;
kernel_code.start = virt_to_phys(_text);
- kernel_code.end = virt_to_phys(_etext - 1);
+ kernel_code.end = virt_to_phys(__init_begin - 1);
kernel_data.start = virt_to_phys(_sdata);
kernel_data.end = virt_to_phys(_end - 1);
@@ -412,10 +293,11 @@ u64 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID };
void __init setup_arch(char **cmdline_p)
{
- setup_processor();
+ pr_info("Boot CPU: AArch64 Processor [%08x]\n", read_cpuid_id());
setup_machine_fdt(__fdt_pointer);
+ sprintf(init_utsname()->machine, ELF_PLATFORM);
init_mm.start_code = (unsigned long) _text;
init_mm.end_code = (unsigned long) _etext;
init_mm.end_data = (unsigned long) _edata;
@@ -423,6 +305,7 @@ void __init setup_arch(char **cmdline_p)
*cmdline_p = boot_command_line;
+ early_fixmap_init();
early_ioremap_init();
parse_early_param();
@@ -443,7 +326,6 @@ void __init setup_arch(char **cmdline_p)
request_standard_resources();
- efi_idmap_init();
early_ioremap_reset();
unflatten_device_tree();
@@ -452,9 +334,16 @@ void __init setup_arch(char **cmdline_p)
cpu_logical_map(0) = read_cpuid_mpidr() & MPIDR_HWID_BITMASK;
cpu_read_bootcpu_ops();
-#ifdef CONFIG_SMP
smp_init_cpus();
smp_build_mpidr_hash();
+
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+ /*
+ * Make sure init_thread_info.ttbr0 always generates translation
+ * faults in case uaccess_enable() is inadvertently called by the init
+ * thread.
+ */
+ init_thread_info.ttbr0 = virt_to_phys(empty_zero_page);
#endif
#ifdef CONFIG_VT
@@ -464,6 +353,12 @@ void __init setup_arch(char **cmdline_p)
conswitchp = &dummy_con;
#endif
#endif
+ if (boot_args[1] || boot_args[2] || boot_args[3]) {
+ pr_err("WARNING: x1-x3 nonzero in violation of boot protocol:\n"
+ "\tx1: %016llx\n\tx2: %016llx\n\tx3: %016llx\n"
+ "This indicates a broken bootloader or old kernel\n",
+ boot_args[1], boot_args[2], boot_args[3]);
+ }
}
static int __init arm64_device_init(void)
@@ -486,130 +381,3 @@ static int __init topology_init(void)
return 0;
}
subsys_initcall(topology_init);
-
-static const char *hwcap_str[] = {
- "fp",
- "asimd",
- "evtstrm",
- "aes",
- "pmull",
- "sha1",
- "sha2",
- "crc32",
- NULL
-};
-
-#ifdef CONFIG_COMPAT
-static const char *compat_hwcap_str[] = {
- "swp",
- "half",
- "thumb",
- "26bit",
- "fastmult",
- "fpa",
- "vfp",
- "edsp",
- "java",
- "iwmmxt",
- "crunch",
- "thumbee",
- "neon",
- "vfpv3",
- "vfpv3d16",
- "tls",
- "vfpv4",
- "idiva",
- "idivt",
- "vfpd32",
- "lpae",
- "evtstrm",
- NULL
-};
-
-static const char *compat_hwcap2_str[] = {
- "aes",
- "pmull",
- "sha1",
- "sha2",
- "crc32",
- NULL
-};
-#endif /* CONFIG_COMPAT */
-
-static int c_show(struct seq_file *m, void *v)
-{
- int i, j;
-
- for_each_online_cpu(i) {
- struct cpuinfo_arm64 *cpuinfo = &per_cpu(cpu_data, i);
- u32 midr = cpuinfo->reg_midr;
-
- /*
- * glibc reads /proc/cpuinfo to determine the number of
- * online processors, looking for lines beginning with
- * "processor". Give glibc what it expects.
- */
-#ifdef CONFIG_SMP
- seq_printf(m, "processor\t: %d\n", i);
-#endif
-
- seq_printf(m, "BogoMIPS\t: %lu.%02lu\n",
- loops_per_jiffy / (500000UL/HZ),
- loops_per_jiffy / (5000UL/HZ) % 100);
-
- /*
- * Dump out the common processor features in a single line.
- * Userspace should read the hwcaps with getauxval(AT_HWCAP)
- * rather than attempting to parse this, but there's a body of
- * software which does already (at least for 32-bit).
- */
- seq_puts(m, "Features\t:");
- if (personality(current->personality) == PER_LINUX32) {
-#ifdef CONFIG_COMPAT
- for (j = 0; compat_hwcap_str[j]; j++)
- if (compat_elf_hwcap & (1 << j))
- seq_printf(m, " %s", compat_hwcap_str[j]);
-
- for (j = 0; compat_hwcap2_str[j]; j++)
- if (compat_elf_hwcap2 & (1 << j))
- seq_printf(m, " %s", compat_hwcap2_str[j]);
-#endif /* CONFIG_COMPAT */
- } else {
- for (j = 0; hwcap_str[j]; j++)
- if (elf_hwcap & (1 << j))
- seq_printf(m, " %s", hwcap_str[j]);
- }
- seq_puts(m, "\n");
-
- seq_printf(m, "CPU implementer\t: 0x%02x\n",
- MIDR_IMPLEMENTOR(midr));
- seq_printf(m, "CPU architecture: 8\n");
- seq_printf(m, "CPU variant\t: 0x%x\n", MIDR_VARIANT(midr));
- seq_printf(m, "CPU part\t: 0x%03x\n", MIDR_PARTNUM(midr));
- seq_printf(m, "CPU revision\t: %d\n\n", MIDR_REVISION(midr));
- }
-
- return 0;
-}
-
-static void *c_start(struct seq_file *m, loff_t *pos)
-{
- return *pos < 1 ? (void *)1 : NULL;
-}
-
-static void *c_next(struct seq_file *m, void *v, loff_t *pos)
-{
- ++*pos;
- return NULL;
-}
-
-static void c_stop(struct seq_file *m, void *v)
-{
-}
-
-const struct seq_operations cpuinfo_op = {
- .start = c_start,
- .next = c_next,
- .stop = c_stop,
- .show = c_show
-};
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 6fa792137eda..660ccf9f7524 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -131,7 +131,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
struct rt_sigframe __user *frame;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
/*
* Since we stacked the signal on a 128-bit boundary, then 'sp' should
diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c
index b6da20fa7a48..c58aee062590 100644
--- a/arch/arm64/kernel/signal32.c
+++ b/arch/arm64/kernel/signal32.c
@@ -186,6 +186,12 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
err |= __put_user(from->si_uid, &to->si_uid);
err |= __put_user(from->si_int, &to->si_int);
break;
+ case __SI_SYS:
+ err |= __put_user((compat_uptr_t)(unsigned long)
+ from->si_call_addr, &to->si_call_addr);
+ err |= __put_user(from->si_syscall, &to->si_syscall);
+ err |= __put_user(from->si_arch, &to->si_arch);
+ break;
default: /* this is just in case for now ... */
err |= __put_user(from->si_pid, &to->si_pid);
err |= __put_user(from->si_uid, &to->si_uid);
@@ -364,7 +370,7 @@ asmlinkage int compat_sys_sigreturn(struct pt_regs *regs)
struct compat_sigframe __user *frame;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
/*
* Since we stacked the signal on a 64-bit boundary,
@@ -398,7 +404,7 @@ asmlinkage int compat_sys_rt_sigreturn(struct pt_regs *regs)
struct compat_rt_sigframe __user *frame;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
/*
* Since we stacked the signal on a 64-bit boundary,
@@ -521,7 +527,7 @@ static int compat_setup_sigframe(struct compat_sigframe __user *sf,
__put_user_error((compat_ulong_t)0, &sf->uc.uc_mcontext.trap_no, err);
/* set the compat FSR WnR */
- __put_user_error(!!(current->thread.fault_code & ESR_EL1_WRITE) <<
+ __put_user_error(!!(current->thread.fault_code & ESR_ELx_WNR) <<
FSR_WRITE_SHIFT, &sf->uc.uc_mcontext.error_code, err);
__put_user_error(current->thread.fault_address, &sf->uc.uc_mcontext.fault_address, err);
__put_user_error(set->sig[0], &sf->uc.uc_mcontext.oldmask, err);
diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S
index ede186cdd452..3576b1590ebd 100644
--- a/arch/arm64/kernel/sleep.S
+++ b/arch/arm64/kernel/sleep.S
@@ -82,7 +82,6 @@ ENTRY(__cpu_suspend_enter)
str x2, [x0, #CPU_CTX_SP]
ldr x1, =sleep_save_sp
ldr x1, [x1, #SLEEP_SAVE_SP_VIRT]
-#ifdef CONFIG_SMP
mrs x7, mpidr_el1
ldr x9, =mpidr_hash
ldr x10, [x9, #MPIDR_HASH_MASK]
@@ -94,7 +93,6 @@ ENTRY(__cpu_suspend_enter)
ldp w5, w6, [x9, #(MPIDR_HASH_SHIFTS + 8)]
compute_mpidr_hash x8, x3, x4, x5, x6, x7, x10
add x1, x1, x8, lsl #3
-#endif
bl __cpu_suspend_save
/*
* Grab suspend finisher in x20 and its argument in x19
@@ -134,6 +132,14 @@ ENTRY(cpu_resume_mmu)
ldr x3, =cpu_resume_after_mmu
msr sctlr_el1, x0 // restore sctlr_el1
isb
+ /*
+ * Invalidate the local I-cache so that any instructions fetched
+ * speculatively from the PoC are discarded, since they may have
+ * been dynamically patched at the PoU.
+ */
+ ic iallu
+ dsb nsh
+ isb
br x3 // global jump to virtual address
ENDPROC(cpu_resume_mmu)
cpu_resume_after_mmu:
@@ -149,7 +155,6 @@ ENDPROC(cpu_resume_after_mmu)
ENTRY(cpu_resume)
bl el2_setup // if in EL2 drop to EL1 cleanly
-#ifdef CONFIG_SMP
mrs x1, mpidr_el1
adrp x8, mpidr_hash
add x8, x8, #:lo12:mpidr_hash // x8 = struct mpidr_hash phys address
@@ -159,9 +164,6 @@ ENTRY(cpu_resume)
ldp w5, w6, [x8, #(MPIDR_HASH_SHIFTS + 8)]
compute_mpidr_hash x7, x3, x4, x5, x6, x1, x2
/* x7 contains hash index, let's use it to grab context pointer */
-#else
- mov x7, xzr
-#endif
adrp x0, sleep_save_sp
add x0, x0, #:lo12:sleep_save_sp
ldr x0, [x0, #SLEEP_SAVE_SP_PHYS]
@@ -172,6 +174,9 @@ ENTRY(cpu_resume)
/* load physical address of identity map page table in x1 */
ldr x1, [x1, #:lo12:sleep_idmap_phys]
mov sp, x2
+ /* save thread_info */
+ and x2, x2, #~(THREAD_SIZE - 1)
+ msr sp_el0, x2
/*
* cpu_do_resume expects x0 to contain context physical address
* pointer and x1 to contain physical address of 1:1 page tables
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index a1f054549cb6..0d4cf1fbc680 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -141,21 +141,27 @@ asmlinkage void secondary_start_kernel(void)
*/
atomic_inc(&mm->mm_count);
current->active_mm = mm;
- cpumask_set_cpu(cpu, mm_cpumask(mm));
set_my_cpu_offset(per_cpu_offset(smp_processor_id()));
- printk("CPU%u: Booted secondary processor\n", cpu);
/*
* TTBR0 is only used for the identity mapping at this stage. Make it
* point to zero page to avoid speculatively fetching new entries.
*/
cpu_set_reserved_ttbr0();
- flush_tlb_all();
+ local_flush_tlb_all();
+ cpu_set_default_tcr_t0sz();
preempt_disable();
trace_hardirqs_off();
+ /*
+ * If the system has established the capabilities, make sure
+ * this CPU ticks all of those. If it doesn't, the CPU will
+ * fail to come online.
+ */
+ verify_local_cpu_capabilities();
+
if (cpu_ops[cpu]->cpu_postboot)
cpu_ops[cpu]->cpu_postboot();
@@ -176,6 +182,8 @@ asmlinkage void secondary_start_kernel(void)
* the CPU migration code to notice that the CPU is online
* before we continue.
*/
+ pr_info("CPU%u: Booted secondary processor [%08x]\n",
+ cpu, read_cpuid_id());
set_cpu_online(cpu, true);
complete(&cpu_running);
@@ -230,12 +238,6 @@ int __cpu_disable(void)
* OK - migrate IRQs away from this CPU
*/
migrate_irqs();
-
- /*
- * Remove this CPU from the vm mask set of all processes.
- */
- clear_tasks_mm_cpumask(cpu);
-
return 0;
}
@@ -309,11 +311,13 @@ void cpu_die(void)
void __init smp_cpus_done(unsigned int max_cpus)
{
pr_info("SMP: Total of %d processors activated.\n", num_online_cpus());
- apply_alternatives();
+ setup_cpu_features();
+ apply_alternatives_all();
}
void __init smp_prepare_boot_cpu(void)
{
+ cpuinfo_store_boot_cpu();
set_my_cpu_offset(per_cpu_offset(smp_processor_id()));
}
diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c
index 23e35de789f7..ab14fee1f255 100644
--- a/arch/arm64/kernel/suspend.c
+++ b/arch/arm64/kernel/suspend.c
@@ -119,7 +119,7 @@ int __cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
else
cpu_switch_mm(mm->pgd, mm);
- flush_tlb_all();
+ local_flush_tlb_all();
/*
* Restore per-cpu offset before any kernel
diff --git a/arch/arm64/kernel/time.c b/arch/arm64/kernel/time.c
index 1a7125c3099b..59630e4748f7 100644
--- a/arch/arm64/kernel/time.c
+++ b/arch/arm64/kernel/time.c
@@ -41,7 +41,6 @@
#include <asm/thread_info.h>
#include <asm/stacktrace.h>
-#ifdef CONFIG_SMP
unsigned long profile_pc(struct pt_regs *regs)
{
struct stackframe frame;
@@ -61,7 +60,6 @@ unsigned long profile_pc(struct pt_regs *regs)
return frame.pc;
}
EXPORT_SYMBOL(profile_pc);
-#endif
void __init time_init(void)
{
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index b6ee26b0939a..2f98601318c3 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -19,10 +19,30 @@
#include <linux/nodemask.h>
#include <linux/of.h>
#include <linux/sched.h>
+#include <linux/sched.h>
+#include <linux/sched_energy.h>
#include <asm/cputype.h>
#include <asm/topology.h>
+static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;
+
+unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu)
+{
+#ifdef CONFIG_CPU_FREQ
+ unsigned long max_freq_scale = cpufreq_scale_max_freq_capacity(cpu);
+
+ return per_cpu(cpu_scale, cpu) * max_freq_scale >> SCHED_CAPACITY_SHIFT;
+#else
+ return per_cpu(cpu_scale, cpu);
+#endif
+}
+
+static void set_capacity_scale(unsigned int cpu, unsigned long capacity)
+{
+ per_cpu(cpu_scale, cpu) = capacity;
+}
+
static int __init get_cpu_for_node(struct device_node *node)
{
struct device_node *cpu_node;
@@ -206,11 +226,67 @@ out:
struct cpu_topology cpu_topology[NR_CPUS];
EXPORT_SYMBOL_GPL(cpu_topology);
+/* sd energy functions */
+static inline
+const struct sched_group_energy * const cpu_cluster_energy(int cpu)
+{
+ struct sched_group_energy *sge = sge_array[cpu][SD_LEVEL1];
+
+ if (!sge) {
+ pr_warn("Invalid sched_group_energy for Cluster%d\n", cpu);
+ return NULL;
+ }
+
+ return sge;
+}
+
+static inline
+const struct sched_group_energy * const cpu_core_energy(int cpu)
+{
+ struct sched_group_energy *sge = sge_array[cpu][SD_LEVEL0];
+
+ if (!sge) {
+ pr_warn("Invalid sched_group_energy for CPU%d\n", cpu);
+ return NULL;
+ }
+
+ return sge;
+}
+
const struct cpumask *cpu_coregroup_mask(int cpu)
{
return &cpu_topology[cpu].core_sibling;
}
+static inline int cpu_corepower_flags(void)
+{
+ return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN | \
+ SD_SHARE_CAP_STATES;
+}
+
+static struct sched_domain_topology_level arm64_topology[] = {
+#ifdef CONFIG_SCHED_MC
+ { cpu_coregroup_mask, cpu_corepower_flags, cpu_core_energy, SD_INIT_NAME(MC) },
+#endif
+ { cpu_cpu_mask, NULL, cpu_cluster_energy, SD_INIT_NAME(DIE) },
+ { NULL, },
+};
+
+static void update_cpu_capacity(unsigned int cpu)
+{
+ unsigned long capacity = SCHED_CAPACITY_SCALE;
+
+ if (cpu_core_energy(cpu)) {
+ int max_cap_idx = cpu_core_energy(cpu)->nr_cap_states - 1;
+ capacity = cpu_core_energy(cpu)->cap_states[max_cap_idx].cap;
+ }
+
+ set_capacity_scale(cpu, capacity);
+
+ pr_info("CPU%d: update cpu_capacity %lu\n",
+ cpu, arch_scale_cpu_capacity(NULL, cpu));
+}
+
static void update_siblings_masks(unsigned int cpuid)
{
struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid];
@@ -269,6 +345,7 @@ void store_cpu_topology(unsigned int cpuid)
topology_populated:
update_siblings_masks(cpuid);
+ update_cpu_capacity(cpuid);
}
static void __init reset_cpu_topology(void)
@@ -299,4 +376,8 @@ void __init init_cpu_topology(void)
*/
if (parse_dt_topology())
reset_cpu_topology();
+ else
+ set_sched_topology(arm64_topology);
+
+ init_sched_energy_costs();
}
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 4a43f0583045..7a92c65bc3cf 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -33,6 +33,7 @@
#include <asm/atomic.h>
#include <asm/debug-monitors.h>
+#include <asm/esr.h>
#include <asm/traps.h>
#include <asm/stacktrace.h>
#include <asm/exception.h>
@@ -59,8 +60,7 @@ static void dump_mem(const char *lvl, const char *str, unsigned long bottom,
/*
* We need to switch to kernel mode so that we can use __get_user
- * to safely read from kernel space. Note that we now dump the
- * code first, just in case the backtrace kills us.
+ * to safely read from kernel space.
*/
fs = get_fs();
set_fs(KERNEL_DS);
@@ -97,21 +97,12 @@ static void dump_backtrace_entry(unsigned long where, unsigned long stack)
stack + sizeof(struct pt_regs));
}
-static void dump_instr(const char *lvl, struct pt_regs *regs)
+static void __dump_instr(const char *lvl, struct pt_regs *regs)
{
unsigned long addr = instruction_pointer(regs);
- mm_segment_t fs;
char str[sizeof("00000000 ") * 5 + 2 + 1], *p = str;
int i;
- /*
- * We need to switch to kernel mode so that we can use __get_user
- * to safely read from kernel space. Note that we now dump the
- * code first, just in case the backtrace kills us.
- */
- fs = get_fs();
- set_fs(KERNEL_DS);
-
for (i = -4; i < 1; i++) {
unsigned int val, bad;
@@ -125,8 +116,18 @@ static void dump_instr(const char *lvl, struct pt_regs *regs)
}
}
printk("%sCode: %s\n", lvl, str);
+}
- set_fs(fs);
+static void dump_instr(const char *lvl, struct pt_regs *regs)
+{
+ if (!user_mode(regs)) {
+ mm_segment_t fs = get_fs();
+ set_fs(KERNEL_DS);
+ __dump_instr(lvl, regs);
+ set_fs(fs);
+ } else {
+ __dump_instr(lvl, regs);
+ }
}
static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk)
@@ -178,11 +179,7 @@ void show_stack(struct task_struct *tsk, unsigned long *sp)
#else
#define S_PREEMPT ""
#endif
-#ifdef CONFIG_SMP
#define S_SMP " SMP"
-#else
-#define S_SMP ""
-#endif
static int __die(const char *str, int err, struct thread_info *thread,
struct pt_regs *regs)
@@ -373,6 +370,51 @@ asmlinkage long do_ni_syscall(struct pt_regs *regs)
return sys_ni_syscall();
}
+static const char *esr_class_str[] = {
+ [0 ... ESR_ELx_EC_MAX] = "UNRECOGNIZED EC",
+ [ESR_ELx_EC_UNKNOWN] = "Unknown/Uncategorized",
+ [ESR_ELx_EC_WFx] = "WFI/WFE",
+ [ESR_ELx_EC_CP15_32] = "CP15 MCR/MRC",
+ [ESR_ELx_EC_CP15_64] = "CP15 MCRR/MRRC",
+ [ESR_ELx_EC_CP14_MR] = "CP14 MCR/MRC",
+ [ESR_ELx_EC_CP14_LS] = "CP14 LDC/STC",
+ [ESR_ELx_EC_FP_ASIMD] = "ASIMD",
+ [ESR_ELx_EC_CP10_ID] = "CP10 MRC/VMRS",
+ [ESR_ELx_EC_CP14_64] = "CP14 MCRR/MRRC",
+ [ESR_ELx_EC_ILL] = "PSTATE.IL",
+ [ESR_ELx_EC_SVC32] = "SVC (AArch32)",
+ [ESR_ELx_EC_HVC32] = "HVC (AArch32)",
+ [ESR_ELx_EC_SMC32] = "SMC (AArch32)",
+ [ESR_ELx_EC_SVC64] = "SVC (AArch64)",
+ [ESR_ELx_EC_HVC64] = "HVC (AArch64)",
+ [ESR_ELx_EC_SMC64] = "SMC (AArch64)",
+ [ESR_ELx_EC_SYS64] = "MSR/MRS (AArch64)",
+ [ESR_ELx_EC_IMP_DEF] = "EL3 IMP DEF",
+ [ESR_ELx_EC_IABT_LOW] = "IABT (lower EL)",
+ [ESR_ELx_EC_IABT_CUR] = "IABT (current EL)",
+ [ESR_ELx_EC_PC_ALIGN] = "PC Alignment",
+ [ESR_ELx_EC_DABT_LOW] = "DABT (lower EL)",
+ [ESR_ELx_EC_DABT_CUR] = "DABT (current EL)",
+ [ESR_ELx_EC_SP_ALIGN] = "SP Alignment",
+ [ESR_ELx_EC_FP_EXC32] = "FP (AArch32)",
+ [ESR_ELx_EC_FP_EXC64] = "FP (AArch64)",
+ [ESR_ELx_EC_SERROR] = "SError",
+ [ESR_ELx_EC_BREAKPT_LOW] = "Breakpoint (lower EL)",
+ [ESR_ELx_EC_BREAKPT_CUR] = "Breakpoint (current EL)",
+ [ESR_ELx_EC_SOFTSTP_LOW] = "Software Step (lower EL)",
+ [ESR_ELx_EC_SOFTSTP_CUR] = "Software Step (current EL)",
+ [ESR_ELx_EC_WATCHPT_LOW] = "Watchpoint (lower EL)",
+ [ESR_ELx_EC_WATCHPT_CUR] = "Watchpoint (current EL)",
+ [ESR_ELx_EC_BKPT32] = "BKPT (AArch32)",
+ [ESR_ELx_EC_VECTOR32] = "Vector catch (AArch32)",
+ [ESR_ELx_EC_BRK64] = "BRK (AArch64)",
+};
+
+const char *esr_get_class_string(u32 esr)
+{
+ return esr_class_str[ESR_ELx_EC(esr)];
+}
+
/*
* bad_mode handles the impossible case in the exception vector. This is always
* fatal.
@@ -381,8 +423,8 @@ asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr)
{
console_verbose();
- pr_crit("Bad mode in %s handler detected, code 0x%08x\n",
- handler[reason], esr);
+ pr_crit("Bad mode in %s handler detected, code 0x%08x -- %s\n",
+ handler[reason], esr, esr_get_class_string(esr));
die("Oops - bad mode", regs, 0);
local_irq_disable();
diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c
index 32aeea083d93..71766af43b70 100644
--- a/arch/arm64/kernel/vdso.c
+++ b/arch/arm64/kernel/vdso.c
@@ -55,7 +55,7 @@ struct vdso_data *vdso_data = &vdso_data_store.data;
*/
static struct page *vectors_page[1];
-static int alloc_vectors_page(void)
+static int __init alloc_vectors_page(void)
{
extern char __kuser_helper_start[], __kuser_helper_end[];
extern char __aarch32_sigret_code_start[], __aarch32_sigret_code_end[];
@@ -88,7 +88,7 @@ int aarch32_setup_vectors_page(struct linux_binprm *bprm, int uses_interp)
{
struct mm_struct *mm = current->mm;
unsigned long addr = AARCH32_VECTORS_BASE;
- static struct vm_special_mapping spec = {
+ static const struct vm_special_mapping spec = {
.name = "[vectors]",
.pages = vectors_page,
@@ -200,7 +200,7 @@ up_fail:
void update_vsyscall(struct timekeeper *tk)
{
struct timespec xtime_coarse;
- u32 use_syscall = strcmp(tk->tkr.clock->name, "arch_sys_counter");
+ u32 use_syscall = strcmp(tk->tkr_mono.clock->name, "arch_sys_counter");
++vdso_data->tb_seq_count;
smp_wmb();
@@ -213,11 +213,17 @@ void update_vsyscall(struct timekeeper *tk)
vdso_data->wtm_clock_nsec = tk->wall_to_monotonic.tv_nsec;
if (!use_syscall) {
- vdso_data->cs_cycle_last = tk->tkr.cycle_last;
+ /* tkr_mono.cycle_last == tkr_raw.cycle_last */
+ vdso_data->cs_cycle_last = tk->tkr_mono.cycle_last;
+ vdso_data->raw_time_sec = tk->raw_time.tv_sec;
+ vdso_data->raw_time_nsec = tk->raw_time.tv_nsec;
vdso_data->xtime_clock_sec = tk->xtime_sec;
- vdso_data->xtime_clock_nsec = tk->tkr.xtime_nsec;
- vdso_data->cs_mult = tk->tkr.mult;
- vdso_data->cs_shift = tk->tkr.shift;
+ vdso_data->xtime_clock_nsec = tk->tkr_mono.xtime_nsec;
+ /* tkr_raw.xtime_nsec == 0 */
+ vdso_data->cs_mono_mult = tk->tkr_mono.mult;
+ vdso_data->cs_raw_mult = tk->tkr_raw.mult;
+ /* tkr_mono.shift == tkr_raw.shift */
+ vdso_data->cs_shift = tk->tkr_mono.shift;
}
smp_wmb();
diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile
index b467fd0a384b..62c84f7cb01b 100644
--- a/arch/arm64/kernel/vdso/Makefile
+++ b/arch/arm64/kernel/vdso/Makefile
@@ -23,7 +23,7 @@ GCOV_PROFILE := n
ccflags-y += -Wl,-shared
obj-y += vdso.o
-extra-y += vdso.lds vdso-offsets.h
+extra-y += vdso.lds
CPPFLAGS_vdso.lds += -P -C -U$(ARCH)
# Force dependency (incbin is bad)
@@ -42,11 +42,10 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE
gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh
quiet_cmd_vdsosym = VDSOSYM $@
define cmd_vdsosym
- $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ && \
- cp $@ include/generated/
+ $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@
endef
-$(obj)/vdso-offsets.h: $(obj)/vdso.so.dbg FORCE
+include/generated/vdso-offsets.h: $(obj)/vdso.so.dbg FORCE
$(call if_changed,vdsosym)
# Assembly rules for the .S files
diff --git a/arch/arm64/kernel/vdso/gettimeofday.S b/arch/arm64/kernel/vdso/gettimeofday.S
index fe652ffd34c2..1f8bba27e2f3 100644
--- a/arch/arm64/kernel/vdso/gettimeofday.S
+++ b/arch/arm64/kernel/vdso/gettimeofday.S
@@ -26,24 +26,109 @@
#define NSEC_PER_SEC_HI16 0x3b9a
vdso_data .req x6
-use_syscall .req w7
-seqcnt .req w8
+seqcnt .req w7
+w_tmp .req w8
+x_tmp .req x8
+
+/*
+ * Conventions for macro arguments:
+ * - An argument is write-only if its name starts with "res".
+ * - All other arguments are read-only, unless otherwise specified.
+ */
.macro seqcnt_acquire
9999: ldr seqcnt, [vdso_data, #VDSO_TB_SEQ_COUNT]
tbnz seqcnt, #0, 9999b
dmb ishld
- ldr use_syscall, [vdso_data, #VDSO_USE_SYSCALL]
.endm
- .macro seqcnt_read, cnt
+ .macro seqcnt_check fail
dmb ishld
- ldr \cnt, [vdso_data, #VDSO_TB_SEQ_COUNT]
+ ldr w_tmp, [vdso_data, #VDSO_TB_SEQ_COUNT]
+ cmp w_tmp, seqcnt
+ b.ne \fail
.endm
- .macro seqcnt_check, cnt, fail
- cmp \cnt, seqcnt
- b.ne \fail
+ .macro syscall_check fail
+ ldr w_tmp, [vdso_data, #VDSO_USE_SYSCALL]
+ cbnz w_tmp, \fail
+ .endm
+
+ .macro get_nsec_per_sec res
+ mov \res, #NSEC_PER_SEC_LO16
+ movk \res, #NSEC_PER_SEC_HI16, lsl #16
+ .endm
+
+ /*
+ * Returns the clock delta, in nanoseconds left-shifted by the clock
+ * shift.
+ */
+ .macro get_clock_shifted_nsec res, cycle_last, mult
+ /* Read the virtual counter. */
+ isb
+ mrs x_tmp, cntvct_el0
+ /* Calculate cycle delta and convert to ns. */
+ sub \res, x_tmp, \cycle_last
+ /* We can only guarantee 56 bits of precision. */
+ movn x_tmp, #0xff00, lsl #48
+ and \res, x_tmp, \res
+ mul \res, \res, \mult
+ .endm
+
+ /*
+ * Returns in res_{sec,nsec} the REALTIME timespec, based on the
+ * "wall time" (xtime) and the clock_mono delta.
+ */
+ .macro get_ts_realtime res_sec, res_nsec, \
+ clock_nsec, xtime_sec, xtime_nsec, nsec_to_sec
+ add \res_nsec, \clock_nsec, \xtime_nsec
+ udiv x_tmp, \res_nsec, \nsec_to_sec
+ add \res_sec, \xtime_sec, x_tmp
+ msub \res_nsec, x_tmp, \nsec_to_sec, \res_nsec
+ .endm
+
+ /*
+ * Returns in res_{sec,nsec} the timespec based on the clock_raw delta,
+ * used for CLOCK_MONOTONIC_RAW.
+ */
+ .macro get_ts_clock_raw res_sec, res_nsec, clock_nsec, nsec_to_sec
+ udiv \res_sec, \clock_nsec, \nsec_to_sec
+ msub \res_nsec, \res_sec, \nsec_to_sec, \clock_nsec
+ .endm
+
+ /* sec and nsec are modified in place. */
+ .macro add_ts sec, nsec, ts_sec, ts_nsec, nsec_to_sec
+ /* Add timespec. */
+ add \sec, \sec, \ts_sec
+ add \nsec, \nsec, \ts_nsec
+
+ /* Normalise the new timespec. */
+ cmp \nsec, \nsec_to_sec
+ b.lt 9999f
+ sub \nsec, \nsec, \nsec_to_sec
+ add \sec, \sec, #1
+9999:
+ cmp \nsec, #0
+ b.ge 9998f
+ add \nsec, \nsec, \nsec_to_sec
+ sub \sec, \sec, #1
+9998:
+ .endm
+
+ .macro clock_gettime_return, shift=0
+ .if \shift == 1
+ lsr x11, x11, x12
+ .endif
+ stp x10, x11, [x1, #TSPEC_TV_SEC]
+ mov x0, xzr
+ ret
+ .endm
+
+ .macro jump_slot jumptable, index, label
+ .if (. - \jumptable) != 4 * (\index)
+ .error "Jump slot index mismatch"
+ .endif
+ b \label
.endm
.text
@@ -51,18 +136,25 @@ seqcnt .req w8
/* int __kernel_gettimeofday(struct timeval *tv, struct timezone *tz); */
ENTRY(__kernel_gettimeofday)
.cfi_startproc
- mov x2, x30
- .cfi_register x30, x2
-
- /* Acquire the sequence counter and get the timespec. */
adr vdso_data, _vdso_data
-1: seqcnt_acquire
- cbnz use_syscall, 4f
-
/* If tv is NULL, skip to the timezone code. */
cbz x0, 2f
- bl __do_get_tspec
- seqcnt_check w9, 1b
+
+ /* Compute the time of day. */
+1: seqcnt_acquire
+ syscall_check fail=4f
+ ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST]
+ /* w11 = cs_mono_mult, w12 = cs_shift */
+ ldp w11, w12, [vdso_data, #VDSO_CS_MONO_MULT]
+ ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC]
+ seqcnt_check fail=1b
+
+ get_nsec_per_sec res=x9
+ lsl x9, x9, x12
+
+ get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11
+ get_ts_realtime res_sec=x10, res_nsec=x11, \
+ clock_nsec=x15, xtime_sec=x13, xtime_nsec=x14, nsec_to_sec=x9
/* Convert ns to us. */
mov x13, #1000
@@ -76,95 +168,126 @@ ENTRY(__kernel_gettimeofday)
stp w4, w5, [x1, #TZ_MINWEST]
3:
mov x0, xzr
- ret x2
+ ret
4:
/* Syscall fallback. */
mov x8, #__NR_gettimeofday
svc #0
- ret x2
+ ret
.cfi_endproc
ENDPROC(__kernel_gettimeofday)
+#define JUMPSLOT_MAX CLOCK_MONOTONIC_COARSE
+
/* int __kernel_clock_gettime(clockid_t clock_id, struct timespec *tp); */
ENTRY(__kernel_clock_gettime)
.cfi_startproc
- cmp w0, #CLOCK_REALTIME
- ccmp w0, #CLOCK_MONOTONIC, #0x4, ne
- b.ne 2f
+ cmp w0, #JUMPSLOT_MAX
+ b.hi syscall
+ adr vdso_data, _vdso_data
+ adr x_tmp, jumptable
+ add x_tmp, x_tmp, w0, uxtw #2
+ br x_tmp
+
+ ALIGN
+jumptable:
+ jump_slot jumptable, CLOCK_REALTIME, realtime
+ jump_slot jumptable, CLOCK_MONOTONIC, monotonic
+ b syscall
+ b syscall
+ jump_slot jumptable, CLOCK_MONOTONIC_RAW, monotonic_raw
+ jump_slot jumptable, CLOCK_REALTIME_COARSE, realtime_coarse
+ jump_slot jumptable, CLOCK_MONOTONIC_COARSE, monotonic_coarse
+
+ .if (. - jumptable) != 4 * (JUMPSLOT_MAX + 1)
+ .error "Wrong jumptable size"
+ .endif
+
+ ALIGN
+realtime:
+ seqcnt_acquire
+ syscall_check fail=syscall
+ ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST]
+ /* w11 = cs_mono_mult, w12 = cs_shift */
+ ldp w11, w12, [vdso_data, #VDSO_CS_MONO_MULT]
+ ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC]
+ seqcnt_check fail=realtime
- mov x2, x30
- .cfi_register x30, x2
+ /* All computations are done with left-shifted nsecs. */
+ get_nsec_per_sec res=x9
+ lsl x9, x9, x12
- /* Get kernel timespec. */
- adr vdso_data, _vdso_data
-1: seqcnt_acquire
- cbnz use_syscall, 7f
+ get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11
+ get_ts_realtime res_sec=x10, res_nsec=x11, \
+ clock_nsec=x15, xtime_sec=x13, xtime_nsec=x14, nsec_to_sec=x9
+ clock_gettime_return, shift=1
- bl __do_get_tspec
- seqcnt_check w9, 1b
+ ALIGN
+monotonic:
+ seqcnt_acquire
+ syscall_check fail=syscall
+ ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST]
+ /* w11 = cs_mono_mult, w12 = cs_shift */
+ ldp w11, w12, [vdso_data, #VDSO_CS_MONO_MULT]
+ ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC]
+ ldp x3, x4, [vdso_data, #VDSO_WTM_CLK_SEC]
+ seqcnt_check fail=monotonic
- mov x30, x2
+ /* All computations are done with left-shifted nsecs. */
+ lsl x4, x4, x12
+ get_nsec_per_sec res=x9
+ lsl x9, x9, x12
- cmp w0, #CLOCK_MONOTONIC
- b.ne 6f
+ get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11
+ get_ts_realtime res_sec=x10, res_nsec=x11, \
+ clock_nsec=x15, xtime_sec=x13, xtime_nsec=x14, nsec_to_sec=x9
- /* Get wtm timespec. */
- ldp x13, x14, [vdso_data, #VDSO_WTM_CLK_SEC]
+ add_ts sec=x10, nsec=x11, ts_sec=x3, ts_nsec=x4, nsec_to_sec=x9
+ clock_gettime_return, shift=1
- /* Check the sequence counter. */
- seqcnt_read w9
- seqcnt_check w9, 1b
- b 4f
-2:
- cmp w0, #CLOCK_REALTIME_COARSE
- ccmp w0, #CLOCK_MONOTONIC_COARSE, #0x4, ne
- b.ne 8f
+ ALIGN
+monotonic_raw:
+ seqcnt_acquire
+ syscall_check fail=syscall
+ ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST]
+ /* w11 = cs_raw_mult, w12 = cs_shift */
+ ldp w12, w11, [vdso_data, #VDSO_CS_SHIFT]
+ ldp x13, x14, [vdso_data, #VDSO_RAW_TIME_SEC]
+ seqcnt_check fail=monotonic_raw
- /* xtime_coarse_nsec is already right-shifted */
- mov x12, #0
+ /* All computations are done with left-shifted nsecs. */
+ lsl x14, x14, x12
+ get_nsec_per_sec res=x9
+ lsl x9, x9, x12
- /* Get coarse timespec. */
- adr vdso_data, _vdso_data
-3: seqcnt_acquire
+ get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11
+ get_ts_clock_raw res_sec=x10, res_nsec=x11, \
+ clock_nsec=x15, nsec_to_sec=x9
+
+ add_ts sec=x10, nsec=x11, ts_sec=x13, ts_nsec=x14, nsec_to_sec=x9
+ clock_gettime_return, shift=1
+
+ ALIGN
+realtime_coarse:
+ seqcnt_acquire
ldp x10, x11, [vdso_data, #VDSO_XTIME_CRS_SEC]
+ seqcnt_check fail=realtime_coarse
+ clock_gettime_return
- /* Get wtm timespec. */
+ ALIGN
+monotonic_coarse:
+ seqcnt_acquire
+ ldp x10, x11, [vdso_data, #VDSO_XTIME_CRS_SEC]
ldp x13, x14, [vdso_data, #VDSO_WTM_CLK_SEC]
+ seqcnt_check fail=monotonic_coarse
- /* Check the sequence counter. */
- seqcnt_read w9
- seqcnt_check w9, 3b
+ /* Computations are done in (non-shifted) nsecs. */
+ get_nsec_per_sec res=x9
+ add_ts sec=x10, nsec=x11, ts_sec=x13, ts_nsec=x14, nsec_to_sec=x9
+ clock_gettime_return
- cmp w0, #CLOCK_MONOTONIC_COARSE
- b.ne 6f
-4:
- /* Add on wtm timespec. */
- add x10, x10, x13
- lsl x14, x14, x12
- add x11, x11, x14
-
- /* Normalise the new timespec. */
- mov x15, #NSEC_PER_SEC_LO16
- movk x15, #NSEC_PER_SEC_HI16, lsl #16
- lsl x15, x15, x12
- cmp x11, x15
- b.lt 5f
- sub x11, x11, x15
- add x10, x10, #1
-5:
- cmp x11, #0
- b.ge 6f
- add x11, x11, x15
- sub x10, x10, #1
-
-6: /* Store to the user timespec. */
- lsr x11, x11, x12
- stp x10, x11, [x1, #TSPEC_TV_SEC]
- mov x0, xzr
- ret
-7:
- mov x30, x2
-8: /* Syscall fallback. */
+ ALIGN
+syscall: /* Syscall fallback. */
mov x8, #__NR_clock_gettime
svc #0
ret
@@ -178,6 +301,7 @@ ENTRY(__kernel_clock_getres)
cmp w0, #CLOCK_REALTIME
ccmp w0, #CLOCK_MONOTONIC, #0x4, ne
+ ccmp w0, #CLOCK_MONOTONIC_RAW, #0x4, ne
b.ne 1f
ldr x2, 5f
@@ -204,46 +328,3 @@ ENTRY(__kernel_clock_getres)
.quad CLOCK_COARSE_RES
.cfi_endproc
ENDPROC(__kernel_clock_getres)
-
-/*
- * Read the current time from the architected counter.
- * Expects vdso_data to be initialised.
- * Clobbers the temporary registers (x9 - x15).
- * Returns:
- * - w9 = vDSO sequence counter
- * - (x10, x11) = (ts->tv_sec, shifted ts->tv_nsec)
- * - w12 = cs_shift
- */
-ENTRY(__do_get_tspec)
- .cfi_startproc
-
- /* Read from the vDSO data page. */
- ldr x10, [vdso_data, #VDSO_CS_CYCLE_LAST]
- ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC]
- ldp w11, w12, [vdso_data, #VDSO_CS_MULT]
- seqcnt_read w9
-
- /* Read the virtual counter. */
- isb
- mrs x15, cntvct_el0
-
- /* Calculate cycle delta and convert to ns. */
- sub x10, x15, x10
- /* We can only guarantee 56 bits of precision. */
- movn x15, #0xff00, lsl #48
- and x10, x15, x10
- mul x10, x10, x11
-
- /* Use the kernel time to calculate the new timespec. */
- mov x11, #NSEC_PER_SEC_LO16
- movk x11, #NSEC_PER_SEC_HI16, lsl #16
- lsl x11, x11, x12
- add x15, x10, x14
- udiv x14, x15, x11
- add x10, x13, x14
- mul x13, x14, x11
- sub x11, x15, x13
-
- ret
- .cfi_endproc
-ENDPROC(__do_get_tspec)
diff --git a/arch/arm64/kernel/vdso/vdso.S b/arch/arm64/kernel/vdso/vdso.S
index 60c1db54b41a..82379a70ef03 100644
--- a/arch/arm64/kernel/vdso/vdso.S
+++ b/arch/arm64/kernel/vdso/vdso.S
@@ -21,9 +21,8 @@
#include <linux/const.h>
#include <asm/page.h>
- __PAGE_ALIGNED_DATA
-
.globl vdso_start, vdso_end
+ .section .rodata
.balign PAGE_SIZE
vdso_start:
.incbin "arch/arm64/kernel/vdso/vdso.so"
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 2f600294e8ca..724ed30e4992 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -5,9 +5,11 @@
*/
#include <asm-generic/vmlinux.lds.h>
+#include <asm/kernel-pgtable.h>
#include <asm/thread_info.h>
#include <asm/memory.h>
#include <asm/page.h>
+#include <asm/pgtable.h>
#include "image.h"
@@ -32,6 +34,30 @@ jiffies = jiffies_64;
*(.hyp.text) \
VMLINUX_SYMBOL(__hyp_text_end) = .;
+/*
+ * The size of the PE/COFF section that covers the kernel image, which
+ * runs from stext to _edata, must be a round multiple of the PE/COFF
+ * FileAlignment, which we set to its minimum value of 0x200. 'stext'
+ * itself is 4 KB aligned, so padding out _edata to a 0x200 aligned
+ * boundary should be sufficient.
+ */
+PECOFF_FILE_ALIGNMENT = 0x200;
+
+#ifdef CONFIG_EFI
+#define PECOFF_EDATA_PADDING \
+ .pecoff_edata_padding : { BYTE(0); . = ALIGN(PECOFF_FILE_ALIGNMENT); }
+#else
+#define PECOFF_EDATA_PADDING
+#endif
+
+#ifdef CONFIG_DEBUG_ALIGN_RODATA
+#define ALIGN_DEBUG_RO . = ALIGN(1<<SECTION_SHIFT);
+#define ALIGN_DEBUG_RO_MIN(min) ALIGN_DEBUG_RO
+#else
+#define ALIGN_DEBUG_RO
+#define ALIGN_DEBUG_RO_MIN(min) . = ALIGN(min);
+#endif
+
SECTIONS
{
/*
@@ -54,6 +80,7 @@ SECTIONS
_text = .;
HEAD_TEXT
}
+ ALIGN_DEBUG_RO
.text : { /* Real text segment */
_stext = .; /* Text and read-only data */
__exception_text_start = .;
@@ -70,19 +97,23 @@ SECTIONS
*(.got) /* Global offset table */
}
+ ALIGN_DEBUG_RO
+ _etext = .; /* End of text section */
+
RO_DATA(PAGE_SIZE)
EXCEPTION_TABLE(8)
NOTES
- _etext = .; /* End of text and rodata section */
+ ALIGN_DEBUG_RO
- . = ALIGN(PAGE_SIZE);
+ ALIGN_DEBUG_RO_MIN(PAGE_SIZE)
__init_begin = .;
INIT_TEXT_SECTION(8)
.exit.text : {
ARM_EXIT_KEEP(EXIT_TEXT)
}
- . = ALIGN(16);
+
+ ALIGN_DEBUG_RO_MIN(16)
.init.data : {
INIT_DATA
INIT_SETUP(16)
@@ -114,6 +145,7 @@ SECTIONS
_data = .;
_sdata = .;
RW_DATA_SECTION(64, PAGE_SIZE, THREAD_SIZE)
+ PECOFF_EDATA_PADDING
_edata = .;
BSS_SECTION(0, 0, 0)
@@ -124,6 +156,11 @@ SECTIONS
swapper_pg_dir = .;
. += SWAPPER_DIR_SIZE;
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+ reserved_ttbr0 = .;
+ . += RESERVED_TTBR0_SIZE;
+#endif
+
_end = .;
STABS_DEBUG
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index 0b4326578985..6dbc6ace341f 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -52,7 +52,7 @@ static bool cpu_has_32bit_el1(void)
{
u64 pfr0;
- pfr0 = read_cpuid(ID_AA64PFR0_EL1);
+ pfr0 = read_system_reg(SYS_ID_AA64PFR0_EL1);
return !!(pfr0 & 0x20);
}
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 3d7c2df89946..f4001cb14488 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -69,68 +69,31 @@ static u32 get_ccsidr(u32 csselr)
return ccsidr;
}
-static void do_dc_cisw(u32 val)
-{
- asm volatile("dc cisw, %x0" : : "r" (val));
- dsb(ish);
-}
-
-static void do_dc_csw(u32 val)
-{
- asm volatile("dc csw, %x0" : : "r" (val));
- dsb(ish);
-}
-
-/* See note at ARM ARM B1.14.4 */
+/*
+ * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
+ */
static bool access_dcsw(struct kvm_vcpu *vcpu,
const struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
- unsigned long val;
- int cpu;
-
if (!p->is_write)
return read_from_write_only(vcpu, p);
- cpu = get_cpu();
-
- cpumask_setall(&vcpu->arch.require_dcache_flush);
- cpumask_clear_cpu(cpu, &vcpu->arch.require_dcache_flush);
-
- /* If we were already preempted, take the long way around */
- if (cpu != vcpu->arch.last_pcpu) {
- flush_cache_all();
- goto done;
- }
-
- val = *vcpu_reg(vcpu, p->Rt);
-
- switch (p->CRm) {
- case 6: /* Upgrade DCISW to DCCISW, as per HCR.SWIO */
- case 14: /* DCCISW */
- do_dc_cisw(val);
- break;
-
- case 10: /* DCCSW */
- do_dc_csw(val);
- break;
- }
-
-done:
- put_cpu();
-
+ kvm_set_way_flush(vcpu);
return true;
}
/*
* Generic accessor for VM registers. Only called as long as HCR_TVM
- * is set.
+ * is set. If the guest enables the MMU, we stop trapping the VM
+ * sys_regs and leave it in complete control of the caches.
*/
static bool access_vm_reg(struct kvm_vcpu *vcpu,
const struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
unsigned long val;
+ bool was_enabled = vcpu_has_cache_enabled(vcpu);
BUG_ON(!p->is_write);
@@ -143,25 +106,7 @@ static bool access_vm_reg(struct kvm_vcpu *vcpu,
vcpu_cp15_64_low(vcpu, r->reg) = val & 0xffffffffUL;
}
- return true;
-}
-
-/*
- * SCTLR_EL1 accessor. Only called as long as HCR_TVM is set. If the
- * guest enables the MMU, we stop trapping the VM sys_regs and leave
- * it in complete control of the caches.
- */
-static bool access_sctlr(struct kvm_vcpu *vcpu,
- const struct sys_reg_params *p,
- const struct sys_reg_desc *r)
-{
- access_vm_reg(vcpu, p, r);
-
- if (vcpu_has_cache_enabled(vcpu)) { /* MMU+Caches enabled? */
- vcpu->arch.hcr_el2 &= ~HCR_TVM;
- stage2_flush_vm(vcpu->kvm);
- }
-
+ kvm_toggle_cache(vcpu, was_enabled);
return true;
}
@@ -377,7 +322,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
NULL, reset_mpidr, MPIDR_EL1 },
/* SCTLR_EL1 */
{ Op0(0b11), Op1(0b000), CRn(0b0001), CRm(0b0000), Op2(0b000),
- access_sctlr, reset_val, SCTLR_EL1, 0x00C50078 },
+ access_vm_reg, reset_val, SCTLR_EL1, 0x00C50078 },
/* CPACR_EL1 */
{ Op0(0b11), Op1(0b000), CRn(0b0001), CRm(0b0000), Op2(0b010),
NULL, reset_val, CPACR_EL1, 0 },
@@ -509,13 +454,13 @@ static bool trap_dbgidr(struct kvm_vcpu *vcpu,
if (p->is_write) {
return ignore_write(vcpu, p);
} else {
- u64 dfr = read_cpuid(ID_AA64DFR0_EL1);
- u64 pfr = read_cpuid(ID_AA64PFR0_EL1);
- u32 el3 = !!((pfr >> 12) & 0xf);
+ u64 dfr = read_system_reg(SYS_ID_AA64DFR0_EL1);
+ u64 pfr = read_system_reg(SYS_ID_AA64PFR0_EL1);
+ u32 el3 = !!cpuid_feature_extract_field(pfr, ID_AA64PFR0_EL3_SHIFT);
- *vcpu_reg(vcpu, p->Rt) = ((((dfr >> 20) & 0xf) << 28) |
- (((dfr >> 12) & 0xf) << 24) |
- (((dfr >> 28) & 0xf) << 20) |
+ *vcpu_reg(vcpu, p->Rt) = ((((dfr >> ID_AA64DFR0_WRPS_SHIFT) & 0xf) << 28) |
+ (((dfr >> ID_AA64DFR0_BRPS_SHIFT) & 0xf) << 24) |
+ (((dfr >> ID_AA64DFR0_CTX_CMPS_SHIFT) & 0xf) << 20) |
(6 << 16) | (el3 << 14) | (el3 << 12));
return true;
}
@@ -657,7 +602,7 @@ static const struct sys_reg_desc cp14_64_regs[] = {
* register).
*/
static const struct sys_reg_desc cp15_regs[] = {
- { Op1( 0), CRn( 1), CRm( 0), Op2( 0), access_sctlr, NULL, c1_SCTLR },
+ { Op1( 0), CRn( 1), CRm( 0), Op2( 0), access_vm_reg, NULL, c1_SCTLR },
{ Op1( 0), CRn( 2), CRm( 0), Op2( 0), access_vm_reg, NULL, c2_TTBR0 },
{ Op1( 0), CRn( 2), CRm( 0), Op2( 1), access_vm_reg, NULL, c2_TTBR1 },
{ Op1( 0), CRn( 2), CRm( 0), Op2( 2), access_vm_reg, NULL, c2_TTBCR },
diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S
index a9723c71c52b..d7150e30438a 100644
--- a/arch/arm64/lib/clear_user.S
+++ b/arch/arm64/lib/clear_user.S
@@ -17,10 +17,7 @@
*/
#include <linux/linkage.h>
-#include <asm/alternative.h>
-#include <asm/assembler.h>
-#include <asm/cpufeature.h>
-#include <asm/sysreg.h>
+#include <asm/uaccess.h>
.text
@@ -33,29 +30,27 @@
* Alignment fixed up by hardware.
*/
ENTRY(__clear_user)
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \
- CONFIG_ARM64_PAN)
+ uaccess_enable_not_uao x2, x3
mov x2, x1 // save the size for fixup return
subs x1, x1, #8
b.mi 2f
1:
-USER(9f, str xzr, [x0], #8 )
+uao_user_alternative 9f, str, sttr, xzr, x0, 8
subs x1, x1, #8
b.pl 1b
2: adds x1, x1, #4
b.mi 3f
-USER(9f, str wzr, [x0], #4 )
+uao_user_alternative 9f, str, sttr, wzr, x0, 4
sub x1, x1, #4
3: adds x1, x1, #2
b.mi 4f
-USER(9f, strh wzr, [x0], #2 )
+uao_user_alternative 9f, strh, sttrh, wzr, x0, 2
sub x1, x1, #2
4: adds x1, x1, #1
b.mi 5f
-USER(9f, strb wzr, [x0] )
+uao_user_alternative 9f, strb, sttrb, wzr, x0, 0
5: mov x0, #0
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \
- CONFIG_ARM64_PAN)
+ uaccess_disable_not_uao x2
ret
ENDPROC(__clear_user)
diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
index 1be9ef27be97..90154f3f7f2a 100644
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -16,10 +16,8 @@
#include <linux/linkage.h>
-#include <asm/alternative.h>
-#include <asm/assembler.h>
-#include <asm/cpufeature.h>
-#include <asm/sysreg.h>
+#include <asm/cache.h>
+#include <asm/uaccess.h>
/*
* Copy from user space to a kernel buffer (alignment handled by the hardware)
@@ -31,49 +29,56 @@
* Returns:
* x0 - bytes not copied
*/
-ENTRY(__copy_from_user)
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \
- CONFIG_ARM64_PAN)
- add x5, x1, x2 // upper user buffer boundary
- subs x2, x2, #16
- b.mi 1f
-0:
-USER(9f, ldp x3, x4, [x1], #16)
- subs x2, x2, #16
- stp x3, x4, [x0], #16
- b.pl 0b
-1: adds x2, x2, #8
- b.mi 2f
-USER(9f, ldr x3, [x1], #8 )
- sub x2, x2, #8
- str x3, [x0], #8
-2: adds x2, x2, #4
- b.mi 3f
-USER(9f, ldr w3, [x1], #4 )
- sub x2, x2, #4
- str w3, [x0], #4
-3: adds x2, x2, #2
- b.mi 4f
-USER(9f, ldrh w3, [x1], #2 )
- sub x2, x2, #2
- strh w3, [x0], #2
-4: adds x2, x2, #1
- b.mi 5f
-USER(9f, ldrb w3, [x1] )
- strb w3, [x0]
-5: mov x0, #0
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \
- CONFIG_ARM64_PAN)
+
+ .macro ldrb1 ptr, regB, val
+ uao_user_alternative 9998f, ldrb, ldtrb, \ptr, \regB, \val
+ .endm
+
+ .macro strb1 ptr, regB, val
+ strb \ptr, [\regB], \val
+ .endm
+
+ .macro ldrh1 ptr, regB, val
+ uao_user_alternative 9998f, ldrh, ldtrh, \ptr, \regB, \val
+ .endm
+
+ .macro strh1 ptr, regB, val
+ strh \ptr, [\regB], \val
+ .endm
+
+ .macro ldr1 ptr, regB, val
+ uao_user_alternative 9998f, ldr, ldtr, \ptr, \regB, \val
+ .endm
+
+ .macro str1 ptr, regB, val
+ str \ptr, [\regB], \val
+ .endm
+
+ .macro ldp1 ptr, regB, regC, val
+ uao_ldp 9998f, \ptr, \regB, \regC, \val
+ .endm
+
+ .macro stp1 ptr, regB, regC, val
+ stp \ptr, \regB, [\regC], \val
+ .endm
+
+end .req x5
+ENTRY(__arch_copy_from_user)
+ uaccess_enable_not_uao x3, x4
+ add end, x0, x2
+#include "copy_template.S"
+ uaccess_disable_not_uao x3
+ mov x0, #0 // Nothing to copy
ret
-ENDPROC(__copy_from_user)
+ENDPROC(__arch_copy_from_user)
.section .fixup,"ax"
.align 2
-9: sub x2, x5, x1
- mov x3, x2
-10: strb wzr, [x0], #1 // zero remaining buffer space
- subs x3, x3, #1
- b.ne 10b
- mov x0, x2 // bytes not copied
+9998:
+ sub x0, end, dst
+9999:
+ strb wzr, [dst], #1 // zero remaining buffer space
+ cmp dst, end
+ b.lo 9999b
ret
.previous
diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S
index 1b94661e22b3..718b1c4e2f85 100644
--- a/arch/arm64/lib/copy_in_user.S
+++ b/arch/arm64/lib/copy_in_user.S
@@ -18,10 +18,8 @@
#include <linux/linkage.h>
-#include <asm/alternative.h>
-#include <asm/assembler.h>
-#include <asm/cpufeature.h>
-#include <asm/sysreg.h>
+#include <asm/cache.h>
+#include <asm/uaccess.h>
/*
* Copy from user space to user space (alignment handled by the hardware)
@@ -33,44 +31,50 @@
* Returns:
* x0 - bytes not copied
*/
+ .macro ldrb1 ptr, regB, val
+ uao_user_alternative 9998f, ldrb, ldtrb, \ptr, \regB, \val
+ .endm
+
+ .macro strb1 ptr, regB, val
+ uao_user_alternative 9998f, strb, sttrb, \ptr, \regB, \val
+ .endm
+
+ .macro ldrh1 ptr, regB, val
+ uao_user_alternative 9998f, ldrh, ldtrh, \ptr, \regB, \val
+ .endm
+
+ .macro strh1 ptr, regB, val
+ uao_user_alternative 9998f, strh, sttrh, \ptr, \regB, \val
+ .endm
+
+ .macro ldr1 ptr, regB, val
+ uao_user_alternative 9998f, ldr, ldtr, \ptr, \regB, \val
+ .endm
+
+ .macro str1 ptr, regB, val
+ uao_user_alternative 9998f, str, sttr, \ptr, \regB, \val
+ .endm
+
+ .macro ldp1 ptr, regB, regC, val
+ uao_ldp 9998f, \ptr, \regB, \regC, \val
+ .endm
+
+ .macro stp1 ptr, regB, regC, val
+ uao_stp 9998f, \ptr, \regB, \regC, \val
+ .endm
+
+end .req x5
ENTRY(__copy_in_user)
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \
- CONFIG_ARM64_PAN)
- add x5, x0, x2 // upper user buffer boundary
- subs x2, x2, #16
- b.mi 1f
-0:
-USER(9f, ldp x3, x4, [x1], #16)
- subs x2, x2, #16
-USER(9f, stp x3, x4, [x0], #16)
- b.pl 0b
-1: adds x2, x2, #8
- b.mi 2f
-USER(9f, ldr x3, [x1], #8 )
- sub x2, x2, #8
-USER(9f, str x3, [x0], #8 )
-2: adds x2, x2, #4
- b.mi 3f
-USER(9f, ldr w3, [x1], #4 )
- sub x2, x2, #4
-USER(9f, str w3, [x0], #4 )
-3: adds x2, x2, #2
- b.mi 4f
-USER(9f, ldrh w3, [x1], #2 )
- sub x2, x2, #2
-USER(9f, strh w3, [x0], #2 )
-4: adds x2, x2, #1
- b.mi 5f
-USER(9f, ldrb w3, [x1] )
-USER(9f, strb w3, [x0] )
-5: mov x0, #0
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \
- CONFIG_ARM64_PAN)
+ uaccess_enable_not_uao x3, x4
+ add end, x0, x2
+#include "copy_template.S"
+ uaccess_disable_not_uao x3
+ mov x0, #0
ret
ENDPROC(__copy_in_user)
.section .fixup,"ax"
.align 2
-9: sub x0, x5, x0 // bytes not copied
+9998: sub x0, end, dst // bytes not copied
ret
.previous
diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
index a257b47e2dc4..e99e31c9acac 100644
--- a/arch/arm64/lib/copy_to_user.S
+++ b/arch/arm64/lib/copy_to_user.S
@@ -16,10 +16,8 @@
#include <linux/linkage.h>
-#include <asm/alternative.h>
-#include <asm/assembler.h>
-#include <asm/cpufeature.h>
-#include <asm/sysreg.h>
+#include <asm/cache.h>
+#include <asm/uaccess.h>
/*
* Copy to user space from a kernel buffer (alignment handled by the hardware)
@@ -31,44 +29,50 @@
* Returns:
* x0 - bytes not copied
*/
-ENTRY(__copy_to_user)
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \
- CONFIG_ARM64_PAN)
- add x5, x0, x2 // upper user buffer boundary
- subs x2, x2, #16
- b.mi 1f
-0:
- ldp x3, x4, [x1], #16
- subs x2, x2, #16
-USER(9f, stp x3, x4, [x0], #16)
- b.pl 0b
-1: adds x2, x2, #8
- b.mi 2f
- ldr x3, [x1], #8
- sub x2, x2, #8
-USER(9f, str x3, [x0], #8 )
-2: adds x2, x2, #4
- b.mi 3f
- ldr w3, [x1], #4
- sub x2, x2, #4
-USER(9f, str w3, [x0], #4 )
-3: adds x2, x2, #2
- b.mi 4f
- ldrh w3, [x1], #2
- sub x2, x2, #2
-USER(9f, strh w3, [x0], #2 )
-4: adds x2, x2, #1
- b.mi 5f
- ldrb w3, [x1]
-USER(9f, strb w3, [x0] )
-5: mov x0, #0
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \
- CONFIG_ARM64_PAN)
+ .macro ldrb1 ptr, regB, val
+ ldrb \ptr, [\regB], \val
+ .endm
+
+ .macro strb1 ptr, regB, val
+ uao_user_alternative 9998f, strb, sttrb, \ptr, \regB, \val
+ .endm
+
+ .macro ldrh1 ptr, regB, val
+ ldrh \ptr, [\regB], \val
+ .endm
+
+ .macro strh1 ptr, regB, val
+ uao_user_alternative 9998f, strh, sttrh, \ptr, \regB, \val
+ .endm
+
+ .macro ldr1 ptr, regB, val
+ ldr \ptr, [\regB], \val
+ .endm
+
+ .macro str1 ptr, regB, val
+ uao_user_alternative 9998f, str, sttr, \ptr, \regB, \val
+ .endm
+
+ .macro ldp1 ptr, regB, regC, val
+ ldp \ptr, \regB, [\regC], \val
+ .endm
+
+ .macro stp1 ptr, regB, regC, val
+ uao_stp 9998f, \ptr, \regB, \regC, \val
+ .endm
+
+end .req x5
+ENTRY(__arch_copy_to_user)
+ uaccess_enable_not_uao x3, x4
+ add end, x0, x2
+#include "copy_template.S"
+ uaccess_disable_not_uao x3
+ mov x0, #0
ret
-ENDPROC(__copy_to_user)
+ENDPROC(__arch_copy_to_user)
.section .fixup,"ax"
.align 2
-9: sub x0, x5, x0 // bytes not copied
+9998: sub x0, end, dst // bytes not copied
ret
.previous
diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile
index 00bc265f87e9..57f57fde5722 100644
--- a/arch/arm64/mm/Makefile
+++ b/arch/arm64/mm/Makefile
@@ -3,6 +3,7 @@ obj-y := dma-mapping.o extable.o fault.o init.o \
ioremap.o mmap.o pgd.o mmu.o \
context.o proc.o pageattr.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
+obj-$(CONFIG_ARM64_PTDUMP) += dump.o
obj-$(CONFIG_KASAN) += kasan_init.o
KASAN_SANITIZE_kasan_init.o := n
diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
index 321a6ac84a94..1c10c80ff2d1 100644
--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -22,81 +22,7 @@
#include <asm/assembler.h>
#include <asm/cpufeature.h>
#include <asm/alternative.h>
-
-#include "proc-macros.S"
-
-/*
- * __flush_dcache_all()
- *
- * Flush the whole D-cache.
- *
- * Corrupted registers: x0-x7, x9-x11
- */
-__flush_dcache_all:
- dmb sy // ensure ordering with previous memory accesses
- mrs x0, clidr_el1 // read clidr
- and x3, x0, #0x7000000 // extract loc from clidr
- lsr x3, x3, #23 // left align loc bit field
- cbz x3, finished // if loc is 0, then no need to clean
- mov x10, #0 // start clean at cache level 0
-loop1:
- add x2, x10, x10, lsr #1 // work out 3x current cache level
- lsr x1, x0, x2 // extract cache type bits from clidr
- and x1, x1, #7 // mask of the bits for current cache only
- cmp x1, #2 // see what cache we have at this level
- b.lt skip // skip if no cache, or just i-cache
- save_and_disable_irqs x9 // make CSSELR and CCSIDR access atomic
- msr csselr_el1, x10 // select current cache level in csselr
- isb // isb to sych the new cssr&csidr
- mrs x1, ccsidr_el1 // read the new ccsidr
- restore_irqs x9
- and x2, x1, #7 // extract the length of the cache lines
- add x2, x2, #4 // add 4 (line length offset)
- mov x4, #0x3ff
- and x4, x4, x1, lsr #3 // find maximum number on the way size
- clz w5, w4 // find bit position of way size increment
- mov x7, #0x7fff
- and x7, x7, x1, lsr #13 // extract max number of the index size
-loop2:
- mov x9, x4 // create working copy of max way size
-loop3:
- lsl x6, x9, x5
- orr x11, x10, x6 // factor way and cache number into x11
- lsl x6, x7, x2
- orr x11, x11, x6 // factor index number into x11
- dc cisw, x11 // clean & invalidate by set/way
- subs x9, x9, #1 // decrement the way
- b.ge loop3
- subs x7, x7, #1 // decrement the index
- b.ge loop2
-skip:
- add x10, x10, #2 // increment cache number
- cmp x3, x10
- b.gt loop1
-finished:
- mov x10, #0 // swith back to cache level 0
- msr csselr_el1, x10 // select current cache level in csselr
- dsb sy
- isb
- ret
-ENDPROC(__flush_dcache_all)
-
-/*
- * flush_cache_all()
- *
- * Flush the entire cache system. The data cache flush is now achieved
- * using atomic clean / invalidates working outwards from L1 cache. This
- * is done using Set/Way based cache maintainance instructions. The
- * instruction cache can still be invalidated back to the point of
- * unification in a single instruction.
- */
-ENTRY(flush_cache_all)
- mov x12, lr
- bl __flush_dcache_all
- mov x0, #0
- ic ialluis // I+BTB cache invalidate
- ret x12
-ENDPROC(flush_cache_all)
+#include <asm/uaccess.h>
/*
* flush_icache_range(start,end)
@@ -122,6 +48,7 @@ ENTRY(flush_icache_range)
* - end - virtual end address of region
*/
ENTRY(__flush_cache_user_range)
+ uaccess_ttbr0_enable x2, x3
dcache_line_size x2, x3
sub x3, x2, #1
bic x4, x0, x3
@@ -143,6 +70,7 @@ USER(9f, ic ivau, x4 ) // invalidate I line PoU
9: // ignore any faulting cache operation
dsb ish
isb
+ uaccess_ttbr0_disable x1
ret
ENDPROC(flush_icache_range)
ENDPROC(__flush_cache_user_range)
@@ -212,7 +140,12 @@ __dma_clean_range:
dcache_line_size x2, x3
sub x3, x2, #1
bic x0, x0, x3
-1: alternative_insn "dc cvac, x0", "dc civac, x0", ARM64_WORKAROUND_CLEAN_CACHE
+1:
+alternative_if_not ARM64_WORKAROUND_CLEAN_CACHE
+ dc cvac, x0
+alternative_else
+ dc civac, x0
+alternative_endif
add x0, x0, x2
cmp x0, x1
b.lo 1b
diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index 76c1e6cd36fc..a1877045141a 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -17,151 +17,190 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-#include <linux/init.h>
+#include <linux/bitops.h>
#include <linux/sched.h>
+#include <linux/slab.h>
#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/percpu.h>
+#include <asm/cpufeature.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
-#include <asm/cachetype.h>
-
-#define asid_bits(reg) \
- (((read_cpuid(ID_AA64MMFR0_EL1) & 0xf0) >> 2) + 8)
-
-#define ASID_FIRST_VERSION (1 << MAX_ASID_BITS)
+static u32 asid_bits;
static DEFINE_RAW_SPINLOCK(cpu_asid_lock);
-unsigned int cpu_last_asid = ASID_FIRST_VERSION;
-/*
- * We fork()ed a process, and we need a new context for the child to run in.
- */
-void __init_new_context(struct task_struct *tsk, struct mm_struct *mm)
-{
- mm->context.id = 0;
- raw_spin_lock_init(&mm->context.id_lock);
-}
+static atomic64_t asid_generation;
+static unsigned long *asid_map;
-static void flush_context(void)
-{
- /* set the reserved TTBR0 before flushing the TLB */
- cpu_set_reserved_ttbr0();
- flush_tlb_all();
- if (icache_is_aivivt())
- __flush_icache_all();
-}
+static DEFINE_PER_CPU(atomic64_t, active_asids);
+static DEFINE_PER_CPU(u64, reserved_asids);
+static cpumask_t tlb_flush_pending;
-#ifdef CONFIG_SMP
+#define ASID_MASK (~GENMASK(asid_bits - 1, 0))
+#define ASID_FIRST_VERSION (1UL << asid_bits)
+#define NUM_USER_ASIDS ASID_FIRST_VERSION
-static void set_mm_context(struct mm_struct *mm, unsigned int asid)
+static void flush_context(unsigned int cpu)
{
- unsigned long flags;
+ int i;
+ u64 asid;
+
+ /* Update the list of reserved ASIDs and the ASID bitmap. */
+ bitmap_clear(asid_map, 0, NUM_USER_ASIDS);
/*
- * Locking needed for multi-threaded applications where the same
- * mm->context.id could be set from different CPUs during the
- * broadcast. This function is also called via IPI so the
- * mm->context.id_lock has to be IRQ-safe.
+ * Ensure the generation bump is observed before we xchg the
+ * active_asids.
*/
- raw_spin_lock_irqsave(&mm->context.id_lock, flags);
- if (likely((mm->context.id ^ cpu_last_asid) >> MAX_ASID_BITS)) {
+ smp_wmb();
+
+ for_each_possible_cpu(i) {
+ asid = atomic64_xchg_relaxed(&per_cpu(active_asids, i), 0);
/*
- * Old version of ASID found. Set the new one and reset
- * mm_cpumask(mm).
+ * If this CPU has already been through a
+ * rollover, but hasn't run another task in
+ * the meantime, we must preserve its reserved
+ * ASID, as this is the only trace we have of
+ * the process it is still running.
*/
- mm->context.id = asid;
- cpumask_clear(mm_cpumask(mm));
+ if (asid == 0)
+ asid = per_cpu(reserved_asids, i);
+ __set_bit(asid & ~ASID_MASK, asid_map);
+ per_cpu(reserved_asids, i) = asid;
}
- raw_spin_unlock_irqrestore(&mm->context.id_lock, flags);
- /*
- * Set the mm_cpumask(mm) bit for the current CPU.
- */
- cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
+ /* Queue a TLB invalidate and flush the I-cache if necessary. */
+ cpumask_setall(&tlb_flush_pending);
+
+ if (icache_is_aivivt())
+ __flush_icache_all();
}
-/*
- * Reset the ASID on the current CPU. This function call is broadcast from the
- * CPU handling the ASID rollover and holding cpu_asid_lock.
- */
-static void reset_context(void *info)
+static int is_reserved_asid(u64 asid)
{
- unsigned int asid;
- unsigned int cpu = smp_processor_id();
- struct mm_struct *mm = current->active_mm;
+ int cpu;
+ for_each_possible_cpu(cpu)
+ if (per_cpu(reserved_asids, cpu) == asid)
+ return 1;
+ return 0;
+}
+
+static u64 new_context(struct mm_struct *mm, unsigned int cpu)
+{
+ static u32 cur_idx = 1;
+ u64 asid = atomic64_read(&mm->context.id);
+ u64 generation = atomic64_read(&asid_generation);
+
+ if (asid != 0) {
+ /*
+ * If our current ASID was active during a rollover, we
+ * can continue to use it and this was just a false alarm.
+ */
+ if (is_reserved_asid(asid))
+ return generation | (asid & ~ASID_MASK);
+
+ /*
+ * We had a valid ASID in a previous life, so try to re-use
+ * it if possible.
+ */
+ asid &= ~ASID_MASK;
+ if (!__test_and_set_bit(asid, asid_map))
+ goto bump_gen;
+ }
/*
- * current->active_mm could be init_mm for the idle thread immediately
- * after secondary CPU boot or hotplug. TTBR0_EL1 is already set to
- * the reserved value, so no need to reset any context.
+ * Allocate a free ASID. If we can't find one, take a note of the
+ * currently active ASIDs and mark the TLBs as requiring flushes.
+ * We always count from ASID #1, as we use ASID #0 when setting a
+ * reserved TTBR0 for the init_mm.
*/
- if (mm == &init_mm)
- return;
+ asid = find_next_zero_bit(asid_map, NUM_USER_ASIDS, cur_idx);
+ if (asid != NUM_USER_ASIDS)
+ goto set_asid;
- smp_rmb();
- asid = cpu_last_asid + cpu;
+ /* We're out of ASIDs, so increment the global generation count */
+ generation = atomic64_add_return_relaxed(ASID_FIRST_VERSION,
+ &asid_generation);
+ flush_context(cpu);
- flush_context();
- set_mm_context(mm, asid);
+ /* We have at least 1 ASID per CPU, so this will always succeed */
+ asid = find_next_zero_bit(asid_map, NUM_USER_ASIDS, 1);
- /* set the new ASID */
- cpu_switch_mm(mm->pgd, mm);
-}
+set_asid:
+ __set_bit(asid, asid_map);
+ cur_idx = asid;
-#else
-
-static inline void set_mm_context(struct mm_struct *mm, unsigned int asid)
-{
- mm->context.id = asid;
- cpumask_copy(mm_cpumask(mm), cpumask_of(smp_processor_id()));
+bump_gen:
+ asid |= generation;
+ return asid;
}
-#endif
-
-void __new_context(struct mm_struct *mm)
+void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
{
- unsigned int asid;
- unsigned int bits = asid_bits();
+ unsigned long flags;
+ u64 asid;
+
+ asid = atomic64_read(&mm->context.id);
- raw_spin_lock(&cpu_asid_lock);
-#ifdef CONFIG_SMP
/*
- * Check the ASID again, in case the change was broadcast from another
- * CPU before we acquired the lock.
+ * The memory ordering here is subtle. We rely on the control
+ * dependency between the generation read and the update of
+ * active_asids to ensure that we are synchronised with a
+ * parallel rollover (i.e. this pairs with the smp_wmb() in
+ * flush_context).
*/
- if (!unlikely((mm->context.id ^ cpu_last_asid) >> MAX_ASID_BITS)) {
- cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
- raw_spin_unlock(&cpu_asid_lock);
- return;
+ if (!((asid ^ atomic64_read(&asid_generation)) >> asid_bits)
+ && atomic64_xchg_relaxed(&per_cpu(active_asids, cpu), asid))
+ goto switch_mm_fastpath;
+
+ raw_spin_lock_irqsave(&cpu_asid_lock, flags);
+ /* Check that our ASID belongs to the current generation. */
+ asid = atomic64_read(&mm->context.id);
+ if ((asid ^ atomic64_read(&asid_generation)) >> asid_bits) {
+ asid = new_context(mm, cpu);
+ atomic64_set(&mm->context.id, asid);
}
-#endif
- /*
- * At this point, it is guaranteed that the current mm (with an old
- * ASID) isn't active on any other CPU since the ASIDs are changed
- * simultaneously via IPI.
- */
- asid = ++cpu_last_asid;
+ if (cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending))
+ local_flush_tlb_all();
+
+ atomic64_set(&per_cpu(active_asids, cpu), asid);
+ raw_spin_unlock_irqrestore(&cpu_asid_lock, flags);
+
+switch_mm_fastpath:
/*
- * If we've used up all our ASIDs, we need to start a new version and
- * flush the TLB.
+ * Defer TTBR0_EL1 setting for user threads to uaccess_enable() when
+ * emulating PAN.
*/
- if (unlikely((asid & ((1 << bits) - 1)) == 0)) {
- /* increment the ASID version */
- cpu_last_asid += (1 << MAX_ASID_BITS) - (1 << bits);
- if (cpu_last_asid == 0)
- cpu_last_asid = ASID_FIRST_VERSION;
- asid = cpu_last_asid + smp_processor_id();
- flush_context();
-#ifdef CONFIG_SMP
- smp_wmb();
- smp_call_function(reset_context, NULL, 1);
-#endif
- cpu_last_asid += NR_CPUS - 1;
+ if (!system_uses_ttbr0_pan())
+ cpu_switch_mm(mm->pgd, mm);
+}
+
+static int asids_init(void)
+{
+ int fld = cpuid_feature_extract_field(read_cpuid(SYS_ID_AA64MMFR0_EL1), 4);
+
+ switch (fld) {
+ default:
+ pr_warn("Unknown ASID size (%d); assuming 8-bit\n", fld);
+ /* Fallthrough */
+ case 0:
+ asid_bits = 8;
+ break;
+ case 2:
+ asid_bits = 16;
}
- set_mm_context(mm, asid);
- raw_spin_unlock(&cpu_asid_lock);
+ /* If we end up with more CPUs than ASIDs, expect things to crash */
+ WARN_ON(NUM_USER_ASIDS < num_possible_cpus());
+ atomic64_set(&asid_generation, ASID_FIRST_VERSION);
+ asid_map = kzalloc(BITS_TO_LONGS(NUM_USER_ASIDS) * sizeof(*asid_map),
+ GFP_KERNEL);
+ if (!asid_map)
+ panic("Failed to allocate bitmap for %lu ASIDs\n",
+ NUM_USER_ASIDS);
+
+ pr_info("ASID allocator initialised with %lu entries\n", NUM_USER_ASIDS);
+ return 0;
}
+early_initcall(asids_init);
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 6efbb52cb92e..97f741cd34e7 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -42,7 +42,7 @@ static pgprot_t __get_dma_pgprot(struct dma_attrs *attrs, pgprot_t prot,
static struct gen_pool *atomic_pool;
#define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K
-static size_t atomic_pool_size = DEFAULT_DMA_COHERENT_POOL_SIZE;
+static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE;
static int __init early_coherent_pool(char *p)
{
@@ -170,7 +170,7 @@ static void *__dma_alloc_noncoherent(struct device *dev, size_t size,
coherent_ptr = dma_common_contiguous_remap(page, size, VM_USERMAP,
__get_dma_pgprot(attrs,
__pgprot(PROT_NORMAL_NC), false),
- NULL);
+ __builtin_return_address(0));
if (!coherent_ptr)
goto no_map;
diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c
new file mode 100644
index 000000000000..bf69601be546
--- /dev/null
+++ b/arch/arm64/mm/dump.c
@@ -0,0 +1,332 @@
+/*
+ * Copyright (c) 2014, The Linux Foundation. All rights reserved.
+ * Debug helper to dump the current kernel pagetables of the system
+ * so that we can see what the various memory ranges are set to.
+ *
+ * Derived from x86 and arm implementation:
+ * (C) Copyright 2008 Intel Corporation
+ *
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#include <linux/debugfs.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+
+#include <asm/fixmap.h>
+#include <asm/pgtable.h>
+
+#define LOWEST_ADDR (UL(0xffffffffffffffff) << VA_BITS)
+
+struct addr_marker {
+ unsigned long start_address;
+ const char *name;
+};
+
+enum address_markers_idx {
+ VMALLOC_START_NR = 0,
+ VMALLOC_END_NR,
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ VMEMMAP_START_NR,
+ VMEMMAP_END_NR,
+#endif
+ PCI_START_NR,
+ PCI_END_NR,
+ FIXADDR_START_NR,
+ FIXADDR_END_NR,
+ MODULES_START_NR,
+ MODUELS_END_NR,
+ KERNEL_SPACE_NR,
+};
+
+static struct addr_marker address_markers[] = {
+ { VMALLOC_START, "vmalloc() Area" },
+ { VMALLOC_END, "vmalloc() End" },
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ { 0, "vmemmap start" },
+ { 0, "vmemmap end" },
+#endif
+ { (unsigned long) PCI_IOBASE, "PCI I/O start" },
+ { (unsigned long) PCI_IOBASE + SZ_16M, "PCI I/O end" },
+ { FIXADDR_START, "Fixmap start" },
+ { FIXADDR_TOP, "Fixmap end" },
+ { MODULES_VADDR, "Modules start" },
+ { MODULES_END, "Modules end" },
+ { PAGE_OFFSET, "Kernel Mapping" },
+ { -1, NULL },
+};
+
+struct pg_state {
+ struct seq_file *seq;
+ const struct addr_marker *marker;
+ unsigned long start_address;
+ unsigned level;
+ u64 current_prot;
+};
+
+struct prot_bits {
+ u64 mask;
+ u64 val;
+ const char *set;
+ const char *clear;
+};
+
+static const struct prot_bits pte_bits[] = {
+ {
+ .mask = PTE_USER,
+ .val = PTE_USER,
+ .set = "USR",
+ .clear = " ",
+ }, {
+ .mask = PTE_RDONLY,
+ .val = PTE_RDONLY,
+ .set = "ro",
+ .clear = "RW",
+ }, {
+ .mask = PTE_PXN,
+ .val = PTE_PXN,
+ .set = "NX",
+ .clear = "x ",
+ }, {
+ .mask = PTE_SHARED,
+ .val = PTE_SHARED,
+ .set = "SHD",
+ .clear = " ",
+ }, {
+ .mask = PTE_AF,
+ .val = PTE_AF,
+ .set = "AF",
+ .clear = " ",
+ }, {
+ .mask = PTE_NG,
+ .val = PTE_NG,
+ .set = "NG",
+ .clear = " ",
+ }, {
+ .mask = PTE_UXN,
+ .val = PTE_UXN,
+ .set = "UXN",
+ }, {
+ .mask = PTE_ATTRINDX_MASK,
+ .val = PTE_ATTRINDX(MT_DEVICE_nGnRnE),
+ .set = "DEVICE/nGnRnE",
+ }, {
+ .mask = PTE_ATTRINDX_MASK,
+ .val = PTE_ATTRINDX(MT_DEVICE_nGnRE),
+ .set = "DEVICE/nGnRE",
+ }, {
+ .mask = PTE_ATTRINDX_MASK,
+ .val = PTE_ATTRINDX(MT_DEVICE_GRE),
+ .set = "DEVICE/GRE",
+ }, {
+ .mask = PTE_ATTRINDX_MASK,
+ .val = PTE_ATTRINDX(MT_NORMAL_NC),
+ .set = "MEM/NORMAL-NC",
+ }, {
+ .mask = PTE_ATTRINDX_MASK,
+ .val = PTE_ATTRINDX(MT_NORMAL),
+ .set = "MEM/NORMAL",
+ }
+};
+
+struct pg_level {
+ const struct prot_bits *bits;
+ size_t num;
+ u64 mask;
+};
+
+static struct pg_level pg_level[] = {
+ {
+ }, { /* pgd */
+ .bits = pte_bits,
+ .num = ARRAY_SIZE(pte_bits),
+ }, { /* pud */
+ .bits = pte_bits,
+ .num = ARRAY_SIZE(pte_bits),
+ }, { /* pmd */
+ .bits = pte_bits,
+ .num = ARRAY_SIZE(pte_bits),
+ }, { /* pte */
+ .bits = pte_bits,
+ .num = ARRAY_SIZE(pte_bits),
+ },
+};
+
+static void dump_prot(struct pg_state *st, const struct prot_bits *bits,
+ size_t num)
+{
+ unsigned i;
+
+ for (i = 0; i < num; i++, bits++) {
+ const char *s;
+
+ if ((st->current_prot & bits->mask) == bits->val)
+ s = bits->set;
+ else
+ s = bits->clear;
+
+ if (s)
+ seq_printf(st->seq, " %s", s);
+ }
+}
+
+static void note_page(struct pg_state *st, unsigned long addr, unsigned level,
+ u64 val)
+{
+ static const char units[] = "KMGTPE";
+ u64 prot = val & pg_level[level].mask;
+
+ if (addr < LOWEST_ADDR)
+ return;
+
+ if (!st->level) {
+ st->level = level;
+ st->current_prot = prot;
+ st->start_address = addr;
+ seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
+ } else if (prot != st->current_prot || level != st->level ||
+ addr >= st->marker[1].start_address) {
+ const char *unit = units;
+ unsigned long delta;
+
+ if (st->current_prot) {
+ seq_printf(st->seq, "0x%16lx-0x%16lx ",
+ st->start_address, addr);
+
+ delta = (addr - st->start_address) >> 10;
+ while (!(delta & 1023) && unit[1]) {
+ delta >>= 10;
+ unit++;
+ }
+ seq_printf(st->seq, "%9lu%c", delta, *unit);
+ if (pg_level[st->level].bits)
+ dump_prot(st, pg_level[st->level].bits,
+ pg_level[st->level].num);
+ seq_puts(st->seq, "\n");
+ }
+
+ if (addr >= st->marker[1].start_address) {
+ st->marker++;
+ seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
+ }
+
+ st->start_address = addr;
+ st->current_prot = prot;
+ st->level = level;
+ }
+
+ if (addr >= st->marker[1].start_address) {
+ st->marker++;
+ seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
+ }
+
+}
+
+static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start)
+{
+ pte_t *pte = pte_offset_kernel(pmd, 0);
+ unsigned long addr;
+ unsigned i;
+
+ for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
+ addr = start + i * PAGE_SIZE;
+ note_page(st, addr, 4, pte_val(*pte));
+ }
+}
+
+static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start)
+{
+ pmd_t *pmd = pmd_offset(pud, 0);
+ unsigned long addr;
+ unsigned i;
+
+ for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
+ addr = start + i * PMD_SIZE;
+ if (pmd_none(*pmd) || pmd_sect(*pmd) || pmd_bad(*pmd))
+ note_page(st, addr, 3, pmd_val(*pmd));
+ else
+ walk_pte(st, pmd, addr);
+ }
+}
+
+static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
+{
+ pud_t *pud = pud_offset(pgd, 0);
+ unsigned long addr;
+ unsigned i;
+
+ for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
+ addr = start + i * PUD_SIZE;
+ if (pud_none(*pud) || pud_sect(*pud) || pud_bad(*pud))
+ note_page(st, addr, 2, pud_val(*pud));
+ else
+ walk_pmd(st, pud, addr);
+ }
+}
+
+static void walk_pgd(struct pg_state *st, struct mm_struct *mm, unsigned long start)
+{
+ pgd_t *pgd = pgd_offset(mm, 0);
+ unsigned i;
+ unsigned long addr;
+
+ for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
+ addr = start + i * PGDIR_SIZE;
+ if (pgd_none(*pgd) || pgd_bad(*pgd))
+ note_page(st, addr, 1, pgd_val(*pgd));
+ else
+ walk_pud(st, pgd, addr);
+ }
+}
+
+static int ptdump_show(struct seq_file *m, void *v)
+{
+ struct pg_state st = {
+ .seq = m,
+ .marker = address_markers,
+ };
+
+ walk_pgd(&st, &init_mm, LOWEST_ADDR);
+
+ note_page(&st, 0, 0, 0);
+ return 0;
+}
+
+static int ptdump_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, ptdump_show, NULL);
+}
+
+static const struct file_operations ptdump_fops = {
+ .open = ptdump_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int ptdump_init(void)
+{
+ struct dentry *pe;
+ unsigned i, j;
+
+ for (i = 0; i < ARRAY_SIZE(pg_level); i++)
+ if (pg_level[i].bits)
+ for (j = 0; j < pg_level[i].num; j++)
+ pg_level[i].mask |= pg_level[i].bits[j].mask;
+
+ address_markers[VMEMMAP_START_NR].start_address =
+ (unsigned long)virt_to_page(PAGE_OFFSET);
+ address_markers[VMEMMAP_END_NR].start_address =
+ (unsigned long)virt_to_page(high_memory);
+
+ pe = debugfs_create_file("kernel_page_tables", 0400, NULL, NULL,
+ &ptdump_fops);
+ return pe ? 0 : -ENOMEM;
+}
+device_initcall(ptdump_init);
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 71e5707ac7f4..44ba84434388 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -81,6 +81,11 @@ void show_pte(struct mm_struct *mm, unsigned long addr)
printk("\n");
}
+static bool is_el1_instruction_abort(unsigned int esr)
+{
+ return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_CUR;
+}
+
/*
* The kernel tried to access some page that wasn't present.
*/
@@ -89,8 +94,9 @@ static void __do_kernel_fault(struct mm_struct *mm, unsigned long addr,
{
/*
* Are we prepared to handle this kernel fault?
+ * We are almost certainly not prepared to handle instruction faults.
*/
- if (fixup_exception(regs))
+ if (!is_el1_instruction_abort(esr) && fixup_exception(regs))
return;
/*
@@ -153,8 +159,6 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re
#define VM_FAULT_BADMAP 0x010000
#define VM_FAULT_BADACCESS 0x020000
-#define ESR_LNX_EXEC (1 << 24)
-
static int __do_page_fault(struct mm_struct *mm, unsigned long addr,
unsigned int mm_flags, unsigned long vm_flags,
struct task_struct *tsk)
@@ -193,6 +197,26 @@ out:
return fault;
}
+static inline bool is_permission_fault(unsigned int esr, struct pt_regs *regs)
+{
+ unsigned int ec = ESR_ELx_EC(esr);
+ unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE;
+
+ if (ec != ESR_ELx_EC_DABT_CUR && ec != ESR_ELx_EC_IABT_CUR)
+ return false;
+
+ if (system_uses_ttbr0_pan())
+ return fsc_type == ESR_ELx_FSC_FAULT &&
+ (regs->pstate & PSR_PAN_BIT);
+ else
+ return fsc_type == ESR_ELx_FSC_PERM;
+}
+
+static bool is_el0_instruction_abort(unsigned int esr)
+{
+ return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW;
+}
+
static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
struct pt_regs *regs)
{
@@ -219,19 +243,24 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
if (user_mode(regs))
mm_flags |= FAULT_FLAG_USER;
- if (esr & ESR_LNX_EXEC) {
+ if (is_el0_instruction_abort(esr)) {
vm_flags = VM_EXEC;
- } else if ((esr & ESR_EL1_WRITE) && !(esr & ESR_EL1_CM)) {
+ } else if ((esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM)) {
vm_flags = VM_WRITE;
mm_flags |= FAULT_FLAG_WRITE;
}
- /*
- * PAN bit set implies the fault happened in kernel space, but not
- * in the arch's user access functions.
- */
- if (IS_ENABLED(CONFIG_ARM64_PAN) && (regs->pstate & PSR_PAN_BIT))
- goto no_context;
+ if (addr < USER_DS && is_permission_fault(esr, regs)) {
+ /* regs->orig_addr_limit may be 0 if we entered from EL0 */
+ if (regs->orig_addr_limit == KERNEL_DS)
+ die("Accessing user space memory with fs=KERNEL_DS", regs, esr);
+
+ if (is_el1_instruction_abort(esr))
+ die("Attempting to execute userspace memory", regs, esr);
+
+ if (!search_exception_tables(regs->pc))
+ panic("Accessing user space memory outside uaccess.h routines");
+ }
/*
* As per x86, we may deadlock here. However, since the kernel only
@@ -380,7 +409,7 @@ static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs)
return 1;
}
-static struct fault_info {
+static const struct fault_info {
int (*fn)(unsigned long addr, unsigned int esr, struct pt_regs *regs);
int sig;
int code;
@@ -542,8 +571,21 @@ asmlinkage int __exception do_debug_exception(unsigned long addr,
}
#ifdef CONFIG_ARM64_PAN
-void cpu_enable_pan(void)
+void cpu_enable_pan(void *__unused)
{
config_sctlr_el1(SCTLR_EL1_SPAN, 0);
}
#endif /* CONFIG_ARM64_PAN */
+
+#ifdef CONFIG_ARM64_UAO
+/*
+ * Kernel threads have fs=KERNEL_DS by default, and don't need to call
+ * set_fs(), devtmpfs in particular relies on this behaviour.
+ * We need to enable the feature at runtime (instead of adding it to
+ * PSR_MODE_EL1h) as the feature may not be implemented by the cpu.
+ */
+void cpu_enable_uao(void *__unused)
+{
+ asm(SET_PSTATE_UAO(1));
+}
+#endif /* CONFIG_ARM64_UAO */
diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c
index bfb8eb168f2d..a90615baa529 100644
--- a/arch/arm64/mm/flush.c
+++ b/arch/arm64/mm/flush.c
@@ -60,14 +60,10 @@ void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
unsigned long uaddr, void *dst, const void *src,
unsigned long len)
{
-#ifdef CONFIG_SMP
preempt_disable();
-#endif
memcpy(dst, src, len);
flush_ptrace_access(vma, page, uaddr, dst, len);
-#ifdef CONFIG_SMP
preempt_enable();
-#endif
}
void __sync_icache_dcache(pte_t pte, unsigned long addr)
@@ -98,7 +94,6 @@ EXPORT_SYMBOL(flush_dcache_page);
/*
* Additional functions defined in assembly.
*/
-EXPORT_SYMBOL(flush_cache_all);
EXPORT_SYMBOL(flush_icache_range);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index c49a0a8152cf..c7c9f2b289fb 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -68,7 +68,7 @@ early_param("initrd", early_initrd);
* currently assumes that for memory starting above 4G, 32-bit devices will
* use a DMA offset.
*/
-static phys_addr_t max_zone_dma_phys(void)
+static phys_addr_t __init max_zone_dma_phys(void)
{
phys_addr_t offset = memblock_start_of_DRAM() & GENMASK_ULL(63, 32);
return min(offset + (1ULL << 32), memblock_end_of_DRAM());
@@ -114,19 +114,21 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max)
}
#ifdef CONFIG_HAVE_ARCH_PFN_VALID
+#define PFN_MASK ((1UL << (64 - PAGE_SHIFT)) - 1)
+
int pfn_valid(unsigned long pfn)
{
- return memblock_is_memory(pfn << PAGE_SHIFT);
+ return (pfn & PFN_MASK) == pfn && memblock_is_memory(pfn << PAGE_SHIFT);
}
EXPORT_SYMBOL(pfn_valid);
#endif
#ifndef CONFIG_SPARSEMEM
-static void arm64_memory_present(void)
+static void __init arm64_memory_present(void)
{
}
#else
-static void arm64_memory_present(void)
+static void __init arm64_memory_present(void)
{
struct memblock_region *reg;
@@ -350,6 +352,7 @@ void __init mem_init(void)
void free_initmem(void)
{
+ fixup_init();
free_initmem_default(0);
free_alternatives_memory();
}
diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c
index 4a07630a6616..cbb99c8f1e04 100644
--- a/arch/arm64/mm/ioremap.c
+++ b/arch/arm64/mm/ioremap.c
@@ -103,97 +103,10 @@ void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size)
}
EXPORT_SYMBOL(ioremap_cache);
-static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss;
-#if CONFIG_ARM64_PGTABLE_LEVELS > 2
-static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss;
-#endif
-#if CONFIG_ARM64_PGTABLE_LEVELS > 3
-static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss;
-#endif
-
-static inline pud_t * __init early_ioremap_pud(unsigned long addr)
-{
- pgd_t *pgd;
-
- pgd = pgd_offset_k(addr);
- BUG_ON(pgd_none(*pgd) || pgd_bad(*pgd));
-
- return pud_offset(pgd, addr);
-}
-
-static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
-{
- pud_t *pud = early_ioremap_pud(addr);
-
- BUG_ON(pud_none(*pud) || pud_bad(*pud));
-
- return pmd_offset(pud, addr);
-}
-
-static inline pte_t * __init early_ioremap_pte(unsigned long addr)
-{
- pmd_t *pmd = early_ioremap_pmd(addr);
-
- BUG_ON(pmd_none(*pmd) || pmd_bad(*pmd));
-
- return pte_offset_kernel(pmd, addr);
-}
-
+/*
+ * Must be called after early_fixmap_init
+ */
void __init early_ioremap_init(void)
{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- unsigned long addr = fix_to_virt(FIX_BTMAP_BEGIN);
-
- pgd = pgd_offset_k(addr);
- pgd_populate(&init_mm, pgd, bm_pud);
- pud = pud_offset(pgd, addr);
- pud_populate(&init_mm, pud, bm_pmd);
- pmd = pmd_offset(pud, addr);
- pmd_populate_kernel(&init_mm, pmd, bm_pte);
-
- /*
- * The boot-ioremap range spans multiple pmds, for which
- * we are not prepared:
- */
- BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
- != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT));
-
- if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) {
- WARN_ON(1);
- pr_warn("pmd %p != %p\n",
- pmd, early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END)));
- pr_warn("fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
- fix_to_virt(FIX_BTMAP_BEGIN));
- pr_warn("fix_to_virt(FIX_BTMAP_END): %08lx\n",
- fix_to_virt(FIX_BTMAP_END));
-
- pr_warn("FIX_BTMAP_END: %d\n", FIX_BTMAP_END);
- pr_warn("FIX_BTMAP_BEGIN: %d\n",
- FIX_BTMAP_BEGIN);
- }
-
early_ioremap_setup();
}
-
-void __init __early_set_fixmap(enum fixed_addresses idx,
- phys_addr_t phys, pgprot_t flags)
-{
- unsigned long addr = __fix_to_virt(idx);
- pte_t *pte;
-
- if (idx >= __end_of_fixed_addresses) {
- BUG();
- return;
- }
-
- pte = early_ioremap_pte(addr);
-
- if (pgprot_val(flags))
- set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags));
- else {
- pte_clear(&init_mm, addr, pte);
- flush_tlb_kernel_range(addr, addr+PAGE_SIZE);
- }
-}
diff --git a/arch/arm64/mm/mm.h b/arch/arm64/mm/mm.h
index d519f4f50c8c..ef47d99b5cbc 100644
--- a/arch/arm64/mm/mm.h
+++ b/arch/arm64/mm/mm.h
@@ -1,2 +1,3 @@
extern void __init bootmem_init(void);
-extern void __init arm64_swiotlb_init(void);
+
+void fixup_init(void);
diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c
index 54922d1275b8..c9fbfabb8cb0 100644
--- a/arch/arm64/mm/mmap.c
+++ b/arch/arm64/mm/mmap.c
@@ -51,9 +51,14 @@ static unsigned long mmap_rnd(void)
{
unsigned long rnd = 0;
- if (current->flags & PF_RANDOMIZE)
- rnd = (long)get_random_int() & STACK_RND_MASK;
-
+ if (current->flags & PF_RANDOMIZE) {
+#ifdef CONFIG_COMPAT
+ if (test_thread_flag(TIF_32BIT))
+ rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
+ else
+#endif
+ rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
+ }
return rnd << PAGE_SHIFT;
}
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index f4f8b500f74c..6147b780f805 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -26,8 +26,11 @@
#include <linux/memblock.h>
#include <linux/fs.h>
#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/stop_machine.h>
#include <asm/cputype.h>
+#include <asm/fixmap.h>
#include <asm/sections.h>
#include <asm/setup.h>
#include <asm/sizes.h>
@@ -37,87 +40,15 @@
#include "mm.h"
+u64 idmap_t0sz = TCR_T0SZ(VA_BITS);
+
/*
* Empty_zero_page is a special page that is used for zero-initialized data
* and COW.
*/
-struct page *empty_zero_page;
+unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss;
EXPORT_SYMBOL(empty_zero_page);
-struct cachepolicy {
- const char policy[16];
- u64 mair;
- u64 tcr;
-};
-
-static struct cachepolicy cache_policies[] __initdata = {
- {
- .policy = "uncached",
- .mair = 0x44, /* inner, outer non-cacheable */
- .tcr = TCR_IRGN_NC | TCR_ORGN_NC,
- }, {
- .policy = "writethrough",
- .mair = 0xaa, /* inner, outer write-through, read-allocate */
- .tcr = TCR_IRGN_WT | TCR_ORGN_WT,
- }, {
- .policy = "writeback",
- .mair = 0xee, /* inner, outer write-back, read-allocate */
- .tcr = TCR_IRGN_WBnWA | TCR_ORGN_WBnWA,
- }
-};
-
-/*
- * These are useful for identifying cache coherency problems by allowing the
- * cache or the cache and writebuffer to be turned off. It changes the Normal
- * memory caching attributes in the MAIR_EL1 register.
- */
-static int __init early_cachepolicy(char *p)
-{
- int i;
- u64 tmp;
-
- for (i = 0; i < ARRAY_SIZE(cache_policies); i++) {
- int len = strlen(cache_policies[i].policy);
-
- if (memcmp(p, cache_policies[i].policy, len) == 0)
- break;
- }
- if (i == ARRAY_SIZE(cache_policies)) {
- pr_err("ERROR: unknown or unsupported cache policy: %s\n", p);
- return 0;
- }
-
- flush_cache_all();
-
- /*
- * Modify MT_NORMAL attributes in MAIR_EL1.
- */
- asm volatile(
- " mrs %0, mair_el1\n"
- " bfi %0, %1, %2, #8\n"
- " msr mair_el1, %0\n"
- " isb\n"
- : "=&r" (tmp)
- : "r" (cache_policies[i].mair), "i" (MT_NORMAL * 8));
-
- /*
- * Modify TCR PTW cacheability attributes.
- */
- asm volatile(
- " mrs %0, tcr_el1\n"
- " bic %0, %0, %2\n"
- " orr %0, %0, %1\n"
- " msr tcr_el1, %0\n"
- " isb\n"
- : "=&r" (tmp)
- : "r" (cache_policies[i].tcr), "r" (TCR_IRGN_MASK | TCR_ORGN_MASK));
-
- flush_cache_all();
-
- return 0;
-}
-early_param("cachepolicy", early_cachepolicy);
-
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
unsigned long size, pgprot_t vma_prot)
{
@@ -132,21 +63,43 @@ EXPORT_SYMBOL(phys_mem_access_prot);
static void __init *early_alloc(unsigned long sz)
{
void *ptr = __va(memblock_alloc(sz, sz));
+ BUG_ON(!ptr);
memset(ptr, 0, sz);
return ptr;
}
-static void __init alloc_init_pte(pmd_t *pmd, unsigned long addr,
+/*
+ * remap a PMD into pages
+ */
+static void split_pmd(pmd_t *pmd, pte_t *pte)
+{
+ unsigned long pfn = pmd_pfn(*pmd);
+ int i = 0;
+
+ do {
+ /*
+ * Need to have the least restrictive permissions available
+ * permissions will be fixed up later
+ */
+ set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
+ pfn++;
+ } while (pte++, i++, i < PTRS_PER_PTE);
+}
+
+static void alloc_init_pte(pmd_t *pmd, unsigned long addr,
unsigned long end, unsigned long pfn,
- pgprot_t prot)
+ pgprot_t prot,
+ void *(*alloc)(unsigned long size))
{
pte_t *pte;
- if (pmd_none(*pmd)) {
- pte = early_alloc(PTRS_PER_PTE * sizeof(pte_t));
+ if (pmd_none(*pmd) || pmd_bad(*pmd)) {
+ pte = alloc(PTRS_PER_PTE * sizeof(pte_t));
+ if (pmd_sect(*pmd))
+ split_pmd(pmd, pte);
__pmd_populate(pmd, __pa(pte), PMD_TYPE_TABLE);
+ flush_tlb_all();
}
- BUG_ON(pmd_bad(*pmd));
pte = pte_offset_kernel(pmd, addr);
do {
@@ -155,29 +108,40 @@ static void __init alloc_init_pte(pmd_t *pmd, unsigned long addr,
} while (pte++, addr += PAGE_SIZE, addr != end);
}
-static void __init alloc_init_pmd(pud_t *pud, unsigned long addr,
- unsigned long end, phys_addr_t phys,
- int map_io)
+void split_pud(pud_t *old_pud, pmd_t *pmd)
+{
+ unsigned long addr = pud_pfn(*old_pud) << PAGE_SHIFT;
+ pgprot_t prot = __pgprot(pud_val(*old_pud) ^ addr);
+ int i = 0;
+
+ do {
+ set_pmd(pmd, __pmd(addr | prot));
+ addr += PMD_SIZE;
+ } while (pmd++, i++, i < PTRS_PER_PMD);
+}
+
+static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud,
+ unsigned long addr, unsigned long end,
+ phys_addr_t phys, pgprot_t prot,
+ void *(*alloc)(unsigned long size))
{
pmd_t *pmd;
unsigned long next;
- pmdval_t prot_sect;
- pgprot_t prot_pte;
-
- if (map_io) {
- prot_sect = PROT_SECT_DEVICE_nGnRE;
- prot_pte = __pgprot(PROT_DEVICE_nGnRE);
- } else {
- prot_sect = PROT_SECT_NORMAL_EXEC;
- prot_pte = PAGE_KERNEL_EXEC;
- }
/*
* Check for initial section mappings in the pgd/pud and remove them.
*/
if (pud_none(*pud) || pud_bad(*pud)) {
- pmd = early_alloc(PTRS_PER_PMD * sizeof(pmd_t));
- pud_populate(&init_mm, pud, pmd);
+ pmd = alloc(PTRS_PER_PMD * sizeof(pmd_t));
+ if (pud_sect(*pud)) {
+ /*
+ * need to have the 1G of mappings continue to be
+ * present
+ */
+ split_pud(pud, pmd);
+ }
+ pud_populate(mm, pud, pmd);
+ flush_tlb_all();
}
pmd = pmd_offset(pud, addr);
@@ -186,31 +150,51 @@ static void __init alloc_init_pmd(pud_t *pud, unsigned long addr,
/* try section mapping first */
if (((addr | next | phys) & ~SECTION_MASK) == 0) {
pmd_t old_pmd =*pmd;
- set_pmd(pmd, __pmd(phys | prot_sect));
+ set_pmd(pmd, __pmd(phys |
+ pgprot_val(mk_sect_prot(prot))));
/*
* Check for previous table entries created during
* boot (__create_page_tables) and flush them.
*/
- if (!pmd_none(old_pmd))
+ if (!pmd_none(old_pmd)) {
flush_tlb_all();
+ if (pmd_table(old_pmd)) {
+ phys_addr_t table = __pa(pte_offset_map(&old_pmd, 0));
+ if (!WARN_ON_ONCE(slab_is_available()))
+ memblock_free(table, PAGE_SIZE);
+ }
+ }
} else {
alloc_init_pte(pmd, addr, next, __phys_to_pfn(phys),
- prot_pte);
+ prot, alloc);
}
phys += next - addr;
} while (pmd++, addr = next, addr != end);
}
-static void __init alloc_init_pud(pgd_t *pgd, unsigned long addr,
- unsigned long end, phys_addr_t phys,
- int map_io)
+static inline bool use_1G_block(unsigned long addr, unsigned long next,
+ unsigned long phys)
+{
+ if (PAGE_SHIFT != 12)
+ return false;
+
+ if (((addr | next | phys) & ~PUD_MASK) != 0)
+ return false;
+
+ return true;
+}
+
+static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd,
+ unsigned long addr, unsigned long end,
+ phys_addr_t phys, pgprot_t prot,
+ void *(*alloc)(unsigned long size))
{
pud_t *pud;
unsigned long next;
if (pgd_none(*pgd)) {
- pud = early_alloc(PTRS_PER_PUD * sizeof(pud_t));
- pgd_populate(&init_mm, pgd, pud);
+ pud = alloc(PTRS_PER_PUD * sizeof(pud_t));
+ pgd_populate(mm, pgd, pud);
}
BUG_ON(pgd_bad(*pgd));
@@ -221,10 +205,10 @@ static void __init alloc_init_pud(pgd_t *pgd, unsigned long addr,
/*
* For 4K granule only, attempt to put down a 1GB block
*/
- if (!map_io && (PAGE_SHIFT == 12) &&
- ((addr | next | phys) & ~PUD_MASK) == 0) {
+ if (use_1G_block(addr, next, phys)) {
pud_t old_pud = *pud;
- set_pud(pud, __pud(phys | PROT_SECT_NORMAL_EXEC));
+ set_pud(pud, __pud(phys |
+ pgprot_val(mk_sect_prot(prot))));
/*
* If we have an old value for a pud, it will
@@ -234,12 +218,15 @@ static void __init alloc_init_pud(pgd_t *pgd, unsigned long addr,
* Look up the old pmd table and free it.
*/
if (!pud_none(old_pud)) {
- phys_addr_t table = __pa(pmd_offset(&old_pud, 0));
- memblock_free(table, PAGE_SIZE);
flush_tlb_all();
+ if (pud_table(old_pud)) {
+ phys_addr_t table = __pa(pmd_offset(&old_pud, 0));
+ if (!WARN_ON_ONCE(slab_is_available()))
+ memblock_free(table, PAGE_SIZE);
+ }
}
} else {
- alloc_init_pmd(pud, addr, next, phys, map_io);
+ alloc_init_pmd(mm, pud, addr, next, phys, prot, alloc);
}
phys += next - addr;
} while (pud++, addr = next, addr != end);
@@ -249,9 +236,10 @@ static void __init alloc_init_pud(pgd_t *pgd, unsigned long addr,
* Create the page directory entries and any necessary page tables for the
* mapping specified by 'md'.
*/
-static void __init __create_mapping(pgd_t *pgd, phys_addr_t phys,
- unsigned long virt, phys_addr_t size,
- int map_io)
+static void __create_mapping(struct mm_struct *mm, pgd_t *pgd,
+ phys_addr_t phys, unsigned long virt,
+ phys_addr_t size, pgprot_t prot,
+ void *(*alloc)(unsigned long size))
{
unsigned long addr, length, end, next;
@@ -261,31 +249,94 @@ static void __init __create_mapping(pgd_t *pgd, phys_addr_t phys,
end = addr + length;
do {
next = pgd_addr_end(addr, end);
- alloc_init_pud(pgd, addr, next, phys, map_io);
+ alloc_init_pud(mm, pgd, addr, next, phys, prot, alloc);
phys += next - addr;
} while (pgd++, addr = next, addr != end);
}
-static void __init create_mapping(phys_addr_t phys, unsigned long virt,
- phys_addr_t size)
+static void *late_alloc(unsigned long size)
+{
+ void *ptr;
+
+ BUG_ON(size > PAGE_SIZE);
+ ptr = (void *)__get_free_page(PGALLOC_GFP);
+ BUG_ON(!ptr);
+ return ptr;
+}
+
+static void __ref create_mapping(phys_addr_t phys, unsigned long virt,
+ phys_addr_t size, pgprot_t prot)
{
if (virt < VMALLOC_START) {
pr_warn("BUG: not creating mapping for %pa at 0x%016lx - outside kernel range\n",
&phys, virt);
return;
}
- __create_mapping(pgd_offset_k(virt & PAGE_MASK), phys, virt, size, 0);
+ __create_mapping(&init_mm, pgd_offset_k(virt & PAGE_MASK), phys, virt,
+ size, prot, early_alloc);
+}
+
+void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
+ unsigned long virt, phys_addr_t size,
+ pgprot_t prot)
+{
+ __create_mapping(mm, pgd_offset(mm, virt), phys, virt, size, prot,
+ late_alloc);
}
-void __init create_id_mapping(phys_addr_t addr, phys_addr_t size, int map_io)
+static void create_mapping_late(phys_addr_t phys, unsigned long virt,
+ phys_addr_t size, pgprot_t prot)
{
- if ((addr >> PGDIR_SHIFT) >= ARRAY_SIZE(idmap_pg_dir)) {
- pr_warn("BUG: not creating id mapping for %pa\n", &addr);
+ if (virt < VMALLOC_START) {
+ pr_warn("BUG: not creating mapping for %pa at 0x%016lx - outside kernel range\n",
+ &phys, virt);
return;
}
- __create_mapping(&idmap_pg_dir[pgd_index(addr)],
- addr, addr, size, map_io);
+
+ return __create_mapping(&init_mm, pgd_offset_k(virt & PAGE_MASK),
+ phys, virt, size, prot, late_alloc);
+}
+
+#ifdef CONFIG_DEBUG_RODATA
+static void __init __map_memblock(phys_addr_t start, phys_addr_t end)
+{
+ /*
+ * Set up the executable regions using the existing section mappings
+ * for now. This will get more fine grained later once all memory
+ * is mapped
+ */
+ unsigned long kernel_x_start = round_down(__pa(_stext), SECTION_SIZE);
+ unsigned long kernel_x_end = round_up(__pa(__init_end), SECTION_SIZE);
+
+ if (end < kernel_x_start) {
+ create_mapping(start, __phys_to_virt(start),
+ end - start, PAGE_KERNEL);
+ } else if (start >= kernel_x_end) {
+ create_mapping(start, __phys_to_virt(start),
+ end - start, PAGE_KERNEL);
+ } else {
+ if (start < kernel_x_start)
+ create_mapping(start, __phys_to_virt(start),
+ kernel_x_start - start,
+ PAGE_KERNEL);
+ create_mapping(kernel_x_start,
+ __phys_to_virt(kernel_x_start),
+ kernel_x_end - kernel_x_start,
+ PAGE_KERNEL_EXEC);
+ if (kernel_x_end < end)
+ create_mapping(kernel_x_end,
+ __phys_to_virt(kernel_x_end),
+ end - kernel_x_end,
+ PAGE_KERNEL);
+ }
+}
+#else
+static void __init __map_memblock(phys_addr_t start, phys_addr_t end)
+{
+ create_mapping(start, __phys_to_virt(start), end - start,
+ PAGE_KERNEL_EXEC);
}
+#endif
static void __init map_mem(void)
{
@@ -331,53 +382,70 @@ static void __init map_mem(void)
memblock_set_current_limit(limit);
}
#endif
-
- create_mapping(start, __phys_to_virt(start), end - start);
+ __map_memblock(start, end);
}
/* Limit no longer required. */
memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
}
+void __init fixup_executable(void)
+{
+#ifdef CONFIG_DEBUG_RODATA
+ /* now that we are actually fully mapped, make the start/end more fine grained */
+ if (!IS_ALIGNED((unsigned long)_stext, SECTION_SIZE)) {
+ unsigned long aligned_start = round_down(__pa(_stext),
+ SECTION_SIZE);
+
+ create_mapping(aligned_start, __phys_to_virt(aligned_start),
+ __pa(_stext) - aligned_start,
+ PAGE_KERNEL);
+ }
+
+ if (!IS_ALIGNED((unsigned long)__init_end, SECTION_SIZE)) {
+ unsigned long aligned_end = round_up(__pa(__init_end),
+ SECTION_SIZE);
+ create_mapping(__pa(__init_end), (unsigned long)__init_end,
+ aligned_end - __pa(__init_end),
+ PAGE_KERNEL);
+ }
+#endif
+}
+
+#ifdef CONFIG_DEBUG_RODATA
+void mark_rodata_ro(void)
+{
+ create_mapping_late(__pa(_stext), (unsigned long)_stext,
+ (unsigned long)__init_begin - (unsigned long)_stext,
+ PAGE_KERNEL_EXEC | PTE_RDONLY);
+}
+#endif
+
+void fixup_init(void)
+{
+ create_mapping_late(__pa(__init_begin), (unsigned long)__init_begin,
+ (unsigned long)__init_end - (unsigned long)__init_begin,
+ PAGE_KERNEL);
+}
+
/*
* paging_init() sets up the page tables, initialises the zone memory
* maps and sets up the zero page.
*/
void __init paging_init(void)
{
- void *zero_page;
-
map_mem();
-
- /*
- * Finally flush the caches and tlb to ensure that we're in a
- * consistent state.
- */
- flush_cache_all();
- flush_tlb_all();
-
- /* allocate the zero page. */
- zero_page = early_alloc(PAGE_SIZE);
+ fixup_executable();
bootmem_init();
- empty_zero_page = virt_to_page(zero_page);
-
/*
* TTBR0 is only used for the identity mapping at this stage. Make it
* point to zero page to avoid speculatively fetching new entries.
*/
cpu_set_reserved_ttbr0();
- flush_tlb_all();
-}
-
-/*
- * Enable the identity mapping to allow the MMU disabling.
- */
-void setup_mm_for_reboot(void)
-{
- cpu_switch_mm(idmap_pg_dir, &init_mm);
- flush_tlb_all();
+ local_flush_tlb_all();
+ cpu_set_default_tcr_t0sz();
}
/*
@@ -463,3 +531,96 @@ void vmemmap_free(unsigned long start, unsigned long end)
{
}
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+
+static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss;
+#if CONFIG_PGTABLE_LEVELS > 2
+static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss;
+#endif
+#if CONFIG_PGTABLE_LEVELS > 3
+static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss;
+#endif
+
+static inline pud_t * fixmap_pud(unsigned long addr)
+{
+ pgd_t *pgd = pgd_offset_k(addr);
+
+ BUG_ON(pgd_none(*pgd) || pgd_bad(*pgd));
+
+ return pud_offset(pgd, addr);
+}
+
+static inline pmd_t * fixmap_pmd(unsigned long addr)
+{
+ pud_t *pud = fixmap_pud(addr);
+
+ BUG_ON(pud_none(*pud) || pud_bad(*pud));
+
+ return pmd_offset(pud, addr);
+}
+
+static inline pte_t * fixmap_pte(unsigned long addr)
+{
+ pmd_t *pmd = fixmap_pmd(addr);
+
+ BUG_ON(pmd_none(*pmd) || pmd_bad(*pmd));
+
+ return pte_offset_kernel(pmd, addr);
+}
+
+void __init early_fixmap_init(void)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ unsigned long addr = FIXADDR_START;
+
+ pgd = pgd_offset_k(addr);
+ pgd_populate(&init_mm, pgd, bm_pud);
+ pud = pud_offset(pgd, addr);
+ pud_populate(&init_mm, pud, bm_pmd);
+ pmd = pmd_offset(pud, addr);
+ pmd_populate_kernel(&init_mm, pmd, bm_pte);
+
+ /*
+ * The boot-ioremap range spans multiple pmds, for which
+ * we are not preparted:
+ */
+ BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
+ != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT));
+
+ if ((pmd != fixmap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)))
+ || pmd != fixmap_pmd(fix_to_virt(FIX_BTMAP_END))) {
+ WARN_ON(1);
+ pr_warn("pmd %p != %p, %p\n",
+ pmd, fixmap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)),
+ fixmap_pmd(fix_to_virt(FIX_BTMAP_END)));
+ pr_warn("fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
+ fix_to_virt(FIX_BTMAP_BEGIN));
+ pr_warn("fix_to_virt(FIX_BTMAP_END): %08lx\n",
+ fix_to_virt(FIX_BTMAP_END));
+
+ pr_warn("FIX_BTMAP_END: %d\n", FIX_BTMAP_END);
+ pr_warn("FIX_BTMAP_BEGIN: %d\n", FIX_BTMAP_BEGIN);
+ }
+}
+
+void __set_fixmap(enum fixed_addresses idx,
+ phys_addr_t phys, pgprot_t flags)
+{
+ unsigned long addr = __fix_to_virt(idx);
+ pte_t *pte;
+
+ if (idx >= __end_of_fixed_addresses) {
+ BUG();
+ return;
+ }
+
+ pte = fixmap_pte(addr);
+
+ if (pgprot_val(flags)) {
+ set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags));
+ } else {
+ pte_clear(&init_mm, addr, pte);
+ flush_tlb_kernel_range(addr, addr+PAGE_SIZE);
+ }
+}
diff --git a/arch/arm64/mm/proc-macros.S b/arch/arm64/mm/proc-macros.S
deleted file mode 100644
index 005d29e2977d..000000000000
--- a/arch/arm64/mm/proc-macros.S
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Based on arch/arm/mm/proc-macros.S
- *
- * Copyright (C) 2012 ARM Ltd.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <asm/asm-offsets.h>
-#include <asm/thread_info.h>
-
-/*
- * vma_vm_mm - get mm pointer from vma pointer (vma->vm_mm)
- */
- .macro vma_vm_mm, rd, rn
- ldr \rd, [\rn, #VMA_VM_MM]
- .endm
-
-/*
- * mmid - get context id from mm pointer (mm->context.id)
- */
- .macro mmid, rd, rn
- ldr \rd, [\rn, #MM_CONTEXT_ID]
- .endm
-
-/*
- * dcache_line_size - get the minimum D-cache line size from the CTR register.
- */
- .macro dcache_line_size, reg, tmp
- mrs \tmp, ctr_el0 // read CTR
- ubfm \tmp, \tmp, #16, #19 // cache line size encoding
- mov \reg, #4 // bytes per word
- lsl \reg, \reg, \tmp // actual cache line size
- .endm
-
-/*
- * icache_line_size - get the minimum I-cache line size from the CTR register.
- */
- .macro icache_line_size, reg, tmp
- mrs \tmp, ctr_el0 // read CTR
- and \tmp, \tmp, #0xf // cache line size encoding
- mov \reg, #4 // bytes per word
- lsl \reg, \reg, \tmp // actual cache line size
- .endm
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index 302353d9150b..ce2c71a52d05 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -23,22 +23,15 @@
#include <asm/assembler.h>
#include <asm/asm-offsets.h>
#include <asm/hwcap.h>
-#include <asm/pgtable-hwdef.h>
#include <asm/pgtable.h>
-#include "proc-macros.S"
-
#ifdef CONFIG_ARM64_64K_PAGES
#define TCR_TG_FLAGS TCR_TG0_64K | TCR_TG1_64K
#else
#define TCR_TG_FLAGS TCR_TG0_4K | TCR_TG1_4K
#endif
-#ifdef CONFIG_SMP
#define TCR_SMP_FLAGS TCR_SHARED
-#else
-#define TCR_SMP_FLAGS 0
-#endif
/* PTWs cacheable, inner/outer WBWA */
#define TCR_CACHE_FLAGS TCR_IRGN_WBWA | TCR_ORGN_WBWA
@@ -46,52 +39,6 @@
#define MAIR(attr, mt) ((attr) << ((mt) * 8))
/*
- * cpu_cache_off()
- *
- * Turn the CPU D-cache off.
- */
-ENTRY(cpu_cache_off)
- mrs x0, sctlr_el1
- bic x0, x0, #1 << 2 // clear SCTLR.C
- msr sctlr_el1, x0
- isb
- ret
-ENDPROC(cpu_cache_off)
-
-/*
- * cpu_reset(loc)
- *
- * Perform a soft reset of the system. Put the CPU into the same state
- * as it would be if it had been reset, and branch to what would be the
- * reset vector. It must be executed with the flat identity mapping.
- *
- * - loc - location to jump to for soft reset
- */
- .align 5
-ENTRY(cpu_reset)
- mrs x1, sctlr_el1
- bic x1, x1, #1
- msr sctlr_el1, x1 // disable the MMU
- isb
- ret x0
-ENDPROC(cpu_reset)
-
-ENTRY(cpu_soft_restart)
- /* Save address of cpu_reset() and reset address */
- mov x19, x0
- mov x20, x1
-
- /* Turn D-cache off */
- bl cpu_cache_off
-
- /* Push out all dirty data, and ensure cache is empty */
- bl flush_cache_all
-
- mov x0, x20
- ret x19
-ENDPROC(cpu_soft_restart)
-
-/*
* cpu_do_idle()
*
* Idle the processor (wait for interrupt).
@@ -156,6 +103,7 @@ ENTRY(cpu_do_resume)
msr cpacr_el1, x6
msr ttbr0_el1, x1
msr ttbr1_el1, x7
+ tcr_set_idmap_t0sz x8, x7
msr tcr_el1, x8
msr vbar_el1, x9
msr mdscr_el1, x10
@@ -179,7 +127,7 @@ ENDPROC(cpu_do_resume)
* - pgd_phys - physical address of new TTB
*/
ENTRY(cpu_do_switch_mm)
- mmid w1, x1 // get mm->context.id
+ mmid x1, x1 // get mm->context.id
bfi x0, x1, #48, #16 // set the ASID
msr ttbr0_el1, x0 // set TTBR0
isb
@@ -195,13 +143,13 @@ ENDPROC(cpu_do_switch_mm)
* value of the SCTLR_EL1 register.
*/
ENTRY(__cpu_setup)
- ic iallu // I+BTB cache invalidate
- tlbi vmalle1is // invalidate I + D TLBs
- dsb ish
+ tlbi vmalle1 // Invalidate local TLB
+ dsb nsh
mov x0, #3 << 20
msr cpacr_el1, x0 // Enable FP/ASIMD
- msr mdscr_el1, xzr // Reset mdscr_el1
+ mov x0, #1 << 12 // Reset mdscr_el1 and disable
+ msr mdscr_el1, x0 // access to the DCC from EL0
isb // Unmask debug exceptions now,
enable_dbg // since this is per-cpu
/*
@@ -235,6 +183,8 @@ ENTRY(__cpu_setup)
*/
ldr x10, =TCR_TxSZ(VA_BITS) | TCR_CACHE_FLAGS | TCR_SMP_FLAGS | \
TCR_TG_FLAGS | TCR_ASID16 | TCR_TBI0
+ tcr_set_idmap_t0sz x10, x9
+
/*
* Read the PARange bits from ID_AA64MMFR0_EL1 and set the IPS bits in
* TCR_EL1.
diff --git a/arch/arm64/xen/hypercall.S b/arch/arm64/xen/hypercall.S
index 8bbe9401f4f0..b96db5dafec4 100644
--- a/arch/arm64/xen/hypercall.S
+++ b/arch/arm64/xen/hypercall.S
@@ -49,6 +49,7 @@
#include <linux/linkage.h>
#include <asm/assembler.h>
+#include <asm/uaccess.h>
#include <xen/interface/xen.h>
@@ -89,6 +90,20 @@ ENTRY(privcmd_call)
mov x2, x3
mov x3, x4
mov x4, x5
+ /*
+ * Privcmd calls are issued by the userspace. The kernel needs to
+ * enable access to TTBR0_EL1 as the hypervisor would issue stage 1
+ * translations to user memory via AT instructions. Since AT
+ * instructions are not affected by the PAN bit (ARMv8.1), we only
+ * need the explicit uaccess_enable/disable if the TTBR0 PAN emulation
+ * is enabled (it implies that hardware UAO and PAN disabled).
+ */
+ uaccess_ttbr0_enable x6, x7
hvc XEN_IMM
+
+ /*
+ * Disable userspace access from kernel once the hyp call completed.
+ */
+ uaccess_ttbr0_disable x6
ret
ENDPROC(privcmd_call);
diff --git a/arch/avr32/include/asm/thread_info.h b/arch/avr32/include/asm/thread_info.h
index a978f3fe7c25..d56afa99a514 100644
--- a/arch/avr32/include/asm/thread_info.h
+++ b/arch/avr32/include/asm/thread_info.h
@@ -30,7 +30,6 @@ struct thread_info {
saved by debug handler
when setting up
trampoline */
- struct restart_block restart_block;
__u8 supervisor_stack[0];
};
@@ -41,9 +40,6 @@ struct thread_info {
.flags = 0, \
.cpu = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
- .restart_block = { \
- .fn = do_no_restart_syscall \
- } \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/avr32/kernel/asm-offsets.c b/arch/avr32/kernel/asm-offsets.c
index d6a8193a1d2f..e41c84516e5d 100644
--- a/arch/avr32/kernel/asm-offsets.c
+++ b/arch/avr32/kernel/asm-offsets.c
@@ -18,7 +18,6 @@ void foo(void)
OFFSET(TI_preempt_count, thread_info, preempt_count);
OFFSET(TI_rar_saved, thread_info, rar_saved);
OFFSET(TI_rsr_saved, thread_info, rsr_saved);
- OFFSET(TI_restart_block, thread_info, restart_block);
BLANK();
OFFSET(TSK_active_mm, task_struct, active_mm);
BLANK();
diff --git a/arch/avr32/kernel/signal.c b/arch/avr32/kernel/signal.c
index d309fbcc3bd6..8f1c63b9b983 100644
--- a/arch/avr32/kernel/signal.c
+++ b/arch/avr32/kernel/signal.c
@@ -69,7 +69,7 @@ asmlinkage int sys_rt_sigreturn(struct pt_regs *regs)
sigset_t set;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
frame = (struct rt_sigframe __user *)regs->sp;
pr_debug("SIG return: frame = %p\n", frame);
diff --git a/arch/blackfin/include/asm/thread_info.h b/arch/blackfin/include/asm/thread_info.h
index 55f473bdad36..57c3a8bd583d 100644
--- a/arch/blackfin/include/asm/thread_info.h
+++ b/arch/blackfin/include/asm/thread_info.h
@@ -42,7 +42,6 @@ struct thread_info {
int cpu; /* cpu we're on */
int preempt_count; /* 0 => preemptable, <0 => BUG */
mm_segment_t addr_limit; /* address limit */
- struct restart_block restart_block;
#ifndef CONFIG_SMP
struct l1_scratch_task_info l1_task_info;
#endif
@@ -58,9 +57,6 @@ struct thread_info {
.flags = 0, \
.cpu = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
#define init_stack (init_thread_union.stack)
diff --git a/arch/blackfin/kernel/signal.c b/arch/blackfin/kernel/signal.c
index ef275571d885..f2a8b5493bd3 100644
--- a/arch/blackfin/kernel/signal.c
+++ b/arch/blackfin/kernel/signal.c
@@ -44,7 +44,7 @@ rt_restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *p
int err = 0;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
#define RESTORE(x) err |= __get_user(regs->x, &sc->sc_##x)
diff --git a/arch/c6x/include/asm/thread_info.h b/arch/c6x/include/asm/thread_info.h
index d4e9ef87076d..584e253f3217 100644
--- a/arch/c6x/include/asm/thread_info.h
+++ b/arch/c6x/include/asm/thread_info.h
@@ -45,7 +45,6 @@ struct thread_info {
int cpu; /* cpu we're on */
int preempt_count; /* 0 = preemptable, <0 = BUG */
mm_segment_t addr_limit; /* thread address space */
- struct restart_block restart_block;
};
/*
@@ -61,9 +60,6 @@ struct thread_info {
.cpu = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/c6x/kernel/signal.c b/arch/c6x/kernel/signal.c
index fe68226f6c4d..3c4bb5a5c382 100644
--- a/arch/c6x/kernel/signal.c
+++ b/arch/c6x/kernel/signal.c
@@ -68,7 +68,7 @@ asmlinkage int do_rt_sigreturn(struct pt_regs *regs)
sigset_t set;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
/*
* Since we stacked the signal on a dword boundary,
diff --git a/arch/cris/arch-v10/kernel/signal.c b/arch/cris/arch-v10/kernel/signal.c
index 9b32d338838b..74d7ba35120d 100644
--- a/arch/cris/arch-v10/kernel/signal.c
+++ b/arch/cris/arch-v10/kernel/signal.c
@@ -67,7 +67,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc)
unsigned long old_usp;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
/* restore the regs from &sc->regs (same as sc, since regs is first)
* (sc is already checked for VERIFY_READ since the sigframe was
diff --git a/arch/cris/arch-v32/kernel/signal.c b/arch/cris/arch-v32/kernel/signal.c
index 78ce3b1c9bcb..870e3e069318 100644
--- a/arch/cris/arch-v32/kernel/signal.c
+++ b/arch/cris/arch-v32/kernel/signal.c
@@ -59,7 +59,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc)
unsigned long old_usp;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
/*
* Restore the registers from &sc->regs. sc is already checked
diff --git a/arch/cris/include/asm/thread_info.h b/arch/cris/include/asm/thread_info.h
index 55dede18c032..7286db5ed90e 100644
--- a/arch/cris/include/asm/thread_info.h
+++ b/arch/cris/include/asm/thread_info.h
@@ -38,7 +38,6 @@ struct thread_info {
0-0xBFFFFFFF for user-thead
0-0xFFFFFFFF for kernel-thread
*/
- struct restart_block restart_block;
__u8 supervisor_stack[0];
};
@@ -56,9 +55,6 @@ struct thread_info {
.cpu = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/frv/include/asm/thread_info.h b/arch/frv/include/asm/thread_info.h
index af29e17c0181..6b917f1c2955 100644
--- a/arch/frv/include/asm/thread_info.h
+++ b/arch/frv/include/asm/thread_info.h
@@ -41,7 +41,6 @@ struct thread_info {
* 0-0xBFFFFFFF for user-thead
* 0-0xFFFFFFFF for kernel-thread
*/
- struct restart_block restart_block;
__u8 supervisor_stack[0];
};
@@ -65,9 +64,6 @@ struct thread_info {
.cpu = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/frv/kernel/asm-offsets.c b/arch/frv/kernel/asm-offsets.c
index 9de96843a278..446e89d500cc 100644
--- a/arch/frv/kernel/asm-offsets.c
+++ b/arch/frv/kernel/asm-offsets.c
@@ -40,7 +40,6 @@ void foo(void)
OFFSET(TI_CPU, thread_info, cpu);
OFFSET(TI_PREEMPT_COUNT, thread_info, preempt_count);
OFFSET(TI_ADDR_LIMIT, thread_info, addr_limit);
- OFFSET(TI_RESTART_BLOCK, thread_info, restart_block);
BLANK();
/* offsets into register file storage */
diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c
index dc3d59de0870..336713ab4745 100644
--- a/arch/frv/kernel/signal.c
+++ b/arch/frv/kernel/signal.c
@@ -62,7 +62,7 @@ static int restore_sigcontext(struct sigcontext __user *sc, int *_gr8)
unsigned long tbr, psr;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
tbr = user->i.tbr;
psr = user->i.psr;
diff --git a/arch/hexagon/include/asm/thread_info.h b/arch/hexagon/include/asm/thread_info.h
index a59dad3b3695..bacd3d6030c5 100644
--- a/arch/hexagon/include/asm/thread_info.h
+++ b/arch/hexagon/include/asm/thread_info.h
@@ -56,7 +56,6 @@ struct thread_info {
* used for syscalls somehow;
* seems to have a function pointer and four arguments
*/
- struct restart_block restart_block;
/* Points to the current pt_regs frame */
struct pt_regs *regs;
/*
@@ -83,9 +82,6 @@ struct thread_info {
.cpu = 0, \
.preempt_count = 1, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
.sp = 0, \
.regs = NULL, \
}
diff --git a/arch/hexagon/kernel/signal.c b/arch/hexagon/kernel/signal.c
index eadd70e47e7e..b039a624c170 100644
--- a/arch/hexagon/kernel/signal.c
+++ b/arch/hexagon/kernel/signal.c
@@ -239,7 +239,7 @@ asmlinkage int sys_rt_sigreturn(void)
sigset_t blocked;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
frame = (struct rt_sigframe __user *)pt_psp(regs);
if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
diff --git a/arch/ia64/Makefile b/arch/ia64/Makefile
index 5441b14994fc..4c5736930539 100644
--- a/arch/ia64/Makefile
+++ b/arch/ia64/Makefile
@@ -96,8 +96,8 @@ define archhelp
echo '* unwcheck - Check vmlinux for invalid unwind info'
endef
-archprepare: make_nr_irqs_h FORCE
+archprepare: make_nr_irqs_h
PHONY += make_nr_irqs_h FORCE
-make_nr_irqs_h: FORCE
+make_nr_irqs_h:
$(Q)$(MAKE) $(build)=arch/ia64/kernel include/generated/nr-irqs.h
diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h
index 5b17418b4223..c16f21a068ff 100644
--- a/arch/ia64/include/asm/thread_info.h
+++ b/arch/ia64/include/asm/thread_info.h
@@ -27,7 +27,6 @@ struct thread_info {
__u32 status; /* Thread synchronous flags */
mm_segment_t addr_limit; /* user-level address space limit */
int preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */
- struct restart_block restart_block;
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
__u64 ac_stamp;
__u64 ac_leave;
@@ -46,9 +45,6 @@ struct thread_info {
.cpu = 0, \
.addr_limit = KERNEL_DS, \
.preempt_count = INIT_PREEMPT_COUNT, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#ifndef ASM_OFFSETS_C
diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c
index 6d92170be457..b3a124da71e5 100644
--- a/arch/ia64/kernel/signal.c
+++ b/arch/ia64/kernel/signal.c
@@ -46,7 +46,7 @@ restore_sigcontext (struct sigcontext __user *sc, struct sigscratch *scr)
long err;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
/* restore scratch that always needs gets updated during signal delivery: */
err = __get_user(flags, &sc->sc_flags);
diff --git a/arch/m32r/include/asm/thread_info.h b/arch/m32r/include/asm/thread_info.h
index 00171703402f..32422d0211c3 100644
--- a/arch/m32r/include/asm/thread_info.h
+++ b/arch/m32r/include/asm/thread_info.h
@@ -34,7 +34,6 @@ struct thread_info {
0-0xBFFFFFFF for user-thread
0-0xFFFFFFFF for kernel-thread
*/
- struct restart_block restart_block;
__u8 supervisor_stack[0];
};
@@ -49,7 +48,6 @@ struct thread_info {
#define TI_CPU 0x00000010
#define TI_PRE_COUNT 0x00000014
#define TI_ADDR_LIMIT 0x00000018
-#define TI_RESTART_BLOCK 0x000001C
#endif
@@ -68,9 +66,6 @@ struct thread_info {
.cpu = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/m32r/kernel/signal.c b/arch/m32r/kernel/signal.c
index 95408b8f130a..7736c6660a15 100644
--- a/arch/m32r/kernel/signal.c
+++ b/arch/m32r/kernel/signal.c
@@ -48,7 +48,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
unsigned int err = 0;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
#define COPY(x) err |= __get_user(regs->x, &sc->sc_##x)
COPY(r4);
diff --git a/arch/m68k/include/asm/thread_info.h b/arch/m68k/include/asm/thread_info.h
index 21a4784ca5a1..c54256e69e64 100644
--- a/arch/m68k/include/asm/thread_info.h
+++ b/arch/m68k/include/asm/thread_info.h
@@ -31,7 +31,6 @@ struct thread_info {
int preempt_count; /* 0 => preemptable, <0 => BUG */
__u32 cpu; /* should always be 0 on m68k */
unsigned long tp_value; /* thread pointer */
- struct restart_block restart_block;
};
#endif /* __ASSEMBLY__ */
@@ -41,9 +40,6 @@ struct thread_info {
.exec_domain = &default_exec_domain, \
.addr_limit = KERNEL_DS, \
.preempt_count = INIT_PREEMPT_COUNT, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_stack (init_thread_union.stack)
diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c
index 967a8b7e1527..d7179281e74a 100644
--- a/arch/m68k/kernel/signal.c
+++ b/arch/m68k/kernel/signal.c
@@ -655,7 +655,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *usc, void __u
int err = 0;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
/* get previous context */
if (copy_from_user(&context, usc, sizeof(context)))
@@ -693,7 +693,7 @@ rt_restore_ucontext(struct pt_regs *regs, struct switch_stack *sw,
int err;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
err = __get_user(temp, &uc->uc_mcontext.version);
if (temp != MCONTEXT_VERSION)
diff --git a/arch/metag/include/asm/thread_info.h b/arch/metag/include/asm/thread_info.h
index 47711336119e..afb3ca4776d1 100644
--- a/arch/metag/include/asm/thread_info.h
+++ b/arch/metag/include/asm/thread_info.h
@@ -35,9 +35,8 @@ struct thread_info {
int preempt_count; /* 0 => preemptable, <0 => BUG */
mm_segment_t addr_limit; /* thread address space */
- struct restart_block restart_block;
- u8 supervisor_stack[0];
+ u8 supervisor_stack[0] __aligned(8);
};
#else /* !__ASSEMBLY__ */
@@ -74,9 +73,6 @@ struct thread_info {
.cpu = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/metag/kernel/signal.c b/arch/metag/kernel/signal.c
index 0d100d5c1407..ce49d429c74a 100644
--- a/arch/metag/kernel/signal.c
+++ b/arch/metag/kernel/signal.c
@@ -48,7 +48,7 @@ static int restore_sigcontext(struct pt_regs *regs,
int err;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
err = metag_gp_regs_copyin(regs, 0, sizeof(struct user_gp_regs), NULL,
&sc->regs);
diff --git a/arch/microblaze/include/asm/thread_info.h b/arch/microblaze/include/asm/thread_info.h
index 8c9d36591a03..b699fbd7de4a 100644
--- a/arch/microblaze/include/asm/thread_info.h
+++ b/arch/microblaze/include/asm/thread_info.h
@@ -71,7 +71,6 @@ struct thread_info {
__u32 cpu; /* current CPU */
__s32 preempt_count; /* 0 => preemptable,< 0 => BUG*/
mm_segment_t addr_limit; /* thread address space */
- struct restart_block restart_block;
struct cpu_context cpu_context;
};
@@ -87,9 +86,6 @@ struct thread_info {
.cpu = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/microblaze/kernel/signal.c b/arch/microblaze/kernel/signal.c
index 8955a3829cf0..0245c27fa720 100644
--- a/arch/microblaze/kernel/signal.c
+++ b/arch/microblaze/kernel/signal.c
@@ -89,7 +89,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
int rval;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
goto badframe;
diff --git a/arch/mips/include/asm/thread_info.h b/arch/mips/include/asm/thread_info.h
index 7de865805deb..955bc921855a 100644
--- a/arch/mips/include/asm/thread_info.h
+++ b/arch/mips/include/asm/thread_info.h
@@ -34,7 +34,6 @@ struct thread_info {
* 0x7fffffff for user-thead
* 0xffffffff for kernel-thread
*/
- struct restart_block restart_block;
struct pt_regs *regs;
};
@@ -49,9 +48,6 @@ struct thread_info {
.cpu = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c
index b1d84bd4efb3..3b2dfdb4865f 100644
--- a/arch/mips/kernel/asm-offsets.c
+++ b/arch/mips/kernel/asm-offsets.c
@@ -98,7 +98,6 @@ void output_thread_info_defines(void)
OFFSET(TI_CPU, thread_info, cpu);
OFFSET(TI_PRE_COUNT, thread_info, preempt_count);
OFFSET(TI_ADDR_LIMIT, thread_info, addr_limit);
- OFFSET(TI_RESTART_BLOCK, thread_info, restart_block);
OFFSET(TI_REGS, thread_info, regs);
DEFINE(_THREAD_SIZE, THREAD_SIZE);
DEFINE(_THREAD_MASK, THREAD_MASK);
diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c
index 16f1e4f2bf3c..27a1646a7540 100644
--- a/arch/mips/kernel/signal.c
+++ b/arch/mips/kernel/signal.c
@@ -243,7 +243,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc)
int i;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
err |= __get_user(regs->cp0_epc, &sc->sc_pc);
diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c
index f019f100a4bd..5d7f2634996f 100644
--- a/arch/mips/kernel/signal32.c
+++ b/arch/mips/kernel/signal32.c
@@ -220,7 +220,7 @@ static int restore_sigcontext32(struct pt_regs *regs,
int i;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
err |= __get_user(regs->cp0_epc, &sc->sc_pc);
err |= __get_user(regs->hi, &sc->sc_mdhi);
diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c
index 9be924f08f34..7f463465b4e8 100644
--- a/arch/mips/mm/mmap.c
+++ b/arch/mips/mm/mmap.c
@@ -147,7 +147,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
unsigned long random_factor = 0UL;
if (current->flags & PF_RANDOMIZE) {
- random_factor = get_random_int();
+ random_factor = get_random_long();
random_factor = random_factor << PAGE_SHIFT;
if (TASK_IS_32BIT_ADDR)
random_factor &= 0xfffffful;
@@ -166,7 +166,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
static inline unsigned long brk_rnd(void)
{
- unsigned long rnd = get_random_int();
+ unsigned long rnd = get_random_long();
rnd = rnd << PAGE_SHIFT;
/* 8MB for 32bit, 256MB for 64bit */
diff --git a/arch/mn10300/include/asm/thread_info.h b/arch/mn10300/include/asm/thread_info.h
index bf280eaccd36..c1c374f0ec12 100644
--- a/arch/mn10300/include/asm/thread_info.h
+++ b/arch/mn10300/include/asm/thread_info.h
@@ -50,7 +50,6 @@ struct thread_info {
0-0xBFFFFFFF for user-thead
0-0xFFFFFFFF for kernel-thread
*/
- struct restart_block restart_block;
__u8 supervisor_stack[0];
};
@@ -80,9 +79,6 @@ struct thread_info {
.cpu = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/mn10300/kernel/asm-offsets.c b/arch/mn10300/kernel/asm-offsets.c
index 47b3bb0c04ff..d780670cbaf3 100644
--- a/arch/mn10300/kernel/asm-offsets.c
+++ b/arch/mn10300/kernel/asm-offsets.c
@@ -28,7 +28,6 @@ void foo(void)
OFFSET(TI_cpu, thread_info, cpu);
OFFSET(TI_preempt_count, thread_info, preempt_count);
OFFSET(TI_addr_limit, thread_info, addr_limit);
- OFFSET(TI_restart_block, thread_info, restart_block);
BLANK();
OFFSET(REG_D0, pt_regs, d0);
diff --git a/arch/mn10300/kernel/signal.c b/arch/mn10300/kernel/signal.c
index a6c0858592c3..8609845f12c5 100644
--- a/arch/mn10300/kernel/signal.c
+++ b/arch/mn10300/kernel/signal.c
@@ -40,7 +40,7 @@ static int restore_sigcontext(struct pt_regs *regs,
unsigned int err = 0;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
if (is_using_fpu(current))
fpu_kill_state(current);
diff --git a/arch/openrisc/include/asm/thread_info.h b/arch/openrisc/include/asm/thread_info.h
index d797acc901e4..875f0845a707 100644
--- a/arch/openrisc/include/asm/thread_info.h
+++ b/arch/openrisc/include/asm/thread_info.h
@@ -57,7 +57,6 @@ struct thread_info {
0-0x7FFFFFFF for user-thead
0-0xFFFFFFFF for kernel-thread
*/
- struct restart_block restart_block;
__u8 supervisor_stack[0];
/* saved context data */
@@ -79,9 +78,6 @@ struct thread_info {
.cpu = 0, \
.preempt_count = 1, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
.ksp = 0, \
}
diff --git a/arch/openrisc/kernel/signal.c b/arch/openrisc/kernel/signal.c
index 7d1b8235bf90..4112175bf803 100644
--- a/arch/openrisc/kernel/signal.c
+++ b/arch/openrisc/kernel/signal.c
@@ -46,7 +46,7 @@ static int restore_sigcontext(struct pt_regs *regs,
int err = 0;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
/*
* Restore the regs from &sc->regs.
diff --git a/arch/parisc/include/asm/cache.h b/arch/parisc/include/asm/cache.h
index 47f11c707b65..03d4d5afb120 100644
--- a/arch/parisc/include/asm/cache.h
+++ b/arch/parisc/include/asm/cache.h
@@ -30,6 +30,9 @@
#define __read_mostly __attribute__((__section__(".data..read_mostly")))
+/* Read-only memory is marked before mark_rodata_ro() is called. */
+#define __ro_after_init __read_mostly
+
void parisc_cache_init(void); /* initializes cache-flushing */
void disable_sr_hashing_asm(int); /* low level support for above */
void disable_sr_hashing(void); /* turns off space register hashing */
diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h
index de65f66ea64e..b188d1f26027 100644
--- a/arch/parisc/include/asm/cacheflush.h
+++ b/arch/parisc/include/asm/cacheflush.h
@@ -121,10 +121,6 @@ flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vma
}
}
-#ifdef CONFIG_DEBUG_RODATA
-void mark_rodata_ro(void);
-#endif
-
#include <asm/kmap_types.h>
#define ARCH_HAS_KMAP
diff --git a/arch/parisc/include/asm/thread_info.h b/arch/parisc/include/asm/thread_info.h
index a84611835549..fb13e3865563 100644
--- a/arch/parisc/include/asm/thread_info.h
+++ b/arch/parisc/include/asm/thread_info.h
@@ -14,7 +14,6 @@ struct thread_info {
mm_segment_t addr_limit; /* user-level address space limit */
__u32 cpu; /* current CPU */
int preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */
- struct restart_block restart_block;
};
#define INIT_THREAD_INFO(tsk) \
@@ -25,9 +24,6 @@ struct thread_info {
.cpu = 0, \
.addr_limit = KERNEL_DS, \
.preempt_count = INIT_PREEMPT_COUNT, \
- .restart_block = { \
- .fn = do_no_restart_syscall \
- } \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c
index 012d4fa63d97..9b910a0251b8 100644
--- a/arch/parisc/kernel/signal.c
+++ b/arch/parisc/kernel/signal.c
@@ -99,7 +99,7 @@ sys_rt_sigreturn(struct pt_regs *regs, int in_syscall)
sigframe_size = PARISC_RT_SIGFRAME_SIZE32;
#endif
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
/* Unwind the user stack to get the rt_sigframe structure. */
frame = (struct rt_sigframe __user *)
diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
index b034ecdb7c74..9344114b1037 100644
--- a/arch/powerpc/include/asm/thread_info.h
+++ b/arch/powerpc/include/asm/thread_info.h
@@ -43,7 +43,6 @@ struct thread_info {
int cpu; /* cpu we're on */
int preempt_count; /* 0 => preemptable,
<0 => BUG */
- struct restart_block restart_block;
unsigned long local_flags; /* private flags for thread */
/* low level flags - has atomic operations done on it */
@@ -59,9 +58,6 @@ struct thread_info {
.exec_domain = &default_exec_domain, \
.cpu = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
.flags = 0, \
}
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 5d2ea3f90f72..1c165dd3ae3c 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1643,9 +1643,9 @@ static inline unsigned long brk_rnd(void)
/* 8MB for 32bit, 1GB for 64bit */
if (is_32bit_task())
- rnd = (long)(get_random_int() % (1<<(23-PAGE_SHIFT)));
+ rnd = (get_random_long() % (1UL<<(23-PAGE_SHIFT)));
else
- rnd = (long)(get_random_int() % (1<<(30-PAGE_SHIFT)));
+ rnd = (get_random_long() % (1UL<<(30-PAGE_SHIFT)));
return rnd << PAGE_SHIFT;
}
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 28f36b9c0e55..da50e0c9c57e 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -1229,7 +1229,7 @@ long sys_rt_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8,
int tm_restore = 0;
#endif
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
rt_sf = (struct rt_sigframe __user *)
(regs->gpr[1] + __SIGNAL_FRAMESIZE + 16);
@@ -1502,7 +1502,7 @@ long sys_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8,
#endif
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
sf = (struct sigframe __user *)(regs->gpr[1] + __SIGNAL_FRAMESIZE);
sc = &sf->sctx;
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index 2cb0c94cafa5..c7c24d2e2bdb 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -666,7 +666,7 @@ int sys_rt_sigreturn(unsigned long r3, unsigned long r4, unsigned long r5,
#endif
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
if (!access_ok(VERIFY_READ, uc, sizeof(*uc)))
goto badframe;
diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
index cb8bdbe4972f..bde8ed0c6ddd 100644
--- a/arch/powerpc/mm/mmap.c
+++ b/arch/powerpc/mm/mmap.c
@@ -60,9 +60,9 @@ static unsigned long mmap_rnd(void)
if (current->flags & PF_RANDOMIZE) {
/* 8MB for 32bit, 1GB for 64bit */
if (is_32bit_task())
- rnd = (long)(get_random_int() % (1<<(23-PAGE_SHIFT)));
+ rnd = get_random_long() % (1<<(23-PAGE_SHIFT));
else
- rnd = (long)(get_random_int() % (1<<(30-PAGE_SHIFT)));
+ rnd = get_random_long() % (1UL<<(30-PAGE_SHIFT));
}
return rnd << PAGE_SHIFT;
}
diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h
index 4d62fd5b56e5..ef1df718642d 100644
--- a/arch/s390/include/asm/thread_info.h
+++ b/arch/s390/include/asm/thread_info.h
@@ -39,7 +39,6 @@ struct thread_info {
unsigned long sys_call_table; /* System call table address */
unsigned int cpu; /* current CPU */
int preempt_count; /* 0 => preemptable, <0 => BUG */
- struct restart_block restart_block;
unsigned int system_call;
__u64 user_timer;
__u64 system_timer;
@@ -56,9 +55,6 @@ struct thread_info {
.flags = 0, \
.cpu = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c
index eb95315109e0..708bff555116 100644
--- a/arch/s390/kernel/compat_signal.c
+++ b/arch/s390/kernel/compat_signal.c
@@ -222,7 +222,7 @@ static int restore_sigregs32(struct pt_regs *regs,_sigregs32 __user *sregs)
int i;
/* Alwys make any pending restarted system call return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
if (__copy_from_user(&user_sregs, &sregs->regs, sizeof(user_sregs)))
return -EFAULT;
diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c
index 0c1a0ff0a558..237f878f3fb7 100644
--- a/arch/s390/kernel/signal.c
+++ b/arch/s390/kernel/signal.c
@@ -162,7 +162,7 @@ static int restore_sigregs(struct pt_regs *regs, _sigregs __user *sregs)
_sigregs user_sregs;
/* Alwys make any pending restarted system call return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
if (__copy_from_user(&user_sregs, sregs, sizeof(user_sregs)))
return -EFAULT;
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index 005d665fe4a5..7197d73640a9 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -214,20 +214,20 @@ void update_vsyscall(struct timekeeper *tk)
{
u64 nsecps;
- if (tk->tkr.clock != &clocksource_tod)
+ if (tk->tkr_mono.clock != &clocksource_tod)
return;
/* Make userspace gettimeofday spin until we're done. */
++vdso_data->tb_update_count;
smp_wmb();
- vdso_data->xtime_tod_stamp = tk->tkr.cycle_last;
+ vdso_data->xtime_tod_stamp = tk->tkr_mono.cycle_last;
vdso_data->xtime_clock_sec = tk->xtime_sec;
- vdso_data->xtime_clock_nsec = tk->tkr.xtime_nsec;
+ vdso_data->xtime_clock_nsec = tk->tkr_mono.xtime_nsec;
vdso_data->wtom_clock_sec =
tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
- vdso_data->wtom_clock_nsec = tk->tkr.xtime_nsec +
- + ((u64) tk->wall_to_monotonic.tv_nsec << tk->tkr.shift);
- nsecps = (u64) NSEC_PER_SEC << tk->tkr.shift;
+ vdso_data->wtom_clock_nsec = tk->tkr_mono.xtime_nsec +
+ + ((u64) tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift);
+ nsecps = (u64) NSEC_PER_SEC << tk->tkr_mono.shift;
while (vdso_data->wtom_clock_nsec >= nsecps) {
vdso_data->wtom_clock_nsec -= nsecps;
vdso_data->wtom_clock_sec++;
@@ -235,7 +235,7 @@ void update_vsyscall(struct timekeeper *tk)
vdso_data->xtime_coarse_sec = tk->xtime_sec;
vdso_data->xtime_coarse_nsec =
- (long)(tk->tkr.xtime_nsec >> tk->tkr.shift);
+ (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
vdso_data->wtom_coarse_sec =
vdso_data->xtime_coarse_sec + tk->wall_to_monotonic.tv_sec;
vdso_data->wtom_coarse_nsec =
@@ -245,8 +245,8 @@ void update_vsyscall(struct timekeeper *tk)
vdso_data->wtom_coarse_sec++;
}
- vdso_data->tk_mult = tk->tkr.mult;
- vdso_data->tk_shift = tk->tkr.shift;
+ vdso_data->tk_mult = tk->tkr_mono.mult;
+ vdso_data->tk_shift = tk->tkr_mono.shift;
smp_wmb();
++vdso_data->tb_update_count;
}
diff --git a/arch/score/include/asm/thread_info.h b/arch/score/include/asm/thread_info.h
index 656b7ada9326..33864fa2a8d4 100644
--- a/arch/score/include/asm/thread_info.h
+++ b/arch/score/include/asm/thread_info.h
@@ -42,7 +42,6 @@ struct thread_info {
* 0-0xFFFFFFFF for kernel-thread
*/
mm_segment_t addr_limit;
- struct restart_block restart_block;
struct pt_regs *regs;
};
@@ -58,9 +57,6 @@ struct thread_info {
.cpu = 0, \
.preempt_count = 1, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/score/kernel/asm-offsets.c b/arch/score/kernel/asm-offsets.c
index 57788f44c6fb..b4d5214a7a7e 100644
--- a/arch/score/kernel/asm-offsets.c
+++ b/arch/score/kernel/asm-offsets.c
@@ -106,7 +106,6 @@ void output_thread_info_defines(void)
OFFSET(TI_CPU, thread_info, cpu);
OFFSET(TI_PRE_COUNT, thread_info, preempt_count);
OFFSET(TI_ADDR_LIMIT, thread_info, addr_limit);
- OFFSET(TI_RESTART_BLOCK, thread_info, restart_block);
OFFSET(TI_REGS, thread_info, regs);
DEFINE(KERNEL_STACK_SIZE, THREAD_SIZE);
DEFINE(KERNEL_STACK_MASK, THREAD_MASK);
diff --git a/arch/score/kernel/signal.c b/arch/score/kernel/signal.c
index 1651807774ad..e381c8c4ff65 100644
--- a/arch/score/kernel/signal.c
+++ b/arch/score/kernel/signal.c
@@ -141,7 +141,7 @@ score_rt_sigreturn(struct pt_regs *regs)
int sig;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
frame = (struct rt_sigframe __user *) regs->regs[0];
if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
diff --git a/arch/sh/include/asm/thread_info.h b/arch/sh/include/asm/thread_info.h
index ad27ffa65e2e..657c03919627 100644
--- a/arch/sh/include/asm/thread_info.h
+++ b/arch/sh/include/asm/thread_info.h
@@ -33,7 +33,6 @@ struct thread_info {
__u32 cpu;
int preempt_count; /* 0 => preemptable, <0 => BUG */
mm_segment_t addr_limit; /* thread address space */
- struct restart_block restart_block;
unsigned long previous_sp; /* sp of previous stack in case
of nested IRQ stacks */
__u8 supervisor_stack[0];
@@ -63,9 +62,6 @@ struct thread_info {
.cpu = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/sh/kernel/asm-offsets.c b/arch/sh/kernel/asm-offsets.c
index 08a2be775b6c..542225fedb11 100644
--- a/arch/sh/kernel/asm-offsets.c
+++ b/arch/sh/kernel/asm-offsets.c
@@ -25,7 +25,6 @@ int main(void)
DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
DEFINE(TI_PRE_COUNT, offsetof(struct thread_info, preempt_count));
- DEFINE(TI_RESTART_BLOCK,offsetof(struct thread_info, restart_block));
DEFINE(TI_SIZE, sizeof(struct thread_info));
#ifdef CONFIG_HIBERNATION
diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c
index 2f002b24fb92..0b34f2a704fe 100644
--- a/arch/sh/kernel/signal_32.c
+++ b/arch/sh/kernel/signal_32.c
@@ -156,7 +156,7 @@ asmlinkage int sys_sigreturn(void)
int r0;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
goto badframe;
@@ -186,7 +186,7 @@ asmlinkage int sys_rt_sigreturn(void)
int r0;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
goto badframe;
diff --git a/arch/sh/kernel/signal_64.c b/arch/sh/kernel/signal_64.c
index 897abe7b871e..71993c6a7d94 100644
--- a/arch/sh/kernel/signal_64.c
+++ b/arch/sh/kernel/signal_64.c
@@ -260,7 +260,7 @@ asmlinkage int sys_sigreturn(unsigned long r2, unsigned long r3,
long long ret;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
goto badframe;
@@ -294,7 +294,7 @@ asmlinkage int sys_rt_sigreturn(unsigned long r2, unsigned long r3,
long long ret;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
goto badframe;
diff --git a/arch/sparc/include/asm/thread_info_32.h b/arch/sparc/include/asm/thread_info_32.h
index 025c98446b1e..fd7bd0a440ca 100644
--- a/arch/sparc/include/asm/thread_info_32.h
+++ b/arch/sparc/include/asm/thread_info_32.h
@@ -47,8 +47,6 @@ struct thread_info {
struct reg_window32 reg_window[NSWINS]; /* align for ldd! */
unsigned long rwbuf_stkptrs[NSWINS];
unsigned long w_saved;
-
- struct restart_block restart_block;
};
/*
@@ -62,9 +60,6 @@ struct thread_info {
.flags = 0, \
.cpu = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
@@ -103,7 +98,6 @@ register struct thread_info *current_thread_info_reg asm("g6");
#define TI_REG_WINDOW 0x30
#define TI_RWIN_SPTRS 0x230
#define TI_W_SAVED 0x250
-/* #define TI_RESTART_BLOCK 0x25n */ /* Nobody cares */
/*
* thread information flag bit numbers
diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h
index 798f0279a4b5..ff455164732a 100644
--- a/arch/sparc/include/asm/thread_info_64.h
+++ b/arch/sparc/include/asm/thread_info_64.h
@@ -58,8 +58,6 @@ struct thread_info {
unsigned long gsr[7];
unsigned long xfsr[7];
- struct restart_block restart_block;
-
struct pt_regs *kern_una_regs;
unsigned int kern_una_insn;
@@ -92,10 +90,9 @@ struct thread_info {
#define TI_RWIN_SPTRS 0x000003c8
#define TI_GSR 0x00000400
#define TI_XFSR 0x00000438
-#define TI_RESTART_BLOCK 0x00000470
-#define TI_KUNA_REGS 0x000004a0
-#define TI_KUNA_INSN 0x000004a8
-#define TI_FPREGS 0x000004c0
+#define TI_KUNA_REGS 0x00000470
+#define TI_KUNA_INSN 0x00000478
+#define TI_FPREGS 0x00000480
/* We embed this in the uppermost byte of thread_info->flags */
#define FAULT_CODE_WRITE 0x01 /* Write access, implies D-TLB */
@@ -124,9 +121,6 @@ struct thread_info {
.current_ds = ASI_P, \
.exec_domain = &default_exec_domain, \
.preempt_count = INIT_PREEMPT_COUNT, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c
index 94646266f0e4..77655f0f0fc7 100644
--- a/arch/sparc/kernel/signal32.c
+++ b/arch/sparc/kernel/signal32.c
@@ -162,7 +162,7 @@ void do_sigreturn32(struct pt_regs *regs)
int err, i;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
synchronize_user_stack();
@@ -252,7 +252,7 @@ asmlinkage void do_rt_sigreturn32(struct pt_regs *regs)
int err, i;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
synchronize_user_stack();
regs->u_regs[UREG_FP] &= 0x00000000ffffffffUL;
diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c
index 8492291424ab..c3c12efe0bc0 100644
--- a/arch/sparc/kernel/signal_32.c
+++ b/arch/sparc/kernel/signal_32.c
@@ -82,7 +82,7 @@ asmlinkage void do_sigreturn(struct pt_regs *regs)
int err;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
synchronize_user_stack();
diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c
index 9acf9822cbbd..5ee930c48f4c 100644
--- a/arch/sparc/kernel/signal_64.c
+++ b/arch/sparc/kernel/signal_64.c
@@ -265,7 +265,7 @@ void do_rt_sigreturn(struct pt_regs *regs)
int err;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
synchronize_user_stack ();
sf = (struct rt_signal_frame __user *)
diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c
index 7f0f7c01b297..98a5cf313d39 100644
--- a/arch/sparc/kernel/sys_sparc_64.c
+++ b/arch/sparc/kernel/sys_sparc_64.c
@@ -264,7 +264,7 @@ static unsigned long mmap_rnd(void)
unsigned long rnd = 0UL;
if (current->flags & PF_RANDOMIZE) {
- unsigned long val = get_random_int();
+ unsigned long val = get_random_long();
if (test_thread_flag(TIF_32BIT))
rnd = (val % (1UL << (23UL-PAGE_SHIFT)));
else
diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c
index 21fc668a19a8..6547bff8a756 100644
--- a/arch/sparc/kernel/traps_64.c
+++ b/arch/sparc/kernel/traps_64.c
@@ -2730,8 +2730,6 @@ void __init trap_init(void)
TI_NEW_CHILD != offsetof(struct thread_info, new_child) ||
TI_CURRENT_DS != offsetof(struct thread_info,
current_ds) ||
- TI_RESTART_BLOCK != offsetof(struct thread_info,
- restart_block) ||
TI_KUNA_REGS != offsetof(struct thread_info,
kern_una_regs) ||
TI_KUNA_INSN != offsetof(struct thread_info,
diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h
index 48e4fd0f38e4..96c14c1430d8 100644
--- a/arch/tile/include/asm/thread_info.h
+++ b/arch/tile/include/asm/thread_info.h
@@ -36,7 +36,6 @@ struct thread_info {
mm_segment_t addr_limit; /* thread address space
(KERNEL_DS or USER_DS) */
- struct restart_block restart_block;
struct single_step_state *step_state; /* single step state
(if non-zero) */
int align_ctl; /* controls unaligned access */
@@ -57,9 +56,6 @@ struct thread_info {
.cpu = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
.step_state = NULL, \
.align_ctl = 0, \
}
diff --git a/arch/tile/kernel/signal.c b/arch/tile/kernel/signal.c
index 7c2fecc52177..886a4076034f 100644
--- a/arch/tile/kernel/signal.c
+++ b/arch/tile/kernel/signal.c
@@ -49,7 +49,7 @@ int restore_sigcontext(struct pt_regs *regs,
int i;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
/*
* Enforce that sigcontext is like pt_regs, and doesn't mess
diff --git a/arch/tile/kernel/time.c b/arch/tile/kernel/time.c
index b854a1cd0079..683db951c2eb 100644
--- a/arch/tile/kernel/time.c
+++ b/arch/tile/kernel/time.c
@@ -257,34 +257,34 @@ void update_vsyscall_tz(void)
void update_vsyscall(struct timekeeper *tk)
{
- if (tk->tkr.clock != &cycle_counter_cs)
+ if (tk->tkr_mono.clock != &cycle_counter_cs)
return;
write_seqcount_begin(&vdso_data->tb_seq);
- vdso_data->cycle_last = tk->tkr.cycle_last;
- vdso_data->mask = tk->tkr.mask;
- vdso_data->mult = tk->tkr.mult;
- vdso_data->shift = tk->tkr.shift;
+ vdso_data->cycle_last = tk->tkr_mono.cycle_last;
+ vdso_data->mask = tk->tkr_mono.mask;
+ vdso_data->mult = tk->tkr_mono.mult;
+ vdso_data->shift = tk->tkr_mono.shift;
vdso_data->wall_time_sec = tk->xtime_sec;
- vdso_data->wall_time_snsec = tk->tkr.xtime_nsec;
+ vdso_data->wall_time_snsec = tk->tkr_mono.xtime_nsec;
vdso_data->monotonic_time_sec = tk->xtime_sec
+ tk->wall_to_monotonic.tv_sec;
- vdso_data->monotonic_time_snsec = tk->tkr.xtime_nsec
+ vdso_data->monotonic_time_snsec = tk->tkr_mono.xtime_nsec
+ ((u64)tk->wall_to_monotonic.tv_nsec
- << tk->tkr.shift);
+ << tk->tkr_mono.shift);
while (vdso_data->monotonic_time_snsec >=
- (((u64)NSEC_PER_SEC) << tk->tkr.shift)) {
+ (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
vdso_data->monotonic_time_snsec -=
- ((u64)NSEC_PER_SEC) << tk->tkr.shift;
+ ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift;
vdso_data->monotonic_time_sec++;
}
vdso_data->wall_time_coarse_sec = tk->xtime_sec;
- vdso_data->wall_time_coarse_nsec = (long)(tk->tkr.xtime_nsec >>
- tk->tkr.shift);
+ vdso_data->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >>
+ tk->tkr_mono.shift);
vdso_data->monotonic_time_coarse_sec =
vdso_data->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec;
diff --git a/arch/um/include/asm/thread_info.h b/arch/um/include/asm/thread_info.h
index 1c5b2a83046a..e04114c4fcd9 100644
--- a/arch/um/include/asm/thread_info.h
+++ b/arch/um/include/asm/thread_info.h
@@ -22,7 +22,6 @@ struct thread_info {
mm_segment_t addr_limit; /* thread address space:
0-0xBFFFFFFF for user
0-0xFFFFFFFF for kernel */
- struct restart_block restart_block;
struct thread_info *real_thread; /* Points to non-IRQ stack */
};
@@ -34,9 +33,6 @@ struct thread_info {
.cpu = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
.real_thread = NULL, \
}
diff --git a/arch/unicore32/include/asm/memory.h b/arch/unicore32/include/asm/memory.h
index debafc40200a..3bb0a29fd2d7 100644
--- a/arch/unicore32/include/asm/memory.h
+++ b/arch/unicore32/include/asm/memory.h
@@ -61,12 +61,6 @@
#endif
/*
- * Convert a physical address to a Page Frame Number and back
- */
-#define __phys_to_pfn(paddr) ((paddr) >> PAGE_SHIFT)
-#define __pfn_to_phys(pfn) ((pfn) << PAGE_SHIFT)
-
-/*
* Convert a page to/from a physical address
*/
#define page_to_phys(page) (__pfn_to_phys(page_to_pfn(page)))
diff --git a/arch/unicore32/include/asm/thread_info.h b/arch/unicore32/include/asm/thread_info.h
index af36d8eabdf1..63e2839dfeb8 100644
--- a/arch/unicore32/include/asm/thread_info.h
+++ b/arch/unicore32/include/asm/thread_info.h
@@ -79,7 +79,6 @@ struct thread_info {
#ifdef CONFIG_UNICORE_FPU_F64
struct fp_state fpstate __attribute__((aligned(8)));
#endif
- struct restart_block restart_block;
};
#define INIT_THREAD_INFO(tsk) \
@@ -89,9 +88,6 @@ struct thread_info {
.flags = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/unicore32/kernel/signal.c b/arch/unicore32/kernel/signal.c
index 7c8fb7018dc6..d329f85766cc 100644
--- a/arch/unicore32/kernel/signal.c
+++ b/arch/unicore32/kernel/signal.c
@@ -105,7 +105,7 @@ asmlinkage int __sys_rt_sigreturn(struct pt_regs *regs)
struct rt_sigframe __user *frame;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
/*
* Since we stacked the signal on a 64-bit boundary,
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 642a358f05c0..29aa9cc9128b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -24,6 +24,12 @@ config X86
select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
select ARCH_HAS_FAST_MULTIPLIER
+ select ARCH_HAS_GCOV_PROFILE_ALL
+ select ARCH_HAS_KCOV if X86_64
+ select ARCH_HAS_PMEM_API if X86_64
+ select ARCH_HAS_MMIO_FLUSH
+ select ARCH_HAS_SG_CHAIN
+ select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
select HAVE_AOUT if X86_32
@@ -41,6 +47,8 @@ config X86
select ARCH_DISCARD_MEMBLOCK
select ARCH_WANT_OPTIONAL_GPIOLIB
select ARCH_WANT_FRAME_POINTERS
+ select HAVE_ARCH_HARDENED_USERCOPY
+ select HAVE_ARCH_WITHIN_STACK_FRAMES
select HAVE_DMA_ATTRS
select HAVE_DMA_CONTIGUOUS
select HAVE_KRETPROBES
@@ -104,6 +112,8 @@ config X86
select DCACHE_WORD_ACCESS
select GENERIC_SMP_IDLE_THREAD
select ARCH_WANT_IPC_PARSE_VERSION if X86_32
+ select HAVE_ARCH_MMAP_RND_BITS if MMU
+ select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT
select HAVE_ARCH_SECCOMP_FILTER
select BUILDTIME_EXTABLE_SORT
select GENERIC_CMOS_UPDATE
@@ -169,6 +179,20 @@ config HAVE_LATENCYTOP_SUPPORT
config MMU
def_bool y
+config ARCH_MMAP_RND_BITS_MIN
+ default 28 if 64BIT
+ default 8
+
+config ARCH_MMAP_RND_BITS_MAX
+ default 32 if 64BIT
+ default 16
+
+config ARCH_MMAP_RND_COMPAT_BITS_MIN
+ default 8
+
+config ARCH_MMAP_RND_COMPAT_BITS_MAX
+ default 16
+
config SBUS
bool
@@ -276,6 +300,9 @@ config ARCH_SUPPORTS_UPROBES
config FIX_EARLYCON_MEM
def_bool y
+config DEBUG_RODATA
+ def_bool y
+
source "init/Kconfig"
source "kernel/Kconfig.freezer"
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 61bd2ad94281..9922ecfc4ab4 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -90,23 +90,12 @@ config EFI_PGT_DUMP
issues with the mapping of the EFI runtime regions into that
table.
-config DEBUG_RODATA
- bool "Write protect kernel read-only data structures"
- default y
- depends on DEBUG_KERNEL
- ---help---
- Mark the kernel read-only data as write-protected in the pagetables,
- in order to catch accidental (and incorrect) writes to such const
- data. This is recommended so that we can catch kernel bugs sooner.
- If in doubt, say "Y".
-
config DEBUG_RODATA_TEST
- bool "Testcase for the DEBUG_RODATA feature"
- depends on DEBUG_RODATA
+ bool "Testcase for the marking rodata read-only"
default y
---help---
- This option enables a testcase for the DEBUG_RODATA
- feature as well as for the change_page_attr() infrastructure.
+ This option enables a testcase for the setting rodata read-only
+ as well as for the change_page_attr() infrastructure.
If in doubt, say "N"
config DEBUG_SET_MODULE_RONX
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index c45301c272d0..cf099068647f 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -9,7 +9,15 @@
# Changed by many, many contributors over the years.
#
-KASAN_SANITIZE := n
+KASAN_SANITIZE := n
+OBJECT_FILES_NON_STANDARD := y
+
+# Kernel does not boot with kcov instrumentation here.
+# One of the problems observed was insertion of __sanitizer_cov_trace_pc()
+# callback into middle of per-cpu data enabling code. Thus the callback observed
+# inconsistent state and crashed. We are interested mostly in syscall coverage,
+# so boot code is not interesting anyway.
+KCOV_INSTRUMENT := n
# If you want to preset the SVGA mode, uncomment the next line and
# set SVGA_MODE to whatever number you want.
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 2959cca0b90c..1ce09ba05eea 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -6,6 +6,9 @@
KASAN_SANITIZE := n
+# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
+KCOV_INSTRUMENT := n
+
targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \
vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index f9e181aaba97..d0165c9a2932 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -169,7 +169,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
u32 tmp;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
get_user_try {
/*
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 9863ee3747da..ba41e106a3b1 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -143,16 +143,10 @@ int set_pages_rw(struct page *page, int numpages);
void clflush_cache_range(void *addr, unsigned int size);
-#ifdef CONFIG_DEBUG_RODATA
-void mark_rodata_ro(void);
extern const int rodata_test_data;
extern int kernel_set_to_readonly;
void set_kernel_text_rw(void);
void set_kernel_text_ro(void);
-#else
-static inline void set_kernel_text_rw(void) { }
-static inline void set_kernel_text_ro(void) { }
-#endif
#ifdef CONFIG_DEBUG_RODATA_TEST
int rodata_test(void);
diff --git a/arch/x86/include/asm/idle.h b/arch/x86/include/asm/idle.h
index c5d1785373ed..02bab09707f2 100644
--- a/arch/x86/include/asm/idle.h
+++ b/arch/x86/include/asm/idle.h
@@ -1,13 +1,6 @@
#ifndef _ASM_X86_IDLE_H
#define _ASM_X86_IDLE_H
-#define IDLE_START 1
-#define IDLE_END 2
-
-struct notifier_block;
-void idle_notifier_register(struct notifier_block *n);
-void idle_notifier_unregister(struct notifier_block *n);
-
#ifdef CONFIG_X86_64
void enter_idle(void);
void exit_idle(void);
diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h
index 491e4fd7754e..1410b567ecde 100644
--- a/arch/x86/include/asm/kasan.h
+++ b/arch/x86/include/asm/kasan.h
@@ -17,15 +17,11 @@
#ifndef __ASSEMBLY__
-extern pte_t kasan_zero_pte[];
-extern pte_t kasan_zero_pmd[];
-extern pte_t kasan_zero_pud[];
-
#ifdef CONFIG_KASAN
-void __init kasan_map_early_shadow(pgd_t *pgd);
+void __init kasan_early_init(void);
void __init kasan_init(void);
#else
-static inline void kasan_map_early_shadow(pgd_t *pgd) { }
+static inline void kasan_early_init(void) { }
static inline void kasan_init(void) { }
#endif
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index e62cf897f781..745c117274d7 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -17,15 +17,8 @@ static inline bool kvm_check_and_clear_guest_paused(void)
}
#endif /* CONFIG_KVM_GUEST */
-#ifdef CONFIG_DEBUG_RODATA
#define KVM_HYPERCALL \
ALTERNATIVE(".byte 0x0f,0x01,0xc1", ".byte 0x0f,0x01,0xd9", X86_FEATURE_VMMCALL)
-#else
-/* On AMD processors, vmcall will generate a trap that we will
- * then rewrite to the appropriate instruction.
- */
-#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"
-#endif
/* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall
* instruction. The hypervisor may replace it with something else but only the
diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
index 0a5242428659..13b6cdd0af57 100644
--- a/arch/x86/include/asm/sections.h
+++ b/arch/x86/include/asm/sections.h
@@ -7,7 +7,7 @@
extern char __brk_base[], __brk_limit[];
extern struct exception_table_entry __stop___ex_table[];
-#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+#if defined(CONFIG_X86_64)
extern char __end_rodata_hpage_align[];
#endif
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index c4d96943e666..391a8711f39c 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -58,7 +58,6 @@ struct thread_info {
__u32 cpu; /* current CPU */
int saved_preempt_count;
mm_segment_t addr_limit;
- struct restart_block restart_block;
void __user *sysenter_return;
unsigned int sig_on_uaccess_error:1;
unsigned int uaccess_err:1; /* uaccess failed */
@@ -72,9 +71,6 @@ struct thread_info {
.cpu = 0, \
.saved_preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
@@ -197,6 +193,50 @@ static inline struct thread_info *current_thread_info(void)
return ti;
}
+/*
+ * Walks up the stack frames to make sure that the specified object is
+ * entirely contained by a single stack frame.
+ *
+ * Returns:
+ * 1 if within a frame
+ * -1 if placed across a frame boundary (or outside stack)
+ * 0 unable to determine (no frame pointers, etc)
+ */
+static inline int arch_within_stack_frames(const void * const stack,
+ const void * const stackend,
+ const void *obj, unsigned long len)
+{
+#if defined(CONFIG_FRAME_POINTER)
+ const void *frame = NULL;
+ const void *oldframe;
+
+ oldframe = __builtin_frame_address(1);
+ if (oldframe)
+ frame = __builtin_frame_address(2);
+ /*
+ * low ----------------------------------------------> high
+ * [saved bp][saved ip][args][local vars][saved bp][saved ip]
+ * ^----------------^
+ * allow copies only within here
+ */
+ while (stack <= frame && frame < stackend) {
+ /*
+ * If obj + len extends past the last frame, this
+ * check won't pass and the next frame will be 0,
+ * causing us to bail out and correctly report
+ * the copy as invalid.
+ */
+ if (obj + len <= frame)
+ return obj >= oldframe + 2 * sizeof(void *) ? 1 : -1;
+ oldframe = frame;
+ frame = *(const void * const *)frame;
+ }
+ return -1;
+#else
+ return 0;
+#endif
+}
+
#else /* !__ASSEMBLY__ */
/* how to get the thread information struct from ASM */
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 504be087d682..c158198e3cc8 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -133,6 +133,9 @@ extern int __get_user_4(void);
extern int __get_user_8(void);
extern int __get_user_bad(void);
+#define __uaccess_begin() stac()
+#define __uaccess_end() clac()
+
/*
* This is a type: either unsigned long, if the argument fits into
* that type, or otherwise unsigned long long.
@@ -179,8 +182,8 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
asm volatile("call __get_user_%P3" \
: "=a" (__ret_gu), "=r" (__val_gu) \
: "0" (ptr), "i" (sizeof(*(ptr)))); \
- (x) = (__typeof__(*(ptr))) __val_gu; \
- __ret_gu; \
+ (x) = (__force __typeof__(*(ptr))) __val_gu; \
+ __builtin_expect(__ret_gu, 0); \
})
#define __put_user_x(size, x, ptr, __ret_pu) \
@@ -191,10 +194,10 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
#ifdef CONFIG_X86_32
#define __put_user_asm_u64(x, addr, err, errret) \
- asm volatile(ASM_STAC "\n" \
+ asm volatile("\n" \
"1: movl %%eax,0(%2)\n" \
"2: movl %%edx,4(%2)\n" \
- "3: " ASM_CLAC "\n" \
+ "3:" \
".section .fixup,\"ax\"\n" \
"4: movl %3,%0\n" \
" jmp 3b\n" \
@@ -205,10 +208,10 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
: "A" (x), "r" (addr), "i" (errret), "0" (err))
#define __put_user_asm_ex_u64(x, addr) \
- asm volatile(ASM_STAC "\n" \
+ asm volatile("\n" \
"1: movl %%eax,0(%1)\n" \
"2: movl %%edx,4(%1)\n" \
- "3: " ASM_CLAC "\n" \
+ "3:" \
_ASM_EXTABLE_EX(1b, 2b) \
_ASM_EXTABLE_EX(2b, 3b) \
: : "A" (x), "r" (addr))
@@ -275,7 +278,7 @@ extern void __put_user_8(void);
__put_user_x(X, __pu_val, ptr, __ret_pu); \
break; \
} \
- __ret_pu; \
+ __builtin_expect(__ret_pu, 0); \
})
#define __put_user_size(x, ptr, size, retval, errret) \
@@ -301,6 +304,10 @@ do { \
} \
} while (0)
+/*
+ * This doesn't do __uaccess_begin/end - the exception handling
+ * around it must do that.
+ */
#define __put_user_size_ex(x, ptr, size) \
do { \
__chk_user_ptr(ptr); \
@@ -355,9 +362,9 @@ do { \
} while (0)
#define __get_user_asm(x, addr, err, itype, rtype, ltype, errret) \
- asm volatile(ASM_STAC "\n" \
+ asm volatile("\n" \
"1: mov"itype" %2,%"rtype"1\n" \
- "2: " ASM_CLAC "\n" \
+ "2:\n" \
".section .fixup,\"ax\"\n" \
"3: mov %3,%0\n" \
" xor"itype" %"rtype"1,%"rtype"1\n" \
@@ -367,6 +374,10 @@ do { \
: "=r" (err), ltype(x) \
: "m" (__m(addr)), "i" (errret), "0" (err))
+/*
+ * This doesn't do __uaccess_begin/end - the exception handling
+ * around it must do that.
+ */
#define __get_user_size_ex(x, ptr, size) \
do { \
__chk_user_ptr(ptr); \
@@ -401,17 +412,21 @@ do { \
#define __put_user_nocheck(x, ptr, size) \
({ \
int __pu_err; \
+ __uaccess_begin(); \
__put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \
- __pu_err; \
+ __uaccess_end(); \
+ __builtin_expect(__pu_err, 0); \
})
#define __get_user_nocheck(x, ptr, size) \
({ \
int __gu_err; \
unsigned long __gu_val; \
+ __uaccess_begin(); \
__get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \
+ __uaccess_end(); \
(x) = (__force __typeof__(*(ptr)))__gu_val; \
- __gu_err; \
+ __builtin_expect(__gu_err, 0); \
})
/* FIXME: this hack is definitely wrong -AK */
@@ -424,9 +439,9 @@ struct __large_struct { unsigned long buf[100]; };
* aliasing issues.
*/
#define __put_user_asm(x, addr, err, itype, rtype, ltype, errret) \
- asm volatile(ASM_STAC "\n" \
+ asm volatile("\n" \
"1: mov"itype" %"rtype"1,%2\n" \
- "2: " ASM_CLAC "\n" \
+ "2:\n" \
".section .fixup,\"ax\"\n" \
"3: mov %3,%0\n" \
" jmp 2b\n" \
@@ -446,11 +461,11 @@ struct __large_struct { unsigned long buf[100]; };
*/
#define uaccess_try do { \
current_thread_info()->uaccess_err = 0; \
- stac(); \
+ __uaccess_begin(); \
barrier();
#define uaccess_catch(err) \
- clac(); \
+ __uaccess_end(); \
(err) |= (current_thread_info()->uaccess_err ? -EFAULT : 0); \
} while (0)
@@ -546,12 +561,13 @@ extern void __cmpxchg_wrong_size(void)
__typeof__(ptr) __uval = (uval); \
__typeof__(*(ptr)) __old = (old); \
__typeof__(*(ptr)) __new = (new); \
+ __uaccess_begin(); \
switch (size) { \
case 1: \
{ \
- asm volatile("\t" ASM_STAC "\n" \
+ asm volatile("\n" \
"1:\t" LOCK_PREFIX "cmpxchgb %4, %2\n" \
- "2:\t" ASM_CLAC "\n" \
+ "2:\n" \
"\t.section .fixup, \"ax\"\n" \
"3:\tmov %3, %0\n" \
"\tjmp 2b\n" \
@@ -565,9 +581,9 @@ extern void __cmpxchg_wrong_size(void)
} \
case 2: \
{ \
- asm volatile("\t" ASM_STAC "\n" \
+ asm volatile("\n" \
"1:\t" LOCK_PREFIX "cmpxchgw %4, %2\n" \
- "2:\t" ASM_CLAC "\n" \
+ "2:\n" \
"\t.section .fixup, \"ax\"\n" \
"3:\tmov %3, %0\n" \
"\tjmp 2b\n" \
@@ -581,9 +597,9 @@ extern void __cmpxchg_wrong_size(void)
} \
case 4: \
{ \
- asm volatile("\t" ASM_STAC "\n" \
+ asm volatile("\n" \
"1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n" \
- "2:\t" ASM_CLAC "\n" \
+ "2:\n" \
"\t.section .fixup, \"ax\"\n" \
"3:\tmov %3, %0\n" \
"\tjmp 2b\n" \
@@ -600,9 +616,9 @@ extern void __cmpxchg_wrong_size(void)
if (!IS_ENABLED(CONFIG_X86_64)) \
__cmpxchg_wrong_size(); \
\
- asm volatile("\t" ASM_STAC "\n" \
+ asm volatile("\n" \
"1:\t" LOCK_PREFIX "cmpxchgq %4, %2\n" \
- "2:\t" ASM_CLAC "\n" \
+ "2:\n" \
"\t.section .fixup, \"ax\"\n" \
"3:\tmov %3, %0\n" \
"\tjmp 2b\n" \
@@ -617,6 +633,7 @@ extern void __cmpxchg_wrong_size(void)
default: \
__cmpxchg_wrong_size(); \
} \
+ __uaccess_end(); \
*__uval = __old; \
__ret; \
})
@@ -688,7 +705,7 @@ __copy_from_user_overflow(int size, unsigned long count)
#endif
-static inline unsigned long __must_check
+static __always_inline unsigned long __must_check
copy_from_user(void *to, const void __user *from, unsigned long n)
{
int sz = __compiletime_object_size(to);
@@ -713,9 +730,10 @@ copy_from_user(void *to, const void __user *from, unsigned long n)
* case, and do only runtime checking for non-constant sizes.
*/
- if (likely(sz < 0 || sz >= n))
+ if (likely(sz < 0 || sz >= n)) {
+ check_object_size(to, n, false);
n = _copy_from_user(to, from, n);
- else if(__builtin_constant_p(n))
+ } else if (__builtin_constant_p(n))
copy_from_user_overflow();
else
__copy_from_user_overflow(sz, n);
@@ -723,7 +741,7 @@ copy_from_user(void *to, const void __user *from, unsigned long n)
return n;
}
-static inline unsigned long __must_check
+static __always_inline unsigned long __must_check
copy_to_user(void __user *to, const void *from, unsigned long n)
{
int sz = __compiletime_object_size(from);
@@ -731,9 +749,10 @@ copy_to_user(void __user *to, const void *from, unsigned long n)
might_fault();
/* See the comment in copy_from_user() above. */
- if (likely(sz < 0 || sz >= n))
+ if (likely(sz < 0 || sz >= n)) {
+ check_object_size(from, n, true);
n = _copy_to_user(to, from, n);
- else if(__builtin_constant_p(n))
+ } else if (__builtin_constant_p(n))
copy_to_user_overflow();
else
__copy_to_user_overflow(sz, n);
@@ -744,5 +763,30 @@ copy_to_user(void __user *to, const void *from, unsigned long n)
#undef __copy_from_user_overflow
#undef __copy_to_user_overflow
+/*
+ * The "unsafe" user accesses aren't really "unsafe", but the naming
+ * is a big fat warning: you have to not only do the access_ok()
+ * checking before using them, but you have to surround them with the
+ * user_access_begin/end() pair.
+ */
+#define user_access_begin() __uaccess_begin()
+#define user_access_end() __uaccess_end()
+
+#define unsafe_put_user(x, ptr, err_label) \
+do { \
+ int __pu_err; \
+ __put_user_size((x), (ptr), sizeof(*(ptr)), __pu_err, -EFAULT); \
+ if (unlikely(__pu_err)) goto err_label; \
+} while (0)
+
+#define unsafe_get_user(x, ptr, err_label) \
+do { \
+ int __gu_err; \
+ unsigned long __gu_val; \
+ __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err, -EFAULT); \
+ (x) = (__force __typeof__(*(ptr)))__gu_val; \
+ if (unlikely(__gu_err)) goto err_label; \
+} while (0)
+
#endif /* _ASM_X86_UACCESS_H */
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 3c03a5de64d3..a0406077c0ce 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -43,21 +43,28 @@ unsigned long __must_check __copy_from_user_ll_nocache_nozero
static __always_inline unsigned long __must_check
__copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
{
+ check_object_size(from, n, true);
if (__builtin_constant_p(n)) {
unsigned long ret;
switch (n) {
case 1:
+ __uaccess_begin();
__put_user_size(*(u8 *)from, (u8 __user *)to,
1, ret, 1);
+ __uaccess_end();
return ret;
case 2:
+ __uaccess_begin();
__put_user_size(*(u16 *)from, (u16 __user *)to,
2, ret, 2);
+ __uaccess_end();
return ret;
case 4:
+ __uaccess_begin();
__put_user_size(*(u32 *)from, (u32 __user *)to,
4, ret, 4);
+ __uaccess_end();
return ret;
}
}
@@ -98,13 +105,19 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
switch (n) {
case 1:
+ __uaccess_begin();
__get_user_size(*(u8 *)to, from, 1, ret, 1);
+ __uaccess_end();
return ret;
case 2:
+ __uaccess_begin();
__get_user_size(*(u16 *)to, from, 2, ret, 2);
+ __uaccess_end();
return ret;
case 4:
+ __uaccess_begin();
__get_user_size(*(u32 *)to, from, 4, ret, 4);
+ __uaccess_end();
return ret;
}
}
@@ -137,18 +150,25 @@ static __always_inline unsigned long
__copy_from_user(void *to, const void __user *from, unsigned long n)
{
might_fault();
+ check_object_size(to, n, false);
if (__builtin_constant_p(n)) {
unsigned long ret;
switch (n) {
case 1:
+ __uaccess_begin();
__get_user_size(*(u8 *)to, from, 1, ret, 1);
+ __uaccess_end();
return ret;
case 2:
+ __uaccess_begin();
__get_user_size(*(u16 *)to, from, 2, ret, 2);
+ __uaccess_end();
return ret;
case 4:
+ __uaccess_begin();
__get_user_size(*(u32 *)to, from, 4, ret, 4);
+ __uaccess_end();
return ret;
}
}
@@ -164,13 +184,19 @@ static __always_inline unsigned long __copy_from_user_nocache(void *to,
switch (n) {
case 1:
+ __uaccess_begin();
__get_user_size(*(u8 *)to, from, 1, ret, 1);
+ __uaccess_end();
return ret;
case 2:
+ __uaccess_begin();
__get_user_size(*(u16 *)to, from, 2, ret, 2);
+ __uaccess_end();
return ret;
case 4:
+ __uaccess_begin();
__get_user_size(*(u32 *)to, from, 4, ret, 4);
+ __uaccess_end();
return ret;
}
}
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 12a26b979bf1..12496f789ae3 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -53,38 +53,53 @@ int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size)
{
int ret = 0;
+ check_object_size(dst, size, false);
if (!__builtin_constant_p(size))
return copy_user_generic(dst, (__force void *)src, size);
switch (size) {
- case 1:__get_user_asm(*(u8 *)dst, (u8 __user *)src,
+ case 1:
+ __uaccess_begin();
+ __get_user_asm(*(u8 *)dst, (u8 __user *)src,
ret, "b", "b", "=q", 1);
+ __uaccess_end();
return ret;
- case 2:__get_user_asm(*(u16 *)dst, (u16 __user *)src,
+ case 2:
+ __uaccess_begin();
+ __get_user_asm(*(u16 *)dst, (u16 __user *)src,
ret, "w", "w", "=r", 2);
+ __uaccess_end();
return ret;
- case 4:__get_user_asm(*(u32 *)dst, (u32 __user *)src,
+ case 4:
+ __uaccess_begin();
+ __get_user_asm(*(u32 *)dst, (u32 __user *)src,
ret, "l", "k", "=r", 4);
+ __uaccess_end();
return ret;
- case 8:__get_user_asm(*(u64 *)dst, (u64 __user *)src,
+ case 8:
+ __uaccess_begin();
+ __get_user_asm(*(u64 *)dst, (u64 __user *)src,
ret, "q", "", "=r", 8);
+ __uaccess_end();
return ret;
case 10:
+ __uaccess_begin();
__get_user_asm(*(u64 *)dst, (u64 __user *)src,
ret, "q", "", "=r", 10);
- if (unlikely(ret))
- return ret;
- __get_user_asm(*(u16 *)(8 + (char *)dst),
- (u16 __user *)(8 + (char __user *)src),
- ret, "w", "w", "=r", 2);
+ if (likely(!ret))
+ __get_user_asm(*(u16 *)(8 + (char *)dst),
+ (u16 __user *)(8 + (char __user *)src),
+ ret, "w", "w", "=r", 2);
+ __uaccess_end();
return ret;
case 16:
+ __uaccess_begin();
__get_user_asm(*(u64 *)dst, (u64 __user *)src,
ret, "q", "", "=r", 16);
- if (unlikely(ret))
- return ret;
- __get_user_asm(*(u64 *)(8 + (char *)dst),
- (u64 __user *)(8 + (char __user *)src),
- ret, "q", "", "=r", 8);
+ if (likely(!ret))
+ __get_user_asm(*(u64 *)(8 + (char *)dst),
+ (u64 __user *)(8 + (char __user *)src),
+ ret, "q", "", "=r", 8);
+ __uaccess_end();
return ret;
default:
return copy_user_generic(dst, (__force void *)src, size);
@@ -103,38 +118,55 @@ int __copy_to_user_nocheck(void __user *dst, const void *src, unsigned size)
{
int ret = 0;
+ check_object_size(src, size, true);
if (!__builtin_constant_p(size))
return copy_user_generic((__force void *)dst, src, size);
switch (size) {
- case 1:__put_user_asm(*(u8 *)src, (u8 __user *)dst,
+ case 1:
+ __uaccess_begin();
+ __put_user_asm(*(u8 *)src, (u8 __user *)dst,
ret, "b", "b", "iq", 1);
+ __uaccess_end();
return ret;
- case 2:__put_user_asm(*(u16 *)src, (u16 __user *)dst,
+ case 2:
+ __uaccess_begin();
+ __put_user_asm(*(u16 *)src, (u16 __user *)dst,
ret, "w", "w", "ir", 2);
+ __uaccess_end();
return ret;
- case 4:__put_user_asm(*(u32 *)src, (u32 __user *)dst,
+ case 4:
+ __uaccess_begin();
+ __put_user_asm(*(u32 *)src, (u32 __user *)dst,
ret, "l", "k", "ir", 4);
+ __uaccess_end();
return ret;
- case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst,
+ case 8:
+ __uaccess_begin();
+ __put_user_asm(*(u64 *)src, (u64 __user *)dst,
ret, "q", "", "er", 8);
+ __uaccess_end();
return ret;
case 10:
+ __uaccess_begin();
__put_user_asm(*(u64 *)src, (u64 __user *)dst,
ret, "q", "", "er", 10);
- if (unlikely(ret))
- return ret;
- asm("":::"memory");
- __put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst,
- ret, "w", "w", "ir", 2);
+ if (likely(!ret)) {
+ asm("":::"memory");
+ __put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst,
+ ret, "w", "w", "ir", 2);
+ }
+ __uaccess_end();
return ret;
case 16:
+ __uaccess_begin();
__put_user_asm(*(u64 *)src, (u64 __user *)dst,
ret, "q", "", "er", 16);
- if (unlikely(ret))
- return ret;
- asm("":::"memory");
- __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst,
- ret, "q", "", "er", 8);
+ if (likely(!ret)) {
+ asm("":::"memory");
+ __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst,
+ ret, "q", "", "er", 8);
+ }
+ __uaccess_end();
return ret;
default:
return copy_user_generic((__force void *)dst, src, size);
@@ -160,39 +192,47 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
switch (size) {
case 1: {
u8 tmp;
+ __uaccess_begin();
__get_user_asm(tmp, (u8 __user *)src,
ret, "b", "b", "=q", 1);
if (likely(!ret))
__put_user_asm(tmp, (u8 __user *)dst,
ret, "b", "b", "iq", 1);
+ __uaccess_end();
return ret;
}
case 2: {
u16 tmp;
+ __uaccess_begin();
__get_user_asm(tmp, (u16 __user *)src,
ret, "w", "w", "=r", 2);
if (likely(!ret))
__put_user_asm(tmp, (u16 __user *)dst,
ret, "w", "w", "ir", 2);
+ __uaccess_end();
return ret;
}
case 4: {
u32 tmp;
+ __uaccess_begin();
__get_user_asm(tmp, (u32 __user *)src,
ret, "l", "k", "=r", 4);
if (likely(!ret))
__put_user_asm(tmp, (u32 __user *)dst,
ret, "l", "k", "ir", 4);
+ __uaccess_end();
return ret;
}
case 8: {
u64 tmp;
+ __uaccess_begin();
__get_user_asm(tmp, (u64 __user *)src,
ret, "q", "", "=r", 8);
if (likely(!ret))
__put_user_asm(tmp, (u64 __user *)dst,
ret, "q", "", "er", 8);
+ __uaccess_end();
return ret;
}
default:
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 2e7f86e8a4a6..4170d7da9659 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -20,6 +20,12 @@ KASAN_SANITIZE_head$(BITS).o := n
KASAN_SANITIZE_dumpstack.o := n
KASAN_SANITIZE_dumpstack_$(BITS).o := n
+# If instrumentation of this dir is enabled, boot hangs during first second.
+# Probably could be more selective here, but note that files related to irqs,
+# boot, dumpstack/stacktrace, etc are either non-interesting or can lead to
+# non-deterministic coverage.
+KCOV_INSTRUMENT := n
+
CFLAGS_irq.o := -I$(src)/../include/asm/trace
obj-y := process_$(BITS).o signal.o entry_$(BITS).o
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index dcb5b15401ce..60e67f91271c 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,6 +2,10 @@
# Makefile for local APIC drivers and for the IO-APIC code
#
+# Leads to non-deterministic coverage that is not a function of syscall inputs.
+# In particualr, smp_apic_timer_interrupt() is called in random places.
+KCOV_INSTRUMENT := n
+
obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o ipi.o
obj-y += hw_nmi.o
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 80091ae54c2b..11fe4c56ae4e 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -8,6 +8,10 @@ CFLAGS_REMOVE_common.o = -pg
CFLAGS_REMOVE_perf_event.o = -pg
endif
+# If these files are instrumented, boot hangs during the first second.
+KCOV_INSTRUMENT_common.o := n
+KCOV_INSTRUMENT_perf_event.o := n
+
# Make sure load_percpu_segment has no stackprotector
nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_common.o := $(nostackp)
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 7cbea5eea0bd..6911d034536f 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -80,9 +80,9 @@ within(unsigned long addr, unsigned long start, unsigned long end)
static unsigned long text_ip_addr(unsigned long ip)
{
/*
- * On x86_64, kernel text mappings are mapped read-only with
- * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
- * of the kernel text mapping to modify the kernel text.
+ * On x86_64, kernel text mappings are mapped read-only, so we use
+ * the kernel identity mapping instead of the kernel text mapping
+ * to modify the kernel text.
*
* For 32bit kernels, these mappings are same and we can use
* kernel identity mapping to modify code.
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index b111ab5c4509..9ca92198ab11 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -161,11 +161,10 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
/* Kill off the identity-map trampoline */
reset_early_page_tables();
- kasan_map_early_shadow(early_level4_pgt);
-
- /* clear bss before set_intr_gate with early_idt_handler */
clear_bss();
+ kasan_early_init();
+
for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
set_intr_gate(i, early_idt_handler_array[i]);
load_idt((const struct desc_ptr *)&idt_descr);
@@ -184,8 +183,6 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
/* set init_level4_pgt kernel high mapping*/
init_level4_pgt[511] = early_level4_pgt[511];
- kasan_map_early_shadow(init_level4_pgt);
-
x86_64_start_reservations(real_mode_data);
}
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index f8a8406033c3..e05430f78dfe 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -516,38 +516,9 @@ ENTRY(phys_base)
/* This must match the first entry in level2_kernel_pgt */
.quad 0x0000000000000000
-#ifdef CONFIG_KASAN
-#define FILL(VAL, COUNT) \
- .rept (COUNT) ; \
- .quad (VAL) ; \
- .endr
-
-NEXT_PAGE(kasan_zero_pte)
- FILL(kasan_zero_page - __START_KERNEL_map + _KERNPG_TABLE, 512)
-NEXT_PAGE(kasan_zero_pmd)
- FILL(kasan_zero_pte - __START_KERNEL_map + _KERNPG_TABLE, 512)
-NEXT_PAGE(kasan_zero_pud)
- FILL(kasan_zero_pmd - __START_KERNEL_map + _KERNPG_TABLE, 512)
-
-#undef FILL
-#endif
-
-
#include "../../x86/xen/xen-head.S"
__PAGE_ALIGNED_BSS
NEXT_PAGE(empty_zero_page)
.skip PAGE_SIZE
-#ifdef CONFIG_KASAN
-/*
- * This page used as early shadow. We don't use empty_zero_page
- * at early stages, stack instrumentation could write some garbage
- * to this page.
- * Latter we reuse it as zero shadow for large ranges of memory
- * that allowed to access, but not instrumented by kasan
- * (vmalloc/vmemmap ...).
- */
-NEXT_PAGE(kasan_zero_page)
- .skip PAGE_SIZE
-#endif
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 7ec1d5f8d283..3bfb0c31a67e 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -745,9 +745,7 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip)
int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
{
int err;
-#ifdef CONFIG_DEBUG_RODATA
char opc[BREAK_INSTR_SIZE];
-#endif /* CONFIG_DEBUG_RODATA */
bpt->type = BP_BREAKPOINT;
err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
@@ -756,7 +754,6 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
return err;
err = probe_kernel_write((char *)bpt->bpt_addr,
arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE);
-#ifdef CONFIG_DEBUG_RODATA
if (!err)
return err;
/*
@@ -773,13 +770,12 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
if (memcmp(opc, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE))
return -EINVAL;
bpt->type = BP_POKE_BREAKPOINT;
-#endif /* CONFIG_DEBUG_RODATA */
+
return err;
}
int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
{
-#ifdef CONFIG_DEBUG_RODATA
int err;
char opc[BREAK_INSTR_SIZE];
@@ -796,8 +792,8 @@ int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE))
goto knl_write;
return err;
+
knl_write:
-#endif /* CONFIG_DEBUG_RODATA */
return probe_kernel_write((char *)bpt->bpt_addr,
(char *)bpt->saved_instr, BREAK_INSTR_SIZE);
}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index a388bb883128..0885df57ce7a 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -42,19 +42,6 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
#ifdef CONFIG_X86_64
static DEFINE_PER_CPU(unsigned char, is_idle);
-static ATOMIC_NOTIFIER_HEAD(idle_notifier);
-
-void idle_notifier_register(struct notifier_block *n)
-{
- atomic_notifier_chain_register(&idle_notifier, n);
-}
-EXPORT_SYMBOL_GPL(idle_notifier_register);
-
-void idle_notifier_unregister(struct notifier_block *n)
-{
- atomic_notifier_chain_unregister(&idle_notifier, n);
-}
-EXPORT_SYMBOL_GPL(idle_notifier_unregister);
#endif
struct kmem_cache *task_xstate_cachep;
@@ -262,14 +249,14 @@ static inline void play_dead(void)
void enter_idle(void)
{
this_cpu_write(is_idle, 1);
- atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
+ idle_notifier_call_chain(IDLE_START);
}
static void __exit_idle(void)
{
if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
return;
- atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
+ idle_notifier_call_chain(IDLE_END);
}
/* Called from interrupts to signify idle end */
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index ed37a768d0fc..0a62df4abcf7 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -69,7 +69,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
unsigned int err = 0;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
get_user_try {
diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c
index 3f92ce07e525..27538f183c3b 100644
--- a/arch/x86/kernel/test_nx.c
+++ b/arch/x86/kernel/test_nx.c
@@ -142,7 +142,6 @@ static int test_NX(void)
* by the error message
*/
-#ifdef CONFIG_DEBUG_RODATA
/* Test 3: Check if the .rodata section is executable */
if (rodata_test_data != 0xC3) {
printk(KERN_ERR "test_nx: .rodata marker has invalid value\n");
@@ -151,7 +150,6 @@ static int test_NX(void)
printk(KERN_ERR "test_nx: .rodata section is executable\n");
ret = -ENODEV;
}
-#endif
#if 0
/* Test 4: Check if the .data section of a module is executable */
diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c
index b79133abda48..46496203c216 100644
--- a/arch/x86/kernel/test_rodata.c
+++ b/arch/x86/kernel/test_rodata.c
@@ -76,5 +76,5 @@ int rodata_test(void)
}
MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Testcase for the DEBUG_RODATA infrastructure");
+MODULE_DESCRIPTION("Testcase for marking rodata as read-only");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 49edf2dd3613..4724bf17b808 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -41,29 +41,28 @@ ENTRY(phys_startup_64)
jiffies_64 = jiffies;
#endif
-#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+#if defined(CONFIG_X86_64)
/*
- * On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA
- * we retain large page mappings for boundaries spanning kernel text, rodata
- * and data sections.
+ * On 64-bit, align RODATA to 2MB so we retain large page mappings for
+ * boundaries spanning kernel text, rodata and data sections.
*
* However, kernel identity mappings will have different RWX permissions
* to the pages mapping to text and to the pages padding (which are freed) the
* text section. Hence kernel identity mappings will be broken to smaller
* pages. For 64-bit, kernel text and kernel identity mappings are different,
- * so we can enable protection checks that come with CONFIG_DEBUG_RODATA,
- * as well as retain 2MB large page mappings for kernel text.
+ * so we can enable protection checks as well as retain 2MB large page
+ * mappings for kernel text.
*/
-#define X64_ALIGN_DEBUG_RODATA_BEGIN . = ALIGN(HPAGE_SIZE);
+#define X64_ALIGN_RODATA_BEGIN . = ALIGN(HPAGE_SIZE);
-#define X64_ALIGN_DEBUG_RODATA_END \
+#define X64_ALIGN_RODATA_END \
. = ALIGN(HPAGE_SIZE); \
__end_rodata_hpage_align = .;
#else
-#define X64_ALIGN_DEBUG_RODATA_BEGIN
-#define X64_ALIGN_DEBUG_RODATA_END
+#define X64_ALIGN_RODATA_BEGIN
+#define X64_ALIGN_RODATA_END
#endif
@@ -112,13 +111,11 @@ SECTIONS
EXCEPTION_TABLE(16) :text = 0x9090
-#if defined(CONFIG_DEBUG_RODATA)
/* .text should occupy whole number of pages */
. = ALIGN(PAGE_SIZE);
-#endif
- X64_ALIGN_DEBUG_RODATA_BEGIN
+ X64_ALIGN_RODATA_BEGIN
RO_DATA(PAGE_SIZE)
- X64_ALIGN_DEBUG_RODATA_END
+ X64_ALIGN_RODATA_END
/* Data */
.data : AT(ADDR(.data) - LOAD_OFFSET) {
diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/kernel/vsyscall_gtod.c
index c7d791f32b98..51e330416995 100644
--- a/arch/x86/kernel/vsyscall_gtod.c
+++ b/arch/x86/kernel/vsyscall_gtod.c
@@ -31,30 +31,30 @@ void update_vsyscall(struct timekeeper *tk)
gtod_write_begin(vdata);
/* copy vsyscall data */
- vdata->vclock_mode = tk->tkr.clock->archdata.vclock_mode;
- vdata->cycle_last = tk->tkr.cycle_last;
- vdata->mask = tk->tkr.mask;
- vdata->mult = tk->tkr.mult;
- vdata->shift = tk->tkr.shift;
+ vdata->vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
+ vdata->cycle_last = tk->tkr_mono.cycle_last;
+ vdata->mask = tk->tkr_mono.mask;
+ vdata->mult = tk->tkr_mono.mult;
+ vdata->shift = tk->tkr_mono.shift;
vdata->wall_time_sec = tk->xtime_sec;
- vdata->wall_time_snsec = tk->tkr.xtime_nsec;
+ vdata->wall_time_snsec = tk->tkr_mono.xtime_nsec;
vdata->monotonic_time_sec = tk->xtime_sec
+ tk->wall_to_monotonic.tv_sec;
- vdata->monotonic_time_snsec = tk->tkr.xtime_nsec
+ vdata->monotonic_time_snsec = tk->tkr_mono.xtime_nsec
+ ((u64)tk->wall_to_monotonic.tv_nsec
- << tk->tkr.shift);
+ << tk->tkr_mono.shift);
while (vdata->monotonic_time_snsec >=
- (((u64)NSEC_PER_SEC) << tk->tkr.shift)) {
+ (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
vdata->monotonic_time_snsec -=
- ((u64)NSEC_PER_SEC) << tk->tkr.shift;
+ ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift;
vdata->monotonic_time_sec++;
}
vdata->wall_time_coarse_sec = tk->xtime_sec;
- vdata->wall_time_coarse_nsec = (long)(tk->tkr.xtime_nsec >>
- tk->tkr.shift);
+ vdata->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >>
+ tk->tkr_mono.shift);
vdata->monotonic_time_coarse_sec =
vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1e839d801055..3411283e0b5e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1061,19 +1061,19 @@ static void update_pvclock_gtod(struct timekeeper *tk)
struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
u64 boot_ns;
- boot_ns = ktime_to_ns(ktime_add(tk->tkr.base_mono, tk->offs_boot));
+ boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));
write_seqcount_begin(&vdata->seq);
/* copy pvclock gtod data */
- vdata->clock.vclock_mode = tk->tkr.clock->archdata.vclock_mode;
- vdata->clock.cycle_last = tk->tkr.cycle_last;
- vdata->clock.mask = tk->tkr.mask;
- vdata->clock.mult = tk->tkr.mult;
- vdata->clock.shift = tk->tkr.shift;
+ vdata->clock.vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
+ vdata->clock.cycle_last = tk->tkr_mono.cycle_last;
+ vdata->clock.mask = tk->tkr_mono.mask;
+ vdata->clock.mult = tk->tkr_mono.mult;
+ vdata->clock.shift = tk->tkr_mono.shift;
vdata->boot_ns = boot_ns;
- vdata->nsec_base = tk->tkr.xtime_nsec;
+ vdata->nsec_base = tk->tkr_mono.xtime_nsec;
write_seqcount_end(&vdata->seq);
}
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index db92793b7e23..d6377b7ea7bc 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -2,6 +2,9 @@
# Makefile for x86 specific library files.
#
+# Produces uninteresting flaky coverage.
+KCOV_INSTRUMENT_delay.o := n
+
inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk
inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt
quiet_cmd_inat_tables = GEN $@
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 9648838b78fa..f8f15290e4f9 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,3 +1,6 @@
+# Kernel does not boot with instrumentation of tlb.c.
+KCOV_INSTRUMENT_tlb.o := n
+
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
pat.o pgtable.o physaddr.o gup.o setup_nx.o
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index c23ab1ee3a9a..0f98553f995b 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -871,7 +871,6 @@ static noinline int do_test_wp_bit(void)
return flag;
}
-#ifdef CONFIG_DEBUG_RODATA
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);
@@ -958,5 +957,3 @@ void mark_rodata_ro(void)
#endif
mark_nxdata_nx();
}
-#endif
-
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index fa77995b62a4..a30510f47a46 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1074,7 +1074,6 @@ void __init mem_init(void)
mem_init_print_info(NULL);
}
-#ifdef CONFIG_DEBUG_RODATA
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);
@@ -1164,8 +1163,6 @@ void mark_rodata_ro(void)
(unsigned long) __va(__pa_symbol(_sdata)));
}
-#endif
-
int kern_addr_valid(unsigned long addr)
{
unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 23dc7673e110..d66e3ff4b0c6 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -11,8 +11,6 @@
extern pgd_t early_level4_pgt[PTRS_PER_PGD];
extern struct range pfn_mapped[E820_X_MAX];
-extern unsigned char kasan_zero_page[PAGE_SIZE];
-
static int __init map_range(struct range *range)
{
unsigned long start;
@@ -36,7 +34,7 @@ static void __init clear_pgds(unsigned long start,
pgd_clear(pgd_offset_k(start));
}
-void __init kasan_map_early_shadow(pgd_t *pgd)
+static void __init kasan_map_early_shadow(pgd_t *pgd)
{
int i;
unsigned long start = KASAN_SHADOW_START;
@@ -49,106 +47,6 @@ void __init kasan_map_early_shadow(pgd_t *pgd)
}
}
-static int __init zero_pte_populate(pmd_t *pmd, unsigned long addr,
- unsigned long end)
-{
- pte_t *pte = pte_offset_kernel(pmd, addr);
-
- while (addr + PAGE_SIZE <= end) {
- WARN_ON(!pte_none(*pte));
- set_pte(pte, __pte(__pa_nodebug(kasan_zero_page)
- | __PAGE_KERNEL_RO));
- addr += PAGE_SIZE;
- pte = pte_offset_kernel(pmd, addr);
- }
- return 0;
-}
-
-static int __init zero_pmd_populate(pud_t *pud, unsigned long addr,
- unsigned long end)
-{
- int ret = 0;
- pmd_t *pmd = pmd_offset(pud, addr);
-
- while (IS_ALIGNED(addr, PMD_SIZE) && addr + PMD_SIZE <= end) {
- WARN_ON(!pmd_none(*pmd));
- set_pmd(pmd, __pmd(__pa_nodebug(kasan_zero_pte)
- | __PAGE_KERNEL_RO));
- addr += PMD_SIZE;
- pmd = pmd_offset(pud, addr);
- }
- if (addr < end) {
- if (pmd_none(*pmd)) {
- void *p = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);
- if (!p)
- return -ENOMEM;
- set_pmd(pmd, __pmd(__pa_nodebug(p) | _KERNPG_TABLE));
- }
- ret = zero_pte_populate(pmd, addr, end);
- }
- return ret;
-}
-
-
-static int __init zero_pud_populate(pgd_t *pgd, unsigned long addr,
- unsigned long end)
-{
- int ret = 0;
- pud_t *pud = pud_offset(pgd, addr);
-
- while (IS_ALIGNED(addr, PUD_SIZE) && addr + PUD_SIZE <= end) {
- WARN_ON(!pud_none(*pud));
- set_pud(pud, __pud(__pa_nodebug(kasan_zero_pmd)
- | __PAGE_KERNEL_RO));
- addr += PUD_SIZE;
- pud = pud_offset(pgd, addr);
- }
-
- if (addr < end) {
- if (pud_none(*pud)) {
- void *p = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);
- if (!p)
- return -ENOMEM;
- set_pud(pud, __pud(__pa_nodebug(p) | _KERNPG_TABLE));
- }
- ret = zero_pmd_populate(pud, addr, end);
- }
- return ret;
-}
-
-static int __init zero_pgd_populate(unsigned long addr, unsigned long end)
-{
- int ret = 0;
- pgd_t *pgd = pgd_offset_k(addr);
-
- while (IS_ALIGNED(addr, PGDIR_SIZE) && addr + PGDIR_SIZE <= end) {
- WARN_ON(!pgd_none(*pgd));
- set_pgd(pgd, __pgd(__pa_nodebug(kasan_zero_pud)
- | __PAGE_KERNEL_RO));
- addr += PGDIR_SIZE;
- pgd = pgd_offset_k(addr);
- }
-
- if (addr < end) {
- if (pgd_none(*pgd)) {
- void *p = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);
- if (!p)
- return -ENOMEM;
- set_pgd(pgd, __pgd(__pa_nodebug(p) | _KERNPG_TABLE));
- }
- ret = zero_pud_populate(pgd, addr, end);
- }
- return ret;
-}
-
-
-static void __init populate_zero_shadow(const void *start, const void *end)
-{
- if (zero_pgd_populate((unsigned long)start, (unsigned long)end))
- panic("kasan: unable to map zero shadow!");
-}
-
-
#ifdef CONFIG_KASAN_INLINE
static int kasan_die_handler(struct notifier_block *self,
unsigned long val,
@@ -166,6 +64,26 @@ static struct notifier_block kasan_die_notifier = {
};
#endif
+void __init kasan_early_init(void)
+{
+ int i;
+ pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL;
+ pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE;
+ pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE;
+
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ kasan_zero_pte[i] = __pte(pte_val);
+
+ for (i = 0; i < PTRS_PER_PMD; i++)
+ kasan_zero_pmd[i] = __pmd(pmd_val);
+
+ for (i = 0; i < PTRS_PER_PUD; i++)
+ kasan_zero_pud[i] = __pud(pud_val);
+
+ kasan_map_early_shadow(early_level4_pgt);
+ kasan_map_early_shadow(init_level4_pgt);
+}
+
void __init kasan_init(void)
{
int i;
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index df4552bd239e..084c36f6b4e3 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -69,15 +69,15 @@ static unsigned long mmap_rnd(void)
{
unsigned long rnd = 0;
- /*
- * 8 bits of randomness in 32bit mmaps, 20 address space bits
- * 28 bits of randomness in 64bit mmaps, 40 address space bits
- */
if (current->flags & PF_RANDOMIZE) {
if (mmap_is_ia32())
- rnd = get_random_int() % (1<<8);
+#ifdef CONFIG_COMPAT
+ rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
+#else
+ rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
+#endif
else
- rnd = get_random_int() % (1<<28);
+ rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
}
return rnd << PAGE_SHIFT;
}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index e5545f2105f6..29add6bd93c4 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -281,7 +281,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
__pa_symbol(__end_rodata) >> PAGE_SHIFT))
pgprot_val(forbidden) |= _PAGE_RW;
-#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+#if defined(CONFIG_X86_64)
/*
* Once the kernel maps the text as RO (kernel_set_to_readonly is set),
* kernel text mappings for the large page aligned text, rodata sections
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index 2730d775ef9a..59610be05468 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -8,6 +8,9 @@
#
KASAN_SANITIZE := n
+# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
+KCOV_INSTRUMENT := n
+
always := realmode.bin realmode.relocs
wakeup-objs := wakeup_asm.o wakemain.o video-mode.o
diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c
index 79d824551c1a..0c8c32bfd792 100644
--- a/arch/x86/um/signal.c
+++ b/arch/x86/um/signal.c
@@ -157,7 +157,7 @@ static int copy_sc_from_user(struct pt_regs *regs,
int err, pid;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
err = copy_from_user(&sc, from, sizeof(sc));
if (err)
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 2aacd7c63c7b..2567db0e8265 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -200,10 +200,10 @@ vdso_img_insttargets := $(vdso_img_sodbg:%.dbg=install_%)
$(MODLIB)/vdso: FORCE
@mkdir -p $(MODLIB)/vdso
-$(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE
+$(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso
$(call cmd,vdso_install)
PHONY += vdso_install $(vdso_img_insttargets)
-vdso_install: $(vdso_img_insttargets) FORCE
+vdso_install: $(vdso_img_insttargets)
clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80*
diff --git a/arch/x86/vdso/vdso2c.h b/arch/x86/vdso/vdso2c.h
index 0224987556ce..3f69326ed545 100644
--- a/arch/x86/vdso/vdso2c.h
+++ b/arch/x86/vdso/vdso2c.h
@@ -140,7 +140,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
fprintf(outfile, "#include <asm/vdso.h>\n");
fprintf(outfile, "\n");
fprintf(outfile,
- "static unsigned char raw_data[%lu] __page_aligned_data = {",
+ "static unsigned char raw_data[%lu] __ro_after_init __aligned(PAGE_SIZE) = {",
mapping_size);
for (j = 0; j < stripped_len; j++) {
if (j % 10 == 0)
diff --git a/arch/xtensa/include/asm/thread_info.h b/arch/xtensa/include/asm/thread_info.h
index 470153e8547c..a9b5d3ba196c 100644
--- a/arch/xtensa/include/asm/thread_info.h
+++ b/arch/xtensa/include/asm/thread_info.h
@@ -51,7 +51,6 @@ struct thread_info {
__s32 preempt_count; /* 0 => preemptable,< 0 => BUG*/
mm_segment_t addr_limit; /* thread address space */
- struct restart_block restart_block;
unsigned long cpenable;
@@ -72,7 +71,6 @@ struct thread_info {
#define TI_CPU 0x00000010
#define TI_PRE_COUNT 0x00000014
#define TI_ADDR_LIMIT 0x00000018
-#define TI_RESTART_BLOCK 0x000001C
#endif
@@ -90,9 +88,6 @@ struct thread_info {
.cpu = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c
index 4612321c73cc..3d733ba16f28 100644
--- a/arch/xtensa/kernel/signal.c
+++ b/arch/xtensa/kernel/signal.c
@@ -245,7 +245,7 @@ asmlinkage long xtensa_rt_sigreturn(long a0, long a1, long a2, long a3,
int ret;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
if (regs->depc > 64)
panic("rt_sigreturn in double exception!\n");
diff --git a/block/blk-core.c b/block/blk-core.c
index 93f9152fc271..7d6aeb286271 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -40,6 +40,8 @@
#include "blk-cgroup.h"
#include "blk-mq.h"
+#include <linux/math64.h>
+
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
@@ -3324,3 +3326,85 @@ int __init blk_dev_init(void)
return 0;
}
+
+/*
+ * Blk IO latency support. We want this to be as cheap as possible, so doing
+ * this lockless (and avoiding atomics), a few off by a few errors in this
+ * code is not harmful, and we don't want to do anything that is
+ * perf-impactful.
+ * TODO : If necessary, we can make the histograms per-cpu and aggregate
+ * them when printing them out.
+ */
+void
+blk_zero_latency_hist(struct io_latency_state *s)
+{
+ memset(s->latency_y_axis_read, 0,
+ sizeof(s->latency_y_axis_read));
+ memset(s->latency_y_axis_write, 0,
+ sizeof(s->latency_y_axis_write));
+ s->latency_reads_elems = 0;
+ s->latency_writes_elems = 0;
+}
+EXPORT_SYMBOL(blk_zero_latency_hist);
+
+ssize_t
+blk_latency_hist_show(struct io_latency_state *s, char *buf)
+{
+ int i;
+ int bytes_written = 0;
+ u_int64_t num_elem, elem;
+ int pct;
+
+ num_elem = s->latency_reads_elems;
+ if (num_elem > 0) {
+ bytes_written += scnprintf(buf + bytes_written,
+ PAGE_SIZE - bytes_written,
+ "IO svc_time Read Latency Histogram (n = %llu):\n",
+ num_elem);
+ for (i = 0;
+ i < ARRAY_SIZE(latency_x_axis_us);
+ i++) {
+ elem = s->latency_y_axis_read[i];
+ pct = div64_u64(elem * 100, num_elem);
+ bytes_written += scnprintf(buf + bytes_written,
+ PAGE_SIZE - bytes_written,
+ "\t< %5lluus%15llu%15d%%\n",
+ latency_x_axis_us[i],
+ elem, pct);
+ }
+ /* Last element in y-axis table is overflow */
+ elem = s->latency_y_axis_read[i];
+ pct = div64_u64(elem * 100, num_elem);
+ bytes_written += scnprintf(buf + bytes_written,
+ PAGE_SIZE - bytes_written,
+ "\t> %5dms%15llu%15d%%\n", 10,
+ elem, pct);
+ }
+ num_elem = s->latency_writes_elems;
+ if (num_elem > 0) {
+ bytes_written += scnprintf(buf + bytes_written,
+ PAGE_SIZE - bytes_written,
+ "IO svc_time Write Latency Histogram (n = %llu):\n",
+ num_elem);
+ for (i = 0;
+ i < ARRAY_SIZE(latency_x_axis_us);
+ i++) {
+ elem = s->latency_y_axis_write[i];
+ pct = div64_u64(elem * 100, num_elem);
+ bytes_written += scnprintf(buf + bytes_written,
+ PAGE_SIZE - bytes_written,
+ "\t< %5lluus%15llu%15d%%\n",
+ latency_x_axis_us[i],
+ elem, pct);
+ }
+ /* Last element in y-axis table is overflow */
+ elem = s->latency_y_axis_write[i];
+ pct = div64_u64(elem * 100, num_elem);
+ bytes_written += scnprintf(buf + bytes_written,
+ PAGE_SIZE - bytes_written,
+ "\t> %5dms%15llu%15d%%\n", 10,
+ elem, pct);
+ }
+ return bytes_written;
+}
+EXPORT_SYMBOL(blk_latency_hist_show);
diff --git a/block/genhd.c b/block/genhd.c
index 40a06b934b1f..b529e5024d23 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1117,6 +1117,22 @@ static void disk_release(struct device *dev)
blk_put_queue(disk->queue);
kfree(disk);
}
+
+static int disk_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+ struct disk_part_iter piter;
+ struct hd_struct *part;
+ int cnt = 0;
+
+ disk_part_iter_init(&piter, disk, 0);
+ while((part = disk_part_iter_next(&piter)))
+ cnt++;
+ disk_part_iter_exit(&piter);
+ add_uevent_var(env, "NPARTS=%u", cnt);
+ return 0;
+}
+
struct class block_class = {
.name = "block",
};
@@ -1136,6 +1152,7 @@ static struct device_type disk_type = {
.groups = disk_attr_groups,
.release = disk_release,
.devnode = block_devnode,
+ .uevent = disk_uevent,
};
#ifdef CONFIG_PROC_FS
diff --git a/block/partition-generic.c b/block/partition-generic.c
index 0d9e5f97f0a8..47284e712650 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -217,10 +217,21 @@ static void part_release(struct device *dev)
kfree(p);
}
+static int part_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+ struct hd_struct *part = dev_to_part(dev);
+
+ add_uevent_var(env, "PARTN=%u", part->partno);
+ if (part->info && part->info->volname[0])
+ add_uevent_var(env, "PARTNAME=%s", part->info->volname);
+ return 0;
+}
+
struct device_type part_type = {
.name = "partition",
.groups = part_attr_groups,
.release = part_release,
+ .uevent = part_uevent,
};
static void delete_partition_rcu_cb(struct rcu_head *head)
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 87bbc9c1e681..c753f662d7da 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -631,6 +631,14 @@ config CRYPTO_SHA512_SPARC64
SHA-512 secure hash standard (DFIPS 180-2) implemented
using sparc64 crypto instructions, when available.
+config CRYPTO_SHA256_ARM
+ tristate "SHA-224/256 digest algorithm (ARM-asm and NEON)"
+ select CRYPTO_HASH
+ help
+ SHA-256 secure hash standard (DFIPS 180-2) implemented
+ using optimized ARM assembler and NEON, when available.
+
+
config CRYPTO_SHA512_ARM_NEON
tristate "SHA384 and SHA512 digest algorithm (ARM NEON)"
depends on ARM && KERNEL_MODE_NEON
diff --git a/drivers/Kconfig b/drivers/Kconfig
index 1a693d3f9d51..e451dbab0964 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -100,6 +100,8 @@ source "drivers/memstick/Kconfig"
source "drivers/leds/Kconfig"
+source "drivers/switch/Kconfig"
+
source "drivers/accessibility/Kconfig"
source "drivers/infiniband/Kconfig"
diff --git a/drivers/Makefile b/drivers/Makefile
index 3471c903f4ff..c9df67885a70 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -117,6 +117,7 @@ obj-$(CONFIG_CPU_IDLE) += cpuidle/
obj-y += mmc/
obj-$(CONFIG_MEMSTICK) += memstick/
obj-y += leds/
+obj-$(CONFIG_SWITCH) += switch/
obj-$(CONFIG_INFINIBAND) += infiniband/
obj-$(CONFIG_SGI_SN) += sn/
obj-y += firmware/
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 508a8f67c028..cf848f27149e 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -32,6 +32,7 @@
#include <linux/cpufreq.h>
#include <linux/cpuidle.h>
#include <linux/timer.h>
+#include <linux/wakeup_reason.h>
#include "../base.h"
#include "power.h"
@@ -1337,6 +1338,7 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async)
pm_callback_t callback = NULL;
char *info = NULL;
int error = 0;
+ char suspend_abort[MAX_SUSPEND_ABORT_LEN];
DECLARE_DPM_WATCHDOG_ON_STACK(wd);
dpm_wait_for_children(dev, async);
@@ -1354,6 +1356,9 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async)
pm_wakeup_event(dev, 0);
if (pm_wakeup_pending()) {
+ pm_get_active_wakeup_sources(suspend_abort,
+ MAX_SUSPEND_ABORT_LEN);
+ log_suspend_abort_reason(suspend_abort);
async_error = -EBUSY;
goto Complete;
}
diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index c2744b30d5d9..b88bebae5d94 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -14,6 +14,7 @@
#include <linux/suspend.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
+#include <linux/types.h>
#include <trace/events/power.h>
#include "power.h"
@@ -668,6 +669,37 @@ void pm_wakeup_event(struct device *dev, unsigned int msec)
}
EXPORT_SYMBOL_GPL(pm_wakeup_event);
+void pm_get_active_wakeup_sources(char *pending_wakeup_source, size_t max)
+{
+ struct wakeup_source *ws, *last_active_ws = NULL;
+ int len = 0;
+ bool active = false;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ws, &wakeup_sources, entry) {
+ if (ws->active && len < max) {
+ if (!active)
+ len += scnprintf(pending_wakeup_source, max,
+ "Pending Wakeup Sources: ");
+ len += scnprintf(pending_wakeup_source + len, max - len,
+ "%s ", ws->name);
+ active = true;
+ } else if (!active &&
+ (!last_active_ws ||
+ ktime_to_ns(ws->last_time) >
+ ktime_to_ns(last_active_ws->last_time))) {
+ last_active_ws = ws;
+ }
+ }
+ if (!active && last_active_ws) {
+ scnprintf(pending_wakeup_source, max,
+ "Last active Wakeup Source: %s",
+ last_active_ws->name);
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(pm_get_active_wakeup_sources);
+
void pm_print_active_wakeup_sources(void)
{
struct wakeup_source *ws;
@@ -865,7 +897,7 @@ static int print_wakeup_source_stats(struct seq_file *m,
active_time = ktime_set(0, 0);
}
- ret = seq_printf(m, "%-12s\t%lu\t\t%lu\t\t%lu\t\t%lu\t\t"
+ ret = seq_printf(m, "%-32s\t%lu\t\t%lu\t\t%lu\t\t%lu\t\t"
"%lld\t\t%lld\t\t%lld\t\t%lld\t\t%lld\n",
ws->name, active_count, ws->event_count,
ws->wakeup_count, ws->expire_count,
@@ -886,7 +918,7 @@ static int wakeup_sources_stats_show(struct seq_file *m, void *unused)
{
struct wakeup_source *ws;
- seq_puts(m, "name\t\tactive_count\tevent_count\twakeup_count\t"
+ seq_puts(m, "name\t\t\t\t\tactive_count\tevent_count\twakeup_count\t"
"expire_count\tactive_since\ttotal_time\tmax_time\t"
"last_change\tprevent_suspend_time\n");
diff --git a/drivers/base/syscore.c b/drivers/base/syscore.c
index 8d98a329f6ea..96c34a95cc62 100644
--- a/drivers/base/syscore.c
+++ b/drivers/base/syscore.c
@@ -11,6 +11,7 @@
#include <linux/module.h>
#include <linux/suspend.h>
#include <trace/events/power.h>
+#include <linux/wakeup_reason.h>
static LIST_HEAD(syscore_ops_list);
static DEFINE_MUTEX(syscore_ops_lock);
@@ -75,6 +76,8 @@ int syscore_suspend(void)
return 0;
err_out:
+ log_suspend_abort_reason("System core suspend callback %pF failed",
+ ops->suspend);
pr_err("PM: System core suspend callback %pF failed.\n", ops->suspend);
list_for_each_entry_continue(ops, &syscore_ops_list, node)
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 7e94459a489a..cb521441d096 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -43,20 +43,30 @@ static const char *default_compressor = "lzo";
/* Module params (documentation at end) */
static unsigned int num_devices = 1;
+static inline void deprecated_attr_warn(const char *name)
+{
+ pr_warn_once("%d (%s) Attribute %s (and others) will be removed. %s\n",
+ task_pid_nr(current),
+ current->comm,
+ name,
+ "See zram documentation.");
+}
+
#define ZRAM_ATTR_RO(name) \
-static ssize_t zram_attr_##name##_show(struct device *d, \
+static ssize_t name##_show(struct device *d, \
struct device_attribute *attr, char *b) \
{ \
struct zram *zram = dev_to_zram(d); \
+ \
+ deprecated_attr_warn(__stringify(name)); \
return scnprintf(b, PAGE_SIZE, "%llu\n", \
(u64)atomic64_read(&zram->stats.name)); \
} \
-static struct device_attribute dev_attr_##name = \
- __ATTR(name, S_IRUGO, zram_attr_##name##_show, NULL);
+static DEVICE_ATTR_RO(name);
-static inline int init_done(struct zram *zram)
+static inline bool init_done(struct zram *zram)
{
- return zram->meta != NULL;
+ return zram->disksize;
}
static inline struct zram *dev_to_zram(struct device *dev)
@@ -64,6 +74,27 @@ static inline struct zram *dev_to_zram(struct device *dev)
return (struct zram *)dev_to_disk(dev)->private_data;
}
+static ssize_t compact_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ unsigned long nr_migrated;
+ struct zram *zram = dev_to_zram(dev);
+ struct zram_meta *meta;
+
+ down_read(&zram->init_lock);
+ if (!init_done(zram)) {
+ up_read(&zram->init_lock);
+ return -EINVAL;
+ }
+
+ meta = zram->meta;
+ nr_migrated = zs_compact(meta->mem_pool);
+ atomic64_add(nr_migrated, &zram->stats.num_migrated);
+ up_read(&zram->init_lock);
+
+ return len;
+}
+
static ssize_t disksize_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -90,6 +121,7 @@ static ssize_t orig_data_size_show(struct device *dev,
{
struct zram *zram = dev_to_zram(dev);
+ deprecated_attr_warn("orig_data_size");
return scnprintf(buf, PAGE_SIZE, "%llu\n",
(u64)(atomic64_read(&zram->stats.pages_stored)) << PAGE_SHIFT);
}
@@ -100,6 +132,7 @@ static ssize_t mem_used_total_show(struct device *dev,
u64 val = 0;
struct zram *zram = dev_to_zram(dev);
+ deprecated_attr_warn("mem_used_total");
down_read(&zram->init_lock);
if (init_done(zram)) {
struct zram_meta *meta = zram->meta;
@@ -129,6 +162,7 @@ static ssize_t mem_limit_show(struct device *dev,
u64 val;
struct zram *zram = dev_to_zram(dev);
+ deprecated_attr_warn("mem_limit");
down_read(&zram->init_lock);
val = zram->limit_pages;
up_read(&zram->init_lock);
@@ -160,6 +194,7 @@ static ssize_t mem_used_max_show(struct device *dev,
u64 val = 0;
struct zram *zram = dev_to_zram(dev);
+ deprecated_attr_warn("mem_used_max");
down_read(&zram->init_lock);
if (init_done(zram))
val = atomic_long_read(&zram->stats.max_used_pages);
@@ -287,19 +322,18 @@ static inline int is_partial_io(struct bio_vec *bvec)
/*
* Check if request is within bounds and aligned on zram logical blocks.
*/
-static inline int valid_io_request(struct zram *zram, struct bio *bio)
+static inline int valid_io_request(struct zram *zram,
+ sector_t start, unsigned int size)
{
- u64 start, end, bound;
+ u64 end, bound;
/* unaligned request */
- if (unlikely(bio->bi_iter.bi_sector &
- (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1)))
+ if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1)))
return 0;
- if (unlikely(bio->bi_iter.bi_size & (ZRAM_LOGICAL_BLOCK_SIZE - 1)))
+ if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1)))
return 0;
- start = bio->bi_iter.bi_sector;
- end = start + (bio->bi_iter.bi_size >> SECTOR_SHIFT);
+ end = start + (size >> SECTOR_SHIFT);
bound = zram->disksize >> SECTOR_SHIFT;
/* out of range range */
if (unlikely(start >= bound || end > bound || start > end))
@@ -309,42 +343,67 @@ static inline int valid_io_request(struct zram *zram, struct bio *bio)
return 1;
}
-static void zram_meta_free(struct zram_meta *meta)
+static void zram_meta_free(struct zram_meta *meta, u64 disksize)
{
+ size_t num_pages = disksize >> PAGE_SHIFT;
+ size_t index;
+
+ /* Free all pages that are still in this zram device */
+ for (index = 0; index < num_pages; index++) {
+ unsigned long handle = meta->table[index].handle;
+
+ if (!handle)
+ continue;
+
+ zs_free(meta->mem_pool, handle);
+ }
+
zs_destroy_pool(meta->mem_pool);
vfree(meta->table);
kfree(meta);
}
-static struct zram_meta *zram_meta_alloc(u64 disksize)
+static struct zram_meta *zram_meta_alloc(int device_id, u64 disksize)
{
size_t num_pages;
+ char pool_name[8];
struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL);
+
if (!meta)
- goto out;
+ return NULL;
num_pages = disksize >> PAGE_SHIFT;
meta->table = vzalloc(num_pages * sizeof(*meta->table));
if (!meta->table) {
pr_err("Error allocating zram address table\n");
- goto free_meta;
+ goto out_error;
}
- meta->mem_pool = zs_create_pool(GFP_NOIO | __GFP_HIGHMEM);
+ snprintf(pool_name, sizeof(pool_name), "zram%d", device_id);
+ meta->mem_pool = zs_create_pool(pool_name, GFP_NOIO | __GFP_HIGHMEM);
if (!meta->mem_pool) {
pr_err("Error creating memory pool\n");
- goto free_table;
+ goto out_error;
}
return meta;
-free_table:
+out_error:
vfree(meta->table);
-free_meta:
kfree(meta);
- meta = NULL;
-out:
- return meta;
+ return NULL;
+}
+
+static inline bool zram_meta_get(struct zram *zram)
+{
+ if (atomic_inc_not_zero(&zram->refcount))
+ return true;
+ return false;
+}
+
+static inline void zram_meta_put(struct zram *zram)
+{
+ atomic_dec(&zram->refcount);
}
static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
@@ -453,7 +512,7 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
}
static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
- u32 index, int offset, struct bio *bio)
+ u32 index, int offset)
{
int ret;
struct page *page;
@@ -505,7 +564,7 @@ out_cleanup:
static inline void update_used_max(struct zram *zram,
const unsigned long pages)
{
- int old_max, cur_max;
+ unsigned long old_max, cur_max;
old_max = atomic_long_read(&zram->stats.max_used_pages);
@@ -645,14 +704,13 @@ out:
}
static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
- int offset, struct bio *bio)
+ int offset, int rw)
{
int ret;
- int rw = bio_data_dir(bio);
if (rw == READ) {
atomic64_inc(&zram->stats.num_reads);
- ret = zram_bvec_read(zram, bvec, index, offset, bio);
+ ret = zram_bvec_read(zram, bvec, index, offset);
} else {
atomic64_inc(&zram->stats.num_writes);
ret = zram_bvec_write(zram, bvec, index, offset);
@@ -707,10 +765,11 @@ static void zram_bio_discard(struct zram *zram, u32 index,
}
}
-static void zram_reset_device(struct zram *zram, bool reset_capacity)
+static void zram_reset_device(struct zram *zram)
{
- size_t index;
struct zram_meta *meta;
+ struct zcomp *comp;
+ u64 disksize;
down_write(&zram->init_lock);
@@ -722,36 +781,32 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
}
meta = zram->meta;
- /* Free all pages that are still in this zram device */
- for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) {
- unsigned long handle = meta->table[index].handle;
- if (!handle)
- continue;
-
- zs_free(meta->mem_pool, handle);
- }
-
- zcomp_destroy(zram->comp);
- zram->max_comp_streams = 1;
+ comp = zram->comp;
+ disksize = zram->disksize;
+ /*
+ * Refcount will go down to 0 eventually and r/w handler
+ * cannot handle further I/O so it will bail out by
+ * check zram_meta_get.
+ */
+ zram_meta_put(zram);
+ /*
+ * We want to free zram_meta in process context to avoid
+ * deadlock between reclaim path and any other locks.
+ */
+ wait_event(zram->io_done, atomic_read(&zram->refcount) == 0);
- zram_meta_free(zram->meta);
- zram->meta = NULL;
/* Reset stats */
memset(&zram->stats, 0, sizeof(zram->stats));
-
zram->disksize = 0;
- if (reset_capacity)
- set_capacity(zram->disk, 0);
+ zram->max_comp_streams = 1;
- up_write(&zram->init_lock);
+ set_capacity(zram->disk, 0);
+ part_stat_set_all(&zram->disk->part0, 0);
- /*
- * Revalidate disk out of the init_lock to avoid lockdep splat.
- * It's okay because disk's capacity is protected by init_lock
- * so that revalidate_disk always sees up-to-date capacity.
- */
- if (reset_capacity)
- revalidate_disk(zram->disk);
+ up_write(&zram->init_lock);
+ /* I/O operation under all of CPU are done so let's free */
+ zram_meta_free(meta, disksize);
+ zcomp_destroy(comp);
}
static ssize_t disksize_store(struct device *dev,
@@ -768,7 +823,7 @@ static ssize_t disksize_store(struct device *dev,
return -EINVAL;
disksize = PAGE_ALIGN(disksize);
- meta = zram_meta_alloc(disksize);
+ meta = zram_meta_alloc(zram->disk->first_minor, disksize);
if (!meta)
return -ENOMEM;
@@ -787,6 +842,8 @@ static ssize_t disksize_store(struct device *dev,
goto out_destroy_comp;
}
+ init_waitqueue_head(&zram->io_done);
+ atomic_set(&zram->refcount, 1);
zram->meta = meta;
zram->comp = comp;
zram->disksize = disksize;
@@ -806,7 +863,7 @@ out_destroy_comp:
up_write(&zram->init_lock);
zcomp_destroy(comp);
out_free_meta:
- zram_meta_free(meta);
+ zram_meta_free(meta, disksize);
return err;
}
@@ -824,8 +881,9 @@ static ssize_t reset_store(struct device *dev,
if (!bdev)
return -ENOMEM;
+ mutex_lock(&bdev->bd_mutex);
/* Do not reset an active device! */
- if (bdev->bd_holders) {
+ if (bdev->bd_openers) {
ret = -EBUSY;
goto out;
}
@@ -841,19 +899,23 @@ static ssize_t reset_store(struct device *dev,
/* Make sure all pending I/O is finished */
fsync_bdev(bdev);
+ zram_reset_device(zram);
+
+ mutex_unlock(&bdev->bd_mutex);
+ revalidate_disk(zram->disk);
bdput(bdev);
- zram_reset_device(zram, true);
return len;
out:
+ mutex_unlock(&bdev->bd_mutex);
bdput(bdev);
return ret;
}
static void __zram_make_request(struct zram *zram, struct bio *bio)
{
- int offset;
+ int offset, rw;
u32 index;
struct bio_vec bvec;
struct bvec_iter iter;
@@ -868,6 +930,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
return;
}
+ rw = bio_data_dir(bio);
bio_for_each_segment(bvec, bio, iter) {
int max_transfer_size = PAGE_SIZE - offset;
@@ -882,15 +945,15 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
bv.bv_len = max_transfer_size;
bv.bv_offset = bvec.bv_offset;
- if (zram_bvec_rw(zram, &bv, index, offset, bio) < 0)
+ if (zram_bvec_rw(zram, &bv, index, offset, rw) < 0)
goto out;
bv.bv_len = bvec.bv_len - max_transfer_size;
bv.bv_offset += max_transfer_size;
- if (zram_bvec_rw(zram, &bv, index + 1, 0, bio) < 0)
+ if (zram_bvec_rw(zram, &bv, index + 1, 0, rw) < 0)
goto out;
} else
- if (zram_bvec_rw(zram, &bvec, index, offset, bio) < 0)
+ if (zram_bvec_rw(zram, &bvec, index, offset, rw) < 0)
goto out;
update_position(&index, &offset, &bvec);
@@ -911,22 +974,21 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio)
{
struct zram *zram = queue->queuedata;
- down_read(&zram->init_lock);
- if (unlikely(!init_done(zram)))
+ if (unlikely(!zram_meta_get(zram)))
goto error;
- if (!valid_io_request(zram, bio)) {
+ if (!valid_io_request(zram, bio->bi_iter.bi_sector,
+ bio->bi_iter.bi_size)) {
atomic64_inc(&zram->stats.invalid_io);
- goto error;
+ goto put_zram;
}
__zram_make_request(zram, bio);
- up_read(&zram->init_lock);
-
+ zram_meta_put(zram);
return;
-
+put_zram:
+ zram_meta_put(zram);
error:
- up_read(&zram->init_lock);
bio_io_error(bio);
}
@@ -945,26 +1007,114 @@ static void zram_slot_free_notify(struct block_device *bdev,
atomic64_inc(&zram->stats.notify_free);
}
+static int zram_rw_page(struct block_device *bdev, sector_t sector,
+ struct page *page, int rw)
+{
+ int offset, err = -EIO;
+ u32 index;
+ struct zram *zram;
+ struct bio_vec bv;
+
+ zram = bdev->bd_disk->private_data;
+ if (unlikely(!zram_meta_get(zram)))
+ goto out;
+
+ if (!valid_io_request(zram, sector, PAGE_SIZE)) {
+ atomic64_inc(&zram->stats.invalid_io);
+ err = -EINVAL;
+ goto put_zram;
+ }
+
+ index = sector >> SECTORS_PER_PAGE_SHIFT;
+ offset = sector & (SECTORS_PER_PAGE - 1) << SECTOR_SHIFT;
+
+ bv.bv_page = page;
+ bv.bv_len = PAGE_SIZE;
+ bv.bv_offset = 0;
+
+ err = zram_bvec_rw(zram, &bv, index, offset, rw);
+put_zram:
+ zram_meta_put(zram);
+out:
+ /*
+ * If I/O fails, just return error(ie, non-zero) without
+ * calling page_endio.
+ * It causes resubmit the I/O with bio request by upper functions
+ * of rw_page(e.g., swap_readpage, __swap_writepage) and
+ * bio->bi_end_io does things to handle the error
+ * (e.g., SetPageError, set_page_dirty and extra works).
+ */
+ if (err == 0)
+ page_endio(page, rw, 0);
+ return err;
+}
+
static const struct block_device_operations zram_devops = {
.swap_slot_free_notify = zram_slot_free_notify,
+ .rw_page = zram_rw_page,
.owner = THIS_MODULE
};
-static DEVICE_ATTR(disksize, S_IRUGO | S_IWUSR,
- disksize_show, disksize_store);
-static DEVICE_ATTR(initstate, S_IRUGO, initstate_show, NULL);
-static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store);
-static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL);
-static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL);
-static DEVICE_ATTR(mem_limit, S_IRUGO | S_IWUSR, mem_limit_show,
- mem_limit_store);
-static DEVICE_ATTR(mem_used_max, S_IRUGO | S_IWUSR, mem_used_max_show,
- mem_used_max_store);
-static DEVICE_ATTR(max_comp_streams, S_IRUGO | S_IWUSR,
- max_comp_streams_show, max_comp_streams_store);
-static DEVICE_ATTR(comp_algorithm, S_IRUGO | S_IWUSR,
- comp_algorithm_show, comp_algorithm_store);
+static DEVICE_ATTR_WO(compact);
+static DEVICE_ATTR_RW(disksize);
+static DEVICE_ATTR_RO(initstate);
+static DEVICE_ATTR_WO(reset);
+static DEVICE_ATTR_RO(orig_data_size);
+static DEVICE_ATTR_RO(mem_used_total);
+static DEVICE_ATTR_RW(mem_limit);
+static DEVICE_ATTR_RW(mem_used_max);
+static DEVICE_ATTR_RW(max_comp_streams);
+static DEVICE_ATTR_RW(comp_algorithm);
+
+static ssize_t io_stat_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct zram *zram = dev_to_zram(dev);
+ ssize_t ret;
+
+ down_read(&zram->init_lock);
+ ret = scnprintf(buf, PAGE_SIZE,
+ "%8llu %8llu %8llu %8llu\n",
+ (u64)atomic64_read(&zram->stats.failed_reads),
+ (u64)atomic64_read(&zram->stats.failed_writes),
+ (u64)atomic64_read(&zram->stats.invalid_io),
+ (u64)atomic64_read(&zram->stats.notify_free));
+ up_read(&zram->init_lock);
+ return ret;
+}
+
+static ssize_t mm_stat_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct zram *zram = dev_to_zram(dev);
+ u64 orig_size, mem_used = 0;
+ long max_used;
+ ssize_t ret;
+
+ down_read(&zram->init_lock);
+ if (init_done(zram))
+ mem_used = zs_get_total_pages(zram->meta->mem_pool);
+
+ orig_size = atomic64_read(&zram->stats.pages_stored);
+ max_used = atomic_long_read(&zram->stats.max_used_pages);
+
+ ret = scnprintf(buf, PAGE_SIZE,
+ "%8llu %8llu %8llu %8lu %8ld %8llu %8llu\n",
+ orig_size << PAGE_SHIFT,
+ (u64)atomic64_read(&zram->stats.compr_data_size),
+ mem_used << PAGE_SHIFT,
+ zram->limit_pages << PAGE_SHIFT,
+ max_used << PAGE_SHIFT,
+ (u64)atomic64_read(&zram->stats.zero_pages),
+ (u64)atomic64_read(&zram->stats.num_migrated));
+ up_read(&zram->init_lock);
+
+ return ret;
+}
+
+static DEVICE_ATTR_RO(io_stat);
+static DEVICE_ATTR_RO(mm_stat);
ZRAM_ATTR_RO(num_reads);
ZRAM_ATTR_RO(num_writes);
ZRAM_ATTR_RO(failed_reads);
@@ -982,6 +1132,7 @@ static struct attribute *zram_disk_attrs[] = {
&dev_attr_num_writes.attr,
&dev_attr_failed_reads.attr,
&dev_attr_failed_writes.attr,
+ &dev_attr_compact.attr,
&dev_attr_invalid_io.attr,
&dev_attr_notify_free.attr,
&dev_attr_zero_pages.attr,
@@ -992,6 +1143,8 @@ static struct attribute *zram_disk_attrs[] = {
&dev_attr_mem_used_max.attr,
&dev_attr_max_comp_streams.attr,
&dev_attr_comp_algorithm.attr,
+ &dev_attr_io_stat.attr,
+ &dev_attr_mm_stat.attr,
NULL,
};
@@ -1001,32 +1154,34 @@ static struct attribute_group zram_disk_attr_group = {
static int create_device(struct zram *zram, int device_id)
{
+ struct request_queue *queue;
int ret = -ENOMEM;
init_rwsem(&zram->init_lock);
- zram->queue = blk_alloc_queue(GFP_KERNEL);
- if (!zram->queue) {
+ queue = blk_alloc_queue(GFP_KERNEL);
+ if (!queue) {
pr_err("Error allocating disk queue for device %d\n",
device_id);
goto out;
}
- blk_queue_make_request(zram->queue, zram_make_request);
- zram->queue->queuedata = zram;
+ blk_queue_make_request(queue, zram_make_request);
/* gendisk structure */
zram->disk = alloc_disk(1);
if (!zram->disk) {
pr_warn("Error allocating disk structure for device %d\n",
device_id);
+ ret = -ENOMEM;
goto out_free_queue;
}
zram->disk->major = zram_major;
zram->disk->first_minor = device_id;
zram->disk->fops = &zram_devops;
- zram->disk->queue = zram->queue;
+ zram->disk->queue = queue;
+ zram->disk->queue->queuedata = zram;
zram->disk->private_data = zram;
snprintf(zram->disk->disk_name, 16, "zram%d", device_id);
@@ -1077,20 +1232,35 @@ out_free_disk:
del_gendisk(zram->disk);
put_disk(zram->disk);
out_free_queue:
- blk_cleanup_queue(zram->queue);
+ blk_cleanup_queue(queue);
out:
return ret;
}
-static void destroy_device(struct zram *zram)
+static void destroy_devices(unsigned int nr)
{
- sysfs_remove_group(&disk_to_dev(zram->disk)->kobj,
- &zram_disk_attr_group);
+ struct zram *zram;
+ unsigned int i;
- del_gendisk(zram->disk);
- put_disk(zram->disk);
+ for (i = 0; i < nr; i++) {
+ zram = &zram_devices[i];
+ /*
+ * Remove sysfs first, so no one will perform a disksize
+ * store while we destroy the devices
+ */
+ sysfs_remove_group(&disk_to_dev(zram->disk)->kobj,
+ &zram_disk_attr_group);
+
+ zram_reset_device(zram);
- blk_cleanup_queue(zram->queue);
+ blk_cleanup_queue(zram->disk->queue);
+ del_gendisk(zram->disk);
+ put_disk(zram->disk);
+ }
+
+ kfree(zram_devices);
+ unregister_blkdev(zram_major, "zram");
+ pr_info("Destroyed %u device(s)\n", nr);
}
static int __init zram_init(void)
@@ -1100,64 +1270,39 @@ static int __init zram_init(void)
if (num_devices > max_num_devices) {
pr_warn("Invalid value for num_devices: %u\n",
num_devices);
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
zram_major = register_blkdev(0, "zram");
if (zram_major <= 0) {
pr_warn("Unable to get major number\n");
- ret = -EBUSY;
- goto out;
+ return -EBUSY;
}
/* Allocate the device array and initialize each one */
zram_devices = kzalloc(num_devices * sizeof(struct zram), GFP_KERNEL);
if (!zram_devices) {
- ret = -ENOMEM;
- goto unregister;
+ unregister_blkdev(zram_major, "zram");
+ return -ENOMEM;
}
for (dev_id = 0; dev_id < num_devices; dev_id++) {
ret = create_device(&zram_devices[dev_id], dev_id);
if (ret)
- goto free_devices;
+ goto out_error;
}
- pr_info("Created %u device(s) ...\n", num_devices);
-
+ pr_info("Created %u device(s)\n", num_devices);
return 0;
-free_devices:
- while (dev_id)
- destroy_device(&zram_devices[--dev_id]);
- kfree(zram_devices);
-unregister:
- unregister_blkdev(zram_major, "zram");
-out:
+out_error:
+ destroy_devices(dev_id);
return ret;
}
static void __exit zram_exit(void)
{
- int i;
- struct zram *zram;
-
- for (i = 0; i < num_devices; i++) {
- zram = &zram_devices[i];
-
- destroy_device(zram);
- /*
- * Shouldn't access zram->disk after destroy_device
- * because destroy_device already released zram->disk.
- */
- zram_reset_device(zram, false);
- }
-
- unregister_blkdev(zram_major, "zram");
-
- kfree(zram_devices);
- pr_debug("Cleanup done!\n");
+ destroy_devices(num_devices);
}
module_init(zram_init);
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index c6ee271317f5..570c598f4ce9 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -66,8 +66,8 @@ static const size_t max_zpage_size = PAGE_SIZE / 4 * 3;
/* Flags for zram pages (table[page_no].value) */
enum zram_pageflags {
/* Page consists entirely of zeros */
- ZRAM_ZERO = ZRAM_FLAG_SHIFT + 1,
- ZRAM_ACCESS, /* page in now accessed */
+ ZRAM_ZERO = ZRAM_FLAG_SHIFT,
+ ZRAM_ACCESS, /* page is now accessed */
__NR_ZRAM_PAGEFLAGS,
};
@@ -84,6 +84,7 @@ struct zram_stats {
atomic64_t compr_data_size; /* compressed size of pages stored */
atomic64_t num_reads; /* failed + successful */
atomic64_t num_writes; /* --do-- */
+ atomic64_t num_migrated; /* no. of migrated object */
atomic64_t failed_reads; /* can happen when memory is too low */
atomic64_t failed_writes; /* can happen when memory is too low */
atomic64_t invalid_io; /* non-page-aligned I/O requests */
@@ -100,24 +101,25 @@ struct zram_meta {
struct zram {
struct zram_meta *meta;
- struct request_queue *queue;
- struct gendisk *disk;
struct zcomp *comp;
-
- /* Prevent concurrent execution of device init, reset and R/W request */
+ struct gendisk *disk;
+ /* Prevent concurrent execution of device init */
struct rw_semaphore init_lock;
/*
- * This is the limit on amount of *uncompressed* worth of data
- * we can store in a disk.
+ * the number of pages zram can consume for storing compressed data
*/
- u64 disksize; /* bytes */
+ unsigned long limit_pages;
int max_comp_streams;
+
struct zram_stats stats;
+ atomic_t refcount; /* refcount for zram_meta */
+ /* wait all IO under all of cpu are done */
+ wait_queue_head_t io_done;
/*
- * the number of pages zram can consume for storing compressed data
+ * This is the limit on amount of *uncompressed* worth of data
+ * we can store in a disk.
*/
- unsigned long limit_pages;
-
+ u64 disksize; /* bytes */
char compressor[10];
};
#endif
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index 68dabd7edd1d..88985c0b2529 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -6,6 +6,19 @@ menu "Character devices"
source "drivers/tty/Kconfig"
+config DEVMEM
+ bool "Memory device driver"
+ default y
+ help
+ The memory driver provides two character devices, mem and kmem, which
+ provide access to the system's memory. The mem device is a view of
+ physical memory, and each byte in the device corresponds to the
+ matching physical address. The kmem device is the same as mem, but
+ the addresses correspond to the kernel's virtual address space rather
+ than physical memory. These devices are standard parts of a Linux
+ system and most users should say Y here. You might say N if very
+ security conscience or memory is tight.
+
config DEVKMEM
bool "/dev/kmem virtual device support"
default y
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 9a903511b357..1d66230f7fb8 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -58,6 +58,7 @@ static inline int valid_mmap_phys_addr_range(unsigned long pfn, size_t size)
}
#endif
+#if defined(CONFIG_DEVMEM) || defined(CONFIG_DEVKMEM)
#ifdef CONFIG_STRICT_DEVMEM
static inline int page_is_allowed(unsigned long pfn)
{
@@ -91,7 +92,9 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
return 1;
}
#endif
+#endif
+#ifdef CONFIG_DEVMEM
void __weak unxlate_dev_mem_ptr(unsigned long phys, void *addr)
{
}
@@ -238,6 +241,9 @@ static ssize_t write_mem(struct file *file, const char __user *buf,
*ppos += written;
return written;
}
+#endif /* CONFIG_DEVMEM */
+
+#if defined(CONFIG_DEVMEM) || defined(CONFIG_DEVKMEM)
int __weak phys_mem_access_prot_allowed(struct file *file,
unsigned long pfn, unsigned long size, pgprot_t *vma_prot)
@@ -364,6 +370,7 @@ static int mmap_mem(struct file *file, struct vm_area_struct *vma)
}
return 0;
}
+#endif /* CONFIG_DEVMEM */
#ifdef CONFIG_DEVKMEM
static int mmap_kmem(struct file *file, struct vm_area_struct *vma)
@@ -694,6 +701,8 @@ static loff_t null_lseek(struct file *file, loff_t offset, int orig)
return file->f_pos = 0;
}
+#if defined(CONFIG_DEVMEM) || defined(CONFIG_DEVKMEM) || defined(CONFIG_DEVPORT)
+
/*
* The memory devices use the full 32/64 bits of the offset, and so we cannot
* check against negative addresses: they are ok. The return value is weird,
@@ -727,10 +736,14 @@ static loff_t memory_lseek(struct file *file, loff_t offset, int orig)
return ret;
}
+#endif
+
+#if defined(CONFIG_DEVMEM) || defined(CONFIG_DEVKMEM) || defined(CONFIG_DEVPORT)
static int open_port(struct inode *inode, struct file *filp)
{
return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
}
+#endif
#define zero_lseek null_lseek
#define full_lseek null_lseek
@@ -739,6 +752,7 @@ static int open_port(struct inode *inode, struct file *filp)
#define open_mem open_port
#define open_kmem open_mem
+#ifdef CONFIG_DEVMEM
static const struct file_operations mem_fops = {
.llseek = memory_lseek,
.read = read_mem,
@@ -747,6 +761,7 @@ static const struct file_operations mem_fops = {
.open = open_mem,
.get_unmapped_area = get_unmapped_area_mem,
};
+#endif
#ifdef CONFIG_DEVKMEM
static const struct file_operations kmem_fops = {
@@ -809,7 +824,9 @@ static const struct memdev {
const struct file_operations *fops;
struct backing_dev_info *dev_info;
} devlist[] = {
+#ifdef CONFIG_DEVMEM
[1] = { "mem", 0, &mem_fops, &directly_mappable_cdev_bdi },
+#endif
#ifdef CONFIG_DEVKMEM
[2] = { "kmem", 0, &kmem_fops, &directly_mappable_cdev_bdi },
#endif
diff --git a/drivers/clk/Kconfig b/drivers/clk/Kconfig
index 455fd17d938e..dd112f0f7ef0 100644
--- a/drivers/clk/Kconfig
+++ b/drivers/clk/Kconfig
@@ -134,6 +134,24 @@ config COMMON_CLK_PXA
---help---
Sypport for the Marvell PXA SoC.
+config COMMON_CLK_FREQ_STATS_ACCOUNTING
+ bool "Enable clock frequency stats accounting"
+ depends on COMMON_CLK
+ depends on DEBUG_FS
+ ---help---
+ Allows accounting of the time spent by various clocks in each
+ of its operating frequency. The stats get reported as a part
+ of clk_summary. Would be be useful in finding out which
+ components are running at what power states to debug
+ battery consumption issues.
+
+config COMMON_CLK_BEGIN_ACCOUNTING_FROM_BOOT
+ bool "Start clock frequency stats accounting from boot"
+ depends on COMMON_CLK_FREQ_STATS_ACCOUNTING
+ ---help---
+ Enabling this option starts the frequency accounting right from
+ the boot.
+
source "drivers/clk/qcom/Kconfig"
endmenu
diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c
index 7d74830e2ced..d2f0a565f75f 100644
--- a/drivers/clk/clk.c
+++ b/drivers/clk/clk.c
@@ -114,6 +114,134 @@ static struct hlist_head *orphan_list[] = {
NULL,
};
+#ifdef CONFIG_COMMON_CLK_FREQ_STATS_ACCOUNTING
+
+#ifdef CONFIG_COMMON_CLK_BEGIN_ACCOUNTING_FROM_BOOT
+static bool freq_stats_on = true;
+#else
+static bool freq_stats_on;
+#endif /*CONFIG_COMMON_CLK_BEGIN_ACCOUNTING_FROM_BOOT*/
+
+static void free_tree(struct rb_node *node)
+{
+ struct freq_stats *this;
+
+ if (!node)
+ return;
+
+ free_tree(node->rb_left);
+ free_tree(node->rb_right);
+
+ this = rb_entry(node, struct freq_stats, node);
+ kfree(this);
+}
+
+static struct freq_stats *freq_stats_insert(struct rb_root *freq_stats_table,
+ unsigned long rate)
+{
+ struct rb_node **new = &(freq_stats_table->rb_node), *parent = NULL;
+ struct freq_stats *this;
+
+ /* Figure out where to put new node */
+ while (*new) {
+ this = rb_entry(*new, struct freq_stats, node);
+ parent = *new;
+
+ if (rate < this->rate)
+ new = &((*new)->rb_left);
+ else if (rate > this->rate)
+ new = &((*new)->rb_right);
+ else
+ return this;
+ }
+
+ this = kzalloc(sizeof(*this), GFP_ATOMIC);
+ this->rate = rate;
+
+ /* Add new node and rebalance tree. */
+ rb_link_node(&this->node, parent, new);
+ rb_insert_color(&this->node, freq_stats_table);
+
+ return this;
+}
+
+static void generic_print_freq_stats_table(struct seq_file *m,
+ struct clk *clk,
+ bool indent, int level)
+{
+ struct rb_node *pos;
+ struct freq_stats *cur;
+
+ if (indent)
+ seq_printf(m, "%*s*%s%20s", level * 3 + 1, "",
+ !clk->current_freq_stats ? "[" : "",
+ "default_freq");
+ else
+ seq_printf(m, "%2s%20s", !clk->current_freq_stats ? "[" : "",
+ "default_freq");
+
+ if (!clk->current_freq_stats && !ktime_equal(clk->start_time,
+ ktime_set(0, 0)))
+ seq_printf(m, "%40llu",
+ ktime_to_ms(ktime_add(clk->default_freq_time,
+ ktime_sub(ktime_get(), clk->start_time))));
+ else
+ seq_printf(m, "%40llu", ktime_to_ms(clk->default_freq_time));
+
+ if (!clk->current_freq_stats)
+ seq_puts(m, "]");
+
+ seq_puts(m, "\n");
+
+ for (pos = rb_first(&clk->freq_stats_table); pos; pos = rb_next(pos)) {
+ cur = rb_entry(pos, typeof(*cur), node);
+
+ if (indent)
+ seq_printf(m, "%*s*%s%20lu", level * 3 + 1, "",
+ cur->rate == clk->rate ? "[" : "", cur->rate);
+ else
+ seq_printf(m, "%2s%20lu", cur->rate == clk->rate ?
+ "[" : "", cur->rate);
+
+ if (cur->rate == clk->rate && !ktime_equal(clk->start_time,
+ ktime_set(0, 0)))
+ seq_printf(m, "%40llu",
+ ktime_to_ms(ktime_add(cur->time_spent,
+ ktime_sub(ktime_get(), clk->start_time))));
+ else
+ seq_printf(m, "%40llu", ktime_to_ms(cur->time_spent));
+
+ if (cur->rate == clk->rate)
+ seq_puts(m, "]");
+ seq_puts(m, "\n");
+ }
+}
+
+static int clock_print_freq_stats_table(struct seq_file *m, void *unused)
+{
+ struct clk *clk = m->private;
+
+ if (!(clk->flags & CLK_GET_RATE_NOCACHE))
+ generic_print_freq_stats_table(m, clk, false, 0);
+
+ return 0;
+}
+
+static int freq_stats_table_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, clock_print_freq_stats_table,
+ inode->i_private);
+}
+
+static const struct file_operations freq_stats_table_fops = {
+ .open = freq_stats_table_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+#endif /*CONFIG_COMMON_CLK_FREQ_STATS_ACCOUNTING*/
+
+
static void clk_summary_show_one(struct seq_file *s, struct clk *c, int level)
{
if (!c)
@@ -124,6 +252,12 @@ static void clk_summary_show_one(struct seq_file *s, struct clk *c, int level)
30 - level * 3, c->name,
c->enable_count, c->prepare_count, clk_get_rate(c),
clk_get_accuracy(c), clk_get_phase(c));
+
+#ifdef CONFIG_COMMON_CLK_FREQ_STATS_ACCOUNTING
+ if (!(c->flags & CLK_GET_RATE_NOCACHE))
+ generic_print_freq_stats_table(s, c, true, level);
+#endif /*CONFIG_COMMON_CLK_FREQ_STATS_ACCOUNTING*/
+
}
static void clk_summary_show_subtree(struct seq_file *s, struct clk *c,
@@ -240,6 +374,78 @@ static const struct file_operations clk_dump_fops = {
.release = single_release,
};
+#ifdef CONFIG_COMMON_CLK_FREQ_STATS_ACCOUNTING
+static int freq_stats_get(void *unused, u64 *val)
+{
+ *val = freq_stats_on;
+ return 0;
+}
+
+static void clk_traverse_subtree(struct clk *clk, int freq_stats_on)
+{
+ struct clk *child;
+ struct rb_node *node;
+
+ if (!clk)
+ return;
+
+ if (freq_stats_on) {
+ for (node = rb_first(&clk->freq_stats_table);
+ node; node = rb_next(node))
+ rb_entry(node, struct freq_stats, node)->time_spent =
+ ktime_set(0, 0);
+
+ clk->current_freq_stats = freq_stats_insert(
+ &clk->freq_stats_table,
+ clk_get_rate(clk));
+
+ if (clk->enable_count > 0)
+ clk->start_time = ktime_get();
+ } else {
+ if (clk->enable_count > 0) {
+ if (!clk->current_freq_stats)
+ clk->default_freq_time =
+ ktime_add(clk->default_freq_time,
+ ktime_sub(ktime_get(), clk->start_time));
+ else
+ clk->current_freq_stats->time_spent =
+ ktime_add(clk->current_freq_stats->time_spent,
+ ktime_sub(ktime_get(), clk->start_time));
+
+ clk->start_time = ktime_set(0, 0);
+ }
+ }
+ hlist_for_each_entry(child, &clk->children, child_node)
+ clk_traverse_subtree(child, freq_stats_on);
+}
+
+static int freq_stats_set(void *data, u64 val)
+{
+ struct clk *c;
+ unsigned long flags;
+ struct hlist_head **lists = (struct hlist_head **)data;
+
+ clk_prepare_lock();
+ flags = clk_enable_lock();
+
+ if (val == 0)
+ freq_stats_on = 0;
+ else
+ freq_stats_on = 1;
+
+ for (; *lists; lists++)
+ hlist_for_each_entry(c, *lists, child_node)
+ clk_traverse_subtree(c, freq_stats_on);
+
+ clk_enable_unlock(flags);
+ clk_prepare_unlock();
+
+ return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(freq_stats_fops, freq_stats_get,
+ freq_stats_set, "%llu\n");
+#endif /*CONFIG_COMMON_CLK_FREQ_STATS_ACCOUNTING*/
+
static int clk_debug_create_one(struct clk *clk, struct dentry *pdentry)
{
struct dentry *d;
@@ -291,6 +497,14 @@ static int clk_debug_create_one(struct clk *clk, struct dentry *pdentry)
if (!d)
goto err_out;
+#ifdef CONFIG_COMMON_CLK_FREQ_STATS_ACCOUNTING
+ d = debugfs_create_file("frequency_stats_table", S_IRUGO, clk->dentry,
+ clk, &freq_stats_table_fops);
+
+ if (!d)
+ goto err_out;
+#endif /*CONFIG_COMMON_CLK_FREQ_STATS_ACCOUNTING*/
+
if (clk->ops->debug_init) {
ret = clk->ops->debug_init(clk->hw, clk->dentry);
if (ret)
@@ -403,6 +617,13 @@ static int __init clk_debug_init(void)
if (!d)
return -ENOMEM;
+#ifdef CONFIG_COMMON_CLK_FREQ_STATS_ACCOUNTING
+ d = debugfs_create_file("freq_stats_on", S_IRUGO|S_IWUSR,
+ rootdir, &all_lists, &freq_stats_fops);
+ if (!d)
+ return -ENOMEM;
+#endif /*CONFIG_COMMON_CLK_FREQ_STATS_ACCOUNTING*/
+
mutex_lock(&clk_debug_lock);
hlist_for_each_entry(clk, &clk_debug_list, debug_node)
clk_debug_create_one(clk, rootdir);
@@ -852,6 +1073,22 @@ static void __clk_disable(struct clk *clk)
if (clk->ops->disable)
clk->ops->disable(clk->hw);
+#ifdef CONFIG_COMMON_CLK_FREQ_STATS_ACCOUNTING
+
+ if (freq_stats_on) {
+ if (!clk->current_freq_stats)
+ clk->default_freq_time =
+ ktime_add(clk->default_freq_time,
+ ktime_sub(ktime_get(), clk->start_time));
+ else
+ clk->current_freq_stats->time_spent =
+ ktime_add(clk->current_freq_stats->time_spent,
+ ktime_sub(ktime_get(), clk->start_time));
+
+ clk->start_time = ktime_set(0, 0);
+ }
+#endif /*CONFIG_COMMON_CLK_FREQ_STATS_ACCOUNTING*/
+
__clk_disable(clk->parent);
}
@@ -903,6 +1140,11 @@ static int __clk_enable(struct clk *clk)
return ret;
}
}
+
+#ifdef CONFIG_COMMON_CLK_FREQ_STATS_ACCOUNTING
+ if (freq_stats_on)
+ clk->start_time = ktime_get();
+#endif /*CONFIG_COMMON_CLK_FREQ_STATS_ACCOUNTING*/
}
clk->enable_count++;
@@ -1483,6 +1725,32 @@ static void clk_change_rate(struct clk *clk)
clk->rate = clk_recalc(clk, best_parent_rate);
+#ifdef CONFIG_COMMON_CLK_FREQ_STATS_ACCOUNTING
+ if (freq_stats_on) {
+ if (!ktime_equal(clk->start_time, ktime_set(0, 0))) {
+ if (!clk->current_freq_stats)
+ clk->default_freq_time =
+ ktime_add(clk->default_freq_time,
+ ktime_sub(ktime_get(),
+ clk->start_time));
+ else
+ clk->current_freq_stats->time_spent =
+ ktime_add(
+ clk->current_freq_stats->time_spent,
+ ktime_sub(ktime_get(),
+ clk->start_time));
+ }
+
+ clk->current_freq_stats = freq_stats_insert(
+ &clk->freq_stats_table,
+ clk->rate);
+
+ if (clk->enable_count > 0)
+ clk->start_time = ktime_get();
+ }
+#endif /*CONFIG_COMMON_CLK_FREQ_STATS_ACCOUNTING*/
+
+
if (clk->notifier_count && old_rate != clk->rate)
__clk_notify(clk, POST_RATE_CHANGE, old_rate, clk->rate);
@@ -2112,6 +2380,11 @@ static void __clk_release(struct kref *ref)
kfree(clk->parent_names);
kfree(clk->name);
+
+#ifdef CONFIG_COMMON_CLK_FREQ_STATS_ACCOUNTING
+ free_tree(clk->freq_stats_table.rb_node);
+#endif/*CONFIG_COMMON_CLK_FREQ_STATS_ACCOUNTING*/
+
kfree(clk);
}
diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index 3489f8f5fada..9a2669d61358 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -102,6 +102,23 @@ config CPU_FREQ_DEFAULT_GOV_CONSERVATIVE
Be aware that not all cpufreq drivers support the conservative
governor. If unsure have a look at the help section of the
driver. Fallback governor will be the performance governor.
+
+config CPU_FREQ_DEFAULT_GOV_INTERACTIVE
+ bool "interactive"
+ select CPU_FREQ_GOV_INTERACTIVE
+ help
+ Use the CPUFreq governor 'interactive' as default. This allows
+ you to get a full dynamic cpu frequency capable system by simply
+ loading your cpufreq low-level hardware driver, using the
+ 'interactive' governor for latency-sensitive workloads.
+
+config CPU_FREQ_DEFAULT_GOV_SCHED
+ bool "sched"
+ select CPU_FREQ_GOV_INTERACTIVE
+ help
+ Use the CPUfreq governor 'sched' as default. This scales
+ cpu frequency using CPU utilization estimates from the
+ scheduler.
endchoice
config CPU_FREQ_GOV_PERFORMANCE
@@ -159,6 +176,20 @@ config CPU_FREQ_GOV_ONDEMAND
If in doubt, say N.
+config CPU_FREQ_GOV_INTERACTIVE
+ bool "'interactive' cpufreq policy governor"
+ help
+ 'interactive' - This driver adds a dynamic cpufreq policy governor
+ designed for latency-sensitive workloads.
+
+ This governor attempts to reduce the latency of clock
+ increases so that the system is more responsive to
+ interactive workloads.
+
+ For details, take a look at linux/Documentation/cpu-freq.
+
+ If in doubt, say N.
+
config CPU_FREQ_GOV_CONSERVATIVE
tristate "'conservative' cpufreq governor"
depends on CPU_FREQ
@@ -183,6 +214,21 @@ config CPU_FREQ_GOV_CONSERVATIVE
If in doubt, say N.
+config CPU_FREQ_GOV_SCHED
+ bool "'sched' cpufreq governor"
+ depends on CPU_FREQ
+ depends on SMP
+ select CPU_FREQ_GOV_COMMON
+ help
+ 'sched' - this governor scales cpu frequency from the
+ scheduler as a function of cpu capacity utilization. It does
+ not evaluate utilization on a periodic basis (as ondemand
+ does) but instead is event-driven by the scheduler.
+
+ If in doubt, say N.
+
+comment "CPU frequency scaling drivers"
+
config CPUFREQ_DT
tristate "Generic DT based cpufreq driver"
depends on HAVE_CLK && OF
diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
index 92eeae40fe72..e040da342fee 100644
--- a/drivers/cpufreq/Makefile
+++ b/drivers/cpufreq/Makefile
@@ -10,6 +10,7 @@ obj-$(CONFIG_CPU_FREQ_GOV_POWERSAVE) += cpufreq_powersave.o
obj-$(CONFIG_CPU_FREQ_GOV_USERSPACE) += cpufreq_userspace.o
obj-$(CONFIG_CPU_FREQ_GOV_ONDEMAND) += cpufreq_ondemand.o
obj-$(CONFIG_CPU_FREQ_GOV_CONSERVATIVE) += cpufreq_conservative.o
+obj-$(CONFIG_CPU_FREQ_GOV_INTERACTIVE) += cpufreq_interactive.o
obj-$(CONFIG_CPU_FREQ_GOV_COMMON) += cpufreq_governor.o
obj-$(CONFIG_CPUFREQ_DT) += cpufreq-dt.o
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 8277d62dd301..d390ebbcec39 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -28,6 +28,9 @@
#include <linux/slab.h>
#include <linux/suspend.h>
#include <linux/tick.h>
+#ifdef CONFIG_SMP
+#include <linux/sched.h>
+#endif
#include <trace/events/power.h>
/**
@@ -102,6 +105,12 @@ bool have_governor_per_policy(void)
}
EXPORT_SYMBOL_GPL(have_governor_per_policy);
+bool cpufreq_driver_is_slow(void)
+{
+ return !(cpufreq_driver->flags & CPUFREQ_DRIVER_FAST);
+}
+EXPORT_SYMBOL_GPL(cpufreq_driver_is_slow);
+
struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy)
{
if (have_governor_per_policy())
@@ -278,6 +287,50 @@ static inline void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci)
}
#endif
+/*********************************************************************
+ * FREQUENCY INVARIANT CPU CAPACITY *
+ *********************************************************************/
+
+static DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE;
+static DEFINE_PER_CPU(unsigned long, max_freq_scale) = SCHED_CAPACITY_SCALE;
+
+static void
+scale_freq_capacity(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs)
+{
+ unsigned long cur = freqs ? freqs->new : policy->cur;
+ unsigned long scale = (cur << SCHED_CAPACITY_SHIFT) / policy->max;
+ struct cpufreq_cpuinfo *cpuinfo = &policy->cpuinfo;
+ int cpu;
+
+ pr_debug("cpus %*pbl cur/cur max freq %lu/%u kHz freq scale %lu\n",
+ cpumask_pr_args(policy->cpus), cur, policy->max, scale);
+
+ for_each_cpu(cpu, policy->cpus)
+ per_cpu(freq_scale, cpu) = scale;
+
+ if (freqs)
+ return;
+
+ scale = (policy->max << SCHED_CAPACITY_SHIFT) / cpuinfo->max_freq;
+
+ pr_debug("cpus %*pbl cur max/max freq %u/%u kHz max freq scale %lu\n",
+ cpumask_pr_args(policy->cpus), policy->max, cpuinfo->max_freq,
+ scale);
+
+ for_each_cpu(cpu, policy->cpus)
+ per_cpu(max_freq_scale, cpu) = scale;
+}
+
+unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu)
+{
+ return per_cpu(freq_scale, cpu);
+}
+
+unsigned long cpufreq_scale_max_freq_capacity(int cpu)
+{
+ return per_cpu(max_freq_scale, cpu);
+}
+
static void __cpufreq_notify_transition(struct cpufreq_policy *policy,
struct cpufreq_freqs *freqs, unsigned int state)
{
@@ -354,6 +407,9 @@ static void cpufreq_notify_post_transition(struct cpufreq_policy *policy,
void cpufreq_freq_transition_begin(struct cpufreq_policy *policy,
struct cpufreq_freqs *freqs)
{
+#ifdef CONFIG_SMP
+ int cpu;
+#endif
/*
* Catch double invocations of _begin() which lead to self-deadlock.
@@ -381,6 +437,12 @@ wait:
spin_unlock(&policy->transition_lock);
+ scale_freq_capacity(policy, freqs);
+#ifdef CONFIG_SMP
+ for_each_cpu(cpu, policy->cpus)
+ trace_cpu_capacity(capacity_curr_of(cpu), cpu);
+#endif
+
cpufreq_notify_transition(policy, freqs, CPUFREQ_PRECHANGE);
}
EXPORT_SYMBOL_GPL(cpufreq_freq_transition_begin);
@@ -2208,8 +2270,11 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
CPUFREQ_NOTIFY, new_policy);
+ scale_freq_capacity(new_policy, NULL);
+
policy->min = new_policy->min;
policy->max = new_policy->max;
+ trace_cpu_frequency_limits(policy->max, policy->min, policy->cpu);
pr_debug("new min and max freqs are %u - %u kHz\n",
policy->min, policy->max);
diff --git a/drivers/cpufreq/cpufreq_interactive.c b/drivers/cpufreq/cpufreq_interactive.c
new file mode 100644
index 000000000000..dd171e47da54
--- /dev/null
+++ b/drivers/cpufreq/cpufreq_interactive.c
@@ -0,0 +1,1338 @@
+/*
+ * drivers/cpufreq/cpufreq_interactive.c
+ *
+ * Copyright (C) 2010 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Author: Mike Chan (mike@android.com)
+ *
+ */
+
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/cpufreq.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/rwsem.h>
+#include <linux/sched.h>
+#include <linux/sched/rt.h>
+#include <linux/tick.h>
+#include <linux/time.h>
+#include <linux/timer.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <linux/slab.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/cpufreq_interactive.h>
+
+struct cpufreq_interactive_cpuinfo {
+ struct timer_list cpu_timer;
+ struct timer_list cpu_slack_timer;
+ spinlock_t load_lock; /* protects the next 4 fields */
+ u64 time_in_idle;
+ u64 time_in_idle_timestamp;
+ u64 cputime_speedadj;
+ u64 cputime_speedadj_timestamp;
+ struct cpufreq_policy *policy;
+ struct cpufreq_frequency_table *freq_table;
+ spinlock_t target_freq_lock; /*protects target freq */
+ unsigned int target_freq;
+ unsigned int floor_freq;
+ u64 pol_floor_val_time; /* policy floor_validate_time */
+ u64 loc_floor_val_time; /* per-cpu floor_validate_time */
+ u64 pol_hispeed_val_time; /* policy hispeed_validate_time */
+ u64 loc_hispeed_val_time; /* per-cpu hispeed_validate_time */
+ struct rw_semaphore enable_sem;
+ int governor_enabled;
+};
+
+static DEFINE_PER_CPU(struct cpufreq_interactive_cpuinfo, cpuinfo);
+
+/* realtime thread handles frequency scaling */
+static struct task_struct *speedchange_task;
+static cpumask_t speedchange_cpumask;
+static spinlock_t speedchange_cpumask_lock;
+static struct mutex gov_lock;
+
+/* Target load. Lower values result in higher CPU speeds. */
+#define DEFAULT_TARGET_LOAD 90
+static unsigned int default_target_loads[] = {DEFAULT_TARGET_LOAD};
+
+#define DEFAULT_TIMER_RATE (20 * USEC_PER_MSEC)
+#define DEFAULT_ABOVE_HISPEED_DELAY DEFAULT_TIMER_RATE
+static unsigned int default_above_hispeed_delay[] = {
+ DEFAULT_ABOVE_HISPEED_DELAY };
+
+struct cpufreq_interactive_tunables {
+ int usage_count;
+ /* Hi speed to bump to from lo speed when load burst (default max) */
+ unsigned int hispeed_freq;
+ /* Go to hi speed when CPU load at or above this value. */
+#define DEFAULT_GO_HISPEED_LOAD 99
+ unsigned long go_hispeed_load;
+ /* Target load. Lower values result in higher CPU speeds. */
+ spinlock_t target_loads_lock;
+ unsigned int *target_loads;
+ int ntarget_loads;
+ /*
+ * The minimum amount of time to spend at a frequency before we can ramp
+ * down.
+ */
+#define DEFAULT_MIN_SAMPLE_TIME (80 * USEC_PER_MSEC)
+ unsigned long min_sample_time;
+ /*
+ * The sample rate of the timer used to increase frequency
+ */
+ unsigned long timer_rate;
+ /*
+ * Wait this long before raising speed above hispeed, by default a
+ * single timer interval.
+ */
+ spinlock_t above_hispeed_delay_lock;
+ unsigned int *above_hispeed_delay;
+ int nabove_hispeed_delay;
+ /* Non-zero means indefinite speed boost active */
+ int boost_val;
+ /* Duration of a boot pulse in usecs */
+ int boostpulse_duration_val;
+ /* End time of boost pulse in ktime converted to usecs */
+ u64 boostpulse_endtime;
+ bool boosted;
+ /*
+ * Max additional time to wait in idle, beyond timer_rate, at speeds
+ * above minimum before wakeup to reduce speed, or -1 if unnecessary.
+ */
+#define DEFAULT_TIMER_SLACK (4 * DEFAULT_TIMER_RATE)
+ int timer_slack_val;
+ bool io_is_busy;
+};
+
+/* For cases where we have single governor instance for system */
+static struct cpufreq_interactive_tunables *common_tunables;
+
+static struct attribute_group *get_sysfs_attr(void);
+
+static void cpufreq_interactive_timer_resched(
+ struct cpufreq_interactive_cpuinfo *pcpu)
+{
+ struct cpufreq_interactive_tunables *tunables =
+ pcpu->policy->governor_data;
+ unsigned long expires;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pcpu->load_lock, flags);
+ pcpu->time_in_idle =
+ get_cpu_idle_time(smp_processor_id(),
+ &pcpu->time_in_idle_timestamp,
+ tunables->io_is_busy);
+ pcpu->cputime_speedadj = 0;
+ pcpu->cputime_speedadj_timestamp = pcpu->time_in_idle_timestamp;
+ expires = jiffies + usecs_to_jiffies(tunables->timer_rate);
+ mod_timer_pinned(&pcpu->cpu_timer, expires);
+
+ if (tunables->timer_slack_val >= 0 &&
+ pcpu->target_freq > pcpu->policy->min) {
+ expires += usecs_to_jiffies(tunables->timer_slack_val);
+ mod_timer_pinned(&pcpu->cpu_slack_timer, expires);
+ }
+
+ spin_unlock_irqrestore(&pcpu->load_lock, flags);
+}
+
+/* The caller shall take enable_sem write semaphore to avoid any timer race.
+ * The cpu_timer and cpu_slack_timer must be deactivated when calling this
+ * function.
+ */
+static void cpufreq_interactive_timer_start(
+ struct cpufreq_interactive_tunables *tunables, int cpu)
+{
+ struct cpufreq_interactive_cpuinfo *pcpu = &per_cpu(cpuinfo, cpu);
+ unsigned long expires = jiffies +
+ usecs_to_jiffies(tunables->timer_rate);
+ unsigned long flags;
+
+ pcpu->cpu_timer.expires = expires;
+ add_timer_on(&pcpu->cpu_timer, cpu);
+ if (tunables->timer_slack_val >= 0 &&
+ pcpu->target_freq > pcpu->policy->min) {
+ expires += usecs_to_jiffies(tunables->timer_slack_val);
+ pcpu->cpu_slack_timer.expires = expires;
+ add_timer_on(&pcpu->cpu_slack_timer, cpu);
+ }
+
+ spin_lock_irqsave(&pcpu->load_lock, flags);
+ pcpu->time_in_idle =
+ get_cpu_idle_time(cpu, &pcpu->time_in_idle_timestamp,
+ tunables->io_is_busy);
+ pcpu->cputime_speedadj = 0;
+ pcpu->cputime_speedadj_timestamp = pcpu->time_in_idle_timestamp;
+ spin_unlock_irqrestore(&pcpu->load_lock, flags);
+}
+
+static unsigned int freq_to_above_hispeed_delay(
+ struct cpufreq_interactive_tunables *tunables,
+ unsigned int freq)
+{
+ int i;
+ unsigned int ret;
+ unsigned long flags;
+
+ spin_lock_irqsave(&tunables->above_hispeed_delay_lock, flags);
+
+ for (i = 0; i < tunables->nabove_hispeed_delay - 1 &&
+ freq >= tunables->above_hispeed_delay[i+1]; i += 2)
+ ;
+
+ ret = tunables->above_hispeed_delay[i];
+ spin_unlock_irqrestore(&tunables->above_hispeed_delay_lock, flags);
+ return ret;
+}
+
+static unsigned int freq_to_targetload(
+ struct cpufreq_interactive_tunables *tunables, unsigned int freq)
+{
+ int i;
+ unsigned int ret;
+ unsigned long flags;
+
+ spin_lock_irqsave(&tunables->target_loads_lock, flags);
+
+ for (i = 0; i < tunables->ntarget_loads - 1 &&
+ freq >= tunables->target_loads[i+1]; i += 2)
+ ;
+
+ ret = tunables->target_loads[i];
+ spin_unlock_irqrestore(&tunables->target_loads_lock, flags);
+ return ret;
+}
+
+/*
+ * If increasing frequencies never map to a lower target load then
+ * choose_freq() will find the minimum frequency that does not exceed its
+ * target load given the current load.
+ */
+static unsigned int choose_freq(struct cpufreq_interactive_cpuinfo *pcpu,
+ unsigned int loadadjfreq)
+{
+ unsigned int freq = pcpu->policy->cur;
+ unsigned int prevfreq, freqmin, freqmax;
+ unsigned int tl;
+ int index;
+
+ freqmin = 0;
+ freqmax = UINT_MAX;
+
+ do {
+ prevfreq = freq;
+ tl = freq_to_targetload(pcpu->policy->governor_data, freq);
+
+ /*
+ * Find the lowest frequency where the computed load is less
+ * than or equal to the target load.
+ */
+
+ if (cpufreq_frequency_table_target(
+ pcpu->policy, pcpu->freq_table, loadadjfreq / tl,
+ CPUFREQ_RELATION_L, &index))
+ break;
+ freq = pcpu->freq_table[index].frequency;
+
+ if (freq > prevfreq) {
+ /* The previous frequency is too low. */
+ freqmin = prevfreq;
+
+ if (freq >= freqmax) {
+ /*
+ * Find the highest frequency that is less
+ * than freqmax.
+ */
+ if (cpufreq_frequency_table_target(
+ pcpu->policy, pcpu->freq_table,
+ freqmax - 1, CPUFREQ_RELATION_H,
+ &index))
+ break;
+ freq = pcpu->freq_table[index].frequency;
+
+ if (freq == freqmin) {
+ /*
+ * The first frequency below freqmax
+ * has already been found to be too
+ * low. freqmax is the lowest speed
+ * we found that is fast enough.
+ */
+ freq = freqmax;
+ break;
+ }
+ }
+ } else if (freq < prevfreq) {
+ /* The previous frequency is high enough. */
+ freqmax = prevfreq;
+
+ if (freq <= freqmin) {
+ /*
+ * Find the lowest frequency that is higher
+ * than freqmin.
+ */
+ if (cpufreq_frequency_table_target(
+ pcpu->policy, pcpu->freq_table,
+ freqmin + 1, CPUFREQ_RELATION_L,
+ &index))
+ break;
+ freq = pcpu->freq_table[index].frequency;
+
+ /*
+ * If freqmax is the first frequency above
+ * freqmin then we have already found that
+ * this speed is fast enough.
+ */
+ if (freq == freqmax)
+ break;
+ }
+ }
+
+ /* If same frequency chosen as previous then done. */
+ } while (freq != prevfreq);
+
+ return freq;
+}
+
+static u64 update_load(int cpu)
+{
+ struct cpufreq_interactive_cpuinfo *pcpu = &per_cpu(cpuinfo, cpu);
+ struct cpufreq_interactive_tunables *tunables =
+ pcpu->policy->governor_data;
+ u64 now;
+ u64 now_idle;
+ u64 delta_idle;
+ u64 delta_time;
+ u64 active_time;
+
+ now_idle = get_cpu_idle_time(cpu, &now, tunables->io_is_busy);
+ delta_idle = (now_idle - pcpu->time_in_idle);
+ delta_time = (now - pcpu->time_in_idle_timestamp);
+
+ if (delta_time <= delta_idle)
+ active_time = 0;
+ else
+ active_time = delta_time - delta_idle;
+
+ pcpu->cputime_speedadj += active_time * pcpu->policy->cur;
+
+ pcpu->time_in_idle = now_idle;
+ pcpu->time_in_idle_timestamp = now;
+ return now;
+}
+
+static void cpufreq_interactive_timer(unsigned long data)
+{
+ u64 now;
+ unsigned int delta_time;
+ u64 cputime_speedadj;
+ int cpu_load;
+ struct cpufreq_interactive_cpuinfo *pcpu =
+ &per_cpu(cpuinfo, data);
+ struct cpufreq_interactive_tunables *tunables =
+ pcpu->policy->governor_data;
+ unsigned int new_freq;
+ unsigned int loadadjfreq;
+ unsigned int index;
+ unsigned long flags;
+ u64 max_fvtime;
+
+ if (!down_read_trylock(&pcpu->enable_sem))
+ return;
+ if (!pcpu->governor_enabled)
+ goto exit;
+
+ spin_lock_irqsave(&pcpu->load_lock, flags);
+ now = update_load(data);
+ delta_time = (unsigned int)(now - pcpu->cputime_speedadj_timestamp);
+ cputime_speedadj = pcpu->cputime_speedadj;
+ spin_unlock_irqrestore(&pcpu->load_lock, flags);
+
+ if (WARN_ON_ONCE(!delta_time))
+ goto rearm;
+
+ spin_lock_irqsave(&pcpu->target_freq_lock, flags);
+ do_div(cputime_speedadj, delta_time);
+ loadadjfreq = (unsigned int)cputime_speedadj * 100;
+ cpu_load = loadadjfreq / pcpu->policy->cur;
+ tunables->boosted = tunables->boost_val || now < tunables->boostpulse_endtime;
+
+ if (cpu_load >= tunables->go_hispeed_load || tunables->boosted) {
+ if (pcpu->policy->cur < tunables->hispeed_freq) {
+ new_freq = tunables->hispeed_freq;
+ } else {
+ new_freq = choose_freq(pcpu, loadadjfreq);
+
+ if (new_freq < tunables->hispeed_freq)
+ new_freq = tunables->hispeed_freq;
+ }
+ } else {
+ new_freq = choose_freq(pcpu, loadadjfreq);
+ if (new_freq > tunables->hispeed_freq &&
+ pcpu->policy->cur < tunables->hispeed_freq)
+ new_freq = tunables->hispeed_freq;
+ }
+
+ if (pcpu->policy->cur >= tunables->hispeed_freq &&
+ new_freq > pcpu->policy->cur &&
+ now - pcpu->pol_hispeed_val_time <
+ freq_to_above_hispeed_delay(tunables, pcpu->policy->cur)) {
+ trace_cpufreq_interactive_notyet(
+ data, cpu_load, pcpu->target_freq,
+ pcpu->policy->cur, new_freq);
+ spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);
+ goto rearm;
+ }
+
+ pcpu->loc_hispeed_val_time = now;
+
+ if (cpufreq_frequency_table_target(pcpu->policy, pcpu->freq_table,
+ new_freq, CPUFREQ_RELATION_L,
+ &index)) {
+ spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);
+ goto rearm;
+ }
+
+ new_freq = pcpu->freq_table[index].frequency;
+
+ /*
+ * Do not scale below floor_freq unless we have been at or above the
+ * floor frequency for the minimum sample time since last validated.
+ */
+ max_fvtime = max(pcpu->pol_floor_val_time, pcpu->loc_floor_val_time);
+ if (new_freq < pcpu->floor_freq &&
+ pcpu->target_freq >= pcpu->policy->cur) {
+ if (now - max_fvtime < tunables->min_sample_time) {
+ trace_cpufreq_interactive_notyet(
+ data, cpu_load, pcpu->target_freq,
+ pcpu->policy->cur, new_freq);
+ spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);
+ goto rearm;
+ }
+ }
+
+ /*
+ * Update the timestamp for checking whether speed has been held at
+ * or above the selected frequency for a minimum of min_sample_time,
+ * if not boosted to hispeed_freq. If boosted to hispeed_freq then we
+ * allow the speed to drop as soon as the boostpulse duration expires
+ * (or the indefinite boost is turned off).
+ */
+
+ if (!tunables->boosted || new_freq > tunables->hispeed_freq) {
+ pcpu->floor_freq = new_freq;
+ if (pcpu->target_freq >= pcpu->policy->cur ||
+ new_freq >= pcpu->policy->cur)
+ pcpu->loc_floor_val_time = now;
+ }
+
+ if (pcpu->target_freq == new_freq &&
+ pcpu->target_freq <= pcpu->policy->cur) {
+ trace_cpufreq_interactive_already(
+ data, cpu_load, pcpu->target_freq,
+ pcpu->policy->cur, new_freq);
+ spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);
+ goto rearm;
+ }
+
+ trace_cpufreq_interactive_target(data, cpu_load, pcpu->target_freq,
+ pcpu->policy->cur, new_freq);
+
+ pcpu->target_freq = new_freq;
+ spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);
+ spin_lock_irqsave(&speedchange_cpumask_lock, flags);
+ cpumask_set_cpu(data, &speedchange_cpumask);
+ spin_unlock_irqrestore(&speedchange_cpumask_lock, flags);
+ wake_up_process(speedchange_task);
+
+rearm:
+ if (!timer_pending(&pcpu->cpu_timer))
+ cpufreq_interactive_timer_resched(pcpu);
+
+exit:
+ up_read(&pcpu->enable_sem);
+ return;
+}
+
+static void cpufreq_interactive_idle_end(void)
+{
+ struct cpufreq_interactive_cpuinfo *pcpu =
+ &per_cpu(cpuinfo, smp_processor_id());
+
+ if (!down_read_trylock(&pcpu->enable_sem))
+ return;
+ if (!pcpu->governor_enabled) {
+ up_read(&pcpu->enable_sem);
+ return;
+ }
+
+ /* Arm the timer for 1-2 ticks later if not already. */
+ if (!timer_pending(&pcpu->cpu_timer)) {
+ cpufreq_interactive_timer_resched(pcpu);
+ } else if (time_after_eq(jiffies, pcpu->cpu_timer.expires)) {
+ del_timer(&pcpu->cpu_timer);
+ del_timer(&pcpu->cpu_slack_timer);
+ cpufreq_interactive_timer(smp_processor_id());
+ }
+
+ up_read(&pcpu->enable_sem);
+}
+
+static int cpufreq_interactive_speedchange_task(void *data)
+{
+ unsigned int cpu;
+ cpumask_t tmp_mask;
+ unsigned long flags;
+ struct cpufreq_interactive_cpuinfo *pcpu;
+
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ spin_lock_irqsave(&speedchange_cpumask_lock, flags);
+
+ if (cpumask_empty(&speedchange_cpumask)) {
+ spin_unlock_irqrestore(&speedchange_cpumask_lock,
+ flags);
+ schedule();
+
+ if (kthread_should_stop())
+ break;
+
+ spin_lock_irqsave(&speedchange_cpumask_lock, flags);
+ }
+
+ set_current_state(TASK_RUNNING);
+ tmp_mask = speedchange_cpumask;
+ cpumask_clear(&speedchange_cpumask);
+ spin_unlock_irqrestore(&speedchange_cpumask_lock, flags);
+
+ for_each_cpu(cpu, &tmp_mask) {
+ unsigned int j;
+ unsigned int max_freq = 0;
+ struct cpufreq_interactive_cpuinfo *pjcpu;
+ u64 hvt = ~0ULL, fvt = 0;
+
+ pcpu = &per_cpu(cpuinfo, cpu);
+ if (!down_read_trylock(&pcpu->enable_sem))
+ continue;
+ if (!pcpu->governor_enabled) {
+ up_read(&pcpu->enable_sem);
+ continue;
+ }
+
+ for_each_cpu(j, pcpu->policy->cpus) {
+ pjcpu = &per_cpu(cpuinfo, j);
+
+ fvt = max(fvt, pjcpu->loc_floor_val_time);
+ if (pjcpu->target_freq > max_freq) {
+ max_freq = pjcpu->target_freq;
+ hvt = pjcpu->loc_hispeed_val_time;
+ } else if (pjcpu->target_freq == max_freq) {
+ hvt = min(hvt, pjcpu->loc_hispeed_val_time);
+ }
+ }
+ for_each_cpu(j, pcpu->policy->cpus) {
+ pjcpu = &per_cpu(cpuinfo, j);
+ pjcpu->pol_floor_val_time = fvt;
+ }
+
+ if (max_freq != pcpu->policy->cur) {
+ __cpufreq_driver_target(pcpu->policy,
+ max_freq,
+ CPUFREQ_RELATION_H);
+ for_each_cpu(j, pcpu->policy->cpus) {
+ pjcpu = &per_cpu(cpuinfo, j);
+ pjcpu->pol_hispeed_val_time = hvt;
+ }
+ }
+ trace_cpufreq_interactive_setspeed(cpu,
+ pcpu->target_freq,
+ pcpu->policy->cur);
+
+ up_read(&pcpu->enable_sem);
+ }
+ }
+
+ return 0;
+}
+
+static void cpufreq_interactive_boost(struct cpufreq_interactive_tunables *tunables)
+{
+ int i;
+ int anyboost = 0;
+ unsigned long flags[2];
+ struct cpufreq_interactive_cpuinfo *pcpu;
+
+ tunables->boosted = true;
+
+ spin_lock_irqsave(&speedchange_cpumask_lock, flags[0]);
+
+ for_each_online_cpu(i) {
+ pcpu = &per_cpu(cpuinfo, i);
+ if (tunables != pcpu->policy->governor_data)
+ continue;
+
+ spin_lock_irqsave(&pcpu->target_freq_lock, flags[1]);
+ if (pcpu->target_freq < tunables->hispeed_freq) {
+ pcpu->target_freq = tunables->hispeed_freq;
+ cpumask_set_cpu(i, &speedchange_cpumask);
+ pcpu->pol_hispeed_val_time =
+ ktime_to_us(ktime_get());
+ anyboost = 1;
+ }
+ spin_unlock_irqrestore(&pcpu->target_freq_lock, flags[1]);
+ }
+
+ spin_unlock_irqrestore(&speedchange_cpumask_lock, flags[0]);
+
+ if (anyboost)
+ wake_up_process(speedchange_task);
+}
+
+static int cpufreq_interactive_notifier(
+ struct notifier_block *nb, unsigned long val, void *data)
+{
+ struct cpufreq_freqs *freq = data;
+ struct cpufreq_interactive_cpuinfo *pcpu;
+ int cpu;
+ unsigned long flags;
+
+ if (val == CPUFREQ_POSTCHANGE) {
+ pcpu = &per_cpu(cpuinfo, freq->cpu);
+ if (!down_read_trylock(&pcpu->enable_sem))
+ return 0;
+ if (!pcpu->governor_enabled) {
+ up_read(&pcpu->enable_sem);
+ return 0;
+ }
+
+ for_each_cpu(cpu, pcpu->policy->cpus) {
+ struct cpufreq_interactive_cpuinfo *pjcpu =
+ &per_cpu(cpuinfo, cpu);
+ if (cpu != freq->cpu) {
+ if (!down_read_trylock(&pjcpu->enable_sem))
+ continue;
+ if (!pjcpu->governor_enabled) {
+ up_read(&pjcpu->enable_sem);
+ continue;
+ }
+ }
+ spin_lock_irqsave(&pjcpu->load_lock, flags);
+ update_load(cpu);
+ spin_unlock_irqrestore(&pjcpu->load_lock, flags);
+ if (cpu != freq->cpu)
+ up_read(&pjcpu->enable_sem);
+ }
+
+ up_read(&pcpu->enable_sem);
+ }
+ return 0;
+}
+
+static struct notifier_block cpufreq_notifier_block = {
+ .notifier_call = cpufreq_interactive_notifier,
+};
+
+static unsigned int *get_tokenized_data(const char *buf, int *num_tokens)
+{
+ const char *cp;
+ int i;
+ int ntokens = 1;
+ unsigned int *tokenized_data;
+ int err = -EINVAL;
+
+ cp = buf;
+ while ((cp = strpbrk(cp + 1, " :")))
+ ntokens++;
+
+ if (!(ntokens & 0x1))
+ goto err;
+
+ tokenized_data = kmalloc(ntokens * sizeof(unsigned int), GFP_KERNEL);
+ if (!tokenized_data) {
+ err = -ENOMEM;
+ goto err;
+ }
+
+ cp = buf;
+ i = 0;
+ while (i < ntokens) {
+ if (sscanf(cp, "%u", &tokenized_data[i++]) != 1)
+ goto err_kfree;
+
+ cp = strpbrk(cp, " :");
+ if (!cp)
+ break;
+ cp++;
+ }
+
+ if (i != ntokens)
+ goto err_kfree;
+
+ *num_tokens = ntokens;
+ return tokenized_data;
+
+err_kfree:
+ kfree(tokenized_data);
+err:
+ return ERR_PTR(err);
+}
+
+static ssize_t show_target_loads(
+ struct cpufreq_interactive_tunables *tunables,
+ char *buf)
+{
+ int i;
+ ssize_t ret = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&tunables->target_loads_lock, flags);
+
+ for (i = 0; i < tunables->ntarget_loads; i++)
+ ret += sprintf(buf + ret, "%u%s", tunables->target_loads[i],
+ i & 0x1 ? ":" : " ");
+
+ sprintf(buf + ret - 1, "\n");
+ spin_unlock_irqrestore(&tunables->target_loads_lock, flags);
+ return ret;
+}
+
+static ssize_t store_target_loads(
+ struct cpufreq_interactive_tunables *tunables,
+ const char *buf, size_t count)
+{
+ int ntokens;
+ unsigned int *new_target_loads = NULL;
+ unsigned long flags;
+
+ new_target_loads = get_tokenized_data(buf, &ntokens);
+ if (IS_ERR(new_target_loads))
+ return PTR_RET(new_target_loads);
+
+ spin_lock_irqsave(&tunables->target_loads_lock, flags);
+ if (tunables->target_loads != default_target_loads)
+ kfree(tunables->target_loads);
+ tunables->target_loads = new_target_loads;
+ tunables->ntarget_loads = ntokens;
+ spin_unlock_irqrestore(&tunables->target_loads_lock, flags);
+ return count;
+}
+
+static ssize_t show_above_hispeed_delay(
+ struct cpufreq_interactive_tunables *tunables, char *buf)
+{
+ int i;
+ ssize_t ret = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&tunables->above_hispeed_delay_lock, flags);
+
+ for (i = 0; i < tunables->nabove_hispeed_delay; i++)
+ ret += sprintf(buf + ret, "%u%s",
+ tunables->above_hispeed_delay[i],
+ i & 0x1 ? ":" : " ");
+
+ sprintf(buf + ret - 1, "\n");
+ spin_unlock_irqrestore(&tunables->above_hispeed_delay_lock, flags);
+ return ret;
+}
+
+static ssize_t store_above_hispeed_delay(
+ struct cpufreq_interactive_tunables *tunables,
+ const char *buf, size_t count)
+{
+ int ntokens;
+ unsigned int *new_above_hispeed_delay = NULL;
+ unsigned long flags;
+
+ new_above_hispeed_delay = get_tokenized_data(buf, &ntokens);
+ if (IS_ERR(new_above_hispeed_delay))
+ return PTR_RET(new_above_hispeed_delay);
+
+ spin_lock_irqsave(&tunables->above_hispeed_delay_lock, flags);
+ if (tunables->above_hispeed_delay != default_above_hispeed_delay)
+ kfree(tunables->above_hispeed_delay);
+ tunables->above_hispeed_delay = new_above_hispeed_delay;
+ tunables->nabove_hispeed_delay = ntokens;
+ spin_unlock_irqrestore(&tunables->above_hispeed_delay_lock, flags);
+ return count;
+
+}
+
+static ssize_t show_hispeed_freq(struct cpufreq_interactive_tunables *tunables,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", tunables->hispeed_freq);
+}
+
+static ssize_t store_hispeed_freq(struct cpufreq_interactive_tunables *tunables,
+ const char *buf, size_t count)
+{
+ int ret;
+ long unsigned int val;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret < 0)
+ return ret;
+ tunables->hispeed_freq = val;
+ return count;
+}
+
+static ssize_t show_go_hispeed_load(struct cpufreq_interactive_tunables
+ *tunables, char *buf)
+{
+ return sprintf(buf, "%lu\n", tunables->go_hispeed_load);
+}
+
+static ssize_t store_go_hispeed_load(struct cpufreq_interactive_tunables
+ *tunables, const char *buf, size_t count)
+{
+ int ret;
+ unsigned long val;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret < 0)
+ return ret;
+ tunables->go_hispeed_load = val;
+ return count;
+}
+
+static ssize_t show_min_sample_time(struct cpufreq_interactive_tunables
+ *tunables, char *buf)
+{
+ return sprintf(buf, "%lu\n", tunables->min_sample_time);
+}
+
+static ssize_t store_min_sample_time(struct cpufreq_interactive_tunables
+ *tunables, const char *buf, size_t count)
+{
+ int ret;
+ unsigned long val;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret < 0)
+ return ret;
+ tunables->min_sample_time = val;
+ return count;
+}
+
+static ssize_t show_timer_rate(struct cpufreq_interactive_tunables *tunables,
+ char *buf)
+{
+ return sprintf(buf, "%lu\n", tunables->timer_rate);
+}
+
+static ssize_t store_timer_rate(struct cpufreq_interactive_tunables *tunables,
+ const char *buf, size_t count)
+{
+ int ret;
+ unsigned long val, val_round;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret < 0)
+ return ret;
+
+ val_round = jiffies_to_usecs(usecs_to_jiffies(val));
+ if (val != val_round)
+ pr_warn("timer_rate not aligned to jiffy. Rounded up to %lu\n",
+ val_round);
+
+ tunables->timer_rate = val_round;
+ return count;
+}
+
+static ssize_t show_timer_slack(struct cpufreq_interactive_tunables *tunables,
+ char *buf)
+{
+ return sprintf(buf, "%d\n", tunables->timer_slack_val);
+}
+
+static ssize_t store_timer_slack(struct cpufreq_interactive_tunables *tunables,
+ const char *buf, size_t count)
+{
+ int ret;
+ unsigned long val;
+
+ ret = kstrtol(buf, 10, &val);
+ if (ret < 0)
+ return ret;
+
+ tunables->timer_slack_val = val;
+ return count;
+}
+
+static ssize_t show_boost(struct cpufreq_interactive_tunables *tunables,
+ char *buf)
+{
+ return sprintf(buf, "%d\n", tunables->boost_val);
+}
+
+static ssize_t store_boost(struct cpufreq_interactive_tunables *tunables,
+ const char *buf, size_t count)
+{
+ int ret;
+ unsigned long val;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret < 0)
+ return ret;
+
+ tunables->boost_val = val;
+
+ if (tunables->boost_val) {
+ trace_cpufreq_interactive_boost("on");
+ if (!tunables->boosted)
+ cpufreq_interactive_boost(tunables);
+ } else {
+ tunables->boostpulse_endtime = ktime_to_us(ktime_get());
+ trace_cpufreq_interactive_unboost("off");
+ }
+
+ return count;
+}
+
+static ssize_t store_boostpulse(struct cpufreq_interactive_tunables *tunables,
+ const char *buf, size_t count)
+{
+ int ret;
+ unsigned long val;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret < 0)
+ return ret;
+
+ tunables->boostpulse_endtime = ktime_to_us(ktime_get()) +
+ tunables->boostpulse_duration_val;
+ trace_cpufreq_interactive_boost("pulse");
+ if (!tunables->boosted)
+ cpufreq_interactive_boost(tunables);
+ return count;
+}
+
+static ssize_t show_boostpulse_duration(struct cpufreq_interactive_tunables
+ *tunables, char *buf)
+{
+ return sprintf(buf, "%d\n", tunables->boostpulse_duration_val);
+}
+
+static ssize_t store_boostpulse_duration(struct cpufreq_interactive_tunables
+ *tunables, const char *buf, size_t count)
+{
+ int ret;
+ unsigned long val;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret < 0)
+ return ret;
+
+ tunables->boostpulse_duration_val = val;
+ return count;
+}
+
+static ssize_t show_io_is_busy(struct cpufreq_interactive_tunables *tunables,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", tunables->io_is_busy);
+}
+
+static ssize_t store_io_is_busy(struct cpufreq_interactive_tunables *tunables,
+ const char *buf, size_t count)
+{
+ int ret;
+ unsigned long val;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret < 0)
+ return ret;
+ tunables->io_is_busy = val;
+ return count;
+}
+
+/*
+ * Create show/store routines
+ * - sys: One governor instance for complete SYSTEM
+ * - pol: One governor instance per struct cpufreq_policy
+ */
+#define show_gov_pol_sys(file_name) \
+static ssize_t show_##file_name##_gov_sys \
+(struct kobject *kobj, struct attribute *attr, char *buf) \
+{ \
+ return show_##file_name(common_tunables, buf); \
+} \
+ \
+static ssize_t show_##file_name##_gov_pol \
+(struct cpufreq_policy *policy, char *buf) \
+{ \
+ return show_##file_name(policy->governor_data, buf); \
+}
+
+#define store_gov_pol_sys(file_name) \
+static ssize_t store_##file_name##_gov_sys \
+(struct kobject *kobj, struct attribute *attr, const char *buf, \
+ size_t count) \
+{ \
+ return store_##file_name(common_tunables, buf, count); \
+} \
+ \
+static ssize_t store_##file_name##_gov_pol \
+(struct cpufreq_policy *policy, const char *buf, size_t count) \
+{ \
+ return store_##file_name(policy->governor_data, buf, count); \
+}
+
+#define show_store_gov_pol_sys(file_name) \
+show_gov_pol_sys(file_name); \
+store_gov_pol_sys(file_name)
+
+show_store_gov_pol_sys(target_loads);
+show_store_gov_pol_sys(above_hispeed_delay);
+show_store_gov_pol_sys(hispeed_freq);
+show_store_gov_pol_sys(go_hispeed_load);
+show_store_gov_pol_sys(min_sample_time);
+show_store_gov_pol_sys(timer_rate);
+show_store_gov_pol_sys(timer_slack);
+show_store_gov_pol_sys(boost);
+store_gov_pol_sys(boostpulse);
+show_store_gov_pol_sys(boostpulse_duration);
+show_store_gov_pol_sys(io_is_busy);
+
+#define gov_sys_attr_rw(_name) \
+static struct global_attr _name##_gov_sys = \
+__ATTR(_name, 0644, show_##_name##_gov_sys, store_##_name##_gov_sys)
+
+#define gov_pol_attr_rw(_name) \
+static struct freq_attr _name##_gov_pol = \
+__ATTR(_name, 0644, show_##_name##_gov_pol, store_##_name##_gov_pol)
+
+#define gov_sys_pol_attr_rw(_name) \
+ gov_sys_attr_rw(_name); \
+ gov_pol_attr_rw(_name)
+
+gov_sys_pol_attr_rw(target_loads);
+gov_sys_pol_attr_rw(above_hispeed_delay);
+gov_sys_pol_attr_rw(hispeed_freq);
+gov_sys_pol_attr_rw(go_hispeed_load);
+gov_sys_pol_attr_rw(min_sample_time);
+gov_sys_pol_attr_rw(timer_rate);
+gov_sys_pol_attr_rw(timer_slack);
+gov_sys_pol_attr_rw(boost);
+gov_sys_pol_attr_rw(boostpulse_duration);
+gov_sys_pol_attr_rw(io_is_busy);
+
+static struct global_attr boostpulse_gov_sys =
+ __ATTR(boostpulse, 0200, NULL, store_boostpulse_gov_sys);
+
+static struct freq_attr boostpulse_gov_pol =
+ __ATTR(boostpulse, 0200, NULL, store_boostpulse_gov_pol);
+
+/* One Governor instance for entire system */
+static struct attribute *interactive_attributes_gov_sys[] = {
+ &target_loads_gov_sys.attr,
+ &above_hispeed_delay_gov_sys.attr,
+ &hispeed_freq_gov_sys.attr,
+ &go_hispeed_load_gov_sys.attr,
+ &min_sample_time_gov_sys.attr,
+ &timer_rate_gov_sys.attr,
+ &timer_slack_gov_sys.attr,
+ &boost_gov_sys.attr,
+ &boostpulse_gov_sys.attr,
+ &boostpulse_duration_gov_sys.attr,
+ &io_is_busy_gov_sys.attr,
+ NULL,
+};
+
+static struct attribute_group interactive_attr_group_gov_sys = {
+ .attrs = interactive_attributes_gov_sys,
+ .name = "interactive",
+};
+
+/* Per policy governor instance */
+static struct attribute *interactive_attributes_gov_pol[] = {
+ &target_loads_gov_pol.attr,
+ &above_hispeed_delay_gov_pol.attr,
+ &hispeed_freq_gov_pol.attr,
+ &go_hispeed_load_gov_pol.attr,
+ &min_sample_time_gov_pol.attr,
+ &timer_rate_gov_pol.attr,
+ &timer_slack_gov_pol.attr,
+ &boost_gov_pol.attr,
+ &boostpulse_gov_pol.attr,
+ &boostpulse_duration_gov_pol.attr,
+ &io_is_busy_gov_pol.attr,
+ NULL,
+};
+
+static struct attribute_group interactive_attr_group_gov_pol = {
+ .attrs = interactive_attributes_gov_pol,
+ .name = "interactive",
+};
+
+static struct attribute_group *get_sysfs_attr(void)
+{
+ if (have_governor_per_policy())
+ return &interactive_attr_group_gov_pol;
+ else
+ return &interactive_attr_group_gov_sys;
+}
+
+static int cpufreq_interactive_idle_notifier(struct notifier_block *nb,
+ unsigned long val,
+ void *data)
+{
+ if (val == IDLE_END)
+ cpufreq_interactive_idle_end();
+
+ return 0;
+}
+
+static struct notifier_block cpufreq_interactive_idle_nb = {
+ .notifier_call = cpufreq_interactive_idle_notifier,
+};
+
+static int cpufreq_governor_interactive(struct cpufreq_policy *policy,
+ unsigned int event)
+{
+ int rc;
+ unsigned int j;
+ struct cpufreq_interactive_cpuinfo *pcpu;
+ struct cpufreq_frequency_table *freq_table;
+ struct cpufreq_interactive_tunables *tunables;
+ unsigned long flags;
+
+ if (have_governor_per_policy())
+ tunables = policy->governor_data;
+ else
+ tunables = common_tunables;
+
+ WARN_ON(!tunables && (event != CPUFREQ_GOV_POLICY_INIT));
+
+ switch (event) {
+ case CPUFREQ_GOV_POLICY_INIT:
+ if (have_governor_per_policy()) {
+ WARN_ON(tunables);
+ } else if (tunables) {
+ tunables->usage_count++;
+ policy->governor_data = tunables;
+ return 0;
+ }
+
+ tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
+ if (!tunables) {
+ pr_err("%s: POLICY_INIT: kzalloc failed\n", __func__);
+ return -ENOMEM;
+ }
+
+ tunables->usage_count = 1;
+ tunables->above_hispeed_delay = default_above_hispeed_delay;
+ tunables->nabove_hispeed_delay =
+ ARRAY_SIZE(default_above_hispeed_delay);
+ tunables->go_hispeed_load = DEFAULT_GO_HISPEED_LOAD;
+ tunables->target_loads = default_target_loads;
+ tunables->ntarget_loads = ARRAY_SIZE(default_target_loads);
+ tunables->min_sample_time = DEFAULT_MIN_SAMPLE_TIME;
+ tunables->timer_rate = DEFAULT_TIMER_RATE;
+ tunables->boostpulse_duration_val = DEFAULT_MIN_SAMPLE_TIME;
+ tunables->timer_slack_val = DEFAULT_TIMER_SLACK;
+
+ spin_lock_init(&tunables->target_loads_lock);
+ spin_lock_init(&tunables->above_hispeed_delay_lock);
+
+ policy->governor_data = tunables;
+ if (!have_governor_per_policy()) {
+ common_tunables = tunables;
+ WARN_ON(cpufreq_get_global_kobject());
+ }
+
+ rc = sysfs_create_group(get_governor_parent_kobj(policy),
+ get_sysfs_attr());
+ if (rc) {
+ kfree(tunables);
+ policy->governor_data = NULL;
+ if (!have_governor_per_policy()) {
+ common_tunables = NULL;
+ cpufreq_put_global_kobject();
+ }
+ return rc;
+ }
+
+ if (!policy->governor->initialized) {
+ idle_notifier_register(&cpufreq_interactive_idle_nb);
+ cpufreq_register_notifier(&cpufreq_notifier_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
+ }
+
+ break;
+
+ case CPUFREQ_GOV_POLICY_EXIT:
+ if (!--tunables->usage_count) {
+ if (policy->governor->initialized == 1) {
+ cpufreq_unregister_notifier(&cpufreq_notifier_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
+ idle_notifier_unregister(&cpufreq_interactive_idle_nb);
+ }
+
+ sysfs_remove_group(get_governor_parent_kobj(policy),
+ get_sysfs_attr());
+
+ if (!have_governor_per_policy())
+ cpufreq_put_global_kobject();
+
+ kfree(tunables);
+ common_tunables = NULL;
+ }
+
+ policy->governor_data = NULL;
+ break;
+
+ case CPUFREQ_GOV_START:
+ mutex_lock(&gov_lock);
+
+ freq_table = cpufreq_frequency_get_table(policy->cpu);
+ if (!tunables->hispeed_freq)
+ tunables->hispeed_freq = policy->max;
+
+ for_each_cpu(j, policy->cpus) {
+ pcpu = &per_cpu(cpuinfo, j);
+ pcpu->policy = policy;
+ pcpu->target_freq = policy->cur;
+ pcpu->freq_table = freq_table;
+ pcpu->floor_freq = pcpu->target_freq;
+ pcpu->pol_floor_val_time =
+ ktime_to_us(ktime_get());
+ pcpu->loc_floor_val_time = pcpu->pol_floor_val_time;
+ pcpu->pol_hispeed_val_time = pcpu->pol_floor_val_time;
+ pcpu->loc_hispeed_val_time = pcpu->pol_floor_val_time;
+ down_write(&pcpu->enable_sem);
+ del_timer_sync(&pcpu->cpu_timer);
+ del_timer_sync(&pcpu->cpu_slack_timer);
+ cpufreq_interactive_timer_start(tunables, j);
+ pcpu->governor_enabled = 1;
+ up_write(&pcpu->enable_sem);
+ }
+
+ mutex_unlock(&gov_lock);
+ break;
+
+ case CPUFREQ_GOV_STOP:
+ mutex_lock(&gov_lock);
+ for_each_cpu(j, policy->cpus) {
+ pcpu = &per_cpu(cpuinfo, j);
+ down_write(&pcpu->enable_sem);
+ pcpu->governor_enabled = 0;
+ del_timer_sync(&pcpu->cpu_timer);
+ del_timer_sync(&pcpu->cpu_slack_timer);
+ up_write(&pcpu->enable_sem);
+ }
+
+ mutex_unlock(&gov_lock);
+ break;
+
+ case CPUFREQ_GOV_LIMITS:
+ if (policy->max < policy->cur)
+ __cpufreq_driver_target(policy,
+ policy->max, CPUFREQ_RELATION_H);
+ else if (policy->min > policy->cur)
+ __cpufreq_driver_target(policy,
+ policy->min, CPUFREQ_RELATION_L);
+ for_each_cpu(j, policy->cpus) {
+ pcpu = &per_cpu(cpuinfo, j);
+
+ down_read(&pcpu->enable_sem);
+ if (pcpu->governor_enabled == 0) {
+ up_read(&pcpu->enable_sem);
+ continue;
+ }
+
+ spin_lock_irqsave(&pcpu->target_freq_lock, flags);
+ if (policy->max < pcpu->target_freq)
+ pcpu->target_freq = policy->max;
+ else if (policy->min > pcpu->target_freq)
+ pcpu->target_freq = policy->min;
+
+ spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);
+ up_read(&pcpu->enable_sem);
+ }
+ break;
+ }
+ return 0;
+}
+
+#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_INTERACTIVE
+static
+#endif
+struct cpufreq_governor cpufreq_gov_interactive = {
+ .name = "interactive",
+ .governor = cpufreq_governor_interactive,
+ .max_transition_latency = 10000000,
+ .owner = THIS_MODULE,
+};
+
+static void cpufreq_interactive_nop_timer(unsigned long data)
+{
+}
+
+static int __init cpufreq_interactive_init(void)
+{
+ unsigned int i;
+ struct cpufreq_interactive_cpuinfo *pcpu;
+ struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+
+ /* Initalize per-cpu timers */
+ for_each_possible_cpu(i) {
+ pcpu = &per_cpu(cpuinfo, i);
+ init_timer_deferrable(&pcpu->cpu_timer);
+ pcpu->cpu_timer.function = cpufreq_interactive_timer;
+ pcpu->cpu_timer.data = i;
+ init_timer(&pcpu->cpu_slack_timer);
+ pcpu->cpu_slack_timer.function = cpufreq_interactive_nop_timer;
+ spin_lock_init(&pcpu->load_lock);
+ spin_lock_init(&pcpu->target_freq_lock);
+ init_rwsem(&pcpu->enable_sem);
+ }
+
+ spin_lock_init(&speedchange_cpumask_lock);
+ mutex_init(&gov_lock);
+ speedchange_task =
+ kthread_create(cpufreq_interactive_speedchange_task, NULL,
+ "cfinteractive");
+ if (IS_ERR(speedchange_task))
+ return PTR_ERR(speedchange_task);
+
+ sched_setscheduler_nocheck(speedchange_task, SCHED_FIFO, &param);
+ get_task_struct(speedchange_task);
+
+ /* NB: wake up so the thread does not look hung to the freezer */
+ wake_up_process(speedchange_task);
+
+ return cpufreq_register_governor(&cpufreq_gov_interactive);
+}
+
+#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_INTERACTIVE
+fs_initcall(cpufreq_interactive_init);
+#else
+module_init(cpufreq_interactive_init);
+#endif
+
+static void __exit cpufreq_interactive_exit(void)
+{
+ cpufreq_unregister_governor(&cpufreq_gov_interactive);
+ kthread_stop(speedchange_task);
+ put_task_struct(speedchange_task);
+}
+
+module_exit(cpufreq_interactive_exit);
+
+MODULE_AUTHOR("Mike Chan <mike@android.com>");
+MODULE_DESCRIPTION("'cpufreq_interactive' - A cpufreq governor for "
+ "Latency sensitive workloads");
+MODULE_LICENSE("GPL");
diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
index 0cd9b4dcef99..a2f1d41820b8 100644
--- a/drivers/cpufreq/cpufreq_stats.c
+++ b/drivers/cpufreq/cpufreq_stats.c
@@ -13,6 +13,9 @@
#include <linux/cpufreq.h>
#include <linux/module.h>
#include <linux/slab.h>
+#include <linux/sort.h>
+#include <linux/of.h>
+#include <linux/sched.h>
#include <linux/cputime.h>
static spinlock_t cpufreq_stats_lock;
@@ -31,7 +34,28 @@ struct cpufreq_stats {
#endif
};
+struct all_cpufreq_stats {
+ unsigned int state_num;
+ cputime64_t *time_in_state;
+ unsigned int *freq_table;
+};
+
+struct cpufreq_power_stats {
+ unsigned int state_num;
+ unsigned int *curr;
+ unsigned int *freq_table;
+};
+
+struct all_freq_table {
+ unsigned int *freq_table;
+ unsigned int table_size;
+};
+
+static struct all_freq_table *all_freq_table;
+
+static DEFINE_PER_CPU(struct all_cpufreq_stats *, all_cpufreq_stats);
static DEFINE_PER_CPU(struct cpufreq_stats *, cpufreq_stats_table);
+static DEFINE_PER_CPU(struct cpufreq_power_stats *, cpufreq_power_stats);
struct cpufreq_stats_attribute {
struct attribute attr;
@@ -41,14 +65,24 @@ struct cpufreq_stats_attribute {
static int cpufreq_stats_update(unsigned int cpu)
{
struct cpufreq_stats *stat;
+ struct all_cpufreq_stats *all_stat;
unsigned long long cur_time;
cur_time = get_jiffies_64();
spin_lock(&cpufreq_stats_lock);
stat = per_cpu(cpufreq_stats_table, cpu);
- if (stat->time_in_state)
+ all_stat = per_cpu(all_cpufreq_stats, cpu);
+ if (!stat) {
+ spin_unlock(&cpufreq_stats_lock);
+ return 0;
+ }
+ if (stat->time_in_state) {
stat->time_in_state[stat->last_index] +=
cur_time - stat->last_time;
+ if (all_stat)
+ all_stat->time_in_state[stat->last_index] +=
+ cur_time - stat->last_time;
+ }
stat->last_time = cur_time;
spin_unlock(&cpufreq_stats_lock);
return 0;
@@ -79,6 +113,104 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf)
return len;
}
+static int get_index_all_cpufreq_stat(struct all_cpufreq_stats *all_stat,
+ unsigned int freq)
+{
+ int i;
+ if (!all_stat)
+ return -1;
+ for (i = 0; i < all_stat->state_num; i++) {
+ if (all_stat->freq_table[i] == freq)
+ return i;
+ }
+ return -1;
+}
+
+void acct_update_power(struct task_struct *task, cputime_t cputime) {
+ struct cpufreq_power_stats *powerstats;
+ struct cpufreq_stats *stats;
+ unsigned int cpu_num, curr;
+
+ if (!task)
+ return;
+ cpu_num = task_cpu(task);
+ powerstats = per_cpu(cpufreq_power_stats, cpu_num);
+ stats = per_cpu(cpufreq_stats_table, cpu_num);
+ if (!powerstats || !stats)
+ return;
+
+ curr = powerstats->curr[stats->last_index];
+ if (task->cpu_power != ULLONG_MAX)
+ task->cpu_power += curr * cputime_to_usecs(cputime);
+}
+EXPORT_SYMBOL_GPL(acct_update_power);
+
+static ssize_t show_current_in_state(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ ssize_t len = 0;
+ unsigned int i, cpu;
+ struct cpufreq_power_stats *powerstats;
+
+ spin_lock(&cpufreq_stats_lock);
+ for_each_possible_cpu(cpu) {
+ powerstats = per_cpu(cpufreq_power_stats, cpu);
+ if (!powerstats)
+ continue;
+ len += scnprintf(buf + len, PAGE_SIZE - len, "CPU%d:", cpu);
+ for (i = 0; i < powerstats->state_num; i++)
+ len += scnprintf(buf + len, PAGE_SIZE - len,
+ "%d=%d ", powerstats->freq_table[i],
+ powerstats->curr[i]);
+ len += scnprintf(buf + len, PAGE_SIZE - len, "\n");
+ }
+ spin_unlock(&cpufreq_stats_lock);
+ return len;
+}
+
+static ssize_t show_all_time_in_state(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ ssize_t len = 0;
+ unsigned int i, cpu, freq, index;
+ struct all_cpufreq_stats *all_stat;
+ struct cpufreq_policy *policy;
+
+ len += scnprintf(buf + len, PAGE_SIZE - len, "freq\t\t");
+ for_each_possible_cpu(cpu) {
+ len += scnprintf(buf + len, PAGE_SIZE - len, "cpu%d\t\t", cpu);
+ if (cpu_online(cpu))
+ cpufreq_stats_update(cpu);
+ }
+
+ if (!all_freq_table)
+ goto out;
+ for (i = 0; i < all_freq_table->table_size; i++) {
+ freq = all_freq_table->freq_table[i];
+ len += scnprintf(buf + len, PAGE_SIZE - len, "\n%u\t\t", freq);
+ for_each_possible_cpu(cpu) {
+ policy = cpufreq_cpu_get(cpu);
+ if (policy == NULL)
+ continue;
+ all_stat = per_cpu(all_cpufreq_stats, policy->cpu);
+ index = get_index_all_cpufreq_stat(all_stat, freq);
+ if (index != -1) {
+ len += scnprintf(buf + len, PAGE_SIZE - len,
+ "%llu\t\t", (unsigned long long)
+ cputime64_to_clock_t(all_stat->time_in_state[index]));
+ } else {
+ len += scnprintf(buf + len, PAGE_SIZE - len,
+ "N/A\t\t");
+ }
+ cpufreq_cpu_put(policy);
+ }
+ }
+
+out:
+ len += scnprintf(buf + len, PAGE_SIZE - len, "\n");
+ return len;
+}
+
#ifdef CONFIG_CPU_FREQ_STAT_DETAILS
static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf)
{
@@ -142,6 +274,12 @@ static struct attribute_group stats_attr_group = {
.name = "stats"
};
+static struct kobj_attribute _attr_all_time_in_state = __ATTR(all_time_in_state,
+ 0444, show_all_time_in_state, NULL);
+
+static struct kobj_attribute _attr_current_in_state = __ATTR(current_in_state,
+ 0444, show_current_in_state, NULL);
+
static int freq_table_get_index(struct cpufreq_stats *stat, unsigned int freq)
{
int index;
@@ -180,17 +318,54 @@ static void cpufreq_stats_free_table(unsigned int cpu)
cpufreq_cpu_put(policy);
}
-static int __cpufreq_stats_create_table(struct cpufreq_policy *policy)
+static void cpufreq_allstats_free(void)
+{
+ int cpu;
+ struct all_cpufreq_stats *all_stat;
+
+ sysfs_remove_file(cpufreq_global_kobject,
+ &_attr_all_time_in_state.attr);
+
+ for_each_possible_cpu(cpu) {
+ all_stat = per_cpu(all_cpufreq_stats, cpu);
+ if (!all_stat)
+ continue;
+ kfree(all_stat->time_in_state);
+ kfree(all_stat);
+ per_cpu(all_cpufreq_stats, cpu) = NULL;
+ }
+ if (all_freq_table) {
+ kfree(all_freq_table->freq_table);
+ kfree(all_freq_table);
+ all_freq_table = NULL;
+ }
+}
+
+static void cpufreq_powerstats_free(void)
+{
+ int cpu;
+ struct cpufreq_power_stats *powerstats;
+
+ sysfs_remove_file(cpufreq_global_kobject, &_attr_current_in_state.attr);
+
+ for_each_possible_cpu(cpu) {
+ powerstats = per_cpu(cpufreq_power_stats, cpu);
+ if (!powerstats)
+ continue;
+ kfree(powerstats->curr);
+ kfree(powerstats);
+ per_cpu(cpufreq_power_stats, cpu) = NULL;
+ }
+}
+
+static int __cpufreq_stats_create_table(struct cpufreq_policy *policy,
+ struct cpufreq_frequency_table *table, int count)
{
- unsigned int i, count = 0, ret = 0;
+ unsigned int i, ret = 0;
struct cpufreq_stats *stat;
unsigned int alloc_size;
unsigned int cpu = policy->cpu;
- struct cpufreq_frequency_table *pos, *table;
-
- table = cpufreq_frequency_get_table(cpu);
- if (unlikely(!table))
- return 0;
+ struct cpufreq_frequency_table *pos;
if (per_cpu(cpufreq_stats_table, cpu))
return -EBUSY;
@@ -205,9 +380,6 @@ static int __cpufreq_stats_create_table(struct cpufreq_policy *policy)
stat->cpu = cpu;
per_cpu(cpufreq_stats_table, cpu) = stat;
- cpufreq_for_each_valid_entry(pos, table)
- count++;
-
alloc_size = count * sizeof(int) + count * sizeof(u64);
#ifdef CONFIG_CPU_FREQ_STAT_DETAILS
@@ -242,10 +414,159 @@ error_out:
return ret;
}
+static void cpufreq_stats_update_policy_cpu(struct cpufreq_policy *policy)
+{
+ struct cpufreq_stats *stat = per_cpu(cpufreq_stats_table,
+ policy->last_cpu);
+
+ pr_debug("Updating stats_table for new_cpu %u from last_cpu %u\n",
+ policy->cpu, policy->last_cpu);
+ per_cpu(cpufreq_stats_table, policy->cpu) = per_cpu(cpufreq_stats_table,
+ policy->last_cpu);
+ per_cpu(cpufreq_stats_table, policy->last_cpu) = NULL;
+ stat->cpu = policy->cpu;
+}
+
+static void cpufreq_powerstats_create(unsigned int cpu,
+ struct cpufreq_frequency_table *table, int count) {
+ unsigned int alloc_size, i = 0, ret = 0;
+ struct cpufreq_power_stats *powerstats;
+ struct cpufreq_frequency_table *pos;
+ struct device_node *cpu_node;
+ char device_path[16];
+
+ powerstats = kzalloc(sizeof(struct cpufreq_power_stats),
+ GFP_KERNEL);
+ if (!powerstats)
+ return;
+
+ /* Allocate memory for freq table per cpu as well as clockticks per
+ * freq*/
+ alloc_size = count * sizeof(unsigned int) +
+ count * sizeof(unsigned int);
+ powerstats->curr = kzalloc(alloc_size, GFP_KERNEL);
+ if (!powerstats->curr) {
+ kfree(powerstats);
+ return;
+ }
+ powerstats->freq_table = powerstats->curr + count;
+
+ spin_lock(&cpufreq_stats_lock);
+ i = 0;
+ cpufreq_for_each_valid_entry(pos, table)
+ powerstats->freq_table[i++] = pos->frequency;
+ powerstats->state_num = i;
+
+ snprintf(device_path, sizeof(device_path), "/cpus/cpu@%d", cpu);
+ cpu_node = of_find_node_by_path(device_path);
+ if (cpu_node) {
+ ret = of_property_read_u32_array(cpu_node, "current",
+ powerstats->curr, count);
+ if (ret) {
+ kfree(powerstats->curr);
+ kfree(powerstats);
+ powerstats = NULL;
+ }
+ }
+ per_cpu(cpufreq_power_stats, cpu) = powerstats;
+ spin_unlock(&cpufreq_stats_lock);
+}
+
+static int compare_for_sort(const void *lhs_ptr, const void *rhs_ptr)
+{
+ unsigned int lhs = *(const unsigned int *)(lhs_ptr);
+ unsigned int rhs = *(const unsigned int *)(rhs_ptr);
+ if (lhs < rhs)
+ return -1;
+ if (lhs > rhs)
+ return 1;
+ return 0;
+}
+
+static bool check_all_freq_table(unsigned int freq)
+{
+ int i;
+ for (i = 0; i < all_freq_table->table_size; i++) {
+ if (freq == all_freq_table->freq_table[i])
+ return true;
+ }
+ return false;
+}
+
+static void create_all_freq_table(void)
+{
+ all_freq_table = kzalloc(sizeof(struct all_freq_table),
+ GFP_KERNEL);
+ if (!all_freq_table)
+ pr_warn("could not allocate memory for all_freq_table\n");
+ return;
+}
+
+static void add_all_freq_table(unsigned int freq)
+{
+ unsigned int size;
+ size = sizeof(unsigned int) * (all_freq_table->table_size + 1);
+ all_freq_table->freq_table = krealloc(all_freq_table->freq_table,
+ size, GFP_ATOMIC);
+ if (IS_ERR(all_freq_table->freq_table)) {
+ pr_warn("Could not reallocate memory for freq_table\n");
+ all_freq_table->freq_table = NULL;
+ return;
+ }
+ all_freq_table->freq_table[all_freq_table->table_size++] = freq;
+}
+
+static void cpufreq_allstats_create(unsigned int cpu,
+ struct cpufreq_frequency_table *table, int count)
+{
+ int i , j = 0;
+ unsigned int alloc_size;
+ struct all_cpufreq_stats *all_stat;
+ bool sort_needed = false;
+
+ all_stat = kzalloc(sizeof(struct all_cpufreq_stats),
+ GFP_KERNEL);
+ if (!all_stat) {
+ pr_warn("Cannot allocate memory for cpufreq stats\n");
+ return;
+ }
+
+ /*Allocate memory for freq table per cpu as well as clockticks per freq*/
+ alloc_size = count * sizeof(int) + count * sizeof(cputime64_t);
+ all_stat->time_in_state = kzalloc(alloc_size, GFP_KERNEL);
+ if (!all_stat->time_in_state) {
+ pr_warn("Cannot allocate memory for cpufreq time_in_state\n");
+ kfree(all_stat);
+ all_stat = NULL;
+ return;
+ }
+ all_stat->freq_table = (unsigned int *)
+ (all_stat->time_in_state + count);
+
+ spin_lock(&cpufreq_stats_lock);
+ for (i = 0; table[i].frequency != CPUFREQ_TABLE_END; i++) {
+ unsigned int freq = table[i].frequency;
+ if (freq == CPUFREQ_ENTRY_INVALID)
+ continue;
+ all_stat->freq_table[j++] = freq;
+ if (all_freq_table && !check_all_freq_table(freq)) {
+ add_all_freq_table(freq);
+ sort_needed = true;
+ }
+ }
+ if (sort_needed)
+ sort(all_freq_table->freq_table, all_freq_table->table_size,
+ sizeof(unsigned int), &compare_for_sort, NULL);
+ all_stat->state_num = j;
+ per_cpu(all_cpufreq_stats, cpu) = all_stat;
+ spin_unlock(&cpufreq_stats_lock);
+}
+
static void cpufreq_stats_create_table(unsigned int cpu)
{
struct cpufreq_policy *policy;
-
+ struct cpufreq_frequency_table *table, *pos;
+ int count = 0;
/*
* "likely(!policy)" because normally cpufreq_stats will be registered
* before cpufreq driver
@@ -254,37 +575,52 @@ static void cpufreq_stats_create_table(unsigned int cpu)
if (likely(!policy))
return;
- __cpufreq_stats_create_table(policy);
+ table = cpufreq_frequency_get_table(policy->cpu);
+ if (likely(table)) {
+ cpufreq_for_each_valid_entry(pos, table)
+ count++;
- cpufreq_cpu_put(policy);
-}
+ if (!per_cpu(all_cpufreq_stats, cpu))
+ cpufreq_allstats_create(cpu, table, count);
-static void cpufreq_stats_update_policy_cpu(struct cpufreq_policy *policy)
-{
- struct cpufreq_stats *stat = per_cpu(cpufreq_stats_table,
- policy->last_cpu);
+ if (!per_cpu(cpufreq_power_stats, cpu))
+ cpufreq_powerstats_create(cpu, table, count);
- pr_debug("Updating stats_table for new_cpu %u from last_cpu %u\n",
- policy->cpu, policy->last_cpu);
- per_cpu(cpufreq_stats_table, policy->cpu) = per_cpu(cpufreq_stats_table,
- policy->last_cpu);
- per_cpu(cpufreq_stats_table, policy->last_cpu) = NULL;
- stat->cpu = policy->cpu;
+ __cpufreq_stats_create_table(policy, table, count);
+ }
+ cpufreq_cpu_put(policy);
}
static int cpufreq_stat_notifier_policy(struct notifier_block *nb,
unsigned long val, void *data)
{
- int ret = 0;
+ int ret = 0, count = 0;
struct cpufreq_policy *policy = data;
+ struct cpufreq_frequency_table *table, *pos;
+ unsigned int cpu_num, cpu = policy->cpu;
if (val == CPUFREQ_UPDATE_POLICY_CPU) {
cpufreq_stats_update_policy_cpu(policy);
return 0;
}
+ table = cpufreq_frequency_get_table(cpu);
+ if (!table)
+ return 0;
+
+ cpufreq_for_each_valid_entry(pos, table)
+ count++;
+
+ if (!per_cpu(all_cpufreq_stats, cpu))
+ cpufreq_allstats_create(cpu, table, count);
+
+ for_each_possible_cpu(cpu_num) {
+ if (!per_cpu(cpufreq_power_stats, cpu_num))
+ cpufreq_powerstats_create(cpu_num, table, count);
+ }
+
if (val == CPUFREQ_CREATE_POLICY)
- ret = __cpufreq_stats_create_table(policy);
+ ret = __cpufreq_stats_create_table(policy, table, count);
else if (val == CPUFREQ_REMOVE_POLICY)
__cpufreq_stats_free_table(policy);
@@ -359,6 +695,18 @@ static int __init cpufreq_stats_init(void)
return ret;
}
+ create_all_freq_table();
+ WARN_ON(cpufreq_get_global_kobject());
+ ret = sysfs_create_file(cpufreq_global_kobject,
+ &_attr_all_time_in_state.attr);
+ if (ret)
+ pr_warn("Cannot create sysfs file for cpufreq stats\n");
+
+ ret = sysfs_create_file(cpufreq_global_kobject,
+ &_attr_current_in_state.attr);
+ if (ret)
+ pr_warn("Cannot create sysfs file for cpufreq current stats\n");
+
return 0;
}
static void __exit cpufreq_stats_exit(void)
@@ -371,6 +719,9 @@ static void __exit cpufreq_stats_exit(void)
CPUFREQ_TRANSITION_NOTIFIER);
for_each_online_cpu(cpu)
cpufreq_stats_free_table(cpu);
+ cpufreq_allstats_free();
+ cpufreq_powerstats_free();
+ cpufreq_put_global_kobject();
}
MODULE_AUTHOR("Zou Nan hai <nanhai.zou@intel.com>");
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 59dc6db6be15..ea5bc6f27c0d 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -19,6 +19,7 @@
#include <linux/ktime.h>
#include <linux/hrtimer.h>
#include <linux/module.h>
+#include <linux/suspend.h>
#include <trace/events/power.h>
#include "cpuidle.h"
@@ -32,7 +33,6 @@ LIST_HEAD(cpuidle_detected_devices);
static int enabled_devices;
static int off __read_mostly;
static int initialized __read_mostly;
-static bool use_deepest_state __read_mostly;
int cpuidle_disabled(void)
{
@@ -66,24 +66,9 @@ int cpuidle_play_dead(void)
}
/**
- * cpuidle_use_deepest_state - Enable/disable the "deepest idle" mode.
- * @enable: Whether enable or disable the feature.
- *
- * If the "deepest idle" mode is enabled, cpuidle will ignore the governor and
- * always use the state with the greatest exit latency (out of the states that
- * are not disabled).
- *
- * This function can only be called after cpuidle_pause() to avoid races.
- */
-void cpuidle_use_deepest_state(bool enable)
-{
- use_deepest_state = enable;
-}
-
-/**
- * cpuidle_find_deepest_state - Find the state of the greatest exit latency.
- * @drv: cpuidle driver for a given CPU.
- * @dev: cpuidle device for a given CPU.
+ * cpuidle_find_deepest_state - Find deepest state meeting specific conditions.
+ * @drv: cpuidle driver for the given CPU.
+ * @dev: cpuidle device for the given CPU.
*/
static int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
struct cpuidle_device *dev)
@@ -105,6 +90,27 @@ static int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
}
/**
+ * cpuidle_enter_freeze - Enter an idle state suitable for suspend-to-idle.
+ *
+ * Find the deepest state available and enter it.
+ */
+void cpuidle_enter_freeze(void)
+{
+ struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
+ struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
+ int index;
+
+ index = cpuidle_find_deepest_state(drv, dev);
+ if (index >= 0)
+ cpuidle_enter(drv, dev, index);
+ else
+ arch_cpu_idle();
+
+ /* Interrupts are enabled again here. */
+ local_irq_disable();
+}
+
+/**
* cpuidle_enter_state - enter the state and update stats
* @dev: cpuidle device for this cpu
* @drv: cpuidle driver for this cpu
@@ -119,6 +125,9 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
ktime_t time_start, time_end;
s64 diff;
+ /* Take note of the planned idle state. */
+ sched_idle_set_state(target_state, index);
+
trace_cpu_idle_rcuidle(index, dev->cpu);
time_start = ktime_get();
@@ -127,6 +136,9 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
time_end = ktime_get();
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
+ /* The cpu is no longer idle or about to enter idle. */
+ sched_idle_set_state(NULL, -1);
+
if (!cpuidle_state_is_coupled(dev, drv, index))
local_irq_enable();
@@ -166,9 +178,6 @@ int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
if (!drv || !dev || !dev->enabled)
return -EBUSY;
- if (unlikely(use_deepest_state))
- return cpuidle_find_deepest_state(drv, dev);
-
return cpuidle_curr_governor->select(drv, dev);
}
@@ -200,7 +209,7 @@ int cpuidle_enter(struct cpuidle_driver *drv, struct cpuidle_device *dev,
*/
void cpuidle_reflect(struct cpuidle_device *dev, int index)
{
- if (cpuidle_curr_governor->reflect && !unlikely(use_deepest_state))
+ if (cpuidle_curr_governor->reflect)
cpuidle_curr_governor->reflect(dev, index);
}
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 710a233b9b0d..386261afe2dd 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -178,7 +178,12 @@ static inline int performance_multiplier(unsigned long nr_iowaiters, unsigned lo
/* for higher loadavg, we are more reluctant */
- mult += 2 * get_loadavg(load);
+ /*
+ * this doesn't work as intended - it is almost always 0, but can
+ * sometimes, depending on workload, spike very high into the hundreds
+ * even when the average cpu load is under 10%.
+ */
+ /* mult += 2 * get_loadavg(); */
/* for IO wait tasks (per cpu!) we add 5x each */
mult += 10 * nr_iowaiters;
diff --git a/drivers/dma-buf/fence.c b/drivers/dma-buf/fence.c
index 7bb9d65d9a2c..ae50011fba3e 100644
--- a/drivers/dma-buf/fence.c
+++ b/drivers/dma-buf/fence.c
@@ -301,8 +301,12 @@ fence_remove_callback(struct fence *fence, struct fence_cb *cb)
spin_lock_irqsave(fence->lock, flags);
ret = !list_empty(&cb->node);
- if (ret)
+ if (ret) {
list_del_init(&cb->node);
+ if (list_empty(&fence->cb_list))
+ if (fence->ops->disable_signaling)
+ fence->ops->disable_signaling(fence);
+ }
spin_unlock_irqrestore(fence->lock, flags);
diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c
index 35286fe52823..29f93bf718e6 100644
--- a/drivers/firmware/dmi_scan.c
+++ b/drivers/firmware/dmi_scan.c
@@ -93,6 +93,12 @@ static void dmi_table(u8 *buf, int len, int num,
const struct dmi_header *dm = (const struct dmi_header *)data;
/*
+ * 7.45 End-of-Table (Type 127) [SMBIOS reference spec v3.0.0]
+ */
+ if (dm->type == DMI_ENTRY_END_OF_TABLE)
+ break;
+
+ /*
* We want to know the total length (formatted area and
* strings) before decoding to make sure we won't run off the
* table in dmi_decode or dmi_string
@@ -107,7 +113,7 @@ static void dmi_table(u8 *buf, int len, int num,
}
}
-static u32 dmi_base;
+static phys_addr_t dmi_base;
static u16 dmi_len;
static u16 dmi_num;
@@ -467,7 +473,7 @@ static int __init dmi_present(const u8 *buf)
if (memcmp(buf, "_SM_", 4) == 0 &&
buf[5] < 32 && dmi_checksum(buf, buf[5])) {
- smbios_ver = (buf[6] << 8) + buf[7];
+ smbios_ver = get_unaligned_be16(buf + 6);
/* Some BIOS report weird SMBIOS version, fix that up */
switch (smbios_ver) {
@@ -493,10 +499,9 @@ static int __init dmi_present(const u8 *buf)
dmi_ver = smbios_ver;
else
dmi_ver = (buf[14] & 0xF0) << 4 | (buf[14] & 0x0F);
- dmi_num = (buf[13] << 8) | buf[12];
- dmi_len = (buf[7] << 8) | buf[6];
- dmi_base = (buf[11] << 24) | (buf[10] << 16) |
- (buf[9] << 8) | buf[8];
+ dmi_num = get_unaligned_le16(buf + 12);
+ dmi_len = get_unaligned_le16(buf + 6);
+ dmi_base = get_unaligned_le32(buf + 8);
if (dmi_walk_early(dmi_decode) == 0) {
if (smbios_ver) {
@@ -515,12 +520,72 @@ static int __init dmi_present(const u8 *buf)
return 1;
}
+/*
+ * Check for the SMBIOS 3.0 64-bit entry point signature. Unlike the legacy
+ * 32-bit entry point, there is no embedded DMI header (_DMI_) in here.
+ */
+static int __init dmi_smbios3_present(const u8 *buf)
+{
+ if (memcmp(buf, "_SM3_", 5) == 0 &&
+ buf[6] < 32 && dmi_checksum(buf, buf[6])) {
+ dmi_ver = get_unaligned_be16(buf + 7);
+ dmi_len = get_unaligned_le32(buf + 12);
+ dmi_base = get_unaligned_le64(buf + 16);
+
+ /*
+ * The 64-bit SMBIOS 3.0 entry point no longer has a field
+ * containing the number of structures present in the table.
+ * Instead, it defines the table size as a maximum size, and
+ * relies on the end-of-table structure type (#127) to be used
+ * to signal the end of the table.
+ * So let's define dmi_num as an upper bound as well: each
+ * structure has a 4 byte header, so dmi_len / 4 is an upper
+ * bound for the number of structures in the table.
+ */
+ dmi_num = dmi_len / 4;
+
+ if (dmi_walk_early(dmi_decode) == 0) {
+ pr_info("SMBIOS %d.%d present.\n",
+ dmi_ver >> 8, dmi_ver & 0xFF);
+ dmi_format_ids(dmi_ids_string, sizeof(dmi_ids_string));
+ pr_debug("DMI: %s\n", dmi_ids_string);
+ return 0;
+ }
+ }
+ return 1;
+}
+
void __init dmi_scan_machine(void)
{
char __iomem *p, *q;
char buf[32];
if (efi_enabled(EFI_CONFIG_TABLES)) {
+ /*
+ * According to the DMTF SMBIOS reference spec v3.0.0, it is
+ * allowed to define both the 64-bit entry point (smbios3) and
+ * the 32-bit entry point (smbios), in which case they should
+ * either both point to the same SMBIOS structure table, or the
+ * table pointed to by the 64-bit entry point should contain a
+ * superset of the table contents pointed to by the 32-bit entry
+ * point (section 5.2)
+ * This implies that the 64-bit entry point should have
+ * precedence if it is defined and supported by the OS. If we
+ * have the 64-bit entry point, but fail to decode it, fall
+ * back to the legacy one (if available)
+ */
+ if (efi.smbios3 != EFI_INVALID_TABLE_ADDR) {
+ p = dmi_early_remap(efi.smbios3, 32);
+ if (p == NULL)
+ goto error;
+ memcpy_fromio(buf, p, 32);
+ dmi_early_unmap(p, 32);
+
+ if (!dmi_smbios3_present(buf)) {
+ dmi_available = 1;
+ goto out;
+ }
+ }
if (efi.smbios == EFI_INVALID_TABLE_ADDR)
goto error;
@@ -553,7 +618,7 @@ void __init dmi_scan_machine(void)
memset(buf, 0, 16);
for (q = p; q < p + 0x10000; q += 16) {
memcpy_fromio(buf + 16, q, 16);
- if (!dmi_present(buf)) {
+ if (!dmi_smbios3_present(buf) || !dmi_present(buf)) {
dmi_available = 1;
dmi_early_unmap(p, 0x10000);
goto out;
diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index 71e090c8c85e..49eb145043ff 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -30,6 +30,7 @@ struct efi __read_mostly efi = {
.acpi = EFI_INVALID_TABLE_ADDR,
.acpi20 = EFI_INVALID_TABLE_ADDR,
.smbios = EFI_INVALID_TABLE_ADDR,
+ .smbios3 = EFI_INVALID_TABLE_ADDR,
.sal_systab = EFI_INVALID_TABLE_ADDR,
.boot_info = EFI_INVALID_TABLE_ADDR,
.hcdp = EFI_INVALID_TABLE_ADDR,
@@ -86,6 +87,8 @@ static ssize_t systab_show(struct kobject *kobj,
str += sprintf(str, "ACPI=0x%lx\n", efi.acpi);
if (efi.smbios != EFI_INVALID_TABLE_ADDR)
str += sprintf(str, "SMBIOS=0x%lx\n", efi.smbios);
+ if (efi.smbios3 != EFI_INVALID_TABLE_ADDR)
+ str += sprintf(str, "SMBIOS3=0x%lx\n", efi.smbios3);
if (efi.hcdp != EFI_INVALID_TABLE_ADDR)
str += sprintf(str, "HCDP=0x%lx\n", efi.hcdp);
if (efi.boot_info != EFI_INVALID_TABLE_ADDR)
@@ -261,6 +264,7 @@ static __initdata efi_config_table_type_t common_tables[] = {
{MPS_TABLE_GUID, "MPS", &efi.mps},
{SAL_SYSTEM_TABLE_GUID, "SALsystab", &efi.sal_systab},
{SMBIOS_TABLE_GUID, "SMBIOS", &efi.smbios},
+ {SMBIOS3_TABLE_GUID, "SMBIOS 3.0", &efi.smbios3},
{UGA_IO_PROTOCOL_GUID, "UGA", &efi.uga},
{NULL_GUID, NULL, NULL},
};
@@ -290,29 +294,15 @@ static __init int match_config_table(efi_guid_t *guid,
return 0;
}
-int __init efi_config_init(efi_config_table_type_t *arch_tables)
+int __init efi_config_parse_tables(void *config_tables, int count, int sz,
+ efi_config_table_type_t *arch_tables)
{
- void *config_tables, *tablep;
- int i, sz;
-
- if (efi_enabled(EFI_64BIT))
- sz = sizeof(efi_config_table_64_t);
- else
- sz = sizeof(efi_config_table_32_t);
-
- /*
- * Let's see what config tables the firmware passed to us.
- */
- config_tables = early_memremap(efi.systab->tables,
- efi.systab->nr_tables * sz);
- if (config_tables == NULL) {
- pr_err("Could not map Configuration table!\n");
- return -ENOMEM;
- }
+ void *tablep;
+ int i;
tablep = config_tables;
pr_info("");
- for (i = 0; i < efi.systab->nr_tables; i++) {
+ for (i = 0; i < count; i++) {
efi_guid_t guid;
unsigned long table;
@@ -325,8 +315,6 @@ int __init efi_config_init(efi_config_table_type_t *arch_tables)
if (table64 >> 32) {
pr_cont("\n");
pr_err("Table located above 4GB, disabling EFI.\n");
- early_memunmap(config_tables,
- efi.systab->nr_tables * sz);
return -EINVAL;
}
#endif
@@ -341,13 +329,37 @@ int __init efi_config_init(efi_config_table_type_t *arch_tables)
tablep += sz;
}
pr_cont("\n");
- early_memunmap(config_tables, efi.systab->nr_tables * sz);
-
set_bit(EFI_CONFIG_TABLES, &efi.flags);
-
return 0;
}
+int __init efi_config_init(efi_config_table_type_t *arch_tables)
+{
+ void *config_tables;
+ int sz, ret;
+
+ if (efi_enabled(EFI_64BIT))
+ sz = sizeof(efi_config_table_64_t);
+ else
+ sz = sizeof(efi_config_table_32_t);
+
+ /*
+ * Let's see what config tables the firmware passed to us.
+ */
+ config_tables = early_memremap(efi.systab->tables,
+ efi.systab->nr_tables * sz);
+ if (config_tables == NULL) {
+ pr_err("Could not map Configuration table!\n");
+ return -ENOMEM;
+ }
+
+ ret = efi_config_parse_tables(config_tables, efi.systab->nr_tables, sz,
+ arch_tables);
+
+ early_memunmap(config_tables, efi.systab->nr_tables * sz);
+ return ret;
+}
+
#ifdef CONFIG_EFI_VARS_MODULE
static int __init efi_load_efivars(void)
{
diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
index c5533c76c202..d77be6251faf 100644
--- a/drivers/firmware/efi/libstub/Makefile
+++ b/drivers/firmware/efi/libstub/Makefile
@@ -21,6 +21,9 @@ KBUILD_CFLAGS := $(cflags-y) \
GCOV_PROFILE := n
KASAN_SANITIZE := n
+# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
+KCOV_INSTRUMENT := n
+
lib-y := efi-stub-helper.o
lib-$(CONFIG_EFI_ARMSTUB) += arm-stub.o fdt.o
diff --git a/drivers/firmware/efi/libstub/arm-stub.c b/drivers/firmware/efi/libstub/arm-stub.c
index 75ee05964cbc..e2432b39b6df 100644
--- a/drivers/firmware/efi/libstub/arm-stub.c
+++ b/drivers/firmware/efi/libstub/arm-stub.c
@@ -247,9 +247,18 @@ unsigned long __init efi_entry(void *handle, efi_system_table_t *sys_table,
goto fail_free_cmdline;
}
}
- if (!fdt_addr)
+
+ if (fdt_addr) {
+ pr_efi(sys_table, "Using DTB from command line\n");
+ } else {
/* Look for a device tree configuration table entry. */
fdt_addr = (uintptr_t)get_fdt(sys_table);
+ if (fdt_addr)
+ pr_efi(sys_table, "Using DTB from configuration table\n");
+ }
+
+ if (!fdt_addr)
+ pr_efi(sys_table, "Generating empty DTB\n");
status = handle_cmdline_files(sys_table, image, cmdline_ptr,
"initrd=", dram_base + SZ_512M,
@@ -286,3 +295,62 @@ fail_free_image:
fail:
return EFI_ERROR;
}
+
+/*
+ * This is the base address at which to start allocating virtual memory ranges
+ * for UEFI Runtime Services. This is in the low TTBR0 range so that we can use
+ * any allocation we choose, and eliminate the risk of a conflict after kexec.
+ * The value chosen is the largest non-zero power of 2 suitable for this purpose
+ * both on 32-bit and 64-bit ARM CPUs, to maximize the likelihood that it can
+ * be mapped efficiently.
+ */
+#define EFI_RT_VIRTUAL_BASE 0x40000000
+
+/*
+ * efi_get_virtmap() - create a virtual mapping for the EFI memory map
+ *
+ * This function populates the virt_addr fields of all memory region descriptors
+ * in @memory_map whose EFI_MEMORY_RUNTIME attribute is set. Those descriptors
+ * are also copied to @runtime_map, and their total count is returned in @count.
+ */
+void efi_get_virtmap(efi_memory_desc_t *memory_map, unsigned long map_size,
+ unsigned long desc_size, efi_memory_desc_t *runtime_map,
+ int *count)
+{
+ u64 efi_virt_base = EFI_RT_VIRTUAL_BASE;
+ efi_memory_desc_t *out = runtime_map;
+ int l;
+
+ for (l = 0; l < map_size; l += desc_size) {
+ efi_memory_desc_t *in = (void *)memory_map + l;
+ u64 paddr, size;
+
+ if (!(in->attribute & EFI_MEMORY_RUNTIME))
+ continue;
+
+ /*
+ * Make the mapping compatible with 64k pages: this allows
+ * a 4k page size kernel to kexec a 64k page size kernel and
+ * vice versa.
+ */
+ paddr = round_down(in->phys_addr, SZ_64K);
+ size = round_up(in->num_pages * EFI_PAGE_SIZE +
+ in->phys_addr - paddr, SZ_64K);
+
+ /*
+ * Avoid wasting memory on PTEs by choosing a virtual base that
+ * is compatible with section mappings if this region has the
+ * appropriate size and physical alignment. (Sections are 2 MB
+ * on 4k granule kernels)
+ */
+ if (IS_ALIGNED(in->phys_addr, SZ_2M) && size >= SZ_2M)
+ efi_virt_base = round_up(efi_virt_base, SZ_2M);
+
+ in->virt_addr = efi_virt_base + in->phys_addr - paddr;
+ efi_virt_base += size;
+
+ memcpy(out, in, desc_size);
+ out = (void *)out + desc_size;
+ ++*count;
+ }
+}
diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c
index 5186eb01945a..77ef72775cbf 100644
--- a/drivers/firmware/efi/libstub/efi-stub-helper.c
+++ b/drivers/firmware/efi/libstub/efi-stub-helper.c
@@ -32,6 +32,15 @@
static unsigned long __chunk_size = EFI_READ_CHUNK_SIZE;
+/*
+ * Allow the platform to override the allocation granularity: this allows
+ * systems that have the capability to run with a larger page size to deal
+ * with the allocations for initrd and fdt more efficiently.
+ */
+#ifndef EFI_ALLOC_ALIGN
+#define EFI_ALLOC_ALIGN EFI_PAGE_SIZE
+#endif
+
struct file_info {
efi_file_handle_t *handle;
u64 size;
@@ -150,10 +159,10 @@ efi_status_t efi_high_alloc(efi_system_table_t *sys_table_arg,
* a specific address. We are doing page-based allocations,
* so we must be aligned to a page.
*/
- if (align < EFI_PAGE_SIZE)
- align = EFI_PAGE_SIZE;
+ if (align < EFI_ALLOC_ALIGN)
+ align = EFI_ALLOC_ALIGN;
- nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
+ nr_pages = round_up(size, EFI_ALLOC_ALIGN) / EFI_PAGE_SIZE;
again:
for (i = 0; i < map_size / desc_size; i++) {
efi_memory_desc_t *desc;
@@ -235,10 +244,10 @@ efi_status_t efi_low_alloc(efi_system_table_t *sys_table_arg,
* a specific address. We are doing page-based allocations,
* so we must be aligned to a page.
*/
- if (align < EFI_PAGE_SIZE)
- align = EFI_PAGE_SIZE;
+ if (align < EFI_ALLOC_ALIGN)
+ align = EFI_ALLOC_ALIGN;
- nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
+ nr_pages = round_up(size, EFI_ALLOC_ALIGN) / EFI_PAGE_SIZE;
for (i = 0; i < map_size / desc_size; i++) {
efi_memory_desc_t *desc;
unsigned long m = (unsigned long)map;
@@ -292,7 +301,7 @@ void efi_free(efi_system_table_t *sys_table_arg, unsigned long size,
if (!size)
return;
- nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
+ nr_pages = round_up(size, EFI_ALLOC_ALIGN) / EFI_PAGE_SIZE;
efi_call_early(free_pages, addr, nr_pages);
}
@@ -561,7 +570,7 @@ efi_status_t efi_relocate_kernel(efi_system_table_t *sys_table_arg,
* to the preferred address. If that fails, allocate as low
* as possible while respecting the required alignment.
*/
- nr_pages = round_up(alloc_size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
+ nr_pages = round_up(alloc_size, EFI_ALLOC_ALIGN) / EFI_PAGE_SIZE;
status = efi_call_early(allocate_pages,
EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA,
nr_pages, &efi_addr);
diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h
index d1ba39c3ee4e..47437b16b186 100644
--- a/drivers/firmware/efi/libstub/efistub.h
+++ b/drivers/firmware/efi/libstub/efistub.h
@@ -43,4 +43,8 @@ efi_status_t allocate_new_fdt_and_exit_boot(efi_system_table_t *sys_table,
void *get_fdt(efi_system_table_t *sys_table);
+void efi_get_virtmap(efi_memory_desc_t *memory_map, unsigned long map_size,
+ unsigned long desc_size, efi_memory_desc_t *runtime_map,
+ int *count);
+
#endif
diff --git a/drivers/firmware/efi/libstub/fdt.c b/drivers/firmware/efi/libstub/fdt.c
index c846a9608cbd..91da56c4fd54 100644
--- a/drivers/firmware/efi/libstub/fdt.c
+++ b/drivers/firmware/efi/libstub/fdt.c
@@ -14,6 +14,8 @@
#include <linux/libfdt.h>
#include <asm/efi.h>
+#include "efistub.h"
+
efi_status_t update_fdt(efi_system_table_t *sys_table, void *orig_fdt,
unsigned long orig_fdt_size,
void *fdt, int new_fdt_size, char *cmdline_ptr,
@@ -193,9 +195,26 @@ efi_status_t allocate_new_fdt_and_exit_boot(efi_system_table_t *sys_table,
unsigned long map_size, desc_size;
u32 desc_ver;
unsigned long mmap_key;
- efi_memory_desc_t *memory_map;
+ efi_memory_desc_t *memory_map, *runtime_map;
unsigned long new_fdt_size;
efi_status_t status;
+ int runtime_entry_count = 0;
+
+ /*
+ * Get a copy of the current memory map that we will use to prepare
+ * the input for SetVirtualAddressMap(). We don't have to worry about
+ * subsequent allocations adding entries, since they could not affect
+ * the number of EFI_MEMORY_RUNTIME regions.
+ */
+ status = efi_get_memory_map(sys_table, &runtime_map, &map_size,
+ &desc_size, &desc_ver, &mmap_key);
+ if (status != EFI_SUCCESS) {
+ pr_efi_err(sys_table, "Unable to retrieve UEFI memory map.\n");
+ return status;
+ }
+
+ pr_efi(sys_table,
+ "Exiting boot services and installing virtual address map...\n");
/*
* Estimate size of new FDT, and allocate memory for it. We
@@ -248,12 +267,48 @@ efi_status_t allocate_new_fdt_and_exit_boot(efi_system_table_t *sys_table,
}
}
+ /*
+ * Update the memory map with virtual addresses. The function will also
+ * populate @runtime_map with copies of just the EFI_MEMORY_RUNTIME
+ * entries so that we can pass it straight into SetVirtualAddressMap()
+ */
+ efi_get_virtmap(memory_map, map_size, desc_size, runtime_map,
+ &runtime_entry_count);
+
/* Now we are ready to exit_boot_services.*/
status = sys_table->boottime->exit_boot_services(handle, mmap_key);
+ if (status == EFI_SUCCESS) {
+ efi_set_virtual_address_map_t *svam;
- if (status == EFI_SUCCESS)
- return status;
+ /* Install the new virtual address map */
+ svam = sys_table->runtime->set_virtual_address_map;
+ status = svam(runtime_entry_count * desc_size, desc_size,
+ desc_ver, runtime_map);
+
+ /*
+ * We are beyond the point of no return here, so if the call to
+ * SetVirtualAddressMap() failed, we need to signal that to the
+ * incoming kernel but proceed normally otherwise.
+ */
+ if (status != EFI_SUCCESS) {
+ int l;
+
+ /*
+ * Set the virtual address field of all
+ * EFI_MEMORY_RUNTIME entries to 0. This will signal
+ * the incoming kernel that no virtual translation has
+ * been installed.
+ */
+ for (l = 0; l < map_size; l += desc_size) {
+ efi_memory_desc_t *p = (void *)memory_map + l;
+
+ if (p->attribute & EFI_MEMORY_RUNTIME)
+ p->virt_addr = 0;
+ }
+ }
+ return EFI_SUCCESS;
+ }
pr_efi_err(sys_table, "Exit boot services failed.\n");
@@ -264,6 +319,7 @@ fail_free_new_fdt:
efi_free(sys_table, new_fdt_size, *new_fdt_addr);
fail:
+ sys_table->boottime->free_pool(runtime_map);
return EFI_LOAD_ERROR;
}
diff --git a/drivers/hid/hid-appleir.c b/drivers/hid/hid-appleir.c
index 0e6a42d37eb6..07cbc70f00e7 100644
--- a/drivers/hid/hid-appleir.c
+++ b/drivers/hid/hid-appleir.c
@@ -256,7 +256,7 @@ out:
return 0;
}
-static void appleir_input_configured(struct hid_device *hid,
+static int appleir_input_configured(struct hid_device *hid,
struct hid_input *hidinput)
{
struct input_dev *input_dev = hidinput->input;
@@ -275,6 +275,8 @@ static void appleir_input_configured(struct hid_device *hid,
for (i = 0; i < ARRAY_SIZE(appleir_key_table); i++)
set_bit(appleir->keymap[i], input_dev->keybit);
clear_bit(KEY_RESERVED, input_dev->keybit);
+
+ return 0;
}
static int appleir_input_mapping(struct hid_device *hid,
diff --git a/drivers/hid/hid-elo.c b/drivers/hid/hid-elo.c
index d0c8a1c1e1fe..0cd4f7216239 100644
--- a/drivers/hid/hid-elo.c
+++ b/drivers/hid/hid-elo.c
@@ -37,7 +37,7 @@ static bool use_fw_quirk = true;
module_param(use_fw_quirk, bool, S_IRUGO);
MODULE_PARM_DESC(use_fw_quirk, "Do periodic pokes for broken M firmwares (default = true)");
-static void elo_input_configured(struct hid_device *hdev,
+static int elo_input_configured(struct hid_device *hdev,
struct hid_input *hidinput)
{
struct input_dev *input = hidinput->input;
@@ -45,6 +45,8 @@ static void elo_input_configured(struct hid_device *hdev,
set_bit(BTN_TOUCH, input->keybit);
set_bit(ABS_PRESSURE, input->absbit);
input_set_abs_params(input, ABS_PRESSURE, 0, 256, 0, 0);
+
+ return 0;
}
static void elo_process_data(struct input_dev *input, const u8 *data, int size)
diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c
index 53d325968e42..8319b9a417f0 100644
--- a/drivers/hid/hid-input.c
+++ b/drivers/hid/hid-input.c
@@ -1468,8 +1468,9 @@ int hidinput_connect(struct hid_device *hid, unsigned int force)
* UGCI) cram a lot of unrelated inputs into the
* same interface. */
hidinput->report = report;
- if (drv->input_configured)
- drv->input_configured(hid, hidinput);
+ if (drv->input_configured &&
+ drv->input_configured(hid, hidinput))
+ goto out_cleanup;
if (input_register_device(hidinput->input))
goto out_cleanup;
hidinput = NULL;
@@ -1490,8 +1491,9 @@ int hidinput_connect(struct hid_device *hid, unsigned int force)
}
if (hidinput) {
- if (drv->input_configured)
- drv->input_configured(hid, hidinput);
+ if (drv->input_configured &&
+ drv->input_configured(hid, hidinput))
+ goto out_cleanup;
if (input_register_device(hidinput->input))
goto out_cleanup;
}
diff --git a/drivers/hid/hid-logitech-hidpp.c b/drivers/hid/hid-logitech-hidpp.c
index 7dd9163f7e03..07bc1e66fb6d 100644
--- a/drivers/hid/hid-logitech-hidpp.c
+++ b/drivers/hid/hid-logitech-hidpp.c
@@ -652,13 +652,15 @@ static int hidpp_input_mapping(struct hid_device *hdev, struct hid_input *hi,
return 0;
}
-static void hidpp_input_configured(struct hid_device *hdev,
+static int hidpp_input_configured(struct hid_device *hdev,
struct hid_input *hidinput)
{
struct hidpp_device *hidpp = hid_get_drvdata(hdev);
if (hidpp->quirks & HIDPP_QUIRK_CLASS_WTP)
wtp_input_configured(hdev, hidinput);
+
+ return 0;
}
static int hidpp_raw_hidpp_event(struct hidpp_device *hidpp, u8 *data,
diff --git a/drivers/hid/hid-magicmouse.c b/drivers/hid/hid-magicmouse.c
index 29a74c1efcb8..d6fa496d0ca2 100644
--- a/drivers/hid/hid-magicmouse.c
+++ b/drivers/hid/hid-magicmouse.c
@@ -471,18 +471,22 @@ static int magicmouse_input_mapping(struct hid_device *hdev,
return 0;
}
-static void magicmouse_input_configured(struct hid_device *hdev,
+static int magicmouse_input_configured(struct hid_device *hdev,
struct hid_input *hi)
{
struct magicmouse_sc *msc = hid_get_drvdata(hdev);
+ int ret;
- int ret = magicmouse_setup_input(msc->input, hdev);
+ ret = magicmouse_setup_input(msc->input, hdev);
if (ret) {
hid_err(hdev, "magicmouse setup input failed (%d)\n", ret);
/* clean msc->input to notify probe() of the failure */
msc->input = NULL;
+ return ret;
}
+
+ return 0;
}
diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c
index e24e22687f33..15a777746c46 100644
--- a/drivers/hid/hid-multitouch.c
+++ b/drivers/hid/hid-multitouch.c
@@ -383,6 +383,16 @@ static int mt_touch_input_mapping(struct hid_device *hdev, struct hid_input *hi,
td->inputmode_value = MT_INPUTMODE_TOUCHPAD;
}
+ /* Only map fields from TouchScreen or TouchPad collections.
+ * We need to ignore fields that belong to other collections
+ * such as Mouse that might have the same GenericDesktop usages. */
+ if (field->application == HID_DG_TOUCHSCREEN)
+ set_bit(INPUT_PROP_DIRECT, hi->input->propbit);
+ else if (field->application == HID_DG_TOUCHPAD)
+ set_bit(INPUT_PROP_POINTER, hi->input->propbit);
+ else
+ return 0;
+
if (usage->usage_index)
prev_usage = &field->usage[usage->usage_index - 1];
@@ -712,12 +722,13 @@ static void mt_touch_report(struct hid_device *hid, struct hid_report *report)
mt_sync_frame(td, report->field[0]->hidinput->input);
}
-static void mt_touch_input_configured(struct hid_device *hdev,
+static int mt_touch_input_configured(struct hid_device *hdev,
struct hid_input *hi)
{
struct mt_device *td = hid_get_drvdata(hdev);
struct mt_class *cls = &td->mtclass;
struct input_dev *input = hi->input;
+ int ret;
if (!td->maxcontacts)
td->maxcontacts = MT_DEFAULT_MAXCONTACT;
@@ -732,9 +743,12 @@ static void mt_touch_input_configured(struct hid_device *hdev,
if (cls->quirks & MT_QUIRK_NOT_SEEN_MEANS_UP)
td->mt_flags |= INPUT_MT_DROP_UNUSED;
- input_mt_init_slots(input, td->maxcontacts, td->mt_flags);
+ ret = input_mt_init_slots(input, td->maxcontacts, td->mt_flags);
+ if (ret)
+ return ret;
td->mt_flags = 0;
+ return 0;
}
static int mt_input_mapping(struct hid_device *hdev, struct hid_input *hi,
@@ -888,15 +902,19 @@ static void mt_post_parse(struct mt_device *td)
cls->quirks &= ~MT_QUIRK_CONTACT_CNT_ACCURATE;
}
-static void mt_input_configured(struct hid_device *hdev, struct hid_input *hi)
+static int mt_input_configured(struct hid_device *hdev, struct hid_input *hi)
{
struct mt_device *td = hid_get_drvdata(hdev);
char *name;
const char *suffix = NULL;
struct hid_field *field = hi->report->field[0];
+ int ret;
- if (hi->report->id == td->mt_report_id)
- mt_touch_input_configured(hdev, hi);
+ if (hi->report->id == td->mt_report_id) {
+ ret = mt_touch_input_configured(hdev, hi);
+ if (ret)
+ return ret;
+ }
/*
* some egalax touchscreens have "application == HID_DG_TOUCHSCREEN"
@@ -947,6 +965,8 @@ static void mt_input_configured(struct hid_device *hdev, struct hid_input *hi)
hi->input->name = name;
}
}
+
+ return 0;
}
static int mt_probe(struct hid_device *hdev, const struct hid_device_id *id)
diff --git a/drivers/hid/hid-ntrig.c b/drivers/hid/hid-ntrig.c
index 600f2075512f..756d1ef9bd99 100644
--- a/drivers/hid/hid-ntrig.c
+++ b/drivers/hid/hid-ntrig.c
@@ -859,14 +859,14 @@ not_claimed_input:
return 1;
}
-static void ntrig_input_configured(struct hid_device *hid,
+static int ntrig_input_configured(struct hid_device *hid,
struct hid_input *hidinput)
{
struct input_dev *input = hidinput->input;
if (hidinput->report->maxfield < 1)
- return;
+ return 0;
switch (hidinput->report->field[0]->application) {
case HID_DG_PEN:
@@ -890,6 +890,8 @@ static void ntrig_input_configured(struct hid_device *hid,
"N-Trig MultiTouch";
break;
}
+
+ return 0;
}
static int ntrig_probe(struct hid_device *hdev, const struct hid_device_id *id)
diff --git a/drivers/hid/hid-rmi.c b/drivers/hid/hid-rmi.c
index a994477bd25a..fb32e88b808d 100644
--- a/drivers/hid/hid-rmi.c
+++ b/drivers/hid/hid-rmi.c
@@ -842,7 +842,7 @@ static int rmi_populate(struct hid_device *hdev)
return 0;
}
-static void rmi_input_configured(struct hid_device *hdev, struct hid_input *hi)
+static int rmi_input_configured(struct hid_device *hdev, struct hid_input *hi)
{
struct rmi_data *data = hid_get_drvdata(hdev);
struct input_dev *input = hi->input;
@@ -854,7 +854,7 @@ static void rmi_input_configured(struct hid_device *hdev, struct hid_input *hi)
hid_dbg(hdev, "Opening low level driver\n");
ret = hid_hw_open(hdev);
if (ret)
- return;
+ return ret;
/* Allow incoming hid reports */
hid_device_io_start(hdev);
@@ -892,7 +892,9 @@ static void rmi_input_configured(struct hid_device *hdev, struct hid_input *hi)
input_set_abs_params(input, ABS_MT_TOUCH_MAJOR, 0, 0x0f, 0, 0);
input_set_abs_params(input, ABS_MT_TOUCH_MINOR, 0, 0x0f, 0, 0);
- input_mt_init_slots(input, data->max_fingers, INPUT_MT_POINTER);
+ ret = input_mt_init_slots(input, data->max_fingers, INPUT_MT_POINTER);
+ if (ret < 0)
+ goto exit;
if (data->button_count) {
__set_bit(EV_KEY, input->evbit);
@@ -908,6 +910,7 @@ static void rmi_input_configured(struct hid_device *hdev, struct hid_input *hi)
exit:
hid_device_io_stop(hdev);
hid_hw_close(hdev);
+ return ret;
}
static int rmi_input_mapping(struct hid_device *hdev,
diff --git a/drivers/hid/hid-sony.c b/drivers/hid/hid-sony.c
index 15b3475e641d..8a4c1a10fa62 100644
--- a/drivers/hid/hid-sony.c
+++ b/drivers/hid/hid-sony.c
@@ -1100,20 +1100,27 @@ static int sony_register_touchpad(struct hid_input *hi, int touch_count,
return 0;
}
-static void sony_input_configured(struct hid_device *hdev,
+static int sony_input_configured(struct hid_device *hdev,
struct hid_input *hidinput)
{
struct sony_sc *sc = hid_get_drvdata(hdev);
+ int ret;
/*
* The Dualshock 4 touchpad supports 2 touches and has a
* resolution of 1920x942 (44.86 dots/mm).
*/
if (sc->quirks & DUALSHOCK4_CONTROLLER) {
- if (sony_register_touchpad(hidinput, 2, 1920, 942) != 0)
+ ret = sony_register_touchpad(hidinput, 2, 1920, 942);
+ if (ret) {
hid_err(sc->hdev,
- "Unable to initialize multi-touch slots\n");
+ "Unable to initialize multi-touch slots: %d\n",
+ ret);
+ return ret;
+ }
}
+
+ return 0;
}
/*
diff --git a/drivers/hid/hid-steelseries.c b/drivers/hid/hid-steelseries.c
index 29f328f411fb..8060650dcfe5 100644
--- a/drivers/hid/hid-steelseries.c
+++ b/drivers/hid/hid-steelseries.c
@@ -254,6 +254,11 @@ static int steelseries_srws1_probe(struct hid_device *hdev,
goto err_free;
}
+ if (!hid_validate_values(hdev, HID_OUTPUT_REPORT, 0, 0, 16)) {
+ ret = -ENODEV;
+ goto err_free;
+ }
+
ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT);
if (ret) {
hid_err(hdev, "hw start failed\n");
diff --git a/drivers/hid/uhid.c b/drivers/hid/uhid.c
index 1a2032c2c1fb..690a9f0fa042 100644
--- a/drivers/hid/uhid.c
+++ b/drivers/hid/uhid.c
@@ -28,6 +28,8 @@
#define UHID_NAME "uhid"
#define UHID_BUFSIZE 32
+static DEFINE_MUTEX(uhid_open_mutex);
+
struct uhid_device {
struct mutex devlock;
bool running;
@@ -142,15 +144,26 @@ static void uhid_hid_stop(struct hid_device *hid)
static int uhid_hid_open(struct hid_device *hid)
{
struct uhid_device *uhid = hid->driver_data;
+ int retval = 0;
- return uhid_queue_event(uhid, UHID_OPEN);
+ mutex_lock(&uhid_open_mutex);
+ if (!hid->open++) {
+ retval = uhid_queue_event(uhid, UHID_OPEN);
+ if (retval)
+ hid->open--;
+ }
+ mutex_unlock(&uhid_open_mutex);
+ return retval;
}
static void uhid_hid_close(struct hid_device *hid)
{
struct uhid_device *uhid = hid->driver_data;
- uhid_queue_event(uhid, UHID_CLOSE);
+ mutex_lock(&uhid_open_mutex);
+ if (!--hid->open)
+ uhid_queue_event(uhid, UHID_CLOSE);
+ mutex_unlock(&uhid_open_mutex);
}
static int uhid_hid_parse(struct hid_device *hid)
diff --git a/drivers/iio/industrialio-event.c b/drivers/iio/industrialio-event.c
index 158a760ada12..785257989b2f 100644
--- a/drivers/iio/industrialio-event.c
+++ b/drivers/iio/industrialio-event.c
@@ -492,6 +492,7 @@ int iio_device_register_eventset(struct iio_dev *indio_dev)
error_free_setup_event_lines:
iio_free_chan_devattr_list(&indio_dev->event_interface->dev_attr_list);
+ mutex_destroy(&indio_dev->event_interface->read_lock);
kfree(indio_dev->event_interface);
indio_dev->event_interface = NULL;
return ret;
@@ -517,5 +518,6 @@ void iio_device_unregister_eventset(struct iio_dev *indio_dev)
return;
iio_free_chan_devattr_list(&indio_dev->event_interface->dev_attr_list);
kfree(indio_dev->event_interface->group.attrs);
+ mutex_destroy(&indio_dev->event_interface->read_lock);
kfree(indio_dev->event_interface);
}
diff --git a/drivers/input/Kconfig b/drivers/input/Kconfig
index a11ff74a5127..2a4c51377f86 100644
--- a/drivers/input/Kconfig
+++ b/drivers/input/Kconfig
@@ -174,6 +174,19 @@ config INPUT_APMPOWER
To compile this driver as a module, choose M here: the
module will be called apm-power.
+config INPUT_KEYRESET
+ bool "Reset key"
+ depends on INPUT
+ select INPUT_KEYCOMBO
+ ---help---
+ Say Y here if you want to reboot when some keys are pressed;
+
+config INPUT_KEYCOMBO
+ bool "Key combo"
+ depends on INPUT
+ ---help---
+ Say Y here if you want to take action when some keys are pressed;
+
comment "Input Device Drivers"
source "drivers/input/keyboard/Kconfig"
diff --git a/drivers/input/Makefile b/drivers/input/Makefile
index 5ca3f631497f..ee4c06520bb4 100644
--- a/drivers/input/Makefile
+++ b/drivers/input/Makefile
@@ -25,3 +25,6 @@ obj-$(CONFIG_INPUT_TOUCHSCREEN) += touchscreen/
obj-$(CONFIG_INPUT_MISC) += misc/
obj-$(CONFIG_INPUT_APMPOWER) += apm-power.o
+obj-$(CONFIG_INPUT_KEYRESET) += keyreset.o
+obj-$(CONFIG_INPUT_KEYCOMBO) += keycombo.o
+
diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c
index 928dec311c2b..ae962b3874d6 100644
--- a/drivers/input/evdev.c
+++ b/drivers/input/evdev.c
@@ -26,6 +26,7 @@
#include <linux/major.h>
#include <linux/device.h>
#include <linux/cdev.h>
+#include <linux/wakelock.h>
#include "input-compat.h"
struct evdev {
@@ -46,6 +47,9 @@ struct evdev_client {
unsigned int tail;
unsigned int packet_head; /* [future] position of the first element of next packet */
spinlock_t buffer_lock; /* protects access to buffer, head and tail */
+ struct wake_lock wake_lock;
+ bool use_wake_lock;
+ char name[28];
struct fasync_struct *fasync;
struct evdev *evdev;
struct list_head node;
@@ -149,10 +153,14 @@ static void __pass_event(struct evdev_client *client,
client->buffer[client->tail].value = 0;
client->packet_head = client->tail;
+ if (client->use_wake_lock)
+ wake_unlock(&client->wake_lock);
}
if (event->type == EV_SYN && event->code == SYN_REPORT) {
client->packet_head = client->head;
+ if (client->use_wake_lock)
+ wake_lock(&client->wake_lock);
kill_fasync(&client->fasync, SIGIO, POLL_IN);
}
}
@@ -366,6 +374,9 @@ static int evdev_release(struct inode *inode, struct file *file)
evdev_detach_client(evdev, client);
+ if (client->use_wake_lock)
+ wake_lock_destroy(&client->wake_lock);
+
if (is_vmalloc_addr(client))
vfree(client);
else
@@ -402,6 +413,8 @@ static int evdev_open(struct inode *inode, struct file *file)
client->bufsize = bufsize;
spin_lock_init(&client->buffer_lock);
+ snprintf(client->name, sizeof(client->name), "%s-%d",
+ dev_name(&evdev->dev), task_tgid_vnr(current));
client->evdev = evdev;
evdev_attach_client(evdev, client);
@@ -468,6 +481,9 @@ static int evdev_fetch_next_event(struct evdev_client *client,
if (have_event) {
*event = client->buffer[client->tail++];
client->tail &= client->bufsize - 1;
+ if (client->use_wake_lock &&
+ client->packet_head == client->tail)
+ wake_unlock(&client->wake_lock);
}
spin_unlock_irq(&client->buffer_lock);
@@ -790,6 +806,11 @@ static int evdev_handle_mt_request(struct input_dev *dev,
return 0;
}
+/*
+ * HACK: disable conflicting EVIOCREVOKE until Android userspace stops using
+ * EVIOCSSUSPENDBLOCK
+ */
+/*
static int evdev_revoke(struct evdev *evdev, struct evdev_client *client,
struct file *file)
{
@@ -800,6 +821,36 @@ static int evdev_revoke(struct evdev *evdev, struct evdev_client *client,
return 0;
}
+*/
+
+static int evdev_enable_suspend_block(struct evdev *evdev,
+ struct evdev_client *client)
+{
+ if (client->use_wake_lock)
+ return 0;
+
+ spin_lock_irq(&client->buffer_lock);
+ wake_lock_init(&client->wake_lock, WAKE_LOCK_SUSPEND, client->name);
+ client->use_wake_lock = true;
+ if (client->packet_head != client->tail)
+ wake_lock(&client->wake_lock);
+ spin_unlock_irq(&client->buffer_lock);
+ return 0;
+}
+
+static int evdev_disable_suspend_block(struct evdev *evdev,
+ struct evdev_client *client)
+{
+ if (!client->use_wake_lock)
+ return 0;
+
+ spin_lock_irq(&client->buffer_lock);
+ client->use_wake_lock = false;
+ spin_unlock_irq(&client->buffer_lock);
+ wake_lock_destroy(&client->wake_lock);
+
+ return 0;
+}
static long evdev_do_ioctl(struct file *file, unsigned int cmd,
void __user *p, int compat_mode)
@@ -863,12 +914,17 @@ static long evdev_do_ioctl(struct file *file, unsigned int cmd,
else
return evdev_ungrab(evdev, client);
+ /*
+ * HACK: disable conflicting EVIOCREVOKE until Android userspace stops
+ * using EVIOCSSUSPENDBLOCK
+ */
+ /*
case EVIOCREVOKE:
if (p)
return -EINVAL;
else
return evdev_revoke(evdev, client, file);
-
+ */
case EVIOCSCLOCKID:
if (copy_from_user(&i, p, sizeof(unsigned int)))
return -EFAULT;
@@ -888,6 +944,15 @@ static long evdev_do_ioctl(struct file *file, unsigned int cmd,
case EVIOCSKEYCODE_V2:
return evdev_handle_set_keycode_v2(dev, p);
+
+ case EVIOCGSUSPENDBLOCK:
+ return put_user(client->use_wake_lock, ip);
+
+ case EVIOCSSUSPENDBLOCK:
+ if (p)
+ return evdev_enable_suspend_block(evdev, client);
+ else
+ return evdev_disable_suspend_block(evdev, client);
}
size = _IOC_SIZE(cmd);
diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c
index e29c04e2aff4..e853a2134680 100644
--- a/drivers/input/gameport/gameport.c
+++ b/drivers/input/gameport/gameport.c
@@ -527,14 +527,14 @@ EXPORT_SYMBOL(gameport_set_phys);
*/
static void gameport_init_port(struct gameport *gameport)
{
- static atomic_t gameport_no = ATOMIC_INIT(0);
+ static atomic_t gameport_no = ATOMIC_INIT(-1);
__module_get(THIS_MODULE);
mutex_init(&gameport->drv_mutex);
device_initialize(&gameport->dev);
dev_set_name(&gameport->dev, "gameport%lu",
- (unsigned long)atomic_inc_return(&gameport_no) - 1);
+ (unsigned long)atomic_inc_return(&gameport_no));
gameport->dev.bus = &gameport_bus;
gameport->dev.release = gameport_release_port;
if (gameport->parent)
diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c
index 1a728f310b27..210425722aa2 100644
--- a/drivers/input/joystick/xpad.c
+++ b/drivers/input/joystick/xpad.c
@@ -31,12 +31,14 @@
* - the iForce driver drivers/char/joystick/iforce.c
* - the skeleton-driver drivers/usb/usb-skeleton.c
* - Xbox 360 information http://www.free60.org/wiki/Gamepad
+ * - Xbox One information https://github.com/quantus/xbox-one-controller-protocol
*
* Thanks to:
* - ITO Takayuki for providing essential xpad information on his website
* - Vojtech Pavlik - iforce driver / input subsystem
* - Greg Kroah-Hartman - usb-skeleton driver
* - XBOX Linux project - extra USB id's
+ * - Pekka Pöyry (quantus) - Xbox One controller reverse engineering
*
* TODO:
* - fine tune axes (especially trigger axes)
@@ -123,6 +125,7 @@ static const struct xpad_device {
{ 0x045e, 0x0289, "Microsoft X-Box pad v2 (US)", 0, XTYPE_XBOX },
{ 0x045e, 0x028e, "Microsoft X-Box 360 pad", 0, XTYPE_XBOX360 },
{ 0x045e, 0x02d1, "Microsoft X-Box One pad", 0, XTYPE_XBOXONE },
+ { 0x045e, 0x02dd, "Microsoft X-Box One pad (Covert Forces)", 0, XTYPE_XBOXONE },
{ 0x045e, 0x0291, "Xbox 360 Wireless Receiver (XBOX)", MAP_DPAD_TO_BUTTONS, XTYPE_XBOX360W },
{ 0x045e, 0x0719, "Xbox 360 Wireless Receiver", MAP_DPAD_TO_BUTTONS, XTYPE_XBOX360W },
{ 0x044f, 0x0f07, "Thrustmaster, Inc. Controller", 0, XTYPE_XBOX },
@@ -203,7 +206,7 @@ static const struct xpad_device {
{ 0x1bad, 0xf900, "Harmonix Xbox 360 Controller", 0, XTYPE_XBOX360 },
{ 0x1bad, 0xf901, "Gamestop Xbox 360 Controller", 0, XTYPE_XBOX360 },
{ 0x1bad, 0xf903, "Tron Xbox 360 controller", 0, XTYPE_XBOX360 },
- { 0x24c6, 0x5000, "Razer Atrox Arcade Stick", 0, XTYPE_XBOX360 },
+ { 0x24c6, 0x5000, "Razer Atrox Arcade Stick", MAP_TRIGGERS_TO_BUTTONS, XTYPE_XBOX360 },
{ 0x24c6, 0x5300, "PowerA MINI PROEX Controller", 0, XTYPE_XBOX360 },
{ 0x24c6, 0x5303, "Xbox Airflo wired controller", 0, XTYPE_XBOX360 },
{ 0x24c6, 0x5500, "Hori XBOX 360 EX 2 with Turbo", 0, XTYPE_XBOX360 },
@@ -241,7 +244,6 @@ static const signed short xpad_btn_triggers[] = {
-1
};
-
static const signed short xpad360_btn[] = { /* buttons for x360 controller */
BTN_TL, BTN_TR, /* Button LB/RB */
BTN_MODE, /* The big X button */
@@ -328,9 +330,6 @@ struct usb_xpad {
unsigned char *idata; /* input data */
dma_addr_t idata_dma;
- struct urb *bulk_out;
- unsigned char *bdata;
-
struct urb *irq_out; /* urb for interrupt out report */
unsigned char *odata; /* output data */
dma_addr_t odata_dma;
@@ -344,6 +343,8 @@ struct usb_xpad {
int mapping; /* map d-pad to buttons or to axes */
int xtype; /* type of xbox device */
+ int pad_nr; /* the order x360 pads were attached */
+ const char *name; /* name of the device */
};
/*
@@ -355,7 +356,6 @@ struct usb_xpad {
* The used report descriptor was taken from ITO Takayukis website:
* http://euc.jp/periphs/xbox-controller.ja.html
*/
-
static void xpad_process_packet(struct usb_xpad *xpad, u16 cmd, unsigned char *data)
{
struct input_dev *dev = xpad->dev;
@@ -438,7 +438,16 @@ static void xpad360_process_packet(struct usb_xpad *xpad,
input_report_key(dev, BTN_TRIGGER_HAPPY2, data[2] & 0x08);
input_report_key(dev, BTN_TRIGGER_HAPPY3, data[2] & 0x01);
input_report_key(dev, BTN_TRIGGER_HAPPY4, data[2] & 0x02);
- } else {
+ }
+
+ /*
+ * This should be a simple else block. However historically
+ * xbox360w has mapped DPAD to buttons while xbox360 did not. This
+ * made no sense, but now we can not just switch back and have to
+ * support both behaviors.
+ */
+ if (!(xpad->mapping & MAP_DPAD_TO_BUTTONS) ||
+ xpad->xtype == XTYPE_XBOX360W) {
input_report_abs(dev, ABS_HAT0X,
!!(data[2] & 0x08) - !!(data[2] & 0x04));
input_report_abs(dev, ABS_HAT0Y,
@@ -488,6 +497,8 @@ static void xpad360_process_packet(struct usb_xpad *xpad,
input_sync(dev);
}
+static void xpad_identify_controller(struct usb_xpad *xpad);
+
/*
* xpad360w_process_packet
*
@@ -502,14 +513,17 @@ static void xpad360_process_packet(struct usb_xpad *xpad,
* 01.1 - Pad state (Bytes 4+) valid
*
*/
-
static void xpad360w_process_packet(struct usb_xpad *xpad, u16 cmd, unsigned char *data)
{
/* Presence change */
if (data[0] & 0x08) {
if (data[1] & 0x80) {
xpad->pad_present = 1;
- usb_submit_urb(xpad->bulk_out, GFP_ATOMIC);
+ /*
+ * Light up the segment corresponding to
+ * controller number.
+ */
+ xpad_identify_controller(xpad);
} else
xpad->pad_present = 0;
}
@@ -666,28 +680,6 @@ exit:
__func__, retval);
}
-static void xpad_bulk_out(struct urb *urb)
-{
- struct usb_xpad *xpad = urb->context;
- struct device *dev = &xpad->intf->dev;
-
- switch (urb->status) {
- case 0:
- /* success */
- break;
- case -ECONNRESET:
- case -ENOENT:
- case -ESHUTDOWN:
- /* this urb is terminated, clean up */
- dev_dbg(dev, "%s - urb shutting down with status: %d\n",
- __func__, urb->status);
- break;
- default:
- dev_dbg(dev, "%s - nonzero urb status received: %d\n",
- __func__, urb->status);
- }
-}
-
static void xpad_irq_out(struct urb *urb)
{
struct usb_xpad *xpad = urb->context;
@@ -778,67 +770,109 @@ static void xpad_deinit_output(struct usb_xpad *xpad)
}
}
+static int xpad_inquiry_pad_presence(struct usb_xpad *xpad)
+{
+ int retval;
+
+ mutex_lock(&xpad->odata_mutex);
+
+ xpad->odata[0] = 0x08;
+ xpad->odata[1] = 0x00;
+ xpad->odata[2] = 0x0F;
+ xpad->odata[3] = 0xC0;
+ xpad->odata[4] = 0x00;
+ xpad->odata[5] = 0x00;
+ xpad->odata[6] = 0x00;
+ xpad->odata[7] = 0x00;
+ xpad->odata[8] = 0x00;
+ xpad->odata[9] = 0x00;
+ xpad->odata[10] = 0x00;
+ xpad->odata[11] = 0x00;
+ xpad->irq_out->transfer_buffer_length = 12;
+
+ retval = usb_submit_urb(xpad->irq_out, GFP_KERNEL);
+
+ mutex_unlock(&xpad->odata_mutex);
+
+ return retval;
+}
+
#ifdef CONFIG_JOYSTICK_XPAD_FF
static int xpad_play_effect(struct input_dev *dev, void *data, struct ff_effect *effect)
{
struct usb_xpad *xpad = input_get_drvdata(dev);
+ __u16 strong;
+ __u16 weak;
- if (effect->type == FF_RUMBLE) {
- __u16 strong = effect->u.rumble.strong_magnitude;
- __u16 weak = effect->u.rumble.weak_magnitude;
-
- switch (xpad->xtype) {
-
- case XTYPE_XBOX:
- xpad->odata[0] = 0x00;
- xpad->odata[1] = 0x06;
- xpad->odata[2] = 0x00;
- xpad->odata[3] = strong / 256; /* left actuator */
- xpad->odata[4] = 0x00;
- xpad->odata[5] = weak / 256; /* right actuator */
- xpad->irq_out->transfer_buffer_length = 6;
-
- return usb_submit_urb(xpad->irq_out, GFP_ATOMIC);
-
- case XTYPE_XBOX360:
- xpad->odata[0] = 0x00;
- xpad->odata[1] = 0x08;
- xpad->odata[2] = 0x00;
- xpad->odata[3] = strong / 256; /* left actuator? */
- xpad->odata[4] = weak / 256; /* right actuator? */
- xpad->odata[5] = 0x00;
- xpad->odata[6] = 0x00;
- xpad->odata[7] = 0x00;
- xpad->irq_out->transfer_buffer_length = 8;
-
- return usb_submit_urb(xpad->irq_out, GFP_ATOMIC);
-
- case XTYPE_XBOX360W:
- xpad->odata[0] = 0x00;
- xpad->odata[1] = 0x01;
- xpad->odata[2] = 0x0F;
- xpad->odata[3] = 0xC0;
- xpad->odata[4] = 0x00;
- xpad->odata[5] = strong / 256;
- xpad->odata[6] = weak / 256;
- xpad->odata[7] = 0x00;
- xpad->odata[8] = 0x00;
- xpad->odata[9] = 0x00;
- xpad->odata[10] = 0x00;
- xpad->odata[11] = 0x00;
- xpad->irq_out->transfer_buffer_length = 12;
-
- return usb_submit_urb(xpad->irq_out, GFP_ATOMIC);
-
- default:
- dev_dbg(&xpad->dev->dev,
- "%s - rumble command sent to unsupported xpad type: %d\n",
- __func__, xpad->xtype);
- return -1;
- }
+ if (effect->type != FF_RUMBLE)
+ return 0;
+
+ strong = effect->u.rumble.strong_magnitude;
+ weak = effect->u.rumble.weak_magnitude;
+
+ switch (xpad->xtype) {
+ case XTYPE_XBOX:
+ xpad->odata[0] = 0x00;
+ xpad->odata[1] = 0x06;
+ xpad->odata[2] = 0x00;
+ xpad->odata[3] = strong / 256; /* left actuator */
+ xpad->odata[4] = 0x00;
+ xpad->odata[5] = weak / 256; /* right actuator */
+ xpad->irq_out->transfer_buffer_length = 6;
+ break;
+
+ case XTYPE_XBOX360:
+ xpad->odata[0] = 0x00;
+ xpad->odata[1] = 0x08;
+ xpad->odata[2] = 0x00;
+ xpad->odata[3] = strong / 256; /* left actuator? */
+ xpad->odata[4] = weak / 256; /* right actuator? */
+ xpad->odata[5] = 0x00;
+ xpad->odata[6] = 0x00;
+ xpad->odata[7] = 0x00;
+ xpad->irq_out->transfer_buffer_length = 8;
+ break;
+
+ case XTYPE_XBOX360W:
+ xpad->odata[0] = 0x00;
+ xpad->odata[1] = 0x01;
+ xpad->odata[2] = 0x0F;
+ xpad->odata[3] = 0xC0;
+ xpad->odata[4] = 0x00;
+ xpad->odata[5] = strong / 256;
+ xpad->odata[6] = weak / 256;
+ xpad->odata[7] = 0x00;
+ xpad->odata[8] = 0x00;
+ xpad->odata[9] = 0x00;
+ xpad->odata[10] = 0x00;
+ xpad->odata[11] = 0x00;
+ xpad->irq_out->transfer_buffer_length = 12;
+ break;
+
+ case XTYPE_XBOXONE:
+ xpad->odata[0] = 0x09; /* activate rumble */
+ xpad->odata[1] = 0x08;
+ xpad->odata[2] = 0x00;
+ xpad->odata[3] = 0x08; /* continuous effect */
+ xpad->odata[4] = 0x00; /* simple rumble mode */
+ xpad->odata[5] = 0x03; /* L and R actuator only */
+ xpad->odata[6] = 0x00; /* TODO: LT actuator */
+ xpad->odata[7] = 0x00; /* TODO: RT actuator */
+ xpad->odata[8] = strong / 256; /* left actuator */
+ xpad->odata[9] = weak / 256; /* right actuator */
+ xpad->odata[10] = 0x80; /* length of pulse */
+ xpad->odata[11] = 0x00; /* stop period of pulse */
+ xpad->irq_out->transfer_buffer_length = 12;
+ break;
+
+ default:
+ dev_dbg(&xpad->dev->dev,
+ "%s - rumble command sent to unsupported xpad type: %d\n",
+ __func__, xpad->xtype);
+ return -EINVAL;
}
- return 0;
+ return usb_submit_urb(xpad->irq_out, GFP_ATOMIC);
}
static int xpad_init_ff(struct usb_xpad *xpad)
@@ -857,6 +891,9 @@ static int xpad_init_ff(struct usb_xpad *xpad) { return 0; }
#if defined(CONFIG_JOYSTICK_XPAD_LEDS)
#include <linux/leds.h>
+#include <linux/idr.h>
+
+static DEFINE_IDA(xpad_pad_seq);
struct xpad_led {
char name[16];
@@ -864,17 +901,67 @@ struct xpad_led {
struct usb_xpad *xpad;
};
+/**
+ * set the LEDs on Xbox360 / Wireless Controllers
+ * @param command
+ * 0: off
+ * 1: all blink, then previous setting
+ * 2: 1/top-left blink, then on
+ * 3: 2/top-right blink, then on
+ * 4: 3/bottom-left blink, then on
+ * 5: 4/bottom-right blink, then on
+ * 6: 1/top-left on
+ * 7: 2/top-right on
+ * 8: 3/bottom-left on
+ * 9: 4/bottom-right on
+ * 10: rotate
+ * 11: blink, based on previous setting
+ * 12: slow blink, based on previous setting
+ * 13: rotate with two lights
+ * 14: persistent slow all blink
+ * 15: blink once, then previous setting
+ */
static void xpad_send_led_command(struct usb_xpad *xpad, int command)
{
- if (command >= 0 && command < 14) {
- mutex_lock(&xpad->odata_mutex);
+ command %= 16;
+
+ mutex_lock(&xpad->odata_mutex);
+
+ switch (xpad->xtype) {
+ case XTYPE_XBOX360:
xpad->odata[0] = 0x01;
xpad->odata[1] = 0x03;
xpad->odata[2] = command;
xpad->irq_out->transfer_buffer_length = 3;
- usb_submit_urb(xpad->irq_out, GFP_KERNEL);
- mutex_unlock(&xpad->odata_mutex);
+ break;
+ case XTYPE_XBOX360W:
+ xpad->odata[0] = 0x00;
+ xpad->odata[1] = 0x00;
+ xpad->odata[2] = 0x08;
+ xpad->odata[3] = 0x40 + command;
+ xpad->odata[4] = 0x00;
+ xpad->odata[5] = 0x00;
+ xpad->odata[6] = 0x00;
+ xpad->odata[7] = 0x00;
+ xpad->odata[8] = 0x00;
+ xpad->odata[9] = 0x00;
+ xpad->odata[10] = 0x00;
+ xpad->odata[11] = 0x00;
+ xpad->irq_out->transfer_buffer_length = 12;
+ break;
}
+
+ usb_submit_urb(xpad->irq_out, GFP_KERNEL);
+ mutex_unlock(&xpad->odata_mutex);
+}
+
+/*
+ * Light up the segment corresponding to the pad number on
+ * Xbox 360 Controllers.
+ */
+static void xpad_identify_controller(struct usb_xpad *xpad)
+{
+ xpad_send_led_command(xpad, (xpad->pad_nr % 4) + 2);
}
static void xpad_led_set(struct led_classdev *led_cdev,
@@ -888,22 +975,24 @@ static void xpad_led_set(struct led_classdev *led_cdev,
static int xpad_led_probe(struct usb_xpad *xpad)
{
- static atomic_t led_seq = ATOMIC_INIT(0);
- long led_no;
struct xpad_led *led;
struct led_classdev *led_cdev;
int error;
- if (xpad->xtype != XTYPE_XBOX360)
+ if (xpad->xtype != XTYPE_XBOX360 && xpad->xtype != XTYPE_XBOX360W)
return 0;
xpad->led = led = kzalloc(sizeof(struct xpad_led), GFP_KERNEL);
if (!led)
return -ENOMEM;
- led_no = (long)atomic_inc_return(&led_seq) - 1;
+ xpad->pad_nr = ida_simple_get(&xpad_pad_seq, 0, 0, GFP_KERNEL);
+ if (xpad->pad_nr < 0) {
+ error = xpad->pad_nr;
+ goto err_free_mem;
+ }
- snprintf(led->name, sizeof(led->name), "xpad%ld", led_no);
+ snprintf(led->name, sizeof(led->name), "xpad%d", xpad->pad_nr);
led->xpad = xpad;
led_cdev = &led->led_cdev;
@@ -911,18 +1000,26 @@ static int xpad_led_probe(struct usb_xpad *xpad)
led_cdev->brightness_set = xpad_led_set;
error = led_classdev_register(&xpad->udev->dev, led_cdev);
- if (error) {
- kfree(led);
- xpad->led = NULL;
- return error;
- }
+ if (error)
+ goto err_free_id;
- /*
- * Light up the segment corresponding to controller number
- */
- xpad_send_led_command(xpad, (led_no % 4) + 2);
+ if (xpad->xtype == XTYPE_XBOX360) {
+ /*
+ * Light up the segment corresponding to controller
+ * number on wired devices. On wireless we'll do that
+ * when they respond to "presence" packet.
+ */
+ xpad_identify_controller(xpad);
+ }
return 0;
+
+err_free_id:
+ ida_simple_remove(&xpad_pad_seq, xpad->pad_nr);
+err_free_mem:
+ kfree(led);
+ xpad->led = NULL;
+ return error;
}
static void xpad_led_disconnect(struct usb_xpad *xpad)
@@ -931,15 +1028,16 @@ static void xpad_led_disconnect(struct usb_xpad *xpad)
if (xpad_led) {
led_classdev_unregister(&xpad_led->led_cdev);
+ ida_simple_remove(&xpad_pad_seq, xpad->pad_nr);
kfree(xpad_led);
}
}
#else
static int xpad_led_probe(struct usb_xpad *xpad) { return 0; }
static void xpad_led_disconnect(struct usb_xpad *xpad) { }
+static void xpad_identify_controller(struct usb_xpad *xpad) { }
#endif
-
static int xpad_open(struct input_dev *dev)
{
struct usb_xpad *xpad = input_get_drvdata(dev);
@@ -999,94 +1097,36 @@ static void xpad_set_up_abs(struct input_dev *input_dev, signed short abs)
}
}
-static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id)
+static void xpad_deinit_input(struct usb_xpad *xpad)
+{
+ xpad_led_disconnect(xpad);
+ input_unregister_device(xpad->dev);
+}
+
+static int xpad_init_input(struct usb_xpad *xpad)
{
- struct usb_device *udev = interface_to_usbdev(intf);
- struct usb_xpad *xpad;
struct input_dev *input_dev;
- struct usb_endpoint_descriptor *ep_irq_in;
- int ep_irq_in_idx;
int i, error;
- if (intf->cur_altsetting->desc.bNumEndpoints != 2)
- return -ENODEV;
-
- for (i = 0; xpad_device[i].idVendor; i++) {
- if ((le16_to_cpu(udev->descriptor.idVendor) == xpad_device[i].idVendor) &&
- (le16_to_cpu(udev->descriptor.idProduct) == xpad_device[i].idProduct))
- break;
- }
-
- if (xpad_device[i].xtype == XTYPE_XBOXONE &&
- intf->cur_altsetting->desc.bInterfaceNumber != 0) {
- /*
- * The Xbox One controller lists three interfaces all with the
- * same interface class, subclass and protocol. Differentiate by
- * interface number.
- */
- return -ENODEV;
- }
-
- xpad = kzalloc(sizeof(struct usb_xpad), GFP_KERNEL);
input_dev = input_allocate_device();
- if (!xpad || !input_dev) {
- error = -ENOMEM;
- goto fail1;
- }
-
- xpad->idata = usb_alloc_coherent(udev, XPAD_PKT_LEN,
- GFP_KERNEL, &xpad->idata_dma);
- if (!xpad->idata) {
- error = -ENOMEM;
- goto fail1;
- }
-
- xpad->irq_in = usb_alloc_urb(0, GFP_KERNEL);
- if (!xpad->irq_in) {
- error = -ENOMEM;
- goto fail2;
- }
-
- xpad->udev = udev;
- xpad->intf = intf;
- xpad->mapping = xpad_device[i].mapping;
- xpad->xtype = xpad_device[i].xtype;
-
- if (xpad->xtype == XTYPE_UNKNOWN) {
- if (intf->cur_altsetting->desc.bInterfaceClass == USB_CLASS_VENDOR_SPEC) {
- if (intf->cur_altsetting->desc.bInterfaceProtocol == 129)
- xpad->xtype = XTYPE_XBOX360W;
- else
- xpad->xtype = XTYPE_XBOX360;
- } else
- xpad->xtype = XTYPE_XBOX;
-
- if (dpad_to_buttons)
- xpad->mapping |= MAP_DPAD_TO_BUTTONS;
- if (triggers_to_buttons)
- xpad->mapping |= MAP_TRIGGERS_TO_BUTTONS;
- if (sticks_to_null)
- xpad->mapping |= MAP_STICKS_TO_NULL;
- }
+ if (!input_dev)
+ return -ENOMEM;
xpad->dev = input_dev;
- usb_make_path(udev, xpad->phys, sizeof(xpad->phys));
- strlcat(xpad->phys, "/input0", sizeof(xpad->phys));
-
- input_dev->name = xpad_device[i].name;
+ input_dev->name = xpad->name;
input_dev->phys = xpad->phys;
- usb_to_input_id(udev, &input_dev->id);
- input_dev->dev.parent = &intf->dev;
+ usb_to_input_id(xpad->udev, &input_dev->id);
+ input_dev->dev.parent = &xpad->intf->dev;
input_set_drvdata(input_dev, xpad);
input_dev->open = xpad_open;
input_dev->close = xpad_close;
- input_dev->evbit[0] = BIT_MASK(EV_KEY);
+ __set_bit(EV_KEY, input_dev->evbit);
if (!(xpad->mapping & MAP_STICKS_TO_NULL)) {
- input_dev->evbit[0] |= BIT_MASK(EV_ABS);
+ __set_bit(EV_ABS, input_dev->evbit);
/* set up axes */
for (i = 0; xpad_abs[i] >= 0; i++)
xpad_set_up_abs(input_dev, xpad_abs[i]);
@@ -1109,7 +1149,16 @@ static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id
if (xpad->mapping & MAP_DPAD_TO_BUTTONS) {
for (i = 0; xpad_btn_pad[i] >= 0; i++)
__set_bit(xpad_btn_pad[i], input_dev->keybit);
- } else {
+ }
+
+ /*
+ * This should be a simple else block. However historically
+ * xbox360w has mapped DPAD to buttons while xbox360 did not. This
+ * made no sense, but now we can not just switch back and have to
+ * support both behaviors.
+ */
+ if (!(xpad->mapping & MAP_DPAD_TO_BUTTONS) ||
+ xpad->xtype == XTYPE_XBOX360W) {
for (i = 0; xpad_abs_pad[i] >= 0; i++)
xpad_set_up_abs(input_dev, xpad_abs_pad[i]);
}
@@ -1122,17 +1171,106 @@ static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id
xpad_set_up_abs(input_dev, xpad_abs_triggers[i]);
}
- error = xpad_init_output(intf, xpad);
- if (error)
- goto fail3;
-
error = xpad_init_ff(xpad);
if (error)
- goto fail4;
+ goto err_free_input;
error = xpad_led_probe(xpad);
if (error)
- goto fail5;
+ goto err_destroy_ff;
+
+ error = input_register_device(xpad->dev);
+ if (error)
+ goto err_disconnect_led;
+
+ return 0;
+
+err_disconnect_led:
+ xpad_led_disconnect(xpad);
+err_destroy_ff:
+ input_ff_destroy(input_dev);
+err_free_input:
+ input_free_device(input_dev);
+ return error;
+}
+
+static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id)
+{
+ struct usb_device *udev = interface_to_usbdev(intf);
+ struct usb_xpad *xpad;
+ struct usb_endpoint_descriptor *ep_irq_in;
+ int ep_irq_in_idx;
+ int i, error;
+
+ if (intf->cur_altsetting->desc.bNumEndpoints != 2)
+ return -ENODEV;
+
+ for (i = 0; xpad_device[i].idVendor; i++) {
+ if ((le16_to_cpu(udev->descriptor.idVendor) == xpad_device[i].idVendor) &&
+ (le16_to_cpu(udev->descriptor.idProduct) == xpad_device[i].idProduct))
+ break;
+ }
+
+ xpad = kzalloc(sizeof(struct usb_xpad), GFP_KERNEL);
+ if (!xpad)
+ return -ENOMEM;
+
+ usb_make_path(udev, xpad->phys, sizeof(xpad->phys));
+ strlcat(xpad->phys, "/input0", sizeof(xpad->phys));
+
+ xpad->idata = usb_alloc_coherent(udev, XPAD_PKT_LEN,
+ GFP_KERNEL, &xpad->idata_dma);
+ if (!xpad->idata) {
+ error = -ENOMEM;
+ goto err_free_mem;
+ }
+
+ xpad->irq_in = usb_alloc_urb(0, GFP_KERNEL);
+ if (!xpad->irq_in) {
+ error = -ENOMEM;
+ goto err_free_idata;
+ }
+
+ xpad->udev = udev;
+ xpad->intf = intf;
+ xpad->mapping = xpad_device[i].mapping;
+ xpad->xtype = xpad_device[i].xtype;
+ xpad->name = xpad_device[i].name;
+
+ if (xpad->xtype == XTYPE_UNKNOWN) {
+ if (intf->cur_altsetting->desc.bInterfaceClass == USB_CLASS_VENDOR_SPEC) {
+ if (intf->cur_altsetting->desc.bInterfaceProtocol == 129)
+ xpad->xtype = XTYPE_XBOX360W;
+ else if (intf->cur_altsetting->desc.bInterfaceProtocol == 208)
+ xpad->xtype = XTYPE_XBOXONE;
+ else
+ xpad->xtype = XTYPE_XBOX360;
+ } else {
+ xpad->xtype = XTYPE_XBOX;
+ }
+
+ if (dpad_to_buttons)
+ xpad->mapping |= MAP_DPAD_TO_BUTTONS;
+ if (triggers_to_buttons)
+ xpad->mapping |= MAP_TRIGGERS_TO_BUTTONS;
+ if (sticks_to_null)
+ xpad->mapping |= MAP_STICKS_TO_NULL;
+ }
+
+ if (xpad->xtype == XTYPE_XBOXONE &&
+ intf->cur_altsetting->desc.bInterfaceNumber != 0) {
+ /*
+ * The Xbox One controller lists three interfaces all with the
+ * same interface class, subclass and protocol. Differentiate by
+ * interface number.
+ */
+ error = -ENODEV;
+ goto err_free_in_urb;
+ }
+
+ error = xpad_init_output(intf, xpad);
+ if (error)
+ goto err_free_in_urb;
/* Xbox One controller has in/out endpoints swapped. */
ep_irq_in_idx = xpad->xtype == XTYPE_XBOXONE ? 1 : 0;
@@ -1145,59 +1283,13 @@ static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id
xpad->irq_in->transfer_dma = xpad->idata_dma;
xpad->irq_in->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
- error = input_register_device(xpad->dev);
- if (error)
- goto fail6;
-
usb_set_intfdata(intf, xpad);
- if (xpad->xtype == XTYPE_XBOX360W) {
- /*
- * Setup the message to set the LEDs on the
- * controller when it shows up
- */
- xpad->bulk_out = usb_alloc_urb(0, GFP_KERNEL);
- if (!xpad->bulk_out) {
- error = -ENOMEM;
- goto fail7;
- }
-
- xpad->bdata = kzalloc(XPAD_PKT_LEN, GFP_KERNEL);
- if (!xpad->bdata) {
- error = -ENOMEM;
- goto fail8;
- }
-
- xpad->bdata[2] = 0x08;
- switch (intf->cur_altsetting->desc.bInterfaceNumber) {
- case 0:
- xpad->bdata[3] = 0x42;
- break;
- case 2:
- xpad->bdata[3] = 0x43;
- break;
- case 4:
- xpad->bdata[3] = 0x44;
- break;
- case 6:
- xpad->bdata[3] = 0x45;
- }
-
- ep_irq_in = &intf->cur_altsetting->endpoint[1].desc;
- if (usb_endpoint_is_bulk_out(ep_irq_in)) {
- usb_fill_bulk_urb(xpad->bulk_out, udev,
- usb_sndbulkpipe(udev,
- ep_irq_in->bEndpointAddress),
- xpad->bdata, XPAD_PKT_LEN,
- xpad_bulk_out, xpad);
- } else {
- usb_fill_int_urb(xpad->bulk_out, udev,
- usb_sndintpipe(udev,
- ep_irq_in->bEndpointAddress),
- xpad->bdata, XPAD_PKT_LEN,
- xpad_bulk_out, xpad, 0);
- }
+ error = xpad_init_input(xpad);
+ if (error)
+ goto err_deinit_output;
+ if (xpad->xtype == XTYPE_XBOX360W) {
/*
* Submit the int URB immediately rather than waiting for open
* because we get status messages from the device whether
@@ -1208,22 +1300,32 @@ static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id
xpad->irq_in->dev = xpad->udev;
error = usb_submit_urb(xpad->irq_in, GFP_KERNEL);
if (error)
- goto fail9;
- }
+ goto err_deinit_input;
+ /*
+ * Send presence packet.
+ * This will force the controller to resend connection packets.
+ * This is useful in the case we activate the module after the
+ * adapter has been plugged in, as it won't automatically
+ * send us info about the controllers.
+ */
+ error = xpad_inquiry_pad_presence(xpad);
+ if (error)
+ goto err_kill_in_urb;
+ }
return 0;
- fail9: kfree(xpad->bdata);
- fail8: usb_free_urb(xpad->bulk_out);
- fail7: input_unregister_device(input_dev);
- input_dev = NULL;
- fail6: xpad_led_disconnect(xpad);
- fail5: if (input_dev)
- input_ff_destroy(input_dev);
- fail4: xpad_deinit_output(xpad);
- fail3: usb_free_urb(xpad->irq_in);
- fail2: usb_free_coherent(udev, XPAD_PKT_LEN, xpad->idata, xpad->idata_dma);
- fail1: input_free_device(input_dev);
+err_kill_in_urb:
+ usb_kill_urb(xpad->irq_in);
+err_deinit_input:
+ xpad_deinit_input(xpad);
+err_deinit_output:
+ xpad_deinit_output(xpad);
+err_free_in_urb:
+ usb_free_urb(xpad->irq_in);
+err_free_idata:
+ usb_free_coherent(udev, XPAD_PKT_LEN, xpad->idata, xpad->idata_dma);
+err_free_mem:
kfree(xpad);
return error;
@@ -1233,13 +1335,10 @@ static void xpad_disconnect(struct usb_interface *intf)
{
struct usb_xpad *xpad = usb_get_intfdata (intf);
- xpad_led_disconnect(xpad);
- input_unregister_device(xpad->dev);
+ xpad_deinit_input(xpad);
xpad_deinit_output(xpad);
if (xpad->xtype == XTYPE_XBOX360W) {
- usb_kill_urb(xpad->bulk_out);
- usb_free_urb(xpad->bulk_out);
usb_kill_urb(xpad->irq_in);
}
@@ -1247,7 +1346,6 @@ static void xpad_disconnect(struct usb_interface *intf)
usb_free_coherent(xpad->udev, XPAD_PKT_LEN,
xpad->idata, xpad->idata_dma);
- kfree(xpad->bdata);
kfree(xpad);
usb_set_intfdata(intf, NULL);
diff --git a/drivers/input/keycombo.c b/drivers/input/keycombo.c
new file mode 100644
index 000000000000..2fba451b91d5
--- /dev/null
+++ b/drivers/input/keycombo.c
@@ -0,0 +1,261 @@
+/* drivers/input/keycombo.c
+ *
+ * Copyright (C) 2014 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/input.h>
+#include <linux/keycombo.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/reboot.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+struct keycombo_state {
+ struct input_handler input_handler;
+ unsigned long keybit[BITS_TO_LONGS(KEY_CNT)];
+ unsigned long upbit[BITS_TO_LONGS(KEY_CNT)];
+ unsigned long key[BITS_TO_LONGS(KEY_CNT)];
+ spinlock_t lock;
+ struct workqueue_struct *wq;
+ int key_down_target;
+ int key_down;
+ int key_up;
+ struct delayed_work key_down_work;
+ int delay;
+ struct work_struct key_up_work;
+ void (*key_up_fn)(void *);
+ void (*key_down_fn)(void *);
+ void *priv;
+ int key_is_down;
+ struct wakeup_source combo_held_wake_source;
+ struct wakeup_source combo_up_wake_source;
+};
+
+static void do_key_down(struct work_struct *work)
+{
+ struct delayed_work *dwork = container_of(work, struct delayed_work,
+ work);
+ struct keycombo_state *state = container_of(dwork,
+ struct keycombo_state, key_down_work);
+ if (state->key_down_fn)
+ state->key_down_fn(state->priv);
+}
+
+static void do_key_up(struct work_struct *work)
+{
+ struct keycombo_state *state = container_of(work, struct keycombo_state,
+ key_up_work);
+ if (state->key_up_fn)
+ state->key_up_fn(state->priv);
+ __pm_relax(&state->combo_up_wake_source);
+}
+
+static void keycombo_event(struct input_handle *handle, unsigned int type,
+ unsigned int code, int value)
+{
+ unsigned long flags;
+ struct keycombo_state *state = handle->private;
+
+ if (type != EV_KEY)
+ return;
+
+ if (code >= KEY_MAX)
+ return;
+
+ if (!test_bit(code, state->keybit))
+ return;
+
+ spin_lock_irqsave(&state->lock, flags);
+ if (!test_bit(code, state->key) == !value)
+ goto done;
+ __change_bit(code, state->key);
+ if (test_bit(code, state->upbit)) {
+ if (value)
+ state->key_up++;
+ else
+ state->key_up--;
+ } else {
+ if (value)
+ state->key_down++;
+ else
+ state->key_down--;
+ }
+ if (state->key_down == state->key_down_target && state->key_up == 0) {
+ __pm_stay_awake(&state->combo_held_wake_source);
+ state->key_is_down = 1;
+ if (queue_delayed_work(state->wq, &state->key_down_work,
+ state->delay))
+ pr_debug("Key down work already queued!");
+ } else if (state->key_is_down) {
+ if (!cancel_delayed_work(&state->key_down_work)) {
+ __pm_stay_awake(&state->combo_up_wake_source);
+ queue_work(state->wq, &state->key_up_work);
+ }
+ __pm_relax(&state->combo_held_wake_source);
+ state->key_is_down = 0;
+ }
+done:
+ spin_unlock_irqrestore(&state->lock, flags);
+}
+
+static int keycombo_connect(struct input_handler *handler,
+ struct input_dev *dev,
+ const struct input_device_id *id)
+{
+ int i;
+ int ret;
+ struct input_handle *handle;
+ struct keycombo_state *state =
+ container_of(handler, struct keycombo_state, input_handler);
+ for (i = 0; i < KEY_MAX; i++) {
+ if (test_bit(i, state->keybit) && test_bit(i, dev->keybit))
+ break;
+ }
+ if (i == KEY_MAX)
+ return -ENODEV;
+
+ handle = kzalloc(sizeof(*handle), GFP_KERNEL);
+ if (!handle)
+ return -ENOMEM;
+
+ handle->dev = dev;
+ handle->handler = handler;
+ handle->name = KEYCOMBO_NAME;
+ handle->private = state;
+
+ ret = input_register_handle(handle);
+ if (ret)
+ goto err_input_register_handle;
+
+ ret = input_open_device(handle);
+ if (ret)
+ goto err_input_open_device;
+
+ return 0;
+
+err_input_open_device:
+ input_unregister_handle(handle);
+err_input_register_handle:
+ kfree(handle);
+ return ret;
+}
+
+static void keycombo_disconnect(struct input_handle *handle)
+{
+ input_close_device(handle);
+ input_unregister_handle(handle);
+ kfree(handle);
+}
+
+static const struct input_device_id keycombo_ids[] = {
+ {
+ .flags = INPUT_DEVICE_ID_MATCH_EVBIT,
+ .evbit = { BIT_MASK(EV_KEY) },
+ },
+ { },
+};
+MODULE_DEVICE_TABLE(input, keycombo_ids);
+
+static int keycombo_probe(struct platform_device *pdev)
+{
+ int ret;
+ int key, *keyp;
+ struct keycombo_state *state;
+ struct keycombo_platform_data *pdata = pdev->dev.platform_data;
+
+ if (!pdata)
+ return -EINVAL;
+
+ state = kzalloc(sizeof(*state), GFP_KERNEL);
+ if (!state)
+ return -ENOMEM;
+
+ spin_lock_init(&state->lock);
+ keyp = pdata->keys_down;
+ while ((key = *keyp++)) {
+ if (key >= KEY_MAX)
+ continue;
+ state->key_down_target++;
+ __set_bit(key, state->keybit);
+ }
+ if (pdata->keys_up) {
+ keyp = pdata->keys_up;
+ while ((key = *keyp++)) {
+ if (key >= KEY_MAX)
+ continue;
+ __set_bit(key, state->keybit);
+ __set_bit(key, state->upbit);
+ }
+ }
+
+ state->wq = alloc_ordered_workqueue("keycombo", 0);
+ if (!state->wq)
+ return -ENOMEM;
+
+ state->priv = pdata->priv;
+
+ if (pdata->key_down_fn)
+ state->key_down_fn = pdata->key_down_fn;
+ INIT_DELAYED_WORK(&state->key_down_work, do_key_down);
+
+ if (pdata->key_up_fn)
+ state->key_up_fn = pdata->key_up_fn;
+ INIT_WORK(&state->key_up_work, do_key_up);
+
+ wakeup_source_init(&state->combo_held_wake_source, "key combo");
+ wakeup_source_init(&state->combo_up_wake_source, "key combo up");
+ state->delay = msecs_to_jiffies(pdata->key_down_delay);
+
+ state->input_handler.event = keycombo_event;
+ state->input_handler.connect = keycombo_connect;
+ state->input_handler.disconnect = keycombo_disconnect;
+ state->input_handler.name = KEYCOMBO_NAME;
+ state->input_handler.id_table = keycombo_ids;
+ ret = input_register_handler(&state->input_handler);
+ if (ret) {
+ kfree(state);
+ return ret;
+ }
+ platform_set_drvdata(pdev, state);
+ return 0;
+}
+
+int keycombo_remove(struct platform_device *pdev)
+{
+ struct keycombo_state *state = platform_get_drvdata(pdev);
+ input_unregister_handler(&state->input_handler);
+ destroy_workqueue(state->wq);
+ kfree(state);
+ return 0;
+}
+
+
+struct platform_driver keycombo_driver = {
+ .driver.name = KEYCOMBO_NAME,
+ .probe = keycombo_probe,
+ .remove = keycombo_remove,
+};
+
+static int __init keycombo_init(void)
+{
+ return platform_driver_register(&keycombo_driver);
+}
+
+static void __exit keycombo_exit(void)
+{
+ return platform_driver_unregister(&keycombo_driver);
+}
+
+module_init(keycombo_init);
+module_exit(keycombo_exit);
diff --git a/drivers/input/keyreset.c b/drivers/input/keyreset.c
new file mode 100644
index 000000000000..7fbf7247e65f
--- /dev/null
+++ b/drivers/input/keyreset.c
@@ -0,0 +1,145 @@
+/* drivers/input/keyreset.c
+ *
+ * Copyright (C) 2014 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/input.h>
+#include <linux/keyreset.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/reboot.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+#include <linux/keycombo.h>
+
+struct keyreset_state {
+ int restart_requested;
+ int (*reset_fn)(void);
+ struct platform_device *pdev_child;
+ struct work_struct restart_work;
+};
+
+static void do_restart(struct work_struct *unused)
+{
+ sys_sync();
+ kernel_restart(NULL);
+}
+
+static void do_reset_fn(void *priv)
+{
+ struct keyreset_state *state = priv;
+ if (state->restart_requested)
+ panic("keyboard reset failed, %d", state->restart_requested);
+ if (state->reset_fn) {
+ state->restart_requested = state->reset_fn();
+ } else {
+ pr_info("keyboard reset\n");
+ schedule_work(&state->restart_work);
+ state->restart_requested = 1;
+ }
+}
+
+static int keyreset_probe(struct platform_device *pdev)
+{
+ int ret = -ENOMEM;
+ struct keycombo_platform_data *pdata_child;
+ struct keyreset_platform_data *pdata = pdev->dev.platform_data;
+ int up_size = 0, down_size = 0, size;
+ int key, *keyp;
+ struct keyreset_state *state;
+
+ if (!pdata)
+ return -EINVAL;
+ state = devm_kzalloc(&pdev->dev, sizeof(*state), GFP_KERNEL);
+ if (!state)
+ return -ENOMEM;
+
+ state->pdev_child = platform_device_alloc(KEYCOMBO_NAME,
+ PLATFORM_DEVID_AUTO);
+ if (!state->pdev_child)
+ return -ENOMEM;
+ state->pdev_child->dev.parent = &pdev->dev;
+ INIT_WORK(&state->restart_work, do_restart);
+
+ keyp = pdata->keys_down;
+ while ((key = *keyp++)) {
+ if (key >= KEY_MAX)
+ continue;
+ down_size++;
+ }
+ if (pdata->keys_up) {
+ keyp = pdata->keys_up;
+ while ((key = *keyp++)) {
+ if (key >= KEY_MAX)
+ continue;
+ up_size++;
+ }
+ }
+ size = sizeof(struct keycombo_platform_data)
+ + sizeof(int) * (down_size + 1);
+ pdata_child = devm_kzalloc(&pdev->dev, size, GFP_KERNEL);
+ if (!pdata_child)
+ goto error;
+ memcpy(pdata_child->keys_down, pdata->keys_down,
+ sizeof(int) * down_size);
+ if (up_size > 0) {
+ pdata_child->keys_up = devm_kzalloc(&pdev->dev, up_size + 1,
+ GFP_KERNEL);
+ if (!pdata_child->keys_up)
+ goto error;
+ memcpy(pdata_child->keys_up, pdata->keys_up,
+ sizeof(int) * up_size);
+ if (!pdata_child->keys_up)
+ goto error;
+ }
+ state->reset_fn = pdata->reset_fn;
+ pdata_child->key_down_fn = do_reset_fn;
+ pdata_child->priv = state;
+ pdata_child->key_down_delay = pdata->key_down_delay;
+ ret = platform_device_add_data(state->pdev_child, pdata_child, size);
+ if (ret)
+ goto error;
+ platform_set_drvdata(pdev, state);
+ return platform_device_add(state->pdev_child);
+error:
+ platform_device_put(state->pdev_child);
+ return ret;
+}
+
+int keyreset_remove(struct platform_device *pdev)
+{
+ struct keyreset_state *state = platform_get_drvdata(pdev);
+ platform_device_put(state->pdev_child);
+ return 0;
+}
+
+
+struct platform_driver keyreset_driver = {
+ .driver.name = KEYRESET_NAME,
+ .probe = keyreset_probe,
+ .remove = keyreset_remove,
+};
+
+static int __init keyreset_init(void)
+{
+ return platform_driver_register(&keyreset_driver);
+}
+
+static void __exit keyreset_exit(void)
+{
+ return platform_driver_unregister(&keyreset_driver);
+}
+
+module_init(keyreset_init);
+module_exit(keyreset_exit);
diff --git a/drivers/input/misc/Kconfig b/drivers/input/misc/Kconfig
index 23297ab6163f..838824bc7d31 100644
--- a/drivers/input/misc/Kconfig
+++ b/drivers/input/misc/Kconfig
@@ -319,6 +319,17 @@ config INPUT_ATI_REMOTE2
To compile this driver as a module, choose M here: the module will be
called ati_remote2.
+config INPUT_KEYCHORD
+ tristate "Key chord input driver support"
+ help
+ Say Y here if you want to enable the key chord driver
+ accessible at /dev/keychord. This driver can be used
+ for receiving notifications when client specified key
+ combinations are pressed.
+
+ To compile this driver as a module, choose M here: the
+ module will be called keychord.
+
config INPUT_KEYSPAN_REMOTE
tristate "Keyspan DMR USB remote control"
depends on USB_ARCH_HAS_HCD
@@ -454,6 +465,11 @@ config INPUT_SGI_BTNS
To compile this driver as a module, choose M here: the
module will be called sgi_btns.
+config INPUT_GPIO
+ tristate "GPIO driver support"
+ help
+ Say Y here if you want to support gpio based keys, wheels etc...
+
config HP_SDC_RTC
tristate "HP SDC Real Time Clock"
depends on (GSC || HP300) && SERIO
diff --git a/drivers/input/misc/Makefile b/drivers/input/misc/Makefile
index 19c760361f80..f77268edd9c2 100644
--- a/drivers/input/misc/Makefile
+++ b/drivers/input/misc/Makefile
@@ -31,9 +31,11 @@ obj-$(CONFIG_INPUT_DRV2667_HAPTICS) += drv2667.o
obj-$(CONFIG_INPUT_GP2A) += gp2ap002a00f.o
obj-$(CONFIG_INPUT_GPIO_BEEPER) += gpio-beeper.o
obj-$(CONFIG_INPUT_GPIO_TILT_POLLED) += gpio_tilt_polled.o
+obj-$(CONFIG_INPUT_GPIO) += gpio_event.o gpio_matrix.o gpio_input.o gpio_output.o gpio_axis.o
obj-$(CONFIG_HP_SDC_RTC) += hp_sdc_rtc.o
obj-$(CONFIG_INPUT_IMS_PCU) += ims-pcu.o
obj-$(CONFIG_INPUT_IXP4XX_BEEPER) += ixp4xx-beeper.o
+obj-$(CONFIG_INPUT_KEYCHORD) += keychord.o
obj-$(CONFIG_INPUT_KEYSPAN_REMOTE) += keyspan_remote.o
obj-$(CONFIG_INPUT_KXTJ9) += kxtj9.o
obj-$(CONFIG_INPUT_M68K_BEEP) += m68kspkr.o
diff --git a/drivers/input/misc/gpio_axis.c b/drivers/input/misc/gpio_axis.c
new file mode 100644
index 000000000000..0acf4a576f53
--- /dev/null
+++ b/drivers/input/misc/gpio_axis.c
@@ -0,0 +1,192 @@
+/* drivers/input/misc/gpio_axis.c
+ *
+ * Copyright (C) 2007 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/gpio.h>
+#include <linux/gpio_event.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+
+struct gpio_axis_state {
+ struct gpio_event_input_devs *input_devs;
+ struct gpio_event_axis_info *info;
+ uint32_t pos;
+};
+
+uint16_t gpio_axis_4bit_gray_map_table[] = {
+ [0x0] = 0x0, [0x1] = 0x1, /* 0000 0001 */
+ [0x3] = 0x2, [0x2] = 0x3, /* 0011 0010 */
+ [0x6] = 0x4, [0x7] = 0x5, /* 0110 0111 */
+ [0x5] = 0x6, [0x4] = 0x7, /* 0101 0100 */
+ [0xc] = 0x8, [0xd] = 0x9, /* 1100 1101 */
+ [0xf] = 0xa, [0xe] = 0xb, /* 1111 1110 */
+ [0xa] = 0xc, [0xb] = 0xd, /* 1010 1011 */
+ [0x9] = 0xe, [0x8] = 0xf, /* 1001 1000 */
+};
+uint16_t gpio_axis_4bit_gray_map(struct gpio_event_axis_info *info, uint16_t in)
+{
+ return gpio_axis_4bit_gray_map_table[in];
+}
+
+uint16_t gpio_axis_5bit_singletrack_map_table[] = {
+ [0x10] = 0x00, [0x14] = 0x01, [0x1c] = 0x02, /* 10000 10100 11100 */
+ [0x1e] = 0x03, [0x1a] = 0x04, [0x18] = 0x05, /* 11110 11010 11000 */
+ [0x08] = 0x06, [0x0a] = 0x07, [0x0e] = 0x08, /* 01000 01010 01110 */
+ [0x0f] = 0x09, [0x0d] = 0x0a, [0x0c] = 0x0b, /* 01111 01101 01100 */
+ [0x04] = 0x0c, [0x05] = 0x0d, [0x07] = 0x0e, /* 00100 00101 00111 */
+ [0x17] = 0x0f, [0x16] = 0x10, [0x06] = 0x11, /* 10111 10110 00110 */
+ [0x02] = 0x12, [0x12] = 0x13, [0x13] = 0x14, /* 00010 10010 10011 */
+ [0x1b] = 0x15, [0x0b] = 0x16, [0x03] = 0x17, /* 11011 01011 00011 */
+ [0x01] = 0x18, [0x09] = 0x19, [0x19] = 0x1a, /* 00001 01001 11001 */
+ [0x1d] = 0x1b, [0x15] = 0x1c, [0x11] = 0x1d, /* 11101 10101 10001 */
+};
+uint16_t gpio_axis_5bit_singletrack_map(
+ struct gpio_event_axis_info *info, uint16_t in)
+{
+ return gpio_axis_5bit_singletrack_map_table[in];
+}
+
+static void gpio_event_update_axis(struct gpio_axis_state *as, int report)
+{
+ struct gpio_event_axis_info *ai = as->info;
+ int i;
+ int change;
+ uint16_t state = 0;
+ uint16_t pos;
+ uint16_t old_pos = as->pos;
+ for (i = ai->count - 1; i >= 0; i--)
+ state = (state << 1) | gpio_get_value(ai->gpio[i]);
+ pos = ai->map(ai, state);
+ if (ai->flags & GPIOEAF_PRINT_RAW)
+ pr_info("axis %d-%d raw %x, pos %d -> %d\n",
+ ai->type, ai->code, state, old_pos, pos);
+ if (report && pos != old_pos) {
+ if (ai->type == EV_REL) {
+ change = (ai->decoded_size + pos - old_pos) %
+ ai->decoded_size;
+ if (change > ai->decoded_size / 2)
+ change -= ai->decoded_size;
+ if (change == ai->decoded_size / 2) {
+ if (ai->flags & GPIOEAF_PRINT_EVENT)
+ pr_info("axis %d-%d unknown direction, "
+ "pos %d -> %d\n", ai->type,
+ ai->code, old_pos, pos);
+ change = 0; /* no closest direction */
+ }
+ if (ai->flags & GPIOEAF_PRINT_EVENT)
+ pr_info("axis %d-%d change %d\n",
+ ai->type, ai->code, change);
+ input_report_rel(as->input_devs->dev[ai->dev],
+ ai->code, change);
+ } else {
+ if (ai->flags & GPIOEAF_PRINT_EVENT)
+ pr_info("axis %d-%d now %d\n",
+ ai->type, ai->code, pos);
+ input_event(as->input_devs->dev[ai->dev],
+ ai->type, ai->code, pos);
+ }
+ input_sync(as->input_devs->dev[ai->dev]);
+ }
+ as->pos = pos;
+}
+
+static irqreturn_t gpio_axis_irq_handler(int irq, void *dev_id)
+{
+ struct gpio_axis_state *as = dev_id;
+ gpio_event_update_axis(as, 1);
+ return IRQ_HANDLED;
+}
+
+int gpio_event_axis_func(struct gpio_event_input_devs *input_devs,
+ struct gpio_event_info *info, void **data, int func)
+{
+ int ret;
+ int i;
+ int irq;
+ struct gpio_event_axis_info *ai;
+ struct gpio_axis_state *as;
+
+ ai = container_of(info, struct gpio_event_axis_info, info);
+ if (func == GPIO_EVENT_FUNC_SUSPEND) {
+ for (i = 0; i < ai->count; i++)
+ disable_irq(gpio_to_irq(ai->gpio[i]));
+ return 0;
+ }
+ if (func == GPIO_EVENT_FUNC_RESUME) {
+ for (i = 0; i < ai->count; i++)
+ enable_irq(gpio_to_irq(ai->gpio[i]));
+ return 0;
+ }
+
+ if (func == GPIO_EVENT_FUNC_INIT) {
+ *data = as = kmalloc(sizeof(*as), GFP_KERNEL);
+ if (as == NULL) {
+ ret = -ENOMEM;
+ goto err_alloc_axis_state_failed;
+ }
+ as->input_devs = input_devs;
+ as->info = ai;
+ if (ai->dev >= input_devs->count) {
+ pr_err("gpio_event_axis: bad device index %d >= %d "
+ "for %d:%d\n", ai->dev, input_devs->count,
+ ai->type, ai->code);
+ ret = -EINVAL;
+ goto err_bad_device_index;
+ }
+
+ input_set_capability(input_devs->dev[ai->dev],
+ ai->type, ai->code);
+ if (ai->type == EV_ABS) {
+ input_set_abs_params(input_devs->dev[ai->dev], ai->code,
+ 0, ai->decoded_size - 1, 0, 0);
+ }
+ for (i = 0; i < ai->count; i++) {
+ ret = gpio_request(ai->gpio[i], "gpio_event_axis");
+ if (ret < 0)
+ goto err_request_gpio_failed;
+ ret = gpio_direction_input(ai->gpio[i]);
+ if (ret < 0)
+ goto err_gpio_direction_input_failed;
+ ret = irq = gpio_to_irq(ai->gpio[i]);
+ if (ret < 0)
+ goto err_get_irq_num_failed;
+ ret = request_irq(irq, gpio_axis_irq_handler,
+ IRQF_TRIGGER_RISING |
+ IRQF_TRIGGER_FALLING,
+ "gpio_event_axis", as);
+ if (ret < 0)
+ goto err_request_irq_failed;
+ }
+ gpio_event_update_axis(as, 0);
+ return 0;
+ }
+
+ ret = 0;
+ as = *data;
+ for (i = ai->count - 1; i >= 0; i--) {
+ free_irq(gpio_to_irq(ai->gpio[i]), as);
+err_request_irq_failed:
+err_get_irq_num_failed:
+err_gpio_direction_input_failed:
+ gpio_free(ai->gpio[i]);
+err_request_gpio_failed:
+ ;
+ }
+err_bad_device_index:
+ kfree(as);
+ *data = NULL;
+err_alloc_axis_state_failed:
+ return ret;
+}
diff --git a/drivers/input/misc/gpio_event.c b/drivers/input/misc/gpio_event.c
new file mode 100644
index 000000000000..90f07eba3ce9
--- /dev/null
+++ b/drivers/input/misc/gpio_event.c
@@ -0,0 +1,228 @@
+/* drivers/input/misc/gpio_event.c
+ *
+ * Copyright (C) 2007 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/input.h>
+#include <linux/gpio_event.h>
+#include <linux/hrtimer.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+
+struct gpio_event {
+ struct gpio_event_input_devs *input_devs;
+ const struct gpio_event_platform_data *info;
+ void *state[0];
+};
+
+static int gpio_input_event(
+ struct input_dev *dev, unsigned int type, unsigned int code, int value)
+{
+ int i;
+ int devnr;
+ int ret = 0;
+ int tmp_ret;
+ struct gpio_event_info **ii;
+ struct gpio_event *ip = input_get_drvdata(dev);
+
+ for (devnr = 0; devnr < ip->input_devs->count; devnr++)
+ if (ip->input_devs->dev[devnr] == dev)
+ break;
+ if (devnr == ip->input_devs->count) {
+ pr_err("gpio_input_event: unknown device %p\n", dev);
+ return -EIO;
+ }
+
+ for (i = 0, ii = ip->info->info; i < ip->info->info_count; i++, ii++) {
+ if ((*ii)->event) {
+ tmp_ret = (*ii)->event(ip->input_devs, *ii,
+ &ip->state[i],
+ devnr, type, code, value);
+ if (tmp_ret)
+ ret = tmp_ret;
+ }
+ }
+ return ret;
+}
+
+static int gpio_event_call_all_func(struct gpio_event *ip, int func)
+{
+ int i;
+ int ret;
+ struct gpio_event_info **ii;
+
+ if (func == GPIO_EVENT_FUNC_INIT || func == GPIO_EVENT_FUNC_RESUME) {
+ ii = ip->info->info;
+ for (i = 0; i < ip->info->info_count; i++, ii++) {
+ if ((*ii)->func == NULL) {
+ ret = -ENODEV;
+ pr_err("gpio_event_probe: Incomplete pdata, "
+ "no function\n");
+ goto err_no_func;
+ }
+ if (func == GPIO_EVENT_FUNC_RESUME && (*ii)->no_suspend)
+ continue;
+ ret = (*ii)->func(ip->input_devs, *ii, &ip->state[i],
+ func);
+ if (ret) {
+ pr_err("gpio_event_probe: function failed\n");
+ goto err_func_failed;
+ }
+ }
+ return 0;
+ }
+
+ ret = 0;
+ i = ip->info->info_count;
+ ii = ip->info->info + i;
+ while (i > 0) {
+ i--;
+ ii--;
+ if ((func & ~1) == GPIO_EVENT_FUNC_SUSPEND && (*ii)->no_suspend)
+ continue;
+ (*ii)->func(ip->input_devs, *ii, &ip->state[i], func & ~1);
+err_func_failed:
+err_no_func:
+ ;
+ }
+ return ret;
+}
+
+static void __maybe_unused gpio_event_suspend(struct gpio_event *ip)
+{
+ gpio_event_call_all_func(ip, GPIO_EVENT_FUNC_SUSPEND);
+ if (ip->info->power)
+ ip->info->power(ip->info, 0);
+}
+
+static void __maybe_unused gpio_event_resume(struct gpio_event *ip)
+{
+ if (ip->info->power)
+ ip->info->power(ip->info, 1);
+ gpio_event_call_all_func(ip, GPIO_EVENT_FUNC_RESUME);
+}
+
+static int gpio_event_probe(struct platform_device *pdev)
+{
+ int err;
+ struct gpio_event *ip;
+ struct gpio_event_platform_data *event_info;
+ int dev_count = 1;
+ int i;
+ int registered = 0;
+
+ event_info = pdev->dev.platform_data;
+ if (event_info == NULL) {
+ pr_err("gpio_event_probe: No pdata\n");
+ return -ENODEV;
+ }
+ if ((!event_info->name && !event_info->names[0]) ||
+ !event_info->info || !event_info->info_count) {
+ pr_err("gpio_event_probe: Incomplete pdata\n");
+ return -ENODEV;
+ }
+ if (!event_info->name)
+ while (event_info->names[dev_count])
+ dev_count++;
+ ip = kzalloc(sizeof(*ip) +
+ sizeof(ip->state[0]) * event_info->info_count +
+ sizeof(*ip->input_devs) +
+ sizeof(ip->input_devs->dev[0]) * dev_count, GFP_KERNEL);
+ if (ip == NULL) {
+ err = -ENOMEM;
+ pr_err("gpio_event_probe: Failed to allocate private data\n");
+ goto err_kp_alloc_failed;
+ }
+ ip->input_devs = (void*)&ip->state[event_info->info_count];
+ platform_set_drvdata(pdev, ip);
+
+ for (i = 0; i < dev_count; i++) {
+ struct input_dev *input_dev = input_allocate_device();
+ if (input_dev == NULL) {
+ err = -ENOMEM;
+ pr_err("gpio_event_probe: "
+ "Failed to allocate input device\n");
+ goto err_input_dev_alloc_failed;
+ }
+ input_set_drvdata(input_dev, ip);
+ input_dev->name = event_info->name ?
+ event_info->name : event_info->names[i];
+ input_dev->event = gpio_input_event;
+ ip->input_devs->dev[i] = input_dev;
+ }
+ ip->input_devs->count = dev_count;
+ ip->info = event_info;
+ if (event_info->power)
+ ip->info->power(ip->info, 1);
+
+ err = gpio_event_call_all_func(ip, GPIO_EVENT_FUNC_INIT);
+ if (err)
+ goto err_call_all_func_failed;
+
+ for (i = 0; i < dev_count; i++) {
+ err = input_register_device(ip->input_devs->dev[i]);
+ if (err) {
+ pr_err("gpio_event_probe: Unable to register %s "
+ "input device\n", ip->input_devs->dev[i]->name);
+ goto err_input_register_device_failed;
+ }
+ registered++;
+ }
+
+ return 0;
+
+err_input_register_device_failed:
+ gpio_event_call_all_func(ip, GPIO_EVENT_FUNC_UNINIT);
+err_call_all_func_failed:
+ if (event_info->power)
+ ip->info->power(ip->info, 0);
+ for (i = 0; i < registered; i++)
+ input_unregister_device(ip->input_devs->dev[i]);
+ for (i = dev_count - 1; i >= registered; i--) {
+ input_free_device(ip->input_devs->dev[i]);
+err_input_dev_alloc_failed:
+ ;
+ }
+ kfree(ip);
+err_kp_alloc_failed:
+ return err;
+}
+
+static int gpio_event_remove(struct platform_device *pdev)
+{
+ struct gpio_event *ip = platform_get_drvdata(pdev);
+ int i;
+
+ gpio_event_call_all_func(ip, GPIO_EVENT_FUNC_UNINIT);
+ if (ip->info->power)
+ ip->info->power(ip->info, 0);
+ for (i = 0; i < ip->input_devs->count; i++)
+ input_unregister_device(ip->input_devs->dev[i]);
+ kfree(ip);
+ return 0;
+}
+
+static struct platform_driver gpio_event_driver = {
+ .probe = gpio_event_probe,
+ .remove = gpio_event_remove,
+ .driver = {
+ .name = GPIO_EVENT_DEV_NAME,
+ },
+};
+
+module_platform_driver(gpio_event_driver);
+
+MODULE_DESCRIPTION("GPIO Event Driver");
+MODULE_LICENSE("GPL");
+
diff --git a/drivers/input/misc/gpio_input.c b/drivers/input/misc/gpio_input.c
new file mode 100644
index 000000000000..eefd02725aff
--- /dev/null
+++ b/drivers/input/misc/gpio_input.c
@@ -0,0 +1,390 @@
+/* drivers/input/misc/gpio_input.c
+ *
+ * Copyright (C) 2007 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/gpio.h>
+#include <linux/gpio_event.h>
+#include <linux/hrtimer.h>
+#include <linux/input.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/pm_wakeup.h>
+
+enum {
+ DEBOUNCE_UNSTABLE = BIT(0), /* Got irq, while debouncing */
+ DEBOUNCE_PRESSED = BIT(1),
+ DEBOUNCE_NOTPRESSED = BIT(2),
+ DEBOUNCE_WAIT_IRQ = BIT(3), /* Stable irq state */
+ DEBOUNCE_POLL = BIT(4), /* Stable polling state */
+
+ DEBOUNCE_UNKNOWN =
+ DEBOUNCE_PRESSED | DEBOUNCE_NOTPRESSED,
+};
+
+struct gpio_key_state {
+ struct gpio_input_state *ds;
+ uint8_t debounce;
+};
+
+struct gpio_input_state {
+ struct gpio_event_input_devs *input_devs;
+ const struct gpio_event_input_info *info;
+ struct hrtimer timer;
+ int use_irq;
+ int debounce_count;
+ spinlock_t irq_lock;
+ struct wakeup_source *ws;
+ struct gpio_key_state key_state[0];
+};
+
+static enum hrtimer_restart gpio_event_input_timer_func(struct hrtimer *timer)
+{
+ int i;
+ int pressed;
+ struct gpio_input_state *ds =
+ container_of(timer, struct gpio_input_state, timer);
+ unsigned gpio_flags = ds->info->flags;
+ unsigned npolarity;
+ int nkeys = ds->info->keymap_size;
+ const struct gpio_event_direct_entry *key_entry;
+ struct gpio_key_state *key_state;
+ unsigned long irqflags;
+ uint8_t debounce;
+ bool sync_needed;
+
+#if 0
+ key_entry = kp->keys_info->keymap;
+ key_state = kp->key_state;
+ for (i = 0; i < nkeys; i++, key_entry++, key_state++)
+ pr_info("gpio_read_detect_status %d %d\n", key_entry->gpio,
+ gpio_read_detect_status(key_entry->gpio));
+#endif
+ key_entry = ds->info->keymap;
+ key_state = ds->key_state;
+ sync_needed = false;
+ spin_lock_irqsave(&ds->irq_lock, irqflags);
+ for (i = 0; i < nkeys; i++, key_entry++, key_state++) {
+ debounce = key_state->debounce;
+ if (debounce & DEBOUNCE_WAIT_IRQ)
+ continue;
+ if (key_state->debounce & DEBOUNCE_UNSTABLE) {
+ debounce = key_state->debounce = DEBOUNCE_UNKNOWN;
+ enable_irq(gpio_to_irq(key_entry->gpio));
+ if (gpio_flags & GPIOEDF_PRINT_KEY_UNSTABLE)
+ pr_info("gpio_keys_scan_keys: key %x-%x, %d "
+ "(%d) continue debounce\n",
+ ds->info->type, key_entry->code,
+ i, key_entry->gpio);
+ }
+ npolarity = !(gpio_flags & GPIOEDF_ACTIVE_HIGH);
+ pressed = gpio_get_value(key_entry->gpio) ^ npolarity;
+ if (debounce & DEBOUNCE_POLL) {
+ if (pressed == !(debounce & DEBOUNCE_PRESSED)) {
+ ds->debounce_count++;
+ key_state->debounce = DEBOUNCE_UNKNOWN;
+ if (gpio_flags & GPIOEDF_PRINT_KEY_DEBOUNCE)
+ pr_info("gpio_keys_scan_keys: key %x-"
+ "%x, %d (%d) start debounce\n",
+ ds->info->type, key_entry->code,
+ i, key_entry->gpio);
+ }
+ continue;
+ }
+ if (pressed && (debounce & DEBOUNCE_NOTPRESSED)) {
+ if (gpio_flags & GPIOEDF_PRINT_KEY_DEBOUNCE)
+ pr_info("gpio_keys_scan_keys: key %x-%x, %d "
+ "(%d) debounce pressed 1\n",
+ ds->info->type, key_entry->code,
+ i, key_entry->gpio);
+ key_state->debounce = DEBOUNCE_PRESSED;
+ continue;
+ }
+ if (!pressed && (debounce & DEBOUNCE_PRESSED)) {
+ if (gpio_flags & GPIOEDF_PRINT_KEY_DEBOUNCE)
+ pr_info("gpio_keys_scan_keys: key %x-%x, %d "
+ "(%d) debounce pressed 0\n",
+ ds->info->type, key_entry->code,
+ i, key_entry->gpio);
+ key_state->debounce = DEBOUNCE_NOTPRESSED;
+ continue;
+ }
+ /* key is stable */
+ ds->debounce_count--;
+ if (ds->use_irq)
+ key_state->debounce |= DEBOUNCE_WAIT_IRQ;
+ else
+ key_state->debounce |= DEBOUNCE_POLL;
+ if (gpio_flags & GPIOEDF_PRINT_KEYS)
+ pr_info("gpio_keys_scan_keys: key %x-%x, %d (%d) "
+ "changed to %d\n", ds->info->type,
+ key_entry->code, i, key_entry->gpio, pressed);
+ input_event(ds->input_devs->dev[key_entry->dev], ds->info->type,
+ key_entry->code, pressed);
+ sync_needed = true;
+ }
+ if (sync_needed) {
+ for (i = 0; i < ds->input_devs->count; i++)
+ input_sync(ds->input_devs->dev[i]);
+ }
+
+#if 0
+ key_entry = kp->keys_info->keymap;
+ key_state = kp->key_state;
+ for (i = 0; i < nkeys; i++, key_entry++, key_state++) {
+ pr_info("gpio_read_detect_status %d %d\n", key_entry->gpio,
+ gpio_read_detect_status(key_entry->gpio));
+ }
+#endif
+
+ if (ds->debounce_count)
+ hrtimer_start(timer, ds->info->debounce_time, HRTIMER_MODE_REL);
+ else if (!ds->use_irq)
+ hrtimer_start(timer, ds->info->poll_time, HRTIMER_MODE_REL);
+ else
+ __pm_relax(ds->ws);
+
+ spin_unlock_irqrestore(&ds->irq_lock, irqflags);
+
+ return HRTIMER_NORESTART;
+}
+
+static irqreturn_t gpio_event_input_irq_handler(int irq, void *dev_id)
+{
+ struct gpio_key_state *ks = dev_id;
+ struct gpio_input_state *ds = ks->ds;
+ int keymap_index = ks - ds->key_state;
+ const struct gpio_event_direct_entry *key_entry;
+ unsigned long irqflags;
+ int pressed;
+
+ if (!ds->use_irq)
+ return IRQ_HANDLED;
+
+ key_entry = &ds->info->keymap[keymap_index];
+
+ if (ds->info->debounce_time.tv64) {
+ spin_lock_irqsave(&ds->irq_lock, irqflags);
+ if (ks->debounce & DEBOUNCE_WAIT_IRQ) {
+ ks->debounce = DEBOUNCE_UNKNOWN;
+ if (ds->debounce_count++ == 0) {
+ __pm_stay_awake(ds->ws);
+ hrtimer_start(
+ &ds->timer, ds->info->debounce_time,
+ HRTIMER_MODE_REL);
+ }
+ if (ds->info->flags & GPIOEDF_PRINT_KEY_DEBOUNCE)
+ pr_info("gpio_event_input_irq_handler: "
+ "key %x-%x, %d (%d) start debounce\n",
+ ds->info->type, key_entry->code,
+ keymap_index, key_entry->gpio);
+ } else {
+ disable_irq_nosync(irq);
+ ks->debounce = DEBOUNCE_UNSTABLE;
+ }
+ spin_unlock_irqrestore(&ds->irq_lock, irqflags);
+ } else {
+ pressed = gpio_get_value(key_entry->gpio) ^
+ !(ds->info->flags & GPIOEDF_ACTIVE_HIGH);
+ if (ds->info->flags & GPIOEDF_PRINT_KEYS)
+ pr_info("gpio_event_input_irq_handler: key %x-%x, %d "
+ "(%d) changed to %d\n",
+ ds->info->type, key_entry->code, keymap_index,
+ key_entry->gpio, pressed);
+ input_event(ds->input_devs->dev[key_entry->dev], ds->info->type,
+ key_entry->code, pressed);
+ input_sync(ds->input_devs->dev[key_entry->dev]);
+ }
+ return IRQ_HANDLED;
+}
+
+static int gpio_event_input_request_irqs(struct gpio_input_state *ds)
+{
+ int i;
+ int err;
+ unsigned int irq;
+ unsigned long req_flags = IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING;
+
+ for (i = 0; i < ds->info->keymap_size; i++) {
+ err = irq = gpio_to_irq(ds->info->keymap[i].gpio);
+ if (err < 0)
+ goto err_gpio_get_irq_num_failed;
+ err = request_irq(irq, gpio_event_input_irq_handler,
+ req_flags, "gpio_keys", &ds->key_state[i]);
+ if (err) {
+ pr_err("gpio_event_input_request_irqs: request_irq "
+ "failed for input %d, irq %d\n",
+ ds->info->keymap[i].gpio, irq);
+ goto err_request_irq_failed;
+ }
+ if (ds->info->info.no_suspend) {
+ err = enable_irq_wake(irq);
+ if (err) {
+ pr_err("gpio_event_input_request_irqs: "
+ "enable_irq_wake failed for input %d, "
+ "irq %d\n",
+ ds->info->keymap[i].gpio, irq);
+ goto err_enable_irq_wake_failed;
+ }
+ }
+ }
+ return 0;
+
+ for (i = ds->info->keymap_size - 1; i >= 0; i--) {
+ irq = gpio_to_irq(ds->info->keymap[i].gpio);
+ if (ds->info->info.no_suspend)
+ disable_irq_wake(irq);
+err_enable_irq_wake_failed:
+ free_irq(irq, &ds->key_state[i]);
+err_request_irq_failed:
+err_gpio_get_irq_num_failed:
+ ;
+ }
+ return err;
+}
+
+int gpio_event_input_func(struct gpio_event_input_devs *input_devs,
+ struct gpio_event_info *info, void **data, int func)
+{
+ int ret;
+ int i;
+ unsigned long irqflags;
+ struct gpio_event_input_info *di;
+ struct gpio_input_state *ds = *data;
+ char *wlname;
+
+ di = container_of(info, struct gpio_event_input_info, info);
+
+ if (func == GPIO_EVENT_FUNC_SUSPEND) {
+ if (ds->use_irq)
+ for (i = 0; i < di->keymap_size; i++)
+ disable_irq(gpio_to_irq(di->keymap[i].gpio));
+ hrtimer_cancel(&ds->timer);
+ return 0;
+ }
+ if (func == GPIO_EVENT_FUNC_RESUME) {
+ spin_lock_irqsave(&ds->irq_lock, irqflags);
+ if (ds->use_irq)
+ for (i = 0; i < di->keymap_size; i++)
+ enable_irq(gpio_to_irq(di->keymap[i].gpio));
+ hrtimer_start(&ds->timer, ktime_set(0, 0), HRTIMER_MODE_REL);
+ spin_unlock_irqrestore(&ds->irq_lock, irqflags);
+ return 0;
+ }
+
+ if (func == GPIO_EVENT_FUNC_INIT) {
+ if (ktime_to_ns(di->poll_time) <= 0)
+ di->poll_time = ktime_set(0, 20 * NSEC_PER_MSEC);
+
+ *data = ds = kzalloc(sizeof(*ds) + sizeof(ds->key_state[0]) *
+ di->keymap_size, GFP_KERNEL);
+ if (ds == NULL) {
+ ret = -ENOMEM;
+ pr_err("gpio_event_input_func: "
+ "Failed to allocate private data\n");
+ goto err_ds_alloc_failed;
+ }
+ ds->debounce_count = di->keymap_size;
+ ds->input_devs = input_devs;
+ ds->info = di;
+ wlname = kasprintf(GFP_KERNEL, "gpio_input:%s%s",
+ input_devs->dev[0]->name,
+ (input_devs->count > 1) ? "..." : "");
+
+ ds->ws = wakeup_source_register(wlname);
+ kfree(wlname);
+ if (!ds->ws) {
+ ret = -ENOMEM;
+ pr_err("gpio_event_input_func: "
+ "Failed to allocate wakeup source\n");
+ goto err_ws_failed;
+ }
+
+ spin_lock_init(&ds->irq_lock);
+
+ for (i = 0; i < di->keymap_size; i++) {
+ int dev = di->keymap[i].dev;
+ if (dev >= input_devs->count) {
+ pr_err("gpio_event_input_func: bad device "
+ "index %d >= %d for key code %d\n",
+ dev, input_devs->count,
+ di->keymap[i].code);
+ ret = -EINVAL;
+ goto err_bad_keymap;
+ }
+ input_set_capability(input_devs->dev[dev], di->type,
+ di->keymap[i].code);
+ ds->key_state[i].ds = ds;
+ ds->key_state[i].debounce = DEBOUNCE_UNKNOWN;
+ }
+
+ for (i = 0; i < di->keymap_size; i++) {
+ ret = gpio_request(di->keymap[i].gpio, "gpio_kp_in");
+ if (ret) {
+ pr_err("gpio_event_input_func: gpio_request "
+ "failed for %d\n", di->keymap[i].gpio);
+ goto err_gpio_request_failed;
+ }
+ ret = gpio_direction_input(di->keymap[i].gpio);
+ if (ret) {
+ pr_err("gpio_event_input_func: "
+ "gpio_direction_input failed for %d\n",
+ di->keymap[i].gpio);
+ goto err_gpio_configure_failed;
+ }
+ }
+
+ ret = gpio_event_input_request_irqs(ds);
+
+ spin_lock_irqsave(&ds->irq_lock, irqflags);
+ ds->use_irq = ret == 0;
+
+ pr_info("GPIO Input Driver: Start gpio inputs for %s%s in %s "
+ "mode\n", input_devs->dev[0]->name,
+ (input_devs->count > 1) ? "..." : "",
+ ret == 0 ? "interrupt" : "polling");
+
+ hrtimer_init(&ds->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ ds->timer.function = gpio_event_input_timer_func;
+ hrtimer_start(&ds->timer, ktime_set(0, 0), HRTIMER_MODE_REL);
+ spin_unlock_irqrestore(&ds->irq_lock, irqflags);
+ return 0;
+ }
+
+ ret = 0;
+ spin_lock_irqsave(&ds->irq_lock, irqflags);
+ hrtimer_cancel(&ds->timer);
+ if (ds->use_irq) {
+ for (i = di->keymap_size - 1; i >= 0; i--) {
+ int irq = gpio_to_irq(di->keymap[i].gpio);
+ if (ds->info->info.no_suspend)
+ disable_irq_wake(irq);
+ free_irq(irq, &ds->key_state[i]);
+ }
+ }
+ spin_unlock_irqrestore(&ds->irq_lock, irqflags);
+
+ for (i = di->keymap_size - 1; i >= 0; i--) {
+err_gpio_configure_failed:
+ gpio_free(di->keymap[i].gpio);
+err_gpio_request_failed:
+ ;
+ }
+err_bad_keymap:
+ wakeup_source_unregister(ds->ws);
+err_ws_failed:
+ kfree(ds);
+err_ds_alloc_failed:
+ return ret;
+}
diff --git a/drivers/input/misc/gpio_matrix.c b/drivers/input/misc/gpio_matrix.c
new file mode 100644
index 000000000000..eaa9e89d473a
--- /dev/null
+++ b/drivers/input/misc/gpio_matrix.c
@@ -0,0 +1,441 @@
+/* drivers/input/misc/gpio_matrix.c
+ *
+ * Copyright (C) 2007 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/gpio.h>
+#include <linux/gpio_event.h>
+#include <linux/hrtimer.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/wakelock.h>
+
+struct gpio_kp {
+ struct gpio_event_input_devs *input_devs;
+ struct gpio_event_matrix_info *keypad_info;
+ struct hrtimer timer;
+ struct wake_lock wake_lock;
+ int current_output;
+ unsigned int use_irq:1;
+ unsigned int key_state_changed:1;
+ unsigned int last_key_state_changed:1;
+ unsigned int some_keys_pressed:2;
+ unsigned int disabled_irq:1;
+ unsigned long keys_pressed[0];
+};
+
+static void clear_phantom_key(struct gpio_kp *kp, int out, int in)
+{
+ struct gpio_event_matrix_info *mi = kp->keypad_info;
+ int key_index = out * mi->ninputs + in;
+ unsigned short keyentry = mi->keymap[key_index];
+ unsigned short keycode = keyentry & MATRIX_KEY_MASK;
+ unsigned short dev = keyentry >> MATRIX_CODE_BITS;
+
+ if (!test_bit(keycode, kp->input_devs->dev[dev]->key)) {
+ if (mi->flags & GPIOKPF_PRINT_PHANTOM_KEYS)
+ pr_info("gpiomatrix: phantom key %x, %d-%d (%d-%d) "
+ "cleared\n", keycode, out, in,
+ mi->output_gpios[out], mi->input_gpios[in]);
+ __clear_bit(key_index, kp->keys_pressed);
+ } else {
+ if (mi->flags & GPIOKPF_PRINT_PHANTOM_KEYS)
+ pr_info("gpiomatrix: phantom key %x, %d-%d (%d-%d) "
+ "not cleared\n", keycode, out, in,
+ mi->output_gpios[out], mi->input_gpios[in]);
+ }
+}
+
+static int restore_keys_for_input(struct gpio_kp *kp, int out, int in)
+{
+ int rv = 0;
+ int key_index;
+
+ key_index = out * kp->keypad_info->ninputs + in;
+ while (out < kp->keypad_info->noutputs) {
+ if (test_bit(key_index, kp->keys_pressed)) {
+ rv = 1;
+ clear_phantom_key(kp, out, in);
+ }
+ key_index += kp->keypad_info->ninputs;
+ out++;
+ }
+ return rv;
+}
+
+static void remove_phantom_keys(struct gpio_kp *kp)
+{
+ int out, in, inp;
+ int key_index;
+
+ if (kp->some_keys_pressed < 3)
+ return;
+
+ for (out = 0; out < kp->keypad_info->noutputs; out++) {
+ inp = -1;
+ key_index = out * kp->keypad_info->ninputs;
+ for (in = 0; in < kp->keypad_info->ninputs; in++, key_index++) {
+ if (test_bit(key_index, kp->keys_pressed)) {
+ if (inp == -1) {
+ inp = in;
+ continue;
+ }
+ if (inp >= 0) {
+ if (!restore_keys_for_input(kp, out + 1,
+ inp))
+ break;
+ clear_phantom_key(kp, out, inp);
+ inp = -2;
+ }
+ restore_keys_for_input(kp, out, in);
+ }
+ }
+ }
+}
+
+static void report_key(struct gpio_kp *kp, int key_index, int out, int in)
+{
+ struct gpio_event_matrix_info *mi = kp->keypad_info;
+ int pressed = test_bit(key_index, kp->keys_pressed);
+ unsigned short keyentry = mi->keymap[key_index];
+ unsigned short keycode = keyentry & MATRIX_KEY_MASK;
+ unsigned short dev = keyentry >> MATRIX_CODE_BITS;
+
+ if (pressed != test_bit(keycode, kp->input_devs->dev[dev]->key)) {
+ if (keycode == KEY_RESERVED) {
+ if (mi->flags & GPIOKPF_PRINT_UNMAPPED_KEYS)
+ pr_info("gpiomatrix: unmapped key, %d-%d "
+ "(%d-%d) changed to %d\n",
+ out, in, mi->output_gpios[out],
+ mi->input_gpios[in], pressed);
+ } else {
+ if (mi->flags & GPIOKPF_PRINT_MAPPED_KEYS)
+ pr_info("gpiomatrix: key %x, %d-%d (%d-%d) "
+ "changed to %d\n", keycode,
+ out, in, mi->output_gpios[out],
+ mi->input_gpios[in], pressed);
+ input_report_key(kp->input_devs->dev[dev], keycode, pressed);
+ }
+ }
+}
+
+static void report_sync(struct gpio_kp *kp)
+{
+ int i;
+
+ for (i = 0; i < kp->input_devs->count; i++)
+ input_sync(kp->input_devs->dev[i]);
+}
+
+static enum hrtimer_restart gpio_keypad_timer_func(struct hrtimer *timer)
+{
+ int out, in;
+ int key_index;
+ int gpio;
+ struct gpio_kp *kp = container_of(timer, struct gpio_kp, timer);
+ struct gpio_event_matrix_info *mi = kp->keypad_info;
+ unsigned gpio_keypad_flags = mi->flags;
+ unsigned polarity = !!(gpio_keypad_flags & GPIOKPF_ACTIVE_HIGH);
+
+ out = kp->current_output;
+ if (out == mi->noutputs) {
+ out = 0;
+ kp->last_key_state_changed = kp->key_state_changed;
+ kp->key_state_changed = 0;
+ kp->some_keys_pressed = 0;
+ } else {
+ key_index = out * mi->ninputs;
+ for (in = 0; in < mi->ninputs; in++, key_index++) {
+ gpio = mi->input_gpios[in];
+ if (gpio_get_value(gpio) ^ !polarity) {
+ if (kp->some_keys_pressed < 3)
+ kp->some_keys_pressed++;
+ kp->key_state_changed |= !__test_and_set_bit(
+ key_index, kp->keys_pressed);
+ } else
+ kp->key_state_changed |= __test_and_clear_bit(
+ key_index, kp->keys_pressed);
+ }
+ gpio = mi->output_gpios[out];
+ if (gpio_keypad_flags & GPIOKPF_DRIVE_INACTIVE)
+ gpio_set_value(gpio, !polarity);
+ else
+ gpio_direction_input(gpio);
+ out++;
+ }
+ kp->current_output = out;
+ if (out < mi->noutputs) {
+ gpio = mi->output_gpios[out];
+ if (gpio_keypad_flags & GPIOKPF_DRIVE_INACTIVE)
+ gpio_set_value(gpio, polarity);
+ else
+ gpio_direction_output(gpio, polarity);
+ hrtimer_start(timer, mi->settle_time, HRTIMER_MODE_REL);
+ return HRTIMER_NORESTART;
+ }
+ if (gpio_keypad_flags & GPIOKPF_DEBOUNCE) {
+ if (kp->key_state_changed) {
+ hrtimer_start(&kp->timer, mi->debounce_delay,
+ HRTIMER_MODE_REL);
+ return HRTIMER_NORESTART;
+ }
+ kp->key_state_changed = kp->last_key_state_changed;
+ }
+ if (kp->key_state_changed) {
+ if (gpio_keypad_flags & GPIOKPF_REMOVE_SOME_PHANTOM_KEYS)
+ remove_phantom_keys(kp);
+ key_index = 0;
+ for (out = 0; out < mi->noutputs; out++)
+ for (in = 0; in < mi->ninputs; in++, key_index++)
+ report_key(kp, key_index, out, in);
+ report_sync(kp);
+ }
+ if (!kp->use_irq || kp->some_keys_pressed) {
+ hrtimer_start(timer, mi->poll_time, HRTIMER_MODE_REL);
+ return HRTIMER_NORESTART;
+ }
+
+ /* No keys are pressed, reenable interrupt */
+ for (out = 0; out < mi->noutputs; out++) {
+ if (gpio_keypad_flags & GPIOKPF_DRIVE_INACTIVE)
+ gpio_set_value(mi->output_gpios[out], polarity);
+ else
+ gpio_direction_output(mi->output_gpios[out], polarity);
+ }
+ for (in = 0; in < mi->ninputs; in++)
+ enable_irq(gpio_to_irq(mi->input_gpios[in]));
+ wake_unlock(&kp->wake_lock);
+ return HRTIMER_NORESTART;
+}
+
+static irqreturn_t gpio_keypad_irq_handler(int irq_in, void *dev_id)
+{
+ int i;
+ struct gpio_kp *kp = dev_id;
+ struct gpio_event_matrix_info *mi = kp->keypad_info;
+ unsigned gpio_keypad_flags = mi->flags;
+
+ if (!kp->use_irq) {
+ /* ignore interrupt while registering the handler */
+ kp->disabled_irq = 1;
+ disable_irq_nosync(irq_in);
+ return IRQ_HANDLED;
+ }
+
+ for (i = 0; i < mi->ninputs; i++)
+ disable_irq_nosync(gpio_to_irq(mi->input_gpios[i]));
+ for (i = 0; i < mi->noutputs; i++) {
+ if (gpio_keypad_flags & GPIOKPF_DRIVE_INACTIVE)
+ gpio_set_value(mi->output_gpios[i],
+ !(gpio_keypad_flags & GPIOKPF_ACTIVE_HIGH));
+ else
+ gpio_direction_input(mi->output_gpios[i]);
+ }
+ wake_lock(&kp->wake_lock);
+ hrtimer_start(&kp->timer, ktime_set(0, 0), HRTIMER_MODE_REL);
+ return IRQ_HANDLED;
+}
+
+static int gpio_keypad_request_irqs(struct gpio_kp *kp)
+{
+ int i;
+ int err;
+ unsigned int irq;
+ unsigned long request_flags;
+ struct gpio_event_matrix_info *mi = kp->keypad_info;
+
+ switch (mi->flags & (GPIOKPF_ACTIVE_HIGH|GPIOKPF_LEVEL_TRIGGERED_IRQ)) {
+ default:
+ request_flags = IRQF_TRIGGER_FALLING;
+ break;
+ case GPIOKPF_ACTIVE_HIGH:
+ request_flags = IRQF_TRIGGER_RISING;
+ break;
+ case GPIOKPF_LEVEL_TRIGGERED_IRQ:
+ request_flags = IRQF_TRIGGER_LOW;
+ break;
+ case GPIOKPF_LEVEL_TRIGGERED_IRQ | GPIOKPF_ACTIVE_HIGH:
+ request_flags = IRQF_TRIGGER_HIGH;
+ break;
+ }
+
+ for (i = 0; i < mi->ninputs; i++) {
+ err = irq = gpio_to_irq(mi->input_gpios[i]);
+ if (err < 0)
+ goto err_gpio_get_irq_num_failed;
+ err = request_irq(irq, gpio_keypad_irq_handler, request_flags,
+ "gpio_kp", kp);
+ if (err) {
+ pr_err("gpiomatrix: request_irq failed for input %d, "
+ "irq %d\n", mi->input_gpios[i], irq);
+ goto err_request_irq_failed;
+ }
+ err = enable_irq_wake(irq);
+ if (err) {
+ pr_err("gpiomatrix: set_irq_wake failed for input %d, "
+ "irq %d\n", mi->input_gpios[i], irq);
+ }
+ disable_irq(irq);
+ if (kp->disabled_irq) {
+ kp->disabled_irq = 0;
+ enable_irq(irq);
+ }
+ }
+ return 0;
+
+ for (i = mi->noutputs - 1; i >= 0; i--) {
+ free_irq(gpio_to_irq(mi->input_gpios[i]), kp);
+err_request_irq_failed:
+err_gpio_get_irq_num_failed:
+ ;
+ }
+ return err;
+}
+
+int gpio_event_matrix_func(struct gpio_event_input_devs *input_devs,
+ struct gpio_event_info *info, void **data, int func)
+{
+ int i;
+ int err;
+ int key_count;
+ struct gpio_kp *kp;
+ struct gpio_event_matrix_info *mi;
+
+ mi = container_of(info, struct gpio_event_matrix_info, info);
+ if (func == GPIO_EVENT_FUNC_SUSPEND || func == GPIO_EVENT_FUNC_RESUME) {
+ /* TODO: disable scanning */
+ return 0;
+ }
+
+ if (func == GPIO_EVENT_FUNC_INIT) {
+ if (mi->keymap == NULL ||
+ mi->input_gpios == NULL ||
+ mi->output_gpios == NULL) {
+ err = -ENODEV;
+ pr_err("gpiomatrix: Incomplete pdata\n");
+ goto err_invalid_platform_data;
+ }
+ key_count = mi->ninputs * mi->noutputs;
+
+ *data = kp = kzalloc(sizeof(*kp) + sizeof(kp->keys_pressed[0]) *
+ BITS_TO_LONGS(key_count), GFP_KERNEL);
+ if (kp == NULL) {
+ err = -ENOMEM;
+ pr_err("gpiomatrix: Failed to allocate private data\n");
+ goto err_kp_alloc_failed;
+ }
+ kp->input_devs = input_devs;
+ kp->keypad_info = mi;
+ for (i = 0; i < key_count; i++) {
+ unsigned short keyentry = mi->keymap[i];
+ unsigned short keycode = keyentry & MATRIX_KEY_MASK;
+ unsigned short dev = keyentry >> MATRIX_CODE_BITS;
+ if (dev >= input_devs->count) {
+ pr_err("gpiomatrix: bad device index %d >= "
+ "%d for key code %d\n",
+ dev, input_devs->count, keycode);
+ err = -EINVAL;
+ goto err_bad_keymap;
+ }
+ if (keycode && keycode <= KEY_MAX)
+ input_set_capability(input_devs->dev[dev],
+ EV_KEY, keycode);
+ }
+
+ for (i = 0; i < mi->noutputs; i++) {
+ err = gpio_request(mi->output_gpios[i], "gpio_kp_out");
+ if (err) {
+ pr_err("gpiomatrix: gpio_request failed for "
+ "output %d\n", mi->output_gpios[i]);
+ goto err_request_output_gpio_failed;
+ }
+ if (gpio_cansleep(mi->output_gpios[i])) {
+ pr_err("gpiomatrix: unsupported output gpio %d,"
+ " can sleep\n", mi->output_gpios[i]);
+ err = -EINVAL;
+ goto err_output_gpio_configure_failed;
+ }
+ if (mi->flags & GPIOKPF_DRIVE_INACTIVE)
+ err = gpio_direction_output(mi->output_gpios[i],
+ !(mi->flags & GPIOKPF_ACTIVE_HIGH));
+ else
+ err = gpio_direction_input(mi->output_gpios[i]);
+ if (err) {
+ pr_err("gpiomatrix: gpio_configure failed for "
+ "output %d\n", mi->output_gpios[i]);
+ goto err_output_gpio_configure_failed;
+ }
+ }
+ for (i = 0; i < mi->ninputs; i++) {
+ err = gpio_request(mi->input_gpios[i], "gpio_kp_in");
+ if (err) {
+ pr_err("gpiomatrix: gpio_request failed for "
+ "input %d\n", mi->input_gpios[i]);
+ goto err_request_input_gpio_failed;
+ }
+ err = gpio_direction_input(mi->input_gpios[i]);
+ if (err) {
+ pr_err("gpiomatrix: gpio_direction_input failed"
+ " for input %d\n", mi->input_gpios[i]);
+ goto err_gpio_direction_input_failed;
+ }
+ }
+ kp->current_output = mi->noutputs;
+ kp->key_state_changed = 1;
+
+ hrtimer_init(&kp->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ kp->timer.function = gpio_keypad_timer_func;
+ wake_lock_init(&kp->wake_lock, WAKE_LOCK_SUSPEND, "gpio_kp");
+ err = gpio_keypad_request_irqs(kp);
+ kp->use_irq = err == 0;
+
+ pr_info("GPIO Matrix Keypad Driver: Start keypad matrix for "
+ "%s%s in %s mode\n", input_devs->dev[0]->name,
+ (input_devs->count > 1) ? "..." : "",
+ kp->use_irq ? "interrupt" : "polling");
+
+ if (kp->use_irq)
+ wake_lock(&kp->wake_lock);
+ hrtimer_start(&kp->timer, ktime_set(0, 0), HRTIMER_MODE_REL);
+
+ return 0;
+ }
+
+ err = 0;
+ kp = *data;
+
+ if (kp->use_irq)
+ for (i = mi->noutputs - 1; i >= 0; i--)
+ free_irq(gpio_to_irq(mi->input_gpios[i]), kp);
+
+ hrtimer_cancel(&kp->timer);
+ wake_lock_destroy(&kp->wake_lock);
+ for (i = mi->noutputs - 1; i >= 0; i--) {
+err_gpio_direction_input_failed:
+ gpio_free(mi->input_gpios[i]);
+err_request_input_gpio_failed:
+ ;
+ }
+ for (i = mi->noutputs - 1; i >= 0; i--) {
+err_output_gpio_configure_failed:
+ gpio_free(mi->output_gpios[i]);
+err_request_output_gpio_failed:
+ ;
+ }
+err_bad_keymap:
+ kfree(kp);
+err_kp_alloc_failed:
+err_invalid_platform_data:
+ return err;
+}
diff --git a/drivers/input/misc/gpio_output.c b/drivers/input/misc/gpio_output.c
new file mode 100644
index 000000000000..2aac2fad0a17
--- /dev/null
+++ b/drivers/input/misc/gpio_output.c
@@ -0,0 +1,97 @@
+/* drivers/input/misc/gpio_output.c
+ *
+ * Copyright (C) 2007 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/gpio.h>
+#include <linux/gpio_event.h>
+
+int gpio_event_output_event(
+ struct gpio_event_input_devs *input_devs, struct gpio_event_info *info,
+ void **data, unsigned int dev, unsigned int type,
+ unsigned int code, int value)
+{
+ int i;
+ struct gpio_event_output_info *oi;
+ oi = container_of(info, struct gpio_event_output_info, info);
+ if (type != oi->type)
+ return 0;
+ if (!(oi->flags & GPIOEDF_ACTIVE_HIGH))
+ value = !value;
+ for (i = 0; i < oi->keymap_size; i++)
+ if (dev == oi->keymap[i].dev && code == oi->keymap[i].code)
+ gpio_set_value(oi->keymap[i].gpio, value);
+ return 0;
+}
+
+int gpio_event_output_func(
+ struct gpio_event_input_devs *input_devs, struct gpio_event_info *info,
+ void **data, int func)
+{
+ int ret;
+ int i;
+ struct gpio_event_output_info *oi;
+ oi = container_of(info, struct gpio_event_output_info, info);
+
+ if (func == GPIO_EVENT_FUNC_SUSPEND || func == GPIO_EVENT_FUNC_RESUME)
+ return 0;
+
+ if (func == GPIO_EVENT_FUNC_INIT) {
+ int output_level = !(oi->flags & GPIOEDF_ACTIVE_HIGH);
+
+ for (i = 0; i < oi->keymap_size; i++) {
+ int dev = oi->keymap[i].dev;
+ if (dev >= input_devs->count) {
+ pr_err("gpio_event_output_func: bad device "
+ "index %d >= %d for key code %d\n",
+ dev, input_devs->count,
+ oi->keymap[i].code);
+ ret = -EINVAL;
+ goto err_bad_keymap;
+ }
+ input_set_capability(input_devs->dev[dev], oi->type,
+ oi->keymap[i].code);
+ }
+
+ for (i = 0; i < oi->keymap_size; i++) {
+ ret = gpio_request(oi->keymap[i].gpio,
+ "gpio_event_output");
+ if (ret) {
+ pr_err("gpio_event_output_func: gpio_request "
+ "failed for %d\n", oi->keymap[i].gpio);
+ goto err_gpio_request_failed;
+ }
+ ret = gpio_direction_output(oi->keymap[i].gpio,
+ output_level);
+ if (ret) {
+ pr_err("gpio_event_output_func: "
+ "gpio_direction_output failed for %d\n",
+ oi->keymap[i].gpio);
+ goto err_gpio_direction_output_failed;
+ }
+ }
+ return 0;
+ }
+
+ ret = 0;
+ for (i = oi->keymap_size - 1; i >= 0; i--) {
+err_gpio_direction_output_failed:
+ gpio_free(oi->keymap[i].gpio);
+err_gpio_request_failed:
+ ;
+ }
+err_bad_keymap:
+ return ret;
+}
+
diff --git a/drivers/input/misc/ims-pcu.c b/drivers/input/misc/ims-pcu.c
index 25bd4d701722..f4e8fbec6a94 100644
--- a/drivers/input/misc/ims-pcu.c
+++ b/drivers/input/misc/ims-pcu.c
@@ -1859,7 +1859,7 @@ static int ims_pcu_identify_type(struct ims_pcu *pcu, u8 *device_id)
static int ims_pcu_init_application_mode(struct ims_pcu *pcu)
{
- static atomic_t device_no = ATOMIC_INIT(0);
+ static atomic_t device_no = ATOMIC_INIT(-1);
const struct ims_pcu_device_info *info;
int error;
@@ -1890,7 +1890,7 @@ static int ims_pcu_init_application_mode(struct ims_pcu *pcu)
}
/* Device appears to be operable, complete initialization */
- pcu->device_no = atomic_inc_return(&device_no) - 1;
+ pcu->device_no = atomic_inc_return(&device_no);
/*
* PCU-B devices, both GEN_1 and GEN_2 do not have OFN sensor
diff --git a/drivers/input/misc/keychord.c b/drivers/input/misc/keychord.c
new file mode 100644
index 000000000000..a5ea27ad0e16
--- /dev/null
+++ b/drivers/input/misc/keychord.c
@@ -0,0 +1,391 @@
+/*
+ * drivers/input/misc/keychord.c
+ *
+ * Copyright (C) 2008 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+*/
+
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/keychord.h>
+#include <linux/sched.h>
+
+#define KEYCHORD_NAME "keychord"
+#define BUFFER_SIZE 16
+
+MODULE_AUTHOR("Mike Lockwood <lockwood@android.com>");
+MODULE_DESCRIPTION("Key chord input driver");
+MODULE_SUPPORTED_DEVICE("keychord");
+MODULE_LICENSE("GPL");
+
+#define NEXT_KEYCHORD(kc) ((struct input_keychord *) \
+ ((char *)kc + sizeof(struct input_keychord) + \
+ kc->count * sizeof(kc->keycodes[0])))
+
+struct keychord_device {
+ struct input_handler input_handler;
+ int registered;
+
+ /* list of keychords to monitor */
+ struct input_keychord *keychords;
+ int keychord_count;
+
+ /* bitmask of keys contained in our keychords */
+ unsigned long keybit[BITS_TO_LONGS(KEY_CNT)];
+ /* current state of the keys */
+ unsigned long keystate[BITS_TO_LONGS(KEY_CNT)];
+ /* number of keys that are currently pressed */
+ int key_down;
+
+ /* second input_device_id is needed for null termination */
+ struct input_device_id device_ids[2];
+
+ spinlock_t lock;
+ wait_queue_head_t waitq;
+ unsigned char head;
+ unsigned char tail;
+ __u16 buff[BUFFER_SIZE];
+};
+
+static int check_keychord(struct keychord_device *kdev,
+ struct input_keychord *keychord)
+{
+ int i;
+
+ if (keychord->count != kdev->key_down)
+ return 0;
+
+ for (i = 0; i < keychord->count; i++) {
+ if (!test_bit(keychord->keycodes[i], kdev->keystate))
+ return 0;
+ }
+
+ /* we have a match */
+ return 1;
+}
+
+static void keychord_event(struct input_handle *handle, unsigned int type,
+ unsigned int code, int value)
+{
+ struct keychord_device *kdev = handle->private;
+ struct input_keychord *keychord;
+ unsigned long flags;
+ int i, got_chord = 0;
+
+ if (type != EV_KEY || code >= KEY_MAX)
+ return;
+
+ spin_lock_irqsave(&kdev->lock, flags);
+ /* do nothing if key state did not change */
+ if (!test_bit(code, kdev->keystate) == !value)
+ goto done;
+ __change_bit(code, kdev->keystate);
+ if (value)
+ kdev->key_down++;
+ else
+ kdev->key_down--;
+
+ /* don't notify on key up */
+ if (!value)
+ goto done;
+ /* ignore this event if it is not one of the keys we are monitoring */
+ if (!test_bit(code, kdev->keybit))
+ goto done;
+
+ keychord = kdev->keychords;
+ if (!keychord)
+ goto done;
+
+ /* check to see if the keyboard state matches any keychords */
+ for (i = 0; i < kdev->keychord_count; i++) {
+ if (check_keychord(kdev, keychord)) {
+ kdev->buff[kdev->head] = keychord->id;
+ kdev->head = (kdev->head + 1) % BUFFER_SIZE;
+ got_chord = 1;
+ break;
+ }
+ /* skip to next keychord */
+ keychord = NEXT_KEYCHORD(keychord);
+ }
+
+done:
+ spin_unlock_irqrestore(&kdev->lock, flags);
+
+ if (got_chord) {
+ pr_info("keychord: got keychord id %d. Any tasks: %d\n",
+ keychord->id,
+ !list_empty_careful(&kdev->waitq.task_list));
+ wake_up_interruptible(&kdev->waitq);
+ }
+}
+
+static int keychord_connect(struct input_handler *handler,
+ struct input_dev *dev,
+ const struct input_device_id *id)
+{
+ int i, ret;
+ struct input_handle *handle;
+ struct keychord_device *kdev =
+ container_of(handler, struct keychord_device, input_handler);
+
+ /*
+ * ignore this input device if it does not contain any keycodes
+ * that we are monitoring
+ */
+ for (i = 0; i < KEY_MAX; i++) {
+ if (test_bit(i, kdev->keybit) && test_bit(i, dev->keybit))
+ break;
+ }
+ if (i == KEY_MAX)
+ return -ENODEV;
+
+ handle = kzalloc(sizeof(*handle), GFP_KERNEL);
+ if (!handle)
+ return -ENOMEM;
+
+ handle->dev = dev;
+ handle->handler = handler;
+ handle->name = KEYCHORD_NAME;
+ handle->private = kdev;
+
+ ret = input_register_handle(handle);
+ if (ret)
+ goto err_input_register_handle;
+
+ ret = input_open_device(handle);
+ if (ret)
+ goto err_input_open_device;
+
+ pr_info("keychord: using input dev %s for fevent\n", dev->name);
+
+ return 0;
+
+err_input_open_device:
+ input_unregister_handle(handle);
+err_input_register_handle:
+ kfree(handle);
+ return ret;
+}
+
+static void keychord_disconnect(struct input_handle *handle)
+{
+ input_close_device(handle);
+ input_unregister_handle(handle);
+ kfree(handle);
+}
+
+/*
+ * keychord_read is used to read keychord events from the driver
+ */
+static ssize_t keychord_read(struct file *file, char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ struct keychord_device *kdev = file->private_data;
+ __u16 id;
+ int retval;
+ unsigned long flags;
+
+ if (count < sizeof(id))
+ return -EINVAL;
+ count = sizeof(id);
+
+ if (kdev->head == kdev->tail && (file->f_flags & O_NONBLOCK))
+ return -EAGAIN;
+
+ retval = wait_event_interruptible(kdev->waitq,
+ kdev->head != kdev->tail);
+ if (retval)
+ return retval;
+
+ spin_lock_irqsave(&kdev->lock, flags);
+ /* pop a keychord ID off the queue */
+ id = kdev->buff[kdev->tail];
+ kdev->tail = (kdev->tail + 1) % BUFFER_SIZE;
+ spin_unlock_irqrestore(&kdev->lock, flags);
+
+ if (copy_to_user(buffer, &id, count))
+ return -EFAULT;
+
+ return count;
+}
+
+/*
+ * keychord_write is used to configure the driver
+ */
+static ssize_t keychord_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ struct keychord_device *kdev = file->private_data;
+ struct input_keychord *keychords = 0;
+ struct input_keychord *keychord, *next, *end;
+ int ret, i, key;
+ unsigned long flags;
+
+ if (count < sizeof(struct input_keychord))
+ return -EINVAL;
+ keychords = kzalloc(count, GFP_KERNEL);
+ if (!keychords)
+ return -ENOMEM;
+
+ /* read list of keychords from userspace */
+ if (copy_from_user(keychords, buffer, count)) {
+ kfree(keychords);
+ return -EFAULT;
+ }
+
+ /* unregister handler before changing configuration */
+ if (kdev->registered) {
+ input_unregister_handler(&kdev->input_handler);
+ kdev->registered = 0;
+ }
+
+ spin_lock_irqsave(&kdev->lock, flags);
+ /* clear any existing configuration */
+ kfree(kdev->keychords);
+ kdev->keychords = 0;
+ kdev->keychord_count = 0;
+ kdev->key_down = 0;
+ memset(kdev->keybit, 0, sizeof(kdev->keybit));
+ memset(kdev->keystate, 0, sizeof(kdev->keystate));
+ kdev->head = kdev->tail = 0;
+
+ keychord = keychords;
+ end = (struct input_keychord *)((char *)keychord + count);
+
+ while (keychord < end) {
+ next = NEXT_KEYCHORD(keychord);
+ if (keychord->count <= 0 || next > end) {
+ pr_err("keychord: invalid keycode count %d\n",
+ keychord->count);
+ goto err_unlock_return;
+ }
+ if (keychord->version != KEYCHORD_VERSION) {
+ pr_err("keychord: unsupported version %d\n",
+ keychord->version);
+ goto err_unlock_return;
+ }
+
+ /* keep track of the keys we are monitoring in keybit */
+ for (i = 0; i < keychord->count; i++) {
+ key = keychord->keycodes[i];
+ if (key < 0 || key >= KEY_CNT) {
+ pr_err("keychord: keycode %d out of range\n",
+ key);
+ goto err_unlock_return;
+ }
+ __set_bit(key, kdev->keybit);
+ }
+
+ kdev->keychord_count++;
+ keychord = next;
+ }
+
+ kdev->keychords = keychords;
+ spin_unlock_irqrestore(&kdev->lock, flags);
+
+ ret = input_register_handler(&kdev->input_handler);
+ if (ret) {
+ kfree(keychords);
+ kdev->keychords = 0;
+ return ret;
+ }
+ kdev->registered = 1;
+
+ return count;
+
+err_unlock_return:
+ spin_unlock_irqrestore(&kdev->lock, flags);
+ kfree(keychords);
+ return -EINVAL;
+}
+
+static unsigned int keychord_poll(struct file *file, poll_table *wait)
+{
+ struct keychord_device *kdev = file->private_data;
+
+ poll_wait(file, &kdev->waitq, wait);
+
+ if (kdev->head != kdev->tail)
+ return POLLIN | POLLRDNORM;
+
+ return 0;
+}
+
+static int keychord_open(struct inode *inode, struct file *file)
+{
+ struct keychord_device *kdev;
+
+ kdev = kzalloc(sizeof(struct keychord_device), GFP_KERNEL);
+ if (!kdev)
+ return -ENOMEM;
+
+ spin_lock_init(&kdev->lock);
+ init_waitqueue_head(&kdev->waitq);
+
+ kdev->input_handler.event = keychord_event;
+ kdev->input_handler.connect = keychord_connect;
+ kdev->input_handler.disconnect = keychord_disconnect;
+ kdev->input_handler.name = KEYCHORD_NAME;
+ kdev->input_handler.id_table = kdev->device_ids;
+
+ kdev->device_ids[0].flags = INPUT_DEVICE_ID_MATCH_EVBIT;
+ __set_bit(EV_KEY, kdev->device_ids[0].evbit);
+
+ file->private_data = kdev;
+
+ return 0;
+}
+
+static int keychord_release(struct inode *inode, struct file *file)
+{
+ struct keychord_device *kdev = file->private_data;
+
+ if (kdev->registered)
+ input_unregister_handler(&kdev->input_handler);
+ kfree(kdev);
+
+ return 0;
+}
+
+static const struct file_operations keychord_fops = {
+ .owner = THIS_MODULE,
+ .open = keychord_open,
+ .release = keychord_release,
+ .read = keychord_read,
+ .write = keychord_write,
+ .poll = keychord_poll,
+};
+
+static struct miscdevice keychord_misc = {
+ .fops = &keychord_fops,
+ .name = KEYCHORD_NAME,
+ .minor = MISC_DYNAMIC_MINOR,
+};
+
+static int __init keychord_init(void)
+{
+ return misc_register(&keychord_misc);
+}
+
+static void __exit keychord_exit(void)
+{
+ misc_deregister(&keychord_misc);
+}
+
+module_init(keychord_init);
+module_exit(keychord_exit);
diff --git a/drivers/input/serio/serio.c b/drivers/input/serio/serio.c
index d399b8b0f000..a05a5179da32 100644
--- a/drivers/input/serio/serio.c
+++ b/drivers/input/serio/serio.c
@@ -514,7 +514,7 @@ static void serio_release_port(struct device *dev)
*/
static void serio_init_port(struct serio *serio)
{
- static atomic_t serio_no = ATOMIC_INIT(0);
+ static atomic_t serio_no = ATOMIC_INIT(-1);
__module_get(THIS_MODULE);
@@ -525,7 +525,7 @@ static void serio_init_port(struct serio *serio)
mutex_init(&serio->drv_mutex);
device_initialize(&serio->dev);
dev_set_name(&serio->dev, "serio%lu",
- (unsigned long)atomic_inc_return(&serio_no) - 1);
+ (unsigned long)atomic_inc_return(&serio_no));
serio->dev.bus = &serio_bus;
serio->dev.release = serio_release_port;
serio->dev.groups = serio_device_attr_groups;
diff --git a/drivers/input/serio/serio_raw.c b/drivers/input/serio/serio_raw.c
index c9a02fe57576..71ef5d65a0c6 100644
--- a/drivers/input/serio/serio_raw.c
+++ b/drivers/input/serio/serio_raw.c
@@ -292,7 +292,7 @@ static irqreturn_t serio_raw_interrupt(struct serio *serio, unsigned char data,
static int serio_raw_connect(struct serio *serio, struct serio_driver *drv)
{
- static atomic_t serio_raw_no = ATOMIC_INIT(0);
+ static atomic_t serio_raw_no = ATOMIC_INIT(-1);
struct serio_raw *serio_raw;
int err;
@@ -303,7 +303,7 @@ static int serio_raw_connect(struct serio *serio, struct serio_driver *drv)
}
snprintf(serio_raw->name, sizeof(serio_raw->name),
- "serio_raw%ld", (long)atomic_inc_return(&serio_raw_no) - 1);
+ "serio_raw%ld", (long)atomic_inc_return(&serio_raw_no));
kref_init(&serio_raw->kref);
INIT_LIST_HEAD(&serio_raw->client_list);
init_waitqueue_head(&serio_raw->wait);
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 5bdedf6df153..f73295d7cbda 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -418,6 +418,52 @@ config DM_VERITY
If unsure, say N.
+config DM_VERITY_HASH_PREFETCH_MIN_SIZE_128
+ bool "Prefetch size 128"
+
+config DM_VERITY_HASH_PREFETCH_MIN_SIZE
+ int "Verity hash prefetch minimum size"
+ depends on DM_VERITY
+ range 1 4096
+ default 128 if DM_VERITY_HASH_PREFETCH_MIN_SIZE_128
+ default 1
+ ---help---
+ This sets minimum number of hash blocks to prefetch for dm-verity.
+ For devices like eMMC, having larger prefetch size like 128 can improve
+ performance with increased memory consumption for keeping more hashes
+ in RAM.
+
+config DM_ANDROID_VERITY
+ tristate "Android verity target support"
+ depends on DM_VERITY
+ depends on X509_CERTIFICATE_PARSER
+ depends on SYSTEM_TRUSTED_KEYRING
+ depends on PUBLIC_KEY_ALGO_RSA
+ depends on KEYS
+ depends on ASYMMETRIC_KEY_TYPE
+ depends on ASYMMETRIC_PUBLIC_KEY_SUBTYPE
+ depends on MD_LINEAR
+ select DM_VERITY_HASH_PREFETCH_MIN_SIZE_128
+ ---help---
+ This device-mapper target is virtually a VERITY target. This
+ target is setup by reading the metadata contents piggybacked
+ to the actual data blocks in the block device. The signature
+ of the metadata contents are verified against the key included
+ in the system keyring. Upon success, the underlying verity
+ target is setup.
+
+config DM_VERITY_FEC
+ bool "Verity forward error correction support"
+ depends on DM_VERITY
+ select REED_SOLOMON
+ select REED_SOLOMON_DEC8
+ ---help---
+ Add forward error correction support to dm-verity. This option
+ makes it possible to use pre-generated error correction data to
+ recover from corrupted blocks.
+
+ If unsure, say N.
+
config DM_SWITCH
tristate "Switch target support (EXPERIMENTAL)"
depends on BLK_DEV_DM
@@ -432,4 +478,19 @@ config DM_SWITCH
If unsure, say N.
+config DM_LOG_WRITES
+ tristate "Log writes target support"
+ depends on BLK_DEV_DM
+ ---help---
+ This device-mapper target takes two devices, one device to use
+ normally, one to log all write operations done to the first device.
+ This is for use by file system developers wishing to verify that
+ their fs is writing a consitent file system at all times by allowing
+ them to replay the log in a variety of ways and to check the
+ contents.
+
+ To compile this code as a module, choose M here: the module will
+ be called dm-log-writes.
+
+ If unsure, say N.
endif # MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index a2da532b1c2b..efea0935a4d0 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -15,6 +15,7 @@ dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o
dm-cache-mq-y += dm-cache-policy-mq.o
dm-cache-cleaner-y += dm-cache-policy-cleaner.o
dm-era-y += dm-era-target.o
+dm-verity-y += dm-verity-target.o
md-mod-y += md.o bitmap.o
raid456-y += raid5.o
@@ -55,7 +56,12 @@ obj-$(CONFIG_DM_CACHE) += dm-cache.o
obj-$(CONFIG_DM_CACHE_MQ) += dm-cache-mq.o
obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o
obj-$(CONFIG_DM_ERA) += dm-era.o
+obj-$(CONFIG_DM_ANDROID_VERITY) += dm-android-verity.o
ifeq ($(CONFIG_DM_UEVENT),y)
dm-mod-objs += dm-uevent.o
endif
+
+ifeq ($(CONFIG_DM_VERITY_FEC),y)
+dm-verity-objs += dm-verity-fec.o
+endif
diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c
new file mode 100644
index 000000000000..cc0760e1e3e1
--- /dev/null
+++ b/drivers/md/dm-android-verity.c
@@ -0,0 +1,946 @@
+/*
+ * Copyright (C) 2015 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/device-mapper.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/fcntl.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/key.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/of.h>
+#include <linux/reboot.h>
+#include <linux/string.h>
+#include <linux/vmalloc.h>
+
+#include <asm/setup.h>
+#include <crypto/hash.h>
+#include <crypto/public_key.h>
+#include <crypto/sha.h>
+#include <keys/asymmetric-type.h>
+#include <keys/system_keyring.h>
+
+#include "dm-verity.h"
+#include "dm-android-verity.h"
+
+static char verifiedbootstate[VERITY_COMMANDLINE_PARAM_LENGTH];
+static char veritymode[VERITY_COMMANDLINE_PARAM_LENGTH];
+static char veritykeyid[VERITY_DEFAULT_KEY_ID_LENGTH];
+static char buildvariant[BUILD_VARIANT];
+
+static bool target_added;
+static bool verity_enabled = true;
+struct dentry *debug_dir;
+static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv);
+
+static struct target_type android_verity_target = {
+ .name = "android-verity",
+ .version = {1, 0, 0},
+ .module = THIS_MODULE,
+ .ctr = android_verity_ctr,
+ .dtr = verity_dtr,
+ .map = verity_map,
+ .status = verity_status,
+ .ioctl = verity_ioctl,
+ .merge = verity_merge,
+ .iterate_devices = verity_iterate_devices,
+ .io_hints = verity_io_hints,
+};
+
+#ifndef MODULE
+static int __init verified_boot_state_param(char *line)
+{
+ strlcpy(verifiedbootstate, line, sizeof(verifiedbootstate));
+ return 1;
+}
+
+__setup("androidboot.verifiedbootstate=", verified_boot_state_param);
+
+static int __init verity_mode_param(char *line)
+{
+ strlcpy(veritymode, line, sizeof(veritymode));
+ return 1;
+}
+
+__setup("androidboot.veritymode=", verity_mode_param);
+
+static int __init verity_keyid_param(char *line)
+{
+ strlcpy(veritykeyid, line, sizeof(veritykeyid));
+ return 1;
+}
+
+__setup("veritykeyid=", verity_keyid_param);
+
+static int __init verity_buildvariant(char *line)
+{
+ strlcpy(buildvariant, line, sizeof(buildvariant));
+ return 1;
+}
+
+__setup("buildvariant=", verity_buildvariant);
+#endif
+
+static inline bool default_verity_key_id(void)
+{
+ return veritykeyid[0] != '\0';
+}
+
+static inline bool is_eng(void)
+{
+ static const char typeeng[] = "eng";
+
+ return !strncmp(buildvariant, typeeng, sizeof(typeeng));
+}
+
+static inline bool is_userdebug(void)
+{
+ static const char typeuserdebug[] = "userdebug";
+
+ return !strncmp(buildvariant, typeuserdebug, sizeof(typeuserdebug));
+}
+
+static inline bool is_unlocked(void)
+{
+ static const char unlocked[] = "orange";
+
+ return !strncmp(verifiedbootstate, unlocked, sizeof(unlocked));
+}
+
+static int table_extract_mpi_array(struct public_key_signature *pks,
+ const void *data, size_t len)
+{
+ MPI mpi = mpi_read_raw_data(data, len);
+
+ if (!mpi) {
+ DMERR("Error while allocating mpi array");
+ return -ENOMEM;
+ }
+
+ pks->mpi[0] = mpi;
+ pks->nr_mpi = 1;
+ return 0;
+}
+
+static struct public_key_signature *table_make_digest(
+ enum hash_algo hash,
+ const void *table,
+ unsigned long table_len)
+{
+ struct public_key_signature *pks = NULL;
+ struct crypto_shash *tfm;
+ struct shash_desc *desc;
+ size_t digest_size, desc_size;
+ int ret;
+
+ /* Allocate the hashing algorithm we're going to need and find out how
+ * big the hash operational data will be.
+ */
+ tfm = crypto_alloc_shash(hash_algo_name[hash], 0, 0);
+ if (IS_ERR(tfm))
+ return ERR_CAST(tfm);
+
+ desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
+ digest_size = crypto_shash_digestsize(tfm);
+
+ /* We allocate the hash operational data storage on the end of out
+ * context data and the digest output buffer on the end of that.
+ */
+ ret = -ENOMEM;
+ pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL);
+ if (!pks)
+ goto error;
+
+ pks->pkey_hash_algo = hash;
+ pks->digest = (u8 *)pks + sizeof(*pks) + desc_size;
+ pks->digest_size = digest_size;
+
+ desc = (struct shash_desc *)(pks + 1);
+ desc->tfm = tfm;
+ desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ ret = crypto_shash_init(desc);
+ if (ret < 0)
+ goto error;
+
+ ret = crypto_shash_finup(desc, table, table_len, pks->digest);
+ if (ret < 0)
+ goto error;
+
+ crypto_free_shash(tfm);
+ return pks;
+
+error:
+ kfree(pks);
+ crypto_free_shash(tfm);
+ return ERR_PTR(ret);
+}
+
+static int read_block_dev(struct bio_read *payload, struct block_device *bdev,
+ sector_t offset, int length)
+{
+ struct bio *bio;
+ int err = 0, i;
+
+ payload->number_of_pages = DIV_ROUND_UP(length, PAGE_SIZE);
+
+ bio = bio_alloc(GFP_KERNEL, payload->number_of_pages);
+ if (!bio) {
+ DMERR("Error while allocating bio");
+ return -ENOMEM;
+ }
+
+ bio->bi_bdev = bdev;
+ bio->bi_iter.bi_sector = offset;
+
+ payload->page_io = kzalloc(sizeof(struct page *) *
+ payload->number_of_pages, GFP_KERNEL);
+ if (!payload->page_io) {
+ DMERR("page_io array alloc failed");
+ err = -ENOMEM;
+ goto free_bio;
+ }
+
+ for (i = 0; i < payload->number_of_pages; i++) {
+ payload->page_io[i] = alloc_page(GFP_KERNEL);
+ if (!payload->page_io[i]) {
+ DMERR("alloc_page failed");
+ err = -ENOMEM;
+ goto free_pages;
+ }
+ if (!bio_add_page(bio, payload->page_io[i], PAGE_SIZE, 0)) {
+ DMERR("bio_add_page error");
+ err = -EIO;
+ goto free_pages;
+ }
+ }
+
+ if (!submit_bio_wait(READ, bio))
+ /* success */
+ goto free_bio;
+ DMERR("bio read failed");
+ err = -EIO;
+
+free_pages:
+ for (i = 0; i < payload->number_of_pages; i++)
+ if (payload->page_io[i])
+ __free_page(payload->page_io[i]);
+ kfree(payload->page_io);
+free_bio:
+ bio_put(bio);
+ return err;
+}
+
+static inline u64 fec_div_round_up(u64 x, u64 y)
+{
+ u64 remainder;
+
+ return div64_u64_rem(x, y, &remainder) +
+ (remainder > 0 ? 1 : 0);
+}
+
+static inline void populate_fec_metadata(struct fec_header *header,
+ struct fec_ecc_metadata *ecc)
+{
+ ecc->blocks = fec_div_round_up(le64_to_cpu(header->inp_size),
+ FEC_BLOCK_SIZE);
+ ecc->roots = le32_to_cpu(header->roots);
+ ecc->start = le64_to_cpu(header->inp_size);
+}
+
+static inline int validate_fec_header(struct fec_header *header, u64 offset)
+{
+ /* move offset to make the sanity check work for backup header
+ * as well. */
+ offset -= offset % FEC_BLOCK_SIZE;
+ if (le32_to_cpu(header->magic) != FEC_MAGIC ||
+ le32_to_cpu(header->version) != FEC_VERSION ||
+ le32_to_cpu(header->size) != sizeof(struct fec_header) ||
+ le32_to_cpu(header->roots) == 0 ||
+ le32_to_cpu(header->roots) >= FEC_RSM)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int extract_fec_header(dev_t dev, struct fec_header *fec,
+ struct fec_ecc_metadata *ecc)
+{
+ u64 device_size;
+ struct bio_read payload;
+ int i, err = 0;
+ struct block_device *bdev;
+
+ bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);
+
+ if (IS_ERR_OR_NULL(bdev)) {
+ DMERR("bdev get error");
+ return PTR_ERR(bdev);
+ }
+
+ device_size = i_size_read(bdev->bd_inode);
+
+ /* fec metadata size is a power of 2 and PAGE_SIZE
+ * is a power of 2 as well.
+ */
+ BUG_ON(FEC_BLOCK_SIZE > PAGE_SIZE);
+ /* 512 byte sector alignment */
+ BUG_ON(((device_size - FEC_BLOCK_SIZE) % (1 << SECTOR_SHIFT)) != 0);
+
+ err = read_block_dev(&payload, bdev, (device_size -
+ FEC_BLOCK_SIZE) / (1 << SECTOR_SHIFT), FEC_BLOCK_SIZE);
+ if (err) {
+ DMERR("Error while reading verity metadata");
+ goto error;
+ }
+
+ BUG_ON(sizeof(struct fec_header) > PAGE_SIZE);
+ memcpy(fec, page_address(payload.page_io[0]),
+ sizeof(*fec));
+
+ ecc->valid = true;
+ if (validate_fec_header(fec, device_size - FEC_BLOCK_SIZE)) {
+ /* Try the backup header */
+ memcpy(fec, page_address(payload.page_io[0]) + FEC_BLOCK_SIZE
+ - sizeof(*fec) ,
+ sizeof(*fec));
+ if (validate_fec_header(fec, device_size -
+ sizeof(struct fec_header)))
+ ecc->valid = false;
+ }
+
+ if (ecc->valid)
+ populate_fec_metadata(fec, ecc);
+
+ for (i = 0; i < payload.number_of_pages; i++)
+ __free_page(payload.page_io[i]);
+ kfree(payload.page_io);
+
+error:
+ blkdev_put(bdev, FMODE_READ);
+ return err;
+}
+static void find_metadata_offset(struct fec_header *fec,
+ struct block_device *bdev, u64 *metadata_offset)
+{
+ u64 device_size;
+
+ device_size = i_size_read(bdev->bd_inode);
+
+ if (le32_to_cpu(fec->magic) == FEC_MAGIC)
+ *metadata_offset = le64_to_cpu(fec->inp_size) -
+ VERITY_METADATA_SIZE;
+ else
+ *metadata_offset = device_size - VERITY_METADATA_SIZE;
+}
+
+static int find_size(dev_t dev, u64 *device_size)
+{
+ struct block_device *bdev;
+
+ bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);
+ if (IS_ERR_OR_NULL(bdev)) {
+ DMERR("blkdev_get_by_dev failed");
+ return PTR_ERR(bdev);
+ }
+
+ *device_size = i_size_read(bdev->bd_inode);
+ *device_size >>= SECTOR_SHIFT;
+
+ DMINFO("blkdev size in sectors: %llu", *device_size);
+ blkdev_put(bdev, FMODE_READ);
+ return 0;
+}
+
+static int verify_header(struct android_metadata_header *header)
+{
+ int retval = -EINVAL;
+
+ if (is_userdebug() && le32_to_cpu(header->magic_number) ==
+ VERITY_METADATA_MAGIC_DISABLE)
+ return VERITY_STATE_DISABLE;
+
+ if (!(le32_to_cpu(header->magic_number) ==
+ VERITY_METADATA_MAGIC_NUMBER) ||
+ (le32_to_cpu(header->magic_number) ==
+ VERITY_METADATA_MAGIC_DISABLE)) {
+ DMERR("Incorrect magic number");
+ return retval;
+ }
+
+ if (le32_to_cpu(header->protocol_version) !=
+ VERITY_METADATA_VERSION) {
+ DMERR("Unsupported version %u",
+ le32_to_cpu(header->protocol_version));
+ return retval;
+ }
+
+ return 0;
+}
+
+static int extract_metadata(dev_t dev, struct fec_header *fec,
+ struct android_metadata **metadata,
+ bool *verity_enabled)
+{
+ struct block_device *bdev;
+ struct android_metadata_header *header;
+ int i;
+ u32 table_length, copy_length, offset;
+ u64 metadata_offset;
+ struct bio_read payload;
+ int err = 0;
+
+ bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);
+
+ if (IS_ERR_OR_NULL(bdev)) {
+ DMERR("blkdev_get_by_dev failed");
+ return -ENODEV;
+ }
+
+ find_metadata_offset(fec, bdev, &metadata_offset);
+
+ /* Verity metadata size is a power of 2 and PAGE_SIZE
+ * is a power of 2 as well.
+ * PAGE_SIZE is also a multiple of 512 bytes.
+ */
+ if (VERITY_METADATA_SIZE > PAGE_SIZE)
+ BUG_ON(VERITY_METADATA_SIZE % PAGE_SIZE != 0);
+ /* 512 byte sector alignment */
+ BUG_ON(metadata_offset % (1 << SECTOR_SHIFT) != 0);
+
+ err = read_block_dev(&payload, bdev, metadata_offset /
+ (1 << SECTOR_SHIFT), VERITY_METADATA_SIZE);
+ if (err) {
+ DMERR("Error while reading verity metadata");
+ goto blkdev_release;
+ }
+
+ header = kzalloc(sizeof(*header), GFP_KERNEL);
+ if (!header) {
+ DMERR("kzalloc failed for header");
+ err = -ENOMEM;
+ goto free_payload;
+ }
+
+ memcpy(header, page_address(payload.page_io[0]),
+ sizeof(*header));
+
+ DMINFO("bio magic_number:%u protocol_version:%d table_length:%u",
+ le32_to_cpu(header->magic_number),
+ le32_to_cpu(header->protocol_version),
+ le32_to_cpu(header->table_length));
+
+ err = verify_header(header);
+
+ if (err == VERITY_STATE_DISABLE) {
+ DMERR("Mounting root with verity disabled");
+ *verity_enabled = false;
+ /* we would still have to read the metadata to figure out
+ * the data blocks size. Or may be could map the entire
+ * partition similar to mounting the device.
+ *
+ * Reset error as well as the verity_enabled flag is changed.
+ */
+ err = 0;
+ } else if (err)
+ goto free_header;
+
+ *metadata = kzalloc(sizeof(**metadata), GFP_KERNEL);
+ if (!*metadata) {
+ DMERR("kzalloc for metadata failed");
+ err = -ENOMEM;
+ goto free_header;
+ }
+
+ (*metadata)->header = header;
+ table_length = le32_to_cpu(header->table_length);
+
+ if (table_length == 0 ||
+ table_length > (VERITY_METADATA_SIZE -
+ sizeof(struct android_metadata_header))) {
+ DMERR("table_length too long");
+ err = -EINVAL;
+ goto free_metadata;
+ }
+
+ (*metadata)->verity_table = kzalloc(table_length + 1, GFP_KERNEL);
+
+ if (!(*metadata)->verity_table) {
+ DMERR("kzalloc verity_table failed");
+ err = -ENOMEM;
+ goto free_metadata;
+ }
+
+ if (sizeof(struct android_metadata_header) +
+ table_length <= PAGE_SIZE) {
+ memcpy((*metadata)->verity_table,
+ page_address(payload.page_io[0])
+ + sizeof(struct android_metadata_header),
+ table_length);
+ } else {
+ copy_length = PAGE_SIZE -
+ sizeof(struct android_metadata_header);
+ memcpy((*metadata)->verity_table,
+ page_address(payload.page_io[0])
+ + sizeof(struct android_metadata_header),
+ copy_length);
+ table_length -= copy_length;
+ offset = copy_length;
+ i = 1;
+ while (table_length != 0) {
+ if (table_length > PAGE_SIZE) {
+ memcpy((*metadata)->verity_table + offset,
+ page_address(payload.page_io[i]),
+ PAGE_SIZE);
+ offset += PAGE_SIZE;
+ table_length -= PAGE_SIZE;
+ } else {
+ memcpy((*metadata)->verity_table + offset,
+ page_address(payload.page_io[i]),
+ table_length);
+ table_length = 0;
+ }
+ i++;
+ }
+ }
+ (*metadata)->verity_table[table_length] = '\0';
+
+ DMINFO("verity_table: %s", (*metadata)->verity_table);
+ goto free_payload;
+
+free_metadata:
+ kfree(*metadata);
+free_header:
+ kfree(header);
+free_payload:
+ for (i = 0; i < payload.number_of_pages; i++)
+ if (payload.page_io[i])
+ __free_page(payload.page_io[i]);
+ kfree(payload.page_io);
+blkdev_release:
+ blkdev_put(bdev, FMODE_READ);
+ return err;
+}
+
+/* helper functions to extract properties from dts */
+const char *find_dt_value(const char *name)
+{
+ struct device_node *firmware;
+ const char *value;
+
+ firmware = of_find_node_by_path("/firmware/android");
+ if (!firmware)
+ return NULL;
+ value = of_get_property(firmware, name, NULL);
+ of_node_put(firmware);
+
+ return value;
+}
+
+static int verity_mode(void)
+{
+ static const char enforcing[] = "enforcing";
+ static const char verified_mode_prop[] = "veritymode";
+ const char *value;
+
+ value = find_dt_value(verified_mode_prop);
+ if (!value)
+ value = veritymode;
+ if (!strncmp(value, enforcing, sizeof(enforcing) - 1))
+ return DM_VERITY_MODE_RESTART;
+
+ return DM_VERITY_MODE_EIO;
+}
+
+static int verify_verity_signature(char *key_id,
+ struct android_metadata *metadata)
+{
+ key_ref_t key_ref;
+ struct key *key;
+ struct public_key_signature *pks = NULL;
+ int retval = -EINVAL;
+
+ key_ref = keyring_search(make_key_ref(system_trusted_keyring, 1),
+ &key_type_asymmetric, key_id);
+
+ if (IS_ERR(key_ref)) {
+ DMERR("keyring: key not found");
+ return -ENOKEY;
+ }
+
+ key = key_ref_to_ptr(key_ref);
+
+ pks = table_make_digest(HASH_ALGO_SHA256,
+ (const void *)metadata->verity_table,
+ le32_to_cpu(metadata->header->table_length));
+
+ if (IS_ERR(pks)) {
+ DMERR("hashing failed");
+ goto error;
+ }
+
+ retval = table_extract_mpi_array(pks, &metadata->header->signature[0],
+ RSANUMBYTES);
+ if (retval < 0) {
+ DMERR("Error extracting mpi %d", retval);
+ goto error;
+ }
+
+ retval = verify_signature(key, pks);
+ mpi_free(pks->rsa.s);
+error:
+ kfree(pks);
+ key_put(key);
+
+ return retval;
+}
+
+static void handle_error(void)
+{
+ int mode = verity_mode();
+ if (mode == DM_VERITY_MODE_RESTART) {
+ DMERR("triggering restart");
+ kernel_restart("dm-verity device corrupted");
+ } else {
+ DMERR("Mounting verity root failed");
+ }
+}
+
+static inline bool test_mult_overflow(sector_t a, u32 b)
+{
+ sector_t r = (sector_t)~0ULL;
+
+ sector_div(r, b);
+ return a > r;
+}
+
+static int add_as_linear_device(struct dm_target *ti, char *dev)
+{
+ /*Move to linear mapping defines*/
+ char *linear_table_args[DM_LINEAR_ARGS] = {dev,
+ DM_LINEAR_TARGET_OFFSET};
+ int err = 0;
+
+ android_verity_target.dtr = dm_linear_dtr,
+ android_verity_target.map = dm_linear_map,
+ android_verity_target.status = dm_linear_status,
+ android_verity_target.ioctl = dm_linear_ioctl,
+ android_verity_target.merge = dm_linear_merge,
+ android_verity_target.iterate_devices = dm_linear_iterate_devices,
+ android_verity_target.io_hints = NULL;
+
+ err = dm_linear_ctr(ti, DM_LINEAR_ARGS, linear_table_args);
+
+ if (!err) {
+ DMINFO("Added android-verity as a linear target");
+ target_added = true;
+ } else
+ DMERR("Failed to add android-verity as linear target");
+
+ return err;
+}
+
+static int create_linear_device(struct dm_target *ti, dev_t dev,
+ char *target_device)
+{
+ u64 device_size = 0;
+ int err = find_size(dev, &device_size);
+
+ if (err) {
+ DMERR("error finding bdev size");
+ handle_error();
+ return err;
+ }
+
+ ti->len = device_size;
+ err = add_as_linear_device(ti, target_device);
+ if (err) {
+ handle_error();
+ return err;
+ }
+ verity_enabled = false;
+ return 0;
+}
+
+/*
+ * Target parameters:
+ * <key id> Key id of the public key in the system keyring.
+ * Verity metadata's signature would be verified against
+ * this. If the key id contains spaces, replace them
+ * with '#'.
+ * <block device> The block device for which dm-verity is being setup.
+ */
+static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+ dev_t uninitialized_var(dev);
+ struct android_metadata *metadata = NULL;
+ int err = 0, i, mode;
+ char *key_id, *table_ptr, dummy, *target_device,
+ *verity_table_args[VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS];
+ /* One for specifying number of opt args and one for mode */
+ sector_t data_sectors;
+ u32 data_block_size;
+ unsigned int no_of_args = VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS;
+ struct fec_header uninitialized_var(fec);
+ struct fec_ecc_metadata uninitialized_var(ecc);
+ char buf[FEC_ARG_LENGTH], *buf_ptr;
+ unsigned long long tmpll;
+
+ if (argc == 1) {
+ /* Use the default keyid */
+ if (default_verity_key_id())
+ key_id = veritykeyid;
+ else if (!is_eng()) {
+ DMERR("veritykeyid= is not set");
+ handle_error();
+ return -EINVAL;
+ }
+ } else if (argc == 2)
+ key_id = argv[1];
+ else {
+ DMERR("Incorrect number of arguments");
+ handle_error();
+ return -EINVAL;
+ }
+
+ target_device = argv[0];
+
+ dev = name_to_dev_t(target_device);
+ if (!dev) {
+ DMERR("no dev found for %s", target_device);
+ handle_error();
+ return -EINVAL;
+ }
+
+ if (is_eng())
+ return create_linear_device(ti, dev, target_device);
+
+ strreplace(key_id, '#', ' ');
+
+ DMINFO("key:%s dev:%s", key_id, target_device);
+
+ if (extract_fec_header(dev, &fec, &ecc)) {
+ DMERR("Error while extracting fec header");
+ handle_error();
+ return -EINVAL;
+ }
+
+ err = extract_metadata(dev, &fec, &metadata, &verity_enabled);
+
+ if (err) {
+ /* Allow invalid metadata when the device is unlocked */
+ if (is_unlocked()) {
+ DMWARN("Allow invalid metadata when unlocked");
+ return create_linear_device(ti, dev, target_device);
+ }
+ DMERR("Error while extracting metadata");
+ handle_error();
+ goto free_metadata;
+ }
+
+ if (verity_enabled) {
+ err = verify_verity_signature(key_id, metadata);
+
+ if (err) {
+ DMERR("Signature verification failed");
+ handle_error();
+ goto free_metadata;
+ } else
+ DMINFO("Signature verification success");
+ }
+
+ table_ptr = metadata->verity_table;
+
+ for (i = 0; i < VERITY_TABLE_ARGS; i++) {
+ verity_table_args[i] = strsep(&table_ptr, " ");
+ if (verity_table_args[i] == NULL)
+ break;
+ }
+
+ if (i != VERITY_TABLE_ARGS) {
+ DMERR("Verity table not in the expected format");
+ err = -EINVAL;
+ handle_error();
+ goto free_metadata;
+ }
+
+ if (sscanf(verity_table_args[5], "%llu%c", &tmpll, &dummy)
+ != 1) {
+ DMERR("Verity table not in the expected format");
+ handle_error();
+ err = -EINVAL;
+ goto free_metadata;
+ }
+
+ if (tmpll > ULONG_MAX) {
+ DMERR("<num_data_blocks> too large. Forgot to turn on CONFIG_LBDAF?");
+ handle_error();
+ err = -EINVAL;
+ goto free_metadata;
+ }
+
+ data_sectors = tmpll;
+
+ if (sscanf(verity_table_args[3], "%u%c", &data_block_size, &dummy)
+ != 1) {
+ DMERR("Verity table not in the expected format");
+ handle_error();
+ err = -EINVAL;
+ goto free_metadata;
+ }
+
+ if (test_mult_overflow(data_sectors, data_block_size >>
+ SECTOR_SHIFT)) {
+ DMERR("data_sectors too large");
+ handle_error();
+ err = -EOVERFLOW;
+ goto free_metadata;
+ }
+
+ data_sectors *= data_block_size >> SECTOR_SHIFT;
+ DMINFO("Data sectors %llu", (unsigned long long)data_sectors);
+
+ /* update target length */
+ ti->len = data_sectors;
+
+ /* Setup linear target and free */
+ if (!verity_enabled) {
+ err = add_as_linear_device(ti, target_device);
+ goto free_metadata;
+ }
+
+ /*substitute data_dev and hash_dev*/
+ verity_table_args[1] = target_device;
+ verity_table_args[2] = target_device;
+
+ mode = verity_mode();
+
+ if (ecc.valid && IS_BUILTIN(CONFIG_DM_VERITY_FEC)) {
+ if (mode) {
+ err = snprintf(buf, FEC_ARG_LENGTH,
+ "%u %s " VERITY_TABLE_OPT_FEC_FORMAT,
+ 1 + VERITY_TABLE_OPT_FEC_ARGS,
+ mode == DM_VERITY_MODE_RESTART ?
+ VERITY_TABLE_OPT_RESTART :
+ VERITY_TABLE_OPT_LOGGING,
+ target_device,
+ ecc.start / FEC_BLOCK_SIZE, ecc.blocks,
+ ecc.roots);
+ } else {
+ err = snprintf(buf, FEC_ARG_LENGTH,
+ "%u " VERITY_TABLE_OPT_FEC_FORMAT,
+ VERITY_TABLE_OPT_FEC_ARGS, target_device,
+ ecc.start / FEC_BLOCK_SIZE, ecc.blocks,
+ ecc.roots);
+ }
+ } else if (mode) {
+ err = snprintf(buf, FEC_ARG_LENGTH,
+ "2 " VERITY_TABLE_OPT_IGNZERO " %s",
+ mode == DM_VERITY_MODE_RESTART ?
+ VERITY_TABLE_OPT_RESTART : VERITY_TABLE_OPT_LOGGING);
+ } else {
+ err = snprintf(buf, FEC_ARG_LENGTH, "1 %s",
+ "ignore_zero_blocks");
+ }
+
+ if (err < 0 || err >= FEC_ARG_LENGTH)
+ goto free_metadata;
+
+ buf_ptr = buf;
+
+ for (i = VERITY_TABLE_ARGS; i < (VERITY_TABLE_ARGS +
+ VERITY_TABLE_OPT_FEC_ARGS + 2); i++) {
+ verity_table_args[i] = strsep(&buf_ptr, " ");
+ if (verity_table_args[i] == NULL) {
+ no_of_args = i;
+ break;
+ }
+ }
+
+ err = verity_ctr(ti, no_of_args, verity_table_args);
+
+ if (err)
+ DMERR("android-verity failed to mount as verity target");
+ else {
+ target_added = true;
+ DMINFO("android-verity mounted as verity target");
+ }
+
+free_metadata:
+ if (metadata) {
+ kfree(metadata->header);
+ kfree(metadata->verity_table);
+ }
+ kfree(metadata);
+ return err;
+}
+
+static int __init dm_android_verity_init(void)
+{
+ int r;
+ struct dentry *file;
+
+ r = dm_register_target(&android_verity_target);
+ if (r < 0)
+ DMERR("register failed %d", r);
+
+ /* Tracks the status of the last added target */
+ debug_dir = debugfs_create_dir("android_verity", NULL);
+
+ if (IS_ERR_OR_NULL(debug_dir)) {
+ DMERR("Cannot create android_verity debugfs directory: %ld",
+ PTR_ERR(debug_dir));
+ goto end;
+ }
+
+ file = debugfs_create_bool("target_added", S_IRUGO, debug_dir,
+ (u32 *)&target_added);
+
+ if (IS_ERR_OR_NULL(file)) {
+ DMERR("Cannot create android_verity debugfs directory: %ld",
+ PTR_ERR(debug_dir));
+ debugfs_remove_recursive(debug_dir);
+ goto end;
+ }
+
+ file = debugfs_create_bool("verity_enabled", S_IRUGO, debug_dir,
+ (u32 *)&verity_enabled);
+
+ if (IS_ERR_OR_NULL(file)) {
+ DMERR("Cannot create android_verity debugfs directory: %ld",
+ PTR_ERR(debug_dir));
+ debugfs_remove_recursive(debug_dir);
+ }
+
+end:
+ return r;
+}
+
+static void __exit dm_android_verity_exit(void)
+{
+ if (!IS_ERR_OR_NULL(debug_dir))
+ debugfs_remove_recursive(debug_dir);
+
+ dm_unregister_target(&android_verity_target);
+}
+
+module_init(dm_android_verity_init);
+module_exit(dm_android_verity_exit);
diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h
new file mode 100644
index 000000000000..f43b02fbb475
--- /dev/null
+++ b/drivers/md/dm-android-verity.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (C) 2015 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef DM_ANDROID_VERITY_H
+#define DM_ANDROID_VERITY_H
+
+#include <crypto/sha.h>
+
+#define RSANUMBYTES 256
+#define VERITY_METADATA_MAGIC_NUMBER 0xb001b001
+#define VERITY_METADATA_MAGIC_DISABLE 0x46464f56
+#define VERITY_METADATA_VERSION 0
+#define VERITY_STATE_DISABLE 1
+#define DATA_BLOCK_SIZE (4 * 1024)
+#define VERITY_METADATA_SIZE (8 * DATA_BLOCK_SIZE)
+#define VERITY_TABLE_ARGS 10
+#define VERITY_COMMANDLINE_PARAM_LENGTH 20
+#define BUILD_VARIANT 20
+
+/*
+ * <subject>:<sha1-id> is the format for the identifier.
+ * subject can either be the Common Name(CN) + Organization Name(O) or
+ * just the CN if the it is prefixed with O
+ * From https://tools.ietf.org/html/rfc5280#appendix-A
+ * ub-organization-name-length INTEGER ::= 64
+ * ub-common-name-length INTEGER ::= 64
+ *
+ * http://lxr.free-electrons.com/source/crypto/asymmetric_keys/x509_cert_parser.c?v=3.9#L278
+ * ctx->o_size + 2 + ctx->cn_size + 1
+ * + 41 characters for ":" and sha1 id
+ * 64 + 2 + 64 + 1 + 1 + 40 (172)
+ * setting VERITY_DEFAULT_KEY_ID_LENGTH to 200 characters.
+ */
+#define VERITY_DEFAULT_KEY_ID_LENGTH 200
+
+#define FEC_MAGIC 0xFECFECFE
+#define FEC_BLOCK_SIZE (4 * 1024)
+#define FEC_VERSION 0
+#define FEC_RSM 255
+#define FEC_ARG_LENGTH 300
+
+#define VERITY_TABLE_OPT_RESTART "restart_on_corruption"
+#define VERITY_TABLE_OPT_LOGGING "ignore_corruption"
+#define VERITY_TABLE_OPT_IGNZERO "ignore_zero_blocks"
+
+#define VERITY_TABLE_OPT_FEC_FORMAT \
+ "use_fec_from_device %s fec_start %llu fec_blocks %llu fec_roots %u ignore_zero_blocks"
+#define VERITY_TABLE_OPT_FEC_ARGS 9
+
+#define VERITY_DEBUG 0
+
+#define DM_MSG_PREFIX "android-verity"
+
+#define DM_LINEAR_ARGS 2
+#define DM_LINEAR_TARGET_OFFSET "0"
+
+/*
+ * There can be two formats.
+ * if fec is present
+ * <data_blocks> <verity_tree> <verity_metdata_32K><fec_data><fec_data_4K>
+ * if fec is not present
+ * <data_blocks> <verity_tree> <verity_metdata_32K>
+ */
+struct fec_header {
+ __le32 magic;
+ __le32 version;
+ __le32 size;
+ __le32 roots;
+ __le32 fec_size;
+ __le64 inp_size;
+ u8 hash[SHA256_DIGEST_SIZE];
+} __attribute__((packed));
+
+struct android_metadata_header {
+ __le32 magic_number;
+ __le32 protocol_version;
+ char signature[RSANUMBYTES];
+ __le32 table_length;
+};
+
+struct android_metadata {
+ struct android_metadata_header *header;
+ char *verity_table;
+};
+
+struct fec_ecc_metadata {
+ bool valid;
+ u32 roots;
+ u64 blocks;
+ u64 rounds;
+ u64 start;
+};
+
+struct bio_read {
+ struct page **page_io;
+ int number_of_pages;
+};
+
+extern struct target_type linear_target;
+
+extern void dm_linear_dtr(struct dm_target *ti);
+extern int dm_linear_map(struct dm_target *ti, struct bio *bio);
+extern void dm_linear_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen);
+extern int dm_linear_ioctl(struct dm_target *ti, unsigned int cmd,
+ unsigned long arg);
+extern int dm_linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+ struct bio_vec *biovec, int max_size);
+extern int dm_linear_iterate_devices(struct dm_target *ti,
+ iterate_devices_callout_fn fn, void *data);
+extern int dm_linear_ctr(struct dm_target *ti, unsigned int argc, char **argv);
+#endif /* DM_ANDROID_VERITY_H */
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 3f56ab28cee0..470e2ad33933 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1844,16 +1844,19 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
ret = -ENOMEM;
- cc->io_queue = alloc_workqueue("kcryptd_io", WQ_MEM_RECLAIM, 1);
+ cc->io_queue = alloc_workqueue("kcryptd_io",
+ WQ_HIGHPRI |
+ WQ_MEM_RECLAIM,
+ 1);
if (!cc->io_queue) {
ti->error = "Couldn't create kcryptd io queue";
goto bad;
}
if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags))
- cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1);
+ cc->crypt_queue = alloc_workqueue("kcryptd", WQ_HIGHPRI | WQ_MEM_RECLAIM, 1);
else
- cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND,
+ cc->crypt_queue = alloc_workqueue("kcryptd", WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_UNBOUND,
num_online_cpus());
if (!cc->crypt_queue) {
ti->error = "Couldn't create kcryptd queue";
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 63d64ca80bc0..755050e5587f 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1921,6 +1921,45 @@ void dm_interface_exit(void)
dm_hash_exit();
}
+
+/**
+ * dm_ioctl_export - Permanently export a mapped device via the ioctl interface
+ * @md: Pointer to mapped_device
+ * @name: Buffer (size DM_NAME_LEN) for name
+ * @uuid: Buffer (size DM_UUID_LEN) for uuid or NULL if not desired
+ */
+int dm_ioctl_export(struct mapped_device *md, const char *name,
+ const char *uuid)
+{
+ int r = 0;
+ struct hash_cell *hc;
+
+ if (!md) {
+ r = -ENXIO;
+ goto out;
+ }
+
+ /* The name and uuid can only be set once. */
+ mutex_lock(&dm_hash_cells_mutex);
+ hc = dm_get_mdptr(md);
+ mutex_unlock(&dm_hash_cells_mutex);
+ if (hc) {
+ DMERR("%s: already exported", dm_device_name(md));
+ r = -ENXIO;
+ goto out;
+ }
+
+ r = dm_hash_insert(name, uuid, md);
+ if (r) {
+ DMERR("%s: could not bind to '%s'", dm_device_name(md), name);
+ goto out;
+ }
+
+ /* Let udev know we've changed. */
+ dm_kobject_uevent(md, KOBJ_CHANGE, dm_get_event_nr(md));
+out:
+ return r;
+}
/**
* dm_copy_name_and_uuid - Copy mapped device name & uuid into supplied buffers
* @md: Pointer to mapped_device
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 53e848c10939..acffc181b568 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -25,7 +25,7 @@ struct linear_c {
/*
* Construct a linear mapping: <dev_path> <offset>
*/
-static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+int dm_linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
struct linear_c *lc;
unsigned long long tmp;
@@ -63,14 +63,16 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
kfree(lc);
return -EINVAL;
}
+EXPORT_SYMBOL_GPL(dm_linear_ctr);
-static void linear_dtr(struct dm_target *ti)
+void dm_linear_dtr(struct dm_target *ti)
{
struct linear_c *lc = (struct linear_c *) ti->private;
dm_put_device(ti, lc->dev);
kfree(lc);
}
+EXPORT_SYMBOL_GPL(dm_linear_dtr);
static sector_t linear_map_sector(struct dm_target *ti, sector_t bi_sector)
{
@@ -89,14 +91,15 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio)
linear_map_sector(ti, bio->bi_iter.bi_sector);
}
-static int linear_map(struct dm_target *ti, struct bio *bio)
+int dm_linear_map(struct dm_target *ti, struct bio *bio)
{
linear_map_bio(ti, bio);
return DM_MAPIO_REMAPPED;
}
+EXPORT_SYMBOL_GPL(dm_linear_map);
-static void linear_status(struct dm_target *ti, status_type_t type,
+void dm_linear_status(struct dm_target *ti, status_type_t type,
unsigned status_flags, char *result, unsigned maxlen)
{
struct linear_c *lc = (struct linear_c *) ti->private;
@@ -112,8 +115,9 @@ static void linear_status(struct dm_target *ti, status_type_t type,
break;
}
}
+EXPORT_SYMBOL_GPL(dm_linear_status);
-static int linear_ioctl(struct dm_target *ti, unsigned int cmd,
+int dm_linear_ioctl(struct dm_target *ti, unsigned int cmd,
unsigned long arg)
{
struct linear_c *lc = (struct linear_c *) ti->private;
@@ -129,8 +133,9 @@ static int linear_ioctl(struct dm_target *ti, unsigned int cmd,
return r ? : __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg);
}
+EXPORT_SYMBOL_GPL(dm_linear_ioctl);
-static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+int dm_linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
struct bio_vec *biovec, int max_size)
{
struct linear_c *lc = ti->private;
@@ -144,26 +149,28 @@ static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
}
+EXPORT_SYMBOL_GPL(dm_linear_merge);
-static int linear_iterate_devices(struct dm_target *ti,
+int dm_linear_iterate_devices(struct dm_target *ti,
iterate_devices_callout_fn fn, void *data)
{
struct linear_c *lc = ti->private;
return fn(ti, lc->dev, lc->start, ti->len, data);
}
+EXPORT_SYMBOL_GPL(dm_linear_iterate_devices);
static struct target_type linear_target = {
.name = "linear",
.version = {1, 2, 1},
.module = THIS_MODULE,
- .ctr = linear_ctr,
- .dtr = linear_dtr,
- .map = linear_map,
- .status = linear_status,
- .ioctl = linear_ioctl,
- .merge = linear_merge,
- .iterate_devices = linear_iterate_devices,
+ .ctr = dm_linear_ctr,
+ .dtr = dm_linear_dtr,
+ .map = dm_linear_map,
+ .status = dm_linear_status,
+ .ioctl = dm_linear_ioctl,
+ .merge = dm_linear_merge,
+ .iterate_devices = dm_linear_iterate_devices,
};
int __init dm_linear_init(void)
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index fc93bd80270f..66cbcda8d75c 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -11,6 +11,7 @@
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/namei.h>
+#include <linux/mount.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/slab.h>
diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
new file mode 100644
index 000000000000..1dd667b97530
--- /dev/null
+++ b/drivers/md/dm-verity-fec.c
@@ -0,0 +1,870 @@
+/*
+ * Copyright (C) 2015 Google, Inc.
+ *
+ * Author: Sami Tolvanen <samitolvanen@google.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include "dm-verity-fec.h"
+#include <linux/math64.h>
+#include <linux/sysfs.h>
+
+#define DM_MSG_PREFIX "verity-fec"
+
+/*
+ * If error correction has been configured, returns true.
+ */
+bool verity_fec_is_enabled(struct dm_verity *v)
+{
+ return v->fec && v->fec->dev;
+}
+
+/*
+ * Return a pointer to dm_verity_fec_io after dm_verity_io and its variable
+ * length fields.
+ */
+static inline struct dm_verity_fec_io *fec_io(struct dm_verity_io *io)
+{
+ return (struct dm_verity_fec_io *) verity_io_digest_end(io->v, io);
+}
+
+/*
+ * Return an interleaved offset for a byte in RS block.
+ */
+static inline u64 fec_interleave(struct dm_verity *v, u64 offset)
+{
+ u32 mod;
+
+ mod = do_div(offset, v->fec->rsn);
+ return offset + mod * (v->fec->rounds << v->data_dev_block_bits);
+}
+
+/*
+ * Decode an RS block using Reed-Solomon.
+ */
+static int fec_decode_rs8(struct dm_verity *v, struct dm_verity_fec_io *fio,
+ u8 *data, u8 *fec, int neras)
+{
+ int i;
+ uint16_t par[DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN];
+
+ for (i = 0; i < v->fec->roots; i++)
+ par[i] = fec[i];
+
+ return decode_rs8(fio->rs, data, par, v->fec->rsn, NULL, neras,
+ fio->erasures, 0, NULL);
+}
+
+/*
+ * Read error-correcting codes for the requested RS block. Returns a pointer
+ * to the data block. Caller is responsible for releasing buf.
+ */
+static u8 *fec_read_parity(struct dm_verity *v, u64 rsb, int index,
+ unsigned *offset, struct dm_buffer **buf)
+{
+ u64 position, block;
+ u8 *res;
+
+ position = (index + rsb) * v->fec->roots;
+ block = position >> v->data_dev_block_bits;
+ *offset = (unsigned)(position - (block << v->data_dev_block_bits));
+
+ res = dm_bufio_read(v->fec->bufio, v->fec->start + block, buf);
+ if (unlikely(IS_ERR(res))) {
+ DMERR("%s: FEC %llu: parity read failed (block %llu): %ld",
+ v->data_dev->name, (unsigned long long)rsb,
+ (unsigned long long)(v->fec->start + block),
+ PTR_ERR(res));
+ *buf = NULL;
+ }
+
+ return res;
+}
+
+/* Loop over each preallocated buffer slot. */
+#define fec_for_each_prealloc_buffer(__i) \
+ for (__i = 0; __i < DM_VERITY_FEC_BUF_PREALLOC; __i++)
+
+/* Loop over each extra buffer slot. */
+#define fec_for_each_extra_buffer(io, __i) \
+ for (__i = DM_VERITY_FEC_BUF_PREALLOC; __i < DM_VERITY_FEC_BUF_MAX; __i++)
+
+/* Loop over each allocated buffer. */
+#define fec_for_each_buffer(io, __i) \
+ for (__i = 0; __i < (io)->nbufs; __i++)
+
+/* Loop over each RS block in each allocated buffer. */
+#define fec_for_each_buffer_rs_block(io, __i, __j) \
+ fec_for_each_buffer(io, __i) \
+ for (__j = 0; __j < 1 << DM_VERITY_FEC_BUF_RS_BITS; __j++)
+
+/*
+ * Return a pointer to the current RS block when called inside
+ * fec_for_each_buffer_rs_block.
+ */
+static inline u8 *fec_buffer_rs_block(struct dm_verity *v,
+ struct dm_verity_fec_io *fio,
+ unsigned i, unsigned j)
+{
+ return &fio->bufs[i][j * v->fec->rsn];
+}
+
+/*
+ * Return an index to the current RS block when called inside
+ * fec_for_each_buffer_rs_block.
+ */
+static inline unsigned fec_buffer_rs_index(unsigned i, unsigned j)
+{
+ return (i << DM_VERITY_FEC_BUF_RS_BITS) + j;
+}
+
+/*
+ * Decode all RS blocks from buffers and copy corrected bytes into fio->output
+ * starting from block_offset.
+ */
+static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio,
+ u64 rsb, int byte_index, unsigned block_offset,
+ int neras)
+{
+ int r, corrected = 0, res;
+ struct dm_buffer *buf;
+ unsigned n, i, offset;
+ u8 *par, *block;
+
+ par = fec_read_parity(v, rsb, block_offset, &offset, &buf);
+ if (IS_ERR(par))
+ return PTR_ERR(par);
+
+ /*
+ * Decode the RS blocks we have in bufs. Each RS block results in
+ * one corrected target byte and consumes fec->roots parity bytes.
+ */
+ fec_for_each_buffer_rs_block(fio, n, i) {
+ block = fec_buffer_rs_block(v, fio, n, i);
+ res = fec_decode_rs8(v, fio, block, &par[offset], neras);
+ if (res < 0) {
+ dm_bufio_release(buf);
+
+ r = res;
+ goto error;
+ }
+
+ corrected += res;
+ fio->output[block_offset] = block[byte_index];
+
+ block_offset++;
+ if (block_offset >= 1 << v->data_dev_block_bits)
+ goto done;
+
+ /* read the next block when we run out of parity bytes */
+ offset += v->fec->roots;
+ if (offset >= 1 << v->data_dev_block_bits) {
+ dm_bufio_release(buf);
+
+ par = fec_read_parity(v, rsb, block_offset, &offset, &buf);
+ if (unlikely(IS_ERR(par)))
+ return PTR_ERR(par);
+ }
+ }
+done:
+ r = corrected;
+error:
+ if (r < 0 && neras)
+ DMERR_LIMIT("%s: FEC %llu: failed to correct: %d",
+ v->data_dev->name, (unsigned long long)rsb, r);
+ else if (r > 0) {
+ DMWARN_LIMIT("%s: FEC %llu: corrected %d errors",
+ v->data_dev->name, (unsigned long long)rsb, r);
+ atomic_add_unless(&v->fec->corrected, 1, INT_MAX);
+ }
+
+ return r;
+}
+
+/*
+ * Locate data block erasures using verity hashes.
+ */
+static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io,
+ u8 *want_digest, u8 *data)
+{
+ if (unlikely(verity_hash(v, verity_io_hash_desc(v, io),
+ data, 1 << v->data_dev_block_bits,
+ verity_io_real_digest(v, io))))
+ return 0;
+
+ return memcmp(verity_io_real_digest(v, io), want_digest,
+ v->digest_size) != 0;
+}
+
+/*
+ * Read data blocks that are part of the RS block and deinterleave as much as
+ * fits into buffers. Check for erasure locations if @neras is non-NULL.
+ */
+static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io,
+ u64 rsb, u64 target, unsigned block_offset,
+ int *neras)
+{
+ bool is_zero;
+ int i, j, target_index = -1;
+ struct dm_buffer *buf;
+ struct dm_bufio_client *bufio;
+ struct dm_verity_fec_io *fio = fec_io(io);
+ u64 block, ileaved;
+ u8 *bbuf, *rs_block;
+ u8 want_digest[v->digest_size];
+ unsigned n, k;
+
+ if (neras)
+ *neras = 0;
+
+ /*
+ * read each of the rsn data blocks that are part of the RS block, and
+ * interleave contents to available bufs
+ */
+ for (i = 0; i < v->fec->rsn; i++) {
+ ileaved = fec_interleave(v, rsb * v->fec->rsn + i);
+
+ /*
+ * target is the data block we want to correct, target_index is
+ * the index of this block within the rsn RS blocks
+ */
+ if (ileaved == target)
+ target_index = i;
+
+ block = ileaved >> v->data_dev_block_bits;
+ bufio = v->fec->data_bufio;
+
+ if (block >= v->data_blocks) {
+ block -= v->data_blocks;
+
+ /*
+ * blocks outside the area were assumed to contain
+ * zeros when encoding data was generated
+ */
+ if (unlikely(block >= v->fec->hash_blocks))
+ continue;
+
+ block += v->hash_start;
+ bufio = v->bufio;
+ }
+
+ bbuf = dm_bufio_read(bufio, block, &buf);
+ if (unlikely(IS_ERR(bbuf))) {
+ DMWARN_LIMIT("%s: FEC %llu: read failed (%llu): %ld",
+ v->data_dev->name,
+ (unsigned long long)rsb,
+ (unsigned long long)block, PTR_ERR(bbuf));
+
+ /* assume the block is corrupted */
+ if (neras && *neras <= v->fec->roots)
+ fio->erasures[(*neras)++] = i;
+
+ continue;
+ }
+
+ /* locate erasures if the block is on the data device */
+ if (bufio == v->fec->data_bufio &&
+ verity_hash_for_block(v, io, block, want_digest,
+ &is_zero) == 0) {
+ /* skip known zero blocks entirely */
+ if (is_zero)
+ continue;
+
+ /*
+ * skip if we have already found the theoretical
+ * maximum number (i.e. fec->roots) of erasures
+ */
+ if (neras && *neras <= v->fec->roots &&
+ fec_is_erasure(v, io, want_digest, bbuf))
+ fio->erasures[(*neras)++] = i;
+ }
+
+ /*
+ * deinterleave and copy the bytes that fit into bufs,
+ * starting from block_offset
+ */
+ fec_for_each_buffer_rs_block(fio, n, j) {
+ k = fec_buffer_rs_index(n, j) + block_offset;
+
+ if (k >= 1 << v->data_dev_block_bits)
+ goto done;
+
+ rs_block = fec_buffer_rs_block(v, fio, n, j);
+ rs_block[i] = bbuf[k];
+ }
+done:
+ dm_bufio_release(buf);
+ }
+
+ return target_index;
+}
+
+/*
+ * Allocate RS control structure and FEC buffers from preallocated mempools,
+ * and attempt to allocate as many extra buffers as available.
+ */
+static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio)
+{
+ unsigned n;
+
+ if (!fio->rs) {
+ fio->rs = mempool_alloc(v->fec->rs_pool, 0);
+ if (unlikely(!fio->rs)) {
+ DMERR("failed to allocate RS");
+ return -ENOMEM;
+ }
+ }
+
+ fec_for_each_prealloc_buffer(n) {
+ if (fio->bufs[n])
+ continue;
+
+ fio->bufs[n] = mempool_alloc(v->fec->prealloc_pool, GFP_NOIO);
+ if (unlikely(!fio->bufs[n])) {
+ DMERR("failed to allocate FEC buffer");
+ return -ENOMEM;
+ }
+ }
+
+ /* try to allocate the maximum number of buffers */
+ fec_for_each_extra_buffer(fio, n) {
+ if (fio->bufs[n])
+ continue;
+
+ fio->bufs[n] = mempool_alloc(v->fec->extra_pool, GFP_NOIO);
+ /* we can manage with even one buffer if necessary */
+ if (unlikely(!fio->bufs[n]))
+ break;
+ }
+ fio->nbufs = n;
+
+ if (!fio->output) {
+ fio->output = mempool_alloc(v->fec->output_pool, GFP_NOIO);
+
+ if (!fio->output) {
+ DMERR("failed to allocate FEC page");
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Initialize buffers and clear erasures. fec_read_bufs() assumes buffers are
+ * zeroed before deinterleaving.
+ */
+static void fec_init_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio)
+{
+ unsigned n;
+
+ fec_for_each_buffer(fio, n)
+ memset(fio->bufs[n], 0, v->fec->rsn << DM_VERITY_FEC_BUF_RS_BITS);
+
+ memset(fio->erasures, 0, sizeof(fio->erasures));
+}
+
+/*
+ * Decode all RS blocks in a single data block and return the target block
+ * (indicated by @offset) in fio->output. If @use_erasures is non-zero, uses
+ * hashes to locate erasures.
+ */
+static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io,
+ struct dm_verity_fec_io *fio, u64 rsb, u64 offset,
+ bool use_erasures)
+{
+ int r, neras = 0;
+ unsigned pos;
+
+ r = fec_alloc_bufs(v, fio);
+ if (unlikely(r < 0))
+ return r;
+
+ for (pos = 0; pos < 1 << v->data_dev_block_bits; ) {
+ fec_init_bufs(v, fio);
+
+ r = fec_read_bufs(v, io, rsb, offset, pos,
+ use_erasures ? &neras : NULL);
+ if (unlikely(r < 0))
+ return r;
+
+ r = fec_decode_bufs(v, fio, rsb, r, pos, neras);
+ if (r < 0)
+ return r;
+
+ pos += fio->nbufs << DM_VERITY_FEC_BUF_RS_BITS;
+ }
+
+ /* Always re-validate the corrected block against the expected hash */
+ r = verity_hash(v, verity_io_hash_desc(v, io), fio->output,
+ 1 << v->data_dev_block_bits,
+ verity_io_real_digest(v, io));
+ if (unlikely(r < 0))
+ return r;
+
+ if (memcmp(verity_io_real_digest(v, io), verity_io_want_digest(v, io),
+ v->digest_size)) {
+ DMERR_LIMIT("%s: FEC %llu: failed to correct (%d erasures)",
+ v->data_dev->name, (unsigned long long)rsb, neras);
+ return -EILSEQ;
+ }
+
+ return 0;
+}
+
+static int fec_bv_copy(struct dm_verity *v, struct dm_verity_io *io, u8 *data,
+ size_t len)
+{
+ struct dm_verity_fec_io *fio = fec_io(io);
+
+ memcpy(data, &fio->output[fio->output_pos], len);
+ fio->output_pos += len;
+
+ return 0;
+}
+
+/*
+ * Correct errors in a block. Copies corrected block to dest if non-NULL,
+ * otherwise to a bio_vec starting from iter.
+ */
+int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io,
+ enum verity_block_type type, sector_t block, u8 *dest,
+ struct bvec_iter *iter)
+{
+ int r;
+ struct dm_verity_fec_io *fio = fec_io(io);
+ u64 offset, res, rsb;
+
+ if (!verity_fec_is_enabled(v))
+ return -EOPNOTSUPP;
+
+ if (fio->level >= DM_VERITY_FEC_MAX_RECURSION) {
+ DMWARN_LIMIT("%s: FEC: recursion too deep", v->data_dev->name);
+ return -EIO;
+ }
+
+ fio->level++;
+
+ if (type == DM_VERITY_BLOCK_TYPE_METADATA)
+ block += v->data_blocks;
+
+ /*
+ * For RS(M, N), the continuous FEC data is divided into blocks of N
+ * bytes. Since block size may not be divisible by N, the last block
+ * is zero padded when decoding.
+ *
+ * Each byte of the block is covered by a different RS(M, N) code,
+ * and each code is interleaved over N blocks to make it less likely
+ * that bursty corruption will leave us in unrecoverable state.
+ */
+
+ offset = block << v->data_dev_block_bits;
+ res = div64_u64(offset, v->fec->rounds << v->data_dev_block_bits);
+
+ /*
+ * The base RS block we can feed to the interleaver to find out all
+ * blocks required for decoding.
+ */
+ rsb = offset - res * (v->fec->rounds << v->data_dev_block_bits);
+
+ /*
+ * Locating erasures is slow, so attempt to recover the block without
+ * them first. Do a second attempt with erasures if the corruption is
+ * bad enough.
+ */
+ r = fec_decode_rsb(v, io, fio, rsb, offset, false);
+ if (r < 0) {
+ r = fec_decode_rsb(v, io, fio, rsb, offset, true);
+ if (r < 0)
+ goto done;
+ }
+
+ if (dest)
+ memcpy(dest, fio->output, 1 << v->data_dev_block_bits);
+ else if (iter) {
+ fio->output_pos = 0;
+ r = verity_for_bv_block(v, io, iter, fec_bv_copy);
+ }
+
+done:
+ fio->level--;
+ return r;
+}
+
+/*
+ * Clean up per-bio data.
+ */
+void verity_fec_finish_io(struct dm_verity_io *io)
+{
+ unsigned n;
+ struct dm_verity_fec *f = io->v->fec;
+ struct dm_verity_fec_io *fio = fec_io(io);
+
+ if (!verity_fec_is_enabled(io->v))
+ return;
+
+ mempool_free(fio->rs, f->rs_pool);
+
+ fec_for_each_prealloc_buffer(n)
+ mempool_free(fio->bufs[n], f->prealloc_pool);
+
+ fec_for_each_extra_buffer(fio, n)
+ mempool_free(fio->bufs[n], f->extra_pool);
+
+ mempool_free(fio->output, f->output_pool);
+}
+
+/*
+ * Initialize per-bio data.
+ */
+void verity_fec_init_io(struct dm_verity_io *io)
+{
+ struct dm_verity_fec_io *fio = fec_io(io);
+
+ if (!verity_fec_is_enabled(io->v))
+ return;
+
+ fio->rs = NULL;
+ memset(fio->bufs, 0, sizeof(fio->bufs));
+ fio->nbufs = 0;
+ fio->output = NULL;
+ fio->level = 0;
+}
+
+/*
+ * Append feature arguments and values to the status table.
+ */
+unsigned verity_fec_status_table(struct dm_verity *v, unsigned sz,
+ char *result, unsigned maxlen)
+{
+ if (!verity_fec_is_enabled(v))
+ return sz;
+
+ DMEMIT(" " DM_VERITY_OPT_FEC_DEV " %s "
+ DM_VERITY_OPT_FEC_BLOCKS " %llu "
+ DM_VERITY_OPT_FEC_START " %llu "
+ DM_VERITY_OPT_FEC_ROOTS " %d",
+ v->fec->dev->name,
+ (unsigned long long)v->fec->blocks,
+ (unsigned long long)v->fec->start,
+ v->fec->roots);
+
+ return sz;
+}
+
+void verity_fec_dtr(struct dm_verity *v)
+{
+ struct dm_verity_fec *f = v->fec;
+ struct kobject *kobj = &f->kobj_holder.kobj;
+
+ if (!verity_fec_is_enabled(v))
+ goto out;
+
+ mempool_destroy(f->rs_pool);
+ mempool_destroy(f->prealloc_pool);
+ mempool_destroy(f->extra_pool);
+ kmem_cache_destroy(f->cache);
+
+ if (f->data_bufio)
+ dm_bufio_client_destroy(f->data_bufio);
+ if (f->bufio)
+ dm_bufio_client_destroy(f->bufio);
+
+ if (f->dev)
+ dm_put_device(v->ti, f->dev);
+
+ if (kobj->state_initialized) {
+ kobject_put(kobj);
+ wait_for_completion(dm_get_completion_from_kobject(kobj));
+ }
+
+out:
+ kfree(f);
+ v->fec = NULL;
+}
+
+static void *fec_rs_alloc(gfp_t gfp_mask, void *pool_data)
+{
+ struct dm_verity *v = (struct dm_verity *)pool_data;
+
+ return init_rs(8, 0x11d, 0, 1, v->fec->roots);
+}
+
+static void fec_rs_free(void *element, void *pool_data)
+{
+ struct rs_control *rs = (struct rs_control *)element;
+
+ if (rs)
+ free_rs(rs);
+}
+
+bool verity_is_fec_opt_arg(const char *arg_name)
+{
+ return (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_DEV) ||
+ !strcasecmp(arg_name, DM_VERITY_OPT_FEC_BLOCKS) ||
+ !strcasecmp(arg_name, DM_VERITY_OPT_FEC_START) ||
+ !strcasecmp(arg_name, DM_VERITY_OPT_FEC_ROOTS));
+}
+
+int verity_fec_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v,
+ unsigned *argc, const char *arg_name)
+{
+ int r;
+ struct dm_target *ti = v->ti;
+ const char *arg_value;
+ unsigned long long num_ll;
+ unsigned char num_c;
+ char dummy;
+
+ if (!*argc) {
+ ti->error = "FEC feature arguments require a value";
+ return -EINVAL;
+ }
+
+ arg_value = dm_shift_arg(as);
+ (*argc)--;
+
+ if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_DEV)) {
+ r = dm_get_device(ti, arg_value, FMODE_READ, &v->fec->dev);
+ if (r) {
+ ti->error = "FEC device lookup failed";
+ return r;
+ }
+
+ } else if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_BLOCKS)) {
+ if (sscanf(arg_value, "%llu%c", &num_ll, &dummy) != 1 ||
+ ((sector_t)(num_ll << (v->data_dev_block_bits - SECTOR_SHIFT))
+ >> (v->data_dev_block_bits - SECTOR_SHIFT) != num_ll)) {
+ ti->error = "Invalid " DM_VERITY_OPT_FEC_BLOCKS;
+ return -EINVAL;
+ }
+ v->fec->blocks = num_ll;
+
+ } else if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_START)) {
+ if (sscanf(arg_value, "%llu%c", &num_ll, &dummy) != 1 ||
+ ((sector_t)(num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) >>
+ (v->data_dev_block_bits - SECTOR_SHIFT) != num_ll)) {
+ ti->error = "Invalid " DM_VERITY_OPT_FEC_START;
+ return -EINVAL;
+ }
+ v->fec->start = num_ll;
+
+ } else if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_ROOTS)) {
+ if (sscanf(arg_value, "%hhu%c", &num_c, &dummy) != 1 || !num_c ||
+ num_c < (DM_VERITY_FEC_RSM - DM_VERITY_FEC_MAX_RSN) ||
+ num_c > (DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN)) {
+ ti->error = "Invalid " DM_VERITY_OPT_FEC_ROOTS;
+ return -EINVAL;
+ }
+ v->fec->roots = num_c;
+
+ } else {
+ ti->error = "Unrecognized verity FEC feature request";
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static ssize_t corrected_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct dm_verity_fec *f = container_of(kobj, struct dm_verity_fec,
+ kobj_holder.kobj);
+
+ return sprintf(buf, "%d\n", atomic_read(&f->corrected));
+}
+
+static struct kobj_attribute attr_corrected = __ATTR_RO(corrected);
+
+static struct attribute *fec_attrs[] = {
+ &attr_corrected.attr,
+ NULL
+};
+
+static struct kobj_type fec_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_attrs = fec_attrs,
+ .release = dm_kobject_release
+};
+
+/*
+ * Allocate dm_verity_fec for v->fec. Must be called before verity_fec_ctr.
+ */
+int verity_fec_ctr_alloc(struct dm_verity *v)
+{
+ struct dm_verity_fec *f;
+
+ f = kzalloc(sizeof(struct dm_verity_fec), GFP_KERNEL);
+ if (!f) {
+ v->ti->error = "Cannot allocate FEC structure";
+ return -ENOMEM;
+ }
+ v->fec = f;
+
+ return 0;
+}
+
+/*
+ * Validate arguments and preallocate memory. Must be called after arguments
+ * have been parsed using verity_fec_parse_opt_args.
+ */
+int verity_fec_ctr(struct dm_verity *v)
+{
+ int r;
+ struct dm_verity_fec *f = v->fec;
+ struct dm_target *ti = v->ti;
+ struct mapped_device *md = dm_table_get_md(ti->table);
+ u64 hash_blocks;
+
+ if (!verity_fec_is_enabled(v)) {
+ verity_fec_dtr(v);
+ return 0;
+ }
+
+ /* Create a kobject and sysfs attributes */
+ init_completion(&f->kobj_holder.completion);
+
+ r = kobject_init_and_add(&f->kobj_holder.kobj, &fec_ktype,
+ &disk_to_dev(dm_disk(md))->kobj, "%s", "fec");
+ if (r) {
+ ti->error = "Cannot create kobject";
+ return r;
+ }
+
+ /*
+ * FEC is computed over data blocks, possible metadata, and
+ * hash blocks. In other words, FEC covers total of fec_blocks
+ * blocks consisting of the following:
+ *
+ * data blocks | hash blocks | metadata (optional)
+ *
+ * We allow metadata after hash blocks to support a use case
+ * where all data is stored on the same device and FEC covers
+ * the entire area.
+ *
+ * If metadata is included, we require it to be available on the
+ * hash device after the hash blocks.
+ */
+
+ hash_blocks = v->hash_blocks - v->hash_start;
+
+ /*
+ * Require matching block sizes for data and hash devices for
+ * simplicity.
+ */
+ if (v->data_dev_block_bits != v->hash_dev_block_bits) {
+ ti->error = "Block sizes must match to use FEC";
+ return -EINVAL;
+ }
+
+ if (!f->roots) {
+ ti->error = "Missing " DM_VERITY_OPT_FEC_ROOTS;
+ return -EINVAL;
+ }
+ f->rsn = DM_VERITY_FEC_RSM - f->roots;
+
+ if (!f->blocks) {
+ ti->error = "Missing " DM_VERITY_OPT_FEC_BLOCKS;
+ return -EINVAL;
+ }
+
+ f->rounds = f->blocks;
+ if (sector_div(f->rounds, f->rsn))
+ f->rounds++;
+
+ /*
+ * Due to optional metadata, f->blocks can be larger than
+ * data_blocks and hash_blocks combined.
+ */
+ if (f->blocks < v->data_blocks + hash_blocks || !f->rounds) {
+ ti->error = "Invalid " DM_VERITY_OPT_FEC_BLOCKS;
+ return -EINVAL;
+ }
+
+ /*
+ * Metadata is accessed through the hash device, so we require
+ * it to be large enough.
+ */
+ f->hash_blocks = f->blocks - v->data_blocks;
+ if (dm_bufio_get_device_size(v->bufio) < f->hash_blocks) {
+ ti->error = "Hash device is too small for "
+ DM_VERITY_OPT_FEC_BLOCKS;
+ return -E2BIG;
+ }
+
+ f->bufio = dm_bufio_client_create(f->dev->bdev,
+ 1 << v->data_dev_block_bits,
+ 1, 0, NULL, NULL);
+ if (IS_ERR(f->bufio)) {
+ ti->error = "Cannot initialize FEC bufio client";
+ return PTR_ERR(f->bufio);
+ }
+
+ if (dm_bufio_get_device_size(f->bufio) <
+ ((f->start + f->rounds * f->roots) >> v->data_dev_block_bits)) {
+ ti->error = "FEC device is too small";
+ return -E2BIG;
+ }
+
+ f->data_bufio = dm_bufio_client_create(v->data_dev->bdev,
+ 1 << v->data_dev_block_bits,
+ 1, 0, NULL, NULL);
+ if (IS_ERR(f->data_bufio)) {
+ ti->error = "Cannot initialize FEC data bufio client";
+ return PTR_ERR(f->data_bufio);
+ }
+
+ if (dm_bufio_get_device_size(f->data_bufio) < v->data_blocks) {
+ ti->error = "Data device is too small";
+ return -E2BIG;
+ }
+
+ /* Preallocate an rs_control structure for each worker thread */
+ f->rs_pool = mempool_create(num_online_cpus(), fec_rs_alloc,
+ fec_rs_free, (void *) v);
+ if (!f->rs_pool) {
+ ti->error = "Cannot allocate RS pool";
+ return -ENOMEM;
+ }
+
+ f->cache = kmem_cache_create("dm_verity_fec_buffers",
+ f->rsn << DM_VERITY_FEC_BUF_RS_BITS,
+ 0, 0, NULL);
+ if (!f->cache) {
+ ti->error = "Cannot create FEC buffer cache";
+ return -ENOMEM;
+ }
+
+ /* Preallocate DM_VERITY_FEC_BUF_PREALLOC buffers for each thread */
+ f->prealloc_pool = mempool_create_slab_pool(num_online_cpus() *
+ DM_VERITY_FEC_BUF_PREALLOC,
+ f->cache);
+ if (!f->prealloc_pool) {
+ ti->error = "Cannot allocate FEC buffer prealloc pool";
+ return -ENOMEM;
+ }
+
+ f->extra_pool = mempool_create_slab_pool(0, f->cache);
+ if (!f->extra_pool) {
+ ti->error = "Cannot allocate FEC buffer extra pool";
+ return -ENOMEM;
+ }
+
+ /* Preallocate an output buffer for each thread */
+ f->output_pool = mempool_create_kmalloc_pool(num_online_cpus(),
+ 1 << v->data_dev_block_bits);
+ if (!f->output_pool) {
+ ti->error = "Cannot allocate FEC output pool";
+ return -ENOMEM;
+ }
+
+ /* Reserve space for our per-bio data */
+ ti->per_bio_data_size += sizeof(struct dm_verity_fec_io);
+
+ return 0;
+}
diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h
new file mode 100644
index 000000000000..b8e21cef3ad1
--- /dev/null
+++ b/drivers/md/dm-verity-fec.h
@@ -0,0 +1,159 @@
+/*
+ * Copyright (C) 2015 Google, Inc.
+ *
+ * Author: Sami Tolvanen <samitolvanen@google.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#ifndef DM_VERITY_FEC_H
+#define DM_VERITY_FEC_H
+
+#include "dm.h"
+#include "dm-verity.h"
+#include <linux/rslib.h>
+
+/* Reed-Solomon(M, N) parameters */
+#define DM_VERITY_FEC_RSM 255
+#define DM_VERITY_FEC_MAX_RSN 253
+#define DM_VERITY_FEC_MIN_RSN 231 /* ~10% space overhead */
+
+/* buffers for deinterleaving and decoding */
+#define DM_VERITY_FEC_BUF_PREALLOC 1 /* buffers to preallocate */
+#define DM_VERITY_FEC_BUF_RS_BITS 4 /* 1 << RS blocks per buffer */
+/* we need buffers for at most 1 << block size RS blocks */
+#define DM_VERITY_FEC_BUF_MAX \
+ (1 << (PAGE_SHIFT - DM_VERITY_FEC_BUF_RS_BITS))
+
+/* maximum recursion level for verity_fec_decode */
+#define DM_VERITY_FEC_MAX_RECURSION 4
+
+#define DM_VERITY_OPT_FEC_DEV "use_fec_from_device"
+#define DM_VERITY_OPT_FEC_BLOCKS "fec_blocks"
+#define DM_VERITY_OPT_FEC_START "fec_start"
+#define DM_VERITY_OPT_FEC_ROOTS "fec_roots"
+
+/* configuration */
+struct dm_verity_fec {
+ struct dm_dev *dev; /* parity data device */
+ struct dm_bufio_client *data_bufio; /* for data dev access */
+ struct dm_bufio_client *bufio; /* for parity data access */
+ sector_t start; /* parity data start in blocks */
+ sector_t blocks; /* number of blocks covered */
+ sector_t rounds; /* number of interleaving rounds */
+ sector_t hash_blocks; /* blocks covered after v->hash_start */
+ unsigned char roots; /* number of parity bytes, M-N of RS(M, N) */
+ unsigned char rsn; /* N of RS(M, N) */
+ mempool_t *rs_pool; /* mempool for fio->rs */
+ mempool_t *prealloc_pool; /* mempool for preallocated buffers */
+ mempool_t *extra_pool; /* mempool for extra buffers */
+ mempool_t *output_pool; /* mempool for output */
+ struct kmem_cache *cache; /* cache for buffers */
+ atomic_t corrected; /* corrected errors */
+ struct dm_kobject_holder kobj_holder; /* for sysfs attributes */
+};
+
+/* per-bio data */
+struct dm_verity_fec_io {
+ struct rs_control *rs; /* Reed-Solomon state */
+ int erasures[DM_VERITY_FEC_MAX_RSN]; /* erasures for decode_rs8 */
+ u8 *bufs[DM_VERITY_FEC_BUF_MAX]; /* bufs for deinterleaving */
+ unsigned nbufs; /* number of buffers allocated */
+ u8 *output; /* buffer for corrected output */
+ size_t output_pos;
+ unsigned level; /* recursion level */
+};
+
+#ifdef CONFIG_DM_VERITY_FEC
+
+/* each feature parameter requires a value */
+#define DM_VERITY_OPTS_FEC 8
+
+extern bool verity_fec_is_enabled(struct dm_verity *v);
+
+extern int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io,
+ enum verity_block_type type, sector_t block,
+ u8 *dest, struct bvec_iter *iter);
+
+extern unsigned verity_fec_status_table(struct dm_verity *v, unsigned sz,
+ char *result, unsigned maxlen);
+
+extern void verity_fec_finish_io(struct dm_verity_io *io);
+extern void verity_fec_init_io(struct dm_verity_io *io);
+
+extern bool verity_is_fec_opt_arg(const char *arg_name);
+extern int verity_fec_parse_opt_args(struct dm_arg_set *as,
+ struct dm_verity *v, unsigned *argc,
+ const char *arg_name);
+
+extern void verity_fec_dtr(struct dm_verity *v);
+
+extern int verity_fec_ctr_alloc(struct dm_verity *v);
+extern int verity_fec_ctr(struct dm_verity *v);
+
+#else /* !CONFIG_DM_VERITY_FEC */
+
+#define DM_VERITY_OPTS_FEC 0
+
+static inline bool verity_fec_is_enabled(struct dm_verity *v)
+{
+ return false;
+}
+
+static inline int verity_fec_decode(struct dm_verity *v,
+ struct dm_verity_io *io,
+ enum verity_block_type type,
+ sector_t block, u8 *dest,
+ struct bvec_iter *iter)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline unsigned verity_fec_status_table(struct dm_verity *v,
+ unsigned sz, char *result,
+ unsigned maxlen)
+{
+ return sz;
+}
+
+static inline void verity_fec_finish_io(struct dm_verity_io *io)
+{
+}
+
+static inline void verity_fec_init_io(struct dm_verity_io *io)
+{
+}
+
+static inline bool verity_is_fec_opt_arg(const char *arg_name)
+{
+ return false;
+}
+
+static inline int verity_fec_parse_opt_args(struct dm_arg_set *as,
+ struct dm_verity *v,
+ unsigned *argc,
+ const char *arg_name)
+{
+ return -EINVAL;
+}
+
+static inline void verity_fec_dtr(struct dm_verity *v)
+{
+}
+
+static inline int verity_fec_ctr_alloc(struct dm_verity *v)
+{
+ return 0;
+}
+
+static inline int verity_fec_ctr(struct dm_verity *v)
+{
+ return 0;
+}
+
+#endif /* CONFIG_DM_VERITY_FEC */
+
+#endif /* DM_VERITY_FEC_H */
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity-target.c
index 7a7bab8947ae..db8d6d01f429 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity-target.c
@@ -14,79 +14,30 @@
* access behavior.
*/
-#include "dm-bufio.h"
+#include "dm-verity.h"
+#include "dm-verity-fec.h"
#include <linux/module.h>
-#include <linux/device-mapper.h>
-#include <crypto/hash.h>
+#include <linux/reboot.h>
#define DM_MSG_PREFIX "verity"
-#define DM_VERITY_IO_VEC_INLINE 16
-#define DM_VERITY_MEMPOOL_SIZE 4
-#define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144
-
-#define DM_VERITY_MAX_LEVELS 63
-
-static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE;
-
-module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR);
-
-struct dm_verity {
- struct dm_dev *data_dev;
- struct dm_dev *hash_dev;
- struct dm_target *ti;
- struct dm_bufio_client *bufio;
- char *alg_name;
- struct crypto_shash *tfm;
- u8 *root_digest; /* digest of the root block */
- u8 *salt; /* salt: its size is salt_size */
- unsigned salt_size;
- sector_t data_start; /* data offset in 512-byte sectors */
- sector_t hash_start; /* hash start in blocks */
- sector_t data_blocks; /* the number of data blocks */
- sector_t hash_blocks; /* the number of hash blocks */
- unsigned char data_dev_block_bits; /* log2(data blocksize) */
- unsigned char hash_dev_block_bits; /* log2(hash blocksize) */
- unsigned char hash_per_block_bits; /* log2(hashes in hash block) */
- unsigned char levels; /* the number of tree levels */
- unsigned char version;
- unsigned digest_size; /* digest size for the current hash algorithm */
- unsigned shash_descsize;/* the size of temporary space for crypto */
- int hash_failed; /* set to 1 if hash of any block failed */
-
- mempool_t *vec_mempool; /* mempool of bio vector */
-
- struct workqueue_struct *verify_wq;
-
- /* starting blocks for each tree level. 0 is the lowest level. */
- sector_t hash_level_block[DM_VERITY_MAX_LEVELS];
-};
+#define DM_VERITY_ENV_LENGTH 42
+#define DM_VERITY_ENV_VAR_NAME "DM_VERITY_ERR_BLOCK_NR"
-struct dm_verity_io {
- struct dm_verity *v;
+#define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144
- /* original values of bio->bi_end_io and bio->bi_private */
- bio_end_io_t *orig_bi_end_io;
- void *orig_bi_private;
+#define DM_VERITY_MAX_CORRUPTED_ERRS 100
- sector_t block;
- unsigned n_blocks;
+#define DM_VERITY_OPT_LOGGING "ignore_corruption"
+#define DM_VERITY_OPT_RESTART "restart_on_corruption"
+#define DM_VERITY_OPT_IGN_ZEROES "ignore_zero_blocks"
- struct bvec_iter iter;
+#define DM_VERITY_OPTS_MAX (2 + DM_VERITY_OPTS_FEC)
- struct work_struct work;
+static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE;
- /*
- * Three variably-size fields follow this struct:
- *
- * u8 hash_desc[v->shash_descsize];
- * u8 real_digest[v->digest_size];
- * u8 want_digest[v->digest_size];
- *
- * To access them use: io_hash_desc(), io_real_digest() and io_want_digest().
- */
-};
+module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR);
struct dm_verity_prefetch_work {
struct work_struct work;
@@ -95,21 +46,6 @@ struct dm_verity_prefetch_work {
unsigned n_blocks;
};
-static struct shash_desc *io_hash_desc(struct dm_verity *v, struct dm_verity_io *io)
-{
- return (struct shash_desc *)(io + 1);
-}
-
-static u8 *io_real_digest(struct dm_verity *v, struct dm_verity_io *io)
-{
- return (u8 *)(io + 1) + v->shash_descsize;
-}
-
-static u8 *io_want_digest(struct dm_verity *v, struct dm_verity_io *io)
-{
- return (u8 *)(io + 1) + v->shash_descsize + v->digest_size;
-}
-
/*
* Auxiliary structure appended to each dm-bufio buffer. If the value
* hash_verified is nonzero, hash of the block has been verified.
@@ -156,6 +92,84 @@ static sector_t verity_position_at_level(struct dm_verity *v, sector_t block,
return block >> (level * v->hash_per_block_bits);
}
+/*
+ * Wrapper for crypto_shash_init, which handles verity salting.
+ */
+static int verity_hash_init(struct dm_verity *v, struct shash_desc *desc)
+{
+ int r;
+
+ desc->tfm = v->tfm;
+ desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ r = crypto_shash_init(desc);
+
+ if (unlikely(r < 0)) {
+ DMERR("crypto_shash_init failed: %d", r);
+ return r;
+ }
+
+ if (likely(v->version >= 1)) {
+ r = crypto_shash_update(desc, v->salt, v->salt_size);
+
+ if (unlikely(r < 0)) {
+ DMERR("crypto_shash_update failed: %d", r);
+ return r;
+ }
+ }
+
+ return 0;
+}
+
+static int verity_hash_update(struct dm_verity *v, struct shash_desc *desc,
+ const u8 *data, size_t len)
+{
+ int r = crypto_shash_update(desc, data, len);
+
+ if (unlikely(r < 0))
+ DMERR("crypto_shash_update failed: %d", r);
+
+ return r;
+}
+
+static int verity_hash_final(struct dm_verity *v, struct shash_desc *desc,
+ u8 *digest)
+{
+ int r;
+
+ if (unlikely(!v->version)) {
+ r = crypto_shash_update(desc, v->salt, v->salt_size);
+
+ if (r < 0) {
+ DMERR("crypto_shash_update failed: %d", r);
+ return r;
+ }
+ }
+
+ r = crypto_shash_final(desc, digest);
+
+ if (unlikely(r < 0))
+ DMERR("crypto_shash_final failed: %d", r);
+
+ return r;
+}
+
+int verity_hash(struct dm_verity *v, struct shash_desc *desc,
+ const u8 *data, size_t len, u8 *digest)
+{
+ int r;
+
+ r = verity_hash_init(v, desc);
+ if (unlikely(r < 0))
+ return r;
+
+ r = verity_hash_update(v, desc, data, len);
+ if (unlikely(r < 0))
+ return r;
+
+ return verity_hash_final(v, desc, digest);
+}
+
static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level,
sector_t *hash_block, unsigned *offset)
{
@@ -175,20 +189,71 @@ static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level,
}
/*
+ * Handle verification errors.
+ */
+static int verity_handle_err(struct dm_verity *v, enum verity_block_type type,
+ unsigned long long block)
+{
+ char verity_env[DM_VERITY_ENV_LENGTH];
+ char *envp[] = { verity_env, NULL };
+ const char *type_str = "";
+ struct mapped_device *md = dm_table_get_md(v->ti->table);
+
+ /* Corruption should be visible in device status in all modes */
+ v->hash_failed = 1;
+
+ if (v->corrupted_errs >= DM_VERITY_MAX_CORRUPTED_ERRS)
+ goto out;
+
+ v->corrupted_errs++;
+
+ switch (type) {
+ case DM_VERITY_BLOCK_TYPE_DATA:
+ type_str = "data";
+ break;
+ case DM_VERITY_BLOCK_TYPE_METADATA:
+ type_str = "metadata";
+ break;
+ default:
+ BUG();
+ }
+
+ DMERR("%s: %s block %llu is corrupted", v->data_dev->name, type_str,
+ block);
+
+ if (v->corrupted_errs == DM_VERITY_MAX_CORRUPTED_ERRS)
+ DMERR("%s: reached maximum errors", v->data_dev->name);
+
+ snprintf(verity_env, DM_VERITY_ENV_LENGTH, "%s=%d,%llu",
+ DM_VERITY_ENV_VAR_NAME, type, block);
+
+ kobject_uevent_env(&disk_to_dev(dm_disk(md))->kobj, KOBJ_CHANGE, envp);
+
+out:
+ if (v->mode == DM_VERITY_MODE_LOGGING)
+ return 0;
+
+ if (v->mode == DM_VERITY_MODE_RESTART)
+ kernel_restart("dm-verity device corrupted");
+
+ return 1;
+}
+
+/*
* Verify hash of a metadata block pertaining to the specified data block
* ("block" argument) at a specified level ("level" argument).
*
- * On successful return, io_want_digest(v, io) contains the hash value for
- * a lower tree level or for the data block (if we're at the lowest leve).
+ * On successful return, verity_io_want_digest(v, io) contains the hash value
+ * for a lower tree level or for the data block (if we're at the lowest level).
*
* If "skip_unverified" is true, unverified buffer is skipped and 1 is returned.
* If "skip_unverified" is false, unverified buffer is hashed and verified
- * against current value of io_want_digest(v, io).
+ * against current value of verity_io_want_digest(v, io).
*/
-static int verity_verify_level(struct dm_verity_io *io, sector_t block,
- int level, bool skip_unverified)
+static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
+ sector_t block, int level, bool skip_unverified,
+ u8 *want_digest)
{
- struct dm_verity *v = io->v;
struct dm_buffer *buf;
struct buffer_aux *aux;
u8 *data;
@@ -199,78 +264,134 @@ static int verity_verify_level(struct dm_verity_io *io, sector_t block,
verity_hash_at_level(v, block, level, &hash_block, &offset);
data = dm_bufio_read(v->bufio, hash_block, &buf);
- if (unlikely(IS_ERR(data)))
+ if (IS_ERR(data))
return PTR_ERR(data);
aux = dm_bufio_get_aux_data(buf);
if (!aux->hash_verified) {
- struct shash_desc *desc;
- u8 *result;
-
if (skip_unverified) {
r = 1;
goto release_ret_r;
}
- desc = io_hash_desc(v, io);
- desc->tfm = v->tfm;
- desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
- r = crypto_shash_init(desc);
- if (r < 0) {
- DMERR("crypto_shash_init failed: %d", r);
+ r = verity_hash(v, verity_io_hash_desc(v, io),
+ data, 1 << v->hash_dev_block_bits,
+ verity_io_real_digest(v, io));
+ if (unlikely(r < 0))
goto release_ret_r;
- }
- if (likely(v->version >= 1)) {
- r = crypto_shash_update(desc, v->salt, v->salt_size);
- if (r < 0) {
- DMERR("crypto_shash_update failed: %d", r);
- goto release_ret_r;
- }
- }
-
- r = crypto_shash_update(desc, data, 1 << v->hash_dev_block_bits);
- if (r < 0) {
- DMERR("crypto_shash_update failed: %d", r);
+ if (likely(memcmp(verity_io_real_digest(v, io), want_digest,
+ v->digest_size) == 0))
+ aux->hash_verified = 1;
+ else if (verity_fec_decode(v, io,
+ DM_VERITY_BLOCK_TYPE_METADATA,
+ hash_block, data, NULL) == 0)
+ aux->hash_verified = 1;
+ else if (verity_handle_err(v,
+ DM_VERITY_BLOCK_TYPE_METADATA,
+ hash_block)) {
+ r = -EIO;
goto release_ret_r;
}
+ }
- if (!v->version) {
- r = crypto_shash_update(desc, v->salt, v->salt_size);
- if (r < 0) {
- DMERR("crypto_shash_update failed: %d", r);
- goto release_ret_r;
- }
- }
+ data += offset;
+ memcpy(want_digest, data, v->digest_size);
+ r = 0;
- result = io_real_digest(v, io);
- r = crypto_shash_final(desc, result);
- if (r < 0) {
- DMERR("crypto_shash_final failed: %d", r);
- goto release_ret_r;
- }
- if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) {
- DMERR_LIMIT("metadata block %llu is corrupted",
- (unsigned long long)hash_block);
- v->hash_failed = 1;
- r = -EIO;
- goto release_ret_r;
- } else
- aux->hash_verified = 1;
+release_ret_r:
+ dm_bufio_release(buf);
+ return r;
+}
+
+/*
+ * Find a hash for a given block, write it to digest and verify the integrity
+ * of the hash tree if necessary.
+ */
+int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io,
+ sector_t block, u8 *digest, bool *is_zero)
+{
+ int r = 0, i;
+
+ if (likely(v->levels)) {
+ /*
+ * First, we try to get the requested hash for
+ * the current block. If the hash block itself is
+ * verified, zero is returned. If it isn't, this
+ * function returns 1 and we fall back to whole
+ * chain verification.
+ */
+ r = verity_verify_level(v, io, block, 0, true, digest);
+ if (likely(r <= 0))
+ goto out;
}
- data += offset;
+ memcpy(digest, v->root_digest, v->digest_size);
- memcpy(io_want_digest(v, io), data, v->digest_size);
+ for (i = v->levels - 1; i >= 0; i--) {
+ r = verity_verify_level(v, io, block, i, false, digest);
+ if (unlikely(r))
+ goto out;
+ }
+out:
+ if (!r && v->zero_digest)
+ *is_zero = !memcmp(v->zero_digest, digest, v->digest_size);
+ else
+ *is_zero = false;
+
+ return r;
+}
+
+/*
+ * Calls function process for 1 << v->data_dev_block_bits bytes in the bio_vec
+ * starting from iter.
+ */
+int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io,
+ struct bvec_iter *iter,
+ int (*process)(struct dm_verity *v,
+ struct dm_verity_io *io, u8 *data,
+ size_t len))
+{
+ unsigned todo = 1 << v->data_dev_block_bits;
+ struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_bio_data_size);
+
+ do {
+ int r;
+ u8 *page;
+ unsigned len;
+ struct bio_vec bv = bio_iter_iovec(bio, *iter);
+
+ page = kmap_atomic(bv.bv_page);
+ len = bv.bv_len;
+
+ if (likely(len >= todo))
+ len = todo;
+
+ r = process(v, io, page + bv.bv_offset, len);
+ kunmap_atomic(page);
+
+ if (r < 0)
+ return r;
+
+ bio_advance_iter(bio, iter, len);
+ todo -= len;
+ } while (todo);
- dm_bufio_release(buf);
return 0;
+}
-release_ret_r:
- dm_bufio_release(buf);
+static int verity_bv_hash_update(struct dm_verity *v, struct dm_verity_io *io,
+ u8 *data, size_t len)
+{
+ return verity_hash_update(v, verity_io_hash_desc(v, io), data, len);
+}
- return r;
+static int verity_bv_zero(struct dm_verity *v, struct dm_verity_io *io,
+ u8 *data, size_t len)
+{
+ memset(data, 0, len);
+ return 0;
}
/*
@@ -278,100 +399,56 @@ release_ret_r:
*/
static int verity_verify_io(struct dm_verity_io *io)
{
+ bool is_zero;
struct dm_verity *v = io->v;
- struct bio *bio = dm_bio_from_per_bio_data(io,
- v->ti->per_bio_data_size);
+ struct bvec_iter start;
unsigned b;
- int i;
for (b = 0; b < io->n_blocks; b++) {
- struct shash_desc *desc;
- u8 *result;
int r;
- unsigned todo;
+ struct shash_desc *desc = verity_io_hash_desc(v, io);
+
+ r = verity_hash_for_block(v, io, io->block + b,
+ verity_io_want_digest(v, io),
+ &is_zero);
+ if (unlikely(r < 0))
+ return r;
- if (likely(v->levels)) {
+ if (is_zero) {
/*
- * First, we try to get the requested hash for
- * the current block. If the hash block itself is
- * verified, zero is returned. If it isn't, this
- * function returns 0 and we fall back to whole
- * chain verification.
+ * If we expect a zero block, don't validate, just
+ * return zeros.
*/
- int r = verity_verify_level(io, io->block + b, 0, true);
- if (likely(!r))
- goto test_block_hash;
- if (r < 0)
+ r = verity_for_bv_block(v, io, &io->iter,
+ verity_bv_zero);
+ if (unlikely(r < 0))
return r;
- }
- memcpy(io_want_digest(v, io), v->root_digest, v->digest_size);
-
- for (i = v->levels - 1; i >= 0; i--) {
- int r = verity_verify_level(io, io->block + b, i, false);
- if (unlikely(r))
- return r;
+ continue;
}
-test_block_hash:
- desc = io_hash_desc(v, io);
- desc->tfm = v->tfm;
- desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
- r = crypto_shash_init(desc);
- if (r < 0) {
- DMERR("crypto_shash_init failed: %d", r);
+ r = verity_hash_init(v, desc);
+ if (unlikely(r < 0))
return r;
- }
-
- if (likely(v->version >= 1)) {
- r = crypto_shash_update(desc, v->salt, v->salt_size);
- if (r < 0) {
- DMERR("crypto_shash_update failed: %d", r);
- return r;
- }
- }
- todo = 1 << v->data_dev_block_bits;
- do {
- u8 *page;
- unsigned len;
- struct bio_vec bv = bio_iter_iovec(bio, io->iter);
-
- page = kmap_atomic(bv.bv_page);
- len = bv.bv_len;
- if (likely(len >= todo))
- len = todo;
- r = crypto_shash_update(desc, page + bv.bv_offset, len);
- kunmap_atomic(page);
-
- if (r < 0) {
- DMERR("crypto_shash_update failed: %d", r);
- return r;
- }
- bio_advance_iter(bio, &io->iter, len);
- todo -= len;
- } while (todo);
-
- if (!v->version) {
- r = crypto_shash_update(desc, v->salt, v->salt_size);
- if (r < 0) {
- DMERR("crypto_shash_update failed: %d", r);
- return r;
- }
- }
+ start = io->iter;
+ r = verity_for_bv_block(v, io, &io->iter, verity_bv_hash_update);
+ if (unlikely(r < 0))
+ return r;
- result = io_real_digest(v, io);
- r = crypto_shash_final(desc, result);
- if (r < 0) {
- DMERR("crypto_shash_final failed: %d", r);
+ r = verity_hash_final(v, desc, verity_io_real_digest(v, io));
+ if (unlikely(r < 0))
return r;
- }
- if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) {
- DMERR_LIMIT("data block %llu is corrupted",
- (unsigned long long)(io->block + b));
- v->hash_failed = 1;
+
+ if (likely(memcmp(verity_io_real_digest(v, io),
+ verity_io_want_digest(v, io), v->digest_size) == 0))
+ continue;
+ else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA,
+ io->block + b, NULL, &start) == 0)
+ continue;
+ else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA,
+ io->block + b))
return -EIO;
- }
}
return 0;
@@ -388,6 +465,8 @@ static void verity_finish_io(struct dm_verity_io *io, int error)
bio->bi_end_io = io->orig_bi_end_io;
bio->bi_private = io->orig_bi_private;
+ verity_fec_finish_io(io);
+
bio_endio_nodec(bio, error);
}
@@ -402,7 +481,7 @@ static void verity_end_io(struct bio *bio, int error)
{
struct dm_verity_io *io = bio->bi_private;
- if (error) {
+ if (error && !verity_fec_is_enabled(io->v)) {
verity_finish_io(io, error);
return;
}
@@ -422,6 +501,7 @@ static void verity_prefetch_io(struct work_struct *work)
container_of(work, struct dm_verity_prefetch_work, work);
struct dm_verity *v = pw->v;
int i;
+ sector_t prefetch_size;
for (i = v->levels - 2; i >= 0; i--) {
sector_t hash_block_start;
@@ -444,8 +524,14 @@ static void verity_prefetch_io(struct work_struct *work)
hash_block_end = v->hash_blocks - 1;
}
no_prefetch_cluster:
+ // for emmc, it is more efficient to send bigger read
+ prefetch_size = max((sector_t)CONFIG_DM_VERITY_HASH_PREFETCH_MIN_SIZE,
+ hash_block_end - hash_block_start + 1);
+ if ((hash_block_start + prefetch_size) >= (v->hash_start + v->hash_blocks)) {
+ prefetch_size = hash_block_end - hash_block_start + 1;
+ }
dm_bufio_prefetch(v->bufio, hash_block_start,
- hash_block_end - hash_block_start + 1);
+ prefetch_size);
}
kfree(pw);
@@ -472,7 +558,7 @@ static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io)
* Bio map function. It allocates dm_verity_io structure and bio vector and
* fills them. Then it issues prefetches and the I/O.
*/
-static int verity_map(struct dm_target *ti, struct bio *bio)
+int verity_map(struct dm_target *ti, struct bio *bio)
{
struct dm_verity *v = ti->private;
struct dm_verity_io *io;
@@ -506,20 +592,24 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
bio->bi_private = io;
io->iter = bio->bi_iter;
+ verity_fec_init_io(io);
+
verity_submit_prefetch(v, io);
generic_make_request(bio);
return DM_MAPIO_SUBMITTED;
}
+EXPORT_SYMBOL_GPL(verity_map);
/*
* Status: V (valid) or C (corruption found)
*/
-static void verity_status(struct dm_target *ti, status_type_t type,
+void verity_status(struct dm_target *ti, status_type_t type,
unsigned status_flags, char *result, unsigned maxlen)
{
struct dm_verity *v = ti->private;
+ unsigned args = 0;
unsigned sz = 0;
unsigned x;
@@ -546,11 +636,37 @@ static void verity_status(struct dm_target *ti, status_type_t type,
else
for (x = 0; x < v->salt_size; x++)
DMEMIT("%02x", v->salt[x]);
+ if (v->mode != DM_VERITY_MODE_EIO)
+ args++;
+ if (verity_fec_is_enabled(v))
+ args += DM_VERITY_OPTS_FEC;
+ if (v->zero_digest)
+ args++;
+ if (!args)
+ return;
+ DMEMIT(" %u", args);
+ if (v->mode != DM_VERITY_MODE_EIO) {
+ DMEMIT(" ");
+ switch (v->mode) {
+ case DM_VERITY_MODE_LOGGING:
+ DMEMIT(DM_VERITY_OPT_LOGGING);
+ break;
+ case DM_VERITY_MODE_RESTART:
+ DMEMIT(DM_VERITY_OPT_RESTART);
+ break;
+ default:
+ BUG();
+ }
+ }
+ if (v->zero_digest)
+ DMEMIT(" " DM_VERITY_OPT_IGN_ZEROES);
+ sz = verity_fec_status_table(v, sz, result, maxlen);
break;
}
}
+EXPORT_SYMBOL_GPL(verity_status);
-static int verity_ioctl(struct dm_target *ti, unsigned cmd,
+int verity_ioctl(struct dm_target *ti, unsigned cmd,
unsigned long arg)
{
struct dm_verity *v = ti->private;
@@ -563,8 +679,9 @@ static int verity_ioctl(struct dm_target *ti, unsigned cmd,
return r ? : __blkdev_driver_ioctl(v->data_dev->bdev, v->data_dev->mode,
cmd, arg);
}
+EXPORT_SYMBOL_GPL(verity_ioctl);
-static int verity_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+int verity_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
struct bio_vec *biovec, int max_size)
{
struct dm_verity *v = ti->private;
@@ -578,16 +695,18 @@ static int verity_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
}
+EXPORT_SYMBOL_GPL(verity_merge);
-static int verity_iterate_devices(struct dm_target *ti,
+int verity_iterate_devices(struct dm_target *ti,
iterate_devices_callout_fn fn, void *data)
{
struct dm_verity *v = ti->private;
return fn(ti, v->data_dev, v->data_start, ti->len, data);
}
+EXPORT_SYMBOL_GPL(verity_iterate_devices);
-static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits)
+void verity_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
struct dm_verity *v = ti->private;
@@ -599,22 +718,21 @@ static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits)
blk_limits_io_min(limits, limits->logical_block_size);
}
+EXPORT_SYMBOL_GPL(verity_io_hints);
-static void verity_dtr(struct dm_target *ti)
+void verity_dtr(struct dm_target *ti)
{
struct dm_verity *v = ti->private;
if (v->verify_wq)
destroy_workqueue(v->verify_wq);
- if (v->vec_mempool)
- mempool_destroy(v->vec_mempool);
-
if (v->bufio)
dm_bufio_client_destroy(v->bufio);
kfree(v->salt);
kfree(v->root_digest);
+ kfree(v->zero_digest);
if (v->tfm)
crypto_free_shash(v->tfm);
@@ -627,8 +745,94 @@ static void verity_dtr(struct dm_target *ti)
if (v->data_dev)
dm_put_device(ti, v->data_dev);
+ verity_fec_dtr(v);
+
kfree(v);
}
+EXPORT_SYMBOL_GPL(verity_dtr);
+
+static int verity_alloc_zero_digest(struct dm_verity *v)
+{
+ int r = -ENOMEM;
+ struct shash_desc *desc;
+ u8 *zero_data;
+
+ v->zero_digest = kmalloc(v->digest_size, GFP_KERNEL);
+
+ if (!v->zero_digest)
+ return r;
+
+ desc = kmalloc(v->shash_descsize, GFP_KERNEL);
+
+ if (!desc)
+ return r; /* verity_dtr will free zero_digest */
+
+ zero_data = kzalloc(1 << v->data_dev_block_bits, GFP_KERNEL);
+
+ if (!zero_data)
+ goto out;
+
+ r = verity_hash(v, desc, zero_data, 1 << v->data_dev_block_bits,
+ v->zero_digest);
+
+out:
+ kfree(desc);
+ kfree(zero_data);
+
+ return r;
+}
+
+static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v)
+{
+ int r;
+ unsigned argc;
+ struct dm_target *ti = v->ti;
+ const char *arg_name;
+
+ static struct dm_arg _args[] = {
+ {0, DM_VERITY_OPTS_MAX, "Invalid number of feature args"},
+ };
+
+ r = dm_read_arg_group(_args, as, &argc, &ti->error);
+ if (r)
+ return -EINVAL;
+
+ if (!argc)
+ return 0;
+
+ do {
+ arg_name = dm_shift_arg(as);
+ argc--;
+
+ if (!strcasecmp(arg_name, DM_VERITY_OPT_LOGGING)) {
+ v->mode = DM_VERITY_MODE_LOGGING;
+ continue;
+
+ } else if (!strcasecmp(arg_name, DM_VERITY_OPT_RESTART)) {
+ v->mode = DM_VERITY_MODE_RESTART;
+ continue;
+
+ } else if (!strcasecmp(arg_name, DM_VERITY_OPT_IGN_ZEROES)) {
+ r = verity_alloc_zero_digest(v);
+ if (r) {
+ ti->error = "Cannot allocate zero digest";
+ return r;
+ }
+ continue;
+
+ } else if (verity_is_fec_opt_arg(arg_name)) {
+ r = verity_fec_parse_opt_args(as, v, &argc, arg_name);
+ if (r)
+ return r;
+ continue;
+ }
+
+ ti->error = "Unrecognized verity feature request";
+ return -EINVAL;
+ } while (argc && !r);
+
+ return r;
+}
/*
* Target parameters:
@@ -644,10 +848,11 @@ static void verity_dtr(struct dm_target *ti)
* <digest>
* <salt> Hex string or "-" if no salt.
*/
-static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
+int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
struct dm_verity *v;
- unsigned num;
+ struct dm_arg_set as;
+ unsigned int num;
unsigned long long num_ll;
int r;
int i;
@@ -662,14 +867,18 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
ti->private = v;
v->ti = ti;
+ r = verity_fec_ctr_alloc(v);
+ if (r)
+ goto bad;
+
if ((dm_table_get_mode(ti->table) & ~FMODE_READ)) {
ti->error = "Device must be readonly";
r = -EINVAL;
goto bad;
}
- if (argc != 10) {
- ti->error = "Invalid argument count: exactly 10 arguments required";
+ if (argc < 10) {
+ ti->error = "Not enough arguments";
r = -EINVAL;
goto bad;
}
@@ -790,6 +999,19 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
}
}
+ argv += 10;
+ argc -= 10;
+
+ /* Optional parameters */
+ if (argc) {
+ as.argc = argc;
+ as.argv = argv;
+
+ r = verity_parse_opt_args(&as, v);
+ if (r < 0)
+ goto bad;
+ }
+
v->hash_per_block_bits =
__fls((1 << v->hash_dev_block_bits) / v->digest_size);
@@ -837,24 +1059,26 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto bad;
}
- ti->per_bio_data_size = roundup(sizeof(struct dm_verity_io) + v->shash_descsize + v->digest_size * 2, __alignof__(struct dm_verity_io));
-
- v->vec_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE,
- BIO_MAX_PAGES * sizeof(struct bio_vec));
- if (!v->vec_mempool) {
- ti->error = "Cannot allocate vector mempool";
- r = -ENOMEM;
- goto bad;
- }
-
/* WQ_UNBOUND greatly improves performance when running on ramdisk */
- v->verify_wq = alloc_workqueue("kverityd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus());
+ v->verify_wq = alloc_workqueue("kverityd",
+ WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_UNBOUND,
+ num_online_cpus());
if (!v->verify_wq) {
ti->error = "Cannot allocate workqueue";
r = -ENOMEM;
goto bad;
}
+ ti->per_bio_data_size = sizeof(struct dm_verity_io) +
+ v->shash_descsize + v->digest_size * 2;
+
+ r = verity_fec_ctr(v);
+ if (r)
+ goto bad;
+
+ ti->per_bio_data_size = roundup(ti->per_bio_data_size,
+ __alignof__(struct dm_verity_io));
+
return 0;
bad:
@@ -862,10 +1086,11 @@ bad:
return r;
}
+EXPORT_SYMBOL_GPL(verity_ctr);
static struct target_type verity_target = {
.name = "verity",
- .version = {1, 2, 0},
+ .version = {1, 3, 0},
.module = THIS_MODULE,
.ctr = verity_ctr,
.dtr = verity_dtr,
diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h
new file mode 100644
index 000000000000..47a96a56794b
--- /dev/null
+++ b/drivers/md/dm-verity.h
@@ -0,0 +1,142 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ * Copyright (C) 2015 Google, Inc.
+ *
+ * Author: Mikulas Patocka <mpatocka@redhat.com>
+ *
+ * Based on Chromium dm-verity driver (C) 2011 The Chromium OS Authors
+ *
+ * This file is released under the GPLv2.
+ */
+
+#ifndef DM_VERITY_H
+#define DM_VERITY_H
+
+#include "dm-bufio.h"
+#include <linux/device-mapper.h>
+#include <crypto/hash.h>
+
+#define DM_VERITY_MAX_LEVELS 63
+
+enum verity_mode {
+ DM_VERITY_MODE_EIO,
+ DM_VERITY_MODE_LOGGING,
+ DM_VERITY_MODE_RESTART
+};
+
+enum verity_block_type {
+ DM_VERITY_BLOCK_TYPE_DATA,
+ DM_VERITY_BLOCK_TYPE_METADATA
+};
+
+struct dm_verity_fec;
+
+struct dm_verity {
+ struct dm_dev *data_dev;
+ struct dm_dev *hash_dev;
+ struct dm_target *ti;
+ struct dm_bufio_client *bufio;
+ char *alg_name;
+ struct crypto_shash *tfm;
+ u8 *root_digest; /* digest of the root block */
+ u8 *salt; /* salt: its size is salt_size */
+ u8 *zero_digest; /* digest for a zero block */
+ unsigned salt_size;
+ sector_t data_start; /* data offset in 512-byte sectors */
+ sector_t hash_start; /* hash start in blocks */
+ sector_t data_blocks; /* the number of data blocks */
+ sector_t hash_blocks; /* the number of hash blocks */
+ unsigned char data_dev_block_bits; /* log2(data blocksize) */
+ unsigned char hash_dev_block_bits; /* log2(hash blocksize) */
+ unsigned char hash_per_block_bits; /* log2(hashes in hash block) */
+ unsigned char levels; /* the number of tree levels */
+ unsigned char version;
+ unsigned digest_size; /* digest size for the current hash algorithm */
+ unsigned shash_descsize;/* the size of temporary space for crypto */
+ int hash_failed; /* set to 1 if hash of any block failed */
+ enum verity_mode mode; /* mode for handling verification errors */
+ unsigned corrupted_errs;/* Number of errors for corrupted blocks */
+
+ struct workqueue_struct *verify_wq;
+
+ /* starting blocks for each tree level. 0 is the lowest level. */
+ sector_t hash_level_block[DM_VERITY_MAX_LEVELS];
+
+ struct dm_verity_fec *fec; /* forward error correction */
+};
+
+struct dm_verity_io {
+ struct dm_verity *v;
+
+ /* original values of bio->bi_end_io and bio->bi_private */
+ bio_end_io_t *orig_bi_end_io;
+ void *orig_bi_private;
+
+ sector_t block;
+ unsigned n_blocks;
+
+ struct bvec_iter iter;
+
+ struct work_struct work;
+
+ /*
+ * Three variably-size fields follow this struct:
+ *
+ * u8 hash_desc[v->shash_descsize];
+ * u8 real_digest[v->digest_size];
+ * u8 want_digest[v->digest_size];
+ *
+ * To access them use: verity_io_hash_desc(), verity_io_real_digest()
+ * and verity_io_want_digest().
+ */
+};
+
+static inline struct shash_desc *verity_io_hash_desc(struct dm_verity *v,
+ struct dm_verity_io *io)
+{
+ return (struct shash_desc *)(io + 1);
+}
+
+static inline u8 *verity_io_real_digest(struct dm_verity *v,
+ struct dm_verity_io *io)
+{
+ return (u8 *)(io + 1) + v->shash_descsize;
+}
+
+static inline u8 *verity_io_want_digest(struct dm_verity *v,
+ struct dm_verity_io *io)
+{
+ return (u8 *)(io + 1) + v->shash_descsize + v->digest_size;
+}
+
+static inline u8 *verity_io_digest_end(struct dm_verity *v,
+ struct dm_verity_io *io)
+{
+ return verity_io_want_digest(v, io) + v->digest_size;
+}
+
+extern int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io,
+ struct bvec_iter *iter,
+ int (*process)(struct dm_verity *v,
+ struct dm_verity_io *io,
+ u8 *data, size_t len));
+
+extern int verity_hash(struct dm_verity *v, struct shash_desc *desc,
+ const u8 *data, size_t len, u8 *digest);
+
+extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io,
+ sector_t block, u8 *digest, bool *is_zero);
+
+extern void verity_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen);
+extern int verity_ioctl(struct dm_target *ti, unsigned cmd,
+ unsigned long arg);
+extern int verity_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+ struct bio_vec *biovec, int max_size);
+extern int verity_iterate_devices(struct dm_target *ti,
+ iterate_devices_callout_fn fn, void *data);
+extern void verity_io_hints(struct dm_target *ti, struct queue_limits *limits);
+extern void verity_dtr(struct dm_target *ti);
+extern int verity_ctr(struct dm_target *ti, unsigned argc, char **argv);
+extern int verity_map(struct dm_target *ti, struct bio *bio);
+#endif /* DM_VERITY_H */
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index fb07be386287..6bba5ccc731e 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -3021,6 +3021,7 @@ struct gendisk *dm_disk(struct mapped_device *md)
{
return md->disk;
}
+EXPORT_SYMBOL_GPL(dm_disk);
struct kobject *dm_kobject(struct mapped_device *md)
{
diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index 878d5430973c..d42d6b478111 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -402,6 +402,10 @@ config TI_DAC7512
This driver can also be built as a module. If so, the module
will be called ti_dac7512.
+config UID_STAT
+ bool "UID based statistics tracking exported to /proc/uid_stat"
+ default n
+
config VMWARE_BALLOON
tristate "VMware Balloon Driver"
depends on X86 && HYPERVISOR_GUEST
@@ -515,6 +519,19 @@ config VEXPRESS_SYSCFG
bus. System Configuration interface is one of the possible means
of generating transactions on this bus.
+config UID_SYS_STATS
+ bool "Per-UID statistics"
+ depends on PROFILING
+ help
+ Per UID based cpu time statistics exported to /proc/uid_cputime
+ Per UID based io statistics exported to /proc/uid_io
+
+config MEMORY_STATE_TIME
+ tristate "Memory freq/bandwidth time statistics"
+ depends on PROFILING
+ help
+ Memory time statistics exported to /sys/kernel/memory_state_time
+
source "drivers/misc/c2port/Kconfig"
source "drivers/misc/eeprom/Kconfig"
source "drivers/misc/cb710/Kconfig"
diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
index 7d5c4cd118c4..f3f592a94905 100644
--- a/drivers/misc/Makefile
+++ b/drivers/misc/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_ISL29020) += isl29020.o
obj-$(CONFIG_SENSORS_TSL2550) += tsl2550.o
obj-$(CONFIG_DS1682) += ds1682.o
obj-$(CONFIG_TI_DAC7512) += ti_dac7512.o
+obj-$(CONFIG_UID_STAT) += uid_stat.o
obj-$(CONFIG_C2PORT) += c2port/
obj-$(CONFIG_HMC6352) += hmc6352.o
obj-y += eeprom/
@@ -56,3 +57,5 @@ obj-$(CONFIG_GENWQE) += genwqe/
obj-$(CONFIG_ECHO) += echo/
obj-$(CONFIG_VEXPRESS_SYSCFG) += vexpress-syscfg.o
obj-$(CONFIG_CXL_BASE) += cxl/
+obj-$(CONFIG_UID_SYS_STATS) += uid_sys_stats.o
+obj-$(CONFIG_MEMORY_STATE_TIME) += memory_state_time.o
diff --git a/drivers/misc/lkdtm.c b/drivers/misc/lkdtm.c
index b5abe34120b8..da9fb01cd441 100644
--- a/drivers/misc/lkdtm.c
+++ b/drivers/misc/lkdtm.c
@@ -103,6 +103,7 @@ enum ctype {
CT_EXEC_USERSPACE,
CT_ACCESS_USERSPACE,
CT_WRITE_RO,
+ CT_WRITE_RO_AFTER_INIT,
CT_WRITE_KERN,
};
@@ -140,6 +141,7 @@ static char* cp_type[] = {
"EXEC_USERSPACE",
"ACCESS_USERSPACE",
"WRITE_RO",
+ "WRITE_RO_AFTER_INIT",
"WRITE_KERN",
};
@@ -162,6 +164,7 @@ static DEFINE_SPINLOCK(lock_me_up);
static u8 data_area[EXEC_SIZE];
static const unsigned long rodata = 0xAA55AA55;
+static unsigned long ro_after_init __ro_after_init = 0x55AA5500;
module_param(recur_count, int, 0644);
MODULE_PARM_DESC(recur_count, " Recursion level for the stack overflow test");
@@ -497,11 +500,28 @@ static void lkdtm_do_action(enum ctype which)
break;
}
case CT_WRITE_RO: {
- unsigned long *ptr;
+ /* Explicitly cast away "const" for the test. */
+ unsigned long *ptr = (unsigned long *)&rodata;
- ptr = (unsigned long *)&rodata;
+ pr_info("attempting bad rodata write at %p\n", ptr);
+ *ptr ^= 0xabcd1234;
- pr_info("attempting bad write at %p\n", ptr);
+ break;
+ }
+ case CT_WRITE_RO_AFTER_INIT: {
+ unsigned long *ptr = &ro_after_init;
+
+ /*
+ * Verify we were written to during init. Since an Oops
+ * is considered a "success", a failure is to just skip the
+ * real test.
+ */
+ if ((*ptr & 0xAA) != 0xAA) {
+ pr_info("%p was NOT written during init!?\n", ptr);
+ break;
+ }
+
+ pr_info("attempting bad ro_after_init write at %p\n", ptr);
*ptr ^= 0xabcd1234;
break;
@@ -811,6 +831,9 @@ static int __init lkdtm_module_init(void)
int n_debugfs_entries = 1; /* Assume only the direct entry */
int i;
+ /* Make sure we can write to __ro_after_init values during __init */
+ ro_after_init |= 0xAA;
+
/* Register debugfs interface */
lkdtm_debugfs_root = debugfs_create_dir("provoke-crash", NULL);
if (!lkdtm_debugfs_root) {
diff --git a/drivers/misc/memory_state_time.c b/drivers/misc/memory_state_time.c
new file mode 100644
index 000000000000..ba94dcf09169
--- /dev/null
+++ b/drivers/misc/memory_state_time.c
@@ -0,0 +1,462 @@
+/* drivers/misc/memory_state_time.c
+ *
+ * Copyright (C) 2016 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/hashtable.h>
+#include <linux/kconfig.h>
+#include <linux/kernel.h>
+#include <linux/kobject.h>
+#include <linux/memory-state-time.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/of_platform.h>
+#include <linux/slab.h>
+#include <linux/sysfs.h>
+#include <linux/time.h>
+#include <linux/timekeeping.h>
+#include <linux/workqueue.h>
+
+#define KERNEL_ATTR_RO(_name) \
+static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+
+#define KERNEL_ATTR_RW(_name) \
+static struct kobj_attribute _name##_attr = \
+ __ATTR(_name, 0644, _name##_show, _name##_store)
+
+#define FREQ_HASH_BITS 4
+DECLARE_HASHTABLE(freq_hash_table, FREQ_HASH_BITS);
+
+static DEFINE_MUTEX(mem_lock);
+
+#define TAG "memory_state_time"
+#define BW_NODE "/soc/memory-state-time"
+#define FREQ_TBL "freq-tbl"
+#define BW_TBL "bw-buckets"
+#define NUM_SOURCES "num-sources"
+
+#define LOWEST_FREQ 2
+
+static int curr_bw;
+static int curr_freq;
+static u32 *bw_buckets;
+static u32 *freq_buckets;
+static int num_freqs;
+static int num_buckets;
+static int registered_bw_sources;
+static u64 last_update;
+static bool init_success;
+static struct workqueue_struct *memory_wq;
+static u32 num_sources = 10;
+static int *bandwidths;
+
+struct freq_entry {
+ int freq;
+ u64 *buckets; /* Bandwidth buckets. */
+ struct hlist_node hash;
+};
+
+struct queue_container {
+ struct work_struct update_state;
+ int value;
+ u64 time_now;
+ int id;
+ struct mutex *lock;
+};
+
+static int find_bucket(int bw)
+{
+ int i;
+
+ if (bw_buckets != NULL) {
+ for (i = 0; i < num_buckets; i++) {
+ if (bw_buckets[i] > bw) {
+ pr_debug("Found bucket %d for bandwidth %d\n",
+ i, bw);
+ return i;
+ }
+ }
+ return num_buckets - 1;
+ }
+ return 0;
+}
+
+static u64 get_time_diff(u64 time_now)
+{
+ u64 ms;
+
+ ms = time_now - last_update;
+ last_update = time_now;
+ return ms;
+}
+
+static ssize_t show_stat_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ int i, j;
+ int len = 0;
+ struct freq_entry *freq_entry;
+
+ for (i = 0; i < num_freqs; i++) {
+ hash_for_each_possible(freq_hash_table, freq_entry, hash,
+ freq_buckets[i]) {
+ if (freq_entry->freq == freq_buckets[i]) {
+ len += scnprintf(buf + len, PAGE_SIZE - len,
+ "%d ", freq_buckets[i]);
+ if (len >= PAGE_SIZE)
+ break;
+ for (j = 0; j < num_buckets; j++) {
+ len += scnprintf(buf + len,
+ PAGE_SIZE - len,
+ "%llu ",
+ freq_entry->buckets[j]);
+ }
+ len += scnprintf(buf + len, PAGE_SIZE - len,
+ "\n");
+ }
+ }
+ }
+ pr_debug("Current Time: %llu\n", ktime_get_boot_ns());
+ return len;
+}
+KERNEL_ATTR_RO(show_stat);
+
+static void update_table(u64 time_now)
+{
+ struct freq_entry *freq_entry;
+
+ pr_debug("Last known bw %d freq %d\n", curr_bw, curr_freq);
+ hash_for_each_possible(freq_hash_table, freq_entry, hash, curr_freq) {
+ if (curr_freq == freq_entry->freq) {
+ freq_entry->buckets[find_bucket(curr_bw)]
+ += get_time_diff(time_now);
+ break;
+ }
+ }
+}
+
+static bool freq_exists(int freq)
+{
+ int i;
+
+ for (i = 0; i < num_freqs; i++) {
+ if (freq == freq_buckets[i])
+ return true;
+ }
+ return false;
+}
+
+static int calculate_total_bw(int bw, int index)
+{
+ int i;
+ int total_bw = 0;
+
+ pr_debug("memory_state_time New bw %d for id %d\n", bw, index);
+ bandwidths[index] = bw;
+ for (i = 0; i < registered_bw_sources; i++)
+ total_bw += bandwidths[i];
+ return total_bw;
+}
+
+static void freq_update_do_work(struct work_struct *work)
+{
+ struct queue_container *freq_state_update
+ = container_of(work, struct queue_container,
+ update_state);
+ if (freq_state_update) {
+ mutex_lock(&mem_lock);
+ update_table(freq_state_update->time_now);
+ curr_freq = freq_state_update->value;
+ mutex_unlock(&mem_lock);
+ kfree(freq_state_update);
+ }
+}
+
+static void bw_update_do_work(struct work_struct *work)
+{
+ struct queue_container *bw_state_update
+ = container_of(work, struct queue_container,
+ update_state);
+ if (bw_state_update) {
+ mutex_lock(&mem_lock);
+ update_table(bw_state_update->time_now);
+ curr_bw = calculate_total_bw(bw_state_update->value,
+ bw_state_update->id);
+ mutex_unlock(&mem_lock);
+ kfree(bw_state_update);
+ }
+}
+
+static void memory_state_freq_update(struct memory_state_update_block *ub,
+ int value)
+{
+ if (IS_ENABLED(CONFIG_MEMORY_STATE_TIME)) {
+ if (freq_exists(value) && init_success) {
+ struct queue_container *freq_container
+ = kmalloc(sizeof(struct queue_container),
+ GFP_KERNEL);
+ if (!freq_container)
+ return;
+ INIT_WORK(&freq_container->update_state,
+ freq_update_do_work);
+ freq_container->time_now = ktime_get_boot_ns();
+ freq_container->value = value;
+ pr_debug("Scheduling freq update in work queue\n");
+ queue_work(memory_wq, &freq_container->update_state);
+ } else {
+ pr_debug("Freq does not exist.\n");
+ }
+ }
+}
+
+static void memory_state_bw_update(struct memory_state_update_block *ub,
+ int value)
+{
+ if (IS_ENABLED(CONFIG_MEMORY_STATE_TIME)) {
+ if (init_success) {
+ struct queue_container *bw_container
+ = kmalloc(sizeof(struct queue_container),
+ GFP_KERNEL);
+ if (!bw_container)
+ return;
+ INIT_WORK(&bw_container->update_state,
+ bw_update_do_work);
+ bw_container->time_now = ktime_get_boot_ns();
+ bw_container->value = value;
+ bw_container->id = ub->id;
+ pr_debug("Scheduling bandwidth update in work queue\n");
+ queue_work(memory_wq, &bw_container->update_state);
+ }
+ }
+}
+
+struct memory_state_update_block *memory_state_register_frequency_source(void)
+{
+ struct memory_state_update_block *block;
+
+ if (IS_ENABLED(CONFIG_MEMORY_STATE_TIME)) {
+ pr_debug("Allocating frequency source\n");
+ block = kmalloc(sizeof(struct memory_state_update_block),
+ GFP_KERNEL);
+ if (!block)
+ return NULL;
+ block->update_call = memory_state_freq_update;
+ return block;
+ }
+ pr_err("Config option disabled.\n");
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(memory_state_register_frequency_source);
+
+struct memory_state_update_block *memory_state_register_bandwidth_source(void)
+{
+ struct memory_state_update_block *block;
+
+ if (IS_ENABLED(CONFIG_MEMORY_STATE_TIME)) {
+ pr_debug("Allocating bandwidth source %d\n",
+ registered_bw_sources);
+ block = kmalloc(sizeof(struct memory_state_update_block),
+ GFP_KERNEL);
+ if (!block)
+ return NULL;
+ block->update_call = memory_state_bw_update;
+ if (registered_bw_sources < num_sources) {
+ block->id = registered_bw_sources++;
+ } else {
+ pr_err("Unable to allocate source; max number reached\n");
+ kfree(block);
+ return NULL;
+ }
+ return block;
+ }
+ pr_err("Config option disabled.\n");
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(memory_state_register_bandwidth_source);
+
+/* Buckets are designated by their maximum.
+ * Returns the buckets decided by the capability of the device.
+ */
+static int get_bw_buckets(struct device *dev)
+{
+ int ret, lenb;
+ struct device_node *node = dev->of_node;
+
+ of_property_read_u32(node, NUM_SOURCES, &num_sources);
+ if (!of_find_property(node, BW_TBL, &lenb)) {
+ pr_err("Missing %s property\n", BW_TBL);
+ return -ENODATA;
+ }
+
+ bandwidths = devm_kzalloc(dev,
+ sizeof(*bandwidths) * num_sources, GFP_KERNEL);
+ if (!bandwidths)
+ return -ENOMEM;
+ lenb /= sizeof(*bw_buckets);
+ bw_buckets = devm_kzalloc(dev, lenb * sizeof(*bw_buckets),
+ GFP_KERNEL);
+ if (!bw_buckets) {
+ devm_kfree(dev, bandwidths);
+ return -ENOMEM;
+ }
+ ret = of_property_read_u32_array(node, BW_TBL, bw_buckets,
+ lenb);
+ if (ret < 0) {
+ devm_kfree(dev, bandwidths);
+ devm_kfree(dev, bw_buckets);
+ pr_err("Unable to read bandwidth table from device tree.\n");
+ return ret;
+ }
+
+ curr_bw = 0;
+ num_buckets = lenb;
+ return 0;
+}
+
+/* Adds struct freq_entry nodes to the hashtable for each compatible frequency.
+ * Returns the supported number of frequencies.
+ */
+static int freq_buckets_init(struct device *dev)
+{
+ struct freq_entry *freq_entry;
+ int i;
+ int ret, lenf;
+ struct device_node *node = dev->of_node;
+
+ if (!of_find_property(node, FREQ_TBL, &lenf)) {
+ pr_err("Missing %s property\n", FREQ_TBL);
+ return -ENODATA;
+ }
+
+ lenf /= sizeof(*freq_buckets);
+ freq_buckets = devm_kzalloc(dev, lenf * sizeof(*freq_buckets),
+ GFP_KERNEL);
+ if (!freq_buckets)
+ return -ENOMEM;
+ pr_debug("freqs found len %d\n", lenf);
+ ret = of_property_read_u32_array(node, FREQ_TBL, freq_buckets,
+ lenf);
+ if (ret < 0) {
+ devm_kfree(dev, freq_buckets);
+ pr_err("Unable to read frequency table from device tree.\n");
+ return ret;
+ }
+ pr_debug("ret freq %d\n", ret);
+
+ num_freqs = lenf;
+ curr_freq = freq_buckets[LOWEST_FREQ];
+
+ for (i = 0; i < num_freqs; i++) {
+ freq_entry = devm_kzalloc(dev, sizeof(struct freq_entry),
+ GFP_KERNEL);
+ if (!freq_entry)
+ return -ENOMEM;
+ freq_entry->buckets = devm_kzalloc(dev, sizeof(u64)*num_buckets,
+ GFP_KERNEL);
+ if (!freq_entry->buckets) {
+ devm_kfree(dev, freq_entry);
+ return -ENOMEM;
+ }
+ pr_debug("memory_state_time Adding freq to ht %d\n",
+ freq_buckets[i]);
+ freq_entry->freq = freq_buckets[i];
+ hash_add(freq_hash_table, &freq_entry->hash, freq_buckets[i]);
+ }
+ return 0;
+}
+
+struct kobject *memory_kobj;
+EXPORT_SYMBOL_GPL(memory_kobj);
+
+static struct attribute *memory_attrs[] = {
+ &show_stat_attr.attr,
+ NULL
+};
+
+static struct attribute_group memory_attr_group = {
+ .attrs = memory_attrs,
+};
+
+static int memory_state_time_probe(struct platform_device *pdev)
+{
+ int error;
+
+ error = get_bw_buckets(&pdev->dev);
+ if (error)
+ return error;
+ error = freq_buckets_init(&pdev->dev);
+ if (error)
+ return error;
+ last_update = ktime_get_boot_ns();
+ init_success = true;
+
+ pr_debug("memory_state_time initialized with num_freqs %d\n",
+ num_freqs);
+ return 0;
+}
+
+static const struct of_device_id match_table[] = {
+ { .compatible = "memory-state-time" },
+ {}
+};
+
+static struct platform_driver memory_state_time_driver = {
+ .probe = memory_state_time_probe,
+ .driver = {
+ .name = "memory-state-time",
+ .of_match_table = match_table,
+ .owner = THIS_MODULE,
+ },
+};
+
+static int __init memory_state_time_init(void)
+{
+ int error;
+
+ hash_init(freq_hash_table);
+ memory_wq = create_singlethread_workqueue("memory_wq");
+ if (!memory_wq) {
+ pr_err("Unable to create workqueue.\n");
+ return -EINVAL;
+ }
+ /*
+ * Create sys/kernel directory for memory_state_time.
+ */
+ memory_kobj = kobject_create_and_add(TAG, kernel_kobj);
+ if (!memory_kobj) {
+ pr_err("Unable to allocate memory_kobj for sysfs directory.\n");
+ error = -ENOMEM;
+ goto wq;
+ }
+ error = sysfs_create_group(memory_kobj, &memory_attr_group);
+ if (error) {
+ pr_err("Unable to create sysfs folder.\n");
+ goto kobj;
+ }
+
+ error = platform_driver_register(&memory_state_time_driver);
+ if (error) {
+ pr_err("Unable to register memory_state_time platform driver.\n");
+ goto group;
+ }
+ return 0;
+
+group: sysfs_remove_group(memory_kobj, &memory_attr_group);
+kobj: kobject_put(memory_kobj);
+wq: destroy_workqueue(memory_wq);
+ return error;
+}
+module_init(memory_state_time_init);
diff --git a/drivers/misc/uid_stat.c b/drivers/misc/uid_stat.c
new file mode 100644
index 000000000000..27b516b8a90e
--- /dev/null
+++ b/drivers/misc/uid_stat.c
@@ -0,0 +1,152 @@
+/* drivers/misc/uid_stat.c
+ *
+ * Copyright (C) 2008 - 2009 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <asm/atomic.h>
+
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/stat.h>
+#include <linux/uid_stat.h>
+#include <net/activity_stats.h>
+
+static DEFINE_SPINLOCK(uid_lock);
+static LIST_HEAD(uid_list);
+static struct proc_dir_entry *parent;
+
+struct uid_stat {
+ struct list_head link;
+ uid_t uid;
+ atomic_t tcp_rcv;
+ atomic_t tcp_snd;
+};
+
+static struct uid_stat *find_uid_stat(uid_t uid) {
+ struct uid_stat *entry;
+
+ list_for_each_entry(entry, &uid_list, link) {
+ if (entry->uid == uid) {
+ return entry;
+ }
+ }
+ return NULL;
+}
+
+static int uid_stat_atomic_int_show(struct seq_file *m, void *v)
+{
+ unsigned int bytes;
+ atomic_t *counter = m->private;
+
+ bytes = (unsigned int) (atomic_read(counter) + INT_MIN);
+ return seq_printf(m, "%u\n", bytes);
+}
+
+static int uid_stat_read_atomic_int_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, uid_stat_atomic_int_show, PDE_DATA(inode));
+}
+
+static const struct file_operations uid_stat_read_atomic_int_fops = {
+ .open = uid_stat_read_atomic_int_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/* Create a new entry for tracking the specified uid. */
+static struct uid_stat *create_stat(uid_t uid) {
+ struct uid_stat *new_uid;
+ /* Create the uid stat struct and append it to the list. */
+ new_uid = kmalloc(sizeof(struct uid_stat), GFP_ATOMIC);
+ if (!new_uid)
+ return NULL;
+
+ new_uid->uid = uid;
+ /* Counters start at INT_MIN, so we can track 4GB of network traffic. */
+ atomic_set(&new_uid->tcp_rcv, INT_MIN);
+ atomic_set(&new_uid->tcp_snd, INT_MIN);
+
+ list_add_tail(&new_uid->link, &uid_list);
+ return new_uid;
+}
+
+static void create_stat_proc(struct uid_stat *new_uid)
+{
+ char uid_s[32];
+ struct proc_dir_entry *entry;
+ sprintf(uid_s, "%d", new_uid->uid);
+ entry = proc_mkdir(uid_s, parent);
+
+ /* Keep reference to uid_stat so we know what uid to read stats from. */
+ proc_create_data("tcp_snd", S_IRUGO, entry,
+ &uid_stat_read_atomic_int_fops, &new_uid->tcp_snd);
+
+ proc_create_data("tcp_rcv", S_IRUGO, entry,
+ &uid_stat_read_atomic_int_fops, &new_uid->tcp_rcv);
+}
+
+static struct uid_stat *find_or_create_uid_stat(uid_t uid)
+{
+ struct uid_stat *entry;
+ unsigned long flags;
+ spin_lock_irqsave(&uid_lock, flags);
+ entry = find_uid_stat(uid);
+ if (entry) {
+ spin_unlock_irqrestore(&uid_lock, flags);
+ return entry;
+ }
+ entry = create_stat(uid);
+ spin_unlock_irqrestore(&uid_lock, flags);
+ if (entry)
+ create_stat_proc(entry);
+ return entry;
+}
+
+int uid_stat_tcp_snd(uid_t uid, int size) {
+ struct uid_stat *entry;
+ activity_stats_update();
+ entry = find_or_create_uid_stat(uid);
+ if (!entry)
+ return -1;
+ atomic_add(size, &entry->tcp_snd);
+ return 0;
+}
+
+int uid_stat_tcp_rcv(uid_t uid, int size) {
+ struct uid_stat *entry;
+ activity_stats_update();
+ entry = find_or_create_uid_stat(uid);
+ if (!entry)
+ return -1;
+ atomic_add(size, &entry->tcp_rcv);
+ return 0;
+}
+
+static int __init uid_stat_init(void)
+{
+ parent = proc_mkdir("uid_stat", NULL);
+ if (!parent) {
+ pr_err("uid_stat: failed to create proc entry\n");
+ return -1;
+ }
+ return 0;
+}
+
+__initcall(uid_stat_init);
diff --git a/drivers/misc/uid_sys_stats.c b/drivers/misc/uid_sys_stats.c
new file mode 100644
index 000000000000..d4ef4977e111
--- /dev/null
+++ b/drivers/misc/uid_sys_stats.c
@@ -0,0 +1,496 @@
+/* drivers/misc/uid_cputime.c
+ *
+ * Copyright (C) 2014 - 2015 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/atomic.h>
+#include <linux/err.h>
+#include <linux/hashtable.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/proc_fs.h>
+#include <linux/profile.h>
+#include <linux/rtmutex.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#define UID_HASH_BITS 10
+DECLARE_HASHTABLE(hash_table, UID_HASH_BITS);
+
+static DEFINE_RT_MUTEX(uid_lock);
+static struct proc_dir_entry *cpu_parent;
+static struct proc_dir_entry *io_parent;
+static struct proc_dir_entry *proc_parent;
+
+struct io_stats {
+ u64 read_bytes;
+ u64 write_bytes;
+ u64 rchar;
+ u64 wchar;
+ u64 fsync;
+};
+
+#define UID_STATE_FOREGROUND 0
+#define UID_STATE_BACKGROUND 1
+#define UID_STATE_BUCKET_SIZE 2
+
+#define UID_STATE_TOTAL_CURR 2
+#define UID_STATE_TOTAL_LAST 3
+#define UID_STATE_DEAD_TASKS 4
+#define UID_STATE_SIZE 5
+
+struct uid_entry {
+ uid_t uid;
+ cputime_t utime;
+ cputime_t stime;
+ cputime_t active_utime;
+ cputime_t active_stime;
+ unsigned long long active_power;
+ unsigned long long power;
+ int state;
+ struct io_stats io[UID_STATE_SIZE];
+ struct hlist_node hash;
+};
+
+static struct uid_entry *find_uid_entry(uid_t uid)
+{
+ struct uid_entry *uid_entry;
+ hash_for_each_possible(hash_table, uid_entry, hash, uid) {
+ if (uid_entry->uid == uid)
+ return uid_entry;
+ }
+ return NULL;
+}
+
+static struct uid_entry *find_or_register_uid(uid_t uid)
+{
+ struct uid_entry *uid_entry;
+
+ uid_entry = find_uid_entry(uid);
+ if (uid_entry)
+ return uid_entry;
+
+ uid_entry = kzalloc(sizeof(struct uid_entry), GFP_ATOMIC);
+ if (!uid_entry)
+ return NULL;
+
+ uid_entry->uid = uid;
+
+ hash_add(hash_table, &uid_entry->hash, uid);
+
+ return uid_entry;
+}
+
+static int uid_cputime_show(struct seq_file *m, void *v)
+{
+ struct uid_entry *uid_entry;
+ struct task_struct *task, *temp;
+ struct user_namespace *user_ns = current_user_ns();
+ cputime_t utime;
+ cputime_t stime;
+ unsigned long bkt;
+ uid_t uid;
+
+ rt_mutex_lock(&uid_lock);
+
+ hash_for_each(hash_table, bkt, uid_entry, hash) {
+ uid_entry->active_stime = 0;
+ uid_entry->active_utime = 0;
+ uid_entry->active_power = 0;
+ }
+
+ read_lock(&tasklist_lock);
+ do_each_thread(temp, task) {
+ uid = from_kuid_munged(user_ns, task_uid(task));
+ uid_entry = find_or_register_uid(uid);
+ if (!uid_entry) {
+ read_unlock(&tasklist_lock);
+ rt_mutex_unlock(&uid_lock);
+ pr_err("%s: failed to find the uid_entry for uid %d\n",
+ __func__, uid);
+ return -ENOMEM;
+ }
+ /* if this task is exiting, we have already accounted for the
+ * time and power.
+ */
+ if (task->cpu_power == ULLONG_MAX)
+ continue;
+ task_cputime_adjusted(task, &utime, &stime);
+ uid_entry->active_utime += utime;
+ uid_entry->active_stime += stime;
+ uid_entry->active_power += task->cpu_power;
+ } while_each_thread(temp, task);
+ read_unlock(&tasklist_lock);
+
+ hash_for_each(hash_table, bkt, uid_entry, hash) {
+ cputime_t total_utime = uid_entry->utime +
+ uid_entry->active_utime;
+ cputime_t total_stime = uid_entry->stime +
+ uid_entry->active_stime;
+ unsigned long long total_power = uid_entry->power +
+ uid_entry->active_power;
+ seq_printf(m, "%d: %llu %llu %llu\n", uid_entry->uid,
+ (unsigned long long)jiffies_to_msecs(
+ cputime_to_jiffies(total_utime)) * USEC_PER_MSEC,
+ (unsigned long long)jiffies_to_msecs(
+ cputime_to_jiffies(total_stime)) * USEC_PER_MSEC,
+ total_power);
+ }
+
+ rt_mutex_unlock(&uid_lock);
+ return 0;
+}
+
+static int uid_cputime_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, uid_cputime_show, PDE_DATA(inode));
+}
+
+static const struct file_operations uid_cputime_fops = {
+ .open = uid_cputime_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int uid_remove_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, NULL, NULL);
+}
+
+static ssize_t uid_remove_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *ppos)
+{
+ struct uid_entry *uid_entry;
+ struct hlist_node *tmp;
+ char uids[128];
+ char *start_uid, *end_uid = NULL;
+ long int uid_start = 0, uid_end = 0;
+
+ if (count >= sizeof(uids))
+ count = sizeof(uids) - 1;
+
+ if (copy_from_user(uids, buffer, count))
+ return -EFAULT;
+
+ uids[count] = '\0';
+ end_uid = uids;
+ start_uid = strsep(&end_uid, "-");
+
+ if (!start_uid || !end_uid)
+ return -EINVAL;
+
+ if (kstrtol(start_uid, 10, &uid_start) != 0 ||
+ kstrtol(end_uid, 10, &uid_end) != 0) {
+ return -EINVAL;
+ }
+ rt_mutex_lock(&uid_lock);
+
+ for (; uid_start <= uid_end; uid_start++) {
+ hash_for_each_possible_safe(hash_table, uid_entry, tmp,
+ hash, (uid_t)uid_start) {
+ if (uid_start == uid_entry->uid) {
+ hash_del(&uid_entry->hash);
+ kfree(uid_entry);
+ }
+ }
+ }
+
+ rt_mutex_unlock(&uid_lock);
+ return count;
+}
+
+static const struct file_operations uid_remove_fops = {
+ .open = uid_remove_open,
+ .release = single_release,
+ .write = uid_remove_write,
+};
+
+static u64 compute_write_bytes(struct task_struct *task)
+{
+ if (task->ioac.write_bytes <= task->ioac.cancelled_write_bytes)
+ return 0;
+
+ return task->ioac.write_bytes - task->ioac.cancelled_write_bytes;
+}
+
+static void add_uid_io_stats(struct uid_entry *uid_entry,
+ struct task_struct *task, int slot)
+{
+ struct io_stats *io_slot = &uid_entry->io[slot];
+
+ io_slot->read_bytes += task->ioac.read_bytes;
+ io_slot->write_bytes += compute_write_bytes(task);
+ io_slot->rchar += task->ioac.rchar;
+ io_slot->wchar += task->ioac.wchar;
+ io_slot->fsync += task->ioac.syscfs;
+}
+
+static void compute_uid_io_bucket_stats(struct io_stats *io_bucket,
+ struct io_stats *io_curr,
+ struct io_stats *io_last,
+ struct io_stats *io_dead)
+{
+ io_bucket->read_bytes += io_curr->read_bytes + io_dead->read_bytes -
+ io_last->read_bytes;
+ io_bucket->write_bytes += io_curr->write_bytes + io_dead->write_bytes -
+ io_last->write_bytes;
+ io_bucket->rchar += io_curr->rchar + io_dead->rchar - io_last->rchar;
+ io_bucket->wchar += io_curr->wchar + io_dead->wchar - io_last->wchar;
+ io_bucket->fsync += io_curr->fsync + io_dead->fsync - io_last->fsync;
+
+ io_last->read_bytes = io_curr->read_bytes;
+ io_last->write_bytes = io_curr->write_bytes;
+ io_last->rchar = io_curr->rchar;
+ io_last->wchar = io_curr->wchar;
+ io_last->fsync = io_curr->fsync;
+
+ memset(io_dead, 0, sizeof(struct io_stats));
+}
+
+static void update_io_stats_all_locked(void)
+{
+ struct uid_entry *uid_entry;
+ struct task_struct *task, *temp;
+ struct user_namespace *user_ns = current_user_ns();
+ unsigned long bkt;
+ uid_t uid;
+
+ hash_for_each(hash_table, bkt, uid_entry, hash)
+ memset(&uid_entry->io[UID_STATE_TOTAL_CURR], 0,
+ sizeof(struct io_stats));
+
+ rcu_read_lock();
+ do_each_thread(temp, task) {
+ uid = from_kuid_munged(user_ns, task_uid(task));
+ uid_entry = find_or_register_uid(uid);
+ if (!uid_entry)
+ continue;
+ add_uid_io_stats(uid_entry, task, UID_STATE_TOTAL_CURR);
+ } while_each_thread(temp, task);
+ rcu_read_unlock();
+
+ hash_for_each(hash_table, bkt, uid_entry, hash) {
+ compute_uid_io_bucket_stats(&uid_entry->io[uid_entry->state],
+ &uid_entry->io[UID_STATE_TOTAL_CURR],
+ &uid_entry->io[UID_STATE_TOTAL_LAST],
+ &uid_entry->io[UID_STATE_DEAD_TASKS]);
+ }
+}
+
+static void update_io_stats_uid_locked(struct uid_entry *uid_entry)
+{
+ struct task_struct *task, *temp;
+ struct user_namespace *user_ns = current_user_ns();
+
+ memset(&uid_entry->io[UID_STATE_TOTAL_CURR], 0,
+ sizeof(struct io_stats));
+
+ rcu_read_lock();
+ do_each_thread(temp, task) {
+ if (from_kuid_munged(user_ns, task_uid(task)) != uid_entry->uid)
+ continue;
+ add_uid_io_stats(uid_entry, task, UID_STATE_TOTAL_CURR);
+ } while_each_thread(temp, task);
+ rcu_read_unlock();
+
+ compute_uid_io_bucket_stats(&uid_entry->io[uid_entry->state],
+ &uid_entry->io[UID_STATE_TOTAL_CURR],
+ &uid_entry->io[UID_STATE_TOTAL_LAST],
+ &uid_entry->io[UID_STATE_DEAD_TASKS]);
+}
+
+static int uid_io_show(struct seq_file *m, void *v)
+{
+ struct uid_entry *uid_entry;
+ unsigned long bkt;
+
+ rt_mutex_lock(&uid_lock);
+
+ update_io_stats_all_locked();
+
+ hash_for_each(hash_table, bkt, uid_entry, hash) {
+ seq_printf(m, "%d %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+ uid_entry->uid,
+ uid_entry->io[UID_STATE_FOREGROUND].rchar,
+ uid_entry->io[UID_STATE_FOREGROUND].wchar,
+ uid_entry->io[UID_STATE_FOREGROUND].read_bytes,
+ uid_entry->io[UID_STATE_FOREGROUND].write_bytes,
+ uid_entry->io[UID_STATE_BACKGROUND].rchar,
+ uid_entry->io[UID_STATE_BACKGROUND].wchar,
+ uid_entry->io[UID_STATE_BACKGROUND].read_bytes,
+ uid_entry->io[UID_STATE_BACKGROUND].write_bytes,
+ uid_entry->io[UID_STATE_FOREGROUND].fsync,
+ uid_entry->io[UID_STATE_BACKGROUND].fsync);
+ }
+
+ rt_mutex_unlock(&uid_lock);
+
+ return 0;
+}
+
+static int uid_io_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, uid_io_show, PDE_DATA(inode));
+}
+
+static const struct file_operations uid_io_fops = {
+ .open = uid_io_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int uid_procstat_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, NULL, NULL);
+}
+
+static ssize_t uid_procstat_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *ppos)
+{
+ struct uid_entry *uid_entry;
+ uid_t uid;
+ int argc, state;
+ char input[128];
+
+ if (count >= sizeof(input))
+ return -EINVAL;
+
+ if (copy_from_user(input, buffer, count))
+ return -EFAULT;
+
+ input[count] = '\0';
+
+ argc = sscanf(input, "%u %d", &uid, &state);
+ if (argc != 2)
+ return -EINVAL;
+
+ if (state != UID_STATE_BACKGROUND && state != UID_STATE_FOREGROUND)
+ return -EINVAL;
+
+ rt_mutex_lock(&uid_lock);
+
+ uid_entry = find_or_register_uid(uid);
+ if (!uid_entry) {
+ rt_mutex_unlock(&uid_lock);
+ return -EINVAL;
+ }
+
+ if (uid_entry->state == state) {
+ rt_mutex_unlock(&uid_lock);
+ return count;
+ }
+
+ update_io_stats_uid_locked(uid_entry);
+
+ uid_entry->state = state;
+
+ rt_mutex_unlock(&uid_lock);
+
+ return count;
+}
+
+static const struct file_operations uid_procstat_fops = {
+ .open = uid_procstat_open,
+ .release = single_release,
+ .write = uid_procstat_write,
+};
+
+static int process_notifier(struct notifier_block *self,
+ unsigned long cmd, void *v)
+{
+ struct task_struct *task = v;
+ struct uid_entry *uid_entry;
+ cputime_t utime, stime;
+ uid_t uid;
+
+ if (!task)
+ return NOTIFY_OK;
+
+ rt_mutex_lock(&uid_lock);
+ uid = from_kuid_munged(current_user_ns(), task_uid(task));
+ uid_entry = find_or_register_uid(uid);
+ if (!uid_entry) {
+ pr_err("%s: failed to find uid %d\n", __func__, uid);
+ goto exit;
+ }
+
+ task_cputime_adjusted(task, &utime, &stime);
+ uid_entry->utime += utime;
+ uid_entry->stime += stime;
+ uid_entry->power += task->cpu_power;
+ task->cpu_power = ULLONG_MAX;
+
+ add_uid_io_stats(uid_entry, task, UID_STATE_DEAD_TASKS);
+
+exit:
+ rt_mutex_unlock(&uid_lock);
+ return NOTIFY_OK;
+}
+
+static struct notifier_block process_notifier_block = {
+ .notifier_call = process_notifier,
+};
+
+static int __init proc_uid_sys_stats_init(void)
+{
+ hash_init(hash_table);
+
+ cpu_parent = proc_mkdir("uid_cputime", NULL);
+ if (!cpu_parent) {
+ pr_err("%s: failed to create uid_cputime proc entry\n",
+ __func__);
+ goto err;
+ }
+
+ proc_create_data("remove_uid_range", 0222, cpu_parent,
+ &uid_remove_fops, NULL);
+ proc_create_data("show_uid_stat", 0444, cpu_parent,
+ &uid_cputime_fops, NULL);
+
+ io_parent = proc_mkdir("uid_io", NULL);
+ if (!io_parent) {
+ pr_err("%s: failed to create uid_io proc entry\n",
+ __func__);
+ goto err;
+ }
+
+ proc_create_data("stats", 0444, io_parent,
+ &uid_io_fops, NULL);
+
+ proc_parent = proc_mkdir("uid_procstat", NULL);
+ if (!proc_parent) {
+ pr_err("%s: failed to create uid_procstat proc entry\n",
+ __func__);
+ goto err;
+ }
+
+ proc_create_data("set", 0222, proc_parent,
+ &uid_procstat_fops, NULL);
+
+ profile_event_register(PROFILE_TASK_EXIT, &process_notifier_block);
+
+ return 0;
+
+err:
+ remove_proc_subtree("uid_cputime", NULL);
+ remove_proc_subtree("uid_io", NULL);
+ remove_proc_subtree("uid_procstat", NULL);
+ return -ENOMEM;
+}
+
+early_initcall(proc_uid_sys_stats_init);
diff --git a/drivers/mmc/card/Kconfig b/drivers/mmc/card/Kconfig
index 5562308699bc..6142ec1b9dfb 100644
--- a/drivers/mmc/card/Kconfig
+++ b/drivers/mmc/card/Kconfig
@@ -68,3 +68,15 @@ config MMC_TEST
This driver is only of interest to those developing or
testing a host driver. Most people should say N here.
+
+config MMC_SIMULATE_MAX_SPEED
+ bool "Turn on maximum speed control per block device"
+ depends on MMC_BLOCK
+ help
+ Say Y here to enable MMC device speed limiting. Used to test and
+ simulate the behavior of the system when confronted with a slow MMC.
+
+ Enables max_read_speed, max_write_speed and cache_size attributes to
+ control the write or read maximum KB/second speed behaviors.
+
+ If unsure, say N here.
diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c
index dec6b0ec4def..a773189dfa48 100644
--- a/drivers/mmc/card/block.c
+++ b/drivers/mmc/card/block.c
@@ -36,6 +36,8 @@
#include <linux/compat.h>
#include <linux/pm_runtime.h>
+#include <trace/events/mmc.h>
+
#include <linux/mmc/ioctl.h>
#include <linux/mmc/card.h>
#include <linux/mmc/host.h>
@@ -166,11 +168,7 @@ static struct mmc_blk_data *mmc_blk_get(struct gendisk *disk)
static inline int mmc_get_devidx(struct gendisk *disk)
{
- int devmaj = MAJOR(disk_devt(disk));
- int devidx = MINOR(disk_devt(disk)) / perdev_minors;
-
- if (!devmaj)
- devidx = disk->first_minor / perdev_minors;
+ int devidx = disk->first_minor / perdev_minors;
return devidx;
}
@@ -288,6 +286,250 @@ out:
return ret;
}
+#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED
+
+static int max_read_speed, max_write_speed, cache_size = 4;
+
+module_param(max_read_speed, int, S_IRUSR | S_IRGRP);
+MODULE_PARM_DESC(max_read_speed, "maximum KB/s read speed 0=off");
+module_param(max_write_speed, int, S_IRUSR | S_IRGRP);
+MODULE_PARM_DESC(max_write_speed, "maximum KB/s write speed 0=off");
+module_param(cache_size, int, S_IRUSR | S_IRGRP);
+MODULE_PARM_DESC(cache_size, "MB high speed memory or SLC cache");
+
+/*
+ * helper macros and expectations:
+ * size - unsigned long number of bytes
+ * jiffies - unsigned long HZ timestamp difference
+ * speed - unsigned KB/s transfer rate
+ */
+#define size_and_speed_to_jiffies(size, speed) \
+ ((size) * HZ / (speed) / 1024UL)
+#define jiffies_and_speed_to_size(jiffies, speed) \
+ (((speed) * (jiffies) * 1024UL) / HZ)
+#define jiffies_and_size_to_speed(jiffies, size) \
+ ((size) * HZ / (jiffies) / 1024UL)
+
+/* Limits to report warning */
+/* jiffies_and_size_to_speed(10*HZ, queue_max_hw_sectors(q) * 512UL) ~ 25 */
+#define MIN_SPEED(q) 250 /* 10 times faster than a floppy disk */
+#define MAX_SPEED(q) jiffies_and_size_to_speed(1, queue_max_sectors(q) * 512UL)
+
+#define speed_valid(speed) ((speed) > 0)
+
+static const char off[] = "off\n";
+
+static int max_speed_show(int speed, char *buf)
+{
+ if (speed)
+ return scnprintf(buf, PAGE_SIZE, "%uKB/s\n", speed);
+ else
+ return scnprintf(buf, PAGE_SIZE, off);
+}
+
+static int max_speed_store(const char *buf, struct request_queue *q)
+{
+ unsigned int limit, set = 0;
+
+ if (!strncasecmp(off, buf, sizeof(off) - 2))
+ return set;
+ if (kstrtouint(buf, 0, &set) || (set > INT_MAX))
+ return -EINVAL;
+ if (set == 0)
+ return set;
+ limit = MAX_SPEED(q);
+ if (set > limit)
+ pr_warn("max speed %u ineffective above %u\n", set, limit);
+ limit = MIN_SPEED(q);
+ if (set < limit)
+ pr_warn("max speed %u painful below %u\n", set, limit);
+ return set;
+}
+
+static ssize_t max_write_speed_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev));
+ int ret = max_speed_show(atomic_read(&md->queue.max_write_speed), buf);
+
+ mmc_blk_put(md);
+ return ret;
+}
+
+static ssize_t max_write_speed_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev));
+ int set = max_speed_store(buf, md->queue.queue);
+
+ if (set < 0) {
+ mmc_blk_put(md);
+ return set;
+ }
+
+ atomic_set(&md->queue.max_write_speed, set);
+ mmc_blk_put(md);
+ return count;
+}
+
+static const DEVICE_ATTR(max_write_speed, S_IRUGO | S_IWUSR,
+ max_write_speed_show, max_write_speed_store);
+
+static ssize_t max_read_speed_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev));
+ int ret = max_speed_show(atomic_read(&md->queue.max_read_speed), buf);
+
+ mmc_blk_put(md);
+ return ret;
+}
+
+static ssize_t max_read_speed_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev));
+ int set = max_speed_store(buf, md->queue.queue);
+
+ if (set < 0) {
+ mmc_blk_put(md);
+ return set;
+ }
+
+ atomic_set(&md->queue.max_read_speed, set);
+ mmc_blk_put(md);
+ return count;
+}
+
+static const DEVICE_ATTR(max_read_speed, S_IRUGO | S_IWUSR,
+ max_read_speed_show, max_read_speed_store);
+
+static ssize_t cache_size_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev));
+ struct mmc_queue *mq = &md->queue;
+ int cache_size = atomic_read(&mq->cache_size);
+ int ret;
+
+ if (!cache_size)
+ ret = scnprintf(buf, PAGE_SIZE, off);
+ else {
+ int speed = atomic_read(&mq->max_write_speed);
+
+ if (!speed_valid(speed))
+ ret = scnprintf(buf, PAGE_SIZE, "%uMB\n", cache_size);
+ else { /* We accept race between cache_jiffies and cache_used */
+ unsigned long size = jiffies_and_speed_to_size(
+ jiffies - mq->cache_jiffies, speed);
+ long used = atomic_long_read(&mq->cache_used);
+
+ if (size >= used)
+ size = 0;
+ else
+ size = (used - size) * 100 / cache_size
+ / 1024UL / 1024UL;
+
+ ret = scnprintf(buf, PAGE_SIZE, "%uMB %lu%% used\n",
+ cache_size, size);
+ }
+ }
+
+ mmc_blk_put(md);
+ return ret;
+}
+
+static ssize_t cache_size_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct mmc_blk_data *md;
+ unsigned int set = 0;
+
+ if (strncasecmp(off, buf, sizeof(off) - 2)
+ && (kstrtouint(buf, 0, &set) || (set > INT_MAX)))
+ return -EINVAL;
+
+ md = mmc_blk_get(dev_to_disk(dev));
+ atomic_set(&md->queue.cache_size, set);
+ mmc_blk_put(md);
+ return count;
+}
+
+static const DEVICE_ATTR(cache_size, S_IRUGO | S_IWUSR,
+ cache_size_show, cache_size_store);
+
+/* correct for write-back */
+static long mmc_blk_cache_used(struct mmc_queue *mq, unsigned long waitfor)
+{
+ long used = 0;
+ int speed = atomic_read(&mq->max_write_speed);
+
+ if (speed_valid(speed)) {
+ unsigned long size = jiffies_and_speed_to_size(
+ waitfor - mq->cache_jiffies, speed);
+ used = atomic_long_read(&mq->cache_used);
+
+ if (size >= used)
+ used = 0;
+ else
+ used -= size;
+ }
+
+ atomic_long_set(&mq->cache_used, used);
+ mq->cache_jiffies = waitfor;
+
+ return used;
+}
+
+static void mmc_blk_simulate_delay(
+ struct mmc_queue *mq,
+ struct request *req,
+ unsigned long waitfor)
+{
+ int max_speed;
+
+ if (!req)
+ return;
+
+ max_speed = (rq_data_dir(req) == READ)
+ ? atomic_read(&mq->max_read_speed)
+ : atomic_read(&mq->max_write_speed);
+ if (speed_valid(max_speed)) {
+ unsigned long bytes = blk_rq_bytes(req);
+
+ if (rq_data_dir(req) != READ) {
+ int cache_size = atomic_read(&mq->cache_size);
+
+ if (cache_size) {
+ unsigned long size = cache_size * 1024L * 1024L;
+ long used = mmc_blk_cache_used(mq, waitfor);
+
+ used += bytes;
+ atomic_long_set(&mq->cache_used, used);
+ bytes = 0;
+ if (used > size)
+ bytes = used - size;
+ }
+ }
+ waitfor += size_and_speed_to_jiffies(bytes, max_speed);
+ if (time_is_after_jiffies(waitfor)) {
+ long msecs = jiffies_to_msecs(waitfor - jiffies);
+
+ if (likely(msecs > 0))
+ msleep(msecs);
+ }
+ }
+}
+
+#else
+
+#define mmc_blk_simulate_delay(mq, req, waitfor)
+
+#endif
+
static int mmc_blk_open(struct block_device *bdev, fmode_t mode)
{
struct mmc_blk_data *md = mmc_blk_get(bdev->bd_disk);
@@ -427,9 +669,11 @@ static int ioctl_do_sanitize(struct mmc_card *card)
pr_debug("%s: %s - SANITIZE IN PROGRESS...\n",
mmc_hostname(card->host), __func__);
+ trace_mmc_blk_erase_start(EXT_CSD_SANITIZE_START, 0, 0);
err = mmc_switch(card, EXT_CSD_CMD_SET_NORMAL,
EXT_CSD_SANITIZE_START, 1,
MMC_SANITIZE_REQ_TIMEOUT);
+ trace_mmc_blk_erase_end(EXT_CSD_SANITIZE_START, 0, 0);
if (err)
pr_err("%s: %s - EXT_CSD_SANITIZE_START failed. err=%d\n",
@@ -852,18 +1096,22 @@ static int mmc_blk_cmd_error(struct request *req, const char *name, int error,
req->rq_disk->disk_name, "timed out", name, status);
/* If the status cmd initially failed, retry the r/w cmd */
- if (!status_valid)
+ if (!status_valid) {
+ pr_err("%s: status not valid, retrying timeout\n", req->rq_disk->disk_name);
return ERR_RETRY;
-
+ }
/*
* If it was a r/w cmd crc error, or illegal command
* (eg, issued in wrong state) then retry - we should
* have corrected the state problem above.
*/
- if (status & (R1_COM_CRC_ERROR | R1_ILLEGAL_COMMAND))
+ if (status & (R1_COM_CRC_ERROR | R1_ILLEGAL_COMMAND)) {
+ pr_err("%s: command error, retrying timeout\n", req->rq_disk->disk_name);
return ERR_RETRY;
+ }
/* Otherwise abort the command */
+ pr_err("%s: not retrying timeout\n", req->rq_disk->disk_name);
return ERR_ABORT;
default:
@@ -1159,6 +1407,23 @@ static int mmc_blk_issue_flush(struct mmc_queue *mq, struct request *req)
if (ret)
ret = -EIO;
+#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED
+ else if (atomic_read(&mq->cache_size)) {
+ long used = mmc_blk_cache_used(mq, jiffies);
+
+ if (used) {
+ int speed = atomic_read(&mq->max_write_speed);
+
+ if (speed_valid(speed)) {
+ unsigned long msecs = jiffies_to_msecs(
+ size_and_speed_to_jiffies(
+ used, speed));
+ if (msecs)
+ msleep(msecs);
+ }
+ }
+ }
+#endif
blk_end_request_all(req, ret);
return ret ? 0 : 1;
@@ -1844,6 +2109,9 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *rqc)
struct mmc_async_req *areq;
const u8 packed_nr = 2;
u8 reqs = 0;
+#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED
+ unsigned long waitfor = jiffies;
+#endif
if (!rqc && !mq->mqrq_prev->req)
return 0;
@@ -1894,6 +2162,8 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *rqc)
*/
mmc_blk_reset_success(md, type);
+ mmc_blk_simulate_delay(mq, rqc, waitfor);
+
if (mmc_packed_cmd(mq_rq->cmd_type)) {
ret = mmc_blk_end_packed_req(mq_rq);
break;
@@ -2152,6 +2422,7 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card,
md->disk->queue = md->queue.queue;
md->disk->driverfs_dev = parent;
set_disk_ro(md->disk, md->read_only || default_ro);
+ md->disk->flags = GENHD_FL_EXT_DEVT;
if (area_type & (MMC_BLK_DATA_AREA_RPMB | MMC_BLK_DATA_AREA_BOOT))
md->disk->flags |= GENHD_FL_NO_PART_SCAN;
@@ -2310,6 +2581,14 @@ static void mmc_blk_remove_req(struct mmc_blk_data *md)
card->ext_csd.boot_ro_lockable)
device_remove_file(disk_to_dev(md->disk),
&md->power_ro_lock);
+#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED
+ device_remove_file(disk_to_dev(md->disk),
+ &dev_attr_max_write_speed);
+ device_remove_file(disk_to_dev(md->disk),
+ &dev_attr_max_read_speed);
+ device_remove_file(disk_to_dev(md->disk),
+ &dev_attr_cache_size);
+#endif
del_gendisk(md->disk);
}
@@ -2345,6 +2624,24 @@ static int mmc_add_disk(struct mmc_blk_data *md)
ret = device_create_file(disk_to_dev(md->disk), &md->force_ro);
if (ret)
goto force_ro_fail;
+#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED
+ atomic_set(&md->queue.max_write_speed, max_write_speed);
+ ret = device_create_file(disk_to_dev(md->disk),
+ &dev_attr_max_write_speed);
+ if (ret)
+ goto max_write_speed_fail;
+ atomic_set(&md->queue.max_read_speed, max_read_speed);
+ ret = device_create_file(disk_to_dev(md->disk),
+ &dev_attr_max_read_speed);
+ if (ret)
+ goto max_read_speed_fail;
+ atomic_set(&md->queue.cache_size, cache_size);
+ atomic_long_set(&md->queue.cache_used, 0);
+ md->queue.cache_jiffies = jiffies;
+ ret = device_create_file(disk_to_dev(md->disk), &dev_attr_cache_size);
+ if (ret)
+ goto cache_size_fail;
+#endif
if ((md->area_type & MMC_BLK_DATA_AREA_BOOT) &&
card->ext_csd.boot_ro_lockable) {
@@ -2369,6 +2666,14 @@ static int mmc_add_disk(struct mmc_blk_data *md)
return ret;
power_ro_lock_fail:
+#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED
+ device_remove_file(disk_to_dev(md->disk), &dev_attr_cache_size);
+cache_size_fail:
+ device_remove_file(disk_to_dev(md->disk), &dev_attr_max_read_speed);
+max_read_speed_fail:
+ device_remove_file(disk_to_dev(md->disk), &dev_attr_max_write_speed);
+max_write_speed_fail:
+#endif
device_remove_file(disk_to_dev(md->disk), &md->force_ro);
force_ro_fail:
del_gendisk(md->disk);
diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c
index 7a41a221d248..aecb13a74f6c 100644
--- a/drivers/mmc/card/queue.c
+++ b/drivers/mmc/card/queue.c
@@ -19,6 +19,7 @@
#include <linux/mmc/card.h>
#include <linux/mmc/host.h>
+#include <linux/sched/rt.h>
#include "queue.h"
#define MMC_QUEUE_BOUNCESZ 65536
@@ -50,6 +51,11 @@ static int mmc_queue_thread(void *d)
{
struct mmc_queue *mq = d;
struct request_queue *q = mq->queue;
+ struct sched_param scheduler_params = {0};
+
+ scheduler_params.sched_priority = 1;
+
+ sched_setscheduler(current, SCHED_FIFO, &scheduler_params);
current->flags |= PF_MEMALLOC;
diff --git a/drivers/mmc/card/queue.h b/drivers/mmc/card/queue.h
index f42c11293dd8..0edb9683f0ea 100644
--- a/drivers/mmc/card/queue.h
+++ b/drivers/mmc/card/queue.h
@@ -57,6 +57,14 @@ struct mmc_queue {
struct mmc_queue_req mqrq[2];
struct mmc_queue_req *mqrq_cur;
struct mmc_queue_req *mqrq_prev;
+#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED
+ atomic_t max_write_speed;
+ atomic_t max_read_speed;
+ atomic_t cache_size;
+ /* i/o tracking */
+ atomic_long_t cache_used;
+ unsigned long cache_jiffies;
+#endif
};
extern int mmc_init_queue(struct mmc_queue *, struct mmc_card *, spinlock_t *,
diff --git a/drivers/mmc/core/Kconfig b/drivers/mmc/core/Kconfig
index 9ebee72d9c3f..f771bc3496a4 100644
--- a/drivers/mmc/core/Kconfig
+++ b/drivers/mmc/core/Kconfig
@@ -11,3 +11,18 @@ config MMC_CLKGATE
support handling this in order for it to be of any use.
If unsure, say N.
+
+config MMC_EMBEDDED_SDIO
+ boolean "MMC embedded SDIO device support (EXPERIMENTAL)"
+ help
+ If you say Y here, support will be added for embedded SDIO
+ devices which do not contain the necessary enumeration
+ support in hardware to be properly detected.
+
+config MMC_PARANOID_SD_INIT
+ bool "Enable paranoid SD card initialization (EXPERIMENTAL)"
+ help
+ If you say Y here, the MMC layer will be extra paranoid
+ about re-trying SD init requests. This can be a useful
+ work-around for buggy controllers and hardware. Enable
+ if you are experiencing issues with SD detection.
diff --git a/drivers/mmc/core/bus.c b/drivers/mmc/core/bus.c
index 8a1f1240e058..78685657a8dd 100644
--- a/drivers/mmc/core/bus.c
+++ b/drivers/mmc/core/bus.c
@@ -16,6 +16,7 @@
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/stat.h>
+#include <linux/of.h>
#include <linux/pm_runtime.h>
#include <linux/mmc/card.h>
@@ -352,6 +353,8 @@ int mmc_add_card(struct mmc_card *card)
#endif
mmc_init_context_info(card->host);
+ card->dev.of_node = mmc_of_find_child_device(card->host, 0);
+
ret = device_add(&card->dev);
if (ret)
return ret;
@@ -380,6 +383,7 @@ void mmc_remove_card(struct mmc_card *card)
mmc_hostname(card->host), card->rca);
}
device_del(&card->dev);
+ of_node_put(card->dev.of_node);
}
put_device(&card->dev);
diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index eefe36afa601..8f7f855d2473 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -30,6 +30,9 @@
#include <linux/slab.h>
#include <linux/of.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/mmc.h>
+
#include <linux/mmc/card.h>
#include <linux/mmc/host.h>
#include <linux/mmc/mmc.h>
@@ -45,6 +48,11 @@
#include "sd_ops.h"
#include "sdio_ops.h"
+EXPORT_TRACEPOINT_SYMBOL_GPL(mmc_blk_erase_start);
+EXPORT_TRACEPOINT_SYMBOL_GPL(mmc_blk_erase_end);
+EXPORT_TRACEPOINT_SYMBOL_GPL(mmc_blk_rw_start);
+EXPORT_TRACEPOINT_SYMBOL_GPL(mmc_blk_rw_end);
+
/* If the device is not responding */
#define MMC_CORE_TIMEOUT_MS (10 * 60 * 1000) /* 10 minute timeout */
@@ -158,6 +166,20 @@ void mmc_request_done(struct mmc_host *host, struct mmc_request *mrq)
pr_debug("%s: %d bytes transferred: %d\n",
mmc_hostname(host),
mrq->data->bytes_xfered, mrq->data->error);
+#ifdef CONFIG_BLOCK
+ if (mrq->lat_hist_enabled) {
+ ktime_t completion;
+ u_int64_t delta_us;
+
+ completion = ktime_get();
+ delta_us = ktime_us_delta(completion,
+ mrq->io_start);
+ blk_update_latency_hist(&host->io_lat_s,
+ (mrq->data->flags & MMC_DATA_READ),
+ delta_us);
+ }
+#endif
+ trace_mmc_blk_rw_end(cmd->opcode, cmd->arg, mrq->data);
}
if (mrq->stop) {
@@ -544,8 +566,19 @@ struct mmc_async_req *mmc_start_req(struct mmc_host *host,
mmc_start_bkops(host->card, true);
}
- if (!err && areq)
+ if (!err && areq) {
+#ifdef CONFIG_BLOCK
+ if (host->latency_hist_enabled) {
+ areq->mrq->io_start = ktime_get();
+ areq->mrq->lat_hist_enabled = 1;
+ } else
+ areq->mrq->lat_hist_enabled = 0;
+#endif
+ trace_mmc_blk_rw_start(areq->mrq->cmd->opcode,
+ areq->mrq->cmd->arg,
+ areq->mrq->data);
start_err = __mmc_start_data_req(host, areq->mrq);
+ }
if (host->areq)
mmc_post_req(host, host->areq->mrq, 0);
@@ -1231,6 +1264,34 @@ EXPORT_SYMBOL(mmc_of_parse_voltage);
#endif /* CONFIG_OF */
+static int mmc_of_get_func_num(struct device_node *node)
+{
+ u32 reg;
+ int ret;
+
+ ret = of_property_read_u32(node, "reg", &reg);
+ if (ret < 0)
+ return ret;
+
+ return reg;
+}
+
+struct device_node *mmc_of_find_child_device(struct mmc_host *host,
+ unsigned func_num)
+{
+ struct device_node *node;
+
+ if (!host->parent || !host->parent->of_node)
+ return NULL;
+
+ for_each_child_of_node(host->parent->of_node, node) {
+ if (mmc_of_get_func_num(node) == func_num)
+ return node;
+ }
+
+ return NULL;
+}
+
#ifdef CONFIG_REGULATOR
/**
@@ -1801,7 +1862,7 @@ void mmc_init_erase(struct mmc_card *card)
}
static unsigned int mmc_mmc_erase_timeout(struct mmc_card *card,
- unsigned int arg, unsigned int qty)
+ unsigned int arg, unsigned int qty)
{
unsigned int erase_timeout;
@@ -1905,8 +1966,13 @@ static int mmc_do_erase(struct mmc_card *card, unsigned int from,
struct mmc_command cmd = {0};
unsigned int qty = 0;
unsigned long timeout;
+ unsigned int fr, nr;
int err;
+ fr = from;
+ nr = to - from + 1;
+ trace_mmc_blk_erase_start(arg, fr, nr);
+
/*
* qty is used to calculate the erase timeout which depends on how many
* erase groups (or allocation units in SD terminology) are affected.
@@ -2010,6 +2076,8 @@ static int mmc_do_erase(struct mmc_card *card, unsigned int from,
} while (!(cmd.resp[0] & R1_READY_FOR_DATA) ||
(R1_CURRENT_STATE(cmd.resp[0]) == R1_STATE_PRG));
out:
+
+ trace_mmc_blk_erase_end(arg, fr, nr);
return err;
}
@@ -2697,6 +2765,22 @@ void mmc_init_context_info(struct mmc_host *host)
init_waitqueue_head(&host->context_info.wait);
}
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+void mmc_set_embedded_sdio_data(struct mmc_host *host,
+ struct sdio_cis *cis,
+ struct sdio_cccr *cccr,
+ struct sdio_embedded_func *funcs,
+ int num_funcs)
+{
+ host->embedded_sdio_data.cis = cis;
+ host->embedded_sdio_data.cccr = cccr;
+ host->embedded_sdio_data.funcs = funcs;
+ host->embedded_sdio_data.num_funcs = num_funcs;
+}
+
+EXPORT_SYMBOL(mmc_set_embedded_sdio_data);
+#endif
+
static int __init mmc_init(void)
{
int ret;
@@ -2737,6 +2821,56 @@ static void __exit mmc_exit(void)
destroy_workqueue(workqueue);
}
+#ifdef CONFIG_BLOCK
+static ssize_t
+latency_hist_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ struct mmc_host *host = cls_dev_to_mmc_host(dev);
+
+ return blk_latency_hist_show(&host->io_lat_s, buf);
+}
+
+/*
+ * Values permitted 0, 1, 2.
+ * 0 -> Disable IO latency histograms (default)
+ * 1 -> Enable IO latency histograms
+ * 2 -> Zero out IO latency histograms
+ */
+static ssize_t
+latency_hist_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct mmc_host *host = cls_dev_to_mmc_host(dev);
+ long value;
+
+ if (kstrtol(buf, 0, &value))
+ return -EINVAL;
+ if (value == BLK_IO_LAT_HIST_ZERO)
+ blk_zero_latency_hist(&host->io_lat_s);
+ else if (value == BLK_IO_LAT_HIST_ENABLE ||
+ value == BLK_IO_LAT_HIST_DISABLE)
+ host->latency_hist_enabled = value;
+ return count;
+}
+
+static DEVICE_ATTR(latency_hist, S_IRUGO | S_IWUSR,
+ latency_hist_show, latency_hist_store);
+
+void
+mmc_latency_hist_sysfs_init(struct mmc_host *host)
+{
+ if (device_create_file(&host->class_dev, &dev_attr_latency_hist))
+ dev_err(&host->class_dev,
+ "Failed to create latency_hist sysfs entry\n");
+}
+
+void
+mmc_latency_hist_sysfs_exit(struct mmc_host *host)
+{
+ device_remove_file(&host->class_dev, &dev_attr_latency_hist);
+}
+#endif
+
subsys_initcall(mmc_init);
module_exit(mmc_exit);
diff --git a/drivers/mmc/core/core.h b/drivers/mmc/core/core.h
index 3ea5ad3a252a..953c6a2b3731 100644
--- a/drivers/mmc/core/core.h
+++ b/drivers/mmc/core/core.h
@@ -32,6 +32,9 @@ struct mmc_bus_ops {
void mmc_attach_bus(struct mmc_host *host, const struct mmc_bus_ops *ops);
void mmc_detach_bus(struct mmc_host *host);
+struct device_node *mmc_of_find_child_device(struct mmc_host *host,
+ unsigned func_num);
+
void mmc_init_erase(struct mmc_card *card);
void mmc_set_chip_select(struct mmc_host *host, int mode);
diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c
index 270d58a4c43d..8b1a0cf414a6 100644
--- a/drivers/mmc/core/host.c
+++ b/drivers/mmc/core/host.c
@@ -30,8 +30,6 @@
#include "core.h"
#include "host.h"
-#define cls_dev_to_mmc_host(d) container_of(d, struct mmc_host, class_dev)
-
static void mmc_host_classdev_release(struct device *dev)
{
struct mmc_host *host = cls_dev_to_mmc_host(dev);
@@ -558,9 +556,12 @@ int mmc_add_host(struct mmc_host *host)
mmc_add_host_debugfs(host);
#endif
mmc_host_clk_sysfs_init(host);
-
+#ifdef CONFIG_BLOCK
+ mmc_latency_hist_sysfs_init(host);
+#endif
mmc_start_host(host);
- register_pm_notifier(&host->pm_notify);
+ if (!(host->pm_flags & MMC_PM_IGNORE_PM_NOTIFY))
+ register_pm_notifier(&host->pm_notify);
return 0;
}
@@ -577,12 +578,17 @@ EXPORT_SYMBOL(mmc_add_host);
*/
void mmc_remove_host(struct mmc_host *host)
{
- unregister_pm_notifier(&host->pm_notify);
+ if (!(host->pm_flags & MMC_PM_IGNORE_PM_NOTIFY))
+ unregister_pm_notifier(&host->pm_notify);
+
mmc_stop_host(host);
#ifdef CONFIG_DEBUG_FS
mmc_remove_host_debugfs(host);
#endif
+#ifdef CONFIG_BLOCK
+ mmc_latency_hist_sysfs_exit(host);
+#endif
device_del(&host->class_dev);
diff --git a/drivers/mmc/core/host.h b/drivers/mmc/core/host.h
index f2ab9e578126..effb76bed05f 100644
--- a/drivers/mmc/core/host.h
+++ b/drivers/mmc/core/host.h
@@ -12,8 +12,15 @@
#define _MMC_CORE_HOST_H
#include <linux/mmc/host.h>
+#define cls_dev_to_mmc_host(d) container_of(d, struct mmc_host, class_dev)
+
int mmc_register_host_class(void);
void mmc_unregister_host_class(void);
+#ifdef CONFIG_BLOCK
+void mmc_latency_hist_sysfs_init(struct mmc_host *host);
+void mmc_latency_hist_sysfs_exit(struct mmc_host *host);
+#endif
+
#endif
diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
index 449aa7a4227d..11ceebc48fd1 100644
--- a/drivers/mmc/core/mmc.c
+++ b/drivers/mmc/core/mmc.c
@@ -635,6 +635,14 @@ static int mmc_read_ext_csd(struct mmc_card *card, u8 *ext_csd)
card->ext_csd.data_sector_size = 512;
}
+ /* eMMC v5 or later */
+ if (card->ext_csd.rev >= 7) {
+ card->ext_csd.pre_eol_info = ext_csd[EXT_CSD_PRE_EOL_INFO];
+ card->ext_csd.device_life_time_est_typ_a =
+ ext_csd[EXT_CSD_DEVICE_LIFE_TIME_EST_TYP_A];
+ card->ext_csd.device_life_time_est_typ_b =
+ ext_csd[EXT_CSD_DEVICE_LIFE_TIME_EST_TYP_B];
+ }
out:
return err;
}
@@ -735,6 +743,11 @@ MMC_DEV_ATTR(manfid, "0x%06x\n", card->cid.manfid);
MMC_DEV_ATTR(name, "%s\n", card->cid.prod_name);
MMC_DEV_ATTR(oemid, "0x%04x\n", card->cid.oemid);
MMC_DEV_ATTR(prv, "0x%x\n", card->cid.prv);
+MMC_DEV_ATTR(rev, "0x%x\n", card->ext_csd.rev);
+MMC_DEV_ATTR(pre_eol_info, "%02x\n", card->ext_csd.pre_eol_info);
+MMC_DEV_ATTR(life_time, "0x%02x 0x%02x\n",
+ card->ext_csd.device_life_time_est_typ_a,
+ card->ext_csd.device_life_time_est_typ_b);
MMC_DEV_ATTR(serial, "0x%08x\n", card->cid.serial);
MMC_DEV_ATTR(enhanced_area_offset, "%llu\n",
card->ext_csd.enhanced_area_offset);
@@ -754,6 +767,9 @@ static struct attribute *mmc_std_attrs[] = {
&dev_attr_name.attr,
&dev_attr_oemid.attr,
&dev_attr_prv.attr,
+ &dev_attr_rev.attr,
+ &dev_attr_pre_eol_info.attr,
+ &dev_attr_life_time.attr,
&dev_attr_serial.attr,
&dev_attr_enhanced_area_offset.attr,
&dev_attr_enhanced_area_size.attr,
diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
index d4c8c6fa369f..275b19b7f27c 100644
--- a/drivers/mmc/core/sd.c
+++ b/drivers/mmc/core/sd.c
@@ -824,6 +824,9 @@ int mmc_sd_setup_card(struct mmc_host *host, struct mmc_card *card,
bool reinit)
{
int err;
+#ifdef CONFIG_MMC_PARANOID_SD_INIT
+ int retries;
+#endif
if (!reinit) {
/*
@@ -850,7 +853,26 @@ int mmc_sd_setup_card(struct mmc_host *host, struct mmc_card *card,
/*
* Fetch switch information from card.
*/
+#ifdef CONFIG_MMC_PARANOID_SD_INIT
+ for (retries = 1; retries <= 3; retries++) {
+ err = mmc_read_switch(card);
+ if (!err) {
+ if (retries > 1) {
+ printk(KERN_WARNING
+ "%s: recovered\n",
+ mmc_hostname(host));
+ }
+ break;
+ } else {
+ printk(KERN_WARNING
+ "%s: read switch failed (attempt %d)\n",
+ mmc_hostname(host), retries);
+ }
+ }
+#else
err = mmc_read_switch(card);
+#endif
+
if (err)
return err;
}
@@ -1048,7 +1070,10 @@ static int mmc_sd_alive(struct mmc_host *host)
*/
static void mmc_sd_detect(struct mmc_host *host)
{
- int err;
+ int err = 0;
+#ifdef CONFIG_MMC_PARANOID_SD_INIT
+ int retries = 5;
+#endif
BUG_ON(!host);
BUG_ON(!host->card);
@@ -1058,7 +1083,23 @@ static void mmc_sd_detect(struct mmc_host *host)
/*
* Just check if our card has been removed.
*/
+#ifdef CONFIG_MMC_PARANOID_SD_INIT
+ while(retries) {
+ err = mmc_send_status(host->card, NULL);
+ if (err) {
+ retries--;
+ udelay(5);
+ continue;
+ }
+ break;
+ }
+ if (!retries) {
+ printk(KERN_ERR "%s(%s): Unable to re-detect card (%d)\n",
+ __func__, mmc_hostname(host), err);
+ }
+#else
err = _mmc_detect_card_removed(host);
+#endif
mmc_put_card(host->card);
@@ -1120,6 +1161,9 @@ static int mmc_sd_suspend(struct mmc_host *host)
static int _mmc_sd_resume(struct mmc_host *host)
{
int err = 0;
+#ifdef CONFIG_MMC_PARANOID_SD_INIT
+ int retries;
+#endif
BUG_ON(!host);
BUG_ON(!host->card);
@@ -1130,7 +1174,23 @@ static int _mmc_sd_resume(struct mmc_host *host)
goto out;
mmc_power_up(host, host->card->ocr);
+#ifdef CONFIG_MMC_PARANOID_SD_INIT
+ retries = 5;
+ while (retries) {
+ err = mmc_sd_init_card(host, host->card->ocr, host->card);
+
+ if (err) {
+ printk(KERN_ERR "%s: Re-init card rc = %d (retries = %d)\n",
+ mmc_hostname(host), err, retries);
+ mdelay(5);
+ retries--;
+ continue;
+ }
+ break;
+ }
+#else
err = mmc_sd_init_card(host, host->card->ocr, host->card);
+#endif
mmc_card_clr_suspended(host->card);
out:
@@ -1221,6 +1281,9 @@ int mmc_attach_sd(struct mmc_host *host)
{
int err;
u32 ocr, rocr;
+#ifdef CONFIG_MMC_PARANOID_SD_INIT
+ int retries;
+#endif
BUG_ON(!host);
WARN_ON(!host->claimed);
@@ -1257,9 +1320,27 @@ int mmc_attach_sd(struct mmc_host *host)
/*
* Detect and init the card.
*/
+#ifdef CONFIG_MMC_PARANOID_SD_INIT
+ retries = 5;
+ while (retries) {
+ err = mmc_sd_init_card(host, rocr, NULL);
+ if (err) {
+ retries--;
+ continue;
+ }
+ break;
+ }
+
+ if (!retries) {
+ printk(KERN_ERR "%s: mmc_sd_init_card() failure (err = %d)\n",
+ mmc_hostname(host), err);
+ goto err;
+ }
+#else
err = mmc_sd_init_card(host, rocr, NULL);
if (err)
goto err;
+#endif
mmc_release_host(host);
err = mmc_add_card(host->card);
diff --git a/drivers/mmc/core/sdio.c b/drivers/mmc/core/sdio.c
index a3582929b2f7..2fa53140617b 100644
--- a/drivers/mmc/core/sdio.c
+++ b/drivers/mmc/core/sdio.c
@@ -10,6 +10,7 @@
*/
#include <linux/err.h>
+#include <linux/module.h>
#include <linux/pm_runtime.h>
#include <linux/mmc/host.h>
@@ -28,6 +29,10 @@
#include "sdio_ops.h"
#include "sdio_cis.h"
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+#include <linux/mmc/sdio_ids.h>
+#endif
+
static int sdio_read_fbr(struct sdio_func *func)
{
int ret;
@@ -732,19 +737,35 @@ try_again:
goto finish;
}
- /*
- * Read the common registers.
- */
- err = sdio_read_cccr(card, ocr);
- if (err)
- goto remove;
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+ if (host->embedded_sdio_data.cccr)
+ memcpy(&card->cccr, host->embedded_sdio_data.cccr, sizeof(struct sdio_cccr));
+ else {
+#endif
+ /*
+ * Read the common registers.
+ */
+ err = sdio_read_cccr(card, ocr);
+ if (err)
+ goto remove;
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+ }
+#endif
- /*
- * Read the common CIS tuples.
- */
- err = sdio_read_common_cis(card);
- if (err)
- goto remove;
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+ if (host->embedded_sdio_data.cis)
+ memcpy(&card->cis, host->embedded_sdio_data.cis, sizeof(struct sdio_cis));
+ else {
+#endif
+ /*
+ * Read the common CIS tuples.
+ */
+ err = sdio_read_common_cis(card);
+ if (err)
+ goto remove;
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+ }
+#endif
if (oldcard) {
int same = (card->cis.vendor == oldcard->cis.vendor &&
@@ -1137,14 +1158,36 @@ int mmc_attach_sdio(struct mmc_host *host)
funcs = (ocr & 0x70000000) >> 28;
card->sdio_funcs = 0;
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+ if (host->embedded_sdio_data.funcs)
+ card->sdio_funcs = funcs = host->embedded_sdio_data.num_funcs;
+#endif
+
/*
* Initialize (but don't add) all present functions.
*/
for (i = 0; i < funcs; i++, card->sdio_funcs++) {
- err = sdio_init_func(host->card, i + 1);
- if (err)
- goto remove;
-
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+ if (host->embedded_sdio_data.funcs) {
+ struct sdio_func *tmp;
+
+ tmp = sdio_alloc_func(host->card);
+ if (IS_ERR(tmp))
+ goto remove;
+ tmp->num = (i + 1);
+ card->sdio_func[i] = tmp;
+ tmp->class = host->embedded_sdio_data.funcs[i].f_class;
+ tmp->max_blksize = host->embedded_sdio_data.funcs[i].f_maxblksize;
+ tmp->vendor = card->cis.vendor;
+ tmp->device = card->cis.device;
+ } else {
+#endif
+ err = sdio_init_func(host->card, i + 1);
+ if (err)
+ goto remove;
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+ }
+#endif
/*
* Enable Runtime PM for this func (if supported)
*/
@@ -1192,3 +1235,40 @@ err:
return err;
}
+int sdio_reset_comm(struct mmc_card *card)
+{
+ struct mmc_host *host = card->host;
+ u32 ocr;
+ u32 rocr;
+ int err;
+
+ printk("%s():\n", __func__);
+ mmc_claim_host(host);
+
+ mmc_go_idle(host);
+
+ mmc_set_clock(host, host->f_min);
+
+ err = mmc_send_io_op_cond(host, 0, &ocr);
+ if (err)
+ goto err;
+
+ rocr = mmc_select_voltage(host, ocr);
+ if (!rocr) {
+ err = -EINVAL;
+ goto err;
+ }
+
+ err = mmc_sdio_init_card(host, rocr, card, 0);
+ if (err)
+ goto err;
+
+ mmc_release_host(host);
+ return 0;
+err:
+ printk("%s: Error resetting SDIO communications (%d)\n",
+ mmc_hostname(host), err);
+ mmc_release_host(host);
+ return err;
+}
+EXPORT_SYMBOL(sdio_reset_comm);
diff --git a/drivers/mmc/core/sdio_bus.c b/drivers/mmc/core/sdio_bus.c
index 6da97b170563..b4803b819748 100644
--- a/drivers/mmc/core/sdio_bus.c
+++ b/drivers/mmc/core/sdio_bus.c
@@ -22,10 +22,16 @@
#include <linux/mmc/card.h>
#include <linux/mmc/host.h>
#include <linux/mmc/sdio_func.h>
+#include <linux/of.h>
+#include "core.h"
#include "sdio_cis.h"
#include "sdio_bus.h"
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+#include <linux/mmc/host.h>
+#endif
+
/* show configuration fields */
#define sdio_config_attr(field, format_string) \
static ssize_t \
@@ -262,7 +268,14 @@ static void sdio_release_func(struct device *dev)
{
struct sdio_func *func = dev_to_sdio_func(dev);
- sdio_free_func_cis(func);
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+ /*
+ * If this device is embedded then we never allocated
+ * cis tables for this func
+ */
+ if (!func->card->host->embedded_sdio_data.funcs)
+#endif
+ sdio_free_func_cis(func);
kfree(func->info);
@@ -303,6 +316,13 @@ static void sdio_acpi_set_handle(struct sdio_func *func)
static inline void sdio_acpi_set_handle(struct sdio_func *func) {}
#endif
+static void sdio_set_of_node(struct sdio_func *func)
+{
+ struct mmc_host *host = func->card->host;
+
+ func->dev.of_node = mmc_of_find_child_device(host, func->num);
+}
+
/*
* Register a new SDIO function with the driver model.
*/
@@ -312,6 +332,7 @@ int sdio_add_func(struct sdio_func *func)
dev_set_name(&func->dev, "%s:%d", mmc_card_id(func->card), func->num);
+ sdio_set_of_node(func);
sdio_acpi_set_handle(func);
ret = device_add(&func->dev);
if (ret == 0) {
@@ -335,6 +356,7 @@ void sdio_remove_func(struct sdio_func *func)
dev_pm_domain_detach(&func->dev, false);
device_del(&func->dev);
+ of_node_put(func->dev.of_node);
put_device(&func->dev);
}
diff --git a/drivers/mmc/core/sdio_io.c b/drivers/mmc/core/sdio_io.c
index 78cb4d5d9d58..8fdeb07723a6 100644..100755
--- a/drivers/mmc/core/sdio_io.c
+++ b/drivers/mmc/core/sdio_io.c
@@ -384,6 +384,39 @@ u8 sdio_readb(struct sdio_func *func, unsigned int addr, int *err_ret)
EXPORT_SYMBOL_GPL(sdio_readb);
/**
+ * sdio_readb_ext - read a single byte from a SDIO function
+ * @func: SDIO function to access
+ * @addr: address to read
+ * @err_ret: optional status value from transfer
+ * @in: value to add to argument
+ *
+ * Reads a single byte from the address space of a given SDIO
+ * function. If there is a problem reading the address, 0xff
+ * is returned and @err_ret will contain the error code.
+ */
+unsigned char sdio_readb_ext(struct sdio_func *func, unsigned int addr,
+ int *err_ret, unsigned in)
+{
+ int ret;
+ unsigned char val;
+
+ BUG_ON(!func);
+
+ if (err_ret)
+ *err_ret = 0;
+
+ ret = mmc_io_rw_direct(func->card, 0, func->num, addr, (u8)in, &val);
+ if (ret) {
+ if (err_ret)
+ *err_ret = ret;
+ return 0xFF;
+ }
+
+ return val;
+}
+EXPORT_SYMBOL_GPL(sdio_readb_ext);
+
+/**
* sdio_writeb - write a single byte to a SDIO function
* @func: SDIO function to access
* @b: byte to write
diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig
index dd10646982ae..0f7c07abc3e7 100644
--- a/drivers/mtd/nand/Kconfig
+++ b/drivers/mtd/nand/Kconfig
@@ -1,3 +1,10 @@
+config MTD_NAND_IDS
+ tristate "Include chip ids for known NAND devices."
+ depends on MTD
+ help
+ Useful for NAND drivers that do not use the NAND subsystem but
+ still like to take advantage of the known chip information.
+
config MTD_NAND_ECC
tristate
@@ -109,9 +116,6 @@ config MTD_NAND_OMAP_BCH
config MTD_NAND_OMAP_BCH_BUILD
def_tristate MTD_NAND_OMAP2 && MTD_NAND_OMAP_BCH
-config MTD_NAND_IDS
- tristate
-
config MTD_NAND_RICOH
tristate "Ricoh xD card reader"
default n
diff --git a/drivers/net/ppp/Kconfig b/drivers/net/ppp/Kconfig
index 1373c6d7278d..282aec4860eb 100644
--- a/drivers/net/ppp/Kconfig
+++ b/drivers/net/ppp/Kconfig
@@ -149,6 +149,23 @@ config PPPOL2TP
tunnels. L2TP is replacing PPTP for VPN uses.
if TTY
+config PPPOLAC
+ tristate "PPP on L2TP Access Concentrator"
+ depends on PPP && INET
+ help
+ L2TP (RFC 2661) is a tunneling protocol widely used in virtual private
+ networks. This driver handles L2TP data packets between a UDP socket
+ and a PPP channel, but only permits one session per socket. Thus it is
+ fairly simple and suited for clients.
+
+config PPPOPNS
+ tristate "PPP on PPTP Network Server"
+ depends on PPP && INET
+ help
+ PPTP (RFC 2637) is a tunneling protocol widely used in virtual private
+ networks. This driver handles PPTP data packets between a RAW socket
+ and a PPP channel. It is fairly simple and easy to use.
+
config PPP_ASYNC
tristate "PPP support for async serial ports"
depends on PPP
diff --git a/drivers/net/ppp/Makefile b/drivers/net/ppp/Makefile
index a6b6297b0066..d283d03c4683 100644
--- a/drivers/net/ppp/Makefile
+++ b/drivers/net/ppp/Makefile
@@ -11,3 +11,5 @@ obj-$(CONFIG_PPP_SYNC_TTY) += ppp_synctty.o
obj-$(CONFIG_PPPOE) += pppox.o pppoe.o
obj-$(CONFIG_PPPOL2TP) += pppox.o
obj-$(CONFIG_PPTP) += pppox.o pptp.o
+obj-$(CONFIG_PPPOLAC) += pppox.o pppolac.o
+obj-$(CONFIG_PPPOPNS) += pppox.o pppopns.o
diff --git a/drivers/net/ppp/pppolac.c b/drivers/net/ppp/pppolac.c
new file mode 100644
index 000000000000..a5d3d634fd9a
--- /dev/null
+++ b/drivers/net/ppp/pppolac.c
@@ -0,0 +1,449 @@
+/* drivers/net/pppolac.c
+ *
+ * Driver for PPP on L2TP Access Concentrator / PPPoLAC Socket (RFC 2661)
+ *
+ * Copyright (C) 2009 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+/* This driver handles L2TP data packets between a UDP socket and a PPP channel.
+ * The socket must keep connected, and only one session per socket is permitted.
+ * Sequencing of outgoing packets is controlled by LNS. Incoming packets with
+ * sequences are reordered within a sliding window of one second. Currently
+ * reordering only happens when a packet is received. It is done for simplicity
+ * since no additional locks or threads are required. This driver only works on
+ * IPv4 due to the lack of UDP encapsulation support in IPv6. */
+
+#include <linux/module.h>
+#include <linux/jiffies.h>
+#include <linux/workqueue.h>
+#include <linux/skbuff.h>
+#include <linux/file.h>
+#include <linux/netdevice.h>
+#include <linux/net.h>
+#include <linux/udp.h>
+#include <linux/ppp_defs.h>
+#include <linux/if_ppp.h>
+#include <linux/if_pppox.h>
+#include <linux/ppp_channel.h>
+#include <net/tcp_states.h>
+#include <asm/uaccess.h>
+
+#define L2TP_CONTROL_BIT 0x80
+#define L2TP_LENGTH_BIT 0x40
+#define L2TP_SEQUENCE_BIT 0x08
+#define L2TP_OFFSET_BIT 0x02
+#define L2TP_VERSION 0x02
+#define L2TP_VERSION_MASK 0x0F
+
+#define PPP_ADDR 0xFF
+#define PPP_CTRL 0x03
+
+union unaligned {
+ __u32 u32;
+} __attribute__((packed));
+
+static inline union unaligned *unaligned(void *ptr)
+{
+ return (union unaligned *)ptr;
+}
+
+struct meta {
+ __u32 sequence;
+ __u32 timestamp;
+};
+
+static inline struct meta *skb_meta(struct sk_buff *skb)
+{
+ return (struct meta *)skb->cb;
+}
+
+/******************************************************************************/
+
+static int pppolac_recv_core(struct sock *sk_udp, struct sk_buff *skb)
+{
+ struct sock *sk = (struct sock *)sk_udp->sk_user_data;
+ struct pppolac_opt *opt = &pppox_sk(sk)->proto.lac;
+ struct meta *meta = skb_meta(skb);
+ __u32 now = jiffies;
+ __u8 bits;
+ __u8 *ptr;
+
+ /* Drop the packet if L2TP header is missing. */
+ if (skb->len < sizeof(struct udphdr) + 6)
+ goto drop;
+
+ /* Put it back if it is a control packet. */
+ if (skb->data[sizeof(struct udphdr)] & L2TP_CONTROL_BIT)
+ return opt->backlog_rcv(sk_udp, skb);
+
+ /* Skip UDP header. */
+ skb_pull(skb, sizeof(struct udphdr));
+
+ /* Check the version. */
+ if ((skb->data[1] & L2TP_VERSION_MASK) != L2TP_VERSION)
+ goto drop;
+ bits = skb->data[0];
+ ptr = &skb->data[2];
+
+ /* Check the length if it is present. */
+ if (bits & L2TP_LENGTH_BIT) {
+ if ((ptr[0] << 8 | ptr[1]) != skb->len)
+ goto drop;
+ ptr += 2;
+ }
+
+ /* Skip all fields including optional ones. */
+ if (!skb_pull(skb, 6 + (bits & L2TP_SEQUENCE_BIT ? 4 : 0) +
+ (bits & L2TP_LENGTH_BIT ? 2 : 0) +
+ (bits & L2TP_OFFSET_BIT ? 2 : 0)))
+ goto drop;
+
+ /* Skip the offset padding if it is present. */
+ if (bits & L2TP_OFFSET_BIT &&
+ !skb_pull(skb, skb->data[-2] << 8 | skb->data[-1]))
+ goto drop;
+
+ /* Check the tunnel and the session. */
+ if (unaligned(ptr)->u32 != opt->local)
+ goto drop;
+
+ /* Check the sequence if it is present. */
+ if (bits & L2TP_SEQUENCE_BIT) {
+ meta->sequence = ptr[4] << 8 | ptr[5];
+ if ((__s16)(meta->sequence - opt->recv_sequence) < 0)
+ goto drop;
+ }
+
+ /* Skip PPP address and control if they are present. */
+ if (skb->len >= 2 && skb->data[0] == PPP_ADDR &&
+ skb->data[1] == PPP_CTRL)
+ skb_pull(skb, 2);
+
+ /* Fix PPP protocol if it is compressed. */
+ if (skb->len >= 1 && skb->data[0] & 1)
+ skb_push(skb, 1)[0] = 0;
+
+ /* Drop the packet if PPP protocol is missing. */
+ if (skb->len < 2)
+ goto drop;
+
+ /* Perform reordering if sequencing is enabled. */
+ atomic_set(&opt->sequencing, bits & L2TP_SEQUENCE_BIT);
+ if (bits & L2TP_SEQUENCE_BIT) {
+ struct sk_buff *skb1;
+
+ /* Insert the packet into receive queue in order. */
+ skb_set_owner_r(skb, sk);
+ skb_queue_walk(&sk->sk_receive_queue, skb1) {
+ struct meta *meta1 = skb_meta(skb1);
+ __s16 order = meta->sequence - meta1->sequence;
+ if (order == 0)
+ goto drop;
+ if (order < 0) {
+ meta->timestamp = meta1->timestamp;
+ skb_insert(skb1, skb, &sk->sk_receive_queue);
+ skb = NULL;
+ break;
+ }
+ }
+ if (skb) {
+ meta->timestamp = now;
+ skb_queue_tail(&sk->sk_receive_queue, skb);
+ }
+
+ /* Remove packets from receive queue as long as
+ * 1. the receive buffer is full,
+ * 2. they are queued longer than one second, or
+ * 3. there are no missing packets before them. */
+ skb_queue_walk_safe(&sk->sk_receive_queue, skb, skb1) {
+ meta = skb_meta(skb);
+ if (atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf &&
+ now - meta->timestamp < HZ &&
+ meta->sequence != opt->recv_sequence)
+ break;
+ skb_unlink(skb, &sk->sk_receive_queue);
+ opt->recv_sequence = (__u16)(meta->sequence + 1);
+ skb_orphan(skb);
+ ppp_input(&pppox_sk(sk)->chan, skb);
+ }
+ return NET_RX_SUCCESS;
+ }
+
+ /* Flush receive queue if sequencing is disabled. */
+ skb_queue_purge(&sk->sk_receive_queue);
+ skb_orphan(skb);
+ ppp_input(&pppox_sk(sk)->chan, skb);
+ return NET_RX_SUCCESS;
+drop:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+static int pppolac_recv(struct sock *sk_udp, struct sk_buff *skb)
+{
+ sock_hold(sk_udp);
+ sk_receive_skb(sk_udp, skb, 0);
+ return 0;
+}
+
+static struct sk_buff_head delivery_queue;
+
+static void pppolac_xmit_core(struct work_struct *delivery_work)
+{
+ mm_segment_t old_fs = get_fs();
+ struct sk_buff *skb;
+
+ set_fs(KERNEL_DS);
+ while ((skb = skb_dequeue(&delivery_queue))) {
+ struct sock *sk_udp = skb->sk;
+ struct kvec iov = {.iov_base = skb->data, .iov_len = skb->len};
+ struct msghdr msg = {
+ .msg_iov = (struct iovec *)&iov,
+ .msg_iovlen = 1,
+ .msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT,
+ };
+ sk_udp->sk_prot->sendmsg(NULL, sk_udp, &msg, skb->len);
+ kfree_skb(skb);
+ }
+ set_fs(old_fs);
+}
+
+static DECLARE_WORK(delivery_work, pppolac_xmit_core);
+
+static int pppolac_xmit(struct ppp_channel *chan, struct sk_buff *skb)
+{
+ struct sock *sk_udp = (struct sock *)chan->private;
+ struct pppolac_opt *opt = &pppox_sk(sk_udp->sk_user_data)->proto.lac;
+
+ /* Install PPP address and control. */
+ skb_push(skb, 2);
+ skb->data[0] = PPP_ADDR;
+ skb->data[1] = PPP_CTRL;
+
+ /* Install L2TP header. */
+ if (atomic_read(&opt->sequencing)) {
+ skb_push(skb, 10);
+ skb->data[0] = L2TP_SEQUENCE_BIT;
+ skb->data[6] = opt->xmit_sequence >> 8;
+ skb->data[7] = opt->xmit_sequence;
+ skb->data[8] = 0;
+ skb->data[9] = 0;
+ opt->xmit_sequence++;
+ } else {
+ skb_push(skb, 6);
+ skb->data[0] = 0;
+ }
+ skb->data[1] = L2TP_VERSION;
+ unaligned(&skb->data[2])->u32 = opt->remote;
+
+ /* Now send the packet via the delivery queue. */
+ skb_set_owner_w(skb, sk_udp);
+ skb_queue_tail(&delivery_queue, skb);
+ schedule_work(&delivery_work);
+ return 1;
+}
+
+/******************************************************************************/
+
+static struct ppp_channel_ops pppolac_channel_ops = {
+ .start_xmit = pppolac_xmit,
+};
+
+static int pppolac_connect(struct socket *sock, struct sockaddr *useraddr,
+ int addrlen, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct pppox_sock *po = pppox_sk(sk);
+ struct sockaddr_pppolac *addr = (struct sockaddr_pppolac *)useraddr;
+ struct socket *sock_udp = NULL;
+ struct sock *sk_udp;
+ int error;
+
+ if (addrlen != sizeof(struct sockaddr_pppolac) ||
+ !addr->local.tunnel || !addr->local.session ||
+ !addr->remote.tunnel || !addr->remote.session) {
+ return -EINVAL;
+ }
+
+ lock_sock(sk);
+ error = -EALREADY;
+ if (sk->sk_state != PPPOX_NONE)
+ goto out;
+
+ sock_udp = sockfd_lookup(addr->udp_socket, &error);
+ if (!sock_udp)
+ goto out;
+ sk_udp = sock_udp->sk;
+ lock_sock(sk_udp);
+
+ /* Remove this check when IPv6 supports UDP encapsulation. */
+ error = -EAFNOSUPPORT;
+ if (sk_udp->sk_family != AF_INET)
+ goto out;
+ error = -EPROTONOSUPPORT;
+ if (sk_udp->sk_protocol != IPPROTO_UDP)
+ goto out;
+ error = -EDESTADDRREQ;
+ if (sk_udp->sk_state != TCP_ESTABLISHED)
+ goto out;
+ error = -EBUSY;
+ if (udp_sk(sk_udp)->encap_type || sk_udp->sk_user_data)
+ goto out;
+ if (!sk_udp->sk_bound_dev_if) {
+ struct dst_entry *dst = sk_dst_get(sk_udp);
+ error = -ENODEV;
+ if (!dst)
+ goto out;
+ sk_udp->sk_bound_dev_if = dst->dev->ifindex;
+ dst_release(dst);
+ }
+
+ po->chan.hdrlen = 12;
+ po->chan.private = sk_udp;
+ po->chan.ops = &pppolac_channel_ops;
+ po->chan.mtu = PPP_MRU - 80;
+ po->proto.lac.local = unaligned(&addr->local)->u32;
+ po->proto.lac.remote = unaligned(&addr->remote)->u32;
+ atomic_set(&po->proto.lac.sequencing, 1);
+ po->proto.lac.backlog_rcv = sk_udp->sk_backlog_rcv;
+
+ error = ppp_register_channel(&po->chan);
+ if (error)
+ goto out;
+
+ sk->sk_state = PPPOX_CONNECTED;
+ udp_sk(sk_udp)->encap_type = UDP_ENCAP_L2TPINUDP;
+ udp_sk(sk_udp)->encap_rcv = pppolac_recv;
+ sk_udp->sk_backlog_rcv = pppolac_recv_core;
+ sk_udp->sk_user_data = sk;
+out:
+ if (sock_udp) {
+ release_sock(sk_udp);
+ if (error)
+ sockfd_put(sock_udp);
+ }
+ release_sock(sk);
+ return error;
+}
+
+static int pppolac_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+
+ if (!sk)
+ return 0;
+
+ lock_sock(sk);
+ if (sock_flag(sk, SOCK_DEAD)) {
+ release_sock(sk);
+ return -EBADF;
+ }
+
+ if (sk->sk_state != PPPOX_NONE) {
+ struct sock *sk_udp = (struct sock *)pppox_sk(sk)->chan.private;
+ lock_sock(sk_udp);
+ skb_queue_purge(&sk->sk_receive_queue);
+ pppox_unbind_sock(sk);
+ udp_sk(sk_udp)->encap_type = 0;
+ udp_sk(sk_udp)->encap_rcv = NULL;
+ sk_udp->sk_backlog_rcv = pppox_sk(sk)->proto.lac.backlog_rcv;
+ sk_udp->sk_user_data = NULL;
+ release_sock(sk_udp);
+ sockfd_put(sk_udp->sk_socket);
+ }
+
+ sock_orphan(sk);
+ sock->sk = NULL;
+ release_sock(sk);
+ sock_put(sk);
+ return 0;
+}
+
+/******************************************************************************/
+
+static struct proto pppolac_proto = {
+ .name = "PPPOLAC",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct pppox_sock),
+};
+
+static struct proto_ops pppolac_proto_ops = {
+ .family = PF_PPPOX,
+ .owner = THIS_MODULE,
+ .release = pppolac_release,
+ .bind = sock_no_bind,
+ .connect = pppolac_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = sock_no_getname,
+ .poll = sock_no_poll,
+ .ioctl = pppox_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = sock_no_setsockopt,
+ .getsockopt = sock_no_getsockopt,
+ .sendmsg = sock_no_sendmsg,
+ .recvmsg = sock_no_recvmsg,
+ .mmap = sock_no_mmap,
+};
+
+static int pppolac_create(struct net *net, struct socket *sock)
+{
+ struct sock *sk;
+
+ sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppolac_proto);
+ if (!sk)
+ return -ENOMEM;
+
+ sock_init_data(sock, sk);
+ sock->state = SS_UNCONNECTED;
+ sock->ops = &pppolac_proto_ops;
+ sk->sk_protocol = PX_PROTO_OLAC;
+ sk->sk_state = PPPOX_NONE;
+ return 0;
+}
+
+/******************************************************************************/
+
+static struct pppox_proto pppolac_pppox_proto = {
+ .create = pppolac_create,
+ .owner = THIS_MODULE,
+};
+
+static int __init pppolac_init(void)
+{
+ int error;
+
+ error = proto_register(&pppolac_proto, 0);
+ if (error)
+ return error;
+
+ error = register_pppox_proto(PX_PROTO_OLAC, &pppolac_pppox_proto);
+ if (error)
+ proto_unregister(&pppolac_proto);
+ else
+ skb_queue_head_init(&delivery_queue);
+ return error;
+}
+
+static void __exit pppolac_exit(void)
+{
+ unregister_pppox_proto(PX_PROTO_OLAC);
+ proto_unregister(&pppolac_proto);
+}
+
+module_init(pppolac_init);
+module_exit(pppolac_exit);
+
+MODULE_DESCRIPTION("PPP on L2TP Access Concentrator (PPPoLAC)");
+MODULE_AUTHOR("Chia-chi Yeh <chiachi@android.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/net/ppp/pppopns.c b/drivers/net/ppp/pppopns.c
new file mode 100644
index 000000000000..dc15f978c922
--- /dev/null
+++ b/drivers/net/ppp/pppopns.c
@@ -0,0 +1,428 @@
+/* drivers/net/pppopns.c
+ *
+ * Driver for PPP on PPTP Network Server / PPPoPNS Socket (RFC 2637)
+ *
+ * Copyright (C) 2009 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+/* This driver handles PPTP data packets between a RAW socket and a PPP channel.
+ * The socket is created in the kernel space and connected to the same address
+ * of the control socket. Outgoing packets are always sent with sequences but
+ * without acknowledgements. Incoming packets with sequences are reordered
+ * within a sliding window of one second. Currently reordering only happens when
+ * a packet is received. It is done for simplicity since no additional locks or
+ * threads are required. This driver should work on both IPv4 and IPv6. */
+
+#include <linux/module.h>
+#include <linux/jiffies.h>
+#include <linux/workqueue.h>
+#include <linux/skbuff.h>
+#include <linux/file.h>
+#include <linux/netdevice.h>
+#include <linux/net.h>
+#include <linux/ppp_defs.h>
+#include <linux/if.h>
+#include <linux/if_ppp.h>
+#include <linux/if_pppox.h>
+#include <linux/ppp_channel.h>
+#include <asm/uaccess.h>
+
+#define GRE_HEADER_SIZE 8
+
+#define PPTP_GRE_BITS htons(0x2001)
+#define PPTP_GRE_BITS_MASK htons(0xEF7F)
+#define PPTP_GRE_SEQ_BIT htons(0x1000)
+#define PPTP_GRE_ACK_BIT htons(0x0080)
+#define PPTP_GRE_TYPE htons(0x880B)
+
+#define PPP_ADDR 0xFF
+#define PPP_CTRL 0x03
+
+struct header {
+ __u16 bits;
+ __u16 type;
+ __u16 length;
+ __u16 call;
+ __u32 sequence;
+} __attribute__((packed));
+
+struct meta {
+ __u32 sequence;
+ __u32 timestamp;
+};
+
+static inline struct meta *skb_meta(struct sk_buff *skb)
+{
+ return (struct meta *)skb->cb;
+}
+
+/******************************************************************************/
+
+static int pppopns_recv_core(struct sock *sk_raw, struct sk_buff *skb)
+{
+ struct sock *sk = (struct sock *)sk_raw->sk_user_data;
+ struct pppopns_opt *opt = &pppox_sk(sk)->proto.pns;
+ struct meta *meta = skb_meta(skb);
+ __u32 now = jiffies;
+ struct header *hdr;
+
+ /* Skip transport header */
+ skb_pull(skb, skb_transport_header(skb) - skb->data);
+
+ /* Drop the packet if GRE header is missing. */
+ if (skb->len < GRE_HEADER_SIZE)
+ goto drop;
+ hdr = (struct header *)skb->data;
+
+ /* Check the header. */
+ if (hdr->type != PPTP_GRE_TYPE || hdr->call != opt->local ||
+ (hdr->bits & PPTP_GRE_BITS_MASK) != PPTP_GRE_BITS)
+ goto drop;
+
+ /* Skip all fields including optional ones. */
+ if (!skb_pull(skb, GRE_HEADER_SIZE +
+ (hdr->bits & PPTP_GRE_SEQ_BIT ? 4 : 0) +
+ (hdr->bits & PPTP_GRE_ACK_BIT ? 4 : 0)))
+ goto drop;
+
+ /* Check the length. */
+ if (skb->len != ntohs(hdr->length))
+ goto drop;
+
+ /* Check the sequence if it is present. */
+ if (hdr->bits & PPTP_GRE_SEQ_BIT) {
+ meta->sequence = ntohl(hdr->sequence);
+ if ((__s32)(meta->sequence - opt->recv_sequence) < 0)
+ goto drop;
+ }
+
+ /* Skip PPP address and control if they are present. */
+ if (skb->len >= 2 && skb->data[0] == PPP_ADDR &&
+ skb->data[1] == PPP_CTRL)
+ skb_pull(skb, 2);
+
+ /* Fix PPP protocol if it is compressed. */
+ if (skb->len >= 1 && skb->data[0] & 1)
+ skb_push(skb, 1)[0] = 0;
+
+ /* Drop the packet if PPP protocol is missing. */
+ if (skb->len < 2)
+ goto drop;
+
+ /* Perform reordering if sequencing is enabled. */
+ if (hdr->bits & PPTP_GRE_SEQ_BIT) {
+ struct sk_buff *skb1;
+
+ /* Insert the packet into receive queue in order. */
+ skb_set_owner_r(skb, sk);
+ skb_queue_walk(&sk->sk_receive_queue, skb1) {
+ struct meta *meta1 = skb_meta(skb1);
+ __s32 order = meta->sequence - meta1->sequence;
+ if (order == 0)
+ goto drop;
+ if (order < 0) {
+ meta->timestamp = meta1->timestamp;
+ skb_insert(skb1, skb, &sk->sk_receive_queue);
+ skb = NULL;
+ break;
+ }
+ }
+ if (skb) {
+ meta->timestamp = now;
+ skb_queue_tail(&sk->sk_receive_queue, skb);
+ }
+
+ /* Remove packets from receive queue as long as
+ * 1. the receive buffer is full,
+ * 2. they are queued longer than one second, or
+ * 3. there are no missing packets before them. */
+ skb_queue_walk_safe(&sk->sk_receive_queue, skb, skb1) {
+ meta = skb_meta(skb);
+ if (atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf &&
+ now - meta->timestamp < HZ &&
+ meta->sequence != opt->recv_sequence)
+ break;
+ skb_unlink(skb, &sk->sk_receive_queue);
+ opt->recv_sequence = meta->sequence + 1;
+ skb_orphan(skb);
+ ppp_input(&pppox_sk(sk)->chan, skb);
+ }
+ return NET_RX_SUCCESS;
+ }
+
+ /* Flush receive queue if sequencing is disabled. */
+ skb_queue_purge(&sk->sk_receive_queue);
+ skb_orphan(skb);
+ ppp_input(&pppox_sk(sk)->chan, skb);
+ return NET_RX_SUCCESS;
+drop:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+static void pppopns_recv(struct sock *sk_raw)
+{
+ struct sk_buff *skb;
+ while ((skb = skb_dequeue(&sk_raw->sk_receive_queue))) {
+ sock_hold(sk_raw);
+ sk_receive_skb(sk_raw, skb, 0);
+ }
+}
+
+static struct sk_buff_head delivery_queue;
+
+static void pppopns_xmit_core(struct work_struct *delivery_work)
+{
+ mm_segment_t old_fs = get_fs();
+ struct sk_buff *skb;
+
+ set_fs(KERNEL_DS);
+ while ((skb = skb_dequeue(&delivery_queue))) {
+ struct sock *sk_raw = skb->sk;
+ struct kvec iov = {.iov_base = skb->data, .iov_len = skb->len};
+ struct msghdr msg = {
+ .msg_iov = (struct iovec *)&iov,
+ .msg_iovlen = 1,
+ .msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT,
+ };
+ sk_raw->sk_prot->sendmsg(NULL, sk_raw, &msg, skb->len);
+ kfree_skb(skb);
+ }
+ set_fs(old_fs);
+}
+
+static DECLARE_WORK(delivery_work, pppopns_xmit_core);
+
+static int pppopns_xmit(struct ppp_channel *chan, struct sk_buff *skb)
+{
+ struct sock *sk_raw = (struct sock *)chan->private;
+ struct pppopns_opt *opt = &pppox_sk(sk_raw->sk_user_data)->proto.pns;
+ struct header *hdr;
+ __u16 length;
+
+ /* Install PPP address and control. */
+ skb_push(skb, 2);
+ skb->data[0] = PPP_ADDR;
+ skb->data[1] = PPP_CTRL;
+ length = skb->len;
+
+ /* Install PPTP GRE header. */
+ hdr = (struct header *)skb_push(skb, 12);
+ hdr->bits = PPTP_GRE_BITS | PPTP_GRE_SEQ_BIT;
+ hdr->type = PPTP_GRE_TYPE;
+ hdr->length = htons(length);
+ hdr->call = opt->remote;
+ hdr->sequence = htonl(opt->xmit_sequence);
+ opt->xmit_sequence++;
+
+ /* Now send the packet via the delivery queue. */
+ skb_set_owner_w(skb, sk_raw);
+ skb_queue_tail(&delivery_queue, skb);
+ schedule_work(&delivery_work);
+ return 1;
+}
+
+/******************************************************************************/
+
+static struct ppp_channel_ops pppopns_channel_ops = {
+ .start_xmit = pppopns_xmit,
+};
+
+static int pppopns_connect(struct socket *sock, struct sockaddr *useraddr,
+ int addrlen, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct pppox_sock *po = pppox_sk(sk);
+ struct sockaddr_pppopns *addr = (struct sockaddr_pppopns *)useraddr;
+ struct sockaddr_storage ss;
+ struct socket *sock_tcp = NULL;
+ struct socket *sock_raw = NULL;
+ struct sock *sk_tcp;
+ struct sock *sk_raw;
+ int error;
+
+ if (addrlen != sizeof(struct sockaddr_pppopns))
+ return -EINVAL;
+
+ lock_sock(sk);
+ error = -EALREADY;
+ if (sk->sk_state != PPPOX_NONE)
+ goto out;
+
+ sock_tcp = sockfd_lookup(addr->tcp_socket, &error);
+ if (!sock_tcp)
+ goto out;
+ sk_tcp = sock_tcp->sk;
+ error = -EPROTONOSUPPORT;
+ if (sk_tcp->sk_protocol != IPPROTO_TCP)
+ goto out;
+ addrlen = sizeof(struct sockaddr_storage);
+ error = kernel_getpeername(sock_tcp, (struct sockaddr *)&ss, &addrlen);
+ if (error)
+ goto out;
+ if (!sk_tcp->sk_bound_dev_if) {
+ struct dst_entry *dst = sk_dst_get(sk_tcp);
+ error = -ENODEV;
+ if (!dst)
+ goto out;
+ sk_tcp->sk_bound_dev_if = dst->dev->ifindex;
+ dst_release(dst);
+ }
+
+ error = sock_create(ss.ss_family, SOCK_RAW, IPPROTO_GRE, &sock_raw);
+ if (error)
+ goto out;
+ sk_raw = sock_raw->sk;
+ sk_raw->sk_bound_dev_if = sk_tcp->sk_bound_dev_if;
+ error = kernel_connect(sock_raw, (struct sockaddr *)&ss, addrlen, 0);
+ if (error)
+ goto out;
+
+ po->chan.hdrlen = 14;
+ po->chan.private = sk_raw;
+ po->chan.ops = &pppopns_channel_ops;
+ po->chan.mtu = PPP_MRU - 80;
+ po->proto.pns.local = addr->local;
+ po->proto.pns.remote = addr->remote;
+ po->proto.pns.data_ready = sk_raw->sk_data_ready;
+ po->proto.pns.backlog_rcv = sk_raw->sk_backlog_rcv;
+
+ error = ppp_register_channel(&po->chan);
+ if (error)
+ goto out;
+
+ sk->sk_state = PPPOX_CONNECTED;
+ lock_sock(sk_raw);
+ sk_raw->sk_data_ready = pppopns_recv;
+ sk_raw->sk_backlog_rcv = pppopns_recv_core;
+ sk_raw->sk_user_data = sk;
+ release_sock(sk_raw);
+out:
+ if (sock_tcp)
+ sockfd_put(sock_tcp);
+ if (error && sock_raw)
+ sock_release(sock_raw);
+ release_sock(sk);
+ return error;
+}
+
+static int pppopns_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+
+ if (!sk)
+ return 0;
+
+ lock_sock(sk);
+ if (sock_flag(sk, SOCK_DEAD)) {
+ release_sock(sk);
+ return -EBADF;
+ }
+
+ if (sk->sk_state != PPPOX_NONE) {
+ struct sock *sk_raw = (struct sock *)pppox_sk(sk)->chan.private;
+ lock_sock(sk_raw);
+ skb_queue_purge(&sk->sk_receive_queue);
+ pppox_unbind_sock(sk);
+ sk_raw->sk_data_ready = pppox_sk(sk)->proto.pns.data_ready;
+ sk_raw->sk_backlog_rcv = pppox_sk(sk)->proto.pns.backlog_rcv;
+ sk_raw->sk_user_data = NULL;
+ release_sock(sk_raw);
+ sock_release(sk_raw->sk_socket);
+ }
+
+ sock_orphan(sk);
+ sock->sk = NULL;
+ release_sock(sk);
+ sock_put(sk);
+ return 0;
+}
+
+/******************************************************************************/
+
+static struct proto pppopns_proto = {
+ .name = "PPPOPNS",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct pppox_sock),
+};
+
+static struct proto_ops pppopns_proto_ops = {
+ .family = PF_PPPOX,
+ .owner = THIS_MODULE,
+ .release = pppopns_release,
+ .bind = sock_no_bind,
+ .connect = pppopns_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = sock_no_getname,
+ .poll = sock_no_poll,
+ .ioctl = pppox_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = sock_no_setsockopt,
+ .getsockopt = sock_no_getsockopt,
+ .sendmsg = sock_no_sendmsg,
+ .recvmsg = sock_no_recvmsg,
+ .mmap = sock_no_mmap,
+};
+
+static int pppopns_create(struct net *net, struct socket *sock)
+{
+ struct sock *sk;
+
+ sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppopns_proto);
+ if (!sk)
+ return -ENOMEM;
+
+ sock_init_data(sock, sk);
+ sock->state = SS_UNCONNECTED;
+ sock->ops = &pppopns_proto_ops;
+ sk->sk_protocol = PX_PROTO_OPNS;
+ sk->sk_state = PPPOX_NONE;
+ return 0;
+}
+
+/******************************************************************************/
+
+static struct pppox_proto pppopns_pppox_proto = {
+ .create = pppopns_create,
+ .owner = THIS_MODULE,
+};
+
+static int __init pppopns_init(void)
+{
+ int error;
+
+ error = proto_register(&pppopns_proto, 0);
+ if (error)
+ return error;
+
+ error = register_pppox_proto(PX_PROTO_OPNS, &pppopns_pppox_proto);
+ if (error)
+ proto_unregister(&pppopns_proto);
+ else
+ skb_queue_head_init(&delivery_queue);
+ return error;
+}
+
+static void __exit pppopns_exit(void)
+{
+ unregister_pppox_proto(PX_PROTO_OPNS);
+ proto_unregister(&pppopns_proto);
+}
+
+module_init(pppopns_init);
+module_exit(pppopns_exit);
+
+MODULE_DESCRIPTION("PPP on PPTP Network Server (PPPoPNS)");
+MODULE_AUTHOR("Chia-chi Yeh <chiachi@android.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index a4685a22f665..362477376145 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1881,6 +1881,12 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
unsigned int ifindex;
int ret;
+#ifdef CONFIG_ANDROID_PARANOID_NETWORK
+ if (cmd != TUNGETIFF && !capable(CAP_NET_ADMIN)) {
+ return -EPERM;
+ }
+#endif
+
if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || _IOC_TYPE(cmd) == 0x89) {
if (copy_from_user(&ifr, argp, ifreq_len))
return -EFAULT;
diff --git a/drivers/net/wireless/ti/wilink_platform_data.c b/drivers/net/wireless/ti/wilink_platform_data.c
index a92bd3e89796..ea0e359bdb43 100644
--- a/drivers/net/wireless/ti/wilink_platform_data.c
+++ b/drivers/net/wireless/ti/wilink_platform_data.c
@@ -23,31 +23,6 @@
#include <linux/err.h>
#include <linux/wl12xx.h>
-static struct wl12xx_platform_data *wl12xx_platform_data;
-
-int __init wl12xx_set_platform_data(const struct wl12xx_platform_data *data)
-{
- if (wl12xx_platform_data)
- return -EBUSY;
- if (!data)
- return -EINVAL;
-
- wl12xx_platform_data = kmemdup(data, sizeof(*data), GFP_KERNEL);
- if (!wl12xx_platform_data)
- return -ENOMEM;
-
- return 0;
-}
-
-struct wl12xx_platform_data *wl12xx_get_platform_data(void)
-{
- if (!wl12xx_platform_data)
- return ERR_PTR(-ENODEV);
-
- return wl12xx_platform_data;
-}
-EXPORT_SYMBOL(wl12xx_get_platform_data);
-
static struct wl1251_platform_data *wl1251_platform_data;
int __init wl1251_set_platform_data(const struct wl1251_platform_data *data)
diff --git a/drivers/net/wireless/ti/wl12xx/main.c b/drivers/net/wireless/ti/wl12xx/main.c
index 0bccf123831e..a9167cd19de3 100644
--- a/drivers/net/wireless/ti/wl12xx/main.c
+++ b/drivers/net/wireless/ti/wl12xx/main.c
@@ -24,8 +24,6 @@
#include <linux/err.h>
-#include <linux/wl12xx.h>
-
#include "../wlcore/wlcore.h"
#include "../wlcore/debug.h"
#include "../wlcore/io.h"
@@ -1766,11 +1764,44 @@ wl12xx_iface_combinations[] = {
},
};
+static const struct wl12xx_clock wl12xx_refclock_table[] = {
+ { 19200000, false, WL12XX_REFCLOCK_19 },
+ { 26000000, false, WL12XX_REFCLOCK_26 },
+ { 26000000, true, WL12XX_REFCLOCK_26_XTAL },
+ { 38400000, false, WL12XX_REFCLOCK_38 },
+ { 38400000, true, WL12XX_REFCLOCK_38_XTAL },
+ { 52000000, false, WL12XX_REFCLOCK_52 },
+ { 0, false, 0 }
+};
+
+static const struct wl12xx_clock wl12xx_tcxoclock_table[] = {
+ { 16368000, true, WL12XX_TCXOCLOCK_16_368 },
+ { 16800000, true, WL12XX_TCXOCLOCK_16_8 },
+ { 19200000, true, WL12XX_TCXOCLOCK_19_2 },
+ { 26000000, true, WL12XX_TCXOCLOCK_26 },
+ { 32736000, true, WL12XX_TCXOCLOCK_32_736 },
+ { 33600000, true, WL12XX_TCXOCLOCK_33_6 },
+ { 38400000, true, WL12XX_TCXOCLOCK_38_4 },
+ { 52000000, true, WL12XX_TCXOCLOCK_52 },
+ { 0, false, 0 }
+};
+
+static int wl12xx_get_clock_idx(const struct wl12xx_clock *table,
+ u32 freq, bool xtal)
+{
+ int i;
+
+ for (i = 0; table[i].freq != 0; i++)
+ if ((table[i].freq == freq) && (table[i].xtal == xtal))
+ return table[i].hw_idx;
+
+ return -EINVAL;
+}
+
static int wl12xx_setup(struct wl1271 *wl)
{
struct wl12xx_priv *priv = wl->priv;
struct wlcore_platdev_data *pdev_data = dev_get_platdata(&wl->pdev->dev);
- struct wl12xx_platform_data *pdata = pdev_data->pdata;
BUILD_BUG_ON(WL12XX_MAX_LINKS > WLCORE_MAX_LINKS);
BUILD_BUG_ON(WL12XX_MAX_AP_STATIONS > WL12XX_MAX_LINKS);
@@ -1795,7 +1826,17 @@ static int wl12xx_setup(struct wl1271 *wl)
wl12xx_conf_init(wl);
if (!fref_param) {
- priv->ref_clock = pdata->board_ref_clock;
+ priv->ref_clock = wl12xx_get_clock_idx(wl12xx_refclock_table,
+ pdev_data->ref_clock_freq,
+ pdev_data->ref_clock_xtal);
+ if (priv->ref_clock < 0) {
+ wl1271_error("Invalid ref_clock frequency (%d Hz, %s)",
+ pdev_data->ref_clock_freq,
+ pdev_data->ref_clock_xtal ?
+ "XTAL" : "not XTAL");
+
+ return priv->ref_clock;
+ }
} else {
if (!strcmp(fref_param, "19.2"))
priv->ref_clock = WL12XX_REFCLOCK_19;
@@ -1813,9 +1854,17 @@ static int wl12xx_setup(struct wl1271 *wl)
wl1271_error("Invalid fref parameter %s", fref_param);
}
- if (!tcxo_param) {
- priv->tcxo_clock = pdata->board_tcxo_clock;
- } else {
+ if (!tcxo_param && pdev_data->tcxo_clock_freq) {
+ priv->tcxo_clock = wl12xx_get_clock_idx(wl12xx_tcxoclock_table,
+ pdev_data->tcxo_clock_freq,
+ true);
+ if (priv->tcxo_clock < 0) {
+ wl1271_error("Invalid tcxo_clock frequency (%d Hz)",
+ pdev_data->tcxo_clock_freq);
+
+ return priv->tcxo_clock;
+ }
+ } else if (tcxo_param) {
if (!strcmp(tcxo_param, "19.2"))
priv->tcxo_clock = WL12XX_TCXOCLOCK_19_2;
else if (!strcmp(tcxo_param, "26"))
diff --git a/drivers/net/wireless/ti/wl12xx/scan.c b/drivers/net/wireless/ti/wl12xx/scan.c
index 0c0d5cd98514..7c355fff2c5e 100644
--- a/drivers/net/wireless/ti/wl12xx/scan.c
+++ b/drivers/net/wireless/ti/wl12xx/scan.c
@@ -118,7 +118,11 @@ static int wl1271_scan_send(struct wl1271 *wl, struct wl12xx_vif *wlvif,
if (passive)
scan_options |= WL1271_SCAN_OPT_PASSIVE;
- cmd->params.role_id = wlvif->role_id;
+ /* scan on the dev role if the regular one is not started */
+ if (wlcore_is_p2p_mgmt(wlvif))
+ cmd->params.role_id = wlvif->dev_role_id;
+ else
+ cmd->params.role_id = wlvif->role_id;
if (WARN_ON(cmd->params.role_id == WL12XX_INVALID_ROLE_ID)) {
ret = -EINVAL;
diff --git a/drivers/net/wireless/ti/wl12xx/wl12xx.h b/drivers/net/wireless/ti/wl12xx/wl12xx.h
index 75c92658bfea..5952e99ace1b 100644
--- a/drivers/net/wireless/ti/wl12xx/wl12xx.h
+++ b/drivers/net/wireless/ti/wl12xx/wl12xx.h
@@ -82,6 +82,34 @@ struct wl12xx_priv {
struct wl127x_rx_mem_pool_addr *rx_mem_addr;
};
+/* Reference clock values */
+enum {
+ WL12XX_REFCLOCK_19 = 0, /* 19.2 MHz */
+ WL12XX_REFCLOCK_26 = 1, /* 26 MHz */
+ WL12XX_REFCLOCK_38 = 2, /* 38.4 MHz */
+ WL12XX_REFCLOCK_52 = 3, /* 52 MHz */
+ WL12XX_REFCLOCK_38_XTAL = 4, /* 38.4 MHz, XTAL */
+ WL12XX_REFCLOCK_26_XTAL = 5, /* 26 MHz, XTAL */
+};
+
+/* TCXO clock values */
+enum {
+ WL12XX_TCXOCLOCK_19_2 = 0, /* 19.2MHz */
+ WL12XX_TCXOCLOCK_26 = 1, /* 26 MHz */
+ WL12XX_TCXOCLOCK_38_4 = 2, /* 38.4MHz */
+ WL12XX_TCXOCLOCK_52 = 3, /* 52 MHz */
+ WL12XX_TCXOCLOCK_16_368 = 4, /* 16.368 MHz */
+ WL12XX_TCXOCLOCK_32_736 = 5, /* 32.736 MHz */
+ WL12XX_TCXOCLOCK_16_8 = 6, /* 16.8 MHz */
+ WL12XX_TCXOCLOCK_33_6 = 7, /* 33.6 MHz */
+};
+
+struct wl12xx_clock {
+ u32 freq;
+ bool xtal;
+ u8 hw_idx;
+};
+
struct wl12xx_fw_packet_counters {
/* Cumulative counter of released packets per AC */
u8 tx_released_pkts[NUM_TX_QUEUES];
diff --git a/drivers/net/wireless/ti/wl18xx/cmd.c b/drivers/net/wireless/ti/wl18xx/cmd.c
index 44f0b205b065..10f9d1c064ba 100644
--- a/drivers/net/wireless/ti/wl18xx/cmd.c
+++ b/drivers/net/wireless/ti/wl18xx/cmd.c
@@ -167,3 +167,34 @@ out_free:
out:
return ret;
}
+
+int wl18xx_cmd_set_cac(struct wl1271 *wl, struct wl12xx_vif *wlvif, bool start)
+{
+ struct wlcore_cmd_cac_start *cmd;
+ int ret = 0;
+
+ wl1271_debug(DEBUG_CMD, "cmd cac (channel %d) %s",
+ wlvif->channel, start ? "start" : "stop");
+
+ cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+ if (!cmd)
+ return -ENOMEM;
+
+ cmd->role_id = wlvif->role_id;
+ cmd->channel = wlvif->channel;
+ if (wlvif->band == IEEE80211_BAND_5GHZ)
+ cmd->band = WLCORE_BAND_5GHZ;
+ cmd->bandwidth = wlcore_get_native_channel_type(wlvif->channel_type);
+
+ ret = wl1271_cmd_send(wl,
+ start ? CMD_CAC_START : CMD_CAC_STOP,
+ cmd, sizeof(*cmd), 0);
+ if (ret < 0) {
+ wl1271_error("failed to send cac command");
+ goto out_free;
+ }
+
+out_free:
+ kfree(cmd);
+ return ret;
+}
diff --git a/drivers/net/wireless/ti/wl18xx/cmd.h b/drivers/net/wireless/ti/wl18xx/cmd.h
index 92499e2dfa83..91b3e2fe77cd 100644
--- a/drivers/net/wireless/ti/wl18xx/cmd.h
+++ b/drivers/net/wireless/ti/wl18xx/cmd.h
@@ -59,6 +59,16 @@ struct wl18xx_cmd_smart_config_set_group_key {
u8 key[16];
} __packed;
+/* cac_start and cac_stop share the same params */
+struct wlcore_cmd_cac_start {
+ struct wl1271_cmd_header header;
+
+ u8 role_id;
+ u8 channel;
+ u8 band;
+ u8 bandwidth;
+} __packed;
+
int wl18xx_cmd_channel_switch(struct wl1271 *wl,
struct wl12xx_vif *wlvif,
struct ieee80211_channel_switch *ch_switch);
@@ -66,4 +76,5 @@ int wl18xx_cmd_smart_config_start(struct wl1271 *wl, u32 group_bitmap);
int wl18xx_cmd_smart_config_stop(struct wl1271 *wl);
int wl18xx_cmd_smart_config_set_group_key(struct wl1271 *wl, u16 group_id,
u8 key_len, u8 *key);
+int wl18xx_cmd_set_cac(struct wl1271 *wl, struct wl12xx_vif *wlvif, bool start);
#endif
diff --git a/drivers/net/wireless/ti/wl18xx/event.c b/drivers/net/wireless/ti/wl18xx/event.c
index eb1848e08424..c28f06854195 100644
--- a/drivers/net/wireless/ti/wl18xx/event.c
+++ b/drivers/net/wireless/ti/wl18xx/event.c
@@ -47,6 +47,19 @@ int wl18xx_wait_for_event(struct wl1271 *wl, enum wlcore_wait_event event,
return wlcore_cmd_wait_for_event_or_timeout(wl, local_event, timeout);
}
+static const char *wl18xx_radar_type_decode(u8 radar_type)
+{
+ switch (radar_type) {
+ case RADAR_TYPE_REGULAR:
+ return "REGULAR";
+ case RADAR_TYPE_CHIRP:
+ return "CHIRP";
+ case RADAR_TYPE_NONE:
+ default:
+ return "N/A";
+ }
+}
+
static int wlcore_smart_config_sync_event(struct wl1271 *wl, u8 sync_channel,
u8 sync_band)
{
@@ -115,6 +128,14 @@ int wl18xx_process_mailbox_events(struct wl1271 *wl)
wl18xx_scan_completed(wl, wl->scan_wlvif);
}
+ if (vector & RADAR_DETECTED_EVENT_ID) {
+ wl1271_info("radar event: channel %d type %s",
+ mbox->radar_channel,
+ wl18xx_radar_type_decode(mbox->radar_type));
+
+ ieee80211_radar_detected(wl->hw);
+ }
+
if (vector & PERIODIC_SCAN_REPORT_EVENT_ID) {
wl1271_debug(DEBUG_EVENT,
"PERIODIC_SCAN_REPORT_EVENT (results %d)",
diff --git a/drivers/net/wireless/ti/wl18xx/event.h b/drivers/net/wireless/ti/wl18xx/event.h
index 0680312d4943..266ee87834e4 100644
--- a/drivers/net/wireless/ti/wl18xx/event.h
+++ b/drivers/net/wireless/ti/wl18xx/event.h
@@ -42,6 +42,12 @@ enum {
SMART_CONFIG_DECODE_EVENT_ID = BIT(23),
};
+enum wl18xx_radar_types {
+ RADAR_TYPE_NONE,
+ RADAR_TYPE_REGULAR,
+ RADAR_TYPE_CHIRP
+};
+
struct wl18xx_event_mailbox {
__le32 events_vector;
@@ -83,13 +89,19 @@ struct wl18xx_event_mailbox {
u8 sc_token_len;
u8 padding1;
u8 sc_ssid[32];
- u8 sc_pwd[32];
+ u8 sc_pwd[64];
u8 sc_token[32];
/* smart config sync channel */
u8 sc_sync_channel;
u8 sc_sync_band;
u8 padding2[2];
+
+ /* radar detect */
+ u8 radar_channel;
+ u8 radar_type;
+
+ u8 padding3[2];
} __packed;
int wl18xx_wait_for_event(struct wl1271 *wl, enum wlcore_wait_event event,
diff --git a/drivers/net/wireless/ti/wl18xx/main.c b/drivers/net/wireless/ti/wl18xx/main.c
index 7af1936719eb..98cb9a2205d1 100644
--- a/drivers/net/wireless/ti/wl18xx/main.c
+++ b/drivers/net/wireless/ti/wl18xx/main.c
@@ -648,7 +648,7 @@ static const struct wl18xx_clk_cfg wl18xx_clk_table[NUM_CLOCK_CONFIGS] = {
};
/* TODO: maybe move to a new header file? */
-#define WL18XX_FW_NAME "ti-connectivity/wl18xx-fw-3.bin"
+#define WL18XX_FW_NAME "ti-connectivity/wl18xx-fw-4.bin"
static int wl18xx_identify_chip(struct wl1271 *wl)
{
@@ -983,6 +983,7 @@ static int wl18xx_boot(struct wl1271 *wl)
wl->event_mask = BSS_LOSS_EVENT_ID |
SCAN_COMPLETE_EVENT_ID |
+ RADAR_DETECTED_EVENT_ID |
RSSI_SNR_TRIGGER_0_EVENT_ID |
PERIODIC_SCAN_COMPLETE_EVENT_ID |
PERIODIC_SCAN_REPORT_EVENT_ID |
@@ -1343,9 +1344,10 @@ out:
}
#define WL18XX_CONF_FILE_NAME "ti-connectivity/wl18xx-conf.bin"
-static int wl18xx_conf_init(struct wl1271 *wl, struct device *dev)
+
+static int wl18xx_load_conf_file(struct device *dev, struct wlcore_conf *conf,
+ struct wl18xx_priv_conf *priv_conf)
{
- struct wl18xx_priv *priv = wl->priv;
struct wlcore_conf_file *conf_file;
const struct firmware *fw;
int ret;
@@ -1354,14 +1356,14 @@ static int wl18xx_conf_init(struct wl1271 *wl, struct device *dev)
if (ret < 0) {
wl1271_error("could not get configuration binary %s: %d",
WL18XX_CONF_FILE_NAME, ret);
- goto out_fallback;
+ return ret;
}
if (fw->size != WL18XX_CONF_SIZE) {
wl1271_error("configuration binary file size is wrong, expected %zu got %zu",
WL18XX_CONF_SIZE, fw->size);
ret = -EINVAL;
- goto out;
+ goto out_release;
}
conf_file = (struct wlcore_conf_file *) fw->data;
@@ -1371,7 +1373,7 @@ static int wl18xx_conf_init(struct wl1271 *wl, struct device *dev)
"expected 0x%0x got 0x%0x", WL18XX_CONF_MAGIC,
conf_file->header.magic);
ret = -EINVAL;
- goto out;
+ goto out_release;
}
if (conf_file->header.version != cpu_to_le32(WL18XX_CONF_VERSION)) {
@@ -1379,28 +1381,32 @@ static int wl18xx_conf_init(struct wl1271 *wl, struct device *dev)
"expected 0x%08x got 0x%08x",
WL18XX_CONF_VERSION, conf_file->header.version);
ret = -EINVAL;
- goto out;
+ goto out_release;
}
- memcpy(&wl->conf, &conf_file->core, sizeof(wl18xx_conf));
- memcpy(&priv->conf, &conf_file->priv, sizeof(priv->conf));
+ memcpy(conf, &conf_file->core, sizeof(*conf));
+ memcpy(priv_conf, &conf_file->priv, sizeof(*priv_conf));
- goto out;
+out_release:
+ release_firmware(fw);
+ return ret;
+}
-out_fallback:
- wl1271_warning("falling back to default config");
+static int wl18xx_conf_init(struct wl1271 *wl, struct device *dev)
+{
+ struct wl18xx_priv *priv = wl->priv;
- /* apply driver default configuration */
- memcpy(&wl->conf, &wl18xx_conf, sizeof(wl18xx_conf));
- /* apply default private configuration */
- memcpy(&priv->conf, &wl18xx_default_priv_conf, sizeof(priv->conf));
+ if (wl18xx_load_conf_file(dev, &wl->conf, &priv->conf) < 0) {
+ wl1271_warning("falling back to default config");
- /* For now we just fallback */
- return 0;
+ /* apply driver default configuration */
+ memcpy(&wl->conf, &wl18xx_conf, sizeof(wl->conf));
+ /* apply default private configuration */
+ memcpy(&priv->conf, &wl18xx_default_priv_conf,
+ sizeof(priv->conf));
+ }
-out:
- release_firmware(fw);
- return ret;
+ return 0;
}
static int wl18xx_plt_init(struct wl1271 *wl)
@@ -1703,6 +1709,7 @@ static struct wlcore_ops wl18xx_ops = {
.smart_config_start = wl18xx_cmd_smart_config_start,
.smart_config_stop = wl18xx_cmd_smart_config_stop,
.smart_config_set_group_key = wl18xx_cmd_smart_config_set_group_key,
+ .set_cac = wl18xx_cmd_set_cac,
};
/* HT cap appropriate for wide channels in 2Ghz */
@@ -1765,7 +1772,7 @@ static struct ieee80211_sta_ht_cap wl18xx_mimo_ht_cap_2ghz = {
static const struct ieee80211_iface_limit wl18xx_iface_limits[] = {
{
- .max = 3,
+ .max = 2,
.types = BIT(NL80211_IFTYPE_STATION),
},
{
@@ -1774,6 +1781,10 @@ static const struct ieee80211_iface_limit wl18xx_iface_limits[] = {
BIT(NL80211_IFTYPE_P2P_GO) |
BIT(NL80211_IFTYPE_P2P_CLIENT),
},
+ {
+ .max = 1,
+ .types = BIT(NL80211_IFTYPE_P2P_DEVICE),
+ },
};
static const struct ieee80211_iface_limit wl18xx_iface_ap_limits[] = {
@@ -1781,6 +1792,48 @@ static const struct ieee80211_iface_limit wl18xx_iface_ap_limits[] = {
.max = 2,
.types = BIT(NL80211_IFTYPE_AP),
},
+ {
+ .max = 1,
+ .types = BIT(NL80211_IFTYPE_P2P_DEVICE),
+ },
+};
+
+static const struct ieee80211_iface_limit wl18xx_iface_ap_cl_limits[] = {
+ {
+ .max = 1,
+ .types = BIT(NL80211_IFTYPE_STATION),
+ },
+ {
+ .max = 1,
+ .types = BIT(NL80211_IFTYPE_AP),
+ },
+ {
+ .max = 1,
+ .types = BIT(NL80211_IFTYPE_P2P_CLIENT),
+ },
+ {
+ .max = 1,
+ .types = BIT(NL80211_IFTYPE_P2P_DEVICE),
+ },
+};
+
+static const struct ieee80211_iface_limit wl18xx_iface_ap_go_limits[] = {
+ {
+ .max = 1,
+ .types = BIT(NL80211_IFTYPE_STATION),
+ },
+ {
+ .max = 1,
+ .types = BIT(NL80211_IFTYPE_AP),
+ },
+ {
+ .max = 1,
+ .types = BIT(NL80211_IFTYPE_P2P_GO),
+ },
+ {
+ .max = 1,
+ .types = BIT(NL80211_IFTYPE_P2P_DEVICE),
+ },
};
static const struct ieee80211_iface_combination
diff --git a/drivers/net/wireless/ti/wl18xx/scan.c b/drivers/net/wireless/ti/wl18xx/scan.c
index 98666f235a12..a733ca90ae64 100644
--- a/drivers/net/wireless/ti/wl18xx/scan.c
+++ b/drivers/net/wireless/ti/wl18xx/scan.c
@@ -51,7 +51,11 @@ static int wl18xx_scan_send(struct wl1271 *wl, struct wl12xx_vif *wlvif,
goto out;
}
- cmd->role_id = wlvif->role_id;
+ /* scan on the dev role if the regular one is not started */
+ if (wlcore_is_p2p_mgmt(wlvif))
+ cmd->role_id = wlvif->dev_role_id;
+ else
+ cmd->role_id = wlvif->role_id;
if (WARN_ON(cmd->role_id == WL12XX_INVALID_ROLE_ID)) {
ret = -EINVAL;
diff --git a/drivers/net/wireless/ti/wl18xx/wl18xx.h b/drivers/net/wireless/ti/wl18xx/wl18xx.h
index 6a2b88030c1d..71e9e382ce80 100644
--- a/drivers/net/wireless/ti/wl18xx/wl18xx.h
+++ b/drivers/net/wireless/ti/wl18xx/wl18xx.h
@@ -26,10 +26,10 @@
/* minimum FW required for driver */
#define WL18XX_CHIP_VER 8
-#define WL18XX_IFTYPE_VER 8
+#define WL18XX_IFTYPE_VER 9
#define WL18XX_MAJOR_VER WLCORE_FW_VER_IGNORE
#define WL18XX_SUBTYPE_VER WLCORE_FW_VER_IGNORE
-#define WL18XX_MINOR_VER 13
+#define WL18XX_MINOR_VER 11
#define WL18XX_CMD_MAX_SIZE 740
diff --git a/drivers/net/wireless/ti/wlcore/boot.c b/drivers/net/wireless/ti/wlcore/boot.c
index 77752b03f189..19b7ec7b69c2 100644
--- a/drivers/net/wireless/ti/wlcore/boot.c
+++ b/drivers/net/wireless/ti/wlcore/boot.c
@@ -22,7 +22,6 @@
*/
#include <linux/slab.h>
-#include <linux/wl12xx.h>
#include <linux/export.h>
#include "debug.h"
diff --git a/drivers/net/wireless/ti/wlcore/cmd.c b/drivers/net/wireless/ti/wlcore/cmd.c
index 05604ee31224..57dad571628c 100644
--- a/drivers/net/wireless/ti/wlcore/cmd.c
+++ b/drivers/net/wireless/ti/wlcore/cmd.c
@@ -399,7 +399,7 @@ void wl12xx_free_link(struct wl1271 *wl, struct wl12xx_vif *wlvif, u8 *hlid)
WARN_ON_ONCE(wl->active_link_count < 0);
}
-static u8 wlcore_get_native_channel_type(u8 nl_channel_type)
+u8 wlcore_get_native_channel_type(u8 nl_channel_type)
{
switch (nl_channel_type) {
case NL80211_CHAN_NO_HT:
@@ -415,6 +415,7 @@ static u8 wlcore_get_native_channel_type(u8 nl_channel_type)
return WLCORE_CHAN_NO_HT;
}
}
+EXPORT_SYMBOL_GPL(wlcore_get_native_channel_type);
static int wl12xx_cmd_role_start_dev(struct wl1271 *wl,
struct wl12xx_vif *wlvif,
@@ -1992,12 +1993,15 @@ int wl12xx_start_dev(struct wl1271 *wl, struct wl12xx_vif *wlvif,
wlvif->bss_type == BSS_TYPE_IBSS)))
return -EINVAL;
- ret = wl12xx_cmd_role_enable(wl,
- wl12xx_wlvif_to_vif(wlvif)->addr,
- WL1271_ROLE_DEVICE,
- &wlvif->dev_role_id);
- if (ret < 0)
- goto out;
+ /* the dev role is already started for p2p mgmt interfaces */
+ if (!wlcore_is_p2p_mgmt(wlvif)) {
+ ret = wl12xx_cmd_role_enable(wl,
+ wl12xx_wlvif_to_vif(wlvif)->addr,
+ WL1271_ROLE_DEVICE,
+ &wlvif->dev_role_id);
+ if (ret < 0)
+ goto out;
+ }
ret = wl12xx_cmd_role_start_dev(wl, wlvif, band, channel);
if (ret < 0)
@@ -2012,7 +2016,8 @@ int wl12xx_start_dev(struct wl1271 *wl, struct wl12xx_vif *wlvif,
out_stop:
wl12xx_cmd_role_stop_dev(wl, wlvif);
out_disable:
- wl12xx_cmd_role_disable(wl, &wlvif->dev_role_id);
+ if (!wlcore_is_p2p_mgmt(wlvif))
+ wl12xx_cmd_role_disable(wl, &wlvif->dev_role_id);
out:
return ret;
}
@@ -2041,9 +2046,11 @@ int wl12xx_stop_dev(struct wl1271 *wl, struct wl12xx_vif *wlvif)
if (ret < 0)
goto out;
- ret = wl12xx_cmd_role_disable(wl, &wlvif->dev_role_id);
- if (ret < 0)
- goto out;
+ if (!wlcore_is_p2p_mgmt(wlvif)) {
+ ret = wl12xx_cmd_role_disable(wl, &wlvif->dev_role_id);
+ if (ret < 0)
+ goto out;
+ }
out:
return ret;
diff --git a/drivers/net/wireless/ti/wlcore/cmd.h b/drivers/net/wireless/ti/wlcore/cmd.h
index ca6a28b03f8f..33f8683a37ad 100644
--- a/drivers/net/wireless/ti/wlcore/cmd.h
+++ b/drivers/net/wireless/ti/wlcore/cmd.h
@@ -107,6 +107,7 @@ int wl12xx_allocate_link(struct wl1271 *wl, struct wl12xx_vif *wlvif,
void wl12xx_free_link(struct wl1271 *wl, struct wl12xx_vif *wlvif, u8 *hlid);
int wlcore_cmd_wait_for_event_or_timeout(struct wl1271 *wl,
u32 mask, bool *timeout);
+u8 wlcore_get_native_channel_type(u8 nl_channel_type);
enum wl1271_commands {
CMD_INTERROGATE = 1, /* use this to read information elements */
@@ -174,6 +175,11 @@ enum wl1271_commands {
CMD_SMART_CONFIG_STOP = 62,
CMD_SMART_CONFIG_SET_GROUP_KEY = 63,
+ CMD_CAC_START = 64,
+ CMD_CAC_STOP = 65,
+ CMD_DFS_MASTER_RESTART = 66,
+ CMD_DFS_RADAR_DETECTION_DEBUG = 67,
+
MAX_COMMAND_ID = 0xFFFF,
};
diff --git a/drivers/net/wireless/ti/wlcore/debugfs.c b/drivers/net/wireless/ti/wlcore/debugfs.c
index 0be21f62fcb0..ad246267d7de 100644
--- a/drivers/net/wireless/ti/wlcore/debugfs.c
+++ b/drivers/net/wireless/ti/wlcore/debugfs.c
@@ -502,7 +502,7 @@ static ssize_t driver_state_read(struct file *file, char __user *user_buf,
DRIVER_STATE_PRINT_HEX(irq);
/* TODO: ref_clock and tcxo_clock were moved to wl12xx priv */
DRIVER_STATE_PRINT_HEX(hw_pg_ver);
- DRIVER_STATE_PRINT_HEX(platform_quirks);
+ DRIVER_STATE_PRINT_HEX(irq_flags);
DRIVER_STATE_PRINT_HEX(chip.id);
DRIVER_STATE_PRINT_STR(chip.fw_ver_str);
DRIVER_STATE_PRINT_STR(chip.phy_fw_ver_str);
diff --git a/drivers/net/wireless/ti/wlcore/hw_ops.h b/drivers/net/wireless/ti/wlcore/hw_ops.h
index aa9f82c72296..a3a5f2ffc664 100644
--- a/drivers/net/wireless/ti/wlcore/hw_ops.h
+++ b/drivers/net/wireless/ti/wlcore/hw_ops.h
@@ -287,4 +287,13 @@ wlcore_smart_config_set_group_key(struct wl1271 *wl, u16 group_id,
return wl->ops->smart_config_set_group_key(wl, group_id, key_len, key);
}
+
+static inline int
+wlcore_hw_set_cac(struct wl1271 *wl, struct wl12xx_vif *wlvif, bool start)
+{
+ if (!wl->ops->set_cac)
+ return -EINVAL;
+
+ return wl->ops->set_cac(wl, wlvif, start);
+}
#endif
diff --git a/drivers/net/wireless/ti/wlcore/init.c b/drivers/net/wireless/ti/wlcore/init.c
index 199e94120864..7c240e2508dc 100644
--- a/drivers/net/wireless/ti/wlcore/init.c
+++ b/drivers/net/wireless/ti/wlcore/init.c
@@ -348,7 +348,7 @@ static int wl12xx_init_fwlog(struct wl1271 *wl)
}
/* generic sta initialization (non vif-specific) */
-static int wl1271_sta_hw_init(struct wl1271 *wl, struct wl12xx_vif *wlvif)
+int wl1271_sta_hw_init(struct wl1271 *wl, struct wl12xx_vif *wlvif)
{
int ret;
diff --git a/drivers/net/wireless/ti/wlcore/init.h b/drivers/net/wireless/ti/wlcore/init.h
index a45fbfddec19..fd1cdb6bc3e4 100644
--- a/drivers/net/wireless/ti/wlcore/init.h
+++ b/drivers/net/wireless/ti/wlcore/init.h
@@ -35,5 +35,6 @@ int wl1271_hw_init(struct wl1271 *wl);
int wl1271_init_vif_specific(struct wl1271 *wl, struct ieee80211_vif *vif);
int wl1271_init_ap_rates(struct wl1271 *wl, struct wl12xx_vif *wlvif);
int wl1271_ap_init_templates(struct wl1271 *wl, struct ieee80211_vif *vif);
+int wl1271_sta_hw_init(struct wl1271 *wl, struct wl12xx_vif *wlvif);
#endif
diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c
index 575c8f6d4009..ca03f22ec6a0 100644
--- a/drivers/net/wireless/ti/wlcore/main.c
+++ b/drivers/net/wireless/ti/wlcore/main.c
@@ -25,8 +25,8 @@
#include <linux/firmware.h>
#include <linux/etherdevice.h>
#include <linux/vmalloc.h>
-#include <linux/wl12xx.h>
#include <linux/interrupt.h>
+#include <linux/irq.h>
#include "wlcore.h"
#include "debug.h"
@@ -525,7 +525,7 @@ static int wlcore_irq_locked(struct wl1271 *wl)
* In case edge triggered interrupt must be used, we cannot iterate
* more than once without introducing race conditions with the hardirq.
*/
- if (wl->platform_quirks & WL12XX_PLATFORM_QUIRK_EDGE_IRQ)
+ if (wl->irq_flags & (IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING))
loopcount = 1;
wl1271_debug(DEBUG_IRQ, "IRQ work");
@@ -1783,6 +1783,9 @@ static int wl1271_op_suspend(struct ieee80211_hw *hw,
mutex_lock(&wl->mutex);
wl->wow_enabled = true;
wl12xx_for_each_wlvif(wl, wlvif) {
+ if (wlcore_is_p2p_mgmt(wlvif))
+ continue;
+
ret = wl1271_configure_suspend(wl, wlvif, wow);
if (ret < 0) {
mutex_unlock(&wl->mutex);
@@ -1868,6 +1871,9 @@ static int wl1271_op_resume(struct ieee80211_hw *hw)
}
wl12xx_for_each_wlvif(wl, wlvif) {
+ if (wlcore_is_p2p_mgmt(wlvif))
+ continue;
+
wl1271_configure_resume(wl, wlvif);
}
@@ -2211,6 +2217,7 @@ static int wl12xx_init_vif_data(struct wl1271 *wl, struct ieee80211_vif *vif)
wlvif->p2p = 1;
/* fall-through */
case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_P2P_DEVICE:
wlvif->bss_type = BSS_TYPE_STA_BSS;
break;
case NL80211_IFTYPE_ADHOC:
@@ -2431,7 +2438,8 @@ static void wlcore_hw_queue_iter(void *data, u8 *mac,
{
struct wlcore_hw_queue_iter_data *iter_data = data;
- if (WARN_ON_ONCE(vif->hw_queue[0] == IEEE80211_INVAL_HW_QUEUE))
+ if (vif->type == NL80211_IFTYPE_P2P_DEVICE ||
+ WARN_ON_ONCE(vif->hw_queue[0] == IEEE80211_INVAL_HW_QUEUE))
return;
if (iter_data->cur_running || vif == iter_data->vif) {
@@ -2449,6 +2457,11 @@ static int wlcore_allocate_hw_queue_base(struct wl1271 *wl,
struct wlcore_hw_queue_iter_data iter_data = {};
int i, q_base;
+ if (vif->type == NL80211_IFTYPE_P2P_DEVICE) {
+ vif->cab_queue = IEEE80211_INVAL_HW_QUEUE;
+ return 0;
+ }
+
iter_data.vif = vif;
/* mark all bits taken by active interfaces */
@@ -2571,14 +2584,27 @@ static int wl1271_op_add_interface(struct ieee80211_hw *hw,
goto out;
}
- ret = wl12xx_cmd_role_enable(wl, vif->addr,
- role_type, &wlvif->role_id);
- if (ret < 0)
- goto out;
+ if (!wlcore_is_p2p_mgmt(wlvif)) {
+ ret = wl12xx_cmd_role_enable(wl, vif->addr,
+ role_type, &wlvif->role_id);
+ if (ret < 0)
+ goto out;
- ret = wl1271_init_vif_specific(wl, vif);
- if (ret < 0)
- goto out;
+ ret = wl1271_init_vif_specific(wl, vif);
+ if (ret < 0)
+ goto out;
+
+ } else {
+ ret = wl12xx_cmd_role_enable(wl, vif->addr, WL1271_ROLE_DEVICE,
+ &wlvif->dev_role_id);
+ if (ret < 0)
+ goto out;
+
+ /* needed mainly for configuring rate policies */
+ ret = wl1271_sta_hw_init(wl, wlvif);
+ if (ret < 0)
+ goto out;
+ }
list_add(&wlvif->list, &wl->wlvif_list);
set_bit(WLVIF_FLAG_INITIALIZED, &wlvif->flags);
@@ -2649,9 +2675,15 @@ static void __wl1271_op_remove_interface(struct wl1271 *wl,
wl12xx_stop_dev(wl, wlvif);
}
- ret = wl12xx_cmd_role_disable(wl, &wlvif->role_id);
- if (ret < 0)
- goto deinit;
+ if (!wlcore_is_p2p_mgmt(wlvif)) {
+ ret = wl12xx_cmd_role_disable(wl, &wlvif->role_id);
+ if (ret < 0)
+ goto deinit;
+ } else {
+ ret = wl12xx_cmd_role_disable(wl, &wlvif->dev_role_id);
+ if (ret < 0)
+ goto deinit;
+ }
wl1271_ps_elp_sleep(wl);
}
@@ -3040,6 +3072,9 @@ static int wl12xx_config_vif(struct wl1271 *wl, struct wl12xx_vif *wlvif,
{
int ret;
+ if (wlcore_is_p2p_mgmt(wlvif))
+ return 0;
+
if (conf->power_level != wlvif->power_level) {
ret = wl1271_acx_tx_power(wl, wlvif, conf->power_level);
if (ret < 0)
@@ -3160,6 +3195,9 @@ static void wl1271_op_configure_filter(struct ieee80211_hw *hw,
goto out;
wl12xx_for_each_wlvif(wl, wlvif) {
+ if (wlcore_is_p2p_mgmt(wlvif))
+ continue;
+
if (wlvif->bss_type != BSS_TYPE_AP_BSS) {
if (*total & FIF_ALLMULTI)
ret = wl1271_acx_group_address_tbl(wl, wlvif,
@@ -4574,10 +4612,46 @@ static void wlcore_op_change_chanctx(struct ieee80211_hw *hw,
struct ieee80211_chanctx_conf *ctx,
u32 changed)
{
+ struct wl1271 *wl = hw->priv;
+ struct wl12xx_vif *wlvif;
+ int ret;
+ int channel = ieee80211_frequency_to_channel(
+ ctx->def.chan->center_freq);
+
wl1271_debug(DEBUG_MAC80211,
"mac80211 change chanctx %d (type %d) changed 0x%x",
- ieee80211_frequency_to_channel(ctx->def.chan->center_freq),
- cfg80211_get_chandef_type(&ctx->def), changed);
+ channel, cfg80211_get_chandef_type(&ctx->def), changed);
+
+ mutex_lock(&wl->mutex);
+
+ ret = wl1271_ps_elp_wakeup(wl);
+ if (ret < 0)
+ goto out;
+
+ wl12xx_for_each_wlvif(wl, wlvif) {
+ struct ieee80211_vif *vif = wl12xx_wlvif_to_vif(wlvif);
+
+ rcu_read_lock();
+ if (rcu_access_pointer(vif->chanctx_conf) != ctx) {
+ rcu_read_unlock();
+ continue;
+ }
+ rcu_read_unlock();
+
+ /* start radar if needed */
+ if (changed & IEEE80211_CHANCTX_CHANGE_RADAR &&
+ wlvif->bss_type == BSS_TYPE_AP_BSS &&
+ ctx->radar_enabled && !wlvif->radar_enabled &&
+ ctx->def.chan->dfs_state == NL80211_DFS_USABLE) {
+ wl1271_debug(DEBUG_MAC80211, "Start radar detection");
+ wlcore_hw_set_cac(wl, wlvif, true);
+ wlvif->radar_enabled = true;
+ }
+ }
+
+ wl1271_ps_elp_sleep(wl);
+out:
+ mutex_unlock(&wl->mutex);
}
static int wlcore_op_assign_vif_chanctx(struct ieee80211_hw *hw,
@@ -4588,13 +4662,26 @@ static int wlcore_op_assign_vif_chanctx(struct ieee80211_hw *hw,
struct wl12xx_vif *wlvif = wl12xx_vif_to_data(vif);
int channel = ieee80211_frequency_to_channel(
ctx->def.chan->center_freq);
+ int ret = -EINVAL;
wl1271_debug(DEBUG_MAC80211,
- "mac80211 assign chanctx (role %d) %d (type %d)",
- wlvif->role_id, channel, cfg80211_get_chandef_type(&ctx->def));
+ "mac80211 assign chanctx (role %d) %d (type %d) (radar %d dfs_state %d)",
+ wlvif->role_id, channel,
+ cfg80211_get_chandef_type(&ctx->def),
+ ctx->radar_enabled, ctx->def.chan->dfs_state);
mutex_lock(&wl->mutex);
+ if (unlikely(wl->state != WLCORE_STATE_ON))
+ goto out;
+
+ if (unlikely(!test_bit(WLVIF_FLAG_INITIALIZED, &wlvif->flags)))
+ goto out;
+
+ ret = wl1271_ps_elp_wakeup(wl);
+ if (ret < 0)
+ goto out;
+
wlvif->band = ctx->def.chan->band;
wlvif->channel = channel;
wlvif->channel_type = cfg80211_get_chandef_type(&ctx->def);
@@ -4602,6 +4689,15 @@ static int wlcore_op_assign_vif_chanctx(struct ieee80211_hw *hw,
/* update default rates according to the band */
wl1271_set_band_rate(wl, wlvif);
+ if (ctx->radar_enabled &&
+ ctx->def.chan->dfs_state == NL80211_DFS_USABLE) {
+ wl1271_debug(DEBUG_MAC80211, "Start radar detection");
+ wlcore_hw_set_cac(wl, wlvif, true);
+ wlvif->radar_enabled = true;
+ }
+
+ wl1271_ps_elp_sleep(wl);
+out:
mutex_unlock(&wl->mutex);
return 0;
@@ -4613,6 +4709,7 @@ static void wlcore_op_unassign_vif_chanctx(struct ieee80211_hw *hw,
{
struct wl1271 *wl = hw->priv;
struct wl12xx_vif *wlvif = wl12xx_vif_to_data(vif);
+ int ret;
wl1271_debug(DEBUG_MAC80211,
"mac80211 unassign chanctx (role %d) %d (type %d)",
@@ -4621,6 +4718,97 @@ static void wlcore_op_unassign_vif_chanctx(struct ieee80211_hw *hw,
cfg80211_get_chandef_type(&ctx->def));
wl1271_tx_flush(wl);
+
+ mutex_lock(&wl->mutex);
+
+ if (unlikely(wl->state != WLCORE_STATE_ON))
+ goto out;
+
+ if (unlikely(!test_bit(WLVIF_FLAG_INITIALIZED, &wlvif->flags)))
+ goto out;
+
+ ret = wl1271_ps_elp_wakeup(wl);
+ if (ret < 0)
+ goto out;
+
+ if (wlvif->radar_enabled) {
+ wl1271_debug(DEBUG_MAC80211, "Stop radar detection");
+ wlcore_hw_set_cac(wl, wlvif, false);
+ wlvif->radar_enabled = false;
+ }
+
+ wl1271_ps_elp_sleep(wl);
+out:
+ mutex_unlock(&wl->mutex);
+}
+
+static int __wlcore_switch_vif_chan(struct wl1271 *wl,
+ struct wl12xx_vif *wlvif,
+ struct ieee80211_chanctx_conf *new_ctx)
+{
+ int channel = ieee80211_frequency_to_channel(
+ new_ctx->def.chan->center_freq);
+
+ wl1271_debug(DEBUG_MAC80211,
+ "switch vif (role %d) %d -> %d chan_type: %d",
+ wlvif->role_id, wlvif->channel, channel,
+ cfg80211_get_chandef_type(&new_ctx->def));
+
+ if (WARN_ON_ONCE(wlvif->bss_type != BSS_TYPE_AP_BSS))
+ return 0;
+
+ if (wlvif->radar_enabled) {
+ wl1271_debug(DEBUG_MAC80211, "Stop radar detection");
+ wlcore_hw_set_cac(wl, wlvif, false);
+ wlvif->radar_enabled = false;
+ }
+
+ wlvif->band = new_ctx->def.chan->band;
+ wlvif->channel = channel;
+ wlvif->channel_type = cfg80211_get_chandef_type(&new_ctx->def);
+
+ /* start radar if needed */
+ if (new_ctx->radar_enabled) {
+ wl1271_debug(DEBUG_MAC80211, "Start radar detection");
+ wlcore_hw_set_cac(wl, wlvif, true);
+ wlvif->radar_enabled = true;
+ }
+
+ return 0;
+}
+
+static int
+wlcore_op_switch_vif_chanctx(struct ieee80211_hw *hw,
+ struct ieee80211_vif_chanctx_switch *vifs,
+ int n_vifs,
+ enum ieee80211_chanctx_switch_mode mode)
+{
+ struct wl1271 *wl = hw->priv;
+ int i, ret;
+
+ wl1271_debug(DEBUG_MAC80211,
+ "mac80211 switch chanctx n_vifs %d mode %d",
+ n_vifs, mode);
+
+ mutex_lock(&wl->mutex);
+
+ ret = wl1271_ps_elp_wakeup(wl);
+ if (ret < 0)
+ goto out;
+
+ for (i = 0; i < n_vifs; i++) {
+ struct wl12xx_vif *wlvif = wl12xx_vif_to_data(vifs[i].vif);
+
+ ret = __wlcore_switch_vif_chan(wl, wlvif, vifs[i].new_ctx);
+ if (ret)
+ goto out_sleep;
+ }
+out_sleep:
+ wl1271_ps_elp_sleep(wl);
+out:
+ mutex_unlock(&wl->mutex);
+
+ return 0;
}
static int wl1271_op_conf_tx(struct ieee80211_hw *hw,
@@ -4632,6 +4820,9 @@ static int wl1271_op_conf_tx(struct ieee80211_hw *hw,
u8 ps_scheme;
int ret = 0;
+ if (wlcore_is_p2p_mgmt(wlvif))
+ return 0;
+
mutex_lock(&wl->mutex);
wl1271_debug(DEBUG_MAC80211, "mac80211 conf tx %d", queue);
@@ -5611,6 +5802,7 @@ static const struct ieee80211_ops wl1271_ops = {
.change_chanctx = wlcore_op_change_chanctx,
.assign_vif_chanctx = wlcore_op_assign_vif_chanctx,
.unassign_vif_chanctx = wlcore_op_unassign_vif_chanctx,
+ .switch_vif_chanctx = wlcore_op_switch_vif_chanctx,
.sta_rc_update = wlcore_op_sta_rc_update,
.get_rssi = wlcore_op_get_rssi,
CFG80211_TESTMODE_CMD(wl1271_tm_cmd)
@@ -5798,8 +5990,10 @@ static int wl1271_init_ieee80211(struct wl1271 *wl)
wl->hw->wiphy->n_cipher_suites = ARRAY_SIZE(cipher_suites);
wl->hw->wiphy->interface_modes = BIT(NL80211_IFTYPE_STATION) |
- BIT(NL80211_IFTYPE_ADHOC) | BIT(NL80211_IFTYPE_AP) |
- BIT(NL80211_IFTYPE_P2P_CLIENT) | BIT(NL80211_IFTYPE_P2P_GO);
+ BIT(NL80211_IFTYPE_AP) |
+ BIT(NL80211_IFTYPE_P2P_DEVICE) |
+ BIT(NL80211_IFTYPE_P2P_CLIENT) |
+ BIT(NL80211_IFTYPE_P2P_GO);
wl->hw->wiphy->max_scan_ssids = 1;
wl->hw->wiphy->max_sched_scan_ssids = 16;
wl->hw->wiphy->max_match_sets = 16;
@@ -5963,7 +6157,6 @@ struct ieee80211_hw *wlcore_alloc_hw(size_t priv_size, u32 aggr_buf_size,
wl->ap_ps_map = 0;
wl->ap_fw_ps_map = 0;
wl->quirks = 0;
- wl->platform_quirks = 0;
wl->system_hlid = WL12XX_SYSTEM_HLID;
wl->active_sta_count = 0;
wl->active_link_count = 0;
@@ -6104,8 +6297,8 @@ static void wlcore_nvs_cb(const struct firmware *fw, void *context)
struct wl1271 *wl = context;
struct platform_device *pdev = wl->pdev;
struct wlcore_platdev_data *pdev_data = dev_get_platdata(&pdev->dev);
- struct wl12xx_platform_data *pdata = pdev_data->pdata;
- unsigned long irqflags;
+ struct resource *res;
+
int ret;
irq_handler_t hardirq_fn = NULL;
@@ -6132,19 +6325,23 @@ static void wlcore_nvs_cb(const struct firmware *fw, void *context)
/* adjust some runtime configuration parameters */
wlcore_adjust_conf(wl);
- wl->irq = platform_get_irq(pdev, 0);
- wl->platform_quirks = pdata->platform_quirks;
+ res = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
+ if (!res) {
+ wl1271_error("Could not get IRQ resource");
+ goto out_free_nvs;
+ }
+
+ wl->irq = res->start;
+ wl->irq_flags = res->flags & IRQF_TRIGGER_MASK;
wl->if_ops = pdev_data->if_ops;
- if (wl->platform_quirks & WL12XX_PLATFORM_QUIRK_EDGE_IRQ) {
- irqflags = IRQF_TRIGGER_RISING;
+ if (wl->irq_flags & (IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING))
hardirq_fn = wlcore_hardirq;
- } else {
- irqflags = IRQF_TRIGGER_HIGH | IRQF_ONESHOT;
- }
+ else
+ wl->irq_flags |= IRQF_ONESHOT;
ret = request_threaded_irq(wl->irq, hardirq_fn, wlcore_irq,
- irqflags, pdev->name, wl);
+ wl->irq_flags, pdev->name, wl);
if (ret < 0) {
wl1271_error("request_irq() failed: %d", ret);
goto out_free_nvs;
@@ -6155,7 +6352,7 @@ static void wlcore_nvs_cb(const struct firmware *fw, void *context)
if (!ret) {
wl->irq_wake_enabled = true;
device_init_wakeup(wl->dev, 1);
- if (pdata->pwr_in_suspend)
+ if (pdev_data->pwr_in_suspend)
wl->hw->wiphy->wowlan = &wlcore_wowlan_support;
}
#endif
diff --git a/drivers/net/wireless/ti/wlcore/sdio.c b/drivers/net/wireless/ti/wlcore/sdio.c
index d3dd7bfdf3f1..c172da56b550 100644
--- a/drivers/net/wireless/ti/wlcore/sdio.c
+++ b/drivers/net/wireless/ti/wlcore/sdio.c
@@ -31,9 +31,10 @@
#include <linux/mmc/card.h>
#include <linux/mmc/host.h>
#include <linux/gpio.h>
-#include <linux/wl12xx.h>
#include <linux/pm_runtime.h>
#include <linux/printk.h>
+#include <linux/of.h>
+#include <linux/of_irq.h>
#include "wlcore.h"
#include "wl12xx_80211.h"
@@ -214,6 +215,52 @@ static struct wl1271_if_operations sdio_ops = {
.set_block_size = wl1271_sdio_set_block_size,
};
+#ifdef CONFIG_OF
+static const struct of_device_id wlcore_sdio_of_match_table[] = {
+ { .compatible = "ti,wl1271" },
+ { .compatible = "ti,wl1273" },
+ { .compatible = "ti,wl1281" },
+ { .compatible = "ti,wl1283" },
+ { .compatible = "ti,wl1801" },
+ { .compatible = "ti,wl1805" },
+ { .compatible = "ti,wl1807" },
+ { .compatible = "ti,wl1831" },
+ { .compatible = "ti,wl1835" },
+ { .compatible = "ti,wl1837" },
+ { }
+};
+
+static int wlcore_probe_of(struct device *dev, int *irq,
+ struct wlcore_platdev_data *pdev_data)
+{
+ struct device_node *np = dev->of_node;
+
+ if (!np || !of_match_node(wlcore_sdio_of_match_table, np))
+ return -ENODATA;
+
+ *irq = irq_of_parse_and_map(np, 0);
+ if (!*irq) {
+ dev_err(dev, "No irq in platform data\n");
+ kfree(pdev_data);
+ return -EINVAL;
+ }
+
+ /* optional clock frequency params */
+ of_property_read_u32(np, "ref-clock-frequency",
+ &pdev_data->ref_clock_freq);
+ of_property_read_u32(np, "tcxo-clock-frequency",
+ &pdev_data->tcxo_clock_freq);
+
+ return 0;
+}
+#else
+static int wlcore_probe_of(struct device *dev, int *irq,
+ struct wlcore_platdev_data *pdev_data)
+{
+ return -ENODATA;
+}
+#endif
+
static int wl1271_probe(struct sdio_func *func,
const struct sdio_device_id *id)
{
@@ -222,6 +269,7 @@ static int wl1271_probe(struct sdio_func *func,
struct resource res[1];
mmc_pm_flag_t mmcflags;
int ret = -ENOMEM;
+ int irq;
const char *chip_family;
/* We are only able to handle the wlan function */
@@ -245,19 +293,16 @@ static int wl1271_probe(struct sdio_func *func,
/* Use block mode for transferring over one block size of data */
func->card->quirks |= MMC_QUIRK_BLKSZ_FOR_BYTE_MODE;
- pdev_data.pdata = wl12xx_get_platform_data();
- if (IS_ERR(pdev_data.pdata)) {
- ret = PTR_ERR(pdev_data.pdata);
- dev_err(glue->dev, "missing wlan platform data: %d\n", ret);
+ ret = wlcore_probe_of(&func->dev, &irq, &pdev_data);
+ if (ret)
goto out_free_glue;
- }
/* if sdio can keep power while host is suspended, enable wow */
mmcflags = sdio_get_host_pm_caps(func);
dev_dbg(glue->dev, "sdio PM caps = 0x%x\n", mmcflags);
if (mmcflags & MMC_PM_KEEP_POWER)
- pdev_data.pdata->pwr_in_suspend = true;
+ pdev_data.pwr_in_suspend = true;
sdio_set_drvdata(func, glue);
@@ -286,8 +331,9 @@ static int wl1271_probe(struct sdio_func *func,
memset(res, 0x00, sizeof(res));
- res[0].start = pdev_data.pdata->irq;
- res[0].flags = IORESOURCE_IRQ;
+ res[0].start = irq;
+ res[0].flags = IORESOURCE_IRQ |
+ irqd_get_trigger_type(irq_get_irq_data(irq));
res[0].name = "irq";
ret = platform_device_add_resources(glue->core, res, ARRAY_SIZE(res));
diff --git a/drivers/net/wireless/ti/wlcore/spi.c b/drivers/net/wireless/ti/wlcore/spi.c
index 2073305e0a58..720e4e4b5a3c 100644
--- a/drivers/net/wireless/ti/wlcore/spi.c
+++ b/drivers/net/wireless/ti/wlcore/spi.c
@@ -335,11 +335,7 @@ static int wl1271_probe(struct spi_device *spi)
memset(&pdev_data, 0x00, sizeof(pdev_data));
- pdev_data.pdata = dev_get_platdata(&spi->dev);
- if (!pdev_data.pdata) {
- dev_err(&spi->dev, "no platform data\n");
- return -ENODEV;
- }
+ /* TODO: add DT parsing when needed */
pdev_data.if_ops = &spi_ops;
diff --git a/drivers/net/wireless/ti/wlcore/wlcore.h b/drivers/net/wireless/ti/wlcore/wlcore.h
index df78cf12ef15..52f64ebb3fa1 100644
--- a/drivers/net/wireless/ti/wlcore/wlcore.h
+++ b/drivers/net/wireless/ti/wlcore/wlcore.h
@@ -121,6 +121,8 @@ struct wlcore_ops {
int (*smart_config_stop)(struct wl1271 *wl);
int (*smart_config_set_group_key)(struct wl1271 *wl, u16 group_id,
u8 key_len, u8 *key);
+ int (*set_cac)(struct wl1271 *wl, struct wl12xx_vif *wlvif,
+ bool start);
};
enum wlcore_partitions {
@@ -192,6 +194,8 @@ struct wl1271 {
int irq;
+ int irq_flags;
+
spinlock_t wl_lock;
enum wlcore_state state;
@@ -399,9 +403,6 @@ struct wl1271 {
/* Quirks of specific hardware revisions */
unsigned int quirks;
- /* Platform limitations */
- unsigned int platform_quirks;
-
/* number of currently active RX BA sessions */
int ba_rx_session_count;
diff --git a/drivers/net/wireless/ti/wlcore/wlcore_i.h b/drivers/net/wireless/ti/wlcore/wlcore_i.h
index 0e52556044d9..68a06eae481b 100644
--- a/drivers/net/wireless/ti/wlcore/wlcore_i.h
+++ b/drivers/net/wireless/ti/wlcore/wlcore_i.h
@@ -201,8 +201,12 @@ struct wl1271_if_operations {
};
struct wlcore_platdev_data {
- struct wl12xx_platform_data *pdata;
struct wl1271_if_operations *if_ops;
+
+ bool ref_clock_xtal; /* specify whether the clock is XTAL or not */
+ u32 ref_clock_freq; /* in Hertz */
+ u32 tcxo_clock_freq; /* in Hertz, tcxo is always XTAL */
+ bool pwr_in_suspend;
};
#define MAX_NUM_KEYS 14
@@ -434,6 +438,8 @@ struct wl12xx_vif {
bool wmm_enabled;
+ bool radar_enabled;
+
/* Rx Streaming */
struct work_struct rx_streaming_enable_work;
struct work_struct rx_streaming_disable_work;
@@ -492,6 +498,11 @@ struct ieee80211_vif *wl12xx_wlvif_to_vif(struct wl12xx_vif *wlvif)
return container_of((void *)wlvif, struct ieee80211_vif, drv_priv);
}
+static inline bool wlcore_is_p2p_mgmt(struct wl12xx_vif *wlvif)
+{
+ return wl12xx_wlvif_to_vif(wlvif)->type == NL80211_IFTYPE_P2P_DEVICE;
+}
+
#define wl12xx_for_each_wlvif(wl, wlvif) \
list_for_each_entry(wlvif, &wl->wlvif_list, list)
diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
index e6665ee1846f..eeaf9f270774 100644
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -905,36 +905,66 @@ int __init early_init_dt_scan_memory(unsigned long node, const char *uname,
return 0;
}
+/*
+ * Convert configs to something easy to use in C code
+ */
+#if defined(CONFIG_CMDLINE_FORCE)
+static const int overwrite_incoming_cmdline = 1;
+static const int read_dt_cmdline;
+static const int concat_cmdline;
+#elif defined(CONFIG_CMDLINE_EXTEND)
+static const int overwrite_incoming_cmdline;
+static const int read_dt_cmdline = 1;
+static const int concat_cmdline = 1;
+#else /* CMDLINE_FROM_BOOTLOADER */
+static const int overwrite_incoming_cmdline;
+static const int read_dt_cmdline = 1;
+static const int concat_cmdline;
+#endif
+
+#ifdef CONFIG_CMDLINE
+static const char *config_cmdline = CONFIG_CMDLINE;
+#else
+static const char *config_cmdline = "";
+#endif
+
int __init early_init_dt_scan_chosen(unsigned long node, const char *uname,
int depth, void *data)
{
- int l;
- const char *p;
+ int l = 0;
+ const char *p = NULL;
+ char *cmdline = data;
pr_debug("search \"chosen\", depth: %d, uname: %s\n", depth, uname);
- if (depth != 1 || !data ||
+ if (depth != 1 || !cmdline ||
(strcmp(uname, "chosen") != 0 && strcmp(uname, "chosen@0") != 0))
return 0;
early_init_dt_check_for_initrd(node);
- /* Retrieve command line */
- p = of_get_flat_dt_prop(node, "bootargs", &l);
- if (p != NULL && l > 0)
- strlcpy(data, p, min((int)l, COMMAND_LINE_SIZE));
-
- /*
- * CONFIG_CMDLINE is meant to be a default in case nothing else
- * managed to set the command line, unless CONFIG_CMDLINE_FORCE
- * is set in which case we override whatever was found earlier.
- */
-#ifdef CONFIG_CMDLINE
-#ifndef CONFIG_CMDLINE_FORCE
- if (!((char *)data)[0])
-#endif
- strlcpy(data, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
-#endif /* CONFIG_CMDLINE */
+ /* Put CONFIG_CMDLINE in if forced or if data had nothing in it to start */
+ if (overwrite_incoming_cmdline || !cmdline[0])
+ strlcpy(cmdline, config_cmdline, COMMAND_LINE_SIZE);
+
+ /* Retrieve command line unless forcing */
+ if (read_dt_cmdline)
+ p = of_get_flat_dt_prop(node, "bootargs", &l);
+
+ if (p != NULL && l > 0) {
+ if (concat_cmdline) {
+ int cmdline_len;
+ int copy_len;
+ strlcat(cmdline, " ", COMMAND_LINE_SIZE);
+ cmdline_len = strlen(cmdline);
+ copy_len = COMMAND_LINE_SIZE - cmdline_len - 1;
+ copy_len = min((int)l, copy_len);
+ strncpy(cmdline + cmdline_len, p, copy_len);
+ cmdline[cmdline_len + copy_len] = '\0';
+ } else {
+ strlcpy(cmdline, p, min((int)l, COMMAND_LINE_SIZE));
+ }
+ }
pr_debug("Command line is: %s\n", (char*)data);
diff --git a/drivers/pci/host/pcie-xilinx.c b/drivers/pci/host/pcie-xilinx.c
index c177977ef957..cd5c0d38b306 100644
--- a/drivers/pci/host/pcie-xilinx.c
+++ b/drivers/pci/host/pcie-xilinx.c
@@ -302,7 +302,7 @@ static void xilinx_pcie_destroy_msi(unsigned int irq)
if (!test_bit(irq, msi_irq_in_use)) {
msi = irq_get_msi_desc(irq);
- port = sys_to_pcie(msi_desc_to_pci_sys_data(msi));
+ port = sys_to_pcie(msi_desc_to_pci_sysdata(msi));
dev_err(port->dev, "Trying to free unused MSI#%d\n", irq);
} else {
clear_bit(irq, msi_irq_in_use);
diff --git a/drivers/platform/Kconfig b/drivers/platform/Kconfig
index 09fde58b12e0..bf45a623a5d2 100644
--- a/drivers/platform/Kconfig
+++ b/drivers/platform/Kconfig
@@ -1,8 +1,6 @@
if X86
source "drivers/platform/x86/Kconfig"
endif
-if GOLDFISH
source "drivers/platform/goldfish/Kconfig"
-endif
source "drivers/platform/chrome/Kconfig"
diff --git a/drivers/platform/goldfish/Kconfig b/drivers/platform/goldfish/Kconfig
index 635ef25cc722..f4f00f75d267 100644
--- a/drivers/platform/goldfish/Kconfig
+++ b/drivers/platform/goldfish/Kconfig
@@ -1,5 +1,23 @@
+menuconfig GOLDFISH
+ bool "Platform support for Goldfish virtual devices"
+ depends on X86_32 || X86_64 || ARM || ARM64 || MIPS
+ ---help---
+ Say Y here to get to see options for the Goldfish virtual platform.
+ This option alone does not add any kernel code.
+
+ Unless you are building for the Android Goldfish emulator say N here.
+
+if GOLDFISH
+
+config GOLDFISH_BUS
+ tristate "Goldfish platform bus"
+ ---help---
+ This is a virtual bus to host Goldfish Android Virtual Devices.
+
config GOLDFISH_PIPE
tristate "Goldfish virtual device for QEMU pipes"
---help---
This is a virtual device to drive the QEMU pipe interface used by
the Goldfish Android Virtual Device.
+
+endif # GOLDFISH
diff --git a/drivers/platform/goldfish/Makefile b/drivers/platform/goldfish/Makefile
index a0022395eee9..d3487125838c 100644
--- a/drivers/platform/goldfish/Makefile
+++ b/drivers/platform/goldfish/Makefile
@@ -1,5 +1,5 @@
#
# Makefile for Goldfish platform specific drivers
#
-obj-$(CONFIG_GOLDFISH) += pdev_bus.o
+obj-$(CONFIG_GOLDFISH_BUS) += pdev_bus.o
obj-$(CONFIG_GOLDFISH_PIPE) += goldfish_pipe.o
diff --git a/drivers/platform/goldfish/goldfish_pipe.c b/drivers/platform/goldfish/goldfish_pipe.c
index d9a09d9637d9..9ceac07245a1 100644
--- a/drivers/platform/goldfish/goldfish_pipe.c
+++ b/drivers/platform/goldfish/goldfish_pipe.c
@@ -2,6 +2,7 @@
* Copyright (C) 2011 Google, Inc.
* Copyright (C) 2012 Intel, Inc.
* Copyright (C) 2013 Intel, Inc.
+ * Copyright (C) 2014 Linaro Limited
*
* This software is licensed under the terms of the GNU General Public
* License version 2, as published by the Free Software Foundation, and
@@ -57,6 +58,8 @@
#include <linux/slab.h>
#include <linux/io.h>
#include <linux/goldfish.h>
+#include <linux/mm.h>
+#include <linux/acpi.h>
/*
* IMPORTANT: The following constants must match the ones used and defined
@@ -75,6 +78,7 @@
#define PIPE_REG_PARAMS_ADDR_LOW 0x18 /* read/write: batch data address */
#define PIPE_REG_PARAMS_ADDR_HIGH 0x1c /* read/write: batch data address */
#define PIPE_REG_ACCESS_PARAMS 0x20 /* write: batch access */
+#define PIPE_REG_VERSION 0x24 /* read: device version */
/* list of commands for PIPE_REG_COMMAND */
#define CMD_OPEN 1 /* open new channel */
@@ -90,12 +94,6 @@
#define CMD_WRITE_BUFFER 4 /* send a user buffer to the emulator */
#define CMD_WAKE_ON_WRITE 5 /* tell the emulator to wake us when writing
is possible */
-
-/* The following commands are related to read operations, they must be
- * listed in the same order than the corresponding write ones, since we
- * will use (CMD_READ_BUFFER - CMD_WRITE_BUFFER) as a special offset
- * in goldfish_pipe_read_write() below.
- */
#define CMD_READ_BUFFER 6 /* receive a user buffer from the emulator */
#define CMD_WAKE_ON_READ 7 /* tell the emulator to wake us when reading
* is possible */
@@ -130,6 +128,7 @@ struct goldfish_pipe_dev {
unsigned char __iomem *base;
struct access_params *aps;
int irq;
+ u32 version;
};
static struct goldfish_pipe_dev pipe_dev[1];
@@ -263,19 +262,14 @@ static int access_with_param(struct goldfish_pipe_dev *dev, const int cmd,
return 0;
}
-/* This function is used for both reading from and writing to a given
- * pipe.
- */
static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer,
size_t bufflen, int is_write)
{
unsigned long irq_flags;
struct goldfish_pipe *pipe = filp->private_data;
struct goldfish_pipe_dev *dev = pipe->dev;
- const int cmd_offset = is_write ? 0
- : (CMD_READ_BUFFER - CMD_WRITE_BUFFER);
unsigned long address, address_end;
- int ret = 0;
+ int count = 0, ret = -EINVAL;
/* If the emulator already closed the pipe, no need to go further */
if (test_bit(BIT_CLOSED_ON_HOST, &pipe->flags))
@@ -303,74 +297,100 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer,
: address_end;
unsigned long avail = next - address;
int status, wakeBit;
-
- /* Ensure that the corresponding page is properly mapped */
- /* FIXME: this isn't safe or sufficient - use get_user_pages */
- if (is_write) {
- char c;
- /* Ensure that the page is mapped and readable */
- if (__get_user(c, (char __user *)address)) {
- if (!ret)
- ret = -EFAULT;
- break;
- }
+ struct page *page;
+
+ /* Either vaddr or paddr depending on the device version */
+ unsigned long xaddr;
+
+ /*
+ * We grab the pages on a page-by-page basis in case user
+ * space gives us a potentially huge buffer but the read only
+ * returns a small amount, then there's no need to pin that
+ * much memory to the process.
+ */
+ down_read(&current->mm->mmap_sem);
+ ret = get_user_pages(current, current->mm, address, 1,
+ !is_write, 0, &page, NULL);
+ up_read(&current->mm->mmap_sem);
+ if (ret < 0)
+ return ret;
+
+ if (dev->version) {
+ /* Device version 1 or newer (qemu-android) expects the
+ * physical address. */
+ xaddr = page_to_phys(page) | (address & ~PAGE_MASK);
} else {
- /* Ensure that the page is mapped and writable */
- if (__put_user(0, (char __user *)address)) {
- if (!ret)
- ret = -EFAULT;
- break;
- }
+ /* Device version 0 (classic emulator) expects the
+ * virtual address. */
+ xaddr = address;
}
/* Now, try to transfer the bytes in the current page */
spin_lock_irqsave(&dev->lock, irq_flags);
- if (access_with_param(dev, CMD_WRITE_BUFFER + cmd_offset,
- address, avail, pipe, &status)) {
+ if (access_with_param(dev,
+ is_write ? CMD_WRITE_BUFFER : CMD_READ_BUFFER,
+ xaddr, avail, pipe, &status)) {
gf_write64((u64)(unsigned long)pipe,
dev->base + PIPE_REG_CHANNEL,
dev->base + PIPE_REG_CHANNEL_HIGH);
writel(avail, dev->base + PIPE_REG_SIZE);
- gf_write64(address, dev->base + PIPE_REG_ADDRESS,
+ gf_write64(xaddr, dev->base + PIPE_REG_ADDRESS,
dev->base + PIPE_REG_ADDRESS_HIGH);
- writel(CMD_WRITE_BUFFER + cmd_offset,
- dev->base + PIPE_REG_COMMAND);
+ writel(is_write ? CMD_WRITE_BUFFER : CMD_READ_BUFFER,
+ dev->base + PIPE_REG_COMMAND);
status = readl(dev->base + PIPE_REG_STATUS);
}
spin_unlock_irqrestore(&dev->lock, irq_flags);
+ if (status > 0 && !is_write)
+ set_page_dirty(page);
+ put_page(page);
+
if (status > 0) { /* Correct transfer */
- ret += status;
+ count += status;
address += status;
continue;
- }
-
- if (status == 0) /* EOF */
+ } else if (status == 0) { /* EOF */
+ ret = 0;
break;
-
- /* An error occured. If we already transfered stuff, just
- * return with its count. We expect the next call to return
- * an error code */
- if (ret > 0)
+ } else if (status < 0 && count > 0) {
+ /*
+ * An error occured and we already transfered
+ * something on one of the previous pages.
+ * Just return what we already copied and log this
+ * err.
+ *
+ * Note: This seems like an incorrect approach but
+ * cannot change it until we check if any user space
+ * ABI relies on this behavior.
+ */
+ if (status != PIPE_ERROR_AGAIN)
+ pr_info_ratelimited("goldfish_pipe: backend returned error %d on %s\n",
+ status, is_write ? "write" : "read");
+ ret = 0;
break;
+ }
- /* If the error is not PIPE_ERROR_AGAIN, or if we are not in
- * non-blocking mode, just return the error code.
- */
+ /*
+ * If the error is not PIPE_ERROR_AGAIN, or if we are not in
+ * non-blocking mode, just return the error code.
+ */
if (status != PIPE_ERROR_AGAIN ||
(filp->f_flags & O_NONBLOCK) != 0) {
ret = goldfish_pipe_error_convert(status);
break;
}
- /* We will have to wait until more data/space is available.
- * First, mark the pipe as waiting for a specific wake signal.
- */
+ /*
+ * The backend blocked the read/write, wait until the backend
+ * tells us it's ready to process more data.
+ */
wakeBit = is_write ? BIT_WAKE_ON_WRITE : BIT_WAKE_ON_READ;
set_bit(wakeBit, &pipe->flags);
/* Tell the emulator we're going to wait for a wake event */
- goldfish_cmd(pipe, CMD_WAKE_ON_WRITE + cmd_offset);
+ goldfish_cmd(pipe,
+ is_write ? CMD_WAKE_ON_WRITE : CMD_WAKE_ON_READ);
/* Unlock the pipe, then wait for the wake signal */
mutex_unlock(&pipe->lock);
@@ -378,22 +398,29 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer,
while (test_bit(wakeBit, &pipe->flags)) {
if (wait_event_interruptible(
pipe->wake_queue,
- !test_bit(wakeBit, &pipe->flags)))
- return -ERESTARTSYS;
+ !test_bit(wakeBit, &pipe->flags))) {
+ ret = -ERESTARTSYS;
+ break;
+ }
- if (test_bit(BIT_CLOSED_ON_HOST, &pipe->flags))
- return -EIO;
+ if (test_bit(BIT_CLOSED_ON_HOST, &pipe->flags)) {
+ ret = -EIO;
+ break;
+ }
}
/* Try to re-acquire the lock */
- if (mutex_lock_interruptible(&pipe->lock))
- return -ERESTARTSYS;
-
- /* Try the transfer again */
- continue;
+ if (mutex_lock_interruptible(&pipe->lock)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
}
mutex_unlock(&pipe->lock);
- return ret;
+
+ if (ret < 0)
+ return ret;
+ else
+ return count;
}
static ssize_t goldfish_pipe_read(struct file *filp, char __user *buffer,
@@ -446,10 +473,11 @@ static irqreturn_t goldfish_pipe_interrupt(int irq, void *dev_id)
unsigned long irq_flags;
int count = 0;
- /* We're going to read from the emulator a list of (channel,flags)
- * pairs corresponding to the wake events that occured on each
- * blocked pipe (i.e. channel).
- */
+ /*
+ * We're going to read from the emulator a list of (channel,flags)
+ * pairs corresponding to the wake events that occured on each
+ * blocked pipe (i.e. channel).
+ */
spin_lock_irqsave(&dev->lock, irq_flags);
for (;;) {
/* First read the channel, 0 means the end of the list */
@@ -600,6 +628,11 @@ static int goldfish_pipe_probe(struct platform_device *pdev)
goto error;
}
setup_access_params_addr(pdev, dev);
+
+ /* Although the pipe device in the classic Android emulator does not
+ * recognize the 'version' register, it won't treat this as an error
+ * either and will simply return 0, which is fine. */
+ dev->version = readl(dev->base + PIPE_REG_VERSION);
return 0;
error:
@@ -615,11 +648,26 @@ static int goldfish_pipe_remove(struct platform_device *pdev)
return 0;
}
+static const struct acpi_device_id goldfish_pipe_acpi_match[] = {
+ { "GFSH0003", 0 },
+ { },
+};
+MODULE_DEVICE_TABLE(acpi, goldfish_pipe_acpi_match);
+
+static const struct of_device_id goldfish_pipe_of_match[] = {
+ { .compatible = "generic,android-pipe", },
+ {},
+};
+MODULE_DEVICE_TABLE(of, goldfish_pipe_of_match);
+
static struct platform_driver goldfish_pipe = {
.probe = goldfish_pipe_probe,
.remove = goldfish_pipe_remove,
.driver = {
- .name = "goldfish_pipe"
+ .name = "goldfish_pipe",
+ .owner = THIS_MODULE,
+ .of_match_table = goldfish_pipe_of_match,
+ .acpi_match_table = ACPI_PTR(goldfish_pipe_acpi_match),
}
};
diff --git a/drivers/power/power_supply_sysfs.c b/drivers/power/power_supply_sysfs.c
index 62653f50a524..87a348cc1e6e 100644
--- a/drivers/power/power_supply_sysfs.c
+++ b/drivers/power/power_supply_sysfs.c
@@ -106,7 +106,10 @@ static ssize_t power_supply_show_property(struct device *dev,
else if (off >= POWER_SUPPLY_PROP_MODEL_NAME)
return sprintf(buf, "%s\n", value.strval);
- return sprintf(buf, "%d\n", value.intval);
+ if (off == POWER_SUPPLY_PROP_CHARGE_COUNTER_EXT)
+ return sprintf(buf, "%lld\n", value.int64val);
+ else
+ return sprintf(buf, "%d\n", value.intval);
}
static ssize_t power_supply_store_property(struct device *dev,
@@ -197,6 +200,12 @@ static struct device_attribute power_supply_attrs[] = {
POWER_SUPPLY_ATTR(scope),
POWER_SUPPLY_ATTR(charge_term_current),
POWER_SUPPLY_ATTR(calibrate),
+ /* Local extensions */
+ POWER_SUPPLY_ATTR(usb_hc),
+ POWER_SUPPLY_ATTR(usb_otg),
+ POWER_SUPPLY_ATTR(charge_enabled),
+ /* Local extensions of type int64_t */
+ POWER_SUPPLY_ATTR(charge_counter_ext),
/* Properties of type `const char *' */
POWER_SUPPLY_ATTR(model_name),
POWER_SUPPLY_ATTR(manufacturer),
diff --git a/drivers/rtc/rtc-palmas.c b/drivers/rtc/rtc-palmas.c
index 4dfe2d793fa3..d2346e0b4d29 100644
--- a/drivers/rtc/rtc-palmas.c
+++ b/drivers/rtc/rtc-palmas.c
@@ -45,6 +45,42 @@ struct palmas_rtc {
/* Total number of RTC registers needed to set time*/
#define PALMAS_NUM_TIME_REGS (PALMAS_YEARS_REG - PALMAS_SECONDS_REG + 1)
+/*
+ * Special bin2bcd mapping to deal with bcd storage of year.
+ *
+ * 0-69 -> 0xD0
+ * 70-99 (1970 - 1999) -> 0xD0 - 0xF9 (correctly rolls to 0x00)
+ * 100-199 (2000 - 2099) -> 0x00 - 0x99 (does not roll to 0xA0 :-( )
+ * 200-229 (2100 - 2129) -> 0xA0 - 0xC9 (really for completeness)
+ * 230- -> 0xC9
+ *
+ * Confirmed: the only transition that does not work correctly for this rtc
+ * clock is the transition from 2099 to 2100, it proceeds to 2000. We will
+ * accept this issue since the clock retains and transitions the year correctly
+ * in all other conditions.
+ */
+static unsigned char year_bin2bcd(int val)
+{
+ if (val < 70)
+ return 0xD0;
+ if (val < 100)
+ return bin2bcd(val - 20) | 0x80; /* KISS leverage of bin2bcd */
+ if (val >= 230)
+ return 0xC9;
+ if (val >= 200)
+ return bin2bcd(val - 180) | 0x80;
+ return bin2bcd(val - 100);
+}
+
+static int year_bcd2bin(unsigned char val)
+{
+ if (val >= 0xD0)
+ return bcd2bin(val & 0x7F) + 20;
+ if (val >= 0xA0)
+ return bcd2bin(val & 0x7F) + 180;
+ return bcd2bin(val) + 100;
+}
+
static int palmas_rtc_read_time(struct device *dev, struct rtc_time *tm)
{
unsigned char rtc_data[PALMAS_NUM_TIME_REGS];
@@ -71,7 +107,7 @@ static int palmas_rtc_read_time(struct device *dev, struct rtc_time *tm)
tm->tm_hour = bcd2bin(rtc_data[2]);
tm->tm_mday = bcd2bin(rtc_data[3]);
tm->tm_mon = bcd2bin(rtc_data[4]) - 1;
- tm->tm_year = bcd2bin(rtc_data[5]) + 100;
+ tm->tm_year = year_bcd2bin(rtc_data[5]);
return ret;
}
@@ -87,7 +123,7 @@ static int palmas_rtc_set_time(struct device *dev, struct rtc_time *tm)
rtc_data[2] = bin2bcd(tm->tm_hour);
rtc_data[3] = bin2bcd(tm->tm_mday);
rtc_data[4] = bin2bcd(tm->tm_mon + 1);
- rtc_data[5] = bin2bcd(tm->tm_year - 100);
+ rtc_data[5] = year_bin2bcd(tm->tm_year);
/* Stop RTC while updating the RTC time registers */
ret = palmas_update_bits(palmas, PALMAS_RTC_BASE, PALMAS_RTC_CTRL_REG,
@@ -142,7 +178,7 @@ static int palmas_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alm)
alm->time.tm_hour = bcd2bin(alarm_data[2]);
alm->time.tm_mday = bcd2bin(alarm_data[3]);
alm->time.tm_mon = bcd2bin(alarm_data[4]) - 1;
- alm->time.tm_year = bcd2bin(alarm_data[5]) + 100;
+ alm->time.tm_year = year_bcd2bin(alarm_data[5]);
ret = palmas_read(palmas, PALMAS_RTC_BASE, PALMAS_RTC_INTERRUPTS_REG,
&int_val);
@@ -173,7 +209,7 @@ static int palmas_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alm)
alarm_data[2] = bin2bcd(alm->time.tm_hour);
alarm_data[3] = bin2bcd(alm->time.tm_mday);
alarm_data[4] = bin2bcd(alm->time.tm_mon + 1);
- alarm_data[5] = bin2bcd(alm->time.tm_year - 100);
+ alarm_data[5] = year_bin2bcd(alm->time.tm_year);
ret = palmas_bulk_write(palmas, PALMAS_RTC_BASE,
PALMAS_ALARM_SECONDS_REG, alarm_data, PALMAS_NUM_TIME_REGS);
diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index 605ca60e8a10..64b1b2ba3a44 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -39,6 +39,7 @@
#include <linux/async.h>
#include <linux/devfreq.h>
+#include <linux/blkdev.h>
#include "ufshcd.h"
#include "unipro.h"
@@ -1313,6 +1314,16 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
clear_bit_unlock(tag, &hba->lrb_in_use);
goto out;
}
+ /* IO svc time latency histogram */
+ if (hba != NULL && cmd->request != NULL) {
+ if (hba->latency_hist_enabled &&
+ (cmd->request->cmd_type == REQ_TYPE_FS)) {
+ cmd->request->lat_hist_io_start = ktime_get();
+ cmd->request->lat_hist_enabled = 1;
+ } else
+ cmd->request->lat_hist_enabled = 0;
+ }
+
WARN_ON(hba->clk_gating.state != CLKS_ON);
lrbp = &hba->lrb[tag];
@@ -3051,6 +3062,7 @@ static void ufshcd_transfer_req_compl(struct ufs_hba *hba)
u32 tr_doorbell;
int result;
int index;
+ struct request *req;
/* Resetting interrupt aggregation counters first and reading the
* DOOR_BELL afterward allows us to handle all the completed requests.
@@ -3074,6 +3086,22 @@ static void ufshcd_transfer_req_compl(struct ufs_hba *hba)
/* Mark completed command as NULL in LRB */
lrbp->cmd = NULL;
clear_bit_unlock(index, &hba->lrb_in_use);
+ req = cmd->request;
+ if (req) {
+ /* Update IO svc time latency histogram */
+ if (req->lat_hist_enabled) {
+ ktime_t completion;
+ u_int64_t delta_us;
+
+ completion = ktime_get();
+ delta_us = ktime_us_delta(completion,
+ req->lat_hist_io_start);
+ /* rq_data_dir() => true if WRITE */
+ blk_update_latency_hist(&hba->io_lat_s,
+ (rq_data_dir(req) == READ),
+ delta_us);
+ }
+ }
/* Do not touch lrbp after scsi done */
cmd->scsi_done(cmd);
__ufshcd_release(hba);
@@ -5248,6 +5276,54 @@ out:
}
EXPORT_SYMBOL(ufshcd_shutdown);
+/*
+ * Values permitted 0, 1, 2.
+ * 0 -> Disable IO latency histograms (default)
+ * 1 -> Enable IO latency histograms
+ * 2 -> Zero out IO latency histograms
+ */
+static ssize_t
+latency_hist_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct ufs_hba *hba = dev_get_drvdata(dev);
+ long value;
+
+ if (kstrtol(buf, 0, &value))
+ return -EINVAL;
+ if (value == BLK_IO_LAT_HIST_ZERO)
+ blk_zero_latency_hist(&hba->io_lat_s);
+ else if (value == BLK_IO_LAT_HIST_ENABLE ||
+ value == BLK_IO_LAT_HIST_DISABLE)
+ hba->latency_hist_enabled = value;
+ return count;
+}
+
+ssize_t
+latency_hist_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct ufs_hba *hba = dev_get_drvdata(dev);
+
+ return blk_latency_hist_show(&hba->io_lat_s, buf);
+}
+
+static DEVICE_ATTR(latency_hist, S_IRUGO | S_IWUSR,
+ latency_hist_show, latency_hist_store);
+
+static void
+ufshcd_init_latency_hist(struct ufs_hba *hba)
+{
+ if (device_create_file(hba->dev, &dev_attr_latency_hist))
+ dev_err(hba->dev, "Failed to create latency_hist sysfs entry\n");
+}
+
+static void
+ufshcd_exit_latency_hist(struct ufs_hba *hba)
+{
+ device_create_file(hba->dev, &dev_attr_latency_hist);
+}
+
/**
* ufshcd_remove - de-allocate SCSI host and host memory space
* data structure memory
@@ -5263,6 +5339,7 @@ void ufshcd_remove(struct ufs_hba *hba)
scsi_host_put(hba->host);
ufshcd_exit_clk_gating(hba);
+ ufshcd_exit_latency_hist(hba);
if (ufshcd_is_clkscaling_enabled(hba))
devfreq_remove_device(hba->devfreq);
ufshcd_hba_exit(hba);
@@ -5552,6 +5629,8 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq)
/* Hold auto suspend until async scan completes */
pm_runtime_get_sync(dev);
+ ufshcd_init_latency_hist(hba);
+
/*
* The device-initialize-sequence hasn't been invoked yet.
* Set the device to power-off state
@@ -5566,6 +5645,7 @@ out_remove_scsi_host:
scsi_remove_host(hba->host);
exit_gating:
ufshcd_exit_clk_gating(hba);
+ ufshcd_exit_latency_hist(hba);
out_disable:
hba->is_irq_enabled = false;
scsi_host_put(host);
diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
index 4a574aa45855..241810c83099 100644
--- a/drivers/scsi/ufs/ufshcd.h
+++ b/drivers/scsi/ufs/ufshcd.h
@@ -473,6 +473,9 @@ struct ufs_hba {
struct devfreq *devfreq;
struct ufs_clk_scaling clk_scaling;
bool is_sys_suspended;
+
+ int latency_hist_enabled;
+ struct io_latency_state io_lat_s;
};
/* Returns true if clocks can be gated. Otherwise false */
diff --git a/drivers/staging/android/Kconfig b/drivers/staging/android/Kconfig
index 7a0e28852965..bd06dc5be88b 100644
--- a/drivers/staging/android/Kconfig
+++ b/drivers/staging/android/Kconfig
@@ -32,6 +32,18 @@ config ANDROID_BINDER_IPC_32BIT
Note that enabling this will break newer Android user-space.
+config ANDROID_BINDER_DEVICES
+ string "Android Binder devices"
+ depends on ANDROID_BINDER_IPC
+ default "binder,hwbinder,vndbinder"
+ ---help---
+ Default value for the binder.devices parameter.
+
+ The binder.devices parameter is a comma-separated list of strings
+ that specifies the names of the binder device nodes that will be
+ created. Each binder device has its own context manager, and is
+ therefore logically separated from the other devices.
+
config ASHMEM
bool "Enable the Anonymous Shared Memory Subsystem"
default n
@@ -44,23 +56,6 @@ config ASHMEM
It is, in theory, a good memory allocator for low-memory devices,
because it can discard shared memory units when under memory pressure.
-config ANDROID_LOGGER
- tristate "Android log driver"
- default n
- ---help---
- This adds support for system-wide logging using four log buffers.
-
- These are:
-
- 1: main
- 2: events
- 3: radio
- 4: system
-
- Log reading and writing is performed via normal Linux reads and
- optimized writes. This optimization avoids logging having too
- much overhead in the system.
-
config ANDROID_TIMED_OUTPUT
bool "Timed output class driver"
default y
@@ -75,14 +70,14 @@ config ANDROID_LOW_MEMORY_KILLER
---help---
Registers processes to be killed when memory is low
-config ANDROID_INTF_ALARM_DEV
- tristate "Android alarm driver"
- depends on RTC_CLASS
- default n
+config ANDROID_LOW_MEMORY_KILLER_AUTODETECT_OOM_ADJ_VALUES
+ bool "Android Low Memory Killer: detect oom_adj values"
+ depends on ANDROID_LOW_MEMORY_KILLER
+ default y
---help---
- Provides non-wakeup and rtc backed wakeup alarms based on rtc or
- elapsed realtime, and a non-wakeup alarm on the monotonic clock.
- Also exports the alarm interface to user-space.
+ Detect oom_adj values written to
+ /sys/module/lowmemorykiller/parameters/adj and convert them
+ to oom_score_adj values.
config SYNC
bool "Synchronization framework"
@@ -114,6 +109,8 @@ config SW_SYNC_USER
source "drivers/staging/android/ion/Kconfig"
+source "drivers/staging/android/fiq_debugger/Kconfig"
+
endif # if ANDROID
endmenu
diff --git a/drivers/staging/android/Makefile b/drivers/staging/android/Makefile
index 517ad5ffa429..b7b9980bea97 100644
--- a/drivers/staging/android/Makefile
+++ b/drivers/staging/android/Makefile
@@ -1,13 +1,12 @@
ccflags-y += -I$(src) # needed for trace events
obj-y += ion/
+obj-$(CONFIG_FIQ_DEBUGGER) += fiq_debugger/
obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o
obj-$(CONFIG_ASHMEM) += ashmem.o
-obj-$(CONFIG_ANDROID_LOGGER) += logger.o
obj-$(CONFIG_ANDROID_TIMED_OUTPUT) += timed_output.o
obj-$(CONFIG_ANDROID_TIMED_GPIO) += timed_gpio.o
obj-$(CONFIG_ANDROID_LOW_MEMORY_KILLER) += lowmemorykiller.o
-obj-$(CONFIG_ANDROID_INTF_ALARM_DEV) += alarm-dev.o
obj-$(CONFIG_SYNC) += sync.o sync_debug.o
obj-$(CONFIG_SW_SYNC) += sw_sync.o
diff --git a/drivers/staging/android/TODO b/drivers/staging/android/TODO
deleted file mode 100644
index b15fb0d6b152..000000000000
--- a/drivers/staging/android/TODO
+++ /dev/null
@@ -1,10 +0,0 @@
-TODO:
- - checkpatch.pl cleanups
- - sparse fixes
- - rename files to be not so "generic"
- - make sure things build as modules properly
- - add proper arch dependencies as needed
- - audit userspace interfaces to make sure they are sane
-
-Please send patches to Greg Kroah-Hartman <greg@kroah.com> and Cc:
-Brian Swetland <swetland@google.com>
diff --git a/drivers/staging/android/alarm-dev.c b/drivers/staging/android/alarm-dev.c
deleted file mode 100644
index ff4b3e8758a7..000000000000
--- a/drivers/staging/android/alarm-dev.c
+++ /dev/null
@@ -1,446 +0,0 @@
-/* drivers/rtc/alarm-dev.c
- *
- * Copyright (C) 2007-2009 Google, Inc.
- *
- * This software is licensed under the terms of the GNU General Public
- * License version 2, as published by the Free Software Foundation, and
- * may be copied, distributed, and modified under those terms.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- */
-
-#include <linux/time.h>
-#include <linux/module.h>
-#include <linux/device.h>
-#include <linux/miscdevice.h>
-#include <linux/fs.h>
-#include <linux/platform_device.h>
-#include <linux/sched.h>
-#include <linux/spinlock.h>
-#include <linux/uaccess.h>
-#include <linux/alarmtimer.h>
-#include "android_alarm.h"
-
-#define ANDROID_ALARM_PRINT_INFO (1U << 0)
-#define ANDROID_ALARM_PRINT_IO (1U << 1)
-#define ANDROID_ALARM_PRINT_INT (1U << 2)
-
-static int debug_mask = ANDROID_ALARM_PRINT_INFO;
-module_param_named(debug_mask, debug_mask, int, S_IRUGO | S_IWUSR | S_IWGRP);
-
-#define alarm_dbg(debug_level_mask, fmt, ...) \
-do { \
- if (debug_mask & ANDROID_ALARM_PRINT_##debug_level_mask) \
- pr_info(fmt, ##__VA_ARGS__); \
-} while (0)
-
-#define ANDROID_ALARM_WAKEUP_MASK ( \
- ANDROID_ALARM_RTC_WAKEUP_MASK | \
- ANDROID_ALARM_ELAPSED_REALTIME_WAKEUP_MASK)
-
-static int alarm_opened;
-static DEFINE_SPINLOCK(alarm_slock);
-static struct wakeup_source alarm_wake_lock;
-static DECLARE_WAIT_QUEUE_HEAD(alarm_wait_queue);
-static uint32_t alarm_pending;
-static uint32_t alarm_enabled;
-static uint32_t wait_pending;
-
-struct devalarm {
- union {
- struct hrtimer hrt;
- struct alarm alrm;
- } u;
- enum android_alarm_type type;
-};
-
-static struct devalarm alarms[ANDROID_ALARM_TYPE_COUNT];
-
-/**
- * is_wakeup() - Checks to see if this alarm can wake the device
- * @type: The type of alarm being checked
- *
- * Return: 1 if this is a wakeup alarm, otherwise 0
- */
-static int is_wakeup(enum android_alarm_type type)
-{
- return type == ANDROID_ALARM_RTC_WAKEUP ||
- type == ANDROID_ALARM_ELAPSED_REALTIME_WAKEUP;
-}
-
-static void devalarm_start(struct devalarm *alrm, ktime_t exp)
-{
- if (is_wakeup(alrm->type))
- alarm_start(&alrm->u.alrm, exp);
- else
- hrtimer_start(&alrm->u.hrt, exp, HRTIMER_MODE_ABS);
-}
-
-static int devalarm_try_to_cancel(struct devalarm *alrm)
-{
- if (is_wakeup(alrm->type))
- return alarm_try_to_cancel(&alrm->u.alrm);
- return hrtimer_try_to_cancel(&alrm->u.hrt);
-}
-
-static void devalarm_cancel(struct devalarm *alrm)
-{
- if (is_wakeup(alrm->type))
- alarm_cancel(&alrm->u.alrm);
- else
- hrtimer_cancel(&alrm->u.hrt);
-}
-
-static void alarm_clear(enum android_alarm_type alarm_type)
-{
- uint32_t alarm_type_mask = 1U << alarm_type;
- unsigned long flags;
-
- spin_lock_irqsave(&alarm_slock, flags);
- alarm_dbg(IO, "alarm %d clear\n", alarm_type);
- devalarm_try_to_cancel(&alarms[alarm_type]);
- if (alarm_pending) {
- alarm_pending &= ~alarm_type_mask;
- if (!alarm_pending && !wait_pending)
- __pm_relax(&alarm_wake_lock);
- }
- alarm_enabled &= ~alarm_type_mask;
- spin_unlock_irqrestore(&alarm_slock, flags);
-}
-
-static void alarm_set(enum android_alarm_type alarm_type,
- struct timespec *ts)
-{
- uint32_t alarm_type_mask = 1U << alarm_type;
- unsigned long flags;
-
- spin_lock_irqsave(&alarm_slock, flags);
- alarm_dbg(IO, "alarm %d set %ld.%09ld\n",
- alarm_type, ts->tv_sec, ts->tv_nsec);
- alarm_enabled |= alarm_type_mask;
- devalarm_start(&alarms[alarm_type], timespec_to_ktime(*ts));
- spin_unlock_irqrestore(&alarm_slock, flags);
-}
-
-static int alarm_wait(void)
-{
- unsigned long flags;
- int rv = 0;
-
- spin_lock_irqsave(&alarm_slock, flags);
- alarm_dbg(IO, "alarm wait\n");
- if (!alarm_pending && wait_pending) {
- __pm_relax(&alarm_wake_lock);
- wait_pending = 0;
- }
- spin_unlock_irqrestore(&alarm_slock, flags);
-
- rv = wait_event_interruptible(alarm_wait_queue, alarm_pending);
- if (rv)
- return rv;
-
- spin_lock_irqsave(&alarm_slock, flags);
- rv = alarm_pending;
- wait_pending = 1;
- alarm_pending = 0;
- spin_unlock_irqrestore(&alarm_slock, flags);
-
- return rv;
-}
-
-static int alarm_set_rtc(struct timespec *ts)
-{
- struct rtc_time new_rtc_tm;
- struct rtc_device *rtc_dev;
- unsigned long flags;
- int rv = 0;
-
- rtc_time_to_tm(ts->tv_sec, &new_rtc_tm);
- rtc_dev = alarmtimer_get_rtcdev();
- rv = do_settimeofday(ts);
- if (rv < 0)
- return rv;
- if (rtc_dev)
- rv = rtc_set_time(rtc_dev, &new_rtc_tm);
-
- spin_lock_irqsave(&alarm_slock, flags);
- alarm_pending |= ANDROID_ALARM_TIME_CHANGE_MASK;
- wake_up(&alarm_wait_queue);
- spin_unlock_irqrestore(&alarm_slock, flags);
-
- return rv;
-}
-
-static int alarm_get_time(enum android_alarm_type alarm_type,
- struct timespec *ts)
-{
- int rv = 0;
-
- switch (alarm_type) {
- case ANDROID_ALARM_RTC_WAKEUP:
- case ANDROID_ALARM_RTC:
- getnstimeofday(ts);
- break;
- case ANDROID_ALARM_ELAPSED_REALTIME_WAKEUP:
- case ANDROID_ALARM_ELAPSED_REALTIME:
- get_monotonic_boottime(ts);
- break;
- case ANDROID_ALARM_SYSTEMTIME:
- ktime_get_ts(ts);
- break;
- default:
- rv = -EINVAL;
- }
- return rv;
-}
-
-static long alarm_do_ioctl(struct file *file, unsigned int cmd,
- struct timespec *ts)
-{
- int rv = 0;
- unsigned long flags;
- enum android_alarm_type alarm_type = ANDROID_ALARM_IOCTL_TO_TYPE(cmd);
-
- if (alarm_type >= ANDROID_ALARM_TYPE_COUNT)
- return -EINVAL;
-
- if (ANDROID_ALARM_BASE_CMD(cmd) != ANDROID_ALARM_GET_TIME(0)) {
- if ((file->f_flags & O_ACCMODE) == O_RDONLY)
- return -EPERM;
- if (file->private_data == NULL &&
- cmd != ANDROID_ALARM_SET_RTC) {
- spin_lock_irqsave(&alarm_slock, flags);
- if (alarm_opened) {
- spin_unlock_irqrestore(&alarm_slock, flags);
- return -EBUSY;
- }
- alarm_opened = 1;
- file->private_data = (void *)1;
- spin_unlock_irqrestore(&alarm_slock, flags);
- }
- }
-
- switch (ANDROID_ALARM_BASE_CMD(cmd)) {
- case ANDROID_ALARM_CLEAR(0):
- alarm_clear(alarm_type);
- break;
- case ANDROID_ALARM_SET(0):
- alarm_set(alarm_type, ts);
- break;
- case ANDROID_ALARM_SET_AND_WAIT(0):
- alarm_set(alarm_type, ts);
- /* fall though */
- case ANDROID_ALARM_WAIT:
- rv = alarm_wait();
- break;
- case ANDROID_ALARM_SET_RTC:
- rv = alarm_set_rtc(ts);
- break;
- case ANDROID_ALARM_GET_TIME(0):
- rv = alarm_get_time(alarm_type, ts);
- break;
-
- default:
- rv = -EINVAL;
- }
- return rv;
-}
-
-static long alarm_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
-{
-
- struct timespec ts;
- int rv;
-
- switch (ANDROID_ALARM_BASE_CMD(cmd)) {
- case ANDROID_ALARM_SET_AND_WAIT(0):
- case ANDROID_ALARM_SET(0):
- case ANDROID_ALARM_SET_RTC:
- if (copy_from_user(&ts, (void __user *)arg, sizeof(ts)))
- return -EFAULT;
- break;
- }
-
- rv = alarm_do_ioctl(file, cmd, &ts);
- if (rv)
- return rv;
-
- switch (ANDROID_ALARM_BASE_CMD(cmd)) {
- case ANDROID_ALARM_GET_TIME(0):
- if (copy_to_user((void __user *)arg, &ts, sizeof(ts)))
- return -EFAULT;
- break;
- }
-
- return 0;
-}
-
-#ifdef CONFIG_COMPAT
-static long alarm_compat_ioctl(struct file *file, unsigned int cmd,
- unsigned long arg)
-{
-
- struct timespec ts;
- int rv;
-
- switch (ANDROID_ALARM_BASE_CMD(cmd)) {
- case ANDROID_ALARM_SET_AND_WAIT_COMPAT(0):
- case ANDROID_ALARM_SET_COMPAT(0):
- case ANDROID_ALARM_SET_RTC_COMPAT:
- if (compat_get_timespec(&ts, (void __user *)arg))
- return -EFAULT;
- /* fall through */
- case ANDROID_ALARM_GET_TIME_COMPAT(0):
- cmd = ANDROID_ALARM_COMPAT_TO_NORM(cmd);
- break;
- }
-
- rv = alarm_do_ioctl(file, cmd, &ts);
- if (rv)
- return rv;
-
- switch (ANDROID_ALARM_BASE_CMD(cmd)) {
- case ANDROID_ALARM_GET_TIME(0): /* NOTE: we modified cmd above */
- if (compat_put_timespec(&ts, (void __user *)arg))
- return -EFAULT;
- break;
- }
-
- return 0;
-}
-#endif
-
-static int alarm_open(struct inode *inode, struct file *file)
-{
- file->private_data = NULL;
- return 0;
-}
-
-static int alarm_release(struct inode *inode, struct file *file)
-{
- int i;
- unsigned long flags;
-
- spin_lock_irqsave(&alarm_slock, flags);
- if (file->private_data) {
- for (i = 0; i < ANDROID_ALARM_TYPE_COUNT; i++) {
- uint32_t alarm_type_mask = 1U << i;
-
- if (alarm_enabled & alarm_type_mask) {
- alarm_dbg(INFO,
- "%s: clear alarm, pending %d\n",
- __func__,
- !!(alarm_pending & alarm_type_mask));
- alarm_enabled &= ~alarm_type_mask;
- }
- spin_unlock_irqrestore(&alarm_slock, flags);
- devalarm_cancel(&alarms[i]);
- spin_lock_irqsave(&alarm_slock, flags);
- }
- if (alarm_pending | wait_pending) {
- if (alarm_pending)
- alarm_dbg(INFO, "%s: clear pending alarms %x\n",
- __func__, alarm_pending);
- __pm_relax(&alarm_wake_lock);
- wait_pending = 0;
- alarm_pending = 0;
- }
- alarm_opened = 0;
- }
- spin_unlock_irqrestore(&alarm_slock, flags);
- return 0;
-}
-
-static void devalarm_triggered(struct devalarm *alarm)
-{
- unsigned long flags;
- uint32_t alarm_type_mask = 1U << alarm->type;
-
- alarm_dbg(INT, "%s: type %d\n", __func__, alarm->type);
- spin_lock_irqsave(&alarm_slock, flags);
- if (alarm_enabled & alarm_type_mask) {
- __pm_wakeup_event(&alarm_wake_lock, 5000); /* 5secs */
- alarm_enabled &= ~alarm_type_mask;
- alarm_pending |= alarm_type_mask;
- wake_up(&alarm_wait_queue);
- }
- spin_unlock_irqrestore(&alarm_slock, flags);
-}
-
-static enum hrtimer_restart devalarm_hrthandler(struct hrtimer *hrt)
-{
- struct devalarm *devalrm = container_of(hrt, struct devalarm, u.hrt);
-
- devalarm_triggered(devalrm);
- return HRTIMER_NORESTART;
-}
-
-static enum alarmtimer_restart devalarm_alarmhandler(struct alarm *alrm,
- ktime_t now)
-{
- struct devalarm *devalrm = container_of(alrm, struct devalarm, u.alrm);
-
- devalarm_triggered(devalrm);
- return ALARMTIMER_NORESTART;
-}
-
-
-static const struct file_operations alarm_fops = {
- .owner = THIS_MODULE,
- .unlocked_ioctl = alarm_ioctl,
- .open = alarm_open,
- .release = alarm_release,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = alarm_compat_ioctl,
-#endif
-};
-
-static struct miscdevice alarm_device = {
- .minor = MISC_DYNAMIC_MINOR,
- .name = "alarm",
- .fops = &alarm_fops,
-};
-
-static int __init alarm_dev_init(void)
-{
- int err;
- int i;
-
- err = misc_register(&alarm_device);
- if (err)
- return err;
-
- alarm_init(&alarms[ANDROID_ALARM_RTC_WAKEUP].u.alrm,
- ALARM_REALTIME, devalarm_alarmhandler);
- hrtimer_init(&alarms[ANDROID_ALARM_RTC].u.hrt,
- CLOCK_REALTIME, HRTIMER_MODE_ABS);
- alarm_init(&alarms[ANDROID_ALARM_ELAPSED_REALTIME_WAKEUP].u.alrm,
- ALARM_BOOTTIME, devalarm_alarmhandler);
- hrtimer_init(&alarms[ANDROID_ALARM_ELAPSED_REALTIME].u.hrt,
- CLOCK_BOOTTIME, HRTIMER_MODE_ABS);
- hrtimer_init(&alarms[ANDROID_ALARM_SYSTEMTIME].u.hrt,
- CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
-
- for (i = 0; i < ANDROID_ALARM_TYPE_COUNT; i++) {
- alarms[i].type = i;
- if (!is_wakeup(i))
- alarms[i].u.hrt.function = devalarm_hrthandler;
- }
-
- wakeup_source_init(&alarm_wake_lock, "alarm");
- return 0;
-}
-
-static void __exit alarm_dev_exit(void)
-{
- misc_deregister(&alarm_device);
- wakeup_source_trash(&alarm_wake_lock);
-}
-
-module_init(alarm_dev_init);
-module_exit(alarm_dev_exit);
-MODULE_LICENSE("GPL");
diff --git a/drivers/staging/android/android_alarm.h b/drivers/staging/android/android_alarm.h
deleted file mode 100644
index 495b20cf3bf6..000000000000
--- a/drivers/staging/android/android_alarm.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* include/linux/android_alarm.h
- *
- * Copyright (C) 2006-2007 Google, Inc.
- *
- * This software is licensed under the terms of the GNU General Public
- * License version 2, as published by the Free Software Foundation, and
- * may be copied, distributed, and modified under those terms.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- */
-
-#ifndef _LINUX_ANDROID_ALARM_H
-#define _LINUX_ANDROID_ALARM_H
-
-#include <linux/compat.h>
-#include <linux/ioctl.h>
-
-#include "uapi/android_alarm.h"
-
-#ifdef CONFIG_COMPAT
-#define ANDROID_ALARM_SET_COMPAT(type) ALARM_IOW(2, type, \
- struct compat_timespec)
-#define ANDROID_ALARM_SET_AND_WAIT_COMPAT(type) ALARM_IOW(3, type, \
- struct compat_timespec)
-#define ANDROID_ALARM_GET_TIME_COMPAT(type) ALARM_IOW(4, type, \
- struct compat_timespec)
-#define ANDROID_ALARM_SET_RTC_COMPAT _IOW('a', 5, \
- struct compat_timespec)
-#define ANDROID_ALARM_IOCTL_NR(cmd) (_IOC_NR(cmd) & ((1<<4)-1))
-#define ANDROID_ALARM_COMPAT_TO_NORM(cmd) \
- ALARM_IOW(ANDROID_ALARM_IOCTL_NR(cmd), \
- ANDROID_ALARM_IOCTL_TO_TYPE(cmd), \
- struct timespec)
-
-#endif
-
-#endif
diff --git a/drivers/staging/android/ashmem.c b/drivers/staging/android/ashmem.c
index 658d640022be..9ae743ce7291 100644
--- a/drivers/staging/android/ashmem.c
+++ b/drivers/staging/android/ashmem.c
@@ -397,22 +397,14 @@ static int ashmem_mmap(struct file *file, struct vm_area_struct *vma)
}
get_file(asma->file);
- /*
- * XXX - Reworked to use shmem_zero_setup() instead of
- * shmem_set_file while we're in staging. -jstultz
- */
- if (vma->vm_flags & VM_SHARED) {
- ret = shmem_zero_setup(vma);
- if (ret) {
- fput(asma->file);
- goto out;
- }
+ if (vma->vm_flags & VM_SHARED)
+ shmem_set_file(vma, asma->file);
+ else {
+ if (vma->vm_file)
+ fput(vma->vm_file);
+ vma->vm_file = asma->file;
}
- if (vma->vm_file)
- fput(vma->vm_file);
- vma->vm_file = asma->file;
-
out:
mutex_unlock(&ashmem_mutex);
return ret;
@@ -442,12 +434,14 @@ ashmem_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
if (!(sc->gfp_mask & __GFP_FS))
return SHRINK_STOP;
- mutex_lock(&ashmem_mutex);
+ if (!mutex_trylock(&ashmem_mutex))
+ return -1;
+
list_for_each_entry_safe(range, next, &ashmem_lru_list, lru) {
loff_t start = range->pgstart * PAGE_SIZE;
loff_t end = (range->pgend + 1) * PAGE_SIZE;
- do_fallocate(range->asma->file,
+ range->asma->file->f_op->fallocate(range->asma->file,
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
start, end - start);
range->purged = ASHMEM_WAS_PURGED;
diff --git a/drivers/staging/android/binder.c b/drivers/staging/android/binder.c
index 2186380a6f51..7c2315c6e6da 100644
--- a/drivers/staging/android/binder.c
+++ b/drivers/staging/android/binder.c
@@ -18,6 +18,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <asm/cacheflush.h>
+#include <linux/atomic.h>
#include <linux/fdtable.h>
#include <linux/file.h>
#include <linux/freezer.h>
@@ -37,24 +38,16 @@
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/pid_namespace.h>
+#include <linux/security.h>
#include "binder.h"
#include "binder_trace.h"
-static DEFINE_MUTEX(binder_main_lock);
-static DEFINE_MUTEX(binder_deferred_lock);
-static DEFINE_MUTEX(binder_mmap_lock);
-
-static HLIST_HEAD(binder_procs);
-static HLIST_HEAD(binder_deferred_list);
-static HLIST_HEAD(binder_dead_nodes);
+static HLIST_HEAD(binder_devices);
static struct dentry *binder_debugfs_dir_entry_root;
static struct dentry *binder_debugfs_dir_entry_proc;
-static struct binder_node *binder_context_mgr_node;
-static kuid_t binder_context_mgr_uid = INVALID_UID;
-static int binder_last_id;
-static struct workqueue_struct *binder_deferred_workqueue;
+atomic_t binder_last_id;
#define BINDER_DEBUG_ENTRY(name) \
static int binder_##name##_open(struct inode *inode, struct file *file) \
@@ -111,6 +104,9 @@ module_param_named(debug_mask, binder_debug_mask, uint, S_IWUSR | S_IRUGO);
static bool binder_debug_no_lock;
module_param_named(proc_no_lock, binder_debug_no_lock, bool, S_IWUSR | S_IRUGO);
+static char *binder_devices_param = CONFIG_ANDROID_BINDER_DEVICES;
+module_param_named(devices, binder_devices_param, charp, S_IRUGO);
+
static DECLARE_WAIT_QUEUE_HEAD(binder_user_error_wait);
static int binder_stop_on_user_error;
@@ -141,6 +137,17 @@ module_param_call(stop_on_user_error, binder_set_stop_on_user_error,
binder_stop_on_user_error = 2; \
} while (0)
+#define to_flat_binder_object(hdr) \
+ container_of(hdr, struct flat_binder_object, hdr)
+
+#define to_binder_fd_object(hdr) container_of(hdr, struct binder_fd_object, hdr)
+
+#define to_binder_buffer_object(hdr) \
+ container_of(hdr, struct binder_buffer_object, hdr)
+
+#define to_binder_fd_array_object(hdr) \
+ container_of(hdr, struct binder_fd_array_object, hdr)
+
enum binder_stat_types {
BINDER_STAT_PROC,
BINDER_STAT_THREAD,
@@ -154,21 +161,25 @@ enum binder_stat_types {
struct binder_stats {
int br[_IOC_NR(BR_FAILED_REPLY) + 1];
- int bc[_IOC_NR(BC_DEAD_BINDER_DONE) + 1];
- int obj_created[BINDER_STAT_COUNT];
- int obj_deleted[BINDER_STAT_COUNT];
+ int bc[_IOC_NR(BC_REPLY_SG) + 1];
+};
+
+/* These are still global, since it's not always easy to get the context */
+struct binder_obj_stats {
+ atomic_t obj_created[BINDER_STAT_COUNT];
+ atomic_t obj_deleted[BINDER_STAT_COUNT];
};
-static struct binder_stats binder_stats;
+static struct binder_obj_stats binder_obj_stats;
static inline void binder_stats_deleted(enum binder_stat_types type)
{
- binder_stats.obj_deleted[type]++;
+ atomic_inc(&binder_obj_stats.obj_deleted[type]);
}
static inline void binder_stats_created(enum binder_stat_types type)
{
- binder_stats.obj_created[type]++;
+ atomic_inc(&binder_obj_stats.obj_created[type]);
}
struct binder_transaction_log_entry {
@@ -182,14 +193,13 @@ struct binder_transaction_log_entry {
int to_node;
int data_size;
int offsets_size;
+ const char *context_name;
};
struct binder_transaction_log {
int next;
int full;
struct binder_transaction_log_entry entry[32];
};
-static struct binder_transaction_log binder_transaction_log;
-static struct binder_transaction_log binder_transaction_log_failed;
static struct binder_transaction_log_entry *binder_transaction_log_add(
struct binder_transaction_log *log)
@@ -206,6 +216,33 @@ static struct binder_transaction_log_entry *binder_transaction_log_add(
return e;
}
+struct binder_context {
+ struct binder_node *binder_context_mgr_node;
+ kuid_t binder_context_mgr_uid;
+ const char *name;
+
+ struct mutex binder_main_lock;
+ struct mutex binder_deferred_lock;
+ struct mutex binder_mmap_lock;
+
+ struct hlist_head binder_procs;
+ struct hlist_head binder_dead_nodes;
+ struct hlist_head binder_deferred_list;
+
+ struct work_struct deferred_work;
+ struct workqueue_struct *binder_deferred_workqueue;
+ struct binder_transaction_log transaction_log;
+ struct binder_transaction_log transaction_log_failed;
+
+ struct binder_stats binder_stats;
+};
+
+struct binder_device {
+ struct hlist_node hlist;
+ struct miscdevice miscdev;
+ struct binder_context context;
+};
+
struct binder_work {
struct list_head entry;
enum {
@@ -278,6 +315,7 @@ struct binder_buffer {
struct binder_node *target_node;
size_t data_size;
size_t offsets_size;
+ size_t extra_buffers_size;
uint8_t data[0];
};
@@ -321,6 +359,7 @@ struct binder_proc {
int ready_threads;
long default_priority;
struct dentry *debugfs_entry;
+ struct binder_context *context;
};
enum {
@@ -418,17 +457,18 @@ static long task_close_fd(struct binder_proc *proc, unsigned int fd)
return retval;
}
-static inline void binder_lock(const char *tag)
+static inline void binder_lock(struct binder_context *context, const char *tag)
{
trace_binder_lock(tag);
- mutex_lock(&binder_main_lock);
+ mutex_lock(&context->binder_main_lock);
trace_binder_locked(tag);
}
-static inline void binder_unlock(const char *tag)
+static inline void binder_unlock(struct binder_context *context,
+ const char *tag)
{
trace_binder_unlock(tag);
- mutex_unlock(&binder_main_lock);
+ mutex_unlock(&context->binder_main_lock);
}
static void binder_set_nice(long nice)
@@ -546,7 +586,6 @@ static int binder_update_page_range(struct binder_proc *proc, int allocate,
{
void *page_addr;
unsigned long user_page_addr;
- struct vm_struct tmp_area;
struct page **page;
struct mm_struct *mm;
@@ -595,10 +634,11 @@ static int binder_update_page_range(struct binder_proc *proc, int allocate,
proc->pid, page_addr);
goto err_alloc_page_failed;
}
- tmp_area.addr = page_addr;
- tmp_area.size = PAGE_SIZE + PAGE_SIZE /* guard page? */;
- ret = map_vm_area(&tmp_area, PAGE_KERNEL, page);
- if (ret) {
+ ret = map_kernel_range_noflush((unsigned long)page_addr,
+ PAGE_SIZE, PAGE_KERNEL, page);
+ flush_cache_vmap((unsigned long)page_addr,
+ (unsigned long)page_addr + PAGE_SIZE);
+ if (ret != 1) {
pr_err("%d: binder_alloc_buf failed to map page at %p in kernel\n",
proc->pid, page_addr);
goto err_map_kernel_failed;
@@ -644,7 +684,9 @@ err_no_vma:
static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc,
size_t data_size,
- size_t offsets_size, int is_async)
+ size_t offsets_size,
+ size_t extra_buffers_size,
+ int is_async)
{
struct rb_node *n = proc->free_buffers.rb_node;
struct binder_buffer *buffer;
@@ -652,7 +694,7 @@ static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc,
struct rb_node *best_fit = NULL;
void *has_page_addr;
void *end_page_addr;
- size_t size;
+ size_t size, data_offsets_size;
if (proc->vma == NULL) {
pr_err("%d: binder_alloc_buf, no vma\n",
@@ -660,15 +702,20 @@ static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc,
return NULL;
}
- size = ALIGN(data_size, sizeof(void *)) +
+ data_offsets_size = ALIGN(data_size, sizeof(void *)) +
ALIGN(offsets_size, sizeof(void *));
- if (size < data_size || size < offsets_size) {
+ if (data_offsets_size < data_size || data_offsets_size < offsets_size) {
binder_user_error("%d: got transaction with invalid size %zd-%zd\n",
proc->pid, data_size, offsets_size);
return NULL;
}
-
+ size = data_offsets_size + ALIGN(extra_buffers_size, sizeof(void *));
+ if (size < data_offsets_size || size < extra_buffers_size) {
+ binder_user_error("%d: got transaction with invalid extra_buffers_size %zd\n",
+ proc->pid, extra_buffers_size);
+ return NULL;
+ }
if (is_async &&
proc->free_async_space < size + sizeof(struct binder_buffer)) {
binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
@@ -737,6 +784,7 @@ static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc,
proc->pid, size, buffer);
buffer->data_size = data_size;
buffer->offsets_size = offsets_size;
+ buffer->extra_buffers_size = extra_buffers_size;
buffer->async_transaction = is_async;
if (is_async) {
proc->free_async_space -= size + sizeof(struct binder_buffer);
@@ -811,7 +859,8 @@ static void binder_free_buf(struct binder_proc *proc,
buffer_size = binder_buffer_size(proc, buffer);
size = ALIGN(buffer->data_size, sizeof(void *)) +
- ALIGN(buffer->offsets_size, sizeof(void *));
+ ALIGN(buffer->offsets_size, sizeof(void *)) +
+ ALIGN(buffer->extra_buffers_size, sizeof(void *));
binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
"%d: binder_free_buf %p size %zd buffer_size %zd\n",
@@ -904,7 +953,7 @@ static struct binder_node *binder_new_node(struct binder_proc *proc,
binder_stats_created(BINDER_STAT_NODE);
rb_link_node(&node->rb_node, parent, p);
rb_insert_color(&node->rb_node, &proc->nodes);
- node->debug_id = ++binder_last_id;
+ node->debug_id = atomic_inc_return(&binder_last_id);
node->proc = proc;
node->ptr = ptr;
node->cookie = cookie;
@@ -925,8 +974,10 @@ static int binder_inc_node(struct binder_node *node, int strong, int internal,
if (internal) {
if (target_list == NULL &&
node->internal_strong_refs == 0 &&
- !(node == binder_context_mgr_node &&
- node->has_strong_ref)) {
+ !(node->proc &&
+ node == node->proc->context->
+ binder_context_mgr_node &&
+ node->has_strong_ref)) {
pr_err("invalid inc strong node for %d\n",
node->debug_id);
return -EINVAL;
@@ -998,7 +1049,7 @@ static int binder_dec_node(struct binder_node *node, int strong, int internal)
static struct binder_ref *binder_get_ref(struct binder_proc *proc,
- u32 desc, bool need_strong_ref)
+ uint32_t desc, bool need_strong_ref)
{
struct rb_node *n = proc->refs_by_desc.rb_node;
struct binder_ref *ref;
@@ -1027,6 +1078,7 @@ static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc,
struct rb_node **p = &proc->refs_by_node.rb_node;
struct rb_node *parent = NULL;
struct binder_ref *ref, *new_ref;
+ struct binder_context *context = proc->context;
while (*p) {
parent = *p;
@@ -1043,13 +1095,13 @@ static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc,
if (new_ref == NULL)
return NULL;
binder_stats_created(BINDER_STAT_REF);
- new_ref->debug_id = ++binder_last_id;
+ new_ref->debug_id = atomic_inc_return(&binder_last_id);
new_ref->proc = proc;
new_ref->node = node;
rb_link_node(&new_ref->rb_node_node, parent, p);
rb_insert_color(&new_ref->rb_node_node, &proc->refs_by_node);
- new_ref->desc = (node == binder_context_mgr_node) ? 0 : 1;
+ new_ref->desc = (node == context->binder_context_mgr_node) ? 0 : 1;
for (n = rb_first(&proc->refs_by_desc); n != NULL; n = rb_next(n)) {
ref = rb_entry(n, struct binder_ref, rb_node_desc);
if (ref->desc > new_ref->desc)
@@ -1236,11 +1288,158 @@ static void binder_send_failed_reply(struct binder_transaction *t,
}
}
+/**
+ * binder_validate_object() - checks for a valid metadata object in a buffer.
+ * @buffer: binder_buffer that we're parsing.
+ * @offset: offset in the buffer at which to validate an object.
+ *
+ * Return: If there's a valid metadata object at @offset in @buffer, the
+ * size of that object. Otherwise, it returns zero.
+ */
+static size_t binder_validate_object(struct binder_buffer *buffer, u64 offset)
+{
+ /* Check if we can read a header first */
+ struct binder_object_header *hdr;
+ size_t object_size = 0;
+
+ if (offset > buffer->data_size - sizeof(*hdr) ||
+ buffer->data_size < sizeof(*hdr) ||
+ !IS_ALIGNED(offset, sizeof(u32)))
+ return 0;
+
+ /* Ok, now see if we can read a complete object. */
+ hdr = (struct binder_object_header *)(buffer->data + offset);
+ switch (hdr->type) {
+ case BINDER_TYPE_BINDER:
+ case BINDER_TYPE_WEAK_BINDER:
+ case BINDER_TYPE_HANDLE:
+ case BINDER_TYPE_WEAK_HANDLE:
+ object_size = sizeof(struct flat_binder_object);
+ break;
+ case BINDER_TYPE_FD:
+ object_size = sizeof(struct binder_fd_object);
+ break;
+ case BINDER_TYPE_PTR:
+ object_size = sizeof(struct binder_buffer_object);
+ break;
+ case BINDER_TYPE_FDA:
+ object_size = sizeof(struct binder_fd_array_object);
+ break;
+ default:
+ return 0;
+ }
+ if (offset <= buffer->data_size - object_size &&
+ buffer->data_size >= object_size)
+ return object_size;
+ else
+ return 0;
+}
+
+/**
+ * binder_validate_ptr() - validates binder_buffer_object in a binder_buffer.
+ * @b: binder_buffer containing the object
+ * @index: index in offset array at which the binder_buffer_object is
+ * located
+ * @start: points to the start of the offset array
+ * @num_valid: the number of valid offsets in the offset array
+ *
+ * Return: If @index is within the valid range of the offset array
+ * described by @start and @num_valid, and if there's a valid
+ * binder_buffer_object at the offset found in index @index
+ * of the offset array, that object is returned. Otherwise,
+ * %NULL is returned.
+ * Note that the offset found in index @index itself is not
+ * verified; this function assumes that @num_valid elements
+ * from @start were previously verified to have valid offsets.
+ */
+static struct binder_buffer_object *binder_validate_ptr(struct binder_buffer *b,
+ binder_size_t index,
+ binder_size_t *start,
+ binder_size_t num_valid)
+{
+ struct binder_buffer_object *buffer_obj;
+ binder_size_t *offp;
+
+ if (index >= num_valid)
+ return NULL;
+
+ offp = start + index;
+ buffer_obj = (struct binder_buffer_object *)(b->data + *offp);
+ if (buffer_obj->hdr.type != BINDER_TYPE_PTR)
+ return NULL;
+
+ return buffer_obj;
+}
+
+/**
+ * binder_validate_fixup() - validates pointer/fd fixups happen in order.
+ * @b: transaction buffer
+ * @objects_start start of objects buffer
+ * @buffer: binder_buffer_object in which to fix up
+ * @offset: start offset in @buffer to fix up
+ * @last_obj: last binder_buffer_object that we fixed up in
+ * @last_min_offset: minimum fixup offset in @last_obj
+ *
+ * Return: %true if a fixup in buffer @buffer at offset @offset is
+ * allowed.
+ *
+ * For safety reasons, we only allow fixups inside a buffer to happen
+ * at increasing offsets; additionally, we only allow fixup on the last
+ * buffer object that was verified, or one of its parents.
+ *
+ * Example of what is allowed:
+ *
+ * A
+ * B (parent = A, offset = 0)
+ * C (parent = A, offset = 16)
+ * D (parent = C, offset = 0)
+ * E (parent = A, offset = 32) // min_offset is 16 (C.parent_offset)
+ *
+ * Examples of what is not allowed:
+ *
+ * Decreasing offsets within the same parent:
+ * A
+ * C (parent = A, offset = 16)
+ * B (parent = A, offset = 0) // decreasing offset within A
+ *
+ * Referring to a parent that wasn't the last object or any of its parents:
+ * A
+ * B (parent = A, offset = 0)
+ * C (parent = A, offset = 0)
+ * C (parent = A, offset = 16)
+ * D (parent = B, offset = 0) // B is not A or any of A's parents
+ */
+static bool binder_validate_fixup(struct binder_buffer *b,
+ binder_size_t *objects_start,
+ struct binder_buffer_object *buffer,
+ binder_size_t fixup_offset,
+ struct binder_buffer_object *last_obj,
+ binder_size_t last_min_offset)
+{
+ if (!last_obj) {
+ /* Nothing to fix up in */
+ return false;
+ }
+
+ while (last_obj != buffer) {
+ /*
+ * Safe to retrieve the parent of last_obj, since it
+ * was already previously verified by the driver.
+ */
+ if ((last_obj->flags & BINDER_BUFFER_FLAG_HAS_PARENT) == 0)
+ return false;
+ last_min_offset = last_obj->parent_offset + sizeof(uintptr_t);
+ last_obj = (struct binder_buffer_object *)
+ (b->data + *(objects_start + last_obj->parent));
+ }
+ return (fixup_offset >= last_min_offset);
+}
+
static void binder_transaction_buffer_release(struct binder_proc *proc,
struct binder_buffer *buffer,
binder_size_t *failed_at)
{
- binder_size_t *offp, *off_end;
+ binder_size_t *offp, *off_start, *off_end;
int debug_id = buffer->debug_id;
binder_debug(BINDER_DEBUG_TRANSACTION,
@@ -1251,28 +1450,30 @@ static void binder_transaction_buffer_release(struct binder_proc *proc,
if (buffer->target_node)
binder_dec_node(buffer->target_node, 1, 0);
- offp = (binder_size_t *)(buffer->data +
- ALIGN(buffer->data_size, sizeof(void *)));
+ off_start = (binder_size_t *)(buffer->data +
+ ALIGN(buffer->data_size, sizeof(void *)));
if (failed_at)
off_end = failed_at;
else
- off_end = (void *)offp + buffer->offsets_size;
- for (; offp < off_end; offp++) {
- struct flat_binder_object *fp;
+ off_end = (void *)off_start + buffer->offsets_size;
+ for (offp = off_start; offp < off_end; offp++) {
+ struct binder_object_header *hdr;
+ size_t object_size = binder_validate_object(buffer, *offp);
- if (*offp > buffer->data_size - sizeof(*fp) ||
- buffer->data_size < sizeof(*fp) ||
- !IS_ALIGNED(*offp, sizeof(u32))) {
- pr_err("transaction release %d bad offset %lld, size %zd\n",
+ if (object_size == 0) {
+ pr_err("transaction release %d bad object at offset %lld, size %zd\n",
debug_id, (u64)*offp, buffer->data_size);
continue;
}
- fp = (struct flat_binder_object *)(buffer->data + *offp);
- switch (fp->type) {
+ hdr = (struct binder_object_header *)(buffer->data + *offp);
+ switch (hdr->type) {
case BINDER_TYPE_BINDER:
case BINDER_TYPE_WEAK_BINDER: {
- struct binder_node *node = binder_get_node(proc, fp->binder);
+ struct flat_binder_object *fp;
+ struct binder_node *node;
+ fp = to_flat_binder_object(hdr);
+ node = binder_get_node(proc, fp->binder);
if (node == NULL) {
pr_err("transaction release %d bad node %016llx\n",
debug_id, (u64)fp->binder);
@@ -1281,15 +1482,17 @@ static void binder_transaction_buffer_release(struct binder_proc *proc,
binder_debug(BINDER_DEBUG_TRANSACTION,
" node %d u%016llx\n",
node->debug_id, (u64)node->ptr);
- binder_dec_node(node, fp->type == BINDER_TYPE_BINDER, 0);
+ binder_dec_node(node, hdr->type == BINDER_TYPE_BINDER,
+ 0);
} break;
case BINDER_TYPE_HANDLE:
case BINDER_TYPE_WEAK_HANDLE: {
+ struct flat_binder_object *fp;
struct binder_ref *ref;
+ fp = to_flat_binder_object(hdr);
ref = binder_get_ref(proc, fp->handle,
- fp->type == BINDER_TYPE_HANDLE);
-
+ hdr->type == BINDER_TYPE_HANDLE);
if (ref == NULL) {
pr_err("transaction release %d bad handle %d\n",
debug_id, fp->handle);
@@ -1298,31 +1501,348 @@ static void binder_transaction_buffer_release(struct binder_proc *proc,
binder_debug(BINDER_DEBUG_TRANSACTION,
" ref %d desc %d (node %d)\n",
ref->debug_id, ref->desc, ref->node->debug_id);
- binder_dec_ref(ref, fp->type == BINDER_TYPE_HANDLE);
+ binder_dec_ref(ref, hdr->type == BINDER_TYPE_HANDLE);
} break;
- case BINDER_TYPE_FD:
+ case BINDER_TYPE_FD: {
+ struct binder_fd_object *fp = to_binder_fd_object(hdr);
+
binder_debug(BINDER_DEBUG_TRANSACTION,
- " fd %d\n", fp->handle);
+ " fd %d\n", fp->fd);
if (failed_at)
- task_close_fd(proc, fp->handle);
+ task_close_fd(proc, fp->fd);
+ } break;
+ case BINDER_TYPE_PTR:
+ /*
+ * Nothing to do here, this will get cleaned up when the
+ * transaction buffer gets freed
+ */
break;
-
+ case BINDER_TYPE_FDA: {
+ struct binder_fd_array_object *fda;
+ struct binder_buffer_object *parent;
+ uintptr_t parent_buffer;
+ u32 *fd_array;
+ size_t fd_index;
+ binder_size_t fd_buf_size;
+
+ fda = to_binder_fd_array_object(hdr);
+ parent = binder_validate_ptr(buffer, fda->parent,
+ off_start,
+ offp - off_start);
+ if (!parent) {
+ pr_err("transaction release %d bad parent offset",
+ debug_id);
+ continue;
+ }
+ /*
+ * Since the parent was already fixed up, convert it
+ * back to kernel address space to access it
+ */
+ parent_buffer = parent->buffer -
+ proc->user_buffer_offset;
+
+ fd_buf_size = sizeof(u32) * fda->num_fds;
+ if (fda->num_fds >= SIZE_MAX / sizeof(u32)) {
+ pr_err("transaction release %d invalid number of fds (%lld)\n",
+ debug_id, (u64)fda->num_fds);
+ continue;
+ }
+ if (fd_buf_size > parent->length ||
+ fda->parent_offset > parent->length - fd_buf_size) {
+ /* No space for all file descriptors here. */
+ pr_err("transaction release %d not enough space for %lld fds in buffer\n",
+ debug_id, (u64)fda->num_fds);
+ continue;
+ }
+ fd_array = (u32 *)(parent_buffer + fda->parent_offset);
+ for (fd_index = 0; fd_index < fda->num_fds; fd_index++)
+ task_close_fd(proc, fd_array[fd_index]);
+ } break;
default:
pr_err("transaction release %d bad object type %x\n",
- debug_id, fp->type);
+ debug_id, hdr->type);
break;
}
}
}
+static int binder_translate_binder(struct flat_binder_object *fp,
+ struct binder_transaction *t,
+ struct binder_thread *thread)
+{
+ struct binder_node *node;
+ struct binder_ref *ref;
+ struct binder_proc *proc = thread->proc;
+ struct binder_proc *target_proc = t->to_proc;
+
+ node = binder_get_node(proc, fp->binder);
+ if (!node) {
+ node = binder_new_node(proc, fp->binder, fp->cookie);
+ if (!node)
+ return -ENOMEM;
+
+ node->min_priority = fp->flags & FLAT_BINDER_FLAG_PRIORITY_MASK;
+ node->accept_fds = !!(fp->flags & FLAT_BINDER_FLAG_ACCEPTS_FDS);
+ }
+ if (fp->cookie != node->cookie) {
+ binder_user_error("%d:%d sending u%016llx node %d, cookie mismatch %016llx != %016llx\n",
+ proc->pid, thread->pid, (u64)fp->binder,
+ node->debug_id, (u64)fp->cookie,
+ (u64)node->cookie);
+ return -EINVAL;
+ }
+ if (security_binder_transfer_binder(proc->tsk, target_proc->tsk))
+ return -EPERM;
+
+ ref = binder_get_ref_for_node(target_proc, node);
+ if (!ref)
+ return -EINVAL;
+
+ if (fp->hdr.type == BINDER_TYPE_BINDER)
+ fp->hdr.type = BINDER_TYPE_HANDLE;
+ else
+ fp->hdr.type = BINDER_TYPE_WEAK_HANDLE;
+ fp->binder = 0;
+ fp->handle = ref->desc;
+ fp->cookie = 0;
+ binder_inc_ref(ref, fp->hdr.type == BINDER_TYPE_HANDLE, &thread->todo);
+
+ trace_binder_transaction_node_to_ref(t, node, ref);
+ binder_debug(BINDER_DEBUG_TRANSACTION,
+ " node %d u%016llx -> ref %d desc %d\n",
+ node->debug_id, (u64)node->ptr,
+ ref->debug_id, ref->desc);
+
+ return 0;
+}
+
+static int binder_translate_handle(struct flat_binder_object *fp,
+ struct binder_transaction *t,
+ struct binder_thread *thread)
+{
+ struct binder_ref *ref;
+ struct binder_proc *proc = thread->proc;
+ struct binder_proc *target_proc = t->to_proc;
+
+ ref = binder_get_ref(proc, fp->handle,
+ fp->hdr.type == BINDER_TYPE_HANDLE);
+ if (!ref) {
+ binder_user_error("%d:%d got transaction with invalid handle, %d\n",
+ proc->pid, thread->pid, fp->handle);
+ return -EINVAL;
+ }
+ if (security_binder_transfer_binder(proc->tsk, target_proc->tsk))
+ return -EPERM;
+
+ if (ref->node->proc == target_proc) {
+ if (fp->hdr.type == BINDER_TYPE_HANDLE)
+ fp->hdr.type = BINDER_TYPE_BINDER;
+ else
+ fp->hdr.type = BINDER_TYPE_WEAK_BINDER;
+ fp->binder = ref->node->ptr;
+ fp->cookie = ref->node->cookie;
+ binder_inc_node(ref->node, fp->hdr.type == BINDER_TYPE_BINDER,
+ 0, NULL);
+ trace_binder_transaction_ref_to_node(t, ref);
+ binder_debug(BINDER_DEBUG_TRANSACTION,
+ " ref %d desc %d -> node %d u%016llx\n",
+ ref->debug_id, ref->desc, ref->node->debug_id,
+ (u64)ref->node->ptr);
+ } else {
+ struct binder_ref *new_ref;
+
+ new_ref = binder_get_ref_for_node(target_proc, ref->node);
+ if (!new_ref)
+ return -EINVAL;
+
+ fp->binder = 0;
+ fp->handle = new_ref->desc;
+ fp->cookie = 0;
+ binder_inc_ref(new_ref, fp->hdr.type == BINDER_TYPE_HANDLE,
+ NULL);
+ trace_binder_transaction_ref_to_ref(t, ref, new_ref);
+ binder_debug(BINDER_DEBUG_TRANSACTION,
+ " ref %d desc %d -> ref %d desc %d (node %d)\n",
+ ref->debug_id, ref->desc, new_ref->debug_id,
+ new_ref->desc, ref->node->debug_id);
+ }
+ return 0;
+}
+
+static int binder_translate_fd(int fd,
+ struct binder_transaction *t,
+ struct binder_thread *thread,
+ struct binder_transaction *in_reply_to)
+{
+ struct binder_proc *proc = thread->proc;
+ struct binder_proc *target_proc = t->to_proc;
+ int target_fd;
+ struct file *file;
+ int ret;
+ bool target_allows_fd;
+
+ if (in_reply_to)
+ target_allows_fd = !!(in_reply_to->flags & TF_ACCEPT_FDS);
+ else
+ target_allows_fd = t->buffer->target_node->accept_fds;
+ if (!target_allows_fd) {
+ binder_user_error("%d:%d got %s with fd, %d, but target does not allow fds\n",
+ proc->pid, thread->pid,
+ in_reply_to ? "reply" : "transaction",
+ fd);
+ ret = -EPERM;
+ goto err_fd_not_accepted;
+ }
+
+ file = fget(fd);
+ if (!file) {
+ binder_user_error("%d:%d got transaction with invalid fd, %d\n",
+ proc->pid, thread->pid, fd);
+ ret = -EBADF;
+ goto err_fget;
+ }
+ ret = security_binder_transfer_file(proc->tsk, target_proc->tsk, file);
+ if (ret < 0) {
+ ret = -EPERM;
+ goto err_security;
+ }
+
+ target_fd = task_get_unused_fd_flags(target_proc, O_CLOEXEC);
+ if (target_fd < 0) {
+ ret = -ENOMEM;
+ goto err_get_unused_fd;
+ }
+ task_fd_install(target_proc, target_fd, file);
+ trace_binder_transaction_fd(t, fd, target_fd);
+ binder_debug(BINDER_DEBUG_TRANSACTION, " fd %d -> %d\n",
+ fd, target_fd);
+
+ return target_fd;
+
+err_get_unused_fd:
+err_security:
+ fput(file);
+err_fget:
+err_fd_not_accepted:
+ return ret;
+}
+
+static int binder_translate_fd_array(struct binder_fd_array_object *fda,
+ struct binder_buffer_object *parent,
+ struct binder_transaction *t,
+ struct binder_thread *thread,
+ struct binder_transaction *in_reply_to)
+{
+ binder_size_t fdi, fd_buf_size, num_installed_fds;
+ int target_fd;
+ uintptr_t parent_buffer;
+ u32 *fd_array;
+ struct binder_proc *proc = thread->proc;
+ struct binder_proc *target_proc = t->to_proc;
+
+ fd_buf_size = sizeof(u32) * fda->num_fds;
+ if (fda->num_fds >= SIZE_MAX / sizeof(u32)) {
+ binder_user_error("%d:%d got transaction with invalid number of fds (%lld)\n",
+ proc->pid, thread->pid, (u64)fda->num_fds);
+ return -EINVAL;
+ }
+ if (fd_buf_size > parent->length ||
+ fda->parent_offset > parent->length - fd_buf_size) {
+ /* No space for all file descriptors here. */
+ binder_user_error("%d:%d not enough space to store %lld fds in buffer\n",
+ proc->pid, thread->pid, (u64)fda->num_fds);
+ return -EINVAL;
+ }
+ /*
+ * Since the parent was already fixed up, convert it
+ * back to the kernel address space to access it
+ */
+ parent_buffer = parent->buffer - target_proc->user_buffer_offset;
+ fd_array = (u32 *)(parent_buffer + fda->parent_offset);
+ if (!IS_ALIGNED((unsigned long)fd_array, sizeof(u32))) {
+ binder_user_error("%d:%d parent offset not aligned correctly.\n",
+ proc->pid, thread->pid);
+ return -EINVAL;
+ }
+ for (fdi = 0; fdi < fda->num_fds; fdi++) {
+ target_fd = binder_translate_fd(fd_array[fdi], t, thread,
+ in_reply_to);
+ if (target_fd < 0)
+ goto err_translate_fd_failed;
+ fd_array[fdi] = target_fd;
+ }
+ return 0;
+
+err_translate_fd_failed:
+ /*
+ * Failed to allocate fd or security error, free fds
+ * installed so far.
+ */
+ num_installed_fds = fdi;
+ for (fdi = 0; fdi < num_installed_fds; fdi++)
+ task_close_fd(target_proc, fd_array[fdi]);
+ return target_fd;
+}
+
+static int binder_fixup_parent(struct binder_transaction *t,
+ struct binder_thread *thread,
+ struct binder_buffer_object *bp,
+ binder_size_t *off_start,
+ binder_size_t num_valid,
+ struct binder_buffer_object *last_fixup_obj,
+ binder_size_t last_fixup_min_off)
+{
+ struct binder_buffer_object *parent;
+ u8 *parent_buffer;
+ struct binder_buffer *b = t->buffer;
+ struct binder_proc *proc = thread->proc;
+ struct binder_proc *target_proc = t->to_proc;
+
+ if (!(bp->flags & BINDER_BUFFER_FLAG_HAS_PARENT))
+ return 0;
+
+ parent = binder_validate_ptr(b, bp->parent, off_start, num_valid);
+ if (!parent) {
+ binder_user_error("%d:%d got transaction with invalid parent offset or type\n",
+ proc->pid, thread->pid);
+ return -EINVAL;
+ }
+
+ if (!binder_validate_fixup(b, off_start,
+ parent, bp->parent_offset,
+ last_fixup_obj,
+ last_fixup_min_off)) {
+ binder_user_error("%d:%d got transaction with out-of-order buffer fixup\n",
+ proc->pid, thread->pid);
+ return -EINVAL;
+ }
+
+ if (parent->length < sizeof(binder_uintptr_t) ||
+ bp->parent_offset > parent->length - sizeof(binder_uintptr_t)) {
+ /* No space for a pointer here! */
+ binder_user_error("%d:%d got transaction with invalid parent offset\n",
+ proc->pid, thread->pid);
+ return -EINVAL;
+ }
+ parent_buffer = (u8 *)(parent->buffer -
+ target_proc->user_buffer_offset);
+ *(binder_uintptr_t *)(parent_buffer + bp->parent_offset) = bp->buffer;
+
+ return 0;
+}
+
static void binder_transaction(struct binder_proc *proc,
struct binder_thread *thread,
- struct binder_transaction_data *tr, int reply)
+ struct binder_transaction_data *tr, int reply,
+ binder_size_t extra_buffers_size)
{
+ int ret;
struct binder_transaction *t;
struct binder_work *tcomplete;
- binder_size_t *offp, *off_end;
+ binder_size_t *offp, *off_end, *off_start;
+ binder_size_t off_min;
+ u8 *sg_bufp, *sg_buf_end;
struct binder_proc *target_proc;
struct binder_thread *target_thread = NULL;
struct binder_node *target_node = NULL;
@@ -1331,14 +1851,18 @@ static void binder_transaction(struct binder_proc *proc,
struct binder_transaction *in_reply_to = NULL;
struct binder_transaction_log_entry *e;
uint32_t return_error;
+ struct binder_buffer_object *last_fixup_obj = NULL;
+ binder_size_t last_fixup_min_off = 0;
+ struct binder_context *context = proc->context;
- e = binder_transaction_log_add(&binder_transaction_log);
+ e = binder_transaction_log_add(&context->transaction_log);
e->call_type = reply ? 2 : !!(tr->flags & TF_ONE_WAY);
e->from_proc = proc->pid;
e->from_thread = thread->pid;
e->target_handle = tr->target.handle;
e->data_size = tr->data_size;
e->offsets_size = tr->offsets_size;
+ e->context_name = proc->context->name;
if (reply) {
in_reply_to = thread->transaction_stack;
@@ -1391,7 +1915,7 @@ static void binder_transaction(struct binder_proc *proc,
}
target_node = ref->node;
} else {
- target_node = binder_context_mgr_node;
+ target_node = context->binder_context_mgr_node;
if (target_node == NULL) {
return_error = BR_DEAD_REPLY;
goto err_no_context_mgr_node;
@@ -1403,6 +1927,10 @@ static void binder_transaction(struct binder_proc *proc,
return_error = BR_DEAD_REPLY;
goto err_dead_binder;
}
+ if (security_binder_transaction(proc->tsk, target_proc->tsk) < 0) {
+ return_error = BR_FAILED_REPLY;
+ goto err_invalid_target_handle;
+ }
if (!(tr->flags & TF_ONE_WAY) && thread->transaction_stack) {
struct binder_transaction *tmp;
@@ -1448,25 +1976,27 @@ static void binder_transaction(struct binder_proc *proc,
}
binder_stats_created(BINDER_STAT_TRANSACTION_COMPLETE);
- t->debug_id = ++binder_last_id;
+ t->debug_id = atomic_inc_return(&binder_last_id);
e->debug_id = t->debug_id;
if (reply)
binder_debug(BINDER_DEBUG_TRANSACTION,
- "%d:%d BC_REPLY %d -> %d:%d, data %016llx-%016llx size %lld-%lld\n",
+ "%d:%d BC_REPLY %d -> %d:%d, data %016llx-%016llx size %lld-%lld-%lld\n",
proc->pid, thread->pid, t->debug_id,
target_proc->pid, target_thread->pid,
(u64)tr->data.ptr.buffer,
(u64)tr->data.ptr.offsets,
- (u64)tr->data_size, (u64)tr->offsets_size);
+ (u64)tr->data_size, (u64)tr->offsets_size,
+ (u64)extra_buffers_size);
else
binder_debug(BINDER_DEBUG_TRANSACTION,
- "%d:%d BC_TRANSACTION %d -> %d - node %d, data %016llx-%016llx size %lld-%lld\n",
+ "%d:%d BC_TRANSACTION %d -> %d - node %d, data %016llx-%016llx size %lld-%lld-%lld\n",
proc->pid, thread->pid, t->debug_id,
target_proc->pid, target_node->debug_id,
(u64)tr->data.ptr.buffer,
(u64)tr->data.ptr.offsets,
- (u64)tr->data_size, (u64)tr->offsets_size);
+ (u64)tr->data_size, (u64)tr->offsets_size,
+ (u64)extra_buffers_size);
if (!reply && !(tr->flags & TF_ONE_WAY))
t->from = thread;
@@ -1482,7 +2012,8 @@ static void binder_transaction(struct binder_proc *proc,
trace_binder_transaction(reply, t, target_node);
t->buffer = binder_alloc_buf(target_proc, tr->data_size,
- tr->offsets_size, !reply && (t->flags & TF_ONE_WAY));
+ tr->offsets_size, extra_buffers_size,
+ !reply && (t->flags & TF_ONE_WAY));
if (t->buffer == NULL) {
return_error = BR_FAILED_REPLY;
goto err_binder_alloc_buf_failed;
@@ -1495,8 +2026,9 @@ static void binder_transaction(struct binder_proc *proc,
if (target_node)
binder_inc_node(target_node, 1, 0, NULL);
- offp = (binder_size_t *)(t->buffer->data +
- ALIGN(tr->data_size, sizeof(void *)));
+ off_start = (binder_size_t *)(t->buffer->data +
+ ALIGN(tr->data_size, sizeof(void *)));
+ offp = off_start;
if (copy_from_user(t->buffer->data, (const void __user *)(uintptr_t)
tr->data.ptr.buffer, tr->data_size)) {
@@ -1518,154 +2050,138 @@ static void binder_transaction(struct binder_proc *proc,
return_error = BR_FAILED_REPLY;
goto err_bad_offset;
}
- off_end = (void *)offp + tr->offsets_size;
+ if (!IS_ALIGNED(extra_buffers_size, sizeof(u64))) {
+ binder_user_error("%d:%d got transaction with unaligned buffers size, %lld\n",
+ proc->pid, thread->pid,
+ (u64)extra_buffers_size);
+ return_error = BR_FAILED_REPLY;
+ goto err_bad_offset;
+ }
+ off_end = (void *)off_start + tr->offsets_size;
+ sg_bufp = (u8 *)(PTR_ALIGN(off_end, sizeof(void *)));
+ sg_buf_end = sg_bufp + extra_buffers_size;
+ off_min = 0;
for (; offp < off_end; offp++) {
- struct flat_binder_object *fp;
-
- if (*offp > t->buffer->data_size - sizeof(*fp) ||
- t->buffer->data_size < sizeof(*fp) ||
- !IS_ALIGNED(*offp, sizeof(u32))) {
- binder_user_error("%d:%d got transaction with invalid offset, %lld\n",
- proc->pid, thread->pid, (u64)*offp);
+ struct binder_object_header *hdr;
+ size_t object_size = binder_validate_object(t->buffer, *offp);
+
+ if (object_size == 0 || *offp < off_min) {
+ binder_user_error("%d:%d got transaction with invalid offset (%lld, min %lld max %lld) or object.\n",
+ proc->pid, thread->pid, (u64)*offp,
+ (u64)off_min,
+ (u64)t->buffer->data_size);
return_error = BR_FAILED_REPLY;
goto err_bad_offset;
}
- fp = (struct flat_binder_object *)(t->buffer->data + *offp);
- switch (fp->type) {
+
+ hdr = (struct binder_object_header *)(t->buffer->data + *offp);
+ off_min = *offp + object_size;
+ switch (hdr->type) {
case BINDER_TYPE_BINDER:
case BINDER_TYPE_WEAK_BINDER: {
- struct binder_ref *ref;
- struct binder_node *node = binder_get_node(proc, fp->binder);
+ struct flat_binder_object *fp;
- if (node == NULL) {
- node = binder_new_node(proc, fp->binder, fp->cookie);
- if (node == NULL) {
- return_error = BR_FAILED_REPLY;
- goto err_binder_new_node_failed;
- }
- node->min_priority = fp->flags & FLAT_BINDER_FLAG_PRIORITY_MASK;
- node->accept_fds = !!(fp->flags & FLAT_BINDER_FLAG_ACCEPTS_FDS);
- }
- if (fp->cookie != node->cookie) {
- binder_user_error("%d:%d sending u%016llx node %d, cookie mismatch %016llx != %016llx\n",
- proc->pid, thread->pid,
- (u64)fp->binder, node->debug_id,
- (u64)fp->cookie, (u64)node->cookie);
+ fp = to_flat_binder_object(hdr);
+ ret = binder_translate_binder(fp, t, thread);
+ if (ret < 0) {
return_error = BR_FAILED_REPLY;
- goto err_binder_get_ref_for_node_failed;
+ goto err_translate_failed;
}
- ref = binder_get_ref_for_node(target_proc, node);
- if (ref == NULL) {
- return_error = BR_FAILED_REPLY;
- goto err_binder_get_ref_for_node_failed;
- }
- if (fp->type == BINDER_TYPE_BINDER)
- fp->type = BINDER_TYPE_HANDLE;
- else
- fp->type = BINDER_TYPE_WEAK_HANDLE;
- fp->binder = 0;
- fp->handle = ref->desc;
- fp->cookie = 0;
- binder_inc_ref(ref, fp->type == BINDER_TYPE_HANDLE,
- &thread->todo);
-
- trace_binder_transaction_node_to_ref(t, node, ref);
- binder_debug(BINDER_DEBUG_TRANSACTION,
- " node %d u%016llx -> ref %d desc %d\n",
- node->debug_id, (u64)node->ptr,
- ref->debug_id, ref->desc);
} break;
case BINDER_TYPE_HANDLE:
case BINDER_TYPE_WEAK_HANDLE: {
- struct binder_ref *ref;
+ struct flat_binder_object *fp;
- ref = binder_get_ref(proc, fp->handle,
- fp->type == BINDER_TYPE_HANDLE);
-
- if (ref == NULL) {
- binder_user_error("%d:%d got transaction with invalid handle, %d\n",
- proc->pid,
- thread->pid, fp->handle);
+ fp = to_flat_binder_object(hdr);
+ ret = binder_translate_handle(fp, t, thread);
+ if (ret < 0) {
return_error = BR_FAILED_REPLY;
- goto err_binder_get_ref_failed;
- }
- if (ref->node->proc == target_proc) {
- if (fp->type == BINDER_TYPE_HANDLE)
- fp->type = BINDER_TYPE_BINDER;
- else
- fp->type = BINDER_TYPE_WEAK_BINDER;
- fp->binder = ref->node->ptr;
- fp->cookie = ref->node->cookie;
- binder_inc_node(ref->node, fp->type == BINDER_TYPE_BINDER, 0, NULL);
- trace_binder_transaction_ref_to_node(t, ref);
- binder_debug(BINDER_DEBUG_TRANSACTION,
- " ref %d desc %d -> node %d u%016llx\n",
- ref->debug_id, ref->desc, ref->node->debug_id,
- (u64)ref->node->ptr);
- } else {
- struct binder_ref *new_ref;
-
- new_ref = binder_get_ref_for_node(target_proc, ref->node);
- if (new_ref == NULL) {
- return_error = BR_FAILED_REPLY;
- goto err_binder_get_ref_for_node_failed;
- }
- fp->binder = 0;
- fp->handle = new_ref->desc;
- fp->cookie = 0;
- binder_inc_ref(new_ref, fp->type == BINDER_TYPE_HANDLE, NULL);
- trace_binder_transaction_ref_to_ref(t, ref,
- new_ref);
- binder_debug(BINDER_DEBUG_TRANSACTION,
- " ref %d desc %d -> ref %d desc %d (node %d)\n",
- ref->debug_id, ref->desc, new_ref->debug_id,
- new_ref->desc, ref->node->debug_id);
+ goto err_translate_failed;
}
} break;
case BINDER_TYPE_FD: {
- int target_fd;
- struct file *file;
-
- if (reply) {
- if (!(in_reply_to->flags & TF_ACCEPT_FDS)) {
- binder_user_error("%d:%d got reply with fd, %d, but target does not allow fds\n",
- proc->pid, thread->pid, fp->handle);
- return_error = BR_FAILED_REPLY;
- goto err_fd_not_allowed;
- }
- } else if (!target_node->accept_fds) {
- binder_user_error("%d:%d got transaction with fd, %d, but target does not allow fds\n",
- proc->pid, thread->pid, fp->handle);
+ struct binder_fd_object *fp = to_binder_fd_object(hdr);
+ int target_fd = binder_translate_fd(fp->fd, t, thread,
+ in_reply_to);
+
+ if (target_fd < 0) {
return_error = BR_FAILED_REPLY;
- goto err_fd_not_allowed;
+ goto err_translate_failed;
}
-
- file = fget(fp->handle);
- if (file == NULL) {
- binder_user_error("%d:%d got transaction with invalid fd, %d\n",
- proc->pid, thread->pid, fp->handle);
+ fp->pad_binder = 0;
+ fp->fd = target_fd;
+ } break;
+ case BINDER_TYPE_FDA: {
+ struct binder_fd_array_object *fda =
+ to_binder_fd_array_object(hdr);
+ struct binder_buffer_object *parent =
+ binder_validate_ptr(t->buffer, fda->parent,
+ off_start,
+ offp - off_start);
+ if (!parent) {
+ binder_user_error("%d:%d got transaction with invalid parent offset or type\n",
+ proc->pid, thread->pid);
return_error = BR_FAILED_REPLY;
- goto err_fget_failed;
+ goto err_bad_parent;
}
- target_fd = task_get_unused_fd_flags(target_proc, O_CLOEXEC);
- if (target_fd < 0) {
- fput(file);
+ if (!binder_validate_fixup(t->buffer, off_start,
+ parent, fda->parent_offset,
+ last_fixup_obj,
+ last_fixup_min_off)) {
+ binder_user_error("%d:%d got transaction with out-of-order buffer fixup\n",
+ proc->pid, thread->pid);
return_error = BR_FAILED_REPLY;
- goto err_get_unused_fd_failed;
+ goto err_bad_parent;
}
- task_fd_install(target_proc, target_fd, file);
- trace_binder_transaction_fd(t, fp->handle, target_fd);
- binder_debug(BINDER_DEBUG_TRANSACTION,
- " fd %d -> %d\n", fp->handle, target_fd);
- /* TODO: fput? */
- fp->binder = 0;
- fp->handle = target_fd;
+ ret = binder_translate_fd_array(fda, parent, t, thread,
+ in_reply_to);
+ if (ret < 0) {
+ return_error = BR_FAILED_REPLY;
+ goto err_translate_failed;
+ }
+ last_fixup_obj = parent;
+ last_fixup_min_off =
+ fda->parent_offset + sizeof(u32) * fda->num_fds;
+ } break;
+ case BINDER_TYPE_PTR: {
+ struct binder_buffer_object *bp =
+ to_binder_buffer_object(hdr);
+ size_t buf_left = sg_buf_end - sg_bufp;
+
+ if (bp->length > buf_left) {
+ binder_user_error("%d:%d got transaction with too large buffer\n",
+ proc->pid, thread->pid);
+ return_error = BR_FAILED_REPLY;
+ goto err_bad_offset;
+ }
+ if (copy_from_user(sg_bufp,
+ (const void __user *)(uintptr_t)
+ bp->buffer, bp->length)) {
+ binder_user_error("%d:%d got transaction with invalid offsets ptr\n",
+ proc->pid, thread->pid);
+ return_error = BR_FAILED_REPLY;
+ goto err_copy_data_failed;
+ }
+ /* Fixup buffer pointer to target proc address space */
+ bp->buffer = (uintptr_t)sg_bufp +
+ target_proc->user_buffer_offset;
+ sg_bufp += ALIGN(bp->length, sizeof(u64));
+
+ ret = binder_fixup_parent(t, thread, bp, off_start,
+ offp - off_start,
+ last_fixup_obj,
+ last_fixup_min_off);
+ if (ret < 0) {
+ return_error = BR_FAILED_REPLY;
+ goto err_translate_failed;
+ }
+ last_fixup_obj = bp;
+ last_fixup_min_off = 0;
} break;
-
default:
binder_user_error("%d:%d got transaction with invalid object type, %x\n",
- proc->pid, thread->pid, fp->type);
+ proc->pid, thread->pid, hdr->type);
return_error = BR_FAILED_REPLY;
goto err_bad_object_type;
}
@@ -1695,14 +2211,10 @@ static void binder_transaction(struct binder_proc *proc,
wake_up_interruptible(target_wait);
return;
-err_get_unused_fd_failed:
-err_fget_failed:
-err_fd_not_allowed:
-err_binder_get_ref_for_node_failed:
-err_binder_get_ref_failed:
-err_binder_new_node_failed:
+err_translate_failed:
err_bad_object_type:
err_bad_offset:
+err_bad_parent:
err_copy_data_failed:
trace_binder_transaction_failed_buffer_release(t->buffer);
binder_transaction_buffer_release(target_proc, t->buffer, offp);
@@ -1728,7 +2240,8 @@ err_no_context_mgr_node:
{
struct binder_transaction_log_entry *fe;
- fe = binder_transaction_log_add(&binder_transaction_log_failed);
+ fe = binder_transaction_log_add(
+ &context->transaction_log_failed);
*fe = *e;
}
@@ -1746,6 +2259,7 @@ static int binder_thread_write(struct binder_proc *proc,
binder_size_t *consumed)
{
uint32_t cmd;
+ struct binder_context *context = proc->context;
void __user *buffer = (void __user *)(uintptr_t)binder_buffer;
void __user *ptr = buffer + *consumed;
void __user *end = buffer + size;
@@ -1755,8 +2269,8 @@ static int binder_thread_write(struct binder_proc *proc,
return -EFAULT;
ptr += sizeof(uint32_t);
trace_binder_command(cmd);
- if (_IOC_NR(cmd) < ARRAY_SIZE(binder_stats.bc)) {
- binder_stats.bc[_IOC_NR(cmd)]++;
+ if (_IOC_NR(cmd) < ARRAY_SIZE(context->binder_stats.bc)) {
+ context->binder_stats.bc[_IOC_NR(cmd)]++;
proc->stats.bc[_IOC_NR(cmd)]++;
thread->stats.bc[_IOC_NR(cmd)]++;
}
@@ -1772,10 +2286,10 @@ static int binder_thread_write(struct binder_proc *proc,
if (get_user(target, (uint32_t __user *)ptr))
return -EFAULT;
ptr += sizeof(uint32_t);
- if (target == 0 && binder_context_mgr_node &&
+ if (target == 0 && context->binder_context_mgr_node &&
(cmd == BC_INCREFS || cmd == BC_ACQUIRE)) {
ref = binder_get_ref_for_node(proc,
- binder_context_mgr_node);
+ context->binder_context_mgr_node);
if (ref->desc != target) {
binder_user_error("%d:%d tried to acquire reference to desc 0, got %d instead\n",
proc->pid, thread->pid,
@@ -1920,6 +2434,17 @@ static int binder_thread_write(struct binder_proc *proc,
break;
}
+ case BC_TRANSACTION_SG:
+ case BC_REPLY_SG: {
+ struct binder_transaction_data_sg tr;
+
+ if (copy_from_user(&tr, ptr, sizeof(tr)))
+ return -EFAULT;
+ ptr += sizeof(tr);
+ binder_transaction(proc, thread, &tr.transaction_data,
+ cmd == BC_REPLY_SG, tr.buffers_size);
+ break;
+ }
case BC_TRANSACTION:
case BC_REPLY: {
struct binder_transaction_data tr;
@@ -1927,7 +2452,8 @@ static int binder_thread_write(struct binder_proc *proc,
if (copy_from_user(&tr, ptr, sizeof(tr)))
return -EFAULT;
ptr += sizeof(tr);
- binder_transaction(proc, thread, &tr, cmd == BC_REPLY);
+ binder_transaction(proc, thread, &tr,
+ cmd == BC_REPLY, 0);
break;
}
@@ -2064,7 +2590,7 @@ static int binder_thread_write(struct binder_proc *proc,
if (get_user(cookie, (binder_uintptr_t __user *)ptr))
return -EFAULT;
- ptr += sizeof(void *);
+ ptr += sizeof(cookie);
list_for_each_entry(w, &proc->delivered_death, entry) {
struct binder_ref_death *tmp_death = container_of(w, struct binder_ref_death, work);
@@ -2109,8 +2635,8 @@ static void binder_stat_br(struct binder_proc *proc,
struct binder_thread *thread, uint32_t cmd)
{
trace_binder_return(cmd);
- if (_IOC_NR(cmd) < ARRAY_SIZE(binder_stats.br)) {
- binder_stats.br[_IOC_NR(cmd)]++;
+ if (_IOC_NR(cmd) < ARRAY_SIZE(proc->stats.br)) {
+ proc->context->binder_stats.br[_IOC_NR(cmd)]++;
proc->stats.br[_IOC_NR(cmd)]++;
thread->stats.br[_IOC_NR(cmd)]++;
}
@@ -2174,7 +2700,7 @@ retry:
if (wait_for_proc_work)
proc->ready_threads++;
- binder_unlock(__func__);
+ binder_unlock(proc->context, __func__);
trace_binder_wait_for_work(wait_for_proc_work,
!!thread->transaction_stack,
@@ -2201,7 +2727,7 @@ retry:
ret = wait_event_freezable(thread->wait, binder_has_thread_work(thread));
}
- binder_lock(__func__);
+ binder_lock(proc->context, __func__);
if (wait_for_proc_work)
proc->ready_threads--;
@@ -2588,14 +3114,14 @@ static unsigned int binder_poll(struct file *filp,
struct binder_thread *thread = NULL;
int wait_for_proc_work;
- binder_lock(__func__);
+ binder_lock(proc->context, __func__);
thread = binder_get_thread(proc);
wait_for_proc_work = thread->transaction_stack == NULL &&
list_empty(&thread->todo) && thread->return_error == BR_OK;
- binder_unlock(__func__);
+ binder_unlock(proc->context, __func__);
if (wait_for_proc_work) {
if (binder_has_proc_work(proc, thread))
@@ -2681,34 +3207,36 @@ static int binder_ioctl_set_ctx_mgr(struct file *filp)
{
int ret = 0;
struct binder_proc *proc = filp->private_data;
+ struct binder_context *context = proc->context;
+
kuid_t curr_euid = current_euid();
- if (binder_context_mgr_node != NULL) {
+ if (context->binder_context_mgr_node) {
pr_err("BINDER_SET_CONTEXT_MGR already set\n");
ret = -EBUSY;
goto out;
}
- if (uid_valid(binder_context_mgr_uid)) {
- if (!uid_eq(binder_context_mgr_uid, curr_euid)) {
+ if (uid_valid(context->binder_context_mgr_uid)) {
+ if (!uid_eq(context->binder_context_mgr_uid, curr_euid)) {
pr_err("BINDER_SET_CONTEXT_MGR bad uid %d != %d\n",
from_kuid(&init_user_ns, curr_euid),
from_kuid(&init_user_ns,
- binder_context_mgr_uid));
+ context->binder_context_mgr_uid));
ret = -EPERM;
goto out;
}
} else {
- binder_context_mgr_uid = curr_euid;
+ context->binder_context_mgr_uid = curr_euid;
}
- binder_context_mgr_node = binder_new_node(proc, 0, 0);
- if (binder_context_mgr_node == NULL) {
+ context->binder_context_mgr_node = binder_new_node(proc, 0, 0);
+ if (!context->binder_context_mgr_node) {
ret = -ENOMEM;
goto out;
}
- binder_context_mgr_node->local_weak_refs++;
- binder_context_mgr_node->local_strong_refs++;
- binder_context_mgr_node->has_strong_ref = 1;
- binder_context_mgr_node->has_weak_ref = 1;
+ context->binder_context_mgr_node->local_weak_refs++;
+ context->binder_context_mgr_node->local_strong_refs++;
+ context->binder_context_mgr_node->has_strong_ref = 1;
+ context->binder_context_mgr_node->has_weak_ref = 1;
out:
return ret;
}
@@ -2717,6 +3245,7 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
int ret;
struct binder_proc *proc = filp->private_data;
+ struct binder_context *context = proc->context;
struct binder_thread *thread;
unsigned int size = _IOC_SIZE(cmd);
void __user *ubuf = (void __user *)arg;
@@ -2730,7 +3259,7 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (ret)
goto err_unlocked;
- binder_lock(__func__);
+ binder_lock(context, __func__);
thread = binder_get_thread(proc);
if (thread == NULL) {
ret = -ENOMEM;
@@ -2753,6 +3282,9 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
ret = binder_ioctl_set_ctx_mgr(filp);
if (ret)
goto err;
+ ret = security_binder_set_context_mgr(proc->tsk);
+ if (ret < 0)
+ goto err;
break;
case BINDER_THREAD_EXIT:
binder_debug(BINDER_DEBUG_THREADS, "%d:%d exit\n",
@@ -2782,7 +3314,7 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
err:
if (thread)
thread->looper &= ~BINDER_LOOPER_STATE_NEED_RETURN;
- binder_unlock(__func__);
+ binder_unlock(context, __func__);
wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2);
if (ret && ret != -ERESTARTSYS)
pr_info("%d:%d ioctl %x %lx returned %d\n", proc->pid, current->pid, cmd, arg, ret);
@@ -2835,7 +3367,7 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma)
const char *failure_string;
struct binder_buffer *buffer;
- if (proc->tsk != current)
+ if (proc->tsk != current->group_leader)
return -EINVAL;
if ((vma->vm_end - vma->vm_start) > SZ_4M)
@@ -2854,7 +3386,7 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma)
}
vma->vm_flags = (vma->vm_flags | VM_DONTCOPY) & ~VM_MAYWRITE;
- mutex_lock(&binder_mmap_lock);
+ mutex_lock(&proc->context->binder_mmap_lock);
if (proc->buffer) {
ret = -EBUSY;
failure_string = "already mapped";
@@ -2869,7 +3401,7 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma)
}
proc->buffer = area->addr;
proc->user_buffer_offset = vma->vm_start - (uintptr_t)proc->buffer;
- mutex_unlock(&binder_mmap_lock);
+ mutex_unlock(&proc->context->binder_mmap_lock);
#ifdef CONFIG_CPU_CACHE_VIPT
if (cache_is_vipt_aliasing()) {
@@ -2914,12 +3446,12 @@ err_alloc_small_buf_failed:
kfree(proc->pages);
proc->pages = NULL;
err_alloc_pages_failed:
- mutex_lock(&binder_mmap_lock);
+ mutex_lock(&proc->context->binder_mmap_lock);
vfree(proc->buffer);
proc->buffer = NULL;
err_get_vm_area_failed:
err_already_mapped:
- mutex_unlock(&binder_mmap_lock);
+ mutex_unlock(&proc->context->binder_mmap_lock);
err_bad_arg:
pr_err("binder_mmap: %d %lx-%lx %s failed %d\n",
proc->pid, vma->vm_start, vma->vm_end, failure_string, ret);
@@ -2929,6 +3461,7 @@ err_bad_arg:
static int binder_open(struct inode *nodp, struct file *filp)
{
struct binder_proc *proc;
+ struct binder_device *binder_dev;
binder_debug(BINDER_DEBUG_OPEN_CLOSE, "binder_open: %d:%d\n",
current->group_leader->pid, current->pid);
@@ -2936,28 +3469,40 @@ static int binder_open(struct inode *nodp, struct file *filp)
proc = kzalloc(sizeof(*proc), GFP_KERNEL);
if (proc == NULL)
return -ENOMEM;
- get_task_struct(current);
- proc->tsk = current;
+ get_task_struct(current->group_leader);
+ proc->tsk = current->group_leader;
INIT_LIST_HEAD(&proc->todo);
init_waitqueue_head(&proc->wait);
proc->default_priority = task_nice(current);
+ binder_dev = container_of(filp->private_data, struct binder_device,
+ miscdev);
+ proc->context = &binder_dev->context;
- binder_lock(__func__);
+ binder_lock(proc->context, __func__);
binder_stats_created(BINDER_STAT_PROC);
- hlist_add_head(&proc->proc_node, &binder_procs);
+ hlist_add_head(&proc->proc_node, &proc->context->binder_procs);
proc->pid = current->group_leader->pid;
INIT_LIST_HEAD(&proc->delivered_death);
filp->private_data = proc;
- binder_unlock(__func__);
+ binder_unlock(proc->context, __func__);
if (binder_debugfs_dir_entry_proc) {
char strbuf[11];
snprintf(strbuf, sizeof(strbuf), "%u", proc->pid);
+ /*
+ * proc debug entries are shared between contexts, so
+ * this will fail if the process tries to open the driver
+ * again with a different context. The priting code will
+ * anyway print all contexts that a given PID has, so this
+ * is not a problem.
+ */
proc->debugfs_entry = debugfs_create_file(strbuf, S_IRUGO,
- binder_debugfs_dir_entry_proc, proc, &binder_proc_fops);
+ binder_debugfs_dir_entry_proc,
+ (void *)(unsigned long)proc->pid,
+ &binder_proc_fops);
}
return 0;
@@ -3006,6 +3551,7 @@ static int binder_release(struct inode *nodp, struct file *filp)
static int binder_node_release(struct binder_node *node, int refs)
{
struct binder_ref *ref;
+ struct binder_context *context = node->proc->context;
int death = 0;
list_del_init(&node->work.entry);
@@ -3021,7 +3567,7 @@ static int binder_node_release(struct binder_node *node, int refs)
node->proc = NULL;
node->local_strong_refs = 0;
node->local_weak_refs = 0;
- hlist_add_head(&node->dead_node, &binder_dead_nodes);
+ hlist_add_head(&node->dead_node, &context->binder_dead_nodes);
hlist_for_each_entry(ref, &node->refs, node_entry) {
refs++;
@@ -3050,6 +3596,7 @@ static int binder_node_release(struct binder_node *node, int refs)
static void binder_deferred_release(struct binder_proc *proc)
{
struct binder_transaction *t;
+ struct binder_context *context = proc->context;
struct rb_node *n;
int threads, nodes, incoming_refs, outgoing_refs, buffers,
active_transactions, page_count;
@@ -3059,11 +3606,12 @@ static void binder_deferred_release(struct binder_proc *proc)
hlist_del(&proc->proc_node);
- if (binder_context_mgr_node && binder_context_mgr_node->proc == proc) {
+ if (context->binder_context_mgr_node &&
+ context->binder_context_mgr_node->proc == proc) {
binder_debug(BINDER_DEBUG_DEAD_BINDER,
"%s: %d context_mgr_node gone\n",
__func__, proc->pid);
- binder_context_mgr_node = NULL;
+ context->binder_context_mgr_node = NULL;
}
threads = 0;
@@ -3084,7 +3632,8 @@ static void binder_deferred_release(struct binder_proc *proc)
node = rb_entry(n, struct binder_node, rb_node);
nodes++;
rb_erase(&node->rb_node, &proc->nodes);
- incoming_refs = binder_node_release(node, incoming_refs);
+ incoming_refs = binder_node_release(node,
+ incoming_refs);
}
outgoing_refs = 0;
@@ -3156,14 +3705,16 @@ static void binder_deferred_func(struct work_struct *work)
{
struct binder_proc *proc;
struct files_struct *files;
+ struct binder_context *context =
+ container_of(work, struct binder_context, deferred_work);
int defer;
do {
- binder_lock(__func__);
- mutex_lock(&binder_deferred_lock);
- if (!hlist_empty(&binder_deferred_list)) {
- proc = hlist_entry(binder_deferred_list.first,
+ binder_lock(context, __func__);
+ mutex_lock(&context->binder_deferred_lock);
+ if (!hlist_empty(&context->binder_deferred_list)) {
+ proc = hlist_entry(context->binder_deferred_list.first,
struct binder_proc, deferred_work_node);
hlist_del_init(&proc->deferred_work_node);
defer = proc->deferred_work;
@@ -3172,7 +3723,7 @@ static void binder_deferred_func(struct work_struct *work)
proc = NULL;
defer = 0;
}
- mutex_unlock(&binder_deferred_lock);
+ mutex_unlock(&context->binder_deferred_lock);
files = NULL;
if (defer & BINDER_DEFERRED_PUT_FILES) {
@@ -3187,24 +3738,24 @@ static void binder_deferred_func(struct work_struct *work)
if (defer & BINDER_DEFERRED_RELEASE)
binder_deferred_release(proc); /* frees proc */
- binder_unlock(__func__);
+ binder_unlock(context, __func__);
if (files)
put_files_struct(files);
} while (proc);
}
-static DECLARE_WORK(binder_deferred_work, binder_deferred_func);
static void
binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer)
{
- mutex_lock(&binder_deferred_lock);
+ mutex_lock(&proc->context->binder_deferred_lock);
proc->deferred_work |= defer;
if (hlist_unhashed(&proc->deferred_work_node)) {
hlist_add_head(&proc->deferred_work_node,
- &binder_deferred_list);
- queue_work(binder_deferred_workqueue, &binder_deferred_work);
+ &proc->context->binder_deferred_list);
+ queue_work(proc->context->binder_deferred_workqueue,
+ &proc->context->deferred_work);
}
- mutex_unlock(&binder_deferred_lock);
+ mutex_unlock(&proc->context->binder_deferred_lock);
}
static void print_binder_transaction(struct seq_file *m, const char *prefix,
@@ -3350,6 +3901,7 @@ static void print_binder_proc(struct seq_file *m,
size_t header_pos;
seq_printf(m, "proc %d\n", proc->pid);
+ seq_printf(m, "context %s\n", proc->context->name);
header_pos = m->count;
for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n))
@@ -3419,7 +3971,9 @@ static const char * const binder_command_strings[] = {
"BC_EXIT_LOOPER",
"BC_REQUEST_DEATH_NOTIFICATION",
"BC_CLEAR_DEATH_NOTIFICATION",
- "BC_DEAD_BINDER_DONE"
+ "BC_DEAD_BINDER_DONE",
+ "BC_TRANSACTION_SG",
+ "BC_REPLY_SG",
};
static const char * const binder_objstat_strings[] = {
@@ -3432,8 +3986,20 @@ static const char * const binder_objstat_strings[] = {
"transaction_complete"
};
+static void add_binder_stats(struct binder_stats *from, struct binder_stats *to)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(to->bc); i++)
+ to->bc[i] += from->bc[i];
+
+ for (i = 0; i < ARRAY_SIZE(to->br); i++)
+ to->br[i] += from->br[i];
+}
+
static void print_binder_stats(struct seq_file *m, const char *prefix,
- struct binder_stats *stats)
+ struct binder_stats *stats,
+ struct binder_obj_stats *obj_stats)
{
int i;
@@ -3453,16 +4019,21 @@ static void print_binder_stats(struct seq_file *m, const char *prefix,
binder_return_strings[i], stats->br[i]);
}
- BUILD_BUG_ON(ARRAY_SIZE(stats->obj_created) !=
+ if (!obj_stats)
+ return;
+
+ BUILD_BUG_ON(ARRAY_SIZE(obj_stats->obj_created) !=
ARRAY_SIZE(binder_objstat_strings));
- BUILD_BUG_ON(ARRAY_SIZE(stats->obj_created) !=
- ARRAY_SIZE(stats->obj_deleted));
- for (i = 0; i < ARRAY_SIZE(stats->obj_created); i++) {
- if (stats->obj_created[i] || stats->obj_deleted[i])
+ BUILD_BUG_ON(ARRAY_SIZE(obj_stats->obj_created) !=
+ ARRAY_SIZE(obj_stats->obj_deleted));
+ for (i = 0; i < ARRAY_SIZE(obj_stats->obj_created); i++) {
+ int obj_created = atomic_read(&obj_stats->obj_created[i]);
+ int obj_deleted = atomic_read(&obj_stats->obj_deleted[i]);
+
+ if (obj_created || obj_deleted)
seq_printf(m, "%s%s: active %d total %d\n", prefix,
- binder_objstat_strings[i],
- stats->obj_created[i] - stats->obj_deleted[i],
- stats->obj_created[i]);
+ binder_objstat_strings[i],
+ obj_created - obj_deleted, obj_created);
}
}
@@ -3474,6 +4045,7 @@ static void print_binder_proc_stats(struct seq_file *m,
int count, strong, weak;
seq_printf(m, "proc %d\n", proc->pid);
+ seq_printf(m, "context %s\n", proc->context->name);
count = 0;
for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n))
count++;
@@ -3516,79 +4088,131 @@ static void print_binder_proc_stats(struct seq_file *m,
}
seq_printf(m, " pending transactions: %d\n", count);
- print_binder_stats(m, " ", &proc->stats);
+ print_binder_stats(m, " ", &proc->stats, NULL);
}
static int binder_state_show(struct seq_file *m, void *unused)
{
+ struct binder_device *device;
+ struct binder_context *context;
struct binder_proc *proc;
struct binder_node *node;
int do_lock = !binder_debug_no_lock;
-
- if (do_lock)
- binder_lock(__func__);
+ bool wrote_dead_nodes_header = false;
seq_puts(m, "binder state:\n");
- if (!hlist_empty(&binder_dead_nodes))
- seq_puts(m, "dead nodes:\n");
- hlist_for_each_entry(node, &binder_dead_nodes, dead_node)
- print_binder_node(m, node);
+ hlist_for_each_entry(device, &binder_devices, hlist) {
+ context = &device->context;
+ if (do_lock)
+ binder_lock(context, __func__);
+ if (!wrote_dead_nodes_header &&
+ !hlist_empty(&context->binder_dead_nodes)) {
+ seq_puts(m, "dead nodes:\n");
+ wrote_dead_nodes_header = true;
+ }
+ hlist_for_each_entry(node, &context->binder_dead_nodes,
+ dead_node)
+ print_binder_node(m, node);
+
+ if (do_lock)
+ binder_unlock(context, __func__);
+ }
+
+ hlist_for_each_entry(device, &binder_devices, hlist) {
+ context = &device->context;
+ if (do_lock)
+ binder_lock(context, __func__);
- hlist_for_each_entry(proc, &binder_procs, proc_node)
- print_binder_proc(m, proc, 1);
- if (do_lock)
- binder_unlock(__func__);
+ hlist_for_each_entry(proc, &context->binder_procs, proc_node)
+ print_binder_proc(m, proc, 1);
+ if (do_lock)
+ binder_unlock(context, __func__);
+ }
return 0;
}
static int binder_stats_show(struct seq_file *m, void *unused)
{
+ struct binder_device *device;
+ struct binder_context *context;
struct binder_proc *proc;
+ struct binder_stats total_binder_stats;
int do_lock = !binder_debug_no_lock;
- if (do_lock)
- binder_lock(__func__);
+ memset(&total_binder_stats, 0, sizeof(struct binder_stats));
+
+ hlist_for_each_entry(device, &binder_devices, hlist) {
+ context = &device->context;
+ if (do_lock)
+ binder_lock(context, __func__);
+
+ add_binder_stats(&context->binder_stats, &total_binder_stats);
+
+ if (do_lock)
+ binder_unlock(context, __func__);
+ }
seq_puts(m, "binder stats:\n");
+ print_binder_stats(m, "", &total_binder_stats, &binder_obj_stats);
- print_binder_stats(m, "", &binder_stats);
+ hlist_for_each_entry(device, &binder_devices, hlist) {
+ context = &device->context;
+ if (do_lock)
+ binder_lock(context, __func__);
- hlist_for_each_entry(proc, &binder_procs, proc_node)
- print_binder_proc_stats(m, proc);
- if (do_lock)
- binder_unlock(__func__);
+ hlist_for_each_entry(proc, &context->binder_procs, proc_node)
+ print_binder_proc_stats(m, proc);
+ if (do_lock)
+ binder_unlock(context, __func__);
+ }
return 0;
}
static int binder_transactions_show(struct seq_file *m, void *unused)
{
+ struct binder_device *device;
+ struct binder_context *context;
struct binder_proc *proc;
int do_lock = !binder_debug_no_lock;
- if (do_lock)
- binder_lock(__func__);
-
seq_puts(m, "binder transactions:\n");
- hlist_for_each_entry(proc, &binder_procs, proc_node)
- print_binder_proc(m, proc, 0);
- if (do_lock)
- binder_unlock(__func__);
+ hlist_for_each_entry(device, &binder_devices, hlist) {
+ context = &device->context;
+ if (do_lock)
+ binder_lock(context, __func__);
+
+ hlist_for_each_entry(proc, &context->binder_procs, proc_node)
+ print_binder_proc(m, proc, 0);
+ if (do_lock)
+ binder_unlock(context, __func__);
+ }
return 0;
}
static int binder_proc_show(struct seq_file *m, void *unused)
{
- struct binder_proc *proc = m->private;
+ struct binder_device *device;
+ struct binder_context *context;
+ struct binder_proc *itr;
+ int pid = (unsigned long)m->private;
int do_lock = !binder_debug_no_lock;
- if (do_lock)
- binder_lock(__func__);
- seq_puts(m, "binder proc state:\n");
- print_binder_proc(m, proc, 1);
- if (do_lock)
- binder_unlock(__func__);
+ hlist_for_each_entry(device, &binder_devices, hlist) {
+ context = &device->context;
+ if (do_lock)
+ binder_lock(context, __func__);
+
+ hlist_for_each_entry(itr, &context->binder_procs, proc_node) {
+ if (itr->pid == pid) {
+ seq_puts(m, "binder proc state:\n");
+ print_binder_proc(m, itr, 1);
+ }
+ }
+ if (do_lock)
+ binder_unlock(context, __func__);
+ }
return 0;
}
@@ -3596,18 +4220,17 @@ static void print_binder_transaction_log_entry(struct seq_file *m,
struct binder_transaction_log_entry *e)
{
seq_printf(m,
- "%d: %s from %d:%d to %d:%d node %d handle %d size %d:%d\n",
+ "%d: %s from %d:%d to %d:%d context %s node %d handle %d size %d:%d\n",
e->debug_id, (e->call_type == 2) ? "reply" :
((e->call_type == 1) ? "async" : "call "), e->from_proc,
- e->from_thread, e->to_proc, e->to_thread, e->to_node,
- e->target_handle, e->data_size, e->offsets_size);
+ e->from_thread, e->to_proc, e->to_thread, e->context_name,
+ e->to_node, e->target_handle, e->data_size, e->offsets_size);
}
-static int binder_transaction_log_show(struct seq_file *m, void *unused)
+static int print_binder_transaction_log(struct seq_file *m,
+ struct binder_transaction_log *log)
{
- struct binder_transaction_log *log = m->private;
int i;
-
if (log->full) {
for (i = log->next; i < ARRAY_SIZE(log->entry); i++)
print_binder_transaction_log_entry(m, &log->entry[i]);
@@ -3617,6 +4240,31 @@ static int binder_transaction_log_show(struct seq_file *m, void *unused)
return 0;
}
+static int binder_transaction_log_show(struct seq_file *m, void *unused)
+{
+ struct binder_device *device;
+ struct binder_context *context;
+
+ hlist_for_each_entry(device, &binder_devices, hlist) {
+ context = &device->context;
+ print_binder_transaction_log(m, &context->transaction_log);
+ }
+ return 0;
+}
+
+static int binder_failed_transaction_log_show(struct seq_file *m, void *unused)
+{
+ struct binder_device *device;
+ struct binder_context *context;
+
+ hlist_for_each_entry(device, &binder_devices, hlist) {
+ context = &device->context;
+ print_binder_transaction_log(m,
+ &context->transaction_log_failed);
+ }
+ return 0;
+}
+
static const struct file_operations binder_fops = {
.owner = THIS_MODULE,
.poll = binder_poll,
@@ -3628,30 +4276,97 @@ static const struct file_operations binder_fops = {
.release = binder_release,
};
-static struct miscdevice binder_miscdev = {
- .minor = MISC_DYNAMIC_MINOR,
- .name = "binder",
- .fops = &binder_fops
-};
-
BINDER_DEBUG_ENTRY(state);
BINDER_DEBUG_ENTRY(stats);
BINDER_DEBUG_ENTRY(transactions);
BINDER_DEBUG_ENTRY(transaction_log);
+BINDER_DEBUG_ENTRY(failed_transaction_log);
-static int __init binder_init(void)
+static void __init free_binder_device(struct binder_device *device)
+{
+ if (device->context.binder_deferred_workqueue)
+ destroy_workqueue(device->context.binder_deferred_workqueue);
+ kfree(device);
+}
+
+static int __init init_binder_device(const char *name)
{
int ret;
+ struct binder_device *binder_device;
+ struct binder_context *context;
+
+ binder_device = kzalloc(sizeof(*binder_device), GFP_KERNEL);
+ if (!binder_device)
+ return -ENOMEM;
+
+ binder_device->miscdev.fops = &binder_fops;
+ binder_device->miscdev.minor = MISC_DYNAMIC_MINOR;
+ binder_device->miscdev.name = name;
+
+ context = &binder_device->context;
+ context->binder_context_mgr_uid = INVALID_UID;
+ context->name = name;
+
+ mutex_init(&context->binder_main_lock);
+ mutex_init(&context->binder_deferred_lock);
+ mutex_init(&context->binder_mmap_lock);
+
+ context->binder_deferred_workqueue =
+ create_singlethread_workqueue(name);
+
+ if (!context->binder_deferred_workqueue) {
+ ret = -ENOMEM;
+ goto err_create_singlethread_workqueue_failed;
+ }
+
+ INIT_HLIST_HEAD(&context->binder_procs);
+ INIT_HLIST_HEAD(&context->binder_dead_nodes);
+ INIT_HLIST_HEAD(&context->binder_deferred_list);
+ INIT_WORK(&context->deferred_work, binder_deferred_func);
+
+ ret = misc_register(&binder_device->miscdev);
+ if (ret < 0) {
+ goto err_misc_register_failed;
+ }
+
+ hlist_add_head(&binder_device->hlist, &binder_devices);
+ return ret;
+
+err_create_singlethread_workqueue_failed:
+err_misc_register_failed:
+ free_binder_device(binder_device);
+
+ return ret;
+}
- binder_deferred_workqueue = create_singlethread_workqueue("binder");
- if (!binder_deferred_workqueue)
+static int __init binder_init(void)
+{
+ int ret = 0;
+ char *device_name, *device_names;
+ struct binder_device *device;
+ struct hlist_node *tmp;
+
+ /*
+ * Copy the module_parameter string, because we don't want to
+ * tokenize it in-place.
+ */
+ device_names = kzalloc(strlen(binder_devices_param) + 1, GFP_KERNEL);
+ if (!device_names)
return -ENOMEM;
+ strcpy(device_names, binder_devices_param);
+
+ while ((device_name = strsep(&device_names, ","))) {
+ ret = init_binder_device(device_name);
+ if (ret)
+ goto err_init_binder_device_failed;
+ }
+
binder_debugfs_dir_entry_root = debugfs_create_dir("binder", NULL);
if (binder_debugfs_dir_entry_root)
binder_debugfs_dir_entry_proc = debugfs_create_dir("proc",
binder_debugfs_dir_entry_root);
- ret = misc_register(&binder_miscdev);
+
if (binder_debugfs_dir_entry_root) {
debugfs_create_file("state",
S_IRUGO,
@@ -3671,14 +4386,24 @@ static int __init binder_init(void)
debugfs_create_file("transaction_log",
S_IRUGO,
binder_debugfs_dir_entry_root,
- &binder_transaction_log,
+ NULL,
&binder_transaction_log_fops);
debugfs_create_file("failed_transaction_log",
S_IRUGO,
binder_debugfs_dir_entry_root,
- &binder_transaction_log_failed,
- &binder_transaction_log_fops);
+ NULL,
+ &binder_failed_transaction_log_fops);
}
+
+ return ret;
+
+err_init_binder_device_failed:
+ hlist_for_each_entry_safe(device, tmp, &binder_devices, hlist) {
+ misc_deregister(&device->miscdev);
+ hlist_del(&device->hlist);
+ free_binder_device(device);
+ }
+
return ret;
}
diff --git a/drivers/staging/android/fiq_debugger/Kconfig b/drivers/staging/android/fiq_debugger/Kconfig
new file mode 100644
index 000000000000..56f7f999377e
--- /dev/null
+++ b/drivers/staging/android/fiq_debugger/Kconfig
@@ -0,0 +1,49 @@
+config FIQ_DEBUGGER
+ bool "FIQ Mode Serial Debugger"
+ default n
+ depends on ARM || ARM64
+ help
+ The FIQ serial debugger can accept commands even when the
+ kernel is unresponsive due to being stuck with interrupts
+ disabled.
+
+config FIQ_DEBUGGER_NO_SLEEP
+ bool "Keep serial debugger active"
+ depends on FIQ_DEBUGGER
+ default n
+ help
+ Enables the serial debugger at boot. Passing
+ fiq_debugger.no_sleep on the kernel commandline will
+ override this config option.
+
+config FIQ_DEBUGGER_WAKEUP_IRQ_ALWAYS_ON
+ bool "Don't disable wakeup IRQ when debugger is active"
+ depends on FIQ_DEBUGGER
+ default n
+ help
+ Don't disable the wakeup irq when enabling the uart clock. This will
+ cause extra interrupts, but it makes the serial debugger usable with
+ on some MSM radio builds that ignore the uart clock request in power
+ collapse.
+
+config FIQ_DEBUGGER_CONSOLE
+ bool "Console on FIQ Serial Debugger port"
+ depends on FIQ_DEBUGGER
+ default n
+ help
+ Enables a console so that printk messages are displayed on
+ the debugger serial port as the occur.
+
+config FIQ_DEBUGGER_CONSOLE_DEFAULT_ENABLE
+ bool "Put the FIQ debugger into console mode by default"
+ depends on FIQ_DEBUGGER_CONSOLE
+ default n
+ help
+ If enabled, this puts the fiq debugger into console mode by default.
+ Otherwise, the fiq debugger will start out in debug mode.
+
+config FIQ_WATCHDOG
+ bool
+ select FIQ_DEBUGGER
+ select PSTORE_RAM
+ default n
diff --git a/drivers/staging/android/fiq_debugger/Makefile b/drivers/staging/android/fiq_debugger/Makefile
new file mode 100644
index 000000000000..a7ca4871cad3
--- /dev/null
+++ b/drivers/staging/android/fiq_debugger/Makefile
@@ -0,0 +1,4 @@
+obj-y += fiq_debugger.o
+obj-$(CONFIG_ARM) += fiq_debugger_arm.o
+obj-$(CONFIG_ARM64) += fiq_debugger_arm64.o
+obj-$(CONFIG_FIQ_WATCHDOG) += fiq_watchdog.o
diff --git a/drivers/staging/android/fiq_debugger/fiq_debugger.c b/drivers/staging/android/fiq_debugger/fiq_debugger.c
new file mode 100644
index 000000000000..52f681674c53
--- /dev/null
+++ b/drivers/staging/android/fiq_debugger/fiq_debugger.c
@@ -0,0 +1,1212 @@
+/*
+ * drivers/staging/android/fiq_debugger.c
+ *
+ * Serial Debugger Interface accessed through an FIQ interrupt.
+ *
+ * Copyright (C) 2008 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <stdarg.h>
+#include <linux/module.h>
+#include <linux/io.h>
+#include <linux/console.h>
+#include <linux/interrupt.h>
+#include <linux/clk.h>
+#include <linux/platform_device.h>
+#include <linux/kernel_stat.h>
+#include <linux/kmsg_dump.h>
+#include <linux/irq.h>
+#include <linux/delay.h>
+#include <linux/reboot.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/timer.h>
+#include <linux/tty.h>
+#include <linux/tty_flip.h>
+#include <linux/wakelock.h>
+
+#ifdef CONFIG_FIQ_GLUE
+#include <asm/fiq_glue.h>
+#endif
+
+#include <linux/uaccess.h>
+
+#include "fiq_debugger.h"
+#include "fiq_debugger_priv.h"
+#include "fiq_debugger_ringbuf.h"
+
+#define DEBUG_MAX 64
+#define MAX_UNHANDLED_FIQ_COUNT 1000000
+
+#define MAX_FIQ_DEBUGGER_PORTS 4
+
+struct fiq_debugger_state {
+#ifdef CONFIG_FIQ_GLUE
+ struct fiq_glue_handler handler;
+#endif
+ struct fiq_debugger_output output;
+
+ int fiq;
+ int uart_irq;
+ int signal_irq;
+ int wakeup_irq;
+ bool wakeup_irq_no_set_wake;
+ struct clk *clk;
+ struct fiq_debugger_pdata *pdata;
+ struct platform_device *pdev;
+
+ char debug_cmd[DEBUG_MAX];
+ int debug_busy;
+ int debug_abort;
+
+ char debug_buf[DEBUG_MAX];
+ int debug_count;
+
+ bool no_sleep;
+ bool debug_enable;
+ bool ignore_next_wakeup_irq;
+ struct timer_list sleep_timer;
+ spinlock_t sleep_timer_lock;
+ bool uart_enabled;
+ struct wake_lock debugger_wake_lock;
+ bool console_enable;
+ int current_cpu;
+ atomic_t unhandled_fiq_count;
+ bool in_fiq;
+
+ struct work_struct work;
+ spinlock_t work_lock;
+ char work_cmd[DEBUG_MAX];
+
+#ifdef CONFIG_FIQ_DEBUGGER_CONSOLE
+ spinlock_t console_lock;
+ struct console console;
+ struct tty_port tty_port;
+ struct fiq_debugger_ringbuf *tty_rbuf;
+ bool syslog_dumping;
+#endif
+
+ unsigned int last_irqs[NR_IRQS];
+ unsigned int last_local_timer_irqs[NR_CPUS];
+};
+
+#ifdef CONFIG_FIQ_DEBUGGER_CONSOLE
+struct tty_driver *fiq_tty_driver;
+#endif
+
+#ifdef CONFIG_FIQ_DEBUGGER_NO_SLEEP
+static bool initial_no_sleep = true;
+#else
+static bool initial_no_sleep;
+#endif
+
+#ifdef CONFIG_FIQ_DEBUGGER_CONSOLE_DEFAULT_ENABLE
+static bool initial_debug_enable = true;
+static bool initial_console_enable = true;
+#else
+static bool initial_debug_enable;
+static bool initial_console_enable;
+#endif
+
+static bool fiq_kgdb_enable;
+
+module_param_named(no_sleep, initial_no_sleep, bool, 0644);
+module_param_named(debug_enable, initial_debug_enable, bool, 0644);
+module_param_named(console_enable, initial_console_enable, bool, 0644);
+module_param_named(kgdb_enable, fiq_kgdb_enable, bool, 0644);
+
+#ifdef CONFIG_FIQ_DEBUGGER_WAKEUP_IRQ_ALWAYS_ON
+static inline
+void fiq_debugger_enable_wakeup_irq(struct fiq_debugger_state *state) {}
+static inline
+void fiq_debugger_disable_wakeup_irq(struct fiq_debugger_state *state) {}
+#else
+static inline
+void fiq_debugger_enable_wakeup_irq(struct fiq_debugger_state *state)
+{
+ if (state->wakeup_irq < 0)
+ return;
+ enable_irq(state->wakeup_irq);
+ if (!state->wakeup_irq_no_set_wake)
+ enable_irq_wake(state->wakeup_irq);
+}
+static inline
+void fiq_debugger_disable_wakeup_irq(struct fiq_debugger_state *state)
+{
+ if (state->wakeup_irq < 0)
+ return;
+ disable_irq_nosync(state->wakeup_irq);
+ if (!state->wakeup_irq_no_set_wake)
+ disable_irq_wake(state->wakeup_irq);
+}
+#endif
+
+static inline bool fiq_debugger_have_fiq(struct fiq_debugger_state *state)
+{
+ return (state->fiq >= 0);
+}
+
+#ifdef CONFIG_FIQ_GLUE
+static void fiq_debugger_force_irq(struct fiq_debugger_state *state)
+{
+ unsigned int irq = state->signal_irq;
+
+ if (WARN_ON(!fiq_debugger_have_fiq(state)))
+ return;
+ if (state->pdata->force_irq) {
+ state->pdata->force_irq(state->pdev, irq);
+ } else {
+ struct irq_chip *chip = irq_get_chip(irq);
+ if (chip && chip->irq_retrigger)
+ chip->irq_retrigger(irq_get_irq_data(irq));
+ }
+}
+#endif
+
+static void fiq_debugger_uart_enable(struct fiq_debugger_state *state)
+{
+ if (state->clk)
+ clk_enable(state->clk);
+ if (state->pdata->uart_enable)
+ state->pdata->uart_enable(state->pdev);
+}
+
+static void fiq_debugger_uart_disable(struct fiq_debugger_state *state)
+{
+ if (state->pdata->uart_disable)
+ state->pdata->uart_disable(state->pdev);
+ if (state->clk)
+ clk_disable(state->clk);
+}
+
+static void fiq_debugger_uart_flush(struct fiq_debugger_state *state)
+{
+ if (state->pdata->uart_flush)
+ state->pdata->uart_flush(state->pdev);
+}
+
+static void fiq_debugger_putc(struct fiq_debugger_state *state, char c)
+{
+ state->pdata->uart_putc(state->pdev, c);
+}
+
+static void fiq_debugger_puts(struct fiq_debugger_state *state, char *s)
+{
+ unsigned c;
+ while ((c = *s++)) {
+ if (c == '\n')
+ fiq_debugger_putc(state, '\r');
+ fiq_debugger_putc(state, c);
+ }
+}
+
+static void fiq_debugger_prompt(struct fiq_debugger_state *state)
+{
+ fiq_debugger_puts(state, "debug> ");
+}
+
+static void fiq_debugger_dump_kernel_log(struct fiq_debugger_state *state)
+{
+ char buf[512];
+ size_t len;
+ struct kmsg_dumper dumper = { .active = true };
+
+
+ kmsg_dump_rewind_nolock(&dumper);
+ while (kmsg_dump_get_line_nolock(&dumper, true, buf,
+ sizeof(buf) - 1, &len)) {
+ buf[len] = 0;
+ fiq_debugger_puts(state, buf);
+ }
+}
+
+static void fiq_debugger_printf(struct fiq_debugger_output *output,
+ const char *fmt, ...)
+{
+ struct fiq_debugger_state *state;
+ char buf[256];
+ va_list ap;
+
+ state = container_of(output, struct fiq_debugger_state, output);
+ va_start(ap, fmt);
+ vsnprintf(buf, sizeof(buf), fmt, ap);
+ va_end(ap);
+
+ fiq_debugger_puts(state, buf);
+}
+
+/* Safe outside fiq context */
+static int fiq_debugger_printf_nfiq(void *cookie, const char *fmt, ...)
+{
+ struct fiq_debugger_state *state = cookie;
+ char buf[256];
+ va_list ap;
+ unsigned long irq_flags;
+
+ va_start(ap, fmt);
+ vsnprintf(buf, 128, fmt, ap);
+ va_end(ap);
+
+ local_irq_save(irq_flags);
+ fiq_debugger_puts(state, buf);
+ fiq_debugger_uart_flush(state);
+ local_irq_restore(irq_flags);
+ return state->debug_abort;
+}
+
+static void fiq_debugger_dump_irqs(struct fiq_debugger_state *state)
+{
+ int n;
+ struct irq_desc *desc;
+
+ fiq_debugger_printf(&state->output,
+ "irqnr total since-last status name\n");
+ for_each_irq_desc(n, desc) {
+ struct irqaction *act = desc->action;
+ if (!act && !kstat_irqs(n))
+ continue;
+ fiq_debugger_printf(&state->output, "%5d: %10u %11u %8x %s\n", n,
+ kstat_irqs(n),
+ kstat_irqs(n) - state->last_irqs[n],
+ desc->status_use_accessors,
+ (act && act->name) ? act->name : "???");
+ state->last_irqs[n] = kstat_irqs(n);
+ }
+}
+
+static void fiq_debugger_do_ps(struct fiq_debugger_state *state)
+{
+ struct task_struct *g;
+ struct task_struct *p;
+ unsigned task_state;
+ static const char stat_nam[] = "RSDTtZX";
+
+ fiq_debugger_printf(&state->output, "pid ppid prio task pc\n");
+ read_lock(&tasklist_lock);
+ do_each_thread(g, p) {
+ task_state = p->state ? __ffs(p->state) + 1 : 0;
+ fiq_debugger_printf(&state->output,
+ "%5d %5d %4d ", p->pid, p->parent->pid, p->prio);
+ fiq_debugger_printf(&state->output, "%-13.13s %c", p->comm,
+ task_state >= sizeof(stat_nam) ? '?' : stat_nam[task_state]);
+ if (task_state == TASK_RUNNING)
+ fiq_debugger_printf(&state->output, " running\n");
+ else
+ fiq_debugger_printf(&state->output, " %08lx\n",
+ thread_saved_pc(p));
+ } while_each_thread(g, p);
+ read_unlock(&tasklist_lock);
+}
+
+#ifdef CONFIG_FIQ_DEBUGGER_CONSOLE
+static void fiq_debugger_begin_syslog_dump(struct fiq_debugger_state *state)
+{
+ state->syslog_dumping = true;
+}
+
+static void fiq_debugger_end_syslog_dump(struct fiq_debugger_state *state)
+{
+ state->syslog_dumping = false;
+}
+#else
+extern int do_syslog(int type, char __user *bug, int count);
+static void fiq_debugger_begin_syslog_dump(struct fiq_debugger_state *state)
+{
+ do_syslog(5 /* clear */, NULL, 0);
+}
+
+static void fiq_debugger_end_syslog_dump(struct fiq_debugger_state *state)
+{
+ fiq_debugger_dump_kernel_log(state);
+}
+#endif
+
+static void fiq_debugger_do_sysrq(struct fiq_debugger_state *state, char rq)
+{
+ if ((rq == 'g' || rq == 'G') && !fiq_kgdb_enable) {
+ fiq_debugger_printf(&state->output, "sysrq-g blocked\n");
+ return;
+ }
+ fiq_debugger_begin_syslog_dump(state);
+ handle_sysrq(rq);
+ fiq_debugger_end_syslog_dump(state);
+}
+
+#ifdef CONFIG_KGDB
+static void fiq_debugger_do_kgdb(struct fiq_debugger_state *state)
+{
+ if (!fiq_kgdb_enable) {
+ fiq_debugger_printf(&state->output, "kgdb through fiq debugger not enabled\n");
+ return;
+ }
+
+ fiq_debugger_printf(&state->output, "enabling console and triggering kgdb\n");
+ state->console_enable = true;
+ handle_sysrq('g');
+}
+#endif
+
+static void fiq_debugger_schedule_work(struct fiq_debugger_state *state,
+ char *cmd)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&state->work_lock, flags);
+ if (state->work_cmd[0] != '\0') {
+ fiq_debugger_printf(&state->output, "work command processor busy\n");
+ spin_unlock_irqrestore(&state->work_lock, flags);
+ return;
+ }
+
+ strlcpy(state->work_cmd, cmd, sizeof(state->work_cmd));
+ spin_unlock_irqrestore(&state->work_lock, flags);
+
+ schedule_work(&state->work);
+}
+
+static void fiq_debugger_work(struct work_struct *work)
+{
+ struct fiq_debugger_state *state;
+ char work_cmd[DEBUG_MAX];
+ char *cmd;
+ unsigned long flags;
+
+ state = container_of(work, struct fiq_debugger_state, work);
+
+ spin_lock_irqsave(&state->work_lock, flags);
+
+ strlcpy(work_cmd, state->work_cmd, sizeof(work_cmd));
+ state->work_cmd[0] = '\0';
+
+ spin_unlock_irqrestore(&state->work_lock, flags);
+
+ cmd = work_cmd;
+ if (!strncmp(cmd, "reboot", 6)) {
+ cmd += 6;
+ while (*cmd == ' ')
+ cmd++;
+ if (cmd != '\0')
+ kernel_restart(cmd);
+ else
+ kernel_restart(NULL);
+ } else {
+ fiq_debugger_printf(&state->output, "unknown work command '%s'\n",
+ work_cmd);
+ }
+}
+
+/* This function CANNOT be called in FIQ context */
+static void fiq_debugger_irq_exec(struct fiq_debugger_state *state, char *cmd)
+{
+ if (!strcmp(cmd, "ps"))
+ fiq_debugger_do_ps(state);
+ if (!strcmp(cmd, "sysrq"))
+ fiq_debugger_do_sysrq(state, 'h');
+ if (!strncmp(cmd, "sysrq ", 6))
+ fiq_debugger_do_sysrq(state, cmd[6]);
+#ifdef CONFIG_KGDB
+ if (!strcmp(cmd, "kgdb"))
+ fiq_debugger_do_kgdb(state);
+#endif
+ if (!strncmp(cmd, "reboot", 6))
+ fiq_debugger_schedule_work(state, cmd);
+}
+
+static void fiq_debugger_help(struct fiq_debugger_state *state)
+{
+ fiq_debugger_printf(&state->output,
+ "FIQ Debugger commands:\n"
+ " pc PC status\n"
+ " regs Register dump\n"
+ " allregs Extended Register dump\n"
+ " bt Stack trace\n"
+ " reboot [<c>] Reboot with command <c>\n"
+ " reset [<c>] Hard reset with command <c>\n"
+ " irqs Interupt status\n"
+ " kmsg Kernel log\n"
+ " version Kernel version\n");
+ fiq_debugger_printf(&state->output,
+ " sleep Allow sleep while in FIQ\n"
+ " nosleep Disable sleep while in FIQ\n"
+ " console Switch terminal to console\n"
+ " cpu Current CPU\n"
+ " cpu <number> Switch to CPU<number>\n");
+ fiq_debugger_printf(&state->output,
+ " ps Process list\n"
+ " sysrq sysrq options\n"
+ " sysrq <param> Execute sysrq with <param>\n");
+#ifdef CONFIG_KGDB
+ fiq_debugger_printf(&state->output,
+ " kgdb Enter kernel debugger\n");
+#endif
+}
+
+static void fiq_debugger_take_affinity(void *info)
+{
+ struct fiq_debugger_state *state = info;
+ struct cpumask cpumask;
+
+ cpumask_clear(&cpumask);
+ cpumask_set_cpu(get_cpu(), &cpumask);
+
+ irq_set_affinity(state->uart_irq, &cpumask);
+}
+
+static void fiq_debugger_switch_cpu(struct fiq_debugger_state *state, int cpu)
+{
+ if (!fiq_debugger_have_fiq(state))
+ smp_call_function_single(cpu, fiq_debugger_take_affinity, state,
+ false);
+ state->current_cpu = cpu;
+}
+
+static bool fiq_debugger_fiq_exec(struct fiq_debugger_state *state,
+ const char *cmd, const struct pt_regs *regs,
+ void *svc_sp)
+{
+ bool signal_helper = false;
+
+ if (!strcmp(cmd, "help") || !strcmp(cmd, "?")) {
+ fiq_debugger_help(state);
+ } else if (!strcmp(cmd, "pc")) {
+ fiq_debugger_dump_pc(&state->output, regs);
+ } else if (!strcmp(cmd, "regs")) {
+ fiq_debugger_dump_regs(&state->output, regs);
+ } else if (!strcmp(cmd, "allregs")) {
+ fiq_debugger_dump_allregs(&state->output, regs);
+ } else if (!strcmp(cmd, "bt")) {
+ fiq_debugger_dump_stacktrace(&state->output, regs, 100, svc_sp);
+ } else if (!strncmp(cmd, "reset", 5)) {
+ cmd += 5;
+ while (*cmd == ' ')
+ cmd++;
+ if (*cmd) {
+ char tmp_cmd[32];
+ strlcpy(tmp_cmd, cmd, sizeof(tmp_cmd));
+ machine_restart(tmp_cmd);
+ } else {
+ machine_restart(NULL);
+ }
+ } else if (!strcmp(cmd, "irqs")) {
+ fiq_debugger_dump_irqs(state);
+ } else if (!strcmp(cmd, "kmsg")) {
+ fiq_debugger_dump_kernel_log(state);
+ } else if (!strcmp(cmd, "version")) {
+ fiq_debugger_printf(&state->output, "%s\n", linux_banner);
+ } else if (!strcmp(cmd, "sleep")) {
+ state->no_sleep = false;
+ fiq_debugger_printf(&state->output, "enabling sleep\n");
+ } else if (!strcmp(cmd, "nosleep")) {
+ state->no_sleep = true;
+ fiq_debugger_printf(&state->output, "disabling sleep\n");
+ } else if (!strcmp(cmd, "console")) {
+ fiq_debugger_printf(&state->output, "console mode\n");
+ fiq_debugger_uart_flush(state);
+ state->console_enable = true;
+ } else if (!strcmp(cmd, "cpu")) {
+ fiq_debugger_printf(&state->output, "cpu %d\n", state->current_cpu);
+ } else if (!strncmp(cmd, "cpu ", 4)) {
+ unsigned long cpu = 0;
+ if (kstrtoul(cmd + 4, 10, &cpu) == 0)
+ fiq_debugger_switch_cpu(state, cpu);
+ else
+ fiq_debugger_printf(&state->output, "invalid cpu\n");
+ fiq_debugger_printf(&state->output, "cpu %d\n", state->current_cpu);
+ } else {
+ if (state->debug_busy) {
+ fiq_debugger_printf(&state->output,
+ "command processor busy. trying to abort.\n");
+ state->debug_abort = -1;
+ } else {
+ strcpy(state->debug_cmd, cmd);
+ state->debug_busy = 1;
+ }
+
+ return true;
+ }
+ if (!state->console_enable)
+ fiq_debugger_prompt(state);
+
+ return signal_helper;
+}
+
+static void fiq_debugger_sleep_timer_expired(unsigned long data)
+{
+ struct fiq_debugger_state *state = (struct fiq_debugger_state *)data;
+ unsigned long flags;
+
+ spin_lock_irqsave(&state->sleep_timer_lock, flags);
+ if (state->uart_enabled && !state->no_sleep) {
+ if (state->debug_enable && !state->console_enable) {
+ state->debug_enable = false;
+ fiq_debugger_printf_nfiq(state,
+ "suspending fiq debugger\n");
+ }
+ state->ignore_next_wakeup_irq = true;
+ fiq_debugger_uart_disable(state);
+ state->uart_enabled = false;
+ fiq_debugger_enable_wakeup_irq(state);
+ }
+ wake_unlock(&state->debugger_wake_lock);
+ spin_unlock_irqrestore(&state->sleep_timer_lock, flags);
+}
+
+static void fiq_debugger_handle_wakeup(struct fiq_debugger_state *state)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&state->sleep_timer_lock, flags);
+ if (state->wakeup_irq >= 0 && state->ignore_next_wakeup_irq) {
+ state->ignore_next_wakeup_irq = false;
+ } else if (!state->uart_enabled) {
+ wake_lock(&state->debugger_wake_lock);
+ fiq_debugger_uart_enable(state);
+ state->uart_enabled = true;
+ fiq_debugger_disable_wakeup_irq(state);
+ mod_timer(&state->sleep_timer, jiffies + HZ / 2);
+ }
+ spin_unlock_irqrestore(&state->sleep_timer_lock, flags);
+}
+
+static irqreturn_t fiq_debugger_wakeup_irq_handler(int irq, void *dev)
+{
+ struct fiq_debugger_state *state = dev;
+
+ if (!state->no_sleep)
+ fiq_debugger_puts(state, "WAKEUP\n");
+ fiq_debugger_handle_wakeup(state);
+
+ return IRQ_HANDLED;
+}
+
+static
+void fiq_debugger_handle_console_irq_context(struct fiq_debugger_state *state)
+{
+#if defined(CONFIG_FIQ_DEBUGGER_CONSOLE)
+ if (state->tty_port.ops) {
+ int i;
+ int count = fiq_debugger_ringbuf_level(state->tty_rbuf);
+ for (i = 0; i < count; i++) {
+ int c = fiq_debugger_ringbuf_peek(state->tty_rbuf, 0);
+ tty_insert_flip_char(&state->tty_port, c, TTY_NORMAL);
+ if (!fiq_debugger_ringbuf_consume(state->tty_rbuf, 1))
+ pr_warn("fiq tty failed to consume byte\n");
+ }
+ tty_flip_buffer_push(&state->tty_port);
+ }
+#endif
+}
+
+static void fiq_debugger_handle_irq_context(struct fiq_debugger_state *state)
+{
+ if (!state->no_sleep) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&state->sleep_timer_lock, flags);
+ wake_lock(&state->debugger_wake_lock);
+ mod_timer(&state->sleep_timer, jiffies + HZ * 5);
+ spin_unlock_irqrestore(&state->sleep_timer_lock, flags);
+ }
+ fiq_debugger_handle_console_irq_context(state);
+ if (state->debug_busy) {
+ fiq_debugger_irq_exec(state, state->debug_cmd);
+ if (!state->console_enable)
+ fiq_debugger_prompt(state);
+ state->debug_busy = 0;
+ }
+}
+
+static int fiq_debugger_getc(struct fiq_debugger_state *state)
+{
+ return state->pdata->uart_getc(state->pdev);
+}
+
+static bool fiq_debugger_handle_uart_interrupt(struct fiq_debugger_state *state,
+ int this_cpu, const struct pt_regs *regs, void *svc_sp)
+{
+ int c;
+ static int last_c;
+ int count = 0;
+ bool signal_helper = false;
+
+ if (this_cpu != state->current_cpu) {
+ if (state->in_fiq)
+ return false;
+
+ if (atomic_inc_return(&state->unhandled_fiq_count) !=
+ MAX_UNHANDLED_FIQ_COUNT)
+ return false;
+
+ fiq_debugger_printf(&state->output,
+ "fiq_debugger: cpu %d not responding, "
+ "reverting to cpu %d\n", state->current_cpu,
+ this_cpu);
+
+ atomic_set(&state->unhandled_fiq_count, 0);
+ fiq_debugger_switch_cpu(state, this_cpu);
+ return false;
+ }
+
+ state->in_fiq = true;
+
+ while ((c = fiq_debugger_getc(state)) != FIQ_DEBUGGER_NO_CHAR) {
+ count++;
+ if (!state->debug_enable) {
+ if ((c == 13) || (c == 10)) {
+ state->debug_enable = true;
+ state->debug_count = 0;
+ fiq_debugger_prompt(state);
+ }
+ } else if (c == FIQ_DEBUGGER_BREAK) {
+ state->console_enable = false;
+ fiq_debugger_puts(state, "fiq debugger mode\n");
+ state->debug_count = 0;
+ fiq_debugger_prompt(state);
+#ifdef CONFIG_FIQ_DEBUGGER_CONSOLE
+ } else if (state->console_enable && state->tty_rbuf) {
+ fiq_debugger_ringbuf_push(state->tty_rbuf, c);
+ signal_helper = true;
+#endif
+ } else if ((c >= ' ') && (c < 127)) {
+ if (state->debug_count < (DEBUG_MAX - 1)) {
+ state->debug_buf[state->debug_count++] = c;
+ fiq_debugger_putc(state, c);
+ }
+ } else if ((c == 8) || (c == 127)) {
+ if (state->debug_count > 0) {
+ state->debug_count--;
+ fiq_debugger_putc(state, 8);
+ fiq_debugger_putc(state, ' ');
+ fiq_debugger_putc(state, 8);
+ }
+ } else if ((c == 13) || (c == 10)) {
+ if (c == '\r' || (c == '\n' && last_c != '\r')) {
+ fiq_debugger_putc(state, '\r');
+ fiq_debugger_putc(state, '\n');
+ }
+ if (state->debug_count) {
+ state->debug_buf[state->debug_count] = 0;
+ state->debug_count = 0;
+ signal_helper |=
+ fiq_debugger_fiq_exec(state,
+ state->debug_buf,
+ regs, svc_sp);
+ } else {
+ fiq_debugger_prompt(state);
+ }
+ }
+ last_c = c;
+ }
+ if (!state->console_enable)
+ fiq_debugger_uart_flush(state);
+ if (state->pdata->fiq_ack)
+ state->pdata->fiq_ack(state->pdev, state->fiq);
+
+ /* poke sleep timer if necessary */
+ if (state->debug_enable && !state->no_sleep)
+ signal_helper = true;
+
+ atomic_set(&state->unhandled_fiq_count, 0);
+ state->in_fiq = false;
+
+ return signal_helper;
+}
+
+#ifdef CONFIG_FIQ_GLUE
+static void fiq_debugger_fiq(struct fiq_glue_handler *h,
+ const struct pt_regs *regs, void *svc_sp)
+{
+ struct fiq_debugger_state *state =
+ container_of(h, struct fiq_debugger_state, handler);
+ unsigned int this_cpu = THREAD_INFO(svc_sp)->cpu;
+ bool need_irq;
+
+ need_irq = fiq_debugger_handle_uart_interrupt(state, this_cpu, regs,
+ svc_sp);
+ if (need_irq)
+ fiq_debugger_force_irq(state);
+}
+#endif
+
+/*
+ * When not using FIQs, we only use this single interrupt as an entry point.
+ * This just effectively takes over the UART interrupt and does all the work
+ * in this context.
+ */
+static irqreturn_t fiq_debugger_uart_irq(int irq, void *dev)
+{
+ struct fiq_debugger_state *state = dev;
+ bool not_done;
+
+ fiq_debugger_handle_wakeup(state);
+
+ /* handle the debugger irq in regular context */
+ not_done = fiq_debugger_handle_uart_interrupt(state, smp_processor_id(),
+ get_irq_regs(),
+ current_thread_info());
+ if (not_done)
+ fiq_debugger_handle_irq_context(state);
+
+ return IRQ_HANDLED;
+}
+
+/*
+ * If FIQs are used, not everything can happen in fiq context.
+ * FIQ handler does what it can and then signals this interrupt to finish the
+ * job in irq context.
+ */
+static irqreturn_t fiq_debugger_signal_irq(int irq, void *dev)
+{
+ struct fiq_debugger_state *state = dev;
+
+ if (state->pdata->force_irq_ack)
+ state->pdata->force_irq_ack(state->pdev, state->signal_irq);
+
+ fiq_debugger_handle_irq_context(state);
+
+ return IRQ_HANDLED;
+}
+
+#ifdef CONFIG_FIQ_GLUE
+static void fiq_debugger_resume(struct fiq_glue_handler *h)
+{
+ struct fiq_debugger_state *state =
+ container_of(h, struct fiq_debugger_state, handler);
+ if (state->pdata->uart_resume)
+ state->pdata->uart_resume(state->pdev);
+}
+#endif
+
+#if defined(CONFIG_FIQ_DEBUGGER_CONSOLE)
+struct tty_driver *fiq_debugger_console_device(struct console *co, int *index)
+{
+ *index = co->index;
+ return fiq_tty_driver;
+}
+
+static void fiq_debugger_console_write(struct console *co,
+ const char *s, unsigned int count)
+{
+ struct fiq_debugger_state *state;
+ unsigned long flags;
+
+ state = container_of(co, struct fiq_debugger_state, console);
+
+ if (!state->console_enable && !state->syslog_dumping)
+ return;
+
+ fiq_debugger_uart_enable(state);
+ spin_lock_irqsave(&state->console_lock, flags);
+ while (count--) {
+ if (*s == '\n')
+ fiq_debugger_putc(state, '\r');
+ fiq_debugger_putc(state, *s++);
+ }
+ fiq_debugger_uart_flush(state);
+ spin_unlock_irqrestore(&state->console_lock, flags);
+ fiq_debugger_uart_disable(state);
+}
+
+static struct console fiq_debugger_console = {
+ .name = "ttyFIQ",
+ .device = fiq_debugger_console_device,
+ .write = fiq_debugger_console_write,
+ .flags = CON_PRINTBUFFER | CON_ANYTIME | CON_ENABLED,
+};
+
+int fiq_tty_open(struct tty_struct *tty, struct file *filp)
+{
+ int line = tty->index;
+ struct fiq_debugger_state **states = tty->driver->driver_state;
+ struct fiq_debugger_state *state = states[line];
+
+ return tty_port_open(&state->tty_port, tty, filp);
+}
+
+void fiq_tty_close(struct tty_struct *tty, struct file *filp)
+{
+ tty_port_close(tty->port, tty, filp);
+}
+
+int fiq_tty_write(struct tty_struct *tty, const unsigned char *buf, int count)
+{
+ int i;
+ int line = tty->index;
+ struct fiq_debugger_state **states = tty->driver->driver_state;
+ struct fiq_debugger_state *state = states[line];
+
+ if (!state->console_enable)
+ return count;
+
+ fiq_debugger_uart_enable(state);
+ spin_lock_irq(&state->console_lock);
+ for (i = 0; i < count; i++)
+ fiq_debugger_putc(state, *buf++);
+ spin_unlock_irq(&state->console_lock);
+ fiq_debugger_uart_disable(state);
+
+ return count;
+}
+
+int fiq_tty_write_room(struct tty_struct *tty)
+{
+ return 16;
+}
+
+#ifdef CONFIG_CONSOLE_POLL
+static int fiq_tty_poll_init(struct tty_driver *driver, int line, char *options)
+{
+ return 0;
+}
+
+static int fiq_tty_poll_get_char(struct tty_driver *driver, int line)
+{
+ struct fiq_debugger_state **states = driver->driver_state;
+ struct fiq_debugger_state *state = states[line];
+ int c = NO_POLL_CHAR;
+
+ fiq_debugger_uart_enable(state);
+ if (fiq_debugger_have_fiq(state)) {
+ int count = fiq_debugger_ringbuf_level(state->tty_rbuf);
+ if (count > 0) {
+ c = fiq_debugger_ringbuf_peek(state->tty_rbuf, 0);
+ fiq_debugger_ringbuf_consume(state->tty_rbuf, 1);
+ }
+ } else {
+ c = fiq_debugger_getc(state);
+ if (c == FIQ_DEBUGGER_NO_CHAR)
+ c = NO_POLL_CHAR;
+ }
+ fiq_debugger_uart_disable(state);
+
+ return c;
+}
+
+static void fiq_tty_poll_put_char(struct tty_driver *driver, int line, char ch)
+{
+ struct fiq_debugger_state **states = driver->driver_state;
+ struct fiq_debugger_state *state = states[line];
+ fiq_debugger_uart_enable(state);
+ fiq_debugger_putc(state, ch);
+ fiq_debugger_uart_disable(state);
+}
+#endif
+
+static const struct tty_port_operations fiq_tty_port_ops;
+
+static const struct tty_operations fiq_tty_driver_ops = {
+ .write = fiq_tty_write,
+ .write_room = fiq_tty_write_room,
+ .open = fiq_tty_open,
+ .close = fiq_tty_close,
+#ifdef CONFIG_CONSOLE_POLL
+ .poll_init = fiq_tty_poll_init,
+ .poll_get_char = fiq_tty_poll_get_char,
+ .poll_put_char = fiq_tty_poll_put_char,
+#endif
+};
+
+static int fiq_debugger_tty_init(void)
+{
+ int ret;
+ struct fiq_debugger_state **states = NULL;
+
+ states = kzalloc(sizeof(*states) * MAX_FIQ_DEBUGGER_PORTS, GFP_KERNEL);
+ if (!states) {
+ pr_err("Failed to allocate fiq debugger state structres\n");
+ return -ENOMEM;
+ }
+
+ fiq_tty_driver = alloc_tty_driver(MAX_FIQ_DEBUGGER_PORTS);
+ if (!fiq_tty_driver) {
+ pr_err("Failed to allocate fiq debugger tty\n");
+ ret = -ENOMEM;
+ goto err_free_state;
+ }
+
+ fiq_tty_driver->owner = THIS_MODULE;
+ fiq_tty_driver->driver_name = "fiq-debugger";
+ fiq_tty_driver->name = "ttyFIQ";
+ fiq_tty_driver->type = TTY_DRIVER_TYPE_SERIAL;
+ fiq_tty_driver->subtype = SERIAL_TYPE_NORMAL;
+ fiq_tty_driver->init_termios = tty_std_termios;
+ fiq_tty_driver->flags = TTY_DRIVER_REAL_RAW |
+ TTY_DRIVER_DYNAMIC_DEV;
+ fiq_tty_driver->driver_state = states;
+
+ fiq_tty_driver->init_termios.c_cflag =
+ B115200 | CS8 | CREAD | HUPCL | CLOCAL;
+ fiq_tty_driver->init_termios.c_ispeed = 115200;
+ fiq_tty_driver->init_termios.c_ospeed = 115200;
+
+ tty_set_operations(fiq_tty_driver, &fiq_tty_driver_ops);
+
+ ret = tty_register_driver(fiq_tty_driver);
+ if (ret) {
+ pr_err("Failed to register fiq tty: %d\n", ret);
+ goto err_free_tty;
+ }
+
+ pr_info("Registered FIQ tty driver\n");
+ return 0;
+
+err_free_tty:
+ put_tty_driver(fiq_tty_driver);
+ fiq_tty_driver = NULL;
+err_free_state:
+ kfree(states);
+ return ret;
+}
+
+static int fiq_debugger_tty_init_one(struct fiq_debugger_state *state)
+{
+ int ret;
+ struct device *tty_dev;
+ struct fiq_debugger_state **states = fiq_tty_driver->driver_state;
+
+ states[state->pdev->id] = state;
+
+ state->tty_rbuf = fiq_debugger_ringbuf_alloc(1024);
+ if (!state->tty_rbuf) {
+ pr_err("Failed to allocate fiq debugger ringbuf\n");
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ tty_port_init(&state->tty_port);
+ state->tty_port.ops = &fiq_tty_port_ops;
+
+ tty_dev = tty_port_register_device(&state->tty_port, fiq_tty_driver,
+ state->pdev->id, &state->pdev->dev);
+ if (IS_ERR(tty_dev)) {
+ pr_err("Failed to register fiq debugger tty device\n");
+ ret = PTR_ERR(tty_dev);
+ goto err;
+ }
+
+ device_set_wakeup_capable(tty_dev, 1);
+
+ pr_info("Registered fiq debugger ttyFIQ%d\n", state->pdev->id);
+
+ return 0;
+
+err:
+ fiq_debugger_ringbuf_free(state->tty_rbuf);
+ state->tty_rbuf = NULL;
+ return ret;
+}
+#endif
+
+static int fiq_debugger_dev_suspend(struct device *dev)
+{
+ struct platform_device *pdev = to_platform_device(dev);
+ struct fiq_debugger_state *state = platform_get_drvdata(pdev);
+
+ if (state->pdata->uart_dev_suspend)
+ return state->pdata->uart_dev_suspend(pdev);
+ return 0;
+}
+
+static int fiq_debugger_dev_resume(struct device *dev)
+{
+ struct platform_device *pdev = to_platform_device(dev);
+ struct fiq_debugger_state *state = platform_get_drvdata(pdev);
+
+ if (state->pdata->uart_dev_resume)
+ return state->pdata->uart_dev_resume(pdev);
+ return 0;
+}
+
+static int fiq_debugger_probe(struct platform_device *pdev)
+{
+ int ret;
+ struct fiq_debugger_pdata *pdata = dev_get_platdata(&pdev->dev);
+ struct fiq_debugger_state *state;
+ int fiq;
+ int uart_irq;
+
+ if (pdev->id >= MAX_FIQ_DEBUGGER_PORTS)
+ return -EINVAL;
+
+ if (!pdata->uart_getc || !pdata->uart_putc)
+ return -EINVAL;
+ if ((pdata->uart_enable && !pdata->uart_disable) ||
+ (!pdata->uart_enable && pdata->uart_disable))
+ return -EINVAL;
+
+ fiq = platform_get_irq_byname(pdev, "fiq");
+ uart_irq = platform_get_irq_byname(pdev, "uart_irq");
+
+ /* uart_irq mode and fiq mode are mutually exclusive, but one of them
+ * is required */
+ if ((uart_irq < 0 && fiq < 0) || (uart_irq >= 0 && fiq >= 0))
+ return -EINVAL;
+ if (fiq >= 0 && !pdata->fiq_enable)
+ return -EINVAL;
+
+ state = kzalloc(sizeof(*state), GFP_KERNEL);
+ state->output.printf = fiq_debugger_printf;
+ setup_timer(&state->sleep_timer, fiq_debugger_sleep_timer_expired,
+ (unsigned long)state);
+ state->pdata = pdata;
+ state->pdev = pdev;
+ state->no_sleep = initial_no_sleep;
+ state->debug_enable = initial_debug_enable;
+ state->console_enable = initial_console_enable;
+
+ state->fiq = fiq;
+ state->uart_irq = uart_irq;
+ state->signal_irq = platform_get_irq_byname(pdev, "signal");
+ state->wakeup_irq = platform_get_irq_byname(pdev, "wakeup");
+
+ INIT_WORK(&state->work, fiq_debugger_work);
+ spin_lock_init(&state->work_lock);
+
+ platform_set_drvdata(pdev, state);
+
+ spin_lock_init(&state->sleep_timer_lock);
+
+ if (state->wakeup_irq < 0 && fiq_debugger_have_fiq(state))
+ state->no_sleep = true;
+ state->ignore_next_wakeup_irq = !state->no_sleep;
+
+ wake_lock_init(&state->debugger_wake_lock,
+ WAKE_LOCK_SUSPEND, "serial-debug");
+
+ state->clk = clk_get(&pdev->dev, NULL);
+ if (IS_ERR(state->clk))
+ state->clk = NULL;
+
+ /* do not call pdata->uart_enable here since uart_init may still
+ * need to do some initialization before uart_enable can work.
+ * So, only try to manage the clock during init.
+ */
+ if (state->clk)
+ clk_enable(state->clk);
+
+ if (pdata->uart_init) {
+ ret = pdata->uart_init(pdev);
+ if (ret)
+ goto err_uart_init;
+ }
+
+ fiq_debugger_printf_nfiq(state,
+ "<hit enter %sto activate fiq debugger>\n",
+ state->no_sleep ? "" : "twice ");
+
+#ifdef CONFIG_FIQ_GLUE
+ if (fiq_debugger_have_fiq(state)) {
+ state->handler.fiq = fiq_debugger_fiq;
+ state->handler.resume = fiq_debugger_resume;
+ ret = fiq_glue_register_handler(&state->handler);
+ if (ret) {
+ pr_err("%s: could not install fiq handler\n", __func__);
+ goto err_register_irq;
+ }
+
+ pdata->fiq_enable(pdev, state->fiq, 1);
+ } else
+#endif
+ {
+ ret = request_irq(state->uart_irq, fiq_debugger_uart_irq,
+ IRQF_NO_SUSPEND, "debug", state);
+ if (ret) {
+ pr_err("%s: could not install irq handler\n", __func__);
+ goto err_register_irq;
+ }
+
+ /* for irq-only mode, we want this irq to wake us up, if it
+ * can.
+ */
+ enable_irq_wake(state->uart_irq);
+ }
+
+ if (state->clk)
+ clk_disable(state->clk);
+
+ if (state->signal_irq >= 0) {
+ ret = request_irq(state->signal_irq, fiq_debugger_signal_irq,
+ IRQF_TRIGGER_RISING, "debug-signal", state);
+ if (ret)
+ pr_err("serial_debugger: could not install signal_irq");
+ }
+
+ if (state->wakeup_irq >= 0) {
+ ret = request_irq(state->wakeup_irq,
+ fiq_debugger_wakeup_irq_handler,
+ IRQF_TRIGGER_FALLING | IRQF_DISABLED,
+ "debug-wakeup", state);
+ if (ret) {
+ pr_err("serial_debugger: "
+ "could not install wakeup irq\n");
+ state->wakeup_irq = -1;
+ } else {
+ ret = enable_irq_wake(state->wakeup_irq);
+ if (ret) {
+ pr_err("serial_debugger: "
+ "could not enable wakeup\n");
+ state->wakeup_irq_no_set_wake = true;
+ }
+ }
+ }
+ if (state->no_sleep)
+ fiq_debugger_handle_wakeup(state);
+
+#if defined(CONFIG_FIQ_DEBUGGER_CONSOLE)
+ spin_lock_init(&state->console_lock);
+ state->console = fiq_debugger_console;
+ state->console.index = pdev->id;
+ if (!console_set_on_cmdline)
+ add_preferred_console(state->console.name,
+ state->console.index, NULL);
+ register_console(&state->console);
+ fiq_debugger_tty_init_one(state);
+#endif
+ return 0;
+
+err_register_irq:
+ if (pdata->uart_free)
+ pdata->uart_free(pdev);
+err_uart_init:
+ if (state->clk)
+ clk_disable(state->clk);
+ if (state->clk)
+ clk_put(state->clk);
+ wake_lock_destroy(&state->debugger_wake_lock);
+ platform_set_drvdata(pdev, NULL);
+ kfree(state);
+ return ret;
+}
+
+static const struct dev_pm_ops fiq_debugger_dev_pm_ops = {
+ .suspend = fiq_debugger_dev_suspend,
+ .resume = fiq_debugger_dev_resume,
+};
+
+static struct platform_driver fiq_debugger_driver = {
+ .probe = fiq_debugger_probe,
+ .driver = {
+ .name = "fiq_debugger",
+ .pm = &fiq_debugger_dev_pm_ops,
+ },
+};
+
+static int __init fiq_debugger_init(void)
+{
+#if defined(CONFIG_FIQ_DEBUGGER_CONSOLE)
+ fiq_debugger_tty_init();
+#endif
+ return platform_driver_register(&fiq_debugger_driver);
+}
+
+postcore_initcall(fiq_debugger_init);
diff --git a/drivers/staging/android/fiq_debugger/fiq_debugger.h b/drivers/staging/android/fiq_debugger/fiq_debugger.h
new file mode 100644
index 000000000000..c9ec4f8db086
--- /dev/null
+++ b/drivers/staging/android/fiq_debugger/fiq_debugger.h
@@ -0,0 +1,64 @@
+/*
+ * drivers/staging/android/fiq_debugger/fiq_debugger.h
+ *
+ * Copyright (C) 2010 Google, Inc.
+ * Author: Colin Cross <ccross@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _ARCH_ARM_MACH_TEGRA_FIQ_DEBUGGER_H_
+#define _ARCH_ARM_MACH_TEGRA_FIQ_DEBUGGER_H_
+
+#include <linux/serial_core.h>
+
+#define FIQ_DEBUGGER_NO_CHAR NO_POLL_CHAR
+#define FIQ_DEBUGGER_BREAK 0x00ff0100
+
+#define FIQ_DEBUGGER_FIQ_IRQ_NAME "fiq"
+#define FIQ_DEBUGGER_SIGNAL_IRQ_NAME "signal"
+#define FIQ_DEBUGGER_WAKEUP_IRQ_NAME "wakeup"
+
+/**
+ * struct fiq_debugger_pdata - fiq debugger platform data
+ * @uart_resume: used to restore uart state right before enabling
+ * the fiq.
+ * @uart_enable: Do the work necessary to communicate with the uart
+ * hw (enable clocks, etc.). This must be ref-counted.
+ * @uart_disable: Do the work necessary to disable the uart hw
+ * (disable clocks, etc.). This must be ref-counted.
+ * @uart_dev_suspend: called during PM suspend, generally not needed
+ * for real fiq mode debugger.
+ * @uart_dev_resume: called during PM resume, generally not needed
+ * for real fiq mode debugger.
+ */
+struct fiq_debugger_pdata {
+ int (*uart_init)(struct platform_device *pdev);
+ void (*uart_free)(struct platform_device *pdev);
+ int (*uart_resume)(struct platform_device *pdev);
+ int (*uart_getc)(struct platform_device *pdev);
+ void (*uart_putc)(struct platform_device *pdev, unsigned int c);
+ void (*uart_flush)(struct platform_device *pdev);
+ void (*uart_enable)(struct platform_device *pdev);
+ void (*uart_disable)(struct platform_device *pdev);
+
+ int (*uart_dev_suspend)(struct platform_device *pdev);
+ int (*uart_dev_resume)(struct platform_device *pdev);
+
+ void (*fiq_enable)(struct platform_device *pdev, unsigned int fiq,
+ bool enable);
+ void (*fiq_ack)(struct platform_device *pdev, unsigned int fiq);
+
+ void (*force_irq)(struct platform_device *pdev, unsigned int irq);
+ void (*force_irq_ack)(struct platform_device *pdev, unsigned int irq);
+};
+
+#endif
diff --git a/drivers/staging/android/fiq_debugger/fiq_debugger_arm.c b/drivers/staging/android/fiq_debugger/fiq_debugger_arm.c
new file mode 100644
index 000000000000..8b3e0137be1a
--- /dev/null
+++ b/drivers/staging/android/fiq_debugger/fiq_debugger_arm.c
@@ -0,0 +1,240 @@
+/*
+ * Copyright (C) 2014 Google, Inc.
+ * Author: Colin Cross <ccross@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/ptrace.h>
+#include <linux/uaccess.h>
+
+#include <asm/stacktrace.h>
+
+#include "fiq_debugger_priv.h"
+
+static char *mode_name(unsigned cpsr)
+{
+ switch (cpsr & MODE_MASK) {
+ case USR_MODE: return "USR";
+ case FIQ_MODE: return "FIQ";
+ case IRQ_MODE: return "IRQ";
+ case SVC_MODE: return "SVC";
+ case ABT_MODE: return "ABT";
+ case UND_MODE: return "UND";
+ case SYSTEM_MODE: return "SYS";
+ default: return "???";
+ }
+}
+
+void fiq_debugger_dump_pc(struct fiq_debugger_output *output,
+ const struct pt_regs *regs)
+{
+ output->printf(output, " pc %08x cpsr %08x mode %s\n",
+ regs->ARM_pc, regs->ARM_cpsr, mode_name(regs->ARM_cpsr));
+}
+
+void fiq_debugger_dump_regs(struct fiq_debugger_output *output,
+ const struct pt_regs *regs)
+{
+ output->printf(output,
+ " r0 %08x r1 %08x r2 %08x r3 %08x\n",
+ regs->ARM_r0, regs->ARM_r1, regs->ARM_r2, regs->ARM_r3);
+ output->printf(output,
+ " r4 %08x r5 %08x r6 %08x r7 %08x\n",
+ regs->ARM_r4, regs->ARM_r5, regs->ARM_r6, regs->ARM_r7);
+ output->printf(output,
+ " r8 %08x r9 %08x r10 %08x r11 %08x mode %s\n",
+ regs->ARM_r8, regs->ARM_r9, regs->ARM_r10, regs->ARM_fp,
+ mode_name(regs->ARM_cpsr));
+ output->printf(output,
+ " ip %08x sp %08x lr %08x pc %08x cpsr %08x\n",
+ regs->ARM_ip, regs->ARM_sp, regs->ARM_lr, regs->ARM_pc,
+ regs->ARM_cpsr);
+}
+
+struct mode_regs {
+ unsigned long sp_svc;
+ unsigned long lr_svc;
+ unsigned long spsr_svc;
+
+ unsigned long sp_abt;
+ unsigned long lr_abt;
+ unsigned long spsr_abt;
+
+ unsigned long sp_und;
+ unsigned long lr_und;
+ unsigned long spsr_und;
+
+ unsigned long sp_irq;
+ unsigned long lr_irq;
+ unsigned long spsr_irq;
+
+ unsigned long r8_fiq;
+ unsigned long r9_fiq;
+ unsigned long r10_fiq;
+ unsigned long r11_fiq;
+ unsigned long r12_fiq;
+ unsigned long sp_fiq;
+ unsigned long lr_fiq;
+ unsigned long spsr_fiq;
+};
+
+static void __naked get_mode_regs(struct mode_regs *regs)
+{
+ asm volatile (
+ "mrs r1, cpsr\n"
+ "msr cpsr_c, #0xd3 @(SVC_MODE | PSR_I_BIT | PSR_F_BIT)\n"
+ "stmia r0!, {r13 - r14}\n"
+ "mrs r2, spsr\n"
+ "msr cpsr_c, #0xd7 @(ABT_MODE | PSR_I_BIT | PSR_F_BIT)\n"
+ "stmia r0!, {r2, r13 - r14}\n"
+ "mrs r2, spsr\n"
+ "msr cpsr_c, #0xdb @(UND_MODE | PSR_I_BIT | PSR_F_BIT)\n"
+ "stmia r0!, {r2, r13 - r14}\n"
+ "mrs r2, spsr\n"
+ "msr cpsr_c, #0xd2 @(IRQ_MODE | PSR_I_BIT | PSR_F_BIT)\n"
+ "stmia r0!, {r2, r13 - r14}\n"
+ "mrs r2, spsr\n"
+ "msr cpsr_c, #0xd1 @(FIQ_MODE | PSR_I_BIT | PSR_F_BIT)\n"
+ "stmia r0!, {r2, r8 - r14}\n"
+ "mrs r2, spsr\n"
+ "stmia r0!, {r2}\n"
+ "msr cpsr_c, r1\n"
+ "bx lr\n");
+}
+
+
+void fiq_debugger_dump_allregs(struct fiq_debugger_output *output,
+ const struct pt_regs *regs)
+{
+ struct mode_regs mode_regs;
+ unsigned long mode = regs->ARM_cpsr & MODE_MASK;
+
+ fiq_debugger_dump_regs(output, regs);
+ get_mode_regs(&mode_regs);
+
+ output->printf(output,
+ "%csvc: sp %08x lr %08x spsr %08x\n",
+ mode == SVC_MODE ? '*' : ' ',
+ mode_regs.sp_svc, mode_regs.lr_svc, mode_regs.spsr_svc);
+ output->printf(output,
+ "%cabt: sp %08x lr %08x spsr %08x\n",
+ mode == ABT_MODE ? '*' : ' ',
+ mode_regs.sp_abt, mode_regs.lr_abt, mode_regs.spsr_abt);
+ output->printf(output,
+ "%cund: sp %08x lr %08x spsr %08x\n",
+ mode == UND_MODE ? '*' : ' ',
+ mode_regs.sp_und, mode_regs.lr_und, mode_regs.spsr_und);
+ output->printf(output,
+ "%cirq: sp %08x lr %08x spsr %08x\n",
+ mode == IRQ_MODE ? '*' : ' ',
+ mode_regs.sp_irq, mode_regs.lr_irq, mode_regs.spsr_irq);
+ output->printf(output,
+ "%cfiq: r8 %08x r9 %08x r10 %08x r11 %08x r12 %08x\n",
+ mode == FIQ_MODE ? '*' : ' ',
+ mode_regs.r8_fiq, mode_regs.r9_fiq, mode_regs.r10_fiq,
+ mode_regs.r11_fiq, mode_regs.r12_fiq);
+ output->printf(output,
+ " fiq: sp %08x lr %08x spsr %08x\n",
+ mode_regs.sp_fiq, mode_regs.lr_fiq, mode_regs.spsr_fiq);
+}
+
+struct stacktrace_state {
+ struct fiq_debugger_output *output;
+ unsigned int depth;
+};
+
+static int report_trace(struct stackframe *frame, void *d)
+{
+ struct stacktrace_state *sts = d;
+
+ if (sts->depth) {
+ sts->output->printf(sts->output,
+ " pc: %p (%pF), lr %p (%pF), sp %p, fp %p\n",
+ frame->pc, frame->pc, frame->lr, frame->lr,
+ frame->sp, frame->fp);
+ sts->depth--;
+ return 0;
+ }
+ sts->output->printf(sts->output, " ...\n");
+
+ return sts->depth == 0;
+}
+
+struct frame_tail {
+ struct frame_tail *fp;
+ unsigned long sp;
+ unsigned long lr;
+} __attribute__((packed));
+
+static struct frame_tail *user_backtrace(struct fiq_debugger_output *output,
+ struct frame_tail *tail)
+{
+ struct frame_tail buftail[2];
+
+ /* Also check accessibility of one struct frame_tail beyond */
+ if (!access_ok(VERIFY_READ, tail, sizeof(buftail))) {
+ output->printf(output, " invalid frame pointer %p\n",
+ tail);
+ return NULL;
+ }
+ if (__copy_from_user_inatomic(buftail, tail, sizeof(buftail))) {
+ output->printf(output,
+ " failed to copy frame pointer %p\n", tail);
+ return NULL;
+ }
+
+ output->printf(output, " %p\n", buftail[0].lr);
+
+ /* frame pointers should strictly progress back up the stack
+ * (towards higher addresses) */
+ if (tail >= buftail[0].fp)
+ return NULL;
+
+ return buftail[0].fp-1;
+}
+
+void fiq_debugger_dump_stacktrace(struct fiq_debugger_output *output,
+ const struct pt_regs *regs, unsigned int depth, void *ssp)
+{
+ struct frame_tail *tail;
+ struct thread_info *real_thread_info = THREAD_INFO(ssp);
+ struct stacktrace_state sts;
+
+ sts.depth = depth;
+ sts.output = output;
+ *current_thread_info() = *real_thread_info;
+
+ if (!current)
+ output->printf(output, "current NULL\n");
+ else
+ output->printf(output, "pid: %d comm: %s\n",
+ current->pid, current->comm);
+ fiq_debugger_dump_regs(output, regs);
+
+ if (!user_mode(regs)) {
+ struct stackframe frame;
+ frame.fp = regs->ARM_fp;
+ frame.sp = regs->ARM_sp;
+ frame.lr = regs->ARM_lr;
+ frame.pc = regs->ARM_pc;
+ output->printf(output,
+ " pc: %p (%pF), lr %p (%pF), sp %p, fp %p\n",
+ regs->ARM_pc, regs->ARM_pc, regs->ARM_lr, regs->ARM_lr,
+ regs->ARM_sp, regs->ARM_fp);
+ walk_stackframe(&frame, report_trace, &sts);
+ return;
+ }
+
+ tail = ((struct frame_tail *) regs->ARM_fp) - 1;
+ while (depth-- && tail && !((unsigned long) tail & 3))
+ tail = user_backtrace(output, tail);
+}
diff --git a/drivers/staging/android/fiq_debugger/fiq_debugger_arm64.c b/drivers/staging/android/fiq_debugger/fiq_debugger_arm64.c
new file mode 100644
index 000000000000..99c6584fcfa5
--- /dev/null
+++ b/drivers/staging/android/fiq_debugger/fiq_debugger_arm64.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright (C) 2014 Google, Inc.
+ * Author: Colin Cross <ccross@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/ptrace.h>
+#include <asm/stacktrace.h>
+
+#include "fiq_debugger_priv.h"
+
+static char *mode_name(const struct pt_regs *regs)
+{
+ if (compat_user_mode(regs)) {
+ return "USR";
+ } else {
+ switch (processor_mode(regs)) {
+ case PSR_MODE_EL0t: return "EL0t";
+ case PSR_MODE_EL1t: return "EL1t";
+ case PSR_MODE_EL1h: return "EL1h";
+ case PSR_MODE_EL2t: return "EL2t";
+ case PSR_MODE_EL2h: return "EL2h";
+ default: return "???";
+ }
+ }
+}
+
+void fiq_debugger_dump_pc(struct fiq_debugger_output *output,
+ const struct pt_regs *regs)
+{
+ output->printf(output, " pc %016lx cpsr %08lx mode %s\n",
+ regs->pc, regs->pstate, mode_name(regs));
+}
+
+void fiq_debugger_dump_regs_aarch32(struct fiq_debugger_output *output,
+ const struct pt_regs *regs)
+{
+ output->printf(output, " r0 %08x r1 %08x r2 %08x r3 %08x\n",
+ regs->compat_usr(0), regs->compat_usr(1),
+ regs->compat_usr(2), regs->compat_usr(3));
+ output->printf(output, " r4 %08x r5 %08x r6 %08x r7 %08x\n",
+ regs->compat_usr(4), regs->compat_usr(5),
+ regs->compat_usr(6), regs->compat_usr(7));
+ output->printf(output, " r8 %08x r9 %08x r10 %08x r11 %08x\n",
+ regs->compat_usr(8), regs->compat_usr(9),
+ regs->compat_usr(10), regs->compat_usr(11));
+ output->printf(output, " ip %08x sp %08x lr %08x pc %08x\n",
+ regs->compat_usr(12), regs->compat_sp,
+ regs->compat_lr, regs->pc);
+ output->printf(output, " cpsr %08x (%s)\n",
+ regs->pstate, mode_name(regs));
+}
+
+void fiq_debugger_dump_regs_aarch64(struct fiq_debugger_output *output,
+ const struct pt_regs *regs)
+{
+
+ output->printf(output, " x0 %016lx x1 %016lx\n",
+ regs->regs[0], regs->regs[1]);
+ output->printf(output, " x2 %016lx x3 %016lx\n",
+ regs->regs[2], regs->regs[3]);
+ output->printf(output, " x4 %016lx x5 %016lx\n",
+ regs->regs[4], regs->regs[5]);
+ output->printf(output, " x6 %016lx x7 %016lx\n",
+ regs->regs[6], regs->regs[7]);
+ output->printf(output, " x8 %016lx x9 %016lx\n",
+ regs->regs[8], regs->regs[9]);
+ output->printf(output, " x10 %016lx x11 %016lx\n",
+ regs->regs[10], regs->regs[11]);
+ output->printf(output, " x12 %016lx x13 %016lx\n",
+ regs->regs[12], regs->regs[13]);
+ output->printf(output, " x14 %016lx x15 %016lx\n",
+ regs->regs[14], regs->regs[15]);
+ output->printf(output, " x16 %016lx x17 %016lx\n",
+ regs->regs[16], regs->regs[17]);
+ output->printf(output, " x18 %016lx x19 %016lx\n",
+ regs->regs[18], regs->regs[19]);
+ output->printf(output, " x20 %016lx x21 %016lx\n",
+ regs->regs[20], regs->regs[21]);
+ output->printf(output, " x22 %016lx x23 %016lx\n",
+ regs->regs[22], regs->regs[23]);
+ output->printf(output, " x24 %016lx x25 %016lx\n",
+ regs->regs[24], regs->regs[25]);
+ output->printf(output, " x26 %016lx x27 %016lx\n",
+ regs->regs[26], regs->regs[27]);
+ output->printf(output, " x28 %016lx x29 %016lx\n",
+ regs->regs[28], regs->regs[29]);
+ output->printf(output, " x30 %016lx sp %016lx\n",
+ regs->regs[30], regs->sp);
+ output->printf(output, " pc %016lx cpsr %08x (%s)\n",
+ regs->pc, regs->pstate, mode_name(regs));
+}
+
+void fiq_debugger_dump_regs(struct fiq_debugger_output *output,
+ const struct pt_regs *regs)
+{
+ if (compat_user_mode(regs))
+ fiq_debugger_dump_regs_aarch32(output, regs);
+ else
+ fiq_debugger_dump_regs_aarch64(output, regs);
+}
+
+#define READ_SPECIAL_REG(x) ({ \
+ u64 val; \
+ asm volatile ("mrs %0, " # x : "=r"(val)); \
+ val; \
+})
+
+void fiq_debugger_dump_allregs(struct fiq_debugger_output *output,
+ const struct pt_regs *regs)
+{
+ u32 pstate = READ_SPECIAL_REG(CurrentEl);
+ bool in_el2 = (pstate & PSR_MODE_MASK) >= PSR_MODE_EL2t;
+
+ fiq_debugger_dump_regs(output, regs);
+
+ output->printf(output, " sp_el0 %016lx\n",
+ READ_SPECIAL_REG(sp_el0));
+
+ if (in_el2)
+ output->printf(output, " sp_el1 %016lx\n",
+ READ_SPECIAL_REG(sp_el1));
+
+ output->printf(output, " elr_el1 %016lx\n",
+ READ_SPECIAL_REG(elr_el1));
+
+ output->printf(output, " spsr_el1 %08lx\n",
+ READ_SPECIAL_REG(spsr_el1));
+
+ if (in_el2) {
+ output->printf(output, " spsr_irq %08lx\n",
+ READ_SPECIAL_REG(spsr_irq));
+ output->printf(output, " spsr_abt %08lx\n",
+ READ_SPECIAL_REG(spsr_abt));
+ output->printf(output, " spsr_und %08lx\n",
+ READ_SPECIAL_REG(spsr_und));
+ output->printf(output, " spsr_fiq %08lx\n",
+ READ_SPECIAL_REG(spsr_fiq));
+ output->printf(output, " spsr_el2 %08lx\n",
+ READ_SPECIAL_REG(elr_el2));
+ output->printf(output, " spsr_el2 %08lx\n",
+ READ_SPECIAL_REG(spsr_el2));
+ }
+}
+
+struct stacktrace_state {
+ struct fiq_debugger_output *output;
+ unsigned int depth;
+};
+
+static int report_trace(struct stackframe *frame, void *d)
+{
+ struct stacktrace_state *sts = d;
+
+ if (sts->depth) {
+ sts->output->printf(sts->output, "%pF:\n", frame->pc);
+ sts->output->printf(sts->output,
+ " pc %016lx sp %016lx fp %016lx\n",
+ frame->pc, frame->sp, frame->fp);
+ sts->depth--;
+ return 0;
+ }
+ sts->output->printf(sts->output, " ...\n");
+
+ return sts->depth == 0;
+}
+
+void fiq_debugger_dump_stacktrace(struct fiq_debugger_output *output,
+ const struct pt_regs *regs, unsigned int depth, void *ssp)
+{
+ struct thread_info *real_thread_info = THREAD_INFO(ssp);
+ struct stacktrace_state sts;
+
+ sts.depth = depth;
+ sts.output = output;
+ *current_thread_info() = *real_thread_info;
+
+ if (!current)
+ output->printf(output, "current NULL\n");
+ else
+ output->printf(output, "pid: %d comm: %s\n",
+ current->pid, current->comm);
+ fiq_debugger_dump_regs(output, regs);
+
+ if (!user_mode(regs)) {
+ struct stackframe frame;
+ frame.fp = regs->regs[29];
+ frame.sp = regs->sp;
+ frame.pc = regs->pc;
+ output->printf(output, "\n");
+ walk_stackframe(&frame, report_trace, &sts);
+ }
+}
diff --git a/drivers/staging/android/fiq_debugger/fiq_debugger_priv.h b/drivers/staging/android/fiq_debugger/fiq_debugger_priv.h
new file mode 100644
index 000000000000..d5d051f727a8
--- /dev/null
+++ b/drivers/staging/android/fiq_debugger/fiq_debugger_priv.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) 2014 Google, Inc.
+ * Author: Colin Cross <ccross@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _FIQ_DEBUGGER_PRIV_H_
+#define _FIQ_DEBUGGER_PRIV_H_
+
+#define THREAD_INFO(sp) ((struct thread_info *) \
+ ((unsigned long)(sp) & ~(THREAD_SIZE - 1)))
+
+struct fiq_debugger_output {
+ void (*printf)(struct fiq_debugger_output *output, const char *fmt, ...);
+};
+
+struct pt_regs;
+
+void fiq_debugger_dump_pc(struct fiq_debugger_output *output,
+ const struct pt_regs *regs);
+void fiq_debugger_dump_regs(struct fiq_debugger_output *output,
+ const struct pt_regs *regs);
+void fiq_debugger_dump_allregs(struct fiq_debugger_output *output,
+ const struct pt_regs *regs);
+void fiq_debugger_dump_stacktrace(struct fiq_debugger_output *output,
+ const struct pt_regs *regs, unsigned int depth, void *ssp);
+
+#endif
diff --git a/drivers/staging/android/fiq_debugger/fiq_debugger_ringbuf.h b/drivers/staging/android/fiq_debugger/fiq_debugger_ringbuf.h
new file mode 100644
index 000000000000..10c3c5d09098
--- /dev/null
+++ b/drivers/staging/android/fiq_debugger/fiq_debugger_ringbuf.h
@@ -0,0 +1,94 @@
+/*
+ * drivers/staging/android/fiq_debugger/fiq_debugger_ringbuf.h
+ *
+ * simple lockless ringbuffer
+ *
+ * Copyright (C) 2010 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+struct fiq_debugger_ringbuf {
+ int len;
+ int head;
+ int tail;
+ u8 buf[];
+};
+
+
+static inline struct fiq_debugger_ringbuf *fiq_debugger_ringbuf_alloc(int len)
+{
+ struct fiq_debugger_ringbuf *rbuf;
+
+ rbuf = kzalloc(sizeof(*rbuf) + len, GFP_KERNEL);
+ if (rbuf == NULL)
+ return NULL;
+
+ rbuf->len = len;
+ rbuf->head = 0;
+ rbuf->tail = 0;
+ smp_mb();
+
+ return rbuf;
+}
+
+static inline void fiq_debugger_ringbuf_free(struct fiq_debugger_ringbuf *rbuf)
+{
+ kfree(rbuf);
+}
+
+static inline int fiq_debugger_ringbuf_level(struct fiq_debugger_ringbuf *rbuf)
+{
+ int level = rbuf->head - rbuf->tail;
+
+ if (level < 0)
+ level = rbuf->len + level;
+
+ return level;
+}
+
+static inline int fiq_debugger_ringbuf_room(struct fiq_debugger_ringbuf *rbuf)
+{
+ return rbuf->len - fiq_debugger_ringbuf_level(rbuf) - 1;
+}
+
+static inline u8
+fiq_debugger_ringbuf_peek(struct fiq_debugger_ringbuf *rbuf, int i)
+{
+ return rbuf->buf[(rbuf->tail + i) % rbuf->len];
+}
+
+static inline int
+fiq_debugger_ringbuf_consume(struct fiq_debugger_ringbuf *rbuf, int count)
+{
+ count = min(count, fiq_debugger_ringbuf_level(rbuf));
+
+ rbuf->tail = (rbuf->tail + count) % rbuf->len;
+ smp_mb();
+
+ return count;
+}
+
+static inline int
+fiq_debugger_ringbuf_push(struct fiq_debugger_ringbuf *rbuf, u8 datum)
+{
+ if (fiq_debugger_ringbuf_room(rbuf) == 0)
+ return 0;
+
+ rbuf->buf[rbuf->head] = datum;
+ smp_mb();
+ rbuf->head = (rbuf->head + 1) % rbuf->len;
+ smp_mb();
+
+ return 1;
+}
diff --git a/drivers/staging/android/fiq_debugger/fiq_watchdog.c b/drivers/staging/android/fiq_debugger/fiq_watchdog.c
new file mode 100644
index 000000000000..194b54138417
--- /dev/null
+++ b/drivers/staging/android/fiq_debugger/fiq_watchdog.c
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) 2014 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/pstore_ram.h>
+
+#include "fiq_watchdog.h"
+#include "fiq_debugger_priv.h"
+
+static DEFINE_RAW_SPINLOCK(fiq_watchdog_lock);
+
+static void fiq_watchdog_printf(struct fiq_debugger_output *output,
+ const char *fmt, ...)
+{
+ char buf[256];
+ va_list ap;
+ int len;
+
+ va_start(ap, fmt);
+ len = vscnprintf(buf, sizeof(buf), fmt, ap);
+ va_end(ap);
+
+ ramoops_console_write_buf(buf, len);
+}
+
+struct fiq_debugger_output fiq_watchdog_output = {
+ .printf = fiq_watchdog_printf,
+};
+
+void fiq_watchdog_triggered(const struct pt_regs *regs, void *svc_sp)
+{
+ char msg[24];
+ int len;
+
+ raw_spin_lock(&fiq_watchdog_lock);
+
+ len = scnprintf(msg, sizeof(msg), "watchdog fiq cpu %d\n",
+ THREAD_INFO(svc_sp)->cpu);
+ ramoops_console_write_buf(msg, len);
+
+ fiq_debugger_dump_stacktrace(&fiq_watchdog_output, regs, 100, svc_sp);
+
+ raw_spin_unlock(&fiq_watchdog_lock);
+}
diff --git a/drivers/staging/android/fiq_debugger/fiq_watchdog.h b/drivers/staging/android/fiq_debugger/fiq_watchdog.h
new file mode 100644
index 000000000000..c6b507f8d976
--- /dev/null
+++ b/drivers/staging/android/fiq_debugger/fiq_watchdog.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (C) 2014 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _FIQ_WATCHDOG_H_
+#define _FIQ_WATCHDOG_H_
+
+void fiq_watchdog_triggered(const struct pt_regs *regs, void *svc_sp);
+
+#endif
diff --git a/drivers/staging/android/ion/Kconfig b/drivers/staging/android/ion/Kconfig
index 345234624492..356e10969272 100644
--- a/drivers/staging/android/ion/Kconfig
+++ b/drivers/staging/android/ion/Kconfig
@@ -33,3 +33,10 @@ config ION_TEGRA
help
Choose this option if you wish to use ion on an nVidia Tegra.
+config ION_POOL_CACHE_POLICY
+ bool "Ion set page pool cache policy"
+ depends on ION && X86
+ default y if X86
+ help
+ Choose this option if need to explicity set cache policy of the
+ pages in the page pool.
diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c
index 8cc4cdb33400..e76ae28492c5 100644
--- a/drivers/staging/android/ion/ion.c
+++ b/drivers/staging/android/ion/ion.c
@@ -213,10 +213,10 @@ static struct ion_buffer *ion_buffer_create(struct ion_heap *heap,
"heap->ops->map_dma should return ERR_PTR on error"))
table = ERR_PTR(-EINVAL);
if (IS_ERR(table)) {
- heap->ops->free(buffer);
- kfree(buffer);
- return ERR_CAST(table);
+ ret = -EINVAL;
+ goto err1;
}
+
buffer->sg_table = table;
if (ion_buffer_fault_user_mappings(buffer)) {
int num_pages = PAGE_ALIGN(buffer->size) / PAGE_SIZE;
@@ -226,7 +226,7 @@ static struct ion_buffer *ion_buffer_create(struct ion_heap *heap,
buffer->pages = vmalloc(sizeof(struct page *) * num_pages);
if (!buffer->pages) {
ret = -ENOMEM;
- goto err1;
+ goto err;
}
for_each_sg(table->sgl, sg, table->nents, i) {
@@ -235,9 +235,6 @@ static struct ion_buffer *ion_buffer_create(struct ion_heap *heap,
for (j = 0; j < sg->length / PAGE_SIZE; j++)
buffer->pages[k++] = page++;
}
-
- if (ret)
- goto err;
}
buffer->dev = dev;
@@ -250,7 +247,7 @@ static struct ion_buffer *ion_buffer_create(struct ion_heap *heap,
our systems the only dma_address space is physical addresses.
Additionally, we can't afford the overhead of invalidating every
allocation via dma_map_sg. The implicit contract here is that
- memory comming from the heaps is ready for dma, ie if it has a
+ memory coming from the heaps is ready for dma, ie if it has a
cached mapping that mapping has been invalidated */
for_each_sg(buffer->sg_table->sgl, sg, buffer->sg_table->nents, i)
sg_dma_address(sg) = sg_phys(sg);
@@ -261,10 +258,8 @@ static struct ion_buffer *ion_buffer_create(struct ion_heap *heap,
err:
heap->ops->unmap_dma(heap, buffer);
- heap->ops->free(buffer);
err1:
- if (buffer->pages)
- vfree(buffer->pages);
+ heap->ops->free(buffer);
err2:
kfree(buffer);
return ERR_PTR(ret);
@@ -927,7 +922,7 @@ void ion_pages_sync_for_device(struct device *dev, struct page *page,
sg_set_page(&sg, page, size, 0);
/*
* This is not correct - sg_dma_address needs a dma_addr_t that is valid
- * for the the targeted device, but this works on the currently targeted
+ * for the targeted device, but this works on the currently targeted
* hardware.
*/
sg_dma_address(&sg) = page_to_phys(page);
@@ -1492,7 +1487,6 @@ static const struct file_operations debug_heap_fops = {
.release = single_release,
};
-#ifdef DEBUG_HEAP_SHRINKER
static int debug_shrink_set(void *data, u64 val)
{
struct ion_heap *heap = data;
@@ -1500,15 +1494,14 @@ static int debug_shrink_set(void *data, u64 val)
int objs;
sc.gfp_mask = -1;
- sc.nr_to_scan = 0;
+ sc.nr_to_scan = val;
- if (!val)
- return 0;
-
- objs = heap->shrinker.shrink(&heap->shrinker, &sc);
- sc.nr_to_scan = objs;
+ if (!val) {
+ objs = heap->shrinker.count_objects(&heap->shrinker, &sc);
+ sc.nr_to_scan = objs;
+ }
- heap->shrinker.shrink(&heap->shrinker, &sc);
+ heap->shrinker.scan_objects(&heap->shrinker, &sc);
return 0;
}
@@ -1521,14 +1514,13 @@ static int debug_shrink_get(void *data, u64 *val)
sc.gfp_mask = -1;
sc.nr_to_scan = 0;
- objs = heap->shrinker.shrink(&heap->shrinker, &sc);
+ objs = heap->shrinker.count_objects(&heap->shrinker, &sc);
*val = objs;
return 0;
}
DEFINE_SIMPLE_ATTRIBUTE(debug_shrink_fops, debug_shrink_get,
debug_shrink_set, "%llu\n");
-#endif
void ion_device_add_heap(struct ion_device *dev, struct ion_heap *heap)
{
@@ -1563,8 +1555,7 @@ void ion_device_add_heap(struct ion_device *dev, struct ion_heap *heap)
path, heap->name);
}
-#ifdef DEBUG_HEAP_SHRINKER
- if (heap->shrinker.shrink) {
+ if (heap->shrinker.count_objects && heap->shrinker.scan_objects) {
char debug_name[64];
snprintf(debug_name, 64, "%s_shrink", heap->name);
@@ -1579,7 +1570,7 @@ void ion_device_add_heap(struct ion_device *dev, struct ion_heap *heap)
path, debug_name);
}
}
-#endif
+
up_write(&dev->lock);
}
diff --git a/drivers/staging/android/ion/ion.h b/drivers/staging/android/ion/ion.h
index d305bb7e9a74..443db8459a9e 100644
--- a/drivers/staging/android/ion/ion.h
+++ b/drivers/staging/android/ion/ion.h
@@ -76,7 +76,7 @@ struct ion_platform_data {
* size
*
* Calls memblock reserve to set aside memory for heaps that are
- * located at specific memory addresses or of specfic sizes not
+ * located at specific memory addresses or of specific sizes not
* managed by the kernel
*/
void ion_reserve(struct ion_platform_data *data);
diff --git a/drivers/staging/android/ion/ion_carveout_heap.c b/drivers/staging/android/ion/ion_carveout_heap.c
index 9156d8238c97..e702ce6461fc 100644..100755
--- a/drivers/staging/android/ion/ion_carveout_heap.c
+++ b/drivers/staging/android/ion/ion_carveout_heap.c
@@ -167,7 +167,7 @@ struct ion_heap *ion_carveout_heap_create(struct ion_platform_heap *heap_data)
if (!carveout_heap)
return ERR_PTR(-ENOMEM);
- carveout_heap->pool = gen_pool_create(12, -1);
+ carveout_heap->pool = gen_pool_create(PAGE_SHIFT, -1);
if (!carveout_heap->pool) {
kfree(carveout_heap);
return ERR_PTR(-ENOMEM);
diff --git a/drivers/staging/android/ion/ion_page_pool.c b/drivers/staging/android/ion/ion_page_pool.c
index 5864f3dfcbc6..b0217488ef92 100644
--- a/drivers/staging/android/ion/ion_page_pool.c
+++ b/drivers/staging/android/ion/ion_page_pool.c
@@ -30,6 +30,8 @@ static void *ion_page_pool_alloc_pages(struct ion_page_pool *pool)
if (!page)
return NULL;
+ ion_page_pool_alloc_set_cache_policy(pool, page);
+
ion_pages_sync_for_device(NULL, page, PAGE_SIZE << pool->order,
DMA_BIDIRECTIONAL);
return page;
@@ -38,6 +40,7 @@ static void *ion_page_pool_alloc_pages(struct ion_page_pool *pool)
static void ion_page_pool_free_pages(struct ion_page_pool *pool,
struct page *page)
{
+ ion_page_pool_free_set_cache_policy(pool, page);
__free_pages(page, pool->order);
}
@@ -103,6 +106,11 @@ void ion_page_pool_free(struct ion_page_pool *pool, struct page *page)
ion_page_pool_free_pages(pool, page);
}
+void ion_page_pool_free_immediate(struct ion_page_pool *pool, struct page *page)
+{
+ ion_page_pool_free_pages(pool, page);
+}
+
static int ion_page_pool_total(struct ion_page_pool *pool, bool high)
{
int count = pool->low_count;
@@ -120,7 +128,7 @@ int ion_page_pool_shrink(struct ion_page_pool *pool, gfp_t gfp_mask,
bool high;
if (current_is_kswapd())
- high = 1;
+ high = true;
else
high = !!(gfp_mask & __GFP_HIGHMEM);
diff --git a/drivers/staging/android/ion/ion_priv.h b/drivers/staging/android/ion/ion_priv.h
index c8f01757abfa..0e3b8a6f3bf7 100644
--- a/drivers/staging/android/ion/ion_priv.h
+++ b/drivers/staging/android/ion/ion_priv.h
@@ -26,6 +26,9 @@
#include <linux/sched.h>
#include <linux/shrinker.h>
#include <linux/types.h>
+#ifdef CONFIG_ION_POOL_CACHE_POLICY
+#include <asm/cacheflush.h>
+#endif
#include "ion.h"
@@ -345,7 +348,7 @@ void ion_carveout_free(struct ion_heap *heap, ion_phys_addr_t addr,
* functions for creating and destroying a heap pool -- allows you
* to keep a pool of pre allocated memory to use from your heap. Keeping
* a pool of memory that is ready for dma, ie any cached mapping have been
- * invalidated from the cache, provides a significant peformance benefit on
+ * invalidated from the cache, provides a significant performance benefit on
* many systems */
/**
@@ -362,7 +365,7 @@ void ion_carveout_free(struct ion_heap *heap, ion_phys_addr_t addr,
*
* Allows you to keep a pool of pre allocated pages to use from your heap.
* Keeping a pool of pages that is ready for dma, ie any cached mapping have
- * been invalidated from the cache, provides a significant peformance benefit
+ * been invalidated from the cache, provides a significant performance benefit
* on many systems
*/
struct ion_page_pool {
@@ -380,6 +383,37 @@ struct ion_page_pool *ion_page_pool_create(gfp_t gfp_mask, unsigned int order);
void ion_page_pool_destroy(struct ion_page_pool *);
struct page *ion_page_pool_alloc(struct ion_page_pool *);
void ion_page_pool_free(struct ion_page_pool *, struct page *);
+void ion_page_pool_free_immediate(struct ion_page_pool *, struct page *);
+
+#ifdef CONFIG_ION_POOL_CACHE_POLICY
+static inline void ion_page_pool_alloc_set_cache_policy
+ (struct ion_page_pool *pool,
+ struct page *page){
+ void *va = page_address(page);
+
+ if (va)
+ set_memory_wc((unsigned long)va, 1 << pool->order);
+}
+
+static inline void ion_page_pool_free_set_cache_policy
+ (struct ion_page_pool *pool,
+ struct page *page){
+ void *va = page_address(page);
+
+ if (va)
+ set_memory_wb((unsigned long)va, 1 << pool->order);
+
+}
+#else
+static inline void ion_page_pool_alloc_set_cache_policy
+ (struct ion_page_pool *pool,
+ struct page *page){ }
+
+static inline void ion_page_pool_free_set_cache_policy
+ (struct ion_page_pool *pool,
+ struct page *page){ }
+#endif
+
/** ion_page_pool_shrink - shrinks the size of the memory cached in the pool
* @pool: the pool
diff --git a/drivers/staging/android/ion/ion_system_heap.c b/drivers/staging/android/ion/ion_system_heap.c
index da2a63c0a9ba..9adfd6de8f8c 100644
--- a/drivers/staging/android/ion/ion_system_heap.c
+++ b/drivers/staging/android/ion/ion_system_heap.c
@@ -83,10 +83,12 @@ static void free_buffer_page(struct ion_system_heap *heap,
unsigned int order = compound_order(page);
bool cached = ion_buffer_cached(buffer);
- if (!cached && !(buffer->private_flags & ION_PRIV_FLAG_SHRINKER_FREE)) {
+ if (!cached) {
struct ion_page_pool *pool = heap->pools[order_to_index(order)];
-
- ion_page_pool_free(pool, page);
+ if (buffer->private_flags & ION_PRIV_FLAG_SHRINKER_FREE)
+ ion_page_pool_free_immediate(pool, page);
+ else
+ ion_page_pool_free(pool, page);
} else {
__free_pages(page, order);
}
diff --git a/drivers/staging/android/logger.c b/drivers/staging/android/logger.c
deleted file mode 100644
index a673ffa34aa3..000000000000
--- a/drivers/staging/android/logger.c
+++ /dev/null
@@ -1,808 +0,0 @@
-/*
- * drivers/misc/logger.c
- *
- * A Logging Subsystem
- *
- * Copyright (C) 2007-2008 Google, Inc.
- *
- * Robert Love <rlove@google.com>
- *
- * This software is licensed under the terms of the GNU General Public
- * License version 2, as published by the Free Software Foundation, and
- * may be copied, distributed, and modified under those terms.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- */
-
-#define pr_fmt(fmt) "logger: " fmt
-
-#include <linux/sched.h>
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/miscdevice.h>
-#include <linux/uaccess.h>
-#include <linux/poll.h>
-#include <linux/slab.h>
-#include <linux/time.h>
-#include <linux/vmalloc.h>
-#include <linux/aio.h>
-#include "logger.h"
-
-#include <asm/ioctls.h>
-
-/**
- * struct logger_log - represents a specific log, such as 'main' or 'radio'
- * @buffer: The actual ring buffer
- * @misc: The "misc" device representing the log
- * @wq: The wait queue for @readers
- * @readers: This log's readers
- * @mutex: The mutex that protects the @buffer
- * @w_off: The current write head offset
- * @head: The head, or location that readers start reading at.
- * @size: The size of the log
- * @logs: The list of log channels
- *
- * This structure lives from module insertion until module removal, so it does
- * not need additional reference counting. The structure is protected by the
- * mutex 'mutex'.
- */
-struct logger_log {
- unsigned char *buffer;
- struct miscdevice misc;
- wait_queue_head_t wq;
- struct list_head readers;
- struct mutex mutex;
- size_t w_off;
- size_t head;
- size_t size;
- struct list_head logs;
-};
-
-static LIST_HEAD(log_list);
-
-
-/**
- * struct logger_reader - a logging device open for reading
- * @log: The associated log
- * @list: The associated entry in @logger_log's list
- * @r_off: The current read head offset.
- * @r_all: Reader can read all entries
- * @r_ver: Reader ABI version
- *
- * This object lives from open to release, so we don't need additional
- * reference counting. The structure is protected by log->mutex.
- */
-struct logger_reader {
- struct logger_log *log;
- struct list_head list;
- size_t r_off;
- bool r_all;
- int r_ver;
-};
-
-/* logger_offset - returns index 'n' into the log via (optimized) modulus */
-static size_t logger_offset(struct logger_log *log, size_t n)
-{
- return n & (log->size - 1);
-}
-
-
-/*
- * file_get_log - Given a file structure, return the associated log
- *
- * This isn't aesthetic. We have several goals:
- *
- * 1) Need to quickly obtain the associated log during an I/O operation
- * 2) Readers need to maintain state (logger_reader)
- * 3) Writers need to be very fast (open() should be a near no-op)
- *
- * In the reader case, we can trivially go file->logger_reader->logger_log.
- * For a writer, we don't want to maintain a logger_reader, so we just go
- * file->logger_log. Thus what file->private_data points at depends on whether
- * or not the file was opened for reading. This function hides that dirtiness.
- */
-static inline struct logger_log *file_get_log(struct file *file)
-{
- if (file->f_mode & FMODE_READ) {
- struct logger_reader *reader = file->private_data;
-
- return reader->log;
- }
- return file->private_data;
-}
-
-/*
- * get_entry_header - returns a pointer to the logger_entry header within
- * 'log' starting at offset 'off'. A temporary logger_entry 'scratch' must
- * be provided. Typically the return value will be a pointer within
- * 'logger->buf'. However, a pointer to 'scratch' may be returned if
- * the log entry spans the end and beginning of the circular buffer.
- */
-static struct logger_entry *get_entry_header(struct logger_log *log,
- size_t off, struct logger_entry *scratch)
-{
- size_t len = min(sizeof(struct logger_entry), log->size - off);
-
- if (len != sizeof(struct logger_entry)) {
- memcpy(((void *) scratch), log->buffer + off, len);
- memcpy(((void *) scratch) + len, log->buffer,
- sizeof(struct logger_entry) - len);
- return scratch;
- }
-
- return (struct logger_entry *) (log->buffer + off);
-}
-
-/*
- * get_entry_msg_len - Grabs the length of the message of the entry
- * starting from from 'off'.
- *
- * An entry length is 2 bytes (16 bits) in host endian order.
- * In the log, the length does not include the size of the log entry structure.
- * This function returns the size including the log entry structure.
- *
- * Caller needs to hold log->mutex.
- */
-static __u32 get_entry_msg_len(struct logger_log *log, size_t off)
-{
- struct logger_entry scratch;
- struct logger_entry *entry;
-
- entry = get_entry_header(log, off, &scratch);
- return entry->len;
-}
-
-static size_t get_user_hdr_len(int ver)
-{
- if (ver < 2)
- return sizeof(struct user_logger_entry_compat);
- return sizeof(struct logger_entry);
-}
-
-static ssize_t copy_header_to_user(int ver, struct logger_entry *entry,
- char __user *buf)
-{
- void *hdr;
- size_t hdr_len;
- struct user_logger_entry_compat v1;
-
- if (ver < 2) {
- v1.len = entry->len;
- v1.__pad = 0;
- v1.pid = entry->pid;
- v1.tid = entry->tid;
- v1.sec = entry->sec;
- v1.nsec = entry->nsec;
- hdr = &v1;
- hdr_len = sizeof(struct user_logger_entry_compat);
- } else {
- hdr = entry;
- hdr_len = sizeof(struct logger_entry);
- }
-
- return copy_to_user(buf, hdr, hdr_len);
-}
-
-/*
- * do_read_log_to_user - reads exactly 'count' bytes from 'log' into the
- * user-space buffer 'buf'. Returns 'count' on success.
- *
- * Caller must hold log->mutex.
- */
-static ssize_t do_read_log_to_user(struct logger_log *log,
- struct logger_reader *reader,
- char __user *buf,
- size_t count)
-{
- struct logger_entry scratch;
- struct logger_entry *entry;
- size_t len;
- size_t msg_start;
-
- /*
- * First, copy the header to userspace, using the version of
- * the header requested
- */
- entry = get_entry_header(log, reader->r_off, &scratch);
- if (copy_header_to_user(reader->r_ver, entry, buf))
- return -EFAULT;
-
- count -= get_user_hdr_len(reader->r_ver);
- buf += get_user_hdr_len(reader->r_ver);
- msg_start = logger_offset(log,
- reader->r_off + sizeof(struct logger_entry));
-
- /*
- * We read from the msg in two disjoint operations. First, we read from
- * the current msg head offset up to 'count' bytes or to the end of
- * the log, whichever comes first.
- */
- len = min(count, log->size - msg_start);
- if (copy_to_user(buf, log->buffer + msg_start, len))
- return -EFAULT;
-
- /*
- * Second, we read any remaining bytes, starting back at the head of
- * the log.
- */
- if (count != len)
- if (copy_to_user(buf + len, log->buffer, count - len))
- return -EFAULT;
-
- reader->r_off = logger_offset(log, reader->r_off +
- sizeof(struct logger_entry) + count);
-
- return count + get_user_hdr_len(reader->r_ver);
-}
-
-/*
- * get_next_entry_by_uid - Starting at 'off', returns an offset into
- * 'log->buffer' which contains the first entry readable by 'euid'
- */
-static size_t get_next_entry_by_uid(struct logger_log *log,
- size_t off, kuid_t euid)
-{
- while (off != log->w_off) {
- struct logger_entry *entry;
- struct logger_entry scratch;
- size_t next_len;
-
- entry = get_entry_header(log, off, &scratch);
-
- if (uid_eq(entry->euid, euid))
- return off;
-
- next_len = sizeof(struct logger_entry) + entry->len;
- off = logger_offset(log, off + next_len);
- }
-
- return off;
-}
-
-/*
- * logger_read - our log's read() method
- *
- * Behavior:
- *
- * - O_NONBLOCK works
- * - If there are no log entries to read, blocks until log is written to
- * - Atomically reads exactly one log entry
- *
- * Will set errno to EINVAL if read
- * buffer is insufficient to hold next entry.
- */
-static ssize_t logger_read(struct file *file, char __user *buf,
- size_t count, loff_t *pos)
-{
- struct logger_reader *reader = file->private_data;
- struct logger_log *log = reader->log;
- ssize_t ret;
- DEFINE_WAIT(wait);
-
-start:
- while (1) {
- mutex_lock(&log->mutex);
-
- prepare_to_wait(&log->wq, &wait, TASK_INTERRUPTIBLE);
-
- ret = (log->w_off == reader->r_off);
- mutex_unlock(&log->mutex);
- if (!ret)
- break;
-
- if (file->f_flags & O_NONBLOCK) {
- ret = -EAGAIN;
- break;
- }
-
- if (signal_pending(current)) {
- ret = -EINTR;
- break;
- }
-
- schedule();
- }
-
- finish_wait(&log->wq, &wait);
- if (ret)
- return ret;
-
- mutex_lock(&log->mutex);
-
- if (!reader->r_all)
- reader->r_off = get_next_entry_by_uid(log,
- reader->r_off, current_euid());
-
- /* is there still something to read or did we race? */
- if (unlikely(log->w_off == reader->r_off)) {
- mutex_unlock(&log->mutex);
- goto start;
- }
-
- /* get the size of the next entry */
- ret = get_user_hdr_len(reader->r_ver) +
- get_entry_msg_len(log, reader->r_off);
- if (count < ret) {
- ret = -EINVAL;
- goto out;
- }
-
- /* get exactly one entry from the log */
- ret = do_read_log_to_user(log, reader, buf, ret);
-
-out:
- mutex_unlock(&log->mutex);
-
- return ret;
-}
-
-/*
- * get_next_entry - return the offset of the first valid entry at least 'len'
- * bytes after 'off'.
- *
- * Caller must hold log->mutex.
- */
-static size_t get_next_entry(struct logger_log *log, size_t off, size_t len)
-{
- size_t count = 0;
-
- do {
- size_t nr = sizeof(struct logger_entry) +
- get_entry_msg_len(log, off);
- off = logger_offset(log, off + nr);
- count += nr;
- } while (count < len);
-
- return off;
-}
-
-/*
- * is_between - is a < c < b, accounting for wrapping of a, b, and c
- * positions in the buffer
- *
- * That is, if a<b, check for c between a and b
- * and if a>b, check for c outside (not between) a and b
- *
- * |------- a xxxxxxxx b --------|
- * c^
- *
- * |xxxxx b --------- a xxxxxxxxx|
- * c^
- * or c^
- */
-static inline int is_between(size_t a, size_t b, size_t c)
-{
- if (a < b) {
- /* is c between a and b? */
- if (a < c && c <= b)
- return 1;
- } else {
- /* is c outside of b through a? */
- if (c <= b || a < c)
- return 1;
- }
-
- return 0;
-}
-
-/*
- * fix_up_readers - walk the list of all readers and "fix up" any who were
- * lapped by the writer; also do the same for the default "start head".
- * We do this by "pulling forward" the readers and start head to the first
- * entry after the new write head.
- *
- * The caller needs to hold log->mutex.
- */
-static void fix_up_readers(struct logger_log *log, size_t len)
-{
- size_t old = log->w_off;
- size_t new = logger_offset(log, old + len);
- struct logger_reader *reader;
-
- if (is_between(old, new, log->head))
- log->head = get_next_entry(log, log->head, len);
-
- list_for_each_entry(reader, &log->readers, list)
- if (is_between(old, new, reader->r_off))
- reader->r_off = get_next_entry(log, reader->r_off, len);
-}
-
-/*
- * logger_write_iter - our write method, implementing support for write(),
- * writev(), and aio_write(). Writes are our fast path, and we try to optimize
- * them above all else.
- */
-static ssize_t logger_write_iter(struct kiocb *iocb, struct iov_iter *from)
-{
- struct logger_log *log = file_get_log(iocb->ki_filp);
- struct logger_entry header;
- struct timespec now;
- size_t len, count, w_off;
-
- count = min_t(size_t, iocb->ki_nbytes, LOGGER_ENTRY_MAX_PAYLOAD);
-
- now = current_kernel_time();
-
- header.pid = current->tgid;
- header.tid = current->pid;
- header.sec = now.tv_sec;
- header.nsec = now.tv_nsec;
- header.euid = current_euid();
- header.len = count;
- header.hdr_size = sizeof(struct logger_entry);
-
- /* null writes succeed, return zero */
- if (unlikely(!header.len))
- return 0;
-
- mutex_lock(&log->mutex);
-
- /*
- * Fix up any readers, pulling them forward to the first readable
- * entry after (what will be) the new write offset. We do this now
- * because if we partially fail, we can end up with clobbered log
- * entries that encroach on readable buffer.
- */
- fix_up_readers(log, sizeof(struct logger_entry) + header.len);
-
- len = min(sizeof(header), log->size - log->w_off);
- memcpy(log->buffer + log->w_off, &header, len);
- memcpy(log->buffer, (char *)&header + len, sizeof(header) - len);
-
- /* Work with a copy until we are ready to commit the whole entry */
- w_off = logger_offset(log, log->w_off + sizeof(struct logger_entry));
-
- len = min(count, log->size - w_off);
-
- if (copy_from_iter(log->buffer + w_off, len, from) != len) {
- /*
- * Note that by not updating log->w_off, this abandons the
- * portion of the new entry that *was* successfully
- * copied, just above. This is intentional to avoid
- * message corruption from missing fragments.
- */
- mutex_unlock(&log->mutex);
- return -EFAULT;
- }
-
- if (copy_from_iter(log->buffer, count - len, from) != count - len) {
- mutex_unlock(&log->mutex);
- return -EFAULT;
- }
-
- log->w_off = logger_offset(log, w_off + count);
- mutex_unlock(&log->mutex);
-
- /* wake up any blocked readers */
- wake_up_interruptible(&log->wq);
-
- return len;
-}
-
-static struct logger_log *get_log_from_minor(int minor)
-{
- struct logger_log *log;
-
- list_for_each_entry(log, &log_list, logs)
- if (log->misc.minor == minor)
- return log;
- return NULL;
-}
-
-/*
- * logger_open - the log's open() file operation
- *
- * Note how near a no-op this is in the write-only case. Keep it that way!
- */
-static int logger_open(struct inode *inode, struct file *file)
-{
- struct logger_log *log;
- int ret;
-
- ret = nonseekable_open(inode, file);
- if (ret)
- return ret;
-
- log = get_log_from_minor(MINOR(inode->i_rdev));
- if (!log)
- return -ENODEV;
-
- if (file->f_mode & FMODE_READ) {
- struct logger_reader *reader;
-
- reader = kmalloc(sizeof(struct logger_reader), GFP_KERNEL);
- if (!reader)
- return -ENOMEM;
-
- reader->log = log;
- reader->r_ver = 1;
- reader->r_all = in_egroup_p(inode->i_gid) ||
- capable(CAP_SYSLOG);
-
- INIT_LIST_HEAD(&reader->list);
-
- mutex_lock(&log->mutex);
- reader->r_off = log->head;
- list_add_tail(&reader->list, &log->readers);
- mutex_unlock(&log->mutex);
-
- file->private_data = reader;
- } else
- file->private_data = log;
-
- return 0;
-}
-
-/*
- * logger_release - the log's release file operation
- *
- * Note this is a total no-op in the write-only case. Keep it that way!
- */
-static int logger_release(struct inode *ignored, struct file *file)
-{
- if (file->f_mode & FMODE_READ) {
- struct logger_reader *reader = file->private_data;
- struct logger_log *log = reader->log;
-
- mutex_lock(&log->mutex);
- list_del(&reader->list);
- mutex_unlock(&log->mutex);
-
- kfree(reader);
- }
-
- return 0;
-}
-
-/*
- * logger_poll - the log's poll file operation, for poll/select/epoll
- *
- * Note we always return POLLOUT, because you can always write() to the log.
- * Note also that, strictly speaking, a return value of POLLIN does not
- * guarantee that the log is readable without blocking, as there is a small
- * chance that the writer can lap the reader in the interim between poll()
- * returning and the read() request.
- */
-static unsigned int logger_poll(struct file *file, poll_table *wait)
-{
- struct logger_reader *reader;
- struct logger_log *log;
- unsigned int ret = POLLOUT | POLLWRNORM;
-
- if (!(file->f_mode & FMODE_READ))
- return ret;
-
- reader = file->private_data;
- log = reader->log;
-
- poll_wait(file, &log->wq, wait);
-
- mutex_lock(&log->mutex);
- if (!reader->r_all)
- reader->r_off = get_next_entry_by_uid(log,
- reader->r_off, current_euid());
-
- if (log->w_off != reader->r_off)
- ret |= POLLIN | POLLRDNORM;
- mutex_unlock(&log->mutex);
-
- return ret;
-}
-
-static long logger_set_version(struct logger_reader *reader, void __user *arg)
-{
- int version;
-
- if (copy_from_user(&version, arg, sizeof(int)))
- return -EFAULT;
-
- if ((version < 1) || (version > 2))
- return -EINVAL;
-
- reader->r_ver = version;
- return 0;
-}
-
-static long logger_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
-{
- struct logger_log *log = file_get_log(file);
- struct logger_reader *reader;
- long ret = -EINVAL;
- void __user *argp = (void __user *) arg;
-
- mutex_lock(&log->mutex);
-
- switch (cmd) {
- case LOGGER_GET_LOG_BUF_SIZE:
- ret = log->size;
- break;
- case LOGGER_GET_LOG_LEN:
- if (!(file->f_mode & FMODE_READ)) {
- ret = -EBADF;
- break;
- }
- reader = file->private_data;
- if (log->w_off >= reader->r_off)
- ret = log->w_off - reader->r_off;
- else
- ret = (log->size - reader->r_off) + log->w_off;
- break;
- case LOGGER_GET_NEXT_ENTRY_LEN:
- if (!(file->f_mode & FMODE_READ)) {
- ret = -EBADF;
- break;
- }
- reader = file->private_data;
-
- if (!reader->r_all)
- reader->r_off = get_next_entry_by_uid(log,
- reader->r_off, current_euid());
-
- if (log->w_off != reader->r_off)
- ret = get_user_hdr_len(reader->r_ver) +
- get_entry_msg_len(log, reader->r_off);
- else
- ret = 0;
- break;
- case LOGGER_FLUSH_LOG:
- if (!(file->f_mode & FMODE_WRITE)) {
- ret = -EBADF;
- break;
- }
- if (!(in_egroup_p(file_inode(file)->i_gid) ||
- capable(CAP_SYSLOG))) {
- ret = -EPERM;
- break;
- }
- list_for_each_entry(reader, &log->readers, list)
- reader->r_off = log->w_off;
- log->head = log->w_off;
- ret = 0;
- break;
- case LOGGER_GET_VERSION:
- if (!(file->f_mode & FMODE_READ)) {
- ret = -EBADF;
- break;
- }
- reader = file->private_data;
- ret = reader->r_ver;
- break;
- case LOGGER_SET_VERSION:
- if (!(file->f_mode & FMODE_READ)) {
- ret = -EBADF;
- break;
- }
- reader = file->private_data;
- ret = logger_set_version(reader, argp);
- break;
- }
-
- mutex_unlock(&log->mutex);
-
- return ret;
-}
-
-static const struct file_operations logger_fops = {
- .owner = THIS_MODULE,
- .read = logger_read,
- .write_iter = logger_write_iter,
- .poll = logger_poll,
- .unlocked_ioctl = logger_ioctl,
- .compat_ioctl = logger_ioctl,
- .open = logger_open,
- .release = logger_release,
-};
-
-/*
- * Log size must must be a power of two, and greater than
- * (LOGGER_ENTRY_MAX_PAYLOAD + sizeof(struct logger_entry)).
- */
-static int __init create_log(char *log_name, int size)
-{
- int ret = 0;
- struct logger_log *log;
- unsigned char *buffer;
-
- buffer = vmalloc(size);
- if (buffer == NULL)
- return -ENOMEM;
-
- log = kzalloc(sizeof(struct logger_log), GFP_KERNEL);
- if (log == NULL) {
- ret = -ENOMEM;
- goto out_free_buffer;
- }
- log->buffer = buffer;
-
- log->misc.minor = MISC_DYNAMIC_MINOR;
- log->misc.name = kstrdup(log_name, GFP_KERNEL);
- if (log->misc.name == NULL) {
- ret = -ENOMEM;
- goto out_free_log;
- }
-
- log->misc.fops = &logger_fops;
- log->misc.parent = NULL;
-
- init_waitqueue_head(&log->wq);
- INIT_LIST_HEAD(&log->readers);
- mutex_init(&log->mutex);
- log->w_off = 0;
- log->head = 0;
- log->size = size;
-
- INIT_LIST_HEAD(&log->logs);
- list_add_tail(&log->logs, &log_list);
-
- /* finally, initialize the misc device for this log */
- ret = misc_register(&log->misc);
- if (unlikely(ret)) {
- pr_err("failed to register misc device for log '%s'!\n",
- log->misc.name);
- goto out_free_misc_name;
- }
-
- pr_info("created %luK log '%s'\n",
- (unsigned long) log->size >> 10, log->misc.name);
-
- return 0;
-
-out_free_misc_name:
- kfree(log->misc.name);
-
-out_free_log:
- kfree(log);
-
-out_free_buffer:
- vfree(buffer);
- return ret;
-}
-
-static int __init logger_init(void)
-{
- int ret;
-
- ret = create_log(LOGGER_LOG_MAIN, 256*1024);
- if (unlikely(ret))
- goto out;
-
- ret = create_log(LOGGER_LOG_EVENTS, 256*1024);
- if (unlikely(ret))
- goto out;
-
- ret = create_log(LOGGER_LOG_RADIO, 256*1024);
- if (unlikely(ret))
- goto out;
-
- ret = create_log(LOGGER_LOG_SYSTEM, 256*1024);
- if (unlikely(ret))
- goto out;
-
-out:
- return ret;
-}
-
-static void __exit logger_exit(void)
-{
- struct logger_log *current_log, *next_log;
-
- list_for_each_entry_safe(current_log, next_log, &log_list, logs) {
- /* we have to delete all the entry inside log_list */
- misc_deregister(&current_log->misc);
- vfree(current_log->buffer);
- kfree(current_log->misc.name);
- list_del(&current_log->logs);
- kfree(current_log);
- }
-}
-
-
-device_initcall(logger_init);
-module_exit(logger_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Robert Love, <rlove@google.com>");
-MODULE_DESCRIPTION("Android Logger");
diff --git a/drivers/staging/android/logger.h b/drivers/staging/android/logger.h
deleted file mode 100644
index 70af7d805dff..000000000000
--- a/drivers/staging/android/logger.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/* include/linux/logger.h
- *
- * Copyright (C) 2007-2008 Google, Inc.
- * Author: Robert Love <rlove@android.com>
- *
- * This software is licensed under the terms of the GNU General Public
- * License version 2, as published by the Free Software Foundation, and
- * may be copied, distributed, and modified under those terms.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- */
-
-#ifndef _LINUX_LOGGER_H
-#define _LINUX_LOGGER_H
-
-#include <linux/types.h>
-#include <linux/ioctl.h>
-
-/**
- * struct user_logger_entry_compat - defines a single entry that is given to a logger
- * @len: The length of the payload
- * @__pad: Two bytes of padding that appear to be required
- * @pid: The generating process' process ID
- * @tid: The generating process' thread ID
- * @sec: The number of seconds that have elapsed since the Epoch
- * @nsec: The number of nanoseconds that have elapsed since @sec
- * @msg: The message that is to be logged
- *
- * The userspace structure for version 1 of the logger_entry ABI.
- * This structure is returned to userspace unless the caller requests
- * an upgrade to a newer ABI version.
- */
-struct user_logger_entry_compat {
- __u16 len;
- __u16 __pad;
- __s32 pid;
- __s32 tid;
- __s32 sec;
- __s32 nsec;
- char msg[0];
-};
-
-/**
- * struct logger_entry - defines a single entry that is given to a logger
- * @len: The length of the payload
- * @hdr_size: sizeof(struct logger_entry_v2)
- * @pid: The generating process' process ID
- * @tid: The generating process' thread ID
- * @sec: The number of seconds that have elapsed since the Epoch
- * @nsec: The number of nanoseconds that have elapsed since @sec
- * @euid: Effective UID of logger
- * @msg: The message that is to be logged
- *
- * The structure for version 2 of the logger_entry ABI.
- * This structure is returned to userspace if ioctl(LOGGER_SET_VERSION)
- * is called with version >= 2
- */
-struct logger_entry {
- __u16 len;
- __u16 hdr_size;
- __s32 pid;
- __s32 tid;
- __s32 sec;
- __s32 nsec;
- kuid_t euid;
- char msg[0];
-};
-
-#define LOGGER_LOG_RADIO "log_radio" /* radio-related messages */
-#define LOGGER_LOG_EVENTS "log_events" /* system/hardware events */
-#define LOGGER_LOG_SYSTEM "log_system" /* system/framework messages */
-#define LOGGER_LOG_MAIN "log_main" /* everything else */
-
-#define LOGGER_ENTRY_MAX_PAYLOAD 4076
-
-#define __LOGGERIO 0xAE
-
-#define LOGGER_GET_LOG_BUF_SIZE _IO(__LOGGERIO, 1) /* size of log */
-#define LOGGER_GET_LOG_LEN _IO(__LOGGERIO, 2) /* used log len */
-#define LOGGER_GET_NEXT_ENTRY_LEN _IO(__LOGGERIO, 3) /* next entry len */
-#define LOGGER_FLUSH_LOG _IO(__LOGGERIO, 4) /* flush log */
-#define LOGGER_GET_VERSION _IO(__LOGGERIO, 5) /* abi version */
-#define LOGGER_SET_VERSION _IO(__LOGGERIO, 6) /* abi version */
-
-#endif /* _LINUX_LOGGER_H */
diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c
index b545d3d1da3e..6fe48eb9bda4 100644
--- a/drivers/staging/android/lowmemorykiller.c
+++ b/drivers/staging/android/lowmemorykiller.c
@@ -39,9 +39,11 @@
#include <linux/sched.h>
#include <linux/swap.h>
#include <linux/rcupdate.h>
-#include <linux/profile.h>
#include <linux/notifier.h>
+#define CREATE_TRACE_POINTS
+#include "trace/lowmemorykiller.h"
+
static uint32_t lowmem_debug_level = 1;
static short lowmem_adj[6] = {
0,
@@ -83,12 +85,14 @@ static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc)
int tasksize;
int i;
short min_score_adj = OOM_SCORE_ADJ_MAX + 1;
+ int minfree = 0;
int selected_tasksize = 0;
short selected_oom_score_adj;
int array_size = ARRAY_SIZE(lowmem_adj);
int other_free = global_page_state(NR_FREE_PAGES) - totalreserve_pages;
int other_file = global_page_state(NR_FILE_PAGES) -
global_page_state(NR_SHMEM) -
+ global_page_state(NR_UNEVICTABLE) -
total_swapcache_pages();
if (lowmem_adj_size < array_size)
@@ -96,8 +100,8 @@ static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc)
if (lowmem_minfree_size < array_size)
array_size = lowmem_minfree_size;
for (i = 0; i < array_size; i++) {
- if (other_free < lowmem_minfree[i] &&
- other_file < lowmem_minfree[i]) {
+ minfree = lowmem_minfree[i];
+ if (other_free < minfree && other_file < minfree) {
min_score_adj = lowmem_adj[i];
break;
}
@@ -152,13 +156,25 @@ static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc)
selected = p;
selected_tasksize = tasksize;
selected_oom_score_adj = oom_score_adj;
- lowmem_print(2, "select %d (%s), adj %hd, size %d, to kill\n",
- p->pid, p->comm, oom_score_adj, tasksize);
+ lowmem_print(2, "select '%s' (%d), adj %hd, size %d, to kill\n",
+ p->comm, p->pid, oom_score_adj, tasksize);
}
if (selected) {
- lowmem_print(1, "send sigkill to %d (%s), adj %hd, size %d\n",
- selected->pid, selected->comm,
- selected_oom_score_adj, selected_tasksize);
+ long cache_size = other_file * (long)(PAGE_SIZE / 1024);
+ long cache_limit = minfree * (long)(PAGE_SIZE / 1024);
+ long free = other_free * (long)(PAGE_SIZE / 1024);
+ trace_lowmemory_kill(selected, cache_size, cache_limit, free);
+ lowmem_print(1, "Killing '%s' (%d), adj %hd,\n" \
+ " to free %ldkB on behalf of '%s' (%d) because\n" \
+ " cache %ldkB is below limit %ldkB for oom_score_adj %hd\n" \
+ " Free memory is %ldkB above reserved\n",
+ selected->comm, selected->pid,
+ selected_oom_score_adj,
+ selected_tasksize * (long)(PAGE_SIZE / 1024),
+ current->comm, current->pid,
+ cache_size, cache_limit,
+ min_score_adj,
+ free);
lowmem_deathpending_timeout = jiffies + HZ;
set_tsk_thread_flag(selected, TIF_MEMDIE);
send_sig(SIGKILL, selected, 0);
@@ -188,9 +204,92 @@ static void __exit lowmem_exit(void)
unregister_shrinker(&lowmem_shrinker);
}
+#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER_AUTODETECT_OOM_ADJ_VALUES
+static short lowmem_oom_adj_to_oom_score_adj(short oom_adj)
+{
+ if (oom_adj == OOM_ADJUST_MAX)
+ return OOM_SCORE_ADJ_MAX;
+ else
+ return (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
+}
+
+static void lowmem_autodetect_oom_adj_values(void)
+{
+ int i;
+ short oom_adj;
+ short oom_score_adj;
+ int array_size = ARRAY_SIZE(lowmem_adj);
+
+ if (lowmem_adj_size < array_size)
+ array_size = lowmem_adj_size;
+
+ if (array_size <= 0)
+ return;
+
+ oom_adj = lowmem_adj[array_size - 1];
+ if (oom_adj > OOM_ADJUST_MAX)
+ return;
+
+ oom_score_adj = lowmem_oom_adj_to_oom_score_adj(oom_adj);
+ if (oom_score_adj <= OOM_ADJUST_MAX)
+ return;
+
+ lowmem_print(1, "lowmem_shrink: convert oom_adj to oom_score_adj:\n");
+ for (i = 0; i < array_size; i++) {
+ oom_adj = lowmem_adj[i];
+ oom_score_adj = lowmem_oom_adj_to_oom_score_adj(oom_adj);
+ lowmem_adj[i] = oom_score_adj;
+ lowmem_print(1, "oom_adj %d => oom_score_adj %d\n",
+ oom_adj, oom_score_adj);
+ }
+}
+
+static int lowmem_adj_array_set(const char *val, const struct kernel_param *kp)
+{
+ int ret;
+
+ ret = param_array_ops.set(val, kp);
+
+ /* HACK: Autodetect oom_adj values in lowmem_adj array */
+ lowmem_autodetect_oom_adj_values();
+
+ return ret;
+}
+
+static int lowmem_adj_array_get(char *buffer, const struct kernel_param *kp)
+{
+ return param_array_ops.get(buffer, kp);
+}
+
+static void lowmem_adj_array_free(void *arg)
+{
+ param_array_ops.free(arg);
+}
+
+static struct kernel_param_ops lowmem_adj_array_ops = {
+ .set = lowmem_adj_array_set,
+ .get = lowmem_adj_array_get,
+ .free = lowmem_adj_array_free,
+};
+
+static const struct kparam_array __param_arr_adj = {
+ .max = ARRAY_SIZE(lowmem_adj),
+ .num = &lowmem_adj_size,
+ .ops = &param_ops_short,
+ .elemsize = sizeof(lowmem_adj[0]),
+ .elem = lowmem_adj,
+};
+#endif
+
module_param_named(cost, lowmem_shrinker.seeks, int, S_IRUGO | S_IWUSR);
+#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER_AUTODETECT_OOM_ADJ_VALUES
+module_param_cb(adj, &lowmem_adj_array_ops,
+ .arr = &__param_arr_adj, S_IRUGO | S_IWUSR);
+__MODULE_PARM_TYPE(adj, "array of short");
+#else
module_param_array_named(adj, lowmem_adj, short, &lowmem_adj_size,
S_IRUGO | S_IWUSR);
+#endif
module_param_array_named(minfree, lowmem_minfree, uint, &lowmem_minfree_size,
S_IRUGO | S_IWUSR);
module_param_named(debug_level, lowmem_debug_level, uint, S_IRUGO | S_IWUSR);
diff --git a/drivers/staging/android/sync.c b/drivers/staging/android/sync.c
index f83e00c78051..da101a506cd2 100644
--- a/drivers/staging/android/sync.c
+++ b/drivers/staging/android/sync.c
@@ -465,6 +465,13 @@ static bool android_fence_enable_signaling(struct fence *fence)
return true;
}
+static void android_fence_disable_signaling(struct fence *fence)
+{
+ struct sync_pt *pt = container_of(fence, struct sync_pt, base);
+
+ list_del_init(&pt->active_list);
+}
+
static int android_fence_fill_driver_data(struct fence *fence,
void *data, int size)
{
@@ -508,6 +515,7 @@ static const struct fence_ops android_fence_ops = {
.get_driver_name = android_fence_get_driver_name,
.get_timeline_name = android_fence_get_timeline_name,
.enable_signaling = android_fence_enable_signaling,
+ .disable_signaling = android_fence_disable_signaling,
.signaled = android_fence_signaled,
.wait = fence_default_wait,
.release = android_fence_release,
@@ -519,12 +527,10 @@ static const struct fence_ops android_fence_ops = {
static void sync_fence_free(struct kref *kref)
{
struct sync_fence *fence = container_of(kref, struct sync_fence, kref);
- int i, status = atomic_read(&fence->status);
+ int i;
for (i = 0; i < fence->num_fences; ++i) {
- if (status)
- fence_remove_callback(fence->cbs[i].sync_pt,
- &fence->cbs[i].cb);
+ fence_remove_callback(fence->cbs[i].sync_pt, &fence->cbs[i].cb);
fence_put(fence->cbs[i].sync_pt);
}
diff --git a/drivers/staging/android/trace/lowmemorykiller.h b/drivers/staging/android/trace/lowmemorykiller.h
new file mode 100644
index 000000000000..f43d3fae75ee
--- /dev/null
+++ b/drivers/staging/android/trace/lowmemorykiller.h
@@ -0,0 +1,41 @@
+#undef TRACE_SYSTEM
+#define TRACE_INCLUDE_PATH ../../drivers/staging/android/trace
+#define TRACE_SYSTEM lowmemorykiller
+
+#if !defined(_TRACE_LOWMEMORYKILLER_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_LOWMEMORYKILLER_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(lowmemory_kill,
+ TP_PROTO(struct task_struct *killed_task, long cache_size, \
+ long cache_limit, long free),
+
+ TP_ARGS(killed_task, cache_size, cache_limit, free),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN)
+ __field(pid_t, pid)
+ __field(long, pagecache_size)
+ __field(long, pagecache_limit)
+ __field(long, free)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, killed_task->comm, TASK_COMM_LEN);
+ __entry->pid = killed_task->pid;
+ __entry->pagecache_size = cache_size;
+ __entry->pagecache_limit = cache_limit;
+ __entry->free = free;
+ ),
+
+ TP_printk("%s (%d), page cache %ldkB (limit %ldkB), free %ldKb",
+ __entry->comm, __entry->pid, __entry->pagecache_size,
+ __entry->pagecache_limit, __entry->free)
+);
+
+
+#endif /* if !defined(_TRACE_LOWMEMORYKILLER_H) || defined(TRACE_HEADER_MULTI_READ) */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/drivers/staging/android/uapi/android_alarm.h b/drivers/staging/android/uapi/android_alarm.h
deleted file mode 100644
index aa013f6f5f3a..000000000000
--- a/drivers/staging/android/uapi/android_alarm.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/* drivers/staging/android/uapi/android_alarm.h
- *
- * Copyright (C) 2006-2007 Google, Inc.
- *
- * This software is licensed under the terms of the GNU General Public
- * License version 2, as published by the Free Software Foundation, and
- * may be copied, distributed, and modified under those terms.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- */
-
-#ifndef _UAPI_LINUX_ANDROID_ALARM_H
-#define _UAPI_LINUX_ANDROID_ALARM_H
-
-#include <linux/ioctl.h>
-#include <linux/time.h>
-
-enum android_alarm_type {
- /* return code bit numbers or set alarm arg */
- ANDROID_ALARM_RTC_WAKEUP,
- ANDROID_ALARM_RTC,
- ANDROID_ALARM_ELAPSED_REALTIME_WAKEUP,
- ANDROID_ALARM_ELAPSED_REALTIME,
- ANDROID_ALARM_SYSTEMTIME,
-
- ANDROID_ALARM_TYPE_COUNT,
-
- /* return code bit numbers */
- /* ANDROID_ALARM_TIME_CHANGE = 16 */
-};
-
-enum android_alarm_return_flags {
- ANDROID_ALARM_RTC_WAKEUP_MASK = 1U << ANDROID_ALARM_RTC_WAKEUP,
- ANDROID_ALARM_RTC_MASK = 1U << ANDROID_ALARM_RTC,
- ANDROID_ALARM_ELAPSED_REALTIME_WAKEUP_MASK =
- 1U << ANDROID_ALARM_ELAPSED_REALTIME_WAKEUP,
- ANDROID_ALARM_ELAPSED_REALTIME_MASK =
- 1U << ANDROID_ALARM_ELAPSED_REALTIME,
- ANDROID_ALARM_SYSTEMTIME_MASK = 1U << ANDROID_ALARM_SYSTEMTIME,
- ANDROID_ALARM_TIME_CHANGE_MASK = 1U << 16
-};
-
-/* Disable alarm */
-#define ANDROID_ALARM_CLEAR(type) _IO('a', 0 | ((type) << 4))
-
-/* Ack last alarm and wait for next */
-#define ANDROID_ALARM_WAIT _IO('a', 1)
-
-#define ALARM_IOW(c, type, size) _IOW('a', (c) | ((type) << 4), size)
-/* Set alarm */
-#define ANDROID_ALARM_SET(type) ALARM_IOW(2, type, struct timespec)
-#define ANDROID_ALARM_SET_AND_WAIT(type) ALARM_IOW(3, type, struct timespec)
-#define ANDROID_ALARM_GET_TIME(type) ALARM_IOW(4, type, struct timespec)
-#define ANDROID_ALARM_SET_RTC _IOW('a', 5, struct timespec)
-#define ANDROID_ALARM_BASE_CMD(cmd) (cmd & ~(_IOC(0, 0, 0xf0, 0)))
-#define ANDROID_ALARM_IOCTL_TO_TYPE(cmd) (_IOC_NR(cmd) >> 4)
-
-#endif
diff --git a/drivers/staging/android/uapi/ashmem.h b/drivers/staging/android/uapi/ashmem.h
index ba4743c71d6b..13df42d200b7 100644
--- a/drivers/staging/android/uapi/ashmem.h
+++ b/drivers/staging/android/uapi/ashmem.h
@@ -13,6 +13,7 @@
#define _UAPI_LINUX_ASHMEM_H
#include <linux/ioctl.h>
+#include <linux/types.h>
#define ASHMEM_NAME_LEN 256
diff --git a/drivers/staging/android/uapi/binder.h b/drivers/staging/android/uapi/binder.h
index dba4cef3a8d3..0fd7ce41c75f 100644
--- a/drivers/staging/android/uapi/binder.h
+++ b/drivers/staging/android/uapi/binder.h
@@ -32,6 +32,8 @@ enum {
BINDER_TYPE_HANDLE = B_PACK_CHARS('s', 'h', '*', B_TYPE_LARGE),
BINDER_TYPE_WEAK_HANDLE = B_PACK_CHARS('w', 'h', '*', B_TYPE_LARGE),
BINDER_TYPE_FD = B_PACK_CHARS('f', 'd', '*', B_TYPE_LARGE),
+ BINDER_TYPE_FDA = B_PACK_CHARS('f', 'd', 'a', B_TYPE_LARGE),
+ BINDER_TYPE_PTR = B_PACK_CHARS('p', 't', '*', B_TYPE_LARGE),
};
enum {
@@ -47,6 +49,14 @@ typedef __u64 binder_size_t;
typedef __u64 binder_uintptr_t;
#endif
+/**
+ * struct binder_object_header - header shared by all binder metadata objects.
+ * @type: type of the object
+ */
+struct binder_object_header {
+ __u32 type;
+};
+
/*
* This is the flattened representation of a Binder object for transfer
* between processes. The 'offsets' supplied as part of a binder transaction
@@ -55,9 +65,8 @@ typedef __u64 binder_uintptr_t;
* between processes.
*/
struct flat_binder_object {
- /* 8 bytes for large_flat_header. */
- __u32 type;
- __u32 flags;
+ struct binder_object_header hdr;
+ __u32 flags;
/* 8 bytes of data. */
union {
@@ -69,6 +78,86 @@ struct flat_binder_object {
binder_uintptr_t cookie;
};
+/**
+ * struct binder_fd_object - describes a filedescriptor to be fixed up.
+ * @hdr: common header structure
+ * @pad_flags: padding to remain compatible with old userspace code
+ * @pad_binder: padding to remain compatible with old userspace code
+ * @fd: file descriptor
+ * @cookie: opaque data, used by user-space
+ */
+struct binder_fd_object {
+ struct binder_object_header hdr;
+ __u32 pad_flags;
+ union {
+ binder_uintptr_t pad_binder;
+ __u32 fd;
+ };
+
+ binder_uintptr_t cookie;
+};
+
+/* struct binder_buffer_object - object describing a userspace buffer
+ * @hdr: common header structure
+ * @flags: one or more BINDER_BUFFER_* flags
+ * @buffer: address of the buffer
+ * @length: length of the buffer
+ * @parent: index in offset array pointing to parent buffer
+ * @parent_offset: offset in @parent pointing to this buffer
+ *
+ * A binder_buffer object represents an object that the
+ * binder kernel driver can copy verbatim to the target
+ * address space. A buffer itself may be pointed to from
+ * within another buffer, meaning that the pointer inside
+ * that other buffer needs to be fixed up as well. This
+ * can be done by setting the BINDER_BUFFER_FLAG_HAS_PARENT
+ * flag in @flags, by setting @parent buffer to the index
+ * in the offset array pointing to the parent binder_buffer_object,
+ * and by setting @parent_offset to the offset in the parent buffer
+ * at which the pointer to this buffer is located.
+ */
+struct binder_buffer_object {
+ struct binder_object_header hdr;
+ __u32 flags;
+ binder_uintptr_t buffer;
+ binder_size_t length;
+ binder_size_t parent;
+ binder_size_t parent_offset;
+};
+
+enum {
+ BINDER_BUFFER_FLAG_HAS_PARENT = 0x01,
+};
+
+/* struct binder_fd_array_object - object describing an array of fds in a buffer
+ * @hdr: common header structure
+ * @pad: padding to ensure correct alignment
+ * @num_fds: number of file descriptors in the buffer
+ * @parent: index in offset array to buffer holding the fd array
+ * @parent_offset: start offset of fd array in the buffer
+ *
+ * A binder_fd_array object represents an array of file
+ * descriptors embedded in a binder_buffer_object. It is
+ * different from a regular binder_buffer_object because it
+ * describes a list of file descriptors to fix up, not an opaque
+ * blob of memory, and hence the kernel needs to treat it differently.
+ *
+ * An example of how this would be used is with Android's
+ * native_handle_t object, which is a struct with a list of integers
+ * and a list of file descriptors. The native_handle_t struct itself
+ * will be represented by a struct binder_buffer_objct, whereas the
+ * embedded list of file descriptors is represented by a
+ * struct binder_fd_array_object with that binder_buffer_object as
+ * a parent.
+ */
+struct binder_fd_array_object {
+ struct binder_object_header hdr;
+ __u32 pad;
+ binder_size_t num_fds;
+ binder_size_t parent;
+ binder_size_t parent_offset;
+};
+
/*
* On 64-bit platforms where user code may run in 32-bits the driver must
* translate the buffer (and local binder) addresses appropriately.
@@ -161,6 +250,11 @@ struct binder_transaction_data {
} data;
};
+struct binder_transaction_data_sg {
+ struct binder_transaction_data transaction_data;
+ binder_size_t buffers_size;
+};
+
struct binder_ptr_cookie {
binder_uintptr_t ptr;
binder_uintptr_t cookie;
@@ -345,6 +439,12 @@ enum binder_driver_command_protocol {
/*
* void *: cookie
*/
+
+ BC_TRANSACTION_SG = _IOW('c', 17, struct binder_transaction_data_sg),
+ BC_REPLY_SG = _IOW('c', 18, struct binder_transaction_data_sg),
+ /*
+ * binder_transaction_data_sg: the sent command.
+ */
};
#endif /* _UAPI_LINUX_BINDER_H */
diff --git a/drivers/switch/Kconfig b/drivers/switch/Kconfig
new file mode 100644
index 000000000000..19404b6f7778
--- /dev/null
+++ b/drivers/switch/Kconfig
@@ -0,0 +1,15 @@
+menuconfig SWITCH
+ tristate "Switch class support"
+ help
+ Say Y here to enable switch class support. This allows
+ monitoring switches by userspace via sysfs and uevent.
+
+if SWITCH
+
+config SWITCH_GPIO
+ tristate "GPIO Swith support"
+ depends on GPIOLIB
+ help
+ Say Y here to enable GPIO based switch support.
+
+endif # SWITCH
diff --git a/drivers/switch/Makefile b/drivers/switch/Makefile
new file mode 100644
index 000000000000..f7606ed4a719
--- /dev/null
+++ b/drivers/switch/Makefile
@@ -0,0 +1,4 @@
+# Switch Class Driver
+obj-$(CONFIG_SWITCH) += switch_class.o
+obj-$(CONFIG_SWITCH_GPIO) += switch_gpio.o
+
diff --git a/drivers/switch/switch_class.c b/drivers/switch/switch_class.c
new file mode 100644
index 000000000000..e373b625806e
--- /dev/null
+++ b/drivers/switch/switch_class.c
@@ -0,0 +1,174 @@
+/*
+ * drivers/switch/switch_class.c
+ *
+ * Copyright (C) 2008 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+*/
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/err.h>
+#include <linux/switch.h>
+
+struct class *switch_class;
+static atomic_t device_count;
+
+static ssize_t state_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct switch_dev *sdev = (struct switch_dev *)
+ dev_get_drvdata(dev);
+
+ if (sdev->print_state) {
+ int ret = sdev->print_state(sdev, buf);
+ if (ret >= 0)
+ return ret;
+ }
+ return sprintf(buf, "%d\n", sdev->state);
+}
+
+static ssize_t name_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct switch_dev *sdev = (struct switch_dev *)
+ dev_get_drvdata(dev);
+
+ if (sdev->print_name) {
+ int ret = sdev->print_name(sdev, buf);
+ if (ret >= 0)
+ return ret;
+ }
+ return sprintf(buf, "%s\n", sdev->name);
+}
+
+static DEVICE_ATTR(state, S_IRUGO, state_show, NULL);
+static DEVICE_ATTR(name, S_IRUGO, name_show, NULL);
+
+void switch_set_state(struct switch_dev *sdev, int state)
+{
+ char name_buf[120];
+ char state_buf[120];
+ char *prop_buf;
+ char *envp[3];
+ int env_offset = 0;
+ int length;
+
+ if (sdev->state != state) {
+ sdev->state = state;
+
+ prop_buf = (char *)get_zeroed_page(GFP_KERNEL);
+ if (prop_buf) {
+ length = name_show(sdev->dev, NULL, prop_buf);
+ if (length > 0) {
+ if (prop_buf[length - 1] == '\n')
+ prop_buf[length - 1] = 0;
+ snprintf(name_buf, sizeof(name_buf),
+ "SWITCH_NAME=%s", prop_buf);
+ envp[env_offset++] = name_buf;
+ }
+ length = state_show(sdev->dev, NULL, prop_buf);
+ if (length > 0) {
+ if (prop_buf[length - 1] == '\n')
+ prop_buf[length - 1] = 0;
+ snprintf(state_buf, sizeof(state_buf),
+ "SWITCH_STATE=%s", prop_buf);
+ envp[env_offset++] = state_buf;
+ }
+ envp[env_offset] = NULL;
+ kobject_uevent_env(&sdev->dev->kobj, KOBJ_CHANGE, envp);
+ free_page((unsigned long)prop_buf);
+ } else {
+ printk(KERN_ERR "out of memory in switch_set_state\n");
+ kobject_uevent(&sdev->dev->kobj, KOBJ_CHANGE);
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(switch_set_state);
+
+static int create_switch_class(void)
+{
+ if (!switch_class) {
+ switch_class = class_create(THIS_MODULE, "switch");
+ if (IS_ERR(switch_class))
+ return PTR_ERR(switch_class);
+ atomic_set(&device_count, 0);
+ }
+
+ return 0;
+}
+
+int switch_dev_register(struct switch_dev *sdev)
+{
+ int ret;
+
+ if (!switch_class) {
+ ret = create_switch_class();
+ if (ret < 0)
+ return ret;
+ }
+
+ sdev->index = atomic_inc_return(&device_count);
+ sdev->dev = device_create(switch_class, NULL,
+ MKDEV(0, sdev->index), NULL, sdev->name);
+ if (IS_ERR(sdev->dev))
+ return PTR_ERR(sdev->dev);
+
+ ret = device_create_file(sdev->dev, &dev_attr_state);
+ if (ret < 0)
+ goto err_create_file_1;
+ ret = device_create_file(sdev->dev, &dev_attr_name);
+ if (ret < 0)
+ goto err_create_file_2;
+
+ dev_set_drvdata(sdev->dev, sdev);
+ sdev->state = 0;
+ return 0;
+
+err_create_file_2:
+ device_remove_file(sdev->dev, &dev_attr_state);
+err_create_file_1:
+ device_destroy(switch_class, MKDEV(0, sdev->index));
+ printk(KERN_ERR "switch: Failed to register driver %s\n", sdev->name);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(switch_dev_register);
+
+void switch_dev_unregister(struct switch_dev *sdev)
+{
+ device_remove_file(sdev->dev, &dev_attr_name);
+ device_remove_file(sdev->dev, &dev_attr_state);
+ device_destroy(switch_class, MKDEV(0, sdev->index));
+ dev_set_drvdata(sdev->dev, NULL);
+}
+EXPORT_SYMBOL_GPL(switch_dev_unregister);
+
+static int __init switch_class_init(void)
+{
+ return create_switch_class();
+}
+
+static void __exit switch_class_exit(void)
+{
+ class_destroy(switch_class);
+}
+
+module_init(switch_class_init);
+module_exit(switch_class_exit);
+
+MODULE_AUTHOR("Mike Lockwood <lockwood@android.com>");
+MODULE_DESCRIPTION("Switch class driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/switch/switch_gpio.c b/drivers/switch/switch_gpio.c
new file mode 100644
index 000000000000..621d62d20c99
--- /dev/null
+++ b/drivers/switch/switch_gpio.c
@@ -0,0 +1,172 @@
+/*
+ * drivers/switch/switch_gpio.c
+ *
+ * Copyright (C) 2008 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+*/
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/switch.h>
+#include <linux/workqueue.h>
+#include <linux/gpio.h>
+
+struct gpio_switch_data {
+ struct switch_dev sdev;
+ unsigned gpio;
+ const char *name_on;
+ const char *name_off;
+ const char *state_on;
+ const char *state_off;
+ int irq;
+ struct work_struct work;
+};
+
+static void gpio_switch_work(struct work_struct *work)
+{
+ int state;
+ struct gpio_switch_data *data =
+ container_of(work, struct gpio_switch_data, work);
+
+ state = gpio_get_value(data->gpio);
+ switch_set_state(&data->sdev, state);
+}
+
+static irqreturn_t gpio_irq_handler(int irq, void *dev_id)
+{
+ struct gpio_switch_data *switch_data =
+ (struct gpio_switch_data *)dev_id;
+
+ schedule_work(&switch_data->work);
+ return IRQ_HANDLED;
+}
+
+static ssize_t switch_gpio_print_state(struct switch_dev *sdev, char *buf)
+{
+ struct gpio_switch_data *switch_data =
+ container_of(sdev, struct gpio_switch_data, sdev);
+ const char *state;
+ if (switch_get_state(sdev))
+ state = switch_data->state_on;
+ else
+ state = switch_data->state_off;
+
+ if (state)
+ return sprintf(buf, "%s\n", state);
+ return -1;
+}
+
+static int gpio_switch_probe(struct platform_device *pdev)
+{
+ struct gpio_switch_platform_data *pdata = pdev->dev.platform_data;
+ struct gpio_switch_data *switch_data;
+ int ret = 0;
+
+ if (!pdata)
+ return -EBUSY;
+
+ switch_data = kzalloc(sizeof(struct gpio_switch_data), GFP_KERNEL);
+ if (!switch_data)
+ return -ENOMEM;
+
+ switch_data->sdev.name = pdata->name;
+ switch_data->gpio = pdata->gpio;
+ switch_data->name_on = pdata->name_on;
+ switch_data->name_off = pdata->name_off;
+ switch_data->state_on = pdata->state_on;
+ switch_data->state_off = pdata->state_off;
+ switch_data->sdev.print_state = switch_gpio_print_state;
+
+ ret = switch_dev_register(&switch_data->sdev);
+ if (ret < 0)
+ goto err_switch_dev_register;
+
+ ret = gpio_request(switch_data->gpio, pdev->name);
+ if (ret < 0)
+ goto err_request_gpio;
+
+ ret = gpio_direction_input(switch_data->gpio);
+ if (ret < 0)
+ goto err_set_gpio_input;
+
+ INIT_WORK(&switch_data->work, gpio_switch_work);
+
+ switch_data->irq = gpio_to_irq(switch_data->gpio);
+ if (switch_data->irq < 0) {
+ ret = switch_data->irq;
+ goto err_detect_irq_num_failed;
+ }
+
+ ret = request_irq(switch_data->irq, gpio_irq_handler,
+ IRQF_TRIGGER_LOW, pdev->name, switch_data);
+ if (ret < 0)
+ goto err_request_irq;
+
+ /* Perform initial detection */
+ gpio_switch_work(&switch_data->work);
+
+ return 0;
+
+err_request_irq:
+err_detect_irq_num_failed:
+err_set_gpio_input:
+ gpio_free(switch_data->gpio);
+err_request_gpio:
+ switch_dev_unregister(&switch_data->sdev);
+err_switch_dev_register:
+ kfree(switch_data);
+
+ return ret;
+}
+
+static int gpio_switch_remove(struct platform_device *pdev)
+{
+ struct gpio_switch_data *switch_data = platform_get_drvdata(pdev);
+
+ cancel_work_sync(&switch_data->work);
+ gpio_free(switch_data->gpio);
+ switch_dev_unregister(&switch_data->sdev);
+ kfree(switch_data);
+
+ return 0;
+}
+
+static struct platform_driver gpio_switch_driver = {
+ .probe = gpio_switch_probe,
+ .remove = gpio_switch_remove,
+ .driver = {
+ .name = "switch-gpio",
+ .owner = THIS_MODULE,
+ },
+};
+
+static int __init gpio_switch_init(void)
+{
+ return platform_driver_register(&gpio_switch_driver);
+}
+
+static void __exit gpio_switch_exit(void)
+{
+ platform_driver_unregister(&gpio_switch_driver);
+}
+
+module_init(gpio_switch_init);
+module_exit(gpio_switch_exit);
+
+MODULE_AUTHOR("Mike Lockwood <lockwood@android.com>");
+MODULE_DESCRIPTION("GPIO Switch driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c
index 7b64a5fdd916..d6144cbe2035 100644
--- a/drivers/tty/n_tty.c
+++ b/drivers/tty/n_tty.c
@@ -90,6 +90,7 @@
struct n_tty_data {
/* producer-published */
size_t read_head;
+ size_t commit_head;
size_t canon_head;
size_t echo_head;
size_t echo_commit;
@@ -165,14 +166,16 @@ static int receive_room(struct tty_struct *tty)
{
struct n_tty_data *ldata = tty->disc_data;
int left;
+ size_t tail = smp_load_acquire(&ldata->read_tail);
+ size_t head = ldata->read_head;
if (I_PARMRK(tty)) {
/* Multiply read_cnt by 3, since each byte might take up to
* three times as many spaces when PARMRK is set (depending on
* its flags, e.g. parity error). */
- left = N_TTY_BUF_SIZE - read_cnt(ldata) * 3 - 1;
+ left = N_TTY_BUF_SIZE - (head - tail) * 3 - 1;
} else
- left = N_TTY_BUF_SIZE - read_cnt(ldata) - 1;
+ left = N_TTY_BUF_SIZE - (head - tail) - 1;
/*
* If we are doing input canonicalization, and there are no
@@ -181,7 +184,7 @@ static int receive_room(struct tty_struct *tty)
* characters will be beeped.
*/
if (left <= 0)
- left = ldata->icanon && ldata->canon_head == ldata->read_tail;
+ left = ldata->icanon && ldata->canon_head == tail;
return left;
}
@@ -235,7 +238,7 @@ static ssize_t chars_in_buffer(struct tty_struct *tty)
ssize_t n = 0;
if (!ldata->icanon)
- n = read_cnt(ldata);
+ n = ldata->commit_head - ldata->read_tail;
else
n = ldata->canon_head - ldata->read_tail;
return n;
@@ -319,10 +322,6 @@ static void n_tty_check_unthrottle(struct tty_struct *tty)
*
* n_tty_receive_buf()/producer path:
* caller holds non-exclusive termios_rwsem
- * modifies read_head
- *
- * read_head is only considered 'published' if canonical mode is
- * not active.
*/
static inline void put_tty_queue(unsigned char c, struct n_tty_data *ldata)
@@ -346,6 +345,7 @@ static void reset_buffer_flags(struct n_tty_data *ldata)
{
ldata->read_head = ldata->canon_head = ldata->read_tail = 0;
ldata->echo_head = ldata->echo_tail = ldata->echo_commit = 0;
+ ldata->commit_head = 0;
ldata->echo_mark = 0;
ldata->line_start = 0;
@@ -992,10 +992,6 @@ static inline void finish_erasing(struct n_tty_data *ldata)
*
* n_tty_receive_buf()/producer path:
* caller holds non-exclusive termios_rwsem
- * modifies read_head
- *
- * Modifying the read_head is not considered a publish in this context
- * because canonical mode is active -- only canon_head publishes
*/
static void eraser(unsigned char c, struct tty_struct *tty)
@@ -1144,7 +1140,6 @@ static void isig(int sig, struct tty_struct *tty)
*
* n_tty_receive_buf()/producer path:
* caller holds non-exclusive termios_rwsem
- * publishes read_head via put_tty_queue()
*
* Note: may get exclusive termios_rwsem if flushing input buffer
*/
@@ -1214,7 +1209,6 @@ static void n_tty_receive_overrun(struct tty_struct *tty)
*
* n_tty_receive_buf()/producer path:
* caller holds non-exclusive termios_rwsem
- * publishes read_head via put_tty_queue()
*/
static void n_tty_receive_parity_error(struct tty_struct *tty, unsigned char c)
{
@@ -1268,7 +1262,6 @@ n_tty_receive_signal_char(struct tty_struct *tty, int signal, unsigned char c)
* n_tty_receive_buf()/producer path:
* caller holds non-exclusive termios_rwsem
* publishes canon_head if canonical mode is active
- * otherwise, publishes read_head via put_tty_queue()
*
* Returns 1 if LNEXT was received, else returns 0
*/
@@ -1381,7 +1374,7 @@ n_tty_receive_char_special(struct tty_struct *tty, unsigned char c)
handle_newline:
set_bit(ldata->read_head & (N_TTY_BUF_SIZE - 1), ldata->read_flags);
put_tty_queue(c, ldata);
- ldata->canon_head = ldata->read_head;
+ smp_store_release(&ldata->canon_head, ldata->read_head);
kill_fasync(&tty->fasync, SIGIO, POLL_IN);
wake_up_interruptible_poll(&tty->read_wait, POLLIN);
return 0;
@@ -1524,16 +1517,14 @@ n_tty_receive_buf_real_raw(struct tty_struct *tty, const unsigned char *cp,
size_t n, head;
head = ldata->read_head & (N_TTY_BUF_SIZE - 1);
- n = N_TTY_BUF_SIZE - max(read_cnt(ldata), head);
- n = min_t(size_t, count, n);
+ n = min_t(size_t, count, N_TTY_BUF_SIZE - head);
memcpy(read_buf_addr(ldata, head), cp, n);
ldata->read_head += n;
cp += n;
count -= n;
head = ldata->read_head & (N_TTY_BUF_SIZE - 1);
- n = N_TTY_BUF_SIZE - max(read_cnt(ldata), head);
- n = min_t(size_t, count, n);
+ n = min_t(size_t, count, N_TTY_BUF_SIZE - head);
memcpy(read_buf_addr(ldata, head), cp, n);
ldata->read_head += n;
}
@@ -1663,8 +1654,13 @@ static void __receive_buf(struct tty_struct *tty, const unsigned char *cp,
tty->ops->flush_chars(tty);
}
- if ((!ldata->icanon && (read_cnt(ldata) >= ldata->minimum_to_wake)) ||
- L_EXTPROC(tty)) {
+ if (ldata->icanon && !L_EXTPROC(tty))
+ return;
+
+ /* publish read head to consumer */
+ smp_store_release(&ldata->commit_head, ldata->read_head);
+
+ if ((read_cnt(ldata) >= ldata->minimum_to_wake) || L_EXTPROC(tty)) {
kill_fasync(&tty->fasync, SIGIO, POLL_IN);
wake_up_interruptible_poll(&tty->read_wait, POLLIN);
}
@@ -1821,6 +1817,7 @@ static void n_tty_set_termios(struct tty_struct *tty, struct ktermios *old)
ldata->canon_head = ldata->read_head;
ldata->push = 1;
}
+ ldata->commit_head = ldata->read_head;
ldata->erasing = 0;
ldata->lnext = 0;
}
@@ -1960,7 +1957,7 @@ static inline int input_available_p(struct tty_struct *tty, int poll)
if (ldata->icanon && !L_EXTPROC(tty))
return ldata->canon_head != ldata->read_tail;
else
- return read_cnt(ldata) >= amt;
+ return ldata->commit_head - ldata->read_tail >= amt;
}
/**
@@ -1992,10 +1989,11 @@ static int copy_from_read_buf(struct tty_struct *tty,
int retval;
size_t n;
bool is_eof;
+ size_t head = smp_load_acquire(&ldata->commit_head);
size_t tail = ldata->read_tail & (N_TTY_BUF_SIZE - 1);
retval = 0;
- n = min(read_cnt(ldata), N_TTY_BUF_SIZE - tail);
+ n = min(head - ldata->read_tail, N_TTY_BUF_SIZE - tail);
n = min(*nr, n);
if (n) {
retval = copy_to_user(*b, read_buf_addr(ldata, tail), n);
@@ -2003,10 +2001,11 @@ static int copy_from_read_buf(struct tty_struct *tty,
is_eof = n == 1 && read_buf(ldata, tail) == EOF_CHAR(tty);
tty_audit_add_data(tty, read_buf_addr(ldata, tail), n,
ldata->icanon);
- ldata->read_tail += n;
+ smp_store_release(&ldata->read_tail, ldata->read_tail + n);
/* Turn single EOF into zero-length read */
- if (L_EXTPROC(tty) && ldata->icanon && is_eof && !read_cnt(ldata))
- n = 0;
+ if (L_EXTPROC(tty) && ldata->icanon && is_eof &&
+ (head == ldata->read_tail))
+ n = 0;
*b += n;
*nr -= n;
}
@@ -2048,7 +2047,7 @@ static int canon_copy_from_read_buf(struct tty_struct *tty,
bool eof_push = 0;
/* N.B. avoid overrun if nr == 0 */
- n = min(*nr, read_cnt(ldata));
+ n = min(*nr, smp_load_acquire(&ldata->canon_head) - ldata->read_tail);
if (!n)
return 0;
@@ -2098,8 +2097,7 @@ static int canon_copy_from_read_buf(struct tty_struct *tty,
if (found)
clear_bit(eol, ldata->read_flags);
- smp_mb__after_atomic();
- ldata->read_tail += c;
+ smp_store_release(&ldata->read_tail, ldata->read_tail + c);
if (found) {
if (!ldata->push)
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index a28dee9d5017..ec40c5983aa2 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -95,6 +95,9 @@ static void __uart_start(struct tty_struct *tty)
struct uart_state *state = tty->driver_data;
struct uart_port *port = state->uart_port;
+ if (port->ops->wake_peer)
+ port->ops->wake_peer(port);
+
if (!uart_tx_stopped(port))
port->ops->start_tx(port);
}
diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c
index a3cf7f110fe7..b62d65d46902 100644
--- a/drivers/usb/core/devio.c
+++ b/drivers/usb/core/devio.c
@@ -1202,10 +1202,11 @@ static int proc_getdriver(struct usb_dev_state *ps, void __user *arg)
static int proc_connectinfo(struct usb_dev_state *ps, void __user *arg)
{
- struct usbdevfs_connectinfo ci = {
- .devnum = ps->dev->devnum,
- .slow = ps->dev->speed == USB_SPEED_LOW
- };
+ struct usbdevfs_connectinfo ci;
+
+ memset(&ci, 0, sizeof(ci));
+ ci.devnum = ps->dev->devnum;
+ ci.slow = ps->dev->speed == USB_SPEED_LOW;
if (copy_to_user(arg, &ci, sizeof(ci)))
return -EFAULT;
diff --git a/drivers/usb/gadget/Kconfig b/drivers/usb/gadget/Kconfig
index c4880fc0d86e..51896f6332b4 100644
--- a/drivers/usb/gadget/Kconfig
+++ b/drivers/usb/gadget/Kconfig
@@ -183,13 +183,28 @@ config USB_F_FS
config USB_F_UAC1
tristate
-
+
config USB_F_UAC2
tristate
config USB_F_UVC
tristate
+config USB_F_MTP
+ tristate
+
+config USB_F_PTP
+ tristate
+
+config USB_F_AUDIO_SRC
+ tristate
+
+config USB_F_ACC
+ tristate
+
+config USB_F_MIDI
+ tristate
+
choice
tristate "USB Gadget Drivers"
default USB_ETH
@@ -362,6 +377,57 @@ config USB_CONFIGFS_F_FS
implemented in kernel space (for instance Ethernet, serial or
mass storage) and other are implemented in user space.
+config USB_CONFIGFS_F_MTP
+ boolean "MTP gadget"
+ depends on USB_CONFIGFS
+ select USB_F_MTP
+ help
+ USB gadget MTP support
+
+config USB_CONFIGFS_F_PTP
+ boolean "PTP gadget"
+ depends on USB_CONFIGFS && USB_CONFIGFS_F_MTP
+ select USB_F_PTP
+ help
+ USB gadget PTP support
+
+config USB_CONFIGFS_F_ACC
+ boolean "Accessory gadget"
+ depends on USB_CONFIGFS
+ select USB_F_ACC
+ help
+ USB gadget Accessory support
+
+config USB_CONFIGFS_F_AUDIO_SRC
+ boolean "Audio Source gadget"
+ depends on USB_CONFIGFS && USB_CONFIGFS_F_ACC
+ depends on SND
+ select SND_PCM
+ select USB_F_AUDIO_SRC
+ help
+ USB gadget Audio Source support
+
+config USB_CONFIGFS_UEVENT
+ boolean "Uevent notification of Gadget state"
+ depends on USB_CONFIGFS
+ help
+ Enable uevent notifications to userspace when the gadget
+ state changes. The gadget can be in any of the following
+ three states: "CONNECTED/DISCONNECTED/CONFIGURED"
+
+config USB_CONFIGFS_F_MIDI
+ boolean "MIDI function"
+ depends on USB_CONFIGFS
+ depends on SND
+ select SND_RAWMIDI
+ select USB_F_MIDI
+ help
+ The MIDI Function acts as a USB Audio device, with one MIDI
+ input and one MIDI output. These MIDI jacks appear as
+ a sound "card" in the ALSA sound system. Other MIDI
+ connections can then be made on the gadget system, using
+ ALSA's aconnect utility etc.
+
source "drivers/usb/gadget/legacy/Kconfig"
endchoice
diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c
index 7b2ba34cf5e7..93d5a03648a4 100644
--- a/drivers/usb/gadget/composite.c
+++ b/drivers/usb/gadget/composite.c
@@ -1780,6 +1780,12 @@ void composite_disconnect(struct usb_gadget *gadget)
struct usb_composite_dev *cdev = get_gadget_data(gadget);
unsigned long flags;
+ if (cdev == NULL) {
+ WARN(1, "%s: Calling disconnect on a Gadget that is \
+ not connected\n", __func__);
+ return;
+ }
+
/* REVISIT: should we have config and device level
* disconnect callbacks?
*/
diff --git a/drivers/usb/gadget/configfs.c b/drivers/usb/gadget/configfs.c
index a7e1a96e69a5..b6bfa6b7c551 100644
--- a/drivers/usb/gadget/configfs.c
+++ b/drivers/usb/gadget/configfs.c
@@ -9,6 +9,31 @@
#include "u_f.h"
#include "u_os_desc.h"
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+#include <linux/platform_device.h>
+#include <linux/kdev_t.h>
+#include <linux/usb/ch9.h>
+
+#ifdef CONFIG_USB_CONFIGFS_F_ACC
+extern int acc_ctrlrequest(struct usb_composite_dev *cdev,
+ const struct usb_ctrlrequest *ctrl);
+void acc_disconnect(void);
+#endif
+static struct class *android_class;
+static struct device *android_device;
+static int index;
+
+struct device *create_function_device(char *name)
+{
+ if (android_device && !IS_ERR(android_device))
+ return device_create(android_class, android_device,
+ MKDEV(0, index++), NULL, name);
+ else
+ return ERR_PTR(-EINVAL);
+}
+EXPORT_SYMBOL_GPL(create_function_device);
+#endif
+
int check_user_usb_string(const char *name,
struct usb_gadget_strings *stringtab_dev)
{
@@ -63,6 +88,12 @@ struct gadget_info {
bool use_os_desc;
char b_vendor_code;
char qw_sign[OS_STRING_QW_SIGN_LEN];
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+ bool connected;
+ bool sw_connected;
+ struct work_struct work;
+ struct device *dev;
+#endif
};
struct config_usb_cfg {
@@ -262,7 +293,7 @@ static ssize_t gadget_dev_desc_UDC_store(struct gadget_info *gi,
mutex_lock(&gi->lock);
- if (!strlen(name)) {
+ if (!strlen(name) || strcmp(name, "none") == 0) {
ret = unregister_gadget(gi);
if (ret)
goto err;
@@ -1427,6 +1458,60 @@ err_comp_cleanup:
return ret;
}
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+static void android_work(struct work_struct *data)
+{
+ struct gadget_info *gi = container_of(data, struct gadget_info, work);
+ struct usb_composite_dev *cdev = &gi->cdev;
+ char *disconnected[2] = { "USB_STATE=DISCONNECTED", NULL };
+ char *connected[2] = { "USB_STATE=CONNECTED", NULL };
+ char *configured[2] = { "USB_STATE=CONFIGURED", NULL };
+ /* 0-connected 1-configured 2-disconnected*/
+ bool status[3] = { false, false, false };
+ unsigned long flags;
+ bool uevent_sent = false;
+
+ spin_lock_irqsave(&cdev->lock, flags);
+ if (cdev->config)
+ status[1] = true;
+
+ if (gi->connected != gi->sw_connected) {
+ if (gi->connected)
+ status[0] = true;
+ else
+ status[2] = true;
+ gi->sw_connected = gi->connected;
+ }
+ spin_unlock_irqrestore(&cdev->lock, flags);
+
+ if (status[0]) {
+ kobject_uevent_env(&android_device->kobj,
+ KOBJ_CHANGE, connected);
+ pr_info("%s: sent uevent %s\n", __func__, connected[0]);
+ uevent_sent = true;
+ }
+
+ if (status[1]) {
+ kobject_uevent_env(&android_device->kobj,
+ KOBJ_CHANGE, configured);
+ pr_info("%s: sent uevent %s\n", __func__, configured[0]);
+ uevent_sent = true;
+ }
+
+ if (status[2]) {
+ kobject_uevent_env(&android_device->kobj,
+ KOBJ_CHANGE, disconnected);
+ pr_info("%s: sent uevent %s\n", __func__, disconnected[0]);
+ uevent_sent = true;
+ }
+
+ if (!uevent_sent) {
+ pr_info("%s: did not send uevent (%d %d %p)\n", __func__,
+ gi->connected, gi->sw_connected, cdev->config);
+ }
+}
+#endif
+
static void configfs_composite_unbind(struct usb_gadget *gadget)
{
struct usb_composite_dev *cdev;
@@ -1444,14 +1529,79 @@ static void configfs_composite_unbind(struct usb_gadget *gadget)
set_gadget_data(gadget, NULL);
}
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+static int android_setup(struct usb_gadget *gadget,
+ const struct usb_ctrlrequest *c)
+{
+ struct usb_composite_dev *cdev = get_gadget_data(gadget);
+ unsigned long flags;
+ struct gadget_info *gi = container_of(cdev, struct gadget_info, cdev);
+ int value = -EOPNOTSUPP;
+ struct usb_function_instance *fi;
+
+ spin_lock_irqsave(&cdev->lock, flags);
+ if (!gi->connected) {
+ gi->connected = 1;
+ schedule_work(&gi->work);
+ }
+ spin_unlock_irqrestore(&cdev->lock, flags);
+ list_for_each_entry(fi, &gi->available_func, cfs_list) {
+ if (fi != NULL && fi->f != NULL && fi->f->setup != NULL) {
+ value = fi->f->setup(fi->f, c);
+ if (value >= 0)
+ break;
+ }
+ }
+
+#ifdef CONFIG_USB_CONFIGFS_F_ACC
+ if (value < 0)
+ value = acc_ctrlrequest(cdev, c);
+#endif
+
+ if (value < 0)
+ value = composite_setup(gadget, c);
+
+ spin_lock_irqsave(&cdev->lock, flags);
+ if (c->bRequest == USB_REQ_SET_CONFIGURATION &&
+ cdev->config) {
+ schedule_work(&gi->work);
+ }
+ spin_unlock_irqrestore(&cdev->lock, flags);
+
+ return value;
+}
+
+static void android_disconnect(struct usb_gadget *gadget)
+{
+ struct usb_composite_dev *cdev = get_gadget_data(gadget);
+ struct gadget_info *gi = container_of(cdev, struct gadget_info, cdev);
+
+ /* accessory HID support can be active while the
+ accessory function is not actually enabled,
+ so we need to inform it when we are disconnected.
+ */
+
+#ifdef CONFIG_USB_CONFIGFS_F_ACC
+ acc_disconnect();
+#endif
+ gi->connected = 0;
+ schedule_work(&gi->work);
+ composite_disconnect(gadget);
+}
+#endif
+
static const struct usb_gadget_driver configfs_driver_template = {
.bind = configfs_composite_bind,
.unbind = configfs_composite_unbind,
-
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+ .setup = android_setup,
+ .reset = android_disconnect,
+ .disconnect = android_disconnect,
+#else
.setup = composite_setup,
.reset = composite_disconnect,
.disconnect = composite_disconnect,
-
+#endif
.max_speed = USB_SPEED_SUPER,
.driver = {
.owner = THIS_MODULE,
@@ -1459,6 +1609,89 @@ static const struct usb_gadget_driver configfs_driver_template = {
},
};
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+static ssize_t state_show(struct device *pdev, struct device_attribute *attr,
+ char *buf)
+{
+ struct gadget_info *dev = dev_get_drvdata(pdev);
+ struct usb_composite_dev *cdev;
+ char *state = "DISCONNECTED";
+ unsigned long flags;
+
+ if (!dev)
+ goto out;
+
+ cdev = &dev->cdev;
+
+ if (!cdev)
+ goto out;
+
+ spin_lock_irqsave(&cdev->lock, flags);
+ if (cdev->config)
+ state = "CONFIGURED";
+ else if (dev->connected)
+ state = "CONNECTED";
+ spin_unlock_irqrestore(&cdev->lock, flags);
+out:
+ return sprintf(buf, "%s\n", state);
+}
+
+static DEVICE_ATTR(state, S_IRUGO, state_show, NULL);
+
+static struct device_attribute *android_usb_attributes[] = {
+ &dev_attr_state,
+ NULL
+};
+
+static int android_device_create(struct gadget_info *gi)
+{
+ struct device_attribute **attrs;
+ struct device_attribute *attr;
+
+ INIT_WORK(&gi->work, android_work);
+ android_device = device_create(android_class, NULL,
+ MKDEV(0, 0), NULL, "android0");
+ if (IS_ERR(android_device))
+ return PTR_ERR(android_device);
+
+ dev_set_drvdata(android_device, gi);
+
+ attrs = android_usb_attributes;
+ while ((attr = *attrs++)) {
+ int err;
+
+ err = device_create_file(android_device, attr);
+ if (err) {
+ device_destroy(android_device->class,
+ android_device->devt);
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static void android_device_destroy(void)
+{
+ struct device_attribute **attrs;
+ struct device_attribute *attr;
+
+ attrs = android_usb_attributes;
+ while ((attr = *attrs++))
+ device_remove_file(android_device, attr);
+ device_destroy(android_device->class, android_device->devt);
+}
+#else
+static inline int android_device_create(struct gadget_info *gi)
+{
+ return 0;
+}
+
+static inline void android_device_destroy(void)
+{
+}
+#endif
+
static struct config_group *gadgets_make(
struct config_group *group,
const char *name)
@@ -1468,7 +1701,6 @@ static struct config_group *gadgets_make(
gi = kzalloc(sizeof(*gi), GFP_KERNEL);
if (!gi)
return ERR_PTR(-ENOMEM);
-
gi->group.default_groups = gi->default_groups;
gi->group.default_groups[0] = &gi->functions_group;
gi->group.default_groups[1] = &gi->configs_group;
@@ -1507,6 +1739,9 @@ static struct config_group *gadgets_make(
if (!gi->composite.gadget_driver.function)
goto err;
+ if (android_device_create(gi) < 0)
+ goto err;
+
#ifdef CONFIG_USB_OTG
gi->otg.bLength = sizeof(struct usb_otg_descriptor);
gi->otg.bDescriptorType = USB_DT_OTG;
@@ -1516,6 +1751,7 @@ static struct config_group *gadgets_make(
config_group_init_type_name(&gi->group, name,
&gadget_root_type);
return &gi->group;
+
err:
kfree(gi);
return ERR_PTR(-ENOMEM);
@@ -1524,6 +1760,7 @@ err:
static void gadgets_drop(struct config_group *group, struct config_item *item)
{
config_item_put(item);
+ android_device_destroy();
}
static struct configfs_group_operations gadgets_ops = {
@@ -1550,7 +1787,9 @@ void unregister_gadget_item(struct config_item *item)
{
struct gadget_info *gi = to_gadget_info(item);
+ mutex_lock(&gi->lock);
unregister_gadget(gi);
+ mutex_unlock(&gi->lock);
}
EXPORT_SYMBOL_GPL(unregister_gadget_item);
@@ -1561,6 +1800,13 @@ static int __init gadget_cfs_init(void)
config_group_init(&gadget_subsys.su_group);
ret = configfs_register_subsystem(&gadget_subsys);
+
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+ android_class = class_create(THIS_MODULE, "android_usb");
+ if (IS_ERR(android_class))
+ return PTR_ERR(android_class);
+#endif
+
return ret;
}
module_init(gadget_cfs_init);
@@ -1568,5 +1814,10 @@ module_init(gadget_cfs_init);
static void __exit gadget_cfs_exit(void)
{
configfs_unregister_subsystem(&gadget_subsys);
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+ if (!IS_ERR(android_class))
+ class_destroy(android_class);
+#endif
+
}
module_exit(gadget_cfs_exit);
diff --git a/drivers/usb/gadget/function/Makefile b/drivers/usb/gadget/function/Makefile
index 90701aa5a826..5d0a437a74cd 100644
--- a/drivers/usb/gadget/function/Makefile
+++ b/drivers/usb/gadget/function/Makefile
@@ -38,3 +38,15 @@ usb_f_uac2-y := f_uac2.o
obj-$(CONFIG_USB_F_UAC2) += usb_f_uac2.o
usb_f_uvc-y := f_uvc.o uvc_queue.o uvc_v4l2.o uvc_video.o
obj-$(CONFIG_USB_F_UVC) += usb_f_uvc.o
+usb_f_mtp-y := f_mtp.o
+obj-$(CONFIG_USB_F_MTP) += usb_f_mtp.o
+usb_f_ptp-y := f_ptp.o
+obj-$(CONFIG_USB_F_PTP) += usb_f_ptp.o
+usb_f_audio_source-y := f_audio_source.o
+obj-$(CONFIG_USB_F_AUDIO_SRC) += usb_f_audio_source.o
+usb_f_accessory-y := f_accessory.o
+obj-$(CONFIG_USB_F_ACC) += usb_f_accessory.o
+
+
+usb_f_midi-y := f_midi.o
+obj-$(CONFIG_USB_F_MIDI) += usb_f_midi.o
diff --git a/drivers/usb/gadget/function/f_accessory.c b/drivers/usb/gadget/function/f_accessory.c
new file mode 100644
index 000000000000..8187d6ba0f7e
--- /dev/null
+++ b/drivers/usb/gadget/function/f_accessory.c
@@ -0,0 +1,1330 @@
+/*
+ * Gadget Function Driver for Android USB accessories
+ *
+ * Copyright (C) 2011 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+/* #define DEBUG */
+/* #define VERBOSE_DEBUG */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/delay.h>
+#include <linux/wait.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
+#include <linux/types.h>
+#include <linux/file.h>
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+
+#include <linux/hid.h>
+#include <linux/hiddev.h>
+#include <linux/usb.h>
+#include <linux/usb/ch9.h>
+#include <linux/usb/f_accessory.h>
+
+#include <linux/configfs.h>
+#include <linux/usb/composite.h>
+
+#define MAX_INST_NAME_LEN 40
+#define BULK_BUFFER_SIZE 16384
+#define ACC_STRING_SIZE 256
+
+#define PROTOCOL_VERSION 2
+
+/* String IDs */
+#define INTERFACE_STRING_INDEX 0
+
+/* number of tx and rx requests to allocate */
+#define TX_REQ_MAX 4
+#define RX_REQ_MAX 2
+
+struct acc_hid_dev {
+ struct list_head list;
+ struct hid_device *hid;
+ struct acc_dev *dev;
+ /* accessory defined ID */
+ int id;
+ /* HID report descriptor */
+ u8 *report_desc;
+ /* length of HID report descriptor */
+ int report_desc_len;
+ /* number of bytes of report_desc we have received so far */
+ int report_desc_offset;
+};
+
+struct acc_dev {
+ struct usb_function function;
+ struct usb_composite_dev *cdev;
+ spinlock_t lock;
+
+ struct usb_ep *ep_in;
+ struct usb_ep *ep_out;
+
+ /* online indicates state of function_set_alt & function_unbind
+ * set to 1 when we connect
+ */
+ int online:1;
+
+ /* disconnected indicates state of open & release
+ * Set to 1 when we disconnect.
+ * Not cleared until our file is closed.
+ */
+ int disconnected:1;
+
+ /* strings sent by the host */
+ char manufacturer[ACC_STRING_SIZE];
+ char model[ACC_STRING_SIZE];
+ char description[ACC_STRING_SIZE];
+ char version[ACC_STRING_SIZE];
+ char uri[ACC_STRING_SIZE];
+ char serial[ACC_STRING_SIZE];
+
+ /* for acc_complete_set_string */
+ int string_index;
+
+ /* set to 1 if we have a pending start request */
+ int start_requested;
+
+ int audio_mode;
+
+ /* synchronize access to our device file */
+ atomic_t open_excl;
+
+ struct list_head tx_idle;
+
+ wait_queue_head_t read_wq;
+ wait_queue_head_t write_wq;
+ struct usb_request *rx_req[RX_REQ_MAX];
+ int rx_done;
+
+ /* delayed work for handling ACCESSORY_START */
+ struct delayed_work start_work;
+
+ /* worker for registering and unregistering hid devices */
+ struct work_struct hid_work;
+
+ /* list of active HID devices */
+ struct list_head hid_list;
+
+ /* list of new HID devices to register */
+ struct list_head new_hid_list;
+
+ /* list of dead HID devices to unregister */
+ struct list_head dead_hid_list;
+};
+
+static struct usb_interface_descriptor acc_interface_desc = {
+ .bLength = USB_DT_INTERFACE_SIZE,
+ .bDescriptorType = USB_DT_INTERFACE,
+ .bInterfaceNumber = 0,
+ .bNumEndpoints = 2,
+ .bInterfaceClass = USB_CLASS_VENDOR_SPEC,
+ .bInterfaceSubClass = USB_SUBCLASS_VENDOR_SPEC,
+ .bInterfaceProtocol = 0,
+};
+
+static struct usb_endpoint_descriptor acc_highspeed_in_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_IN,
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+ .wMaxPacketSize = __constant_cpu_to_le16(512),
+};
+
+static struct usb_endpoint_descriptor acc_highspeed_out_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_OUT,
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+ .wMaxPacketSize = __constant_cpu_to_le16(512),
+};
+
+static struct usb_endpoint_descriptor acc_fullspeed_in_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_IN,
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+};
+
+static struct usb_endpoint_descriptor acc_fullspeed_out_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_OUT,
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+};
+
+static struct usb_descriptor_header *fs_acc_descs[] = {
+ (struct usb_descriptor_header *) &acc_interface_desc,
+ (struct usb_descriptor_header *) &acc_fullspeed_in_desc,
+ (struct usb_descriptor_header *) &acc_fullspeed_out_desc,
+ NULL,
+};
+
+static struct usb_descriptor_header *hs_acc_descs[] = {
+ (struct usb_descriptor_header *) &acc_interface_desc,
+ (struct usb_descriptor_header *) &acc_highspeed_in_desc,
+ (struct usb_descriptor_header *) &acc_highspeed_out_desc,
+ NULL,
+};
+
+static struct usb_string acc_string_defs[] = {
+ [INTERFACE_STRING_INDEX].s = "Android Accessory Interface",
+ { }, /* end of list */
+};
+
+static struct usb_gadget_strings acc_string_table = {
+ .language = 0x0409, /* en-US */
+ .strings = acc_string_defs,
+};
+
+static struct usb_gadget_strings *acc_strings[] = {
+ &acc_string_table,
+ NULL,
+};
+
+/* temporary variable used between acc_open() and acc_gadget_bind() */
+static struct acc_dev *_acc_dev;
+
+struct acc_instance {
+ struct usb_function_instance func_inst;
+ const char *name;
+};
+
+static inline struct acc_dev *func_to_dev(struct usb_function *f)
+{
+ return container_of(f, struct acc_dev, function);
+}
+
+static struct usb_request *acc_request_new(struct usb_ep *ep, int buffer_size)
+{
+ struct usb_request *req = usb_ep_alloc_request(ep, GFP_KERNEL);
+ if (!req)
+ return NULL;
+
+ /* now allocate buffers for the requests */
+ req->buf = kmalloc(buffer_size, GFP_KERNEL);
+ if (!req->buf) {
+ usb_ep_free_request(ep, req);
+ return NULL;
+ }
+
+ return req;
+}
+
+static void acc_request_free(struct usb_request *req, struct usb_ep *ep)
+{
+ if (req) {
+ kfree(req->buf);
+ usb_ep_free_request(ep, req);
+ }
+}
+
+/* add a request to the tail of a list */
+static void req_put(struct acc_dev *dev, struct list_head *head,
+ struct usb_request *req)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->lock, flags);
+ list_add_tail(&req->list, head);
+ spin_unlock_irqrestore(&dev->lock, flags);
+}
+
+/* remove a request from the head of a list */
+static struct usb_request *req_get(struct acc_dev *dev, struct list_head *head)
+{
+ unsigned long flags;
+ struct usb_request *req;
+
+ spin_lock_irqsave(&dev->lock, flags);
+ if (list_empty(head)) {
+ req = 0;
+ } else {
+ req = list_first_entry(head, struct usb_request, list);
+ list_del(&req->list);
+ }
+ spin_unlock_irqrestore(&dev->lock, flags);
+ return req;
+}
+
+static void acc_set_disconnected(struct acc_dev *dev)
+{
+ dev->disconnected = 1;
+}
+
+static void acc_complete_in(struct usb_ep *ep, struct usb_request *req)
+{
+ struct acc_dev *dev = _acc_dev;
+
+ if (req->status == -ESHUTDOWN) {
+ pr_debug("acc_complete_in set disconnected");
+ acc_set_disconnected(dev);
+ }
+
+ req_put(dev, &dev->tx_idle, req);
+
+ wake_up(&dev->write_wq);
+}
+
+static void acc_complete_out(struct usb_ep *ep, struct usb_request *req)
+{
+ struct acc_dev *dev = _acc_dev;
+
+ dev->rx_done = 1;
+ if (req->status == -ESHUTDOWN) {
+ pr_debug("acc_complete_out set disconnected");
+ acc_set_disconnected(dev);
+ }
+
+ wake_up(&dev->read_wq);
+}
+
+static void acc_complete_set_string(struct usb_ep *ep, struct usb_request *req)
+{
+ struct acc_dev *dev = ep->driver_data;
+ char *string_dest = NULL;
+ int length = req->actual;
+
+ if (req->status != 0) {
+ pr_err("acc_complete_set_string, err %d\n", req->status);
+ return;
+ }
+
+ switch (dev->string_index) {
+ case ACCESSORY_STRING_MANUFACTURER:
+ string_dest = dev->manufacturer;
+ break;
+ case ACCESSORY_STRING_MODEL:
+ string_dest = dev->model;
+ break;
+ case ACCESSORY_STRING_DESCRIPTION:
+ string_dest = dev->description;
+ break;
+ case ACCESSORY_STRING_VERSION:
+ string_dest = dev->version;
+ break;
+ case ACCESSORY_STRING_URI:
+ string_dest = dev->uri;
+ break;
+ case ACCESSORY_STRING_SERIAL:
+ string_dest = dev->serial;
+ break;
+ }
+ if (string_dest) {
+ unsigned long flags;
+
+ if (length >= ACC_STRING_SIZE)
+ length = ACC_STRING_SIZE - 1;
+
+ spin_lock_irqsave(&dev->lock, flags);
+ memcpy(string_dest, req->buf, length);
+ /* ensure zero termination */
+ string_dest[length] = 0;
+ spin_unlock_irqrestore(&dev->lock, flags);
+ } else {
+ pr_err("unknown accessory string index %d\n",
+ dev->string_index);
+ }
+}
+
+static void acc_complete_set_hid_report_desc(struct usb_ep *ep,
+ struct usb_request *req)
+{
+ struct acc_hid_dev *hid = req->context;
+ struct acc_dev *dev = hid->dev;
+ int length = req->actual;
+
+ if (req->status != 0) {
+ pr_err("acc_complete_set_hid_report_desc, err %d\n",
+ req->status);
+ return;
+ }
+
+ memcpy(hid->report_desc + hid->report_desc_offset, req->buf, length);
+ hid->report_desc_offset += length;
+ if (hid->report_desc_offset == hid->report_desc_len) {
+ /* After we have received the entire report descriptor
+ * we schedule work to initialize the HID device
+ */
+ schedule_work(&dev->hid_work);
+ }
+}
+
+static void acc_complete_send_hid_event(struct usb_ep *ep,
+ struct usb_request *req)
+{
+ struct acc_hid_dev *hid = req->context;
+ int length = req->actual;
+
+ if (req->status != 0) {
+ pr_err("acc_complete_send_hid_event, err %d\n", req->status);
+ return;
+ }
+
+ hid_report_raw_event(hid->hid, HID_INPUT_REPORT, req->buf, length, 1);
+}
+
+static int acc_hid_parse(struct hid_device *hid)
+{
+ struct acc_hid_dev *hdev = hid->driver_data;
+
+ hid_parse_report(hid, hdev->report_desc, hdev->report_desc_len);
+ return 0;
+}
+
+static int acc_hid_start(struct hid_device *hid)
+{
+ return 0;
+}
+
+static void acc_hid_stop(struct hid_device *hid)
+{
+}
+
+static int acc_hid_open(struct hid_device *hid)
+{
+ return 0;
+}
+
+static void acc_hid_close(struct hid_device *hid)
+{
+}
+
+static int acc_hid_raw_request(struct hid_device *hid, unsigned char reportnum,
+ __u8 *buf, size_t len, unsigned char rtype, int reqtype)
+{
+ return 0;
+}
+
+static struct hid_ll_driver acc_hid_ll_driver = {
+ .parse = acc_hid_parse,
+ .start = acc_hid_start,
+ .stop = acc_hid_stop,
+ .open = acc_hid_open,
+ .close = acc_hid_close,
+ .raw_request = acc_hid_raw_request,
+};
+
+static struct acc_hid_dev *acc_hid_new(struct acc_dev *dev,
+ int id, int desc_len)
+{
+ struct acc_hid_dev *hdev;
+
+ hdev = kzalloc(sizeof(*hdev), GFP_ATOMIC);
+ if (!hdev)
+ return NULL;
+ hdev->report_desc = kzalloc(desc_len, GFP_ATOMIC);
+ if (!hdev->report_desc) {
+ kfree(hdev);
+ return NULL;
+ }
+ hdev->dev = dev;
+ hdev->id = id;
+ hdev->report_desc_len = desc_len;
+
+ return hdev;
+}
+
+static struct acc_hid_dev *acc_hid_get(struct list_head *list, int id)
+{
+ struct acc_hid_dev *hid;
+
+ list_for_each_entry(hid, list, list) {
+ if (hid->id == id)
+ return hid;
+ }
+ return NULL;
+}
+
+static int acc_register_hid(struct acc_dev *dev, int id, int desc_length)
+{
+ struct acc_hid_dev *hid;
+ unsigned long flags;
+
+ /* report descriptor length must be > 0 */
+ if (desc_length <= 0)
+ return -EINVAL;
+
+ spin_lock_irqsave(&dev->lock, flags);
+ /* replace HID if one already exists with this ID */
+ hid = acc_hid_get(&dev->hid_list, id);
+ if (!hid)
+ hid = acc_hid_get(&dev->new_hid_list, id);
+ if (hid)
+ list_move(&hid->list, &dev->dead_hid_list);
+
+ hid = acc_hid_new(dev, id, desc_length);
+ if (!hid) {
+ spin_unlock_irqrestore(&dev->lock, flags);
+ return -ENOMEM;
+ }
+
+ list_add(&hid->list, &dev->new_hid_list);
+ spin_unlock_irqrestore(&dev->lock, flags);
+
+ /* schedule work to register the HID device */
+ schedule_work(&dev->hid_work);
+ return 0;
+}
+
+static int acc_unregister_hid(struct acc_dev *dev, int id)
+{
+ struct acc_hid_dev *hid;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->lock, flags);
+ hid = acc_hid_get(&dev->hid_list, id);
+ if (!hid)
+ hid = acc_hid_get(&dev->new_hid_list, id);
+ if (!hid) {
+ spin_unlock_irqrestore(&dev->lock, flags);
+ return -EINVAL;
+ }
+
+ list_move(&hid->list, &dev->dead_hid_list);
+ spin_unlock_irqrestore(&dev->lock, flags);
+
+ schedule_work(&dev->hid_work);
+ return 0;
+}
+
+static int create_bulk_endpoints(struct acc_dev *dev,
+ struct usb_endpoint_descriptor *in_desc,
+ struct usb_endpoint_descriptor *out_desc)
+{
+ struct usb_composite_dev *cdev = dev->cdev;
+ struct usb_request *req;
+ struct usb_ep *ep;
+ int i;
+
+ DBG(cdev, "create_bulk_endpoints dev: %p\n", dev);
+
+ ep = usb_ep_autoconfig(cdev->gadget, in_desc);
+ if (!ep) {
+ DBG(cdev, "usb_ep_autoconfig for ep_in failed\n");
+ return -ENODEV;
+ }
+ DBG(cdev, "usb_ep_autoconfig for ep_in got %s\n", ep->name);
+ ep->driver_data = dev; /* claim the endpoint */
+ dev->ep_in = ep;
+
+ ep = usb_ep_autoconfig(cdev->gadget, out_desc);
+ if (!ep) {
+ DBG(cdev, "usb_ep_autoconfig for ep_out failed\n");
+ return -ENODEV;
+ }
+ DBG(cdev, "usb_ep_autoconfig for ep_out got %s\n", ep->name);
+ ep->driver_data = dev; /* claim the endpoint */
+ dev->ep_out = ep;
+
+ /* now allocate requests for our endpoints */
+ for (i = 0; i < TX_REQ_MAX; i++) {
+ req = acc_request_new(dev->ep_in, BULK_BUFFER_SIZE);
+ if (!req)
+ goto fail;
+ req->complete = acc_complete_in;
+ req_put(dev, &dev->tx_idle, req);
+ }
+ for (i = 0; i < RX_REQ_MAX; i++) {
+ req = acc_request_new(dev->ep_out, BULK_BUFFER_SIZE);
+ if (!req)
+ goto fail;
+ req->complete = acc_complete_out;
+ dev->rx_req[i] = req;
+ }
+
+ return 0;
+
+fail:
+ pr_err("acc_bind() could not allocate requests\n");
+ while ((req = req_get(dev, &dev->tx_idle)))
+ acc_request_free(req, dev->ep_in);
+ for (i = 0; i < RX_REQ_MAX; i++)
+ acc_request_free(dev->rx_req[i], dev->ep_out);
+ return -1;
+}
+
+static ssize_t acc_read(struct file *fp, char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct acc_dev *dev = fp->private_data;
+ struct usb_request *req;
+ ssize_t r = count;
+ unsigned xfer;
+ int ret = 0;
+
+ pr_debug("acc_read(%zu)\n", count);
+
+ if (dev->disconnected) {
+ pr_debug("acc_read disconnected");
+ return -ENODEV;
+ }
+
+ if (count > BULK_BUFFER_SIZE)
+ count = BULK_BUFFER_SIZE;
+
+ /* we will block until we're online */
+ pr_debug("acc_read: waiting for online\n");
+ ret = wait_event_interruptible(dev->read_wq, dev->online);
+ if (ret < 0) {
+ r = ret;
+ goto done;
+ }
+
+ if (dev->rx_done) {
+ // last req cancelled. try to get it.
+ req = dev->rx_req[0];
+ goto copy_data;
+ }
+
+requeue_req:
+ /* queue a request */
+ req = dev->rx_req[0];
+ req->length = count;
+ dev->rx_done = 0;
+ ret = usb_ep_queue(dev->ep_out, req, GFP_KERNEL);
+ if (ret < 0) {
+ r = -EIO;
+ goto done;
+ } else {
+ pr_debug("rx %p queue\n", req);
+ }
+
+ /* wait for a request to complete */
+ ret = wait_event_interruptible(dev->read_wq, dev->rx_done);
+ if (ret < 0) {
+ r = ret;
+ ret = usb_ep_dequeue(dev->ep_out, req);
+ if (ret != 0) {
+ // cancel failed. There can be a data already received.
+ // it will be retrieved in the next read.
+ pr_debug("acc_read: cancelling failed %d", ret);
+ }
+ goto done;
+ }
+
+copy_data:
+ dev->rx_done = 0;
+ if (dev->online) {
+ /* If we got a 0-len packet, throw it back and try again. */
+ if (req->actual == 0)
+ goto requeue_req;
+
+ pr_debug("rx %p %u\n", req, req->actual);
+ xfer = (req->actual < count) ? req->actual : count;
+ r = xfer;
+ if (copy_to_user(buf, req->buf, xfer))
+ r = -EFAULT;
+ } else
+ r = -EIO;
+
+done:
+ pr_debug("acc_read returning %zd\n", r);
+ return r;
+}
+
+static ssize_t acc_write(struct file *fp, const char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct acc_dev *dev = fp->private_data;
+ struct usb_request *req = 0;
+ ssize_t r = count;
+ unsigned xfer;
+ int ret;
+
+ pr_debug("acc_write(%zu)\n", count);
+
+ if (!dev->online || dev->disconnected) {
+ pr_debug("acc_write disconnected or not online");
+ return -ENODEV;
+ }
+
+ while (count > 0) {
+ if (!dev->online) {
+ pr_debug("acc_write dev->error\n");
+ r = -EIO;
+ break;
+ }
+
+ /* get an idle tx request to use */
+ req = 0;
+ ret = wait_event_interruptible(dev->write_wq,
+ ((req = req_get(dev, &dev->tx_idle)) || !dev->online));
+ if (!req) {
+ r = ret;
+ break;
+ }
+
+ if (count > BULK_BUFFER_SIZE) {
+ xfer = BULK_BUFFER_SIZE;
+ /* ZLP, They will be more TX requests so not yet. */
+ req->zero = 0;
+ } else {
+ xfer = count;
+ /* If the data length is a multple of the
+ * maxpacket size then send a zero length packet(ZLP).
+ */
+ req->zero = ((xfer % dev->ep_in->maxpacket) == 0);
+ }
+ if (copy_from_user(req->buf, buf, xfer)) {
+ r = -EFAULT;
+ break;
+ }
+
+ req->length = xfer;
+ ret = usb_ep_queue(dev->ep_in, req, GFP_KERNEL);
+ if (ret < 0) {
+ pr_debug("acc_write: xfer error %d\n", ret);
+ r = -EIO;
+ break;
+ }
+
+ buf += xfer;
+ count -= xfer;
+
+ /* zero this so we don't try to free it on error exit */
+ req = 0;
+ }
+
+ if (req)
+ req_put(dev, &dev->tx_idle, req);
+
+ pr_debug("acc_write returning %zd\n", r);
+ return r;
+}
+
+static long acc_ioctl(struct file *fp, unsigned code, unsigned long value)
+{
+ struct acc_dev *dev = fp->private_data;
+ char *src = NULL;
+ int ret;
+
+ switch (code) {
+ case ACCESSORY_GET_STRING_MANUFACTURER:
+ src = dev->manufacturer;
+ break;
+ case ACCESSORY_GET_STRING_MODEL:
+ src = dev->model;
+ break;
+ case ACCESSORY_GET_STRING_DESCRIPTION:
+ src = dev->description;
+ break;
+ case ACCESSORY_GET_STRING_VERSION:
+ src = dev->version;
+ break;
+ case ACCESSORY_GET_STRING_URI:
+ src = dev->uri;
+ break;
+ case ACCESSORY_GET_STRING_SERIAL:
+ src = dev->serial;
+ break;
+ case ACCESSORY_IS_START_REQUESTED:
+ return dev->start_requested;
+ case ACCESSORY_GET_AUDIO_MODE:
+ return dev->audio_mode;
+ }
+ if (!src)
+ return -EINVAL;
+
+ ret = strlen(src) + 1;
+ if (copy_to_user((void __user *)value, src, ret))
+ ret = -EFAULT;
+ return ret;
+}
+
+static int acc_open(struct inode *ip, struct file *fp)
+{
+ printk(KERN_INFO "acc_open\n");
+ if (atomic_xchg(&_acc_dev->open_excl, 1))
+ return -EBUSY;
+
+ _acc_dev->disconnected = 0;
+ fp->private_data = _acc_dev;
+ return 0;
+}
+
+static int acc_release(struct inode *ip, struct file *fp)
+{
+ printk(KERN_INFO "acc_release\n");
+
+ WARN_ON(!atomic_xchg(&_acc_dev->open_excl, 0));
+ /* indicate that we are disconnected
+ * still could be online so don't touch online flag
+ */
+ _acc_dev->disconnected = 1;
+ return 0;
+}
+
+/* file operations for /dev/usb_accessory */
+static const struct file_operations acc_fops = {
+ .owner = THIS_MODULE,
+ .read = acc_read,
+ .write = acc_write,
+ .unlocked_ioctl = acc_ioctl,
+ .open = acc_open,
+ .release = acc_release,
+};
+
+static int acc_hid_probe(struct hid_device *hdev,
+ const struct hid_device_id *id)
+{
+ int ret;
+
+ ret = hid_parse(hdev);
+ if (ret)
+ return ret;
+ return hid_hw_start(hdev, HID_CONNECT_DEFAULT);
+}
+
+static struct miscdevice acc_device = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "usb_accessory",
+ .fops = &acc_fops,
+};
+
+static const struct hid_device_id acc_hid_table[] = {
+ { HID_USB_DEVICE(HID_ANY_ID, HID_ANY_ID) },
+ { }
+};
+
+static struct hid_driver acc_hid_driver = {
+ .name = "USB accessory",
+ .id_table = acc_hid_table,
+ .probe = acc_hid_probe,
+};
+
+int acc_ctrlrequest(struct usb_composite_dev *cdev,
+ const struct usb_ctrlrequest *ctrl)
+{
+ struct acc_dev *dev = _acc_dev;
+ int value = -EOPNOTSUPP;
+ struct acc_hid_dev *hid;
+ int offset;
+ u8 b_requestType = ctrl->bRequestType;
+ u8 b_request = ctrl->bRequest;
+ u16 w_index = le16_to_cpu(ctrl->wIndex);
+ u16 w_value = le16_to_cpu(ctrl->wValue);
+ u16 w_length = le16_to_cpu(ctrl->wLength);
+ unsigned long flags;
+
+/*
+ printk(KERN_INFO "acc_ctrlrequest "
+ "%02x.%02x v%04x i%04x l%u\n",
+ b_requestType, b_request,
+ w_value, w_index, w_length);
+*/
+
+ if (b_requestType == (USB_DIR_OUT | USB_TYPE_VENDOR)) {
+ if (b_request == ACCESSORY_START) {
+ dev->start_requested = 1;
+ schedule_delayed_work(
+ &dev->start_work, msecs_to_jiffies(10));
+ value = 0;
+ } else if (b_request == ACCESSORY_SEND_STRING) {
+ dev->string_index = w_index;
+ cdev->gadget->ep0->driver_data = dev;
+ cdev->req->complete = acc_complete_set_string;
+ value = w_length;
+ } else if (b_request == ACCESSORY_SET_AUDIO_MODE &&
+ w_index == 0 && w_length == 0) {
+ dev->audio_mode = w_value;
+ value = 0;
+ } else if (b_request == ACCESSORY_REGISTER_HID) {
+ value = acc_register_hid(dev, w_value, w_index);
+ } else if (b_request == ACCESSORY_UNREGISTER_HID) {
+ value = acc_unregister_hid(dev, w_value);
+ } else if (b_request == ACCESSORY_SET_HID_REPORT_DESC) {
+ spin_lock_irqsave(&dev->lock, flags);
+ hid = acc_hid_get(&dev->new_hid_list, w_value);
+ spin_unlock_irqrestore(&dev->lock, flags);
+ if (!hid) {
+ value = -EINVAL;
+ goto err;
+ }
+ offset = w_index;
+ if (offset != hid->report_desc_offset
+ || offset + w_length > hid->report_desc_len) {
+ value = -EINVAL;
+ goto err;
+ }
+ cdev->req->context = hid;
+ cdev->req->complete = acc_complete_set_hid_report_desc;
+ value = w_length;
+ } else if (b_request == ACCESSORY_SEND_HID_EVENT) {
+ spin_lock_irqsave(&dev->lock, flags);
+ hid = acc_hid_get(&dev->hid_list, w_value);
+ spin_unlock_irqrestore(&dev->lock, flags);
+ if (!hid) {
+ value = -EINVAL;
+ goto err;
+ }
+ cdev->req->context = hid;
+ cdev->req->complete = acc_complete_send_hid_event;
+ value = w_length;
+ }
+ } else if (b_requestType == (USB_DIR_IN | USB_TYPE_VENDOR)) {
+ if (b_request == ACCESSORY_GET_PROTOCOL) {
+ *((u16 *)cdev->req->buf) = PROTOCOL_VERSION;
+ value = sizeof(u16);
+
+ /* clear any string left over from a previous session */
+ memset(dev->manufacturer, 0, sizeof(dev->manufacturer));
+ memset(dev->model, 0, sizeof(dev->model));
+ memset(dev->description, 0, sizeof(dev->description));
+ memset(dev->version, 0, sizeof(dev->version));
+ memset(dev->uri, 0, sizeof(dev->uri));
+ memset(dev->serial, 0, sizeof(dev->serial));
+ dev->start_requested = 0;
+ dev->audio_mode = 0;
+ }
+ }
+
+ if (value >= 0) {
+ cdev->req->zero = 0;
+ cdev->req->length = value;
+ value = usb_ep_queue(cdev->gadget->ep0, cdev->req, GFP_ATOMIC);
+ if (value < 0)
+ ERROR(cdev, "%s setup response queue error\n",
+ __func__);
+ }
+
+err:
+ if (value == -EOPNOTSUPP)
+ VDBG(cdev,
+ "unknown class-specific control req "
+ "%02x.%02x v%04x i%04x l%u\n",
+ ctrl->bRequestType, ctrl->bRequest,
+ w_value, w_index, w_length);
+ return value;
+}
+EXPORT_SYMBOL_GPL(acc_ctrlrequest);
+
+static int
+acc_function_bind_configfs(struct usb_configuration *c, struct usb_function *f)
+{
+ struct usb_composite_dev *cdev = c->cdev;
+ struct acc_dev *dev = func_to_dev(f);
+ int id;
+ int ret;
+
+ DBG(cdev, "acc_function_bind dev: %p\n", dev);
+
+ if (acc_string_defs[INTERFACE_STRING_INDEX].id == 0) {
+ ret = usb_string_id(c->cdev);
+ if (ret < 0)
+ return ret;
+ acc_string_defs[INTERFACE_STRING_INDEX].id = ret;
+ acc_interface_desc.iInterface = ret;
+ }
+ dev->cdev = c->cdev;
+
+ ret = hid_register_driver(&acc_hid_driver);
+ if (ret)
+ return ret;
+
+ dev->start_requested = 0;
+
+ /* allocate interface ID(s) */
+ id = usb_interface_id(c, f);
+ if (id < 0)
+ return id;
+ acc_interface_desc.bInterfaceNumber = id;
+
+ /* allocate endpoints */
+ ret = create_bulk_endpoints(dev, &acc_fullspeed_in_desc,
+ &acc_fullspeed_out_desc);
+ if (ret)
+ return ret;
+
+ /* support high speed hardware */
+ if (gadget_is_dualspeed(c->cdev->gadget)) {
+ acc_highspeed_in_desc.bEndpointAddress =
+ acc_fullspeed_in_desc.bEndpointAddress;
+ acc_highspeed_out_desc.bEndpointAddress =
+ acc_fullspeed_out_desc.bEndpointAddress;
+ }
+
+ DBG(cdev, "%s speed %s: IN/%s, OUT/%s\n",
+ gadget_is_dualspeed(c->cdev->gadget) ? "dual" : "full",
+ f->name, dev->ep_in->name, dev->ep_out->name);
+ return 0;
+}
+
+static void
+kill_all_hid_devices(struct acc_dev *dev)
+{
+ struct acc_hid_dev *hid;
+ struct list_head *entry, *temp;
+ unsigned long flags;
+
+ /* do nothing if usb accessory device doesn't exist */
+ if (!dev)
+ return;
+
+ spin_lock_irqsave(&dev->lock, flags);
+ list_for_each_safe(entry, temp, &dev->hid_list) {
+ hid = list_entry(entry, struct acc_hid_dev, list);
+ list_del(&hid->list);
+ list_add(&hid->list, &dev->dead_hid_list);
+ }
+ list_for_each_safe(entry, temp, &dev->new_hid_list) {
+ hid = list_entry(entry, struct acc_hid_dev, list);
+ list_del(&hid->list);
+ list_add(&hid->list, &dev->dead_hid_list);
+ }
+ spin_unlock_irqrestore(&dev->lock, flags);
+
+ schedule_work(&dev->hid_work);
+}
+
+static void
+acc_hid_unbind(struct acc_dev *dev)
+{
+ hid_unregister_driver(&acc_hid_driver);
+ kill_all_hid_devices(dev);
+}
+
+static void
+acc_function_unbind(struct usb_configuration *c, struct usb_function *f)
+{
+ struct acc_dev *dev = func_to_dev(f);
+ struct usb_request *req;
+ int i;
+
+ dev->online = 0; /* clear online flag */
+ wake_up(&dev->read_wq); /* unblock reads on closure */
+ wake_up(&dev->write_wq); /* likewise for writes */
+
+ while ((req = req_get(dev, &dev->tx_idle)))
+ acc_request_free(req, dev->ep_in);
+ for (i = 0; i < RX_REQ_MAX; i++)
+ acc_request_free(dev->rx_req[i], dev->ep_out);
+
+ acc_hid_unbind(dev);
+}
+
+static void acc_start_work(struct work_struct *data)
+{
+ char *envp[2] = { "ACCESSORY=START", NULL };
+ kobject_uevent_env(&acc_device.this_device->kobj, KOBJ_CHANGE, envp);
+}
+
+static int acc_hid_init(struct acc_hid_dev *hdev)
+{
+ struct hid_device *hid;
+ int ret;
+
+ hid = hid_allocate_device();
+ if (IS_ERR(hid))
+ return PTR_ERR(hid);
+
+ hid->ll_driver = &acc_hid_ll_driver;
+ hid->dev.parent = acc_device.this_device;
+
+ hid->bus = BUS_USB;
+ hid->vendor = HID_ANY_ID;
+ hid->product = HID_ANY_ID;
+ hid->driver_data = hdev;
+ ret = hid_add_device(hid);
+ if (ret) {
+ pr_err("can't add hid device: %d\n", ret);
+ hid_destroy_device(hid);
+ return ret;
+ }
+
+ hdev->hid = hid;
+ return 0;
+}
+
+static void acc_hid_delete(struct acc_hid_dev *hid)
+{
+ kfree(hid->report_desc);
+ kfree(hid);
+}
+
+static void acc_hid_work(struct work_struct *data)
+{
+ struct acc_dev *dev = _acc_dev;
+ struct list_head *entry, *temp;
+ struct acc_hid_dev *hid;
+ struct list_head new_list, dead_list;
+ unsigned long flags;
+
+ INIT_LIST_HEAD(&new_list);
+
+ spin_lock_irqsave(&dev->lock, flags);
+
+ /* copy hids that are ready for initialization to new_list */
+ list_for_each_safe(entry, temp, &dev->new_hid_list) {
+ hid = list_entry(entry, struct acc_hid_dev, list);
+ if (hid->report_desc_offset == hid->report_desc_len)
+ list_move(&hid->list, &new_list);
+ }
+
+ if (list_empty(&dev->dead_hid_list)) {
+ INIT_LIST_HEAD(&dead_list);
+ } else {
+ /* move all of dev->dead_hid_list to dead_list */
+ dead_list.prev = dev->dead_hid_list.prev;
+ dead_list.next = dev->dead_hid_list.next;
+ dead_list.next->prev = &dead_list;
+ dead_list.prev->next = &dead_list;
+ INIT_LIST_HEAD(&dev->dead_hid_list);
+ }
+
+ spin_unlock_irqrestore(&dev->lock, flags);
+
+ /* register new HID devices */
+ list_for_each_safe(entry, temp, &new_list) {
+ hid = list_entry(entry, struct acc_hid_dev, list);
+ if (acc_hid_init(hid)) {
+ pr_err("can't add HID device %p\n", hid);
+ acc_hid_delete(hid);
+ } else {
+ spin_lock_irqsave(&dev->lock, flags);
+ list_move(&hid->list, &dev->hid_list);
+ spin_unlock_irqrestore(&dev->lock, flags);
+ }
+ }
+
+ /* remove dead HID devices */
+ list_for_each_safe(entry, temp, &dead_list) {
+ hid = list_entry(entry, struct acc_hid_dev, list);
+ list_del(&hid->list);
+ if (hid->hid)
+ hid_destroy_device(hid->hid);
+ acc_hid_delete(hid);
+ }
+}
+
+static int acc_function_set_alt(struct usb_function *f,
+ unsigned intf, unsigned alt)
+{
+ struct acc_dev *dev = func_to_dev(f);
+ struct usb_composite_dev *cdev = f->config->cdev;
+ int ret;
+
+ DBG(cdev, "acc_function_set_alt intf: %d alt: %d\n", intf, alt);
+
+ ret = config_ep_by_speed(cdev->gadget, f, dev->ep_in);
+ if (ret)
+ return ret;
+
+ ret = usb_ep_enable(dev->ep_in);
+ if (ret)
+ return ret;
+
+ ret = config_ep_by_speed(cdev->gadget, f, dev->ep_out);
+ if (ret)
+ return ret;
+
+ ret = usb_ep_enable(dev->ep_out);
+ if (ret) {
+ usb_ep_disable(dev->ep_in);
+ return ret;
+ }
+
+ dev->online = 1;
+ dev->disconnected = 0; /* if online then not disconnected */
+
+ /* readers may be blocked waiting for us to go online */
+ wake_up(&dev->read_wq);
+ return 0;
+}
+
+static void acc_function_disable(struct usb_function *f)
+{
+ struct acc_dev *dev = func_to_dev(f);
+ struct usb_composite_dev *cdev = dev->cdev;
+
+ DBG(cdev, "acc_function_disable\n");
+ acc_set_disconnected(dev); /* this now only sets disconnected */
+ dev->online = 0; /* so now need to clear online flag here too */
+ usb_ep_disable(dev->ep_in);
+ usb_ep_disable(dev->ep_out);
+
+ /* readers may be blocked waiting for us to go online */
+ wake_up(&dev->read_wq);
+
+ VDBG(cdev, "%s disabled\n", dev->function.name);
+}
+
+static int acc_setup(void)
+{
+ struct acc_dev *dev;
+ int ret;
+
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ if (!dev)
+ return -ENOMEM;
+
+ spin_lock_init(&dev->lock);
+ init_waitqueue_head(&dev->read_wq);
+ init_waitqueue_head(&dev->write_wq);
+ atomic_set(&dev->open_excl, 0);
+ INIT_LIST_HEAD(&dev->tx_idle);
+ INIT_LIST_HEAD(&dev->hid_list);
+ INIT_LIST_HEAD(&dev->new_hid_list);
+ INIT_LIST_HEAD(&dev->dead_hid_list);
+ INIT_DELAYED_WORK(&dev->start_work, acc_start_work);
+ INIT_WORK(&dev->hid_work, acc_hid_work);
+
+ /* _acc_dev must be set before calling usb_gadget_register_driver */
+ _acc_dev = dev;
+
+ ret = misc_register(&acc_device);
+ if (ret)
+ goto err;
+
+ return 0;
+
+err:
+ kfree(dev);
+ pr_err("USB accessory gadget driver failed to initialize\n");
+ return ret;
+}
+
+void acc_disconnect(void)
+{
+ /* unregister all HID devices if USB is disconnected */
+ kill_all_hid_devices(_acc_dev);
+}
+EXPORT_SYMBOL_GPL(acc_disconnect);
+
+static void acc_cleanup(void)
+{
+ misc_deregister(&acc_device);
+ kfree(_acc_dev);
+ _acc_dev = NULL;
+}
+static struct acc_instance *to_acc_instance(struct config_item *item)
+{
+ return container_of(to_config_group(item), struct acc_instance,
+ func_inst.group);
+}
+
+static void acc_attr_release(struct config_item *item)
+{
+ struct acc_instance *fi_acc = to_acc_instance(item);
+
+ usb_put_function_instance(&fi_acc->func_inst);
+}
+
+static struct configfs_item_operations acc_item_ops = {
+ .release = acc_attr_release,
+};
+
+static struct config_item_type acc_func_type = {
+ .ct_item_ops = &acc_item_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct acc_instance *to_fi_acc(struct usb_function_instance *fi)
+{
+ return container_of(fi, struct acc_instance, func_inst);
+}
+
+static int acc_set_inst_name(struct usb_function_instance *fi, const char *name)
+{
+ struct acc_instance *fi_acc;
+ char *ptr;
+ int name_len;
+
+ name_len = strlen(name) + 1;
+ if (name_len > MAX_INST_NAME_LEN)
+ return -ENAMETOOLONG;
+
+ ptr = kstrndup(name, name_len, GFP_KERNEL);
+ if (!ptr)
+ return -ENOMEM;
+
+ fi_acc = to_fi_acc(fi);
+ fi_acc->name = ptr;
+ return 0;
+}
+
+static void acc_free_inst(struct usb_function_instance *fi)
+{
+ struct acc_instance *fi_acc;
+
+ fi_acc = to_fi_acc(fi);
+ kfree(fi_acc->name);
+ acc_cleanup();
+}
+
+static struct usb_function_instance *acc_alloc_inst(void)
+{
+ struct acc_instance *fi_acc;
+ struct acc_dev *dev;
+ int err;
+
+ fi_acc = kzalloc(sizeof(*fi_acc), GFP_KERNEL);
+ if (!fi_acc)
+ return ERR_PTR(-ENOMEM);
+ fi_acc->func_inst.set_inst_name = acc_set_inst_name;
+ fi_acc->func_inst.free_func_inst = acc_free_inst;
+
+ err = acc_setup();
+ if (err) {
+ kfree(fi_acc);
+ pr_err("Error setting ACCESSORY\n");
+ return ERR_PTR(err);
+ }
+
+ config_group_init_type_name(&fi_acc->func_inst.group,
+ "", &acc_func_type);
+ dev = _acc_dev;
+ return &fi_acc->func_inst;
+}
+
+static void acc_free(struct usb_function *f)
+{
+/*NO-OP: no function specific resource allocation in mtp_alloc*/
+}
+
+int acc_ctrlrequest_configfs(struct usb_function *f,
+ const struct usb_ctrlrequest *ctrl) {
+ if (f->config != NULL && f->config->cdev != NULL)
+ return acc_ctrlrequest(f->config->cdev, ctrl);
+ else
+ return -1;
+}
+
+static struct usb_function *acc_alloc(struct usb_function_instance *fi)
+{
+ struct acc_dev *dev = _acc_dev;
+
+ pr_info("acc_alloc\n");
+
+ dev->function.name = "accessory";
+ dev->function.strings = acc_strings,
+ dev->function.fs_descriptors = fs_acc_descs;
+ dev->function.hs_descriptors = hs_acc_descs;
+ dev->function.bind = acc_function_bind_configfs;
+ dev->function.unbind = acc_function_unbind;
+ dev->function.set_alt = acc_function_set_alt;
+ dev->function.disable = acc_function_disable;
+ dev->function.free_func = acc_free;
+ dev->function.setup = acc_ctrlrequest_configfs;
+
+ return &dev->function;
+}
+DECLARE_USB_FUNCTION_INIT(accessory, acc_alloc_inst, acc_alloc);
+MODULE_LICENSE("GPL");
diff --git a/drivers/usb/gadget/function/f_audio_source.c b/drivers/usb/gadget/function/f_audio_source.c
new file mode 100644
index 000000000000..bcd817439dbf
--- /dev/null
+++ b/drivers/usb/gadget/function/f_audio_source.c
@@ -0,0 +1,1060 @@
+/*
+ * Gadget Function Driver for USB audio source device
+ *
+ * Copyright (C) 2012 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/device.h>
+#include <linux/usb/audio.h>
+#include <linux/wait.h>
+#include <sound/core.h>
+#include <sound/initval.h>
+#include <sound/pcm.h>
+
+#include <linux/usb.h>
+#include <linux/usb_usual.h>
+#include <linux/usb/ch9.h>
+#include <linux/configfs.h>
+#include <linux/usb/composite.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#define SAMPLE_RATE 44100
+#define FRAMES_PER_MSEC (SAMPLE_RATE / 1000)
+
+#define IN_EP_MAX_PACKET_SIZE 256
+
+/* Number of requests to allocate */
+#define IN_EP_REQ_COUNT 4
+
+#define AUDIO_AC_INTERFACE 0
+#define AUDIO_AS_INTERFACE 1
+#define AUDIO_NUM_INTERFACES 2
+#define MAX_INST_NAME_LEN 40
+
+/* B.3.1 Standard AC Interface Descriptor */
+static struct usb_interface_descriptor ac_interface_desc = {
+ .bLength = USB_DT_INTERFACE_SIZE,
+ .bDescriptorType = USB_DT_INTERFACE,
+ .bNumEndpoints = 0,
+ .bInterfaceClass = USB_CLASS_AUDIO,
+ .bInterfaceSubClass = USB_SUBCLASS_AUDIOCONTROL,
+};
+
+DECLARE_UAC_AC_HEADER_DESCRIPTOR(2);
+
+#define UAC_DT_AC_HEADER_LENGTH UAC_DT_AC_HEADER_SIZE(AUDIO_NUM_INTERFACES)
+/* 1 input terminal, 1 output terminal and 1 feature unit */
+#define UAC_DT_TOTAL_LENGTH (UAC_DT_AC_HEADER_LENGTH \
+ + UAC_DT_INPUT_TERMINAL_SIZE + UAC_DT_OUTPUT_TERMINAL_SIZE \
+ + UAC_DT_FEATURE_UNIT_SIZE(0))
+/* B.3.2 Class-Specific AC Interface Descriptor */
+static struct uac1_ac_header_descriptor_2 ac_header_desc = {
+ .bLength = UAC_DT_AC_HEADER_LENGTH,
+ .bDescriptorType = USB_DT_CS_INTERFACE,
+ .bDescriptorSubtype = UAC_HEADER,
+ .bcdADC = __constant_cpu_to_le16(0x0100),
+ .wTotalLength = __constant_cpu_to_le16(UAC_DT_TOTAL_LENGTH),
+ .bInCollection = AUDIO_NUM_INTERFACES,
+ .baInterfaceNr = {
+ [0] = AUDIO_AC_INTERFACE,
+ [1] = AUDIO_AS_INTERFACE,
+ }
+};
+
+#define INPUT_TERMINAL_ID 1
+static struct uac_input_terminal_descriptor input_terminal_desc = {
+ .bLength = UAC_DT_INPUT_TERMINAL_SIZE,
+ .bDescriptorType = USB_DT_CS_INTERFACE,
+ .bDescriptorSubtype = UAC_INPUT_TERMINAL,
+ .bTerminalID = INPUT_TERMINAL_ID,
+ .wTerminalType = UAC_INPUT_TERMINAL_MICROPHONE,
+ .bAssocTerminal = 0,
+ .wChannelConfig = 0x3,
+};
+
+DECLARE_UAC_FEATURE_UNIT_DESCRIPTOR(0);
+
+#define FEATURE_UNIT_ID 2
+static struct uac_feature_unit_descriptor_0 feature_unit_desc = {
+ .bLength = UAC_DT_FEATURE_UNIT_SIZE(0),
+ .bDescriptorType = USB_DT_CS_INTERFACE,
+ .bDescriptorSubtype = UAC_FEATURE_UNIT,
+ .bUnitID = FEATURE_UNIT_ID,
+ .bSourceID = INPUT_TERMINAL_ID,
+ .bControlSize = 2,
+};
+
+#define OUTPUT_TERMINAL_ID 3
+static struct uac1_output_terminal_descriptor output_terminal_desc = {
+ .bLength = UAC_DT_OUTPUT_TERMINAL_SIZE,
+ .bDescriptorType = USB_DT_CS_INTERFACE,
+ .bDescriptorSubtype = UAC_OUTPUT_TERMINAL,
+ .bTerminalID = OUTPUT_TERMINAL_ID,
+ .wTerminalType = UAC_TERMINAL_STREAMING,
+ .bAssocTerminal = FEATURE_UNIT_ID,
+ .bSourceID = FEATURE_UNIT_ID,
+};
+
+/* B.4.1 Standard AS Interface Descriptor */
+static struct usb_interface_descriptor as_interface_alt_0_desc = {
+ .bLength = USB_DT_INTERFACE_SIZE,
+ .bDescriptorType = USB_DT_INTERFACE,
+ .bAlternateSetting = 0,
+ .bNumEndpoints = 0,
+ .bInterfaceClass = USB_CLASS_AUDIO,
+ .bInterfaceSubClass = USB_SUBCLASS_AUDIOSTREAMING,
+};
+
+static struct usb_interface_descriptor as_interface_alt_1_desc = {
+ .bLength = USB_DT_INTERFACE_SIZE,
+ .bDescriptorType = USB_DT_INTERFACE,
+ .bAlternateSetting = 1,
+ .bNumEndpoints = 1,
+ .bInterfaceClass = USB_CLASS_AUDIO,
+ .bInterfaceSubClass = USB_SUBCLASS_AUDIOSTREAMING,
+};
+
+/* B.4.2 Class-Specific AS Interface Descriptor */
+static struct uac1_as_header_descriptor as_header_desc = {
+ .bLength = UAC_DT_AS_HEADER_SIZE,
+ .bDescriptorType = USB_DT_CS_INTERFACE,
+ .bDescriptorSubtype = UAC_AS_GENERAL,
+ .bTerminalLink = INPUT_TERMINAL_ID,
+ .bDelay = 1,
+ .wFormatTag = UAC_FORMAT_TYPE_I_PCM,
+};
+
+DECLARE_UAC_FORMAT_TYPE_I_DISCRETE_DESC(1);
+
+static struct uac_format_type_i_discrete_descriptor_1 as_type_i_desc = {
+ .bLength = UAC_FORMAT_TYPE_I_DISCRETE_DESC_SIZE(1),
+ .bDescriptorType = USB_DT_CS_INTERFACE,
+ .bDescriptorSubtype = UAC_FORMAT_TYPE,
+ .bFormatType = UAC_FORMAT_TYPE_I,
+ .bSubframeSize = 2,
+ .bBitResolution = 16,
+ .bSamFreqType = 1,
+};
+
+/* Standard ISO IN Endpoint Descriptor for highspeed */
+static struct usb_endpoint_descriptor hs_as_in_ep_desc = {
+ .bLength = USB_DT_ENDPOINT_AUDIO_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_IN,
+ .bmAttributes = USB_ENDPOINT_SYNC_SYNC
+ | USB_ENDPOINT_XFER_ISOC,
+ .wMaxPacketSize = __constant_cpu_to_le16(IN_EP_MAX_PACKET_SIZE),
+ .bInterval = 4, /* poll 1 per millisecond */
+};
+
+/* Standard ISO IN Endpoint Descriptor for highspeed */
+static struct usb_endpoint_descriptor fs_as_in_ep_desc = {
+ .bLength = USB_DT_ENDPOINT_AUDIO_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_IN,
+ .bmAttributes = USB_ENDPOINT_SYNC_SYNC
+ | USB_ENDPOINT_XFER_ISOC,
+ .wMaxPacketSize = __constant_cpu_to_le16(IN_EP_MAX_PACKET_SIZE),
+ .bInterval = 1, /* poll 1 per millisecond */
+};
+
+/* Class-specific AS ISO OUT Endpoint Descriptor */
+static struct uac_iso_endpoint_descriptor as_iso_in_desc = {
+ .bLength = UAC_ISO_ENDPOINT_DESC_SIZE,
+ .bDescriptorType = USB_DT_CS_ENDPOINT,
+ .bDescriptorSubtype = UAC_EP_GENERAL,
+ .bmAttributes = 1,
+ .bLockDelayUnits = 1,
+ .wLockDelay = __constant_cpu_to_le16(1),
+};
+
+static struct usb_descriptor_header *hs_audio_desc[] = {
+ (struct usb_descriptor_header *)&ac_interface_desc,
+ (struct usb_descriptor_header *)&ac_header_desc,
+
+ (struct usb_descriptor_header *)&input_terminal_desc,
+ (struct usb_descriptor_header *)&output_terminal_desc,
+ (struct usb_descriptor_header *)&feature_unit_desc,
+
+ (struct usb_descriptor_header *)&as_interface_alt_0_desc,
+ (struct usb_descriptor_header *)&as_interface_alt_1_desc,
+ (struct usb_descriptor_header *)&as_header_desc,
+
+ (struct usb_descriptor_header *)&as_type_i_desc,
+
+ (struct usb_descriptor_header *)&hs_as_in_ep_desc,
+ (struct usb_descriptor_header *)&as_iso_in_desc,
+ NULL,
+};
+
+static struct usb_descriptor_header *fs_audio_desc[] = {
+ (struct usb_descriptor_header *)&ac_interface_desc,
+ (struct usb_descriptor_header *)&ac_header_desc,
+
+ (struct usb_descriptor_header *)&input_terminal_desc,
+ (struct usb_descriptor_header *)&output_terminal_desc,
+ (struct usb_descriptor_header *)&feature_unit_desc,
+
+ (struct usb_descriptor_header *)&as_interface_alt_0_desc,
+ (struct usb_descriptor_header *)&as_interface_alt_1_desc,
+ (struct usb_descriptor_header *)&as_header_desc,
+
+ (struct usb_descriptor_header *)&as_type_i_desc,
+
+ (struct usb_descriptor_header *)&fs_as_in_ep_desc,
+ (struct usb_descriptor_header *)&as_iso_in_desc,
+ NULL,
+};
+
+static struct snd_pcm_hardware audio_hw_info = {
+ .info = SNDRV_PCM_INFO_MMAP |
+ SNDRV_PCM_INFO_MMAP_VALID |
+ SNDRV_PCM_INFO_BATCH |
+ SNDRV_PCM_INFO_INTERLEAVED |
+ SNDRV_PCM_INFO_BLOCK_TRANSFER,
+
+ .formats = SNDRV_PCM_FMTBIT_S16_LE,
+ .channels_min = 2,
+ .channels_max = 2,
+ .rate_min = SAMPLE_RATE,
+ .rate_max = SAMPLE_RATE,
+
+ .buffer_bytes_max = 1024 * 1024,
+ .period_bytes_min = 64,
+ .period_bytes_max = 512 * 1024,
+ .periods_min = 2,
+ .periods_max = 1024,
+};
+
+/*-------------------------------------------------------------------------*/
+
+struct audio_source_config {
+ int card;
+ int device;
+};
+
+struct audio_dev {
+ struct usb_function func;
+ struct snd_card *card;
+ struct snd_pcm *pcm;
+ struct snd_pcm_substream *substream;
+
+ struct list_head idle_reqs;
+ struct usb_ep *in_ep;
+
+ spinlock_t lock;
+
+ /* beginning, end and current position in our buffer */
+ void *buffer_start;
+ void *buffer_end;
+ void *buffer_pos;
+
+ /* byte size of a "period" */
+ unsigned int period;
+ /* bytes sent since last call to snd_pcm_period_elapsed */
+ unsigned int period_offset;
+ /* time we started playing */
+ ktime_t start_time;
+ /* number of frames sent since start_time */
+ s64 frames_sent;
+ struct audio_source_config *config;
+};
+
+static inline struct audio_dev *func_to_audio(struct usb_function *f)
+{
+ return container_of(f, struct audio_dev, func);
+}
+
+/*-------------------------------------------------------------------------*/
+
+struct audio_source_instance {
+ struct usb_function_instance func_inst;
+ const char *name;
+ struct audio_source_config *config;
+ struct device *audio_device;
+};
+
+static void audio_source_attr_release(struct config_item *item);
+
+static struct configfs_item_operations audio_source_item_ops = {
+ .release = audio_source_attr_release,
+};
+
+static struct config_item_type audio_source_func_type = {
+ .ct_item_ops = &audio_source_item_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+static ssize_t audio_source_pcm_show(struct device *dev,
+ struct device_attribute *attr, char *buf);
+
+static DEVICE_ATTR(pcm, S_IRUGO, audio_source_pcm_show, NULL);
+
+static struct device_attribute *audio_source_function_attributes[] = {
+ &dev_attr_pcm,
+ NULL
+};
+
+/*--------------------------------------------------------------------------*/
+
+static struct usb_request *audio_request_new(struct usb_ep *ep, int buffer_size)
+{
+ struct usb_request *req = usb_ep_alloc_request(ep, GFP_KERNEL);
+ if (!req)
+ return NULL;
+
+ req->buf = kmalloc(buffer_size, GFP_KERNEL);
+ if (!req->buf) {
+ usb_ep_free_request(ep, req);
+ return NULL;
+ }
+ req->length = buffer_size;
+ return req;
+}
+
+static void audio_request_free(struct usb_request *req, struct usb_ep *ep)
+{
+ if (req) {
+ kfree(req->buf);
+ usb_ep_free_request(ep, req);
+ }
+}
+
+static void audio_req_put(struct audio_dev *audio, struct usb_request *req)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&audio->lock, flags);
+ list_add_tail(&req->list, &audio->idle_reqs);
+ spin_unlock_irqrestore(&audio->lock, flags);
+}
+
+static struct usb_request *audio_req_get(struct audio_dev *audio)
+{
+ unsigned long flags;
+ struct usb_request *req;
+
+ spin_lock_irqsave(&audio->lock, flags);
+ if (list_empty(&audio->idle_reqs)) {
+ req = 0;
+ } else {
+ req = list_first_entry(&audio->idle_reqs, struct usb_request,
+ list);
+ list_del(&req->list);
+ }
+ spin_unlock_irqrestore(&audio->lock, flags);
+ return req;
+}
+
+/* send the appropriate number of packets to match our bitrate */
+static void audio_send(struct audio_dev *audio)
+{
+ struct snd_pcm_runtime *runtime;
+ struct usb_request *req;
+ int length, length1, length2, ret;
+ s64 msecs;
+ s64 frames;
+ ktime_t now;
+
+ /* audio->substream will be null if we have been closed */
+ if (!audio->substream)
+ return;
+ /* audio->buffer_pos will be null if we have been stopped */
+ if (!audio->buffer_pos)
+ return;
+
+ runtime = audio->substream->runtime;
+
+ /* compute number of frames to send */
+ now = ktime_get();
+ msecs = ktime_to_ns(now) - ktime_to_ns(audio->start_time);
+ do_div(msecs, 1000000);
+ frames = msecs * SAMPLE_RATE;
+ do_div(frames, 1000);
+
+ /* Readjust our frames_sent if we fall too far behind.
+ * If we get too far behind it is better to drop some frames than
+ * to keep sending data too fast in an attempt to catch up.
+ */
+ if (frames - audio->frames_sent > 10 * FRAMES_PER_MSEC)
+ audio->frames_sent = frames - FRAMES_PER_MSEC;
+
+ frames -= audio->frames_sent;
+
+ /* We need to send something to keep the pipeline going */
+ if (frames <= 0)
+ frames = FRAMES_PER_MSEC;
+
+ while (frames > 0) {
+ req = audio_req_get(audio);
+ if (!req)
+ break;
+
+ length = frames_to_bytes(runtime, frames);
+ if (length > IN_EP_MAX_PACKET_SIZE)
+ length = IN_EP_MAX_PACKET_SIZE;
+
+ if (audio->buffer_pos + length > audio->buffer_end)
+ length1 = audio->buffer_end - audio->buffer_pos;
+ else
+ length1 = length;
+ memcpy(req->buf, audio->buffer_pos, length1);
+ if (length1 < length) {
+ /* Wrap around and copy remaining length
+ * at beginning of buffer.
+ */
+ length2 = length - length1;
+ memcpy(req->buf + length1, audio->buffer_start,
+ length2);
+ audio->buffer_pos = audio->buffer_start + length2;
+ } else {
+ audio->buffer_pos += length1;
+ if (audio->buffer_pos >= audio->buffer_end)
+ audio->buffer_pos = audio->buffer_start;
+ }
+
+ req->length = length;
+ ret = usb_ep_queue(audio->in_ep, req, GFP_ATOMIC);
+ if (ret < 0) {
+ pr_err("usb_ep_queue failed ret: %d\n", ret);
+ audio_req_put(audio, req);
+ break;
+ }
+
+ frames -= bytes_to_frames(runtime, length);
+ audio->frames_sent += bytes_to_frames(runtime, length);
+ }
+}
+
+static void audio_control_complete(struct usb_ep *ep, struct usb_request *req)
+{
+ /* nothing to do here */
+}
+
+static void audio_data_complete(struct usb_ep *ep, struct usb_request *req)
+{
+ struct audio_dev *audio = req->context;
+
+ pr_debug("audio_data_complete req->status %d req->actual %d\n",
+ req->status, req->actual);
+
+ audio_req_put(audio, req);
+
+ if (!audio->buffer_start || req->status)
+ return;
+
+ audio->period_offset += req->actual;
+ if (audio->period_offset >= audio->period) {
+ snd_pcm_period_elapsed(audio->substream);
+ audio->period_offset = 0;
+ }
+ audio_send(audio);
+}
+
+static int audio_set_endpoint_req(struct usb_function *f,
+ const struct usb_ctrlrequest *ctrl)
+{
+ int value = -EOPNOTSUPP;
+ u16 ep = le16_to_cpu(ctrl->wIndex);
+ u16 len = le16_to_cpu(ctrl->wLength);
+ u16 w_value = le16_to_cpu(ctrl->wValue);
+
+ pr_debug("bRequest 0x%x, w_value 0x%04x, len %d, endpoint %d\n",
+ ctrl->bRequest, w_value, len, ep);
+
+ switch (ctrl->bRequest) {
+ case UAC_SET_CUR:
+ case UAC_SET_MIN:
+ case UAC_SET_MAX:
+ case UAC_SET_RES:
+ value = len;
+ break;
+ default:
+ break;
+ }
+
+ return value;
+}
+
+static int audio_get_endpoint_req(struct usb_function *f,
+ const struct usb_ctrlrequest *ctrl)
+{
+ struct usb_composite_dev *cdev = f->config->cdev;
+ int value = -EOPNOTSUPP;
+ u8 ep = ((le16_to_cpu(ctrl->wIndex) >> 8) & 0xFF);
+ u16 len = le16_to_cpu(ctrl->wLength);
+ u16 w_value = le16_to_cpu(ctrl->wValue);
+ u8 *buf = cdev->req->buf;
+
+ pr_debug("bRequest 0x%x, w_value 0x%04x, len %d, endpoint %d\n",
+ ctrl->bRequest, w_value, len, ep);
+
+ if (w_value == UAC_EP_CS_ATTR_SAMPLE_RATE << 8) {
+ switch (ctrl->bRequest) {
+ case UAC_GET_CUR:
+ case UAC_GET_MIN:
+ case UAC_GET_MAX:
+ case UAC_GET_RES:
+ /* return our sample rate */
+ buf[0] = (u8)SAMPLE_RATE;
+ buf[1] = (u8)(SAMPLE_RATE >> 8);
+ buf[2] = (u8)(SAMPLE_RATE >> 16);
+ value = 3;
+ break;
+ default:
+ break;
+ }
+ }
+
+ return value;
+}
+
+static int
+audio_setup(struct usb_function *f, const struct usb_ctrlrequest *ctrl)
+{
+ struct usb_composite_dev *cdev = f->config->cdev;
+ struct usb_request *req = cdev->req;
+ int value = -EOPNOTSUPP;
+ u16 w_index = le16_to_cpu(ctrl->wIndex);
+ u16 w_value = le16_to_cpu(ctrl->wValue);
+ u16 w_length = le16_to_cpu(ctrl->wLength);
+
+ /* composite driver infrastructure handles everything; interface
+ * activation uses set_alt().
+ */
+ switch (ctrl->bRequestType) {
+ case USB_DIR_OUT | USB_TYPE_CLASS | USB_RECIP_ENDPOINT:
+ value = audio_set_endpoint_req(f, ctrl);
+ break;
+
+ case USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_ENDPOINT:
+ value = audio_get_endpoint_req(f, ctrl);
+ break;
+ }
+
+ /* respond with data transfer or status phase? */
+ if (value >= 0) {
+ pr_debug("audio req%02x.%02x v%04x i%04x l%d\n",
+ ctrl->bRequestType, ctrl->bRequest,
+ w_value, w_index, w_length);
+ req->zero = 0;
+ req->length = value;
+ req->complete = audio_control_complete;
+ value = usb_ep_queue(cdev->gadget->ep0, req, GFP_ATOMIC);
+ if (value < 0)
+ pr_err("audio response on err %d\n", value);
+ }
+
+ /* device either stalls (value < 0) or reports success */
+ return value;
+}
+
+static int audio_set_alt(struct usb_function *f, unsigned intf, unsigned alt)
+{
+ struct audio_dev *audio = func_to_audio(f);
+ struct usb_composite_dev *cdev = f->config->cdev;
+ int ret;
+
+ pr_debug("audio_set_alt intf %d, alt %d\n", intf, alt);
+
+ ret = config_ep_by_speed(cdev->gadget, f, audio->in_ep);
+ if (ret)
+ return ret;
+
+ usb_ep_enable(audio->in_ep);
+ return 0;
+}
+
+static void audio_disable(struct usb_function *f)
+{
+ struct audio_dev *audio = func_to_audio(f);
+
+ pr_debug("audio_disable\n");
+ usb_ep_disable(audio->in_ep);
+}
+
+static void audio_free_func(struct usb_function *f)
+{
+ /* no-op */
+}
+
+/*-------------------------------------------------------------------------*/
+
+static void audio_build_desc(struct audio_dev *audio)
+{
+ u8 *sam_freq;
+ int rate;
+
+ /* Set channel numbers */
+ input_terminal_desc.bNrChannels = 2;
+ as_type_i_desc.bNrChannels = 2;
+
+ /* Set sample rates */
+ rate = SAMPLE_RATE;
+ sam_freq = as_type_i_desc.tSamFreq[0];
+ memcpy(sam_freq, &rate, 3);
+}
+
+
+static int snd_card_setup(struct usb_configuration *c,
+ struct audio_source_config *config);
+static struct audio_source_instance *to_fi_audio_source(
+ const struct usb_function_instance *fi);
+
+
+/* audio function driver setup/binding */
+static int
+audio_bind(struct usb_configuration *c, struct usb_function *f)
+{
+ struct usb_composite_dev *cdev = c->cdev;
+ struct audio_dev *audio = func_to_audio(f);
+ int status;
+ struct usb_ep *ep;
+ struct usb_request *req;
+ int i;
+ int err;
+
+ if (IS_ENABLED(CONFIG_USB_CONFIGFS)) {
+ struct audio_source_instance *fi_audio =
+ to_fi_audio_source(f->fi);
+ struct audio_source_config *config =
+ fi_audio->config;
+
+ err = snd_card_setup(c, config);
+ if (err)
+ return err;
+ }
+
+ audio_build_desc(audio);
+
+ /* allocate instance-specific interface IDs, and patch descriptors */
+ status = usb_interface_id(c, f);
+ if (status < 0)
+ goto fail;
+ ac_interface_desc.bInterfaceNumber = status;
+
+ /* AUDIO_AC_INTERFACE */
+ ac_header_desc.baInterfaceNr[0] = status;
+
+ status = usb_interface_id(c, f);
+ if (status < 0)
+ goto fail;
+ as_interface_alt_0_desc.bInterfaceNumber = status;
+ as_interface_alt_1_desc.bInterfaceNumber = status;
+
+ /* AUDIO_AS_INTERFACE */
+ ac_header_desc.baInterfaceNr[1] = status;
+
+ status = -ENODEV;
+
+ /* allocate our endpoint */
+ ep = usb_ep_autoconfig(cdev->gadget, &fs_as_in_ep_desc);
+ if (!ep)
+ goto fail;
+ audio->in_ep = ep;
+ ep->driver_data = audio; /* claim */
+
+ if (gadget_is_dualspeed(c->cdev->gadget))
+ hs_as_in_ep_desc.bEndpointAddress =
+ fs_as_in_ep_desc.bEndpointAddress;
+
+ f->fs_descriptors = fs_audio_desc;
+ f->hs_descriptors = hs_audio_desc;
+
+ for (i = 0, status = 0; i < IN_EP_REQ_COUNT && status == 0; i++) {
+ req = audio_request_new(ep, IN_EP_MAX_PACKET_SIZE);
+ if (req) {
+ req->context = audio;
+ req->complete = audio_data_complete;
+ audio_req_put(audio, req);
+ } else
+ status = -ENOMEM;
+ }
+
+fail:
+ return status;
+}
+
+static void
+audio_unbind(struct usb_configuration *c, struct usb_function *f)
+{
+ struct audio_dev *audio = func_to_audio(f);
+ struct usb_request *req;
+
+ while ((req = audio_req_get(audio)))
+ audio_request_free(req, audio->in_ep);
+
+ snd_card_free_when_closed(audio->card);
+ audio->card = NULL;
+ audio->pcm = NULL;
+ audio->substream = NULL;
+ audio->in_ep = NULL;
+
+ if (IS_ENABLED(CONFIG_USB_CONFIGFS)) {
+ struct audio_source_instance *fi_audio =
+ to_fi_audio_source(f->fi);
+ struct audio_source_config *config =
+ fi_audio->config;
+
+ config->card = -1;
+ config->device = -1;
+ }
+}
+
+static void audio_pcm_playback_start(struct audio_dev *audio)
+{
+ audio->start_time = ktime_get();
+ audio->frames_sent = 0;
+ audio_send(audio);
+}
+
+static void audio_pcm_playback_stop(struct audio_dev *audio)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&audio->lock, flags);
+ audio->buffer_start = 0;
+ audio->buffer_end = 0;
+ audio->buffer_pos = 0;
+ spin_unlock_irqrestore(&audio->lock, flags);
+}
+
+static int audio_pcm_open(struct snd_pcm_substream *substream)
+{
+ struct snd_pcm_runtime *runtime = substream->runtime;
+ struct audio_dev *audio = substream->private_data;
+
+ runtime->private_data = audio;
+ runtime->hw = audio_hw_info;
+ snd_pcm_limit_hw_rates(runtime);
+ runtime->hw.channels_max = 2;
+
+ audio->substream = substream;
+ return 0;
+}
+
+static int audio_pcm_close(struct snd_pcm_substream *substream)
+{
+ struct audio_dev *audio = substream->private_data;
+ unsigned long flags;
+
+ spin_lock_irqsave(&audio->lock, flags);
+ audio->substream = NULL;
+ spin_unlock_irqrestore(&audio->lock, flags);
+
+ return 0;
+}
+
+static int audio_pcm_hw_params(struct snd_pcm_substream *substream,
+ struct snd_pcm_hw_params *params)
+{
+ unsigned int channels = params_channels(params);
+ unsigned int rate = params_rate(params);
+
+ if (rate != SAMPLE_RATE)
+ return -EINVAL;
+ if (channels != 2)
+ return -EINVAL;
+
+ return snd_pcm_lib_alloc_vmalloc_buffer(substream,
+ params_buffer_bytes(params));
+}
+
+static int audio_pcm_hw_free(struct snd_pcm_substream *substream)
+{
+ return snd_pcm_lib_free_vmalloc_buffer(substream);
+}
+
+static int audio_pcm_prepare(struct snd_pcm_substream *substream)
+{
+ struct snd_pcm_runtime *runtime = substream->runtime;
+ struct audio_dev *audio = runtime->private_data;
+
+ audio->period = snd_pcm_lib_period_bytes(substream);
+ audio->period_offset = 0;
+ audio->buffer_start = runtime->dma_area;
+ audio->buffer_end = audio->buffer_start
+ + snd_pcm_lib_buffer_bytes(substream);
+ audio->buffer_pos = audio->buffer_start;
+
+ return 0;
+}
+
+static snd_pcm_uframes_t audio_pcm_pointer(struct snd_pcm_substream *substream)
+{
+ struct snd_pcm_runtime *runtime = substream->runtime;
+ struct audio_dev *audio = runtime->private_data;
+ ssize_t bytes = audio->buffer_pos - audio->buffer_start;
+
+ /* return offset of next frame to fill in our buffer */
+ return bytes_to_frames(runtime, bytes);
+}
+
+static int audio_pcm_playback_trigger(struct snd_pcm_substream *substream,
+ int cmd)
+{
+ struct audio_dev *audio = substream->runtime->private_data;
+ int ret = 0;
+
+ switch (cmd) {
+ case SNDRV_PCM_TRIGGER_START:
+ case SNDRV_PCM_TRIGGER_RESUME:
+ audio_pcm_playback_start(audio);
+ break;
+
+ case SNDRV_PCM_TRIGGER_STOP:
+ case SNDRV_PCM_TRIGGER_SUSPEND:
+ audio_pcm_playback_stop(audio);
+ break;
+
+ default:
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+static struct audio_dev _audio_dev = {
+ .func = {
+ .name = "audio_source",
+ .bind = audio_bind,
+ .unbind = audio_unbind,
+ .set_alt = audio_set_alt,
+ .setup = audio_setup,
+ .disable = audio_disable,
+ .free_func = audio_free_func,
+ },
+ .lock = __SPIN_LOCK_UNLOCKED(_audio_dev.lock),
+ .idle_reqs = LIST_HEAD_INIT(_audio_dev.idle_reqs),
+};
+
+static struct snd_pcm_ops audio_playback_ops = {
+ .open = audio_pcm_open,
+ .close = audio_pcm_close,
+ .ioctl = snd_pcm_lib_ioctl,
+ .hw_params = audio_pcm_hw_params,
+ .hw_free = audio_pcm_hw_free,
+ .prepare = audio_pcm_prepare,
+ .trigger = audio_pcm_playback_trigger,
+ .pointer = audio_pcm_pointer,
+};
+
+int audio_source_bind_config(struct usb_configuration *c,
+ struct audio_source_config *config)
+{
+ struct audio_dev *audio;
+ int err;
+
+ config->card = -1;
+ config->device = -1;
+
+ audio = &_audio_dev;
+
+ err = snd_card_setup(c, config);
+ if (err)
+ return err;
+
+ err = usb_add_function(c, &audio->func);
+ if (err)
+ goto add_fail;
+
+ return 0;
+
+add_fail:
+ snd_card_free(audio->card);
+ return err;
+}
+
+static int snd_card_setup(struct usb_configuration *c,
+ struct audio_source_config *config)
+{
+ struct audio_dev *audio;
+ struct snd_card *card;
+ struct snd_pcm *pcm;
+ int err;
+
+ audio = &_audio_dev;
+
+ err = snd_card_new(&c->cdev->gadget->dev,
+ SNDRV_DEFAULT_IDX1, SNDRV_DEFAULT_STR1,
+ THIS_MODULE, 0, &card);
+ if (err)
+ return err;
+
+ err = snd_pcm_new(card, "USB audio source", 0, 1, 0, &pcm);
+ if (err)
+ goto pcm_fail;
+
+ pcm->private_data = audio;
+ pcm->info_flags = 0;
+ audio->pcm = pcm;
+
+ strlcpy(pcm->name, "USB gadget audio", sizeof(pcm->name));
+
+ snd_pcm_set_ops(pcm, SNDRV_PCM_STREAM_PLAYBACK, &audio_playback_ops);
+ snd_pcm_lib_preallocate_pages_for_all(pcm, SNDRV_DMA_TYPE_DEV,
+ NULL, 0, 64 * 1024);
+
+ strlcpy(card->driver, "audio_source", sizeof(card->driver));
+ strlcpy(card->shortname, card->driver, sizeof(card->shortname));
+ strlcpy(card->longname, "USB accessory audio source",
+ sizeof(card->longname));
+
+ err = snd_card_register(card);
+ if (err)
+ goto register_fail;
+
+ config->card = pcm->card->number;
+ config->device = pcm->device;
+ audio->card = card;
+ return 0;
+
+register_fail:
+pcm_fail:
+ snd_card_free(audio->card);
+ return err;
+}
+
+static struct audio_source_instance *to_audio_source_instance(
+ struct config_item *item)
+{
+ return container_of(to_config_group(item), struct audio_source_instance,
+ func_inst.group);
+}
+
+static struct audio_source_instance *to_fi_audio_source(
+ const struct usb_function_instance *fi)
+{
+ return container_of(fi, struct audio_source_instance, func_inst);
+}
+
+static void audio_source_attr_release(struct config_item *item)
+{
+ struct audio_source_instance *fi_audio = to_audio_source_instance(item);
+
+ usb_put_function_instance(&fi_audio->func_inst);
+}
+
+static int audio_source_set_inst_name(struct usb_function_instance *fi,
+ const char *name)
+{
+ struct audio_source_instance *fi_audio;
+ char *ptr;
+ int name_len;
+
+ name_len = strlen(name) + 1;
+ if (name_len > MAX_INST_NAME_LEN)
+ return -ENAMETOOLONG;
+
+ ptr = kstrndup(name, name_len, GFP_KERNEL);
+ if (!ptr)
+ return -ENOMEM;
+
+ fi_audio = to_fi_audio_source(fi);
+ fi_audio->name = ptr;
+
+ return 0;
+}
+
+static void audio_source_free_inst(struct usb_function_instance *fi)
+{
+ struct audio_source_instance *fi_audio;
+
+ fi_audio = to_fi_audio_source(fi);
+ device_destroy(fi_audio->audio_device->class,
+ fi_audio->audio_device->devt);
+ kfree(fi_audio->name);
+ kfree(fi_audio->config);
+}
+
+static ssize_t audio_source_pcm_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct audio_source_instance *fi_audio = dev_get_drvdata(dev);
+ struct audio_source_config *config = fi_audio->config;
+
+ /* print PCM card and device numbers */
+ return sprintf(buf, "%d %d\n", config->card, config->device);
+}
+
+struct device *create_function_device(char *name);
+
+static struct usb_function_instance *audio_source_alloc_inst(void)
+{
+ struct audio_source_instance *fi_audio;
+ struct device_attribute **attrs;
+ struct device_attribute *attr;
+ struct device *dev;
+ void *err_ptr;
+ int err = 0;
+
+ fi_audio = kzalloc(sizeof(*fi_audio), GFP_KERNEL);
+ if (!fi_audio)
+ return ERR_PTR(-ENOMEM);
+
+ fi_audio->func_inst.set_inst_name = audio_source_set_inst_name;
+ fi_audio->func_inst.free_func_inst = audio_source_free_inst;
+
+ fi_audio->config = kzalloc(sizeof(struct audio_source_config),
+ GFP_KERNEL);
+ if (!fi_audio->config) {
+ err_ptr = ERR_PTR(-ENOMEM);
+ goto fail_audio;
+ }
+
+ config_group_init_type_name(&fi_audio->func_inst.group, "",
+ &audio_source_func_type);
+ dev = create_function_device("f_audio_source");
+
+ if (IS_ERR(dev)) {
+ err_ptr = dev;
+ goto fail_audio_config;
+ }
+
+ fi_audio->config->card = -1;
+ fi_audio->config->device = -1;
+ fi_audio->audio_device = dev;
+
+ attrs = audio_source_function_attributes;
+ if (attrs) {
+ while ((attr = *attrs++) && !err)
+ err = device_create_file(dev, attr);
+ if (err) {
+ err_ptr = ERR_PTR(-EINVAL);
+ goto fail_device;
+ }
+ }
+
+ dev_set_drvdata(dev, fi_audio);
+ _audio_dev.config = fi_audio->config;
+
+ return &fi_audio->func_inst;
+
+fail_device:
+ device_destroy(dev->class, dev->devt);
+fail_audio_config:
+ kfree(fi_audio->config);
+fail_audio:
+ kfree(fi_audio);
+ return err_ptr;
+
+}
+
+static struct usb_function *audio_source_alloc(struct usb_function_instance *fi)
+{
+ return &_audio_dev.func;
+}
+
+DECLARE_USB_FUNCTION_INIT(audio_source, audio_source_alloc_inst,
+ audio_source_alloc);
+MODULE_LICENSE("GPL");
diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
index a5d34216e962..d7cc4f133677 100644
--- a/drivers/usb/gadget/function/f_fs.c
+++ b/drivers/usb/gadget/function/f_fs.c
@@ -86,12 +86,15 @@ ffs_setup_state_clear_cancelled(struct ffs_data *ffs)
cmpxchg(&ffs->setup_state, FFS_SETUP_CANCELLED, FFS_NO_SETUP);
}
+static void ffs_func_free(struct ffs_function *func);
static void ffs_func_eps_disable(struct ffs_function *func);
static int __must_check ffs_func_eps_enable(struct ffs_function *func);
static int ffs_func_bind(struct usb_configuration *,
struct usb_function *);
+static void old_ffs_func_unbind(struct usb_configuration *,
+ struct usb_function *);
static int ffs_func_set_alt(struct usb_function *, unsigned, unsigned);
static void ffs_func_disable(struct usb_function *);
static int ffs_func_setup(struct usb_function *,
@@ -675,7 +678,6 @@ static void ffs_user_copy_worker(struct work_struct *work)
usb_ep_free_request(io_data->ep, io_data->req);
- io_data->kiocb->private = NULL;
if (io_data->read)
kfree(io_data->iovec);
kfree(io_data->buf);
@@ -1616,6 +1618,71 @@ static void ffs_epfiles_destroy(struct ffs_epfile *epfiles, unsigned count)
kfree(epfiles);
}
+static int __maybe_unused functionfs_bind_config(struct usb_composite_dev *cdev,
+ struct usb_configuration *c,
+ struct ffs_data *ffs)
+{
+ struct ffs_function *func;
+ int ret;
+
+ ENTER();
+
+ func = kzalloc(sizeof *func, GFP_KERNEL);
+ if (unlikely(!func))
+ return -ENOMEM;
+
+ func->function.name = "Function FS Gadget";
+ func->function.strings = ffs->stringtabs;
+
+ func->function.bind = ffs_func_bind;
+ func->function.unbind = old_ffs_func_unbind;
+ func->function.set_alt = ffs_func_set_alt;
+ func->function.disable = ffs_func_disable;
+ func->function.setup = ffs_func_setup;
+ func->function.suspend = ffs_func_suspend;
+ func->function.resume = ffs_func_resume;
+
+ func->conf = c;
+ func->gadget = cdev->gadget;
+ func->ffs = ffs;
+ ffs_data_get(ffs);
+
+ ret = usb_add_function(c, &func->function);
+ if (unlikely(ret))
+ ffs_func_free(func);
+
+ return ret;
+}
+
+static void ffs_func_free(struct ffs_function *func)
+{
+ struct ffs_ep *ep = func->eps;
+ unsigned count = func->ffs->eps_count;
+ unsigned long flags;
+
+ ENTER();
+
+ /* cleanup after autoconfig */
+ spin_lock_irqsave(&func->ffs->eps_lock, flags);
+ do {
+ if (ep->ep && ep->req)
+ usb_ep_free_request(ep->ep, ep->req);
+ ep->req = NULL;
+ ++ep;
+ } while (--count);
+ spin_unlock_irqrestore(&func->ffs->eps_lock, flags);
+
+ ffs_data_put(func->ffs);
+
+ kfree(func->eps);
+ /*
+ * eps and interfaces_nums are allocated in the same chunk so
+ * only one free is required. Descriptors are also allocated
+ * in the same chunk.
+ */
+
+ kfree(func);
+}
static void ffs_func_eps_disable(struct ffs_function *func)
{
@@ -2899,6 +2966,24 @@ static int ffs_func_bind(struct usb_configuration *c,
/* Other USB function hooks *************************************************/
+static void old_ffs_func_unbind(struct usb_configuration *c,
+ struct usb_function *f)
+{
+ struct ffs_function *func = ffs_func_from_usb(f);
+ struct ffs_data *ffs = func->ffs;
+
+ ENTER();
+
+ if (ffs->func == func) {
+ ffs_func_eps_disable(func);
+ ffs->func = NULL;
+ }
+
+ ffs_event_add(ffs, FUNCTIONFS_UNBIND);
+
+ ffs_func_free(func);
+}
+
static int ffs_func_set_alt(struct usb_function *f,
unsigned interface, unsigned alt)
{
diff --git a/drivers/usb/gadget/function/f_hid.c b/drivers/usb/gadget/function/f_hid.c
index 59ab62c92b66..6f71a698b13f 100644
--- a/drivers/usb/gadget/function/f_hid.c
+++ b/drivers/usb/gadget/function/f_hid.c
@@ -373,8 +373,9 @@ static int hidg_setup(struct usb_function *f,
value = __le16_to_cpu(ctrl->wValue);
length = __le16_to_cpu(ctrl->wLength);
- VDBG(cdev, "hid_setup crtl_request : bRequestType:0x%x bRequest:0x%x "
- "Value:0x%x\n", ctrl->bRequestType, ctrl->bRequest, value);
+ VDBG(cdev,
+ "%s crtl_request : bRequestType:0x%x bRequest:0x%x Value:0x%x\n",
+ __func__, ctrl->bRequestType, ctrl->bRequest, value);
switch ((ctrl->bRequestType << 8) | ctrl->bRequest) {
case ((USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_INTERFACE) << 8
diff --git a/drivers/usb/gadget/function/f_midi.c b/drivers/usb/gadget/function/f_midi.c
index 4588338e03fc..49c38eb4b8f0 100644
--- a/drivers/usb/gadget/function/f_midi.c
+++ b/drivers/usb/gadget/function/f_midi.c
@@ -20,6 +20,7 @@
*/
#include <linux/kernel.h>
+#include <linux/module.h>
#include <linux/slab.h>
#include <linux/device.h>
@@ -33,6 +34,7 @@
#include <linux/usb/midi.h>
#include "u_f.h"
+#include "u_midi.h"
MODULE_AUTHOR("Ben Williamson");
MODULE_LICENSE("GPL v2");
@@ -99,7 +101,7 @@ DECLARE_USB_MIDI_OUT_JACK_DESCRIPTOR(1);
DECLARE_USB_MS_ENDPOINT_DESCRIPTOR(16);
/* B.3.1 Standard AC Interface Descriptor */
-static struct usb_interface_descriptor ac_interface_desc __initdata = {
+static struct usb_interface_descriptor ac_interface_desc = {
.bLength = USB_DT_INTERFACE_SIZE,
.bDescriptorType = USB_DT_INTERFACE,
/* .bInterfaceNumber = DYNAMIC */
@@ -110,7 +112,7 @@ static struct usb_interface_descriptor ac_interface_desc __initdata = {
};
/* B.3.2 Class-Specific AC Interface Descriptor */
-static struct uac1_ac_header_descriptor_1 ac_header_desc __initdata = {
+static struct uac1_ac_header_descriptor_1 ac_header_desc = {
.bLength = UAC_DT_AC_HEADER_SIZE(1),
.bDescriptorType = USB_DT_CS_INTERFACE,
.bDescriptorSubtype = USB_MS_HEADER,
@@ -121,7 +123,7 @@ static struct uac1_ac_header_descriptor_1 ac_header_desc __initdata = {
};
/* B.4.1 Standard MS Interface Descriptor */
-static struct usb_interface_descriptor ms_interface_desc __initdata = {
+static struct usb_interface_descriptor ms_interface_desc = {
.bLength = USB_DT_INTERFACE_SIZE,
.bDescriptorType = USB_DT_INTERFACE,
/* .bInterfaceNumber = DYNAMIC */
@@ -132,7 +134,7 @@ static struct usb_interface_descriptor ms_interface_desc __initdata = {
};
/* B.4.2 Class-Specific MS Interface Descriptor */
-static struct usb_ms_header_descriptor ms_header_desc __initdata = {
+static struct usb_ms_header_descriptor ms_header_desc = {
.bLength = USB_DT_MS_HEADER_SIZE,
.bDescriptorType = USB_DT_CS_INTERFACE,
.bDescriptorSubtype = USB_MS_HEADER,
@@ -327,6 +329,10 @@ static int f_midi_set_alt(struct usb_function *f, unsigned intf, unsigned alt)
unsigned i;
int err;
+ /* For Control Device interface we do nothing */
+ if (intf == 0)
+ return 0;
+
err = f_midi_start_ep(midi, f, midi->in_ep);
if (err)
return err;
@@ -389,29 +395,6 @@ static void f_midi_disable(struct usb_function *f)
usb_ep_disable(midi->out_ep);
}
-static void f_midi_unbind(struct usb_configuration *c, struct usb_function *f)
-{
- struct usb_composite_dev *cdev = f->config->cdev;
- struct f_midi *midi = func_to_midi(f);
- struct snd_card *card;
-
- DBG(cdev, "unbind\n");
-
- /* just to be sure */
- f_midi_disable(f);
-
- card = midi->card;
- midi->card = NULL;
- if (card)
- snd_card_free(card);
-
- kfree(midi->id);
- midi->id = NULL;
-
- usb_free_all_descriptors(f);
- kfree(midi);
-}
-
static int f_midi_snd_free(struct snd_device *device)
{
return 0;
@@ -543,7 +526,7 @@ static void f_midi_transmit(struct f_midi *midi, struct usb_request *req)
req = midi_alloc_ep_req(ep, midi->buflen);
if (!req) {
- ERROR(midi, "gmidi_transmit: alloc_ep_request failed\n");
+ ERROR(midi, "%s: alloc_ep_request failed\n", __func__);
return;
}
req->length = 0;
@@ -656,6 +639,14 @@ static struct snd_rawmidi_ops gmidi_out_ops = {
.trigger = f_midi_out_trigger
};
+static inline void f_midi_unregister_card(struct f_midi *midi)
+{
+ if (midi->card) {
+ snd_card_free(midi->card);
+ midi->card = NULL;
+ }
+}
+
/* register as a sound "card" */
static int f_midi_register_card(struct f_midi *midi)
{
@@ -717,17 +708,13 @@ static int f_midi_register_card(struct f_midi *midi)
return 0;
fail:
- if (midi->card) {
- snd_card_free(midi->card);
- midi->card = NULL;
- }
+ f_midi_unregister_card(midi);
return err;
}
/* MIDI function driver setup/binding */
-static int __init
-f_midi_bind(struct usb_configuration *c, struct usb_function *f)
+static int f_midi_bind(struct usb_configuration *c, struct usb_function *f)
{
struct usb_descriptor_header **midi_function;
struct usb_midi_in_jack_descriptor jack_in_ext_desc[MAX_PORTS];
@@ -736,15 +723,23 @@ f_midi_bind(struct usb_configuration *c, struct usb_function *f)
struct usb_midi_out_jack_descriptor_1 jack_out_emb_desc[MAX_PORTS];
struct usb_composite_dev *cdev = c->cdev;
struct f_midi *midi = func_to_midi(f);
+ struct usb_string *us;
int status, n, jack = 1, i = 0;
+ midi->gadget = cdev->gadget;
+ tasklet_init(&midi->tasklet, f_midi_in_tasklet, (unsigned long) midi);
+ status = f_midi_register_card(midi);
+ if (status < 0)
+ goto fail_register;
+
/* maybe allocate device-global string ID */
- if (midi_string_defs[0].id == 0) {
- status = usb_string_id(c->cdev);
- if (status < 0)
- goto fail;
- midi_string_defs[0].id = status;
+ us = usb_gstrings_attach(c->cdev, midi_strings,
+ ARRAY_SIZE(midi_string_defs));
+ if (IS_ERR(us)) {
+ status = PTR_ERR(us);
+ goto fail;
}
+ ac_interface_desc.iInterface = us[STRING_FUNC_IDX].id;
/* We have two interfaces, AudioControl and MIDIStreaming */
status = usb_interface_id(c, f);
@@ -894,6 +889,8 @@ fail_f_midi:
kfree(midi_function);
usb_free_descriptors(f->hs_descriptors);
fail:
+ f_midi_unregister_card(midi);
+fail_register:
/* we might as well release our claims on endpoints */
if (midi->out_ep)
midi->out_ep->driver_data = NULL;
@@ -905,42 +902,306 @@ fail:
return status;
}
-/**
- * f_midi_bind_config - add USB MIDI function to a configuration
- * @c: the configuration to supcard the USB audio function
- * @index: the soundcard index to use for the ALSA device creation
- * @id: the soundcard id to use for the ALSA device creation
- * @buflen: the buffer length to use
- * @qlen the number of read requests to pre-allocate
- * Context: single threaded during gadget setup
- *
- * Returns zero on success, else negative errno.
- */
-int __init f_midi_bind_config(struct usb_configuration *c,
- int index, char *id,
- unsigned int in_ports,
- unsigned int out_ports,
- unsigned int buflen,
- unsigned int qlen)
+static inline struct f_midi_opts *to_f_midi_opts(struct config_item *item)
+{
+ return container_of(to_config_group(item), struct f_midi_opts,
+ func_inst.group);
+}
+
+CONFIGFS_ATTR_STRUCT(f_midi_opts);
+CONFIGFS_ATTR_OPS(f_midi_opts);
+
+static void midi_attr_release(struct config_item *item)
+{
+ struct f_midi_opts *opts = to_f_midi_opts(item);
+
+ usb_put_function_instance(&opts->func_inst);
+}
+
+static struct configfs_item_operations midi_item_ops = {
+ .release = midi_attr_release,
+ .show_attribute = f_midi_opts_attr_show,
+ .store_attribute = f_midi_opts_attr_store,
+};
+
+#define F_MIDI_OPT(name, test_limit, limit) \
+static ssize_t f_midi_opts_##name##_show(struct f_midi_opts *opts, char *page) \
+{ \
+ int result; \
+ \
+ mutex_lock(&opts->lock); \
+ result = sprintf(page, "%d\n", opts->name); \
+ mutex_unlock(&opts->lock); \
+ \
+ return result; \
+} \
+ \
+static ssize_t f_midi_opts_##name##_store(struct f_midi_opts *opts, \
+ const char *page, size_t len) \
+{ \
+ int ret; \
+ u32 num; \
+ \
+ mutex_lock(&opts->lock); \
+ if (opts->refcnt) { \
+ ret = -EBUSY; \
+ goto end; \
+ } \
+ \
+ ret = kstrtou32(page, 0, &num); \
+ if (ret) \
+ goto end; \
+ \
+ if (test_limit && num > limit) { \
+ ret = -EINVAL; \
+ goto end; \
+ } \
+ opts->name = num; \
+ ret = len; \
+ \
+end: \
+ mutex_unlock(&opts->lock); \
+ return ret; \
+} \
+ \
+static struct f_midi_opts_attribute f_midi_opts_##name = \
+ __CONFIGFS_ATTR(name, S_IRUGO | S_IWUSR, f_midi_opts_##name##_show, \
+ f_midi_opts_##name##_store)
+
+F_MIDI_OPT(index, true, SNDRV_CARDS);
+F_MIDI_OPT(buflen, false, 0);
+F_MIDI_OPT(qlen, false, 0);
+F_MIDI_OPT(in_ports, true, MAX_PORTS);
+F_MIDI_OPT(out_ports, true, MAX_PORTS);
+
+static ssize_t f_midi_opts_id_show(struct f_midi_opts *opts, char *page)
+{
+ int result;
+
+ mutex_lock(&opts->lock);
+ if (opts->id) {
+ result = strlcpy(page, opts->id, PAGE_SIZE);
+ } else {
+ page[0] = 0;
+ result = 0;
+ }
+
+ mutex_unlock(&opts->lock);
+
+ return result;
+}
+
+static ssize_t f_midi_opts_id_store(struct f_midi_opts *opts,
+ const char *page, size_t len)
+{
+ int ret;
+ char *c;
+
+ mutex_lock(&opts->lock);
+ if (opts->refcnt) {
+ ret = -EBUSY;
+ goto end;
+ }
+
+ c = kstrndup(page, len, GFP_KERNEL);
+ if (!c) {
+ ret = -ENOMEM;
+ goto end;
+ }
+ if (opts->id_allocated)
+ kfree(opts->id);
+ opts->id = c;
+ opts->id_allocated = true;
+ ret = len;
+end:
+ mutex_unlock(&opts->lock);
+ return ret;
+}
+
+static struct f_midi_opts_attribute f_midi_opts_id =
+ __CONFIGFS_ATTR(id, S_IRUGO | S_IWUSR, f_midi_opts_id_show,
+ f_midi_opts_id_store);
+
+static struct configfs_attribute *midi_attrs[] = {
+ &f_midi_opts_index.attr,
+ &f_midi_opts_buflen.attr,
+ &f_midi_opts_qlen.attr,
+ &f_midi_opts_in_ports.attr,
+ &f_midi_opts_out_ports.attr,
+ &f_midi_opts_id.attr,
+ NULL,
+};
+
+static struct config_item_type midi_func_type = {
+ .ct_item_ops = &midi_item_ops,
+ .ct_attrs = midi_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+static void f_midi_free_inst(struct usb_function_instance *f)
+{
+ struct f_midi_opts *opts;
+
+ opts = container_of(f, struct f_midi_opts, func_inst);
+
+ if (opts->id_allocated)
+ kfree(opts->id);
+
+ kfree(opts);
+}
+
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+extern struct device *create_function_device(char *name);
+static ssize_t alsa_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
+ struct usb_function_instance *fi_midi = dev_get_drvdata(dev);
struct f_midi *midi;
+
+ if (!fi_midi->f)
+ dev_warn(dev, "f_midi: function not set\n");
+
+ if (fi_midi && fi_midi->f) {
+ midi = func_to_midi(fi_midi->f);
+ if (midi->rmidi && midi->rmidi->card)
+ return sprintf(buf, "%d %d\n",
+ midi->rmidi->card->number, midi->rmidi->device);
+ }
+
+ /* print PCM card and device numbers */
+ return sprintf(buf, "%d %d\n", -1, -1);
+}
+
+static DEVICE_ATTR(alsa, S_IRUGO, alsa_show, NULL);
+
+static struct device_attribute *alsa_function_attributes[] = {
+ &dev_attr_alsa,
+ NULL
+};
+
+static int create_alsa_device(struct usb_function_instance *fi)
+{
+ struct device *dev;
+ struct device_attribute **attrs;
+ struct device_attribute *attr;
+ int err = 0;
+
+ dev = create_function_device("f_midi");
+ if (IS_ERR(dev))
+ return PTR_ERR(dev);
+
+ attrs = alsa_function_attributes;
+ if (attrs) {
+ while ((attr = *attrs++) && !err)
+ err = device_create_file(dev, attr);
+ if (err) {
+ device_destroy(dev->class, dev->devt);
+ return -EINVAL;
+ }
+ }
+ dev_set_drvdata(dev, fi);
+ return 0;
+}
+#else
+static int create_alsa_device(struct usb_function_instance *fi)
+{
+ return 0;
+}
+#endif
+
+static struct usb_function_instance *f_midi_alloc_inst(void)
+{
+ struct f_midi_opts *opts;
+
+ opts = kzalloc(sizeof(*opts), GFP_KERNEL);
+ if (!opts)
+ return ERR_PTR(-ENOMEM);
+
+ mutex_init(&opts->lock);
+ opts->func_inst.free_func_inst = f_midi_free_inst;
+ opts->index = SNDRV_DEFAULT_IDX1;
+ opts->id = SNDRV_DEFAULT_STR1;
+ opts->buflen = 256;
+ opts->qlen = 32;
+ opts->in_ports = 1;
+ opts->out_ports = 1;
+
+ if (create_alsa_device(&opts->func_inst)) {
+ kfree(opts);
+ return ERR_PTR(-ENODEV);
+ }
+
+ config_group_init_type_name(&opts->func_inst.group, "",
+ &midi_func_type);
+
+ return &opts->func_inst;
+}
+
+static void f_midi_free(struct usb_function *f)
+{
+ struct f_midi *midi;
+ struct f_midi_opts *opts;
+ int i;
+
+ midi = func_to_midi(f);
+ opts = container_of(f->fi, struct f_midi_opts, func_inst);
+ kfree(midi->id);
+ mutex_lock(&opts->lock);
+ for (i = opts->in_ports - 1; i >= 0; --i)
+ kfree(midi->in_port[i]);
+ kfree(midi);
+ opts->func_inst.f = NULL;
+ --opts->refcnt;
+ mutex_unlock(&opts->lock);
+}
+
+static void f_midi_unbind(struct usb_configuration *c, struct usb_function *f)
+{
+ struct usb_composite_dev *cdev = f->config->cdev;
+ struct f_midi *midi = func_to_midi(f);
+ struct snd_card *card;
+
+ DBG(cdev, "unbind\n");
+
+ /* just to be sure */
+ f_midi_disable(f);
+
+ card = midi->card;
+ midi->card = NULL;
+ if (card)
+ snd_card_free(card);
+
+ usb_free_all_descriptors(f);
+}
+
+static struct usb_function *f_midi_alloc(struct usb_function_instance *fi)
+{
+ struct f_midi *midi;
+ struct f_midi_opts *opts;
int status, i;
+ opts = container_of(fi, struct f_midi_opts, func_inst);
+
+ mutex_lock(&opts->lock);
/* sanity check */
- if (in_ports > MAX_PORTS || out_ports > MAX_PORTS)
- return -EINVAL;
+ if (opts->in_ports > MAX_PORTS || opts->out_ports > MAX_PORTS) {
+ mutex_unlock(&opts->lock);
+ return ERR_PTR(-EINVAL);
+ }
/* allocate and initialize one new instance */
- midi = kzalloc(sizeof *midi, GFP_KERNEL);
+ midi = kzalloc(sizeof(*midi), GFP_KERNEL);
if (!midi) {
- status = -ENOMEM;
- goto fail;
+ mutex_unlock(&opts->lock);
+ return ERR_PTR(-ENOMEM);
}
- for (i = 0; i < in_ports; i++) {
+ for (i = 0; i < opts->in_ports; i++) {
struct gmidi_in_port *port = kzalloc(sizeof(*port), GFP_KERNEL);
+
if (!port) {
status = -ENOMEM;
+ mutex_unlock(&opts->lock);
goto setup_fail;
}
@@ -950,39 +1211,36 @@ int __init f_midi_bind_config(struct usb_configuration *c,
midi->in_port[i] = port;
}
- midi->gadget = c->cdev->gadget;
- tasklet_init(&midi->tasklet, f_midi_in_tasklet, (unsigned long) midi);
-
/* set up ALSA midi devices */
- midi->in_ports = in_ports;
- midi->out_ports = out_ports;
- status = f_midi_register_card(midi);
- if (status < 0)
- goto setup_fail;
-
- midi->func.name = "gmidi function";
- midi->func.strings = midi_strings;
- midi->func.bind = f_midi_bind;
- midi->func.unbind = f_midi_unbind;
- midi->func.set_alt = f_midi_set_alt;
- midi->func.disable = f_midi_disable;
-
- midi->id = kstrdup(id, GFP_KERNEL);
- midi->index = index;
- midi->buflen = buflen;
- midi->qlen = qlen;
-
- status = usb_add_function(c, &midi->func);
- if (status)
+ midi->id = kstrdup(opts->id, GFP_KERNEL);
+ if (opts->id && !midi->id) {
+ status = -ENOMEM;
+ mutex_unlock(&opts->lock);
goto setup_fail;
-
- return 0;
+ }
+ midi->in_ports = opts->in_ports;
+ midi->out_ports = opts->out_ports;
+ midi->index = opts->index;
+ midi->buflen = opts->buflen;
+ midi->qlen = opts->qlen;
+ ++opts->refcnt;
+ mutex_unlock(&opts->lock);
+
+ midi->func.name = "gmidi function";
+ midi->func.bind = f_midi_bind;
+ midi->func.unbind = f_midi_unbind;
+ midi->func.set_alt = f_midi_set_alt;
+ midi->func.disable = f_midi_disable;
+ midi->func.free_func = f_midi_free;
+
+ fi->f = &midi->func;
+ return &midi->func;
setup_fail:
for (--i; i >= 0; i--)
kfree(midi->in_port[i]);
kfree(midi);
-fail:
- return status;
+ return ERR_PTR(status);
}
+DECLARE_USB_FUNCTION_INIT(midi, f_midi_alloc_inst, f_midi_alloc);
diff --git a/drivers/usb/gadget/function/f_mtp.c b/drivers/usb/gadget/function/f_mtp.c
new file mode 100644
index 000000000000..2894073fbddd
--- /dev/null
+++ b/drivers/usb/gadget/function/f_mtp.c
@@ -0,0 +1,1583 @@
+/*
+ * Gadget Function Driver for MTP
+ *
+ * Copyright (C) 2010 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+/* #define DEBUG */
+/* #define VERBOSE_DEBUG */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/delay.h>
+#include <linux/wait.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+
+#include <linux/types.h>
+#include <linux/file.h>
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+
+#include <linux/usb.h>
+#include <linux/usb_usual.h>
+#include <linux/usb/ch9.h>
+#include <linux/usb/f_mtp.h>
+#include <linux/configfs.h>
+#include <linux/usb/composite.h>
+
+#include "configfs.h"
+
+#define MTP_BULK_BUFFER_SIZE 16384
+#define INTR_BUFFER_SIZE 28
+#define MAX_INST_NAME_LEN 40
+
+/* String IDs */
+#define INTERFACE_STRING_INDEX 0
+
+/* values for mtp_dev.state */
+#define STATE_OFFLINE 0 /* initial state, disconnected */
+#define STATE_READY 1 /* ready for userspace calls */
+#define STATE_BUSY 2 /* processing userspace calls */
+#define STATE_CANCELED 3 /* transaction canceled by host */
+#define STATE_ERROR 4 /* error from completion routine */
+
+/* number of tx and rx requests to allocate */
+#define TX_REQ_MAX 4
+#define RX_REQ_MAX 2
+#define INTR_REQ_MAX 5
+
+/* ID for Microsoft MTP OS String */
+#define MTP_OS_STRING_ID 0xEE
+
+/* MTP class reqeusts */
+#define MTP_REQ_CANCEL 0x64
+#define MTP_REQ_GET_EXT_EVENT_DATA 0x65
+#define MTP_REQ_RESET 0x66
+#define MTP_REQ_GET_DEVICE_STATUS 0x67
+
+/* constants for device status */
+#define MTP_RESPONSE_OK 0x2001
+#define MTP_RESPONSE_DEVICE_BUSY 0x2019
+#define DRIVER_NAME "mtp"
+
+static const char mtp_shortname[] = DRIVER_NAME "_usb";
+
+struct mtp_dev {
+ struct usb_function function;
+ struct usb_composite_dev *cdev;
+ spinlock_t lock;
+
+ struct usb_ep *ep_in;
+ struct usb_ep *ep_out;
+ struct usb_ep *ep_intr;
+
+ int state;
+
+ /* synchronize access to our device file */
+ atomic_t open_excl;
+ /* to enforce only one ioctl at a time */
+ atomic_t ioctl_excl;
+
+ struct list_head tx_idle;
+ struct list_head intr_idle;
+
+ wait_queue_head_t read_wq;
+ wait_queue_head_t write_wq;
+ wait_queue_head_t intr_wq;
+ struct usb_request *rx_req[RX_REQ_MAX];
+ int rx_done;
+
+ /* for processing MTP_SEND_FILE, MTP_RECEIVE_FILE and
+ * MTP_SEND_FILE_WITH_HEADER ioctls on a work queue
+ */
+ struct workqueue_struct *wq;
+ struct work_struct send_file_work;
+ struct work_struct receive_file_work;
+ struct file *xfer_file;
+ loff_t xfer_file_offset;
+ int64_t xfer_file_length;
+ unsigned xfer_send_header;
+ uint16_t xfer_command;
+ uint32_t xfer_transaction_id;
+ int xfer_result;
+};
+
+static struct usb_interface_descriptor mtp_interface_desc = {
+ .bLength = USB_DT_INTERFACE_SIZE,
+ .bDescriptorType = USB_DT_INTERFACE,
+ .bInterfaceNumber = 0,
+ .bNumEndpoints = 3,
+ .bInterfaceClass = USB_CLASS_VENDOR_SPEC,
+ .bInterfaceSubClass = USB_SUBCLASS_VENDOR_SPEC,
+ .bInterfaceProtocol = 0,
+};
+
+static struct usb_interface_descriptor ptp_interface_desc = {
+ .bLength = USB_DT_INTERFACE_SIZE,
+ .bDescriptorType = USB_DT_INTERFACE,
+ .bInterfaceNumber = 0,
+ .bNumEndpoints = 3,
+ .bInterfaceClass = USB_CLASS_STILL_IMAGE,
+ .bInterfaceSubClass = 1,
+ .bInterfaceProtocol = 1,
+};
+
+static struct usb_endpoint_descriptor mtp_ss_in_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_IN,
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+ .wMaxPacketSize = __constant_cpu_to_le16(1024),
+};
+
+static struct usb_ss_ep_comp_descriptor mtp_ss_in_comp_desc = {
+ .bLength = sizeof(mtp_ss_in_comp_desc),
+ .bDescriptorType = USB_DT_SS_ENDPOINT_COMP,
+ /* .bMaxBurst = DYNAMIC, */
+};
+
+static struct usb_endpoint_descriptor mtp_ss_out_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_OUT,
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+ .wMaxPacketSize = __constant_cpu_to_le16(1024),
+};
+
+static struct usb_ss_ep_comp_descriptor mtp_ss_out_comp_desc = {
+ .bLength = sizeof(mtp_ss_out_comp_desc),
+ .bDescriptorType = USB_DT_SS_ENDPOINT_COMP,
+ /* .bMaxBurst = DYNAMIC, */
+};
+
+static struct usb_endpoint_descriptor mtp_highspeed_in_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_IN,
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+ .wMaxPacketSize = __constant_cpu_to_le16(512),
+};
+
+static struct usb_endpoint_descriptor mtp_highspeed_out_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_OUT,
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+ .wMaxPacketSize = __constant_cpu_to_le16(512),
+};
+
+static struct usb_endpoint_descriptor mtp_fullspeed_in_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_IN,
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+};
+
+static struct usb_endpoint_descriptor mtp_fullspeed_out_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_OUT,
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+};
+
+static struct usb_endpoint_descriptor mtp_intr_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_IN,
+ .bmAttributes = USB_ENDPOINT_XFER_INT,
+ .wMaxPacketSize = __constant_cpu_to_le16(INTR_BUFFER_SIZE),
+ .bInterval = 6,
+};
+
+static struct usb_ss_ep_comp_descriptor mtp_intr_ss_comp_desc = {
+ .bLength = sizeof(mtp_intr_ss_comp_desc),
+ .bDescriptorType = USB_DT_SS_ENDPOINT_COMP,
+ .wBytesPerInterval = cpu_to_le16(INTR_BUFFER_SIZE),
+};
+
+static struct usb_descriptor_header *fs_mtp_descs[] = {
+ (struct usb_descriptor_header *) &mtp_interface_desc,
+ (struct usb_descriptor_header *) &mtp_fullspeed_in_desc,
+ (struct usb_descriptor_header *) &mtp_fullspeed_out_desc,
+ (struct usb_descriptor_header *) &mtp_intr_desc,
+ NULL,
+};
+
+static struct usb_descriptor_header *hs_mtp_descs[] = {
+ (struct usb_descriptor_header *) &mtp_interface_desc,
+ (struct usb_descriptor_header *) &mtp_highspeed_in_desc,
+ (struct usb_descriptor_header *) &mtp_highspeed_out_desc,
+ (struct usb_descriptor_header *) &mtp_intr_desc,
+ NULL,
+};
+
+static struct usb_descriptor_header *ss_mtp_descs[] = {
+ (struct usb_descriptor_header *) &mtp_interface_desc,
+ (struct usb_descriptor_header *) &mtp_ss_in_desc,
+ (struct usb_descriptor_header *) &mtp_ss_in_comp_desc,
+ (struct usb_descriptor_header *) &mtp_ss_out_desc,
+ (struct usb_descriptor_header *) &mtp_ss_out_comp_desc,
+ (struct usb_descriptor_header *) &mtp_intr_desc,
+ (struct usb_descriptor_header *) &mtp_intr_ss_comp_desc,
+ NULL,
+};
+
+static struct usb_descriptor_header *fs_ptp_descs[] = {
+ (struct usb_descriptor_header *) &ptp_interface_desc,
+ (struct usb_descriptor_header *) &mtp_fullspeed_in_desc,
+ (struct usb_descriptor_header *) &mtp_fullspeed_out_desc,
+ (struct usb_descriptor_header *) &mtp_intr_desc,
+ NULL,
+};
+
+static struct usb_descriptor_header *hs_ptp_descs[] = {
+ (struct usb_descriptor_header *) &ptp_interface_desc,
+ (struct usb_descriptor_header *) &mtp_highspeed_in_desc,
+ (struct usb_descriptor_header *) &mtp_highspeed_out_desc,
+ (struct usb_descriptor_header *) &mtp_intr_desc,
+ NULL,
+};
+
+static struct usb_descriptor_header *ss_ptp_descs[] = {
+ (struct usb_descriptor_header *) &ptp_interface_desc,
+ (struct usb_descriptor_header *) &mtp_ss_in_desc,
+ (struct usb_descriptor_header *) &mtp_ss_in_comp_desc,
+ (struct usb_descriptor_header *) &mtp_ss_out_desc,
+ (struct usb_descriptor_header *) &mtp_ss_out_comp_desc,
+ (struct usb_descriptor_header *) &mtp_intr_desc,
+ (struct usb_descriptor_header *) &mtp_intr_ss_comp_desc,
+ NULL,
+};
+
+static struct usb_string mtp_string_defs[] = {
+ /* Naming interface "MTP" so libmtp will recognize us */
+ [INTERFACE_STRING_INDEX].s = "MTP",
+ { }, /* end of list */
+};
+
+static struct usb_gadget_strings mtp_string_table = {
+ .language = 0x0409, /* en-US */
+ .strings = mtp_string_defs,
+};
+
+static struct usb_gadget_strings *mtp_strings[] = {
+ &mtp_string_table,
+ NULL,
+};
+
+/* Microsoft MTP OS String */
+static u8 mtp_os_string[] = {
+ 18, /* sizeof(mtp_os_string) */
+ USB_DT_STRING,
+ /* Signature field: "MSFT100" */
+ 'M', 0, 'S', 0, 'F', 0, 'T', 0, '1', 0, '0', 0, '0', 0,
+ /* vendor code */
+ 1,
+ /* padding */
+ 0
+};
+
+/* Microsoft Extended Configuration Descriptor Header Section */
+struct mtp_ext_config_desc_header {
+ __le32 dwLength;
+ __u16 bcdVersion;
+ __le16 wIndex;
+ __u8 bCount;
+ __u8 reserved[7];
+};
+
+/* Microsoft Extended Configuration Descriptor Function Section */
+struct mtp_ext_config_desc_function {
+ __u8 bFirstInterfaceNumber;
+ __u8 bInterfaceCount;
+ __u8 compatibleID[8];
+ __u8 subCompatibleID[8];
+ __u8 reserved[6];
+};
+
+/* MTP Extended Configuration Descriptor */
+struct mtp_ext_config_desc {
+ struct mtp_ext_config_desc_header header;
+ struct mtp_ext_config_desc_function function;
+};
+
+static struct mtp_ext_config_desc mtp_ext_config_desc = {
+ .header = {
+ .dwLength = __constant_cpu_to_le32(sizeof(mtp_ext_config_desc)),
+ .bcdVersion = __constant_cpu_to_le16(0x0100),
+ .wIndex = __constant_cpu_to_le16(4),
+ .bCount = 1,
+ },
+ .function = {
+ .bFirstInterfaceNumber = 0,
+ .bInterfaceCount = 1,
+ .compatibleID = { 'M', 'T', 'P' },
+ },
+};
+
+struct mtp_device_status {
+ __le16 wLength;
+ __le16 wCode;
+};
+
+struct mtp_data_header {
+ /* length of packet, including this header */
+ __le32 length;
+ /* container type (2 for data packet) */
+ __le16 type;
+ /* MTP command code */
+ __le16 command;
+ /* MTP transaction ID */
+ __le32 transaction_id;
+};
+
+struct mtp_instance {
+ struct usb_function_instance func_inst;
+ const char *name;
+ struct mtp_dev *dev;
+ char mtp_ext_compat_id[16];
+ struct usb_os_desc mtp_os_desc;
+};
+
+/* temporary variable used between mtp_open() and mtp_gadget_bind() */
+static struct mtp_dev *_mtp_dev;
+
+static inline struct mtp_dev *func_to_mtp(struct usb_function *f)
+{
+ return container_of(f, struct mtp_dev, function);
+}
+
+static struct usb_request *mtp_request_new(struct usb_ep *ep, int buffer_size)
+{
+ struct usb_request *req = usb_ep_alloc_request(ep, GFP_KERNEL);
+ if (!req)
+ return NULL;
+
+ /* now allocate buffers for the requests */
+ req->buf = kmalloc(buffer_size, GFP_KERNEL);
+ if (!req->buf) {
+ usb_ep_free_request(ep, req);
+ return NULL;
+ }
+
+ return req;
+}
+
+static void mtp_request_free(struct usb_request *req, struct usb_ep *ep)
+{
+ if (req) {
+ kfree(req->buf);
+ usb_ep_free_request(ep, req);
+ }
+}
+
+static inline int mtp_lock(atomic_t *excl)
+{
+ if (atomic_inc_return(excl) == 1) {
+ return 0;
+ } else {
+ atomic_dec(excl);
+ return -1;
+ }
+}
+
+static inline void mtp_unlock(atomic_t *excl)
+{
+ atomic_dec(excl);
+}
+
+/* add a request to the tail of a list */
+static void mtp_req_put(struct mtp_dev *dev, struct list_head *head,
+ struct usb_request *req)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->lock, flags);
+ list_add_tail(&req->list, head);
+ spin_unlock_irqrestore(&dev->lock, flags);
+}
+
+/* remove a request from the head of a list */
+static struct usb_request
+*mtp_req_get(struct mtp_dev *dev, struct list_head *head)
+{
+ unsigned long flags;
+ struct usb_request *req;
+
+ spin_lock_irqsave(&dev->lock, flags);
+ if (list_empty(head)) {
+ req = 0;
+ } else {
+ req = list_first_entry(head, struct usb_request, list);
+ list_del(&req->list);
+ }
+ spin_unlock_irqrestore(&dev->lock, flags);
+ return req;
+}
+
+static void mtp_complete_in(struct usb_ep *ep, struct usb_request *req)
+{
+ struct mtp_dev *dev = _mtp_dev;
+
+ if (req->status != 0)
+ dev->state = STATE_ERROR;
+
+ mtp_req_put(dev, &dev->tx_idle, req);
+
+ wake_up(&dev->write_wq);
+}
+
+static void mtp_complete_out(struct usb_ep *ep, struct usb_request *req)
+{
+ struct mtp_dev *dev = _mtp_dev;
+
+ dev->rx_done = 1;
+ if (req->status != 0)
+ dev->state = STATE_ERROR;
+
+ wake_up(&dev->read_wq);
+}
+
+static void mtp_complete_intr(struct usb_ep *ep, struct usb_request *req)
+{
+ struct mtp_dev *dev = _mtp_dev;
+
+ if (req->status != 0)
+ dev->state = STATE_ERROR;
+
+ mtp_req_put(dev, &dev->intr_idle, req);
+
+ wake_up(&dev->intr_wq);
+}
+
+static int mtp_create_bulk_endpoints(struct mtp_dev *dev,
+ struct usb_endpoint_descriptor *in_desc,
+ struct usb_endpoint_descriptor *out_desc,
+ struct usb_endpoint_descriptor *intr_desc)
+{
+ struct usb_composite_dev *cdev = dev->cdev;
+ struct usb_request *req;
+ struct usb_ep *ep;
+ int i;
+
+ DBG(cdev, "create_bulk_endpoints dev: %p\n", dev);
+
+ ep = usb_ep_autoconfig(cdev->gadget, in_desc);
+ if (!ep) {
+ DBG(cdev, "usb_ep_autoconfig for ep_in failed\n");
+ return -ENODEV;
+ }
+ DBG(cdev, "usb_ep_autoconfig for ep_in got %s\n", ep->name);
+ ep->driver_data = dev; /* claim the endpoint */
+ dev->ep_in = ep;
+
+ ep = usb_ep_autoconfig(cdev->gadget, out_desc);
+ if (!ep) {
+ DBG(cdev, "usb_ep_autoconfig for ep_out failed\n");
+ return -ENODEV;
+ }
+ DBG(cdev, "usb_ep_autoconfig for mtp ep_out got %s\n", ep->name);
+ ep->driver_data = dev; /* claim the endpoint */
+ dev->ep_out = ep;
+
+ ep = usb_ep_autoconfig(cdev->gadget, intr_desc);
+ if (!ep) {
+ DBG(cdev, "usb_ep_autoconfig for ep_intr failed\n");
+ return -ENODEV;
+ }
+ DBG(cdev, "usb_ep_autoconfig for mtp ep_intr got %s\n", ep->name);
+ ep->driver_data = dev; /* claim the endpoint */
+ dev->ep_intr = ep;
+
+ /* now allocate requests for our endpoints */
+ for (i = 0; i < TX_REQ_MAX; i++) {
+ req = mtp_request_new(dev->ep_in, MTP_BULK_BUFFER_SIZE);
+ if (!req)
+ goto fail;
+ req->complete = mtp_complete_in;
+ mtp_req_put(dev, &dev->tx_idle, req);
+ }
+ for (i = 0; i < RX_REQ_MAX; i++) {
+ req = mtp_request_new(dev->ep_out, MTP_BULK_BUFFER_SIZE);
+ if (!req)
+ goto fail;
+ req->complete = mtp_complete_out;
+ dev->rx_req[i] = req;
+ }
+ for (i = 0; i < INTR_REQ_MAX; i++) {
+ req = mtp_request_new(dev->ep_intr, INTR_BUFFER_SIZE);
+ if (!req)
+ goto fail;
+ req->complete = mtp_complete_intr;
+ mtp_req_put(dev, &dev->intr_idle, req);
+ }
+
+ return 0;
+
+fail:
+ pr_err("mtp_bind() could not allocate requests\n");
+ return -1;
+}
+
+static ssize_t mtp_read(struct file *fp, char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct mtp_dev *dev = fp->private_data;
+ struct usb_composite_dev *cdev = dev->cdev;
+ struct usb_request *req;
+ ssize_t r = count;
+ unsigned xfer;
+ int ret = 0;
+
+ DBG(cdev, "mtp_read(%zu)\n", count);
+
+ if (count > MTP_BULK_BUFFER_SIZE)
+ return -EINVAL;
+
+ /* we will block until we're online */
+ DBG(cdev, "mtp_read: waiting for online state\n");
+ ret = wait_event_interruptible(dev->read_wq,
+ dev->state != STATE_OFFLINE);
+ if (ret < 0) {
+ r = ret;
+ goto done;
+ }
+ spin_lock_irq(&dev->lock);
+ if (dev->state == STATE_CANCELED) {
+ /* report cancelation to userspace */
+ dev->state = STATE_READY;
+ spin_unlock_irq(&dev->lock);
+ return -ECANCELED;
+ }
+ dev->state = STATE_BUSY;
+ spin_unlock_irq(&dev->lock);
+
+requeue_req:
+ /* queue a request */
+ req = dev->rx_req[0];
+ req->length = count;
+ dev->rx_done = 0;
+ ret = usb_ep_queue(dev->ep_out, req, GFP_KERNEL);
+ if (ret < 0) {
+ r = -EIO;
+ goto done;
+ } else {
+ DBG(cdev, "rx %p queue\n", req);
+ }
+
+ /* wait for a request to complete */
+ ret = wait_event_interruptible(dev->read_wq, dev->rx_done);
+ if (ret < 0) {
+ r = ret;
+ usb_ep_dequeue(dev->ep_out, req);
+ goto done;
+ }
+ if (dev->state == STATE_BUSY) {
+ /* If we got a 0-len packet, throw it back and try again. */
+ if (req->actual == 0)
+ goto requeue_req;
+
+ DBG(cdev, "rx %p %d\n", req, req->actual);
+ xfer = (req->actual < count) ? req->actual : count;
+ r = xfer;
+ if (copy_to_user(buf, req->buf, xfer))
+ r = -EFAULT;
+ } else
+ r = -EIO;
+
+done:
+ spin_lock_irq(&dev->lock);
+ if (dev->state == STATE_CANCELED)
+ r = -ECANCELED;
+ else if (dev->state != STATE_OFFLINE)
+ dev->state = STATE_READY;
+ spin_unlock_irq(&dev->lock);
+
+ DBG(cdev, "mtp_read returning %zd\n", r);
+ return r;
+}
+
+static ssize_t mtp_write(struct file *fp, const char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct mtp_dev *dev = fp->private_data;
+ struct usb_composite_dev *cdev = dev->cdev;
+ struct usb_request *req = 0;
+ ssize_t r = count;
+ unsigned xfer;
+ int sendZLP = 0;
+ int ret;
+
+ DBG(cdev, "mtp_write(%zu)\n", count);
+
+ spin_lock_irq(&dev->lock);
+ if (dev->state == STATE_CANCELED) {
+ /* report cancelation to userspace */
+ dev->state = STATE_READY;
+ spin_unlock_irq(&dev->lock);
+ return -ECANCELED;
+ }
+ if (dev->state == STATE_OFFLINE) {
+ spin_unlock_irq(&dev->lock);
+ return -ENODEV;
+ }
+ dev->state = STATE_BUSY;
+ spin_unlock_irq(&dev->lock);
+
+ /* we need to send a zero length packet to signal the end of transfer
+ * if the transfer size is aligned to a packet boundary.
+ */
+ if ((count & (dev->ep_in->maxpacket - 1)) == 0)
+ sendZLP = 1;
+
+ while (count > 0 || sendZLP) {
+ /* so we exit after sending ZLP */
+ if (count == 0)
+ sendZLP = 0;
+
+ if (dev->state != STATE_BUSY) {
+ DBG(cdev, "mtp_write dev->error\n");
+ r = -EIO;
+ break;
+ }
+
+ /* get an idle tx request to use */
+ req = 0;
+ ret = wait_event_interruptible(dev->write_wq,
+ ((req = mtp_req_get(dev, &dev->tx_idle))
+ || dev->state != STATE_BUSY));
+ if (!req) {
+ r = ret;
+ break;
+ }
+
+ if (count > MTP_BULK_BUFFER_SIZE)
+ xfer = MTP_BULK_BUFFER_SIZE;
+ else
+ xfer = count;
+ if (xfer && copy_from_user(req->buf, buf, xfer)) {
+ r = -EFAULT;
+ break;
+ }
+
+ req->length = xfer;
+ ret = usb_ep_queue(dev->ep_in, req, GFP_KERNEL);
+ if (ret < 0) {
+ DBG(cdev, "mtp_write: xfer error %d\n", ret);
+ r = -EIO;
+ break;
+ }
+
+ buf += xfer;
+ count -= xfer;
+
+ /* zero this so we don't try to free it on error exit */
+ req = 0;
+ }
+
+ if (req)
+ mtp_req_put(dev, &dev->tx_idle, req);
+
+ spin_lock_irq(&dev->lock);
+ if (dev->state == STATE_CANCELED)
+ r = -ECANCELED;
+ else if (dev->state != STATE_OFFLINE)
+ dev->state = STATE_READY;
+ spin_unlock_irq(&dev->lock);
+
+ DBG(cdev, "mtp_write returning %zd\n", r);
+ return r;
+}
+
+/* read from a local file and write to USB */
+static void send_file_work(struct work_struct *data)
+{
+ struct mtp_dev *dev = container_of(data, struct mtp_dev,
+ send_file_work);
+ struct usb_composite_dev *cdev = dev->cdev;
+ struct usb_request *req = 0;
+ struct mtp_data_header *header;
+ struct file *filp;
+ loff_t offset;
+ int64_t count;
+ int xfer, ret, hdr_size;
+ int r = 0;
+ int sendZLP = 0;
+
+ /* read our parameters */
+ smp_rmb();
+ filp = dev->xfer_file;
+ offset = dev->xfer_file_offset;
+ count = dev->xfer_file_length;
+
+ DBG(cdev, "send_file_work(%lld %lld)\n", offset, count);
+
+ if (dev->xfer_send_header) {
+ hdr_size = sizeof(struct mtp_data_header);
+ count += hdr_size;
+ } else {
+ hdr_size = 0;
+ }
+
+ /* we need to send a zero length packet to signal the end of transfer
+ * if the transfer size is aligned to a packet boundary.
+ */
+ if ((count & (dev->ep_in->maxpacket - 1)) == 0)
+ sendZLP = 1;
+
+ while (count > 0 || sendZLP) {
+ /* so we exit after sending ZLP */
+ if (count == 0)
+ sendZLP = 0;
+
+ /* get an idle tx request to use */
+ req = 0;
+ ret = wait_event_interruptible(dev->write_wq,
+ (req = mtp_req_get(dev, &dev->tx_idle))
+ || dev->state != STATE_BUSY);
+ if (dev->state == STATE_CANCELED) {
+ r = -ECANCELED;
+ break;
+ }
+ if (!req) {
+ r = ret;
+ break;
+ }
+
+ if (count > MTP_BULK_BUFFER_SIZE)
+ xfer = MTP_BULK_BUFFER_SIZE;
+ else
+ xfer = count;
+
+ if (hdr_size) {
+ /* prepend MTP data header */
+ header = (struct mtp_data_header *)req->buf;
+ header->length = __cpu_to_le32(count);
+ header->type = __cpu_to_le16(2); /* data packet */
+ header->command = __cpu_to_le16(dev->xfer_command);
+ header->transaction_id =
+ __cpu_to_le32(dev->xfer_transaction_id);
+ }
+
+ ret = vfs_read(filp, req->buf + hdr_size, xfer - hdr_size,
+ &offset);
+ if (ret < 0) {
+ r = ret;
+ break;
+ }
+ xfer = ret + hdr_size;
+ hdr_size = 0;
+
+ req->length = xfer;
+ ret = usb_ep_queue(dev->ep_in, req, GFP_KERNEL);
+ if (ret < 0) {
+ DBG(cdev, "send_file_work: xfer error %d\n", ret);
+ dev->state = STATE_ERROR;
+ r = -EIO;
+ break;
+ }
+
+ count -= xfer;
+
+ /* zero this so we don't try to free it on error exit */
+ req = 0;
+ }
+
+ if (req)
+ mtp_req_put(dev, &dev->tx_idle, req);
+
+ DBG(cdev, "send_file_work returning %d\n", r);
+ /* write the result */
+ dev->xfer_result = r;
+ smp_wmb();
+}
+
+/* read from USB and write to a local file */
+static void receive_file_work(struct work_struct *data)
+{
+ struct mtp_dev *dev = container_of(data, struct mtp_dev,
+ receive_file_work);
+ struct usb_composite_dev *cdev = dev->cdev;
+ struct usb_request *read_req = NULL, *write_req = NULL;
+ struct file *filp;
+ loff_t offset;
+ int64_t count;
+ int ret, cur_buf = 0;
+ int r = 0;
+
+ /* read our parameters */
+ smp_rmb();
+ filp = dev->xfer_file;
+ offset = dev->xfer_file_offset;
+ count = dev->xfer_file_length;
+
+ DBG(cdev, "receive_file_work(%lld)\n", count);
+
+ while (count > 0 || write_req) {
+ if (count > 0) {
+ /* queue a request */
+ read_req = dev->rx_req[cur_buf];
+ cur_buf = (cur_buf + 1) % RX_REQ_MAX;
+
+ read_req->length = (count > MTP_BULK_BUFFER_SIZE
+ ? MTP_BULK_BUFFER_SIZE : count);
+ dev->rx_done = 0;
+ ret = usb_ep_queue(dev->ep_out, read_req, GFP_KERNEL);
+ if (ret < 0) {
+ r = -EIO;
+ dev->state = STATE_ERROR;
+ break;
+ }
+ }
+
+ if (write_req) {
+ DBG(cdev, "rx %p %d\n", write_req, write_req->actual);
+ ret = vfs_write(filp, write_req->buf, write_req->actual,
+ &offset);
+ DBG(cdev, "vfs_write %d\n", ret);
+ if (ret != write_req->actual) {
+ r = -EIO;
+ dev->state = STATE_ERROR;
+ break;
+ }
+ write_req = NULL;
+ }
+
+ if (read_req) {
+ /* wait for our last read to complete */
+ ret = wait_event_interruptible(dev->read_wq,
+ dev->rx_done || dev->state != STATE_BUSY);
+ if (dev->state == STATE_CANCELED) {
+ r = -ECANCELED;
+ if (!dev->rx_done)
+ usb_ep_dequeue(dev->ep_out, read_req);
+ break;
+ }
+ /* if xfer_file_length is 0xFFFFFFFF, then we read until
+ * we get a zero length packet
+ */
+ if (count != 0xFFFFFFFF)
+ count -= read_req->actual;
+ if (read_req->actual < read_req->length) {
+ /*
+ * short packet is used to signal EOF for
+ * sizes > 4 gig
+ */
+ DBG(cdev, "got short packet\n");
+ count = 0;
+ }
+
+ write_req = read_req;
+ read_req = NULL;
+ }
+ }
+
+ DBG(cdev, "receive_file_work returning %d\n", r);
+ /* write the result */
+ dev->xfer_result = r;
+ smp_wmb();
+}
+
+static int mtp_send_event(struct mtp_dev *dev, struct mtp_event *event)
+{
+ struct usb_request *req = NULL;
+ int ret;
+ int length = event->length;
+
+ DBG(dev->cdev, "mtp_send_event(%zu)\n", event->length);
+
+ if (length < 0 || length > INTR_BUFFER_SIZE)
+ return -EINVAL;
+ if (dev->state == STATE_OFFLINE)
+ return -ENODEV;
+
+ ret = wait_event_interruptible_timeout(dev->intr_wq,
+ (req = mtp_req_get(dev, &dev->intr_idle)),
+ msecs_to_jiffies(1000));
+ if (!req)
+ return -ETIME;
+
+ if (copy_from_user(req->buf, (void __user *)event->data, length)) {
+ mtp_req_put(dev, &dev->intr_idle, req);
+ return -EFAULT;
+ }
+ req->length = length;
+ ret = usb_ep_queue(dev->ep_intr, req, GFP_KERNEL);
+ if (ret)
+ mtp_req_put(dev, &dev->intr_idle, req);
+
+ return ret;
+}
+
+static long mtp_ioctl(struct file *fp, unsigned code, unsigned long value)
+{
+ struct mtp_dev *dev = fp->private_data;
+ struct file *filp = NULL;
+ int ret = -EINVAL;
+
+ if (mtp_lock(&dev->ioctl_excl))
+ return -EBUSY;
+
+ switch (code) {
+ case MTP_SEND_FILE:
+ case MTP_RECEIVE_FILE:
+ case MTP_SEND_FILE_WITH_HEADER:
+ {
+ struct mtp_file_range mfr;
+ struct work_struct *work;
+
+ spin_lock_irq(&dev->lock);
+ if (dev->state == STATE_CANCELED) {
+ /* report cancelation to userspace */
+ dev->state = STATE_READY;
+ spin_unlock_irq(&dev->lock);
+ ret = -ECANCELED;
+ goto out;
+ }
+ if (dev->state == STATE_OFFLINE) {
+ spin_unlock_irq(&dev->lock);
+ ret = -ENODEV;
+ goto out;
+ }
+ dev->state = STATE_BUSY;
+ spin_unlock_irq(&dev->lock);
+
+ if (copy_from_user(&mfr, (void __user *)value, sizeof(mfr))) {
+ ret = -EFAULT;
+ goto fail;
+ }
+ /* hold a reference to the file while we are working with it */
+ filp = fget(mfr.fd);
+ if (!filp) {
+ ret = -EBADF;
+ goto fail;
+ }
+
+ /* write the parameters */
+ dev->xfer_file = filp;
+ dev->xfer_file_offset = mfr.offset;
+ dev->xfer_file_length = mfr.length;
+ smp_wmb();
+
+ if (code == MTP_SEND_FILE_WITH_HEADER) {
+ work = &dev->send_file_work;
+ dev->xfer_send_header = 1;
+ dev->xfer_command = mfr.command;
+ dev->xfer_transaction_id = mfr.transaction_id;
+ } else if (code == MTP_SEND_FILE) {
+ work = &dev->send_file_work;
+ dev->xfer_send_header = 0;
+ } else {
+ work = &dev->receive_file_work;
+ }
+
+ /* We do the file transfer on a work queue so it will run
+ * in kernel context, which is necessary for vfs_read and
+ * vfs_write to use our buffers in the kernel address space.
+ */
+ queue_work(dev->wq, work);
+ /* wait for operation to complete */
+ flush_workqueue(dev->wq);
+ fput(filp);
+
+ /* read the result */
+ smp_rmb();
+ ret = dev->xfer_result;
+ break;
+ }
+ case MTP_SEND_EVENT:
+ {
+ struct mtp_event event;
+ /* return here so we don't change dev->state below,
+ * which would interfere with bulk transfer state.
+ */
+ if (copy_from_user(&event, (void __user *)value, sizeof(event)))
+ ret = -EFAULT;
+ else
+ ret = mtp_send_event(dev, &event);
+ goto out;
+ }
+ }
+
+fail:
+ spin_lock_irq(&dev->lock);
+ if (dev->state == STATE_CANCELED)
+ ret = -ECANCELED;
+ else if (dev->state != STATE_OFFLINE)
+ dev->state = STATE_READY;
+ spin_unlock_irq(&dev->lock);
+out:
+ mtp_unlock(&dev->ioctl_excl);
+ DBG(dev->cdev, "ioctl returning %d\n", ret);
+ return ret;
+}
+
+static int mtp_open(struct inode *ip, struct file *fp)
+{
+ printk(KERN_INFO "mtp_open\n");
+ if (mtp_lock(&_mtp_dev->open_excl))
+ return -EBUSY;
+
+ /* clear any error condition */
+ if (_mtp_dev->state != STATE_OFFLINE)
+ _mtp_dev->state = STATE_READY;
+
+ fp->private_data = _mtp_dev;
+ return 0;
+}
+
+static int mtp_release(struct inode *ip, struct file *fp)
+{
+ printk(KERN_INFO "mtp_release\n");
+
+ mtp_unlock(&_mtp_dev->open_excl);
+ return 0;
+}
+
+/* file operations for /dev/mtp_usb */
+static const struct file_operations mtp_fops = {
+ .owner = THIS_MODULE,
+ .read = mtp_read,
+ .write = mtp_write,
+ .unlocked_ioctl = mtp_ioctl,
+ .open = mtp_open,
+ .release = mtp_release,
+};
+
+static struct miscdevice mtp_device = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = mtp_shortname,
+ .fops = &mtp_fops,
+};
+
+static int mtp_ctrlrequest(struct usb_composite_dev *cdev,
+ const struct usb_ctrlrequest *ctrl)
+{
+ struct mtp_dev *dev = _mtp_dev;
+ int value = -EOPNOTSUPP;
+ u16 w_index = le16_to_cpu(ctrl->wIndex);
+ u16 w_value = le16_to_cpu(ctrl->wValue);
+ u16 w_length = le16_to_cpu(ctrl->wLength);
+ unsigned long flags;
+
+ VDBG(cdev, "mtp_ctrlrequest "
+ "%02x.%02x v%04x i%04x l%u\n",
+ ctrl->bRequestType, ctrl->bRequest,
+ w_value, w_index, w_length);
+
+ /* Handle MTP OS string */
+ if (ctrl->bRequestType ==
+ (USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE)
+ && ctrl->bRequest == USB_REQ_GET_DESCRIPTOR
+ && (w_value >> 8) == USB_DT_STRING
+ && (w_value & 0xFF) == MTP_OS_STRING_ID) {
+ value = (w_length < sizeof(mtp_os_string)
+ ? w_length : sizeof(mtp_os_string));
+ memcpy(cdev->req->buf, mtp_os_string, value);
+ } else if ((ctrl->bRequestType & USB_TYPE_MASK) == USB_TYPE_VENDOR) {
+ /* Handle MTP OS descriptor */
+ DBG(cdev, "vendor request: %d index: %d value: %d length: %d\n",
+ ctrl->bRequest, w_index, w_value, w_length);
+
+ if (ctrl->bRequest == 1
+ && (ctrl->bRequestType & USB_DIR_IN)
+ && (w_index == 4 || w_index == 5)) {
+ value = (w_length < sizeof(mtp_ext_config_desc) ?
+ w_length : sizeof(mtp_ext_config_desc));
+ memcpy(cdev->req->buf, &mtp_ext_config_desc, value);
+
+ /* update compatibleID if PTP */
+ if (dev->function.fs_descriptors == fs_ptp_descs) {
+ struct mtp_ext_config_desc *d = cdev->req->buf;
+
+ d->function.compatibleID[0] = 'P';
+ }
+ }
+ } else if ((ctrl->bRequestType & USB_TYPE_MASK) == USB_TYPE_CLASS) {
+ DBG(cdev, "class request: %d index: %d value: %d length: %d\n",
+ ctrl->bRequest, w_index, w_value, w_length);
+
+ if (ctrl->bRequest == MTP_REQ_CANCEL && w_index == 0
+ && w_value == 0) {
+ DBG(cdev, "MTP_REQ_CANCEL\n");
+
+ spin_lock_irqsave(&dev->lock, flags);
+ if (dev->state == STATE_BUSY) {
+ dev->state = STATE_CANCELED;
+ wake_up(&dev->read_wq);
+ wake_up(&dev->write_wq);
+ }
+ spin_unlock_irqrestore(&dev->lock, flags);
+
+ /* We need to queue a request to read the remaining
+ * bytes, but we don't actually need to look at
+ * the contents.
+ */
+ value = w_length;
+ } else if (ctrl->bRequest == MTP_REQ_GET_DEVICE_STATUS
+ && w_index == 0 && w_value == 0) {
+ struct mtp_device_status *status = cdev->req->buf;
+ status->wLength =
+ __constant_cpu_to_le16(sizeof(*status));
+
+ DBG(cdev, "MTP_REQ_GET_DEVICE_STATUS\n");
+ spin_lock_irqsave(&dev->lock, flags);
+ /* device status is "busy" until we report
+ * the cancelation to userspace
+ */
+ if (dev->state == STATE_CANCELED)
+ status->wCode =
+ __cpu_to_le16(MTP_RESPONSE_DEVICE_BUSY);
+ else
+ status->wCode =
+ __cpu_to_le16(MTP_RESPONSE_OK);
+ spin_unlock_irqrestore(&dev->lock, flags);
+ value = sizeof(*status);
+ }
+ }
+
+ /* respond with data transfer or status phase? */
+ if (value >= 0) {
+ int rc;
+ cdev->req->zero = value < w_length;
+ cdev->req->length = value;
+ rc = usb_ep_queue(cdev->gadget->ep0, cdev->req, GFP_ATOMIC);
+ if (rc < 0)
+ ERROR(cdev, "%s: response queue error\n", __func__);
+ }
+ return value;
+}
+
+static int
+mtp_function_bind(struct usb_configuration *c, struct usb_function *f)
+{
+ struct usb_composite_dev *cdev = c->cdev;
+ struct mtp_dev *dev = func_to_mtp(f);
+ int id;
+ int ret;
+ struct mtp_instance *fi_mtp;
+
+ dev->cdev = cdev;
+ DBG(cdev, "mtp_function_bind dev: %p\n", dev);
+
+ /* allocate interface ID(s) */
+ id = usb_interface_id(c, f);
+ if (id < 0)
+ return id;
+ mtp_interface_desc.bInterfaceNumber = id;
+
+ if (mtp_string_defs[INTERFACE_STRING_INDEX].id == 0) {
+ ret = usb_string_id(c->cdev);
+ if (ret < 0)
+ return ret;
+ mtp_string_defs[INTERFACE_STRING_INDEX].id = ret;
+ mtp_interface_desc.iInterface = ret;
+ }
+
+ fi_mtp = container_of(f->fi, struct mtp_instance, func_inst);
+
+ if (cdev->use_os_string) {
+ f->os_desc_table = kzalloc(sizeof(*f->os_desc_table),
+ GFP_KERNEL);
+ if (!f->os_desc_table)
+ return -ENOMEM;
+ f->os_desc_n = 1;
+ f->os_desc_table[0].os_desc = &fi_mtp->mtp_os_desc;
+ }
+
+ /* allocate endpoints */
+ ret = mtp_create_bulk_endpoints(dev, &mtp_fullspeed_in_desc,
+ &mtp_fullspeed_out_desc, &mtp_intr_desc);
+ if (ret)
+ return ret;
+
+ /* support high speed hardware */
+ if (gadget_is_dualspeed(c->cdev->gadget)) {
+ mtp_highspeed_in_desc.bEndpointAddress =
+ mtp_fullspeed_in_desc.bEndpointAddress;
+ mtp_highspeed_out_desc.bEndpointAddress =
+ mtp_fullspeed_out_desc.bEndpointAddress;
+ }
+ /* support super speed hardware */
+ if (gadget_is_superspeed(c->cdev->gadget)) {
+ unsigned max_burst;
+
+ /* Calculate bMaxBurst, we know packet size is 1024 */
+ max_burst = min_t(unsigned, MTP_BULK_BUFFER_SIZE / 1024, 15);
+ mtp_ss_in_desc.bEndpointAddress =
+ mtp_fullspeed_in_desc.bEndpointAddress;
+ mtp_ss_in_comp_desc.bMaxBurst = max_burst;
+ mtp_ss_out_desc.bEndpointAddress =
+ mtp_fullspeed_out_desc.bEndpointAddress;
+ mtp_ss_out_comp_desc.bMaxBurst = max_burst;
+ }
+
+ DBG(cdev, "%s speed %s: IN/%s, OUT/%s\n",
+ gadget_is_superspeed(c->cdev->gadget) ? "super" :
+ (gadget_is_dualspeed(c->cdev->gadget) ? "dual" : "full"),
+ f->name, dev->ep_in->name, dev->ep_out->name);
+ return 0;
+}
+
+static void
+mtp_function_unbind(struct usb_configuration *c, struct usb_function *f)
+{
+ struct mtp_dev *dev = func_to_mtp(f);
+ struct usb_request *req;
+ int i;
+
+ mtp_string_defs[INTERFACE_STRING_INDEX].id = 0;
+ while ((req = mtp_req_get(dev, &dev->tx_idle)))
+ mtp_request_free(req, dev->ep_in);
+ for (i = 0; i < RX_REQ_MAX; i++)
+ mtp_request_free(dev->rx_req[i], dev->ep_out);
+ while ((req = mtp_req_get(dev, &dev->intr_idle)))
+ mtp_request_free(req, dev->ep_intr);
+ dev->state = STATE_OFFLINE;
+ kfree(f->os_desc_table);
+ f->os_desc_n = 0;
+}
+
+static int mtp_function_set_alt(struct usb_function *f,
+ unsigned intf, unsigned alt)
+{
+ struct mtp_dev *dev = func_to_mtp(f);
+ struct usb_composite_dev *cdev = f->config->cdev;
+ int ret;
+
+ DBG(cdev, "mtp_function_set_alt intf: %d alt: %d\n", intf, alt);
+
+ ret = config_ep_by_speed(cdev->gadget, f, dev->ep_in);
+ if (ret)
+ return ret;
+
+ ret = usb_ep_enable(dev->ep_in);
+ if (ret)
+ return ret;
+
+ ret = config_ep_by_speed(cdev->gadget, f, dev->ep_out);
+ if (ret)
+ return ret;
+
+ ret = usb_ep_enable(dev->ep_out);
+ if (ret) {
+ usb_ep_disable(dev->ep_in);
+ return ret;
+ }
+
+ ret = config_ep_by_speed(cdev->gadget, f, dev->ep_intr);
+ if (ret)
+ return ret;
+
+ ret = usb_ep_enable(dev->ep_intr);
+ if (ret) {
+ usb_ep_disable(dev->ep_out);
+ usb_ep_disable(dev->ep_in);
+ return ret;
+ }
+ dev->state = STATE_READY;
+
+ /* readers may be blocked waiting for us to go online */
+ wake_up(&dev->read_wq);
+ return 0;
+}
+
+static void mtp_function_disable(struct usb_function *f)
+{
+ struct mtp_dev *dev = func_to_mtp(f);
+ struct usb_composite_dev *cdev = dev->cdev;
+
+ DBG(cdev, "mtp_function_disable\n");
+ dev->state = STATE_OFFLINE;
+ usb_ep_disable(dev->ep_in);
+ usb_ep_disable(dev->ep_out);
+ usb_ep_disable(dev->ep_intr);
+
+ /* readers may be blocked waiting for us to go online */
+ wake_up(&dev->read_wq);
+
+ VDBG(cdev, "%s disabled\n", dev->function.name);
+}
+
+static int __maybe_unused mtp_bind_config(struct usb_configuration *c, bool ptp_config)
+{
+ struct mtp_dev *dev = _mtp_dev;
+ int ret = 0;
+
+ printk(KERN_INFO "mtp_bind_config\n");
+
+ /* allocate a string ID for our interface */
+ if (mtp_string_defs[INTERFACE_STRING_INDEX].id == 0) {
+ ret = usb_string_id(c->cdev);
+ if (ret < 0)
+ return ret;
+ mtp_string_defs[INTERFACE_STRING_INDEX].id = ret;
+ mtp_interface_desc.iInterface = ret;
+ }
+
+ dev->cdev = c->cdev;
+ dev->function.name = DRIVER_NAME;
+ dev->function.strings = mtp_strings;
+ if (ptp_config) {
+ dev->function.fs_descriptors = fs_ptp_descs;
+ dev->function.hs_descriptors = hs_ptp_descs;
+ dev->function.ss_descriptors = ss_ptp_descs;
+ } else {
+ dev->function.fs_descriptors = fs_mtp_descs;
+ dev->function.hs_descriptors = hs_mtp_descs;
+ dev->function.ss_descriptors = ss_mtp_descs;
+ }
+ dev->function.bind = mtp_function_bind;
+ dev->function.unbind = mtp_function_unbind;
+ dev->function.set_alt = mtp_function_set_alt;
+ dev->function.disable = mtp_function_disable;
+
+ return usb_add_function(c, &dev->function);
+}
+
+static int __mtp_setup(struct mtp_instance *fi_mtp)
+{
+ struct mtp_dev *dev;
+ int ret;
+
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+
+ if (fi_mtp != NULL)
+ fi_mtp->dev = dev;
+
+ if (!dev)
+ return -ENOMEM;
+
+ spin_lock_init(&dev->lock);
+ init_waitqueue_head(&dev->read_wq);
+ init_waitqueue_head(&dev->write_wq);
+ init_waitqueue_head(&dev->intr_wq);
+ atomic_set(&dev->open_excl, 0);
+ atomic_set(&dev->ioctl_excl, 0);
+ INIT_LIST_HEAD(&dev->tx_idle);
+ INIT_LIST_HEAD(&dev->intr_idle);
+
+ dev->wq = create_singlethread_workqueue("f_mtp");
+ if (!dev->wq) {
+ ret = -ENOMEM;
+ goto err1;
+ }
+ INIT_WORK(&dev->send_file_work, send_file_work);
+ INIT_WORK(&dev->receive_file_work, receive_file_work);
+
+ _mtp_dev = dev;
+
+ ret = misc_register(&mtp_device);
+ if (ret)
+ goto err2;
+
+ return 0;
+
+err2:
+ destroy_workqueue(dev->wq);
+err1:
+ _mtp_dev = NULL;
+ kfree(dev);
+ printk(KERN_ERR "mtp gadget driver failed to initialize\n");
+ return ret;
+}
+
+static int __maybe_unused mtp_setup(void)
+{
+ return __mtp_setup(NULL);
+}
+
+static int mtp_setup_configfs(struct mtp_instance *fi_mtp)
+{
+ return __mtp_setup(fi_mtp);
+}
+
+
+static void mtp_cleanup(void)
+{
+ struct mtp_dev *dev = _mtp_dev;
+
+ if (!dev)
+ return;
+
+ misc_deregister(&mtp_device);
+ destroy_workqueue(dev->wq);
+ _mtp_dev = NULL;
+ kfree(dev);
+}
+
+static struct mtp_instance *to_mtp_instance(struct config_item *item)
+{
+ return container_of(to_config_group(item), struct mtp_instance,
+ func_inst.group);
+}
+
+static void mtp_attr_release(struct config_item *item)
+{
+ struct mtp_instance *fi_mtp = to_mtp_instance(item);
+ usb_put_function_instance(&fi_mtp->func_inst);
+}
+
+static struct configfs_item_operations mtp_item_ops = {
+ .release = mtp_attr_release,
+};
+
+static struct config_item_type mtp_func_type = {
+ .ct_item_ops = &mtp_item_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+
+static struct mtp_instance *to_fi_mtp(struct usb_function_instance *fi)
+{
+ return container_of(fi, struct mtp_instance, func_inst);
+}
+
+static int mtp_set_inst_name(struct usb_function_instance *fi, const char *name)
+{
+ struct mtp_instance *fi_mtp;
+ char *ptr;
+ int name_len;
+
+ name_len = strlen(name) + 1;
+ if (name_len > MAX_INST_NAME_LEN)
+ return -ENAMETOOLONG;
+
+ ptr = kstrndup(name, name_len, GFP_KERNEL);
+ if (!ptr)
+ return -ENOMEM;
+
+ fi_mtp = to_fi_mtp(fi);
+ fi_mtp->name = ptr;
+
+ return 0;
+}
+
+static void mtp_free_inst(struct usb_function_instance *fi)
+{
+ struct mtp_instance *fi_mtp;
+
+ fi_mtp = to_fi_mtp(fi);
+ kfree(fi_mtp->name);
+ mtp_cleanup();
+ kfree(fi_mtp->mtp_os_desc.group.default_groups);
+ kfree(fi_mtp);
+}
+
+struct usb_function_instance *alloc_inst_mtp_ptp(bool mtp_config)
+{
+ struct mtp_instance *fi_mtp;
+ int ret = 0;
+ struct usb_os_desc *descs[1];
+ char *names[1];
+
+ fi_mtp = kzalloc(sizeof(*fi_mtp), GFP_KERNEL);
+ if (!fi_mtp)
+ return ERR_PTR(-ENOMEM);
+ fi_mtp->func_inst.set_inst_name = mtp_set_inst_name;
+ fi_mtp->func_inst.free_func_inst = mtp_free_inst;
+
+ fi_mtp->mtp_os_desc.ext_compat_id = fi_mtp->mtp_ext_compat_id;
+ INIT_LIST_HEAD(&fi_mtp->mtp_os_desc.ext_prop);
+ descs[0] = &fi_mtp->mtp_os_desc;
+ names[0] = "MTP";
+ usb_os_desc_prepare_interf_dir(&fi_mtp->func_inst.group, 1,
+ descs, names, THIS_MODULE);
+
+ if (mtp_config) {
+ ret = mtp_setup_configfs(fi_mtp);
+ if (ret) {
+ kfree(fi_mtp);
+ pr_err("Error setting MTP\n");
+ return ERR_PTR(ret);
+ }
+ } else
+ fi_mtp->dev = _mtp_dev;
+
+ config_group_init_type_name(&fi_mtp->func_inst.group,
+ "", &mtp_func_type);
+
+ return &fi_mtp->func_inst;
+}
+EXPORT_SYMBOL_GPL(alloc_inst_mtp_ptp);
+
+static struct usb_function_instance *mtp_alloc_inst(void)
+{
+ return alloc_inst_mtp_ptp(true);
+}
+
+static int mtp_ctrlreq_configfs(struct usb_function *f,
+ const struct usb_ctrlrequest *ctrl)
+{
+ return mtp_ctrlrequest(f->config->cdev, ctrl);
+}
+
+static void mtp_free(struct usb_function *f)
+{
+ /*NO-OP: no function specific resource allocation in mtp_alloc*/
+}
+
+struct usb_function *function_alloc_mtp_ptp(struct usb_function_instance *fi,
+ bool mtp_config)
+{
+ struct mtp_instance *fi_mtp = to_fi_mtp(fi);
+ struct mtp_dev *dev;
+
+ /*
+ * PTP piggybacks on MTP function so make sure we have
+ * created MTP function before we associate this PTP
+ * function with a gadget configuration.
+ */
+ if (fi_mtp->dev == NULL) {
+ pr_err("Error: Create MTP function before linking"
+ " PTP function with a gadget configuration\n");
+ pr_err("\t1: Delete existing PTP function if any\n");
+ pr_err("\t2: Create MTP function\n");
+ pr_err("\t3: Create and symlink PTP function"
+ " with a gadget configuration\n");
+ return ERR_PTR(-EINVAL); /* Invalid Configuration */
+ }
+
+ dev = fi_mtp->dev;
+ dev->function.name = DRIVER_NAME;
+ dev->function.strings = mtp_strings;
+ if (mtp_config) {
+ dev->function.fs_descriptors = fs_mtp_descs;
+ dev->function.hs_descriptors = hs_mtp_descs;
+ dev->function.ss_descriptors = ss_mtp_descs;
+ } else {
+ dev->function.fs_descriptors = fs_ptp_descs;
+ dev->function.hs_descriptors = hs_ptp_descs;
+ dev->function.ss_descriptors = ss_ptp_descs;
+ }
+ dev->function.bind = mtp_function_bind;
+ dev->function.unbind = mtp_function_unbind;
+ dev->function.set_alt = mtp_function_set_alt;
+ dev->function.disable = mtp_function_disable;
+ dev->function.setup = mtp_ctrlreq_configfs;
+ dev->function.free_func = mtp_free;
+
+ return &dev->function;
+}
+EXPORT_SYMBOL_GPL(function_alloc_mtp_ptp);
+
+static struct usb_function *mtp_alloc(struct usb_function_instance *fi)
+{
+ return function_alloc_mtp_ptp(fi, true);
+}
+
+DECLARE_USB_FUNCTION_INIT(mtp, mtp_alloc_inst, mtp_alloc);
+MODULE_LICENSE("GPL");
diff --git a/drivers/usb/gadget/function/f_mtp.h b/drivers/usb/gadget/function/f_mtp.h
new file mode 100644
index 000000000000..7adb1ff08eff
--- /dev/null
+++ b/drivers/usb/gadget/function/f_mtp.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright (C) 2014 Google, Inc.
+ * Author: Badhri Jagan Sridharan <badhri@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+extern struct usb_function_instance *alloc_inst_mtp_ptp(bool mtp_config);
+extern struct usb_function *function_alloc_mtp_ptp(
+ struct usb_function_instance *fi, bool mtp_config);
diff --git a/drivers/usb/gadget/function/f_ptp.c b/drivers/usb/gadget/function/f_ptp.c
new file mode 100644
index 000000000000..da3e4d53e085
--- /dev/null
+++ b/drivers/usb/gadget/function/f_ptp.c
@@ -0,0 +1,38 @@
+/*
+ * Gadget Function Driver for PTP
+ *
+ * Copyright (C) 2014 Google, Inc.
+ * Author: Badhri Jagan Sridharan <badhri@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+
+#include <linux/configfs.h>
+#include <linux/usb/composite.h>
+
+#include "f_mtp.h"
+
+static struct usb_function_instance *ptp_alloc_inst(void)
+{
+ return alloc_inst_mtp_ptp(false);
+}
+
+static struct usb_function *ptp_alloc(struct usb_function_instance *fi)
+{
+ return function_alloc_mtp_ptp(fi, false);
+}
+
+DECLARE_USB_FUNCTION_INIT(ptp, ptp_alloc_inst, ptp_alloc);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Badhri Jagan Sridharan");
diff --git a/drivers/usb/gadget/function/f_rndis.c b/drivers/usb/gadget/function/f_rndis.c
index f13fc6a58565..f60ec148d70c 100644
--- a/drivers/usb/gadget/function/f_rndis.c
+++ b/drivers/usb/gadget/function/f_rndis.c
@@ -70,6 +70,16 @@
* - MS-Windows drivers sometimes emit undocumented requests.
*/
+static unsigned int rndis_dl_max_pkt_per_xfer = 3;
+module_param(rndis_dl_max_pkt_per_xfer, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(rndis_dl_max_pkt_per_xfer,
+ "Maximum packets per transfer for DL aggregation");
+
+static unsigned int rndis_ul_max_pkt_per_xfer = 3;
+module_param(rndis_ul_max_pkt_per_xfer, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(rndis_ul_max_pkt_per_xfer,
+ "Maximum packets per transfer for UL aggregation");
+
struct f_rndis {
struct gether port;
u8 ctrl_id, data_id;
@@ -451,6 +461,7 @@ static void rndis_command_complete(struct usb_ep *ep, struct usb_request *req)
{
struct f_rndis *rndis = req->context;
int status;
+ rndis_init_msg_type *buf;
/* received RNDIS command from USB_CDC_SEND_ENCAPSULATED_COMMAND */
// spin_lock(&dev->lock);
@@ -458,6 +469,21 @@ static void rndis_command_complete(struct usb_ep *ep, struct usb_request *req)
if (status < 0)
pr_err("RNDIS command error %d, %d/%d\n",
status, req->actual, req->length);
+
+ buf = (rndis_init_msg_type *)req->buf;
+
+ if (buf->MessageType == RNDIS_MSG_INIT) {
+ if (buf->MaxTransferSize > 2048)
+ rndis->port.multi_pkt_xfer = 1;
+ else
+ rndis->port.multi_pkt_xfer = 0;
+ pr_info("%s: MaxTransferSize: %d : Multi_pkt_txr: %s\n",
+ __func__, buf->MaxTransferSize,
+ rndis->port.multi_pkt_xfer ? "enabled" :
+ "disabled");
+ if (rndis_dl_max_pkt_per_xfer <= 1)
+ rndis->port.multi_pkt_xfer = 0;
+ }
// spin_unlock(&dev->lock);
}
@@ -799,6 +825,7 @@ rndis_bind(struct usb_configuration *c, struct usb_function *f)
rndis_set_param_medium(rndis->config, RNDIS_MEDIUM_802_3, 0);
rndis_set_host_mac(rndis->config, rndis->ethaddr);
+ rndis_set_max_pkt_xfer(rndis->config, rndis_ul_max_pkt_per_xfer);
if (rndis->manufacturer && rndis->vendorID &&
rndis_set_param_vendor(rndis->config, rndis->vendorID,
@@ -843,6 +870,62 @@ fail:
return status;
}
+static void
+rndis_old_unbind(struct usb_configuration *c, struct usb_function *f)
+{
+ struct f_rndis *rndis = func_to_rndis(f);
+
+ rndis_deregister(rndis->config);
+
+ usb_free_all_descriptors(f);
+
+ kfree(rndis->notify_req->buf);
+ usb_ep_free_request(rndis->notify, rndis->notify_req);
+
+ kfree(rndis);
+}
+
+int
+rndis_bind_config_vendor(struct usb_configuration *c, u8 ethaddr[ETH_ALEN],
+ u32 vendorID, const char *manufacturer, struct eth_dev *dev)
+{
+ struct f_rndis *rndis;
+ int status;
+
+ /* allocate and initialize one new instance */
+ status = -ENOMEM;
+ rndis = kzalloc(sizeof *rndis, GFP_KERNEL);
+ if (!rndis)
+ goto fail;
+
+ memcpy(rndis->ethaddr, ethaddr, ETH_ALEN);
+ rndis->vendorID = vendorID;
+ rndis->manufacturer = manufacturer;
+
+ rndis->port.ioport = dev;
+ /* RNDIS activates when the host changes this filter */
+ rndis->port.cdc_filter = 0;
+
+ /* RNDIS has special (and complex) framing */
+ rndis->port.header_len = sizeof(struct rndis_packet_msg_type);
+ rndis->port.wrap = rndis_add_header;
+ rndis->port.unwrap = rndis_rm_hdr;
+
+ rndis->port.func.name = "rndis";
+ /* descriptors are per-instance copies */
+ rndis->port.func.bind = rndis_bind;
+ rndis->port.func.unbind = rndis_old_unbind;
+ rndis->port.func.set_alt = rndis_set_alt;
+ rndis->port.func.setup = rndis_setup;
+ rndis->port.func.disable = rndis_disable;
+
+ status = usb_add_function(c, &rndis->port.func);
+ if (status)
+ kfree(rndis);
+fail:
+ return status;
+}
+
void rndis_borrow_net(struct usb_function_instance *f, struct net_device *net)
{
struct f_rndis_opts *opts;
@@ -993,6 +1076,8 @@ static struct usb_function *rndis_alloc(struct usb_function_instance *fi)
rndis->port.header_len = sizeof(struct rndis_packet_msg_type);
rndis->port.wrap = rndis_add_header;
rndis->port.unwrap = rndis_rm_hdr;
+ rndis->port.ul_max_pkts_per_xfer = rndis_ul_max_pkt_per_xfer;
+ rndis->port.dl_max_pkts_per_xfer = rndis_dl_max_pkt_per_xfer;
rndis->port.func.name = "rndis";
/* descriptors are per-instance copies */
diff --git a/drivers/usb/gadget/function/rndis.c b/drivers/usb/gadget/function/rndis.c
index 95d2324f6977..ef948877f8ab 100644
--- a/drivers/usb/gadget/function/rndis.c
+++ b/drivers/usb/gadget/function/rndis.c
@@ -59,6 +59,16 @@ MODULE_PARM_DESC (rndis_debug, "enable debugging");
#define RNDIS_MAX_CONFIGS 1
+int rndis_ul_max_pkt_per_xfer_rcvd;
+module_param(rndis_ul_max_pkt_per_xfer_rcvd, int, S_IRUGO);
+MODULE_PARM_DESC(rndis_ul_max_pkt_per_xfer_rcvd,
+ "Max num of REMOTE_NDIS_PACKET_MSGs received in a single transfer");
+
+int rndis_ul_max_xfer_size_rcvd;
+module_param(rndis_ul_max_xfer_size_rcvd, int, S_IRUGO);
+MODULE_PARM_DESC(rndis_ul_max_xfer_size_rcvd,
+ "Max size of bus transfer received");
+
static rndis_params rndis_per_dev_params[RNDIS_MAX_CONFIGS];
@@ -585,12 +595,12 @@ static int rndis_init_response(int configNr, rndis_init_msg_type *buf)
resp->MinorVersion = cpu_to_le32(RNDIS_MINOR_VERSION);
resp->DeviceFlags = cpu_to_le32(RNDIS_DF_CONNECTIONLESS);
resp->Medium = cpu_to_le32(RNDIS_MEDIUM_802_3);
- resp->MaxPacketsPerTransfer = cpu_to_le32(1);
- resp->MaxTransferSize = cpu_to_le32(
- params->dev->mtu
+ resp->MaxPacketsPerTransfer = cpu_to_le32(params->max_pkt_per_xfer);
+ resp->MaxTransferSize = cpu_to_le32(params->max_pkt_per_xfer *
+ (params->dev->mtu
+ sizeof(struct ethhdr)
+ sizeof(struct rndis_packet_msg_type)
- + 22);
+ + 22));
resp->PacketAlignmentFactor = cpu_to_le32(0);
resp->AFListOffset = cpu_to_le32(0);
resp->AFListSize = cpu_to_le32(0);
@@ -686,6 +696,12 @@ static int rndis_reset_response(int configNr, rndis_reset_msg_type *buf)
rndis_reset_cmplt_type *resp;
rndis_resp_t *r;
struct rndis_params *params = rndis_per_dev_params + configNr;
+ u32 length;
+ u8 *xbuf;
+
+ /* drain the response queue */
+ while ((xbuf = rndis_get_next_response(configNr, &length)))
+ rndis_free_response(configNr, xbuf);
r = rndis_add_response(configNr, sizeof(rndis_reset_cmplt_type));
if (!r)
@@ -917,6 +933,8 @@ int rndis_set_param_dev(u8 configNr, struct net_device *dev, u16 *cdc_filter)
rndis_per_dev_params[configNr].dev = dev;
rndis_per_dev_params[configNr].filter = cdc_filter;
+ rndis_ul_max_xfer_size_rcvd = 0;
+ rndis_ul_max_pkt_per_xfer_rcvd = 0;
return 0;
}
EXPORT_SYMBOL_GPL(rndis_set_param_dev);
@@ -946,6 +964,13 @@ int rndis_set_param_medium(u8 configNr, u32 medium, u32 speed)
}
EXPORT_SYMBOL_GPL(rndis_set_param_medium);
+void rndis_set_max_pkt_xfer(u8 configNr, u8 max_pkt_per_xfer)
+{
+ pr_debug("%s:\n", __func__);
+
+ rndis_per_dev_params[configNr].max_pkt_per_xfer = max_pkt_per_xfer;
+}
+
void rndis_add_hdr(struct sk_buff *skb)
{
struct rndis_packet_msg_type *header;
@@ -1021,23 +1046,73 @@ int rndis_rm_hdr(struct gether *port,
struct sk_buff *skb,
struct sk_buff_head *list)
{
- /* tmp points to a struct rndis_packet_msg_type */
- __le32 *tmp = (void *)skb->data;
+ int num_pkts = 1;
- /* MessageType, MessageLength */
- if (cpu_to_le32(RNDIS_MSG_PACKET)
- != get_unaligned(tmp++)) {
- dev_kfree_skb_any(skb);
- return -EINVAL;
- }
- tmp++;
+ if (skb->len > rndis_ul_max_xfer_size_rcvd)
+ rndis_ul_max_xfer_size_rcvd = skb->len;
+
+ while (skb->len) {
+ struct rndis_packet_msg_type *hdr;
+ struct sk_buff *skb2;
+ u32 msg_len, data_offset, data_len;
+
+ /* some rndis hosts send extra byte to avoid zlp, ignore it */
+ if (skb->len == 1) {
+ dev_kfree_skb_any(skb);
+ return 0;
+ }
+
+ if (skb->len < sizeof *hdr) {
+ pr_err("invalid rndis pkt: skblen:%u hdr_len:%zu",
+ skb->len, sizeof *hdr);
+ dev_kfree_skb_any(skb);
+ return -EINVAL;
+ }
+
+ hdr = (void *)skb->data;
+ msg_len = le32_to_cpu(hdr->MessageLength);
+ data_offset = le32_to_cpu(hdr->DataOffset);
+ data_len = le32_to_cpu(hdr->DataLength);
+
+ if (skb->len < msg_len ||
+ ((data_offset + data_len + 8) > msg_len)) {
+ pr_err("invalid rndis message: %d/%d/%d/%d, len:%d\n",
+ le32_to_cpu(hdr->MessageType),
+ msg_len, data_offset, data_len, skb->len);
+ dev_kfree_skb_any(skb);
+ return -EOVERFLOW;
+ }
+ if (le32_to_cpu(hdr->MessageType) != RNDIS_MSG_PACKET) {
+ pr_err("invalid rndis message: %d/%d/%d/%d, len:%d\n",
+ le32_to_cpu(hdr->MessageType),
+ msg_len, data_offset, data_len, skb->len);
+ dev_kfree_skb_any(skb);
+ return -EINVAL;
+ }
+
+ skb_pull(skb, data_offset + 8);
- /* DataOffset, DataLength */
- if (!skb_pull(skb, get_unaligned_le32(tmp++) + 8)) {
- dev_kfree_skb_any(skb);
- return -EOVERFLOW;
+ if (msg_len == skb->len) {
+ skb_trim(skb, data_len);
+ break;
+ }
+
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (!skb2) {
+ pr_err("%s:skb clone failed\n", __func__);
+ dev_kfree_skb_any(skb);
+ return -ENOMEM;
+ }
+
+ skb_pull(skb, msg_len - sizeof *hdr);
+ skb_trim(skb2, data_len);
+ skb_queue_tail(list, skb2);
+
+ num_pkts++;
}
- skb_trim(skb, get_unaligned_le32(tmp++));
+
+ if (num_pkts > rndis_ul_max_pkt_per_xfer_rcvd)
+ rndis_ul_max_pkt_per_xfer_rcvd = num_pkts;
skb_queue_tail(list, skb);
return 0;
diff --git a/drivers/usb/gadget/function/rndis.h b/drivers/usb/gadget/function/rndis.h
index 0f4abb4c3775..145f01b18190 100644
--- a/drivers/usb/gadget/function/rndis.h
+++ b/drivers/usb/gadget/function/rndis.h
@@ -190,6 +190,7 @@ typedef struct rndis_params
struct net_device *dev;
u32 vendorID;
+ u8 max_pkt_per_xfer;
const char *vendorDescr;
void (*resp_avail)(void *v);
void *v;
@@ -205,6 +206,7 @@ int rndis_set_param_dev (u8 configNr, struct net_device *dev,
int rndis_set_param_vendor (u8 configNr, u32 vendorID,
const char *vendorDescr);
int rndis_set_param_medium (u8 configNr, u32 medium, u32 speed);
+void rndis_set_max_pkt_xfer(u8 configNr, u8 max_pkt_per_xfer);
void rndis_add_hdr (struct sk_buff *skb);
int rndis_rm_hdr(struct gether *port, struct sk_buff *skb,
struct sk_buff_head *list);
diff --git a/drivers/usb/gadget/function/u_ether.c b/drivers/usb/gadget/function/u_ether.c
index 33011f8ee865..08eab6d4d4f1 100644
--- a/drivers/usb/gadget/function/u_ether.c
+++ b/drivers/usb/gadget/function/u_ether.c
@@ -48,6 +48,8 @@
#define UETH__VERSION "29-May-2008"
+static struct workqueue_struct *uether_wq;
+
struct eth_dev {
/* lock is held while accessing port_usb
*/
@@ -59,19 +61,27 @@ struct eth_dev {
spinlock_t req_lock; /* guard {rx,tx}_reqs */
struct list_head tx_reqs, rx_reqs;
- atomic_t tx_qlen;
+ unsigned tx_qlen;
+/* Minimum number of TX USB request queued to UDC */
+#define TX_REQ_THRESHOLD 5
+ int no_tx_req_used;
+ int tx_skb_hold_count;
+ u32 tx_req_bufsize;
struct sk_buff_head rx_frames;
unsigned qmult;
unsigned header_len;
+ unsigned ul_max_pkts_per_xfer;
+ unsigned dl_max_pkts_per_xfer;
struct sk_buff *(*wrap)(struct gether *, struct sk_buff *skb);
int (*unwrap)(struct gether *,
struct sk_buff *skb,
struct sk_buff_head *list);
struct work_struct work;
+ struct work_struct rx_work;
unsigned long todo;
#define WORK_RX_MEMORY 0
@@ -225,9 +235,13 @@ rx_submit(struct eth_dev *dev, struct usb_request *req, gfp_t gfp_flags)
size += out->maxpacket - 1;
size -= size % out->maxpacket;
+ if (dev->ul_max_pkts_per_xfer)
+ size *= dev->ul_max_pkts_per_xfer;
+
if (dev->port_usb->is_fixed)
size = max_t(size_t, size, dev->port_usb->fixed_out_len);
+ DBG(dev, "%s: size: %zd\n", __func__, size);
skb = alloc_skb(size + NET_IP_ALIGN, gfp_flags);
if (skb == NULL) {
DBG(dev, "no rx skb\n");
@@ -253,18 +267,16 @@ enomem:
DBG(dev, "rx submit --> %d\n", retval);
if (skb)
dev_kfree_skb_any(skb);
- spin_lock_irqsave(&dev->req_lock, flags);
- list_add(&req->list, &dev->rx_reqs);
- spin_unlock_irqrestore(&dev->req_lock, flags);
}
return retval;
}
static void rx_complete(struct usb_ep *ep, struct usb_request *req)
{
- struct sk_buff *skb = req->context, *skb2;
+ struct sk_buff *skb = req->context;
struct eth_dev *dev = ep->driver_data;
int status = req->status;
+ bool queue = 0;
switch (status) {
@@ -280,6 +292,10 @@ static void rx_complete(struct usb_ep *ep, struct usb_request *req)
status = dev->unwrap(dev->port_usb,
skb,
&dev->rx_frames);
+ if (status == -EINVAL)
+ dev->net->stats.rx_errors++;
+ else if (status == -EOVERFLOW)
+ dev->net->stats.rx_over_errors++;
} else {
dev_kfree_skb_any(skb);
status = -ENOTCONN;
@@ -288,30 +304,9 @@ static void rx_complete(struct usb_ep *ep, struct usb_request *req)
} else {
skb_queue_tail(&dev->rx_frames, skb);
}
- skb = NULL;
-
- skb2 = skb_dequeue(&dev->rx_frames);
- while (skb2) {
- if (status < 0
- || ETH_HLEN > skb2->len
- || skb2->len > VLAN_ETH_FRAME_LEN) {
- dev->net->stats.rx_errors++;
- dev->net->stats.rx_length_errors++;
- DBG(dev, "rx length %d\n", skb2->len);
- dev_kfree_skb_any(skb2);
- goto next_frame;
- }
- skb2->protocol = eth_type_trans(skb2, dev->net);
- dev->net->stats.rx_packets++;
- dev->net->stats.rx_bytes += skb2->len;
- /* no buffer copies needed, unless hardware can't
- * use skb buffers.
- */
- status = netif_rx(skb2);
-next_frame:
- skb2 = skb_dequeue(&dev->rx_frames);
- }
+ if (!status)
+ queue = 1;
break;
/* software-driven interface shutdown */
@@ -334,22 +329,20 @@ quiesce:
/* FALLTHROUGH */
default:
+ queue = 1;
+ dev_kfree_skb_any(skb);
dev->net->stats.rx_errors++;
DBG(dev, "rx status %d\n", status);
break;
}
- if (skb)
- dev_kfree_skb_any(skb);
- if (!netif_running(dev->net)) {
clean:
- spin_lock(&dev->req_lock);
- list_add(&req->list, &dev->rx_reqs);
- spin_unlock(&dev->req_lock);
- req = NULL;
- }
- if (req)
- rx_submit(dev, req, GFP_ATOMIC);
+ spin_lock(&dev->req_lock);
+ list_add(&req->list, &dev->rx_reqs);
+ spin_unlock(&dev->req_lock);
+
+ if (queue)
+ queue_work(uether_wq, &dev->rx_work);
}
static int prealloc(struct list_head *list, struct usb_ep *ep, unsigned n)
@@ -414,16 +407,24 @@ static void rx_fill(struct eth_dev *dev, gfp_t gfp_flags)
{
struct usb_request *req;
unsigned long flags;
+ int req_cnt = 0;
/* fill unused rxq slots with some skb */
spin_lock_irqsave(&dev->req_lock, flags);
while (!list_empty(&dev->rx_reqs)) {
+ /* break the nexus of continuous completion and re-submission*/
+ if (++req_cnt > qlen(dev->gadget, dev->qmult))
+ break;
+
req = container_of(dev->rx_reqs.next,
struct usb_request, list);
list_del_init(&req->list);
spin_unlock_irqrestore(&dev->req_lock, flags);
if (rx_submit(dev, req, gfp_flags) < 0) {
+ spin_lock_irqsave(&dev->req_lock, flags);
+ list_add(&req->list, &dev->rx_reqs);
+ spin_unlock_irqrestore(&dev->req_lock, flags);
defer_kevent(dev, WORK_RX_MEMORY);
return;
}
@@ -433,6 +434,36 @@ static void rx_fill(struct eth_dev *dev, gfp_t gfp_flags)
spin_unlock_irqrestore(&dev->req_lock, flags);
}
+static void process_rx_w(struct work_struct *work)
+{
+ struct eth_dev *dev = container_of(work, struct eth_dev, rx_work);
+ struct sk_buff *skb;
+ int status = 0;
+
+ if (!dev->port_usb)
+ return;
+
+ while ((skb = skb_dequeue(&dev->rx_frames))) {
+ if (status < 0
+ || ETH_HLEN > skb->len
+ || skb->len > ETH_FRAME_LEN) {
+ dev->net->stats.rx_errors++;
+ dev->net->stats.rx_length_errors++;
+ DBG(dev, "rx length %d\n", skb->len);
+ dev_kfree_skb_any(skb);
+ continue;
+ }
+ skb->protocol = eth_type_trans(skb, dev->net);
+ dev->net->stats.rx_packets++;
+ dev->net->stats.rx_bytes += skb->len;
+
+ status = netif_rx_ni(skb);
+ }
+
+ if (netif_running(dev->net))
+ rx_fill(dev, GFP_KERNEL);
+}
+
static void eth_work(struct work_struct *work)
{
struct eth_dev *dev = container_of(work, struct eth_dev, work);
@@ -450,6 +481,11 @@ static void tx_complete(struct usb_ep *ep, struct usb_request *req)
{
struct sk_buff *skb = req->context;
struct eth_dev *dev = ep->driver_data;
+ struct net_device *net = dev->net;
+ struct usb_request *new_req;
+ struct usb_ep *in;
+ int length;
+ int retval;
switch (req->status) {
default:
@@ -460,16 +496,74 @@ static void tx_complete(struct usb_ep *ep, struct usb_request *req)
case -ESHUTDOWN: /* disconnect etc */
break;
case 0:
- dev->net->stats.tx_bytes += skb->len;
+ if (!req->zero)
+ dev->net->stats.tx_bytes += req->length-1;
+ else
+ dev->net->stats.tx_bytes += req->length;
}
dev->net->stats.tx_packets++;
spin_lock(&dev->req_lock);
- list_add(&req->list, &dev->tx_reqs);
- spin_unlock(&dev->req_lock);
- dev_kfree_skb_any(skb);
+ list_add_tail(&req->list, &dev->tx_reqs);
+
+ if (dev->port_usb->multi_pkt_xfer) {
+ dev->no_tx_req_used--;
+ req->length = 0;
+ in = dev->port_usb->in_ep;
+
+ if (!list_empty(&dev->tx_reqs)) {
+ new_req = container_of(dev->tx_reqs.next,
+ struct usb_request, list);
+ list_del(&new_req->list);
+ spin_unlock(&dev->req_lock);
+ if (new_req->length > 0) {
+ length = new_req->length;
+
+ /* NCM requires no zlp if transfer is
+ * dwNtbInMaxSize */
+ if (dev->port_usb->is_fixed &&
+ length == dev->port_usb->fixed_in_len &&
+ (length % in->maxpacket) == 0)
+ new_req->zero = 0;
+ else
+ new_req->zero = 1;
+
+ /* use zlp framing on tx for strict CDC-Ether
+ * conformance, though any robust network rx
+ * path ignores extra padding. and some hardware
+ * doesn't like to write zlps.
+ */
+ if (new_req->zero && !dev->zlp &&
+ (length % in->maxpacket) == 0) {
+ new_req->zero = 0;
+ length++;
+ }
+
+ new_req->length = length;
+ retval = usb_ep_queue(in, new_req, GFP_ATOMIC);
+ switch (retval) {
+ default:
+ DBG(dev, "tx queue err %d\n", retval);
+ break;
+ case 0:
+ spin_lock(&dev->req_lock);
+ dev->no_tx_req_used++;
+ spin_unlock(&dev->req_lock);
+ net->trans_start = jiffies;
+ }
+ } else {
+ spin_lock(&dev->req_lock);
+ list_add(&new_req->list, &dev->tx_reqs);
+ spin_unlock(&dev->req_lock);
+ }
+ } else {
+ spin_unlock(&dev->req_lock);
+ }
+ } else {
+ spin_unlock(&dev->req_lock);
+ dev_kfree_skb_any(skb);
+ }
- atomic_dec(&dev->tx_qlen);
if (netif_carrier_ok(dev->net))
netif_wake_queue(dev->net);
}
@@ -479,6 +573,26 @@ static inline int is_promisc(u16 cdc_filter)
return cdc_filter & USB_CDC_PACKET_TYPE_PROMISCUOUS;
}
+static void alloc_tx_buffer(struct eth_dev *dev)
+{
+ struct list_head *act;
+ struct usb_request *req;
+
+ dev->tx_req_bufsize = (dev->dl_max_pkts_per_xfer *
+ (dev->net->mtu
+ + sizeof(struct ethhdr)
+ /* size of rndis_packet_msg_type */
+ + 44
+ + 22));
+
+ list_for_each(act, &dev->tx_reqs) {
+ req = container_of(act, struct usb_request, list);
+ if (!req->buf)
+ req->buf = kmalloc(dev->tx_req_bufsize,
+ GFP_ATOMIC);
+ }
+}
+
static netdev_tx_t eth_start_xmit(struct sk_buff *skb,
struct net_device *net)
{
@@ -505,6 +619,10 @@ static netdev_tx_t eth_start_xmit(struct sk_buff *skb,
return NETDEV_TX_OK;
}
+ /* Allocate memory for tx_reqs to support multi packet transfer */
+ if (dev->port_usb->multi_pkt_xfer && !dev->tx_req_bufsize)
+ alloc_tx_buffer(dev);
+
/* apply outgoing CDC or RNDIS filters */
if (skb && !is_promisc(cdc_filter)) {
u8 *dest = skb->data;
@@ -567,9 +685,37 @@ static netdev_tx_t eth_start_xmit(struct sk_buff *skb,
}
}
- length = skb->len;
- req->buf = skb->data;
- req->context = skb;
+ spin_lock_irqsave(&dev->req_lock, flags);
+ dev->tx_skb_hold_count++;
+ spin_unlock_irqrestore(&dev->req_lock, flags);
+
+ if (dev->port_usb->multi_pkt_xfer) {
+ memcpy(req->buf + req->length, skb->data, skb->len);
+ req->length = req->length + skb->len;
+ length = req->length;
+ dev_kfree_skb_any(skb);
+
+ spin_lock_irqsave(&dev->req_lock, flags);
+ if (dev->tx_skb_hold_count < dev->dl_max_pkts_per_xfer) {
+ if (dev->no_tx_req_used > TX_REQ_THRESHOLD) {
+ list_add(&req->list, &dev->tx_reqs);
+ spin_unlock_irqrestore(&dev->req_lock, flags);
+ goto success;
+ }
+ }
+
+ dev->no_tx_req_used++;
+ spin_unlock_irqrestore(&dev->req_lock, flags);
+
+ spin_lock_irqsave(&dev->lock, flags);
+ dev->tx_skb_hold_count = 0;
+ spin_unlock_irqrestore(&dev->lock, flags);
+ } else {
+ length = skb->len;
+ req->buf = skb->data;
+ req->context = skb;
+ }
+
req->complete = tx_complete;
/* NCM requires no zlp if transfer is dwNtbInMaxSize */
@@ -584,8 +730,10 @@ static netdev_tx_t eth_start_xmit(struct sk_buff *skb,
* though any robust network rx path ignores extra padding.
* and some hardware doesn't like to write zlps.
*/
- if (req->zero && !dev->zlp && (length % in->maxpacket) == 0)
+ if (req->zero && !dev->zlp && (length % in->maxpacket) == 0) {
+ req->zero = 0;
length++;
+ }
req->length = length;
@@ -596,11 +744,11 @@ static netdev_tx_t eth_start_xmit(struct sk_buff *skb,
break;
case 0:
net->trans_start = jiffies;
- atomic_inc(&dev->tx_qlen);
}
if (retval) {
- dev_kfree_skb_any(skb);
+ if (!dev->port_usb->multi_pkt_xfer)
+ dev_kfree_skb_any(skb);
drop:
dev->net->stats.tx_dropped++;
multiframe:
@@ -610,6 +758,7 @@ multiframe:
list_add(&req->list, &dev->tx_reqs);
spin_unlock_irqrestore(&dev->req_lock, flags);
}
+success:
return NETDEV_TX_OK;
}
@@ -623,7 +772,7 @@ static void eth_start(struct eth_dev *dev, gfp_t gfp_flags)
rx_fill(dev, gfp_flags);
/* and open the tx floodgates */
- atomic_set(&dev->tx_qlen, 0);
+ dev->tx_qlen = 0;
netif_wake_queue(dev->net);
}
@@ -696,6 +845,8 @@ static int eth_stop(struct net_device *net)
/*-------------------------------------------------------------------------*/
+static u8 host_ethaddr[ETH_ALEN];
+
static int get_ether_addr(const char *str, u8 *dev_addr)
{
if (str) {
@@ -728,6 +879,17 @@ static int get_ether_addr_str(u8 dev_addr[ETH_ALEN], char *str, int len)
return 18;
}
+static int get_host_ether_addr(u8 *str, u8 *dev_addr)
+{
+ memcpy(dev_addr, str, ETH_ALEN);
+ if (is_valid_ether_addr(dev_addr))
+ return 0;
+
+ random_ether_addr(dev_addr);
+ memcpy(str, dev_addr, ETH_ALEN);
+ return 1;
+}
+
static const struct net_device_ops eth_netdev_ops = {
.ndo_open = eth_open,
.ndo_stop = eth_stop,
@@ -771,6 +933,7 @@ struct eth_dev *gether_setup_name(struct usb_gadget *g,
spin_lock_init(&dev->lock);
spin_lock_init(&dev->req_lock);
INIT_WORK(&dev->work, eth_work);
+ INIT_WORK(&dev->rx_work, process_rx_w);
INIT_LIST_HEAD(&dev->tx_reqs);
INIT_LIST_HEAD(&dev->rx_reqs);
@@ -784,9 +947,11 @@ struct eth_dev *gether_setup_name(struct usb_gadget *g,
if (get_ether_addr(dev_addr, net->dev_addr))
dev_warn(&g->dev,
"using random %s ethernet address\n", "self");
- if (get_ether_addr(host_addr, dev->host_mac))
- dev_warn(&g->dev,
- "using random %s ethernet address\n", "host");
+
+ if (get_host_ether_addr(host_ethaddr, dev->host_mac))
+ dev_warn(&g->dev, "using random %s ethernet address\n", "host");
+ else
+ dev_warn(&g->dev, "using previous %s ethernet address\n", "host");
if (ethaddr)
memcpy(ethaddr, dev->host_mac, ETH_ALEN);
@@ -833,6 +998,7 @@ struct net_device *gether_setup_name_default(const char *netname)
spin_lock_init(&dev->lock);
spin_lock_init(&dev->req_lock);
INIT_WORK(&dev->work, eth_work);
+ INIT_WORK(&dev->rx_work, process_rx_w);
INIT_LIST_HEAD(&dev->tx_reqs);
INIT_LIST_HEAD(&dev->rx_reqs);
@@ -845,8 +1011,10 @@ struct net_device *gether_setup_name_default(const char *netname)
eth_random_addr(dev->dev_mac);
pr_warn("using random %s ethernet address\n", "self");
- eth_random_addr(dev->host_mac);
- pr_warn("using random %s ethernet address\n", "host");
+ if (get_host_ether_addr(host_ethaddr, dev->host_mac))
+ pr_warn("using random %s ethernet address\n", "host");
+ else
+ pr_warn("using previous %s ethernet address\n", "host");
net->netdev_ops = &eth_netdev_ops;
@@ -1067,8 +1235,13 @@ struct net_device *gether_connect(struct gether *link)
dev->header_len = link->header_len;
dev->unwrap = link->unwrap;
dev->wrap = link->wrap;
+ dev->ul_max_pkts_per_xfer = link->ul_max_pkts_per_xfer;
+ dev->dl_max_pkts_per_xfer = link->dl_max_pkts_per_xfer;
spin_lock(&dev->lock);
+ dev->tx_skb_hold_count = 0;
+ dev->no_tx_req_used = 0;
+ dev->tx_req_bufsize = 0;
dev->port_usb = link;
if (netif_running(dev->net)) {
if (link->open)
@@ -1113,6 +1286,7 @@ void gether_disconnect(struct gether *link)
{
struct eth_dev *dev = link->ioport;
struct usb_request *req;
+ struct sk_buff *skb;
WARN_ON(!dev);
if (!dev)
@@ -1135,6 +1309,8 @@ void gether_disconnect(struct gether *link)
list_del(&req->list);
spin_unlock(&dev->req_lock);
+ if (link->multi_pkt_xfer)
+ kfree(req->buf);
usb_ep_free_request(link->in_ep, req);
spin_lock(&dev->req_lock);
}
@@ -1154,6 +1330,12 @@ void gether_disconnect(struct gether *link)
spin_lock(&dev->req_lock);
}
spin_unlock(&dev->req_lock);
+
+ spin_lock(&dev->rx_frames.lock);
+ while ((skb = __skb_dequeue(&dev->rx_frames)))
+ dev_kfree_skb_any(skb);
+ spin_unlock(&dev->rx_frames.lock);
+
link->out_ep->driver_data = NULL;
link->out_ep->desc = NULL;
@@ -1168,5 +1350,23 @@ void gether_disconnect(struct gether *link)
}
EXPORT_SYMBOL_GPL(gether_disconnect);
-MODULE_LICENSE("GPL");
+static int __init gether_init(void)
+{
+ uether_wq = create_singlethread_workqueue("uether");
+ if (!uether_wq) {
+ pr_err("%s: Unable to create workqueue: uether\n", __func__);
+ return -ENOMEM;
+ }
+ return 0;
+}
+module_init(gether_init);
+
+static void __exit gether_exit(void)
+{
+ destroy_workqueue(uether_wq);
+
+}
+module_exit(gether_exit);
MODULE_AUTHOR("David Brownell");
+MODULE_DESCRIPTION("ethernet over USB driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/usb/gadget/function/u_ether.h b/drivers/usb/gadget/function/u_ether.h
index 334b38947916..f591f74d2705 100644
--- a/drivers/usb/gadget/function/u_ether.h
+++ b/drivers/usb/gadget/function/u_ether.h
@@ -75,6 +75,9 @@ struct gether {
bool is_fixed;
u32 fixed_out_len;
u32 fixed_in_len;
+ unsigned ul_max_pkts_per_xfer;
+ unsigned dl_max_pkts_per_xfer;
+ bool multi_pkt_xfer;
bool supports_multi_frame;
struct sk_buff *(*wrap)(struct gether *port,
struct sk_buff *skb);
diff --git a/drivers/usb/gadget/function/u_midi.h b/drivers/usb/gadget/function/u_midi.h
new file mode 100644
index 000000000000..22510189758e
--- /dev/null
+++ b/drivers/usb/gadget/function/u_midi.h
@@ -0,0 +1,40 @@
+/*
+ * u_midi.h
+ *
+ * Utility definitions for the midi function
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com
+ *
+ * Author: Andrzej Pietrasiewicz <andrzej.p@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef U_MIDI_H
+#define U_MIDI_H
+
+#include <linux/usb/composite.h>
+
+struct f_midi_opts {
+ struct usb_function_instance func_inst;
+ int index;
+ char *id;
+ bool id_allocated;
+ unsigned int in_ports;
+ unsigned int out_ports;
+ unsigned int buflen;
+ unsigned int qlen;
+
+ /*
+ * Protect the data form concurrent access by read/write
+ * and create symlink/remove symlink.
+ */
+ struct mutex lock;
+ int refcnt;
+};
+
+#endif /* U_MIDI_H */
+
diff --git a/drivers/usb/gadget/function/u_serial.c b/drivers/usb/gadget/function/u_serial.c
index 491082aaf103..d9b867a4f982 100644
--- a/drivers/usb/gadget/function/u_serial.c
+++ b/drivers/usb/gadget/function/u_serial.c
@@ -1122,6 +1122,7 @@ int gserial_alloc_line(unsigned char *line_num)
tty_dev = tty_port_register_device(&ports[port_num].port->port,
gs_tty_driver, port_num, NULL);
+
if (IS_ERR(tty_dev)) {
struct gs_port *port;
pr_err("%s: failed to register tty for port %d, err %ld\n",
diff --git a/drivers/usb/gadget/legacy/Kconfig b/drivers/usb/gadget/legacy/Kconfig
index 24392d269709..e60063bab872 100644
--- a/drivers/usb/gadget/legacy/Kconfig
+++ b/drivers/usb/gadget/legacy/Kconfig
@@ -287,6 +287,7 @@ config USB_MIDI_GADGET
depends on SND
select USB_LIBCOMPOSITE
select SND_RAWMIDI
+ select USB_F_MIDI
help
The MIDI Gadget acts as a USB Audio device, with one MIDI
input and one MIDI output. These MIDI jacks appear as
@@ -313,6 +314,26 @@ config USB_G_PRINTER
For more information, see Documentation/usb/gadget_printer.txt
which includes sample code for accessing the device file.
+config USB_G_ANDROID
+ boolean "Android Composite Gadget"
+ select USB_F_ACM
+ select USB_LIBCOMPOSITE
+ select USB_U_SERIAL
+ select USB_F_MASS_STORAGE
+ select SND_PCM
+ help
+ The Android Composite Gadget supports multiple USB
+ functions: adb, acm, mass storage, mtp, accessory
+ and rndis.
+ Each function can be configured and enabled/disabled
+ dynamically from userspace through a sysfs interface.
+
+config USB_ANDROID_RNDIS_DWORD_ALIGNED
+ boolean "Use double word aligned"
+ depends on USB_G_ANDROID
+ help
+ Provides dword aligned for DMA controller.
+
if TTY
config USB_CDC_COMPOSITE
diff --git a/drivers/usb/gadget/legacy/Makefile b/drivers/usb/gadget/legacy/Makefile
index 7f485f25705e..f7c7a59d2059 100644
--- a/drivers/usb/gadget/legacy/Makefile
+++ b/drivers/usb/gadget/legacy/Makefile
@@ -22,6 +22,7 @@ g_nokia-y := nokia.o
g_webcam-y := webcam.o
g_ncm-y := ncm.o
g_acm_ms-y := acm_ms.o
+g_android-y := android.o
g_tcm_usb_gadget-y := tcm_usb_gadget.o
obj-$(CONFIG_USB_ZERO) += g_zero.o
@@ -41,4 +42,5 @@ obj-$(CONFIG_USB_G_NOKIA) += g_nokia.o
obj-$(CONFIG_USB_G_WEBCAM) += g_webcam.o
obj-$(CONFIG_USB_G_NCM) += g_ncm.o
obj-$(CONFIG_USB_G_ACM_MS) += g_acm_ms.o
+obj-$(CONFIG_USB_G_ANDROID) += g_android.o
obj-$(CONFIG_USB_GADGET_TARGET) += tcm_usb_gadget.o
diff --git a/drivers/usb/gadget/legacy/android.c b/drivers/usb/gadget/legacy/android.c
new file mode 100644
index 000000000000..1d38c8da9142
--- /dev/null
+++ b/drivers/usb/gadget/legacy/android.c
@@ -0,0 +1,1568 @@
+/*
+ * Gadget Driver for Android
+ *
+ * Copyright (C) 2008 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ * Benoit Goby <benoit@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/utsname.h>
+#include <linux/platform_device.h>
+
+#include <linux/usb/ch9.h>
+#include <linux/usb/composite.h>
+#include <linux/usb/gadget.h>
+
+#include "gadget_chips.h"
+
+#include "f_fs.c"
+#include "f_audio_source.c"
+#include "f_mtp.c"
+#include "f_accessory.c"
+#define USB_ETH_RNDIS y
+#include "f_rndis.c"
+#include "rndis.c"
+#include "u_ether.c"
+
+USB_ETHERNET_MODULE_PARAMETERS();
+
+MODULE_AUTHOR("Mike Lockwood");
+MODULE_DESCRIPTION("Android Composite USB Driver");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("1.0");
+
+static const char longname[] = "Gadget Android";
+
+/* Default vendor and product IDs, overridden by userspace */
+#define VENDOR_ID 0x18D1
+#define PRODUCT_ID 0x0001
+
+struct android_usb_function {
+ char *name;
+ void *config;
+
+ struct device *dev;
+ char *dev_name;
+ struct device_attribute **attributes;
+
+ /* for android_dev.enabled_functions */
+ struct list_head enabled_list;
+
+ /* Optional: initialization during gadget bind */
+ int (*init)(struct android_usb_function *, struct usb_composite_dev *);
+ /* Optional: cleanup during gadget unbind */
+ void (*cleanup)(struct android_usb_function *);
+ /* Optional: called when the function is added the list of
+ * enabled functions */
+ void (*enable)(struct android_usb_function *);
+ /* Optional: called when it is removed */
+ void (*disable)(struct android_usb_function *);
+
+ int (*bind_config)(struct android_usb_function *,
+ struct usb_configuration *);
+
+ /* Optional: called when the configuration is removed */
+ void (*unbind_config)(struct android_usb_function *,
+ struct usb_configuration *);
+ /* Optional: handle ctrl requests before the device is configured */
+ int (*ctrlrequest)(struct android_usb_function *,
+ struct usb_composite_dev *,
+ const struct usb_ctrlrequest *);
+};
+
+struct android_dev {
+ struct android_usb_function **functions;
+ struct list_head enabled_functions;
+ struct usb_composite_dev *cdev;
+ struct device *dev;
+
+ void (*setup_complete)(struct usb_ep *ep,
+ struct usb_request *req);
+
+ bool enabled;
+ int disable_depth;
+ struct mutex mutex;
+ bool connected;
+ bool sw_connected;
+ struct work_struct work;
+ char ffs_aliases[256];
+};
+
+static struct class *android_class;
+static struct android_dev *_android_dev;
+static int android_bind_config(struct usb_configuration *c);
+static void android_unbind_config(struct usb_configuration *c);
+
+/* string IDs are assigned dynamically */
+#define STRING_MANUFACTURER_IDX 0
+#define STRING_PRODUCT_IDX 1
+#define STRING_SERIAL_IDX 2
+
+static char manufacturer_string[256];
+static char product_string[256];
+static char serial_string[256];
+
+/*---Copied from configfs.c to let this composite driver build---*/
+static struct class *android_class;
+static struct device *android_device;
+static int index;
+
+struct device *create_function_device(char *name)
+{
+ if (android_device && !IS_ERR(android_device))
+ return device_create(android_class, android_device,
+ MKDEV(0, index++), NULL, name);
+ else
+ return ERR_PTR(-EINVAL);
+}
+EXPORT_SYMBOL_GPL(create_function_device);
+/*---------------------------------------------------------------*/
+
+/* String Table */
+static struct usb_string strings_dev[] = {
+ [STRING_MANUFACTURER_IDX].s = manufacturer_string,
+ [STRING_PRODUCT_IDX].s = product_string,
+ [STRING_SERIAL_IDX].s = serial_string,
+ { } /* end of list */
+};
+
+static struct usb_gadget_strings stringtab_dev = {
+ .language = 0x0409, /* en-us */
+ .strings = strings_dev,
+};
+
+static struct usb_gadget_strings *dev_strings[] = {
+ &stringtab_dev,
+ NULL,
+};
+
+static struct usb_device_descriptor device_desc = {
+ .bLength = sizeof(device_desc),
+ .bDescriptorType = USB_DT_DEVICE,
+ .bcdUSB = __constant_cpu_to_le16(0x0200),
+ .bDeviceClass = USB_CLASS_PER_INTERFACE,
+ .idVendor = __constant_cpu_to_le16(VENDOR_ID),
+ .idProduct = __constant_cpu_to_le16(PRODUCT_ID),
+ .bcdDevice = __constant_cpu_to_le16(0xffff),
+ .bNumConfigurations = 1,
+};
+
+static struct usb_configuration android_config_driver = {
+ .label = "android",
+ .unbind = android_unbind_config,
+ .bConfigurationValue = 1,
+ .bmAttributes = USB_CONFIG_ATT_ONE | USB_CONFIG_ATT_SELFPOWER,
+ .MaxPower = 500, /* 500ma */
+};
+
+static void android_work(struct work_struct *data)
+{
+ struct android_dev *dev = container_of(data, struct android_dev, work);
+ struct usb_composite_dev *cdev = dev->cdev;
+ char *disconnected[2] = { "USB_STATE=DISCONNECTED", NULL };
+ char *connected[2] = { "USB_STATE=CONNECTED", NULL };
+ char *configured[2] = { "USB_STATE=CONFIGURED", NULL };
+ char **uevent_envp = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cdev->lock, flags);
+ if (cdev->config)
+ uevent_envp = configured;
+ else if (dev->connected != dev->sw_connected)
+ uevent_envp = dev->connected ? connected : disconnected;
+ dev->sw_connected = dev->connected;
+ spin_unlock_irqrestore(&cdev->lock, flags);
+
+ if (uevent_envp) {
+ kobject_uevent_env(&dev->dev->kobj, KOBJ_CHANGE, uevent_envp);
+ pr_info("%s: sent uevent %s\n", __func__, uevent_envp[0]);
+ } else {
+ pr_info("%s: did not send uevent (%d %d %p)\n", __func__,
+ dev->connected, dev->sw_connected, cdev->config);
+ }
+}
+
+static void android_enable(struct android_dev *dev)
+{
+ struct usb_composite_dev *cdev = dev->cdev;
+
+ if (WARN_ON(!dev->disable_depth))
+ return;
+
+ if (--dev->disable_depth == 0) {
+ usb_add_config(cdev, &android_config_driver,
+ android_bind_config);
+ usb_gadget_connect(cdev->gadget);
+ }
+}
+
+static void android_disable(struct android_dev *dev)
+{
+ struct usb_composite_dev *cdev = dev->cdev;
+
+ if (dev->disable_depth++ == 0) {
+ usb_gadget_disconnect(cdev->gadget);
+ /* Cancel pending control requests */
+ usb_ep_dequeue(cdev->gadget->ep0, cdev->req);
+ usb_remove_config(cdev, &android_config_driver);
+ }
+}
+
+/*-------------------------------------------------------------------------*/
+/* Supported functions initialization */
+
+struct functionfs_config {
+ bool opened;
+ bool enabled;
+ struct ffs_data *data;
+};
+
+static int ffs_function_init(struct android_usb_function *f,
+ struct usb_composite_dev *cdev)
+{
+ f->config = kzalloc(sizeof(struct functionfs_config), GFP_KERNEL);
+ if (!f->config)
+ return -ENOMEM;
+
+ return functionfs_init();
+}
+
+static void ffs_function_cleanup(struct android_usb_function *f)
+{
+ functionfs_cleanup();
+ kfree(f->config);
+}
+
+static void ffs_function_enable(struct android_usb_function *f)
+{
+ struct android_dev *dev = _android_dev;
+ struct functionfs_config *config = f->config;
+
+ config->enabled = true;
+
+ /* Disable the gadget until the function is ready */
+ if (!config->opened)
+ android_disable(dev);
+}
+
+static void ffs_function_disable(struct android_usb_function *f)
+{
+ struct android_dev *dev = _android_dev;
+ struct functionfs_config *config = f->config;
+
+ config->enabled = false;
+
+ /* Balance the disable that was called in closed_callback */
+ if (!config->opened)
+ android_enable(dev);
+}
+
+static int ffs_function_bind_config(struct android_usb_function *f,
+ struct usb_configuration *c)
+{
+ struct functionfs_config *config = f->config;
+ return functionfs_bind_config(c->cdev, c, config->data);
+}
+
+static ssize_t
+ffs_aliases_show(struct device *pdev, struct device_attribute *attr, char *buf)
+{
+ struct android_dev *dev = _android_dev;
+ int ret;
+
+ mutex_lock(&dev->mutex);
+ ret = sprintf(buf, "%s\n", dev->ffs_aliases);
+ mutex_unlock(&dev->mutex);
+
+ return ret;
+}
+
+static ssize_t
+ffs_aliases_store(struct device *pdev, struct device_attribute *attr,
+ const char *buf, size_t size)
+{
+ struct android_dev *dev = _android_dev;
+ char buff[256];
+
+ mutex_lock(&dev->mutex);
+
+ if (dev->enabled) {
+ mutex_unlock(&dev->mutex);
+ return -EBUSY;
+ }
+
+ strlcpy(buff, buf, sizeof(buff));
+ strlcpy(dev->ffs_aliases, strim(buff), sizeof(dev->ffs_aliases));
+
+ mutex_unlock(&dev->mutex);
+
+ return size;
+}
+
+static DEVICE_ATTR(aliases, S_IRUGO | S_IWUSR, ffs_aliases_show,
+ ffs_aliases_store);
+static struct device_attribute *ffs_function_attributes[] = {
+ &dev_attr_aliases,
+ NULL
+};
+
+static struct android_usb_function ffs_function = {
+ .name = "ffs",
+ .init = ffs_function_init,
+ .enable = ffs_function_enable,
+ .disable = ffs_function_disable,
+ .cleanup = ffs_function_cleanup,
+ .bind_config = ffs_function_bind_config,
+ .attributes = ffs_function_attributes,
+};
+
+static int functionfs_ready_callback(struct ffs_data *ffs)
+{
+ struct android_dev *dev = _android_dev;
+ struct functionfs_config *config = ffs_function.config;
+ int ret = 0;
+
+ mutex_lock(&dev->mutex);
+
+ ret = functionfs_bind(ffs, dev->cdev);
+ if (ret)
+ goto err;
+
+ config->data = ffs;
+ config->opened = true;
+
+ if (config->enabled)
+ android_enable(dev);
+
+err:
+ mutex_unlock(&dev->mutex);
+ return ret;
+}
+
+static void functionfs_closed_callback(struct ffs_data *ffs)
+{
+ struct android_dev *dev = _android_dev;
+ struct functionfs_config *config = ffs_function.config;
+
+ mutex_lock(&dev->mutex);
+
+ if (config->enabled)
+ android_disable(dev);
+
+ config->opened = false;
+ config->data = NULL;
+
+ functionfs_unbind(ffs);
+
+ mutex_unlock(&dev->mutex);
+}
+
+static void *functionfs_acquire_dev_callback(const char *dev_name)
+{
+ return 0;
+}
+
+static void functionfs_release_dev_callback(struct ffs_data *ffs_data)
+{
+}
+
+#define MAX_ACM_INSTANCES 4
+struct acm_function_config {
+ int instances;
+ int instances_on;
+ struct usb_function *f_acm[MAX_ACM_INSTANCES];
+ struct usb_function_instance *f_acm_inst[MAX_ACM_INSTANCES];
+};
+
+static int
+acm_function_init(struct android_usb_function *f,
+ struct usb_composite_dev *cdev)
+{
+ int i;
+ int ret;
+ struct acm_function_config *config;
+
+ config = kzalloc(sizeof(struct acm_function_config), GFP_KERNEL);
+ if (!config)
+ return -ENOMEM;
+ f->config = config;
+
+ for (i = 0; i < MAX_ACM_INSTANCES; i++) {
+ config->f_acm_inst[i] = usb_get_function_instance("acm");
+ if (IS_ERR(config->f_acm_inst[i])) {
+ ret = PTR_ERR(config->f_acm_inst[i]);
+ goto err_usb_get_function_instance;
+ }
+ config->f_acm[i] = usb_get_function(config->f_acm_inst[i]);
+ if (IS_ERR(config->f_acm[i])) {
+ ret = PTR_ERR(config->f_acm[i]);
+ goto err_usb_get_function;
+ }
+ }
+ return 0;
+err_usb_get_function_instance:
+ while (i-- > 0) {
+ usb_put_function(config->f_acm[i]);
+err_usb_get_function:
+ usb_put_function_instance(config->f_acm_inst[i]);
+ }
+ return ret;
+}
+
+static void acm_function_cleanup(struct android_usb_function *f)
+{
+ int i;
+ struct acm_function_config *config = f->config;
+
+ for (i = 0; i < MAX_ACM_INSTANCES; i++) {
+ usb_put_function(config->f_acm[i]);
+ usb_put_function_instance(config->f_acm_inst[i]);
+ }
+ kfree(f->config);
+ f->config = NULL;
+}
+
+static int
+acm_function_bind_config(struct android_usb_function *f,
+ struct usb_configuration *c)
+{
+ int i;
+ int ret = 0;
+ struct acm_function_config *config = f->config;
+
+ config->instances_on = config->instances;
+ for (i = 0; i < config->instances_on; i++) {
+ ret = usb_add_function(c, config->f_acm[i]);
+ if (ret) {
+ pr_err("Could not bind acm%u config\n", i);
+ goto err_usb_add_function;
+ }
+ }
+
+ return 0;
+
+err_usb_add_function:
+ while (i-- > 0)
+ usb_remove_function(c, config->f_acm[i]);
+ return ret;
+}
+
+static void acm_function_unbind_config(struct android_usb_function *f,
+ struct usb_configuration *c)
+{
+ int i;
+ struct acm_function_config *config = f->config;
+
+ for (i = 0; i < config->instances_on; i++)
+ usb_remove_function(c, config->f_acm[i]);
+}
+
+static ssize_t acm_instances_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct android_usb_function *f = dev_get_drvdata(dev);
+ struct acm_function_config *config = f->config;
+ return sprintf(buf, "%d\n", config->instances);
+}
+
+static ssize_t acm_instances_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t size)
+{
+ struct android_usb_function *f = dev_get_drvdata(dev);
+ struct acm_function_config *config = f->config;
+ int value;
+
+ sscanf(buf, "%d", &value);
+ if (value > MAX_ACM_INSTANCES)
+ value = MAX_ACM_INSTANCES;
+ config->instances = value;
+ return size;
+}
+
+static DEVICE_ATTR(instances, S_IRUGO | S_IWUSR, acm_instances_show,
+ acm_instances_store);
+static struct device_attribute *acm_function_attributes[] = {
+ &dev_attr_instances,
+ NULL
+};
+
+static struct android_usb_function acm_function = {
+ .name = "acm",
+ .init = acm_function_init,
+ .cleanup = acm_function_cleanup,
+ .bind_config = acm_function_bind_config,
+ .unbind_config = acm_function_unbind_config,
+ .attributes = acm_function_attributes,
+};
+
+
+static int
+mtp_function_init(struct android_usb_function *f,
+ struct usb_composite_dev *cdev)
+{
+ return mtp_setup();
+}
+
+static void mtp_function_cleanup(struct android_usb_function *f)
+{
+ mtp_cleanup();
+}
+
+static int
+mtp_function_bind_config(struct android_usb_function *f,
+ struct usb_configuration *c)
+{
+ return mtp_bind_config(c, false);
+}
+
+static int
+ptp_function_init(struct android_usb_function *f,
+ struct usb_composite_dev *cdev)
+{
+ /* nothing to do - initialization is handled by mtp_function_init */
+ return 0;
+}
+
+static void ptp_function_cleanup(struct android_usb_function *f)
+{
+ /* nothing to do - cleanup is handled by mtp_function_cleanup */
+}
+
+static int
+ptp_function_bind_config(struct android_usb_function *f,
+ struct usb_configuration *c)
+{
+ return mtp_bind_config(c, true);
+}
+
+static int mtp_function_ctrlrequest(struct android_usb_function *f,
+ struct usb_composite_dev *cdev,
+ const struct usb_ctrlrequest *c)
+{
+ return mtp_ctrlrequest(cdev, c);
+}
+
+static struct android_usb_function mtp_function = {
+ .name = "mtp",
+ .init = mtp_function_init,
+ .cleanup = mtp_function_cleanup,
+ .bind_config = mtp_function_bind_config,
+ .ctrlrequest = mtp_function_ctrlrequest,
+};
+
+/* PTP function is same as MTP with slightly different interface descriptor */
+static struct android_usb_function ptp_function = {
+ .name = "ptp",
+ .init = ptp_function_init,
+ .cleanup = ptp_function_cleanup,
+ .bind_config = ptp_function_bind_config,
+};
+
+
+struct rndis_function_config {
+ u8 ethaddr[ETH_ALEN];
+ u32 vendorID;
+ char manufacturer[256];
+ /* "Wireless" RNDIS; auto-detected by Windows */
+ bool wceis;
+ struct eth_dev *dev;
+};
+
+static int
+rndis_function_init(struct android_usb_function *f,
+ struct usb_composite_dev *cdev)
+{
+ f->config = kzalloc(sizeof(struct rndis_function_config), GFP_KERNEL);
+ if (!f->config)
+ return -ENOMEM;
+ return 0;
+}
+
+static void rndis_function_cleanup(struct android_usb_function *f)
+{
+ kfree(f->config);
+ f->config = NULL;
+}
+
+static int
+rndis_function_bind_config(struct android_usb_function *f,
+ struct usb_configuration *c)
+{
+ int ret;
+ struct eth_dev *dev;
+ struct rndis_function_config *rndis = f->config;
+
+ if (!rndis) {
+ pr_err("%s: rndis_pdata\n", __func__);
+ return -1;
+ }
+
+ pr_info("%s MAC: %02X:%02X:%02X:%02X:%02X:%02X\n", __func__,
+ rndis->ethaddr[0], rndis->ethaddr[1], rndis->ethaddr[2],
+ rndis->ethaddr[3], rndis->ethaddr[4], rndis->ethaddr[5]);
+
+ dev = gether_setup_name(c->cdev->gadget,dev_addr, host_addr, rndis->ethaddr, qmult, "rndis");
+ if (IS_ERR(dev)) {
+ ret = PTR_ERR(dev);
+ pr_err("%s: gether_setup failed\n", __func__);
+ return ret;
+ }
+ rndis->dev = dev;
+
+ if (rndis->wceis) {
+ /* "Wireless" RNDIS; auto-detected by Windows */
+ rndis_iad_descriptor.bFunctionClass =
+ USB_CLASS_WIRELESS_CONTROLLER;
+ rndis_iad_descriptor.bFunctionSubClass = 0x01;
+ rndis_iad_descriptor.bFunctionProtocol = 0x03;
+ rndis_control_intf.bInterfaceClass =
+ USB_CLASS_WIRELESS_CONTROLLER;
+ rndis_control_intf.bInterfaceSubClass = 0x01;
+ rndis_control_intf.bInterfaceProtocol = 0x03;
+ }
+
+ return rndis_bind_config_vendor(c, rndis->ethaddr, rndis->vendorID,
+ rndis->manufacturer, rndis->dev);
+}
+
+static void rndis_function_unbind_config(struct android_usb_function *f,
+ struct usb_configuration *c)
+{
+ struct rndis_function_config *rndis = f->config;
+ gether_cleanup(rndis->dev);
+}
+
+static ssize_t rndis_manufacturer_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct android_usb_function *f = dev_get_drvdata(dev);
+ struct rndis_function_config *config = f->config;
+ return sprintf(buf, "%s\n", config->manufacturer);
+}
+
+static ssize_t rndis_manufacturer_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t size)
+{
+ struct android_usb_function *f = dev_get_drvdata(dev);
+ struct rndis_function_config *config = f->config;
+
+ if (size >= sizeof(config->manufacturer))
+ return -EINVAL;
+ if (sscanf(buf, "%s", config->manufacturer) == 1)
+ return size;
+ return -1;
+}
+
+static DEVICE_ATTR(manufacturer, S_IRUGO | S_IWUSR, rndis_manufacturer_show,
+ rndis_manufacturer_store);
+
+static ssize_t rndis_wceis_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct android_usb_function *f = dev_get_drvdata(dev);
+ struct rndis_function_config *config = f->config;
+ return sprintf(buf, "%d\n", config->wceis);
+}
+
+static ssize_t rndis_wceis_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t size)
+{
+ struct android_usb_function *f = dev_get_drvdata(dev);
+ struct rndis_function_config *config = f->config;
+ int value;
+
+ if (sscanf(buf, "%d", &value) == 1) {
+ config->wceis = value;
+ return size;
+ }
+ return -EINVAL;
+}
+
+static DEVICE_ATTR(wceis, S_IRUGO | S_IWUSR, rndis_wceis_show,
+ rndis_wceis_store);
+
+static ssize_t rndis_ethaddr_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct android_usb_function *f = dev_get_drvdata(dev);
+ struct rndis_function_config *rndis = f->config;
+ return sprintf(buf, "%02x:%02x:%02x:%02x:%02x:%02x\n",
+ rndis->ethaddr[0], rndis->ethaddr[1], rndis->ethaddr[2],
+ rndis->ethaddr[3], rndis->ethaddr[4], rndis->ethaddr[5]);
+}
+
+static ssize_t rndis_ethaddr_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t size)
+{
+ struct android_usb_function *f = dev_get_drvdata(dev);
+ struct rndis_function_config *rndis = f->config;
+
+ if (sscanf(buf, "%02x:%02x:%02x:%02x:%02x:%02x\n",
+ (int *)&rndis->ethaddr[0], (int *)&rndis->ethaddr[1],
+ (int *)&rndis->ethaddr[2], (int *)&rndis->ethaddr[3],
+ (int *)&rndis->ethaddr[4], (int *)&rndis->ethaddr[5]) == 6)
+ return size;
+ return -EINVAL;
+}
+
+static DEVICE_ATTR(ethaddr, S_IRUGO | S_IWUSR, rndis_ethaddr_show,
+ rndis_ethaddr_store);
+
+static ssize_t rndis_vendorID_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct android_usb_function *f = dev_get_drvdata(dev);
+ struct rndis_function_config *config = f->config;
+ return sprintf(buf, "%04x\n", config->vendorID);
+}
+
+static ssize_t rndis_vendorID_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t size)
+{
+ struct android_usb_function *f = dev_get_drvdata(dev);
+ struct rndis_function_config *config = f->config;
+ int value;
+
+ if (sscanf(buf, "%04x", &value) == 1) {
+ config->vendorID = value;
+ return size;
+ }
+ return -EINVAL;
+}
+
+static DEVICE_ATTR(vendorID, S_IRUGO | S_IWUSR, rndis_vendorID_show,
+ rndis_vendorID_store);
+
+static struct device_attribute *rndis_function_attributes[] = {
+ &dev_attr_manufacturer,
+ &dev_attr_wceis,
+ &dev_attr_ethaddr,
+ &dev_attr_vendorID,
+ NULL
+};
+
+static struct android_usb_function rndis_function = {
+ .name = "rndis",
+ .init = rndis_function_init,
+ .cleanup = rndis_function_cleanup,
+ .bind_config = rndis_function_bind_config,
+ .unbind_config = rndis_function_unbind_config,
+ .attributes = rndis_function_attributes,
+};
+
+
+#define MAX_MS_INSTANCES 1
+struct mass_storage_function_config {
+ int instances;
+ int instances_on;
+ struct usb_function *f_ms[MAX_MS_INSTANCES];
+ struct usb_function_instance *f_ms_inst[MAX_MS_INSTANCES];
+};
+
+static int mass_storage_function_init(struct android_usb_function *f,
+ struct usb_composite_dev *cdev)
+{
+ struct mass_storage_function_config *config;
+ int i;
+ int ret;
+
+ config = kzalloc(sizeof(struct mass_storage_function_config),
+ GFP_KERNEL);
+ if (!config)
+ return -ENOMEM;
+ f->config = config;
+
+ for (i = 0; i < MAX_MS_INSTANCES; i++) {
+ config->f_ms_inst[i] = usb_get_function_instance("mass_storage");
+ if (IS_ERR(config->f_ms_inst[i])) {
+ ret = PTR_ERR(config->f_ms_inst[i]);
+ goto err_usb_get_function_instance;
+ }
+ config->f_ms[i] = usb_get_function(config->f_ms_inst[i]);
+ if (IS_ERR(config->f_ms[i])) {
+ ret = PTR_ERR(config->f_ms[i]);
+ goto err_usb_get_function;
+ }
+ }
+
+ return 0;
+err_usb_get_function_instance:
+ while (i-- > 0) {
+ usb_put_function(config->f_ms[i]);
+err_usb_get_function:
+ usb_put_function_instance(config->f_ms_inst[i]);
+ }
+ return ret;
+}
+
+static void mass_storage_function_cleanup(struct android_usb_function *f)
+{
+ struct mass_storage_function_config *config = f->config;
+ int i;
+
+ for (i = 0; i < MAX_MS_INSTANCES; i++) {
+ usb_put_function(config->f_ms[i]);
+ usb_put_function_instance(config->f_ms_inst[i]);
+ }
+ kfree(f->config);
+ f->config = NULL;
+}
+
+static int mass_storage_function_bind_config(struct android_usb_function *f,
+ struct usb_configuration *c)
+{
+ struct mass_storage_function_config *config = f->config;
+ int ret = 0;
+ int i;
+
+ config->instances_on = config->instances;
+ for (i = 0; i < config->instances_on; i++) {
+ ret = usb_add_function(c, config->f_ms[i]);
+ if (ret) {
+ pr_err("Could not bind ms%u config\n", i);
+ goto err_usb_add_function;
+ }
+ }
+
+ return 0;
+
+err_usb_add_function:
+ while (i-- > 0)
+ usb_remove_function(c, config->f_ms[i]);
+ return ret;
+}
+
+static void mass_storage_function_unbind_config(struct android_usb_function *f,
+ struct usb_configuration *c)
+{
+ int i;
+ struct mass_storage_function_config *config = f->config;
+
+ for (i = 0; i < config->instances_on; i++)
+ usb_remove_function(c, config->f_ms[i]);
+}
+
+static ssize_t mass_storage_inquiry_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct android_usb_function *f = dev_get_drvdata(dev);
+ struct mass_storage_function_config *config = f->config;
+ return sprintf(buf, "%d\n", config->instances);
+}
+
+static ssize_t mass_storage_inquiry_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t size)
+{
+ struct android_usb_function *f = dev_get_drvdata(dev);
+ struct mass_storage_function_config *config = f->config;
+ int value;
+
+ sscanf(buf, "%d", &value);
+ if (value > MAX_MS_INSTANCES)
+ value = MAX_MS_INSTANCES;
+ config->instances = value;
+ return size;
+}
+
+static DEVICE_ATTR(inquiry_string, S_IRUGO | S_IWUSR,
+ mass_storage_inquiry_show,
+ mass_storage_inquiry_store);
+
+static struct device_attribute *mass_storage_function_attributes[] = {
+ &dev_attr_inquiry_string,
+ NULL
+};
+
+static struct android_usb_function mass_storage_function = {
+ .name = "mass_storage",
+ .init = mass_storage_function_init,
+ .cleanup = mass_storage_function_cleanup,
+ .bind_config = mass_storage_function_bind_config,
+ .attributes = mass_storage_function_attributes,
+};
+
+
+static int accessory_function_init(struct android_usb_function *f,
+ struct usb_composite_dev *cdev)
+{
+ return acc_setup();
+}
+
+static void accessory_function_cleanup(struct android_usb_function *f)
+{
+ acc_cleanup();
+}
+
+static int accessory_function_bind_config(struct android_usb_function *f,
+ struct usb_configuration *c)
+{
+ return acc_bind_config(c);
+}
+
+static int accessory_function_ctrlrequest(struct android_usb_function *f,
+ struct usb_composite_dev *cdev,
+ const struct usb_ctrlrequest *c)
+{
+ return acc_ctrlrequest(cdev, c);
+}
+
+static struct android_usb_function accessory_function = {
+ .name = "accessory",
+ .init = accessory_function_init,
+ .cleanup = accessory_function_cleanup,
+ .bind_config = accessory_function_bind_config,
+ .ctrlrequest = accessory_function_ctrlrequest,
+};
+
+static int audio_source_function_init(struct android_usb_function *f,
+ struct usb_composite_dev *cdev)
+{
+ struct audio_source_config *config;
+
+ config = kzalloc(sizeof(struct audio_source_config), GFP_KERNEL);
+ if (!config)
+ return -ENOMEM;
+ config->card = -1;
+ config->device = -1;
+ f->config = config;
+ return 0;
+}
+
+static void audio_source_function_cleanup(struct android_usb_function *f)
+{
+ kfree(f->config);
+}
+
+static int audio_source_function_bind_config(struct android_usb_function *f,
+ struct usb_configuration *c)
+{
+ struct audio_source_config *config = f->config;
+
+ return audio_source_bind_config(c, config);
+}
+
+static void audio_source_function_unbind_config(struct android_usb_function *f,
+ struct usb_configuration *c)
+{
+ struct audio_source_config *config = f->config;
+
+ config->card = -1;
+ config->device = -1;
+}
+
+static struct android_usb_function audio_source_function = {
+ .name = "audio_source",
+ .init = audio_source_function_init,
+ .cleanup = audio_source_function_cleanup,
+ .bind_config = audio_source_function_bind_config,
+ .unbind_config = audio_source_function_unbind_config,
+ .attributes = audio_source_function_attributes,
+};
+
+static struct android_usb_function *supported_functions[] = {
+ &ffs_function,
+ &acm_function,
+ &mtp_function,
+ &ptp_function,
+ &rndis_function,
+ &mass_storage_function,
+ &accessory_function,
+ &audio_source_function,
+ NULL
+};
+
+
+static int android_init_functions(struct android_usb_function **functions,
+ struct usb_composite_dev *cdev)
+{
+ struct android_dev *dev = _android_dev;
+ struct android_usb_function *f;
+ struct device_attribute **attrs;
+ struct device_attribute *attr;
+ int err;
+ int index = 0;
+
+ for (; (f = *functions++); index++) {
+ f->dev_name = kasprintf(GFP_KERNEL, "f_%s", f->name);
+ f->dev = device_create(android_class, dev->dev,
+ MKDEV(0, index), f, f->dev_name);
+ if (IS_ERR(f->dev)) {
+ pr_err("%s: Failed to create dev %s", __func__,
+ f->dev_name);
+ err = PTR_ERR(f->dev);
+ goto err_create;
+ }
+
+ if (f->init) {
+ err = f->init(f, cdev);
+ if (err) {
+ pr_err("%s: Failed to init %s", __func__,
+ f->name);
+ goto err_out;
+ }
+ }
+
+ attrs = f->attributes;
+ if (attrs) {
+ while ((attr = *attrs++) && !err)
+ err = device_create_file(f->dev, attr);
+ }
+ if (err) {
+ pr_err("%s: Failed to create function %s attributes",
+ __func__, f->name);
+ goto err_out;
+ }
+ }
+ return 0;
+
+err_out:
+ device_destroy(android_class, f->dev->devt);
+err_create:
+ kfree(f->dev_name);
+ return err;
+}
+
+static void android_cleanup_functions(struct android_usb_function **functions)
+{
+ struct android_usb_function *f;
+
+ while (*functions) {
+ f = *functions++;
+
+ if (f->dev) {
+ device_destroy(android_class, f->dev->devt);
+ kfree(f->dev_name);
+ }
+
+ if (f->cleanup)
+ f->cleanup(f);
+ }
+}
+
+static int
+android_bind_enabled_functions(struct android_dev *dev,
+ struct usb_configuration *c)
+{
+ struct android_usb_function *f;
+ int ret;
+
+ list_for_each_entry(f, &dev->enabled_functions, enabled_list) {
+ ret = f->bind_config(f, c);
+ if (ret) {
+ pr_err("%s: %s failed", __func__, f->name);
+ return ret;
+ }
+ }
+ return 0;
+}
+
+static void
+android_unbind_enabled_functions(struct android_dev *dev,
+ struct usb_configuration *c)
+{
+ struct android_usb_function *f;
+
+ list_for_each_entry(f, &dev->enabled_functions, enabled_list) {
+ if (f->unbind_config)
+ f->unbind_config(f, c);
+ }
+}
+
+static int android_enable_function(struct android_dev *dev, char *name)
+{
+ struct android_usb_function **functions = dev->functions;
+ struct android_usb_function *f;
+ while ((f = *functions++)) {
+ if (!strcmp(name, f->name)) {
+ list_add_tail(&f->enabled_list,
+ &dev->enabled_functions);
+ return 0;
+ }
+ }
+ return -EINVAL;
+}
+
+/*-------------------------------------------------------------------------*/
+/* /sys/class/android_usb/android%d/ interface */
+
+static ssize_t
+functions_show(struct device *pdev, struct device_attribute *attr, char *buf)
+{
+ struct android_dev *dev = dev_get_drvdata(pdev);
+ struct android_usb_function *f;
+ char *buff = buf;
+
+ mutex_lock(&dev->mutex);
+
+ list_for_each_entry(f, &dev->enabled_functions, enabled_list)
+ buff += sprintf(buff, "%s,", f->name);
+
+ mutex_unlock(&dev->mutex);
+
+ if (buff != buf)
+ *(buff-1) = '\n';
+ return buff - buf;
+}
+
+static ssize_t
+functions_store(struct device *pdev, struct device_attribute *attr,
+ const char *buff, size_t size)
+{
+ struct android_dev *dev = dev_get_drvdata(pdev);
+ char *name;
+ char buf[256], *b;
+ char aliases[256], *a;
+ int err;
+ int is_ffs;
+ int ffs_enabled = 0;
+
+ mutex_lock(&dev->mutex);
+
+ if (dev->enabled) {
+ mutex_unlock(&dev->mutex);
+ return -EBUSY;
+ }
+
+ INIT_LIST_HEAD(&dev->enabled_functions);
+
+ strlcpy(buf, buff, sizeof(buf));
+ b = strim(buf);
+
+ while (b) {
+ name = strsep(&b, ",");
+ if (!name)
+ continue;
+
+ is_ffs = 0;
+ strlcpy(aliases, dev->ffs_aliases, sizeof(aliases));
+ a = aliases;
+
+ while (a) {
+ char *alias = strsep(&a, ",");
+ if (alias && !strcmp(name, alias)) {
+ is_ffs = 1;
+ break;
+ }
+ }
+
+ if (is_ffs) {
+ if (ffs_enabled)
+ continue;
+ err = android_enable_function(dev, "ffs");
+ if (err)
+ pr_err("android_usb: Cannot enable ffs (%d)",
+ err);
+ else
+ ffs_enabled = 1;
+ continue;
+ }
+
+ err = android_enable_function(dev, name);
+ if (err)
+ pr_err("android_usb: Cannot enable '%s' (%d)",
+ name, err);
+ }
+
+ mutex_unlock(&dev->mutex);
+
+ return size;
+}
+
+static ssize_t enable_show(struct device *pdev, struct device_attribute *attr,
+ char *buf)
+{
+ struct android_dev *dev = dev_get_drvdata(pdev);
+ return sprintf(buf, "%d\n", dev->enabled);
+}
+
+static ssize_t enable_store(struct device *pdev, struct device_attribute *attr,
+ const char *buff, size_t size)
+{
+ struct android_dev *dev = dev_get_drvdata(pdev);
+ struct usb_composite_dev *cdev = dev->cdev;
+ struct android_usb_function *f;
+ int enabled = 0;
+
+
+ if (!cdev)
+ return -ENODEV;
+
+ mutex_lock(&dev->mutex);
+
+ sscanf(buff, "%d", &enabled);
+ if (enabled && !dev->enabled) {
+ cdev->next_string_id = 0;
+ /*
+ * Update values in composite driver's copy of
+ * device descriptor.
+ */
+ cdev->desc.idVendor = device_desc.idVendor;
+ cdev->desc.idProduct = device_desc.idProduct;
+ cdev->desc.bcdDevice = device_desc.bcdDevice;
+ cdev->desc.bDeviceClass = device_desc.bDeviceClass;
+ cdev->desc.bDeviceSubClass = device_desc.bDeviceSubClass;
+ cdev->desc.bDeviceProtocol = device_desc.bDeviceProtocol;
+ list_for_each_entry(f, &dev->enabled_functions, enabled_list) {
+ if (f->enable)
+ f->enable(f);
+ }
+ android_enable(dev);
+ dev->enabled = true;
+ } else if (!enabled && dev->enabled) {
+ android_disable(dev);
+ list_for_each_entry(f, &dev->enabled_functions, enabled_list) {
+ if (f->disable)
+ f->disable(f);
+ }
+ dev->enabled = false;
+ } else {
+ pr_err("android_usb: already %s\n",
+ dev->enabled ? "enabled" : "disabled");
+ }
+
+ mutex_unlock(&dev->mutex);
+ return size;
+}
+
+static ssize_t state_show(struct device *pdev, struct device_attribute *attr,
+ char *buf)
+{
+ struct android_dev *dev = dev_get_drvdata(pdev);
+ struct usb_composite_dev *cdev = dev->cdev;
+ char *state = "DISCONNECTED";
+ unsigned long flags;
+
+ if (!cdev)
+ goto out;
+
+ spin_lock_irqsave(&cdev->lock, flags);
+ if (cdev->config)
+ state = "CONFIGURED";
+ else if (dev->connected)
+ state = "CONNECTED";
+ spin_unlock_irqrestore(&cdev->lock, flags);
+out:
+ return sprintf(buf, "%s\n", state);
+}
+
+#define DESCRIPTOR_ATTR(field, format_string) \
+static ssize_t \
+field ## _show(struct device *dev, struct device_attribute *attr, \
+ char *buf) \
+{ \
+ return sprintf(buf, format_string, device_desc.field); \
+} \
+static ssize_t \
+field ## _store(struct device *dev, struct device_attribute *attr, \
+ const char *buf, size_t size) \
+{ \
+ int value; \
+ if (sscanf(buf, format_string, &value) == 1) { \
+ device_desc.field = value; \
+ return size; \
+ } \
+ return -1; \
+} \
+static DEVICE_ATTR(field, S_IRUGO | S_IWUSR, field ## _show, field ## _store);
+
+#define DESCRIPTOR_STRING_ATTR(field, buffer) \
+static ssize_t \
+field ## _show(struct device *dev, struct device_attribute *attr, \
+ char *buf) \
+{ \
+ return sprintf(buf, "%s", buffer); \
+} \
+static ssize_t \
+field ## _store(struct device *dev, struct device_attribute *attr, \
+ const char *buf, size_t size) \
+{ \
+ if (size >= sizeof(buffer)) \
+ return -EINVAL; \
+ return strlcpy(buffer, buf, sizeof(buffer)); \
+} \
+static DEVICE_ATTR(field, S_IRUGO | S_IWUSR, field ## _show, field ## _store);
+
+
+DESCRIPTOR_ATTR(idVendor, "%04x\n")
+DESCRIPTOR_ATTR(idProduct, "%04x\n")
+DESCRIPTOR_ATTR(bcdDevice, "%04x\n")
+DESCRIPTOR_ATTR(bDeviceClass, "%d\n")
+DESCRIPTOR_ATTR(bDeviceSubClass, "%d\n")
+DESCRIPTOR_ATTR(bDeviceProtocol, "%d\n")
+DESCRIPTOR_STRING_ATTR(iManufacturer, manufacturer_string)
+DESCRIPTOR_STRING_ATTR(iProduct, product_string)
+DESCRIPTOR_STRING_ATTR(iSerial, serial_string)
+
+static DEVICE_ATTR(functions, S_IRUGO | S_IWUSR, functions_show,
+ functions_store);
+static DEVICE_ATTR(enable, S_IRUGO | S_IWUSR, enable_show, enable_store);
+static DEVICE_ATTR(state, S_IRUGO, state_show, NULL);
+
+static struct device_attribute *android_usb_attributes[] = {
+ &dev_attr_idVendor,
+ &dev_attr_idProduct,
+ &dev_attr_bcdDevice,
+ &dev_attr_bDeviceClass,
+ &dev_attr_bDeviceSubClass,
+ &dev_attr_bDeviceProtocol,
+ &dev_attr_iManufacturer,
+ &dev_attr_iProduct,
+ &dev_attr_iSerial,
+ &dev_attr_functions,
+ &dev_attr_enable,
+ &dev_attr_state,
+ NULL
+};
+
+/*-------------------------------------------------------------------------*/
+/* Composite driver */
+
+static int android_bind_config(struct usb_configuration *c)
+{
+ struct android_dev *dev = _android_dev;
+ int ret = 0;
+
+ ret = android_bind_enabled_functions(dev, c);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+static void android_unbind_config(struct usb_configuration *c)
+{
+ struct android_dev *dev = _android_dev;
+
+ android_unbind_enabled_functions(dev, c);
+}
+
+static int android_bind(struct usb_composite_dev *cdev)
+{
+ struct android_dev *dev = _android_dev;
+ struct usb_gadget *gadget = cdev->gadget;
+ int id, ret;
+
+ /* Save the default handler */
+ dev->setup_complete = cdev->req->complete;
+
+ /*
+ * Start disconnected. Userspace will connect the gadget once
+ * it is done configuring the functions.
+ */
+ usb_gadget_disconnect(gadget);
+
+ ret = android_init_functions(dev->functions, cdev);
+ if (ret)
+ return ret;
+
+ /* Allocate string descriptor numbers ... note that string
+ * contents can be overridden by the composite_dev glue.
+ */
+ id = usb_string_id(cdev);
+ if (id < 0)
+ return id;
+ strings_dev[STRING_MANUFACTURER_IDX].id = id;
+ device_desc.iManufacturer = id;
+
+ id = usb_string_id(cdev);
+ if (id < 0)
+ return id;
+ strings_dev[STRING_PRODUCT_IDX].id = id;
+ device_desc.iProduct = id;
+
+ /* Default strings - should be updated by userspace */
+ strncpy(manufacturer_string, "Android", sizeof(manufacturer_string)-1);
+ strncpy(product_string, "Android", sizeof(product_string) - 1);
+ strncpy(serial_string, "0123456789ABCDEF", sizeof(serial_string) - 1);
+
+ id = usb_string_id(cdev);
+ if (id < 0)
+ return id;
+ strings_dev[STRING_SERIAL_IDX].id = id;
+ device_desc.iSerialNumber = id;
+
+ usb_gadget_set_selfpowered(gadget);
+ dev->cdev = cdev;
+
+ return 0;
+}
+
+static int android_usb_unbind(struct usb_composite_dev *cdev)
+{
+ struct android_dev *dev = _android_dev;
+
+ cancel_work_sync(&dev->work);
+ android_cleanup_functions(dev->functions);
+ return 0;
+}
+
+/* HACK: android needs to override setup for accessory to work */
+static int (*composite_setup_func)(struct usb_gadget *gadget, const struct usb_ctrlrequest *c);
+
+static int
+android_setup(struct usb_gadget *gadget, const struct usb_ctrlrequest *c)
+{
+ struct android_dev *dev = _android_dev;
+ struct usb_composite_dev *cdev = get_gadget_data(gadget);
+ struct usb_request *req = cdev->req;
+ struct android_usb_function *f;
+ int value = -EOPNOTSUPP;
+ unsigned long flags;
+
+ req->zero = 0;
+ req->length = 0;
+ req->complete = dev->setup_complete;
+ gadget->ep0->driver_data = cdev;
+
+ list_for_each_entry(f, &dev->enabled_functions, enabled_list) {
+ if (f->ctrlrequest) {
+ value = f->ctrlrequest(f, cdev, c);
+ if (value >= 0)
+ break;
+ }
+ }
+
+ /* Special case the accessory function.
+ * It needs to handle control requests before it is enabled.
+ */
+ if (value < 0)
+ value = acc_ctrlrequest(cdev, c);
+
+ if (value < 0)
+ value = composite_setup_func(gadget, c);
+
+ spin_lock_irqsave(&cdev->lock, flags);
+ if (!dev->connected) {
+ dev->connected = 1;
+ schedule_work(&dev->work);
+ } else if (c->bRequest == USB_REQ_SET_CONFIGURATION &&
+ cdev->config) {
+ schedule_work(&dev->work);
+ }
+ spin_unlock_irqrestore(&cdev->lock, flags);
+
+ return value;
+}
+
+static void android_disconnect(struct usb_composite_dev *cdev)
+{
+ struct android_dev *dev = _android_dev;
+
+ /* accessory HID support can be active while the
+ accessory function is not actually enabled,
+ so we need to inform it when we are disconnected.
+ */
+ acc_disconnect();
+
+ dev->connected = 0;
+ schedule_work(&dev->work);
+}
+
+static struct usb_composite_driver android_usb_driver = {
+ .name = "android_usb",
+ .dev = &device_desc,
+ .strings = dev_strings,
+ .bind = android_bind,
+ .unbind = android_usb_unbind,
+ .disconnect = android_disconnect,
+ .max_speed = USB_SPEED_HIGH,
+};
+
+static int android_create_device(struct android_dev *dev)
+{
+ struct device_attribute **attrs = android_usb_attributes;
+ struct device_attribute *attr;
+ int err;
+
+ dev->dev = device_create(android_class, NULL,
+ MKDEV(0, 0), NULL, "android0");
+ if (IS_ERR(dev->dev))
+ return PTR_ERR(dev->dev);
+
+ dev_set_drvdata(dev->dev, dev);
+
+ while ((attr = *attrs++)) {
+ err = device_create_file(dev->dev, attr);
+ if (err) {
+ device_destroy(android_class, dev->dev->devt);
+ return err;
+ }
+ }
+ return 0;
+}
+
+
+static int __init init(void)
+{
+ struct android_dev *dev;
+ int err;
+
+ android_class = class_create(THIS_MODULE, "android_usb");
+ if (IS_ERR(android_class))
+ return PTR_ERR(android_class);
+
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ if (!dev) {
+ err = -ENOMEM;
+ goto err_dev;
+ }
+
+ dev->disable_depth = 1;
+ dev->functions = supported_functions;
+ INIT_LIST_HEAD(&dev->enabled_functions);
+ INIT_WORK(&dev->work, android_work);
+ mutex_init(&dev->mutex);
+
+ err = android_create_device(dev);
+ if (err) {
+ pr_err("%s: failed to create android device %d", __func__, err);
+ goto err_create;
+ }
+
+ _android_dev = dev;
+
+ err = usb_composite_probe(&android_usb_driver);
+ if (err) {
+ pr_err("%s: failed to probe driver %d", __func__, err);
+ goto err_probe;
+ }
+
+ /* HACK: exchange composite's setup with ours */
+ composite_setup_func = android_usb_driver.gadget_driver.setup;
+ android_usb_driver.gadget_driver.setup = android_setup;
+
+ return 0;
+
+err_probe:
+ device_destroy(android_class, dev->dev->devt);
+err_create:
+ kfree(dev);
+err_dev:
+ class_destroy(android_class);
+ return err;
+}
+late_initcall(init);
+
+static void __exit cleanup(void)
+{
+ usb_composite_unregister(&android_usb_driver);
+ class_destroy(android_class);
+ kfree(_android_dev);
+ _android_dev = NULL;
+}
+module_exit(cleanup);
diff --git a/drivers/usb/gadget/legacy/gmidi.c b/drivers/usb/gadget/legacy/gmidi.c
index 3d696b86ff76..e02a095294ac 100644
--- a/drivers/usb/gadget/legacy/gmidi.c
+++ b/drivers/usb/gadget/legacy/gmidi.c
@@ -37,7 +37,7 @@
#include "gadget_chips.h"
-#include "f_midi.c"
+#include "u_midi.h"
/*-------------------------------------------------------------------------*/
@@ -115,8 +115,13 @@ static struct usb_gadget_strings *dev_strings[] = {
NULL,
};
+static struct usb_function_instance *fi_midi;
+static struct usb_function *f_midi;
+
static int __exit midi_unbind(struct usb_composite_dev *dev)
{
+ usb_put_function(f_midi);
+ usb_put_function_instance(fi_midi);
return 0;
}
@@ -130,28 +135,54 @@ static struct usb_configuration midi_config = {
static int __init midi_bind_config(struct usb_configuration *c)
{
- return f_midi_bind_config(c, index, id,
- in_ports, out_ports,
- buflen, qlen);
+ int status;
+
+ f_midi = usb_get_function(fi_midi);
+ if (IS_ERR(f_midi))
+ return PTR_ERR(f_midi);
+
+ status = usb_add_function(c, f_midi);
+ if (status < 0) {
+ usb_put_function(f_midi);
+ return status;
+ }
+
+ return 0;
}
static int __init midi_bind(struct usb_composite_dev *cdev)
{
+ struct f_midi_opts *midi_opts;
int status;
+ fi_midi = usb_get_function_instance("midi");
+ if (IS_ERR(fi_midi))
+ return PTR_ERR(fi_midi);
+
+ midi_opts = container_of(fi_midi, struct f_midi_opts, func_inst);
+ midi_opts->index = index;
+ midi_opts->id = id;
+ midi_opts->in_ports = in_ports;
+ midi_opts->out_ports = out_ports;
+ midi_opts->buflen = buflen;
+ midi_opts->qlen = qlen;
+
status = usb_string_ids_tab(cdev, strings_dev);
if (status < 0)
- return status;
+ goto put;
device_desc.iManufacturer = strings_dev[USB_GADGET_MANUFACTURER_IDX].id;
device_desc.iProduct = strings_dev[USB_GADGET_PRODUCT_IDX].id;
midi_config.iConfiguration = strings_dev[STRING_DESCRIPTION_IDX].id;
status = usb_add_config(cdev, &midi_config, midi_bind_config);
if (status < 0)
- return status;
+ goto put;
usb_composite_overwrite_options(cdev, &coverwrite);
pr_info("%s\n", longname);
return 0;
+put:
+ usb_put_function_instance(fi_midi);
+ return status;
}
static __refdata struct usb_composite_driver midi_driver = {
diff --git a/drivers/usb/phy/Kconfig b/drivers/usb/phy/Kconfig
index 0cd1f44f0ee8..77ef4bb73d41 100644
--- a/drivers/usb/phy/Kconfig
+++ b/drivers/usb/phy/Kconfig
@@ -6,6 +6,14 @@ menu "USB Physical Layer drivers"
config USB_PHY
def_bool n
+config USB_OTG_WAKELOCK
+ bool "Hold a wakelock when USB connected"
+ depends on WAKELOCK
+ select USB_OTG_UTILS
+ help
+ Select this to automatically hold a wakelock when USB is
+ connected, preventing suspend.
+
#
# USB Transceiver Drivers
#
@@ -213,4 +221,13 @@ config USB_ULPI_VIEWPORT
Provides read/write operations to the ULPI phy register set for
controllers with a viewport register (e.g. Chipidea/ARC controllers).
+config DUAL_ROLE_USB_INTF
+ bool "Generic DUAL ROLE sysfs interface"
+ depends on SYSFS && USB_PHY
+ help
+ A generic sysfs interface to track and change the state of
+ dual role usb phys. The usb phy drivers can register to
+ this interface to expose it capabilities to the userspace
+ and thereby allowing userspace to change the port mode.
+
endmenu
diff --git a/drivers/usb/phy/Makefile b/drivers/usb/phy/Makefile
index 75f2bba58c84..5cd78717b13f 100644
--- a/drivers/usb/phy/Makefile
+++ b/drivers/usb/phy/Makefile
@@ -3,6 +3,8 @@
#
obj-$(CONFIG_USB_PHY) += phy.o
obj-$(CONFIG_OF) += of.o
+obj-$(CONFIG_USB_OTG_WAKELOCK) += otg-wakelock.o
+obj-$(CONFIG_DUAL_ROLE_USB_INTF) += class-dual-role.o
# transceiver drivers, keep the list sorted
diff --git a/drivers/usb/phy/class-dual-role.c b/drivers/usb/phy/class-dual-role.c
new file mode 100644
index 000000000000..51fcb545a9d5
--- /dev/null
+++ b/drivers/usb/phy/class-dual-role.c
@@ -0,0 +1,529 @@
+/*
+ * class-dual-role.c
+ *
+ * Copyright (C) 2015 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/ctype.h>
+#include <linux/device.h>
+#include <linux/usb/class-dual-role.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/types.h>
+
+#define DUAL_ROLE_NOTIFICATION_TIMEOUT 2000
+
+static ssize_t dual_role_store_property(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count);
+static ssize_t dual_role_show_property(struct device *dev,
+ struct device_attribute *attr,
+ char *buf);
+
+#define DUAL_ROLE_ATTR(_name) \
+{ \
+ .attr = { .name = #_name }, \
+ .show = dual_role_show_property, \
+ .store = dual_role_store_property, \
+}
+
+static struct device_attribute dual_role_attrs[] = {
+ DUAL_ROLE_ATTR(supported_modes),
+ DUAL_ROLE_ATTR(mode),
+ DUAL_ROLE_ATTR(power_role),
+ DUAL_ROLE_ATTR(data_role),
+ DUAL_ROLE_ATTR(powers_vconn),
+};
+
+struct class *dual_role_class;
+EXPORT_SYMBOL_GPL(dual_role_class);
+
+static struct device_type dual_role_dev_type;
+
+static char *kstrdupcase(const char *str, gfp_t gfp, bool to_upper)
+{
+ char *ret, *ustr;
+
+ ustr = ret = kmalloc(strlen(str) + 1, gfp);
+
+ if (!ret)
+ return NULL;
+
+ while (*str)
+ *ustr++ = to_upper ? toupper(*str++) : tolower(*str++);
+
+ *ustr = 0;
+
+ return ret;
+}
+
+static void dual_role_changed_work(struct work_struct *work)
+{
+ struct dual_role_phy_instance *dual_role =
+ container_of(work, struct dual_role_phy_instance,
+ changed_work);
+
+ dev_dbg(&dual_role->dev, "%s\n", __func__);
+ kobject_uevent(&dual_role->dev.kobj, KOBJ_CHANGE);
+}
+
+void dual_role_instance_changed(struct dual_role_phy_instance *dual_role)
+{
+ dev_dbg(&dual_role->dev, "%s\n", __func__);
+ pm_wakeup_event(&dual_role->dev, DUAL_ROLE_NOTIFICATION_TIMEOUT);
+ schedule_work(&dual_role->changed_work);
+}
+EXPORT_SYMBOL_GPL(dual_role_instance_changed);
+
+int dual_role_get_property(struct dual_role_phy_instance *dual_role,
+ enum dual_role_property prop,
+ unsigned int *val)
+{
+ return dual_role->desc->get_property(dual_role, prop, val);
+}
+EXPORT_SYMBOL_GPL(dual_role_get_property);
+
+int dual_role_set_property(struct dual_role_phy_instance *dual_role,
+ enum dual_role_property prop,
+ const unsigned int *val)
+{
+ if (!dual_role->desc->set_property)
+ return -ENODEV;
+
+ return dual_role->desc->set_property(dual_role, prop, val);
+}
+EXPORT_SYMBOL_GPL(dual_role_set_property);
+
+int dual_role_property_is_writeable(struct dual_role_phy_instance *dual_role,
+ enum dual_role_property prop)
+{
+ if (!dual_role->desc->property_is_writeable)
+ return -ENODEV;
+
+ return dual_role->desc->property_is_writeable(dual_role, prop);
+}
+EXPORT_SYMBOL_GPL(dual_role_property_is_writeable);
+
+static void dual_role_dev_release(struct device *dev)
+{
+ struct dual_role_phy_instance *dual_role =
+ container_of(dev, struct dual_role_phy_instance, dev);
+ pr_debug("device: '%s': %s\n", dev_name(dev), __func__);
+ kfree(dual_role);
+}
+
+static struct dual_role_phy_instance *__must_check
+__dual_role_register(struct device *parent,
+ const struct dual_role_phy_desc *desc)
+{
+ struct device *dev;
+ struct dual_role_phy_instance *dual_role;
+ int rc;
+
+ dual_role = kzalloc(sizeof(*dual_role), GFP_KERNEL);
+ if (!dual_role)
+ return ERR_PTR(-ENOMEM);
+
+ dev = &dual_role->dev;
+
+ device_initialize(dev);
+
+ dev->class = dual_role_class;
+ dev->type = &dual_role_dev_type;
+ dev->parent = parent;
+ dev->release = dual_role_dev_release;
+ dev_set_drvdata(dev, dual_role);
+ dual_role->desc = desc;
+
+ rc = dev_set_name(dev, "%s", desc->name);
+ if (rc)
+ goto dev_set_name_failed;
+
+ INIT_WORK(&dual_role->changed_work, dual_role_changed_work);
+
+ rc = device_init_wakeup(dev, true);
+ if (rc)
+ goto wakeup_init_failed;
+
+ rc = device_add(dev);
+ if (rc)
+ goto device_add_failed;
+
+ dual_role_instance_changed(dual_role);
+
+ return dual_role;
+
+device_add_failed:
+ device_init_wakeup(dev, false);
+wakeup_init_failed:
+dev_set_name_failed:
+ put_device(dev);
+ kfree(dual_role);
+
+ return ERR_PTR(rc);
+}
+
+static void dual_role_instance_unregister(struct dual_role_phy_instance
+ *dual_role)
+{
+ cancel_work_sync(&dual_role->changed_work);
+ device_init_wakeup(&dual_role->dev, false);
+ device_unregister(&dual_role->dev);
+}
+
+static void devm_dual_role_release(struct device *dev, void *res)
+{
+ struct dual_role_phy_instance **dual_role = res;
+
+ dual_role_instance_unregister(*dual_role);
+}
+
+struct dual_role_phy_instance *__must_check
+devm_dual_role_instance_register(struct device *parent,
+ const struct dual_role_phy_desc *desc)
+{
+ struct dual_role_phy_instance **ptr, *dual_role;
+
+ ptr = devres_alloc(devm_dual_role_release, sizeof(*ptr), GFP_KERNEL);
+
+ if (!ptr)
+ return ERR_PTR(-ENOMEM);
+ dual_role = __dual_role_register(parent, desc);
+ if (IS_ERR(dual_role)) {
+ devres_free(ptr);
+ } else {
+ *ptr = dual_role;
+ devres_add(parent, ptr);
+ }
+ return dual_role;
+}
+EXPORT_SYMBOL_GPL(devm_dual_role_instance_register);
+
+static int devm_dual_role_match(struct device *dev, void *res, void *data)
+{
+ struct dual_role_phy_instance **r = res;
+
+ if (WARN_ON(!r || !*r))
+ return 0;
+
+ return *r == data;
+}
+
+void devm_dual_role_instance_unregister(struct device *dev,
+ struct dual_role_phy_instance
+ *dual_role)
+{
+ int rc;
+
+ rc = devres_release(dev, devm_dual_role_release,
+ devm_dual_role_match, dual_role);
+ WARN_ON(rc);
+}
+EXPORT_SYMBOL_GPL(devm_dual_role_instance_unregister);
+
+void *dual_role_get_drvdata(struct dual_role_phy_instance *dual_role)
+{
+ return dual_role->drv_data;
+}
+EXPORT_SYMBOL_GPL(dual_role_get_drvdata);
+
+/***************** Device attribute functions **************************/
+
+/* port type */
+static char *supported_modes_text[] = {
+ "ufp dfp", "dfp", "ufp"
+};
+
+/* current mode */
+static char *mode_text[] = {
+ "ufp", "dfp", "none"
+};
+
+/* Power role */
+static char *pr_text[] = {
+ "source", "sink", "none"
+};
+
+/* Data role */
+static char *dr_text[] = {
+ "host", "device", "none"
+};
+
+/* Vconn supply */
+static char *vconn_supply_text[] = {
+ "n", "y"
+};
+
+static ssize_t dual_role_show_property(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ ssize_t ret = 0;
+ struct dual_role_phy_instance *dual_role = dev_get_drvdata(dev);
+ const ptrdiff_t off = attr - dual_role_attrs;
+ unsigned int value;
+
+ if (off == DUAL_ROLE_PROP_SUPPORTED_MODES) {
+ value = dual_role->desc->supported_modes;
+ } else {
+ ret = dual_role_get_property(dual_role, off, &value);
+
+ if (ret < 0) {
+ if (ret == -ENODATA)
+ dev_dbg(dev,
+ "driver has no data for `%s' property\n",
+ attr->attr.name);
+ else if (ret != -ENODEV)
+ dev_err(dev,
+ "driver failed to report `%s' property: %zd\n",
+ attr->attr.name, ret);
+ return ret;
+ }
+ }
+
+ if (off == DUAL_ROLE_PROP_SUPPORTED_MODES) {
+ BUILD_BUG_ON(DUAL_ROLE_PROP_SUPPORTED_MODES_TOTAL !=
+ ARRAY_SIZE(supported_modes_text));
+ if (value < DUAL_ROLE_PROP_SUPPORTED_MODES_TOTAL)
+ return snprintf(buf, PAGE_SIZE, "%s\n",
+ supported_modes_text[value]);
+ else
+ return -EIO;
+ } else if (off == DUAL_ROLE_PROP_MODE) {
+ BUILD_BUG_ON(DUAL_ROLE_PROP_MODE_TOTAL !=
+ ARRAY_SIZE(mode_text));
+ if (value < DUAL_ROLE_PROP_MODE_TOTAL)
+ return snprintf(buf, PAGE_SIZE, "%s\n",
+ mode_text[value]);
+ else
+ return -EIO;
+ } else if (off == DUAL_ROLE_PROP_PR) {
+ BUILD_BUG_ON(DUAL_ROLE_PROP_PR_TOTAL != ARRAY_SIZE(pr_text));
+ if (value < DUAL_ROLE_PROP_PR_TOTAL)
+ return snprintf(buf, PAGE_SIZE, "%s\n",
+ pr_text[value]);
+ else
+ return -EIO;
+ } else if (off == DUAL_ROLE_PROP_DR) {
+ BUILD_BUG_ON(DUAL_ROLE_PROP_DR_TOTAL != ARRAY_SIZE(dr_text));
+ if (value < DUAL_ROLE_PROP_DR_TOTAL)
+ return snprintf(buf, PAGE_SIZE, "%s\n",
+ dr_text[value]);
+ else
+ return -EIO;
+ } else if (off == DUAL_ROLE_PROP_VCONN_SUPPLY) {
+ BUILD_BUG_ON(DUAL_ROLE_PROP_VCONN_SUPPLY_TOTAL !=
+ ARRAY_SIZE(vconn_supply_text));
+ if (value < DUAL_ROLE_PROP_VCONN_SUPPLY_TOTAL)
+ return snprintf(buf, PAGE_SIZE, "%s\n",
+ vconn_supply_text[value]);
+ else
+ return -EIO;
+ } else
+ return -EIO;
+}
+
+static ssize_t dual_role_store_property(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ ssize_t ret;
+ struct dual_role_phy_instance *dual_role = dev_get_drvdata(dev);
+ const ptrdiff_t off = attr - dual_role_attrs;
+ unsigned int value;
+ int total, i;
+ char *dup_buf, **text_array;
+ bool result = false;
+
+ dup_buf = kstrdupcase(buf, GFP_KERNEL, false);
+ switch (off) {
+ case DUAL_ROLE_PROP_MODE:
+ total = DUAL_ROLE_PROP_MODE_TOTAL;
+ text_array = mode_text;
+ break;
+ case DUAL_ROLE_PROP_PR:
+ total = DUAL_ROLE_PROP_PR_TOTAL;
+ text_array = pr_text;
+ break;
+ case DUAL_ROLE_PROP_DR:
+ total = DUAL_ROLE_PROP_DR_TOTAL;
+ text_array = dr_text;
+ break;
+ case DUAL_ROLE_PROP_VCONN_SUPPLY:
+ ret = strtobool(dup_buf, &result);
+ value = result;
+ if (!ret)
+ goto setprop;
+ default:
+ ret = -EINVAL;
+ goto error;
+ }
+
+ for (i = 0; i <= total; i++) {
+ if (i == total) {
+ ret = -ENOTSUPP;
+ goto error;
+ }
+ if (!strncmp(*(text_array + i), dup_buf,
+ strlen(*(text_array + i)))) {
+ value = i;
+ break;
+ }
+ }
+
+setprop:
+ ret = dual_role->desc->set_property(dual_role, off, &value);
+
+error:
+ kfree(dup_buf);
+
+ if (ret < 0)
+ return ret;
+
+ return count;
+}
+
+static umode_t dual_role_attr_is_visible(struct kobject *kobj,
+ struct attribute *attr, int attrno)
+{
+ struct device *dev = container_of(kobj, struct device, kobj);
+ struct dual_role_phy_instance *dual_role = dev_get_drvdata(dev);
+ umode_t mode = S_IRUSR | S_IRGRP | S_IROTH;
+ int i;
+
+ if (attrno == DUAL_ROLE_PROP_SUPPORTED_MODES)
+ return mode;
+
+ for (i = 0; i < dual_role->desc->num_properties; i++) {
+ int property = dual_role->desc->properties[i];
+
+ if (property == attrno) {
+ if (dual_role->desc->property_is_writeable &&
+ dual_role_property_is_writeable(dual_role, property)
+ > 0)
+ mode |= S_IWUSR;
+
+ return mode;
+ }
+ }
+
+ return 0;
+}
+
+static struct attribute *__dual_role_attrs[ARRAY_SIZE(dual_role_attrs) + 1];
+
+static struct attribute_group dual_role_attr_group = {
+ .attrs = __dual_role_attrs,
+ .is_visible = dual_role_attr_is_visible,
+};
+
+static const struct attribute_group *dual_role_attr_groups[] = {
+ &dual_role_attr_group,
+ NULL,
+};
+
+void dual_role_init_attrs(struct device_type *dev_type)
+{
+ int i;
+
+ dev_type->groups = dual_role_attr_groups;
+
+ for (i = 0; i < ARRAY_SIZE(dual_role_attrs); i++)
+ __dual_role_attrs[i] = &dual_role_attrs[i].attr;
+}
+
+int dual_role_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+ struct dual_role_phy_instance *dual_role = dev_get_drvdata(dev);
+ int ret = 0, j;
+ char *prop_buf;
+ char *attrname;
+
+ dev_dbg(dev, "uevent\n");
+
+ if (!dual_role || !dual_role->desc) {
+ dev_dbg(dev, "No dual_role phy yet\n");
+ return ret;
+ }
+
+ dev_dbg(dev, "DUAL_ROLE_NAME=%s\n", dual_role->desc->name);
+
+ ret = add_uevent_var(env, "DUAL_ROLE_NAME=%s", dual_role->desc->name);
+ if (ret)
+ return ret;
+
+ prop_buf = (char *)get_zeroed_page(GFP_KERNEL);
+ if (!prop_buf)
+ return -ENOMEM;
+
+ for (j = 0; j < dual_role->desc->num_properties; j++) {
+ struct device_attribute *attr;
+ char *line;
+
+ attr = &dual_role_attrs[dual_role->desc->properties[j]];
+
+ ret = dual_role_show_property(dev, attr, prop_buf);
+ if (ret == -ENODEV || ret == -ENODATA) {
+ ret = 0;
+ continue;
+ }
+
+ if (ret < 0)
+ goto out;
+ line = strnchr(prop_buf, PAGE_SIZE, '\n');
+ if (line)
+ *line = 0;
+
+ attrname = kstrdupcase(attr->attr.name, GFP_KERNEL, true);
+ if (!attrname)
+ ret = -ENOMEM;
+
+ dev_dbg(dev, "prop %s=%s\n", attrname, prop_buf);
+
+ ret = add_uevent_var(env, "DUAL_ROLE_%s=%s", attrname,
+ prop_buf);
+ kfree(attrname);
+ if (ret)
+ goto out;
+ }
+
+out:
+ free_page((unsigned long)prop_buf);
+
+ return ret;
+}
+
+/******************* Module Init ***********************************/
+
+static int __init dual_role_class_init(void)
+{
+ dual_role_class = class_create(THIS_MODULE, "dual_role_usb");
+
+ if (IS_ERR(dual_role_class))
+ return PTR_ERR(dual_role_class);
+
+ dual_role_class->dev_uevent = dual_role_uevent;
+ dual_role_init_attrs(&dual_role_dev_type);
+
+ return 0;
+}
+
+static void __exit dual_role_class_exit(void)
+{
+ class_destroy(dual_role_class);
+}
+
+subsys_initcall(dual_role_class_init);
+module_exit(dual_role_class_exit);
diff --git a/drivers/usb/phy/otg-wakelock.c b/drivers/usb/phy/otg-wakelock.c
new file mode 100644
index 000000000000..479376bfa484
--- /dev/null
+++ b/drivers/usb/phy/otg-wakelock.c
@@ -0,0 +1,173 @@
+/*
+ * otg-wakelock.c
+ *
+ * Copyright (C) 2011 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/wakelock.h>
+#include <linux/spinlock.h>
+#include <linux/usb/otg.h>
+
+#define TEMPORARY_HOLD_TIME 2000
+
+static bool enabled = true;
+static struct usb_phy *otgwl_xceiv;
+static struct notifier_block otgwl_nb;
+
+/*
+ * otgwl_spinlock is held while the VBUS lock is grabbed or dropped and the
+ * held field is updated to match.
+ */
+
+static DEFINE_SPINLOCK(otgwl_spinlock);
+
+/*
+ * Only one lock, but since these 3 fields are associated with each other...
+ */
+
+struct otgwl_lock {
+ char name[40];
+ struct wake_lock wakelock;
+ bool held;
+};
+
+/*
+ * VBUS present lock. Also used as a timed lock on charger
+ * connect/disconnect and USB host disconnect, to allow the system
+ * to react to the change in power.
+ */
+
+static struct otgwl_lock vbus_lock;
+
+static void otgwl_hold(struct otgwl_lock *lock)
+{
+ if (!lock->held) {
+ wake_lock(&lock->wakelock);
+ lock->held = true;
+ }
+}
+
+static void otgwl_temporary_hold(struct otgwl_lock *lock)
+{
+ wake_lock_timeout(&lock->wakelock,
+ msecs_to_jiffies(TEMPORARY_HOLD_TIME));
+ lock->held = false;
+}
+
+static void otgwl_drop(struct otgwl_lock *lock)
+{
+ if (lock->held) {
+ wake_unlock(&lock->wakelock);
+ lock->held = false;
+ }
+}
+
+static void otgwl_handle_event(unsigned long event)
+{
+ unsigned long irqflags;
+
+ spin_lock_irqsave(&otgwl_spinlock, irqflags);
+
+ if (!enabled) {
+ otgwl_drop(&vbus_lock);
+ spin_unlock_irqrestore(&otgwl_spinlock, irqflags);
+ return;
+ }
+
+ switch (event) {
+ case USB_EVENT_VBUS:
+ case USB_EVENT_ENUMERATED:
+ otgwl_hold(&vbus_lock);
+ break;
+
+ case USB_EVENT_NONE:
+ case USB_EVENT_ID:
+ case USB_EVENT_CHARGER:
+ otgwl_temporary_hold(&vbus_lock);
+ break;
+
+ default:
+ break;
+ }
+
+ spin_unlock_irqrestore(&otgwl_spinlock, irqflags);
+}
+
+static int otgwl_otg_notifications(struct notifier_block *nb,
+ unsigned long event, void *unused)
+{
+ otgwl_handle_event(event);
+ return NOTIFY_OK;
+}
+
+static int set_enabled(const char *val, const struct kernel_param *kp)
+{
+ int rv = param_set_bool(val, kp);
+
+ if (rv)
+ return rv;
+
+ if (otgwl_xceiv)
+ otgwl_handle_event(otgwl_xceiv->last_event);
+
+ return 0;
+}
+
+static struct kernel_param_ops enabled_param_ops = {
+ .set = set_enabled,
+ .get = param_get_bool,
+};
+
+module_param_cb(enabled, &enabled_param_ops, &enabled, 0644);
+MODULE_PARM_DESC(enabled, "enable wakelock when VBUS present");
+
+static int __init otg_wakelock_init(void)
+{
+ int ret;
+ struct usb_phy *phy;
+
+ phy = usb_get_phy(USB_PHY_TYPE_USB2);
+
+ if (IS_ERR(phy)) {
+ pr_err("%s: No USB transceiver found\n", __func__);
+ return PTR_ERR(phy);
+ }
+ otgwl_xceiv = phy;
+
+ snprintf(vbus_lock.name, sizeof(vbus_lock.name), "vbus-%s",
+ dev_name(otgwl_xceiv->dev));
+ wake_lock_init(&vbus_lock.wakelock, WAKE_LOCK_SUSPEND,
+ vbus_lock.name);
+
+ otgwl_nb.notifier_call = otgwl_otg_notifications;
+ ret = usb_register_notifier(otgwl_xceiv, &otgwl_nb);
+
+ if (ret) {
+ pr_err("%s: usb_register_notifier on transceiver %s"
+ " failed\n", __func__,
+ dev_name(otgwl_xceiv->dev));
+ otgwl_xceiv = NULL;
+ wake_lock_destroy(&vbus_lock.wakelock);
+ return ret;
+ }
+
+ otgwl_handle_event(otgwl_xceiv->last_event);
+ return ret;
+}
+
+late_initcall(otg_wakelock_init);
diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig
index 8bf495ffb020..49fcbb1632af 100644
--- a/drivers/video/Kconfig
+++ b/drivers/video/Kconfig
@@ -31,6 +31,7 @@ source "drivers/video/fbdev/Kconfig"
endmenu
source "drivers/video/backlight/Kconfig"
+source "drivers/video/adf/Kconfig"
config VGASTATE
tristate
diff --git a/drivers/video/Makefile b/drivers/video/Makefile
index 9ad3c17d6456..1a8c4ced39b2 100644
--- a/drivers/video/Makefile
+++ b/drivers/video/Makefile
@@ -1,6 +1,7 @@
obj-$(CONFIG_VGASTATE) += vgastate.o
obj-$(CONFIG_HDMI) += hdmi.o
+obj-$(CONFIG_ADF) += adf/
obj-$(CONFIG_VT) += console/
obj-$(CONFIG_LOGO) += logo/
obj-y += backlight/
diff --git a/drivers/video/adf/Kconfig b/drivers/video/adf/Kconfig
new file mode 100644
index 000000000000..2777db48fae0
--- /dev/null
+++ b/drivers/video/adf/Kconfig
@@ -0,0 +1,14 @@
+menuconfig ADF
+ depends on SYNC
+ depends on DMA_SHARED_BUFFER
+ tristate "Atomic Display Framework"
+
+menuconfig ADF_FBDEV
+ depends on ADF
+ depends on FB
+ tristate "Helper for implementing the fbdev API in ADF drivers"
+
+menuconfig ADF_MEMBLOCK
+ depends on ADF
+ depends on HAVE_MEMBLOCK
+ bool "Helper for using memblocks as buffers in ADF drivers"
diff --git a/drivers/video/adf/Makefile b/drivers/video/adf/Makefile
new file mode 100644
index 000000000000..cdf34a666dc7
--- /dev/null
+++ b/drivers/video/adf/Makefile
@@ -0,0 +1,17 @@
+ccflags-y := -Idrivers/staging/android
+
+CFLAGS_adf.o := -I$(src)
+
+obj-$(CONFIG_ADF) += adf_core.o
+
+adf_core-y := adf.o \
+ adf_client.o \
+ adf_fops.o \
+ adf_format.o \
+ adf_sysfs.o
+
+adf_core-$(CONFIG_COMPAT) += adf_fops32.o
+
+obj-$(CONFIG_ADF_FBDEV) += adf_fbdev.o
+
+obj-$(CONFIG_ADF_MEMBLOCK) += adf_memblock.o
diff --git a/drivers/video/adf/adf.c b/drivers/video/adf/adf.c
new file mode 100644
index 000000000000..42c30c05826a
--- /dev/null
+++ b/drivers/video/adf/adf.c
@@ -0,0 +1,1188 @@
+/*
+ * Copyright (C) 2013 Google, Inc.
+ * adf_modeinfo_{set_name,set_vrefresh} modified from
+ * drivers/gpu/drm/drm_modes.c
+ * adf_format_validate_yuv modified from framebuffer_check in
+ * drivers/gpu/drm/drm_crtc.c
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/device.h>
+#include <linux/idr.h>
+#include <linux/highmem.h>
+#include <linux/memblock.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+
+#include <video/adf_format.h>
+
+#include "sw_sync.h"
+#include "sync.h"
+
+#include "adf.h"
+#include "adf_fops.h"
+#include "adf_sysfs.h"
+
+#define CREATE_TRACE_POINTS
+#include "adf_trace.h"
+
+#define ADF_SHORT_FENCE_TIMEOUT (1 * MSEC_PER_SEC)
+#define ADF_LONG_FENCE_TIMEOUT (10 * MSEC_PER_SEC)
+
+static DEFINE_IDR(adf_devices);
+
+static void adf_fence_wait(struct adf_device *dev, struct sync_fence *fence)
+{
+ /* sync_fence_wait() dumps debug information on timeout. Experience
+ has shown that if the pipeline gets stuck, a short timeout followed
+ by a longer one provides useful information for debugging. */
+ int err = sync_fence_wait(fence, ADF_SHORT_FENCE_TIMEOUT);
+ if (err >= 0)
+ return;
+
+ if (err == -ETIME)
+ err = sync_fence_wait(fence, ADF_LONG_FENCE_TIMEOUT);
+
+ if (err < 0)
+ dev_warn(&dev->base.dev, "error waiting on fence: %d\n", err);
+}
+
+void adf_buffer_cleanup(struct adf_buffer *buf)
+{
+ size_t i;
+ for (i = 0; i < ARRAY_SIZE(buf->dma_bufs); i++)
+ if (buf->dma_bufs[i])
+ dma_buf_put(buf->dma_bufs[i]);
+
+ if (buf->acquire_fence)
+ sync_fence_put(buf->acquire_fence);
+}
+
+void adf_buffer_mapping_cleanup(struct adf_buffer_mapping *mapping,
+ struct adf_buffer *buf)
+{
+ /* calling adf_buffer_mapping_cleanup() is safe even if mapping is
+ uninitialized or partially-initialized, as long as it was
+ zeroed on allocation */
+ size_t i;
+ for (i = 0; i < ARRAY_SIZE(mapping->sg_tables); i++) {
+ if (mapping->sg_tables[i])
+ dma_buf_unmap_attachment(mapping->attachments[i],
+ mapping->sg_tables[i], DMA_TO_DEVICE);
+ if (mapping->attachments[i])
+ dma_buf_detach(buf->dma_bufs[i],
+ mapping->attachments[i]);
+ }
+}
+
+void adf_post_cleanup(struct adf_device *dev, struct adf_pending_post *post)
+{
+ size_t i;
+
+ if (post->state)
+ dev->ops->state_free(dev, post->state);
+
+ for (i = 0; i < post->config.n_bufs; i++) {
+ adf_buffer_mapping_cleanup(&post->config.mappings[i],
+ &post->config.bufs[i]);
+ adf_buffer_cleanup(&post->config.bufs[i]);
+ }
+
+ kfree(post->config.custom_data);
+ kfree(post->config.mappings);
+ kfree(post->config.bufs);
+ kfree(post);
+}
+
+static void adf_sw_advance_timeline(struct adf_device *dev)
+{
+#ifdef CONFIG_SW_SYNC
+ sw_sync_timeline_inc(dev->timeline, 1);
+#else
+ BUG();
+#endif
+}
+
+static void adf_post_work_func(struct kthread_work *work)
+{
+ struct adf_device *dev =
+ container_of(work, struct adf_device, post_work);
+ struct adf_pending_post *post, *next;
+ struct list_head saved_list;
+
+ mutex_lock(&dev->post_lock);
+ memcpy(&saved_list, &dev->post_list, sizeof(saved_list));
+ list_replace_init(&dev->post_list, &saved_list);
+ mutex_unlock(&dev->post_lock);
+
+ list_for_each_entry_safe(post, next, &saved_list, head) {
+ int i;
+
+ for (i = 0; i < post->config.n_bufs; i++) {
+ struct sync_fence *fence =
+ post->config.bufs[i].acquire_fence;
+ if (fence)
+ adf_fence_wait(dev, fence);
+ }
+
+ dev->ops->post(dev, &post->config, post->state);
+
+ if (dev->ops->advance_timeline)
+ dev->ops->advance_timeline(dev, &post->config,
+ post->state);
+ else
+ adf_sw_advance_timeline(dev);
+
+ list_del(&post->head);
+ if (dev->onscreen)
+ adf_post_cleanup(dev, dev->onscreen);
+ dev->onscreen = post;
+ }
+}
+
+void adf_attachment_free(struct adf_attachment_list *attachment)
+{
+ list_del(&attachment->head);
+ kfree(attachment);
+}
+
+struct adf_event_refcount *adf_obj_find_event_refcount(struct adf_obj *obj,
+ enum adf_event_type type)
+{
+ struct rb_root *root = &obj->event_refcount;
+ struct rb_node **new = &(root->rb_node);
+ struct rb_node *parent = NULL;
+ struct adf_event_refcount *refcount;
+
+ while (*new) {
+ refcount = container_of(*new, struct adf_event_refcount, node);
+ parent = *new;
+
+ if (refcount->type > type)
+ new = &(*new)->rb_left;
+ else if (refcount->type < type)
+ new = &(*new)->rb_right;
+ else
+ return refcount;
+ }
+
+ refcount = kzalloc(sizeof(*refcount), GFP_KERNEL);
+ if (!refcount)
+ return NULL;
+ refcount->type = type;
+
+ rb_link_node(&refcount->node, parent, new);
+ rb_insert_color(&refcount->node, root);
+ return refcount;
+}
+
+/**
+ * adf_event_get - increase the refcount for an event
+ *
+ * @obj: the object that produces the event
+ * @type: the event type
+ *
+ * ADF will call the object's set_event() op if needed. ops are allowed
+ * to sleep, so adf_event_get() must NOT be called from an atomic context.
+ *
+ * Returns 0 if successful, or -%EINVAL if the object does not support the
+ * requested event type.
+ */
+int adf_event_get(struct adf_obj *obj, enum adf_event_type type)
+{
+ struct adf_event_refcount *refcount;
+ int old_refcount;
+ int ret;
+
+ ret = adf_obj_check_supports_event(obj, type);
+ if (ret < 0)
+ return ret;
+
+ mutex_lock(&obj->event_lock);
+
+ refcount = adf_obj_find_event_refcount(obj, type);
+ if (!refcount) {
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ old_refcount = refcount->refcount++;
+
+ if (old_refcount == 0) {
+ obj->ops->set_event(obj, type, true);
+ trace_adf_event_enable(obj, type);
+ }
+
+done:
+ mutex_unlock(&obj->event_lock);
+ return ret;
+}
+EXPORT_SYMBOL(adf_event_get);
+
+/**
+ * adf_event_put - decrease the refcount for an event
+ *
+ * @obj: the object that produces the event
+ * @type: the event type
+ *
+ * ADF will call the object's set_event() op if needed. ops are allowed
+ * to sleep, so adf_event_put() must NOT be called from an atomic context.
+ *
+ * Returns 0 if successful, -%EINVAL if the object does not support the
+ * requested event type, or -%EALREADY if the refcount is already 0.
+ */
+int adf_event_put(struct adf_obj *obj, enum adf_event_type type)
+{
+ struct adf_event_refcount *refcount;
+ int old_refcount;
+ int ret;
+
+ ret = adf_obj_check_supports_event(obj, type);
+ if (ret < 0)
+ return ret;
+
+
+ mutex_lock(&obj->event_lock);
+
+ refcount = adf_obj_find_event_refcount(obj, type);
+ if (!refcount) {
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ old_refcount = refcount->refcount--;
+
+ if (WARN_ON(old_refcount == 0)) {
+ refcount->refcount++;
+ ret = -EALREADY;
+ } else if (old_refcount == 1) {
+ obj->ops->set_event(obj, type, false);
+ trace_adf_event_disable(obj, type);
+ }
+
+done:
+ mutex_unlock(&obj->event_lock);
+ return ret;
+}
+EXPORT_SYMBOL(adf_event_put);
+
+/**
+ * adf_vsync_wait - wait for a vsync event on a display interface
+ *
+ * @intf: the display interface
+ * @timeout: timeout in jiffies (0 = wait indefinitely)
+ *
+ * adf_vsync_wait() may sleep, so it must NOT be called from an atomic context.
+ *
+ * This function returns -%ERESTARTSYS if it is interrupted by a signal.
+ * If @timeout == 0 then this function returns 0 on vsync. If @timeout > 0 then
+ * this function returns the number of remaining jiffies or -%ETIMEDOUT on
+ * timeout.
+ */
+int adf_vsync_wait(struct adf_interface *intf, long timeout)
+{
+ ktime_t timestamp;
+ int ret;
+ unsigned long flags;
+
+ read_lock_irqsave(&intf->vsync_lock, flags);
+ timestamp = intf->vsync_timestamp;
+ read_unlock_irqrestore(&intf->vsync_lock, flags);
+
+ adf_vsync_get(intf);
+ if (timeout) {
+ ret = wait_event_interruptible_timeout(intf->vsync_wait,
+ !ktime_equal(timestamp,
+ intf->vsync_timestamp),
+ msecs_to_jiffies(timeout));
+ if (ret == 0 && ktime_equal(timestamp, intf->vsync_timestamp))
+ ret = -ETIMEDOUT;
+ } else {
+ ret = wait_event_interruptible(intf->vsync_wait,
+ !ktime_equal(timestamp,
+ intf->vsync_timestamp));
+ }
+ adf_vsync_put(intf);
+
+ return ret;
+}
+EXPORT_SYMBOL(adf_vsync_wait);
+
+static void adf_event_queue(struct adf_obj *obj, struct adf_event *event)
+{
+ struct adf_file *file;
+ unsigned long flags;
+
+ trace_adf_event(obj, event->type);
+
+ spin_lock_irqsave(&obj->file_lock, flags);
+
+ list_for_each_entry(file, &obj->file_list, head)
+ if (test_bit(event->type, file->event_subscriptions))
+ adf_file_queue_event(file, event);
+
+ spin_unlock_irqrestore(&obj->file_lock, flags);
+}
+
+/**
+ * adf_event_notify - notify userspace of a driver-private event
+ *
+ * @obj: the ADF object that produced the event
+ * @event: the event
+ *
+ * adf_event_notify() may be called safely from an atomic context. It will
+ * copy @event if needed, so @event may point to a variable on the stack.
+ *
+ * Drivers must NOT call adf_event_notify() for vsync and hotplug events.
+ * ADF provides adf_vsync_notify() and
+ * adf_hotplug_notify_{connected,disconnected}() for these events.
+ */
+int adf_event_notify(struct adf_obj *obj, struct adf_event *event)
+{
+ if (WARN_ON(event->type == ADF_EVENT_VSYNC ||
+ event->type == ADF_EVENT_HOTPLUG))
+ return -EINVAL;
+
+ adf_event_queue(obj, event);
+ return 0;
+}
+EXPORT_SYMBOL(adf_event_notify);
+
+/**
+ * adf_vsync_notify - notify ADF of a display interface's vsync event
+ *
+ * @intf: the display interface
+ * @timestamp: the time the vsync occurred
+ *
+ * adf_vsync_notify() may be called safely from an atomic context.
+ */
+void adf_vsync_notify(struct adf_interface *intf, ktime_t timestamp)
+{
+ unsigned long flags;
+ struct adf_vsync_event event;
+
+ write_lock_irqsave(&intf->vsync_lock, flags);
+ intf->vsync_timestamp = timestamp;
+ write_unlock_irqrestore(&intf->vsync_lock, flags);
+
+ wake_up_interruptible_all(&intf->vsync_wait);
+
+ event.base.type = ADF_EVENT_VSYNC;
+ event.base.length = sizeof(event);
+ event.timestamp = ktime_to_ns(timestamp);
+ adf_event_queue(&intf->base, &event.base);
+}
+EXPORT_SYMBOL(adf_vsync_notify);
+
+void adf_hotplug_notify(struct adf_interface *intf, bool connected,
+ struct drm_mode_modeinfo *modelist, size_t n_modes)
+{
+ unsigned long flags;
+ struct adf_hotplug_event event;
+ struct drm_mode_modeinfo *old_modelist;
+
+ write_lock_irqsave(&intf->hotplug_modelist_lock, flags);
+ old_modelist = intf->modelist;
+ intf->hotplug_detect = connected;
+ intf->modelist = modelist;
+ intf->n_modes = n_modes;
+ write_unlock_irqrestore(&intf->hotplug_modelist_lock, flags);
+
+ kfree(old_modelist);
+
+ event.base.length = sizeof(event);
+ event.base.type = ADF_EVENT_HOTPLUG;
+ event.connected = connected;
+ adf_event_queue(&intf->base, &event.base);
+}
+
+/**
+ * adf_hotplug_notify_connected - notify ADF of a display interface being
+ * connected to a display
+ *
+ * @intf: the display interface
+ * @modelist: hardware modes supported by display
+ * @n_modes: length of modelist
+ *
+ * @modelist is copied as needed, so it may point to a variable on the stack.
+ *
+ * adf_hotplug_notify_connected() may NOT be called safely from an atomic
+ * context.
+ *
+ * Returns 0 on success or error code (<0) on error.
+ */
+int adf_hotplug_notify_connected(struct adf_interface *intf,
+ struct drm_mode_modeinfo *modelist, size_t n_modes)
+{
+ struct drm_mode_modeinfo *modelist_copy;
+
+ if (n_modes > ADF_MAX_MODES)
+ return -ENOMEM;
+
+ modelist_copy = kzalloc(sizeof(modelist_copy[0]) * n_modes,
+ GFP_KERNEL);
+ if (!modelist_copy)
+ return -ENOMEM;
+ memcpy(modelist_copy, modelist, sizeof(modelist_copy[0]) * n_modes);
+
+ adf_hotplug_notify(intf, true, modelist_copy, n_modes);
+ return 0;
+}
+EXPORT_SYMBOL(adf_hotplug_notify_connected);
+
+/**
+ * adf_hotplug_notify_disconnected - notify ADF of a display interface being
+ * disconnected from a display
+ *
+ * @intf: the display interface
+ *
+ * adf_hotplug_notify_disconnected() may be called safely from an atomic
+ * context.
+ */
+void adf_hotplug_notify_disconnected(struct adf_interface *intf)
+{
+ adf_hotplug_notify(intf, false, NULL, 0);
+}
+EXPORT_SYMBOL(adf_hotplug_notify_disconnected);
+
+static int adf_obj_init(struct adf_obj *obj, enum adf_obj_type type,
+ struct idr *idr, struct adf_device *parent,
+ const struct adf_obj_ops *ops, const char *fmt, va_list args)
+{
+ int ret;
+
+ if (ops && ops->supports_event && !ops->set_event) {
+ pr_err("%s: %s implements supports_event but not set_event\n",
+ __func__, adf_obj_type_str(type));
+ return -EINVAL;
+ }
+
+ ret = idr_alloc(idr, obj, 0, 0, GFP_KERNEL);
+ if (ret < 0) {
+ pr_err("%s: allocating object id failed: %d\n", __func__, ret);
+ return ret;
+ }
+ obj->id = ret;
+
+ vscnprintf(obj->name, sizeof(obj->name), fmt, args);
+
+ obj->type = type;
+ obj->ops = ops;
+ obj->parent = parent;
+ mutex_init(&obj->event_lock);
+ obj->event_refcount = RB_ROOT;
+ spin_lock_init(&obj->file_lock);
+ INIT_LIST_HEAD(&obj->file_list);
+ return 0;
+}
+
+static void adf_obj_destroy(struct adf_obj *obj, struct idr *idr)
+{
+ struct rb_node *node = rb_first(&obj->event_refcount);
+
+ while (node) {
+ struct adf_event_refcount *refcount =
+ container_of(node, struct adf_event_refcount,
+ node);
+ rb_erase(&refcount->node, &obj->event_refcount);
+ kfree(refcount);
+ node = rb_first(&obj->event_refcount);
+ }
+
+ mutex_destroy(&obj->event_lock);
+ idr_remove(idr, obj->id);
+}
+
+/**
+ * adf_device_init - initialize ADF-internal data for a display device
+ * and create sysfs entries
+ *
+ * @dev: the display device
+ * @parent: the device's parent device
+ * @ops: the device's associated ops
+ * @fmt: formatting string for the display device's name
+ *
+ * @fmt specifies the device's sysfs filename and the name returned to
+ * userspace through the %ADF_GET_DEVICE_DATA ioctl.
+ *
+ * Returns 0 on success or error code (<0) on failure.
+ */
+int adf_device_init(struct adf_device *dev, struct device *parent,
+ const struct adf_device_ops *ops, const char *fmt, ...)
+{
+ int ret;
+ va_list args;
+
+ if (!ops->validate || !ops->post) {
+ pr_err("%s: device must implement validate and post\n",
+ __func__);
+ return -EINVAL;
+ }
+
+ if (!ops->complete_fence && !ops->advance_timeline) {
+ if (!IS_ENABLED(CONFIG_SW_SYNC)) {
+ pr_err("%s: device requires sw_sync but it is not enabled in the kernel\n",
+ __func__);
+ return -EINVAL;
+ }
+ } else if (!(ops->complete_fence && ops->advance_timeline)) {
+ pr_err("%s: device must implement both complete_fence and advance_timeline, or implement neither\n",
+ __func__);
+ return -EINVAL;
+ }
+
+ memset(dev, 0, sizeof(*dev));
+
+ va_start(args, fmt);
+ ret = adf_obj_init(&dev->base, ADF_OBJ_DEVICE, &adf_devices, dev,
+ &ops->base, fmt, args);
+ va_end(args);
+ if (ret < 0)
+ return ret;
+
+ dev->dev = parent;
+ dev->ops = ops;
+ idr_init(&dev->overlay_engines);
+ idr_init(&dev->interfaces);
+ mutex_init(&dev->client_lock);
+ INIT_LIST_HEAD(&dev->post_list);
+ mutex_init(&dev->post_lock);
+ init_kthread_worker(&dev->post_worker);
+ INIT_LIST_HEAD(&dev->attached);
+ INIT_LIST_HEAD(&dev->attach_allowed);
+
+ dev->post_thread = kthread_run(kthread_worker_fn,
+ &dev->post_worker, dev->base.name);
+ if (IS_ERR(dev->post_thread)) {
+ ret = PTR_ERR(dev->post_thread);
+ dev->post_thread = NULL;
+
+ pr_err("%s: failed to run config posting thread: %d\n",
+ __func__, ret);
+ goto err;
+ }
+ init_kthread_work(&dev->post_work, adf_post_work_func);
+
+ ret = adf_device_sysfs_init(dev);
+ if (ret < 0)
+ goto err;
+
+ return 0;
+
+err:
+ adf_device_destroy(dev);
+ return ret;
+}
+EXPORT_SYMBOL(adf_device_init);
+
+/**
+ * adf_device_destroy - clean up ADF-internal data for a display device
+ *
+ * @dev: the display device
+ */
+void adf_device_destroy(struct adf_device *dev)
+{
+ struct adf_attachment_list *entry, *next;
+
+ idr_destroy(&dev->interfaces);
+ idr_destroy(&dev->overlay_engines);
+
+ if (dev->post_thread) {
+ flush_kthread_worker(&dev->post_worker);
+ kthread_stop(dev->post_thread);
+ }
+
+ if (dev->onscreen)
+ adf_post_cleanup(dev, dev->onscreen);
+ adf_device_sysfs_destroy(dev);
+ list_for_each_entry_safe(entry, next, &dev->attach_allowed, head) {
+ adf_attachment_free(entry);
+ }
+ list_for_each_entry_safe(entry, next, &dev->attached, head) {
+ adf_attachment_free(entry);
+ }
+ mutex_destroy(&dev->post_lock);
+ mutex_destroy(&dev->client_lock);
+
+ if (dev->timeline)
+ sync_timeline_destroy(&dev->timeline->obj);
+
+ adf_obj_destroy(&dev->base, &adf_devices);
+}
+EXPORT_SYMBOL(adf_device_destroy);
+
+/**
+ * adf_interface_init - initialize ADF-internal data for a display interface
+ * and create sysfs entries
+ *
+ * @intf: the display interface
+ * @dev: the interface's "parent" display device
+ * @type: interface type (see enum @adf_interface_type)
+ * @idx: which interface of type @type;
+ * e.g. interface DSI.1 -> @type=%ADF_INTF_TYPE_DSI, @idx=1
+ * @flags: informational flags (bitmask of %ADF_INTF_FLAG_* values)
+ * @ops: the interface's associated ops
+ * @fmt: formatting string for the display interface's name
+ *
+ * @dev must have previously been initialized with adf_device_init().
+ *
+ * @fmt affects the name returned to userspace through the
+ * %ADF_GET_INTERFACE_DATA ioctl. It does not affect the sysfs filename,
+ * which is derived from @dev's name.
+ *
+ * Returns 0 on success or error code (<0) on failure.
+ */
+int adf_interface_init(struct adf_interface *intf, struct adf_device *dev,
+ enum adf_interface_type type, u32 idx, u32 flags,
+ const struct adf_interface_ops *ops, const char *fmt, ...)
+{
+ int ret;
+ va_list args;
+ const u32 allowed_flags = ADF_INTF_FLAG_PRIMARY |
+ ADF_INTF_FLAG_EXTERNAL;
+
+ if (dev->n_interfaces == ADF_MAX_INTERFACES) {
+ pr_err("%s: parent device %s has too many interfaces\n",
+ __func__, dev->base.name);
+ return -ENOMEM;
+ }
+
+ if (type >= ADF_INTF_MEMORY && type <= ADF_INTF_TYPE_DEVICE_CUSTOM) {
+ pr_err("%s: invalid interface type %u\n", __func__, type);
+ return -EINVAL;
+ }
+
+ if (flags & ~allowed_flags) {
+ pr_err("%s: invalid interface flags 0x%X\n", __func__,
+ flags & ~allowed_flags);
+ return -EINVAL;
+ }
+
+ memset(intf, 0, sizeof(*intf));
+
+ va_start(args, fmt);
+ ret = adf_obj_init(&intf->base, ADF_OBJ_INTERFACE, &dev->interfaces,
+ dev, ops ? &ops->base : NULL, fmt, args);
+ va_end(args);
+ if (ret < 0)
+ return ret;
+
+ intf->type = type;
+ intf->idx = idx;
+ intf->flags = flags;
+ intf->ops = ops;
+ intf->dpms_state = DRM_MODE_DPMS_OFF;
+ init_waitqueue_head(&intf->vsync_wait);
+ rwlock_init(&intf->vsync_lock);
+ rwlock_init(&intf->hotplug_modelist_lock);
+
+ ret = adf_interface_sysfs_init(intf);
+ if (ret < 0)
+ goto err;
+ dev->n_interfaces++;
+
+ return 0;
+
+err:
+ adf_obj_destroy(&intf->base, &dev->interfaces);
+ return ret;
+}
+EXPORT_SYMBOL(adf_interface_init);
+
+/**
+ * adf_interface_destroy - clean up ADF-internal data for a display interface
+ *
+ * @intf: the display interface
+ */
+void adf_interface_destroy(struct adf_interface *intf)
+{
+ struct adf_device *dev = adf_interface_parent(intf);
+ struct adf_attachment_list *entry, *next;
+
+ mutex_lock(&dev->client_lock);
+ list_for_each_entry_safe(entry, next, &dev->attach_allowed, head) {
+ if (entry->attachment.interface == intf) {
+ adf_attachment_free(entry);
+ dev->n_attach_allowed--;
+ }
+ }
+ list_for_each_entry_safe(entry, next, &dev->attached, head) {
+ if (entry->attachment.interface == intf) {
+ adf_device_detach_op(dev,
+ entry->attachment.overlay_engine, intf);
+ adf_attachment_free(entry);
+ dev->n_attached--;
+ }
+ }
+ kfree(intf->modelist);
+ adf_interface_sysfs_destroy(intf);
+ adf_obj_destroy(&intf->base, &dev->interfaces);
+ dev->n_interfaces--;
+ mutex_unlock(&dev->client_lock);
+}
+EXPORT_SYMBOL(adf_interface_destroy);
+
+static bool adf_overlay_engine_has_custom_formats(
+ const struct adf_overlay_engine_ops *ops)
+{
+ size_t i;
+ for (i = 0; i < ops->n_supported_formats; i++)
+ if (!adf_format_is_standard(ops->supported_formats[i]))
+ return true;
+ return false;
+}
+
+/**
+ * adf_overlay_engine_init - initialize ADF-internal data for an
+ * overlay engine and create sysfs entries
+ *
+ * @eng: the overlay engine
+ * @dev: the overlay engine's "parent" display device
+ * @ops: the overlay engine's associated ops
+ * @fmt: formatting string for the overlay engine's name
+ *
+ * @dev must have previously been initialized with adf_device_init().
+ *
+ * @fmt affects the name returned to userspace through the
+ * %ADF_GET_OVERLAY_ENGINE_DATA ioctl. It does not affect the sysfs filename,
+ * which is derived from @dev's name.
+ *
+ * Returns 0 on success or error code (<0) on failure.
+ */
+int adf_overlay_engine_init(struct adf_overlay_engine *eng,
+ struct adf_device *dev,
+ const struct adf_overlay_engine_ops *ops, const char *fmt, ...)
+{
+ int ret;
+ va_list args;
+
+ if (!ops->supported_formats) {
+ pr_err("%s: overlay engine must support at least one format\n",
+ __func__);
+ return -EINVAL;
+ }
+
+ if (ops->n_supported_formats > ADF_MAX_SUPPORTED_FORMATS) {
+ pr_err("%s: overlay engine supports too many formats\n",
+ __func__);
+ return -EINVAL;
+ }
+
+ if (adf_overlay_engine_has_custom_formats(ops) &&
+ !dev->ops->validate_custom_format) {
+ pr_err("%s: overlay engine has custom formats but parent device %s does not implement validate_custom_format\n",
+ __func__, dev->base.name);
+ return -EINVAL;
+ }
+
+ memset(eng, 0, sizeof(*eng));
+
+ va_start(args, fmt);
+ ret = adf_obj_init(&eng->base, ADF_OBJ_OVERLAY_ENGINE,
+ &dev->overlay_engines, dev, &ops->base, fmt, args);
+ va_end(args);
+ if (ret < 0)
+ return ret;
+
+ eng->ops = ops;
+
+ ret = adf_overlay_engine_sysfs_init(eng);
+ if (ret < 0)
+ goto err;
+
+ return 0;
+
+err:
+ adf_obj_destroy(&eng->base, &dev->overlay_engines);
+ return ret;
+}
+EXPORT_SYMBOL(adf_overlay_engine_init);
+
+/**
+ * adf_interface_destroy - clean up ADF-internal data for an overlay engine
+ *
+ * @eng: the overlay engine
+ */
+void adf_overlay_engine_destroy(struct adf_overlay_engine *eng)
+{
+ struct adf_device *dev = adf_overlay_engine_parent(eng);
+ struct adf_attachment_list *entry, *next;
+
+ mutex_lock(&dev->client_lock);
+ list_for_each_entry_safe(entry, next, &dev->attach_allowed, head) {
+ if (entry->attachment.overlay_engine == eng) {
+ adf_attachment_free(entry);
+ dev->n_attach_allowed--;
+ }
+ }
+ list_for_each_entry_safe(entry, next, &dev->attached, head) {
+ if (entry->attachment.overlay_engine == eng) {
+ adf_device_detach_op(dev, eng,
+ entry->attachment.interface);
+ adf_attachment_free(entry);
+ dev->n_attached--;
+ }
+ }
+ adf_overlay_engine_sysfs_destroy(eng);
+ adf_obj_destroy(&eng->base, &dev->overlay_engines);
+ mutex_unlock(&dev->client_lock);
+}
+EXPORT_SYMBOL(adf_overlay_engine_destroy);
+
+struct adf_attachment_list *adf_attachment_find(struct list_head *list,
+ struct adf_overlay_engine *eng, struct adf_interface *intf)
+{
+ struct adf_attachment_list *entry;
+ list_for_each_entry(entry, list, head) {
+ if (entry->attachment.interface == intf &&
+ entry->attachment.overlay_engine == eng)
+ return entry;
+ }
+ return NULL;
+}
+
+int adf_attachment_validate(struct adf_device *dev,
+ struct adf_overlay_engine *eng, struct adf_interface *intf)
+{
+ struct adf_device *intf_dev = adf_interface_parent(intf);
+ struct adf_device *eng_dev = adf_overlay_engine_parent(eng);
+
+ if (intf_dev != dev) {
+ dev_err(&dev->base.dev, "can't attach interface %s belonging to device %s\n",
+ intf->base.name, intf_dev->base.name);
+ return -EINVAL;
+ }
+
+ if (eng_dev != dev) {
+ dev_err(&dev->base.dev, "can't attach overlay engine %s belonging to device %s\n",
+ eng->base.name, eng_dev->base.name);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/**
+ * adf_attachment_allow - add a new entry to the list of allowed
+ * attachments
+ *
+ * @dev: the parent device
+ * @eng: the overlay engine
+ * @intf: the interface
+ *
+ * adf_attachment_allow() indicates that the underlying display hardware allows
+ * @intf to scan out @eng's output. It is intended to be called at
+ * driver initialization for each supported overlay engine + interface pair.
+ *
+ * Returns 0 on success, -%EALREADY if the entry already exists, or -errno on
+ * any other failure.
+ */
+int adf_attachment_allow(struct adf_device *dev,
+ struct adf_overlay_engine *eng, struct adf_interface *intf)
+{
+ int ret;
+ struct adf_attachment_list *entry = NULL;
+
+ ret = adf_attachment_validate(dev, eng, intf);
+ if (ret < 0)
+ return ret;
+
+ mutex_lock(&dev->client_lock);
+
+ if (dev->n_attach_allowed == ADF_MAX_ATTACHMENTS) {
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ if (adf_attachment_find(&dev->attach_allowed, eng, intf)) {
+ ret = -EALREADY;
+ goto done;
+ }
+
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry) {
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ entry->attachment.interface = intf;
+ entry->attachment.overlay_engine = eng;
+ list_add_tail(&entry->head, &dev->attach_allowed);
+ dev->n_attach_allowed++;
+
+done:
+ mutex_unlock(&dev->client_lock);
+ if (ret < 0)
+ kfree(entry);
+
+ return ret;
+}
+EXPORT_SYMBOL(adf_attachment_allow);
+
+/**
+ * adf_obj_type_str - string representation of an adf_obj_type
+ *
+ * @type: the object type
+ */
+const char *adf_obj_type_str(enum adf_obj_type type)
+{
+ switch (type) {
+ case ADF_OBJ_OVERLAY_ENGINE:
+ return "overlay engine";
+
+ case ADF_OBJ_INTERFACE:
+ return "interface";
+
+ case ADF_OBJ_DEVICE:
+ return "device";
+
+ default:
+ return "unknown";
+ }
+}
+EXPORT_SYMBOL(adf_obj_type_str);
+
+/**
+ * adf_interface_type_str - string representation of an adf_interface's type
+ *
+ * @intf: the interface
+ */
+const char *adf_interface_type_str(struct adf_interface *intf)
+{
+ switch (intf->type) {
+ case ADF_INTF_DSI:
+ return "DSI";
+
+ case ADF_INTF_eDP:
+ return "eDP";
+
+ case ADF_INTF_DPI:
+ return "DPI";
+
+ case ADF_INTF_VGA:
+ return "VGA";
+
+ case ADF_INTF_DVI:
+ return "DVI";
+
+ case ADF_INTF_HDMI:
+ return "HDMI";
+
+ case ADF_INTF_MEMORY:
+ return "memory";
+
+ default:
+ if (intf->type >= ADF_INTF_TYPE_DEVICE_CUSTOM) {
+ if (intf->ops && intf->ops->type_str)
+ return intf->ops->type_str(intf);
+ return "custom";
+ }
+ return "unknown";
+ }
+}
+EXPORT_SYMBOL(adf_interface_type_str);
+
+/**
+ * adf_event_type_str - string representation of an adf_event_type
+ *
+ * @obj: ADF object that produced the event
+ * @type: event type
+ */
+const char *adf_event_type_str(struct adf_obj *obj, enum adf_event_type type)
+{
+ switch (type) {
+ case ADF_EVENT_VSYNC:
+ return "vsync";
+
+ case ADF_EVENT_HOTPLUG:
+ return "hotplug";
+
+ default:
+ if (type >= ADF_EVENT_DEVICE_CUSTOM) {
+ if (obj->ops && obj->ops->event_type_str)
+ return obj->ops->event_type_str(obj, type);
+ return "custom";
+ }
+ return "unknown";
+ }
+}
+EXPORT_SYMBOL(adf_event_type_str);
+
+/**
+ * adf_format_str - string representation of an ADF/DRM fourcc format
+ *
+ * @format: format fourcc
+ * @buf: target buffer for the format's string representation
+ */
+void adf_format_str(u32 format, char buf[ADF_FORMAT_STR_SIZE])
+{
+ buf[0] = format & 0xFF;
+ buf[1] = (format >> 8) & 0xFF;
+ buf[2] = (format >> 16) & 0xFF;
+ buf[3] = (format >> 24) & 0xFF;
+ buf[4] = '\0';
+}
+EXPORT_SYMBOL(adf_format_str);
+
+/**
+ * adf_format_validate_yuv - validate the number and size of planes in buffers
+ * with a custom YUV format.
+ *
+ * @dev: ADF device performing the validation
+ * @buf: buffer to validate
+ * @num_planes: expected number of planes
+ * @hsub: expected horizontal chroma subsampling factor, in pixels
+ * @vsub: expected vertical chroma subsampling factor, in pixels
+ * @cpp: expected bytes per pixel for each plane (length @num_planes)
+ *
+ * adf_format_validate_yuv() is intended to be called as a helper from @dev's
+ * validate_custom_format() op.
+ *
+ * Returns 0 if @buf has the expected number of planes and each plane
+ * has sufficient size, or -EINVAL otherwise.
+ */
+int adf_format_validate_yuv(struct adf_device *dev, struct adf_buffer *buf,
+ u8 num_planes, u8 hsub, u8 vsub, u8 cpp[])
+{
+ u8 i;
+
+ if (num_planes != buf->n_planes) {
+ char format_str[ADF_FORMAT_STR_SIZE];
+ adf_format_str(buf->format, format_str);
+ dev_err(&dev->base.dev, "%u planes expected for format %s but %u planes provided\n",
+ num_planes, format_str, buf->n_planes);
+ return -EINVAL;
+ }
+
+ if (buf->w == 0 || buf->w % hsub) {
+ dev_err(&dev->base.dev, "bad buffer width %u\n", buf->w);
+ return -EINVAL;
+ }
+
+ if (buf->h == 0 || buf->h % vsub) {
+ dev_err(&dev->base.dev, "bad buffer height %u\n", buf->h);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < num_planes; i++) {
+ u32 width = buf->w / (i != 0 ? hsub : 1);
+ u32 height = buf->h / (i != 0 ? vsub : 1);
+ u8 cpp = adf_format_plane_cpp(buf->format, i);
+ u32 last_line_size;
+
+ if (buf->pitch[i] < (u64) width * cpp) {
+ dev_err(&dev->base.dev, "plane %u pitch is shorter than buffer width (pitch = %u, width = %u, bpp = %u)\n",
+ i, buf->pitch[i], width, cpp * 8);
+ return -EINVAL;
+ }
+
+ switch (dev->ops->quirks.buffer_padding) {
+ case ADF_BUFFER_PADDED_TO_PITCH:
+ last_line_size = buf->pitch[i];
+ break;
+
+ case ADF_BUFFER_UNPADDED:
+ last_line_size = width * cpp;
+ break;
+
+ default:
+ BUG();
+ }
+
+ if ((u64) (height - 1) * buf->pitch[i] + last_line_size +
+ buf->offset[i] > buf->dma_bufs[i]->size) {
+ dev_err(&dev->base.dev, "plane %u buffer too small (height = %u, pitch = %u, offset = %u, size = %zu)\n",
+ i, height, buf->pitch[i],
+ buf->offset[i], buf->dma_bufs[i]->size);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(adf_format_validate_yuv);
+
+/**
+ * adf_modeinfo_set_name - sets the name of a mode from its display resolution
+ *
+ * @mode: mode
+ *
+ * adf_modeinfo_set_name() fills in @mode->name in the format
+ * "[hdisplay]x[vdisplay](i)". It is intended to help drivers create
+ * ADF/DRM-style modelists from other mode formats.
+ */
+void adf_modeinfo_set_name(struct drm_mode_modeinfo *mode)
+{
+ bool interlaced = mode->flags & DRM_MODE_FLAG_INTERLACE;
+
+ snprintf(mode->name, DRM_DISPLAY_MODE_LEN, "%dx%d%s",
+ mode->hdisplay, mode->vdisplay,
+ interlaced ? "i" : "");
+}
+EXPORT_SYMBOL(adf_modeinfo_set_name);
+
+/**
+ * adf_modeinfo_set_vrefresh - sets the vrefresh of a mode from its other
+ * timing data
+ *
+ * @mode: mode
+ *
+ * adf_modeinfo_set_vrefresh() calculates @mode->vrefresh from
+ * @mode->{h,v}display and @mode->flags. It is intended to help drivers
+ * create ADF/DRM-style modelists from other mode formats.
+ */
+void adf_modeinfo_set_vrefresh(struct drm_mode_modeinfo *mode)
+{
+ int refresh = 0;
+ unsigned int calc_val;
+
+ if (mode->vrefresh > 0)
+ return;
+
+ if (mode->htotal <= 0 || mode->vtotal <= 0)
+ return;
+
+ /* work out vrefresh the value will be x1000 */
+ calc_val = (mode->clock * 1000);
+ calc_val /= mode->htotal;
+ refresh = (calc_val + mode->vtotal / 2) / mode->vtotal;
+
+ if (mode->flags & DRM_MODE_FLAG_INTERLACE)
+ refresh *= 2;
+ if (mode->flags & DRM_MODE_FLAG_DBLSCAN)
+ refresh /= 2;
+ if (mode->vscan > 1)
+ refresh /= mode->vscan;
+
+ mode->vrefresh = refresh;
+}
+EXPORT_SYMBOL(adf_modeinfo_set_vrefresh);
+
+static int __init adf_init(void)
+{
+ int err;
+
+ err = adf_sysfs_init();
+ if (err < 0)
+ return err;
+
+ return 0;
+}
+
+static void __exit adf_exit(void)
+{
+ adf_sysfs_destroy();
+}
+
+module_init(adf_init);
+module_exit(adf_exit);
diff --git a/drivers/video/adf/adf.h b/drivers/video/adf/adf.h
new file mode 100644
index 000000000000..3bcf1fabc23c
--- /dev/null
+++ b/drivers/video/adf/adf.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (C) 2013 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef __VIDEO_ADF_ADF_H
+#define __VIDEO_ADF_ADF_H
+
+#include <linux/idr.h>
+#include <linux/list.h>
+#include <video/adf.h>
+#include "sync.h"
+
+struct adf_event_refcount {
+ struct rb_node node;
+ enum adf_event_type type;
+ int refcount;
+};
+
+void adf_buffer_cleanup(struct adf_buffer *buf);
+void adf_buffer_mapping_cleanup(struct adf_buffer_mapping *mapping,
+ struct adf_buffer *buf);
+void adf_post_cleanup(struct adf_device *dev, struct adf_pending_post *post);
+
+struct adf_attachment_list *adf_attachment_find(struct list_head *list,
+ struct adf_overlay_engine *eng, struct adf_interface *intf);
+int adf_attachment_validate(struct adf_device *dev,
+ struct adf_overlay_engine *eng, struct adf_interface *intf);
+void adf_attachment_free(struct adf_attachment_list *attachment);
+
+struct adf_event_refcount *adf_obj_find_event_refcount(struct adf_obj *obj,
+ enum adf_event_type type);
+
+static inline int adf_obj_check_supports_event(struct adf_obj *obj,
+ enum adf_event_type type)
+{
+ if (!obj->ops || !obj->ops->supports_event)
+ return -EOPNOTSUPP;
+ if (!obj->ops->supports_event(obj, type))
+ return -EINVAL;
+ return 0;
+}
+
+static inline int adf_device_attach_op(struct adf_device *dev,
+ struct adf_overlay_engine *eng, struct adf_interface *intf)
+{
+ if (!dev->ops->attach)
+ return 0;
+
+ return dev->ops->attach(dev, eng, intf);
+}
+
+static inline int adf_device_detach_op(struct adf_device *dev,
+ struct adf_overlay_engine *eng, struct adf_interface *intf)
+{
+ if (!dev->ops->detach)
+ return 0;
+
+ return dev->ops->detach(dev, eng, intf);
+}
+
+#endif /* __VIDEO_ADF_ADF_H */
diff --git a/drivers/video/adf/adf_client.c b/drivers/video/adf/adf_client.c
new file mode 100644
index 000000000000..8061d8e6b9fb
--- /dev/null
+++ b/drivers/video/adf/adf_client.c
@@ -0,0 +1,811 @@
+/*
+ * Copyright (C) 2013 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+
+#include "sw_sync.h"
+
+#include <video/adf.h>
+#include <video/adf_client.h>
+#include <video/adf_format.h>
+
+#include "adf.h"
+
+static inline bool vsync_active(u8 state)
+{
+ return state == DRM_MODE_DPMS_ON || state == DRM_MODE_DPMS_STANDBY;
+}
+
+/**
+ * adf_interface_blank - set interface's DPMS state
+ *
+ * @intf: the interface
+ * @state: one of %DRM_MODE_DPMS_*
+ *
+ * Returns 0 on success or -errno on failure.
+ */
+int adf_interface_blank(struct adf_interface *intf, u8 state)
+{
+ struct adf_device *dev = adf_interface_parent(intf);
+ u8 prev_state;
+ bool disable_vsync;
+ bool enable_vsync;
+ int ret = 0;
+ struct adf_event_refcount *vsync_refcount;
+
+ if (!intf->ops || !intf->ops->blank)
+ return -EOPNOTSUPP;
+
+ if (state > DRM_MODE_DPMS_OFF)
+ return -EINVAL;
+
+ mutex_lock(&dev->client_lock);
+ if (state != DRM_MODE_DPMS_ON)
+ flush_kthread_worker(&dev->post_worker);
+ mutex_lock(&intf->base.event_lock);
+
+ vsync_refcount = adf_obj_find_event_refcount(&intf->base,
+ ADF_EVENT_VSYNC);
+ if (!vsync_refcount) {
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ prev_state = intf->dpms_state;
+ if (prev_state == state) {
+ ret = -EBUSY;
+ goto done;
+ }
+
+ disable_vsync = vsync_active(prev_state) &&
+ !vsync_active(state) &&
+ vsync_refcount->refcount;
+ enable_vsync = !vsync_active(prev_state) &&
+ vsync_active(state) &&
+ vsync_refcount->refcount;
+
+ if (disable_vsync)
+ intf->base.ops->set_event(&intf->base, ADF_EVENT_VSYNC,
+ false);
+
+ ret = intf->ops->blank(intf, state);
+ if (ret < 0) {
+ if (disable_vsync)
+ intf->base.ops->set_event(&intf->base, ADF_EVENT_VSYNC,
+ true);
+ goto done;
+ }
+
+ if (enable_vsync)
+ intf->base.ops->set_event(&intf->base, ADF_EVENT_VSYNC,
+ true);
+
+ intf->dpms_state = state;
+done:
+ mutex_unlock(&intf->base.event_lock);
+ mutex_unlock(&dev->client_lock);
+ return ret;
+}
+EXPORT_SYMBOL(adf_interface_blank);
+
+/**
+ * adf_interface_blank - get interface's current DPMS state
+ *
+ * @intf: the interface
+ *
+ * Returns one of %DRM_MODE_DPMS_*.
+ */
+u8 adf_interface_dpms_state(struct adf_interface *intf)
+{
+ struct adf_device *dev = adf_interface_parent(intf);
+ u8 dpms_state;
+
+ mutex_lock(&dev->client_lock);
+ dpms_state = intf->dpms_state;
+ mutex_unlock(&dev->client_lock);
+
+ return dpms_state;
+}
+EXPORT_SYMBOL(adf_interface_dpms_state);
+
+/**
+ * adf_interface_current_mode - get interface's current display mode
+ *
+ * @intf: the interface
+ * @mode: returns the current mode
+ */
+void adf_interface_current_mode(struct adf_interface *intf,
+ struct drm_mode_modeinfo *mode)
+{
+ struct adf_device *dev = adf_interface_parent(intf);
+
+ mutex_lock(&dev->client_lock);
+ memcpy(mode, &intf->current_mode, sizeof(*mode));
+ mutex_unlock(&dev->client_lock);
+}
+EXPORT_SYMBOL(adf_interface_current_mode);
+
+/**
+ * adf_interface_modelist - get interface's modelist
+ *
+ * @intf: the interface
+ * @modelist: storage for the modelist (optional)
+ * @n_modes: length of @modelist
+ *
+ * If @modelist is not NULL, adf_interface_modelist() will copy up to @n_modes
+ * modelist entries into @modelist.
+ *
+ * Returns the length of the modelist.
+ */
+size_t adf_interface_modelist(struct adf_interface *intf,
+ struct drm_mode_modeinfo *modelist, size_t n_modes)
+{
+ unsigned long flags;
+ size_t retval;
+
+ read_lock_irqsave(&intf->hotplug_modelist_lock, flags);
+ if (modelist)
+ memcpy(modelist, intf->modelist, sizeof(modelist[0]) *
+ min(n_modes, intf->n_modes));
+ retval = intf->n_modes;
+ read_unlock_irqrestore(&intf->hotplug_modelist_lock, flags);
+
+ return retval;
+}
+EXPORT_SYMBOL(adf_interface_modelist);
+
+/**
+ * adf_interface_set_mode - set interface's display mode
+ *
+ * @intf: the interface
+ * @mode: the new mode
+ *
+ * Returns 0 on success or -errno on failure.
+ */
+int adf_interface_set_mode(struct adf_interface *intf,
+ struct drm_mode_modeinfo *mode)
+{
+ struct adf_device *dev = adf_interface_parent(intf);
+ int ret = 0;
+
+ if (!intf->ops || !intf->ops->modeset)
+ return -EOPNOTSUPP;
+
+ mutex_lock(&dev->client_lock);
+ flush_kthread_worker(&dev->post_worker);
+
+ ret = intf->ops->modeset(intf, mode);
+ if (ret < 0)
+ goto done;
+
+ memcpy(&intf->current_mode, mode, sizeof(*mode));
+done:
+ mutex_unlock(&dev->client_lock);
+ return ret;
+}
+EXPORT_SYMBOL(adf_interface_set_mode);
+
+/**
+ * adf_interface_screen_size - get size of screen connected to interface
+ *
+ * @intf: the interface
+ * @width_mm: returns the screen width in mm
+ * @height_mm: returns the screen width in mm
+ *
+ * Returns 0 on success or -errno on failure.
+ */
+int adf_interface_get_screen_size(struct adf_interface *intf, u16 *width_mm,
+ u16 *height_mm)
+{
+ struct adf_device *dev = adf_interface_parent(intf);
+ int ret;
+
+ if (!intf->ops || !intf->ops->screen_size)
+ return -EOPNOTSUPP;
+
+ mutex_lock(&dev->client_lock);
+ ret = intf->ops->screen_size(intf, width_mm, height_mm);
+ mutex_unlock(&dev->client_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL(adf_interface_get_screen_size);
+
+/**
+ * adf_overlay_engine_supports_format - returns whether a format is in an
+ * overlay engine's supported list
+ *
+ * @eng: the overlay engine
+ * @format: format fourcc
+ */
+bool adf_overlay_engine_supports_format(struct adf_overlay_engine *eng,
+ u32 format)
+{
+ size_t i;
+ for (i = 0; i < eng->ops->n_supported_formats; i++)
+ if (format == eng->ops->supported_formats[i])
+ return true;
+
+ return false;
+}
+EXPORT_SYMBOL(adf_overlay_engine_supports_format);
+
+static int adf_buffer_validate(struct adf_buffer *buf)
+{
+ struct adf_overlay_engine *eng = buf->overlay_engine;
+ struct device *dev = &eng->base.dev;
+ struct adf_device *parent = adf_overlay_engine_parent(eng);
+ u8 hsub, vsub, num_planes, cpp[ADF_MAX_PLANES], i;
+
+ if (!adf_overlay_engine_supports_format(eng, buf->format)) {
+ char format_str[ADF_FORMAT_STR_SIZE];
+ adf_format_str(buf->format, format_str);
+ dev_err(dev, "unsupported format %s\n", format_str);
+ return -EINVAL;
+ }
+
+ if (!adf_format_is_standard(buf->format))
+ return parent->ops->validate_custom_format(parent, buf);
+
+ hsub = adf_format_horz_chroma_subsampling(buf->format);
+ vsub = adf_format_vert_chroma_subsampling(buf->format);
+ num_planes = adf_format_num_planes(buf->format);
+ for (i = 0; i < num_planes; i++)
+ cpp[i] = adf_format_plane_cpp(buf->format, i);
+
+ return adf_format_validate_yuv(parent, buf, num_planes, hsub, vsub,
+ cpp);
+}
+
+static int adf_buffer_map(struct adf_device *dev, struct adf_buffer *buf,
+ struct adf_buffer_mapping *mapping)
+{
+ int ret = 0;
+ size_t i;
+
+ for (i = 0; i < buf->n_planes; i++) {
+ struct dma_buf_attachment *attachment;
+ struct sg_table *sg_table;
+
+ attachment = dma_buf_attach(buf->dma_bufs[i], dev->dev);
+ if (IS_ERR(attachment)) {
+ ret = PTR_ERR(attachment);
+ dev_err(&dev->base.dev, "attaching plane %zu failed: %d\n",
+ i, ret);
+ goto done;
+ }
+ mapping->attachments[i] = attachment;
+
+ sg_table = dma_buf_map_attachment(attachment, DMA_TO_DEVICE);
+ if (IS_ERR(sg_table)) {
+ ret = PTR_ERR(sg_table);
+ dev_err(&dev->base.dev, "mapping plane %zu failed: %d",
+ i, ret);
+ goto done;
+ } else if (!sg_table) {
+ ret = -ENOMEM;
+ dev_err(&dev->base.dev, "mapping plane %zu failed\n",
+ i);
+ goto done;
+ }
+ mapping->sg_tables[i] = sg_table;
+ }
+
+done:
+ if (ret < 0)
+ adf_buffer_mapping_cleanup(mapping, buf);
+
+ return ret;
+}
+
+static struct sync_fence *adf_sw_complete_fence(struct adf_device *dev)
+{
+ struct sync_pt *pt;
+ struct sync_fence *complete_fence;
+
+ if (!dev->timeline) {
+ dev->timeline = sw_sync_timeline_create(dev->base.name);
+ if (!dev->timeline)
+ return ERR_PTR(-ENOMEM);
+ dev->timeline_max = 1;
+ }
+
+ dev->timeline_max++;
+ pt = sw_sync_pt_create(dev->timeline, dev->timeline_max);
+ if (!pt)
+ goto err_pt_create;
+ complete_fence = sync_fence_create(dev->base.name, pt);
+ if (!complete_fence)
+ goto err_fence_create;
+
+ return complete_fence;
+
+err_fence_create:
+ sync_pt_free(pt);
+err_pt_create:
+ dev->timeline_max--;
+ return ERR_PTR(-ENOSYS);
+}
+
+/**
+ * adf_device_post - flip to a new set of buffers
+ *
+ * @dev: device targeted by the flip
+ * @intfs: interfaces targeted by the flip
+ * @n_intfs: number of targeted interfaces
+ * @bufs: description of buffers displayed
+ * @n_bufs: number of buffers displayed
+ * @custom_data: driver-private data
+ * @custom_data_size: size of driver-private data
+ *
+ * adf_device_post() will copy @intfs, @bufs, and @custom_data, so they may
+ * point to variables on the stack. adf_device_post() also takes its own
+ * reference on each of the dma-bufs in @bufs. The adf_device_post_nocopy()
+ * variant transfers ownership of these resources to ADF instead.
+ *
+ * On success, returns a sync fence which signals when the buffers are removed
+ * from the screen. On failure, returns ERR_PTR(-errno).
+ */
+struct sync_fence *adf_device_post(struct adf_device *dev,
+ struct adf_interface **intfs, size_t n_intfs,
+ struct adf_buffer *bufs, size_t n_bufs, void *custom_data,
+ size_t custom_data_size)
+{
+ struct adf_interface **intfs_copy = NULL;
+ struct adf_buffer *bufs_copy = NULL;
+ void *custom_data_copy = NULL;
+ struct sync_fence *ret;
+ size_t i;
+
+ intfs_copy = kzalloc(sizeof(intfs_copy[0]) * n_intfs, GFP_KERNEL);
+ if (!intfs_copy)
+ return ERR_PTR(-ENOMEM);
+
+ bufs_copy = kzalloc(sizeof(bufs_copy[0]) * n_bufs, GFP_KERNEL);
+ if (!bufs_copy) {
+ ret = ERR_PTR(-ENOMEM);
+ goto err_alloc;
+ }
+
+ custom_data_copy = kzalloc(custom_data_size, GFP_KERNEL);
+ if (!custom_data_copy) {
+ ret = ERR_PTR(-ENOMEM);
+ goto err_alloc;
+ }
+
+ for (i = 0; i < n_bufs; i++) {
+ size_t j;
+ for (j = 0; j < bufs[i].n_planes; j++)
+ get_dma_buf(bufs[i].dma_bufs[j]);
+ }
+
+ memcpy(intfs_copy, intfs, sizeof(intfs_copy[0]) * n_intfs);
+ memcpy(bufs_copy, bufs, sizeof(bufs_copy[0]) * n_bufs);
+ memcpy(custom_data_copy, custom_data, custom_data_size);
+
+ ret = adf_device_post_nocopy(dev, intfs_copy, n_intfs, bufs_copy,
+ n_bufs, custom_data_copy, custom_data_size);
+ if (IS_ERR(ret))
+ goto err_post;
+
+ return ret;
+
+err_post:
+ for (i = 0; i < n_bufs; i++) {
+ size_t j;
+ for (j = 0; j < bufs[i].n_planes; j++)
+ dma_buf_put(bufs[i].dma_bufs[j]);
+ }
+err_alloc:
+ kfree(custom_data_copy);
+ kfree(bufs_copy);
+ kfree(intfs_copy);
+ return ret;
+}
+EXPORT_SYMBOL(adf_device_post);
+
+/**
+ * adf_device_post_nocopy - flip to a new set of buffers
+ *
+ * adf_device_post_nocopy() has the same behavior as adf_device_post(),
+ * except ADF does not copy @intfs, @bufs, or @custom_data, and it does
+ * not take an extra reference on the dma-bufs in @bufs.
+ *
+ * @intfs, @bufs, and @custom_data must point to buffers allocated by
+ * kmalloc(). On success, ADF takes ownership of these buffers and the dma-bufs
+ * in @bufs, and will kfree()/dma_buf_put() them when they are no longer needed.
+ * On failure, adf_device_post_nocopy() does NOT take ownership of these
+ * buffers or the dma-bufs, and the caller must clean them up.
+ *
+ * adf_device_post_nocopy() is mainly intended for implementing ADF's ioctls.
+ * Clients may find the nocopy variant useful in limited cases, but most should
+ * call adf_device_post() instead.
+ */
+struct sync_fence *adf_device_post_nocopy(struct adf_device *dev,
+ struct adf_interface **intfs, size_t n_intfs,
+ struct adf_buffer *bufs, size_t n_bufs,
+ void *custom_data, size_t custom_data_size)
+{
+ struct adf_pending_post *cfg;
+ struct adf_buffer_mapping *mappings;
+ struct sync_fence *ret;
+ size_t i;
+ int err;
+
+ cfg = kzalloc(sizeof(*cfg), GFP_KERNEL);
+ if (!cfg)
+ return ERR_PTR(-ENOMEM);
+
+ mappings = kzalloc(sizeof(mappings[0]) * n_bufs, GFP_KERNEL);
+ if (!mappings) {
+ ret = ERR_PTR(-ENOMEM);
+ goto err_alloc;
+ }
+
+ mutex_lock(&dev->client_lock);
+
+ for (i = 0; i < n_bufs; i++) {
+ err = adf_buffer_validate(&bufs[i]);
+ if (err < 0) {
+ ret = ERR_PTR(err);
+ goto err_buf;
+ }
+
+ err = adf_buffer_map(dev, &bufs[i], &mappings[i]);
+ if (err < 0) {
+ ret = ERR_PTR(err);
+ goto err_buf;
+ }
+ }
+
+ INIT_LIST_HEAD(&cfg->head);
+ cfg->config.n_bufs = n_bufs;
+ cfg->config.bufs = bufs;
+ cfg->config.mappings = mappings;
+ cfg->config.custom_data = custom_data;
+ cfg->config.custom_data_size = custom_data_size;
+
+ err = dev->ops->validate(dev, &cfg->config, &cfg->state);
+ if (err < 0) {
+ ret = ERR_PTR(err);
+ goto err_buf;
+ }
+
+ mutex_lock(&dev->post_lock);
+
+ if (dev->ops->complete_fence)
+ ret = dev->ops->complete_fence(dev, &cfg->config,
+ cfg->state);
+ else
+ ret = adf_sw_complete_fence(dev);
+
+ if (IS_ERR(ret))
+ goto err_fence;
+
+ list_add_tail(&cfg->head, &dev->post_list);
+ queue_kthread_work(&dev->post_worker, &dev->post_work);
+ mutex_unlock(&dev->post_lock);
+ mutex_unlock(&dev->client_lock);
+ kfree(intfs);
+ return ret;
+
+err_fence:
+ mutex_unlock(&dev->post_lock);
+
+err_buf:
+ for (i = 0; i < n_bufs; i++)
+ adf_buffer_mapping_cleanup(&mappings[i], &bufs[i]);
+
+ mutex_unlock(&dev->client_lock);
+ kfree(mappings);
+
+err_alloc:
+ kfree(cfg);
+ return ret;
+}
+EXPORT_SYMBOL(adf_device_post_nocopy);
+
+static void adf_attachment_list_to_array(struct adf_device *dev,
+ struct list_head *src, struct adf_attachment *dst, size_t size)
+{
+ struct adf_attachment_list *entry;
+ size_t i = 0;
+
+ if (!dst)
+ return;
+
+ list_for_each_entry(entry, src, head) {
+ if (i == size)
+ return;
+ dst[i] = entry->attachment;
+ i++;
+ }
+}
+
+/**
+ * adf_device_attachments - get device's list of active attachments
+ *
+ * @dev: the device
+ * @attachments: storage for the attachment list (optional)
+ * @n_attachments: length of @attachments
+ *
+ * If @attachments is not NULL, adf_device_attachments() will copy up to
+ * @n_attachments entries into @attachments.
+ *
+ * Returns the length of the active attachment list.
+ */
+size_t adf_device_attachments(struct adf_device *dev,
+ struct adf_attachment *attachments, size_t n_attachments)
+{
+ size_t retval;
+
+ mutex_lock(&dev->client_lock);
+ adf_attachment_list_to_array(dev, &dev->attached, attachments,
+ n_attachments);
+ retval = dev->n_attached;
+ mutex_unlock(&dev->client_lock);
+
+ return retval;
+}
+EXPORT_SYMBOL(adf_device_attachments);
+
+/**
+ * adf_device_attachments_allowed - get device's list of allowed attachments
+ *
+ * @dev: the device
+ * @attachments: storage for the attachment list (optional)
+ * @n_attachments: length of @attachments
+ *
+ * If @attachments is not NULL, adf_device_attachments_allowed() will copy up to
+ * @n_attachments entries into @attachments.
+ *
+ * Returns the length of the allowed attachment list.
+ */
+size_t adf_device_attachments_allowed(struct adf_device *dev,
+ struct adf_attachment *attachments, size_t n_attachments)
+{
+ size_t retval;
+
+ mutex_lock(&dev->client_lock);
+ adf_attachment_list_to_array(dev, &dev->attach_allowed, attachments,
+ n_attachments);
+ retval = dev->n_attach_allowed;
+ mutex_unlock(&dev->client_lock);
+
+ return retval;
+}
+EXPORT_SYMBOL(adf_device_attachments_allowed);
+
+/**
+ * adf_device_attached - return whether an overlay engine and interface are
+ * attached
+ *
+ * @dev: the parent device
+ * @eng: the overlay engine
+ * @intf: the interface
+ */
+bool adf_device_attached(struct adf_device *dev, struct adf_overlay_engine *eng,
+ struct adf_interface *intf)
+{
+ struct adf_attachment_list *attachment;
+
+ mutex_lock(&dev->client_lock);
+ attachment = adf_attachment_find(&dev->attached, eng, intf);
+ mutex_unlock(&dev->client_lock);
+
+ return attachment != NULL;
+}
+EXPORT_SYMBOL(adf_device_attached);
+
+/**
+ * adf_device_attach_allowed - return whether the ADF device supports attaching
+ * an overlay engine and interface
+ *
+ * @dev: the parent device
+ * @eng: the overlay engine
+ * @intf: the interface
+ */
+bool adf_device_attach_allowed(struct adf_device *dev,
+ struct adf_overlay_engine *eng, struct adf_interface *intf)
+{
+ struct adf_attachment_list *attachment;
+
+ mutex_lock(&dev->client_lock);
+ attachment = adf_attachment_find(&dev->attach_allowed, eng, intf);
+ mutex_unlock(&dev->client_lock);
+
+ return attachment != NULL;
+}
+EXPORT_SYMBOL(adf_device_attach_allowed);
+/**
+ * adf_device_attach - attach an overlay engine to an interface
+ *
+ * @dev: the parent device
+ * @eng: the overlay engine
+ * @intf: the interface
+ *
+ * Returns 0 on success, -%EINVAL if attaching @intf and @eng is not allowed,
+ * -%EALREADY if @intf and @eng are already attached, or -errno on any other
+ * failure.
+ */
+int adf_device_attach(struct adf_device *dev, struct adf_overlay_engine *eng,
+ struct adf_interface *intf)
+{
+ int ret;
+ struct adf_attachment_list *attachment = NULL;
+
+ ret = adf_attachment_validate(dev, eng, intf);
+ if (ret < 0)
+ return ret;
+
+ mutex_lock(&dev->client_lock);
+
+ if (dev->n_attached == ADF_MAX_ATTACHMENTS) {
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ if (!adf_attachment_find(&dev->attach_allowed, eng, intf)) {
+ ret = -EINVAL;
+ goto done;
+ }
+
+ if (adf_attachment_find(&dev->attached, eng, intf)) {
+ ret = -EALREADY;
+ goto done;
+ }
+
+ ret = adf_device_attach_op(dev, eng, intf);
+ if (ret < 0)
+ goto done;
+
+ attachment = kzalloc(sizeof(*attachment), GFP_KERNEL);
+ if (!attachment) {
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ attachment->attachment.interface = intf;
+ attachment->attachment.overlay_engine = eng;
+ list_add_tail(&attachment->head, &dev->attached);
+ dev->n_attached++;
+
+done:
+ mutex_unlock(&dev->client_lock);
+ if (ret < 0)
+ kfree(attachment);
+
+ return ret;
+}
+EXPORT_SYMBOL(adf_device_attach);
+
+/**
+ * adf_device_detach - detach an overlay engine from an interface
+ *
+ * @dev: the parent device
+ * @eng: the overlay engine
+ * @intf: the interface
+ *
+ * Returns 0 on success, -%EINVAL if @intf and @eng are not attached,
+ * or -errno on any other failure.
+ */
+int adf_device_detach(struct adf_device *dev, struct adf_overlay_engine *eng,
+ struct adf_interface *intf)
+{
+ int ret;
+ struct adf_attachment_list *attachment;
+
+ ret = adf_attachment_validate(dev, eng, intf);
+ if (ret < 0)
+ return ret;
+
+ mutex_lock(&dev->client_lock);
+
+ attachment = adf_attachment_find(&dev->attached, eng, intf);
+ if (!attachment) {
+ ret = -EINVAL;
+ goto done;
+ }
+
+ ret = adf_device_detach_op(dev, eng, intf);
+ if (ret < 0)
+ goto done;
+
+ adf_attachment_free(attachment);
+ dev->n_attached--;
+done:
+ mutex_unlock(&dev->client_lock);
+ return ret;
+}
+EXPORT_SYMBOL(adf_device_detach);
+
+/**
+ * adf_interface_simple_buffer_alloc - allocate a simple buffer
+ *
+ * @intf: target interface
+ * @w: width in pixels
+ * @h: height in pixels
+ * @format: format fourcc
+ * @dma_buf: returns the allocated buffer
+ * @offset: returns the byte offset of the allocated buffer's first pixel
+ * @pitch: returns the allocated buffer's pitch
+ *
+ * See &struct adf_simple_buffer_alloc for a description of simple buffers and
+ * their limitations.
+ *
+ * Returns 0 on success or -errno on failure.
+ */
+int adf_interface_simple_buffer_alloc(struct adf_interface *intf, u16 w, u16 h,
+ u32 format, struct dma_buf **dma_buf, u32 *offset, u32 *pitch)
+{
+ if (!intf->ops || !intf->ops->alloc_simple_buffer)
+ return -EOPNOTSUPP;
+
+ if (!adf_format_is_rgb(format))
+ return -EINVAL;
+
+ return intf->ops->alloc_simple_buffer(intf, w, h, format, dma_buf,
+ offset, pitch);
+}
+EXPORT_SYMBOL(adf_interface_simple_buffer_alloc);
+
+/**
+ * adf_interface_simple_post - flip to a single buffer
+ *
+ * @intf: interface targeted by the flip
+ * @buf: buffer to display
+ *
+ * adf_interface_simple_post() can be used generically for simple display
+ * configurations, since the client does not need to provide any driver-private
+ * configuration data.
+ *
+ * adf_interface_simple_post() has the same copying semantics as
+ * adf_device_post().
+ *
+ * On success, returns a sync fence which signals when the buffer is removed
+ * from the screen. On failure, returns ERR_PTR(-errno).
+ */
+struct sync_fence *adf_interface_simple_post(struct adf_interface *intf,
+ struct adf_buffer *buf)
+{
+ size_t custom_data_size = 0;
+ void *custom_data = NULL;
+ struct sync_fence *ret;
+
+ if (intf->ops && intf->ops->describe_simple_post) {
+ int err;
+
+ custom_data = kzalloc(ADF_MAX_CUSTOM_DATA_SIZE, GFP_KERNEL);
+ if (!custom_data) {
+ ret = ERR_PTR(-ENOMEM);
+ goto done;
+ }
+
+ err = intf->ops->describe_simple_post(intf, buf, custom_data,
+ &custom_data_size);
+ if (err < 0) {
+ ret = ERR_PTR(err);
+ goto done;
+ }
+ }
+
+ ret = adf_device_post(adf_interface_parent(intf), &intf, 1, buf, 1,
+ custom_data, custom_data_size);
+done:
+ kfree(custom_data);
+ return ret;
+}
+EXPORT_SYMBOL(adf_interface_simple_post);
diff --git a/drivers/video/adf/adf_fbdev.c b/drivers/video/adf/adf_fbdev.c
new file mode 100644
index 000000000000..a5b53bc08c3f
--- /dev/null
+++ b/drivers/video/adf/adf_fbdev.c
@@ -0,0 +1,665 @@
+/*
+ * Copyright (C) 2013 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/vmalloc.h>
+
+#include <video/adf.h>
+#include <video/adf_client.h>
+#include <video/adf_fbdev.h>
+#include <video/adf_format.h>
+
+#include "adf.h"
+
+struct adf_fbdev_format {
+ u32 fourcc;
+ u32 bpp;
+ u32 r_length;
+ u32 g_length;
+ u32 b_length;
+ u32 a_length;
+ u32 r_offset;
+ u32 g_offset;
+ u32 b_offset;
+ u32 a_offset;
+};
+
+static const struct adf_fbdev_format format_table[] = {
+ {DRM_FORMAT_RGB332, 8, 3, 3, 2, 0, 5, 2, 0, 0},
+ {DRM_FORMAT_BGR233, 8, 3, 3, 2, 0, 0, 3, 5, 0},
+
+ {DRM_FORMAT_XRGB4444, 16, 4, 4, 4, 0, 8, 4, 0, 0},
+ {DRM_FORMAT_XBGR4444, 16, 4, 4, 4, 0, 0, 4, 8, 0},
+ {DRM_FORMAT_RGBX4444, 16, 4, 4, 4, 0, 12, 8, 4, 0},
+ {DRM_FORMAT_BGRX4444, 16, 4, 4, 4, 0, 0, 4, 8, 0},
+
+ {DRM_FORMAT_ARGB4444, 16, 4, 4, 4, 4, 8, 4, 0, 12},
+ {DRM_FORMAT_ABGR4444, 16, 4, 4, 4, 4, 0, 4, 8, 12},
+ {DRM_FORMAT_RGBA4444, 16, 4, 4, 4, 4, 12, 8, 4, 0},
+ {DRM_FORMAT_BGRA4444, 16, 4, 4, 4, 4, 0, 4, 8, 0},
+
+ {DRM_FORMAT_XRGB1555, 16, 5, 5, 5, 0, 10, 5, 0, 0},
+ {DRM_FORMAT_XBGR1555, 16, 5, 5, 5, 0, 0, 5, 10, 0},
+ {DRM_FORMAT_RGBX5551, 16, 5, 5, 5, 0, 11, 6, 1, 0},
+ {DRM_FORMAT_BGRX5551, 16, 5, 5, 5, 0, 1, 6, 11, 0},
+
+ {DRM_FORMAT_ARGB1555, 16, 5, 5, 5, 1, 10, 5, 0, 15},
+ {DRM_FORMAT_ABGR1555, 16, 5, 5, 5, 1, 0, 5, 10, 15},
+ {DRM_FORMAT_RGBA5551, 16, 5, 5, 5, 1, 11, 6, 1, 0},
+ {DRM_FORMAT_BGRA5551, 16, 5, 5, 5, 1, 1, 6, 11, 0},
+
+ {DRM_FORMAT_RGB565, 16, 5, 6, 5, 0, 11, 5, 0, 0},
+ {DRM_FORMAT_BGR565, 16, 5, 6, 5, 0, 0, 5, 11, 0},
+
+ {DRM_FORMAT_RGB888, 24, 8, 8, 8, 0, 16, 8, 0, 0},
+ {DRM_FORMAT_BGR888, 24, 8, 8, 8, 0, 0, 8, 16, 0},
+
+ {DRM_FORMAT_XRGB8888, 32, 8, 8, 8, 0, 16, 8, 0, 0},
+ {DRM_FORMAT_XBGR8888, 32, 8, 8, 8, 0, 0, 8, 16, 0},
+ {DRM_FORMAT_RGBX8888, 32, 8, 8, 8, 0, 24, 16, 8, 0},
+ {DRM_FORMAT_BGRX8888, 32, 8, 8, 8, 0, 8, 16, 24, 0},
+
+ {DRM_FORMAT_ARGB8888, 32, 8, 8, 8, 8, 16, 8, 0, 24},
+ {DRM_FORMAT_ABGR8888, 32, 8, 8, 8, 8, 0, 8, 16, 24},
+ {DRM_FORMAT_RGBA8888, 32, 8, 8, 8, 8, 24, 16, 8, 0},
+ {DRM_FORMAT_BGRA8888, 32, 8, 8, 8, 8, 8, 16, 24, 0},
+
+ {DRM_FORMAT_XRGB2101010, 32, 10, 10, 10, 0, 20, 10, 0, 0},
+ {DRM_FORMAT_XBGR2101010, 32, 10, 10, 10, 0, 0, 10, 20, 0},
+ {DRM_FORMAT_RGBX1010102, 32, 10, 10, 10, 0, 22, 12, 2, 0},
+ {DRM_FORMAT_BGRX1010102, 32, 10, 10, 10, 0, 2, 12, 22, 0},
+
+ {DRM_FORMAT_ARGB2101010, 32, 10, 10, 10, 2, 20, 10, 0, 30},
+ {DRM_FORMAT_ABGR2101010, 32, 10, 10, 10, 2, 0, 10, 20, 30},
+ {DRM_FORMAT_RGBA1010102, 32, 10, 10, 10, 2, 22, 12, 2, 0},
+ {DRM_FORMAT_BGRA1010102, 32, 10, 10, 10, 2, 2, 12, 22, 0},
+};
+
+static u32 drm_fourcc_from_fb_var(struct fb_var_screeninfo *var)
+{
+ size_t i;
+ for (i = 0; i < ARRAY_SIZE(format_table); i++) {
+ const struct adf_fbdev_format *f = &format_table[i];
+ if (var->red.length == f->r_length &&
+ var->red.offset == f->r_offset &&
+ var->green.length == f->g_length &&
+ var->green.offset == f->g_offset &&
+ var->blue.length == f->b_length &&
+ var->blue.offset == f->b_offset &&
+ var->transp.length == f->a_length &&
+ (var->transp.length == 0 ||
+ var->transp.offset == f->a_offset))
+ return f->fourcc;
+ }
+
+ return 0;
+}
+
+static const struct adf_fbdev_format *fbdev_format_info(u32 format)
+{
+ size_t i;
+ for (i = 0; i < ARRAY_SIZE(format_table); i++) {
+ const struct adf_fbdev_format *f = &format_table[i];
+ if (f->fourcc == format)
+ return f;
+ }
+
+ BUG();
+}
+
+void adf_modeinfo_to_fb_videomode(const struct drm_mode_modeinfo *mode,
+ struct fb_videomode *vmode)
+{
+ memset(vmode, 0, sizeof(*vmode));
+
+ vmode->refresh = mode->vrefresh;
+
+ vmode->xres = mode->hdisplay;
+ vmode->yres = mode->vdisplay;
+
+ vmode->pixclock = mode->clock ? KHZ2PICOS(mode->clock) : 0;
+ vmode->left_margin = mode->htotal - mode->hsync_end;
+ vmode->right_margin = mode->hsync_start - mode->hdisplay;
+ vmode->upper_margin = mode->vtotal - mode->vsync_end;
+ vmode->lower_margin = mode->vsync_start - mode->vdisplay;
+ vmode->hsync_len = mode->hsync_end - mode->hsync_start;
+ vmode->vsync_len = mode->vsync_end - mode->vsync_start;
+
+ vmode->sync = 0;
+ if (mode->flags & DRM_MODE_FLAG_PHSYNC)
+ vmode->sync |= FB_SYNC_HOR_HIGH_ACT;
+ if (mode->flags & DRM_MODE_FLAG_PVSYNC)
+ vmode->sync |= FB_SYNC_VERT_HIGH_ACT;
+ if (mode->flags & DRM_MODE_FLAG_PCSYNC)
+ vmode->sync |= FB_SYNC_COMP_HIGH_ACT;
+ if (mode->flags & DRM_MODE_FLAG_BCAST)
+ vmode->sync |= FB_SYNC_BROADCAST;
+
+ vmode->vmode = 0;
+ if (mode->flags & DRM_MODE_FLAG_INTERLACE)
+ vmode->vmode |= FB_VMODE_INTERLACED;
+ if (mode->flags & DRM_MODE_FLAG_DBLSCAN)
+ vmode->vmode |= FB_VMODE_DOUBLE;
+}
+EXPORT_SYMBOL(adf_modeinfo_to_fb_videomode);
+
+void adf_modeinfo_from_fb_videomode(const struct fb_videomode *vmode,
+ struct drm_mode_modeinfo *mode)
+{
+ memset(mode, 0, sizeof(*mode));
+
+ mode->hdisplay = vmode->xres;
+ mode->hsync_start = mode->hdisplay + vmode->right_margin;
+ mode->hsync_end = mode->hsync_start + vmode->hsync_len;
+ mode->htotal = mode->hsync_end + vmode->left_margin;
+
+ mode->vdisplay = vmode->yres;
+ mode->vsync_start = mode->vdisplay + vmode->lower_margin;
+ mode->vsync_end = mode->vsync_start + vmode->vsync_len;
+ mode->vtotal = mode->vsync_end + vmode->upper_margin;
+
+ mode->clock = vmode->pixclock ? PICOS2KHZ(vmode->pixclock) : 0;
+
+ mode->flags = 0;
+ if (vmode->sync & FB_SYNC_HOR_HIGH_ACT)
+ mode->flags |= DRM_MODE_FLAG_PHSYNC;
+ if (vmode->sync & FB_SYNC_VERT_HIGH_ACT)
+ mode->flags |= DRM_MODE_FLAG_PVSYNC;
+ if (vmode->sync & FB_SYNC_COMP_HIGH_ACT)
+ mode->flags |= DRM_MODE_FLAG_PCSYNC;
+ if (vmode->sync & FB_SYNC_BROADCAST)
+ mode->flags |= DRM_MODE_FLAG_BCAST;
+ if (vmode->vmode & FB_VMODE_INTERLACED)
+ mode->flags |= DRM_MODE_FLAG_INTERLACE;
+ if (vmode->vmode & FB_VMODE_DOUBLE)
+ mode->flags |= DRM_MODE_FLAG_DBLSCAN;
+
+ if (vmode->refresh)
+ mode->vrefresh = vmode->refresh;
+ else
+ adf_modeinfo_set_vrefresh(mode);
+
+ if (vmode->name)
+ strlcpy(mode->name, vmode->name, sizeof(mode->name));
+ else
+ adf_modeinfo_set_name(mode);
+}
+EXPORT_SYMBOL(adf_modeinfo_from_fb_videomode);
+
+static int adf_fbdev_post(struct adf_fbdev *fbdev)
+{
+ struct adf_buffer buf;
+ struct sync_fence *complete_fence;
+ int ret = 0;
+
+ memset(&buf, 0, sizeof(buf));
+ buf.overlay_engine = fbdev->eng;
+ buf.w = fbdev->info->var.xres;
+ buf.h = fbdev->info->var.yres;
+ buf.format = fbdev->format;
+ buf.dma_bufs[0] = fbdev->dma_buf;
+ buf.offset[0] = fbdev->offset +
+ fbdev->info->var.yoffset * fbdev->pitch +
+ fbdev->info->var.xoffset *
+ (fbdev->info->var.bits_per_pixel / 8);
+ buf.pitch[0] = fbdev->pitch;
+ buf.n_planes = 1;
+
+ complete_fence = adf_interface_simple_post(fbdev->intf, &buf);
+ if (IS_ERR(complete_fence)) {
+ ret = PTR_ERR(complete_fence);
+ goto done;
+ }
+
+ sync_fence_put(complete_fence);
+done:
+ return ret;
+}
+
+static const u16 vga_palette[][3] = {
+ {0x0000, 0x0000, 0x0000},
+ {0x0000, 0x0000, 0xAAAA},
+ {0x0000, 0xAAAA, 0x0000},
+ {0x0000, 0xAAAA, 0xAAAA},
+ {0xAAAA, 0x0000, 0x0000},
+ {0xAAAA, 0x0000, 0xAAAA},
+ {0xAAAA, 0x5555, 0x0000},
+ {0xAAAA, 0xAAAA, 0xAAAA},
+ {0x5555, 0x5555, 0x5555},
+ {0x5555, 0x5555, 0xFFFF},
+ {0x5555, 0xFFFF, 0x5555},
+ {0x5555, 0xFFFF, 0xFFFF},
+ {0xFFFF, 0x5555, 0x5555},
+ {0xFFFF, 0x5555, 0xFFFF},
+ {0xFFFF, 0xFFFF, 0x5555},
+ {0xFFFF, 0xFFFF, 0xFFFF},
+};
+
+static int adf_fb_alloc(struct adf_fbdev *fbdev)
+{
+ int ret;
+
+ ret = adf_interface_simple_buffer_alloc(fbdev->intf,
+ fbdev->default_xres_virtual,
+ fbdev->default_yres_virtual,
+ fbdev->default_format,
+ &fbdev->dma_buf, &fbdev->offset, &fbdev->pitch);
+ if (ret < 0) {
+ dev_err(fbdev->info->dev, "allocating fb failed: %d\n", ret);
+ return ret;
+ }
+
+ fbdev->vaddr = dma_buf_vmap(fbdev->dma_buf);
+ if (!fbdev->vaddr) {
+ ret = -ENOMEM;
+ dev_err(fbdev->info->dev, "vmapping fb failed\n");
+ goto err_vmap;
+ }
+ fbdev->info->fix.line_length = fbdev->pitch;
+ fbdev->info->var.xres_virtual = fbdev->default_xres_virtual;
+ fbdev->info->var.yres_virtual = fbdev->default_yres_virtual;
+ fbdev->info->fix.smem_len = fbdev->dma_buf->size;
+ fbdev->info->screen_base = fbdev->vaddr;
+
+ return 0;
+
+err_vmap:
+ dma_buf_put(fbdev->dma_buf);
+ return ret;
+}
+
+static void adf_fb_destroy(struct adf_fbdev *fbdev)
+{
+ dma_buf_vunmap(fbdev->dma_buf, fbdev->vaddr);
+ dma_buf_put(fbdev->dma_buf);
+}
+
+static void adf_fbdev_set_format(struct adf_fbdev *fbdev, u32 format)
+{
+ size_t i;
+ const struct adf_fbdev_format *info = fbdev_format_info(format);
+ for (i = 0; i < ARRAY_SIZE(vga_palette); i++) {
+ u16 r = vga_palette[i][0];
+ u16 g = vga_palette[i][1];
+ u16 b = vga_palette[i][2];
+
+ r >>= (16 - info->r_length);
+ g >>= (16 - info->g_length);
+ b >>= (16 - info->b_length);
+
+ fbdev->pseudo_palette[i] =
+ (r << info->r_offset) |
+ (g << info->g_offset) |
+ (b << info->b_offset);
+
+ if (info->a_length) {
+ u16 a = BIT(info->a_length) - 1;
+ fbdev->pseudo_palette[i] |= (a << info->a_offset);
+ }
+ }
+
+ fbdev->info->var.bits_per_pixel = adf_format_bpp(format);
+ fbdev->info->var.red.length = info->r_length;
+ fbdev->info->var.red.offset = info->r_offset;
+ fbdev->info->var.green.length = info->g_length;
+ fbdev->info->var.green.offset = info->g_offset;
+ fbdev->info->var.blue.length = info->b_length;
+ fbdev->info->var.blue.offset = info->b_offset;
+ fbdev->info->var.transp.length = info->a_length;
+ fbdev->info->var.transp.offset = info->a_offset;
+ fbdev->format = format;
+}
+
+static void adf_fbdev_fill_modelist(struct adf_fbdev *fbdev)
+{
+ struct drm_mode_modeinfo *modelist;
+ struct fb_videomode fbmode;
+ size_t n_modes, i;
+ int ret = 0;
+
+ n_modes = adf_interface_modelist(fbdev->intf, NULL, 0);
+ modelist = kzalloc(sizeof(modelist[0]) * n_modes, GFP_KERNEL);
+ if (!modelist) {
+ dev_warn(fbdev->info->dev, "allocating new modelist failed; keeping old modelist\n");
+ return;
+ }
+ adf_interface_modelist(fbdev->intf, modelist, n_modes);
+
+ fb_destroy_modelist(&fbdev->info->modelist);
+
+ for (i = 0; i < n_modes; i++) {
+ adf_modeinfo_to_fb_videomode(&modelist[i], &fbmode);
+ ret = fb_add_videomode(&fbmode, &fbdev->info->modelist);
+ if (ret < 0)
+ dev_warn(fbdev->info->dev, "adding mode %s to modelist failed: %d\n",
+ modelist[i].name, ret);
+ }
+
+ kfree(modelist);
+}
+
+/**
+ * adf_fbdev_open - default implementation of fbdev open op
+ */
+int adf_fbdev_open(struct fb_info *info, int user)
+{
+ struct adf_fbdev *fbdev = info->par;
+ int ret;
+
+ mutex_lock(&fbdev->refcount_lock);
+
+ if (unlikely(fbdev->refcount == UINT_MAX)) {
+ ret = -EMFILE;
+ goto done;
+ }
+
+ if (!fbdev->refcount) {
+ struct drm_mode_modeinfo mode;
+ struct fb_videomode fbmode;
+ struct adf_device *dev = adf_interface_parent(fbdev->intf);
+
+ ret = adf_device_attach(dev, fbdev->eng, fbdev->intf);
+ if (ret < 0 && ret != -EALREADY)
+ goto done;
+
+ ret = adf_fb_alloc(fbdev);
+ if (ret < 0)
+ goto done;
+
+ adf_interface_current_mode(fbdev->intf, &mode);
+ adf_modeinfo_to_fb_videomode(&mode, &fbmode);
+ fb_videomode_to_var(&fbdev->info->var, &fbmode);
+
+ adf_fbdev_set_format(fbdev, fbdev->default_format);
+ adf_fbdev_fill_modelist(fbdev);
+ }
+
+ ret = adf_fbdev_post(fbdev);
+ if (ret < 0) {
+ if (!fbdev->refcount)
+ adf_fb_destroy(fbdev);
+ goto done;
+ }
+
+ fbdev->refcount++;
+done:
+ mutex_unlock(&fbdev->refcount_lock);
+ return ret;
+}
+EXPORT_SYMBOL(adf_fbdev_open);
+
+/**
+ * adf_fbdev_release - default implementation of fbdev release op
+ */
+int adf_fbdev_release(struct fb_info *info, int user)
+{
+ struct adf_fbdev *fbdev = info->par;
+ mutex_lock(&fbdev->refcount_lock);
+ BUG_ON(!fbdev->refcount);
+ fbdev->refcount--;
+ if (!fbdev->refcount)
+ adf_fb_destroy(fbdev);
+ mutex_unlock(&fbdev->refcount_lock);
+ return 0;
+}
+EXPORT_SYMBOL(adf_fbdev_release);
+
+/**
+ * adf_fbdev_check_var - default implementation of fbdev check_var op
+ */
+int adf_fbdev_check_var(struct fb_var_screeninfo *var, struct fb_info *info)
+{
+ struct adf_fbdev *fbdev = info->par;
+ bool valid_format = true;
+ u32 format = drm_fourcc_from_fb_var(var);
+ u32 pitch = var->xres_virtual * var->bits_per_pixel / 8;
+
+ if (!format) {
+ dev_dbg(info->dev, "%s: unrecognized format\n", __func__);
+ valid_format = false;
+ }
+
+ if (valid_format && var->grayscale) {
+ dev_dbg(info->dev, "%s: grayscale modes not supported\n",
+ __func__);
+ valid_format = false;
+ }
+
+ if (valid_format && var->nonstd) {
+ dev_dbg(info->dev, "%s: nonstandard formats not supported\n",
+ __func__);
+ valid_format = false;
+ }
+
+ if (valid_format && !adf_overlay_engine_supports_format(fbdev->eng,
+ format)) {
+ char format_str[ADF_FORMAT_STR_SIZE];
+ adf_format_str(format, format_str);
+ dev_dbg(info->dev, "%s: format %s not supported by overlay engine %s\n",
+ __func__, format_str, fbdev->eng->base.name);
+ valid_format = false;
+ }
+
+ if (valid_format && pitch > fbdev->pitch) {
+ dev_dbg(info->dev, "%s: fb pitch too small for var (pitch = %u, xres_virtual = %u, bits_per_pixel = %u)\n",
+ __func__, fbdev->pitch, var->xres_virtual,
+ var->bits_per_pixel);
+ valid_format = false;
+ }
+
+ if (valid_format && var->yres_virtual > fbdev->default_yres_virtual) {
+ dev_dbg(info->dev, "%s: fb height too small for var (h = %u, yres_virtual = %u)\n",
+ __func__, fbdev->default_yres_virtual,
+ var->yres_virtual);
+ valid_format = false;
+ }
+
+ if (valid_format) {
+ var->activate = info->var.activate;
+ var->height = info->var.height;
+ var->width = info->var.width;
+ var->accel_flags = info->var.accel_flags;
+ var->rotate = info->var.rotate;
+ var->colorspace = info->var.colorspace;
+ /* userspace can't change these */
+ } else {
+ /* if any part of the format is invalid then fixing it up is
+ impractical, so save just the modesetting bits and
+ overwrite everything else */
+ struct fb_videomode mode;
+ fb_var_to_videomode(&mode, var);
+ memcpy(var, &info->var, sizeof(*var));
+ fb_videomode_to_var(var, &mode);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(adf_fbdev_check_var);
+
+/**
+ * adf_fbdev_set_par - default implementation of fbdev set_par op
+ */
+int adf_fbdev_set_par(struct fb_info *info)
+{
+ struct adf_fbdev *fbdev = info->par;
+ struct adf_interface *intf = fbdev->intf;
+ struct fb_videomode vmode;
+ struct drm_mode_modeinfo mode;
+ int ret;
+ u32 format = drm_fourcc_from_fb_var(&info->var);
+
+ fb_var_to_videomode(&vmode, &info->var);
+ adf_modeinfo_from_fb_videomode(&vmode, &mode);
+ ret = adf_interface_set_mode(intf, &mode);
+ if (ret < 0)
+ return ret;
+
+ ret = adf_fbdev_post(fbdev);
+ if (ret < 0)
+ return ret;
+
+ if (format != fbdev->format)
+ adf_fbdev_set_format(fbdev, format);
+
+ return 0;
+}
+EXPORT_SYMBOL(adf_fbdev_set_par);
+
+/**
+ * adf_fbdev_blank - default implementation of fbdev blank op
+ */
+int adf_fbdev_blank(int blank, struct fb_info *info)
+{
+ struct adf_fbdev *fbdev = info->par;
+ struct adf_interface *intf = fbdev->intf;
+ u8 dpms_state;
+
+ switch (blank) {
+ case FB_BLANK_UNBLANK:
+ dpms_state = DRM_MODE_DPMS_ON;
+ break;
+ case FB_BLANK_NORMAL:
+ dpms_state = DRM_MODE_DPMS_STANDBY;
+ break;
+ case FB_BLANK_VSYNC_SUSPEND:
+ dpms_state = DRM_MODE_DPMS_SUSPEND;
+ break;
+ case FB_BLANK_HSYNC_SUSPEND:
+ dpms_state = DRM_MODE_DPMS_STANDBY;
+ break;
+ case FB_BLANK_POWERDOWN:
+ dpms_state = DRM_MODE_DPMS_OFF;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return adf_interface_blank(intf, dpms_state);
+}
+EXPORT_SYMBOL(adf_fbdev_blank);
+
+/**
+ * adf_fbdev_pan_display - default implementation of fbdev pan_display op
+ */
+int adf_fbdev_pan_display(struct fb_var_screeninfo *var, struct fb_info *info)
+{
+ struct adf_fbdev *fbdev = info->par;
+ return adf_fbdev_post(fbdev);
+}
+EXPORT_SYMBOL(adf_fbdev_pan_display);
+
+/**
+ * adf_fbdev_mmap - default implementation of fbdev mmap op
+ */
+int adf_fbdev_mmap(struct fb_info *info, struct vm_area_struct *vma)
+{
+ struct adf_fbdev *fbdev = info->par;
+
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+ return dma_buf_mmap(fbdev->dma_buf, vma, 0);
+}
+EXPORT_SYMBOL(adf_fbdev_mmap);
+
+/**
+ * adf_fbdev_init - initialize helper to wrap ADF device in fbdev API
+ *
+ * @fbdev: the fbdev helper
+ * @interface: the ADF interface that will display the framebuffer
+ * @eng: the ADF overlay engine that will scan out the framebuffer
+ * @xres_virtual: the virtual width of the framebuffer
+ * @yres_virtual: the virtual height of the framebuffer
+ * @format: the format of the framebuffer
+ * @fbops: the device's fbdev ops
+ * @fmt: formatting for the framebuffer identification string
+ * @...: variable arguments
+ *
+ * @format must be a standard, non-indexed RGB format, i.e.,
+ * adf_format_is_rgb(@format) && @format != @DRM_FORMAT_C8.
+ *
+ * Returns 0 on success or -errno on failure.
+ */
+int adf_fbdev_init(struct adf_fbdev *fbdev, struct adf_interface *interface,
+ struct adf_overlay_engine *eng,
+ u16 xres_virtual, u16 yres_virtual, u32 format,
+ struct fb_ops *fbops, const char *fmt, ...)
+{
+ struct adf_device *parent = adf_interface_parent(interface);
+ struct device *dev = &parent->base.dev;
+ u16 width_mm, height_mm;
+ va_list args;
+ int ret;
+
+ if (!adf_format_is_rgb(format) ||
+ format == DRM_FORMAT_C8) {
+ dev_err(dev, "fbdev helper does not support format %u\n",
+ format);
+ return -EINVAL;
+ }
+
+ memset(fbdev, 0, sizeof(*fbdev));
+ fbdev->intf = interface;
+ fbdev->eng = eng;
+ fbdev->info = framebuffer_alloc(0, dev);
+ if (!fbdev->info) {
+ dev_err(dev, "allocating framebuffer device failed\n");
+ return -ENOMEM;
+ }
+ mutex_init(&fbdev->refcount_lock);
+ fbdev->default_xres_virtual = xres_virtual;
+ fbdev->default_yres_virtual = yres_virtual;
+ fbdev->default_format = format;
+
+ fbdev->info->flags = FBINFO_FLAG_DEFAULT;
+ ret = adf_interface_get_screen_size(interface, &width_mm, &height_mm);
+ if (ret < 0) {
+ width_mm = 0;
+ height_mm = 0;
+ }
+ fbdev->info->var.width = width_mm;
+ fbdev->info->var.height = height_mm;
+ fbdev->info->var.activate = FB_ACTIVATE_VBL;
+ va_start(args, fmt);
+ vsnprintf(fbdev->info->fix.id, sizeof(fbdev->info->fix.id), fmt, args);
+ va_end(args);
+ fbdev->info->fix.type = FB_TYPE_PACKED_PIXELS;
+ fbdev->info->fix.visual = FB_VISUAL_TRUECOLOR;
+ fbdev->info->fix.xpanstep = 1;
+ fbdev->info->fix.ypanstep = 1;
+ INIT_LIST_HEAD(&fbdev->info->modelist);
+ fbdev->info->fbops = fbops;
+ fbdev->info->pseudo_palette = fbdev->pseudo_palette;
+ fbdev->info->par = fbdev;
+
+ ret = register_framebuffer(fbdev->info);
+ if (ret < 0) {
+ dev_err(dev, "registering framebuffer failed: %d\n", ret);
+ return ret;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(adf_fbdev_init);
+
+/**
+ * adf_fbdev_destroy - destroy helper to wrap ADF device in fbdev API
+ *
+ * @fbdev: the fbdev helper
+ */
+void adf_fbdev_destroy(struct adf_fbdev *fbdev)
+{
+ unregister_framebuffer(fbdev->info);
+ BUG_ON(fbdev->refcount);
+ mutex_destroy(&fbdev->refcount_lock);
+ framebuffer_release(fbdev->info);
+}
+EXPORT_SYMBOL(adf_fbdev_destroy);
diff --git a/drivers/video/adf/adf_fops.c b/drivers/video/adf/adf_fops.c
new file mode 100644
index 000000000000..d5c1e5466cff
--- /dev/null
+++ b/drivers/video/adf/adf_fops.c
@@ -0,0 +1,946 @@
+/*
+ * Copyright (C) 2013 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/bitops.h>
+#include <linux/circ_buf.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include <video/adf_client.h>
+#include <video/adf_format.h>
+
+#include "sw_sync.h"
+#include "sync.h"
+
+#include "adf.h"
+#include "adf_fops.h"
+#include "adf_sysfs.h"
+
+#ifdef CONFIG_COMPAT
+#include "adf_fops32.h"
+#endif
+
+static int adf_obj_set_event(struct adf_obj *obj, struct adf_file *file,
+ struct adf_set_event __user *arg)
+{
+ struct adf_set_event data;
+ bool enabled;
+ unsigned long flags;
+ int err;
+
+ if (copy_from_user(&data, arg, sizeof(data)))
+ return -EFAULT;
+
+ err = adf_obj_check_supports_event(obj, data.type);
+ if (err < 0)
+ return err;
+
+ spin_lock_irqsave(&obj->file_lock, flags);
+ if (data.enabled)
+ enabled = test_and_set_bit(data.type,
+ file->event_subscriptions);
+ else
+ enabled = test_and_clear_bit(data.type,
+ file->event_subscriptions);
+ spin_unlock_irqrestore(&obj->file_lock, flags);
+
+ if (data.enabled == enabled)
+ return -EALREADY;
+
+ if (data.enabled)
+ adf_event_get(obj, data.type);
+ else
+ adf_event_put(obj, data.type);
+
+ return 0;
+}
+
+static int adf_obj_copy_custom_data_to_user(struct adf_obj *obj,
+ void __user *dst, size_t *dst_size)
+{
+ void *custom_data;
+ size_t custom_data_size;
+ int ret;
+
+ if (!obj->ops || !obj->ops->custom_data) {
+ dev_dbg(&obj->dev, "%s: no custom_data op\n", __func__);
+ return 0;
+ }
+
+ custom_data = kzalloc(ADF_MAX_CUSTOM_DATA_SIZE, GFP_KERNEL);
+ if (!custom_data)
+ return -ENOMEM;
+
+ ret = obj->ops->custom_data(obj, custom_data, &custom_data_size);
+ if (ret < 0)
+ goto done;
+
+ if (copy_to_user(dst, custom_data, min(*dst_size, custom_data_size))) {
+ ret = -EFAULT;
+ goto done;
+ }
+ *dst_size = custom_data_size;
+
+done:
+ kfree(custom_data);
+ return ret;
+}
+
+static int adf_eng_get_data(struct adf_overlay_engine *eng,
+ struct adf_overlay_engine_data __user *arg)
+{
+ struct adf_device *dev = adf_overlay_engine_parent(eng);
+ struct adf_overlay_engine_data data;
+ size_t n_supported_formats;
+ u32 *supported_formats = NULL;
+ int ret = 0;
+
+ if (copy_from_user(&data, arg, sizeof(data)))
+ return -EFAULT;
+
+ strlcpy(data.name, eng->base.name, sizeof(data.name));
+
+ if (data.n_supported_formats > ADF_MAX_SUPPORTED_FORMATS)
+ return -EINVAL;
+
+ n_supported_formats = data.n_supported_formats;
+ data.n_supported_formats = eng->ops->n_supported_formats;
+
+ if (n_supported_formats) {
+ supported_formats = kzalloc(n_supported_formats *
+ sizeof(supported_formats[0]), GFP_KERNEL);
+ if (!supported_formats)
+ return -ENOMEM;
+ }
+
+ memcpy(supported_formats, eng->ops->supported_formats,
+ sizeof(u32) * min(n_supported_formats,
+ eng->ops->n_supported_formats));
+
+ mutex_lock(&dev->client_lock);
+ ret = adf_obj_copy_custom_data_to_user(&eng->base, data.custom_data,
+ &data.custom_data_size);
+ mutex_unlock(&dev->client_lock);
+
+ if (ret < 0)
+ goto done;
+
+ if (copy_to_user(arg, &data, sizeof(data))) {
+ ret = -EFAULT;
+ goto done;
+ }
+
+ if (supported_formats && copy_to_user(data.supported_formats,
+ supported_formats,
+ n_supported_formats * sizeof(supported_formats[0])))
+ ret = -EFAULT;
+
+done:
+ kfree(supported_formats);
+ return ret;
+}
+
+static int adf_buffer_import(struct adf_device *dev,
+ struct adf_buffer_config __user *cfg, struct adf_buffer *buf)
+{
+ struct adf_buffer_config user_buf;
+ size_t i;
+ int ret = 0;
+
+ if (copy_from_user(&user_buf, cfg, sizeof(user_buf)))
+ return -EFAULT;
+
+ memset(buf, 0, sizeof(*buf));
+
+ if (user_buf.n_planes > ADF_MAX_PLANES) {
+ dev_err(&dev->base.dev, "invalid plane count %u\n",
+ user_buf.n_planes);
+ return -EINVAL;
+ }
+
+ buf->overlay_engine = idr_find(&dev->overlay_engines,
+ user_buf.overlay_engine);
+ if (!buf->overlay_engine) {
+ dev_err(&dev->base.dev, "invalid overlay engine id %u\n",
+ user_buf.overlay_engine);
+ return -ENOENT;
+ }
+
+ buf->w = user_buf.w;
+ buf->h = user_buf.h;
+ buf->format = user_buf.format;
+ for (i = 0; i < user_buf.n_planes; i++) {
+ buf->dma_bufs[i] = dma_buf_get(user_buf.fd[i]);
+ if (IS_ERR(buf->dma_bufs[i])) {
+ ret = PTR_ERR(buf->dma_bufs[i]);
+ dev_err(&dev->base.dev, "importing dma_buf fd %d failed: %d\n",
+ user_buf.fd[i], ret);
+ buf->dma_bufs[i] = NULL;
+ goto done;
+ }
+ buf->offset[i] = user_buf.offset[i];
+ buf->pitch[i] = user_buf.pitch[i];
+ }
+ buf->n_planes = user_buf.n_planes;
+
+ if (user_buf.acquire_fence >= 0) {
+ buf->acquire_fence = sync_fence_fdget(user_buf.acquire_fence);
+ if (!buf->acquire_fence) {
+ dev_err(&dev->base.dev, "getting fence fd %d failed\n",
+ user_buf.acquire_fence);
+ ret = -EINVAL;
+ goto done;
+ }
+ }
+
+done:
+ if (ret < 0)
+ adf_buffer_cleanup(buf);
+ return ret;
+}
+
+static int adf_device_post_config(struct adf_device *dev,
+ struct adf_post_config __user *arg)
+{
+ struct sync_fence *complete_fence;
+ int complete_fence_fd;
+ struct adf_buffer *bufs = NULL;
+ struct adf_interface **intfs = NULL;
+ struct adf_post_config data;
+ size_t i;
+ void *custom_data = NULL;
+ int ret = 0;
+
+ if (copy_from_user(&data, arg, sizeof(data)))
+ return -EFAULT;
+
+ complete_fence_fd = get_unused_fd();
+ if (complete_fence_fd < 0)
+ return complete_fence_fd;
+
+ if (data.n_interfaces > ADF_MAX_INTERFACES) {
+ ret = -EINVAL;
+ goto err_get_user;
+ }
+
+ if (data.n_bufs > ADF_MAX_BUFFERS) {
+ ret = -EINVAL;
+ goto err_get_user;
+ }
+
+ if (data.custom_data_size > ADF_MAX_CUSTOM_DATA_SIZE) {
+ ret = -EINVAL;
+ goto err_get_user;
+ }
+
+ if (data.n_interfaces) {
+ intfs = kmalloc(sizeof(intfs[0]) * data.n_interfaces,
+ GFP_KERNEL);
+ if (!intfs) {
+ ret = -ENOMEM;
+ goto err_get_user;
+ }
+ }
+
+ for (i = 0; i < data.n_interfaces; i++) {
+ u32 intf_id;
+ if (get_user(intf_id, &data.interfaces[i])) {
+ ret = -EFAULT;
+ goto err_get_user;
+ }
+
+ intfs[i] = idr_find(&dev->interfaces, intf_id);
+ if (!intfs[i]) {
+ ret = -EINVAL;
+ goto err_get_user;
+ }
+ }
+
+ if (data.n_bufs) {
+ bufs = kzalloc(sizeof(bufs[0]) * data.n_bufs, GFP_KERNEL);
+ if (!bufs) {
+ ret = -ENOMEM;
+ goto err_get_user;
+ }
+ }
+
+ for (i = 0; i < data.n_bufs; i++) {
+ ret = adf_buffer_import(dev, &data.bufs[i], &bufs[i]);
+ if (ret < 0) {
+ memset(&bufs[i], 0, sizeof(bufs[i]));
+ goto err_import;
+ }
+ }
+
+ if (data.custom_data_size) {
+ custom_data = kzalloc(data.custom_data_size, GFP_KERNEL);
+ if (!custom_data) {
+ ret = -ENOMEM;
+ goto err_import;
+ }
+
+ if (copy_from_user(custom_data, data.custom_data,
+ data.custom_data_size)) {
+ ret = -EFAULT;
+ goto err_import;
+ }
+ }
+
+ if (put_user(complete_fence_fd, &arg->complete_fence)) {
+ ret = -EFAULT;
+ goto err_import;
+ }
+
+ complete_fence = adf_device_post_nocopy(dev, intfs, data.n_interfaces,
+ bufs, data.n_bufs, custom_data, data.custom_data_size);
+ if (IS_ERR(complete_fence)) {
+ ret = PTR_ERR(complete_fence);
+ goto err_import;
+ }
+
+ sync_fence_install(complete_fence, complete_fence_fd);
+ return 0;
+
+err_import:
+ for (i = 0; i < data.n_bufs; i++)
+ adf_buffer_cleanup(&bufs[i]);
+
+err_get_user:
+ kfree(custom_data);
+ kfree(bufs);
+ kfree(intfs);
+ put_unused_fd(complete_fence_fd);
+ return ret;
+}
+
+static int adf_intf_simple_post_config(struct adf_interface *intf,
+ struct adf_simple_post_config __user *arg)
+{
+ struct adf_device *dev = intf->base.parent;
+ struct sync_fence *complete_fence;
+ int complete_fence_fd;
+ struct adf_buffer buf;
+ int ret = 0;
+
+ complete_fence_fd = get_unused_fd();
+ if (complete_fence_fd < 0)
+ return complete_fence_fd;
+
+ ret = adf_buffer_import(dev, &arg->buf, &buf);
+ if (ret < 0)
+ goto err_import;
+
+ if (put_user(complete_fence_fd, &arg->complete_fence)) {
+ ret = -EFAULT;
+ goto err_put_user;
+ }
+
+ complete_fence = adf_interface_simple_post(intf, &buf);
+ if (IS_ERR(complete_fence)) {
+ ret = PTR_ERR(complete_fence);
+ goto err_put_user;
+ }
+
+ sync_fence_install(complete_fence, complete_fence_fd);
+ return 0;
+
+err_put_user:
+ adf_buffer_cleanup(&buf);
+err_import:
+ put_unused_fd(complete_fence_fd);
+ return ret;
+}
+
+static int adf_intf_simple_buffer_alloc(struct adf_interface *intf,
+ struct adf_simple_buffer_alloc __user *arg)
+{
+ struct adf_simple_buffer_alloc data;
+ struct dma_buf *dma_buf;
+ int ret = 0;
+
+ if (copy_from_user(&data, arg, sizeof(data)))
+ return -EFAULT;
+
+ data.fd = get_unused_fd_flags(O_CLOEXEC);
+ if (data.fd < 0)
+ return data.fd;
+
+ ret = adf_interface_simple_buffer_alloc(intf, data.w, data.h,
+ data.format, &dma_buf, &data.offset, &data.pitch);
+ if (ret < 0)
+ goto err_alloc;
+
+ if (copy_to_user(arg, &data, sizeof(*arg))) {
+ ret = -EFAULT;
+ goto err_copy;
+ }
+
+ fd_install(data.fd, dma_buf->file);
+ return 0;
+
+err_copy:
+ dma_buf_put(dma_buf);
+
+err_alloc:
+ put_unused_fd(data.fd);
+ return ret;
+}
+
+static int adf_copy_attachment_list_to_user(
+ struct adf_attachment_config __user *to, size_t n_to,
+ struct adf_attachment *from, size_t n_from)
+{
+ struct adf_attachment_config *temp;
+ size_t n = min(n_to, n_from);
+ size_t i;
+ int ret = 0;
+
+ if (!n)
+ return 0;
+
+ temp = kzalloc(n * sizeof(temp[0]), GFP_KERNEL);
+ if (!temp)
+ return -ENOMEM;
+
+ for (i = 0; i < n; i++) {
+ temp[i].interface = from[i].interface->base.id;
+ temp[i].overlay_engine = from[i].overlay_engine->base.id;
+ }
+
+ if (copy_to_user(to, temp, n * sizeof(to[0]))) {
+ ret = -EFAULT;
+ goto done;
+ }
+
+done:
+ kfree(temp);
+ return ret;
+}
+
+static int adf_device_get_data(struct adf_device *dev,
+ struct adf_device_data __user *arg)
+{
+ struct adf_device_data data;
+ size_t n_attach;
+ struct adf_attachment *attach = NULL;
+ size_t n_allowed_attach;
+ struct adf_attachment *allowed_attach = NULL;
+ int ret = 0;
+
+ if (copy_from_user(&data, arg, sizeof(data)))
+ return -EFAULT;
+
+ if (data.n_attachments > ADF_MAX_ATTACHMENTS ||
+ data.n_allowed_attachments > ADF_MAX_ATTACHMENTS)
+ return -EINVAL;
+
+ strlcpy(data.name, dev->base.name, sizeof(data.name));
+
+ if (data.n_attachments) {
+ attach = kzalloc(data.n_attachments * sizeof(attach[0]),
+ GFP_KERNEL);
+ if (!attach)
+ return -ENOMEM;
+ }
+ n_attach = adf_device_attachments(dev, attach, data.n_attachments);
+
+ if (data.n_allowed_attachments) {
+ allowed_attach = kzalloc(data.n_allowed_attachments *
+ sizeof(allowed_attach[0]), GFP_KERNEL);
+ if (!allowed_attach) {
+ ret = -ENOMEM;
+ goto done;
+ }
+ }
+ n_allowed_attach = adf_device_attachments_allowed(dev, allowed_attach,
+ data.n_allowed_attachments);
+
+ mutex_lock(&dev->client_lock);
+ ret = adf_obj_copy_custom_data_to_user(&dev->base, data.custom_data,
+ &data.custom_data_size);
+ mutex_unlock(&dev->client_lock);
+
+ if (ret < 0)
+ goto done;
+
+ ret = adf_copy_attachment_list_to_user(data.attachments,
+ data.n_attachments, attach, n_attach);
+ if (ret < 0)
+ goto done;
+
+ ret = adf_copy_attachment_list_to_user(data.allowed_attachments,
+ data.n_allowed_attachments, allowed_attach,
+ n_allowed_attach);
+ if (ret < 0)
+ goto done;
+
+ data.n_attachments = n_attach;
+ data.n_allowed_attachments = n_allowed_attach;
+
+ if (copy_to_user(arg, &data, sizeof(data)))
+ ret = -EFAULT;
+
+done:
+ kfree(allowed_attach);
+ kfree(attach);
+ return ret;
+}
+
+static int adf_device_handle_attachment(struct adf_device *dev,
+ struct adf_attachment_config __user *arg, bool attach)
+{
+ struct adf_attachment_config data;
+ struct adf_overlay_engine *eng;
+ struct adf_interface *intf;
+
+ if (copy_from_user(&data, arg, sizeof(data)))
+ return -EFAULT;
+
+ eng = idr_find(&dev->overlay_engines, data.overlay_engine);
+ if (!eng) {
+ dev_err(&dev->base.dev, "invalid overlay engine id %u\n",
+ data.overlay_engine);
+ return -EINVAL;
+ }
+
+ intf = idr_find(&dev->interfaces, data.interface);
+ if (!intf) {
+ dev_err(&dev->base.dev, "invalid interface id %u\n",
+ data.interface);
+ return -EINVAL;
+ }
+
+ if (attach)
+ return adf_device_attach(dev, eng, intf);
+ else
+ return adf_device_detach(dev, eng, intf);
+}
+
+static int adf_intf_set_mode(struct adf_interface *intf,
+ struct drm_mode_modeinfo __user *arg)
+{
+ struct drm_mode_modeinfo mode;
+
+ if (copy_from_user(&mode, arg, sizeof(mode)))
+ return -EFAULT;
+
+ return adf_interface_set_mode(intf, &mode);
+}
+
+static int adf_intf_get_data(struct adf_interface *intf,
+ struct adf_interface_data __user *arg)
+{
+ struct adf_device *dev = adf_interface_parent(intf);
+ struct adf_interface_data data;
+ struct drm_mode_modeinfo *modelist;
+ size_t modelist_size;
+ int err;
+ int ret = 0;
+ unsigned long flags;
+
+ if (copy_from_user(&data, arg, sizeof(data)))
+ return -EFAULT;
+
+ strlcpy(data.name, intf->base.name, sizeof(data.name));
+
+ data.type = intf->type;
+ data.id = intf->idx;
+ data.flags = intf->flags;
+
+ err = adf_interface_get_screen_size(intf, &data.width_mm,
+ &data.height_mm);
+ if (err < 0) {
+ data.width_mm = 0;
+ data.height_mm = 0;
+ }
+
+ modelist = kmalloc(sizeof(modelist[0]) * ADF_MAX_MODES, GFP_KERNEL);
+ if (!modelist)
+ return -ENOMEM;
+
+ mutex_lock(&dev->client_lock);
+ read_lock_irqsave(&intf->hotplug_modelist_lock, flags);
+ data.hotplug_detect = intf->hotplug_detect;
+ modelist_size = min(data.n_available_modes, intf->n_modes) *
+ sizeof(intf->modelist[0]);
+ memcpy(modelist, intf->modelist, modelist_size);
+ data.n_available_modes = intf->n_modes;
+ read_unlock_irqrestore(&intf->hotplug_modelist_lock, flags);
+
+ if (copy_to_user(data.available_modes, modelist, modelist_size)) {
+ ret = -EFAULT;
+ goto done;
+ }
+
+ data.dpms_state = intf->dpms_state;
+ memcpy(&data.current_mode, &intf->current_mode,
+ sizeof(intf->current_mode));
+
+ ret = adf_obj_copy_custom_data_to_user(&intf->base, data.custom_data,
+ &data.custom_data_size);
+done:
+ mutex_unlock(&dev->client_lock);
+ kfree(modelist);
+
+ if (ret < 0)
+ return ret;
+
+ if (copy_to_user(arg, &data, sizeof(data)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static inline long adf_obj_custom_ioctl(struct adf_obj *obj, unsigned int cmd,
+ unsigned long arg)
+{
+ if (obj->ops && obj->ops->ioctl)
+ return obj->ops->ioctl(obj, cmd, arg);
+ return -ENOTTY;
+}
+
+static long adf_overlay_engine_ioctl(struct adf_overlay_engine *eng,
+ struct adf_file *file, unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case ADF_SET_EVENT:
+ return adf_obj_set_event(&eng->base, file,
+ (struct adf_set_event __user *)arg);
+
+ case ADF_GET_OVERLAY_ENGINE_DATA:
+ return adf_eng_get_data(eng,
+ (struct adf_overlay_engine_data __user *)arg);
+
+ case ADF_BLANK:
+ case ADF_POST_CONFIG:
+ case ADF_SET_MODE:
+ case ADF_GET_DEVICE_DATA:
+ case ADF_GET_INTERFACE_DATA:
+ case ADF_SIMPLE_POST_CONFIG:
+ case ADF_SIMPLE_BUFFER_ALLOC:
+ case ADF_ATTACH:
+ case ADF_DETACH:
+ return -EINVAL;
+
+ default:
+ return adf_obj_custom_ioctl(&eng->base, cmd, arg);
+ }
+}
+
+static long adf_interface_ioctl(struct adf_interface *intf,
+ struct adf_file *file, unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case ADF_SET_EVENT:
+ return adf_obj_set_event(&intf->base, file,
+ (struct adf_set_event __user *)arg);
+
+ case ADF_BLANK:
+ return adf_interface_blank(intf, arg);
+
+ case ADF_SET_MODE:
+ return adf_intf_set_mode(intf,
+ (struct drm_mode_modeinfo __user *)arg);
+
+ case ADF_GET_INTERFACE_DATA:
+ return adf_intf_get_data(intf,
+ (struct adf_interface_data __user *)arg);
+
+ case ADF_SIMPLE_POST_CONFIG:
+ return adf_intf_simple_post_config(intf,
+ (struct adf_simple_post_config __user *)arg);
+
+ case ADF_SIMPLE_BUFFER_ALLOC:
+ return adf_intf_simple_buffer_alloc(intf,
+ (struct adf_simple_buffer_alloc __user *)arg);
+
+ case ADF_POST_CONFIG:
+ case ADF_GET_DEVICE_DATA:
+ case ADF_GET_OVERLAY_ENGINE_DATA:
+ case ADF_ATTACH:
+ case ADF_DETACH:
+ return -EINVAL;
+
+ default:
+ return adf_obj_custom_ioctl(&intf->base, cmd, arg);
+ }
+}
+
+static long adf_device_ioctl(struct adf_device *dev, struct adf_file *file,
+ unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case ADF_SET_EVENT:
+ return adf_obj_set_event(&dev->base, file,
+ (struct adf_set_event __user *)arg);
+
+ case ADF_POST_CONFIG:
+ return adf_device_post_config(dev,
+ (struct adf_post_config __user *)arg);
+
+ case ADF_GET_DEVICE_DATA:
+ return adf_device_get_data(dev,
+ (struct adf_device_data __user *)arg);
+
+ case ADF_ATTACH:
+ return adf_device_handle_attachment(dev,
+ (struct adf_attachment_config __user *)arg,
+ true);
+
+ case ADF_DETACH:
+ return adf_device_handle_attachment(dev,
+ (struct adf_attachment_config __user *)arg,
+ false);
+
+ case ADF_BLANK:
+ case ADF_SET_MODE:
+ case ADF_GET_INTERFACE_DATA:
+ case ADF_GET_OVERLAY_ENGINE_DATA:
+ case ADF_SIMPLE_POST_CONFIG:
+ case ADF_SIMPLE_BUFFER_ALLOC:
+ return -EINVAL;
+
+ default:
+ return adf_obj_custom_ioctl(&dev->base, cmd, arg);
+ }
+}
+
+static int adf_file_open(struct inode *inode, struct file *file)
+{
+ struct adf_obj *obj;
+ struct adf_file *fpriv = NULL;
+ unsigned long flags;
+ int ret = 0;
+
+ obj = adf_obj_sysfs_find(iminor(inode));
+ if (!obj)
+ return -ENODEV;
+
+ dev_dbg(&obj->dev, "opening %s\n", dev_name(&obj->dev));
+
+ if (!try_module_get(obj->parent->ops->owner)) {
+ dev_err(&obj->dev, "getting owner module failed\n");
+ return -ENODEV;
+ }
+
+ fpriv = kzalloc(sizeof(*fpriv), GFP_KERNEL);
+ if (!fpriv) {
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ INIT_LIST_HEAD(&fpriv->head);
+ fpriv->obj = obj;
+ init_waitqueue_head(&fpriv->event_wait);
+
+ file->private_data = fpriv;
+
+ if (obj->ops && obj->ops->open) {
+ ret = obj->ops->open(obj, inode, file);
+ if (ret < 0)
+ goto done;
+ }
+
+ spin_lock_irqsave(&obj->file_lock, flags);
+ list_add_tail(&fpriv->head, &obj->file_list);
+ spin_unlock_irqrestore(&obj->file_lock, flags);
+
+done:
+ if (ret < 0) {
+ kfree(fpriv);
+ module_put(obj->parent->ops->owner);
+ }
+ return ret;
+}
+
+static int adf_file_release(struct inode *inode, struct file *file)
+{
+ struct adf_file *fpriv = file->private_data;
+ struct adf_obj *obj = fpriv->obj;
+ enum adf_event_type event_type;
+ unsigned long flags;
+
+ if (obj->ops && obj->ops->release)
+ obj->ops->release(obj, inode, file);
+
+ spin_lock_irqsave(&obj->file_lock, flags);
+ list_del(&fpriv->head);
+ spin_unlock_irqrestore(&obj->file_lock, flags);
+
+ for_each_set_bit(event_type, fpriv->event_subscriptions,
+ ADF_EVENT_TYPE_MAX) {
+ adf_event_put(obj, event_type);
+ }
+
+ kfree(fpriv);
+ module_put(obj->parent->ops->owner);
+
+ dev_dbg(&obj->dev, "released %s\n", dev_name(&obj->dev));
+ return 0;
+}
+
+long adf_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ struct adf_file *fpriv = file->private_data;
+ struct adf_obj *obj = fpriv->obj;
+ long ret = -EINVAL;
+
+ dev_dbg(&obj->dev, "%s ioctl %u\n", dev_name(&obj->dev), _IOC_NR(cmd));
+
+ switch (obj->type) {
+ case ADF_OBJ_OVERLAY_ENGINE:
+ ret = adf_overlay_engine_ioctl(adf_obj_to_overlay_engine(obj),
+ fpriv, cmd, arg);
+ break;
+
+ case ADF_OBJ_INTERFACE:
+ ret = adf_interface_ioctl(adf_obj_to_interface(obj), fpriv, cmd,
+ arg);
+ break;
+
+ case ADF_OBJ_DEVICE:
+ ret = adf_device_ioctl(adf_obj_to_device(obj), fpriv, cmd, arg);
+ break;
+ }
+
+ return ret;
+}
+
+static inline bool adf_file_event_available(struct adf_file *fpriv)
+{
+ int head = fpriv->event_head;
+ int tail = fpriv->event_tail;
+ return CIRC_CNT(head, tail, sizeof(fpriv->event_buf)) != 0;
+}
+
+void adf_file_queue_event(struct adf_file *fpriv, struct adf_event *event)
+{
+ int head = fpriv->event_head;
+ int tail = fpriv->event_tail;
+ size_t space = CIRC_SPACE(head, tail, sizeof(fpriv->event_buf));
+ size_t space_to_end =
+ CIRC_SPACE_TO_END(head, tail, sizeof(fpriv->event_buf));
+
+ if (space < event->length) {
+ dev_dbg(&fpriv->obj->dev,
+ "insufficient buffer space for event %u\n",
+ event->type);
+ return;
+ }
+
+ if (space_to_end >= event->length) {
+ memcpy(fpriv->event_buf + head, event, event->length);
+ } else {
+ memcpy(fpriv->event_buf + head, event, space_to_end);
+ memcpy(fpriv->event_buf, (u8 *)event + space_to_end,
+ event->length - space_to_end);
+ }
+
+ smp_wmb();
+ fpriv->event_head = (fpriv->event_head + event->length) &
+ (sizeof(fpriv->event_buf) - 1);
+ wake_up_interruptible_all(&fpriv->event_wait);
+}
+
+static ssize_t adf_file_copy_to_user(struct adf_file *fpriv,
+ char __user *buffer, size_t buffer_size)
+{
+ int head, tail;
+ u8 *event_buf;
+ size_t cnt, cnt_to_end, copy_size = 0;
+ ssize_t ret = 0;
+ unsigned long flags;
+
+ event_buf = kmalloc(min(buffer_size, sizeof(fpriv->event_buf)),
+ GFP_KERNEL);
+ if (!event_buf)
+ return -ENOMEM;
+
+ spin_lock_irqsave(&fpriv->obj->file_lock, flags);
+
+ if (!adf_file_event_available(fpriv))
+ goto out;
+
+ head = fpriv->event_head;
+ tail = fpriv->event_tail;
+
+ cnt = CIRC_CNT(head, tail, sizeof(fpriv->event_buf));
+ cnt_to_end = CIRC_CNT_TO_END(head, tail, sizeof(fpriv->event_buf));
+ copy_size = min(buffer_size, cnt);
+
+ if (cnt_to_end >= copy_size) {
+ memcpy(event_buf, fpriv->event_buf + tail, copy_size);
+ } else {
+ memcpy(event_buf, fpriv->event_buf + tail, cnt_to_end);
+ memcpy(event_buf + cnt_to_end, fpriv->event_buf,
+ copy_size - cnt_to_end);
+ }
+
+ fpriv->event_tail = (fpriv->event_tail + copy_size) &
+ (sizeof(fpriv->event_buf) - 1);
+
+out:
+ spin_unlock_irqrestore(&fpriv->obj->file_lock, flags);
+ if (copy_size) {
+ if (copy_to_user(buffer, event_buf, copy_size))
+ ret = -EFAULT;
+ else
+ ret = copy_size;
+ }
+ kfree(event_buf);
+ return ret;
+}
+
+ssize_t adf_file_read(struct file *filp, char __user *buffer,
+ size_t count, loff_t *offset)
+{
+ struct adf_file *fpriv = filp->private_data;
+ int err;
+
+ err = wait_event_interruptible(fpriv->event_wait,
+ adf_file_event_available(fpriv));
+ if (err < 0)
+ return err;
+
+ return adf_file_copy_to_user(fpriv, buffer, count);
+}
+
+unsigned int adf_file_poll(struct file *filp, struct poll_table_struct *wait)
+{
+ struct adf_file *fpriv = filp->private_data;
+ unsigned int mask = 0;
+
+ poll_wait(filp, &fpriv->event_wait, wait);
+
+ if (adf_file_event_available(fpriv))
+ mask |= POLLIN | POLLRDNORM;
+
+ return mask;
+}
+
+const struct file_operations adf_fops = {
+ .owner = THIS_MODULE,
+ .unlocked_ioctl = adf_file_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = adf_file_compat_ioctl,
+#endif
+ .open = adf_file_open,
+ .release = adf_file_release,
+ .llseek = default_llseek,
+ .read = adf_file_read,
+ .poll = adf_file_poll,
+};
diff --git a/drivers/video/adf/adf_fops.h b/drivers/video/adf/adf_fops.h
new file mode 100644
index 000000000000..90a3a74796d6
--- /dev/null
+++ b/drivers/video/adf/adf_fops.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) 2013 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef __VIDEO_ADF_ADF_FOPS_H
+#define __VIDEO_ADF_ADF_FOPS_H
+
+#include <linux/bitmap.h>
+#include <linux/fs.h>
+
+extern const struct file_operations adf_fops;
+
+struct adf_file {
+ struct list_head head;
+ struct adf_obj *obj;
+
+ DECLARE_BITMAP(event_subscriptions, ADF_EVENT_TYPE_MAX);
+ u8 event_buf[4096];
+ int event_head;
+ int event_tail;
+ wait_queue_head_t event_wait;
+};
+
+void adf_file_queue_event(struct adf_file *file, struct adf_event *event);
+long adf_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+
+#endif /* __VIDEO_ADF_ADF_FOPS_H */
diff --git a/drivers/video/adf/adf_fops32.c b/drivers/video/adf/adf_fops32.c
new file mode 100644
index 000000000000..d299a8161491
--- /dev/null
+++ b/drivers/video/adf/adf_fops32.c
@@ -0,0 +1,217 @@
+/*
+ * Copyright (C) 2013 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/uaccess.h>
+#include <video/adf.h>
+
+#include "adf_fops.h"
+#include "adf_fops32.h"
+
+long adf_compat_post_config(struct file *file,
+ struct adf_post_config32 __user *arg)
+{
+ struct adf_post_config32 cfg32;
+ struct adf_post_config __user *cfg;
+ int ret;
+
+ if (copy_from_user(&cfg32, arg, sizeof(cfg32)))
+ return -EFAULT;
+
+ cfg = compat_alloc_user_space(sizeof(*cfg));
+ if (!access_ok(VERIFY_WRITE, cfg, sizeof(*cfg)))
+ return -EFAULT;
+
+ if (put_user(cfg32.n_interfaces, &cfg->n_interfaces) ||
+ put_user(compat_ptr(cfg32.interfaces),
+ &cfg->interfaces) ||
+ put_user(cfg32.n_bufs, &cfg->n_bufs) ||
+ put_user(compat_ptr(cfg32.bufs), &cfg->bufs) ||
+ put_user(cfg32.custom_data_size,
+ &cfg->custom_data_size) ||
+ put_user(compat_ptr(cfg32.custom_data),
+ &cfg->custom_data))
+ return -EFAULT;
+
+ ret = adf_file_ioctl(file, ADF_POST_CONFIG, (unsigned long)cfg);
+ if (ret < 0)
+ return ret;
+
+ if (copy_in_user(&arg->complete_fence, &cfg->complete_fence,
+ sizeof(cfg->complete_fence)))
+ return -EFAULT;
+
+ return 0;
+}
+
+long adf_compat_get_device_data(struct file *file,
+ struct adf_device_data32 __user *arg)
+{
+ struct adf_device_data32 data32;
+ struct adf_device_data __user *data;
+ int ret;
+
+ if (copy_from_user(&data32, arg, sizeof(data32)))
+ return -EFAULT;
+
+ data = compat_alloc_user_space(sizeof(*data));
+ if (!access_ok(VERIFY_WRITE, data, sizeof(*data)))
+ return -EFAULT;
+
+ if (put_user(data32.n_attachments, &data->n_attachments) ||
+ put_user(compat_ptr(data32.attachments),
+ &data->attachments) ||
+ put_user(data32.n_allowed_attachments,
+ &data->n_allowed_attachments) ||
+ put_user(compat_ptr(data32.allowed_attachments),
+ &data->allowed_attachments) ||
+ put_user(data32.custom_data_size,
+ &data->custom_data_size) ||
+ put_user(compat_ptr(data32.custom_data),
+ &data->custom_data))
+ return -EFAULT;
+
+ ret = adf_file_ioctl(file, ADF_GET_DEVICE_DATA, (unsigned long)data);
+ if (ret < 0)
+ return ret;
+
+ if (copy_in_user(arg->name, data->name, sizeof(arg->name)) ||
+ copy_in_user(&arg->n_attachments, &data->n_attachments,
+ sizeof(arg->n_attachments)) ||
+ copy_in_user(&arg->n_allowed_attachments,
+ &data->n_allowed_attachments,
+ sizeof(arg->n_allowed_attachments)) ||
+ copy_in_user(&arg->custom_data_size,
+ &data->custom_data_size,
+ sizeof(arg->custom_data_size)))
+ return -EFAULT;
+
+ return 0;
+}
+
+long adf_compat_get_interface_data(struct file *file,
+ struct adf_interface_data32 __user *arg)
+{
+ struct adf_interface_data32 data32;
+ struct adf_interface_data __user *data;
+ int ret;
+
+ if (copy_from_user(&data32, arg, sizeof(data32)))
+ return -EFAULT;
+
+ data = compat_alloc_user_space(sizeof(*data));
+ if (!access_ok(VERIFY_WRITE, data, sizeof(*data)))
+ return -EFAULT;
+
+ if (put_user(data32.n_available_modes, &data->n_available_modes) ||
+ put_user(compat_ptr(data32.available_modes),
+ &data->available_modes) ||
+ put_user(data32.custom_data_size,
+ &data->custom_data_size) ||
+ put_user(compat_ptr(data32.custom_data),
+ &data->custom_data))
+ return -EFAULT;
+
+ ret = adf_file_ioctl(file, ADF_GET_INTERFACE_DATA, (unsigned long)data);
+ if (ret < 0)
+ return ret;
+
+ if (copy_in_user(arg->name, data->name, sizeof(arg->name)) ||
+ copy_in_user(&arg->type, &data->type,
+ sizeof(arg->type)) ||
+ copy_in_user(&arg->id, &data->id, sizeof(arg->id)) ||
+ copy_in_user(&arg->flags, &data->flags,
+ sizeof(arg->flags)) ||
+ copy_in_user(&arg->dpms_state, &data->dpms_state,
+ sizeof(arg->dpms_state)) ||
+ copy_in_user(&arg->hotplug_detect,
+ &data->hotplug_detect,
+ sizeof(arg->hotplug_detect)) ||
+ copy_in_user(&arg->width_mm, &data->width_mm,
+ sizeof(arg->width_mm)) ||
+ copy_in_user(&arg->height_mm, &data->height_mm,
+ sizeof(arg->height_mm)) ||
+ copy_in_user(&arg->current_mode, &data->current_mode,
+ sizeof(arg->current_mode)) ||
+ copy_in_user(&arg->n_available_modes,
+ &data->n_available_modes,
+ sizeof(arg->n_available_modes)) ||
+ copy_in_user(&arg->custom_data_size,
+ &data->custom_data_size,
+ sizeof(arg->custom_data_size)))
+ return -EFAULT;
+
+ return 0;
+}
+
+long adf_compat_get_overlay_engine_data(struct file *file,
+ struct adf_overlay_engine_data32 __user *arg)
+{
+ struct adf_overlay_engine_data32 data32;
+ struct adf_overlay_engine_data __user *data;
+ int ret;
+
+ if (copy_from_user(&data32, arg, sizeof(data32)))
+ return -EFAULT;
+
+ data = compat_alloc_user_space(sizeof(*data));
+ if (!access_ok(VERIFY_WRITE, data, sizeof(*data)))
+ return -EFAULT;
+
+ if (put_user(data32.n_supported_formats, &data->n_supported_formats) ||
+ put_user(compat_ptr(data32.supported_formats),
+ &data->supported_formats) ||
+ put_user(data32.custom_data_size,
+ &data->custom_data_size) ||
+ put_user(compat_ptr(data32.custom_data),
+ &data->custom_data))
+ return -EFAULT;
+
+ ret = adf_file_ioctl(file, ADF_GET_OVERLAY_ENGINE_DATA,
+ (unsigned long)data);
+ if (ret < 0)
+ return ret;
+
+ if (copy_in_user(arg->name, data->name, sizeof(arg->name)) ||
+ copy_in_user(&arg->n_supported_formats,
+ &data->n_supported_formats,
+ sizeof(arg->n_supported_formats)) ||
+ copy_in_user(&arg->custom_data_size,
+ &data->custom_data_size,
+ sizeof(arg->custom_data_size)))
+ return -EFAULT;
+
+ return 0;
+}
+
+long adf_file_compat_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ switch (cmd) {
+ case ADF_POST_CONFIG32:
+ return adf_compat_post_config(file, compat_ptr(arg));
+
+ case ADF_GET_DEVICE_DATA32:
+ return adf_compat_get_device_data(file, compat_ptr(arg));
+
+ case ADF_GET_INTERFACE_DATA32:
+ return adf_compat_get_interface_data(file, compat_ptr(arg));
+
+ case ADF_GET_OVERLAY_ENGINE_DATA32:
+ return adf_compat_get_overlay_engine_data(file,
+ compat_ptr(arg));
+
+ default:
+ return adf_file_ioctl(file, cmd, arg);
+ }
+}
diff --git a/drivers/video/adf/adf_fops32.h b/drivers/video/adf/adf_fops32.h
new file mode 100644
index 000000000000..64034ce33a6b
--- /dev/null
+++ b/drivers/video/adf/adf_fops32.h
@@ -0,0 +1,78 @@
+#ifndef __VIDEO_ADF_ADF_FOPS32_H
+#define __VIDEO_ADF_ADF_FOPS32_H
+
+#include <linux/compat.h>
+#include <linux/ioctl.h>
+
+#include <video/adf.h>
+
+#define ADF_POST_CONFIG32 \
+ _IOW(ADF_IOCTL_TYPE, 2, struct adf_post_config32)
+#define ADF_GET_DEVICE_DATA32 \
+ _IOR(ADF_IOCTL_TYPE, 4, struct adf_device_data32)
+#define ADF_GET_INTERFACE_DATA32 \
+ _IOR(ADF_IOCTL_TYPE, 5, struct adf_interface_data32)
+#define ADF_GET_OVERLAY_ENGINE_DATA32 \
+ _IOR(ADF_IOCTL_TYPE, 6, struct adf_overlay_engine_data32)
+
+struct adf_post_config32 {
+ compat_size_t n_interfaces;
+ compat_uptr_t interfaces;
+
+ compat_size_t n_bufs;
+ compat_uptr_t bufs;
+
+ compat_size_t custom_data_size;
+ compat_uptr_t custom_data;
+
+ __s32 complete_fence;
+};
+
+struct adf_device_data32 {
+ char name[ADF_NAME_LEN];
+
+ compat_size_t n_attachments;
+ compat_uptr_t attachments;
+
+ compat_size_t n_allowed_attachments;
+ compat_uptr_t allowed_attachments;
+
+ compat_size_t custom_data_size;
+ compat_uptr_t custom_data;
+};
+
+struct adf_interface_data32 {
+ char name[ADF_NAME_LEN];
+
+ __u8 type;
+ __u32 id;
+ /* e.g. type=ADF_INTF_TYPE_DSI, id=1 => DSI.1 */
+ __u32 flags;
+
+ __u8 dpms_state;
+ __u8 hotplug_detect;
+ __u16 width_mm;
+ __u16 height_mm;
+
+ struct drm_mode_modeinfo current_mode;
+ compat_size_t n_available_modes;
+ compat_uptr_t available_modes;
+
+ compat_size_t custom_data_size;
+ compat_uptr_t custom_data;
+};
+
+struct adf_overlay_engine_data32 {
+ char name[ADF_NAME_LEN];
+
+ compat_size_t n_supported_formats;
+ compat_uptr_t supported_formats;
+
+ compat_size_t custom_data_size;
+ compat_uptr_t custom_data;
+};
+
+long adf_file_compat_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg);
+
+#endif /* __VIDEO_ADF_ADF_FOPS32_H */
diff --git a/drivers/video/adf/adf_format.c b/drivers/video/adf/adf_format.c
new file mode 100644
index 000000000000..e3f22c7c85d9
--- /dev/null
+++ b/drivers/video/adf/adf_format.c
@@ -0,0 +1,280 @@
+/*
+ * Copyright (C) 2013 Google, Inc.
+ * modified from drivers/gpu/drm/drm_crtc.c
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <drm/drm_fourcc.h>
+#include <video/adf_format.h>
+
+bool adf_format_is_standard(u32 format)
+{
+ switch (format) {
+ case DRM_FORMAT_C8:
+ case DRM_FORMAT_RGB332:
+ case DRM_FORMAT_BGR233:
+ case DRM_FORMAT_XRGB4444:
+ case DRM_FORMAT_XBGR4444:
+ case DRM_FORMAT_RGBX4444:
+ case DRM_FORMAT_BGRX4444:
+ case DRM_FORMAT_ARGB4444:
+ case DRM_FORMAT_ABGR4444:
+ case DRM_FORMAT_RGBA4444:
+ case DRM_FORMAT_BGRA4444:
+ case DRM_FORMAT_XRGB1555:
+ case DRM_FORMAT_XBGR1555:
+ case DRM_FORMAT_RGBX5551:
+ case DRM_FORMAT_BGRX5551:
+ case DRM_FORMAT_ARGB1555:
+ case DRM_FORMAT_ABGR1555:
+ case DRM_FORMAT_RGBA5551:
+ case DRM_FORMAT_BGRA5551:
+ case DRM_FORMAT_RGB565:
+ case DRM_FORMAT_BGR565:
+ case DRM_FORMAT_RGB888:
+ case DRM_FORMAT_BGR888:
+ case DRM_FORMAT_XRGB8888:
+ case DRM_FORMAT_XBGR8888:
+ case DRM_FORMAT_RGBX8888:
+ case DRM_FORMAT_BGRX8888:
+ case DRM_FORMAT_ARGB8888:
+ case DRM_FORMAT_ABGR8888:
+ case DRM_FORMAT_RGBA8888:
+ case DRM_FORMAT_BGRA8888:
+ case DRM_FORMAT_XRGB2101010:
+ case DRM_FORMAT_XBGR2101010:
+ case DRM_FORMAT_RGBX1010102:
+ case DRM_FORMAT_BGRX1010102:
+ case DRM_FORMAT_ARGB2101010:
+ case DRM_FORMAT_ABGR2101010:
+ case DRM_FORMAT_RGBA1010102:
+ case DRM_FORMAT_BGRA1010102:
+ case DRM_FORMAT_YUYV:
+ case DRM_FORMAT_YVYU:
+ case DRM_FORMAT_UYVY:
+ case DRM_FORMAT_VYUY:
+ case DRM_FORMAT_AYUV:
+ case DRM_FORMAT_NV12:
+ case DRM_FORMAT_NV21:
+ case DRM_FORMAT_NV16:
+ case DRM_FORMAT_NV61:
+ case DRM_FORMAT_YUV410:
+ case DRM_FORMAT_YVU410:
+ case DRM_FORMAT_YUV411:
+ case DRM_FORMAT_YVU411:
+ case DRM_FORMAT_YUV420:
+ case DRM_FORMAT_YVU420:
+ case DRM_FORMAT_YUV422:
+ case DRM_FORMAT_YVU422:
+ case DRM_FORMAT_YUV444:
+ case DRM_FORMAT_YVU444:
+ return true;
+ default:
+ return false;
+ }
+}
+EXPORT_SYMBOL(adf_format_is_standard);
+
+bool adf_format_is_rgb(u32 format)
+{
+ switch (format) {
+ case DRM_FORMAT_C8:
+ case DRM_FORMAT_RGB332:
+ case DRM_FORMAT_BGR233:
+ case DRM_FORMAT_XRGB1555:
+ case DRM_FORMAT_XBGR1555:
+ case DRM_FORMAT_RGBX5551:
+ case DRM_FORMAT_BGRX5551:
+ case DRM_FORMAT_ARGB1555:
+ case DRM_FORMAT_ABGR1555:
+ case DRM_FORMAT_RGBA5551:
+ case DRM_FORMAT_BGRA5551:
+ case DRM_FORMAT_RGB565:
+ case DRM_FORMAT_BGR565:
+ case DRM_FORMAT_RGB888:
+ case DRM_FORMAT_BGR888:
+ case DRM_FORMAT_XRGB8888:
+ case DRM_FORMAT_XBGR8888:
+ case DRM_FORMAT_RGBX8888:
+ case DRM_FORMAT_BGRX8888:
+ case DRM_FORMAT_XRGB2101010:
+ case DRM_FORMAT_XBGR2101010:
+ case DRM_FORMAT_RGBX1010102:
+ case DRM_FORMAT_BGRX1010102:
+ case DRM_FORMAT_ARGB2101010:
+ case DRM_FORMAT_ABGR2101010:
+ case DRM_FORMAT_RGBA1010102:
+ case DRM_FORMAT_BGRA1010102:
+ case DRM_FORMAT_ARGB8888:
+ case DRM_FORMAT_ABGR8888:
+ case DRM_FORMAT_RGBA8888:
+ case DRM_FORMAT_BGRA8888:
+ return true;
+
+ default:
+ return false;
+ }
+}
+EXPORT_SYMBOL(adf_format_is_rgb);
+
+u8 adf_format_num_planes(u32 format)
+{
+ switch (format) {
+ case DRM_FORMAT_YUV410:
+ case DRM_FORMAT_YVU410:
+ case DRM_FORMAT_YUV411:
+ case DRM_FORMAT_YVU411:
+ case DRM_FORMAT_YUV420:
+ case DRM_FORMAT_YVU420:
+ case DRM_FORMAT_YUV422:
+ case DRM_FORMAT_YVU422:
+ case DRM_FORMAT_YUV444:
+ case DRM_FORMAT_YVU444:
+ return 3;
+ case DRM_FORMAT_NV12:
+ case DRM_FORMAT_NV21:
+ case DRM_FORMAT_NV16:
+ case DRM_FORMAT_NV61:
+ return 2;
+ default:
+ return 1;
+ }
+}
+EXPORT_SYMBOL(adf_format_num_planes);
+
+u8 adf_format_bpp(u32 format)
+{
+ switch (format) {
+ case DRM_FORMAT_C8:
+ case DRM_FORMAT_RGB332:
+ case DRM_FORMAT_BGR233:
+ return 8;
+
+ case DRM_FORMAT_XRGB1555:
+ case DRM_FORMAT_XBGR1555:
+ case DRM_FORMAT_RGBX5551:
+ case DRM_FORMAT_BGRX5551:
+ case DRM_FORMAT_ARGB1555:
+ case DRM_FORMAT_ABGR1555:
+ case DRM_FORMAT_RGBA5551:
+ case DRM_FORMAT_BGRA5551:
+ case DRM_FORMAT_RGB565:
+ case DRM_FORMAT_BGR565:
+ return 16;
+
+ case DRM_FORMAT_RGB888:
+ case DRM_FORMAT_BGR888:
+ return 24;
+
+ case DRM_FORMAT_XRGB8888:
+ case DRM_FORMAT_XBGR8888:
+ case DRM_FORMAT_RGBX8888:
+ case DRM_FORMAT_BGRX8888:
+ case DRM_FORMAT_XRGB2101010:
+ case DRM_FORMAT_XBGR2101010:
+ case DRM_FORMAT_RGBX1010102:
+ case DRM_FORMAT_BGRX1010102:
+ case DRM_FORMAT_ARGB2101010:
+ case DRM_FORMAT_ABGR2101010:
+ case DRM_FORMAT_RGBA1010102:
+ case DRM_FORMAT_BGRA1010102:
+ case DRM_FORMAT_ARGB8888:
+ case DRM_FORMAT_ABGR8888:
+ case DRM_FORMAT_RGBA8888:
+ case DRM_FORMAT_BGRA8888:
+ return 32;
+
+ default:
+ pr_debug("%s: unsupported pixel format %u\n", __func__, format);
+ return 0;
+ }
+}
+EXPORT_SYMBOL(adf_format_bpp);
+
+u8 adf_format_plane_cpp(u32 format, int plane)
+{
+ if (plane >= adf_format_num_planes(format))
+ return 0;
+
+ switch (format) {
+ case DRM_FORMAT_YUYV:
+ case DRM_FORMAT_YVYU:
+ case DRM_FORMAT_UYVY:
+ case DRM_FORMAT_VYUY:
+ return 2;
+ case DRM_FORMAT_NV12:
+ case DRM_FORMAT_NV21:
+ case DRM_FORMAT_NV16:
+ case DRM_FORMAT_NV61:
+ return plane ? 2 : 1;
+ case DRM_FORMAT_YUV410:
+ case DRM_FORMAT_YVU410:
+ case DRM_FORMAT_YUV411:
+ case DRM_FORMAT_YVU411:
+ case DRM_FORMAT_YUV420:
+ case DRM_FORMAT_YVU420:
+ case DRM_FORMAT_YUV422:
+ case DRM_FORMAT_YVU422:
+ case DRM_FORMAT_YUV444:
+ case DRM_FORMAT_YVU444:
+ return 1;
+ default:
+ return adf_format_bpp(format) / 8;
+ }
+}
+EXPORT_SYMBOL(adf_format_plane_cpp);
+
+u8 adf_format_horz_chroma_subsampling(u32 format)
+{
+ switch (format) {
+ case DRM_FORMAT_YUV411:
+ case DRM_FORMAT_YVU411:
+ case DRM_FORMAT_YUV410:
+ case DRM_FORMAT_YVU410:
+ return 4;
+ case DRM_FORMAT_YUYV:
+ case DRM_FORMAT_YVYU:
+ case DRM_FORMAT_UYVY:
+ case DRM_FORMAT_VYUY:
+ case DRM_FORMAT_NV12:
+ case DRM_FORMAT_NV21:
+ case DRM_FORMAT_NV16:
+ case DRM_FORMAT_NV61:
+ case DRM_FORMAT_YUV422:
+ case DRM_FORMAT_YVU422:
+ case DRM_FORMAT_YUV420:
+ case DRM_FORMAT_YVU420:
+ return 2;
+ default:
+ return 1;
+ }
+}
+EXPORT_SYMBOL(adf_format_horz_chroma_subsampling);
+
+u8 adf_format_vert_chroma_subsampling(u32 format)
+{
+ switch (format) {
+ case DRM_FORMAT_YUV410:
+ case DRM_FORMAT_YVU410:
+ return 4;
+ case DRM_FORMAT_YUV420:
+ case DRM_FORMAT_YVU420:
+ case DRM_FORMAT_NV12:
+ case DRM_FORMAT_NV21:
+ return 2;
+ default:
+ return 1;
+ }
+}
+EXPORT_SYMBOL(adf_format_vert_chroma_subsampling);
diff --git a/drivers/video/adf/adf_memblock.c b/drivers/video/adf/adf_memblock.c
new file mode 100644
index 000000000000..ab583f838db2
--- /dev/null
+++ b/drivers/video/adf/adf_memblock.c
@@ -0,0 +1,160 @@
+/*
+ * Copyright (C) 2013 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/dma-buf.h>
+#include <linux/highmem.h>
+#include <linux/memblock.h>
+#include <linux/slab.h>
+
+struct adf_memblock_pdata {
+ phys_addr_t base;
+};
+
+static struct sg_table *adf_memblock_map(struct dma_buf_attachment *attach,
+ enum dma_data_direction direction)
+{
+ struct adf_memblock_pdata *pdata = attach->dmabuf->priv;
+ unsigned long pfn = PFN_DOWN(pdata->base);
+ struct page *page = pfn_to_page(pfn);
+ struct sg_table *table;
+ int nents, ret;
+
+ table = kzalloc(sizeof(*table), GFP_KERNEL);
+ if (!table)
+ return ERR_PTR(-ENOMEM);
+
+ ret = sg_alloc_table(table, 1, GFP_KERNEL);
+ if (ret < 0)
+ goto err_alloc;
+
+ sg_set_page(table->sgl, page, attach->dmabuf->size, 0);
+
+ nents = dma_map_sg(attach->dev, table->sgl, 1, direction);
+ if (!nents) {
+ ret = -EINVAL;
+ goto err_map;
+ }
+
+ return table;
+
+err_map:
+ sg_free_table(table);
+err_alloc:
+ kfree(table);
+ return ERR_PTR(ret);
+}
+
+static void adf_memblock_unmap(struct dma_buf_attachment *attach,
+ struct sg_table *table, enum dma_data_direction direction)
+{
+ dma_unmap_sg(attach->dev, table->sgl, 1, direction);
+ sg_free_table(table);
+}
+
+static void __init_memblock adf_memblock_release(struct dma_buf *buf)
+{
+ struct adf_memblock_pdata *pdata = buf->priv;
+ int err = memblock_free(pdata->base, buf->size);
+
+ if (err < 0)
+ pr_warn("%s: freeing memblock failed: %d\n", __func__, err);
+ kfree(pdata);
+}
+
+static void *adf_memblock_do_kmap(struct dma_buf *buf, unsigned long pgoffset,
+ bool atomic)
+{
+ struct adf_memblock_pdata *pdata = buf->priv;
+ unsigned long pfn = PFN_DOWN(pdata->base) + pgoffset;
+ struct page *page = pfn_to_page(pfn);
+
+ if (atomic)
+ return kmap_atomic(page);
+ else
+ return kmap(page);
+}
+
+static void *adf_memblock_kmap_atomic(struct dma_buf *buf,
+ unsigned long pgoffset)
+{
+ return adf_memblock_do_kmap(buf, pgoffset, true);
+}
+
+static void adf_memblock_kunmap_atomic(struct dma_buf *buf,
+ unsigned long pgoffset, void *vaddr)
+{
+ kunmap_atomic(vaddr);
+}
+
+static void *adf_memblock_kmap(struct dma_buf *buf, unsigned long pgoffset)
+{
+ return adf_memblock_do_kmap(buf, pgoffset, false);
+}
+
+static void adf_memblock_kunmap(struct dma_buf *buf, unsigned long pgoffset,
+ void *vaddr)
+{
+ kunmap(vaddr);
+}
+
+static int adf_memblock_mmap(struct dma_buf *buf, struct vm_area_struct *vma)
+{
+ struct adf_memblock_pdata *pdata = buf->priv;
+
+ return remap_pfn_range(vma, vma->vm_start, PFN_DOWN(pdata->base),
+ vma->vm_end - vma->vm_start, vma->vm_page_prot);
+}
+
+struct dma_buf_ops adf_memblock_ops = {
+ .map_dma_buf = adf_memblock_map,
+ .unmap_dma_buf = adf_memblock_unmap,
+ .release = adf_memblock_release,
+ .kmap_atomic = adf_memblock_kmap_atomic,
+ .kunmap_atomic = adf_memblock_kunmap_atomic,
+ .kmap = adf_memblock_kmap,
+ .kunmap = adf_memblock_kunmap,
+ .mmap = adf_memblock_mmap,
+};
+
+/**
+ * adf_memblock_export - export a memblock reserved area as a dma-buf
+ *
+ * @base: base physical address
+ * @size: memblock size
+ * @flags: mode flags for the dma-buf's file
+ *
+ * @base and @size must be page-aligned.
+ *
+ * Returns a dma-buf on success or ERR_PTR(-errno) on failure.
+ */
+struct dma_buf *adf_memblock_export(phys_addr_t base, size_t size, int flags)
+{
+ struct adf_memblock_pdata *pdata;
+ struct dma_buf *buf;
+
+ if (PAGE_ALIGN(base) != base || PAGE_ALIGN(size) != size)
+ return ERR_PTR(-EINVAL);
+
+ pdata = kzalloc(sizeof(*pdata), GFP_KERNEL);
+ if (!pdata)
+ return ERR_PTR(-ENOMEM);
+
+ pdata->base = base;
+ buf = dma_buf_export(pdata, &adf_memblock_ops, size, flags, NULL);
+ if (IS_ERR(buf))
+ kfree(pdata);
+
+ return buf;
+}
+EXPORT_SYMBOL(adf_memblock_export);
diff --git a/drivers/video/adf/adf_sysfs.c b/drivers/video/adf/adf_sysfs.c
new file mode 100644
index 000000000000..8c659c71ffa8
--- /dev/null
+++ b/drivers/video/adf/adf_sysfs.c
@@ -0,0 +1,296 @@
+/*
+ * Copyright (C) 2013 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <video/adf_client.h>
+
+#include "adf.h"
+#include "adf_fops.h"
+#include "adf_sysfs.h"
+
+static struct class *adf_class;
+static int adf_major;
+static DEFINE_IDR(adf_minors);
+
+#define dev_to_adf_interface(p) \
+ adf_obj_to_interface(container_of(p, struct adf_obj, dev))
+
+static ssize_t dpms_state_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct adf_interface *intf = dev_to_adf_interface(dev);
+ return scnprintf(buf, PAGE_SIZE, "%u\n",
+ adf_interface_dpms_state(intf));
+}
+
+static ssize_t dpms_state_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct adf_interface *intf = dev_to_adf_interface(dev);
+ u8 dpms_state;
+ int err;
+
+ err = kstrtou8(buf, 0, &dpms_state);
+ if (err < 0)
+ return err;
+
+ err = adf_interface_blank(intf, dpms_state);
+ if (err < 0)
+ return err;
+
+ return count;
+}
+
+static ssize_t current_mode_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct adf_interface *intf = dev_to_adf_interface(dev);
+ struct drm_mode_modeinfo mode;
+
+ adf_interface_current_mode(intf, &mode);
+
+ if (mode.name[0]) {
+ return scnprintf(buf, PAGE_SIZE, "%s\n", mode.name);
+ } else {
+ bool interlaced = !!(mode.flags & DRM_MODE_FLAG_INTERLACE);
+ return scnprintf(buf, PAGE_SIZE, "%ux%u%s\n", mode.hdisplay,
+ mode.vdisplay, interlaced ? "i" : "");
+ }
+}
+
+static ssize_t type_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct adf_interface *intf = dev_to_adf_interface(dev);
+ return scnprintf(buf, PAGE_SIZE, "%s\n",
+ adf_interface_type_str(intf));
+}
+
+static ssize_t vsync_timestamp_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct adf_interface *intf = dev_to_adf_interface(dev);
+ ktime_t timestamp;
+ unsigned long flags;
+
+ read_lock_irqsave(&intf->vsync_lock, flags);
+ memcpy(&timestamp, &intf->vsync_timestamp, sizeof(timestamp));
+ read_unlock_irqrestore(&intf->vsync_lock, flags);
+
+ return scnprintf(buf, PAGE_SIZE, "%llu\n", ktime_to_ns(timestamp));
+}
+
+static ssize_t hotplug_detect_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct adf_interface *intf = dev_to_adf_interface(dev);
+ return scnprintf(buf, PAGE_SIZE, "%u\n", intf->hotplug_detect);
+}
+
+static struct device_attribute adf_interface_attrs[] = {
+ __ATTR(dpms_state, S_IRUGO|S_IWUSR, dpms_state_show, dpms_state_store),
+ __ATTR_RO(current_mode),
+ __ATTR_RO(hotplug_detect),
+ __ATTR_RO(type),
+ __ATTR_RO(vsync_timestamp),
+};
+
+int adf_obj_sysfs_init(struct adf_obj *obj, struct device *parent)
+{
+ int ret = idr_alloc(&adf_minors, obj, 0, 0, GFP_KERNEL);
+ if (ret < 0) {
+ pr_err("%s: allocating adf minor failed: %d\n", __func__,
+ ret);
+ return ret;
+ }
+
+ obj->minor = ret;
+ obj->dev.parent = parent;
+ obj->dev.class = adf_class;
+ obj->dev.devt = MKDEV(adf_major, obj->minor);
+
+ ret = device_register(&obj->dev);
+ if (ret < 0) {
+ pr_err("%s: registering adf object failed: %d\n", __func__,
+ ret);
+ goto err_device_register;
+ }
+
+ return 0;
+
+err_device_register:
+ idr_remove(&adf_minors, obj->minor);
+ return ret;
+}
+
+static char *adf_device_devnode(struct device *dev, umode_t *mode,
+ kuid_t *uid, kgid_t *gid)
+{
+ struct adf_obj *obj = container_of(dev, struct adf_obj, dev);
+ return kasprintf(GFP_KERNEL, "adf%d", obj->id);
+}
+
+static char *adf_interface_devnode(struct device *dev, umode_t *mode,
+ kuid_t *uid, kgid_t *gid)
+{
+ struct adf_obj *obj = container_of(dev, struct adf_obj, dev);
+ struct adf_interface *intf = adf_obj_to_interface(obj);
+ struct adf_device *parent = adf_interface_parent(intf);
+ return kasprintf(GFP_KERNEL, "adf-interface%d.%d",
+ parent->base.id, intf->base.id);
+}
+
+static char *adf_overlay_engine_devnode(struct device *dev, umode_t *mode,
+ kuid_t *uid, kgid_t *gid)
+{
+ struct adf_obj *obj = container_of(dev, struct adf_obj, dev);
+ struct adf_overlay_engine *eng = adf_obj_to_overlay_engine(obj);
+ struct adf_device *parent = adf_overlay_engine_parent(eng);
+ return kasprintf(GFP_KERNEL, "adf-overlay-engine%d.%d",
+ parent->base.id, eng->base.id);
+}
+
+static void adf_noop_release(struct device *dev)
+{
+}
+
+static struct device_type adf_device_type = {
+ .name = "adf_device",
+ .devnode = adf_device_devnode,
+ .release = adf_noop_release,
+};
+
+static struct device_type adf_interface_type = {
+ .name = "adf_interface",
+ .devnode = adf_interface_devnode,
+ .release = adf_noop_release,
+};
+
+static struct device_type adf_overlay_engine_type = {
+ .name = "adf_overlay_engine",
+ .devnode = adf_overlay_engine_devnode,
+ .release = adf_noop_release,
+};
+
+int adf_device_sysfs_init(struct adf_device *dev)
+{
+ dev->base.dev.type = &adf_device_type;
+ dev_set_name(&dev->base.dev, "%s", dev->base.name);
+ return adf_obj_sysfs_init(&dev->base, dev->dev);
+}
+
+int adf_interface_sysfs_init(struct adf_interface *intf)
+{
+ struct adf_device *parent = adf_interface_parent(intf);
+ size_t i, j;
+ int ret;
+
+ intf->base.dev.type = &adf_interface_type;
+ dev_set_name(&intf->base.dev, "%s-interface%d", parent->base.name,
+ intf->base.id);
+
+ ret = adf_obj_sysfs_init(&intf->base, &parent->base.dev);
+ if (ret < 0)
+ return ret;
+
+ for (i = 0; i < ARRAY_SIZE(adf_interface_attrs); i++) {
+ ret = device_create_file(&intf->base.dev,
+ &adf_interface_attrs[i]);
+ if (ret < 0) {
+ dev_err(&intf->base.dev, "creating sysfs attribute %s failed: %d\n",
+ adf_interface_attrs[i].attr.name, ret);
+ goto err;
+ }
+ }
+
+ return 0;
+
+err:
+ for (j = 0; j < i; j++)
+ device_remove_file(&intf->base.dev, &adf_interface_attrs[j]);
+ return ret;
+}
+
+int adf_overlay_engine_sysfs_init(struct adf_overlay_engine *eng)
+{
+ struct adf_device *parent = adf_overlay_engine_parent(eng);
+
+ eng->base.dev.type = &adf_overlay_engine_type;
+ dev_set_name(&eng->base.dev, "%s-overlay-engine%d", parent->base.name,
+ eng->base.id);
+
+ return adf_obj_sysfs_init(&eng->base, &parent->base.dev);
+}
+
+struct adf_obj *adf_obj_sysfs_find(int minor)
+{
+ return idr_find(&adf_minors, minor);
+}
+
+void adf_obj_sysfs_destroy(struct adf_obj *obj)
+{
+ idr_remove(&adf_minors, obj->minor);
+ device_unregister(&obj->dev);
+}
+
+void adf_device_sysfs_destroy(struct adf_device *dev)
+{
+ adf_obj_sysfs_destroy(&dev->base);
+}
+
+void adf_interface_sysfs_destroy(struct adf_interface *intf)
+{
+ size_t i;
+
+ for (i = 0; i < ARRAY_SIZE(adf_interface_attrs); i++)
+ device_remove_file(&intf->base.dev, &adf_interface_attrs[i]);
+ adf_obj_sysfs_destroy(&intf->base);
+}
+
+void adf_overlay_engine_sysfs_destroy(struct adf_overlay_engine *eng)
+{
+ adf_obj_sysfs_destroy(&eng->base);
+}
+
+int adf_sysfs_init(void)
+{
+ struct class *class;
+ int ret;
+
+ class = class_create(THIS_MODULE, "adf");
+ if (IS_ERR(class)) {
+ ret = PTR_ERR(class);
+ pr_err("%s: creating class failed: %d\n", __func__, ret);
+ return ret;
+ }
+
+ ret = register_chrdev(0, "adf", &adf_fops);
+ if (ret < 0) {
+ pr_err("%s: registering device failed: %d\n", __func__, ret);
+ goto err_chrdev;
+ }
+
+ adf_class = class;
+ adf_major = ret;
+ return 0;
+
+err_chrdev:
+ class_destroy(adf_class);
+ return ret;
+}
+
+void adf_sysfs_destroy(void)
+{
+ idr_destroy(&adf_minors);
+ class_destroy(adf_class);
+}
diff --git a/drivers/video/adf/adf_sysfs.h b/drivers/video/adf/adf_sysfs.h
new file mode 100644
index 000000000000..0613ac364f8d
--- /dev/null
+++ b/drivers/video/adf/adf_sysfs.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2013 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef __VIDEO_ADF_ADF_SYSFS_H
+#define __VIDEO_ADF_ADF_SYSFS_H
+
+struct adf_device;
+struct adf_interface;
+struct adf_overlay_engine;
+
+int adf_device_sysfs_init(struct adf_device *dev);
+void adf_device_sysfs_destroy(struct adf_device *dev);
+int adf_interface_sysfs_init(struct adf_interface *intf);
+void adf_interface_sysfs_destroy(struct adf_interface *intf);
+int adf_overlay_engine_sysfs_init(struct adf_overlay_engine *eng);
+void adf_overlay_engine_sysfs_destroy(struct adf_overlay_engine *eng);
+struct adf_obj *adf_obj_sysfs_find(int minor);
+
+int adf_sysfs_init(void);
+void adf_sysfs_destroy(void);
+
+#endif /* __VIDEO_ADF_ADF_SYSFS_H */
diff --git a/drivers/video/adf/adf_trace.h b/drivers/video/adf/adf_trace.h
new file mode 100644
index 000000000000..3cb2a84d728c
--- /dev/null
+++ b/drivers/video/adf/adf_trace.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (C) 2013 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM adf
+
+#if !defined(__VIDEO_ADF_ADF_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __VIDEO_ADF_ADF_TRACE_H
+
+#include <linux/tracepoint.h>
+#include <video/adf.h>
+
+TRACE_EVENT(adf_event,
+ TP_PROTO(struct adf_obj *obj, enum adf_event_type type),
+ TP_ARGS(obj, type),
+
+ TP_STRUCT__entry(
+ __string(name, obj->name)
+ __field(enum adf_event_type, type)
+ __array(char, type_str, 32)
+ ),
+ TP_fast_assign(
+ __assign_str(name, obj->name);
+ __entry->type = type;
+ strlcpy(__entry->type_str, adf_event_type_str(obj, type),
+ sizeof(__entry->type_str));
+ ),
+ TP_printk("obj=%s type=%u (%s)",
+ __get_str(name),
+ __entry->type,
+ __entry->type_str)
+);
+
+TRACE_EVENT(adf_event_enable,
+ TP_PROTO(struct adf_obj *obj, enum adf_event_type type),
+ TP_ARGS(obj, type),
+
+ TP_STRUCT__entry(
+ __string(name, obj->name)
+ __field(enum adf_event_type, type)
+ __array(char, type_str, 32)
+ ),
+ TP_fast_assign(
+ __assign_str(name, obj->name);
+ __entry->type = type;
+ strlcpy(__entry->type_str, adf_event_type_str(obj, type),
+ sizeof(__entry->type_str));
+ ),
+ TP_printk("obj=%s type=%u (%s)",
+ __get_str(name),
+ __entry->type,
+ __entry->type_str)
+);
+
+TRACE_EVENT(adf_event_disable,
+ TP_PROTO(struct adf_obj *obj, enum adf_event_type type),
+ TP_ARGS(obj, type),
+
+ TP_STRUCT__entry(
+ __string(name, obj->name)
+ __field(enum adf_event_type, type)
+ __array(char, type_str, 32)
+ ),
+ TP_fast_assign(
+ __assign_str(name, obj->name);
+ __entry->type = type;
+ strlcpy(__entry->type_str, adf_event_type_str(obj, type),
+ sizeof(__entry->type_str));
+ ),
+ TP_printk("obj=%s type=%u (%s)",
+ __get_str(name),
+ __entry->type,
+ __entry->type_str)
+);
+
+#endif /* __VIDEO_ADF_ADF_TRACE_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE adf_trace
+#include <trace/define_trace.h>
diff --git a/drivers/w1/masters/ds2482.c b/drivers/w1/masters/ds2482.c
index e76a9b39abb2..5e505d496f92 100644
--- a/drivers/w1/masters/ds2482.c
+++ b/drivers/w1/masters/ds2482.c
@@ -18,6 +18,8 @@
#include <linux/slab.h>
#include <linux/i2c.h>
#include <linux/delay.h>
+#include <linux/gpio.h>
+#include <linux/platform_data/ds2482.h>
#include <asm/delay.h>
#include "../w1.h"
@@ -84,7 +86,8 @@ static const u8 ds2482_chan_rd[8] =
static int ds2482_probe(struct i2c_client *client,
const struct i2c_device_id *id);
static int ds2482_remove(struct i2c_client *client);
-
+static int ds2482_suspend(struct device *dev);
+static int ds2482_resume(struct device *dev);
/**
* Driver data (common to all clients)
@@ -94,10 +97,16 @@ static const struct i2c_device_id ds2482_id[] = {
{ }
};
+static const struct dev_pm_ops ds2482_pm_ops = {
+ .suspend = ds2482_suspend,
+ .resume = ds2482_resume,
+};
+
static struct i2c_driver ds2482_driver = {
.driver = {
.owner = THIS_MODULE,
.name = "ds2482",
+ .pm = &ds2482_pm_ops,
},
.probe = ds2482_probe,
.remove = ds2482_remove,
@@ -119,6 +128,7 @@ struct ds2482_w1_chan {
struct ds2482_data {
struct i2c_client *client;
struct mutex access_lock;
+ int slpz_gpio;
/* 1-wire interface(s) */
int w1_count; /* 1 or 8 */
@@ -444,11 +454,31 @@ static u8 ds2482_w1_set_pullup(void *data, int delay)
return retval;
}
+static int ds2482_suspend(struct device *dev)
+{
+ struct i2c_client *client = to_i2c_client(dev);
+ struct ds2482_data *data = i2c_get_clientdata(client);
+
+ if (data->slpz_gpio >= 0)
+ gpio_set_value(data->slpz_gpio, 0);
+ return 0;
+}
+
+static int ds2482_resume(struct device *dev)
+{
+ struct i2c_client *client = to_i2c_client(dev);
+ struct ds2482_data *data = i2c_get_clientdata(client);
+
+ if (data->slpz_gpio >= 0)
+ gpio_set_value(data->slpz_gpio, 1);
+ return 0;
+}
static int ds2482_probe(struct i2c_client *client,
const struct i2c_device_id *id)
{
struct ds2482_data *data;
+ struct ds2482_platform_data *pdata;
int err = -ENODEV;
int temp1;
int idx;
@@ -515,6 +545,16 @@ static int ds2482_probe(struct i2c_client *client,
}
}
+ pdata = client->dev.platform_data;
+ data->slpz_gpio = pdata ? pdata->slpz_gpio : -1;
+
+ if (data->slpz_gpio >= 0) {
+ err = gpio_request_one(data->slpz_gpio, GPIOF_OUT_INIT_HIGH,
+ "ds2482.slpz");
+ if (err < 0)
+ goto exit_w1_remove;
+ }
+
return 0;
exit_w1_remove:
@@ -539,6 +579,11 @@ static int ds2482_remove(struct i2c_client *client)
w1_remove_master_device(&data->w1_ch[idx].w1_bm);
}
+ if (data->slpz_gpio >= 0) {
+ gpio_set_value(data->slpz_gpio, 0);
+ gpio_free(data->slpz_gpio);
+ }
+
/* Free the memory */
kfree(data);
return 0;
diff --git a/drivers/xen/efi.c b/drivers/xen/efi.c
index 1f850c97482f..f745db270171 100644
--- a/drivers/xen/efi.c
+++ b/drivers/xen/efi.c
@@ -294,6 +294,7 @@ static const struct efi efi_xen __initconst = {
.acpi = EFI_INVALID_TABLE_ADDR,
.acpi20 = EFI_INVALID_TABLE_ADDR,
.smbios = EFI_INVALID_TABLE_ADDR,
+ .smbios3 = EFI_INVALID_TABLE_ADDR,
.sal_systab = EFI_INVALID_TABLE_ADDR,
.boot_info = EFI_INVALID_TABLE_ADDR,
.hcdp = EFI_INVALID_TABLE_ADDR,
diff --git a/fs/Kconfig b/fs/Kconfig
index 664991afe0c0..874e1c4c5906 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -61,6 +61,8 @@ config FILE_LOCKING
for filesystems like NFS and for the flock() system
call. Disabling this option saves about 11k.
+source "fs/crypto/Kconfig"
+
source "fs/notify/Kconfig"
source "fs/quota/Kconfig"
@@ -186,6 +188,7 @@ if MISC_FILESYSTEMS
source "fs/adfs/Kconfig"
source "fs/affs/Kconfig"
source "fs/ecryptfs/Kconfig"
+source "fs/sdcardfs/Kconfig"
source "fs/hfs/Kconfig"
source "fs/hfsplus/Kconfig"
source "fs/befs/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index da0bbb456d3f..f67fb07a0b68 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -3,7 +3,7 @@
#
# 14 Sep 2000, Christoph Hellwig <hch@infradead.org>
# Rewritten to use lists instead of if-statements.
-#
+#
obj-y := open.o read_write.o file_table.o super.o \
char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
@@ -28,6 +28,7 @@ obj-$(CONFIG_SIGNALFD) += signalfd.o
obj-$(CONFIG_TIMERFD) += timerfd.o
obj-$(CONFIG_EVENTFD) += eventfd.o
obj-$(CONFIG_AIO) += aio.o
+obj-$(CONFIG_FS_ENCRYPTION) += crypto/
obj-$(CONFIG_FILE_LOCKING) += locks.o
obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o
obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o
@@ -58,7 +59,7 @@ obj-y += devpts/
obj-$(CONFIG_PROFILING) += dcookies.o
obj-$(CONFIG_DLM) += dlm/
-
+
# Do not add any filesystems before this line
obj-$(CONFIG_FSCACHE) += fscache/
obj-$(CONFIG_REISERFS_FS) += reiserfs/
@@ -81,6 +82,7 @@ obj-$(CONFIG_ISO9660_FS) += isofs/
obj-$(CONFIG_HFSPLUS_FS) += hfsplus/ # Before hfs to find wrapped HFS+
obj-$(CONFIG_HFS_FS) += hfs/
obj-$(CONFIG_ECRYPT_FS) += ecryptfs/
+obj-$(CONFIG_SDCARD_FS) += sdcardfs/
obj-$(CONFIG_VXFS_FS) += freevxfs/
obj-$(CONFIG_NFS_FS) += nfs/
obj-$(CONFIG_EXPORTFS) += exportfs/
@@ -118,6 +120,7 @@ obj-$(CONFIG_HOSTFS) += hostfs/
obj-$(CONFIG_HPPFS) += hppfs/
obj-$(CONFIG_CACHEFILES) += cachefiles/
obj-$(CONFIG_DEBUG_FS) += debugfs/
+obj-$(CONFIG_TRACING) += tracefs/
obj-$(CONFIG_OCFS2_FS) += ocfs2/
obj-$(CONFIG_BTRFS_FS) += btrfs/
obj-$(CONFIG_GFS2_FS) += gfs2/
diff --git a/fs/attr.c b/fs/attr.c
index 6530ced19697..34926834cd49 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -187,7 +187,7 @@ EXPORT_SYMBOL(setattr_copy);
* the file open for write, as there can be no conflicting delegation in
* that case.
*/
-int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **delegated_inode)
+int notify_change2(struct vfsmount *mnt, struct dentry * dentry, struct iattr * attr, struct inode **delegated_inode)
{
struct inode *inode = dentry->d_inode;
umode_t mode = inode->i_mode;
@@ -262,7 +262,9 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
if (error)
return error;
- if (inode->i_op->setattr)
+ if (mnt && inode->i_op->setattr2)
+ error = inode->i_op->setattr2(mnt, dentry, attr);
+ else if (inode->i_op->setattr)
error = inode->i_op->setattr(dentry, attr);
else
error = simple_setattr(dentry, attr);
@@ -275,4 +277,10 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
return error;
}
+EXPORT_SYMBOL(notify_change2);
+
+int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **delegated_inode)
+{
+ return notify_change2(NULL, dentry, attr, delegated_inode);
+}
EXPORT_SYMBOL(notify_change);
diff --git a/fs/coredump.c b/fs/coredump.c
index 0ad7a30e3107..bb4a7cca8802 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -710,7 +710,7 @@ void do_coredump(const siginfo_t *siginfo)
goto close_fail;
if (!(cprm.file->f_mode & FMODE_CAN_WRITE))
goto close_fail;
- if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
+ if (do_truncate2(cprm.file->f_path.mnt, cprm.file->f_path.dentry, 0, 0, cprm.file))
goto close_fail;
}
diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig
new file mode 100644
index 000000000000..92348faf9865
--- /dev/null
+++ b/fs/crypto/Kconfig
@@ -0,0 +1,18 @@
+config FS_ENCRYPTION
+ tristate "FS Encryption (Per-file encryption)"
+ depends on BLOCK
+ select CRYPTO
+ select CRYPTO_AES
+ select CRYPTO_CBC
+ select CRYPTO_ECB
+ select CRYPTO_XTS
+ select CRYPTO_CTS
+ select CRYPTO_CTR
+ select CRYPTO_SHA256
+ select KEYS
+ select ENCRYPTED_KEYS
+ help
+ Enable encryption of files and directories. This
+ feature is similar to ecryptfs, but it is more memory
+ efficient since it avoids caching the encrypted and
+ decrypted pages in the page cache.
diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile
new file mode 100644
index 000000000000..9f6607f17b53
--- /dev/null
+++ b/fs/crypto/Makefile
@@ -0,0 +1,4 @@
+obj-$(CONFIG_FS_ENCRYPTION) += fscrypto.o
+
+fscrypto-y := crypto.o fname.o policy.o keyinfo.o
+fscrypto-$(CONFIG_BLOCK) += bio.o
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
new file mode 100644
index 000000000000..a91ed46fe503
--- /dev/null
+++ b/fs/crypto/bio.c
@@ -0,0 +1,143 @@
+/*
+ * This contains encryption functions for per-file encryption.
+ *
+ * Copyright (C) 2015, Google, Inc.
+ * Copyright (C) 2015, Motorola Mobility
+ *
+ * Written by Michael Halcrow, 2014.
+ *
+ * Filename encryption additions
+ * Uday Savagaonkar, 2014
+ * Encryption policy handling additions
+ * Ildar Muslukhov, 2014
+ * Add fscrypt_pullback_bio_page()
+ * Jaegeuk Kim, 2015.
+ *
+ * This has not yet undergone a rigorous security audit.
+ *
+ * The usage of AES-XTS should conform to recommendations in NIST
+ * Special Publication 800-38E and IEEE P1619/D16.
+ */
+
+#include <linux/pagemap.h>
+#include <linux/module.h>
+#include <linux/bio.h>
+#include <linux/namei.h>
+#include "fscrypt_private.h"
+
+/*
+ * Call fscrypt_decrypt_page on every single page, reusing the encryption
+ * context.
+ */
+static void completion_pages(struct work_struct *work)
+{
+ struct fscrypt_ctx *ctx =
+ container_of(work, struct fscrypt_ctx, r.work);
+ struct bio *bio = ctx->r.bio;
+ struct bio_vec *bv;
+ int i;
+
+ bio_for_each_segment_all(bv, bio, i) {
+ struct page *page = bv->bv_page;
+ int ret = fscrypt_decrypt_page(page->mapping->host, page,
+ PAGE_SIZE, 0, page->index);
+
+ if (ret) {
+ WARN_ON_ONCE(1);
+ SetPageError(page);
+ } else {
+ SetPageUptodate(page);
+ }
+ unlock_page(page);
+ }
+ fscrypt_release_ctx(ctx);
+ bio_put(bio);
+}
+
+void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio)
+{
+ INIT_WORK(&ctx->r.work, completion_pages);
+ ctx->r.bio = bio;
+ queue_work(fscrypt_read_workqueue, &ctx->r.work);
+}
+EXPORT_SYMBOL(fscrypt_decrypt_bio_pages);
+
+void fscrypt_pullback_bio_page(struct page **page, bool restore)
+{
+ struct fscrypt_ctx *ctx;
+ struct page *bounce_page;
+
+ /* The bounce data pages are unmapped. */
+ if ((*page)->mapping)
+ return;
+
+ /* The bounce data page is unmapped. */
+ bounce_page = *page;
+ ctx = (struct fscrypt_ctx *)page_private(bounce_page);
+
+ /* restore control page */
+ *page = ctx->w.control_page;
+
+ if (restore)
+ fscrypt_restore_control_page(bounce_page);
+}
+EXPORT_SYMBOL(fscrypt_pullback_bio_page);
+
+int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
+ sector_t pblk, unsigned int len)
+{
+ struct fscrypt_ctx *ctx;
+ struct page *ciphertext_page = NULL;
+ struct bio *bio;
+ int ret, err = 0;
+
+ BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE);
+
+ ctx = fscrypt_get_ctx(inode, GFP_NOFS);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ciphertext_page = fscrypt_alloc_bounce_page(ctx, GFP_NOWAIT);
+ if (IS_ERR(ciphertext_page)) {
+ err = PTR_ERR(ciphertext_page);
+ goto errout;
+ }
+
+ while (len--) {
+ err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk,
+ ZERO_PAGE(0), ciphertext_page,
+ PAGE_SIZE, 0, GFP_NOFS);
+ if (err)
+ goto errout;
+
+ bio = bio_alloc(GFP_NOWAIT, 1);
+ if (!bio) {
+ err = -ENOMEM;
+ goto errout;
+ }
+ bio->bi_bdev = inode->i_sb->s_bdev;
+ bio->bi_iter.bi_sector =
+ pblk << (inode->i_sb->s_blocksize_bits - 9);
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+ ret = bio_add_page(bio, ciphertext_page,
+ inode->i_sb->s_blocksize, 0);
+ if (ret != inode->i_sb->s_blocksize) {
+ /* should never happen! */
+ WARN_ON(1);
+ bio_put(bio);
+ err = -EIO;
+ goto errout;
+ }
+ err = submit_bio_wait(0, bio);
+ bio_put(bio);
+ if (err)
+ goto errout;
+ lblk++;
+ pblk++;
+ }
+ err = 0;
+errout:
+ fscrypt_release_ctx(ctx);
+ return err;
+}
+EXPORT_SYMBOL(fscrypt_zeroout_range);
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
new file mode 100644
index 000000000000..2100bc8ac7b1
--- /dev/null
+++ b/fs/crypto/crypto.c
@@ -0,0 +1,483 @@
+/*
+ * This contains encryption functions for per-file encryption.
+ *
+ * Copyright (C) 2015, Google, Inc.
+ * Copyright (C) 2015, Motorola Mobility
+ *
+ * Written by Michael Halcrow, 2014.
+ *
+ * Filename encryption additions
+ * Uday Savagaonkar, 2014
+ * Encryption policy handling additions
+ * Ildar Muslukhov, 2014
+ * Add fscrypt_pullback_bio_page()
+ * Jaegeuk Kim, 2015.
+ *
+ * This has not yet undergone a rigorous security audit.
+ *
+ * The usage of AES-XTS should conform to recommendations in NIST
+ * Special Publication 800-38E and IEEE P1619/D16.
+ */
+
+#include <linux/pagemap.h>
+#include <linux/mempool.h>
+#include <linux/module.h>
+#include <linux/scatterlist.h>
+#include <linux/ratelimit.h>
+#include <linux/dcache.h>
+#include <linux/namei.h>
+#include "fscrypt_private.h"
+
+static unsigned int num_prealloc_crypto_pages = 32;
+static unsigned int num_prealloc_crypto_ctxs = 128;
+
+module_param(num_prealloc_crypto_pages, uint, 0444);
+MODULE_PARM_DESC(num_prealloc_crypto_pages,
+ "Number of crypto pages to preallocate");
+module_param(num_prealloc_crypto_ctxs, uint, 0444);
+MODULE_PARM_DESC(num_prealloc_crypto_ctxs,
+ "Number of crypto contexts to preallocate");
+
+static mempool_t *fscrypt_bounce_page_pool = NULL;
+
+static LIST_HEAD(fscrypt_free_ctxs);
+static DEFINE_SPINLOCK(fscrypt_ctx_lock);
+
+struct workqueue_struct *fscrypt_read_workqueue;
+static DEFINE_MUTEX(fscrypt_init_mutex);
+
+static struct kmem_cache *fscrypt_ctx_cachep;
+struct kmem_cache *fscrypt_info_cachep;
+
+/**
+ * fscrypt_release_ctx() - Releases an encryption context
+ * @ctx: The encryption context to release.
+ *
+ * If the encryption context was allocated from the pre-allocated pool, returns
+ * it to that pool. Else, frees it.
+ *
+ * If there's a bounce page in the context, this frees that.
+ */
+void fscrypt_release_ctx(struct fscrypt_ctx *ctx)
+{
+ unsigned long flags;
+
+ if (ctx->flags & FS_CTX_HAS_BOUNCE_BUFFER_FL && ctx->w.bounce_page) {
+ mempool_free(ctx->w.bounce_page, fscrypt_bounce_page_pool);
+ ctx->w.bounce_page = NULL;
+ }
+ ctx->w.control_page = NULL;
+ if (ctx->flags & FS_CTX_REQUIRES_FREE_ENCRYPT_FL) {
+ kmem_cache_free(fscrypt_ctx_cachep, ctx);
+ } else {
+ spin_lock_irqsave(&fscrypt_ctx_lock, flags);
+ list_add(&ctx->free_list, &fscrypt_free_ctxs);
+ spin_unlock_irqrestore(&fscrypt_ctx_lock, flags);
+ }
+}
+EXPORT_SYMBOL(fscrypt_release_ctx);
+
+/**
+ * fscrypt_get_ctx() - Gets an encryption context
+ * @inode: The inode for which we are doing the crypto
+ * @gfp_flags: The gfp flag for memory allocation
+ *
+ * Allocates and initializes an encryption context.
+ *
+ * Return: An allocated and initialized encryption context on success; error
+ * value or NULL otherwise.
+ */
+struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags)
+{
+ struct fscrypt_ctx *ctx = NULL;
+ struct fscrypt_info *ci = inode->i_crypt_info;
+ unsigned long flags;
+
+ if (ci == NULL)
+ return ERR_PTR(-ENOKEY);
+
+ /*
+ * We first try getting the ctx from a free list because in
+ * the common case the ctx will have an allocated and
+ * initialized crypto tfm, so it's probably a worthwhile
+ * optimization. For the bounce page, we first try getting it
+ * from the kernel allocator because that's just about as fast
+ * as getting it from a list and because a cache of free pages
+ * should generally be a "last resort" option for a filesystem
+ * to be able to do its job.
+ */
+ spin_lock_irqsave(&fscrypt_ctx_lock, flags);
+ ctx = list_first_entry_or_null(&fscrypt_free_ctxs,
+ struct fscrypt_ctx, free_list);
+ if (ctx)
+ list_del(&ctx->free_list);
+ spin_unlock_irqrestore(&fscrypt_ctx_lock, flags);
+ if (!ctx) {
+ ctx = kmem_cache_zalloc(fscrypt_ctx_cachep, gfp_flags);
+ if (!ctx)
+ return ERR_PTR(-ENOMEM);
+ ctx->flags |= FS_CTX_REQUIRES_FREE_ENCRYPT_FL;
+ } else {
+ ctx->flags &= ~FS_CTX_REQUIRES_FREE_ENCRYPT_FL;
+ }
+ ctx->flags &= ~FS_CTX_HAS_BOUNCE_BUFFER_FL;
+ return ctx;
+}
+EXPORT_SYMBOL(fscrypt_get_ctx);
+
+/**
+ * page_crypt_complete() - completion callback for page crypto
+ * @req: The asynchronous cipher request context
+ * @res: The result of the cipher operation
+ */
+static void page_crypt_complete(struct crypto_async_request *req, int res)
+{
+ struct fscrypt_completion_result *ecr = req->data;
+
+ if (res == -EINPROGRESS)
+ return;
+ ecr->res = res;
+ complete(&ecr->completion);
+}
+
+int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw,
+ u64 lblk_num, struct page *src_page,
+ struct page *dest_page, unsigned int len,
+ unsigned int offs, gfp_t gfp_flags)
+{
+ struct {
+ __le64 index;
+ u8 padding[FS_XTS_TWEAK_SIZE - sizeof(__le64)];
+ } xts_tweak;
+ struct ablkcipher_request *req = NULL;
+ DECLARE_FS_COMPLETION_RESULT(ecr);
+ struct scatterlist dst, src;
+ struct fscrypt_info *ci = inode->i_crypt_info;
+ struct crypto_ablkcipher *tfm = ci->ci_ctfm;
+ int res = 0;
+
+ BUG_ON(len == 0);
+
+ req = ablkcipher_request_alloc(tfm, gfp_flags);
+ if (!req) {
+ printk_ratelimited(KERN_ERR
+ "%s: crypto_request_alloc() failed\n",
+ __func__);
+ return -ENOMEM;
+ }
+
+ ablkcipher_request_set_callback(
+ req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ page_crypt_complete, &ecr);
+
+ BUILD_BUG_ON(sizeof(xts_tweak) != FS_XTS_TWEAK_SIZE);
+ xts_tweak.index = cpu_to_le64(lblk_num);
+ memset(xts_tweak.padding, 0, sizeof(xts_tweak.padding));
+
+ sg_init_table(&dst, 1);
+ sg_set_page(&dst, dest_page, len, offs);
+ sg_init_table(&src, 1);
+ sg_set_page(&src, src_page, len, offs);
+ ablkcipher_request_set_crypt(req, &src, &dst, len, &xts_tweak);
+ if (rw == FS_DECRYPT)
+ res = crypto_ablkcipher_decrypt(req);
+ else
+ res = crypto_ablkcipher_encrypt(req);
+ if (res == -EINPROGRESS || res == -EBUSY) {
+ BUG_ON(req->base.data != &ecr);
+ wait_for_completion(&ecr.completion);
+ res = ecr.res;
+ }
+ ablkcipher_request_free(req);
+ if (res) {
+ printk_ratelimited(KERN_ERR
+ "%s: crypto_ablkcipher_encrypt() returned %d\n",
+ __func__, res);
+ return res;
+ }
+ return 0;
+}
+
+struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx,
+ gfp_t gfp_flags)
+{
+ ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool, gfp_flags);
+ if (ctx->w.bounce_page == NULL)
+ return ERR_PTR(-ENOMEM);
+ ctx->flags |= FS_CTX_HAS_BOUNCE_BUFFER_FL;
+ return ctx->w.bounce_page;
+}
+
+/**
+ * fscypt_encrypt_page() - Encrypts a page
+ * @inode: The inode for which the encryption should take place
+ * @page: The page to encrypt. Must be locked for bounce-page
+ * encryption.
+ * @len: Length of data to encrypt in @page and encrypted
+ * data in returned page.
+ * @offs: Offset of data within @page and returned
+ * page holding encrypted data.
+ * @lblk_num: Logical block number. This must be unique for multiple
+ * calls with same inode, except when overwriting
+ * previously written data.
+ * @gfp_flags: The gfp flag for memory allocation
+ *
+ * Encrypts @page using the ctx encryption context. Performs encryption
+ * either in-place or into a newly allocated bounce page.
+ * Called on the page write path.
+ *
+ * Bounce page allocation is the default.
+ * In this case, the contents of @page are encrypted and stored in an
+ * allocated bounce page. @page has to be locked and the caller must call
+ * fscrypt_restore_control_page() on the returned ciphertext page to
+ * release the bounce buffer and the encryption context.
+ *
+ * In-place encryption is used by setting the FS_CFLG_OWN_PAGES flag in
+ * fscrypt_operations. Here, the input-page is returned with its content
+ * encrypted.
+ *
+ * Return: A page with the encrypted content on success. Else, an
+ * error value or NULL.
+ */
+struct page *fscrypt_encrypt_page(const struct inode *inode,
+ struct page *page,
+ unsigned int len,
+ unsigned int offs,
+ u64 lblk_num, gfp_t gfp_flags)
+
+{
+ struct fscrypt_ctx *ctx;
+ struct page *ciphertext_page = page;
+ int err;
+
+ BUG_ON(len % FS_CRYPTO_BLOCK_SIZE != 0);
+
+ if (inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES) {
+ /* with inplace-encryption we just encrypt the page */
+ err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk_num, page,
+ ciphertext_page, len, offs,
+ gfp_flags);
+ if (err)
+ return ERR_PTR(err);
+
+ return ciphertext_page;
+ }
+
+ BUG_ON(!PageLocked(page));
+
+ ctx = fscrypt_get_ctx(inode, gfp_flags);
+ if (IS_ERR(ctx))
+ return (struct page *)ctx;
+
+ /* The encryption operation will require a bounce page. */
+ ciphertext_page = fscrypt_alloc_bounce_page(ctx, gfp_flags);
+ if (IS_ERR(ciphertext_page))
+ goto errout;
+
+ ctx->w.control_page = page;
+ err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk_num,
+ page, ciphertext_page, len, offs,
+ gfp_flags);
+ if (err) {
+ ciphertext_page = ERR_PTR(err);
+ goto errout;
+ }
+ SetPagePrivate(ciphertext_page);
+ set_page_private(ciphertext_page, (unsigned long)ctx);
+ lock_page(ciphertext_page);
+ return ciphertext_page;
+
+errout:
+ fscrypt_release_ctx(ctx);
+ return ciphertext_page;
+}
+EXPORT_SYMBOL(fscrypt_encrypt_page);
+
+/**
+ * fscrypt_decrypt_page() - Decrypts a page in-place
+ * @inode: The corresponding inode for the page to decrypt.
+ * @page: The page to decrypt. Must be locked in case
+ * it is a writeback page (FS_CFLG_OWN_PAGES unset).
+ * @len: Number of bytes in @page to be decrypted.
+ * @offs: Start of data in @page.
+ * @lblk_num: Logical block number.
+ *
+ * Decrypts page in-place using the ctx encryption context.
+ *
+ * Called from the read completion callback.
+ *
+ * Return: Zero on success, non-zero otherwise.
+ */
+int fscrypt_decrypt_page(const struct inode *inode, struct page *page,
+ unsigned int len, unsigned int offs, u64 lblk_num)
+{
+ if (!(inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES))
+ BUG_ON(!PageLocked(page));
+
+ return fscrypt_do_page_crypto(inode, FS_DECRYPT, lblk_num, page, page,
+ len, offs, GFP_NOFS);
+}
+EXPORT_SYMBOL(fscrypt_decrypt_page);
+
+/*
+ * Validate dentries for encrypted directories to make sure we aren't
+ * potentially caching stale data after a key has been added or
+ * removed.
+ */
+static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ struct dentry *dir;
+ int dir_has_key, cached_with_key;
+
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
+ dir = dget_parent(dentry);
+ if (!d_inode(dir)->i_sb->s_cop->is_encrypted(d_inode(dir))) {
+ dput(dir);
+ return 0;
+ }
+
+ /* this should eventually be an flag in d_flags */
+ spin_lock(&dentry->d_lock);
+ cached_with_key = dentry->d_flags & DCACHE_ENCRYPTED_WITH_KEY;
+ spin_unlock(&dentry->d_lock);
+ dir_has_key = (d_inode(dir)->i_crypt_info != NULL);
+ dput(dir);
+
+ /*
+ * If the dentry was cached without the key, and it is a
+ * negative dentry, it might be a valid name. We can't check
+ * if the key has since been made available due to locking
+ * reasons, so we fail the validation so ext4_lookup() can do
+ * this check.
+ *
+ * We also fail the validation if the dentry was created with
+ * the key present, but we no longer have the key, or vice versa.
+ */
+ if ((!cached_with_key && d_is_negative(dentry)) ||
+ (!cached_with_key && dir_has_key) ||
+ (cached_with_key && !dir_has_key))
+ return 0;
+ return 1;
+}
+
+const struct dentry_operations fscrypt_d_ops = {
+ .d_revalidate = fscrypt_d_revalidate,
+};
+EXPORT_SYMBOL(fscrypt_d_ops);
+
+void fscrypt_restore_control_page(struct page *page)
+{
+ struct fscrypt_ctx *ctx;
+
+ ctx = (struct fscrypt_ctx *)page_private(page);
+ set_page_private(page, (unsigned long)NULL);
+ ClearPagePrivate(page);
+ unlock_page(page);
+ fscrypt_release_ctx(ctx);
+}
+EXPORT_SYMBOL(fscrypt_restore_control_page);
+
+static void fscrypt_destroy(void)
+{
+ struct fscrypt_ctx *pos, *n;
+
+ list_for_each_entry_safe(pos, n, &fscrypt_free_ctxs, free_list)
+ kmem_cache_free(fscrypt_ctx_cachep, pos);
+ INIT_LIST_HEAD(&fscrypt_free_ctxs);
+ mempool_destroy(fscrypt_bounce_page_pool);
+ fscrypt_bounce_page_pool = NULL;
+}
+
+/**
+ * fscrypt_initialize() - allocate major buffers for fs encryption.
+ * @cop_flags: fscrypt operations flags
+ *
+ * We only call this when we start accessing encrypted files, since it
+ * results in memory getting allocated that wouldn't otherwise be used.
+ *
+ * Return: Zero on success, non-zero otherwise.
+ */
+int fscrypt_initialize(unsigned int cop_flags)
+{
+ int i, res = -ENOMEM;
+
+ /*
+ * No need to allocate a bounce page pool if there already is one or
+ * this FS won't use it.
+ */
+ if (cop_flags & FS_CFLG_OWN_PAGES || fscrypt_bounce_page_pool)
+ return 0;
+
+ mutex_lock(&fscrypt_init_mutex);
+ if (fscrypt_bounce_page_pool)
+ goto already_initialized;
+
+ for (i = 0; i < num_prealloc_crypto_ctxs; i++) {
+ struct fscrypt_ctx *ctx;
+
+ ctx = kmem_cache_zalloc(fscrypt_ctx_cachep, GFP_NOFS);
+ if (!ctx)
+ goto fail;
+ list_add(&ctx->free_list, &fscrypt_free_ctxs);
+ }
+
+ fscrypt_bounce_page_pool =
+ mempool_create_page_pool(num_prealloc_crypto_pages, 0);
+ if (!fscrypt_bounce_page_pool)
+ goto fail;
+
+already_initialized:
+ mutex_unlock(&fscrypt_init_mutex);
+ return 0;
+fail:
+ fscrypt_destroy();
+ mutex_unlock(&fscrypt_init_mutex);
+ return res;
+}
+
+/**
+ * fscrypt_init() - Set up for fs encryption.
+ */
+static int __init fscrypt_init(void)
+{
+ fscrypt_read_workqueue = alloc_workqueue("fscrypt_read_queue",
+ WQ_HIGHPRI, 0);
+ if (!fscrypt_read_workqueue)
+ goto fail;
+
+ fscrypt_ctx_cachep = KMEM_CACHE(fscrypt_ctx, SLAB_RECLAIM_ACCOUNT);
+ if (!fscrypt_ctx_cachep)
+ goto fail_free_queue;
+
+ fscrypt_info_cachep = KMEM_CACHE(fscrypt_info, SLAB_RECLAIM_ACCOUNT);
+ if (!fscrypt_info_cachep)
+ goto fail_free_ctx;
+
+ return 0;
+
+fail_free_ctx:
+ kmem_cache_destroy(fscrypt_ctx_cachep);
+fail_free_queue:
+ destroy_workqueue(fscrypt_read_workqueue);
+fail:
+ return -ENOMEM;
+}
+module_init(fscrypt_init)
+
+/**
+ * fscrypt_exit() - Shutdown the fs encryption system
+ */
+static void __exit fscrypt_exit(void)
+{
+ fscrypt_destroy();
+
+ if (fscrypt_read_workqueue)
+ destroy_workqueue(fscrypt_read_workqueue);
+ kmem_cache_destroy(fscrypt_ctx_cachep);
+ kmem_cache_destroy(fscrypt_info_cachep);
+}
+module_exit(fscrypt_exit);
+
+MODULE_LICENSE("GPL");
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
new file mode 100644
index 000000000000..17905ce0b92c
--- /dev/null
+++ b/fs/crypto/fname.c
@@ -0,0 +1,464 @@
+/*
+ * This contains functions for filename crypto management
+ *
+ * Copyright (C) 2015, Google, Inc.
+ * Copyright (C) 2015, Motorola Mobility
+ *
+ * Written by Uday Savagaonkar, 2014.
+ * Modified by Jaegeuk Kim, 2015.
+ *
+ * This has not yet undergone a rigorous security audit.
+ */
+
+#include <linux/scatterlist.h>
+#include <linux/ratelimit.h>
+#include "fscrypt_private.h"
+
+/**
+ * fname_crypt_complete() - completion callback for filename crypto
+ * @req: The asynchronous cipher request context
+ * @res: The result of the cipher operation
+ */
+static void fname_crypt_complete(struct crypto_async_request *req, int res)
+{
+ struct fscrypt_completion_result *ecr = req->data;
+
+ if (res == -EINPROGRESS)
+ return;
+ ecr->res = res;
+ complete(&ecr->completion);
+}
+
+/**
+ * fname_encrypt() - encrypt a filename
+ *
+ * The caller must have allocated sufficient memory for the @oname string.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+static int fname_encrypt(struct inode *inode,
+ const struct qstr *iname, struct fscrypt_str *oname)
+{
+ struct ablkcipher_request *req = NULL;
+ DECLARE_FS_COMPLETION_RESULT(ecr);
+ struct fscrypt_info *ci = inode->i_crypt_info;
+ struct crypto_ablkcipher *tfm = ci->ci_ctfm;
+ int res = 0;
+ char iv[FS_CRYPTO_BLOCK_SIZE];
+ struct scatterlist sg;
+ int padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK);
+ unsigned int lim;
+ unsigned int cryptlen;
+
+ lim = inode->i_sb->s_cop->max_namelen(inode);
+ if (iname->len <= 0 || iname->len > lim)
+ return -EIO;
+
+ /*
+ * Copy the filename to the output buffer for encrypting in-place and
+ * pad it with the needed number of NUL bytes.
+ */
+ cryptlen = max_t(unsigned int, iname->len, FS_CRYPTO_BLOCK_SIZE);
+ cryptlen = round_up(cryptlen, padding);
+ cryptlen = min(cryptlen, lim);
+ memcpy(oname->name, iname->name, iname->len);
+ memset(oname->name + iname->len, 0, cryptlen - iname->len);
+
+ /* Initialize the IV */
+ memset(iv, 0, FS_CRYPTO_BLOCK_SIZE);
+
+ /* Set up the encryption request */
+ req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+ if (!req) {
+ printk_ratelimited(KERN_ERR
+ "%s: ablkcipher_request_alloc() failed\n", __func__);
+ return -ENOMEM;
+ }
+ ablkcipher_request_set_callback(req,
+ CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ fname_crypt_complete, &ecr);
+ sg_init_one(&sg, oname->name, cryptlen);
+ ablkcipher_request_set_crypt(req, &sg, &sg, cryptlen, iv);
+
+ /* Do the encryption */
+ res = crypto_ablkcipher_encrypt(req);
+ if (res == -EINPROGRESS || res == -EBUSY) {
+ /* Request is being completed asynchronously; wait for it */
+ wait_for_completion(&ecr.completion);
+ res = ecr.res;
+ }
+ ablkcipher_request_free(req);
+ if (res < 0) {
+ printk_ratelimited(KERN_ERR
+ "%s: Error (error code %d)\n", __func__, res);
+ return res;
+ }
+
+ oname->len = cryptlen;
+ return 0;
+}
+
+/**
+ * fname_decrypt() - decrypt a filename
+ *
+ * The caller must have allocated sufficient memory for the @oname string.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+static int fname_decrypt(struct inode *inode,
+ const struct fscrypt_str *iname,
+ struct fscrypt_str *oname)
+{
+ struct ablkcipher_request *req = NULL;
+ DECLARE_FS_COMPLETION_RESULT(ecr);
+ struct scatterlist src_sg, dst_sg;
+ struct fscrypt_info *ci = inode->i_crypt_info;
+ struct crypto_ablkcipher *tfm = ci->ci_ctfm;
+ int res = 0;
+ char iv[FS_CRYPTO_BLOCK_SIZE];
+ unsigned lim;
+
+ lim = inode->i_sb->s_cop->max_namelen(inode);
+ if (iname->len <= 0 || iname->len > lim)
+ return -EIO;
+
+ /* Allocate request */
+ req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+ if (!req) {
+ printk_ratelimited(KERN_ERR
+ "%s: crypto_request_alloc() failed\n", __func__);
+ return -ENOMEM;
+ }
+ ablkcipher_request_set_callback(req,
+ CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ fname_crypt_complete, &ecr);
+
+ /* Initialize IV */
+ memset(iv, 0, FS_CRYPTO_BLOCK_SIZE);
+
+ /* Create decryption request */
+ sg_init_one(&src_sg, iname->name, iname->len);
+ sg_init_one(&dst_sg, oname->name, oname->len);
+ ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv);
+ res = crypto_ablkcipher_decrypt(req);
+ if (res == -EINPROGRESS || res == -EBUSY) {
+ wait_for_completion(&ecr.completion);
+ res = ecr.res;
+ }
+ ablkcipher_request_free(req);
+ if (res < 0) {
+ printk_ratelimited(KERN_ERR
+ "%s: Error (error code %d)\n", __func__, res);
+ return res;
+ }
+
+ oname->len = strnlen(oname->name, iname->len);
+ return 0;
+}
+
+static const char *lookup_table =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
+
+#define BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3)
+
+/**
+ * digest_encode() -
+ *
+ * Encodes the input digest using characters from the set [a-zA-Z0-9_+].
+ * The encoded string is roughly 4/3 times the size of the input string.
+ */
+static int digest_encode(const char *src, int len, char *dst)
+{
+ int i = 0, bits = 0, ac = 0;
+ char *cp = dst;
+
+ while (i < len) {
+ ac += (((unsigned char) src[i]) << bits);
+ bits += 8;
+ do {
+ *cp++ = lookup_table[ac & 0x3f];
+ ac >>= 6;
+ bits -= 6;
+ } while (bits >= 6);
+ i++;
+ }
+ if (bits)
+ *cp++ = lookup_table[ac & 0x3f];
+ return cp - dst;
+}
+
+static int digest_decode(const char *src, int len, char *dst)
+{
+ int i = 0, bits = 0, ac = 0;
+ const char *p;
+ char *cp = dst;
+
+ while (i < len) {
+ p = strchr(lookup_table, src[i]);
+ if (p == NULL || src[i] == 0)
+ return -2;
+ ac += (p - lookup_table) << bits;
+ bits += 6;
+ if (bits >= 8) {
+ *cp++ = ac & 0xff;
+ ac >>= 8;
+ bits -= 8;
+ }
+ i++;
+ }
+ if (ac)
+ return -1;
+ return cp - dst;
+}
+
+u32 fscrypt_fname_encrypted_size(const struct inode *inode, u32 ilen)
+{
+ int padding = 32;
+ struct fscrypt_info *ci = inode->i_crypt_info;
+
+ if (ci)
+ padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK);
+ ilen = max(ilen, (u32)FS_CRYPTO_BLOCK_SIZE);
+ return round_up(ilen, padding);
+}
+EXPORT_SYMBOL(fscrypt_fname_encrypted_size);
+
+/**
+ * fscrypt_fname_crypto_alloc_obuff() -
+ *
+ * Allocates an output buffer that is sufficient for the crypto operation
+ * specified by the context and the direction.
+ */
+int fscrypt_fname_alloc_buffer(const struct inode *inode,
+ u32 ilen, struct fscrypt_str *crypto_str)
+{
+ u32 olen = fscrypt_fname_encrypted_size(inode, ilen);
+ const u32 max_encoded_len =
+ max_t(u32, BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE),
+ 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name)));
+
+ crypto_str->len = olen;
+ olen = max(olen, max_encoded_len);
+
+ /*
+ * Allocated buffer can hold one more character to null-terminate the
+ * string
+ */
+ crypto_str->name = kmalloc(olen + 1, GFP_NOFS);
+ if (!(crypto_str->name))
+ return -ENOMEM;
+ return 0;
+}
+EXPORT_SYMBOL(fscrypt_fname_alloc_buffer);
+
+/**
+ * fscrypt_fname_crypto_free_buffer() -
+ *
+ * Frees the buffer allocated for crypto operation.
+ */
+void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str)
+{
+ if (!crypto_str)
+ return;
+ kfree(crypto_str->name);
+ crypto_str->name = NULL;
+}
+EXPORT_SYMBOL(fscrypt_fname_free_buffer);
+
+/**
+ * fscrypt_fname_disk_to_usr() - converts a filename from disk space to user
+ * space
+ *
+ * The caller must have allocated sufficient memory for the @oname string.
+ *
+ * If the key is available, we'll decrypt the disk name; otherwise, we'll encode
+ * it for presentation. Short names are directly base64-encoded, while long
+ * names are encoded in fscrypt_digested_name format.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int fscrypt_fname_disk_to_usr(struct inode *inode,
+ u32 hash, u32 minor_hash,
+ const struct fscrypt_str *iname,
+ struct fscrypt_str *oname)
+{
+ const struct qstr qname = FSTR_TO_QSTR(iname);
+ struct fscrypt_digested_name digested_name;
+
+ if (fscrypt_is_dot_dotdot(&qname)) {
+ oname->name[0] = '.';
+ oname->name[iname->len - 1] = '.';
+ oname->len = iname->len;
+ return 0;
+ }
+
+ if (iname->len < FS_CRYPTO_BLOCK_SIZE)
+ return -EUCLEAN;
+
+ if (inode->i_crypt_info)
+ return fname_decrypt(inode, iname, oname);
+
+ if (iname->len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE) {
+ oname->len = digest_encode(iname->name, iname->len,
+ oname->name);
+ return 0;
+ }
+ if (hash) {
+ digested_name.hash = hash;
+ digested_name.minor_hash = minor_hash;
+ } else {
+ digested_name.hash = 0;
+ digested_name.minor_hash = 0;
+ }
+ memcpy(digested_name.digest,
+ FSCRYPT_FNAME_DIGEST(iname->name, iname->len),
+ FSCRYPT_FNAME_DIGEST_SIZE);
+ oname->name[0] = '_';
+ oname->len = 1 + digest_encode((const char *)&digested_name,
+ sizeof(digested_name), oname->name + 1);
+ return 0;
+}
+EXPORT_SYMBOL(fscrypt_fname_disk_to_usr);
+
+/**
+ * fscrypt_fname_usr_to_disk() - converts a filename from user space to disk
+ * space
+ *
+ * The caller must have allocated sufficient memory for the @oname string.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int fscrypt_fname_usr_to_disk(struct inode *inode,
+ const struct qstr *iname,
+ struct fscrypt_str *oname)
+{
+ if (fscrypt_is_dot_dotdot(iname)) {
+ oname->name[0] = '.';
+ oname->name[iname->len - 1] = '.';
+ oname->len = iname->len;
+ return 0;
+ }
+ if (inode->i_crypt_info)
+ return fname_encrypt(inode, iname, oname);
+ /*
+ * Without a proper key, a user is not allowed to modify the filenames
+ * in a directory. Consequently, a user space name cannot be mapped to
+ * a disk-space name
+ */
+ return -ENOKEY;
+}
+EXPORT_SYMBOL(fscrypt_fname_usr_to_disk);
+
+/**
+ * fscrypt_setup_filename() - prepare to search a possibly encrypted directory
+ * @dir: the directory that will be searched
+ * @iname: the user-provided filename being searched for
+ * @lookup: 1 if we're allowed to proceed without the key because it's
+ * ->lookup() or we're finding the dir_entry for deletion; 0 if we cannot
+ * proceed without the key because we're going to create the dir_entry.
+ * @fname: the filename information to be filled in
+ *
+ * Given a user-provided filename @iname, this function sets @fname->disk_name
+ * to the name that would be stored in the on-disk directory entry, if possible.
+ * If the directory is unencrypted this is simply @iname. Else, if we have the
+ * directory's encryption key, then @iname is the plaintext, so we encrypt it to
+ * get the disk_name.
+ *
+ * Else, for keyless @lookup operations, @iname is the presented ciphertext, so
+ * we decode it to get either the ciphertext disk_name (for short names) or the
+ * fscrypt_digested_name (for long names). Non-@lookup operations will be
+ * impossible in this case, so we fail them with ENOKEY.
+ *
+ * If successful, fscrypt_free_filename() must be called later to clean up.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
+ int lookup, struct fscrypt_name *fname)
+{
+ int ret;
+ int digested;
+
+ memset(fname, 0, sizeof(struct fscrypt_name));
+ fname->usr_fname = iname;
+
+ if (!dir->i_sb->s_cop->is_encrypted(dir) ||
+ fscrypt_is_dot_dotdot(iname)) {
+ fname->disk_name.name = (unsigned char *)iname->name;
+ fname->disk_name.len = iname->len;
+ return 0;
+ }
+ ret = fscrypt_get_encryption_info(dir);
+ if (ret && ret != -EOPNOTSUPP)
+ return ret;
+
+ if (dir->i_crypt_info) {
+ ret = fscrypt_fname_alloc_buffer(dir, iname->len,
+ &fname->crypto_buf);
+ if (ret)
+ return ret;
+ ret = fname_encrypt(dir, iname, &fname->crypto_buf);
+ if (ret)
+ goto errout;
+ fname->disk_name.name = fname->crypto_buf.name;
+ fname->disk_name.len = fname->crypto_buf.len;
+ return 0;
+ }
+ if (!lookup)
+ return -ENOKEY;
+
+ /*
+ * We don't have the key and we are doing a lookup; decode the
+ * user-supplied name
+ */
+ if (iname->name[0] == '_') {
+ if (iname->len !=
+ 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name)))
+ return -ENOENT;
+ digested = 1;
+ } else {
+ if (iname->len >
+ BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE))
+ return -ENOENT;
+ digested = 0;
+ }
+
+ fname->crypto_buf.name =
+ kmalloc(max_t(size_t, FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE,
+ sizeof(struct fscrypt_digested_name)),
+ GFP_KERNEL);
+ if (fname->crypto_buf.name == NULL)
+ return -ENOMEM;
+
+ ret = digest_decode(iname->name + digested, iname->len - digested,
+ fname->crypto_buf.name);
+ if (ret < 0) {
+ ret = -ENOENT;
+ goto errout;
+ }
+ fname->crypto_buf.len = ret;
+ if (digested) {
+ const struct fscrypt_digested_name *n =
+ (const void *)fname->crypto_buf.name;
+ fname->hash = n->hash;
+ fname->minor_hash = n->minor_hash;
+ } else {
+ fname->disk_name.name = fname->crypto_buf.name;
+ fname->disk_name.len = fname->crypto_buf.len;
+ }
+ return 0;
+
+errout:
+ fscrypt_fname_free_buffer(&fname->crypto_buf);
+ return ret;
+}
+EXPORT_SYMBOL(fscrypt_setup_filename);
+
+void fscrypt_free_filename(struct fscrypt_name *fname)
+{
+ kfree(fname->crypto_buf.name);
+ fname->crypto_buf.name = NULL;
+ fname->usr_fname = NULL;
+ fname->disk_name.name = NULL;
+}
+EXPORT_SYMBOL(fscrypt_free_filename);
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
new file mode 100644
index 000000000000..1bc34d9814d9
--- /dev/null
+++ b/fs/crypto/fscrypt_private.h
@@ -0,0 +1,115 @@
+/*
+ * fscrypt_private.h
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * This contains encryption key functions.
+ *
+ * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
+ */
+
+#ifndef _FSCRYPT_PRIVATE_H
+#define _FSCRYPT_PRIVATE_H
+
+#include <linux/fscrypt_supp.h>
+
+/* Encryption parameters */
+#define FS_XTS_TWEAK_SIZE 16
+#define FS_AES_128_ECB_KEY_SIZE 16
+#define FS_AES_256_GCM_KEY_SIZE 32
+#define FS_AES_256_CBC_KEY_SIZE 32
+#define FS_AES_256_CTS_KEY_SIZE 32
+#define FS_AES_256_XTS_KEY_SIZE 64
+
+#define FS_KEY_DERIVATION_NONCE_SIZE 16
+
+/**
+ * Encryption context for inode
+ *
+ * Protector format:
+ * 1 byte: Protector format (1 = this version)
+ * 1 byte: File contents encryption mode
+ * 1 byte: File names encryption mode
+ * 1 byte: Flags
+ * 8 bytes: Master Key descriptor
+ * 16 bytes: Encryption Key derivation nonce
+ */
+struct fscrypt_context {
+ u8 format;
+ u8 contents_encryption_mode;
+ u8 filenames_encryption_mode;
+ u8 flags;
+ u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
+ u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE];
+} __packed;
+
+#define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1
+
+/*
+ * A pointer to this structure is stored in the file system's in-core
+ * representation of an inode.
+ */
+struct fscrypt_info {
+ u8 ci_data_mode;
+ u8 ci_filename_mode;
+ u8 ci_flags;
+ struct crypto_ablkcipher *ci_ctfm;
+ u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE];
+};
+
+typedef enum {
+ FS_DECRYPT = 0,
+ FS_ENCRYPT,
+} fscrypt_direction_t;
+
+#define FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001
+#define FS_CTX_HAS_BOUNCE_BUFFER_FL 0x00000002
+
+struct fscrypt_completion_result {
+ struct completion completion;
+ int res;
+};
+
+#define DECLARE_FS_COMPLETION_RESULT(ecr) \
+ struct fscrypt_completion_result ecr = { \
+ COMPLETION_INITIALIZER_ONSTACK((ecr).completion), 0 }
+
+static inline void inode_lock(struct inode *inode)
+{
+ mutex_lock(&inode->i_mutex);
+}
+
+static inline void inode_unlock(struct inode *inode)
+{
+ mutex_unlock(&inode->i_mutex);
+}
+
+static inline const struct user_key_payload *user_key_payload(const struct key *key)
+{
+ return (struct user_key_payload *)rcu_dereference_key(key);
+}
+
+/* bio stuffs */
+#define REQ_OP_READ READ
+#define REQ_OP_WRITE WRITE
+#define bio_op(bio) ((bio)->bi_rw & 1)
+
+static inline void bio_set_op_attrs(struct bio *bio, unsigned op,
+ unsigned op_flags)
+{
+ bio->bi_rw = op | op_flags;
+}
+
+/* crypto.c */
+extern int fscrypt_initialize(unsigned int cop_flags);
+extern struct workqueue_struct *fscrypt_read_workqueue;
+extern int fscrypt_do_page_crypto(const struct inode *inode,
+ fscrypt_direction_t rw, u64 lblk_num,
+ struct page *src_page,
+ struct page *dest_page,
+ unsigned int len, unsigned int offs,
+ gfp_t gfp_flags);
+extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx,
+ gfp_t gfp_flags);
+
+#endif /* _FSCRYPT_PRIVATE_H */
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
new file mode 100644
index 000000000000..995003cb5301
--- /dev/null
+++ b/fs/crypto/keyinfo.c
@@ -0,0 +1,297 @@
+/*
+ * key management facility for FS encryption support.
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * This contains encryption key functions.
+ *
+ * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
+ */
+
+#include <keys/user-type.h>
+#include <linux/scatterlist.h>
+#include "fscrypt_private.h"
+
+static void derive_crypt_complete(struct crypto_async_request *req, int rc)
+{
+ struct fscrypt_completion_result *ecr = req->data;
+
+ if (rc == -EINPROGRESS)
+ return;
+
+ ecr->res = rc;
+ complete(&ecr->completion);
+}
+
+/**
+ * derive_key_aes() - Derive a key using AES-128-ECB
+ * @deriving_key: Encryption key used for derivation.
+ * @source_key: Source key to which to apply derivation.
+ * @derived_key: Derived key.
+ *
+ * Return: Zero on success; non-zero otherwise.
+ */
+static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE],
+ u8 source_key[FS_AES_256_XTS_KEY_SIZE],
+ u8 derived_key[FS_AES_256_XTS_KEY_SIZE])
+{
+ int res = 0;
+ struct ablkcipher_request *req = NULL;
+ DECLARE_FS_COMPLETION_RESULT(ecr);
+ struct scatterlist src_sg, dst_sg;
+ struct crypto_ablkcipher *tfm = crypto_alloc_ablkcipher("ecb(aes)", 0, 0);
+
+ if (IS_ERR(tfm)) {
+ res = PTR_ERR(tfm);
+ tfm = NULL;
+ goto out;
+ }
+ crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+ if (!req) {
+ res = -ENOMEM;
+ goto out;
+ }
+ ablkcipher_request_set_callback(req,
+ CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ derive_crypt_complete, &ecr);
+ res = crypto_ablkcipher_setkey(tfm, deriving_key,
+ FS_AES_128_ECB_KEY_SIZE);
+ if (res < 0)
+ goto out;
+
+ sg_init_one(&src_sg, source_key, FS_AES_256_XTS_KEY_SIZE);
+ sg_init_one(&dst_sg, derived_key, FS_AES_256_XTS_KEY_SIZE);
+ ablkcipher_request_set_crypt(req, &src_sg, &dst_sg,
+ FS_AES_256_XTS_KEY_SIZE, NULL);
+ res = crypto_ablkcipher_encrypt(req);
+ if (res == -EINPROGRESS || res == -EBUSY) {
+ wait_for_completion(&ecr.completion);
+ res = ecr.res;
+ }
+out:
+ if (req)
+ ablkcipher_request_free(req);
+ if (tfm)
+ crypto_free_ablkcipher(tfm);
+ return res;
+}
+
+static int validate_user_key(struct fscrypt_info *crypt_info,
+ struct fscrypt_context *ctx, u8 *raw_key,
+ const char *prefix)
+{
+ char *description;
+ struct key *keyring_key;
+ struct fscrypt_key *master_key;
+ const struct user_key_payload *ukp;
+ int prefix_size = strlen(prefix);
+ int full_key_len = prefix_size + (FS_KEY_DESCRIPTOR_SIZE * 2) + 1;
+ int res;
+
+ /* FIXME: 3.18 causes kernel panic.
+ description = kasprintf(GFP_NOFS, "%s%*phN", prefix,
+ FS_KEY_DESCRIPTOR_SIZE,
+ ctx->master_key_descriptor);
+ */
+ description = kmalloc(full_key_len, GFP_NOFS);
+ if (!description)
+ return -ENOMEM;
+
+ memcpy(description, prefix, prefix_size);
+ sprintf(description + prefix_size,
+ "%*phN", FS_KEY_DESCRIPTOR_SIZE,
+ ctx->master_key_descriptor);
+ description[full_key_len - 1] = '\0';
+
+ keyring_key = request_key(&key_type_logon, description, NULL);
+ kfree(description);
+ if (IS_ERR(keyring_key))
+ return PTR_ERR(keyring_key);
+ down_read(&keyring_key->sem);
+
+ if (keyring_key->type != &key_type_logon) {
+ printk_once(KERN_WARNING
+ "%s: key type must be logon\n", __func__);
+ res = -ENOKEY;
+ goto out;
+ }
+ ukp = user_key_payload(keyring_key);
+ if (ukp->datalen != sizeof(struct fscrypt_key)) {
+ res = -EINVAL;
+ goto out;
+ }
+ master_key = (struct fscrypt_key *)ukp->data;
+ BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE);
+
+ if (master_key->size != FS_AES_256_XTS_KEY_SIZE) {
+ printk_once(KERN_WARNING
+ "%s: key size incorrect: %d\n",
+ __func__, master_key->size);
+ res = -ENOKEY;
+ goto out;
+ }
+ res = derive_key_aes(ctx->nonce, master_key->raw, raw_key);
+out:
+ up_read(&keyring_key->sem);
+ key_put(keyring_key);
+ return res;
+}
+
+static int determine_cipher_type(struct fscrypt_info *ci, struct inode *inode,
+ const char **cipher_str_ret, int *keysize_ret)
+{
+ if (S_ISREG(inode->i_mode)) {
+ if (ci->ci_data_mode == FS_ENCRYPTION_MODE_AES_256_XTS) {
+ *cipher_str_ret = "xts(aes)";
+ *keysize_ret = FS_AES_256_XTS_KEY_SIZE;
+ return 0;
+ }
+ pr_warn_once("fscrypto: unsupported contents encryption mode "
+ "%d for inode %lu\n",
+ ci->ci_data_mode, inode->i_ino);
+ return -ENOKEY;
+ }
+
+ if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) {
+ if (ci->ci_filename_mode == FS_ENCRYPTION_MODE_AES_256_CTS) {
+ *cipher_str_ret = "cts(cbc(aes))";
+ *keysize_ret = FS_AES_256_CTS_KEY_SIZE;
+ return 0;
+ }
+ pr_warn_once("fscrypto: unsupported filenames encryption mode "
+ "%d for inode %lu\n",
+ ci->ci_filename_mode, inode->i_ino);
+ return -ENOKEY;
+ }
+
+ pr_warn_once("fscrypto: unsupported file type %d for inode %lu\n",
+ (inode->i_mode & S_IFMT), inode->i_ino);
+ return -ENOKEY;
+}
+
+static void put_crypt_info(struct fscrypt_info *ci)
+{
+ if (!ci)
+ return;
+
+ crypto_free_ablkcipher(ci->ci_ctfm);
+ kmem_cache_free(fscrypt_info_cachep, ci);
+}
+
+int fscrypt_get_encryption_info(struct inode *inode)
+{
+ struct fscrypt_info *crypt_info;
+ struct fscrypt_context ctx;
+ struct crypto_ablkcipher *ctfm;
+ const char *cipher_str;
+ int keysize;
+ u8 *raw_key = NULL;
+ int res;
+
+ if (inode->i_crypt_info)
+ return 0;
+
+ res = fscrypt_initialize(inode->i_sb->s_cop->flags);
+ if (res)
+ return res;
+
+ res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
+ if (res < 0) {
+ if (!fscrypt_dummy_context_enabled(inode) ||
+ inode->i_sb->s_cop->is_encrypted(inode))
+ return res;
+ /* Fake up a context for an unencrypted directory */
+ memset(&ctx, 0, sizeof(ctx));
+ ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1;
+ ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS;
+ ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS;
+ memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE);
+ } else if (res != sizeof(ctx)) {
+ return -EINVAL;
+ }
+
+ if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1)
+ return -EINVAL;
+
+ if (ctx.flags & ~FS_POLICY_FLAGS_VALID)
+ return -EINVAL;
+
+ crypt_info = kmem_cache_alloc(fscrypt_info_cachep, GFP_NOFS);
+ if (!crypt_info)
+ return -ENOMEM;
+
+ crypt_info->ci_flags = ctx.flags;
+ crypt_info->ci_data_mode = ctx.contents_encryption_mode;
+ crypt_info->ci_filename_mode = ctx.filenames_encryption_mode;
+ crypt_info->ci_ctfm = NULL;
+ memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor,
+ sizeof(crypt_info->ci_master_key));
+
+ res = determine_cipher_type(crypt_info, inode, &cipher_str, &keysize);
+ if (res)
+ goto out;
+
+ /*
+ * This cannot be a stack buffer because it is passed to the scatterlist
+ * crypto API as part of key derivation.
+ */
+ res = -ENOMEM;
+ raw_key = kmalloc(FS_MAX_KEY_SIZE, GFP_NOFS);
+ if (!raw_key)
+ goto out;
+
+ res = validate_user_key(crypt_info, &ctx, raw_key, FS_KEY_DESC_PREFIX);
+ if (res && inode->i_sb->s_cop->key_prefix) {
+ int res2 = validate_user_key(crypt_info, &ctx, raw_key,
+ inode->i_sb->s_cop->key_prefix);
+ if (res2) {
+ if (res2 == -ENOKEY)
+ res = -ENOKEY;
+ goto out;
+ }
+ } else if (res) {
+ goto out;
+ }
+ ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0);
+ if (!ctfm || IS_ERR(ctfm)) {
+ res = ctfm ? PTR_ERR(ctfm) : -ENOMEM;
+ printk(KERN_DEBUG
+ "%s: error %d (inode %u) allocating crypto tfm\n",
+ __func__, res, (unsigned) inode->i_ino);
+ goto out;
+ }
+ crypt_info->ci_ctfm = ctfm;
+ crypto_ablkcipher_clear_flags(ctfm, ~0);
+ crypto_ablkcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ res = crypto_ablkcipher_setkey(ctfm, raw_key, keysize);
+ if (res)
+ goto out;
+
+ if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) == NULL)
+ crypt_info = NULL;
+out:
+ if (res == -ENOKEY)
+ res = 0;
+ put_crypt_info(crypt_info);
+ kzfree(raw_key);
+ return res;
+}
+EXPORT_SYMBOL(fscrypt_get_encryption_info);
+
+void fscrypt_put_encryption_info(struct inode *inode, struct fscrypt_info *ci)
+{
+ struct fscrypt_info *prev;
+
+ if (ci == NULL)
+ ci = ACCESS_ONCE(inode->i_crypt_info);
+ if (ci == NULL)
+ return;
+
+ prev = cmpxchg(&inode->i_crypt_info, ci, NULL);
+ if (prev != ci)
+ return;
+
+ put_crypt_info(ci);
+}
+EXPORT_SYMBOL(fscrypt_put_encryption_info);
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
new file mode 100644
index 000000000000..210976e7a269
--- /dev/null
+++ b/fs/crypto/policy.c
@@ -0,0 +1,269 @@
+/*
+ * Encryption policy functions for per-file encryption support.
+ *
+ * Copyright (C) 2015, Google, Inc.
+ * Copyright (C) 2015, Motorola Mobility.
+ *
+ * Written by Michael Halcrow, 2015.
+ * Modified by Jaegeuk Kim, 2015.
+ */
+
+#include <linux/random.h>
+#include <linux/string.h>
+#include <linux/mount.h>
+#include "fscrypt_private.h"
+
+/*
+ * check whether an encryption policy is consistent with an encryption context
+ */
+static bool is_encryption_context_consistent_with_policy(
+ const struct fscrypt_context *ctx,
+ const struct fscrypt_policy *policy)
+{
+ return memcmp(ctx->master_key_descriptor, policy->master_key_descriptor,
+ FS_KEY_DESCRIPTOR_SIZE) == 0 &&
+ (ctx->flags == policy->flags) &&
+ (ctx->contents_encryption_mode ==
+ policy->contents_encryption_mode) &&
+ (ctx->filenames_encryption_mode ==
+ policy->filenames_encryption_mode);
+}
+
+static int create_encryption_context_from_policy(struct inode *inode,
+ const struct fscrypt_policy *policy)
+{
+ struct fscrypt_context ctx;
+
+ ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1;
+ memcpy(ctx.master_key_descriptor, policy->master_key_descriptor,
+ FS_KEY_DESCRIPTOR_SIZE);
+
+ if (!fscrypt_valid_contents_enc_mode(
+ policy->contents_encryption_mode))
+ return -EINVAL;
+
+ if (!fscrypt_valid_filenames_enc_mode(
+ policy->filenames_encryption_mode))
+ return -EINVAL;
+
+ if (policy->flags & ~FS_POLICY_FLAGS_VALID)
+ return -EINVAL;
+
+ ctx.contents_encryption_mode = policy->contents_encryption_mode;
+ ctx.filenames_encryption_mode = policy->filenames_encryption_mode;
+ ctx.flags = policy->flags;
+ BUILD_BUG_ON(sizeof(ctx.nonce) != FS_KEY_DERIVATION_NONCE_SIZE);
+ get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE);
+
+ return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL);
+}
+
+int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg)
+{
+ struct fscrypt_policy policy;
+ struct inode *inode = file_inode(filp);
+ int ret;
+ struct fscrypt_context ctx;
+
+ if (copy_from_user(&policy, arg, sizeof(policy)))
+ return -EFAULT;
+
+ if (!inode_owner_or_capable(inode))
+ return -EACCES;
+
+ if (policy.version != 0)
+ return -EINVAL;
+
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ inode_lock(inode);
+
+ ret = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
+ if (ret == -ENODATA) {
+ if (!S_ISDIR(inode->i_mode))
+ ret = -ENOTDIR;
+ else if (!inode->i_sb->s_cop->empty_dir(inode))
+ ret = -ENOTEMPTY;
+ else
+ ret = create_encryption_context_from_policy(inode,
+ &policy);
+ } else if (ret == sizeof(ctx) &&
+ is_encryption_context_consistent_with_policy(&ctx,
+ &policy)) {
+ /* The file already uses the same encryption policy. */
+ ret = 0;
+ } else if (ret >= 0 || ret == -ERANGE) {
+ /* The file already uses a different encryption policy. */
+ ret = -EEXIST;
+ }
+
+ inode_unlock(inode);
+
+ mnt_drop_write_file(filp);
+ return ret;
+}
+EXPORT_SYMBOL(fscrypt_ioctl_set_policy);
+
+int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct fscrypt_context ctx;
+ struct fscrypt_policy policy;
+ int res;
+
+ if (!inode->i_sb->s_cop->is_encrypted(inode))
+ return -ENODATA;
+
+ res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
+ if (res < 0 && res != -ERANGE)
+ return res;
+ if (res != sizeof(ctx))
+ return -EINVAL;
+ if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1)
+ return -EINVAL;
+
+ policy.version = 0;
+ policy.contents_encryption_mode = ctx.contents_encryption_mode;
+ policy.filenames_encryption_mode = ctx.filenames_encryption_mode;
+ policy.flags = ctx.flags;
+ memcpy(policy.master_key_descriptor, ctx.master_key_descriptor,
+ FS_KEY_DESCRIPTOR_SIZE);
+
+ if (copy_to_user(arg, &policy, sizeof(policy)))
+ return -EFAULT;
+ return 0;
+}
+EXPORT_SYMBOL(fscrypt_ioctl_get_policy);
+
+/**
+ * fscrypt_has_permitted_context() - is a file's encryption policy permitted
+ * within its directory?
+ *
+ * @parent: inode for parent directory
+ * @child: inode for file being looked up, opened, or linked into @parent
+ *
+ * Filesystems must call this before permitting access to an inode in a
+ * situation where the parent directory is encrypted (either before allowing
+ * ->lookup() to succeed, or for a regular file before allowing it to be opened)
+ * and before any operation that involves linking an inode into an encrypted
+ * directory, including link, rename, and cross rename. It enforces the
+ * constraint that within a given encrypted directory tree, all files use the
+ * same encryption policy. The pre-access check is needed to detect potentially
+ * malicious offline violations of this constraint, while the link and rename
+ * checks are needed to prevent online violations of this constraint.
+ *
+ * Return: 1 if permitted, 0 if forbidden. If forbidden, the caller must fail
+ * the filesystem operation with EPERM.
+ */
+int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
+{
+ const struct fscrypt_operations *cops = parent->i_sb->s_cop;
+ const struct fscrypt_info *parent_ci, *child_ci;
+ struct fscrypt_context parent_ctx, child_ctx;
+ int res;
+
+ /* No restrictions on file types which are never encrypted */
+ if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) &&
+ !S_ISLNK(child->i_mode))
+ return 1;
+
+ /* No restrictions if the parent directory is unencrypted */
+ if (!cops->is_encrypted(parent))
+ return 1;
+
+ /* Encrypted directories must not contain unencrypted files */
+ if (!cops->is_encrypted(child))
+ return 0;
+
+ /*
+ * Both parent and child are encrypted, so verify they use the same
+ * encryption policy. Compare the fscrypt_info structs if the keys are
+ * available, otherwise retrieve and compare the fscrypt_contexts.
+ *
+ * Note that the fscrypt_context retrieval will be required frequently
+ * when accessing an encrypted directory tree without the key.
+ * Performance-wise this is not a big deal because we already don't
+ * really optimize for file access without the key (to the extent that
+ * such access is even possible), given that any attempted access
+ * already causes a fscrypt_context retrieval and keyring search.
+ *
+ * In any case, if an unexpected error occurs, fall back to "forbidden".
+ */
+
+ res = fscrypt_get_encryption_info(parent);
+ if (res)
+ return 0;
+ res = fscrypt_get_encryption_info(child);
+ if (res)
+ return 0;
+ parent_ci = parent->i_crypt_info;
+ child_ci = child->i_crypt_info;
+
+ if (parent_ci && child_ci) {
+ return memcmp(parent_ci->ci_master_key, child_ci->ci_master_key,
+ FS_KEY_DESCRIPTOR_SIZE) == 0 &&
+ (parent_ci->ci_data_mode == child_ci->ci_data_mode) &&
+ (parent_ci->ci_filename_mode ==
+ child_ci->ci_filename_mode) &&
+ (parent_ci->ci_flags == child_ci->ci_flags);
+ }
+
+ res = cops->get_context(parent, &parent_ctx, sizeof(parent_ctx));
+ if (res != sizeof(parent_ctx))
+ return 0;
+
+ res = cops->get_context(child, &child_ctx, sizeof(child_ctx));
+ if (res != sizeof(child_ctx))
+ return 0;
+
+ return memcmp(parent_ctx.master_key_descriptor,
+ child_ctx.master_key_descriptor,
+ FS_KEY_DESCRIPTOR_SIZE) == 0 &&
+ (parent_ctx.contents_encryption_mode ==
+ child_ctx.contents_encryption_mode) &&
+ (parent_ctx.filenames_encryption_mode ==
+ child_ctx.filenames_encryption_mode) &&
+ (parent_ctx.flags == child_ctx.flags);
+}
+EXPORT_SYMBOL(fscrypt_has_permitted_context);
+
+/**
+ * fscrypt_inherit_context() - Sets a child context from its parent
+ * @parent: Parent inode from which the context is inherited.
+ * @child: Child inode that inherits the context from @parent.
+ * @fs_data: private data given by FS.
+ * @preload: preload child i_crypt_info if true
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int fscrypt_inherit_context(struct inode *parent, struct inode *child,
+ void *fs_data, bool preload)
+{
+ struct fscrypt_context ctx;
+ struct fscrypt_info *ci;
+ int res;
+
+ res = fscrypt_get_encryption_info(parent);
+ if (res < 0)
+ return res;
+
+ ci = parent->i_crypt_info;
+ if (ci == NULL)
+ return -ENOKEY;
+
+ ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1;
+ ctx.contents_encryption_mode = ci->ci_data_mode;
+ ctx.filenames_encryption_mode = ci->ci_filename_mode;
+ ctx.flags = ci->ci_flags;
+ memcpy(ctx.master_key_descriptor, ci->ci_master_key,
+ FS_KEY_DESCRIPTOR_SIZE);
+ get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE);
+ res = parent->i_sb->s_cop->set_context(child, &ctx,
+ sizeof(ctx), fs_data);
+ if (res)
+ return res;
+ return preload ? fscrypt_get_encryption_info(child): 0;
+}
+EXPORT_SYMBOL(fscrypt_inherit_context);
diff --git a/fs/dcache.c b/fs/dcache.c
index 321a761e7299..c88ce0c031c8 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2735,11 +2735,11 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
* Note: If ever the locking in lock_rename() changes, then please
* remember to update this too...
*/
-static struct dentry *__d_unalias(struct inode *inode,
+static int __d_unalias(struct inode *inode,
struct dentry *dentry, struct dentry *alias)
{
struct mutex *m1 = NULL, *m2 = NULL;
- struct dentry *ret = ERR_PTR(-EBUSY);
+ int ret = -EBUSY;
/* If alias and dentry share a parent, then no extra locks required */
if (alias->d_parent == dentry->d_parent)
@@ -2754,7 +2754,7 @@ static struct dentry *__d_unalias(struct inode *inode,
m2 = &alias->d_parent->d_inode->i_mutex;
out_unalias:
__d_move(alias, dentry, false);
- ret = alias;
+ ret = 0;
out_err:
spin_unlock(&inode->i_lock);
if (m2)
@@ -2789,130 +2789,57 @@ out_err:
*/
struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
{
- struct dentry *new = NULL;
-
if (IS_ERR(inode))
return ERR_CAST(inode);
- if (inode && S_ISDIR(inode->i_mode)) {
- spin_lock(&inode->i_lock);
- new = __d_find_any_alias(inode);
- if (new) {
- if (!IS_ROOT(new)) {
- spin_unlock(&inode->i_lock);
- dput(new);
- iput(inode);
- return ERR_PTR(-EIO);
- }
- if (d_ancestor(new, dentry)) {
- spin_unlock(&inode->i_lock);
- dput(new);
- iput(inode);
- return ERR_PTR(-EIO);
- }
- write_seqlock(&rename_lock);
- __d_move(new, dentry, false);
- write_sequnlock(&rename_lock);
- spin_unlock(&inode->i_lock);
- security_d_instantiate(new, inode);
- iput(inode);
- } else {
- /* already taking inode->i_lock, so d_add() by hand */
- __d_instantiate(dentry, inode);
- spin_unlock(&inode->i_lock);
- security_d_instantiate(dentry, inode);
- d_rehash(dentry);
- }
- } else {
- d_instantiate(dentry, inode);
- if (d_unhashed(dentry))
- d_rehash(dentry);
- }
- return new;
-}
-EXPORT_SYMBOL(d_splice_alias);
-
-/**
- * d_materialise_unique - introduce an inode into the tree
- * @dentry: candidate dentry
- * @inode: inode to bind to the dentry, to which aliases may be attached
- *
- * Introduces an dentry into the tree, substituting an extant disconnected
- * root directory alias in its place if there is one. Caller must hold the
- * i_mutex of the parent directory.
- */
-struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
-{
- struct dentry *actual;
-
BUG_ON(!d_unhashed(dentry));
if (!inode) {
- actual = dentry;
__d_instantiate(dentry, NULL);
- d_rehash(actual);
- goto out_nolock;
+ goto out;
}
-
spin_lock(&inode->i_lock);
-
if (S_ISDIR(inode->i_mode)) {
- struct dentry *alias;
-
- /* Does an aliased dentry already exist? */
- alias = __d_find_alias(inode);
- if (alias) {
- actual = alias;
+ struct dentry *new = __d_find_any_alias(inode);
+ if (unlikely(new)) {
write_seqlock(&rename_lock);
-
- if (d_ancestor(alias, dentry)) {
- /* Check for loops */
- actual = ERR_PTR(-ELOOP);
+ if (unlikely(d_ancestor(new, dentry))) {
+ write_sequnlock(&rename_lock);
spin_unlock(&inode->i_lock);
- } else if (IS_ROOT(alias)) {
- /* Is this an anonymous mountpoint that we
- * could splice into our tree? */
- __d_move(alias, dentry, false);
+ dput(new);
+ new = ERR_PTR(-ELOOP);
+ pr_warn_ratelimited(
+ "VFS: Lookup of '%s' in %s %s"
+ " would have caused loop\n",
+ dentry->d_name.name,
+ inode->i_sb->s_type->name,
+ inode->i_sb->s_id);
+ } else if (!IS_ROOT(new)) {
+ int err = __d_unalias(inode, dentry, new);
write_sequnlock(&rename_lock);
- goto found;
+ if (err) {
+ dput(new);
+ new = ERR_PTR(err);
+ }
} else {
- /* Nope, but we must(!) avoid directory
- * aliasing. This drops inode->i_lock */
- actual = __d_unalias(inode, dentry, alias);
- }
- write_sequnlock(&rename_lock);
- if (IS_ERR(actual)) {
- if (PTR_ERR(actual) == -ELOOP)
- pr_warn_ratelimited(
- "VFS: Lookup of '%s' in %s %s"
- " would have caused loop\n",
- dentry->d_name.name,
- inode->i_sb->s_type->name,
- inode->i_sb->s_id);
- dput(alias);
+ __d_move(new, dentry, false);
+ write_sequnlock(&rename_lock);
+ spin_unlock(&inode->i_lock);
+ security_d_instantiate(new, inode);
}
- goto out_nolock;
+ iput(inode);
+ return new;
}
}
-
- /* Add a unique reference */
- actual = __d_instantiate_unique(dentry, inode);
- if (!actual)
- actual = dentry;
-
- d_rehash(actual);
-found:
+ /* already taking inode->i_lock, so d_add() by hand */
+ __d_instantiate(dentry, inode);
spin_unlock(&inode->i_lock);
-out_nolock:
- if (actual == dentry) {
- security_d_instantiate(dentry, inode);
- return NULL;
- }
-
- iput(inode);
- return actual;
+out:
+ security_d_instantiate(dentry, inode);
+ d_rehash(dentry);
+ return NULL;
}
-EXPORT_SYMBOL_GPL(d_materialise_unique);
+EXPORT_SYMBOL(d_splice_alias);
static int prepend(char **buffer, int *buflen, const char *str, int namelen)
{
@@ -3113,6 +3040,7 @@ char *d_absolute_path(const struct path *path,
return ERR_PTR(error);
return res;
}
+EXPORT_SYMBOL(d_absolute_path);
/*
* same as __d_path but appends "(deleted)" for unlinked files.
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 6f0ce531e221..960f294b45a8 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -34,93 +34,16 @@ static struct vfsmount *debugfs_mount;
static int debugfs_mount_count;
static bool debugfs_registered;
-static struct inode *debugfs_get_inode(struct super_block *sb, umode_t mode, dev_t dev,
- void *data, const struct file_operations *fops)
-
+static struct inode *debugfs_get_inode(struct super_block *sb)
{
struct inode *inode = new_inode(sb);
-
if (inode) {
inode->i_ino = get_next_ino();
- inode->i_mode = mode;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- switch (mode & S_IFMT) {
- default:
- init_special_inode(inode, mode, dev);
- break;
- case S_IFREG:
- inode->i_fop = fops ? fops : &debugfs_file_operations;
- inode->i_private = data;
- break;
- case S_IFLNK:
- inode->i_op = &debugfs_link_operations;
- inode->i_private = data;
- break;
- case S_IFDIR:
- inode->i_op = &simple_dir_inode_operations;
- inode->i_fop = &simple_dir_operations;
-
- /* directory inodes start off with i_nlink == 2
- * (for "." entry) */
- inc_nlink(inode);
- break;
- }
}
return inode;
}
-/* SMP-safe */
-static int debugfs_mknod(struct inode *dir, struct dentry *dentry,
- umode_t mode, dev_t dev, void *data,
- const struct file_operations *fops)
-{
- struct inode *inode;
- int error = -EPERM;
-
- if (dentry->d_inode)
- return -EEXIST;
-
- inode = debugfs_get_inode(dir->i_sb, mode, dev, data, fops);
- if (inode) {
- d_instantiate(dentry, inode);
- dget(dentry);
- error = 0;
- }
- return error;
-}
-
-static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
-{
- int res;
-
- mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR;
- res = debugfs_mknod(dir, dentry, mode, 0, NULL, NULL);
- if (!res) {
- inc_nlink(dir);
- fsnotify_mkdir(dir, dentry);
- }
- return res;
-}
-
-static int debugfs_link(struct inode *dir, struct dentry *dentry, umode_t mode,
- void *data)
-{
- mode = (mode & S_IALLUGO) | S_IFLNK;
- return debugfs_mknod(dir, dentry, mode, 0, data, NULL);
-}
-
-static int debugfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- void *data, const struct file_operations *fops)
-{
- int res;
-
- mode = (mode & S_IALLUGO) | S_IFREG;
- res = debugfs_mknod(dir, dentry, mode, 0, data, fops);
- if (!res)
- fsnotify_create(dir, dentry);
- return res;
-}
-
static inline int debugfs_positive(struct dentry *dentry)
{
return dentry->d_inode && !d_unhashed(dentry);
@@ -261,6 +184,18 @@ static const struct super_operations debugfs_super_operations = {
.evict_inode = debugfs_evict_inode,
};
+static struct vfsmount *debugfs_automount(struct path *path)
+{
+ struct vfsmount *(*f)(void *);
+ f = (struct vfsmount *(*)(void *))path->dentry->d_fsdata;
+ return f(path->dentry->d_inode->i_private);
+}
+
+static const struct dentry_operations debugfs_dops = {
+ .d_delete = always_delete_dentry,
+ .d_automount = debugfs_automount,
+};
+
static int debug_fill_super(struct super_block *sb, void *data, int silent)
{
static struct tree_descr debug_files[] = {{""}};
@@ -285,6 +220,7 @@ static int debug_fill_super(struct super_block *sb, void *data, int silent)
goto fail;
sb->s_op = &debugfs_super_operations;
+ sb->s_d_op = &debugfs_dops;
debugfs_apply_options(sb);
@@ -311,11 +247,9 @@ static struct file_system_type debug_fs_type = {
};
MODULE_ALIAS_FS("debugfs");
-static struct dentry *__create_file(const char *name, umode_t mode,
- struct dentry *parent, void *data,
- const struct file_operations *fops)
+static struct dentry *start_creating(const char *name, struct dentry *parent)
{
- struct dentry *dentry = NULL;
+ struct dentry *dentry;
int error;
pr_debug("debugfs: creating file '%s'\n",name);
@@ -323,7 +257,7 @@ static struct dentry *__create_file(const char *name, umode_t mode,
error = simple_pin_fs(&debug_fs_type, &debugfs_mount,
&debugfs_mount_count);
if (error)
- goto exit;
+ return ERR_PTR(error);
/* If the parent is not specified, we create it in the root.
* We need the root dentry to do this, which is in the super
@@ -335,31 +269,26 @@ static struct dentry *__create_file(const char *name, umode_t mode,
mutex_lock(&parent->d_inode->i_mutex);
dentry = lookup_one_len(name, parent, strlen(name));
- if (!IS_ERR(dentry)) {
- switch (mode & S_IFMT) {
- case S_IFDIR:
- error = debugfs_mkdir(parent->d_inode, dentry, mode);
-
- break;
- case S_IFLNK:
- error = debugfs_link(parent->d_inode, dentry, mode,
- data);
- break;
- default:
- error = debugfs_create(parent->d_inode, dentry, mode,
- data, fops);
- break;
- }
+ if (!IS_ERR(dentry) && dentry->d_inode) {
dput(dentry);
- } else
- error = PTR_ERR(dentry);
- mutex_unlock(&parent->d_inode->i_mutex);
-
- if (error) {
- dentry = NULL;
- simple_release_fs(&debugfs_mount, &debugfs_mount_count);
+ dentry = ERR_PTR(-EEXIST);
}
-exit:
+ if (IS_ERR(dentry))
+ mutex_unlock(&parent->d_inode->i_mutex);
+ return dentry;
+}
+
+static struct dentry *failed_creating(struct dentry *dentry)
+{
+ mutex_unlock(&dentry->d_parent->d_inode->i_mutex);
+ dput(dentry);
+ simple_release_fs(&debugfs_mount, &debugfs_mount_count);
+ return NULL;
+}
+
+static struct dentry *end_creating(struct dentry *dentry)
+{
+ mutex_unlock(&dentry->d_parent->d_inode->i_mutex);
return dentry;
}
@@ -393,15 +322,27 @@ struct dentry *debugfs_create_file(const char *name, umode_t mode,
struct dentry *parent, void *data,
const struct file_operations *fops)
{
- switch (mode & S_IFMT) {
- case S_IFREG:
- case 0:
- break;
- default:
- BUG();
- }
+ struct dentry *dentry;
+ struct inode *inode;
+
+ if (!(mode & S_IFMT))
+ mode |= S_IFREG;
+ BUG_ON(!S_ISREG(mode));
+ dentry = start_creating(name, parent);
+
+ if (IS_ERR(dentry))
+ return NULL;
+
+ inode = debugfs_get_inode(dentry->d_sb);
+ if (unlikely(!inode))
+ return failed_creating(dentry);
- return __create_file(name, mode, parent, data, fops);
+ inode->i_mode = mode;
+ inode->i_fop = fops ? fops : &debugfs_file_operations;
+ inode->i_private = data;
+ d_instantiate(dentry, inode);
+ fsnotify_create(dentry->d_parent->d_inode, dentry);
+ return end_creating(dentry);
}
EXPORT_SYMBOL_GPL(debugfs_create_file);
@@ -425,12 +366,65 @@ EXPORT_SYMBOL_GPL(debugfs_create_file);
*/
struct dentry *debugfs_create_dir(const char *name, struct dentry *parent)
{
- return __create_file(name, S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO,
- parent, NULL, NULL);
+ struct dentry *dentry = start_creating(name, parent);
+ struct inode *inode;
+
+ if (IS_ERR(dentry))
+ return NULL;
+
+ inode = debugfs_get_inode(dentry->d_sb);
+ if (unlikely(!inode))
+ return failed_creating(dentry);
+
+ inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+ inode->i_op = &simple_dir_inode_operations;
+ inode->i_fop = &simple_dir_operations;
+
+ /* directory inodes start off with i_nlink == 2 (for "." entry) */
+ inc_nlink(inode);
+ d_instantiate(dentry, inode);
+ inc_nlink(dentry->d_parent->d_inode);
+ fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
+ return end_creating(dentry);
}
EXPORT_SYMBOL_GPL(debugfs_create_dir);
/**
+ * debugfs_create_automount - create automount point in the debugfs filesystem
+ * @name: a pointer to a string containing the name of the file to create.
+ * @parent: a pointer to the parent dentry for this file. This should be a
+ * directory dentry if set. If this parameter is NULL, then the
+ * file will be created in the root of the debugfs filesystem.
+ * @f: function to be called when pathname resolution steps on that one.
+ * @data: opaque argument to pass to f().
+ *
+ * @f should return what ->d_automount() would.
+ */
+struct dentry *debugfs_create_automount(const char *name,
+ struct dentry *parent,
+ struct vfsmount *(*f)(void *),
+ void *data)
+{
+ struct dentry *dentry = start_creating(name, parent);
+ struct inode *inode;
+
+ if (IS_ERR(dentry))
+ return NULL;
+
+ inode = debugfs_get_inode(dentry->d_sb);
+ if (unlikely(!inode))
+ return failed_creating(dentry);
+
+ inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+ inode->i_flags |= S_AUTOMOUNT;
+ inode->i_private = data;
+ dentry->d_fsdata = (void *)f;
+ d_instantiate(dentry, inode);
+ return end_creating(dentry);
+}
+EXPORT_SYMBOL(debugfs_create_automount);
+
+/**
* debugfs_create_symlink- create a symbolic link in the debugfs filesystem
* @name: a pointer to a string containing the name of the symbolic link to
* create.
@@ -456,17 +450,28 @@ EXPORT_SYMBOL_GPL(debugfs_create_dir);
struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
const char *target)
{
- struct dentry *result;
- char *link;
-
- link = kstrdup(target, GFP_KERNEL);
+ struct dentry *dentry;
+ struct inode *inode;
+ char *link = kstrdup(target, GFP_KERNEL);
if (!link)
return NULL;
- result = __create_file(name, S_IFLNK | S_IRWXUGO, parent, link, NULL);
- if (!result)
+ dentry = start_creating(name, parent);
+ if (IS_ERR(dentry)) {
kfree(link);
- return result;
+ return NULL;
+ }
+
+ inode = debugfs_get_inode(dentry->d_sb);
+ if (unlikely(!inode)) {
+ kfree(link);
+ return failed_creating(dentry);
+ }
+ inode->i_mode = S_IFLNK | S_IRWXUGO;
+ inode->i_op = &debugfs_link_operations;
+ inode->i_private = link;
+ d_instantiate(dentry, inode);
+ return end_creating(dentry);
}
EXPORT_SYMBOL_GPL(debugfs_create_symlink);
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index c02f52cfe64a..03fe4c8b3921 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -119,7 +119,6 @@ static int ecryptfs_readdir(struct file *file, struct dir_context *ctx)
.sb = inode->i_sb,
};
lower_file = ecryptfs_file_to_lower(file);
- lower_file->f_pos = ctx->pos;
rc = iterate_dir(lower_file, &buf.ctx);
ctx->pos = buf.ctx.pos;
if (rc < 0)
@@ -243,14 +242,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
}
ecryptfs_set_file_lower(
file, ecryptfs_inode_to_private(inode)->lower_file);
- if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) {
- ecryptfs_printk(KERN_DEBUG, "This is a directory\n");
- mutex_lock(&crypt_stat->cs_mutex);
- crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED);
- mutex_unlock(&crypt_stat->cs_mutex);
- rc = 0;
- goto out;
- }
rc = read_or_initialize_metadata(ecryptfs_dentry);
if (rc)
goto out_put;
@@ -267,6 +258,45 @@ out:
return rc;
}
+/**
+ * ecryptfs_dir_open
+ * @inode: inode speciying file to open
+ * @file: Structure to return filled in
+ *
+ * Opens the file specified by inode.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+static int ecryptfs_dir_open(struct inode *inode, struct file *file)
+{
+ struct dentry *ecryptfs_dentry = file->f_path.dentry;
+ /* Private value of ecryptfs_dentry allocated in
+ * ecryptfs_lookup() */
+ struct ecryptfs_file_info *file_info;
+ struct file *lower_file;
+
+ /* Released in ecryptfs_release or end of function if failure */
+ file_info = kmem_cache_zalloc(ecryptfs_file_info_cache, GFP_KERNEL);
+ ecryptfs_set_file_private(file, file_info);
+ if (unlikely(!file_info)) {
+ ecryptfs_printk(KERN_ERR,
+ "Error attempting to allocate memory\n");
+ return -ENOMEM;
+ }
+ lower_file = dentry_open(ecryptfs_dentry_to_lower_path(ecryptfs_dentry),
+ file->f_flags, current_cred());
+ if (IS_ERR(lower_file)) {
+ printk(KERN_ERR "%s: Error attempting to initialize "
+ "the lower file for the dentry with name "
+ "[%pd]; rc = [%ld]\n", __func__,
+ ecryptfs_dentry, PTR_ERR(lower_file));
+ kmem_cache_free(ecryptfs_file_info_cache, file_info);
+ return PTR_ERR(lower_file);
+ }
+ ecryptfs_set_file_lower(file, lower_file);
+ return 0;
+}
+
static int ecryptfs_flush(struct file *file, fl_owner_t td)
{
struct file *lower_file = ecryptfs_file_to_lower(file);
@@ -287,6 +317,19 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
return 0;
}
+static int ecryptfs_dir_release(struct inode *inode, struct file *file)
+{
+ fput(ecryptfs_file_to_lower(file));
+ kmem_cache_free(ecryptfs_file_info_cache,
+ ecryptfs_file_to_private(file));
+ return 0;
+}
+
+static loff_t ecryptfs_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+ return vfs_llseek(ecryptfs_file_to_lower(file), offset, whence);
+}
+
static int
ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
@@ -367,13 +410,10 @@ const struct file_operations ecryptfs_dir_fops = {
#ifdef CONFIG_COMPAT
.compat_ioctl = ecryptfs_compat_ioctl,
#endif
- .open = ecryptfs_open,
- .flush = ecryptfs_flush,
- .release = ecryptfs_release,
+ .open = ecryptfs_dir_open,
+ .release = ecryptfs_dir_release,
.fsync = ecryptfs_fsync,
- .fasync = ecryptfs_fasync,
- .splice_read = generic_file_splice_read,
- .llseek = default_llseek,
+ .llseek = ecryptfs_dir_llseek,
};
const struct file_operations ecryptfs_main_fops = {
@@ -382,7 +422,6 @@ const struct file_operations ecryptfs_main_fops = {
.read_iter = ecryptfs_read_update_atime,
.write = new_sync_write,
.write_iter = generic_file_write_iter,
- .iterate = ecryptfs_readdir,
.unlocked_ioctl = ecryptfs_unlocked_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ecryptfs_compat_ioctl,
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 7bcfff900f05..1c6bba7e6aa1 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -34,6 +34,7 @@
#include <linux/mutex.h>
#include <linux/anon_inodes.h>
#include <linux/device.h>
+#include <linux/freezer.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/mman.h>
@@ -1637,7 +1638,8 @@ fetch_events:
}
spin_unlock_irqrestore(&ep->lock, flags);
- if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+ if (!freezable_schedule_hrtimeout_range(to, slack,
+ HRTIMER_MODE_ABS))
timed_out = 1;
spin_lock_irqsave(&ep->lock, flags);
diff --git a/fs/exec.c b/fs/exec.c
index 515081d66c60..a552a192d0e3 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1120,7 +1120,7 @@ EXPORT_SYMBOL(flush_old_exec);
void would_dump(struct linux_binprm *bprm, struct file *file)
{
- if (inode_permission(file_inode(file), MAY_READ) < 0)
+ if (inode_permission2(file->f_path.mnt, file_inode(file), MAY_READ) < 0)
bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
}
EXPORT_SYMBOL(would_dump);
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 8bbaf5bcf982..c4509495a104 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -195,15 +195,11 @@ __ext3_set_acl(handle_t *handle, struct inode *inode, int type,
case ACL_TYPE_ACCESS:
name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
if (acl) {
- error = posix_acl_equiv_mode(acl, &inode->i_mode);
- if (error < 0)
+ error = posix_acl_update_mode(inode, &inode->i_mode, &acl);
+ if (error)
return error;
- else {
- inode->i_ctime = CURRENT_TIME_SEC;
- ext3_mark_inode_dirty(handle, inode);
- if (error == 0)
- acl = NULL;
- }
+ inode->i_ctime = CURRENT_TIME_SEC;
+ ext3_mark_inode_dirty(handle, inode);
}
break;
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index efea5d5c44ce..bf8bc8aba471 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -64,6 +64,29 @@ config EXT4_FS_SECURITY
If you are not using a security module that requires using
extended attributes for file security labels, say N.
+config EXT4_ENCRYPTION
+ tristate "Ext4 Encryption"
+ depends on EXT4_FS
+ select CRYPTO_AES
+ select CRYPTO_CBC
+ select CRYPTO_ECB
+ select CRYPTO_XTS
+ select CRYPTO_CTS
+ select CRYPTO_CTR
+ select CRYPTO_SHA256
+ select KEYS
+ select ENCRYPTED_KEYS
+ help
+ Enable encryption of ext4 files and directories. This
+ feature is similar to ecryptfs, but it is more memory
+ efficient since it avoids caching the encrypted and
+ decrypted pages in the page cache.
+
+config EXT4_FS_ENCRYPTION
+ bool
+ default y
+ depends on EXT4_ENCRYPTION
+
config EXT4_DEBUG
bool "EXT4 debugging support"
depends on EXT4_FS
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 0310fec2ee3d..75285ea9aa05 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -8,7 +8,9 @@ ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
mmp.o indirect.o extents_status.o xattr.o xattr_user.o \
- xattr_trusted.o inline.o
+ xattr_trusted.o inline.o readpage.o
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
+ext4-$(CONFIG_EXT4_FS_ENCRYPTION) += crypto_policy.o crypto.o \
+ crypto_key.o crypto_fname.o
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
new file mode 100644
index 000000000000..1d9429baee61
--- /dev/null
+++ b/fs/ext4/crypto.c
@@ -0,0 +1,536 @@
+/*
+ * linux/fs/ext4/crypto.c
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * This contains encryption functions for ext4
+ *
+ * Written by Michael Halcrow, 2014.
+ *
+ * Filename encryption additions
+ * Uday Savagaonkar, 2014
+ * Encryption policy handling additions
+ * Ildar Muslukhov, 2014
+ *
+ * This has not yet undergone a rigorous security audit.
+ *
+ * The usage of AES-XTS should conform to recommendations in NIST
+ * Special Publication 800-38E and IEEE P1619/D16.
+ */
+
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+#include <keys/user-type.h>
+#include <keys/encrypted-type.h>
+#include <linux/crypto.h>
+#include <linux/ecryptfs.h>
+#include <linux/gfp.h>
+#include <linux/kernel.h>
+#include <linux/key.h>
+#include <linux/list.h>
+#include <linux/mempool.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/random.h>
+#include <linux/scatterlist.h>
+#include <linux/spinlock_types.h>
+#include <linux/namei.h>
+
+#include "ext4_extents.h"
+#include "xattr.h"
+
+/* Encryption added and removed here! (L: */
+
+static unsigned int num_prealloc_crypto_pages = 32;
+static unsigned int num_prealloc_crypto_ctxs = 128;
+
+module_param(num_prealloc_crypto_pages, uint, 0444);
+MODULE_PARM_DESC(num_prealloc_crypto_pages,
+ "Number of crypto pages to preallocate");
+module_param(num_prealloc_crypto_ctxs, uint, 0444);
+MODULE_PARM_DESC(num_prealloc_crypto_ctxs,
+ "Number of crypto contexts to preallocate");
+
+static mempool_t *ext4_bounce_page_pool;
+
+static LIST_HEAD(ext4_free_crypto_ctxs);
+static DEFINE_SPINLOCK(ext4_crypto_ctx_lock);
+
+static struct kmem_cache *ext4_crypto_ctx_cachep;
+struct kmem_cache *ext4_crypt_info_cachep;
+
+/**
+ * ext4_release_crypto_ctx() - Releases an encryption context
+ * @ctx: The encryption context to release.
+ *
+ * If the encryption context was allocated from the pre-allocated pool, returns
+ * it to that pool. Else, frees it.
+ *
+ * If there's a bounce page in the context, this frees that.
+ */
+void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx)
+{
+ unsigned long flags;
+
+ if (ctx->flags & EXT4_WRITE_PATH_FL && ctx->w.bounce_page)
+ mempool_free(ctx->w.bounce_page, ext4_bounce_page_pool);
+ ctx->w.bounce_page = NULL;
+ ctx->w.control_page = NULL;
+ if (ctx->flags & EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL) {
+ kmem_cache_free(ext4_crypto_ctx_cachep, ctx);
+ } else {
+ spin_lock_irqsave(&ext4_crypto_ctx_lock, flags);
+ list_add(&ctx->free_list, &ext4_free_crypto_ctxs);
+ spin_unlock_irqrestore(&ext4_crypto_ctx_lock, flags);
+ }
+}
+
+/**
+ * ext4_get_crypto_ctx() - Gets an encryption context
+ * @inode: The inode for which we are doing the crypto
+ *
+ * Allocates and initializes an encryption context.
+ *
+ * Return: An allocated and initialized encryption context on success; error
+ * value or NULL otherwise.
+ */
+struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode,
+ gfp_t gfp_flags)
+{
+ struct ext4_crypto_ctx *ctx = NULL;
+ int res = 0;
+ unsigned long flags;
+ struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
+
+ if (ci == NULL)
+ return ERR_PTR(-ENOKEY);
+
+ /*
+ * We first try getting the ctx from a free list because in
+ * the common case the ctx will have an allocated and
+ * initialized crypto tfm, so it's probably a worthwhile
+ * optimization. For the bounce page, we first try getting it
+ * from the kernel allocator because that's just about as fast
+ * as getting it from a list and because a cache of free pages
+ * should generally be a "last resort" option for a filesystem
+ * to be able to do its job.
+ */
+ spin_lock_irqsave(&ext4_crypto_ctx_lock, flags);
+ ctx = list_first_entry_or_null(&ext4_free_crypto_ctxs,
+ struct ext4_crypto_ctx, free_list);
+ if (ctx)
+ list_del(&ctx->free_list);
+ spin_unlock_irqrestore(&ext4_crypto_ctx_lock, flags);
+ if (!ctx) {
+ ctx = kmem_cache_zalloc(ext4_crypto_ctx_cachep, gfp_flags);
+ if (!ctx) {
+ res = -ENOMEM;
+ goto out;
+ }
+ ctx->flags |= EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL;
+ } else {
+ ctx->flags &= ~EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL;
+ }
+ ctx->flags &= ~EXT4_WRITE_PATH_FL;
+
+out:
+ if (res) {
+ if (!IS_ERR_OR_NULL(ctx))
+ ext4_release_crypto_ctx(ctx);
+ ctx = ERR_PTR(res);
+ }
+ return ctx;
+}
+
+struct workqueue_struct *ext4_read_workqueue;
+static DEFINE_MUTEX(crypto_init);
+
+/**
+ * ext4_exit_crypto() - Shutdown the ext4 encryption system
+ */
+void ext4_exit_crypto(void)
+{
+ struct ext4_crypto_ctx *pos, *n;
+
+ list_for_each_entry_safe(pos, n, &ext4_free_crypto_ctxs, free_list)
+ kmem_cache_free(ext4_crypto_ctx_cachep, pos);
+ INIT_LIST_HEAD(&ext4_free_crypto_ctxs);
+ if (ext4_bounce_page_pool)
+ mempool_destroy(ext4_bounce_page_pool);
+ ext4_bounce_page_pool = NULL;
+ if (ext4_read_workqueue)
+ destroy_workqueue(ext4_read_workqueue);
+ ext4_read_workqueue = NULL;
+ if (ext4_crypto_ctx_cachep)
+ kmem_cache_destroy(ext4_crypto_ctx_cachep);
+ ext4_crypto_ctx_cachep = NULL;
+ if (ext4_crypt_info_cachep)
+ kmem_cache_destroy(ext4_crypt_info_cachep);
+ ext4_crypt_info_cachep = NULL;
+}
+
+/**
+ * ext4_init_crypto() - Set up for ext4 encryption.
+ *
+ * We only call this when we start accessing encrypted files, since it
+ * results in memory getting allocated that wouldn't otherwise be used.
+ *
+ * Return: Zero on success, non-zero otherwise.
+ */
+int ext4_init_crypto(void)
+{
+ int i, res = -ENOMEM;
+
+ mutex_lock(&crypto_init);
+ if (ext4_read_workqueue)
+ goto already_initialized;
+ ext4_read_workqueue = alloc_workqueue("ext4_crypto", WQ_HIGHPRI, 0);
+ if (!ext4_read_workqueue)
+ goto fail;
+
+ ext4_crypto_ctx_cachep = KMEM_CACHE(ext4_crypto_ctx,
+ SLAB_RECLAIM_ACCOUNT);
+ if (!ext4_crypto_ctx_cachep)
+ goto fail;
+
+ ext4_crypt_info_cachep = KMEM_CACHE(ext4_crypt_info,
+ SLAB_RECLAIM_ACCOUNT);
+ if (!ext4_crypt_info_cachep)
+ goto fail;
+
+ for (i = 0; i < num_prealloc_crypto_ctxs; i++) {
+ struct ext4_crypto_ctx *ctx;
+
+ ctx = kmem_cache_zalloc(ext4_crypto_ctx_cachep, GFP_NOFS);
+ if (!ctx) {
+ res = -ENOMEM;
+ goto fail;
+ }
+ list_add(&ctx->free_list, &ext4_free_crypto_ctxs);
+ }
+
+ ext4_bounce_page_pool =
+ mempool_create_page_pool(num_prealloc_crypto_pages, 0);
+ if (!ext4_bounce_page_pool) {
+ res = -ENOMEM;
+ goto fail;
+ }
+already_initialized:
+ mutex_unlock(&crypto_init);
+ return 0;
+fail:
+ ext4_exit_crypto();
+ mutex_unlock(&crypto_init);
+ return res;
+}
+
+void ext4_restore_control_page(struct page *data_page)
+{
+ struct ext4_crypto_ctx *ctx =
+ (struct ext4_crypto_ctx *)page_private(data_page);
+
+ set_page_private(data_page, (unsigned long)NULL);
+ ClearPagePrivate(data_page);
+ unlock_page(data_page);
+ ext4_release_crypto_ctx(ctx);
+}
+
+/**
+ * ext4_crypt_complete() - The completion callback for page encryption
+ * @req: The asynchronous encryption request context
+ * @res: The result of the encryption operation
+ */
+static void ext4_crypt_complete(struct crypto_async_request *req, int res)
+{
+ struct ext4_completion_result *ecr = req->data;
+
+ if (res == -EINPROGRESS)
+ return;
+ ecr->res = res;
+ complete(&ecr->completion);
+}
+
+typedef enum {
+ EXT4_DECRYPT = 0,
+ EXT4_ENCRYPT,
+} ext4_direction_t;
+
+static int ext4_page_crypto(struct inode *inode,
+ ext4_direction_t rw,
+ pgoff_t index,
+ struct page *src_page,
+ struct page *dest_page,
+ gfp_t gfp_flags)
+
+{
+ u8 xts_tweak[EXT4_XTS_TWEAK_SIZE];
+ struct ablkcipher_request *req = NULL;
+ DECLARE_EXT4_COMPLETION_RESULT(ecr);
+ struct scatterlist dst, src;
+ struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
+ struct crypto_ablkcipher *tfm = ci->ci_ctfm;
+ int res = 0;
+
+ req = ablkcipher_request_alloc(tfm, gfp_flags);
+ if (!req) {
+ printk_ratelimited(KERN_ERR
+ "%s: crypto_request_alloc() failed\n",
+ __func__);
+ return -ENOMEM;
+ }
+ ablkcipher_request_set_callback(
+ req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ ext4_crypt_complete, &ecr);
+
+ BUILD_BUG_ON(EXT4_XTS_TWEAK_SIZE < sizeof(index));
+ memcpy(xts_tweak, &index, sizeof(index));
+ memset(&xts_tweak[sizeof(index)], 0,
+ EXT4_XTS_TWEAK_SIZE - sizeof(index));
+
+ sg_init_table(&dst, 1);
+ sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0);
+ sg_init_table(&src, 1);
+ sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0);
+ ablkcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE,
+ xts_tweak);
+ if (rw == EXT4_DECRYPT)
+ res = crypto_ablkcipher_decrypt(req);
+ else
+ res = crypto_ablkcipher_encrypt(req);
+ if (res == -EINPROGRESS || res == -EBUSY) {
+ wait_for_completion(&ecr.completion);
+ res = ecr.res;
+ }
+ ablkcipher_request_free(req);
+ if (res) {
+ printk_ratelimited(
+ KERN_ERR
+ "%s: crypto_ablkcipher_encrypt() returned %d\n",
+ __func__, res);
+ return res;
+ }
+ return 0;
+}
+
+static struct page *alloc_bounce_page(struct ext4_crypto_ctx *ctx,
+ gfp_t gfp_flags)
+{
+ ctx->w.bounce_page = mempool_alloc(ext4_bounce_page_pool, gfp_flags);
+ if (ctx->w.bounce_page == NULL)
+ return ERR_PTR(-ENOMEM);
+ ctx->flags |= EXT4_WRITE_PATH_FL;
+ return ctx->w.bounce_page;
+}
+
+/**
+ * ext4_encrypt() - Encrypts a page
+ * @inode: The inode for which the encryption should take place
+ * @plaintext_page: The page to encrypt. Must be locked.
+ *
+ * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx
+ * encryption context.
+ *
+ * Called on the page write path. The caller must call
+ * ext4_restore_control_page() on the returned ciphertext page to
+ * release the bounce buffer and the encryption context.
+ *
+ * Return: An allocated page with the encrypted content on success. Else, an
+ * error value or NULL.
+ */
+struct page *ext4_encrypt(struct inode *inode,
+ struct page *plaintext_page,
+ gfp_t gfp_flags)
+{
+ struct ext4_crypto_ctx *ctx;
+ struct page *ciphertext_page = NULL;
+ int err;
+
+ BUG_ON(!PageLocked(plaintext_page));
+
+ ctx = ext4_get_crypto_ctx(inode, gfp_flags);
+ if (IS_ERR(ctx))
+ return (struct page *) ctx;
+
+ /* The encryption operation will require a bounce page. */
+ ciphertext_page = alloc_bounce_page(ctx, gfp_flags);
+ if (IS_ERR(ciphertext_page))
+ goto errout;
+ ctx->w.control_page = plaintext_page;
+ err = ext4_page_crypto(inode, EXT4_ENCRYPT, plaintext_page->index,
+ plaintext_page, ciphertext_page, gfp_flags);
+ if (err) {
+ ciphertext_page = ERR_PTR(err);
+ errout:
+ ext4_release_crypto_ctx(ctx);
+ return ciphertext_page;
+ }
+ SetPagePrivate(ciphertext_page);
+ set_page_private(ciphertext_page, (unsigned long)ctx);
+ lock_page(ciphertext_page);
+ return ciphertext_page;
+}
+
+/**
+ * ext4_decrypt() - Decrypts a page in-place
+ * @ctx: The encryption context.
+ * @page: The page to decrypt. Must be locked.
+ *
+ * Decrypts page in-place using the ctx encryption context.
+ *
+ * Called from the read completion callback.
+ *
+ * Return: Zero on success, non-zero otherwise.
+ */
+int ext4_decrypt(struct page *page)
+{
+ BUG_ON(!PageLocked(page));
+
+ return ext4_page_crypto(page->mapping->host, EXT4_DECRYPT,
+ page->index, page, page, GFP_NOFS);
+}
+
+int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex)
+{
+ struct ext4_crypto_ctx *ctx;
+ struct page *ciphertext_page = NULL;
+ struct bio *bio;
+ ext4_lblk_t lblk = ex->ee_block;
+ ext4_fsblk_t pblk = ext4_ext_pblock(ex);
+ unsigned int len = ext4_ext_get_actual_len(ex);
+ int ret, err = 0;
+
+#if 0
+ ext4_msg(inode->i_sb, KERN_CRIT,
+ "ext4_encrypted_zeroout ino %lu lblk %u len %u",
+ (unsigned long) inode->i_ino, lblk, len);
+#endif
+
+ BUG_ON(inode->i_sb->s_blocksize != PAGE_CACHE_SIZE);
+
+ ctx = ext4_get_crypto_ctx(inode, GFP_NOFS);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ciphertext_page = alloc_bounce_page(ctx, GFP_NOWAIT);
+ if (IS_ERR(ciphertext_page)) {
+ err = PTR_ERR(ciphertext_page);
+ goto errout;
+ }
+
+ while (len--) {
+ err = ext4_page_crypto(inode, EXT4_ENCRYPT, lblk,
+ ZERO_PAGE(0), ciphertext_page,
+ GFP_NOFS);
+ if (err)
+ goto errout;
+
+ bio = bio_alloc(GFP_NOWAIT, 1);
+ if (!bio) {
+ err = -ENOMEM;
+ goto errout;
+ }
+ bio->bi_bdev = inode->i_sb->s_bdev;
+ bio->bi_iter.bi_sector =
+ pblk << (inode->i_sb->s_blocksize_bits - 9);
+ ret = bio_add_page(bio, ciphertext_page,
+ inode->i_sb->s_blocksize, 0);
+ if (ret != inode->i_sb->s_blocksize) {
+ /* should never happen! */
+ ext4_msg(inode->i_sb, KERN_ERR,
+ "bio_add_page failed: %d", ret);
+ WARN_ON(1);
+ bio_put(bio);
+ err = -EIO;
+ goto errout;
+ }
+ err = submit_bio_wait(WRITE, bio);
+ err = submit_bio_wait(WRITE, bio);
+ if ((err == 0) && !test_bit(BIO_UPTODATE, &bio->bi_flags))
+ err = -EIO;
+ bio_put(bio);
+ if (err)
+ goto errout;
+ lblk++; pblk++;
+ }
+ err = 0;
+errout:
+ ext4_release_crypto_ctx(ctx);
+ return err;
+}
+
+bool ext4_valid_contents_enc_mode(uint32_t mode)
+{
+ return (mode == EXT4_ENCRYPTION_MODE_AES_256_XTS);
+}
+
+/**
+ * ext4_validate_encryption_key_size() - Validate the encryption key size
+ * @mode: The key mode.
+ * @size: The key size to validate.
+ *
+ * Return: The validated key size for @mode. Zero if invalid.
+ */
+uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size)
+{
+ if (size == ext4_encryption_key_size(mode))
+ return size;
+ return 0;
+}
+
+/*
+ * Validate dentries for encrypted directories to make sure we aren't
+ * potentially caching stale data after a key has been added or
+ * removed.
+ */
+static int ext4_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ struct dentry *dir;
+ struct ext4_crypt_info *ci;
+ int dir_has_key, cached_with_key;
+
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
+ dir = dget_parent(dentry);
+ if (!ext4_encrypted_inode(d_inode(dir))) {
+ dput(dir);
+ return 0;
+ }
+ ci = EXT4_I(d_inode(dir))->i_crypt_info;
+
+ /* this should eventually be an flag in d_flags */
+ cached_with_key = dentry->d_fsdata != NULL;
+ dir_has_key = (ci != NULL);
+ dput(dir);
+
+ /*
+ * If the dentry was cached without the key, and it is a
+ * negative dentry, it might be a valid name. We can't check
+ * if the key has since been made available due to locking
+ * reasons, so we fail the validation so ext4_lookup() can do
+ * this check.
+ *
+ * We also fail the validation if the dentry was created with
+ * the key present, but we no longer have the key, or vice versa.
+ */
+ if ((!cached_with_key && d_is_negative(dentry)) ||
+ (!cached_with_key && dir_has_key) ||
+ (cached_with_key && !dir_has_key)) {
+#if 0 /* Revalidation debug */
+ char buf[80];
+ char *cp = simple_dname(dentry, buf, sizeof(buf));
+
+ if (IS_ERR(cp))
+ cp = (char *) "???";
+ pr_err("revalidate: %s %p %d %d %d\n", cp, dentry->d_fsdata,
+ cached_with_key, d_is_negative(dentry),
+ dir_has_key);
+#endif
+ return 0;
+ }
+ return 1;
+}
+
+const struct dentry_operations ext4_encrypted_d_ops = {
+ .d_revalidate = ext4_d_revalidate,
+};
diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c
new file mode 100644
index 000000000000..2cfe3ffc276f
--- /dev/null
+++ b/fs/ext4/crypto_fname.c
@@ -0,0 +1,470 @@
+/*
+ * linux/fs/ext4/crypto_fname.c
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * This contains functions for filename crypto management in ext4
+ *
+ * Written by Uday Savagaonkar, 2014.
+ *
+ * This has not yet undergone a rigorous security audit.
+ *
+ */
+
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+#include <keys/encrypted-type.h>
+#include <keys/user-type.h>
+#include <linux/crypto.h>
+#include <linux/gfp.h>
+#include <linux/kernel.h>
+#include <linux/key.h>
+#include <linux/list.h>
+#include <linux/mempool.h>
+#include <linux/random.h>
+#include <linux/scatterlist.h>
+#include <linux/spinlock_types.h>
+
+#include "ext4.h"
+#include "ext4_crypto.h"
+#include "xattr.h"
+
+/**
+ * ext4_dir_crypt_complete() -
+ */
+static void ext4_dir_crypt_complete(struct crypto_async_request *req, int res)
+{
+ struct ext4_completion_result *ecr = req->data;
+
+ if (res == -EINPROGRESS)
+ return;
+ ecr->res = res;
+ complete(&ecr->completion);
+}
+
+bool ext4_valid_filenames_enc_mode(uint32_t mode)
+{
+ return (mode == EXT4_ENCRYPTION_MODE_AES_256_CTS);
+}
+
+static unsigned max_name_len(struct inode *inode)
+{
+ return S_ISLNK(inode->i_mode) ? inode->i_sb->s_blocksize :
+ EXT4_NAME_LEN;
+}
+
+/**
+ * ext4_fname_encrypt() -
+ *
+ * This function encrypts the input filename, and returns the length of the
+ * ciphertext. Errors are returned as negative numbers. We trust the caller to
+ * allocate sufficient memory to oname string.
+ */
+static int ext4_fname_encrypt(struct inode *inode,
+ const struct qstr *iname,
+ struct ext4_str *oname)
+{
+ u32 ciphertext_len;
+ struct ablkcipher_request *req = NULL;
+ DECLARE_EXT4_COMPLETION_RESULT(ecr);
+ struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
+ struct crypto_ablkcipher *tfm = ci->ci_ctfm;
+ int res = 0;
+ char iv[EXT4_CRYPTO_BLOCK_SIZE];
+ struct scatterlist src_sg, dst_sg;
+ int padding = 4 << (ci->ci_flags & EXT4_POLICY_FLAGS_PAD_MASK);
+ char *workbuf, buf[32], *alloc_buf = NULL;
+ unsigned lim = max_name_len(inode);
+
+ if (iname->len <= 0 || iname->len > lim)
+ return -EIO;
+
+ ciphertext_len = (iname->len < EXT4_CRYPTO_BLOCK_SIZE) ?
+ EXT4_CRYPTO_BLOCK_SIZE : iname->len;
+ ciphertext_len = ext4_fname_crypto_round_up(ciphertext_len, padding);
+ ciphertext_len = (ciphertext_len > lim)
+ ? lim : ciphertext_len;
+
+ if (ciphertext_len <= sizeof(buf)) {
+ workbuf = buf;
+ } else {
+ alloc_buf = kmalloc(ciphertext_len, GFP_NOFS);
+ if (!alloc_buf)
+ return -ENOMEM;
+ workbuf = alloc_buf;
+ }
+
+ /* Allocate request */
+ req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+ if (!req) {
+ printk_ratelimited(
+ KERN_ERR "%s: crypto_request_alloc() failed\n", __func__);
+ kfree(alloc_buf);
+ return -ENOMEM;
+ }
+ ablkcipher_request_set_callback(req,
+ CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ ext4_dir_crypt_complete, &ecr);
+
+ /* Copy the input */
+ memcpy(workbuf, iname->name, iname->len);
+ if (iname->len < ciphertext_len)
+ memset(workbuf + iname->len, 0, ciphertext_len - iname->len);
+
+ /* Initialize IV */
+ memset(iv, 0, EXT4_CRYPTO_BLOCK_SIZE);
+
+ /* Create encryption request */
+ sg_init_one(&src_sg, workbuf, ciphertext_len);
+ sg_init_one(&dst_sg, oname->name, ciphertext_len);
+ ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv);
+ res = crypto_ablkcipher_encrypt(req);
+ if (res == -EINPROGRESS || res == -EBUSY) {
+ wait_for_completion(&ecr.completion);
+ res = ecr.res;
+ }
+ kfree(alloc_buf);
+ ablkcipher_request_free(req);
+ if (res < 0) {
+ printk_ratelimited(
+ KERN_ERR "%s: Error (error code %d)\n", __func__, res);
+ }
+ oname->len = ciphertext_len;
+ return res;
+}
+
+/*
+ * ext4_fname_decrypt()
+ * This function decrypts the input filename, and returns
+ * the length of the plaintext.
+ * Errors are returned as negative numbers.
+ * We trust the caller to allocate sufficient memory to oname string.
+ */
+static int ext4_fname_decrypt(struct inode *inode,
+ const struct ext4_str *iname,
+ struct ext4_str *oname)
+{
+ struct ext4_str tmp_in[2], tmp_out[1];
+ struct ablkcipher_request *req = NULL;
+ DECLARE_EXT4_COMPLETION_RESULT(ecr);
+ struct scatterlist src_sg, dst_sg;
+ struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
+ struct crypto_ablkcipher *tfm = ci->ci_ctfm;
+ int res = 0;
+ char iv[EXT4_CRYPTO_BLOCK_SIZE];
+ unsigned lim = max_name_len(inode);
+
+ if (iname->len <= 0 || iname->len > lim)
+ return -EIO;
+
+ tmp_in[0].name = iname->name;
+ tmp_in[0].len = iname->len;
+ tmp_out[0].name = oname->name;
+
+ /* Allocate request */
+ req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+ if (!req) {
+ printk_ratelimited(
+ KERN_ERR "%s: crypto_request_alloc() failed\n", __func__);
+ return -ENOMEM;
+ }
+ ablkcipher_request_set_callback(req,
+ CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ ext4_dir_crypt_complete, &ecr);
+
+ /* Initialize IV */
+ memset(iv, 0, EXT4_CRYPTO_BLOCK_SIZE);
+
+ /* Create encryption request */
+ sg_init_one(&src_sg, iname->name, iname->len);
+ sg_init_one(&dst_sg, oname->name, oname->len);
+ ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv);
+ res = crypto_ablkcipher_decrypt(req);
+ if (res == -EINPROGRESS || res == -EBUSY) {
+ wait_for_completion(&ecr.completion);
+ res = ecr.res;
+ }
+ ablkcipher_request_free(req);
+ if (res < 0) {
+ printk_ratelimited(
+ KERN_ERR "%s: Error in ext4_fname_encrypt (error code %d)\n",
+ __func__, res);
+ return res;
+ }
+
+ oname->len = strnlen(oname->name, iname->len);
+ return oname->len;
+}
+
+static const char *lookup_table =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
+
+/**
+ * ext4_fname_encode_digest() -
+ *
+ * Encodes the input digest using characters from the set [a-zA-Z0-9_+].
+ * The encoded string is roughly 4/3 times the size of the input string.
+ */
+static int digest_encode(const char *src, int len, char *dst)
+{
+ int i = 0, bits = 0, ac = 0;
+ char *cp = dst;
+
+ while (i < len) {
+ ac += (((unsigned char) src[i]) << bits);
+ bits += 8;
+ do {
+ *cp++ = lookup_table[ac & 0x3f];
+ ac >>= 6;
+ bits -= 6;
+ } while (bits >= 6);
+ i++;
+ }
+ if (bits)
+ *cp++ = lookup_table[ac & 0x3f];
+ return cp - dst;
+}
+
+static int digest_decode(const char *src, int len, char *dst)
+{
+ int i = 0, bits = 0, ac = 0;
+ const char *p;
+ char *cp = dst;
+
+ while (i < len) {
+ p = strchr(lookup_table, src[i]);
+ if (p == NULL || src[i] == 0)
+ return -2;
+ ac += (p - lookup_table) << bits;
+ bits += 6;
+ if (bits >= 8) {
+ *cp++ = ac & 0xff;
+ ac >>= 8;
+ bits -= 8;
+ }
+ i++;
+ }
+ if (ac)
+ return -1;
+ return cp - dst;
+}
+
+/**
+ * ext4_fname_crypto_round_up() -
+ *
+ * Return: The next multiple of block size
+ */
+u32 ext4_fname_crypto_round_up(u32 size, u32 blksize)
+{
+ return ((size+blksize-1)/blksize)*blksize;
+}
+
+unsigned ext4_fname_encrypted_size(struct inode *inode, u32 ilen)
+{
+ struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
+ int padding = 32;
+
+ if (ci)
+ padding = 4 << (ci->ci_flags & EXT4_POLICY_FLAGS_PAD_MASK);
+ if (ilen < EXT4_CRYPTO_BLOCK_SIZE)
+ ilen = EXT4_CRYPTO_BLOCK_SIZE;
+ return ext4_fname_crypto_round_up(ilen, padding);
+}
+
+/*
+ * ext4_fname_crypto_alloc_buffer() -
+ *
+ * Allocates an output buffer that is sufficient for the crypto operation
+ * specified by the context and the direction.
+ */
+int ext4_fname_crypto_alloc_buffer(struct inode *inode,
+ u32 ilen, struct ext4_str *crypto_str)
+{
+ unsigned int olen = ext4_fname_encrypted_size(inode, ilen);
+
+ crypto_str->len = olen;
+ if (olen < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2)
+ olen = EXT4_FNAME_CRYPTO_DIGEST_SIZE*2;
+ /* Allocated buffer can hold one more character to null-terminate the
+ * string */
+ crypto_str->name = kmalloc(olen+1, GFP_NOFS);
+ if (!(crypto_str->name))
+ return -ENOMEM;
+ return 0;
+}
+
+/**
+ * ext4_fname_crypto_free_buffer() -
+ *
+ * Frees the buffer allocated for crypto operation.
+ */
+void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str)
+{
+ if (!crypto_str)
+ return;
+ kfree(crypto_str->name);
+ crypto_str->name = NULL;
+}
+
+/**
+ * ext4_fname_disk_to_usr() - converts a filename from disk space to user space
+ */
+int _ext4_fname_disk_to_usr(struct inode *inode,
+ struct dx_hash_info *hinfo,
+ const struct ext4_str *iname,
+ struct ext4_str *oname)
+{
+ char buf[24];
+ int ret;
+
+ if (iname->len < 3) {
+ /*Check for . and .. */
+ if (iname->name[0] == '.' && iname->name[iname->len-1] == '.') {
+ oname->name[0] = '.';
+ oname->name[iname->len-1] = '.';
+ oname->len = iname->len;
+ return oname->len;
+ }
+ }
+ if (iname->len < EXT4_CRYPTO_BLOCK_SIZE) {
+ EXT4_ERROR_INODE(inode, "encrypted inode too small");
+ return -EUCLEAN;
+ }
+ if (EXT4_I(inode)->i_crypt_info)
+ return ext4_fname_decrypt(inode, iname, oname);
+
+ if (iname->len <= EXT4_FNAME_CRYPTO_DIGEST_SIZE) {
+ ret = digest_encode(iname->name, iname->len, oname->name);
+ oname->len = ret;
+ return ret;
+ }
+ if (hinfo) {
+ memcpy(buf, &hinfo->hash, 4);
+ memcpy(buf+4, &hinfo->minor_hash, 4);
+ } else
+ memset(buf, 0, 8);
+ memcpy(buf + 8, iname->name + ((iname->len - 17) & ~15), 16);
+ oname->name[0] = '_';
+ ret = digest_encode(buf, 24, oname->name+1);
+ oname->len = ret + 1;
+ return ret + 1;
+}
+
+int ext4_fname_disk_to_usr(struct inode *inode,
+ struct dx_hash_info *hinfo,
+ const struct ext4_dir_entry_2 *de,
+ struct ext4_str *oname)
+{
+ struct ext4_str iname = {.name = (unsigned char *) de->name,
+ .len = de->name_len };
+
+ return _ext4_fname_disk_to_usr(inode, hinfo, &iname, oname);
+}
+
+
+/**
+ * ext4_fname_usr_to_disk() - converts a filename from user space to disk space
+ */
+int ext4_fname_usr_to_disk(struct inode *inode,
+ const struct qstr *iname,
+ struct ext4_str *oname)
+{
+ int res;
+ struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
+
+ if (iname->len < 3) {
+ /*Check for . and .. */
+ if (iname->name[0] == '.' &&
+ iname->name[iname->len-1] == '.') {
+ oname->name[0] = '.';
+ oname->name[iname->len-1] = '.';
+ oname->len = iname->len;
+ return oname->len;
+ }
+ }
+ if (ci) {
+ res = ext4_fname_encrypt(inode, iname, oname);
+ return res;
+ }
+ /* Without a proper key, a user is not allowed to modify the filenames
+ * in a directory. Consequently, a user space name cannot be mapped to
+ * a disk-space name */
+ return -EACCES;
+}
+
+int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname,
+ int lookup, struct ext4_filename *fname)
+{
+ struct ext4_crypt_info *ci;
+ int ret = 0, bigname = 0;
+
+ memset(fname, 0, sizeof(struct ext4_filename));
+ fname->usr_fname = iname;
+
+ if (!ext4_encrypted_inode(dir) ||
+ ((iname->name[0] == '.') &&
+ ((iname->len == 1) ||
+ ((iname->name[1] == '.') && (iname->len == 2))))) {
+ fname->disk_name.name = (unsigned char *) iname->name;
+ fname->disk_name.len = iname->len;
+ return 0;
+ }
+ ret = ext4_get_encryption_info(dir);
+ if (ret)
+ return ret;
+ ci = EXT4_I(dir)->i_crypt_info;
+ if (ci) {
+ ret = ext4_fname_crypto_alloc_buffer(dir, iname->len,
+ &fname->crypto_buf);
+ if (ret < 0)
+ return ret;
+ ret = ext4_fname_encrypt(dir, iname, &fname->crypto_buf);
+ if (ret < 0)
+ goto errout;
+ fname->disk_name.name = fname->crypto_buf.name;
+ fname->disk_name.len = fname->crypto_buf.len;
+ return 0;
+ }
+ if (!lookup)
+ return -EACCES;
+
+ /* We don't have the key and we are doing a lookup; decode the
+ * user-supplied name
+ */
+ if (iname->name[0] == '_')
+ bigname = 1;
+ if ((bigname && (iname->len != 33)) ||
+ (!bigname && (iname->len > 43)))
+ return -ENOENT;
+
+ fname->crypto_buf.name = kmalloc(32, GFP_KERNEL);
+ if (fname->crypto_buf.name == NULL)
+ return -ENOMEM;
+ ret = digest_decode(iname->name + bigname, iname->len - bigname,
+ fname->crypto_buf.name);
+ if (ret < 0) {
+ ret = -ENOENT;
+ goto errout;
+ }
+ fname->crypto_buf.len = ret;
+ if (bigname) {
+ memcpy(&fname->hinfo.hash, fname->crypto_buf.name, 4);
+ memcpy(&fname->hinfo.minor_hash, fname->crypto_buf.name + 4, 4);
+ } else {
+ fname->disk_name.name = fname->crypto_buf.name;
+ fname->disk_name.len = fname->crypto_buf.len;
+ }
+ return 0;
+errout:
+ kfree(fname->crypto_buf.name);
+ fname->crypto_buf.name = NULL;
+ return ret;
+}
+
+void ext4_fname_free_filename(struct ext4_filename *fname)
+{
+ kfree(fname->crypto_buf.name);
+ fname->crypto_buf.name = NULL;
+ fname->usr_fname = NULL;
+ fname->disk_name.name = NULL;
+}
diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c
new file mode 100644
index 000000000000..a4ea942ef6f6
--- /dev/null
+++ b/fs/ext4/crypto_key.c
@@ -0,0 +1,262 @@
+/*
+ * linux/fs/ext4/crypto_key.c
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * This contains encryption key functions for ext4
+ *
+ * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
+ */
+
+#include <keys/encrypted-type.h>
+#include <keys/user-type.h>
+#include <linux/random.h>
+#include <linux/scatterlist.h>
+#include <uapi/linux/keyctl.h>
+
+#include "ext4.h"
+#include "xattr.h"
+
+static void derive_crypt_complete(struct crypto_async_request *req, int rc)
+{
+ struct ext4_completion_result *ecr = req->data;
+
+ if (rc == -EINPROGRESS)
+ return;
+
+ ecr->res = rc;
+ complete(&ecr->completion);
+}
+
+/**
+ * ext4_derive_key_aes() - Derive a key using AES-128-ECB
+ * @deriving_key: Encryption key used for derivation.
+ * @source_key: Source key to which to apply derivation.
+ * @derived_key: Derived key.
+ *
+ * Return: Zero on success; non-zero otherwise.
+ */
+static int ext4_derive_key_aes(char deriving_key[EXT4_AES_128_ECB_KEY_SIZE],
+ char source_key[EXT4_AES_256_XTS_KEY_SIZE],
+ char derived_key[EXT4_AES_256_XTS_KEY_SIZE])
+{
+ int res = 0;
+ struct ablkcipher_request *req = NULL;
+ DECLARE_EXT4_COMPLETION_RESULT(ecr);
+ struct scatterlist src_sg, dst_sg;
+ struct crypto_ablkcipher *tfm = crypto_alloc_ablkcipher("ecb(aes)", 0,
+ 0);
+
+ if (IS_ERR(tfm)) {
+ res = PTR_ERR(tfm);
+ tfm = NULL;
+ goto out;
+ }
+ crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+ if (!req) {
+ res = -ENOMEM;
+ goto out;
+ }
+ ablkcipher_request_set_callback(req,
+ CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ derive_crypt_complete, &ecr);
+ res = crypto_ablkcipher_setkey(tfm, deriving_key,
+ EXT4_AES_128_ECB_KEY_SIZE);
+ if (res < 0)
+ goto out;
+ sg_init_one(&src_sg, source_key, EXT4_AES_256_XTS_KEY_SIZE);
+ sg_init_one(&dst_sg, derived_key, EXT4_AES_256_XTS_KEY_SIZE);
+ ablkcipher_request_set_crypt(req, &src_sg, &dst_sg,
+ EXT4_AES_256_XTS_KEY_SIZE, NULL);
+ res = crypto_ablkcipher_encrypt(req);
+ if (res == -EINPROGRESS || res == -EBUSY) {
+ wait_for_completion(&ecr.completion);
+ res = ecr.res;
+ }
+
+out:
+ if (req)
+ ablkcipher_request_free(req);
+ if (tfm)
+ crypto_free_ablkcipher(tfm);
+ return res;
+}
+
+void ext4_free_crypt_info(struct ext4_crypt_info *ci)
+{
+ if (!ci)
+ return;
+
+ crypto_free_ablkcipher(ci->ci_ctfm);
+ kmem_cache_free(ext4_crypt_info_cachep, ci);
+}
+
+void ext4_free_encryption_info(struct inode *inode,
+ struct ext4_crypt_info *ci)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_crypt_info *prev;
+
+ if (ci == NULL)
+ ci = ACCESS_ONCE(ei->i_crypt_info);
+ if (ci == NULL)
+ return;
+ prev = cmpxchg(&ei->i_crypt_info, ci, NULL);
+ if (prev != ci)
+ return;
+
+ ext4_free_crypt_info(ci);
+}
+
+int ext4_get_encryption_info(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_crypt_info *crypt_info;
+ char full_key_descriptor[EXT4_KEY_DESC_PREFIX_SIZE +
+ (EXT4_KEY_DESCRIPTOR_SIZE * 2) + 1];
+ struct key *keyring_key = NULL;
+ struct ext4_encryption_key *master_key;
+ struct ext4_encryption_context ctx;
+ struct user_key_payload *ukp;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct crypto_ablkcipher *ctfm;
+ const char *cipher_str;
+ char raw_key[EXT4_MAX_KEY_SIZE];
+ char mode;
+ int res;
+
+ if (ei->i_crypt_info)
+ return 0;
+
+ if (!ext4_read_workqueue) {
+ res = ext4_init_crypto();
+ if (res)
+ return res;
+ }
+
+ res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
+ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
+ &ctx, sizeof(ctx));
+ if (res < 0) {
+ if (!DUMMY_ENCRYPTION_ENABLED(sbi))
+ return res;
+ ctx.contents_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS;
+ ctx.filenames_encryption_mode =
+ EXT4_ENCRYPTION_MODE_AES_256_CTS;
+ ctx.flags = 0;
+ } else if (res != sizeof(ctx))
+ return -EINVAL;
+ res = 0;
+
+ crypt_info = kmem_cache_alloc(ext4_crypt_info_cachep, GFP_KERNEL);
+ if (!crypt_info)
+ return -ENOMEM;
+
+ crypt_info->ci_flags = ctx.flags;
+ crypt_info->ci_data_mode = ctx.contents_encryption_mode;
+ crypt_info->ci_filename_mode = ctx.filenames_encryption_mode;
+ crypt_info->ci_ctfm = NULL;
+ memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor,
+ sizeof(crypt_info->ci_master_key));
+ if (S_ISREG(inode->i_mode))
+ mode = crypt_info->ci_data_mode;
+ else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+ mode = crypt_info->ci_filename_mode;
+ else
+ BUG();
+ switch (mode) {
+ case EXT4_ENCRYPTION_MODE_AES_256_XTS:
+ cipher_str = "xts(aes)";
+ break;
+ case EXT4_ENCRYPTION_MODE_AES_256_CTS:
+ cipher_str = "cts(cbc(aes))";
+ break;
+ default:
+ printk_once(KERN_WARNING
+ "ext4: unsupported key mode %d (ino %u)\n",
+ mode, (unsigned) inode->i_ino);
+ res = -ENOKEY;
+ goto out;
+ }
+ if (DUMMY_ENCRYPTION_ENABLED(sbi)) {
+ memset(raw_key, 0x42, EXT4_AES_256_XTS_KEY_SIZE);
+ goto got_key;
+ }
+ memcpy(full_key_descriptor, EXT4_KEY_DESC_PREFIX,
+ EXT4_KEY_DESC_PREFIX_SIZE);
+ sprintf(full_key_descriptor + EXT4_KEY_DESC_PREFIX_SIZE,
+ "%*phN", EXT4_KEY_DESCRIPTOR_SIZE,
+ ctx.master_key_descriptor);
+ full_key_descriptor[EXT4_KEY_DESC_PREFIX_SIZE +
+ (2 * EXT4_KEY_DESCRIPTOR_SIZE)] = '\0';
+ keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL);
+ if (IS_ERR(keyring_key)) {
+ res = PTR_ERR(keyring_key);
+ keyring_key = NULL;
+ goto out;
+ }
+ if (keyring_key->type != &key_type_logon) {
+ printk_once(KERN_WARNING
+ "ext4: key type must be logon\n");
+ res = -ENOKEY;
+ goto out;
+ }
+ down_read(&keyring_key->sem);
+ ukp = ((struct user_key_payload *)keyring_key->payload.data);
+ if (ukp->datalen != sizeof(struct ext4_encryption_key)) {
+ res = -EINVAL;
+ up_read(&keyring_key->sem);
+ goto out;
+ }
+ master_key = (struct ext4_encryption_key *)ukp->data;
+ BUILD_BUG_ON(EXT4_AES_128_ECB_KEY_SIZE !=
+ EXT4_KEY_DERIVATION_NONCE_SIZE);
+ if (master_key->size != EXT4_AES_256_XTS_KEY_SIZE) {
+ printk_once(KERN_WARNING
+ "ext4: key size incorrect: %d\n",
+ master_key->size);
+ res = -ENOKEY;
+ up_read(&keyring_key->sem);
+ goto out;
+ }
+ res = ext4_derive_key_aes(ctx.nonce, master_key->raw,
+ raw_key);
+ up_read(&keyring_key->sem);
+ if (res)
+ goto out;
+got_key:
+ ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0);
+ if (!ctfm || IS_ERR(ctfm)) {
+ res = ctfm ? PTR_ERR(ctfm) : -ENOMEM;
+ printk(KERN_DEBUG
+ "%s: error %d (inode %u) allocating crypto tfm\n",
+ __func__, res, (unsigned) inode->i_ino);
+ goto out;
+ }
+ crypt_info->ci_ctfm = ctfm;
+ crypto_ablkcipher_clear_flags(ctfm, ~0);
+ crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctfm),
+ CRYPTO_TFM_REQ_WEAK_KEY);
+ res = crypto_ablkcipher_setkey(ctfm, raw_key,
+ ext4_encryption_key_size(mode));
+ if (res)
+ goto out;
+
+ if (cmpxchg(&ei->i_crypt_info, NULL, crypt_info) == NULL)
+ crypt_info = NULL;
+out:
+ if (res == -ENOKEY)
+ res = 0;
+ key_put(keyring_key);
+ ext4_free_crypt_info(crypt_info);
+ memzero_explicit(raw_key, sizeof(raw_key));
+ return res;
+}
+
+int ext4_has_encryption_key(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ return (ei->i_crypt_info != NULL);
+}
diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c
new file mode 100644
index 000000000000..e4f4fc4e56ab
--- /dev/null
+++ b/fs/ext4/crypto_policy.c
@@ -0,0 +1,268 @@
+/*
+ * linux/fs/ext4/crypto_policy.c
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * This contains encryption policy functions for ext4
+ *
+ * Written by Michael Halcrow, 2015.
+ */
+
+#include <linux/random.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include "ext4_jbd2.h"
+#include "ext4.h"
+#include "xattr.h"
+
+static int ext4_inode_has_encryption_context(struct inode *inode)
+{
+ int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
+ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, NULL, 0);
+ return (res > 0);
+}
+
+/*
+ * check whether the policy is consistent with the encryption context
+ * for the inode
+ */
+static int ext4_is_encryption_context_consistent_with_policy(
+ struct inode *inode, const struct ext4_encryption_policy *policy)
+{
+ struct ext4_encryption_context ctx;
+ int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
+ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
+ sizeof(ctx));
+ if (res != sizeof(ctx))
+ return 0;
+ return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor,
+ EXT4_KEY_DESCRIPTOR_SIZE) == 0 &&
+ (ctx.flags ==
+ policy->flags) &&
+ (ctx.contents_encryption_mode ==
+ policy->contents_encryption_mode) &&
+ (ctx.filenames_encryption_mode ==
+ policy->filenames_encryption_mode));
+}
+
+static int ext4_create_encryption_context_from_policy(
+ struct inode *inode, const struct ext4_encryption_policy *policy)
+{
+ struct ext4_encryption_context ctx;
+ handle_t *handle;
+ int res, res2;
+
+ res = ext4_convert_inline_data(inode);
+ if (res)
+ return res;
+
+ ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1;
+ memcpy(ctx.master_key_descriptor, policy->master_key_descriptor,
+ EXT4_KEY_DESCRIPTOR_SIZE);
+ if (!ext4_valid_contents_enc_mode(policy->contents_encryption_mode)) {
+ printk(KERN_WARNING
+ "%s: Invalid contents encryption mode %d\n", __func__,
+ policy->contents_encryption_mode);
+ return -EINVAL;
+ }
+ if (!ext4_valid_filenames_enc_mode(policy->filenames_encryption_mode)) {
+ printk(KERN_WARNING
+ "%s: Invalid filenames encryption mode %d\n", __func__,
+ policy->filenames_encryption_mode);
+ return -EINVAL;
+ }
+ if (policy->flags & ~EXT4_POLICY_FLAGS_VALID)
+ return -EINVAL;
+ ctx.contents_encryption_mode = policy->contents_encryption_mode;
+ ctx.filenames_encryption_mode = policy->filenames_encryption_mode;
+ ctx.flags = policy->flags;
+ BUILD_BUG_ON(sizeof(ctx.nonce) != EXT4_KEY_DERIVATION_NONCE_SIZE);
+ get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE);
+
+ handle = ext4_journal_start(inode, EXT4_HT_MISC,
+ ext4_jbd2_credits_xattr(inode));
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION,
+ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
+ sizeof(ctx), 0);
+ if (!res) {
+ ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
+ res = ext4_mark_inode_dirty(handle, inode);
+ if (res)
+ EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");
+ }
+ res2 = ext4_journal_stop(handle);
+ if (!res)
+ res = res2;
+ return res;
+}
+
+int ext4_process_policy(const struct ext4_encryption_policy *policy,
+ struct inode *inode)
+{
+ if (!inode_owner_or_capable(inode))
+ return -EACCES;
+
+ if (policy->version != 0)
+ return -EINVAL;
+
+ if (!ext4_inode_has_encryption_context(inode)) {
+ if (!S_ISDIR(inode->i_mode))
+ return -EINVAL;
+ if (!ext4_empty_dir(inode))
+ return -ENOTEMPTY;
+ return ext4_create_encryption_context_from_policy(inode,
+ policy);
+ }
+
+ if (ext4_is_encryption_context_consistent_with_policy(inode, policy))
+ return 0;
+
+ printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n",
+ __func__);
+ return -EINVAL;
+}
+
+int ext4_get_policy(struct inode *inode, struct ext4_encryption_policy *policy)
+{
+ struct ext4_encryption_context ctx;
+
+ int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
+ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
+ &ctx, sizeof(ctx));
+ if (res != sizeof(ctx))
+ return -ENOENT;
+ if (ctx.format != EXT4_ENCRYPTION_CONTEXT_FORMAT_V1)
+ return -EINVAL;
+ policy->version = 0;
+ policy->contents_encryption_mode = ctx.contents_encryption_mode;
+ policy->filenames_encryption_mode = ctx.filenames_encryption_mode;
+ policy->flags = ctx.flags;
+ memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor,
+ EXT4_KEY_DESCRIPTOR_SIZE);
+ return 0;
+}
+
+int ext4_is_child_context_consistent_with_parent(struct inode *parent,
+ struct inode *child)
+{
+ const struct ext4_crypt_info *parent_ci, *child_ci;
+ struct ext4_encryption_context parent_ctx, child_ctx;
+ int res;
+
+ /* No restrictions on file types which are never encrypted */
+ if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) &&
+ !S_ISLNK(child->i_mode))
+ return 1;
+
+ /* No restrictions if the parent directory is unencrypted */
+ if (!ext4_encrypted_inode(parent))
+ return 1;
+
+ /* Encrypted directories must not contain unencrypted files */
+ if (!ext4_encrypted_inode(child))
+ return 0;
+
+ /*
+ * Both parent and child are encrypted, so verify they use the same
+ * encryption policy. Compare the fscrypt_info structs if the keys are
+ * available, otherwise retrieve and compare the fscrypt_contexts.
+ *
+ * Note that the fscrypt_context retrieval will be required frequently
+ * when accessing an encrypted directory tree without the key.
+ * Performance-wise this is not a big deal because we already don't
+ * really optimize for file access without the key (to the extent that
+ * such access is even possible), given that any attempted access
+ * already causes a fscrypt_context retrieval and keyring search.
+ *
+ * In any case, if an unexpected error occurs, fall back to "forbidden".
+ */
+
+ res = ext4_get_encryption_info(parent);
+ if (res)
+ return 0;
+ res = ext4_get_encryption_info(child);
+ if (res)
+ return 0;
+ parent_ci = EXT4_I(parent)->i_crypt_info;
+ child_ci = EXT4_I(child)->i_crypt_info;
+ if (parent_ci && child_ci) {
+ return memcmp(parent_ci->ci_master_key, child_ci->ci_master_key,
+ EXT4_KEY_DESCRIPTOR_SIZE) == 0 &&
+ (parent_ci->ci_data_mode == child_ci->ci_data_mode) &&
+ (parent_ci->ci_filename_mode ==
+ child_ci->ci_filename_mode) &&
+ (parent_ci->ci_flags == child_ci->ci_flags);
+ }
+
+ res = ext4_xattr_get(parent, EXT4_XATTR_INDEX_ENCRYPTION,
+ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
+ &parent_ctx, sizeof(parent_ctx));
+ if (res != sizeof(parent_ctx))
+ return 0;
+
+ res = ext4_xattr_get(child, EXT4_XATTR_INDEX_ENCRYPTION,
+ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
+ &child_ctx, sizeof(child_ctx));
+ if (res != sizeof(child_ctx))
+ return 0;
+
+ return memcmp(parent_ctx.master_key_descriptor,
+ child_ctx.master_key_descriptor,
+ EXT4_KEY_DESCRIPTOR_SIZE) == 0 &&
+ (parent_ctx.contents_encryption_mode ==
+ child_ctx.contents_encryption_mode) &&
+ (parent_ctx.filenames_encryption_mode ==
+ child_ctx.filenames_encryption_mode) &&
+ (parent_ctx.flags == child_ctx.flags);
+}
+
+/**
+ * ext4_inherit_context() - Sets a child context from its parent
+ * @parent: Parent inode from which the context is inherited.
+ * @child: Child inode that inherits the context from @parent.
+ *
+ * Return: Zero on success, non-zero otherwise
+ */
+int ext4_inherit_context(struct inode *parent, struct inode *child)
+{
+ struct ext4_encryption_context ctx;
+ struct ext4_crypt_info *ci;
+ int res;
+
+ res = ext4_get_encryption_info(parent);
+ if (res < 0)
+ return res;
+ ci = EXT4_I(parent)->i_crypt_info;
+ if (ci == NULL)
+ return -ENOKEY;
+
+ ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1;
+ if (DUMMY_ENCRYPTION_ENABLED(EXT4_SB(parent->i_sb))) {
+ ctx.contents_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS;
+ ctx.filenames_encryption_mode =
+ EXT4_ENCRYPTION_MODE_AES_256_CTS;
+ ctx.flags = 0;
+ memset(ctx.master_key_descriptor, 0x42,
+ EXT4_KEY_DESCRIPTOR_SIZE);
+ res = 0;
+ } else {
+ ctx.contents_encryption_mode = ci->ci_data_mode;
+ ctx.filenames_encryption_mode = ci->ci_filename_mode;
+ ctx.flags = ci->ci_flags;
+ memcpy(ctx.master_key_descriptor, ci->ci_master_key,
+ EXT4_KEY_DESCRIPTOR_SIZE);
+ }
+ get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE);
+ res = ext4_xattr_set(child, EXT4_XATTR_INDEX_ENCRYPTION,
+ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
+ sizeof(ctx), 0);
+ if (!res) {
+ ext4_set_inode_flag(child, EXT4_INODE_ENCRYPT);
+ ext4_clear_inode_state(child, EXT4_STATE_MAY_INLINE_DATA);
+ res = ext4_get_encryption_info(child);
+ }
+ return res;
+}
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index c24143ea9c08..a29895b2def3 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -110,7 +110,15 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
int err;
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
+ struct buffer_head *bh = NULL;
int dir_has_error = 0;
+ struct ext4_str fname_crypto_str = {.name = NULL, .len = 0};
+
+ if (ext4_encrypted_inode(inode)) {
+ err = ext4_get_encryption_info(inode);
+ if (err && err != -ENOKEY)
+ return err;
+ }
if (is_dx_dir(inode)) {
err = ext4_dx_readdir(file, ctx);
@@ -127,17 +135,23 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
if (ext4_has_inline_data(inode)) {
int has_inline_data = 1;
- int ret = ext4_read_inline_dir(file, ctx,
+ err = ext4_read_inline_dir(file, ctx,
&has_inline_data);
if (has_inline_data)
- return ret;
+ return err;
+ }
+
+ if (ext4_encrypted_inode(inode)) {
+ err = ext4_fname_crypto_alloc_buffer(inode, EXT4_NAME_LEN,
+ &fname_crypto_str);
+ if (err < 0)
+ return err;
}
offset = ctx->pos & (sb->s_blocksize - 1);
while (ctx->pos < inode->i_size) {
struct ext4_map_blocks map;
- struct buffer_head *bh = NULL;
map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb);
map.m_len = 1;
@@ -180,6 +194,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
(unsigned long long)ctx->pos);
ctx->pos += sb->s_blocksize - offset;
brelse(bh);
+ bh = NULL;
continue;
}
set_buffer_verified(bh);
@@ -226,25 +241,45 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
offset += ext4_rec_len_from_disk(de->rec_len,
sb->s_blocksize);
if (le32_to_cpu(de->inode)) {
- if (!dir_emit(ctx, de->name,
- de->name_len,
- le32_to_cpu(de->inode),
- get_dtype(sb, de->file_type))) {
- brelse(bh);
- return 0;
+ if (!ext4_encrypted_inode(inode)) {
+ if (!dir_emit(ctx, de->name,
+ de->name_len,
+ le32_to_cpu(de->inode),
+ get_dtype(sb, de->file_type)))
+ goto done;
+ } else {
+ int save_len = fname_crypto_str.len;
+
+ /* Directory is encrypted */
+ err = ext4_fname_disk_to_usr(inode,
+ NULL, de, &fname_crypto_str);
+ fname_crypto_str.len = save_len;
+ if (err < 0)
+ goto errout;
+ if (!dir_emit(ctx,
+ fname_crypto_str.name, err,
+ le32_to_cpu(de->inode),
+ get_dtype(sb, de->file_type)))
+ goto done;
}
}
ctx->pos += ext4_rec_len_from_disk(de->rec_len,
sb->s_blocksize);
}
- offset = 0;
+ if ((ctx->pos < inode->i_size) && !dir_relax(inode))
+ goto done;
brelse(bh);
- if (ctx->pos < inode->i_size) {
- if (!dir_relax(inode))
- return 0;
- }
+ bh = NULL;
+ offset = 0;
}
- return 0;
+done:
+ err = 0;
+errout:
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ ext4_fname_crypto_free_buffer(&fname_crypto_str);
+#endif
+ brelse(bh);
+ return err;
}
static inline int is_32bit_api(void)
@@ -384,10 +419,15 @@ void ext4_htree_free_dir_info(struct dir_private_info *p)
/*
* Given a directory entry, enter it into the fname rb tree.
+ *
+ * When filename encryption is enabled, the dirent will hold the
+ * encrypted filename, while the htree will hold decrypted filename.
+ * The decrypted filename is passed in via ent_name. parameter.
*/
int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
__u32 minor_hash,
- struct ext4_dir_entry_2 *dirent)
+ struct ext4_dir_entry_2 *dirent,
+ struct ext4_str *ent_name)
{
struct rb_node **p, *parent = NULL;
struct fname *fname, *new_fn;
@@ -398,17 +438,17 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
p = &info->root.rb_node;
/* Create and allocate the fname structure */
- len = sizeof(struct fname) + dirent->name_len + 1;
+ len = sizeof(struct fname) + ent_name->len + 1;
new_fn = kzalloc(len, GFP_KERNEL);
if (!new_fn)
return -ENOMEM;
new_fn->hash = hash;
new_fn->minor_hash = minor_hash;
new_fn->inode = le32_to_cpu(dirent->inode);
- new_fn->name_len = dirent->name_len;
+ new_fn->name_len = ent_name->len;
new_fn->file_type = dirent->file_type;
- memcpy(new_fn->name, dirent->name, dirent->name_len);
- new_fn->name[dirent->name_len] = 0;
+ memcpy(new_fn->name, ent_name->name, ent_name->len);
+ new_fn->name[ent_name->len] = 0;
while (*p) {
parent = *p;
@@ -561,6 +601,13 @@ finished:
return 0;
}
+static int ext4_dir_open(struct inode * inode, struct file * filp)
+{
+ if (ext4_encrypted_inode(inode))
+ return ext4_get_encryption_info(inode) ? -EACCES : 0;
+ return 0;
+}
+
static int ext4_release_dir(struct inode *inode, struct file *filp)
{
if (filp->private_data)
@@ -603,5 +650,6 @@ const struct file_operations ext4_dir_operations = {
.compat_ioctl = ext4_compat_ioctl,
#endif
.fsync = ext4_sync_file,
+ .open = ext4_dir_open,
.release = ext4_release_dir,
};
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bd997b3d6a6f..79c2c5f9a6fc 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -374,7 +374,8 @@ struct flex_groups {
#define EXT4_DIRTY_FL 0x00000100
#define EXT4_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */
#define EXT4_NOCOMPR_FL 0x00000400 /* Don't compress */
-#define EXT4_ECOMPR_FL 0x00000800 /* Compression error */
+ /* nb: was previously EXT2_ECOMPR_FL */
+#define EXT4_ENCRYPT_FL 0x00000800 /* encrypted file */
/* End compression flags --- maybe not all used */
#define EXT4_INDEX_FL 0x00001000 /* hash-indexed directory */
#define EXT4_IMAGIC_FL 0x00002000 /* AFS directory */
@@ -427,11 +428,11 @@ enum {
EXT4_INODE_APPEND = 5, /* writes to file may only append */
EXT4_INODE_NODUMP = 6, /* do not dump file */
EXT4_INODE_NOATIME = 7, /* do not update atime */
-/* Reserved for compression usage... */
+/* Reserved for compression usage, co-opted for encryption usage */
EXT4_INODE_DIRTY = 8,
EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */
EXT4_INODE_NOCOMPR = 10, /* Don't compress */
- EXT4_INODE_ECOMPR = 11, /* Compression error */
+ EXT4_INODE_ENCRYPT = 11, /* Encrypted */
/* End compression flags --- maybe not all used */
EXT4_INODE_INDEX = 12, /* hash-indexed directory */
EXT4_INODE_IMAGIC = 13, /* AFS directory */
@@ -476,7 +477,7 @@ static inline void ext4_check_flag_values(void)
CHECK_FLAG_VALUE(DIRTY);
CHECK_FLAG_VALUE(COMPRBLK);
CHECK_FLAG_VALUE(NOCOMPR);
- CHECK_FLAG_VALUE(ECOMPR);
+ CHECK_FLAG_VALUE(ENCRYPT);
CHECK_FLAG_VALUE(INDEX);
CHECK_FLAG_VALUE(IMAGIC);
CHECK_FLAG_VALUE(JOURNAL_DATA);
@@ -593,6 +594,15 @@ enum {
#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010
#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020
+/* Encryption algorithms */
+#define EXT4_ENCRYPTION_MODE_INVALID 0
+#define EXT4_ENCRYPTION_MODE_AES_256_XTS 1
+#define EXT4_ENCRYPTION_MODE_AES_256_GCM 2
+#define EXT4_ENCRYPTION_MODE_AES_256_CBC 3
+#define EXT4_ENCRYPTION_MODE_AES_256_CTS 4
+
+#include "ext4_crypto.h"
+
/*
* ioctl commands
*/
@@ -614,6 +624,9 @@ enum {
#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64)
#define EXT4_IOC_SWAP_BOOT _IO('f', 17)
#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18)
+#define EXT4_IOC_SET_ENCRYPTION_POLICY _IOR('f', 19, struct ext4_encryption_policy)
+#define EXT4_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16])
+#define EXT4_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct ext4_encryption_policy)
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
@@ -967,6 +980,11 @@ struct ext4_inode_info {
/* Precomputed uuid+inum+igen checksum for seeding inode checksums */
__u32 i_csum_seed;
+
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ /* Encryption params */
+ struct ext4_crypt_info *i_crypt_info;
+#endif
};
/*
@@ -1159,7 +1177,8 @@ struct ext4_super_block {
__le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
__u8 s_log_groups_per_flex; /* FLEX_BG group size */
__u8 s_checksum_type; /* metadata checksum algorithm used */
- __le16 s_reserved_pad;
+ __u8 s_encryption_level; /* versioning level for encryption */
+ __u8 s_reserved_pad; /* Padding to next 32bits */
__le64 s_kbytes_written; /* nr of lifetime kilobytes written */
__le32 s_snapshot_inum; /* Inode number of active snapshot */
__le32 s_snapshot_id; /* sequential ID of active snapshot */
@@ -1185,7 +1204,10 @@ struct ext4_super_block {
__le32 s_grp_quota_inum; /* inode for tracking group quota */
__le32 s_overhead_clusters; /* overhead blocks/clusters in fs */
__le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */
- __le32 s_reserved[106]; /* Padding to the end of the block */
+ __u8 s_encrypt_algos[4]; /* Encryption algorithms in use */
+ __u8 s_encrypt_pw_salt[16]; /* Salt used for string2key algorithm */
+ __le32 s_lpf_ino; /* Location of the lost+found inode */
+ __le32 s_reserved[100]; /* Padding to the end of the block */
__le32 s_checksum; /* crc32c(superblock) */
};
@@ -1196,8 +1218,16 @@ struct ext4_super_block {
/*
* run-time mount flags
*/
-#define EXT4_MF_MNTDIR_SAMPLED 0x0001
-#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */
+#define EXT4_MF_MNTDIR_SAMPLED 0x0001
+#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */
+#define EXT4_MF_TEST_DUMMY_ENCRYPTION 0x0004
+
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#define DUMMY_ENCRYPTION_ENABLED(sbi) (unlikely((sbi)->s_mount_flags & \
+ EXT4_MF_TEST_DUMMY_ENCRYPTION))
+#else
+#define DUMMY_ENCRYPTION_ENABLED(sbi) (0)
+#endif
/* Number of quota types we support */
#define EXT4_MAXQUOTAS 2
@@ -1481,6 +1511,18 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
#define EXT4_SB(sb) (sb)
#endif
+/*
+ * Returns true if the inode is inode is encrypted
+ */
+static inline int ext4_encrypted_inode(struct inode *inode)
+{
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ return ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT);
+#else
+ return 0;
+#endif
+}
+
#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime
/*
@@ -1565,6 +1607,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
#define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */
#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */
#define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */
+#define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000
#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
@@ -1588,8 +1631,9 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
EXT4_FEATURE_INCOMPAT_EXTENTS| \
EXT4_FEATURE_INCOMPAT_64BIT| \
EXT4_FEATURE_INCOMPAT_FLEX_BG| \
- EXT4_FEATURE_INCOMPAT_MMP | \
- EXT4_FEATURE_INCOMPAT_INLINE_DATA)
+ EXT4_FEATURE_INCOMPAT_MMP | \
+ EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
+ EXT4_FEATURE_INCOMPAT_ENCRYPT)
#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
@@ -1809,6 +1853,17 @@ struct dx_hash_info
*/
#define HASH_NB_ALWAYS 1
+struct ext4_filename {
+ const struct qstr *usr_fname;
+ struct ext4_str disk_name;
+ struct dx_hash_info hinfo;
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ struct ext4_str crypto_buf;
+#endif
+};
+
+#define fname_name(p) ((p)->disk_name.name)
+#define fname_len(p) ((p)->disk_name.len)
/*
* Describe an inode's exact location on disk and in memory
@@ -2014,6 +2069,120 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
struct ext4_group_desc *gdp);
ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
+/* crypto_policy.c */
+int ext4_is_child_context_consistent_with_parent(struct inode *parent,
+ struct inode *child);
+int ext4_inherit_context(struct inode *parent, struct inode *child);
+void ext4_to_hex(char *dst, char *src, size_t src_size);
+int ext4_process_policy(const struct ext4_encryption_policy *policy,
+ struct inode *inode);
+int ext4_get_policy(struct inode *inode,
+ struct ext4_encryption_policy *policy);
+
+/* crypto.c */
+extern struct kmem_cache *ext4_crypt_info_cachep;
+bool ext4_valid_contents_enc_mode(uint32_t mode);
+uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size);
+extern struct workqueue_struct *ext4_read_workqueue;
+struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode,
+ gfp_t gfp_flags);
+void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx);
+void ext4_restore_control_page(struct page *data_page);
+struct page *ext4_encrypt(struct inode *inode,
+ struct page *plaintext_page,
+ gfp_t gfp_flags);
+int ext4_decrypt(struct page *page);
+int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex);
+extern const struct dentry_operations ext4_encrypted_d_ops;
+
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+int ext4_init_crypto(void);
+void ext4_exit_crypto(void);
+static inline int ext4_sb_has_crypto(struct super_block *sb)
+{
+ return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT);
+}
+#else
+static inline int ext4_init_crypto(void) { return 0; }
+static inline void ext4_exit_crypto(void) { }
+static inline int ext4_sb_has_crypto(struct super_block *sb)
+{
+ return 0;
+}
+#endif
+
+/* crypto_fname.c */
+bool ext4_valid_filenames_enc_mode(uint32_t mode);
+u32 ext4_fname_crypto_round_up(u32 size, u32 blksize);
+unsigned ext4_fname_encrypted_size(struct inode *inode, u32 ilen);
+int ext4_fname_crypto_alloc_buffer(struct inode *inode,
+ u32 ilen, struct ext4_str *crypto_str);
+int _ext4_fname_disk_to_usr(struct inode *inode,
+ struct dx_hash_info *hinfo,
+ const struct ext4_str *iname,
+ struct ext4_str *oname);
+int ext4_fname_disk_to_usr(struct inode *inode,
+ struct dx_hash_info *hinfo,
+ const struct ext4_dir_entry_2 *de,
+ struct ext4_str *oname);
+int ext4_fname_usr_to_disk(struct inode *inode,
+ const struct qstr *iname,
+ struct ext4_str *oname);
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str);
+int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname,
+ int lookup, struct ext4_filename *fname);
+void ext4_fname_free_filename(struct ext4_filename *fname);
+#else
+static inline
+int ext4_setup_fname_crypto(struct inode *inode)
+{
+ return 0;
+}
+static inline void ext4_fname_crypto_free_buffer(struct ext4_str *p) { }
+static inline int ext4_fname_setup_filename(struct inode *dir,
+ const struct qstr *iname,
+ int lookup, struct ext4_filename *fname)
+{
+ fname->usr_fname = iname;
+ fname->disk_name.name = (unsigned char *) iname->name;
+ fname->disk_name.len = iname->len;
+ return 0;
+}
+static inline void ext4_fname_free_filename(struct ext4_filename *fname) { }
+#endif
+
+
+/* crypto_key.c */
+void ext4_free_crypt_info(struct ext4_crypt_info *ci);
+void ext4_free_encryption_info(struct inode *inode, struct ext4_crypt_info *ci);
+
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+int ext4_has_encryption_key(struct inode *inode);
+
+int ext4_get_encryption_info(struct inode *inode);
+
+static inline struct ext4_crypt_info *ext4_encryption_info(struct inode *inode)
+{
+ return EXT4_I(inode)->i_crypt_info;
+}
+
+#else
+static inline int ext4_has_encryption_key(struct inode *inode)
+{
+ return 0;
+}
+static inline int ext4_get_encryption_info(struct inode *inode)
+{
+ return 0;
+}
+static inline struct ext4_crypt_info *ext4_encryption_info(struct inode *inode)
+{
+ return NULL;
+}
+#endif
+
+
/* dir.c */
extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
struct file *,
@@ -2024,18 +2193,20 @@ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \
(de), (bh), (buf), (size), (offset)))
extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
- __u32 minor_hash,
- struct ext4_dir_entry_2 *dirent);
+ __u32 minor_hash,
+ struct ext4_dir_entry_2 *dirent,
+ struct ext4_str *ent_name);
extern void ext4_htree_free_dir_info(struct dir_private_info *p);
extern int ext4_find_dest_de(struct inode *dir, struct inode *inode,
struct buffer_head *bh,
void *buf, int buf_size,
- const char *name, int namelen,
+ struct ext4_filename *fname,
struct ext4_dir_entry_2 **dest_de);
-void ext4_insert_dentry(struct inode *inode,
- struct ext4_dir_entry_2 *de,
- int buf_size,
- const char *name, int namelen);
+int ext4_insert_dentry(struct inode *dir,
+ struct inode *inode,
+ struct ext4_dir_entry_2 *de,
+ int buf_size,
+ struct ext4_filename *fname);
static inline void ext4_update_dx_flag(struct inode *inode)
{
if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
@@ -2109,9 +2280,11 @@ extern int ext4_mb_add_groupinfo(struct super_block *sb,
ext4_group_t i, struct ext4_group_desc *desc);
extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
ext4_fsblk_t block, unsigned long count);
-extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
+extern int ext4_trim_fs(struct super_block *, struct fstrim_range *,
+ unsigned long blkdev_flags);
/* inode.c */
+int ext4_inode_is_fast_symlink(struct inode *inode);
struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
int ext4_get_block_write(struct inode *inode, sector_t iblock,
@@ -2188,13 +2361,14 @@ extern int ext4_orphan_add(handle_t *, struct inode *);
extern int ext4_orphan_del(handle_t *, struct inode *);
extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
__u32 start_minor_hash, __u32 *next_hash);
-extern int search_dir(struct buffer_head *bh,
- char *search_buf,
- int buf_size,
- struct inode *dir,
- const struct qstr *d_name,
- unsigned int offset,
- struct ext4_dir_entry_2 **res_dir);
+extern int ext4_search_dir(struct buffer_head *bh,
+ char *search_buf,
+ int buf_size,
+ struct inode *dir,
+ struct ext4_filename *fname,
+ const struct qstr *d_name,
+ unsigned int offset,
+ struct ext4_dir_entry_2 **res_dir);
extern int ext4_generic_delete_entry(handle_t *handle,
struct inode *dir,
struct ext4_dir_entry_2 *de_del,
@@ -2202,6 +2376,7 @@ extern int ext4_generic_delete_entry(handle_t *handle,
void *entry_buf,
int buf_size,
int csum_size);
+extern int ext4_empty_dir(struct inode *inode);
/* resize.c */
extern int ext4_group_add(struct super_block *sb,
@@ -2639,7 +2814,9 @@ extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
unsigned len, unsigned copied,
struct page *page);
-extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
+extern int ext4_try_add_inline_entry(handle_t *handle,
+ struct ext4_filename *fname,
+ struct dentry *dentry,
struct inode *inode);
extern int ext4_try_create_inline_dir(handle_t *handle,
struct inode *parent,
@@ -2653,6 +2830,7 @@ extern int htree_inlinedir_to_tree(struct file *dir_file,
__u32 start_hash, __u32 start_minor_hash,
int *has_inline_data);
extern struct buffer_head *ext4_find_inline_entry(struct inode *dir,
+ struct ext4_filename *fname,
const struct qstr *d_name,
struct ext4_dir_entry_2 **res_dir,
int *has_inline_data);
@@ -2712,6 +2890,10 @@ static inline void ext4_set_de_type(struct super_block *sb,
de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
}
+/* readpages.c */
+extern int ext4_mpage_readpages(struct address_space *mapping,
+ struct list_head *pages, struct page *page,
+ unsigned nr_pages);
/* symlink.c */
extern const struct inode_operations ext4_symlink_inode_operations;
diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h
new file mode 100644
index 000000000000..1b17b05b9f4d
--- /dev/null
+++ b/fs/ext4/ext4_crypto.h
@@ -0,0 +1,158 @@
+/*
+ * linux/fs/ext4/ext4_crypto.h
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * This contains encryption header content for ext4
+ *
+ * Written by Michael Halcrow, 2015.
+ */
+
+#ifndef _EXT4_CRYPTO_H
+#define _EXT4_CRYPTO_H
+
+#include <linux/fs.h>
+
+#define EXT4_KEY_DESCRIPTOR_SIZE 8
+
+/* Policy provided via an ioctl on the topmost directory */
+struct ext4_encryption_policy {
+ char version;
+ char contents_encryption_mode;
+ char filenames_encryption_mode;
+ char flags;
+ char master_key_descriptor[EXT4_KEY_DESCRIPTOR_SIZE];
+} __attribute__((__packed__));
+
+#define EXT4_ENCRYPTION_CONTEXT_FORMAT_V1 1
+#define EXT4_KEY_DERIVATION_NONCE_SIZE 16
+
+#define EXT4_POLICY_FLAGS_PAD_4 0x00
+#define EXT4_POLICY_FLAGS_PAD_8 0x01
+#define EXT4_POLICY_FLAGS_PAD_16 0x02
+#define EXT4_POLICY_FLAGS_PAD_32 0x03
+#define EXT4_POLICY_FLAGS_PAD_MASK 0x03
+#define EXT4_POLICY_FLAGS_VALID 0x03
+
+/**
+ * Encryption context for inode
+ *
+ * Protector format:
+ * 1 byte: Protector format (1 = this version)
+ * 1 byte: File contents encryption mode
+ * 1 byte: File names encryption mode
+ * 1 byte: Reserved
+ * 8 bytes: Master Key descriptor
+ * 16 bytes: Encryption Key derivation nonce
+ */
+struct ext4_encryption_context {
+ char format;
+ char contents_encryption_mode;
+ char filenames_encryption_mode;
+ char flags;
+ char master_key_descriptor[EXT4_KEY_DESCRIPTOR_SIZE];
+ char nonce[EXT4_KEY_DERIVATION_NONCE_SIZE];
+} __attribute__((__packed__));
+
+/* Encryption parameters */
+#define EXT4_XTS_TWEAK_SIZE 16
+#define EXT4_AES_128_ECB_KEY_SIZE 16
+#define EXT4_AES_256_GCM_KEY_SIZE 32
+#define EXT4_AES_256_CBC_KEY_SIZE 32
+#define EXT4_AES_256_CTS_KEY_SIZE 32
+#define EXT4_AES_256_XTS_KEY_SIZE 64
+#define EXT4_MAX_KEY_SIZE 64
+
+#define EXT4_KEY_DESC_PREFIX "ext4:"
+#define EXT4_KEY_DESC_PREFIX_SIZE 5
+
+/* This is passed in from userspace into the kernel keyring */
+struct ext4_encryption_key {
+ __u32 mode;
+ char raw[EXT4_MAX_KEY_SIZE];
+ __u32 size;
+} __attribute__((__packed__));
+
+struct ext4_crypt_info {
+ char ci_data_mode;
+ char ci_filename_mode;
+ char ci_flags;
+ struct crypto_ablkcipher *ci_ctfm;
+ char ci_master_key[EXT4_KEY_DESCRIPTOR_SIZE];
+};
+
+#define EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001
+#define EXT4_WRITE_PATH_FL 0x00000002
+
+struct ext4_crypto_ctx {
+ union {
+ struct {
+ struct page *bounce_page; /* Ciphertext page */
+ struct page *control_page; /* Original page */
+ } w;
+ struct {
+ struct bio *bio;
+ struct work_struct work;
+ } r;
+ struct list_head free_list; /* Free list */
+ };
+ char flags; /* Flags */
+ char mode; /* Encryption mode for tfm */
+};
+
+struct ext4_completion_result {
+ struct completion completion;
+ int res;
+};
+
+#define DECLARE_EXT4_COMPLETION_RESULT(ecr) \
+ struct ext4_completion_result ecr = { \
+ COMPLETION_INITIALIZER((ecr).completion), 0 }
+
+static inline int ext4_encryption_key_size(int mode)
+{
+ switch (mode) {
+ case EXT4_ENCRYPTION_MODE_AES_256_XTS:
+ return EXT4_AES_256_XTS_KEY_SIZE;
+ case EXT4_ENCRYPTION_MODE_AES_256_GCM:
+ return EXT4_AES_256_GCM_KEY_SIZE;
+ case EXT4_ENCRYPTION_MODE_AES_256_CBC:
+ return EXT4_AES_256_CBC_KEY_SIZE;
+ case EXT4_ENCRYPTION_MODE_AES_256_CTS:
+ return EXT4_AES_256_CTS_KEY_SIZE;
+ default:
+ BUG();
+ }
+ return 0;
+}
+
+#define EXT4_FNAME_NUM_SCATTER_ENTRIES 4
+#define EXT4_CRYPTO_BLOCK_SIZE 16
+#define EXT4_FNAME_CRYPTO_DIGEST_SIZE 32
+
+struct ext4_str {
+ unsigned char *name;
+ u32 len;
+};
+
+/**
+ * For encrypted symlinks, the ciphertext length is stored at the beginning
+ * of the string in little-endian format.
+ */
+struct ext4_encrypted_symlink_data {
+ __le16 len;
+ char encrypted_path[1];
+} __attribute__((__packed__));
+
+/**
+ * This function is used to calculate the disk space required to
+ * store a filename of length l in encrypted symlink format.
+ */
+static inline u32 encrypted_symlink_data_len(u32 l)
+{
+ if (l < EXT4_CRYPTO_BLOCK_SIZE)
+ l = EXT4_CRYPTO_BLOCK_SIZE;
+ return (l + sizeof(struct ext4_encrypted_symlink_data) - 1);
+}
+
+#endif /* _EXT4_CRYPTO_H */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e5b44dd2724e..9e2363b2c372 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3115,6 +3115,9 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
ee_len = ext4_ext_get_actual_len(ex);
ee_pblock = ext4_ext_pblock(ex);
+ if (ext4_encrypted_inode(inode))
+ return ext4_encrypted_zeroout(inode, ex);
+
ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
if (ret > 0)
ret = 0;
@@ -3543,10 +3546,14 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
*/
split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
- if (EXT4_EXT_MAY_ZEROOUT & split_flag)
+ if ((EXT4_EXT_MAY_ZEROOUT & split_flag) &&
+ !ext4_encrypted_inode(inode))
max_zeroout = sbi->s_extent_max_zeroout_kb >>
(inode->i_sb->s_blocksize_bits - 10);
+ if (ext4_encrypted_inode(inode))
+ max_zeroout = 0;
+
/* If extent is less than s_max_zeroout_kb, zeroout directly */
if (max_zeroout && (ee_len <= max_zeroout)) {
err = ext4_ext_zeroout(inode, ex);
@@ -4909,6 +4916,20 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
ext4_lblk_t lblk;
unsigned int blkbits = inode->i_blkbits;
+ /*
+ * Encrypted inodes can't handle collapse range or insert
+ * range since we would need to re-encrypt blocks with a
+ * different IV or XTS tweak (which are based on the logical
+ * block number).
+ *
+ * XXX It's not clear why zero range isn't working, but we'll
+ * leave it disabled for encrypted inodes for now. This is a
+ * bug we should fix....
+ */
+ if (ext4_encrypted_inode(inode) &&
+ (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)))
+ return -EOPNOTSUPP;
+
/* Return error if mode is not supported */
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index ed2a57a3bb50..598bebb6d0a9 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -200,6 +200,15 @@ static const struct vm_operations_struct ext4_file_vm_ops = {
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
+ struct inode *inode = file->f_mapping->host;
+
+ if (ext4_encrypted_inode(inode)) {
+ int err = ext4_get_encryption_info(inode);
+ if (err)
+ return 0;
+ if (ext4_encryption_info(inode) == NULL)
+ return -ENOKEY;
+ }
file_accessed(file);
vma->vm_ops = &ext4_file_vm_ops;
return 0;
@@ -212,6 +221,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
struct vfsmount *mnt = filp->f_path.mnt;
struct path path;
char buf[64], *cp;
+ int ret;
if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) &&
!(sb->s_flags & MS_RDONLY))) {
@@ -245,12 +255,19 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
ext4_journal_stop(handle);
}
}
+ if (ext4_encrypted_inode(inode)) {
+ ret = ext4_get_encryption_info(inode);
+ if (ret)
+ return -EACCES;
+ if (ext4_encryption_info(inode) == NULL)
+ return -ENOKEY;
+ }
/*
* Set up the jbd2_inode if we are opening the inode for
* writing and the journal is present
*/
if (filp->f_mode & FMODE_WRITE) {
- int ret = ext4_inode_attach_jinode(inode);
+ ret = ext4_inode_attach_jinode(inode);
if (ret < 0)
return ret;
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 9f230e589ecc..b7d49d2ab74f 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -727,11 +727,25 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
ext4_group_t i;
ext4_group_t flex_group;
struct ext4_group_info *grp;
+ int encrypt = 0;
/* Cannot create files in a deleted directory */
if (!dir || !dir->i_nlink)
return ERR_PTR(-EPERM);
+ if ((ext4_encrypted_inode(dir) ||
+ DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) &&
+ (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
+ err = ext4_get_encryption_info(dir);
+ if (err)
+ return ERR_PTR(err);
+ if (ext4_encryption_info(dir) == NULL)
+ return ERR_PTR(-EPERM);
+ if (!handle)
+ nblocks += EXT4_DATA_TRANS_BLOCKS(dir->i_sb);
+ encrypt = 1;
+ }
+
sb = dir->i_sb;
ngroups = ext4_get_groups_count(sb);
trace_ext4_request_inode(dir, mode);
@@ -1029,11 +1043,9 @@ got:
ext4_set_inode_state(inode, EXT4_STATE_NEW);
ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
-
ei->i_inline_off = 0;
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA))
ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
-
ret = inode;
err = dquot_alloc_inode(inode);
if (err)
@@ -1060,6 +1072,12 @@ got:
ei->i_datasync_tid = handle->h_transaction->t_tid;
}
+ if (encrypt) {
+ err = ext4_inherit_context(dir, inode);
+ if (err)
+ goto fail_free_drop;
+ }
+
err = ext4_mark_inode_dirty(handle, inode);
if (err) {
ext4_std_error(sb, err);
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 8fc2357c6867..63007822a464 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -11,11 +11,14 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
+
+#include <linux/fiemap.h>
+
#include "ext4_jbd2.h"
#include "ext4.h"
#include "xattr.h"
#include "truncate.h"
-#include <linux/fiemap.h>
+#include <trace/events/android_fs.h>
#define EXT4_XATTR_SYSTEM_DATA "data"
#define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS))
@@ -501,6 +504,17 @@ int ext4_readpage_inline(struct inode *inode, struct page *page)
return -EAGAIN;
}
+ if (trace_android_fs_dataread_start_enabled()) {
+ char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+ path = android_fstrace_get_pathname(pathbuf,
+ MAX_TRACE_PATHBUF_LEN,
+ inode);
+ trace_android_fs_dataread_start(inode, page_offset(page),
+ PAGE_SIZE, current->pid,
+ path, current->comm);
+ }
+
/*
* Current inline data can only exist in the 1st page,
* So for all the other pages, just set them uptodate.
@@ -512,6 +526,8 @@ int ext4_readpage_inline(struct inode *inode, struct page *page)
SetPageUptodate(page);
}
+ trace_android_fs_dataread_end(inode, page_offset(page), PAGE_SIZE);
+
up_read(&EXT4_I(inode)->xattr_sem);
unlock_page(page);
@@ -991,20 +1007,18 @@ void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh,
* and -EEXIST if directory entry already exists.
*/
static int ext4_add_dirent_to_inline(handle_t *handle,
+ struct ext4_filename *fname,
struct dentry *dentry,
struct inode *inode,
struct ext4_iloc *iloc,
void *inline_start, int inline_size)
{
struct inode *dir = dentry->d_parent->d_inode;
- const char *name = dentry->d_name.name;
- int namelen = dentry->d_name.len;
int err;
struct ext4_dir_entry_2 *de;
- err = ext4_find_dest_de(dir, inode, iloc->bh,
- inline_start, inline_size,
- name, namelen, &de);
+ err = ext4_find_dest_de(dir, inode, iloc->bh, inline_start,
+ inline_size, fname, &de);
if (err)
return err;
@@ -1012,7 +1026,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
err = ext4_journal_get_write_access(handle, iloc->bh);
if (err)
return err;
- ext4_insert_dentry(inode, de, inline_size, name, namelen);
+ ext4_insert_dentry(dir, inode, de, inline_size, fname);
ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size);
@@ -1242,8 +1256,8 @@ out:
* If succeeds, return 0. If not, extended the inline dir and copied data to
* the new created block.
*/
-int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
- struct inode *inode)
+int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname,
+ struct dentry *dentry, struct inode *inode)
{
int ret, inline_size;
void *inline_start;
@@ -1262,7 +1276,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
EXT4_INLINE_DOTDOT_SIZE;
inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
- ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc,
+ ret = ext4_add_dirent_to_inline(handle, fname, dentry, inode, &iloc,
inline_start, inline_size);
if (ret != -ENOSPC)
goto out;
@@ -1283,8 +1297,9 @@ int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
if (inline_size) {
inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
- ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc,
- inline_start, inline_size);
+ ret = ext4_add_dirent_to_inline(handle, fname, dentry,
+ inode, &iloc, inline_start,
+ inline_size);
if (ret != -ENOSPC)
goto out;
@@ -1324,6 +1339,7 @@ int htree_inlinedir_to_tree(struct file *dir_file,
struct ext4_iloc iloc;
void *dir_buf = NULL;
struct ext4_dir_entry_2 fake;
+ struct ext4_str tmp_str;
ret = ext4_get_inode_loc(inode, &iloc);
if (ret)
@@ -1395,8 +1411,10 @@ int htree_inlinedir_to_tree(struct file *dir_file,
continue;
if (de->inode == 0)
continue;
- err = ext4_htree_store_dirent(dir_file,
- hinfo->hash, hinfo->minor_hash, de);
+ tmp_str.name = de->name;
+ tmp_str.len = de->name_len;
+ err = ext4_htree_store_dirent(dir_file, hinfo->hash,
+ hinfo->minor_hash, de, &tmp_str);
if (err) {
count = err;
goto out;
@@ -1602,6 +1620,7 @@ out:
}
struct buffer_head *ext4_find_inline_entry(struct inode *dir,
+ struct ext4_filename *fname,
const struct qstr *d_name,
struct ext4_dir_entry_2 **res_dir,
int *has_inline_data)
@@ -1623,8 +1642,8 @@ struct buffer_head *ext4_find_inline_entry(struct inode *dir,
inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
EXT4_INLINE_DOTDOT_SIZE;
inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
- ret = search_dir(iloc.bh, inline_start, inline_size,
- dir, d_name, 0, res_dir);
+ ret = ext4_search_dir(iloc.bh, inline_start, inline_size,
+ dir, fname, d_name, 0, res_dir);
if (ret == 1)
goto out_find;
if (ret < 0)
@@ -1636,8 +1655,8 @@ struct buffer_head *ext4_find_inline_entry(struct inode *dir,
inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE;
- ret = search_dir(iloc.bh, inline_start, inline_size,
- dir, d_name, 0, res_dir);
+ ret = ext4_search_dir(iloc.bh, inline_start, inline_size,
+ dir, fname, d_name, 0, res_dir);
if (ret == 1)
goto out_find;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 39a263ecca18..72d07c77bc7f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -46,6 +46,7 @@
#include "truncate.h"
#include <trace/events/ext4.h>
+#include <trace/events/android_fs.h>
#define MPAGE_DA_EXTENT_TAIL 0x01
@@ -141,7 +142,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
/*
* Test whether an inode is a fast symlink.
*/
-static int ext4_inode_is_fast_symlink(struct inode *inode)
+int ext4_inode_is_fast_symlink(struct inode *inode)
{
int ea_blocks = EXT4_I(inode)->i_file_acl ?
EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0;
@@ -661,6 +662,20 @@ has_zeroout:
ret = check_block_validity(inode, map);
if (ret != 0)
return ret;
+
+ /*
+ * Inodes with freshly allocated blocks where contents will be
+ * visible after transaction commit must be on transaction's
+ * ordered data list.
+ */
+ if (map->m_flags & EXT4_MAP_NEW &&
+ !(map->m_flags & EXT4_MAP_UNWRITTEN) &&
+ !IS_NOQUOTA(inode) &&
+ ext4_should_order_data(inode)) {
+ ret = ext4_jbd2_file_inode(handle, inode);
+ if (ret)
+ return ret;
+ }
}
return retval;
}
@@ -879,6 +894,95 @@ int do_journal_get_write_access(handle_t *handle,
static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
+
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
+ get_block_t *get_block)
+{
+ unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned to = from + len;
+ struct inode *inode = page->mapping->host;
+ unsigned block_start, block_end;
+ sector_t block;
+ int err = 0;
+ unsigned blocksize = inode->i_sb->s_blocksize;
+ unsigned bbits;
+ struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
+ bool decrypt = false;
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(from > PAGE_CACHE_SIZE);
+ BUG_ON(to > PAGE_CACHE_SIZE);
+ BUG_ON(from > to);
+
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, blocksize, 0);
+ head = page_buffers(page);
+ bbits = ilog2(blocksize);
+ block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
+
+ for (bh = head, block_start = 0; bh != head || !block_start;
+ block++, block_start = block_end, bh = bh->b_this_page) {
+ block_end = block_start + blocksize;
+ if (block_end <= from || block_start >= to) {
+ if (PageUptodate(page)) {
+ if (!buffer_uptodate(bh))
+ set_buffer_uptodate(bh);
+ }
+ continue;
+ }
+ if (buffer_new(bh))
+ clear_buffer_new(bh);
+ if (!buffer_mapped(bh)) {
+ WARN_ON(bh->b_size != blocksize);
+ err = get_block(inode, block, bh, 1);
+ if (err)
+ break;
+ if (buffer_new(bh)) {
+ unmap_underlying_metadata(bh->b_bdev,
+ bh->b_blocknr);
+ if (PageUptodate(page)) {
+ clear_buffer_new(bh);
+ set_buffer_uptodate(bh);
+ mark_buffer_dirty(bh);
+ continue;
+ }
+ if (block_end > to || block_start < from)
+ zero_user_segments(page, to, block_end,
+ block_start, from);
+ continue;
+ }
+ }
+ if (PageUptodate(page)) {
+ if (!buffer_uptodate(bh))
+ set_buffer_uptodate(bh);
+ continue;
+ }
+ if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
+ !buffer_unwritten(bh) &&
+ (block_start < from || block_end > to)) {
+ ll_rw_block(READ, 1, &bh);
+ *wait_bh++ = bh;
+ decrypt = ext4_encrypted_inode(inode) &&
+ S_ISREG(inode->i_mode);
+ }
+ }
+ /*
+ * If we issued read requests, let them complete.
+ */
+ while (wait_bh > wait) {
+ wait_on_buffer(*--wait_bh);
+ if (!buffer_uptodate(*wait_bh))
+ err = -EIO;
+ }
+ if (unlikely(err))
+ page_zero_new_buffers(page, from, to);
+ else if (decrypt)
+ err = ext4_decrypt(page);
+ return err;
+}
+#endif
+
static int ext4_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -891,6 +995,16 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
pgoff_t index;
unsigned from, to;
+ if (trace_android_fs_datawrite_start_enabled()) {
+ char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+ path = android_fstrace_get_pathname(pathbuf,
+ MAX_TRACE_PATHBUF_LEN,
+ inode);
+ trace_android_fs_datawrite_start(inode, pos, len,
+ current->pid, path,
+ current->comm);
+ }
trace_ext4_write_begin(inode, pos, len, flags);
/*
* Reserve one block more for addition to orphan list in case
@@ -941,11 +1055,19 @@ retry_journal:
/* In case writeback began while the page was unlocked */
wait_for_stable_page(page);
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ if (ext4_should_dioread_nolock(inode))
+ ret = ext4_block_write_begin(page, pos, len,
+ ext4_get_block_write);
+ else
+ ret = ext4_block_write_begin(page, pos, len,
+ ext4_get_block);
+#else
if (ext4_should_dioread_nolock(inode))
ret = __block_write_begin(page, pos, len, ext4_get_block_write);
else
ret = __block_write_begin(page, pos, len, ext4_get_block);
-
+#endif
if (!ret && ext4_should_journal_data(inode)) {
ret = ext4_walk_page_buffers(handle, page_buffers(page),
from, to, NULL,
@@ -1018,16 +1140,8 @@ static int ext4_write_end(struct file *file,
int ret = 0, ret2;
int i_size_changed = 0;
+ trace_android_fs_datawrite_end(inode, pos, len);
trace_ext4_write_end(inode, pos, len, copied);
- if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) {
- ret = ext4_jbd2_file_inode(handle, inode);
- if (ret) {
- unlock_page(page);
- page_cache_release(page);
- goto errout;
- }
- }
-
if (ext4_has_inline_data(inode)) {
ret = ext4_write_inline_data_end(inode, pos, len,
copied, page);
@@ -1091,6 +1205,7 @@ static int ext4_journalled_write_end(struct file *file,
unsigned from, to;
int size_changed = 0;
+ trace_android_fs_datawrite_end(inode, pos, len);
trace_ext4_journalled_write_end(inode, pos, len, copied);
from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;
@@ -1738,11 +1853,22 @@ static int ext4_writepage(struct page *page,
* the page. But we may reach here when we do a journal commit via
* journal_submit_inode_data_buffers() and in that case we must write
* allocated buffers to achieve data=ordered mode guarantees.
+ *
+ * Also, if there is only one buffer per page (the fs block
+ * size == the page size), if one buffer needs block
+ * allocation or needs to modify the extent tree to clear the
+ * unwritten flag, we know that the page can't be written at
+ * all, so we might as well refuse the write immediately.
+ * Unfortunately if the block size != page size, we can't as
+ * easily detect this case using ext4_walk_page_buffers(), but
+ * for the extremely common case, this is an optimization that
+ * skips a useless round trip through ext4_bio_write_page().
*/
if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
ext4_bh_delay_or_unwritten)) {
redirty_page_for_writepage(wbc, page);
- if (current->flags & PF_MEMALLOC) {
+ if ((current->flags & PF_MEMALLOC) ||
+ (inode->i_sb->s_blocksize == PAGE_CACHE_SIZE)) {
/*
* For memory cleaning there's no point in writing only
* some buffers. So just bail out. Warn if we came here
@@ -2574,6 +2700,16 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
len, flags, pagep, fsdata);
}
*fsdata = (void *)0;
+ if (trace_android_fs_datawrite_start_enabled()) {
+ char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+ path = android_fstrace_get_pathname(pathbuf,
+ MAX_TRACE_PATHBUF_LEN,
+ inode);
+ trace_android_fs_datawrite_start(inode, pos, len,
+ current->pid,
+ path, current->comm);
+ }
trace_ext4_da_write_begin(inode, pos, len, flags);
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
@@ -2624,7 +2760,12 @@ retry_journal:
/* In case writeback began while the page was unlocked */
wait_for_stable_page(page);
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ ret = ext4_block_write_begin(page, pos, len,
+ ext4_da_get_block_prep);
+#else
ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
+#endif
if (ret < 0) {
unlock_page(page);
ext4_journal_stop(handle);
@@ -2687,6 +2828,7 @@ static int ext4_da_write_end(struct file *file,
return ext4_write_end(file, mapping, pos,
len, copied, page, fsdata);
+ trace_android_fs_datawrite_end(inode, pos, len);
trace_ext4_da_write_end(inode, pos, len, copied);
start = pos & (PAGE_CACHE_SIZE - 1);
end = start + copied - 1;
@@ -2870,7 +3012,7 @@ static int ext4_readpage(struct file *file, struct page *page)
ret = ext4_readpage_inline(inode, page);
if (ret == -EAGAIN)
- return mpage_readpage(page, ext4_get_block);
+ return ext4_mpage_readpages(page->mapping, NULL, page, 1);
return ret;
}
@@ -2885,7 +3027,7 @@ ext4_readpages(struct file *file, struct address_space *mapping,
if (ext4_has_inline_data(inode))
return 0;
- return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
+ return ext4_mpage_readpages(mapping, pages, NULL, nr_pages);
}
static void ext4_invalidatepage(struct page *page, unsigned int offset,
@@ -3082,6 +3224,9 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
get_block_func = ext4_get_block_write;
dio_flags = DIO_LOCKING;
}
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode));
+#endif
ret = __blockdev_direct_IO(rw, iocb, inode,
inode->i_sb->s_bdev, iter,
offset,
@@ -3145,6 +3290,11 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
size_t count = iov_iter_count(iter);
ssize_t ret;
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode))
+ return 0;
+#endif
+
/*
* If we are doing data journalling we don't support O_DIRECT
*/
@@ -3155,12 +3305,39 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
if (ext4_has_inline_data(inode))
return 0;
+ if (trace_android_fs_dataread_start_enabled() && (rw == READ)) {
+ char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+ path = android_fstrace_get_pathname(pathbuf,
+ MAX_TRACE_PATHBUF_LEN,
+ inode);
+ trace_android_fs_dataread_start(inode, offset, count,
+ current->pid, path,
+ current->comm);
+ }
+ if (trace_android_fs_datawrite_start_enabled() && (rw == WRITE)) {
+ char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+ path = android_fstrace_get_pathname(pathbuf,
+ MAX_TRACE_PATHBUF_LEN,
+ inode);
+ trace_android_fs_datawrite_start(inode, offset, count,
+ current->pid, path,
+ current->comm);
+ }
+
trace_ext4_direct_IO_enter(inode, offset, count, rw);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
ret = ext4_ext_direct_IO(rw, iocb, iter, offset);
else
ret = ext4_ind_direct_IO(rw, iocb, iter, offset);
trace_ext4_direct_IO_exit(inode, offset, count, rw, ret);
+
+ if (trace_android_fs_dataread_start_enabled() && (rw == READ))
+ trace_android_fs_dataread_end(inode, offset, count);
+ if (trace_android_fs_datawrite_start_enabled() && (rw == WRITE))
+ trace_android_fs_datawrite_end(inode, offset, count);
+
return ret;
}
@@ -3324,6 +3501,13 @@ static int ext4_block_zero_page_range(handle_t *handle,
/* Uhhuh. Read error. Complain and punt. */
if (!buffer_uptodate(bh))
goto unlock;
+ if (S_ISREG(inode->i_mode) &&
+ ext4_encrypted_inode(inode)) {
+ /* We expect the key to be set. */
+ BUG_ON(!ext4_has_encryption_key(inode));
+ BUG_ON(blocksize != PAGE_CACHE_SIZE);
+ WARN_ON_ONCE(ext4_decrypt(page));
+ }
}
if (ext4_should_journal_data(inode)) {
BUFFER_TRACE(bh, "get write access");
@@ -3363,6 +3547,11 @@ static int ext4_block_truncate_page(handle_t *handle,
unsigned blocksize;
struct inode *inode = mapping->host;
+ /* If we are processing an encrypted inode during orphan list
+ * handling */
+ if (ext4_encrypted_inode(inode) && !ext4_has_encryption_key(inode))
+ return 0;
+
blocksize = inode->i_sb->s_blocksize;
length = blocksize - (offset & (blocksize - 1));
@@ -4130,7 +4319,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
inode->i_op = &ext4_dir_inode_operations;
inode->i_fop = &ext4_dir_operations;
} else if (S_ISLNK(inode->i_mode)) {
- if (ext4_inode_is_fast_symlink(inode)) {
+ if (ext4_inode_is_fast_symlink(inode) &&
+ !ext4_encrypted_inode(inode)) {
inode->i_op = &ext4_fast_symlink_inode_operations;
nd_terminate_link(ei->i_data, inode->i_size,
sizeof(ei->i_data) - 1);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index bfda18a15592..008b688f1432 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -14,6 +14,7 @@
#include <linux/compat.h>
#include <linux/mount.h>
#include <linux/file.h>
+#include <linux/random.h>
#include <asm/uaccess.h>
#include "ext4_jbd2.h"
#include "ext4.h"
@@ -198,6 +199,16 @@ journal_err_out:
return err;
}
+static int uuid_is_zero(__u8 u[16])
+{
+ int i;
+
+ for (i = 0; i < 16; i++)
+ if (u[i])
+ return 0;
+ return 1;
+}
+
long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -587,11 +598,13 @@ resizefs_out:
return err;
}
+ case FIDTRIM:
case FITRIM:
{
struct request_queue *q = bdev_get_queue(sb->s_bdev);
struct fstrim_range range;
int ret = 0;
+ int flags = cmd == FIDTRIM ? BLKDEV_DISCARD_SECURE : 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -599,13 +612,15 @@ resizefs_out:
if (!blk_queue_discard(q))
return -EOPNOTSUPP;
+ if ((flags & BLKDEV_DISCARD_SECURE) && !blk_queue_secdiscard(q))
+ return -EOPNOTSUPP;
if (copy_from_user(&range, (struct fstrim_range __user *)arg,
sizeof(range)))
return -EFAULT;
range.minlen = max((unsigned int)range.minlen,
q->limits.discard_granularity);
- ret = ext4_trim_fs(sb, &range);
+ ret = ext4_trim_fs(sb, &range, flags);
if (ret < 0)
return ret;
@@ -617,7 +632,88 @@ resizefs_out:
}
case EXT4_IOC_PRECACHE_EXTENTS:
return ext4_ext_precache(inode);
+ case EXT4_IOC_SET_ENCRYPTION_POLICY: {
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ struct ext4_encryption_policy policy;
+ int err = 0;
+
+ if (copy_from_user(&policy,
+ (struct ext4_encryption_policy __user *)arg,
+ sizeof(policy))) {
+ err = -EFAULT;
+ goto encryption_policy_out;
+ }
+ err = mnt_want_write_file(filp);
+ if (err)
+ goto encryption_policy_out;
+
+ mutex_lock(&inode->i_mutex);
+
+ err = ext4_process_policy(&policy, inode);
+
+ mutex_unlock(&inode->i_mutex);
+
+ mnt_drop_write_file(filp);
+encryption_policy_out:
+ return err;
+#else
+ return -EOPNOTSUPP;
+#endif
+ }
+ case EXT4_IOC_GET_ENCRYPTION_PWSALT: {
+ int err, err2;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ handle_t *handle;
+
+ if (!ext4_sb_has_crypto(sb))
+ return -EOPNOTSUPP;
+ if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) {
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
+ handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto pwsalt_err_exit;
+ }
+ err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+ if (err)
+ goto pwsalt_err_journal;
+ generate_random_uuid(sbi->s_es->s_encrypt_pw_salt);
+ err = ext4_handle_dirty_metadata(handle, NULL,
+ sbi->s_sbh);
+ pwsalt_err_journal:
+ err2 = ext4_journal_stop(handle);
+ if (err2 && !err)
+ err = err2;
+ pwsalt_err_exit:
+ mnt_drop_write_file(filp);
+ if (err)
+ return err;
+ }
+ if (copy_to_user((void __user *) arg,
+ sbi->s_es->s_encrypt_pw_salt, 16))
+ return -EFAULT;
+ return 0;
+ }
+ case EXT4_IOC_GET_ENCRYPTION_POLICY: {
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ struct ext4_encryption_policy policy;
+ int err = 0;
+
+ if (!ext4_encrypted_inode(inode))
+ return -ENOENT;
+ err = ext4_get_policy(inode, &policy);
+ if (err)
+ return err;
+ if (copy_to_user((void __user *)arg, &policy, sizeof(policy)))
+ return -EFAULT;
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
+ }
default:
return -ENOTTY;
}
@@ -682,6 +778,9 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case FITRIM:
case EXT4_IOC_RESIZE_FS:
case EXT4_IOC_PRECACHE_EXTENTS:
+ case EXT4_IOC_SET_ENCRYPTION_POLICY:
+ case EXT4_IOC_GET_ENCRYPTION_PWSALT:
+ case EXT4_IOC_GET_ENCRYPTION_POLICY:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index da268894e7c7..2309de2a5917 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2748,7 +2748,8 @@ int ext4_mb_release(struct super_block *sb)
}
static inline int ext4_issue_discard(struct super_block *sb,
- ext4_group_t block_group, ext4_grpblk_t cluster, int count)
+ ext4_group_t block_group, ext4_grpblk_t cluster, int count,
+ unsigned long flags)
{
ext4_fsblk_t discard_block;
@@ -2757,7 +2758,7 @@ static inline int ext4_issue_discard(struct super_block *sb,
count = EXT4_C2B(EXT4_SB(sb), count);
trace_ext4_discard_blocks(sb,
(unsigned long long) discard_block, count);
- return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
+ return sb_issue_discard(sb, discard_block, count, GFP_NOFS, flags);
}
/*
@@ -2779,7 +2780,7 @@ static void ext4_free_data_callback(struct super_block *sb,
if (test_opt(sb, DISCARD)) {
err = ext4_issue_discard(sb, entry->efd_group,
entry->efd_start_cluster,
- entry->efd_count);
+ entry->efd_count, 0);
if (err && err != -EOPNOTSUPP)
ext4_msg(sb, KERN_WARNING, "discard request in"
" group:%d block:%d count:%d failed"
@@ -4825,7 +4826,8 @@ do_more:
* them with group lock_held
*/
if (test_opt(sb, DISCARD)) {
- err = ext4_issue_discard(sb, block_group, bit, count);
+ err = ext4_issue_discard(sb, block_group, bit, count,
+ 0);
if (err && err != -EOPNOTSUPP)
ext4_msg(sb, KERN_WARNING, "discard request in"
" group:%d block:%d count:%lu failed"
@@ -5020,13 +5022,15 @@ error_return:
* @count: number of blocks to TRIM
* @group: alloc. group we are working with
* @e4b: ext4 buddy for the group
+ * @blkdev_flags: flags for the block device
*
* Trim "count" blocks starting at "start" in the "group". To assure that no
* one will allocate those blocks, mark it as used in buddy bitmap. This must
* be called with under the group lock.
*/
static int ext4_trim_extent(struct super_block *sb, int start, int count,
- ext4_group_t group, struct ext4_buddy *e4b)
+ ext4_group_t group, struct ext4_buddy *e4b,
+ unsigned long blkdev_flags)
__releases(bitlock)
__acquires(bitlock)
{
@@ -5047,7 +5051,7 @@ __acquires(bitlock)
*/
mb_mark_used(e4b, &ex);
ext4_unlock_group(sb, group);
- ret = ext4_issue_discard(sb, group, start, count);
+ ret = ext4_issue_discard(sb, group, start, count, blkdev_flags);
ext4_lock_group(sb, group);
mb_free_blocks(NULL, e4b, start, ex.fe_len);
return ret;
@@ -5060,6 +5064,7 @@ __acquires(bitlock)
* @start: first group block to examine
* @max: last group block to examine
* @minblocks: minimum extent block count
+ * @blkdev_flags: flags for the block device
*
* ext4_trim_all_free walks through group's buddy bitmap searching for free
* extents. When the free block is found, ext4_trim_extent is called to TRIM
@@ -5074,7 +5079,7 @@ __acquires(bitlock)
static ext4_grpblk_t
ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
ext4_grpblk_t start, ext4_grpblk_t max,
- ext4_grpblk_t minblocks)
+ ext4_grpblk_t minblocks, unsigned long blkdev_flags)
{
void *bitmap;
ext4_grpblk_t next, count = 0, free_count = 0;
@@ -5107,7 +5112,8 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
if ((next - start) >= minblocks) {
ret = ext4_trim_extent(sb, start,
- next - start, group, &e4b);
+ next - start, group, &e4b,
+ blkdev_flags);
if (ret && ret != -EOPNOTSUPP)
break;
ret = 0;
@@ -5149,6 +5155,7 @@ out:
* ext4_trim_fs() -- trim ioctl handle function
* @sb: superblock for filesystem
* @range: fstrim_range structure
+ * @blkdev_flags: flags for the block device
*
* start: First Byte to trim
* len: number of Bytes to trim from start
@@ -5157,7 +5164,8 @@ out:
* start to start+len. For each such a group ext4_trim_all_free function
* is invoked to trim all free space.
*/
-int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
+int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range,
+ unsigned long blkdev_flags)
{
struct ext4_group_info *grp;
ext4_group_t group, first_group, last_group;
@@ -5213,7 +5221,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
if (grp->bb_free >= minlen) {
cnt = ext4_trim_all_free(sb, group, first_cluster,
- end, minlen);
+ end, minlen, blkdev_flags);
if (cnt < 0) {
ret = cnt;
break;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 4d1049517e4a..36e383e6a69a 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -601,6 +601,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
ext4_should_journal_data(donor_inode)) {
return -EINVAL;
}
+
+ if (ext4_encrypted_inode(orig_inode) ||
+ ext4_encrypted_inode(donor_inode)) {
+ ext4_msg(orig_inode->i_sb, KERN_ERR,
+ "Online defrag not supported for encrypted files");
+ return -EOPNOTSUPP;
+ }
+
/* Protect orig and donor inodes against a truncate */
lock_two_nondirectories(orig_inode, donor_inode);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index d3a22a11ac76..b668bf30726c 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -249,13 +249,14 @@ static void dx_set_count(struct dx_entry *entries, unsigned value);
static void dx_set_limit(struct dx_entry *entries, unsigned value);
static unsigned dx_root_limit(struct inode *dir, unsigned infosize);
static unsigned dx_node_limit(struct inode *dir);
-static struct dx_frame *dx_probe(const struct qstr *d_name,
+static struct dx_frame *dx_probe(struct ext4_filename *fname,
struct inode *dir,
struct dx_hash_info *hinfo,
struct dx_frame *frame);
static void dx_release(struct dx_frame *frames);
-static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
- struct dx_hash_info *hinfo, struct dx_map_entry map[]);
+static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de,
+ unsigned blocksize, struct dx_hash_info *hinfo,
+ struct dx_map_entry map[]);
static void dx_sort_map(struct dx_map_entry *map, unsigned count);
static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
struct dx_map_entry *offsets, int count, unsigned blocksize);
@@ -267,10 +268,10 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
struct dx_frame *frames,
__u32 *start_hash);
static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
- const struct qstr *d_name,
+ struct ext4_filename *fname,
struct ext4_dir_entry_2 **res_dir);
-static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
- struct inode *inode);
+static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
+ struct dentry *dentry, struct inode *inode);
/* checksumming functions */
void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
@@ -586,8 +587,10 @@ struct stats
unsigned bcount;
};
-static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_entry_2 *de,
- int size, int show_names)
+static struct stats dx_show_leaf(struct inode *dir,
+ struct dx_hash_info *hinfo,
+ struct ext4_dir_entry_2 *de,
+ int size, int show_names)
{
unsigned names = 0, space = 0;
char *base = (char *) de;
@@ -600,12 +603,69 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent
{
if (show_names)
{
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ int len;
+ char *name;
+ struct ext4_str fname_crypto_str
+ = {.name = NULL, .len = 0};
+ int res = 0;
+
+ name = de->name;
+ len = de->name_len;
+ if (ext4_encrypted_inode(inode))
+ res = ext4_get_encryption_info(dir);
+ if (res) {
+ printk(KERN_WARNING "Error setting up"
+ " fname crypto: %d\n", res);
+ }
+ if (ctx == NULL) {
+ /* Directory is not encrypted */
+ ext4fs_dirhash(de->name,
+ de->name_len, &h);
+ printk("%*.s:(U)%x.%u ", len,
+ name, h.hash,
+ (unsigned) ((char *) de
+ - base));
+ } else {
+ /* Directory is encrypted */
+ res = ext4_fname_crypto_alloc_buffer(
+ ctx, de->name_len,
+ &fname_crypto_str);
+ if (res < 0) {
+ printk(KERN_WARNING "Error "
+ "allocating crypto "
+ "buffer--skipping "
+ "crypto\n");
+ ctx = NULL;
+ }
+ res = ext4_fname_disk_to_usr(ctx, NULL, de,
+ &fname_crypto_str);
+ if (res < 0) {
+ printk(KERN_WARNING "Error "
+ "converting filename "
+ "from disk to usr"
+ "\n");
+ name = "??";
+ len = 2;
+ } else {
+ name = fname_crypto_str.name;
+ len = fname_crypto_str.len;
+ }
+ ext4fs_dirhash(de->name, de->name_len,
+ &h);
+ printk("%*.s:(E)%x.%u ", len, name,
+ h.hash, (unsigned) ((char *) de
+ - base));
+ ext4_fname_crypto_free_buffer(
+ &fname_crypto_str);
+ }
+#else
int len = de->name_len;
char *name = de->name;
- while (len--) printk("%c", *name++);
ext4fs_dirhash(de->name, de->name_len, &h);
- printk(":%x.%u ", h.hash,
+ printk("%*.s:%x.%u ", len, name, h.hash,
(unsigned) ((char *) de - base));
+#endif
}
space += EXT4_DIR_REC_LEN(de->name_len);
names++;
@@ -623,7 +683,6 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
unsigned count = dx_get_count(entries), names = 0, space = 0, i;
unsigned bcount = 0;
struct buffer_head *bh;
- int err;
printk("%i indexed blocks...\n", count);
for (i = 0; i < count; i++, entries++)
{
@@ -637,7 +696,8 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
continue;
stats = levels?
dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
- dx_show_leaf(hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0);
+ dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *)
+ bh->b_data, blocksize, 0);
names += stats.names;
space += stats.space;
bcount += stats.bcount;
@@ -661,7 +721,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
* back to userspace.
*/
static struct dx_frame *
-dx_probe(const struct qstr *d_name, struct inode *dir,
+dx_probe(struct ext4_filename *fname, struct inode *dir,
struct dx_hash_info *hinfo, struct dx_frame *frame_in)
{
unsigned count, indirect;
@@ -683,12 +743,14 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
root->info.hash_version);
goto fail;
}
+ if (fname)
+ hinfo = &fname->hinfo;
hinfo->hash_version = root->info.hash_version;
if (hinfo->hash_version <= DX_HASH_TEA)
hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
- if (d_name)
- ext4fs_dirhash(d_name->name, d_name->len, hinfo);
+ if (fname && fname_name(fname))
+ ext4fs_dirhash(fname_name(fname), fname_len(fname), hinfo);
hash = hinfo->hash;
if (root->info.unused_flags & 1) {
@@ -773,6 +835,7 @@ fail:
brelse(frame->bh);
frame--;
}
+
if (ret_err == ERR_PTR(ERR_BAD_DX_DIR))
ext4_warning(dir->i_sb,
"Corrupt dir inode %lu, running e2fsck is "
@@ -878,6 +941,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
struct buffer_head *bh;
struct ext4_dir_entry_2 *de, *top;
int err = 0, count = 0;
+ struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}, tmp_str;
dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
(unsigned long)block));
@@ -889,6 +953,22 @@ static int htree_dirblock_to_tree(struct file *dir_file,
top = (struct ext4_dir_entry_2 *) ((char *) de +
dir->i_sb->s_blocksize -
EXT4_DIR_REC_LEN(0));
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ /* Check if the directory is encrypted */
+ if (ext4_encrypted_inode(dir)) {
+ err = ext4_get_encryption_info(dir);
+ if (err < 0) {
+ brelse(bh);
+ return err;
+ }
+ err = ext4_fname_crypto_alloc_buffer(dir, EXT4_NAME_LEN,
+ &fname_crypto_str);
+ if (err < 0) {
+ brelse(bh);
+ return err;
+ }
+ }
+#endif
for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
bh->b_data, bh->b_size,
@@ -904,14 +984,38 @@ static int htree_dirblock_to_tree(struct file *dir_file,
continue;
if (de->inode == 0)
continue;
- if ((err = ext4_htree_store_dirent(dir_file,
- hinfo->hash, hinfo->minor_hash, de)) != 0) {
- brelse(bh);
- return err;
+ if (!ext4_encrypted_inode(dir)) {
+ tmp_str.name = de->name;
+ tmp_str.len = de->name_len;
+ err = ext4_htree_store_dirent(dir_file,
+ hinfo->hash, hinfo->minor_hash, de,
+ &tmp_str);
+ } else {
+ int save_len = fname_crypto_str.len;
+
+ /* Directory is encrypted */
+ err = ext4_fname_disk_to_usr(dir, hinfo, de,
+ &fname_crypto_str);
+ if (err < 0) {
+ count = err;
+ goto errout;
+ }
+ err = ext4_htree_store_dirent(dir_file,
+ hinfo->hash, hinfo->minor_hash, de,
+ &fname_crypto_str);
+ fname_crypto_str.len = save_len;
+ }
+ if (err != 0) {
+ count = err;
+ goto errout;
}
count++;
}
+errout:
brelse(bh);
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ ext4_fname_crypto_free_buffer(&fname_crypto_str);
+#endif
return count;
}
@@ -935,6 +1039,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
int count = 0;
int ret, err;
__u32 hashval;
+ struct ext4_str tmp_str;
dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
start_hash, start_minor_hash));
@@ -970,14 +1075,22 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
/* Add '.' and '..' from the htree header */
if (!start_hash && !start_minor_hash) {
de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
- if ((err = ext4_htree_store_dirent(dir_file, 0, 0, de)) != 0)
+ tmp_str.name = de->name;
+ tmp_str.len = de->name_len;
+ err = ext4_htree_store_dirent(dir_file, 0, 0,
+ de, &tmp_str);
+ if (err != 0)
goto errout;
count++;
}
if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
de = ext4_next_entry(de, dir->i_sb->s_blocksize);
- if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0)
+ tmp_str.name = de->name;
+ tmp_str.len = de->name_len;
+ err = ext4_htree_store_dirent(dir_file, 2, 0,
+ de, &tmp_str);
+ if (err != 0)
goto errout;
count++;
}
@@ -1019,12 +1132,13 @@ errout:
static inline int search_dirblock(struct buffer_head *bh,
struct inode *dir,
+ struct ext4_filename *fname,
const struct qstr *d_name,
unsigned int offset,
struct ext4_dir_entry_2 **res_dir)
{
- return search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir,
- d_name, offset, res_dir);
+ return ext4_search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir,
+ fname, d_name, offset, res_dir);
}
/*
@@ -1035,8 +1149,8 @@ static inline int search_dirblock(struct buffer_head *bh,
* Create map of hash values, offsets, and sizes, stored at end of block.
* Returns number of entries mapped.
*/
-static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
- struct dx_hash_info *hinfo,
+static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de,
+ unsigned blocksize, struct dx_hash_info *hinfo,
struct dx_map_entry *map_tail)
{
int count = 0;
@@ -1106,57 +1220,87 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
* `len <= EXT4_NAME_LEN' is guaranteed by caller.
* `de != NULL' is guaranteed by caller.
*/
-static inline int ext4_match (int len, const char * const name,
- struct ext4_dir_entry_2 * de)
+static inline int ext4_match(struct ext4_filename *fname,
+ struct ext4_dir_entry_2 *de)
{
- if (len != de->name_len)
- return 0;
+ const void *name = fname_name(fname);
+ u32 len = fname_len(fname);
+
if (!de->inode)
return 0;
- return !memcmp(name, de->name, len);
+
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ if (unlikely(!name)) {
+ if (fname->usr_fname->name[0] == '_') {
+ int ret;
+ if (de->name_len <= 32)
+ return 0;
+ ret = memcmp(de->name + ((de->name_len - 17) & ~15),
+ fname->crypto_buf.name + 8, 16);
+ return (ret == 0) ? 1 : 0;
+ }
+ name = fname->crypto_buf.name;
+ len = fname->crypto_buf.len;
+ }
+#endif
+ if (de->name_len != len)
+ return 0;
+ return (memcmp(de->name, name, len) == 0) ? 1 : 0;
}
/*
* Returns 0 if not found, -1 on failure, and 1 on success
*/
-int search_dir(struct buffer_head *bh,
- char *search_buf,
- int buf_size,
- struct inode *dir,
- const struct qstr *d_name,
- unsigned int offset,
- struct ext4_dir_entry_2 **res_dir)
+int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
+ struct inode *dir, struct ext4_filename *fname,
+ const struct qstr *d_name,
+ unsigned int offset, struct ext4_dir_entry_2 **res_dir)
{
struct ext4_dir_entry_2 * de;
char * dlimit;
int de_len;
- const char *name = d_name->name;
- int namelen = d_name->len;
+ int res;
de = (struct ext4_dir_entry_2 *)search_buf;
dlimit = search_buf + buf_size;
while ((char *) de < dlimit) {
/* this code is executed quadratically often */
/* do minimal checking `by hand' */
+ if ((char *) de + de->name_len <= dlimit) {
+ res = ext4_match(fname, de);
+ if (res < 0) {
+ res = -1;
+ goto return_result;
+ }
+ if (res > 0) {
+ /* found a match - just to be sure, do
+ * a full check */
+ if (ext4_check_dir_entry(dir, NULL, de, bh,
+ bh->b_data,
+ bh->b_size, offset)) {
+ res = -1;
+ goto return_result;
+ }
+ *res_dir = de;
+ res = 1;
+ goto return_result;
+ }
- if ((char *) de + namelen <= dlimit &&
- ext4_match (namelen, name, de)) {
- /* found a match - just to be sure, do a full check */
- if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data,
- bh->b_size, offset))
- return -1;
- *res_dir = de;
- return 1;
}
/* prevent looping on a bad block */
de_len = ext4_rec_len_from_disk(de->rec_len,
dir->i_sb->s_blocksize);
- if (de_len <= 0)
- return -1;
+ if (de_len <= 0) {
+ res = -1;
+ goto return_result;
+ }
offset += de_len;
de = (struct ext4_dir_entry_2 *) ((char *) de + de_len);
}
- return 0;
+
+ res = 0;
+return_result:
+ return res;
}
static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block,
@@ -1202,7 +1346,8 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
buffer */
int num = 0;
ext4_lblk_t nblocks;
- int i, namelen;
+ int i, namelen, retval;
+ struct ext4_filename fname;
*res_dir = NULL;
sb = dir->i_sb;
@@ -1210,14 +1355,18 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
if (namelen > EXT4_NAME_LEN)
return NULL;
+ retval = ext4_fname_setup_filename(dir, d_name, 1, &fname);
+ if (retval)
+ return ERR_PTR(retval);
+
if (ext4_has_inline_data(dir)) {
int has_inline_data = 1;
- ret = ext4_find_inline_entry(dir, d_name, res_dir,
+ ret = ext4_find_inline_entry(dir, &fname, d_name, res_dir,
&has_inline_data);
if (has_inline_data) {
if (inlined)
*inlined = 1;
- return ret;
+ goto cleanup_and_exit;
}
}
@@ -1232,14 +1381,14 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
goto restart;
}
if (is_dx(dir)) {
- bh = ext4_dx_find_entry(dir, d_name, res_dir);
+ ret = ext4_dx_find_entry(dir, &fname, res_dir);
/*
* On success, or if the error was file not found,
* return. Otherwise, fall back to doing a search the
* old fashioned way.
*/
- if (!IS_ERR(bh) || PTR_ERR(bh) != ERR_BAD_DX_DIR)
- return bh;
+ if (!IS_ERR(ret) || PTR_ERR(ret) != ERR_BAD_DX_DIR)
+ goto cleanup_and_exit;
dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
"falling back\n"));
}
@@ -1270,8 +1419,10 @@ restart:
num++;
bh = ext4_getblk(NULL, dir, b++, 0);
if (unlikely(IS_ERR(bh))) {
- if (ra_max == 0)
- return bh;
+ if (ra_max == 0) {
+ ret = bh;
+ goto cleanup_and_exit;
+ }
break;
}
bh_use[ra_max] = bh;
@@ -1301,7 +1452,7 @@ restart:
goto next;
}
set_buffer_verified(bh);
- i = search_dirblock(bh, dir, d_name,
+ i = search_dirblock(bh, dir, &fname, d_name,
block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
if (i == 1) {
EXT4_I(dir)->i_dir_start_lookup = block;
@@ -1332,20 +1483,25 @@ cleanup_and_exit:
/* Clean up the read-ahead blocks */
for (; ra_ptr < ra_max; ra_ptr++)
brelse(bh_use[ra_ptr]);
+ ext4_fname_free_filename(&fname);
return ret;
}
-static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
- struct ext4_dir_entry_2 **res_dir)
+static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
+ struct ext4_filename *fname,
+ struct ext4_dir_entry_2 **res_dir)
{
struct super_block * sb = dir->i_sb;
- struct dx_hash_info hinfo;
struct dx_frame frames[2], *frame;
+ const struct qstr *d_name = fname->usr_fname;
struct buffer_head *bh;
ext4_lblk_t block;
int retval;
- frame = dx_probe(d_name, dir, &hinfo, frames);
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ *res_dir = NULL;
+#endif
+ frame = dx_probe(fname, dir, NULL, frames);
if (IS_ERR(frame))
return (struct buffer_head *) frame;
do {
@@ -1354,7 +1510,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
if (IS_ERR(bh))
goto errout;
- retval = search_dirblock(bh, dir, d_name,
+ retval = search_dirblock(bh, dir, fname, d_name,
block << EXT4_BLOCK_SIZE_BITS(sb),
res_dir);
if (retval == 1)
@@ -1366,7 +1522,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
}
/* Check to see if we should continue to search */
- retval = ext4_htree_next_block(dir, hinfo.hash, frame,
+ retval = ext4_htree_next_block(dir, fname->hinfo.hash, frame,
frames, NULL);
if (retval < 0) {
ext4_warning(sb,
@@ -1391,6 +1547,24 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
struct ext4_dir_entry_2 *de;
struct buffer_head *bh;
+ if (ext4_encrypted_inode(dir)) {
+ int res = ext4_get_encryption_info(dir);
+
+ /*
+ * This should be a properly defined flag for
+ * dentry->d_flags when we uplift this to the VFS.
+ * d_fsdata is set to (void *) 1 if if the dentry is
+ * created while the directory was encrypted and we
+ * don't have access to the key.
+ */
+ dentry->d_fsdata = NULL;
+ if (ext4_encryption_info(dir))
+ dentry->d_fsdata = (void *) 1;
+ d_set_d_op(dentry, &ext4_encrypted_d_ops);
+ if (res && res != -ENOKEY)
+ return ERR_PTR(res);
+ }
+
if (dentry->d_name.len > EXT4_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
@@ -1417,6 +1591,18 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
ino);
return ERR_PTR(-EIO);
}
+ if (!IS_ERR(inode) && ext4_encrypted_inode(dir) &&
+ (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ S_ISLNK(inode->i_mode)) &&
+ !ext4_is_child_context_consistent_with_parent(dir,
+ inode)) {
+ iput(inode);
+ ext4_warning(inode->i_sb,
+ "Inconsistent encryption contexts: %lu/%lu\n",
+ (unsigned long) dir->i_ino,
+ (unsigned long) inode->i_ino);
+ return ERR_PTR(-EPERM);
+ }
}
return d_splice_alias(inode, dentry);
}
@@ -1541,7 +1727,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
/* create map in the end of data2 block */
map = (struct dx_map_entry *) (data2 + blocksize);
- count = dx_make_map((struct ext4_dir_entry_2 *) data1,
+ count = dx_make_map(dir, (struct ext4_dir_entry_2 *) data1,
blocksize, hinfo, map);
map -= count;
dx_sort_map(map, count);
@@ -1564,7 +1750,8 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
hash2, split, count-split));
/* Fancy dance to stay within two buffers */
- de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize);
+ de2 = dx_move_dirents(data1, data2, map + split, count - split,
+ blocksize);
de = dx_pack_dirents(data1, blocksize);
de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
(char *) de,
@@ -1580,8 +1767,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
initialize_dirent_tail(t, blocksize);
}
- dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
- dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
+ dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data1,
+ blocksize, 1));
+ dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data2,
+ blocksize, 1));
/* Which block gets the new entry? */
if (hinfo->hash >= hash2) {
@@ -1610,23 +1799,32 @@ journal_error:
int ext4_find_dest_de(struct inode *dir, struct inode *inode,
struct buffer_head *bh,
void *buf, int buf_size,
- const char *name, int namelen,
+ struct ext4_filename *fname,
struct ext4_dir_entry_2 **dest_de)
{
struct ext4_dir_entry_2 *de;
- unsigned short reclen = EXT4_DIR_REC_LEN(namelen);
+ unsigned short reclen = EXT4_DIR_REC_LEN(fname_len(fname));
int nlen, rlen;
unsigned int offset = 0;
char *top;
+ int res;
de = (struct ext4_dir_entry_2 *)buf;
top = buf + buf_size - reclen;
while ((char *) de <= top) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
- buf, buf_size, offset))
- return -EIO;
- if (ext4_match(namelen, name, de))
- return -EEXIST;
+ buf, buf_size, offset)) {
+ res = -EIO;
+ goto return_result;
+ }
+ /* Provide crypto context and crypto buffer to ext4 match */
+ res = ext4_match(fname, de);
+ if (res < 0)
+ goto return_result;
+ if (res > 0) {
+ res = -EEXIST;
+ goto return_result;
+ }
nlen = EXT4_DIR_REC_LEN(de->name_len);
rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
if ((de->inode ? rlen - nlen : rlen) >= reclen)
@@ -1634,17 +1832,22 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
offset += rlen;
}
- if ((char *) de > top)
- return -ENOSPC;
- *dest_de = de;
- return 0;
+ if ((char *) de > top)
+ res = -ENOSPC;
+ else {
+ *dest_de = de;
+ res = 0;
+ }
+return_result:
+ return res;
}
-void ext4_insert_dentry(struct inode *inode,
- struct ext4_dir_entry_2 *de,
- int buf_size,
- const char *name, int namelen)
+int ext4_insert_dentry(struct inode *dir,
+ struct inode *inode,
+ struct ext4_dir_entry_2 *de,
+ int buf_size,
+ struct ext4_filename *fname)
{
int nlen, rlen;
@@ -1653,7 +1856,7 @@ void ext4_insert_dentry(struct inode *inode,
rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
if (de->inode) {
struct ext4_dir_entry_2 *de1 =
- (struct ext4_dir_entry_2 *)((char *)de + nlen);
+ (struct ext4_dir_entry_2 *)((char *)de + nlen);
de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, buf_size);
de->rec_len = ext4_rec_len_to_disk(nlen, buf_size);
de = de1;
@@ -1661,9 +1864,11 @@ void ext4_insert_dentry(struct inode *inode,
de->file_type = EXT4_FT_UNKNOWN;
de->inode = cpu_to_le32(inode->i_ino);
ext4_set_de_type(inode->i_sb, de, inode->i_mode);
- de->name_len = namelen;
- memcpy(de->name, name, namelen);
+ de->name_len = fname_len(fname);
+ memcpy(de->name, fname_name(fname), fname_len(fname));
+ return 0;
}
+
/*
* Add a new entry into a directory (leaf) block. If de is non-NULL,
* it points to a directory entry which is guaranteed to be large
@@ -1672,13 +1877,11 @@ void ext4_insert_dentry(struct inode *inode,
* space. It will return -ENOSPC if no space is available, and -EIO
* and -EEXIST if directory entry already exists.
*/
-static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
+static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
+ struct inode *dir,
struct inode *inode, struct ext4_dir_entry_2 *de,
struct buffer_head *bh)
{
- struct inode *dir = dentry->d_parent->d_inode;
- const char *name = dentry->d_name.name;
- int namelen = dentry->d_name.len;
unsigned int blocksize = dir->i_sb->s_blocksize;
int csum_size = 0;
int err;
@@ -1687,9 +1890,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
csum_size = sizeof(struct ext4_dir_entry_tail);
if (!de) {
- err = ext4_find_dest_de(dir, inode,
- bh, bh->b_data, blocksize - csum_size,
- name, namelen, &de);
+ err = ext4_find_dest_de(dir, inode, bh, bh->b_data,
+ blocksize - csum_size, fname, &de);
if (err)
return err;
}
@@ -1700,8 +1902,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
return err;
}
- /* By now the buffer is marked for journaling */
- ext4_insert_dentry(inode, de, blocksize, name, namelen);
+ /* By now the buffer is marked for journaling. Due to crypto operations,
+ * the following function call may fail */
+ err = ext4_insert_dentry(dir, inode, de, blocksize, fname);
+ if (err < 0)
+ return err;
/*
* XXX shouldn't update any times until successful
@@ -1729,12 +1934,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
* This converts a one block unindexed directory to a 3 block indexed
* directory, and adds the dentry to the indexed directory.
*/
-static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
+static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
+ struct dentry *dentry,
struct inode *inode, struct buffer_head *bh)
{
struct inode *dir = dentry->d_parent->d_inode;
- const char *name = dentry->d_name.name;
- int namelen = dentry->d_name.len;
struct buffer_head *bh2;
struct dx_root *root;
struct dx_frame frames[2], *frame;
@@ -1745,10 +1949,9 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
unsigned len;
int retval;
unsigned blocksize;
- struct dx_hash_info hinfo;
ext4_lblk_t block;
struct fake_dirent *fde;
- int csum_size = 0;
+ int csum_size = 0;
if (ext4_has_metadata_csum(inode->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
@@ -1811,11 +2014,12 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info)));
/* Initialize as for dx_probe */
- hinfo.hash_version = root->info.hash_version;
- if (hinfo.hash_version <= DX_HASH_TEA)
- hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
- hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
- ext4fs_dirhash(name, namelen, &hinfo);
+ fname->hinfo.hash_version = root->info.hash_version;
+ if (fname->hinfo.hash_version <= DX_HASH_TEA)
+ fname->hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
+ fname->hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
+ ext4fs_dirhash(fname_name(fname), fname_len(fname), &fname->hinfo);
+
memset(frames, 0, sizeof(frames));
frame = frames;
frame->entries = entries;
@@ -1830,14 +2034,14 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
if (retval)
goto out_frames;
- de = do_split(handle,dir, &bh, frame, &hinfo);
+ de = do_split(handle,dir, &bh, frame, &fname->hinfo);
if (IS_ERR(de)) {
retval = PTR_ERR(de);
goto out_frames;
}
dx_release(frames);
- retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
+ retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh);
brelse(bh);
return retval;
out_frames:
@@ -1869,6 +2073,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
struct ext4_dir_entry_2 *de;
struct ext4_dir_entry_tail *t;
struct super_block *sb;
+ struct ext4_filename fname;
int retval;
int dx_fallback=0;
unsigned blocksize;
@@ -1883,10 +2088,15 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
if (!dentry->d_name.len)
return -EINVAL;
+ retval = ext4_fname_setup_filename(dir, &dentry->d_name, 0, &fname);
+ if (retval)
+ return retval;
+
if (ext4_has_inline_data(dir)) {
- retval = ext4_try_add_inline_entry(handle, dentry, inode);
+ retval = ext4_try_add_inline_entry(handle, &fname,
+ dentry, inode);
if (retval < 0)
- return retval;
+ goto out;
if (retval == 1) {
retval = 0;
goto out;
@@ -1894,7 +2104,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
}
if (is_dx(dir)) {
- retval = ext4_dx_add_entry(handle, dentry, inode);
+ retval = ext4_dx_add_entry(handle, &fname, dentry, inode);
if (!retval || (retval != ERR_BAD_DX_DIR))
goto out;
ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
@@ -1904,24 +2114,32 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
blocks = dir->i_size >> sb->s_blocksize_bits;
for (block = 0; block < blocks; block++) {
bh = ext4_read_dirblock(dir, block, DIRENT);
- if (IS_ERR(bh))
- return PTR_ERR(bh);
+ if (IS_ERR(bh)) {
+ retval = PTR_ERR(bh);
+ bh = NULL;
+ goto out;
+ }
- retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
+ retval = add_dirent_to_buf(handle, &fname, dir, inode,
+ NULL, bh);
if (retval != -ENOSPC)
goto out;
if (blocks == 1 && !dx_fallback &&
EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
- retval = make_indexed_dir(handle, dentry, inode, bh);
+ retval = make_indexed_dir(handle, &fname, dentry,
+ inode, bh);
bh = NULL; /* make_indexed_dir releases bh */
goto out;
}
brelse(bh);
}
bh = ext4_append(handle, dir, &block);
- if (IS_ERR(bh))
- return PTR_ERR(bh);
+ if (IS_ERR(bh)) {
+ retval = PTR_ERR(bh);
+ bh = NULL;
+ goto out;
+ }
de = (struct ext4_dir_entry_2 *) bh->b_data;
de->inode = 0;
de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize);
@@ -1931,8 +2149,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
initialize_dirent_tail(t, blocksize);
}
- retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
+ retval = add_dirent_to_buf(handle, &fname, dir, inode, de, bh);
out:
+ ext4_fname_free_filename(&fname);
brelse(bh);
if (retval == 0)
ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
@@ -1942,19 +2161,18 @@ out:
/*
* Returns 0 for success, or a negative error value
*/
-static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
- struct inode *inode)
+static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
+ struct dentry *dentry, struct inode *inode)
{
struct dx_frame frames[2], *frame;
struct dx_entry *entries, *at;
- struct dx_hash_info hinfo;
struct buffer_head *bh;
struct inode *dir = dentry->d_parent->d_inode;
struct super_block *sb = dir->i_sb;
struct ext4_dir_entry_2 *de;
int err;
- frame = dx_probe(&dentry->d_name, dir, &hinfo, frames);
+ frame = dx_probe(fname, dir, NULL, frames);
if (IS_ERR(frame))
return PTR_ERR(frame);
entries = frame->entries;
@@ -1971,7 +2189,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
if (err)
goto journal_error;
- err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
+ err = add_dirent_to_buf(handle, fname, dir, inode, NULL, bh);
if (err != -ENOSPC)
goto cleanup;
@@ -2067,12 +2285,12 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
goto cleanup;
}
}
- de = do_split(handle, dir, &bh, frame, &hinfo);
+ de = do_split(handle, dir, &bh, frame, &fname->hinfo);
if (IS_ERR(de)) {
err = PTR_ERR(de);
goto cleanup;
}
- err = add_dirent_to_buf(handle, dentry, inode, de, bh);
+ err = add_dirent_to_buf(handle, fname, dir, inode, de, bh);
goto cleanup;
journal_error:
@@ -2452,7 +2670,7 @@ out_stop:
/*
* routine to check that the specified directory is empty (for rmdir)
*/
-static int empty_dir(struct inode *inode)
+int ext4_empty_dir(struct inode *inode)
{
unsigned int offset;
struct buffer_head *bh;
@@ -2720,7 +2938,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
goto end_rmdir;
retval = -ENOTEMPTY;
- if (!empty_dir(inode))
+ if (!ext4_empty_dir(inode))
goto end_rmdir;
handle = ext4_journal_start(dir, EXT4_HT_DIR,
@@ -2831,16 +3049,38 @@ static int ext4_symlink(struct inode *dir,
{
handle_t *handle;
struct inode *inode;
- int l, err, retries = 0;
+ int err, len = strlen(symname);
int credits;
+ bool encryption_required;
+ struct ext4_str disk_link;
+ struct ext4_encrypted_symlink_data *sd = NULL;
+
+ disk_link.len = len + 1;
+ disk_link.name = (char *) symname;
- l = strlen(symname)+1;
- if (l > dir->i_sb->s_blocksize)
- return -ENAMETOOLONG;
+ encryption_required = (ext4_encrypted_inode(dir) ||
+ DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb)));
+ if (encryption_required) {
+ err = ext4_get_encryption_info(dir);
+ if (err)
+ return err;
+ if (ext4_encryption_info(dir) == NULL)
+ return -EPERM;
+ disk_link.len = (ext4_fname_encrypted_size(dir, len) +
+ sizeof(struct ext4_encrypted_symlink_data));
+ sd = kzalloc(disk_link.len, GFP_KERNEL);
+ if (!sd)
+ return -ENOMEM;
+ }
+
+ if (disk_link.len > dir->i_sb->s_blocksize) {
+ err = -ENAMETOOLONG;
+ goto err_free_sd;
+ }
dquot_initialize(dir);
- if (l > EXT4_N_BLOCKS * 4) {
+ if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
/*
* For non-fast symlinks, we just allocate inode and put it on
* orphan list in the first transaction => we need bitmap,
@@ -2859,16 +3099,34 @@ static int ext4_symlink(struct inode *dir,
credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3;
}
-retry:
+
inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO,
&dentry->d_name, 0, NULL,
EXT4_HT_DIR, credits);
handle = ext4_journal_current_handle();
- err = PTR_ERR(inode);
- if (IS_ERR(inode))
- goto out_stop;
+ if (IS_ERR(inode)) {
+ if (handle)
+ ext4_journal_stop(handle);
+ err = PTR_ERR(inode);
+ goto err_free_sd;
+ }
- if (l > EXT4_N_BLOCKS * 4) {
+ if (encryption_required) {
+ struct qstr istr;
+ struct ext4_str ostr;
+
+ istr.name = (const unsigned char *) symname;
+ istr.len = len;
+ ostr.name = sd->encrypted_path;
+ ostr.len = disk_link.len;
+ err = ext4_fname_usr_to_disk(inode, &istr, &ostr);
+ if (err < 0)
+ goto err_drop_inode;
+ sd->len = cpu_to_le16(ostr.len);
+ disk_link.name = (char *) sd;
+ }
+
+ if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
inode->i_op = &ext4_symlink_inode_operations;
ext4_set_aops(inode);
/*
@@ -2884,9 +3142,10 @@ retry:
drop_nlink(inode);
err = ext4_orphan_add(handle, inode);
ext4_journal_stop(handle);
+ handle = NULL;
if (err)
goto err_drop_inode;
- err = __page_symlink(inode, symname, l, 1);
+ err = __page_symlink(inode, disk_link.name, disk_link.len, 1);
if (err)
goto err_drop_inode;
/*
@@ -2898,36 +3157,40 @@ retry:
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
+ handle = NULL;
goto err_drop_inode;
}
set_nlink(inode, 1);
err = ext4_orphan_del(handle, inode);
- if (err) {
- ext4_journal_stop(handle);
- clear_nlink(inode);
+ if (err)
goto err_drop_inode;
- }
} else {
/* clear the extent format for fast symlink */
ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
- inode->i_op = &ext4_fast_symlink_inode_operations;
- memcpy((char *)&EXT4_I(inode)->i_data, symname, l);
- inode->i_size = l-1;
+ inode->i_op = encryption_required ?
+ &ext4_symlink_inode_operations :
+ &ext4_fast_symlink_inode_operations;
+ memcpy((char *)&EXT4_I(inode)->i_data, disk_link.name,
+ disk_link.len);
+ inode->i_size = disk_link.len - 1;
}
EXT4_I(inode)->i_disksize = inode->i_size;
err = ext4_add_nondir(handle, dentry, inode);
if (!err && IS_DIRSYNC(dir))
ext4_handle_sync(handle);
-out_stop:
if (handle)
ext4_journal_stop(handle);
- if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
- goto retry;
+ kfree(sd);
return err;
err_drop_inode:
+ if (handle)
+ ext4_journal_stop(handle);
+ clear_nlink(inode);
unlock_new_inode(inode);
iput(inode);
+err_free_sd:
+ kfree(sd);
return err;
}
@@ -2940,7 +3203,9 @@ static int ext4_link(struct dentry *old_dentry,
if (inode->i_nlink >= EXT4_LINK_MAX)
return -EMLINK;
-
+ if (ext4_encrypted_inode(dir) &&
+ !ext4_is_child_context_consistent_with_parent(dir, inode))
+ return -EPERM;
dquot_initialize(dir);
retry:
@@ -3241,6 +3506,14 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino)
goto end_rename;
+ if ((old.dir != new.dir) &&
+ ext4_encrypted_inode(new.dir) &&
+ !ext4_is_child_context_consistent_with_parent(new.dir,
+ old.inode)) {
+ retval = -EPERM;
+ goto end_rename;
+ }
+
new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
&new.de, &new.inlined);
if (IS_ERR(new.bh)) {
@@ -3281,7 +3554,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
if (S_ISDIR(old.inode->i_mode)) {
if (new.inode) {
retval = -ENOTEMPTY;
- if (!empty_dir(new.inode))
+ if (!ext4_empty_dir(new.inode))
goto end_rename;
} else {
retval = -EMLINK;
@@ -3355,8 +3628,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
ext4_dec_count(handle, old.dir);
if (new.inode) {
- /* checked empty_dir above, can't have another parent,
- * ext4_dec_count() won't work for many-linked dirs */
+ /* checked ext4_empty_dir above, can't have another
+ * parent, ext4_dec_count() won't work for many-linked
+ * dirs */
clear_nlink(new.inode);
} else {
ext4_inc_count(handle, new.dir);
@@ -3404,6 +3678,15 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
u8 new_file_type;
int retval;
+ if ((ext4_encrypted_inode(old_dir) ||
+ ext4_encrypted_inode(new_dir)) &&
+ (old_dir != new_dir) &&
+ (!ext4_is_child_context_consistent_with_parent(new_dir,
+ old.inode) ||
+ !ext4_is_child_context_consistent_with_parent(old_dir,
+ new.inode)))
+ return -EPERM;
+
dquot_initialize(old.dir);
dquot_initialize(new.dir);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index b24a2541a9ba..8444f2964a73 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -26,6 +26,7 @@
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/ratelimit.h>
+#include <linux/backing-dev.h>
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -69,6 +70,10 @@ static void ext4_finish_bio(struct bio *bio)
bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ struct page *data_page = NULL;
+ struct ext4_crypto_ctx *ctx = NULL;
+#endif
struct buffer_head *bh, *head;
unsigned bio_start = bvec->bv_offset;
unsigned bio_end = bio_start + bvec->bv_len;
@@ -78,6 +83,15 @@ static void ext4_finish_bio(struct bio *bio)
if (!page)
continue;
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ if (!page->mapping) {
+ /* The bounce data pages are unmapped. */
+ data_page = page;
+ ctx = (struct ext4_crypto_ctx *)page_private(data_page);
+ page = ctx->w.control_page;
+ }
+#endif
+
if (error) {
SetPageError(page);
set_bit(AS_EIO, &page->mapping->flags);
@@ -102,8 +116,13 @@ static void ext4_finish_bio(struct bio *bio)
} while ((bh = bh->b_this_page) != head);
bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
local_irq_restore(flags);
- if (!under_io)
+ if (!under_io) {
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ if (ctx)
+ ext4_restore_control_page(data_page);
+#endif
end_page_writeback(page);
+ }
}
}
@@ -378,6 +397,7 @@ static int io_submit_init_bio(struct ext4_io_submit *io,
static int io_submit_add_bh(struct ext4_io_submit *io,
struct inode *inode,
+ struct page *page,
struct buffer_head *bh)
{
int ret;
@@ -391,7 +411,7 @@ submit_and_retry:
if (ret)
return ret;
}
- ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
+ ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh));
if (ret != bh->b_size)
goto submit_and_retry;
io->io_next_block++;
@@ -404,11 +424,13 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
struct writeback_control *wbc,
bool keep_towrite)
{
+ struct page *data_page = NULL;
struct inode *inode = page->mapping->host;
unsigned block_start, blocksize;
struct buffer_head *bh, *head;
int ret = 0;
int nr_submitted = 0;
+ int nr_to_submit = 0;
blocksize = 1 << inode->i_blkbits;
@@ -461,21 +483,44 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
}
set_buffer_async_write(bh);
+ nr_to_submit++;
} while ((bh = bh->b_this_page) != head);
- /* Now submit buffers to write */
bh = head = page_buffers(page);
+
+ if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode) &&
+ nr_to_submit) {
+ gfp_t gfp_flags = GFP_NOFS;
+
+ retry_encrypt:
+ data_page = ext4_encrypt(inode, page, gfp_flags);
+ if (IS_ERR(data_page)) {
+ ret = PTR_ERR(data_page);
+ if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) {
+ if (io->io_bio) {
+ ext4_io_submit(io);
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ }
+ gfp_flags |= __GFP_NOFAIL;
+ goto retry_encrypt;
+ }
+ data_page = NULL;
+ goto out;
+ }
+ }
+
+ /* Now submit buffers to write */
do {
if (!buffer_async_write(bh))
continue;
- ret = io_submit_add_bh(io, inode, bh);
+ ret = io_submit_add_bh(io, inode,
+ data_page ? data_page : page, bh);
if (ret) {
/*
* We only get here on ENOMEM. Not much else
* we can do but mark the page as dirty, and
* better luck next time.
*/
- redirty_page_for_writepage(wbc, page);
break;
}
nr_submitted++;
@@ -484,6 +529,11 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
/* Error stopped previous loop? Clean up buffers... */
if (ret) {
+ out:
+ if (data_page)
+ ext4_restore_control_page(data_page);
+ printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret);
+ redirty_page_for_writepage(wbc, page);
do {
clear_buffer_async_write(bh);
bh = bh->b_this_page;
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
new file mode 100644
index 000000000000..8920d5962c43
--- /dev/null
+++ b/fs/ext4/readpage.c
@@ -0,0 +1,367 @@
+/*
+ * linux/fs/ext4/readpage.c
+ *
+ * Copyright (C) 2002, Linus Torvalds.
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * This was originally taken from fs/mpage.c
+ *
+ * The intent is the ext4_mpage_readpages() function here is intended
+ * to replace mpage_readpages() in the general case, not just for
+ * encrypted files. It has some limitations (see below), where it
+ * will fall back to read_block_full_page(), but these limitations
+ * should only be hit when page_size != block_size.
+ *
+ * This will allow us to attach a callback function to support ext4
+ * encryption.
+ *
+ * If anything unusual happens, such as:
+ *
+ * - encountering a page which has buffers
+ * - encountering a page which has a non-hole after a hole
+ * - encountering a page with non-contiguous blocks
+ *
+ * then this code just gives up and calls the buffer_head-based read function.
+ * It does handle a page which has holes at the end - that is a common case:
+ * the end-of-file on blocksize < PAGE_CACHE_SIZE setups.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/kdev_t.h>
+#include <linux/gfp.h>
+#include <linux/bio.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/highmem.h>
+#include <linux/prefetch.h>
+#include <linux/mpage.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+#include <linux/pagevec.h>
+#include <linux/cleancache.h>
+
+#include "ext4.h"
+#include <trace/events/android_fs.h>
+
+/*
+ * Call ext4_decrypt on every single page, reusing the encryption
+ * context.
+ */
+static void completion_pages(struct work_struct *work)
+{
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ struct ext4_crypto_ctx *ctx =
+ container_of(work, struct ext4_crypto_ctx, r.work);
+ struct bio *bio = ctx->r.bio;
+ struct bio_vec *bv;
+ int i;
+
+ bio_for_each_segment_all(bv, bio, i) {
+ struct page *page = bv->bv_page;
+
+ int ret = ext4_decrypt(page);
+ if (ret) {
+ WARN_ON_ONCE(1);
+ SetPageError(page);
+ } else
+ SetPageUptodate(page);
+ unlock_page(page);
+ }
+ ext4_release_crypto_ctx(ctx);
+ bio_put(bio);
+#else
+ BUG();
+#endif
+}
+
+static inline bool ext4_bio_encrypted(struct bio *bio)
+{
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ return unlikely(bio->bi_private != NULL);
+#else
+ return false;
+#endif
+}
+
+static void
+ext4_trace_read_completion(struct bio *bio, int err)
+{
+ struct page *first_page = bio->bi_io_vec[0].bv_page;
+
+ if (first_page != NULL)
+ trace_android_fs_dataread_end(first_page->mapping->host,
+ page_offset(first_page),
+ bio->bi_iter.bi_size);
+}
+
+/*
+ * I/O completion handler for multipage BIOs.
+ *
+ * The mpage code never puts partial pages into a BIO (except for end-of-file).
+ * If a page does not map to a contiguous run of blocks then it simply falls
+ * back to block_read_full_page().
+ *
+ * Why is this? If a page's completion depends on a number of different BIOs
+ * which can complete in any order (or at the same time) then determining the
+ * status of that page is hard. See end_buffer_async_read() for the details.
+ * There is no point in duplicating all that complexity.
+ */
+static void mpage_end_io(struct bio *bio, int err)
+{
+ struct bio_vec *bv;
+ int i;
+
+ if (trace_android_fs_dataread_start_enabled())
+ ext4_trace_read_completion(bio, err);
+
+ if (ext4_bio_encrypted(bio)) {
+ struct ext4_crypto_ctx *ctx = bio->bi_private;
+
+ if (err) {
+ ext4_release_crypto_ctx(ctx);
+ } else {
+ INIT_WORK(&ctx->r.work, completion_pages);
+ ctx->r.bio = bio;
+ queue_work(ext4_read_workqueue, &ctx->r.work);
+ return;
+ }
+ }
+ bio_for_each_segment_all(bv, bio, i) {
+ struct page *page = bv->bv_page;
+
+ if (!err) {
+ SetPageUptodate(page);
+ } else {
+ ClearPageUptodate(page);
+ SetPageError(page);
+ }
+ unlock_page(page);
+ }
+
+ bio_put(bio);
+}
+
+static void
+ext4_submit_bio_read(struct bio *bio)
+{
+ if (trace_android_fs_dataread_start_enabled()) {
+ struct page *first_page = bio->bi_io_vec[0].bv_page;
+
+ if (first_page != NULL) {
+ char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+ path = android_fstrace_get_pathname(pathbuf,
+ MAX_TRACE_PATHBUF_LEN,
+ first_page->mapping->host);
+ trace_android_fs_dataread_start(
+ first_page->mapping->host,
+ page_offset(first_page),
+ bio->bi_iter.bi_size,
+ current->pid,
+ path,
+ current->comm);
+ }
+ }
+ submit_bio(READ, bio);
+}
+
+int ext4_mpage_readpages(struct address_space *mapping,
+ struct list_head *pages, struct page *page,
+ unsigned nr_pages)
+{
+ struct bio *bio = NULL;
+ unsigned page_idx;
+ sector_t last_block_in_bio = 0;
+
+ struct inode *inode = mapping->host;
+ const unsigned blkbits = inode->i_blkbits;
+ const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
+ const unsigned blocksize = 1 << blkbits;
+ sector_t block_in_file;
+ sector_t last_block;
+ sector_t last_block_in_file;
+ sector_t blocks[MAX_BUF_PER_PAGE];
+ unsigned page_block;
+ struct block_device *bdev = inode->i_sb->s_bdev;
+ int length;
+ unsigned relative_block = 0;
+ struct ext4_map_blocks map;
+
+ map.m_pblk = 0;
+ map.m_lblk = 0;
+ map.m_len = 0;
+ map.m_flags = 0;
+
+ for (page_idx = 0; nr_pages; page_idx++, nr_pages--) {
+ int fully_mapped = 1;
+ unsigned first_hole = blocks_per_page;
+
+ prefetchw(&page->flags);
+ if (pages) {
+ page = list_entry(pages->prev, struct page, lru);
+ list_del(&page->lru);
+ if (add_to_page_cache_lru(page, mapping,
+ page->index, GFP_KERNEL))
+ goto next_page;
+ }
+
+ if (page_has_buffers(page))
+ goto confused;
+
+ block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
+ last_block = block_in_file + nr_pages * blocks_per_page;
+ last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
+ if (last_block > last_block_in_file)
+ last_block = last_block_in_file;
+ page_block = 0;
+
+ /*
+ * Map blocks using the previous result first.
+ */
+ if ((map.m_flags & EXT4_MAP_MAPPED) &&
+ block_in_file > map.m_lblk &&
+ block_in_file < (map.m_lblk + map.m_len)) {
+ unsigned map_offset = block_in_file - map.m_lblk;
+ unsigned last = map.m_len - map_offset;
+
+ for (relative_block = 0; ; relative_block++) {
+ if (relative_block == last) {
+ /* needed? */
+ map.m_flags &= ~EXT4_MAP_MAPPED;
+ break;
+ }
+ if (page_block == blocks_per_page)
+ break;
+ blocks[page_block] = map.m_pblk + map_offset +
+ relative_block;
+ page_block++;
+ block_in_file++;
+ }
+ }
+
+ /*
+ * Then do more ext4_map_blocks() calls until we are
+ * done with this page.
+ */
+ while (page_block < blocks_per_page) {
+ if (block_in_file < last_block) {
+ map.m_lblk = block_in_file;
+ map.m_len = last_block - block_in_file;
+
+ if (ext4_map_blocks(NULL, inode, &map, 0) < 0) {
+ set_error_page:
+ SetPageError(page);
+ zero_user_segment(page, 0,
+ PAGE_CACHE_SIZE);
+ unlock_page(page);
+ goto next_page;
+ }
+ }
+ if ((map.m_flags & EXT4_MAP_MAPPED) == 0) {
+ fully_mapped = 0;
+ if (first_hole == blocks_per_page)
+ first_hole = page_block;
+ page_block++;
+ block_in_file++;
+ continue;
+ }
+ if (first_hole != blocks_per_page)
+ goto confused; /* hole -> non-hole */
+
+ /* Contiguous blocks? */
+ if (page_block && blocks[page_block-1] != map.m_pblk-1)
+ goto confused;
+ for (relative_block = 0; ; relative_block++) {
+ if (relative_block == map.m_len) {
+ /* needed? */
+ map.m_flags &= ~EXT4_MAP_MAPPED;
+ break;
+ } else if (page_block == blocks_per_page)
+ break;
+ blocks[page_block] = map.m_pblk+relative_block;
+ page_block++;
+ block_in_file++;
+ }
+ }
+ if (first_hole != blocks_per_page) {
+ zero_user_segment(page, first_hole << blkbits,
+ PAGE_CACHE_SIZE);
+ if (first_hole == 0) {
+ SetPageUptodate(page);
+ unlock_page(page);
+ goto next_page;
+ }
+ } else if (fully_mapped) {
+ SetPageMappedToDisk(page);
+ }
+ if (fully_mapped && blocks_per_page == 1 &&
+ !PageUptodate(page) && cleancache_get_page(page) == 0) {
+ SetPageUptodate(page);
+ goto confused;
+ }
+
+ /*
+ * This page will go to BIO. Do we need to send this
+ * BIO off first?
+ */
+ if (bio && (last_block_in_bio != blocks[0] - 1)) {
+ submit_and_realloc:
+ ext4_submit_bio_read(bio);
+ bio = NULL;
+ }
+ if (bio == NULL) {
+ struct ext4_crypto_ctx *ctx = NULL;
+
+ if (ext4_encrypted_inode(inode) &&
+ S_ISREG(inode->i_mode)) {
+ ctx = ext4_get_crypto_ctx(inode, GFP_NOFS);
+ if (IS_ERR(ctx))
+ goto set_error_page;
+ }
+ bio = bio_alloc(GFP_KERNEL,
+ min_t(int, nr_pages, bio_get_nr_vecs(bdev)));
+ if (!bio) {
+ if (ctx)
+ ext4_release_crypto_ctx(ctx);
+ goto set_error_page;
+ }
+ bio->bi_bdev = bdev;
+ bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
+ bio->bi_end_io = mpage_end_io;
+ bio->bi_private = ctx;
+ }
+
+ length = first_hole << blkbits;
+ if (bio_add_page(bio, page, length, 0) < length)
+ goto submit_and_realloc;
+
+ if (((map.m_flags & EXT4_MAP_BOUNDARY) &&
+ (relative_block == map.m_len)) ||
+ (first_hole != blocks_per_page)) {
+ ext4_submit_bio_read(bio);
+ bio = NULL;
+ } else
+ last_block_in_bio = blocks[blocks_per_page - 1];
+ goto next_page;
+ confused:
+ if (bio) {
+ ext4_submit_bio_read(bio);
+ bio = NULL;
+ }
+ if (!PageUptodate(page))
+ block_read_full_page(page, ext4_get_block);
+ else
+ unlock_page(page);
+ next_page:
+ if (pages)
+ page_cache_release(page);
+ }
+ BUG_ON(pages && !list_empty(pages));
+ if (bio)
+ ext4_submit_bio_read(bio);
+ return 0;
+}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a5200023e604..29242697c455 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -304,6 +304,8 @@ static void __save_error_info(struct super_block *sb, const char *func,
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
+ if (bdev_read_only(sb->s_bdev))
+ return;
es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
es->s_last_error_time = cpu_to_le32(get_seconds());
strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
@@ -910,7 +912,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
atomic_set(&ei->i_ioend_count, 0);
atomic_set(&ei->i_unwritten, 0);
INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
-
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ ei->i_crypt_info = NULL;
+#endif
return &ei->vfs_inode;
}
@@ -988,6 +992,10 @@ void ext4_clear_inode(struct inode *inode)
jbd2_free_inode(EXT4_I(inode)->jinode);
EXT4_I(inode)->jinode = NULL;
}
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ if (EXT4_I(inode)->i_crypt_info)
+ ext4_free_encryption_info(inode, EXT4_I(inode)->i_crypt_info);
+#endif
}
static struct inode *ext4_nfs_get_inode(struct super_block *sb,
@@ -1145,7 +1153,7 @@ enum {
Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev,
Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
- Opt_data_err_abort, Opt_data_err_ignore,
+ Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption,
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
@@ -1231,6 +1239,7 @@ static const match_table_t tokens = {
{Opt_init_itable, "init_itable"},
{Opt_noinit_itable, "noinit_itable"},
{Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
+ {Opt_test_dummy_encryption, "test_dummy_encryption"},
{Opt_removed, "check=none"}, /* mount option from ext2/3 */
{Opt_removed, "nocheck"}, /* mount option from ext2/3 */
{Opt_removed, "reservation"}, /* mount option from ext2/3 */
@@ -1429,6 +1438,7 @@ static const struct mount_opts {
{Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
{Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
{Opt_max_dir_size_kb, 0, MOPT_GTE0},
+ {Opt_test_dummy_encryption, 0, MOPT_GTE0},
{Opt_err, 0, 0}
};
@@ -1599,6 +1609,15 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
}
*journal_ioprio =
IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
+ } else if (token == Opt_test_dummy_encryption) {
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ sbi->s_mount_flags |= EXT4_MF_TEST_DUMMY_ENCRYPTION;
+ ext4_msg(sb, KERN_WARNING,
+ "Test dummy encryption mode enabled");
+#else
+ ext4_msg(sb, KERN_WARNING,
+ "Test dummy encryption mount option ignored");
+#endif
} else if (m->flags & MOPT_DATAJ) {
if (is_remount) {
if (!sbi->s_journal)
@@ -2711,11 +2730,13 @@ static struct attribute *ext4_attrs[] = {
EXT4_INFO_ATTR(lazy_itable_init);
EXT4_INFO_ATTR(batched_discard);
EXT4_INFO_ATTR(meta_bg_resize);
+EXT4_INFO_ATTR(encryption);
static struct attribute *ext4_feat_attrs[] = {
ATTR_LIST(lazy_itable_init),
ATTR_LIST(batched_discard),
ATTR_LIST(meta_bg_resize),
+ ATTR_LIST(encryption),
NULL,
};
@@ -3714,6 +3735,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT) &&
+ es->s_encryption_level) {
+ ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d",
+ es->s_encryption_level);
+ goto failed_mount;
+ }
+
if (sb->s_blocksize != blocksize) {
/* Validate the filesystem blocksize */
if (!sb_set_blocksize(sb, blocksize)) {
@@ -3931,6 +3959,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
(EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
EXT4_DESC_PER_BLOCK(sb);
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG)) {
+ if (le32_to_cpu(es->s_first_meta_bg) > db_count) {
+ ext4_msg(sb, KERN_WARNING,
+ "first meta block group too large: %u "
+ "(group descriptor block count %u)",
+ le32_to_cpu(es->s_first_meta_bg), db_count);
+ goto failed_mount;
+ }
+ }
sbi->s_group_desc = ext4_kvmalloc(db_count *
sizeof(struct buffer_head *),
GFP_KERNEL);
@@ -4080,6 +4117,21 @@ no_journal:
}
}
+ if ((DUMMY_ENCRYPTION_ENABLED(sbi) ||
+ EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) &&
+ (blocksize != PAGE_CACHE_SIZE)) {
+ ext4_msg(sb, KERN_ERR,
+ "Unsupported blocksize for fs encryption");
+ goto failed_mount_wq;
+ }
+
+ if (DUMMY_ENCRYPTION_ENABLED(sbi) &&
+ !(sb->s_flags & MS_RDONLY) &&
+ !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) {
+ EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT);
+ ext4_commit_super(sb, 1);
+ }
+
/*
* Get the # of file system overhead blocks from the
* superblock if present.
@@ -5693,6 +5745,7 @@ out7:
static void __exit ext4_exit_fs(void)
{
+ ext4_exit_crypto();
ext4_destroy_lazyinit_thread();
unregister_as_ext2();
unregister_as_ext3();
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index ff3711932018..4f68e83cd25d 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -23,8 +23,93 @@
#include "ext4.h"
#include "xattr.h"
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
{
+ struct page *cpage = NULL;
+ char *caddr, *paddr = NULL;
+ struct ext4_str cstr, pstr;
+ struct inode *inode = dentry->d_inode;
+ struct ext4_encrypted_symlink_data *sd;
+ loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1);
+ int res;
+ u32 plen, max_size = inode->i_sb->s_blocksize;
+
+ if (!ext4_encrypted_inode(inode))
+ return page_follow_link_light(dentry, nd);
+
+ res = ext4_get_encryption_info(inode);
+ if (res)
+ return ERR_PTR(res);
+
+ if (ext4_inode_is_fast_symlink(inode)) {
+ caddr = (char *) EXT4_I(dentry->d_inode)->i_data;
+ max_size = sizeof(EXT4_I(dentry->d_inode)->i_data);
+ } else {
+ cpage = read_mapping_page(inode->i_mapping, 0, NULL);
+ if (IS_ERR(cpage))
+ return cpage;
+ caddr = kmap(cpage);
+ caddr[size] = 0;
+ }
+
+ /* Symlink is encrypted */
+ sd = (struct ext4_encrypted_symlink_data *)caddr;
+ cstr.name = sd->encrypted_path;
+ cstr.len = le32_to_cpu(sd->len);
+ if ((cstr.len +
+ sizeof(struct ext4_encrypted_symlink_data) - 1) >
+ max_size) {
+ /* Symlink data on the disk is corrupted */
+ res = -EIO;
+ goto errout;
+ }
+ plen = (cstr.len < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2) ?
+ EXT4_FNAME_CRYPTO_DIGEST_SIZE*2 : cstr.len;
+ paddr = kmalloc(plen + 1, GFP_NOFS);
+ if (!paddr) {
+ res = -ENOMEM;
+ goto errout;
+ }
+ pstr.name = paddr;
+ pstr.len = plen;
+ res = _ext4_fname_disk_to_usr(inode, NULL, &cstr, &pstr);
+ if (res < 0)
+ goto errout;
+ /* Null-terminate the name */
+ if (res <= plen)
+ paddr[res] = '\0';
+ nd_set_link(nd, paddr);
+ if (cpage) {
+ kunmap(cpage);
+ page_cache_release(cpage);
+ }
+ return NULL;
+errout:
+ if (cpage) {
+ kunmap(cpage);
+ page_cache_release(cpage);
+ }
+ kfree(paddr);
+ return ERR_PTR(res);
+}
+
+static void ext4_put_link(struct dentry *dentry, struct nameidata *nd,
+ void *cookie)
+{
+ struct page *page = cookie;
+
+ if (!page) {
+ kfree(nd_get_link(nd));
+ } else {
+ kunmap(page);
+ page_cache_release(page);
+ }
+}
+#endif
+
+static void *ext4_follow_fast_link(struct dentry *dentry, struct nameidata *nd)
+{
struct ext4_inode_info *ei = EXT4_I(dentry->d_inode);
nd_set_link(nd, (char *) ei->i_data);
return NULL;
@@ -32,8 +117,13 @@ static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
const struct inode_operations ext4_symlink_inode_operations = {
.readlink = generic_readlink,
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ .follow_link = ext4_follow_link,
+ .put_link = ext4_put_link,
+#else
.follow_link = page_follow_link_light,
.put_link = page_put_link,
+#endif
.setattr = ext4_setattr,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
@@ -43,7 +133,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
const struct inode_operations ext4_fast_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = ext4_follow_link,
+ .follow_link = ext4_follow_fast_link,
.setattr = ext4_setattr,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 29bedf5589f6..ddc0957760ba 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -23,6 +23,7 @@
#define EXT4_XATTR_INDEX_SECURITY 6
#define EXT4_XATTR_INDEX_SYSTEM 7
#define EXT4_XATTR_INDEX_RICHACL 8
+#define EXT4_XATTR_INDEX_ENCRYPTION 9
struct ext4_xattr_header {
__le32 h_magic; /* magic number for identification */
@@ -98,6 +99,8 @@ extern const struct xattr_handler ext4_xattr_user_handler;
extern const struct xattr_handler ext4_xattr_trusted_handler;
extern const struct xattr_handler ext4_xattr_security_handler;
+#define EXT4_XATTR_NAME_ENCRYPTION_CONTEXT "c"
+
extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index 736a348509f7..378c221d68a9 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -1,6 +1,8 @@
config F2FS_FS
- tristate "F2FS filesystem support (EXPERIMENTAL)"
+ tristate "F2FS filesystem support"
depends on BLOCK
+ select CRYPTO
+ select CRYPTO_CRC32
help
F2FS is based on Log-structured File System (LFS), which supports
versatile "flash-friendly" features. The design has been focused on
@@ -45,7 +47,7 @@ config F2FS_FS_POSIX_ACL
default y
help
Posix Access Control Lists (ACLs) support permissions for users and
- gourps beyond the owner/group/world scheme.
+ groups beyond the owner/group/world scheme.
To learn more about Access Control Lists, visit the POSIX ACLs for
Linux website <http://acl.bestbits.at/>.
@@ -71,3 +73,32 @@ config F2FS_CHECK_FS
Enables BUG_ONs which check the filesystem consistency in runtime.
If you want to improve the performance, say N.
+
+config F2FS_FS_ENCRYPTION
+ bool "F2FS Encryption"
+ depends on F2FS_FS
+ depends on F2FS_FS_XATTR
+ select FS_ENCRYPTION
+ help
+ Enable encryption of f2fs files and directories. This
+ feature is similar to ecryptfs, but it is more memory
+ efficient since it avoids caching the encrypted and
+ decrypted pages in the page cache.
+
+config F2FS_IO_TRACE
+ bool "F2FS IO tracer"
+ depends on F2FS_FS
+ depends on FUNCTION_TRACER
+ help
+ F2FS IO trace is based on a function trace, which gathers process
+ information and block IO patterns in the filesystem level.
+
+ If unsure, say N.
+
+config F2FS_FAULT_INJECTION
+ bool "F2FS fault injection facility"
+ depends on F2FS_FS
+ help
+ Test F2FS to inject faults such as ENOMEM, ENOSPC, and so on.
+
+ If unsure, say N.
diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile
index 2e35da12d292..ca949ea7c02f 100644
--- a/fs/f2fs/Makefile
+++ b/fs/f2fs/Makefile
@@ -2,6 +2,8 @@ obj-$(CONFIG_F2FS_FS) += f2fs.o
f2fs-y := dir.o file.o inode.o namei.o hash.o super.o inline.o
f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o
+f2fs-y += shrinker.o extent_cache.o
f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o
f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o
+f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index bafe6df6016f..8ac67d547826 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -62,7 +62,7 @@ static struct posix_acl *f2fs_acl_from_disk(const char *value, size_t size)
if (count == 0)
return NULL;
- acl = posix_acl_alloc(count, GFP_KERNEL);
+ acl = posix_acl_alloc(count, GFP_NOFS);
if (!acl)
return ERR_PTR(-ENOMEM);
@@ -109,14 +109,16 @@ fail:
return ERR_PTR(-EINVAL);
}
-static void *f2fs_acl_to_disk(const struct posix_acl *acl, size_t *size)
+static void *f2fs_acl_to_disk(struct f2fs_sb_info *sbi,
+ const struct posix_acl *acl, size_t *size)
{
struct f2fs_acl_header *f2fs_acl;
struct f2fs_acl_entry *entry;
int i;
- f2fs_acl = kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count *
- sizeof(struct f2fs_acl_entry), GFP_KERNEL);
+ f2fs_acl = f2fs_kmalloc(sbi, sizeof(struct f2fs_acl_header) +
+ acl->a_count * sizeof(struct f2fs_acl_entry),
+ GFP_NOFS);
if (!f2fs_acl)
return ERR_PTR(-ENOMEM);
@@ -162,7 +164,8 @@ fail:
return ERR_PTR(-EINVAL);
}
-struct posix_acl *f2fs_get_acl(struct inode *inode, int type)
+static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type,
+ struct page *dpage)
{
int name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT;
void *value = NULL;
@@ -172,12 +175,13 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type)
if (type == ACL_TYPE_ACCESS)
name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
- retval = f2fs_getxattr(inode, name_index, "", NULL, 0);
+ retval = f2fs_getxattr(inode, name_index, "", NULL, 0, dpage);
if (retval > 0) {
- value = kmalloc(retval, GFP_F2FS_ZERO);
+ value = f2fs_kmalloc(F2FS_I_SB(inode), retval, GFP_F2FS_ZERO);
if (!value)
return ERR_PTR(-ENOMEM);
- retval = f2fs_getxattr(inode, name_index, "", value, retval);
+ retval = f2fs_getxattr(inode, name_index, "", value,
+ retval, dpage);
}
if (retval > 0)
@@ -188,16 +192,17 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type)
acl = ERR_PTR(retval);
kfree(value);
- if (!IS_ERR(acl))
- set_cached_acl(inode, type, acl);
-
return acl;
}
+struct posix_acl *f2fs_get_acl(struct inode *inode, int type)
+{
+ return __f2fs_get_acl(inode, type, NULL);
+}
+
static int __f2fs_set_acl(struct inode *inode, int type,
struct posix_acl *acl, struct page *ipage)
{
- struct f2fs_inode_info *fi = F2FS_I(inode);
int name_index;
void *value = NULL;
size_t size = 0;
@@ -210,7 +215,7 @@ static int __f2fs_set_acl(struct inode *inode, int type,
error = posix_acl_update_mode(inode, &inode->i_mode, &acl);
if (error)
return error;
- set_acl_inode(fi, inode->i_mode);
+ set_acl_inode(inode, inode->i_mode);
}
break;
@@ -225,9 +230,9 @@ static int __f2fs_set_acl(struct inode *inode, int type,
}
if (acl) {
- value = f2fs_acl_to_disk(acl, &size);
+ value = f2fs_acl_to_disk(F2FS_I_SB(inode), acl, &size);
if (IS_ERR(value)) {
- cond_clear_inode_flag(fi, FI_ACL_MODE);
+ clear_inode_flag(inode, FI_ACL_MODE);
return (int)PTR_ERR(value);
}
}
@@ -238,7 +243,7 @@ static int __f2fs_set_acl(struct inode *inode, int type,
if (!error)
set_cached_acl(inode, type, acl);
- cond_clear_inode_flag(fi, FI_ACL_MODE);
+ clear_inode_flag(inode, FI_ACL_MODE);
return error;
}
@@ -247,22 +252,147 @@ int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
return __f2fs_set_acl(inode, type, acl, NULL);
}
-int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage)
+/*
+ * Most part of f2fs_acl_clone, f2fs_acl_create_masq, f2fs_acl_create
+ * are copied from posix_acl.c
+ */
+static struct posix_acl *f2fs_acl_clone(const struct posix_acl *acl,
+ gfp_t flags)
+{
+ struct posix_acl *clone = NULL;
+
+ if (acl) {
+ int size = sizeof(struct posix_acl) + acl->a_count *
+ sizeof(struct posix_acl_entry);
+ clone = kmemdup(acl, size, flags);
+ if (clone)
+ atomic_set(&clone->a_refcount, 1);
+ }
+ return clone;
+}
+
+static int f2fs_acl_create_masq(struct posix_acl *acl, umode_t *mode_p)
{
- struct posix_acl *default_acl, *acl;
+ struct posix_acl_entry *pa, *pe;
+ struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL;
+ umode_t mode = *mode_p;
+ int not_equiv = 0;
+
+ /* assert(atomic_read(acl->a_refcount) == 1); */
+
+ FOREACH_ACL_ENTRY(pa, acl, pe) {
+ switch(pa->e_tag) {
+ case ACL_USER_OBJ:
+ pa->e_perm &= (mode >> 6) | ~S_IRWXO;
+ mode &= (pa->e_perm << 6) | ~S_IRWXU;
+ break;
+
+ case ACL_USER:
+ case ACL_GROUP:
+ not_equiv = 1;
+ break;
+
+ case ACL_GROUP_OBJ:
+ group_obj = pa;
+ break;
+
+ case ACL_OTHER:
+ pa->e_perm &= mode | ~S_IRWXO;
+ mode &= pa->e_perm | ~S_IRWXO;
+ break;
+
+ case ACL_MASK:
+ mask_obj = pa;
+ not_equiv = 1;
+ break;
+
+ default:
+ return -EIO;
+ }
+ }
+
+ if (mask_obj) {
+ mask_obj->e_perm &= (mode >> 3) | ~S_IRWXO;
+ mode &= (mask_obj->e_perm << 3) | ~S_IRWXG;
+ } else {
+ if (!group_obj)
+ return -EIO;
+ group_obj->e_perm &= (mode >> 3) | ~S_IRWXO;
+ mode &= (group_obj->e_perm << 3) | ~S_IRWXG;
+ }
+
+ *mode_p = (*mode_p & ~S_IRWXUGO) | mode;
+ return not_equiv;
+}
+
+static int f2fs_acl_create(struct inode *dir, umode_t *mode,
+ struct posix_acl **default_acl, struct posix_acl **acl,
+ struct page *dpage)
+{
+ struct posix_acl *p;
+ struct posix_acl *clone;
+ int ret;
+
+ *acl = NULL;
+ *default_acl = NULL;
+
+ if (S_ISLNK(*mode) || !IS_POSIXACL(dir))
+ return 0;
+
+ p = __f2fs_get_acl(dir, ACL_TYPE_DEFAULT, dpage);
+ if (!p || p == ERR_PTR(-EOPNOTSUPP)) {
+ *mode &= ~current_umask();
+ return 0;
+ }
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ clone = f2fs_acl_clone(p, GFP_NOFS);
+ if (!clone)
+ goto no_mem;
+
+ ret = f2fs_acl_create_masq(clone, mode);
+ if (ret < 0)
+ goto no_mem_clone;
+
+ if (ret == 0)
+ posix_acl_release(clone);
+ else
+ *acl = clone;
+
+ if (!S_ISDIR(*mode))
+ posix_acl_release(p);
+ else
+ *default_acl = p;
+
+ return 0;
+
+no_mem_clone:
+ posix_acl_release(clone);
+no_mem:
+ posix_acl_release(p);
+ return -ENOMEM;
+}
+
+int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage,
+ struct page *dpage)
+{
+ struct posix_acl *default_acl = NULL, *acl = NULL;
int error = 0;
- error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+ error = f2fs_acl_create(dir, &inode->i_mode, &default_acl, &acl, dpage);
if (error)
return error;
+ f2fs_mark_inode_dirty_sync(inode, true);
+
if (default_acl) {
error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl,
ipage);
posix_acl_release(default_acl);
}
if (acl) {
- if (error)
+ if (!error)
error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl,
ipage);
posix_acl_release(acl);
diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h
index e0864651cdc1..2c685185c24d 100644
--- a/fs/f2fs/acl.h
+++ b/fs/f2fs/acl.h
@@ -37,15 +37,15 @@ struct f2fs_acl_header {
#ifdef CONFIG_F2FS_FS_POSIX_ACL
extern struct posix_acl *f2fs_get_acl(struct inode *, int);
-extern int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
-extern int f2fs_init_acl(struct inode *, struct inode *, struct page *);
+extern int f2fs_set_acl(struct inode *, struct posix_acl *, int);
+extern int f2fs_init_acl(struct inode *, struct inode *, struct page *,
+ struct page *);
#else
-#define f2fs_check_acl NULL
#define f2fs_get_acl NULL
#define f2fs_set_acl NULL
static inline int f2fs_init_acl(struct inode *inode, struct inode *dir,
- struct page *page)
+ struct page *ipage, struct page *dpage)
{
return 0;
}
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index dd10a031c052..b39de99aa58f 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -20,10 +20,19 @@
#include "f2fs.h"
#include "node.h"
#include "segment.h"
+#include "trace.h"
#include <trace/events/f2fs.h>
static struct kmem_cache *ino_entry_slab;
-static struct kmem_cache *inode_entry_slab;
+struct kmem_cache *inode_entry_slab;
+
+void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io)
+{
+ set_ckpt_flags(sbi, CP_ERROR_FLAG);
+ sbi->sb->s_flags |= MS_RDONLY;
+ if (!end_io)
+ f2fs_flush_merged_bios(sbi);
+}
/*
* We guarantee no failure on the returned page.
@@ -33,25 +42,39 @@ struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
struct address_space *mapping = META_MAPPING(sbi);
struct page *page = NULL;
repeat:
- page = grab_cache_page(mapping, index);
+ page = f2fs_grab_cache_page(mapping, index, false);
if (!page) {
cond_resched();
goto repeat;
}
- f2fs_wait_on_page_writeback(page, META);
- SetPageUptodate(page);
+ f2fs_wait_on_page_writeback(page, META, true);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
return page;
}
/*
* We guarantee no failure on the returned page.
*/
-struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
+static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index,
+ bool is_meta)
{
struct address_space *mapping = META_MAPPING(sbi);
struct page *page;
+ struct f2fs_io_info fio = {
+ .sbi = sbi,
+ .type = META,
+ .op = REQ_OP_READ,
+ .op_flags = REQ_META | REQ_PRIO,
+ .old_blkaddr = index,
+ .new_blkaddr = index,
+ .encrypted_page = NULL,
+ };
+
+ if (unlikely(!is_meta))
+ fio.op_flags &= ~REQ_META;
repeat:
- page = grab_cache_page(mapping, index);
+ page = f2fs_grab_cache_page(mapping, index, false);
if (!page) {
cond_resched();
goto repeat;
@@ -59,101 +82,124 @@ repeat:
if (PageUptodate(page))
goto out;
- if (f2fs_submit_page_bio(sbi, page, index,
- READ_SYNC | REQ_META | REQ_PRIO))
+ fio.page = page;
+
+ if (f2fs_submit_page_bio(&fio)) {
+ f2fs_put_page(page, 1);
goto repeat;
+ }
lock_page(page);
if (unlikely(page->mapping != mapping)) {
f2fs_put_page(page, 1);
goto repeat;
}
+
+ /*
+ * if there is any IO error when accessing device, make our filesystem
+ * readonly and make sure do not write checkpoint with non-uptodate
+ * meta page.
+ */
+ if (unlikely(!PageUptodate(page)))
+ f2fs_stop_checkpoint(sbi, false);
out:
+ mark_page_accessed(page);
return page;
}
-struct page *get_meta_page_ra(struct f2fs_sb_info *sbi, pgoff_t index)
+struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
{
- bool readahead = false;
- struct page *page;
-
- page = find_get_page(META_MAPPING(sbi), index);
- if (!page || (page && !PageUptodate(page)))
- readahead = true;
- f2fs_put_page(page, 0);
+ return __get_meta_page(sbi, index, true);
+}
- if (readahead)
- ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR);
- return get_meta_page(sbi, index);
+/* for POR only */
+struct page *get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index)
+{
+ return __get_meta_page(sbi, index, false);
}
-static inline block_t get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
+bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type)
{
switch (type) {
case META_NAT:
- return NM_I(sbi)->max_nid / NAT_ENTRY_PER_BLOCK;
+ break;
case META_SIT:
- return SIT_BLK_CNT(sbi);
+ if (unlikely(blkaddr >= SIT_BLK_CNT(sbi)))
+ return false;
+ break;
case META_SSA:
+ if (unlikely(blkaddr >= MAIN_BLKADDR(sbi) ||
+ blkaddr < SM_I(sbi)->ssa_blkaddr))
+ return false;
+ break;
case META_CP:
- return 0;
+ if (unlikely(blkaddr >= SIT_I(sbi)->sit_base_addr ||
+ blkaddr < __start_cp_addr(sbi)))
+ return false;
+ break;
case META_POR:
- return MAX_BLKADDR(sbi);
+ if (unlikely(blkaddr >= MAX_BLKADDR(sbi) ||
+ blkaddr < MAIN_BLKADDR(sbi)))
+ return false;
+ break;
default:
BUG();
}
+
+ return true;
}
/*
* Readahead CP/NAT/SIT/SSA pages
*/
-int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type)
+int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
+ int type, bool sync)
{
- block_t prev_blk_addr = 0;
struct page *page;
block_t blkno = start;
- block_t max_blks = get_max_meta_blks(sbi, type);
-
struct f2fs_io_info fio = {
+ .sbi = sbi,
.type = META,
- .rw = READ_SYNC | REQ_META | REQ_PRIO
+ .op = REQ_OP_READ,
+ .op_flags = sync ? (REQ_META | REQ_PRIO) : REQ_RAHEAD,
+ .encrypted_page = NULL,
};
+ struct blk_plug plug;
+ if (unlikely(type == META_POR))
+ fio.op_flags &= ~REQ_META;
+
+ blk_start_plug(&plug);
for (; nrpages-- > 0; blkno++) {
- block_t blk_addr;
+
+ if (!is_valid_blkaddr(sbi, blkno, type))
+ goto out;
switch (type) {
case META_NAT:
- /* get nat block addr */
- if (unlikely(blkno >= max_blks))
+ if (unlikely(blkno >=
+ NAT_BLOCK_OFFSET(NM_I(sbi)->max_nid)))
blkno = 0;
- blk_addr = current_nat_addr(sbi,
+ /* get nat block addr */
+ fio.new_blkaddr = current_nat_addr(sbi,
blkno * NAT_ENTRY_PER_BLOCK);
break;
case META_SIT:
/* get sit block addr */
- if (unlikely(blkno >= max_blks))
- goto out;
- blk_addr = current_sit_addr(sbi,
+ fio.new_blkaddr = current_sit_addr(sbi,
blkno * SIT_ENTRY_PER_BLOCK);
- if (blkno != start && prev_blk_addr + 1 != blk_addr)
- goto out;
- prev_blk_addr = blk_addr;
break;
case META_SSA:
case META_CP:
case META_POR:
- if (unlikely(blkno >= max_blks))
- goto out;
- if (unlikely(blkno < SEG0_BLKADDR(sbi)))
- goto out;
- blk_addr = blkno;
+ fio.new_blkaddr = blkno;
break;
default:
BUG();
}
- page = grab_cache_page(META_MAPPING(sbi), blk_addr);
+ page = f2fs_grab_cache_page(META_MAPPING(sbi),
+ fio.new_blkaddr, false);
if (!page)
continue;
if (PageUptodate(page)) {
@@ -161,14 +207,31 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type
continue;
}
- f2fs_submit_page_mbio(sbi, page, blk_addr, &fio);
+ fio.page = page;
+ fio.old_blkaddr = fio.new_blkaddr;
+ f2fs_submit_page_mbio(&fio);
f2fs_put_page(page, 0);
}
out:
f2fs_submit_merged_bio(sbi, META, READ);
+ blk_finish_plug(&plug);
return blkno - start;
}
+void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index)
+{
+ struct page *page;
+ bool readahead = false;
+
+ page = find_get_page(META_MAPPING(sbi), index);
+ if (!page || !PageUptodate(page))
+ readahead = true;
+ f2fs_put_page(page, 0);
+
+ if (readahead)
+ ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true);
+}
+
static int f2fs_write_meta_page(struct page *page,
struct writeback_control *wbc)
{
@@ -176,17 +239,25 @@ static int f2fs_write_meta_page(struct page *page,
trace_f2fs_writepage(page, META);
- if (unlikely(sbi->por_doing))
+ if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
goto redirty_out;
- if (wbc->for_reclaim)
+ if (wbc->for_reclaim && page->index < GET_SUM_BLOCK(sbi, 0))
goto redirty_out;
if (unlikely(f2fs_cp_error(sbi)))
goto redirty_out;
- f2fs_wait_on_page_writeback(page, META);
write_meta_page(sbi, page);
dec_page_count(sbi, F2FS_DIRTY_META);
+
+ if (wbc->for_reclaim)
+ f2fs_submit_merged_bio_cond(sbi, page->mapping->host,
+ 0, page->index, META, WRITE);
+
unlock_page(page);
+
+ if (unlikely(f2fs_cp_error(sbi)))
+ f2fs_submit_merged_bio(sbi, META, WRITE);
+
return 0;
redirty_out:
@@ -200,15 +271,16 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
long diff, written;
- trace_f2fs_writepages(mapping->host, wbc, META);
-
/* collect a number of dirty meta pages and write together */
if (wbc->for_kupdate ||
get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META))
goto skip_write;
- /* if mounting is failed, skip writing node pages */
- mutex_lock(&sbi->cp_mutex);
+ /* if locked failed, cp will flush dirty pages instead */
+ if (!mutex_trylock(&sbi->cp_mutex))
+ goto skip_write;
+
+ trace_f2fs_writepages(mapping->host, wbc, META);
diff = nr_pages_to_write(sbi, META, wbc);
written = sync_meta_pages(sbi, META, wbc->nr_to_write);
mutex_unlock(&sbi->cp_mutex);
@@ -217,6 +289,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
skip_write:
wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_META);
+ trace_f2fs_writepages(mapping->host, wbc, META);
return 0;
}
@@ -224,15 +297,18 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
long nr_to_write)
{
struct address_space *mapping = META_MAPPING(sbi);
- pgoff_t index = 0, end = LONG_MAX;
+ pgoff_t index = 0, end = ULONG_MAX, prev = ULONG_MAX;
struct pagevec pvec;
long nwritten = 0;
struct writeback_control wbc = {
.for_reclaim = 0,
};
+ struct blk_plug plug;
pagevec_init(&pvec, 0);
+ blk_start_plug(&plug);
+
while (index <= end) {
int i, nr_pages;
nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
@@ -244,6 +320,13 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
+ if (prev == ULONG_MAX)
+ prev = page->index - 1;
+ if (nr_to_write != LONG_MAX && page->index != prev + 1) {
+ pagevec_release(&pvec);
+ goto stop;
+ }
+
lock_page(page);
if (unlikely(page->mapping != mapping)) {
@@ -256,24 +339,30 @@ continue_unlock:
goto continue_unlock;
}
+ f2fs_wait_on_page_writeback(page, META, true);
+
+ BUG_ON(PageWriteback(page));
if (!clear_page_dirty_for_io(page))
goto continue_unlock;
- if (f2fs_write_meta_page(page, &wbc)) {
+ if (mapping->a_ops->writepage(page, &wbc)) {
unlock_page(page);
break;
}
nwritten++;
+ prev = page->index;
if (unlikely(nwritten >= nr_to_write))
break;
}
pagevec_release(&pvec);
cond_resched();
}
-
+stop:
if (nwritten)
f2fs_submit_merged_bio(sbi, type, WRITE);
+ blk_finish_plug(&plug);
+
return nwritten;
}
@@ -281,10 +370,13 @@ static int f2fs_set_meta_page_dirty(struct page *page)
{
trace_f2fs_set_page_dirty(page, META);
- SetPageUptodate(page);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
if (!PageDirty(page)) {
- __set_page_dirty_nobuffers(page);
+ f2fs_set_page_dirty_nobuffers(page);
inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META);
+ SetPagePrivate(page);
+ f2fs_trace_pid(page);
return 1;
}
return 0;
@@ -294,59 +386,70 @@ const struct address_space_operations f2fs_meta_aops = {
.writepage = f2fs_write_meta_page,
.writepages = f2fs_write_meta_pages,
.set_page_dirty = f2fs_set_meta_page_dirty,
+ .invalidatepage = f2fs_invalidate_page,
+ .releasepage = f2fs_release_page,
+#ifdef CONFIG_F2FS_MIGRATION
+ .migratepage = f2fs_migrate_page,
+#endif
};
static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
{
- struct ino_entry *e;
+ struct inode_management *im = &sbi->im[type];
+ struct ino_entry *e, *tmp;
+
+ tmp = f2fs_kmem_cache_alloc(ino_entry_slab, GFP_NOFS);
retry:
- spin_lock(&sbi->ino_lock[type]);
+ radix_tree_preload(GFP_NOFS | __GFP_NOFAIL);
- e = radix_tree_lookup(&sbi->ino_root[type], ino);
+ spin_lock(&im->ino_lock);
+ e = radix_tree_lookup(&im->ino_root, ino);
if (!e) {
- e = kmem_cache_alloc(ino_entry_slab, GFP_ATOMIC);
- if (!e) {
- spin_unlock(&sbi->ino_lock[type]);
- goto retry;
- }
- if (radix_tree_insert(&sbi->ino_root[type], ino, e)) {
- spin_unlock(&sbi->ino_lock[type]);
- kmem_cache_free(ino_entry_slab, e);
+ e = tmp;
+ if (radix_tree_insert(&im->ino_root, ino, e)) {
+ spin_unlock(&im->ino_lock);
+ radix_tree_preload_end();
goto retry;
}
memset(e, 0, sizeof(struct ino_entry));
e->ino = ino;
- list_add_tail(&e->list, &sbi->ino_list[type]);
+ list_add_tail(&e->list, &im->ino_list);
+ if (type != ORPHAN_INO)
+ im->ino_num++;
}
- spin_unlock(&sbi->ino_lock[type]);
+ spin_unlock(&im->ino_lock);
+ radix_tree_preload_end();
+
+ if (e != tmp)
+ kmem_cache_free(ino_entry_slab, tmp);
}
static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
{
+ struct inode_management *im = &sbi->im[type];
struct ino_entry *e;
- spin_lock(&sbi->ino_lock[type]);
- e = radix_tree_lookup(&sbi->ino_root[type], ino);
+ spin_lock(&im->ino_lock);
+ e = radix_tree_lookup(&im->ino_root, ino);
if (e) {
list_del(&e->list);
- radix_tree_delete(&sbi->ino_root[type], ino);
- if (type == ORPHAN_INO)
- sbi->n_orphans--;
- spin_unlock(&sbi->ino_lock[type]);
+ radix_tree_delete(&im->ino_root, ino);
+ im->ino_num--;
+ spin_unlock(&im->ino_lock);
kmem_cache_free(ino_entry_slab, e);
return;
}
- spin_unlock(&sbi->ino_lock[type]);
+ spin_unlock(&im->ino_lock);
}
-void add_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type)
+void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
{
/* add new dirty ino entry into list */
__add_ino_entry(sbi, ino, type);
}
-void remove_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type)
+void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
{
/* remove dirty ino entry from list */
__remove_ino_entry(sbi, ino, type);
@@ -355,55 +458,72 @@ void remove_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type)
/* mode should be APPEND_INO or UPDATE_INO */
bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode)
{
+ struct inode_management *im = &sbi->im[mode];
struct ino_entry *e;
- spin_lock(&sbi->ino_lock[mode]);
- e = radix_tree_lookup(&sbi->ino_root[mode], ino);
- spin_unlock(&sbi->ino_lock[mode]);
+
+ spin_lock(&im->ino_lock);
+ e = radix_tree_lookup(&im->ino_root, ino);
+ spin_unlock(&im->ino_lock);
return e ? true : false;
}
-void release_dirty_inode(struct f2fs_sb_info *sbi)
+void release_ino_entry(struct f2fs_sb_info *sbi, bool all)
{
struct ino_entry *e, *tmp;
int i;
- for (i = APPEND_INO; i <= UPDATE_INO; i++) {
- spin_lock(&sbi->ino_lock[i]);
- list_for_each_entry_safe(e, tmp, &sbi->ino_list[i], list) {
+ for (i = all ? ORPHAN_INO: APPEND_INO; i <= UPDATE_INO; i++) {
+ struct inode_management *im = &sbi->im[i];
+
+ spin_lock(&im->ino_lock);
+ list_for_each_entry_safe(e, tmp, &im->ino_list, list) {
list_del(&e->list);
- radix_tree_delete(&sbi->ino_root[i], e->ino);
+ radix_tree_delete(&im->ino_root, e->ino);
kmem_cache_free(ino_entry_slab, e);
+ im->ino_num--;
}
- spin_unlock(&sbi->ino_lock[i]);
+ spin_unlock(&im->ino_lock);
}
}
int acquire_orphan_inode(struct f2fs_sb_info *sbi)
{
+ struct inode_management *im = &sbi->im[ORPHAN_INO];
int err = 0;
- spin_lock(&sbi->ino_lock[ORPHAN_INO]);
- if (unlikely(sbi->n_orphans >= sbi->max_orphans))
+ spin_lock(&im->ino_lock);
+
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (time_to_inject(sbi, FAULT_ORPHAN)) {
+ spin_unlock(&im->ino_lock);
+ f2fs_show_injection_info(FAULT_ORPHAN);
+ return -ENOSPC;
+ }
+#endif
+ if (unlikely(im->ino_num >= sbi->max_orphans))
err = -ENOSPC;
else
- sbi->n_orphans++;
- spin_unlock(&sbi->ino_lock[ORPHAN_INO]);
+ im->ino_num++;
+ spin_unlock(&im->ino_lock);
return err;
}
void release_orphan_inode(struct f2fs_sb_info *sbi)
{
- spin_lock(&sbi->ino_lock[ORPHAN_INO]);
- f2fs_bug_on(sbi, sbi->n_orphans == 0);
- sbi->n_orphans--;
- spin_unlock(&sbi->ino_lock[ORPHAN_INO]);
+ struct inode_management *im = &sbi->im[ORPHAN_INO];
+
+ spin_lock(&im->ino_lock);
+ f2fs_bug_on(sbi, im->ino_num == 0);
+ im->ino_num--;
+ spin_unlock(&im->ino_lock);
}
-void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
+void add_orphan_inode(struct inode *inode)
{
/* add new orphan ino entry into list */
- __add_ino_entry(sbi, ino, ORPHAN_INO);
+ __add_ino_entry(F2FS_I_SB(inode), inode->i_ino, ORPHAN_INO);
+ update_inode_page(inode);
}
void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
@@ -412,46 +532,82 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
__remove_ino_entry(sbi, ino, ORPHAN_INO);
}
-static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
+static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
{
- struct inode *inode = f2fs_iget(sbi->sb, ino);
- f2fs_bug_on(sbi, IS_ERR(inode));
+ struct inode *inode;
+ struct node_info ni;
+ int err = acquire_orphan_inode(sbi);
+
+ if (err) {
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "%s: orphan failed (ino=%x), run fsck to fix.",
+ __func__, ino);
+ return err;
+ }
+
+ __add_ino_entry(sbi, ino, ORPHAN_INO);
+
+ inode = f2fs_iget_retry(sbi->sb, ino);
+ if (IS_ERR(inode)) {
+ /*
+ * there should be a bug that we can't find the entry
+ * to orphan inode.
+ */
+ f2fs_bug_on(sbi, PTR_ERR(inode) == -ENOENT);
+ return PTR_ERR(inode);
+ }
+
clear_nlink(inode);
/* truncate all the data during iput */
iput(inode);
+
+ get_node_info(sbi, ino, &ni);
+
+ /* ENOMEM was fully retried in f2fs_evict_inode. */
+ if (ni.blk_addr != NULL_ADDR) {
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "%s: orphan failed (ino=%x) by kernel, retry mount.",
+ __func__, ino);
+ return -EIO;
+ }
+ __remove_ino_entry(sbi, ino, ORPHAN_INO);
+ return 0;
}
-void recover_orphan_inodes(struct f2fs_sb_info *sbi)
+int recover_orphan_inodes(struct f2fs_sb_info *sbi)
{
- block_t start_blk, orphan_blkaddr, i, j;
-
- if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG))
- return;
+ block_t start_blk, orphan_blocks, i, j;
+ int err;
- sbi->por_doing = true;
+ if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG))
+ return 0;
- start_blk = __start_cp_addr(sbi) + 1 +
- le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload);
- orphan_blkaddr = __start_sum_addr(sbi) - 1;
+ start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi);
+ orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi);
- ra_meta_pages(sbi, start_blk, orphan_blkaddr, META_CP);
+ ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP, true);
- for (i = 0; i < orphan_blkaddr; i++) {
+ for (i = 0; i < orphan_blocks; i++) {
struct page *page = get_meta_page(sbi, start_blk + i);
struct f2fs_orphan_block *orphan_blk;
orphan_blk = (struct f2fs_orphan_block *)page_address(page);
for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) {
nid_t ino = le32_to_cpu(orphan_blk->ino[j]);
- recover_orphan_inode(sbi, ino);
+ err = recover_orphan_inode(sbi, ino);
+ if (err) {
+ f2fs_put_page(page, 1);
+ return err;
+ }
}
f2fs_put_page(page, 1);
}
/* clear Orphan Flag */
- clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG);
- sbi->por_doing = false;
- return;
+ clear_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG);
+ return 0;
}
static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
@@ -459,28 +615,28 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
struct list_head *head;
struct f2fs_orphan_block *orphan_blk = NULL;
unsigned int nentries = 0;
- unsigned short index;
- unsigned short orphan_blocks =
- (unsigned short)GET_ORPHAN_BLOCKS(sbi->n_orphans);
+ unsigned short index = 1;
+ unsigned short orphan_blocks;
struct page *page = NULL;
struct ino_entry *orphan = NULL;
+ struct inode_management *im = &sbi->im[ORPHAN_INO];
- for (index = 0; index < orphan_blocks; index++)
- grab_meta_page(sbi, start_blk + index);
+ orphan_blocks = GET_ORPHAN_BLOCKS(im->ino_num);
- index = 1;
- spin_lock(&sbi->ino_lock[ORPHAN_INO]);
- head = &sbi->ino_list[ORPHAN_INO];
+ /*
+ * we don't need to do spin_lock(&im->ino_lock) here, since all the
+ * orphan inode operations are covered under f2fs_lock_op().
+ * And, spin_lock should be avoided due to page operations below.
+ */
+ head = &im->ino_list;
/* loop for each orphan inode entry and write them in Jornal block */
list_for_each_entry(orphan, head, list) {
if (!page) {
- page = find_get_page(META_MAPPING(sbi), start_blk++);
- f2fs_bug_on(sbi, !page);
+ page = grab_meta_page(sbi, start_blk++);
orphan_blk =
(struct f2fs_orphan_block *)page_address(page);
memset(orphan_blk, 0, sizeof(*orphan_blk));
- f2fs_put_page(page, 0);
}
orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino);
@@ -509,49 +665,56 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
set_page_dirty(page);
f2fs_put_page(page, 1);
}
-
- spin_unlock(&sbi->ino_lock[ORPHAN_INO]);
}
-static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
- block_t cp_addr, unsigned long long *version)
+static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr,
+ struct f2fs_checkpoint **cp_block, struct page **cp_page,
+ unsigned long long *version)
{
- struct page *cp_page_1, *cp_page_2 = NULL;
unsigned long blk_size = sbi->blocksize;
- struct f2fs_checkpoint *cp_block;
- unsigned long long cur_version = 0, pre_version = 0;
- size_t crc_offset;
+ size_t crc_offset = 0;
__u32 crc = 0;
- /* Read the 1st cp block in this CP pack */
- cp_page_1 = get_meta_page(sbi, cp_addr);
+ *cp_page = get_meta_page(sbi, cp_addr);
+ *cp_block = (struct f2fs_checkpoint *)page_address(*cp_page);
- /* get the version number */
- cp_block = (struct f2fs_checkpoint *)page_address(cp_page_1);
- crc_offset = le32_to_cpu(cp_block->checksum_offset);
- if (crc_offset >= blk_size)
- goto invalid_cp1;
+ crc_offset = le32_to_cpu((*cp_block)->checksum_offset);
+ if (crc_offset > (blk_size - sizeof(__le32))) {
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "invalid crc_offset: %zu", crc_offset);
+ return -EINVAL;
+ }
- crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset)));
- if (!f2fs_crc_valid(crc, cp_block, crc_offset))
- goto invalid_cp1;
+ crc = cur_cp_crc(*cp_block);
+ if (!f2fs_crc_valid(sbi, crc, *cp_block, crc_offset)) {
+ f2fs_msg(sbi->sb, KERN_WARNING, "invalid crc value");
+ return -EINVAL;
+ }
- pre_version = cur_cp_version(cp_block);
+ *version = cur_cp_version(*cp_block);
+ return 0;
+}
- /* Read the 2nd cp block in this CP pack */
- cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1;
- cp_page_2 = get_meta_page(sbi, cp_addr);
+static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
+ block_t cp_addr, unsigned long long *version)
+{
+ struct page *cp_page_1 = NULL, *cp_page_2 = NULL;
+ struct f2fs_checkpoint *cp_block = NULL;
+ unsigned long long cur_version = 0, pre_version = 0;
+ int err;
- cp_block = (struct f2fs_checkpoint *)page_address(cp_page_2);
- crc_offset = le32_to_cpu(cp_block->checksum_offset);
- if (crc_offset >= blk_size)
- goto invalid_cp2;
+ err = get_checkpoint_version(sbi, cp_addr, &cp_block,
+ &cp_page_1, version);
+ if (err)
+ goto invalid_cp1;
+ pre_version = *version;
- crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset)));
- if (!f2fs_crc_valid(crc, cp_block, crc_offset))
+ cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1;
+ err = get_checkpoint_version(sbi, cp_addr, &cp_block,
+ &cp_page_2, version);
+ if (err)
goto invalid_cp2;
-
- cur_version = cur_cp_version(cp_block);
+ cur_version = *version;
if (cur_version == pre_version) {
*version = cur_version;
@@ -573,7 +736,7 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi)
unsigned long blk_size = sbi->blocksize;
unsigned long long cp1_version = 0, cp2_version = 0;
unsigned long long cp_start_blk_no;
- unsigned int cp_blks = 1 + le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload);
+ unsigned int cp_blks = 1 + __cp_payload(sbi);
block_t cp_blk_no;
int i;
@@ -608,6 +771,15 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi)
cp_block = (struct f2fs_checkpoint *)page_address(cur_page);
memcpy(sbi->ckpt, cp_block, blk_size);
+ /* Sanity checking of checkpoint */
+ if (sanity_check_ckpt(sbi))
+ goto free_fail_no_cp;
+
+ if (cur_page == cp1)
+ sbi->cur_cp_pack = 1;
+ else
+ sbi->cur_cp_pack = 2;
+
if (cp_blks <= 1)
goto done;
@@ -629,118 +801,104 @@ done:
f2fs_put_page(cp2, 1);
return 0;
+free_fail_no_cp:
+ f2fs_put_page(cp1, 1);
+ f2fs_put_page(cp2, 1);
fail_no_cp:
kfree(sbi->ckpt);
return -EINVAL;
}
-static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
+static void __add_dirty_inode(struct inode *inode, enum inode_type type)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE;
- if (is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR))
- return -EEXIST;
+ if (is_inode_flag_set(inode, flag))
+ return;
- set_inode_flag(F2FS_I(inode), FI_DIRTY_DIR);
- F2FS_I(inode)->dirty_dir = new;
- list_add_tail(&new->list, &sbi->dir_inode_list);
- stat_inc_dirty_dir(sbi);
- return 0;
+ set_inode_flag(inode, flag);
+ if (!f2fs_is_volatile_file(inode))
+ list_add_tail(&F2FS_I(inode)->dirty_list,
+ &sbi->inode_list[type]);
+ stat_inc_dirty_inode(sbi, type);
}
-void update_dirty_page(struct inode *inode, struct page *page)
+static void __remove_dirty_inode(struct inode *inode, enum inode_type type)
{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct dir_inode_entry *new;
- int ret = 0;
+ int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE;
- if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode))
+ if (get_dirty_pages(inode) || !is_inode_flag_set(inode, flag))
return;
- if (!S_ISDIR(inode->i_mode)) {
- inode_inc_dirty_pages(inode);
- goto out;
- }
-
- new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
- new->inode = inode;
- INIT_LIST_HEAD(&new->list);
-
- spin_lock(&sbi->dir_inode_lock);
- ret = __add_dirty_inode(inode, new);
- inode_inc_dirty_pages(inode);
- spin_unlock(&sbi->dir_inode_lock);
-
- if (ret)
- kmem_cache_free(inode_entry_slab, new);
-out:
- SetPagePrivate(page);
+ list_del_init(&F2FS_I(inode)->dirty_list);
+ clear_inode_flag(inode, flag);
+ stat_dec_dirty_inode(F2FS_I_SB(inode), type);
}
-void add_dirty_dir_inode(struct inode *inode)
+void update_dirty_page(struct inode *inode, struct page *page)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct dir_inode_entry *new =
- f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
- int ret = 0;
+ enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE;
- new->inode = inode;
- INIT_LIST_HEAD(&new->list);
+ if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) &&
+ !S_ISLNK(inode->i_mode))
+ return;
- spin_lock(&sbi->dir_inode_lock);
- ret = __add_dirty_inode(inode, new);
- spin_unlock(&sbi->dir_inode_lock);
+ spin_lock(&sbi->inode_lock[type]);
+ if (type != FILE_INODE || test_opt(sbi, DATA_FLUSH))
+ __add_dirty_inode(inode, type);
+ inode_inc_dirty_pages(inode);
+ spin_unlock(&sbi->inode_lock[type]);
- if (ret)
- kmem_cache_free(inode_entry_slab, new);
+ SetPagePrivate(page);
+ f2fs_trace_pid(page);
}
-void remove_dirty_dir_inode(struct inode *inode)
+void remove_dirty_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct dir_inode_entry *entry;
+ enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE;
- if (!S_ISDIR(inode->i_mode))
+ if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) &&
+ !S_ISLNK(inode->i_mode))
return;
- spin_lock(&sbi->dir_inode_lock);
- if (get_dirty_pages(inode) ||
- !is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) {
- spin_unlock(&sbi->dir_inode_lock);
+ if (type == FILE_INODE && !test_opt(sbi, DATA_FLUSH))
return;
- }
- entry = F2FS_I(inode)->dirty_dir;
- list_del(&entry->list);
- F2FS_I(inode)->dirty_dir = NULL;
- clear_inode_flag(F2FS_I(inode), FI_DIRTY_DIR);
- stat_dec_dirty_dir(sbi);
- spin_unlock(&sbi->dir_inode_lock);
- kmem_cache_free(inode_entry_slab, entry);
-
- /* Only from the recovery routine */
- if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) {
- clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT);
- iput(inode);
- }
+ spin_lock(&sbi->inode_lock[type]);
+ __remove_dirty_inode(inode, type);
+ spin_unlock(&sbi->inode_lock[type]);
}
-void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi)
+int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type)
{
struct list_head *head;
- struct dir_inode_entry *entry;
struct inode *inode;
+ struct f2fs_inode_info *fi;
+ bool is_dir = (type == DIR_INODE);
+
+ trace_f2fs_sync_dirty_inodes_enter(sbi->sb, is_dir,
+ get_pages(sbi, is_dir ?
+ F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA));
retry:
- spin_lock(&sbi->dir_inode_lock);
+ if (unlikely(f2fs_cp_error(sbi)))
+ return -EIO;
- head = &sbi->dir_inode_list;
+ spin_lock(&sbi->inode_lock[type]);
+
+ head = &sbi->inode_list[type];
if (list_empty(head)) {
- spin_unlock(&sbi->dir_inode_lock);
- return;
+ spin_unlock(&sbi->inode_lock[type]);
+ trace_f2fs_sync_dirty_inodes_exit(sbi->sb, is_dir,
+ get_pages(sbi, is_dir ?
+ F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA));
+ return 0;
}
- entry = list_entry(head->next, struct dir_inode_entry, list);
- inode = igrab(entry->inode);
- spin_unlock(&sbi->dir_inode_lock);
+ fi = list_first_entry(head, struct f2fs_inode_info, dirty_list);
+ inode = igrab(&fi->vfs_inode);
+ spin_unlock(&sbi->inode_lock[type]);
if (inode) {
filemap_fdatawrite(inode->i_mapping);
iput(inode);
@@ -750,10 +908,56 @@ retry:
* wribacking dentry pages in the freeing inode.
*/
f2fs_submit_merged_bio(sbi, DATA, WRITE);
+ cond_resched();
}
goto retry;
}
+int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi)
+{
+ struct list_head *head = &sbi->inode_list[DIRTY_META];
+ struct inode *inode;
+ struct f2fs_inode_info *fi;
+ s64 total = get_pages(sbi, F2FS_DIRTY_IMETA);
+
+ while (total--) {
+ if (unlikely(f2fs_cp_error(sbi)))
+ return -EIO;
+
+ spin_lock(&sbi->inode_lock[DIRTY_META]);
+ if (list_empty(head)) {
+ spin_unlock(&sbi->inode_lock[DIRTY_META]);
+ return 0;
+ }
+ fi = list_first_entry(head, struct f2fs_inode_info,
+ gdirty_list);
+ inode = igrab(&fi->vfs_inode);
+ spin_unlock(&sbi->inode_lock[DIRTY_META]);
+ if (inode) {
+ sync_inode_metadata(inode, 0);
+
+ /* it's on eviction */
+ if (is_inode_flag_set(inode, FI_DIRTY_INODE))
+ update_inode_page(inode);
+ iput(inode);
+ }
+ };
+ return 0;
+}
+
+static void __prepare_cp_block(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ nid_t last_nid = nm_i->next_scan_nid;
+
+ next_free_nid(sbi, &last_nid);
+ ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi));
+ ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi));
+ ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi));
+ ckpt->next_free_nid = cpu_to_le32(last_nid);
+}
+
/*
* Freeze all the FS-operations for checkpoint.
*/
@@ -774,31 +978,50 @@ retry_flush_dents:
/* write all the dirty dentry pages */
if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
f2fs_unlock_all(sbi);
- sync_dirty_dir_inodes(sbi);
- if (unlikely(f2fs_cp_error(sbi))) {
- err = -EIO;
+ err = sync_dirty_inodes(sbi, DIR_INODE);
+ if (err)
goto out;
- }
+ cond_resched();
goto retry_flush_dents;
}
/*
* POR: we should ensure that there are no dirty node pages
- * until finishing nat/sit flush.
+ * until finishing nat/sit flush. inode->i_blocks can be updated.
*/
+ down_write(&sbi->node_change);
+
+ if (get_pages(sbi, F2FS_DIRTY_IMETA)) {
+ up_write(&sbi->node_change);
+ f2fs_unlock_all(sbi);
+ err = f2fs_sync_inode_meta(sbi);
+ if (err)
+ goto out;
+ cond_resched();
+ goto retry_flush_dents;
+ }
+
retry_flush_nodes:
down_write(&sbi->node_write);
if (get_pages(sbi, F2FS_DIRTY_NODES)) {
up_write(&sbi->node_write);
- sync_node_pages(sbi, 0, &wbc);
- if (unlikely(f2fs_cp_error(sbi))) {
+ err = sync_node_pages(sbi, &wbc);
+ if (err) {
+ up_write(&sbi->node_change);
f2fs_unlock_all(sbi);
- err = -EIO;
goto out;
}
+ cond_resched();
goto retry_flush_nodes;
}
+
+ /*
+ * sbi->node_change is used only for AIO write_begin path which produces
+ * dirty node blocks and some checkpoint values by block allocation.
+ */
+ __prepare_cp_block(sbi);
+ up_write(&sbi->node_change);
out:
blk_finish_plug(&plug);
return err;
@@ -817,49 +1040,79 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
for (;;) {
prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE);
- if (!get_pages(sbi, F2FS_WRITEBACK))
+ if (!get_pages(sbi, F2FS_WB_CP_DATA))
break;
- io_schedule();
+ io_schedule_timeout(5*HZ);
}
finish_wait(&sbi->cp_wait, &wait);
}
-static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
+static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
+{
+ unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num;
+ struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+
+ spin_lock(&sbi->cp_lock);
+
+ if ((cpc->reason & CP_UMOUNT) &&
+ le32_to_cpu(ckpt->cp_pack_total_block_count) >
+ sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks)
+ disable_nat_bits(sbi, false);
+
+ if (cpc->reason & CP_TRIMMED)
+ __set_ckpt_flags(ckpt, CP_TRIMMED_FLAG);
+
+ if (cpc->reason & CP_UMOUNT)
+ __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
+ else
+ __clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
+
+ if (cpc->reason & CP_FASTBOOT)
+ __set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
+ else
+ __clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
+
+ if (orphan_num)
+ __set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
+ else
+ __clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
+
+ if (is_sbi_flag_set(sbi, SBI_NEED_FSCK))
+ __set_ckpt_flags(ckpt, CP_FSCK_FLAG);
+
+ /* set this flag to activate crc|cp_ver for recovery */
+ __set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG);
+
+ spin_unlock(&sbi->cp_lock);
+}
+
+static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
- struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
struct f2fs_nm_info *nm_i = NM_I(sbi);
- nid_t last_nid = nm_i->next_scan_nid;
+ unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num;
block_t start_blk;
- struct page *cp_page;
unsigned int data_sum_blocks, orphan_blocks;
__u32 crc32 = 0;
- void *kaddr;
int i;
- int cp_payload_blks = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload);
-
- /*
- * This avoids to conduct wrong roll-forward operations and uses
- * metapages, so should be called prior to sync_meta_pages below.
- */
- discard_next_dnode(sbi, NEXT_FREE_BLKADDR(sbi, curseg));
+ int cp_payload_blks = __cp_payload(sbi);
+ struct super_block *sb = sbi->sb;
+ struct curseg_info *seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE);
+ u64 kbytes_written;
/* Flush all the NAT/SIT pages */
while (get_pages(sbi, F2FS_DIRTY_META)) {
sync_meta_pages(sbi, META, LONG_MAX);
if (unlikely(f2fs_cp_error(sbi)))
- return;
+ return -EIO;
}
- next_free_nid(sbi, &last_nid);
-
/*
* modify checkpoint
* version number is already updated
*/
ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi));
- ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi));
ckpt->free_segment_count = cpu_to_le32(free_segments(sbi));
for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) {
ckpt->cur_node_segno[i] =
@@ -878,136 +1131,189 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
curseg_alloc_type(sbi, i + CURSEG_HOT_DATA);
}
- ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi));
- ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi));
- ckpt->next_free_nid = cpu_to_le32(last_nid);
-
/* 2 cp + n data seg summary + orphan inode blocks */
- data_sum_blocks = npages_for_summary_flush(sbi);
+ data_sum_blocks = npages_for_summary_flush(sbi, false);
+ spin_lock(&sbi->cp_lock);
if (data_sum_blocks < NR_CURSEG_DATA_TYPE)
- set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
+ __set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
else
- clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
+ __clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
+ spin_unlock(&sbi->cp_lock);
- orphan_blocks = GET_ORPHAN_BLOCKS(sbi->n_orphans);
+ orphan_blocks = GET_ORPHAN_BLOCKS(orphan_num);
ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks +
orphan_blocks);
- if (cpc->reason == CP_UMOUNT) {
- set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
+ if (__remain_node_summaries(cpc->reason))
ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS+
cp_payload_blks + data_sum_blocks +
orphan_blocks + NR_CURSEG_NODE_TYPE);
- } else {
- clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
+ else
ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS +
cp_payload_blks + data_sum_blocks +
orphan_blocks);
- }
-
- if (sbi->n_orphans)
- set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
- else
- clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
- if (sbi->need_fsck)
- set_ckpt_flags(ckpt, CP_FSCK_FLAG);
+ /* update ckpt flag for checkpoint */
+ update_ckpt_flags(sbi, cpc);
/* update SIT/NAT bitmap */
get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP));
get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));
- crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset));
+ crc32 = f2fs_crc32(sbi, ckpt, le32_to_cpu(ckpt->checksum_offset));
*((__le32 *)((unsigned char *)ckpt +
le32_to_cpu(ckpt->checksum_offset)))
= cpu_to_le32(crc32);
- start_blk = __start_cp_addr(sbi);
+ start_blk = __start_cp_next_addr(sbi);
- /* write out checkpoint buffer at block 0 */
- cp_page = grab_meta_page(sbi, start_blk++);
- kaddr = page_address(cp_page);
- memcpy(kaddr, ckpt, (1 << sbi->log_blocksize));
- set_page_dirty(cp_page);
- f2fs_put_page(cp_page, 1);
-
- for (i = 1; i < 1 + cp_payload_blks; i++) {
- cp_page = grab_meta_page(sbi, start_blk++);
- kaddr = page_address(cp_page);
- memcpy(kaddr, (char *)ckpt + i * F2FS_BLKSIZE,
- (1 << sbi->log_blocksize));
- set_page_dirty(cp_page);
- f2fs_put_page(cp_page, 1);
+ /* write nat bits */
+ if (enabled_nat_bits(sbi, cpc)) {
+ __u64 cp_ver = cur_cp_version(ckpt);
+ block_t blk;
+
+ cp_ver |= ((__u64)crc32 << 32);
+ *(__le64 *)nm_i->nat_bits = cpu_to_le64(cp_ver);
+
+ blk = start_blk + sbi->blocks_per_seg - nm_i->nat_bits_blocks;
+ for (i = 0; i < nm_i->nat_bits_blocks; i++)
+ update_meta_page(sbi, nm_i->nat_bits +
+ (i << F2FS_BLKSIZE_BITS), blk + i);
+
+ /* Flush all the NAT BITS pages */
+ while (get_pages(sbi, F2FS_DIRTY_META)) {
+ sync_meta_pages(sbi, META, LONG_MAX);
+ if (unlikely(f2fs_cp_error(sbi)))
+ return -EIO;
+ }
}
- if (sbi->n_orphans) {
+ /* need to wait for end_io results */
+ wait_on_all_pages_writeback(sbi);
+ if (unlikely(f2fs_cp_error(sbi)))
+ return -EIO;
+
+ /* write out checkpoint buffer at block 0 */
+ update_meta_page(sbi, ckpt, start_blk++);
+
+ for (i = 1; i < 1 + cp_payload_blks; i++)
+ update_meta_page(sbi, (char *)ckpt + i * F2FS_BLKSIZE,
+ start_blk++);
+
+ if (orphan_num) {
write_orphan_inodes(sbi, start_blk);
start_blk += orphan_blocks;
}
write_data_summaries(sbi, start_blk);
start_blk += data_sum_blocks;
- if (cpc->reason == CP_UMOUNT) {
+
+ /* Record write statistics in the hot node summary */
+ kbytes_written = sbi->kbytes_written;
+ if (sb->s_bdev->bd_part)
+ kbytes_written += BD_PART_WRITTEN(sbi);
+
+ seg_i->journal->info.kbytes_written = cpu_to_le64(kbytes_written);
+
+ if (__remain_node_summaries(cpc->reason)) {
write_node_summaries(sbi, start_blk);
start_blk += NR_CURSEG_NODE_TYPE;
}
/* writeout checkpoint block */
- cp_page = grab_meta_page(sbi, start_blk);
- kaddr = page_address(cp_page);
- memcpy(kaddr, ckpt, (1 << sbi->log_blocksize));
- set_page_dirty(cp_page);
- f2fs_put_page(cp_page, 1);
+ update_meta_page(sbi, ckpt, start_blk);
/* wait for previous submitted node/meta pages writeback */
wait_on_all_pages_writeback(sbi);
if (unlikely(f2fs_cp_error(sbi)))
- return;
+ return -EIO;
- filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX);
- filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX);
+ filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LLONG_MAX);
+ filemap_fdatawait_range(META_MAPPING(sbi), 0, LLONG_MAX);
/* update user_block_counts */
sbi->last_valid_block_count = sbi->total_valid_block_count;
- sbi->alloc_valid_block_count = 0;
+ percpu_counter_set(&sbi->alloc_valid_block_count, 0);
/* Here, we only have one bio having CP pack */
sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
- release_dirty_inode(sbi);
+ /* wait for previous submitted meta pages writeback */
+ wait_on_all_pages_writeback(sbi);
+
+ release_ino_entry(sbi, false);
if (unlikely(f2fs_cp_error(sbi)))
- return;
+ return -EIO;
+
+ clear_sbi_flag(sbi, SBI_IS_DIRTY);
+ clear_sbi_flag(sbi, SBI_NEED_CP);
+ __set_cp_next_pack(sbi);
+
+ /*
+ * redirty superblock if metadata like node page or inode cache is
+ * updated during writing checkpoint.
+ */
+ if (get_pages(sbi, F2FS_DIRTY_NODES) ||
+ get_pages(sbi, F2FS_DIRTY_IMETA))
+ set_sbi_flag(sbi, SBI_IS_DIRTY);
- clear_prefree_segments(sbi);
- F2FS_RESET_SB_DIRT(sbi);
+ f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_DENTS));
+
+ return 0;
}
/*
* We guarantee that this checkpoint procedure will not fail.
*/
-void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
+int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
unsigned long long ckpt_ver;
-
- trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops");
+ int err = 0;
mutex_lock(&sbi->cp_mutex);
- if (!sbi->s_dirty && cpc->reason != CP_DISCARD)
+ if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) &&
+ ((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) ||
+ ((cpc->reason & CP_DISCARD) && !sbi->discard_blks)))
goto out;
- if (unlikely(f2fs_cp_error(sbi)))
+ if (unlikely(f2fs_cp_error(sbi))) {
+ err = -EIO;
+ goto out;
+ }
+ if (f2fs_readonly(sbi->sb)) {
+ err = -EROFS;
goto out;
- if (block_operations(sbi))
+ }
+
+ trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops");
+
+ err = block_operations(sbi);
+ if (err)
goto out;
trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops");
- f2fs_submit_merged_bio(sbi, DATA, WRITE);
- f2fs_submit_merged_bio(sbi, NODE, WRITE);
- f2fs_submit_merged_bio(sbi, META, WRITE);
+ f2fs_flush_merged_bios(sbi);
+
+ /* this is the case of multiple fstrims without any changes */
+ if (cpc->reason & CP_DISCARD) {
+ if (!exist_trim_candidates(sbi, cpc)) {
+ unblock_operations(sbi);
+ goto out;
+ }
+
+ if (NM_I(sbi)->dirty_nat_cnt == 0 &&
+ SIT_I(sbi)->dirty_sentries == 0 &&
+ prefree_segments(sbi) == 0) {
+ flush_sit_entries(sbi, cpc);
+ clear_prefree_segments(sbi, cpc);
+ unblock_operations(sbi);
+ goto out;
+ }
+ }
/*
* update checkpoint pack index
@@ -1018,17 +1324,29 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver);
/* write cached NAT/SIT entries to NAT/SIT area */
- flush_nat_entries(sbi);
+ flush_nat_entries(sbi, cpc);
flush_sit_entries(sbi, cpc);
/* unlock all the fs_lock[] in do_checkpoint() */
- do_checkpoint(sbi, cpc);
+ err = do_checkpoint(sbi, cpc);
+ if (err)
+ release_discard_addrs(sbi);
+ else
+ clear_prefree_segments(sbi, cpc);
unblock_operations(sbi);
stat_inc_cp_count(sbi->stat_info);
+
+ if (cpc->reason & CP_RECOVERY)
+ f2fs_msg(sbi->sb, KERN_NOTICE,
+ "checkpoint: version = %llx", ckpt_ver);
+
+ /* do checkpoint periodically */
+ f2fs_update_time(sbi, CP_TIME);
+ trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
out:
mutex_unlock(&sbi->cp_mutex);
- trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
+ return err;
}
void init_ino_entry_info(struct f2fs_sb_info *sbi)
@@ -1036,20 +1354,17 @@ void init_ino_entry_info(struct f2fs_sb_info *sbi)
int i;
for (i = 0; i < MAX_INO_ENTRY; i++) {
- INIT_RADIX_TREE(&sbi->ino_root[i], GFP_ATOMIC);
- spin_lock_init(&sbi->ino_lock[i]);
- INIT_LIST_HEAD(&sbi->ino_list[i]);
+ struct inode_management *im = &sbi->im[i];
+
+ INIT_RADIX_TREE(&im->ino_root, GFP_ATOMIC);
+ spin_lock_init(&im->ino_lock);
+ INIT_LIST_HEAD(&im->ino_list);
+ im->ino_num = 0;
}
- /*
- * considering 512 blocks in a segment 8 blocks are needed for cp
- * and log segment summaries. Remaining blocks are used to keep
- * orphan entries with the limitation one reserved segment
- * for cp pack we can have max 1020*504 orphan entries
- */
- sbi->n_orphans = 0;
sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS -
- NR_CURSEG_TYPE) * F2FS_ORPHANS_PER_BLOCK;
+ NR_CURSEG_TYPE - __cp_payload(sbi)) *
+ F2FS_ORPHANS_PER_BLOCK;
}
int __init create_checkpoint_caches(void)
@@ -1058,8 +1373,8 @@ int __init create_checkpoint_caches(void)
sizeof(struct ino_entry));
if (!ino_entry_slab)
return -ENOMEM;
- inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry",
- sizeof(struct dir_inode_entry));
+ inode_entry_slab = f2fs_kmem_cache_create("f2fs_inode_entry",
+ sizeof(struct inode_entry));
if (!inode_entry_slab) {
kmem_cache_destroy(ino_entry_slab);
return -ENOMEM;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index f988b01b6f89..dd97cbe74b0e 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -15,25 +15,69 @@
#include <linux/aio.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
+#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/prefetch.h>
+#include <linux/uio.h>
+#include <linux/mm.h>
+#include <linux/memcontrol.h>
+#include <linux/cleancache.h>
#include "f2fs.h"
#include "node.h"
#include "segment.h"
+#include "trace.h"
#include <trace/events/f2fs.h>
+#include <trace/events/android_fs.h>
+
+static bool __is_cp_guaranteed(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ struct inode *inode;
+ struct f2fs_sb_info *sbi;
+
+ if (!mapping)
+ return false;
+
+ inode = mapping->host;
+ sbi = F2FS_I_SB(inode);
+
+ if (inode->i_ino == F2FS_META_INO(sbi) ||
+ inode->i_ino == F2FS_NODE_INO(sbi) ||
+ S_ISDIR(inode->i_mode) ||
+ is_cold_data(page))
+ return true;
+ return false;
+}
static void f2fs_read_end_io(struct bio *bio, int err)
{
struct bio_vec *bvec;
int i;
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) {
+ f2fs_show_injection_info(FAULT_IO);
+ err = -EIO;
+ }
+#endif
+
+ if (f2fs_bio_encrypted(bio)) {
+ if (err) {
+ fscrypt_release_ctx(bio->bi_private);
+ } else {
+ fscrypt_decrypt_bio_pages(bio->bi_private, bio);
+ return;
+ }
+ }
+
bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
if (!err) {
- SetPageUptodate(page);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
} else {
ClearPageUptodate(page);
SetPageError(page);
@@ -51,26 +95,74 @@ static void f2fs_write_end_io(struct bio *bio, int err)
bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
+ enum count_type type = WB_DATA_TYPE(page);
+
+ if (IS_DUMMY_WRITTEN_PAGE(page)) {
+ set_page_private(page, (unsigned long)NULL);
+ ClearPagePrivate(page);
+ unlock_page(page);
+ mempool_free(page, sbi->write_io_dummy);
+
+ if (unlikely(err))
+ f2fs_stop_checkpoint(sbi, true);
+ continue;
+ }
+
+ fscrypt_pullback_bio_page(&page, true);
if (unlikely(err)) {
- set_page_dirty(page);
set_bit(AS_EIO, &page->mapping->flags);
- f2fs_stop_checkpoint(sbi);
+ f2fs_stop_checkpoint(sbi, true);
}
+ dec_page_count(sbi, type);
+ clear_cold_data(page);
end_page_writeback(page);
- dec_page_count(sbi, F2FS_WRITEBACK);
}
+ if (!get_pages(sbi, F2FS_WB_CP_DATA) &&
+ wq_has_sleeper(&sbi->cp_wait))
+ wake_up(&sbi->cp_wait);
+
+ bio_put(bio);
+}
- if (sbi->wait_io) {
- complete(sbi->wait_io);
- sbi->wait_io = NULL;
+/*
+ * Return true, if pre_bio's bdev is same as its target device.
+ */
+struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi,
+ block_t blk_addr, struct bio *bio)
+{
+ struct block_device *bdev = sbi->sb->s_bdev;
+ int i;
+
+ for (i = 0; i < sbi->s_ndevs; i++) {
+ if (FDEV(i).start_blk <= blk_addr &&
+ FDEV(i).end_blk >= blk_addr) {
+ blk_addr -= FDEV(i).start_blk;
+ bdev = FDEV(i).bdev;
+ break;
+ }
}
+ if (bio) {
+ bio->bi_bdev = bdev;
+ bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
+ }
+ return bdev;
+}
- if (!get_pages(sbi, F2FS_WRITEBACK) &&
- !list_empty(&sbi->cp_wait.task_list))
- wake_up(&sbi->cp_wait);
+int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr)
+{
+ int i;
- bio_put(bio);
+ for (i = 0; i < sbi->s_ndevs; i++)
+ if (FDEV(i).start_blk <= blkaddr && FDEV(i).end_blk >= blkaddr)
+ return i;
+ return 0;
+}
+
+static bool __same_bdev(struct f2fs_sb_info *sbi,
+ block_t blk_addr, struct bio *bio)
+{
+ return f2fs_target_device(sbi, blk_addr, NULL) == bio->bi_bdev;
}
/*
@@ -81,52 +173,127 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
{
struct bio *bio;
- /* No failure on bio allocation */
- bio = bio_alloc(GFP_NOIO, npages);
+ bio = f2fs_bio_alloc(npages);
- bio->bi_bdev = sbi->sb->s_bdev;
- bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
+ f2fs_target_device(sbi, blk_addr, bio);
bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
- bio->bi_private = sbi;
+ bio->bi_private = is_read ? NULL : sbi;
return bio;
}
+static inline void __submit_bio(struct f2fs_sb_info *sbi,
+ struct bio *bio, enum page_type type)
+{
+ if (!is_read_io(bio_op(bio))) {
+ unsigned int start;
+
+ if (f2fs_sb_mounted_blkzoned(sbi->sb) &&
+ current->plug && (type == DATA || type == NODE))
+ blk_finish_plug(current->plug);
+
+ if (type != DATA && type != NODE)
+ goto submit_io;
+
+ start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS;
+ start %= F2FS_IO_SIZE(sbi);
+
+ if (start == 0)
+ goto submit_io;
+
+ /* fill dummy pages */
+ for (; start < F2FS_IO_SIZE(sbi); start++) {
+ struct page *page =
+ mempool_alloc(sbi->write_io_dummy,
+ GFP_NOIO | __GFP_ZERO | __GFP_NOFAIL);
+ f2fs_bug_on(sbi, !page);
+
+ SetPagePrivate(page);
+ set_page_private(page, (unsigned long)DUMMY_WRITTEN_PAGE);
+ lock_page(page);
+ if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE)
+ f2fs_bug_on(sbi, 1);
+ }
+ /*
+ * In the NODE case, we lose next block address chain. So, we
+ * need to do checkpoint in f2fs_sync_file.
+ */
+ if (type == NODE)
+ set_sbi_flag(sbi, SBI_NEED_CP);
+ }
+submit_io:
+ if (is_read_io(bio_op(bio)))
+ trace_f2fs_submit_read_bio(sbi->sb, type, bio);
+ else
+ trace_f2fs_submit_write_bio(sbi->sb, type, bio);
+ submit_bio(0, bio);
+}
+
static void __submit_merged_bio(struct f2fs_bio_info *io)
{
struct f2fs_io_info *fio = &io->fio;
- int rw;
if (!io->bio)
return;
- rw = fio->rw;
+ bio_set_op_attrs(io->bio, fio->op, fio->op_flags);
- if (is_read_io(rw)) {
- trace_f2fs_submit_read_bio(io->sbi->sb, rw,
- fio->type, io->bio);
- submit_bio(rw, io->bio);
- } else {
- trace_f2fs_submit_write_bio(io->sbi->sb, rw,
- fio->type, io->bio);
- /*
- * META_FLUSH is only from the checkpoint procedure, and we
- * should wait this metadata bio for FS consistency.
- */
- if (fio->type == META_FLUSH) {
- DECLARE_COMPLETION_ONSTACK(wait);
- io->sbi->wait_io = &wait;
- submit_bio(rw, io->bio);
- wait_for_completion(&wait);
- } else {
- submit_bio(rw, io->bio);
- }
- }
+ if (is_read_io(fio->op))
+ trace_f2fs_prepare_read_bio(io->sbi->sb, fio->type, io->bio);
+ else
+ trace_f2fs_prepare_write_bio(io->sbi->sb, fio->type, io->bio);
+ __submit_bio(io->sbi, io->bio, fio->type);
io->bio = NULL;
}
-void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
+static bool __has_merged_page(struct f2fs_bio_info *io,
+ struct inode *inode, nid_t ino, pgoff_t idx)
+{
+ struct bio_vec *bvec;
+ struct page *target;
+ int i;
+
+ if (!io->bio)
+ return false;
+
+ if (!inode && !ino)
+ return true;
+
+ bio_for_each_segment_all(bvec, io->bio, i) {
+
+ if (bvec->bv_page->mapping)
+ target = bvec->bv_page;
+ else
+ target = fscrypt_control_page(bvec->bv_page);
+
+ if (idx != target->index)
+ continue;
+
+ if (inode && inode == target->mapping->host)
+ return true;
+ if (ino && ino == ino_of_node(target))
+ return true;
+ }
+
+ return false;
+}
+
+static bool has_merged_page(struct f2fs_sb_info *sbi, struct inode *inode,
+ nid_t ino, pgoff_t idx, enum page_type type)
+{
+ enum page_type btype = PAGE_TYPE_OF_BIO(type);
+ struct f2fs_bio_info *io = &sbi->write_io[btype];
+ bool ret;
+
+ down_read(&io->io_rwsem);
+ ret = __has_merged_page(io, inode, ino, idx);
+ up_read(&io->io_rwsem);
+ return ret;
+}
+
+static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
+ struct inode *inode, nid_t ino, pgoff_t idx,
enum page_type type, int rw)
{
enum page_type btype = PAGE_TYPE_OF_BIO(type);
@@ -136,79 +303,137 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
down_write(&io->io_rwsem);
+ if (!__has_merged_page(io, inode, ino, idx))
+ goto out;
+
/* change META to META_FLUSH in the checkpoint procedure */
if (type >= META_FLUSH) {
io->fio.type = META_FLUSH;
- if (test_opt(sbi, NOBARRIER))
- io->fio.rw = WRITE_FLUSH | REQ_META | REQ_PRIO;
- else
- io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO;
+ io->fio.op = REQ_OP_WRITE;
+ io->fio.op_flags = REQ_META | REQ_PRIO;
+ if (!test_opt(sbi, NOBARRIER))
+ io->fio.op_flags |= WRITE_FLUSH | REQ_FUA;
}
__submit_merged_bio(io);
+out:
up_write(&io->io_rwsem);
}
+void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, enum page_type type,
+ int rw)
+{
+ __f2fs_submit_merged_bio(sbi, NULL, 0, 0, type, rw);
+}
+
+void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *sbi,
+ struct inode *inode, nid_t ino, pgoff_t idx,
+ enum page_type type, int rw)
+{
+ if (has_merged_page(sbi, inode, ino, idx, type))
+ __f2fs_submit_merged_bio(sbi, inode, ino, idx, type, rw);
+}
+
+void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi)
+{
+ f2fs_submit_merged_bio(sbi, DATA, WRITE);
+ f2fs_submit_merged_bio(sbi, NODE, WRITE);
+ f2fs_submit_merged_bio(sbi, META, WRITE);
+}
+
/*
* Fill the locked page with data located in the block address.
- * Return unlocked page.
+ * A caller needs to unlock the page on failure.
*/
-int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page,
- block_t blk_addr, int rw)
+int f2fs_submit_page_bio(struct f2fs_io_info *fio)
{
struct bio *bio;
+ struct page *page = fio->encrypted_page ?
+ fio->encrypted_page : fio->page;
- trace_f2fs_submit_page_bio(page, blk_addr, rw);
+ trace_f2fs_submit_page_bio(page, fio);
+ f2fs_trace_ios(fio, 0);
/* Allocate a new bio */
- bio = __bio_alloc(sbi, blk_addr, 1, is_read_io(rw));
+ bio = __bio_alloc(fio->sbi, fio->new_blkaddr, 1, is_read_io(fio->op));
- if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
+ if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
bio_put(bio);
- f2fs_put_page(page, 1);
return -EFAULT;
}
+ bio_set_op_attrs(bio, fio->op, fio->op_flags);
+
+ __submit_bio(fio->sbi, bio, fio->type);
- submit_bio(rw, bio);
+ if (!is_read_io(fio->op))
+ inc_page_count(fio->sbi, WB_DATA_TYPE(fio->page));
return 0;
}
-void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page,
- block_t blk_addr, struct f2fs_io_info *fio)
+int f2fs_submit_page_mbio(struct f2fs_io_info *fio)
{
+ struct f2fs_sb_info *sbi = fio->sbi;
enum page_type btype = PAGE_TYPE_OF_BIO(fio->type);
struct f2fs_bio_info *io;
- bool is_read = is_read_io(fio->rw);
+ bool is_read = is_read_io(fio->op);
+ struct page *bio_page;
+ int err = 0;
io = is_read ? &sbi->read_io : &sbi->write_io[btype];
- verify_block_addr(sbi, blk_addr);
+ if (fio->old_blkaddr != NEW_ADDR)
+ verify_block_addr(sbi, fio->old_blkaddr);
+ verify_block_addr(sbi, fio->new_blkaddr);
- down_write(&io->io_rwsem);
+ bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page;
+
+ /* set submitted = 1 as a return value */
+ fio->submitted = 1;
if (!is_read)
- inc_page_count(sbi, F2FS_WRITEBACK);
+ inc_page_count(sbi, WB_DATA_TYPE(bio_page));
+
+ down_write(&io->io_rwsem);
- if (io->bio && (io->last_block_in_bio != blk_addr - 1 ||
- io->fio.rw != fio->rw))
+ if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 ||
+ (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags) ||
+ !__same_bdev(sbi, fio->new_blkaddr, io->bio)))
__submit_merged_bio(io);
alloc_new:
if (io->bio == NULL) {
- int bio_blocks = MAX_BIO_BLOCKS(sbi);
-
- io->bio = __bio_alloc(sbi, blk_addr, bio_blocks, is_read);
+ if ((fio->type == DATA || fio->type == NODE) &&
+ fio->new_blkaddr & F2FS_IO_SIZE_MASK(sbi)) {
+ err = -EAGAIN;
+ if (!is_read)
+ dec_page_count(sbi, WB_DATA_TYPE(bio_page));
+ goto out_fail;
+ }
+ io->bio = __bio_alloc(sbi, fio->new_blkaddr,
+ BIO_MAX_PAGES, is_read);
io->fio = *fio;
}
- if (bio_add_page(io->bio, page, PAGE_CACHE_SIZE, 0) <
- PAGE_CACHE_SIZE) {
+ if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) <
+ PAGE_SIZE) {
__submit_merged_bio(io);
goto alloc_new;
}
- io->last_block_in_bio = blk_addr;
-
+ io->last_block_in_bio = fio->new_blkaddr;
+ f2fs_trace_ios(fio, 0);
+out_fail:
up_write(&io->io_rwsem);
- trace_f2fs_submit_page_mbio(page, fio->rw, fio->type, blk_addr);
+ trace_f2fs_submit_page_mbio(fio->page, fio);
+ return err;
+}
+
+static void __set_data_blkaddr(struct dnode_of_data *dn)
+{
+ struct f2fs_node *rn = F2FS_NODE(dn->node_page);
+ __le32 *addr_array;
+
+ /* Get physical address of data block */
+ addr_array = blkaddr_in_node(rn);
+ addr_array[dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr);
}
/*
@@ -217,49 +442,70 @@ alloc_new:
* ->node_page
* update block addresses in the node page
*/
-static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr)
+void set_data_blkaddr(struct dnode_of_data *dn)
{
- struct f2fs_node *rn;
- __le32 *addr_array;
- struct page *node_page = dn->node_page;
- unsigned int ofs_in_node = dn->ofs_in_node;
-
- f2fs_wait_on_page_writeback(node_page, NODE);
-
- rn = F2FS_NODE(node_page);
+ f2fs_wait_on_page_writeback(dn->node_page, NODE, true);
+ __set_data_blkaddr(dn);
+ if (set_page_dirty(dn->node_page))
+ dn->node_changed = true;
+}
- /* Get physical address of data block */
- addr_array = blkaddr_in_node(rn);
- addr_array[ofs_in_node] = cpu_to_le32(new_addr);
- set_page_dirty(node_page);
+void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr)
+{
+ dn->data_blkaddr = blkaddr;
+ set_data_blkaddr(dn);
+ f2fs_update_extent_cache(dn);
}
-int reserve_new_block(struct dnode_of_data *dn)
+/* dn->ofs_in_node will be returned with up-to-date last block pointer */
+int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
- if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
+ if (!count)
+ return 0;
+
+ if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC)))
return -EPERM;
- if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1)))
+ if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count)))
return -ENOSPC;
- trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node);
+ trace_f2fs_reserve_new_blocks(dn->inode, dn->nid,
+ dn->ofs_in_node, count);
- __set_data_blkaddr(dn, NEW_ADDR);
- dn->data_blkaddr = NEW_ADDR;
- mark_inode_dirty(dn->inode);
- sync_inode_page(dn);
+ f2fs_wait_on_page_writeback(dn->node_page, NODE, true);
+
+ for (; count > 0; dn->ofs_in_node++) {
+ block_t blkaddr =
+ datablock_addr(dn->node_page, dn->ofs_in_node);
+ if (blkaddr == NULL_ADDR) {
+ dn->data_blkaddr = NEW_ADDR;
+ __set_data_blkaddr(dn);
+ count--;
+ }
+ }
+
+ if (set_page_dirty(dn->node_page))
+ dn->node_changed = true;
return 0;
}
+/* Should keep dn->ofs_in_node unchanged */
+int reserve_new_block(struct dnode_of_data *dn)
+{
+ unsigned int ofs_in_node = dn->ofs_in_node;
+ int ret;
+
+ ret = reserve_new_blocks(dn, 1);
+ dn->ofs_in_node = ofs_in_node;
+ return ret;
+}
+
int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
{
bool need_put = dn->inode_page ? false : true;
int err;
- /* if inode_page exists, index should be zero */
- f2fs_bug_on(F2FS_I_SB(dn->inode), !need_put && index);
-
err = get_dnode_of_data(dn, index, ALLOC_NODE);
if (err)
return err;
@@ -271,174 +517,110 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
return err;
}
-static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
- struct buffer_head *bh_result)
+int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index)
{
- struct f2fs_inode_info *fi = F2FS_I(inode);
- pgoff_t start_fofs, end_fofs;
- block_t start_blkaddr;
+ struct extent_info ei = {0,0,0};
+ struct inode *inode = dn->inode;
- if (is_inode_flag_set(fi, FI_NO_EXTENT))
- return 0;
-
- read_lock(&fi->ext.ext_lock);
- if (fi->ext.len == 0) {
- read_unlock(&fi->ext.ext_lock);
+ if (f2fs_lookup_extent_cache(inode, index, &ei)) {
+ dn->data_blkaddr = ei.blk + index - ei.fofs;
return 0;
}
- stat_inc_total_hit(inode->i_sb);
-
- start_fofs = fi->ext.fofs;
- end_fofs = fi->ext.fofs + fi->ext.len - 1;
- start_blkaddr = fi->ext.blk_addr;
-
- if (pgofs >= start_fofs && pgofs <= end_fofs) {
- unsigned int blkbits = inode->i_sb->s_blocksize_bits;
- size_t count;
-
- clear_buffer_new(bh_result);
- map_bh(bh_result, inode->i_sb,
- start_blkaddr + pgofs - start_fofs);
- count = end_fofs - pgofs + 1;
- if (count < (UINT_MAX >> blkbits))
- bh_result->b_size = (count << blkbits);
- else
- bh_result->b_size = UINT_MAX;
-
- stat_inc_read_hit(inode->i_sb);
- read_unlock(&fi->ext.ext_lock);
- return 1;
- }
- read_unlock(&fi->ext.ext_lock);
- return 0;
+ return f2fs_reserve_block(dn, index);
}
-void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
+struct page *get_read_data_page(struct inode *inode, pgoff_t index,
+ int op_flags, bool for_write)
{
- struct f2fs_inode_info *fi = F2FS_I(dn->inode);
- pgoff_t fofs, start_fofs, end_fofs;
- block_t start_blkaddr, end_blkaddr;
- int need_update = true;
-
- f2fs_bug_on(F2FS_I_SB(dn->inode), blk_addr == NEW_ADDR);
- fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
- dn->ofs_in_node;
-
- /* Update the page address in the parent node */
- __set_data_blkaddr(dn, blk_addr);
+ struct address_space *mapping = inode->i_mapping;
+ struct dnode_of_data dn;
+ struct page *page;
+ struct extent_info ei = {0,0,0};
+ int err;
+ struct f2fs_io_info fio = {
+ .sbi = F2FS_I_SB(inode),
+ .type = DATA,
+ .op = REQ_OP_READ,
+ .op_flags = op_flags,
+ .encrypted_page = NULL,
+ };
- if (is_inode_flag_set(fi, FI_NO_EXTENT))
- return;
+ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
+ return read_mapping_page(mapping, index, NULL);
- write_lock(&fi->ext.ext_lock);
+ page = f2fs_grab_cache_page(mapping, index, for_write);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
- start_fofs = fi->ext.fofs;
- end_fofs = fi->ext.fofs + fi->ext.len - 1;
- start_blkaddr = fi->ext.blk_addr;
- end_blkaddr = fi->ext.blk_addr + fi->ext.len - 1;
+ if (f2fs_lookup_extent_cache(inode, index, &ei)) {
+ dn.data_blkaddr = ei.blk + index - ei.fofs;
+ goto got_it;
+ }
- /* Drop and initialize the matched extent */
- if (fi->ext.len == 1 && fofs == start_fofs)
- fi->ext.len = 0;
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
+ if (err)
+ goto put_err;
+ f2fs_put_dnode(&dn);
- /* Initial extent */
- if (fi->ext.len == 0) {
- if (blk_addr != NULL_ADDR) {
- fi->ext.fofs = fofs;
- fi->ext.blk_addr = blk_addr;
- fi->ext.len = 1;
- }
- goto end_update;
+ if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
+ err = -ENOENT;
+ goto put_err;
}
-
- /* Front merge */
- if (fofs == start_fofs - 1 && blk_addr == start_blkaddr - 1) {
- fi->ext.fofs--;
- fi->ext.blk_addr--;
- fi->ext.len++;
- goto end_update;
+got_it:
+ if (PageUptodate(page)) {
+ unlock_page(page);
+ return page;
}
- /* Back merge */
- if (fofs == end_fofs + 1 && blk_addr == end_blkaddr + 1) {
- fi->ext.len++;
- goto end_update;
+ /*
+ * A new dentry page is allocated but not able to be written, since its
+ * new inode page couldn't be allocated due to -ENOSPC.
+ * In such the case, its blkaddr can be remained as NEW_ADDR.
+ * see, f2fs_add_link -> get_new_data_page -> init_inode_metadata.
+ */
+ if (dn.data_blkaddr == NEW_ADDR) {
+ zero_user_segment(page, 0, PAGE_SIZE);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
+ unlock_page(page);
+ return page;
}
- /* Split the existing extent */
- if (fi->ext.len > 1 &&
- fofs >= start_fofs && fofs <= end_fofs) {
- if ((end_fofs - fofs) < (fi->ext.len >> 1)) {
- fi->ext.len = fofs - start_fofs;
- } else {
- fi->ext.fofs = fofs + 1;
- fi->ext.blk_addr = start_blkaddr +
- fofs - start_fofs + 1;
- fi->ext.len -= fofs - start_fofs + 1;
- }
- } else {
- need_update = false;
- }
+ fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
+ fio.page = page;
+ err = f2fs_submit_page_bio(&fio);
+ if (err)
+ goto put_err;
+ return page;
- /* Finally, if the extent is very fragmented, let's drop the cache. */
- if (fi->ext.len < F2FS_MIN_EXTENT_LEN) {
- fi->ext.len = 0;
- set_inode_flag(fi, FI_NO_EXTENT);
- need_update = true;
- }
-end_update:
- write_unlock(&fi->ext.ext_lock);
- if (need_update)
- sync_inode_page(dn);
- return;
+put_err:
+ f2fs_put_page(page, 1);
+ return ERR_PTR(err);
}
-struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
+struct page *find_data_page(struct inode *inode, pgoff_t index)
{
struct address_space *mapping = inode->i_mapping;
- struct dnode_of_data dn;
struct page *page;
- int err;
page = find_get_page(mapping, index);
if (page && PageUptodate(page))
return page;
f2fs_put_page(page, 0);
- set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
- if (err)
- return ERR_PTR(err);
- f2fs_put_dnode(&dn);
-
- if (dn.data_blkaddr == NULL_ADDR)
- return ERR_PTR(-ENOENT);
-
- /* By fallocate(), there is no cached page, but with NEW_ADDR */
- if (unlikely(dn.data_blkaddr == NEW_ADDR))
- return ERR_PTR(-EINVAL);
-
- page = grab_cache_page(mapping, index);
- if (!page)
- return ERR_PTR(-ENOMEM);
-
- if (PageUptodate(page)) {
- unlock_page(page);
+ page = get_read_data_page(inode, index, 0, false);
+ if (IS_ERR(page))
return page;
- }
- err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, dn.data_blkaddr,
- sync ? READ_SYNC : READA);
- if (err)
- return ERR_PTR(err);
+ if (PageUptodate(page))
+ return page;
- if (sync) {
- wait_on_page_locked(page);
- if (unlikely(!PageUptodate(page))) {
- f2fs_put_page(page, 0);
- return ERR_PTR(-EIO);
- }
+ wait_on_page_locked(page);
+ if (unlikely(!PageUptodate(page))) {
+ f2fs_put_page(page, 0);
+ return ERR_PTR(-EIO);
}
return page;
}
@@ -448,60 +630,26 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
* Because, the callers, functions in dir.c and GC, should be able to know
* whether this page exists or not.
*/
-struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
+struct page *get_lock_data_page(struct inode *inode, pgoff_t index,
+ bool for_write)
{
struct address_space *mapping = inode->i_mapping;
- struct dnode_of_data dn;
struct page *page;
- int err;
-
repeat:
- page = grab_cache_page(mapping, index);
- if (!page)
- return ERR_PTR(-ENOMEM);
-
- set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
- if (err) {
- f2fs_put_page(page, 1);
- return ERR_PTR(err);
- }
- f2fs_put_dnode(&dn);
-
- if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
- f2fs_put_page(page, 1);
- return ERR_PTR(-ENOENT);
- }
-
- if (PageUptodate(page))
+ page = get_read_data_page(inode, index, 0, for_write);
+ if (IS_ERR(page))
return page;
- /*
- * A new dentry page is allocated but not able to be written, since its
- * new inode page couldn't be allocated due to -ENOSPC.
- * In such the case, its blkaddr can be remained as NEW_ADDR.
- * see, f2fs_add_link -> get_new_data_page -> init_inode_metadata.
- */
- if (dn.data_blkaddr == NEW_ADDR) {
- zero_user_segment(page, 0, PAGE_CACHE_SIZE);
- SetPageUptodate(page);
- return page;
- }
-
- err = f2fs_submit_page_bio(F2FS_I_SB(inode), page,
- dn.data_blkaddr, READ_SYNC);
- if (err)
- return ERR_PTR(err);
-
+ /* wait for read completion */
lock_page(page);
- if (unlikely(!PageUptodate(page))) {
- f2fs_put_page(page, 1);
- return ERR_PTR(-EIO);
- }
if (unlikely(page->mapping != mapping)) {
f2fs_put_page(page, 1);
goto repeat;
}
+ if (unlikely(!PageUptodate(page))) {
+ f2fs_put_page(page, 1);
+ return ERR_PTR(-EIO);
+ }
return page;
}
@@ -511,7 +659,8 @@ repeat:
*
* Also, caller should grab and release a rwsem by calling f2fs_lock_op() and
* f2fs_unlock_op().
- * Note that, ipage is set only by make_empty_dir.
+ * Note that, ipage is set only by make_empty_dir, and if any error occur,
+ * ipage should be released by this function.
*/
struct page *get_new_data_page(struct inode *inode,
struct page *ipage, pgoff_t index, bool new_i_size)
@@ -521,235 +670,616 @@ struct page *get_new_data_page(struct inode *inode,
struct dnode_of_data dn;
int err;
+ page = f2fs_grab_cache_page(mapping, index, true);
+ if (!page) {
+ /*
+ * before exiting, we should make sure ipage will be released
+ * if any error occur.
+ */
+ f2fs_put_page(ipage, 1);
+ return ERR_PTR(-ENOMEM);
+ }
+
set_new_dnode(&dn, inode, ipage, NULL, 0);
err = f2fs_reserve_block(&dn, index);
- if (err)
+ if (err) {
+ f2fs_put_page(page, 1);
return ERR_PTR(err);
-repeat:
- page = grab_cache_page(mapping, index);
- if (!page) {
- err = -ENOMEM;
- goto put_err;
}
+ if (!ipage)
+ f2fs_put_dnode(&dn);
if (PageUptodate(page))
- return page;
+ goto got_it;
if (dn.data_blkaddr == NEW_ADDR) {
- zero_user_segment(page, 0, PAGE_CACHE_SIZE);
- SetPageUptodate(page);
+ zero_user_segment(page, 0, PAGE_SIZE);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
} else {
- err = f2fs_submit_page_bio(F2FS_I_SB(inode), page,
- dn.data_blkaddr, READ_SYNC);
- if (err)
- goto put_err;
-
- lock_page(page);
- if (unlikely(!PageUptodate(page))) {
- f2fs_put_page(page, 1);
- err = -EIO;
- goto put_err;
- }
- if (unlikely(page->mapping != mapping)) {
- f2fs_put_page(page, 1);
- goto repeat;
- }
- }
+ f2fs_put_page(page, 1);
- if (new_i_size &&
- i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) {
- i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT));
- /* Only the directory inode sets new_i_size */
- set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR);
+ /* if ipage exists, blkaddr should be NEW_ADDR */
+ f2fs_bug_on(F2FS_I_SB(inode), ipage);
+ page = get_lock_data_page(inode, index, true);
+ if (IS_ERR(page))
+ return page;
}
+got_it:
+ if (new_i_size && i_size_read(inode) <
+ ((loff_t)(index + 1) << PAGE_SHIFT))
+ f2fs_i_size_write(inode, ((loff_t)(index + 1) << PAGE_SHIFT));
return page;
-
-put_err:
- f2fs_put_dnode(&dn);
- return ERR_PTR(err);
}
static int __allocate_data_block(struct dnode_of_data *dn)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
- struct f2fs_inode_info *fi = F2FS_I(dn->inode);
struct f2fs_summary sum;
- block_t new_blkaddr;
struct node_info ni;
pgoff_t fofs;
- int type;
+ blkcnt_t count = 1;
- if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
+ if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC)))
return -EPERM;
- if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1)))
- return -ENOSPC;
- __set_data_blkaddr(dn, NEW_ADDR);
- dn->data_blkaddr = NEW_ADDR;
+ dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node);
+ if (dn->data_blkaddr == NEW_ADDR)
+ goto alloc;
+
+ if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count)))
+ return -ENOSPC;
+alloc:
get_node_info(sbi, dn->nid, &ni);
set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
- type = CURSEG_WARM_DATA;
-
- allocate_data_block(sbi, NULL, NULL_ADDR, &new_blkaddr, &sum, type);
-
- /* direct IO doesn't use extent cache to maximize the performance */
- set_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT);
- update_extent_cache(new_blkaddr, dn);
- clear_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT);
+ allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr,
+ &sum, CURSEG_WARM_DATA);
+ set_data_blkaddr(dn);
/* update i_size */
- fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
+ fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) +
dn->ofs_in_node;
- if (i_size_read(dn->inode) < ((fofs + 1) << PAGE_CACHE_SHIFT))
- i_size_write(dn->inode, ((fofs + 1) << PAGE_CACHE_SHIFT));
-
- dn->data_blkaddr = new_blkaddr;
+ if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_SHIFT))
+ f2fs_i_size_write(dn->inode,
+ ((loff_t)(fofs + 1) << PAGE_SHIFT));
return 0;
}
+static inline bool __force_buffered_io(struct inode *inode, int rw)
+{
+ return ((f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) ||
+ (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) ||
+ F2FS_I_SB(inode)->s_ndevs);
+}
+
+int f2fs_preallocate_blocks(struct inode *inode, loff_t pos,
+ size_t count, bool dio)
+{
+ struct f2fs_map_blocks map;
+ int err = 0;
+
+ map.m_lblk = F2FS_BLK_ALIGN(pos);
+ map.m_len = F2FS_BYTES_TO_BLK(pos + count);
+ if (map.m_len > map.m_lblk)
+ map.m_len -= map.m_lblk;
+ else
+ map.m_len = 0;
+
+ map.m_next_pgofs = NULL;
+
+ if (dio) {
+ err = f2fs_convert_inline_inode(inode);
+ if (err)
+ return err;
+ return f2fs_map_blocks(inode, &map, 1,
+ __force_buffered_io(inode, WRITE) ?
+ F2FS_GET_BLOCK_PRE_AIO :
+ F2FS_GET_BLOCK_PRE_DIO);
+ }
+ if (pos + count > MAX_INLINE_DATA) {
+ err = f2fs_convert_inline_inode(inode);
+ if (err)
+ return err;
+ }
+ if (!f2fs_has_inline_data(inode))
+ return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
+ return err;
+}
+
+static inline void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock)
+{
+ if (flag == F2FS_GET_BLOCK_PRE_AIO) {
+ if (lock)
+ down_read(&sbi->node_change);
+ else
+ up_read(&sbi->node_change);
+ } else {
+ if (lock)
+ f2fs_lock_op(sbi);
+ else
+ f2fs_unlock_op(sbi);
+ }
+}
+
/*
- * get_data_block() now supported readahead/bmap/rw direct_IO with mapped bh.
+ * f2fs_map_blocks() now supported readahead/bmap/rw direct_IO with
+ * f2fs_map_blocks structure.
* If original data blocks are allocated, then give them to blockdev.
* Otherwise,
* a. preallocate requested block addresses
* b. do not use extent cache for better performance
* c. give the block addresses to blockdev
*/
-static int __get_data_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create, bool fiemap)
+int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
+ int create, int flag)
{
- unsigned int blkbits = inode->i_sb->s_blocksize_bits;
- unsigned maxblocks = bh_result->b_size >> blkbits;
+ unsigned int maxblocks = map->m_len;
struct dnode_of_data dn;
- int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA;
- pgoff_t pgofs, end_offset;
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ int mode = create ? ALLOC_NODE : LOOKUP_NODE;
+ pgoff_t pgofs, end_offset, end;
int err = 0, ofs = 1;
- bool allocated = false;
+ unsigned int ofs_in_node, last_ofs_in_node;
+ blkcnt_t prealloc;
+ struct extent_info ei = {0,0,0};
+ block_t blkaddr;
- /* Get the page offset from the block offset(iblock) */
- pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits));
+ if (!maxblocks)
+ return 0;
- if (check_extent_cache(inode, pgofs, bh_result))
- goto out;
+ map->m_len = 0;
+ map->m_flags = 0;
- if (create) {
- f2fs_balance_fs(F2FS_I_SB(inode));
- f2fs_lock_op(F2FS_I_SB(inode));
+ /* it only supports block size == page size */
+ pgofs = (pgoff_t)map->m_lblk;
+ end = pgofs + maxblocks;
+
+ if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) {
+ map->m_pblk = ei.blk + pgofs - ei.fofs;
+ map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgofs);
+ map->m_flags = F2FS_MAP_MAPPED;
+ goto out;
}
+next_dnode:
+ if (create)
+ __do_map_lock(sbi, flag, true);
+
/* When reading holes, we need its node page */
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, pgofs, mode);
if (err) {
- if (err == -ENOENT)
+ if (flag == F2FS_GET_BLOCK_BMAP)
+ map->m_pblk = 0;
+ if (err == -ENOENT) {
err = 0;
+ if (map->m_next_pgofs)
+ *map->m_next_pgofs =
+ get_next_page_offset(&dn, pgofs);
+ }
goto unlock_out;
}
- if (dn.data_blkaddr == NEW_ADDR && !fiemap)
- goto put_out;
- if (dn.data_blkaddr != NULL_ADDR) {
- map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
- } else if (create) {
- err = __allocate_data_block(&dn);
- if (err)
- goto put_out;
- allocated = true;
- map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
+ prealloc = 0;
+ last_ofs_in_node = ofs_in_node = dn.ofs_in_node;
+ end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
+
+next_block:
+ blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
+
+ if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) {
+ if (create) {
+ if (unlikely(f2fs_cp_error(sbi))) {
+ err = -EIO;
+ goto sync_out;
+ }
+ if (flag == F2FS_GET_BLOCK_PRE_AIO) {
+ if (blkaddr == NULL_ADDR) {
+ prealloc++;
+ last_ofs_in_node = dn.ofs_in_node;
+ }
+ } else {
+ err = __allocate_data_block(&dn);
+ if (!err)
+ set_inode_flag(inode, FI_APPEND_WRITE);
+ }
+ if (err)
+ goto sync_out;
+ map->m_flags |= F2FS_MAP_NEW;
+ blkaddr = dn.data_blkaddr;
+ } else {
+ if (flag == F2FS_GET_BLOCK_BMAP) {
+ map->m_pblk = 0;
+ goto sync_out;
+ }
+ if (flag == F2FS_GET_BLOCK_FIEMAP &&
+ blkaddr == NULL_ADDR) {
+ if (map->m_next_pgofs)
+ *map->m_next_pgofs = pgofs + 1;
+ }
+ if (flag != F2FS_GET_BLOCK_FIEMAP ||
+ blkaddr != NEW_ADDR)
+ goto sync_out;
+ }
+ }
+
+ if (flag == F2FS_GET_BLOCK_PRE_AIO)
+ goto skip;
+
+ if (map->m_len == 0) {
+ /* preallocated unwritten block should be mapped for fiemap. */
+ if (blkaddr == NEW_ADDR)
+ map->m_flags |= F2FS_MAP_UNWRITTEN;
+ map->m_flags |= F2FS_MAP_MAPPED;
+
+ map->m_pblk = blkaddr;
+ map->m_len = 1;
+ } else if ((map->m_pblk != NEW_ADDR &&
+ blkaddr == (map->m_pblk + ofs)) ||
+ (map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR) ||
+ flag == F2FS_GET_BLOCK_PRE_DIO) {
+ ofs++;
+ map->m_len++;
} else {
- goto put_out;
+ goto sync_out;
}
- end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
- bh_result->b_size = (((size_t)1) << blkbits);
+skip:
dn.ofs_in_node++;
pgofs++;
-get_next:
- if (dn.ofs_in_node >= end_offset) {
- if (allocated)
- sync_inode_page(&dn);
- allocated = false;
- f2fs_put_dnode(&dn);
+ /* preallocate blocks in batch for one dnode page */
+ if (flag == F2FS_GET_BLOCK_PRE_AIO &&
+ (pgofs == end || dn.ofs_in_node == end_offset)) {
- set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = get_dnode_of_data(&dn, pgofs, mode);
- if (err) {
- if (err == -ENOENT)
- err = 0;
- goto unlock_out;
- }
- if (dn.data_blkaddr == NEW_ADDR && !fiemap)
- goto put_out;
+ dn.ofs_in_node = ofs_in_node;
+ err = reserve_new_blocks(&dn, prealloc);
+ if (err)
+ goto sync_out;
- end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
+ map->m_len += dn.ofs_in_node - ofs_in_node;
+ if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) {
+ err = -ENOSPC;
+ goto sync_out;
+ }
+ dn.ofs_in_node = end_offset;
}
- if (maxblocks > (bh_result->b_size >> blkbits)) {
- block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
- if (blkaddr == NULL_ADDR && create) {
- err = __allocate_data_block(&dn);
- if (err)
- goto sync_out;
- allocated = true;
- blkaddr = dn.data_blkaddr;
- }
- /* Give more consecutive addresses for the readahead */
- if (blkaddr == (bh_result->b_blocknr + ofs)) {
- ofs++;
- dn.ofs_in_node++;
- pgofs++;
- bh_result->b_size += (((size_t)1) << blkbits);
- goto get_next;
- }
+ if (pgofs >= end)
+ goto sync_out;
+ else if (dn.ofs_in_node < end_offset)
+ goto next_block;
+
+ f2fs_put_dnode(&dn);
+
+ if (create) {
+ __do_map_lock(sbi, flag, false);
+ f2fs_balance_fs(sbi, dn.node_changed);
}
+ goto next_dnode;
+
sync_out:
- if (allocated)
- sync_inode_page(&dn);
-put_out:
f2fs_put_dnode(&dn);
unlock_out:
- if (create)
- f2fs_unlock_op(F2FS_I_SB(inode));
+ if (create) {
+ __do_map_lock(sbi, flag, false);
+ f2fs_balance_fs(sbi, dn.node_changed);
+ }
out:
- trace_f2fs_get_data_block(inode, iblock, bh_result, err);
+ trace_f2fs_map_blocks(inode, map, err);
+ return err;
+}
+
+static int __get_data_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh, int create, int flag,
+ pgoff_t *next_pgofs)
+{
+ struct f2fs_map_blocks map;
+ int err;
+
+ map.m_lblk = iblock;
+ map.m_len = bh->b_size >> inode->i_blkbits;
+ map.m_next_pgofs = next_pgofs;
+
+ err = f2fs_map_blocks(inode, &map, create, flag);
+ if (!err) {
+ map_bh(bh, inode->i_sb, map.m_pblk);
+ bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags;
+ bh->b_size = (u64)map.m_len << inode->i_blkbits;
+ }
return err;
}
static int get_data_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create, int flag,
+ pgoff_t *next_pgofs)
+{
+ return __get_data_block(inode, iblock, bh_result, create,
+ flag, next_pgofs);
+}
+
+static int get_data_block_dio(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
- return __get_data_block(inode, iblock, bh_result, create, false);
+ return __get_data_block(inode, iblock, bh_result, create,
+ F2FS_GET_BLOCK_DIO, NULL);
}
-static int get_data_block_fiemap(struct inode *inode, sector_t iblock,
+static int get_data_block_bmap(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
- return __get_data_block(inode, iblock, bh_result, create, true);
+ /* Block number less than F2FS MAX BLOCKS */
+ if (unlikely(iblock >= F2FS_I_SB(inode)->max_file_blocks))
+ return -EFBIG;
+
+ return __get_data_block(inode, iblock, bh_result, create,
+ F2FS_GET_BLOCK_BMAP, NULL);
+}
+
+static inline sector_t logical_to_blk(struct inode *inode, loff_t offset)
+{
+ return (offset >> inode->i_blkbits);
+}
+
+static inline loff_t blk_to_logical(struct inode *inode, sector_t blk)
+{
+ return (blk << inode->i_blkbits);
}
int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
- return generic_block_fiemap(inode, fieinfo,
- start, len, get_data_block_fiemap);
+ struct buffer_head map_bh;
+ sector_t start_blk, last_blk;
+ pgoff_t next_pgofs;
+ u64 logical = 0, phys = 0, size = 0;
+ u32 flags = 0;
+ int ret = 0;
+
+ ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
+ if (ret)
+ return ret;
+
+ if (f2fs_has_inline_data(inode)) {
+ ret = f2fs_inline_data_fiemap(inode, fieinfo, start, len);
+ if (ret != -EAGAIN)
+ return ret;
+ }
+
+ inode_lock(inode);
+
+ if (logical_to_blk(inode, len) == 0)
+ len = blk_to_logical(inode, 1);
+
+ start_blk = logical_to_blk(inode, start);
+ last_blk = logical_to_blk(inode, start + len - 1);
+
+next:
+ memset(&map_bh, 0, sizeof(struct buffer_head));
+ map_bh.b_size = len;
+
+ ret = get_data_block(inode, start_blk, &map_bh, 0,
+ F2FS_GET_BLOCK_FIEMAP, &next_pgofs);
+ if (ret)
+ goto out;
+
+ /* HOLE */
+ if (!buffer_mapped(&map_bh)) {
+ start_blk = next_pgofs;
+
+ if (blk_to_logical(inode, start_blk) < blk_to_logical(inode,
+ F2FS_I_SB(inode)->max_file_blocks))
+ goto prep_next;
+
+ flags |= FIEMAP_EXTENT_LAST;
+ }
+
+ if (size) {
+ if (f2fs_encrypted_inode(inode))
+ flags |= FIEMAP_EXTENT_DATA_ENCRYPTED;
+
+ ret = fiemap_fill_next_extent(fieinfo, logical,
+ phys, size, flags);
+ }
+
+ if (start_blk > last_blk || ret)
+ goto out;
+
+ logical = blk_to_logical(inode, start_blk);
+ phys = blk_to_logical(inode, map_bh.b_blocknr);
+ size = map_bh.b_size;
+ flags = 0;
+ if (buffer_unwritten(&map_bh))
+ flags = FIEMAP_EXTENT_UNWRITTEN;
+
+ start_blk += logical_to_blk(inode, size);
+
+prep_next:
+ cond_resched();
+ if (fatal_signal_pending(current))
+ ret = -EINTR;
+ else
+ goto next;
+out:
+ if (ret == 1)
+ ret = 0;
+
+ inode_unlock(inode);
+ return ret;
+}
+
+static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr,
+ unsigned nr_pages)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct fscrypt_ctx *ctx = NULL;
+ struct bio *bio;
+
+ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
+ ctx = fscrypt_get_ctx(inode, GFP_NOFS);
+ if (IS_ERR(ctx))
+ return ERR_CAST(ctx);
+
+ /* wait the page to be moved by cleaning */
+ f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr);
+ }
+
+ bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES));
+ if (!bio) {
+ if (ctx)
+ fscrypt_release_ctx(ctx);
+ return ERR_PTR(-ENOMEM);
+ }
+ f2fs_target_device(sbi, blkaddr, bio);
+ bio->bi_end_io = f2fs_read_end_io;
+ bio->bi_private = ctx;
+
+ return bio;
+}
+
+/*
+ * This function was originally taken from fs/mpage.c, and customized for f2fs.
+ * Major change was from block_size == page_size in f2fs by default.
+ */
+static int f2fs_mpage_readpages(struct address_space *mapping,
+ struct list_head *pages, struct page *page,
+ unsigned nr_pages)
+{
+ struct bio *bio = NULL;
+ unsigned page_idx;
+ sector_t last_block_in_bio = 0;
+ struct inode *inode = mapping->host;
+ const unsigned blkbits = inode->i_blkbits;
+ const unsigned blocksize = 1 << blkbits;
+ sector_t block_in_file;
+ sector_t last_block;
+ sector_t last_block_in_file;
+ sector_t block_nr;
+ struct f2fs_map_blocks map;
+
+ map.m_pblk = 0;
+ map.m_lblk = 0;
+ map.m_len = 0;
+ map.m_flags = 0;
+ map.m_next_pgofs = NULL;
+
+ for (page_idx = 0; nr_pages; page_idx++, nr_pages--) {
+
+ if (pages) {
+ page = list_last_entry(pages, struct page, lru);
+
+ prefetchw(&page->flags);
+ list_del(&page->lru);
+ if (add_to_page_cache_lru(page, mapping,
+ page->index, GFP_KERNEL))
+ goto next_page;
+ }
+
+ block_in_file = (sector_t)page->index;
+ last_block = block_in_file + nr_pages;
+ last_block_in_file = (i_size_read(inode) + blocksize - 1) >>
+ blkbits;
+ if (last_block > last_block_in_file)
+ last_block = last_block_in_file;
+
+ /*
+ * Map blocks using the previous result first.
+ */
+ if ((map.m_flags & F2FS_MAP_MAPPED) &&
+ block_in_file > map.m_lblk &&
+ block_in_file < (map.m_lblk + map.m_len))
+ goto got_it;
+
+ /*
+ * Then do more f2fs_map_blocks() calls until we are
+ * done with this page.
+ */
+ map.m_flags = 0;
+
+ if (block_in_file < last_block) {
+ map.m_lblk = block_in_file;
+ map.m_len = last_block - block_in_file;
+
+ if (f2fs_map_blocks(inode, &map, 0,
+ F2FS_GET_BLOCK_READ))
+ goto set_error_page;
+ }
+got_it:
+ if ((map.m_flags & F2FS_MAP_MAPPED)) {
+ block_nr = map.m_pblk + block_in_file - map.m_lblk;
+ SetPageMappedToDisk(page);
+
+ if (!PageUptodate(page) && !cleancache_get_page(page)) {
+ SetPageUptodate(page);
+ goto confused;
+ }
+ } else {
+ zero_user_segment(page, 0, PAGE_SIZE);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
+ unlock_page(page);
+ goto next_page;
+ }
+
+ /*
+ * This page will go to BIO. Do we need to send this
+ * BIO off first?
+ */
+ if (bio && (last_block_in_bio != block_nr - 1 ||
+ !__same_bdev(F2FS_I_SB(inode), block_nr, bio))) {
+submit_and_realloc:
+ __submit_bio(F2FS_I_SB(inode), bio, DATA);
+ bio = NULL;
+ }
+ if (bio == NULL) {
+ bio = f2fs_grab_bio(inode, block_nr, nr_pages);
+ if (IS_ERR(bio)) {
+ bio = NULL;
+ goto set_error_page;
+ }
+ bio_set_op_attrs(bio, REQ_OP_READ, 0);
+ }
+
+ if (bio_add_page(bio, page, blocksize, 0) < blocksize)
+ goto submit_and_realloc;
+
+ last_block_in_bio = block_nr;
+ goto next_page;
+set_error_page:
+ SetPageError(page);
+ zero_user_segment(page, 0, PAGE_SIZE);
+ unlock_page(page);
+ goto next_page;
+confused:
+ if (bio) {
+ __submit_bio(F2FS_I_SB(inode), bio, DATA);
+ bio = NULL;
+ }
+ unlock_page(page);
+next_page:
+ if (pages)
+ put_page(page);
+ }
+ BUG_ON(pages && !list_empty(pages));
+ if (bio)
+ __submit_bio(F2FS_I_SB(inode), bio, DATA);
+ return 0;
}
static int f2fs_read_data_page(struct file *file, struct page *page)
{
struct inode *inode = page->mapping->host;
- int ret;
+ int ret = -EAGAIN;
trace_f2fs_readpage(page, DATA);
/* If the file has inline data, try to read it directly */
if (f2fs_has_inline_data(inode))
ret = f2fs_read_inline_data(inode, page);
- else
- ret = mpage_readpage(page, get_data_block);
-
+ if (ret == -EAGAIN)
+ ret = f2fs_mpage_readpages(page->mapping, NULL, page, 1);
return ret;
}
@@ -758,30 +1288,105 @@ static int f2fs_read_data_pages(struct file *file,
struct list_head *pages, unsigned nr_pages)
{
struct inode *inode = file->f_mapping->host;
+ struct page *page = list_last_entry(pages, struct page, lru);
+
+ trace_f2fs_readpages(inode, page, nr_pages);
/* If the file has inline data, skip readpages */
if (f2fs_has_inline_data(inode))
return 0;
- return mpage_readpages(mapping, pages, nr_pages, get_data_block);
+ return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages);
}
-int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
+static int encrypt_one_page(struct f2fs_io_info *fio)
{
+ struct inode *inode = fio->page->mapping->host;
+ gfp_t gfp_flags = GFP_NOFS;
+
+ if (!f2fs_encrypted_inode(inode) || !S_ISREG(inode->i_mode))
+ return 0;
+
+ /* wait for GCed encrypted page writeback */
+ f2fs_wait_on_encrypted_page_writeback(fio->sbi, fio->old_blkaddr);
+
+retry_encrypt:
+ fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page,
+ PAGE_SIZE, 0, fio->page->index, gfp_flags);
+ if (!IS_ERR(fio->encrypted_page))
+ return 0;
+
+ /* flush pending IOs and wait for a while in the ENOMEM case */
+ if (PTR_ERR(fio->encrypted_page) == -ENOMEM) {
+ f2fs_flush_merged_bios(fio->sbi);
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ gfp_flags |= __GFP_NOFAIL;
+ goto retry_encrypt;
+ }
+ return PTR_ERR(fio->encrypted_page);
+}
+
+static inline bool need_inplace_update(struct f2fs_io_info *fio)
+{
+ struct inode *inode = fio->page->mapping->host;
+
+ if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode))
+ return false;
+ if (is_cold_data(fio->page))
+ return false;
+ if (IS_ATOMIC_WRITTEN_PAGE(fio->page))
+ return false;
+
+ return need_inplace_update_policy(inode, fio);
+}
+
+static inline bool valid_ipu_blkaddr(struct f2fs_io_info *fio)
+{
+ if (fio->old_blkaddr == NEW_ADDR)
+ return false;
+ if (fio->old_blkaddr == NULL_ADDR)
+ return false;
+ return true;
+}
+
+int do_write_data_page(struct f2fs_io_info *fio)
+{
+ struct page *page = fio->page;
struct inode *inode = page->mapping->host;
- block_t old_blkaddr, new_blkaddr;
struct dnode_of_data dn;
+ struct extent_info ei = {0,0,0};
+ bool ipu_force = false;
int err = 0;
set_new_dnode(&dn, inode, NULL, NULL, 0);
+ if (need_inplace_update(fio) &&
+ f2fs_lookup_extent_cache(inode, page->index, &ei)) {
+ fio->old_blkaddr = ei.blk + page->index - ei.fofs;
+
+ if (valid_ipu_blkaddr(fio)) {
+ ipu_force = true;
+ fio->need_lock = false;
+ goto got_it;
+ }
+ }
+
+ if (fio->need_lock)
+ f2fs_lock_op(fio->sbi);
+
err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE);
if (err)
- return err;
+ goto out;
- old_blkaddr = dn.data_blkaddr;
+ fio->old_blkaddr = dn.data_blkaddr;
/* This page is already truncated */
- if (old_blkaddr == NULL_ADDR)
+ if (fio->old_blkaddr == NULL_ADDR) {
+ ClearPageUptodate(page);
+ goto out_writepage;
+ }
+got_it:
+ err = encrypt_one_page(fio);
+ if (err)
goto out_writepage;
set_page_writeback(page);
@@ -790,35 +1395,52 @@ int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
* If current allocation needs SSR,
* it had better in-place writes for updated data.
*/
- if (unlikely(old_blkaddr != NEW_ADDR &&
- !is_cold_data(page) &&
- need_inplace_update(inode))) {
- rewrite_data_page(page, old_blkaddr, fio);
- set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE);
- } else {
- write_data_page(page, &dn, &new_blkaddr, fio);
- update_extent_cache(new_blkaddr, &dn);
- set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE);
+ if (ipu_force || (valid_ipu_blkaddr(fio) && need_inplace_update(fio))) {
+ f2fs_put_dnode(&dn);
+ if (fio->need_lock)
+ f2fs_unlock_op(fio->sbi);
+ err = rewrite_data_page(fio);
+ trace_f2fs_do_write_data_page(fio->page, IPU);
+ set_inode_flag(inode, FI_UPDATE_WRITE);
+ return err;
}
+
+ /* LFS mode write path */
+ write_data_page(&dn, fio);
+ trace_f2fs_do_write_data_page(page, OPU);
+ set_inode_flag(inode, FI_APPEND_WRITE);
+ if (page->index == 0)
+ set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
out_writepage:
f2fs_put_dnode(&dn);
+out:
+ if (fio->need_lock)
+ f2fs_unlock_op(fio->sbi);
return err;
}
-static int f2fs_write_data_page(struct page *page,
- struct writeback_control *wbc)
+static int __write_data_page(struct page *page, bool *submitted,
+ struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
loff_t i_size = i_size_read(inode);
const pgoff_t end_index = ((unsigned long long) i_size)
- >> PAGE_CACHE_SHIFT;
+ >> PAGE_SHIFT;
+ loff_t psize = (page->index + 1) << PAGE_SHIFT;
unsigned offset = 0;
bool need_balance_fs = false;
int err = 0;
struct f2fs_io_info fio = {
+ .sbi = sbi,
.type = DATA,
- .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
+ .op = REQ_OP_WRITE,
+ .op_flags = wbc_to_write_flags(wbc),
+ .old_blkaddr = NULL_ADDR,
+ .page = page,
+ .encrypted_page = NULL,
+ .submitted = false,
+ .need_lock = true,
};
trace_f2fs_writepage(page, DATA);
@@ -830,66 +1452,238 @@ static int f2fs_write_data_page(struct page *page,
* If the offset is out-of-range of file size,
* this page does not have to be written to disk.
*/
- offset = i_size & (PAGE_CACHE_SIZE - 1);
+ offset = i_size & (PAGE_SIZE - 1);
if ((page->index >= end_index + 1) || !offset)
goto out;
- zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+ zero_user_segment(page, offset, PAGE_SIZE);
write:
- if (unlikely(sbi->por_doing))
+ if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
+ goto redirty_out;
+ if (f2fs_is_drop_cache(inode))
+ goto out;
+ /* we should not write 0'th page having journal header */
+ if (f2fs_is_volatile_file(inode) && (!page->index ||
+ (!wbc->for_reclaim &&
+ available_free_memory(sbi, BASE_CHECK))))
goto redirty_out;
-
- /* Dentry blocks are controlled by checkpoint */
- if (S_ISDIR(inode->i_mode)) {
- if (unlikely(f2fs_cp_error(sbi)))
- goto redirty_out;
- err = do_write_data_page(page, &fio);
- goto done;
- }
/* we should bypass data pages to proceed the kworkder jobs */
if (unlikely(f2fs_cp_error(sbi))) {
- SetPageError(page);
- unlock_page(page);
+ mapping_set_error(page->mapping, -EIO);
goto out;
}
+ /* Dentry blocks are controlled by checkpoint */
+ if (S_ISDIR(inode->i_mode)) {
+ fio.need_lock = false;
+ err = do_write_data_page(&fio);
+ goto done;
+ }
+
if (!wbc->for_reclaim)
need_balance_fs = true;
- else if (has_not_enough_free_secs(sbi, 0))
+ else if (has_not_enough_free_secs(sbi, 0, 0))
goto redirty_out;
-
- f2fs_lock_op(sbi);
- if (f2fs_has_inline_data(inode) || f2fs_may_inline(inode))
- err = f2fs_write_inline_data(inode, page, offset);
else
- err = do_write_data_page(page, &fio);
- f2fs_unlock_op(sbi);
+ set_inode_flag(inode, FI_HOT_DATA);
+
+ err = -EAGAIN;
+ if (f2fs_has_inline_data(inode)) {
+ err = f2fs_write_inline_data(inode, page);
+ if (!err)
+ goto out;
+ }
+
+ if (err == -EAGAIN)
+ err = do_write_data_page(&fio);
+ if (F2FS_I(inode)->last_disk_size < psize)
+ F2FS_I(inode)->last_disk_size = psize;
+
done:
if (err && err != -ENOENT)
goto redirty_out;
- clear_cold_data(page);
out:
inode_dec_dirty_pages(inode);
+ if (err)
+ ClearPageUptodate(page);
+
+ if (wbc->for_reclaim) {
+ f2fs_submit_merged_bio_cond(sbi, inode, 0, page->index,
+ DATA, WRITE);
+ clear_inode_flag(inode, FI_HOT_DATA);
+ remove_dirty_inode(inode);
+ submitted = NULL;
+ }
+
unlock_page(page);
- if (need_balance_fs)
- f2fs_balance_fs(sbi);
- if (wbc->for_reclaim)
+ if (!S_ISDIR(inode->i_mode))
+ f2fs_balance_fs(sbi, need_balance_fs);
+
+ if (unlikely(f2fs_cp_error(sbi))) {
f2fs_submit_merged_bio(sbi, DATA, WRITE);
+ submitted = NULL;
+ }
+
+ if (submitted)
+ *submitted = fio.submitted;
+
return 0;
redirty_out:
redirty_page_for_writepage(wbc, page);
- return AOP_WRITEPAGE_ACTIVATE;
+ if (!err)
+ return AOP_WRITEPAGE_ACTIVATE;
+ unlock_page(page);
+ return err;
}
-static int __f2fs_writepage(struct page *page, struct writeback_control *wbc,
- void *data)
+static int f2fs_write_data_page(struct page *page,
+ struct writeback_control *wbc)
+{
+ return __write_data_page(page, NULL, wbc);
+}
+
+/*
+ * This function was copied from write_cche_pages from mm/page-writeback.c.
+ * The major change is making write step of cold data page separately from
+ * warm/hot data page.
+ */
+static int f2fs_write_cache_pages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- struct address_space *mapping = data;
- int ret = mapping->a_ops->writepage(page, wbc);
- mapping_set_error(mapping, ret);
+ int ret = 0;
+ int done = 0;
+ struct pagevec pvec;
+ int nr_pages;
+ pgoff_t uninitialized_var(writeback_index);
+ pgoff_t index;
+ pgoff_t end; /* Inclusive */
+ pgoff_t done_index;
+ pgoff_t last_idx = ULONG_MAX;
+ int cycled;
+ int range_whole = 0;
+ int tag;
+
+ pagevec_init(&pvec, 0);
+
+ if (get_dirty_pages(mapping->host) <=
+ SM_I(F2FS_M_SB(mapping))->min_hot_blocks)
+ set_inode_flag(mapping->host, FI_HOT_DATA);
+ else
+ clear_inode_flag(mapping->host, FI_HOT_DATA);
+
+ if (wbc->range_cyclic) {
+ writeback_index = mapping->writeback_index; /* prev offset */
+ index = writeback_index;
+ if (index == 0)
+ cycled = 1;
+ else
+ cycled = 0;
+ end = -1;
+ } else {
+ index = wbc->range_start >> PAGE_SHIFT;
+ end = wbc->range_end >> PAGE_SHIFT;
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = 1;
+ cycled = 1; /* ignore range_cyclic tests */
+ }
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+ tag = PAGECACHE_TAG_TOWRITE;
+ else
+ tag = PAGECACHE_TAG_DIRTY;
+retry:
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+ tag_pages_for_writeback(mapping, index, end);
+ done_index = index;
+ while (!done && (index <= end)) {
+ int i;
+
+ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1);
+ if (nr_pages == 0)
+ break;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+ bool submitted = false;
+
+ if (page->index > end) {
+ done = 1;
+ break;
+ }
+
+ done_index = page->index;
+
+ lock_page(page);
+
+ if (unlikely(page->mapping != mapping)) {
+continue_unlock:
+ unlock_page(page);
+ continue;
+ }
+
+ if (!PageDirty(page)) {
+ /* someone wrote it for us */
+ goto continue_unlock;
+ }
+
+ if (PageWriteback(page)) {
+ if (wbc->sync_mode != WB_SYNC_NONE)
+ f2fs_wait_on_page_writeback(page,
+ DATA, true);
+ else
+ goto continue_unlock;
+ }
+
+ BUG_ON(PageWriteback(page));
+ if (!clear_page_dirty_for_io(page))
+ goto continue_unlock;
+
+ ret = __write_data_page(page, &submitted, wbc);
+ if (unlikely(ret)) {
+ /*
+ * keep nr_to_write, since vfs uses this to
+ * get # of written pages.
+ */
+ if (ret == AOP_WRITEPAGE_ACTIVATE) {
+ unlock_page(page);
+ ret = 0;
+ continue;
+ }
+ done_index = page->index + 1;
+ done = 1;
+ break;
+ } else if (submitted) {
+ last_idx = page->index;
+ }
+
+ /* give a priority to WB_SYNC threads */
+ if ((atomic_read(&F2FS_M_SB(mapping)->wb_sync_req) ||
+ --wbc->nr_to_write <= 0) &&
+ wbc->sync_mode == WB_SYNC_NONE) {
+ done = 1;
+ break;
+ }
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+
+ if (!cycled && !done) {
+ cycled = 1;
+ index = 0;
+ end = writeback_index - 1;
+ goto retry;
+ }
+ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+ mapping->writeback_index = done_index;
+
+ if (last_idx != ULONG_MAX)
+ f2fs_submit_merged_bio_cond(F2FS_M_SB(mapping), mapping->host,
+ 0, last_idx, DATA, WRITE);
+
return ret;
}
@@ -898,51 +1692,143 @@ static int f2fs_write_data_pages(struct address_space *mapping,
{
struct inode *inode = mapping->host;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- bool locked = false;
+ struct blk_plug plug;
int ret;
- long diff;
-
- trace_f2fs_writepages(mapping->host, wbc, DATA);
/* deal with chardevs and other special file */
if (!mapping->a_ops->writepage)
return 0;
+ /* skip writing if there is no dirty page in this inode */
+ if (!get_dirty_pages(inode) && wbc->sync_mode == WB_SYNC_NONE)
+ return 0;
+
if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE &&
get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) &&
available_free_memory(sbi, DIRTY_DENTS))
goto skip_write;
- diff = nr_pages_to_write(sbi, DATA, wbc);
+ /* skip writing during file defragment */
+ if (is_inode_flag_set(inode, FI_DO_DEFRAG))
+ goto skip_write;
- if (!S_ISDIR(inode->i_mode)) {
- mutex_lock(&sbi->writepages);
- locked = true;
- }
- ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping);
- if (locked)
- mutex_unlock(&sbi->writepages);
+ /* during POR, we don't need to trigger writepage at all. */
+ if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
+ goto skip_write;
- f2fs_submit_merged_bio(sbi, DATA, WRITE);
+ trace_f2fs_writepages(mapping->host, wbc, DATA);
+
+ /* to avoid spliting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ atomic_inc(&sbi->wb_sync_req);
+ else if (atomic_read(&sbi->wb_sync_req))
+ goto skip_write;
- remove_dirty_dir_inode(inode);
+ blk_start_plug(&plug);
+ ret = f2fs_write_cache_pages(mapping, wbc);
+ blk_finish_plug(&plug);
+
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ atomic_dec(&sbi->wb_sync_req);
+ /*
+ * if some pages were truncated, we cannot guarantee its mapping->host
+ * to detect pending bios.
+ */
- wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff);
+ remove_dirty_inode(inode);
return ret;
skip_write:
wbc->pages_skipped += get_dirty_pages(inode);
+ trace_f2fs_writepages(mapping->host, wbc, DATA);
return 0;
}
static void f2fs_write_failed(struct address_space *mapping, loff_t to)
{
struct inode *inode = mapping->host;
+ loff_t i_size = i_size_read(inode);
+
+ if (to > i_size) {
+ truncate_pagecache(inode, i_size);
+ truncate_blocks(inode, i_size, true);
+ }
+}
- if (to > inode->i_size) {
- truncate_pagecache(inode, inode->i_size);
- truncate_blocks(inode, inode->i_size, true);
+static int prepare_write_begin(struct f2fs_sb_info *sbi,
+ struct page *page, loff_t pos, unsigned len,
+ block_t *blk_addr, bool *node_changed)
+{
+ struct inode *inode = page->mapping->host;
+ pgoff_t index = page->index;
+ struct dnode_of_data dn;
+ struct page *ipage;
+ bool locked = false;
+ struct extent_info ei = {0,0,0};
+ int err = 0;
+
+ /*
+ * we already allocated all the blocks, so we don't need to get
+ * the block addresses when there is no need to fill the page.
+ */
+ if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE)
+ return 0;
+
+ if (f2fs_has_inline_data(inode) ||
+ (pos & PAGE_MASK) >= i_size_read(inode)) {
+ __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
+ locked = true;
+ }
+restart:
+ /* check inline_data */
+ ipage = get_node_page(sbi, inode->i_ino);
+ if (IS_ERR(ipage)) {
+ err = PTR_ERR(ipage);
+ goto unlock_out;
+ }
+
+ set_new_dnode(&dn, inode, ipage, ipage, 0);
+
+ if (f2fs_has_inline_data(inode)) {
+ if (pos + len <= MAX_INLINE_DATA) {
+ read_inline_data(page, ipage);
+ set_inode_flag(inode, FI_DATA_EXIST);
+ if (inode->i_nlink)
+ set_inline_node(ipage);
+ } else {
+ err = f2fs_convert_inline_page(&dn, page);
+ if (err)
+ goto out;
+ if (dn.data_blkaddr == NULL_ADDR)
+ err = f2fs_get_block(&dn, index);
+ }
+ } else if (locked) {
+ err = f2fs_get_block(&dn, index);
+ } else {
+ if (f2fs_lookup_extent_cache(inode, index, &ei)) {
+ dn.data_blkaddr = ei.blk + index - ei.fofs;
+ } else {
+ /* hole case */
+ err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
+ if (err || dn.data_blkaddr == NULL_ADDR) {
+ f2fs_put_dnode(&dn);
+ __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO,
+ true);
+ locked = true;
+ goto restart;
+ }
+ }
}
+
+ /* convert_inline_page can make node_changed */
+ *blk_addr = dn.data_blkaddr;
+ *node_changed = dn.node_changed;
+out:
+ f2fs_put_dnode(&dn);
+unlock_out:
+ if (locked)
+ __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false);
+ return err;
}
static int f2fs_write_begin(struct file *file, struct address_space *mapping,
@@ -951,92 +1837,111 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
{
struct inode *inode = mapping->host;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct page *page;
- pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT;
- struct dnode_of_data dn;
+ struct page *page = NULL;
+ pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT;
+ bool need_balance = false;
+ block_t blkaddr = NULL_ADDR;
int err = 0;
+ if (trace_android_fs_datawrite_start_enabled()) {
+ char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+ path = android_fstrace_get_pathname(pathbuf,
+ MAX_TRACE_PATHBUF_LEN,
+ inode);
+ trace_android_fs_datawrite_start(inode, pos, len,
+ current->pid, path,
+ current->comm);
+ }
trace_f2fs_write_begin(inode, pos, len, flags);
- f2fs_balance_fs(sbi);
+ /*
+ * We should check this at this moment to avoid deadlock on inode page
+ * and #0 page. The locking rule for inline_data conversion should be:
+ * lock_page(page #0) -> lock_page(inode_page)
+ */
+ if (index != 0) {
+ err = f2fs_convert_inline_inode(inode);
+ if (err)
+ goto fail;
+ }
repeat:
- err = f2fs_convert_inline_data(inode, pos + len, NULL);
- if (err)
- goto fail;
-
- page = grab_cache_page_write_begin(mapping, index, flags);
+ /*
+ * Do not use grab_cache_page_write_begin() to avoid deadlock due to
+ * wait_for_stable_page. Will wait that below with our IO control.
+ */
+ page = grab_cache_page(mapping, index);
if (!page) {
err = -ENOMEM;
goto fail;
}
- /* to avoid latency during memory pressure */
- unlock_page(page);
-
*pagep = page;
- if (f2fs_has_inline_data(inode) && (pos + len) <= MAX_INLINE_DATA)
- goto inline_data;
-
- f2fs_lock_op(sbi);
- set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = f2fs_reserve_block(&dn, index);
- f2fs_unlock_op(sbi);
- if (err) {
- f2fs_put_page(page, 0);
+ err = prepare_write_begin(sbi, page, pos, len,
+ &blkaddr, &need_balance);
+ if (err)
goto fail;
- }
-inline_data:
- lock_page(page);
- if (unlikely(page->mapping != mapping)) {
- f2fs_put_page(page, 1);
- goto repeat;
+
+ if (need_balance && has_not_enough_free_secs(sbi, 0, 0)) {
+ unlock_page(page);
+ f2fs_balance_fs(sbi, true);
+ lock_page(page);
+ if (page->mapping != mapping) {
+ /* The page got truncated from under us */
+ f2fs_put_page(page, 1);
+ goto repeat;
+ }
}
- f2fs_wait_on_page_writeback(page, DATA);
+ f2fs_wait_on_page_writeback(page, DATA, false);
- if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
- return 0;
+ /* wait for GCed encrypted page writeback */
+ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
+ f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr);
- if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) {
- unsigned start = pos & (PAGE_CACHE_SIZE - 1);
- unsigned end = start + len;
+ if (len == PAGE_SIZE || PageUptodate(page))
+ return 0;
- /* Reading beyond i_size is simple: memset to zero */
- zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE);
- goto out;
+ if (!(pos & (PAGE_SIZE - 1)) && (pos + len) >= i_size_read(inode)) {
+ zero_user_segment(page, len, PAGE_SIZE);
+ return 0;
}
- if (f2fs_has_inline_data(inode)) {
- err = f2fs_read_inline_data(inode, page);
- if (err) {
- page_cache_release(page);
+ if (blkaddr == NEW_ADDR) {
+ zero_user_segment(page, 0, PAGE_SIZE);
+ SetPageUptodate(page);
+ } else {
+ struct bio *bio;
+
+ bio = f2fs_grab_bio(inode, blkaddr, 1);
+ if (IS_ERR(bio)) {
+ err = PTR_ERR(bio);
goto fail;
}
- } else if (dn.data_blkaddr == NEW_ADDR) {
- zero_user_segment(page, 0, PAGE_CACHE_SIZE);
- } else {
- err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
- READ_SYNC);
- if (err)
+ bio->bi_rw = REQ_OP_READ;
+ if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
+ bio_put(bio);
+ err = -EFAULT;
goto fail;
+ }
+
+ __submit_bio(sbi, bio, DATA);
lock_page(page);
- if (unlikely(!PageUptodate(page))) {
- f2fs_put_page(page, 1);
- err = -EIO;
- goto fail;
- }
if (unlikely(page->mapping != mapping)) {
f2fs_put_page(page, 1);
goto repeat;
}
+ if (unlikely(!PageUptodate(page))) {
+ err = -EIO;
+ goto fail;
+ }
}
-out:
- SetPageUptodate(page);
- clear_cold_data(page);
return 0;
+
fail:
+ f2fs_put_page(page, 1);
f2fs_write_failed(mapping, pos + len);
return err;
}
@@ -1048,31 +1953,38 @@ static int f2fs_write_end(struct file *file,
{
struct inode *inode = page->mapping->host;
+ trace_android_fs_datawrite_end(inode, pos, len);
trace_f2fs_write_end(inode, pos, len, copied);
- if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
- register_inmem_page(inode, page);
- else
- set_page_dirty(page);
-
- if (pos + copied > i_size_read(inode)) {
- i_size_write(inode, pos + copied);
- mark_inode_dirty(inode);
- update_inode_page(inode);
+ /*
+ * This should be come from len == PAGE_SIZE, and we expect copied
+ * should be PAGE_SIZE. Otherwise, we treat it with zero copied and
+ * let generic_perform_write() try to copy data again through copied=0.
+ */
+ if (!PageUptodate(page)) {
+ if (unlikely(copied != len))
+ copied = 0;
+ else
+ SetPageUptodate(page);
}
+ if (!copied)
+ goto unlock_out;
+
+ set_page_dirty(page);
+ if (pos + copied > i_size_read(inode))
+ f2fs_i_size_write(inode, pos + copied);
+unlock_out:
f2fs_put_page(page, 1);
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
return copied;
}
-static int check_direct_IO(struct inode *inode, int rw,
- struct iov_iter *iter, loff_t offset)
+static int check_direct_IO(struct inode *inode, struct iov_iter *iter,
+ loff_t offset)
{
unsigned blocksize_mask = inode->i_sb->s_blocksize - 1;
- if (rw == READ)
- return 0;
-
if (offset & blocksize_mask)
return -EINVAL;
@@ -1082,52 +1994,136 @@ static int check_direct_IO(struct inode *inode, int rw,
return 0;
}
-static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
- struct iov_iter *iter, loff_t offset)
+static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
+ loff_t offset)
{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
struct inode *inode = mapping->host;
size_t count = iov_iter_count(iter);
int err;
- /* Let buffer I/O handle the inline data case. */
- if (f2fs_has_inline_data(inode))
- return 0;
+ err = check_direct_IO(inode, iter, offset);
+ if (err)
+ return err;
- if (check_direct_IO(inode, rw, iter, offset))
+ if (__force_buffered_io(inode, rw))
return 0;
+ if (trace_android_fs_dataread_start_enabled() && (rw == READ)) {
+ char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+ path = android_fstrace_get_pathname(pathbuf,
+ MAX_TRACE_PATHBUF_LEN,
+ inode);
+ trace_android_fs_dataread_start(inode, offset,
+ count, current->pid, path,
+ current->comm);
+ }
+ if (trace_android_fs_datawrite_start_enabled() && (rw == WRITE)) {
+ char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+ path = android_fstrace_get_pathname(pathbuf,
+ MAX_TRACE_PATHBUF_LEN,
+ inode);
+ trace_android_fs_datawrite_start(inode, offset, count,
+ current->pid, path,
+ current->comm);
+ }
+
trace_f2fs_direct_IO_enter(inode, offset, count, rw);
- err = blockdev_direct_IO(rw, iocb, inode, iter, offset, get_data_block);
- if (err < 0 && (rw & WRITE))
- f2fs_write_failed(mapping, offset + count);
+ down_read(&F2FS_I(inode)->dio_rwsem[rw]);
+ err = blockdev_direct_IO(rw, iocb, inode, iter, offset, get_data_block_dio);
+ up_read(&F2FS_I(inode)->dio_rwsem[rw]);
+
+ if (rw & WRITE) {
+ if (err > 0)
+ set_inode_flag(inode, FI_UPDATE_WRITE);
+ else if (err < 0)
+ f2fs_write_failed(mapping, offset + count);
+ }
trace_f2fs_direct_IO_exit(inode, offset, count, rw, err);
+ if (trace_android_fs_dataread_start_enabled() && (rw == READ))
+ trace_android_fs_dataread_end(inode, offset, count);
+ if (trace_android_fs_datawrite_start_enabled() && (rw == WRITE))
+ trace_android_fs_datawrite_end(inode, offset, count);
+
return err;
}
-static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
- unsigned int length)
+void f2fs_invalidate_page(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct inode *inode = page->mapping->host;
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- if (offset % PAGE_CACHE_SIZE || length != PAGE_CACHE_SIZE)
+ if (inode->i_ino >= F2FS_ROOT_INO(sbi) &&
+ (offset % PAGE_SIZE || length != PAGE_SIZE))
return;
- if (PageDirty(page))
- inode_dec_dirty_pages(inode);
+ if (PageDirty(page)) {
+ if (inode->i_ino == F2FS_META_INO(sbi)) {
+ dec_page_count(sbi, F2FS_DIRTY_META);
+ } else if (inode->i_ino == F2FS_NODE_INO(sbi)) {
+ dec_page_count(sbi, F2FS_DIRTY_NODES);
+ } else {
+ inode_dec_dirty_pages(inode);
+ remove_dirty_inode(inode);
+ }
+ }
+
+ /* This is atomic written page, keep Private */
+ if (IS_ATOMIC_WRITTEN_PAGE(page))
+ return drop_inmem_page(inode, page);
+
+ set_page_private(page, 0);
ClearPagePrivate(page);
}
-static int f2fs_release_data_page(struct page *page, gfp_t wait)
+int f2fs_release_page(struct page *page, gfp_t wait)
{
+ /* If this is dirty page, keep PagePrivate */
+ if (PageDirty(page))
+ return 0;
+
+ /* This is atomic written page, keep Private */
+ if (IS_ATOMIC_WRITTEN_PAGE(page))
+ return 0;
+
+ set_page_private(page, 0);
ClearPagePrivate(page);
return 1;
}
+/*
+ * This was copied from __set_page_dirty_buffers which gives higher performance
+ * in very high speed storages. (e.g., pmem)
+ */
+void f2fs_set_page_dirty_nobuffers(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ unsigned long flags;
+
+ if (unlikely(!mapping))
+ return;
+
+ spin_lock(&mapping->private_lock);
+ SetPageDirty(page);
+ spin_unlock(&mapping->private_lock);
+
+ spin_lock_irqsave(&mapping->tree_lock, flags);
+ WARN_ON_ONCE(!PageUptodate(page));
+ account_page_dirtied(page, mapping);
+ radix_tree_tag_set(&mapping->page_tree,
+ page_index(page), PAGECACHE_TAG_DIRTY);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+ return;
+}
+
static int f2fs_set_data_page_dirty(struct page *page)
{
struct address_space *mapping = page->mapping;
@@ -1135,11 +2131,23 @@ static int f2fs_set_data_page_dirty(struct page *page)
trace_f2fs_set_page_dirty(page, DATA);
- SetPageUptodate(page);
- mark_inode_dirty(inode);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
+
+ if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) {
+ if (!IS_ATOMIC_WRITTEN_PAGE(page)) {
+ register_inmem_page(inode, page);
+ return 1;
+ }
+ /*
+ * Previously, this page has been registered, we just
+ * return here.
+ */
+ return 0;
+ }
if (!PageDirty(page)) {
- __set_page_dirty_nobuffers(page);
+ f2fs_set_page_dirty_nobuffers(page);
update_dirty_page(inode, page);
return 1;
}
@@ -1153,8 +2161,64 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
if (f2fs_has_inline_data(inode))
return 0;
- return generic_block_bmap(mapping, block, get_data_block);
+ /* make sure allocating whole blocks */
+ if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+ filemap_write_and_wait(mapping);
+
+ return generic_block_bmap(mapping, block, get_data_block_bmap);
+}
+
+#ifdef CONFIG_F2FS_MIGRATION
+#include <linux/migrate.h>
+
+int f2fs_migrate_page(struct address_space *mapping,
+ struct page *newpage, struct page *page, enum migrate_mode mode)
+{
+ int rc, extra_count;
+ struct f2fs_inode_info *fi = F2FS_I(mapping->host);
+ bool atomic_written = IS_ATOMIC_WRITTEN_PAGE(page);
+
+ BUG_ON(PageWriteback(page));
+
+ /* migrating an atomic written page is safe with the inmem_lock hold */
+ if (atomic_written && !mutex_trylock(&fi->inmem_lock))
+ return -EAGAIN;
+
+ /*
+ * A reference is expected if PagePrivate set when move mapping,
+ * however F2FS breaks this for maintaining dirty page counts when
+ * truncating pages. So here adjusting the 'extra_count' make it work.
+ */
+ extra_count = (atomic_written ? 1 : 0) - page_has_private(page);
+ rc = migrate_page_move_mapping(mapping, newpage,
+ page, NULL, mode, extra_count);
+ if (rc != MIGRATEPAGE_SUCCESS) {
+ if (atomic_written)
+ mutex_unlock(&fi->inmem_lock);
+ return rc;
+ }
+
+ if (atomic_written) {
+ struct inmem_pages *cur;
+ list_for_each_entry(cur, &fi->inmem_pages, list)
+ if (cur->page == page) {
+ cur->page = newpage;
+ break;
+ }
+ mutex_unlock(&fi->inmem_lock);
+ put_page(page);
+ get_page(newpage);
+ }
+
+ if (PagePrivate(page))
+ SetPagePrivate(newpage);
+ set_page_private(newpage, page_private(page));
+
+ migrate_page_copy(newpage, page);
+
+ return MIGRATEPAGE_SUCCESS;
}
+#endif
const struct address_space_operations f2fs_dblock_aops = {
.readpage = f2fs_read_data_page,
@@ -1164,8 +2228,11 @@ const struct address_space_operations f2fs_dblock_aops = {
.write_begin = f2fs_write_begin,
.write_end = f2fs_write_end,
.set_page_dirty = f2fs_set_data_page_dirty,
- .invalidatepage = f2fs_invalidate_data_page,
- .releasepage = f2fs_release_data_page,
+ .invalidatepage = f2fs_invalidate_page,
+ .releasepage = f2fs_release_page,
.direct_IO = f2fs_direct_IO,
.bmap = f2fs_bmap,
+#ifdef CONFIG_F2FS_MIGRATION
+ .migratepage = f2fs_migrate_page,
+#endif
};
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 8e53d80c181a..629c57024106 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -33,19 +33,57 @@ static void update_general_status(struct f2fs_sb_info *sbi)
int i;
/* validation check of the segment numbers */
- si->hit_ext = sbi->read_hit_ext;
- si->total_ext = sbi->total_hit_ext;
+ si->hit_largest = atomic64_read(&sbi->read_hit_largest);
+ si->hit_cached = atomic64_read(&sbi->read_hit_cached);
+ si->hit_rbtree = atomic64_read(&sbi->read_hit_rbtree);
+ si->hit_total = si->hit_largest + si->hit_cached + si->hit_rbtree;
+ si->total_ext = atomic64_read(&sbi->total_hit_ext);
+ si->ext_tree = atomic_read(&sbi->total_ext_tree);
+ si->zombie_tree = atomic_read(&sbi->total_zombie_tree);
+ si->ext_node = atomic_read(&sbi->total_ext_node);
si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES);
si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS);
- si->ndirty_dirs = sbi->n_dirty_dirs;
si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META);
+ si->ndirty_data = get_pages(sbi, F2FS_DIRTY_DATA);
+ si->ndirty_imeta = get_pages(sbi, F2FS_DIRTY_IMETA);
+ si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE];
+ si->ndirty_files = sbi->ndirty_inode[FILE_INODE];
+ si->ndirty_all = sbi->ndirty_inode[DIRTY_META];
+ si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES);
+ si->aw_cnt = atomic_read(&sbi->aw_cnt);
+ si->vw_cnt = atomic_read(&sbi->vw_cnt);
+ si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt);
+ si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt);
+ si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA);
+ si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA);
+ if (SM_I(sbi) && SM_I(sbi)->fcc_info) {
+ si->nr_flushed =
+ atomic_read(&SM_I(sbi)->fcc_info->issued_flush);
+ si->nr_flushing =
+ atomic_read(&SM_I(sbi)->fcc_info->issing_flush);
+ }
+ if (SM_I(sbi) && SM_I(sbi)->dcc_info) {
+ si->nr_discarded =
+ atomic_read(&SM_I(sbi)->dcc_info->issued_discard);
+ si->nr_discarding =
+ atomic_read(&SM_I(sbi)->dcc_info->issing_discard);
+ si->nr_discard_cmd =
+ atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt);
+ si->undiscard_blks = SM_I(sbi)->dcc_info->undiscard_blks;
+ }
si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg;
si->rsvd_segs = reserved_segments(sbi);
si->overp_segs = overprovision_segments(sbi);
si->valid_count = valid_user_blocks(sbi);
+ si->discard_blks = discard_blocks(sbi);
si->valid_node_count = valid_node_count(sbi);
si->valid_inode_count = valid_inode_count(sbi);
- si->inline_inode = sbi->inline_inode;
+ si->inline_xattr = atomic_read(&sbi->inline_xattr);
+ si->inline_inode = atomic_read(&sbi->inline_inode);
+ si->inline_dir = atomic_read(&sbi->inline_dir);
+ si->append = sbi->im[APPEND_INO].ino_num;
+ si->update = sbi->im[UPDATE_INO].ino_num;
+ si->orphans = sbi->im[ORPHAN_INO].ino_num;
si->utilization = utilization(sbi);
si->free_segs = free_segments(sbi);
@@ -55,8 +93,12 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->node_pages = NODE_MAPPING(sbi)->nrpages;
si->meta_pages = META_MAPPING(sbi)->nrpages;
si->nats = NM_I(sbi)->nat_cnt;
- si->sits = SIT_I(sbi)->dirty_sentries;
- si->fnids = NM_I(sbi)->fcnt;
+ si->dirty_nats = NM_I(sbi)->dirty_nat_cnt;
+ si->sits = MAIN_SEGS(sbi);
+ si->dirty_sits = SIT_I(sbi)->dirty_sentries;
+ si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID_LIST];
+ si->avail_nids = NM_I(sbi)->available_nids;
+ si->alloc_nids = NM_I(sbi)->nid_cnt[ALLOC_NID_LIST];
si->bg_gc = sbi->bg_gc;
si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg)
* 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg)
@@ -69,14 +111,16 @@ static void update_general_status(struct f2fs_sb_info *sbi)
for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_NODE; i++) {
struct curseg_info *curseg = CURSEG_I(sbi, i);
si->curseg[i] = curseg->segno;
- si->cursec[i] = curseg->segno / sbi->segs_per_sec;
- si->curzone[i] = si->cursec[i] / sbi->secs_per_zone;
+ si->cursec[i] = GET_SEC_FROM_SEG(sbi, curseg->segno);
+ si->curzone[i] = GET_ZONE_FROM_SEC(sbi, si->cursec[i]);
}
for (i = 0; i < 2; i++) {
si->segment_count[i] = sbi->segment_count[i];
si->block_count[i] = sbi->block_count[i];
}
+
+ si->inplace_count = atomic_read(&sbi->inplace_count);
}
/*
@@ -85,16 +129,17 @@ static void update_general_status(struct f2fs_sb_info *sbi)
static void update_sit_info(struct f2fs_sb_info *sbi)
{
struct f2fs_stat_info *si = F2FS_STAT(sbi);
- unsigned int blks_per_sec, hblks_per_sec, total_vblocks, bimodal, dist;
+ unsigned long long blks_per_sec, hblks_per_sec, total_vblocks;
+ unsigned long long bimodal, dist;
unsigned int segno, vblocks;
int ndirty = 0;
bimodal = 0;
total_vblocks = 0;
- blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg);
+ blks_per_sec = BLKS_PER_SEC(sbi);
hblks_per_sec = blks_per_sec / 2;
for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
- vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
+ vblocks = get_valid_blocks(sbi, segno, true);
dist = abs(vblocks - hblks_per_sec);
bimodal += dist * dist;
@@ -103,10 +148,10 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
ndirty++;
}
}
- dist = MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100;
- si->bimodal = bimodal / dist;
+ dist = div_u64(MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec, 100);
+ si->bimodal = div64_u64(bimodal, dist);
if (si->dirty_count)
- si->avg_vblocks = total_vblocks / ndirty;
+ si->avg_vblocks = div_u64(total_vblocks, ndirty);
else
si->avg_vblocks = 0;
}
@@ -118,13 +163,19 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
{
struct f2fs_stat_info *si = F2FS_STAT(sbi);
unsigned npages;
+ int i;
if (si->base_mem)
goto get_cache;
- si->base_mem = sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize;
+ /* build stat */
+ si->base_mem = sizeof(struct f2fs_stat_info);
+
+ /* build superblock */
+ si->base_mem += sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize;
si->base_mem += 2 * sizeof(struct f2fs_inode_info);
si->base_mem += sizeof(*sbi->ckpt);
+ si->base_mem += sizeof(struct percpu_counter) * NR_COUNT_TYPE;
/* build sm */
si->base_mem += sizeof(struct f2fs_sm_info);
@@ -134,6 +185,9 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry);
si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi));
si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
+ if (f2fs_discard_en(sbi))
+ si->base_mem += SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
+ si->base_mem += SIT_VBLOCK_MAP_SIZE;
if (sbi->segs_per_sec > 1)
si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry);
si->base_mem += __bitmap_size(sbi, SIT_BITMAP);
@@ -145,7 +199,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
/* build curseg */
si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE;
- si->base_mem += PAGE_CACHE_SIZE * NR_CURSEG_TYPE;
+ si->base_mem += PAGE_SIZE * NR_CURSEG_TYPE;
/* build dirty segmap */
si->base_mem += sizeof(struct dirty_seglist_info);
@@ -155,20 +209,47 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
/* build nm */
si->base_mem += sizeof(struct f2fs_nm_info);
si->base_mem += __bitmap_size(sbi, NAT_BITMAP);
+ si->base_mem += (NM_I(sbi)->nat_bits_blocks << F2FS_BLKSIZE_BITS);
+ si->base_mem += NM_I(sbi)->nat_blocks * NAT_ENTRY_BITMAP_SIZE;
+ si->base_mem += NM_I(sbi)->nat_blocks / 8;
+ si->base_mem += NM_I(sbi)->nat_blocks * sizeof(unsigned short);
+
+get_cache:
+ si->cache_mem = 0;
/* build gc */
- si->base_mem += sizeof(struct f2fs_gc_kthread);
+ if (sbi->gc_thread)
+ si->cache_mem += sizeof(struct f2fs_gc_kthread);
+
+ /* build merge flush thread */
+ if (SM_I(sbi)->fcc_info)
+ si->cache_mem += sizeof(struct flush_cmd_control);
+ if (SM_I(sbi)->dcc_info) {
+ si->cache_mem += sizeof(struct discard_cmd_control);
+ si->cache_mem += sizeof(struct discard_cmd) *
+ atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt);
+ }
-get_cache:
/* free nids */
- si->cache_mem = NM_I(sbi)->fcnt;
- si->cache_mem += NM_I(sbi)->nat_cnt;
+ si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID_LIST] +
+ NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]) *
+ sizeof(struct free_nid);
+ si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry);
+ si->cache_mem += NM_I(sbi)->dirty_nat_cnt *
+ sizeof(struct nat_entry_set);
+ si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages);
+ for (i = 0; i <= ORPHAN_INO; i++)
+ si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry);
+ si->cache_mem += atomic_read(&sbi->total_ext_tree) *
+ sizeof(struct extent_tree);
+ si->cache_mem += atomic_read(&sbi->total_ext_node) *
+ sizeof(struct extent_node);
+
+ si->page_mem = 0;
npages = NODE_MAPPING(sbi)->nrpages;
- si->cache_mem += npages << PAGE_CACHE_SHIFT;
+ si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
npages = META_MAPPING(sbi)->nrpages;
- si->cache_mem += npages << PAGE_CACHE_SHIFT;
- si->cache_mem += sbi->n_orphans * sizeof(struct ino_entry);
- si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry);
+ si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
}
static int stat_show(struct seq_file *s, void *v)
@@ -183,23 +264,35 @@ static int stat_show(struct seq_file *s, void *v)
update_general_status(si->sbi);
- seq_printf(s, "\n=====[ partition info(%s). #%d ]=====\n",
- bdevname(si->sbi->sb->s_bdev, devname), i++);
+ seq_printf(s, "\n=====[ partition info(%s). #%d, %s]=====\n",
+ bdevname(si->sbi->sb->s_bdev, devname), i++,
+ f2fs_readonly(si->sbi->sb) ? "RO": "RW");
seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ",
si->sit_area_segs, si->nat_area_segs);
seq_printf(s, "[SSA: %d] [MAIN: %d",
si->ssa_area_segs, si->main_area_segs);
seq_printf(s, "(OverProv:%d Resv:%d)]\n\n",
si->overp_segs, si->rsvd_segs);
- seq_printf(s, "Utilization: %d%% (%d valid blocks)\n",
- si->utilization, si->valid_count);
+ if (test_opt(si->sbi, DISCARD))
+ seq_printf(s, "Utilization: %u%% (%u valid blocks, %u discard blocks)\n",
+ si->utilization, si->valid_count, si->discard_blks);
+ else
+ seq_printf(s, "Utilization: %u%% (%u valid blocks)\n",
+ si->utilization, si->valid_count);
+
seq_printf(s, " - Node: %u (Inode: %u, ",
si->valid_node_count, si->valid_inode_count);
seq_printf(s, "Other: %u)\n - Data: %u\n",
si->valid_node_count - si->valid_inode_count,
si->valid_count - si->valid_node_count);
+ seq_printf(s, " - Inline_xattr Inode: %u\n",
+ si->inline_xattr);
seq_printf(s, " - Inline_data Inode: %u\n",
si->inline_inode);
+ seq_printf(s, " - Inline_dentry Inode: %u\n",
+ si->inline_dir);
+ seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n",
+ si->orphans, si->append, si->update);
seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n",
si->main_area_segs, si->main_area_sections,
si->main_area_zones);
@@ -233,27 +326,55 @@ static int stat_show(struct seq_file *s, void *v)
si->dirty_count);
seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n",
si->prefree_count, si->free_segs, si->free_secs);
- seq_printf(s, "CP calls: %d\n", si->cp_count);
+ seq_printf(s, "CP calls: %d (BG: %d)\n",
+ si->cp_count, si->bg_cp_count);
seq_printf(s, "GC calls: %d (BG: %d)\n",
si->call_count, si->bg_gc);
- seq_printf(s, " - data segments : %d\n", si->data_segs);
- seq_printf(s, " - node segments : %d\n", si->node_segs);
- seq_printf(s, "Try to move %d blocks\n", si->tot_blks);
- seq_printf(s, " - data blocks : %d\n", si->data_blks);
- seq_printf(s, " - node blocks : %d\n", si->node_blks);
- seq_printf(s, "\nExtent Hit Ratio: %d / %d\n",
- si->hit_ext, si->total_ext);
+ seq_printf(s, " - data segments : %d (%d)\n",
+ si->data_segs, si->bg_data_segs);
+ seq_printf(s, " - node segments : %d (%d)\n",
+ si->node_segs, si->bg_node_segs);
+ seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks,
+ si->bg_data_blks + si->bg_node_blks);
+ seq_printf(s, " - data blocks : %d (%d)\n", si->data_blks,
+ si->bg_data_blks);
+ seq_printf(s, " - node blocks : %d (%d)\n", si->node_blks,
+ si->bg_node_blks);
+ seq_puts(s, "\nExtent Cache:\n");
+ seq_printf(s, " - Hit Count: L1-1:%llu L1-2:%llu L2:%llu\n",
+ si->hit_largest, si->hit_cached,
+ si->hit_rbtree);
+ seq_printf(s, " - Hit Ratio: %llu%% (%llu / %llu)\n",
+ !si->total_ext ? 0 :
+ div64_u64(si->hit_total * 100, si->total_ext),
+ si->hit_total, si->total_ext);
+ seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n",
+ si->ext_tree, si->zombie_tree, si->ext_node);
seq_puts(s, "\nBalancing F2FS Async:\n");
+ seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: (%4d %4d), "
+ "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n",
+ si->nr_wb_cp_data, si->nr_wb_data,
+ si->nr_flushing, si->nr_flushed,
+ si->nr_discarding, si->nr_discarded,
+ si->nr_discard_cmd, si->undiscard_blks);
+ seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d), "
+ "volatile IO: %4d (Max. %4d)\n",
+ si->inmem_pages, si->aw_cnt, si->max_aw_cnt,
+ si->vw_cnt, si->max_vw_cnt);
seq_printf(s, " - nodes: %4d in %4d\n",
si->ndirty_node, si->node_pages);
- seq_printf(s, " - dents: %4d in dirs:%4d\n",
- si->ndirty_dent, si->ndirty_dirs);
+ seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n",
+ si->ndirty_dent, si->ndirty_dirs, si->ndirty_all);
+ seq_printf(s, " - datas: %4d in files:%4d\n",
+ si->ndirty_data, si->ndirty_files);
seq_printf(s, " - meta: %4d in %4d\n",
si->ndirty_meta, si->meta_pages);
- seq_printf(s, " - NATs: %9d\n - SITs: %9d\n",
- si->nats, si->sits);
- seq_printf(s, " - free_nids: %9d\n",
- si->fnids);
+ seq_printf(s, " - imeta: %4d\n",
+ si->ndirty_imeta);
+ seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n",
+ si->dirty_nats, si->nats, si->dirty_sits, si->sits);
+ seq_printf(s, " - free_nids: %9d/%9d\n - alloc_nids: %9d\n",
+ si->free_nids, si->avail_nids, si->alloc_nids);
seq_puts(s, "\nDistribution of User Blocks:");
seq_puts(s, " [ valid | invalid | free ]\n");
seq_puts(s, " [");
@@ -269,6 +390,7 @@ static int stat_show(struct seq_file *s, void *v)
for (j = 0; j < si->util_free; j++)
seq_putc(s, '-');
seq_puts(s, "]\n\n");
+ seq_printf(s, "IPU: %u blocks\n", si->inplace_count);
seq_printf(s, "SSR: %u blocks in %u segments\n",
si->block_count[SSR], si->segment_count[SSR]);
seq_printf(s, "LFS: %u blocks in %u segments\n",
@@ -281,9 +403,14 @@ static int stat_show(struct seq_file *s, void *v)
/* memory footprint */
update_mem_info(si->sbi);
- seq_printf(s, "\nMemory: %u KB = static: %u + cached: %u\n",
- (si->base_mem + si->cache_mem) >> 10,
- si->base_mem >> 10, si->cache_mem >> 10);
+ seq_printf(s, "\nMemory: %llu KB\n",
+ (si->base_mem + si->cache_mem + si->page_mem) >> 10);
+ seq_printf(s, " - static: %llu KB\n",
+ si->base_mem >> 10);
+ seq_printf(s, " - cached: %llu KB\n",
+ si->cache_mem >> 10);
+ seq_printf(s, " - paged : %llu KB\n",
+ si->page_mem >> 10);
}
mutex_unlock(&f2fs_stat_mutex);
return 0;
@@ -322,6 +449,21 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
si->sbi = sbi;
sbi->stat_info = si;
+ atomic64_set(&sbi->total_hit_ext, 0);
+ atomic64_set(&sbi->read_hit_rbtree, 0);
+ atomic64_set(&sbi->read_hit_largest, 0);
+ atomic64_set(&sbi->read_hit_cached, 0);
+
+ atomic_set(&sbi->inline_xattr, 0);
+ atomic_set(&sbi->inline_inode, 0);
+ atomic_set(&sbi->inline_dir, 0);
+ atomic_set(&sbi->inplace_count, 0);
+
+ atomic_set(&sbi->aw_cnt, 0);
+ atomic_set(&sbi->vw_cnt, 0);
+ atomic_set(&sbi->max_aw_cnt, 0);
+ atomic_set(&sbi->max_vw_cnt, 0);
+
mutex_lock(&f2fs_stat_mutex);
list_add_tail(&si->stat_list, &f2fs_stat_list);
mutex_unlock(&f2fs_stat_mutex);
@@ -340,20 +482,23 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi)
kfree(si);
}
-void __init f2fs_create_root_stats(void)
+int __init f2fs_create_root_stats(void)
{
struct dentry *file;
f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL);
if (!f2fs_debugfs_root)
- return;
+ return -ENOMEM;
file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root,
NULL, &stat_fops);
if (!file) {
debugfs_remove(f2fs_debugfs_root);
f2fs_debugfs_root = NULL;
+ return -ENOMEM;
}
+
+ return 0;
}
void f2fs_destroy_root_stats(void)
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index b54f87149c09..f0333d1ef222 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -17,8 +17,8 @@
static unsigned long dir_blocks(struct inode *inode)
{
- return ((unsigned long long) (i_size_read(inode) + PAGE_CACHE_SIZE - 1))
- >> PAGE_CACHE_SHIFT;
+ return ((unsigned long long) (i_size_read(inode) + PAGE_SIZE - 1))
+ >> PAGE_SHIFT;
}
static unsigned int dir_buckets(unsigned int level, int dir_level)
@@ -48,7 +48,6 @@ static unsigned char f2fs_filetype_table[F2FS_FT_MAX] = {
[F2FS_FT_SYMLINK] = DT_LNK,
};
-#define S_SHIFT 12
static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = {
[S_IFREG >> S_SHIFT] = F2FS_FT_REG_FILE,
[S_IFDIR >> S_SHIFT] = F2FS_FT_DIR,
@@ -59,12 +58,18 @@ static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = {
[S_IFLNK >> S_SHIFT] = F2FS_FT_SYMLINK,
};
-static void set_de_type(struct f2fs_dir_entry *de, struct inode *inode)
+void set_de_type(struct f2fs_dir_entry *de, umode_t mode)
{
- umode_t mode = inode->i_mode;
de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
}
+unsigned char get_de_type(struct f2fs_dir_entry *de)
+{
+ if (de->file_type < F2FS_FT_MAX)
+ return f2fs_filetype_table[de->file_type];
+ return DT_UNKNOWN;
+}
+
static unsigned long dir_block_index(unsigned int level,
int dir_level, unsigned int idx)
{
@@ -77,81 +82,85 @@ static unsigned long dir_block_index(unsigned int level,
return bidx;
}
-static bool early_match_name(size_t namelen, f2fs_hash_t namehash,
- struct f2fs_dir_entry *de)
+static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
+ struct fscrypt_name *fname,
+ f2fs_hash_t namehash,
+ int *max_slots,
+ struct page **res_page)
{
- if (le16_to_cpu(de->name_len) != namelen)
- return false;
+ struct f2fs_dentry_block *dentry_blk;
+ struct f2fs_dir_entry *de;
+ struct f2fs_dentry_ptr d;
- if (de->hash_code != namehash)
- return false;
+ dentry_blk = (struct f2fs_dentry_block *)kmap(dentry_page);
- return true;
+ make_dentry_ptr_block(NULL, &d, dentry_blk);
+ de = find_target_dentry(fname, namehash, max_slots, &d);
+ if (de)
+ *res_page = dentry_page;
+ else
+ kunmap(dentry_page);
+
+ return de;
}
-static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
- struct qstr *name, int *max_slots,
- f2fs_hash_t namehash, struct page **res_page)
+struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname,
+ f2fs_hash_t namehash, int *max_slots,
+ struct f2fs_dentry_ptr *d)
{
struct f2fs_dir_entry *de;
unsigned long bit_pos = 0;
- struct f2fs_dentry_block *dentry_blk = kmap(dentry_page);
- const void *dentry_bits = &dentry_blk->dentry_bitmap;
int max_len = 0;
- while (bit_pos < NR_DENTRY_IN_BLOCK) {
- if (!test_bit_le(bit_pos, dentry_bits)) {
- if (bit_pos == 0)
- max_len = 1;
- else if (!test_bit_le(bit_pos - 1, dentry_bits))
- max_len++;
+ if (max_slots)
+ *max_slots = 0;
+ while (bit_pos < d->max) {
+ if (!test_bit_le(bit_pos, d->bitmap)) {
bit_pos++;
+ max_len++;
continue;
}
- de = &dentry_blk->dentry[bit_pos];
- if (early_match_name(name->len, namehash, de)) {
- if (!memcmp(dentry_blk->filename[bit_pos],
- name->name,
- name->len)) {
- *res_page = dentry_page;
- goto found;
- }
- }
- if (max_len > *max_slots) {
- *max_slots = max_len;
- max_len = 0;
+
+ de = &d->dentry[bit_pos];
+
+ if (unlikely(!de->name_len)) {
+ bit_pos++;
+ continue;
}
- /*
- * For the most part, it should be a bug when name_len is zero.
- * We stop here for figuring out where the bugs has occurred.
- */
- f2fs_bug_on(F2FS_P_SB(dentry_page), !de->name_len);
+ if (de->hash_code == namehash &&
+ fscrypt_match_name(fname, d->filename[bit_pos],
+ le16_to_cpu(de->name_len)))
+ goto found;
+
+ if (max_slots && max_len > *max_slots)
+ *max_slots = max_len;
+ max_len = 0;
bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
}
de = NULL;
- kunmap(dentry_page);
found:
- if (max_len > *max_slots)
+ if (max_slots && max_len > *max_slots)
*max_slots = max_len;
return de;
}
static struct f2fs_dir_entry *find_in_level(struct inode *dir,
- unsigned int level, struct qstr *name,
- f2fs_hash_t namehash, struct page **res_page)
+ unsigned int level,
+ struct fscrypt_name *fname,
+ struct page **res_page)
{
- int s = GET_DENTRY_SLOTS(name->len);
+ struct qstr name = FSTR_TO_QSTR(&fname->disk_name);
+ int s = GET_DENTRY_SLOTS(name.len);
unsigned int nbucket, nblock;
unsigned int bidx, end_block;
struct page *dentry_page;
struct f2fs_dir_entry *de = NULL;
bool room = false;
- int max_slots = 0;
-
- f2fs_bug_on(F2FS_I_SB(dir), level > MAX_DIR_HASH_DEPTH);
+ int max_slots;
+ f2fs_hash_t namehash = f2fs_dentry_hash(&name, fname);
nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level);
nblock = bucket_blocks(level);
@@ -162,14 +171,19 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
for (; bidx < end_block; bidx++) {
/* no need to allocate new dentry pages to all the indices */
- dentry_page = find_data_page(dir, bidx, true);
+ dentry_page = find_data_page(dir, bidx);
if (IS_ERR(dentry_page)) {
- room = true;
- continue;
+ if (PTR_ERR(dentry_page) == -ENOENT) {
+ room = true;
+ continue;
+ } else {
+ *res_page = dentry_page;
+ break;
+ }
}
- de = find_in_block(dentry_page, name, &max_slots,
- namehash, res_page);
+ de = find_in_block(dentry_page, fname, namehash, &max_slots,
+ res_page);
if (de)
break;
@@ -186,69 +200,93 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
return de;
}
-/*
- * Find an entry in the specified directory with the wanted name.
- * It returns the page where the entry was found (as a parameter - res_page),
- * and the entry itself. Page is returned mapped and unlocked.
- * Entry is guaranteed to be valid.
- */
-struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
- struct qstr *child, struct page **res_page)
+struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir,
+ struct fscrypt_name *fname, struct page **res_page)
{
unsigned long npages = dir_blocks(dir);
struct f2fs_dir_entry *de = NULL;
- f2fs_hash_t name_hash;
unsigned int max_depth;
unsigned int level;
- if (npages == 0)
- return NULL;
+ if (f2fs_has_inline_dentry(dir)) {
+ *res_page = NULL;
+ de = find_in_inline_dir(dir, fname, res_page);
+ goto out;
+ }
- *res_page = NULL;
+ if (npages == 0) {
+ *res_page = NULL;
+ goto out;
+ }
- name_hash = f2fs_dentry_hash(child);
max_depth = F2FS_I(dir)->i_current_depth;
+ if (unlikely(max_depth > MAX_DIR_HASH_DEPTH)) {
+ f2fs_msg(F2FS_I_SB(dir)->sb, KERN_WARNING,
+ "Corrupted max_depth of %lu: %u",
+ dir->i_ino, max_depth);
+ max_depth = MAX_DIR_HASH_DEPTH;
+ f2fs_i_depth_write(dir, max_depth);
+ }
for (level = 0; level < max_depth; level++) {
- de = find_in_level(dir, level, child, name_hash, res_page);
- if (de)
+ *res_page = NULL;
+ de = find_in_level(dir, level, fname, res_page);
+ if (de || IS_ERR(*res_page))
break;
}
- if (!de && F2FS_I(dir)->chash != name_hash) {
- F2FS_I(dir)->chash = name_hash;
- F2FS_I(dir)->clevel = level - 1;
- }
+out:
+ /* This is to increase the speed of f2fs_create */
+ if (!de)
+ F2FS_I(dir)->task = current;
return de;
}
-struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p)
+/*
+ * Find an entry in the specified directory with the wanted name.
+ * It returns the page where the entry was found (as a parameter - res_page),
+ * and the entry itself. Page is returned mapped and unlocked.
+ * Entry is guaranteed to be valid.
+ */
+struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
+ struct qstr *child, struct page **res_page)
{
- struct page *page;
- struct f2fs_dir_entry *de;
- struct f2fs_dentry_block *dentry_blk;
+ struct f2fs_dir_entry *de = NULL;
+ struct fscrypt_name fname;
+ int err;
- page = get_lock_data_page(dir, 0);
- if (IS_ERR(page))
+ err = fscrypt_setup_filename(dir, child, 1, &fname);
+ if (err) {
+ if (err == -ENOENT)
+ *res_page = NULL;
+ else
+ *res_page = ERR_PTR(err);
return NULL;
+ }
- dentry_blk = kmap(page);
- de = &dentry_blk->dentry[1];
- *p = page;
- unlock_page(page);
+ de = __f2fs_find_entry(dir, &fname, res_page);
+
+ fscrypt_free_filename(&fname);
return de;
}
-ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr)
+struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p)
+{
+ struct qstr dotdot = QSTR_INIT("..", 2);
+
+ return f2fs_find_entry(dir, &dotdot, p);
+}
+
+ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr,
+ struct page **page)
{
ino_t res = 0;
struct f2fs_dir_entry *de;
- struct page *page;
- de = f2fs_find_entry(dir, qstr, &page);
+ de = f2fs_find_entry(dir, qstr, page);
if (de) {
res = le32_to_cpu(de->ino);
- kunmap(page);
- f2fs_put_page(page, 0);
+ f2fs_dentry_kunmap(dir, *page);
+ f2fs_put_page(*page, 0);
}
return res;
@@ -257,15 +295,16 @@ ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr)
void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
struct page *page, struct inode *inode)
{
+ enum page_type type = f2fs_has_inline_dentry(dir) ? NODE : DATA;
lock_page(page);
- f2fs_wait_on_page_writeback(page, DATA);
+ f2fs_wait_on_page_writeback(page, type, true);
de->ino = cpu_to_le32(inode->i_ino);
- set_de_type(de, inode);
- kunmap(page);
+ set_de_type(de, inode->i_mode);
+ f2fs_dentry_kunmap(dir, page);
set_page_dirty(page);
- dir->i_mtime = dir->i_ctime = CURRENT_TIME;
- mark_inode_dirty(dir);
+ dir->i_mtime = dir->i_ctime = current_time(dir);
+ f2fs_mark_inode_dirty_sync(dir, false);
f2fs_put_page(page, 1);
}
@@ -273,7 +312,7 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage)
{
struct f2fs_inode *ri;
- f2fs_wait_on_page_writeback(ipage, NODE);
+ f2fs_wait_on_page_writeback(ipage, NODE, true);
/* copy name info. to this inode page */
ri = F2FS_INODE(ipage);
@@ -282,18 +321,17 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage)
set_page_dirty(ipage);
}
-int update_dent_inode(struct inode *inode, const struct qstr *name)
+void do_make_empty_dir(struct inode *inode, struct inode *parent,
+ struct f2fs_dentry_ptr *d)
{
- struct page *page;
+ struct qstr dot = QSTR_INIT(".", 1);
+ struct qstr dotdot = QSTR_INIT("..", 2);
- page = get_node_page(F2FS_I_SB(inode), inode->i_ino);
- if (IS_ERR(page))
- return PTR_ERR(page);
+ /* update dirent of "." */
+ f2fs_update_dentry(inode->i_ino, inode->i_mode, d, &dot, 0, 0);
- init_dent_inode(name, page);
- f2fs_put_page(page, 1);
-
- return 0;
+ /* update dirent of ".." */
+ f2fs_update_dentry(parent->i_ino, parent->i_mode, d, &dotdot, 0, 1);
}
static int make_empty_dir(struct inode *inode,
@@ -301,31 +339,20 @@ static int make_empty_dir(struct inode *inode,
{
struct page *dentry_page;
struct f2fs_dentry_block *dentry_blk;
- struct f2fs_dir_entry *de;
+ struct f2fs_dentry_ptr d;
+
+ if (f2fs_has_inline_dentry(inode))
+ return make_empty_inline_dir(inode, parent, page);
dentry_page = get_new_data_page(inode, page, 0, true);
if (IS_ERR(dentry_page))
return PTR_ERR(dentry_page);
-
dentry_blk = kmap_atomic(dentry_page);
- de = &dentry_blk->dentry[0];
- de->name_len = cpu_to_le16(1);
- de->hash_code = 0;
- de->ino = cpu_to_le32(inode->i_ino);
- memcpy(dentry_blk->filename[0], ".", 1);
- set_de_type(de, inode);
-
- de = &dentry_blk->dentry[1];
- de->hash_code = 0;
- de->name_len = cpu_to_le16(2);
- de->ino = cpu_to_le32(parent->i_ino);
- memcpy(dentry_blk->filename[1], "..", 2);
- set_de_type(de, inode);
-
- test_and_set_bit_le(0, &dentry_blk->dentry_bitmap);
- test_and_set_bit_le(1, &dentry_blk->dentry_bitmap);
+ make_dentry_ptr_block(NULL, &d, dentry_blk);
+ do_make_empty_dir(inode, parent, &d);
+
kunmap_atomic(dentry_blk);
set_page_dirty(dentry_page);
@@ -333,30 +360,42 @@ static int make_empty_dir(struct inode *inode,
return 0;
}
-static struct page *init_inode_metadata(struct inode *inode,
- struct inode *dir, const struct qstr *name)
+struct page *init_inode_metadata(struct inode *inode, struct inode *dir,
+ const struct qstr *new_name, const struct qstr *orig_name,
+ struct page *dpage)
{
struct page *page;
int err;
- if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
+ if (is_inode_flag_set(inode, FI_NEW_INODE)) {
page = new_inode_page(inode);
if (IS_ERR(page))
return page;
if (S_ISDIR(inode->i_mode)) {
+ /* in order to handle error case */
+ get_page(page);
err = make_empty_dir(inode, dir, page);
- if (err)
- goto error;
+ if (err) {
+ lock_page(page);
+ goto put_error;
+ }
+ put_page(page);
}
- err = f2fs_init_acl(inode, dir, page);
+ err = f2fs_init_acl(inode, dir, page, dpage);
if (err)
goto put_error;
- err = f2fs_init_security(inode, dir, name, page);
+ err = f2fs_init_security(inode, dir, orig_name, page);
if (err)
goto put_error;
+
+ if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) {
+ err = fscrypt_inherit_context(dir, inode, page, false);
+ if (err)
+ goto put_error;
+ }
} else {
page = get_node_page(F2FS_I_SB(dir), inode->i_ino);
if (IS_ERR(page))
@@ -365,14 +404,17 @@ static struct page *init_inode_metadata(struct inode *inode,
set_cold_node(inode, page);
}
- if (name)
- init_dent_inode(name, page);
+ if (new_name) {
+ init_dent_inode(new_name, page);
+ if (f2fs_encrypted_inode(dir))
+ file_set_enc_name(inode);
+ }
/*
* This file should be checkpointed during fsync.
* We lost i_pino from now on.
*/
- if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) {
+ if (is_inode_flag_set(inode, FI_INC_LINK)) {
file_lost_pino(inode);
/*
* If link the tmpfile to alias through linkat path,
@@ -380,91 +422,97 @@ static struct page *init_inode_metadata(struct inode *inode,
*/
if (inode->i_nlink == 0)
remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino);
- inc_nlink(inode);
+ f2fs_i_links_write(inode, true);
}
return page;
put_error:
+ clear_nlink(inode);
+ update_inode(inode, page);
f2fs_put_page(page, 1);
-error:
- /* once the failed inode becomes a bad inode, i_mode is S_IFREG */
- truncate_inode_pages(&inode->i_data, 0);
- truncate_blocks(inode, 0, false);
- remove_dirty_dir_inode(inode);
- remove_inode_page(inode);
return ERR_PTR(err);
}
-static void update_parent_metadata(struct inode *dir, struct inode *inode,
+void update_parent_metadata(struct inode *dir, struct inode *inode,
unsigned int current_depth)
{
- if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
- if (S_ISDIR(inode->i_mode)) {
- inc_nlink(dir);
- set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
- }
- clear_inode_flag(F2FS_I(inode), FI_NEW_INODE);
+ if (inode && is_inode_flag_set(inode, FI_NEW_INODE)) {
+ if (S_ISDIR(inode->i_mode))
+ f2fs_i_links_write(dir, true);
+ clear_inode_flag(inode, FI_NEW_INODE);
}
- dir->i_mtime = dir->i_ctime = CURRENT_TIME;
- mark_inode_dirty(dir);
+ dir->i_mtime = dir->i_ctime = current_time(dir);
+ f2fs_mark_inode_dirty_sync(dir, false);
- if (F2FS_I(dir)->i_current_depth != current_depth) {
- F2FS_I(dir)->i_current_depth = current_depth;
- set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
- }
+ if (F2FS_I(dir)->i_current_depth != current_depth)
+ f2fs_i_depth_write(dir, current_depth);
- if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK))
- clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
+ if (inode && is_inode_flag_set(inode, FI_INC_LINK))
+ clear_inode_flag(inode, FI_INC_LINK);
}
-static int room_for_filename(struct f2fs_dentry_block *dentry_blk, int slots)
+int room_for_filename(const void *bitmap, int slots, int max_slots)
{
int bit_start = 0;
int zero_start, zero_end;
next:
- zero_start = find_next_zero_bit_le(&dentry_blk->dentry_bitmap,
- NR_DENTRY_IN_BLOCK,
- bit_start);
- if (zero_start >= NR_DENTRY_IN_BLOCK)
- return NR_DENTRY_IN_BLOCK;
+ zero_start = find_next_zero_bit_le(bitmap, max_slots, bit_start);
+ if (zero_start >= max_slots)
+ return max_slots;
- zero_end = find_next_bit_le(&dentry_blk->dentry_bitmap,
- NR_DENTRY_IN_BLOCK,
- zero_start);
+ zero_end = find_next_bit_le(bitmap, max_slots, zero_start);
if (zero_end - zero_start >= slots)
return zero_start;
bit_start = zero_end + 1;
- if (zero_end + 1 >= NR_DENTRY_IN_BLOCK)
- return NR_DENTRY_IN_BLOCK;
+ if (zero_end + 1 >= max_slots)
+ return max_slots;
goto next;
}
-/*
- * Caller should grab and release a rwsem by calling f2fs_lock_op() and
- * f2fs_unlock_op().
- */
-int __f2fs_add_link(struct inode *dir, const struct qstr *name,
- struct inode *inode)
+void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d,
+ const struct qstr *name, f2fs_hash_t name_hash,
+ unsigned int bit_pos)
+{
+ struct f2fs_dir_entry *de;
+ int slots = GET_DENTRY_SLOTS(name->len);
+ int i;
+
+ de = &d->dentry[bit_pos];
+ de->hash_code = name_hash;
+ de->name_len = cpu_to_le16(name->len);
+ memcpy(d->filename[bit_pos], name->name, name->len);
+ de->ino = cpu_to_le32(ino);
+ set_de_type(de, mode);
+ for (i = 0; i < slots; i++) {
+ __set_bit_le(bit_pos + i, (void *)d->bitmap);
+ /* avoid wrong garbage data for readdir */
+ if (i)
+ (de + i)->name_len = 0;
+ }
+}
+
+int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name,
+ const struct qstr *orig_name,
+ struct inode *inode, nid_t ino, umode_t mode)
{
unsigned int bit_pos;
unsigned int level;
unsigned int current_depth;
unsigned long bidx, block;
f2fs_hash_t dentry_hash;
- struct f2fs_dir_entry *de;
unsigned int nbucket, nblock;
- size_t namelen = name->len;
struct page *dentry_page = NULL;
struct f2fs_dentry_block *dentry_blk = NULL;
- int slots = GET_DENTRY_SLOTS(namelen);
- struct page *page;
- int err = 0;
- int i;
+ struct f2fs_dentry_ptr d;
+ struct page *page = NULL;
+ int slots, err = 0;
- dentry_hash = f2fs_dentry_hash(name);
level = 0;
+ slots = GET_DENTRY_SLOTS(new_name->len);
+ dentry_hash = f2fs_dentry_hash(new_name, NULL);
+
current_depth = F2FS_I(dir)->i_current_depth;
if (F2FS_I(dir)->chash == dentry_hash) {
level = F2FS_I(dir)->clevel;
@@ -472,6 +520,12 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name,
}
start:
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) {
+ f2fs_show_injection_info(FAULT_DIR_DEPTH);
+ return -ENOSPC;
+ }
+#endif
if (unlikely(current_depth == MAX_DIR_HASH_DEPTH))
return -ENOSPC;
@@ -491,7 +545,8 @@ start:
return PTR_ERR(dentry_page);
dentry_blk = kmap(dentry_page);
- bit_pos = room_for_filename(dentry_blk, slots);
+ bit_pos = room_for_filename(&dentry_blk->dentry_bitmap,
+ slots, NR_DENTRY_IN_BLOCK);
if (bit_pos < NR_DENTRY_IN_BLOCK)
goto add_dentry;
@@ -503,39 +558,96 @@ start:
++level;
goto start;
add_dentry:
- f2fs_wait_on_page_writeback(dentry_page, DATA);
+ f2fs_wait_on_page_writeback(dentry_page, DATA, true);
- down_write(&F2FS_I(inode)->i_sem);
- page = init_inode_metadata(inode, dir, name);
- if (IS_ERR(page)) {
- err = PTR_ERR(page);
- goto fail;
+ if (inode) {
+ down_write(&F2FS_I(inode)->i_sem);
+ page = init_inode_metadata(inode, dir, new_name,
+ orig_name, NULL);
+ if (IS_ERR(page)) {
+ err = PTR_ERR(page);
+ goto fail;
+ }
}
- de = &dentry_blk->dentry[bit_pos];
- de->hash_code = dentry_hash;
- de->name_len = cpu_to_le16(namelen);
- memcpy(dentry_blk->filename[bit_pos], name->name, name->len);
- de->ino = cpu_to_le32(inode->i_ino);
- set_de_type(de, inode);
- for (i = 0; i < slots; i++)
- test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
+
+ make_dentry_ptr_block(NULL, &d, dentry_blk);
+ f2fs_update_dentry(ino, mode, &d, new_name, dentry_hash, bit_pos);
+
set_page_dirty(dentry_page);
- /* we don't need to mark_inode_dirty now */
- F2FS_I(inode)->i_pino = dir->i_ino;
- update_inode(inode, page);
- f2fs_put_page(page, 1);
+ if (inode) {
+ f2fs_i_pino_write(inode, dir->i_ino);
+ f2fs_put_page(page, 1);
+ }
update_parent_metadata(dir, inode, current_depth);
fail:
- up_write(&F2FS_I(inode)->i_sem);
+ if (inode)
+ up_write(&F2FS_I(inode)->i_sem);
- if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) {
- update_inode_page(dir);
- clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
- }
kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
+
+ return err;
+}
+
+int __f2fs_do_add_link(struct inode *dir, struct fscrypt_name *fname,
+ struct inode *inode, nid_t ino, umode_t mode)
+{
+ struct qstr new_name;
+ int err = -EAGAIN;
+
+ new_name.name = fname_name(fname);
+ new_name.len = fname_len(fname);
+
+ if (f2fs_has_inline_dentry(dir))
+ err = f2fs_add_inline_entry(dir, &new_name, fname->usr_fname,
+ inode, ino, mode);
+ if (err == -EAGAIN)
+ err = f2fs_add_regular_entry(dir, &new_name, fname->usr_fname,
+ inode, ino, mode);
+
+ f2fs_update_time(F2FS_I_SB(dir), REQ_TIME);
+ return err;
+}
+
+/*
+ * Caller should grab and release a rwsem by calling f2fs_lock_op() and
+ * f2fs_unlock_op().
+ */
+int __f2fs_add_link(struct inode *dir, const struct qstr *name,
+ struct inode *inode, nid_t ino, umode_t mode)
+{
+ struct fscrypt_name fname;
+ struct page *page = NULL;
+ struct f2fs_dir_entry *de = NULL;
+ int err;
+
+ err = fscrypt_setup_filename(dir, name, 0, &fname);
+ if (err)
+ return err;
+
+ /*
+ * An immature stakable filesystem shows a race condition between lookup
+ * and create. If we have same task when doing lookup and create, it's
+ * definitely fine as expected by VFS normally. Otherwise, let's just
+ * verify on-disk dentry one more time, which guarantees filesystem
+ * consistency more.
+ */
+ if (current != F2FS_I(dir)->task) {
+ de = __f2fs_find_entry(dir, &fname, &page);
+ F2FS_I(dir)->task = NULL;
+ }
+ if (de) {
+ f2fs_dentry_kunmap(dir, page);
+ f2fs_put_page(page, 0);
+ err = -EEXIST;
+ } else if (IS_ERR(page)) {
+ err = PTR_ERR(page);
+ } else {
+ err = __f2fs_do_add_link(dir, &fname, inode, ino, mode);
+ }
+ fscrypt_free_filename(&fname);
return err;
}
@@ -545,41 +657,67 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir)
int err = 0;
down_write(&F2FS_I(inode)->i_sem);
- page = init_inode_metadata(inode, dir, NULL);
+ page = init_inode_metadata(inode, dir, NULL, NULL, NULL);
if (IS_ERR(page)) {
err = PTR_ERR(page);
goto fail;
}
- /* we don't need to mark_inode_dirty now */
- update_inode(inode, page);
f2fs_put_page(page, 1);
- clear_inode_flag(F2FS_I(inode), FI_NEW_INODE);
+ clear_inode_flag(inode, FI_NEW_INODE);
fail:
up_write(&F2FS_I(inode)->i_sem);
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
return err;
}
+void f2fs_drop_nlink(struct inode *dir, struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+
+ down_write(&F2FS_I(inode)->i_sem);
+
+ if (S_ISDIR(inode->i_mode))
+ f2fs_i_links_write(dir, false);
+ inode->i_ctime = current_time(inode);
+
+ f2fs_i_links_write(inode, false);
+ if (S_ISDIR(inode->i_mode)) {
+ f2fs_i_links_write(inode, false);
+ f2fs_i_size_write(inode, 0);
+ }
+ up_write(&F2FS_I(inode)->i_sem);
+
+ if (inode->i_nlink == 0)
+ add_orphan_inode(inode);
+ else
+ release_orphan_inode(sbi);
+}
+
/*
* It only removes the dentry from the dentry page, corresponding name
* entry in name page does not need to be touched during deletion.
*/
void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
- struct inode *inode)
+ struct inode *dir, struct inode *inode)
{
struct f2fs_dentry_block *dentry_blk;
unsigned int bit_pos;
- struct inode *dir = page->mapping->host;
int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len));
int i;
+ f2fs_update_time(F2FS_I_SB(dir), REQ_TIME);
+
+ if (f2fs_has_inline_dentry(dir))
+ return f2fs_delete_inline_entry(dentry, page, dir, inode);
+
lock_page(page);
- f2fs_wait_on_page_writeback(page, DATA);
+ f2fs_wait_on_page_writeback(page, DATA, true);
dentry_blk = page_address(page);
bit_pos = dentry - dentry_blk->dentry;
for (i = 0; i < slots; i++)
- test_and_clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
+ __clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
/* Let's check and deallocate this dentry page */
bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
@@ -588,37 +726,19 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
kunmap(page); /* kunmap - pair of f2fs_find_entry */
set_page_dirty(page);
- dir->i_ctime = dir->i_mtime = CURRENT_TIME;
-
- if (inode) {
- struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
-
- down_write(&F2FS_I(inode)->i_sem);
-
- if (S_ISDIR(inode->i_mode)) {
- drop_nlink(dir);
- update_inode_page(dir);
- }
- inode->i_ctime = CURRENT_TIME;
- drop_nlink(inode);
- if (S_ISDIR(inode->i_mode)) {
- drop_nlink(inode);
- i_size_write(inode, 0);
- }
- up_write(&F2FS_I(inode)->i_sem);
- update_inode_page(inode);
+ dir->i_ctime = dir->i_mtime = current_time(dir);
+ f2fs_mark_inode_dirty_sync(dir, false);
- if (inode->i_nlink == 0)
- add_orphan_inode(sbi, inode->i_ino);
- else
- release_orphan_inode(sbi);
- }
+ if (inode)
+ f2fs_drop_nlink(dir, inode);
- if (bit_pos == NR_DENTRY_IN_BLOCK) {
- truncate_hole(dir, page->index, page->index + 1);
+ if (bit_pos == NR_DENTRY_IN_BLOCK &&
+ !truncate_hole(dir, page->index, page->index + 1)) {
clear_page_dirty_for_io(page);
+ ClearPagePrivate(page);
ClearPageUptodate(page);
inode_dec_dirty_pages(dir);
+ remove_dirty_inode(dir);
}
f2fs_put_page(page, 1);
}
@@ -628,11 +748,14 @@ bool f2fs_empty_dir(struct inode *dir)
unsigned long bidx;
struct page *dentry_page;
unsigned int bit_pos;
- struct f2fs_dentry_block *dentry_blk;
+ struct f2fs_dentry_block *dentry_blk;
unsigned long nblock = dir_blocks(dir);
+ if (f2fs_has_inline_dentry(dir))
+ return f2fs_empty_inline_dir(dir);
+
for (bidx = 0; bidx < nblock; bidx++) {
- dentry_page = get_lock_data_page(dir, bidx);
+ dentry_page = get_lock_data_page(dir, bidx, false);
if (IS_ERR(dentry_page)) {
if (PTR_ERR(dentry_page) == -ENOENT)
continue;
@@ -640,7 +763,6 @@ bool f2fs_empty_dir(struct inode *dir)
return false;
}
-
dentry_blk = kmap_atomic(dentry_page);
if (bidx == 0)
bit_pos = 2;
@@ -659,19 +781,83 @@ bool f2fs_empty_dir(struct inode *dir)
return true;
}
+int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
+ unsigned int start_pos, struct fscrypt_str *fstr)
+{
+ unsigned char d_type = DT_UNKNOWN;
+ unsigned int bit_pos;
+ struct f2fs_dir_entry *de = NULL;
+ struct fscrypt_str de_name = FSTR_INIT(NULL, 0);
+
+ bit_pos = ((unsigned long)ctx->pos % d->max);
+
+ while (bit_pos < d->max) {
+ bit_pos = find_next_bit_le(d->bitmap, d->max, bit_pos);
+ if (bit_pos >= d->max)
+ break;
+
+ de = &d->dentry[bit_pos];
+ if (de->name_len == 0) {
+ bit_pos++;
+ ctx->pos = start_pos + bit_pos;
+ continue;
+ }
+
+ d_type = get_de_type(de);
+
+ de_name.name = d->filename[bit_pos];
+ de_name.len = le16_to_cpu(de->name_len);
+
+ if (f2fs_encrypted_inode(d->inode)) {
+ int save_len = fstr->len;
+ int err;
+
+ err = fscrypt_fname_disk_to_usr(d->inode,
+ (u32)de->hash_code, 0,
+ &de_name, fstr);
+ if (err)
+ return err;
+
+ de_name = *fstr;
+ fstr->len = save_len;
+ }
+
+ if (!dir_emit(ctx, de_name.name, de_name.len,
+ le32_to_cpu(de->ino), d_type))
+ return 1;
+
+ bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
+ ctx->pos = start_pos + bit_pos;
+ }
+ return 0;
+}
+
static int f2fs_readdir(struct file *file, struct dir_context *ctx)
{
struct inode *inode = file_inode(file);
unsigned long npages = dir_blocks(inode);
- unsigned int bit_pos = 0;
struct f2fs_dentry_block *dentry_blk = NULL;
- struct f2fs_dir_entry *de = NULL;
struct page *dentry_page = NULL;
struct file_ra_state *ra = &file->f_ra;
unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK);
- unsigned char d_type = DT_UNKNOWN;
+ struct f2fs_dentry_ptr d;
+ struct fscrypt_str fstr = FSTR_INIT(NULL, 0);
+ int err = 0;
+
+ if (f2fs_encrypted_inode(inode)) {
+ err = fscrypt_get_encryption_info(inode);
+ if (err && err != -ENOKEY)
+ return err;
- bit_pos = ((unsigned long)ctx->pos % NR_DENTRY_IN_BLOCK);
+ err = fscrypt_fname_alloc_buffer(inode, F2FS_NAME_LEN, &fstr);
+ if (err < 0)
+ return err;
+ }
+
+ if (f2fs_has_inline_dentry(inode)) {
+ err = f2fs_read_inline_dir(file, ctx, &fstr);
+ goto out;
+ }
/* readahead for multi pages of dir */
if (npages - n > 1 && !ra_has_index(ra, n))
@@ -679,44 +865,42 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES));
for (; n < npages; n++) {
- dentry_page = get_lock_data_page(inode, n);
- if (IS_ERR(dentry_page))
- continue;
+ dentry_page = get_lock_data_page(inode, n, false);
+ if (IS_ERR(dentry_page)) {
+ err = PTR_ERR(dentry_page);
+ if (err == -ENOENT) {
+ err = 0;
+ continue;
+ } else {
+ goto out;
+ }
+ }
dentry_blk = kmap(dentry_page);
- while (bit_pos < NR_DENTRY_IN_BLOCK) {
- bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
- NR_DENTRY_IN_BLOCK,
- bit_pos);
- if (bit_pos >= NR_DENTRY_IN_BLOCK)
- break;
- de = &dentry_blk->dentry[bit_pos];
- if (de->file_type < F2FS_FT_MAX)
- d_type = f2fs_filetype_table[de->file_type];
- else
- d_type = DT_UNKNOWN;
- if (!dir_emit(ctx,
- dentry_blk->filename[bit_pos],
- le16_to_cpu(de->name_len),
- le32_to_cpu(de->ino), d_type))
- goto stop;
+ make_dentry_ptr_block(inode, &d, dentry_blk);
- bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
- ctx->pos = n * NR_DENTRY_IN_BLOCK + bit_pos;
+ err = f2fs_fill_dentries(ctx, &d,
+ n * NR_DENTRY_IN_BLOCK, &fstr);
+ if (err) {
+ kunmap(dentry_page);
+ f2fs_put_page(dentry_page, 1);
+ break;
}
- bit_pos = 0;
+
ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK;
kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
- dentry_page = NULL;
- }
-stop:
- if (dentry_page && !IS_ERR(dentry_page)) {
- kunmap(dentry_page);
- f2fs_put_page(dentry_page, 1);
}
+out:
+ fscrypt_fname_free_buffer(&fstr);
+ return err < 0 ? err : 0;
+}
+static int f2fs_dir_open(struct inode *inode, struct file *filp)
+{
+ if (f2fs_encrypted_inode(inode))
+ return fscrypt_get_encryption_info(inode) ? -EACCES : 0;
return 0;
}
@@ -725,5 +909,9 @@ const struct file_operations f2fs_dir_operations = {
.read = generic_read_dir,
.iterate = f2fs_readdir,
.fsync = f2fs_sync_file,
+ .open = f2fs_dir_open,
.unlocked_ioctl = f2fs_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = f2fs_compat_ioctl,
+#endif
};
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
new file mode 100644
index 000000000000..2f98d7039701
--- /dev/null
+++ b/fs/f2fs/extent_cache.c
@@ -0,0 +1,809 @@
+/*
+ * f2fs extent cache support
+ *
+ * Copyright (c) 2015 Motorola Mobility
+ * Copyright (c) 2015 Samsung Electronics
+ * Authors: Jaegeuk Kim <jaegeuk@kernel.org>
+ * Chao Yu <chao2.yu@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+
+#include "f2fs.h"
+#include "node.h"
+#include <trace/events/f2fs.h>
+
+static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re,
+ unsigned int ofs)
+{
+ if (cached_re) {
+ if (cached_re->ofs <= ofs &&
+ cached_re->ofs + cached_re->len > ofs) {
+ return cached_re;
+ }
+ }
+ return NULL;
+}
+
+static struct rb_entry *__lookup_rb_tree_slow(struct rb_root *root,
+ unsigned int ofs)
+{
+ struct rb_node *node = root->rb_node;
+ struct rb_entry *re;
+
+ while (node) {
+ re = rb_entry(node, struct rb_entry, rb_node);
+
+ if (ofs < re->ofs)
+ node = node->rb_left;
+ else if (ofs >= re->ofs + re->len)
+ node = node->rb_right;
+ else
+ return re;
+ }
+ return NULL;
+}
+
+struct rb_entry *__lookup_rb_tree(struct rb_root *root,
+ struct rb_entry *cached_re, unsigned int ofs)
+{
+ struct rb_entry *re;
+
+ re = __lookup_rb_tree_fast(cached_re, ofs);
+ if (!re)
+ return __lookup_rb_tree_slow(root, ofs);
+
+ return re;
+}
+
+struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi,
+ struct rb_root *root, struct rb_node **parent,
+ unsigned int ofs)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_entry *re;
+
+ while (*p) {
+ *parent = *p;
+ re = rb_entry(*parent, struct rb_entry, rb_node);
+
+ if (ofs < re->ofs)
+ p = &(*p)->rb_left;
+ else if (ofs >= re->ofs + re->len)
+ p = &(*p)->rb_right;
+ else
+ f2fs_bug_on(sbi, 1);
+ }
+
+ return p;
+}
+
+/*
+ * lookup rb entry in position of @ofs in rb-tree,
+ * if hit, return the entry, otherwise, return NULL
+ * @prev_ex: extent before ofs
+ * @next_ex: extent after ofs
+ * @insert_p: insert point for new extent at ofs
+ * in order to simpfy the insertion after.
+ * tree must stay unchanged between lookup and insertion.
+ */
+struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root,
+ struct rb_entry *cached_re,
+ unsigned int ofs,
+ struct rb_entry **prev_entry,
+ struct rb_entry **next_entry,
+ struct rb_node ***insert_p,
+ struct rb_node **insert_parent,
+ bool force)
+{
+ struct rb_node **pnode = &root->rb_node;
+ struct rb_node *parent = NULL, *tmp_node;
+ struct rb_entry *re = cached_re;
+
+ *insert_p = NULL;
+ *insert_parent = NULL;
+ *prev_entry = NULL;
+ *next_entry = NULL;
+
+ if (RB_EMPTY_ROOT(root))
+ return NULL;
+
+ if (re) {
+ if (re->ofs <= ofs && re->ofs + re->len > ofs)
+ goto lookup_neighbors;
+ }
+
+ while (*pnode) {
+ parent = *pnode;
+ re = rb_entry(*pnode, struct rb_entry, rb_node);
+
+ if (ofs < re->ofs)
+ pnode = &(*pnode)->rb_left;
+ else if (ofs >= re->ofs + re->len)
+ pnode = &(*pnode)->rb_right;
+ else
+ goto lookup_neighbors;
+ }
+
+ *insert_p = pnode;
+ *insert_parent = parent;
+
+ re = rb_entry(parent, struct rb_entry, rb_node);
+ tmp_node = parent;
+ if (parent && ofs > re->ofs)
+ tmp_node = rb_next(parent);
+ *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node);
+
+ tmp_node = parent;
+ if (parent && ofs < re->ofs)
+ tmp_node = rb_prev(parent);
+ *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node);
+ return NULL;
+
+lookup_neighbors:
+ if (ofs == re->ofs || force) {
+ /* lookup prev node for merging backward later */
+ tmp_node = rb_prev(&re->rb_node);
+ *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node);
+ }
+ if (ofs == re->ofs + re->len - 1 || force) {
+ /* lookup next node for merging frontward later */
+ tmp_node = rb_next(&re->rb_node);
+ *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node);
+ }
+ return re;
+}
+
+bool __check_rb_tree_consistence(struct f2fs_sb_info *sbi,
+ struct rb_root *root)
+{
+#ifdef CONFIG_F2FS_CHECK_FS
+ struct rb_node *cur = rb_first(root), *next;
+ struct rb_entry *cur_re, *next_re;
+
+ if (!cur)
+ return true;
+
+ while (cur) {
+ next = rb_next(cur);
+ if (!next)
+ return true;
+
+ cur_re = rb_entry(cur, struct rb_entry, rb_node);
+ next_re = rb_entry(next, struct rb_entry, rb_node);
+
+ if (cur_re->ofs + cur_re->len > next_re->ofs) {
+ f2fs_msg(sbi->sb, KERN_INFO, "inconsistent rbtree, "
+ "cur(%u, %u) next(%u, %u)",
+ cur_re->ofs, cur_re->len,
+ next_re->ofs, next_re->len);
+ return false;
+ }
+
+ cur = next;
+ }
+#endif
+ return true;
+}
+
+static struct kmem_cache *extent_tree_slab;
+static struct kmem_cache *extent_node_slab;
+
+static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi,
+ struct extent_tree *et, struct extent_info *ei,
+ struct rb_node *parent, struct rb_node **p)
+{
+ struct extent_node *en;
+
+ en = kmem_cache_alloc(extent_node_slab, GFP_ATOMIC);
+ if (!en)
+ return NULL;
+
+ en->ei = *ei;
+ INIT_LIST_HEAD(&en->list);
+ en->et = et;
+
+ rb_link_node(&en->rb_node, parent, p);
+ rb_insert_color(&en->rb_node, &et->root);
+ atomic_inc(&et->node_cnt);
+ atomic_inc(&sbi->total_ext_node);
+ return en;
+}
+
+static void __detach_extent_node(struct f2fs_sb_info *sbi,
+ struct extent_tree *et, struct extent_node *en)
+{
+ rb_erase(&en->rb_node, &et->root);
+ atomic_dec(&et->node_cnt);
+ atomic_dec(&sbi->total_ext_node);
+
+ if (et->cached_en == en)
+ et->cached_en = NULL;
+ kmem_cache_free(extent_node_slab, en);
+}
+
+/*
+ * Flow to release an extent_node:
+ * 1. list_del_init
+ * 2. __detach_extent_node
+ * 3. kmem_cache_free.
+ */
+static void __release_extent_node(struct f2fs_sb_info *sbi,
+ struct extent_tree *et, struct extent_node *en)
+{
+ spin_lock(&sbi->extent_lock);
+ f2fs_bug_on(sbi, list_empty(&en->list));
+ list_del_init(&en->list);
+ spin_unlock(&sbi->extent_lock);
+
+ __detach_extent_node(sbi, et, en);
+}
+
+static struct extent_tree *__grab_extent_tree(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct extent_tree *et;
+ nid_t ino = inode->i_ino;
+
+ mutex_lock(&sbi->extent_tree_lock);
+ et = radix_tree_lookup(&sbi->extent_tree_root, ino);
+ if (!et) {
+ et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS);
+ f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et);
+ memset(et, 0, sizeof(struct extent_tree));
+ et->ino = ino;
+ et->root = RB_ROOT;
+ et->cached_en = NULL;
+ rwlock_init(&et->lock);
+ INIT_LIST_HEAD(&et->list);
+ atomic_set(&et->node_cnt, 0);
+ atomic_inc(&sbi->total_ext_tree);
+ } else {
+ atomic_dec(&sbi->total_zombie_tree);
+ list_del_init(&et->list);
+ }
+ mutex_unlock(&sbi->extent_tree_lock);
+
+ /* never died until evict_inode */
+ F2FS_I(inode)->extent_tree = et;
+
+ return et;
+}
+
+static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi,
+ struct extent_tree *et, struct extent_info *ei)
+{
+ struct rb_node **p = &et->root.rb_node;
+ struct extent_node *en;
+
+ en = __attach_extent_node(sbi, et, ei, NULL, p);
+ if (!en)
+ return NULL;
+
+ et->largest = en->ei;
+ et->cached_en = en;
+ return en;
+}
+
+static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi,
+ struct extent_tree *et)
+{
+ struct rb_node *node, *next;
+ struct extent_node *en;
+ unsigned int count = atomic_read(&et->node_cnt);
+
+ node = rb_first(&et->root);
+ while (node) {
+ next = rb_next(node);
+ en = rb_entry(node, struct extent_node, rb_node);
+ __release_extent_node(sbi, et, en);
+ node = next;
+ }
+
+ return count - atomic_read(&et->node_cnt);
+}
+
+static void __drop_largest_extent(struct inode *inode,
+ pgoff_t fofs, unsigned int len)
+{
+ struct extent_info *largest = &F2FS_I(inode)->extent_tree->largest;
+
+ if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs) {
+ largest->len = 0;
+ f2fs_mark_inode_dirty_sync(inode, true);
+ }
+}
+
+/* return true, if inode page is changed */
+bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct extent_tree *et;
+ struct extent_node *en;
+ struct extent_info ei;
+
+ if (!f2fs_may_extent_tree(inode)) {
+ /* drop largest extent */
+ if (i_ext && i_ext->len) {
+ i_ext->len = 0;
+ return true;
+ }
+ return false;
+ }
+
+ et = __grab_extent_tree(inode);
+
+ if (!i_ext || !i_ext->len)
+ return false;
+
+ get_extent_info(&ei, i_ext);
+
+ write_lock(&et->lock);
+ if (atomic_read(&et->node_cnt))
+ goto out;
+
+ en = __init_extent_tree(sbi, et, &ei);
+ if (en) {
+ spin_lock(&sbi->extent_lock);
+ list_add_tail(&en->list, &sbi->extent_list);
+ spin_unlock(&sbi->extent_lock);
+ }
+out:
+ write_unlock(&et->lock);
+ return false;
+}
+
+static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
+ struct extent_info *ei)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct extent_tree *et = F2FS_I(inode)->extent_tree;
+ struct extent_node *en;
+ bool ret = false;
+
+ f2fs_bug_on(sbi, !et);
+
+ trace_f2fs_lookup_extent_tree_start(inode, pgofs);
+
+ read_lock(&et->lock);
+
+ if (et->largest.fofs <= pgofs &&
+ et->largest.fofs + et->largest.len > pgofs) {
+ *ei = et->largest;
+ ret = true;
+ stat_inc_largest_node_hit(sbi);
+ goto out;
+ }
+
+ en = (struct extent_node *)__lookup_rb_tree(&et->root,
+ (struct rb_entry *)et->cached_en, pgofs);
+ if (!en)
+ goto out;
+
+ if (en == et->cached_en)
+ stat_inc_cached_node_hit(sbi);
+ else
+ stat_inc_rbtree_node_hit(sbi);
+
+ *ei = en->ei;
+ spin_lock(&sbi->extent_lock);
+ if (!list_empty(&en->list)) {
+ list_move_tail(&en->list, &sbi->extent_list);
+ et->cached_en = en;
+ }
+ spin_unlock(&sbi->extent_lock);
+ ret = true;
+out:
+ stat_inc_total_hit(sbi);
+ read_unlock(&et->lock);
+
+ trace_f2fs_lookup_extent_tree_end(inode, pgofs, ei);
+ return ret;
+}
+
+static struct extent_node *__try_merge_extent_node(struct inode *inode,
+ struct extent_tree *et, struct extent_info *ei,
+ struct extent_node *prev_ex,
+ struct extent_node *next_ex)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct extent_node *en = NULL;
+
+ if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei)) {
+ prev_ex->ei.len += ei->len;
+ ei = &prev_ex->ei;
+ en = prev_ex;
+ }
+
+ if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) {
+ next_ex->ei.fofs = ei->fofs;
+ next_ex->ei.blk = ei->blk;
+ next_ex->ei.len += ei->len;
+ if (en)
+ __release_extent_node(sbi, et, prev_ex);
+
+ en = next_ex;
+ }
+
+ if (!en)
+ return NULL;
+
+ __try_update_largest_extent(inode, et, en);
+
+ spin_lock(&sbi->extent_lock);
+ if (!list_empty(&en->list)) {
+ list_move_tail(&en->list, &sbi->extent_list);
+ et->cached_en = en;
+ }
+ spin_unlock(&sbi->extent_lock);
+ return en;
+}
+
+static struct extent_node *__insert_extent_tree(struct inode *inode,
+ struct extent_tree *et, struct extent_info *ei,
+ struct rb_node **insert_p,
+ struct rb_node *insert_parent)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct rb_node **p = &et->root.rb_node;
+ struct rb_node *parent = NULL;
+ struct extent_node *en = NULL;
+
+ if (insert_p && insert_parent) {
+ parent = insert_parent;
+ p = insert_p;
+ goto do_insert;
+ }
+
+ p = __lookup_rb_tree_for_insert(sbi, &et->root, &parent, ei->fofs);
+do_insert:
+ en = __attach_extent_node(sbi, et, ei, parent, p);
+ if (!en)
+ return NULL;
+
+ __try_update_largest_extent(inode, et, en);
+
+ /* update in global extent list */
+ spin_lock(&sbi->extent_lock);
+ list_add_tail(&en->list, &sbi->extent_list);
+ et->cached_en = en;
+ spin_unlock(&sbi->extent_lock);
+ return en;
+}
+
+static void f2fs_update_extent_tree_range(struct inode *inode,
+ pgoff_t fofs, block_t blkaddr, unsigned int len)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct extent_tree *et = F2FS_I(inode)->extent_tree;
+ struct extent_node *en = NULL, *en1 = NULL;
+ struct extent_node *prev_en = NULL, *next_en = NULL;
+ struct extent_info ei, dei, prev;
+ struct rb_node **insert_p = NULL, *insert_parent = NULL;
+ unsigned int end = fofs + len;
+ unsigned int pos = (unsigned int)fofs;
+
+ if (!et)
+ return;
+
+ trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len);
+
+ write_lock(&et->lock);
+
+ if (is_inode_flag_set(inode, FI_NO_EXTENT)) {
+ write_unlock(&et->lock);
+ return;
+ }
+
+ prev = et->largest;
+ dei.len = 0;
+
+ /*
+ * drop largest extent before lookup, in case it's already
+ * been shrunk from extent tree
+ */
+ __drop_largest_extent(inode, fofs, len);
+
+ /* 1. lookup first extent node in range [fofs, fofs + len - 1] */
+ en = (struct extent_node *)__lookup_rb_tree_ret(&et->root,
+ (struct rb_entry *)et->cached_en, fofs,
+ (struct rb_entry **)&prev_en,
+ (struct rb_entry **)&next_en,
+ &insert_p, &insert_parent, false);
+ if (!en)
+ en = next_en;
+
+ /* 2. invlidate all extent nodes in range [fofs, fofs + len - 1] */
+ while (en && en->ei.fofs < end) {
+ unsigned int org_end;
+ int parts = 0; /* # of parts current extent split into */
+
+ next_en = en1 = NULL;
+
+ dei = en->ei;
+ org_end = dei.fofs + dei.len;
+ f2fs_bug_on(sbi, pos >= org_end);
+
+ if (pos > dei.fofs && pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) {
+ en->ei.len = pos - en->ei.fofs;
+ prev_en = en;
+ parts = 1;
+ }
+
+ if (end < org_end && org_end - end >= F2FS_MIN_EXTENT_LEN) {
+ if (parts) {
+ set_extent_info(&ei, end,
+ end - dei.fofs + dei.blk,
+ org_end - end);
+ en1 = __insert_extent_tree(inode, et, &ei,
+ NULL, NULL);
+ next_en = en1;
+ } else {
+ en->ei.fofs = end;
+ en->ei.blk += end - dei.fofs;
+ en->ei.len -= end - dei.fofs;
+ next_en = en;
+ }
+ parts++;
+ }
+
+ if (!next_en) {
+ struct rb_node *node = rb_next(&en->rb_node);
+
+ next_en = rb_entry_safe(node, struct extent_node,
+ rb_node);
+ }
+
+ if (parts)
+ __try_update_largest_extent(inode, et, en);
+ else
+ __release_extent_node(sbi, et, en);
+
+ /*
+ * if original extent is split into zero or two parts, extent
+ * tree has been altered by deletion or insertion, therefore
+ * invalidate pointers regard to tree.
+ */
+ if (parts != 1) {
+ insert_p = NULL;
+ insert_parent = NULL;
+ }
+ en = next_en;
+ }
+
+ /* 3. update extent in extent cache */
+ if (blkaddr) {
+
+ set_extent_info(&ei, fofs, blkaddr, len);
+ if (!__try_merge_extent_node(inode, et, &ei, prev_en, next_en))
+ __insert_extent_tree(inode, et, &ei,
+ insert_p, insert_parent);
+
+ /* give up extent_cache, if split and small updates happen */
+ if (dei.len >= 1 &&
+ prev.len < F2FS_MIN_EXTENT_LEN &&
+ et->largest.len < F2FS_MIN_EXTENT_LEN) {
+ __drop_largest_extent(inode, 0, UINT_MAX);
+ set_inode_flag(inode, FI_NO_EXTENT);
+ }
+ }
+
+ if (is_inode_flag_set(inode, FI_NO_EXTENT))
+ __free_extent_tree(sbi, et);
+
+ write_unlock(&et->lock);
+}
+
+unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
+{
+ struct extent_tree *et, *next;
+ struct extent_node *en;
+ unsigned int node_cnt = 0, tree_cnt = 0;
+ int remained;
+
+ if (!test_opt(sbi, EXTENT_CACHE))
+ return 0;
+
+ if (!atomic_read(&sbi->total_zombie_tree))
+ goto free_node;
+
+ if (!mutex_trylock(&sbi->extent_tree_lock))
+ goto out;
+
+ /* 1. remove unreferenced extent tree */
+ list_for_each_entry_safe(et, next, &sbi->zombie_list, list) {
+ if (atomic_read(&et->node_cnt)) {
+ write_lock(&et->lock);
+ node_cnt += __free_extent_tree(sbi, et);
+ write_unlock(&et->lock);
+ }
+ f2fs_bug_on(sbi, atomic_read(&et->node_cnt));
+ list_del_init(&et->list);
+ radix_tree_delete(&sbi->extent_tree_root, et->ino);
+ kmem_cache_free(extent_tree_slab, et);
+ atomic_dec(&sbi->total_ext_tree);
+ atomic_dec(&sbi->total_zombie_tree);
+ tree_cnt++;
+
+ if (node_cnt + tree_cnt >= nr_shrink)
+ goto unlock_out;
+ cond_resched();
+ }
+ mutex_unlock(&sbi->extent_tree_lock);
+
+free_node:
+ /* 2. remove LRU extent entries */
+ if (!mutex_trylock(&sbi->extent_tree_lock))
+ goto out;
+
+ remained = nr_shrink - (node_cnt + tree_cnt);
+
+ spin_lock(&sbi->extent_lock);
+ for (; remained > 0; remained--) {
+ if (list_empty(&sbi->extent_list))
+ break;
+ en = list_first_entry(&sbi->extent_list,
+ struct extent_node, list);
+ et = en->et;
+ if (!write_trylock(&et->lock)) {
+ /* refresh this extent node's position in extent list */
+ list_move_tail(&en->list, &sbi->extent_list);
+ continue;
+ }
+
+ list_del_init(&en->list);
+ spin_unlock(&sbi->extent_lock);
+
+ __detach_extent_node(sbi, et, en);
+
+ write_unlock(&et->lock);
+ node_cnt++;
+ spin_lock(&sbi->extent_lock);
+ }
+ spin_unlock(&sbi->extent_lock);
+
+unlock_out:
+ mutex_unlock(&sbi->extent_tree_lock);
+out:
+ trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt);
+
+ return node_cnt + tree_cnt;
+}
+
+unsigned int f2fs_destroy_extent_node(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct extent_tree *et = F2FS_I(inode)->extent_tree;
+ unsigned int node_cnt = 0;
+
+ if (!et || !atomic_read(&et->node_cnt))
+ return 0;
+
+ write_lock(&et->lock);
+ node_cnt = __free_extent_tree(sbi, et);
+ write_unlock(&et->lock);
+
+ return node_cnt;
+}
+
+void f2fs_drop_extent_tree(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct extent_tree *et = F2FS_I(inode)->extent_tree;
+
+ set_inode_flag(inode, FI_NO_EXTENT);
+
+ write_lock(&et->lock);
+ __free_extent_tree(sbi, et);
+ __drop_largest_extent(inode, 0, UINT_MAX);
+ write_unlock(&et->lock);
+}
+
+void f2fs_destroy_extent_tree(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct extent_tree *et = F2FS_I(inode)->extent_tree;
+ unsigned int node_cnt = 0;
+
+ if (!et)
+ return;
+
+ if (inode->i_nlink && !is_bad_inode(inode) &&
+ atomic_read(&et->node_cnt)) {
+ mutex_lock(&sbi->extent_tree_lock);
+ list_add_tail(&et->list, &sbi->zombie_list);
+ atomic_inc(&sbi->total_zombie_tree);
+ mutex_unlock(&sbi->extent_tree_lock);
+ return;
+ }
+
+ /* free all extent info belong to this extent tree */
+ node_cnt = f2fs_destroy_extent_node(inode);
+
+ /* delete extent tree entry in radix tree */
+ mutex_lock(&sbi->extent_tree_lock);
+ f2fs_bug_on(sbi, atomic_read(&et->node_cnt));
+ radix_tree_delete(&sbi->extent_tree_root, inode->i_ino);
+ kmem_cache_free(extent_tree_slab, et);
+ atomic_dec(&sbi->total_ext_tree);
+ mutex_unlock(&sbi->extent_tree_lock);
+
+ F2FS_I(inode)->extent_tree = NULL;
+
+ trace_f2fs_destroy_extent_tree(inode, node_cnt);
+}
+
+bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs,
+ struct extent_info *ei)
+{
+ if (!f2fs_may_extent_tree(inode))
+ return false;
+
+ return f2fs_lookup_extent_tree(inode, pgofs, ei);
+}
+
+void f2fs_update_extent_cache(struct dnode_of_data *dn)
+{
+ pgoff_t fofs;
+ block_t blkaddr;
+
+ if (!f2fs_may_extent_tree(dn->inode))
+ return;
+
+ if (dn->data_blkaddr == NEW_ADDR)
+ blkaddr = NULL_ADDR;
+ else
+ blkaddr = dn->data_blkaddr;
+
+ fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) +
+ dn->ofs_in_node;
+ f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, 1);
+}
+
+void f2fs_update_extent_cache_range(struct dnode_of_data *dn,
+ pgoff_t fofs, block_t blkaddr, unsigned int len)
+
+{
+ if (!f2fs_may_extent_tree(dn->inode))
+ return;
+
+ f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len);
+}
+
+void init_extent_cache_info(struct f2fs_sb_info *sbi)
+{
+ INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO);
+ mutex_init(&sbi->extent_tree_lock);
+ INIT_LIST_HEAD(&sbi->extent_list);
+ spin_lock_init(&sbi->extent_lock);
+ atomic_set(&sbi->total_ext_tree, 0);
+ INIT_LIST_HEAD(&sbi->zombie_list);
+ atomic_set(&sbi->total_zombie_tree, 0);
+ atomic_set(&sbi->total_ext_node, 0);
+}
+
+int __init create_extent_cache(void)
+{
+ extent_tree_slab = f2fs_kmem_cache_create("f2fs_extent_tree",
+ sizeof(struct extent_tree));
+ if (!extent_tree_slab)
+ return -ENOMEM;
+ extent_node_slab = f2fs_kmem_cache_create("f2fs_extent_node",
+ sizeof(struct extent_node));
+ if (!extent_node_slab) {
+ kmem_cache_destroy(extent_tree_slab);
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+void destroy_extent_cache(void)
+{
+ kmem_cache_destroy(extent_node_slab);
+ kmem_cache_destroy(extent_tree_slab);
+}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 8171e80b2ee9..208689f0d0d1 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -19,6 +19,16 @@
#include <linux/magic.h>
#include <linux/kobject.h>
#include <linux/sched.h>
+#include <linux/vmalloc.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#include <linux/fscrypt_supp.h>
+#else
+#include <linux/fscrypt_notsupp.h>
+#endif
+#include <crypto/hash.h>
+#include <linux/writeback.h>
#ifdef CONFIG_F2FS_CHECK_FS
#define f2fs_bug_on(sbi, condition) BUG_ON(condition)
@@ -28,10 +38,34 @@
do { \
if (unlikely(condition)) { \
WARN_ON(1); \
- sbi->need_fsck = true; \
+ set_sbi_flag(sbi, SBI_NEED_FSCK); \
} \
} while (0)
-#define f2fs_down_write(x, y) down_write(x)
+#endif
+
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+enum {
+ FAULT_KMALLOC,
+ FAULT_PAGE_ALLOC,
+ FAULT_ALLOC_NID,
+ FAULT_ORPHAN,
+ FAULT_BLOCK,
+ FAULT_DIR_DEPTH,
+ FAULT_EVICT_INODE,
+ FAULT_TRUNCATE,
+ FAULT_IO,
+ FAULT_CHECKPOINT,
+ FAULT_MAX,
+};
+
+struct f2fs_fault_info {
+ atomic_t inject_ops;
+ unsigned int inject_rate;
+ unsigned int inject_type;
+};
+
+extern char *fault_name[FAULT_MAX];
+#define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type)))
#endif
/*
@@ -46,12 +80,20 @@
#define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040
#define F2FS_MOUNT_INLINE_XATTR 0x00000080
#define F2FS_MOUNT_INLINE_DATA 0x00000100
-#define F2FS_MOUNT_FLUSH_MERGE 0x00000200
-#define F2FS_MOUNT_NOBARRIER 0x00000400
-
-#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
-#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option)
-#define test_opt(sbi, option) (sbi->mount_opt.opt & F2FS_MOUNT_##option)
+#define F2FS_MOUNT_INLINE_DENTRY 0x00000200
+#define F2FS_MOUNT_FLUSH_MERGE 0x00000400
+#define F2FS_MOUNT_NOBARRIER 0x00000800
+#define F2FS_MOUNT_FASTBOOT 0x00001000
+#define F2FS_MOUNT_EXTENT_CACHE 0x00002000
+#define F2FS_MOUNT_FORCE_FG_GC 0x00004000
+#define F2FS_MOUNT_DATA_FLUSH 0x00008000
+#define F2FS_MOUNT_FAULT_INJECTION 0x00010000
+#define F2FS_MOUNT_ADAPTIVE 0x00020000
+#define F2FS_MOUNT_LFS 0x00040000
+
+#define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option)
+#define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option)
+#define test_opt(sbi, option) ((sbi)->mount_opt.opt & F2FS_MOUNT_##option)
#define ver_after(a, b) (typecheck(unsigned long long, a) && \
typecheck(unsigned long long, b) && \
@@ -67,25 +109,111 @@ struct f2fs_mount_info {
unsigned int opt;
};
-#define CRCPOLY_LE 0xedb88320
+#define F2FS_FEATURE_ENCRYPT 0x0001
+#define F2FS_FEATURE_BLKZONED 0x0002
+
+#define F2FS_HAS_FEATURE(sb, mask) \
+ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0)
+#define F2FS_SET_FEATURE(sb, mask) \
+ (F2FS_SB(sb)->raw_super->feature |= cpu_to_le32(mask))
+#define F2FS_CLEAR_FEATURE(sb, mask) \
+ (F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask))
+
+/* bio stuffs */
+#define REQ_OP_READ READ
+#define REQ_OP_WRITE WRITE
+#define bio_op(bio) ((bio)->bi_rw & 1)
-static inline __u32 f2fs_crc32(void *buf, size_t len)
+static inline void bio_set_op_attrs(struct bio *bio, unsigned op,
+ unsigned op_flags)
{
- unsigned char *p = (unsigned char *)buf;
- __u32 crc = F2FS_SUPER_MAGIC;
- int i;
+ bio->bi_rw = op | op_flags;
+}
- while (len--) {
- crc ^= *p++;
- for (i = 0; i < 8; i++)
- crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_LE : 0);
- }
- return crc;
+static inline int wbc_to_write_flags(struct writeback_control *wbc)
+{
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ return REQ_SYNC;
+ return 0;
+}
+
+static inline void inode_lock(struct inode *inode)
+{
+ mutex_lock(&inode->i_mutex);
+}
+
+static inline int inode_trylock(struct inode *inode)
+{
+ return mutex_trylock(&inode->i_mutex);
+}
+
+static inline void inode_unlock(struct inode *inode)
+{
+ mutex_unlock(&inode->i_mutex);
+}
+
+#define rb_entry_safe(ptr, type, member) \
+ ({ typeof(ptr) ____ptr = (ptr); \
+ ____ptr ? rb_entry(____ptr, type, member) : NULL; \
+ })
+
+#define list_last_entry(ptr, type, member) \
+ list_entry((ptr)->prev, type, member)
+
+#define list_first_entry(ptr, type, member) \
+ list_entry((ptr)->next, type, member)
+
+/**
+ * wq_has_sleeper - check if there are any waiting processes
+ * @wq: wait queue head
+ *
+ * Returns true if wq has waiting processes
+ *
+ * Please refer to the comment for waitqueue_active.
+ */
+static inline bool wq_has_sleeper(wait_queue_head_t *wq)
+{
+ /*
+ * We need to be sure we are in sync with the
+ * add_wait_queue modifications to the wait queue.
+ *
+ * This memory barrier should be paired with one on the
+ * waiting side.
+ */
+ smp_mb();
+ return waitqueue_active(wq);
}
-static inline bool f2fs_crc_valid(__u32 blk_crc, void *buf, size_t buf_size)
+static inline struct dentry *file_dentry(const struct file *file)
{
- return f2fs_crc32(buf, buf_size) == blk_crc;
+ return file->f_path.dentry;
+}
+
+static inline void inode_nohighmem(struct inode *inode)
+{
+ mapping_set_gfp_mask(inode->i_mapping, GFP_USER);
+}
+
+/**
+ * current_time - Return FS time
+ * @inode: inode.
+ *
+ * Return the current time truncated to the time granularity supported by
+ * the fs.
+ *
+ * Note that inode and inode->sb cannot be NULL.
+ * Otherwise, the function warns and returns time without truncation.
+ */
+static inline struct timespec current_time(struct inode *inode)
+{
+ struct timespec now = current_kernel_time();
+
+ if (unlikely(!inode->i_sb)) {
+ WARN(1, "current_time() called with uninitialized super_block in the inode");
+ return now;
+ }
+
+ return timespec_trunc(now, inode->i_sb->s_time_gran);
}
/*
@@ -96,11 +224,22 @@ enum {
SIT_BITMAP
};
-enum {
- CP_UMOUNT,
- CP_SYNC,
- CP_DISCARD,
-};
+#define CP_UMOUNT 0x00000001
+#define CP_FASTBOOT 0x00000002
+#define CP_SYNC 0x00000004
+#define CP_RECOVERY 0x00000008
+#define CP_DISCARD 0x00000010
+#define CP_TRIMMED 0x00000020
+
+#define DEF_BATCHED_TRIM_SECTIONS 2048
+#define BATCHED_TRIM_SEGMENTS(sbi) \
+ (GET_SEG_FROM_SEC(sbi, SM_I(sbi)->trim_sections))
+#define BATCHED_TRIM_BLOCKS(sbi) \
+ (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg)
+#define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi)
+#define DISCARD_ISSUE_RATE 8
+#define DEF_CP_INTERVAL 60 /* 60 secs */
+#define DEF_IDLE_INTERVAL 5 /* 5 secs */
struct cp_control {
int reason;
@@ -134,17 +273,69 @@ struct ino_entry {
nid_t ino; /* inode number */
};
-/* for the list of directory inodes */
-struct dir_inode_entry {
+/* for the list of inodes to be GCed */
+struct inode_entry {
struct list_head list; /* list head */
struct inode *inode; /* vfs inode pointer */
};
-/* for the list of blockaddresses to be discarded */
+/* for the bitmap indicate blocks to be discarded */
struct discard_entry {
struct list_head list; /* list head */
- block_t blkaddr; /* block address to be discarded */
- int len; /* # of consecutive blocks of the discard */
+ block_t start_blkaddr; /* start blockaddr of current segment */
+ unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */
+};
+
+/* max discard pend list number */
+#define MAX_PLIST_NUM 512
+#define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? \
+ (MAX_PLIST_NUM - 1) : (blk_num - 1))
+
+enum {
+ D_PREP,
+ D_SUBMIT,
+ D_DONE,
+};
+
+struct discard_info {
+ block_t lstart; /* logical start address */
+ block_t len; /* length */
+ block_t start; /* actual start address in dev */
+};
+
+struct discard_cmd {
+ struct rb_node rb_node; /* rb node located in rb-tree */
+ union {
+ struct {
+ block_t lstart; /* logical start address */
+ block_t len; /* length */
+ block_t start; /* actual start address in dev */
+ };
+ struct discard_info di; /* discard info */
+
+ };
+ struct list_head list; /* command list */
+ struct completion wait; /* compleation */
+ struct block_device *bdev; /* bdev */
+ unsigned short ref; /* reference count */
+ unsigned char state; /* state */
+ int error; /* bio error */
+};
+
+struct discard_cmd_control {
+ struct task_struct *f2fs_issue_discard; /* discard thread */
+ struct list_head entry_list; /* 4KB discard entry list */
+ struct list_head pend_list[MAX_PLIST_NUM];/* store pending entries */
+ struct list_head wait_list; /* store on-flushing entries */
+ wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */
+ struct mutex cmd_lock;
+ unsigned int nr_discards; /* # of discards in the list */
+ unsigned int max_discards; /* max. discards to be issued */
+ unsigned int undiscard_blks; /* # of undiscard blocks */
+ atomic_t issued_discard; /* # of issued discard */
+ atomic_t issing_discard; /* # of issing discard */
+ atomic_t discard_cmd_cnt; /* # of cached cmd count */
+ struct rb_root root; /* root of discard rb-tree */
};
/* for the list of fsync inodes, used only during recovery */
@@ -153,40 +344,41 @@ struct fsync_inode_entry {
struct inode *inode; /* vfs inode pointer */
block_t blkaddr; /* block address locating the last fsync */
block_t last_dentry; /* block address locating the last dentry */
- block_t last_inode; /* block address locating the last inode */
};
-#define nats_in_cursum(sum) (le16_to_cpu(sum->n_nats))
-#define sits_in_cursum(sum) (le16_to_cpu(sum->n_sits))
+#define nats_in_cursum(jnl) (le16_to_cpu((jnl)->n_nats))
+#define sits_in_cursum(jnl) (le16_to_cpu((jnl)->n_sits))
-#define nat_in_journal(sum, i) (sum->nat_j.entries[i].ne)
-#define nid_in_journal(sum, i) (sum->nat_j.entries[i].nid)
-#define sit_in_journal(sum, i) (sum->sit_j.entries[i].se)
-#define segno_in_journal(sum, i) (sum->sit_j.entries[i].segno)
+#define nat_in_journal(jnl, i) ((jnl)->nat_j.entries[i].ne)
+#define nid_in_journal(jnl, i) ((jnl)->nat_j.entries[i].nid)
+#define sit_in_journal(jnl, i) ((jnl)->sit_j.entries[i].se)
+#define segno_in_journal(jnl, i) ((jnl)->sit_j.entries[i].segno)
-#define MAX_NAT_JENTRIES(sum) (NAT_JOURNAL_ENTRIES - nats_in_cursum(sum))
-#define MAX_SIT_JENTRIES(sum) (SIT_JOURNAL_ENTRIES - sits_in_cursum(sum))
+#define MAX_NAT_JENTRIES(jnl) (NAT_JOURNAL_ENTRIES - nats_in_cursum(jnl))
+#define MAX_SIT_JENTRIES(jnl) (SIT_JOURNAL_ENTRIES - sits_in_cursum(jnl))
-static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i)
+static inline int update_nats_in_cursum(struct f2fs_journal *journal, int i)
{
- int before = nats_in_cursum(rs);
- rs->n_nats = cpu_to_le16(before + i);
+ int before = nats_in_cursum(journal);
+
+ journal->n_nats = cpu_to_le16(before + i);
return before;
}
-static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i)
+static inline int update_sits_in_cursum(struct f2fs_journal *journal, int i)
{
- int before = sits_in_cursum(rs);
- rs->n_sits = cpu_to_le16(before + i);
+ int before = sits_in_cursum(journal);
+
+ journal->n_sits = cpu_to_le16(before + i);
return before;
}
-static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
- int type)
+static inline bool __has_cursum_space(struct f2fs_journal *journal,
+ int size, int type)
{
if (type == NAT_JOURNAL)
- return size <= MAX_NAT_JENTRIES(sum);
- return size <= MAX_SIT_JENTRIES(sum);
+ return size <= MAX_NAT_JENTRIES(journal);
+ return size <= MAX_SIT_JENTRIES(journal);
}
/*
@@ -194,23 +386,95 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
*/
#define F2FS_IOC_GETFLAGS FS_IOC_GETFLAGS
#define F2FS_IOC_SETFLAGS FS_IOC_SETFLAGS
+#define F2FS_IOC_GETVERSION FS_IOC_GETVERSION
#define F2FS_IOCTL_MAGIC 0xf5
#define F2FS_IOC_START_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 1)
#define F2FS_IOC_COMMIT_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 2)
#define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3)
+#define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4)
+#define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5)
+#define F2FS_IOC_GARBAGE_COLLECT _IOW(F2FS_IOCTL_MAGIC, 6, __u32)
+#define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7)
+#define F2FS_IOC_DEFRAGMENT _IOWR(F2FS_IOCTL_MAGIC, 8, \
+ struct f2fs_defragment)
+#define F2FS_IOC_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \
+ struct f2fs_move_range)
+#define F2FS_IOC_FLUSH_DEVICE _IOW(F2FS_IOCTL_MAGIC, 10, \
+ struct f2fs_flush_device)
+
+#define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY
+#define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY
+#define F2FS_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT
+
+/*
+ * should be same as XFS_IOC_GOINGDOWN.
+ * Flags for going down operation used by FS_IOC_GOINGDOWN
+ */
+#define F2FS_IOC_SHUTDOWN _IOR('X', 125, __u32) /* Shutdown */
+#define F2FS_GOING_DOWN_FULLSYNC 0x0 /* going down with full sync */
+#define F2FS_GOING_DOWN_METASYNC 0x1 /* going down with metadata */
+#define F2FS_GOING_DOWN_NOSYNC 0x2 /* going down */
+#define F2FS_GOING_DOWN_METAFLUSH 0x3 /* going down with meta flush */
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
* ioctl commands in 32 bit emulation
*/
-#define F2FS_IOC32_GETFLAGS FS_IOC32_GETFLAGS
-#define F2FS_IOC32_SETFLAGS FS_IOC32_SETFLAGS
+#define F2FS_IOC32_GETFLAGS FS_IOC32_GETFLAGS
+#define F2FS_IOC32_SETFLAGS FS_IOC32_SETFLAGS
+#define F2FS_IOC32_GETVERSION FS_IOC32_GETVERSION
#endif
+struct f2fs_defragment {
+ u64 start;
+ u64 len;
+};
+
+struct f2fs_move_range {
+ u32 dst_fd; /* destination fd */
+ u64 pos_in; /* start position in src_fd */
+ u64 pos_out; /* start position in dst_fd */
+ u64 len; /* size to move */
+};
+
+struct f2fs_flush_device {
+ u32 dev_num; /* device number to flush */
+ u32 segments; /* # of segments to flush */
+};
+
/*
* For INODE and NODE manager
*/
+/* for directory operations */
+struct f2fs_dentry_ptr {
+ struct inode *inode;
+ const void *bitmap;
+ struct f2fs_dir_entry *dentry;
+ __u8 (*filename)[F2FS_SLOT_LEN];
+ int max;
+};
+
+static inline void make_dentry_ptr_block(struct inode *inode,
+ struct f2fs_dentry_ptr *d, struct f2fs_dentry_block *t)
+{
+ d->inode = inode;
+ d->max = NR_DENTRY_IN_BLOCK;
+ d->bitmap = &t->dentry_bitmap;
+ d->dentry = t->dentry;
+ d->filename = t->filename;
+}
+
+static inline void make_dentry_ptr_inline(struct inode *inode,
+ struct f2fs_dentry_ptr *d, struct f2fs_inline_dentry *t)
+{
+ d->inode = inode;
+ d->max = NR_INLINE_DENTRY;
+ d->bitmap = &t->dentry_bitmap;
+ d->dentry = t->dentry;
+ d->filename = t->filename;
+}
+
/*
* XATTR_NODE_OFFSET stores xattrs to one node block per file keeping -1
* as its node offset to distinguish from index node blocks.
@@ -227,25 +491,105 @@ enum {
*/
};
-#define F2FS_LINK_MAX 32000 /* maximum link count per file */
+#define F2FS_LINK_MAX 0xffffffff /* maximum link count per file */
#define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */
+/* vector size for gang look-up from extent cache that consists of radix tree */
+#define EXT_TREE_VEC_SIZE 64
+
/* for in-memory extent cache entry */
-#define F2FS_MIN_EXTENT_LEN 16 /* minimum extent length */
+#define F2FS_MIN_EXTENT_LEN 64 /* minimum extent length */
+
+/* number of extent info in extent cache we try to shrink */
+#define EXTENT_CACHE_SHRINK_NUMBER 128
+
+struct rb_entry {
+ struct rb_node rb_node; /* rb node located in rb-tree */
+ unsigned int ofs; /* start offset of the entry */
+ unsigned int len; /* length of the entry */
+};
struct extent_info {
- rwlock_t ext_lock; /* rwlock for consistency */
- unsigned int fofs; /* start offset in a file */
- u32 blk_addr; /* start block address of the extent */
- unsigned int len; /* length of the extent */
+ unsigned int fofs; /* start offset in a file */
+ unsigned int len; /* length of the extent */
+ u32 blk; /* start block address of the extent */
};
+struct extent_node {
+ struct rb_node rb_node;
+ union {
+ struct {
+ unsigned int fofs;
+ unsigned int len;
+ u32 blk;
+ };
+ struct extent_info ei; /* extent info */
+
+ };
+ struct list_head list; /* node in global extent list of sbi */
+ struct extent_tree *et; /* extent tree pointer */
+};
+
+struct extent_tree {
+ nid_t ino; /* inode number */
+ struct rb_root root; /* root of extent info rb-tree */
+ struct extent_node *cached_en; /* recently accessed extent node */
+ struct extent_info largest; /* largested extent info */
+ struct list_head list; /* to be used by sbi->zombie_list */
+ rwlock_t lock; /* protect extent info rb-tree */
+ atomic_t node_cnt; /* # of extent node in rb-tree*/
+};
+
+/*
+ * This structure is taken from ext4_map_blocks.
+ *
+ * Note that, however, f2fs uses NEW and MAPPED flags for f2fs_map_blocks().
+ */
+#define F2FS_MAP_NEW (1 << BH_New)
+#define F2FS_MAP_MAPPED (1 << BH_Mapped)
+#define F2FS_MAP_UNWRITTEN (1 << BH_Unwritten)
+#define F2FS_MAP_FLAGS (F2FS_MAP_NEW | F2FS_MAP_MAPPED |\
+ F2FS_MAP_UNWRITTEN)
+
+struct f2fs_map_blocks {
+ block_t m_pblk;
+ block_t m_lblk;
+ unsigned int m_len;
+ unsigned int m_flags;
+ pgoff_t *m_next_pgofs; /* point next possible non-hole pgofs */
+};
+
+/* for flag in get_data_block */
+#define F2FS_GET_BLOCK_READ 0
+#define F2FS_GET_BLOCK_DIO 1
+#define F2FS_GET_BLOCK_FIEMAP 2
+#define F2FS_GET_BLOCK_BMAP 3
+#define F2FS_GET_BLOCK_PRE_DIO 4
+#define F2FS_GET_BLOCK_PRE_AIO 5
+
/*
* i_advise uses FADVISE_XXX_BIT. We can add additional hints later.
*/
#define FADVISE_COLD_BIT 0x01
#define FADVISE_LOST_PINO_BIT 0x02
+#define FADVISE_ENCRYPT_BIT 0x04
+#define FADVISE_ENC_NAME_BIT 0x08
+#define FADVISE_KEEP_SIZE_BIT 0x10
+
+#define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT)
+#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT)
+#define file_set_cold(inode) set_file(inode, FADVISE_COLD_BIT)
+#define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT)
+#define file_clear_cold(inode) clear_file(inode, FADVISE_COLD_BIT)
+#define file_got_pino(inode) clear_file(inode, FADVISE_LOST_PINO_BIT)
+#define file_is_encrypt(inode) is_file(inode, FADVISE_ENCRYPT_BIT)
+#define file_set_encrypt(inode) set_file(inode, FADVISE_ENCRYPT_BIT)
+#define file_clear_encrypt(inode) clear_file(inode, FADVISE_ENCRYPT_BIT)
+#define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT)
+#define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT)
+#define file_keep_isize(inode) is_file(inode, FADVISE_KEEP_SIZE_BIT)
+#define file_set_keep_isize(inode) set_file(inode, FADVISE_KEEP_SIZE_BIT)
#define DEF_DIR_LEVEL 0
@@ -264,59 +608,133 @@ struct f2fs_inode_info {
atomic_t dirty_pages; /* # of dirty pages */
f2fs_hash_t chash; /* hash value of given file name */
unsigned int clevel; /* maximum level of given file name */
+ struct task_struct *task; /* lookup and create consistency */
nid_t i_xattr_nid; /* node id that contains xattrs */
- unsigned long long xattr_ver; /* cp version of xattr modification */
- struct extent_info ext; /* in-memory extent cache entry */
- struct dir_inode_entry *dirty_dir; /* the pointer of dirty dir */
+ loff_t last_disk_size; /* lastly written file size */
+ struct list_head dirty_list; /* dirty list for dirs and files */
+ struct list_head gdirty_list; /* linked in global dirty list */
struct list_head inmem_pages; /* inmemory pages managed by f2fs */
struct mutex inmem_lock; /* lock for inmemory pages */
+ struct extent_tree *extent_tree; /* cached extent_tree entry */
+ struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */
};
static inline void get_extent_info(struct extent_info *ext,
- struct f2fs_extent i_ext)
+ struct f2fs_extent *i_ext)
{
- write_lock(&ext->ext_lock);
- ext->fofs = le32_to_cpu(i_ext.fofs);
- ext->blk_addr = le32_to_cpu(i_ext.blk_addr);
- ext->len = le32_to_cpu(i_ext.len);
- write_unlock(&ext->ext_lock);
+ ext->fofs = le32_to_cpu(i_ext->fofs);
+ ext->blk = le32_to_cpu(i_ext->blk);
+ ext->len = le32_to_cpu(i_ext->len);
}
static inline void set_raw_extent(struct extent_info *ext,
struct f2fs_extent *i_ext)
{
- read_lock(&ext->ext_lock);
i_ext->fofs = cpu_to_le32(ext->fofs);
- i_ext->blk_addr = cpu_to_le32(ext->blk_addr);
+ i_ext->blk = cpu_to_le32(ext->blk);
i_ext->len = cpu_to_le32(ext->len);
- read_unlock(&ext->ext_lock);
}
+static inline void set_extent_info(struct extent_info *ei, unsigned int fofs,
+ u32 blk, unsigned int len)
+{
+ ei->fofs = fofs;
+ ei->blk = blk;
+ ei->len = len;
+}
+
+static inline bool __is_discard_mergeable(struct discard_info *back,
+ struct discard_info *front)
+{
+ return back->lstart + back->len == front->lstart;
+}
+
+static inline bool __is_discard_back_mergeable(struct discard_info *cur,
+ struct discard_info *back)
+{
+ return __is_discard_mergeable(back, cur);
+}
+
+static inline bool __is_discard_front_mergeable(struct discard_info *cur,
+ struct discard_info *front)
+{
+ return __is_discard_mergeable(cur, front);
+}
+
+static inline bool __is_extent_mergeable(struct extent_info *back,
+ struct extent_info *front)
+{
+ return (back->fofs + back->len == front->fofs &&
+ back->blk + back->len == front->blk);
+}
+
+static inline bool __is_back_mergeable(struct extent_info *cur,
+ struct extent_info *back)
+{
+ return __is_extent_mergeable(back, cur);
+}
+
+static inline bool __is_front_mergeable(struct extent_info *cur,
+ struct extent_info *front)
+{
+ return __is_extent_mergeable(cur, front);
+}
+
+extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync);
+static inline void __try_update_largest_extent(struct inode *inode,
+ struct extent_tree *et, struct extent_node *en)
+{
+ if (en->ei.len > et->largest.len) {
+ et->largest = en->ei;
+ f2fs_mark_inode_dirty_sync(inode, true);
+ }
+}
+
+enum nid_list {
+ FREE_NID_LIST,
+ ALLOC_NID_LIST,
+ MAX_NID_LIST,
+};
+
struct f2fs_nm_info {
block_t nat_blkaddr; /* base disk address of NAT */
nid_t max_nid; /* maximum possible node ids */
- nid_t available_nids; /* maximum available node ids */
+ nid_t available_nids; /* # of available node ids */
nid_t next_scan_nid; /* the next nid to be scanned */
unsigned int ram_thresh; /* control the memory footprint */
+ unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */
+ unsigned int dirty_nats_ratio; /* control dirty nats ratio threshold */
/* NAT cache management */
struct radix_tree_root nat_root;/* root of the nat entry cache */
struct radix_tree_root nat_set_root;/* root of the nat set cache */
- rwlock_t nat_tree_lock; /* protect nat_tree_lock */
+ struct rw_semaphore nat_tree_lock; /* protect nat_tree_lock */
struct list_head nat_entries; /* cached nat entry list (clean) */
unsigned int nat_cnt; /* the # of cached nat entries */
unsigned int dirty_nat_cnt; /* total num of nat entries in set */
+ unsigned int nat_blocks; /* # of nat blocks */
/* free node ids management */
struct radix_tree_root free_nid_root;/* root of the free_nid cache */
- struct list_head free_nid_list; /* a list for free nids */
- spinlock_t free_nid_list_lock; /* protect free nid list */
- unsigned int fcnt; /* the number of free node id */
+ struct list_head nid_list[MAX_NID_LIST];/* lists for free nids */
+ unsigned int nid_cnt[MAX_NID_LIST]; /* the number of free node id */
+ spinlock_t nid_list_lock; /* protect nid lists ops */
struct mutex build_lock; /* lock for build free nids */
+ unsigned char (*free_nid_bitmap)[NAT_ENTRY_BITMAP_SIZE];
+ unsigned char *nat_block_bitmap;
+ unsigned short *free_nid_count; /* free nid count of NAT block */
/* for checkpoint */
char *nat_bitmap; /* NAT bitmap pointer */
+
+ unsigned int nat_bits_blocks; /* # of nat bits blocks */
+ unsigned char *nat_bits; /* NAT bits blocks */
+ unsigned char *full_nat_bits; /* full NAT pages */
+ unsigned char *empty_nat_bits; /* empty NAT pages */
+#ifdef CONFIG_F2FS_CHECK_FS
+ char *nat_bitmap_mir; /* NAT bitmap mirror */
+#endif
int bitmap_size; /* bitmap size */
};
@@ -332,6 +750,9 @@ struct dnode_of_data {
nid_t nid; /* node id of the direct node block */
unsigned int ofs_in_node; /* data offset in the node page */
bool inode_page_locked; /* inode page is locked or not */
+ bool node_changed; /* is node block changed */
+ char cur_level; /* level of hole node page */
+ char max_level; /* level of current page located */
block_t data_blkaddr; /* block address of the node block */
};
@@ -369,7 +790,7 @@ enum {
CURSEG_HOT_NODE, /* direct node blocks of directory files */
CURSEG_WARM_NODE, /* direct node blocks of normal files */
CURSEG_COLD_NODE, /* indirect node blocks */
- NO_CHECK_TYPE
+ NO_CHECK_TYPE,
};
struct flush_cmd {
@@ -381,6 +802,8 @@ struct flush_cmd {
struct flush_cmd_control {
struct task_struct *f2fs_issue_flush; /* flush thread */
wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */
+ atomic_t issued_flush; /* # of issued flushes */
+ atomic_t issing_flush; /* # of issing flushes */
struct llist_head issue_list; /* list for command issue */
struct llist_node *dispatch_list; /* list for command dispatch */
};
@@ -403,20 +826,21 @@ struct f2fs_sm_info {
/* a threshold to reclaim prefree segments */
unsigned int rec_prefree_segments;
- /* for small discard management */
- struct list_head discard_list; /* 4KB discard list */
- int nr_discards; /* # of discards in the list */
- int max_discards; /* max. discards to be issued */
+ /* for batched trimming */
+ unsigned int trim_sections; /* # of sections to trim */
struct list_head sit_entry_set; /* sit entry set list */
unsigned int ipu_policy; /* in-place-update policy */
unsigned int min_ipu_util; /* in-place-update threshold */
unsigned int min_fsync_blocks; /* threshold for fsync */
+ unsigned int min_hot_blocks; /* threshold for hot block allocation */
/* for flush command control */
- struct flush_cmd_control *cmd_control_info;
+ struct flush_cmd_control *fcc_info;
+ /* for discard command control */
+ struct discard_cmd_control *dcc_info;
};
/*
@@ -428,11 +852,16 @@ struct f2fs_sm_info {
* f2fs monitors the number of several block types such as on-writeback,
* dirty dentry blocks, dirty node blocks, and dirty meta blocks.
*/
+#define WB_DATA_TYPE(p) (__is_cp_guaranteed(p) ? F2FS_WB_CP_DATA : F2FS_WB_DATA)
enum count_type {
- F2FS_WRITEBACK,
F2FS_DIRTY_DENTS,
+ F2FS_DIRTY_DATA,
F2FS_DIRTY_NODES,
F2FS_DIRTY_META,
+ F2FS_INMEM_PAGES,
+ F2FS_DIRTY_IMETA,
+ F2FS_WB_CP_DATA,
+ F2FS_WB_DATA,
NR_COUNT_TYPE,
};
@@ -454,14 +883,28 @@ enum page_type {
META,
NR_PAGE_TYPE,
META_FLUSH,
+ INMEM, /* the below types are used by tracepoints only. */
+ INMEM_DROP,
+ INMEM_INVALIDATE,
+ INMEM_REVOKE,
+ IPU,
+ OPU,
};
struct f2fs_io_info {
+ struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */
enum page_type type; /* contains DATA/NODE/META/META_FLUSH */
- int rw; /* contains R/RS/W/WS with REQ_META/REQ_PRIO */
+ int op; /* contains REQ_OP_ */
+ int op_flags; /* req_flag_bits */
+ block_t new_blkaddr; /* new block address to be written */
+ block_t old_blkaddr; /* old block address before Cow */
+ struct page *page; /* page to be written */
+ struct page *encrypted_page; /* encrypted page */
+ bool submitted; /* indicate IO submission */
+ bool need_lock; /* indicate we need to lock cp_rwsem */
};
-#define is_read_io(rw) (((rw) & 1) == READ)
+#define is_read_io(rw) ((rw) == READ)
struct f2fs_bio_info {
struct f2fs_sb_info *sbi; /* f2fs superblock */
struct bio *bio; /* bios to merge */
@@ -470,13 +913,62 @@ struct f2fs_bio_info {
struct rw_semaphore io_rwsem; /* blocking op for bio */
};
+#define FDEV(i) (sbi->devs[i])
+#define RDEV(i) (raw_super->devs[i])
+struct f2fs_dev_info {
+ struct block_device *bdev;
+ char path[MAX_PATH_LEN];
+ unsigned int total_segments;
+ block_t start_blk;
+ block_t end_blk;
+#ifdef CONFIG_BLK_DEV_ZONED
+ unsigned int nr_blkz; /* Total number of zones */
+ u8 *blkz_type; /* Array of zones type */
+#endif
+};
+
+enum inode_type {
+ DIR_INODE, /* for dirty dir inode */
+ FILE_INODE, /* for dirty regular/symlink inode */
+ DIRTY_META, /* for all dirtied inode metadata */
+ NR_INODE_TYPE,
+};
+
+/* for inner inode cache management */
+struct inode_management {
+ struct radix_tree_root ino_root; /* ino entry array */
+ spinlock_t ino_lock; /* for ino entry lock */
+ struct list_head ino_list; /* inode list head */
+ unsigned long ino_num; /* number of entries */
+};
+
+/* For s_flag in struct f2fs_sb_info */
+enum {
+ SBI_IS_DIRTY, /* dirty flag for checkpoint */
+ SBI_IS_CLOSE, /* specify unmounting */
+ SBI_NEED_FSCK, /* need fsck.f2fs to fix */
+ SBI_POR_DOING, /* recovery is doing or not */
+ SBI_NEED_SB_WRITE, /* need to recover superblock */
+ SBI_NEED_CP, /* need to checkpoint */
+};
+
+enum {
+ CP_TIME,
+ REQ_TIME,
+ MAX_TIME,
+};
+
struct f2fs_sb_info {
struct super_block *sb; /* pointer to VFS super block */
struct proc_dir_entry *s_proc; /* proc entry */
- struct buffer_head *raw_super_buf; /* buffer head of raw sb */
struct f2fs_super_block *raw_super; /* raw super block pointer */
- int s_dirty; /* dirty flag for checkpoint */
- bool need_fsck; /* need fsck.f2fs to fix */
+ int valid_super_block; /* valid super block no */
+ unsigned long s_flag; /* flags for sbi */
+
+#ifdef CONFIG_BLK_DEV_ZONED
+ unsigned int blocks_per_blkz; /* F2FS blocks per zone */
+ unsigned int log_blocks_per_blkz; /* log2 F2FS blocks per zone */
+#endif
/* for node-related operations */
struct f2fs_nm_info *nm_info; /* node manager */
@@ -488,30 +980,41 @@ struct f2fs_sb_info {
/* for bio operations */
struct f2fs_bio_info read_io; /* for read bios */
struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */
- struct completion *wait_io; /* for completion bios */
+ struct mutex wio_mutex[NODE + 1]; /* bio ordering for NODE/DATA */
+ int write_io_size_bits; /* Write IO size bits */
+ mempool_t *write_io_dummy; /* Dummy pages */
/* for checkpoint */
struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */
+ int cur_cp_pack; /* remain current cp pack */
+ spinlock_t cp_lock; /* for flag in ckpt */
struct inode *meta_inode; /* cache meta blocks */
struct mutex cp_mutex; /* checkpoint procedure lock */
struct rw_semaphore cp_rwsem; /* blocking FS operations */
struct rw_semaphore node_write; /* locking node writes */
- struct mutex writepages; /* mutex for writepages() */
- bool por_doing; /* recovery is doing or not */
+ struct rw_semaphore node_change; /* locking node change */
wait_queue_head_t cp_wait;
+ unsigned long last_time[MAX_TIME]; /* to store time in jiffies */
+ long interval_time[MAX_TIME]; /* to store thresholds */
- /* for inode management */
- struct radix_tree_root ino_root[MAX_INO_ENTRY]; /* ino entry array */
- spinlock_t ino_lock[MAX_INO_ENTRY]; /* for ino entry lock */
- struct list_head ino_list[MAX_INO_ENTRY]; /* inode list head */
+ struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */
/* for orphan inode, use 0'th array */
- unsigned int n_orphans; /* # of orphan inodes */
unsigned int max_orphans; /* max orphan inodes */
- /* for directory inode management */
- struct list_head dir_inode_list; /* dir inode list */
- spinlock_t dir_inode_lock; /* for dir inode list lock */
+ /* for inode management */
+ struct list_head inode_list[NR_INODE_TYPE]; /* dirty inode list */
+ spinlock_t inode_lock[NR_INODE_TYPE]; /* for dirty inode list lock */
+
+ /* for extent tree cache */
+ struct radix_tree_root extent_tree_root;/* cache extent cache entries */
+ struct mutex extent_tree_lock; /* locking extent radix tree */
+ struct list_head extent_list; /* lru list for shrinker */
+ spinlock_t extent_lock; /* locking extent lru list */
+ atomic_t total_ext_tree; /* extent tree count */
+ struct list_head zombie_list; /* extent zombie tree list */
+ atomic_t total_zombie_tree; /* extent zombie tree count */
+ atomic_t total_ext_node; /* extent info count */
/* basic filesystem units */
unsigned int log_sectors_per_block; /* log2 sectors per block */
@@ -527,16 +1030,26 @@ struct f2fs_sb_info {
unsigned int total_sections; /* total section count */
unsigned int total_node_count; /* total node block count */
unsigned int total_valid_node_count; /* valid node block count */
- unsigned int total_valid_inode_count; /* valid inode count */
+ loff_t max_file_blocks; /* max block index of file */
int active_logs; /* # of active logs */
int dir_level; /* directory level */
block_t user_block_count; /* # of user blocks */
block_t total_valid_block_count; /* # of valid blocks */
- block_t alloc_valid_block_count; /* # of allocated blocks */
+ block_t discard_blks; /* discard command candidats */
block_t last_valid_block_count; /* for recovery */
u32 s_next_generation; /* for NFS support */
- atomic_t nr_pages[NR_COUNT_TYPE]; /* # of pages, see count_type */
+
+ /* # of pages, see count_type */
+ atomic_t nr_pages[NR_COUNT_TYPE];
+ /* # of allocated blocks */
+ struct percpu_counter alloc_valid_block_count;
+
+ /* writeback control */
+ atomic_t wb_sync_req; /* count # of WB_SYNC threads */
+
+ /* valid inode count */
+ struct percpu_counter total_valid_inode_count;
struct f2fs_mount_info mount_opt; /* mount options */
@@ -545,6 +1058,9 @@ struct f2fs_sb_info {
struct f2fs_gc_kthread *gc_thread; /* GC thread */
unsigned int cur_victim_sec; /* current victim section num */
+ /* threshold for converting bg victims for fg */
+ u64 fggc_threshold;
+
/* maximum # of trials to find a victim segment for SSR and GC */
unsigned int max_victim_search;
@@ -556,22 +1072,129 @@ struct f2fs_sb_info {
struct f2fs_stat_info *stat_info; /* FS status information */
unsigned int segment_count[2]; /* # of allocated segments */
unsigned int block_count[2]; /* # of allocated blocks */
- int total_hit_ext, read_hit_ext; /* extent cache hit ratio */
- int inline_inode; /* # of inline_data inodes */
+ atomic_t inplace_count; /* # of inplace update */
+ atomic64_t total_hit_ext; /* # of lookup extent cache */
+ atomic64_t read_hit_rbtree; /* # of hit rbtree extent node */
+ atomic64_t read_hit_largest; /* # of hit largest extent node */
+ atomic64_t read_hit_cached; /* # of hit cached extent node */
+ atomic_t inline_xattr; /* # of inline_xattr inodes */
+ atomic_t inline_inode; /* # of inline_data inodes */
+ atomic_t inline_dir; /* # of inline_dentry inodes */
+ atomic_t aw_cnt; /* # of atomic writes */
+ atomic_t vw_cnt; /* # of volatile writes */
+ atomic_t max_aw_cnt; /* max # of atomic writes */
+ atomic_t max_vw_cnt; /* max # of volatile writes */
int bg_gc; /* background gc calls */
- unsigned int n_dirty_dirs; /* # of dir inodes */
+ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */
#endif
- unsigned int last_victim[2]; /* last victim segment # */
spinlock_t stat_lock; /* lock for stat operations */
/* For sysfs suppport */
struct kobject s_kobj;
struct completion s_kobj_unregister;
+
+ /* For shrinker support */
+ struct list_head s_list;
+ int s_ndevs; /* number of devices */
+ struct f2fs_dev_info *devs; /* for device list */
+ struct mutex umount_mutex;
+ unsigned int shrinker_run_no;
+
+ /* For write statistics */
+ u64 sectors_written_start;
+ u64 kbytes_written;
+
+ /* Reference to checksum algorithm driver via cryptoapi */
+ struct crypto_shash *s_chksum_driver;
+
+ /* For fault injection */
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ struct f2fs_fault_info fault_info;
+#endif
};
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+#define f2fs_show_injection_info(type) \
+ printk("%sF2FS-fs : inject %s in %s of %pF\n", \
+ KERN_INFO, fault_name[type], \
+ __func__, __builtin_return_address(0))
+static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type)
+{
+ struct f2fs_fault_info *ffi = &sbi->fault_info;
+
+ if (!ffi->inject_rate)
+ return false;
+
+ if (!IS_FAULT_SET(ffi, type))
+ return false;
+
+ atomic_inc(&ffi->inject_ops);
+ if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) {
+ atomic_set(&ffi->inject_ops, 0);
+ return true;
+ }
+ return false;
+}
+#endif
+
+/* For write statistics. Suppose sector size is 512 bytes,
+ * and the return value is in kbytes. s is of struct f2fs_sb_info.
+ */
+#define BD_PART_WRITTEN(s) \
+(((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[1]) - \
+ (s)->sectors_written_start) >> 1)
+
+static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type)
+{
+ sbi->last_time[type] = jiffies;
+}
+
+static inline bool f2fs_time_over(struct f2fs_sb_info *sbi, int type)
+{
+ struct timespec ts = {sbi->interval_time[type], 0};
+ unsigned long interval = timespec_to_jiffies(&ts);
+
+ return time_after(jiffies, sbi->last_time[type] + interval);
+}
+
+static inline bool is_idle(struct f2fs_sb_info *sbi)
+{
+ struct block_device *bdev = sbi->sb->s_bdev;
+ struct request_queue *q = bdev_get_queue(bdev);
+ struct request_list *rl = &q->root_rl;
+
+ if (rl->count[BLK_RW_SYNC] || rl->count[BLK_RW_ASYNC])
+ return 0;
+
+ return f2fs_time_over(sbi, REQ_TIME);
+}
+
/*
* Inline functions
*/
+static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, const void *address,
+ unsigned int length)
+{
+ SHASH_DESC_ON_STACK(shash, sbi->s_chksum_driver);
+ u32 *ctx = (u32 *)shash_desc_ctx(shash);
+ int err;
+
+ shash->tfm = sbi->s_chksum_driver;
+ shash->flags = 0;
+ *ctx = F2FS_SUPER_MAGIC;
+
+ err = crypto_shash_update(shash, address, length);
+ BUG_ON(err);
+
+ return *ctx;
+}
+
+static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc,
+ void *buf, size_t buf_size)
+{
+ return f2fs_crc32(sbi, buf, buf_size) == blk_crc;
+}
+
static inline struct f2fs_inode_info *F2FS_I(struct inode *inode)
{
return container_of(inode, struct f2fs_inode_info, vfs_inode);
@@ -652,14 +1275,19 @@ static inline struct address_space *NODE_MAPPING(struct f2fs_sb_info *sbi)
return sbi->node_inode->i_mapping;
}
-static inline void F2FS_SET_SB_DIRT(struct f2fs_sb_info *sbi)
+static inline bool is_sbi_flag_set(struct f2fs_sb_info *sbi, unsigned int type)
+{
+ return test_bit(type, &sbi->s_flag);
+}
+
+static inline void set_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type)
{
- sbi->s_dirty = 1;
+ set_bit(type, &sbi->s_flag);
}
-static inline void F2FS_RESET_SB_DIRT(struct f2fs_sb_info *sbi)
+static inline void clear_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type)
{
- sbi->s_dirty = 0;
+ clear_bit(type, &sbi->s_flag);
}
static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp)
@@ -667,26 +1295,77 @@ static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp)
return le64_to_cpu(cp->checkpoint_ver);
}
-static inline bool is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
+static inline __u64 cur_cp_crc(struct f2fs_checkpoint *cp)
+{
+ size_t crc_offset = le32_to_cpu(cp->checksum_offset);
+ return le32_to_cpu(*((__le32 *)((unsigned char *)cp + crc_offset)));
+}
+
+static inline bool __is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
{
unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags);
+
return ckpt_flags & f;
}
-static inline void set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
+static inline bool is_set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f)
{
- unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags);
+ return __is_set_ckpt_flags(F2FS_CKPT(sbi), f);
+}
+
+static inline void __set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
+{
+ unsigned int ckpt_flags;
+
+ ckpt_flags = le32_to_cpu(cp->ckpt_flags);
ckpt_flags |= f;
cp->ckpt_flags = cpu_to_le32(ckpt_flags);
}
-static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
+static inline void set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f)
{
- unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags);
+ spin_lock(&sbi->cp_lock);
+ __set_ckpt_flags(F2FS_CKPT(sbi), f);
+ spin_unlock(&sbi->cp_lock);
+}
+
+static inline void __clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
+{
+ unsigned int ckpt_flags;
+
+ ckpt_flags = le32_to_cpu(cp->ckpt_flags);
ckpt_flags &= (~f);
cp->ckpt_flags = cpu_to_le32(ckpt_flags);
}
+static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f)
+{
+ spin_lock(&sbi->cp_lock);
+ __clear_ckpt_flags(F2FS_CKPT(sbi), f);
+ spin_unlock(&sbi->cp_lock);
+}
+
+static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock)
+{
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+
+ if (lock)
+ spin_lock(&sbi->cp_lock);
+ __clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG);
+ kfree(NM_I(sbi)->nat_bits);
+ NM_I(sbi)->nat_bits = NULL;
+ if (lock)
+ spin_unlock(&sbi->cp_lock);
+}
+
+static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi,
+ struct cp_control *cpc)
+{
+ bool set = is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
+
+ return (cpc) ? (cpc->reason & CP_UMOUNT) && set : set;
+}
+
static inline void f2fs_lock_op(struct f2fs_sb_info *sbi)
{
down_read(&sbi->cp_rwsem);
@@ -699,7 +1378,7 @@ static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi)
static inline void f2fs_lock_all(struct f2fs_sb_info *sbi)
{
- f2fs_down_write(&sbi->cp_rwsem, &sbi->cp_mutex);
+ down_write(&sbi->cp_rwsem);
}
static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
@@ -707,6 +1386,28 @@ static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
up_write(&sbi->cp_rwsem);
}
+static inline int __get_cp_reason(struct f2fs_sb_info *sbi)
+{
+ int reason = CP_SYNC;
+
+ if (test_opt(sbi, FASTBOOT))
+ reason = CP_FASTBOOT;
+ if (is_sbi_flag_set(sbi, SBI_IS_CLOSE))
+ reason = CP_UMOUNT;
+ return reason;
+}
+
+static inline bool __remain_node_summaries(int reason)
+{
+ return (reason & (CP_UMOUNT | CP_FASTBOOT));
+}
+
+static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi)
+{
+ return (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG) ||
+ is_set_ckpt_flags(sbi, CP_FASTBOOT_FLAG));
+}
+
/*
* Check whether the given nid is within node id range.
*/
@@ -737,22 +1438,39 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs)
return ofs == XATTR_NODE_OFFSET;
}
+static inline void f2fs_i_blocks_write(struct inode *, blkcnt_t, bool);
static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi,
- struct inode *inode, blkcnt_t count)
+ struct inode *inode, blkcnt_t *count)
{
- block_t valid_block_count;
+ blkcnt_t diff;
- spin_lock(&sbi->stat_lock);
- valid_block_count =
- sbi->total_valid_block_count + (block_t)count;
- if (unlikely(valid_block_count > sbi->user_block_count)) {
- spin_unlock(&sbi->stat_lock);
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (time_to_inject(sbi, FAULT_BLOCK)) {
+ f2fs_show_injection_info(FAULT_BLOCK);
return false;
}
- inode->i_blocks += count;
- sbi->total_valid_block_count = valid_block_count;
- sbi->alloc_valid_block_count += (block_t)count;
+#endif
+ /*
+ * let's increase this in prior to actual block count change in order
+ * for f2fs_sync_file to avoid data races when deciding checkpoint.
+ */
+ percpu_counter_add(&sbi->alloc_valid_block_count, (*count));
+
+ spin_lock(&sbi->stat_lock);
+ sbi->total_valid_block_count += (block_t)(*count);
+ if (unlikely(sbi->total_valid_block_count > sbi->user_block_count)) {
+ diff = sbi->total_valid_block_count - sbi->user_block_count;
+ *count -= diff;
+ sbi->total_valid_block_count = sbi->user_block_count;
+ if (!*count) {
+ spin_unlock(&sbi->stat_lock);
+ percpu_counter_sub(&sbi->alloc_valid_block_count, diff);
+ return false;
+ }
+ }
spin_unlock(&sbi->stat_lock);
+
+ f2fs_i_blocks_write(inode, *count, true);
return true;
}
@@ -763,22 +1481,27 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi,
spin_lock(&sbi->stat_lock);
f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count);
f2fs_bug_on(sbi, inode->i_blocks < count);
- inode->i_blocks -= count;
sbi->total_valid_block_count -= (block_t)count;
spin_unlock(&sbi->stat_lock);
+ f2fs_i_blocks_write(inode, count, false);
}
static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
{
atomic_inc(&sbi->nr_pages[count_type]);
- F2FS_SET_SB_DIRT(sbi);
+
+ if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES ||
+ count_type == F2FS_WB_CP_DATA || count_type == F2FS_WB_DATA)
+ return;
+
+ set_sbi_flag(sbi, SBI_IS_DIRTY);
}
static inline void inode_inc_dirty_pages(struct inode *inode)
{
atomic_inc(&F2FS_I(inode)->dirty_pages);
- if (S_ISDIR(inode->i_mode))
- inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS);
+ inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ?
+ F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA);
}
static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
@@ -788,16 +1511,16 @@ static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
static inline void inode_dec_dirty_pages(struct inode *inode)
{
- if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode))
+ if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) &&
+ !S_ISLNK(inode->i_mode))
return;
atomic_dec(&F2FS_I(inode)->dirty_pages);
-
- if (S_ISDIR(inode->i_mode))
- dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS);
+ dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ?
+ F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA);
}
-static inline int get_pages(struct f2fs_sb_info *sbi, int count_type)
+static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type)
{
return atomic_read(&sbi->nr_pages[count_type]);
}
@@ -809,10 +1532,11 @@ static inline int get_dirty_pages(struct inode *inode)
static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type)
{
- unsigned int pages_per_sec = sbi->segs_per_sec *
- (1 << sbi->log_blocks_per_seg);
- return ((get_pages(sbi, block_type) + pages_per_sec - 1)
- >> sbi->log_blocks_per_seg) / sbi->segs_per_sec;
+ unsigned int pages_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg;
+ unsigned int segs = (get_pages(sbi, block_type) + pages_per_sec - 1) >>
+ sbi->log_blocks_per_seg;
+
+ return segs / sbi->segs_per_sec;
}
static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi)
@@ -820,6 +1544,11 @@ static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi)
return sbi->total_valid_block_count;
}
+static inline block_t discard_blocks(struct f2fs_sb_info *sbi)
+{
+ return sbi->discard_blks;
+}
+
static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
@@ -833,12 +1562,17 @@ static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag)
return 0;
}
+static inline block_t __cp_payload(struct f2fs_sb_info *sbi)
+{
+ return le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload);
+}
+
static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
int offset;
- if (le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload) > 0) {
+ if (__cp_payload(sbi) > 0) {
if (flag == NAT_BITMAP)
return &ckpt->sit_nat_version_bitmap;
else
@@ -852,22 +1586,27 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag)
static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi)
{
- block_t start_addr;
- struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
- unsigned long long ckpt_version = cur_cp_version(ckpt);
-
- start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr);
+ block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr);
- /*
- * odd numbered checkpoint should at cp segment 0
- * and even segment must be at cp segment 1
- */
- if (!(ckpt_version & 1))
+ if (sbi->cur_cp_pack == 2)
start_addr += sbi->blocks_per_seg;
+ return start_addr;
+}
+
+static inline block_t __start_cp_next_addr(struct f2fs_sb_info *sbi)
+{
+ block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr);
+ if (sbi->cur_cp_pack == 1)
+ start_addr += sbi->blocks_per_seg;
return start_addr;
}
+static inline void __set_cp_next_pack(struct f2fs_sb_info *sbi)
+{
+ sbi->cur_cp_pack = (sbi->cur_cp_pack == 1) ? 2 : 1;
+}
+
static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi)
{
return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum);
@@ -894,13 +1633,13 @@ static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi,
}
if (inode)
- inode->i_blocks++;
+ f2fs_i_blocks_write(inode, 1, true);
- sbi->alloc_valid_block_count++;
sbi->total_valid_node_count++;
sbi->total_valid_block_count++;
spin_unlock(&sbi->stat_lock);
+ percpu_counter_inc(&sbi->alloc_valid_block_count);
return true;
}
@@ -913,7 +1652,7 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
f2fs_bug_on(sbi, !sbi->total_valid_node_count);
f2fs_bug_on(sbi, !inode->i_blocks);
- inode->i_blocks--;
+ f2fs_i_blocks_write(inode, 1, false);
sbi->total_valid_node_count--;
sbi->total_valid_block_count--;
@@ -927,23 +1666,46 @@ static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi)
static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
{
- spin_lock(&sbi->stat_lock);
- f2fs_bug_on(sbi, sbi->total_valid_inode_count == sbi->total_node_count);
- sbi->total_valid_inode_count++;
- spin_unlock(&sbi->stat_lock);
+ percpu_counter_inc(&sbi->total_valid_inode_count);
}
static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi)
{
- spin_lock(&sbi->stat_lock);
- f2fs_bug_on(sbi, !sbi->total_valid_inode_count);
- sbi->total_valid_inode_count--;
- spin_unlock(&sbi->stat_lock);
+ percpu_counter_dec(&sbi->total_valid_inode_count);
}
-static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi)
+static inline s64 valid_inode_count(struct f2fs_sb_info *sbi)
{
- return sbi->total_valid_inode_count;
+ return percpu_counter_sum_positive(&sbi->total_valid_inode_count);
+}
+
+static inline struct page *f2fs_grab_cache_page(struct address_space *mapping,
+ pgoff_t index, bool for_write)
+{
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ struct page *page = find_lock_page(mapping, index);
+
+ if (page)
+ return page;
+
+ if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) {
+ f2fs_show_injection_info(FAULT_PAGE_ALLOC);
+ return NULL;
+ }
+#endif
+ if (!for_write)
+ return grab_cache_page(mapping, index);
+ return grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
+}
+
+static inline void f2fs_copy_page(struct page *src, struct page *dst)
+{
+ char *src_kaddr = kmap(src);
+ char *dst_kaddr = kmap(dst);
+
+ memcpy(dst_kaddr, src_kaddr, PAGE_SIZE);
+ kunmap(dst);
+ kunmap(src);
}
static inline void f2fs_put_page(struct page *page, int unlock)
@@ -955,7 +1717,7 @@ static inline void f2fs_put_page(struct page *page, int unlock)
f2fs_bug_on(F2FS_P_SB(page), !PageLocked(page));
unlock_page(page);
}
- page_cache_release(page);
+ put_page(page);
}
static inline void f2fs_put_dnode(struct dnode_of_data *dn)
@@ -978,21 +1740,37 @@ static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep,
gfp_t flags)
{
void *entry;
-retry:
- entry = kmem_cache_alloc(cachep, flags);
- if (!entry) {
- cond_resched();
- goto retry;
- }
+ entry = kmem_cache_alloc(cachep, flags);
+ if (!entry)
+ entry = kmem_cache_alloc(cachep, flags | __GFP_NOFAIL);
return entry;
}
+static inline struct bio *f2fs_bio_alloc(int npages)
+{
+ struct bio *bio;
+
+ /* No failure on bio allocation */
+ bio = bio_alloc(GFP_NOIO, npages);
+ if (!bio)
+ bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages);
+ return bio;
+}
+
+static inline void f2fs_radix_tree_insert(struct radix_tree_root *root,
+ unsigned long index, void *item)
+{
+ while (radix_tree_insert(root, index, item))
+ cond_resched();
+}
+
#define RAW_IS_INODE(p) ((p)->footer.nid == (p)->footer.ino)
static inline bool IS_INODE(struct page *page)
{
struct f2fs_node *p = F2FS_NODE(page);
+
return RAW_IS_INODE(p);
}
@@ -1006,6 +1784,7 @@ static inline block_t datablock_addr(struct page *node_page,
{
struct f2fs_node *raw_node;
__le32 *addr_array;
+
raw_node = F2FS_NODE(node_page);
addr_array = blkaddr_in_node(raw_node);
return le32_to_cpu(addr_array[offset]);
@@ -1020,7 +1799,25 @@ static inline int f2fs_test_bit(unsigned int nr, char *addr)
return mask & *addr;
}
-static inline int f2fs_set_bit(unsigned int nr, char *addr)
+static inline void f2fs_set_bit(unsigned int nr, char *addr)
+{
+ int mask;
+
+ addr += (nr >> 3);
+ mask = 1 << (7 - (nr & 0x07));
+ *addr |= mask;
+}
+
+static inline void f2fs_clear_bit(unsigned int nr, char *addr)
+{
+ int mask;
+
+ addr += (nr >> 3);
+ mask = 1 << (7 - (nr & 0x07));
+ *addr &= ~mask;
+}
+
+static inline int f2fs_test_and_set_bit(unsigned int nr, char *addr)
{
int mask;
int ret;
@@ -1032,7 +1829,7 @@ static inline int f2fs_set_bit(unsigned int nr, char *addr)
return ret;
}
-static inline int f2fs_clear_bit(unsigned int nr, char *addr)
+static inline int f2fs_test_and_clear_bit(unsigned int nr, char *addr)
{
int mask;
int ret;
@@ -1044,86 +1841,179 @@ static inline int f2fs_clear_bit(unsigned int nr, char *addr)
return ret;
}
+static inline void f2fs_change_bit(unsigned int nr, char *addr)
+{
+ int mask;
+
+ addr += (nr >> 3);
+ mask = 1 << (7 - (nr & 0x07));
+ *addr ^= mask;
+}
+
/* used for f2fs_inode_info->flags */
enum {
FI_NEW_INODE, /* indicate newly allocated inode */
FI_DIRTY_INODE, /* indicate inode is dirty or not */
+ FI_AUTO_RECOVER, /* indicate inode is recoverable */
FI_DIRTY_DIR, /* indicate directory has dirty pages */
FI_INC_LINK, /* need to increment i_nlink */
FI_ACL_MODE, /* indicate acl mode */
FI_NO_ALLOC, /* should not allocate any blocks */
- FI_UPDATE_DIR, /* should update inode block for consistency */
- FI_DELAY_IPUT, /* used for the recovery */
+ FI_FREE_NID, /* free allocated nide */
FI_NO_EXTENT, /* not to use the extent cache */
FI_INLINE_XATTR, /* used for inline xattr */
FI_INLINE_DATA, /* used for inline data*/
+ FI_INLINE_DENTRY, /* used for inline dentry */
FI_APPEND_WRITE, /* inode has appended data */
FI_UPDATE_WRITE, /* inode has in-place-update data */
FI_NEED_IPU, /* used for ipu per file */
FI_ATOMIC_FILE, /* indicate atomic file */
+ FI_ATOMIC_COMMIT, /* indicate the state of atomical committing */
FI_VOLATILE_FILE, /* indicate volatile file */
+ FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */
+ FI_DROP_CACHE, /* drop dirty page cache */
+ FI_DATA_EXIST, /* indicate data exists */
+ FI_INLINE_DOTS, /* indicate inline dot dentries */
+ FI_DO_DEFRAG, /* indicate defragment is running */
+ FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */
+ FI_HOT_DATA, /* indicate file is hot */
};
-static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
+static inline void __mark_inode_dirty_flag(struct inode *inode,
+ int flag, bool set)
+{
+ switch (flag) {
+ case FI_INLINE_XATTR:
+ case FI_INLINE_DATA:
+ case FI_INLINE_DENTRY:
+ if (set)
+ return;
+ case FI_DATA_EXIST:
+ case FI_INLINE_DOTS:
+ f2fs_mark_inode_dirty_sync(inode, true);
+ }
+}
+
+static inline void set_inode_flag(struct inode *inode, int flag)
{
- if (!test_bit(flag, &fi->flags))
- set_bit(flag, &fi->flags);
+ if (!test_bit(flag, &F2FS_I(inode)->flags))
+ set_bit(flag, &F2FS_I(inode)->flags);
+ __mark_inode_dirty_flag(inode, flag, true);
}
-static inline int is_inode_flag_set(struct f2fs_inode_info *fi, int flag)
+static inline int is_inode_flag_set(struct inode *inode, int flag)
{
- return test_bit(flag, &fi->flags);
+ return test_bit(flag, &F2FS_I(inode)->flags);
}
-static inline void clear_inode_flag(struct f2fs_inode_info *fi, int flag)
+static inline void clear_inode_flag(struct inode *inode, int flag)
{
- if (test_bit(flag, &fi->flags))
- clear_bit(flag, &fi->flags);
+ if (test_bit(flag, &F2FS_I(inode)->flags))
+ clear_bit(flag, &F2FS_I(inode)->flags);
+ __mark_inode_dirty_flag(inode, flag, false);
}
-static inline void set_acl_inode(struct f2fs_inode_info *fi, umode_t mode)
+static inline void set_acl_inode(struct inode *inode, umode_t mode)
{
- fi->i_acl_mode = mode;
- set_inode_flag(fi, FI_ACL_MODE);
+ F2FS_I(inode)->i_acl_mode = mode;
+ set_inode_flag(inode, FI_ACL_MODE);
+ f2fs_mark_inode_dirty_sync(inode, false);
}
-static inline int cond_clear_inode_flag(struct f2fs_inode_info *fi, int flag)
+static inline void f2fs_i_links_write(struct inode *inode, bool inc)
{
- if (is_inode_flag_set(fi, FI_ACL_MODE)) {
- clear_inode_flag(fi, FI_ACL_MODE);
- return 1;
- }
- return 0;
+ if (inc)
+ inc_nlink(inode);
+ else
+ drop_nlink(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
}
-static inline void get_inline_info(struct f2fs_inode_info *fi,
- struct f2fs_inode *ri)
+static inline void f2fs_i_blocks_write(struct inode *inode,
+ blkcnt_t diff, bool add)
{
+ bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE);
+ bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER);
+
+ inode->i_blocks = add ? inode->i_blocks + diff :
+ inode->i_blocks - diff;
+ f2fs_mark_inode_dirty_sync(inode, true);
+ if (clean || recover)
+ set_inode_flag(inode, FI_AUTO_RECOVER);
+}
+
+static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size)
+{
+ bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE);
+ bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER);
+
+ if (i_size_read(inode) == i_size)
+ return;
+
+ i_size_write(inode, i_size);
+ f2fs_mark_inode_dirty_sync(inode, true);
+ if (clean || recover)
+ set_inode_flag(inode, FI_AUTO_RECOVER);
+}
+
+static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth)
+{
+ F2FS_I(inode)->i_current_depth = depth;
+ f2fs_mark_inode_dirty_sync(inode, true);
+}
+
+static inline void f2fs_i_xnid_write(struct inode *inode, nid_t xnid)
+{
+ F2FS_I(inode)->i_xattr_nid = xnid;
+ f2fs_mark_inode_dirty_sync(inode, true);
+}
+
+static inline void f2fs_i_pino_write(struct inode *inode, nid_t pino)
+{
+ F2FS_I(inode)->i_pino = pino;
+ f2fs_mark_inode_dirty_sync(inode, true);
+}
+
+static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri)
+{
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+
if (ri->i_inline & F2FS_INLINE_XATTR)
- set_inode_flag(fi, FI_INLINE_XATTR);
+ set_bit(FI_INLINE_XATTR, &fi->flags);
if (ri->i_inline & F2FS_INLINE_DATA)
- set_inode_flag(fi, FI_INLINE_DATA);
+ set_bit(FI_INLINE_DATA, &fi->flags);
+ if (ri->i_inline & F2FS_INLINE_DENTRY)
+ set_bit(FI_INLINE_DENTRY, &fi->flags);
+ if (ri->i_inline & F2FS_DATA_EXIST)
+ set_bit(FI_DATA_EXIST, &fi->flags);
+ if (ri->i_inline & F2FS_INLINE_DOTS)
+ set_bit(FI_INLINE_DOTS, &fi->flags);
}
-static inline void set_raw_inline(struct f2fs_inode_info *fi,
- struct f2fs_inode *ri)
+static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri)
{
ri->i_inline = 0;
- if (is_inode_flag_set(fi, FI_INLINE_XATTR))
+ if (is_inode_flag_set(inode, FI_INLINE_XATTR))
ri->i_inline |= F2FS_INLINE_XATTR;
- if (is_inode_flag_set(fi, FI_INLINE_DATA))
+ if (is_inode_flag_set(inode, FI_INLINE_DATA))
ri->i_inline |= F2FS_INLINE_DATA;
+ if (is_inode_flag_set(inode, FI_INLINE_DENTRY))
+ ri->i_inline |= F2FS_INLINE_DENTRY;
+ if (is_inode_flag_set(inode, FI_DATA_EXIST))
+ ri->i_inline |= F2FS_DATA_EXIST;
+ if (is_inode_flag_set(inode, FI_INLINE_DOTS))
+ ri->i_inline |= F2FS_INLINE_DOTS;
}
static inline int f2fs_has_inline_xattr(struct inode *inode)
{
- return is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR);
+ return is_inode_flag_set(inode, FI_INLINE_XATTR);
}
-static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi)
+static inline unsigned int addrs_per_inode(struct inode *inode)
{
- if (f2fs_has_inline_xattr(&fi->vfs_inode))
+ if (f2fs_has_inline_xattr(inode))
return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS;
return DEF_ADDRS_PER_INODE;
}
@@ -1131,6 +2021,7 @@ static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi)
static inline void *inline_xattr_addr(struct page *page)
{
struct f2fs_inode *ri = F2FS_INODE(page);
+
return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE -
F2FS_INLINE_XATTR_ADDRS]);
}
@@ -1145,25 +2036,97 @@ static inline int inline_xattr_size(struct inode *inode)
static inline int f2fs_has_inline_data(struct inode *inode)
{
- return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA);
+ return is_inode_flag_set(inode, FI_INLINE_DATA);
+}
+
+static inline int f2fs_exist_data(struct inode *inode)
+{
+ return is_inode_flag_set(inode, FI_DATA_EXIST);
+}
+
+static inline int f2fs_has_inline_dots(struct inode *inode)
+{
+ return is_inode_flag_set(inode, FI_INLINE_DOTS);
}
static inline bool f2fs_is_atomic_file(struct inode *inode)
{
- return is_inode_flag_set(F2FS_I(inode), FI_ATOMIC_FILE);
+ return is_inode_flag_set(inode, FI_ATOMIC_FILE);
+}
+
+static inline bool f2fs_is_commit_atomic_write(struct inode *inode)
+{
+ return is_inode_flag_set(inode, FI_ATOMIC_COMMIT);
}
static inline bool f2fs_is_volatile_file(struct inode *inode)
{
- return is_inode_flag_set(F2FS_I(inode), FI_VOLATILE_FILE);
+ return is_inode_flag_set(inode, FI_VOLATILE_FILE);
+}
+
+static inline bool f2fs_is_first_block_written(struct inode *inode)
+{
+ return is_inode_flag_set(inode, FI_FIRST_BLOCK_WRITTEN);
+}
+
+static inline bool f2fs_is_drop_cache(struct inode *inode)
+{
+ return is_inode_flag_set(inode, FI_DROP_CACHE);
}
static inline void *inline_data_addr(struct page *page)
{
struct f2fs_inode *ri = F2FS_INODE(page);
+
return (void *)&(ri->i_addr[1]);
}
+static inline int f2fs_has_inline_dentry(struct inode *inode)
+{
+ return is_inode_flag_set(inode, FI_INLINE_DENTRY);
+}
+
+static inline void f2fs_dentry_kunmap(struct inode *dir, struct page *page)
+{
+ if (!f2fs_has_inline_dentry(dir))
+ kunmap(page);
+}
+
+static inline int is_file(struct inode *inode, int type)
+{
+ return F2FS_I(inode)->i_advise & type;
+}
+
+static inline void set_file(struct inode *inode, int type)
+{
+ F2FS_I(inode)->i_advise |= type;
+ f2fs_mark_inode_dirty_sync(inode, true);
+}
+
+static inline void clear_file(struct inode *inode, int type)
+{
+ F2FS_I(inode)->i_advise &= ~type;
+ f2fs_mark_inode_dirty_sync(inode, true);
+}
+
+static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync)
+{
+ if (dsync) {
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ bool ret;
+
+ spin_lock(&sbi->inode_lock[DIRTY_META]);
+ ret = list_empty(&F2FS_I(inode)->gdirty_list);
+ spin_unlock(&sbi->inode_lock[DIRTY_META]);
+ return ret;
+ }
+ if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) ||
+ file_keep_isize(inode) ||
+ i_size_read(inode) & PAGE_MASK)
+ return false;
+ return F2FS_I(inode)->last_disk_size == i_size_read(inode);
+}
+
static inline int f2fs_readonly(struct super_block *sb)
{
return sb->s_flags & MS_RDONLY;
@@ -1171,50 +2134,94 @@ static inline int f2fs_readonly(struct super_block *sb)
static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi)
{
- return is_set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
+ return is_set_ckpt_flags(sbi, CP_ERROR_FLAG);
}
-static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi)
+static inline bool is_dot_dotdot(const struct qstr *str)
{
- set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
- sbi->sb->s_flags |= MS_RDONLY;
+ if (str->len == 1 && str->name[0] == '.')
+ return true;
+
+ if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.')
+ return true;
+
+ return false;
+}
+
+static inline bool f2fs_may_extent_tree(struct inode *inode)
+{
+ mode_t mode = inode->i_mode;
+
+ if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE) ||
+ is_inode_flag_set(inode, FI_NO_EXTENT))
+ return false;
+
+ return S_ISREG(mode);
+}
+
+static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi,
+ size_t size, gfp_t flags)
+{
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (time_to_inject(sbi, FAULT_KMALLOC)) {
+ f2fs_show_injection_info(FAULT_KMALLOC);
+ return NULL;
+ }
+#endif
+ return kmalloc(size, flags);
+}
+
+static inline void *f2fs_kvmalloc(size_t size, gfp_t flags)
+{
+ void *ret;
+
+ ret = kmalloc(size, flags | __GFP_NOWARN);
+ if (!ret)
+ ret = __vmalloc(size, flags, PAGE_KERNEL);
+ return ret;
+}
+
+static inline void *f2fs_kvzalloc(size_t size, gfp_t flags)
+{
+ void *ret;
+
+ ret = kzalloc(size, flags | __GFP_NOWARN);
+ if (!ret)
+ ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL);
+ return ret;
}
#define get_inode_mode(i) \
- ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \
+ ((is_inode_flag_set(i, FI_ACL_MODE)) ? \
(F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
-/* get offset of first page in next direct node */
-#define PGOFS_OF_NEXT_DNODE(pgofs, fi) \
- ((pgofs < ADDRS_PER_INODE(fi)) ? ADDRS_PER_INODE(fi) : \
- (pgofs - ADDRS_PER_INODE(fi) + ADDRS_PER_BLOCK) / \
- ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi))
-
/*
* file.c
*/
-int f2fs_sync_file(struct file *, loff_t, loff_t, int);
-void truncate_data_blocks(struct dnode_of_data *);
-int truncate_blocks(struct inode *, u64, bool);
-void f2fs_truncate(struct inode *);
-int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
-int f2fs_setattr(struct dentry *, struct iattr *);
-int truncate_hole(struct inode *, pgoff_t, pgoff_t);
-int truncate_data_blocks_range(struct dnode_of_data *, int);
-long f2fs_ioctl(struct file *, unsigned int, unsigned long);
-long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long);
+int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
+void truncate_data_blocks(struct dnode_of_data *dn);
+int truncate_blocks(struct inode *inode, u64 from, bool lock);
+int f2fs_truncate(struct inode *inode);
+int f2fs_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat);
+int f2fs_setattr(struct dentry *dentry, struct iattr *attr);
+int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end);
+int truncate_data_blocks_range(struct dnode_of_data *dn, int count);
+long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
+long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
/*
* inode.c
*/
-void f2fs_set_inode_flags(struct inode *);
-struct inode *f2fs_iget(struct super_block *, unsigned long);
-int try_to_free_nats(struct f2fs_sb_info *, int);
-void update_inode(struct inode *, struct page *);
-void update_inode_page(struct inode *);
-int f2fs_write_inode(struct inode *, struct writeback_control *);
-void f2fs_evict_inode(struct inode *);
-void handle_failed_inode(struct inode *);
+void f2fs_set_inode_flags(struct inode *inode);
+struct inode *f2fs_iget(struct super_block *sb, unsigned long ino);
+struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino);
+int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink);
+int update_inode(struct inode *inode, struct page *node_page);
+int update_inode_page(struct inode *inode);
+int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc);
+void f2fs_evict_inode(struct inode *inode);
+void handle_failed_inode(struct inode *inode);
/*
* namei.c
@@ -1224,36 +2231,68 @@ struct dentry *f2fs_get_parent(struct dentry *child);
/*
* dir.c
*/
-struct f2fs_dir_entry *f2fs_find_entry(struct inode *, struct qstr *,
- struct page **);
-struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **);
-ino_t f2fs_inode_by_name(struct inode *, struct qstr *);
-void f2fs_set_link(struct inode *, struct f2fs_dir_entry *,
- struct page *, struct inode *);
-int update_dent_inode(struct inode *, const struct qstr *);
-int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *);
-void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *);
-int f2fs_do_tmpfile(struct inode *, struct inode *);
-int f2fs_make_empty(struct inode *, struct inode *);
-bool f2fs_empty_dir(struct inode *);
+void set_de_type(struct f2fs_dir_entry *de, umode_t mode);
+unsigned char get_de_type(struct f2fs_dir_entry *de);
+struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname,
+ f2fs_hash_t namehash, int *max_slots,
+ struct f2fs_dentry_ptr *d);
+int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
+ unsigned int start_pos, struct fscrypt_str *fstr);
+void do_make_empty_dir(struct inode *inode, struct inode *parent,
+ struct f2fs_dentry_ptr *d);
+struct page *init_inode_metadata(struct inode *inode, struct inode *dir,
+ const struct qstr *new_name,
+ const struct qstr *orig_name, struct page *dpage);
+void update_parent_metadata(struct inode *dir, struct inode *inode,
+ unsigned int current_depth);
+int room_for_filename(const void *bitmap, int slots, int max_slots);
+void f2fs_drop_nlink(struct inode *dir, struct inode *inode);
+struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir,
+ struct fscrypt_name *fname, struct page **res_page);
+struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
+ struct qstr *child, struct page **res_page);
+struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p);
+ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr,
+ struct page **page);
+void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
+ struct page *page, struct inode *inode);
+void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d,
+ const struct qstr *name, f2fs_hash_t name_hash,
+ unsigned int bit_pos);
+int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name,
+ const struct qstr *orig_name,
+ struct inode *inode, nid_t ino, umode_t mode);
+int __f2fs_do_add_link(struct inode *dir, struct fscrypt_name *fname,
+ struct inode *inode, nid_t ino, umode_t mode);
+int __f2fs_add_link(struct inode *dir, const struct qstr *name,
+ struct inode *inode, nid_t ino, umode_t mode);
+void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
+ struct inode *dir, struct inode *inode);
+int f2fs_do_tmpfile(struct inode *inode, struct inode *dir);
+bool f2fs_empty_dir(struct inode *dir);
static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode)
{
- return __f2fs_add_link(dentry->d_parent->d_inode, &dentry->d_name,
- inode);
+ return __f2fs_add_link(d_inode(dentry->d_parent), &dentry->d_name,
+ inode, inode->i_ino, inode->i_mode);
}
/*
* super.c
*/
-int f2fs_sync_fs(struct super_block *, int);
+int f2fs_inode_dirtied(struct inode *inode, bool sync);
+void f2fs_inode_synced(struct inode *inode);
+int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover);
+int f2fs_sync_fs(struct super_block *sb, int sync);
extern __printf(3, 4)
-void f2fs_msg(struct super_block *, const char *, const char *, ...);
+void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...);
+int sanity_check_ckpt(struct f2fs_sb_info *sbi);
/*
* hash.c
*/
-f2fs_hash_t f2fs_dentry_hash(const struct qstr *);
+f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info,
+ struct fscrypt_name *fname);
/*
* node.c
@@ -1261,136 +2300,186 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *);
struct dnode_of_data;
struct node_info;
-bool available_free_memory(struct f2fs_sb_info *, int);
-bool is_checkpointed_node(struct f2fs_sb_info *, nid_t);
-bool has_fsynced_inode(struct f2fs_sb_info *, nid_t);
-bool need_inode_block_update(struct f2fs_sb_info *, nid_t);
-void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
-int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
-int truncate_inode_blocks(struct inode *, pgoff_t);
-int truncate_xattr_node(struct inode *, struct page *);
-int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t);
-void remove_inode_page(struct inode *);
-struct page *new_inode_page(struct inode *);
-struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *);
-void ra_node_page(struct f2fs_sb_info *, nid_t);
-struct page *get_node_page(struct f2fs_sb_info *, pgoff_t);
-struct page *get_node_page_ra(struct page *, int);
-void sync_inode_page(struct dnode_of_data *);
-int sync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *);
-bool alloc_nid(struct f2fs_sb_info *, nid_t *);
-void alloc_nid_done(struct f2fs_sb_info *, nid_t);
-void alloc_nid_failed(struct f2fs_sb_info *, nid_t);
-void recover_inline_xattr(struct inode *, struct page *);
-void recover_xattr_data(struct inode *, struct page *, block_t);
-int recover_inode_page(struct f2fs_sb_info *, struct page *);
-int restore_node_summary(struct f2fs_sb_info *, unsigned int,
- struct f2fs_summary_block *);
-void flush_nat_entries(struct f2fs_sb_info *);
-int build_node_manager(struct f2fs_sb_info *);
-void destroy_node_manager(struct f2fs_sb_info *);
+bool available_free_memory(struct f2fs_sb_info *sbi, int type);
+int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid);
+bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid);
+bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino);
+void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni);
+pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs);
+int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode);
+int truncate_inode_blocks(struct inode *inode, pgoff_t from);
+int truncate_xattr_node(struct inode *inode, struct page *page);
+int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino);
+int remove_inode_page(struct inode *inode);
+struct page *new_inode_page(struct inode *inode);
+struct page *new_node_page(struct dnode_of_data *dn,
+ unsigned int ofs, struct page *ipage);
+void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid);
+struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid);
+struct page *get_node_page_ra(struct page *parent, int start);
+void move_node_page(struct page *node_page, int gc_type);
+int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
+ struct writeback_control *wbc, bool atomic);
+int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc);
+void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount);
+bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid);
+void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid);
+void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid);
+int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink);
+void recover_inline_xattr(struct inode *inode, struct page *page);
+int recover_xattr_data(struct inode *inode, struct page *page,
+ block_t blkaddr);
+int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page);
+int restore_node_summary(struct f2fs_sb_info *sbi,
+ unsigned int segno, struct f2fs_summary_block *sum);
+void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc);
+int build_node_manager(struct f2fs_sb_info *sbi);
+void destroy_node_manager(struct f2fs_sb_info *sbi);
int __init create_node_manager_caches(void);
void destroy_node_manager_caches(void);
/*
* segment.c
*/
-void register_inmem_page(struct inode *, struct page *);
-void commit_inmem_pages(struct inode *, bool);
-void f2fs_balance_fs(struct f2fs_sb_info *);
-void f2fs_balance_fs_bg(struct f2fs_sb_info *);
-int f2fs_issue_flush(struct f2fs_sb_info *);
-int create_flush_cmd_control(struct f2fs_sb_info *);
-void destroy_flush_cmd_control(struct f2fs_sb_info *);
-void invalidate_blocks(struct f2fs_sb_info *, block_t);
-void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t);
-void clear_prefree_segments(struct f2fs_sb_info *);
-void release_discard_addrs(struct f2fs_sb_info *);
-void discard_next_dnode(struct f2fs_sb_info *, block_t);
-int npages_for_summary_flush(struct f2fs_sb_info *);
-void allocate_new_segments(struct f2fs_sb_info *);
-int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *);
-struct page *get_sum_page(struct f2fs_sb_info *, unsigned int);
-void write_meta_page(struct f2fs_sb_info *, struct page *);
-void write_node_page(struct f2fs_sb_info *, struct page *,
- struct f2fs_io_info *, unsigned int, block_t, block_t *);
-void write_data_page(struct page *, struct dnode_of_data *, block_t *,
- struct f2fs_io_info *);
-void rewrite_data_page(struct page *, block_t, struct f2fs_io_info *);
-void recover_data_page(struct f2fs_sb_info *, struct page *,
- struct f2fs_summary *, block_t, block_t);
-void allocate_data_block(struct f2fs_sb_info *, struct page *,
- block_t, block_t *, struct f2fs_summary *, int);
-void f2fs_wait_on_page_writeback(struct page *, enum page_type);
-void write_data_summaries(struct f2fs_sb_info *, block_t);
-void write_node_summaries(struct f2fs_sb_info *, block_t);
-int lookup_journal_in_cursum(struct f2fs_summary_block *,
- int, unsigned int, int);
-void flush_sit_entries(struct f2fs_sb_info *, struct cp_control *);
-int build_segment_manager(struct f2fs_sb_info *);
-void destroy_segment_manager(struct f2fs_sb_info *);
+void register_inmem_page(struct inode *inode, struct page *page);
+void drop_inmem_pages(struct inode *inode);
+void drop_inmem_page(struct inode *inode, struct page *page);
+int commit_inmem_pages(struct inode *inode);
+void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need);
+void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi);
+int f2fs_issue_flush(struct f2fs_sb_info *sbi);
+int create_flush_cmd_control(struct f2fs_sb_info *sbi);
+void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free);
+void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr);
+bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr);
+void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new);
+void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi);
+void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc);
+void release_discard_addrs(struct f2fs_sb_info *sbi);
+int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra);
+void allocate_new_segments(struct f2fs_sb_info *sbi);
+int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range);
+bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc);
+struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno);
+void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr);
+void write_meta_page(struct f2fs_sb_info *sbi, struct page *page);
+void write_node_page(unsigned int nid, struct f2fs_io_info *fio);
+void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio);
+int rewrite_data_page(struct f2fs_io_info *fio);
+void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
+ block_t old_blkaddr, block_t new_blkaddr,
+ bool recover_curseg, bool recover_newaddr);
+void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
+ block_t old_addr, block_t new_addr,
+ unsigned char version, bool recover_curseg,
+ bool recover_newaddr);
+void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
+ block_t old_blkaddr, block_t *new_blkaddr,
+ struct f2fs_summary *sum, int type);
+void f2fs_wait_on_page_writeback(struct page *page,
+ enum page_type type, bool ordered);
+void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi,
+ block_t blkaddr);
+void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk);
+void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk);
+int lookup_journal_in_cursum(struct f2fs_journal *journal, int type,
+ unsigned int val, int alloc);
+void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc);
+int build_segment_manager(struct f2fs_sb_info *sbi);
+void destroy_segment_manager(struct f2fs_sb_info *sbi);
int __init create_segment_manager_caches(void);
void destroy_segment_manager_caches(void);
/*
* checkpoint.c
*/
-struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
-struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
-struct page *get_meta_page_ra(struct f2fs_sb_info *, pgoff_t);
-int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int);
-long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
-void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
-void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
-void release_dirty_inode(struct f2fs_sb_info *);
-bool exist_written_data(struct f2fs_sb_info *, nid_t, int);
-int acquire_orphan_inode(struct f2fs_sb_info *);
-void release_orphan_inode(struct f2fs_sb_info *);
-void add_orphan_inode(struct f2fs_sb_info *, nid_t);
-void remove_orphan_inode(struct f2fs_sb_info *, nid_t);
-void recover_orphan_inodes(struct f2fs_sb_info *);
-int get_valid_checkpoint(struct f2fs_sb_info *);
-void update_dirty_page(struct inode *, struct page *);
-void add_dirty_dir_inode(struct inode *);
-void remove_dirty_dir_inode(struct inode *);
-void sync_dirty_dir_inodes(struct f2fs_sb_info *);
-void write_checkpoint(struct f2fs_sb_info *, struct cp_control *);
-void init_ino_entry_info(struct f2fs_sb_info *);
+void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io);
+struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index);
+struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index);
+struct page *get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index);
+bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type);
+int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
+ int type, bool sync);
+void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index);
+long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
+ long nr_to_write);
+void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type);
+void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type);
+void release_ino_entry(struct f2fs_sb_info *sbi, bool all);
+bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode);
+int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi);
+int acquire_orphan_inode(struct f2fs_sb_info *sbi);
+void release_orphan_inode(struct f2fs_sb_info *sbi);
+void add_orphan_inode(struct inode *inode);
+void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino);
+int recover_orphan_inodes(struct f2fs_sb_info *sbi);
+int get_valid_checkpoint(struct f2fs_sb_info *sbi);
+void update_dirty_page(struct inode *inode, struct page *page);
+void remove_dirty_inode(struct inode *inode);
+int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type);
+int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc);
+void init_ino_entry_info(struct f2fs_sb_info *sbi);
int __init create_checkpoint_caches(void);
void destroy_checkpoint_caches(void);
/*
* data.c
*/
-void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int);
-int f2fs_submit_page_bio(struct f2fs_sb_info *, struct page *, block_t, int);
-void f2fs_submit_page_mbio(struct f2fs_sb_info *, struct page *, block_t,
- struct f2fs_io_info *);
-int reserve_new_block(struct dnode_of_data *);
-int f2fs_reserve_block(struct dnode_of_data *, pgoff_t);
-void update_extent_cache(block_t, struct dnode_of_data *);
-struct page *find_data_page(struct inode *, pgoff_t, bool);
-struct page *get_lock_data_page(struct inode *, pgoff_t);
-struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
-int do_write_data_page(struct page *, struct f2fs_io_info *);
-int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64);
+void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, enum page_type type,
+ int rw);
+void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *sbi,
+ struct inode *inode, nid_t ino, pgoff_t idx,
+ enum page_type type, int rw);
+void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi);
+int f2fs_submit_page_bio(struct f2fs_io_info *fio);
+int f2fs_submit_page_mbio(struct f2fs_io_info *fio);
+struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi,
+ block_t blk_addr, struct bio *bio);
+int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr);
+void set_data_blkaddr(struct dnode_of_data *dn);
+void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr);
+int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count);
+int reserve_new_block(struct dnode_of_data *dn);
+int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index);
+int f2fs_preallocate_blocks(struct inode *inode, loff_t pos,
+ size_t count, bool dio);
+int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index);
+struct page *get_read_data_page(struct inode *inode, pgoff_t index,
+ int op_flags, bool for_write);
+struct page *find_data_page(struct inode *inode, pgoff_t index);
+struct page *get_lock_data_page(struct inode *inode, pgoff_t index,
+ bool for_write);
+struct page *get_new_data_page(struct inode *inode,
+ struct page *ipage, pgoff_t index, bool new_i_size);
+int do_write_data_page(struct f2fs_io_info *fio);
+int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
+ int create, int flag);
+int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len);
+void f2fs_set_page_dirty_nobuffers(struct page *page);
+void f2fs_invalidate_page(struct page *page, unsigned int offset,
+ unsigned int length);
+int f2fs_release_page(struct page *page, gfp_t wait);
+#ifdef CONFIG_MIGRATION
+int f2fs_migrate_page(struct address_space *mapping, struct page *newpage,
+ struct page *page, enum migrate_mode mode);
+#endif
/*
* gc.c
*/
-int start_gc_thread(struct f2fs_sb_info *);
-void stop_gc_thread(struct f2fs_sb_info *);
-block_t start_bidx_of_node(unsigned int, struct f2fs_inode_info *);
-int f2fs_gc(struct f2fs_sb_info *);
-void build_gc_manager(struct f2fs_sb_info *);
-int __init create_gc_caches(void);
-void destroy_gc_caches(void);
+int start_gc_thread(struct f2fs_sb_info *sbi);
+void stop_gc_thread(struct f2fs_sb_info *sbi);
+block_t start_bidx_of_node(unsigned int node_ofs, struct inode *inode);
+int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background,
+ unsigned int segno);
+void build_gc_manager(struct f2fs_sb_info *sbi);
/*
* recovery.c
*/
-int recover_fsync_data(struct f2fs_sb_info *);
-bool space_for_roll_forward(struct f2fs_sb_info *);
+int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only);
+bool space_for_roll_forward(struct f2fs_sb_info *sbi);
/*
* debug.c
@@ -1401,26 +2490,39 @@ struct f2fs_stat_info {
struct f2fs_sb_info *sbi;
int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs;
int main_area_segs, main_area_sections, main_area_zones;
- int hit_ext, total_ext;
- int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta;
- int nats, sits, fnids;
+ unsigned long long hit_largest, hit_cached, hit_rbtree;
+ unsigned long long hit_total, total_ext;
+ int ext_tree, zombie_tree, ext_node;
+ int ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta;
+ int inmem_pages;
+ unsigned int ndirty_dirs, ndirty_files, ndirty_all;
+ int nats, dirty_nats, sits, dirty_sits;
+ int free_nids, avail_nids, alloc_nids;
int total_count, utilization;
- int bg_gc, inline_inode;
- unsigned int valid_count, valid_node_count, valid_inode_count;
+ int bg_gc, nr_wb_cp_data, nr_wb_data;
+ int nr_flushing, nr_flushed, nr_discarding, nr_discarded;
+ int nr_discard_cmd;
+ unsigned int undiscard_blks;
+ int inline_xattr, inline_inode, inline_dir, append, update, orphans;
+ int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt;
+ unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks;
unsigned int bimodal, avg_vblocks;
int util_free, util_valid, util_invalid;
int rsvd_segs, overp_segs;
int dirty_count, node_pages, meta_pages;
- int prefree_count, call_count, cp_count;
+ int prefree_count, call_count, cp_count, bg_cp_count;
int tot_segs, node_segs, data_segs, free_segs, free_secs;
+ int bg_node_segs, bg_data_segs;
int tot_blks, data_blks, node_blks;
+ int bg_data_blks, bg_node_blks;
int curseg[NR_CURSEG_TYPE];
int cursec[NR_CURSEG_TYPE];
int curzone[NR_CURSEG_TYPE];
unsigned int segment_count[2];
unsigned int block_count[2];
- unsigned base_mem, cache_mem;
+ unsigned int inplace_count;
+ unsigned long long base_mem, cache_mem, page_mem;
};
static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
@@ -1429,79 +2531,143 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
}
#define stat_inc_cp_count(si) ((si)->cp_count++)
+#define stat_inc_bg_cp_count(si) ((si)->bg_cp_count++)
#define stat_inc_call_count(si) ((si)->call_count++)
#define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++)
-#define stat_inc_dirty_dir(sbi) ((sbi)->n_dirty_dirs++)
-#define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--)
-#define stat_inc_total_hit(sb) ((F2FS_SB(sb))->total_hit_ext++)
-#define stat_inc_read_hit(sb) ((F2FS_SB(sb))->read_hit_ext++)
+#define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++)
+#define stat_dec_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]--)
+#define stat_inc_total_hit(sbi) (atomic64_inc(&(sbi)->total_hit_ext))
+#define stat_inc_rbtree_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_rbtree))
+#define stat_inc_largest_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_largest))
+#define stat_inc_cached_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_cached))
+#define stat_inc_inline_xattr(inode) \
+ do { \
+ if (f2fs_has_inline_xattr(inode)) \
+ (atomic_inc(&F2FS_I_SB(inode)->inline_xattr)); \
+ } while (0)
+#define stat_dec_inline_xattr(inode) \
+ do { \
+ if (f2fs_has_inline_xattr(inode)) \
+ (atomic_dec(&F2FS_I_SB(inode)->inline_xattr)); \
+ } while (0)
#define stat_inc_inline_inode(inode) \
do { \
if (f2fs_has_inline_data(inode)) \
- ((F2FS_I_SB(inode))->inline_inode++); \
+ (atomic_inc(&F2FS_I_SB(inode)->inline_inode)); \
} while (0)
#define stat_dec_inline_inode(inode) \
do { \
if (f2fs_has_inline_data(inode)) \
- ((F2FS_I_SB(inode))->inline_inode--); \
+ (atomic_dec(&F2FS_I_SB(inode)->inline_inode)); \
+ } while (0)
+#define stat_inc_inline_dir(inode) \
+ do { \
+ if (f2fs_has_inline_dentry(inode)) \
+ (atomic_inc(&F2FS_I_SB(inode)->inline_dir)); \
+ } while (0)
+#define stat_dec_inline_dir(inode) \
+ do { \
+ if (f2fs_has_inline_dentry(inode)) \
+ (atomic_dec(&F2FS_I_SB(inode)->inline_dir)); \
} while (0)
-
#define stat_inc_seg_type(sbi, curseg) \
((sbi)->segment_count[(curseg)->alloc_type]++)
#define stat_inc_block_count(sbi, curseg) \
((sbi)->block_count[(curseg)->alloc_type]++)
-
-#define stat_inc_seg_count(sbi, type) \
+#define stat_inc_inplace_blocks(sbi) \
+ (atomic_inc(&(sbi)->inplace_count))
+#define stat_inc_atomic_write(inode) \
+ (atomic_inc(&F2FS_I_SB(inode)->aw_cnt))
+#define stat_dec_atomic_write(inode) \
+ (atomic_dec(&F2FS_I_SB(inode)->aw_cnt))
+#define stat_update_max_atomic_write(inode) \
+ do { \
+ int cur = atomic_read(&F2FS_I_SB(inode)->aw_cnt); \
+ int max = atomic_read(&F2FS_I_SB(inode)->max_aw_cnt); \
+ if (cur > max) \
+ atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \
+ } while (0)
+#define stat_inc_volatile_write(inode) \
+ (atomic_inc(&F2FS_I_SB(inode)->vw_cnt))
+#define stat_dec_volatile_write(inode) \
+ (atomic_dec(&F2FS_I_SB(inode)->vw_cnt))
+#define stat_update_max_volatile_write(inode) \
+ do { \
+ int cur = atomic_read(&F2FS_I_SB(inode)->vw_cnt); \
+ int max = atomic_read(&F2FS_I_SB(inode)->max_vw_cnt); \
+ if (cur > max) \
+ atomic_set(&F2FS_I_SB(inode)->max_vw_cnt, cur); \
+ } while (0)
+#define stat_inc_seg_count(sbi, type, gc_type) \
do { \
struct f2fs_stat_info *si = F2FS_STAT(sbi); \
- (si)->tot_segs++; \
- if (type == SUM_TYPE_DATA) \
+ si->tot_segs++; \
+ if ((type) == SUM_TYPE_DATA) { \
si->data_segs++; \
- else \
+ si->bg_data_segs += (gc_type == BG_GC) ? 1 : 0; \
+ } else { \
si->node_segs++; \
+ si->bg_node_segs += (gc_type == BG_GC) ? 1 : 0; \
+ } \
} while (0)
#define stat_inc_tot_blk_count(si, blks) \
- (si->tot_blks += (blks))
+ ((si)->tot_blks += (blks))
-#define stat_inc_data_blk_count(sbi, blks) \
+#define stat_inc_data_blk_count(sbi, blks, gc_type) \
do { \
struct f2fs_stat_info *si = F2FS_STAT(sbi); \
stat_inc_tot_blk_count(si, blks); \
si->data_blks += (blks); \
+ si->bg_data_blks += ((gc_type) == BG_GC) ? (blks) : 0; \
} while (0)
-#define stat_inc_node_blk_count(sbi, blks) \
+#define stat_inc_node_blk_count(sbi, blks, gc_type) \
do { \
struct f2fs_stat_info *si = F2FS_STAT(sbi); \
stat_inc_tot_blk_count(si, blks); \
si->node_blks += (blks); \
+ si->bg_node_blks += ((gc_type) == BG_GC) ? (blks) : 0; \
} while (0)
-int f2fs_build_stats(struct f2fs_sb_info *);
-void f2fs_destroy_stats(struct f2fs_sb_info *);
-void __init f2fs_create_root_stats(void);
+int f2fs_build_stats(struct f2fs_sb_info *sbi);
+void f2fs_destroy_stats(struct f2fs_sb_info *sbi);
+int __init f2fs_create_root_stats(void);
void f2fs_destroy_root_stats(void);
#else
-#define stat_inc_cp_count(si)
-#define stat_inc_call_count(si)
-#define stat_inc_bggc_count(si)
-#define stat_inc_dirty_dir(sbi)
-#define stat_dec_dirty_dir(sbi)
-#define stat_inc_total_hit(sb)
-#define stat_inc_read_hit(sb)
-#define stat_inc_inline_inode(inode)
-#define stat_dec_inline_inode(inode)
-#define stat_inc_seg_type(sbi, curseg)
-#define stat_inc_block_count(sbi, curseg)
-#define stat_inc_seg_count(si, type)
-#define stat_inc_tot_blk_count(si, blks)
-#define stat_inc_data_blk_count(si, blks)
-#define stat_inc_node_blk_count(sbi, blks)
+#define stat_inc_cp_count(si) do { } while (0)
+#define stat_inc_bg_cp_count(si) do { } while (0)
+#define stat_inc_call_count(si) do { } while (0)
+#define stat_inc_bggc_count(si) do { } while (0)
+#define stat_inc_dirty_inode(sbi, type) do { } while (0)
+#define stat_dec_dirty_inode(sbi, type) do { } while (0)
+#define stat_inc_total_hit(sb) do { } while (0)
+#define stat_inc_rbtree_node_hit(sb) do { } while (0)
+#define stat_inc_largest_node_hit(sbi) do { } while (0)
+#define stat_inc_cached_node_hit(sbi) do { } while (0)
+#define stat_inc_inline_xattr(inode) do { } while (0)
+#define stat_dec_inline_xattr(inode) do { } while (0)
+#define stat_inc_inline_inode(inode) do { } while (0)
+#define stat_dec_inline_inode(inode) do { } while (0)
+#define stat_inc_inline_dir(inode) do { } while (0)
+#define stat_dec_inline_dir(inode) do { } while (0)
+#define stat_inc_atomic_write(inode) do { } while (0)
+#define stat_dec_atomic_write(inode) do { } while (0)
+#define stat_update_max_atomic_write(inode) do { } while (0)
+#define stat_inc_volatile_write(inode) do { } while (0)
+#define stat_dec_volatile_write(inode) do { } while (0)
+#define stat_update_max_volatile_write(inode) do { } while (0)
+#define stat_inc_seg_type(sbi, curseg) do { } while (0)
+#define stat_inc_block_count(sbi, curseg) do { } while (0)
+#define stat_inc_inplace_blocks(sbi) do { } while (0)
+#define stat_inc_seg_count(sbi, type, gc_type) do { } while (0)
+#define stat_inc_tot_blk_count(si, blks) do { } while (0)
+#define stat_inc_data_blk_count(sbi, blks, gc_type) do { } while (0)
+#define stat_inc_node_blk_count(sbi, blks, gc_type) do { } while (0)
static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; }
static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { }
-static inline void __init f2fs_create_root_stats(void) { }
+static inline int __init f2fs_create_root_stats(void) { return 0; }
static inline void f2fs_destroy_root_stats(void) { }
#endif
@@ -1513,15 +2679,151 @@ extern const struct address_space_operations f2fs_node_aops;
extern const struct address_space_operations f2fs_meta_aops;
extern const struct inode_operations f2fs_dir_inode_operations;
extern const struct inode_operations f2fs_symlink_inode_operations;
+extern const struct inode_operations f2fs_encrypted_symlink_inode_operations;
extern const struct inode_operations f2fs_special_inode_operations;
+extern struct kmem_cache *inode_entry_slab;
/*
* inline.c
*/
-bool f2fs_may_inline(struct inode *);
-int f2fs_read_inline_data(struct inode *, struct page *);
-int f2fs_convert_inline_data(struct inode *, pgoff_t, struct page *);
-int f2fs_write_inline_data(struct inode *, struct page *, unsigned int);
-void truncate_inline_data(struct inode *, u64);
-bool recover_inline_data(struct inode *, struct page *);
+bool f2fs_may_inline_data(struct inode *inode);
+bool f2fs_may_inline_dentry(struct inode *inode);
+void read_inline_data(struct page *page, struct page *ipage);
+void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from);
+int f2fs_read_inline_data(struct inode *inode, struct page *page);
+int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page);
+int f2fs_convert_inline_inode(struct inode *inode);
+int f2fs_write_inline_data(struct inode *inode, struct page *page);
+bool recover_inline_data(struct inode *inode, struct page *npage);
+struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir,
+ struct fscrypt_name *fname, struct page **res_page);
+int make_empty_inline_dir(struct inode *inode, struct inode *parent,
+ struct page *ipage);
+int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name,
+ const struct qstr *orig_name,
+ struct inode *inode, nid_t ino, umode_t mode);
+void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page,
+ struct inode *dir, struct inode *inode);
+bool f2fs_empty_inline_dir(struct inode *dir);
+int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
+ struct fscrypt_str *fstr);
+int f2fs_inline_data_fiemap(struct inode *inode,
+ struct fiemap_extent_info *fieinfo,
+ __u64 start, __u64 len);
+
+/*
+ * shrinker.c
+ */
+unsigned long f2fs_shrink_count(struct shrinker *shrink,
+ struct shrink_control *sc);
+unsigned long f2fs_shrink_scan(struct shrinker *shrink,
+ struct shrink_control *sc);
+void f2fs_join_shrinker(struct f2fs_sb_info *sbi);
+void f2fs_leave_shrinker(struct f2fs_sb_info *sbi);
+
+/*
+ * extent_cache.c
+ */
+struct rb_entry *__lookup_rb_tree(struct rb_root *root,
+ struct rb_entry *cached_re, unsigned int ofs);
+struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi,
+ struct rb_root *root, struct rb_node **parent,
+ unsigned int ofs);
+struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root,
+ struct rb_entry *cached_re, unsigned int ofs,
+ struct rb_entry **prev_entry, struct rb_entry **next_entry,
+ struct rb_node ***insert_p, struct rb_node **insert_parent,
+ bool force);
+bool __check_rb_tree_consistence(struct f2fs_sb_info *sbi,
+ struct rb_root *root);
+unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink);
+bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext);
+void f2fs_drop_extent_tree(struct inode *inode);
+unsigned int f2fs_destroy_extent_node(struct inode *inode);
+void f2fs_destroy_extent_tree(struct inode *inode);
+bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs,
+ struct extent_info *ei);
+void f2fs_update_extent_cache(struct dnode_of_data *dn);
+void f2fs_update_extent_cache_range(struct dnode_of_data *dn,
+ pgoff_t fofs, block_t blkaddr, unsigned int len);
+void init_extent_cache_info(struct f2fs_sb_info *sbi);
+int __init create_extent_cache(void);
+void destroy_extent_cache(void);
+
+/*
+ * crypto support
+ */
+static inline bool f2fs_encrypted_inode(struct inode *inode)
+{
+ return file_is_encrypt(inode);
+}
+
+static inline void f2fs_set_encrypted_inode(struct inode *inode)
+{
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+ file_set_encrypt(inode);
+#endif
+}
+
+static inline bool f2fs_bio_encrypted(struct bio *bio)
+{
+ return bio->bi_private != NULL;
+}
+
+static inline int f2fs_sb_has_crypto(struct super_block *sb)
+{
+ return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT);
+}
+
+static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb)
+{
+ return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED);
+}
+
+#ifdef CONFIG_BLK_DEV_ZONED
+static inline int get_blkz_type(struct f2fs_sb_info *sbi,
+ struct block_device *bdev, block_t blkaddr)
+{
+ unsigned int zno = blkaddr >> sbi->log_blocks_per_blkz;
+ int i;
+
+ for (i = 0; i < sbi->s_ndevs; i++)
+ if (FDEV(i).bdev == bdev)
+ return FDEV(i).blkz_type[zno];
+ return -EINVAL;
+}
+#endif
+
+static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi)
+{
+ struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev);
+
+ return blk_queue_discard(q) || f2fs_sb_mounted_blkzoned(sbi->sb);
+}
+
+static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt)
+{
+ clear_opt(sbi, ADAPTIVE);
+ clear_opt(sbi, LFS);
+
+ switch (mt) {
+ case F2FS_MOUNT_ADAPTIVE:
+ set_opt(sbi, ADAPTIVE);
+ break;
+ case F2FS_MOUNT_LFS:
+ set_opt(sbi, LFS);
+ break;
+ }
+}
+
+static inline bool f2fs_may_encrypt(struct inode *inode)
+{
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+ mode_t mode = inode->i_mode;
+
+ return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode));
+#else
+ return 0;
+#endif
+}
#endif
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 8e68bb64f835..5e8ea4967ed5 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -20,12 +20,18 @@
#include <linux/uaccess.h>
#include <linux/mount.h>
#include <linux/pagevec.h>
+#include <linux/random.h>
+#include <linux/aio.h>
+#include <linux/uuid.h>
+#include <linux/file.h>
#include "f2fs.h"
#include "node.h"
#include "segment.h"
#include "xattr.h"
#include "acl.h"
+#include "gc.h"
+#include "trace.h"
#include <trace/events/f2fs.h>
static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
@@ -37,22 +43,22 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
struct dnode_of_data dn;
int err;
- f2fs_balance_fs(sbi);
-
sb_start_pagefault(inode->i_sb);
- /* force to convert with normal data indices */
- err = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, page);
- if (err)
- goto out;
+ f2fs_bug_on(sbi, f2fs_has_inline_data(inode));
/* block allocation */
f2fs_lock_op(sbi);
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = f2fs_reserve_block(&dn, page->index);
- f2fs_unlock_op(sbi);
- if (err)
+ if (err) {
+ f2fs_unlock_op(sbi);
goto out;
+ }
+ f2fs_put_dnode(&dn);
+ f2fs_unlock_op(sbi);
+
+ f2fs_balance_fs(sbi, dn.node_changed);
file_update_time(vma->vm_file);
lock_page(page);
@@ -71,20 +77,28 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
goto mapped;
/* page is wholly or partially inside EOF */
- if (((page->index + 1) << PAGE_CACHE_SHIFT) > i_size_read(inode)) {
+ if (((loff_t)(page->index + 1) << PAGE_SHIFT) >
+ i_size_read(inode)) {
unsigned offset;
- offset = i_size_read(inode) & ~PAGE_CACHE_MASK;
- zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+ offset = i_size_read(inode) & ~PAGE_MASK;
+ zero_user_segment(page, offset, PAGE_SIZE);
}
set_page_dirty(page);
- SetPageUptodate(page);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
trace_f2fs_vm_page_mkwrite(page, DATA);
mapped:
/* fill the page */
- f2fs_wait_on_page_writeback(page, DATA);
+ f2fs_wait_on_page_writeback(page, DATA, false);
+
+ /* wait for GCed encrypted page writeback */
+ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
+ f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr);
+
out:
sb_end_pagefault(inode->i_sb);
+ f2fs_update_time(sbi, REQ_TIME);
return block_page_mkwrite_return(err);
}
@@ -92,7 +106,6 @@ static const struct vm_operations_struct f2fs_file_vm_ops = {
.fault = filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = f2fs_vm_page_mkwrite,
- .remap_pages = generic_file_remap_pages,
};
static int get_parent_ino(struct inode *inode, nid_t *pino)
@@ -105,11 +118,6 @@ static int get_parent_ino(struct inode *inode, nid_t *pino)
if (!dentry)
return 0;
- if (update_dent_inode(inode, &dentry->d_name)) {
- dput(dentry);
- return 0;
- }
-
*pino = parent_ino(dentry);
dput(dentry);
return 1;
@@ -122,22 +130,51 @@ static inline bool need_do_checkpoint(struct inode *inode)
if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
need_cp = true;
+ else if (is_sbi_flag_set(sbi, SBI_NEED_CP))
+ need_cp = true;
else if (file_wrong_pino(inode))
need_cp = true;
else if (!space_for_roll_forward(sbi))
need_cp = true;
else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino))
need_cp = true;
- else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi)))
+ else if (test_opt(sbi, FASTBOOT))
+ need_cp = true;
+ else if (sbi->active_logs == 2)
need_cp = true;
return need_cp;
}
-int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
+static bool need_inode_page_update(struct f2fs_sb_info *sbi, nid_t ino)
+{
+ struct page *i = find_get_page(NODE_MAPPING(sbi), ino);
+ bool ret = false;
+ /* But we need to avoid that there are some inode updates */
+ if ((i && PageDirty(i)) || need_inode_block_update(sbi, ino))
+ ret = true;
+ f2fs_put_page(i, 0);
+ return ret;
+}
+
+static void try_to_fix_pino(struct inode *inode)
{
- struct inode *inode = file->f_mapping->host;
struct f2fs_inode_info *fi = F2FS_I(inode);
+ nid_t pino;
+
+ down_write(&fi->i_sem);
+ if (file_wrong_pino(inode) && inode->i_nlink == 1 &&
+ get_parent_ino(inode, &pino)) {
+ f2fs_i_pino_write(inode, pino);
+ file_got_pino(inode);
+ }
+ up_write(&fi->i_sem);
+}
+
+static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
+ int datasync, bool atomic)
+{
+ struct inode *inode = file->f_mapping->host;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
nid_t ino = inode->i_ino;
int ret = 0;
@@ -154,96 +191,100 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trace_f2fs_sync_file_enter(inode);
/* if fdatasync is triggered, let's do in-place-update */
- if (get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks)
- set_inode_flag(fi, FI_NEED_IPU);
+ if (datasync || get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks)
+ set_inode_flag(inode, FI_NEED_IPU);
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
- clear_inode_flag(fi, FI_NEED_IPU);
+ clear_inode_flag(inode, FI_NEED_IPU);
if (ret) {
trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
return ret;
}
+ /* if the inode is dirty, let's recover all the time */
+ if (!f2fs_skip_inode_update(inode, datasync)) {
+ f2fs_write_inode(inode, NULL);
+ goto go_write;
+ }
+
/*
* if there is no written data, don't waste time to write recovery info.
*/
- if (!is_inode_flag_set(fi, FI_APPEND_WRITE) &&
+ if (!is_inode_flag_set(inode, FI_APPEND_WRITE) &&
!exist_written_data(sbi, ino, APPEND_INO)) {
- struct page *i = find_get_page(NODE_MAPPING(sbi), ino);
- /* But we need to avoid that there are some inode updates */
- if ((i && PageDirty(i)) || need_inode_block_update(sbi, ino)) {
- f2fs_put_page(i, 0);
+ /* it may call write_inode just prior to fsync */
+ if (need_inode_page_update(sbi, ino))
goto go_write;
- }
- f2fs_put_page(i, 0);
- if (is_inode_flag_set(fi, FI_UPDATE_WRITE) ||
+ if (is_inode_flag_set(inode, FI_UPDATE_WRITE) ||
exist_written_data(sbi, ino, UPDATE_INO))
goto flush_out;
goto out;
}
go_write:
- /* guarantee free sections for fsync */
- f2fs_balance_fs(sbi);
-
/*
* Both of fdatasync() and fsync() are able to be recovered from
* sudden-power-off.
*/
- down_read(&fi->i_sem);
+ down_read(&F2FS_I(inode)->i_sem);
need_cp = need_do_checkpoint(inode);
- up_read(&fi->i_sem);
+ up_read(&F2FS_I(inode)->i_sem);
if (need_cp) {
- nid_t pino;
-
/* all the dirty node pages should be flushed for POR */
ret = f2fs_sync_fs(inode->i_sb, 1);
- down_write(&fi->i_sem);
- F2FS_I(inode)->xattr_ver = 0;
- if (file_wrong_pino(inode) && inode->i_nlink == 1 &&
- get_parent_ino(inode, &pino)) {
- F2FS_I(inode)->i_pino = pino;
- file_got_pino(inode);
- up_write(&fi->i_sem);
- mark_inode_dirty_sync(inode);
- ret = f2fs_write_inode(inode, NULL);
- if (ret)
- goto out;
- } else {
- up_write(&fi->i_sem);
- }
- } else {
+ /*
+ * We've secured consistency through sync_fs. Following pino
+ * will be used only for fsynced inodes after checkpoint.
+ */
+ try_to_fix_pino(inode);
+ clear_inode_flag(inode, FI_APPEND_WRITE);
+ clear_inode_flag(inode, FI_UPDATE_WRITE);
+ goto out;
+ }
sync_nodes:
- sync_node_pages(sbi, ino, &wbc);
+ ret = fsync_node_pages(sbi, inode, &wbc, atomic);
+ if (ret)
+ goto out;
- if (need_inode_block_update(sbi, ino)) {
- mark_inode_dirty_sync(inode);
- ret = f2fs_write_inode(inode, NULL);
- if (ret)
- goto out;
- goto sync_nodes;
- }
+ /* if cp_error was enabled, we should avoid infinite loop */
+ if (unlikely(f2fs_cp_error(sbi))) {
+ ret = -EIO;
+ goto out;
+ }
- ret = wait_on_node_pages_writeback(sbi, ino);
- if (ret)
- goto out;
+ if (need_inode_block_update(sbi, ino)) {
+ f2fs_mark_inode_dirty_sync(inode, true);
+ f2fs_write_inode(inode, NULL);
+ goto sync_nodes;
+ }
+
+ ret = wait_on_node_pages_writeback(sbi, ino);
+ if (ret)
+ goto out;
- /* once recovery info is written, don't need to tack this */
- remove_dirty_inode(sbi, ino, APPEND_INO);
- clear_inode_flag(fi, FI_APPEND_WRITE);
+ /* once recovery info is written, don't need to tack this */
+ remove_ino_entry(sbi, ino, APPEND_INO);
+ clear_inode_flag(inode, FI_APPEND_WRITE);
flush_out:
- remove_dirty_inode(sbi, ino, UPDATE_INO);
- clear_inode_flag(fi, FI_UPDATE_WRITE);
- ret = f2fs_issue_flush(F2FS_I_SB(inode));
- }
+ remove_ino_entry(sbi, ino, UPDATE_INO);
+ clear_inode_flag(inode, FI_UPDATE_WRITE);
+ if (!atomic)
+ ret = f2fs_issue_flush(sbi);
+ f2fs_update_time(sbi, REQ_TIME);
out:
trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
+ f2fs_trace_ios(NULL, 1);
return ret;
}
+int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
+{
+ return f2fs_do_sync_file(file, start, end, datasync, false);
+}
+
static pgoff_t __get_first_dirty_index(struct address_space *mapping,
pgoff_t pgofs, int whence)
{
@@ -257,7 +298,7 @@ static pgoff_t __get_first_dirty_index(struct address_space *mapping,
pagevec_init(&pvec, 0);
nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs,
PAGECACHE_TAG_DIRTY, 1);
- pgofs = nr_pages ? pvec.pages[0]->index : LONG_MAX;
+ pgofs = nr_pages ? pvec.pages[0]->index : ULONG_MAX;
pagevec_release(&pvec);
return pgofs;
}
@@ -289,45 +330,44 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
loff_t isize;
int err = 0;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
isize = i_size_read(inode);
if (offset >= isize)
goto fail;
/* handle inline data case */
- if (f2fs_has_inline_data(inode)) {
+ if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) {
if (whence == SEEK_HOLE)
data_ofs = isize;
goto found;
}
- pgofs = (pgoff_t)(offset >> PAGE_CACHE_SHIFT);
+ pgofs = (pgoff_t)(offset >> PAGE_SHIFT);
dirty = __get_first_dirty_index(inode->i_mapping, pgofs, whence);
- for (; data_ofs < isize; data_ofs = pgofs << PAGE_CACHE_SHIFT) {
+ for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_SHIFT) {
set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA);
+ err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE);
if (err && err != -ENOENT) {
goto fail;
} else if (err == -ENOENT) {
/* direct node does not exists */
if (whence == SEEK_DATA) {
- pgofs = PGOFS_OF_NEXT_DNODE(pgofs,
- F2FS_I(inode));
+ pgofs = get_next_page_offset(&dn, pgofs);
continue;
} else {
goto found;
}
}
- end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
+ end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
/* find data/hole in dnode block */
for (; dn.ofs_in_node < end_offset;
dn.ofs_in_node++, pgofs++,
- data_ofs = pgofs << PAGE_CACHE_SHIFT) {
+ data_ofs = (loff_t)pgofs << PAGE_SHIFT) {
block_t blkaddr;
blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
@@ -344,10 +384,10 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
found:
if (whence == SEEK_HOLE && data_ofs > isize)
data_ofs = isize;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return vfs_setpos(file, data_ofs, maxbytes);
fail:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return -ENXIO;
}
@@ -374,16 +414,54 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence)
static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
+ struct inode *inode = file_inode(file);
+ int err;
+
+ if (f2fs_encrypted_inode(inode)) {
+ err = fscrypt_get_encryption_info(inode);
+ if (err)
+ return 0;
+ if (!f2fs_encrypted_inode(inode))
+ return -ENOKEY;
+ }
+
+ /* we don't need to use inline_data strictly */
+ err = f2fs_convert_inline_inode(inode);
+ if (err)
+ return err;
+
file_accessed(file);
vma->vm_ops = &f2fs_file_vm_ops;
return 0;
}
+static int f2fs_file_open(struct inode *inode, struct file *filp)
+{
+ int ret = generic_file_open(inode, filp);
+ struct dentry *dir;
+
+ if (!ret && f2fs_encrypted_inode(inode)) {
+ ret = fscrypt_get_encryption_info(inode);
+ if (ret)
+ return -EACCES;
+ if (!fscrypt_has_encryption_key(inode))
+ return -ENOKEY;
+ }
+ dir = dget_parent(file_dentry(filp));
+ if (f2fs_encrypted_inode(d_inode(dir)) &&
+ !fscrypt_has_permitted_context(d_inode(dir), inode)) {
+ dput(dir);
+ return -EPERM;
+ }
+ dput(dir);
+ return ret;
+}
+
int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
{
- int nr_free = 0, ofs = dn->ofs_in_node;
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
struct f2fs_node *raw_node;
+ int nr_free = 0, ofs = dn->ofs_in_node, len = count;
__le32 *addr;
raw_node = F2FS_NODE(dn->node_page);
@@ -394,17 +472,28 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
if (blkaddr == NULL_ADDR)
continue;
- update_extent_cache(NULL_ADDR, dn);
+ dn->data_blkaddr = NULL_ADDR;
+ set_data_blkaddr(dn);
invalidate_blocks(sbi, blkaddr);
+ if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page))
+ clear_inode_flag(dn->inode, FI_FIRST_BLOCK_WRITTEN);
nr_free++;
}
+
if (nr_free) {
+ pgoff_t fofs;
+ /*
+ * once we invalidate valid blkaddr in range [ofs, ofs + count],
+ * we will invalidate all blkaddr in the whole range.
+ */
+ fofs = start_bidx_of_node(ofs_of_node(dn->node_page),
+ dn->inode) + ofs;
+ f2fs_update_extent_cache_range(dn, fofs, 0, len);
dec_valid_block_count(sbi, dn->inode, nr_free);
- set_page_dirty(dn->node_page);
- sync_inode_page(dn);
}
dn->ofs_in_node = ofs;
+ f2fs_update_time(sbi, REQ_TIME);
trace_f2fs_truncate_data_blocks_range(dn->inode, dn->nid,
dn->ofs_in_node, nr_free);
return nr_free;
@@ -415,32 +504,36 @@ void truncate_data_blocks(struct dnode_of_data *dn)
truncate_data_blocks_range(dn, ADDRS_PER_BLOCK);
}
-static void truncate_partial_data_page(struct inode *inode, u64 from)
+static int truncate_partial_data_page(struct inode *inode, u64 from,
+ bool cache_only)
{
- unsigned offset = from & (PAGE_CACHE_SIZE - 1);
+ unsigned offset = from & (PAGE_SIZE - 1);
+ pgoff_t index = from >> PAGE_SHIFT;
+ struct address_space *mapping = inode->i_mapping;
struct page *page;
- if (f2fs_has_inline_data(inode))
- return truncate_inline_data(inode, from);
+ if (!offset && !cache_only)
+ return 0;
- if (!offset)
- return;
+ if (cache_only) {
+ page = find_lock_page(mapping, index);
+ if (page && PageUptodate(page))
+ goto truncate_out;
+ f2fs_put_page(page, 1);
+ return 0;
+ }
- page = find_data_page(inode, from >> PAGE_CACHE_SHIFT, false);
+ page = get_lock_data_page(inode, index, true);
if (IS_ERR(page))
- return;
-
- lock_page(page);
- if (unlikely(!PageUptodate(page) ||
- page->mapping != inode->i_mapping))
- goto out;
-
- f2fs_wait_on_page_writeback(page, DATA);
- zero_user(page, offset, PAGE_CACHE_SIZE - offset);
- set_page_dirty(page);
-
-out:
+ return PTR_ERR(page) == -ENOENT ? 0 : PTR_ERR(page);
+truncate_out:
+ f2fs_wait_on_page_writeback(page, DATA, true);
+ zero_user(page, offset, PAGE_SIZE - offset);
+ if (!cache_only || !f2fs_encrypted_inode(inode) ||
+ !S_ISREG(inode->i_mode))
+ set_page_dirty(page);
f2fs_put_page(page, 1);
+ return 0;
}
int truncate_blocks(struct inode *inode, u64 from, bool lock)
@@ -450,30 +543,41 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock)
struct dnode_of_data dn;
pgoff_t free_from;
int count = 0, err = 0;
+ struct page *ipage;
+ bool truncate_page = false;
trace_f2fs_truncate_blocks_enter(inode, from);
- if (f2fs_has_inline_data(inode))
- goto done;
+ free_from = (pgoff_t)F2FS_BYTES_TO_BLK(from + blocksize - 1);
- free_from = (pgoff_t)
- ((from + blocksize - 1) >> (sbi->log_blocksize));
+ if (free_from >= sbi->max_file_blocks)
+ goto free_partial;
if (lock)
f2fs_lock_op(sbi);
- set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE);
+ ipage = get_node_page(sbi, inode->i_ino);
+ if (IS_ERR(ipage)) {
+ err = PTR_ERR(ipage);
+ goto out;
+ }
+
+ if (f2fs_has_inline_data(inode)) {
+ truncate_inline_inode(inode, ipage, from);
+ f2fs_put_page(ipage, 1);
+ truncate_page = true;
+ goto out;
+ }
+
+ set_new_dnode(&dn, inode, ipage, NULL, 0);
+ err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE_RA);
if (err) {
if (err == -ENOENT)
goto free_next;
- if (lock)
- f2fs_unlock_op(sbi);
- trace_f2fs_truncate_blocks_exit(inode, err);
- return err;
+ goto out;
}
- count = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
+ count = ADDRS_PER_PAGE(dn.node_page, inode);
count -= dn.ofs_in_node;
f2fs_bug_on(sbi, count < 0);
@@ -486,34 +590,54 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock)
f2fs_put_dnode(&dn);
free_next:
err = truncate_inode_blocks(inode, free_from);
+out:
if (lock)
f2fs_unlock_op(sbi);
-done:
+free_partial:
/* lastly zero out the first data page */
- truncate_partial_data_page(inode, from);
+ if (!err)
+ err = truncate_partial_data_page(inode, from, truncate_page);
trace_f2fs_truncate_blocks_exit(inode, err);
return err;
}
-void f2fs_truncate(struct inode *inode)
+int f2fs_truncate(struct inode *inode)
{
+ int err;
+
if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
S_ISLNK(inode->i_mode)))
- return;
+ return 0;
trace_f2fs_truncate(inode);
- if (!truncate_blocks(inode, i_size_read(inode), true)) {
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- mark_inode_dirty(inode);
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (time_to_inject(F2FS_I_SB(inode), FAULT_TRUNCATE)) {
+ f2fs_show_injection_info(FAULT_TRUNCATE);
+ return -EIO;
}
+#endif
+ /* we should check inline_data size */
+ if (!f2fs_may_inline_data(inode)) {
+ err = f2fs_convert_inline_inode(inode);
+ if (err)
+ return err;
+ }
+
+ err = truncate_blocks(inode, i_size_read(inode), true);
+ if (err)
+ return err;
+
+ inode->i_mtime = inode->i_ctime = current_time(inode);
+ f2fs_mark_inode_dirty_sync(inode, false);
+ return 0;
}
int f2fs_getattr(struct vfsmount *mnt,
struct dentry *dentry, struct kstat *stat)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
generic_fillattr(inode, stat);
stat->blocks <<= 3;
return 0;
@@ -522,7 +646,6 @@ int f2fs_getattr(struct vfsmount *mnt,
#ifdef CONFIG_F2FS_FS_POSIX_ACL
static void __setattr_copy(struct inode *inode, const struct iattr *attr)
{
- struct f2fs_inode_info *fi = F2FS_I(inode);
unsigned int ia_valid = attr->ia_valid;
if (ia_valid & ATTR_UID)
@@ -543,7 +666,7 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr)
if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
mode &= ~S_ISGID;
- set_acl_inode(fi, mode);
+ set_acl_inode(inode, mode);
}
}
#else
@@ -552,43 +675,59 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr)
int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
{
- struct inode *inode = dentry->d_inode;
- struct f2fs_inode_info *fi = F2FS_I(inode);
+ struct inode *inode = d_inode(dentry);
int err;
+ bool size_changed = false;
err = inode_change_ok(inode, attr);
if (err)
return err;
if (attr->ia_valid & ATTR_SIZE) {
- err = f2fs_convert_inline_data(inode, attr->ia_size, NULL);
- if (err)
- return err;
+ if (f2fs_encrypted_inode(inode) &&
+ fscrypt_get_encryption_info(inode))
+ return -EACCES;
- if (attr->ia_size != i_size_read(inode)) {
+ if (attr->ia_size <= i_size_read(inode)) {
truncate_setsize(inode, attr->ia_size);
- f2fs_truncate(inode);
- f2fs_balance_fs(F2FS_I_SB(inode));
+ err = f2fs_truncate(inode);
+ if (err)
+ return err;
} else {
/*
- * giving a chance to truncate blocks past EOF which
- * are fallocated with FALLOC_FL_KEEP_SIZE.
+ * do not trim all blocks after i_size if target size is
+ * larger than i_size.
*/
- f2fs_truncate(inode);
+ truncate_setsize(inode, attr->ia_size);
+
+ /* should convert inline inode here */
+ if (!f2fs_may_inline_data(inode)) {
+ err = f2fs_convert_inline_inode(inode);
+ if (err)
+ return err;
+ }
+ inode->i_mtime = inode->i_ctime = current_time(inode);
}
+
+ size_changed = true;
}
__setattr_copy(inode, attr);
if (attr->ia_valid & ATTR_MODE) {
err = posix_acl_chmod(inode, get_inode_mode(inode));
- if (err || is_inode_flag_set(fi, FI_ACL_MODE)) {
- inode->i_mode = fi->i_acl_mode;
- clear_inode_flag(fi, FI_ACL_MODE);
+ if (err || is_inode_flag_set(inode, FI_ACL_MODE)) {
+ inode->i_mode = F2FS_I(inode)->i_acl_mode;
+ clear_inode_flag(inode, FI_ACL_MODE);
}
}
- mark_inode_dirty(inode);
+ /* file size may changed here */
+ f2fs_mark_inode_dirty_sync(inode, size_changed);
+
+ /* inode change will produce dirty node pages flushed by checkpoint */
+ f2fs_balance_fs(F2FS_I_SB(inode), true);
+
return err;
}
@@ -606,48 +745,58 @@ const struct inode_operations f2fs_file_inode_operations = {
.fiemap = f2fs_fiemap,
};
-static void fill_zero(struct inode *inode, pgoff_t index,
+static int fill_zero(struct inode *inode, pgoff_t index,
loff_t start, loff_t len)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct page *page;
if (!len)
- return;
+ return 0;
- f2fs_balance_fs(sbi);
+ f2fs_balance_fs(sbi, true);
f2fs_lock_op(sbi);
page = get_new_data_page(inode, NULL, index, false);
f2fs_unlock_op(sbi);
- if (!IS_ERR(page)) {
- f2fs_wait_on_page_writeback(page, DATA);
- zero_user(page, start, len);
- set_page_dirty(page);
- f2fs_put_page(page, 1);
- }
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+
+ f2fs_wait_on_page_writeback(page, DATA, true);
+ zero_user(page, start, len);
+ set_page_dirty(page);
+ f2fs_put_page(page, 1);
+ return 0;
}
int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
{
- pgoff_t index;
int err;
- for (index = pg_start; index < pg_end; index++) {
+ while (pg_start < pg_end) {
struct dnode_of_data dn;
+ pgoff_t end_offset, count;
set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
+ err = get_dnode_of_data(&dn, pg_start, LOOKUP_NODE);
if (err) {
- if (err == -ENOENT)
+ if (err == -ENOENT) {
+ pg_start++;
continue;
+ }
return err;
}
- if (dn.data_blkaddr != NULL_ADDR)
- truncate_data_blocks_range(&dn, 1);
+ end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
+ count = min(end_offset - dn.ofs_in_node, pg_end - pg_start);
+
+ f2fs_bug_on(F2FS_I_SB(inode), count == 0 || count > end_offset);
+
+ truncate_data_blocks_range(&dn, count);
f2fs_put_dnode(&dn);
+
+ pg_start += count;
}
return 0;
}
@@ -656,44 +805,45 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
pgoff_t pg_start, pg_end;
loff_t off_start, off_end;
- int ret = 0;
-
- if (!S_ISREG(inode->i_mode))
- return -EOPNOTSUPP;
-
- /* skip punching hole beyond i_size */
- if (offset >= inode->i_size)
- return ret;
+ int ret;
- ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, NULL);
+ ret = f2fs_convert_inline_inode(inode);
if (ret)
return ret;
- pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
- pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
+ pg_start = ((unsigned long long) offset) >> PAGE_SHIFT;
+ pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT;
- off_start = offset & (PAGE_CACHE_SIZE - 1);
- off_end = (offset + len) & (PAGE_CACHE_SIZE - 1);
+ off_start = offset & (PAGE_SIZE - 1);
+ off_end = (offset + len) & (PAGE_SIZE - 1);
if (pg_start == pg_end) {
- fill_zero(inode, pg_start, off_start,
+ ret = fill_zero(inode, pg_start, off_start,
off_end - off_start);
+ if (ret)
+ return ret;
} else {
- if (off_start)
- fill_zero(inode, pg_start++, off_start,
- PAGE_CACHE_SIZE - off_start);
- if (off_end)
- fill_zero(inode, pg_end, 0, off_end);
+ if (off_start) {
+ ret = fill_zero(inode, pg_start++, off_start,
+ PAGE_SIZE - off_start);
+ if (ret)
+ return ret;
+ }
+ if (off_end) {
+ ret = fill_zero(inode, pg_end, 0, off_end);
+ if (ret)
+ return ret;
+ }
if (pg_start < pg_end) {
struct address_space *mapping = inode->i_mapping;
loff_t blk_start, blk_end;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- f2fs_balance_fs(sbi);
+ f2fs_balance_fs(sbi, true);
- blk_start = pg_start << PAGE_CACHE_SHIFT;
- blk_end = pg_end << PAGE_CACHE_SHIFT;
+ blk_start = (loff_t)pg_start << PAGE_SHIFT;
+ blk_end = (loff_t)pg_end << PAGE_SHIFT;
truncate_inode_pages_range(mapping, blk_start,
blk_end - 1);
@@ -706,92 +856,599 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
return ret;
}
-static int expand_inode_data(struct inode *inode, loff_t offset,
- loff_t len, int mode)
+static int __read_out_blkaddrs(struct inode *inode, block_t *blkaddr,
+ int *do_replace, pgoff_t off, pgoff_t len)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct dnode_of_data dn;
+ int ret, done, i;
+
+next_dnode:
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ ret = get_dnode_of_data(&dn, off, LOOKUP_NODE_RA);
+ if (ret && ret != -ENOENT) {
+ return ret;
+ } else if (ret == -ENOENT) {
+ if (dn.max_level == 0)
+ return -ENOENT;
+ done = min((pgoff_t)ADDRS_PER_BLOCK - dn.ofs_in_node, len);
+ blkaddr += done;
+ do_replace += done;
+ goto next;
+ }
+
+ done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, inode) -
+ dn.ofs_in_node, len);
+ for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) {
+ *blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
+ if (!is_checkpointed_data(sbi, *blkaddr)) {
+
+ if (test_opt(sbi, LFS)) {
+ f2fs_put_dnode(&dn);
+ return -ENOTSUPP;
+ }
+
+ /* do not invalidate this block address */
+ f2fs_update_data_blkaddr(&dn, NULL_ADDR);
+ *do_replace = 1;
+ }
+ }
+ f2fs_put_dnode(&dn);
+next:
+ len -= done;
+ off += done;
+ if (len)
+ goto next_dnode;
+ return 0;
+}
+
+static int __roll_back_blkaddrs(struct inode *inode, block_t *blkaddr,
+ int *do_replace, pgoff_t off, int len)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct dnode_of_data dn;
+ int ret, i;
+
+ for (i = 0; i < len; i++, do_replace++, blkaddr++) {
+ if (*do_replace == 0)
+ continue;
+
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ ret = get_dnode_of_data(&dn, off + i, LOOKUP_NODE_RA);
+ if (ret) {
+ dec_valid_block_count(sbi, inode, 1);
+ invalidate_blocks(sbi, *blkaddr);
+ } else {
+ f2fs_update_data_blkaddr(&dn, *blkaddr);
+ }
+ f2fs_put_dnode(&dn);
+ }
+ return 0;
+}
+
+static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode,
+ block_t *blkaddr, int *do_replace,
+ pgoff_t src, pgoff_t dst, pgoff_t len, bool full)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(src_inode);
+ pgoff_t i = 0;
+ int ret;
+
+ while (i < len) {
+ if (blkaddr[i] == NULL_ADDR && !full) {
+ i++;
+ continue;
+ }
+
+ if (do_replace[i] || blkaddr[i] == NULL_ADDR) {
+ struct dnode_of_data dn;
+ struct node_info ni;
+ size_t new_size;
+ pgoff_t ilen;
+
+ set_new_dnode(&dn, dst_inode, NULL, NULL, 0);
+ ret = get_dnode_of_data(&dn, dst + i, ALLOC_NODE);
+ if (ret)
+ return ret;
+
+ get_node_info(sbi, dn.nid, &ni);
+ ilen = min((pgoff_t)
+ ADDRS_PER_PAGE(dn.node_page, dst_inode) -
+ dn.ofs_in_node, len - i);
+ do {
+ dn.data_blkaddr = datablock_addr(dn.node_page,
+ dn.ofs_in_node);
+ truncate_data_blocks_range(&dn, 1);
+
+ if (do_replace[i]) {
+ f2fs_i_blocks_write(src_inode,
+ 1, false);
+ f2fs_i_blocks_write(dst_inode,
+ 1, true);
+ f2fs_replace_block(sbi, &dn, dn.data_blkaddr,
+ blkaddr[i], ni.version, true, false);
+
+ do_replace[i] = 0;
+ }
+ dn.ofs_in_node++;
+ i++;
+ new_size = (dst + i) << PAGE_SHIFT;
+ if (dst_inode->i_size < new_size)
+ f2fs_i_size_write(dst_inode, new_size);
+ } while (--ilen && (do_replace[i] || blkaddr[i] == NULL_ADDR));
+
+ f2fs_put_dnode(&dn);
+ } else {
+ struct page *psrc, *pdst;
+
+ psrc = get_lock_data_page(src_inode, src + i, true);
+ if (IS_ERR(psrc))
+ return PTR_ERR(psrc);
+ pdst = get_new_data_page(dst_inode, NULL, dst + i,
+ true);
+ if (IS_ERR(pdst)) {
+ f2fs_put_page(psrc, 1);
+ return PTR_ERR(pdst);
+ }
+ f2fs_copy_page(psrc, pdst);
+ set_page_dirty(pdst);
+ f2fs_put_page(pdst, 1);
+ f2fs_put_page(psrc, 1);
+
+ ret = truncate_hole(src_inode, src + i, src + i + 1);
+ if (ret)
+ return ret;
+ i++;
+ }
+ }
+ return 0;
+}
+
+static int __exchange_data_block(struct inode *src_inode,
+ struct inode *dst_inode, pgoff_t src, pgoff_t dst,
+ pgoff_t len, bool full)
+{
+ block_t *src_blkaddr;
+ int *do_replace;
+ pgoff_t olen;
+ int ret;
+
+ while (len) {
+ olen = min((pgoff_t)4 * ADDRS_PER_BLOCK, len);
+
+ src_blkaddr = f2fs_kvzalloc(sizeof(block_t) * olen, GFP_KERNEL);
+ if (!src_blkaddr)
+ return -ENOMEM;
+
+ do_replace = f2fs_kvzalloc(sizeof(int) * olen, GFP_KERNEL);
+ if (!do_replace) {
+ kvfree(src_blkaddr);
+ return -ENOMEM;
+ }
+
+ ret = __read_out_blkaddrs(src_inode, src_blkaddr,
+ do_replace, src, olen);
+ if (ret)
+ goto roll_back;
+
+ ret = __clone_blkaddrs(src_inode, dst_inode, src_blkaddr,
+ do_replace, src, dst, olen, full);
+ if (ret)
+ goto roll_back;
+
+ src += olen;
+ dst += olen;
+ len -= olen;
+
+ kvfree(src_blkaddr);
+ kvfree(do_replace);
+ }
+ return 0;
+
+roll_back:
+ __roll_back_blkaddrs(src_inode, src_blkaddr, do_replace, src, len);
+ kvfree(src_blkaddr);
+ kvfree(do_replace);
+ return ret;
+}
+
+static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE;
+ int ret;
+
+ f2fs_balance_fs(sbi, true);
+ f2fs_lock_op(sbi);
+
+ f2fs_drop_extent_tree(inode);
+
+ ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true);
+ f2fs_unlock_op(sbi);
+ return ret;
+}
+
+static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
+{
+ pgoff_t pg_start, pg_end;
+ loff_t new_size;
+ int ret;
+
+ if (offset + len >= i_size_read(inode))
+ return -EINVAL;
+
+ /* collapse range should be aligned to block size of f2fs. */
+ if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1))
+ return -EINVAL;
+
+ ret = f2fs_convert_inline_inode(inode);
+ if (ret)
+ return ret;
+
+ pg_start = offset >> PAGE_SHIFT;
+ pg_end = (offset + len) >> PAGE_SHIFT;
+
+ /* write out all dirty pages from offset */
+ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
+ if (ret)
+ return ret;
+
+ truncate_pagecache(inode, offset);
+
+ ret = f2fs_do_collapse(inode, pg_start, pg_end);
+ if (ret)
+ return ret;
+
+ /* write out all moved pages, if possible */
+ filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
+ truncate_pagecache(inode, offset);
+
+ new_size = i_size_read(inode) - len;
+ truncate_pagecache(inode, new_size);
+
+ ret = truncate_blocks(inode, new_size, true);
+ if (!ret)
+ f2fs_i_size_write(inode, new_size);
+
+ return ret;
+}
+
+static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start,
+ pgoff_t end)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
+ pgoff_t index = start;
+ unsigned int ofs_in_node = dn->ofs_in_node;
+ blkcnt_t count = 0;
+ int ret;
+
+ for (; index < end; index++, dn->ofs_in_node++) {
+ if (datablock_addr(dn->node_page, dn->ofs_in_node) == NULL_ADDR)
+ count++;
+ }
+
+ dn->ofs_in_node = ofs_in_node;
+ ret = reserve_new_blocks(dn, count);
+ if (ret)
+ return ret;
+
+ dn->ofs_in_node = ofs_in_node;
+ for (index = start; index < end; index++, dn->ofs_in_node++) {
+ dn->data_blkaddr =
+ datablock_addr(dn->node_page, dn->ofs_in_node);
+ /*
+ * reserve_new_blocks will not guarantee entire block
+ * allocation.
+ */
+ if (dn->data_blkaddr == NULL_ADDR) {
+ ret = -ENOSPC;
+ break;
+ }
+ if (dn->data_blkaddr != NEW_ADDR) {
+ invalidate_blocks(sbi, dn->data_blkaddr);
+ dn->data_blkaddr = NEW_ADDR;
+ set_data_blkaddr(dn);
+ }
+ }
+
+ f2fs_update_extent_cache_range(dn, start, 0, index - start);
+
+ return ret;
+}
+
+static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
+ int mode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct address_space *mapping = inode->i_mapping;
pgoff_t index, pg_start, pg_end;
loff_t new_size = i_size_read(inode);
loff_t off_start, off_end;
int ret = 0;
- f2fs_balance_fs(sbi);
-
ret = inode_newsize_ok(inode, (len + offset));
if (ret)
return ret;
- ret = f2fs_convert_inline_data(inode, offset + len, NULL);
+ ret = f2fs_convert_inline_inode(inode);
if (ret)
return ret;
- pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
- pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
-
- off_start = offset & (PAGE_CACHE_SIZE - 1);
- off_end = (offset + len) & (PAGE_CACHE_SIZE - 1);
+ ret = filemap_write_and_wait_range(mapping, offset, offset + len - 1);
+ if (ret)
+ return ret;
- f2fs_lock_op(sbi);
+ truncate_pagecache_range(inode, offset, offset + len - 1);
- for (index = pg_start; index <= pg_end; index++) {
- struct dnode_of_data dn;
+ pg_start = ((unsigned long long) offset) >> PAGE_SHIFT;
+ pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT;
- if (index == pg_end && !off_end)
- goto noalloc;
+ off_start = offset & (PAGE_SIZE - 1);
+ off_end = (offset + len) & (PAGE_SIZE - 1);
- set_new_dnode(&dn, inode, NULL, NULL, 0);
- ret = f2fs_reserve_block(&dn, index);
+ if (pg_start == pg_end) {
+ ret = fill_zero(inode, pg_start, off_start,
+ off_end - off_start);
if (ret)
- break;
-noalloc:
- if (pg_start == pg_end)
- new_size = offset + len;
- else if (index == pg_start && off_start)
- new_size = (index + 1) << PAGE_CACHE_SHIFT;
- else if (index == pg_end)
- new_size = (index << PAGE_CACHE_SHIFT) + off_end;
- else
- new_size += PAGE_CACHE_SIZE;
- }
-
- if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- i_size_read(inode) < new_size) {
- i_size_write(inode, new_size);
- mark_inode_dirty(inode);
- update_inode_page(inode);
+ return ret;
+
+ new_size = max_t(loff_t, new_size, offset + len);
+ } else {
+ if (off_start) {
+ ret = fill_zero(inode, pg_start++, off_start,
+ PAGE_SIZE - off_start);
+ if (ret)
+ return ret;
+
+ new_size = max_t(loff_t, new_size,
+ (loff_t)pg_start << PAGE_SHIFT);
+ }
+
+ for (index = pg_start; index < pg_end;) {
+ struct dnode_of_data dn;
+ unsigned int end_offset;
+ pgoff_t end;
+
+ f2fs_lock_op(sbi);
+
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ ret = get_dnode_of_data(&dn, index, ALLOC_NODE);
+ if (ret) {
+ f2fs_unlock_op(sbi);
+ goto out;
+ }
+
+ end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
+ end = min(pg_end, end_offset - dn.ofs_in_node + index);
+
+ ret = f2fs_do_zero_range(&dn, index, end);
+ f2fs_put_dnode(&dn);
+ f2fs_unlock_op(sbi);
+
+ f2fs_balance_fs(sbi, dn.node_changed);
+
+ if (ret)
+ goto out;
+
+ index = end;
+ new_size = max_t(loff_t, new_size,
+ (loff_t)index << PAGE_SHIFT);
+ }
+
+ if (off_end) {
+ ret = fill_zero(inode, pg_end, 0, off_end);
+ if (ret)
+ goto out;
+
+ new_size = max_t(loff_t, new_size, offset + len);
+ }
}
- f2fs_unlock_op(sbi);
+
+out:
+ if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size)
+ f2fs_i_size_write(inode, new_size);
return ret;
}
+static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ pgoff_t nr, pg_start, pg_end, delta, idx;
+ loff_t new_size;
+ int ret = 0;
+
+ new_size = i_size_read(inode) + len;
+ ret = inode_newsize_ok(inode, new_size);
+ if (ret)
+ return ret;
+
+ if (offset >= i_size_read(inode))
+ return -EINVAL;
+
+ /* insert range should be aligned to block size of f2fs. */
+ if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1))
+ return -EINVAL;
+
+ ret = f2fs_convert_inline_inode(inode);
+ if (ret)
+ return ret;
+
+ f2fs_balance_fs(sbi, true);
+
+ ret = truncate_blocks(inode, i_size_read(inode), true);
+ if (ret)
+ return ret;
+
+ /* write out all dirty pages from offset */
+ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
+ if (ret)
+ return ret;
+
+ truncate_pagecache(inode, offset);
+
+ pg_start = offset >> PAGE_SHIFT;
+ pg_end = (offset + len) >> PAGE_SHIFT;
+ delta = pg_end - pg_start;
+ idx = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE;
+
+ while (!ret && idx > pg_start) {
+ nr = idx - pg_start;
+ if (nr > delta)
+ nr = delta;
+ idx -= nr;
+
+ f2fs_lock_op(sbi);
+ f2fs_drop_extent_tree(inode);
+
+ ret = __exchange_data_block(inode, inode, idx,
+ idx + delta, nr, false);
+ f2fs_unlock_op(sbi);
+ }
+
+ /* write out all moved pages, if possible */
+ filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
+ truncate_pagecache(inode, offset);
+
+ if (!ret)
+ f2fs_i_size_write(inode, new_size);
+ return ret;
+}
+
+static int expand_inode_data(struct inode *inode, loff_t offset,
+ loff_t len, int mode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_map_blocks map = { .m_next_pgofs = NULL };
+ pgoff_t pg_end;
+ loff_t new_size = i_size_read(inode);
+ loff_t off_end;
+ int err;
+
+ err = inode_newsize_ok(inode, (len + offset));
+ if (err)
+ return err;
+
+ err = f2fs_convert_inline_inode(inode);
+ if (err)
+ return err;
+
+ f2fs_balance_fs(sbi, true);
+
+ pg_end = ((unsigned long long)offset + len) >> PAGE_SHIFT;
+ off_end = (offset + len) & (PAGE_SIZE - 1);
+
+ map.m_lblk = ((unsigned long long)offset) >> PAGE_SHIFT;
+ map.m_len = pg_end - map.m_lblk;
+ if (off_end)
+ map.m_len++;
+
+ err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
+ if (err) {
+ pgoff_t last_off;
+
+ if (!map.m_len)
+ return err;
+
+ last_off = map.m_lblk + map.m_len - 1;
+
+ /* update new size to the failed position */
+ new_size = (last_off == pg_end) ? offset + len:
+ (loff_t)(last_off + 1) << PAGE_SHIFT;
+ } else {
+ new_size = ((loff_t)pg_end << PAGE_SHIFT) + off_end;
+ }
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size)
+ f2fs_i_size_write(inode, new_size);
+
+ return err;
+}
+
+#ifndef FALLOC_FL_COLLAPSE_RANGE
+#define FALLOC_FL_COLLAPSE_RANGE 0X08
+#endif
+#ifndef FALLOC_FL_ZERO_RANGE
+#define FALLOC_FL_ZERO_RANGE 0X10
+#endif
+#ifndef FALLOC_FL_INSERT_RANGE
+#define FALLOC_FL_INSERT_RANGE 0X20
+#endif
+
static long f2fs_fallocate(struct file *file, int mode,
loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
- long ret;
+ long ret = 0;
+
+ /* f2fs only support ->fallocate for regular file */
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+
+ if (f2fs_encrypted_inode(inode) &&
+ (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
+ return -EOPNOTSUPP;
- if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+ FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
+ FALLOC_FL_INSERT_RANGE))
return -EOPNOTSUPP;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
+
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ if (offset >= inode->i_size)
+ goto out;
- if (mode & FALLOC_FL_PUNCH_HOLE)
ret = punch_hole(inode, offset, len);
- else
+ } else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
+ ret = f2fs_collapse_range(inode, offset, len);
+ } else if (mode & FALLOC_FL_ZERO_RANGE) {
+ ret = f2fs_zero_range(inode, offset, len, mode);
+ } else if (mode & FALLOC_FL_INSERT_RANGE) {
+ ret = f2fs_insert_range(inode, offset, len);
+ } else {
ret = expand_inode_data(inode, offset, len, mode);
+ }
if (!ret) {
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- mark_inode_dirty(inode);
+ inode->i_mtime = inode->i_ctime = current_time(inode);
+ f2fs_mark_inode_dirty_sync(inode, false);
+ if (mode & FALLOC_FL_KEEP_SIZE)
+ file_set_keep_isize(inode);
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
}
- mutex_unlock(&inode->i_mutex);
+out:
+ inode_unlock(inode);
trace_f2fs_fallocate(inode, mode, offset, len, ret);
return ret;
}
+static int f2fs_release_file(struct inode *inode, struct file *filp)
+{
+ /*
+ * f2fs_relase_file is called at every close calls. So we should
+ * not drop any inmemory pages by close called by other process.
+ */
+ if (!(filp->f_mode & FMODE_WRITE) ||
+ atomic_read(&inode->i_writecount) != 1)
+ return 0;
+
+ /* some remained atomic pages should discarded */
+ if (f2fs_is_atomic_file(inode))
+ drop_inmem_pages(inode);
+ if (f2fs_is_volatile_file(inode)) {
+ clear_inode_flag(inode, FI_VOLATILE_FILE);
+ stat_dec_volatile_write(inode);
+ set_inode_flag(inode, FI_DROP_CACHE);
+ filemap_fdatawrite(inode->i_mapping);
+ clear_inode_flag(inode, FI_DROP_CACHE);
+ }
+ return 0;
+}
+
#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL))
#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL)
@@ -817,33 +1474,29 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
struct f2fs_inode_info *fi = F2FS_I(inode);
- unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE;
+ unsigned int flags;
unsigned int oldflags;
int ret;
+ if (!inode_owner_or_capable(inode))
+ return -EACCES;
+
+ if (get_user(flags, (int __user *)arg))
+ return -EFAULT;
+
ret = mnt_want_write_file(filp);
if (ret)
return ret;
- if (!inode_owner_or_capable(inode)) {
- ret = -EACCES;
- goto out;
- }
-
- if (get_user(flags, (int __user *)arg)) {
- ret = -EFAULT;
- goto out;
- }
+ inode_lock(inode);
flags = f2fs_mask_flags(inode->i_mode, flags);
- mutex_lock(&inode->i_mutex);
-
oldflags = fi->i_flags;
if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
if (!capable(CAP_LINUX_IMMUTABLE)) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
ret = -EPERM;
goto out;
}
@@ -852,29 +1505,70 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
flags = flags & FS_FL_USER_MODIFIABLE;
flags |= oldflags & ~FS_FL_USER_MODIFIABLE;
fi->i_flags = flags;
- mutex_unlock(&inode->i_mutex);
+ inode->i_ctime = current_time(inode);
f2fs_set_inode_flags(inode);
- inode->i_ctime = CURRENT_TIME;
- mark_inode_dirty(inode);
+
+ inode_unlock(inode);
out:
mnt_drop_write_file(filp);
return ret;
}
+static int f2fs_ioc_getversion(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+
+ return put_user(inode->i_generation, (int __user *)arg);
+}
+
static int f2fs_ioc_start_atomic_write(struct file *filp)
{
struct inode *inode = file_inode(filp);
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ int ret;
if (!inode_owner_or_capable(inode))
return -EACCES;
- f2fs_balance_fs(sbi);
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
- set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+ inode_lock(inode);
- return f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, NULL);
+ if (f2fs_is_atomic_file(inode))
+ goto out;
+
+ ret = f2fs_convert_inline_inode(inode);
+ if (ret)
+ goto out;
+
+ set_inode_flag(inode, FI_ATOMIC_FILE);
+ set_inode_flag(inode, FI_HOT_DATA);
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
+
+ if (!get_dirty_pages(inode))
+ goto inc_stat;
+
+ f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING,
+ "Unexpected flush for atomic writes: ino=%lu, npages=%u",
+ inode->i_ino, get_dirty_pages(inode));
+ ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
+ if (ret) {
+ clear_inode_flag(inode, FI_ATOMIC_FILE);
+ goto out;
+ }
+
+inc_stat:
+ stat_inc_atomic_write(inode);
+ stat_update_max_atomic_write(inode);
+out:
+ inode_unlock(inode);
+ mnt_drop_write_file(filp);
+ return ret;
}
static int f2fs_ioc_commit_atomic_write(struct file *filp)
@@ -885,30 +1579,173 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
if (!inode_owner_or_capable(inode))
return -EACCES;
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ inode_lock(inode);
+
if (f2fs_is_volatile_file(inode))
- return 0;
+ goto err_out;
+
+ if (f2fs_is_atomic_file(inode)) {
+ ret = commit_inmem_pages(inode);
+ if (ret)
+ goto err_out;
+
+ ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true);
+ if (!ret) {
+ clear_inode_flag(inode, FI_ATOMIC_FILE);
+ stat_dec_atomic_write(inode);
+ }
+ } else {
+ ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true);
+ }
+err_out:
+ inode_unlock(inode);
+ mnt_drop_write_file(filp);
+ return ret;
+}
+
+static int f2fs_ioc_start_volatile_write(struct file *filp)
+{
+ struct inode *inode = file_inode(filp);
+ int ret;
+
+ if (!inode_owner_or_capable(inode))
+ return -EACCES;
+
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
ret = mnt_want_write_file(filp);
if (ret)
return ret;
- if (f2fs_is_atomic_file(inode))
- commit_inmem_pages(inode, false);
+ inode_lock(inode);
+
+ if (f2fs_is_volatile_file(inode))
+ goto out;
+
+ ret = f2fs_convert_inline_inode(inode);
+ if (ret)
+ goto out;
+
+ stat_inc_volatile_write(inode);
+ stat_update_max_volatile_write(inode);
- ret = f2fs_sync_file(filp, 0, LONG_MAX, 0);
+ set_inode_flag(inode, FI_VOLATILE_FILE);
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
+out:
+ inode_unlock(inode);
mnt_drop_write_file(filp);
return ret;
}
-static int f2fs_ioc_start_volatile_write(struct file *filp)
+static int f2fs_ioc_release_volatile_write(struct file *filp)
{
struct inode *inode = file_inode(filp);
+ int ret;
if (!inode_owner_or_capable(inode))
return -EACCES;
- set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
- return 0;
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ inode_lock(inode);
+
+ if (!f2fs_is_volatile_file(inode))
+ goto out;
+
+ if (!f2fs_is_first_block_written(inode)) {
+ ret = truncate_partial_data_page(inode, 0, true);
+ goto out;
+ }
+
+ ret = punch_hole(inode, 0, F2FS_BLKSIZE);
+out:
+ inode_unlock(inode);
+ mnt_drop_write_file(filp);
+ return ret;
+}
+
+static int f2fs_ioc_abort_volatile_write(struct file *filp)
+{
+ struct inode *inode = file_inode(filp);
+ int ret;
+
+ if (!inode_owner_or_capable(inode))
+ return -EACCES;
+
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ inode_lock(inode);
+
+ if (f2fs_is_atomic_file(inode))
+ drop_inmem_pages(inode);
+ if (f2fs_is_volatile_file(inode)) {
+ clear_inode_flag(inode, FI_VOLATILE_FILE);
+ stat_dec_volatile_write(inode);
+ ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true);
+ }
+
+ inode_unlock(inode);
+
+ mnt_drop_write_file(filp);
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
+ return ret;
+}
+
+static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct super_block *sb = sbi->sb;
+ __u32 in;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (get_user(in, (__u32 __user *)arg))
+ return -EFAULT;
+
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ switch (in) {
+ case F2FS_GOING_DOWN_FULLSYNC:
+ sb = freeze_bdev(sb->s_bdev);
+ if (sb && !IS_ERR(sb)) {
+ f2fs_stop_checkpoint(sbi, false);
+ thaw_bdev(sb->s_bdev, sb);
+ }
+ break;
+ case F2FS_GOING_DOWN_METASYNC:
+ /* do checkpoint only */
+ f2fs_sync_fs(sb, 1);
+ f2fs_stop_checkpoint(sbi, false);
+ break;
+ case F2FS_GOING_DOWN_NOSYNC:
+ f2fs_stop_checkpoint(sbi, false);
+ break;
+ case F2FS_GOING_DOWN_METAFLUSH:
+ sync_meta_pages(sbi, META, LONG_MAX);
+ f2fs_stop_checkpoint(sbi, false);
+ break;
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
+ f2fs_update_time(sbi, REQ_TIME);
+out:
+ mnt_drop_write_file(filp);
+ return ret;
}
static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
@@ -929,18 +1766,530 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
sizeof(range)))
return -EFAULT;
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
range.minlen = max((unsigned int)range.minlen,
q->limits.discard_granularity);
ret = f2fs_trim_fs(F2FS_SB(sb), &range);
+ mnt_drop_write_file(filp);
if (ret < 0)
return ret;
if (copy_to_user((struct fstrim_range __user *)arg, &range,
sizeof(range)))
return -EFAULT;
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
return 0;
}
+static bool uuid_is_nonzero(__u8 u[16])
+{
+ int i;
+
+ for (i = 0; i < 16; i++)
+ if (u[i])
+ return true;
+ return false;
+}
+
+static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
+
+ return fscrypt_ioctl_set_policy(filp, (const void __user *)arg);
+}
+
+static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg)
+{
+ return fscrypt_ioctl_get_policy(filp, (void __user *)arg);
+}
+
+static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ int err;
+
+ if (!f2fs_sb_has_crypto(inode->i_sb))
+ return -EOPNOTSUPP;
+
+ if (uuid_is_nonzero(sbi->raw_super->encrypt_pw_salt))
+ goto got_it;
+
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
+
+ /* update superblock with uuid */
+ generate_random_uuid(sbi->raw_super->encrypt_pw_salt);
+
+ err = f2fs_commit_super(sbi, false);
+ if (err) {
+ /* undo new data */
+ memset(sbi->raw_super->encrypt_pw_salt, 0, 16);
+ mnt_drop_write_file(filp);
+ return err;
+ }
+ mnt_drop_write_file(filp);
+got_it:
+ if (copy_to_user((__u8 __user *)arg, sbi->raw_super->encrypt_pw_salt,
+ 16))
+ return -EFAULT;
+ return 0;
+}
+
+static int f2fs_ioc_gc(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ __u32 sync;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (get_user(sync, (__u32 __user *)arg))
+ return -EFAULT;
+
+ if (f2fs_readonly(sbi->sb))
+ return -EROFS;
+
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ if (!sync) {
+ if (!mutex_trylock(&sbi->gc_mutex)) {
+ ret = -EBUSY;
+ goto out;
+ }
+ } else {
+ mutex_lock(&sbi->gc_mutex);
+ }
+
+ ret = f2fs_gc(sbi, sync, true, NULL_SEGNO);
+out:
+ mnt_drop_write_file(filp);
+ return ret;
+}
+
+static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (f2fs_readonly(sbi->sb))
+ return -EROFS;
+
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ ret = f2fs_sync_fs(sbi->sb, 1);
+
+ mnt_drop_write_file(filp);
+ return ret;
+}
+
+static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
+ struct file *filp,
+ struct f2fs_defragment *range)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_map_blocks map = { .m_next_pgofs = NULL };
+ struct extent_info ei = {0,0,0};
+ pgoff_t pg_start, pg_end;
+ unsigned int blk_per_seg = sbi->blocks_per_seg;
+ unsigned int total = 0, sec_num;
+ block_t blk_end = 0;
+ bool fragmented = false;
+ int err;
+
+ /* if in-place-update policy is enabled, don't waste time here */
+ if (need_inplace_update_policy(inode, NULL))
+ return -EINVAL;
+
+ pg_start = range->start >> PAGE_SHIFT;
+ pg_end = (range->start + range->len) >> PAGE_SHIFT;
+
+ f2fs_balance_fs(sbi, true);
+
+ inode_lock(inode);
+
+ /* writeback all dirty pages in the range */
+ err = filemap_write_and_wait_range(inode->i_mapping, range->start,
+ range->start + range->len - 1);
+ if (err)
+ goto out;
+
+ /*
+ * lookup mapping info in extent cache, skip defragmenting if physical
+ * block addresses are continuous.
+ */
+ if (f2fs_lookup_extent_cache(inode, pg_start, &ei)) {
+ if (ei.fofs + ei.len >= pg_end)
+ goto out;
+ }
+
+ map.m_lblk = pg_start;
+
+ /*
+ * lookup mapping info in dnode page cache, skip defragmenting if all
+ * physical block addresses are continuous even if there are hole(s)
+ * in logical blocks.
+ */
+ while (map.m_lblk < pg_end) {
+ map.m_len = pg_end - map.m_lblk;
+ err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ);
+ if (err)
+ goto out;
+
+ if (!(map.m_flags & F2FS_MAP_FLAGS)) {
+ map.m_lblk++;
+ continue;
+ }
+
+ if (blk_end && blk_end != map.m_pblk) {
+ fragmented = true;
+ break;
+ }
+ blk_end = map.m_pblk + map.m_len;
+
+ map.m_lblk += map.m_len;
+ }
+
+ if (!fragmented)
+ goto out;
+
+ map.m_lblk = pg_start;
+ map.m_len = pg_end - pg_start;
+
+ sec_num = (map.m_len + BLKS_PER_SEC(sbi) - 1) / BLKS_PER_SEC(sbi);
+
+ /*
+ * make sure there are enough free section for LFS allocation, this can
+ * avoid defragment running in SSR mode when free section are allocated
+ * intensively
+ */
+ if (has_not_enough_free_secs(sbi, 0, sec_num)) {
+ err = -EAGAIN;
+ goto out;
+ }
+
+ while (map.m_lblk < pg_end) {
+ pgoff_t idx;
+ int cnt = 0;
+
+do_map:
+ map.m_len = pg_end - map.m_lblk;
+ err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ);
+ if (err)
+ goto clear_out;
+
+ if (!(map.m_flags & F2FS_MAP_FLAGS)) {
+ map.m_lblk++;
+ continue;
+ }
+
+ set_inode_flag(inode, FI_DO_DEFRAG);
+
+ idx = map.m_lblk;
+ while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) {
+ struct page *page;
+
+ page = get_lock_data_page(inode, idx, true);
+ if (IS_ERR(page)) {
+ err = PTR_ERR(page);
+ goto clear_out;
+ }
+
+ set_page_dirty(page);
+ f2fs_put_page(page, 1);
+
+ idx++;
+ cnt++;
+ total++;
+ }
+
+ map.m_lblk = idx;
+
+ if (idx < pg_end && cnt < blk_per_seg)
+ goto do_map;
+
+ clear_inode_flag(inode, FI_DO_DEFRAG);
+
+ err = filemap_fdatawrite(inode->i_mapping);
+ if (err)
+ goto out;
+ }
+clear_out:
+ clear_inode_flag(inode, FI_DO_DEFRAG);
+out:
+ inode_unlock(inode);
+ if (!err)
+ range->len = (u64)total << PAGE_SHIFT;
+ return err;
+}
+
+static int f2fs_ioc_defragment(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_defragment range;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!S_ISREG(inode->i_mode) || f2fs_is_atomic_file(inode))
+ return -EINVAL;
+
+ if (f2fs_readonly(sbi->sb))
+ return -EROFS;
+
+ if (copy_from_user(&range, (struct f2fs_defragment __user *)arg,
+ sizeof(range)))
+ return -EFAULT;
+
+ /* verify alignment of offset & size */
+ if (range.start & (F2FS_BLKSIZE - 1) || range.len & (F2FS_BLKSIZE - 1))
+ return -EINVAL;
+
+ if (unlikely((range.start + range.len) >> PAGE_SHIFT >
+ sbi->max_file_blocks))
+ return -EINVAL;
+
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
+
+ err = f2fs_defragment_range(sbi, filp, &range);
+ mnt_drop_write_file(filp);
+
+ f2fs_update_time(sbi, REQ_TIME);
+ if (err < 0)
+ return err;
+
+ if (copy_to_user((struct f2fs_defragment __user *)arg, &range,
+ sizeof(range)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out, size_t len)
+{
+ struct inode *src = file_inode(file_in);
+ struct inode *dst = file_inode(file_out);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(src);
+ size_t olen = len, dst_max_i_size = 0;
+ size_t dst_osize;
+ int ret;
+
+ if (file_in->f_path.mnt != file_out->f_path.mnt ||
+ src->i_sb != dst->i_sb)
+ return -EXDEV;
+
+ if (unlikely(f2fs_readonly(src->i_sb)))
+ return -EROFS;
+
+ if (!S_ISREG(src->i_mode) || !S_ISREG(dst->i_mode))
+ return -EINVAL;
+
+ if (f2fs_encrypted_inode(src) || f2fs_encrypted_inode(dst))
+ return -EOPNOTSUPP;
+
+ if (src == dst) {
+ if (pos_in == pos_out)
+ return 0;
+ if (pos_out > pos_in && pos_out < pos_in + len)
+ return -EINVAL;
+ }
+
+ inode_lock(src);
+ if (src != dst) {
+ if (!inode_trylock(dst)) {
+ ret = -EBUSY;
+ goto out;
+ }
+ }
+
+ ret = -EINVAL;
+ if (pos_in + len > src->i_size || pos_in + len < pos_in)
+ goto out_unlock;
+ if (len == 0)
+ olen = len = src->i_size - pos_in;
+ if (pos_in + len == src->i_size)
+ len = ALIGN(src->i_size, F2FS_BLKSIZE) - pos_in;
+ if (len == 0) {
+ ret = 0;
+ goto out_unlock;
+ }
+
+ dst_osize = dst->i_size;
+ if (pos_out + olen > dst->i_size)
+ dst_max_i_size = pos_out + olen;
+
+ /* verify the end result is block aligned */
+ if (!IS_ALIGNED(pos_in, F2FS_BLKSIZE) ||
+ !IS_ALIGNED(pos_in + len, F2FS_BLKSIZE) ||
+ !IS_ALIGNED(pos_out, F2FS_BLKSIZE))
+ goto out_unlock;
+
+ ret = f2fs_convert_inline_inode(src);
+ if (ret)
+ goto out_unlock;
+
+ ret = f2fs_convert_inline_inode(dst);
+ if (ret)
+ goto out_unlock;
+
+ /* write out all dirty pages from offset */
+ ret = filemap_write_and_wait_range(src->i_mapping,
+ pos_in, pos_in + len);
+ if (ret)
+ goto out_unlock;
+
+ ret = filemap_write_and_wait_range(dst->i_mapping,
+ pos_out, pos_out + len);
+ if (ret)
+ goto out_unlock;
+
+ f2fs_balance_fs(sbi, true);
+ f2fs_lock_op(sbi);
+ ret = __exchange_data_block(src, dst, pos_in >> F2FS_BLKSIZE_BITS,
+ pos_out >> F2FS_BLKSIZE_BITS,
+ len >> F2FS_BLKSIZE_BITS, false);
+
+ if (!ret) {
+ if (dst_max_i_size)
+ f2fs_i_size_write(dst, dst_max_i_size);
+ else if (dst_osize != dst->i_size)
+ f2fs_i_size_write(dst, dst_osize);
+ }
+ f2fs_unlock_op(sbi);
+out_unlock:
+ if (src != dst)
+ inode_unlock(dst);
+out:
+ inode_unlock(src);
+ return ret;
+}
+
+static int f2fs_ioc_move_range(struct file *filp, unsigned long arg)
+{
+ struct f2fs_move_range range;
+ struct fd dst;
+ int err;
+
+ if (!(filp->f_mode & FMODE_READ) ||
+ !(filp->f_mode & FMODE_WRITE))
+ return -EBADF;
+
+ if (copy_from_user(&range, (struct f2fs_move_range __user *)arg,
+ sizeof(range)))
+ return -EFAULT;
+
+ dst = fdget(range.dst_fd);
+ if (!dst.file)
+ return -EBADF;
+
+ if (!(dst.file->f_mode & FMODE_WRITE)) {
+ err = -EBADF;
+ goto err_out;
+ }
+
+ err = mnt_want_write_file(filp);
+ if (err)
+ goto err_out;
+
+ err = f2fs_move_file_range(filp, range.pos_in, dst.file,
+ range.pos_out, range.len);
+
+ mnt_drop_write_file(filp);
+ if (err)
+ goto err_out;
+
+ if (copy_to_user((struct f2fs_move_range __user *)arg,
+ &range, sizeof(range)))
+ err = -EFAULT;
+err_out:
+ fdput(dst);
+ return err;
+}
+
+static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct sit_info *sm = SIT_I(sbi);
+ unsigned int start_segno = 0, end_segno = 0;
+ unsigned int dev_start_segno = 0, dev_end_segno = 0;
+ struct f2fs_flush_device range;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (f2fs_readonly(sbi->sb))
+ return -EROFS;
+
+ if (copy_from_user(&range, (struct f2fs_flush_device __user *)arg,
+ sizeof(range)))
+ return -EFAULT;
+
+ if (sbi->s_ndevs <= 1 || sbi->s_ndevs - 1 <= range.dev_num ||
+ sbi->segs_per_sec != 1) {
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "Can't flush %u in %d for segs_per_sec %u != 1\n",
+ range.dev_num, sbi->s_ndevs,
+ sbi->segs_per_sec);
+ return -EINVAL;
+ }
+
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ if (range.dev_num != 0)
+ dev_start_segno = GET_SEGNO(sbi, FDEV(range.dev_num).start_blk);
+ dev_end_segno = GET_SEGNO(sbi, FDEV(range.dev_num).end_blk);
+
+ start_segno = sm->last_victim[FLUSH_DEVICE];
+ if (start_segno < dev_start_segno || start_segno >= dev_end_segno)
+ start_segno = dev_start_segno;
+ end_segno = min(start_segno + range.segments, dev_end_segno);
+
+ while (start_segno < end_segno) {
+ if (!mutex_trylock(&sbi->gc_mutex)) {
+ ret = -EBUSY;
+ goto out;
+ }
+ sm->last_victim[GC_CB] = end_segno + 1;
+ sm->last_victim[GC_GREEDY] = end_segno + 1;
+ sm->last_victim[ALLOC_NEXT] = end_segno + 1;
+ ret = f2fs_gc(sbi, true, true, start_segno);
+ if (ret == -EAGAIN)
+ ret = 0;
+ else if (ret < 0)
+ break;
+ start_segno++;
+ }
+out:
+ mnt_drop_write_file(filp);
+ return ret;
+}
+
+
long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
switch (cmd) {
@@ -948,19 +2297,82 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return f2fs_ioc_getflags(filp, arg);
case F2FS_IOC_SETFLAGS:
return f2fs_ioc_setflags(filp, arg);
+ case F2FS_IOC_GETVERSION:
+ return f2fs_ioc_getversion(filp, arg);
case F2FS_IOC_START_ATOMIC_WRITE:
return f2fs_ioc_start_atomic_write(filp);
case F2FS_IOC_COMMIT_ATOMIC_WRITE:
return f2fs_ioc_commit_atomic_write(filp);
case F2FS_IOC_START_VOLATILE_WRITE:
return f2fs_ioc_start_volatile_write(filp);
+ case F2FS_IOC_RELEASE_VOLATILE_WRITE:
+ return f2fs_ioc_release_volatile_write(filp);
+ case F2FS_IOC_ABORT_VOLATILE_WRITE:
+ return f2fs_ioc_abort_volatile_write(filp);
+ case F2FS_IOC_SHUTDOWN:
+ return f2fs_ioc_shutdown(filp, arg);
case FITRIM:
return f2fs_ioc_fitrim(filp, arg);
+ case F2FS_IOC_SET_ENCRYPTION_POLICY:
+ return f2fs_ioc_set_encryption_policy(filp, arg);
+ case F2FS_IOC_GET_ENCRYPTION_POLICY:
+ return f2fs_ioc_get_encryption_policy(filp, arg);
+ case F2FS_IOC_GET_ENCRYPTION_PWSALT:
+ return f2fs_ioc_get_encryption_pwsalt(filp, arg);
+ case F2FS_IOC_GARBAGE_COLLECT:
+ return f2fs_ioc_gc(filp, arg);
+ case F2FS_IOC_WRITE_CHECKPOINT:
+ return f2fs_ioc_write_checkpoint(filp, arg);
+ case F2FS_IOC_DEFRAGMENT:
+ return f2fs_ioc_defragment(filp, arg);
+ case F2FS_IOC_MOVE_RANGE:
+ return f2fs_ioc_move_range(filp, arg);
+ case F2FS_IOC_FLUSH_DEVICE:
+ return f2fs_ioc_flush_device(filp, arg);
default:
return -ENOTTY;
}
}
+static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ size_t count = iov_iter_count(from);
+ loff_t pos = iocb->ki_pos;
+ struct blk_plug plug;
+ ssize_t ret;
+
+ if (f2fs_encrypted_inode(inode) &&
+ !fscrypt_has_encryption_key(inode) &&
+ fscrypt_get_encryption_info(inode))
+ return -EACCES;
+
+ inode_lock(inode);
+ ret = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
+ if (!ret) {
+ int err = f2fs_preallocate_blocks(inode, pos, count,
+ file->f_flags & O_DIRECT);
+ if (err) {
+ inode_unlock(inode);
+ return err;
+ }
+ blk_start_plug(&plug);
+ ret = __generic_file_write_iter(iocb, from);
+ blk_finish_plug(&plug);
+ }
+ inode_unlock(inode);
+
+ if (ret > 0) {
+ ssize_t err;
+
+ err = generic_write_sync(file, iocb->ki_pos - ret, ret);
+ if (err < 0)
+ ret = err;
+ }
+ return ret;
+}
+
#ifdef CONFIG_COMPAT
long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
@@ -971,6 +2383,24 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case F2FS_IOC32_SETFLAGS:
cmd = F2FS_IOC_SETFLAGS;
break;
+ case F2FS_IOC32_GETVERSION:
+ cmd = F2FS_IOC_GETVERSION;
+ break;
+ case F2FS_IOC_START_ATOMIC_WRITE:
+ case F2FS_IOC_COMMIT_ATOMIC_WRITE:
+ case F2FS_IOC_START_VOLATILE_WRITE:
+ case F2FS_IOC_RELEASE_VOLATILE_WRITE:
+ case F2FS_IOC_ABORT_VOLATILE_WRITE:
+ case F2FS_IOC_SHUTDOWN:
+ case F2FS_IOC_SET_ENCRYPTION_POLICY:
+ case F2FS_IOC_GET_ENCRYPTION_PWSALT:
+ case F2FS_IOC_GET_ENCRYPTION_POLICY:
+ case F2FS_IOC_GARBAGE_COLLECT:
+ case F2FS_IOC_WRITE_CHECKPOINT:
+ case F2FS_IOC_DEFRAGMENT:
+ case F2FS_IOC_MOVE_RANGE:
+ case F2FS_IOC_FLUSH_DEVICE:
+ break;
default:
return -ENOIOCTLCMD;
}
@@ -983,8 +2413,9 @@ const struct file_operations f2fs_file_operations = {
.read = new_sync_read,
.write = new_sync_write,
.read_iter = generic_file_read_iter,
- .write_iter = generic_file_write_iter,
- .open = generic_file_open,
+ .write_iter = f2fs_file_write_iter,
+ .open = f2fs_file_open,
+ .release = f2fs_release_file,
.mmap = f2fs_file_mmap,
.fsync = f2fs_sync_file,
.fallocate = f2fs_fallocate,
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 2a8f4acdb86b..026522107ca3 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -16,7 +16,6 @@
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/freezer.h>
-#include <linux/blkdev.h>
#include "f2fs.h"
#include "node.h"
@@ -24,8 +23,6 @@
#include "gc.h"
#include <trace/events/f2fs.h>
-static struct kmem_cache *winode_slab;
-
static int gc_thread_func(void *data)
{
struct f2fs_sb_info *sbi = data;
@@ -46,10 +43,17 @@ static int gc_thread_func(void *data)
break;
if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) {
- wait_ms = increase_sleep_time(gc_th, wait_ms);
+ increase_sleep_time(gc_th, &wait_ms);
continue;
}
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (time_to_inject(sbi, FAULT_CHECKPOINT)) {
+ f2fs_show_injection_info(FAULT_CHECKPOINT);
+ f2fs_stop_checkpoint(sbi, false);
+ }
+#endif
+
/*
* [GC triggering condition]
* 0. GC is not conducted currently.
@@ -67,22 +71,25 @@ static int gc_thread_func(void *data)
continue;
if (!is_idle(sbi)) {
- wait_ms = increase_sleep_time(gc_th, wait_ms);
+ increase_sleep_time(gc_th, &wait_ms);
mutex_unlock(&sbi->gc_mutex);
continue;
}
if (has_enough_invalid_blocks(sbi))
- wait_ms = decrease_sleep_time(gc_th, wait_ms);
+ decrease_sleep_time(gc_th, &wait_ms);
else
- wait_ms = increase_sleep_time(gc_th, wait_ms);
+ increase_sleep_time(gc_th, &wait_ms);
stat_inc_bggc_count(sbi);
/* if return value is not zero, no victim was selected */
- if (f2fs_gc(sbi))
+ if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true, NULL_SEGNO))
wait_ms = gc_th->no_gc_sleep_time;
+ trace_f2fs_background_gc(sbi->sb, wait_ms,
+ prefree_segments(sbi), free_segments(sbi));
+
/* balancing f2fs's metadata periodically */
f2fs_balance_fs_bg(sbi);
@@ -96,9 +103,7 @@ int start_gc_thread(struct f2fs_sb_info *sbi)
dev_t dev = sbi->sb->s_bdev->bd_dev;
int err = 0;
- if (!test_opt(sbi, BG_GC))
- goto out;
- gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
+ gc_th = f2fs_kmalloc(sbi, sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
if (!gc_th) {
err = -ENOMEM;
goto out;
@@ -163,10 +168,15 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
p->ofs_unit = sbi->segs_per_sec;
}
- if (p->max_search > sbi->max_victim_search)
+ /* we need to check every dirty segments in the FG_GC case */
+ if (gc_type != FG_GC && p->max_search > sbi->max_victim_search)
p->max_search = sbi->max_victim_search;
- p->offset = sbi->last_victim[p->gc_mode];
+ /* let's select beginning hot/small space first */
+ if (type == CURSEG_HOT_DATA || IS_NODESEG(type))
+ p->offset = 0;
+ else
+ p->offset = SIT_I(sbi)->last_victim[p->gc_mode];
}
static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
@@ -174,9 +184,9 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
{
/* SSR allocates in a segment unit */
if (p->alloc_mode == SSR)
- return 1 << sbi->log_blocks_per_seg;
+ return sbi->blocks_per_seg;
if (p->gc_mode == GC_GREEDY)
- return (1 << sbi->log_blocks_per_seg) * p->ofs_unit;
+ return 2 * sbi->blocks_per_seg * p->ofs_unit;
else if (p->gc_mode == GC_CB)
return UINT_MAX;
else /* No other gc_mode */
@@ -196,8 +206,12 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) {
if (sec_usage_check(sbi, secno))
continue;
+
+ if (no_fggc_candidate(sbi, secno))
+ continue;
+
clear_bit(secno, dirty_i->victim_secmap);
- return secno * sbi->segs_per_sec;
+ return GET_SEG_FROM_SEC(sbi, secno);
}
return NULL_SEGNO;
}
@@ -205,8 +219,8 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
{
struct sit_info *sit_i = SIT_I(sbi);
- unsigned int secno = GET_SECNO(sbi, segno);
- unsigned int start = secno * sbi->segs_per_sec;
+ unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
+ unsigned int start = GET_SEG_FROM_SEC(sbi, secno);
unsigned long long mtime = 0;
unsigned int vblocks;
unsigned char age = 0;
@@ -215,7 +229,7 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
for (i = 0; i < sbi->segs_per_sec; i++)
mtime += get_seg_entry(sbi, start + i)->mtime;
- vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
+ vblocks = get_valid_blocks(sbi, segno, true);
mtime = div_u64(mtime, sbi->segs_per_sec);
vblocks = div_u64(vblocks, sbi->segs_per_sec);
@@ -234,6 +248,16 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
return UINT_MAX - ((100 * (100 - u) * age) / (100 + u));
}
+static unsigned int get_greedy_cost(struct f2fs_sb_info *sbi,
+ unsigned int segno)
+{
+ unsigned int valid_blocks =
+ get_valid_blocks(sbi, segno, true);
+
+ return IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
+ valid_blocks * 2 : valid_blocks;
+}
+
static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi,
unsigned int segno, struct victim_sel_policy *p)
{
@@ -242,11 +266,23 @@ static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi,
/* alloc_mode == LFS */
if (p->gc_mode == GC_GREEDY)
- return get_valid_blocks(sbi, segno, sbi->segs_per_sec);
+ return get_greedy_cost(sbi, segno);
else
return get_cb_cost(sbi, segno);
}
+static unsigned int count_bits(const unsigned long *addr,
+ unsigned int offset, unsigned int len)
+{
+ unsigned int end = offset + len, sum = 0;
+
+ while (offset < end) {
+ if (test_bit(offset++, addr))
+ ++sum;
+ }
+ return sum;
+}
+
/*
* This function is called from two paths.
* One is garbage collection and the other is SSR segment selection.
@@ -259,9 +295,11 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
unsigned int *result, int gc_type, int type, char alloc_mode)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+ struct sit_info *sm = SIT_I(sbi);
struct victim_sel_policy p;
- unsigned int secno, max_cost;
- int nsearched = 0;
+ unsigned int secno, last_victim;
+ unsigned int last_segment = MAIN_SEGS(sbi);
+ unsigned int nsearched = 0;
mutex_lock(&dirty_i->seglist_lock);
@@ -269,8 +307,20 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
select_policy(sbi, gc_type, type, &p);
p.min_segno = NULL_SEGNO;
- p.min_cost = max_cost = get_max_cost(sbi, &p);
+ p.min_cost = get_max_cost(sbi, &p);
+
+ if (*result != NULL_SEGNO) {
+ if (IS_DATASEG(get_seg_entry(sbi, *result)->type) &&
+ get_valid_blocks(sbi, *result, false) &&
+ !sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result)))
+ p.min_segno = *result;
+ goto out;
+ }
+
+ if (p.max_search == 0)
+ goto out;
+ last_victim = sm->last_victim[p.gc_mode];
if (p.alloc_mode == LFS && gc_type == FG_GC) {
p.min_segno = check_bg_victims(sbi);
if (p.min_segno != NULL_SEGNO)
@@ -281,10 +331,12 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
unsigned long cost;
unsigned int segno;
- segno = find_next_bit(p.dirty_segmap, MAIN_SEGS(sbi), p.offset);
- if (segno >= MAIN_SEGS(sbi)) {
- if (sbi->last_victim[p.gc_mode]) {
- sbi->last_victim[p.gc_mode] = 0;
+ segno = find_next_bit(p.dirty_segmap, last_segment, p.offset);
+ if (segno >= last_segment) {
+ if (sm->last_victim[p.gc_mode]) {
+ last_segment =
+ sm->last_victim[p.gc_mode];
+ sm->last_victim[p.gc_mode] = 0;
p.offset = 0;
continue;
}
@@ -292,34 +344,45 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
}
p.offset = segno + p.ofs_unit;
- if (p.ofs_unit > 1)
+ if (p.ofs_unit > 1) {
p.offset -= segno % p.ofs_unit;
+ nsearched += count_bits(p.dirty_segmap,
+ p.offset - p.ofs_unit,
+ p.ofs_unit);
+ } else {
+ nsearched++;
+ }
- secno = GET_SECNO(sbi, segno);
+ secno = GET_SEC_FROM_SEG(sbi, segno);
if (sec_usage_check(sbi, secno))
- continue;
+ goto next;
if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap))
- continue;
+ goto next;
+ if (gc_type == FG_GC && p.alloc_mode == LFS &&
+ no_fggc_candidate(sbi, secno))
+ goto next;
cost = get_gc_cost(sbi, segno, &p);
if (p.min_cost > cost) {
p.min_segno = segno;
p.min_cost = cost;
- } else if (unlikely(cost == max_cost)) {
- continue;
}
-
- if (nsearched++ >= p.max_search) {
- sbi->last_victim[p.gc_mode] = segno;
+next:
+ if (nsearched >= p.max_search) {
+ if (!sm->last_victim[p.gc_mode] && segno <= last_victim)
+ sm->last_victim[p.gc_mode] = last_victim + 1;
+ else
+ sm->last_victim[p.gc_mode] = segno + 1;
+ sm->last_victim[p.gc_mode] %= MAIN_SEGS(sbi);
break;
}
}
if (p.min_segno != NULL_SEGNO) {
got_it:
if (p.alloc_mode == LFS) {
- secno = GET_SECNO(sbi, p.min_segno);
+ secno = GET_SEC_FROM_SEG(sbi, p.min_segno);
if (gc_type == FG_GC)
sbi->cur_victim_sec = secno;
else
@@ -331,6 +394,7 @@ got_it:
sbi->cur_victim_sec,
prefree_segments(sbi), free_segments(sbi));
}
+out:
mutex_unlock(&dirty_i->seglist_lock);
return (p.min_segno == NULL_SEGNO) ? 0 : 1;
@@ -340,37 +404,39 @@ static const struct victim_selection default_v_ops = {
.get_victim = get_victim_by_default,
};
-static struct inode *find_gc_inode(nid_t ino, struct list_head *ilist)
+static struct inode *find_gc_inode(struct gc_inode_list *gc_list, nid_t ino)
{
struct inode_entry *ie;
- list_for_each_entry(ie, ilist, list)
- if (ie->inode->i_ino == ino)
- return ie->inode;
+ ie = radix_tree_lookup(&gc_list->iroot, ino);
+ if (ie)
+ return ie->inode;
return NULL;
}
-static void add_gc_inode(struct inode *inode, struct list_head *ilist)
+static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode)
{
struct inode_entry *new_ie;
- if (inode == find_gc_inode(inode->i_ino, ilist)) {
+ if (inode == find_gc_inode(gc_list, inode->i_ino)) {
iput(inode);
return;
}
-
- new_ie = f2fs_kmem_cache_alloc(winode_slab, GFP_NOFS);
+ new_ie = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
new_ie->inode = inode;
- list_add_tail(&new_ie->list, ilist);
+
+ f2fs_radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie);
+ list_add_tail(&new_ie->list, &gc_list->ilist);
}
-static void put_gc_inode(struct list_head *ilist)
+static void put_gc_inode(struct gc_inode_list *gc_list)
{
struct inode_entry *ie, *next_ie;
- list_for_each_entry_safe(ie, next_ie, ilist, list) {
+ list_for_each_entry_safe(ie, next_ie, &gc_list->ilist, list) {
+ radix_tree_delete(&gc_list->iroot, ie->inode->i_ino);
iput(ie->inode);
list_del(&ie->list);
- kmem_cache_free(winode_slab, ie);
+ kmem_cache_free(inode_entry_slab, ie);
}
}
@@ -396,9 +462,12 @@ static int check_valid_map(struct f2fs_sb_info *sbi,
static void gc_node_segment(struct f2fs_sb_info *sbi,
struct f2fs_summary *sum, unsigned int segno, int gc_type)
{
- bool initial = true;
struct f2fs_summary *entry;
+ block_t start_addr;
int off;
+ int phase = 0;
+
+ start_addr = START_BLOCK(sbi, segno);
next_step:
entry = sum;
@@ -406,18 +475,27 @@ next_step:
for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
nid_t nid = le32_to_cpu(entry->nid);
struct page *node_page;
+ struct node_info ni;
/* stop BG_GC if there is not enough free sections. */
- if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0))
+ if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0))
return;
if (check_valid_map(sbi, segno, off) == 0)
continue;
- if (initial) {
+ if (phase == 0) {
+ ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1,
+ META_NAT, true);
+ continue;
+ }
+
+ if (phase == 1) {
ra_node_page(sbi, nid);
continue;
}
+
+ /* phase == 2 */
node_page = get_node_page(sbi, nid);
if (IS_ERR(node_page))
continue;
@@ -428,38 +506,18 @@ next_step:
continue;
}
- /* set page dirty and write it */
- if (gc_type == FG_GC) {
- f2fs_wait_on_page_writeback(node_page, NODE);
- set_page_dirty(node_page);
- } else {
- if (!PageWriteback(node_page))
- set_page_dirty(node_page);
+ get_node_info(sbi, nid, &ni);
+ if (ni.blk_addr != start_addr + off) {
+ f2fs_put_page(node_page, 1);
+ continue;
}
- f2fs_put_page(node_page, 1);
- stat_inc_node_blk_count(sbi, 1);
- }
- if (initial) {
- initial = false;
- goto next_step;
+ move_node_page(node_page, gc_type);
+ stat_inc_node_blk_count(sbi, 1, gc_type);
}
- if (gc_type == FG_GC) {
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = LONG_MAX,
- .for_reclaim = 0,
- };
- sync_node_pages(sbi, 0, &wbc);
-
- /*
- * In the case of FG_GC, it'd be better to reclaim this victim
- * completely.
- */
- if (get_valid_blocks(sbi, segno, 1) != 0)
- goto next_step;
- }
+ if (++phase < 3)
+ goto next_step;
}
/*
@@ -469,7 +527,7 @@ next_step:
* as indirect or double indirect node blocks, are given, it must be a caller's
* bug.
*/
-block_t start_bidx_of_node(unsigned int node_ofs, struct f2fs_inode_info *fi)
+block_t start_bidx_of_node(unsigned int node_ofs, struct inode *inode)
{
unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4;
unsigned int bidx;
@@ -486,10 +544,10 @@ block_t start_bidx_of_node(unsigned int node_ofs, struct f2fs_inode_info *fi)
int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1);
bidx = node_ofs - 5 - dec;
}
- return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi);
+ return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode);
}
-static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
+static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
struct node_info *dni, block_t blkaddr, unsigned int *nofs)
{
struct page *node_page;
@@ -502,13 +560,15 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
node_page = get_node_page(sbi, nid);
if (IS_ERR(node_page))
- return 0;
+ return false;
get_node_info(sbi, nid, dni);
if (sum->version != dni->version) {
- f2fs_put_page(node_page, 1);
- return 0;
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "%s: valid data with mismatched node version.",
+ __func__);
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
}
*nofs = ofs_of_node(node_page);
@@ -516,16 +576,132 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
f2fs_put_page(node_page, 1);
if (source_blkaddr != blkaddr)
- return 0;
- return 1;
+ return false;
+ return true;
}
-static void move_data_page(struct inode *inode, struct page *page, int gc_type)
+static void move_encrypted_block(struct inode *inode, block_t bidx,
+ unsigned int segno, int off)
{
struct f2fs_io_info fio = {
+ .sbi = F2FS_I_SB(inode),
.type = DATA,
- .rw = WRITE_SYNC,
+ .op = REQ_OP_READ,
+ .op_flags = 0,
+ .encrypted_page = NULL,
};
+ struct dnode_of_data dn;
+ struct f2fs_summary sum;
+ struct node_info ni;
+ struct page *page;
+ block_t newaddr;
+ int err;
+
+ /* do not read out */
+ page = f2fs_grab_cache_page(inode->i_mapping, bidx, false);
+ if (!page)
+ return;
+
+ if (!check_valid_map(F2FS_I_SB(inode), segno, off))
+ goto out;
+
+ if (f2fs_is_atomic_file(inode))
+ goto out;
+
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ err = get_dnode_of_data(&dn, bidx, LOOKUP_NODE);
+ if (err)
+ goto out;
+
+ if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
+ ClearPageUptodate(page);
+ goto put_out;
+ }
+
+ /*
+ * don't cache encrypted data into meta inode until previous dirty
+ * data were writebacked to avoid racing between GC and flush.
+ */
+ f2fs_wait_on_page_writeback(page, DATA, true);
+
+ get_node_info(fio.sbi, dn.nid, &ni);
+ set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
+
+ /* read page */
+ fio.page = page;
+ fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
+
+ allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr,
+ &sum, CURSEG_COLD_DATA);
+
+ fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi), newaddr,
+ FGP_LOCK | FGP_CREAT, GFP_NOFS);
+ if (!fio.encrypted_page) {
+ err = -ENOMEM;
+ goto recover_block;
+ }
+
+ err = f2fs_submit_page_bio(&fio);
+ if (err)
+ goto put_page_out;
+
+ /* write page */
+ lock_page(fio.encrypted_page);
+
+ if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) {
+ err = -EIO;
+ goto put_page_out;
+ }
+ if (unlikely(!PageUptodate(fio.encrypted_page))) {
+ err = -EIO;
+ goto put_page_out;
+ }
+
+ set_page_dirty(fio.encrypted_page);
+ f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true);
+ if (clear_page_dirty_for_io(fio.encrypted_page))
+ dec_page_count(fio.sbi, F2FS_DIRTY_META);
+
+ set_page_writeback(fio.encrypted_page);
+
+ /* allocate block address */
+ f2fs_wait_on_page_writeback(dn.node_page, NODE, true);
+
+ fio.op = REQ_OP_WRITE;
+ fio.op_flags = REQ_SYNC;
+ fio.new_blkaddr = newaddr;
+ f2fs_submit_page_mbio(&fio);
+
+ f2fs_update_data_blkaddr(&dn, newaddr);
+ set_inode_flag(inode, FI_APPEND_WRITE);
+ if (page->index == 0)
+ set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
+put_page_out:
+ f2fs_put_page(fio.encrypted_page, 1);
+recover_block:
+ if (err)
+ __f2fs_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr,
+ true, true);
+put_out:
+ f2fs_put_dnode(&dn);
+out:
+ f2fs_put_page(page, 1);
+}
+
+static void move_data_page(struct inode *inode, block_t bidx, int gc_type,
+ unsigned int segno, int off)
+{
+ struct page *page;
+
+ page = get_lock_data_page(inode, bidx, true);
+ if (IS_ERR(page))
+ return;
+
+ if (!check_valid_map(F2FS_I_SB(inode), segno, off))
+ goto out;
+
+ if (f2fs_is_atomic_file(inode))
+ goto out;
if (gc_type == BG_GC) {
if (PageWriteback(page))
@@ -533,13 +709,34 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type)
set_page_dirty(page);
set_cold_data(page);
} else {
- f2fs_wait_on_page_writeback(page, DATA);
+ struct f2fs_io_info fio = {
+ .sbi = F2FS_I_SB(inode),
+ .type = DATA,
+ .op = REQ_OP_WRITE,
+ .op_flags = REQ_SYNC,
+ .old_blkaddr = NULL_ADDR,
+ .page = page,
+ .encrypted_page = NULL,
+ .need_lock = true,
+ };
+ bool is_dirty = PageDirty(page);
+ int err;
- if (clear_page_dirty_for_io(page))
+retry:
+ set_page_dirty(page);
+ f2fs_wait_on_page_writeback(page, DATA, true);
+ if (clear_page_dirty_for_io(page)) {
inode_dec_dirty_pages(inode);
+ remove_dirty_inode(inode);
+ }
+
set_cold_data(page);
- do_write_data_page(page, &fio);
- clear_cold_data(page);
+
+ err = do_write_data_page(&fio);
+ if (err == -ENOMEM && is_dirty) {
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ goto retry;
+ }
}
out:
f2fs_put_page(page, 1);
@@ -553,7 +750,7 @@ out:
* the victim data block is ignored.
*/
static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
- struct list_head *ilist, unsigned int segno, int gc_type)
+ struct gc_inode_list *gc_list, unsigned int segno, int gc_type)
{
struct super_block *sb = sbi->sb;
struct f2fs_summary *entry;
@@ -572,188 +769,275 @@ next_step:
struct node_info dni; /* dnode info for the data */
unsigned int ofs_in_node, nofs;
block_t start_bidx;
+ nid_t nid = le32_to_cpu(entry->nid);
/* stop BG_GC if there is not enough free sections. */
- if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0))
+ if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0))
return;
if (check_valid_map(sbi, segno, off) == 0)
continue;
if (phase == 0) {
- ra_node_page(sbi, le32_to_cpu(entry->nid));
+ ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1,
+ META_NAT, true);
+ continue;
+ }
+
+ if (phase == 1) {
+ ra_node_page(sbi, nid);
continue;
}
/* Get an inode by ino with checking validity */
- if (check_dnode(sbi, entry, &dni, start_addr + off, &nofs) == 0)
+ if (!is_alive(sbi, entry, &dni, start_addr + off, &nofs))
continue;
- if (phase == 1) {
+ if (phase == 2) {
ra_node_page(sbi, dni.ino);
continue;
}
ofs_in_node = le16_to_cpu(entry->ofs_in_node);
- if (phase == 2) {
+ if (phase == 3) {
inode = f2fs_iget(sb, dni.ino);
if (IS_ERR(inode) || is_bad_inode(inode))
continue;
- start_bidx = start_bidx_of_node(nofs, F2FS_I(inode));
+ /* if encrypted inode, let's go phase 3 */
+ if (f2fs_encrypted_inode(inode) &&
+ S_ISREG(inode->i_mode)) {
+ add_gc_inode(gc_list, inode);
+ continue;
+ }
- data_page = find_data_page(inode,
- start_bidx + ofs_in_node, false);
- if (IS_ERR(data_page))
- goto next_iput;
+ start_bidx = start_bidx_of_node(nofs, inode);
+ data_page = get_read_data_page(inode,
+ start_bidx + ofs_in_node, REQ_RAHEAD,
+ true);
+ if (IS_ERR(data_page)) {
+ iput(inode);
+ continue;
+ }
f2fs_put_page(data_page, 0);
- add_gc_inode(inode, ilist);
- } else {
- inode = find_gc_inode(dni.ino, ilist);
- if (inode) {
- start_bidx = start_bidx_of_node(nofs,
- F2FS_I(inode));
- data_page = get_lock_data_page(inode,
- start_bidx + ofs_in_node);
- if (IS_ERR(data_page))
+ add_gc_inode(gc_list, inode);
+ continue;
+ }
+
+ /* phase 4 */
+ inode = find_gc_inode(gc_list, dni.ino);
+ if (inode) {
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ bool locked = false;
+
+ if (S_ISREG(inode->i_mode)) {
+ if (!down_write_trylock(&fi->dio_rwsem[READ]))
+ continue;
+ if (!down_write_trylock(
+ &fi->dio_rwsem[WRITE])) {
+ up_write(&fi->dio_rwsem[READ]);
continue;
- move_data_page(inode, data_page, gc_type);
- stat_inc_data_blk_count(sbi, 1);
+ }
+ locked = true;
}
- }
- continue;
-next_iput:
- iput(inode);
- }
- if (++phase < 4)
- goto next_step;
+ start_bidx = start_bidx_of_node(nofs, inode)
+ + ofs_in_node;
+ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
+ move_encrypted_block(inode, start_bidx, segno, off);
+ else
+ move_data_page(inode, start_bidx, gc_type, segno, off);
- if (gc_type == FG_GC) {
- f2fs_submit_merged_bio(sbi, DATA, WRITE);
+ if (locked) {
+ up_write(&fi->dio_rwsem[WRITE]);
+ up_write(&fi->dio_rwsem[READ]);
+ }
- /*
- * In the case of FG_GC, it'd be better to reclaim this victim
- * completely.
- */
- if (get_valid_blocks(sbi, segno, 1) != 0) {
- phase = 2;
- goto next_step;
+ stat_inc_data_blk_count(sbi, 1, gc_type);
}
}
+
+ if (++phase < 5)
+ goto next_step;
}
static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
- int gc_type, int type)
+ int gc_type)
{
struct sit_info *sit_i = SIT_I(sbi);
int ret;
+
mutex_lock(&sit_i->sentry_lock);
- ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type, type, LFS);
+ ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type,
+ NO_CHECK_TYPE, LFS);
mutex_unlock(&sit_i->sentry_lock);
return ret;
}
-static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
- struct list_head *ilist, int gc_type)
+static int do_garbage_collect(struct f2fs_sb_info *sbi,
+ unsigned int start_segno,
+ struct gc_inode_list *gc_list, int gc_type)
{
struct page *sum_page;
struct f2fs_summary_block *sum;
struct blk_plug plug;
+ unsigned int segno = start_segno;
+ unsigned int end_segno = start_segno + sbi->segs_per_sec;
+ int sec_freed = 0;
+ unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
+ SUM_TYPE_DATA : SUM_TYPE_NODE;
- /* read segment summary of victim */
- sum_page = get_sum_page(sbi, segno);
+ /* readahead multi ssa blocks those have contiguous address */
+ if (sbi->segs_per_sec > 1)
+ ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno),
+ sbi->segs_per_sec, META_SSA, true);
+
+ /* reference all summary page */
+ while (segno < end_segno) {
+ sum_page = get_sum_page(sbi, segno++);
+ unlock_page(sum_page);
+ }
blk_start_plug(&plug);
- sum = page_address(sum_page);
+ for (segno = start_segno; segno < end_segno; segno++) {
+
+ /* find segment summary of victim */
+ sum_page = find_get_page(META_MAPPING(sbi),
+ GET_SUM_BLOCK(sbi, segno));
+ f2fs_put_page(sum_page, 0);
+
+ if (get_valid_blocks(sbi, segno, false) == 0 ||
+ !PageUptodate(sum_page) ||
+ unlikely(f2fs_cp_error(sbi)))
+ goto next;
+
+ sum = page_address(sum_page);
+ f2fs_bug_on(sbi, type != GET_SUM_TYPE((&sum->footer)));
- switch (GET_SUM_TYPE((&sum->footer))) {
- case SUM_TYPE_NODE:
- gc_node_segment(sbi, sum->entries, segno, gc_type);
- break;
- case SUM_TYPE_DATA:
- gc_data_segment(sbi, sum->entries, ilist, segno, gc_type);
- break;
+ /*
+ * this is to avoid deadlock:
+ * - lock_page(sum_page) - f2fs_replace_block
+ * - check_valid_map() - mutex_lock(sentry_lock)
+ * - mutex_lock(sentry_lock) - change_curseg()
+ * - lock_page(sum_page)
+ */
+ if (type == SUM_TYPE_NODE)
+ gc_node_segment(sbi, sum->entries, segno, gc_type);
+ else
+ gc_data_segment(sbi, sum->entries, gc_list, segno,
+ gc_type);
+
+ stat_inc_seg_count(sbi, type, gc_type);
+next:
+ f2fs_put_page(sum_page, 0);
}
+
+ if (gc_type == FG_GC)
+ f2fs_submit_merged_bio(sbi,
+ (type == SUM_TYPE_NODE) ? NODE : DATA, WRITE);
+
blk_finish_plug(&plug);
- stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer)));
+ if (gc_type == FG_GC &&
+ get_valid_blocks(sbi, start_segno, true) == 0)
+ sec_freed = 1;
+
stat_inc_call_count(sbi->stat_info);
- f2fs_put_page(sum_page, 1);
+ return sec_freed;
}
-int f2fs_gc(struct f2fs_sb_info *sbi)
+int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
+ bool background, unsigned int segno)
{
- struct list_head ilist;
- unsigned int segno, i;
- int gc_type = BG_GC;
- int nfree = 0;
- int ret = -1;
- struct cp_control cpc = {
- .reason = CP_SYNC,
+ int gc_type = sync ? FG_GC : BG_GC;
+ int sec_freed = 0;
+ int ret = -EINVAL;
+ struct cp_control cpc;
+ unsigned int init_segno = segno;
+ struct gc_inode_list gc_list = {
+ .ilist = LIST_HEAD_INIT(gc_list.ilist),
+ .iroot = RADIX_TREE_INIT(GFP_NOFS),
};
- INIT_LIST_HEAD(&ilist);
+ cpc.reason = __get_cp_reason(sbi);
gc_more:
if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE)))
goto stop;
- if (unlikely(f2fs_cp_error(sbi)))
+ if (unlikely(f2fs_cp_error(sbi))) {
+ ret = -EIO;
goto stop;
+ }
- if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) {
- gc_type = FG_GC;
- write_checkpoint(sbi, &cpc);
+ if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) {
+ /*
+ * For example, if there are many prefree_segments below given
+ * threshold, we can make them free by checkpoint. Then, we
+ * secure free segments which doesn't need fggc any more.
+ */
+ if (prefree_segments(sbi)) {
+ ret = write_checkpoint(sbi, &cpc);
+ if (ret)
+ goto stop;
+ }
+ if (has_not_enough_free_secs(sbi, 0, 0))
+ gc_type = FG_GC;
}
- if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE))
+ /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */
+ if (gc_type == BG_GC && !background)
+ goto stop;
+ if (!__get_victim(sbi, &segno, gc_type))
goto stop;
ret = 0;
- /* readahead multi ssa blocks those have contiguous address */
- if (sbi->segs_per_sec > 1)
- ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec,
- META_SSA);
-
- for (i = 0; i < sbi->segs_per_sec; i++)
- do_garbage_collect(sbi, segno + i, &ilist, gc_type);
+ if (do_garbage_collect(sbi, segno, &gc_list, gc_type) &&
+ gc_type == FG_GC)
+ sec_freed++;
- if (gc_type == FG_GC) {
+ if (gc_type == FG_GC)
sbi->cur_victim_sec = NULL_SEGNO;
- nfree++;
- WARN_ON(get_valid_blocks(sbi, segno, sbi->segs_per_sec));
- }
- if (has_not_enough_free_secs(sbi, nfree))
- goto gc_more;
+ if (!sync) {
+ if (has_not_enough_free_secs(sbi, sec_freed, 0)) {
+ segno = NULL_SEGNO;
+ goto gc_more;
+ }
- if (gc_type == FG_GC)
- write_checkpoint(sbi, &cpc);
+ if (gc_type == FG_GC)
+ ret = write_checkpoint(sbi, &cpc);
+ }
stop:
+ SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
+ SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno;
mutex_unlock(&sbi->gc_mutex);
- put_gc_inode(&ilist);
+ put_gc_inode(&gc_list);
+
+ if (sync)
+ ret = sec_freed ? 0 : -EAGAIN;
return ret;
}
void build_gc_manager(struct f2fs_sb_info *sbi)
{
+ u64 main_count, resv_count, ovp_count;
+
DIRTY_I(sbi)->v_ops = &default_v_ops;
-}
-int __init create_gc_caches(void)
-{
- winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes",
- sizeof(struct inode_entry));
- if (!winode_slab)
- return -ENOMEM;
- return 0;
-}
+ /* threshold of # of valid blocks in a section for victims of FG_GC */
+ main_count = SM_I(sbi)->main_segments << sbi->log_blocks_per_seg;
+ resv_count = SM_I(sbi)->reserved_segments << sbi->log_blocks_per_seg;
+ ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg;
-void destroy_gc_caches(void)
-{
- kmem_cache_destroy(winode_slab);
+ sbi->fggc_threshold = div64_u64((main_count - ovp_count) *
+ BLKS_PER_SEC(sbi), (main_count - resv_count));
+
+ /* give warm/cold data area from slower device */
+ if (sbi->s_ndevs && sbi->segs_per_sec == 1)
+ SIT_I(sbi)->last_victim[ALLOC_NEXT] =
+ GET_SEGNO(sbi, FDEV(0).end_blk) + 1;
}
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 16f0b2b22999..a993967dcdb9 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -35,9 +35,9 @@ struct f2fs_gc_kthread {
unsigned int gc_idle;
};
-struct inode_entry {
- struct list_head list;
- struct inode *inode;
+struct gc_inode_list {
+ struct list_head ilist;
+ struct radix_tree_root iroot;
};
/*
@@ -64,26 +64,26 @@ static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi)
return (long)(reclaimable_user_blocks * LIMIT_FREE_BLOCK) / 100;
}
-static inline long increase_sleep_time(struct f2fs_gc_kthread *gc_th, long wait)
+static inline void increase_sleep_time(struct f2fs_gc_kthread *gc_th,
+ long *wait)
{
- if (wait == gc_th->no_gc_sleep_time)
- return wait;
+ if (*wait == gc_th->no_gc_sleep_time)
+ return;
- wait += gc_th->min_sleep_time;
- if (wait > gc_th->max_sleep_time)
- wait = gc_th->max_sleep_time;
- return wait;
+ *wait += gc_th->min_sleep_time;
+ if (*wait > gc_th->max_sleep_time)
+ *wait = gc_th->max_sleep_time;
}
-static inline long decrease_sleep_time(struct f2fs_gc_kthread *gc_th, long wait)
+static inline void decrease_sleep_time(struct f2fs_gc_kthread *gc_th,
+ long *wait)
{
- if (wait == gc_th->no_gc_sleep_time)
- wait = gc_th->max_sleep_time;
+ if (*wait == gc_th->no_gc_sleep_time)
+ *wait = gc_th->max_sleep_time;
- wait -= gc_th->min_sleep_time;
- if (wait <= gc_th->min_sleep_time)
- wait = gc_th->min_sleep_time;
- return wait;
+ *wait -= gc_th->min_sleep_time;
+ if (*wait <= gc_th->min_sleep_time)
+ *wait = gc_th->min_sleep_time;
}
static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi)
@@ -100,11 +100,3 @@ static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi)
return true;
return false;
}
-
-static inline int is_idle(struct f2fs_sb_info *sbi)
-{
- struct block_device *bdev = sbi->sb->s_bdev;
- struct request_queue *q = bdev_get_queue(bdev);
- struct request_list *rl = &q->root_rl;
- return !(rl->count[BLK_RW_SYNC]) && !(rl->count[BLK_RW_ASYNC]);
-}
diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c
index a844fcfb9a8d..eb2e031ea887 100644
--- a/fs/f2fs/hash.c
+++ b/fs/f2fs/hash.c
@@ -70,7 +70,8 @@ static void str2hashbuf(const unsigned char *msg, size_t len,
*buf++ = pad;
}
-f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info)
+f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info,
+ struct fscrypt_name *fname)
{
__u32 hash;
f2fs_hash_t f2fs_hash;
@@ -79,8 +80,11 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info)
const unsigned char *name = name_info->name;
size_t len = name_info->len;
- if ((len <= 2) && (name[0] == '.') &&
- (name[1] == '.' || name[1] == '\0'))
+ /* encrypted bigname case */
+ if (fname && !fname->disk_name.name)
+ return cpu_to_le32(fname->hash);
+
+ if (is_dot_dotdot(name_info))
return 0;
/* Initialize the default seed for the hash checksum functions */
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 88036fd75797..ce2c0a6267ae 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -10,153 +10,213 @@
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
+#include <trace/events/android_fs.h>
#include "f2fs.h"
+#include "node.h"
-bool f2fs_may_inline(struct inode *inode)
+bool f2fs_may_inline_data(struct inode *inode)
{
- block_t nr_blocks;
- loff_t i_size;
+ if (f2fs_is_atomic_file(inode))
+ return false;
- if (!test_opt(F2FS_I_SB(inode), INLINE_DATA))
+ if (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode))
return false;
- if (f2fs_is_atomic_file(inode))
+ if (i_size_read(inode) > MAX_INLINE_DATA)
+ return false;
+
+ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
return false;
- nr_blocks = F2FS_I(inode)->i_xattr_nid ? 3 : 2;
- if (inode->i_blocks > nr_blocks)
+ return true;
+}
+
+bool f2fs_may_inline_dentry(struct inode *inode)
+{
+ if (!test_opt(F2FS_I_SB(inode), INLINE_DENTRY))
return false;
- i_size = i_size_read(inode);
- if (i_size > MAX_INLINE_DATA)
+ if (!S_ISDIR(inode->i_mode))
return false;
return true;
}
-int f2fs_read_inline_data(struct inode *inode, struct page *page)
+void read_inline_data(struct page *page, struct page *ipage)
{
- struct page *ipage;
void *src_addr, *dst_addr;
- if (page->index) {
- zero_user_segment(page, 0, PAGE_CACHE_SIZE);
- goto out;
+ if (trace_android_fs_dataread_start_enabled()) {
+ char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+ path = android_fstrace_get_pathname(pathbuf,
+ MAX_TRACE_PATHBUF_LEN,
+ page->mapping->host);
+ trace_android_fs_dataread_start(page->mapping->host,
+ page_offset(page),
+ PAGE_SIZE, current->pid,
+ path, current->comm);
}
+ if (PageUptodate(page))
+ return;
+
+ f2fs_bug_on(F2FS_P_SB(page), page->index);
+
+ zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE);
+
+ /* Copy the whole inline data block */
+ src_addr = inline_data_addr(ipage);
+ dst_addr = kmap_atomic(page);
+ memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
+ flush_dcache_page(page);
+ kunmap_atomic(dst_addr);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
+}
+
+void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from)
+{
+ void *addr;
+
+ if (from >= MAX_INLINE_DATA)
+ return;
+
+ addr = inline_data_addr(ipage);
+
+ f2fs_wait_on_page_writeback(ipage, NODE, true);
+ memset(addr + from, 0, MAX_INLINE_DATA - from);
+ set_page_dirty(ipage);
+
+ if (from == 0)
+ clear_inode_flag(inode, FI_DATA_EXIST);
+}
+
+int f2fs_read_inline_data(struct inode *inode, struct page *page)
+{
+ struct page *ipage;
+
ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
if (IS_ERR(ipage)) {
+ trace_android_fs_dataread_end(inode, page_offset(page),
+ PAGE_SIZE);
unlock_page(page);
return PTR_ERR(ipage);
}
- zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE);
+ if (!f2fs_has_inline_data(inode)) {
+ f2fs_put_page(ipage, 1);
+ return -EAGAIN;
+ }
+
+ if (page->index)
+ zero_user_segment(page, 0, PAGE_SIZE);
+ else
+ read_inline_data(page, ipage);
+
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
- /* Copy the whole inline data block */
- src_addr = inline_data_addr(ipage);
- dst_addr = kmap(page);
- memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
- kunmap(page);
f2fs_put_page(ipage, 1);
-out:
- SetPageUptodate(page);
+ trace_android_fs_dataread_end(inode, page_offset(page), PAGE_SIZE);
unlock_page(page);
-
return 0;
}
-static int __f2fs_convert_inline_data(struct inode *inode, struct page *page)
+int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
{
- int err = 0;
- struct page *ipage;
- struct dnode_of_data dn;
- void *src_addr, *dst_addr;
- block_t new_blk_addr;
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_io_info fio = {
+ .sbi = F2FS_I_SB(dn->inode),
.type = DATA,
- .rw = WRITE_SYNC | REQ_PRIO,
+ .op = REQ_OP_WRITE,
+ .op_flags = REQ_SYNC | REQ_PRIO,
+ .page = page,
+ .encrypted_page = NULL,
};
+ int dirty, err;
- f2fs_lock_op(sbi);
- ipage = get_node_page(sbi, inode->i_ino);
- if (IS_ERR(ipage)) {
- err = PTR_ERR(ipage);
- goto out;
- }
+ if (!f2fs_exist_data(dn->inode))
+ goto clear_out;
- /* someone else converted inline_data already */
- if (!f2fs_has_inline_data(inode))
- goto out;
-
- /*
- * i_addr[0] is not used for inline data,
- * so reserving new block will not destroy inline data
- */
- set_new_dnode(&dn, inode, ipage, NULL, 0);
- err = f2fs_reserve_block(&dn, 0);
+ err = f2fs_reserve_block(dn, 0);
if (err)
- goto out;
+ return err;
- f2fs_wait_on_page_writeback(page, DATA);
- zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE);
+ f2fs_bug_on(F2FS_P_SB(page), PageWriteback(page));
- /* Copy the whole inline data block */
- src_addr = inline_data_addr(ipage);
- dst_addr = kmap(page);
- memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
- kunmap(page);
- SetPageUptodate(page);
+ read_inline_data(page, dn->inode_page);
+ set_page_dirty(page);
+
+ /* clear dirty state */
+ dirty = clear_page_dirty_for_io(page);
/* write data page to try to make data consistent */
set_page_writeback(page);
- write_data_page(page, &dn, &new_blk_addr, &fio);
- update_extent_cache(new_blk_addr, &dn);
- f2fs_wait_on_page_writeback(page, DATA);
+ fio.old_blkaddr = dn->data_blkaddr;
+ set_inode_flag(dn->inode, FI_HOT_DATA);
+ write_data_page(dn, &fio);
+ f2fs_wait_on_page_writeback(page, DATA, true);
+ if (dirty) {
+ inode_dec_dirty_pages(dn->inode);
+ remove_dirty_inode(dn->inode);
+ }
- /* clear inline data and flag after data writeback */
- zero_user_segment(ipage, INLINE_DATA_OFFSET,
- INLINE_DATA_OFFSET + MAX_INLINE_DATA);
- clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
- stat_dec_inline_inode(inode);
+ /* this converted inline_data should be recovered. */
+ set_inode_flag(dn->inode, FI_APPEND_WRITE);
- sync_inode_page(&dn);
- f2fs_put_dnode(&dn);
-out:
- f2fs_unlock_op(sbi);
- return err;
+ /* clear inline data and flag after data writeback */
+ truncate_inline_inode(dn->inode, dn->inode_page, 0);
+ clear_inline_node(dn->inode_page);
+clear_out:
+ stat_dec_inline_inode(dn->inode);
+ clear_inode_flag(dn->inode, FI_INLINE_DATA);
+ f2fs_put_dnode(dn);
+ return 0;
}
-int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size,
- struct page *page)
+int f2fs_convert_inline_inode(struct inode *inode)
{
- struct page *new_page = page;
- int err;
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct dnode_of_data dn;
+ struct page *ipage, *page;
+ int err = 0;
if (!f2fs_has_inline_data(inode))
return 0;
- else if (to_size <= MAX_INLINE_DATA)
- return 0;
- if (!page || page->index != 0) {
- new_page = grab_cache_page(inode->i_mapping, 0);
- if (!new_page)
- return -ENOMEM;
+ page = f2fs_grab_cache_page(inode->i_mapping, 0, false);
+ if (!page)
+ return -ENOMEM;
+
+ f2fs_lock_op(sbi);
+
+ ipage = get_node_page(sbi, inode->i_ino);
+ if (IS_ERR(ipage)) {
+ err = PTR_ERR(ipage);
+ goto out;
}
- err = __f2fs_convert_inline_data(inode, new_page);
- if (!page || page->index != 0)
- f2fs_put_page(new_page, 1);
+ set_new_dnode(&dn, inode, ipage, ipage, 0);
+
+ if (f2fs_has_inline_data(inode))
+ err = f2fs_convert_inline_page(&dn, page);
+
+ f2fs_put_dnode(&dn);
+out:
+ f2fs_unlock_op(sbi);
+
+ f2fs_put_page(page, 1);
+
+ f2fs_balance_fs(sbi, dn.node_changed);
+
return err;
}
-int f2fs_write_inline_data(struct inode *inode,
- struct page *page, unsigned size)
+int f2fs_write_inline_data(struct inode *inode, struct page *page)
{
void *src_addr, *dst_addr;
- struct page *ipage;
struct dnode_of_data dn;
int err;
@@ -164,47 +224,27 @@ int f2fs_write_inline_data(struct inode *inode,
err = get_dnode_of_data(&dn, 0, LOOKUP_NODE);
if (err)
return err;
- ipage = dn.inode_page;
-
- f2fs_wait_on_page_writeback(ipage, NODE);
- zero_user_segment(ipage, INLINE_DATA_OFFSET,
- INLINE_DATA_OFFSET + MAX_INLINE_DATA);
- src_addr = kmap(page);
- dst_addr = inline_data_addr(ipage);
- memcpy(dst_addr, src_addr, size);
- kunmap(page);
- /* Release the first data block if it is allocated */
if (!f2fs_has_inline_data(inode)) {
- truncate_data_blocks_range(&dn, 1);
- set_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
- stat_inc_inline_inode(inode);
+ f2fs_put_dnode(&dn);
+ return -EAGAIN;
}
- set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE);
- sync_inode_page(&dn);
- f2fs_put_dnode(&dn);
-
- return 0;
-}
-
-void truncate_inline_data(struct inode *inode, u64 from)
-{
- struct page *ipage;
-
- if (from >= MAX_INLINE_DATA)
- return;
+ f2fs_bug_on(F2FS_I_SB(inode), page->index);
- ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
- if (IS_ERR(ipage))
- return;
+ f2fs_wait_on_page_writeback(dn.inode_page, NODE, true);
+ src_addr = kmap_atomic(page);
+ dst_addr = inline_data_addr(dn.inode_page);
+ memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
+ kunmap_atomic(src_addr);
+ set_page_dirty(dn.inode_page);
- f2fs_wait_on_page_writeback(ipage, NODE);
+ set_inode_flag(inode, FI_APPEND_WRITE);
+ set_inode_flag(inode, FI_DATA_EXIST);
- zero_user_segment(ipage, INLINE_DATA_OFFSET + from,
- INLINE_DATA_OFFSET + MAX_INLINE_DATA);
- set_page_dirty(ipage);
- f2fs_put_page(ipage, 1);
+ clear_inline_node(dn.inode_page);
+ f2fs_put_dnode(&dn);
+ return 0;
}
bool recover_inline_data(struct inode *inode, struct page *npage)
@@ -231,12 +271,16 @@ process_inline:
ipage = get_node_page(sbi, inode->i_ino);
f2fs_bug_on(sbi, IS_ERR(ipage));
- f2fs_wait_on_page_writeback(ipage, NODE);
+ f2fs_wait_on_page_writeback(ipage, NODE, true);
src_addr = inline_data_addr(npage);
dst_addr = inline_data_addr(ipage);
memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
- update_inode(inode, ipage);
+
+ set_inode_flag(inode, FI_INLINE_DATA);
+ set_inode_flag(inode, FI_DATA_EXIST);
+
+ set_page_dirty(ipage);
f2fs_put_page(ipage, 1);
return true;
}
@@ -244,16 +288,400 @@ process_inline:
if (f2fs_has_inline_data(inode)) {
ipage = get_node_page(sbi, inode->i_ino);
f2fs_bug_on(sbi, IS_ERR(ipage));
- f2fs_wait_on_page_writeback(ipage, NODE);
- zero_user_segment(ipage, INLINE_DATA_OFFSET,
- INLINE_DATA_OFFSET + MAX_INLINE_DATA);
- clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
- update_inode(inode, ipage);
+ truncate_inline_inode(inode, ipage, 0);
+ clear_inode_flag(inode, FI_INLINE_DATA);
f2fs_put_page(ipage, 1);
} else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) {
- truncate_blocks(inode, 0, false);
- set_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
+ if (truncate_blocks(inode, 0, false))
+ return false;
goto process_inline;
}
return false;
}
+
+struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir,
+ struct fscrypt_name *fname, struct page **res_page)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ struct f2fs_inline_dentry *inline_dentry;
+ struct qstr name = FSTR_TO_QSTR(&fname->disk_name);
+ struct f2fs_dir_entry *de;
+ struct f2fs_dentry_ptr d;
+ struct page *ipage;
+ f2fs_hash_t namehash;
+
+ ipage = get_node_page(sbi, dir->i_ino);
+ if (IS_ERR(ipage)) {
+ *res_page = ipage;
+ return NULL;
+ }
+
+ namehash = f2fs_dentry_hash(&name, fname);
+
+ inline_dentry = inline_data_addr(ipage);
+
+ make_dentry_ptr_inline(NULL, &d, inline_dentry);
+ de = find_target_dentry(fname, namehash, NULL, &d);
+ unlock_page(ipage);
+ if (de)
+ *res_page = ipage;
+ else
+ f2fs_put_page(ipage, 0);
+
+ return de;
+}
+
+int make_empty_inline_dir(struct inode *inode, struct inode *parent,
+ struct page *ipage)
+{
+ struct f2fs_inline_dentry *dentry_blk;
+ struct f2fs_dentry_ptr d;
+
+ dentry_blk = inline_data_addr(ipage);
+
+ make_dentry_ptr_inline(NULL, &d, dentry_blk);
+ do_make_empty_dir(inode, parent, &d);
+
+ set_page_dirty(ipage);
+
+ /* update i_size to MAX_INLINE_DATA */
+ if (i_size_read(inode) < MAX_INLINE_DATA)
+ f2fs_i_size_write(inode, MAX_INLINE_DATA);
+ return 0;
+}
+
+/*
+ * NOTE: ipage is grabbed by caller, but if any error occurs, we should
+ * release ipage in this function.
+ */
+static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage,
+ struct f2fs_inline_dentry *inline_dentry)
+{
+ struct page *page;
+ struct dnode_of_data dn;
+ struct f2fs_dentry_block *dentry_blk;
+ int err;
+
+ page = f2fs_grab_cache_page(dir->i_mapping, 0, false);
+ if (!page) {
+ f2fs_put_page(ipage, 1);
+ return -ENOMEM;
+ }
+
+ set_new_dnode(&dn, dir, ipage, NULL, 0);
+ err = f2fs_reserve_block(&dn, 0);
+ if (err)
+ goto out;
+
+ f2fs_wait_on_page_writeback(page, DATA, true);
+ zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE);
+
+ dentry_blk = kmap_atomic(page);
+
+ /* copy data from inline dentry block to new dentry block */
+ memcpy(dentry_blk->dentry_bitmap, inline_dentry->dentry_bitmap,
+ INLINE_DENTRY_BITMAP_SIZE);
+ memset(dentry_blk->dentry_bitmap + INLINE_DENTRY_BITMAP_SIZE, 0,
+ SIZE_OF_DENTRY_BITMAP - INLINE_DENTRY_BITMAP_SIZE);
+ /*
+ * we do not need to zero out remainder part of dentry and filename
+ * field, since we have used bitmap for marking the usage status of
+ * them, besides, we can also ignore copying/zeroing reserved space
+ * of dentry block, because them haven't been used so far.
+ */
+ memcpy(dentry_blk->dentry, inline_dentry->dentry,
+ sizeof(struct f2fs_dir_entry) * NR_INLINE_DENTRY);
+ memcpy(dentry_blk->filename, inline_dentry->filename,
+ NR_INLINE_DENTRY * F2FS_SLOT_LEN);
+
+ kunmap_atomic(dentry_blk);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
+ set_page_dirty(page);
+
+ /* clear inline dir and flag after data writeback */
+ truncate_inline_inode(dir, ipage, 0);
+
+ stat_dec_inline_dir(dir);
+ clear_inode_flag(dir, FI_INLINE_DENTRY);
+
+ f2fs_i_depth_write(dir, 1);
+ if (i_size_read(dir) < PAGE_SIZE)
+ f2fs_i_size_write(dir, PAGE_SIZE);
+out:
+ f2fs_put_page(page, 1);
+ return err;
+}
+
+static int f2fs_add_inline_entries(struct inode *dir,
+ struct f2fs_inline_dentry *inline_dentry)
+{
+ struct f2fs_dentry_ptr d;
+ unsigned long bit_pos = 0;
+ int err = 0;
+
+ make_dentry_ptr_inline(NULL, &d, inline_dentry);
+
+ while (bit_pos < d.max) {
+ struct f2fs_dir_entry *de;
+ struct qstr new_name;
+ nid_t ino;
+ umode_t fake_mode;
+
+ if (!test_bit_le(bit_pos, d.bitmap)) {
+ bit_pos++;
+ continue;
+ }
+
+ de = &d.dentry[bit_pos];
+
+ if (unlikely(!de->name_len)) {
+ bit_pos++;
+ continue;
+ }
+
+ new_name.name = d.filename[bit_pos];
+ new_name.len = le16_to_cpu(de->name_len);
+
+ ino = le32_to_cpu(de->ino);
+ fake_mode = get_de_type(de) << S_SHIFT;
+
+ err = f2fs_add_regular_entry(dir, &new_name, NULL, NULL,
+ ino, fake_mode);
+ if (err)
+ goto punch_dentry_pages;
+
+ bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
+ }
+ return 0;
+punch_dentry_pages:
+ truncate_inode_pages(&dir->i_data, 0);
+ truncate_blocks(dir, 0, false);
+ remove_dirty_inode(dir);
+ return err;
+}
+
+static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage,
+ struct f2fs_inline_dentry *inline_dentry)
+{
+ struct f2fs_inline_dentry *backup_dentry;
+ int err;
+
+ backup_dentry = f2fs_kmalloc(F2FS_I_SB(dir),
+ sizeof(struct f2fs_inline_dentry), GFP_F2FS_ZERO);
+ if (!backup_dentry) {
+ f2fs_put_page(ipage, 1);
+ return -ENOMEM;
+ }
+
+ memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA);
+ truncate_inline_inode(dir, ipage, 0);
+
+ unlock_page(ipage);
+
+ err = f2fs_add_inline_entries(dir, backup_dentry);
+ if (err)
+ goto recover;
+
+ lock_page(ipage);
+
+ stat_dec_inline_dir(dir);
+ clear_inode_flag(dir, FI_INLINE_DENTRY);
+ kfree(backup_dentry);
+ return 0;
+recover:
+ lock_page(ipage);
+ memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA);
+ f2fs_i_depth_write(dir, 0);
+ f2fs_i_size_write(dir, MAX_INLINE_DATA);
+ set_page_dirty(ipage);
+ f2fs_put_page(ipage, 1);
+
+ kfree(backup_dentry);
+ return err;
+}
+
+static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
+ struct f2fs_inline_dentry *inline_dentry)
+{
+ if (!F2FS_I(dir)->i_dir_level)
+ return f2fs_move_inline_dirents(dir, ipage, inline_dentry);
+ else
+ return f2fs_move_rehashed_dirents(dir, ipage, inline_dentry);
+}
+
+int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name,
+ const struct qstr *orig_name,
+ struct inode *inode, nid_t ino, umode_t mode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+ struct page *ipage;
+ unsigned int bit_pos;
+ f2fs_hash_t name_hash;
+ struct f2fs_inline_dentry *dentry_blk = NULL;
+ struct f2fs_dentry_ptr d;
+ int slots = GET_DENTRY_SLOTS(new_name->len);
+ struct page *page = NULL;
+ int err = 0;
+
+ ipage = get_node_page(sbi, dir->i_ino);
+ if (IS_ERR(ipage))
+ return PTR_ERR(ipage);
+
+ dentry_blk = inline_data_addr(ipage);
+ bit_pos = room_for_filename(&dentry_blk->dentry_bitmap,
+ slots, NR_INLINE_DENTRY);
+ if (bit_pos >= NR_INLINE_DENTRY) {
+ err = f2fs_convert_inline_dir(dir, ipage, dentry_blk);
+ if (err)
+ return err;
+ err = -EAGAIN;
+ goto out;
+ }
+
+ if (inode) {
+ down_write(&F2FS_I(inode)->i_sem);
+ page = init_inode_metadata(inode, dir, new_name,
+ orig_name, ipage);
+ if (IS_ERR(page)) {
+ err = PTR_ERR(page);
+ goto fail;
+ }
+ }
+
+ f2fs_wait_on_page_writeback(ipage, NODE, true);
+
+ name_hash = f2fs_dentry_hash(new_name, NULL);
+ make_dentry_ptr_inline(NULL, &d, dentry_blk);
+ f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos);
+
+ set_page_dirty(ipage);
+
+ /* we don't need to mark_inode_dirty now */
+ if (inode) {
+ f2fs_i_pino_write(inode, dir->i_ino);
+ f2fs_put_page(page, 1);
+ }
+
+ update_parent_metadata(dir, inode, 0);
+fail:
+ if (inode)
+ up_write(&F2FS_I(inode)->i_sem);
+out:
+ f2fs_put_page(ipage, 1);
+ return err;
+}
+
+void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page,
+ struct inode *dir, struct inode *inode)
+{
+ struct f2fs_inline_dentry *inline_dentry;
+ int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len));
+ unsigned int bit_pos;
+ int i;
+
+ lock_page(page);
+ f2fs_wait_on_page_writeback(page, NODE, true);
+
+ inline_dentry = inline_data_addr(page);
+ bit_pos = dentry - inline_dentry->dentry;
+ for (i = 0; i < slots; i++)
+ __clear_bit_le(bit_pos + i,
+ &inline_dentry->dentry_bitmap);
+
+ set_page_dirty(page);
+ f2fs_put_page(page, 1);
+
+ dir->i_ctime = dir->i_mtime = current_time(dir);
+ f2fs_mark_inode_dirty_sync(dir, false);
+
+ if (inode)
+ f2fs_drop_nlink(dir, inode);
+}
+
+bool f2fs_empty_inline_dir(struct inode *dir)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+ struct page *ipage;
+ unsigned int bit_pos = 2;
+ struct f2fs_inline_dentry *dentry_blk;
+
+ ipage = get_node_page(sbi, dir->i_ino);
+ if (IS_ERR(ipage))
+ return false;
+
+ dentry_blk = inline_data_addr(ipage);
+ bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
+ NR_INLINE_DENTRY,
+ bit_pos);
+
+ f2fs_put_page(ipage, 1);
+
+ if (bit_pos < NR_INLINE_DENTRY)
+ return false;
+
+ return true;
+}
+
+int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
+ struct fscrypt_str *fstr)
+{
+ struct inode *inode = file_inode(file);
+ struct f2fs_inline_dentry *inline_dentry = NULL;
+ struct page *ipage = NULL;
+ struct f2fs_dentry_ptr d;
+ int err;
+
+ if (ctx->pos == NR_INLINE_DENTRY)
+ return 0;
+
+ ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
+ if (IS_ERR(ipage))
+ return PTR_ERR(ipage);
+
+ inline_dentry = inline_data_addr(ipage);
+
+ make_dentry_ptr_inline(inode, &d, inline_dentry);
+
+ err = f2fs_fill_dentries(ctx, &d, 0, fstr);
+ if (!err)
+ ctx->pos = NR_INLINE_DENTRY;
+
+ f2fs_put_page(ipage, 1);
+ return err < 0 ? err : 0;
+}
+
+int f2fs_inline_data_fiemap(struct inode *inode,
+ struct fiemap_extent_info *fieinfo, __u64 start, __u64 len)
+{
+ __u64 byteaddr, ilen;
+ __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED |
+ FIEMAP_EXTENT_LAST;
+ struct node_info ni;
+ struct page *ipage;
+ int err = 0;
+
+ ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
+ if (IS_ERR(ipage))
+ return PTR_ERR(ipage);
+
+ if (!f2fs_has_inline_data(inode)) {
+ err = -EAGAIN;
+ goto out;
+ }
+
+ ilen = min_t(size_t, MAX_INLINE_DATA, i_size_read(inode));
+ if (start >= ilen)
+ goto out;
+ if (start + len < ilen)
+ ilen = start + len;
+ ilen -= start;
+
+ get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni);
+ byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits;
+ byteaddr += (char *)inline_data_addr(ipage) - (char *)F2FS_INODE(ipage);
+ err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags);
+out:
+ f2fs_put_page(ipage, 1);
+ return err;
+}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 0deead4505e7..20782deb51f3 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -11,14 +11,22 @@
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
#include <linux/buffer_head.h>
+#include <linux/backing-dev.h>
#include <linux/writeback.h>
-#include <linux/bitops.h>
#include "f2fs.h"
#include "node.h"
#include <trace/events/f2fs.h>
+void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync)
+{
+ if (f2fs_inode_dirtied(inode, sync))
+ return;
+
+ mark_inode_dirty_sync(inode);
+}
+
void f2fs_set_inode_flags(struct inode *inode)
{
unsigned int flags = F2FS_I(inode)->i_flags;
@@ -34,8 +42,9 @@ void f2fs_set_inode_flags(struct inode *inode)
new_fl |= S_NOATIME;
if (flags & FS_DIRSYNC_FL)
new_fl |= S_DIRSYNC;
- set_mask_bits(&inode->i_flags,
- S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC, new_fl);
+ inode_set_flags(inode, new_fl,
+ S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
+ f2fs_mark_inode_dirty_sync(inode, false);
}
static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
@@ -51,6 +60,15 @@ static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
}
}
+static bool __written_first_block(struct f2fs_inode *ri)
+{
+ block_t addr = le32_to_cpu(ri->i_addr[0]);
+
+ if (addr != NEW_ADDR && addr != NULL_ADDR)
+ return true;
+ return false;
+}
+
static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
{
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
@@ -67,6 +85,25 @@ static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
}
}
+static void __recover_inline_status(struct inode *inode, struct page *ipage)
+{
+ void *inline_data = inline_data_addr(ipage);
+ __le32 *start = inline_data;
+ __le32 *end = start + MAX_INLINE_DATA / sizeof(__le32);
+
+ while (start < end) {
+ if (*start++) {
+ f2fs_wait_on_page_writeback(ipage, NODE, true);
+
+ set_inode_flag(inode, FI_DATA_EXIST);
+ set_raw_inline(inode, F2FS_INODE(ipage));
+ set_page_dirty(ipage);
+ return;
+ }
+ }
+ return;
+}
+
static int do_read_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -111,13 +148,30 @@ static int do_read_inode(struct inode *inode)
fi->i_pino = le32_to_cpu(ri->i_pino);
fi->i_dir_level = ri->i_dir_level;
- get_extent_info(&fi->ext, ri->i_ext);
- get_inline_info(fi, ri);
+ if (f2fs_init_extent_tree(inode, &ri->i_ext))
+ set_page_dirty(node_page);
+
+ get_inline_info(inode, ri);
+
+ /* check data exist */
+ if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode))
+ __recover_inline_status(inode, node_page);
/* get rdev by using inline_info */
__get_inode_rdev(inode, ri);
+ if (__written_first_block(ri))
+ set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
+
+ if (!need_inode_block_update(sbi, inode->i_ino))
+ fi->last_disk_size = inode->i_size;
+
f2fs_put_page(node_page, 1);
+
+ stat_inc_inline_xattr(inode);
+ stat_inc_inline_inode(inode);
+ stat_inc_inline_dir(inode);
+
return 0;
}
@@ -156,9 +210,13 @@ make_now:
inode->i_op = &f2fs_dir_inode_operations;
inode->i_fop = &f2fs_dir_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
- mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
+ mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO);
} else if (S_ISLNK(inode->i_mode)) {
- inode->i_op = &f2fs_symlink_inode_operations;
+ if (f2fs_encrypted_inode(inode))
+ inode->i_op = &f2fs_encrypted_symlink_inode_operations;
+ else
+ inode->i_op = &f2fs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &f2fs_dblock_aops;
} else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
@@ -178,11 +236,28 @@ bad_inode:
return ERR_PTR(ret);
}
-void update_inode(struct inode *inode, struct page *node_page)
+struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino)
+{
+ struct inode *inode;
+retry:
+ inode = f2fs_iget(sb, ino);
+ if (IS_ERR(inode)) {
+ if (PTR_ERR(inode) == -ENOMEM) {
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ goto retry;
+ }
+ }
+ return inode;
+}
+
+int update_inode(struct inode *inode, struct page *node_page)
{
struct f2fs_inode *ri;
+ struct extent_tree *et = F2FS_I(inode)->extent_tree;
- f2fs_wait_on_page_writeback(node_page, NODE);
+ f2fs_inode_synced(inode);
+
+ f2fs_wait_on_page_writeback(node_page, NODE, true);
ri = F2FS_INODE(node_page);
@@ -193,8 +268,15 @@ void update_inode(struct inode *inode, struct page *node_page)
ri->i_links = cpu_to_le32(inode->i_nlink);
ri->i_size = cpu_to_le64(i_size_read(inode));
ri->i_blocks = cpu_to_le64(inode->i_blocks);
- set_raw_extent(&F2FS_I(inode)->ext, &ri->i_ext);
- set_raw_inline(F2FS_I(inode), ri);
+
+ if (et) {
+ read_lock(&et->lock);
+ set_raw_extent(&et->largest, &ri->i_ext);
+ read_unlock(&et->lock);
+ } else {
+ memset(&ri->i_ext, 0, sizeof(ri->i_ext));
+ }
+ set_raw_inline(inode, ri);
ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
ri->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
@@ -211,15 +293,19 @@ void update_inode(struct inode *inode, struct page *node_page)
__set_inode_rdev(inode, ri);
set_cold_node(inode, node_page);
- set_page_dirty(node_page);
- clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
+ /* deleted inode */
+ if (inode->i_nlink == 0)
+ clear_inline_node(node_page);
+
+ return set_page_dirty(node_page);
}
-void update_inode_page(struct inode *inode)
+int update_inode_page(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct page *node_page;
+ int ret = 0;
retry:
node_page = get_node_page(sbi, inode->i_ino);
if (IS_ERR(node_page)) {
@@ -228,12 +314,13 @@ retry:
cond_resched();
goto retry;
} else if (err != -ENOENT) {
- f2fs_stop_checkpoint(sbi);
+ f2fs_stop_checkpoint(sbi, false);
}
- return;
+ return 0;
}
- update_inode(inode, node_page);
+ ret = update_inode(inode, node_page);
f2fs_put_page(node_page, 1);
+ return ret;
}
int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
@@ -244,20 +331,16 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
inode->i_ino == F2FS_META_INO(sbi))
return 0;
- if (!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_INODE))
+ if (!is_inode_flag_set(inode, FI_DIRTY_INODE))
return 0;
/*
- * We need to lock here to prevent from producing dirty node pages
+ * We need to balance fs here to prevent from producing dirty node pages
* during the urgent cleaning time when runing out of free sections.
*/
- f2fs_lock_op(sbi);
update_inode_page(inode);
- f2fs_unlock_op(sbi);
-
- if (wbc)
- f2fs_balance_fs(sbi);
-
+ if (wbc && wbc->nr_to_write)
+ f2fs_balance_fs(sbi, true);
return 0;
}
@@ -268,46 +351,85 @@ void f2fs_evict_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
nid_t xnid = F2FS_I(inode)->i_xattr_nid;
+ int err = 0;
/* some remained atomic pages should discarded */
- if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
- commit_inmem_pages(inode, true);
+ if (f2fs_is_atomic_file(inode))
+ drop_inmem_pages(inode);
trace_f2fs_evict_inode(inode);
- truncate_inode_pages_final(&inode->i_data);
+ truncate_inode_pages(&inode->i_data, 0);
if (inode->i_ino == F2FS_NODE_INO(sbi) ||
inode->i_ino == F2FS_META_INO(sbi))
goto out_clear;
f2fs_bug_on(sbi, get_dirty_pages(inode));
- remove_dirty_dir_inode(inode);
+ remove_dirty_inode(inode);
+
+ f2fs_destroy_extent_tree(inode);
if (inode->i_nlink || is_bad_inode(inode))
goto no_delete;
+ remove_ino_entry(sbi, inode->i_ino, APPEND_INO);
+ remove_ino_entry(sbi, inode->i_ino, UPDATE_INO);
+
sb_start_intwrite(inode->i_sb);
- set_inode_flag(F2FS_I(inode), FI_NO_ALLOC);
+ set_inode_flag(inode, FI_NO_ALLOC);
i_size_write(inode, 0);
-
+retry:
if (F2FS_HAS_BLOCKS(inode))
- f2fs_truncate(inode);
+ err = f2fs_truncate(inode);
- f2fs_lock_op(sbi);
- remove_inode_page(inode);
- stat_dec_inline_inode(inode);
- f2fs_unlock_op(sbi);
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (time_to_inject(sbi, FAULT_EVICT_INODE)) {
+ f2fs_show_injection_info(FAULT_EVICT_INODE);
+ err = -EIO;
+ }
+#endif
+ if (!err) {
+ f2fs_lock_op(sbi);
+ err = remove_inode_page(inode);
+ f2fs_unlock_op(sbi);
+ if (err == -ENOENT)
+ err = 0;
+ }
+ /* give more chances, if ENOMEM case */
+ if (err == -ENOMEM) {
+ err = 0;
+ goto retry;
+ }
+
+ if (err)
+ update_inode_page(inode);
sb_end_intwrite(inode->i_sb);
no_delete:
- invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino);
+ stat_dec_inline_xattr(inode);
+ stat_dec_inline_dir(inode);
+ stat_dec_inline_inode(inode);
+
+ /* ino == 0, if f2fs_new_inode() was failed t*/
+ if (inode->i_ino)
+ invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino,
+ inode->i_ino);
if (xnid)
invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid);
- if (is_inode_flag_set(F2FS_I(inode), FI_APPEND_WRITE))
- add_dirty_inode(sbi, inode->i_ino, APPEND_INO);
- if (is_inode_flag_set(F2FS_I(inode), FI_UPDATE_WRITE))
- add_dirty_inode(sbi, inode->i_ino, UPDATE_INO);
+ if (inode->i_nlink) {
+ if (is_inode_flag_set(inode, FI_APPEND_WRITE))
+ add_ino_entry(sbi, inode->i_ino, APPEND_INO);
+ if (is_inode_flag_set(inode, FI_UPDATE_WRITE))
+ add_ino_entry(sbi, inode->i_ino, UPDATE_INO);
+ }
+ if (is_inode_flag_set(inode, FI_FREE_NID)) {
+ alloc_nid_failed(sbi, inode->i_ino);
+ clear_inode_flag(inode, FI_FREE_NID);
+ }
+ f2fs_bug_on(sbi, err &&
+ !exist_written_data(sbi, inode->i_ino, ORPHAN_INO));
out_clear:
+ fscrypt_put_encryption_info(inode, NULL);
clear_inode(inode);
}
@@ -315,19 +437,45 @@ out_clear:
void handle_failed_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct node_info ni;
+ /*
+ * clear nlink of inode in order to release resource of inode
+ * immediately.
+ */
clear_nlink(inode);
- make_bad_inode(inode);
- unlock_new_inode(inode);
- i_size_write(inode, 0);
- if (F2FS_HAS_BLOCKS(inode))
- f2fs_truncate(inode);
+ /*
+ * we must call this to avoid inode being remained as dirty, resulting
+ * in a panic when flushing dirty inodes in gdirty_list.
+ */
+ update_inode_page(inode);
+ f2fs_inode_synced(inode);
- remove_inode_page(inode);
- stat_dec_inline_inode(inode);
+ /* don't make bad inode, since it becomes a regular file. */
+ unlock_new_inode(inode);
+
+ /*
+ * Note: we should add inode to orphan list before f2fs_unlock_op()
+ * so we can prevent losing this orphan when encoutering checkpoint
+ * and following suddenly power-off.
+ */
+ get_node_info(sbi, inode->i_ino, &ni);
+
+ if (ni.blk_addr != NULL_ADDR) {
+ int err = acquire_orphan_inode(sbi);
+ if (err) {
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "Too many orphan inodes, run fsck to fix.");
+ } else {
+ add_orphan_inode(inode);
+ }
+ alloc_nid_done(sbi, inode->i_ino);
+ } else {
+ set_inode_flag(inode, FI_FREE_NID);
+ }
- alloc_nid_failed(sbi, inode->i_ino);
f2fs_unlock_op(sbi);
/* iput will drop the inode object */
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 0d2526e5aa11..b5a523972ffd 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -9,11 +9,13 @@
* published by the Free Software Foundation.
*/
#include <linux/fs.h>
+#include <linux/namei.h>
#include <linux/f2fs_fs.h>
#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/ctype.h>
#include <linux/dcache.h>
+#include <linux/namei.h>
#include "f2fs.h"
#include "node.h"
@@ -45,28 +47,44 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
inode->i_ino = ino;
inode->i_blocks = 0;
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
inode->i_generation = sbi->s_next_generation++;
err = insert_inode_locked(inode);
if (err) {
err = -EINVAL;
nid_free = true;
- goto out;
+ goto fail;
}
+
+ /* If the directory encrypted, then we should encrypt the inode. */
+ if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode))
+ f2fs_set_encrypted_inode(inode);
+
+ set_inode_flag(inode, FI_NEW_INODE);
+
+ if (test_opt(sbi, INLINE_XATTR))
+ set_inode_flag(inode, FI_INLINE_XATTR);
+ if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode))
+ set_inode_flag(inode, FI_INLINE_DATA);
+ if (f2fs_may_inline_dentry(inode))
+ set_inode_flag(inode, FI_INLINE_DENTRY);
+
+ f2fs_init_extent_tree(inode, NULL);
+
+ stat_inc_inline_xattr(inode);
+ stat_inc_inline_inode(inode);
+ stat_inc_inline_dir(inode);
+
trace_f2fs_new_inode(inode, 0);
- mark_inode_dirty(inode);
return inode;
-out:
- clear_nlink(inode);
- unlock_new_inode(inode);
fail:
trace_f2fs_new_inode(inode, err);
make_bad_inode(inode);
- iput(inode);
if (nid_free)
- alloc_nid_failed(sbi, ino);
+ set_inode_flag(inode, FI_FREE_NID);
+ iput(inode);
return ERR_PTR(err);
}
@@ -74,11 +92,23 @@ static int is_multimedia_file(const unsigned char *s, const char *sub)
{
size_t slen = strlen(s);
size_t sublen = strlen(sub);
+ int i;
- if (sublen > slen)
+ /*
+ * filename format of multimedia file should be defined as:
+ * "filename + '.' + extension + (optional: '.' + temp extension)".
+ */
+ if (slen < sublen + 2)
return 0;
- return !strncasecmp(s + slen - sublen, sub, sublen);
+ for (i = 1; i < slen - sublen; i++) {
+ if (s[i] != '.')
+ continue;
+ if (!strncasecmp(s + i + 1, sub, sublen))
+ return 1;
+ }
+
+ return 0;
}
/*
@@ -107,8 +137,6 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
nid_t ino = 0;
int err;
- f2fs_balance_fs(sbi);
-
inode = f2fs_new_inode(dir, mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -131,6 +159,11 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
d_instantiate(dentry, inode);
unlock_new_inode(inode);
+
+ if (IS_DIRSYNC(dir))
+ f2fs_sync_fs(sbi->sb, 1);
+
+ f2fs_balance_fs(sbi, true);
return 0;
out:
handle_failed_inode(inode);
@@ -140,16 +173,20 @@ out:
static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
- struct inode *inode = old_dentry->d_inode;
+ struct inode *inode = d_inode(old_dentry);
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
int err;
- f2fs_balance_fs(sbi);
+ if (f2fs_encrypted_inode(dir) &&
+ !fscrypt_has_permitted_context(dir, inode))
+ return -EPERM;
- inode->i_ctime = CURRENT_TIME;
+ f2fs_balance_fs(sbi, true);
+
+ inode->i_ctime = current_time(inode);
ihold(inode);
- set_inode_flag(F2FS_I(inode), FI_INC_LINK);
+ set_inode_flag(inode, FI_INC_LINK);
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
if (err)
@@ -157,9 +194,12 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
f2fs_unlock_op(sbi);
d_instantiate(dentry, inode);
+
+ if (IS_DIRSYNC(dir))
+ f2fs_sync_fs(sbi->sb, 1);
return 0;
out:
- clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
+ clear_inode_flag(inode, FI_INC_LINK);
iput(inode);
f2fs_unlock_op(sbi);
return err;
@@ -168,10 +208,64 @@ out:
struct dentry *f2fs_get_parent(struct dentry *child)
{
struct qstr dotdot = QSTR_INIT("..", 2);
- unsigned long ino = f2fs_inode_by_name(child->d_inode, &dotdot);
- if (!ino)
+ struct page *page;
+ unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot, &page);
+ if (!ino) {
+ if (IS_ERR(page))
+ return ERR_CAST(page);
return ERR_PTR(-ENOENT);
- return d_obtain_alias(f2fs_iget(child->d_inode->i_sb, ino));
+ }
+ return d_obtain_alias(f2fs_iget(child->d_sb, ino));
+}
+
+static int __recover_dot_dentries(struct inode *dir, nid_t pino)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+ struct qstr dot = QSTR_INIT(".", 1);
+ struct qstr dotdot = QSTR_INIT("..", 2);
+ struct f2fs_dir_entry *de;
+ struct page *page;
+ int err = 0;
+
+ if (f2fs_readonly(sbi->sb)) {
+ f2fs_msg(sbi->sb, KERN_INFO,
+ "skip recovering inline_dots inode (ino:%lu, pino:%u) "
+ "in readonly mountpoint", dir->i_ino, pino);
+ return 0;
+ }
+
+ f2fs_balance_fs(sbi, true);
+
+ f2fs_lock_op(sbi);
+
+ de = f2fs_find_entry(dir, &dot, &page);
+ if (de) {
+ f2fs_dentry_kunmap(dir, page);
+ f2fs_put_page(page, 0);
+ } else if (IS_ERR(page)) {
+ err = PTR_ERR(page);
+ goto out;
+ } else {
+ err = __f2fs_add_link(dir, &dot, NULL, dir->i_ino, S_IFDIR);
+ if (err)
+ goto out;
+ }
+
+ de = f2fs_find_entry(dir, &dotdot, &page);
+ if (de) {
+ f2fs_dentry_kunmap(dir, page);
+ f2fs_put_page(page, 0);
+ } else if (IS_ERR(page)) {
+ err = PTR_ERR(page);
+ } else {
+ err = __f2fs_add_link(dir, &dotdot, NULL, pino, S_IFDIR);
+ }
+out:
+ if (!err)
+ clear_inode_flag(dir, FI_INLINE_DOTS);
+
+ f2fs_unlock_op(sbi);
+ return err;
}
static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
@@ -180,74 +274,163 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
struct inode *inode = NULL;
struct f2fs_dir_entry *de;
struct page *page;
+ nid_t ino;
+ int err = 0;
+ unsigned int root_ino = F2FS_ROOT_INO(F2FS_I_SB(dir));
+
+ if (f2fs_encrypted_inode(dir)) {
+ int res = fscrypt_get_encryption_info(dir);
+
+ /*
+ * DCACHE_ENCRYPTED_WITH_KEY is set if the dentry is
+ * created while the directory was encrypted and we
+ * don't have access to the key.
+ */
+ if (fscrypt_has_encryption_key(dir))
+ fscrypt_set_encrypted_dentry(dentry);
+ fscrypt_set_d_op(dentry);
+ if (res && res != -ENOKEY)
+ return ERR_PTR(res);
+ }
if (dentry->d_name.len > F2FS_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
de = f2fs_find_entry(dir, &dentry->d_name, &page);
- if (de) {
- nid_t ino = le32_to_cpu(de->ino);
- kunmap(page);
- f2fs_put_page(page, 0);
+ if (!de) {
+ if (IS_ERR(page))
+ return (struct dentry *)page;
+ return d_splice_alias(inode, dentry);
+ }
+
+ ino = le32_to_cpu(de->ino);
+ f2fs_dentry_kunmap(dir, page);
+ f2fs_put_page(page, 0);
- inode = f2fs_iget(dir->i_sb, ino);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
+ inode = f2fs_iget(dir->i_sb, ino);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
- stat_inc_inline_inode(inode);
+ if ((dir->i_ino == root_ino) && f2fs_has_inline_dots(dir)) {
+ err = __recover_dot_dentries(dir, root_ino);
+ if (err)
+ goto err_out;
}
+ if (f2fs_has_inline_dots(inode)) {
+ err = __recover_dot_dentries(inode, dir->i_ino);
+ if (err)
+ goto err_out;
+ }
+ if (f2fs_encrypted_inode(dir) &&
+ (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
+ !fscrypt_has_permitted_context(dir, inode)) {
+ f2fs_msg(inode->i_sb, KERN_WARNING,
+ "Inconsistent encryption contexts: %lu/%lu",
+ dir->i_ino, inode->i_ino);
+ err = -EPERM;
+ goto err_out;
+ }
return d_splice_alias(inode, dentry);
+
+err_out:
+ iput(inode);
+ return ERR_PTR(err);
}
static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct f2fs_dir_entry *de;
struct page *page;
int err = -ENOENT;
trace_f2fs_unlink_enter(dir, dentry);
- f2fs_balance_fs(sbi);
de = f2fs_find_entry(dir, &dentry->d_name, &page);
- if (!de)
+ if (!de) {
+ if (IS_ERR(page))
+ err = PTR_ERR(page);
goto fail;
+ }
+
+ f2fs_balance_fs(sbi, true);
f2fs_lock_op(sbi);
err = acquire_orphan_inode(sbi);
if (err) {
f2fs_unlock_op(sbi);
- kunmap(page);
+ f2fs_dentry_kunmap(dir, page);
f2fs_put_page(page, 0);
goto fail;
}
- f2fs_delete_entry(de, page, inode);
+ f2fs_delete_entry(de, page, dir, inode);
f2fs_unlock_op(sbi);
- /* In order to evict this inode, we set it dirty */
- mark_inode_dirty(inode);
+ if (IS_DIRSYNC(dir))
+ f2fs_sync_fs(sbi->sb, 1);
fail:
trace_f2fs_unlink_exit(inode, err);
return err;
}
+static void *f2fs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+ struct page *page;
+ char *link;
+
+ page = page_follow_link_light(dentry, nd);
+ if (IS_ERR(page))
+ return page;
+
+ link = nd_get_link(nd);
+ if (IS_ERR(link))
+ return link;
+
+ /* this is broken symlink case */
+ if (*link == 0) {
+ kunmap(page);
+ page_cache_release(page);
+ return ERR_PTR(-ENOENT);
+ }
+ return page;
+}
+
static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
const char *symname)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode;
- size_t symlen = strlen(symname) + 1;
+ size_t len = strlen(symname);
+ struct fscrypt_str disk_link = FSTR_INIT((char *)symname, len + 1);
+ struct fscrypt_symlink_data *sd = NULL;
int err;
- f2fs_balance_fs(sbi);
+ if (f2fs_encrypted_inode(dir)) {
+ err = fscrypt_get_encryption_info(dir);
+ if (err)
+ return err;
+
+ if (!fscrypt_has_encryption_key(dir))
+ return -ENOKEY;
+
+ disk_link.len = (fscrypt_fname_encrypted_size(dir, len) +
+ sizeof(struct fscrypt_symlink_data));
+ }
+
+ if (disk_link.len > dir->i_sb->s_blocksize)
+ return -ENAMETOOLONG;
inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO);
if (IS_ERR(inode))
return PTR_ERR(inode);
- inode->i_op = &f2fs_symlink_inode_operations;
+ if (f2fs_encrypted_inode(inode))
+ inode->i_op = &f2fs_encrypted_symlink_inode_operations;
+ else
+ inode->i_op = &f2fs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &f2fs_dblock_aops;
f2fs_lock_op(sbi);
@@ -255,12 +438,65 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
if (err)
goto out;
f2fs_unlock_op(sbi);
-
- err = page_symlink(inode, symname, symlen);
alloc_nid_done(sbi, inode->i_ino);
+ if (f2fs_encrypted_inode(inode)) {
+ struct qstr istr = QSTR_INIT(symname, len);
+ struct fscrypt_str ostr;
+
+ sd = kzalloc(disk_link.len, GFP_NOFS);
+ if (!sd) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+
+ err = fscrypt_get_encryption_info(inode);
+ if (err)
+ goto err_out;
+
+ if (!fscrypt_has_encryption_key(inode)) {
+ err = -ENOKEY;
+ goto err_out;
+ }
+
+ ostr.name = sd->encrypted_path;
+ ostr.len = disk_link.len;
+ err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr);
+ if (err)
+ goto err_out;
+
+ sd->len = cpu_to_le16(ostr.len);
+ disk_link.name = (char *)sd;
+ }
+
+ err = page_symlink(inode, disk_link.name, disk_link.len);
+
+err_out:
d_instantiate(dentry, inode);
unlock_new_inode(inode);
+
+ /*
+ * Let's flush symlink data in order to avoid broken symlink as much as
+ * possible. Nevertheless, fsyncing is the best way, but there is no
+ * way to get a file descriptor in order to flush that.
+ *
+ * Note that, it needs to do dir->fsync to make this recoverable.
+ * If the symlink path is stored into inline_data, there is no
+ * performance regression.
+ */
+ if (!err) {
+ filemap_write_and_wait_range(inode->i_mapping, 0,
+ disk_link.len - 1);
+
+ if (IS_DIRSYNC(dir))
+ f2fs_sync_fs(sbi->sb, 1);
+ } else {
+ f2fs_unlink(dir, dentry);
+ }
+
+ kfree(sd);
+
+ f2fs_balance_fs(sbi, true);
return err;
out:
handle_failed_inode(inode);
@@ -273,8 +509,6 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
struct inode *inode;
int err;
- f2fs_balance_fs(sbi);
-
inode = f2fs_new_inode(dir, S_IFDIR | mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -282,9 +516,9 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
inode->i_op = &f2fs_dir_inode_operations;
inode->i_fop = &f2fs_dir_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
- mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
+ mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO);
- set_inode_flag(F2FS_I(inode), FI_INC_LINK);
+ set_inode_flag(inode, FI_INC_LINK);
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
if (err)
@@ -296,17 +530,21 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
d_instantiate(dentry, inode);
unlock_new_inode(inode);
+ if (IS_DIRSYNC(dir))
+ f2fs_sync_fs(sbi->sb, 1);
+
+ f2fs_balance_fs(sbi, true);
return 0;
out_fail:
- clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
+ clear_inode_flag(inode, FI_INC_LINK);
handle_failed_inode(inode);
return err;
}
static int f2fs_rmdir(struct inode *dir, struct dentry *dentry)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
if (f2fs_empty_dir(inode))
return f2fs_unlink(dir, dentry);
return -ENOTEMPTY;
@@ -322,8 +560,6 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
if (!new_valid_dev(rdev))
return -EINVAL;
- f2fs_balance_fs(sbi);
-
inode = f2fs_new_inode(dir, mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -338,51 +574,158 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
f2fs_unlock_op(sbi);
alloc_nid_done(sbi, inode->i_ino);
+
d_instantiate(dentry, inode);
unlock_new_inode(inode);
+
+ if (IS_DIRSYNC(dir))
+ f2fs_sync_fs(sbi->sb, 1);
+
+ f2fs_balance_fs(sbi, true);
+ return 0;
+out:
+ handle_failed_inode(inode);
+ return err;
+}
+
+static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
+ umode_t mode, struct inode **whiteout)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+ struct inode *inode;
+ int err;
+
+ inode = f2fs_new_inode(dir, mode);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ if (whiteout) {
+ init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
+ inode->i_op = &f2fs_special_inode_operations;
+ } else {
+ inode->i_op = &f2fs_file_inode_operations;
+ inode->i_fop = &f2fs_file_operations;
+ inode->i_mapping->a_ops = &f2fs_dblock_aops;
+ }
+
+ f2fs_lock_op(sbi);
+ err = acquire_orphan_inode(sbi);
+ if (err)
+ goto out;
+
+ err = f2fs_do_tmpfile(inode, dir);
+ if (err)
+ goto release_out;
+
+ /*
+ * add this non-linked tmpfile to orphan list, in this way we could
+ * remove all unused data of tmpfile after abnormal power-off.
+ */
+ add_orphan_inode(inode);
+ alloc_nid_done(sbi, inode->i_ino);
+
+ if (whiteout) {
+ f2fs_i_links_write(inode, false);
+ *whiteout = inode;
+ } else {
+ d_tmpfile(dentry, inode);
+ }
+ /* link_count was changed by d_tmpfile as well. */
+ f2fs_unlock_op(sbi);
+ unlock_new_inode(inode);
+
+ f2fs_balance_fs(sbi, true);
return 0;
+
+release_out:
+ release_orphan_inode(sbi);
out:
handle_failed_inode(inode);
return err;
}
+static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ if (f2fs_encrypted_inode(dir)) {
+ int err = fscrypt_get_encryption_info(dir);
+ if (err)
+ return err;
+ }
+
+ return __f2fs_tmpfile(dir, dentry, mode, NULL);
+}
+
+static int f2fs_create_whiteout(struct inode *dir, struct inode **whiteout)
+{
+ return __f2fs_tmpfile(dir, NULL, S_IFCHR | WHITEOUT_MODE, whiteout);
+}
+
static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir);
- struct inode *old_inode = old_dentry->d_inode;
- struct inode *new_inode = new_dentry->d_inode;
+ struct inode *old_inode = d_inode(old_dentry);
+ struct inode *new_inode = d_inode(new_dentry);
+ struct inode *whiteout = NULL;
struct page *old_dir_page;
- struct page *old_page, *new_page;
+ struct page *old_page, *new_page = NULL;
struct f2fs_dir_entry *old_dir_entry = NULL;
struct f2fs_dir_entry *old_entry;
struct f2fs_dir_entry *new_entry;
+ bool is_old_inline = f2fs_has_inline_dentry(old_dir);
int err = -ENOENT;
- f2fs_balance_fs(sbi);
+ if ((f2fs_encrypted_inode(old_dir) &&
+ !fscrypt_has_encryption_key(old_dir)) ||
+ (f2fs_encrypted_inode(new_dir) &&
+ !fscrypt_has_encryption_key(new_dir)))
+ return -ENOKEY;
+
+ if ((old_dir != new_dir) && f2fs_encrypted_inode(new_dir) &&
+ !fscrypt_has_permitted_context(new_dir, old_inode)) {
+ err = -EPERM;
+ goto out;
+ }
old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
- if (!old_entry)
+ if (!old_entry) {
+ if (IS_ERR(old_page))
+ err = PTR_ERR(old_page);
goto out;
+ }
if (S_ISDIR(old_inode->i_mode)) {
- err = -EIO;
old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page);
- if (!old_dir_entry)
+ if (!old_dir_entry) {
+ if (IS_ERR(old_dir_page))
+ err = PTR_ERR(old_dir_page);
goto out_old;
+ }
+ }
+
+ if (flags & RENAME_WHITEOUT) {
+ err = f2fs_create_whiteout(old_dir, &whiteout);
+ if (err)
+ goto out_dir;
}
if (new_inode) {
err = -ENOTEMPTY;
if (old_dir_entry && !f2fs_empty_dir(new_inode))
- goto out_dir;
+ goto out_whiteout;
err = -ENOENT;
new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name,
&new_page);
- if (!new_entry)
- goto out_dir;
+ if (!new_entry) {
+ if (IS_ERR(new_page))
+ err = PTR_ERR(new_page);
+ goto out_whiteout;
+ }
+
+ f2fs_balance_fs(sbi, true);
f2fs_lock_op(sbi);
@@ -390,41 +733,53 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (err)
goto put_out_dir;
- if (update_dent_inode(old_inode, &new_dentry->d_name)) {
- release_orphan_inode(sbi);
- goto put_out_dir;
- }
-
f2fs_set_link(new_dir, new_entry, new_page, old_inode);
- new_inode->i_ctime = CURRENT_TIME;
+ new_inode->i_ctime = current_time(new_inode);
down_write(&F2FS_I(new_inode)->i_sem);
if (old_dir_entry)
- drop_nlink(new_inode);
- drop_nlink(new_inode);
+ f2fs_i_links_write(new_inode, false);
+ f2fs_i_links_write(new_inode, false);
up_write(&F2FS_I(new_inode)->i_sem);
- mark_inode_dirty(new_inode);
-
if (!new_inode->i_nlink)
- add_orphan_inode(sbi, new_inode->i_ino);
+ add_orphan_inode(new_inode);
else
release_orphan_inode(sbi);
-
- update_inode_page(old_inode);
- update_inode_page(new_inode);
} else {
+ f2fs_balance_fs(sbi, true);
+
f2fs_lock_op(sbi);
err = f2fs_add_link(new_dentry, old_inode);
if (err) {
f2fs_unlock_op(sbi);
- goto out_dir;
+ goto out_whiteout;
}
- if (old_dir_entry) {
- inc_nlink(new_dir);
- update_inode_page(new_dir);
+ if (old_dir_entry)
+ f2fs_i_links_write(new_dir, true);
+
+ /*
+ * old entry and new entry can locate in the same inline
+ * dentry in inode, when attaching new entry in inline dentry,
+ * it could force inline dentry conversion, after that,
+ * old_entry and old_page will point to wrong address, in
+ * order to avoid this, let's do the check and update here.
+ */
+ if (is_old_inline && !f2fs_has_inline_dentry(old_dir)) {
+ f2fs_put_page(old_page, 0);
+ old_page = NULL;
+
+ old_entry = f2fs_find_entry(old_dir,
+ &old_dentry->d_name, &old_page);
+ if (!old_entry) {
+ err = -ENOENT;
+ if (IS_ERR(old_page))
+ err = PTR_ERR(old_page);
+ f2fs_unlock_op(sbi);
+ goto out_dir;
+ }
}
}
@@ -432,39 +787,54 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
file_lost_pino(old_inode);
up_write(&F2FS_I(old_inode)->i_sem);
- old_inode->i_ctime = CURRENT_TIME;
- mark_inode_dirty(old_inode);
+ old_inode->i_ctime = current_time(old_inode);
+ f2fs_mark_inode_dirty_sync(old_inode, false);
+
+ f2fs_delete_entry(old_entry, old_page, old_dir, NULL);
- f2fs_delete_entry(old_entry, old_page, NULL);
+ if (whiteout) {
+ whiteout->i_state |= I_LINKABLE;
+ set_inode_flag(whiteout, FI_INC_LINK);
+ err = f2fs_add_link(old_dentry, whiteout);
+ if (err)
+ goto put_out_dir;
+ whiteout->i_state &= ~I_LINKABLE;
+ iput(whiteout);
+ }
if (old_dir_entry) {
- if (old_dir != new_dir) {
+ if (old_dir != new_dir && !whiteout) {
f2fs_set_link(old_inode, old_dir_entry,
old_dir_page, new_dir);
- update_inode_page(old_inode);
} else {
- kunmap(old_dir_page);
+ f2fs_dentry_kunmap(old_inode, old_dir_page);
f2fs_put_page(old_dir_page, 0);
}
- drop_nlink(old_dir);
- mark_inode_dirty(old_dir);
- update_inode_page(old_dir);
+ f2fs_i_links_write(old_dir, false);
}
f2fs_unlock_op(sbi);
+
+ if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
+ f2fs_sync_fs(sbi->sb, 1);
return 0;
put_out_dir:
f2fs_unlock_op(sbi);
- kunmap(new_page);
- f2fs_put_page(new_page, 0);
+ if (new_page) {
+ f2fs_dentry_kunmap(new_dir, new_page);
+ f2fs_put_page(new_page, 0);
+ }
+out_whiteout:
+ if (whiteout)
+ iput(whiteout);
out_dir:
if (old_dir_entry) {
- kunmap(old_dir_page);
+ f2fs_dentry_kunmap(old_inode, old_dir_page);
f2fs_put_page(old_dir_page, 0);
}
out_old:
- kunmap(old_page);
+ f2fs_dentry_kunmap(old_dir, old_page);
f2fs_put_page(old_page, 0);
out:
return err;
@@ -474,8 +844,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir);
- struct inode *old_inode = old_dentry->d_inode;
- struct inode *new_inode = new_dentry->d_inode;
+ struct inode *old_inode = d_inode(old_dentry);
+ struct inode *new_inode = d_inode(new_dentry);
struct page *old_dir_page, *new_dir_page;
struct page *old_page, *new_page;
struct f2fs_dir_entry *old_dir_entry = NULL, *new_dir_entry = NULL;
@@ -483,32 +853,52 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
int old_nlink = 0, new_nlink = 0;
int err = -ENOENT;
- f2fs_balance_fs(sbi);
+ if ((f2fs_encrypted_inode(old_dir) &&
+ !fscrypt_has_encryption_key(old_dir)) ||
+ (f2fs_encrypted_inode(new_dir) &&
+ !fscrypt_has_encryption_key(new_dir)))
+ return -ENOKEY;
+
+ if ((f2fs_encrypted_inode(old_dir) || f2fs_encrypted_inode(new_dir)) &&
+ (old_dir != new_dir) &&
+ (!fscrypt_has_permitted_context(new_dir, old_inode) ||
+ !fscrypt_has_permitted_context(old_dir, new_inode)))
+ return -EPERM;
old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
- if (!old_entry)
+ if (!old_entry) {
+ if (IS_ERR(old_page))
+ err = PTR_ERR(old_page);
goto out;
+ }
new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, &new_page);
- if (!new_entry)
+ if (!new_entry) {
+ if (IS_ERR(new_page))
+ err = PTR_ERR(new_page);
goto out_old;
+ }
/* prepare for updating ".." directory entry info later */
if (old_dir != new_dir) {
if (S_ISDIR(old_inode->i_mode)) {
- err = -EIO;
old_dir_entry = f2fs_parent_dir(old_inode,
&old_dir_page);
- if (!old_dir_entry)
+ if (!old_dir_entry) {
+ if (IS_ERR(old_dir_page))
+ err = PTR_ERR(old_dir_page);
goto out_new;
+ }
}
if (S_ISDIR(new_inode->i_mode)) {
- err = -EIO;
new_dir_entry = f2fs_parent_dir(new_inode,
&new_dir_page);
- if (!new_dir_entry)
+ if (!new_dir_entry) {
+ if (IS_ERR(new_dir_page))
+ err = PTR_ERR(new_dir_page);
goto out_old_dir;
+ }
}
}
@@ -522,20 +912,14 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
old_nlink = old_dir_entry ? -1 : 1;
new_nlink = -old_nlink;
err = -EMLINK;
- if ((old_nlink > 0 && old_inode->i_nlink >= F2FS_LINK_MAX) ||
- (new_nlink > 0 && new_inode->i_nlink >= F2FS_LINK_MAX))
+ if ((old_nlink > 0 && old_dir->i_nlink >= F2FS_LINK_MAX) ||
+ (new_nlink > 0 && new_dir->i_nlink >= F2FS_LINK_MAX))
goto out_new_dir;
}
- f2fs_lock_op(sbi);
+ f2fs_balance_fs(sbi, true);
- err = update_dent_inode(old_inode, &new_dentry->d_name);
- if (err)
- goto out_unlock;
-
- err = update_dent_inode(new_inode, &old_dentry->d_name);
- if (err)
- goto out_undo;
+ f2fs_lock_op(sbi);
/* update ".." directory entry info of old dentry */
if (old_dir_entry)
@@ -552,19 +936,13 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
file_lost_pino(old_inode);
up_write(&F2FS_I(old_inode)->i_sem);
- update_inode_page(old_inode);
-
old_dir->i_ctime = CURRENT_TIME;
if (old_nlink) {
down_write(&F2FS_I(old_dir)->i_sem);
- if (old_nlink < 0)
- drop_nlink(old_dir);
- else
- inc_nlink(old_dir);
+ f2fs_i_links_write(old_dir, old_nlink > 0);
up_write(&F2FS_I(old_dir)->i_sem);
}
- mark_inode_dirty(old_dir);
- update_inode_page(old_dir);
+ f2fs_mark_inode_dirty_sync(old_dir, false);
/* update directory entry info of new dir inode */
f2fs_set_link(new_dir, new_entry, new_page, old_inode);
@@ -573,42 +951,34 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
file_lost_pino(new_inode);
up_write(&F2FS_I(new_inode)->i_sem);
- update_inode_page(new_inode);
-
new_dir->i_ctime = CURRENT_TIME;
if (new_nlink) {
down_write(&F2FS_I(new_dir)->i_sem);
- if (new_nlink < 0)
- drop_nlink(new_dir);
- else
- inc_nlink(new_dir);
+ f2fs_i_links_write(new_dir, new_nlink > 0);
up_write(&F2FS_I(new_dir)->i_sem);
}
- mark_inode_dirty(new_dir);
- update_inode_page(new_dir);
+ f2fs_mark_inode_dirty_sync(new_dir, false);
f2fs_unlock_op(sbi);
+
+ if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
+ f2fs_sync_fs(sbi->sb, 1);
return 0;
-out_undo:
- /* Still we may fail to recover name info of f2fs_inode here */
- update_dent_inode(old_inode, &old_dentry->d_name);
-out_unlock:
- f2fs_unlock_op(sbi);
out_new_dir:
if (new_dir_entry) {
- kunmap(new_dir_page);
+ f2fs_dentry_kunmap(new_inode, new_dir_page);
f2fs_put_page(new_dir_page, 0);
}
out_old_dir:
if (old_dir_entry) {
- kunmap(old_dir_page);
+ f2fs_dentry_kunmap(old_inode, old_dir_page);
f2fs_put_page(old_dir_page, 0);
}
out_new:
- kunmap(new_page);
+ f2fs_dentry_kunmap(new_dir, new_page);
f2fs_put_page(new_page, 0);
out_old:
- kunmap(old_page);
+ f2fs_dentry_kunmap(old_dir, old_page);
f2fs_put_page(old_page, 0);
out:
return err;
@@ -618,7 +988,7 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags)
{
- if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
return -EINVAL;
if (flags & RENAME_EXCHANGE) {
@@ -629,51 +999,90 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry,
* VFS has already handled the new dentry existence case,
* here, we just deal with "RENAME_NOREPLACE" as regular rename.
*/
- return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry);
+ return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
}
-static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+static void *f2fs_encrypted_follow_link(struct dentry *dentry,
+ struct nameidata *nd)
{
- struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
- struct inode *inode;
- int err;
-
- inode = f2fs_new_inode(dir, mode);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
-
- inode->i_op = &f2fs_file_inode_operations;
- inode->i_fop = &f2fs_file_operations;
- inode->i_mapping->a_ops = &f2fs_dblock_aops;
-
- f2fs_lock_op(sbi);
- err = acquire_orphan_inode(sbi);
- if (err)
- goto out;
+ struct page *cpage = NULL;
+ char *caddr, *paddr = NULL;
+ struct fscrypt_str cstr = FSTR_INIT(NULL, 0);
+ struct fscrypt_str pstr = FSTR_INIT(NULL, 0);
+ struct fscrypt_symlink_data *sd;
+ struct inode *inode = d_inode(dentry);
+ u32 max_size = inode->i_sb->s_blocksize;
+ int res;
+
+ res = fscrypt_get_encryption_info(inode);
+ if (res)
+ return ERR_PTR(res);
+
+ cpage = read_mapping_page(inode->i_mapping, 0, NULL);
+ if (IS_ERR(cpage))
+ return ERR_CAST(cpage);
+ caddr = kmap(cpage);
+
+ /* Symlink is encrypted */
+ sd = (struct fscrypt_symlink_data *)caddr;
+ cstr.name = sd->encrypted_path;
+ cstr.len = le16_to_cpu(sd->len);
+
+ /* this is broken symlink case */
+ if (unlikely(cstr.len == 0)) {
+ res = -ENOENT;
+ goto errout;
+ }
- err = f2fs_do_tmpfile(inode, dir);
- if (err)
- goto release_out;
+ if ((cstr.len + sizeof(struct fscrypt_symlink_data) - 1) > max_size) {
+ /* Symlink data on the disk is corrupted */
+ res = -EIO;
+ goto errout;
+ }
+ res = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr);
+ if (res)
+ goto errout;
+
+ res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr);
+ if (res)
+ goto errout;
+
+ /* this is broken symlink case */
+ if (unlikely(pstr.name[0] == 0)) {
+ res = -ENOENT;
+ goto errout;
+ }
- /*
- * add this non-linked tmpfile to orphan list, in this way we could
- * remove all unused data of tmpfile after abnormal power-off.
- */
- add_orphan_inode(sbi, inode->i_ino);
- f2fs_unlock_op(sbi);
+ paddr = pstr.name;
- alloc_nid_done(sbi, inode->i_ino);
- d_tmpfile(dentry, inode);
- unlock_new_inode(inode);
- return 0;
+ /* Null-terminate the name */
+ paddr[pstr.len] = '\0';
+ nd_set_link(nd, paddr);
-release_out:
- release_orphan_inode(sbi);
-out:
- handle_failed_inode(inode);
- return err;
+ kunmap(cpage);
+ page_cache_release(cpage);
+ return NULL;
+errout:
+ fscrypt_fname_free_buffer(&pstr);
+ kunmap(cpage);
+ page_cache_release(cpage);
+ return ERR_PTR(res);
}
+const struct inode_operations f2fs_encrypted_symlink_inode_operations = {
+ .readlink = generic_readlink,
+ .follow_link = f2fs_encrypted_follow_link,
+ .put_link = kfree_put_link,
+ .getattr = f2fs_getattr,
+ .setattr = f2fs_setattr,
+#ifdef CONFIG_F2FS_FS_XATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = f2fs_listxattr,
+ .removexattr = generic_removexattr,
+#endif
+};
+
const struct inode_operations f2fs_dir_inode_operations = {
.create = f2fs_create,
.lookup = f2fs_lookup,
@@ -699,7 +1108,7 @@ const struct inode_operations f2fs_dir_inode_operations = {
const struct inode_operations f2fs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
+ .follow_link = f2fs_follow_link,
.put_link = page_put_link,
.getattr = f2fs_getattr,
.setattr = f2fs_setattr,
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 44b8afef43d9..b06e270b0fbe 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -19,9 +19,10 @@
#include "f2fs.h"
#include "node.h"
#include "segment.h"
+#include "trace.h"
#include <trace/events/f2fs.h>
-#define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock)
+#define on_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock)
static struct kmem_cache *nat_entry_slab;
static struct kmem_cache *free_nid_slab;
@@ -31,22 +32,50 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct sysinfo val;
+ unsigned long avail_ram;
unsigned long mem_size = 0;
bool res = false;
si_meminfo(&val);
- /* give 25%, 25%, 50% memory for each components respectively */
+
+ /* only uses low memory */
+ avail_ram = val.totalram - val.totalhigh;
+
+ /*
+ * give 25%, 25%, 50%, 50%, 50% memory for each components respectively
+ */
if (type == FREE_NIDS) {
- mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >> 12;
- res = mem_size < ((val.totalram * nm_i->ram_thresh / 100) >> 2);
+ mem_size = (nm_i->nid_cnt[FREE_NID_LIST] *
+ sizeof(struct free_nid)) >> PAGE_SHIFT;
+ res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
} else if (type == NAT_ENTRIES) {
- mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> 12;
- res = mem_size < ((val.totalram * nm_i->ram_thresh / 100) >> 2);
+ mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >>
+ PAGE_SHIFT;
+ res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
+ if (excess_cached_nats(sbi))
+ res = false;
} else if (type == DIRTY_DENTS) {
if (sbi->sb->s_bdi->dirty_exceeded)
return false;
mem_size = get_pages(sbi, F2FS_DIRTY_DENTS);
- res = mem_size < ((val.totalram * nm_i->ram_thresh / 100) >> 1);
+ res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
+ } else if (type == INO_ENTRIES) {
+ int i;
+
+ for (i = 0; i <= UPDATE_INO; i++)
+ mem_size += sbi->im[i].ino_num *
+ sizeof(struct ino_entry);
+ mem_size >>= PAGE_SHIFT;
+ res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
+ } else if (type == EXTENT_CACHE) {
+ mem_size = (atomic_read(&sbi->total_ext_tree) *
+ sizeof(struct extent_tree) +
+ atomic_read(&sbi->total_ext_node) *
+ sizeof(struct extent_node)) >> PAGE_SHIFT;
+ res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
+ } else {
+ if (!sbi->sb->s_bdi->dirty_exceeded)
+ return true;
}
return res;
}
@@ -95,7 +124,7 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
src_addr = page_address(src_page);
dst_addr = page_address(dst_page);
- memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE);
+ memcpy(dst_addr, src_addr, PAGE_SIZE);
set_page_dirty(dst_page);
f2fs_put_page(src_page, 1);
@@ -131,20 +160,16 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
if (get_nat_flag(ne, IS_DIRTY))
return;
-retry:
+
head = radix_tree_lookup(&nm_i->nat_set_root, set);
if (!head) {
- head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC);
+ head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_NOFS);
INIT_LIST_HEAD(&head->entry_list);
INIT_LIST_HEAD(&head->set_list);
head->set = set;
head->entry_cnt = 0;
-
- if (radix_tree_insert(&nm_i->nat_set_root, set, head)) {
- cond_resched();
- goto retry;
- }
+ f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head);
}
list_move_tail(&ne->list, &head->entry_list);
nm_i->dirty_nat_cnt++;
@@ -153,18 +178,12 @@ retry:
}
static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
- struct nat_entry *ne)
+ struct nat_entry_set *set, struct nat_entry *ne)
{
- nid_t set = ne->ni.nid / NAT_ENTRY_PER_BLOCK;
- struct nat_entry_set *head;
-
- head = radix_tree_lookup(&nm_i->nat_set_root, set);
- if (head) {
- list_move_tail(&ne->list, &nm_i->nat_entries);
- set_nat_flag(ne, IS_DIRTY, false);
- head->entry_cnt--;
- nm_i->dirty_nat_cnt--;
- }
+ list_move_tail(&ne->list, &nm_i->nat_entries);
+ set_nat_flag(ne, IS_DIRTY, false);
+ set->entry_cnt--;
+ nm_i->dirty_nat_cnt--;
}
static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
@@ -174,32 +193,35 @@ static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
start, nr);
}
-bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
+int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *e;
- bool is_cp = true;
+ bool need = false;
- read_lock(&nm_i->nat_tree_lock);
+ down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
- if (e && !get_nat_flag(e, IS_CHECKPOINTED))
- is_cp = false;
- read_unlock(&nm_i->nat_tree_lock);
- return is_cp;
+ if (e) {
+ if (!get_nat_flag(e, IS_CHECKPOINTED) &&
+ !get_nat_flag(e, HAS_FSYNCED_INODE))
+ need = true;
+ }
+ up_read(&nm_i->nat_tree_lock);
+ return need;
}
-bool has_fsynced_inode(struct f2fs_sb_info *sbi, nid_t ino)
+bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *e;
- bool fsynced = false;
+ bool is_cp = true;
- read_lock(&nm_i->nat_tree_lock);
- e = __lookup_nat_cache(nm_i, ino);
- if (e && get_nat_flag(e, HAS_FSYNCED_INODE))
- fsynced = true;
- read_unlock(&nm_i->nat_tree_lock);
- return fsynced;
+ down_read(&nm_i->nat_tree_lock);
+ e = __lookup_nat_cache(nm_i, nid);
+ if (e && !get_nat_flag(e, IS_CHECKPOINTED))
+ is_cp = false;
+ up_read(&nm_i->nat_tree_lock);
+ return is_cp;
}
bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
@@ -208,27 +230,34 @@ bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
struct nat_entry *e;
bool need_update = true;
- read_lock(&nm_i->nat_tree_lock);
+ down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, ino);
if (e && get_nat_flag(e, HAS_LAST_FSYNC) &&
(get_nat_flag(e, IS_CHECKPOINTED) ||
get_nat_flag(e, HAS_FSYNCED_INODE)))
need_update = false;
- read_unlock(&nm_i->nat_tree_lock);
+ up_read(&nm_i->nat_tree_lock);
return need_update;
}
-static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
+static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid,
+ bool no_fail)
{
struct nat_entry *new;
- new = kmem_cache_alloc(nat_entry_slab, GFP_ATOMIC);
- if (!new)
- return NULL;
- if (radix_tree_insert(&nm_i->nat_root, nid, new)) {
- kmem_cache_free(nat_entry_slab, new);
- return NULL;
+ if (no_fail) {
+ new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_NOFS);
+ f2fs_radix_tree_insert(&nm_i->nat_root, nid, new);
+ } else {
+ new = kmem_cache_alloc(nat_entry_slab, GFP_NOFS);
+ if (!new)
+ return NULL;
+ if (radix_tree_insert(&nm_i->nat_root, nid, new)) {
+ kmem_cache_free(nat_entry_slab, new);
+ return NULL;
+ }
}
+
memset(new, 0, sizeof(struct nat_entry));
nat_set_nid(new, nid);
nat_reset_flag(new);
@@ -237,22 +266,23 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
return new;
}
-static void cache_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid,
+static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid,
struct f2fs_nat_entry *ne)
{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *e;
-retry:
- write_lock(&nm_i->nat_tree_lock);
+
e = __lookup_nat_cache(nm_i, nid);
if (!e) {
- e = grab_nat_entry(nm_i, nid);
- if (!e) {
- write_unlock(&nm_i->nat_tree_lock);
- goto retry;
- }
- node_info_from_raw_nat(&e->ni, ne);
+ e = grab_nat_entry(nm_i, nid, false);
+ if (e)
+ node_info_from_raw_nat(&e->ni, ne);
+ } else {
+ f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) ||
+ nat_get_blkaddr(e) !=
+ le32_to_cpu(ne->block_addr) ||
+ nat_get_version(e) != ne->version);
}
- write_unlock(&nm_i->nat_tree_lock);
}
static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
@@ -260,16 +290,12 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *e;
-retry:
- write_lock(&nm_i->nat_tree_lock);
+
+ down_write(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, ni->nid);
if (!e) {
- e = grab_nat_entry(nm_i, ni->nid);
- if (!e) {
- write_unlock(&nm_i->nat_tree_lock);
- goto retry;
- }
- e->ni = *ni;
+ e = grab_nat_entry(nm_i, ni->nid, true);
+ copy_node_info(&e->ni, ni);
f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR);
} else if (new_blkaddr == NEW_ADDR) {
/*
@@ -277,7 +303,7 @@ retry:
* previous nat entry can be remained in nat cache.
* So, reinitialize it with new information.
*/
- e->ni = *ni;
+ copy_node_info(&e->ni, ni);
f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR);
}
@@ -295,6 +321,10 @@ retry:
if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) {
unsigned char version = nat_get_version(e);
nat_set_version(e, inc_node_version(version));
+
+ /* in order to reuse the nid */
+ if (nm_i->next_scan_nid > ni->nid)
+ nm_i->next_scan_nid = ni->nid;
}
/* change address */
@@ -304,23 +334,24 @@ retry:
__set_nat_cache_dirty(nm_i, e);
/* update fsync_mark if its inode nat entry is still alive */
- e = __lookup_nat_cache(nm_i, ni->ino);
+ if (ni->nid != ni->ino)
+ e = __lookup_nat_cache(nm_i, ni->ino);
if (e) {
if (fsync_done && ni->nid == ni->ino)
set_nat_flag(e, HAS_FSYNCED_INODE, true);
set_nat_flag(e, HAS_LAST_FSYNC, fsync_done);
}
- write_unlock(&nm_i->nat_tree_lock);
+ up_write(&nm_i->nat_tree_lock);
}
int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
+ int nr = nr_shrink;
- if (available_free_memory(sbi, NAT_ENTRIES))
+ if (!down_write_trylock(&nm_i->nat_tree_lock))
return 0;
- write_lock(&nm_i->nat_tree_lock);
while (nr_shrink && !list_empty(&nm_i->nat_entries)) {
struct nat_entry *ne;
ne = list_first_entry(&nm_i->nat_entries,
@@ -328,8 +359,8 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
__del_from_nat_cache(nm_i, ne);
nr_shrink--;
}
- write_unlock(&nm_i->nat_tree_lock);
- return nr_shrink;
+ up_write(&nm_i->nat_tree_lock);
+ return nr - nr_shrink;
}
/*
@@ -339,59 +370,121 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
- struct f2fs_summary_block *sum = curseg->sum_blk;
+ struct f2fs_journal *journal = curseg->journal;
nid_t start_nid = START_NID(nid);
struct f2fs_nat_block *nat_blk;
struct page *page = NULL;
struct f2fs_nat_entry ne;
struct nat_entry *e;
+ pgoff_t index;
int i;
- memset(&ne, 0, sizeof(struct f2fs_nat_entry));
ni->nid = nid;
/* Check nat cache */
- read_lock(&nm_i->nat_tree_lock);
+ down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (e) {
ni->ino = nat_get_ino(e);
ni->blk_addr = nat_get_blkaddr(e);
ni->version = nat_get_version(e);
- }
- read_unlock(&nm_i->nat_tree_lock);
- if (e)
+ up_read(&nm_i->nat_tree_lock);
return;
+ }
+
+ memset(&ne, 0, sizeof(struct f2fs_nat_entry));
/* Check current segment summary */
- mutex_lock(&curseg->curseg_mutex);
- i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0);
+ down_read(&curseg->journal_rwsem);
+ i = lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0);
if (i >= 0) {
- ne = nat_in_journal(sum, i);
+ ne = nat_in_journal(journal, i);
node_info_from_raw_nat(ni, &ne);
}
- mutex_unlock(&curseg->curseg_mutex);
- if (i >= 0)
+ up_read(&curseg->journal_rwsem);
+ if (i >= 0) {
+ up_read(&nm_i->nat_tree_lock);
goto cache;
+ }
/* Fill node_info from nat page */
- page = get_current_nat_page(sbi, start_nid);
+ index = current_nat_addr(sbi, nid);
+ up_read(&nm_i->nat_tree_lock);
+
+ page = get_meta_page(sbi, index);
nat_blk = (struct f2fs_nat_block *)page_address(page);
ne = nat_blk->entries[nid - start_nid];
node_info_from_raw_nat(ni, &ne);
f2fs_put_page(page, 1);
cache:
/* cache nat entry */
- cache_nat_entry(NM_I(sbi), nid, &ne);
+ down_write(&nm_i->nat_tree_lock);
+ cache_nat_entry(sbi, nid, &ne);
+ up_write(&nm_i->nat_tree_lock);
+}
+
+/*
+ * readahead MAX_RA_NODE number of node pages.
+ */
+static void ra_node_pages(struct page *parent, int start, int n)
+{
+ struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
+ struct blk_plug plug;
+ int i, end;
+ nid_t nid;
+
+ blk_start_plug(&plug);
+
+ /* Then, try readahead for siblings of the desired node */
+ end = start + n;
+ end = min(end, NIDS_PER_BLOCK);
+ for (i = start; i < end; i++) {
+ nid = get_nid(parent, i, false);
+ ra_node_page(sbi, nid);
+ }
+
+ blk_finish_plug(&plug);
+}
+
+pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs)
+{
+ const long direct_index = ADDRS_PER_INODE(dn->inode);
+ const long direct_blks = ADDRS_PER_BLOCK;
+ const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK;
+ unsigned int skipped_unit = ADDRS_PER_BLOCK;
+ int cur_level = dn->cur_level;
+ int max_level = dn->max_level;
+ pgoff_t base = 0;
+
+ if (!dn->max_level)
+ return pgofs + 1;
+
+ while (max_level-- > cur_level)
+ skipped_unit *= NIDS_PER_BLOCK;
+
+ switch (dn->max_level) {
+ case 3:
+ base += 2 * indirect_blks;
+ case 2:
+ base += 2 * direct_blks;
+ case 1:
+ base += direct_index;
+ break;
+ default:
+ f2fs_bug_on(F2FS_I_SB(dn->inode), 1);
+ }
+
+ return ((pgofs - base) / skipped_unit + 1) * skipped_unit + base;
}
/*
* The maximum depth is four.
* Offset[0] will have raw inode offset.
*/
-static int get_node_path(struct f2fs_inode_info *fi, long block,
+static int get_node_path(struct inode *inode, long block,
int offset[4], unsigned int noffset[4])
{
- const long direct_index = ADDRS_PER_INODE(fi);
+ const long direct_index = ADDRS_PER_INODE(inode);
const long direct_blks = ADDRS_PER_BLOCK;
const long dptrs_per_blk = NIDS_PER_BLOCK;
const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK;
@@ -472,14 +565,14 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
struct page *npage[4];
- struct page *parent;
+ struct page *parent = NULL;
int offset[4];
unsigned int noffset[4];
nid_t nids[4];
- int level, i;
+ int level, i = 0;
int err = 0;
- level = get_node_path(F2FS_I(dn->inode), index, offset, noffset);
+ level = get_node_path(dn->inode, index, offset, noffset);
nids[0] = dn->inode->i_ino;
npage[0] = dn->inode_page;
@@ -489,6 +582,14 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
if (IS_ERR(npage[0]))
return PTR_ERR(npage[0]);
}
+
+ /* if inline_data is set, should not report any block indices */
+ if (f2fs_has_inline_data(dn->inode) && index) {
+ err = -ENOENT;
+ f2fs_put_page(npage[0], 1);
+ goto release_out;
+ }
+
parent = npage[0];
if (level != 0)
nids[1] = get_nid(parent, offset[0], true);
@@ -558,6 +659,11 @@ release_pages:
release_out:
dn->inode_page = NULL;
dn->node_page = NULL;
+ if (err == -ENOENT) {
+ dn->cur_level = i;
+ dn->max_level = level;
+ dn->ofs_in_node = offset[level];
+ }
return err;
}
@@ -581,12 +687,11 @@ static void truncate_node(struct dnode_of_data *dn)
if (dn->nid == dn->inode->i_ino) {
remove_orphan_inode(sbi, dn->nid);
dec_valid_inode_count(sbi);
- } else {
- sync_inode_page(dn);
+ f2fs_inode_synced(dn->inode);
}
invalidate:
clear_node_page_dirty(dn->node_page);
- F2FS_SET_SB_DIRT(sbi);
+ set_sbi_flag(sbi, SBI_IS_DIRTY);
f2fs_put_page(dn->node_page, 1);
@@ -641,6 +746,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
return PTR_ERR(page);
}
+ ra_node_pages(page, ofs, NIDS_PER_BLOCK);
+
rn = F2FS_NODE(page);
if (depth < 3) {
for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) {
@@ -651,7 +758,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
ret = truncate_dnode(&rdn);
if (ret < 0)
goto out_err;
- set_nid(page, i, 0, false);
+ if (set_nid(page, i, 0, false))
+ dn->node_changed = true;
}
} else {
child_nofs = nofs + ofs * (NIDS_PER_BLOCK + 1) + 1;
@@ -664,7 +772,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
rdn.nid = child_nid;
ret = truncate_nodes(&rdn, child_nofs, 0, depth - 1);
if (ret == (NIDS_PER_BLOCK + 1)) {
- set_nid(page, i, 0, false);
+ if (set_nid(page, i, 0, false))
+ dn->node_changed = true;
child_nofs += ret;
} else if (ret < 0 && ret != -ENOENT) {
goto out_err;
@@ -716,6 +825,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
nid[i + 1] = get_nid(pages[i], offset[i + 1], false);
}
+ ra_node_pages(pages[idx], offset[idx + 1], NIDS_PER_BLOCK);
+
/* free direct nodes linked to a partial indirect node */
for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) {
child_nid = get_nid(pages[idx], i, false);
@@ -725,7 +836,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
err = truncate_dnode(dn);
if (err < 0)
goto fail;
- set_nid(pages[idx], i, 0, false);
+ if (set_nid(pages[idx], i, 0, false))
+ dn->node_changed = true;
}
if (offset[idx + 1] == 0) {
@@ -762,8 +874,8 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from)
trace_f2fs_truncate_inode_blocks_enter(inode, from);
- level = get_node_path(F2FS_I(inode), from, offset, noffset);
-restart:
+ level = get_node_path(inode, from, offset, noffset);
+
page = get_node_page(sbi, inode->i_ino);
if (IS_ERR(page)) {
trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page));
@@ -827,11 +939,8 @@ skip_partial:
if (offset[1] == 0 &&
ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) {
lock_page(page);
- if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
- f2fs_put_page(page, 1);
- goto restart;
- }
- f2fs_wait_on_page_writeback(page, NODE);
+ BUG_ON(page->mapping != NODE_MAPPING(sbi));
+ f2fs_wait_on_page_writeback(page, NODE, true);
ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
set_page_dirty(page);
unlock_page(page);
@@ -860,10 +969,7 @@ int truncate_xattr_node(struct inode *inode, struct page *page)
if (IS_ERR(npage))
return PTR_ERR(npage);
- F2FS_I(inode)->i_xattr_nid = 0;
-
- /* need to do checkpoint during fsync */
- F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi));
+ f2fs_i_xnid_write(inode, 0);
set_new_dnode(&dn, inode, page, npage, nid);
@@ -877,17 +983,20 @@ int truncate_xattr_node(struct inode *inode, struct page *page)
* Caller should grab and release a rwsem by calling f2fs_lock_op() and
* f2fs_unlock_op().
*/
-void remove_inode_page(struct inode *inode)
+int remove_inode_page(struct inode *inode)
{
struct dnode_of_data dn;
+ int err;
set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
- if (get_dnode_of_data(&dn, 0, LOOKUP_NODE))
- return;
+ err = get_dnode_of_data(&dn, 0, LOOKUP_NODE);
+ if (err)
+ return err;
- if (truncate_xattr_node(inode, dn.inode_page)) {
+ err = truncate_xattr_node(inode, dn.inode_page);
+ if (err) {
f2fs_put_dnode(&dn);
- return;
+ return err;
}
/* remove potential inline_data blocks */
@@ -901,6 +1010,7 @@ void remove_inode_page(struct inode *inode)
/* will put inode & node pages */
truncate_node(&dn);
+ return 0;
}
struct page *new_inode_page(struct inode *inode)
@@ -918,14 +1028,14 @@ struct page *new_node_page(struct dnode_of_data *dn,
unsigned int ofs, struct page *ipage)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
- struct node_info old_ni, new_ni;
+ struct node_info new_ni;
struct page *page;
int err;
- if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
+ if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC)))
return ERR_PTR(-EPERM);
- page = grab_cache_page(NODE_MAPPING(sbi), dn->nid);
+ page = f2fs_grab_cache_page(NODE_MAPPING(sbi), dn->nid, false);
if (!page)
return ERR_PTR(-ENOMEM);
@@ -933,32 +1043,30 @@ struct page *new_node_page(struct dnode_of_data *dn,
err = -ENOSPC;
goto fail;
}
-
- get_node_info(sbi, dn->nid, &old_ni);
-
- /* Reinitialize old_ni with new node page */
- f2fs_bug_on(sbi, old_ni.blk_addr != NULL_ADDR);
- new_ni = old_ni;
+#ifdef CONFIG_F2FS_CHECK_FS
+ get_node_info(sbi, dn->nid, &new_ni);
+ f2fs_bug_on(sbi, new_ni.blk_addr != NULL_ADDR);
+#endif
+ new_ni.nid = dn->nid;
new_ni.ino = dn->inode->i_ino;
+ new_ni.blk_addr = NULL_ADDR;
+ new_ni.flag = 0;
+ new_ni.version = 0;
set_node_addr(sbi, &new_ni, NEW_ADDR, false);
- f2fs_wait_on_page_writeback(page, NODE);
+ f2fs_wait_on_page_writeback(page, NODE, true);
fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
set_cold_node(dn->inode, page);
- SetPageUptodate(page);
- set_page_dirty(page);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
+ if (set_page_dirty(page))
+ dn->node_changed = true;
if (f2fs_has_xattr_block(ofs))
- F2FS_I(dn->inode)->i_xattr_nid = dn->nid;
+ f2fs_i_xnid_write(dn->inode, dn->nid);
- dn->node_page = page;
- if (ipage)
- update_inode(dn->inode, ipage);
- else
- sync_inode_page(dn);
if (ofs == 0)
inc_valid_inode_count(sbi);
-
return page;
fail:
@@ -970,25 +1078,33 @@ fail:
/*
* Caller should do after getting the following values.
* 0: f2fs_put_page(page, 0)
- * LOCKED_PAGE: f2fs_put_page(page, 1)
- * error: nothing
+ * LOCKED_PAGE or error: f2fs_put_page(page, 1)
*/
-static int read_node_page(struct page *page, int rw)
+static int read_node_page(struct page *page, int op_flags)
{
struct f2fs_sb_info *sbi = F2FS_P_SB(page);
struct node_info ni;
+ struct f2fs_io_info fio = {
+ .sbi = sbi,
+ .type = NODE,
+ .op = REQ_OP_READ,
+ .op_flags = op_flags,
+ .page = page,
+ .encrypted_page = NULL,
+ };
+
+ if (PageUptodate(page))
+ return LOCKED_PAGE;
get_node_info(sbi, page->index, &ni);
if (unlikely(ni.blk_addr == NULL_ADDR)) {
- f2fs_put_page(page, 1);
+ ClearPageUptodate(page);
return -ENOENT;
}
- if (PageUptodate(page))
- return LOCKED_PAGE;
-
- return f2fs_submit_page_bio(sbi, page, ni.blk_addr, rw);
+ fio.new_blkaddr = fio.old_blkaddr = ni.blk_addr;
+ return f2fs_submit_page_bio(&fio);
}
/*
@@ -999,134 +1115,433 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
struct page *apage;
int err;
- apage = find_get_page(NODE_MAPPING(sbi), nid);
- if (apage && PageUptodate(apage)) {
- f2fs_put_page(apage, 0);
+ if (!nid)
+ return;
+ f2fs_bug_on(sbi, check_nid_range(sbi, nid));
+
+ rcu_read_lock();
+ apage = radix_tree_lookup(&NODE_MAPPING(sbi)->page_tree, nid);
+ rcu_read_unlock();
+ if (apage)
return;
- }
- f2fs_put_page(apage, 0);
- apage = grab_cache_page(NODE_MAPPING(sbi), nid);
+ apage = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false);
if (!apage)
return;
- err = read_node_page(apage, READA);
- if (err == 0)
- f2fs_put_page(apage, 0);
- else if (err == LOCKED_PAGE)
- f2fs_put_page(apage, 1);
+ err = read_node_page(apage, REQ_RAHEAD);
+ f2fs_put_page(apage, err ? 1 : 0);
}
-struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
+static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid,
+ struct page *parent, int start)
{
struct page *page;
int err;
+
+ if (!nid)
+ return ERR_PTR(-ENOENT);
+ f2fs_bug_on(sbi, check_nid_range(sbi, nid));
repeat:
- page = grab_cache_page(NODE_MAPPING(sbi), nid);
+ page = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false);
if (!page)
return ERR_PTR(-ENOMEM);
- err = read_node_page(page, READ_SYNC);
- if (err < 0)
+ err = read_node_page(page, 0);
+ if (err < 0) {
+ f2fs_put_page(page, 1);
return ERR_PTR(err);
- else if (err == LOCKED_PAGE)
- goto got_it;
+ } else if (err == LOCKED_PAGE) {
+ goto page_hit;
+ }
+
+ if (parent)
+ ra_node_pages(parent, start + 1, MAX_RA_NODE);
lock_page(page);
- if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) {
- f2fs_put_page(page, 1);
- return ERR_PTR(-EIO);
- }
+
if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
f2fs_put_page(page, 1);
goto repeat;
}
-got_it:
+
+ if (unlikely(!PageUptodate(page)))
+ goto out_err;
+page_hit:
+ if(unlikely(nid != nid_of_node(page))) {
+ f2fs_bug_on(sbi, 1);
+ ClearPageUptodate(page);
+out_err:
+ f2fs_put_page(page, 1);
+ return ERR_PTR(-EIO);
+ }
+ mark_page_accessed(page);
return page;
}
-/*
- * Return a locked page for the desired node page.
- * And, readahead MAX_RA_NODE number of node pages.
- */
+struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
+{
+ return __get_node_page(sbi, nid, NULL, 0);
+}
+
struct page *get_node_page_ra(struct page *parent, int start)
{
struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
- struct blk_plug plug;
+ nid_t nid = get_nid(parent, start, false);
+
+ return __get_node_page(sbi, nid, parent, start);
+}
+
+static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino)
+{
+ struct inode *inode;
struct page *page;
- int err, i, end;
- nid_t nid;
+ int ret;
- /* First, try getting the desired direct node. */
- nid = get_nid(parent, start, false);
- if (!nid)
- return ERR_PTR(-ENOENT);
-repeat:
- page = grab_cache_page(NODE_MAPPING(sbi), nid);
+ /* should flush inline_data before evict_inode */
+ inode = ilookup(sbi->sb, ino);
+ if (!inode)
+ return;
+
+ page = find_get_page(inode->i_mapping, 0);
if (!page)
- return ERR_PTR(-ENOMEM);
+ goto iput_out;
- err = read_node_page(page, READ_SYNC);
- if (err < 0)
- return ERR_PTR(err);
- else if (err == LOCKED_PAGE)
- goto page_hit;
+ if (!trylock_page(page))
+ goto release_out;
- blk_start_plug(&plug);
+ if (!PageUptodate(page))
+ goto page_out;
- /* Then, try readahead for siblings of the desired node */
- end = start + MAX_RA_NODE;
- end = min(end, NIDS_PER_BLOCK);
- for (i = start + 1; i < end; i++) {
- nid = get_nid(parent, i, false);
- if (!nid)
- continue;
- ra_node_page(sbi, nid);
- }
+ if (!PageDirty(page))
+ goto page_out;
- blk_finish_plug(&plug);
+ if (!clear_page_dirty_for_io(page))
+ goto page_out;
- lock_page(page);
- if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
- f2fs_put_page(page, 1);
- goto repeat;
+ ret = f2fs_write_inline_data(inode, page);
+ inode_dec_dirty_pages(inode);
+ remove_dirty_inode(inode);
+ if (ret)
+ set_page_dirty(page);
+page_out:
+ unlock_page(page);
+release_out:
+ f2fs_put_page(page, 0);
+iput_out:
+ iput(inode);
+}
+
+void move_node_page(struct page *node_page, int gc_type)
+{
+ if (gc_type == FG_GC) {
+ struct f2fs_sb_info *sbi = F2FS_P_SB(node_page);
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = 1,
+ .for_reclaim = 0,
+ };
+
+ set_page_dirty(node_page);
+ f2fs_wait_on_page_writeback(node_page, NODE, true);
+
+ f2fs_bug_on(sbi, PageWriteback(node_page));
+ if (!clear_page_dirty_for_io(node_page))
+ goto out_page;
+
+ if (NODE_MAPPING(sbi)->a_ops->writepage(node_page, &wbc))
+ unlock_page(node_page);
+ goto release_page;
+ } else {
+ /* set page dirty and write it */
+ if (!PageWriteback(node_page))
+ set_page_dirty(node_page);
}
-page_hit:
- if (unlikely(!PageUptodate(page))) {
- f2fs_put_page(page, 1);
- return ERR_PTR(-EIO);
+out_page:
+ unlock_page(node_page);
+release_page:
+ f2fs_put_page(node_page, 0);
+}
+
+static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino)
+{
+ pgoff_t index, end;
+ struct pagevec pvec;
+ struct page *last_page = NULL;
+
+ pagevec_init(&pvec, 0);
+ index = 0;
+ end = ULONG_MAX;
+
+ while (index <= end) {
+ int i, nr_pages;
+ nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
+ PAGECACHE_TAG_DIRTY,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+ if (nr_pages == 0)
+ break;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ if (unlikely(f2fs_cp_error(sbi))) {
+ f2fs_put_page(last_page, 0);
+ pagevec_release(&pvec);
+ return ERR_PTR(-EIO);
+ }
+
+ if (!IS_DNODE(page) || !is_cold_node(page))
+ continue;
+ if (ino_of_node(page) != ino)
+ continue;
+
+ lock_page(page);
+
+ if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
+continue_unlock:
+ unlock_page(page);
+ continue;
+ }
+ if (ino_of_node(page) != ino)
+ goto continue_unlock;
+
+ if (!PageDirty(page)) {
+ /* someone wrote it for us */
+ goto continue_unlock;
+ }
+
+ if (last_page)
+ f2fs_put_page(last_page, 0);
+
+ get_page(page);
+ last_page = page;
+ unlock_page(page);
+ }
+ pagevec_release(&pvec);
+ cond_resched();
}
- return page;
+ return last_page;
}
-void sync_inode_page(struct dnode_of_data *dn)
+static int __write_node_page(struct page *page, bool atomic, bool *submitted,
+ struct writeback_control *wbc)
{
- if (IS_INODE(dn->node_page) || dn->inode_page == dn->node_page) {
- update_inode(dn->inode, dn->node_page);
- } else if (dn->inode_page) {
- if (!dn->inode_page_locked)
- lock_page(dn->inode_page);
- update_inode(dn->inode, dn->inode_page);
- if (!dn->inode_page_locked)
- unlock_page(dn->inode_page);
+ struct f2fs_sb_info *sbi = F2FS_P_SB(page);
+ nid_t nid;
+ struct node_info ni;
+ struct f2fs_io_info fio = {
+ .sbi = sbi,
+ .type = NODE,
+ .op = REQ_OP_WRITE,
+ .op_flags = wbc_to_write_flags(wbc),
+ .page = page,
+ .encrypted_page = NULL,
+ .submitted = false,
+ };
+
+ trace_f2fs_writepage(page, NODE);
+
+ if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
+ goto redirty_out;
+ if (unlikely(f2fs_cp_error(sbi)))
+ goto redirty_out;
+
+ /* get old block addr of this node page */
+ nid = nid_of_node(page);
+ f2fs_bug_on(sbi, page->index != nid);
+
+ if (wbc->for_reclaim) {
+ if (!down_read_trylock(&sbi->node_write))
+ goto redirty_out;
} else {
- update_inode_page(dn->inode);
+ down_read(&sbi->node_write);
+ }
+
+ get_node_info(sbi, nid, &ni);
+
+ /* This page is already truncated */
+ if (unlikely(ni.blk_addr == NULL_ADDR)) {
+ ClearPageUptodate(page);
+ dec_page_count(sbi, F2FS_DIRTY_NODES);
+ up_read(&sbi->node_write);
+ unlock_page(page);
+ return 0;
+ }
+
+ if (atomic && !test_opt(sbi, NOBARRIER))
+ fio.op_flags |= WRITE_FLUSH_FUA;
+
+ set_page_writeback(page);
+ fio.old_blkaddr = ni.blk_addr;
+ write_node_page(nid, &fio);
+ set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page));
+ dec_page_count(sbi, F2FS_DIRTY_NODES);
+ up_read(&sbi->node_write);
+
+ if (wbc->for_reclaim) {
+ f2fs_submit_merged_bio_cond(sbi, page->mapping->host, 0,
+ page->index, NODE, WRITE);
+ submitted = NULL;
+ }
+
+ unlock_page(page);
+
+ if (unlikely(f2fs_cp_error(sbi))) {
+ f2fs_submit_merged_bio(sbi, NODE, WRITE);
+ submitted = NULL;
+ }
+ if (submitted)
+ *submitted = fio.submitted;
+
+ return 0;
+
+redirty_out:
+ redirty_page_for_writepage(wbc, page);
+ return AOP_WRITEPAGE_ACTIVATE;
+}
+
+static int f2fs_write_node_page(struct page *page,
+ struct writeback_control *wbc)
+{
+ return __write_node_page(page, false, NULL, wbc);
+}
+
+int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
+ struct writeback_control *wbc, bool atomic)
+{
+ pgoff_t index, end;
+ pgoff_t last_idx = ULONG_MAX;
+ struct pagevec pvec;
+ int ret = 0;
+ struct page *last_page = NULL;
+ bool marked = false;
+ nid_t ino = inode->i_ino;
+
+ if (atomic) {
+ last_page = last_fsync_dnode(sbi, ino);
+ if (IS_ERR_OR_NULL(last_page))
+ return PTR_ERR_OR_ZERO(last_page);
+ }
+retry:
+ pagevec_init(&pvec, 0);
+ index = 0;
+ end = ULONG_MAX;
+
+ while (index <= end) {
+ int i, nr_pages;
+ nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
+ PAGECACHE_TAG_DIRTY,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+ if (nr_pages == 0)
+ break;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+ bool submitted = false;
+
+ if (unlikely(f2fs_cp_error(sbi))) {
+ f2fs_put_page(last_page, 0);
+ pagevec_release(&pvec);
+ ret = -EIO;
+ goto out;
+ }
+
+ if (!IS_DNODE(page) || !is_cold_node(page))
+ continue;
+ if (ino_of_node(page) != ino)
+ continue;
+
+ lock_page(page);
+
+ if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
+continue_unlock:
+ unlock_page(page);
+ continue;
+ }
+ if (ino_of_node(page) != ino)
+ goto continue_unlock;
+
+ if (!PageDirty(page) && page != last_page) {
+ /* someone wrote it for us */
+ goto continue_unlock;
+ }
+
+ f2fs_wait_on_page_writeback(page, NODE, true);
+ BUG_ON(PageWriteback(page));
+
+ set_fsync_mark(page, 0);
+ set_dentry_mark(page, 0);
+
+ if (!atomic || page == last_page) {
+ set_fsync_mark(page, 1);
+ if (IS_INODE(page)) {
+ if (is_inode_flag_set(inode,
+ FI_DIRTY_INODE))
+ update_inode(inode, page);
+ set_dentry_mark(page,
+ need_dentry_mark(sbi, ino));
+ }
+ /* may be written by other thread */
+ if (!PageDirty(page))
+ set_page_dirty(page);
+ }
+
+ if (!clear_page_dirty_for_io(page))
+ goto continue_unlock;
+
+ ret = __write_node_page(page, atomic &&
+ page == last_page,
+ &submitted, wbc);
+ if (ret) {
+ unlock_page(page);
+ f2fs_put_page(last_page, 0);
+ break;
+ } else if (submitted) {
+ last_idx = page->index;
+ }
+
+ if (page == last_page) {
+ f2fs_put_page(page, 0);
+ marked = true;
+ break;
+ }
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+
+ if (ret || marked)
+ break;
}
+ if (!ret && atomic && !marked) {
+ f2fs_msg(sbi->sb, KERN_DEBUG,
+ "Retry to write fsync mark: ino=%u, idx=%lx",
+ ino, last_page->index);
+ lock_page(last_page);
+ f2fs_wait_on_page_writeback(last_page, NODE, true);
+ set_page_dirty(last_page);
+ unlock_page(last_page);
+ goto retry;
+ }
+out:
+ if (last_idx != ULONG_MAX)
+ f2fs_submit_merged_bio_cond(sbi, NULL, ino, last_idx,
+ NODE, WRITE);
+ return ret ? -EIO: 0;
}
-int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino,
- struct writeback_control *wbc)
+int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc)
{
pgoff_t index, end;
struct pagevec pvec;
- int step = ino ? 2 : 0;
- int nwritten = 0, wrote = 0;
+ int step = 0;
+ int nwritten = 0;
+ int ret = 0;
pagevec_init(&pvec, 0);
next_step:
index = 0;
- end = LONG_MAX;
+ end = ULONG_MAX;
while (index <= end) {
int i, nr_pages;
@@ -1138,6 +1553,13 @@ next_step:
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
+ bool submitted = false;
+
+ if (unlikely(f2fs_cp_error(sbi))) {
+ pagevec_release(&pvec);
+ ret = -EIO;
+ goto out;
+ }
/*
* flushing sequence with step:
@@ -1153,14 +1575,8 @@ next_step:
if (step == 2 && (!IS_DNODE(page) ||
!is_cold_node(page)))
continue;
-
- /*
- * If an fsync mode,
- * we should not skip writing node pages.
- */
- if (ino && ino_of_node(page) == ino)
- lock_page(page);
- else if (!trylock_page(page))
+lock_node:
+ if (!trylock_page(page))
continue;
if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
@@ -1168,37 +1584,34 @@ continue_unlock:
unlock_page(page);
continue;
}
- if (ino && ino_of_node(page) != ino)
- goto continue_unlock;
if (!PageDirty(page)) {
/* someone wrote it for us */
goto continue_unlock;
}
+ /* flush inline_data */
+ if (is_inline_node(page)) {
+ clear_inline_node(page);
+ unlock_page(page);
+ flush_inline_data(sbi, ino_of_node(page));
+ goto lock_node;
+ }
+
+ f2fs_wait_on_page_writeback(page, NODE, true);
+
+ BUG_ON(PageWriteback(page));
if (!clear_page_dirty_for_io(page))
goto continue_unlock;
- /* called by fsync() */
- if (ino && IS_DNODE(page)) {
- set_fsync_mark(page, 1);
- if (IS_INODE(page)) {
- if (!is_checkpointed_node(sbi, ino) &&
- !has_fsynced_inode(sbi, ino))
- set_dentry_mark(page, 1);
- else
- set_dentry_mark(page, 0);
- }
- nwritten++;
- } else {
- set_fsync_mark(page, 0);
- set_dentry_mark(page, 0);
- }
+ set_fsync_mark(page, 0);
+ set_dentry_mark(page, 0);
- if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc))
+ ret = __write_node_page(page, false, &submitted, wbc);
+ if (ret)
unlock_page(page);
- else
- wrote++;
+ else if (submitted)
+ nwritten++;
if (--wbc->nr_to_write == 0)
break;
@@ -1216,15 +1629,15 @@ continue_unlock:
step++;
goto next_step;
}
-
- if (wrote)
+out:
+ if (nwritten)
f2fs_submit_merged_bio(sbi, NODE, WRITE);
- return nwritten;
+ return ret;
}
int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
{
- pgoff_t index = 0, end = LONG_MAX;
+ pgoff_t index = 0, end = ULONG_MAX;
struct pagevec pvec;
int ret2 = 0, ret = 0;
@@ -1246,7 +1659,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
continue;
if (ino && ino_of_node(page) == ino) {
- f2fs_wait_on_page_writeback(page, NODE);
+ f2fs_wait_on_page_writeback(page, NODE, true);
if (TestClearPageError(page))
ret = -EIO;
}
@@ -1264,65 +1677,13 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
return ret;
}
-static int f2fs_write_node_page(struct page *page,
- struct writeback_control *wbc)
-{
- struct f2fs_sb_info *sbi = F2FS_P_SB(page);
- nid_t nid;
- block_t new_addr;
- struct node_info ni;
- struct f2fs_io_info fio = {
- .type = NODE,
- .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
- };
-
- trace_f2fs_writepage(page, NODE);
-
- if (unlikely(sbi->por_doing))
- goto redirty_out;
- if (unlikely(f2fs_cp_error(sbi)))
- goto redirty_out;
-
- f2fs_wait_on_page_writeback(page, NODE);
-
- /* get old block addr of this node page */
- nid = nid_of_node(page);
- f2fs_bug_on(sbi, page->index != nid);
-
- get_node_info(sbi, nid, &ni);
-
- /* This page is already truncated */
- if (unlikely(ni.blk_addr == NULL_ADDR)) {
- dec_page_count(sbi, F2FS_DIRTY_NODES);
- unlock_page(page);
- return 0;
- }
-
- if (wbc->for_reclaim)
- goto redirty_out;
-
- down_read(&sbi->node_write);
- set_page_writeback(page);
- write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr);
- set_node_addr(sbi, &ni, new_addr, is_fsync_dnode(page));
- dec_page_count(sbi, F2FS_DIRTY_NODES);
- up_read(&sbi->node_write);
- unlock_page(page);
- return 0;
-
-redirty_out:
- redirty_page_for_writepage(wbc, page);
- return AOP_WRITEPAGE_ACTIVATE;
-}
-
static int f2fs_write_node_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
+ struct blk_plug plug;
long diff;
- trace_f2fs_writepages(mapping->host, wbc, NODE);
-
/* balancing f2fs's metadata in background */
f2fs_balance_fs_bg(sbi);
@@ -1330,14 +1691,19 @@ static int f2fs_write_node_pages(struct address_space *mapping,
if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE))
goto skip_write;
+ trace_f2fs_writepages(mapping->host, wbc, NODE);
+
diff = nr_pages_to_write(sbi, NODE, wbc);
wbc->sync_mode = WB_SYNC_NONE;
- sync_node_pages(sbi, 0, wbc);
+ blk_start_plug(&plug);
+ sync_node_pages(sbi, wbc);
+ blk_finish_plug(&plug);
wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff);
return 0;
skip_write:
wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_NODES);
+ trace_f2fs_writepages(mapping->host, wbc, NODE);
return 0;
}
@@ -1345,31 +1711,18 @@ static int f2fs_set_node_page_dirty(struct page *page)
{
trace_f2fs_set_page_dirty(page, NODE);
- SetPageUptodate(page);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
if (!PageDirty(page)) {
- __set_page_dirty_nobuffers(page);
+ f2fs_set_page_dirty_nobuffers(page);
inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
SetPagePrivate(page);
+ f2fs_trace_pid(page);
return 1;
}
return 0;
}
-static void f2fs_invalidate_node_page(struct page *page, unsigned int offset,
- unsigned int length)
-{
- struct inode *inode = page->mapping->host;
- if (PageDirty(page))
- dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_NODES);
- ClearPagePrivate(page);
-}
-
-static int f2fs_release_node_page(struct page *page, gfp_t wait)
-{
- ClearPagePrivate(page);
- return 1;
-}
-
/*
* Structure of the f2fs node operations
*/
@@ -1377,8 +1730,11 @@ const struct address_space_operations f2fs_node_aops = {
.writepage = f2fs_write_node_page,
.writepages = f2fs_write_node_pages,
.set_page_dirty = f2fs_set_node_page_dirty,
- .invalidatepage = f2fs_invalidate_node_page,
- .releasepage = f2fs_release_node_page,
+ .invalidatepage = f2fs_invalidate_page,
+ .releasepage = f2fs_release_page,
+#ifdef CONFIG_F2FS_MIGRATION
+ .migratepage = f2fs_migrate_page,
+#endif
};
static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i,
@@ -1387,112 +1743,250 @@ static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i,
return radix_tree_lookup(&nm_i->free_nid_root, n);
}
-static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i,
- struct free_nid *i)
+static int __insert_nid_to_list(struct f2fs_sb_info *sbi,
+ struct free_nid *i, enum nid_list list, bool new)
{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+
+ if (new) {
+ int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i);
+ if (err)
+ return err;
+ }
+
+ f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW :
+ i->state != NID_ALLOC);
+ nm_i->nid_cnt[list]++;
+ list_add_tail(&i->list, &nm_i->nid_list[list]);
+ return 0;
+}
+
+static void __remove_nid_from_list(struct f2fs_sb_info *sbi,
+ struct free_nid *i, enum nid_list list, bool reuse)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+
+ f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW :
+ i->state != NID_ALLOC);
+ nm_i->nid_cnt[list]--;
list_del(&i->list);
- radix_tree_delete(&nm_i->free_nid_root, i->nid);
+ if (!reuse)
+ radix_tree_delete(&nm_i->free_nid_root, i->nid);
}
-static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
+/* return if the nid is recognized as free */
+static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
- struct free_nid *i;
+ struct free_nid *i, *e;
struct nat_entry *ne;
- bool allocated = false;
-
- if (!available_free_memory(sbi, FREE_NIDS))
- return -1;
+ int err = -EINVAL;
+ bool ret = false;
/* 0 nid should not be used */
if (unlikely(nid == 0))
- return 0;
-
- if (build) {
- /* do not add allocated nids */
- read_lock(&nm_i->nat_tree_lock);
- ne = __lookup_nat_cache(nm_i, nid);
- if (ne &&
- (!get_nat_flag(ne, IS_CHECKPOINTED) ||
- nat_get_blkaddr(ne) != NULL_ADDR))
- allocated = true;
- read_unlock(&nm_i->nat_tree_lock);
- if (allocated)
- return 0;
- }
+ return false;
i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS);
i->nid = nid;
i->state = NID_NEW;
- spin_lock(&nm_i->free_nid_list_lock);
- if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) {
- spin_unlock(&nm_i->free_nid_list_lock);
- kmem_cache_free(free_nid_slab, i);
- return 0;
+ if (radix_tree_preload(GFP_NOFS))
+ goto err;
+
+ spin_lock(&nm_i->nid_list_lock);
+
+ if (build) {
+ /*
+ * Thread A Thread B
+ * - f2fs_create
+ * - f2fs_new_inode
+ * - alloc_nid
+ * - __insert_nid_to_list(ALLOC_NID_LIST)
+ * - f2fs_balance_fs_bg
+ * - build_free_nids
+ * - __build_free_nids
+ * - scan_nat_page
+ * - add_free_nid
+ * - __lookup_nat_cache
+ * - f2fs_add_link
+ * - init_inode_metadata
+ * - new_inode_page
+ * - new_node_page
+ * - set_node_addr
+ * - alloc_nid_done
+ * - __remove_nid_from_list(ALLOC_NID_LIST)
+ * - __insert_nid_to_list(FREE_NID_LIST)
+ */
+ ne = __lookup_nat_cache(nm_i, nid);
+ if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) ||
+ nat_get_blkaddr(ne) != NULL_ADDR))
+ goto err_out;
+
+ e = __lookup_free_nid_list(nm_i, nid);
+ if (e) {
+ if (e->state == NID_NEW)
+ ret = true;
+ goto err_out;
+ }
}
- list_add_tail(&i->list, &nm_i->free_nid_list);
- nm_i->fcnt++;
- spin_unlock(&nm_i->free_nid_list_lock);
- return 1;
+ ret = true;
+ err = __insert_nid_to_list(sbi, i, FREE_NID_LIST, true);
+err_out:
+ spin_unlock(&nm_i->nid_list_lock);
+ radix_tree_preload_end();
+err:
+ if (err)
+ kmem_cache_free(free_nid_slab, i);
+ return ret;
}
-static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid)
+static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid)
{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *i;
bool need_free = false;
- spin_lock(&nm_i->free_nid_list_lock);
+ spin_lock(&nm_i->nid_list_lock);
i = __lookup_free_nid_list(nm_i, nid);
if (i && i->state == NID_NEW) {
- __del_from_free_nid_list(nm_i, i);
- nm_i->fcnt--;
+ __remove_nid_from_list(sbi, i, FREE_NID_LIST, false);
need_free = true;
}
- spin_unlock(&nm_i->free_nid_list_lock);
+ spin_unlock(&nm_i->nid_list_lock);
if (need_free)
kmem_cache_free(free_nid_slab, i);
}
+static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid,
+ bool set, bool build)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid);
+ unsigned int nid_ofs = nid - START_NID(nid);
+
+ if (!test_bit_le(nat_ofs, nm_i->nat_block_bitmap))
+ return;
+
+ if (set)
+ __set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]);
+ else
+ __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]);
+
+ if (set)
+ nm_i->free_nid_count[nat_ofs]++;
+ else if (!build)
+ nm_i->free_nid_count[nat_ofs]--;
+}
+
static void scan_nat_page(struct f2fs_sb_info *sbi,
struct page *nat_page, nid_t start_nid)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct f2fs_nat_block *nat_blk = page_address(nat_page);
block_t blk_addr;
+ unsigned int nat_ofs = NAT_BLOCK_OFFSET(start_nid);
int i;
+ if (test_bit_le(nat_ofs, nm_i->nat_block_bitmap))
+ return;
+
+ __set_bit_le(nat_ofs, nm_i->nat_block_bitmap);
+
i = start_nid % NAT_ENTRY_PER_BLOCK;
for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) {
+ bool freed = false;
if (unlikely(start_nid >= nm_i->max_nid))
break;
blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
f2fs_bug_on(sbi, blk_addr == NEW_ADDR);
- if (blk_addr == NULL_ADDR) {
- if (add_free_nid(sbi, start_nid, true) < 0)
- break;
+ if (blk_addr == NULL_ADDR)
+ freed = add_free_nid(sbi, start_nid, true);
+ spin_lock(&NM_I(sbi)->nid_list_lock);
+ update_free_nid_bitmap(sbi, start_nid, freed, true);
+ spin_unlock(&NM_I(sbi)->nid_list_lock);
+ }
+}
+
+static void scan_free_nid_bits(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
+ struct f2fs_journal *journal = curseg->journal;
+ unsigned int i, idx;
+
+ down_read(&nm_i->nat_tree_lock);
+
+ for (i = 0; i < nm_i->nat_blocks; i++) {
+ if (!test_bit_le(i, nm_i->nat_block_bitmap))
+ continue;
+ if (!nm_i->free_nid_count[i])
+ continue;
+ for (idx = 0; idx < NAT_ENTRY_PER_BLOCK; idx++) {
+ nid_t nid;
+
+ if (!test_bit_le(idx, nm_i->free_nid_bitmap[i]))
+ continue;
+
+ nid = i * NAT_ENTRY_PER_BLOCK + idx;
+ add_free_nid(sbi, nid, true);
+
+ if (nm_i->nid_cnt[FREE_NID_LIST] >= MAX_FREE_NIDS)
+ goto out;
}
}
+out:
+ down_read(&curseg->journal_rwsem);
+ for (i = 0; i < nats_in_cursum(journal); i++) {
+ block_t addr;
+ nid_t nid;
+
+ addr = le32_to_cpu(nat_in_journal(journal, i).block_addr);
+ nid = le32_to_cpu(nid_in_journal(journal, i));
+ if (addr == NULL_ADDR)
+ add_free_nid(sbi, nid, true);
+ else
+ remove_free_nid(sbi, nid);
+ }
+ up_read(&curseg->journal_rwsem);
+ up_read(&nm_i->nat_tree_lock);
}
-static void build_free_nids(struct f2fs_sb_info *sbi)
+static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
- struct f2fs_summary_block *sum = curseg->sum_blk;
+ struct f2fs_journal *journal = curseg->journal;
int i = 0;
nid_t nid = nm_i->next_scan_nid;
+ if (unlikely(nid >= nm_i->max_nid))
+ nid = 0;
+
/* Enough entries */
- if (nm_i->fcnt > NAT_ENTRY_PER_BLOCK)
+ if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK)
return;
+ if (!sync && !available_free_memory(sbi, FREE_NIDS))
+ return;
+
+ if (!mount) {
+ /* try to find free nids in free_nid_bitmap */
+ scan_free_nid_bits(sbi);
+
+ if (nm_i->nid_cnt[FREE_NID_LIST])
+ return;
+ }
+
/* readahead nat pages to be scanned */
- ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT);
+ ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES,
+ META_NAT, true);
+
+ down_read(&nm_i->nat_tree_lock);
while (1) {
struct page *page = get_current_nat_page(sbi, nid);
@@ -1504,7 +1998,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
if (unlikely(nid >= nm_i->max_nid))
nid = 0;
- if (i++ == FREE_NID_PAGES)
+ if (++i >= FREE_NID_PAGES)
break;
}
@@ -1512,16 +2006,29 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
nm_i->next_scan_nid = nid;
/* find free nids from current sum_pages */
- mutex_lock(&curseg->curseg_mutex);
- for (i = 0; i < nats_in_cursum(sum); i++) {
- block_t addr = le32_to_cpu(nat_in_journal(sum, i).block_addr);
- nid = le32_to_cpu(nid_in_journal(sum, i));
+ down_read(&curseg->journal_rwsem);
+ for (i = 0; i < nats_in_cursum(journal); i++) {
+ block_t addr;
+
+ addr = le32_to_cpu(nat_in_journal(journal, i).block_addr);
+ nid = le32_to_cpu(nid_in_journal(journal, i));
if (addr == NULL_ADDR)
add_free_nid(sbi, nid, true);
else
- remove_free_nid(nm_i, nid);
+ remove_free_nid(sbi, nid);
}
- mutex_unlock(&curseg->curseg_mutex);
+ up_read(&curseg->journal_rwsem);
+ up_read(&nm_i->nat_tree_lock);
+
+ ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid),
+ nm_i->ra_nid_pages, META_NAT, false);
+}
+
+void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount)
+{
+ mutex_lock(&NM_I(sbi)->build_lock);
+ __build_free_nids(sbi, sync, mount);
+ mutex_unlock(&NM_I(sbi)->build_lock);
}
/*
@@ -1534,31 +2041,40 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *i = NULL;
retry:
- if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids))
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (time_to_inject(sbi, FAULT_ALLOC_NID)) {
+ f2fs_show_injection_info(FAULT_ALLOC_NID);
return false;
+ }
+#endif
+ spin_lock(&nm_i->nid_list_lock);
- spin_lock(&nm_i->free_nid_list_lock);
+ if (unlikely(nm_i->available_nids == 0)) {
+ spin_unlock(&nm_i->nid_list_lock);
+ return false;
+ }
/* We should not use stale free nids created by build_free_nids */
- if (nm_i->fcnt && !on_build_free_nids(nm_i)) {
- f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list));
- list_for_each_entry(i, &nm_i->free_nid_list, list)
- if (i->state == NID_NEW)
- break;
-
- f2fs_bug_on(sbi, i->state != NID_NEW);
+ if (nm_i->nid_cnt[FREE_NID_LIST] && !on_build_free_nids(nm_i)) {
+ f2fs_bug_on(sbi, list_empty(&nm_i->nid_list[FREE_NID_LIST]));
+ i = list_first_entry(&nm_i->nid_list[FREE_NID_LIST],
+ struct free_nid, list);
*nid = i->nid;
+
+ __remove_nid_from_list(sbi, i, FREE_NID_LIST, true);
i->state = NID_ALLOC;
- nm_i->fcnt--;
- spin_unlock(&nm_i->free_nid_list_lock);
+ __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false);
+ nm_i->available_nids--;
+
+ update_free_nid_bitmap(sbi, *nid, false, false);
+
+ spin_unlock(&nm_i->nid_list_lock);
return true;
}
- spin_unlock(&nm_i->free_nid_list_lock);
+ spin_unlock(&nm_i->nid_list_lock);
/* Let's scan nat pages and its caches to get free nids */
- mutex_lock(&nm_i->build_lock);
- build_free_nids(sbi);
- mutex_unlock(&nm_i->build_lock);
+ build_free_nids(sbi, true, false);
goto retry;
}
@@ -1570,11 +2086,11 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *i;
- spin_lock(&nm_i->free_nid_list_lock);
+ spin_lock(&nm_i->nid_list_lock);
i = __lookup_free_nid_list(nm_i, nid);
- f2fs_bug_on(sbi, !i || i->state != NID_ALLOC);
- __del_from_free_nid_list(nm_i, i);
- spin_unlock(&nm_i->free_nid_list_lock);
+ f2fs_bug_on(sbi, !i);
+ __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false);
+ spin_unlock(&nm_i->nid_list_lock);
kmem_cache_free(free_nid_slab, i);
}
@@ -1591,22 +2107,58 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
if (!nid)
return;
- spin_lock(&nm_i->free_nid_list_lock);
+ spin_lock(&nm_i->nid_list_lock);
i = __lookup_free_nid_list(nm_i, nid);
- f2fs_bug_on(sbi, !i || i->state != NID_ALLOC);
+ f2fs_bug_on(sbi, !i);
+
if (!available_free_memory(sbi, FREE_NIDS)) {
- __del_from_free_nid_list(nm_i, i);
+ __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false);
need_free = true;
} else {
+ __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, true);
i->state = NID_NEW;
- nm_i->fcnt++;
+ __insert_nid_to_list(sbi, i, FREE_NID_LIST, false);
}
- spin_unlock(&nm_i->free_nid_list_lock);
+
+ nm_i->available_nids++;
+
+ update_free_nid_bitmap(sbi, nid, true, false);
+
+ spin_unlock(&nm_i->nid_list_lock);
if (need_free)
kmem_cache_free(free_nid_slab, i);
}
+int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ struct free_nid *i, *next;
+ int nr = nr_shrink;
+
+ if (nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS)
+ return 0;
+
+ if (!mutex_trylock(&nm_i->build_lock))
+ return 0;
+
+ spin_lock(&nm_i->nid_list_lock);
+ list_for_each_entry_safe(i, next, &nm_i->nid_list[FREE_NID_LIST],
+ list) {
+ if (nr_shrink <= 0 ||
+ nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS)
+ break;
+
+ __remove_nid_from_list(sbi, i, FREE_NID_LIST, false);
+ kmem_cache_free(free_nid_slab, i);
+ nr_shrink--;
+ }
+ spin_unlock(&nm_i->nid_list_lock);
+ mutex_unlock(&nm_i->build_lock);
+
+ return nr - nr_shrink;
+}
+
void recover_inline_xattr(struct inode *inode, struct page *page)
{
void *src_addr, *dst_addr;
@@ -1619,7 +2171,7 @@ void recover_inline_xattr(struct inode *inode, struct page *page)
ri = F2FS_INODE(page);
if (!(ri->i_inline & F2FS_INLINE_XATTR)) {
- clear_inode_flag(F2FS_I(inode), FI_INLINE_XATTR);
+ clear_inode_flag(inode, FI_INLINE_XATTR);
goto update_inode;
}
@@ -1627,25 +2179,25 @@ void recover_inline_xattr(struct inode *inode, struct page *page)
src_addr = inline_xattr_addr(page);
inline_size = inline_xattr_size(inode);
- f2fs_wait_on_page_writeback(ipage, NODE);
+ f2fs_wait_on_page_writeback(ipage, NODE, true);
memcpy(dst_addr, src_addr, inline_size);
update_inode:
update_inode(inode, ipage);
f2fs_put_page(ipage, 1);
}
-void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
+int recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid;
nid_t new_xnid = nid_of_node(page);
struct node_info ni;
+ struct page *xpage;
- /* 1: invalidate the previous xattr nid */
if (!prev_xnid)
goto recover_xnid;
- /* Deallocate node address */
+ /* 1: invalidate the previous xattr nid */
get_node_info(sbi, prev_xnid, &ni);
f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR);
invalidate_blocks(sbi, ni.blk_addr);
@@ -1653,21 +2205,27 @@ void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
set_node_addr(sbi, &ni, NULL_ADDR, false);
recover_xnid:
- /* 2: allocate new xattr nid */
+ /* 2: update xattr nid in inode */
+ remove_free_nid(sbi, new_xnid);
+ f2fs_i_xnid_write(inode, new_xnid);
if (unlikely(!inc_valid_node_count(sbi, inode)))
f2fs_bug_on(sbi, 1);
+ update_inode_page(inode);
+
+ /* 3: update and set xattr node page dirty */
+ xpage = grab_cache_page(NODE_MAPPING(sbi), new_xnid);
+ if (!xpage)
+ return -ENOMEM;
+
+ memcpy(F2FS_NODE(xpage), F2FS_NODE(page), PAGE_SIZE);
- remove_free_nid(NM_I(sbi), new_xnid);
get_node_info(sbi, new_xnid, &ni);
ni.ino = inode->i_ino;
set_node_addr(sbi, &ni, NEW_ADDR, false);
- F2FS_I(inode)->i_xattr_nid = new_xnid;
+ set_page_dirty(xpage);
+ f2fs_put_page(xpage, 1);
- /* 3: update xattr blkaddr */
- refresh_sit_entry(sbi, NEW_ADDR, blkaddr);
- set_node_addr(sbi, &ni, blkaddr, false);
-
- update_inode_page(inode);
+ return 0;
}
int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
@@ -1681,15 +2239,18 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
if (unlikely(old_ni.blk_addr != NULL_ADDR))
return -EINVAL;
-
- ipage = grab_cache_page(NODE_MAPPING(sbi), ino);
- if (!ipage)
- return -ENOMEM;
+retry:
+ ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false);
+ if (!ipage) {
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ goto retry;
+ }
/* Should not use this inode from free nid list */
- remove_free_nid(NM_I(sbi), ino);
+ remove_free_nid(sbi, ino);
- SetPageUptodate(ipage);
+ if (!PageUptodate(ipage))
+ SetPageUptodate(ipage);
fill_node_footer(ipage, ino, ino, 0, true);
src = F2FS_INODE(page);
@@ -1714,114 +2275,79 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
return 0;
}
-/*
- * ra_sum_pages() merge contiguous pages into one bio and submit.
- * these pre-read pages are allocated in bd_inode's mapping tree.
- */
-static int ra_sum_pages(struct f2fs_sb_info *sbi, struct page **pages,
- int start, int nrpages)
-{
- struct inode *inode = sbi->sb->s_bdev->bd_inode;
- struct address_space *mapping = inode->i_mapping;
- int i, page_idx = start;
- struct f2fs_io_info fio = {
- .type = META,
- .rw = READ_SYNC | REQ_META | REQ_PRIO
- };
-
- for (i = 0; page_idx < start + nrpages; page_idx++, i++) {
- /* alloc page in bd_inode for reading node summary info */
- pages[i] = grab_cache_page(mapping, page_idx);
- if (!pages[i])
- break;
- f2fs_submit_page_mbio(sbi, pages[i], page_idx, &fio);
- }
-
- f2fs_submit_merged_bio(sbi, META, READ);
- return i;
-}
-
int restore_node_summary(struct f2fs_sb_info *sbi,
unsigned int segno, struct f2fs_summary_block *sum)
{
struct f2fs_node *rn;
struct f2fs_summary *sum_entry;
- struct inode *inode = sbi->sb->s_bdev->bd_inode;
block_t addr;
- int bio_blocks = MAX_BIO_BLOCKS(sbi);
- struct page *pages[bio_blocks];
- int i, idx, last_offset, nrpages, err = 0;
+ int i, idx, last_offset, nrpages;
/* scan the node segment */
last_offset = sbi->blocks_per_seg;
addr = START_BLOCK(sbi, segno);
sum_entry = &sum->entries[0];
- for (i = 0; !err && i < last_offset; i += nrpages, addr += nrpages) {
- nrpages = min(last_offset - i, bio_blocks);
+ for (i = 0; i < last_offset; i += nrpages, addr += nrpages) {
+ nrpages = min(last_offset - i, BIO_MAX_PAGES);
/* readahead node pages */
- nrpages = ra_sum_pages(sbi, pages, addr, nrpages);
- if (!nrpages)
- return -ENOMEM;
-
- for (idx = 0; idx < nrpages; idx++) {
- if (err)
- goto skip;
-
- lock_page(pages[idx]);
- if (unlikely(!PageUptodate(pages[idx]))) {
- err = -EIO;
- } else {
- rn = F2FS_NODE(pages[idx]);
- sum_entry->nid = rn->footer.nid;
- sum_entry->version = 0;
- sum_entry->ofs_in_node = 0;
- sum_entry++;
- }
- unlock_page(pages[idx]);
-skip:
- page_cache_release(pages[idx]);
+ ra_meta_pages(sbi, addr, nrpages, META_POR, true);
+
+ for (idx = addr; idx < addr + nrpages; idx++) {
+ struct page *page = get_tmp_page(sbi, idx);
+
+ rn = F2FS_NODE(page);
+ sum_entry->nid = rn->footer.nid;
+ sum_entry->version = 0;
+ sum_entry->ofs_in_node = 0;
+ sum_entry++;
+ f2fs_put_page(page, 1);
}
- invalidate_mapping_pages(inode->i_mapping, addr,
+ invalidate_mapping_pages(META_MAPPING(sbi), addr,
addr + nrpages);
}
- return err;
+ return 0;
}
static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
- struct f2fs_summary_block *sum = curseg->sum_blk;
+ struct f2fs_journal *journal = curseg->journal;
int i;
- mutex_lock(&curseg->curseg_mutex);
- for (i = 0; i < nats_in_cursum(sum); i++) {
+ down_write(&curseg->journal_rwsem);
+ for (i = 0; i < nats_in_cursum(journal); i++) {
struct nat_entry *ne;
struct f2fs_nat_entry raw_ne;
- nid_t nid = le32_to_cpu(nid_in_journal(sum, i));
+ nid_t nid = le32_to_cpu(nid_in_journal(journal, i));
- raw_ne = nat_in_journal(sum, i);
-retry:
- write_lock(&nm_i->nat_tree_lock);
- ne = __lookup_nat_cache(nm_i, nid);
- if (ne)
- goto found;
+ raw_ne = nat_in_journal(journal, i);
- ne = grab_nat_entry(nm_i, nid);
+ ne = __lookup_nat_cache(nm_i, nid);
if (!ne) {
- write_unlock(&nm_i->nat_tree_lock);
- goto retry;
+ ne = grab_nat_entry(nm_i, nid, true);
+ node_info_from_raw_nat(&ne->ni, &raw_ne);
+ }
+
+ /*
+ * if a free nat in journal has not been used after last
+ * checkpoint, we should remove it from available nids,
+ * since later we will add it again.
+ */
+ if (!get_nat_flag(ne, IS_DIRTY) &&
+ le32_to_cpu(raw_ne.block_addr) == NULL_ADDR) {
+ spin_lock(&nm_i->nid_list_lock);
+ nm_i->available_nids--;
+ spin_unlock(&nm_i->nid_list_lock);
}
- node_info_from_raw_nat(&ne->ni, &raw_ne);
-found:
+
__set_nat_cache_dirty(nm_i, ne);
- write_unlock(&nm_i->nat_tree_lock);
}
- update_nats_in_cursum(sum, -i);
- mutex_unlock(&curseg->curseg_mutex);
+ update_nats_in_cursum(journal, -i);
+ up_write(&curseg->journal_rwsem);
}
static void __adjust_nat_entry_set(struct nat_entry_set *nes,
@@ -1842,11 +2368,42 @@ add_out:
list_add_tail(&nes->set_list, head);
}
+static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
+ struct page *page)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ unsigned int nat_index = start_nid / NAT_ENTRY_PER_BLOCK;
+ struct f2fs_nat_block *nat_blk = page_address(page);
+ int valid = 0;
+ int i;
+
+ if (!enabled_nat_bits(sbi, NULL))
+ return;
+
+ for (i = 0; i < NAT_ENTRY_PER_BLOCK; i++) {
+ if (start_nid == 0 && i == 0)
+ valid++;
+ if (nat_blk->entries[i].block_addr)
+ valid++;
+ }
+ if (valid == 0) {
+ __set_bit_le(nat_index, nm_i->empty_nat_bits);
+ __clear_bit_le(nat_index, nm_i->full_nat_bits);
+ return;
+ }
+
+ __clear_bit_le(nat_index, nm_i->empty_nat_bits);
+ if (valid == NAT_ENTRY_PER_BLOCK)
+ __set_bit_le(nat_index, nm_i->full_nat_bits);
+ else
+ __clear_bit_le(nat_index, nm_i->full_nat_bits);
+}
+
static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
- struct nat_entry_set *set)
+ struct nat_entry_set *set, struct cp_control *cpc)
{
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
- struct f2fs_summary_block *sum = curseg->sum_blk;
+ struct f2fs_journal *journal = curseg->journal;
nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK;
bool to_journal = true;
struct f2fs_nat_block *nat_blk;
@@ -1858,11 +2415,12 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
* #1, flush nat entries to journal in current hot data summary block.
* #2, flush nat entries to nat page.
*/
- if (!__has_cursum_space(sum, set->entry_cnt, NAT_JOURNAL))
+ if (enabled_nat_bits(sbi, cpc) ||
+ !__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL))
to_journal = false;
if (to_journal) {
- mutex_lock(&curseg->curseg_mutex);
+ down_write(&curseg->journal_rwsem);
} else {
page = get_next_nat_page(sbi, start_nid);
nat_blk = page_address(page);
@@ -1879,30 +2437,38 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
continue;
if (to_journal) {
- offset = lookup_journal_in_cursum(sum,
+ offset = lookup_journal_in_cursum(journal,
NAT_JOURNAL, nid, 1);
f2fs_bug_on(sbi, offset < 0);
- raw_ne = &nat_in_journal(sum, offset);
- nid_in_journal(sum, offset) = cpu_to_le32(nid);
+ raw_ne = &nat_in_journal(journal, offset);
+ nid_in_journal(journal, offset) = cpu_to_le32(nid);
} else {
raw_ne = &nat_blk->entries[nid - start_nid];
}
raw_nat_from_node_info(raw_ne, &ne->ni);
-
- write_lock(&NM_I(sbi)->nat_tree_lock);
nat_reset_flag(ne);
- __clear_nat_cache_dirty(NM_I(sbi), ne);
- write_unlock(&NM_I(sbi)->nat_tree_lock);
-
- if (nat_get_blkaddr(ne) == NULL_ADDR)
+ __clear_nat_cache_dirty(NM_I(sbi), set, ne);
+ if (nat_get_blkaddr(ne) == NULL_ADDR) {
add_free_nid(sbi, nid, false);
+ spin_lock(&NM_I(sbi)->nid_list_lock);
+ NM_I(sbi)->available_nids++;
+ update_free_nid_bitmap(sbi, nid, true, false);
+ spin_unlock(&NM_I(sbi)->nid_list_lock);
+ } else {
+ spin_lock(&NM_I(sbi)->nid_list_lock);
+ update_free_nid_bitmap(sbi, nid, false, false);
+ spin_unlock(&NM_I(sbi)->nid_list_lock);
+ }
}
- if (to_journal)
- mutex_unlock(&curseg->curseg_mutex);
- else
+ if (to_journal) {
+ up_write(&curseg->journal_rwsem);
+ } else {
+ __update_nat_bits(sbi, start_nid, page);
f2fs_put_page(page, 1);
+ }
+ /* Allow dirty nats by node block allocation in write_begin */
if (!set->entry_cnt) {
radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
kmem_cache_free(nat_entry_set_slab, set);
@@ -1912,42 +2478,122 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
/*
* This function is called during the checkpointing process.
*/
-void flush_nat_entries(struct f2fs_sb_info *sbi)
+void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
- struct f2fs_summary_block *sum = curseg->sum_blk;
- struct nat_entry_set *setvec[NATVEC_SIZE];
+ struct f2fs_journal *journal = curseg->journal;
+ struct nat_entry_set *setvec[SETVEC_SIZE];
struct nat_entry_set *set, *tmp;
unsigned int found;
nid_t set_idx = 0;
LIST_HEAD(sets);
+ if (!nm_i->dirty_nat_cnt)
+ return;
+
+ down_write(&nm_i->nat_tree_lock);
+
/*
* if there are no enough space in journal to store dirty nat
* entries, remove all entries from journal and merge them
* into nat entry set.
*/
- if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL))
+ if (enabled_nat_bits(sbi, cpc) ||
+ !__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL))
remove_nats_in_journal(sbi);
- if (!nm_i->dirty_nat_cnt)
- return;
-
while ((found = __gang_lookup_nat_set(nm_i,
- set_idx, NATVEC_SIZE, setvec))) {
+ set_idx, SETVEC_SIZE, setvec))) {
unsigned idx;
set_idx = setvec[found - 1]->set + 1;
for (idx = 0; idx < found; idx++)
__adjust_nat_entry_set(setvec[idx], &sets,
- MAX_NAT_JENTRIES(sum));
+ MAX_NAT_JENTRIES(journal));
}
/* flush dirty nats in nat entry set */
list_for_each_entry_safe(set, tmp, &sets, set_list)
- __flush_nat_entry_set(sbi, set);
+ __flush_nat_entry_set(sbi, set, cpc);
+
+ up_write(&nm_i->nat_tree_lock);
+ /* Allow dirty nats by node block allocation in write_begin */
+}
+
+static int __get_nat_bitmaps(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ unsigned int nat_bits_bytes = nm_i->nat_blocks / BITS_PER_BYTE;
+ unsigned int i;
+ __u64 cp_ver = cur_cp_version(ckpt);
+ block_t nat_bits_addr;
+
+ if (!enabled_nat_bits(sbi, NULL))
+ return 0;
+
+ nm_i->nat_bits_blocks = F2FS_BYTES_TO_BLK((nat_bits_bytes << 1) + 8 +
+ F2FS_BLKSIZE - 1);
+ nm_i->nat_bits = kzalloc(nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS,
+ GFP_KERNEL);
+ if (!nm_i->nat_bits)
+ return -ENOMEM;
+
+ nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg -
+ nm_i->nat_bits_blocks;
+ for (i = 0; i < nm_i->nat_bits_blocks; i++) {
+ struct page *page = get_meta_page(sbi, nat_bits_addr++);
+
+ memcpy(nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS),
+ page_address(page), F2FS_BLKSIZE);
+ f2fs_put_page(page, 1);
+ }
+
+ cp_ver |= (cur_cp_crc(ckpt) << 32);
+ if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) {
+ disable_nat_bits(sbi, true);
+ return 0;
+ }
+
+ nm_i->full_nat_bits = nm_i->nat_bits + 8;
+ nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes;
+
+ f2fs_msg(sbi->sb, KERN_NOTICE, "Found nat_bits in checkpoint");
+ return 0;
+}
+
+inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ unsigned int i = 0;
+ nid_t nid, last_nid;
+
+ if (!enabled_nat_bits(sbi, NULL))
+ return;
- f2fs_bug_on(sbi, nm_i->dirty_nat_cnt);
+ for (i = 0; i < nm_i->nat_blocks; i++) {
+ i = find_next_bit_le(nm_i->empty_nat_bits, nm_i->nat_blocks, i);
+ if (i >= nm_i->nat_blocks)
+ break;
+
+ __set_bit_le(i, nm_i->nat_block_bitmap);
+
+ nid = i * NAT_ENTRY_PER_BLOCK;
+ last_nid = (i + 1) * NAT_ENTRY_PER_BLOCK;
+
+ spin_lock(&NM_I(sbi)->nid_list_lock);
+ for (; nid < last_nid; nid++)
+ update_free_nid_bitmap(sbi, nid, true, true);
+ spin_unlock(&NM_I(sbi)->nid_list_lock);
+ }
+
+ for (i = 0; i < nm_i->nat_blocks; i++) {
+ i = find_next_bit_le(nm_i->full_nat_bits, nm_i->nat_blocks, i);
+ if (i >= nm_i->nat_blocks)
+ break;
+
+ __set_bit_le(i, nm_i->nat_block_bitmap);
+ }
}
static int init_node_manager(struct f2fs_sb_info *sbi)
@@ -1955,31 +2601,36 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
struct f2fs_super_block *sb_raw = F2FS_RAW_SUPER(sbi);
struct f2fs_nm_info *nm_i = NM_I(sbi);
unsigned char *version_bitmap;
- unsigned int nat_segs, nat_blocks;
+ unsigned int nat_segs;
+ int err;
nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr);
/* segment_count_nat includes pair segment so divide to 2. */
nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1;
- nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg);
-
- nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks;
+ nm_i->nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg);
+ nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nm_i->nat_blocks;
/* not used nids: 0, node, meta, (and root counted as valid node) */
- nm_i->available_nids = nm_i->max_nid - F2FS_RESERVED_NODE_NUM;
- nm_i->fcnt = 0;
+ nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count -
+ F2FS_RESERVED_NODE_NUM;
+ nm_i->nid_cnt[FREE_NID_LIST] = 0;
+ nm_i->nid_cnt[ALLOC_NID_LIST] = 0;
nm_i->nat_cnt = 0;
nm_i->ram_thresh = DEF_RAM_THRESHOLD;
+ nm_i->ra_nid_pages = DEF_RA_NID_PAGES;
+ nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD;
INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
- INIT_LIST_HEAD(&nm_i->free_nid_list);
- INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC);
- INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_ATOMIC);
+ INIT_LIST_HEAD(&nm_i->nid_list[FREE_NID_LIST]);
+ INIT_LIST_HEAD(&nm_i->nid_list[ALLOC_NID_LIST]);
+ INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO);
+ INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO);
INIT_LIST_HEAD(&nm_i->nat_entries);
mutex_init(&nm_i->build_lock);
- spin_lock_init(&nm_i->free_nid_list_lock);
- rwlock_init(&nm_i->nat_tree_lock);
+ spin_lock_init(&nm_i->nid_list_lock);
+ init_rwsem(&nm_i->nat_tree_lock);
nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
@@ -1991,6 +2642,39 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
GFP_KERNEL);
if (!nm_i->nat_bitmap)
return -ENOMEM;
+
+ err = __get_nat_bitmaps(sbi);
+ if (err)
+ return err;
+
+#ifdef CONFIG_F2FS_CHECK_FS
+ nm_i->nat_bitmap_mir = kmemdup(version_bitmap, nm_i->bitmap_size,
+ GFP_KERNEL);
+ if (!nm_i->nat_bitmap_mir)
+ return -ENOMEM;
+#endif
+
+ return 0;
+}
+
+static int init_free_nid_cache(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+
+ nm_i->free_nid_bitmap = f2fs_kvzalloc(nm_i->nat_blocks *
+ NAT_ENTRY_BITMAP_SIZE, GFP_KERNEL);
+ if (!nm_i->free_nid_bitmap)
+ return -ENOMEM;
+
+ nm_i->nat_block_bitmap = f2fs_kvzalloc(nm_i->nat_blocks / 8,
+ GFP_KERNEL);
+ if (!nm_i->nat_block_bitmap)
+ return -ENOMEM;
+
+ nm_i->free_nid_count = f2fs_kvzalloc(nm_i->nat_blocks *
+ sizeof(unsigned short), GFP_KERNEL);
+ if (!nm_i->free_nid_count)
+ return -ENOMEM;
return 0;
}
@@ -2006,7 +2690,14 @@ int build_node_manager(struct f2fs_sb_info *sbi)
if (err)
return err;
- build_free_nids(sbi);
+ err = init_free_nid_cache(sbi);
+ if (err)
+ return err;
+
+ /* load free nid status from nat_bits table */
+ load_free_nid_bitmap(sbi);
+
+ build_free_nids(sbi, true, true);
return 0;
}
@@ -2015,6 +2706,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *i, *next_i;
struct nat_entry *natvec[NATVEC_SIZE];
+ struct nat_entry_set *setvec[SETVEC_SIZE];
nid_t nid = 0;
unsigned int found;
@@ -2022,31 +2714,56 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
return;
/* destroy free nid list */
- spin_lock(&nm_i->free_nid_list_lock);
- list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) {
- f2fs_bug_on(sbi, i->state == NID_ALLOC);
- __del_from_free_nid_list(nm_i, i);
- nm_i->fcnt--;
- spin_unlock(&nm_i->free_nid_list_lock);
+ spin_lock(&nm_i->nid_list_lock);
+ list_for_each_entry_safe(i, next_i, &nm_i->nid_list[FREE_NID_LIST],
+ list) {
+ __remove_nid_from_list(sbi, i, FREE_NID_LIST, false);
+ spin_unlock(&nm_i->nid_list_lock);
kmem_cache_free(free_nid_slab, i);
- spin_lock(&nm_i->free_nid_list_lock);
+ spin_lock(&nm_i->nid_list_lock);
}
- f2fs_bug_on(sbi, nm_i->fcnt);
- spin_unlock(&nm_i->free_nid_list_lock);
+ f2fs_bug_on(sbi, nm_i->nid_cnt[FREE_NID_LIST]);
+ f2fs_bug_on(sbi, nm_i->nid_cnt[ALLOC_NID_LIST]);
+ f2fs_bug_on(sbi, !list_empty(&nm_i->nid_list[ALLOC_NID_LIST]));
+ spin_unlock(&nm_i->nid_list_lock);
/* destroy nat cache */
- write_lock(&nm_i->nat_tree_lock);
+ down_write(&nm_i->nat_tree_lock);
while ((found = __gang_lookup_nat_cache(nm_i,
nid, NATVEC_SIZE, natvec))) {
unsigned idx;
+
nid = nat_get_nid(natvec[found - 1]) + 1;
for (idx = 0; idx < found; idx++)
__del_from_nat_cache(nm_i, natvec[idx]);
}
f2fs_bug_on(sbi, nm_i->nat_cnt);
- write_unlock(&nm_i->nat_tree_lock);
+
+ /* destroy nat set cache */
+ nid = 0;
+ while ((found = __gang_lookup_nat_set(nm_i,
+ nid, SETVEC_SIZE, setvec))) {
+ unsigned idx;
+
+ nid = setvec[found - 1]->set + 1;
+ for (idx = 0; idx < found; idx++) {
+ /* entry_cnt is not zero, when cp_error was occurred */
+ f2fs_bug_on(sbi, !list_empty(&setvec[idx]->entry_list));
+ radix_tree_delete(&nm_i->nat_set_root, setvec[idx]->set);
+ kmem_cache_free(nat_entry_set_slab, setvec[idx]);
+ }
+ }
+ up_write(&nm_i->nat_tree_lock);
+
+ kvfree(nm_i->nat_block_bitmap);
+ kvfree(nm_i->free_nid_bitmap);
+ kvfree(nm_i->free_nid_count);
kfree(nm_i->nat_bitmap);
+ kfree(nm_i->nat_bits);
+#ifdef CONFIG_F2FS_CHECK_FS
+ kfree(nm_i->nat_bitmap_mir);
+#endif
sbi->nm_info = NULL;
kfree(nm_i);
}
@@ -2061,17 +2778,17 @@ int __init create_node_manager_caches(void)
free_nid_slab = f2fs_kmem_cache_create("free_nid",
sizeof(struct free_nid));
if (!free_nid_slab)
- goto destory_nat_entry;
+ goto destroy_nat_entry;
nat_entry_set_slab = f2fs_kmem_cache_create("nat_entry_set",
sizeof(struct nat_entry_set));
if (!nat_entry_set_slab)
- goto destory_free_nid;
+ goto destroy_free_nid;
return 0;
-destory_free_nid:
+destroy_free_nid:
kmem_cache_destroy(free_nid_slab);
-destory_nat_entry:
+destroy_nat_entry:
kmem_cache_destroy(nat_entry_slab);
fail:
return -ENOMEM;
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 8d5e6e0dd840..558048e33cf9 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -9,26 +9,43 @@
* published by the Free Software Foundation.
*/
/* start node id of a node block dedicated to the given node id */
-#define START_NID(nid) ((nid / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK)
+#define START_NID(nid) (((nid) / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK)
/* node block offset on the NAT area dedicated to the given start node id */
-#define NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK)
+#define NAT_BLOCK_OFFSET(start_nid) ((start_nid) / NAT_ENTRY_PER_BLOCK)
-/* # of pages to perform readahead before building free nids */
-#define FREE_NID_PAGES 4
+/* # of pages to perform synchronous readahead before building free nids */
+#define FREE_NID_PAGES 8
+#define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES)
+
+#define DEF_RA_NID_PAGES 0 /* # of nid pages to be readaheaded */
/* maximum readahead size for node during getting data blocks */
#define MAX_RA_NODE 128
/* control the memory footprint threshold (10MB per 1GB ram) */
-#define DEF_RAM_THRESHOLD 10
+#define DEF_RAM_THRESHOLD 1
+
+/* control dirty nats ratio threshold (default: 10% over max nid count) */
+#define DEF_DIRTY_NAT_RATIO_THRESHOLD 10
+/* control total # of nats */
+#define DEF_NAT_CACHE_THRESHOLD 100000
/* vector size for gang look-up from nat cache that consists of radix tree */
#define NATVEC_SIZE 64
+#define SETVEC_SIZE 32
/* return value for read_node_page */
#define LOCKED_PAGE 1
+/* For flag in struct node_info */
+enum {
+ IS_CHECKPOINTED, /* is it checkpointed before? */
+ HAS_FSYNCED_INODE, /* is the inode fsynced before? */
+ HAS_LAST_FSYNC, /* has the latest node fsync mark? */
+ IS_DIRTY, /* this nat entry is dirty? */
+};
+
/*
* For node information
*/
@@ -37,46 +54,49 @@ struct node_info {
nid_t ino; /* inode number of the node's owner */
block_t blk_addr; /* block address of the node */
unsigned char version; /* version of the node */
-};
-
-enum {
- IS_CHECKPOINTED, /* is it checkpointed before? */
- HAS_FSYNCED_INODE, /* is the inode fsynced before? */
- HAS_LAST_FSYNC, /* has the latest node fsync mark? */
- IS_DIRTY, /* this nat entry is dirty? */
+ unsigned char flag; /* for node information bits */
};
struct nat_entry {
struct list_head list; /* for clean or dirty nat list */
- unsigned char flag; /* for node information bits */
struct node_info ni; /* in-memory node information */
};
-#define nat_get_nid(nat) (nat->ni.nid)
-#define nat_set_nid(nat, n) (nat->ni.nid = n)
-#define nat_get_blkaddr(nat) (nat->ni.blk_addr)
-#define nat_set_blkaddr(nat, b) (nat->ni.blk_addr = b)
-#define nat_get_ino(nat) (nat->ni.ino)
-#define nat_set_ino(nat, i) (nat->ni.ino = i)
-#define nat_get_version(nat) (nat->ni.version)
-#define nat_set_version(nat, v) (nat->ni.version = v)
+#define nat_get_nid(nat) ((nat)->ni.nid)
+#define nat_set_nid(nat, n) ((nat)->ni.nid = (n))
+#define nat_get_blkaddr(nat) ((nat)->ni.blk_addr)
+#define nat_set_blkaddr(nat, b) ((nat)->ni.blk_addr = (b))
+#define nat_get_ino(nat) ((nat)->ni.ino)
+#define nat_set_ino(nat, i) ((nat)->ni.ino = (i))
+#define nat_get_version(nat) ((nat)->ni.version)
+#define nat_set_version(nat, v) ((nat)->ni.version = (v))
-#define inc_node_version(version) (++version)
+#define inc_node_version(version) (++(version))
+
+static inline void copy_node_info(struct node_info *dst,
+ struct node_info *src)
+{
+ dst->nid = src->nid;
+ dst->ino = src->ino;
+ dst->blk_addr = src->blk_addr;
+ dst->version = src->version;
+ /* should not copy flag here */
+}
static inline void set_nat_flag(struct nat_entry *ne,
unsigned int type, bool set)
{
unsigned char mask = 0x01 << type;
if (set)
- ne->flag |= mask;
+ ne->ni.flag |= mask;
else
- ne->flag &= ~mask;
+ ne->ni.flag &= ~mask;
}
static inline bool get_nat_flag(struct nat_entry *ne, unsigned int type)
{
unsigned char mask = 0x01 << type;
- return ne->flag & mask;
+ return ne->ni.flag & mask;
}
static inline void nat_reset_flag(struct nat_entry *ne)
@@ -103,10 +123,24 @@ static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne,
raw_ne->version = ni->version;
}
+static inline bool excess_dirty_nats(struct f2fs_sb_info *sbi)
+{
+ return NM_I(sbi)->dirty_nat_cnt >= NM_I(sbi)->max_nid *
+ NM_I(sbi)->dirty_nats_ratio / 100;
+}
+
+static inline bool excess_cached_nats(struct f2fs_sb_info *sbi)
+{
+ return NM_I(sbi)->nat_cnt >= DEF_NAT_CACHE_THRESHOLD;
+}
+
enum mem_type {
FREE_NIDS, /* indicates the free nid list */
NAT_ENTRIES, /* indicates the cached nat entry */
- DIRTY_DENTS /* indicates dirty dentry pages */
+ DIRTY_DENTS, /* indicates dirty dentry pages */
+ INO_ENTRIES, /* indicates inode entries */
+ EXTENT_CACHE, /* indicates extent cache */
+ BASE_CHECK, /* check kernel status */
};
struct nat_entry_set {
@@ -135,14 +169,15 @@ static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid)
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *fnid;
- spin_lock(&nm_i->free_nid_list_lock);
- if (nm_i->fcnt <= 0) {
- spin_unlock(&nm_i->free_nid_list_lock);
+ spin_lock(&nm_i->nid_list_lock);
+ if (nm_i->nid_cnt[FREE_NID_LIST] <= 0) {
+ spin_unlock(&nm_i->nid_list_lock);
return;
}
- fnid = list_entry(nm_i->free_nid_list.next, struct free_nid, list);
+ fnid = list_first_entry(&nm_i->nid_list[FREE_NID_LIST],
+ struct free_nid, list);
*nid = fnid->nid;
- spin_unlock(&nm_i->free_nid_list_lock);
+ spin_unlock(&nm_i->nid_list_lock);
}
/*
@@ -151,6 +186,12 @@ static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid)
static inline void get_nat_bitmap(struct f2fs_sb_info *sbi, void *addr)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
+
+#ifdef CONFIG_F2FS_CHECK_FS
+ if (memcmp(nm_i->nat_bitmap, nm_i->nat_bitmap_mir,
+ nm_i->bitmap_size))
+ f2fs_bug_on(sbi, 1);
+#endif
memcpy(addr, nm_i->nat_bitmap, nm_i->bitmap_size);
}
@@ -159,14 +200,17 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start)
struct f2fs_nm_info *nm_i = NM_I(sbi);
pgoff_t block_off;
pgoff_t block_addr;
- int seg_off;
+ /*
+ * block_off = segment_off * 512 + off_in_segment
+ * OLD = (segment_off * 512) * 2 + off_in_segment
+ * NEW = 2 * (segment_off * 512 + off_in_segment) - off_in_segment
+ */
block_off = NAT_BLOCK_OFFSET(start);
- seg_off = block_off >> sbi->log_blocks_per_seg;
block_addr = (pgoff_t)(nm_i->nat_blkaddr +
- (seg_off << sbi->log_blocks_per_seg << 1) +
- (block_off & ((1 << sbi->log_blocks_per_seg) - 1)));
+ (block_off << 1) -
+ (block_off & (sbi->blocks_per_seg - 1)));
if (f2fs_test_bit(block_off, nm_i->nat_bitmap))
block_addr += sbi->blocks_per_seg;
@@ -192,37 +236,10 @@ static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid)
{
unsigned int block_off = NAT_BLOCK_OFFSET(start_nid);
- if (f2fs_test_bit(block_off, nm_i->nat_bitmap))
- f2fs_clear_bit(block_off, nm_i->nat_bitmap);
- else
- f2fs_set_bit(block_off, nm_i->nat_bitmap);
-}
-
-static inline void fill_node_footer(struct page *page, nid_t nid,
- nid_t ino, unsigned int ofs, bool reset)
-{
- struct f2fs_node *rn = F2FS_NODE(page);
- if (reset)
- memset(rn, 0, sizeof(*rn));
- rn->footer.nid = cpu_to_le32(nid);
- rn->footer.ino = cpu_to_le32(ino);
- rn->footer.flag = cpu_to_le32(ofs << OFFSET_BIT_SHIFT);
-}
-
-static inline void copy_node_footer(struct page *dst, struct page *src)
-{
- struct f2fs_node *src_rn = F2FS_NODE(src);
- struct f2fs_node *dst_rn = F2FS_NODE(dst);
- memcpy(&dst_rn->footer, &src_rn->footer, sizeof(struct node_footer));
-}
-
-static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr)
-{
- struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page));
- struct f2fs_node *rn = F2FS_NODE(page);
-
- rn->footer.cp_ver = ckpt->checkpoint_ver;
- rn->footer.next_blkaddr = cpu_to_le32(blkaddr);
+ f2fs_change_bit(block_off, nm_i->nat_bitmap);
+#ifdef CONFIG_F2FS_CHECK_FS
+ f2fs_change_bit(block_off, nm_i->nat_bitmap_mir);
+#endif
}
static inline nid_t ino_of_node(struct page *node_page)
@@ -244,7 +261,7 @@ static inline unsigned int ofs_of_node(struct page *node_page)
return flag >> OFFSET_BIT_SHIFT;
}
-static inline unsigned long long cpver_of_node(struct page *node_page)
+static inline __u64 cpver_of_node(struct page *node_page)
{
struct f2fs_node *rn = F2FS_NODE(node_page);
return le64_to_cpu(rn->footer.cp_ver);
@@ -256,6 +273,56 @@ static inline block_t next_blkaddr_of_node(struct page *node_page)
return le32_to_cpu(rn->footer.next_blkaddr);
}
+static inline void fill_node_footer(struct page *page, nid_t nid,
+ nid_t ino, unsigned int ofs, bool reset)
+{
+ struct f2fs_node *rn = F2FS_NODE(page);
+ unsigned int old_flag = 0;
+
+ if (reset)
+ memset(rn, 0, sizeof(*rn));
+ else
+ old_flag = le32_to_cpu(rn->footer.flag);
+
+ rn->footer.nid = cpu_to_le32(nid);
+ rn->footer.ino = cpu_to_le32(ino);
+
+ /* should remain old flag bits such as COLD_BIT_SHIFT */
+ rn->footer.flag = cpu_to_le32((ofs << OFFSET_BIT_SHIFT) |
+ (old_flag & OFFSET_BIT_MASK));
+}
+
+static inline void copy_node_footer(struct page *dst, struct page *src)
+{
+ struct f2fs_node *src_rn = F2FS_NODE(src);
+ struct f2fs_node *dst_rn = F2FS_NODE(dst);
+ memcpy(&dst_rn->footer, &src_rn->footer, sizeof(struct node_footer));
+}
+
+static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr)
+{
+ struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page));
+ struct f2fs_node *rn = F2FS_NODE(page);
+ __u64 cp_ver = cur_cp_version(ckpt);
+
+ if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG))
+ cp_ver |= (cur_cp_crc(ckpt) << 32);
+
+ rn->footer.cp_ver = cpu_to_le64(cp_ver);
+ rn->footer.next_blkaddr = cpu_to_le32(blkaddr);
+}
+
+static inline bool is_recoverable_dnode(struct page *page)
+{
+ struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page));
+ __u64 cp_ver = cur_cp_version(ckpt);
+
+ if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG))
+ cp_ver |= (cur_cp_crc(ckpt) << 32);
+
+ return cp_ver == cpver_of_node(page);
+}
+
/*
* f2fs assigns the following node offsets described as (num).
* N = NIDS_PER_BLOCK
@@ -282,7 +349,7 @@ static inline bool IS_DNODE(struct page *node_page)
unsigned int ofs = ofs_of_node(node_page);
if (f2fs_has_xattr_block(ofs))
- return false;
+ return true;
if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK ||
ofs == 5 + 2 * NIDS_PER_BLOCK)
@@ -295,17 +362,17 @@ static inline bool IS_DNODE(struct page *node_page)
return true;
}
-static inline void set_nid(struct page *p, int off, nid_t nid, bool i)
+static inline int set_nid(struct page *p, int off, nid_t nid, bool i)
{
struct f2fs_node *rn = F2FS_NODE(p);
- f2fs_wait_on_page_writeback(p, NODE);
+ f2fs_wait_on_page_writeback(p, NODE, true);
if (i)
rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid);
else
rn->in.nid[off] = cpu_to_le32(nid);
- set_page_dirty(p);
+ return set_page_dirty(p);
}
static inline nid_t get_nid(struct page *p, int off, bool i)
@@ -323,28 +390,6 @@ static inline nid_t get_nid(struct page *p, int off, bool i)
* - Mark cold node blocks in their node footer
* - Mark cold data pages in page cache
*/
-static inline int is_file(struct inode *inode, int type)
-{
- return F2FS_I(inode)->i_advise & type;
-}
-
-static inline void set_file(struct inode *inode, int type)
-{
- F2FS_I(inode)->i_advise |= type;
-}
-
-static inline void clear_file(struct inode *inode, int type)
-{
- F2FS_I(inode)->i_advise &= ~type;
-}
-
-#define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT)
-#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT)
-#define file_set_cold(inode) set_file(inode, FADVISE_COLD_BIT)
-#define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT)
-#define file_clear_cold(inode) clear_file(inode, FADVISE_COLD_BIT)
-#define file_got_pino(inode) clear_file(inode, FADVISE_LOST_PINO_BIT)
-
static inline int is_cold_data(struct page *page)
{
return PageChecked(page);
@@ -370,6 +415,21 @@ static inline int is_node(struct page *page, int type)
#define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT)
#define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT)
+static inline int is_inline_node(struct page *page)
+{
+ return PageChecked(page);
+}
+
+static inline void set_inline_node(struct page *page)
+{
+ SetPageChecked(page);
+}
+
+static inline void clear_inline_node(struct page *page)
+{
+ ClearPageChecked(page);
+}
+
static inline void set_cold_node(struct inode *inode, struct page *page)
{
struct f2fs_node *rn = F2FS_NODE(page);
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index ebd013225788..20147d297f89 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -49,8 +49,9 @@ static struct kmem_cache *fsync_entry_slab;
bool space_for_roll_forward(struct f2fs_sb_info *sbi)
{
- if (sbi->last_valid_block_count + sbi->alloc_valid_block_count
- > sbi->user_block_count)
+ s64 nalloc = percpu_counter_sum_positive(&sbi->alloc_valid_block_count);
+
+ if (sbi->last_valid_block_count + nalloc > sbi->user_block_count)
return false;
return true;
}
@@ -67,38 +68,71 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
return NULL;
}
-static int recover_dentry(struct inode *inode, struct page *ipage)
+static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi,
+ struct list_head *head, nid_t ino)
+{
+ struct inode *inode;
+ struct fsync_inode_entry *entry;
+
+ inode = f2fs_iget_retry(sbi->sb, ino);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+
+ entry = f2fs_kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO);
+ entry->inode = inode;
+ list_add_tail(&entry->list, head);
+
+ return entry;
+}
+
+static void del_fsync_inode(struct fsync_inode_entry *entry)
+{
+ iput(entry->inode);
+ list_del(&entry->list);
+ kmem_cache_free(fsync_entry_slab, entry);
+}
+
+static int recover_dentry(struct inode *inode, struct page *ipage,
+ struct list_head *dir_list)
{
struct f2fs_inode *raw_inode = F2FS_INODE(ipage);
nid_t pino = le32_to_cpu(raw_inode->i_pino);
struct f2fs_dir_entry *de;
- struct qstr name;
+ struct fscrypt_name fname;
struct page *page;
struct inode *dir, *einode;
+ struct fsync_inode_entry *entry;
int err = 0;
-
- dir = f2fs_iget(inode->i_sb, pino);
- if (IS_ERR(dir)) {
- err = PTR_ERR(dir);
- goto out;
+ char *name;
+
+ entry = get_fsync_inode(dir_list, pino);
+ if (!entry) {
+ entry = add_fsync_inode(F2FS_I_SB(inode), dir_list, pino);
+ if (IS_ERR(entry)) {
+ dir = ERR_CAST(entry);
+ err = PTR_ERR(entry);
+ goto out;
+ }
}
- name.len = le32_to_cpu(raw_inode->i_namelen);
- name.name = raw_inode->i_name;
+ dir = entry->inode;
+
+ memset(&fname, 0, sizeof(struct fscrypt_name));
+ fname.disk_name.len = le32_to_cpu(raw_inode->i_namelen);
+ fname.disk_name.name = raw_inode->i_name;
- if (unlikely(name.len > F2FS_NAME_LEN)) {
+ if (unlikely(fname.disk_name.len > F2FS_NAME_LEN)) {
WARN_ON(1);
err = -ENAMETOOLONG;
- goto out_err;
+ goto out;
}
retry:
- de = f2fs_find_entry(dir, &name, &page);
- if (de && inode->i_ino == le32_to_cpu(de->ino)) {
- clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
+ de = __f2fs_find_entry(dir, &fname, &page);
+ if (de && inode->i_ino == le32_to_cpu(de->ino))
goto out_unmap_put;
- }
+
if (de) {
- einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino));
+ einode = f2fs_iget_retry(inode->i_sb, le32_to_cpu(de->ino));
if (IS_ERR(einode)) {
WARN_ON(1);
err = PTR_ERR(einode);
@@ -111,32 +145,30 @@ retry:
iput(einode);
goto out_unmap_put;
}
- f2fs_delete_entry(de, page, einode);
+ f2fs_delete_entry(de, page, dir, einode);
iput(einode);
goto retry;
- }
- err = __f2fs_add_link(dir, &name, inode);
- if (err)
- goto out_err;
-
- if (is_inode_flag_set(F2FS_I(dir), FI_DELAY_IPUT)) {
- iput(dir);
+ } else if (IS_ERR(page)) {
+ err = PTR_ERR(page);
} else {
- add_dirty_dir_inode(dir);
- set_inode_flag(F2FS_I(dir), FI_DELAY_IPUT);
+ err = __f2fs_do_add_link(dir, &fname, inode,
+ inode->i_ino, inode->i_mode);
}
-
+ if (err == -ENOMEM)
+ goto retry;
goto out;
out_unmap_put:
- kunmap(page);
+ f2fs_dentry_kunmap(dir, page);
f2fs_put_page(page, 0);
-out_err:
- iput(dir);
out:
+ if (file_enc_name(inode))
+ name = "<encrypted>";
+ else
+ name = raw_inode->i_name;
f2fs_msg(inode->i_sb, KERN_NOTICE,
"%s: ino = %x, name = %s, dir = %lx, err = %d",
- __func__, ino_of_node(ipage), raw_inode->i_name,
+ __func__, ino_of_node(ipage), name,
IS_ERR(dir) ? 0 : dir->i_ino, err);
return err;
}
@@ -144,23 +176,31 @@ out:
static void recover_inode(struct inode *inode, struct page *page)
{
struct f2fs_inode *raw = F2FS_INODE(page);
+ char *name;
inode->i_mode = le16_to_cpu(raw->i_mode);
- i_size_write(inode, le64_to_cpu(raw->i_size));
- inode->i_atime.tv_sec = le64_to_cpu(raw->i_mtime);
+ f2fs_i_size_write(inode, le64_to_cpu(raw->i_size));
+ inode->i_atime.tv_sec = le64_to_cpu(raw->i_atime);
inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime);
inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime);
- inode->i_atime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec);
+ inode->i_atime.tv_nsec = le32_to_cpu(raw->i_atime_nsec);
inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec);
inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec);
+ F2FS_I(inode)->i_advise = raw->i_advise;
+
+ if (file_enc_name(inode))
+ name = "<encrypted>";
+ else
+ name = F2FS_INODE(page)->i_name;
+
f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s",
- ino_of_node(page), F2FS_INODE(page)->i_name);
+ ino_of_node(page), name);
}
-static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
+static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head,
+ bool check_only)
{
- unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
struct curseg_info *curseg;
struct page *page = NULL;
block_t blkaddr;
@@ -173,60 +213,50 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
while (1) {
struct fsync_inode_entry *entry;
- if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi))
+ if (!is_valid_blkaddr(sbi, blkaddr, META_POR))
return 0;
- page = get_meta_page_ra(sbi, blkaddr);
+ page = get_tmp_page(sbi, blkaddr);
- if (cp_ver != cpver_of_node(page))
+ if (!is_recoverable_dnode(page))
break;
if (!is_fsync_dnode(page))
goto next;
entry = get_fsync_inode(head, ino_of_node(page));
- if (entry) {
- if (IS_INODE(page) && is_dent_dnode(page))
- set_inode_flag(F2FS_I(entry->inode),
- FI_INC_LINK);
- } else {
- if (IS_INODE(page) && is_dent_dnode(page)) {
+ if (!entry) {
+ if (!check_only &&
+ IS_INODE(page) && is_dent_dnode(page)) {
err = recover_inode_page(sbi, page);
if (err)
break;
}
- /* add this fsync inode to the list */
- entry = kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO);
- if (!entry) {
- err = -ENOMEM;
- break;
- }
/*
* CP | dnode(F) | inode(DF)
* For this case, we should not give up now.
*/
- entry->inode = f2fs_iget(sbi->sb, ino_of_node(page));
- if (IS_ERR(entry->inode)) {
- err = PTR_ERR(entry->inode);
- kmem_cache_free(fsync_entry_slab, entry);
- if (err == -ENOENT)
+ entry = add_fsync_inode(sbi, head, ino_of_node(page));
+ if (IS_ERR(entry)) {
+ err = PTR_ERR(entry);
+ if (err == -ENOENT) {
+ err = 0;
goto next;
+ }
break;
}
- list_add_tail(&entry->list, head);
}
entry->blkaddr = blkaddr;
- if (IS_INODE(page)) {
- entry->last_inode = blkaddr;
- if (is_dent_dnode(page))
- entry->last_dentry = blkaddr;
- }
+ if (IS_INODE(page) && is_dent_dnode(page))
+ entry->last_dentry = blkaddr;
next:
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
f2fs_put_page(page, 1);
+
+ ra_meta_pages_cond(sbi, blkaddr);
}
f2fs_put_page(page, 1);
return err;
@@ -236,11 +266,8 @@ static void destroy_fsync_dnodes(struct list_head *head)
{
struct fsync_inode_entry *entry, *tmp;
- list_for_each_entry_safe(entry, tmp, head, list) {
- iput(entry->inode);
- list_del(&entry->list);
- kmem_cache_free(fsync_entry_slab, entry);
- }
+ list_for_each_entry_safe(entry, tmp, head, list)
+ del_fsync_inode(entry);
}
static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
@@ -252,6 +279,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
struct f2fs_summary_block *sum_node;
struct f2fs_summary sum;
struct page *sum_page, *node_page;
+ struct dnode_of_data tdn = *dn;
nid_t ino, nid;
struct inode *inode;
unsigned int offset;
@@ -279,17 +307,15 @@ got_it:
/* Use the locked dnode page and inode */
nid = le32_to_cpu(sum.nid);
if (dn->inode->i_ino == nid) {
- struct dnode_of_data tdn = *dn;
tdn.nid = nid;
+ if (!dn->inode_page_locked)
+ lock_page(dn->inode_page);
tdn.node_page = dn->inode_page;
tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
- truncate_data_blocks_range(&tdn, 1);
- return 0;
+ goto truncate_out;
} else if (dn->nid == nid) {
- struct dnode_of_data tdn = *dn;
tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
- truncate_data_blocks_range(&tdn, 1);
- return 0;
+ goto truncate_out;
}
/* Get the node page */
@@ -303,46 +329,60 @@ got_it:
if (ino != dn->inode->i_ino) {
/* Deallocate previous index in the node page */
- inode = f2fs_iget(sbi->sb, ino);
+ inode = f2fs_iget_retry(sbi->sb, ino);
if (IS_ERR(inode))
return PTR_ERR(inode);
} else {
inode = dn->inode;
}
- bidx = start_bidx_of_node(offset, F2FS_I(inode)) +
- le16_to_cpu(sum.ofs_in_node);
+ bidx = start_bidx_of_node(offset, inode) + le16_to_cpu(sum.ofs_in_node);
- if (ino != dn->inode->i_ino) {
- truncate_hole(inode, bidx, bidx + 1);
+ /*
+ * if inode page is locked, unlock temporarily, but its reference
+ * count keeps alive.
+ */
+ if (ino == dn->inode->i_ino && dn->inode_page_locked)
+ unlock_page(dn->inode_page);
+
+ set_new_dnode(&tdn, inode, NULL, NULL, 0);
+ if (get_dnode_of_data(&tdn, bidx, LOOKUP_NODE))
+ goto out;
+
+ if (tdn.data_blkaddr == blkaddr)
+ truncate_data_blocks_range(&tdn, 1);
+
+ f2fs_put_dnode(&tdn);
+out:
+ if (ino != dn->inode->i_ino)
iput(inode);
- } else {
- struct dnode_of_data tdn;
- set_new_dnode(&tdn, inode, dn->inode_page, NULL, 0);
- if (get_dnode_of_data(&tdn, bidx, LOOKUP_NODE))
- return 0;
- if (tdn.data_blkaddr != NULL_ADDR)
- truncate_data_blocks_range(&tdn, 1);
- f2fs_put_page(tdn.node_page, 1);
- }
+ else if (dn->inode_page_locked)
+ lock_page(dn->inode_page);
+ return 0;
+
+truncate_out:
+ if (datablock_addr(tdn.node_page, tdn.ofs_in_node) == blkaddr)
+ truncate_data_blocks_range(&tdn, 1);
+ if (dn->inode->i_ino == nid && !dn->inode_page_locked)
+ unlock_page(dn->inode_page);
return 0;
}
static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
struct page *page, block_t blkaddr)
{
- struct f2fs_inode_info *fi = F2FS_I(inode);
- unsigned int start, end;
struct dnode_of_data dn;
- struct f2fs_summary sum;
struct node_info ni;
+ unsigned int start, end;
int err = 0, recovered = 0;
/* step 1: recover xattr */
if (IS_INODE(page)) {
recover_inline_xattr(inode, page);
} else if (f2fs_has_xattr_block(ofs_of_node(page))) {
- recover_xattr_data(inode, page, blkaddr);
+ err = recover_xattr_data(inode, page, blkaddr);
+ if (!err)
+ recovered++;
goto out;
}
@@ -351,99 +391,132 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
goto out;
/* step 3: recover data indices */
- start = start_bidx_of_node(ofs_of_node(page), fi);
- end = start + ADDRS_PER_PAGE(page, fi);
-
- f2fs_lock_op(sbi);
+ start = start_bidx_of_node(ofs_of_node(page), inode);
+ end = start + ADDRS_PER_PAGE(page, inode);
set_new_dnode(&dn, inode, NULL, NULL, 0);
-
+retry_dn:
err = get_dnode_of_data(&dn, start, ALLOC_NODE);
if (err) {
- f2fs_unlock_op(sbi);
+ if (err == -ENOMEM) {
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ goto retry_dn;
+ }
goto out;
}
- f2fs_wait_on_page_writeback(dn.node_page, NODE);
+ f2fs_wait_on_page_writeback(dn.node_page, NODE, true);
get_node_info(sbi, dn.nid, &ni);
f2fs_bug_on(sbi, ni.ino != ino_of_node(page));
f2fs_bug_on(sbi, ofs_of_node(dn.node_page) != ofs_of_node(page));
- for (; start < end; start++) {
+ for (; start < end; start++, dn.ofs_in_node++) {
block_t src, dest;
src = datablock_addr(dn.node_page, dn.ofs_in_node);
dest = datablock_addr(page, dn.ofs_in_node);
- if (src != dest && dest != NEW_ADDR && dest != NULL_ADDR) {
+ /* skip recovering if dest is the same as src */
+ if (src == dest)
+ continue;
+
+ /* dest is invalid, just invalidate src block */
+ if (dest == NULL_ADDR) {
+ truncate_data_blocks_range(&dn, 1);
+ continue;
+ }
+
+ if (!file_keep_isize(inode) &&
+ (i_size_read(inode) <= ((loff_t)start << PAGE_SHIFT)))
+ f2fs_i_size_write(inode,
+ (loff_t)(start + 1) << PAGE_SHIFT);
+
+ /*
+ * dest is reserved block, invalidate src block
+ * and then reserve one new block in dnode page.
+ */
+ if (dest == NEW_ADDR) {
+ truncate_data_blocks_range(&dn, 1);
+ reserve_new_block(&dn);
+ continue;
+ }
+
+ /* dest is valid block, try to recover from src to dest */
+ if (is_valid_blkaddr(sbi, dest, META_POR)) {
+
if (src == NULL_ADDR) {
err = reserve_new_block(&dn);
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ while (err)
+ err = reserve_new_block(&dn);
+#endif
/* We should not get -ENOSPC */
f2fs_bug_on(sbi, err);
+ if (err)
+ goto err;
}
-
+retry_prev:
/* Check the previous node page having this index */
err = check_index_in_prev_nodes(sbi, dest, &dn);
- if (err)
+ if (err) {
+ if (err == -ENOMEM) {
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ goto retry_prev;
+ }
goto err;
-
- set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
+ }
/* write dummy data page */
- recover_data_page(sbi, NULL, &sum, src, dest);
- update_extent_cache(dest, &dn);
+ f2fs_replace_block(sbi, &dn, src, dest,
+ ni.version, false, false);
recovered++;
}
- dn.ofs_in_node++;
}
- /* write node page in place */
- set_summary(&sum, dn.nid, 0, 0);
- if (IS_INODE(dn.node_page))
- sync_inode_page(&dn);
-
copy_node_footer(dn.node_page, page);
fill_node_footer(dn.node_page, dn.nid, ni.ino,
ofs_of_node(page), false);
set_page_dirty(dn.node_page);
err:
f2fs_put_dnode(&dn);
- f2fs_unlock_op(sbi);
out:
f2fs_msg(sbi->sb, KERN_NOTICE,
- "recover_data: ino = %lx, recovered = %d blocks, err = %d",
- inode->i_ino, recovered, err);
+ "recover_data: ino = %lx (i_size: %s) recovered = %d, err = %d",
+ inode->i_ino,
+ file_keep_isize(inode) ? "keep" : "recover",
+ recovered, err);
return err;
}
-static int recover_data(struct f2fs_sb_info *sbi,
- struct list_head *head, int type)
+static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
+ struct list_head *dir_list)
{
- unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
struct curseg_info *curseg;
struct page *page = NULL;
int err = 0;
block_t blkaddr;
/* get node pages in the current segment */
- curseg = CURSEG_I(sbi, type);
+ curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
while (1) {
struct fsync_inode_entry *entry;
- if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi))
+ if (!is_valid_blkaddr(sbi, blkaddr, META_POR))
break;
- page = get_meta_page_ra(sbi, blkaddr);
+ ra_meta_pages_cond(sbi, blkaddr);
- if (cp_ver != cpver_of_node(page)) {
+ page = get_tmp_page(sbi, blkaddr);
+
+ if (!is_recoverable_dnode(page)) {
f2fs_put_page(page, 1);
break;
}
- entry = get_fsync_inode(head, ino_of_node(page));
+ entry = get_fsync_inode(inode_list, ino_of_node(page));
if (!entry)
goto next;
/*
@@ -451,10 +524,10 @@ static int recover_data(struct f2fs_sb_info *sbi,
* In this case, we can lose the latest inode(x).
* So, call recover_inode for the inode update.
*/
- if (entry->last_inode == blkaddr)
+ if (IS_INODE(page))
recover_inode(entry->inode, page);
if (entry->last_dentry == blkaddr) {
- err = recover_dentry(entry->inode, page);
+ err = recover_dentry(entry->inode, page, dir_list);
if (err) {
f2fs_put_page(page, 1);
break;
@@ -466,11 +539,8 @@ static int recover_data(struct f2fs_sb_info *sbi,
break;
}
- if (entry->blkaddr == blkaddr) {
- iput(entry->inode);
- list_del(&entry->list);
- kmem_cache_free(fsync_entry_slab, entry);
- }
+ if (entry->blkaddr == blkaddr)
+ del_fsync_inode(entry);
next:
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
@@ -481,12 +551,12 @@ next:
return err;
}
-int recover_fsync_data(struct f2fs_sb_info *sbi)
+int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
{
- struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
struct list_head inode_list;
- block_t blkaddr;
+ struct list_head dir_list;
int err;
+ int ret = 0;
bool need_writecp = false;
fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
@@ -495,58 +565,54 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
return -ENOMEM;
INIT_LIST_HEAD(&inode_list);
-
- /* step #1: find fsynced inode numbers */
- sbi->por_doing = true;
+ INIT_LIST_HEAD(&dir_list);
/* prevent checkpoint */
mutex_lock(&sbi->cp_mutex);
- blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
-
- err = find_fsync_dnodes(sbi, &inode_list);
- if (err)
+ /* step #1: find fsynced inode numbers */
+ err = find_fsync_dnodes(sbi, &inode_list, check_only);
+ if (err || list_empty(&inode_list))
goto out;
- if (list_empty(&inode_list))
+ if (check_only) {
+ ret = 1;
goto out;
+ }
need_writecp = true;
/* step #2: recover data */
- err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
+ err = recover_data(sbi, &inode_list, &dir_list);
if (!err)
f2fs_bug_on(sbi, !list_empty(&inode_list));
out:
destroy_fsync_dnodes(&inode_list);
- kmem_cache_destroy(fsync_entry_slab);
/* truncate meta pages to be used by the recovery */
truncate_inode_pages_range(META_MAPPING(sbi),
- MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1);
+ (loff_t)MAIN_BLKADDR(sbi) << PAGE_SHIFT, -1);
if (err) {
- truncate_inode_pages_final(NODE_MAPPING(sbi));
- truncate_inode_pages_final(META_MAPPING(sbi));
+ truncate_inode_pages(NODE_MAPPING(sbi), 0);
+ truncate_inode_pages(META_MAPPING(sbi), 0);
}
- sbi->por_doing = false;
- if (err) {
- discard_next_dnode(sbi, blkaddr);
-
- /* Flush all the NAT/SIT pages */
- while (get_pages(sbi, F2FS_DIRTY_META))
- sync_meta_pages(sbi, META, LONG_MAX);
- set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
- mutex_unlock(&sbi->cp_mutex);
- } else if (need_writecp) {
+ clear_sbi_flag(sbi, SBI_POR_DOING);
+ if (err)
+ set_ckpt_flags(sbi, CP_ERROR_FLAG);
+ mutex_unlock(&sbi->cp_mutex);
+
+ /* let's drop all the directory inodes for clean checkpoint */
+ destroy_fsync_dnodes(&dir_list);
+
+ if (!err && need_writecp) {
struct cp_control cpc = {
- .reason = CP_SYNC,
+ .reason = CP_RECOVERY,
};
- mutex_unlock(&sbi->cp_mutex);
- write_checkpoint(sbi, &cpc);
- } else {
- mutex_unlock(&sbi->cp_mutex);
+ err = write_checkpoint(sbi, &cpc);
}
- return err;
+
+ kmem_cache_destroy(fsync_entry_slab);
+ return ret ? ret: err;
}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 3c31221affe6..1f1010bdfb97 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -14,20 +14,37 @@
#include <linux/blkdev.h>
#include <linux/prefetch.h>
#include <linux/kthread.h>
-#include <linux/vmalloc.h>
#include <linux/swap.h>
+#include <linux/timer.h>
#include "f2fs.h"
#include "segment.h"
#include "node.h"
+#include "trace.h"
#include <trace/events/f2fs.h>
#define __reverse_ffz(x) __reverse_ffs(~(x))
static struct kmem_cache *discard_entry_slab;
+static struct kmem_cache *discard_cmd_slab;
static struct kmem_cache *sit_entry_set_slab;
static struct kmem_cache *inmem_entry_slab;
+static unsigned long __reverse_ulong(unsigned char *str)
+{
+ unsigned long tmp = 0;
+ int shift = 24, idx = 0;
+
+#if BITS_PER_LONG == 64
+ shift = 56;
+#endif
+ while (shift >= 0) {
+ tmp |= (unsigned long)str[idx++] << shift;
+ shift -= BITS_PER_BYTE;
+ }
+ return tmp;
+}
+
/*
* __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since
* MSB and LSB are reversed in a byte by f2fs_set_bit.
@@ -37,27 +54,31 @@ static inline unsigned long __reverse_ffs(unsigned long word)
int num = 0;
#if BITS_PER_LONG == 64
- if ((word & 0xffffffff) == 0) {
+ if ((word & 0xffffffff00000000UL) == 0)
num += 32;
+ else
word >>= 32;
- }
#endif
- if ((word & 0xffff) == 0) {
+ if ((word & 0xffff0000) == 0)
num += 16;
+ else
word >>= 16;
- }
- if ((word & 0xff) == 0) {
+
+ if ((word & 0xff00) == 0)
num += 8;
+ else
word >>= 8;
- }
+
if ((word & 0xf0) == 0)
num += 4;
else
word >>= 4;
+
if ((word & 0xc) == 0)
num += 2;
else
word >>= 2;
+
if ((word & 0x2) == 0)
num += 1;
return num;
@@ -66,112 +87,83 @@ static inline unsigned long __reverse_ffs(unsigned long word)
/*
* __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because
* f2fs_set_bit makes MSB and LSB reversed in a byte.
+ * @size must be integral times of unsigned long.
* Example:
- * LSB <--> MSB
- * f2fs_set_bit(0, bitmap) => 0000 0001
- * f2fs_set_bit(7, bitmap) => 1000 0000
+ * MSB <--> LSB
+ * f2fs_set_bit(0, bitmap) => 1000 0000
+ * f2fs_set_bit(7, bitmap) => 0000 0001
*/
static unsigned long __find_rev_next_bit(const unsigned long *addr,
unsigned long size, unsigned long offset)
{
const unsigned long *p = addr + BIT_WORD(offset);
- unsigned long result = offset & ~(BITS_PER_LONG - 1);
+ unsigned long result = size;
unsigned long tmp;
- unsigned long mask, submask;
- unsigned long quot, rest;
if (offset >= size)
return size;
- size -= result;
+ size -= (offset & ~(BITS_PER_LONG - 1));
offset %= BITS_PER_LONG;
- if (!offset)
- goto aligned;
-
- tmp = *(p++);
- quot = (offset >> 3) << 3;
- rest = offset & 0x7;
- mask = ~0UL << quot;
- submask = (unsigned char)(0xff << rest) >> rest;
- submask <<= quot;
- mask &= submask;
- tmp &= mask;
- if (size < BITS_PER_LONG)
- goto found_first;
- if (tmp)
- goto found_middle;
-
- size -= BITS_PER_LONG;
- result += BITS_PER_LONG;
-aligned:
- while (size & ~(BITS_PER_LONG-1)) {
- tmp = *(p++);
+
+ while (1) {
+ if (*p == 0)
+ goto pass;
+
+ tmp = __reverse_ulong((unsigned char *)p);
+
+ tmp &= ~0UL >> offset;
+ if (size < BITS_PER_LONG)
+ tmp &= (~0UL << (BITS_PER_LONG - size));
if (tmp)
- goto found_middle;
- result += BITS_PER_LONG;
+ goto found;
+pass:
+ if (size <= BITS_PER_LONG)
+ break;
size -= BITS_PER_LONG;
+ offset = 0;
+ p++;
}
- if (!size)
- return result;
- tmp = *p;
-found_first:
- tmp &= (~0UL >> (BITS_PER_LONG - size));
- if (tmp == 0UL) /* Are any bits set? */
- return result + size; /* Nope. */
-found_middle:
- return result + __reverse_ffs(tmp);
+ return result;
+found:
+ return result - size + __reverse_ffs(tmp);
}
static unsigned long __find_rev_next_zero_bit(const unsigned long *addr,
unsigned long size, unsigned long offset)
{
const unsigned long *p = addr + BIT_WORD(offset);
- unsigned long result = offset & ~(BITS_PER_LONG - 1);
+ unsigned long result = size;
unsigned long tmp;
- unsigned long mask, submask;
- unsigned long quot, rest;
if (offset >= size)
return size;
- size -= result;
+ size -= (offset & ~(BITS_PER_LONG - 1));
offset %= BITS_PER_LONG;
- if (!offset)
- goto aligned;
-
- tmp = *(p++);
- quot = (offset >> 3) << 3;
- rest = offset & 0x7;
- mask = ~(~0UL << quot);
- submask = (unsigned char)~((unsigned char)(0xff << rest) >> rest);
- submask <<= quot;
- mask += submask;
- tmp |= mask;
- if (size < BITS_PER_LONG)
- goto found_first;
- if (~tmp)
- goto found_middle;
-
- size -= BITS_PER_LONG;
- result += BITS_PER_LONG;
-aligned:
- while (size & ~(BITS_PER_LONG - 1)) {
- tmp = *(p++);
- if (~tmp)
- goto found_middle;
- result += BITS_PER_LONG;
+
+ while (1) {
+ if (*p == ~0UL)
+ goto pass;
+
+ tmp = __reverse_ulong((unsigned char *)p);
+
+ if (offset)
+ tmp |= ~0UL << (BITS_PER_LONG - offset);
+ if (size < BITS_PER_LONG)
+ tmp |= ~0UL >> size;
+ if (tmp != ~0UL)
+ goto found;
+pass:
+ if (size <= BITS_PER_LONG)
+ break;
size -= BITS_PER_LONG;
+ offset = 0;
+ p++;
}
- if (!size)
- return result;
- tmp = *p;
-
-found_first:
- tmp |= ~0UL << size;
- if (tmp == ~0UL) /* Are any bits zero? */
- return result + size; /* Nope. */
-found_middle:
- return result + __reverse_ffz(tmp);
+ return result;
+found:
+ return result - size + __reverse_ffz(tmp);
}
void register_inmem_page(struct inode *inode, struct page *page)
@@ -179,6 +171,11 @@ void register_inmem_page(struct inode *inode, struct page *page)
struct f2fs_inode_info *fi = F2FS_I(inode);
struct inmem_pages *new;
+ f2fs_trace_pid(page);
+
+ set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE);
+ SetPagePrivate(page);
+
new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS);
/* add atomic page indices to the list */
@@ -189,95 +186,316 @@ void register_inmem_page(struct inode *inode, struct page *page)
mutex_lock(&fi->inmem_lock);
get_page(page);
list_add_tail(&new->list, &fi->inmem_pages);
+ inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
+ mutex_unlock(&fi->inmem_lock);
+
+ trace_f2fs_register_inmem_page(page, INMEM);
+}
+
+static int __revoke_inmem_pages(struct inode *inode,
+ struct list_head *head, bool drop, bool recover)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct inmem_pages *cur, *tmp;
+ int err = 0;
+
+ list_for_each_entry_safe(cur, tmp, head, list) {
+ struct page *page = cur->page;
+
+ if (drop)
+ trace_f2fs_commit_inmem_page(page, INMEM_DROP);
+
+ lock_page(page);
+
+ if (recover) {
+ struct dnode_of_data dn;
+ struct node_info ni;
+
+ trace_f2fs_commit_inmem_page(page, INMEM_REVOKE);
+
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ if (get_dnode_of_data(&dn, page->index, LOOKUP_NODE)) {
+ err = -EAGAIN;
+ goto next;
+ }
+ get_node_info(sbi, dn.nid, &ni);
+ f2fs_replace_block(sbi, &dn, dn.data_blkaddr,
+ cur->old_addr, ni.version, true, true);
+ f2fs_put_dnode(&dn);
+ }
+next:
+ /* we don't need to invalidate this in the sccessful status */
+ if (drop || recover)
+ ClearPageUptodate(page);
+ set_page_private(page, 0);
+ ClearPagePrivate(page);
+ f2fs_put_page(page, 1);
+
+ list_del(&cur->list);
+ kmem_cache_free(inmem_entry_slab, cur);
+ dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
+ }
+ return err;
+}
+
+void drop_inmem_pages(struct inode *inode)
+{
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+
+ mutex_lock(&fi->inmem_lock);
+ __revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
+ mutex_unlock(&fi->inmem_lock);
+
+ clear_inode_flag(inode, FI_ATOMIC_FILE);
+ stat_dec_atomic_write(inode);
+}
+
+void drop_inmem_page(struct inode *inode, struct page *page)
+{
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct list_head *head = &fi->inmem_pages;
+ struct inmem_pages *cur = NULL;
+
+ f2fs_bug_on(sbi, !IS_ATOMIC_WRITTEN_PAGE(page));
+
+ mutex_lock(&fi->inmem_lock);
+ list_for_each_entry(cur, head, list) {
+ if (cur->page == page)
+ break;
+ }
+
+ f2fs_bug_on(sbi, !cur || cur->page != page);
+ list_del(&cur->list);
mutex_unlock(&fi->inmem_lock);
+
+ dec_page_count(sbi, F2FS_INMEM_PAGES);
+ kmem_cache_free(inmem_entry_slab, cur);
+
+ ClearPageUptodate(page);
+ set_page_private(page, 0);
+ ClearPagePrivate(page);
+ f2fs_put_page(page, 0);
+
+ trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE);
}
-void commit_inmem_pages(struct inode *inode, bool abort)
+static int __commit_inmem_pages(struct inode *inode,
+ struct list_head *revoke_list)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode_info *fi = F2FS_I(inode);
struct inmem_pages *cur, *tmp;
- bool submit_bio = false;
struct f2fs_io_info fio = {
+ .sbi = sbi,
.type = DATA,
- .rw = WRITE_SYNC,
+ .op = REQ_OP_WRITE,
+ .op_flags = REQ_SYNC | REQ_PRIO,
};
+ pgoff_t last_idx = ULONG_MAX;
+ int err = 0;
- f2fs_balance_fs(sbi);
- f2fs_lock_op(sbi);
-
- mutex_lock(&fi->inmem_lock);
list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) {
- lock_page(cur->page);
- if (!abort && cur->page->mapping == inode->i_mapping) {
- f2fs_wait_on_page_writeback(cur->page, DATA);
- if (clear_page_dirty_for_io(cur->page))
+ struct page *page = cur->page;
+
+ lock_page(page);
+ if (page->mapping == inode->i_mapping) {
+ trace_f2fs_commit_inmem_page(page, INMEM);
+
+ set_page_dirty(page);
+ f2fs_wait_on_page_writeback(page, DATA, true);
+ if (clear_page_dirty_for_io(page)) {
inode_dec_dirty_pages(inode);
- do_write_data_page(cur->page, &fio);
- submit_bio = true;
+ remove_dirty_inode(inode);
+ }
+
+ fio.page = page;
+ fio.old_blkaddr = NULL_ADDR;
+ fio.encrypted_page = NULL;
+ fio.need_lock = false,
+ err = do_write_data_page(&fio);
+ if (err) {
+ unlock_page(page);
+ break;
+ }
+
+ /* record old blkaddr for revoking */
+ cur->old_addr = fio.old_blkaddr;
+ last_idx = page->index;
}
- f2fs_put_page(cur->page, 1);
- list_del(&cur->list);
- kmem_cache_free(inmem_entry_slab, cur);
+ unlock_page(page);
+ list_move_tail(&cur->list, revoke_list);
+ }
+
+ if (last_idx != ULONG_MAX)
+ f2fs_submit_merged_bio_cond(sbi, inode, 0, last_idx,
+ DATA, WRITE);
+
+ if (!err)
+ __revoke_inmem_pages(inode, revoke_list, false, false);
+
+ return err;
+}
+
+int commit_inmem_pages(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ struct list_head revoke_list;
+ int err;
+
+ INIT_LIST_HEAD(&revoke_list);
+ f2fs_balance_fs(sbi, true);
+ f2fs_lock_op(sbi);
+
+ set_inode_flag(inode, FI_ATOMIC_COMMIT);
+
+ mutex_lock(&fi->inmem_lock);
+ err = __commit_inmem_pages(inode, &revoke_list);
+ if (err) {
+ int ret;
+ /*
+ * try to revoke all committed pages, but still we could fail
+ * due to no memory or other reason, if that happened, EAGAIN
+ * will be returned, which means in such case, transaction is
+ * already not integrity, caller should use journal to do the
+ * recovery or rewrite & commit last transaction. For other
+ * error number, revoking was done by filesystem itself.
+ */
+ ret = __revoke_inmem_pages(inode, &revoke_list, false, true);
+ if (ret)
+ err = ret;
+
+ /* drop all uncommitted pages */
+ __revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
}
- if (submit_bio)
- f2fs_submit_merged_bio(sbi, DATA, WRITE);
mutex_unlock(&fi->inmem_lock);
- filemap_fdatawait_range(inode->i_mapping, 0, LLONG_MAX);
+ clear_inode_flag(inode, FI_ATOMIC_COMMIT);
+
f2fs_unlock_op(sbi);
+ return err;
}
/*
* This function balances dirty node and dentry pages.
* In addition, it controls garbage collection.
*/
-void f2fs_balance_fs(struct f2fs_sb_info *sbi)
+void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
{
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (time_to_inject(sbi, FAULT_CHECKPOINT)) {
+ f2fs_show_injection_info(FAULT_CHECKPOINT);
+ f2fs_stop_checkpoint(sbi, false);
+ }
+#endif
+
+ /* balance_fs_bg is able to be pending */
+ if (need && excess_cached_nats(sbi))
+ f2fs_balance_fs_bg(sbi);
+
/*
* We should do GC or end up with checkpoint, if there are so many dirty
* dir/node pages without enough free segments.
*/
- if (has_not_enough_free_secs(sbi, 0)) {
+ if (has_not_enough_free_secs(sbi, 0, 0)) {
mutex_lock(&sbi->gc_mutex);
- f2fs_gc(sbi);
+ f2fs_gc(sbi, false, false, NULL_SEGNO);
}
}
void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
{
- /* check the # of cached NAT entries and prefree segments */
- if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK) ||
- excess_prefree_segs(sbi))
+ /* try to shrink extent cache when there is no enough memory */
+ if (!available_free_memory(sbi, EXTENT_CACHE))
+ f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER);
+
+ /* check the # of cached NAT entries */
+ if (!available_free_memory(sbi, NAT_ENTRIES))
+ try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK);
+
+ if (!available_free_memory(sbi, FREE_NIDS))
+ try_to_free_nids(sbi, MAX_FREE_NIDS);
+ else
+ build_free_nids(sbi, false, false);
+
+ if (!is_idle(sbi) && !excess_dirty_nats(sbi))
+ return;
+
+ /* checkpoint is the only way to shrink partial cached entries */
+ if (!available_free_memory(sbi, NAT_ENTRIES) ||
+ !available_free_memory(sbi, INO_ENTRIES) ||
+ excess_prefree_segs(sbi) ||
+ excess_dirty_nats(sbi) ||
+ f2fs_time_over(sbi, CP_TIME)) {
+ if (test_opt(sbi, DATA_FLUSH)) {
+ struct blk_plug plug;
+
+ blk_start_plug(&plug);
+ sync_dirty_inodes(sbi, FILE_INODE);
+ blk_finish_plug(&plug);
+ }
f2fs_sync_fs(sbi->sb, true);
+ stat_inc_bg_cp_count(sbi->stat_info);
+ }
+}
+
+static int __submit_flush_wait(struct f2fs_sb_info *sbi,
+ struct block_device *bdev)
+{
+ struct bio *bio = f2fs_bio_alloc(0);
+ int ret;
+
+ bio->bi_rw = REQ_OP_WRITE;
+ bio->bi_bdev = bdev;
+ ret = submit_bio_wait(WRITE_FLUSH, bio);
+ bio_put(bio);
+
+ trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER),
+ test_opt(sbi, FLUSH_MERGE), ret);
+ return ret;
+}
+
+static int submit_flush_wait(struct f2fs_sb_info *sbi)
+{
+ int ret = __submit_flush_wait(sbi, sbi->sb->s_bdev);
+ int i;
+
+ if (!sbi->s_ndevs || ret)
+ return ret;
+
+ for (i = 1; i < sbi->s_ndevs; i++) {
+ ret = __submit_flush_wait(sbi, FDEV(i).bdev);
+ if (ret)
+ break;
+ }
+ return ret;
}
static int issue_flush_thread(void *data)
{
struct f2fs_sb_info *sbi = data;
- struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info;
+ struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info;
wait_queue_head_t *q = &fcc->flush_wait_queue;
repeat:
if (kthread_should_stop())
return 0;
if (!llist_empty(&fcc->issue_list)) {
- struct bio *bio = bio_alloc(GFP_NOIO, 0);
struct flush_cmd *cmd, *next;
int ret;
fcc->dispatch_list = llist_del_all(&fcc->issue_list);
fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list);
- bio->bi_bdev = sbi->sb->s_bdev;
- ret = submit_bio_wait(WRITE_FLUSH, bio);
+ ret = submit_flush_wait(sbi);
+ atomic_inc(&fcc->issued_flush);
llist_for_each_entry_safe(cmd, next,
fcc->dispatch_list, llnode) {
cmd->ret = ret;
complete(&cmd->wait);
}
- bio_put(bio);
fcc->dispatch_list = NULL;
}
@@ -288,26 +506,43 @@ repeat:
int f2fs_issue_flush(struct f2fs_sb_info *sbi)
{
- struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info;
+ struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info;
struct flush_cmd cmd;
-
- trace_f2fs_issue_flush(sbi->sb, test_opt(sbi, NOBARRIER),
- test_opt(sbi, FLUSH_MERGE));
+ int ret;
if (test_opt(sbi, NOBARRIER))
return 0;
- if (!test_opt(sbi, FLUSH_MERGE))
- return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL);
+ if (!test_opt(sbi, FLUSH_MERGE)) {
+ ret = submit_flush_wait(sbi);
+ atomic_inc(&fcc->issued_flush);
+ return ret;
+ }
+
+ if (!atomic_read(&fcc->issing_flush)) {
+ atomic_inc(&fcc->issing_flush);
+ ret = submit_flush_wait(sbi);
+ atomic_dec(&fcc->issing_flush);
+
+ atomic_inc(&fcc->issued_flush);
+ return ret;
+ }
init_completion(&cmd.wait);
+ atomic_inc(&fcc->issing_flush);
llist_add(&cmd.llnode, &fcc->issue_list);
if (!fcc->dispatch_list)
wake_up(&fcc->flush_wait_queue);
- wait_for_completion(&cmd.wait);
+ if (fcc->f2fs_issue_flush) {
+ wait_for_completion(&cmd.wait);
+ atomic_dec(&fcc->issing_flush);
+ } else {
+ llist_del_all(&fcc->issue_list);
+ atomic_set(&fcc->issing_flush, 0);
+ }
return cmd.ret;
}
@@ -318,32 +553,46 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi)
struct flush_cmd_control *fcc;
int err = 0;
+ if (SM_I(sbi)->fcc_info) {
+ fcc = SM_I(sbi)->fcc_info;
+ goto init_thread;
+ }
+
fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL);
if (!fcc)
return -ENOMEM;
+ atomic_set(&fcc->issued_flush, 0);
+ atomic_set(&fcc->issing_flush, 0);
init_waitqueue_head(&fcc->flush_wait_queue);
init_llist_head(&fcc->issue_list);
- SM_I(sbi)->cmd_control_info = fcc;
+ SM_I(sbi)->fcc_info = fcc;
+init_thread:
fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi,
"f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev));
if (IS_ERR(fcc->f2fs_issue_flush)) {
err = PTR_ERR(fcc->f2fs_issue_flush);
kfree(fcc);
- SM_I(sbi)->cmd_control_info = NULL;
+ SM_I(sbi)->fcc_info = NULL;
return err;
}
return err;
}
-void destroy_flush_cmd_control(struct f2fs_sb_info *sbi)
+void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free)
{
- struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info;
+ struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info;
+
+ if (fcc && fcc->f2fs_issue_flush) {
+ struct task_struct *flush_thread = fcc->f2fs_issue_flush;
- if (fcc && fcc->f2fs_issue_flush)
- kthread_stop(fcc->f2fs_issue_flush);
- kfree(fcc);
- SM_I(sbi)->cmd_control_info = NULL;
+ fcc->f2fs_issue_flush = NULL;
+ kthread_stop(flush_thread);
+ }
+ if (free) {
+ kfree(fcc);
+ SM_I(sbi)->fcc_info = NULL;
+ }
}
static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
@@ -386,8 +635,8 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t]))
dirty_i->nr_dirty[t]--;
- if (get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0)
- clear_bit(GET_SECNO(sbi, segno),
+ if (get_valid_blocks(sbi, segno, true) == 0)
+ clear_bit(GET_SEC_FROM_SEG(sbi, segno),
dirty_i->victim_secmap);
}
}
@@ -407,7 +656,7 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
mutex_lock(&dirty_i->seglist_lock);
- valid_blocks = get_valid_blocks(sbi, segno, 0);
+ valid_blocks = get_valid_blocks(sbi, segno, false);
if (valid_blocks == 0) {
__locate_dirty_segment(sbi, segno, PRE);
@@ -422,99 +671,649 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
mutex_unlock(&dirty_i->seglist_lock);
}
-static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
- block_t blkstart, block_t blklen)
+static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi,
+ struct block_device *bdev, block_t lstart,
+ block_t start, block_t len)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct list_head *pend_list;
+ struct discard_cmd *dc;
+
+ f2fs_bug_on(sbi, !len);
+
+ pend_list = &dcc->pend_list[plist_idx(len)];
+
+ dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS);
+ INIT_LIST_HEAD(&dc->list);
+ dc->bdev = bdev;
+ dc->lstart = lstart;
+ dc->start = start;
+ dc->len = len;
+ dc->ref = 0;
+ dc->state = D_PREP;
+ dc->error = 0;
+ init_completion(&dc->wait);
+ list_add_tail(&dc->list, pend_list);
+ atomic_inc(&dcc->discard_cmd_cnt);
+ dcc->undiscard_blks += len;
+
+ return dc;
+}
+
+static struct discard_cmd *__attach_discard_cmd(struct f2fs_sb_info *sbi,
+ struct block_device *bdev, block_t lstart,
+ block_t start, block_t len,
+ struct rb_node *parent, struct rb_node **p)
{
- sector_t start = SECTOR_FROM_BLOCK(blkstart);
- sector_t len = SECTOR_FROM_BLOCK(blklen);
- trace_f2fs_issue_discard(sbi->sb, blkstart, blklen);
- return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0);
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct discard_cmd *dc;
+
+ dc = __create_discard_cmd(sbi, bdev, lstart, start, len);
+
+ rb_link_node(&dc->rb_node, parent, p);
+ rb_insert_color(&dc->rb_node, &dcc->root);
+
+ return dc;
}
-void discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr)
+static void __detach_discard_cmd(struct discard_cmd_control *dcc,
+ struct discard_cmd *dc)
{
- if (f2fs_issue_discard(sbi, blkaddr, 1)) {
- struct page *page = grab_meta_page(sbi, blkaddr);
- /* zero-filled page */
- set_page_dirty(page);
- f2fs_put_page(page, 1);
+ if (dc->state == D_DONE)
+ atomic_dec(&dcc->issing_discard);
+
+ list_del(&dc->list);
+ rb_erase(&dc->rb_node, &dcc->root);
+ dcc->undiscard_blks -= dc->len;
+
+ kmem_cache_free(discard_cmd_slab, dc);
+
+ atomic_dec(&dcc->discard_cmd_cnt);
+}
+
+static void __remove_discard_cmd(struct f2fs_sb_info *sbi,
+ struct discard_cmd *dc)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+
+ if (dc->error == -EOPNOTSUPP)
+ dc->error = 0;
+
+ if (dc->error)
+ f2fs_msg(sbi->sb, KERN_INFO,
+ "Issue discard failed, ret: %d", dc->error);
+ __detach_discard_cmd(dcc, dc);
+}
+
+static void f2fs_submit_discard_endio(struct bio *bio, int err)
+{
+ struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private;
+
+ dc->error = err;
+ dc->state = D_DONE;
+ complete(&dc->wait);
+ bio_put(bio);
+}
+
+/* copied from block/blk-lib.c in 4.10-rc1 */
+static int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
+ sector_t nr_sects, gfp_t gfp_mask, int flags,
+ struct bio **biop)
+{
+ struct request_queue *q = bdev_get_queue(bdev);
+ struct bio *bio = *biop;
+ unsigned int granularity;
+ int op = REQ_WRITE | REQ_DISCARD;
+ int alignment;
+ sector_t bs_mask;
+
+ if (!q)
+ return -ENXIO;
+
+ if (!blk_queue_discard(q))
+ return -EOPNOTSUPP;
+
+ if (flags & BLKDEV_DISCARD_SECURE) {
+ if (!blk_queue_secdiscard(q))
+ return -EOPNOTSUPP;
+ op |= REQ_SECURE;
+ }
+
+ bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
+ if ((sector | nr_sects) & bs_mask)
+ return -EINVAL;
+
+ /* Zero-sector (unknown) and one-sector granularities are the same. */
+ granularity = max(q->limits.discard_granularity >> 9, 1U);
+ alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;
+
+ while (nr_sects) {
+ unsigned int req_sects;
+ sector_t end_sect, tmp;
+
+ /* Make sure bi_size doesn't overflow */
+ req_sects = min_t(sector_t, nr_sects, UINT_MAX >> 9);
+
+ /**
+ * If splitting a request, and the next starting sector would be
+ * misaligned, stop the discard at the previous aligned sector.
+ */
+ end_sect = sector + req_sects;
+ tmp = end_sect;
+ if (req_sects < nr_sects &&
+ sector_div(tmp, granularity) != alignment) {
+ end_sect = end_sect - alignment;
+ sector_div(end_sect, granularity);
+ end_sect = end_sect * granularity + alignment;
+ req_sects = end_sect - sector;
+ }
+
+ if (bio) {
+ int ret = submit_bio_wait(0, bio);
+ bio_put(bio);
+ if (ret)
+ return ret;
+ }
+ bio = f2fs_bio_alloc(1);
+ bio->bi_iter.bi_sector = sector;
+ bio->bi_bdev = bdev;
+ bio_set_op_attrs(bio, op, 0);
+
+ bio->bi_iter.bi_size = req_sects << 9;
+ nr_sects -= req_sects;
+ sector = end_sect;
+
+ /*
+ * We can loop for a long time in here, if someone does
+ * full device discards (like mkfs). Be nice and allow
+ * us to schedule out to avoid softlocking if preempt
+ * is disabled.
+ */
+ cond_resched();
+ }
+
+ *biop = bio;
+ return 0;
+}
+
+static void __submit_discard_cmd(struct f2fs_sb_info *sbi,
+ struct discard_cmd *dc)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct bio *bio = NULL;
+
+ if (dc->state != D_PREP)
+ return;
+
+ trace_f2fs_issue_discard(dc->bdev, dc->start, dc->len);
+
+ dc->error = __blkdev_issue_discard(dc->bdev,
+ SECTOR_FROM_BLOCK(dc->start),
+ SECTOR_FROM_BLOCK(dc->len),
+ GFP_NOFS, 0, &bio);
+ if (!dc->error) {
+ /* should keep before submission to avoid D_DONE right away */
+ dc->state = D_SUBMIT;
+ atomic_inc(&dcc->issued_discard);
+ atomic_inc(&dcc->issing_discard);
+ if (bio) {
+ bio->bi_private = dc;
+ bio->bi_end_io = f2fs_submit_discard_endio;
+ submit_bio(REQ_SYNC, bio);
+ list_move_tail(&dc->list, &dcc->wait_list);
+ }
+ } else {
+ __remove_discard_cmd(sbi, dc);
+ }
+}
+
+static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi,
+ struct block_device *bdev, block_t lstart,
+ block_t start, block_t len,
+ struct rb_node **insert_p,
+ struct rb_node *insert_parent)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct rb_node **p = &dcc->root.rb_node;
+ struct rb_node *parent = NULL;
+ struct discard_cmd *dc = NULL;
+
+ if (insert_p && insert_parent) {
+ parent = insert_parent;
+ p = insert_p;
+ goto do_insert;
+ }
+
+ p = __lookup_rb_tree_for_insert(sbi, &dcc->root, &parent, lstart);
+do_insert:
+ dc = __attach_discard_cmd(sbi, bdev, lstart, start, len, parent, p);
+ if (!dc)
+ return NULL;
+
+ return dc;
+}
+
+static void __relocate_discard_cmd(struct discard_cmd_control *dcc,
+ struct discard_cmd *dc)
+{
+ list_move_tail(&dc->list, &dcc->pend_list[plist_idx(dc->len)]);
+}
+
+static void __punch_discard_cmd(struct f2fs_sb_info *sbi,
+ struct discard_cmd *dc, block_t blkaddr)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct discard_info di = dc->di;
+ bool modified = false;
+
+ if (dc->state == D_DONE || dc->len == 1) {
+ __remove_discard_cmd(sbi, dc);
+ return;
+ }
+
+ dcc->undiscard_blks -= di.len;
+
+ if (blkaddr > di.lstart) {
+ dc->len = blkaddr - dc->lstart;
+ dcc->undiscard_blks += dc->len;
+ __relocate_discard_cmd(dcc, dc);
+ f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root));
+ modified = true;
+ }
+
+ if (blkaddr < di.lstart + di.len - 1) {
+ if (modified) {
+ __insert_discard_tree(sbi, dc->bdev, blkaddr + 1,
+ di.start + blkaddr + 1 - di.lstart,
+ di.lstart + di.len - 1 - blkaddr,
+ NULL, NULL);
+ f2fs_bug_on(sbi,
+ !__check_rb_tree_consistence(sbi, &dcc->root));
+ } else {
+ dc->lstart++;
+ dc->len--;
+ dc->start++;
+ dcc->undiscard_blks += dc->len;
+ __relocate_discard_cmd(dcc, dc);
+ f2fs_bug_on(sbi,
+ !__check_rb_tree_consistence(sbi, &dcc->root));
+ }
+ }
+}
+
+static void __update_discard_tree_range(struct f2fs_sb_info *sbi,
+ struct block_device *bdev, block_t lstart,
+ block_t start, block_t len)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct discard_cmd *prev_dc = NULL, *next_dc = NULL;
+ struct discard_cmd *dc;
+ struct discard_info di = {0};
+ struct rb_node **insert_p = NULL, *insert_parent = NULL;
+ block_t end = lstart + len;
+
+ mutex_lock(&dcc->cmd_lock);
+
+ dc = (struct discard_cmd *)__lookup_rb_tree_ret(&dcc->root,
+ NULL, lstart,
+ (struct rb_entry **)&prev_dc,
+ (struct rb_entry **)&next_dc,
+ &insert_p, &insert_parent, true);
+ if (dc)
+ prev_dc = dc;
+
+ if (!prev_dc) {
+ di.lstart = lstart;
+ di.len = next_dc ? next_dc->lstart - lstart : len;
+ di.len = min(di.len, len);
+ di.start = start;
+ }
+
+ while (1) {
+ struct rb_node *node;
+ bool merged = false;
+ struct discard_cmd *tdc = NULL;
+
+ if (prev_dc) {
+ di.lstart = prev_dc->lstart + prev_dc->len;
+ if (di.lstart < lstart)
+ di.lstart = lstart;
+ if (di.lstart >= end)
+ break;
+
+ if (!next_dc || next_dc->lstart > end)
+ di.len = end - di.lstart;
+ else
+ di.len = next_dc->lstart - di.lstart;
+ di.start = start + di.lstart - lstart;
+ }
+
+ if (!di.len)
+ goto next;
+
+ if (prev_dc && prev_dc->state == D_PREP &&
+ prev_dc->bdev == bdev &&
+ __is_discard_back_mergeable(&di, &prev_dc->di)) {
+ prev_dc->di.len += di.len;
+ dcc->undiscard_blks += di.len;
+ __relocate_discard_cmd(dcc, prev_dc);
+ f2fs_bug_on(sbi,
+ !__check_rb_tree_consistence(sbi, &dcc->root));
+ di = prev_dc->di;
+ tdc = prev_dc;
+ merged = true;
+ }
+
+ if (next_dc && next_dc->state == D_PREP &&
+ next_dc->bdev == bdev &&
+ __is_discard_front_mergeable(&di, &next_dc->di)) {
+ next_dc->di.lstart = di.lstart;
+ next_dc->di.len += di.len;
+ next_dc->di.start = di.start;
+ dcc->undiscard_blks += di.len;
+ __relocate_discard_cmd(dcc, next_dc);
+ if (tdc)
+ __remove_discard_cmd(sbi, tdc);
+ f2fs_bug_on(sbi,
+ !__check_rb_tree_consistence(sbi, &dcc->root));
+ merged = true;
+ }
+
+ if (!merged) {
+ __insert_discard_tree(sbi, bdev, di.lstart, di.start,
+ di.len, NULL, NULL);
+ f2fs_bug_on(sbi,
+ !__check_rb_tree_consistence(sbi, &dcc->root));
+ }
+ next:
+ prev_dc = next_dc;
+ if (!prev_dc)
+ break;
+
+ node = rb_next(&prev_dc->rb_node);
+ next_dc = rb_entry_safe(node, struct discard_cmd, rb_node);
}
+
+ mutex_unlock(&dcc->cmd_lock);
}
-static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
+static int __queue_discard_cmd(struct f2fs_sb_info *sbi,
+ struct block_device *bdev, block_t blkstart, block_t blklen)
+{
+ block_t lblkstart = blkstart;
+
+ trace_f2fs_queue_discard(bdev, blkstart, blklen);
+
+ if (sbi->s_ndevs) {
+ int devi = f2fs_target_device_index(sbi, blkstart);
+
+ blkstart -= FDEV(devi).start_blk;
+ }
+ __update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen);
+ return 0;
+}
+
+static void __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct list_head *pend_list;
+ struct discard_cmd *dc, *tmp;
+ struct blk_plug plug;
+ int i, iter = 0;
+
+ mutex_lock(&dcc->cmd_lock);
+ blk_start_plug(&plug);
+ for (i = MAX_PLIST_NUM - 1; i >= 0; i--) {
+ pend_list = &dcc->pend_list[i];
+ list_for_each_entry_safe(dc, tmp, pend_list, list) {
+ f2fs_bug_on(sbi, dc->state != D_PREP);
+
+ if (!issue_cond || is_idle(sbi))
+ __submit_discard_cmd(sbi, dc);
+ if (issue_cond && iter++ > DISCARD_ISSUE_RATE)
+ goto out;
+ }
+ }
+out:
+ blk_finish_plug(&plug);
+ mutex_unlock(&dcc->cmd_lock);
+}
+
+static void __wait_discard_cmd(struct f2fs_sb_info *sbi, bool wait_cond)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct list_head *wait_list = &(dcc->wait_list);
+ struct discard_cmd *dc, *tmp;
+
+ mutex_lock(&dcc->cmd_lock);
+ list_for_each_entry_safe(dc, tmp, wait_list, list) {
+ if (!wait_cond || dc->state == D_DONE) {
+ if (dc->ref)
+ continue;
+ wait_for_completion_io(&dc->wait);
+ __remove_discard_cmd(sbi, dc);
+ }
+ }
+ mutex_unlock(&dcc->cmd_lock);
+}
+
+/* This should be covered by global mutex, &sit_i->sentry_lock */
+void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct discard_cmd *dc;
+ bool need_wait = false;
+
+ mutex_lock(&dcc->cmd_lock);
+ dc = (struct discard_cmd *)__lookup_rb_tree(&dcc->root, NULL, blkaddr);
+ if (dc) {
+ if (dc->state == D_PREP) {
+ __punch_discard_cmd(sbi, dc, blkaddr);
+ } else {
+ dc->ref++;
+ need_wait = true;
+ }
+ }
+ mutex_unlock(&dcc->cmd_lock);
+
+ if (need_wait) {
+ wait_for_completion_io(&dc->wait);
+ mutex_lock(&dcc->cmd_lock);
+ f2fs_bug_on(sbi, dc->state != D_DONE);
+ dc->ref--;
+ if (!dc->ref)
+ __remove_discard_cmd(sbi, dc);
+ mutex_unlock(&dcc->cmd_lock);
+ }
+}
+
+/* This comes from f2fs_put_super */
+void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi)
+{
+ __issue_discard_cmd(sbi, false);
+ __wait_discard_cmd(sbi, false);
+}
+
+static int issue_discard_thread(void *data)
+{
+ struct f2fs_sb_info *sbi = data;
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ wait_queue_head_t *q = &dcc->discard_wait_queue;
+repeat:
+ if (kthread_should_stop())
+ return 0;
+
+ __issue_discard_cmd(sbi, true);
+ __wait_discard_cmd(sbi, true);
+
+ congestion_wait(BLK_RW_SYNC, HZ/50);
+
+ wait_event_interruptible(*q, kthread_should_stop() ||
+ atomic_read(&dcc->discard_cmd_cnt));
+ goto repeat;
+}
+
+#ifdef CONFIG_BLK_DEV_ZONED
+static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
+ struct block_device *bdev, block_t blkstart, block_t blklen)
+{
+ sector_t sector, nr_sects;
+ block_t lblkstart = blkstart;
+ int devi = 0;
+
+ if (sbi->s_ndevs) {
+ devi = f2fs_target_device_index(sbi, blkstart);
+ blkstart -= FDEV(devi).start_blk;
+ }
+
+ /*
+ * We need to know the type of the zone: for conventional zones,
+ * use regular discard if the drive supports it. For sequential
+ * zones, reset the zone write pointer.
+ */
+ switch (get_blkz_type(sbi, bdev, blkstart)) {
+
+ case BLK_ZONE_TYPE_CONVENTIONAL:
+ if (!blk_queue_discard(bdev_get_queue(bdev)))
+ return 0;
+ return __queue_discard_cmd(sbi, bdev, lblkstart, blklen);
+ case BLK_ZONE_TYPE_SEQWRITE_REQ:
+ case BLK_ZONE_TYPE_SEQWRITE_PREF:
+ sector = SECTOR_FROM_BLOCK(blkstart);
+ nr_sects = SECTOR_FROM_BLOCK(blklen);
+
+ if (sector & (bdev_zone_size(bdev) - 1) ||
+ nr_sects != bdev_zone_size(bdev)) {
+ f2fs_msg(sbi->sb, KERN_INFO,
+ "(%d) %s: Unaligned discard attempted (block %x + %x)",
+ devi, sbi->s_ndevs ? FDEV(devi).path: "",
+ blkstart, blklen);
+ return -EIO;
+ }
+ trace_f2fs_issue_reset_zone(bdev, blkstart);
+ return blkdev_reset_zones(bdev, sector,
+ nr_sects, GFP_NOFS);
+ default:
+ /* Unknown zone type: broken device ? */
+ return -EIO;
+ }
+}
+#endif
+
+static int __issue_discard_async(struct f2fs_sb_info *sbi,
+ struct block_device *bdev, block_t blkstart, block_t blklen)
+{
+#ifdef CONFIG_BLK_DEV_ZONED
+ if (f2fs_sb_mounted_blkzoned(sbi->sb) &&
+ bdev_zoned_model(bdev) != BLK_ZONED_NONE)
+ return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen);
+#endif
+ return __queue_discard_cmd(sbi, bdev, blkstart, blklen);
+}
+
+static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
+ block_t blkstart, block_t blklen)
+{
+ sector_t start = blkstart, len = 0;
+ struct block_device *bdev;
+ struct seg_entry *se;
+ unsigned int offset;
+ block_t i;
+ int err = 0;
+
+ bdev = f2fs_target_device(sbi, blkstart, NULL);
+
+ for (i = blkstart; i < blkstart + blklen; i++, len++) {
+ if (i != start) {
+ struct block_device *bdev2 =
+ f2fs_target_device(sbi, i, NULL);
+
+ if (bdev2 != bdev) {
+ err = __issue_discard_async(sbi, bdev,
+ start, len);
+ if (err)
+ return err;
+ bdev = bdev2;
+ start = i;
+ len = 0;
+ }
+ }
+
+ se = get_seg_entry(sbi, GET_SEGNO(sbi, i));
+ offset = GET_BLKOFF_FROM_SEG0(sbi, i);
+
+ if (!f2fs_test_and_set_bit(offset, se->discard_map))
+ sbi->discard_blks--;
+ }
+
+ if (len)
+ err = __issue_discard_async(sbi, bdev, start, len);
+ return err;
+}
+
+static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc,
+ bool check_only)
{
- struct list_head *head = &SM_I(sbi)->discard_list;
- struct discard_entry *new;
int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
int max_blocks = sbi->blocks_per_seg;
struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start);
unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
- unsigned long dmap[entries];
+ unsigned long *discard_map = (unsigned long *)se->discard_map;
+ unsigned long *dmap = SIT_I(sbi)->tmp_map;
unsigned int start = 0, end = -1;
- bool force = (cpc->reason == CP_DISCARD);
+ bool force = (cpc->reason & CP_DISCARD);
+ struct discard_entry *de = NULL;
+ struct list_head *head = &SM_I(sbi)->dcc_info->entry_list;
int i;
- if (!force && !test_opt(sbi, DISCARD))
- return;
-
- if (force && !se->valid_blocks) {
- struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
- /*
- * if this segment is registered in the prefree list, then
- * we should skip adding a discard candidate, and let the
- * checkpoint do that later.
- */
- mutex_lock(&dirty_i->seglist_lock);
- if (test_bit(cpc->trim_start, dirty_i->dirty_segmap[PRE])) {
- mutex_unlock(&dirty_i->seglist_lock);
- cpc->trimmed += sbi->blocks_per_seg;
- return;
- }
- mutex_unlock(&dirty_i->seglist_lock);
+ if (se->valid_blocks == max_blocks || !f2fs_discard_en(sbi))
+ return false;
- new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS);
- INIT_LIST_HEAD(&new->list);
- new->blkaddr = START_BLOCK(sbi, cpc->trim_start);
- new->len = sbi->blocks_per_seg;
- list_add_tail(&new->list, head);
- SM_I(sbi)->nr_discards += sbi->blocks_per_seg;
- cpc->trimmed += sbi->blocks_per_seg;
- return;
+ if (!force) {
+ if (!test_opt(sbi, DISCARD) || !se->valid_blocks ||
+ SM_I(sbi)->dcc_info->nr_discards >=
+ SM_I(sbi)->dcc_info->max_discards)
+ return false;
}
- /* zero block will be discarded through the prefree list */
- if (!se->valid_blocks || se->valid_blocks == max_blocks)
- return;
-
/* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */
for (i = 0; i < entries; i++)
- dmap[i] = (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i];
+ dmap[i] = force ? ~ckpt_map[i] & ~discard_map[i] :
+ (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i];
- while (force || SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) {
+ while (force || SM_I(sbi)->dcc_info->nr_discards <=
+ SM_I(sbi)->dcc_info->max_discards) {
start = __find_rev_next_bit(dmap, max_blocks, end + 1);
if (start >= max_blocks)
break;
end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1);
-
- if (end - start < cpc->trim_minlen)
+ if (force && start && end != max_blocks
+ && (end - start) < cpc->trim_minlen)
continue;
- new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS);
- INIT_LIST_HEAD(&new->list);
- new->blkaddr = START_BLOCK(sbi, cpc->trim_start) + start;
- new->len = end - start;
- cpc->trimmed += end - start;
+ if (check_only)
+ return true;
- list_add_tail(&new->list, head);
- SM_I(sbi)->nr_discards += end - start;
+ if (!de) {
+ de = f2fs_kmem_cache_alloc(discard_entry_slab,
+ GFP_F2FS_ZERO);
+ de->start_blkaddr = START_BLOCK(sbi, cpc->trim_start);
+ list_add_tail(&de->list, head);
+ }
+
+ for (i = start; i < end; i++)
+ __set_bit_le(i, (void *)de->discard_map);
+
+ SM_I(sbi)->dcc_info->nr_discards += end - start;
}
+ return false;
}
void release_discard_addrs(struct f2fs_sb_info *sbi)
{
- struct list_head *head = &(SM_I(sbi)->discard_list);
+ struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list);
struct discard_entry *entry, *this;
/* drop caches */
@@ -538,13 +1337,15 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
mutex_unlock(&dirty_i->seglist_lock);
}
-void clear_prefree_segments(struct f2fs_sb_info *sbi)
+void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
- struct list_head *head = &(SM_I(sbi)->discard_list);
+ struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list);
struct discard_entry *entry, *this;
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
unsigned int start = 0, end = -1;
+ unsigned int secno, start_segno;
+ bool force = (cpc->reason & CP_DISCARD);
mutex_lock(&dirty_i->seglist_lock);
@@ -564,18 +1365,127 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi)
if (!test_opt(sbi, DISCARD))
continue;
- f2fs_issue_discard(sbi, START_BLOCK(sbi, start),
+ if (force && start >= cpc->trim_start &&
+ (end - 1) <= cpc->trim_end)
+ continue;
+
+ if (!test_opt(sbi, LFS) || sbi->segs_per_sec == 1) {
+ f2fs_issue_discard(sbi, START_BLOCK(sbi, start),
(end - start) << sbi->log_blocks_per_seg);
+ continue;
+ }
+next:
+ secno = GET_SEC_FROM_SEG(sbi, start);
+ start_segno = GET_SEG_FROM_SEC(sbi, secno);
+ if (!IS_CURSEC(sbi, secno) &&
+ !get_valid_blocks(sbi, start, true))
+ f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno),
+ sbi->segs_per_sec << sbi->log_blocks_per_seg);
+
+ start = start_segno + sbi->segs_per_sec;
+ if (start < end)
+ goto next;
+ else
+ end = start - 1;
}
mutex_unlock(&dirty_i->seglist_lock);
/* send small discards */
list_for_each_entry_safe(entry, this, head, list) {
- f2fs_issue_discard(sbi, entry->blkaddr, entry->len);
+ unsigned int cur_pos = 0, next_pos, len, total_len = 0;
+ bool is_valid = test_bit_le(0, entry->discard_map);
+
+find_next:
+ if (is_valid) {
+ next_pos = find_next_zero_bit_le(entry->discard_map,
+ sbi->blocks_per_seg, cur_pos);
+ len = next_pos - cur_pos;
+
+ if (force && len < cpc->trim_minlen)
+ goto skip;
+
+ f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos,
+ len);
+ cpc->trimmed += len;
+ total_len += len;
+ } else {
+ next_pos = find_next_bit_le(entry->discard_map,
+ sbi->blocks_per_seg, cur_pos);
+ }
+skip:
+ cur_pos = next_pos;
+ is_valid = !is_valid;
+
+ if (cur_pos < sbi->blocks_per_seg)
+ goto find_next;
+
list_del(&entry->list);
- SM_I(sbi)->nr_discards -= entry->len;
+ SM_I(sbi)->dcc_info->nr_discards -= total_len;
kmem_cache_free(discard_entry_slab, entry);
}
+
+ wake_up(&SM_I(sbi)->dcc_info->discard_wait_queue);
+}
+
+static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
+{
+ dev_t dev = sbi->sb->s_bdev->bd_dev;
+ struct discard_cmd_control *dcc;
+ int err = 0, i;
+
+ if (SM_I(sbi)->dcc_info) {
+ dcc = SM_I(sbi)->dcc_info;
+ goto init_thread;
+ }
+
+ dcc = kzalloc(sizeof(struct discard_cmd_control), GFP_KERNEL);
+ if (!dcc)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&dcc->entry_list);
+ for (i = 0; i < MAX_PLIST_NUM; i++)
+ INIT_LIST_HEAD(&dcc->pend_list[i]);
+ INIT_LIST_HEAD(&dcc->wait_list);
+ mutex_init(&dcc->cmd_lock);
+ atomic_set(&dcc->issued_discard, 0);
+ atomic_set(&dcc->issing_discard, 0);
+ atomic_set(&dcc->discard_cmd_cnt, 0);
+ dcc->nr_discards = 0;
+ dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg;
+ dcc->undiscard_blks = 0;
+ dcc->root = RB_ROOT;
+
+ init_waitqueue_head(&dcc->discard_wait_queue);
+ SM_I(sbi)->dcc_info = dcc;
+init_thread:
+ dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi,
+ "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev));
+ if (IS_ERR(dcc->f2fs_issue_discard)) {
+ err = PTR_ERR(dcc->f2fs_issue_discard);
+ kfree(dcc);
+ SM_I(sbi)->dcc_info = NULL;
+ return err;
+ }
+
+ return err;
+}
+
+static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+
+ if (!dcc)
+ return;
+
+ if (dcc->f2fs_issue_discard) {
+ struct task_struct *discard_thread = dcc->f2fs_issue_discard;
+
+ dcc->f2fs_issue_discard = NULL;
+ kthread_stop(discard_thread);
+ }
+
+ kfree(dcc);
+ SM_I(sbi)->dcc_info = NULL;
}
static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno)
@@ -620,11 +1530,41 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
/* Update valid block bitmap */
if (del > 0) {
- if (f2fs_set_bit(offset, se->cur_valid_map))
+ if (f2fs_test_and_set_bit(offset, se->cur_valid_map)) {
+#ifdef CONFIG_F2FS_CHECK_FS
+ if (f2fs_test_and_set_bit(offset,
+ se->cur_valid_map_mir))
+ f2fs_bug_on(sbi, 1);
+ else
+ WARN_ON(1);
+#else
f2fs_bug_on(sbi, 1);
+#endif
+ }
+ if (f2fs_discard_en(sbi) &&
+ !f2fs_test_and_set_bit(offset, se->discard_map))
+ sbi->discard_blks--;
+
+ /* don't overwrite by SSR to keep node chain */
+ if (se->type == CURSEG_WARM_NODE) {
+ if (!f2fs_test_and_set_bit(offset, se->ckpt_valid_map))
+ se->ckpt_valid_blocks++;
+ }
} else {
- if (!f2fs_clear_bit(offset, se->cur_valid_map))
+ if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) {
+#ifdef CONFIG_F2FS_CHECK_FS
+ if (!f2fs_test_and_clear_bit(offset,
+ se->cur_valid_map_mir))
+ f2fs_bug_on(sbi, 1);
+ else
+ WARN_ON(1);
+#else
f2fs_bug_on(sbi, 1);
+#endif
+ }
+ if (f2fs_discard_en(sbi) &&
+ f2fs_test_and_clear_bit(offset, se->discard_map))
+ sbi->discard_blks++;
}
if (!f2fs_test_bit(offset, se->ckpt_valid_map))
se->ckpt_valid_blocks += del;
@@ -668,6 +1608,30 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
mutex_unlock(&sit_i->sentry_lock);
}
+bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ unsigned int segno, offset;
+ struct seg_entry *se;
+ bool is_cp = false;
+
+ if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR)
+ return true;
+
+ mutex_lock(&sit_i->sentry_lock);
+
+ segno = GET_SEGNO(sbi, blkaddr);
+ se = get_seg_entry(sbi, segno);
+ offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
+
+ if (f2fs_test_bit(offset, se->ckpt_valid_map))
+ is_cp = true;
+
+ mutex_unlock(&sit_i->sentry_lock);
+
+ return is_cp;
+}
+
/*
* This function should be resided under the curseg_mutex lock
*/
@@ -683,7 +1647,7 @@ static void __add_sum_entry(struct f2fs_sb_info *sbi, int type,
/*
* Calculate the number of current summary pages for writing
*/
-int npages_for_summary_flush(struct f2fs_sb_info *sbi)
+int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra)
{
int valid_sum_count = 0;
int i, sum_in_page;
@@ -691,16 +1655,21 @@ int npages_for_summary_flush(struct f2fs_sb_info *sbi)
for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
if (sbi->ckpt->alloc_type[i] == SSR)
valid_sum_count += sbi->blocks_per_seg;
- else
- valid_sum_count += curseg_blkoff(sbi, i);
+ else {
+ if (for_ra)
+ valid_sum_count += le16_to_cpu(
+ F2FS_CKPT(sbi)->cur_data_blkoff[i]);
+ else
+ valid_sum_count += curseg_blkoff(sbi, i);
+ }
}
- sum_in_page = (PAGE_CACHE_SIZE - 2 * SUM_JOURNAL_SIZE -
+ sum_in_page = (PAGE_SIZE - 2 * SUM_JOURNAL_SIZE -
SUM_FOOTER_SIZE) / SUMMARY_SIZE;
if (valid_sum_count <= sum_in_page)
return 1;
else if ((valid_sum_count - sum_in_page) <=
- (PAGE_CACHE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE)
+ (PAGE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE)
return 2;
return 3;
}
@@ -713,12 +1682,46 @@ struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno)
return get_meta_page(sbi, GET_SUM_BLOCK(sbi, segno));
}
+void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr)
+{
+ struct page *page = grab_meta_page(sbi, blk_addr);
+ void *dst = page_address(page);
+
+ if (src)
+ memcpy(dst, src, PAGE_SIZE);
+ else
+ memset(dst, 0, PAGE_SIZE);
+ set_page_dirty(page);
+ f2fs_put_page(page, 1);
+}
+
static void write_sum_page(struct f2fs_sb_info *sbi,
struct f2fs_summary_block *sum_blk, block_t blk_addr)
{
+ update_meta_page(sbi, (void *)sum_blk, blk_addr);
+}
+
+static void write_current_sum_page(struct f2fs_sb_info *sbi,
+ int type, block_t blk_addr)
+{
+ struct curseg_info *curseg = CURSEG_I(sbi, type);
struct page *page = grab_meta_page(sbi, blk_addr);
- void *kaddr = page_address(page);
- memcpy(kaddr, sum_blk, PAGE_CACHE_SIZE);
+ struct f2fs_summary_block *src = curseg->sum_blk;
+ struct f2fs_summary_block *dst;
+
+ dst = (struct f2fs_summary_block *)page_address(page);
+
+ mutex_lock(&curseg->curseg_mutex);
+
+ down_read(&curseg->journal_rwsem);
+ memcpy(&dst->journal, curseg->journal, SUM_JOURNAL_SIZE);
+ up_read(&curseg->journal_rwsem);
+
+ memcpy(dst->entries, src->entries, SUM_ENTRY_SIZE);
+ memcpy(&dst->footer, &src->footer, SUM_FOOTER_SIZE);
+
+ mutex_unlock(&curseg->curseg_mutex);
+
set_page_dirty(page);
f2fs_put_page(page, 1);
}
@@ -744,20 +1747,19 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
struct free_segmap_info *free_i = FREE_I(sbi);
unsigned int segno, secno, zoneno;
unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone;
- unsigned int hint = *newseg / sbi->segs_per_sec;
- unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg);
+ unsigned int hint = GET_SEC_FROM_SEG(sbi, *newseg);
+ unsigned int old_zoneno = GET_ZONE_FROM_SEG(sbi, *newseg);
unsigned int left_start = hint;
bool init = true;
int go_left = 0;
int i;
- write_lock(&free_i->segmap_lock);
+ spin_lock(&free_i->segmap_lock);
if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) {
segno = find_next_zero_bit(free_i->free_segmap,
- MAIN_SEGS(sbi), *newseg + 1);
- if (segno - *newseg < sbi->segs_per_sec -
- (*newseg % sbi->segs_per_sec))
+ GET_SEG_FROM_SEC(sbi, hint + 1), *newseg + 1);
+ if (segno < GET_SEG_FROM_SEC(sbi, hint + 1))
goto got_it;
}
find_other_zone:
@@ -788,8 +1790,8 @@ find_other_zone:
secno = left_start;
skip_left:
hint = secno;
- segno = secno * sbi->segs_per_sec;
- zoneno = secno / sbi->secs_per_zone;
+ segno = GET_SEG_FROM_SEC(sbi, secno);
+ zoneno = GET_ZONE_FROM_SEC(sbi, secno);
/* give up on finding another zone */
if (!init)
@@ -824,7 +1826,7 @@ got_it:
f2fs_bug_on(sbi, test_bit(segno, free_i->free_segmap));
__set_inuse(sbi, segno);
*newseg = segno;
- write_unlock(&free_i->segmap_lock);
+ spin_unlock(&free_i->segmap_lock);
}
static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified)
@@ -833,7 +1835,7 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified)
struct summary_footer *sum_footer;
curseg->segno = curseg->next_segno;
- curseg->zone = GET_ZONENO_FROM_SEGNO(sbi, curseg->segno);
+ curseg->zone = GET_ZONE_FROM_SEG(sbi, curseg->segno);
curseg->next_blkoff = 0;
curseg->next_segno = NULL_SEGNO;
@@ -846,6 +1848,20 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified)
__set_sit_entry_type(sbi, type, curseg->segno, modified);
}
+static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type)
+{
+ /* if segs_per_sec is large than 1, we need to keep original policy. */
+ if (sbi->segs_per_sec != 1)
+ return CURSEG_I(sbi, type)->segno;
+
+ if (type == CURSEG_HOT_DATA || IS_NODESEG(type))
+ return 0;
+
+ if (SIT_I(sbi)->last_victim[ALLOC_NEXT])
+ return SIT_I(sbi)->last_victim[ALLOC_NEXT];
+ return CURSEG_I(sbi, type)->segno;
+}
+
/*
* Allocate a current working segment.
* This function always allocates a free segment in LFS manner.
@@ -864,6 +1880,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
if (test_opt(sbi, NOHEAP))
dir = ALLOC_RIGHT;
+ segno = __get_next_segno(sbi, type);
get_new_segment(sbi, &segno, new_sec, dir);
curseg->next_segno = segno;
reset_curseg(sbi, type, 1);
@@ -875,7 +1892,7 @@ static void __next_free_blkoff(struct f2fs_sb_info *sbi,
{
struct seg_entry *se = get_seg_entry(sbi, seg->segno);
int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
- unsigned long target_map[entries];
+ unsigned long *target_map = SIT_I(sbi)->tmp_map;
unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
int i, pos;
@@ -939,16 +1956,43 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops;
+ unsigned segno = NULL_SEGNO;
+ int i, cnt;
+ bool reversed = false;
+
+ /* need_SSR() already forces to do this */
+ if (v_ops->get_victim(sbi, &segno, BG_GC, type, SSR)) {
+ curseg->next_segno = segno;
+ return 1;
+ }
- if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0))
- return v_ops->get_victim(sbi,
- &(curseg)->next_segno, BG_GC, type, SSR);
+ /* For node segments, let's do SSR more intensively */
+ if (IS_NODESEG(type)) {
+ if (type >= CURSEG_WARM_NODE) {
+ reversed = true;
+ i = CURSEG_COLD_NODE;
+ } else {
+ i = CURSEG_HOT_NODE;
+ }
+ cnt = NR_CURSEG_NODE_TYPE;
+ } else {
+ if (type >= CURSEG_WARM_DATA) {
+ reversed = true;
+ i = CURSEG_COLD_DATA;
+ } else {
+ i = CURSEG_HOT_DATA;
+ }
+ cnt = NR_CURSEG_DATA_TYPE;
+ }
- /* For data segments, let's do SSR more intensively */
- for (; type >= CURSEG_HOT_DATA; type--)
- if (v_ops->get_victim(sbi, &(curseg)->next_segno,
- BG_GC, type, SSR))
+ for (; cnt-- > 0; reversed ? i-- : i++) {
+ if (i == type)
+ continue;
+ if (v_ops->get_victim(sbi, &segno, BG_GC, i, SSR)) {
+ curseg->next_segno = segno;
return 1;
+ }
+ }
return 0;
}
@@ -963,7 +2007,8 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
if (force)
new_curseg(sbi, type, true);
- else if (type == CURSEG_WARM_NODE)
+ else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) &&
+ type == CURSEG_WARM_NODE)
new_curseg(sbi, type, false);
else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type))
new_curseg(sbi, type, false);
@@ -978,14 +2023,14 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
void allocate_new_segments(struct f2fs_sb_info *sbi)
{
struct curseg_info *curseg;
- unsigned int old_curseg;
+ unsigned int old_segno;
int i;
for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
curseg = CURSEG_I(sbi, i);
- old_curseg = curseg->segno;
+ old_segno = curseg->segno;
SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true);
- locate_dirty_segment(sbi, old_curseg);
+ locate_dirty_segment(sbi, old_segno);
}
}
@@ -993,35 +2038,77 @@ static const struct segment_allocation default_salloc_ops = {
.allocate_segment = allocate_segment_by_default,
};
+bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc)
+{
+ __u64 trim_start = cpc->trim_start;
+ bool has_candidate = false;
+
+ mutex_lock(&SIT_I(sbi)->sentry_lock);
+ for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) {
+ if (add_discard_addrs(sbi, cpc, true)) {
+ has_candidate = true;
+ break;
+ }
+ }
+ mutex_unlock(&SIT_I(sbi)->sentry_lock);
+
+ cpc->trim_start = trim_start;
+ return has_candidate;
+}
+
int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
{
- __u64 start = range->start >> sbi->log_blocksize;
- __u64 end = start + (range->len >> sbi->log_blocksize) - 1;
+ __u64 start = F2FS_BYTES_TO_BLK(range->start);
+ __u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1;
unsigned int start_segno, end_segno;
struct cp_control cpc;
+ int err = 0;
- if (range->minlen > SEGMENT_SIZE(sbi) || start >= MAX_BLKADDR(sbi) ||
- range->len < sbi->blocksize)
+ if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize)
return -EINVAL;
cpc.trimmed = 0;
if (end <= MAIN_BLKADDR(sbi))
goto out;
+ if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) {
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "Found FS corruption, run fsck to fix.");
+ goto out;
+ }
+
/* start/end segment number in main_area */
start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start);
end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 :
GET_SEGNO(sbi, end);
cpc.reason = CP_DISCARD;
- cpc.trim_start = start_segno;
- cpc.trim_end = end_segno;
- cpc.trim_minlen = range->minlen >> sbi->log_blocksize;
+ cpc.trim_minlen = max_t(__u64, 1, F2FS_BYTES_TO_BLK(range->minlen));
/* do checkpoint to issue discard commands safely */
- write_checkpoint(sbi, &cpc);
+ for (; start_segno <= end_segno; start_segno = cpc.trim_end + 1) {
+ cpc.trim_start = start_segno;
+
+ if (sbi->discard_blks == 0)
+ break;
+ else if (sbi->discard_blks < BATCHED_TRIM_BLOCKS(sbi))
+ cpc.trim_end = end_segno;
+ else
+ cpc.trim_end = min_t(unsigned int,
+ rounddown(start_segno +
+ BATCHED_TRIM_SEGMENTS(sbi),
+ sbi->segs_per_sec) - 1, end_segno);
+
+ mutex_lock(&sbi->gc_mutex);
+ err = write_checkpoint(sbi, &cpc);
+ mutex_unlock(&sbi->gc_mutex);
+ if (err)
+ break;
+
+ schedule();
+ }
out:
- range->len = cpc.trimmed << sbi->log_blocksize;
- return 0;
+ range->len = F2FS_BLK_TO_BYTES(cpc.trimmed);
+ return err;
}
static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type)
@@ -1050,8 +2137,8 @@ static int __get_segment_type_4(struct page *page, enum page_type p_type)
else
return CURSEG_COLD_DATA;
} else {
- if (IS_DNODE(page) && !is_cold_node(page))
- return CURSEG_HOT_NODE;
+ if (IS_DNODE(page) && is_cold_node(page))
+ return CURSEG_WARM_NODE;
else
return CURSEG_COLD_NODE;
}
@@ -1062,18 +2149,16 @@ static int __get_segment_type_6(struct page *page, enum page_type p_type)
if (p_type == DATA) {
struct inode *inode = page->mapping->host;
- if (S_ISDIR(inode->i_mode))
- return CURSEG_HOT_DATA;
- else if (is_cold_data(page) || file_is_cold(inode))
+ if (is_cold_data(page) || file_is_cold(inode))
return CURSEG_COLD_DATA;
- else
- return CURSEG_WARM_DATA;
+ if (is_inode_flag_set(inode, FI_HOT_DATA))
+ return CURSEG_HOT_DATA;
+ return CURSEG_WARM_DATA;
} else {
if (IS_DNODE(page))
return is_cold_node(page) ? CURSEG_WARM_NODE :
CURSEG_HOT_NODE;
- else
- return CURSEG_COLD_NODE;
+ return CURSEG_COLD_NODE;
}
}
@@ -1096,14 +2181,15 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
struct f2fs_summary *sum, int type)
{
struct sit_info *sit_i = SIT_I(sbi);
- struct curseg_info *curseg;
-
- curseg = CURSEG_I(sbi, type);
+ struct curseg_info *curseg = CURSEG_I(sbi, type);
mutex_lock(&curseg->curseg_mutex);
+ mutex_lock(&sit_i->sentry_lock);
*new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
+ f2fs_wait_discard_bio(sbi, *new_blkaddr);
+
/*
* __add_sum_entry should be resided under the curseg_mutex
* because, this function updates a summary entry in the
@@ -1111,7 +2197,6 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
*/
__add_sum_entry(sbi, type, sum);
- mutex_lock(&sit_i->sentry_lock);
__refresh_next_blkoff(sbi, curseg);
stat_inc_block_count(sbi, curseg);
@@ -1119,8 +2204,8 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
if (!__has_curseg_space(sbi, type))
sit_i->s_ops->allocate_segment(sbi, type, false);
/*
- * SIT information should be updated before segment allocation,
- * since SSR needs latest valid block information.
+ * SIT information should be updated after segment allocation,
+ * since we need to keep dirty segments precisely under SSR.
*/
refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr);
@@ -1132,84 +2217,111 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
mutex_unlock(&curseg->curseg_mutex);
}
-static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
- block_t old_blkaddr, block_t *new_blkaddr,
- struct f2fs_summary *sum, struct f2fs_io_info *fio)
+static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
{
- int type = __get_segment_type(page, fio->type);
+ int type = __get_segment_type(fio->page, fio->type);
+ int err;
- allocate_data_block(sbi, page, old_blkaddr, new_blkaddr, sum, type);
+ if (fio->type == NODE || fio->type == DATA)
+ mutex_lock(&fio->sbi->wio_mutex[fio->type]);
+reallocate:
+ allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr,
+ &fio->new_blkaddr, sum, type);
/* writeout dirty page into bdev */
- f2fs_submit_page_mbio(sbi, page, *new_blkaddr, fio);
+ err = f2fs_submit_page_mbio(fio);
+ if (err == -EAGAIN) {
+ fio->old_blkaddr = fio->new_blkaddr;
+ goto reallocate;
+ }
+
+ if (fio->type == NODE || fio->type == DATA)
+ mutex_unlock(&fio->sbi->wio_mutex[fio->type]);
}
void write_meta_page(struct f2fs_sb_info *sbi, struct page *page)
{
struct f2fs_io_info fio = {
+ .sbi = sbi,
.type = META,
- .rw = WRITE_SYNC | REQ_META | REQ_PRIO
+ .op = REQ_OP_WRITE,
+ .op_flags = REQ_SYNC | REQ_META | REQ_PRIO,
+ .old_blkaddr = page->index,
+ .new_blkaddr = page->index,
+ .page = page,
+ .encrypted_page = NULL,
};
+ if (unlikely(page->index >= MAIN_BLKADDR(sbi)))
+ fio.op_flags &= ~REQ_META;
+
set_page_writeback(page);
- f2fs_submit_page_mbio(sbi, page, page->index, &fio);
+ f2fs_submit_page_mbio(&fio);
}
-void write_node_page(struct f2fs_sb_info *sbi, struct page *page,
- struct f2fs_io_info *fio,
- unsigned int nid, block_t old_blkaddr, block_t *new_blkaddr)
+void write_node_page(unsigned int nid, struct f2fs_io_info *fio)
{
struct f2fs_summary sum;
+
set_summary(&sum, nid, 0, 0);
- do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, fio);
+ do_write_page(&sum, fio);
}
-void write_data_page(struct page *page, struct dnode_of_data *dn,
- block_t *new_blkaddr, struct f2fs_io_info *fio)
+void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio)
{
- struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
+ struct f2fs_sb_info *sbi = fio->sbi;
struct f2fs_summary sum;
struct node_info ni;
f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR);
get_node_info(sbi, dn->nid, &ni);
set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
-
- do_write_page(sbi, page, dn->data_blkaddr, new_blkaddr, &sum, fio);
+ do_write_page(&sum, fio);
+ f2fs_update_data_blkaddr(dn, fio->new_blkaddr);
}
-void rewrite_data_page(struct page *page, block_t old_blkaddr,
- struct f2fs_io_info *fio)
+int rewrite_data_page(struct f2fs_io_info *fio)
{
- f2fs_submit_page_mbio(F2FS_P_SB(page), page, old_blkaddr, fio);
+ fio->new_blkaddr = fio->old_blkaddr;
+ stat_inc_inplace_blocks(fio->sbi);
+ return f2fs_submit_page_bio(fio);
}
-void recover_data_page(struct f2fs_sb_info *sbi,
- struct page *page, struct f2fs_summary *sum,
- block_t old_blkaddr, block_t new_blkaddr)
+void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
+ block_t old_blkaddr, block_t new_blkaddr,
+ bool recover_curseg, bool recover_newaddr)
{
struct sit_info *sit_i = SIT_I(sbi);
struct curseg_info *curseg;
unsigned int segno, old_cursegno;
struct seg_entry *se;
int type;
+ unsigned short old_blkoff;
segno = GET_SEGNO(sbi, new_blkaddr);
se = get_seg_entry(sbi, segno);
type = se->type;
- if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) {
- if (old_blkaddr == NULL_ADDR)
- type = CURSEG_COLD_DATA;
- else
+ if (!recover_curseg) {
+ /* for recovery flow */
+ if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) {
+ if (old_blkaddr == NULL_ADDR)
+ type = CURSEG_COLD_DATA;
+ else
+ type = CURSEG_WARM_DATA;
+ }
+ } else {
+ if (!IS_CURSEG(sbi, segno))
type = CURSEG_WARM_DATA;
}
+
curseg = CURSEG_I(sbi, type);
mutex_lock(&curseg->curseg_mutex);
mutex_lock(&sit_i->sentry_lock);
old_cursegno = curseg->segno;
+ old_blkoff = curseg->next_blkoff;
/* change the current segment */
if (segno != curseg->segno) {
@@ -1220,46 +2332,70 @@ void recover_data_page(struct f2fs_sb_info *sbi,
curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
__add_sum_entry(sbi, type, sum);
- refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
+ if (!recover_curseg || recover_newaddr)
+ update_sit_entry(sbi, new_blkaddr, 1);
+ if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
+ update_sit_entry(sbi, old_blkaddr, -1);
+
+ locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
+ locate_dirty_segment(sbi, GET_SEGNO(sbi, new_blkaddr));
+
locate_dirty_segment(sbi, old_cursegno);
+ if (recover_curseg) {
+ if (old_cursegno != curseg->segno) {
+ curseg->next_segno = old_cursegno;
+ change_curseg(sbi, type, true);
+ }
+ curseg->next_blkoff = old_blkoff;
+ }
+
mutex_unlock(&sit_i->sentry_lock);
mutex_unlock(&curseg->curseg_mutex);
}
-static inline bool is_merged_page(struct f2fs_sb_info *sbi,
- struct page *page, enum page_type type)
+void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
+ block_t old_addr, block_t new_addr,
+ unsigned char version, bool recover_curseg,
+ bool recover_newaddr)
{
- enum page_type btype = PAGE_TYPE_OF_BIO(type);
- struct f2fs_bio_info *io = &sbi->write_io[btype];
- struct bio_vec *bvec;
- int i;
+ struct f2fs_summary sum;
- down_read(&io->io_rwsem);
- if (!io->bio)
- goto out;
+ set_summary(&sum, dn->nid, dn->ofs_in_node, version);
- bio_for_each_segment_all(bvec, io->bio, i) {
- if (page == bvec->bv_page) {
- up_read(&io->io_rwsem);
- return true;
- }
- }
+ __f2fs_replace_block(sbi, &sum, old_addr, new_addr,
+ recover_curseg, recover_newaddr);
-out:
- up_read(&io->io_rwsem);
- return false;
+ f2fs_update_data_blkaddr(dn, new_addr);
}
void f2fs_wait_on_page_writeback(struct page *page,
- enum page_type type)
+ enum page_type type, bool ordered)
{
if (PageWriteback(page)) {
struct f2fs_sb_info *sbi = F2FS_P_SB(page);
- if (is_merged_page(sbi, page, type))
- f2fs_submit_merged_bio(sbi, type, WRITE);
- wait_on_page_writeback(page);
+ f2fs_submit_merged_bio_cond(sbi, page->mapping->host,
+ 0, page->index, type, WRITE);
+ if (ordered)
+ wait_on_page_writeback(page);
+ else
+ wait_for_stable_page(page);
+ }
+}
+
+void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi,
+ block_t blkaddr)
+{
+ struct page *cpage;
+
+ if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR)
+ return;
+
+ cpage = find_lock_page(META_MAPPING(sbi), blkaddr);
+ if (cpage) {
+ f2fs_wait_on_page_writeback(cpage, DATA, true);
+ f2fs_put_page(cpage, 1);
}
}
@@ -1279,12 +2415,11 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi)
/* Step 1: restore nat cache */
seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA);
- memcpy(&seg_i->sum_blk->n_nats, kaddr, SUM_JOURNAL_SIZE);
+ memcpy(seg_i->journal, kaddr, SUM_JOURNAL_SIZE);
/* Step 2: restore sit cache */
seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA);
- memcpy(&seg_i->sum_blk->n_sits, kaddr + SUM_JOURNAL_SIZE,
- SUM_JOURNAL_SIZE);
+ memcpy(seg_i->journal, kaddr + SUM_JOURNAL_SIZE, SUM_JOURNAL_SIZE);
offset = 2 * SUM_JOURNAL_SIZE;
/* Step 3: restore summary entries */
@@ -1308,7 +2443,7 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi)
s = (struct f2fs_summary *)(kaddr + offset);
seg_i->sum_blk->entries[j] = *s;
offset += SUMMARY_SIZE;
- if (offset + SUMMARY_SIZE <= PAGE_CACHE_SIZE -
+ if (offset + SUMMARY_SIZE <= PAGE_SIZE -
SUM_FOOTER_SIZE)
continue;
@@ -1339,7 +2474,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
segno = le32_to_cpu(ckpt->cur_data_segno[type]);
blk_off = le16_to_cpu(ckpt->cur_data_blkoff[type -
CURSEG_HOT_DATA]);
- if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG))
+ if (__exist_node_summaries(sbi))
blk_addr = sum_blk_addr(sbi, NR_CURSEG_TYPE, type);
else
blk_addr = sum_blk_addr(sbi, NR_CURSEG_DATA_TYPE, type);
@@ -1348,7 +2483,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
CURSEG_HOT_NODE]);
blk_off = le16_to_cpu(ckpt->cur_node_blkoff[type -
CURSEG_HOT_NODE]);
- if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG))
+ if (__exist_node_summaries(sbi))
blk_addr = sum_blk_addr(sbi, NR_CURSEG_NODE_TYPE,
type - CURSEG_HOT_NODE);
else
@@ -1359,7 +2494,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
sum = (struct f2fs_summary_block *)page_address(new);
if (IS_NODESEG(type)) {
- if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG)) {
+ if (__exist_node_summaries(sbi)) {
struct f2fs_summary *ns = &sum->entries[0];
int i;
for (i = 0; i < sbi->blocks_per_seg; i++, ns++) {
@@ -1380,7 +2515,14 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
/* set uncompleted segment to curseg */
curseg = CURSEG_I(sbi, type);
mutex_lock(&curseg->curseg_mutex);
- memcpy(curseg->sum_blk, sum, PAGE_CACHE_SIZE);
+
+ /* update journal info */
+ down_write(&curseg->journal_rwsem);
+ memcpy(curseg->journal, &sum->journal, SUM_JOURNAL_SIZE);
+ up_write(&curseg->journal_rwsem);
+
+ memcpy(curseg->sum_blk->entries, sum->entries, SUM_ENTRY_SIZE);
+ memcpy(&curseg->sum_blk->footer, &sum->footer, SUM_FOOTER_SIZE);
curseg->next_segno = segno;
reset_curseg(sbi, type, 0);
curseg->alloc_type = ckpt->alloc_type[type];
@@ -1392,22 +2534,39 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
{
+ struct f2fs_journal *sit_j = CURSEG_I(sbi, CURSEG_COLD_DATA)->journal;
+ struct f2fs_journal *nat_j = CURSEG_I(sbi, CURSEG_HOT_DATA)->journal;
int type = CURSEG_HOT_DATA;
int err;
- if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) {
+ if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) {
+ int npages = npages_for_summary_flush(sbi, true);
+
+ if (npages >= 2)
+ ra_meta_pages(sbi, start_sum_block(sbi), npages,
+ META_CP, true);
+
/* restore for compacted data summary */
if (read_compacted_summaries(sbi))
return -EINVAL;
type = CURSEG_HOT_NODE;
}
+ if (__exist_node_summaries(sbi))
+ ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_TYPE, type),
+ NR_CURSEG_TYPE - type, META_CP, true);
+
for (; type <= CURSEG_COLD_NODE; type++) {
err = read_normal_summaries(sbi, type);
if (err)
return err;
}
+ /* sanity check for summary blocks */
+ if (nats_in_cursum(nat_j) > NAT_JOURNAL_ENTRIES ||
+ sits_in_cursum(sit_j) > SIT_JOURNAL_ENTRIES)
+ return -EINVAL;
+
return 0;
}
@@ -1425,13 +2584,12 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr)
/* Step 1: write nat cache */
seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA);
- memcpy(kaddr, &seg_i->sum_blk->n_nats, SUM_JOURNAL_SIZE);
+ memcpy(kaddr, seg_i->journal, SUM_JOURNAL_SIZE);
written_size += SUM_JOURNAL_SIZE;
/* Step 2: write sit cache */
seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA);
- memcpy(kaddr + written_size, &seg_i->sum_blk->n_sits,
- SUM_JOURNAL_SIZE);
+ memcpy(kaddr + written_size, seg_i->journal, SUM_JOURNAL_SIZE);
written_size += SUM_JOURNAL_SIZE;
/* Step 3: write summary entries */
@@ -1453,7 +2611,7 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr)
*summary = seg_i->sum_blk->entries[j];
written_size += SUMMARY_SIZE;
- if (written_size + SUMMARY_SIZE <= PAGE_CACHE_SIZE -
+ if (written_size + SUMMARY_SIZE <= PAGE_SIZE -
SUM_FOOTER_SIZE)
continue;
@@ -1477,17 +2635,13 @@ static void write_normal_summaries(struct f2fs_sb_info *sbi,
else
end = type + NR_CURSEG_NODE_TYPE;
- for (i = type; i < end; i++) {
- struct curseg_info *sum = CURSEG_I(sbi, i);
- mutex_lock(&sum->curseg_mutex);
- write_sum_page(sbi, sum->sum_blk, blkaddr + (i - type));
- mutex_unlock(&sum->curseg_mutex);
- }
+ for (i = type; i < end; i++)
+ write_current_sum_page(sbi, i, blkaddr + (i - type));
}
void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
{
- if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG))
+ if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG))
write_compacted_summaries(sbi, start_blk);
else
write_normal_summaries(sbi, start_blk, CURSEG_HOT_DATA);
@@ -1495,28 +2649,27 @@ void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
{
- if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG))
- write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE);
+ write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE);
}
-int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type,
+int lookup_journal_in_cursum(struct f2fs_journal *journal, int type,
unsigned int val, int alloc)
{
int i;
if (type == NAT_JOURNAL) {
- for (i = 0; i < nats_in_cursum(sum); i++) {
- if (le32_to_cpu(nid_in_journal(sum, i)) == val)
+ for (i = 0; i < nats_in_cursum(journal); i++) {
+ if (le32_to_cpu(nid_in_journal(journal, i)) == val)
return i;
}
- if (alloc && nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES)
- return update_nats_in_cursum(sum, 1);
+ if (alloc && __has_cursum_space(journal, 1, NAT_JOURNAL))
+ return update_nats_in_cursum(journal, 1);
} else if (type == SIT_JOURNAL) {
- for (i = 0; i < sits_in_cursum(sum); i++)
- if (le32_to_cpu(segno_in_journal(sum, i)) == val)
+ for (i = 0; i < sits_in_cursum(journal); i++)
+ if (le32_to_cpu(segno_in_journal(journal, i)) == val)
return i;
- if (alloc && sits_in_cursum(sum) < SIT_JOURNAL_ENTRIES)
- return update_sits_in_cursum(sum, 1);
+ if (alloc && __has_cursum_space(journal, 1, SIT_JOURNAL))
+ return update_sits_in_cursum(journal, 1);
}
return -1;
}
@@ -1524,17 +2677,7 @@ int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type,
static struct page *get_current_sit_page(struct f2fs_sb_info *sbi,
unsigned int segno)
{
- struct sit_info *sit_i = SIT_I(sbi);
- unsigned int offset = SIT_BLOCK_OFFSET(segno);
- block_t blk_addr = sit_i->sit_base_addr + offset;
-
- check_seg_range(sbi, segno);
-
- /* calculate sit block address */
- if (f2fs_test_bit(offset, sit_i->sit_bitmap))
- blk_addr += sit_i->sit_blocks;
-
- return get_meta_page(sbi, blk_addr);
+ return get_meta_page(sbi, current_sit_addr(sbi, segno));
}
static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
@@ -1555,7 +2698,7 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
src_addr = page_address(src_page);
dst_addr = page_address(dst_page);
- memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE);
+ memcpy(dst_addr, src_addr, PAGE_SIZE);
set_page_dirty(dst_page);
f2fs_put_page(src_page, 1);
@@ -1568,7 +2711,7 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
static struct sit_entry_set *grab_sit_entry_set(void)
{
struct sit_entry_set *ses =
- f2fs_kmem_cache_alloc(sit_entry_set_slab, GFP_ATOMIC);
+ f2fs_kmem_cache_alloc(sit_entry_set_slab, GFP_NOFS);
ses->entry_cnt = 0;
INIT_LIST_HEAD(&ses->set_list);
@@ -1630,20 +2773,22 @@ static void add_sits_in_set(struct f2fs_sb_info *sbi)
static void remove_sits_in_journal(struct f2fs_sb_info *sbi)
{
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
- struct f2fs_summary_block *sum = curseg->sum_blk;
+ struct f2fs_journal *journal = curseg->journal;
int i;
- for (i = sits_in_cursum(sum) - 1; i >= 0; i--) {
+ down_write(&curseg->journal_rwsem);
+ for (i = 0; i < sits_in_cursum(journal); i++) {
unsigned int segno;
bool dirtied;
- segno = le32_to_cpu(segno_in_journal(sum, i));
+ segno = le32_to_cpu(segno_in_journal(journal, i));
dirtied = __mark_sit_entry_dirty(sbi, segno);
if (!dirtied)
add_sit_entry(segno, &SM_I(sbi)->sit_entry_set);
}
- update_sits_in_cursum(sum, -sits_in_cursum(sum));
+ update_sits_in_cursum(journal, -i);
+ up_write(&curseg->journal_rwsem);
}
/*
@@ -1655,15 +2800,17 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
struct sit_info *sit_i = SIT_I(sbi);
unsigned long *bitmap = sit_i->dirty_sentries_bitmap;
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
- struct f2fs_summary_block *sum = curseg->sum_blk;
+ struct f2fs_journal *journal = curseg->journal;
struct sit_entry_set *ses, *tmp;
struct list_head *head = &SM_I(sbi)->sit_entry_set;
bool to_journal = true;
struct seg_entry *se;
- mutex_lock(&curseg->curseg_mutex);
mutex_lock(&sit_i->sentry_lock);
+ if (!sit_i->dirty_sentries)
+ goto out;
+
/*
* add and account sit entries of dirty bitmap in sit entry
* set temporarily
@@ -1675,19 +2822,16 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
* entries, remove all entries from journal and add and account
* them in sit entry set.
*/
- if (!__has_cursum_space(sum, sit_i->dirty_sentries, SIT_JOURNAL))
+ if (!__has_cursum_space(journal, sit_i->dirty_sentries, SIT_JOURNAL))
remove_sits_in_journal(sbi);
- if (!sit_i->dirty_sentries)
- goto out;
-
/*
* there are two steps to flush sit entries:
* #1, flush sit entries to journal in current cold data summary block.
* #2, flush sit entries to sit page.
*/
list_for_each_entry_safe(ses, tmp, head, set_list) {
- struct page *page;
+ struct page *page = NULL;
struct f2fs_sit_block *raw_sit = NULL;
unsigned int start_segno = ses->start_segno;
unsigned int end = min(start_segno + SIT_ENTRY_PER_BLOCK,
@@ -1695,10 +2839,12 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
unsigned int segno = start_segno;
if (to_journal &&
- !__has_cursum_space(sum, ses->entry_cnt, SIT_JOURNAL))
+ !__has_cursum_space(journal, ses->entry_cnt, SIT_JOURNAL))
to_journal = false;
- if (!to_journal) {
+ if (to_journal) {
+ down_write(&curseg->journal_rwsem);
+ } else {
page = get_next_sit_page(sbi, start_segno);
raw_sit = page_address(page);
}
@@ -1710,19 +2856,19 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
se = get_seg_entry(sbi, segno);
/* add discard candidates */
- if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards) {
+ if (!(cpc->reason & CP_DISCARD)) {
cpc->trim_start = segno;
- add_discard_addrs(sbi, cpc);
+ add_discard_addrs(sbi, cpc, false);
}
if (to_journal) {
- offset = lookup_journal_in_cursum(sum,
+ offset = lookup_journal_in_cursum(journal,
SIT_JOURNAL, segno, 1);
f2fs_bug_on(sbi, offset < 0);
- segno_in_journal(sum, offset) =
+ segno_in_journal(journal, offset) =
cpu_to_le32(segno);
seg_info_to_raw_sit(se,
- &sit_in_journal(sum, offset));
+ &sit_in_journal(journal, offset));
} else {
sit_offset = SIT_ENTRY_OFFSET(sit_i, segno);
seg_info_to_raw_sit(se,
@@ -1734,7 +2880,9 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
ses->entry_cnt--;
}
- if (!to_journal)
+ if (to_journal)
+ up_write(&curseg->journal_rwsem);
+ else
f2fs_put_page(page, 1);
f2fs_bug_on(sbi, ses->entry_cnt);
@@ -1744,12 +2892,15 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
f2fs_bug_on(sbi, !list_empty(head));
f2fs_bug_on(sbi, sit_i->dirty_sentries);
out:
- if (cpc->reason == CP_DISCARD) {
+ if (cpc->reason & CP_DISCARD) {
+ __u64 trim_start = cpc->trim_start;
+
for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++)
- add_discard_addrs(sbi, cpc);
+ add_discard_addrs(sbi, cpc, false);
+
+ cpc->trim_start = trim_start;
}
mutex_unlock(&sit_i->sentry_lock);
- mutex_unlock(&curseg->curseg_mutex);
set_prefree_as_free_segments(sbi);
}
@@ -1757,10 +2908,9 @@ out:
static int build_sit_info(struct f2fs_sb_info *sbi)
{
struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
- struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
struct sit_info *sit_i;
unsigned int sit_segs, start;
- char *src_bitmap, *dst_bitmap;
+ char *src_bitmap;
unsigned int bitmap_size;
/* allocate memory for SIT information */
@@ -1770,12 +2920,13 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
SM_I(sbi)->sit_info = sit_i;
- sit_i->sentries = vzalloc(MAIN_SEGS(sbi) * sizeof(struct seg_entry));
+ sit_i->sentries = f2fs_kvzalloc(MAIN_SEGS(sbi) *
+ sizeof(struct seg_entry), GFP_KERNEL);
if (!sit_i->sentries)
return -ENOMEM;
bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
- sit_i->dirty_sentries_bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+ sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL);
if (!sit_i->dirty_sentries_bitmap)
return -ENOMEM;
@@ -1784,14 +2935,32 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
= kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
sit_i->sentries[start].ckpt_valid_map
= kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
- if (!sit_i->sentries[start].cur_valid_map
- || !sit_i->sentries[start].ckpt_valid_map)
+ if (!sit_i->sentries[start].cur_valid_map ||
+ !sit_i->sentries[start].ckpt_valid_map)
return -ENOMEM;
+
+#ifdef CONFIG_F2FS_CHECK_FS
+ sit_i->sentries[start].cur_valid_map_mir
+ = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
+ if (!sit_i->sentries[start].cur_valid_map_mir)
+ return -ENOMEM;
+#endif
+
+ if (f2fs_discard_en(sbi)) {
+ sit_i->sentries[start].discard_map
+ = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
+ if (!sit_i->sentries[start].discard_map)
+ return -ENOMEM;
+ }
}
+ sit_i->tmp_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
+ if (!sit_i->tmp_map)
+ return -ENOMEM;
+
if (sbi->segs_per_sec > 1) {
- sit_i->sec_entries = vzalloc(MAIN_SECS(sbi) *
- sizeof(struct sec_entry));
+ sit_i->sec_entries = f2fs_kvzalloc(MAIN_SECS(sbi) *
+ sizeof(struct sec_entry), GFP_KERNEL);
if (!sit_i->sec_entries)
return -ENOMEM;
}
@@ -1803,17 +2972,22 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
bitmap_size = __bitmap_size(sbi, SIT_BITMAP);
src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP);
- dst_bitmap = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL);
- if (!dst_bitmap)
+ sit_i->sit_bitmap = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL);
+ if (!sit_i->sit_bitmap)
return -ENOMEM;
+#ifdef CONFIG_F2FS_CHECK_FS
+ sit_i->sit_bitmap_mir = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL);
+ if (!sit_i->sit_bitmap_mir)
+ return -ENOMEM;
+#endif
+
/* init SIT information */
sit_i->s_ops = &default_salloc_ops;
sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr);
sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg;
- sit_i->written_valid_blocks = le64_to_cpu(ckpt->valid_block_count);
- sit_i->sit_bitmap = dst_bitmap;
+ sit_i->written_valid_blocks = 0;
sit_i->bitmap_size = bitmap_size;
sit_i->dirty_sentries = 0;
sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK;
@@ -1836,12 +3010,12 @@ static int build_free_segmap(struct f2fs_sb_info *sbi)
SM_I(sbi)->free_info = free_i;
bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
- free_i->free_segmap = kmalloc(bitmap_size, GFP_KERNEL);
+ free_i->free_segmap = f2fs_kvmalloc(bitmap_size, GFP_KERNEL);
if (!free_i->free_segmap)
return -ENOMEM;
sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
- free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL);
+ free_i->free_secmap = f2fs_kvmalloc(sec_bitmap_size, GFP_KERNEL);
if (!free_i->free_secmap)
return -ENOMEM;
@@ -1853,7 +3027,7 @@ static int build_free_segmap(struct f2fs_sb_info *sbi)
free_i->start_segno = GET_SEGNO_FROM_SEG0(sbi, MAIN_BLKADDR(sbi));
free_i->free_segments = 0;
free_i->free_sections = 0;
- rwlock_init(&free_i->segmap_lock);
+ spin_lock_init(&free_i->segmap_lock);
return 0;
}
@@ -1870,9 +3044,14 @@ static int build_curseg(struct f2fs_sb_info *sbi)
for (i = 0; i < NR_CURSEG_TYPE; i++) {
mutex_init(&array[i].curseg_mutex);
- array[i].sum_blk = kzalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
+ array[i].sum_blk = kzalloc(PAGE_SIZE, GFP_KERNEL);
if (!array[i].sum_blk)
return -ENOMEM;
+ init_rwsem(&array[i].journal_rwsem);
+ array[i].journal = kzalloc(sizeof(struct f2fs_journal),
+ GFP_KERNEL);
+ if (!array[i].journal)
+ return -ENOMEM;
array[i].segno = NULL_SEGNO;
array[i].next_blkoff = 0;
}
@@ -1883,49 +3062,85 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
{
struct sit_info *sit_i = SIT_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
- struct f2fs_summary_block *sum = curseg->sum_blk;
+ struct f2fs_journal *journal = curseg->journal;
+ struct seg_entry *se;
+ struct f2fs_sit_entry sit;
int sit_blk_cnt = SIT_BLK_CNT(sbi);
unsigned int i, start, end;
unsigned int readed, start_blk = 0;
- int nrpages = MAX_BIO_BLOCKS(sbi);
do {
- readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT);
+ readed = ra_meta_pages(sbi, start_blk, BIO_MAX_PAGES,
+ META_SIT, true);
start = start_blk * sit_i->sents_per_block;
end = (start_blk + readed) * sit_i->sents_per_block;
for (; start < end && start < MAIN_SEGS(sbi); start++) {
- struct seg_entry *se = &sit_i->sentries[start];
struct f2fs_sit_block *sit_blk;
- struct f2fs_sit_entry sit;
struct page *page;
- mutex_lock(&curseg->curseg_mutex);
- for (i = 0; i < sits_in_cursum(sum); i++) {
- if (le32_to_cpu(segno_in_journal(sum, i))
- == start) {
- sit = sit_in_journal(sum, i);
- mutex_unlock(&curseg->curseg_mutex);
- goto got_it;
- }
- }
- mutex_unlock(&curseg->curseg_mutex);
-
+ se = &sit_i->sentries[start];
page = get_current_sit_page(sbi, start);
sit_blk = (struct f2fs_sit_block *)page_address(page);
sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)];
f2fs_put_page(page, 1);
-got_it:
+
check_block_count(sbi, start, &sit);
seg_info_from_raw_sit(se, &sit);
- if (sbi->segs_per_sec > 1) {
- struct sec_entry *e = get_sec_entry(sbi, start);
- e->valid_blocks += se->valid_blocks;
+
+ /* build discard map only one time */
+ if (f2fs_discard_en(sbi)) {
+ if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
+ memset(se->discard_map, 0xff,
+ SIT_VBLOCK_MAP_SIZE);
+ } else {
+ memcpy(se->discard_map,
+ se->cur_valid_map,
+ SIT_VBLOCK_MAP_SIZE);
+ sbi->discard_blks +=
+ sbi->blocks_per_seg -
+ se->valid_blocks;
+ }
}
+
+ if (sbi->segs_per_sec > 1)
+ get_sec_entry(sbi, start)->valid_blocks +=
+ se->valid_blocks;
}
start_blk += readed;
} while (start_blk < sit_blk_cnt);
+
+ down_read(&curseg->journal_rwsem);
+ for (i = 0; i < sits_in_cursum(journal); i++) {
+ unsigned int old_valid_blocks;
+
+ start = le32_to_cpu(segno_in_journal(journal, i));
+ se = &sit_i->sentries[start];
+ sit = sit_in_journal(journal, i);
+
+ old_valid_blocks = se->valid_blocks;
+
+ check_block_count(sbi, start, &sit);
+ seg_info_from_raw_sit(se, &sit);
+
+ if (f2fs_discard_en(sbi)) {
+ if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
+ memset(se->discard_map, 0xff,
+ SIT_VBLOCK_MAP_SIZE);
+ } else {
+ memcpy(se->discard_map, se->cur_valid_map,
+ SIT_VBLOCK_MAP_SIZE);
+ sbi->discard_blks += old_valid_blocks -
+ se->valid_blocks;
+ }
+ }
+
+ if (sbi->segs_per_sec > 1)
+ get_sec_entry(sbi, start)->valid_blocks +=
+ se->valid_blocks - old_valid_blocks;
+ }
+ up_read(&curseg->journal_rwsem);
}
static void init_free_segmap(struct f2fs_sb_info *sbi)
@@ -1937,6 +3152,9 @@ static void init_free_segmap(struct f2fs_sb_info *sbi)
struct seg_entry *sentry = get_seg_entry(sbi, start);
if (!sentry->valid_blocks)
__set_free(sbi, start);
+ else
+ SIT_I(sbi)->written_valid_blocks +=
+ sentry->valid_blocks;
}
/* set use the current segments */
@@ -1959,7 +3177,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
if (segno >= MAIN_SEGS(sbi))
break;
offset = segno + 1;
- valid_blocks = get_valid_blocks(sbi, segno, 0);
+ valid_blocks = get_valid_blocks(sbi, segno, false);
if (valid_blocks == sbi->blocks_per_seg || !valid_blocks)
continue;
if (valid_blocks > sbi->blocks_per_seg) {
@@ -1977,7 +3195,7 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi)
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
- dirty_i->victim_secmap = kzalloc(bitmap_size, GFP_KERNEL);
+ dirty_i->victim_secmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL);
if (!dirty_i->victim_secmap)
return -ENOMEM;
return 0;
@@ -1999,7 +3217,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi)
bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
for (i = 0; i < NR_DIRTY_TYPE; i++) {
- dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL);
+ dirty_i->dirty_segmap[i] = f2fs_kvzalloc(bitmap_size, GFP_KERNEL);
if (!dirty_i->dirty_segmap[i])
return -ENOMEM;
}
@@ -2058,13 +3276,16 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr);
sm_info->rec_prefree_segments = sm_info->main_segments *
DEF_RECLAIM_PREFREE_SEGMENTS / 100;
- sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC;
+ if (sm_info->rec_prefree_segments > DEF_MAX_RECLAIM_PREFREE_SEGMENTS)
+ sm_info->rec_prefree_segments = DEF_MAX_RECLAIM_PREFREE_SEGMENTS;
+
+ if (!test_opt(sbi, LFS))
+ sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC;
sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS;
+ sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS;
- INIT_LIST_HEAD(&sm_info->discard_list);
- sm_info->nr_discards = 0;
- sm_info->max_discards = 0;
+ sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS;
INIT_LIST_HEAD(&sm_info->sit_entry_set);
@@ -2074,6 +3295,10 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
return err;
}
+ err = create_discard_cmd_control(sbi);
+ if (err)
+ return err;
+
err = build_sit_info(sbi);
if (err)
return err;
@@ -2102,7 +3327,7 @@ static void discard_dirty_segmap(struct f2fs_sb_info *sbi,
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
mutex_lock(&dirty_i->seglist_lock);
- kfree(dirty_i->dirty_segmap[dirty_type]);
+ kvfree(dirty_i->dirty_segmap[dirty_type]);
dirty_i->nr_dirty[dirty_type] = 0;
mutex_unlock(&dirty_i->seglist_lock);
}
@@ -2110,7 +3335,7 @@ static void discard_dirty_segmap(struct f2fs_sb_info *sbi,
static void destroy_victim_secmap(struct f2fs_sb_info *sbi)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
- kfree(dirty_i->victim_secmap);
+ kvfree(dirty_i->victim_secmap);
}
static void destroy_dirty_segmap(struct f2fs_sb_info *sbi)
@@ -2138,8 +3363,10 @@ static void destroy_curseg(struct f2fs_sb_info *sbi)
if (!array)
return;
SM_I(sbi)->curseg_array = NULL;
- for (i = 0; i < NR_CURSEG_TYPE; i++)
+ for (i = 0; i < NR_CURSEG_TYPE; i++) {
kfree(array[i].sum_blk);
+ kfree(array[i].journal);
+ }
kfree(array);
}
@@ -2149,8 +3376,8 @@ static void destroy_free_segmap(struct f2fs_sb_info *sbi)
if (!free_i)
return;
SM_I(sbi)->free_info = NULL;
- kfree(free_i->free_segmap);
- kfree(free_i->free_secmap);
+ kvfree(free_i->free_segmap);
+ kvfree(free_i->free_secmap);
kfree(free_i);
}
@@ -2165,15 +3392,24 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi)
if (sit_i->sentries) {
for (start = 0; start < MAIN_SEGS(sbi); start++) {
kfree(sit_i->sentries[start].cur_valid_map);
+#ifdef CONFIG_F2FS_CHECK_FS
+ kfree(sit_i->sentries[start].cur_valid_map_mir);
+#endif
kfree(sit_i->sentries[start].ckpt_valid_map);
+ kfree(sit_i->sentries[start].discard_map);
}
}
- vfree(sit_i->sentries);
- vfree(sit_i->sec_entries);
- kfree(sit_i->dirty_sentries_bitmap);
+ kfree(sit_i->tmp_map);
+
+ kvfree(sit_i->sentries);
+ kvfree(sit_i->sec_entries);
+ kvfree(sit_i->dirty_sentries_bitmap);
SM_I(sbi)->sit_info = NULL;
kfree(sit_i->sit_bitmap);
+#ifdef CONFIG_F2FS_CHECK_FS
+ kfree(sit_i->sit_bitmap_mir);
+#endif
kfree(sit_i);
}
@@ -2183,7 +3419,8 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi)
if (!sm_info)
return;
- destroy_flush_cmd_control(sbi);
+ destroy_flush_cmd_control(sbi, true);
+ destroy_discard_cmd_control(sbi);
destroy_dirty_segmap(sbi);
destroy_curseg(sbi);
destroy_free_segmap(sbi);
@@ -2199,10 +3436,15 @@ int __init create_segment_manager_caches(void)
if (!discard_entry_slab)
goto fail;
+ discard_cmd_slab = f2fs_kmem_cache_create("discard_cmd",
+ sizeof(struct discard_cmd));
+ if (!discard_cmd_slab)
+ goto destroy_discard_entry;
+
sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set",
- sizeof(struct nat_entry_set));
+ sizeof(struct sit_entry_set));
if (!sit_entry_set_slab)
- goto destory_discard_entry;
+ goto destroy_discard_cmd;
inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry",
sizeof(struct inmem_pages));
@@ -2212,7 +3454,9 @@ int __init create_segment_manager_caches(void)
destroy_sit_entry_set:
kmem_cache_destroy(sit_entry_set_slab);
-destory_discard_entry:
+destroy_discard_cmd:
+ kmem_cache_destroy(discard_cmd_slab);
+destroy_discard_entry:
kmem_cache_destroy(discard_entry_slab);
fail:
return -ENOMEM;
@@ -2221,6 +3465,7 @@ fail:
void destroy_segment_manager_caches(void)
{
kmem_cache_destroy(sit_entry_set_slab);
+ kmem_cache_destroy(discard_cmd_slab);
kmem_cache_destroy(discard_entry_slab);
kmem_cache_destroy(inmem_entry_slab);
}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 2495bec1c621..f488f0e2d529 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -15,80 +15,89 @@
#define NULL_SECNO ((unsigned int)(~0))
#define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */
+#define DEF_MAX_RECLAIM_PREFREE_SEGMENTS 4096 /* 8GB in maximum */
+
+#define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */
/* L: Logical segment # in volume, R: Relative segment # in main area */
-#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno)
-#define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno)
+#define GET_L2R_SEGNO(free_i, segno) ((segno) - (free_i)->start_segno)
+#define GET_R2L_SEGNO(free_i, segno) ((segno) + (free_i)->start_segno)
-#define IS_DATASEG(t) (t <= CURSEG_COLD_DATA)
-#define IS_NODESEG(t) (t >= CURSEG_HOT_NODE)
+#define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA)
+#define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE)
#define IS_CURSEG(sbi, seg) \
- ((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \
- (seg == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \
- (seg == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \
- (seg == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \
- (seg == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \
- (seg == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno))
+ (((seg) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \
+ ((seg) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \
+ ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \
+ ((seg) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \
+ ((seg) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \
+ ((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno))
#define IS_CURSEC(sbi, secno) \
- ((secno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \
- sbi->segs_per_sec) || \
- (secno == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \
- sbi->segs_per_sec) || \
- (secno == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \
- sbi->segs_per_sec) || \
- (secno == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \
- sbi->segs_per_sec) || \
- (secno == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \
- sbi->segs_per_sec) || \
- (secno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \
- sbi->segs_per_sec)) \
+ (((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \
+ (sbi)->segs_per_sec) || \
+ ((secno) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \
+ (sbi)->segs_per_sec) || \
+ ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \
+ (sbi)->segs_per_sec) || \
+ ((secno) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \
+ (sbi)->segs_per_sec) || \
+ ((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \
+ (sbi)->segs_per_sec) || \
+ ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \
+ (sbi)->segs_per_sec)) \
#define MAIN_BLKADDR(sbi) (SM_I(sbi)->main_blkaddr)
#define SEG0_BLKADDR(sbi) (SM_I(sbi)->seg0_blkaddr)
#define MAIN_SEGS(sbi) (SM_I(sbi)->main_segments)
-#define MAIN_SECS(sbi) (sbi->total_sections)
+#define MAIN_SECS(sbi) ((sbi)->total_sections)
#define TOTAL_SEGS(sbi) (SM_I(sbi)->segment_count)
-#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << sbi->log_blocks_per_seg)
+#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << (sbi)->log_blocks_per_seg)
#define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi))
-#define SEGMENT_SIZE(sbi) (1ULL << (sbi->log_blocksize + \
- sbi->log_blocks_per_seg))
+#define SEGMENT_SIZE(sbi) (1ULL << ((sbi)->log_blocksize + \
+ (sbi)->log_blocks_per_seg))
#define START_BLOCK(sbi, segno) (SEG0_BLKADDR(sbi) + \
- (GET_R2L_SEGNO(FREE_I(sbi), segno) << sbi->log_blocks_per_seg))
+ (GET_R2L_SEGNO(FREE_I(sbi), segno) << (sbi)->log_blocks_per_seg))
#define NEXT_FREE_BLKADDR(sbi, curseg) \
- (START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff)
+ (START_BLOCK(sbi, (curseg)->segno) + (curseg)->next_blkoff)
#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) ((blk_addr) - SEG0_BLKADDR(sbi))
#define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \
- (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg)
+ (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> (sbi)->log_blocks_per_seg)
#define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \
- (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & (sbi->blocks_per_seg - 1))
+ (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & ((sbi)->blocks_per_seg - 1))
#define GET_SEGNO(sbi, blk_addr) \
- (((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ? \
+ ((((blk_addr) == NULL_ADDR) || ((blk_addr) == NEW_ADDR)) ? \
NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \
GET_SEGNO_FROM_SEG0(sbi, blk_addr)))
-#define GET_SECNO(sbi, segno) \
- ((segno) / sbi->segs_per_sec)
-#define GET_ZONENO_FROM_SEGNO(sbi, segno) \
- ((segno / sbi->segs_per_sec) / sbi->secs_per_zone)
+#define BLKS_PER_SEC(sbi) \
+ ((sbi)->segs_per_sec * (sbi)->blocks_per_seg)
+#define GET_SEC_FROM_SEG(sbi, segno) \
+ ((segno) / (sbi)->segs_per_sec)
+#define GET_SEG_FROM_SEC(sbi, secno) \
+ ((secno) * (sbi)->segs_per_sec)
+#define GET_ZONE_FROM_SEC(sbi, secno) \
+ ((secno) / (sbi)->secs_per_zone)
+#define GET_ZONE_FROM_SEG(sbi, segno) \
+ GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno))
#define GET_SUM_BLOCK(sbi, segno) \
- ((sbi->sm_info->ssa_blkaddr) + segno)
+ ((sbi)->sm_info->ssa_blkaddr + (segno))
#define GET_SUM_TYPE(footer) ((footer)->entry_type)
-#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = type)
+#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = (type))
#define SIT_ENTRY_OFFSET(sit_i, segno) \
- (segno % sit_i->sents_per_block)
+ ((segno) % (sit_i)->sents_per_block)
#define SIT_BLOCK_OFFSET(segno) \
- (segno / SIT_ENTRY_PER_BLOCK)
+ ((segno) / SIT_ENTRY_PER_BLOCK)
#define START_SEGNO(segno) \
(SIT_BLOCK_OFFSET(segno) * SIT_ENTRY_PER_BLOCK)
#define SIT_BLK_CNT(sbi) \
@@ -99,9 +108,7 @@
#define SECTOR_FROM_BLOCK(blk_addr) \
(((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK)
#define SECTOR_TO_BLOCK(sectors) \
- (sectors >> F2FS_LOG_SECTORS_PER_BLOCK)
-#define MAX_BIO_BLOCKS(sbi) \
- ((int)min((int)max_hw_blocks(sbi), BIO_MAX_PAGES))
+ ((sectors) >> F2FS_LOG_SECTORS_PER_BLOCK)
/*
* indicate a block allocation direction: RIGHT and LEFT.
@@ -130,16 +137,21 @@ enum {
*/
enum {
GC_CB = 0,
- GC_GREEDY
+ GC_GREEDY,
+ ALLOC_NEXT,
+ FLUSH_DEVICE,
+ MAX_GC_POLICY,
};
/*
* BG_GC means the background cleaning job.
* FG_GC means the on-demand cleaning job.
+ * FORCE_FG_GC means on-demand cleaning job in background.
*/
enum {
BG_GC = 0,
- FG_GC
+ FG_GC,
+ FORCE_FG_GC,
};
/* for a function parameter to select a victim segment */
@@ -155,15 +167,20 @@ struct victim_sel_policy {
};
struct seg_entry {
- unsigned short valid_blocks; /* # of valid blocks */
+ unsigned int type:6; /* segment type like CURSEG_XXX_TYPE */
+ unsigned int valid_blocks:10; /* # of valid blocks */
+ unsigned int ckpt_valid_blocks:10; /* # of valid blocks last cp */
+ unsigned int padding:6; /* padding */
unsigned char *cur_valid_map; /* validity bitmap of blocks */
+#ifdef CONFIG_F2FS_CHECK_FS
+ unsigned char *cur_valid_map_mir; /* mirror of current valid bitmap */
+#endif
/*
* # of valid blocks and the validity bitmap stored in the the last
* checkpoint pack. This information is used by the SSR mode.
*/
- unsigned short ckpt_valid_blocks;
- unsigned char *ckpt_valid_map;
- unsigned char type; /* segment type like CURSEG_XXX_TYPE */
+ unsigned char *ckpt_valid_map; /* validity bitmap of blocks last cp */
+ unsigned char *discard_map;
unsigned long long mtime; /* modification time of the segment */
};
@@ -175,9 +192,22 @@ struct segment_allocation {
void (*allocate_segment)(struct f2fs_sb_info *, int, bool);
};
+/*
+ * this value is set in page as a private data which indicate that
+ * the page is atomically written, and it is in inmem_pages list.
+ */
+#define ATOMIC_WRITTEN_PAGE ((unsigned long)-1)
+#define DUMMY_WRITTEN_PAGE ((unsigned long)-2)
+
+#define IS_ATOMIC_WRITTEN_PAGE(page) \
+ (page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE)
+#define IS_DUMMY_WRITTEN_PAGE(page) \
+ (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE)
+
struct inmem_pages {
struct list_head list;
struct page *page;
+ block_t old_addr; /* for revoking when fail to commit */
};
struct sit_info {
@@ -187,8 +217,12 @@ struct sit_info {
block_t sit_blocks; /* # of blocks used by SIT area */
block_t written_valid_blocks; /* # of valid blocks in main area */
char *sit_bitmap; /* SIT bitmap pointer */
+#ifdef CONFIG_F2FS_CHECK_FS
+ char *sit_bitmap_mir; /* SIT bitmap mirror */
+#endif
unsigned int bitmap_size; /* SIT bitmap size */
+ unsigned long *tmp_map; /* bitmap for temporal use */
unsigned long *dirty_sentries_bitmap; /* bitmap for dirty sentries */
unsigned int dirty_sentries; /* # of dirty sentries */
unsigned int sents_per_block; /* # of SIT entries per block */
@@ -201,13 +235,15 @@ struct sit_info {
unsigned long long mounted_time; /* mount time */
unsigned long long min_mtime; /* min. modification time */
unsigned long long max_mtime; /* max. modification time */
+
+ unsigned int last_victim[MAX_GC_POLICY]; /* last victim segment # */
};
struct free_segmap_info {
unsigned int start_segno; /* start segment number logically */
unsigned int free_segments; /* # of free segments */
unsigned int free_sections; /* # of free sections */
- rwlock_t segmap_lock; /* free segmap lock */
+ spinlock_t segmap_lock; /* free segmap lock */
unsigned long *free_segmap; /* free segment bitmap */
unsigned long *free_secmap; /* free section bitmap */
};
@@ -243,6 +279,8 @@ struct victim_selection {
struct curseg_info {
struct mutex curseg_mutex; /* lock for consistency */
struct f2fs_summary_block *sum_blk; /* cached summary block */
+ struct rw_semaphore journal_rwsem; /* protect journal area */
+ struct f2fs_journal *journal; /* cached journal info */
unsigned char alloc_type; /* current allocation type */
unsigned int segno; /* current segment number */
unsigned short next_blkoff; /* next block offset to write */
@@ -275,17 +313,17 @@ static inline struct sec_entry *get_sec_entry(struct f2fs_sb_info *sbi,
unsigned int segno)
{
struct sit_info *sit_i = SIT_I(sbi);
- return &sit_i->sec_entries[GET_SECNO(sbi, segno)];
+ return &sit_i->sec_entries[GET_SEC_FROM_SEG(sbi, segno)];
}
static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi,
- unsigned int segno, int section)
+ unsigned int segno, bool use_section)
{
/*
* In order to get # of valid blocks in a section instantly from many
* segments, f2fs manages two counting structures separately.
*/
- if (section > 1)
+ if (use_section && sbi->segs_per_sec > 1)
return get_sec_entry(sbi, segno)->valid_blocks;
else
return get_seg_entry(sbi, segno)->valid_blocks;
@@ -298,6 +336,9 @@ static inline void seg_info_from_raw_sit(struct seg_entry *se,
se->ckpt_valid_blocks = GET_SIT_VBLOCKS(rs);
memcpy(se->cur_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE);
memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE);
+#ifdef CONFIG_F2FS_CHECK_FS
+ memcpy(se->cur_valid_map_mir, rs->valid_map, SIT_VBLOCK_MAP_SIZE);
+#endif
se->type = GET_SIT_TYPE(rs);
se->mtime = le64_to_cpu(rs->mtime);
}
@@ -318,36 +359,38 @@ static inline unsigned int find_next_inuse(struct free_segmap_info *free_i,
unsigned int max, unsigned int segno)
{
unsigned int ret;
- read_lock(&free_i->segmap_lock);
+ spin_lock(&free_i->segmap_lock);
ret = find_next_bit(free_i->free_segmap, max, segno);
- read_unlock(&free_i->segmap_lock);
+ spin_unlock(&free_i->segmap_lock);
return ret;
}
static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
{
struct free_segmap_info *free_i = FREE_I(sbi);
- unsigned int secno = segno / sbi->segs_per_sec;
- unsigned int start_segno = secno * sbi->segs_per_sec;
+ unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
+ unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno);
unsigned int next;
- write_lock(&free_i->segmap_lock);
+ spin_lock(&free_i->segmap_lock);
clear_bit(segno, free_i->free_segmap);
free_i->free_segments++;
- next = find_next_bit(free_i->free_segmap, MAIN_SEGS(sbi), start_segno);
+ next = find_next_bit(free_i->free_segmap,
+ start_segno + sbi->segs_per_sec, start_segno);
if (next >= start_segno + sbi->segs_per_sec) {
clear_bit(secno, free_i->free_secmap);
free_i->free_sections++;
}
- write_unlock(&free_i->segmap_lock);
+ spin_unlock(&free_i->segmap_lock);
}
static inline void __set_inuse(struct f2fs_sb_info *sbi,
unsigned int segno)
{
struct free_segmap_info *free_i = FREE_I(sbi);
- unsigned int secno = segno / sbi->segs_per_sec;
+ unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
+
set_bit(segno, free_i->free_segmap);
free_i->free_segments--;
if (!test_and_set_bit(secno, free_i->free_secmap))
@@ -358,11 +401,11 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi,
unsigned int segno)
{
struct free_segmap_info *free_i = FREE_I(sbi);
- unsigned int secno = segno / sbi->segs_per_sec;
- unsigned int start_segno = secno * sbi->segs_per_sec;
+ unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
+ unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno);
unsigned int next;
- write_lock(&free_i->segmap_lock);
+ spin_lock(&free_i->segmap_lock);
if (test_and_clear_bit(segno, free_i->free_segmap)) {
free_i->free_segments++;
@@ -373,27 +416,34 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi,
free_i->free_sections++;
}
}
- write_unlock(&free_i->segmap_lock);
+ spin_unlock(&free_i->segmap_lock);
}
static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi,
unsigned int segno)
{
struct free_segmap_info *free_i = FREE_I(sbi);
- unsigned int secno = segno / sbi->segs_per_sec;
- write_lock(&free_i->segmap_lock);
+ unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
+
+ spin_lock(&free_i->segmap_lock);
if (!test_and_set_bit(segno, free_i->free_segmap)) {
free_i->free_segments--;
if (!test_and_set_bit(secno, free_i->free_secmap))
free_i->free_sections--;
}
- write_unlock(&free_i->segmap_lock);
+ spin_unlock(&free_i->segmap_lock);
}
static inline void get_sit_bitmap(struct f2fs_sb_info *sbi,
void *dst_addr)
{
struct sit_info *sit_i = SIT_I(sbi);
+
+#ifdef CONFIG_F2FS_CHECK_FS
+ if (memcmp(sit_i->sit_bitmap, sit_i->sit_bitmap_mir,
+ sit_i->bitmap_size))
+ f2fs_bug_on(sbi, 1);
+#endif
memcpy(dst_addr, sit_i->sit_bitmap, sit_i->bitmap_size);
}
@@ -439,32 +489,40 @@ static inline int overprovision_segments(struct f2fs_sb_info *sbi)
static inline int overprovision_sections(struct f2fs_sb_info *sbi)
{
- return ((unsigned int) overprovision_segments(sbi)) / sbi->segs_per_sec;
+ return GET_SEC_FROM_SEG(sbi, (unsigned int)overprovision_segments(sbi));
}
static inline int reserved_sections(struct f2fs_sb_info *sbi)
{
- return ((unsigned int) reserved_segments(sbi)) / sbi->segs_per_sec;
+ return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi));
}
static inline bool need_SSR(struct f2fs_sb_info *sbi)
{
int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
- return free_sections(sbi) <= (node_secs + 2 * dent_secs +
- reserved_sections(sbi) + 1);
+ int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA);
+
+ if (test_opt(sbi, LFS))
+ return false;
+
+ return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs +
+ 2 * reserved_sections(sbi));
}
-static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
+static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi,
+ int freed, int needed)
{
int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
+ int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA);
- if (unlikely(sbi->por_doing))
+ if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
return false;
- return (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs +
- reserved_sections(sbi));
+ return (free_sections(sbi) + freed) <=
+ (node_secs + 2 * dent_secs + imeta_secs +
+ reserved_sections(sbi) + needed);
}
static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi)
@@ -494,6 +552,7 @@ static inline int utilization(struct f2fs_sb_info *sbi)
*/
#define DEF_MIN_IPU_UTIL 70
#define DEF_MIN_FSYNC_BLOCKS 8
+#define DEF_MIN_HOT_BLOCKS 16
enum {
F2FS_IPU_FORCE,
@@ -501,15 +560,16 @@ enum {
F2FS_IPU_UTIL,
F2FS_IPU_SSR_UTIL,
F2FS_IPU_FSYNC,
+ F2FS_IPU_ASYNC,
};
-static inline bool need_inplace_update(struct inode *inode)
+static inline bool need_inplace_update_policy(struct inode *inode,
+ struct f2fs_io_info *fio)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
unsigned int policy = SM_I(sbi)->ipu_policy;
- /* IPU can be done only for the user data */
- if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode))
+ if (test_opt(sbi, LFS))
return false;
if (policy & (0x1 << F2FS_IPU_FORCE))
@@ -523,9 +583,18 @@ static inline bool need_inplace_update(struct inode *inode)
utilization(sbi) > SM_I(sbi)->min_ipu_util)
return true;
+ /*
+ * IPU for rewrite async pages
+ */
+ if (policy & (0x1 << F2FS_IPU_ASYNC) &&
+ fio && fio->op == REQ_OP_WRITE &&
+ !(fio->op_flags & REQ_SYNC) &&
+ !f2fs_encrypted_inode(inode))
+ return true;
+
/* this is only set during fdatasync */
if (policy & (0x1 << F2FS_IPU_FSYNC) &&
- is_inode_flag_set(F2FS_I(inode), FI_NEED_IPU))
+ is_inode_flag_set(inode, FI_NEED_IPU))
return true;
return false;
@@ -551,16 +620,15 @@ static inline unsigned short curseg_blkoff(struct f2fs_sb_info *sbi, int type)
return curseg->next_blkoff;
}
-#ifdef CONFIG_F2FS_CHECK_FS
static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
{
- BUG_ON(segno > TOTAL_SEGS(sbi) - 1);
+ f2fs_bug_on(sbi, segno > TOTAL_SEGS(sbi) - 1);
}
static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
{
- BUG_ON(blk_addr < SEG0_BLKADDR(sbi));
- BUG_ON(blk_addr >= MAX_BLKADDR(sbi));
+ BUG_ON(blk_addr < SEG0_BLKADDR(sbi)
+ || blk_addr >= MAX_BLKADDR(sbi));
}
/*
@@ -569,16 +637,11 @@ static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
static inline void check_block_count(struct f2fs_sb_info *sbi,
int segno, struct f2fs_sit_entry *raw_sit)
{
+#ifdef CONFIG_F2FS_CHECK_FS
bool is_valid = test_bit_le(0, raw_sit->valid_map) ? true : false;
int valid_blocks = 0;
int cur_pos = 0, next_pos;
- /* check segment usage */
- BUG_ON(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg);
-
- /* check boundary of a given segment number */
- BUG_ON(segno > TOTAL_SEGS(sbi) - 1);
-
/* check bitmap with valid block count */
do {
if (is_valid) {
@@ -594,35 +657,11 @@ static inline void check_block_count(struct f2fs_sb_info *sbi,
is_valid = !is_valid;
} while (cur_pos < sbi->blocks_per_seg);
BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks);
-}
-#else
-static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
-{
- if (segno > TOTAL_SEGS(sbi) - 1)
- sbi->need_fsck = true;
-}
-
-static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
-{
- if (blk_addr < SEG0_BLKADDR(sbi) || blk_addr >= MAX_BLKADDR(sbi))
- sbi->need_fsck = true;
-}
-
-/*
- * Summary block is always treated as an invalid block
- */
-static inline void check_block_count(struct f2fs_sb_info *sbi,
- int segno, struct f2fs_sit_entry *raw_sit)
-{
- /* check segment usage */
- if (GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg)
- sbi->need_fsck = true;
-
- /* check boundary of a given segment number */
- if (segno > TOTAL_SEGS(sbi) - 1)
- sbi->need_fsck = true;
-}
#endif
+ /* check segment usage, and check boundary of a given segment number */
+ f2fs_bug_on(sbi, GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg
+ || segno > TOTAL_SEGS(sbi) - 1);
+}
static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi,
unsigned int start)
@@ -633,6 +672,12 @@ static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi,
check_seg_range(sbi, start);
+#ifdef CONFIG_F2FS_CHECK_FS
+ if (f2fs_test_bit(offset, sit_i->sit_bitmap) !=
+ f2fs_test_bit(offset, sit_i->sit_bitmap_mir))
+ f2fs_bug_on(sbi, 1);
+#endif
+
/* calculate sit block address */
if (f2fs_test_bit(offset, sit_i->sit_bitmap))
blk_addr += sit_i->sit_blocks;
@@ -657,10 +702,10 @@ static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start)
{
unsigned int block_off = SIT_BLOCK_OFFSET(start);
- if (f2fs_test_bit(block_off, sit_i->sit_bitmap))
- f2fs_clear_bit(block_off, sit_i->sit_bitmap);
- else
- f2fs_set_bit(block_off, sit_i->sit_bitmap);
+ f2fs_change_bit(block_off, sit_i->sit_bitmap);
+#ifdef CONFIG_F2FS_CHECK_FS
+ f2fs_change_bit(block_off, sit_i->sit_bitmap_mir);
+#endif
}
static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi)
@@ -691,35 +736,40 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type)
- (base + 1) + type;
}
-static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno)
+static inline bool no_fggc_candidate(struct f2fs_sb_info *sbi,
+ unsigned int secno)
{
- if (IS_CURSEC(sbi, secno) || (sbi->cur_victim_sec == secno))
+ if (get_valid_blocks(sbi, GET_SEG_FROM_SEC(sbi, secno), true) >=
+ sbi->fggc_threshold)
return true;
return false;
}
-static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi)
+static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno)
{
- struct block_device *bdev = sbi->sb->s_bdev;
- struct request_queue *q = bdev_get_queue(bdev);
- return SECTOR_TO_BLOCK(queue_max_sectors(q));
+ if (IS_CURSEC(sbi, secno) || (sbi->cur_victim_sec == secno))
+ return true;
+ return false;
}
/*
* It is very important to gather dirty pages and write at once, so that we can
* submit a big bio without interfering other data writes.
* By default, 512 pages for directory data,
- * 512 pages (2MB) * 3 for three types of nodes, and
- * max_bio_blocks for meta are set.
+ * 512 pages (2MB) * 8 for nodes, and
+ * 256 pages * 8 for meta are set.
*/
static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
{
+ if (sbi->sb->s_bdi->dirty_exceeded)
+ return 0;
+
if (type == DATA)
return sbi->blocks_per_seg;
else if (type == NODE)
- return 3 * sbi->blocks_per_seg;
+ return 8 * sbi->blocks_per_seg;
else if (type == META)
- return MAX_BIO_BLOCKS(sbi);
+ return 8 * BIO_MAX_PAGES;
else
return 0;
}
@@ -736,13 +786,9 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type,
return 0;
nr_to_write = wbc->nr_to_write;
-
- if (type == DATA)
- desired = 4096;
- else if (type == NODE)
- desired = 3 * max_hw_blocks(sbi);
- else
- desired = MAX_BIO_BLOCKS(sbi);
+ desired = BIO_MAX_PAGES;
+ if (type == NODE)
+ desired <<= 1;
wbc->nr_to_write = desired;
return desired - nr_to_write;
diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c
new file mode 100644
index 000000000000..5c60fc28ec75
--- /dev/null
+++ b/fs/f2fs/shrinker.c
@@ -0,0 +1,143 @@
+/*
+ * f2fs shrinker support
+ * the basic infra was copied from fs/ubifs/shrinker.c
+ *
+ * Copyright (c) 2015 Motorola Mobility
+ * Copyright (c) 2015 Jaegeuk Kim <jaegeuk@kernel.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+
+#include "f2fs.h"
+#include "node.h"
+
+static LIST_HEAD(f2fs_list);
+static DEFINE_SPINLOCK(f2fs_list_lock);
+static unsigned int shrinker_run_no;
+
+static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi)
+{
+ long count = NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt;
+
+ return count > 0 ? count : 0;
+}
+
+static unsigned long __count_free_nids(struct f2fs_sb_info *sbi)
+{
+ long count = NM_I(sbi)->nid_cnt[FREE_NID_LIST] - MAX_FREE_NIDS;
+
+ return count > 0 ? count : 0;
+}
+
+static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi)
+{
+ return atomic_read(&sbi->total_zombie_tree) +
+ atomic_read(&sbi->total_ext_node);
+}
+
+unsigned long f2fs_shrink_count(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct f2fs_sb_info *sbi;
+ struct list_head *p;
+ unsigned long count = 0;
+
+ spin_lock(&f2fs_list_lock);
+ p = f2fs_list.next;
+ while (p != &f2fs_list) {
+ sbi = list_entry(p, struct f2fs_sb_info, s_list);
+
+ /* stop f2fs_put_super */
+ if (!mutex_trylock(&sbi->umount_mutex)) {
+ p = p->next;
+ continue;
+ }
+ spin_unlock(&f2fs_list_lock);
+
+ /* count extent cache entries */
+ count += __count_extent_cache(sbi);
+
+ /* shrink clean nat cache entries */
+ count += __count_nat_entries(sbi);
+
+ /* count free nids cache entries */
+ count += __count_free_nids(sbi);
+
+ spin_lock(&f2fs_list_lock);
+ p = p->next;
+ mutex_unlock(&sbi->umount_mutex);
+ }
+ spin_unlock(&f2fs_list_lock);
+ return count;
+}
+
+unsigned long f2fs_shrink_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ unsigned long nr = sc->nr_to_scan;
+ struct f2fs_sb_info *sbi;
+ struct list_head *p;
+ unsigned int run_no;
+ unsigned long freed = 0;
+
+ spin_lock(&f2fs_list_lock);
+ do {
+ run_no = ++shrinker_run_no;
+ } while (run_no == 0);
+ p = f2fs_list.next;
+ while (p != &f2fs_list) {
+ sbi = list_entry(p, struct f2fs_sb_info, s_list);
+
+ if (sbi->shrinker_run_no == run_no)
+ break;
+
+ /* stop f2fs_put_super */
+ if (!mutex_trylock(&sbi->umount_mutex)) {
+ p = p->next;
+ continue;
+ }
+ spin_unlock(&f2fs_list_lock);
+
+ sbi->shrinker_run_no = run_no;
+
+ /* shrink extent cache entries */
+ freed += f2fs_shrink_extent_tree(sbi, nr >> 1);
+
+ /* shrink clean nat cache entries */
+ if (freed < nr)
+ freed += try_to_free_nats(sbi, nr - freed);
+
+ /* shrink free nids cache entries */
+ if (freed < nr)
+ freed += try_to_free_nids(sbi, nr - freed);
+
+ spin_lock(&f2fs_list_lock);
+ p = p->next;
+ list_move_tail(&sbi->s_list, &f2fs_list);
+ mutex_unlock(&sbi->umount_mutex);
+ if (freed >= nr)
+ break;
+ }
+ spin_unlock(&f2fs_list_lock);
+ return freed;
+}
+
+void f2fs_join_shrinker(struct f2fs_sb_info *sbi)
+{
+ spin_lock(&f2fs_list_lock);
+ list_add_tail(&sbi->s_list, &f2fs_list);
+ spin_unlock(&f2fs_list_lock);
+}
+
+void f2fs_leave_shrinker(struct f2fs_sb_info *sbi)
+{
+ f2fs_shrink_extent_tree(sbi, __count_extent_cache(sbi));
+
+ spin_lock(&f2fs_list_lock);
+ list_del(&sbi->s_list);
+ spin_unlock(&f2fs_list_lock);
+}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 341466bcf180..6afc50c19163 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -30,6 +30,7 @@
#include "segment.h"
#include "xattr.h"
#include "gc.h"
+#include "trace.h"
#define CREATE_TRACE_POINTS
#include <trace/events/f2fs.h>
@@ -38,11 +39,51 @@ static struct proc_dir_entry *f2fs_proc_root;
static struct kmem_cache *f2fs_inode_cachep;
static struct kset *f2fs_kset;
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+
+char *fault_name[FAULT_MAX] = {
+ [FAULT_KMALLOC] = "kmalloc",
+ [FAULT_PAGE_ALLOC] = "page alloc",
+ [FAULT_ALLOC_NID] = "alloc nid",
+ [FAULT_ORPHAN] = "orphan",
+ [FAULT_BLOCK] = "no more block",
+ [FAULT_DIR_DEPTH] = "too big dir depth",
+ [FAULT_EVICT_INODE] = "evict_inode fail",
+ [FAULT_TRUNCATE] = "truncate fail",
+ [FAULT_IO] = "IO error",
+ [FAULT_CHECKPOINT] = "checkpoint error",
+};
+
+static void f2fs_build_fault_attr(struct f2fs_sb_info *sbi,
+ unsigned int rate)
+{
+ struct f2fs_fault_info *ffi = &sbi->fault_info;
+
+ if (rate) {
+ atomic_set(&ffi->inject_ops, 0);
+ ffi->inject_rate = rate;
+ ffi->inject_type = (1 << FAULT_MAX) - 1;
+ } else {
+ memset(ffi, 0, sizeof(struct f2fs_fault_info));
+ }
+}
+#endif
+
+/* f2fs-wide shrinker description */
+static struct shrinker f2fs_shrinker_info = {
+ .scan_objects = f2fs_shrink_scan,
+ .count_objects = f2fs_shrink_count,
+ .seeks = DEFAULT_SEEKS,
+};
+
enum {
Opt_gc_background,
Opt_disable_roll_forward,
+ Opt_norecovery,
Opt_discard,
+ Opt_nodiscard,
Opt_noheap,
+ Opt_heap,
Opt_user_xattr,
Opt_nouser_xattr,
Opt_acl,
@@ -50,17 +91,32 @@ enum {
Opt_active_logs,
Opt_disable_ext_identify,
Opt_inline_xattr,
+ Opt_noinline_xattr,
Opt_inline_data,
+ Opt_inline_dentry,
+ Opt_noinline_dentry,
Opt_flush_merge,
+ Opt_noflush_merge,
Opt_nobarrier,
+ Opt_fastboot,
+ Opt_extent_cache,
+ Opt_noextent_cache,
+ Opt_noinline_data,
+ Opt_data_flush,
+ Opt_mode,
+ Opt_io_size_bits,
+ Opt_fault_injection,
Opt_err,
};
static match_table_t f2fs_tokens = {
{Opt_gc_background, "background_gc=%s"},
{Opt_disable_roll_forward, "disable_roll_forward"},
+ {Opt_norecovery, "norecovery"},
{Opt_discard, "discard"},
+ {Opt_nodiscard, "nodiscard"},
{Opt_noheap, "no_heap"},
+ {Opt_heap, "heap"},
{Opt_user_xattr, "user_xattr"},
{Opt_nouser_xattr, "nouser_xattr"},
{Opt_acl, "acl"},
@@ -68,9 +124,21 @@ static match_table_t f2fs_tokens = {
{Opt_active_logs, "active_logs=%u"},
{Opt_disable_ext_identify, "disable_ext_identify"},
{Opt_inline_xattr, "inline_xattr"},
+ {Opt_noinline_xattr, "noinline_xattr"},
{Opt_inline_data, "inline_data"},
+ {Opt_inline_dentry, "inline_dentry"},
+ {Opt_noinline_dentry, "noinline_dentry"},
{Opt_flush_merge, "flush_merge"},
+ {Opt_noflush_merge, "noflush_merge"},
{Opt_nobarrier, "nobarrier"},
+ {Opt_fastboot, "fastboot"},
+ {Opt_extent_cache, "extent_cache"},
+ {Opt_noextent_cache, "noextent_cache"},
+ {Opt_noinline_data, "noinline_data"},
+ {Opt_data_flush, "data_flush"},
+ {Opt_mode, "mode=%s"},
+ {Opt_io_size_bits, "io_bits=%u"},
+ {Opt_fault_injection, "fault_injection=%u"},
{Opt_err, NULL},
};
@@ -78,8 +146,13 @@ static match_table_t f2fs_tokens = {
enum {
GC_THREAD, /* struct f2fs_gc_thread */
SM_INFO, /* struct f2fs_sm_info */
+ DCC_INFO, /* struct discard_cmd_control */
NM_INFO, /* struct f2fs_nm_info */
F2FS_SBI, /* struct f2fs_sb_info */
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ FAULT_INFO_RATE, /* struct f2fs_fault_info */
+ FAULT_INFO_TYPE, /* struct f2fs_fault_info */
+#endif
};
struct f2fs_attr {
@@ -97,13 +170,33 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
return (unsigned char *)sbi->gc_thread;
else if (struct_type == SM_INFO)
return (unsigned char *)SM_I(sbi);
+ else if (struct_type == DCC_INFO)
+ return (unsigned char *)SM_I(sbi)->dcc_info;
else if (struct_type == NM_INFO)
return (unsigned char *)NM_I(sbi);
else if (struct_type == F2FS_SBI)
return (unsigned char *)sbi;
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ else if (struct_type == FAULT_INFO_RATE ||
+ struct_type == FAULT_INFO_TYPE)
+ return (unsigned char *)&sbi->fault_info;
+#endif
return NULL;
}
+static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ struct super_block *sb = sbi->sb;
+
+ if (!sb->s_bdev->bd_part)
+ return snprintf(buf, PAGE_SIZE, "0\n");
+
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (unsigned long long)(sbi->kbytes_written +
+ BD_PART_WRITTEN(sbi)));
+}
+
static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
@@ -137,6 +230,10 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a,
ret = kstrtoul(skip_spaces(buf), 0, &t);
if (ret < 0)
return ret;
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX))
+ return -EINVAL;
+#endif
*ui = t;
return count;
}
@@ -182,18 +279,32 @@ static struct f2fs_attr f2fs_attr_##_name = { \
f2fs_sbi_show, f2fs_sbi_store, \
offsetof(struct struct_name, elname))
+#define F2FS_GENERAL_RO_ATTR(name) \
+static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL)
+
F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time);
F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time);
F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time);
F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
-F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards);
+F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards);
+F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks);
+F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_hot_blocks, min_hot_blocks);
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
+F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages);
+F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]);
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate);
+F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type);
+#endif
+F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes);
#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
static struct attribute *f2fs_attrs[] = {
@@ -203,12 +314,23 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(gc_idle),
ATTR_LIST(reclaim_segments),
ATTR_LIST(max_small_discards),
+ ATTR_LIST(batched_trim_sections),
ATTR_LIST(ipu_policy),
ATTR_LIST(min_ipu_util),
ATTR_LIST(min_fsync_blocks),
+ ATTR_LIST(min_hot_blocks),
ATTR_LIST(max_victim_search),
ATTR_LIST(dir_level),
ATTR_LIST(ram_thresh),
+ ATTR_LIST(ra_nid_pages),
+ ATTR_LIST(dirty_nats_ratio),
+ ATTR_LIST(cp_interval),
+ ATTR_LIST(idle_interval),
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ ATTR_LIST(inject_rate),
+ ATTR_LIST(inject_type),
+#endif
+ ATTR_LIST(lifetime_write_kbytes),
NULL,
};
@@ -245,6 +367,7 @@ static void init_once(void *foo)
static int parse_options(struct super_block *sb, char *options)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct request_queue *q;
substring_t args[MAX_OPT_ARGS];
char *p, *name;
int arg = 0;
@@ -269,11 +392,16 @@ static int parse_options(struct super_block *sb, char *options)
if (!name)
return -ENOMEM;
- if (strlen(name) == 2 && !strncmp(name, "on", 2))
+ if (strlen(name) == 2 && !strncmp(name, "on", 2)) {
set_opt(sbi, BG_GC);
- else if (strlen(name) == 3 && !strncmp(name, "off", 3))
+ clear_opt(sbi, FORCE_FG_GC);
+ } else if (strlen(name) == 3 && !strncmp(name, "off", 3)) {
clear_opt(sbi, BG_GC);
- else {
+ clear_opt(sbi, FORCE_FG_GC);
+ } else if (strlen(name) == 4 && !strncmp(name, "sync", 4)) {
+ set_opt(sbi, BG_GC);
+ set_opt(sbi, FORCE_FG_GC);
+ } else {
kfree(name);
return -EINVAL;
}
@@ -282,12 +410,36 @@ static int parse_options(struct super_block *sb, char *options)
case Opt_disable_roll_forward:
set_opt(sbi, DISABLE_ROLL_FORWARD);
break;
+ case Opt_norecovery:
+ /* this option mounts f2fs with ro */
+ set_opt(sbi, DISABLE_ROLL_FORWARD);
+ if (!f2fs_readonly(sb))
+ return -EINVAL;
+ break;
case Opt_discard:
- set_opt(sbi, DISCARD);
+ q = bdev_get_queue(sb->s_bdev);
+ if (blk_queue_discard(q)) {
+ set_opt(sbi, DISCARD);
+ } else if (!f2fs_sb_mounted_blkzoned(sb)) {
+ f2fs_msg(sb, KERN_WARNING,
+ "mounting with \"discard\" option, but "
+ "the device does not support discard");
+ }
+ break;
+ case Opt_nodiscard:
+ if (f2fs_sb_mounted_blkzoned(sb)) {
+ f2fs_msg(sb, KERN_WARNING,
+ "discard is required for zoned block devices");
+ return -EINVAL;
+ }
+ clear_opt(sbi, DISCARD);
break;
case Opt_noheap:
set_opt(sbi, NOHEAP);
break;
+ case Opt_heap:
+ clear_opt(sbi, NOHEAP);
+ break;
#ifdef CONFIG_F2FS_FS_XATTR
case Opt_user_xattr:
set_opt(sbi, XATTR_USER);
@@ -298,6 +450,9 @@ static int parse_options(struct super_block *sb, char *options)
case Opt_inline_xattr:
set_opt(sbi, INLINE_XATTR);
break;
+ case Opt_noinline_xattr:
+ clear_opt(sbi, INLINE_XATTR);
+ break;
#else
case Opt_user_xattr:
f2fs_msg(sb, KERN_INFO,
@@ -311,6 +466,10 @@ static int parse_options(struct super_block *sb, char *options)
f2fs_msg(sb, KERN_INFO,
"inline_xattr options not supported");
break;
+ case Opt_noinline_xattr:
+ f2fs_msg(sb, KERN_INFO,
+ "noinline_xattr options not supported");
+ break;
#endif
#ifdef CONFIG_F2FS_FS_POSIX_ACL
case Opt_acl:
@@ -340,12 +499,82 @@ static int parse_options(struct super_block *sb, char *options)
case Opt_inline_data:
set_opt(sbi, INLINE_DATA);
break;
+ case Opt_inline_dentry:
+ set_opt(sbi, INLINE_DENTRY);
+ break;
+ case Opt_noinline_dentry:
+ clear_opt(sbi, INLINE_DENTRY);
+ break;
case Opt_flush_merge:
set_opt(sbi, FLUSH_MERGE);
break;
+ case Opt_noflush_merge:
+ clear_opt(sbi, FLUSH_MERGE);
+ break;
case Opt_nobarrier:
set_opt(sbi, NOBARRIER);
break;
+ case Opt_fastboot:
+ set_opt(sbi, FASTBOOT);
+ break;
+ case Opt_extent_cache:
+ set_opt(sbi, EXTENT_CACHE);
+ break;
+ case Opt_noextent_cache:
+ clear_opt(sbi, EXTENT_CACHE);
+ break;
+ case Opt_noinline_data:
+ clear_opt(sbi, INLINE_DATA);
+ break;
+ case Opt_data_flush:
+ set_opt(sbi, DATA_FLUSH);
+ break;
+ case Opt_mode:
+ name = match_strdup(&args[0]);
+
+ if (!name)
+ return -ENOMEM;
+ if (strlen(name) == 8 &&
+ !strncmp(name, "adaptive", 8)) {
+ if (f2fs_sb_mounted_blkzoned(sb)) {
+ f2fs_msg(sb, KERN_WARNING,
+ "adaptive mode is not allowed with "
+ "zoned block device feature");
+ kfree(name);
+ return -EINVAL;
+ }
+ set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE);
+ } else if (strlen(name) == 3 &&
+ !strncmp(name, "lfs", 3)) {
+ set_opt_mode(sbi, F2FS_MOUNT_LFS);
+ } else {
+ kfree(name);
+ return -EINVAL;
+ }
+ kfree(name);
+ break;
+ case Opt_io_size_bits:
+ if (args->from && match_int(args, &arg))
+ return -EINVAL;
+ if (arg > __ilog2_u32(BIO_MAX_PAGES)) {
+ f2fs_msg(sb, KERN_WARNING,
+ "Not support %d, larger than %d",
+ 1 << arg, BIO_MAX_PAGES);
+ return -EINVAL;
+ }
+ sbi->write_io_size_bits = arg;
+ break;
+ case Opt_fault_injection:
+ if (args->from && match_int(args, &arg))
+ return -EINVAL;
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ f2fs_build_fault_attr(sbi, arg);
+ set_opt(sbi, FAULT_INJECTION);
+#else
+ f2fs_msg(sb, KERN_INFO,
+ "FAULT_INJECTION was not selected");
+#endif
+ break;
default:
f2fs_msg(sb, KERN_ERR,
"Unrecognized mount option \"%s\" or missing value",
@@ -353,6 +582,13 @@ static int parse_options(struct super_block *sb, char *options)
return -EINVAL;
}
}
+
+ if (F2FS_IO_SIZE_BITS(sbi) && !test_opt(sbi, LFS)) {
+ f2fs_msg(sb, KERN_ERR,
+ "Should set mode=lfs with %uKB-sized IO",
+ F2FS_IO_SIZE_KB(sbi));
+ return -EINVAL;
+ }
return 0;
}
@@ -371,24 +607,22 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
atomic_set(&fi->dirty_pages, 0);
fi->i_current_depth = 1;
fi->i_advise = 0;
- rwlock_init(&fi->ext.ext_lock);
init_rwsem(&fi->i_sem);
+ INIT_LIST_HEAD(&fi->dirty_list);
+ INIT_LIST_HEAD(&fi->gdirty_list);
INIT_LIST_HEAD(&fi->inmem_pages);
mutex_init(&fi->inmem_lock);
-
- set_inode_flag(fi, FI_NEW_INODE);
-
- if (test_opt(F2FS_SB(sb), INLINE_XATTR))
- set_inode_flag(fi, FI_INLINE_XATTR);
+ init_rwsem(&fi->dio_rwsem[READ]);
+ init_rwsem(&fi->dio_rwsem[WRITE]);
/* Will be used by directory only */
fi->i_dir_level = F2FS_SB(sb)->dir_level;
-
return &fi->vfs_inode;
}
static int f2fs_drop_inode(struct inode *inode)
{
+ int ret;
/*
* This is to avoid a deadlock condition like below.
* writeback_single_inode(inode)
@@ -396,9 +630,77 @@ static int f2fs_drop_inode(struct inode *inode)
* - f2fs_gc -> iput -> evict
* - inode_wait_for_writeback(inode)
*/
- if (!inode_unhashed(inode) && inode->i_state & I_SYNC)
+ if ((!inode_unhashed(inode) && inode->i_state & I_SYNC)) {
+ if (!inode->i_nlink && !is_bad_inode(inode)) {
+ /* to avoid evict_inode call simultaneously */
+ atomic_inc(&inode->i_count);
+ spin_unlock(&inode->i_lock);
+
+ /* some remained atomic pages should discarded */
+ if (f2fs_is_atomic_file(inode))
+ drop_inmem_pages(inode);
+
+ /* should remain fi->extent_tree for writepage */
+ f2fs_destroy_extent_node(inode);
+
+ sb_start_intwrite(inode->i_sb);
+ f2fs_i_size_write(inode, 0);
+
+ if (F2FS_HAS_BLOCKS(inode))
+ f2fs_truncate(inode);
+
+ sb_end_intwrite(inode->i_sb);
+
+ fscrypt_put_encryption_info(inode, NULL);
+ spin_lock(&inode->i_lock);
+ atomic_dec(&inode->i_count);
+ }
+ trace_f2fs_drop_inode(inode, 0);
return 0;
- return generic_drop_inode(inode);
+ }
+ ret = generic_drop_inode(inode);
+ trace_f2fs_drop_inode(inode, ret);
+ return ret;
+}
+
+int f2fs_inode_dirtied(struct inode *inode, bool sync)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ int ret = 0;
+
+ spin_lock(&sbi->inode_lock[DIRTY_META]);
+ if (is_inode_flag_set(inode, FI_DIRTY_INODE)) {
+ ret = 1;
+ } else {
+ set_inode_flag(inode, FI_DIRTY_INODE);
+ stat_inc_dirty_inode(sbi, DIRTY_META);
+ }
+ if (sync && list_empty(&F2FS_I(inode)->gdirty_list)) {
+ list_add_tail(&F2FS_I(inode)->gdirty_list,
+ &sbi->inode_list[DIRTY_META]);
+ inc_page_count(sbi, F2FS_DIRTY_IMETA);
+ }
+ spin_unlock(&sbi->inode_lock[DIRTY_META]);
+ return ret;
+}
+
+void f2fs_inode_synced(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ spin_lock(&sbi->inode_lock[DIRTY_META]);
+ if (!is_inode_flag_set(inode, FI_DIRTY_INODE)) {
+ spin_unlock(&sbi->inode_lock[DIRTY_META]);
+ return;
+ }
+ if (!list_empty(&F2FS_I(inode)->gdirty_list)) {
+ list_del_init(&F2FS_I(inode)->gdirty_list);
+ dec_page_count(sbi, F2FS_DIRTY_IMETA);
+ }
+ clear_inode_flag(inode, FI_DIRTY_INODE);
+ clear_inode_flag(inode, FI_AUTO_RECOVER);
+ stat_dec_dirty_inode(F2FS_I_SB(inode), DIRTY_META);
+ spin_unlock(&sbi->inode_lock[DIRTY_META]);
}
/*
@@ -408,7 +710,16 @@ static int f2fs_drop_inode(struct inode *inode)
*/
static void f2fs_dirty_inode(struct inode *inode, int flags)
{
- set_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ if (inode->i_ino == F2FS_NODE_INO(sbi) ||
+ inode->i_ino == F2FS_META_INO(sbi))
+ return;
+
+ if (is_inode_flag_set(inode, FI_AUTO_RECOVER))
+ clear_inode_flag(inode, FI_AUTO_RECOVER);
+
+ f2fs_inode_dirtied(inode, false);
}
static void f2fs_i_callback(struct rcu_head *head)
@@ -422,33 +733,78 @@ static void f2fs_destroy_inode(struct inode *inode)
call_rcu(&inode->i_rcu, f2fs_i_callback);
}
+static void destroy_percpu_info(struct f2fs_sb_info *sbi)
+{
+ percpu_counter_destroy(&sbi->alloc_valid_block_count);
+ percpu_counter_destroy(&sbi->total_valid_inode_count);
+}
+
+static void destroy_device_list(struct f2fs_sb_info *sbi)
+{
+ int i;
+
+ for (i = 0; i < sbi->s_ndevs; i++) {
+ blkdev_put(FDEV(i).bdev, FMODE_EXCL);
+#ifdef CONFIG_BLK_DEV_ZONED
+ kfree(FDEV(i).blkz_type);
+#endif
+ }
+ kfree(sbi->devs);
+}
+
static void f2fs_put_super(struct super_block *sb)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
if (sbi->s_proc) {
remove_proc_entry("segment_info", sbi->s_proc);
+ remove_proc_entry("segment_bits", sbi->s_proc);
remove_proc_entry(sb->s_id, f2fs_proc_root);
}
kobject_del(&sbi->s_kobj);
- f2fs_destroy_stats(sbi);
stop_gc_thread(sbi);
- /* We don't need to do checkpoint when it's clean */
- if (sbi->s_dirty) {
+ /* prevent remaining shrinker jobs */
+ mutex_lock(&sbi->umount_mutex);
+
+ /*
+ * We don't need to do checkpoint when superblock is clean.
+ * But, the previous checkpoint was not done by umount, it needs to do
+ * clean checkpoint again.
+ */
+ if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) ||
+ !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
struct cp_control cpc = {
.reason = CP_UMOUNT,
};
write_checkpoint(sbi, &cpc);
}
+ /* be sure to wait for any on-going discard commands */
+ f2fs_wait_discard_bios(sbi);
+
+ if (!sbi->discard_blks) {
+ struct cp_control cpc = {
+ .reason = CP_UMOUNT | CP_TRIMMED,
+ };
+ write_checkpoint(sbi, &cpc);
+ }
+
+ /* write_checkpoint can update stat informaion */
+ f2fs_destroy_stats(sbi);
+
/*
* normally superblock is clean, so we need to release this.
* In addition, EIO will skip do checkpoint, we need this as well.
*/
- release_dirty_inode(sbi);
- release_discard_addrs(sbi);
+ release_ino_entry(sbi, true);
+
+ f2fs_leave_shrinker(sbi);
+ mutex_unlock(&sbi->umount_mutex);
+
+ /* our cp_error case, we can wait for any writeback page */
+ f2fs_flush_merged_bios(sbi);
iput(sbi->node_inode);
iput(sbi->meta_inode);
@@ -462,39 +818,51 @@ static void f2fs_put_super(struct super_block *sb)
wait_for_completion(&sbi->s_kobj_unregister);
sb->s_fs_info = NULL;
- brelse(sbi->raw_super_buf);
+ if (sbi->s_chksum_driver)
+ crypto_free_shash(sbi->s_chksum_driver);
+ kfree(sbi->raw_super);
+
+ destroy_device_list(sbi);
+ if (sbi->write_io_dummy)
+ mempool_destroy(sbi->write_io_dummy);
+ destroy_percpu_info(sbi);
kfree(sbi);
}
int f2fs_sync_fs(struct super_block *sb, int sync)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ int err = 0;
trace_f2fs_sync_fs(sb, sync);
if (sync) {
- struct cp_control cpc = {
- .reason = CP_SYNC,
- };
+ struct cp_control cpc;
+
+ cpc.reason = __get_cp_reason(sbi);
+
mutex_lock(&sbi->gc_mutex);
- write_checkpoint(sbi, &cpc);
+ err = write_checkpoint(sbi, &cpc);
mutex_unlock(&sbi->gc_mutex);
- } else {
- f2fs_balance_fs(sbi);
}
+ f2fs_trace_ios(NULL, 1);
- return 0;
+ return err;
}
static int f2fs_freeze(struct super_block *sb)
{
- int err;
-
if (f2fs_readonly(sb))
return 0;
- err = f2fs_sync_fs(sb, 1);
- return err;
+ /* IO error happened before */
+ if (unlikely(f2fs_cp_error(F2FS_SB(sb))))
+ return -EIO;
+
+ /* must be clean, since sync_filesystem() was already called */
+ if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY))
+ return -EINVAL;
+ return 0;
}
static int f2fs_unfreeze(struct super_block *sb)
@@ -517,11 +885,12 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bsize = sbi->blocksize;
buf->f_blocks = total_count - start_count;
- buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count;
+ buf->f_bfree = user_block_count - valid_user_blocks(sbi) + ovp_count;
buf->f_bavail = user_block_count - valid_user_blocks(sbi);
buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM;
- buf->f_ffree = buf->f_files - valid_inode_count(sbi);
+ buf->f_ffree = min(buf->f_files - valid_node_count(sbi),
+ buf->f_bavail);
buf->f_namelen = F2FS_NAME_LEN;
buf->f_fsid.val[0] = (u32)id;
@@ -534,16 +903,22 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
{
struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb);
- if (!f2fs_readonly(sbi->sb) && test_opt(sbi, BG_GC))
- seq_printf(seq, ",background_gc=%s", "on");
- else
+ if (!f2fs_readonly(sbi->sb) && test_opt(sbi, BG_GC)) {
+ if (test_opt(sbi, FORCE_FG_GC))
+ seq_printf(seq, ",background_gc=%s", "sync");
+ else
+ seq_printf(seq, ",background_gc=%s", "on");
+ } else {
seq_printf(seq, ",background_gc=%s", "off");
+ }
if (test_opt(sbi, DISABLE_ROLL_FORWARD))
seq_puts(seq, ",disable_roll_forward");
if (test_opt(sbi, DISCARD))
seq_puts(seq, ",discard");
if (test_opt(sbi, NOHEAP))
- seq_puts(seq, ",no_heap_alloc");
+ seq_puts(seq, ",no_heap");
+ else
+ seq_puts(seq, ",heap");
#ifdef CONFIG_F2FS_FS_XATTR
if (test_opt(sbi, XATTR_USER))
seq_puts(seq, ",user_xattr");
@@ -551,6 +926,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",nouser_xattr");
if (test_opt(sbi, INLINE_XATTR))
seq_puts(seq, ",inline_xattr");
+ else
+ seq_puts(seq, ",noinline_xattr");
#endif
#ifdef CONFIG_F2FS_FS_POSIX_ACL
if (test_opt(sbi, POSIX_ACL))
@@ -562,11 +939,37 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",disable_ext_identify");
if (test_opt(sbi, INLINE_DATA))
seq_puts(seq, ",inline_data");
+ else
+ seq_puts(seq, ",noinline_data");
+ if (test_opt(sbi, INLINE_DENTRY))
+ seq_puts(seq, ",inline_dentry");
+ else
+ seq_puts(seq, ",noinline_dentry");
if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE))
seq_puts(seq, ",flush_merge");
if (test_opt(sbi, NOBARRIER))
seq_puts(seq, ",nobarrier");
+ if (test_opt(sbi, FASTBOOT))
+ seq_puts(seq, ",fastboot");
+ if (test_opt(sbi, EXTENT_CACHE))
+ seq_puts(seq, ",extent_cache");
+ else
+ seq_puts(seq, ",noextent_cache");
+ if (test_opt(sbi, DATA_FLUSH))
+ seq_puts(seq, ",data_flush");
+
+ seq_puts(seq, ",mode=");
+ if (test_opt(sbi, ADAPTIVE))
+ seq_puts(seq, "adaptive");
+ else if (test_opt(sbi, LFS))
+ seq_puts(seq, "lfs");
seq_printf(seq, ",active_logs=%u", sbi->active_logs);
+ if (F2FS_IO_SIZE_BITS(sbi))
+ seq_printf(seq, ",io_size=%uKB", F2FS_IO_SIZE_KB(sbi));
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (test_opt(sbi, FAULT_INJECTION))
+ seq_puts(seq, ",fault_injection");
+#endif
return 0;
}
@@ -586,9 +989,9 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset)
struct seg_entry *se = get_seg_entry(sbi, i);
if ((i % 10) == 0)
- seq_printf(seq, "%-5d", i);
+ seq_printf(seq, "%-10d", i);
seq_printf(seq, "%d|%-3u", se->type,
- get_valid_blocks(sbi, i, 1));
+ get_valid_blocks(sbi, i, false));
if ((i % 10) == 9 || i == (total_segs - 1))
seq_putc(seq, '\n');
else
@@ -598,19 +1001,78 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset)
return 0;
}
-static int segment_info_open_fs(struct inode *inode, struct file *file)
+static int segment_bits_seq_show(struct seq_file *seq, void *offset)
{
- return single_open(file, segment_info_seq_show, PDE_DATA(inode));
+ struct super_block *sb = seq->private;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ unsigned int total_segs =
+ le32_to_cpu(sbi->raw_super->segment_count_main);
+ int i, j;
+
+ seq_puts(seq, "format: segment_type|valid_blocks|bitmaps\n"
+ "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n");
+
+ for (i = 0; i < total_segs; i++) {
+ struct seg_entry *se = get_seg_entry(sbi, i);
+
+ seq_printf(seq, "%-10d", i);
+ seq_printf(seq, "%d|%-3u|", se->type,
+ get_valid_blocks(sbi, i, false));
+ for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++)
+ seq_printf(seq, " %.2x", se->cur_valid_map[j]);
+ seq_putc(seq, '\n');
+ }
+ return 0;
}
-static const struct file_operations f2fs_seq_segment_info_fops = {
- .owner = THIS_MODULE,
- .open = segment_info_open_fs,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
+#define F2FS_PROC_FILE_DEF(_name) \
+static int _name##_open_fs(struct inode *inode, struct file *file) \
+{ \
+ return single_open(file, _name##_seq_show, PDE_DATA(inode)); \
+} \
+ \
+static const struct file_operations f2fs_seq_##_name##_fops = { \
+ .owner = THIS_MODULE, \
+ .open = _name##_open_fs, \
+ .read = seq_read, \
+ .llseek = seq_lseek, \
+ .release = single_release, \
};
+F2FS_PROC_FILE_DEF(segment_info);
+F2FS_PROC_FILE_DEF(segment_bits);
+
+static void default_options(struct f2fs_sb_info *sbi)
+{
+ /* init some FS parameters */
+ sbi->active_logs = NR_CURSEG_TYPE;
+
+ set_opt(sbi, BG_GC);
+ set_opt(sbi, INLINE_XATTR);
+ set_opt(sbi, INLINE_DATA);
+ set_opt(sbi, INLINE_DENTRY);
+ set_opt(sbi, EXTENT_CACHE);
+ set_opt(sbi, NOHEAP);
+ set_opt(sbi, FLUSH_MERGE);
+ if (f2fs_sb_mounted_blkzoned(sbi->sb)) {
+ set_opt_mode(sbi, F2FS_MOUNT_LFS);
+ set_opt(sbi, DISCARD);
+ } else {
+ set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE);
+ }
+
+#ifdef CONFIG_F2FS_FS_XATTR
+ set_opt(sbi, XATTR_USER);
+#endif
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+ set_opt(sbi, POSIX_ACL);
+#endif
+
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ f2fs_build_fault_attr(sbi, 0);
+#endif
+}
+
static int f2fs_remount(struct super_block *sb, int *flags, char *data)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
@@ -618,8 +1080,10 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
int err, active_logs;
bool need_restart_gc = false;
bool need_stop_gc = false;
-
- sync_filesystem(sb);
+ bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE);
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ struct f2fs_fault_info ffi = sbi->fault_info;
+#endif
/*
* Save the old mount options in case we
@@ -628,8 +1092,17 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
org_mount_opt = sbi->mount_opt;
active_logs = sbi->active_logs;
+ /* recover superblocks we couldn't write due to previous RO mount */
+ if (!(*flags & MS_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) {
+ err = f2fs_commit_super(sbi, false);
+ f2fs_msg(sb, KERN_INFO,
+ "Try to recover all the superblocks, ret: %d", err);
+ if (!err)
+ clear_sbi_flag(sbi, SBI_NEED_SB_WRITE);
+ }
+
sbi->mount_opt.opt = 0;
- sbi->active_logs = NR_CURSEG_TYPE;
+ default_options(sbi);
/* parse mount options */
err = parse_options(sb, data);
@@ -643,6 +1116,14 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
if (f2fs_readonly(sb) && (*flags & MS_RDONLY))
goto skip;
+ /* disallow enable/disable extent_cache dynamically */
+ if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) {
+ err = -EINVAL;
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "switch extent_cache option is not allowed");
+ goto restore_opts;
+ }
+
/*
* We stop the GC thread if FS is mounted as RO
* or if background_gc = off is passed in mount
@@ -651,31 +1132,42 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
if ((*flags & MS_RDONLY) || !test_opt(sbi, BG_GC)) {
if (sbi->gc_thread) {
stop_gc_thread(sbi);
- f2fs_sync_fs(sb, 1);
need_restart_gc = true;
}
- } else if (test_opt(sbi, BG_GC) && !sbi->gc_thread) {
+ } else if (!sbi->gc_thread) {
err = start_gc_thread(sbi);
if (err)
goto restore_opts;
need_stop_gc = true;
}
+ if (*flags & MS_RDONLY) {
+ writeback_inodes_sb(sb, WB_REASON_SYNC);
+ sync_inodes_sb(sb);
+
+ set_sbi_flag(sbi, SBI_IS_DIRTY);
+ set_sbi_flag(sbi, SBI_IS_CLOSE);
+ f2fs_sync_fs(sb, 1);
+ clear_sbi_flag(sbi, SBI_IS_CLOSE);
+ }
+
/*
* We stop issue flush thread if FS is mounted as RO
* or if flush_merge is not passed in mount option.
*/
if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) {
- destroy_flush_cmd_control(sbi);
- } else if (test_opt(sbi, FLUSH_MERGE) && !SM_I(sbi)->cmd_control_info) {
+ clear_opt(sbi, FLUSH_MERGE);
+ destroy_flush_cmd_control(sbi, false);
+ } else {
err = create_flush_cmd_control(sbi);
if (err)
goto restore_gc;
}
skip:
/* Update the POSIXACL Flag */
- sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
(test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
+
return 0;
restore_gc:
if (need_restart_gc) {
@@ -688,6 +1180,9 @@ restore_gc:
restore_opts:
sbi->mount_opt = org_mount_opt;
sbi->active_logs = active_logs;
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ sbi->fault_info = ffi;
+#endif
return err;
}
@@ -707,6 +1202,42 @@ static struct super_operations f2fs_sops = {
.remount_fs = f2fs_remount,
};
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+static int f2fs_get_context(struct inode *inode, void *ctx, size_t len)
+{
+ return f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
+ F2FS_XATTR_NAME_ENCRYPTION_CONTEXT,
+ ctx, len, NULL);
+}
+
+static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len,
+ void *fs_data)
+{
+ return f2fs_setxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
+ F2FS_XATTR_NAME_ENCRYPTION_CONTEXT,
+ ctx, len, fs_data, XATTR_CREATE);
+}
+
+static unsigned f2fs_max_namelen(struct inode *inode)
+{
+ return S_ISLNK(inode->i_mode) ?
+ inode->i_sb->s_blocksize : F2FS_NAME_LEN;
+}
+
+static const struct fscrypt_operations f2fs_cryptops = {
+ .key_prefix = "f2fs:",
+ .get_context = f2fs_get_context,
+ .set_context = f2fs_set_context,
+ .is_encrypted = f2fs_encrypted_inode,
+ .empty_dir = f2fs_empty_dir,
+ .max_namelen = f2fs_max_namelen,
+};
+#else
+static const struct fscrypt_operations f2fs_cryptops = {
+ .is_encrypted = f2fs_encrypted_inode,
+};
+#endif
+
static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
u64 ino, u32 generation)
{
@@ -752,7 +1283,7 @@ static const struct export_operations f2fs_export_ops = {
.get_parent = f2fs_get_parent,
};
-static loff_t max_file_size(unsigned bits)
+static loff_t max_file_blocks(void)
{
loff_t result = (DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS);
loff_t leaf_count = ADDRS_PER_BLOCK;
@@ -768,13 +1299,29 @@ static loff_t max_file_size(unsigned bits)
leaf_count *= NIDS_PER_BLOCK;
result += leaf_count;
- result <<= bits;
return result;
}
-static inline bool sanity_check_area_boundary(struct super_block *sb,
- struct f2fs_super_block *raw_super)
+static int __f2fs_commit_super(struct buffer_head *bh,
+ struct f2fs_super_block *super)
{
+ lock_buffer(bh);
+ if (super)
+ memcpy(bh->b_data + F2FS_SUPER_OFFSET, super, sizeof(*super));
+ set_buffer_uptodate(bh);
+ set_buffer_dirty(bh);
+ unlock_buffer(bh);
+
+ /* it's rare case, we can do fua all the time */
+ return __sync_dirty_buffer(bh, WRITE_FLUSH_FUA);
+}
+
+static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi,
+ struct buffer_head *bh)
+{
+ struct f2fs_super_block *raw_super = (struct f2fs_super_block *)
+ (bh->b_data + F2FS_SUPER_OFFSET);
+ struct super_block *sb = sbi->sb;
u32 segment0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr);
u32 cp_blkaddr = le32_to_cpu(raw_super->cp_blkaddr);
u32 sit_blkaddr = le32_to_cpu(raw_super->sit_blkaddr);
@@ -788,6 +1335,10 @@ static inline bool sanity_check_area_boundary(struct super_block *sb,
u32 segment_count_main = le32_to_cpu(raw_super->segment_count_main);
u32 segment_count = le32_to_cpu(raw_super->segment_count);
u32 log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg);
+ u64 main_end_blkaddr = main_blkaddr +
+ (segment_count_main << log_blocks_per_seg);
+ u64 seg_end_blkaddr = segment0_blkaddr +
+ (segment_count << log_blocks_per_seg);
if (segment0_blkaddr != cp_blkaddr) {
f2fs_msg(sb, KERN_INFO,
@@ -832,22 +1383,47 @@ static inline bool sanity_check_area_boundary(struct super_block *sb,
return true;
}
- if (main_blkaddr + (segment_count_main << log_blocks_per_seg) !=
- segment0_blkaddr + (segment_count << log_blocks_per_seg)) {
+ if (main_end_blkaddr > seg_end_blkaddr) {
f2fs_msg(sb, KERN_INFO,
- "Wrong MAIN_AREA boundary, start(%u) end(%u) blocks(%u)",
+ "Wrong MAIN_AREA boundary, start(%u) end(%u) block(%u)",
main_blkaddr,
- segment0_blkaddr + (segment_count << log_blocks_per_seg),
+ segment0_blkaddr +
+ (segment_count << log_blocks_per_seg),
segment_count_main << log_blocks_per_seg);
return true;
- }
+ } else if (main_end_blkaddr < seg_end_blkaddr) {
+ int err = 0;
+ char *res;
+ /* fix in-memory information all the time */
+ raw_super->segment_count = cpu_to_le32((main_end_blkaddr -
+ segment0_blkaddr) >> log_blocks_per_seg);
+
+ if (f2fs_readonly(sb) || bdev_read_only(sb->s_bdev)) {
+ set_sbi_flag(sbi, SBI_NEED_SB_WRITE);
+ res = "internally";
+ } else {
+ err = __f2fs_commit_super(bh, NULL);
+ res = err ? "failed" : "done";
+ }
+ f2fs_msg(sb, KERN_INFO,
+ "Fix alignment : %s, start(%u) end(%u) block(%u)",
+ res, main_blkaddr,
+ segment0_blkaddr +
+ (segment_count << log_blocks_per_seg),
+ segment_count_main << log_blocks_per_seg);
+ if (err)
+ return true;
+ }
return false;
}
-static int sanity_check_raw_super(struct super_block *sb,
- struct f2fs_super_block *raw_super)
+static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
+ struct buffer_head *bh)
{
+ struct f2fs_super_block *raw_super = (struct f2fs_super_block *)
+ (bh->b_data + F2FS_SUPER_OFFSET);
+ struct super_block *sb = sbi->sb;
unsigned int blocksize;
if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) {
@@ -858,10 +1434,10 @@ static int sanity_check_raw_super(struct super_block *sb,
}
/* Currently, support only 4KB page cache size */
- if (F2FS_BLKSIZE != PAGE_CACHE_SIZE) {
+ if (F2FS_BLKSIZE != PAGE_SIZE) {
f2fs_msg(sb, KERN_INFO,
"Invalid page_cache_size (%lu), supports only 4KB\n",
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
return 1;
}
@@ -921,17 +1497,20 @@ static int sanity_check_raw_super(struct super_block *sb,
}
/* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */
- if (sanity_check_area_boundary(sb, raw_super))
+ if (sanity_check_area_boundary(sbi, bh))
return 1;
return 0;
}
-static int sanity_check_ckpt(struct f2fs_sb_info *sbi)
+int sanity_check_ckpt(struct f2fs_sb_info *sbi)
{
unsigned int total, fsmeta;
struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+ unsigned int main_segs, blocks_per_seg;
+ unsigned int ovp_segments, reserved_segments;
+ int i;
total = le32_to_cpu(raw_super->segment_count);
fsmeta = le32_to_cpu(raw_super->segment_count_ckpt);
@@ -943,6 +1522,32 @@ static int sanity_check_ckpt(struct f2fs_sb_info *sbi)
if (unlikely(fsmeta >= total))
return 1;
+ main_segs = le32_to_cpu(sbi->raw_super->segment_count_main);
+ blocks_per_seg = sbi->blocks_per_seg;
+
+ for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) {
+ if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs ||
+ le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg) {
+ return 1;
+ }
+ }
+ for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) {
+ if (le32_to_cpu(ckpt->cur_data_segno[i]) >= main_segs ||
+ le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg) {
+ return 1;
+ }
+ }
+
+ ovp_segments = le32_to_cpu(ckpt->overprov_segment_count);
+ reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count);
+
+ if (unlikely(fsmeta < F2FS_MIN_SEGMENTS ||
+ ovp_segments == 0 || reserved_segments == 0)) {
+ f2fs_msg(sbi->sb, KERN_ERR,
+ "Wrong layout: check mkfs.f2fs version");
+ return 1;
+ }
+
if (unlikely(f2fs_cp_error(sbi))) {
f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck");
return 1;
@@ -973,54 +1578,287 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
sbi->cur_victim_sec = NULL_SECNO;
sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH;
+ sbi->dir_level = DEF_DIR_LEVEL;
+ sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL;
+ sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL;
+ clear_sbi_flag(sbi, SBI_NEED_FSCK);
+
for (i = 0; i < NR_COUNT_TYPE; i++)
atomic_set(&sbi->nr_pages[i], 0);
- sbi->dir_level = DEF_DIR_LEVEL;
- sbi->need_fsck = false;
+ atomic_set(&sbi->wb_sync_req, 0);
+
+ INIT_LIST_HEAD(&sbi->s_list);
+ mutex_init(&sbi->umount_mutex);
+ mutex_init(&sbi->wio_mutex[NODE]);
+ mutex_init(&sbi->wio_mutex[DATA]);
+ spin_lock_init(&sbi->cp_lock);
}
+static int init_percpu_info(struct f2fs_sb_info *sbi)
+{
+ int err;
+
+ err = percpu_counter_init(&sbi->alloc_valid_block_count, 0, GFP_KERNEL);
+ if (err)
+ return err;
+
+ return percpu_counter_init(&sbi->total_valid_inode_count, 0,
+ GFP_KERNEL);
+}
+
+#ifdef CONFIG_BLK_DEV_ZONED
+static int init_blkz_info(struct f2fs_sb_info *sbi, int devi)
+{
+ struct block_device *bdev = FDEV(devi).bdev;
+ sector_t nr_sectors = bdev->bd_part->nr_sects;
+ sector_t sector = 0;
+ struct blk_zone *zones;
+ unsigned int i, nr_zones;
+ unsigned int n = 0;
+ int err = -EIO;
+
+ if (!f2fs_sb_mounted_blkzoned(sbi->sb))
+ return 0;
+
+ if (sbi->blocks_per_blkz && sbi->blocks_per_blkz !=
+ SECTOR_TO_BLOCK(bdev_zone_size(bdev)))
+ return -EINVAL;
+ sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_size(bdev));
+ if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz !=
+ __ilog2_u32(sbi->blocks_per_blkz))
+ return -EINVAL;
+ sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz);
+ FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >>
+ sbi->log_blocks_per_blkz;
+ if (nr_sectors & (bdev_zone_size(bdev) - 1))
+ FDEV(devi).nr_blkz++;
+
+ FDEV(devi).blkz_type = kmalloc(FDEV(devi).nr_blkz, GFP_KERNEL);
+ if (!FDEV(devi).blkz_type)
+ return -ENOMEM;
+
+#define F2FS_REPORT_NR_ZONES 4096
+
+ zones = kcalloc(F2FS_REPORT_NR_ZONES, sizeof(struct blk_zone),
+ GFP_KERNEL);
+ if (!zones)
+ return -ENOMEM;
+
+ /* Get block zones type */
+ while (zones && sector < nr_sectors) {
+
+ nr_zones = F2FS_REPORT_NR_ZONES;
+ err = blkdev_report_zones(bdev, sector,
+ zones, &nr_zones,
+ GFP_KERNEL);
+ if (err)
+ break;
+ if (!nr_zones) {
+ err = -EIO;
+ break;
+ }
+
+ for (i = 0; i < nr_zones; i++) {
+ FDEV(devi).blkz_type[n] = zones[i].type;
+ sector += zones[i].len;
+ n++;
+ }
+ }
+
+ kfree(zones);
+
+ return err;
+}
+#endif
+
/*
* Read f2fs raw super block.
- * Because we have two copies of super block, so read the first one at first,
- * if the first one is invalid, move to read the second one.
+ * Because we have two copies of super block, so read both of them
+ * to get the first valid one. If any one of them is broken, we pass
+ * them recovery flag back to the caller.
*/
-static int read_raw_super_block(struct super_block *sb,
+static int read_raw_super_block(struct f2fs_sb_info *sbi,
struct f2fs_super_block **raw_super,
- struct buffer_head **raw_super_buf)
+ int *valid_super_block, int *recovery)
{
- int block = 0;
+ struct super_block *sb = sbi->sb;
+ int block;
+ struct buffer_head *bh;
+ struct f2fs_super_block *super;
+ int err = 0;
+
+ super = kzalloc(sizeof(struct f2fs_super_block), GFP_KERNEL);
+ if (!super)
+ return -ENOMEM;
-retry:
- *raw_super_buf = sb_bread(sb, block);
- if (!*raw_super_buf) {
- f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock",
+ for (block = 0; block < 2; block++) {
+ bh = sb_bread(sb, block);
+ if (!bh) {
+ f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock",
block + 1);
- if (block == 0) {
- block++;
- goto retry;
- } else {
- return -EIO;
+ err = -EIO;
+ continue;
+ }
+
+ /* sanity checking of raw super */
+ if (sanity_check_raw_super(sbi, bh)) {
+ f2fs_msg(sb, KERN_ERR,
+ "Can't find valid F2FS filesystem in %dth superblock",
+ block + 1);
+ err = -EINVAL;
+ brelse(bh);
+ continue;
}
+
+ if (!*raw_super) {
+ memcpy(super, bh->b_data + F2FS_SUPER_OFFSET,
+ sizeof(*super));
+ *valid_super_block = block;
+ *raw_super = super;
+ }
+ brelse(bh);
}
- *raw_super = (struct f2fs_super_block *)
- ((char *)(*raw_super_buf)->b_data + F2FS_SUPER_OFFSET);
+ /* Fail to read any one of the superblocks*/
+ if (err < 0)
+ *recovery = 1;
- /* sanity checking of raw super */
- if (sanity_check_raw_super(sb, *raw_super)) {
- brelse(*raw_super_buf);
- f2fs_msg(sb, KERN_ERR,
- "Can't find valid F2FS filesystem in %dth superblock",
- block + 1);
- if (block == 0) {
- block++;
- goto retry;
+ /* No valid superblock */
+ if (!*raw_super)
+ kfree(super);
+ else
+ err = 0;
+
+ return err;
+}
+
+int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover)
+{
+ struct buffer_head *bh;
+ int err;
+
+ if ((recover && f2fs_readonly(sbi->sb)) ||
+ bdev_read_only(sbi->sb->s_bdev)) {
+ set_sbi_flag(sbi, SBI_NEED_SB_WRITE);
+ return -EROFS;
+ }
+
+ /* write back-up superblock first */
+ bh = sb_getblk(sbi->sb, sbi->valid_super_block ? 0: 1);
+ if (!bh)
+ return -EIO;
+ err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi));
+ brelse(bh);
+
+ /* if we are in recovery path, skip writing valid superblock */
+ if (recover || err)
+ return err;
+
+ /* write current valid superblock */
+ bh = sb_getblk(sbi->sb, sbi->valid_super_block);
+ if (!bh)
+ return -EIO;
+ err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi));
+ brelse(bh);
+ return err;
+}
+
+static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
+ unsigned int max_devices = MAX_DEVICES;
+ int i;
+
+ /* Initialize single device information */
+ if (!RDEV(0).path[0]) {
+#ifdef CONFIG_BLK_DEV_ZONED
+ if (bdev_zoned_model(sbi->sb->s_bdev) == BLK_ZONED_NONE)
+ return 0;
+ max_devices = 1;
+#else
+ return 0;
+#endif
+ }
+
+ /*
+ * Initialize multiple devices information, or single
+ * zoned block device information.
+ */
+ sbi->devs = kcalloc(max_devices, sizeof(struct f2fs_dev_info),
+ GFP_KERNEL);
+ if (!sbi->devs)
+ return -ENOMEM;
+
+ for (i = 0; i < max_devices; i++) {
+
+ if (i > 0 && !RDEV(i).path[0])
+ break;
+
+ if (max_devices == 1) {
+ /* Single zoned block device mount */
+ FDEV(0).bdev =
+ blkdev_get_by_dev(sbi->sb->s_bdev->bd_dev,
+ sbi->sb->s_mode, sbi->sb->s_type);
} else {
+ /* Multi-device mount */
+ memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN);
+ FDEV(i).total_segments =
+ le32_to_cpu(RDEV(i).total_segments);
+ if (i == 0) {
+ FDEV(i).start_blk = 0;
+ FDEV(i).end_blk = FDEV(i).start_blk +
+ (FDEV(i).total_segments <<
+ sbi->log_blocks_per_seg) - 1 +
+ le32_to_cpu(raw_super->segment0_blkaddr);
+ } else {
+ FDEV(i).start_blk = FDEV(i - 1).end_blk + 1;
+ FDEV(i).end_blk = FDEV(i).start_blk +
+ (FDEV(i).total_segments <<
+ sbi->log_blocks_per_seg) - 1;
+ }
+ FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path,
+ sbi->sb->s_mode, sbi->sb->s_type);
+ }
+ if (IS_ERR(FDEV(i).bdev))
+ return PTR_ERR(FDEV(i).bdev);
+
+ /* to release errored devices */
+ sbi->s_ndevs = i + 1;
+
+#ifdef CONFIG_BLK_DEV_ZONED
+ if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM &&
+ !f2fs_sb_mounted_blkzoned(sbi->sb)) {
+ f2fs_msg(sbi->sb, KERN_ERR,
+ "Zoned block device feature not enabled\n");
return -EINVAL;
}
+ if (bdev_zoned_model(FDEV(i).bdev) != BLK_ZONED_NONE) {
+ if (init_blkz_info(sbi, i)) {
+ f2fs_msg(sbi->sb, KERN_ERR,
+ "Failed to initialize F2FS blkzone information");
+ return -EINVAL;
+ }
+ if (max_devices == 1)
+ break;
+ f2fs_msg(sbi->sb, KERN_INFO,
+ "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: %s)",
+ i, FDEV(i).path,
+ FDEV(i).total_segments,
+ FDEV(i).start_blk, FDEV(i).end_blk,
+ bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HA ?
+ "Host-aware" : "Host-managed");
+ continue;
+ }
+#endif
+ f2fs_msg(sbi->sb, KERN_INFO,
+ "Mount Device [%2d]: %20s, %8u, %8x - %8x",
+ i, FDEV(i).path,
+ FDEV(i).total_segments,
+ FDEV(i).start_blk, FDEV(i).end_blk);
}
-
+ f2fs_msg(sbi->sb, KERN_INFO,
+ "IO Block Size: %8d KB", F2FS_IO_SIZE_KB(sbi));
return 0;
}
@@ -1028,50 +1866,81 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
{
struct f2fs_sb_info *sbi;
struct f2fs_super_block *raw_super;
- struct buffer_head *raw_super_buf;
struct inode *root;
- long err = -EINVAL;
- bool retry = true;
- int i;
+ int err;
+ bool retry = true, need_fsck = false;
+ char *options = NULL;
+ int recovery, i, valid_super_block;
+ struct curseg_info *seg_i;
try_onemore:
+ err = -EINVAL;
+ raw_super = NULL;
+ valid_super_block = -1;
+ recovery = 0;
+
/* allocate memory for f2fs-specific super block info */
sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL);
if (!sbi)
return -ENOMEM;
+ sbi->sb = sb;
+
+ /* Load the checksum driver */
+ sbi->s_chksum_driver = crypto_alloc_shash("crc32", 0, 0);
+ if (IS_ERR(sbi->s_chksum_driver)) {
+ f2fs_msg(sb, KERN_ERR, "Cannot load crc32 driver.");
+ err = PTR_ERR(sbi->s_chksum_driver);
+ sbi->s_chksum_driver = NULL;
+ goto free_sbi;
+ }
+
/* set a block size */
if (unlikely(!sb_set_blocksize(sb, F2FS_BLKSIZE))) {
f2fs_msg(sb, KERN_ERR, "unable to set blocksize");
goto free_sbi;
}
- err = read_raw_super_block(sb, &raw_super, &raw_super_buf);
+ err = read_raw_super_block(sbi, &raw_super, &valid_super_block,
+ &recovery);
if (err)
goto free_sbi;
sb->s_fs_info = sbi;
- /* init some FS parameters */
- sbi->active_logs = NR_CURSEG_TYPE;
-
- set_opt(sbi, BG_GC);
+ sbi->raw_super = raw_super;
-#ifdef CONFIG_F2FS_FS_XATTR
- set_opt(sbi, XATTR_USER);
-#endif
-#ifdef CONFIG_F2FS_FS_POSIX_ACL
- set_opt(sbi, POSIX_ACL);
+ /*
+ * The BLKZONED feature indicates that the drive was formatted with
+ * zone alignment optimization. This is optional for host-aware
+ * devices, but mandatory for host-managed zoned block devices.
+ */
+#ifndef CONFIG_BLK_DEV_ZONED
+ if (f2fs_sb_mounted_blkzoned(sb)) {
+ f2fs_msg(sb, KERN_ERR,
+ "Zoned block device support is not enabled\n");
+ goto free_sb_buf;
+ }
#endif
+ default_options(sbi);
/* parse mount options */
- err = parse_options(sb, (char *)data);
- if (err)
+ options = kstrdup((const char *)data, GFP_KERNEL);
+ if (data && !options) {
+ err = -ENOMEM;
goto free_sb_buf;
+ }
+
+ err = parse_options(sb, options);
+ if (err)
+ goto free_options;
- sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize));
+ sbi->max_file_blocks = max_file_blocks();
+ sb->s_maxbytes = sbi->max_file_blocks <<
+ le32_to_cpu(raw_super->log_blocksize);
sb->s_max_links = F2FS_LINK_MAX;
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
sb->s_op = &f2fs_sops;
+ sb->s_cop = &f2fs_cryptops;
sb->s_xattr = f2fs_xattr_handlers;
sb->s_export_op = &f2fs_export_ops;
sb->s_magic = F2FS_SUPER_MAGIC;
@@ -1081,14 +1950,14 @@ try_onemore:
memcpy(sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid));
/* init f2fs-specific super block info */
- sbi->sb = sb;
- sbi->raw_super = raw_super;
- sbi->raw_super_buf = raw_super_buf;
+ sbi->valid_super_block = valid_super_block;
mutex_init(&sbi->gc_mutex);
- mutex_init(&sbi->writepages);
mutex_init(&sbi->cp_mutex);
init_rwsem(&sbi->node_write);
- sbi->por_doing = false;
+ init_rwsem(&sbi->node_change);
+
+ /* disallow all the data/node/meta page writes */
+ set_sbi_flag(sbi, SBI_POR_DOING);
spin_lock_init(&sbi->stat_lock);
init_rwsem(&sbi->read_io.io_rwsem);
@@ -1104,12 +1973,23 @@ try_onemore:
init_waitqueue_head(&sbi->cp_wait);
init_sb_info(sbi);
+ err = init_percpu_info(sbi);
+ if (err)
+ goto free_options;
+
+ if (F2FS_IO_SIZE(sbi) > 1) {
+ sbi->write_io_dummy =
+ mempool_create_page_pool(2 * (F2FS_IO_SIZE(sbi) - 1), 0);
+ if (!sbi->write_io_dummy)
+ goto free_options;
+ }
+
/* get an inode for meta space */
sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi));
if (IS_ERR(sbi->meta_inode)) {
f2fs_msg(sb, KERN_ERR, "Failed to read F2FS meta data inode");
err = PTR_ERR(sbi->meta_inode);
- goto free_sb_buf;
+ goto free_io_dummy;
}
err = get_valid_checkpoint(sbi);
@@ -1118,24 +1998,28 @@ try_onemore:
goto free_meta_inode;
}
- /* sanity checking of checkpoint */
- err = -EINVAL;
- if (sanity_check_ckpt(sbi)) {
- f2fs_msg(sb, KERN_ERR, "Invalid F2FS checkpoint");
- goto free_cp;
+ /* Initialize device list */
+ err = f2fs_scan_devices(sbi);
+ if (err) {
+ f2fs_msg(sb, KERN_ERR, "Failed to find devices");
+ goto free_devices;
}
sbi->total_valid_node_count =
le32_to_cpu(sbi->ckpt->valid_node_count);
- sbi->total_valid_inode_count =
- le32_to_cpu(sbi->ckpt->valid_inode_count);
+ percpu_counter_set(&sbi->total_valid_inode_count,
+ le32_to_cpu(sbi->ckpt->valid_inode_count));
sbi->user_block_count = le64_to_cpu(sbi->ckpt->user_block_count);
sbi->total_valid_block_count =
le64_to_cpu(sbi->ckpt->valid_block_count);
sbi->last_valid_block_count = sbi->total_valid_block_count;
- sbi->alloc_valid_block_count = 0;
- INIT_LIST_HEAD(&sbi->dir_inode_list);
- spin_lock_init(&sbi->dir_inode_lock);
+
+ for (i = 0; i < NR_INODE_TYPE; i++) {
+ INIT_LIST_HEAD(&sbi->inode_list[i]);
+ spin_lock_init(&sbi->inode_lock[i]);
+ }
+
+ init_extent_cache_info(sbi);
init_ino_entry_info(sbi);
@@ -1153,6 +2037,17 @@ try_onemore:
goto free_nm;
}
+ /* For write statistics */
+ if (sb->s_bdev->bd_part)
+ sbi->sectors_written_start =
+ (u64)part_stat_read(sb->s_bdev->bd_part, sectors[1]);
+
+ /* Read accumulated write IO statistics if exists */
+ seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE);
+ if (__exist_node_summaries(sbi))
+ sbi->kbytes_written =
+ le64_to_cpu(seg_i->journal->info.kbytes_written);
+
build_gc_manager(sbi);
/* get an inode for node space */
@@ -1163,8 +2058,16 @@ try_onemore:
goto free_nm;
}
+ f2fs_join_shrinker(sbi);
+
+ err = f2fs_build_stats(sbi);
+ if (err)
+ goto free_nm;
+
/* if there are nt orphan nodes free them */
- recover_orphan_inodes(sbi);
+ err = recover_orphan_inodes(sbi);
+ if (err)
+ goto free_node_inode;
/* read root inode and dentry */
root = f2fs_iget(sb, F2FS_ROOT_INO(sbi));
@@ -1185,23 +2088,14 @@ try_onemore:
goto free_root_inode;
}
- err = f2fs_build_stats(sbi);
- if (err)
- goto free_root_inode;
-
if (f2fs_proc_root)
sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root);
- if (sbi->s_proc)
+ if (sbi->s_proc) {
proc_create_data("segment_info", S_IRUGO, sbi->s_proc,
&f2fs_seq_segment_info_fops, sb);
-
- if (test_opt(sbi, DISCARD)) {
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
- if (!blk_queue_discard(q))
- f2fs_msg(sb, KERN_WARNING,
- "mounting with \"discard\" option, but "
- "the device does not support discard");
+ proc_create_data("segment_bits", S_IRUGO, sbi->s_proc,
+ &f2fs_seq_segment_bits_fops, sb);
}
sbi->s_kobj.kset = f2fs_kset;
@@ -1211,61 +2105,126 @@ try_onemore:
if (err)
goto free_proc;
- if (!retry)
- sbi->need_fsck = true;
-
/* recover fsynced data */
if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
- err = recover_fsync_data(sbi);
- if (err) {
+ /*
+ * mount should be failed, when device has readonly mode, and
+ * previous checkpoint was not done by clean system shutdown.
+ */
+ if (bdev_read_only(sb->s_bdev) &&
+ !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
+ err = -EROFS;
+ goto free_kobj;
+ }
+
+ if (need_fsck)
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+
+ if (!retry)
+ goto skip_recovery;
+
+ err = recover_fsync_data(sbi, false);
+ if (err < 0) {
+ need_fsck = true;
f2fs_msg(sb, KERN_ERR,
- "Cannot recover all fsync data errno=%ld", err);
+ "Cannot recover all fsync data errno=%d", err);
+ goto free_kobj;
+ }
+ } else {
+ err = recover_fsync_data(sbi, true);
+
+ if (!f2fs_readonly(sb) && err > 0) {
+ err = -EINVAL;
+ f2fs_msg(sb, KERN_ERR,
+ "Need to recover fsync data");
goto free_kobj;
}
}
+skip_recovery:
+ /* recover_fsync_data() cleared this already */
+ clear_sbi_flag(sbi, SBI_POR_DOING);
/*
* If filesystem is not mounted as read-only then
* do start the gc_thread.
*/
- if (!f2fs_readonly(sb)) {
+ if (test_opt(sbi, BG_GC) && !f2fs_readonly(sb)) {
/* After POR, we can run background GC thread.*/
err = start_gc_thread(sbi);
if (err)
goto free_kobj;
}
+ kfree(options);
+
+ /* recover broken superblock */
+ if (recovery) {
+ err = f2fs_commit_super(sbi, true);
+ f2fs_msg(sb, KERN_INFO,
+ "Try to recover %dth superblock, ret: %d",
+ sbi->valid_super_block ? 1 : 2, err);
+ }
+
+ f2fs_msg(sbi->sb, KERN_NOTICE, "Mounted with checkpoint version = %llx",
+ cur_cp_version(F2FS_CKPT(sbi)));
+ f2fs_update_time(sbi, CP_TIME);
+ f2fs_update_time(sbi, REQ_TIME);
return 0;
free_kobj:
+ f2fs_sync_inode_meta(sbi);
kobject_del(&sbi->s_kobj);
+ kobject_put(&sbi->s_kobj);
+ wait_for_completion(&sbi->s_kobj_unregister);
free_proc:
if (sbi->s_proc) {
remove_proc_entry("segment_info", sbi->s_proc);
+ remove_proc_entry("segment_bits", sbi->s_proc);
remove_proc_entry(sb->s_id, f2fs_proc_root);
}
- f2fs_destroy_stats(sbi);
free_root_inode:
dput(sb->s_root);
sb->s_root = NULL;
free_node_inode:
+ truncate_inode_pages_final(NODE_MAPPING(sbi));
+ mutex_lock(&sbi->umount_mutex);
+ release_ino_entry(sbi, true);
+ f2fs_leave_shrinker(sbi);
+ /*
+ * Some dirty meta pages can be produced by recover_orphan_inodes()
+ * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg()
+ * followed by write_checkpoint() through f2fs_write_node_pages(), which
+ * falls into an infinite loop in sync_meta_pages().
+ */
+ truncate_inode_pages_final(META_MAPPING(sbi));
iput(sbi->node_inode);
+ mutex_unlock(&sbi->umount_mutex);
+ f2fs_destroy_stats(sbi);
free_nm:
destroy_node_manager(sbi);
free_sm:
destroy_segment_manager(sbi);
-free_cp:
+free_devices:
+ destroy_device_list(sbi);
kfree(sbi->ckpt);
free_meta_inode:
make_bad_inode(sbi->meta_inode);
iput(sbi->meta_inode);
+free_io_dummy:
+ if (sbi->write_io_dummy)
+ mempool_destroy(sbi->write_io_dummy);
+free_options:
+ destroy_percpu_info(sbi);
+ kfree(options);
free_sb_buf:
- brelse(raw_super_buf);
+ kfree(raw_super);
free_sbi:
+ if (sbi->s_chksum_driver)
+ crypto_free_shash(sbi->s_chksum_driver);
kfree(sbi);
/* give only one another chance */
if (retry) {
- retry = 0;
+ retry = false;
shrink_dcache_sb(sb);
goto try_onemore;
}
@@ -1278,11 +2237,18 @@ static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags,
return mount_bdev(fs_type, flags, dev_name, data, f2fs_fill_super);
}
+static void kill_f2fs_super(struct super_block *sb)
+{
+ if (sb->s_root)
+ set_sbi_flag(F2FS_SB(sb), SBI_IS_CLOSE);
+ kill_block_super(sb);
+}
+
static struct file_system_type f2fs_fs_type = {
.owner = THIS_MODULE,
.name = "f2fs",
.mount = f2fs_mount,
- .kill_sb = kill_block_super,
+ .kill_sb = kill_f2fs_super,
.fs_flags = FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("f2fs");
@@ -1310,6 +2276,8 @@ static int __init init_f2fs_fs(void)
{
int err;
+ f2fs_build_trace_ios();
+
err = init_inodecache();
if (err)
goto fail;
@@ -1319,30 +2287,40 @@ static int __init init_f2fs_fs(void)
err = create_segment_manager_caches();
if (err)
goto free_node_manager_caches;
- err = create_gc_caches();
+ err = create_checkpoint_caches();
if (err)
goto free_segment_manager_caches;
- err = create_checkpoint_caches();
+ err = create_extent_cache();
if (err)
- goto free_gc_caches;
+ goto free_checkpoint_caches;
f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj);
if (!f2fs_kset) {
err = -ENOMEM;
- goto free_checkpoint_caches;
+ goto free_extent_cache;
}
- err = register_filesystem(&f2fs_fs_type);
+ err = register_shrinker(&f2fs_shrinker_info);
if (err)
goto free_kset;
- f2fs_create_root_stats();
+
+ err = register_filesystem(&f2fs_fs_type);
+ if (err)
+ goto free_shrinker;
+ err = f2fs_create_root_stats();
+ if (err)
+ goto free_filesystem;
f2fs_proc_root = proc_mkdir("fs/f2fs", NULL);
return 0;
+free_filesystem:
+ unregister_filesystem(&f2fs_fs_type);
+free_shrinker:
+ unregister_shrinker(&f2fs_shrinker_info);
free_kset:
kset_unregister(f2fs_kset);
+free_extent_cache:
+ destroy_extent_cache();
free_checkpoint_caches:
destroy_checkpoint_caches();
-free_gc_caches:
- destroy_gc_caches();
free_segment_manager_caches:
destroy_segment_manager_caches();
free_node_manager_caches:
@@ -1358,12 +2336,14 @@ static void __exit exit_f2fs_fs(void)
remove_proc_entry("fs/f2fs", NULL);
f2fs_destroy_root_stats();
unregister_filesystem(&f2fs_fs_type);
+ unregister_shrinker(&f2fs_shrinker_info);
+ kset_unregister(f2fs_kset);
+ destroy_extent_cache();
destroy_checkpoint_caches();
- destroy_gc_caches();
destroy_segment_manager_caches();
destroy_node_manager_caches();
destroy_inodecache();
- kset_unregister(f2fs_kset);
+ f2fs_destroy_trace_ios();
}
module_init(init_f2fs_fs)
@@ -1372,3 +2352,4 @@ module_exit(exit_f2fs_fs)
MODULE_AUTHOR("Samsung Electronics's Praesto Team");
MODULE_DESCRIPTION("Flash Friendly File System");
MODULE_LICENSE("GPL");
+
diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c
new file mode 100644
index 000000000000..bccbbf2616d2
--- /dev/null
+++ b/fs/f2fs/trace.c
@@ -0,0 +1,162 @@
+/*
+ * f2fs IO tracer
+ *
+ * Copyright (c) 2014 Motorola Mobility
+ * Copyright (c) 2014 Jaegeuk Kim <jaegeuk@kernel.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/sched.h>
+#include <linux/radix-tree.h>
+
+#include "f2fs.h"
+#include "trace.h"
+
+static RADIX_TREE(pids, GFP_ATOMIC);
+static spinlock_t pids_lock;
+static struct last_io_info last_io;
+
+static inline void __print_last_io(void)
+{
+ if (!last_io.len)
+ return;
+
+ trace_printk("%3x:%3x %4x %-16s %2x %5x %5x %12x %4x\n",
+ last_io.major, last_io.minor,
+ last_io.pid, "----------------",
+ last_io.type,
+ last_io.fio.op, last_io.fio.op_flags,
+ last_io.fio.new_blkaddr,
+ last_io.len);
+ memset(&last_io, 0, sizeof(last_io));
+}
+
+static int __file_type(struct inode *inode, pid_t pid)
+{
+ if (f2fs_is_atomic_file(inode))
+ return __ATOMIC_FILE;
+ else if (f2fs_is_volatile_file(inode))
+ return __VOLATILE_FILE;
+ else if (S_ISDIR(inode->i_mode))
+ return __DIR_FILE;
+ else if (inode->i_ino == F2FS_NODE_INO(F2FS_I_SB(inode)))
+ return __NODE_FILE;
+ else if (inode->i_ino == F2FS_META_INO(F2FS_I_SB(inode)))
+ return __META_FILE;
+ else if (pid)
+ return __NORMAL_FILE;
+ else
+ return __MISC_FILE;
+}
+
+void f2fs_trace_pid(struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ pid_t pid = task_pid_nr(current);
+ void *p;
+
+ set_page_private(page, (unsigned long)pid);
+
+ if (radix_tree_preload(GFP_NOFS))
+ return;
+
+ spin_lock(&pids_lock);
+ p = radix_tree_lookup(&pids, pid);
+ if (p == current)
+ goto out;
+ if (p)
+ radix_tree_delete(&pids, pid);
+
+ f2fs_radix_tree_insert(&pids, pid, current);
+
+ trace_printk("%3x:%3x %4x %-16s\n",
+ MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev),
+ pid, current->comm);
+out:
+ spin_unlock(&pids_lock);
+ radix_tree_preload_end();
+}
+
+void f2fs_trace_ios(struct f2fs_io_info *fio, int flush)
+{
+ struct inode *inode;
+ pid_t pid;
+ int major, minor;
+
+ if (flush) {
+ __print_last_io();
+ return;
+ }
+
+ inode = fio->page->mapping->host;
+ pid = page_private(fio->page);
+
+ major = MAJOR(inode->i_sb->s_dev);
+ minor = MINOR(inode->i_sb->s_dev);
+
+ if (last_io.major == major && last_io.minor == minor &&
+ last_io.pid == pid &&
+ last_io.type == __file_type(inode, pid) &&
+ last_io.fio.op == fio->op &&
+ last_io.fio.op_flags == fio->op_flags &&
+ last_io.fio.new_blkaddr + last_io.len ==
+ fio->new_blkaddr) {
+ last_io.len++;
+ return;
+ }
+
+ __print_last_io();
+
+ last_io.major = major;
+ last_io.minor = minor;
+ last_io.pid = pid;
+ last_io.type = __file_type(inode, pid);
+ last_io.fio = *fio;
+ last_io.len = 1;
+ return;
+}
+
+void f2fs_build_trace_ios(void)
+{
+ spin_lock_init(&pids_lock);
+}
+
+#define PIDVEC_SIZE 128
+static unsigned int gang_lookup_pids(pid_t *results, unsigned long first_index,
+ unsigned int max_items)
+{
+ struct radix_tree_iter iter;
+ void **slot;
+ unsigned int ret = 0;
+
+ if (unlikely(!max_items))
+ return 0;
+
+ radix_tree_for_each_slot(slot, &pids, &iter, first_index) {
+ results[ret] = iter.index;
+ if (++ret == max_items)
+ break;
+ }
+ return ret;
+}
+
+void f2fs_destroy_trace_ios(void)
+{
+ pid_t pid[PIDVEC_SIZE];
+ pid_t next_pid = 0;
+ unsigned int found;
+
+ spin_lock(&pids_lock);
+ while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) {
+ unsigned idx;
+
+ next_pid = pid[found - 1] + 1;
+ for (idx = 0; idx < found; idx++)
+ radix_tree_delete(&pids, pid[idx]);
+ }
+ spin_unlock(&pids_lock);
+}
diff --git a/fs/f2fs/trace.h b/fs/f2fs/trace.h
new file mode 100644
index 000000000000..67db24ac1e85
--- /dev/null
+++ b/fs/f2fs/trace.h
@@ -0,0 +1,46 @@
+/*
+ * f2fs IO tracer
+ *
+ * Copyright (c) 2014 Motorola Mobility
+ * Copyright (c) 2014 Jaegeuk Kim <jaegeuk@kernel.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __F2FS_TRACE_H__
+#define __F2FS_TRACE_H__
+
+#ifdef CONFIG_F2FS_IO_TRACE
+#include <trace/events/f2fs.h>
+
+enum file_type {
+ __NORMAL_FILE,
+ __DIR_FILE,
+ __NODE_FILE,
+ __META_FILE,
+ __ATOMIC_FILE,
+ __VOLATILE_FILE,
+ __MISC_FILE,
+};
+
+struct last_io_info {
+ int major, minor;
+ pid_t pid;
+ enum file_type type;
+ struct f2fs_io_info fio;
+ block_t len;
+};
+
+extern void f2fs_trace_pid(struct page *);
+extern void f2fs_trace_ios(struct f2fs_io_info *, int);
+extern void f2fs_build_trace_ios(void);
+extern void f2fs_destroy_trace_ios(void);
+#else
+#define f2fs_trace_pid(p)
+#define f2fs_trace_ios(i, n)
+#define f2fs_build_trace_ios()
+#define f2fs_destroy_trace_ios()
+
+#endif
+#endif /* __F2FS_TRACE_H__ */
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index deca8728117b..c01e2b61e747 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -83,7 +83,7 @@ static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name,
}
if (strcmp(name, "") == 0)
return -EINVAL;
- return f2fs_getxattr(dentry->d_inode, type, name, buffer, size);
+ return f2fs_getxattr(d_inode(dentry), type, name, buffer, size, NULL);
}
static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name,
@@ -108,7 +108,7 @@ static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name,
if (strcmp(name, "") == 0)
return -EINVAL;
- return f2fs_setxattr(dentry->d_inode, type, name,
+ return f2fs_setxattr(d_inode(dentry), type, name,
value, size, NULL, flags);
}
@@ -130,19 +130,20 @@ static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list,
static int f2fs_xattr_advise_get(struct dentry *dentry, const char *name,
void *buffer, size_t size, int type)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
if (strcmp(name, "") != 0)
return -EINVAL;
- *((char *)buffer) = F2FS_I(inode)->i_advise;
+ if (buffer)
+ *((char *)buffer) = F2FS_I(inode)->i_advise;
return sizeof(char);
}
static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags, int type)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
if (strcmp(name, "") != 0)
return -EINVAL;
@@ -152,6 +153,7 @@ static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name,
return -EINVAL;
F2FS_I(inode)->i_advise |= *(char *)value;
+ f2fs_mark_inode_dirty_sync(inode, true);
return 0;
}
@@ -264,18 +266,125 @@ static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index,
return entry;
}
-static void *read_all_xattrs(struct inode *inode, struct page *ipage)
+static struct f2fs_xattr_entry *__find_inline_xattr(void *base_addr,
+ void **last_addr, int index,
+ size_t len, const char *name)
+{
+ struct f2fs_xattr_entry *entry;
+ unsigned int inline_size = F2FS_INLINE_XATTR_ADDRS << 2;
+
+ list_for_each_xattr(entry, base_addr) {
+ if ((void *)entry + sizeof(__u32) > base_addr + inline_size ||
+ (void *)XATTR_NEXT_ENTRY(entry) + sizeof(__u32) >
+ base_addr + inline_size) {
+ *last_addr = entry;
+ return NULL;
+ }
+ if (entry->e_name_index != index)
+ continue;
+ if (entry->e_name_len != len)
+ continue;
+ if (!memcmp(entry->e_name, name, len))
+ break;
+ }
+ return entry;
+}
+
+static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
+ unsigned int index, unsigned int len,
+ const char *name, struct f2fs_xattr_entry **xe,
+ void **base_addr)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ void *cur_addr, *txattr_addr, *last_addr = NULL;
+ nid_t xnid = F2FS_I(inode)->i_xattr_nid;
+ unsigned int size = xnid ? VALID_XATTR_BLOCK_SIZE : 0;
+ unsigned int inline_size = inline_xattr_size(inode);
+ int err = 0;
+
+ if (!size && !inline_size)
+ return -ENODATA;
+
+ txattr_addr = kzalloc(inline_size + size + XATTR_PADDING_SIZE,
+ GFP_F2FS_ZERO);
+ if (!txattr_addr)
+ return -ENOMEM;
+
+ /* read from inline xattr */
+ if (inline_size) {
+ struct page *page = NULL;
+ void *inline_addr;
+
+ if (ipage) {
+ inline_addr = inline_xattr_addr(ipage);
+ } else {
+ page = get_node_page(sbi, inode->i_ino);
+ if (IS_ERR(page)) {
+ err = PTR_ERR(page);
+ goto out;
+ }
+ inline_addr = inline_xattr_addr(page);
+ }
+ memcpy(txattr_addr, inline_addr, inline_size);
+ f2fs_put_page(page, 1);
+
+ *xe = __find_inline_xattr(txattr_addr, &last_addr,
+ index, len, name);
+ if (*xe)
+ goto check;
+ }
+
+ /* read from xattr node block */
+ if (xnid) {
+ struct page *xpage;
+ void *xattr_addr;
+
+ /* The inode already has an extended attribute block. */
+ xpage = get_node_page(sbi, xnid);
+ if (IS_ERR(xpage)) {
+ err = PTR_ERR(xpage);
+ goto out;
+ }
+
+ xattr_addr = page_address(xpage);
+ memcpy(txattr_addr + inline_size, xattr_addr, size);
+ f2fs_put_page(xpage, 1);
+ }
+
+ if (last_addr)
+ cur_addr = XATTR_HDR(last_addr) - 1;
+ else
+ cur_addr = txattr_addr;
+
+ *xe = __find_xattr(cur_addr, index, len, name);
+check:
+ if (IS_XATTR_LAST_ENTRY(*xe)) {
+ err = -ENODATA;
+ goto out;
+ }
+
+ *base_addr = txattr_addr;
+ return 0;
+out:
+ kzfree(txattr_addr);
+ return err;
+}
+
+static int read_all_xattrs(struct inode *inode, struct page *ipage,
+ void **base_addr)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_xattr_header *header;
- size_t size = PAGE_SIZE, inline_size = 0;
+ nid_t xnid = F2FS_I(inode)->i_xattr_nid;
+ unsigned int size = VALID_XATTR_BLOCK_SIZE;
+ unsigned int inline_size = inline_xattr_size(inode);
void *txattr_addr;
+ int err;
- inline_size = inline_xattr_size(inode);
-
- txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO);
+ txattr_addr = kzalloc(inline_size + size + XATTR_PADDING_SIZE,
+ GFP_F2FS_ZERO);
if (!txattr_addr)
- return NULL;
+ return -ENOMEM;
/* read from inline xattr */
if (inline_size) {
@@ -286,8 +395,10 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage)
inline_addr = inline_xattr_addr(ipage);
} else {
page = get_node_page(sbi, inode->i_ino);
- if (IS_ERR(page))
+ if (IS_ERR(page)) {
+ err = PTR_ERR(page);
goto fail;
+ }
inline_addr = inline_xattr_addr(page);
}
memcpy(txattr_addr, inline_addr, inline_size);
@@ -295,17 +406,19 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage)
}
/* read from xattr node block */
- if (F2FS_I(inode)->i_xattr_nid) {
+ if (xnid) {
struct page *xpage;
void *xattr_addr;
/* The inode already has an extended attribute block. */
- xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid);
- if (IS_ERR(xpage))
+ xpage = get_node_page(sbi, xnid);
+ if (IS_ERR(xpage)) {
+ err = PTR_ERR(xpage);
goto fail;
+ }
xattr_addr = page_address(xpage);
- memcpy(txattr_addr + inline_size, xattr_addr, PAGE_SIZE);
+ memcpy(txattr_addr + inline_size, xattr_addr, size);
f2fs_put_page(xpage, 1);
}
@@ -316,24 +429,23 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage)
header->h_magic = cpu_to_le32(F2FS_XATTR_MAGIC);
header->h_refcount = cpu_to_le32(1);
}
- return txattr_addr;
+ *base_addr = txattr_addr;
+ return 0;
fail:
kzfree(txattr_addr);
- return NULL;
+ return err;
}
static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
void *txattr_addr, struct page *ipage)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- size_t inline_size = 0;
+ size_t inline_size = inline_xattr_size(inode);
void *xattr_addr;
struct page *xpage;
nid_t new_nid = 0;
int err;
- inline_size = inline_xattr_size(inode);
-
if (hsize > inline_size && !F2FS_I(inode)->i_xattr_nid)
if (!alloc_nid(sbi, &new_nid))
return -ENOSPC;
@@ -345,7 +457,8 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
if (ipage) {
inline_addr = inline_xattr_addr(ipage);
- f2fs_wait_on_page_writeback(ipage, NODE);
+ f2fs_wait_on_page_writeback(ipage, NODE, true);
+ set_page_dirty(ipage);
} else {
page = get_node_page(sbi, inode->i_ino);
if (IS_ERR(page)) {
@@ -353,7 +466,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
return PTR_ERR(page);
}
inline_addr = inline_xattr_addr(page);
- f2fs_wait_on_page_writeback(page, NODE);
+ f2fs_wait_on_page_writeback(page, NODE, true);
}
memcpy(inline_addr, txattr_addr, inline_size);
f2fs_put_page(page, 1);
@@ -374,7 +487,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
return PTR_ERR(xpage);
}
f2fs_bug_on(sbi, new_nid);
- f2fs_wait_on_page_writeback(xpage, NODE);
+ f2fs_wait_on_page_writeback(xpage, NODE, true);
} else {
struct dnode_of_data dn;
set_new_dnode(&dn, inode, NULL, NULL, new_nid);
@@ -387,23 +500,20 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
}
xattr_addr = page_address(xpage);
- memcpy(xattr_addr, txattr_addr + inline_size, PAGE_SIZE -
- sizeof(struct node_footer));
+ memcpy(xattr_addr, txattr_addr + inline_size, VALID_XATTR_BLOCK_SIZE);
set_page_dirty(xpage);
f2fs_put_page(xpage, 1);
- /* need to checkpoint during fsync */
- F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi));
return 0;
}
int f2fs_getxattr(struct inode *inode, int index, const char *name,
- void *buffer, size_t buffer_size)
+ void *buffer, size_t buffer_size, struct page *ipage)
{
- struct f2fs_xattr_entry *entry;
- void *base_addr;
+ struct f2fs_xattr_entry *entry = NULL;
int error = 0;
- size_t size, len;
+ unsigned int size, len;
+ void *base_addr = NULL;
if (name == NULL)
return -EINVAL;
@@ -412,21 +522,16 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
if (len > F2FS_NAME_LEN)
return -ERANGE;
- base_addr = read_all_xattrs(inode, NULL);
- if (!base_addr)
- return -ENOMEM;
-
- entry = __find_xattr(base_addr, index, len, name);
- if (IS_XATTR_LAST_ENTRY(entry)) {
- error = -ENODATA;
- goto cleanup;
- }
+ error = lookup_all_xattrs(inode, ipage, index, len, name,
+ &entry, &base_addr);
+ if (error)
+ return error;
size = le16_to_cpu(entry->e_value_size);
if (buffer && size > buffer_size) {
error = -ERANGE;
- goto cleanup;
+ goto out;
}
if (buffer) {
@@ -434,23 +539,22 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
memcpy(buffer, pval, size);
}
error = size;
-
-cleanup:
+out:
kzfree(base_addr);
return error;
}
ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct f2fs_xattr_entry *entry;
void *base_addr;
int error = 0;
size_t rest = buffer_size;
- base_addr = read_all_xattrs(inode, NULL);
- if (!base_addr)
- return -ENOMEM;
+ error = read_all_xattrs(inode, NULL, &base_addr);
+ if (error)
+ return error;
list_for_each_xattr(entry, base_addr) {
const struct xattr_handler *handler =
@@ -477,17 +581,25 @@ cleanup:
return error;
}
+static bool f2fs_xattr_value_same(struct f2fs_xattr_entry *entry,
+ const void *value, size_t size)
+{
+ void *pval = entry->e_name + entry->e_name_len;
+
+ return (le16_to_cpu(entry->e_value_size) == size) &&
+ !memcmp(pval, value, size);
+}
+
static int __f2fs_setxattr(struct inode *inode, int index,
const char *name, const void *value, size_t size,
struct page *ipage, int flags)
{
- struct f2fs_inode_info *fi = F2FS_I(inode);
struct f2fs_xattr_entry *here, *last;
void *base_addr;
int found, newsize;
size_t len;
__u32 new_hsize;
- int error = -ENOMEM;
+ int error = 0;
if (name == NULL)
return -EINVAL;
@@ -497,24 +609,32 @@ static int __f2fs_setxattr(struct inode *inode, int index,
len = strlen(name);
- if (len > F2FS_NAME_LEN || size > MAX_VALUE_LEN(inode))
+ if (len > F2FS_NAME_LEN)
return -ERANGE;
- base_addr = read_all_xattrs(inode, ipage);
- if (!base_addr)
- goto exit;
+ if (size > MAX_VALUE_LEN(inode))
+ return -E2BIG;
+
+ error = read_all_xattrs(inode, ipage, &base_addr);
+ if (error)
+ return error;
/* find entry with wanted name. */
here = __find_xattr(base_addr, index, len, name);
found = IS_XATTR_LAST_ENTRY(here) ? 0 : 1;
- if ((flags & XATTR_REPLACE) && !found) {
+ if (found) {
+ if ((flags & XATTR_CREATE)) {
+ error = -EEXIST;
+ goto exit;
+ }
+
+ if (f2fs_xattr_value_same(here, value, size))
+ goto exit;
+ } else if ((flags & XATTR_REPLACE)) {
error = -ENODATA;
goto exit;
- } else if ((flags & XATTR_CREATE) && found) {
- error = -EEXIST;
- goto exit;
}
last = here;
@@ -535,7 +655,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
free = free + ENTRY_SIZE(here);
if (unlikely(free < newsize)) {
- error = -ENOSPC;
+ error = -E2BIG;
goto exit;
}
}
@@ -563,7 +683,6 @@ static int __f2fs_setxattr(struct inode *inode, int index,
* Before we come here, old entry is removed.
* We just write new entry.
*/
- memset(last, 0, newsize);
last->e_name_index = index;
last->e_name_len = len;
memcpy(last->e_name, name, len);
@@ -577,16 +696,17 @@ static int __f2fs_setxattr(struct inode *inode, int index,
if (error)
goto exit;
- if (is_inode_flag_set(fi, FI_ACL_MODE)) {
- inode->i_mode = fi->i_acl_mode;
- inode->i_ctime = CURRENT_TIME;
- clear_inode_flag(fi, FI_ACL_MODE);
+ if (is_inode_flag_set(inode, FI_ACL_MODE)) {
+ inode->i_mode = F2FS_I(inode)->i_acl_mode;
+ inode->i_ctime = current_time(inode);
+ clear_inode_flag(inode, FI_ACL_MODE);
}
-
- if (ipage)
- update_inode(inode, ipage);
- else
- update_inode_page(inode);
+ if (index == F2FS_XATTR_INDEX_ENCRYPTION &&
+ !strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT))
+ f2fs_set_encrypted_inode(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
+ if (!error && S_ISDIR(inode->i_mode))
+ set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP);
exit:
kzfree(base_addr);
return error;
@@ -603,7 +723,7 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
if (ipage)
return __f2fs_setxattr(inode, index, name, value,
size, ipage, flags);
- f2fs_balance_fs(sbi);
+ f2fs_balance_fs(sbi, true);
f2fs_lock_op(sbi);
/* protect xattr_ver */
@@ -612,5 +732,6 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
up_write(&F2FS_I(inode)->i_sem);
f2fs_unlock_op(sbi);
+ f2fs_update_time(sbi, REQ_TIME);
return err;
}
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index 34ab7dbcf5e3..08a4840d6d7d 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -35,6 +35,10 @@
#define F2FS_XATTR_INDEX_LUSTRE 5
#define F2FS_XATTR_INDEX_SECURITY 6
#define F2FS_XATTR_INDEX_ADVISE 7
+/* Should be same as EXT4_XATTR_INDEX_ENCRYPTION */
+#define F2FS_XATTR_INDEX_ENCRYPTION 9
+
+#define F2FS_XATTR_NAME_ENCRYPTION_CONTEXT "c"
struct f2fs_xattr_header {
__le32 h_magic; /* magic number for identification */
@@ -54,10 +58,10 @@ struct f2fs_xattr_entry {
#define XATTR_FIRST_ENTRY(ptr) (XATTR_ENTRY(XATTR_HDR(ptr) + 1))
#define XATTR_ROUND (3)
-#define XATTR_ALIGN(size) ((size + XATTR_ROUND) & ~XATTR_ROUND)
+#define XATTR_ALIGN(size) (((size) + XATTR_ROUND) & ~XATTR_ROUND)
#define ENTRY_SIZE(entry) (XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + \
- entry->e_name_len + le16_to_cpu(entry->e_value_size)))
+ (entry)->e_name_len + le16_to_cpu((entry)->e_value_size)))
#define XATTR_NEXT_ENTRY(entry) ((struct f2fs_xattr_entry *)((char *)(entry) +\
ENTRY_SIZE(entry)))
@@ -68,9 +72,10 @@ struct f2fs_xattr_entry {
for (entry = XATTR_FIRST_ENTRY(addr);\
!IS_XATTR_LAST_ENTRY(entry);\
entry = XATTR_NEXT_ENTRY(entry))
-
-#define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + PAGE_SIZE - \
- sizeof(struct node_footer) - sizeof(__u32))
+#define VALID_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct node_footer))
+#define XATTR_PADDING_SIZE (sizeof(__u32))
+#define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + \
+ VALID_XATTR_BLOCK_SIZE)
#define MAX_VALUE_LEN(i) (MIN_OFFSET(i) - \
sizeof(struct f2fs_xattr_header) - \
@@ -115,18 +120,21 @@ extern const struct xattr_handler *f2fs_xattr_handlers[];
extern int f2fs_setxattr(struct inode *, int, const char *,
const void *, size_t, struct page *, int);
-extern int f2fs_getxattr(struct inode *, int, const char *, void *, size_t);
+extern int f2fs_getxattr(struct inode *, int, const char *, void *,
+ size_t, struct page *);
extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t);
#else
#define f2fs_xattr_handlers NULL
static inline int f2fs_setxattr(struct inode *inode, int index,
- const char *name, const void *value, size_t size, int flags)
+ const char *name, const void *value, size_t size,
+ struct page *page, int flags)
{
return -EOPNOTSUPP;
}
static inline int f2fs_getxattr(struct inode *inode, int index,
- const char *name, void *buffer, size_t buffer_size)
+ const char *name, void *buffer,
+ size_t buffer_size, struct page *dpage)
{
return -EOPNOTSUPP;
}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index a07634599cd7..618e0095127d 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1187,7 +1187,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
if ((inode->i_state & flags) == flags)
return;
- if (unlikely(block_dump))
+ if (unlikely(block_dump > 1))
block_dump___mark_inode_dirty(inode);
spin_lock(&inode->i_lock);
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 7dca743b2ce1..940c683561dd 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -44,6 +44,7 @@ void set_fs_pwd(struct fs_struct *fs, const struct path *path)
if (old_pwd.dentry)
path_put(&old_pwd);
}
+EXPORT_SYMBOL(set_fs_pwd);
static inline int replace_path(struct path *p, const struct path *old, const struct path *new)
{
@@ -89,6 +90,7 @@ void free_fs_struct(struct fs_struct *fs)
path_put(&fs->pwd);
kmem_cache_free(fs_cachep, fs);
}
+EXPORT_SYMBOL(free_fs_struct);
void exit_fs(struct task_struct *tsk)
{
@@ -127,6 +129,7 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
}
return fs;
}
+EXPORT_SYMBOL_GPL(copy_fs_struct);
int unshare_fs_struct(void)
{
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index f2bbb8513360..54dd9fd7cdeb 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -13,6 +13,7 @@
#include <linux/poll.h>
#include <linux/uio.h>
#include <linux/miscdevice.h>
+#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/slab.h>
@@ -20,6 +21,7 @@
#include <linux/swap.h>
#include <linux/splice.h>
#include <linux/aio.h>
+#include <linux/freezer.h>
MODULE_ALIAS_MISCDEV(FUSE_MINOR);
MODULE_ALIAS("devname:fuse");
@@ -464,7 +466,10 @@ __acquires(fc->lock)
* Wait it out.
*/
spin_unlock(&fc->lock);
- wait_event(req->waitq, req->state == FUSE_REQ_FINISHED);
+
+ while (req->state != FUSE_REQ_FINISHED)
+ wait_event_freezable(req->waitq,
+ req->state == FUSE_REQ_FINISHED);
spin_lock(&fc->lock);
if (!req->aborted)
@@ -1864,6 +1869,10 @@ static ssize_t fuse_dev_do_write(struct fuse_conn *fc,
spin_unlock(&fc->lock);
err = copy_out_args(cs, &req->out, nbytes);
+ if (req->in.h.opcode == FUSE_CANONICAL_PATH) {
+ req->out.h.error = kern_path((char *)req->out.args[0].value, 0,
+ req->canonical_path);
+ }
fuse_copy_finish(cs);
spin_lock(&fc->lock);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 015e21edd6bc..b555c4f6cbc0 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -277,6 +277,50 @@ invalid:
goto out;
}
+/*
+ * Get the canonical path. Since we must translate to a path, this must be done
+ * in the context of the userspace daemon, however, the userspace daemon cannot
+ * look up paths on its own. Instead, we handle the lookup as a special case
+ * inside of the write request.
+ */
+static void fuse_dentry_canonical_path(const struct path *path, struct path *canonical_path) {
+ struct inode *inode = path->dentry->d_inode;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_req *req;
+ int err;
+ char *path_name;
+
+ req = fuse_get_req(fc, 1);
+ err = PTR_ERR(req);
+ if (IS_ERR(req))
+ goto default_path;
+
+ path_name = (char*)__get_free_page(GFP_KERNEL);
+ if (!path_name) {
+ fuse_put_request(fc, req);
+ goto default_path;
+ }
+
+ req->in.h.opcode = FUSE_CANONICAL_PATH;
+ req->in.h.nodeid = get_node_id(inode);
+ req->in.numargs = 0;
+ req->out.numargs = 1;
+ req->out.args[0].size = PATH_MAX;
+ req->out.args[0].value = path_name;
+ req->canonical_path = canonical_path;
+ req->out.argvar = 1;
+ fuse_request_send(fc, req);
+ err = req->out.h.error;
+ fuse_put_request(fc, req);
+ free_page((unsigned long)path_name);
+ if (!err)
+ return;
+default_path:
+ canonical_path->dentry = path->dentry;
+ canonical_path->mnt = path->mnt;
+ path_get(canonical_path);
+}
+
static int invalid_nodeid(u64 nodeid)
{
return !nodeid || nodeid == FUSE_ROOT_ID;
@@ -284,6 +328,7 @@ static int invalid_nodeid(u64 nodeid)
const struct dentry_operations fuse_dentry_operations = {
.d_revalidate = fuse_dentry_revalidate,
+ .d_canonical_path = fuse_dentry_canonical_path,
};
int fuse_valid_type(int m)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 300619ba8591..309c28f7a86c 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -351,6 +351,9 @@ struct fuse_req {
/** Inode used in the request or NULL */
struct inode *inode;
+ /** Path used for completing d_canonical_path */
+ struct path *canonical_path;
+
/** AIO control block */
struct fuse_io_priv *io;
diff --git a/fs/inode.c b/fs/inode.c
index 56d1d2b4bf31..da55347c44a2 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1598,7 +1598,7 @@ int should_remove_suid(struct dentry *dentry)
}
EXPORT_SYMBOL(should_remove_suid);
-static int __remove_suid(struct dentry *dentry, int kill)
+static int __remove_suid(struct vfsmount *mnt, struct dentry *dentry, int kill)
{
struct iattr newattrs;
@@ -1607,7 +1607,7 @@ static int __remove_suid(struct dentry *dentry, int kill)
* Note we call this on write, so notify_change will not
* encounter any conflicting delegations:
*/
- return notify_change(dentry, &newattrs, NULL);
+ return notify_change2(mnt, dentry, &newattrs, NULL);
}
int file_remove_suid(struct file *file)
@@ -1630,7 +1630,7 @@ int file_remove_suid(struct file *file)
if (killpriv)
error = security_inode_killpriv(dentry);
if (!error && killsuid)
- error = __remove_suid(dentry, killsuid);
+ error = __remove_suid(file->f_path.mnt, dentry, killsuid);
if (!error)
inode_has_no_xattr(inode);
diff --git a/fs/internal.h b/fs/internal.h
index 53279bd90b72..1c4822d292e9 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -83,9 +83,11 @@ extern struct file *get_empty_filp(void);
* super.c
*/
extern int do_remount_sb(struct super_block *, int, void *, int);
+extern int do_remount_sb2(struct vfsmount *, struct super_block *, int,
+ void *, int);
extern bool grab_super_passive(struct super_block *sb);
extern struct dentry *mount_fs(struct file_system_type *,
- int, const char *, void *);
+ int, const char *, struct vfsmount *, void *);
extern struct super_block *user_get_super(dev_t);
/*
diff --git a/fs/mpage.c b/fs/mpage.c
index 3e79220babac..58440cff4189 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -30,6 +30,14 @@
#include <linux/cleancache.h>
#include "internal.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/android_fs.h>
+
+EXPORT_TRACEPOINT_SYMBOL(android_fs_datawrite_start);
+EXPORT_TRACEPOINT_SYMBOL(android_fs_datawrite_end);
+EXPORT_TRACEPOINT_SYMBOL(android_fs_dataread_start);
+EXPORT_TRACEPOINT_SYMBOL(android_fs_dataread_end);
+
/*
* I/O completion handler for multipage BIOs.
*
@@ -47,6 +55,16 @@ static void mpage_end_io(struct bio *bio, int err)
struct bio_vec *bv;
int i;
+ if (trace_android_fs_dataread_end_enabled() &&
+ (bio_data_dir(bio) == READ)) {
+ struct page *first_page = bio->bi_io_vec[0].bv_page;
+
+ if (first_page != NULL)
+ trace_android_fs_dataread_end(first_page->mapping->host,
+ page_offset(first_page),
+ bio->bi_iter.bi_size);
+ }
+
bio_for_each_segment_all(bv, bio, i) {
struct page *page = bv->bv_page;
page_endio(page, bio_data_dir(bio), err);
@@ -57,6 +75,24 @@ static void mpage_end_io(struct bio *bio, int err)
static struct bio *mpage_bio_submit(int rw, struct bio *bio)
{
+ if (trace_android_fs_dataread_start_enabled() && (rw == READ)) {
+ struct page *first_page = bio->bi_io_vec[0].bv_page;
+
+ if (first_page != NULL) {
+ char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+ path = android_fstrace_get_pathname(pathbuf,
+ MAX_TRACE_PATHBUF_LEN,
+ first_page->mapping->host);
+ trace_android_fs_dataread_start(
+ first_page->mapping->host,
+ page_offset(first_page),
+ bio->bi_iter.bi_size,
+ current->pid,
+ path,
+ current->comm);
+ }
+ }
bio->bi_end_io = mpage_end_io;
guard_bio_eod(rw, bio);
submit_bio(rw, bio);
diff --git a/fs/namei.c b/fs/namei.c
index e1976450a1e2..ffae3bda3679 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -368,9 +368,11 @@ EXPORT_SYMBOL(generic_permission);
* flag in inode->i_opflags, that says "this has not special
* permission function, use the fast case".
*/
-static inline int do_inode_permission(struct inode *inode, int mask)
+static inline int do_inode_permission(struct vfsmount *mnt, struct inode *inode, int mask)
{
if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
+ if (likely(mnt && inode->i_op->permission2))
+ return inode->i_op->permission2(mnt, inode, mask);
if (likely(inode->i_op->permission))
return inode->i_op->permission(inode, mask);
@@ -394,7 +396,7 @@ static inline int do_inode_permission(struct inode *inode, int mask)
* This does not check for a read-only file system. You probably want
* inode_permission().
*/
-int __inode_permission(struct inode *inode, int mask)
+int __inode_permission2(struct vfsmount *mnt, struct inode *inode, int mask)
{
int retval;
@@ -406,7 +408,7 @@ int __inode_permission(struct inode *inode, int mask)
return -EACCES;
}
- retval = do_inode_permission(inode, mask);
+ retval = do_inode_permission(mnt, inode, mask);
if (retval)
return retval;
@@ -414,7 +416,14 @@ int __inode_permission(struct inode *inode, int mask)
if (retval)
return retval;
- return security_inode_permission(inode, mask);
+ retval = security_inode_permission(inode, mask);
+ return retval;
+}
+EXPORT_SYMBOL(__inode_permission2);
+
+int __inode_permission(struct inode *inode, int mask)
+{
+ return __inode_permission2(NULL, inode, mask);
}
EXPORT_SYMBOL(__inode_permission);
@@ -450,14 +459,20 @@ static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
*
* When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
*/
-int inode_permission(struct inode *inode, int mask)
+int inode_permission2(struct vfsmount *mnt, struct inode *inode, int mask)
{
int retval;
retval = sb_permission(inode->i_sb, inode, mask);
if (retval)
return retval;
- return __inode_permission(inode, mask);
+ return __inode_permission2(mnt, inode, mask);
+}
+EXPORT_SYMBOL(inode_permission2);
+
+int inode_permission(struct inode *inode, int mask)
+{
+ return inode_permission2(NULL, inode, mask);
}
EXPORT_SYMBOL(inode_permission);
@@ -1512,13 +1527,13 @@ static int lookup_slow(struct nameidata *nd, struct path *path)
static inline int may_lookup(struct nameidata *nd)
{
if (nd->flags & LOOKUP_RCU) {
- int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
+ int err = inode_permission2(nd->path.mnt, nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
if (err != -ECHILD)
return err;
if (unlazy_walk(nd, NULL))
return -ECHILD;
}
- return inode_permission(nd->inode, MAY_EXEC);
+ return inode_permission2(nd->path.mnt, nd->inode, MAY_EXEC);
}
static inline int handle_dots(struct nameidata *nd, int type)
@@ -1856,11 +1871,12 @@ static int path_init(int dfd, const char *name, unsigned int flags,
nd->depth = 0;
if (flags & LOOKUP_ROOT) {
struct dentry *root = nd->root.dentry;
+ struct vfsmount *mnt = nd->root.mnt;
struct inode *inode = root->d_inode;
if (*name) {
if (!d_can_lookup(root))
return -ENOTDIR;
- retval = inode_permission(inode, MAY_EXEC);
+ retval = inode_permission2(mnt, inode, MAY_EXEC);
if (retval)
return retval;
}
@@ -2114,6 +2130,7 @@ static struct dentry *lookup_hash(struct nameidata *nd)
/**
* lookup_one_len - filesystem helper to lookup single pathname component
* @name: pathname component to lookup
+ * @mnt: mount we are looking up on
* @base: base directory to lookup from
* @len: maximum length @len should be interpreted to
*
@@ -2122,7 +2139,7 @@ static struct dentry *lookup_hash(struct nameidata *nd)
* nameidata argument is passed to the filesystem methods and a filesystem
* using this helper needs to be prepared for that.
*/
-struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
+struct dentry *lookup_one_len2(const char *name, struct vfsmount *mnt, struct dentry *base, int len)
{
struct qstr this;
unsigned int c;
@@ -2156,12 +2173,18 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
return ERR_PTR(err);
}
- err = inode_permission(base->d_inode, MAY_EXEC);
+ err = inode_permission2(mnt, base->d_inode, MAY_EXEC);
if (err)
return ERR_PTR(err);
return __lookup_hash(&this, base, 0);
}
+EXPORT_SYMBOL(lookup_one_len2);
+
+struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
+{
+ return lookup_one_len2(name, NULL, base, len);
+}
EXPORT_SYMBOL(lookup_one_len);
int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
@@ -2441,7 +2464,7 @@ EXPORT_SYMBOL(__check_sticky);
* 10. We don't allow removal of NFS sillyrenamed files; it's handled by
* nfs_async_unlink().
*/
-static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
+static int may_delete(struct vfsmount *mnt, struct inode *dir, struct dentry *victim, bool isdir)
{
struct inode *inode = victim->d_inode;
int error;
@@ -2453,7 +2476,7 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
BUG_ON(victim->d_parent->d_inode != dir);
audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
- error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+ error = inode_permission2(mnt, dir, MAY_WRITE | MAY_EXEC);
if (error)
return error;
if (IS_APPEND(dir))
@@ -2484,14 +2507,14 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
* 3. We should have write and exec permissions on dir
* 4. We can't do it if dir is immutable (done in permission())
*/
-static inline int may_create(struct inode *dir, struct dentry *child)
+static inline int may_create(struct vfsmount *mnt, struct inode *dir, struct dentry *child)
{
audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
if (child->d_inode)
return -EEXIST;
if (IS_DEADDIR(dir))
return -ENOENT;
- return inode_permission(dir, MAY_WRITE | MAY_EXEC);
+ return inode_permission2(mnt, dir, MAY_WRITE | MAY_EXEC);
}
/*
@@ -2538,10 +2561,10 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
}
EXPORT_SYMBOL(unlock_rename);
-int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- bool want_excl)
+int vfs_create2(struct vfsmount *mnt, struct inode *dir, struct dentry *dentry,
+ umode_t mode, bool want_excl)
{
- int error = may_create(dir, dentry);
+ int error = may_create(mnt, dir, dentry);
if (error)
return error;
@@ -2557,11 +2580,19 @@ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
fsnotify_create(dir, dentry);
return error;
}
+EXPORT_SYMBOL(vfs_create2);
+
+int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+ bool want_excl)
+{
+ return vfs_create2(NULL, dir, dentry, mode, want_excl);
+}
EXPORT_SYMBOL(vfs_create);
static int may_open(struct path *path, int acc_mode, int flag)
{
struct dentry *dentry = path->dentry;
+ struct vfsmount *mnt = path->mnt;
struct inode *inode = dentry->d_inode;
int error;
@@ -2590,7 +2621,7 @@ static int may_open(struct path *path, int acc_mode, int flag)
break;
}
- error = inode_permission(inode, acc_mode);
+ error = inode_permission2(mnt, inode, acc_mode);
if (error)
return error;
@@ -2625,7 +2656,7 @@ static int handle_truncate(struct file *filp)
if (!error)
error = security_path_truncate(path);
if (!error) {
- error = do_truncate(path->dentry, 0,
+ error = do_truncate2(path->mnt, path->dentry, 0,
ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
filp);
}
@@ -2646,7 +2677,7 @@ static int may_o_create(struct path *dir, struct dentry *dentry, umode_t mode)
if (error)
return error;
- error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC);
+ error = inode_permission2(dir->mnt, dir->dentry->d_inode, MAY_WRITE | MAY_EXEC);
if (error)
return error;
@@ -2832,6 +2863,7 @@ static int lookup_open(struct nameidata *nd, struct path *path,
bool got_write, int *opened)
{
struct dentry *dir = nd->path.dentry;
+ struct vfsmount *mnt = nd->path.mnt;
struct inode *dir_inode = dir->d_inode;
struct dentry *dentry;
int error;
@@ -2879,7 +2911,7 @@ static int lookup_open(struct nameidata *nd, struct path *path,
error = security_path_mknod(&nd->path, dentry, mode, 0);
if (error)
goto out_dput;
- error = vfs_create(dir->d_inode, dentry, mode,
+ error = vfs_create2(mnt, dir->d_inode, dentry, mode,
nd->flags & LOOKUP_EXCL);
if (error)
goto out_dput;
@@ -3151,7 +3183,7 @@ static int do_tmpfile(int dfd, struct filename *pathname,
if (unlikely(error))
goto out;
/* we want directory to be writable */
- error = inode_permission(nd->inode, MAY_WRITE | MAY_EXEC);
+ error = inode_permission2(nd->path.mnt, nd->inode, MAY_WRITE | MAY_EXEC);
if (error)
goto out2;
dentry = nd->path.dentry;
@@ -3399,9 +3431,9 @@ struct dentry *user_path_create(int dfd, const char __user *pathname,
}
EXPORT_SYMBOL(user_path_create);
-int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+int vfs_mknod2(struct vfsmount *mnt, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
{
- int error = may_create(dir, dentry);
+ int error = may_create(mnt, dir, dentry);
if (error)
return error;
@@ -3425,6 +3457,12 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
fsnotify_create(dir, dentry);
return error;
}
+EXPORT_SYMBOL(vfs_mknod2);
+
+int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+{
+ return vfs_mknod2(NULL, dir, dentry, mode, dev);
+}
EXPORT_SYMBOL(vfs_mknod);
static int may_mknod(umode_t mode)
@@ -3467,10 +3505,10 @@ retry:
goto out;
switch (mode & S_IFMT) {
case 0: case S_IFREG:
- error = vfs_create(path.dentry->d_inode,dentry,mode,true);
+ error = vfs_create2(path.mnt, path.dentry->d_inode,dentry,mode,true);
break;
case S_IFCHR: case S_IFBLK:
- error = vfs_mknod(path.dentry->d_inode,dentry,mode,
+ error = vfs_mknod2(path.mnt, path.dentry->d_inode,dentry,mode,
new_decode_dev(dev));
break;
case S_IFIFO: case S_IFSOCK:
@@ -3491,9 +3529,9 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d
return sys_mknodat(AT_FDCWD, filename, mode, dev);
}
-int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+int vfs_mkdir2(struct vfsmount *mnt, struct inode *dir, struct dentry *dentry, umode_t mode)
{
- int error = may_create(dir, dentry);
+ int error = may_create(mnt, dir, dentry);
unsigned max_links = dir->i_sb->s_max_links;
if (error)
@@ -3515,6 +3553,12 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
fsnotify_mkdir(dir, dentry);
return error;
}
+EXPORT_SYMBOL(vfs_mkdir2);
+
+int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ return vfs_mkdir2(NULL, dir, dentry, mode);
+}
EXPORT_SYMBOL(vfs_mkdir);
SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
@@ -3533,7 +3577,7 @@ retry:
mode &= ~current_umask();
error = security_path_mkdir(&path, dentry, mode);
if (!error)
- error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
+ error = vfs_mkdir2(path.mnt, path.dentry->d_inode, dentry, mode);
done_path_create(&path, dentry);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
@@ -3572,9 +3616,9 @@ void dentry_unhash(struct dentry *dentry)
}
EXPORT_SYMBOL(dentry_unhash);
-int vfs_rmdir(struct inode *dir, struct dentry *dentry)
+int vfs_rmdir2(struct vfsmount *mnt, struct inode *dir, struct dentry *dentry)
{
- int error = may_delete(dir, dentry, 1);
+ int error = may_delete(mnt, dir, dentry, 1);
if (error)
return error;
@@ -3609,6 +3653,11 @@ out:
d_delete(dentry);
return error;
}
+EXPORT_SYMBOL(vfs_rmdir2);
+int vfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ return vfs_rmdir2(NULL, dir, dentry);
+}
EXPORT_SYMBOL(vfs_rmdir);
static long do_rmdir(int dfd, const char __user *pathname)
@@ -3652,7 +3701,7 @@ retry:
error = security_path_rmdir(&nd.path, dentry);
if (error)
goto exit3;
- error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
+ error = vfs_rmdir2(nd.path.mnt, nd.path.dentry->d_inode, dentry);
exit3:
dput(dentry);
exit2:
@@ -3691,10 +3740,10 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
* be appropriate for callers that expect the underlying filesystem not
* to be NFS exported.
*/
-int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode)
+int vfs_unlink2(struct vfsmount *mnt, struct inode *dir, struct dentry *dentry, struct inode **delegated_inode)
{
struct inode *target = dentry->d_inode;
- int error = may_delete(dir, dentry, 0);
+ int error = may_delete(mnt, dir, dentry, 0);
if (error)
return error;
@@ -3729,6 +3778,12 @@ out:
return error;
}
+EXPORT_SYMBOL(vfs_unlink2);
+
+int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode)
+{
+ return vfs_unlink2(NULL, dir, dentry, delegated_inode);
+}
EXPORT_SYMBOL(vfs_unlink);
/*
@@ -3774,7 +3829,7 @@ retry_deleg:
error = security_path_unlink(&nd.path, dentry);
if (error)
goto exit2;
- error = vfs_unlink(nd.path.dentry->d_inode, dentry, &delegated_inode);
+ error = vfs_unlink2(nd.path.mnt, nd.path.dentry->d_inode, dentry, &delegated_inode);
exit2:
dput(dentry);
}
@@ -3824,9 +3879,9 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname)
return do_unlinkat(AT_FDCWD, pathname);
}
-int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
+int vfs_symlink2(struct vfsmount *mnt, struct inode *dir, struct dentry *dentry, const char *oldname)
{
- int error = may_create(dir, dentry);
+ int error = may_create(mnt, dir, dentry);
if (error)
return error;
@@ -3843,6 +3898,12 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
fsnotify_create(dir, dentry);
return error;
}
+EXPORT_SYMBOL(vfs_symlink2);
+
+int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
+{
+ return vfs_symlink2(NULL, dir, dentry, oldname);
+}
EXPORT_SYMBOL(vfs_symlink);
SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
@@ -3865,7 +3926,7 @@ retry:
error = security_path_symlink(&path, dentry, from->name);
if (!error)
- error = vfs_symlink(path.dentry->d_inode, dentry, from->name);
+ error = vfs_symlink2(path.mnt, path.dentry->d_inode, dentry, from->name);
done_path_create(&path, dentry);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
@@ -3900,7 +3961,7 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn
* be appropriate for callers that expect the underlying filesystem not
* to be NFS exported.
*/
-int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode)
+int vfs_link2(struct vfsmount *mnt, struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode)
{
struct inode *inode = old_dentry->d_inode;
unsigned max_links = dir->i_sb->s_max_links;
@@ -3909,7 +3970,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
if (!inode)
return -ENOENT;
- error = may_create(dir, new_dentry);
+ error = may_create(mnt, dir, new_dentry);
if (error)
return error;
@@ -3952,6 +4013,12 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
fsnotify_link(dir, inode, new_dentry);
return error;
}
+EXPORT_SYMBOL(vfs_link2);
+
+int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode)
+{
+ return vfs_link2(NULL, old_dentry, dir, new_dentry, delegated_inode);
+}
EXPORT_SYMBOL(vfs_link);
/*
@@ -4007,7 +4074,7 @@ retry:
error = security_path_link(old_path.dentry, &new_path, new_dentry);
if (error)
goto out_dput;
- error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode);
+ error = vfs_link2(old_path.mnt, old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode);
out_dput:
done_path_create(&new_path, new_dentry);
if (delegated_inode) {
@@ -4082,7 +4149,8 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
* ->i_mutex on parents, which works but leads to some truly excessive
* locking].
*/
-int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+int vfs_rename2(struct vfsmount *mnt,
+ struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
struct inode **delegated_inode, unsigned int flags)
{
@@ -4097,19 +4165,19 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (source == target)
return 0;
- error = may_delete(old_dir, old_dentry, is_dir);
+ error = may_delete(mnt, old_dir, old_dentry, is_dir);
if (error)
return error;
if (!target) {
- error = may_create(new_dir, new_dentry);
+ error = may_create(mnt, new_dir, new_dentry);
} else {
new_is_dir = d_is_dir(new_dentry);
if (!(flags & RENAME_EXCHANGE))
- error = may_delete(new_dir, new_dentry, is_dir);
+ error = may_delete(mnt, new_dir, new_dentry, is_dir);
else
- error = may_delete(new_dir, new_dentry, new_is_dir);
+ error = may_delete(mnt, new_dir, new_dentry, new_is_dir);
}
if (error)
return error;
@@ -4126,12 +4194,12 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
*/
if (new_dir != old_dir) {
if (is_dir) {
- error = inode_permission(source, MAY_WRITE);
+ error = inode_permission2(mnt, source, MAY_WRITE);
if (error)
return error;
}
if ((flags & RENAME_EXCHANGE) && new_is_dir) {
- error = inode_permission(target, MAY_WRITE);
+ error = inode_permission2(mnt, target, MAY_WRITE);
if (error)
return error;
}
@@ -4214,6 +4282,14 @@ out:
return error;
}
+EXPORT_SYMBOL(vfs_rename2);
+
+int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ struct inode **delegated_inode, unsigned int flags)
+{
+ return vfs_rename2(NULL, old_dir, old_dentry, new_dir, new_dentry, delegated_inode, flags);
+}
EXPORT_SYMBOL(vfs_rename);
SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
@@ -4328,7 +4404,7 @@ retry_deleg:
&newnd.path, new_dentry, flags);
if (error)
goto exit5;
- error = vfs_rename(old_dir->d_inode, old_dentry,
+ error = vfs_rename2(oldnd.path.mnt, old_dir->d_inode, old_dentry,
new_dir->d_inode, new_dentry,
&delegated_inode, flags);
exit5:
@@ -4373,7 +4449,7 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna
int vfs_whiteout(struct inode *dir, struct dentry *dentry)
{
- int error = may_create(dir, dentry);
+ int error = may_create(NULL, dir, dentry);
if (error)
return error;
diff --git a/fs/namespace.c b/fs/namespace.c
index d0cd3f4012ec..b3c5331dbb2f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -568,6 +568,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
static void free_vfsmnt(struct mount *mnt)
{
+ kfree(mnt->mnt.data);
kfree(mnt->mnt_devname);
#ifdef CONFIG_SMP
free_percpu(mnt->mnt_pcp);
@@ -901,11 +902,21 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
if (!mnt)
return ERR_PTR(-ENOMEM);
+ mnt->mnt.data = NULL;
+ if (type->alloc_mnt_data) {
+ mnt->mnt.data = type->alloc_mnt_data();
+ if (!mnt->mnt.data) {
+ mnt_free_id(mnt);
+ free_vfsmnt(mnt);
+ return ERR_PTR(-ENOMEM);
+ }
+ }
if (flags & MS_KERNMOUNT)
mnt->mnt.mnt_flags = MNT_INTERNAL;
- root = mount_fs(type, flags, name, data);
+ root = mount_fs(type, flags, name, &mnt->mnt, data);
if (IS_ERR(root)) {
+ kfree(mnt->mnt.data);
mnt_free_id(mnt);
free_vfsmnt(mnt);
return ERR_CAST(root);
@@ -933,6 +944,14 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
if (!mnt)
return ERR_PTR(-ENOMEM);
+ if (sb->s_op->clone_mnt_data) {
+ mnt->mnt.data = sb->s_op->clone_mnt_data(old->mnt.data);
+ if (!mnt->mnt.data) {
+ err = -ENOMEM;
+ goto out_free;
+ }
+ }
+
if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
mnt->mnt_group_id = 0; /* not a peer of original */
else
@@ -1000,6 +1019,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
return mnt;
out_free:
+ kfree(mnt->mnt.data);
mnt_free_id(mnt);
free_vfsmnt(mnt);
return ERR_PTR(err);
@@ -2138,8 +2158,14 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
err = change_mount_flags(path->mnt, flags);
else if (!capable(CAP_SYS_ADMIN))
err = -EPERM;
- else
- err = do_remount_sb(sb, flags, data, 0);
+ else {
+ err = do_remount_sb2(path->mnt, sb, flags, data, 0);
+ namespace_lock();
+ lock_mount_hash();
+ propagate_remount(mnt);
+ unlock_mount_hash();
+ namespace_unlock();
+ }
if (!err) {
lock_mount_hash();
mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index f2639f5724e8..49de994fb7b7 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -488,7 +488,7 @@ static int fanotify_find_path(int dfd, const char __user *filename,
}
/* you can only watch an inode if you have read permissions on it */
- ret = inode_permission(path->dentry->d_inode, MAY_READ);
+ ret = inode_permission2(path->mnt, path->dentry->d_inode, MAY_READ);
if (ret)
path_put(path);
out:
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index daf76652fe58..b2ddc882dac9 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -338,7 +338,7 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns
if (error)
return error;
/* you can only watch an inode if you have read permissions on it */
- error = inode_permission(path->dentry->d_inode, MAY_READ);
+ error = inode_permission2(path->mnt, path->dentry->d_inode, MAY_READ);
if (error)
path_put(path);
return error;
@@ -703,6 +703,8 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
struct fsnotify_group *group;
struct inode *inode;
struct path path;
+ struct path alteredpath;
+ struct path *canonical_path = &path;
struct fd f;
int ret;
unsigned flags = 0;
@@ -730,13 +732,22 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
if (ret)
goto fput_and_out;
+ /* support stacked filesystems */
+ if(path.dentry && path.dentry->d_op) {
+ if (path.dentry->d_op->d_canonical_path) {
+ path.dentry->d_op->d_canonical_path(&path, &alteredpath);
+ canonical_path = &alteredpath;
+ path_put(&path);
+ }
+ }
+
/* inode held in place by reference to path; group by fget on fd */
- inode = path.dentry->d_inode;
+ inode = canonical_path->dentry->d_inode;
group = f.file->private_data;
/* create/update an inode mark */
ret = inotify_update_watch(group, inode, mask);
- path_put(&path);
+ path_put(canonical_path);
fput_and_out:
fdput(f);
return ret;
diff --git a/fs/open.c b/fs/open.c
index 1651f35d50f5..d7e0e1b8b0cf 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -34,8 +34,8 @@
#include "internal.h"
-int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
- struct file *filp)
+int do_truncate2(struct vfsmount *mnt, struct dentry *dentry, loff_t length,
+ unsigned int time_attrs, struct file *filp)
{
int ret;
struct iattr newattrs;
@@ -58,17 +58,24 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
mutex_lock(&dentry->d_inode->i_mutex);
/* Note any delegations or leases have already been broken: */
- ret = notify_change(dentry, &newattrs, NULL);
+ ret = notify_change2(mnt, dentry, &newattrs, NULL);
mutex_unlock(&dentry->d_inode->i_mutex);
return ret;
}
+int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
+ struct file *filp)
+{
+ return do_truncate2(NULL, dentry, length, time_attrs, filp);
+}
long vfs_truncate(struct path *path, loff_t length)
{
struct inode *inode;
+ struct vfsmount *mnt;
long error;
inode = path->dentry->d_inode;
+ mnt = path->mnt;
/* For directories it's -EISDIR, for other non-regulars - -EINVAL */
if (S_ISDIR(inode->i_mode))
@@ -80,7 +87,7 @@ long vfs_truncate(struct path *path, loff_t length)
if (error)
goto out;
- error = inode_permission(inode, MAY_WRITE);
+ error = inode_permission2(mnt, inode, MAY_WRITE);
if (error)
goto mnt_drop_write_and_out;
@@ -104,7 +111,7 @@ long vfs_truncate(struct path *path, loff_t length)
if (!error)
error = security_path_truncate(path);
if (!error)
- error = do_truncate(path->dentry, length, 0, NULL);
+ error = do_truncate2(mnt, path->dentry, length, 0, NULL);
put_write_and_out:
put_write_access(inode);
@@ -153,6 +160,7 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
{
struct inode *inode;
struct dentry *dentry;
+ struct vfsmount *mnt;
struct fd f;
int error;
@@ -169,6 +177,7 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
small = 0;
dentry = f.file->f_path.dentry;
+ mnt = f.file->f_path.mnt;
inode = dentry->d_inode;
error = -EINVAL;
if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE))
@@ -188,7 +197,7 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
if (!error)
error = security_path_truncate(&f.file->f_path);
if (!error)
- error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, f.file);
+ error = do_truncate2(mnt, dentry, length, ATTR_MTIME|ATTR_CTIME, f.file);
sb_end_write(inode->i_sb);
out_putf:
fdput(f);
@@ -322,6 +331,7 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
struct cred *override_cred;
struct path path;
struct inode *inode;
+ struct vfsmount *mnt;
int res;
unsigned int lookup_flags = LOOKUP_FOLLOW;
@@ -352,6 +362,7 @@ retry:
goto out;
inode = path.dentry->d_inode;
+ mnt = path.mnt;
if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) {
/*
@@ -363,7 +374,7 @@ retry:
goto out_path_release;
}
- res = inode_permission(inode, mode | MAY_ACCESS);
+ res = inode_permission2(mnt, inode, mode | MAY_ACCESS);
/* SuS v2 requires we report a read only fs too */
if (res || !(mode & S_IWOTH) || special_file(inode->i_mode))
goto out_path_release;
@@ -407,7 +418,7 @@ retry:
if (error)
goto out;
- error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
+ error = inode_permission2(path.mnt, path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
if (error)
goto dput_and_out;
@@ -427,6 +438,7 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd)
{
struct fd f = fdget_raw(fd);
struct inode *inode;
+ struct vfsmount *mnt;
int error = -EBADF;
error = -EBADF;
@@ -434,12 +446,13 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd)
goto out;
inode = file_inode(f.file);
+ mnt = f.file->f_path.mnt;
error = -ENOTDIR;
if (!S_ISDIR(inode->i_mode))
goto out_putf;
- error = inode_permission(inode, MAY_EXEC | MAY_CHDIR);
+ error = inode_permission2(mnt, inode, MAY_EXEC | MAY_CHDIR);
if (!error)
set_fs_pwd(current->fs, &f.file->f_path);
out_putf:
@@ -458,7 +471,7 @@ retry:
if (error)
goto out;
- error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
+ error = inode_permission2(path.mnt, path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
if (error)
goto dput_and_out;
@@ -498,7 +511,7 @@ retry_deleg:
goto out_unlock;
newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
- error = notify_change(path->dentry, &newattrs, &delegated_inode);
+ error = notify_change2(path->mnt, path->dentry, &newattrs, &delegated_inode);
out_unlock:
mutex_unlock(&inode->i_mutex);
if (delegated_inode) {
@@ -578,7 +591,7 @@ retry_deleg:
mutex_lock(&inode->i_mutex);
error = security_path_chown(path, uid, gid);
if (!error)
- error = notify_change(path->dentry, &newattrs, &delegated_inode);
+ error = notify_change2(path->mnt, path->dentry, &newattrs, &delegated_inode);
mutex_unlock(&inode->i_mutex);
if (delegated_inode) {
error = break_deleg_wait(&delegated_inode);
diff --git a/fs/pnode.c b/fs/pnode.c
index 18e56fc4a88c..c35d03d42902 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -411,3 +411,37 @@ int propagate_umount(struct hlist_head *list)
__propagate_umount(mnt);
return 0;
}
+
+/*
+ * Iterates over all slaves, and slaves of slaves.
+ */
+static struct mount *next_descendent(struct mount *root, struct mount *cur)
+{
+ if (!IS_MNT_NEW(cur) && !list_empty(&cur->mnt_slave_list))
+ return first_slave(cur);
+ do {
+ struct mount *master = cur->mnt_master;
+
+ if (!master || cur->mnt_slave.next != &master->mnt_slave_list) {
+ struct mount *next = next_slave(cur);
+
+ return (next == root) ? NULL : next;
+ }
+ cur = master;
+ } while (cur != root);
+ return NULL;
+}
+
+void propagate_remount(struct mount *mnt)
+{
+ struct mount *m = mnt;
+ struct super_block *sb = mnt->mnt.mnt_sb;
+
+ if (sb->s_op->copy_mnt_data) {
+ m = next_descendent(mnt, m);
+ while (m) {
+ sb->s_op->copy_mnt_data(m->mnt.data, mnt->mnt.data);
+ m = next_descendent(mnt, m);
+ }
+ }
+}
diff --git a/fs/pnode.h b/fs/pnode.h
index 16afc3d6d2f2..a407d3237ee1 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -42,6 +42,7 @@ int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
struct hlist_head *);
int propagate_umount(struct hlist_head *);
int propagate_mount_busy(struct mount *, int);
+void propagate_remount(struct mount *);
void mnt_release_group_id(struct mount *);
int get_dominating_id(struct mount *mnt, const struct path *root);
unsigned int mnt_get_count(struct mount *mnt);
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 16226e24ec48..068cbc05bcc1 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -305,7 +305,8 @@ static void render_cap_t(struct seq_file *m, const char *header,
static inline void task_cap(struct seq_file *m, struct task_struct *p)
{
const struct cred *cred;
- kernel_cap_t cap_inheritable, cap_permitted, cap_effective, cap_bset;
+ kernel_cap_t cap_inheritable, cap_permitted, cap_effective,
+ cap_bset, cap_ambient;
rcu_read_lock();
cred = __task_cred(p);
@@ -313,12 +314,14 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p)
cap_permitted = cred->cap_permitted;
cap_effective = cred->cap_effective;
cap_bset = cred->cap_bset;
+ cap_ambient = cred->cap_ambient;
rcu_read_unlock();
render_cap_t(m, "CapInh:\t", &cap_inheritable);
render_cap_t(m, "CapPrm:\t", &cap_permitted);
render_cap_t(m, "CapEff:\t", &cap_effective);
render_cap_t(m, "CapBnd:\t", &cap_bset);
+ render_cap_t(m, "CapAmb:\t", &cap_ambient);
}
static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index dc98620634a3..d3d0eef6bd97 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2602,8 +2602,8 @@ static const struct pid_entry tgid_base_stuff[] = {
ONE("cgroup", S_IRUGO, proc_cgroup_show),
#endif
ONE("oom_score", S_IRUGO, proc_oom_score),
- REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
- REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
+ REG("oom_adj", S_IRUSR, proc_oom_adj_operations),
+ REG("oom_score_adj", S_IRUSR, proc_oom_score_adj_operations),
#ifdef CONFIG_AUDITSYSCALL
REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
REG("sessionid", S_IRUGO, proc_sessionid_operations),
@@ -2883,6 +2883,44 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx)
}
/*
+ * proc_tid_comm_permission is a special permission function exclusively
+ * used for the node /proc/<pid>/task/<tid>/comm.
+ * It bypasses generic permission checks in the case where a task of the same
+ * task group attempts to access the node.
+ * The rational behind this is that glibc and bionic access this node for
+ * cross thread naming (pthread_set/getname_np(!self)). However, if
+ * PR_SET_DUMPABLE gets set to 0 this node among others becomes uid=0 gid=0,
+ * which locks out the cross thread naming implementation.
+ * This function makes sure that the node is always accessible for members of
+ * same thread group.
+ */
+static int proc_tid_comm_permission(struct inode *inode, int mask)
+{
+ bool is_same_tgroup;
+ struct task_struct *task;
+
+ task = get_proc_task(inode);
+ if (!task)
+ return -ESRCH;
+ is_same_tgroup = same_thread_group(current, task);
+ put_task_struct(task);
+
+ if (likely(is_same_tgroup && !(mask & MAY_EXEC))) {
+ /* This file (/proc/<pid>/task/<tid>/comm) can always be
+ * read or written by the members of the corresponding
+ * thread group.
+ */
+ return 0;
+ }
+
+ return generic_permission(inode, mask);
+}
+
+static const struct inode_operations proc_tid_comm_inode_operations = {
+ .permission = proc_tid_comm_permission,
+};
+
+/*
* Tasks
*/
static const struct pid_entry tid_base_stuff[] = {
@@ -2900,7 +2938,9 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
#endif
- REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
+ NOD("comm", S_IFREG|S_IRUGO|S_IWUSR,
+ &proc_tid_comm_inode_operations,
+ &proc_pid_set_comm_operations, {}),
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
ONE("syscall", S_IRUSR, proc_pid_syscall),
#endif
@@ -2947,8 +2987,8 @@ static const struct pid_entry tid_base_stuff[] = {
ONE("cgroup", S_IRUGO, proc_cgroup_show),
#endif
ONE("oom_score", S_IRUGO, proc_oom_score),
- REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
- REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
+ REG("oom_adj", S_IRUSR, proc_oom_adj_operations),
+ REG("oom_score_adj", S_IRUSR, proc_oom_score_adj_operations),
#ifdef CONFIG_AUDITSYSCALL
REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
REG("sessionid", S_IRUGO, proc_sessionid_operations),
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index e11d7c590bb0..3e5f0eff02f9 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -283,11 +283,19 @@ static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry,
*/
int proc_fd_permission(struct inode *inode, int mask)
{
- int rv = generic_permission(inode, mask);
+ struct task_struct *p;
+ int rv;
+
+ rv = generic_permission(inode, mask);
if (rv == 0)
- return 0;
- if (task_tgid(current) == proc_pid(inode))
+ return rv;
+
+ rcu_read_lock();
+ p = pid_task(proc_pid(inode), PIDTYPE_PID);
+ if (p && same_thread_group(p, current))
rv = 0;
+ rcu_read_unlock();
+
return rv;
}
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 91a4e6426321..b4bf6639e285 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -430,6 +430,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
static ssize_t
read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
{
+ char *buf = file->private_data;
ssize_t acc = 0;
size_t size, tsz;
size_t elf_buflen;
@@ -500,23 +501,20 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
if (clear_user(buffer, tsz))
return -EFAULT;
} else if (is_vmalloc_or_module_addr((void *)start)) {
- char * elf_buf;
-
- elf_buf = kzalloc(tsz, GFP_KERNEL);
- if (!elf_buf)
- return -ENOMEM;
- vread(elf_buf, (char *)start, tsz);
+ vread(buf, (char *)start, tsz);
/* we have to zero-fill user buffer even if no read */
- if (copy_to_user(buffer, elf_buf, tsz)) {
- kfree(elf_buf);
+ if (copy_to_user(buffer, buf, tsz))
return -EFAULT;
- }
- kfree(elf_buf);
} else {
if (kern_addr_valid(start)) {
unsigned long n;
- n = copy_to_user(buffer, (char *)start, tsz);
+ /*
+ * Using bounce buffer to bypass the
+ * hardened user copy kernel text checks.
+ */
+ memcpy(buf, (char *) start, tsz);
+ n = copy_to_user(buffer, buf, tsz);
/*
* We cannot distinguish between fault on source
* and fault on destination. When this happens
@@ -549,6 +547,11 @@ static int open_kcore(struct inode *inode, struct file *filp)
{
if (!capable(CAP_SYS_RAWIO))
return -EPERM;
+
+ filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!filp->private_data)
+ return -ENOMEM;
+
if (kcore_need_update)
kcore_update_ram();
if (i_size_read(inode) != proc_root_kcore->size) {
@@ -559,10 +562,16 @@ static int open_kcore(struct inode *inode, struct file *filp)
return 0;
}
+static int release_kcore(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
static const struct file_operations proc_kcore_operations = {
.read = read_kcore,
.open = open_kcore,
+ .release = release_kcore,
.llseek = default_llseek,
};
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index f92d5dd578a4..f81d63d2cacb 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -666,7 +666,7 @@ static int proc_sys_readdir(struct file *file, struct dir_context *ctx)
ctl_dir = container_of(head, struct ctl_dir, header);
if (!dir_emit_dots(file, ctx))
- return 0;
+ goto out;
pos = 2;
@@ -676,6 +676,7 @@ static int proc_sys_readdir(struct file *file, struct dir_context *ctx)
break;
}
}
+out:
sysctl_head_finish(head);
return 0;
}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index aea2d0bf7174..0fb6cc46968b 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -111,6 +111,56 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
}
#endif
+static void seq_print_vma_name(struct seq_file *m, struct vm_area_struct *vma)
+{
+ const char __user *name = vma_get_anon_name(vma);
+ struct mm_struct *mm = vma->vm_mm;
+
+ unsigned long page_start_vaddr;
+ unsigned long page_offset;
+ unsigned long num_pages;
+ unsigned long max_len = NAME_MAX;
+ int i;
+
+ page_start_vaddr = (unsigned long)name & PAGE_MASK;
+ page_offset = (unsigned long)name - page_start_vaddr;
+ num_pages = DIV_ROUND_UP(page_offset + max_len, PAGE_SIZE);
+
+ seq_puts(m, "[anon:");
+
+ for (i = 0; i < num_pages; i++) {
+ int len;
+ int write_len;
+ const char *kaddr;
+ long pages_pinned;
+ struct page *page;
+
+ pages_pinned = get_user_pages(current, mm, page_start_vaddr,
+ 1, 0, 0, &page, NULL);
+ if (pages_pinned < 1) {
+ seq_puts(m, "<fault>]");
+ return;
+ }
+
+ kaddr = (const char *)kmap(page);
+ len = min(max_len, PAGE_SIZE - page_offset);
+ write_len = strnlen(kaddr + page_offset, len);
+ seq_write(m, kaddr + page_offset, write_len);
+ kunmap(page);
+ put_page(page);
+
+ /* if strnlen hit a null terminator then we're done */
+ if (write_len != len)
+ break;
+
+ max_len -= len;
+ page_offset = 0;
+ page_start_vaddr += PAGE_SIZE;
+ }
+
+ seq_putc(m, ']');
+}
+
static void vma_stop(struct proc_maps_private *priv)
{
struct mm_struct *mm = priv->mm;
@@ -342,6 +392,12 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
seq_pad(m, ' ');
seq_printf(m, "[stack:%d]", tid);
}
+ goto done;
+ }
+
+ if (vma_get_anon_name(vma)) {
+ seq_pad(m, ' ');
+ seq_print_vma_name(m, vma);
}
}
@@ -441,6 +497,7 @@ struct mem_size_stats {
unsigned long swap;
unsigned long nonlinear;
u64 pss;
+ u64 swap_pss;
};
static void smaps_account(struct mem_size_stats *mss, struct page *page,
@@ -488,9 +545,20 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
} else if (is_swap_pte(*pte)) {
swp_entry_t swpent = pte_to_swp_entry(*pte);
- if (!non_swap_entry(swpent))
+ if (!non_swap_entry(swpent)) {
+ int mapcount;
+
mss->swap += PAGE_SIZE;
- else if (is_migration_entry(swpent))
+ mapcount = swp_swapcount(swpent);
+ if (mapcount >= 2) {
+ u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT;
+
+ do_div(pss_delta, mapcount);
+ mss->swap_pss += pss_delta;
+ } else {
+ mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
+ }
+ } else if (is_migration_entry(swpent))
page = migration_entry_to_page(swpent);
} else if (pte_file(*pte)) {
if (pte_to_pgoff(*pte) != pgoff)
@@ -630,6 +698,12 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
show_map_vma(m, vma, is_pid);
+ if (vma_get_anon_name(vma)) {
+ seq_puts(m, "Name: ");
+ seq_print_vma_name(m, vma);
+ seq_putc(m, '\n');
+ }
+
seq_printf(m,
"Size: %8lu kB\n"
"Rss: %8lu kB\n"
@@ -642,6 +716,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
"Anonymous: %8lu kB\n"
"AnonHugePages: %8lu kB\n"
"Swap: %8lu kB\n"
+ "SwapPss: %8lu kB\n"
"KernelPageSize: %8lu kB\n"
"MMUPageSize: %8lu kB\n"
"Locked: %8lu kB\n",
@@ -656,6 +731,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
mss.anonymous >> 10,
mss.anonymous_thp >> 10,
mss.swap >> 10,
+ (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)),
vma_kernel_pagesize(vma) >> 10,
vma_mmu_pagesize(vma) >> 10,
(vma->vm_flags & VM_LOCKED) ?
@@ -740,6 +816,7 @@ enum clear_refs_types {
CLEAR_REFS_ANON,
CLEAR_REFS_MAPPED,
CLEAR_REFS_SOFT_DIRTY,
+ CLEAR_REFS_MM_HIWATER_RSS,
CLEAR_REFS_LAST,
};
@@ -854,6 +931,18 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
.mm = mm,
.private = &cp,
};
+
+ if (type == CLEAR_REFS_MM_HIWATER_RSS) {
+ /*
+ * Writing 5 to /proc/pid/clear_refs resets the peak
+ * resident set size to this mm's current rss value.
+ */
+ down_write(&mm->mmap_sem);
+ reset_mm_hiwater_rss(mm);
+ up_write(&mm->mmap_sem);
+ goto out_mm;
+ }
+
down_read(&mm->mmap_sem);
if (type == CLEAR_REFS_SOFT_DIRTY) {
for (vma = mm->mmap; vma; vma = vma->vm_next) {
@@ -896,6 +985,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
mmu_notifier_invalidate_range_end(mm, 0, -1);
flush_tlb_mm(mm);
up_read(&mm->mmap_sem);
+out_mm:
mmput(mm);
}
put_task_struct(task);
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 73ca1740d839..f68d0fe2abd4 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -112,7 +112,9 @@ static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt)
if (err)
goto out;
show_mnt_opts(m, mnt);
- if (sb->s_op->show_options)
+ if (sb->s_op->show_options2)
+ err = sb->s_op->show_options2(mnt, m, mnt_path.dentry);
+ else if (sb->s_op->show_options)
err = sb->s_op->show_options(m, mnt_path.dentry);
seq_puts(m, " 0 0\n");
out:
@@ -173,7 +175,9 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
err = show_sb_opts(m, sb);
if (err)
goto out;
- if (sb->s_op->show_options)
+ if (sb->s_op->show_options2) {
+ err = sb->s_op->show_options2(mnt, m, mnt->mnt_root);
+ } else if (sb->s_op->show_options)
err = sb->s_op->show_options(m, mnt->mnt_root);
seq_putc(m, '\n');
out:
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
index 983d9510becc..916b8e23d968 100644
--- a/fs/pstore/Kconfig
+++ b/fs/pstore/Kconfig
@@ -21,6 +21,16 @@ config PSTORE_CONSOLE
When the option is enabled, pstore will log all kernel
messages, even if no oops or panic happened.
+config PSTORE_PMSG
+ bool "Log user space messages"
+ depends on PSTORE
+ help
+ When the option is enabled, pstore will export a character
+ interface /dev/pmsg0 to log user space messages. On reboot
+ data can be retrieved from /sys/fs/pstore/pmsg-ramoops-[ID].
+
+ If unsure, say N.
+
config PSTORE_FTRACE
bool "Persistent function tracer"
depends on PSTORE
diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile
index 4c9095c2781e..e647d8e81712 100644
--- a/fs/pstore/Makefile
+++ b/fs/pstore/Makefile
@@ -7,5 +7,7 @@ obj-y += pstore.o
pstore-objs += inode.o platform.o
obj-$(CONFIG_PSTORE_FTRACE) += ftrace.o
+obj-$(CONFIG_PSTORE_PMSG) += pmsg.o
+
ramoops-objs += ram.o ram_core.o
obj-$(CONFIG_PSTORE_RAM) += ramoops.o
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index fafb7a02a5d6..6dce93c8ac29 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -316,32 +316,38 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
switch (type) {
case PSTORE_TYPE_DMESG:
- sprintf(name, "dmesg-%s-%lld%s", psname, id,
- compressed ? ".enc.z" : "");
+ scnprintf(name, sizeof(name), "dmesg-%s-%lld%s",
+ psname, id, compressed ? ".enc.z" : "");
break;
case PSTORE_TYPE_CONSOLE:
- sprintf(name, "console-%s-%lld", psname, id);
+ scnprintf(name, sizeof(name), "console-%s", psname);
break;
case PSTORE_TYPE_FTRACE:
- sprintf(name, "ftrace-%s-%lld", psname, id);
+ scnprintf(name, sizeof(name), "ftrace-%s", psname);
break;
case PSTORE_TYPE_MCE:
- sprintf(name, "mce-%s-%lld", psname, id);
+ scnprintf(name, sizeof(name), "mce-%s-%lld", psname, id);
break;
case PSTORE_TYPE_PPC_RTAS:
- sprintf(name, "rtas-%s-%lld", psname, id);
+ scnprintf(name, sizeof(name), "rtas-%s-%lld", psname, id);
break;
case PSTORE_TYPE_PPC_OF:
- sprintf(name, "powerpc-ofw-%s-%lld", psname, id);
+ scnprintf(name, sizeof(name), "powerpc-ofw-%s-%lld",
+ psname, id);
break;
case PSTORE_TYPE_PPC_COMMON:
- sprintf(name, "powerpc-common-%s-%lld", psname, id);
+ scnprintf(name, sizeof(name), "powerpc-common-%s-%lld",
+ psname, id);
+ break;
+ case PSTORE_TYPE_PMSG:
+ scnprintf(name, sizeof(name), "pmsg-%s-%lld", psname, id);
break;
case PSTORE_TYPE_UNKNOWN:
- sprintf(name, "unknown-%s-%lld", psname, id);
+ scnprintf(name, sizeof(name), "unknown-%s-%lld", psname, id);
break;
default:
- sprintf(name, "type%d-%s-%lld", type, psname, id);
+ scnprintf(name, sizeof(name), "type%d-%s-%lld",
+ type, psname, id);
break;
}
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index 3b3d305277c4..c36ba2cd0b5d 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -45,6 +45,12 @@ extern void pstore_register_ftrace(void);
static inline void pstore_register_ftrace(void) {}
#endif
+#ifdef CONFIG_PSTORE_PMSG
+extern void pstore_register_pmsg(void);
+#else
+static inline void pstore_register_pmsg(void) {}
+#endif
+
extern struct pstore_info *psinfo;
extern void pstore_set_kmsg_bytes(int);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 0a9b72cdfeca..ad1fe993d2d0 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -404,6 +404,40 @@ static int pstore_write_compat(enum pstore_type_id type,
size, psi);
}
+static int pstore_write_buf_user_compat(enum pstore_type_id type,
+ enum kmsg_dump_reason reason,
+ u64 *id, unsigned int part,
+ const char __user *buf,
+ bool compressed, size_t size,
+ struct pstore_info *psi)
+{
+ unsigned long flags = 0;
+ size_t i, bufsize = size;
+ long ret = 0;
+
+ if (unlikely(!access_ok(VERIFY_READ, buf, size)))
+ return -EFAULT;
+ if (bufsize > psinfo->bufsize)
+ bufsize = psinfo->bufsize;
+ spin_lock_irqsave(&psinfo->buf_lock, flags);
+ for (i = 0; i < size; ) {
+ size_t c = min(size - i, bufsize);
+
+ ret = __copy_from_user(psinfo->buf, buf + i, c);
+ if (unlikely(ret != 0)) {
+ ret = -EFAULT;
+ break;
+ }
+ ret = psi->write_buf(type, reason, id, part, psinfo->buf,
+ compressed, c, psi);
+ if (unlikely(ret < 0))
+ break;
+ i += c;
+ }
+ spin_unlock_irqrestore(&psinfo->buf_lock, flags);
+ return unlikely(ret < 0) ? ret : size;
+}
+
/*
* platform specific persistent storage driver registers with
* us here. If pstore is already mounted, call the platform
@@ -428,6 +462,8 @@ int pstore_register(struct pstore_info *psi)
if (!psi->write)
psi->write = pstore_write_compat;
+ if (!psi->write_buf_user)
+ psi->write_buf_user = pstore_write_buf_user_compat;
psinfo = psi;
mutex_init(&psinfo->read_mutex);
spin_unlock(&pstore_lock);
@@ -447,6 +483,7 @@ int pstore_register(struct pstore_info *psi)
if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) {
pstore_register_console();
pstore_register_ftrace();
+ pstore_register_pmsg();
}
if (pstore_update_ms >= 0) {
diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c
new file mode 100644
index 000000000000..64b97a2966d9
--- /dev/null
+++ b/fs/pstore/pmsg.c
@@ -0,0 +1,93 @@
+/*
+ * Copyright 2014 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/cdev.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include "internal.h"
+
+static DEFINE_MUTEX(pmsg_lock);
+
+static ssize_t write_pmsg(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ u64 id;
+ int ret;
+
+ if (!count)
+ return 0;
+
+ /* check outside lock, page in any data. write_buf_user also checks */
+ if (!access_ok(VERIFY_READ, buf, count))
+ return -EFAULT;
+
+ mutex_lock(&pmsg_lock);
+ ret = psinfo->write_buf_user(PSTORE_TYPE_PMSG, 0, &id, 0, buf, 0, count,
+ psinfo);
+ mutex_unlock(&pmsg_lock);
+ return ret ? ret : count;
+}
+
+static const struct file_operations pmsg_fops = {
+ .owner = THIS_MODULE,
+ .llseek = noop_llseek,
+ .write = write_pmsg,
+};
+
+static struct class *pmsg_class;
+static int pmsg_major;
+#define PMSG_NAME "pmsg"
+#undef pr_fmt
+#define pr_fmt(fmt) PMSG_NAME ": " fmt
+
+static char *pmsg_devnode(struct device *dev, umode_t *mode)
+{
+ if (mode)
+ *mode = 0220;
+ return NULL;
+}
+
+void pstore_register_pmsg(void)
+{
+ struct device *pmsg_device;
+
+ pmsg_major = register_chrdev(0, PMSG_NAME, &pmsg_fops);
+ if (pmsg_major < 0) {
+ pr_err("register_chrdev failed\n");
+ goto err;
+ }
+
+ pmsg_class = class_create(THIS_MODULE, PMSG_NAME);
+ if (IS_ERR(pmsg_class)) {
+ pr_err("device class file already in use\n");
+ goto err_class;
+ }
+ pmsg_class->devnode = pmsg_devnode;
+
+ pmsg_device = device_create(pmsg_class, NULL, MKDEV(pmsg_major, 0),
+ NULL, "%s%d", PMSG_NAME, 0);
+ if (IS_ERR(pmsg_device)) {
+ pr_err("failed to create device\n");
+ goto err_device;
+ }
+ return;
+
+err_device:
+ class_destroy(pmsg_class);
+err_class:
+ unregister_chrdev(pmsg_major, PMSG_NAME);
+err:
+ return;
+}
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 5fa34243b1ae..09486ed562da 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -51,6 +51,10 @@ static ulong ramoops_ftrace_size = MIN_MEM_SIZE;
module_param_named(ftrace_size, ramoops_ftrace_size, ulong, 0400);
MODULE_PARM_DESC(ftrace_size, "size of ftrace log");
+static ulong ramoops_pmsg_size = MIN_MEM_SIZE;
+module_param_named(pmsg_size, ramoops_pmsg_size, ulong, 0400);
+MODULE_PARM_DESC(pmsg_size, "size of user space message log");
+
static ulong mem_address;
module_param(mem_address, ulong, 0400);
MODULE_PARM_DESC(mem_address,
@@ -82,12 +86,14 @@ struct ramoops_context {
struct persistent_ram_zone **przs;
struct persistent_ram_zone *cprz;
struct persistent_ram_zone *fprz;
+ struct persistent_ram_zone *mprz;
phys_addr_t phys_addr;
unsigned long size;
unsigned int memtype;
size_t record_size;
size_t console_size;
size_t ftrace_size;
+ size_t pmsg_size;
int dump_oops;
struct persistent_ram_ecc_info ecc_info;
unsigned int max_dump_cnt;
@@ -96,6 +102,7 @@ struct ramoops_context {
unsigned int dump_read_cnt;
unsigned int console_read_cnt;
unsigned int ftrace_read_cnt;
+ unsigned int pmsg_read_cnt;
struct pstore_info pstore;
};
@@ -109,6 +116,7 @@ static int ramoops_pstore_open(struct pstore_info *psi)
cxt->dump_read_cnt = 0;
cxt->console_read_cnt = 0;
cxt->ftrace_read_cnt = 0;
+ cxt->pmsg_read_cnt = 0;
return 0;
}
@@ -162,6 +170,12 @@ static void ramoops_read_kmsg_hdr(char *buffer, struct timespec *time,
}
}
+static bool prz_ok(struct persistent_ram_zone *prz)
+{
+ return !!prz && !!(persistent_ram_old_size(prz) +
+ persistent_ram_ecc_string(prz, NULL, 0));
+}
+
static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
int *count, struct timespec *time,
char **buf, bool *compressed,
@@ -175,13 +189,16 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
prz = ramoops_get_next_prz(cxt->przs, &cxt->dump_read_cnt,
cxt->max_dump_cnt, id, type,
PSTORE_TYPE_DMESG, 1);
- if (!prz)
+ if (!prz_ok(prz))
prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt,
1, id, type, PSTORE_TYPE_CONSOLE, 0);
- if (!prz)
+ if (!prz_ok(prz))
prz = ramoops_get_next_prz(&cxt->fprz, &cxt->ftrace_read_cnt,
1, id, type, PSTORE_TYPE_FTRACE, 0);
- if (!prz)
+ if (!prz_ok(prz))
+ prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt,
+ 1, id, type, PSTORE_TYPE_PMSG, 0);
+ if (!prz_ok(prz))
return 0;
size = persistent_ram_old_size(prz);
@@ -244,6 +261,11 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
return -ENOMEM;
persistent_ram_write(cxt->fprz, buf, size);
return 0;
+ } else if (type == PSTORE_TYPE_PMSG) {
+ if (!cxt->mprz)
+ return -ENOMEM;
+ persistent_ram_write(cxt->mprz, buf, size);
+ return 0;
}
if (type != PSTORE_TYPE_DMESG)
@@ -283,6 +305,24 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
return 0;
}
+static int notrace ramoops_pstore_write_buf_user(enum pstore_type_id type,
+ enum kmsg_dump_reason reason,
+ u64 *id, unsigned int part,
+ const char __user *buf,
+ bool compressed, size_t size,
+ struct pstore_info *psi)
+{
+ if (type == PSTORE_TYPE_PMSG) {
+ struct ramoops_context *cxt = psi->data;
+
+ if (!cxt->mprz)
+ return -ENOMEM;
+ return persistent_ram_write_user(cxt->mprz, buf, size);
+ }
+
+ return -EINVAL;
+}
+
static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count,
struct timespec time, struct pstore_info *psi)
{
@@ -301,6 +341,9 @@ static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count,
case PSTORE_TYPE_FTRACE:
prz = cxt->fprz;
break;
+ case PSTORE_TYPE_PMSG:
+ prz = cxt->mprz;
+ break;
default:
return -EINVAL;
}
@@ -318,6 +361,7 @@ static struct ramoops_context oops_cxt = {
.open = ramoops_pstore_open,
.read = ramoops_pstore_read,
.write_buf = ramoops_pstore_write_buf,
+ .write_buf_user = ramoops_pstore_write_buf_user,
.erase = ramoops_pstore_erase,
},
};
@@ -411,6 +455,12 @@ static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt,
return 0;
}
+void notrace ramoops_console_write_buf(const char *buf, size_t size)
+{
+ struct ramoops_context *cxt = &oops_cxt;
+ persistent_ram_write(cxt->cprz, buf, size);
+}
+
static int ramoops_probe(struct platform_device *pdev)
{
struct device *dev = &pdev->dev;
@@ -427,7 +477,7 @@ static int ramoops_probe(struct platform_device *pdev)
goto fail_out;
if (!pdata->mem_size || (!pdata->record_size && !pdata->console_size &&
- !pdata->ftrace_size)) {
+ !pdata->ftrace_size && !pdata->pmsg_size)) {
pr_err("The memory size and the record/console size must be "
"non-zero\n");
goto fail_out;
@@ -439,6 +489,8 @@ static int ramoops_probe(struct platform_device *pdev)
pdata->console_size = rounddown_pow_of_two(pdata->console_size);
if (pdata->ftrace_size && !is_power_of_2(pdata->ftrace_size))
pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size);
+ if (pdata->pmsg_size && !is_power_of_2(pdata->pmsg_size))
+ pdata->pmsg_size = rounddown_pow_of_two(pdata->pmsg_size);
cxt->size = pdata->mem_size;
cxt->phys_addr = pdata->mem_address;
@@ -446,12 +498,14 @@ static int ramoops_probe(struct platform_device *pdev)
cxt->record_size = pdata->record_size;
cxt->console_size = pdata->console_size;
cxt->ftrace_size = pdata->ftrace_size;
+ cxt->pmsg_size = pdata->pmsg_size;
cxt->dump_oops = pdata->dump_oops;
cxt->ecc_info = pdata->ecc_info;
paddr = cxt->phys_addr;
- dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size;
+ dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size
+ - cxt->pmsg_size;
err = ramoops_init_przs(dev, cxt, &paddr, dump_mem_sz);
if (err)
goto fail_out;
@@ -466,13 +520,9 @@ static int ramoops_probe(struct platform_device *pdev)
if (err)
goto fail_init_fprz;
- if (!cxt->przs && !cxt->cprz && !cxt->fprz) {
- pr_err("memory size too small, minimum is %zu\n",
- cxt->console_size + cxt->record_size +
- cxt->ftrace_size);
- err = -EINVAL;
- goto fail_cnt;
- }
+ err = ramoops_init_prz(dev, cxt, &cxt->mprz, &paddr, cxt->pmsg_size, 0);
+ if (err)
+ goto fail_init_mprz;
cxt->pstore.data = cxt;
/*
@@ -517,7 +567,9 @@ fail_buf:
kfree(cxt->pstore.buf);
fail_clear:
cxt->pstore.bufsize = 0;
-fail_cnt:
+ cxt->max_dump_cnt = 0;
+ kfree(cxt->mprz);
+fail_init_mprz:
kfree(cxt->fprz);
fail_init_fprz:
kfree(cxt->cprz);
@@ -576,6 +628,7 @@ static void ramoops_register_dummy(void)
dummy_data->record_size = record_size;
dummy_data->console_size = ramoops_console_size;
dummy_data->ftrace_size = ramoops_ftrace_size;
+ dummy_data->pmsg_size = ramoops_pmsg_size;
dummy_data->dump_oops = dump_oops;
/*
* For backwards compatibility ramoops.ecc=1 means 16 bytes ECC
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index 76c3f80efdfa..aa9afe573155 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -17,15 +17,16 @@
#include <linux/device.h>
#include <linux/err.h>
#include <linux/errno.h>
-#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/io.h>
+#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/memblock.h>
+#include <linux/pstore_ram.h>
#include <linux/rslib.h>
#include <linux/slab.h>
+#include <linux/uaccess.h>
#include <linux/vmalloc.h>
-#include <linux/pstore_ram.h>
#include <asm/page.h>
struct persistent_ram_buffer {
@@ -303,6 +304,16 @@ static void notrace persistent_ram_update(struct persistent_ram_zone *prz,
persistent_ram_update_ecc(prz, start, count);
}
+static int notrace persistent_ram_update_user(struct persistent_ram_zone *prz,
+ const void __user *s, unsigned int start, unsigned int count)
+{
+ struct persistent_ram_buffer *buffer = prz->buffer;
+ int ret = unlikely(__copy_from_user(buffer->data + start, s, count)) ?
+ -EFAULT : 0;
+ persistent_ram_update_ecc(prz, start, count);
+ return ret;
+}
+
void persistent_ram_save_old(struct persistent_ram_zone *prz)
{
struct persistent_ram_buffer *buffer = prz->buffer;
@@ -356,6 +367,38 @@ int notrace persistent_ram_write(struct persistent_ram_zone *prz,
return count;
}
+int notrace persistent_ram_write_user(struct persistent_ram_zone *prz,
+ const void __user *s, unsigned int count)
+{
+ int rem, ret = 0, c = count;
+ size_t start;
+
+ if (unlikely(!access_ok(VERIFY_READ, s, count)))
+ return -EFAULT;
+ if (unlikely(c > prz->buffer_size)) {
+ s += c - prz->buffer_size;
+ c = prz->buffer_size;
+ }
+
+ buffer_size_add(prz, c);
+
+ start = buffer_start_add(prz, c);
+
+ rem = prz->buffer_size - start;
+ if (unlikely(rem < c)) {
+ ret = persistent_ram_update_user(prz, s, start, rem);
+ s += rem;
+ c -= rem;
+ start = 0;
+ }
+ if (likely(!ret))
+ ret = persistent_ram_update_user(prz, s, start, c);
+
+ persistent_ram_update_header_ecc(prz);
+
+ return unlikely(ret) ? ret : count;
+}
+
size_t persistent_ram_old_size(struct persistent_ram_zone *prz)
{
return prz->old_log_size;
diff --git a/fs/sdcardfs/Kconfig b/fs/sdcardfs/Kconfig
new file mode 100644
index 000000000000..a1c103316ac7
--- /dev/null
+++ b/fs/sdcardfs/Kconfig
@@ -0,0 +1,13 @@
+config SDCARD_FS
+ tristate "sdcard file system"
+ depends on CONFIGFS_FS
+ default n
+ help
+ Sdcardfs is based on Wrapfs file system.
+
+config SDCARD_FS_FADV_NOACTIVE
+ bool "sdcardfs fadvise noactive support"
+ depends on FADV_NOACTIVE
+ default y
+ help
+ Sdcardfs supports fadvise noactive mode.
diff --git a/fs/sdcardfs/Makefile b/fs/sdcardfs/Makefile
new file mode 100644
index 000000000000..b84fbb2b45a4
--- /dev/null
+++ b/fs/sdcardfs/Makefile
@@ -0,0 +1,7 @@
+SDCARDFS_VERSION="0.1"
+
+EXTRA_CFLAGS += -DSDCARDFS_VERSION=\"$(SDCARDFS_VERSION)\"
+
+obj-$(CONFIG_SDCARD_FS) += sdcardfs.o
+
+sdcardfs-y := dentry.o file.o inode.o main.o super.o lookup.o mmap.o packagelist.o derived_perm.o
diff --git a/fs/sdcardfs/dentry.c b/fs/sdcardfs/dentry.c
new file mode 100644
index 000000000000..0466b55fa25e
--- /dev/null
+++ b/fs/sdcardfs/dentry.c
@@ -0,0 +1,194 @@
+/*
+ * fs/sdcardfs/dentry.c
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ * Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009 Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed. It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#include "sdcardfs.h"
+#include "linux/ctype.h"
+
+/*
+ * returns: -ERRNO if error (returned to user)
+ * 0: tell VFS to invalidate dentry
+ * 1: dentry is valid
+ */
+static int sdcardfs_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ int err = 1;
+ struct path parent_lower_path, lower_path;
+ struct dentry *parent_dentry = NULL;
+ struct dentry *parent_lower_dentry = NULL;
+ struct dentry *lower_cur_parent_dentry = NULL;
+ struct dentry *lower_dentry = NULL;
+ struct inode *inode;
+ struct sdcardfs_inode_data *data;
+
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
+ spin_lock(&dentry->d_lock);
+ if (IS_ROOT(dentry)) {
+ spin_unlock(&dentry->d_lock);
+ return 1;
+ }
+ spin_unlock(&dentry->d_lock);
+
+ /* check uninitialized obb_dentry and
+ * whether the base obbpath has been changed or not
+ */
+ if (is_obbpath_invalid(dentry)) {
+ d_drop(dentry);
+ return 0;
+ }
+
+ parent_dentry = dget_parent(dentry);
+ sdcardfs_get_lower_path(parent_dentry, &parent_lower_path);
+ sdcardfs_get_real_lower(dentry, &lower_path);
+ parent_lower_dentry = parent_lower_path.dentry;
+ lower_dentry = lower_path.dentry;
+ lower_cur_parent_dentry = dget_parent(lower_dentry);
+
+ if ((lower_dentry->d_flags & DCACHE_OP_REVALIDATE)) {
+ err = lower_dentry->d_op->d_revalidate(lower_dentry, flags);
+ if (err == 0) {
+ d_drop(dentry);
+ goto out;
+ }
+ }
+
+ spin_lock(&lower_dentry->d_lock);
+ if (d_unhashed(lower_dentry)) {
+ spin_unlock(&lower_dentry->d_lock);
+ d_drop(dentry);
+ err = 0;
+ goto out;
+ }
+ spin_unlock(&lower_dentry->d_lock);
+
+ if (parent_lower_dentry != lower_cur_parent_dentry) {
+ d_drop(dentry);
+ err = 0;
+ goto out;
+ }
+
+ if (dentry < lower_dentry) {
+ spin_lock(&dentry->d_lock);
+ spin_lock_nested(&lower_dentry->d_lock, DENTRY_D_LOCK_NESTED);
+ } else {
+ spin_lock(&lower_dentry->d_lock);
+ spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+ }
+
+ if (!qstr_case_eq(&dentry->d_name, &lower_dentry->d_name)) {
+ __d_drop(dentry);
+ err = 0;
+ }
+
+ if (dentry < lower_dentry) {
+ spin_unlock(&lower_dentry->d_lock);
+ spin_unlock(&dentry->d_lock);
+ } else {
+ spin_unlock(&dentry->d_lock);
+ spin_unlock(&lower_dentry->d_lock);
+ }
+ if (!err)
+ goto out;
+
+ /* If our top's inode is gone, we may be out of date */
+ inode = igrab(dentry->d_inode);
+ if (inode) {
+ data = top_data_get(SDCARDFS_I(inode));
+ if (!data || data->abandoned) {
+ d_drop(dentry);
+ err = 0;
+ }
+ if (data)
+ data_put(data);
+ iput(inode);
+ }
+
+out:
+ dput(parent_dentry);
+ dput(lower_cur_parent_dentry);
+ sdcardfs_put_lower_path(parent_dentry, &parent_lower_path);
+ sdcardfs_put_real_lower(dentry, &lower_path);
+ return err;
+}
+
+static void sdcardfs_d_release(struct dentry *dentry)
+{
+ /* release and reset the lower paths */
+ if (has_graft_path(dentry))
+ sdcardfs_put_reset_orig_path(dentry);
+ sdcardfs_put_reset_lower_path(dentry);
+ free_dentry_private_data(dentry);
+}
+
+static int sdcardfs_hash_ci(const struct dentry *dentry,
+ struct qstr *qstr)
+{
+ /*
+ * This function is copy of vfat_hashi.
+ * FIXME Should we support national language?
+ * Refer to vfat_hashi()
+ * struct nls_table *t = MSDOS_SB(dentry->d_sb)->nls_io;
+ */
+ const unsigned char *name;
+ unsigned int len;
+ unsigned long hash;
+
+ name = qstr->name;
+ len = qstr->len;
+
+ hash = init_name_hash();
+ while (len--)
+ hash = partial_name_hash(tolower(*name++), hash);
+ qstr->hash = end_name_hash(hash);
+
+ return 0;
+}
+
+/*
+ * Case insensitive compare of two vfat names.
+ */
+static int sdcardfs_cmp_ci(const struct dentry *parent,
+ const struct dentry *dentry,
+ unsigned int len, const char *str, const struct qstr *name)
+{
+ /* FIXME Should we support national language? */
+
+ if (name->len == len) {
+ if (str_n_case_eq(name->name, str, len))
+ return 0;
+ }
+ return 1;
+}
+
+static void sdcardfs_canonical_path(const struct path *path,
+ struct path *actual_path)
+{
+ sdcardfs_get_real_lower(path->dentry, actual_path);
+}
+
+const struct dentry_operations sdcardfs_ci_dops = {
+ .d_revalidate = sdcardfs_d_revalidate,
+ .d_release = sdcardfs_d_release,
+ .d_hash = sdcardfs_hash_ci,
+ .d_compare = sdcardfs_cmp_ci,
+ .d_canonical_path = sdcardfs_canonical_path,
+};
+
diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c
new file mode 100644
index 000000000000..d567eddb333e
--- /dev/null
+++ b/fs/sdcardfs/derived_perm.c
@@ -0,0 +1,471 @@
+/*
+ * fs/sdcardfs/derived_perm.c
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ * Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009 Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed. It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#include "sdcardfs.h"
+
+/* copy derived state from parent inode */
+static void inherit_derived_state(struct inode *parent, struct inode *child)
+{
+ struct sdcardfs_inode_info *pi = SDCARDFS_I(parent);
+ struct sdcardfs_inode_info *ci = SDCARDFS_I(child);
+
+ ci->data->perm = PERM_INHERIT;
+ ci->data->userid = pi->data->userid;
+ ci->data->d_uid = pi->data->d_uid;
+ ci->data->under_android = pi->data->under_android;
+ ci->data->under_cache = pi->data->under_cache;
+ ci->data->under_obb = pi->data->under_obb;
+ set_top(ci, pi->top_data);
+}
+
+/* helper function for derived state */
+void setup_derived_state(struct inode *inode, perm_t perm, userid_t userid,
+ uid_t uid, bool under_android,
+ struct sdcardfs_inode_data *top)
+{
+ struct sdcardfs_inode_info *info = SDCARDFS_I(inode);
+
+ info->data->perm = perm;
+ info->data->userid = userid;
+ info->data->d_uid = uid;
+ info->data->under_android = under_android;
+ info->data->under_cache = false;
+ info->data->under_obb = false;
+ set_top(info, top);
+}
+
+/* While renaming, there is a point where we want the path from dentry,
+ * but the name from newdentry
+ */
+void get_derived_permission_new(struct dentry *parent, struct dentry *dentry,
+ const struct qstr *name)
+{
+ struct sdcardfs_inode_info *info = SDCARDFS_I(dentry->d_inode);
+ struct sdcardfs_inode_data *parent_data =
+ SDCARDFS_I(parent->d_inode)->data;
+ appid_t appid;
+ unsigned long user_num;
+ int err;
+ struct qstr q_Android = QSTR_LITERAL("Android");
+ struct qstr q_data = QSTR_LITERAL("data");
+ struct qstr q_obb = QSTR_LITERAL("obb");
+ struct qstr q_media = QSTR_LITERAL("media");
+ struct qstr q_cache = QSTR_LITERAL("cache");
+
+ /* By default, each inode inherits from its parent.
+ * the properties are maintained on its private fields
+ * because the inode attributes will be modified with that of
+ * its lower inode.
+ * These values are used by our custom permission call instead
+ * of using the inode permissions.
+ */
+
+ inherit_derived_state(parent->d_inode, dentry->d_inode);
+
+ /* Files don't get special labels */
+ if (!S_ISDIR(dentry->d_inode->i_mode))
+ return;
+ /* Derive custom permissions based on parent and current node */
+ switch (parent_data->perm) {
+ case PERM_INHERIT:
+ case PERM_ANDROID_PACKAGE_CACHE:
+ /* Already inherited above */
+ break;
+ case PERM_PRE_ROOT:
+ /* Legacy internal layout places users at top level */
+ info->data->perm = PERM_ROOT;
+ err = kstrtoul(name->name, 10, &user_num);
+ if (err)
+ info->data->userid = 0;
+ else
+ info->data->userid = user_num;
+ set_top(info, info->data);
+ break;
+ case PERM_ROOT:
+ /* Assume masked off by default. */
+ if (qstr_case_eq(name, &q_Android)) {
+ /* App-specific directories inside; let anyone traverse */
+ info->data->perm = PERM_ANDROID;
+ info->data->under_android = true;
+ set_top(info, info->data);
+ }
+ break;
+ case PERM_ANDROID:
+ if (qstr_case_eq(name, &q_data)) {
+ /* App-specific directories inside; let anyone traverse */
+ info->data->perm = PERM_ANDROID_DATA;
+ set_top(info, info->data);
+ } else if (qstr_case_eq(name, &q_obb)) {
+ /* App-specific directories inside; let anyone traverse */
+ info->data->perm = PERM_ANDROID_OBB;
+ info->data->under_obb = true;
+ set_top(info, info->data);
+ /* Single OBB directory is always shared */
+ } else if (qstr_case_eq(name, &q_media)) {
+ /* App-specific directories inside; let anyone traverse */
+ info->data->perm = PERM_ANDROID_MEDIA;
+ set_top(info, info->data);
+ }
+ break;
+ case PERM_ANDROID_OBB:
+ case PERM_ANDROID_DATA:
+ case PERM_ANDROID_MEDIA:
+ info->data->perm = PERM_ANDROID_PACKAGE;
+ appid = get_appid(name->name);
+ if (appid != 0 && !is_excluded(name->name, parent_data->userid))
+ info->data->d_uid =
+ multiuser_get_uid(parent_data->userid, appid);
+ set_top(info, info->data);
+ break;
+ case PERM_ANDROID_PACKAGE:
+ if (qstr_case_eq(name, &q_cache)) {
+ info->data->perm = PERM_ANDROID_PACKAGE_CACHE;
+ info->data->under_cache = true;
+ }
+ break;
+ }
+}
+
+void get_derived_permission(struct dentry *parent, struct dentry *dentry)
+{
+ get_derived_permission_new(parent, dentry, &dentry->d_name);
+}
+
+static appid_t get_type(const char *name)
+{
+ const char *ext = strrchr(name, '.');
+ appid_t id;
+
+ if (ext && ext[0]) {
+ ext = &ext[1];
+ id = get_ext_gid(ext);
+ return id?:AID_MEDIA_RW;
+ }
+ return AID_MEDIA_RW;
+}
+
+void fixup_lower_ownership(struct dentry *dentry, const char *name)
+{
+ struct path path;
+ struct inode *inode;
+ struct inode *delegated_inode = NULL;
+ int error;
+ struct sdcardfs_inode_info *info;
+ struct sdcardfs_inode_data *info_d;
+ struct sdcardfs_inode_data *info_top;
+ perm_t perm;
+ struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+ uid_t uid = sbi->options.fs_low_uid;
+ gid_t gid = sbi->options.fs_low_gid;
+ struct iattr newattrs;
+
+ info = SDCARDFS_I(dentry->d_inode);
+ info_d = info->data;
+ perm = info_d->perm;
+ if (info_d->under_obb) {
+ perm = PERM_ANDROID_OBB;
+ } else if (info_d->under_cache) {
+ perm = PERM_ANDROID_PACKAGE_CACHE;
+ } else if (perm == PERM_INHERIT) {
+ info_top = top_data_get(info);
+ perm = info_top->perm;
+ data_put(info_top);
+ }
+
+ switch (perm) {
+ case PERM_ROOT:
+ case PERM_ANDROID:
+ case PERM_ANDROID_DATA:
+ case PERM_ANDROID_MEDIA:
+ case PERM_ANDROID_PACKAGE:
+ case PERM_ANDROID_PACKAGE_CACHE:
+ uid = multiuser_get_uid(info_d->userid, uid);
+ break;
+ case PERM_ANDROID_OBB:
+ uid = AID_MEDIA_OBB;
+ break;
+ case PERM_PRE_ROOT:
+ default:
+ break;
+ }
+ switch (perm) {
+ case PERM_ROOT:
+ case PERM_ANDROID:
+ case PERM_ANDROID_DATA:
+ case PERM_ANDROID_MEDIA:
+ if (S_ISDIR(dentry->d_inode->i_mode))
+ gid = multiuser_get_uid(info_d->userid, AID_MEDIA_RW);
+ else
+ gid = multiuser_get_uid(info_d->userid, get_type(name));
+ break;
+ case PERM_ANDROID_OBB:
+ gid = AID_MEDIA_OBB;
+ break;
+ case PERM_ANDROID_PACKAGE:
+ if (uid_is_app(info_d->d_uid))
+ gid = multiuser_get_ext_gid(info_d->d_uid);
+ else
+ gid = multiuser_get_uid(info_d->userid, AID_MEDIA_RW);
+ break;
+ case PERM_ANDROID_PACKAGE_CACHE:
+ if (uid_is_app(info_d->d_uid))
+ gid = multiuser_get_ext_cache_gid(info_d->d_uid);
+ else
+ gid = multiuser_get_uid(info_d->userid, AID_MEDIA_RW);
+ break;
+ case PERM_PRE_ROOT:
+ default:
+ break;
+ }
+
+ sdcardfs_get_lower_path(dentry, &path);
+ inode = path.dentry->d_inode;
+ if (path.dentry->d_inode->i_gid.val != gid || path.dentry->d_inode->i_uid.val != uid) {
+retry_deleg:
+ newattrs.ia_valid = ATTR_GID | ATTR_UID | ATTR_FORCE;
+ newattrs.ia_uid = make_kuid(current_user_ns(), uid);
+ newattrs.ia_gid = make_kgid(current_user_ns(), gid);
+ if (!S_ISDIR(inode->i_mode))
+ newattrs.ia_valid |=
+ ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
+ mutex_lock(&inode->i_mutex);
+ error = security_path_chown(&path, newattrs.ia_uid, newattrs.ia_gid);
+ if (!error)
+ error = notify_change2(path.mnt, path.dentry, &newattrs, &delegated_inode);
+ mutex_unlock(&inode->i_mutex);
+ if (delegated_inode) {
+ error = break_deleg_wait(&delegated_inode);
+ if (!error)
+ goto retry_deleg;
+ }
+ if (error)
+ pr_debug("sdcardfs: Failed to touch up lower fs gid/uid for %s\n", name);
+ }
+ sdcardfs_put_lower_path(dentry, &path);
+}
+
+static int descendant_may_need_fixup(struct sdcardfs_inode_data *data,
+ struct limit_search *limit)
+{
+ if (data->perm == PERM_ROOT)
+ return (limit->flags & BY_USERID) ?
+ data->userid == limit->userid : 1;
+ if (data->perm == PERM_PRE_ROOT || data->perm == PERM_ANDROID)
+ return 1;
+ return 0;
+}
+
+static int needs_fixup(perm_t perm)
+{
+ if (perm == PERM_ANDROID_DATA || perm == PERM_ANDROID_OBB
+ || perm == PERM_ANDROID_MEDIA)
+ return 1;
+ return 0;
+}
+
+static void __fixup_perms_recursive(struct dentry *dentry, struct limit_search *limit, int depth)
+{
+ struct dentry *child;
+ struct sdcardfs_inode_info *info;
+
+ /*
+ * All paths will terminate their recursion on hitting PERM_ANDROID_OBB,
+ * PERM_ANDROID_MEDIA, or PERM_ANDROID_DATA. This happens at a depth of
+ * at most 3.
+ */
+ WARN(depth > 3, "%s: Max expected depth exceeded!\n", __func__);
+ spin_lock_nested(&dentry->d_lock, depth);
+ if (!dentry->d_inode) {
+ spin_unlock(&dentry->d_lock);
+ return;
+ }
+ info = SDCARDFS_I(dentry->d_inode);
+
+ if (needs_fixup(info->data->perm)) {
+ list_for_each_entry(child, &dentry->d_subdirs, d_child) {
+ spin_lock_nested(&child->d_lock, depth + 1);
+ if (!(limit->flags & BY_NAME) || qstr_case_eq(&child->d_name, &limit->name)) {
+ if (child->d_inode) {
+ get_derived_permission(dentry, child);
+ fixup_tmp_permissions(child->d_inode);
+ spin_unlock(&child->d_lock);
+ break;
+ }
+ }
+ spin_unlock(&child->d_lock);
+ }
+ } else if (descendant_may_need_fixup(info->data, limit)) {
+ list_for_each_entry(child, &dentry->d_subdirs, d_child) {
+ __fixup_perms_recursive(child, limit, depth + 1);
+ }
+ }
+ spin_unlock(&dentry->d_lock);
+}
+
+void fixup_perms_recursive(struct dentry *dentry, struct limit_search *limit)
+{
+ __fixup_perms_recursive(dentry, limit, 0);
+}
+
+/* main function for updating derived permission */
+inline void update_derived_permission_lock(struct dentry *dentry)
+{
+ struct dentry *parent;
+
+ if (!dentry || !dentry->d_inode) {
+ pr_err("sdcardfs: %s: invalid dentry\n", __func__);
+ return;
+ }
+ /* FIXME:
+ * 1. need to check whether the dentry is updated or not
+ * 2. remove the root dentry update
+ */
+ if (!IS_ROOT(dentry)) {
+ parent = dget_parent(dentry);
+ if (parent) {
+ get_derived_permission(parent, dentry);
+ dput(parent);
+ }
+ }
+ fixup_tmp_permissions(dentry->d_inode);
+}
+
+int need_graft_path(struct dentry *dentry)
+{
+ int ret = 0;
+ struct dentry *parent = dget_parent(dentry);
+ struct sdcardfs_inode_info *parent_info = SDCARDFS_I(parent->d_inode);
+ struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+ struct qstr obb = QSTR_LITERAL("obb");
+
+ if (parent_info->data->perm == PERM_ANDROID &&
+ qstr_case_eq(&dentry->d_name, &obb)) {
+
+ /* /Android/obb is the base obbpath of DERIVED_UNIFIED */
+ if (!(sbi->options.multiuser == false
+ && parent_info->data->userid == 0)) {
+ ret = 1;
+ }
+ }
+ dput(parent);
+ return ret;
+}
+
+int is_obbpath_invalid(struct dentry *dent)
+{
+ int ret = 0;
+ struct sdcardfs_dentry_info *di = SDCARDFS_D(dent);
+ struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dent->d_sb);
+ char *path_buf, *obbpath_s;
+ int need_put = 0;
+ struct path lower_path;
+
+ /* check the base obbpath has been changed.
+ * this routine can check an uninitialized obb dentry as well.
+ * regarding the uninitialized obb, refer to the sdcardfs_mkdir()
+ */
+ spin_lock(&di->lock);
+ if (di->orig_path.dentry) {
+ if (!di->lower_path.dentry) {
+ ret = 1;
+ } else {
+ path_get(&di->lower_path);
+
+ path_buf = kmalloc(PATH_MAX, GFP_ATOMIC);
+ if (!path_buf) {
+ ret = 1;
+ pr_err("sdcardfs: fail to allocate path_buf in %s.\n", __func__);
+ } else {
+ obbpath_s = d_path(&di->lower_path, path_buf, PATH_MAX);
+ if (d_unhashed(di->lower_path.dentry) ||
+ !str_case_eq(sbi->obbpath_s, obbpath_s)) {
+ ret = 1;
+ }
+ kfree(path_buf);
+ }
+
+ pathcpy(&lower_path, &di->lower_path);
+ need_put = 1;
+ }
+ }
+ spin_unlock(&di->lock);
+ if (need_put)
+ path_put(&lower_path);
+ return ret;
+}
+
+int is_base_obbpath(struct dentry *dentry)
+{
+ int ret = 0;
+ struct dentry *parent = dget_parent(dentry);
+ struct sdcardfs_inode_info *parent_info = SDCARDFS_I(parent->d_inode);
+ struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+ struct qstr q_obb = QSTR_LITERAL("obb");
+
+ spin_lock(&SDCARDFS_D(dentry)->lock);
+ if (sbi->options.multiuser) {
+ if (parent_info->data->perm == PERM_PRE_ROOT &&
+ qstr_case_eq(&dentry->d_name, &q_obb)) {
+ ret = 1;
+ }
+ } else if (parent_info->data->perm == PERM_ANDROID &&
+ qstr_case_eq(&dentry->d_name, &q_obb)) {
+ ret = 1;
+ }
+ spin_unlock(&SDCARDFS_D(dentry)->lock);
+ return ret;
+}
+
+/* The lower_path will be stored to the dentry's orig_path
+ * and the base obbpath will be copyed to the lower_path variable.
+ * if an error returned, there's no change in the lower_path
+ * returns: -ERRNO if error (0: no error)
+ */
+int setup_obb_dentry(struct dentry *dentry, struct path *lower_path)
+{
+ int err = 0;
+ struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+ struct path obbpath;
+
+ /* A local obb dentry must have its own orig_path to support rmdir
+ * and mkdir of itself. Usually, we expect that the sbi->obbpath
+ * is avaiable on this stage.
+ */
+ sdcardfs_set_orig_path(dentry, lower_path);
+
+ err = kern_path(sbi->obbpath_s,
+ LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &obbpath);
+
+ if (!err) {
+ /* the obbpath base has been found */
+ pathcpy(lower_path, &obbpath);
+ } else {
+ /* if the sbi->obbpath is not available, we can optionally
+ * setup the lower_path with its orig_path.
+ * but, the current implementation just returns an error
+ * because the sdcard daemon also regards this case as
+ * a lookup fail.
+ */
+ pr_info("sdcardfs: the sbi->obbpath is not available\n");
+ }
+ return err;
+}
+
+
diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c
new file mode 100644
index 000000000000..8b0d5d2701e7
--- /dev/null
+++ b/fs/sdcardfs/file.c
@@ -0,0 +1,433 @@
+/*
+ * fs/sdcardfs/file.c
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ * Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009 Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed. It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#include "sdcardfs.h"
+#ifdef CONFIG_SDCARD_FS_FADV_NOACTIVE
+#include <linux/backing-dev.h>
+#endif
+
+static ssize_t sdcardfs_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ int err;
+ struct file *lower_file;
+ struct dentry *dentry = file->f_path.dentry;
+#ifdef CONFIG_SDCARD_FS_FADV_NOACTIVE
+ struct backing_dev_info *bdi;
+#endif
+
+ lower_file = sdcardfs_lower_file(file);
+
+#ifdef CONFIG_SDCARD_FS_FADV_NOACTIVE
+ if (file->f_mode & FMODE_NOACTIVE) {
+ if (!(lower_file->f_mode & FMODE_NOACTIVE)) {
+ bdi = lower_file->f_mapping->backing_dev_info;
+ lower_file->f_ra.ra_pages = bdi->ra_pages * 2;
+ spin_lock(&lower_file->f_lock);
+ lower_file->f_mode |= FMODE_NOACTIVE;
+ spin_unlock(&lower_file->f_lock);
+ }
+ }
+#endif
+
+ err = vfs_read(lower_file, buf, count, ppos);
+ /* update our inode atime upon a successful lower read */
+ if (err >= 0)
+ fsstack_copy_attr_atime(dentry->d_inode,
+ file_inode(lower_file));
+
+ return err;
+}
+
+static ssize_t sdcardfs_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ int err;
+ struct file *lower_file;
+ struct dentry *dentry = file->f_path.dentry;
+
+ /* check disk space */
+ if (!check_min_free_space(dentry, count, 0)) {
+ pr_err("No minimum free space.\n");
+ return -ENOSPC;
+ }
+
+ lower_file = sdcardfs_lower_file(file);
+ err = vfs_write(lower_file, buf, count, ppos);
+ /* update our inode times+sizes upon a successful lower write */
+ if (err >= 0) {
+ fsstack_copy_inode_size(dentry->d_inode,
+ file_inode(lower_file));
+ fsstack_copy_attr_times(dentry->d_inode,
+ file_inode(lower_file));
+ }
+
+ return err;
+}
+
+static int sdcardfs_readdir(struct file *file, struct dir_context *ctx)
+{
+ int err;
+ struct file *lower_file = NULL;
+ struct dentry *dentry = file->f_path.dentry;
+
+ lower_file = sdcardfs_lower_file(file);
+
+ lower_file->f_pos = file->f_pos;
+ err = iterate_dir(lower_file, ctx);
+ file->f_pos = lower_file->f_pos;
+ if (err >= 0) /* copy the atime */
+ fsstack_copy_attr_atime(dentry->d_inode,
+ file_inode(lower_file));
+ return err;
+}
+
+static long sdcardfs_unlocked_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ long err = -ENOTTY;
+ struct file *lower_file;
+
+ lower_file = sdcardfs_lower_file(file);
+
+ /* XXX: use vfs_ioctl if/when VFS exports it */
+ if (!lower_file || !lower_file->f_op)
+ goto out;
+ if (lower_file->f_op->unlocked_ioctl)
+ err = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg);
+
+ /* some ioctls can change inode attributes (EXT2_IOC_SETFLAGS) */
+ if (!err)
+ sdcardfs_copy_and_fix_attrs(file_inode(file),
+ file_inode(lower_file));
+out:
+ return err;
+}
+
+#ifdef CONFIG_COMPAT
+static long sdcardfs_compat_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ long err = -ENOTTY;
+ struct file *lower_file;
+
+ lower_file = sdcardfs_lower_file(file);
+
+ /* XXX: use vfs_ioctl if/when VFS exports it */
+ if (!lower_file || !lower_file->f_op)
+ goto out;
+ if (lower_file->f_op->compat_ioctl)
+ err = lower_file->f_op->compat_ioctl(lower_file, cmd, arg);
+
+out:
+ return err;
+}
+#endif
+
+static int sdcardfs_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ int err = 0;
+ bool willwrite;
+ struct file *lower_file;
+ const struct vm_operations_struct *saved_vm_ops = NULL;
+
+ /* this might be deferred to mmap's writepage */
+ willwrite = ((vma->vm_flags | VM_SHARED | VM_WRITE) == vma->vm_flags);
+
+ /*
+ * File systems which do not implement ->writepage may use
+ * generic_file_readonly_mmap as their ->mmap op. If you call
+ * generic_file_readonly_mmap with VM_WRITE, you'd get an -EINVAL.
+ * But we cannot call the lower ->mmap op, so we can't tell that
+ * writeable mappings won't work. Therefore, our only choice is to
+ * check if the lower file system supports the ->writepage, and if
+ * not, return EINVAL (the same error that
+ * generic_file_readonly_mmap returns in that case).
+ */
+ lower_file = sdcardfs_lower_file(file);
+ if (willwrite && !lower_file->f_mapping->a_ops->writepage) {
+ err = -EINVAL;
+ pr_err("sdcardfs: lower file system does not support writeable mmap\n");
+ goto out;
+ }
+
+ /*
+ * find and save lower vm_ops.
+ *
+ * XXX: the VFS should have a cleaner way of finding the lower vm_ops
+ */
+ if (!SDCARDFS_F(file)->lower_vm_ops) {
+ err = lower_file->f_op->mmap(lower_file, vma);
+ if (err) {
+ pr_err("sdcardfs: lower mmap failed %d\n", err);
+ goto out;
+ }
+ saved_vm_ops = vma->vm_ops; /* save: came from lower ->mmap */
+ }
+
+ /*
+ * Next 3 lines are all I need from generic_file_mmap. I definitely
+ * don't want its test for ->readpage which returns -ENOEXEC.
+ */
+ file_accessed(file);
+ vma->vm_ops = &sdcardfs_vm_ops;
+
+ file->f_mapping->a_ops = &sdcardfs_aops; /* set our aops */
+ if (!SDCARDFS_F(file)->lower_vm_ops) /* save for our ->fault */
+ SDCARDFS_F(file)->lower_vm_ops = saved_vm_ops;
+ vma->vm_private_data = file;
+ get_file(lower_file);
+ vma->vm_file = lower_file;
+
+out:
+ return err;
+}
+
+static int sdcardfs_open(struct inode *inode, struct file *file)
+{
+ int err = 0;
+ struct file *lower_file = NULL;
+ struct path lower_path;
+ struct dentry *dentry = file->f_path.dentry;
+ struct dentry *parent = dget_parent(dentry);
+ struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+ const struct cred *saved_cred = NULL;
+
+ /* don't open unhashed/deleted files */
+ if (d_unhashed(dentry)) {
+ err = -ENOENT;
+ goto out_err;
+ }
+
+ if (!check_caller_access_to_name(parent->d_inode, &dentry->d_name)) {
+ err = -EACCES;
+ goto out_err;
+ }
+
+ /* save current_cred and override it */
+ OVERRIDE_CRED(sbi, saved_cred, SDCARDFS_I(inode));
+
+ file->private_data =
+ kzalloc(sizeof(struct sdcardfs_file_info), GFP_KERNEL);
+ if (!SDCARDFS_F(file)) {
+ err = -ENOMEM;
+ goto out_revert_cred;
+ }
+
+ /* open lower object and link sdcardfs's file struct to lower's */
+ sdcardfs_get_lower_path(file->f_path.dentry, &lower_path);
+ lower_file = dentry_open(&lower_path, file->f_flags, current_cred());
+ path_put(&lower_path);
+ if (IS_ERR(lower_file)) {
+ err = PTR_ERR(lower_file);
+ lower_file = sdcardfs_lower_file(file);
+ if (lower_file) {
+ sdcardfs_set_lower_file(file, NULL);
+ fput(lower_file); /* fput calls dput for lower_dentry */
+ }
+ } else {
+ sdcardfs_set_lower_file(file, lower_file);
+ }
+
+ if (err)
+ kfree(SDCARDFS_F(file));
+ else
+ sdcardfs_copy_and_fix_attrs(inode, sdcardfs_lower_inode(inode));
+
+out_revert_cred:
+ REVERT_CRED(saved_cred);
+out_err:
+ dput(parent);
+ return err;
+}
+
+static int sdcardfs_flush(struct file *file, fl_owner_t id)
+{
+ int err = 0;
+ struct file *lower_file = NULL;
+
+ lower_file = sdcardfs_lower_file(file);
+ if (lower_file && lower_file->f_op && lower_file->f_op->flush) {
+ filemap_write_and_wait(file->f_mapping);
+ err = lower_file->f_op->flush(lower_file, id);
+ }
+
+ return err;
+}
+
+/* release all lower object references & free the file info structure */
+static int sdcardfs_file_release(struct inode *inode, struct file *file)
+{
+ struct file *lower_file;
+
+ lower_file = sdcardfs_lower_file(file);
+ if (lower_file) {
+ sdcardfs_set_lower_file(file, NULL);
+ fput(lower_file);
+ }
+
+ kfree(SDCARDFS_F(file));
+ return 0;
+}
+
+static int sdcardfs_fsync(struct file *file, loff_t start, loff_t end,
+ int datasync)
+{
+ int err;
+ struct file *lower_file;
+ struct path lower_path;
+ struct dentry *dentry = file->f_path.dentry;
+
+ err = __generic_file_fsync(file, start, end, datasync);
+ if (err)
+ goto out;
+
+ lower_file = sdcardfs_lower_file(file);
+ sdcardfs_get_lower_path(dentry, &lower_path);
+ err = vfs_fsync_range(lower_file, start, end, datasync);
+ sdcardfs_put_lower_path(dentry, &lower_path);
+out:
+ return err;
+}
+
+static int sdcardfs_fasync(int fd, struct file *file, int flag)
+{
+ int err = 0;
+ struct file *lower_file = NULL;
+
+ lower_file = sdcardfs_lower_file(file);
+ if (lower_file->f_op && lower_file->f_op->fasync)
+ err = lower_file->f_op->fasync(fd, lower_file, flag);
+
+ return err;
+}
+
+/*
+ * Sdcardfs cannot use generic_file_llseek as ->llseek, because it would
+ * only set the offset of the upper file. So we have to implement our
+ * own method to set both the upper and lower file offsets
+ * consistently.
+ */
+static loff_t sdcardfs_file_llseek(struct file *file, loff_t offset, int whence)
+{
+ int err;
+ struct file *lower_file;
+
+ err = generic_file_llseek(file, offset, whence);
+ if (err < 0)
+ goto out;
+
+ lower_file = sdcardfs_lower_file(file);
+ err = generic_file_llseek(lower_file, offset, whence);
+
+out:
+ return err;
+}
+
+/*
+ * Sdcardfs read_iter, redirect modified iocb to lower read_iter
+ */
+ssize_t sdcardfs_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ int err;
+ struct file *file = iocb->ki_filp, *lower_file;
+
+ lower_file = sdcardfs_lower_file(file);
+ if (!lower_file->f_op->read_iter) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ get_file(lower_file); /* prevent lower_file from being released */
+ iocb->ki_filp = lower_file;
+ err = lower_file->f_op->read_iter(iocb, iter);
+ iocb->ki_filp = file;
+ fput(lower_file);
+ /* update upper inode atime as needed */
+ if (err >= 0 || err == -EIOCBQUEUED)
+ fsstack_copy_attr_atime(file->f_path.dentry->d_inode,
+ file_inode(lower_file));
+out:
+ return err;
+}
+
+/*
+ * Sdcardfs write_iter, redirect modified iocb to lower write_iter
+ */
+ssize_t sdcardfs_write_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ int err;
+ struct file *file = iocb->ki_filp, *lower_file;
+
+ lower_file = sdcardfs_lower_file(file);
+ if (!lower_file->f_op->write_iter) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ get_file(lower_file); /* prevent lower_file from being released */
+ iocb->ki_filp = lower_file;
+ err = lower_file->f_op->write_iter(iocb, iter);
+ iocb->ki_filp = file;
+ fput(lower_file);
+ /* update upper inode times/sizes as needed */
+ if (err >= 0 || err == -EIOCBQUEUED) {
+ fsstack_copy_inode_size(file->f_path.dentry->d_inode,
+ file_inode(lower_file));
+ fsstack_copy_attr_times(file->f_path.dentry->d_inode,
+ file_inode(lower_file));
+ }
+out:
+ return err;
+}
+
+const struct file_operations sdcardfs_main_fops = {
+ .llseek = generic_file_llseek,
+ .read = sdcardfs_read,
+ .write = sdcardfs_write,
+ .unlocked_ioctl = sdcardfs_unlocked_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = sdcardfs_compat_ioctl,
+#endif
+ .mmap = sdcardfs_mmap,
+ .open = sdcardfs_open,
+ .flush = sdcardfs_flush,
+ .release = sdcardfs_file_release,
+ .fsync = sdcardfs_fsync,
+ .fasync = sdcardfs_fasync,
+ .read_iter = sdcardfs_read_iter,
+ .write_iter = sdcardfs_write_iter,
+};
+
+/* trimmed directory options */
+const struct file_operations sdcardfs_dir_fops = {
+ .llseek = sdcardfs_file_llseek,
+ .read = generic_read_dir,
+ .iterate = sdcardfs_readdir,
+ .unlocked_ioctl = sdcardfs_unlocked_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = sdcardfs_compat_ioctl,
+#endif
+ .open = sdcardfs_open,
+ .release = sdcardfs_file_release,
+ .flush = sdcardfs_flush,
+ .fsync = sdcardfs_fsync,
+ .fasync = sdcardfs_fasync,
+};
diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c
new file mode 100644
index 000000000000..39b8b2b9af87
--- /dev/null
+++ b/fs/sdcardfs/inode.c
@@ -0,0 +1,914 @@
+/*
+ * fs/sdcardfs/inode.c
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ * Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009 Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed. It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#include "sdcardfs.h"
+#include <linux/fs_struct.h>
+#include <linux/ratelimit.h>
+
+/* Do not directly use this function. Use OVERRIDE_CRED() instead. */
+const struct cred *override_fsids(struct sdcardfs_sb_info *sbi,
+ struct sdcardfs_inode_data *data)
+{
+ struct cred *cred;
+ const struct cred *old_cred;
+ uid_t uid;
+
+ cred = prepare_creds();
+ if (!cred)
+ return NULL;
+
+ if (data->under_obb)
+ uid = AID_MEDIA_OBB;
+ else
+ uid = multiuser_get_uid(data->userid, sbi->options.fs_low_uid);
+ cred->fsuid = make_kuid(&init_user_ns, uid);
+ cred->fsgid = make_kgid(&init_user_ns, sbi->options.fs_low_gid);
+
+ old_cred = override_creds(cred);
+
+ return old_cred;
+}
+
+/* Do not directly use this function, use REVERT_CRED() instead. */
+void revert_fsids(const struct cred *old_cred)
+{
+ const struct cred *cur_cred;
+
+ cur_cred = current->cred;
+ revert_creds(old_cred);
+ put_cred(cur_cred);
+}
+
+static int sdcardfs_create(struct inode *dir, struct dentry *dentry,
+ umode_t mode, bool want_excl)
+{
+ int err;
+ struct dentry *lower_dentry;
+ struct vfsmount *lower_dentry_mnt;
+ struct dentry *lower_parent_dentry = NULL;
+ struct path lower_path;
+ const struct cred *saved_cred = NULL;
+ struct fs_struct *saved_fs;
+ struct fs_struct *copied_fs;
+
+ if (!check_caller_access_to_name(dir, &dentry->d_name)) {
+ err = -EACCES;
+ goto out_eacces;
+ }
+
+ /* save current_cred and override it */
+ OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(dir));
+
+ sdcardfs_get_lower_path(dentry, &lower_path);
+ lower_dentry = lower_path.dentry;
+ lower_dentry_mnt = lower_path.mnt;
+ lower_parent_dentry = lock_parent(lower_dentry);
+
+ /* set last 16bytes of mode field to 0664 */
+ mode = (mode & S_IFMT) | 00664;
+
+ /* temporarily change umask for lower fs write */
+ saved_fs = current->fs;
+ copied_fs = copy_fs_struct(current->fs);
+ if (!copied_fs) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+ current->fs = copied_fs;
+ current->fs->umask = 0;
+ err = vfs_create2(lower_dentry_mnt, lower_parent_dentry->d_inode, lower_dentry, mode, want_excl);
+ if (err)
+ goto out;
+
+ err = sdcardfs_interpose(dentry, dir->i_sb, &lower_path,
+ SDCARDFS_I(dir)->data->userid);
+ if (err)
+ goto out;
+ fsstack_copy_attr_times(dir, sdcardfs_lower_inode(dir));
+ fsstack_copy_inode_size(dir, lower_parent_dentry->d_inode);
+ fixup_lower_ownership(dentry, dentry->d_name.name);
+
+out:
+ current->fs = saved_fs;
+ free_fs_struct(copied_fs);
+out_unlock:
+ unlock_dir(lower_parent_dentry);
+ sdcardfs_put_lower_path(dentry, &lower_path);
+ REVERT_CRED(saved_cred);
+out_eacces:
+ return err;
+}
+
+#if 0
+static int sdcardfs_link(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *new_dentry)
+{
+ struct dentry *lower_old_dentry;
+ struct dentry *lower_new_dentry;
+ struct dentry *lower_dir_dentry;
+ u64 file_size_save;
+ int err;
+ struct path lower_old_path, lower_new_path;
+
+ OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb));
+
+ file_size_save = i_size_read(old_dentry->d_inode);
+ sdcardfs_get_lower_path(old_dentry, &lower_old_path);
+ sdcardfs_get_lower_path(new_dentry, &lower_new_path);
+ lower_old_dentry = lower_old_path.dentry;
+ lower_new_dentry = lower_new_path.dentry;
+ lower_dir_dentry = lock_parent(lower_new_dentry);
+
+ err = vfs_link(lower_old_dentry, lower_dir_dentry->d_inode,
+ lower_new_dentry, NULL);
+ if (err || !lower_new_dentry->d_inode)
+ goto out;
+
+ err = sdcardfs_interpose(new_dentry, dir->i_sb, &lower_new_path);
+ if (err)
+ goto out;
+ fsstack_copy_attr_times(dir, lower_new_dentry->d_inode);
+ fsstack_copy_inode_size(dir, lower_new_dentry->d_inode);
+ set_nlink(old_dentry->d_inode,
+ sdcardfs_lower_inode(old_dentry->d_inode)->i_nlink);
+ i_size_write(new_dentry->d_inode, file_size_save);
+out:
+ unlock_dir(lower_dir_dentry);
+ sdcardfs_put_lower_path(old_dentry, &lower_old_path);
+ sdcardfs_put_lower_path(new_dentry, &lower_new_path);
+ REVERT_CRED();
+ return err;
+}
+#endif
+
+static int sdcardfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+ int err;
+ struct dentry *lower_dentry;
+ struct vfsmount *lower_mnt;
+ struct inode *lower_dir_inode = sdcardfs_lower_inode(dir);
+ struct dentry *lower_dir_dentry;
+ struct path lower_path;
+ const struct cred *saved_cred = NULL;
+
+ if (!check_caller_access_to_name(dir, &dentry->d_name)) {
+ err = -EACCES;
+ goto out_eacces;
+ }
+
+ /* save current_cred and override it */
+ OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(dir));
+
+ sdcardfs_get_lower_path(dentry, &lower_path);
+ lower_dentry = lower_path.dentry;
+ lower_mnt = lower_path.mnt;
+ dget(lower_dentry);
+ lower_dir_dentry = lock_parent(lower_dentry);
+
+ err = vfs_unlink2(lower_mnt, lower_dir_inode, lower_dentry, NULL);
+
+ /*
+ * Note: unlinking on top of NFS can cause silly-renamed files.
+ * Trying to delete such files results in EBUSY from NFS
+ * below. Silly-renamed files will get deleted by NFS later on, so
+ * we just need to detect them here and treat such EBUSY errors as
+ * if the upper file was successfully deleted.
+ */
+ if (err == -EBUSY && lower_dentry->d_flags & DCACHE_NFSFS_RENAMED)
+ err = 0;
+ if (err)
+ goto out;
+ fsstack_copy_attr_times(dir, lower_dir_inode);
+ fsstack_copy_inode_size(dir, lower_dir_inode);
+ set_nlink(dentry->d_inode,
+ sdcardfs_lower_inode(dentry->d_inode)->i_nlink);
+ dentry->d_inode->i_ctime = dir->i_ctime;
+ d_drop(dentry); /* this is needed, else LTP fails (VFS won't do it) */
+out:
+ unlock_dir(lower_dir_dentry);
+ dput(lower_dentry);
+ sdcardfs_put_lower_path(dentry, &lower_path);
+ REVERT_CRED(saved_cred);
+out_eacces:
+ return err;
+}
+
+#if 0
+static int sdcardfs_symlink(struct inode *dir, struct dentry *dentry,
+ const char *symname)
+{
+ int err;
+ struct dentry *lower_dentry;
+ struct dentry *lower_parent_dentry = NULL;
+ struct path lower_path;
+
+ OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb));
+
+ sdcardfs_get_lower_path(dentry, &lower_path);
+ lower_dentry = lower_path.dentry;
+ lower_parent_dentry = lock_parent(lower_dentry);
+
+ err = vfs_symlink(lower_parent_dentry->d_inode, lower_dentry, symname);
+ if (err)
+ goto out;
+ err = sdcardfs_interpose(dentry, dir->i_sb, &lower_path);
+ if (err)
+ goto out;
+ fsstack_copy_attr_times(dir, sdcardfs_lower_inode(dir));
+ fsstack_copy_inode_size(dir, lower_parent_dentry->d_inode);
+
+out:
+ unlock_dir(lower_parent_dentry);
+ sdcardfs_put_lower_path(dentry, &lower_path);
+ REVERT_CRED();
+ return err;
+}
+#endif
+
+static int touch(char *abs_path, mode_t mode)
+{
+ struct file *filp = filp_open(abs_path, O_RDWR|O_CREAT|O_EXCL|O_NOFOLLOW, mode);
+
+ if (IS_ERR(filp)) {
+ if (PTR_ERR(filp) == -EEXIST) {
+ return 0;
+ } else {
+ pr_err("sdcardfs: failed to open(%s): %ld\n",
+ abs_path, PTR_ERR(filp));
+ return PTR_ERR(filp);
+ }
+ }
+ filp_close(filp, current->files);
+ return 0;
+}
+
+static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ int err;
+ int make_nomedia_in_obb = 0;
+ struct dentry *lower_dentry;
+ struct vfsmount *lower_mnt;
+ struct dentry *lower_parent_dentry = NULL;
+ struct path lower_path;
+ struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+ const struct cred *saved_cred = NULL;
+ struct sdcardfs_inode_data *pd = SDCARDFS_I(dir)->data;
+ int touch_err = 0;
+ struct fs_struct *saved_fs;
+ struct fs_struct *copied_fs;
+ struct qstr q_obb = QSTR_LITERAL("obb");
+ struct qstr q_data = QSTR_LITERAL("data");
+
+ if (!check_caller_access_to_name(dir, &dentry->d_name)) {
+ err = -EACCES;
+ goto out_eacces;
+ }
+
+ /* save current_cred and override it */
+ OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(dir));
+
+ /* check disk space */
+ if (!check_min_free_space(dentry, 0, 1)) {
+ pr_err("sdcardfs: No minimum free space.\n");
+ err = -ENOSPC;
+ goto out_revert;
+ }
+
+ /* the lower_dentry is negative here */
+ sdcardfs_get_lower_path(dentry, &lower_path);
+ lower_dentry = lower_path.dentry;
+ lower_mnt = lower_path.mnt;
+ lower_parent_dentry = lock_parent(lower_dentry);
+
+ /* set last 16bytes of mode field to 0775 */
+ mode = (mode & S_IFMT) | 00775;
+
+ /* temporarily change umask for lower fs write */
+ saved_fs = current->fs;
+ copied_fs = copy_fs_struct(current->fs);
+ if (!copied_fs) {
+ err = -ENOMEM;
+ unlock_dir(lower_parent_dentry);
+ goto out_unlock;
+ }
+ current->fs = copied_fs;
+ current->fs->umask = 0;
+ err = vfs_mkdir2(lower_mnt, lower_parent_dentry->d_inode, lower_dentry, mode);
+
+ if (err) {
+ unlock_dir(lower_parent_dentry);
+ goto out;
+ }
+
+ /* if it is a local obb dentry, setup it with the base obbpath */
+ if (need_graft_path(dentry)) {
+
+ err = setup_obb_dentry(dentry, &lower_path);
+ if (err) {
+ /* if the sbi->obbpath is not available, the lower_path won't be
+ * changed by setup_obb_dentry() but the lower path is saved to
+ * its orig_path. this dentry will be revalidated later.
+ * but now, the lower_path should be NULL
+ */
+ sdcardfs_put_reset_lower_path(dentry);
+
+ /* the newly created lower path which saved to its orig_path or
+ * the lower_path is the base obbpath.
+ * therefore, an additional path_get is required
+ */
+ path_get(&lower_path);
+ } else
+ make_nomedia_in_obb = 1;
+ }
+
+ err = sdcardfs_interpose(dentry, dir->i_sb, &lower_path, pd->userid);
+ if (err) {
+ unlock_dir(lower_parent_dentry);
+ goto out;
+ }
+
+ fsstack_copy_attr_times(dir, sdcardfs_lower_inode(dir));
+ fsstack_copy_inode_size(dir, lower_parent_dentry->d_inode);
+ /* update number of links on parent directory */
+ set_nlink(dir, sdcardfs_lower_inode(dir)->i_nlink);
+ fixup_lower_ownership(dentry, dentry->d_name.name);
+ unlock_dir(lower_parent_dentry);
+ if ((!sbi->options.multiuser) && (qstr_case_eq(&dentry->d_name, &q_obb))
+ && (pd->perm == PERM_ANDROID) && (pd->userid == 0))
+ make_nomedia_in_obb = 1;
+
+ /* When creating /Android/data and /Android/obb, mark them as .nomedia */
+ if (make_nomedia_in_obb ||
+ ((pd->perm == PERM_ANDROID)
+ && (qstr_case_eq(&dentry->d_name, &q_data)))) {
+ REVERT_CRED(saved_cred);
+ OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(dentry->d_inode));
+ set_fs_pwd(current->fs, &lower_path);
+ touch_err = touch(".nomedia", 0664);
+ if (touch_err) {
+ pr_err("sdcardfs: failed to create .nomedia in %s: %d\n",
+ lower_path.dentry->d_name.name, touch_err);
+ goto out;
+ }
+ }
+out:
+ current->fs = saved_fs;
+ free_fs_struct(copied_fs);
+out_unlock:
+ sdcardfs_put_lower_path(dentry, &lower_path);
+out_revert:
+ REVERT_CRED(saved_cred);
+out_eacces:
+ return err;
+}
+
+static int sdcardfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ struct dentry *lower_dentry;
+ struct dentry *lower_dir_dentry;
+ struct vfsmount *lower_mnt;
+ int err;
+ struct path lower_path;
+ const struct cred *saved_cred = NULL;
+
+ if (!check_caller_access_to_name(dir, &dentry->d_name)) {
+ err = -EACCES;
+ goto out_eacces;
+ }
+
+ /* save current_cred and override it */
+ OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(dir));
+
+ /* sdcardfs_get_real_lower(): in case of remove an user's obb dentry
+ * the dentry on the original path should be deleted.
+ */
+ sdcardfs_get_real_lower(dentry, &lower_path);
+
+ lower_dentry = lower_path.dentry;
+ lower_mnt = lower_path.mnt;
+ lower_dir_dentry = lock_parent(lower_dentry);
+
+ err = vfs_rmdir2(lower_mnt, lower_dir_dentry->d_inode, lower_dentry);
+ if (err)
+ goto out;
+
+ d_drop(dentry); /* drop our dentry on success (why not VFS's job?) */
+ if (dentry->d_inode)
+ clear_nlink(dentry->d_inode);
+ fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
+ fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode);
+ set_nlink(dir, lower_dir_dentry->d_inode->i_nlink);
+
+out:
+ unlock_dir(lower_dir_dentry);
+ sdcardfs_put_real_lower(dentry, &lower_path);
+ REVERT_CRED(saved_cred);
+out_eacces:
+ return err;
+}
+
+#if 0
+static int sdcardfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
+ dev_t dev)
+{
+ int err;
+ struct dentry *lower_dentry;
+ struct dentry *lower_parent_dentry = NULL;
+ struct path lower_path;
+
+ OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb));
+
+ sdcardfs_get_lower_path(dentry, &lower_path);
+ lower_dentry = lower_path.dentry;
+ lower_parent_dentry = lock_parent(lower_dentry);
+
+ err = vfs_mknod(lower_parent_dentry->d_inode, lower_dentry, mode, dev);
+ if (err)
+ goto out;
+
+ err = sdcardfs_interpose(dentry, dir->i_sb, &lower_path);
+ if (err)
+ goto out;
+ fsstack_copy_attr_times(dir, sdcardfs_lower_inode(dir));
+ fsstack_copy_inode_size(dir, lower_parent_dentry->d_inode);
+
+out:
+ unlock_dir(lower_parent_dentry);
+ sdcardfs_put_lower_path(dentry, &lower_path);
+ REVERT_CRED();
+ return err;
+}
+#endif
+
+/*
+ * The locking rules in sdcardfs_rename are complex. We could use a simpler
+ * superblock-level name-space lock for renames and copy-ups.
+ */
+static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ int err = 0;
+ struct dentry *lower_old_dentry = NULL;
+ struct dentry *lower_new_dentry = NULL;
+ struct dentry *lower_old_dir_dentry = NULL;
+ struct dentry *lower_new_dir_dentry = NULL;
+ struct vfsmount *lower_mnt = NULL;
+ struct dentry *trap = NULL;
+ struct path lower_old_path, lower_new_path;
+ const struct cred *saved_cred = NULL;
+
+ if (!check_caller_access_to_name(old_dir, &old_dentry->d_name) ||
+ !check_caller_access_to_name(new_dir, &new_dentry->d_name)) {
+ err = -EACCES;
+ goto out_eacces;
+ }
+
+ /* save current_cred and override it */
+ OVERRIDE_CRED(SDCARDFS_SB(old_dir->i_sb), saved_cred, SDCARDFS_I(new_dir));
+
+ sdcardfs_get_real_lower(old_dentry, &lower_old_path);
+ sdcardfs_get_lower_path(new_dentry, &lower_new_path);
+ lower_old_dentry = lower_old_path.dentry;
+ lower_new_dentry = lower_new_path.dentry;
+ lower_mnt = lower_old_path.mnt;
+ lower_old_dir_dentry = dget_parent(lower_old_dentry);
+ lower_new_dir_dentry = dget_parent(lower_new_dentry);
+
+ trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
+ /* source should not be ancestor of target */
+ if (trap == lower_old_dentry) {
+ err = -EINVAL;
+ goto out;
+ }
+ /* target should not be ancestor of source */
+ if (trap == lower_new_dentry) {
+ err = -ENOTEMPTY;
+ goto out;
+ }
+
+ err = vfs_rename2(lower_mnt,
+ lower_old_dir_dentry->d_inode, lower_old_dentry,
+ lower_new_dir_dentry->d_inode, lower_new_dentry,
+ NULL, 0);
+ if (err)
+ goto out;
+
+ /* Copy attrs from lower dir, but i_uid/i_gid */
+ sdcardfs_copy_and_fix_attrs(new_dir, lower_new_dir_dentry->d_inode);
+ fsstack_copy_inode_size(new_dir, lower_new_dir_dentry->d_inode);
+
+ if (new_dir != old_dir) {
+ sdcardfs_copy_and_fix_attrs(old_dir, lower_old_dir_dentry->d_inode);
+ fsstack_copy_inode_size(old_dir, lower_old_dir_dentry->d_inode);
+ }
+ get_derived_permission_new(new_dentry->d_parent, old_dentry, &new_dentry->d_name);
+ fixup_tmp_permissions(old_dentry->d_inode);
+ fixup_lower_ownership(old_dentry, new_dentry->d_name.name);
+ d_invalidate(old_dentry); /* Can't fixup ownership recursively :( */
+out:
+ unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
+ dput(lower_old_dir_dentry);
+ dput(lower_new_dir_dentry);
+ sdcardfs_put_real_lower(old_dentry, &lower_old_path);
+ sdcardfs_put_lower_path(new_dentry, &lower_new_path);
+ REVERT_CRED(saved_cred);
+out_eacces:
+ return err;
+}
+
+#if 0
+static int sdcardfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
+{
+ int err;
+ struct dentry *lower_dentry;
+ struct path lower_path;
+ /* XXX readlink does not requires overriding credential */
+
+ sdcardfs_get_lower_path(dentry, &lower_path);
+ lower_dentry = lower_path.dentry;
+ if (!lower_dentry->d_inode->i_op ||
+ !lower_dentry->d_inode->i_op->readlink) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = lower_dentry->d_inode->i_op->readlink(lower_dentry,
+ buf, bufsiz);
+ if (err < 0)
+ goto out;
+ fsstack_copy_attr_atime(dentry->d_inode, lower_dentry->d_inode);
+
+out:
+ sdcardfs_put_lower_path(dentry, &lower_path);
+ return err;
+}
+#endif
+
+#if 0
+static void *sdcardfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+ char *buf;
+ int len = PAGE_SIZE, err;
+ mm_segment_t old_fs;
+
+ /* This is freed by the put_link method assuming a successful call. */
+ buf = kmalloc(len, GFP_KERNEL);
+ if (!buf) {
+ buf = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ /* read the symlink, and then we will follow it */
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ err = sdcardfs_readlink(dentry, buf, len);
+ set_fs(old_fs);
+ if (err < 0) {
+ kfree(buf);
+ buf = ERR_PTR(err);
+ } else {
+ buf[err] = '\0';
+ }
+out:
+ nd_set_link(nd, buf);
+ return NULL;
+}
+#endif
+
+static int sdcardfs_permission_wrn(struct inode *inode, int mask)
+{
+ WARN_RATELIMIT(1, "sdcardfs does not support permission. Use permission2.\n");
+ return -EINVAL;
+}
+
+void copy_attrs(struct inode *dest, const struct inode *src)
+{
+ dest->i_mode = src->i_mode;
+ dest->i_uid = src->i_uid;
+ dest->i_gid = src->i_gid;
+ dest->i_rdev = src->i_rdev;
+ dest->i_atime = src->i_atime;
+ dest->i_mtime = src->i_mtime;
+ dest->i_ctime = src->i_ctime;
+ dest->i_blkbits = src->i_blkbits;
+ dest->i_flags = src->i_flags;
+#ifdef CONFIG_FS_POSIX_ACL
+ dest->i_acl = src->i_acl;
+#endif
+#ifdef CONFIG_SECURITY
+ dest->i_security = src->i_security;
+#endif
+}
+
+static int sdcardfs_permission(struct vfsmount *mnt, struct inode *inode, int mask)
+{
+ int err;
+ struct inode tmp;
+ struct sdcardfs_inode_data *top = top_data_get(SDCARDFS_I(inode));
+
+ if (!top)
+ return -EINVAL;
+
+ /*
+ * Permission check on sdcardfs inode.
+ * Calling process should have AID_SDCARD_RW permission
+ * Since generic_permission only needs i_mode, i_uid,
+ * i_gid, and i_sb, we can create a fake inode to pass
+ * this information down in.
+ *
+ * The underlying code may attempt to take locks in some
+ * cases for features we're not using, but if that changes,
+ * locks must be dealt with to avoid undefined behavior.
+ */
+ copy_attrs(&tmp, inode);
+ tmp.i_uid = make_kuid(&init_user_ns, top->d_uid);
+ tmp.i_gid = make_kgid(&init_user_ns, get_gid(mnt, top));
+ tmp.i_mode = (inode->i_mode & S_IFMT)
+ | get_mode(mnt, SDCARDFS_I(inode), top);
+ data_put(top);
+ tmp.i_sb = inode->i_sb;
+ if (IS_POSIXACL(inode))
+ pr_warn("%s: This may be undefined behavior...\n", __func__);
+ err = generic_permission(&tmp, mask);
+ /* XXX
+ * Original sdcardfs code calls inode_permission(lower_inode,.. )
+ * for checking inode permission. But doing such things here seems
+ * duplicated work, because the functions called after this func,
+ * such as vfs_create, vfs_unlink, vfs_rename, and etc,
+ * does exactly same thing, i.e., they calls inode_permission().
+ * So we just let they do the things.
+ * If there are any security hole, just uncomment following if block.
+ */
+#if 0
+ if (!err) {
+ /*
+ * Permission check on lower_inode(=EXT4).
+ * we check it with AID_MEDIA_RW permission
+ */
+ struct inode *lower_inode;
+
+ OVERRIDE_CRED(SDCARDFS_SB(inode->sb));
+
+ lower_inode = sdcardfs_lower_inode(inode);
+ err = inode_permission(lower_inode, mask);
+
+ REVERT_CRED();
+ }
+#endif
+ return err;
+
+}
+
+static int sdcardfs_setattr_wrn(struct dentry *dentry, struct iattr *ia)
+{
+ WARN_RATELIMIT(1, "sdcardfs does not support setattr. User setattr2.\n");
+ return -EINVAL;
+}
+
+static int sdcardfs_setattr(struct vfsmount *mnt, struct dentry *dentry, struct iattr *ia)
+{
+ int err;
+ struct dentry *lower_dentry;
+ struct vfsmount *lower_mnt;
+ struct inode *inode;
+ struct inode *lower_inode;
+ struct path lower_path;
+ struct iattr lower_ia;
+ struct dentry *parent;
+ struct inode tmp;
+ struct sdcardfs_inode_data *top;
+ const struct cred *saved_cred = NULL;
+
+ inode = dentry->d_inode;
+ top = top_data_get(SDCARDFS_I(inode));
+
+ if (!top)
+ return -EINVAL;
+
+ /*
+ * Permission check on sdcardfs inode.
+ * Calling process should have AID_SDCARD_RW permission
+ * Since generic_permission only needs i_mode, i_uid,
+ * i_gid, and i_sb, we can create a fake inode to pass
+ * this information down in.
+ *
+ * The underlying code may attempt to take locks in some
+ * cases for features we're not using, but if that changes,
+ * locks must be dealt with to avoid undefined behavior.
+ *
+ */
+ copy_attrs(&tmp, inode);
+ tmp.i_uid = make_kuid(&init_user_ns, top->d_uid);
+ tmp.i_gid = make_kgid(&init_user_ns, get_gid(mnt, top));
+ tmp.i_mode = (inode->i_mode & S_IFMT)
+ | get_mode(mnt, SDCARDFS_I(inode), top);
+ tmp.i_size = i_size_read(inode);
+ data_put(top);
+ tmp.i_sb = inode->i_sb;
+
+ /*
+ * Check if user has permission to change inode. We don't check if
+ * this user can change the lower inode: that should happen when
+ * calling notify_change on the lower inode.
+ */
+ /* prepare our own lower struct iattr (with the lower file) */
+ memcpy(&lower_ia, ia, sizeof(lower_ia));
+ /* Allow touch updating timestamps. A previous permission check ensures
+ * we have write access. Changes to mode, owner, and group are ignored
+ */
+ ia->ia_valid |= ATTR_FORCE;
+ err = inode_change_ok(&tmp, ia);
+
+ if (!err) {
+ /* check the Android group ID */
+ parent = dget_parent(dentry);
+ if (!check_caller_access_to_name(parent->d_inode, &dentry->d_name))
+ err = -EACCES;
+ dput(parent);
+ }
+
+ if (err)
+ goto out_err;
+
+ /* save current_cred and override it */
+ OVERRIDE_CRED(SDCARDFS_SB(dentry->d_sb), saved_cred, SDCARDFS_I(inode));
+
+ sdcardfs_get_lower_path(dentry, &lower_path);
+ lower_dentry = lower_path.dentry;
+ lower_mnt = lower_path.mnt;
+ lower_inode = sdcardfs_lower_inode(inode);
+
+ if (ia->ia_valid & ATTR_FILE)
+ lower_ia.ia_file = sdcardfs_lower_file(ia->ia_file);
+
+ lower_ia.ia_valid &= ~(ATTR_UID | ATTR_GID | ATTR_MODE);
+
+ /*
+ * If shrinking, first truncate upper level to cancel writing dirty
+ * pages beyond the new eof; and also if its' maxbytes is more
+ * limiting (fail with -EFBIG before making any change to the lower
+ * level). There is no need to vmtruncate the upper level
+ * afterwards in the other cases: we fsstack_copy_inode_size from
+ * the lower level.
+ */
+ if (current->mm)
+ down_write(&current->mm->mmap_sem);
+ if (ia->ia_valid & ATTR_SIZE) {
+ err = inode_newsize_ok(&tmp, ia->ia_size);
+ if (err) {
+ if (current->mm)
+ up_write(&current->mm->mmap_sem);
+ goto out;
+ }
+ truncate_setsize(inode, ia->ia_size);
+ }
+
+ /*
+ * mode change is for clearing setuid/setgid bits. Allow lower fs
+ * to interpret this in its own way.
+ */
+ if (lower_ia.ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID))
+ lower_ia.ia_valid &= ~ATTR_MODE;
+
+ /* notify the (possibly copied-up) lower inode */
+ /*
+ * Note: we use lower_dentry->d_inode, because lower_inode may be
+ * unlinked (no inode->i_sb and i_ino==0. This happens if someone
+ * tries to open(), unlink(), then ftruncate() a file.
+ */
+ mutex_lock(&lower_dentry->d_inode->i_mutex);
+ err = notify_change2(lower_mnt, lower_dentry, &lower_ia, /* note: lower_ia */
+ NULL);
+ mutex_unlock(&lower_dentry->d_inode->i_mutex);
+ if (current->mm)
+ up_write(&current->mm->mmap_sem);
+ if (err)
+ goto out;
+
+ /* get attributes from the lower inode and update derived permissions */
+ sdcardfs_copy_and_fix_attrs(inode, lower_inode);
+
+ /*
+ * Not running fsstack_copy_inode_size(inode, lower_inode), because
+ * VFS should update our inode size, and notify_change on
+ * lower_inode should update its size.
+ */
+
+out:
+ sdcardfs_put_lower_path(dentry, &lower_path);
+ REVERT_CRED(saved_cred);
+out_err:
+ return err;
+}
+
+static int sdcardfs_fillattr(struct vfsmount *mnt,
+ struct inode *inode, struct kstat *stat)
+{
+ struct sdcardfs_inode_info *info = SDCARDFS_I(inode);
+ struct sdcardfs_inode_data *top = top_data_get(info);
+
+ if (!top)
+ return -EINVAL;
+
+ stat->dev = inode->i_sb->s_dev;
+ stat->ino = inode->i_ino;
+ stat->mode = (inode->i_mode & S_IFMT) | get_mode(mnt, info, top);
+ stat->nlink = inode->i_nlink;
+ stat->uid = make_kuid(&init_user_ns, top->d_uid);
+ stat->gid = make_kgid(&init_user_ns, get_gid(mnt, top));
+ stat->rdev = inode->i_rdev;
+ stat->size = i_size_read(inode);
+ stat->atime = inode->i_atime;
+ stat->mtime = inode->i_mtime;
+ stat->ctime = inode->i_ctime;
+ stat->blksize = (1 << inode->i_blkbits);
+ stat->blocks = inode->i_blocks;
+ data_put(top);
+ return 0;
+}
+
+static int sdcardfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ struct kstat lower_stat;
+ struct path lower_path;
+ struct dentry *parent;
+ int err;
+
+ parent = dget_parent(dentry);
+ if (!check_caller_access_to_name(parent->d_inode, &dentry->d_name)) {
+ dput(parent);
+ return -EACCES;
+ }
+ dput(parent);
+
+ sdcardfs_get_lower_path(dentry, &lower_path);
+ err = vfs_getattr(&lower_path, &lower_stat);
+ if (err)
+ goto out;
+ sdcardfs_copy_and_fix_attrs(dentry->d_inode,
+ lower_path.dentry->d_inode);
+ err = sdcardfs_fillattr(mnt, dentry->d_inode, stat);
+ stat->blocks = lower_stat.blocks;
+out:
+ sdcardfs_put_lower_path(dentry, &lower_path);
+ return err;
+}
+
+const struct inode_operations sdcardfs_symlink_iops = {
+ .permission2 = sdcardfs_permission,
+ .setattr2 = sdcardfs_setattr,
+ /* XXX Following operations are implemented,
+ * but FUSE(sdcard) or FAT does not support them
+ * These methods are *NOT* perfectly tested.
+ .readlink = sdcardfs_readlink,
+ .follow_link = sdcardfs_follow_link,
+ .put_link = kfree_put_link,
+ */
+};
+
+const struct inode_operations sdcardfs_dir_iops = {
+ .create = sdcardfs_create,
+ .lookup = sdcardfs_lookup,
+ .permission = sdcardfs_permission_wrn,
+ .permission2 = sdcardfs_permission,
+ .unlink = sdcardfs_unlink,
+ .mkdir = sdcardfs_mkdir,
+ .rmdir = sdcardfs_rmdir,
+ .rename = sdcardfs_rename,
+ .setattr = sdcardfs_setattr_wrn,
+ .setattr2 = sdcardfs_setattr,
+ .getattr = sdcardfs_getattr,
+ /* XXX Following operations are implemented,
+ * but FUSE(sdcard) or FAT does not support them
+ * These methods are *NOT* perfectly tested.
+ .symlink = sdcardfs_symlink,
+ .link = sdcardfs_link,
+ .mknod = sdcardfs_mknod,
+ */
+};
+
+const struct inode_operations sdcardfs_main_iops = {
+ .permission = sdcardfs_permission_wrn,
+ .permission2 = sdcardfs_permission,
+ .setattr = sdcardfs_setattr_wrn,
+ .setattr2 = sdcardfs_setattr,
+ .getattr = sdcardfs_getattr,
+};
diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c
new file mode 100644
index 000000000000..ace852390d2a
--- /dev/null
+++ b/fs/sdcardfs/lookup.c
@@ -0,0 +1,466 @@
+/*
+ * fs/sdcardfs/lookup.c
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ * Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009 Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed. It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#include "sdcardfs.h"
+#include "linux/delay.h"
+
+/* The dentry cache is just so we have properly sized dentries */
+static struct kmem_cache *sdcardfs_dentry_cachep;
+
+int sdcardfs_init_dentry_cache(void)
+{
+ sdcardfs_dentry_cachep =
+ kmem_cache_create("sdcardfs_dentry",
+ sizeof(struct sdcardfs_dentry_info),
+ 0, SLAB_RECLAIM_ACCOUNT, NULL);
+
+ return sdcardfs_dentry_cachep ? 0 : -ENOMEM;
+}
+
+void sdcardfs_destroy_dentry_cache(void)
+{
+ kmem_cache_destroy(sdcardfs_dentry_cachep);
+}
+
+void free_dentry_private_data(struct dentry *dentry)
+{
+ if (!dentry || !dentry->d_fsdata)
+ return;
+ kmem_cache_free(sdcardfs_dentry_cachep, dentry->d_fsdata);
+ dentry->d_fsdata = NULL;
+}
+
+/* allocate new dentry private data */
+int new_dentry_private_data(struct dentry *dentry)
+{
+ struct sdcardfs_dentry_info *info = SDCARDFS_D(dentry);
+
+ /* use zalloc to init dentry_info.lower_path */
+ info = kmem_cache_zalloc(sdcardfs_dentry_cachep, GFP_ATOMIC);
+ if (!info)
+ return -ENOMEM;
+
+ spin_lock_init(&info->lock);
+ dentry->d_fsdata = info;
+
+ return 0;
+}
+
+struct inode_data {
+ struct inode *lower_inode;
+ userid_t id;
+};
+
+static int sdcardfs_inode_test(struct inode *inode, void *candidate_data/*void *candidate_lower_inode*/)
+{
+ struct inode *current_lower_inode = sdcardfs_lower_inode(inode);
+ userid_t current_userid = SDCARDFS_I(inode)->data->userid;
+
+ if (current_lower_inode == ((struct inode_data *)candidate_data)->lower_inode &&
+ current_userid == ((struct inode_data *)candidate_data)->id)
+ return 1; /* found a match */
+ else
+ return 0; /* no match */
+}
+
+static int sdcardfs_inode_set(struct inode *inode, void *lower_inode)
+{
+ /* we do actual inode initialization in sdcardfs_iget */
+ return 0;
+}
+
+struct inode *sdcardfs_iget(struct super_block *sb, struct inode *lower_inode, userid_t id)
+{
+ struct sdcardfs_inode_info *info;
+ struct inode_data data;
+ struct inode *inode; /* the new inode to return */
+
+ if (!igrab(lower_inode))
+ return ERR_PTR(-ESTALE);
+
+ data.id = id;
+ data.lower_inode = lower_inode;
+ inode = iget5_locked(sb, /* our superblock */
+ /*
+ * hashval: we use inode number, but we can
+ * also use "(unsigned long)lower_inode"
+ * instead.
+ */
+ lower_inode->i_ino, /* hashval */
+ sdcardfs_inode_test, /* inode comparison function */
+ sdcardfs_inode_set, /* inode init function */
+ &data); /* data passed to test+set fxns */
+ if (!inode) {
+ iput(lower_inode);
+ return ERR_PTR(-ENOMEM);
+ }
+ /* if found a cached inode, then just return it (after iput) */
+ if (!(inode->i_state & I_NEW)) {
+ iput(lower_inode);
+ return inode;
+ }
+
+ /* initialize new inode */
+ info = SDCARDFS_I(inode);
+
+ inode->i_ino = lower_inode->i_ino;
+ sdcardfs_set_lower_inode(inode, lower_inode);
+
+ inode->i_version++;
+
+ /* use different set of inode ops for symlinks & directories */
+ if (S_ISDIR(lower_inode->i_mode))
+ inode->i_op = &sdcardfs_dir_iops;
+ else if (S_ISLNK(lower_inode->i_mode))
+ inode->i_op = &sdcardfs_symlink_iops;
+ else
+ inode->i_op = &sdcardfs_main_iops;
+
+ /* use different set of file ops for directories */
+ if (S_ISDIR(lower_inode->i_mode))
+ inode->i_fop = &sdcardfs_dir_fops;
+ else
+ inode->i_fop = &sdcardfs_main_fops;
+
+ inode->i_mapping->a_ops = &sdcardfs_aops;
+
+ inode->i_atime.tv_sec = 0;
+ inode->i_atime.tv_nsec = 0;
+ inode->i_mtime.tv_sec = 0;
+ inode->i_mtime.tv_nsec = 0;
+ inode->i_ctime.tv_sec = 0;
+ inode->i_ctime.tv_nsec = 0;
+
+ /* properly initialize special inodes */
+ if (S_ISBLK(lower_inode->i_mode) || S_ISCHR(lower_inode->i_mode) ||
+ S_ISFIFO(lower_inode->i_mode) || S_ISSOCK(lower_inode->i_mode))
+ init_special_inode(inode, lower_inode->i_mode,
+ lower_inode->i_rdev);
+
+ /* all well, copy inode attributes */
+ sdcardfs_copy_and_fix_attrs(inode, lower_inode);
+ fsstack_copy_inode_size(inode, lower_inode);
+
+ unlock_new_inode(inode);
+ return inode;
+}
+
+/*
+ * Helper interpose routine, called directly by ->lookup to handle
+ * spliced dentries.
+ */
+static struct dentry *__sdcardfs_interpose(struct dentry *dentry,
+ struct super_block *sb,
+ struct path *lower_path,
+ userid_t id)
+{
+ struct inode *inode;
+ struct inode *lower_inode;
+ struct super_block *lower_sb;
+ struct dentry *ret_dentry;
+
+ lower_inode = lower_path->dentry->d_inode;
+ lower_sb = sdcardfs_lower_super(sb);
+
+ /* check that the lower file system didn't cross a mount point */
+ if (lower_inode->i_sb != lower_sb) {
+ ret_dentry = ERR_PTR(-EXDEV);
+ goto out;
+ }
+
+ /*
+ * We allocate our new inode below by calling sdcardfs_iget,
+ * which will initialize some of the new inode's fields
+ */
+
+ /* inherit lower inode number for sdcardfs's inode */
+ inode = sdcardfs_iget(sb, lower_inode, id);
+ if (IS_ERR(inode)) {
+ ret_dentry = ERR_CAST(inode);
+ goto out;
+ }
+
+ ret_dentry = d_splice_alias(inode, dentry);
+ dentry = ret_dentry ?: dentry;
+ if (!IS_ERR(dentry))
+ update_derived_permission_lock(dentry);
+out:
+ return ret_dentry;
+}
+
+/*
+ * Connect an sdcardfs inode dentry/inode with several lower ones. This is
+ * the classic stackable file system "vnode interposition" action.
+ *
+ * @dentry: sdcardfs's dentry which interposes on lower one
+ * @sb: sdcardfs's super_block
+ * @lower_path: the lower path (caller does path_get/put)
+ */
+int sdcardfs_interpose(struct dentry *dentry, struct super_block *sb,
+ struct path *lower_path, userid_t id)
+{
+ struct dentry *ret_dentry;
+
+ ret_dentry = __sdcardfs_interpose(dentry, sb, lower_path, id);
+ return PTR_ERR(ret_dentry);
+}
+
+struct sdcardfs_name_data {
+ struct dir_context ctx;
+ const struct qstr *to_find;
+ char *name;
+ bool found;
+};
+
+static int sdcardfs_name_match(void *__buf, const char *name, int namelen,
+ loff_t offset, u64 ino, unsigned int d_type)
+{
+ struct sdcardfs_name_data *buf = (struct sdcardfs_name_data *) __buf;
+ struct qstr candidate = QSTR_INIT(name, namelen);
+
+ if (qstr_case_eq(buf->to_find, &candidate)) {
+ memcpy(buf->name, name, namelen);
+ buf->name[namelen] = 0;
+ buf->found = true;
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Main driver function for sdcardfs's lookup.
+ *
+ * Returns: NULL (ok), ERR_PTR if an error occurred.
+ * Fills in lower_parent_path with <dentry,mnt> on success.
+ */
+static struct dentry *__sdcardfs_lookup(struct dentry *dentry,
+ unsigned int flags, struct path *lower_parent_path, userid_t id)
+{
+ int err = 0;
+ struct vfsmount *lower_dir_mnt;
+ struct dentry *lower_dir_dentry = NULL;
+ struct dentry *lower_dentry;
+ const struct qstr *name;
+ struct path lower_path;
+ struct qstr dname;
+ struct dentry *ret_dentry = NULL;
+ struct sdcardfs_sb_info *sbi;
+
+ sbi = SDCARDFS_SB(dentry->d_sb);
+ /* must initialize dentry operations */
+ d_set_d_op(dentry, &sdcardfs_ci_dops);
+
+ if (IS_ROOT(dentry))
+ goto out;
+
+ name = &dentry->d_name;
+
+ /* now start the actual lookup procedure */
+ lower_dir_dentry = lower_parent_path->dentry;
+ lower_dir_mnt = lower_parent_path->mnt;
+
+ /* Use vfs_path_lookup to check if the dentry exists or not */
+ err = vfs_path_lookup(lower_dir_dentry, lower_dir_mnt, name->name, 0,
+ &lower_path);
+ /* check for other cases */
+ if (err == -ENOENT) {
+ struct file *file;
+ const struct cred *cred = current_cred();
+
+ struct sdcardfs_name_data buffer = {
+ .ctx.actor = sdcardfs_name_match,
+ .to_find = name,
+ .name = __getname(),
+ .found = false,
+ };
+
+ if (!buffer.name) {
+ err = -ENOMEM;
+ goto out;
+ }
+ file = dentry_open(lower_parent_path, O_RDONLY, cred);
+ if (IS_ERR(file)) {
+ err = PTR_ERR(file);
+ goto put_name;
+ }
+ err = iterate_dir(file, &buffer.ctx);
+ fput(file);
+ if (err)
+ goto put_name;
+
+ if (buffer.found)
+ err = vfs_path_lookup(lower_dir_dentry,
+ lower_dir_mnt,
+ buffer.name, 0,
+ &lower_path);
+ else
+ err = -ENOENT;
+put_name:
+ __putname(buffer.name);
+ }
+
+ /* no error: handle positive dentries */
+ if (!err) {
+ /* check if the dentry is an obb dentry
+ * if true, the lower_inode must be replaced with
+ * the inode of the graft path
+ */
+
+ if (need_graft_path(dentry)) {
+
+ /* setup_obb_dentry()
+ * The lower_path will be stored to the dentry's orig_path
+ * and the base obbpath will be copyed to the lower_path variable.
+ * if an error returned, there's no change in the lower_path
+ * returns: -ERRNO if error (0: no error)
+ */
+ err = setup_obb_dentry(dentry, &lower_path);
+
+ if (err) {
+ /* if the sbi->obbpath is not available, we can optionally
+ * setup the lower_path with its orig_path.
+ * but, the current implementation just returns an error
+ * because the sdcard daemon also regards this case as
+ * a lookup fail.
+ */
+ pr_info("sdcardfs: base obbpath is not available\n");
+ sdcardfs_put_reset_orig_path(dentry);
+ goto out;
+ }
+ }
+
+ sdcardfs_set_lower_path(dentry, &lower_path);
+ ret_dentry =
+ __sdcardfs_interpose(dentry, dentry->d_sb, &lower_path, id);
+ if (IS_ERR(ret_dentry)) {
+ err = PTR_ERR(ret_dentry);
+ /* path_put underlying path on error */
+ sdcardfs_put_reset_lower_path(dentry);
+ }
+ goto out;
+ }
+
+ /*
+ * We don't consider ENOENT an error, and we want to return a
+ * negative dentry.
+ */
+ if (err && err != -ENOENT)
+ goto out;
+
+ /* instatiate a new negative dentry */
+ dname.name = name->name;
+ dname.len = name->len;
+
+ /* See if the low-level filesystem might want
+ * to use its own hash
+ */
+ lower_dentry = d_hash_and_lookup(lower_dir_dentry, &dname);
+ if (IS_ERR(lower_dentry))
+ return lower_dentry;
+ if (!lower_dentry) {
+ /* We called vfs_path_lookup earlier, and did not get a negative
+ * dentry then. Don't confuse the lower filesystem by forcing
+ * one on it now...
+ */
+ err = -ENOENT;
+ goto out;
+ }
+
+ lower_path.dentry = lower_dentry;
+ lower_path.mnt = mntget(lower_dir_mnt);
+ sdcardfs_set_lower_path(dentry, &lower_path);
+
+ /*
+ * If the intent is to create a file, then don't return an error, so
+ * the VFS will continue the process of making this negative dentry
+ * into a positive one.
+ */
+ if (flags & (LOOKUP_CREATE|LOOKUP_RENAME_TARGET))
+ err = 0;
+
+out:
+ if (err)
+ return ERR_PTR(err);
+ return ret_dentry;
+}
+
+/*
+ * On success:
+ * fills dentry object appropriate values and returns NULL.
+ * On fail (== error)
+ * returns error ptr
+ *
+ * @dir : Parent inode. It is locked (dir->i_mutex)
+ * @dentry : Target dentry to lookup. we should set each of fields.
+ * (dentry->d_name is initialized already)
+ * @nd : nameidata of parent inode
+ */
+struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ struct dentry *ret = NULL, *parent;
+ struct path lower_parent_path;
+ int err = 0;
+ const struct cred *saved_cred = NULL;
+
+ parent = dget_parent(dentry);
+
+ if (!check_caller_access_to_name(parent->d_inode, &dentry->d_name)) {
+ ret = ERR_PTR(-EACCES);
+ goto out_err;
+ }
+
+ /* save current_cred and override it */
+ OVERRIDE_CRED_PTR(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(dir));
+
+ sdcardfs_get_lower_path(parent, &lower_parent_path);
+
+ /* allocate dentry private data. We free it in ->d_release */
+ err = new_dentry_private_data(dentry);
+ if (err) {
+ ret = ERR_PTR(err);
+ goto out;
+ }
+
+ ret = __sdcardfs_lookup(dentry, flags, &lower_parent_path,
+ SDCARDFS_I(dir)->data->userid);
+ if (IS_ERR(ret))
+ goto out;
+ if (ret)
+ dentry = ret;
+ if (dentry->d_inode) {
+ fsstack_copy_attr_times(dentry->d_inode,
+ sdcardfs_lower_inode(dentry->d_inode));
+ /* get derived permission */
+ get_derived_permission(parent, dentry);
+ fixup_tmp_permissions(dentry->d_inode);
+ fixup_lower_ownership(dentry, dentry->d_name.name);
+ }
+ /* update parent directory's atime */
+ fsstack_copy_attr_atime(parent->d_inode,
+ sdcardfs_lower_inode(parent->d_inode));
+
+out:
+ sdcardfs_put_lower_path(parent, &lower_parent_path);
+ REVERT_CRED(saved_cred);
+out_err:
+ dput(parent);
+ return ret;
+}
diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c
new file mode 100644
index 000000000000..dd4b87c7cb1d
--- /dev/null
+++ b/fs/sdcardfs/main.c
@@ -0,0 +1,486 @@
+/*
+ * fs/sdcardfs/main.c
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ * Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009 Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed. It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#include "sdcardfs.h"
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/parser.h>
+
+enum {
+ Opt_fsuid,
+ Opt_fsgid,
+ Opt_gid,
+ Opt_debug,
+ Opt_mask,
+ Opt_multiuser,
+ Opt_userid,
+ Opt_reserved_mb,
+ Opt_err,
+};
+
+static const match_table_t sdcardfs_tokens = {
+ {Opt_fsuid, "fsuid=%u"},
+ {Opt_fsgid, "fsgid=%u"},
+ {Opt_gid, "gid=%u"},
+ {Opt_debug, "debug"},
+ {Opt_mask, "mask=%u"},
+ {Opt_userid, "userid=%d"},
+ {Opt_multiuser, "multiuser"},
+ {Opt_reserved_mb, "reserved_mb=%u"},
+ {Opt_err, NULL}
+};
+
+static int parse_options(struct super_block *sb, char *options, int silent,
+ int *debug, struct sdcardfs_vfsmount_options *vfsopts,
+ struct sdcardfs_mount_options *opts)
+{
+ char *p;
+ substring_t args[MAX_OPT_ARGS];
+ int option;
+
+ /* by default, we use AID_MEDIA_RW as uid, gid */
+ opts->fs_low_uid = AID_MEDIA_RW;
+ opts->fs_low_gid = AID_MEDIA_RW;
+ vfsopts->mask = 0;
+ opts->multiuser = false;
+ opts->fs_user_id = 0;
+ vfsopts->gid = 0;
+ /* by default, 0MB is reserved */
+ opts->reserved_mb = 0;
+
+ *debug = 0;
+
+ if (!options)
+ return 0;
+
+ while ((p = strsep(&options, ",")) != NULL) {
+ int token;
+
+ if (!*p)
+ continue;
+
+ token = match_token(p, sdcardfs_tokens, args);
+
+ switch (token) {
+ case Opt_debug:
+ *debug = 1;
+ break;
+ case Opt_fsuid:
+ if (match_int(&args[0], &option))
+ return 0;
+ opts->fs_low_uid = option;
+ break;
+ case Opt_fsgid:
+ if (match_int(&args[0], &option))
+ return 0;
+ opts->fs_low_gid = option;
+ break;
+ case Opt_gid:
+ if (match_int(&args[0], &option))
+ return 0;
+ vfsopts->gid = option;
+ break;
+ case Opt_userid:
+ if (match_int(&args[0], &option))
+ return 0;
+ opts->fs_user_id = option;
+ break;
+ case Opt_mask:
+ if (match_int(&args[0], &option))
+ return 0;
+ vfsopts->mask = option;
+ break;
+ case Opt_multiuser:
+ opts->multiuser = true;
+ break;
+ case Opt_reserved_mb:
+ if (match_int(&args[0], &option))
+ return 0;
+ opts->reserved_mb = option;
+ break;
+ /* unknown option */
+ default:
+ if (!silent)
+ pr_err("Unrecognized mount option \"%s\" or missing value", p);
+ return -EINVAL;
+ }
+ }
+
+ if (*debug) {
+ pr_info("sdcardfs : options - debug:%d\n", *debug);
+ pr_info("sdcardfs : options - uid:%d\n",
+ opts->fs_low_uid);
+ pr_info("sdcardfs : options - gid:%d\n",
+ opts->fs_low_gid);
+ }
+
+ return 0;
+}
+
+int parse_options_remount(struct super_block *sb, char *options, int silent,
+ struct sdcardfs_vfsmount_options *vfsopts)
+{
+ char *p;
+ substring_t args[MAX_OPT_ARGS];
+ int option;
+ int debug;
+
+ if (!options)
+ return 0;
+
+ while ((p = strsep(&options, ",")) != NULL) {
+ int token;
+
+ if (!*p)
+ continue;
+
+ token = match_token(p, sdcardfs_tokens, args);
+
+ switch (token) {
+ case Opt_debug:
+ debug = 1;
+ break;
+ case Opt_gid:
+ if (match_int(&args[0], &option))
+ return 0;
+ vfsopts->gid = option;
+
+ break;
+ case Opt_mask:
+ if (match_int(&args[0], &option))
+ return 0;
+ vfsopts->mask = option;
+ break;
+ case Opt_multiuser:
+ case Opt_userid:
+ case Opt_fsuid:
+ case Opt_fsgid:
+ case Opt_reserved_mb:
+ pr_warn("Option \"%s\" can't be changed during remount\n", p);
+ break;
+ /* unknown option */
+ default:
+ if (!silent)
+ pr_err("Unrecognized mount option \"%s\" or missing value", p);
+ return -EINVAL;
+ }
+ }
+
+ if (debug) {
+ pr_info("sdcardfs : options - debug:%d\n", debug);
+ pr_info("sdcardfs : options - gid:%d\n", vfsopts->gid);
+ pr_info("sdcardfs : options - mask:%d\n", vfsopts->mask);
+ }
+
+ return 0;
+}
+
+#if 0
+/*
+ * our custom d_alloc_root work-alike
+ *
+ * we can't use d_alloc_root if we want to use our own interpose function
+ * unchanged, so we simply call our own "fake" d_alloc_root
+ */
+static struct dentry *sdcardfs_d_alloc_root(struct super_block *sb)
+{
+ struct dentry *ret = NULL;
+
+ if (sb) {
+ static const struct qstr name = {
+ .name = "/",
+ .len = 1
+ };
+
+ ret = d_alloc(NULL, &name);
+ if (ret) {
+ d_set_d_op(ret, &sdcardfs_ci_dops);
+ ret->d_sb = sb;
+ ret->d_parent = ret;
+ }
+ }
+ return ret;
+}
+#endif
+
+DEFINE_MUTEX(sdcardfs_super_list_lock);
+EXPORT_SYMBOL_GPL(sdcardfs_super_list_lock);
+LIST_HEAD(sdcardfs_super_list);
+EXPORT_SYMBOL_GPL(sdcardfs_super_list);
+
+/*
+ * There is no need to lock the sdcardfs_super_info's rwsem as there is no
+ * way anyone can have a reference to the superblock at this point in time.
+ */
+static int sdcardfs_read_super(struct vfsmount *mnt, struct super_block *sb,
+ const char *dev_name, void *raw_data, int silent)
+{
+ int err = 0;
+ int debug;
+ struct super_block *lower_sb;
+ struct path lower_path;
+ struct sdcardfs_sb_info *sb_info;
+ struct sdcardfs_vfsmount_options *mnt_opt = mnt->data;
+ struct inode *inode;
+
+ pr_info("sdcardfs version 2.0\n");
+
+ if (!dev_name) {
+ pr_err("sdcardfs: read_super: missing dev_name argument\n");
+ err = -EINVAL;
+ goto out;
+ }
+
+ pr_info("sdcardfs: dev_name -> %s\n", dev_name);
+ pr_info("sdcardfs: options -> %s\n", (char *)raw_data);
+ pr_info("sdcardfs: mnt -> %p\n", mnt);
+
+ /* parse lower path */
+ err = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY,
+ &lower_path);
+ if (err) {
+ pr_err("sdcardfs: error accessing lower directory '%s'\n", dev_name);
+ goto out;
+ }
+
+ /* allocate superblock private data */
+ sb->s_fs_info = kzalloc(sizeof(struct sdcardfs_sb_info), GFP_KERNEL);
+ if (!SDCARDFS_SB(sb)) {
+ pr_crit("sdcardfs: read_super: out of memory\n");
+ err = -ENOMEM;
+ goto out_free;
+ }
+
+ sb_info = sb->s_fs_info;
+ /* parse options */
+ err = parse_options(sb, raw_data, silent, &debug, mnt_opt, &sb_info->options);
+ if (err) {
+ pr_err("sdcardfs: invalid options\n");
+ goto out_freesbi;
+ }
+
+ /* set the lower superblock field of upper superblock */
+ lower_sb = lower_path.dentry->d_sb;
+ atomic_inc(&lower_sb->s_active);
+ sdcardfs_set_lower_super(sb, lower_sb);
+
+ /* inherit maxbytes from lower file system */
+ sb->s_maxbytes = lower_sb->s_maxbytes;
+
+ /*
+ * Our c/m/atime granularity is 1 ns because we may stack on file
+ * systems whose granularity is as good.
+ */
+ sb->s_time_gran = 1;
+
+ sb->s_magic = SDCARDFS_SUPER_MAGIC;
+ sb->s_op = &sdcardfs_sops;
+
+ /* get a new inode and allocate our root dentry */
+ inode = sdcardfs_iget(sb, lower_path.dentry->d_inode, 0);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto out_sput;
+ }
+ sb->s_root = d_make_root(inode);
+ if (!sb->s_root) {
+ err = -ENOMEM;
+ goto out_iput;
+ }
+ d_set_d_op(sb->s_root, &sdcardfs_ci_dops);
+
+ /* link the upper and lower dentries */
+ sb->s_root->d_fsdata = NULL;
+ err = new_dentry_private_data(sb->s_root);
+ if (err)
+ goto out_freeroot;
+
+ /* set the lower dentries for s_root */
+ sdcardfs_set_lower_path(sb->s_root, &lower_path);
+
+ /*
+ * No need to call interpose because we already have a positive
+ * dentry, which was instantiated by d_make_root. Just need to
+ * d_rehash it.
+ */
+ d_rehash(sb->s_root);
+
+ /* setup permission policy */
+ sb_info->obbpath_s = kzalloc(PATH_MAX, GFP_KERNEL);
+ mutex_lock(&sdcardfs_super_list_lock);
+ if (sb_info->options.multiuser) {
+ setup_derived_state(sb->s_root->d_inode, PERM_PRE_ROOT,
+ sb_info->options.fs_user_id, AID_ROOT,
+ false, SDCARDFS_I(sb->s_root->d_inode)->data);
+ snprintf(sb_info->obbpath_s, PATH_MAX, "%s/obb", dev_name);
+ } else {
+ setup_derived_state(sb->s_root->d_inode, PERM_ROOT,
+ sb_info->options.fs_user_id, AID_ROOT,
+ false, SDCARDFS_I(sb->s_root->d_inode)->data);
+ snprintf(sb_info->obbpath_s, PATH_MAX, "%s/Android/obb", dev_name);
+ }
+ fixup_tmp_permissions(sb->s_root->d_inode);
+ sb_info->sb = sb;
+ list_add(&sb_info->list, &sdcardfs_super_list);
+ mutex_unlock(&sdcardfs_super_list_lock);
+
+ if (!silent)
+ pr_info("sdcardfs: mounted on top of %s type %s\n",
+ dev_name, lower_sb->s_type->name);
+ goto out; /* all is well */
+
+ /* no longer needed: free_dentry_private_data(sb->s_root); */
+out_freeroot:
+ dput(sb->s_root);
+out_iput:
+ iput(inode);
+out_sput:
+ /* drop refs we took earlier */
+ atomic_dec(&lower_sb->s_active);
+out_freesbi:
+ kfree(SDCARDFS_SB(sb));
+ sb->s_fs_info = NULL;
+out_free:
+ path_put(&lower_path);
+
+out:
+ return err;
+}
+
+/* A feature which supports mount_nodev() with options */
+static struct dentry *mount_nodev_with_options(struct vfsmount *mnt,
+ struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *data,
+ int (*fill_super)(struct vfsmount *, struct super_block *,
+ const char *, void *, int))
+
+{
+ int error;
+ struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL);
+
+ if (IS_ERR(s))
+ return ERR_CAST(s);
+
+ s->s_flags = flags;
+
+ error = fill_super(mnt, s, dev_name, data, flags & MS_SILENT ? 1 : 0);
+ if (error) {
+ deactivate_locked_super(s);
+ return ERR_PTR(error);
+ }
+ s->s_flags |= MS_ACTIVE;
+ return dget(s->s_root);
+}
+
+static struct dentry *sdcardfs_mount(struct vfsmount *mnt,
+ struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *raw_data)
+{
+ /*
+ * dev_name is a lower_path_name,
+ * raw_data is a option string.
+ */
+ return mount_nodev_with_options(mnt, fs_type, flags, dev_name,
+ raw_data, sdcardfs_read_super);
+}
+
+static struct dentry *sdcardfs_mount_wrn(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *raw_data)
+{
+ WARN(1, "sdcardfs does not support mount. Use mount2.\n");
+ return ERR_PTR(-EINVAL);
+}
+
+void *sdcardfs_alloc_mnt_data(void)
+{
+ return kmalloc(sizeof(struct sdcardfs_vfsmount_options), GFP_KERNEL);
+}
+
+void sdcardfs_kill_sb(struct super_block *sb)
+{
+ struct sdcardfs_sb_info *sbi;
+
+ if (sb->s_magic == SDCARDFS_SUPER_MAGIC) {
+ sbi = SDCARDFS_SB(sb);
+ mutex_lock(&sdcardfs_super_list_lock);
+ list_del(&sbi->list);
+ mutex_unlock(&sdcardfs_super_list_lock);
+ }
+ generic_shutdown_super(sb);
+}
+
+static struct file_system_type sdcardfs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = SDCARDFS_NAME,
+ .mount = sdcardfs_mount_wrn,
+ .mount2 = sdcardfs_mount,
+ .alloc_mnt_data = sdcardfs_alloc_mnt_data,
+ .kill_sb = sdcardfs_kill_sb,
+ .fs_flags = 0,
+};
+MODULE_ALIAS_FS(SDCARDFS_NAME);
+
+static int __init init_sdcardfs_fs(void)
+{
+ int err;
+
+ pr_info("Registering sdcardfs " SDCARDFS_VERSION "\n");
+
+ err = sdcardfs_init_inode_cache();
+ if (err)
+ goto out;
+ err = sdcardfs_init_dentry_cache();
+ if (err)
+ goto out;
+ err = packagelist_init();
+ if (err)
+ goto out;
+ err = register_filesystem(&sdcardfs_fs_type);
+out:
+ if (err) {
+ sdcardfs_destroy_inode_cache();
+ sdcardfs_destroy_dentry_cache();
+ packagelist_exit();
+ }
+ return err;
+}
+
+static void __exit exit_sdcardfs_fs(void)
+{
+ sdcardfs_destroy_inode_cache();
+ sdcardfs_destroy_dentry_cache();
+ packagelist_exit();
+ unregister_filesystem(&sdcardfs_fs_type);
+ pr_info("Completed sdcardfs module unload\n");
+}
+
+/* Original wrapfs authors */
+MODULE_AUTHOR("Erez Zadok, Filesystems and Storage Lab, Stony Brook University (http://www.fsl.cs.sunysb.edu/)");
+
+/* Original sdcardfs authors */
+MODULE_AUTHOR("Woojoong Lee, Daeho Jeong, Kitae Lee, Yeongjin Gil System Memory Lab., Samsung Electronics");
+
+/* Current maintainer */
+MODULE_AUTHOR("Daniel Rosenberg, Google");
+MODULE_DESCRIPTION("Sdcardfs " SDCARDFS_VERSION);
+MODULE_LICENSE("GPL");
+
+module_init(init_sdcardfs_fs);
+module_exit(exit_sdcardfs_fs);
diff --git a/fs/sdcardfs/mmap.c b/fs/sdcardfs/mmap.c
new file mode 100644
index 000000000000..f61896128cfe
--- /dev/null
+++ b/fs/sdcardfs/mmap.c
@@ -0,0 +1,89 @@
+/*
+ * fs/sdcardfs/mmap.c
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ * Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009 Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed. It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#include "sdcardfs.h"
+
+static int sdcardfs_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ int err;
+ struct file *file;
+ const struct vm_operations_struct *lower_vm_ops;
+
+ file = (struct file *)vma->vm_private_data;
+ lower_vm_ops = SDCARDFS_F(file)->lower_vm_ops;
+ BUG_ON(!lower_vm_ops);
+
+ err = lower_vm_ops->fault(vma, vmf);
+ return err;
+}
+
+static void sdcardfs_vm_open(struct vm_area_struct *vma)
+{
+ struct file *file = (struct file *)vma->vm_private_data;
+
+ get_file(file);
+}
+
+static void sdcardfs_vm_close(struct vm_area_struct *vma)
+{
+ struct file *file = (struct file *)vma->vm_private_data;
+
+ fput(file);
+}
+
+static int sdcardfs_page_mkwrite(struct vm_area_struct *vma,
+ struct vm_fault *vmf)
+{
+ int err = 0;
+ struct file *file;
+ const struct vm_operations_struct *lower_vm_ops;
+
+ file = (struct file *)vma->vm_private_data;
+ lower_vm_ops = SDCARDFS_F(file)->lower_vm_ops;
+ BUG_ON(!lower_vm_ops);
+ if (!lower_vm_ops->page_mkwrite)
+ goto out;
+
+ err = lower_vm_ops->page_mkwrite(vma, vmf);
+out:
+ return err;
+}
+
+static ssize_t sdcardfs_direct_IO(int rw, struct kiocb *iocb,
+ struct iov_iter *iter, loff_t pos)
+{
+ /*
+ * This function should never be called directly. We need it
+ * to exist, to get past a check in open_check_o_direct(),
+ * which is called from do_last().
+ */
+ return -EINVAL;
+}
+
+const struct address_space_operations sdcardfs_aops = {
+ .direct_IO = sdcardfs_direct_IO,
+};
+
+const struct vm_operations_struct sdcardfs_vm_ops = {
+ .fault = sdcardfs_fault,
+ .page_mkwrite = sdcardfs_page_mkwrite,
+ .open = sdcardfs_vm_open,
+ .close = sdcardfs_vm_close,
+};
diff --git a/fs/sdcardfs/multiuser.h b/fs/sdcardfs/multiuser.h
new file mode 100644
index 000000000000..85341e753f8c
--- /dev/null
+++ b/fs/sdcardfs/multiuser.h
@@ -0,0 +1,53 @@
+/*
+ * fs/sdcardfs/multiuser.h
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ * Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009 Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed. It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#define AID_USER_OFFSET 100000 /* offset for uid ranges for each user */
+#define AID_APP_START 10000 /* first app user */
+#define AID_APP_END 19999 /* last app user */
+#define AID_CACHE_GID_START 20000 /* start of gids for apps to mark cached data */
+#define AID_EXT_GID_START 30000 /* start of gids for apps to mark external data */
+#define AID_EXT_CACHE_GID_START 40000 /* start of gids for apps to mark external cached data */
+#define AID_EXT_CACHE_GID_END 49999 /* end of gids for apps to mark external cached data */
+#define AID_SHARED_GID_START 50000 /* start of gids for apps in each user to share */
+
+typedef uid_t userid_t;
+typedef uid_t appid_t;
+
+static inline uid_t multiuser_get_uid(userid_t user_id, appid_t app_id)
+{
+ return (user_id * AID_USER_OFFSET) + (app_id % AID_USER_OFFSET);
+}
+
+static inline bool uid_is_app(uid_t uid)
+{
+ appid_t appid = uid % AID_USER_OFFSET;
+
+ return appid >= AID_APP_START && appid <= AID_APP_END;
+}
+
+static inline gid_t multiuser_get_ext_cache_gid(uid_t uid)
+{
+ return uid - AID_APP_START + AID_EXT_CACHE_GID_START;
+}
+
+static inline gid_t multiuser_get_ext_gid(uid_t uid)
+{
+ return uid - AID_APP_START + AID_EXT_GID_START;
+}
diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c
new file mode 100644
index 000000000000..d552dde9c130
--- /dev/null
+++ b/fs/sdcardfs/packagelist.c
@@ -0,0 +1,885 @@
+/*
+ * fs/sdcardfs/packagelist.c
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ * Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009 Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed. It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#include "sdcardfs.h"
+#include <linux/hashtable.h>
+#include <linux/ctype.h>
+#include <linux/delay.h>
+#include <linux/radix-tree.h>
+#include <linux/dcache.h>
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/configfs.h>
+
+struct hashtable_entry {
+ struct hlist_node hlist;
+ struct hlist_node dlist; /* for deletion cleanup */
+ struct qstr key;
+ atomic_t value;
+};
+
+static DEFINE_HASHTABLE(package_to_appid, 8);
+static DEFINE_HASHTABLE(package_to_userid, 8);
+static DEFINE_HASHTABLE(ext_to_groupid, 8);
+
+
+static struct kmem_cache *hashtable_entry_cachep;
+
+static unsigned int full_name_case_hash(const unsigned char *name, unsigned int len)
+{
+ unsigned long hash = init_name_hash();
+
+ while (len--)
+ hash = partial_name_hash(tolower(*name++), hash);
+ return end_name_hash(hash);
+}
+
+static inline void qstr_init(struct qstr *q, const char *name)
+{
+ q->name = name;
+ q->len = strlen(q->name);
+ q->hash = full_name_case_hash(q->name, q->len);
+}
+
+static inline int qstr_copy(const struct qstr *src, struct qstr *dest)
+{
+ dest->name = kstrdup(src->name, GFP_KERNEL);
+ dest->hash_len = src->hash_len;
+ return !!dest->name;
+}
+
+
+static appid_t __get_appid(const struct qstr *key)
+{
+ struct hashtable_entry *hash_cur;
+ unsigned int hash = key->hash;
+ appid_t ret_id;
+
+ rcu_read_lock();
+ hash_for_each_possible_rcu(package_to_appid, hash_cur, hlist, hash) {
+ if (qstr_case_eq(key, &hash_cur->key)) {
+ ret_id = atomic_read(&hash_cur->value);
+ rcu_read_unlock();
+ return ret_id;
+ }
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
+appid_t get_appid(const char *key)
+{
+ struct qstr q;
+
+ qstr_init(&q, key);
+ return __get_appid(&q);
+}
+
+static appid_t __get_ext_gid(const struct qstr *key)
+{
+ struct hashtable_entry *hash_cur;
+ unsigned int hash = key->hash;
+ appid_t ret_id;
+
+ rcu_read_lock();
+ hash_for_each_possible_rcu(ext_to_groupid, hash_cur, hlist, hash) {
+ if (qstr_case_eq(key, &hash_cur->key)) {
+ ret_id = atomic_read(&hash_cur->value);
+ rcu_read_unlock();
+ return ret_id;
+ }
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
+appid_t get_ext_gid(const char *key)
+{
+ struct qstr q;
+
+ qstr_init(&q, key);
+ return __get_ext_gid(&q);
+}
+
+static appid_t __is_excluded(const struct qstr *app_name, userid_t user)
+{
+ struct hashtable_entry *hash_cur;
+ unsigned int hash = app_name->hash;
+
+ rcu_read_lock();
+ hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) {
+ if (atomic_read(&hash_cur->value) == user &&
+ qstr_case_eq(app_name, &hash_cur->key)) {
+ rcu_read_unlock();
+ return 1;
+ }
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
+appid_t is_excluded(const char *key, userid_t user)
+{
+ struct qstr q;
+
+ qstr_init(&q, key);
+ return __is_excluded(&q, user);
+}
+
+/* Kernel has already enforced everything we returned through
+ * derive_permissions_locked(), so this is used to lock down access
+ * even further, such as enforcing that apps hold sdcard_rw.
+ */
+int check_caller_access_to_name(struct inode *parent_node, const struct qstr *name)
+{
+ struct qstr q_autorun = QSTR_LITERAL("autorun.inf");
+ struct qstr q__android_secure = QSTR_LITERAL(".android_secure");
+ struct qstr q_android_secure = QSTR_LITERAL("android_secure");
+
+ /* Always block security-sensitive files at root */
+ if (parent_node && SDCARDFS_I(parent_node)->data->perm == PERM_ROOT) {
+ if (qstr_case_eq(name, &q_autorun)
+ || qstr_case_eq(name, &q__android_secure)
+ || qstr_case_eq(name, &q_android_secure)) {
+ return 0;
+ }
+ }
+
+ /* Root always has access; access for any other UIDs should always
+ * be controlled through packages.list.
+ */
+ if (from_kuid(&init_user_ns, current_fsuid()) == 0)
+ return 1;
+
+ /* No extra permissions to enforce */
+ return 1;
+}
+
+static struct hashtable_entry *alloc_hashtable_entry(const struct qstr *key,
+ appid_t value)
+{
+ struct hashtable_entry *ret = kmem_cache_alloc(hashtable_entry_cachep,
+ GFP_KERNEL);
+ if (!ret)
+ return NULL;
+ INIT_HLIST_NODE(&ret->dlist);
+ INIT_HLIST_NODE(&ret->hlist);
+
+ if (!qstr_copy(key, &ret->key)) {
+ kmem_cache_free(hashtable_entry_cachep, ret);
+ return NULL;
+ }
+
+ atomic_set(&ret->value, value);
+ return ret;
+}
+
+static int insert_packagelist_appid_entry_locked(const struct qstr *key, appid_t value)
+{
+ struct hashtable_entry *hash_cur;
+ struct hashtable_entry *new_entry;
+ unsigned int hash = key->hash;
+
+ hash_for_each_possible_rcu(package_to_appid, hash_cur, hlist, hash) {
+ if (qstr_case_eq(key, &hash_cur->key)) {
+ atomic_set(&hash_cur->value, value);
+ return 0;
+ }
+ }
+ new_entry = alloc_hashtable_entry(key, value);
+ if (!new_entry)
+ return -ENOMEM;
+ hash_add_rcu(package_to_appid, &new_entry->hlist, hash);
+ return 0;
+}
+
+static int insert_ext_gid_entry_locked(const struct qstr *key, appid_t value)
+{
+ struct hashtable_entry *hash_cur;
+ struct hashtable_entry *new_entry;
+ unsigned int hash = key->hash;
+
+ /* An extension can only belong to one gid */
+ hash_for_each_possible_rcu(ext_to_groupid, hash_cur, hlist, hash) {
+ if (qstr_case_eq(key, &hash_cur->key))
+ return -EINVAL;
+ }
+ new_entry = alloc_hashtable_entry(key, value);
+ if (!new_entry)
+ return -ENOMEM;
+ hash_add_rcu(ext_to_groupid, &new_entry->hlist, hash);
+ return 0;
+}
+
+static int insert_userid_exclude_entry_locked(const struct qstr *key, userid_t value)
+{
+ struct hashtable_entry *hash_cur;
+ struct hashtable_entry *new_entry;
+ unsigned int hash = key->hash;
+
+ /* Only insert if not already present */
+ hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) {
+ if (atomic_read(&hash_cur->value) == value &&
+ qstr_case_eq(key, &hash_cur->key))
+ return 0;
+ }
+ new_entry = alloc_hashtable_entry(key, value);
+ if (!new_entry)
+ return -ENOMEM;
+ hash_add_rcu(package_to_userid, &new_entry->hlist, hash);
+ return 0;
+}
+
+static void fixup_all_perms_name(const struct qstr *key)
+{
+ struct sdcardfs_sb_info *sbinfo;
+ struct limit_search limit = {
+ .flags = BY_NAME,
+ .name = QSTR_INIT(key->name, key->len),
+ };
+ list_for_each_entry(sbinfo, &sdcardfs_super_list, list) {
+ if (sbinfo_has_sdcard_magic(sbinfo))
+ fixup_perms_recursive(sbinfo->sb->s_root, &limit);
+ }
+}
+
+static void fixup_all_perms_name_userid(const struct qstr *key, userid_t userid)
+{
+ struct sdcardfs_sb_info *sbinfo;
+ struct limit_search limit = {
+ .flags = BY_NAME | BY_USERID,
+ .name = QSTR_INIT(key->name, key->len),
+ .userid = userid,
+ };
+ list_for_each_entry(sbinfo, &sdcardfs_super_list, list) {
+ if (sbinfo_has_sdcard_magic(sbinfo))
+ fixup_perms_recursive(sbinfo->sb->s_root, &limit);
+ }
+}
+
+static void fixup_all_perms_userid(userid_t userid)
+{
+ struct sdcardfs_sb_info *sbinfo;
+ struct limit_search limit = {
+ .flags = BY_USERID,
+ .userid = userid,
+ };
+ list_for_each_entry(sbinfo, &sdcardfs_super_list, list) {
+ if (sbinfo_has_sdcard_magic(sbinfo))
+ fixup_perms_recursive(sbinfo->sb->s_root, &limit);
+ }
+}
+
+static int insert_packagelist_entry(const struct qstr *key, appid_t value)
+{
+ int err;
+
+ mutex_lock(&sdcardfs_super_list_lock);
+ err = insert_packagelist_appid_entry_locked(key, value);
+ if (!err)
+ fixup_all_perms_name(key);
+ mutex_unlock(&sdcardfs_super_list_lock);
+
+ return err;
+}
+
+static int insert_ext_gid_entry(const struct qstr *key, appid_t value)
+{
+ int err;
+
+ mutex_lock(&sdcardfs_super_list_lock);
+ err = insert_ext_gid_entry_locked(key, value);
+ mutex_unlock(&sdcardfs_super_list_lock);
+
+ return err;
+}
+
+static int insert_userid_exclude_entry(const struct qstr *key, userid_t value)
+{
+ int err;
+
+ mutex_lock(&sdcardfs_super_list_lock);
+ err = insert_userid_exclude_entry_locked(key, value);
+ if (!err)
+ fixup_all_perms_name_userid(key, value);
+ mutex_unlock(&sdcardfs_super_list_lock);
+
+ return err;
+}
+
+static void free_hashtable_entry(struct hashtable_entry *entry)
+{
+ kfree(entry->key.name);
+ kmem_cache_free(hashtable_entry_cachep, entry);
+}
+
+static void remove_packagelist_entry_locked(const struct qstr *key)
+{
+ struct hashtable_entry *hash_cur;
+ unsigned int hash = key->hash;
+ struct hlist_node *h_t;
+ HLIST_HEAD(free_list);
+
+ hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) {
+ if (qstr_case_eq(key, &hash_cur->key)) {
+ hash_del_rcu(&hash_cur->hlist);
+ hlist_add_head(&hash_cur->dlist, &free_list);
+ }
+ }
+ hash_for_each_possible_rcu(package_to_appid, hash_cur, hlist, hash) {
+ if (qstr_case_eq(key, &hash_cur->key)) {
+ hash_del_rcu(&hash_cur->hlist);
+ hlist_add_head(&hash_cur->dlist, &free_list);
+ break;
+ }
+ }
+ synchronize_rcu();
+ hlist_for_each_entry_safe(hash_cur, h_t, &free_list, dlist)
+ free_hashtable_entry(hash_cur);
+}
+
+static void remove_packagelist_entry(const struct qstr *key)
+{
+ mutex_lock(&sdcardfs_super_list_lock);
+ remove_packagelist_entry_locked(key);
+ fixup_all_perms_name(key);
+ mutex_unlock(&sdcardfs_super_list_lock);
+}
+
+static void remove_ext_gid_entry_locked(const struct qstr *key, gid_t group)
+{
+ struct hashtable_entry *hash_cur;
+ unsigned int hash = key->hash;
+
+ hash_for_each_possible_rcu(ext_to_groupid, hash_cur, hlist, hash) {
+ if (qstr_case_eq(key, &hash_cur->key) && atomic_read(&hash_cur->value) == group) {
+ hash_del_rcu(&hash_cur->hlist);
+ synchronize_rcu();
+ free_hashtable_entry(hash_cur);
+ break;
+ }
+ }
+}
+
+static void remove_ext_gid_entry(const struct qstr *key, gid_t group)
+{
+ mutex_lock(&sdcardfs_super_list_lock);
+ remove_ext_gid_entry_locked(key, group);
+ mutex_unlock(&sdcardfs_super_list_lock);
+}
+
+static void remove_userid_all_entry_locked(userid_t userid)
+{
+ struct hashtable_entry *hash_cur;
+ struct hlist_node *h_t;
+ HLIST_HEAD(free_list);
+ int i;
+
+ hash_for_each_rcu(package_to_userid, i, hash_cur, hlist) {
+ if (atomic_read(&hash_cur->value) == userid) {
+ hash_del_rcu(&hash_cur->hlist);
+ hlist_add_head(&hash_cur->dlist, &free_list);
+ }
+ }
+ synchronize_rcu();
+ hlist_for_each_entry_safe(hash_cur, h_t, &free_list, dlist) {
+ free_hashtable_entry(hash_cur);
+ }
+}
+
+static void remove_userid_all_entry(userid_t userid)
+{
+ mutex_lock(&sdcardfs_super_list_lock);
+ remove_userid_all_entry_locked(userid);
+ fixup_all_perms_userid(userid);
+ mutex_unlock(&sdcardfs_super_list_lock);
+}
+
+static void remove_userid_exclude_entry_locked(const struct qstr *key, userid_t userid)
+{
+ struct hashtable_entry *hash_cur;
+ unsigned int hash = key->hash;
+
+ hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) {
+ if (qstr_case_eq(key, &hash_cur->key) &&
+ atomic_read(&hash_cur->value) == userid) {
+ hash_del_rcu(&hash_cur->hlist);
+ synchronize_rcu();
+ free_hashtable_entry(hash_cur);
+ break;
+ }
+ }
+}
+
+static void remove_userid_exclude_entry(const struct qstr *key, userid_t userid)
+{
+ mutex_lock(&sdcardfs_super_list_lock);
+ remove_userid_exclude_entry_locked(key, userid);
+ fixup_all_perms_name_userid(key, userid);
+ mutex_unlock(&sdcardfs_super_list_lock);
+}
+
+static void packagelist_destroy(void)
+{
+ struct hashtable_entry *hash_cur;
+ struct hlist_node *h_t;
+ HLIST_HEAD(free_list);
+ int i;
+
+ mutex_lock(&sdcardfs_super_list_lock);
+ hash_for_each_rcu(package_to_appid, i, hash_cur, hlist) {
+ hash_del_rcu(&hash_cur->hlist);
+ hlist_add_head(&hash_cur->dlist, &free_list);
+ }
+ hash_for_each_rcu(package_to_userid, i, hash_cur, hlist) {
+ hash_del_rcu(&hash_cur->hlist);
+ hlist_add_head(&hash_cur->dlist, &free_list);
+ }
+ synchronize_rcu();
+ hlist_for_each_entry_safe(hash_cur, h_t, &free_list, dlist)
+ free_hashtable_entry(hash_cur);
+ mutex_unlock(&sdcardfs_super_list_lock);
+ pr_info("sdcardfs: destroyed packagelist pkgld\n");
+}
+
+struct package_details {
+ struct config_item item;
+ struct qstr name;
+};
+
+static inline struct package_details *to_package_details(struct config_item *item)
+{
+ return item ? container_of(item, struct package_details, item) : NULL;
+}
+
+CONFIGFS_ATTR_STRUCT(package_details);
+#define PACKAGE_DETAILS_ATTR(_name, _mode, _show, _store) \
+struct package_details_attribute package_details_attr_##_name = __CONFIGFS_ATTR(_name, _mode, _show, _store)
+#define PACKAGE_DETAILS_ATTRIBUTE(name) (&package_details_attr_##name.attr)
+
+static ssize_t package_details_appid_show(struct package_details *package_details,
+ char *page)
+{
+ return scnprintf(page, PAGE_SIZE, "%u\n", __get_appid(&package_details->name));
+}
+
+static ssize_t package_details_appid_store(struct package_details *package_details,
+ const char *page, size_t count)
+{
+ unsigned int tmp;
+ int ret;
+
+ ret = kstrtouint(page, 10, &tmp);
+ if (ret)
+ return ret;
+
+ ret = insert_packagelist_entry(&package_details->name, tmp);
+
+ if (ret)
+ return ret;
+
+ return count;
+}
+
+static ssize_t package_details_excluded_userids_show(struct package_details *package_details,
+ char *page)
+{
+ struct hashtable_entry *hash_cur;
+ unsigned int hash = package_details->name.hash;
+ int count = 0;
+
+ rcu_read_lock();
+ hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) {
+ if (qstr_case_eq(&package_details->name, &hash_cur->key))
+ count += scnprintf(page + count, PAGE_SIZE - count,
+ "%d ", atomic_read(&hash_cur->value));
+ }
+ rcu_read_unlock();
+ if (count)
+ count--;
+ count += scnprintf(page + count, PAGE_SIZE - count, "\n");
+ return count;
+}
+
+static ssize_t package_details_excluded_userids_store(struct package_details *package_details,
+ const char *page, size_t count)
+{
+ unsigned int tmp;
+ int ret;
+
+ ret = kstrtouint(page, 10, &tmp);
+ if (ret)
+ return ret;
+
+ ret = insert_userid_exclude_entry(&package_details->name, tmp);
+
+ if (ret)
+ return ret;
+
+ return count;
+}
+
+static ssize_t package_details_clear_userid_store(struct package_details *package_details,
+ const char *page, size_t count)
+{
+ unsigned int tmp;
+ int ret;
+
+ ret = kstrtouint(page, 10, &tmp);
+ if (ret)
+ return ret;
+ remove_userid_exclude_entry(&package_details->name, tmp);
+ return count;
+}
+
+static void package_details_release(struct config_item *item)
+{
+ struct package_details *package_details = to_package_details(item);
+
+ pr_info("sdcardfs: removing %s\n", package_details->name.name);
+ remove_packagelist_entry(&package_details->name);
+ kfree(package_details->name.name);
+ kfree(package_details);
+}
+
+PACKAGE_DETAILS_ATTR(appid, S_IRUGO | S_IWUGO, package_details_appid_show, package_details_appid_store);
+PACKAGE_DETAILS_ATTR(excluded_userids, S_IRUGO | S_IWUGO,
+ package_details_excluded_userids_show, package_details_excluded_userids_store);
+PACKAGE_DETAILS_ATTR(clear_userid, S_IWUGO, NULL, package_details_clear_userid_store);
+
+static struct configfs_attribute *package_details_attrs[] = {
+ PACKAGE_DETAILS_ATTRIBUTE(appid),
+ PACKAGE_DETAILS_ATTRIBUTE(excluded_userids),
+ PACKAGE_DETAILS_ATTRIBUTE(clear_userid),
+ NULL,
+};
+
+CONFIGFS_ATTR_OPS(package_details);
+
+static struct configfs_item_operations package_details_item_ops = {
+ .release = package_details_release,
+ .show_attribute = package_details_attr_show,
+ .store_attribute = package_details_attr_store,
+};
+
+static struct config_item_type package_appid_type = {
+ .ct_item_ops = &package_details_item_ops,
+ .ct_attrs = package_details_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+struct extensions_value {
+ struct config_group group;
+ unsigned int num;
+};
+
+struct extension_details {
+ struct config_item item;
+ struct qstr name;
+ unsigned int num;
+};
+
+static inline struct extensions_value *to_extensions_value(struct config_item *item)
+{
+ return item ? container_of(to_config_group(item), struct extensions_value, group) : NULL;
+}
+
+static inline struct extension_details *to_extension_details(struct config_item *item)
+{
+ return item ? container_of(item, struct extension_details, item) : NULL;
+}
+
+static void extension_details_release(struct config_item *item)
+{
+ struct extension_details *extension_details = to_extension_details(item);
+
+ pr_info("sdcardfs: No longer mapping %s files to gid %d\n",
+ extension_details->name.name, extension_details->num);
+ remove_ext_gid_entry(&extension_details->name, extension_details->num);
+ kfree(extension_details->name.name);
+ kfree(extension_details);
+}
+
+static struct configfs_item_operations extension_details_item_ops = {
+ .release = extension_details_release,
+};
+
+static struct config_item_type extension_details_type = {
+ .ct_item_ops = &extension_details_item_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct config_item *extension_details_make_item(struct config_group *group, const char *name)
+{
+ struct extensions_value *extensions_value = to_extensions_value(&group->cg_item);
+ struct extension_details *extension_details = kzalloc(sizeof(struct extension_details), GFP_KERNEL);
+ const char *tmp;
+ int ret;
+
+ if (!extension_details)
+ return ERR_PTR(-ENOMEM);
+
+ tmp = kstrdup(name, GFP_KERNEL);
+ if (!tmp) {
+ kfree(extension_details);
+ return ERR_PTR(-ENOMEM);
+ }
+ qstr_init(&extension_details->name, tmp);
+ ret = insert_ext_gid_entry(&extension_details->name, extensions_value->num);
+
+ if (ret) {
+ kfree(extension_details->name.name);
+ kfree(extension_details);
+ return ERR_PTR(ret);
+ }
+ config_item_init_type_name(&extension_details->item, name, &extension_details_type);
+
+ return &extension_details->item;
+}
+
+static struct configfs_group_operations extensions_value_group_ops = {
+ .make_item = extension_details_make_item,
+};
+
+static struct config_item_type extensions_name_type = {
+ .ct_group_ops = &extensions_value_group_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct config_group *extensions_make_group(struct config_group *group, const char *name)
+{
+ struct extensions_value *extensions_value;
+ unsigned int tmp;
+ int ret;
+
+ extensions_value = kzalloc(sizeof(struct extensions_value), GFP_KERNEL);
+ if (!extensions_value)
+ return ERR_PTR(-ENOMEM);
+ ret = kstrtouint(name, 10, &tmp);
+ if (ret) {
+ kfree(extensions_value);
+ return ERR_PTR(ret);
+ }
+
+ extensions_value->num = tmp;
+ config_group_init_type_name(&extensions_value->group, name,
+ &extensions_name_type);
+ return &extensions_value->group;
+}
+
+static void extensions_drop_group(struct config_group *group, struct config_item *item)
+{
+ struct extensions_value *value = to_extensions_value(item);
+
+ pr_info("sdcardfs: No longer mapping any files to gid %d\n", value->num);
+ kfree(value);
+}
+
+static struct configfs_group_operations extensions_group_ops = {
+ .make_group = extensions_make_group,
+ .drop_item = extensions_drop_group,
+};
+
+static struct config_item_type extensions_type = {
+ .ct_group_ops = &extensions_group_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+struct config_group extension_group = {
+ .cg_item = {
+ .ci_namebuf = "extensions",
+ .ci_type = &extensions_type,
+ },
+};
+
+struct packages {
+ struct configfs_subsystem subsystem;
+};
+
+static inline struct packages *to_packages(struct config_item *item)
+{
+ return item ? container_of(to_configfs_subsystem(to_config_group(item)), struct packages, subsystem) : NULL;
+}
+
+CONFIGFS_ATTR_STRUCT(packages);
+#define PACKAGES_ATTR(_name, _mode, _show, _store) \
+struct packages_attribute packages_attr_##_name = __CONFIGFS_ATTR(_name, _mode, _show, _store)
+#define PACKAGES_ATTR_RO(_name, _show) \
+struct packages_attribute packages_attr_##_name = __CONFIGFS_ATTR_RO(_name, _show)
+
+static struct config_item *packages_make_item(struct config_group *group, const char *name)
+{
+ struct package_details *package_details;
+ const char *tmp;
+
+ package_details = kzalloc(sizeof(struct package_details), GFP_KERNEL);
+ if (!package_details)
+ return ERR_PTR(-ENOMEM);
+ tmp = kstrdup(name, GFP_KERNEL);
+ if (!tmp) {
+ kfree(package_details);
+ return ERR_PTR(-ENOMEM);
+ }
+ qstr_init(&package_details->name, tmp);
+ config_item_init_type_name(&package_details->item, name,
+ &package_appid_type);
+
+ return &package_details->item;
+}
+
+static ssize_t packages_list_show(struct packages *packages,
+ char *page)
+{
+ struct hashtable_entry *hash_cur_app;
+ struct hashtable_entry *hash_cur_user;
+ int i;
+ int count = 0, written = 0;
+ const char errormsg[] = "<truncated>\n";
+ unsigned int hash;
+
+ rcu_read_lock();
+ hash_for_each_rcu(package_to_appid, i, hash_cur_app, hlist) {
+ written = scnprintf(page + count, PAGE_SIZE - sizeof(errormsg) - count, "%s %d\n",
+ hash_cur_app->key.name, atomic_read(&hash_cur_app->value));
+ hash = hash_cur_app->key.hash;
+ hash_for_each_possible_rcu(package_to_userid, hash_cur_user, hlist, hash) {
+ if (qstr_case_eq(&hash_cur_app->key, &hash_cur_user->key)) {
+ written += scnprintf(page + count + written - 1,
+ PAGE_SIZE - sizeof(errormsg) - count - written + 1,
+ " %d\n", atomic_read(&hash_cur_user->value)) - 1;
+ }
+ }
+ if (count + written == PAGE_SIZE - sizeof(errormsg) - 1) {
+ count += scnprintf(page + count, PAGE_SIZE - count, errormsg);
+ break;
+ }
+ count += written;
+ }
+ rcu_read_unlock();
+
+ return count;
+}
+
+static ssize_t packages_remove_userid_store(struct packages *packages,
+ const char *page, size_t count)
+{
+ unsigned int tmp;
+ int ret;
+
+ ret = kstrtouint(page, 10, &tmp);
+ if (ret)
+ return ret;
+ remove_userid_all_entry(tmp);
+ return count;
+}
+
+struct packages_attribute packages_attr_packages_gid_list = __CONFIGFS_ATTR_RO(packages_gid.list, packages_list_show);
+PACKAGES_ATTR(remove_userid, S_IWUGO, NULL, packages_remove_userid_store);
+
+static struct configfs_attribute *packages_attrs[] = {
+ &packages_attr_packages_gid_list.attr,
+ &packages_attr_remove_userid.attr,
+ NULL,
+};
+
+CONFIGFS_ATTR_OPS(packages)
+static struct configfs_item_operations packages_item_ops = {
+ .show_attribute = packages_attr_show,
+ .store_attribute = packages_attr_store,
+};
+
+/*
+ * Note that, since no extra work is required on ->drop_item(),
+ * no ->drop_item() is provided.
+ */
+static struct configfs_group_operations packages_group_ops = {
+ .make_item = packages_make_item,
+};
+
+static struct config_item_type packages_type = {
+ .ct_item_ops = &packages_item_ops,
+ .ct_group_ops = &packages_group_ops,
+ .ct_attrs = packages_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+struct config_group *sd_default_groups[] = {
+ &extension_group,
+ NULL,
+};
+
+static struct packages sdcardfs_packages = {
+ .subsystem = {
+ .su_group = {
+ .cg_item = {
+ .ci_namebuf = "sdcardfs",
+ .ci_type = &packages_type,
+ },
+ .default_groups = sd_default_groups,
+ },
+ },
+};
+
+static int configfs_sdcardfs_init(void)
+{
+ int ret, i;
+ struct configfs_subsystem *subsys = &sdcardfs_packages.subsystem;
+
+ for (i = 0; sd_default_groups[i]; i++)
+ config_group_init(sd_default_groups[i]);
+ config_group_init(&subsys->su_group);
+ mutex_init(&subsys->su_mutex);
+ ret = configfs_register_subsystem(subsys);
+ if (ret) {
+ pr_err("Error %d while registering subsystem %s\n",
+ ret,
+ subsys->su_group.cg_item.ci_namebuf);
+ }
+ return ret;
+}
+
+static void configfs_sdcardfs_exit(void)
+{
+ configfs_unregister_subsystem(&sdcardfs_packages.subsystem);
+}
+
+int packagelist_init(void)
+{
+ hashtable_entry_cachep =
+ kmem_cache_create("packagelist_hashtable_entry",
+ sizeof(struct hashtable_entry), 0, 0, NULL);
+ if (!hashtable_entry_cachep) {
+ pr_err("sdcardfs: failed creating pkgl_hashtable entry slab cache\n");
+ return -ENOMEM;
+ }
+
+ configfs_sdcardfs_init();
+ return 0;
+}
+
+void packagelist_exit(void)
+{
+ configfs_sdcardfs_exit();
+ packagelist_destroy();
+ kmem_cache_destroy(hashtable_entry_cachep);
+}
diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h
new file mode 100644
index 000000000000..8f2725c735e5
--- /dev/null
+++ b/fs/sdcardfs/sdcardfs.h
@@ -0,0 +1,664 @@
+/*
+ * fs/sdcardfs/sdcardfs.h
+ *
+ * The sdcardfs v2.0
+ * This file system replaces the sdcard daemon on Android
+ * On version 2.0, some of the daemon functions have been ported
+ * to support the multi-user concepts of Android 4.4
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ * Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009 Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed. It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#ifndef _SDCARDFS_H_
+#define _SDCARDFS_H_
+
+#include <linux/dcache.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/aio.h>
+#include <linux/kref.h>
+#include <linux/mm.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/seq_file.h>
+#include <linux/statfs.h>
+#include <linux/fs_stack.h>
+#include <linux/magic.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/security.h>
+#include <linux/string.h>
+#include <linux/list.h>
+#include "multiuser.h"
+
+/* the file system name */
+#define SDCARDFS_NAME "sdcardfs"
+
+/* sdcardfs root inode number */
+#define SDCARDFS_ROOT_INO 1
+
+/* useful for tracking code reachability */
+#define UDBG pr_default("DBG:%s:%s:%d\n", __FILE__, __func__, __LINE__)
+
+#define SDCARDFS_DIRENT_SIZE 256
+
+/* temporary static uid settings for development */
+#define AID_ROOT 0 /* uid for accessing /mnt/sdcard & extSdcard */
+#define AID_MEDIA_RW 1023 /* internal media storage write access */
+
+#define AID_SDCARD_RW 1015 /* external storage write access */
+#define AID_SDCARD_R 1028 /* external storage read access */
+#define AID_SDCARD_PICS 1033 /* external storage photos access */
+#define AID_SDCARD_AV 1034 /* external storage audio/video access */
+#define AID_SDCARD_ALL 1035 /* access all users external storage */
+#define AID_MEDIA_OBB 1059 /* obb files */
+
+#define AID_SDCARD_IMAGE 1057
+
+#define AID_PACKAGE_INFO 1027
+
+
+/*
+ * Permissions are handled by our permission function.
+ * We don't want anyone who happens to look at our inode value to prematurely
+ * block access, so store more permissive values. These are probably never
+ * used.
+ */
+#define fixup_tmp_permissions(x) \
+ do { \
+ (x)->i_uid = make_kuid(&init_user_ns, \
+ SDCARDFS_I(x)->data->d_uid); \
+ (x)->i_gid = make_kgid(&init_user_ns, AID_SDCARD_RW); \
+ (x)->i_mode = ((x)->i_mode & S_IFMT) | 0775;\
+ } while (0)
+
+/* OVERRIDE_CRED() and REVERT_CRED()
+ * OVERRIDE_CRED()
+ * backup original task->cred
+ * and modifies task->cred->fsuid/fsgid to specified value.
+ * REVERT_CRED()
+ * restore original task->cred->fsuid/fsgid.
+ * These two macro should be used in pair, and OVERRIDE_CRED() should be
+ * placed at the beginning of a function, right after variable declaration.
+ */
+#define OVERRIDE_CRED(sdcardfs_sbi, saved_cred, info) \
+ do { \
+ saved_cred = override_fsids(sdcardfs_sbi, info->data); \
+ if (!saved_cred) \
+ return -ENOMEM; \
+ } while (0)
+
+#define OVERRIDE_CRED_PTR(sdcardfs_sbi, saved_cred, info) \
+ do { \
+ saved_cred = override_fsids(sdcardfs_sbi, info->data); \
+ if (!saved_cred) \
+ return ERR_PTR(-ENOMEM); \
+ } while (0)
+
+#define REVERT_CRED(saved_cred) revert_fsids(saved_cred)
+
+/* Android 5.0 support */
+
+/* Permission mode for a specific node. Controls how file permissions
+ * are derived for children nodes.
+ */
+typedef enum {
+ /* Nothing special; this node should just inherit from its parent. */
+ PERM_INHERIT,
+ /* This node is one level above a normal root; used for legacy layouts
+ * which use the first level to represent user_id.
+ */
+ PERM_PRE_ROOT,
+ /* This node is "/" */
+ PERM_ROOT,
+ /* This node is "/Android" */
+ PERM_ANDROID,
+ /* This node is "/Android/data" */
+ PERM_ANDROID_DATA,
+ /* This node is "/Android/obb" */
+ PERM_ANDROID_OBB,
+ /* This node is "/Android/media" */
+ PERM_ANDROID_MEDIA,
+ /* This node is "/Android/[data|media|obb]/[package]" */
+ PERM_ANDROID_PACKAGE,
+ /* This node is "/Android/[data|media|obb]/[package]/cache" */
+ PERM_ANDROID_PACKAGE_CACHE,
+} perm_t;
+
+struct sdcardfs_sb_info;
+struct sdcardfs_mount_options;
+struct sdcardfs_inode_info;
+struct sdcardfs_inode_data;
+
+/* Do not directly use this function. Use OVERRIDE_CRED() instead. */
+const struct cred *override_fsids(struct sdcardfs_sb_info *sbi,
+ struct sdcardfs_inode_data *data);
+/* Do not directly use this function, use REVERT_CRED() instead. */
+void revert_fsids(const struct cred *old_cred);
+
+/* operations vectors defined in specific files */
+extern const struct file_operations sdcardfs_main_fops;
+extern const struct file_operations sdcardfs_dir_fops;
+extern const struct inode_operations sdcardfs_main_iops;
+extern const struct inode_operations sdcardfs_dir_iops;
+extern const struct inode_operations sdcardfs_symlink_iops;
+extern const struct super_operations sdcardfs_sops;
+extern const struct dentry_operations sdcardfs_ci_dops;
+extern const struct address_space_operations sdcardfs_aops, sdcardfs_dummy_aops;
+extern const struct vm_operations_struct sdcardfs_vm_ops;
+
+extern int sdcardfs_init_inode_cache(void);
+extern void sdcardfs_destroy_inode_cache(void);
+extern int sdcardfs_init_dentry_cache(void);
+extern void sdcardfs_destroy_dentry_cache(void);
+extern int new_dentry_private_data(struct dentry *dentry);
+extern void free_dentry_private_data(struct dentry *dentry);
+extern struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags);
+extern struct inode *sdcardfs_iget(struct super_block *sb,
+ struct inode *lower_inode, userid_t id);
+extern int sdcardfs_interpose(struct dentry *dentry, struct super_block *sb,
+ struct path *lower_path, userid_t id);
+
+/* file private data */
+struct sdcardfs_file_info {
+ struct file *lower_file;
+ const struct vm_operations_struct *lower_vm_ops;
+};
+
+struct sdcardfs_inode_data {
+ struct kref refcount;
+ bool abandoned;
+
+ perm_t perm;
+ userid_t userid;
+ uid_t d_uid;
+ bool under_android;
+ bool under_cache;
+ bool under_obb;
+};
+
+/* sdcardfs inode data in memory */
+struct sdcardfs_inode_info {
+ struct inode *lower_inode;
+ /* state derived based on current position in hierarchy */
+ struct sdcardfs_inode_data *data;
+
+ /* top folder for ownership */
+ struct sdcardfs_inode_data *top_data;
+
+ struct inode vfs_inode;
+};
+
+
+/* sdcardfs dentry data in memory */
+struct sdcardfs_dentry_info {
+ spinlock_t lock; /* protects lower_path */
+ struct path lower_path;
+ struct path orig_path;
+};
+
+struct sdcardfs_mount_options {
+ uid_t fs_low_uid;
+ gid_t fs_low_gid;
+ userid_t fs_user_id;
+ bool multiuser;
+ unsigned int reserved_mb;
+};
+
+struct sdcardfs_vfsmount_options {
+ gid_t gid;
+ mode_t mask;
+};
+
+extern int parse_options_remount(struct super_block *sb, char *options, int silent,
+ struct sdcardfs_vfsmount_options *vfsopts);
+
+/* sdcardfs super-block data in memory */
+struct sdcardfs_sb_info {
+ struct super_block *sb;
+ struct super_block *lower_sb;
+ /* derived perm policy : some of options have been added
+ * to sdcardfs_mount_options (Android 4.4 support)
+ */
+ struct sdcardfs_mount_options options;
+ spinlock_t lock; /* protects obbpath */
+ char *obbpath_s;
+ struct path obbpath;
+ void *pkgl_id;
+ struct list_head list;
+};
+
+/*
+ * inode to private data
+ *
+ * Since we use containers and the struct inode is _inside_ the
+ * sdcardfs_inode_info structure, SDCARDFS_I will always (given a non-NULL
+ * inode pointer), return a valid non-NULL pointer.
+ */
+static inline struct sdcardfs_inode_info *SDCARDFS_I(const struct inode *inode)
+{
+ return container_of(inode, struct sdcardfs_inode_info, vfs_inode);
+}
+
+/* dentry to private data */
+#define SDCARDFS_D(dent) ((struct sdcardfs_dentry_info *)(dent)->d_fsdata)
+
+/* superblock to private data */
+#define SDCARDFS_SB(super) ((struct sdcardfs_sb_info *)(super)->s_fs_info)
+
+/* file to private Data */
+#define SDCARDFS_F(file) ((struct sdcardfs_file_info *)((file)->private_data))
+
+/* file to lower file */
+static inline struct file *sdcardfs_lower_file(const struct file *f)
+{
+ return SDCARDFS_F(f)->lower_file;
+}
+
+static inline void sdcardfs_set_lower_file(struct file *f, struct file *val)
+{
+ SDCARDFS_F(f)->lower_file = val;
+}
+
+/* inode to lower inode. */
+static inline struct inode *sdcardfs_lower_inode(const struct inode *i)
+{
+ return SDCARDFS_I(i)->lower_inode;
+}
+
+static inline void sdcardfs_set_lower_inode(struct inode *i, struct inode *val)
+{
+ SDCARDFS_I(i)->lower_inode = val;
+}
+
+/* superblock to lower superblock */
+static inline struct super_block *sdcardfs_lower_super(
+ const struct super_block *sb)
+{
+ return SDCARDFS_SB(sb)->lower_sb;
+}
+
+static inline void sdcardfs_set_lower_super(struct super_block *sb,
+ struct super_block *val)
+{
+ SDCARDFS_SB(sb)->lower_sb = val;
+}
+
+/* path based (dentry/mnt) macros */
+static inline void pathcpy(struct path *dst, const struct path *src)
+{
+ dst->dentry = src->dentry;
+ dst->mnt = src->mnt;
+}
+
+/* sdcardfs_get_pname functions calls path_get()
+ * therefore, the caller must call "proper" path_put functions
+ */
+#define SDCARDFS_DENT_FUNC(pname) \
+static inline void sdcardfs_get_##pname(const struct dentry *dent, \
+ struct path *pname) \
+{ \
+ spin_lock(&SDCARDFS_D(dent)->lock); \
+ pathcpy(pname, &SDCARDFS_D(dent)->pname); \
+ path_get(pname); \
+ spin_unlock(&SDCARDFS_D(dent)->lock); \
+ return; \
+} \
+static inline void sdcardfs_put_##pname(const struct dentry *dent, \
+ struct path *pname) \
+{ \
+ path_put(pname); \
+ return; \
+} \
+static inline void sdcardfs_set_##pname(const struct dentry *dent, \
+ struct path *pname) \
+{ \
+ spin_lock(&SDCARDFS_D(dent)->lock); \
+ pathcpy(&SDCARDFS_D(dent)->pname, pname); \
+ spin_unlock(&SDCARDFS_D(dent)->lock); \
+ return; \
+} \
+static inline void sdcardfs_reset_##pname(const struct dentry *dent) \
+{ \
+ spin_lock(&SDCARDFS_D(dent)->lock); \
+ SDCARDFS_D(dent)->pname.dentry = NULL; \
+ SDCARDFS_D(dent)->pname.mnt = NULL; \
+ spin_unlock(&SDCARDFS_D(dent)->lock); \
+ return; \
+} \
+static inline void sdcardfs_put_reset_##pname(const struct dentry *dent) \
+{ \
+ struct path pname; \
+ spin_lock(&SDCARDFS_D(dent)->lock); \
+ if (SDCARDFS_D(dent)->pname.dentry) { \
+ pathcpy(&pname, &SDCARDFS_D(dent)->pname); \
+ SDCARDFS_D(dent)->pname.dentry = NULL; \
+ SDCARDFS_D(dent)->pname.mnt = NULL; \
+ spin_unlock(&SDCARDFS_D(dent)->lock); \
+ path_put(&pname); \
+ } else \
+ spin_unlock(&SDCARDFS_D(dent)->lock); \
+ return; \
+}
+
+SDCARDFS_DENT_FUNC(lower_path)
+SDCARDFS_DENT_FUNC(orig_path)
+
+static inline bool sbinfo_has_sdcard_magic(struct sdcardfs_sb_info *sbinfo)
+{
+ return sbinfo && sbinfo->sb
+ && sbinfo->sb->s_magic == SDCARDFS_SUPER_MAGIC;
+}
+
+static inline struct sdcardfs_inode_data *data_get(
+ struct sdcardfs_inode_data *data)
+{
+ if (data)
+ kref_get(&data->refcount);
+ return data;
+}
+
+static inline struct sdcardfs_inode_data *top_data_get(
+ struct sdcardfs_inode_info *info)
+{
+ return data_get(info->top_data);
+}
+
+extern void data_release(struct kref *ref);
+
+static inline void data_put(struct sdcardfs_inode_data *data)
+{
+ kref_put(&data->refcount, data_release);
+}
+
+static inline void release_own_data(struct sdcardfs_inode_info *info)
+{
+ /*
+ * This happens exactly once per inode. At this point, the inode that
+ * originally held this data is about to be freed, and all references
+ * to it are held as a top value, and will likely be released soon.
+ */
+ info->data->abandoned = true;
+ data_put(info->data);
+}
+
+static inline void set_top(struct sdcardfs_inode_info *info,
+ struct sdcardfs_inode_data *top)
+{
+ struct sdcardfs_inode_data *old_top = info->top_data;
+
+ if (top)
+ data_get(top);
+ info->top_data = top;
+ if (old_top)
+ data_put(old_top);
+}
+
+static inline int get_gid(struct vfsmount *mnt,
+ struct sdcardfs_inode_data *data)
+{
+ struct sdcardfs_vfsmount_options *opts = mnt->data;
+
+ if (opts->gid == AID_SDCARD_RW)
+ /* As an optimization, certain trusted system components only run
+ * as owner but operate across all users. Since we're now handing
+ * out the sdcard_rw GID only to trusted apps, we're okay relaxing
+ * the user boundary enforcement for the default view. The UIDs
+ * assigned to app directories are still multiuser aware.
+ */
+ return AID_SDCARD_RW;
+ else
+ return multiuser_get_uid(data->userid, opts->gid);
+}
+
+static inline int get_mode(struct vfsmount *mnt,
+ struct sdcardfs_inode_info *info,
+ struct sdcardfs_inode_data *data)
+{
+ int owner_mode;
+ int filtered_mode;
+ struct sdcardfs_vfsmount_options *opts = mnt->data;
+ int visible_mode = 0775 & ~opts->mask;
+
+
+ if (data->perm == PERM_PRE_ROOT) {
+ /* Top of multi-user view should always be visible to ensure
+ * secondary users can traverse inside.
+ */
+ visible_mode = 0711;
+ } else if (data->under_android) {
+ /* Block "other" access to Android directories, since only apps
+ * belonging to a specific user should be in there; we still
+ * leave +x open for the default view.
+ */
+ if (opts->gid == AID_SDCARD_RW)
+ visible_mode = visible_mode & ~0006;
+ else
+ visible_mode = visible_mode & ~0007;
+ }
+ owner_mode = info->lower_inode->i_mode & 0700;
+ filtered_mode = visible_mode & (owner_mode | (owner_mode >> 3) | (owner_mode >> 6));
+ return filtered_mode;
+}
+
+static inline int has_graft_path(const struct dentry *dent)
+{
+ int ret = 0;
+
+ spin_lock(&SDCARDFS_D(dent)->lock);
+ if (SDCARDFS_D(dent)->orig_path.dentry != NULL)
+ ret = 1;
+ spin_unlock(&SDCARDFS_D(dent)->lock);
+
+ return ret;
+}
+
+static inline void sdcardfs_get_real_lower(const struct dentry *dent,
+ struct path *real_lower)
+{
+ /* in case of a local obb dentry
+ * the orig_path should be returned
+ */
+ if (has_graft_path(dent))
+ sdcardfs_get_orig_path(dent, real_lower);
+ else
+ sdcardfs_get_lower_path(dent, real_lower);
+}
+
+static inline void sdcardfs_put_real_lower(const struct dentry *dent,
+ struct path *real_lower)
+{
+ if (has_graft_path(dent))
+ sdcardfs_put_orig_path(dent, real_lower);
+ else
+ sdcardfs_put_lower_path(dent, real_lower);
+}
+
+extern struct mutex sdcardfs_super_list_lock;
+extern struct list_head sdcardfs_super_list;
+
+/* for packagelist.c */
+extern appid_t get_appid(const char *app_name);
+extern appid_t get_ext_gid(const char *app_name);
+extern appid_t is_excluded(const char *app_name, userid_t userid);
+extern int check_caller_access_to_name(struct inode *parent_node, const struct qstr *name);
+extern int packagelist_init(void);
+extern void packagelist_exit(void);
+
+/* for derived_perm.c */
+#define BY_NAME (1 << 0)
+#define BY_USERID (1 << 1)
+struct limit_search {
+ unsigned int flags;
+ struct qstr name;
+ userid_t userid;
+};
+
+extern void setup_derived_state(struct inode *inode, perm_t perm,
+ userid_t userid, uid_t uid, bool under_android,
+ struct sdcardfs_inode_data *top);
+extern void get_derived_permission(struct dentry *parent, struct dentry *dentry);
+extern void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, const struct qstr *name);
+extern void fixup_perms_recursive(struct dentry *dentry, struct limit_search *limit);
+
+extern void update_derived_permission_lock(struct dentry *dentry);
+void fixup_lower_ownership(struct dentry *dentry, const char *name);
+extern int need_graft_path(struct dentry *dentry);
+extern int is_base_obbpath(struct dentry *dentry);
+extern int is_obbpath_invalid(struct dentry *dentry);
+extern int setup_obb_dentry(struct dentry *dentry, struct path *lower_path);
+
+/* locking helpers */
+static inline struct dentry *lock_parent(struct dentry *dentry)
+{
+ struct dentry *dir = dget_parent(dentry);
+
+ mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
+ return dir;
+}
+
+static inline void unlock_dir(struct dentry *dir)
+{
+ mutex_unlock(&dir->d_inode->i_mutex);
+ dput(dir);
+}
+
+static inline int prepare_dir(const char *path_s, uid_t uid, gid_t gid, mode_t mode)
+{
+ int err;
+ struct dentry *dent;
+ struct iattr attrs;
+ struct path parent;
+
+ dent = kern_path_locked(path_s, &parent);
+ if (IS_ERR(dent)) {
+ err = PTR_ERR(dent);
+ if (err == -EEXIST)
+ err = 0;
+ goto out_unlock;
+ }
+
+ err = vfs_mkdir2(parent.mnt, parent.dentry->d_inode, dent, mode);
+ if (err) {
+ if (err == -EEXIST)
+ err = 0;
+ goto out_dput;
+ }
+
+ attrs.ia_uid = make_kuid(&init_user_ns, uid);
+ attrs.ia_gid = make_kgid(&init_user_ns, gid);
+ attrs.ia_valid = ATTR_UID | ATTR_GID;
+ mutex_lock(&dent->d_inode->i_mutex);
+ notify_change2(parent.mnt, dent, &attrs, NULL);
+ mutex_unlock(&dent->d_inode->i_mutex);
+
+out_dput:
+ dput(dent);
+
+out_unlock:
+ /* parent dentry locked by lookup_create */
+ mutex_unlock(&parent.dentry->d_inode->i_mutex);
+ path_put(&parent);
+ return err;
+}
+
+/*
+ * Return 1, if a disk has enough free space, otherwise 0.
+ * We assume that any files can not be overwritten.
+ */
+static inline int check_min_free_space(struct dentry *dentry, size_t size, int dir)
+{
+ int err;
+ struct path lower_path;
+ struct kstatfs statfs;
+ u64 avail;
+ struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+
+ if (sbi->options.reserved_mb) {
+ /* Get fs stat of lower filesystem. */
+ sdcardfs_get_lower_path(dentry, &lower_path);
+ err = vfs_statfs(&lower_path, &statfs);
+ sdcardfs_put_lower_path(dentry, &lower_path);
+
+ if (unlikely(err))
+ return 0;
+
+ /* Invalid statfs informations. */
+ if (unlikely(statfs.f_bsize == 0))
+ return 0;
+
+ /* if you are checking directory, set size to f_bsize. */
+ if (unlikely(dir))
+ size = statfs.f_bsize;
+
+ /* available size */
+ avail = statfs.f_bavail * statfs.f_bsize;
+
+ /* not enough space */
+ if ((u64)size > avail)
+ return 0;
+
+ /* enough space */
+ if ((avail - size) > (sbi->options.reserved_mb * 1024 * 1024))
+ return 1;
+
+ return 0;
+ } else
+ return 1;
+}
+
+/*
+ * Copies attrs and maintains sdcardfs managed attrs
+ * Since our permission check handles all special permissions, set those to be open
+ */
+static inline void sdcardfs_copy_and_fix_attrs(struct inode *dest, const struct inode *src)
+{
+ dest->i_mode = (src->i_mode & S_IFMT) | S_IRWXU | S_IRWXG |
+ S_IROTH | S_IXOTH; /* 0775 */
+ dest->i_uid = make_kuid(&init_user_ns, SDCARDFS_I(dest)->data->d_uid);
+ dest->i_gid = make_kgid(&init_user_ns, AID_SDCARD_RW);
+ dest->i_rdev = src->i_rdev;
+ dest->i_atime = src->i_atime;
+ dest->i_mtime = src->i_mtime;
+ dest->i_ctime = src->i_ctime;
+ dest->i_blkbits = src->i_blkbits;
+ dest->i_flags = src->i_flags;
+ set_nlink(dest, src->i_nlink);
+}
+
+static inline bool str_case_eq(const char *s1, const char *s2)
+{
+ return !strcasecmp(s1, s2);
+}
+
+static inline bool str_n_case_eq(const char *s1, const char *s2, size_t len)
+{
+ return !strncasecmp(s1, s2, len);
+}
+
+static inline bool qstr_case_eq(const struct qstr *q1, const struct qstr *q2)
+{
+ return q1->len == q2->len && str_case_eq(q1->name, q2->name);
+}
+
+/* */
+#define QSTR_LITERAL(string) QSTR_INIT(string, sizeof(string)-1)
+
+#endif /* not _SDCARDFS_H_ */
diff --git a/fs/sdcardfs/super.c b/fs/sdcardfs/super.c
new file mode 100644
index 000000000000..7f4539b4b249
--- /dev/null
+++ b/fs/sdcardfs/super.c
@@ -0,0 +1,324 @@
+/*
+ * fs/sdcardfs/super.c
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ * Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009 Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed. It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#include "sdcardfs.h"
+
+/*
+ * The inode cache is used with alloc_inode for both our inode info and the
+ * vfs inode.
+ */
+static struct kmem_cache *sdcardfs_inode_cachep;
+
+/*
+ * To support the top references, we must track some data separately.
+ * An sdcardfs_inode_info always has a reference to its data, and once set up,
+ * also has a reference to its top. The top may be itself, in which case it
+ * holds two references to its data. When top is changed, it takes a ref to the
+ * new data and then drops the ref to the old data.
+ */
+static struct kmem_cache *sdcardfs_inode_data_cachep;
+
+void data_release(struct kref *ref)
+{
+ struct sdcardfs_inode_data *data =
+ container_of(ref, struct sdcardfs_inode_data, refcount);
+
+ kmem_cache_free(sdcardfs_inode_data_cachep, data);
+}
+
+/* final actions when unmounting a file system */
+static void sdcardfs_put_super(struct super_block *sb)
+{
+ struct sdcardfs_sb_info *spd;
+ struct super_block *s;
+
+ spd = SDCARDFS_SB(sb);
+ if (!spd)
+ return;
+
+ if (spd->obbpath_s) {
+ kfree(spd->obbpath_s);
+ path_put(&spd->obbpath);
+ }
+
+ /* decrement lower super references */
+ s = sdcardfs_lower_super(sb);
+ sdcardfs_set_lower_super(sb, NULL);
+ atomic_dec(&s->s_active);
+
+ kfree(spd);
+ sb->s_fs_info = NULL;
+}
+
+static int sdcardfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ int err;
+ struct path lower_path;
+ u32 min_blocks;
+ struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+
+ sdcardfs_get_lower_path(dentry, &lower_path);
+ err = vfs_statfs(&lower_path, buf);
+ sdcardfs_put_lower_path(dentry, &lower_path);
+
+ if (sbi->options.reserved_mb) {
+ /* Invalid statfs informations. */
+ if (buf->f_bsize == 0) {
+ pr_err("Returned block size is zero.\n");
+ return -EINVAL;
+ }
+
+ min_blocks = ((sbi->options.reserved_mb * 1024 * 1024)/buf->f_bsize);
+ buf->f_blocks -= min_blocks;
+
+ if (buf->f_bavail > min_blocks)
+ buf->f_bavail -= min_blocks;
+ else
+ buf->f_bavail = 0;
+
+ /* Make reserved blocks invisiable to media storage */
+ buf->f_bfree = buf->f_bavail;
+ }
+
+ /* set return buf to our f/s to avoid confusing user-level utils */
+ buf->f_type = SDCARDFS_SUPER_MAGIC;
+
+ return err;
+}
+
+/*
+ * @flags: numeric mount options
+ * @options: mount options string
+ */
+static int sdcardfs_remount_fs(struct super_block *sb, int *flags, char *options)
+{
+ int err = 0;
+
+ /*
+ * The VFS will take care of "ro" and "rw" flags among others. We
+ * can safely accept a few flags (RDONLY, MANDLOCK), and honor
+ * SILENT, but anything else left over is an error.
+ */
+ if ((*flags & ~(MS_RDONLY | MS_MANDLOCK | MS_SILENT)) != 0) {
+ pr_err("sdcardfs: remount flags 0x%x unsupported\n", *flags);
+ err = -EINVAL;
+ }
+
+ return err;
+}
+
+/*
+ * @mnt: mount point we are remounting
+ * @sb: superblock we are remounting
+ * @flags: numeric mount options
+ * @options: mount options string
+ */
+static int sdcardfs_remount_fs2(struct vfsmount *mnt, struct super_block *sb,
+ int *flags, char *options)
+{
+ int err = 0;
+
+ /*
+ * The VFS will take care of "ro" and "rw" flags among others. We
+ * can safely accept a few flags (RDONLY, MANDLOCK), and honor
+ * SILENT, but anything else left over is an error.
+ */
+ if ((*flags & ~(MS_RDONLY | MS_MANDLOCK | MS_SILENT | MS_REMOUNT)) != 0) {
+ pr_err("sdcardfs: remount flags 0x%x unsupported\n", *flags);
+ err = -EINVAL;
+ }
+ pr_info("Remount options were %s for vfsmnt %p.\n", options, mnt);
+ err = parse_options_remount(sb, options, *flags & ~MS_SILENT, mnt->data);
+
+
+ return err;
+}
+
+static void *sdcardfs_clone_mnt_data(void *data)
+{
+ struct sdcardfs_vfsmount_options *opt = kmalloc(sizeof(struct sdcardfs_vfsmount_options), GFP_KERNEL);
+ struct sdcardfs_vfsmount_options *old = data;
+
+ if (!opt)
+ return NULL;
+ opt->gid = old->gid;
+ opt->mask = old->mask;
+ return opt;
+}
+
+static void sdcardfs_copy_mnt_data(void *data, void *newdata)
+{
+ struct sdcardfs_vfsmount_options *old = data;
+ struct sdcardfs_vfsmount_options *new = newdata;
+
+ old->gid = new->gid;
+ old->mask = new->mask;
+}
+
+/*
+ * Called by iput() when the inode reference count reached zero
+ * and the inode is not hashed anywhere. Used to clear anything
+ * that needs to be, before the inode is completely destroyed and put
+ * on the inode free list.
+ */
+static void sdcardfs_evict_inode(struct inode *inode)
+{
+ struct inode *lower_inode;
+
+ truncate_inode_pages(&inode->i_data, 0);
+ set_top(SDCARDFS_I(inode), NULL);
+ clear_inode(inode);
+ /*
+ * Decrement a reference to a lower_inode, which was incremented
+ * by our read_inode when it was created initially.
+ */
+ lower_inode = sdcardfs_lower_inode(inode);
+ sdcardfs_set_lower_inode(inode, NULL);
+ iput(lower_inode);
+}
+
+static struct inode *sdcardfs_alloc_inode(struct super_block *sb)
+{
+ struct sdcardfs_inode_info *i;
+ struct sdcardfs_inode_data *d;
+
+ i = kmem_cache_alloc(sdcardfs_inode_cachep, GFP_KERNEL);
+ if (!i)
+ return NULL;
+
+ /* memset everything up to the inode to 0 */
+ memset(i, 0, offsetof(struct sdcardfs_inode_info, vfs_inode));
+
+ d = kmem_cache_alloc(sdcardfs_inode_data_cachep,
+ GFP_KERNEL | __GFP_ZERO);
+ if (!d) {
+ kmem_cache_free(sdcardfs_inode_cachep, i);
+ return NULL;
+ }
+
+ i->data = d;
+ kref_init(&d->refcount);
+
+ i->vfs_inode.i_version = 1;
+ return &i->vfs_inode;
+}
+
+static void i_callback(struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+
+ release_own_data(SDCARDFS_I(inode));
+ kmem_cache_free(sdcardfs_inode_cachep, SDCARDFS_I(inode));
+}
+
+static void sdcardfs_destroy_inode(struct inode *inode)
+{
+ call_rcu(&inode->i_rcu, i_callback);
+}
+
+/* sdcardfs inode cache constructor */
+static void init_once(void *obj)
+{
+ struct sdcardfs_inode_info *i = obj;
+
+ inode_init_once(&i->vfs_inode);
+}
+
+int sdcardfs_init_inode_cache(void)
+{
+ sdcardfs_inode_cachep =
+ kmem_cache_create("sdcardfs_inode_cache",
+ sizeof(struct sdcardfs_inode_info), 0,
+ SLAB_RECLAIM_ACCOUNT, init_once);
+
+ if (!sdcardfs_inode_cachep)
+ return -ENOMEM;
+
+ sdcardfs_inode_data_cachep =
+ kmem_cache_create("sdcardfs_inode_data_cache",
+ sizeof(struct sdcardfs_inode_data), 0,
+ SLAB_RECLAIM_ACCOUNT, NULL);
+ if (!sdcardfs_inode_data_cachep) {
+ kmem_cache_destroy(sdcardfs_inode_cachep);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+/* sdcardfs inode cache destructor */
+void sdcardfs_destroy_inode_cache(void)
+{
+ kmem_cache_destroy(sdcardfs_inode_data_cachep);
+ kmem_cache_destroy(sdcardfs_inode_cachep);
+}
+
+/*
+ * Used only in nfs, to kill any pending RPC tasks, so that subsequent
+ * code can actually succeed and won't leave tasks that need handling.
+ */
+static void sdcardfs_umount_begin(struct super_block *sb)
+{
+ struct super_block *lower_sb;
+
+ lower_sb = sdcardfs_lower_super(sb);
+ if (lower_sb && lower_sb->s_op && lower_sb->s_op->umount_begin)
+ lower_sb->s_op->umount_begin(lower_sb);
+}
+
+static int sdcardfs_show_options(struct vfsmount *mnt, struct seq_file *m,
+ struct dentry *root)
+{
+ struct sdcardfs_sb_info *sbi = SDCARDFS_SB(root->d_sb);
+ struct sdcardfs_mount_options *opts = &sbi->options;
+ struct sdcardfs_vfsmount_options *vfsopts = mnt->data;
+
+ if (opts->fs_low_uid != 0)
+ seq_printf(m, ",fsuid=%u", opts->fs_low_uid);
+ if (opts->fs_low_gid != 0)
+ seq_printf(m, ",fsgid=%u", opts->fs_low_gid);
+ if (vfsopts->gid != 0)
+ seq_printf(m, ",gid=%u", vfsopts->gid);
+ if (opts->multiuser)
+ seq_puts(m, ",multiuser");
+ if (vfsopts->mask)
+ seq_printf(m, ",mask=%u", vfsopts->mask);
+ if (opts->fs_user_id)
+ seq_printf(m, ",userid=%u", opts->fs_user_id);
+ if (opts->reserved_mb != 0)
+ seq_printf(m, ",reserved=%uMB", opts->reserved_mb);
+
+ return 0;
+};
+
+const struct super_operations sdcardfs_sops = {
+ .put_super = sdcardfs_put_super,
+ .statfs = sdcardfs_statfs,
+ .remount_fs = sdcardfs_remount_fs,
+ .remount_fs2 = sdcardfs_remount_fs2,
+ .clone_mnt_data = sdcardfs_clone_mnt_data,
+ .copy_mnt_data = sdcardfs_copy_mnt_data,
+ .evict_inode = sdcardfs_evict_inode,
+ .umount_begin = sdcardfs_umount_begin,
+ .show_options2 = sdcardfs_show_options,
+ .alloc_inode = sdcardfs_alloc_inode,
+ .destroy_inode = sdcardfs_destroy_inode,
+ .drop_inode = generic_delete_inode,
+};
diff --git a/fs/select.c b/fs/select.c
index 467bb1cb3ea5..f684c750e08a 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -971,7 +971,7 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
if (ret == -EINTR) {
struct restart_block *restart_block;
- restart_block = &current_thread_info()->restart_block;
+ restart_block = &current->restart_block;
restart_block->fn = do_restart_poll;
restart_block->poll.ufds = ufds;
restart_block->poll.nfds = nfds;
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index b6fa8657dcbc..6dd158a216f4 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -26,34 +26,6 @@ config SQUASHFS
If unsure, say N.
choice
- prompt "File decompression options"
- depends on SQUASHFS
- help
- Squashfs now supports two options for decompressing file
- data. Traditionally Squashfs has decompressed into an
- intermediate buffer and then memcopied it into the page cache.
- Squashfs now supports the ability to decompress directly into
- the page cache.
-
- If unsure, select "Decompress file data into an intermediate buffer"
-
-config SQUASHFS_FILE_CACHE
- bool "Decompress file data into an intermediate buffer"
- help
- Decompress file data into an intermediate buffer and then
- memcopy it into the page cache.
-
-config SQUASHFS_FILE_DIRECT
- bool "Decompress files directly into the page cache"
- help
- Directly decompress file data into the page cache.
- Doing so can significantly improve performance because
- it eliminates a memcpy and it also removes the lock contention
- on the single buffer.
-
-endchoice
-
-choice
prompt "Decompressor parallelisation options"
depends on SQUASHFS
help
@@ -120,6 +92,21 @@ config SQUASHFS_ZLIB
If unsure, say Y.
+config SQUASHFS_LZ4
+ bool "Include support for LZ4 compressed file systems"
+ depends on SQUASHFS
+ select LZ4_DECOMPRESS
+ help
+ Saying Y here includes support for reading Squashfs file systems
+ compressed with LZ4 compression. LZ4 compression is mainly
+ aimed at embedded systems with slower CPUs where the overheads
+ of zlib are too high.
+
+ LZ4 is not the standard compression used in Squashfs and so most
+ file systems will be readable without selecting this option.
+
+ If unsure, say N.
+
config SQUASHFS_LZO
bool "Include support for LZO compressed file systems"
depends on SQUASHFS
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 4132520b4ff2..fe51f1507ed1 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -5,12 +5,12 @@
obj-$(CONFIG_SQUASHFS) += squashfs.o
squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
squashfs-y += namei.o super.o symlink.o decompressor.o
-squashfs-$(CONFIG_SQUASHFS_FILE_CACHE) += file_cache.o
-squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o page_actor.o
+squashfs-y += file_direct.o page_actor.o
squashfs-$(CONFIG_SQUASHFS_DECOMP_SINGLE) += decompressor_single.o
squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI) += decompressor_multi.o
squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) += decompressor_multi_percpu.o
squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o
+squashfs-$(CONFIG_SQUASHFS_LZ4) += lz4_wrapper.o
squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o
squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o
squashfs-$(CONFIG_SQUASHFS_ZLIB) += zlib_wrapper.o
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 0cea9b9236d0..1b65ddf8263f 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -28,9 +28,12 @@
#include <linux/fs.h>
#include <linux/vfs.h>
+#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/string.h>
+#include <linux/pagemap.h>
#include <linux/buffer_head.h>
+#include <linux/workqueue.h>
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
@@ -38,177 +41,434 @@
#include "decompressor.h"
#include "page_actor.h"
-/*
- * Read the metadata block length, this is stored in the first two
- * bytes of the metadata block.
- */
-static struct buffer_head *get_block_length(struct super_block *sb,
- u64 *cur_index, int *offset, int *length)
+static struct workqueue_struct *squashfs_read_wq;
+
+struct squashfs_read_request {
+ struct super_block *sb;
+ u64 index;
+ int length;
+ int compressed;
+ int offset;
+ u64 read_end;
+ struct squashfs_page_actor *output;
+ enum {
+ SQUASHFS_COPY,
+ SQUASHFS_DECOMPRESS,
+ SQUASHFS_METADATA,
+ } data_processing;
+ bool synchronous;
+
+ /*
+ * If the read is synchronous, it is possible to retrieve information
+ * about the request by setting these pointers.
+ */
+ int *res;
+ int *bytes_read;
+ int *bytes_uncompressed;
+
+ int nr_buffers;
+ struct buffer_head **bh;
+ struct work_struct offload;
+};
+
+struct squashfs_bio_request {
+ struct buffer_head **bh;
+ int nr_buffers;
+};
+
+static int squashfs_bio_submit(struct squashfs_read_request *req);
+
+int squashfs_init_read_wq(void)
{
- struct squashfs_sb_info *msblk = sb->s_fs_info;
- struct buffer_head *bh;
+ squashfs_read_wq = create_workqueue("SquashFS read wq");
+ return !!squashfs_read_wq;
+}
+
+void squashfs_destroy_read_wq(void)
+{
+ flush_workqueue(squashfs_read_wq);
+ destroy_workqueue(squashfs_read_wq);
+}
+
+static void free_read_request(struct squashfs_read_request *req, int error)
+{
+ if (!req->synchronous)
+ squashfs_page_actor_free(req->output, error);
+ if (req->res)
+ *(req->res) = error;
+ kfree(req->bh);
+ kfree(req);
+}
+
+static void squashfs_process_blocks(struct squashfs_read_request *req)
+{
+ int error = 0;
+ int bytes, i, length;
+ struct squashfs_sb_info *msblk = req->sb->s_fs_info;
+ struct squashfs_page_actor *actor = req->output;
+ struct buffer_head **bh = req->bh;
+ int nr_buffers = req->nr_buffers;
+
+ for (i = 0; i < nr_buffers; ++i) {
+ if (!bh[i])
+ continue;
+ wait_on_buffer(bh[i]);
+ if (!buffer_uptodate(bh[i]))
+ error = -EIO;
+ }
+ if (error)
+ goto cleanup;
+
+ if (req->data_processing == SQUASHFS_METADATA) {
+ /* Extract the length of the metadata block */
+ if (req->offset != msblk->devblksize - 1) {
+ length = le16_to_cpup((__le16 *)
+ (bh[0]->b_data + req->offset));
+ } else {
+ length = (unsigned char)bh[0]->b_data[req->offset];
+ length |= (unsigned char)bh[1]->b_data[0] << 8;
+ }
+ req->compressed = SQUASHFS_COMPRESSED(length);
+ req->data_processing = req->compressed ? SQUASHFS_DECOMPRESS
+ : SQUASHFS_COPY;
+ length = SQUASHFS_COMPRESSED_SIZE(length);
+ if (req->index + length + 2 > req->read_end) {
+ for (i = 0; i < nr_buffers; ++i)
+ put_bh(bh[i]);
+ kfree(bh);
+ req->length = length;
+ req->index += 2;
+ squashfs_bio_submit(req);
+ return;
+ }
+ req->length = length;
+ req->offset = (req->offset + 2) % PAGE_SIZE;
+ if (req->offset < 2) {
+ put_bh(bh[0]);
+ ++bh;
+ --nr_buffers;
+ }
+ }
+ if (req->bytes_read)
+ *(req->bytes_read) = req->length;
- bh = sb_bread(sb, *cur_index);
- if (bh == NULL)
- return NULL;
-
- if (msblk->devblksize - *offset == 1) {
- *length = (unsigned char) bh->b_data[*offset];
- put_bh(bh);
- bh = sb_bread(sb, ++(*cur_index));
- if (bh == NULL)
- return NULL;
- *length |= (unsigned char) bh->b_data[0] << 8;
- *offset = 1;
- } else {
- *length = (unsigned char) bh->b_data[*offset] |
- (unsigned char) bh->b_data[*offset + 1] << 8;
- *offset += 2;
-
- if (*offset == msblk->devblksize) {
- put_bh(bh);
- bh = sb_bread(sb, ++(*cur_index));
- if (bh == NULL)
- return NULL;
- *offset = 0;
+ if (req->data_processing == SQUASHFS_COPY) {
+ squashfs_bh_to_actor(bh, nr_buffers, req->output, req->offset,
+ req->length, msblk->devblksize);
+ } else if (req->data_processing == SQUASHFS_DECOMPRESS) {
+ req->length = squashfs_decompress(msblk, bh, nr_buffers,
+ req->offset, req->length, actor);
+ if (req->length < 0) {
+ error = -EIO;
+ goto cleanup;
}
}
- return bh;
+ /* Last page may have trailing bytes not filled */
+ bytes = req->length % PAGE_SIZE;
+ if (bytes && actor->page[actor->pages - 1])
+ zero_user_segment(actor->page[actor->pages - 1], bytes,
+ PAGE_SIZE);
+
+cleanup:
+ if (req->bytes_uncompressed)
+ *(req->bytes_uncompressed) = req->length;
+ if (error) {
+ for (i = 0; i < nr_buffers; ++i)
+ if (bh[i])
+ put_bh(bh[i]);
+ }
+ free_read_request(req, error);
}
+static void read_wq_handler(struct work_struct *work)
+{
+ squashfs_process_blocks(container_of(work,
+ struct squashfs_read_request, offload));
+}
-/*
- * Read and decompress a metadata block or datablock. Length is non-zero
- * if a datablock is being read (the size is stored elsewhere in the
- * filesystem), otherwise the length is obtained from the first two bytes of
- * the metadata block. A bit in the length field indicates if the block
- * is stored uncompressed in the filesystem (usually because compression
- * generated a larger block - this does occasionally happen with compression
- * algorithms).
- */
-int squashfs_read_data(struct super_block *sb, u64 index, int length,
- u64 *next_index, struct squashfs_page_actor *output)
+static void squashfs_bio_end_io(struct bio *bio, int error)
{
- struct squashfs_sb_info *msblk = sb->s_fs_info;
- struct buffer_head **bh;
- int offset = index & ((1 << msblk->devblksize_log2) - 1);
- u64 cur_index = index >> msblk->devblksize_log2;
- int bytes, compressed, b = 0, k = 0, avail, i;
+ int i;
+ struct squashfs_bio_request *bio_req = bio->bi_private;
+
+ bio_put(bio);
+
+ for (i = 0; i < bio_req->nr_buffers; ++i) {
+ if (!bio_req->bh[i])
+ continue;
+ if (!error)
+ set_buffer_uptodate(bio_req->bh[i]);
+ else
+ clear_buffer_uptodate(bio_req->bh[i]);
+ unlock_buffer(bio_req->bh[i]);
+ }
+ kfree(bio_req);
+}
+
+static int bh_is_optional(struct squashfs_read_request *req, int idx)
+{
+ int start_idx, end_idx;
+ struct squashfs_sb_info *msblk = req->sb->s_fs_info;
- bh = kcalloc(((output->length + msblk->devblksize - 1)
- >> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL);
- if (bh == NULL)
+ start_idx = (idx * msblk->devblksize - req->offset) >> PAGE_SHIFT;
+ end_idx = ((idx + 1) * msblk->devblksize - req->offset + 1) >> PAGE_SHIFT;
+ if (start_idx >= req->output->pages)
+ return 1;
+ if (start_idx < 0)
+ start_idx = end_idx;
+ if (end_idx >= req->output->pages)
+ end_idx = start_idx;
+ return !req->output->page[start_idx] && !req->output->page[end_idx];
+}
+
+static int actor_getblks(struct squashfs_read_request *req, u64 block)
+{
+ int i;
+
+ req->bh = kmalloc_array(req->nr_buffers, sizeof(*(req->bh)), GFP_NOIO);
+ if (!req->bh)
return -ENOMEM;
- if (length) {
+ for (i = 0; i < req->nr_buffers; ++i) {
/*
- * Datablock.
+ * When dealing with an uncompressed block, the actor may
+ * contains NULL pages. There's no need to read the buffers
+ * associated with these pages.
*/
- bytes = -offset;
- compressed = SQUASHFS_COMPRESSED_BLOCK(length);
- length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length);
- if (next_index)
- *next_index = index + length;
-
- TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n",
- index, compressed ? "" : "un", length, output->length);
-
- if (length < 0 || length > output->length ||
- (index + length) > msblk->bytes_used)
- goto read_failure;
-
- for (b = 0; bytes < length; b++, cur_index++) {
- bh[b] = sb_getblk(sb, cur_index);
- if (bh[b] == NULL)
- goto block_release;
- bytes += msblk->devblksize;
+ if (!req->compressed && bh_is_optional(req, i)) {
+ req->bh[i] = NULL;
+ continue;
}
- ll_rw_block(READ, b, bh);
- } else {
- /*
- * Metadata block.
- */
- if ((index + 2) > msblk->bytes_used)
- goto read_failure;
+ req->bh[i] = sb_getblk(req->sb, block + i);
+ if (!req->bh[i]) {
+ while (--i) {
+ if (req->bh[i])
+ put_bh(req->bh[i]);
+ }
+ return -1;
+ }
+ }
+ return 0;
+}
- bh[0] = get_block_length(sb, &cur_index, &offset, &length);
- if (bh[0] == NULL)
- goto read_failure;
- b = 1;
+static int squashfs_bio_submit(struct squashfs_read_request *req)
+{
+ struct bio *bio = NULL;
+ struct buffer_head *bh;
+ struct squashfs_bio_request *bio_req = NULL;
+ int b = 0, prev_block = 0;
+ struct squashfs_sb_info *msblk = req->sb->s_fs_info;
- bytes = msblk->devblksize - offset;
- compressed = SQUASHFS_COMPRESSED(length);
- length = SQUASHFS_COMPRESSED_SIZE(length);
- if (next_index)
- *next_index = index + length + 2;
+ u64 read_start = round_down(req->index, msblk->devblksize);
+ u64 read_end = round_up(req->index + req->length, msblk->devblksize);
+ sector_t block = read_start >> msblk->devblksize_log2;
+ sector_t block_end = read_end >> msblk->devblksize_log2;
+ int offset = read_start - round_down(req->index, PAGE_SIZE);
+ int nr_buffers = block_end - block;
+ int blksz = msblk->devblksize;
+ int bio_max_pages = nr_buffers > BIO_MAX_PAGES ? BIO_MAX_PAGES
+ : nr_buffers;
- TRACE("Block @ 0x%llx, %scompressed size %d\n", index,
- compressed ? "" : "un", length);
+ /* Setup the request */
+ req->read_end = read_end;
+ req->offset = req->index - read_start;
+ req->nr_buffers = nr_buffers;
+ if (actor_getblks(req, block) < 0)
+ goto getblk_failed;
- if (length < 0 || length > output->length ||
- (index + length) > msblk->bytes_used)
- goto block_release;
+ /* Create and submit the BIOs */
+ for (b = 0; b < nr_buffers; ++b, offset += blksz) {
+ bh = req->bh[b];
+ if (!bh || !trylock_buffer(bh))
+ continue;
+ if (buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ continue;
+ }
+ offset %= PAGE_SIZE;
- for (; bytes < length; b++) {
- bh[b] = sb_getblk(sb, ++cur_index);
- if (bh[b] == NULL)
- goto block_release;
- bytes += msblk->devblksize;
+ /* Append the buffer to the current BIO if it is contiguous */
+ if (bio && bio_req && prev_block + 1 == b) {
+ if (bio_add_page(bio, bh->b_page, blksz, offset)) {
+ bio_req->nr_buffers += 1;
+ prev_block = b;
+ continue;
+ }
}
- ll_rw_block(READ, b - 1, bh + 1);
+
+ /* Otherwise, submit the current BIO and create a new one */
+ if (bio)
+ submit_bio(READ, bio);
+ bio_req = kcalloc(1, sizeof(struct squashfs_bio_request),
+ GFP_NOIO);
+ if (!bio_req)
+ goto req_alloc_failed;
+ bio_req->bh = &req->bh[b];
+ bio = bio_alloc(GFP_NOIO, bio_max_pages);
+ if (!bio)
+ goto bio_alloc_failed;
+ bio->bi_bdev = req->sb->s_bdev;
+ bio->bi_iter.bi_sector = (block + b)
+ << (msblk->devblksize_log2 - 9);
+ bio->bi_private = bio_req;
+ bio->bi_end_io = squashfs_bio_end_io;
+
+ bio_add_page(bio, bh->b_page, blksz, offset);
+ bio_req->nr_buffers += 1;
+ prev_block = b;
}
+ if (bio)
+ submit_bio(READ, bio);
- for (i = 0; i < b; i++) {
- wait_on_buffer(bh[i]);
- if (!buffer_uptodate(bh[i]))
- goto block_release;
+ if (req->synchronous)
+ squashfs_process_blocks(req);
+ else {
+ INIT_WORK(&req->offload, read_wq_handler);
+ schedule_work(&req->offload);
}
+ return 0;
- if (compressed) {
- length = squashfs_decompress(msblk, bh, b, offset, length,
- output);
- if (length < 0)
- goto read_failure;
- } else {
- /*
- * Block is uncompressed.
- */
- int in, pg_offset = 0;
- void *data = squashfs_first_page(output);
-
- for (bytes = length; k < b; k++) {
- in = min(bytes, msblk->devblksize - offset);
- bytes -= in;
- while (in) {
- if (pg_offset == PAGE_CACHE_SIZE) {
- data = squashfs_next_page(output);
- pg_offset = 0;
- }
- avail = min_t(int, in, PAGE_CACHE_SIZE -
- pg_offset);
- memcpy(data + pg_offset, bh[k]->b_data + offset,
- avail);
- in -= avail;
- pg_offset += avail;
- offset += avail;
- }
- offset = 0;
- put_bh(bh[k]);
- }
- squashfs_finish_page(output);
+bio_alloc_failed:
+ kfree(bio_req);
+req_alloc_failed:
+ unlock_buffer(bh);
+ while (--nr_buffers >= b)
+ if (req->bh[nr_buffers])
+ put_bh(req->bh[nr_buffers]);
+ while (--b >= 0)
+ if (req->bh[b])
+ wait_on_buffer(req->bh[b]);
+getblk_failed:
+ free_read_request(req, -ENOMEM);
+ return -ENOMEM;
+}
+
+static int read_metadata_block(struct squashfs_read_request *req,
+ u64 *next_index)
+{
+ int ret, error, bytes_read = 0, bytes_uncompressed = 0;
+ struct squashfs_sb_info *msblk = req->sb->s_fs_info;
+
+ if (req->index + 2 > msblk->bytes_used) {
+ free_read_request(req, -EINVAL);
+ return -EINVAL;
+ }
+ req->length = 2;
+
+ /* Do not read beyond the end of the device */
+ if (req->index + req->length > msblk->bytes_used)
+ req->length = msblk->bytes_used - req->index;
+ req->data_processing = SQUASHFS_METADATA;
+
+ /*
+ * Reading metadata is always synchronous because we don't know the
+ * length in advance and the function is expected to update
+ * 'next_index' and return the length.
+ */
+ req->synchronous = true;
+ req->res = &error;
+ req->bytes_read = &bytes_read;
+ req->bytes_uncompressed = &bytes_uncompressed;
+
+ TRACE("Metadata block @ 0x%llx, %scompressed size %d, src size %d\n",
+ req->index, req->compressed ? "" : "un", bytes_read,
+ req->output->length);
+
+ ret = squashfs_bio_submit(req);
+ if (ret)
+ return ret;
+ if (error)
+ return error;
+ if (next_index)
+ *next_index += 2 + bytes_read;
+ return bytes_uncompressed;
+}
+
+static int read_data_block(struct squashfs_read_request *req, int length,
+ u64 *next_index, bool synchronous)
+{
+ int ret, error = 0, bytes_uncompressed = 0, bytes_read = 0;
+
+ req->compressed = SQUASHFS_COMPRESSED_BLOCK(length);
+ req->length = length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length);
+ req->data_processing = req->compressed ? SQUASHFS_DECOMPRESS
+ : SQUASHFS_COPY;
+
+ req->synchronous = synchronous;
+ if (synchronous) {
+ req->res = &error;
+ req->bytes_read = &bytes_read;
+ req->bytes_uncompressed = &bytes_uncompressed;
+ }
+
+ TRACE("Data block @ 0x%llx, %scompressed size %d, src size %d\n",
+ req->index, req->compressed ? "" : "un", req->length,
+ req->output->length);
+
+ ret = squashfs_bio_submit(req);
+ if (ret)
+ return ret;
+ if (synchronous)
+ ret = error ? error : bytes_uncompressed;
+ if (next_index)
+ *next_index += length;
+ return ret;
+}
+
+/*
+ * Read and decompress a metadata block or datablock. Length is non-zero
+ * if a datablock is being read (the size is stored elsewhere in the
+ * filesystem), otherwise the length is obtained from the first two bytes of
+ * the metadata block. A bit in the length field indicates if the block
+ * is stored uncompressed in the filesystem (usually because compression
+ * generated a larger block - this does occasionally happen with compression
+ * algorithms).
+ */
+static int __squashfs_read_data(struct super_block *sb, u64 index, int length,
+ u64 *next_index, struct squashfs_page_actor *output, bool sync)
+{
+ struct squashfs_read_request *req;
+
+ req = kcalloc(1, sizeof(struct squashfs_read_request), GFP_KERNEL);
+ if (!req) {
+ if (!sync)
+ squashfs_page_actor_free(output, -ENOMEM);
+ return -ENOMEM;
+ }
+
+ req->sb = sb;
+ req->index = index;
+ req->output = output;
+
+ if (next_index)
+ *next_index = index;
+
+ if (length)
+ length = read_data_block(req, length, next_index, sync);
+ else
+ length = read_metadata_block(req, next_index);
+
+ if (length < 0) {
+ ERROR("squashfs_read_data failed to read block 0x%llx\n",
+ (unsigned long long)index);
+ return -EIO;
}
- kfree(bh);
return length;
+}
-block_release:
- for (; k < b; k++)
- put_bh(bh[k]);
+int squashfs_read_data(struct super_block *sb, u64 index, int length,
+ u64 *next_index, struct squashfs_page_actor *output)
+{
+ return __squashfs_read_data(sb, index, length, next_index, output,
+ true);
+}
+
+int squashfs_read_data_async(struct super_block *sb, u64 index, int length,
+ u64 *next_index, struct squashfs_page_actor *output)
+{
-read_failure:
- ERROR("squashfs_read_data failed to read block 0x%llx\n",
- (unsigned long long) index);
- kfree(bh);
- return -EIO;
+ return __squashfs_read_data(sb, index, length, next_index, output,
+ false);
}
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index 1cb70a0b2168..6785d086ab38 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -209,17 +209,14 @@ void squashfs_cache_put(struct squashfs_cache_entry *entry)
*/
void squashfs_cache_delete(struct squashfs_cache *cache)
{
- int i, j;
+ int i;
if (cache == NULL)
return;
for (i = 0; i < cache->entries; i++) {
- if (cache->entry[i].data) {
- for (j = 0; j < cache->pages; j++)
- kfree(cache->entry[i].data[j]);
- kfree(cache->entry[i].data);
- }
+ if (cache->entry[i].page)
+ free_page_array(cache->entry[i].page, cache->pages);
kfree(cache->entry[i].actor);
}
@@ -236,7 +233,7 @@ void squashfs_cache_delete(struct squashfs_cache *cache)
struct squashfs_cache *squashfs_cache_init(char *name, int entries,
int block_size)
{
- int i, j;
+ int i;
struct squashfs_cache *cache = kzalloc(sizeof(*cache), GFP_KERNEL);
if (cache == NULL) {
@@ -268,22 +265,13 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries,
init_waitqueue_head(&cache->entry[i].wait_queue);
entry->cache = cache;
entry->block = SQUASHFS_INVALID_BLK;
- entry->data = kcalloc(cache->pages, sizeof(void *), GFP_KERNEL);
- if (entry->data == NULL) {
+ entry->page = alloc_page_array(cache->pages, GFP_KERNEL);
+ if (!entry->page) {
ERROR("Failed to allocate %s cache entry\n", name);
goto cleanup;
}
-
- for (j = 0; j < cache->pages; j++) {
- entry->data[j] = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
- if (entry->data[j] == NULL) {
- ERROR("Failed to allocate %s buffer\n", name);
- goto cleanup;
- }
- }
-
- entry->actor = squashfs_page_actor_init(entry->data,
- cache->pages, 0);
+ entry->actor = squashfs_page_actor_init(entry->page,
+ cache->pages, 0, NULL);
if (entry->actor == NULL) {
ERROR("Failed to allocate %s cache entry\n", name);
goto cleanup;
@@ -314,18 +302,20 @@ int squashfs_copy_data(void *buffer, struct squashfs_cache_entry *entry,
return min(length, entry->length - offset);
while (offset < entry->length) {
- void *buff = entry->data[offset / PAGE_CACHE_SIZE]
- + (offset % PAGE_CACHE_SIZE);
+ void *buff = kmap_atomic(entry->page[offset / PAGE_CACHE_SIZE])
+ + (offset % PAGE_CACHE_SIZE);
int bytes = min_t(int, entry->length - offset,
PAGE_CACHE_SIZE - (offset % PAGE_CACHE_SIZE));
if (bytes >= remaining) {
memcpy(buffer, buff, remaining);
+ kunmap_atomic(buff);
remaining = 0;
break;
}
memcpy(buffer, buff, bytes);
+ kunmap_atomic(buff);
buffer += bytes;
remaining -= bytes;
offset += bytes;
@@ -416,43 +406,38 @@ struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *sb,
void *squashfs_read_table(struct super_block *sb, u64 block, int length)
{
int pages = (length + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- int i, res;
- void *table, *buffer, **data;
+ struct page **page;
+ void *buff;
+ int res;
struct squashfs_page_actor *actor;
- table = buffer = kmalloc(length, GFP_KERNEL);
- if (table == NULL)
+ page = alloc_page_array(pages, GFP_KERNEL);
+ if (!page)
return ERR_PTR(-ENOMEM);
- data = kcalloc(pages, sizeof(void *), GFP_KERNEL);
- if (data == NULL) {
- res = -ENOMEM;
- goto failed;
- }
-
- actor = squashfs_page_actor_init(data, pages, length);
+ actor = squashfs_page_actor_init(page, pages, length, NULL);
if (actor == NULL) {
res = -ENOMEM;
- goto failed2;
+ goto failed;
}
- for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE)
- data[i] = buffer;
-
res = squashfs_read_data(sb, block, length |
SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, actor);
- kfree(data);
- kfree(actor);
-
if (res < 0)
- goto failed;
+ goto failed2;
- return table;
+ buff = kmalloc(length, GFP_KERNEL);
+ if (!buff)
+ goto failed2;
+ squashfs_actor_to_buf(actor, buff, length);
+ squashfs_page_actor_free(actor, 0);
+ free_page_array(page, pages);
+ return buff;
failed2:
- kfree(data);
+ squashfs_page_actor_free(actor, 0);
failed:
- kfree(table);
+ free_page_array(page, pages);
return ERR_PTR(res);
}
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index ac22fe73b0ad..7de35bf297aa 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -24,7 +24,8 @@
#include <linux/types.h>
#include <linux/mutex.h>
#include <linux/slab.h>
-#include <linux/buffer_head.h>
+#include <linux/highmem.h>
+#include <linux/fs.h>
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
@@ -41,6 +42,12 @@ static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = {
NULL, NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0
};
+#ifndef CONFIG_SQUASHFS_LZ4
+static const struct squashfs_decompressor squashfs_lz4_comp_ops = {
+ NULL, NULL, NULL, NULL, LZ4_COMPRESSION, "lz4", 0
+};
+#endif
+
#ifndef CONFIG_SQUASHFS_LZO
static const struct squashfs_decompressor squashfs_lzo_comp_ops = {
NULL, NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0
@@ -65,6 +72,7 @@ static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
static const struct squashfs_decompressor *decompressor[] = {
&squashfs_zlib_comp_ops,
+ &squashfs_lz4_comp_ops,
&squashfs_lzo_comp_ops,
&squashfs_xz_comp_ops,
&squashfs_lzma_unsupported_comp_ops,
@@ -87,40 +95,44 @@ const struct squashfs_decompressor *squashfs_lookup_decompressor(int id)
static void *get_comp_opts(struct super_block *sb, unsigned short flags)
{
struct squashfs_sb_info *msblk = sb->s_fs_info;
- void *buffer = NULL, *comp_opts;
+ void *comp_opts, *buffer = NULL;
+ struct page *page;
struct squashfs_page_actor *actor = NULL;
int length = 0;
+ if (!SQUASHFS_COMP_OPTS(flags))
+ return squashfs_comp_opts(msblk, buffer, length);
+
/*
* Read decompressor specific options from file system if present
*/
- if (SQUASHFS_COMP_OPTS(flags)) {
- buffer = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
- if (buffer == NULL) {
- comp_opts = ERR_PTR(-ENOMEM);
- goto out;
- }
-
- actor = squashfs_page_actor_init(&buffer, 1, 0);
- if (actor == NULL) {
- comp_opts = ERR_PTR(-ENOMEM);
- goto out;
- }
-
- length = squashfs_read_data(sb,
- sizeof(struct squashfs_super_block), 0, NULL, actor);
-
- if (length < 0) {
- comp_opts = ERR_PTR(length);
- goto out;
- }
+
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
+ actor = squashfs_page_actor_init(&page, 1, 0, NULL);
+ if (actor == NULL) {
+ comp_opts = ERR_PTR(-ENOMEM);
+ goto actor_error;
+ }
+
+ length = squashfs_read_data(sb,
+ sizeof(struct squashfs_super_block), 0, NULL, actor);
+
+ if (length < 0) {
+ comp_opts = ERR_PTR(length);
+ goto read_error;
}
+ buffer = kmap_atomic(page);
comp_opts = squashfs_comp_opts(msblk, buffer, length);
+ kunmap_atomic(buffer);
-out:
- kfree(actor);
- kfree(buffer);
+read_error:
+ squashfs_page_actor_free(actor, 0);
+actor_error:
+ __free_page(page);
return comp_opts;
}
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
index af0985321808..a25713c031a5 100644
--- a/fs/squashfs/decompressor.h
+++ b/fs/squashfs/decompressor.h
@@ -46,6 +46,10 @@ static inline void *squashfs_comp_opts(struct squashfs_sb_info *msblk,
extern const struct squashfs_decompressor squashfs_xz_comp_ops;
#endif
+#ifdef CONFIG_SQUASHFS_LZ4
+extern const struct squashfs_decompressor squashfs_lz4_comp_ops;
+#endif
+
#ifdef CONFIG_SQUASHFS_LZO
extern const struct squashfs_decompressor squashfs_lzo_comp_ops;
#endif
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index e5c9689062ba..d778763de26e 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -47,12 +47,16 @@
#include <linux/string.h>
#include <linux/pagemap.h>
#include <linux/mutex.h>
+#include <linux/mm_inline.h>
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
#include "squashfs_fs_i.h"
#include "squashfs.h"
+// Backported from 4.5
+#define lru_to_page(head) (list_entry((head)->prev, struct page, lru))
+
/*
* Locate cache slot in range [offset, index] for specified inode. If
* there's more than one return the slot closest to index.
@@ -438,6 +442,21 @@ static int squashfs_readpage_fragment(struct page *page)
return res;
}
+static int squashfs_readpages_fragment(struct page *page,
+ struct list_head *readahead_pages, struct address_space *mapping)
+{
+ if (!page) {
+ page = lru_to_page(readahead_pages);
+ list_del(&page->lru);
+ if (add_to_page_cache_lru(page, mapping, page->index,
+ GFP_KERNEL)) {
+ put_page(page);
+ return 0;
+ }
+ }
+ return squashfs_readpage_fragment(page);
+}
+
static int squashfs_readpage_sparse(struct page *page, int index, int file_end)
{
struct inode *inode = page->mapping->host;
@@ -450,54 +469,105 @@ static int squashfs_readpage_sparse(struct page *page, int index, int file_end)
return 0;
}
-static int squashfs_readpage(struct file *file, struct page *page)
+static int squashfs_readpages_sparse(struct page *page,
+ struct list_head *readahead_pages, int index, int file_end,
+ struct address_space *mapping)
{
- struct inode *inode = page->mapping->host;
+ if (!page) {
+ page = lru_to_page(readahead_pages);
+ list_del(&page->lru);
+ if (add_to_page_cache_lru(page, mapping, page->index,
+ GFP_KERNEL)) {
+ put_page(page);
+ return 0;
+ }
+ }
+ return squashfs_readpage_sparse(page, index, file_end);
+}
+
+static int __squashfs_readpages(struct file *file, struct page *page,
+ struct list_head *readahead_pages, unsigned int nr_pages,
+ struct address_space *mapping)
+{
+ struct inode *inode = mapping->host;
struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
- int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT);
int file_end = i_size_read(inode) >> msblk->block_log;
int res;
- void *pageaddr;
- TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n",
- page->index, squashfs_i(inode)->start);
+ do {
+ struct page *cur_page = page ? page
+ : lru_to_page(readahead_pages);
+ int page_index = cur_page->index;
+ int index = page_index >> (msblk->block_log - PAGE_CACHE_SHIFT);
+
+ if (page_index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT))
+ return 1;
+
+ if (index < file_end || squashfs_i(inode)->fragment_block ==
+ SQUASHFS_INVALID_BLK) {
+ u64 block = 0;
+ int bsize = read_blocklist(inode, index, &block);
+
+ if (bsize < 0)
+ return -1;
+
+ if (bsize == 0) {
+ res = squashfs_readpages_sparse(page,
+ readahead_pages, index, file_end,
+ mapping);
+ } else {
+ res = squashfs_readpages_block(page,
+ readahead_pages, &nr_pages, mapping,
+ page_index, block, bsize);
+ }
+ } else {
+ res = squashfs_readpages_fragment(page,
+ readahead_pages, mapping);
+ }
+ if (res)
+ return 0;
+ page = NULL;
+ } while (readahead_pages && !list_empty(readahead_pages));
- if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT))
- goto out;
+ return 0;
+}
- if (index < file_end || squashfs_i(inode)->fragment_block ==
- SQUASHFS_INVALID_BLK) {
- u64 block = 0;
- int bsize = read_blocklist(inode, index, &block);
- if (bsize < 0)
- goto error_out;
+static int squashfs_readpage(struct file *file, struct page *page)
+{
+ int ret;
- if (bsize == 0)
- res = squashfs_readpage_sparse(page, index, file_end);
+ TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n",
+ page->index, squashfs_i(page->mapping->host)->start);
+
+ get_page(page);
+
+ ret = __squashfs_readpages(file, page, NULL, 1, page->mapping);
+ if (ret) {
+ flush_dcache_page(page);
+ if (ret < 0)
+ SetPageError(page);
else
- res = squashfs_readpage_block(page, block, bsize);
- } else
- res = squashfs_readpage_fragment(page);
-
- if (!res)
- return 0;
-
-error_out:
- SetPageError(page);
-out:
- pageaddr = kmap_atomic(page);
- memset(pageaddr, 0, PAGE_CACHE_SIZE);
- kunmap_atomic(pageaddr);
- flush_dcache_page(page);
- if (!PageError(page))
- SetPageUptodate(page);
- unlock_page(page);
+ SetPageUptodate(page);
+ zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ unlock_page(page);
+ put_page(page);
+ }
return 0;
}
+static int squashfs_readpages(struct file *file, struct address_space *mapping,
+ struct list_head *pages, unsigned int nr_pages)
+{
+ TRACE("Entered squashfs_readpages, %u pages, first page index %lx\n",
+ nr_pages, lru_to_page(pages)->index);
+ __squashfs_readpages(file, NULL, pages, nr_pages, mapping);
+ return 0;
+}
+
const struct address_space_operations squashfs_aops = {
- .readpage = squashfs_readpage
+ .readpage = squashfs_readpage,
+ .readpages = squashfs_readpages,
};
diff --git a/fs/squashfs/file_cache.c b/fs/squashfs/file_cache.c
deleted file mode 100644
index f2310d2a2019..000000000000
--- a/fs/squashfs/file_cache.c
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (c) 2013
- * Phillip Lougher <phillip@squashfs.org.uk>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- */
-
-#include <linux/fs.h>
-#include <linux/vfs.h>
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/pagemap.h>
-#include <linux/mutex.h>
-
-#include "squashfs_fs.h"
-#include "squashfs_fs_sb.h"
-#include "squashfs_fs_i.h"
-#include "squashfs.h"
-
-/* Read separately compressed datablock and memcopy into page cache */
-int squashfs_readpage_block(struct page *page, u64 block, int bsize)
-{
- struct inode *i = page->mapping->host;
- struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb,
- block, bsize);
- int res = buffer->error;
-
- if (res)
- ERROR("Unable to read page, block %llx, size %x\n", block,
- bsize);
- else
- squashfs_copy_cache(page, buffer, buffer->length, 0);
-
- squashfs_cache_put(buffer);
- return res;
-}
diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c
index 43e7a7eddac0..23458bca945d 100644
--- a/fs/squashfs/file_direct.c
+++ b/fs/squashfs/file_direct.c
@@ -13,6 +13,7 @@
#include <linux/string.h>
#include <linux/pagemap.h>
#include <linux/mutex.h>
+#include <linux/mm_inline.h>
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
@@ -20,157 +21,139 @@
#include "squashfs.h"
#include "page_actor.h"
-static int squashfs_read_cache(struct page *target_page, u64 block, int bsize,
- int pages, struct page **page);
-
-/* Read separately compressed datablock directly into page cache */
-int squashfs_readpage_block(struct page *target_page, u64 block, int bsize)
+// Backported from 4.5
+#define lru_to_page(head) (list_entry((head)->prev, struct page, lru))
+static void release_actor_pages(struct page **page, int pages, int error)
{
- struct inode *inode = target_page->mapping->host;
- struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+ int i;
- int file_end = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
- int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1;
- int start_index = target_page->index & ~mask;
- int end_index = start_index | mask;
- int i, n, pages, missing_pages, bytes, res = -ENOMEM;
+ for (i = 0; i < pages; i++) {
+ if (!page[i])
+ continue;
+ flush_dcache_page(page[i]);
+ if (!error)
+ SetPageUptodate(page[i]);
+ else {
+ SetPageError(page[i]);
+ zero_user_segment(page[i], 0, PAGE_CACHE_SIZE);
+ }
+ unlock_page(page[i]);
+ put_page(page[i]);
+ }
+ kfree(page);
+}
+
+/*
+ * Create a "page actor" which will kmap and kunmap the
+ * page cache pages appropriately within the decompressor
+ */
+static struct squashfs_page_actor *actor_from_page_cache(
+ unsigned int actor_pages, struct page *target_page,
+ struct list_head *rpages, unsigned int *nr_pages, int start_index,
+ struct address_space *mapping)
+{
struct page **page;
struct squashfs_page_actor *actor;
- void *pageaddr;
-
- if (end_index > file_end)
- end_index = file_end;
-
- pages = end_index - start_index + 1;
-
- page = kmalloc_array(pages, sizeof(void *), GFP_KERNEL);
- if (page == NULL)
- return res;
-
- /*
- * Create a "page actor" which will kmap and kunmap the
- * page cache pages appropriately within the decompressor
- */
- actor = squashfs_page_actor_init_special(page, pages, 0);
- if (actor == NULL)
- goto out;
-
- /* Try to grab all the pages covered by the Squashfs block */
- for (missing_pages = 0, i = 0, n = start_index; i < pages; i++, n++) {
- page[i] = (n == target_page->index) ? target_page :
- grab_cache_page_nowait(target_page->mapping, n);
+ int i, n;
+ gfp_t gfp = GFP_KERNEL;
+
+ page = kmalloc_array(actor_pages, sizeof(void *), GFP_KERNEL);
+ if (!page)
+ return NULL;
+
+ for (i = 0, n = start_index; i < actor_pages; i++, n++) {
+ if (target_page == NULL && rpages && !list_empty(rpages)) {
+ struct page *cur_page = lru_to_page(rpages);
+
+ if (cur_page->index < start_index + actor_pages) {
+ list_del(&cur_page->lru);
+ --(*nr_pages);
+ if (add_to_page_cache_lru(cur_page, mapping,
+ cur_page->index, gfp))
+ put_page(cur_page);
+ else
+ target_page = cur_page;
+ } else
+ rpages = NULL;
+ }
- if (page[i] == NULL) {
- missing_pages++;
- continue;
+ if (target_page && target_page->index == n) {
+ page[i] = target_page;
+ target_page = NULL;
+ } else {
+ page[i] = grab_cache_page_nowait(mapping, n);
+ if (page[i] == NULL)
+ continue;
}
if (PageUptodate(page[i])) {
unlock_page(page[i]);
page_cache_release(page[i]);
page[i] = NULL;
- missing_pages++;
}
}
- if (missing_pages) {
- /*
- * Couldn't get one or more pages, this page has either
- * been VM reclaimed, but others are still in the page cache
- * and uptodate, or we're racing with another thread in
- * squashfs_readpage also trying to grab them. Fall back to
- * using an intermediate buffer.
- */
- res = squashfs_read_cache(target_page, block, bsize, pages,
- page);
- if (res < 0)
- goto mark_errored;
-
- goto out;
- }
-
- /* Decompress directly into the page cache buffers */
- res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor);
- if (res < 0)
- goto mark_errored;
-
- /* Last page may have trailing bytes not filled */
- bytes = res % PAGE_CACHE_SIZE;
- if (bytes) {
- pageaddr = kmap_atomic(page[pages - 1]);
- memset(pageaddr + bytes, 0, PAGE_CACHE_SIZE - bytes);
- kunmap_atomic(pageaddr);
- }
-
- /* Mark pages as uptodate, unlock and release */
- for (i = 0; i < pages; i++) {
- flush_dcache_page(page[i]);
- SetPageUptodate(page[i]);
- unlock_page(page[i]);
- if (page[i] != target_page)
- page_cache_release(page[i]);
+ actor = squashfs_page_actor_init(page, actor_pages, 0,
+ release_actor_pages);
+ if (!actor) {
+ release_actor_pages(page, actor_pages, -ENOMEM);
+ kfree(page);
+ return NULL;
}
-
- kfree(actor);
- kfree(page);
-
- return 0;
-
-mark_errored:
- /* Decompression failed, mark pages as errored. Target_page is
- * dealt with by the caller
- */
- for (i = 0; i < pages; i++) {
- if (page[i] == NULL || page[i] == target_page)
- continue;
- flush_dcache_page(page[i]);
- SetPageError(page[i]);
- unlock_page(page[i]);
- page_cache_release(page[i]);
- }
-
-out:
- kfree(actor);
- kfree(page);
- return res;
+ return actor;
}
+int squashfs_readpages_block(struct page *target_page,
+ struct list_head *readahead_pages,
+ unsigned int *nr_pages,
+ struct address_space *mapping,
+ int page_index, u64 block, int bsize)
-static int squashfs_read_cache(struct page *target_page, u64 block, int bsize,
- int pages, struct page **page)
{
- struct inode *i = target_page->mapping->host;
- struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb,
- block, bsize);
- int bytes = buffer->length, res = buffer->error, n, offset = 0;
- void *pageaddr;
-
- if (res) {
- ERROR("Unable to read page, block %llx, size %x\n", block,
- bsize);
- goto out;
- }
-
- for (n = 0; n < pages && bytes > 0; n++,
- bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) {
- int avail = min_t(int, bytes, PAGE_CACHE_SIZE);
-
- if (page[n] == NULL)
- continue;
+ struct squashfs_page_actor *actor;
+ struct inode *inode = mapping->host;
+ struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+ int start_index, end_index, file_end, actor_pages, res;
+ int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1;
- pageaddr = kmap_atomic(page[n]);
- squashfs_copy_data(pageaddr, buffer, offset, avail);
- memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail);
- kunmap_atomic(pageaddr);
- flush_dcache_page(page[n]);
- SetPageUptodate(page[n]);
- unlock_page(page[n]);
- if (page[n] != target_page)
- page_cache_release(page[n]);
+ /*
+ * If readpage() is called on an uncompressed datablock, we can just
+ * read the pages instead of fetching the whole block.
+ * This greatly improves the performance when a process keep doing
+ * random reads because we only fetch the necessary data.
+ * The readahead algorithm will take care of doing speculative reads
+ * if necessary.
+ * We can't read more than 1 block even if readahead provides use more
+ * pages because we don't know yet if the next block is compressed or
+ * not.
+ */
+ if (bsize && !SQUASHFS_COMPRESSED_BLOCK(bsize)) {
+ u64 block_end = block + msblk->block_size;
+
+ block += (page_index & mask) * PAGE_SIZE;
+ actor_pages = (block_end - block) / PAGE_SIZE;
+ if (*nr_pages < actor_pages)
+ actor_pages = *nr_pages;
+ start_index = page_index;
+ bsize = min_t(int, bsize, (PAGE_SIZE * actor_pages)
+ | SQUASHFS_COMPRESSED_BIT_BLOCK);
+ } else {
+ file_end = (i_size_read(inode) - 1) >> PAGE_SHIFT;
+ start_index = page_index & ~mask;
+ end_index = start_index | mask;
+ if (end_index > file_end)
+ end_index = file_end;
+ actor_pages = end_index - start_index + 1;
}
-out:
- squashfs_cache_put(buffer);
- return res;
+ actor = actor_from_page_cache(actor_pages, target_page,
+ readahead_pages, nr_pages, start_index,
+ mapping);
+ if (!actor)
+ return -ENOMEM;
+
+ res = squashfs_read_data_async(inode->i_sb, block, bsize, NULL,
+ actor);
+ return res < 0 ? res : 0;
}
diff --git a/fs/squashfs/lz4_wrapper.c b/fs/squashfs/lz4_wrapper.c
new file mode 100644
index 000000000000..df4fa3c7ddd0
--- /dev/null
+++ b/fs/squashfs/lz4_wrapper.c
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2013, 2014
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/lz4.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs.h"
+#include "decompressor.h"
+#include "page_actor.h"
+
+#define LZ4_LEGACY 1
+
+struct lz4_comp_opts {
+ __le32 version;
+ __le32 flags;
+};
+
+struct squashfs_lz4 {
+ void *input;
+ void *output;
+};
+
+
+static void *lz4_comp_opts(struct squashfs_sb_info *msblk,
+ void *buff, int len)
+{
+ struct lz4_comp_opts *comp_opts = buff;
+
+ /* LZ4 compressed filesystems always have compression options */
+ if (comp_opts == NULL || len < sizeof(*comp_opts))
+ return ERR_PTR(-EIO);
+
+ if (le32_to_cpu(comp_opts->version) != LZ4_LEGACY) {
+ /* LZ4 format currently used by the kernel is the 'legacy'
+ * format */
+ ERROR("Unknown LZ4 version\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ return NULL;
+}
+
+
+static void *lz4_init(struct squashfs_sb_info *msblk, void *buff)
+{
+ int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE);
+ struct squashfs_lz4 *stream;
+
+ stream = kzalloc(sizeof(*stream), GFP_KERNEL);
+ if (stream == NULL)
+ goto failed;
+ stream->input = vmalloc(block_size);
+ if (stream->input == NULL)
+ goto failed2;
+ stream->output = vmalloc(block_size);
+ if (stream->output == NULL)
+ goto failed3;
+
+ return stream;
+
+failed3:
+ vfree(stream->input);
+failed2:
+ kfree(stream);
+failed:
+ ERROR("Failed to initialise LZ4 decompressor\n");
+ return ERR_PTR(-ENOMEM);
+}
+
+
+static void lz4_free(void *strm)
+{
+ struct squashfs_lz4 *stream = strm;
+
+ if (stream) {
+ vfree(stream->input);
+ vfree(stream->output);
+ }
+ kfree(stream);
+}
+
+
+static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm,
+ struct buffer_head **bh, int b, int offset, int length,
+ struct squashfs_page_actor *output)
+{
+ int res;
+ size_t dest_len = output->length;
+ struct squashfs_lz4 *stream = strm;
+
+ squashfs_bh_to_buf(bh, b, stream->input, offset, length,
+ msblk->devblksize);
+ res = lz4_decompress_unknownoutputsize(stream->input, length,
+ stream->output, &dest_len);
+ if (res)
+ return -EIO;
+ squashfs_buf_to_actor(stream->output, output, dest_len);
+
+ return dest_len;
+}
+
+const struct squashfs_decompressor squashfs_lz4_comp_ops = {
+ .init = lz4_init,
+ .comp_opts = lz4_comp_opts,
+ .free = lz4_free,
+ .decompress = lz4_uncompress,
+ .id = LZ4_COMPRESSION,
+ .name = "lz4",
+ .supported = 1
+};
diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c
index 244b9fbfff7b..2c844d53a59e 100644
--- a/fs/squashfs/lzo_wrapper.c
+++ b/fs/squashfs/lzo_wrapper.c
@@ -79,45 +79,19 @@ static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm,
struct buffer_head **bh, int b, int offset, int length,
struct squashfs_page_actor *output)
{
- struct squashfs_lzo *stream = strm;
- void *buff = stream->input, *data;
- int avail, i, bytes = length, res;
+ int res;
size_t out_len = output->length;
+ struct squashfs_lzo *stream = strm;
- for (i = 0; i < b; i++) {
- avail = min(bytes, msblk->devblksize - offset);
- memcpy(buff, bh[i]->b_data + offset, avail);
- buff += avail;
- bytes -= avail;
- offset = 0;
- put_bh(bh[i]);
- }
-
+ squashfs_bh_to_buf(bh, b, stream->input, offset, length,
+ msblk->devblksize);
res = lzo1x_decompress_safe(stream->input, (size_t)length,
stream->output, &out_len);
if (res != LZO_E_OK)
- goto failed;
+ return -EIO;
+ squashfs_buf_to_actor(stream->output, output, out_len);
- res = bytes = (int)out_len;
- data = squashfs_first_page(output);
- buff = stream->output;
- while (data) {
- if (bytes <= PAGE_CACHE_SIZE) {
- memcpy(data, buff, bytes);
- break;
- } else {
- memcpy(data, buff, PAGE_CACHE_SIZE);
- buff += PAGE_CACHE_SIZE;
- bytes -= PAGE_CACHE_SIZE;
- data = squashfs_next_page(output);
- }
- }
- squashfs_finish_page(output);
-
- return res;
-
-failed:
- return -EIO;
+ return out_len;
}
const struct squashfs_decompressor squashfs_lzo_comp_ops = {
diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c
index 5a1c11f56441..ee1ade20f364 100644
--- a/fs/squashfs/page_actor.c
+++ b/fs/squashfs/page_actor.c
@@ -9,92 +9,145 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
+#include <linux/buffer_head.h>
#include "page_actor.h"
-/*
- * This file contains implementations of page_actor for decompressing into
- * an intermediate buffer, and for decompressing directly into the
- * page cache.
- *
- * Calling code should avoid sleeping between calls to squashfs_first_page()
- * and squashfs_finish_page().
- */
-
-/* Implementation of page_actor for decompressing into intermediate buffer */
-static void *cache_first_page(struct squashfs_page_actor *actor)
+struct squashfs_page_actor *squashfs_page_actor_init(struct page **page,
+ int pages, int length, void (*release_pages)(struct page **, int, int))
{
- actor->next_page = 1;
- return actor->buffer[0];
-}
+ struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL);
-static void *cache_next_page(struct squashfs_page_actor *actor)
-{
- if (actor->next_page == actor->pages)
+ if (actor == NULL)
return NULL;
- return actor->buffer[actor->next_page++];
+ actor->length = length ? : pages * PAGE_SIZE;
+ actor->page = page;
+ actor->pages = pages;
+ actor->next_page = 0;
+ actor->pageaddr = NULL;
+ actor->release_pages = release_pages;
+ return actor;
}
-static void cache_finish_page(struct squashfs_page_actor *actor)
+void squashfs_page_actor_free(struct squashfs_page_actor *actor, int error)
{
- /* empty */
+ if (!actor)
+ return;
+
+ if (actor->release_pages)
+ actor->release_pages(actor->page, actor->pages, error);
+ kfree(actor);
}
-struct squashfs_page_actor *squashfs_page_actor_init(void **buffer,
- int pages, int length)
+void squashfs_actor_to_buf(struct squashfs_page_actor *actor, void *buf,
+ int length)
{
- struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL);
-
- if (actor == NULL)
- return NULL;
-
- actor->length = length ? : pages * PAGE_CACHE_SIZE;
- actor->buffer = buffer;
- actor->pages = pages;
- actor->next_page = 0;
- actor->squashfs_first_page = cache_first_page;
- actor->squashfs_next_page = cache_next_page;
- actor->squashfs_finish_page = cache_finish_page;
- return actor;
+ void *pageaddr;
+ int i, avail, pos = 0;
+
+ for (i = 0; i < actor->pages && pos < length; ++i) {
+ avail = min_t(int, length - pos, PAGE_SIZE);
+ if (actor->page[i]) {
+ pageaddr = kmap_atomic(actor->page[i]);
+ memcpy(buf + pos, pageaddr, avail);
+ kunmap_atomic(pageaddr);
+ }
+ pos += avail;
+ }
}
-/* Implementation of page_actor for decompressing directly into page cache. */
-static void *direct_first_page(struct squashfs_page_actor *actor)
+void squashfs_buf_to_actor(void *buf, struct squashfs_page_actor *actor,
+ int length)
{
- actor->next_page = 1;
- return actor->pageaddr = kmap_atomic(actor->page[0]);
+ void *pageaddr;
+ int i, avail, pos = 0;
+
+ for (i = 0; i < actor->pages && pos < length; ++i) {
+ avail = min_t(int, length - pos, PAGE_SIZE);
+ if (actor->page[i]) {
+ pageaddr = kmap_atomic(actor->page[i]);
+ memcpy(pageaddr, buf + pos, avail);
+ kunmap_atomic(pageaddr);
+ }
+ pos += avail;
+ }
}
-static void *direct_next_page(struct squashfs_page_actor *actor)
+void squashfs_bh_to_actor(struct buffer_head **bh, int nr_buffers,
+ struct squashfs_page_actor *actor, int offset, int length, int blksz)
{
- if (actor->pageaddr)
- kunmap_atomic(actor->pageaddr);
+ void *kaddr = NULL;
+ int bytes = 0, pgoff = 0, b = 0, p = 0, i, avail;
+
+ while (bytes < length) {
+ if (actor->page[p]) {
+ kaddr = kmap_atomic(actor->page[p]);
+ while (pgoff < PAGE_SIZE && bytes < length) {
+ avail = min_t(int, blksz - offset,
+ PAGE_SIZE - pgoff);
+ memcpy(kaddr + pgoff, bh[b]->b_data + offset,
+ avail);
+ pgoff += avail;
+ bytes += avail;
+ offset = (offset + avail) % blksz;
+ if (!offset) {
+ put_bh(bh[b]);
+ ++b;
+ }
+ }
+ kunmap_atomic(kaddr);
+ pgoff = 0;
+ } else {
+ for (i = 0; i < PAGE_SIZE / blksz; ++i) {
+ if (bh[b])
+ put_bh(bh[b]);
+ ++b;
+ }
+ bytes += PAGE_SIZE;
+ }
+ ++p;
+ }
+}
- return actor->pageaddr = actor->next_page == actor->pages ? NULL :
- kmap_atomic(actor->page[actor->next_page++]);
+void squashfs_bh_to_buf(struct buffer_head **bh, int nr_buffers, void *buf,
+ int offset, int length, int blksz)
+{
+ int i, avail, bytes = 0;
+
+ for (i = 0; i < nr_buffers && bytes < length; ++i) {
+ avail = min_t(int, length - bytes, blksz - offset);
+ if (bh[i]) {
+ memcpy(buf + bytes, bh[i]->b_data + offset, avail);
+ put_bh(bh[i]);
+ }
+ bytes += avail;
+ offset = 0;
+ }
}
-static void direct_finish_page(struct squashfs_page_actor *actor)
+void free_page_array(struct page **page, int nr_pages)
{
- if (actor->pageaddr)
- kunmap_atomic(actor->pageaddr);
+ int i;
+
+ for (i = 0; i < nr_pages; ++i)
+ __free_page(page[i]);
+ kfree(page);
}
-struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page,
- int pages, int length)
+struct page **alloc_page_array(int nr_pages, int gfp_mask)
{
- struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL);
+ int i;
+ struct page **page;
- if (actor == NULL)
+ page = kcalloc(nr_pages, sizeof(struct page *), gfp_mask);
+ if (!page)
return NULL;
-
- actor->length = length ? : pages * PAGE_CACHE_SIZE;
- actor->page = page;
- actor->pages = pages;
- actor->next_page = 0;
- actor->pageaddr = NULL;
- actor->squashfs_first_page = direct_first_page;
- actor->squashfs_next_page = direct_next_page;
- actor->squashfs_finish_page = direct_finish_page;
- return actor;
+ for (i = 0; i < nr_pages; ++i) {
+ page[i] = alloc_page(gfp_mask);
+ if (!page[i]) {
+ free_page_array(page, i);
+ return NULL;
+ }
+ }
+ return page;
}
diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h
index 26dd82008b82..c228de957463 100644
--- a/fs/squashfs/page_actor.h
+++ b/fs/squashfs/page_actor.h
@@ -8,74 +8,58 @@
* the COPYING file in the top-level directory.
*/
-#ifndef CONFIG_SQUASHFS_FILE_DIRECT
struct squashfs_page_actor {
- void **page;
+ struct page **page;
+ void *pageaddr;
int pages;
int length;
int next_page;
+ void (*release_pages)(struct page **, int, int);
};
-static inline struct squashfs_page_actor *squashfs_page_actor_init(void **page,
- int pages, int length)
-{
- struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL);
-
- if (actor == NULL)
- return NULL;
+extern struct squashfs_page_actor *squashfs_page_actor_init(struct page **,
+ int, int, void (*)(struct page **, int, int));
+extern void squashfs_page_actor_free(struct squashfs_page_actor *, int);
- actor->length = length ? : pages * PAGE_CACHE_SIZE;
- actor->page = page;
- actor->pages = pages;
- actor->next_page = 0;
- return actor;
-}
+extern void squashfs_actor_to_buf(struct squashfs_page_actor *, void *, int);
+extern void squashfs_buf_to_actor(void *, struct squashfs_page_actor *, int);
+extern void squashfs_bh_to_actor(struct buffer_head **, int,
+ struct squashfs_page_actor *, int, int, int);
+extern void squashfs_bh_to_buf(struct buffer_head **, int, void *, int, int,
+ int);
+/*
+ * Calling code should avoid sleeping between calls to squashfs_first_page()
+ * and squashfs_finish_page().
+ */
static inline void *squashfs_first_page(struct squashfs_page_actor *actor)
{
actor->next_page = 1;
- return actor->page[0];
+ return actor->pageaddr = actor->page[0] ? kmap_atomic(actor->page[0])
+ : NULL;
}
static inline void *squashfs_next_page(struct squashfs_page_actor *actor)
{
- return actor->next_page == actor->pages ? NULL :
- actor->page[actor->next_page++];
-}
+ if (!IS_ERR_OR_NULL(actor->pageaddr))
+ kunmap_atomic(actor->pageaddr);
-static inline void squashfs_finish_page(struct squashfs_page_actor *actor)
-{
- /* empty */
-}
-#else
-struct squashfs_page_actor {
- union {
- void **buffer;
- struct page **page;
- };
- void *pageaddr;
- void *(*squashfs_first_page)(struct squashfs_page_actor *);
- void *(*squashfs_next_page)(struct squashfs_page_actor *);
- void (*squashfs_finish_page)(struct squashfs_page_actor *);
- int pages;
- int length;
- int next_page;
-};
+ if (actor->next_page == actor->pages)
+ return actor->pageaddr = ERR_PTR(-ENODATA);
-extern struct squashfs_page_actor *squashfs_page_actor_init(void **, int, int);
-extern struct squashfs_page_actor *squashfs_page_actor_init_special(struct page
- **, int, int);
-static inline void *squashfs_first_page(struct squashfs_page_actor *actor)
-{
- return actor->squashfs_first_page(actor);
-}
-static inline void *squashfs_next_page(struct squashfs_page_actor *actor)
-{
- return actor->squashfs_next_page(actor);
+ actor->pageaddr = actor->page[actor->next_page] ?
+ kmap_atomic(actor->page[actor->next_page]) : NULL;
+ ++actor->next_page;
+ return actor->pageaddr;
}
+
static inline void squashfs_finish_page(struct squashfs_page_actor *actor)
{
- actor->squashfs_finish_page(actor);
+ if (!IS_ERR_OR_NULL(actor->pageaddr))
+ kunmap_atomic(actor->pageaddr);
}
-#endif
+
+extern struct page **alloc_page_array(int, int);
+extern void free_page_array(struct page **, int);
+
#endif
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 887d6d270080..bb28f162ff30 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -28,8 +28,14 @@
#define WARNING(s, args...) pr_warn("SQUASHFS: "s, ## args)
/* block.c */
+extern int squashfs_init_read_wq(void);
extern int squashfs_read_data(struct super_block *, u64, int, u64 *,
struct squashfs_page_actor *);
+extern void squashfs_destroy_read_wq(void);
+extern int squashfs_read_data(struct super_block *, u64, int, u64 *,
+ struct squashfs_page_actor *);
+extern int squashfs_read_data_async(struct super_block *, u64, int, u64 *,
+ struct squashfs_page_actor *);
/* cache.c */
extern struct squashfs_cache *squashfs_cache_init(char *, int, int);
@@ -70,8 +76,9 @@ extern __le64 *squashfs_read_fragment_index_table(struct super_block *,
void squashfs_copy_cache(struct page *, struct squashfs_cache_entry *, int,
int);
-/* file_xxx.c */
-extern int squashfs_readpage_block(struct page *, u64, int);
+/* file_direct.c */
+extern int squashfs_readpages_block(struct page *, struct list_head *,
+ unsigned int *, struct address_space *, int, u64, int);
/* id.c */
extern int squashfs_get_id(struct super_block *, unsigned int, unsigned int *);
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 4b2beda49498..506f4ba5b983 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -240,6 +240,7 @@ struct meta_index {
#define LZMA_COMPRESSION 2
#define LZO_COMPRESSION 3
#define XZ_COMPRESSION 4
+#define LZ4_COMPRESSION 5
struct squashfs_super_block {
__le32 s_magic;
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 1da565cb50c3..8a6995de0277 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -49,7 +49,7 @@ struct squashfs_cache_entry {
int num_waiters;
wait_queue_head_t wait_queue;
struct squashfs_cache *cache;
- void **data;
+ struct page **page;
struct squashfs_page_actor *actor;
};
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 5056babe00df..61cd0b39ed0e 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -444,9 +444,15 @@ static int __init init_squashfs_fs(void)
if (err)
return err;
+ if (!squashfs_init_read_wq()) {
+ destroy_inodecache();
+ return -ENOMEM;
+ }
+
err = register_filesystem(&squashfs_fs_type);
if (err) {
destroy_inodecache();
+ squashfs_destroy_read_wq();
return err;
}
@@ -460,6 +466,7 @@ static void __exit exit_squashfs_fs(void)
{
unregister_filesystem(&squashfs_fs_type);
destroy_inodecache();
+ squashfs_destroy_read_wq();
}
diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c
index c609624e4b8a..2f7be1fb167c 100644
--- a/fs/squashfs/xz_wrapper.c
+++ b/fs/squashfs/xz_wrapper.c
@@ -55,7 +55,7 @@ static void *squashfs_xz_comp_opts(struct squashfs_sb_info *msblk,
struct comp_opts *opts;
int err = 0, n;
- opts = kmalloc(sizeof(*opts), GFP_KERNEL);
+ opts = kmalloc(sizeof(*opts), GFP_ATOMIC);
if (opts == NULL) {
err = -ENOMEM;
goto out2;
@@ -136,12 +136,13 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
enum xz_ret xz_err;
int avail, total = 0, k = 0;
struct squashfs_xz *stream = strm;
+ void *buf = NULL;
xz_dec_reset(stream->state);
stream->buf.in_pos = 0;
stream->buf.in_size = 0;
stream->buf.out_pos = 0;
- stream->buf.out_size = PAGE_CACHE_SIZE;
+ stream->buf.out_size = PAGE_SIZE;
stream->buf.out = squashfs_first_page(output);
do {
@@ -156,12 +157,20 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
if (stream->buf.out_pos == stream->buf.out_size) {
stream->buf.out = squashfs_next_page(output);
- if (stream->buf.out != NULL) {
+ if (!IS_ERR(stream->buf.out)) {
stream->buf.out_pos = 0;
- total += PAGE_CACHE_SIZE;
+ total += PAGE_SIZE;
}
}
+ if (!stream->buf.out) {
+ if (!buf) {
+ buf = kmalloc(PAGE_SIZE, GFP_ATOMIC);
+ if (!buf)
+ goto out;
+ }
+ stream->buf.out = buf;
+ }
xz_err = xz_dec_run(stream->state, &stream->buf);
if (stream->buf.in_pos == stream->buf.in_size && k < b)
@@ -173,11 +182,13 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
if (xz_err != XZ_STREAM_END || k < b)
goto out;
+ kfree(buf);
return total + stream->buf.out_pos;
out:
for (; k < b; k++)
put_bh(bh[k]);
+ kfree(buf);
return -EIO;
}
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 8727caba6882..d917c728422b 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -66,10 +66,11 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
struct buffer_head **bh, int b, int offset, int length,
struct squashfs_page_actor *output)
{
+ void *buf = NULL;
int zlib_err, zlib_init = 0, k = 0;
z_stream *stream = strm;
- stream->avail_out = PAGE_CACHE_SIZE;
+ stream->avail_out = PAGE_SIZE;
stream->next_out = squashfs_first_page(output);
stream->avail_in = 0;
@@ -84,8 +85,17 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
if (stream->avail_out == 0) {
stream->next_out = squashfs_next_page(output);
- if (stream->next_out != NULL)
- stream->avail_out = PAGE_CACHE_SIZE;
+ if (!IS_ERR(stream->next_out))
+ stream->avail_out = PAGE_SIZE;
+ }
+
+ if (!stream->next_out) {
+ if (!buf) {
+ buf = kmalloc(PAGE_SIZE, GFP_ATOMIC);
+ if (!buf)
+ goto out;
+ }
+ stream->next_out = buf;
}
if (!zlib_init) {
@@ -115,11 +125,13 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
if (k < b)
goto out;
+ kfree(buf);
return stream->total_out;
out:
for (; k < b; k++)
put_bh(bh[k]);
+ kfree(buf);
return -EIO;
}
diff --git a/fs/super.c b/fs/super.c
index eae088f6aaae..2693e90ab876 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -682,7 +682,8 @@ rescan:
}
/**
- * do_remount_sb - asks filesystem to change mount options.
+ * do_remount_sb2 - asks filesystem to change mount options.
+ * @mnt: mount we are looking at
* @sb: superblock in question
* @flags: numeric part of options
* @data: the rest of options
@@ -690,7 +691,7 @@ rescan:
*
* Alters the mount options of a mounted file system.
*/
-int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
+int do_remount_sb2(struct vfsmount *mnt, struct super_block *sb, int flags, void *data, int force)
{
int retval;
int remount_ro;
@@ -732,7 +733,16 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
}
}
- if (sb->s_op->remount_fs) {
+ if (mnt && sb->s_op->remount_fs2) {
+ retval = sb->s_op->remount_fs2(mnt, sb, &flags, data);
+ if (retval) {
+ if (!force)
+ goto cancel_readonly;
+ /* If forced remount, go ahead despite any errors */
+ WARN(1, "forced remount of a %s fs returned %i\n",
+ sb->s_type->name, retval);
+ }
+ } else if (sb->s_op->remount_fs) {
retval = sb->s_op->remount_fs(sb, &flags, data);
if (retval) {
if (!force)
@@ -764,12 +774,17 @@ cancel_readonly:
return retval;
}
+int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
+{
+ return do_remount_sb2(NULL, sb, flags, data, force);
+}
+
static void do_emergency_remount(struct work_struct *work)
{
struct super_block *sb, *p = NULL;
spin_lock(&sb_lock);
- list_for_each_entry(sb, &super_blocks, s_list) {
+ list_for_each_entry_reverse(sb, &super_blocks, s_list) {
if (hlist_unhashed(&sb->s_instances))
continue;
sb->s_count++;
@@ -1086,7 +1101,7 @@ struct dentry *mount_single(struct file_system_type *fs_type,
EXPORT_SYMBOL(mount_single);
struct dentry *
-mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
+mount_fs(struct file_system_type *type, int flags, const char *name, struct vfsmount *mnt, void *data)
{
struct dentry *root;
struct super_block *sb;
@@ -1103,7 +1118,10 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
goto out_free_secdata;
}
- root = type->mount(type, flags, name, data);
+ if (type->mount2)
+ root = type->mount2(mnt, type, flags, name, data);
+ else
+ root = type->mount(type, flags, name, data);
if (IS_ERR(root)) {
error = PTR_ERR(root);
goto out_free_secdata;
diff --git a/fs/sync.c b/fs/sync.c
index bdc729d80e5e..cdaf1d275c0c 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -205,6 +205,7 @@ static int do_fsync(unsigned int fd, int datasync)
if (f.file) {
ret = vfs_fsync(f.file, datasync);
fdput(f);
+ inc_syscfs(current);
}
return ret;
}
diff --git a/fs/tracefs/Makefile b/fs/tracefs/Makefile
new file mode 100644
index 000000000000..82fa35b656c4
--- /dev/null
+++ b/fs/tracefs/Makefile
@@ -0,0 +1,4 @@
+tracefs-objs := inode.o
+
+obj-$(CONFIG_TRACING) += tracefs.o
+
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
new file mode 100644
index 000000000000..f58beda7d77c
--- /dev/null
+++ b/fs/tracefs/inode.c
@@ -0,0 +1,654 @@
+/*
+ * inode.c - part of tracefs, a pseudo file system for activating tracing
+ *
+ * Based on debugfs by: Greg Kroah-Hartman <greg@kroah.com>
+ *
+ * Copyright (C) 2014 Red Hat Inc, author: Steven Rostedt <srostedt@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * tracefs is the file system that is used by the tracing infrastructure.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/kobject.h>
+#include <linux/namei.h>
+#include <linux/tracefs.h>
+#include <linux/fsnotify.h>
+#include <linux/seq_file.h>
+#include <linux/parser.h>
+#include <linux/magic.h>
+#include <linux/slab.h>
+
+#define TRACEFS_DEFAULT_MODE 0700
+
+static struct vfsmount *tracefs_mount;
+static int tracefs_mount_count;
+static bool tracefs_registered;
+
+static ssize_t default_read_file(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return 0;
+}
+
+static ssize_t default_write_file(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return count;
+}
+
+static const struct file_operations tracefs_file_operations = {
+ .read = default_read_file,
+ .write = default_write_file,
+ .open = simple_open,
+ .llseek = noop_llseek,
+};
+
+static struct tracefs_dir_ops {
+ int (*mkdir)(const char *name);
+ int (*rmdir)(const char *name);
+} tracefs_ops;
+
+static char *get_dname(struct dentry *dentry)
+{
+ const char *dname;
+ char *name;
+ int len = dentry->d_name.len;
+
+ dname = dentry->d_name.name;
+ name = kmalloc(len + 1, GFP_KERNEL);
+ if (!name)
+ return NULL;
+ memcpy(name, dname, len);
+ name[len] = 0;
+ return name;
+}
+
+static int tracefs_syscall_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode)
+{
+ char *name;
+ int ret;
+
+ name = get_dname(dentry);
+ if (!name)
+ return -ENOMEM;
+
+ /*
+ * The mkdir call can call the generic functions that create
+ * the files within the tracefs system. It is up to the individual
+ * mkdir routine to handle races.
+ */
+ mutex_unlock(&inode->i_mutex);
+ ret = tracefs_ops.mkdir(name);
+ mutex_lock(&inode->i_mutex);
+
+ kfree(name);
+
+ return ret;
+}
+
+static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry)
+{
+ char *name;
+ int ret;
+
+ name = get_dname(dentry);
+ if (!name)
+ return -ENOMEM;
+
+ /*
+ * The rmdir call can call the generic functions that create
+ * the files within the tracefs system. It is up to the individual
+ * rmdir routine to handle races.
+ * This time we need to unlock not only the parent (inode) but
+ * also the directory that is being deleted.
+ */
+ mutex_unlock(&inode->i_mutex);
+ mutex_unlock(&dentry->d_inode->i_mutex);
+
+ ret = tracefs_ops.rmdir(name);
+
+ mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
+ mutex_lock(&dentry->d_inode->i_mutex);
+
+ kfree(name);
+
+ return ret;
+}
+
+static const struct inode_operations tracefs_dir_inode_operations = {
+ .lookup = simple_lookup,
+ .mkdir = tracefs_syscall_mkdir,
+ .rmdir = tracefs_syscall_rmdir,
+};
+
+static struct inode *tracefs_get_inode(struct super_block *sb)
+{
+ struct inode *inode = new_inode(sb);
+ if (inode) {
+ inode->i_ino = get_next_ino();
+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ }
+ return inode;
+}
+
+struct tracefs_mount_opts {
+ kuid_t uid;
+ kgid_t gid;
+ umode_t mode;
+};
+
+enum {
+ Opt_uid,
+ Opt_gid,
+ Opt_mode,
+ Opt_err
+};
+
+static const match_table_t tokens = {
+ {Opt_uid, "uid=%u"},
+ {Opt_gid, "gid=%u"},
+ {Opt_mode, "mode=%o"},
+ {Opt_err, NULL}
+};
+
+struct tracefs_fs_info {
+ struct tracefs_mount_opts mount_opts;
+};
+
+static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts)
+{
+ substring_t args[MAX_OPT_ARGS];
+ int option;
+ int token;
+ kuid_t uid;
+ kgid_t gid;
+ char *p;
+
+ opts->mode = TRACEFS_DEFAULT_MODE;
+
+ while ((p = strsep(&data, ",")) != NULL) {
+ if (!*p)
+ continue;
+
+ token = match_token(p, tokens, args);
+ switch (token) {
+ case Opt_uid:
+ if (match_int(&args[0], &option))
+ return -EINVAL;
+ uid = make_kuid(current_user_ns(), option);
+ if (!uid_valid(uid))
+ return -EINVAL;
+ opts->uid = uid;
+ break;
+ case Opt_gid:
+ if (match_int(&args[0], &option))
+ return -EINVAL;
+ gid = make_kgid(current_user_ns(), option);
+ if (!gid_valid(gid))
+ return -EINVAL;
+ opts->gid = gid;
+ break;
+ case Opt_mode:
+ if (match_octal(&args[0], &option))
+ return -EINVAL;
+ opts->mode = option & S_IALLUGO;
+ break;
+ /*
+ * We might like to report bad mount options here;
+ * but traditionally tracefs has ignored all mount options
+ */
+ }
+ }
+
+ return 0;
+}
+
+static int tracefs_apply_options(struct super_block *sb)
+{
+ struct tracefs_fs_info *fsi = sb->s_fs_info;
+ struct inode *inode = sb->s_root->d_inode;
+ struct tracefs_mount_opts *opts = &fsi->mount_opts;
+
+ inode->i_mode &= ~S_IALLUGO;
+ inode->i_mode |= opts->mode;
+
+ inode->i_uid = opts->uid;
+ inode->i_gid = opts->gid;
+
+ return 0;
+}
+
+static int tracefs_remount(struct super_block *sb, int *flags, char *data)
+{
+ int err;
+ struct tracefs_fs_info *fsi = sb->s_fs_info;
+
+ sync_filesystem(sb);
+ err = tracefs_parse_options(data, &fsi->mount_opts);
+ if (err)
+ goto fail;
+
+ tracefs_apply_options(sb);
+
+fail:
+ return err;
+}
+
+static int tracefs_show_options(struct seq_file *m, struct dentry *root)
+{
+ struct tracefs_fs_info *fsi = root->d_sb->s_fs_info;
+ struct tracefs_mount_opts *opts = &fsi->mount_opts;
+
+ if (!uid_eq(opts->uid, GLOBAL_ROOT_UID))
+ seq_printf(m, ",uid=%u",
+ from_kuid_munged(&init_user_ns, opts->uid));
+ if (!gid_eq(opts->gid, GLOBAL_ROOT_GID))
+ seq_printf(m, ",gid=%u",
+ from_kgid_munged(&init_user_ns, opts->gid));
+ if (opts->mode != TRACEFS_DEFAULT_MODE)
+ seq_printf(m, ",mode=%o", opts->mode);
+
+ return 0;
+}
+
+static const struct super_operations tracefs_super_operations = {
+ .statfs = simple_statfs,
+ .remount_fs = tracefs_remount,
+ .show_options = tracefs_show_options,
+};
+
+static int trace_fill_super(struct super_block *sb, void *data, int silent)
+{
+ static struct tree_descr trace_files[] = {{""}};
+ struct tracefs_fs_info *fsi;
+ int err;
+
+ save_mount_options(sb, data);
+
+ fsi = kzalloc(sizeof(struct tracefs_fs_info), GFP_KERNEL);
+ sb->s_fs_info = fsi;
+ if (!fsi) {
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ err = tracefs_parse_options(data, &fsi->mount_opts);
+ if (err)
+ goto fail;
+
+ err = simple_fill_super(sb, TRACEFS_MAGIC, trace_files);
+ if (err)
+ goto fail;
+
+ sb->s_op = &tracefs_super_operations;
+
+ tracefs_apply_options(sb);
+
+ return 0;
+
+fail:
+ kfree(fsi);
+ sb->s_fs_info = NULL;
+ return err;
+}
+
+static struct dentry *trace_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name,
+ void *data)
+{
+ return mount_single(fs_type, flags, data, trace_fill_super);
+}
+
+static struct file_system_type trace_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "tracefs",
+ .mount = trace_mount,
+ .kill_sb = kill_litter_super,
+};
+MODULE_ALIAS_FS("tracefs");
+
+static struct dentry *start_creating(const char *name, struct dentry *parent)
+{
+ struct dentry *dentry;
+ int error;
+
+ pr_debug("tracefs: creating file '%s'\n",name);
+
+ error = simple_pin_fs(&trace_fs_type, &tracefs_mount,
+ &tracefs_mount_count);
+ if (error)
+ return ERR_PTR(error);
+
+ /* If the parent is not specified, we create it in the root.
+ * We need the root dentry to do this, which is in the super
+ * block. A pointer to that is in the struct vfsmount that we
+ * have around.
+ */
+ if (!parent)
+ parent = tracefs_mount->mnt_root;
+
+ mutex_lock(&parent->d_inode->i_mutex);
+ dentry = lookup_one_len(name, parent, strlen(name));
+ if (!IS_ERR(dentry) && dentry->d_inode) {
+ dput(dentry);
+ dentry = ERR_PTR(-EEXIST);
+ }
+
+ if (IS_ERR(dentry)) {
+ mutex_unlock(&parent->d_inode->i_mutex);
+ simple_release_fs(&tracefs_mount, &tracefs_mount_count);
+ }
+
+ return dentry;
+}
+
+static struct dentry *failed_creating(struct dentry *dentry)
+{
+ mutex_unlock(&dentry->d_parent->d_inode->i_mutex);
+ dput(dentry);
+ simple_release_fs(&tracefs_mount, &tracefs_mount_count);
+ return NULL;
+}
+
+static struct dentry *end_creating(struct dentry *dentry)
+{
+ mutex_unlock(&dentry->d_parent->d_inode->i_mutex);
+ return dentry;
+}
+
+/**
+ * tracefs_create_file - create a file in the tracefs filesystem
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have.
+ * @parent: a pointer to the parent dentry for this file. This should be a
+ * directory dentry if set. If this parameter is NULL, then the
+ * file will be created in the root of the tracefs filesystem.
+ * @data: a pointer to something that the caller will want to get to later
+ * on. The inode.i_private pointer will point to this value on
+ * the open() call.
+ * @fops: a pointer to a struct file_operations that should be used for
+ * this file.
+ *
+ * This is the basic "create a file" function for tracefs. It allows for a
+ * wide range of flexibility in creating a file, or a directory (if you want
+ * to create a directory, the tracefs_create_dir() function is
+ * recommended to be used instead.)
+ *
+ * This function will return a pointer to a dentry if it succeeds. This
+ * pointer must be passed to the tracefs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.) If an error occurs, %NULL will be returned.
+ *
+ * If tracefs is not enabled in the kernel, the value -%ENODEV will be
+ * returned.
+ */
+struct dentry *tracefs_create_file(const char *name, umode_t mode,
+ struct dentry *parent, void *data,
+ const struct file_operations *fops)
+{
+ struct dentry *dentry;
+ struct inode *inode;
+
+ if (!(mode & S_IFMT))
+ mode |= S_IFREG;
+ BUG_ON(!S_ISREG(mode));
+ dentry = start_creating(name, parent);
+
+ if (IS_ERR(dentry))
+ return NULL;
+
+ inode = tracefs_get_inode(dentry->d_sb);
+ if (unlikely(!inode))
+ return failed_creating(dentry);
+
+ inode->i_mode = mode;
+ inode->i_fop = fops ? fops : &tracefs_file_operations;
+ inode->i_private = data;
+ d_instantiate(dentry, inode);
+ fsnotify_create(dentry->d_parent->d_inode, dentry);
+ return end_creating(dentry);
+}
+
+static struct dentry *__create_dir(const char *name, struct dentry *parent,
+ const struct inode_operations *ops)
+{
+ struct dentry *dentry = start_creating(name, parent);
+ struct inode *inode;
+
+ if (IS_ERR(dentry))
+ return NULL;
+
+ inode = tracefs_get_inode(dentry->d_sb);
+ if (unlikely(!inode))
+ return failed_creating(dentry);
+
+ inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+ inode->i_op = ops;
+ inode->i_fop = &simple_dir_operations;
+
+ /* directory inodes start off with i_nlink == 2 (for "." entry) */
+ inc_nlink(inode);
+ d_instantiate(dentry, inode);
+ inc_nlink(dentry->d_parent->d_inode);
+ fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
+ return end_creating(dentry);
+}
+
+/**
+ * tracefs_create_dir - create a directory in the tracefs filesystem
+ * @name: a pointer to a string containing the name of the directory to
+ * create.
+ * @parent: a pointer to the parent dentry for this file. This should be a
+ * directory dentry if set. If this parameter is NULL, then the
+ * directory will be created in the root of the tracefs filesystem.
+ *
+ * This function creates a directory in tracefs with the given name.
+ *
+ * This function will return a pointer to a dentry if it succeeds. This
+ * pointer must be passed to the tracefs_remove() function when the file is
+ * to be removed. If an error occurs, %NULL will be returned.
+ *
+ * If tracing is not enabled in the kernel, the value -%ENODEV will be
+ * returned.
+ */
+struct dentry *tracefs_create_dir(const char *name, struct dentry *parent)
+{
+ return __create_dir(name, parent, &simple_dir_inode_operations);
+}
+
+/**
+ * tracefs_create_instance_dir - create the tracing instances directory
+ * @name: The name of the instances directory to create
+ * @parent: The parent directory that the instances directory will exist
+ * @mkdir: The function to call when a mkdir is performed.
+ * @rmdir: The function to call when a rmdir is performed.
+ *
+ * Only one instances directory is allowed.
+ *
+ * The instances directory is special as it allows for mkdir and rmdir to
+ * to be done by userspace. When a mkdir or rmdir is performed, the inode
+ * locks are released and the methhods passed in (@mkdir and @rmdir) are
+ * called without locks and with the name of the directory being created
+ * within the instances directory.
+ *
+ * Returns the dentry of the instances directory.
+ */
+struct dentry *tracefs_create_instance_dir(const char *name, struct dentry *parent,
+ int (*mkdir)(const char *name),
+ int (*rmdir)(const char *name))
+{
+ struct dentry *dentry;
+
+ /* Only allow one instance of the instances directory. */
+ if (WARN_ON(tracefs_ops.mkdir || tracefs_ops.rmdir))
+ return NULL;
+
+ dentry = __create_dir(name, parent, &tracefs_dir_inode_operations);
+ if (!dentry)
+ return NULL;
+
+ tracefs_ops.mkdir = mkdir;
+ tracefs_ops.rmdir = rmdir;
+
+ return dentry;
+}
+
+static inline int tracefs_positive(struct dentry *dentry)
+{
+ return dentry->d_inode && !d_unhashed(dentry);
+}
+
+static int __tracefs_remove(struct dentry *dentry, struct dentry *parent)
+{
+ int ret = 0;
+
+ if (tracefs_positive(dentry)) {
+ if (dentry->d_inode) {
+ dget(dentry);
+ switch (dentry->d_inode->i_mode & S_IFMT) {
+ case S_IFDIR:
+ ret = simple_rmdir(parent->d_inode, dentry);
+ break;
+ default:
+ simple_unlink(parent->d_inode, dentry);
+ break;
+ }
+ if (!ret)
+ d_delete(dentry);
+ dput(dentry);
+ }
+ }
+ return ret;
+}
+
+/**
+ * tracefs_remove - removes a file or directory from the tracefs filesystem
+ * @dentry: a pointer to a the dentry of the file or directory to be
+ * removed.
+ *
+ * This function removes a file or directory in tracefs that was previously
+ * created with a call to another tracefs function (like
+ * tracefs_create_file() or variants thereof.)
+ */
+void tracefs_remove(struct dentry *dentry)
+{
+ struct dentry *parent;
+ int ret;
+
+ if (IS_ERR_OR_NULL(dentry))
+ return;
+
+ parent = dentry->d_parent;
+ if (!parent || !parent->d_inode)
+ return;
+
+ mutex_lock(&parent->d_inode->i_mutex);
+ ret = __tracefs_remove(dentry, parent);
+ mutex_unlock(&parent->d_inode->i_mutex);
+ if (!ret)
+ simple_release_fs(&tracefs_mount, &tracefs_mount_count);
+}
+
+/**
+ * tracefs_remove_recursive - recursively removes a directory
+ * @dentry: a pointer to a the dentry of the directory to be removed.
+ *
+ * This function recursively removes a directory tree in tracefs that
+ * was previously created with a call to another tracefs function
+ * (like tracefs_create_file() or variants thereof.)
+ */
+void tracefs_remove_recursive(struct dentry *dentry)
+{
+ struct dentry *child, *parent;
+
+ if (IS_ERR_OR_NULL(dentry))
+ return;
+
+ parent = dentry->d_parent;
+ if (!parent || !parent->d_inode)
+ return;
+
+ parent = dentry;
+ down:
+ mutex_lock(&parent->d_inode->i_mutex);
+ loop:
+ /*
+ * The parent->d_subdirs is protected by the d_lock. Outside that
+ * lock, the child can be unlinked and set to be freed which can
+ * use the d_u.d_child as the rcu head and corrupt this list.
+ */
+ spin_lock(&parent->d_lock);
+ list_for_each_entry(child, &parent->d_subdirs, d_child) {
+ if (!tracefs_positive(child))
+ continue;
+
+ /* perhaps simple_empty(child) makes more sense */
+ if (!list_empty(&child->d_subdirs)) {
+ spin_unlock(&parent->d_lock);
+ mutex_unlock(&parent->d_inode->i_mutex);
+ parent = child;
+ goto down;
+ }
+
+ spin_unlock(&parent->d_lock);
+
+ if (!__tracefs_remove(child, parent))
+ simple_release_fs(&tracefs_mount, &tracefs_mount_count);
+
+ /*
+ * The parent->d_lock protects agaist child from unlinking
+ * from d_subdirs. When releasing the parent->d_lock we can
+ * no longer trust that the next pointer is valid.
+ * Restart the loop. We'll skip this one with the
+ * tracefs_positive() check.
+ */
+ goto loop;
+ }
+ spin_unlock(&parent->d_lock);
+
+ mutex_unlock(&parent->d_inode->i_mutex);
+ child = parent;
+ parent = parent->d_parent;
+ mutex_lock(&parent->d_inode->i_mutex);
+
+ if (child != dentry)
+ /* go up */
+ goto loop;
+
+ if (!__tracefs_remove(child, parent))
+ simple_release_fs(&tracefs_mount, &tracefs_mount_count);
+ mutex_unlock(&parent->d_inode->i_mutex);
+}
+
+/**
+ * tracefs_initialized - Tells whether tracefs has been registered
+ */
+bool tracefs_initialized(void)
+{
+ return tracefs_registered;
+}
+
+static struct kobject *trace_kobj;
+
+static int __init tracefs_init(void)
+{
+ int retval;
+
+ trace_kobj = kobject_create_and_add("tracing", kernel_kobj);
+ if (!trace_kobj)
+ return -EINVAL;
+
+ retval = register_filesystem(&trace_fs_type);
+ if (!retval)
+ tracefs_registered = true;
+
+ return retval;
+}
+core_initcall(tracefs_init);
diff --git a/fs/utimes.c b/fs/utimes.c
index aa138d64560a..dfb457546c60 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -97,14 +97,14 @@ static int utimes_common(struct path *path, struct timespec *times)
goto mnt_drop_write_and_out;
if (!inode_owner_or_capable(inode)) {
- error = inode_permission(inode, MAY_WRITE);
+ error = inode_permission2(path->mnt, inode, MAY_WRITE);
if (error)
goto mnt_drop_write_and_out;
}
}
retry_deleg:
mutex_lock(&inode->i_mutex);
- error = notify_change(path->dentry, &newattrs, &delegated_inode);
+ error = notify_change2(path->mnt, path->dentry, &newattrs, &delegated_inode);
mutex_unlock(&inode->i_mutex);
if (delegated_inode) {
error = break_deleg_wait(&delegated_inode);
diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h
index 14909b0b9cae..5148150cc80b 100644
--- a/include/asm-generic/memory_model.h
+++ b/include/asm-generic/memory_model.h
@@ -1,6 +1,8 @@
#ifndef __ASM_MEMORY_MODEL_H
#define __ASM_MEMORY_MODEL_H
+#include <linux/pfn.h>
+
#ifndef __ASSEMBLY__
#if defined(CONFIG_FLATMEM)
@@ -69,6 +71,12 @@
})
#endif /* CONFIG_FLATMEM/DISCONTIGMEM/SPARSEMEM */
+/*
+ * Convert a physical address to a Page Frame Number and back
+ */
+#define __phys_to_pfn(paddr) PHYS_PFN(paddr)
+#define __pfn_to_phys(pfn) PFN_PHYS(pfn)
+
#define page_to_pfn __page_to_pfn
#define pfn_to_page __pfn_to_page
diff --git a/include/asm-generic/seccomp.h b/include/asm-generic/seccomp.h
new file mode 100644
index 000000000000..9fa1f653ed3b
--- /dev/null
+++ b/include/asm-generic/seccomp.h
@@ -0,0 +1,30 @@
+/*
+ * include/asm-generic/seccomp.h
+ *
+ * Copyright (C) 2014 Linaro Limited
+ * Author: AKASHI Takahiro <takahiro.akashi@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef _ASM_GENERIC_SECCOMP_H
+#define _ASM_GENERIC_SECCOMP_H
+
+#include <linux/unistd.h>
+
+#if defined(CONFIG_COMPAT) && !defined(__NR_seccomp_read_32)
+#define __NR_seccomp_read_32 __NR_read
+#define __NR_seccomp_write_32 __NR_write
+#define __NR_seccomp_exit_32 __NR_exit
+#define __NR_seccomp_sigreturn_32 __NR_rt_sigreturn
+#endif /* CONFIG_COMPAT && ! already defined */
+
+#define __NR_seccomp_read __NR_read
+#define __NR_seccomp_write __NR_write
+#define __NR_seccomp_exit __NR_exit
+#ifndef __NR_seccomp_sigreturn
+#define __NR_seccomp_sigreturn __NR_rt_sigreturn
+#endif
+
+#endif /* _ASM_GENERIC_SECCOMP_H */
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index ac78910d7416..390eae19d9e0 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -227,6 +227,14 @@
*(.data..init_task)
/*
+ * Allow architectures to handle ro_after_init data on their
+ * own by defining an empty RO_AFTER_INIT_DATA.
+ */
+#ifndef RO_AFTER_INIT_DATA
+#define RO_AFTER_INIT_DATA *(.data..ro_after_init)
+#endif
+
+/*
* Read only Data
*/
#define RO_DATA_SECTION(align) \
@@ -234,6 +242,7 @@
.rodata : AT(ADDR(.rodata) - LOAD_OFFSET) { \
VMLINUX_SYMBOL(__start_rodata) = .; \
*(.rodata) *(.rodata.*) \
+ RO_AFTER_INIT_DATA /* Read only after init */ \
*(__vermagic) /* Kernel version magic */ \
. = ALIGN(8); \
VMLINUX_SYMBOL(__start___tracepoints_ptrs) = .; \
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
new file mode 100644
index 000000000000..a4608897a5f3
--- /dev/null
+++ b/include/linux/Kbuild
@@ -0,0 +1,2 @@
+header-y += if_pppolac.h
+header-y += if_pppopns.h
diff --git a/include/linux/amba/mmci.h b/include/linux/amba/mmci.h
index 8c98113069ce..8bfd21c13d59 100644
--- a/include/linux/amba/mmci.h
+++ b/include/linux/amba/mmci.h
@@ -5,6 +5,15 @@
#define AMBA_MMCI_H
#include <linux/mmc/host.h>
+#include <linux/mmc/card.h>
+#include <linux/mmc/sdio_func.h>
+
+struct embedded_sdio_data {
+ struct sdio_cis cis;
+ struct sdio_cccr cccr;
+ struct sdio_embedded_func *funcs;
+ int num_funcs;
+};
/**
* struct mmci_platform_data - platform configuration for the MMCI
@@ -31,6 +40,10 @@ struct mmci_platform_data {
int gpio_wp;
int gpio_cd;
bool cd_invert;
+ unsigned int status_irq;
+ struct embedded_sdio_data *embedded_sdio;
+ int (*register_status_notify)(void (*callback)(int card_present, void *dev_id), void *dev_id);
+
};
#endif
diff --git a/include/linux/android_aid.h b/include/linux/android_aid.h
new file mode 100644
index 000000000000..6f1fa1792dfc
--- /dev/null
+++ b/include/linux/android_aid.h
@@ -0,0 +1,28 @@
+/* include/linux/android_aid.h
+ *
+ * Copyright (C) 2008 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _LINUX_ANDROID_AID_H
+#define _LINUX_ANDROID_AID_H
+
+/* AIDs that the kernel treats differently */
+#define AID_OBSOLETE_000 KGIDT_INIT(3001) /* was NET_BT_ADMIN */
+#define AID_OBSOLETE_001 KGIDT_INIT(3002) /* was NET_BT */
+#define AID_INET KGIDT_INIT(3003)
+#define AID_NET_RAW KGIDT_INIT(3004)
+#define AID_NET_ADMIN KGIDT_INIT(3005)
+#define AID_NET_BW_STATS KGIDT_INIT(3006) /* read bandwidth statistics */
+#define AID_NET_BW_ACCT KGIDT_INIT(3007) /* change bandwidth statistics accounting */
+
+#endif
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 5b08a8540ecf..41ea776052be 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -2,6 +2,329 @@
#ifndef _LINUX_ATOMIC_H
#define _LINUX_ATOMIC_H
#include <asm/atomic.h>
+#include <asm/barrier.h>
+
+/*
+ * Relaxed variants of xchg, cmpxchg and some atomic operations.
+ *
+ * We support four variants:
+ *
+ * - Fully ordered: The default implementation, no suffix required.
+ * - Acquire: Provides ACQUIRE semantics, _acquire suffix.
+ * - Release: Provides RELEASE semantics, _release suffix.
+ * - Relaxed: No ordering guarantees, _relaxed suffix.
+ *
+ * For compound atomics performing both a load and a store, ACQUIRE
+ * semantics apply only to the load and RELEASE semantics only to the
+ * store portion of the operation. Note that a failed cmpxchg_acquire
+ * does -not- imply any memory ordering constraints.
+ *
+ * See Documentation/memory-barriers.txt for ACQUIRE/RELEASE definitions.
+ */
+
+#ifndef atomic_read_acquire
+#define atomic_read_acquire(v) smp_load_acquire(&(v)->counter)
+#endif
+
+#ifndef atomic_set_release
+#define atomic_set_release(v, i) smp_store_release(&(v)->counter, (i))
+#endif
+
+/*
+ * The idea here is to build acquire/release variants by adding explicit
+ * barriers on top of the relaxed variant. In the case where the relaxed
+ * variant is already fully ordered, no additional barriers are needed.
+ */
+#define __atomic_op_acquire(op, args...) \
+({ \
+ typeof(op##_relaxed(args)) __ret = op##_relaxed(args); \
+ smp_mb__after_atomic(); \
+ __ret; \
+})
+
+#define __atomic_op_release(op, args...) \
+({ \
+ smp_mb__before_atomic(); \
+ op##_relaxed(args); \
+})
+
+#define __atomic_op_fence(op, args...) \
+({ \
+ typeof(op##_relaxed(args)) __ret; \
+ smp_mb__before_atomic(); \
+ __ret = op##_relaxed(args); \
+ smp_mb__after_atomic(); \
+ __ret; \
+})
+
+/* atomic_add_return_relaxed */
+#ifndef atomic_add_return_relaxed
+#define atomic_add_return_relaxed atomic_add_return
+#define atomic_add_return_acquire atomic_add_return
+#define atomic_add_return_release atomic_add_return
+
+#else /* atomic_add_return_relaxed */
+
+#ifndef atomic_add_return_acquire
+#define atomic_add_return_acquire(...) \
+ __atomic_op_acquire(atomic_add_return, __VA_ARGS__)
+#endif
+
+#ifndef atomic_add_return_release
+#define atomic_add_return_release(...) \
+ __atomic_op_release(atomic_add_return, __VA_ARGS__)
+#endif
+
+#ifndef atomic_add_return
+#define atomic_add_return(...) \
+ __atomic_op_fence(atomic_add_return, __VA_ARGS__)
+#endif
+#endif /* atomic_add_return_relaxed */
+
+/* atomic_sub_return_relaxed */
+#ifndef atomic_sub_return_relaxed
+#define atomic_sub_return_relaxed atomic_sub_return
+#define atomic_sub_return_acquire atomic_sub_return
+#define atomic_sub_return_release atomic_sub_return
+
+#else /* atomic_sub_return_relaxed */
+
+#ifndef atomic_sub_return_acquire
+#define atomic_sub_return_acquire(...) \
+ __atomic_op_acquire(atomic_sub_return, __VA_ARGS__)
+#endif
+
+#ifndef atomic_sub_return_release
+#define atomic_sub_return_release(...) \
+ __atomic_op_release(atomic_sub_return, __VA_ARGS__)
+#endif
+
+#ifndef atomic_sub_return
+#define atomic_sub_return(...) \
+ __atomic_op_fence(atomic_sub_return, __VA_ARGS__)
+#endif
+#endif /* atomic_sub_return_relaxed */
+
+/* atomic_xchg_relaxed */
+#ifndef atomic_xchg_relaxed
+#define atomic_xchg_relaxed atomic_xchg
+#define atomic_xchg_acquire atomic_xchg
+#define atomic_xchg_release atomic_xchg
+
+#else /* atomic_xchg_relaxed */
+
+#ifndef atomic_xchg_acquire
+#define atomic_xchg_acquire(...) \
+ __atomic_op_acquire(atomic_xchg, __VA_ARGS__)
+#endif
+
+#ifndef atomic_xchg_release
+#define atomic_xchg_release(...) \
+ __atomic_op_release(atomic_xchg, __VA_ARGS__)
+#endif
+
+#ifndef atomic_xchg
+#define atomic_xchg(...) \
+ __atomic_op_fence(atomic_xchg, __VA_ARGS__)
+#endif
+#endif /* atomic_xchg_relaxed */
+
+/* atomic_cmpxchg_relaxed */
+#ifndef atomic_cmpxchg_relaxed
+#define atomic_cmpxchg_relaxed atomic_cmpxchg
+#define atomic_cmpxchg_acquire atomic_cmpxchg
+#define atomic_cmpxchg_release atomic_cmpxchg
+
+#else /* atomic_cmpxchg_relaxed */
+
+#ifndef atomic_cmpxchg_acquire
+#define atomic_cmpxchg_acquire(...) \
+ __atomic_op_acquire(atomic_cmpxchg, __VA_ARGS__)
+#endif
+
+#ifndef atomic_cmpxchg_release
+#define atomic_cmpxchg_release(...) \
+ __atomic_op_release(atomic_cmpxchg, __VA_ARGS__)
+#endif
+
+#ifndef atomic_cmpxchg
+#define atomic_cmpxchg(...) \
+ __atomic_op_fence(atomic_cmpxchg, __VA_ARGS__)
+#endif
+#endif /* atomic_cmpxchg_relaxed */
+
+#ifndef atomic64_read_acquire
+#define atomic64_read_acquire(v) smp_load_acquire(&(v)->counter)
+#endif
+
+#ifndef atomic64_set_release
+#define atomic64_set_release(v, i) smp_store_release(&(v)->counter, (i))
+#endif
+
+/* atomic64_add_return_relaxed */
+#ifndef atomic64_add_return_relaxed
+#define atomic64_add_return_relaxed atomic64_add_return
+#define atomic64_add_return_acquire atomic64_add_return
+#define atomic64_add_return_release atomic64_add_return
+
+#else /* atomic64_add_return_relaxed */
+
+#ifndef atomic64_add_return_acquire
+#define atomic64_add_return_acquire(...) \
+ __atomic_op_acquire(atomic64_add_return, __VA_ARGS__)
+#endif
+
+#ifndef atomic64_add_return_release
+#define atomic64_add_return_release(...) \
+ __atomic_op_release(atomic64_add_return, __VA_ARGS__)
+#endif
+
+#ifndef atomic64_add_return
+#define atomic64_add_return(...) \
+ __atomic_op_fence(atomic64_add_return, __VA_ARGS__)
+#endif
+#endif /* atomic64_add_return_relaxed */
+
+/* atomic64_sub_return_relaxed */
+#ifndef atomic64_sub_return_relaxed
+#define atomic64_sub_return_relaxed atomic64_sub_return
+#define atomic64_sub_return_acquire atomic64_sub_return
+#define atomic64_sub_return_release atomic64_sub_return
+
+#else /* atomic64_sub_return_relaxed */
+
+#ifndef atomic64_sub_return_acquire
+#define atomic64_sub_return_acquire(...) \
+ __atomic_op_acquire(atomic64_sub_return, __VA_ARGS__)
+#endif
+
+#ifndef atomic64_sub_return_release
+#define atomic64_sub_return_release(...) \
+ __atomic_op_release(atomic64_sub_return, __VA_ARGS__)
+#endif
+
+#ifndef atomic64_sub_return
+#define atomic64_sub_return(...) \
+ __atomic_op_fence(atomic64_sub_return, __VA_ARGS__)
+#endif
+#endif /* atomic64_sub_return_relaxed */
+
+/* atomic64_xchg_relaxed */
+#ifndef atomic64_xchg_relaxed
+#define atomic64_xchg_relaxed atomic64_xchg
+#define atomic64_xchg_acquire atomic64_xchg
+#define atomic64_xchg_release atomic64_xchg
+
+#else /* atomic64_xchg_relaxed */
+
+#ifndef atomic64_xchg_acquire
+#define atomic64_xchg_acquire(...) \
+ __atomic_op_acquire(atomic64_xchg, __VA_ARGS__)
+#endif
+
+#ifndef atomic64_xchg_release
+#define atomic64_xchg_release(...) \
+ __atomic_op_release(atomic64_xchg, __VA_ARGS__)
+#endif
+
+#ifndef atomic64_xchg
+#define atomic64_xchg(...) \
+ __atomic_op_fence(atomic64_xchg, __VA_ARGS__)
+#endif
+#endif /* atomic64_xchg_relaxed */
+
+/* atomic64_cmpxchg_relaxed */
+#ifndef atomic64_cmpxchg_relaxed
+#define atomic64_cmpxchg_relaxed atomic64_cmpxchg
+#define atomic64_cmpxchg_acquire atomic64_cmpxchg
+#define atomic64_cmpxchg_release atomic64_cmpxchg
+
+#else /* atomic64_cmpxchg_relaxed */
+
+#ifndef atomic64_cmpxchg_acquire
+#define atomic64_cmpxchg_acquire(...) \
+ __atomic_op_acquire(atomic64_cmpxchg, __VA_ARGS__)
+#endif
+
+#ifndef atomic64_cmpxchg_release
+#define atomic64_cmpxchg_release(...) \
+ __atomic_op_release(atomic64_cmpxchg, __VA_ARGS__)
+#endif
+
+#ifndef atomic64_cmpxchg
+#define atomic64_cmpxchg(...) \
+ __atomic_op_fence(atomic64_cmpxchg, __VA_ARGS__)
+#endif
+#endif /* atomic64_cmpxchg_relaxed */
+
+/* cmpxchg_relaxed */
+#ifndef cmpxchg_relaxed
+#define cmpxchg_relaxed cmpxchg
+#define cmpxchg_acquire cmpxchg
+#define cmpxchg_release cmpxchg
+
+#else /* cmpxchg_relaxed */
+
+#ifndef cmpxchg_acquire
+#define cmpxchg_acquire(...) \
+ __atomic_op_acquire(cmpxchg, __VA_ARGS__)
+#endif
+
+#ifndef cmpxchg_release
+#define cmpxchg_release(...) \
+ __atomic_op_release(cmpxchg, __VA_ARGS__)
+#endif
+
+#ifndef cmpxchg
+#define cmpxchg(...) \
+ __atomic_op_fence(cmpxchg, __VA_ARGS__)
+#endif
+#endif /* cmpxchg_relaxed */
+
+/* cmpxchg64_relaxed */
+#ifndef cmpxchg64_relaxed
+#define cmpxchg64_relaxed cmpxchg64
+#define cmpxchg64_acquire cmpxchg64
+#define cmpxchg64_release cmpxchg64
+
+#else /* cmpxchg64_relaxed */
+
+#ifndef cmpxchg64_acquire
+#define cmpxchg64_acquire(...) \
+ __atomic_op_acquire(cmpxchg64, __VA_ARGS__)
+#endif
+
+#ifndef cmpxchg64_release
+#define cmpxchg64_release(...) \
+ __atomic_op_release(cmpxchg64, __VA_ARGS__)
+#endif
+
+#ifndef cmpxchg64
+#define cmpxchg64(...) \
+ __atomic_op_fence(cmpxchg64, __VA_ARGS__)
+#endif
+#endif /* cmpxchg64_relaxed */
+
+/* xchg_relaxed */
+#ifndef xchg_relaxed
+#define xchg_relaxed xchg
+#define xchg_acquire xchg
+#define xchg_release xchg
+
+#else /* xchg_relaxed */
+
+#ifndef xchg_acquire
+#define xchg_acquire(...) __atomic_op_acquire(xchg, __VA_ARGS__)
+#endif
+
+#ifndef xchg_release
+#define xchg_release(...) __atomic_op_release(xchg, __VA_ARGS__)
+#endif
+
+#ifndef xchg
+#define xchg(...) __atomic_op_fence(xchg, __VA_ARGS__)
+#endif
+#endif /* xchg_relaxed */
/**
* atomic_add_unless - add unless the number is already a given value
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d94f4d0145a4..35370ef46364 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -209,6 +209,9 @@ struct request {
/* for bidi */
struct request *next_rq;
+
+ ktime_t lat_hist_io_start;
+ int lat_hist_enabled;
};
static inline unsigned short req_get_ioprio(struct request *req)
@@ -1622,6 +1625,79 @@ extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int,
extern int bdev_read_page(struct block_device *, sector_t, struct page *);
extern int bdev_write_page(struct block_device *, sector_t, struct page *,
struct writeback_control *);
+
+/*
+ * X-axis for IO latency histogram support.
+ */
+static const u_int64_t latency_x_axis_us[] = {
+ 100,
+ 200,
+ 300,
+ 400,
+ 500,
+ 600,
+ 700,
+ 800,
+ 900,
+ 1000,
+ 1200,
+ 1400,
+ 1600,
+ 1800,
+ 2000,
+ 2500,
+ 3000,
+ 4000,
+ 5000,
+ 6000,
+ 7000,
+ 9000,
+ 10000
+};
+
+#define BLK_IO_LAT_HIST_DISABLE 0
+#define BLK_IO_LAT_HIST_ENABLE 1
+#define BLK_IO_LAT_HIST_ZERO 2
+
+struct io_latency_state {
+ u_int64_t latency_y_axis_read[ARRAY_SIZE(latency_x_axis_us) + 1];
+ u_int64_t latency_reads_elems;
+ u_int64_t latency_y_axis_write[ARRAY_SIZE(latency_x_axis_us) + 1];
+ u_int64_t latency_writes_elems;
+};
+
+static inline void
+blk_update_latency_hist(struct io_latency_state *s,
+ int read,
+ u_int64_t delta_us)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(latency_x_axis_us); i++) {
+ if (delta_us < (u_int64_t)latency_x_axis_us[i]) {
+ if (read)
+ s->latency_y_axis_read[i]++;
+ else
+ s->latency_y_axis_write[i]++;
+ break;
+ }
+ }
+ if (i == ARRAY_SIZE(latency_x_axis_us)) {
+ /* Overflowed the histogram */
+ if (read)
+ s->latency_y_axis_read[i]++;
+ else
+ s->latency_y_axis_write[i]++;
+ }
+ if (read)
+ s->latency_reads_elems++;
+ else
+ s->latency_writes_elems++;
+}
+
+void blk_zero_latency_hist(struct io_latency_state *s);
+ssize_t blk_latency_hist_show(struct io_latency_state *s, char *buf);
+
#else /* CONFIG_BLOCK */
struct block_device;
diff --git a/include/linux/cache.h b/include/linux/cache.h
index 17e7e82d2aa7..1be04f8c563a 100644
--- a/include/linux/cache.h
+++ b/include/linux/cache.h
@@ -12,10 +12,24 @@
#define SMP_CACHE_BYTES L1_CACHE_BYTES
#endif
+/*
+ * __read_mostly is used to keep rarely changing variables out of frequently
+ * updated cachelines. If an architecture doesn't support it, ignore the
+ * hint.
+ */
#ifndef __read_mostly
#define __read_mostly
#endif
+/*
+ * __ro_after_init is used to mark things that are read-only after init (i.e.
+ * after mark_rodata_ro() has been called). These are effectively read-only,
+ * but may get written to during init, so can't live in .rodata (via "const").
+ */
+#ifndef __ro_after_init
+#define __ro_after_init __attribute__((__section__(".data..ro_after_init")))
+#endif
+
#ifndef ____cacheline_aligned
#define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES)))
#endif
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 98c4f9b12b03..23befa049e65 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -15,6 +15,14 @@ SUBSYS(cpu)
SUBSYS(cpuacct)
#endif
+#if IS_ENABLED(CONFIG_CGROUP_SCHEDTUNE)
+SUBSYS(schedtune)
+#endif
+
+#if IS_ENABLED(CONFIG_BLK_CGROUP)
+SUBSYS(blkio)
+#endif
+
#if IS_ENABLED(CONFIG_MEMCG)
SUBSYS(memory)
#endif
@@ -31,10 +39,6 @@ SUBSYS(freezer)
SUBSYS(net_cls)
#endif
-#if IS_ENABLED(CONFIG_BLK_CGROUP)
-SUBSYS(blkio)
-#endif
-
#if IS_ENABLED(CONFIG_CGROUP_PERF)
SUBSYS(perf_event)
#endif
diff --git a/include/linux/clk-private.h b/include/linux/clk-private.h
index 0ca5f6046920..228a98724b1e 100644
--- a/include/linux/clk-private.h
+++ b/include/linux/clk-private.h
@@ -13,7 +13,9 @@
#include <linux/clk-provider.h>
#include <linux/kref.h>
+#include <linux/ktime.h>
#include <linux/list.h>
+#include <linux/rbtree.h>
/*
* WARNING: Do not include clk-private.h from any file that implements struct
@@ -28,6 +30,14 @@
struct module;
+#ifdef CONFIG_COMMON_CLK_FREQ_STATS_ACCOUNTING
+struct freq_stats {
+ ktime_t time_spent;
+ unsigned long rate;
+ struct rb_node node;
+};
+#endif /*CONFIG_COMMON_CLK_FREQ_STATS_ACCOUNTING*/
+
struct clk {
const char *name;
const struct clk_ops *ops;
@@ -53,6 +63,13 @@ struct clk {
unsigned int notifier_count;
#ifdef CONFIG_DEBUG_FS
struct dentry *dentry;
+#ifdef CONFIG_COMMON_CLK_FREQ_STATS_ACCOUNTING
+ struct rb_root freq_stats_table;
+ struct freq_stats *current_freq_stats;
+ ktime_t default_freq_time;
+ ktime_t start_time;
+#endif /* CONFIG_COMMON_CLK_FREQ_STATS_ACCOUNTING*/
+
#endif
struct kref ref;
};
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index b2d9a43012b2..10d981a245d3 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -267,4 +267,11 @@ void arch_cpu_idle_enter(void);
void arch_cpu_idle_exit(void);
void arch_cpu_idle_dead(void);
+#define IDLE_START 1
+#define IDLE_END 2
+
+void idle_notifier_register(struct notifier_block *n);
+void idle_notifier_unregister(struct notifier_block *n);
+void idle_notifier_call_chain(unsigned long val);
+
#endif /* _LINUX_CPU_H_ */
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 6734c9150ebb..6bd477165fde 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -18,6 +18,8 @@
#include <linux/notifier.h>
#include <linux/spinlock.h>
#include <linux/sysfs.h>
+#include <asm/cputime.h>
+
/*********************************************************************
* CPUFREQ INTERFACE *
@@ -156,6 +158,7 @@ u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy);
int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu);
int cpufreq_update_policy(unsigned int cpu);
bool have_governor_per_policy(void);
+bool cpufreq_driver_is_slow(void);
struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy);
#else
static inline unsigned int cpufreq_get(unsigned int cpu)
@@ -313,6 +316,14 @@ struct cpufreq_driver {
*/
#define CPUFREQ_NEED_INITIAL_FREQ_CHECK (1 << 5)
+/*
+ * Indicates that it is safe to call cpufreq_driver_target from
+ * non-interruptable context in scheduler hot paths. Drivers must
+ * opt-in to this flag, as the safe default is that they might sleep
+ * or be too slow for hot path use.
+ */
+#define CPUFREQ_DRIVER_FAST (1 << 6)
+
int cpufreq_register_driver(struct cpufreq_driver *driver_data);
int cpufreq_unregister_driver(struct cpufreq_driver *driver_data);
@@ -485,6 +496,12 @@ extern struct cpufreq_governor cpufreq_gov_ondemand;
#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE)
extern struct cpufreq_governor cpufreq_gov_conservative;
#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_conservative)
+#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_INTERACTIVE)
+extern struct cpufreq_governor cpufreq_gov_interactive;
+#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_interactive)
+#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED)
+extern struct cpufreq_governor cpufreq_gov_sched;
+#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_sched)
#endif
/*********************************************************************
@@ -614,4 +631,14 @@ unsigned int cpufreq_generic_get(unsigned int cpu);
int cpufreq_generic_init(struct cpufreq_policy *policy,
struct cpufreq_frequency_table *table,
unsigned int transition_latency);
+
+/*********************************************************************
+ * CPUFREQ STATS *
+ *********************************************************************/
+
+void acct_update_power(struct task_struct *p, cputime_t cputime);
+
+struct sched_domain;
+unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu);
+unsigned long cpufreq_scale_max_freq_capacity(int cpu);
#endif /* _LINUX_CPUFREQ_H */
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 575b7166cb08..90851a969d4b 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -143,7 +143,7 @@ extern void cpuidle_resume(void);
extern int cpuidle_enable_device(struct cpuidle_device *dev);
extern void cpuidle_disable_device(struct cpuidle_device *dev);
extern int cpuidle_play_dead(void);
-extern void cpuidle_use_deepest_state(bool enable);
+extern void cpuidle_enter_freeze(void);
extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev);
#else
@@ -176,11 +176,14 @@ static inline int cpuidle_enable_device(struct cpuidle_device *dev)
{return -ENODEV; }
static inline void cpuidle_disable_device(struct cpuidle_device *dev) { }
static inline int cpuidle_play_dead(void) {return -ENODEV; }
-static inline void cpuidle_use_deepest_state(bool enable) {}
+static inline void cpuidle_enter_freeze(void) { }
static inline struct cpuidle_driver *cpuidle_get_cpu_driver(
struct cpuidle_device *dev) {return NULL; }
#endif
+/* kernel/sched/idle.c */
+extern void sched_idle_set_state(struct cpuidle_state *idle_state, int index);
+
#ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED
void cpuidle_coupled_parallel_barrier(struct cpuidle_device *dev, atomic_t *a);
#else
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index bb1e42ce626e..c73c9a57afc2 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -22,6 +22,14 @@ typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t;
*/
#define cpumask_bits(maskp) ((maskp)->bits)
+/**
+ * cpumask_pr_args - printf args to output a cpumask
+ * @maskp: cpumask to be printed
+ *
+ * Can be used to provide arguments for '%*pb[l]' when printing a cpumask.
+ */
+#define cpumask_pr_args(maskp) nr_cpu_ids, cpumask_bits(maskp)
+
#if NR_CPUS == 1
#define nr_cpu_ids 1
#else
diff --git a/include/linux/cred.h b/include/linux/cred.h
index f3a1ae59f9e8..46432b076f2f 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -122,6 +122,7 @@ struct cred {
kernel_cap_t cap_permitted; /* caps we're permitted */
kernel_cap_t cap_effective; /* caps we can actually use */
kernel_cap_t cap_bset; /* capability bounding set */
+ kernel_cap_t cap_ambient; /* Ambient capability set */
#ifdef CONFIG_KEYS
unsigned char jit_keyring; /* default keyring to attach requested
* keys to */
@@ -197,6 +198,13 @@ static inline void validate_process_creds(void)
}
#endif
+static inline bool cap_ambient_invariant_ok(const struct cred *cred)
+{
+ return cap_issubset(cred->cap_ambient,
+ cap_intersect(cred->cap_permitted,
+ cred->cap_inheritable));
+}
+
/**
* get_new_cred - Get a reference on a new set of credentials
* @cred: The new credentials to reference
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 340ee0dae93b..a86c2816f5e7 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -161,6 +161,7 @@ struct dentry_operations {
struct vfsmount *(*d_automount)(struct path *);
int (*d_manage)(struct dentry *, bool);
struct inode *(*d_select_inode)(struct dentry *, unsigned);
+ void (*d_canonical_path)(const struct path *, struct path *);
} ____cacheline_aligned;
/*
@@ -225,6 +226,8 @@ struct dentry_operations {
#define DCACHE_MAY_FREE 0x00800000
#define DCACHE_OP_SELECT_INODE 0x02000000 /* Unioned entry: dcache op selects inode */
+#define DCACHE_ENCRYPTED_WITH_KEY 0x04000000 /* dir is encrypted with a valid key */
+
extern seqlock_t rename_lock;
/*
@@ -232,7 +235,7 @@ extern seqlock_t rename_lock;
*/
extern void d_instantiate(struct dentry *, struct inode *);
extern struct dentry * d_instantiate_unique(struct dentry *, struct inode *);
-extern struct dentry * d_materialise_unique(struct dentry *, struct inode *);
+#define d_materialise_unique(d, i) d_splice_alias(i, d)
extern int d_instantiate_no_diralias(struct dentry *, struct inode *);
extern void __d_drop(struct dentry *dentry);
extern void d_drop(struct dentry *dentry);
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index 4d0b4d1aa132..06822af7db76 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -55,6 +55,11 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent);
struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
const char *dest);
+struct dentry *debugfs_create_automount(const char *name,
+ struct dentry *parent,
+ struct vfsmount *(*f)(void *),
+ void *data);
+
void debugfs_remove(struct dentry *dentry);
void debugfs_remove_recursive(struct dentry *dentry);
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index c51706b7a36e..7b836e5e5eb7 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -376,6 +376,12 @@ void dm_set_mdptr(struct mapped_device *md, void *ptr);
void *dm_get_mdptr(struct mapped_device *md);
/*
+ * Export the device via the ioctl interface (uses mdptr).
+ */
+int dm_ioctl_export(struct mapped_device *md, const char *name,
+ const char *uuid);
+
+/*
* A device can still be used while suspended, but I/O is deferred.
*/
int dm_suspend(struct mapped_device *md, unsigned suspend_flags);
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 777c57596863..0e20fd154d9d 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -547,6 +547,9 @@ void efi_native_runtime_setup(void);
#define SMBIOS_TABLE_GUID \
EFI_GUID( 0xeb9d2d31, 0x2d88, 0x11d3, 0x9a, 0x16, 0x0, 0x90, 0x27, 0x3f, 0xc1, 0x4d )
+#define SMBIOS3_TABLE_GUID \
+ EFI_GUID( 0xf2fd1544, 0x9794, 0x4a2c, 0x99, 0x2e, 0xe5, 0xbb, 0xcf, 0x20, 0xe3, 0x94 )
+
#define SAL_SYSTEM_TABLE_GUID \
EFI_GUID( 0xeb9d2d32, 0x2d88, 0x11d3, 0x9a, 0x16, 0x0, 0x90, 0x27, 0x3f, 0xc1, 0x4d )
@@ -810,7 +813,8 @@ extern struct efi {
unsigned long mps; /* MPS table */
unsigned long acpi; /* ACPI table (IA64 ext 0.71) */
unsigned long acpi20; /* ACPI table (ACPI 2.0) */
- unsigned long smbios; /* SM BIOS table */
+ unsigned long smbios; /* SMBIOS table (32 bit entry point) */
+ unsigned long smbios3; /* SMBIOS table (64 bit entry point) */
unsigned long sal_systab; /* SAL system table */
unsigned long boot_info; /* boot info table */
unsigned long hcdp; /* HCDP table */
@@ -871,6 +875,8 @@ static inline efi_status_t efi_query_variable_store(u32 attributes, unsigned lon
#endif
extern void __iomem *efi_lookup_mapped_addr(u64 phys_addr);
extern int efi_config_init(efi_config_table_type_t *arch_tables);
+extern int efi_config_parse_tables(void *config_tables, int count, int sz,
+ efi_config_table_type_t *arch_tables);
extern u64 efi_get_iobase (void);
extern u32 efi_mem_type (unsigned long phys_addr);
extern u64 efi_mem_attributes (unsigned long phys_addr);
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index 2df22eab98e3..67303b808ff1 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -19,12 +19,16 @@
#define F2FS_MAX_LOG_SECTOR_SIZE 12 /* 12 bits for 4096 bytes */
#define F2FS_LOG_SECTORS_PER_BLOCK 3 /* log number for sector/blk */
#define F2FS_BLKSIZE 4096 /* support only 4KB block */
+#define F2FS_BLKSIZE_BITS 12 /* bits for F2FS_BLKSIZE */
#define F2FS_MAX_EXTENSION 64 /* # of extension entries */
-#define F2FS_BLK_ALIGN(x) (((x) + F2FS_BLKSIZE - 1) / F2FS_BLKSIZE)
+#define F2FS_BLK_ALIGN(x) (((x) + F2FS_BLKSIZE - 1) >> F2FS_BLKSIZE_BITS)
#define NULL_ADDR ((block_t)0) /* used as block_t addresses */
#define NEW_ADDR ((block_t)-1) /* used as block_t addresses */
+#define F2FS_BYTES_TO_BLK(bytes) ((bytes) >> F2FS_BLKSIZE_BITS)
+#define F2FS_BLK_TO_BYTES(blk) ((blk) << F2FS_BLKSIZE_BITS)
+
/* 0, 1(node nid), 2(meta nid) are reserved node id */
#define F2FS_RESERVED_NODE_NUM 3
@@ -32,8 +36,15 @@
#define F2FS_NODE_INO(sbi) (sbi->node_ino_num)
#define F2FS_META_INO(sbi) (sbi->meta_ino_num)
+#define F2FS_IO_SIZE(sbi) (1 << (sbi)->write_io_size_bits) /* Blocks */
+#define F2FS_IO_SIZE_KB(sbi) (1 << ((sbi)->write_io_size_bits + 2)) /* KB */
+#define F2FS_IO_SIZE_BYTES(sbi) (1 << ((sbi)->write_io_size_bits + 12)) /* B */
+#define F2FS_IO_SIZE_BITS(sbi) ((sbi)->write_io_size_bits) /* power of 2 */
+#define F2FS_IO_SIZE_MASK(sbi) (F2FS_IO_SIZE(sbi) - 1)
+
/* This flag is used by node and meta inodes, and by recovery */
-#define GFP_F2FS_ZERO (GFP_NOFS | __GFP_ZERO)
+#define GFP_F2FS_ZERO (GFP_NOFS | __GFP_ZERO)
+#define GFP_F2FS_HIGH_ZERO (GFP_NOFS | __GFP_ZERO | __GFP_HIGHMEM)
/*
* For further optimization on multi-head logs, on-disk layout supports maximum
@@ -45,9 +56,19 @@
#define MAX_ACTIVE_NODE_LOGS 8
#define MAX_ACTIVE_DATA_LOGS 8
+#define VERSION_LEN 256
+#define MAX_VOLUME_NAME 512
+#define MAX_PATH_LEN 64
+#define MAX_DEVICES 8
+
/*
* For superblock
*/
+struct f2fs_device {
+ __u8 path[MAX_PATH_LEN];
+ __le32 total_segments;
+} __packed;
+
struct f2fs_super_block {
__le32 magic; /* Magic Number */
__le16 major_ver; /* Major Version */
@@ -77,15 +98,26 @@ struct f2fs_super_block {
__le32 node_ino; /* node inode number */
__le32 meta_ino; /* meta inode number */
__u8 uuid[16]; /* 128-bit uuid for volume */
- __le16 volume_name[512]; /* volume name */
+ __le16 volume_name[MAX_VOLUME_NAME]; /* volume name */
__le32 extension_count; /* # of extensions below */
__u8 extension_list[F2FS_MAX_EXTENSION][8]; /* extension array */
__le32 cp_payload;
+ __u8 version[VERSION_LEN]; /* the kernel version */
+ __u8 init_version[VERSION_LEN]; /* the initial kernel version */
+ __le32 feature; /* defined features */
+ __u8 encryption_level; /* versioning level for encryption */
+ __u8 encrypt_pw_salt[16]; /* Salt used for string2key algorithm */
+ struct f2fs_device devs[MAX_DEVICES]; /* device list */
+ __u8 reserved[327]; /* valid reserved region */
} __packed;
/*
* For checkpoint
*/
+#define CP_TRIMMED_FLAG 0x00000100
+#define CP_NAT_BITS_FLAG 0x00000080
+#define CP_CRC_RECOVERY_FLAG 0x00000040
+#define CP_FASTBOOT_FLAG 0x00000020
#define CP_FSCK_FLAG 0x00000010
#define CP_ERROR_FLAG 0x00000008
#define CP_COMPACT_SUM_FLAG 0x00000004
@@ -147,7 +179,7 @@ struct f2fs_orphan_block {
*/
struct f2fs_extent {
__le32 fofs; /* start file offset of the extent */
- __le32 blk_addr; /* start block address of the extent */
+ __le32 blk; /* start block address of the extent */
__le32 len; /* lengh of the extent */
} __packed;
@@ -155,12 +187,12 @@ struct f2fs_extent {
#define F2FS_INLINE_XATTR_ADDRS 50 /* 200 bytes for inline xattrs */
#define DEF_ADDRS_PER_INODE 923 /* Address Pointers in an Inode */
#define DEF_NIDS_PER_INODE 5 /* Node IDs in an Inode */
-#define ADDRS_PER_INODE(fi) addrs_per_inode(fi)
+#define ADDRS_PER_INODE(inode) addrs_per_inode(inode)
#define ADDRS_PER_BLOCK 1018 /* Address Pointers in a Direct Block */
#define NIDS_PER_BLOCK 1018 /* Node IDs in an Indirect Block */
-#define ADDRS_PER_PAGE(page, fi) \
- (IS_INODE(page) ? ADDRS_PER_INODE(fi) : ADDRS_PER_BLOCK)
+#define ADDRS_PER_PAGE(page, inode) \
+ (IS_INODE(page) ? ADDRS_PER_INODE(inode) : ADDRS_PER_BLOCK)
#define NODE_DIR1_BLOCK (DEF_ADDRS_PER_INODE + 1)
#define NODE_DIR2_BLOCK (DEF_ADDRS_PER_INODE + 2)
@@ -170,14 +202,13 @@ struct f2fs_extent {
#define F2FS_INLINE_XATTR 0x01 /* file inline xattr flag */
#define F2FS_INLINE_DATA 0x02 /* file inline data flag */
+#define F2FS_INLINE_DENTRY 0x04 /* file inline dentry flag */
+#define F2FS_DATA_EXIST 0x08 /* file inline data exist flag */
+#define F2FS_INLINE_DOTS 0x10 /* file having implicit dot dentries */
#define MAX_INLINE_DATA (sizeof(__le32) * (DEF_ADDRS_PER_INODE - \
F2FS_INLINE_XATTR_ADDRS - 1))
-#define INLINE_DATA_OFFSET (PAGE_CACHE_SIZE - sizeof(struct node_footer) -\
- sizeof(__le32) * (DEF_ADDRS_PER_INODE + \
- DEF_NIDS_PER_INODE - 1))
-
struct f2fs_inode {
__le16 i_mode; /* file mode */
__u8 i_advise; /* file hints */
@@ -225,6 +256,8 @@ enum {
OFFSET_BIT_SHIFT
};
+#define OFFSET_BIT_MASK (0x07) /* (0x01 << OFFSET_BIT_SHIFT) - 1 */
+
struct node_footer {
__le32 nid; /* node id */
__le32 ino; /* inode nunmber */
@@ -246,7 +279,8 @@ struct f2fs_node {
/*
* For NAT entries
*/
-#define NAT_ENTRY_PER_BLOCK (PAGE_CACHE_SIZE / sizeof(struct f2fs_nat_entry))
+#define NAT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_nat_entry))
+#define NAT_ENTRY_BITMAP_SIZE ((NAT_ENTRY_PER_BLOCK + 7) / 8)
struct f2fs_nat_entry {
__u8 version; /* latest version of cached nat entry */
@@ -335,7 +369,7 @@ struct f2fs_summary {
struct summary_footer {
unsigned char entry_type; /* SUM_TYPE_XXX */
- __u32 check_sum; /* summary checksum */
+ __le32 check_sum; /* summary checksum */
} __packed;
#define SUM_JOURNAL_SIZE (F2FS_BLKSIZE - SUM_FOOTER_SIZE -\
@@ -348,6 +382,12 @@ struct summary_footer {
sizeof(struct sit_journal_entry))
#define SIT_JOURNAL_RESERVED ((SUM_JOURNAL_SIZE - 2) %\
sizeof(struct sit_journal_entry))
+
+/* Reserved area should make size of f2fs_extra_info equals to
+ * that of nat_journal and sit_journal.
+ */
+#define EXTRA_INFO_RESERVED (SUM_JOURNAL_SIZE - 2 - 8)
+
/*
* frequently updated NAT/SIT entries can be stored in the spare area in
* summary blocks
@@ -377,18 +417,28 @@ struct sit_journal {
__u8 reserved[SIT_JOURNAL_RESERVED];
} __packed;
-/* 4KB-sized summary block structure */
-struct f2fs_summary_block {
- struct f2fs_summary entries[ENTRIES_IN_SUM];
+struct f2fs_extra_info {
+ __le64 kbytes_written;
+ __u8 reserved[EXTRA_INFO_RESERVED];
+} __packed;
+
+struct f2fs_journal {
union {
__le16 n_nats;
__le16 n_sits;
};
- /* spare area is used by NAT or SIT journals */
+ /* spare area is used by NAT or SIT journals or extra info */
union {
struct nat_journal nat_j;
struct sit_journal sit_j;
+ struct f2fs_extra_info info;
};
+} __packed;
+
+/* 4KB-sized summary block structure */
+struct f2fs_summary_block {
+ struct f2fs_summary entries[ENTRIES_IN_SUM];
+ struct f2fs_journal journal;
struct summary_footer footer;
} __packed;
@@ -408,15 +458,25 @@ typedef __le32 f2fs_hash_t;
#define GET_DENTRY_SLOTS(x) ((x + F2FS_SLOT_LEN - 1) >> F2FS_SLOT_LEN_BITS)
-/* the number of dentry in a block */
-#define NR_DENTRY_IN_BLOCK 214
-
/* MAX level for dir lookup */
#define MAX_DIR_HASH_DEPTH 63
/* MAX buckets in one level of dir */
#define MAX_DIR_BUCKETS (1 << ((MAX_DIR_HASH_DEPTH / 2) - 1))
+/*
+ * space utilization of regular dentry and inline dentry
+ * regular dentry inline dentry
+ * bitmap 1 * 27 = 27 1 * 23 = 23
+ * reserved 1 * 3 = 3 1 * 7 = 7
+ * dentry 11 * 214 = 2354 11 * 182 = 2002
+ * filename 8 * 214 = 1712 8 * 182 = 1456
+ * total 4096 3488
+ *
+ * Note: there are more reserved space in inline dentry than in regular
+ * dentry, when converting inline dentry we should handle this carefully.
+ */
+#define NR_DENTRY_IN_BLOCK 214 /* the number of dentry in a block */
#define SIZE_OF_DIR_ENTRY 11 /* by byte */
#define SIZE_OF_DENTRY_BITMAP ((NR_DENTRY_IN_BLOCK + BITS_PER_BYTE - 1) / \
BITS_PER_BYTE)
@@ -441,6 +501,24 @@ struct f2fs_dentry_block {
__u8 filename[NR_DENTRY_IN_BLOCK][F2FS_SLOT_LEN];
} __packed;
+/* for inline dir */
+#define NR_INLINE_DENTRY (MAX_INLINE_DATA * BITS_PER_BYTE / \
+ ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \
+ BITS_PER_BYTE + 1))
+#define INLINE_DENTRY_BITMAP_SIZE ((NR_INLINE_DENTRY + \
+ BITS_PER_BYTE - 1) / BITS_PER_BYTE)
+#define INLINE_RESERVED_SIZE (MAX_INLINE_DATA - \
+ ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \
+ NR_INLINE_DENTRY + INLINE_DENTRY_BITMAP_SIZE))
+
+/* inline directory entry structure */
+struct f2fs_inline_dentry {
+ __u8 dentry_bitmap[INLINE_DENTRY_BITMAP_SIZE];
+ __u8 reserved[INLINE_RESERVED_SIZE];
+ struct f2fs_dir_entry dentry[NR_INLINE_DENTRY];
+ __u8 filename[NR_INLINE_DENTRY][F2FS_SLOT_LEN];
+} __packed;
+
/* file types used in inode_info->flags */
enum {
F2FS_FT_UNKNOWN,
@@ -454,4 +532,6 @@ enum {
F2FS_FT_MAX
};
+#define S_SHIFT 12
+
#endif /* _LINUX_F2FS_FS_H */
diff --git a/include/linux/fence.h b/include/linux/fence.h
index d174585b874b..1c94759e7148 100644
--- a/include/linux/fence.h
+++ b/include/linux/fence.h
@@ -107,6 +107,7 @@ struct fence_cb {
* @get_driver_name: returns the driver name.
* @get_timeline_name: return the name of the context this fence belongs to.
* @enable_signaling: enable software signaling of fence.
+ * @disable_signaling: disable software signaling of fence (optional).
* @signaled: [optional] peek whether the fence is signaled, can be null.
* @wait: custom wait implementation, or fence_default_wait.
* @release: [optional] called on destruction of fence, can be null
@@ -166,6 +167,7 @@ struct fence_ops {
const char * (*get_driver_name)(struct fence *fence);
const char * (*get_timeline_name)(struct fence *fence);
bool (*enable_signaling)(struct fence *fence);
+ void (*disable_signaling)(struct fence *fence);
bool (*signaled)(struct fence *fence);
signed long (*wait)(struct fence *fence, bool intr, signed long timeout);
void (*release)(struct fence *fence);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2a41353033d3..ab1e4b339285 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -49,6 +49,8 @@ struct swap_info_struct;
struct seq_file;
struct workqueue_struct;
struct iov_iter;
+struct fscrypt_info;
+struct fscrypt_operations;
extern void __init inode_init(void);
extern void __init inode_init_early(void);
@@ -623,6 +625,10 @@ struct inode {
struct hlist_head i_fsnotify_marks;
#endif
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+ struct fscrypt_info *i_crypt_info;
+#endif
+
void *i_private; /* fs or device private pointer */
};
@@ -1217,6 +1223,9 @@ struct super_block {
const struct xattr_handler **s_xattr;
struct list_head s_inodes; /* all inodes */
+
+ const struct fscrypt_operations *s_cop;
+
struct hlist_bl_head s_anon; /* anonymous dentries for (nfs) exporting */
struct list_head s_mounts; /* list of mounts; _not_ for fs use */
struct block_device *s_bdev;
@@ -1411,13 +1420,21 @@ extern bool inode_owner_or_capable(const struct inode *inode);
* VFS helper functions..
*/
extern int vfs_create(struct inode *, struct dentry *, umode_t, bool);
+extern int vfs_create2(struct vfsmount *, struct inode *, struct dentry *, umode_t, bool);
extern int vfs_mkdir(struct inode *, struct dentry *, umode_t);
+extern int vfs_mkdir2(struct vfsmount *, struct inode *, struct dentry *, umode_t);
extern int vfs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
+extern int vfs_mknod2(struct vfsmount *, struct inode *, struct dentry *, umode_t, dev_t);
extern int vfs_symlink(struct inode *, struct dentry *, const char *);
+extern int vfs_symlink2(struct vfsmount *, struct inode *, struct dentry *, const char *);
extern int vfs_link(struct dentry *, struct inode *, struct dentry *, struct inode **);
+extern int vfs_link2(struct vfsmount *, struct dentry *, struct inode *, struct dentry *, struct inode **);
extern int vfs_rmdir(struct inode *, struct dentry *);
+extern int vfs_rmdir2(struct vfsmount *, struct inode *, struct dentry *);
extern int vfs_unlink(struct inode *, struct dentry *, struct inode **);
+extern int vfs_unlink2(struct vfsmount *, struct inode *, struct dentry *, struct inode **);
extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int);
+extern int vfs_rename2(struct vfsmount *, struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int);
extern int vfs_whiteout(struct inode *, struct dentry *);
/*
@@ -1519,6 +1536,7 @@ struct inode_operations {
struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
void * (*follow_link) (struct dentry *, struct nameidata *);
int (*permission) (struct inode *, int);
+ int (*permission2) (struct vfsmount *, struct inode *, int);
struct posix_acl * (*get_acl)(struct inode *, int);
int (*readlink) (struct dentry *, char __user *,int);
@@ -1536,6 +1554,7 @@ struct inode_operations {
int (*rename2) (struct inode *, struct dentry *,
struct inode *, struct dentry *, unsigned int);
int (*setattr) (struct dentry *, struct iattr *);
+ int (*setattr2) (struct vfsmount *, struct dentry *, struct iattr *);
int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *);
int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
@@ -1579,9 +1598,13 @@ struct super_operations {
int (*unfreeze_fs) (struct super_block *);
int (*statfs) (struct dentry *, struct kstatfs *);
int (*remount_fs) (struct super_block *, int *, char *);
+ int (*remount_fs2) (struct vfsmount *, struct super_block *, int *, char *);
+ void *(*clone_mnt_data) (void *);
+ void (*copy_mnt_data) (void *, void *);
void (*umount_begin) (struct super_block *);
int (*show_options)(struct seq_file *, struct dentry *);
+ int (*show_options2)(struct vfsmount *,struct seq_file *, struct dentry *);
int (*show_devname)(struct seq_file *, struct dentry *);
int (*show_path)(struct seq_file *, struct dentry *);
int (*show_stats)(struct seq_file *, struct dentry *);
@@ -1793,6 +1816,9 @@ struct file_system_type {
#define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */
struct dentry *(*mount) (struct file_system_type *, int,
const char *, void *);
+ struct dentry *(*mount2) (struct vfsmount *, struct file_system_type *, int,
+ const char *, void *);
+ void *(*alloc_mnt_data) (void);
void (*kill_sb) (struct super_block *);
struct module *owner;
struct file_system_type * next;
@@ -2060,6 +2086,8 @@ struct filename {
extern long vfs_truncate(struct path *, loff_t);
extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs,
struct file *filp);
+extern int do_truncate2(struct vfsmount *, struct dentry *, loff_t start,
+ unsigned int time_attrs, struct file *filp);
extern int do_fallocate(struct file *file, int mode, loff_t offset,
loff_t len);
extern long do_sys_open(int dfd, const char __user *filename, int flags,
@@ -2280,8 +2308,11 @@ extern void emergency_remount(void);
extern sector_t bmap(struct inode *, sector_t);
#endif
extern int notify_change(struct dentry *, struct iattr *, struct inode **);
+extern int notify_change2(struct vfsmount *, struct dentry *, struct iattr *, struct inode **);
extern int inode_permission(struct inode *, int);
+extern int inode_permission2(struct vfsmount *, struct inode *, int);
extern int __inode_permission(struct inode *, int);
+extern int __inode_permission2(struct vfsmount *, struct inode *, int);
extern int generic_permission(struct inode *, int);
extern int __check_sticky(struct inode *dir, struct inode *inode);
diff --git a/include/linux/fscrypt_common.h b/include/linux/fscrypt_common.h
new file mode 100644
index 000000000000..10c1abfbac6c
--- /dev/null
+++ b/include/linux/fscrypt_common.h
@@ -0,0 +1,145 @@
+/*
+ * fscrypt_common.h: common declarations for per-file encryption
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * Written by Michael Halcrow, 2015.
+ * Modified by Jaegeuk Kim, 2015.
+ */
+
+#ifndef _LINUX_FSCRYPT_COMMON_H
+#define _LINUX_FSCRYPT_COMMON_H
+
+#include <linux/key.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/bio.h>
+#include <linux/dcache.h>
+#include <crypto/skcipher.h>
+#include <uapi/linux/fs.h>
+
+#define FS_CRYPTO_BLOCK_SIZE 16
+
+struct fscrypt_info;
+
+struct fscrypt_ctx {
+ union {
+ struct {
+ struct page *bounce_page; /* Ciphertext page */
+ struct page *control_page; /* Original page */
+ } w;
+ struct {
+ struct bio *bio;
+ struct work_struct work;
+ } r;
+ struct list_head free_list; /* Free list */
+ };
+ u8 flags; /* Flags */
+};
+
+/**
+ * For encrypted symlinks, the ciphertext length is stored at the beginning
+ * of the string in little-endian format.
+ */
+struct fscrypt_symlink_data {
+ __le16 len;
+ char encrypted_path[1];
+} __packed;
+
+/**
+ * This function is used to calculate the disk space required to
+ * store a filename of length l in encrypted symlink format.
+ */
+static inline u32 fscrypt_symlink_data_len(u32 l)
+{
+ if (l < FS_CRYPTO_BLOCK_SIZE)
+ l = FS_CRYPTO_BLOCK_SIZE;
+ return (l + sizeof(struct fscrypt_symlink_data) - 1);
+}
+
+struct fscrypt_str {
+ unsigned char *name;
+ u32 len;
+};
+
+struct fscrypt_name {
+ const struct qstr *usr_fname;
+ struct fscrypt_str disk_name;
+ u32 hash;
+ u32 minor_hash;
+ struct fscrypt_str crypto_buf;
+};
+
+#define FSTR_INIT(n, l) { .name = n, .len = l }
+#define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len)
+#define fname_name(p) ((p)->disk_name.name)
+#define fname_len(p) ((p)->disk_name.len)
+
+/*
+ * fscrypt superblock flags
+ */
+#define FS_CFLG_OWN_PAGES (1U << 1)
+
+/*
+ * crypto opertions for filesystems
+ */
+struct fscrypt_operations {
+ unsigned int flags;
+ const char *key_prefix;
+ int (*get_context)(struct inode *, void *, size_t);
+ int (*set_context)(struct inode *, const void *, size_t, void *);
+ int (*dummy_context)(struct inode *);
+ bool (*is_encrypted)(struct inode *);
+ bool (*empty_dir)(struct inode *);
+ unsigned (*max_namelen)(struct inode *);
+};
+
+static inline bool fscrypt_dummy_context_enabled(struct inode *inode)
+{
+ if (inode->i_sb->s_cop->dummy_context &&
+ inode->i_sb->s_cop->dummy_context(inode))
+ return true;
+ return false;
+}
+
+static inline bool fscrypt_valid_contents_enc_mode(u32 mode)
+{
+ return (mode == FS_ENCRYPTION_MODE_AES_256_XTS);
+}
+
+static inline bool fscrypt_valid_filenames_enc_mode(u32 mode)
+{
+ return (mode == FS_ENCRYPTION_MODE_AES_256_CTS);
+}
+
+static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
+{
+ if (str->len == 1 && str->name[0] == '.')
+ return true;
+
+ if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.')
+ return true;
+
+ return false;
+}
+
+static inline struct page *fscrypt_control_page(struct page *page)
+{
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+ return ((struct fscrypt_ctx *)page_private(page))->w.control_page;
+#else
+ WARN_ON_ONCE(1);
+ return ERR_PTR(-EINVAL);
+#endif
+}
+
+static inline int fscrypt_has_encryption_key(const struct inode *inode)
+{
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+ return (inode->i_crypt_info != NULL);
+#else
+ return 0;
+#endif
+}
+
+#endif /* _LINUX_FSCRYPT_COMMON_H */
diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h
new file mode 100644
index 000000000000..ec406aed2f2f
--- /dev/null
+++ b/include/linux/fscrypt_notsupp.h
@@ -0,0 +1,177 @@
+/*
+ * fscrypt_notsupp.h
+ *
+ * This stubs out the fscrypt functions for filesystems configured without
+ * encryption support.
+ */
+
+#ifndef _LINUX_FSCRYPT_NOTSUPP_H
+#define _LINUX_FSCRYPT_NOTSUPP_H
+
+#include <linux/fscrypt_common.h>
+
+/* crypto.c */
+static inline struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode,
+ gfp_t gfp_flags)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline void fscrypt_release_ctx(struct fscrypt_ctx *ctx)
+{
+ return;
+}
+
+static inline struct page *fscrypt_encrypt_page(const struct inode *inode,
+ struct page *page,
+ unsigned int len,
+ unsigned int offs,
+ u64 lblk_num, gfp_t gfp_flags)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline int fscrypt_decrypt_page(const struct inode *inode,
+ struct page *page,
+ unsigned int len, unsigned int offs,
+ u64 lblk_num)
+{
+ return -EOPNOTSUPP;
+}
+
+
+static inline void fscrypt_restore_control_page(struct page *page)
+{
+ return;
+}
+
+static inline void fscrypt_set_d_op(struct dentry *dentry)
+{
+ return;
+}
+
+static inline void fscrypt_set_encrypted_dentry(struct dentry *dentry)
+{
+ return;
+}
+
+/* policy.c */
+static inline int fscrypt_ioctl_set_policy(struct file *filp,
+ const void __user *arg)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int fscrypt_has_permitted_context(struct inode *parent,
+ struct inode *child)
+{
+ return 0;
+}
+
+static inline int fscrypt_inherit_context(struct inode *parent,
+ struct inode *child,
+ void *fs_data, bool preload)
+{
+ return -EOPNOTSUPP;
+}
+
+/* keyinfo.c */
+static inline int fscrypt_get_encryption_info(struct inode *inode)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void fscrypt_put_encryption_info(struct inode *inode,
+ struct fscrypt_info *ci)
+{
+ return;
+}
+
+ /* fname.c */
+static inline int fscrypt_setup_filename(struct inode *dir,
+ const struct qstr *iname,
+ int lookup, struct fscrypt_name *fname)
+{
+ if (dir->i_sb->s_cop->is_encrypted(dir))
+ return -EOPNOTSUPP;
+
+ memset(fname, 0, sizeof(struct fscrypt_name));
+ fname->usr_fname = iname;
+ fname->disk_name.name = (unsigned char *)iname->name;
+ fname->disk_name.len = iname->len;
+ return 0;
+}
+
+static inline void fscrypt_free_filename(struct fscrypt_name *fname)
+{
+ return;
+}
+
+static inline u32 fscrypt_fname_encrypted_size(const struct inode *inode,
+ u32 ilen)
+{
+ /* never happens */
+ WARN_ON(1);
+ return 0;
+}
+
+static inline int fscrypt_fname_alloc_buffer(const struct inode *inode,
+ u32 ilen,
+ struct fscrypt_str *crypto_str)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str)
+{
+ return;
+}
+
+static inline int fscrypt_fname_disk_to_usr(struct inode *inode,
+ u32 hash, u32 minor_hash,
+ const struct fscrypt_str *iname,
+ struct fscrypt_str *oname)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int fscrypt_fname_usr_to_disk(struct inode *inode,
+ const struct qstr *iname,
+ struct fscrypt_str *oname)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline bool fscrypt_match_name(const struct fscrypt_name *fname,
+ const u8 *de_name, u32 de_name_len)
+{
+ /* Encryption support disabled; use standard comparison */
+ if (de_name_len != fname->disk_name.len)
+ return false;
+ return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len);
+}
+
+/* bio.c */
+static inline void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx,
+ struct bio *bio)
+{
+ return;
+}
+
+static inline void fscrypt_pullback_bio_page(struct page **page, bool restore)
+{
+ return;
+}
+
+static inline int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
+ sector_t pblk, unsigned int len)
+{
+ return -EOPNOTSUPP;
+}
+
+#endif /* _LINUX_FSCRYPT_NOTSUPP_H */
diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h
new file mode 100644
index 000000000000..e12c224a0d1e
--- /dev/null
+++ b/include/linux/fscrypt_supp.h
@@ -0,0 +1,144 @@
+/*
+ * fscrypt_supp.h
+ *
+ * This is included by filesystems configured with encryption support.
+ */
+
+#ifndef _LINUX_FSCRYPT_SUPP_H
+#define _LINUX_FSCRYPT_SUPP_H
+
+#include <linux/fscrypt_common.h>
+
+/* crypto.c */
+extern struct kmem_cache *fscrypt_info_cachep;
+extern struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *, gfp_t);
+extern void fscrypt_release_ctx(struct fscrypt_ctx *);
+extern struct page *fscrypt_encrypt_page(const struct inode *, struct page *,
+ unsigned int, unsigned int,
+ u64, gfp_t);
+extern int fscrypt_decrypt_page(const struct inode *, struct page *, unsigned int,
+ unsigned int, u64);
+extern void fscrypt_restore_control_page(struct page *);
+
+extern const struct dentry_operations fscrypt_d_ops;
+
+static inline void fscrypt_set_d_op(struct dentry *dentry)
+{
+ d_set_d_op(dentry, &fscrypt_d_ops);
+}
+
+static inline void fscrypt_set_encrypted_dentry(struct dentry *dentry)
+{
+ spin_lock(&dentry->d_lock);
+ dentry->d_flags |= DCACHE_ENCRYPTED_WITH_KEY;
+ spin_unlock(&dentry->d_lock);
+}
+
+/* policy.c */
+extern int fscrypt_ioctl_set_policy(struct file *, const void __user *);
+extern int fscrypt_ioctl_get_policy(struct file *, void __user *);
+extern int fscrypt_has_permitted_context(struct inode *, struct inode *);
+extern int fscrypt_inherit_context(struct inode *, struct inode *,
+ void *, bool);
+/* keyinfo.c */
+extern int fscrypt_get_encryption_info(struct inode *);
+extern void fscrypt_put_encryption_info(struct inode *, struct fscrypt_info *);
+
+/* fname.c */
+extern int fscrypt_setup_filename(struct inode *, const struct qstr *,
+ int lookup, struct fscrypt_name *);
+extern void fscrypt_free_filename(struct fscrypt_name *);
+extern u32 fscrypt_fname_encrypted_size(const struct inode *, u32);
+extern int fscrypt_fname_alloc_buffer(const struct inode *, u32,
+ struct fscrypt_str *);
+extern void fscrypt_fname_free_buffer(struct fscrypt_str *);
+extern int fscrypt_fname_disk_to_usr(struct inode *, u32, u32,
+ const struct fscrypt_str *, struct fscrypt_str *);
+extern int fscrypt_fname_usr_to_disk(struct inode *, const struct qstr *,
+ struct fscrypt_str *);
+
+#define FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE 32
+
+/* Extracts the second-to-last ciphertext block; see explanation below */
+#define FSCRYPT_FNAME_DIGEST(name, len) \
+ ((name) + round_down((len) - FS_CRYPTO_BLOCK_SIZE - 1, \
+ FS_CRYPTO_BLOCK_SIZE))
+
+#define FSCRYPT_FNAME_DIGEST_SIZE FS_CRYPTO_BLOCK_SIZE
+
+/**
+ * fscrypt_digested_name - alternate identifier for an on-disk filename
+ *
+ * When userspace lists an encrypted directory without access to the key,
+ * filenames whose ciphertext is longer than FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE
+ * bytes are shown in this abbreviated form (base64-encoded) rather than as the
+ * full ciphertext (base64-encoded). This is necessary to allow supporting
+ * filenames up to NAME_MAX bytes, since base64 encoding expands the length.
+ *
+ * To make it possible for filesystems to still find the correct directory entry
+ * despite not knowing the full on-disk name, we encode any filesystem-specific
+ * 'hash' and/or 'minor_hash' which the filesystem may need for its lookups,
+ * followed by the second-to-last ciphertext block of the filename. Due to the
+ * use of the CBC-CTS encryption mode, the second-to-last ciphertext block
+ * depends on the full plaintext. (Note that ciphertext stealing causes the
+ * last two blocks to appear "flipped".) This makes collisions very unlikely:
+ * just a 1 in 2^128 chance for two filenames to collide even if they share the
+ * same filesystem-specific hashes.
+ *
+ * This scheme isn't strictly immune to intentional collisions because it's
+ * basically like a CBC-MAC, which isn't secure on variable-length inputs.
+ * However, generating a CBC-MAC collision requires the ability to choose
+ * arbitrary ciphertext, which won't normally be possible with filename
+ * encryption since it would require write access to the raw disk.
+ *
+ * Taking a real cryptographic hash like SHA-256 over the full ciphertext would
+ * be better in theory but would be less efficient and more complicated to
+ * implement, especially since the filesystem would need to calculate it for
+ * each directory entry examined during a search.
+ */
+struct fscrypt_digested_name {
+ u32 hash;
+ u32 minor_hash;
+ u8 digest[FSCRYPT_FNAME_DIGEST_SIZE];
+};
+
+/**
+ * fscrypt_match_name() - test whether the given name matches a directory entry
+ * @fname: the name being searched for
+ * @de_name: the name from the directory entry
+ * @de_name_len: the length of @de_name in bytes
+ *
+ * Normally @fname->disk_name will be set, and in that case we simply compare
+ * that to the name stored in the directory entry. The only exception is that
+ * if we don't have the key for an encrypted directory and a filename in it is
+ * very long, then we won't have the full disk_name and we'll instead need to
+ * match against the fscrypt_digested_name.
+ *
+ * Return: %true if the name matches, otherwise %false.
+ */
+static inline bool fscrypt_match_name(const struct fscrypt_name *fname,
+ const u8 *de_name, u32 de_name_len)
+{
+ if (unlikely(!fname->disk_name.name)) {
+ const struct fscrypt_digested_name *n =
+ (const void *)fname->crypto_buf.name;
+ if (WARN_ON_ONCE(fname->usr_fname->name[0] != '_'))
+ return false;
+ if (de_name_len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE)
+ return false;
+ return !memcmp(FSCRYPT_FNAME_DIGEST(de_name, de_name_len),
+ n->digest, FSCRYPT_FNAME_DIGEST_SIZE);
+ }
+
+ if (de_name_len != fname->disk_name.len)
+ return false;
+ return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len);
+}
+
+/* bio.c */
+extern void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *, struct bio *);
+extern void fscrypt_pullback_bio_page(struct page **, bool);
+extern int fscrypt_zeroout_range(const struct inode *, pgoff_t, sector_t,
+ unsigned int);
+
+#endif /* _LINUX_FSCRYPT_SUPP_H */
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 662697babd48..dc008b9e9f41 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -39,6 +39,12 @@
# define FTRACE_FORCE_LIST_FUNC 0
#endif
+/* Main tracing buffer and events set up */
+#ifdef CONFIG_TRACING
+void trace_init(void);
+#else
+static inline void trace_init(void) { }
+#endif
struct module;
struct ftrace_hash;
diff --git a/include/linux/gpio_event.h b/include/linux/gpio_event.h
new file mode 100644
index 000000000000..2613fc5e4a93
--- /dev/null
+++ b/include/linux/gpio_event.h
@@ -0,0 +1,170 @@
+/* include/linux/gpio_event.h
+ *
+ * Copyright (C) 2007 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _LINUX_GPIO_EVENT_H
+#define _LINUX_GPIO_EVENT_H
+
+#include <linux/input.h>
+
+struct gpio_event_input_devs {
+ int count;
+ struct input_dev *dev[];
+};
+enum {
+ GPIO_EVENT_FUNC_UNINIT = 0x0,
+ GPIO_EVENT_FUNC_INIT = 0x1,
+ GPIO_EVENT_FUNC_SUSPEND = 0x2,
+ GPIO_EVENT_FUNC_RESUME = 0x3,
+};
+struct gpio_event_info {
+ int (*func)(struct gpio_event_input_devs *input_devs,
+ struct gpio_event_info *info,
+ void **data, int func);
+ int (*event)(struct gpio_event_input_devs *input_devs,
+ struct gpio_event_info *info,
+ void **data, unsigned int dev, unsigned int type,
+ unsigned int code, int value); /* out events */
+ bool no_suspend;
+};
+
+struct gpio_event_platform_data {
+ const char *name;
+ struct gpio_event_info **info;
+ size_t info_count;
+ int (*power)(const struct gpio_event_platform_data *pdata, bool on);
+ const char *names[]; /* If name is NULL, names contain a NULL */
+ /* terminated list of input devices to create */
+};
+
+#define GPIO_EVENT_DEV_NAME "gpio-event"
+
+/* Key matrix */
+
+enum gpio_event_matrix_flags {
+ /* unset: drive active output low, set: drive active output high */
+ GPIOKPF_ACTIVE_HIGH = 1U << 0,
+ GPIOKPF_DEBOUNCE = 1U << 1,
+ GPIOKPF_REMOVE_SOME_PHANTOM_KEYS = 1U << 2,
+ GPIOKPF_REMOVE_PHANTOM_KEYS = GPIOKPF_REMOVE_SOME_PHANTOM_KEYS |
+ GPIOKPF_DEBOUNCE,
+ GPIOKPF_DRIVE_INACTIVE = 1U << 3,
+ GPIOKPF_LEVEL_TRIGGERED_IRQ = 1U << 4,
+ GPIOKPF_PRINT_UNMAPPED_KEYS = 1U << 16,
+ GPIOKPF_PRINT_MAPPED_KEYS = 1U << 17,
+ GPIOKPF_PRINT_PHANTOM_KEYS = 1U << 18,
+};
+
+#define MATRIX_CODE_BITS (10)
+#define MATRIX_KEY_MASK ((1U << MATRIX_CODE_BITS) - 1)
+#define MATRIX_KEY(dev, code) \
+ (((dev) << MATRIX_CODE_BITS) | (code & MATRIX_KEY_MASK))
+
+extern int gpio_event_matrix_func(struct gpio_event_input_devs *input_devs,
+ struct gpio_event_info *info, void **data, int func);
+struct gpio_event_matrix_info {
+ /* initialize to gpio_event_matrix_func */
+ struct gpio_event_info info;
+ /* size must be ninputs * noutputs */
+ const unsigned short *keymap;
+ unsigned int *input_gpios;
+ unsigned int *output_gpios;
+ unsigned int ninputs;
+ unsigned int noutputs;
+ /* time to wait before reading inputs after driving each output */
+ ktime_t settle_time;
+ /* time to wait before scanning the keypad a second time */
+ ktime_t debounce_delay;
+ ktime_t poll_time;
+ unsigned flags;
+};
+
+/* Directly connected inputs and outputs */
+
+enum gpio_event_direct_flags {
+ GPIOEDF_ACTIVE_HIGH = 1U << 0,
+/* GPIOEDF_USE_DOWN_IRQ = 1U << 1, */
+/* GPIOEDF_USE_IRQ = (1U << 2) | GPIOIDF_USE_DOWN_IRQ, */
+ GPIOEDF_PRINT_KEYS = 1U << 8,
+ GPIOEDF_PRINT_KEY_DEBOUNCE = 1U << 9,
+ GPIOEDF_PRINT_KEY_UNSTABLE = 1U << 10,
+};
+
+struct gpio_event_direct_entry {
+ uint32_t gpio:16;
+ uint32_t code:10;
+ uint32_t dev:6;
+};
+
+/* inputs */
+extern int gpio_event_input_func(struct gpio_event_input_devs *input_devs,
+ struct gpio_event_info *info, void **data, int func);
+struct gpio_event_input_info {
+ /* initialize to gpio_event_input_func */
+ struct gpio_event_info info;
+ ktime_t debounce_time;
+ ktime_t poll_time;
+ uint16_t flags;
+ uint16_t type;
+ const struct gpio_event_direct_entry *keymap;
+ size_t keymap_size;
+};
+
+/* outputs */
+extern int gpio_event_output_func(struct gpio_event_input_devs *input_devs,
+ struct gpio_event_info *info, void **data, int func);
+extern int gpio_event_output_event(struct gpio_event_input_devs *input_devs,
+ struct gpio_event_info *info, void **data,
+ unsigned int dev, unsigned int type,
+ unsigned int code, int value);
+struct gpio_event_output_info {
+ /* initialize to gpio_event_output_func and gpio_event_output_event */
+ struct gpio_event_info info;
+ uint16_t flags;
+ uint16_t type;
+ const struct gpio_event_direct_entry *keymap;
+ size_t keymap_size;
+};
+
+
+/* axes */
+
+enum gpio_event_axis_flags {
+ GPIOEAF_PRINT_UNKNOWN_DIRECTION = 1U << 16,
+ GPIOEAF_PRINT_RAW = 1U << 17,
+ GPIOEAF_PRINT_EVENT = 1U << 18,
+};
+
+extern int gpio_event_axis_func(struct gpio_event_input_devs *input_devs,
+ struct gpio_event_info *info, void **data, int func);
+struct gpio_event_axis_info {
+ /* initialize to gpio_event_axis_func */
+ struct gpio_event_info info;
+ uint8_t count; /* number of gpios for this axis */
+ uint8_t dev; /* device index when using multiple input devices */
+ uint8_t type; /* EV_REL or EV_ABS */
+ uint16_t code;
+ uint16_t decoded_size;
+ uint16_t (*map)(struct gpio_event_axis_info *info, uint16_t in);
+ uint32_t *gpio;
+ uint32_t flags;
+};
+#define gpio_axis_2bit_gray_map gpio_axis_4bit_gray_map
+#define gpio_axis_3bit_gray_map gpio_axis_4bit_gray_map
+uint16_t gpio_axis_4bit_gray_map(
+ struct gpio_event_axis_info *info, uint16_t in);
+uint16_t gpio_axis_5bit_singletrack_map(
+ struct gpio_event_axis_info *info, uint16_t in);
+
+#endif
diff --git a/include/linux/hid.h b/include/linux/hid.h
index 78ea9bf941cd..15680809d345 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -670,8 +670,8 @@ struct hid_driver {
int (*input_mapped)(struct hid_device *hdev,
struct hid_input *hidinput, struct hid_field *field,
struct hid_usage *usage, unsigned long **bit, int *max);
- void (*input_configured)(struct hid_device *hdev,
- struct hid_input *hidinput);
+ int (*input_configured)(struct hid_device *hdev,
+ struct hid_input *hidinput);
void (*feature_mapping)(struct hid_device *hdev,
struct hid_field *field,
struct hid_usage *usage);
diff --git a/include/linux/if_pppolac.h b/include/linux/if_pppolac.h
new file mode 100644
index 000000000000..e40aa1075a30
--- /dev/null
+++ b/include/linux/if_pppolac.h
@@ -0,0 +1,23 @@
+/* include/linux/if_pppolac.h
+ *
+ * Header for PPP on L2TP Access Concentrator / PPPoLAC Socket (RFC 2661)
+ *
+ * Copyright (C) 2009 Google, Inc.
+ * Author: Chia-chi Yeh <chiachi@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __LINUX_IF_PPPOLAC_H
+#define __LINUX_IF_PPPOLAC_H
+
+#include <uapi/linux/if_pppolac.h>
+
+#endif /* __LINUX_IF_PPPOLAC_H */
diff --git a/include/linux/if_pppopns.h b/include/linux/if_pppopns.h
new file mode 100644
index 000000000000..4ac621a9ce7c
--- /dev/null
+++ b/include/linux/if_pppopns.h
@@ -0,0 +1,23 @@
+/* include/linux/if_pppopns.h
+ *
+ * Header for PPP on PPTP Network Server / PPPoPNS Socket (RFC 2637)
+ *
+ * Copyright (C) 2009 Google, Inc.
+ * Author: Chia-chi Yeh <chiachi@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __LINUX_IF_PPPOPNS_H
+#define __LINUX_IF_PPPOPNS_H
+
+#include <uapi/linux/if_pppopns.h>
+
+#endif /* __LINUX_IF_PPPOPNS_H */
diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h
index aff7ad8a4ea3..070ec7d0df43 100644
--- a/include/linux/if_pppox.h
+++ b/include/linux/if_pppox.h
@@ -41,6 +41,25 @@ struct pptp_opt {
u32 seq_sent, seq_recv;
int ppp_flags;
};
+
+struct pppolac_opt {
+ __u32 local;
+ __u32 remote;
+ __u32 recv_sequence;
+ __u32 xmit_sequence;
+ atomic_t sequencing;
+ int (*backlog_rcv)(struct sock *sk_udp, struct sk_buff *skb);
+};
+
+struct pppopns_opt {
+ __u16 local;
+ __u16 remote;
+ __u32 recv_sequence;
+ __u32 xmit_sequence;
+ void (*data_ready)(struct sock *sk_raw);
+ int (*backlog_rcv)(struct sock *sk_raw, struct sk_buff *skb);
+};
+
#include <net/sock.h>
struct pppox_sock {
@@ -51,6 +70,8 @@ struct pppox_sock {
union {
struct pppoe_opt pppoe;
struct pptp_opt pptp;
+ struct pppolac_opt lac;
+ struct pppopns_opt pns;
} proto;
__be16 num;
};
diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h
index 46da02410a09..4f214275d13c 100644
--- a/include/linux/inet_diag.h
+++ b/include/linux/inet_diag.h
@@ -3,6 +3,7 @@
#include <uapi/linux/inet_diag.h>
+struct net;
struct sock;
struct inet_hashinfo;
struct nlattr;
@@ -23,6 +24,10 @@ struct inet_diag_handler {
void (*idiag_get_info)(struct sock *sk,
struct inet_diag_msg *r,
void *info);
+
+ int (*destroy)(struct sk_buff *in_skb,
+ struct inet_diag_req_v2 *req);
+
__u16 idiag_type;
};
@@ -31,7 +36,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
struct sk_buff *skb, struct inet_diag_req_v2 *req,
struct user_namespace *user_ns,
u32 pid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh);
+ const struct nlmsghdr *unlh, bool net_admin);
void inet_diag_dump_icsk(struct inet_hashinfo *h, struct sk_buff *skb,
struct netlink_callback *cb, struct inet_diag_req_v2 *r,
struct nlattr *bc);
@@ -39,6 +44,10 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo,
struct sk_buff *in_skb, const struct nlmsghdr *nlh,
struct inet_diag_req_v2 *req);
+struct sock *inet_diag_find_one_icsk(struct net *net,
+ struct inet_hashinfo *hashinfo,
+ struct inet_diag_req_v2 *req);
+
int inet_diag_bc_sk(const struct nlattr *_bc, struct sock *sk);
extern int inet_diag_register(const struct inet_diag_handler *handler);
diff --git a/include/linux/init.h b/include/linux/init.h
index 2df8e8dd10a4..28ec4d128413 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -153,6 +153,10 @@ void prepare_namespace(void);
void __init load_default_modules(void);
int __init init_rootfs(void);
+#ifdef CONFIG_DEBUG_RODATA
+void mark_rodata_ro(void);
+#endif
+
extern void (*late_time_init)(void);
extern bool initcall_debug;
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index a041f64cf29a..16842b4e2056 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -191,6 +191,9 @@ extern struct task_group root_task_group;
.nr_cpus_allowed= NR_CPUS, \
.mm = NULL, \
.active_mm = &init_mm, \
+ .restart_block = { \
+ .fn = do_no_restart_syscall, \
+ }, \
.se = { \
.group_node = LIST_HEAD_INIT(tsk.se.group_node), \
}, \
diff --git a/include/linux/initramfs.h b/include/linux/initramfs.h
new file mode 100644
index 000000000000..fc7da63b125b
--- /dev/null
+++ b/include/linux/initramfs.h
@@ -0,0 +1,32 @@
+/*
+ * include/linux/initramfs.h
+ *
+ * Copyright (C) 2015, Google
+ * Rom Lemarchand <romlem@android.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _LINUX_INITRAMFS_H
+#define _LINUX_INITRAMFS_H
+
+#include <linux/kconfig.h>
+
+#if IS_BUILTIN(CONFIG_BLK_DEV_INITRD)
+
+int __init default_rootfs(void);
+
+#endif
+
+#endif /* _LINUX_INITRAMFS_H */
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 2725b03b4ae2..a66dd8fb1872 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -18,6 +18,7 @@ struct ipv6_devconf {
__s32 dad_transmits;
__s32 rtr_solicits;
__s32 rtr_solicit_interval;
+ __s32 rtr_solicit_max_interval;
__s32 rtr_solicit_delay;
__s32 force_mld_version;
__s32 mldv1_unsolicited_report_interval;
@@ -34,14 +35,17 @@ struct ipv6_devconf {
__s32 accept_ra_rtr_pref;
__s32 rtr_probe_interval;
#ifdef CONFIG_IPV6_ROUTE_INFO
+ __s32 accept_ra_rt_info_min_plen;
__s32 accept_ra_rt_info_max_plen;
#endif
#endif
+ __s32 accept_ra_rt_table;
__s32 proxy_ndp;
__s32 accept_source_route;
__s32 accept_ra_from_local;
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
__s32 optimistic_dad;
+ __s32 use_optimistic;
#endif
#ifdef CONFIG_IPV6_MROUTE
__s32 mc_forwarding;
@@ -51,6 +55,7 @@ struct ipv6_devconf {
__s32 force_tllao;
__s32 ndisc_notify;
__s32 suppress_frag_ndisc;
+ __s32 use_oif_addrs_only;
void *sysctl;
};
@@ -253,9 +258,9 @@ struct tcp6_timewait_sock {
};
#if IS_ENABLED(CONFIG_IPV6)
-static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk)
+static inline struct ipv6_pinfo *inet6_sk(const struct sock *__sk)
{
- return inet_sk(__sk)->pinet6;
+ return sk_fullsock(__sk) ? inet_sk(__sk)->pinet6 : NULL;
}
static inline struct raw6_sock *raw6_sk(const struct sock *sk)
diff --git a/include/linux/kcov.h b/include/linux/kcov.h
new file mode 100644
index 000000000000..2883ac98c280
--- /dev/null
+++ b/include/linux/kcov.h
@@ -0,0 +1,29 @@
+#ifndef _LINUX_KCOV_H
+#define _LINUX_KCOV_H
+
+#include <uapi/linux/kcov.h>
+
+struct task_struct;
+
+#ifdef CONFIG_KCOV
+
+void kcov_task_init(struct task_struct *t);
+void kcov_task_exit(struct task_struct *t);
+
+enum kcov_mode {
+ /* Coverage collection is not enabled yet. */
+ KCOV_MODE_DISABLED = 0,
+ /*
+ * Tracing coverage collection mode.
+ * Covered PCs are collected in a per-task buffer.
+ */
+ KCOV_MODE_TRACE = 1,
+};
+
+#else
+
+static inline void kcov_task_init(struct task_struct *t) {}
+static inline void kcov_task_exit(struct task_struct *t) {}
+
+#endif /* CONFIG_KCOV */
+#endif /* _LINUX_KCOV_H */
diff --git a/include/linux/keychord.h b/include/linux/keychord.h
new file mode 100644
index 000000000000..08cf5402102c
--- /dev/null
+++ b/include/linux/keychord.h
@@ -0,0 +1,23 @@
+/*
+ * Key chord input driver
+ *
+ * Copyright (C) 2008 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+*/
+
+#ifndef __LINUX_KEYCHORD_H_
+#define __LINUX_KEYCHORD_H_
+
+#include <uapi/linux/keychord.h>
+
+#endif /* __LINUX_KEYCHORD_H_ */
diff --git a/include/linux/keycombo.h b/include/linux/keycombo.h
new file mode 100644
index 000000000000..c6db2626b0d3
--- /dev/null
+++ b/include/linux/keycombo.h
@@ -0,0 +1,36 @@
+/*
+ * include/linux/keycombo.h - platform data structure for keycombo driver
+ *
+ * Copyright (C) 2014 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _LINUX_KEYCOMBO_H
+#define _LINUX_KEYCOMBO_H
+
+#define KEYCOMBO_NAME "keycombo"
+
+/*
+ * if key_down_fn and key_up_fn are both present, you are guaranteed that
+ * key_down_fn will return before key_up_fn is called, and that key_up_fn
+ * is called iff key_down_fn is called.
+ */
+struct keycombo_platform_data {
+ void (*key_down_fn)(void *);
+ void (*key_up_fn)(void *);
+ void *priv;
+ int key_down_delay; /* Time in ms */
+ int *keys_up;
+ int keys_down[]; /* 0 terminated */
+};
+
+#endif /* _LINUX_KEYCOMBO_H */
diff --git a/include/linux/keyreset.h b/include/linux/keyreset.h
new file mode 100644
index 000000000000..2e34afab65e4
--- /dev/null
+++ b/include/linux/keyreset.h
@@ -0,0 +1,29 @@
+/*
+ * include/linux/keyreset.h - platform data structure for resetkeys driver
+ *
+ * Copyright (C) 2014 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _LINUX_KEYRESET_H
+#define _LINUX_KEYRESET_H
+
+#define KEYRESET_NAME "keyreset"
+
+struct keyreset_platform_data {
+ int (*reset_fn)(void);
+ int key_down_delay;
+ int *keys_up;
+ int keys_down[]; /* 0 terminated */
+};
+
+#endif /* _LINUX_KEYRESET_H */
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 13d55206ccf6..869b21dcf503 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -38,6 +38,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
})
void kthread_bind(struct task_struct *k, unsigned int cpu);
+void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask);
int kthread_stop(struct task_struct *k);
bool kthread_should_stop(void);
bool kthread_should_park(void);
diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h
index 1cc89e9df480..ffb9c9da4f39 100644
--- a/include/linux/lsm_audit.h
+++ b/include/linux/lsm_audit.h
@@ -40,6 +40,11 @@ struct lsm_network_audit {
} fam;
};
+struct lsm_ioctlop_audit {
+ struct path path;
+ u16 cmd;
+};
+
/* Auxiliary data to use in generating the audit record. */
struct common_audit_data {
char type;
@@ -53,6 +58,7 @@ struct common_audit_data {
#define LSM_AUDIT_DATA_KMOD 8
#define LSM_AUDIT_DATA_INODE 9
#define LSM_AUDIT_DATA_DENTRY 10
+#define LSM_AUDIT_DATA_IOCTL_OP 11
union {
struct path path;
struct dentry *dentry;
@@ -68,6 +74,7 @@ struct common_audit_data {
} key_struct;
#endif
char *kmod_name;
+ struct lsm_ioctlop_audit *op;
} u;
/* this union contains LSM specific data */
union {
diff --git a/include/linux/memory-state-time.h b/include/linux/memory-state-time.h
new file mode 100644
index 000000000000..d2212b027866
--- /dev/null
+++ b/include/linux/memory-state-time.h
@@ -0,0 +1,42 @@
+/* include/linux/memory-state-time.h
+ *
+ * Copyright (C) 2016 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/workqueue.h>
+
+#define UPDATE_MEMORY_STATE(BLOCK, VALUE) BLOCK->update_call(BLOCK, VALUE)
+
+struct memory_state_update_block;
+
+typedef void (*memory_state_update_fn_t)(struct memory_state_update_block *ub,
+ int value);
+
+/* This struct is populated when you pass it to a memory_state_register*
+ * function. The update_call function is used for an update and defined in the
+ * typedef memory_state_update_fn_t
+ */
+struct memory_state_update_block {
+ memory_state_update_fn_t update_call;
+ int id;
+};
+
+/* Register a frequency struct memory_state_update_block to provide updates to
+ * memory_state_time about frequency changes using its update_call function.
+ */
+struct memory_state_update_block *memory_state_register_frequency_source(void);
+
+/* Register a bandwidth struct memory_state_update_block to provide updates to
+ * memory_state_time about bandwidth changes using its update_call function.
+ */
+struct memory_state_update_block *memory_state_register_bandwidth_source(void);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 54ad2e45bc6b..832499f3b71f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -48,6 +48,17 @@ extern int sysctl_legacy_va_layout;
#define sysctl_legacy_va_layout 0
#endif
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
+extern const int mmap_rnd_bits_min;
+extern const int mmap_rnd_bits_max;
+extern int mmap_rnd_bits __read_mostly;
+#endif
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
+extern const int mmap_rnd_compat_bits_min;
+extern const int mmap_rnd_compat_bits_max;
+extern int mmap_rnd_compat_bits __read_mostly;
+#endif
+
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/processor.h>
@@ -1087,6 +1098,7 @@ extern void pagefault_out_of_memory(void);
extern void show_free_areas(unsigned int flags);
extern bool skip_free_areas_node(unsigned int flags, int nid);
+void shmem_set_file(struct vm_area_struct *vma, struct file *file);
int shmem_zero_setup(struct vm_area_struct *);
#ifdef CONFIG_SHMEM
bool shmem_mapping(struct address_space *mapping);
@@ -1324,6 +1336,11 @@ static inline void update_hiwater_vm(struct mm_struct *mm)
mm->hiwater_vm = mm->total_vm;
}
+static inline void reset_mm_hiwater_rss(struct mm_struct *mm)
+{
+ mm->hiwater_rss = get_mm_rss(mm);
+}
+
static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
struct mm_struct *mm)
{
@@ -1763,7 +1780,7 @@ extern int vma_adjust(struct vm_area_struct *vma, unsigned long start,
extern struct vm_area_struct *vma_merge(struct mm_struct *,
struct vm_area_struct *prev, unsigned long addr, unsigned long end,
unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
- struct mempolicy *);
+ struct mempolicy *, const char __user *);
extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
extern int split_vma(struct mm_struct *,
struct vm_area_struct *, unsigned long addr, int new_below);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6e0b286649f1..6c44170c4aca 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -274,6 +274,10 @@ struct vm_area_struct {
* For areas with an address space and backing store,
* linkage into the address_space->i_mmap interval tree, or
* linkage of vma in the address_space->i_mmap_nonlinear list.
+ *
+ * For private anonymous mappings, a pointer to a null terminated string
+ * in the user process containing the name given to the vma, or NULL
+ * if unnamed.
*/
union {
struct {
@@ -281,6 +285,7 @@ struct vm_area_struct {
unsigned long rb_subtree_last;
} linear;
struct list_head nonlinear;
+ const char __user *anon_name;
} shared;
/*
@@ -525,4 +530,13 @@ enum tlb_flush_reason {
NR_TLB_FLUSH_REASONS,
};
+/* Return the name for an anonymous mapping or NULL for a file-backed mapping */
+static inline const char __user *vma_get_anon_name(struct vm_area_struct *vma)
+{
+ if (vma->vm_file)
+ return NULL;
+
+ return vma->shared.anon_name;
+}
+
#endif /* _LINUX_MM_TYPES_H */
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index b0692d28f8e6..868e0f977700 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -114,6 +114,9 @@ struct mmc_ext_csd {
u8 raw_pwr_cl_ddr_200_360; /* 253 */
u8 raw_bkops_status; /* 246 */
u8 raw_sectors[4]; /* 212 - 4 bytes */
+ u8 pre_eol_info; /* 267 */
+ u8 device_life_time_est_typ_a; /* 268 */
+ u8 device_life_time_est_typ_b; /* 269 */
unsigned int feature_support;
#define MMC_DISCARD_FEATURE BIT(0) /* CMD38 feature */
diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h
index f206e29f94d7..873ec9deb055 100644
--- a/include/linux/mmc/core.h
+++ b/include/linux/mmc/core.h
@@ -135,6 +135,10 @@ struct mmc_request {
struct completion completion;
void (*done)(struct mmc_request *);/* completion function */
struct mmc_host *host;
+#ifdef CONFIG_BLOCK
+ ktime_t io_start;
+ int lat_hist_enabled;
+#endif
};
struct mmc_card;
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index df0c15396bbf..8aa0fab5823a 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -15,6 +15,7 @@
#include <linux/sched.h>
#include <linux/device.h>
#include <linux/fault-inject.h>
+#include <linux/blkdev.h>
#include <linux/mmc/core.h>
#include <linux/mmc/card.h>
@@ -375,6 +376,20 @@ struct mmc_host {
int dsr_req; /* DSR value is valid */
u32 dsr; /* optional driver stage (DSR) value */
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+ struct {
+ struct sdio_cis *cis;
+ struct sdio_cccr *cccr;
+ struct sdio_embedded_func *funcs;
+ int num_funcs;
+ } embedded_sdio_data;
+#endif
+
+#ifdef CONFIG_BLOCK
+ int latency_hist_enabled;
+ struct io_latency_state io_lat_s;
+#endif
+
unsigned long private[0] ____cacheline_aligned;
};
@@ -384,6 +399,14 @@ void mmc_remove_host(struct mmc_host *);
void mmc_free_host(struct mmc_host *);
int mmc_of_parse(struct mmc_host *host);
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+extern void mmc_set_embedded_sdio_data(struct mmc_host *host,
+ struct sdio_cis *cis,
+ struct sdio_cccr *cccr,
+ struct sdio_embedded_func *funcs,
+ int num_funcs);
+#endif
+
static inline void *mmc_priv(struct mmc_host *host)
{
return (void *)host->private;
diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h
index 1cd00b3a75b9..9007d262ec41 100644
--- a/include/linux/mmc/mmc.h
+++ b/include/linux/mmc/mmc.h
@@ -332,6 +332,9 @@ struct _mmc_csd {
#define EXT_CSD_GENERIC_CMD6_TIME 248 /* RO */
#define EXT_CSD_CACHE_SIZE 249 /* RO, 4 bytes */
#define EXT_CSD_PWR_CL_DDR_200_360 253 /* RO */
+#define EXT_CSD_PRE_EOL_INFO 267 /* RO */
+#define EXT_CSD_DEVICE_LIFE_TIME_EST_TYP_A 268 /* RO */
+#define EXT_CSD_DEVICE_LIFE_TIME_EST_TYP_B 269 /* RO */
#define EXT_CSD_TAG_UNIT_SIZE 498 /* RO */
#define EXT_CSD_DATA_TAG_SUPPORT 499 /* RO */
#define EXT_CSD_MAX_PACKED_WRITES 500 /* RO */
diff --git a/include/linux/mmc/pm.h b/include/linux/mmc/pm.h
index 4a139204c20c..6e2d6a135c7e 100644
--- a/include/linux/mmc/pm.h
+++ b/include/linux/mmc/pm.h
@@ -26,5 +26,6 @@ typedef unsigned int mmc_pm_flag_t;
#define MMC_PM_KEEP_POWER (1 << 0) /* preserve card power during suspend */
#define MMC_PM_WAKE_SDIO_IRQ (1 << 1) /* wake up host system on SDIO IRQ assertion */
+#define MMC_PM_IGNORE_PM_NOTIFY (1 << 2) /* ignore mmc pm notify */
#endif /* LINUX_MMC_PM_H */
diff --git a/include/linux/mmc/sdio_func.h b/include/linux/mmc/sdio_func.h
index 50f0bc952328..dc680c4b50d4 100644..100755
--- a/include/linux/mmc/sdio_func.h
+++ b/include/linux/mmc/sdio_func.h
@@ -23,6 +23,14 @@ struct sdio_func;
typedef void (sdio_irq_handler_t)(struct sdio_func *);
/*
+ * Structure used to hold embedded SDIO device data from platform layer
+ */
+struct sdio_embedded_func {
+ uint8_t f_class;
+ uint32_t f_maxblksize;
+};
+
+/*
* SDIO function CIS tuple (unknown to the core)
*/
struct sdio_func_tuple {
@@ -130,6 +138,8 @@ extern int sdio_release_irq(struct sdio_func *func);
extern unsigned int sdio_align_size(struct sdio_func *func, unsigned int sz);
extern u8 sdio_readb(struct sdio_func *func, unsigned int addr, int *err_ret);
+extern u8 sdio_readb_ext(struct sdio_func *func, unsigned int addr, int *err_ret,
+ unsigned in);
extern u16 sdio_readw(struct sdio_func *func, unsigned int addr, int *err_ret);
extern u32 sdio_readl(struct sdio_func *func, unsigned int addr, int *err_ret);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index ffe66e381c04..2594de31a1c7 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -65,8 +65,10 @@ enum {
#ifdef CONFIG_CMA
# define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
+# define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA)
#else
# define is_migrate_cma(migratetype) false
+# define is_migrate_cma_page(_page) false
#endif
#define for_each_migratetype_order(order, type) \
diff --git a/include/linux/mount.h b/include/linux/mount.h
index c2c561dc0114..b9e2609701b6 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -66,6 +66,7 @@ struct vfsmount {
struct dentry *mnt_root; /* root of the mounted tree */
struct super_block *mnt_sb; /* pointer to superblock */
int mnt_flags;
+ void *data;
};
struct file; /* forward dec */
@@ -92,6 +93,6 @@ extern struct vfsmount *vfs_kern_mount(struct file_system_type *type,
extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list);
extern void mark_mounts_for_expiry(struct list_head *mounts);
-extern dev_t name_to_dev_t(char *name);
+extern dev_t name_to_dev_t(const char *name);
#endif /* _LINUX_MOUNT_H */
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 492de72560fa..f6ff630b39aa 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -71,8 +71,11 @@ extern struct dentry *user_path_create(int, const char __user *, struct path *,
extern void done_path_create(struct path *, struct dentry *);
extern struct dentry *kern_path_locked(const char *, struct path *);
extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
+extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
+ const char *, unsigned int, struct path *);
extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
+extern struct dentry *lookup_one_len2(const char *, struct vfsmount *mnt, struct dentry *, int);
extern int follow_down_one(struct path *);
extern int follow_down(struct path *);
diff --git a/include/linux/netfilter/xt_qtaguid.h b/include/linux/netfilter/xt_qtaguid.h
new file mode 100644
index 000000000000..1c671552ec37
--- /dev/null
+++ b/include/linux/netfilter/xt_qtaguid.h
@@ -0,0 +1,14 @@
+#ifndef _XT_QTAGUID_MATCH_H
+#define _XT_QTAGUID_MATCH_H
+
+/* For now we just replace the xt_owner.
+ * FIXME: make iptables aware of qtaguid. */
+#include <linux/netfilter/xt_owner.h>
+
+#define XT_QTAGUID_UID XT_OWNER_UID
+#define XT_QTAGUID_GID XT_OWNER_GID
+#define XT_QTAGUID_SOCKET XT_OWNER_SOCKET
+#define xt_qtaguid_match_info xt_owner_match_info
+
+int qtaguid_untag(struct socket *sock, bool kernel);
+#endif /* _XT_QTAGUID_MATCH_H */
diff --git a/include/linux/netfilter/xt_quota2.h b/include/linux/netfilter/xt_quota2.h
new file mode 100644
index 000000000000..eadc6903314e
--- /dev/null
+++ b/include/linux/netfilter/xt_quota2.h
@@ -0,0 +1,25 @@
+#ifndef _XT_QUOTA_H
+#define _XT_QUOTA_H
+
+enum xt_quota_flags {
+ XT_QUOTA_INVERT = 1 << 0,
+ XT_QUOTA_GROW = 1 << 1,
+ XT_QUOTA_PACKET = 1 << 2,
+ XT_QUOTA_NO_CHANGE = 1 << 3,
+ XT_QUOTA_MASK = 0x0F,
+};
+
+struct xt_quota_counter;
+
+struct xt_quota_mtinfo2 {
+ char name[15];
+ u_int8_t flags;
+
+ /* Comparison-invariant */
+ aligned_u64 quota;
+
+ /* Used internally by the kernel */
+ struct xt_quota_counter *master __attribute__((aligned(8)));
+};
+
+#endif /* _XT_QUOTA_H */
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 9b2022ab4d85..eeb54ae381b9 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -14,8 +14,11 @@
* may be used to reset the timeout - for code which intentionally
* disables interrupts for a long time. This call is stateless.
*/
-#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
+#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR_NMI)
#include <asm/nmi.h>
+#endif
+
+#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
extern void touch_nmi_watchdog(void);
#else
static inline void touch_nmi_watchdog(void)
@@ -45,13 +48,23 @@ static inline bool watchdog_hardlockup_detector_is_enabled(void)
#ifdef arch_trigger_all_cpu_backtrace
static inline bool trigger_all_cpu_backtrace(void)
{
+ #if defined(CONFIG_ARM)
+ arch_trigger_all_cpu_backtrace();
+ #else
arch_trigger_all_cpu_backtrace(true);
+ #endif
return true;
}
static inline bool trigger_allbutself_cpu_backtrace(void)
{
+ #if defined(CONFIG_ARM)
+ arch_trigger_all_cpu_backtrace();
+ #else
arch_trigger_all_cpu_backtrace(false);
+ #endif
+
+
return true;
}
#else
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 83a6aeda899d..63295d2ce154 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -98,6 +98,14 @@
typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;
extern nodemask_t _unused_nodemask_arg_;
+/**
+ * nodemask_pr_args - printf args to output a nodemask
+ * @maskp: nodemask to be printed
+ *
+ * Can be used to provide arguments for '%*pb[l]' when printing a nodemask.
+ */
+#define nodemask_pr_args(maskp) MAX_NUMNODES, (maskp)->bits
+
/*
* The inline keyword gives the compiler room to decide to inline, or
* not inline a function as it sees best. However, as these functions
diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h
index 0ff360d5b3b3..35feeca7bf2c 100644
--- a/include/linux/of_fdt.h
+++ b/include/linux/of_fdt.h
@@ -57,6 +57,27 @@ extern int of_flat_dt_match(unsigned long node, const char *const *matches);
extern unsigned long of_get_flat_dt_root(void);
extern int of_get_flat_dt_size(void);
+/*
+ * early_init_dt_scan_chosen - scan the device tree for ramdisk and bootargs
+ *
+ * The boot arguments will be placed into the memory pointed to by @data.
+ * That memory should be COMMAND_LINE_SIZE big and initialized to be a valid
+ * (possibly empty) string. Logic for what will be in @data after this
+ * function finishes:
+ *
+ * - CONFIG_CMDLINE_FORCE=true
+ * CONFIG_CMDLINE
+ * - CONFIG_CMDLINE_EXTEND=true, @data is non-empty string
+ * @data + dt bootargs (even if dt bootargs are empty)
+ * - CONFIG_CMDLINE_EXTEND=true, @data is empty string
+ * CONFIG_CMDLINE + dt bootargs (even if dt bootargs are empty)
+ * - CMDLINE_FROM_BOOTLOADER=true, dt bootargs=non-empty:
+ * dt bootargs
+ * - CMDLINE_FROM_BOOTLOADER=true, dt bootargs=empty, @data is non-empty string
+ * @data is left unchanged
+ * - CMDLINE_FROM_BOOTLOADER=true, dt bootargs=empty, @data is empty string
+ * CONFIG_CMDLINE (or "" if that's not defined)
+ */
extern int early_init_dt_scan_chosen(unsigned long node, const char *uname,
int depth, void *data);
extern int early_init_dt_scan_memory(unsigned long node, const char *uname,
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 482ccff29bc9..f67026ac6009 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -755,6 +755,11 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
loff_t *ppos);
+static inline bool perf_paranoid_any(void)
+{
+ return sysctl_perf_event_paranoid > 2;
+}
+
static inline bool perf_paranoid_tracepoint_raw(void)
{
return sysctl_perf_event_paranoid > -1;
diff --git a/include/linux/pfn.h b/include/linux/pfn.h
index 7646637221f3..97f3e88aead4 100644
--- a/include/linux/pfn.h
+++ b/include/linux/pfn.h
@@ -9,5 +9,6 @@
#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)
#define PFN_PHYS(x) ((phys_addr_t)(x) << PAGE_SHIFT)
+#define PHYS_PFN(x) ((unsigned long)((x) >> PAGE_SHIFT))
#endif
diff --git a/include/linux/platform_data/ds2482.h b/include/linux/platform_data/ds2482.h
new file mode 100644
index 000000000000..5a6879e2a09a
--- /dev/null
+++ b/include/linux/platform_data/ds2482.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (C) 2012 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __PLATFORM_DATA_DS2482__
+#define __PLATFORM_DATA_DS2482__
+
+struct ds2482_platform_data {
+ int slpz_gpio;
+};
+
+#endif /* __PLATFORM_DATA_DS2482__ */
diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index 096dbced02ac..ec509ed08601 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -17,6 +17,7 @@
#include <linux/leds.h>
#include <linux/spinlock.h>
#include <linux/notifier.h>
+#include <linux/types.h>
/*
* All voltages, currents, charges, energies, time and temperatures in uV,
@@ -147,6 +148,12 @@ enum power_supply_property {
POWER_SUPPLY_PROP_SCOPE,
POWER_SUPPLY_PROP_CHARGE_TERM_CURRENT,
POWER_SUPPLY_PROP_CALIBRATE,
+ /* Local extensions */
+ POWER_SUPPLY_PROP_USB_HC,
+ POWER_SUPPLY_PROP_USB_OTG,
+ POWER_SUPPLY_PROP_CHARGE_ENABLED,
+ /* Local extensions of type int64_t */
+ POWER_SUPPLY_PROP_CHARGE_COUNTER_EXT,
/* Properties of type `const char *' */
POWER_SUPPLY_PROP_MODEL_NAME,
POWER_SUPPLY_PROP_MANUFACTURER,
@@ -171,6 +178,7 @@ enum power_supply_notifier_events {
union power_supply_propval {
int intval;
const char *strval;
+ int64_t int64val;
};
struct device;
diff --git a/include/linux/pstore.h b/include/linux/pstore.h
index ece0c6bbfcc5..2eec52d597ce 100644
--- a/include/linux/pstore.h
+++ b/include/linux/pstore.h
@@ -22,12 +22,13 @@
#ifndef _LINUX_PSTORE_H
#define _LINUX_PSTORE_H
-#include <linux/time.h>
+#include <linux/compiler.h>
+#include <linux/errno.h>
#include <linux/kmsg_dump.h>
#include <linux/mutex.h>
-#include <linux/types.h>
#include <linux/spinlock.h>
-#include <linux/errno.h>
+#include <linux/time.h>
+#include <linux/types.h>
/* types */
enum pstore_type_id {
@@ -39,6 +40,7 @@ enum pstore_type_id {
PSTORE_TYPE_PPC_RTAS = 4,
PSTORE_TYPE_PPC_OF = 5,
PSTORE_TYPE_PPC_COMMON = 6,
+ PSTORE_TYPE_PMSG = 7,
PSTORE_TYPE_UNKNOWN = 255
};
@@ -65,6 +67,10 @@ struct pstore_info {
enum kmsg_dump_reason reason, u64 *id,
unsigned int part, const char *buf, bool compressed,
size_t size, struct pstore_info *psi);
+ int (*write_buf_user)(enum pstore_type_id type,
+ enum kmsg_dump_reason reason, u64 *id,
+ unsigned int part, const char __user *buf,
+ bool compressed, size_t size, struct pstore_info *psi);
int (*erase)(enum pstore_type_id type, u64 id,
int count, struct timespec time,
struct pstore_info *psi);
diff --git a/include/linux/pstore_ram.h b/include/linux/pstore_ram.h
index 4af3fdc85b01..45ac5a0d29ee 100644
--- a/include/linux/pstore_ram.h
+++ b/include/linux/pstore_ram.h
@@ -17,11 +17,12 @@
#ifndef __LINUX_PSTORE_RAM_H__
#define __LINUX_PSTORE_RAM_H__
+#include <linux/compiler.h>
#include <linux/device.h>
+#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/types.h>
-#include <linux/init.h>
struct persistent_ram_buffer;
struct rs_control;
@@ -59,7 +60,9 @@ void persistent_ram_free(struct persistent_ram_zone *prz);
void persistent_ram_zap(struct persistent_ram_zone *prz);
int persistent_ram_write(struct persistent_ram_zone *prz, const void *s,
- unsigned int count);
+ unsigned int count);
+int persistent_ram_write_user(struct persistent_ram_zone *prz,
+ const void __user *s, unsigned int count);
void persistent_ram_save_old(struct persistent_ram_zone *prz);
size_t persistent_ram_old_size(struct persistent_ram_zone *prz);
@@ -68,6 +71,8 @@ void persistent_ram_free_old(struct persistent_ram_zone *prz);
ssize_t persistent_ram_ecc_string(struct persistent_ram_zone *prz,
char *str, size_t len);
+void ramoops_console_write_buf(const char *buf, size_t size);
+
/*
* Ramoops platform data
* @mem_size memory size for ramoops
@@ -81,6 +86,7 @@ struct ramoops_platform_data {
unsigned long record_size;
unsigned long console_size;
unsigned long ftrace_size;
+ unsigned long pmsg_size;
int dump_oops;
struct persistent_ram_ecc_info ecc_info;
};
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d978c9608e7c..ceb10d574fa9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -51,6 +51,7 @@ struct sched_param {
#include <linux/resource.h>
#include <linux/timer.h>
#include <linux/hrtimer.h>
+#include <linux/kcov.h>
#include <linux/task_io_accounting.h>
#include <linux/latencytop.h>
#include <linux/cred.h>
@@ -172,9 +173,17 @@ extern bool single_task_running(void);
extern unsigned long nr_iowait(void);
extern unsigned long nr_iowait_cpu(int cpu);
extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
+#ifdef CONFIG_CPU_QUIET
+extern u64 nr_running_integral(unsigned int cpu);
+#endif
extern void calc_global_load(unsigned long ticks);
+
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
extern void update_cpu_load_nohz(void);
+#else
+static inline void update_cpu_load_nohz(void) { }
+#endif
extern unsigned long get_parent_ip(unsigned long addr);
@@ -267,6 +276,15 @@ extern char ___assert_task_state[1 - 2*!!(
/* Task command name length */
#define TASK_COMM_LEN 16
+enum task_event {
+ PUT_PREV_TASK = 0,
+ PICK_NEXT_TASK = 1,
+ TASK_WAKE = 2,
+ TASK_MIGRATE = 3,
+ TASK_UPDATE = 4,
+ IRQ_UPDATE = 5,
+};
+
#include <linux/spinlock.h>
/*
@@ -324,9 +342,6 @@ extern void show_regs(struct pt_regs *);
*/
extern void show_stack(struct task_struct *task, unsigned long *sp);
-void io_schedule(void);
-long io_schedule_timeout(long timeout);
-
extern void cpu_init (void);
extern void trap_init(void);
extern void update_process_times(int user);
@@ -383,6 +398,13 @@ extern signed long schedule_timeout_uninterruptible(signed long timeout);
asmlinkage void schedule(void);
extern void schedule_preempt_disabled(void);
+extern long io_schedule_timeout(long timeout);
+
+static inline void io_schedule(void)
+{
+ io_schedule_timeout(MAX_SCHEDULE_TIMEOUT);
+}
+
struct nsproxy;
struct user_namespace;
@@ -857,6 +879,14 @@ enum cpu_idle_type {
#define SCHED_CAPACITY_SHIFT 10
#define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT)
+struct sched_capacity_reqs {
+ unsigned long cfs;
+ unsigned long rt;
+ unsigned long dl;
+
+ unsigned long total;
+};
+
/*
* sched-domains (multiprocessor balancing) declarations:
*/
@@ -875,6 +905,7 @@ enum cpu_idle_type {
#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */
#define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */
#define SD_NUMA 0x4000 /* cross-node balancing */
+#define SD_SHARE_CAP_STATES 0x8000 /* Domain members share capacity state */
#ifdef CONFIG_SCHED_SMT
static inline int cpu_smt_flags(void)
@@ -907,6 +938,24 @@ struct sched_domain_attr {
extern int sched_domain_level_max;
+struct capacity_state {
+ unsigned long cap; /* compute capacity */
+ unsigned long power; /* power consumption at this compute capacity */
+};
+
+struct idle_state {
+ unsigned long power; /* power consumption in this idle state */
+};
+
+struct sched_group_energy {
+ unsigned int nr_idle_states; /* number of idle states */
+ struct idle_state *idle_states; /* ptr to idle state array */
+ unsigned int nr_cap_states; /* number of capacity states */
+ struct capacity_state *cap_states; /* ptr to capacity state array */
+};
+
+unsigned long capacity_curr_of(int cpu);
+
struct sched_group;
struct sched_domain {
@@ -1005,6 +1054,8 @@ bool cpus_share_cache(int this_cpu, int that_cpu);
typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
typedef int (*sched_domain_flags_f)(void);
+typedef
+const struct sched_group_energy * const(*sched_domain_energy_f)(int cpu);
#define SDTL_OVERLAP 0x01
@@ -1017,6 +1068,7 @@ struct sd_data {
struct sched_domain_topology_level {
sched_domain_mask_f mask;
sched_domain_flags_f sd_flags;
+ sched_domain_energy_f energy;
int flags;
int numa_level;
struct sd_data data;
@@ -1073,16 +1125,24 @@ struct load_weight {
u32 inv_weight;
};
+/*
+ * The load_avg/util_avg accumulates an infinite geometric series.
+ * 1) load_avg factors frequency scaling into the amount of time that a
+ * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the
+ * aggregated such weights of all runnable and blocked sched_entities.
+ * 2) util_avg factors frequency and cpu scaling into the amount of time
+ * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE].
+ * For cfs_rq, it is the aggregated such times of all runnable and
+ * blocked sched_entities.
+ * The 64 bit load_sum can:
+ * 1) for cfs_rq, afford 4353082796 (=2^64/47742/88761) entities with
+ * the highest weight (=88761) always runnable, we should not overflow
+ * 2) for entity, support any load.weight always runnable
+ */
struct sched_avg {
- /*
- * These sums represent an infinite geometric series and so are bound
- * above by 1024/(1-y). Thus we only need a u32 to store them for all
- * choices of y < 1-2^(-32)*1024.
- */
- u32 runnable_avg_sum, runnable_avg_period;
- u64 last_runnable_update;
- s64 decay_count;
- unsigned long load_avg_contrib;
+ u64 last_update_time, load_sum;
+ u32 util_sum, period_contrib;
+ unsigned long load_avg, util_avg;
};
#ifdef CONFIG_SCHEDSTATS
@@ -1121,6 +1181,41 @@ struct sched_statistics {
};
#endif
+#ifdef CONFIG_SCHED_WALT
+#define RAVG_HIST_SIZE_MAX 5
+
+/* ravg represents frequency scaled cpu-demand of tasks */
+struct ravg {
+ /*
+ * 'mark_start' marks the beginning of an event (task waking up, task
+ * starting to execute, task being preempted) within a window
+ *
+ * 'sum' represents how runnable a task has been within current
+ * window. It incorporates both running time and wait time and is
+ * frequency scaled.
+ *
+ * 'sum_history' keeps track of history of 'sum' seen over previous
+ * RAVG_HIST_SIZE windows. Windows where task was entirely sleeping are
+ * ignored.
+ *
+ * 'demand' represents maximum sum seen over previous
+ * sysctl_sched_ravg_hist_size windows. 'demand' could drive frequency
+ * demand for tasks.
+ *
+ * 'curr_window' represents task's contribution to cpu busy time
+ * statistics (rq->curr_runnable_sum) in current window
+ *
+ * 'prev_window' represents task's contribution to cpu busy time
+ * statistics (rq->prev_runnable_sum) in previous window
+ */
+ u64 mark_start;
+ u32 sum, demand;
+ u32 sum_history[RAVG_HIST_SIZE_MAX];
+ u32 curr_window, prev_window;
+ u16 active_windows;
+};
+#endif
+
struct sched_entity {
struct load_weight load; /* for load-balancing */
struct rb_node run_node;
@@ -1148,7 +1243,7 @@ struct sched_entity {
#endif
#ifdef CONFIG_SMP
- /* Per-entity load-tracking */
+ /* Per entity load average tracking */
struct sched_avg avg;
#endif
};
@@ -1244,9 +1339,9 @@ struct task_struct {
#ifdef CONFIG_SMP
struct llist_node wake_entry;
int on_cpu;
- struct task_struct *last_wakee;
- unsigned long wakee_flips;
+ unsigned int wakee_flips;
unsigned long wakee_flip_decay_ts;
+ struct task_struct *last_wakee;
int wake_cpu;
#endif
@@ -1257,6 +1352,15 @@ struct task_struct {
const struct sched_class *sched_class;
struct sched_entity se;
struct sched_rt_entity rt;
+#ifdef CONFIG_SCHED_WALT
+ struct ravg ravg;
+ /*
+ * 'init_load_pct' represents the initial task load assigned to children
+ * of this task
+ */
+ u32 init_load_pct;
+#endif
+
#ifdef CONFIG_CGROUP_SCHED
struct task_group *sched_task_group;
#endif
@@ -1329,6 +1433,8 @@ struct task_struct {
unsigned long atomic_flags; /* Flags needing atomic access. */
+ struct restart_block restart_block;
+
pid_t pid;
pid_t tgid;
@@ -1369,6 +1475,7 @@ struct task_struct {
cputime_t utime, stime, utimescaled, stimescaled;
cputime_t gtime;
+ unsigned long long cpu_power;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
struct cputime prev_cputime;
#endif
@@ -1650,6 +1757,16 @@ struct task_struct {
/* bitmask and counter of trace recursion */
unsigned long trace_recursion;
#endif /* CONFIG_TRACING */
+#ifdef CONFIG_KCOV
+ /* Coverage collection mode enabled for this task (0 if disabled). */
+ enum kcov_mode kcov_mode;
+ /* Size of the kcov_area. */
+ unsigned kcov_size;
+ /* Buffer for coverage collection. */
+ void *kcov_area;
+ /* kcov desciptor wired with this task or NULL. */
+ struct kcov *kcov;
+#endif
#ifdef CONFIG_MEMCG /* memcg uses this to do batch job */
unsigned int memcg_kmem_skip_account;
struct memcg_oom_info {
@@ -1900,6 +2017,9 @@ static inline cputime_t task_gtime(struct task_struct *t)
extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
+extern int task_free_register(struct notifier_block *n);
+extern int task_free_unregister(struct notifier_block *n);
+
/*
* Per process flags
*/
@@ -2085,13 +2205,6 @@ static inline void calc_load_enter_idle(void) { }
static inline void calc_load_exit_idle(void) { }
#endif /* CONFIG_NO_HZ_COMMON */
-#ifndef CONFIG_CPUMASK_OFFSTACK
-static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
-{
- return set_cpus_allowed_ptr(p, &new_mask);
-}
-#endif
-
/*
* Do not use outside of architecture code which knows its limitations.
*
@@ -2984,6 +3097,11 @@ static inline void inc_syscw(struct task_struct *tsk)
{
tsk->ioac.syscw++;
}
+
+static inline void inc_syscfs(struct task_struct *tsk)
+{
+ tsk->ioac.syscfs++;
+}
#else
static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
{
@@ -3000,6 +3118,9 @@ static inline void inc_syscr(struct task_struct *tsk)
static inline void inc_syscw(struct task_struct *tsk)
{
}
+static inline void inc_syscfs(struct task_struct *tsk)
+{
+}
#endif
#ifndef TASK_SIZE_OF
@@ -3017,13 +3138,13 @@ static inline void mm_update_next_owner(struct mm_struct *mm)
static inline unsigned long task_rlimit(const struct task_struct *tsk,
unsigned int limit)
{
- return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_cur);
+ return READ_ONCE(tsk->signal->rlim[limit].rlim_cur);
}
static inline unsigned long task_rlimit_max(const struct task_struct *tsk,
unsigned int limit)
{
- return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_max);
+ return READ_ONCE(tsk->signal->rlim[limit].rlim_max);
}
static inline unsigned long rlimit(unsigned int limit)
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 596a0e007c62..af35e330be3d 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -39,6 +39,16 @@ extern unsigned int sysctl_sched_latency;
extern unsigned int sysctl_sched_min_granularity;
extern unsigned int sysctl_sched_wakeup_granularity;
extern unsigned int sysctl_sched_child_runs_first;
+extern unsigned int sysctl_sched_is_big_little;
+extern unsigned int sysctl_sched_sync_hint_enable;
+extern unsigned int sysctl_sched_initial_task_util;
+extern unsigned int sysctl_sched_cstate_aware;
+#ifdef CONFIG_SCHED_WALT
+extern unsigned int sysctl_sched_use_walt_cpu_util;
+extern unsigned int sysctl_sched_use_walt_task_util;
+extern unsigned int sysctl_sched_walt_init_task_load_pct;
+extern unsigned int sysctl_sched_walt_cpu_high_irqload;
+#endif
enum sched_tunable_scaling {
SCHED_TUNABLESCALING_NONE,
@@ -89,6 +99,22 @@ extern int sysctl_sched_rt_runtime;
extern unsigned int sysctl_sched_cfs_bandwidth_slice;
#endif
+#ifdef CONFIG_SCHED_TUNE
+extern int sysctl_sched_cfs_boost;
+int sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length,
+ loff_t *ppos);
+static inline int get_sysctl_sched_cfs_boost(void)
+{
+ return sysctl_sched_cfs_boost;
+}
+#else
+static inline int get_sysctl_sched_cfs_boost(void)
+{
+ return 0;
+}
+#endif
+
#ifdef CONFIG_SCHED_AUTOGROUP
extern unsigned int sysctl_sched_autogroup_enabled;
#endif
diff --git a/include/linux/sched_energy.h b/include/linux/sched_energy.h
new file mode 100644
index 000000000000..1daf3e1f98a7
--- /dev/null
+++ b/include/linux/sched_energy.h
@@ -0,0 +1,44 @@
+#ifndef _LINUX_SCHED_ENERGY_H
+#define _LINUX_SCHED_ENERGY_H
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+/*
+ * There doesn't seem to be an NR_CPUS style max number of sched domain
+ * levels so here's an arbitrary constant one for the moment.
+ *
+ * The levels alluded to here correspond to entries in struct
+ * sched_domain_topology_level that are meant to be populated by arch
+ * specific code (topology.c).
+ */
+#define NR_SD_LEVELS 8
+
+#define SD_LEVEL0 0
+#define SD_LEVEL1 1
+#define SD_LEVEL2 2
+#define SD_LEVEL3 3
+#define SD_LEVEL4 4
+#define SD_LEVEL5 5
+#define SD_LEVEL6 6
+#define SD_LEVEL7 7
+
+/*
+ * Convenience macro for iterating through said sd levels.
+ */
+#define for_each_possible_sd_level(level) \
+ for (level = 0; level < NR_SD_LEVELS; level++)
+
+#ifdef CONFIG_SMP
+
+extern struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS];
+
+void init_sched_energy_costs(void);
+
+#else
+
+#define init_sched_energy_costs() do { } while (0)
+
+#endif /* CONFIG_SMP */
+
+#endif
diff --git a/include/linux/security.h b/include/linux/security.h
index ea9eda4abdd5..14389a271f30 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1441,6 +1441,11 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
struct security_operations {
char name[SECURITY_NAME_MAX + 1];
+ int (*binder_set_context_mgr) (struct task_struct *mgr);
+ int (*binder_transaction) (struct task_struct *from, struct task_struct *to);
+ int (*binder_transfer_binder) (struct task_struct *from, struct task_struct *to);
+ int (*binder_transfer_file) (struct task_struct *from, struct task_struct *to, struct file *file);
+
int (*ptrace_access_check) (struct task_struct *child, unsigned int mode);
int (*ptrace_traceme) (struct task_struct *parent);
int (*capget) (struct task_struct *target,
@@ -1739,6 +1744,10 @@ extern void __init security_fixup_ops(struct security_operations *ops);
/* Security operations */
+int security_binder_set_context_mgr(struct task_struct *mgr);
+int security_binder_transaction(struct task_struct *from, struct task_struct *to);
+int security_binder_transfer_binder(struct task_struct *from, struct task_struct *to);
+int security_binder_transfer_file(struct task_struct *from, struct task_struct *to, struct file *file);
int security_ptrace_access_check(struct task_struct *child, unsigned int mode);
int security_ptrace_traceme(struct task_struct *parent);
int security_capget(struct task_struct *target,
@@ -1927,6 +1936,26 @@ static inline int security_init(void)
return 0;
}
+static inline int security_binder_set_context_mgr(struct task_struct *mgr)
+{
+ return 0;
+}
+
+static inline int security_binder_transaction(struct task_struct *from, struct task_struct *to)
+{
+ return 0;
+}
+
+static inline int security_binder_transfer_binder(struct task_struct *from, struct task_struct *to)
+{
+ return 0;
+}
+
+static inline int security_binder_transfer_file(struct task_struct *from, struct task_struct *to, struct file *file)
+{
+ return 0;
+}
+
static inline int security_ptrace_access_check(struct task_struct *child,
unsigned int mode)
{
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 21c2e05c1bc3..219b6a30d475 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -66,6 +66,7 @@ struct uart_ops {
void (*set_ldisc)(struct uart_port *, int new);
void (*pm)(struct uart_port *, unsigned int state,
unsigned int oldstate);
+ void (*wake_peer)(struct uart_port *);
/*
* Return a string describing the type of the port
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 5f97037a1759..899d13e28adc 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -146,6 +146,18 @@ void kfree(const void *);
void kzfree(const void *);
size_t ksize(const void *);
+#ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR
+const char *__check_heap_object(const void *ptr, unsigned long n,
+ struct page *page);
+#else
+static inline const char *__check_heap_object(const void *ptr,
+ unsigned long n,
+ struct page *page)
+{
+ return NULL;
+}
+#endif
+
/*
* Some archs want to perform DMA into kmalloc caches and need a guaranteed
* alignment larger than the alignment of a 64-bit integer.
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index e565d7c10c9c..68c4ec76a586 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -81,6 +81,7 @@ struct kmem_cache {
int reserved; /* Reserved bytes at the end of slabs */
const char *name; /* Name (only for display!) */
struct list_head list; /* List of slab caches */
+ int red_left_pad; /* Left redzone padding size */
#ifdef CONFIG_SYSFS
struct kobject kobj; /* For sysfs */
#endif
diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h
index 46cca4c06848..1a3b383ba4e6 100644
--- a/include/linux/sock_diag.h
+++ b/include/linux/sock_diag.h
@@ -11,6 +11,7 @@ struct sock;
struct sock_diag_handler {
__u8 family;
int (*dump)(struct sk_buff *skb, struct nlmsghdr *nlh);
+ int (*destroy)(struct sk_buff *skb, struct nlmsghdr *nlh);
};
int sock_diag_register(const struct sock_diag_handler *h);
@@ -26,4 +27,5 @@ int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attr);
int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk,
struct sk_buff *skb, int attrtype);
+int sock_diag_destroy(struct sock *sk, int err);
#endif
diff --git a/include/linux/string.h b/include/linux/string.h
index 2e22a2e58f3a..fd04ba53dac3 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -114,6 +114,7 @@ extern int memcmp(const void *,const void *,__kernel_size_t);
extern void * memchr(const void *,int,__kernel_size_t);
#endif
void *memchr_inv(const void *s, int c, size_t n);
+char *strreplace(char *s, char old, char new);
extern char *kstrdup(const char *s, gfp_t gfp);
extern char *kstrndup(const char *s, size_t len, gfp_t gfp);
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 3388c1b6f7d8..57b77c1cf76d 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -201,6 +201,21 @@ struct platform_freeze_ops {
*/
extern void suspend_set_ops(const struct platform_suspend_ops *ops);
extern int suspend_valid_only_mem(suspend_state_t state);
+
+/* Suspend-to-idle state machnine. */
+enum freeze_state {
+ FREEZE_STATE_NONE, /* Not suspended/suspending. */
+ FREEZE_STATE_ENTER, /* Enter suspend-to-idle. */
+ FREEZE_STATE_WAKE, /* Wake up from suspend-to-idle. */
+};
+
+extern enum freeze_state __read_mostly suspend_freeze_state;
+
+static inline bool idle_should_freeze(void)
+{
+ return unlikely(suspend_freeze_state == FREEZE_STATE_ENTER);
+}
+
extern void freeze_set_ops(const struct platform_freeze_ops *ops);
extern void freeze_wake(void);
@@ -228,6 +243,7 @@ extern int pm_suspend(suspend_state_t state);
static inline void suspend_set_ops(const struct platform_suspend_ops *ops) {}
static inline int pm_suspend(suspend_state_t state) { return -ENOSYS; }
+static inline bool idle_should_freeze(void) { return false; }
static inline void freeze_set_ops(const struct platform_freeze_ops *ops) {}
static inline void freeze_wake(void) {}
#endif /* !CONFIG_SUSPEND */
@@ -379,6 +395,7 @@ extern bool pm_get_wakeup_count(unsigned int *count, bool block);
extern bool pm_save_wakeup_count(unsigned int count);
extern void pm_wakep_autosleep_enabled(bool set);
extern void pm_print_active_wakeup_sources(void);
+extern void pm_get_active_wakeup_sources(char *pending_sources, size_t max);
static inline void lock_system_sleep(void)
{
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 1dc0e886227d..51b1a80b36c1 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -440,6 +440,7 @@ extern unsigned int count_swap_pages(int, int);
extern sector_t map_swap_page(struct page *, struct block_device **);
extern sector_t swapdev_block(int, pgoff_t);
extern int page_swapcount(struct page *);
+extern int swp_swapcount(swp_entry_t entry);
extern struct swap_info_struct *page_swap_info(struct page *);
extern int reuse_swap_page(struct page *);
extern int try_to_free_swap(struct page *);
@@ -541,6 +542,11 @@ static inline int page_swapcount(struct page *page)
return 0;
}
+static inline int swp_swapcount(swp_entry_t entry)
+{
+ return 0;
+}
+
#define reuse_swap_page(page) (page_mapcount(page) == 1)
static inline int try_to_free_swap(struct page *page)
diff --git a/include/linux/switch.h b/include/linux/switch.h
new file mode 100644
index 000000000000..3e4c748e343a
--- /dev/null
+++ b/include/linux/switch.h
@@ -0,0 +1,53 @@
+/*
+ * Switch class driver
+ *
+ * Copyright (C) 2008 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+*/
+
+#ifndef __LINUX_SWITCH_H__
+#define __LINUX_SWITCH_H__
+
+struct switch_dev {
+ const char *name;
+ struct device *dev;
+ int index;
+ int state;
+
+ ssize_t (*print_name)(struct switch_dev *sdev, char *buf);
+ ssize_t (*print_state)(struct switch_dev *sdev, char *buf);
+};
+
+struct gpio_switch_platform_data {
+ const char *name;
+ unsigned gpio;
+
+ /* if NULL, switch_dev.name will be printed */
+ const char *name_on;
+ const char *name_off;
+ /* if NULL, "0" or "1" will be printed */
+ const char *state_on;
+ const char *state_off;
+};
+
+extern int switch_dev_register(struct switch_dev *sdev);
+extern void switch_dev_unregister(struct switch_dev *sdev);
+
+static inline int switch_get_state(struct switch_dev *sdev)
+{
+ return sdev->state;
+}
+
+extern void switch_set_state(struct switch_dev *sdev, int state);
+
+#endif /* __LINUX_SWITCH_H__ */
diff --git a/include/linux/task_io_accounting.h b/include/linux/task_io_accounting.h
index bdf855c2856f..2dd338fdf881 100644
--- a/include/linux/task_io_accounting.h
+++ b/include/linux/task_io_accounting.h
@@ -18,6 +18,8 @@ struct task_io_accounting {
u64 syscr;
/* # of write syscalls */
u64 syscw;
+ /* # of fsync syscalls */
+ u64 syscfs;
#endif /* CONFIG_TASK_XACCT */
#ifdef CONFIG_TASK_IO_ACCOUNTING
diff --git a/include/linux/task_io_accounting_ops.h b/include/linux/task_io_accounting_ops.h
index 4d090f9ee608..1b505c804af3 100644
--- a/include/linux/task_io_accounting_ops.h
+++ b/include/linux/task_io_accounting_ops.h
@@ -96,6 +96,7 @@ static inline void task_chr_io_accounting_add(struct task_io_accounting *dst,
dst->wchar += src->wchar;
dst->syscr += src->syscr;
dst->syscw += src->syscw;
+ dst->syscfs += src->syscfs;
}
#else
static inline void task_chr_io_accounting_add(struct task_io_accounting *dst,
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index ff307b548ed3..4cf89517783a 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -145,6 +145,31 @@ static inline bool test_and_clear_restore_sigmask(void)
#error "no set_restore_sigmask() provided and default one won't work"
#endif
+#ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
+static inline int arch_within_stack_frames(const void * const stack,
+ const void * const stackend,
+ const void *obj, unsigned long len)
+{
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_HARDENED_USERCOPY
+extern void __check_object_size(const void *ptr, unsigned long n,
+ bool to_user);
+
+static __always_inline void check_object_size(const void *ptr, unsigned long n,
+ bool to_user)
+{
+ if (!__builtin_constant_p(n))
+ __check_object_size(ptr, n, to_user);
+}
+#else
+static inline void check_object_size(const void *ptr, unsigned long n,
+ bool to_user)
+{ }
+#endif /* CONFIG_HARDENED_USERCOPY */
+
#endif /* __KERNEL__ */
#endif /* _LINUX_THREAD_INFO_H */
diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index 95640dcd1899..fdb9136ac711 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -16,16 +16,16 @@
* @read: Read function of @clock
* @mask: Bitmask for two's complement subtraction of non 64bit clocks
* @cycle_last: @clock cycle value at last update
- * @mult: NTP adjusted multiplier for scaled math conversion
+ * @mult: (NTP adjusted) multiplier for scaled math conversion
* @shift: Shift value for scaled math conversion
* @xtime_nsec: Shifted (fractional) nano seconds offset for readout
- * @base_mono: ktime_t (nanoseconds) base time for readout
+ * @base: ktime_t (nanoseconds) base time for readout
*
* This struct has size 56 byte on 64 bit. Together with a seqcount it
* occupies a single 64byte cache line.
*
* The struct is separate from struct timekeeper as it is also used
- * for a fast NMI safe accessor to clock monotonic.
+ * for a fast NMI safe accessors.
*/
struct tk_read_base {
struct clocksource *clock;
@@ -35,19 +35,19 @@ struct tk_read_base {
u32 mult;
u32 shift;
u64 xtime_nsec;
- ktime_t base_mono;
+ ktime_t base;
};
/**
* struct timekeeper - Structure holding internal timekeeping values.
- * @tkr: The readout base structure
+ * @tkr_mono: The readout base structure for CLOCK_MONOTONIC
+ * @tkr_raw: The readout base structure for CLOCK_MONOTONIC_RAW
* @xtime_sec: Current CLOCK_REALTIME time in seconds
* @wall_to_monotonic: CLOCK_REALTIME to CLOCK_MONOTONIC offset
* @offs_real: Offset clock monotonic -> clock realtime
* @offs_boot: Offset clock monotonic -> clock boottime
* @offs_tai: Offset clock monotonic -> clock tai
* @tai_offset: The current UTC to TAI offset in seconds
- * @base_raw: Monotonic raw base time in ktime_t format
* @raw_time: Monotonic raw base time in timespec64 format
* @cycle_interval: Number of clock cycles in one NTP interval
* @xtime_interval: Number of clock shifted nano seconds in one NTP
@@ -75,14 +75,14 @@ struct tk_read_base {
* used instead.
*/
struct timekeeper {
- struct tk_read_base tkr;
+ struct tk_read_base tkr_mono;
+ struct tk_read_base tkr_raw;
u64 xtime_sec;
struct timespec64 wall_to_monotonic;
ktime_t offs_real;
ktime_t offs_boot;
ktime_t offs_tai;
s32 tai_offset;
- ktime_t base_raw;
struct timespec64 raw_time;
/* The following members are for timekeeping internal use */
diff --git a/include/linux/tracefs.h b/include/linux/tracefs.h
new file mode 100644
index 000000000000..5b727a17beee
--- /dev/null
+++ b/include/linux/tracefs.h
@@ -0,0 +1,45 @@
+/*
+ * tracefs.h - a pseudo file system for activating tracing
+ *
+ * Based on debugfs by: 2004 Greg Kroah-Hartman <greg@kroah.com>
+ *
+ * Copyright (C) 2014 Red Hat Inc, author: Steven Rostedt <srostedt@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * tracefs is the file system that is used by the tracing infrastructure.
+ *
+ */
+
+#ifndef _TRACEFS_H_
+#define _TRACEFS_H_
+
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+
+#include <linux/types.h>
+
+struct file_operations;
+
+#ifdef CONFIG_TRACING
+
+struct dentry *tracefs_create_file(const char *name, umode_t mode,
+ struct dentry *parent, void *data,
+ const struct file_operations *fops);
+
+struct dentry *tracefs_create_dir(const char *name, struct dentry *parent);
+
+void tracefs_remove(struct dentry *dentry);
+void tracefs_remove_recursive(struct dentry *dentry);
+
+struct dentry *tracefs_create_instance_dir(const char *name, struct dentry *parent,
+ int (*mkdir)(const char *name),
+ int (*rmdir)(const char *name));
+
+bool tracefs_initialized(void);
+
+#endif /* CONFIG_TRACING */
+
+#endif
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index ecd3319dac33..5e38c6d4d8eb 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -107,4 +107,11 @@ extern long __probe_kernel_read(void *dst, const void *src, size_t size);
extern long notrace probe_kernel_write(void *dst, const void *src, size_t size);
extern long notrace __probe_kernel_write(void *dst, const void *src, size_t size);
+#ifndef user_access_begin
+#define user_access_begin() do { } while (0)
+#define user_access_end() do { } while (0)
+#define unsafe_get_user(x, ptr, err) do { if (unlikely(__get_user(x, ptr))) goto err; } while (0)
+#define unsafe_put_user(x, ptr, err) do { if (unlikely(__put_user(x, ptr))) goto err; } while (0)
+#endif
+
#endif /* __LINUX_UACCESS_H__ */
diff --git a/include/linux/uid_stat.h b/include/linux/uid_stat.h
new file mode 100644
index 000000000000..6bd6c4e52d17
--- /dev/null
+++ b/include/linux/uid_stat.h
@@ -0,0 +1,29 @@
+/* include/linux/uid_stat.h
+ *
+ * Copyright (C) 2008-2009 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef __uid_stat_h
+#define __uid_stat_h
+
+/* Contains definitions for resource tracking per uid. */
+
+#ifdef CONFIG_UID_STAT
+int uid_stat_tcp_snd(uid_t uid, int size);
+int uid_stat_tcp_rcv(uid_t uid, int size);
+#else
+#define uid_stat_tcp_snd(uid, size) do {} while (0);
+#define uid_stat_tcp_rcv(uid, size) do {} while (0);
+#endif
+
+#endif /* _LINUX_UID_STAT_H */
diff --git a/include/linux/usb/class-dual-role.h b/include/linux/usb/class-dual-role.h
new file mode 100644
index 000000000000..c6df2238012e
--- /dev/null
+++ b/include/linux/usb/class-dual-role.h
@@ -0,0 +1,129 @@
+#ifndef __LINUX_CLASS_DUAL_ROLE_H__
+#define __LINUX_CLASS_DUAL_ROLE_H__
+
+#include <linux/workqueue.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+
+struct device;
+
+enum dual_role_supported_modes {
+ DUAL_ROLE_SUPPORTED_MODES_DFP_AND_UFP = 0,
+ DUAL_ROLE_SUPPORTED_MODES_DFP,
+ DUAL_ROLE_SUPPORTED_MODES_UFP,
+/*The following should be the last element*/
+ DUAL_ROLE_PROP_SUPPORTED_MODES_TOTAL,
+};
+
+enum {
+ DUAL_ROLE_PROP_MODE_UFP = 0,
+ DUAL_ROLE_PROP_MODE_DFP,
+ DUAL_ROLE_PROP_MODE_NONE,
+/*The following should be the last element*/
+ DUAL_ROLE_PROP_MODE_TOTAL,
+};
+
+enum {
+ DUAL_ROLE_PROP_PR_SRC = 0,
+ DUAL_ROLE_PROP_PR_SNK,
+ DUAL_ROLE_PROP_PR_NONE,
+/*The following should be the last element*/
+ DUAL_ROLE_PROP_PR_TOTAL,
+
+};
+
+enum {
+ DUAL_ROLE_PROP_DR_HOST = 0,
+ DUAL_ROLE_PROP_DR_DEVICE,
+ DUAL_ROLE_PROP_DR_NONE,
+/*The following should be the last element*/
+ DUAL_ROLE_PROP_DR_TOTAL,
+};
+
+enum {
+ DUAL_ROLE_PROP_VCONN_SUPPLY_NO = 0,
+ DUAL_ROLE_PROP_VCONN_SUPPLY_YES,
+/*The following should be the last element*/
+ DUAL_ROLE_PROP_VCONN_SUPPLY_TOTAL,
+};
+
+enum dual_role_property {
+ DUAL_ROLE_PROP_SUPPORTED_MODES = 0,
+ DUAL_ROLE_PROP_MODE,
+ DUAL_ROLE_PROP_PR,
+ DUAL_ROLE_PROP_DR,
+ DUAL_ROLE_PROP_VCONN_SUPPLY,
+};
+
+struct dual_role_phy_instance;
+
+/* Description of typec port */
+struct dual_role_phy_desc {
+ /* /sys/class/dual_role_usb/<name>/ */
+ const char *name;
+ enum dual_role_supported_modes supported_modes;
+ enum dual_role_property *properties;
+ size_t num_properties;
+
+ /* Callback for "cat /sys/class/dual_role_usb/<name>/<property>" */
+ int (*get_property)(struct dual_role_phy_instance *dual_role,
+ enum dual_role_property prop,
+ unsigned int *val);
+ /* Callback for "echo <value> >
+ * /sys/class/dual_role_usb/<name>/<property>" */
+ int (*set_property)(struct dual_role_phy_instance *dual_role,
+ enum dual_role_property prop,
+ const unsigned int *val);
+ /* Decides whether userspace can change a specific property */
+ int (*property_is_writeable)(struct dual_role_phy_instance *dual_role,
+ enum dual_role_property prop);
+};
+
+struct dual_role_phy_instance {
+ const struct dual_role_phy_desc *desc;
+
+ /* Driver private data */
+ void *drv_data;
+
+ struct device dev;
+ struct work_struct changed_work;
+};
+
+#if IS_ENABLED(CONFIG_DUAL_ROLE_USB_INTF)
+extern void dual_role_instance_changed(struct dual_role_phy_instance
+ *dual_role);
+extern struct dual_role_phy_instance *__must_check
+devm_dual_role_instance_register(struct device *parent,
+ const struct dual_role_phy_desc *desc);
+extern void devm_dual_role_instance_unregister(struct device *dev,
+ struct dual_role_phy_instance
+ *dual_role);
+extern int dual_role_get_property(struct dual_role_phy_instance *dual_role,
+ enum dual_role_property prop,
+ unsigned int *val);
+extern int dual_role_set_property(struct dual_role_phy_instance *dual_role,
+ enum dual_role_property prop,
+ const unsigned int *val);
+extern int dual_role_property_is_writeable(struct dual_role_phy_instance
+ *dual_role,
+ enum dual_role_property prop);
+extern void *dual_role_get_drvdata(struct dual_role_phy_instance *dual_role);
+#else /* CONFIG_DUAL_ROLE_USB_INTF */
+static inline void dual_role_instance_changed(struct dual_role_phy_instance
+ *dual_role){}
+static inline struct dual_role_phy_instance *__must_check
+devm_dual_role_instance_register(struct device *parent,
+ const struct dual_role_phy_desc *desc)
+{
+ return ERR_PTR(-ENOSYS);
+}
+static inline void devm_dual_role_instance_unregister(struct device *dev,
+ struct dual_role_phy_instance
+ *dual_role){}
+static inline void *dual_role_get_drvdata(struct dual_role_phy_instance
+ *dual_role)
+{
+ return ERR_PTR(-ENOSYS);
+}
+#endif /* CONFIG_DUAL_ROLE_USB_INTF */
+#endif /* __LINUX_CLASS_DUAL_ROLE_H__ */
diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h
index c330f5ef42cf..6380a0e4001a 100644
--- a/include/linux/usb/composite.h
+++ b/include/linux/usb/composite.h
@@ -562,6 +562,7 @@ struct usb_function_instance {
struct config_group group;
struct list_head cfs_list;
struct usb_function_driver *fd;
+ struct usb_function *f;
int (*set_inst_name)(struct usb_function_instance *inst,
const char *name);
void (*free_func_inst)(struct usb_function_instance *inst);
diff --git a/include/linux/usb/f_accessory.h b/include/linux/usb/f_accessory.h
new file mode 100644
index 000000000000..ebe3c4d59309
--- /dev/null
+++ b/include/linux/usb/f_accessory.h
@@ -0,0 +1,23 @@
+/*
+ * Gadget Function Driver for Android USB accessories
+ *
+ * Copyright (C) 2011 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef __LINUX_USB_F_ACCESSORY_H
+#define __LINUX_USB_F_ACCESSORY_H
+
+#include <uapi/linux/usb/f_accessory.h>
+
+#endif /* __LINUX_USB_F_ACCESSORY_H */
diff --git a/include/linux/usb/f_mtp.h b/include/linux/usb/f_mtp.h
new file mode 100644
index 000000000000..4e8417791bea
--- /dev/null
+++ b/include/linux/usb/f_mtp.h
@@ -0,0 +1,23 @@
+/*
+ * Gadget Function Driver for MTP
+ *
+ * Copyright (C) 2010 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef __LINUX_USB_F_MTP_H
+#define __LINUX_USB_F_MTP_H
+
+#include <uapi/linux/usb/f_mtp.h>
+
+#endif /* __LINUX_USB_F_MTP_H */
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 82e7db7f7100..c013b8d8e434 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -211,6 +211,7 @@ extern void __inc_zone_state(struct zone *, enum zone_stat_item);
extern void dec_zone_state(struct zone *, enum zone_stat_item);
extern void __dec_zone_state(struct zone *, enum zone_stat_item);
+void quiet_vmstat(void);
void cpu_vm_stats_fold(int cpu);
void refresh_zone_stat_thresholds(void);
@@ -272,6 +273,7 @@ static inline void __dec_zone_page_state(struct page *page,
static inline void refresh_cpu_vm_stats(int cpu) { }
static inline void refresh_zone_stat_thresholds(void) { }
static inline void cpu_vm_stats_fold(int cpu) { }
+static inline void quiet_vmstat(void) { }
static inline void drain_zonestat(struct zone *zone,
struct per_cpu_pageset *pset) { }
diff --git a/include/linux/wakelock.h b/include/linux/wakelock.h
new file mode 100644
index 000000000000..f4a698a22880
--- /dev/null
+++ b/include/linux/wakelock.h
@@ -0,0 +1,67 @@
+/* include/linux/wakelock.h
+ *
+ * Copyright (C) 2007-2012 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _LINUX_WAKELOCK_H
+#define _LINUX_WAKELOCK_H
+
+#include <linux/ktime.h>
+#include <linux/device.h>
+
+/* A wake_lock prevents the system from entering suspend or other low power
+ * states when active. If the type is set to WAKE_LOCK_SUSPEND, the wake_lock
+ * prevents a full system suspend.
+ */
+
+enum {
+ WAKE_LOCK_SUSPEND, /* Prevent suspend */
+ WAKE_LOCK_TYPE_COUNT
+};
+
+struct wake_lock {
+ struct wakeup_source ws;
+};
+
+static inline void wake_lock_init(struct wake_lock *lock, int type,
+ const char *name)
+{
+ wakeup_source_init(&lock->ws, name);
+}
+
+static inline void wake_lock_destroy(struct wake_lock *lock)
+{
+ wakeup_source_trash(&lock->ws);
+}
+
+static inline void wake_lock(struct wake_lock *lock)
+{
+ __pm_stay_awake(&lock->ws);
+}
+
+static inline void wake_lock_timeout(struct wake_lock *lock, long timeout)
+{
+ __pm_wakeup_event(&lock->ws, jiffies_to_msecs(timeout));
+}
+
+static inline void wake_unlock(struct wake_lock *lock)
+{
+ __pm_relax(&lock->ws);
+}
+
+static inline int wake_lock_active(struct wake_lock *lock)
+{
+ return lock->ws.active;
+}
+
+#endif
diff --git a/include/linux/wakeup_reason.h b/include/linux/wakeup_reason.h
new file mode 100644
index 000000000000..d84d8c301546
--- /dev/null
+++ b/include/linux/wakeup_reason.h
@@ -0,0 +1,32 @@
+/*
+ * include/linux/wakeup_reason.h
+ *
+ * Logs the reason which caused the kernel to resume
+ * from the suspend mode.
+ *
+ * Copyright (C) 2014 Google, Inc.
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _LINUX_WAKEUP_REASON_H
+#define _LINUX_WAKEUP_REASON_H
+
+#define MAX_SUSPEND_ABORT_LEN 256
+
+void log_wakeup_reason(int irq);
+int check_wakeup_reason(int irq);
+
+#ifdef CONFIG_SUSPEND
+void log_suspend_abort_reason(const char *fmt, ...);
+#else
+static inline void log_suspend_abort_reason(const char *fmt, ...) { }
+#endif
+
+#endif /* _LINUX_WAKEUP_REASON_H */
diff --git a/include/linux/wifi_tiwlan.h b/include/linux/wifi_tiwlan.h
new file mode 100644
index 000000000000..f07e0679fb82
--- /dev/null
+++ b/include/linux/wifi_tiwlan.h
@@ -0,0 +1,27 @@
+/* include/linux/wifi_tiwlan.h
+ *
+ * Copyright (C) 2008 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+#ifndef _LINUX_WIFI_TIWLAN_H_
+#define _LINUX_WIFI_TIWLAN_H_
+
+#include <linux/wlan_plat.h>
+
+#define WMPA_NUMBER_OF_SECTIONS 3
+#define WMPA_NUMBER_OF_BUFFERS 160
+#define WMPA_SECTION_HEADER 24
+#define WMPA_SECTION_SIZE_0 (WMPA_NUMBER_OF_BUFFERS * 64)
+#define WMPA_SECTION_SIZE_1 (WMPA_NUMBER_OF_BUFFERS * 256)
+#define WMPA_SECTION_SIZE_2 (WMPA_NUMBER_OF_BUFFERS * 2048)
+
+#endif
diff --git a/include/linux/wl12xx.h b/include/linux/wl12xx.h
index a9c723be1acf..95704cd4cfab 100644
--- a/include/linux/wl12xx.h
+++ b/include/linux/wl12xx.h
@@ -26,28 +26,6 @@
#include <linux/err.h>
-/* Reference clock values */
-enum {
- WL12XX_REFCLOCK_19 = 0, /* 19.2 MHz */
- WL12XX_REFCLOCK_26 = 1, /* 26 MHz */
- WL12XX_REFCLOCK_38 = 2, /* 38.4 MHz */
- WL12XX_REFCLOCK_52 = 3, /* 52 MHz */
- WL12XX_REFCLOCK_38_XTAL = 4, /* 38.4 MHz, XTAL */
- WL12XX_REFCLOCK_26_XTAL = 5, /* 26 MHz, XTAL */
-};
-
-/* TCXO clock values */
-enum {
- WL12XX_TCXOCLOCK_19_2 = 0, /* 19.2MHz */
- WL12XX_TCXOCLOCK_26 = 1, /* 26 MHz */
- WL12XX_TCXOCLOCK_38_4 = 2, /* 38.4MHz */
- WL12XX_TCXOCLOCK_52 = 3, /* 52 MHz */
- WL12XX_TCXOCLOCK_16_368 = 4, /* 16.368 MHz */
- WL12XX_TCXOCLOCK_32_736 = 5, /* 32.736 MHz */
- WL12XX_TCXOCLOCK_16_8 = 6, /* 16.8 MHz */
- WL12XX_TCXOCLOCK_33_6 = 7, /* 33.6 MHz */
-};
-
struct wl1251_platform_data {
int power_gpio;
/* SDIO only: IRQ number if WLAN_IRQ line is used, 0 for SDIO IRQs */
@@ -55,23 +33,8 @@ struct wl1251_platform_data {
bool use_eeprom;
};
-struct wl12xx_platform_data {
- int irq;
- int board_ref_clock;
- int board_tcxo_clock;
- unsigned long platform_quirks;
- bool pwr_in_suspend;
-};
-
-/* Platform does not support level trigger interrupts */
-#define WL12XX_PLATFORM_QUIRK_EDGE_IRQ BIT(0)
-
#ifdef CONFIG_WILINK_PLATFORM_DATA
-int wl12xx_set_platform_data(const struct wl12xx_platform_data *data);
-
-struct wl12xx_platform_data *wl12xx_get_platform_data(void);
-
int wl1251_set_platform_data(const struct wl1251_platform_data *data);
struct wl1251_platform_data *wl1251_get_platform_data(void);
@@ -79,18 +42,6 @@ struct wl1251_platform_data *wl1251_get_platform_data(void);
#else
static inline
-int wl12xx_set_platform_data(const struct wl12xx_platform_data *data)
-{
- return -ENOSYS;
-}
-
-static inline
-struct wl12xx_platform_data *wl12xx_get_platform_data(void)
-{
- return ERR_PTR(-ENODATA);
-}
-
-static inline
int wl1251_set_platform_data(const struct wl1251_platform_data *data)
{
return -ENOSYS;
diff --git a/include/linux/wlan_plat.h b/include/linux/wlan_plat.h
new file mode 100644
index 000000000000..8e8b06f1ba4a
--- /dev/null
+++ b/include/linux/wlan_plat.h
@@ -0,0 +1,30 @@
+/* include/linux/wlan_plat.h
+ *
+ * Copyright (C) 2010 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+#ifndef _LINUX_WLAN_PLAT_H_
+#define _LINUX_WLAN_PLAT_H_
+
+#define WLAN_PLAT_NODFS_FLAG 0x01
+
+struct wifi_platform_data {
+ int (*set_power)(int val);
+ int (*set_reset)(int val);
+ int (*set_carddetect)(int val);
+ void *(*mem_prealloc)(int section, unsigned long size);
+ int (*get_mac_addr)(unsigned char *buf);
+ int (*get_wake_irq)(void);
+ void *(*get_country_code)(char *ccode, u32 flags);
+};
+
+#endif
diff --git a/include/linux/zpool.h b/include/linux/zpool.h
index f14bd75f08b3..56529b34dc63 100644
--- a/include/linux/zpool.h
+++ b/include/linux/zpool.h
@@ -36,7 +36,8 @@ enum zpool_mapmode {
ZPOOL_MM_DEFAULT = ZPOOL_MM_RW
};
-struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops);
+struct zpool *zpool_create_pool(char *type, char *name,
+ gfp_t gfp, struct zpool_ops *ops);
char *zpool_get_type(struct zpool *pool);
@@ -80,7 +81,7 @@ struct zpool_driver {
atomic_t refcount;
struct list_head list;
- void *(*create)(gfp_t gfp, struct zpool_ops *ops);
+ void *(*create)(char *name, gfp_t gfp, struct zpool_ops *ops);
void (*destroy)(void *pool);
int (*malloc)(void *pool, size_t size, gfp_t gfp,
diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
index 05c214760977..1338190b5478 100644
--- a/include/linux/zsmalloc.h
+++ b/include/linux/zsmalloc.h
@@ -36,7 +36,7 @@ enum zs_mapmode {
struct zs_pool;
-struct zs_pool *zs_create_pool(gfp_t flags);
+struct zs_pool *zs_create_pool(char *name, gfp_t flags);
void zs_destroy_pool(struct zs_pool *pool);
unsigned long zs_malloc(struct zs_pool *pool, size_t size);
@@ -47,5 +47,6 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
void zs_unmap_object(struct zs_pool *pool, unsigned long handle);
unsigned long zs_get_total_pages(struct zs_pool *pool);
+unsigned long zs_compact(struct zs_pool *pool);
#endif
diff --git a/include/net/activity_stats.h b/include/net/activity_stats.h
new file mode 100644
index 000000000000..10e4c1506eeb
--- /dev/null
+++ b/include/net/activity_stats.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (C) 2010 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Author: Mike Chan (mike@android.com)
+ */
+
+#ifndef __activity_stats_h
+#define __activity_stats_h
+
+#ifdef CONFIG_NET_ACTIVITY_STATS
+void activity_stats_update(void);
+#else
+#define activity_stats_update(void) {}
+#endif
+
+#endif /* _NET_ACTIVITY_STATS_H */
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index a9c0a6d3566f..8bd6f020272d 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -1,8 +1,9 @@
#ifndef _ADDRCONF_H
#define _ADDRCONF_H
-#define MAX_RTR_SOLICITATIONS 3
+#define MAX_RTR_SOLICITATIONS -1 /* unlimited */
#define RTR_SOLICITATION_INTERVAL (4*HZ)
+#define RTR_SOLICITATION_MAX_INTERVAL (3600*HZ) /* 1 hour */
#define MIN_VALID_LIFETIME (2*3600) /* 2 hours */
@@ -64,6 +65,9 @@ int addrconf_set_dstaddr(struct net *net, void __user *arg);
int ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
const struct net_device *dev, int strict);
+int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
+ const struct net_device *dev, int strict,
+ u32 banned_flags);
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr);
@@ -195,6 +199,8 @@ static inline bool ipv6_is_mld(struct sk_buff *skb, int nexthdr, int offset)
void addrconf_prefix_rcv(struct net_device *dev,
u8 *opt, int len, bool sllao);
+u32 addrconf_rt_table(const struct net_device *dev, u32 default_table);
+
/*
* anycast prototypes (anycast.c)
*/
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index a2ddcf2398fd..4bbf53b6b8c3 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1465,6 +1465,8 @@ struct cfg80211_match_set {
* @channels: channels to scan
* @min_rssi_thold: for drivers only supporting a single threshold, this
* contains the minimum over all matchsets
+ * @owner_nlportid: netlink portid of owner (if this should is a request
+ * owned by a particular socket)
*/
struct cfg80211_sched_scan_request {
struct cfg80211_ssid *ssids;
@@ -1483,6 +1485,7 @@ struct cfg80211_sched_scan_request {
struct wiphy *wiphy;
struct net_device *dev;
unsigned long scan_start;
+ u32 owner_nlportid;
/* keep last */
struct ieee80211_channel *channels[0];
diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index e584de16e4c3..ea9876285fe3 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -8,6 +8,11 @@
#include <net/flow.h>
#include <net/rtnetlink.h>
+struct fib_kuid_range {
+ kuid_t start;
+ kuid_t end;
+};
+
struct fib_rule {
struct list_head list;
int iifindex;
@@ -28,6 +33,7 @@ struct fib_rule {
int suppress_prefixlen;
char iifname[IFNAMSIZ];
char oifname[IFNAMSIZ];
+ struct fib_kuid_range uid_range;
struct rcu_head rcu;
};
@@ -88,7 +94,8 @@ struct fib_rules_ops {
[FRA_TABLE] = { .type = NLA_U32 }, \
[FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, \
[FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, \
- [FRA_GOTO] = { .type = NLA_U32 }
+ [FRA_GOTO] = { .type = NLA_U32 }, \
+ [FRA_UID_RANGE] = { .len = sizeof(struct fib_rule_uid_range) }
static inline void fib_rule_get(struct fib_rule *rule)
{
diff --git a/include/net/flow.h b/include/net/flow.h
index 8109a159d1b3..546f6d6dcb84 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -10,6 +10,7 @@
#include <linux/socket.h>
#include <linux/in6.h>
#include <linux/atomic.h>
+#include <linux/uidgid.h>
/*
* ifindex generation is per-net namespace, and loopback is
@@ -30,6 +31,7 @@ struct flowi_common {
#define FLOWI_FLAG_ANYSRC 0x01
#define FLOWI_FLAG_KNOWN_NH 0x02
__u32 flowic_secid;
+ kuid_t flowic_uid;
};
union flowi_uli {
@@ -66,6 +68,7 @@ struct flowi4 {
#define flowi4_proto __fl_common.flowic_proto
#define flowi4_flags __fl_common.flowic_flags
#define flowi4_secid __fl_common.flowic_secid
+#define flowi4_uid __fl_common.flowic_uid
/* (saddr,daddr) must be grouped, same order as in IP header */
__be32 saddr;
@@ -85,7 +88,8 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif,
__u32 mark, __u8 tos, __u8 scope,
__u8 proto, __u8 flags,
__be32 daddr, __be32 saddr,
- __be16 dport, __be16 sport)
+ __be16 dport, __be16 sport,
+ kuid_t uid)
{
fl4->flowi4_oif = oif;
fl4->flowi4_iif = LOOPBACK_IFINDEX;
@@ -95,6 +99,7 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif,
fl4->flowi4_proto = proto;
fl4->flowi4_flags = flags;
fl4->flowi4_secid = 0;
+ fl4->flowi4_uid = uid;
fl4->daddr = daddr;
fl4->saddr = saddr;
fl4->fl4_dport = dport;
@@ -122,6 +127,7 @@ struct flowi6 {
#define flowi6_proto __fl_common.flowic_proto
#define flowi6_flags __fl_common.flowic_flags
#define flowi6_secid __fl_common.flowic_secid
+#define flowi6_uid __fl_common.flowic_uid
struct in6_addr daddr;
struct in6_addr saddr;
__be32 flowlabel;
@@ -165,6 +171,7 @@ struct flowi {
#define flowi_proto u.__fl_common.flowic_proto
#define flowi_flags u.__fl_common.flowic_flags
#define flowi_secid u.__fl_common.flowic_secid
+#define flowi_uid u.__fl_common.flowic_uid
} __attribute__((__aligned__(BITS_PER_LONG/8)));
static inline struct flowi *flowi4_to_flowi(struct flowi4 *fl4)
diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h
index 98e5f9578f86..fdf595abf86e 100644
--- a/include/net/if_inet6.h
+++ b/include/net/if_inet6.h
@@ -201,6 +201,7 @@ struct inet6_dev {
struct ipv6_devstat stats;
struct timer_list rs_timer;
+ __s32 rs_interval; /* in jiffies */
__u8 rs_probes;
__u8 addr_gen_mode;
diff --git a/include/net/ip.h b/include/net/ip.h
index 7f0cd17668ae..82730e673ea1 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -173,6 +173,7 @@ struct ip_reply_arg {
/* -1 if not needed */
int bound_dev_if;
u8 tos;
+ kuid_t uid;
};
#define IP_REPLY_ARG_NOSRCCHECK 1
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 2e765849ccd6..5254391cc741 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -109,9 +109,10 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
const struct in6_addr *gwaddr);
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, int oif,
- u32 mark);
+ u32 mark, kuid_t uid);
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu);
-void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark);
+void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
+ kuid_t uid);
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
u32 mark);
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk);
diff --git a/include/net/route.h b/include/net/route.h
index b17cf28f996e..4c468eb05c1f 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -140,7 +140,7 @@ static inline struct rtable *ip_route_output_ports(struct net *net, struct flowi
flowi4_init_output(fl4, oif, sk ? sk->sk_mark : 0, tos,
RT_SCOPE_UNIVERSE, proto,
sk ? inet_sk_flowi_flags(sk) : 0,
- daddr, saddr, dport, sport);
+ daddr, saddr, dport, sport, sock_net_uid(net, sk));
if (sk)
security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
return ip_route_output_flow(net, fl4, sk);
@@ -249,7 +249,8 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32
flow_flags |= FLOWI_FLAG_ANYSRC;
flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE,
- protocol, flow_flags, dst, src, dport, sport);
+ protocol, flow_flags, dst, src, dport, sport,
+ sk->sk_uid);
}
static inline struct rtable *ip_route_connect(struct flowi4 *fl4,
diff --git a/include/net/sock.h b/include/net/sock.h
index a40bc8c0af4b..4cf7e2279480 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -67,8 +67,8 @@
#include <linux/atomic.h>
#include <net/dst.h>
#include <net/checksum.h>
-#include <net/tcp_states.h>
#include <linux/net_tstamp.h>
+#include <net/tcp_states.h>
struct cgroup;
struct cgroup_subsys;
@@ -423,6 +423,7 @@ struct sock {
void *sk_security;
#endif
__u32 sk_mark;
+ kuid_t sk_uid;
u32 sk_classid;
struct cg_proto *sk_cgrp;
void (*sk_state_change)(struct sock *sk);
@@ -1060,6 +1061,7 @@ struct proto {
void (*destroy_cgroup)(struct mem_cgroup *memcg);
struct cg_proto *(*proto_cgroup)(struct mem_cgroup *memcg);
#endif
+ int (*diag_destroy)(struct sock *sk, int err);
};
/*
@@ -1736,6 +1738,7 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
sk->sk_wq = parent->wq;
parent->sk = sk;
sk_set_socket(sk, parent);
+ sk->sk_uid = SOCK_INODE(parent)->i_uid;
security_sock_graft(sk, parent);
write_unlock_bh(&sk->sk_callback_lock);
}
@@ -1743,6 +1746,11 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
kuid_t sock_i_uid(struct sock *sk);
unsigned long sock_i_ino(struct sock *sk);
+static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk)
+{
+ return sk ? sk->sk_uid : make_kuid(net->user_ns, 0);
+}
+
static inline struct dst_entry *
__sk_dst_get(struct sock *sk)
{
@@ -2281,10 +2289,11 @@ static inline struct sock *skb_steal_sock(struct sk_buff *skb)
/* This helper checks if a socket is a full socket,
* ie _not_ a timewait or request socket.
+ * TODO: Check for TCPF_NEW_SYN_RECV when that starts to exist.
*/
static inline bool sk_fullsock(const struct sock *sk)
{
- return (1 << sk->sk_state) & ~(TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV);
+ return (1 << sk->sk_state) & ~(TCPF_TIME_WAIT);
}
void sock_enable_timestamp(struct sock *sk, int flag);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 7219b8f38cef..1eb86f714bcc 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -276,6 +276,7 @@ extern int sysctl_tcp_challenge_ack_limit;
extern unsigned int sysctl_tcp_notsent_lowat;
extern int sysctl_tcp_min_tso_segs;
extern int sysctl_tcp_autocorking;
+extern int sysctl_tcp_default_init_rwnd;
extern atomic_long_t tcp_memory_allocated;
extern struct percpu_counter tcp_sockets_allocated;
@@ -1084,6 +1085,8 @@ void tcp_set_state(struct sock *sk, int state);
void tcp_done(struct sock *sk);
+int tcp_abort(struct sock *sk, int err);
+
static inline void tcp_sack_reset(struct tcp_options_received *rx_opt)
{
rx_opt->dsack = 0;
diff --git a/include/net/udp.h b/include/net/udp.h
index 07f9b70962f6..a274df0c3421 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -229,6 +229,7 @@ int udp_get_port(struct sock *sk, unsigned short snum,
int (*saddr_cmp)(const struct sock *,
const struct sock *));
void udp_err(struct sk_buff *, u32);
+int udp_abort(struct sock *sk, int err);
int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t len);
int udp_push_pending_frames(struct sock *sk);
diff --git a/include/trace/events/android_fs.h b/include/trace/events/android_fs.h
new file mode 100644
index 000000000000..49509533d3fa
--- /dev/null
+++ b/include/trace/events/android_fs.h
@@ -0,0 +1,65 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM android_fs
+
+#if !defined(_TRACE_ANDROID_FS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_ANDROID_FS_H
+
+#include <linux/tracepoint.h>
+#include <trace/events/android_fs_template.h>
+
+DEFINE_EVENT(android_fs_data_start_template, android_fs_dataread_start,
+ TP_PROTO(struct inode *inode, loff_t offset, int bytes,
+ pid_t pid, char *pathname, char *command),
+ TP_ARGS(inode, offset, bytes, pid, pathname, command));
+
+DEFINE_EVENT(android_fs_data_end_template, android_fs_dataread_end,
+ TP_PROTO(struct inode *inode, loff_t offset, int bytes),
+ TP_ARGS(inode, offset, bytes));
+
+DEFINE_EVENT(android_fs_data_start_template, android_fs_datawrite_start,
+ TP_PROTO(struct inode *inode, loff_t offset, int bytes,
+ pid_t pid, char *pathname, char *command),
+ TP_ARGS(inode, offset, bytes, pid, pathname, command));
+
+DEFINE_EVENT(android_fs_data_end_template, android_fs_datawrite_end,
+ TP_PROTO(struct inode *inode, loff_t offset, int bytes),
+ TP_ARGS(inode, offset, bytes));
+
+#endif /* _TRACE_ANDROID_FS_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
+
+#ifndef ANDROID_FSTRACE_GET_PATHNAME
+#define ANDROID_FSTRACE_GET_PATHNAME
+
+/* Sizes an on-stack array, so careful if sizing this up ! */
+#define MAX_TRACE_PATHBUF_LEN 256
+
+static inline char *
+android_fstrace_get_pathname(char *buf, int buflen, struct inode *inode)
+{
+ char *path;
+ struct dentry *d;
+
+ /*
+ * d_obtain_alias() will either iput() if it locates an existing
+ * dentry or transfer the reference to the new dentry created.
+ * So get an extra reference here.
+ */
+ ihold(inode);
+ d = d_obtain_alias(inode);
+ if (likely(!IS_ERR(d))) {
+ path = dentry_path_raw(d, buf, buflen);
+ if (unlikely(IS_ERR(path))) {
+ strcpy(buf, "ERROR");
+ path = buf;
+ }
+ dput(d);
+ } else {
+ strcpy(buf, "ERROR");
+ path = buf;
+ }
+ return path;
+}
+#endif
diff --git a/include/trace/events/android_fs_template.h b/include/trace/events/android_fs_template.h
new file mode 100644
index 000000000000..b23d17b56c63
--- /dev/null
+++ b/include/trace/events/android_fs_template.h
@@ -0,0 +1,64 @@
+#if !defined(_TRACE_ANDROID_FS_TEMPLATE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_ANDROID_FS_TEMPLATE_H
+
+#include <linux/tracepoint.h>
+
+DECLARE_EVENT_CLASS(android_fs_data_start_template,
+ TP_PROTO(struct inode *inode, loff_t offset, int bytes,
+ pid_t pid, char *pathname, char *command),
+ TP_ARGS(inode, offset, bytes, pid, pathname, command),
+ TP_STRUCT__entry(
+ __string(pathbuf, pathname);
+ __field(loff_t, offset);
+ __field(int, bytes);
+ __field(loff_t, i_size);
+ __string(cmdline, command);
+ __field(pid_t, pid);
+ __field(ino_t, ino);
+ ),
+ TP_fast_assign(
+ {
+ /*
+ * Replace the spaces in filenames and cmdlines
+ * because this screws up the tooling that parses
+ * the traces.
+ */
+ __assign_str(pathbuf, pathname);
+ (void)strreplace(__get_str(pathbuf), ' ', '_');
+ __entry->offset = offset;
+ __entry->bytes = bytes;
+ __entry->i_size = i_size_read(inode);
+ __assign_str(cmdline, command);
+ (void)strreplace(__get_str(cmdline), ' ', '_');
+ __entry->pid = pid;
+ __entry->ino = inode->i_ino;
+ }
+ ),
+ TP_printk("entry_name %s, offset %llu, bytes %d, cmdline %s,"
+ " pid %d, i_size %llu, ino %lu",
+ __get_str(pathbuf), __entry->offset, __entry->bytes,
+ __get_str(cmdline), __entry->pid, __entry->i_size,
+ (unsigned long) __entry->ino)
+);
+
+DECLARE_EVENT_CLASS(android_fs_data_end_template,
+ TP_PROTO(struct inode *inode, loff_t offset, int bytes),
+ TP_ARGS(inode, offset, bytes),
+ TP_STRUCT__entry(
+ __field(ino_t, ino);
+ __field(loff_t, offset);
+ __field(int, bytes);
+ ),
+ TP_fast_assign(
+ {
+ __entry->ino = inode->i_ino;
+ __entry->offset = offset;
+ __entry->bytes = bytes;
+ }
+ ),
+ TP_printk("ino %lu, offset %llu, bytes %d",
+ (unsigned long) __entry->ino,
+ __entry->offset, __entry->bytes)
+);
+
+#endif /* _TRACE_ANDROID_FS_TEMPLATE_H */
diff --git a/include/trace/events/cpufreq_interactive.h b/include/trace/events/cpufreq_interactive.h
new file mode 100644
index 000000000000..951e6ca12da8
--- /dev/null
+++ b/include/trace/events/cpufreq_interactive.h
@@ -0,0 +1,112 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM cpufreq_interactive
+
+#if !defined(_TRACE_CPUFREQ_INTERACTIVE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_CPUFREQ_INTERACTIVE_H
+
+#include <linux/tracepoint.h>
+
+DECLARE_EVENT_CLASS(set,
+ TP_PROTO(u32 cpu_id, unsigned long targfreq,
+ unsigned long actualfreq),
+ TP_ARGS(cpu_id, targfreq, actualfreq),
+
+ TP_STRUCT__entry(
+ __field( u32, cpu_id )
+ __field(unsigned long, targfreq )
+ __field(unsigned long, actualfreq )
+ ),
+
+ TP_fast_assign(
+ __entry->cpu_id = (u32) cpu_id;
+ __entry->targfreq = targfreq;
+ __entry->actualfreq = actualfreq;
+ ),
+
+ TP_printk("cpu=%u targ=%lu actual=%lu",
+ __entry->cpu_id, __entry->targfreq,
+ __entry->actualfreq)
+);
+
+DEFINE_EVENT(set, cpufreq_interactive_setspeed,
+ TP_PROTO(u32 cpu_id, unsigned long targfreq,
+ unsigned long actualfreq),
+ TP_ARGS(cpu_id, targfreq, actualfreq)
+);
+
+DECLARE_EVENT_CLASS(loadeval,
+ TP_PROTO(unsigned long cpu_id, unsigned long load,
+ unsigned long curtarg, unsigned long curactual,
+ unsigned long newtarg),
+ TP_ARGS(cpu_id, load, curtarg, curactual, newtarg),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, cpu_id )
+ __field(unsigned long, load )
+ __field(unsigned long, curtarg )
+ __field(unsigned long, curactual )
+ __field(unsigned long, newtarg )
+ ),
+
+ TP_fast_assign(
+ __entry->cpu_id = cpu_id;
+ __entry->load = load;
+ __entry->curtarg = curtarg;
+ __entry->curactual = curactual;
+ __entry->newtarg = newtarg;
+ ),
+
+ TP_printk("cpu=%lu load=%lu cur=%lu actual=%lu targ=%lu",
+ __entry->cpu_id, __entry->load, __entry->curtarg,
+ __entry->curactual, __entry->newtarg)
+);
+
+DEFINE_EVENT(loadeval, cpufreq_interactive_target,
+ TP_PROTO(unsigned long cpu_id, unsigned long load,
+ unsigned long curtarg, unsigned long curactual,
+ unsigned long newtarg),
+ TP_ARGS(cpu_id, load, curtarg, curactual, newtarg)
+);
+
+DEFINE_EVENT(loadeval, cpufreq_interactive_already,
+ TP_PROTO(unsigned long cpu_id, unsigned long load,
+ unsigned long curtarg, unsigned long curactual,
+ unsigned long newtarg),
+ TP_ARGS(cpu_id, load, curtarg, curactual, newtarg)
+);
+
+DEFINE_EVENT(loadeval, cpufreq_interactive_notyet,
+ TP_PROTO(unsigned long cpu_id, unsigned long load,
+ unsigned long curtarg, unsigned long curactual,
+ unsigned long newtarg),
+ TP_ARGS(cpu_id, load, curtarg, curactual, newtarg)
+);
+
+TRACE_EVENT(cpufreq_interactive_boost,
+ TP_PROTO(const char *s),
+ TP_ARGS(s),
+ TP_STRUCT__entry(
+ __string(s, s)
+ ),
+ TP_fast_assign(
+ __assign_str(s, s);
+ ),
+ TP_printk("%s", __get_str(s))
+);
+
+TRACE_EVENT(cpufreq_interactive_unboost,
+ TP_PROTO(const char *s),
+ TP_ARGS(s),
+ TP_STRUCT__entry(
+ __string(s, s)
+ ),
+ TP_fast_assign(
+ __assign_str(s, s);
+ ),
+ TP_printk("%s", __get_str(s))
+);
+
+#endif /* _TRACE_CPUFREQ_INTERACTIVE_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/cpufreq_sched.h b/include/trace/events/cpufreq_sched.h
new file mode 100644
index 000000000000..a46cd088e969
--- /dev/null
+++ b/include/trace/events/cpufreq_sched.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (C) 2015 Steve Muckle <smuckle@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM cpufreq_sched
+
+#if !defined(_TRACE_CPUFREQ_SCHED_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_CPUFREQ_SCHED_H
+
+#include <linux/sched.h>
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(cpufreq_sched_throttled,
+ TP_PROTO(unsigned int rem),
+ TP_ARGS(rem),
+ TP_STRUCT__entry(
+ __field( unsigned int, rem)
+ ),
+ TP_fast_assign(
+ __entry->rem = rem;
+ ),
+ TP_printk("throttled - %d usec remaining", __entry->rem)
+);
+
+TRACE_EVENT(cpufreq_sched_request_opp,
+ TP_PROTO(int cpu,
+ unsigned long capacity,
+ unsigned int freq_new,
+ unsigned int requested_freq),
+ TP_ARGS(cpu, capacity, freq_new, requested_freq),
+ TP_STRUCT__entry(
+ __field( int, cpu)
+ __field( unsigned long, capacity)
+ __field( unsigned int, freq_new)
+ __field( unsigned int, requested_freq)
+ ),
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ __entry->capacity = capacity;
+ __entry->freq_new = freq_new;
+ __entry->requested_freq = requested_freq;
+ ),
+ TP_printk("cpu %d cap change, cluster cap request %ld => OPP %d "
+ "(cur %d)",
+ __entry->cpu, __entry->capacity, __entry->freq_new,
+ __entry->requested_freq)
+);
+
+TRACE_EVENT(cpufreq_sched_update_capacity,
+ TP_PROTO(int cpu,
+ bool request,
+ struct sched_capacity_reqs *scr,
+ unsigned long new_capacity),
+ TP_ARGS(cpu, request, scr, new_capacity),
+ TP_STRUCT__entry(
+ __field( int, cpu)
+ __field( bool, request)
+ __field( unsigned long, cfs)
+ __field( unsigned long, rt)
+ __field( unsigned long, dl)
+ __field( unsigned long, total)
+ __field( unsigned long, new_total)
+ ),
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ __entry->request = request;
+ __entry->cfs = scr->cfs;
+ __entry->rt = scr->rt;
+ __entry->dl = scr->dl;
+ __entry->total = scr->total;
+ __entry->new_total = new_capacity;
+ ),
+ TP_printk("cpu=%d set_cap=%d cfs=%ld rt=%ld dl=%ld old_tot=%ld "
+ "new_tot=%ld",
+ __entry->cpu, __entry->request, __entry->cfs, __entry->rt,
+ __entry->dl, __entry->total, __entry->new_total)
+);
+
+#endif /* _TRACE_CPUFREQ_SCHED_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index bbc4de9baef7..83c7a11daafa 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -6,20 +6,27 @@
#include <linux/tracepoint.h>
-#define show_dev(entry) MAJOR(entry->dev), MINOR(entry->dev)
-#define show_dev_ino(entry) show_dev(entry), (unsigned long)entry->ino
+#define show_dev(dev) MAJOR(dev), MINOR(dev)
+#define show_dev_ino(entry) show_dev(entry->dev), (unsigned long)entry->ino
#define show_block_type(type) \
__print_symbolic(type, \
{ NODE, "NODE" }, \
{ DATA, "DATA" }, \
{ META, "META" }, \
- { META_FLUSH, "META_FLUSH" })
+ { META_FLUSH, "META_FLUSH" }, \
+ { INMEM, "INMEM" }, \
+ { INMEM_DROP, "INMEM_DROP" }, \
+ { INMEM_INVALIDATE, "INMEM_INVALIDATE" }, \
+ { INMEM_REVOKE, "INMEM_REVOKE" }, \
+ { IPU, "IN-PLACE" }, \
+ { OPU, "OUT-OF-PLACE" })
#define F2FS_BIO_MASK(t) (t & (READA | WRITE_FLUSH_FUA))
#define F2FS_BIO_EXTRA_MASK(t) (t & (REQ_META | REQ_PRIO))
-#define show_bio_type(type) show_bio_base(type), show_bio_extra(type)
+#define show_bio_type(op, op_flags) \
+ show_bio_base((op|op_flags)), show_bio_extra((op|op_flags))
#define show_bio_base(type) \
__print_symbolic(F2FS_BIO_MASK(type), \
@@ -72,10 +79,14 @@
#define show_cpreason(type) \
__print_symbolic(type, \
{ CP_UMOUNT, "Umount" }, \
+ { CP_FASTBOOT, "Fastboot" }, \
{ CP_SYNC, "Sync" }, \
- { CP_DISCARD, "Discard" })
+ { CP_RECOVERY, "Recovery" }, \
+ { CP_DISCARD, "Discard" }, \
+ { CP_UMOUNT | CP_TRIMMED, "Umount,Trimmed" })
struct victim_sel_policy;
+struct f2fs_map_blocks;
DECLARE_EVENT_CLASS(f2fs__inode,
@@ -148,14 +159,14 @@ DEFINE_EVENT(f2fs__inode, f2fs_sync_file_enter,
TRACE_EVENT(f2fs_sync_file_exit,
- TP_PROTO(struct inode *inode, bool need_cp, int datasync, int ret),
+ TP_PROTO(struct inode *inode, int need_cp, int datasync, int ret),
TP_ARGS(inode, need_cp, datasync, ret),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(ino_t, ino)
- __field(bool, need_cp)
+ __field(int, need_cp)
__field(int, datasync)
__field(int, ret)
),
@@ -190,12 +201,12 @@ TRACE_EVENT(f2fs_sync_fs,
TP_fast_assign(
__entry->dev = sb->s_dev;
- __entry->dirty = F2FS_SB(sb)->s_dirty;
+ __entry->dirty = is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY);
__entry->wait = wait;
),
TP_printk("dev = (%d,%d), superblock is %s, wait = %d",
- show_dev(__entry),
+ show_dev(__entry->dev),
__entry->dirty ? "dirty" : "not dirty",
__entry->wait)
);
@@ -265,6 +276,13 @@ DEFINE_EVENT(f2fs__inode_exit, f2fs_unlink_exit,
TP_ARGS(inode, ret)
);
+DEFINE_EVENT(f2fs__inode_exit, f2fs_drop_inode,
+
+ TP_PROTO(struct inode *inode, int ret),
+
+ TP_ARGS(inode, ret)
+);
+
DEFINE_EVENT(f2fs__inode, f2fs_truncate,
TP_PROTO(struct inode *inode),
@@ -440,69 +458,64 @@ TRACE_EVENT(f2fs_truncate_partial_nodes,
__entry->err)
);
-TRACE_EVENT_CONDITION(f2fs_submit_page_bio,
-
- TP_PROTO(struct page *page, sector_t blkaddr, int type),
-
- TP_ARGS(page, blkaddr, type),
+TRACE_EVENT(f2fs_map_blocks,
+ TP_PROTO(struct inode *inode, struct f2fs_map_blocks *map, int ret),
- TP_CONDITION(page->mapping),
+ TP_ARGS(inode, map, ret),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(ino_t, ino)
- __field(pgoff_t, index)
- __field(sector_t, blkaddr)
- __field(int, type)
+ __field(block_t, m_lblk)
+ __field(block_t, m_pblk)
+ __field(unsigned int, m_len)
+ __field(int, ret)
),
TP_fast_assign(
- __entry->dev = page->mapping->host->i_sb->s_dev;
- __entry->ino = page->mapping->host->i_ino;
- __entry->index = page->index;
- __entry->blkaddr = blkaddr;
- __entry->type = type;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->m_lblk = map->m_lblk;
+ __entry->m_pblk = map->m_pblk;
+ __entry->m_len = map->m_len;
+ __entry->ret = ret;
),
- TP_printk("dev = (%d,%d), ino = %lu, page_index = 0x%lx, "
- "blkaddr = 0x%llx, bio_type = %s%s",
+ TP_printk("dev = (%d,%d), ino = %lu, file offset = %llu, "
+ "start blkaddr = 0x%llx, len = 0x%llx, err = %d",
show_dev_ino(__entry),
- (unsigned long)__entry->index,
- (unsigned long long)__entry->blkaddr,
- show_bio_type(__entry->type))
+ (unsigned long long)__entry->m_lblk,
+ (unsigned long long)__entry->m_pblk,
+ (unsigned long long)__entry->m_len,
+ __entry->ret)
);
-TRACE_EVENT(f2fs_get_data_block,
- TP_PROTO(struct inode *inode, sector_t iblock,
- struct buffer_head *bh, int ret),
+TRACE_EVENT(f2fs_background_gc,
+
+ TP_PROTO(struct super_block *sb, long wait_ms,
+ unsigned int prefree, unsigned int free),
- TP_ARGS(inode, iblock, bh, ret),
+ TP_ARGS(sb, wait_ms, prefree, free),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(ino_t, ino)
- __field(sector_t, iblock)
- __field(sector_t, bh_start)
- __field(size_t, bh_size)
- __field(int, ret)
+ __field(long, wait_ms)
+ __field(unsigned int, prefree)
+ __field(unsigned int, free)
),
TP_fast_assign(
- __entry->dev = inode->i_sb->s_dev;
- __entry->ino = inode->i_ino;
- __entry->iblock = iblock;
- __entry->bh_start = bh->b_blocknr;
- __entry->bh_size = bh->b_size;
- __entry->ret = ret;
+ __entry->dev = sb->s_dev;
+ __entry->wait_ms = wait_ms;
+ __entry->prefree = prefree;
+ __entry->free = free;
),
- TP_printk("dev = (%d,%d), ino = %lu, file offset = %llu, "
- "start blkaddr = 0x%llx, len = 0x%llx bytes, err = %d",
- show_dev_ino(__entry),
- (unsigned long long)__entry->iblock,
- (unsigned long long)__entry->bh_start,
- (unsigned long long)__entry->bh_size,
- __entry->ret)
+ TP_printk("dev = (%d,%d), wait_ms = %ld, prefree = %u, free = %u",
+ show_dev(__entry->dev),
+ __entry->wait_ms,
+ __entry->prefree,
+ __entry->free)
);
TRACE_EVENT(f2fs_get_victim,
@@ -541,7 +554,7 @@ TRACE_EVENT(f2fs_get_victim,
TP_printk("dev = (%d,%d), type = %s, policy = (%s, %s, %s), victim = %u "
"ofs_unit = %u, pre_victim_secno = %d, prefree = %u, free = %u",
- show_dev(__entry),
+ show_dev(__entry->dev),
show_data_type(__entry->type),
show_gc_type(__entry->gc_type),
show_alloc_mode(__entry->alloc_mode),
@@ -656,39 +669,101 @@ TRACE_EVENT(f2fs_direct_IO_exit,
__entry->ret)
);
-TRACE_EVENT(f2fs_reserve_new_block,
+TRACE_EVENT(f2fs_reserve_new_blocks,
- TP_PROTO(struct inode *inode, nid_t nid, unsigned int ofs_in_node),
+ TP_PROTO(struct inode *inode, nid_t nid, unsigned int ofs_in_node,
+ blkcnt_t count),
- TP_ARGS(inode, nid, ofs_in_node),
+ TP_ARGS(inode, nid, ofs_in_node, count),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(nid_t, nid)
__field(unsigned int, ofs_in_node)
+ __field(blkcnt_t, count)
),
TP_fast_assign(
__entry->dev = inode->i_sb->s_dev;
__entry->nid = nid;
__entry->ofs_in_node = ofs_in_node;
+ __entry->count = count;
),
- TP_printk("dev = (%d,%d), nid = %u, ofs_in_node = %u",
- show_dev(__entry),
+ TP_printk("dev = (%d,%d), nid = %u, ofs_in_node = %u, count = %llu",
+ show_dev(__entry->dev),
(unsigned int)__entry->nid,
- __entry->ofs_in_node)
+ __entry->ofs_in_node,
+ (unsigned long long)__entry->count)
+);
+
+DECLARE_EVENT_CLASS(f2fs__submit_page_bio,
+
+ TP_PROTO(struct page *page, struct f2fs_io_info *fio),
+
+ TP_ARGS(page, fio),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(ino_t, ino)
+ __field(pgoff_t, index)
+ __field(block_t, old_blkaddr)
+ __field(block_t, new_blkaddr)
+ __field(int, op)
+ __field(int, op_flags)
+ __field(int, type)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = page->mapping->host->i_sb->s_dev;
+ __entry->ino = page->mapping->host->i_ino;
+ __entry->index = page->index;
+ __entry->old_blkaddr = fio->old_blkaddr;
+ __entry->new_blkaddr = fio->new_blkaddr;
+ __entry->op = fio->op;
+ __entry->op_flags = fio->op_flags;
+ __entry->type = fio->type;
+ ),
+
+ TP_printk("dev = (%d,%d), ino = %lu, page_index = 0x%lx, "
+ "oldaddr = 0x%llx, newaddr = 0x%llx rw = %s%s, type = %s",
+ show_dev_ino(__entry),
+ (unsigned long)__entry->index,
+ (unsigned long long)__entry->old_blkaddr,
+ (unsigned long long)__entry->new_blkaddr,
+ show_bio_type(__entry->op, __entry->op_flags),
+ show_block_type(__entry->type))
);
-DECLARE_EVENT_CLASS(f2fs__submit_bio,
+DEFINE_EVENT_CONDITION(f2fs__submit_page_bio, f2fs_submit_page_bio,
- TP_PROTO(struct super_block *sb, int rw, int type, struct bio *bio),
+ TP_PROTO(struct page *page, struct f2fs_io_info *fio),
- TP_ARGS(sb, rw, type, bio),
+ TP_ARGS(page, fio),
+
+ TP_CONDITION(page->mapping)
+);
+
+DEFINE_EVENT_CONDITION(f2fs__submit_page_bio, f2fs_submit_page_mbio,
+
+ TP_PROTO(struct page *page, struct f2fs_io_info *fio),
+
+ TP_ARGS(page, fio),
+
+ TP_CONDITION(page->mapping)
+);
+
+DECLARE_EVENT_CLASS(f2fs__bio,
+
+ TP_PROTO(struct super_block *sb, int type, struct bio *bio),
+
+ TP_ARGS(sb, type, bio),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(int, rw)
+ __field(dev_t, target)
+ __field(int, op)
+ __field(int, op_flags)
__field(int, type)
__field(sector_t, sector)
__field(unsigned int, size)
@@ -696,34 +771,55 @@ DECLARE_EVENT_CLASS(f2fs__submit_bio,
TP_fast_assign(
__entry->dev = sb->s_dev;
- __entry->rw = rw;
+ __entry->target = bio->bi_bdev->bd_dev;
+ __entry->op = bio_op(bio);
+ __entry->op_flags = bio->bi_rw;
__entry->type = type;
__entry->sector = bio->bi_iter.bi_sector;
__entry->size = bio->bi_iter.bi_size;
),
- TP_printk("dev = (%d,%d), %s%s, %s, sector = %lld, size = %u",
- show_dev(__entry),
- show_bio_type(__entry->rw),
+ TP_printk("dev = (%d,%d)/(%d,%d), rw = %s%s, %s, sector = %lld, size = %u",
+ show_dev(__entry->target),
+ show_dev(__entry->dev),
+ show_bio_type(__entry->op, __entry->op_flags),
show_block_type(__entry->type),
(unsigned long long)__entry->sector,
__entry->size)
);
-DEFINE_EVENT_CONDITION(f2fs__submit_bio, f2fs_submit_write_bio,
+DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_prepare_write_bio,
- TP_PROTO(struct super_block *sb, int rw, int type, struct bio *bio),
+ TP_PROTO(struct super_block *sb, int type, struct bio *bio),
- TP_ARGS(sb, rw, type, bio),
+ TP_ARGS(sb, type, bio),
TP_CONDITION(bio)
);
-DEFINE_EVENT_CONDITION(f2fs__submit_bio, f2fs_submit_read_bio,
+DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_prepare_read_bio,
- TP_PROTO(struct super_block *sb, int rw, int type, struct bio *bio),
+ TP_PROTO(struct super_block *sb, int type, struct bio *bio),
- TP_ARGS(sb, rw, type, bio),
+ TP_ARGS(sb, type, bio),
+
+ TP_CONDITION(bio)
+);
+
+DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_submit_read_bio,
+
+ TP_PROTO(struct super_block *sb, int type, struct bio *bio),
+
+ TP_ARGS(sb, type, bio),
+
+ TP_CONDITION(bio)
+);
+
+DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_submit_write_bio,
+
+ TP_PROTO(struct super_block *sb, int type, struct bio *bio),
+
+ TP_ARGS(sb, type, bio),
TP_CONDITION(bio)
);
@@ -831,6 +927,13 @@ DEFINE_EVENT(f2fs__page, f2fs_writepage,
TP_ARGS(page, type)
);
+DEFINE_EVENT(f2fs__page, f2fs_do_write_data_page,
+
+ TP_PROTO(struct page *page, int type),
+
+ TP_ARGS(page, type)
+);
+
DEFINE_EVENT(f2fs__page, f2fs_readpage,
TP_PROTO(struct page *page, int type),
@@ -852,6 +955,20 @@ DEFINE_EVENT(f2fs__page, f2fs_vm_page_mkwrite,
TP_ARGS(page, type)
);
+DEFINE_EVENT(f2fs__page, f2fs_register_inmem_page,
+
+ TP_PROTO(struct page *page, int type),
+
+ TP_ARGS(page, type)
+);
+
+DEFINE_EVENT(f2fs__page, f2fs_commit_inmem_page,
+
+ TP_PROTO(struct page *page, int type),
+
+ TP_ARGS(page, type)
+);
+
TRACE_EVENT(f2fs_writepages,
TP_PROTO(struct inode *inode, struct writeback_control *wbc, int type),
@@ -916,36 +1033,30 @@ TRACE_EVENT(f2fs_writepages,
__entry->for_sync)
);
-TRACE_EVENT(f2fs_submit_page_mbio,
+TRACE_EVENT(f2fs_readpages,
- TP_PROTO(struct page *page, int rw, int type, block_t blk_addr),
+ TP_PROTO(struct inode *inode, struct page *page, unsigned int nrpage),
- TP_ARGS(page, rw, type, blk_addr),
+ TP_ARGS(inode, page, nrpage),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(ino_t, ino)
- __field(int, rw)
- __field(int, type)
- __field(pgoff_t, index)
- __field(block_t, block)
+ __field(pgoff_t, start)
+ __field(unsigned int, nrpage)
),
TP_fast_assign(
- __entry->dev = page->mapping->host->i_sb->s_dev;
- __entry->ino = page->mapping->host->i_ino;
- __entry->rw = rw;
- __entry->type = type;
- __entry->index = page->index;
- __entry->block = blk_addr;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->start = page->index;
+ __entry->nrpage = nrpage;
),
- TP_printk("dev = (%d,%d), ino = %lu, %s%s, %s, index = %lu, blkaddr = 0x%llx",
+ TP_printk("dev = (%d,%d), ino = %lu, start = %lu nrpage = %u",
show_dev_ino(__entry),
- show_bio_type(__entry->rw),
- show_block_type(__entry->type),
- (unsigned long)__entry->index,
- (unsigned long long)__entry->block)
+ (unsigned long)__entry->start,
+ __entry->nrpage)
);
TRACE_EVENT(f2fs_write_checkpoint,
@@ -967,16 +1078,16 @@ TRACE_EVENT(f2fs_write_checkpoint,
),
TP_printk("dev = (%d,%d), checkpoint for %s, state = %s",
- show_dev(__entry),
+ show_dev(__entry->dev),
show_cpreason(__entry->reason),
__entry->msg)
);
-TRACE_EVENT(f2fs_issue_discard,
+DECLARE_EVENT_CLASS(f2fs_discard,
- TP_PROTO(struct super_block *sb, block_t blkstart, block_t blklen),
+ TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen),
- TP_ARGS(sb, blkstart, blklen),
+ TP_ARGS(dev, blkstart, blklen),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -985,40 +1096,256 @@ TRACE_EVENT(f2fs_issue_discard,
),
TP_fast_assign(
- __entry->dev = sb->s_dev;
+ __entry->dev = dev->bd_dev;
__entry->blkstart = blkstart;
__entry->blklen = blklen;
),
TP_printk("dev = (%d,%d), blkstart = 0x%llx, blklen = 0x%llx",
- show_dev(__entry),
+ show_dev(__entry->dev),
(unsigned long long)__entry->blkstart,
(unsigned long long)__entry->blklen)
);
+DEFINE_EVENT(f2fs_discard, f2fs_queue_discard,
+
+ TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen),
+
+ TP_ARGS(dev, blkstart, blklen)
+);
+
+DEFINE_EVENT(f2fs_discard, f2fs_issue_discard,
+
+ TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen),
+
+ TP_ARGS(dev, blkstart, blklen)
+);
+
+TRACE_EVENT(f2fs_issue_reset_zone,
+
+ TP_PROTO(struct block_device *dev, block_t blkstart),
+
+ TP_ARGS(dev, blkstart),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(block_t, blkstart)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = dev->bd_dev;
+ __entry->blkstart = blkstart;
+ ),
+
+ TP_printk("dev = (%d,%d), reset zone at block = 0x%llx",
+ show_dev(__entry->dev),
+ (unsigned long long)__entry->blkstart)
+);
+
TRACE_EVENT(f2fs_issue_flush,
- TP_PROTO(struct super_block *sb, bool nobarrier, bool flush_merge),
+ TP_PROTO(struct block_device *dev, unsigned int nobarrier,
+ unsigned int flush_merge, int ret),
- TP_ARGS(sb, nobarrier, flush_merge),
+ TP_ARGS(dev, nobarrier, flush_merge, ret),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(bool, nobarrier)
- __field(bool, flush_merge)
+ __field(unsigned int, nobarrier)
+ __field(unsigned int, flush_merge)
+ __field(int, ret)
),
TP_fast_assign(
- __entry->dev = sb->s_dev;
+ __entry->dev = dev->bd_dev;
__entry->nobarrier = nobarrier;
__entry->flush_merge = flush_merge;
+ __entry->ret = ret;
),
- TP_printk("dev = (%d,%d), %s %s",
- show_dev(__entry),
+ TP_printk("dev = (%d,%d), %s %s, ret = %d",
+ show_dev(__entry->dev),
__entry->nobarrier ? "skip (nobarrier)" : "issue",
- __entry->flush_merge ? " with flush_merge" : "")
+ __entry->flush_merge ? " with flush_merge" : "",
+ __entry->ret)
+);
+
+TRACE_EVENT(f2fs_lookup_extent_tree_start,
+
+ TP_PROTO(struct inode *inode, unsigned int pgofs),
+
+ TP_ARGS(inode, pgofs),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(ino_t, ino)
+ __field(unsigned int, pgofs)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->pgofs = pgofs;
+ ),
+
+ TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u",
+ show_dev_ino(__entry),
+ __entry->pgofs)
+);
+
+TRACE_EVENT_CONDITION(f2fs_lookup_extent_tree_end,
+
+ TP_PROTO(struct inode *inode, unsigned int pgofs,
+ struct extent_info *ei),
+
+ TP_ARGS(inode, pgofs, ei),
+
+ TP_CONDITION(ei),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(ino_t, ino)
+ __field(unsigned int, pgofs)
+ __field(unsigned int, fofs)
+ __field(u32, blk)
+ __field(unsigned int, len)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->pgofs = pgofs;
+ __entry->fofs = ei->fofs;
+ __entry->blk = ei->blk;
+ __entry->len = ei->len;
+ ),
+
+ TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, "
+ "ext_info(fofs: %u, blk: %u, len: %u)",
+ show_dev_ino(__entry),
+ __entry->pgofs,
+ __entry->fofs,
+ __entry->blk,
+ __entry->len)
+);
+
+TRACE_EVENT(f2fs_update_extent_tree_range,
+
+ TP_PROTO(struct inode *inode, unsigned int pgofs, block_t blkaddr,
+ unsigned int len),
+
+ TP_ARGS(inode, pgofs, blkaddr, len),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(ino_t, ino)
+ __field(unsigned int, pgofs)
+ __field(u32, blk)
+ __field(unsigned int, len)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->pgofs = pgofs;
+ __entry->blk = blkaddr;
+ __entry->len = len;
+ ),
+
+ TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, "
+ "blkaddr = %u, len = %u",
+ show_dev_ino(__entry),
+ __entry->pgofs,
+ __entry->blk,
+ __entry->len)
);
+
+TRACE_EVENT(f2fs_shrink_extent_tree,
+
+ TP_PROTO(struct f2fs_sb_info *sbi, unsigned int node_cnt,
+ unsigned int tree_cnt),
+
+ TP_ARGS(sbi, node_cnt, tree_cnt),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, node_cnt)
+ __field(unsigned int, tree_cnt)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = sbi->sb->s_dev;
+ __entry->node_cnt = node_cnt;
+ __entry->tree_cnt = tree_cnt;
+ ),
+
+ TP_printk("dev = (%d,%d), shrunk: node_cnt = %u, tree_cnt = %u",
+ show_dev(__entry->dev),
+ __entry->node_cnt,
+ __entry->tree_cnt)
+);
+
+TRACE_EVENT(f2fs_destroy_extent_tree,
+
+ TP_PROTO(struct inode *inode, unsigned int node_cnt),
+
+ TP_ARGS(inode, node_cnt),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(ino_t, ino)
+ __field(unsigned int, node_cnt)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->node_cnt = node_cnt;
+ ),
+
+ TP_printk("dev = (%d,%d), ino = %lu, destroyed: node_cnt = %u",
+ show_dev_ino(__entry),
+ __entry->node_cnt)
+);
+
+DECLARE_EVENT_CLASS(f2fs_sync_dirty_inodes,
+
+ TP_PROTO(struct super_block *sb, int type, s64 count),
+
+ TP_ARGS(sb, type, count),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int, type)
+ __field(s64, count)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = sb->s_dev;
+ __entry->type = type;
+ __entry->count = count;
+ ),
+
+ TP_printk("dev = (%d,%d), %s, dirty count = %lld",
+ show_dev(__entry->dev),
+ show_file_type(__entry->type),
+ __entry->count)
+);
+
+DEFINE_EVENT(f2fs_sync_dirty_inodes, f2fs_sync_dirty_inodes_enter,
+
+ TP_PROTO(struct super_block *sb, int type, s64 count),
+
+ TP_ARGS(sb, type, count)
+);
+
+DEFINE_EVENT(f2fs_sync_dirty_inodes, f2fs_sync_dirty_inodes_exit,
+
+ TP_PROTO(struct super_block *sb, int type, s64 count),
+
+ TP_ARGS(sb, type, count)
+);
+
#endif /* _TRACE_F2FS_H */
/* This part must be outside protection */
diff --git a/include/trace/events/gpu.h b/include/trace/events/gpu.h
new file mode 100644
index 000000000000..7e15cdfafe5a
--- /dev/null
+++ b/include/trace/events/gpu.h
@@ -0,0 +1,143 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM gpu
+
+#if !defined(_TRACE_GPU_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_GPU_H
+
+#include <linux/tracepoint.h>
+#include <linux/time.h>
+
+#define show_secs_from_ns(ns) \
+ ({ \
+ u64 t = ns + (NSEC_PER_USEC / 2); \
+ do_div(t, NSEC_PER_SEC); \
+ t; \
+ })
+
+#define show_usecs_from_ns(ns) \
+ ({ \
+ u64 t = ns + (NSEC_PER_USEC / 2) ; \
+ u32 rem; \
+ do_div(t, NSEC_PER_USEC); \
+ rem = do_div(t, USEC_PER_SEC); \
+ })
+
+/*
+ * The gpu_sched_switch event indicates that a switch from one GPU context to
+ * another occurred on one of the GPU hardware blocks.
+ *
+ * The gpu_name argument identifies the GPU hardware block. Each independently
+ * scheduled GPU hardware block should have a different name. This may be used
+ * in different ways for different GPUs. For example, if a GPU includes
+ * multiple processing cores it may use names "GPU 0", "GPU 1", etc. If a GPU
+ * includes a separately scheduled 2D and 3D hardware block, it might use the
+ * names "2D" and "3D".
+ *
+ * The timestamp argument is the timestamp at which the switch occurred on the
+ * GPU. These timestamps are in units of nanoseconds and must use
+ * approximately the same time as sched_clock, though they need not come from
+ * any CPU clock. The timestamps for a single hardware block must be
+ * monotonically nondecreasing. This means that if a variable compensation
+ * offset is used to translate from some other clock to the sched_clock, then
+ * care must be taken when increasing that offset, and doing so may result in
+ * multiple events with the same timestamp.
+ *
+ * The next_ctx_id argument identifies the next context that was running on
+ * the GPU hardware block. A value of 0 indicates that the hardware block
+ * will be idle.
+ *
+ * The next_prio argument indicates the priority of the next context at the
+ * time of the event. The exact numeric values may mean different things for
+ * different GPUs, but they should follow the rule that lower values indicate a
+ * higher priority.
+ *
+ * The next_job_id argument identifies the batch of work that the GPU will be
+ * working on. This should correspond to a job_id that was previously traced
+ * as a gpu_job_enqueue event when the batch of work was created.
+ */
+TRACE_EVENT(gpu_sched_switch,
+
+ TP_PROTO(const char *gpu_name, u64 timestamp,
+ u32 next_ctx_id, s32 next_prio, u32 next_job_id),
+
+ TP_ARGS(gpu_name, timestamp, next_ctx_id, next_prio, next_job_id),
+
+ TP_STRUCT__entry(
+ __string( gpu_name, gpu_name )
+ __field( u64, timestamp )
+ __field( u32, next_ctx_id )
+ __field( s32, next_prio )
+ __field( u32, next_job_id )
+ ),
+
+ TP_fast_assign(
+ __assign_str(gpu_name, gpu_name);
+ __entry->timestamp = timestamp;
+ __entry->next_ctx_id = next_ctx_id;
+ __entry->next_prio = next_prio;
+ __entry->next_job_id = next_job_id;
+ ),
+
+ TP_printk("gpu_name=%s ts=%llu.%06lu next_ctx_id=%lu next_prio=%ld "
+ "next_job_id=%lu",
+ __get_str(gpu_name),
+ (unsigned long long)show_secs_from_ns(__entry->timestamp),
+ (unsigned long)show_usecs_from_ns(__entry->timestamp),
+ (unsigned long)__entry->next_ctx_id,
+ (long)__entry->next_prio,
+ (unsigned long)__entry->next_job_id)
+);
+
+/*
+ * The gpu_job_enqueue event indicates that a batch of work has been queued up
+ * to be processed by the GPU. This event is not intended to indicate that
+ * the batch of work has been submitted to the GPU hardware, but rather that
+ * it has been submitted to the GPU kernel driver.
+ *
+ * This event should be traced on the thread that initiated the work being
+ * queued. For example, if a batch of work is submitted to the kernel by a
+ * userland thread, the event should be traced on that thread.
+ *
+ * The ctx_id field identifies the GPU context in which the batch of work
+ * being queued is to be run.
+ *
+ * The job_id field identifies the batch of work being queued within the given
+ * GPU context. The first batch of work submitted for a given GPU context
+ * should have a job_id of 0, and each subsequent batch of work should
+ * increment the job_id by 1.
+ *
+ * The type field identifies the type of the job being enqueued. The job
+ * types may be different for different GPU hardware. For example, a GPU may
+ * differentiate between "2D", "3D", and "compute" jobs.
+ */
+TRACE_EVENT(gpu_job_enqueue,
+
+ TP_PROTO(u32 ctx_id, u32 job_id, const char *type),
+
+ TP_ARGS(ctx_id, job_id, type),
+
+ TP_STRUCT__entry(
+ __field( u32, ctx_id )
+ __field( u32, job_id )
+ __string( type, type )
+ ),
+
+ TP_fast_assign(
+ __entry->ctx_id = ctx_id;
+ __entry->job_id = job_id;
+ __assign_str(type, type);
+ ),
+
+ TP_printk("ctx_id=%lu job_id=%lu type=%s",
+ (unsigned long)__entry->ctx_id,
+ (unsigned long)__entry->job_id,
+ __get_str(type))
+);
+
+#undef show_secs_from_ns
+#undef show_usecs_from_ns
+
+#endif /* _TRACE_GPU_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/mmc.h b/include/trace/events/mmc.h
new file mode 100644
index 000000000000..82b368dbcefc
--- /dev/null
+++ b/include/trace/events/mmc.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (C) 2013 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM mmc
+
+#if !defined(_TRACE_MMC_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_MMC_H
+
+#include <linux/tracepoint.h>
+#include <linux/mmc/mmc.h>
+#include <linux/mmc/core.h>
+
+/*
+ * Unconditional logging of mmc block erase operations,
+ * including cmd, address, size
+ */
+DECLARE_EVENT_CLASS(mmc_blk_erase_class,
+ TP_PROTO(unsigned int cmd, unsigned int addr, unsigned int size),
+ TP_ARGS(cmd, addr, size),
+ TP_STRUCT__entry(
+ __field(unsigned int, cmd)
+ __field(unsigned int, addr)
+ __field(unsigned int, size)
+ ),
+ TP_fast_assign(
+ __entry->cmd = cmd;
+ __entry->addr = addr;
+ __entry->size = size;
+ ),
+ TP_printk("cmd=%u,addr=0x%08x,size=0x%08x",
+ __entry->cmd, __entry->addr, __entry->size)
+);
+
+DEFINE_EVENT(mmc_blk_erase_class, mmc_blk_erase_start,
+ TP_PROTO(unsigned int cmd, unsigned int addr, unsigned int size),
+ TP_ARGS(cmd, addr, size));
+
+DEFINE_EVENT(mmc_blk_erase_class, mmc_blk_erase_end,
+ TP_PROTO(unsigned int cmd, unsigned int addr, unsigned int size),
+ TP_ARGS(cmd, addr, size));
+
+/*
+ * Logging of start of read or write mmc block operation,
+ * including cmd, address, size
+ */
+DECLARE_EVENT_CLASS(mmc_blk_rw_class,
+ TP_PROTO(unsigned int cmd, unsigned int addr, struct mmc_data *data),
+ TP_ARGS(cmd, addr, data),
+ TP_STRUCT__entry(
+ __field(unsigned int, cmd)
+ __field(unsigned int, addr)
+ __field(unsigned int, size)
+ ),
+ TP_fast_assign(
+ __entry->cmd = cmd;
+ __entry->addr = addr;
+ __entry->size = data->blocks;
+ ),
+ TP_printk("cmd=%u,addr=0x%08x,size=0x%08x",
+ __entry->cmd, __entry->addr, __entry->size)
+);
+
+DEFINE_EVENT_CONDITION(mmc_blk_rw_class, mmc_blk_rw_start,
+ TP_PROTO(unsigned int cmd, unsigned int addr, struct mmc_data *data),
+ TP_ARGS(cmd, addr, data),
+ TP_CONDITION(((cmd == MMC_READ_MULTIPLE_BLOCK) ||
+ (cmd == MMC_WRITE_MULTIPLE_BLOCK)) &&
+ data));
+
+DEFINE_EVENT_CONDITION(mmc_blk_rw_class, mmc_blk_rw_end,
+ TP_PROTO(unsigned int cmd, unsigned int addr, struct mmc_data *data),
+ TP_ARGS(cmd, addr, data),
+ TP_CONDITION(((cmd == MMC_READ_MULTIPLE_BLOCK) ||
+ (cmd == MMC_WRITE_MULTIPLE_BLOCK)) &&
+ data));
+#endif /* _TRACE_MMC_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/net.h b/include/trace/events/net.h
index 1de256b35807..c978ecadbb7c 100644
--- a/include/trace/events/net.h
+++ b/include/trace/events/net.h
@@ -57,7 +57,7 @@ TRACE_EVENT(net_dev_start_xmit,
__entry->gso_type = skb_shinfo(skb)->gso_type;
),
- TP_printk("dev=%s queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d len=%u data_len=%u network_offset=%d transport_offset_valid=%d transport_offset=%d tx_flags=%d gso_size=%d gso_segs=%d gso_type=%#x",
+ TP_printk("dev=%s queue_mapping=%u skbaddr=%pK vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d len=%u data_len=%u network_offset=%d transport_offset_valid=%d transport_offset=%d tx_flags=%d gso_size=%d gso_segs=%d gso_type=%#x",
__get_str(name), __entry->queue_mapping, __entry->skbaddr,
__entry->vlan_tagged, __entry->vlan_proto, __entry->vlan_tci,
__entry->protocol, __entry->ip_summed, __entry->len,
@@ -90,7 +90,7 @@ TRACE_EVENT(net_dev_xmit,
__assign_str(name, dev->name);
),
- TP_printk("dev=%s skbaddr=%p len=%u rc=%d",
+ TP_printk("dev=%s skbaddr=%pK len=%u rc=%d",
__get_str(name), __entry->skbaddr, __entry->len, __entry->rc)
);
@@ -112,7 +112,7 @@ DECLARE_EVENT_CLASS(net_dev_template,
__assign_str(name, skb->dev->name);
),
- TP_printk("dev=%s skbaddr=%p len=%u",
+ TP_printk("dev=%s skbaddr=%pK len=%u",
__get_str(name), __entry->skbaddr, __entry->len)
)
@@ -191,7 +191,7 @@ DECLARE_EVENT_CLASS(net_dev_rx_verbose_template,
__entry->gso_type = skb_shinfo(skb)->gso_type;
),
- TP_printk("dev=%s napi_id=%#x queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d hash=0x%08x l4_hash=%d len=%u data_len=%u truesize=%u mac_header_valid=%d mac_header=%d nr_frags=%d gso_size=%d gso_type=%#x",
+ TP_printk("dev=%s napi_id=%#x queue_mapping=%u skbaddr=%pK vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d hash=0x%08x l4_hash=%d len=%u data_len=%u truesize=%u mac_header_valid=%d mac_header=%d nr_frags=%d gso_size=%d gso_type=%#x",
__get_str(name), __entry->napi_id, __entry->queue_mapping,
__entry->skbaddr, __entry->vlan_tagged, __entry->vlan_proto,
__entry->vlan_tci, __entry->protocol, __entry->ip_summed,
diff --git a/include/trace/events/power.h b/include/trace/events/power.h
index d19840b0cac8..28ad66f6deed 100644
--- a/include/trace/events/power.h
+++ b/include/trace/events/power.h
@@ -111,6 +111,38 @@ DEFINE_EVENT(cpu, cpu_frequency,
TP_ARGS(frequency, cpu_id)
);
+TRACE_EVENT(cpu_frequency_limits,
+
+ TP_PROTO(unsigned int max_freq, unsigned int min_freq,
+ unsigned int cpu_id),
+
+ TP_ARGS(max_freq, min_freq, cpu_id),
+
+ TP_STRUCT__entry(
+ __field( u32, min_freq )
+ __field( u32, max_freq )
+ __field( u32, cpu_id )
+ ),
+
+ TP_fast_assign(
+ __entry->min_freq = min_freq;
+ __entry->max_freq = max_freq;
+ __entry->cpu_id = cpu_id;
+ ),
+
+ TP_printk("min=%lu max=%lu cpu_id=%lu",
+ (unsigned long)__entry->min_freq,
+ (unsigned long)__entry->max_freq,
+ (unsigned long)__entry->cpu_id)
+);
+
+DEFINE_EVENT(cpu, cpu_capacity,
+
+ TP_PROTO(unsigned int capacity, unsigned int cpu_id),
+
+ TP_ARGS(capacity, cpu_id)
+);
+
TRACE_EVENT(device_pm_callback_start,
TP_PROTO(struct device *dev, const char *pm_ops, int event),
@@ -264,6 +296,25 @@ DEFINE_EVENT(clock, clock_set_rate,
TP_ARGS(name, state, cpu_id)
);
+TRACE_EVENT(clock_set_parent,
+
+ TP_PROTO(const char *name, const char *parent_name),
+
+ TP_ARGS(name, parent_name),
+
+ TP_STRUCT__entry(
+ __string( name, name )
+ __string( parent_name, parent_name )
+ ),
+
+ TP_fast_assign(
+ __assign_str(name, name);
+ __assign_str(parent_name, parent_name);
+ ),
+
+ TP_printk("%s parent=%s", __get_str(name), __get_str(parent_name))
+);
+
/*
* The power domain events are used for power domains transitions
*/
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index a7d67bc14906..14497b87a90a 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -179,6 +179,31 @@ TRACE_EVENT(sched_migrate_task,
__entry->orig_cpu, __entry->dest_cpu)
);
+/*
+ * Tracepoint for a CPU going offline/online:
+ */
+TRACE_EVENT(sched_cpu_hotplug,
+
+ TP_PROTO(int affected_cpu, int error, int status),
+
+ TP_ARGS(affected_cpu, error, status),
+
+ TP_STRUCT__entry(
+ __field( int, affected_cpu )
+ __field( int, error )
+ __field( int, status )
+ ),
+
+ TP_fast_assign(
+ __entry->affected_cpu = affected_cpu;
+ __entry->error = error;
+ __entry->status = status;
+ ),
+
+ TP_printk("cpu %d %s error=%d", __entry->affected_cpu,
+ __entry->status ? "online" : "offline", __entry->error)
+);
+
DECLARE_EVENT_CLASS(sched_process_template,
TP_PROTO(struct task_struct *p),
@@ -207,7 +232,7 @@ DECLARE_EVENT_CLASS(sched_process_template,
DEFINE_EVENT(sched_process_template, sched_process_free,
TP_PROTO(struct task_struct *p),
TP_ARGS(p));
-
+
/*
* Tracepoint for a task exiting:
@@ -362,6 +387,30 @@ DEFINE_EVENT(sched_stat_template, sched_stat_blocked,
TP_ARGS(tsk, delay));
/*
+ * Tracepoint for recording the cause of uninterruptible sleep.
+ */
+TRACE_EVENT(sched_blocked_reason,
+
+ TP_PROTO(struct task_struct *tsk),
+
+ TP_ARGS(tsk),
+
+ TP_STRUCT__entry(
+ __field( pid_t, pid )
+ __field( void*, caller )
+ __field( bool, io_wait )
+ ),
+
+ TP_fast_assign(
+ __entry->pid = tsk->pid;
+ __entry->caller = (void*)get_wchan(tsk);
+ __entry->io_wait = tsk->in_iowait;
+ ),
+
+ TP_printk("pid=%d iowait=%d caller=%pS", __entry->pid, __entry->io_wait, __entry->caller)
+);
+
+/*
* Tracepoint for accounting runtime (time the task is executing
* on a CPU).
*/
@@ -550,6 +599,514 @@ TRACE_EVENT(sched_wake_idle_without_ipi,
TP_printk("cpu=%d", __entry->cpu)
);
+
+TRACE_EVENT(sched_contrib_scale_f,
+
+ TP_PROTO(int cpu, unsigned long freq_scale_factor,
+ unsigned long cpu_scale_factor),
+
+ TP_ARGS(cpu, freq_scale_factor, cpu_scale_factor),
+
+ TP_STRUCT__entry(
+ __field(int, cpu)
+ __field(unsigned long, freq_scale_factor)
+ __field(unsigned long, cpu_scale_factor)
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ __entry->freq_scale_factor = freq_scale_factor;
+ __entry->cpu_scale_factor = cpu_scale_factor;
+ ),
+
+ TP_printk("cpu=%d freq_scale_factor=%lu cpu_scale_factor=%lu",
+ __entry->cpu, __entry->freq_scale_factor,
+ __entry->cpu_scale_factor)
+);
+
+#ifdef CONFIG_SMP
+
+/*
+ * Tracepoint for accounting sched averages for tasks.
+ */
+TRACE_EVENT(sched_load_avg_task,
+
+ TP_PROTO(struct task_struct *tsk, struct sched_avg *avg),
+
+ TP_ARGS(tsk, avg),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field( int, cpu )
+ __field( unsigned long, load_avg )
+ __field( unsigned long, util_avg )
+ __field( u64, load_sum )
+ __field( u32, util_sum )
+ __field( u32, period_contrib )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->cpu = task_cpu(tsk);
+ __entry->load_avg = avg->load_avg;
+ __entry->util_avg = avg->util_avg;
+ __entry->load_sum = avg->load_sum;
+ __entry->util_sum = avg->util_sum;
+ __entry->period_contrib = avg->period_contrib;
+ ),
+
+ TP_printk("comm=%s pid=%d cpu=%d load_avg=%lu util_avg=%lu load_sum=%llu"
+ " util_sum=%u period_contrib=%u",
+ __entry->comm,
+ __entry->pid,
+ __entry->cpu,
+ __entry->load_avg,
+ __entry->util_avg,
+ (u64)__entry->load_sum,
+ (u32)__entry->util_sum,
+ (u32)__entry->period_contrib)
+);
+
+/*
+ * Tracepoint for accounting sched averages for cpus.
+ */
+TRACE_EVENT(sched_load_avg_cpu,
+
+ TP_PROTO(int cpu, struct cfs_rq *cfs_rq),
+
+ TP_ARGS(cpu, cfs_rq),
+
+ TP_STRUCT__entry(
+ __field( int, cpu )
+ __field( unsigned long, load_avg )
+ __field( unsigned long, util_avg )
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ __entry->load_avg = cfs_rq->avg.load_avg;
+ __entry->util_avg = cfs_rq->avg.util_avg;
+ ),
+
+ TP_printk("cpu=%d load_avg=%lu util_avg=%lu",
+ __entry->cpu, __entry->load_avg, __entry->util_avg)
+);
+
+/*
+ * Tracepoint for sched_tune_config settings
+ */
+TRACE_EVENT(sched_tune_config,
+
+ TP_PROTO(int boost, int pb_nrg_gain, int pb_cap_gain, int pc_nrg_gain, int pc_cap_gain),
+
+ TP_ARGS(boost, pb_nrg_gain, pb_cap_gain, pc_nrg_gain, pc_cap_gain),
+
+ TP_STRUCT__entry(
+ __field( int, boost )
+ __field( int, pb_nrg_gain )
+ __field( int, pb_cap_gain )
+ __field( int, pc_nrg_gain )
+ __field( int, pc_cap_gain )
+ ),
+
+ TP_fast_assign(
+ __entry->boost = boost;
+ __entry->pb_nrg_gain = pb_nrg_gain;
+ __entry->pb_cap_gain = pb_cap_gain;
+ __entry->pc_nrg_gain = pc_nrg_gain;
+ __entry->pc_cap_gain = pc_cap_gain;
+ ),
+
+ TP_printk("boost=%d "
+ "pb_nrg_gain=%d pb_cap_gain=%d "
+ "pc_nrg_gain=%d pc_cap_gain=%d",
+ __entry->boost,
+ __entry->pb_nrg_gain, __entry->pb_cap_gain,
+ __entry->pc_nrg_gain, __entry->pc_cap_gain)
+);
+
+/*
+ * Tracepoint for accounting CPU boosted utilization
+ */
+TRACE_EVENT(sched_boost_cpu,
+
+ TP_PROTO(int cpu, unsigned long util, long margin),
+
+ TP_ARGS(cpu, util, margin),
+
+ TP_STRUCT__entry(
+ __field( int, cpu )
+ __field( unsigned long, util )
+ __field(long, margin )
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ __entry->util = util;
+ __entry->margin = margin;
+ ),
+
+ TP_printk("cpu=%d util=%lu margin=%ld",
+ __entry->cpu,
+ __entry->util,
+ __entry->margin)
+);
+
+/*
+ * Tracepoint for schedtune_tasks_update
+ */
+TRACE_EVENT(sched_tune_tasks_update,
+
+ TP_PROTO(struct task_struct *tsk, int cpu, int tasks, int idx,
+ int boost, int max_boost),
+
+ TP_ARGS(tsk, cpu, tasks, idx, boost, max_boost),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field( int, cpu )
+ __field( int, tasks )
+ __field( int, idx )
+ __field( int, boost )
+ __field( int, max_boost )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->cpu = cpu;
+ __entry->tasks = tasks;
+ __entry->idx = idx;
+ __entry->boost = boost;
+ __entry->max_boost = max_boost;
+ ),
+
+ TP_printk("pid=%d comm=%s "
+ "cpu=%d tasks=%d idx=%d boost=%d max_boost=%d",
+ __entry->pid, __entry->comm,
+ __entry->cpu, __entry->tasks, __entry->idx,
+ __entry->boost, __entry->max_boost)
+);
+
+/*
+ * Tracepoint for schedtune_boostgroup_update
+ */
+TRACE_EVENT(sched_tune_boostgroup_update,
+
+ TP_PROTO(int cpu, int variation, int max_boost),
+
+ TP_ARGS(cpu, variation, max_boost),
+
+ TP_STRUCT__entry(
+ __field( int, cpu )
+ __field( int, variation )
+ __field( int, max_boost )
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ __entry->variation = variation;
+ __entry->max_boost = max_boost;
+ ),
+
+ TP_printk("cpu=%d variation=%d max_boost=%d",
+ __entry->cpu, __entry->variation, __entry->max_boost)
+);
+
+/*
+ * Tracepoint for accounting task boosted utilization
+ */
+TRACE_EVENT(sched_boost_task,
+
+ TP_PROTO(struct task_struct *tsk, unsigned long util, long margin),
+
+ TP_ARGS(tsk, util, margin),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field( unsigned long, util )
+ __field( long, margin )
+
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->util = util;
+ __entry->margin = margin;
+ ),
+
+ TP_printk("comm=%s pid=%d util=%lu margin=%ld",
+ __entry->comm, __entry->pid,
+ __entry->util,
+ __entry->margin)
+);
+
+/*
+ * Tracepoint for accounting sched group energy
+ */
+TRACE_EVENT(sched_energy_diff,
+
+ TP_PROTO(struct task_struct *tsk, int scpu, int dcpu, int udelta,
+ int nrgb, int nrga, int nrgd, int capb, int capa, int capd,
+ int nrgn, int nrgp),
+
+ TP_ARGS(tsk, scpu, dcpu, udelta,
+ nrgb, nrga, nrgd, capb, capa, capd,
+ nrgn, nrgp),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field( int, scpu )
+ __field( int, dcpu )
+ __field( int, udelta )
+ __field( int, nrgb )
+ __field( int, nrga )
+ __field( int, nrgd )
+ __field( int, capb )
+ __field( int, capa )
+ __field( int, capd )
+ __field( int, nrgn )
+ __field( int, nrgp )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->scpu = scpu;
+ __entry->dcpu = dcpu;
+ __entry->udelta = udelta;
+ __entry->nrgb = nrgb;
+ __entry->nrga = nrga;
+ __entry->nrgd = nrgd;
+ __entry->capb = capb;
+ __entry->capa = capa;
+ __entry->capd = capd;
+ __entry->nrgn = nrgn;
+ __entry->nrgp = nrgp;
+ ),
+
+ TP_printk("pid=%d comm=%s "
+ "src_cpu=%d dst_cpu=%d usage_delta=%d "
+ "nrg_before=%d nrg_after=%d nrg_diff=%d "
+ "cap_before=%d cap_after=%d cap_delta=%d "
+ "nrg_delta=%d nrg_payoff=%d",
+ __entry->pid, __entry->comm,
+ __entry->scpu, __entry->dcpu, __entry->udelta,
+ __entry->nrgb, __entry->nrga, __entry->nrgd,
+ __entry->capb, __entry->capa, __entry->capd,
+ __entry->nrgn, __entry->nrgp)
+);
+
+/*
+ * Tracepoint for schedtune_tasks_update
+ */
+TRACE_EVENT(sched_tune_filter,
+
+ TP_PROTO(int nrg_delta, int cap_delta,
+ int nrg_gain, int cap_gain,
+ int payoff, int region),
+
+ TP_ARGS(nrg_delta, cap_delta, nrg_gain, cap_gain, payoff, region),
+
+ TP_STRUCT__entry(
+ __field( int, nrg_delta )
+ __field( int, cap_delta )
+ __field( int, nrg_gain )
+ __field( int, cap_gain )
+ __field( int, payoff )
+ __field( int, region )
+ ),
+
+ TP_fast_assign(
+ __entry->nrg_delta = nrg_delta;
+ __entry->cap_delta = cap_delta;
+ __entry->nrg_gain = nrg_gain;
+ __entry->cap_gain = cap_gain;
+ __entry->payoff = payoff;
+ __entry->region = region;
+ ),
+
+ TP_printk("nrg_delta=%d cap_delta=%d nrg_gain=%d cap_gain=%d payoff=%d region=%d",
+ __entry->nrg_delta, __entry->cap_delta,
+ __entry->nrg_gain, __entry->cap_gain,
+ __entry->payoff, __entry->region)
+);
+
+/*
+ * Tracepoint for system overutilized flag
+ */
+TRACE_EVENT(sched_overutilized,
+
+ TP_PROTO(bool overutilized),
+
+ TP_ARGS(overutilized),
+
+ TP_STRUCT__entry(
+ __field( bool, overutilized )
+ ),
+
+ TP_fast_assign(
+ __entry->overutilized = overutilized;
+ ),
+
+ TP_printk("overutilized=%d",
+ __entry->overutilized ? 1 : 0)
+);
+#ifdef CONFIG_SCHED_WALT
+struct rq;
+
+extern unsigned int sched_ravg_window;
+TRACE_EVENT(walt_update_task_ravg,
+
+ TP_PROTO(struct task_struct *p, struct rq *rq, int evt,
+ u64 wallclock, u64 irqtime),
+
+ TP_ARGS(p, rq, evt, wallclock, irqtime),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN )
+ __field(pid_t, pid )
+ __field(pid_t, cur_pid )
+ __field(unsigned int, cur_freq )
+ __field(u64, wallclock )
+ __field(u64, mark_start )
+ __field(u64, win_start )
+ __field(u64, irqtime )
+ __field(int, evt )
+ __field(unsigned int, demand )
+ __field(unsigned int, sum )
+ __field(unsigned int, walt_avg )
+ __field(int, cpu )
+ __field(u64, cs )
+ __field(u64, ps )
+ __field(u32, curr_window )
+ __field(u32, prev_window )
+ __field(u32, active_windows )
+ ),
+
+ TP_fast_assign(
+ __entry->wallclock = wallclock;
+ __entry->win_start = rq->window_start;
+ __entry->evt = evt;
+ __entry->cpu = rq->cpu;
+ __entry->cur_pid = rq->curr->pid;
+ __entry->cur_freq = rq->cur_freq;
+ memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+ __entry->pid = p->pid;
+ __entry->mark_start = p->ravg.mark_start;
+ __entry->demand = p->ravg.demand;
+ __entry->sum = p->ravg.sum;
+ __entry->walt_avg = p->ravg.demand << 10;
+ do_div(__entry->walt_avg, walt_ravg_window);
+ __entry->irqtime = irqtime;
+ __entry->cs = rq->curr_runnable_sum;
+ __entry->ps = rq->prev_runnable_sum;
+ __entry->curr_window = p->ravg.curr_window;
+ __entry->prev_window = p->ravg.prev_window;
+ __entry->active_windows = p->ravg.active_windows;
+ ),
+
+ TP_printk("wclock=%llu win_start=%llu event=%d cpu=%d "
+ "cur_freq=%u cur_pid=%d pid=%d comm=%s mrk_start=%llu "
+ "demand=%u sum=%u walt_avg=%u irqtime=%llu "
+ "cur_rsum=%llu pre_rsum=%llu "
+ "cur_wdw=%u pre_wdw=%u act_wds=%u",
+ __entry->wallclock, __entry->win_start,
+ __entry->evt, __entry->cpu,
+ __entry->cur_freq, __entry->cur_pid,
+ __entry->pid, __entry->comm, __entry->mark_start,
+ __entry->demand, __entry->sum,
+ __entry->walt_avg, __entry->irqtime,
+ __entry->cs, __entry->ps,
+ __entry->curr_window, __entry->prev_window,
+ __entry->active_windows
+ )
+);
+
+TRACE_EVENT(walt_update_history,
+
+ TP_PROTO(struct rq *rq, struct task_struct *p, u32 runtime, int samples,
+ int evt),
+
+ TP_ARGS(rq, p, runtime, samples, evt),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN )
+ __field(pid_t, pid )
+ __field(unsigned int, runtime )
+ __field(int, samples )
+ __field(int, evt )
+ __field(u64, demand )
+ __field(unsigned int, walt_avg )
+ __field(unsigned int, pelt_avg )
+ __array(u32, hist, RAVG_HIST_SIZE_MAX)
+ __field(int, cpu )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+ __entry->pid = p->pid;
+ __entry->runtime = runtime;
+ __entry->samples = samples;
+ __entry->evt = evt;
+ __entry->demand = p->ravg.demand;
+ __entry->walt_avg = p->ravg.demand << 10;
+ do_div(__entry->walt_avg, walt_ravg_window);
+ __entry->pelt_avg = p->se.avg.util_avg;
+ memcpy(__entry->hist, p->ravg.sum_history,
+ RAVG_HIST_SIZE_MAX * sizeof(u32));
+ __entry->cpu = rq->cpu;
+ ),
+
+ TP_printk("pid=%d comm=%s runtime=%u samples=%d event=%d demand=%llu "
+ "walt=%u pelt=%u h0=%u h1=%u h2=%u h3=%u h4=%u cpu=%d",
+ __entry->pid, __entry->comm,
+ __entry->runtime, __entry->samples, __entry->evt,
+ __entry->demand,
+ __entry->walt_avg,
+ __entry->pelt_avg,
+ __entry->hist[0], __entry->hist[1],
+ __entry->hist[2], __entry->hist[3],
+ __entry->hist[4], __entry->cpu)
+);
+
+TRACE_EVENT(walt_migration_update_sum,
+
+ TP_PROTO(struct rq *rq, struct task_struct *p),
+
+ TP_ARGS(rq, p),
+
+ TP_STRUCT__entry(
+ __field(int, cpu )
+ __array(char, comm, TASK_COMM_LEN )
+ __field(int, pid )
+ __field(u64, cs )
+ __field(u64, ps )
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu_of(rq);
+ __entry->cs = rq->curr_runnable_sum;
+ __entry->ps = rq->prev_runnable_sum;
+ __entry->pid = p->pid;
+ memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+ ),
+
+ TP_printk("cpu=%d pid=%d comm=%s "
+ "cur_ravg=%llu pre_ravg=%llu",
+ __entry->cpu, __entry->pid, __entry->comm,
+ __entry->cs, __entry->ps)
+);
+#endif /* CONFIG_SCHED_WALT */
+
+#endif /* CONFIG_SMP */
+
#endif /* _TRACE_SCHED_H */
/* This part must be outside protection */
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index ea9bf2561b9e..71e1d0ed92f7 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -397,6 +397,7 @@ typedef struct elf64_shdr {
#define NT_ARM_TLS 0x401 /* ARM TLS register */
#define NT_ARM_HW_BREAK 0x402 /* ARM hardware breakpoint registers */
#define NT_ARM_HW_WATCH 0x403 /* ARM hardware watchpoint registers */
+#define NT_ARM_SYSTEM_CALL 0x404 /* ARM system call number */
#define NT_METAG_CBUF 0x500 /* Metag catch buffer registers */
#define NT_METAG_RPIPE 0x501 /* Metag read pipeline state */
#define NT_METAG_TLS 0x502 /* Metag TLS pointer */
diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h
index 2b82d7e30974..b1385dad8c28 100644
--- a/include/uapi/linux/fib_rules.h
+++ b/include/uapi/linux/fib_rules.h
@@ -29,6 +29,11 @@ struct fib_rule_hdr {
__u32 flags;
};
+struct fib_rule_uid_range {
+ __u32 start;
+ __u32 end;
+};
+
enum {
FRA_UNSPEC,
FRA_DST, /* destination address */
@@ -49,6 +54,9 @@ enum {
FRA_TABLE, /* Extended table id */
FRA_FWMASK, /* mask for netfilter mark */
FRA_OIFNAME,
+ FRA_PAD,
+ FRA_L3MDEV, /* iif or oif is l3mdev goto its table */
+ FRA_UID_RANGE, /* UID range */
__FRA_MAX
};
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 3735fa0a6784..829a567db77a 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -158,6 +158,8 @@ struct inodes_stat_t {
#define FITHAW _IOWR('X', 120, int) /* Thaw */
#define FITRIM _IOWR('X', 121, struct fstrim_range) /* Trim */
+#define FIDTRIM _IOWR('f', 128, struct fstrim_range) /* Deep discard trim */
+
#define FS_IOC_GETFLAGS _IOR('f', 1, long)
#define FS_IOC_SETFLAGS _IOW('f', 2, long)
#define FS_IOC_GETVERSION _IOR('v', 1, long)
@@ -169,6 +171,52 @@ struct inodes_stat_t {
#define FS_IOC32_SETVERSION _IOW('v', 2, int)
/*
+ * File system encryption support
+ */
+/* Policy provided via an ioctl on the topmost directory */
+#define FS_KEY_DESCRIPTOR_SIZE 8
+
+#define FS_POLICY_FLAGS_PAD_4 0x00
+#define FS_POLICY_FLAGS_PAD_8 0x01
+#define FS_POLICY_FLAGS_PAD_16 0x02
+#define FS_POLICY_FLAGS_PAD_32 0x03
+#define FS_POLICY_FLAGS_PAD_MASK 0x03
+#define FS_POLICY_FLAGS_VALID 0x03
+
+/* Encryption algorithms */
+#define FS_ENCRYPTION_MODE_INVALID 0
+#define FS_ENCRYPTION_MODE_AES_256_XTS 1
+#define FS_ENCRYPTION_MODE_AES_256_GCM 2
+#define FS_ENCRYPTION_MODE_AES_256_CBC 3
+#define FS_ENCRYPTION_MODE_AES_256_CTS 4
+
+
+struct fscrypt_policy {
+ __u8 version;
+ __u8 contents_encryption_mode;
+ __u8 filenames_encryption_mode;
+ __u8 flags;
+ __u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
+} __packed;
+
+#define FS_IOC_SET_ENCRYPTION_POLICY _IOR('f', 19, struct fscrypt_policy)
+#define FS_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16])
+#define FS_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct fscrypt_policy)
+
+/* Parameters for passing an encryption key into the kernel keyring */
+#define FS_KEY_DESC_PREFIX "fscrypt:"
+#define FS_KEY_DESC_PREFIX_SIZE 8
+
+/* Structure that userspace passes to the kernel keyring */
+#define FS_MAX_KEY_SIZE 64
+
+struct fscrypt_key {
+ __u32 mode;
+ __u8 raw[FS_MAX_KEY_SIZE];
+ __u32 size;
+};
+
+/*
* Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS)
*/
#define FS_SECRM_FL 0x00000001 /* Secure deletion */
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 25084a052a1e..7f6b325e1dd5 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -358,6 +358,7 @@ enum fuse_opcode {
FUSE_FALLOCATE = 43,
FUSE_READDIRPLUS = 44,
FUSE_RENAME2 = 45,
+ FUSE_CANONICAL_PATH= 2016,
/* CUSE specific operations */
CUSE_INIT = 4096,
diff --git a/include/uapi/linux/hw_breakpoint.h b/include/uapi/linux/hw_breakpoint.h
index b04000a2296a..2b65efd19a46 100644
--- a/include/uapi/linux/hw_breakpoint.h
+++ b/include/uapi/linux/hw_breakpoint.h
@@ -4,7 +4,11 @@
enum {
HW_BREAKPOINT_LEN_1 = 1,
HW_BREAKPOINT_LEN_2 = 2,
+ HW_BREAKPOINT_LEN_3 = 3,
HW_BREAKPOINT_LEN_4 = 4,
+ HW_BREAKPOINT_LEN_5 = 5,
+ HW_BREAKPOINT_LEN_6 = 6,
+ HW_BREAKPOINT_LEN_7 = 7,
HW_BREAKPOINT_LEN_8 = 8,
};
diff --git a/include/uapi/linux/if_pppolac.h b/include/uapi/linux/if_pppolac.h
new file mode 100644
index 000000000000..b7eb8153ef66
--- /dev/null
+++ b/include/uapi/linux/if_pppolac.h
@@ -0,0 +1,33 @@
+/* include/uapi/linux/if_pppolac.h
+ *
+ * Header for PPP on L2TP Access Concentrator / PPPoLAC Socket (RFC 2661)
+ *
+ * Copyright (C) 2009 Google, Inc.
+ * Author: Chia-chi Yeh <chiachi@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _UAPI_LINUX_IF_PPPOLAC_H
+#define _UAPI_LINUX_IF_PPPOLAC_H
+
+#include <linux/socket.h>
+#include <linux/types.h>
+
+struct sockaddr_pppolac {
+ sa_family_t sa_family; /* AF_PPPOX */
+ unsigned int sa_protocol; /* PX_PROTO_OLAC */
+ int udp_socket;
+ struct __attribute__((packed)) {
+ __u16 tunnel, session;
+ } local, remote;
+} __attribute__((packed));
+
+#endif /* _UAPI_LINUX_IF_PPPOLAC_H */
diff --git a/include/uapi/linux/if_pppopns.h b/include/uapi/linux/if_pppopns.h
new file mode 100644
index 000000000000..a392b52ea6ec
--- /dev/null
+++ b/include/uapi/linux/if_pppopns.h
@@ -0,0 +1,32 @@
+/* include/uapi/linux/if_pppopns.h
+ *
+ * Header for PPP on PPTP Network Server / PPPoPNS Socket (RFC 2637)
+ *
+ * Copyright (C) 2009 Google, Inc.
+ * Author: Chia-chi Yeh <chiachi@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _UAPI_LINUX_IF_PPPOPNS_H
+#define _UAPI_LINUX_IF_PPPOPNS_H
+
+#include <linux/socket.h>
+#include <linux/types.h>
+
+struct sockaddr_pppopns {
+ sa_family_t sa_family; /* AF_PPPOX */
+ unsigned int sa_protocol; /* PX_PROTO_OPNS */
+ int tcp_socket;
+ __u16 local;
+ __u16 remote;
+} __attribute__((packed));
+
+#endif /* _UAPI_LINUX_IF_PPPOPNS_H */
diff --git a/include/uapi/linux/if_pppox.h b/include/uapi/linux/if_pppox.h
index e128769331b5..5861d45bc51d 100644
--- a/include/uapi/linux/if_pppox.h
+++ b/include/uapi/linux/if_pppox.h
@@ -23,6 +23,8 @@
#include <linux/socket.h>
#include <linux/if_ether.h>
#include <linux/if_pppol2tp.h>
+#include <linux/if_pppolac.h>
+#include <linux/if_pppopns.h>
/* For user-space programs to pick up these definitions
* which they wouldn't get otherwise without defining __KERNEL__
@@ -56,7 +58,9 @@ struct pptp_addr {
#define PX_PROTO_OE 0 /* Currently just PPPoE */
#define PX_PROTO_OL2TP 1 /* Now L2TP also */
#define PX_PROTO_PPTP 2
-#define PX_MAX_PROTO 3
+#define PX_PROTO_OLAC 3
+#define PX_PROTO_OPNS 4
+#define PX_MAX_PROTO 5
struct sockaddr_pppox {
__kernel_sa_family_t sa_family; /* address family, AF_PPPOX */
diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index d65c0a09efd3..e835704ab807 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -72,6 +72,8 @@ enum {
INET_DIAG_BC_AUTO,
INET_DIAG_BC_S_COND,
INET_DIAG_BC_D_COND,
+ INET_DIAG_BC_DEV_COND, /* u32 ifindex */
+ INET_DIAG_BC_MARK_COND,
};
struct inet_diag_hostcond {
@@ -81,6 +83,11 @@ struct inet_diag_hostcond {
__be32 addr[0];
};
+struct inet_diag_markcond {
+ __u32 mark;
+ __u32 mask;
+};
+
/* Base info structure. It contains socket identity (addrs/ports/cookie)
* and, alas, the information shown by netstat. */
struct inet_diag_msg {
@@ -111,9 +118,15 @@ enum {
INET_DIAG_SKMEMINFO,
INET_DIAG_SHUTDOWN,
INET_DIAG_DCTCPINFO,
+ INET_DIAG_PROTOCOL, /* response attribute only */
+ INET_DIAG_SKV6ONLY,
+ INET_DIAG_LOCALS,
+ INET_DIAG_PEERS,
+ INET_DIAG_PAD,
+ INET_DIAG_MARK,
};
-#define INET_DIAG_MAX INET_DIAG_DCTCPINFO
+#define INET_DIAG_MAX INET_DIAG_MARK
/* INET_DIAG_MEM */
diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h
index a1d7e931ab72..cc75b7bed6c9 100644
--- a/include/uapi/linux/input.h
+++ b/include/uapi/linux/input.h
@@ -152,7 +152,12 @@ struct input_keymap_entry {
#define EVIOCGEFFECTS _IOR('E', 0x84, int) /* Report number of effects playable at the same time */
#define EVIOCGRAB _IOW('E', 0x90, int) /* Grab/Release device */
-#define EVIOCREVOKE _IOW('E', 0x91, int) /* Revoke device access */
+
+/* HACK: disable conflicting EVIOCREVOKE until Android userspace stops using EVIOCSSUSPENDBLOCK */
+/*#define EVIOCREVOKE _IOW('E', 0x91, int)*/ /* Revoke device access */
+
+#define EVIOCGSUSPENDBLOCK _IOR('E', 0x91, int) /* get suspend block enable */
+#define EVIOCSSUSPENDBLOCK _IOW('E', 0x91, int) /* set suspend block enable */
#define EVIOCSCLOCKID _IOW('E', 0xa0, int) /* Set clockid to be used for timestamps */
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index efa2666f4b8a..daa6fff79489 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -160,10 +160,27 @@ enum {
DEVCONF_ACCEPT_DAD,
DEVCONF_FORCE_TLLAO,
DEVCONF_NDISC_NOTIFY,
+ DEVCONF_ACCEPT_RA_RT_TABLE,
DEVCONF_MLDV1_UNSOLICITED_REPORT_INTERVAL,
DEVCONF_MLDV2_UNSOLICITED_REPORT_INTERVAL,
DEVCONF_SUPPRESS_FRAG_NDISC,
DEVCONF_ACCEPT_RA_FROM_LOCAL,
+ DEVCONF_USE_OPTIMISTIC,
+ DEVCONF_ACCEPT_RA_MTU,
+ DEVCONF_STABLE_SECRET,
+ DEVCONF_USE_OIF_ADDRS_ONLY,
+ DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT,
+ DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN,
+ DEVCONF_DROP_UNICAST_IN_L2_MULTICAST,
+ DEVCONF_DROP_UNSOLICITED_NA,
+ DEVCONF_KEEP_ADDR_ON_DOWN,
+ DEVCONF_RTR_SOLICIT_MAX_INTERVAL,
+ DEVCONF_SEG6_ENABLED,
+ DEVCONF_SEG6_REQUIRE_HMAC,
+ DEVCONF_ENHANCED_DAD,
+ DEVCONF_ADDR_GEN_MODE,
+ DEVCONF_DISABLE_POLICY,
+ DEVCONF_ACCEPT_RA_RT_INFO_MIN_PLEN,
DEVCONF_MAX
};
diff --git a/include/uapi/linux/kcov.h b/include/uapi/linux/kcov.h
new file mode 100644
index 000000000000..574e22ec640d
--- /dev/null
+++ b/include/uapi/linux/kcov.h
@@ -0,0 +1,10 @@
+#ifndef _LINUX_KCOV_IOCTLS_H
+#define _LINUX_KCOV_IOCTLS_H
+
+#include <linux/types.h>
+
+#define KCOV_INIT_TRACE _IOR('c', 1, unsigned long)
+#define KCOV_ENABLE _IO('c', 100)
+#define KCOV_DISABLE _IO('c', 101)
+
+#endif /* _LINUX_KCOV_IOCTLS_H */
diff --git a/include/uapi/linux/keychord.h b/include/uapi/linux/keychord.h
new file mode 100644
index 000000000000..ea7cf4d27bbd
--- /dev/null
+++ b/include/uapi/linux/keychord.h
@@ -0,0 +1,52 @@
+/*
+ * Key chord input driver
+ *
+ * Copyright (C) 2008 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+*/
+
+#ifndef _UAPI_LINUX_KEYCHORD_H_
+#define _UAPI_LINUX_KEYCHORD_H_
+
+#include <linux/input.h>
+
+#define KEYCHORD_VERSION 1
+
+/*
+ * One or more input_keychord structs are written to /dev/keychord
+ * at once to specify the list of keychords to monitor.
+ * Reading /dev/keychord returns the id of a keychord when the
+ * keychord combination is pressed. A keychord is signalled when
+ * all of the keys in the keycode list are in the pressed state.
+ * The order in which the keys are pressed does not matter.
+ * The keychord will not be signalled if keys not in the keycode
+ * list are pressed.
+ * Keychords will not be signalled on key release events.
+ */
+struct input_keychord {
+ /* should be KEYCHORD_VERSION */
+ __u16 version;
+ /*
+ * client specified ID, returned from read()
+ * when this keychord is pressed.
+ */
+ __u16 id;
+
+ /* number of keycodes in this keychord */
+ __u16 count;
+
+ /* variable length array of keycodes */
+ __u16 keycodes[];
+};
+
+#endif /* _UAPI_LINUX_KEYCHORD_H_ */
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index 77c60311a6c6..fc8c9a4d1fd8 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -52,12 +52,16 @@
#define REISER2FS_SUPER_MAGIC_STRING "ReIsEr2Fs"
#define REISER2FS_JR_SUPER_MAGIC_STRING "ReIsEr3Fs"
+#define SDCARDFS_SUPER_MAGIC 0x5dca2df5
+
#define SMB_SUPER_MAGIC 0x517B
#define CGROUP_SUPER_MAGIC 0x27e0eb
#define STACK_END_MAGIC 0x57AC6E9D
+#define TRACEFS_MAGIC 0x74726163
+
#define V9FS_MAGIC 0x01021997
#define BDEVFS_MAGIC 0x62646576
diff --git a/include/uapi/linux/netfilter/xt_IDLETIMER.h b/include/uapi/linux/netfilter/xt_IDLETIMER.h
index 208ae9387331..faaa28b3d061 100644
--- a/include/uapi/linux/netfilter/xt_IDLETIMER.h
+++ b/include/uapi/linux/netfilter/xt_IDLETIMER.h
@@ -4,6 +4,7 @@
* Header file for Xtables timer target module.
*
* Copyright (C) 2004, 2010 Nokia Corporation
+ *
* Written by Timo Teras <ext-timo.teras@nokia.com>
*
* Converted to x_tables and forward-ported to 2.6.34
@@ -32,12 +33,19 @@
#include <linux/types.h>
#define MAX_IDLETIMER_LABEL_SIZE 28
+#define NLMSG_MAX_SIZE 64
+
+#define NL_EVENT_TYPE_INACTIVE 0
+#define NL_EVENT_TYPE_ACTIVE 1
struct idletimer_tg_info {
__u32 timeout;
char label[MAX_IDLETIMER_LABEL_SIZE];
+ /* Use netlink messages for notification in addition to sysfs */
+ __u8 send_nl_msg;
+
/* for kernel module internal use only */
struct idletimer_tg *timer __attribute__((aligned(8)));
};
diff --git a/include/uapi/linux/netfilter/xt_socket.h b/include/uapi/linux/netfilter/xt_socket.h
index 6315e2ac3474..55076a32bdfd 100644
--- a/include/uapi/linux/netfilter/xt_socket.h
+++ b/include/uapi/linux/netfilter/xt_socket.h
@@ -18,4 +18,9 @@ struct xt_socket_mtinfo2 {
};
#define XT_SOCKET_FLAGS_V2 (XT_SOCKET_TRANSPARENT | XT_SOCKET_NOWILDCARD)
+struct sock *xt_socket_get4_sk(const struct sk_buff *skb,
+ struct xt_action_param *par);
+struct sock *xt_socket_get6_sk(const struct sk_buff *skb,
+ struct xt_action_param *par);
+
#endif /* _XT_SOCKET_H */
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 4b28dc07bcb1..ed0bcc82fad1 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1608,7 +1608,10 @@ enum nl80211_commands {
*
* @NL80211_ATTR_IFACE_SOCKET_OWNER: flag attribute, if set during interface
* creation then the new interface will be owned by the netlink socket
- * that created it and will be destroyed when the socket is closed
+ * that created it and will be destroyed when the socket is closed.
+ * If set during scheduled scan start then the new scan req will be
+ * owned by the netlink socket that created it and the scheduled scan will
+ * be stopped when the socket is closed.
*
* @NL80211_ATTR_TDLS_INITIATOR: flag attribute indicating the current end is
* the TDLS link initiator.
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 513df75d0fc9..3fc82ff5d2dd 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -179,4 +179,20 @@ struct prctl_mm_map {
#define PR_SET_THP_DISABLE 41
#define PR_GET_THP_DISABLE 42
+/* Sets the timerslack for arbitrary threads
+ * arg2 slack value, 0 means "use default"
+ * arg3 pid of the thread whose timer slack needs to be set
+ */
+#define PR_SET_TIMERSLACK_PID 43
+
+#define PR_SET_VMA 0x53564d41
+# define PR_SET_VMA_ANON_NAME 0
+
+/* Control the ambient capability set */
+#define PR_CAP_AMBIENT 47
+# define PR_CAP_AMBIENT_IS_SET 1
+# define PR_CAP_AMBIENT_RAISE 2
+# define PR_CAP_AMBIENT_LOWER 3
+# define PR_CAP_AMBIENT_CLEAR_ALL 4
+
#endif /* _LINUX_PRCTL_H */
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index eb0f1a554d7b..7acf1d6a5661 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -297,6 +297,14 @@ enum rtattr_type_t {
RTA_TABLE,
RTA_MARK,
RTA_MFC_STATS,
+ RTA_VIA,
+ RTA_NEWDST,
+ RTA_PREF,
+ RTA_ENCAP_TYPE,
+ RTA_ENCAP,
+ RTA_EXPIRES,
+ RTA_PAD,
+ RTA_UID,
__RTA_MAX
};
diff --git a/include/uapi/linux/sock_diag.h b/include/uapi/linux/sock_diag.h
index b00e29efb161..472adbb711ed 100644
--- a/include/uapi/linux/sock_diag.h
+++ b/include/uapi/linux/sock_diag.h
@@ -4,6 +4,7 @@
#include <linux/types.h>
#define SOCK_DIAG_BY_FAMILY 20
+#define SOCK_DESTROY_BACKPORT 21
struct sock_diag_req {
__u8 sdiag_family;
diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h
index 43aaba1cc037..bd07cd689f9c 100644
--- a/include/uapi/linux/sysctl.h
+++ b/include/uapi/linux/sysctl.h
@@ -569,6 +569,7 @@ enum {
NET_IPV6_PROXY_NDP=23,
NET_IPV6_ACCEPT_SOURCE_ROUTE=25,
NET_IPV6_ACCEPT_RA_FROM_LOCAL=26,
+ NET_IPV6_ACCEPT_RA_RT_INFO_MIN_PLEN=27,
__NET_IPV6_MAX
};
diff --git a/include/uapi/linux/usb/f_accessory.h b/include/uapi/linux/usb/f_accessory.h
new file mode 100644
index 000000000000..0baeb7d0d74c
--- /dev/null
+++ b/include/uapi/linux/usb/f_accessory.h
@@ -0,0 +1,146 @@
+/*
+ * Gadget Function Driver for Android USB accessories
+ *
+ * Copyright (C) 2011 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _UAPI_LINUX_USB_F_ACCESSORY_H
+#define _UAPI_LINUX_USB_F_ACCESSORY_H
+
+/* Use Google Vendor ID when in accessory mode */
+#define USB_ACCESSORY_VENDOR_ID 0x18D1
+
+
+/* Product ID to use when in accessory mode */
+#define USB_ACCESSORY_PRODUCT_ID 0x2D00
+
+/* Product ID to use when in accessory mode and adb is enabled */
+#define USB_ACCESSORY_ADB_PRODUCT_ID 0x2D01
+
+/* Indexes for strings sent by the host via ACCESSORY_SEND_STRING */
+#define ACCESSORY_STRING_MANUFACTURER 0
+#define ACCESSORY_STRING_MODEL 1
+#define ACCESSORY_STRING_DESCRIPTION 2
+#define ACCESSORY_STRING_VERSION 3
+#define ACCESSORY_STRING_URI 4
+#define ACCESSORY_STRING_SERIAL 5
+
+/* Control request for retrieving device's protocol version
+ *
+ * requestType: USB_DIR_IN | USB_TYPE_VENDOR
+ * request: ACCESSORY_GET_PROTOCOL
+ * value: 0
+ * index: 0
+ * data version number (16 bits little endian)
+ * 1 for original accessory support
+ * 2 adds HID and device to host audio support
+ */
+#define ACCESSORY_GET_PROTOCOL 51
+
+/* Control request for host to send a string to the device
+ *
+ * requestType: USB_DIR_OUT | USB_TYPE_VENDOR
+ * request: ACCESSORY_SEND_STRING
+ * value: 0
+ * index: string ID
+ * data zero terminated UTF8 string
+ *
+ * The device can later retrieve these strings via the
+ * ACCESSORY_GET_STRING_* ioctls
+ */
+#define ACCESSORY_SEND_STRING 52
+
+/* Control request for starting device in accessory mode.
+ * The host sends this after setting all its strings to the device.
+ *
+ * requestType: USB_DIR_OUT | USB_TYPE_VENDOR
+ * request: ACCESSORY_START
+ * value: 0
+ * index: 0
+ * data none
+ */
+#define ACCESSORY_START 53
+
+/* Control request for registering a HID device.
+ * Upon registering, a unique ID is sent by the accessory in the
+ * value parameter. This ID will be used for future commands for
+ * the device
+ *
+ * requestType: USB_DIR_OUT | USB_TYPE_VENDOR
+ * request: ACCESSORY_REGISTER_HID_DEVICE
+ * value: Accessory assigned ID for the HID device
+ * index: total length of the HID report descriptor
+ * data none
+ */
+#define ACCESSORY_REGISTER_HID 54
+
+/* Control request for unregistering a HID device.
+ *
+ * requestType: USB_DIR_OUT | USB_TYPE_VENDOR
+ * request: ACCESSORY_REGISTER_HID
+ * value: Accessory assigned ID for the HID device
+ * index: 0
+ * data none
+ */
+#define ACCESSORY_UNREGISTER_HID 55
+
+/* Control request for sending the HID report descriptor.
+ * If the HID descriptor is longer than the endpoint zero max packet size,
+ * the descriptor will be sent in multiple ACCESSORY_SET_HID_REPORT_DESC
+ * commands. The data for the descriptor must be sent sequentially
+ * if multiple packets are needed.
+ *
+ * requestType: USB_DIR_OUT | USB_TYPE_VENDOR
+ * request: ACCESSORY_SET_HID_REPORT_DESC
+ * value: Accessory assigned ID for the HID device
+ * index: offset of data in descriptor
+ * (needed when HID descriptor is too big for one packet)
+ * data the HID report descriptor
+ */
+#define ACCESSORY_SET_HID_REPORT_DESC 56
+
+/* Control request for sending HID events.
+ *
+ * requestType: USB_DIR_OUT | USB_TYPE_VENDOR
+ * request: ACCESSORY_SEND_HID_EVENT
+ * value: Accessory assigned ID for the HID device
+ * index: 0
+ * data the HID report for the event
+ */
+#define ACCESSORY_SEND_HID_EVENT 57
+
+/* Control request for setting the audio mode.
+ *
+ * requestType: USB_DIR_OUT | USB_TYPE_VENDOR
+ * request: ACCESSORY_SET_AUDIO_MODE
+ * value: 0 - no audio
+ * 1 - device to host, 44100 16-bit stereo PCM
+ * index: 0
+ * data none
+ */
+#define ACCESSORY_SET_AUDIO_MODE 58
+
+/* ioctls for retrieving strings set by the host */
+#define ACCESSORY_GET_STRING_MANUFACTURER _IOW('M', 1, char[256])
+#define ACCESSORY_GET_STRING_MODEL _IOW('M', 2, char[256])
+#define ACCESSORY_GET_STRING_DESCRIPTION _IOW('M', 3, char[256])
+#define ACCESSORY_GET_STRING_VERSION _IOW('M', 4, char[256])
+#define ACCESSORY_GET_STRING_URI _IOW('M', 5, char[256])
+#define ACCESSORY_GET_STRING_SERIAL _IOW('M', 6, char[256])
+/* returns 1 if there is a start request pending */
+#define ACCESSORY_IS_START_REQUESTED _IO('M', 7)
+/* returns audio mode (set via the ACCESSORY_SET_AUDIO_MODE control request) */
+#define ACCESSORY_GET_AUDIO_MODE _IO('M', 8)
+
+#endif /* _UAPI_LINUX_USB_F_ACCESSORY_H */
diff --git a/include/uapi/linux/usb/f_mtp.h b/include/uapi/linux/usb/f_mtp.h
new file mode 100644
index 000000000000..503291855abd
--- /dev/null
+++ b/include/uapi/linux/usb/f_mtp.h
@@ -0,0 +1,61 @@
+/*
+ * Gadget Function Driver for MTP
+ *
+ * Copyright (C) 2010 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _UAPI_LINUX_USB_F_MTP_H
+#define _UAPI_LINUX_USB_F_MTP_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+struct mtp_file_range {
+ /* file descriptor for file to transfer */
+ int fd;
+ /* offset in file for start of transfer */
+ loff_t offset;
+ /* number of bytes to transfer */
+ int64_t length;
+ /* MTP command ID for data header,
+ * used only for MTP_SEND_FILE_WITH_HEADER
+ */
+ uint16_t command;
+ /* MTP transaction ID for data header,
+ * used only for MTP_SEND_FILE_WITH_HEADER
+ */
+ uint32_t transaction_id;
+};
+
+struct mtp_event {
+ /* size of the event */
+ size_t length;
+ /* event data to send */
+ void *data;
+};
+
+/* Sends the specified file range to the host */
+#define MTP_SEND_FILE _IOW('M', 0, struct mtp_file_range)
+/* Receives data from the host and writes it to a file.
+ * The file is created if it does not exist.
+ */
+#define MTP_RECEIVE_FILE _IOW('M', 1, struct mtp_file_range)
+/* Sends an event to the host via the interrupt endpoint */
+#define MTP_SEND_EVENT _IOW('M', 3, struct mtp_event)
+/* Sends the specified file range to the host,
+ * with a 12 byte MTP data packet header at the beginning.
+ */
+#define MTP_SEND_FILE_WITH_HEADER _IOW('M', 4, struct mtp_file_range)
+
+#endif /* _UAPI_LINUX_USB_F_MTP_H */
diff --git a/include/uapi/video/adf.h b/include/uapi/video/adf.h
new file mode 100644
index 000000000000..c5d2e62cdb9b
--- /dev/null
+++ b/include/uapi/video/adf.h
@@ -0,0 +1,321 @@
+/*
+ * Copyright (C) 2013 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _UAPI_VIDEO_ADF_H_
+#define _UAPI_VIDEO_ADF_H_
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+#include <drm/drm_fourcc.h>
+#include <drm/drm_mode.h>
+
+#define ADF_NAME_LEN 32
+#define ADF_MAX_CUSTOM_DATA_SIZE 4096
+
+enum adf_interface_type {
+ ADF_INTF_DSI = 0,
+ ADF_INTF_eDP = 1,
+ ADF_INTF_DPI = 2,
+ ADF_INTF_VGA = 3,
+ ADF_INTF_DVI = 4,
+ ADF_INTF_HDMI = 5,
+ ADF_INTF_MEMORY = 6,
+ ADF_INTF_TYPE_DEVICE_CUSTOM = 128,
+ ADF_INTF_TYPE_MAX = (~(__u32)0),
+};
+
+#define ADF_INTF_FLAG_PRIMARY (1 << 0)
+#define ADF_INTF_FLAG_EXTERNAL (1 << 1)
+
+enum adf_event_type {
+ ADF_EVENT_VSYNC = 0,
+ ADF_EVENT_HOTPLUG = 1,
+ ADF_EVENT_DEVICE_CUSTOM = 128,
+ ADF_EVENT_TYPE_MAX = 255,
+};
+
+/**
+ * struct adf_set_event - start or stop subscribing to ADF events
+ *
+ * @type: the type of event to (un)subscribe
+ * @enabled: subscribe or unsubscribe
+ *
+ * After subscribing to an event, userspace may poll() the ADF object's fd
+ * to wait for events or read() to consume the event's data.
+ *
+ * ADF reserves event types 0 to %ADF_EVENT_DEVICE_CUSTOM-1 for its own events.
+ * Devices may use event types %ADF_EVENT_DEVICE_CUSTOM to %ADF_EVENT_TYPE_MAX-1
+ * for driver-private events.
+ */
+struct adf_set_event {
+ __u8 type;
+ __u8 enabled;
+};
+
+/**
+ * struct adf_event - common header for ADF event data
+ *
+ * @type: event type
+ * @length: total size of event data, header inclusive
+ */
+struct adf_event {
+ __u8 type;
+ __u32 length;
+};
+
+/**
+ * struct adf_vsync_event - ADF vsync event
+ *
+ * @base: event header (see &struct adf_event)
+ * @timestamp: time of vsync event, in nanoseconds
+ */
+struct adf_vsync_event {
+ struct adf_event base;
+ __aligned_u64 timestamp;
+};
+
+/**
+ * struct adf_vsync_event - ADF display hotplug event
+ *
+ * @base: event header (see &struct adf_event)
+ * @connected: whether a display is now connected to the interface
+ */
+struct adf_hotplug_event {
+ struct adf_event base;
+ __u8 connected;
+};
+
+#define ADF_MAX_PLANES 4
+/**
+ * struct adf_buffer_config - description of buffer displayed by adf_post_config
+ *
+ * @overlay_engine: id of the target overlay engine
+ * @w: width of display region in pixels
+ * @h: height of display region in pixels
+ * @format: DRM-style fourcc, see drm_fourcc.h for standard formats
+ * @fd: dma_buf fd for each plane
+ * @offset: location of first pixel to scan out, in bytes
+ * @pitch: stride (i.e. length of a scanline including padding) in bytes
+ * @n_planes: number of planes in buffer
+ * @acquire_fence: sync_fence fd which will clear when the buffer is
+ * ready for display, or <0 if the buffer is already ready
+ */
+struct adf_buffer_config {
+ __u32 overlay_engine;
+
+ __u32 w;
+ __u32 h;
+ __u32 format;
+
+ __s32 fd[ADF_MAX_PLANES];
+ __u32 offset[ADF_MAX_PLANES];
+ __u32 pitch[ADF_MAX_PLANES];
+ __u8 n_planes;
+
+ __s32 acquire_fence;
+};
+#define ADF_MAX_BUFFERS (4096 / sizeof(struct adf_buffer_config))
+
+/**
+ * struct adf_post_config - request to flip to a new set of buffers
+ *
+ * @n_interfaces: number of interfaces targeted by the flip (input)
+ * @interfaces: ids of interfaces targeted by the flip (input)
+ * @n_bufs: number of buffers displayed (input)
+ * @bufs: description of buffers displayed (input)
+ * @custom_data_size: size of driver-private data (input)
+ * @custom_data: driver-private data (input)
+ * @complete_fence: sync_fence fd which will clear when this
+ * configuration has left the screen (output)
+ */
+struct adf_post_config {
+ size_t n_interfaces;
+ __u32 __user *interfaces;
+
+ size_t n_bufs;
+ struct adf_buffer_config __user *bufs;
+
+ size_t custom_data_size;
+ void __user *custom_data;
+
+ __s32 complete_fence;
+};
+#define ADF_MAX_INTERFACES (4096 / sizeof(__u32))
+
+/**
+ * struct adf_simple_buffer_allocate - request to allocate a "simple" buffer
+ *
+ * @w: width of buffer in pixels (input)
+ * @h: height of buffer in pixels (input)
+ * @format: DRM-style fourcc (input)
+ *
+ * @fd: dma_buf fd (output)
+ * @offset: location of first pixel, in bytes (output)
+ * @pitch: length of a scanline including padding, in bytes (output)
+ *
+ * Simple buffers are analogous to DRM's "dumb" buffers. They have a single
+ * plane of linear RGB data which can be allocated and scanned out without
+ * any driver-private ioctls or data.
+ *
+ * @format must be a standard RGB format defined in drm_fourcc.h.
+ *
+ * ADF clients must NOT assume that an interface can scan out a simple buffer
+ * allocated by a different ADF interface, even if the two interfaces belong to
+ * the same ADF device.
+ */
+struct adf_simple_buffer_alloc {
+ __u16 w;
+ __u16 h;
+ __u32 format;
+
+ __s32 fd;
+ __u32 offset;
+ __u32 pitch;
+};
+
+/**
+ * struct adf_simple_post_config - request to flip to a single buffer without
+ * driver-private data
+ *
+ * @buf: description of buffer displayed (input)
+ * @complete_fence: sync_fence fd which will clear when this buffer has left the
+ * screen (output)
+ */
+struct adf_simple_post_config {
+ struct adf_buffer_config buf;
+ __s32 complete_fence;
+};
+
+/**
+ * struct adf_attachment_config - description of attachment between an overlay
+ * engine and an interface
+ *
+ * @overlay_engine: id of the overlay engine
+ * @interface: id of the interface
+ */
+struct adf_attachment_config {
+ __u32 overlay_engine;
+ __u32 interface;
+};
+
+/**
+ * struct adf_device_data - describes a display device
+ *
+ * @name: display device's name
+ * @n_attachments: the number of current attachments
+ * @attachments: list of current attachments
+ * @n_allowed_attachments: the number of allowed attachments
+ * @allowed_attachments: list of allowed attachments
+ * @custom_data_size: size of driver-private data
+ * @custom_data: driver-private data
+ */
+struct adf_device_data {
+ char name[ADF_NAME_LEN];
+
+ size_t n_attachments;
+ struct adf_attachment_config __user *attachments;
+
+ size_t n_allowed_attachments;
+ struct adf_attachment_config __user *allowed_attachments;
+
+ size_t custom_data_size;
+ void __user *custom_data;
+};
+#define ADF_MAX_ATTACHMENTS (4096 / sizeof(struct adf_attachment_config))
+
+/**
+ * struct adf_device_data - describes a display interface
+ *
+ * @name: display interface's name
+ * @type: interface type (see enum @adf_interface_type)
+ * @id: which interface of type @type;
+ * e.g. interface DSI.1 -> @type=@ADF_INTF_TYPE_DSI, @id=1
+ * @flags: informational flags (bitmask of %ADF_INTF_FLAG_* values)
+ * @dpms_state: DPMS state (one of @DRM_MODE_DPMS_* defined in drm_mode.h)
+ * @hotplug_detect: whether a display is plugged in
+ * @width_mm: screen width in millimeters, or 0 if unknown
+ * @height_mm: screen height in millimeters, or 0 if unknown
+ * @current_mode: current display mode
+ * @n_available_modes: the number of hardware display modes
+ * @available_modes: list of hardware display modes
+ * @custom_data_size: size of driver-private data
+ * @custom_data: driver-private data
+ */
+struct adf_interface_data {
+ char name[ADF_NAME_LEN];
+
+ __u32 type;
+ __u32 id;
+ /* e.g. type=ADF_INTF_TYPE_DSI, id=1 => DSI.1 */
+ __u32 flags;
+
+ __u8 dpms_state;
+ __u8 hotplug_detect;
+ __u16 width_mm;
+ __u16 height_mm;
+
+ struct drm_mode_modeinfo current_mode;
+ size_t n_available_modes;
+ struct drm_mode_modeinfo __user *available_modes;
+
+ size_t custom_data_size;
+ void __user *custom_data;
+};
+#define ADF_MAX_MODES (4096 / sizeof(struct drm_mode_modeinfo))
+
+/**
+ * struct adf_overlay_engine_data - describes an overlay engine
+ *
+ * @name: overlay engine's name
+ * @n_supported_formats: number of supported formats
+ * @supported_formats: list of supported formats
+ * @custom_data_size: size of driver-private data
+ * @custom_data: driver-private data
+ */
+struct adf_overlay_engine_data {
+ char name[ADF_NAME_LEN];
+
+ size_t n_supported_formats;
+ __u32 __user *supported_formats;
+
+ size_t custom_data_size;
+ void __user *custom_data;
+};
+#define ADF_MAX_SUPPORTED_FORMATS (4096 / sizeof(__u32))
+
+#define ADF_IOCTL_TYPE 'D'
+#define ADF_IOCTL_NR_CUSTOM 128
+
+#define ADF_SET_EVENT _IOW(ADF_IOCTL_TYPE, 0, struct adf_set_event)
+#define ADF_BLANK _IOW(ADF_IOCTL_TYPE, 1, __u8)
+#define ADF_POST_CONFIG _IOW(ADF_IOCTL_TYPE, 2, struct adf_post_config)
+#define ADF_SET_MODE _IOW(ADF_IOCTL_TYPE, 3, \
+ struct drm_mode_modeinfo)
+#define ADF_GET_DEVICE_DATA _IOR(ADF_IOCTL_TYPE, 4, struct adf_device_data)
+#define ADF_GET_INTERFACE_DATA _IOR(ADF_IOCTL_TYPE, 5, \
+ struct adf_interface_data)
+#define ADF_GET_OVERLAY_ENGINE_DATA \
+ _IOR(ADF_IOCTL_TYPE, 6, \
+ struct adf_overlay_engine_data)
+#define ADF_SIMPLE_POST_CONFIG _IOW(ADF_IOCTL_TYPE, 7, \
+ struct adf_simple_post_config)
+#define ADF_SIMPLE_BUFFER_ALLOC _IOW(ADF_IOCTL_TYPE, 8, \
+ struct adf_simple_buffer_alloc)
+#define ADF_ATTACH _IOW(ADF_IOCTL_TYPE, 9, \
+ struct adf_attachment_config)
+#define ADF_DETACH _IOW(ADF_IOCTL_TYPE, 10, \
+ struct adf_attachment_config)
+
+#endif /* _UAPI_VIDEO_ADF_H_ */
diff --git a/include/video/adf.h b/include/video/adf.h
new file mode 100644
index 000000000000..34f10e538f9e
--- /dev/null
+++ b/include/video/adf.h
@@ -0,0 +1,502 @@
+/*
+ * Copyright (C) 2013 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _VIDEO_ADF_H
+#define _VIDEO_ADF_H
+
+#include <linux/device.h>
+#include <linux/dma-buf.h>
+#include <linux/idr.h>
+#include <linux/kref.h>
+#include <linux/kthread.h>
+#include <linux/ktime.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/scatterlist.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/wait.h>
+#include <linux/workqueue.h>
+#include <uapi/video/adf.h>
+#include "sync.h"
+
+struct adf_obj;
+struct adf_obj_ops;
+struct adf_device;
+struct adf_device_ops;
+struct adf_interface;
+struct adf_interface_ops;
+struct adf_overlay_engine;
+struct adf_overlay_engine_ops;
+
+/**
+ * struct adf_buffer - buffer displayed by adf_post
+ *
+ * @overlay_engine: target overlay engine
+ * @w: width of display region in pixels
+ * @h: height of display region in pixels
+ * @format: DRM-style fourcc, see drm_fourcc.h for standard formats
+ * @dma_bufs: dma_buf for each plane
+ * @offset: location of first pixel to scan out, in bytes
+ * @pitch: length of a scanline including padding, in bytes
+ * @n_planes: number of planes in buffer
+ * @acquire_fence: sync_fence which will clear when the buffer is
+ * ready for display
+ *
+ * &struct adf_buffer is the in-kernel counterpart to the userspace-facing
+ * &struct adf_buffer_config.
+ */
+struct adf_buffer {
+ struct adf_overlay_engine *overlay_engine;
+
+ u32 w;
+ u32 h;
+ u32 format;
+
+ struct dma_buf *dma_bufs[ADF_MAX_PLANES];
+ u32 offset[ADF_MAX_PLANES];
+ u32 pitch[ADF_MAX_PLANES];
+ u8 n_planes;
+
+ struct sync_fence *acquire_fence;
+};
+
+/**
+ * struct adf_buffer_mapping - state for mapping a &struct adf_buffer into the
+ * display device
+ *
+ * @attachments: dma-buf attachment for each plane
+ * @sg_tables: SG tables for each plane
+ */
+struct adf_buffer_mapping {
+ struct dma_buf_attachment *attachments[ADF_MAX_PLANES];
+ struct sg_table *sg_tables[ADF_MAX_PLANES];
+};
+
+/**
+ * struct adf_post - request to flip to a new set of buffers
+ *
+ * @n_bufs: number of buffers displayed
+ * @bufs: buffers displayed
+ * @mappings: in-device mapping state for each buffer
+ * @custom_data_size: size of driver-private data
+ * @custom_data: driver-private data
+ *
+ * &struct adf_post is the in-kernel counterpart to the userspace-facing
+ * &struct adf_post_config.
+ */
+struct adf_post {
+ size_t n_bufs;
+ struct adf_buffer *bufs;
+ struct adf_buffer_mapping *mappings;
+
+ size_t custom_data_size;
+ void *custom_data;
+};
+
+/**
+ * struct adf_attachment - description of attachment between an overlay engine
+ * and an interface
+ *
+ * @overlay_engine: the overlay engine
+ * @interface: the interface
+ *
+ * &struct adf_attachment is the in-kernel counterpart to the userspace-facing
+ * &struct adf_attachment_config.
+ */
+struct adf_attachment {
+ struct adf_overlay_engine *overlay_engine;
+ struct adf_interface *interface;
+};
+
+struct adf_pending_post {
+ struct list_head head;
+ struct adf_post config;
+ void *state;
+};
+
+enum adf_obj_type {
+ ADF_OBJ_OVERLAY_ENGINE = 0,
+ ADF_OBJ_INTERFACE = 1,
+ ADF_OBJ_DEVICE = 2,
+};
+
+/**
+ * struct adf_obj_ops - common ADF object implementation ops
+ *
+ * @open: handle opening the object's device node
+ * @release: handle releasing an open file
+ * @ioctl: handle custom ioctls
+ *
+ * @supports_event: return whether the object supports generating events of type
+ * @type
+ * @set_event: enable or disable events of type @type
+ * @event_type_str: return a string representation of custom event @type
+ * (@type >= %ADF_EVENT_DEVICE_CUSTOM).
+ *
+ * @custom_data: copy up to %ADF_MAX_CUSTOM_DATA_SIZE bytes of driver-private
+ * data into @data (allocated by ADF) and return the number of copied bytes
+ * in @size. Return 0 on success or an error code (<0) on failure.
+ */
+struct adf_obj_ops {
+ /* optional */
+ int (*open)(struct adf_obj *obj, struct inode *inode,
+ struct file *file);
+ /* optional */
+ void (*release)(struct adf_obj *obj, struct inode *inode,
+ struct file *file);
+ /* optional */
+ long (*ioctl)(struct adf_obj *obj, unsigned int cmd, unsigned long arg);
+
+ /* optional */
+ bool (*supports_event)(struct adf_obj *obj, enum adf_event_type type);
+ /* required if supports_event is implemented */
+ void (*set_event)(struct adf_obj *obj, enum adf_event_type type,
+ bool enabled);
+ /* optional */
+ const char *(*event_type_str)(struct adf_obj *obj,
+ enum adf_event_type type);
+
+ /* optional */
+ int (*custom_data)(struct adf_obj *obj, void *data, size_t *size);
+};
+
+struct adf_obj {
+ enum adf_obj_type type;
+ char name[ADF_NAME_LEN];
+ struct adf_device *parent;
+
+ const struct adf_obj_ops *ops;
+
+ struct device dev;
+
+ struct spinlock file_lock;
+ struct list_head file_list;
+
+ struct mutex event_lock;
+ struct rb_root event_refcount;
+
+ int id;
+ int minor;
+};
+
+/**
+ * struct adf_device_quirks - common display device quirks
+ *
+ * @buffer_padding: whether the last scanline of a buffer extends to the
+ * buffer's pitch (@ADF_BUFFER_PADDED_TO_PITCH) or just to the visible
+ * width (@ADF_BUFFER_UNPADDED)
+ */
+struct adf_device_quirks {
+ /* optional, defaults to ADF_BUFFER_PADDED_TO_PITCH */
+ enum {
+ ADF_BUFFER_PADDED_TO_PITCH = 0,
+ ADF_BUFFER_UNPADDED = 1,
+ } buffer_padding;
+};
+
+/**
+ * struct adf_device_ops - display device implementation ops
+ *
+ * @owner: device's module
+ * @base: common operations (see &struct adf_obj_ops)
+ * @quirks: device's quirks (see &struct adf_device_quirks)
+ *
+ * @attach: attach overlay engine @eng to interface @intf. Return 0 on success
+ * or error code (<0) on failure.
+ * @detach: detach overlay engine @eng from interface @intf. Return 0 on
+ * success or error code (<0) on failure.
+ *
+ * @validate_custom_format: validate the number and size of planes
+ * in buffers with a custom format (i.e., not one of the @DRM_FORMAT_*
+ * types defined in drm/drm_fourcc.h). Return 0 if the buffer is valid or
+ * an error code (<0) otherwise.
+ *
+ * @validate: validate that the proposed configuration @cfg is legal. The
+ * driver may optionally allocate and return some driver-private state in
+ * @driver_state, which will be passed to the corresponding post(). The
+ * driver may NOT commit any changes to hardware. Return 0 if @cfg is
+ * valid or an error code (<0) otherwise.
+ * @complete_fence: create a hardware-backed sync fence to be signaled when
+ * @cfg is removed from the screen. If unimplemented, ADF automatically
+ * creates an sw_sync fence. Return the sync fence on success or a
+ * PTR_ERR() on failure.
+ * @post: flip @cfg onto the screen. Wait for the display to begin scanning out
+ * @cfg before returning.
+ * @advance_timeline: signal the sync fence for the last configuration to leave
+ * the display. If unimplemented, ADF automatically advances an sw_sync
+ * timeline.
+ * @state_free: free driver-private state allocated during validate()
+ */
+struct adf_device_ops {
+ /* required */
+ struct module *owner;
+ const struct adf_obj_ops base;
+ /* optional */
+ const struct adf_device_quirks quirks;
+
+ /* optional */
+ int (*attach)(struct adf_device *dev, struct adf_overlay_engine *eng,
+ struct adf_interface *intf);
+ /* optional */
+ int (*detach)(struct adf_device *dev, struct adf_overlay_engine *eng,
+ struct adf_interface *intf);
+
+ /* required if any of the device's overlay engines supports at least one
+ custom format */
+ int (*validate_custom_format)(struct adf_device *dev,
+ struct adf_buffer *buf);
+
+ /* required */
+ int (*validate)(struct adf_device *dev, struct adf_post *cfg,
+ void **driver_state);
+ /* optional */
+ struct sync_fence *(*complete_fence)(struct adf_device *dev,
+ struct adf_post *cfg, void *driver_state);
+ /* required */
+ void (*post)(struct adf_device *dev, struct adf_post *cfg,
+ void *driver_state);
+ /* required if complete_fence is implemented */
+ void (*advance_timeline)(struct adf_device *dev,
+ struct adf_post *cfg, void *driver_state);
+ /* required if validate allocates driver state */
+ void (*state_free)(struct adf_device *dev, void *driver_state);
+};
+
+struct adf_attachment_list {
+ struct adf_attachment attachment;
+ struct list_head head;
+};
+
+struct adf_device {
+ struct adf_obj base;
+ struct device *dev;
+
+ const struct adf_device_ops *ops;
+
+ struct mutex client_lock;
+
+ struct idr interfaces;
+ size_t n_interfaces;
+ struct idr overlay_engines;
+
+ struct list_head post_list;
+ struct mutex post_lock;
+ struct kthread_worker post_worker;
+ struct task_struct *post_thread;
+ struct kthread_work post_work;
+
+ struct list_head attached;
+ size_t n_attached;
+ struct list_head attach_allowed;
+ size_t n_attach_allowed;
+
+ struct adf_pending_post *onscreen;
+
+ struct sw_sync_timeline *timeline;
+ int timeline_max;
+};
+
+/**
+ * struct adf_interface_ops - display interface implementation ops
+ *
+ * @base: common operations (see &struct adf_obj_ops)
+ *
+ * @blank: change the display's DPMS state. Return 0 on success or error
+ * code (<0) on failure.
+ *
+ * @alloc_simple_buffer: allocate a buffer with the specified @w, @h, and
+ * @format. @format will be a standard RGB format (i.e.,
+ * adf_format_is_rgb(@format) == true). Return 0 on success or error code
+ * (<0) on failure. On success, return the buffer, offset, and pitch in
+ * @dma_buf, @offset, and @pitch respectively.
+ * @describe_simple_post: provide driver-private data needed to post a single
+ * buffer @buf. Copy up to ADF_MAX_CUSTOM_DATA_SIZE bytes into @data
+ * (allocated by ADF) and return the number of bytes in @size. Return 0 on
+ * success or error code (<0) on failure.
+ *
+ * @modeset: change the interface's mode. @mode is not necessarily part of the
+ * modelist passed to adf_hotplug_notify_connected(); the driver may
+ * accept or reject custom modes at its discretion. Return 0 on success or
+ * error code (<0) if the mode could not be set.
+ *
+ * @screen_size: copy the screen dimensions in millimeters into @width_mm
+ * and @height_mm. Return 0 on success or error code (<0) if the display
+ * dimensions are unknown.
+ *
+ * @type_str: return a string representation of custom @intf->type
+ * (@intf->type >= @ADF_INTF_TYPE_DEVICE_CUSTOM).
+ */
+struct adf_interface_ops {
+ const struct adf_obj_ops base;
+
+ /* optional */
+ int (*blank)(struct adf_interface *intf, u8 state);
+
+ /* optional */
+ int (*alloc_simple_buffer)(struct adf_interface *intf,
+ u16 w, u16 h, u32 format,
+ struct dma_buf **dma_buf, u32 *offset, u32 *pitch);
+ /* optional */
+ int (*describe_simple_post)(struct adf_interface *intf,
+ struct adf_buffer *fb, void *data, size_t *size);
+
+ /* optional */
+ int (*modeset)(struct adf_interface *intf,
+ struct drm_mode_modeinfo *mode);
+
+ /* optional */
+ int (*screen_size)(struct adf_interface *intf, u16 *width_mm,
+ u16 *height_mm);
+
+ /* optional */
+ const char *(*type_str)(struct adf_interface *intf);
+};
+
+struct adf_interface {
+ struct adf_obj base;
+ const struct adf_interface_ops *ops;
+
+ struct drm_mode_modeinfo current_mode;
+
+ enum adf_interface_type type;
+ u32 idx;
+ u32 flags;
+
+ wait_queue_head_t vsync_wait;
+ ktime_t vsync_timestamp;
+ rwlock_t vsync_lock;
+
+ u8 dpms_state;
+
+ bool hotplug_detect;
+ struct drm_mode_modeinfo *modelist;
+ size_t n_modes;
+ rwlock_t hotplug_modelist_lock;
+};
+
+/**
+ * struct adf_interface_ops - overlay engine implementation ops
+ *
+ * @base: common operations (see &struct adf_obj_ops)
+ *
+ * @supported_formats: list of fourccs the overlay engine can scan out
+ * @n_supported_formats: length of supported_formats, up to
+ * ADF_MAX_SUPPORTED_FORMATS
+ */
+struct adf_overlay_engine_ops {
+ const struct adf_obj_ops base;
+
+ /* required */
+ const u32 *supported_formats;
+ /* required */
+ const size_t n_supported_formats;
+};
+
+struct adf_overlay_engine {
+ struct adf_obj base;
+
+ const struct adf_overlay_engine_ops *ops;
+};
+
+#define adf_obj_to_device(ptr) \
+ container_of((ptr), struct adf_device, base)
+
+#define adf_obj_to_interface(ptr) \
+ container_of((ptr), struct adf_interface, base)
+
+#define adf_obj_to_overlay_engine(ptr) \
+ container_of((ptr), struct adf_overlay_engine, base)
+
+int __printf(4, 5) adf_device_init(struct adf_device *dev,
+ struct device *parent, const struct adf_device_ops *ops,
+ const char *fmt, ...);
+void adf_device_destroy(struct adf_device *dev);
+int __printf(7, 8) adf_interface_init(struct adf_interface *intf,
+ struct adf_device *dev, enum adf_interface_type type, u32 idx,
+ u32 flags, const struct adf_interface_ops *ops, const char *fmt,
+ ...);
+void adf_interface_destroy(struct adf_interface *intf);
+static inline struct adf_device *adf_interface_parent(
+ struct adf_interface *intf)
+{
+ return intf->base.parent;
+}
+int __printf(4, 5) adf_overlay_engine_init(struct adf_overlay_engine *eng,
+ struct adf_device *dev,
+ const struct adf_overlay_engine_ops *ops, const char *fmt, ...);
+void adf_overlay_engine_destroy(struct adf_overlay_engine *eng);
+static inline struct adf_device *adf_overlay_engine_parent(
+ struct adf_overlay_engine *eng)
+{
+ return eng->base.parent;
+}
+
+int adf_attachment_allow(struct adf_device *dev, struct adf_overlay_engine *eng,
+ struct adf_interface *intf);
+
+const char *adf_obj_type_str(enum adf_obj_type type);
+const char *adf_interface_type_str(struct adf_interface *intf);
+const char *adf_event_type_str(struct adf_obj *obj, enum adf_event_type type);
+
+#define ADF_FORMAT_STR_SIZE 5
+void adf_format_str(u32 format, char buf[ADF_FORMAT_STR_SIZE]);
+int adf_format_validate_yuv(struct adf_device *dev, struct adf_buffer *buf,
+ u8 num_planes, u8 hsub, u8 vsub, u8 cpp[]);
+/**
+ * adf_format_validate_rgb - validate the number and size of planes in buffers
+ * with a custom RGB format.
+ *
+ * @dev: ADF device performing the validation
+ * @buf: buffer to validate
+ * @cpp: expected bytes per pixel
+ *
+ * adf_format_validate_rgb() is intended to be called as a helper from @dev's
+ * validate_custom_format() op. @buf must have a single RGB plane.
+ *
+ * Returns 0 if @buf has a single plane with sufficient size, or -EINVAL
+ * otherwise.
+ */
+static inline int adf_format_validate_rgb(struct adf_device *dev,
+ struct adf_buffer *buf, u8 cpp)
+{
+ return adf_format_validate_yuv(dev, buf, 1, 1, 1, &cpp);
+}
+
+int adf_event_get(struct adf_obj *obj, enum adf_event_type type);
+int adf_event_put(struct adf_obj *obj, enum adf_event_type type);
+int adf_event_notify(struct adf_obj *obj, struct adf_event *event);
+
+static inline void adf_vsync_get(struct adf_interface *intf)
+{
+ adf_event_get(&intf->base, ADF_EVENT_VSYNC);
+}
+
+static inline void adf_vsync_put(struct adf_interface *intf)
+{
+ adf_event_put(&intf->base, ADF_EVENT_VSYNC);
+}
+
+int adf_vsync_wait(struct adf_interface *intf, long timeout);
+void adf_vsync_notify(struct adf_interface *intf, ktime_t timestamp);
+
+int adf_hotplug_notify_connected(struct adf_interface *intf,
+ struct drm_mode_modeinfo *modelist, size_t n_modes);
+void adf_hotplug_notify_disconnected(struct adf_interface *intf);
+
+void adf_modeinfo_set_name(struct drm_mode_modeinfo *mode);
+void adf_modeinfo_set_vrefresh(struct drm_mode_modeinfo *mode);
+
+#endif /* _VIDEO_ADF_H */
diff --git a/include/video/adf_client.h b/include/video/adf_client.h
new file mode 100644
index 000000000000..983f2b6a5890
--- /dev/null
+++ b/include/video/adf_client.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2013 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _VIDEO_ADF_CLIENT_H_
+#define _VIDEO_ADF_CLIENT_H_
+
+#include <video/adf.h>
+
+int adf_interface_blank(struct adf_interface *intf, u8 state);
+u8 adf_interface_dpms_state(struct adf_interface *intf);
+
+void adf_interface_current_mode(struct adf_interface *intf,
+ struct drm_mode_modeinfo *mode);
+size_t adf_interface_modelist(struct adf_interface *intf,
+ struct drm_mode_modeinfo *modelist, size_t n_modes);
+int adf_interface_set_mode(struct adf_interface *intf,
+ struct drm_mode_modeinfo *mode);
+int adf_interface_get_screen_size(struct adf_interface *intf, u16 *width,
+ u16 *height);
+int adf_interface_simple_buffer_alloc(struct adf_interface *intf, u16 w, u16 h,
+ u32 format, struct dma_buf **dma_buf, u32 *offset, u32 *pitch);
+struct sync_fence *adf_interface_simple_post(struct adf_interface *intf,
+ struct adf_buffer *buf);
+
+bool adf_overlay_engine_supports_format(struct adf_overlay_engine *eng,
+ u32 format);
+
+size_t adf_device_attachments(struct adf_device *dev,
+ struct adf_attachment *attachments, size_t n_attachments);
+size_t adf_device_attachments_allowed(struct adf_device *dev,
+ struct adf_attachment *attachments, size_t n_attachments);
+bool adf_device_attached(struct adf_device *dev, struct adf_overlay_engine *eng,
+ struct adf_interface *intf);
+bool adf_device_attach_allowed(struct adf_device *dev,
+ struct adf_overlay_engine *eng, struct adf_interface *intf);
+int adf_device_attach(struct adf_device *dev, struct adf_overlay_engine *eng,
+ struct adf_interface *intf);
+int adf_device_detach(struct adf_device *dev, struct adf_overlay_engine *eng,
+ struct adf_interface *intf);
+
+struct sync_fence *adf_device_post(struct adf_device *dev,
+ struct adf_interface **intfs, size_t n_intfs,
+ struct adf_buffer *bufs, size_t n_bufs, void *custom_data,
+ size_t custom_data_size);
+struct sync_fence *adf_device_post_nocopy(struct adf_device *dev,
+ struct adf_interface **intfs, size_t n_intfs,
+ struct adf_buffer *bufs, size_t n_bufs, void *custom_data,
+ size_t custom_data_size);
+
+#endif /* _VIDEO_ADF_CLIENT_H_ */
diff --git a/include/video/adf_fbdev.h b/include/video/adf_fbdev.h
new file mode 100644
index 000000000000..b722c6b3ab02
--- /dev/null
+++ b/include/video/adf_fbdev.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (C) 2013 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _VIDEO_ADF_FBDEV_H_
+#define _VIDEO_ADF_FBDEV_H_
+
+#include <linux/fb.h>
+#include <linux/mutex.h>
+#include <video/adf.h>
+
+struct adf_fbdev {
+ struct adf_interface *intf;
+ struct adf_overlay_engine *eng;
+ struct fb_info *info;
+ u32 pseudo_palette[16];
+
+ unsigned int refcount;
+ struct mutex refcount_lock;
+
+ struct dma_buf *dma_buf;
+ u32 offset;
+ u32 pitch;
+ void *vaddr;
+ u32 format;
+
+ u16 default_xres_virtual;
+ u16 default_yres_virtual;
+ u32 default_format;
+};
+
+#if IS_ENABLED(CONFIG_ADF_FBDEV)
+void adf_modeinfo_to_fb_videomode(const struct drm_mode_modeinfo *mode,
+ struct fb_videomode *vmode);
+void adf_modeinfo_from_fb_videomode(const struct fb_videomode *vmode,
+ struct drm_mode_modeinfo *mode);
+
+int adf_fbdev_init(struct adf_fbdev *fbdev, struct adf_interface *interface,
+ struct adf_overlay_engine *eng,
+ u16 xres_virtual, u16 yres_virtual, u32 format,
+ struct fb_ops *fbops, const char *fmt, ...);
+void adf_fbdev_destroy(struct adf_fbdev *fbdev);
+
+int adf_fbdev_open(struct fb_info *info, int user);
+int adf_fbdev_release(struct fb_info *info, int user);
+int adf_fbdev_check_var(struct fb_var_screeninfo *var, struct fb_info *info);
+int adf_fbdev_set_par(struct fb_info *info);
+int adf_fbdev_blank(int blank, struct fb_info *info);
+int adf_fbdev_pan_display(struct fb_var_screeninfo *var, struct fb_info *info);
+int adf_fbdev_mmap(struct fb_info *info, struct vm_area_struct *vma);
+#else
+static inline void adf_modeinfo_to_fb_videomode(const struct drm_mode_modeinfo *mode,
+ struct fb_videomode *vmode)
+{
+ WARN_ONCE(1, "%s: CONFIG_ADF_FBDEV is disabled\n", __func__);
+}
+
+static inline void adf_modeinfo_from_fb_videomode(const struct fb_videomode *vmode,
+ struct drm_mode_modeinfo *mode)
+{
+ WARN_ONCE(1, "%s: CONFIG_ADF_FBDEV is disabled\n", __func__);
+}
+
+static inline int adf_fbdev_init(struct adf_fbdev *fbdev,
+ struct adf_interface *interface,
+ struct adf_overlay_engine *eng,
+ u16 xres_virtual, u16 yres_virtual, u32 format,
+ struct fb_ops *fbops, const char *fmt, ...)
+{
+ return -ENODEV;
+}
+
+static inline void adf_fbdev_destroy(struct adf_fbdev *fbdev) { }
+
+static inline int adf_fbdev_open(struct fb_info *info, int user)
+{
+ return -ENODEV;
+}
+
+static inline int adf_fbdev_release(struct fb_info *info, int user)
+{
+ return -ENODEV;
+}
+
+static inline int adf_fbdev_check_var(struct fb_var_screeninfo *var,
+ struct fb_info *info)
+{
+ return -ENODEV;
+}
+
+static inline int adf_fbdev_set_par(struct fb_info *info)
+{
+ return -ENODEV;
+}
+
+static inline int adf_fbdev_blank(int blank, struct fb_info *info)
+{
+ return -ENODEV;
+}
+
+static inline int adf_fbdev_pan_display(struct fb_var_screeninfo *var,
+ struct fb_info *info)
+{
+ return -ENODEV;
+}
+
+static inline int adf_fbdev_mmap(struct fb_info *info,
+ struct vm_area_struct *vma)
+{
+ return -ENODEV;
+}
+#endif
+
+#endif /* _VIDEO_ADF_FBDEV_H_ */
diff --git a/include/video/adf_format.h b/include/video/adf_format.h
new file mode 100644
index 000000000000..e03182cdcb06
--- /dev/null
+++ b/include/video/adf_format.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (C) 2013 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _VIDEO_ADF_FORMAT_H
+#define _VIDEO_ADF_FORMAT_H
+
+bool adf_format_is_standard(u32 format);
+bool adf_format_is_rgb(u32 format);
+u8 adf_format_num_planes(u32 format);
+u8 adf_format_bpp(u32 format);
+u8 adf_format_plane_cpp(u32 format, int plane);
+u8 adf_format_horz_chroma_subsampling(u32 format);
+u8 adf_format_vert_chroma_subsampling(u32 format);
+
+#endif /* _VIDEO_ADF_FORMAT_H */
diff --git a/include/video/adf_memblock.h b/include/video/adf_memblock.h
new file mode 100644
index 000000000000..6256e0eebcc9
--- /dev/null
+++ b/include/video/adf_memblock.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (C) 2013 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _VIDEO_ADF_MEMBLOCK_H_
+#define _VIDEO_ADF_MEMBLOCK_H_
+
+struct dma_buf *adf_memblock_export(phys_addr_t base, size_t size, int flags);
+
+#endif /* _VIDEO_ADF_MEMBLOCK_H_ */
diff --git a/init/Kconfig b/init/Kconfig
index 2081a4d3d917..10913ea8d4b7 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -392,6 +392,15 @@ config IRQ_TIME_ACCOUNTING
endchoice
+config SCHED_WALT
+ bool "Support window based load tracking"
+ depends on SMP
+ help
+ This feature will allow the scheduler to maintain a tunable window
+ based set of metrics for tasks and runqueues. These metrics can be
+ used to guide task placement as well as task frequency requirements
+ for cpufreq governors.
+
config BSD_PROCESS_ACCT
bool "BSD Process Accounting"
help
@@ -985,6 +994,23 @@ config RESOURCE_COUNTERS
This option enables controller independent resource accounting
infrastructure that works with cgroups.
+config CGROUP_SCHEDTUNE
+ bool "CFS tasks boosting cgroup subsystem (EXPERIMENTAL)"
+ depends on SCHED_TUNE
+ help
+ This option provides the "schedtune" controller which improves the
+ flexibility of the task boosting mechanism by introducing the support
+ to define "per task" boost values.
+
+ This new controller:
+ 1. allows only a two layers hierarchy, where the root defines the
+ system-wide boost value and its direct childrens define each one a
+ different "class of tasks" to be boosted with a different value
+ 2. supports up to 16 different task classes, each one which could be
+ configured with a different boost value
+
+ Say N if unsure.
+
config MEMCG
bool "Memory Resource Controller for Control Groups"
depends on RESOURCE_COUNTERS
@@ -1230,6 +1256,33 @@ config SCHED_AUTOGROUP
desktop applications. Task group autogeneration is currently based
upon task session.
+config SCHED_TUNE
+ bool "Boosting for CFS tasks (EXPERIMENTAL)"
+ depends on SMP
+ help
+ This option enables the system-wide support for task boosting.
+ When this support is enabled a new sysctl interface is exposed to
+ userspace via:
+ /proc/sys/kernel/sched_cfs_boost
+ which allows to set a system-wide boost value in range [0..100].
+
+ The currently boosting strategy is implemented in such a way that:
+ - a 0% boost value requires to operate in "standard" mode by
+ scheduling all tasks at the minimum capacities required by their
+ workload demand
+ - a 100% boost value requires to push at maximum the task
+ performances, "regardless" of the incurred energy consumption
+
+ A boost value in between these two boundaries is used to bias the
+ power/performance trade-off, the higher the boost value the more the
+ scheduler is biased toward performance boosting instead of energy
+ efficiency.
+
+ Since this support exposes a single system-wide knob, the specified
+ boost value is applied to all (CFS) tasks in the system.
+
+ If unsure, say N.
+
config SYSFS_DEPRECATED
bool "Enable deprecated sysfs features to support old userspace tools"
depends on SYSFS
@@ -1677,6 +1730,7 @@ choice
config SLAB
bool "SLAB"
+ select HAVE_HARDENED_USERCOPY_ALLOCATOR
help
The regular slab allocator that is established and known to work
well in all environments. It organizes cache hot objects in
@@ -1684,6 +1738,7 @@ config SLAB
config SLUB
bool "SLUB (Unqueued Allocator)"
+ select HAVE_HARDENED_USERCOPY_ALLOCATOR
help
SLUB is a slab allocator that minimizes cache line usage
instead of managing queues of cached objects (SLAB approach).
diff --git a/init/Makefile b/init/Makefile
index 7bc47ee31c36..243f61de2cba 100644
--- a/init/Makefile
+++ b/init/Makefile
@@ -3,11 +3,8 @@
#
obj-y := main.o version.o mounts.o
-ifneq ($(CONFIG_BLK_DEV_INITRD),y)
obj-y += noinitramfs.o
-else
obj-$(CONFIG_BLK_DEV_INITRD) += initramfs.o
-endif
obj-$(CONFIG_GENERIC_CALIBRATE_DELAY) += calibrate.o
ifneq ($(CONFIG_ARCH_INIT_TASK),y)
@@ -18,6 +15,7 @@ mounts-y := do_mounts.o
mounts-$(CONFIG_BLK_DEV_RAM) += do_mounts_rd.o
mounts-$(CONFIG_BLK_DEV_INITRD) += do_mounts_initrd.o
mounts-$(CONFIG_BLK_DEV_MD) += do_mounts_md.o
+mounts-$(CONFIG_BLK_DEV_DM) += do_mounts_dm.o
# dependencies on generated files need to be listed explicitly
$(obj)/version.o: include/generated/compile.h
diff --git a/init/do_mounts.c b/init/do_mounts.c
index 9b3565c41502..3f5d109fb545 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -207,7 +207,7 @@ done:
* bangs.
*/
-dev_t name_to_dev_t(char *name)
+dev_t name_to_dev_t(const char *name)
{
char s[32];
char *p;
@@ -286,6 +286,7 @@ fail:
done:
return res;
}
+EXPORT_SYMBOL_GPL(name_to_dev_t);
static int __init root_dev_setup(char *line)
{
@@ -556,6 +557,7 @@ void __init prepare_namespace(void)
wait_for_device_probe();
md_run_setup();
+ dm_run_setup();
if (saved_root_name[0]) {
root_device_name = saved_root_name;
diff --git a/init/do_mounts.h b/init/do_mounts.h
index f5b978a9bb92..09d22862e8c3 100644
--- a/init/do_mounts.h
+++ b/init/do_mounts.h
@@ -74,3 +74,13 @@ void md_run_setup(void);
static inline void md_run_setup(void) {}
#endif
+
+#ifdef CONFIG_BLK_DEV_DM
+
+void dm_run_setup(void);
+
+#else
+
+static inline void dm_run_setup(void) {}
+
+#endif
diff --git a/init/do_mounts_dm.c b/init/do_mounts_dm.c
new file mode 100644
index 000000000000..ecda58df9a19
--- /dev/null
+++ b/init/do_mounts_dm.c
@@ -0,0 +1,426 @@
+/* do_mounts_dm.c
+ * Copyright (C) 2010 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ * All Rights Reserved.
+ * Based on do_mounts_md.c
+ *
+ * This file is released under the GPL.
+ */
+#include <linux/device-mapper.h>
+#include <linux/fs.h>
+#include <linux/string.h>
+
+#include "do_mounts.h"
+#include "../drivers/md/dm.h"
+
+#define DM_MAX_NAME 32
+#define DM_MAX_UUID 129
+#define DM_NO_UUID "none"
+
+#define DM_MSG_PREFIX "init"
+
+/* Separators used for parsing the dm= argument. */
+#define DM_FIELD_SEP ' '
+#define DM_LINE_SEP ','
+
+/*
+ * When the device-mapper and any targets are compiled into the kernel
+ * (not a module), one target may be created and used as the root device at
+ * boot time with the parameters given with the boot line dm=...
+ * The code for that is here.
+ */
+
+struct dm_setup_target {
+ sector_t begin;
+ sector_t length;
+ char *type;
+ char *params;
+ /* simple singly linked list */
+ struct dm_setup_target *next;
+};
+
+static struct {
+ int minor;
+ int ro;
+ char name[DM_MAX_NAME];
+ char uuid[DM_MAX_UUID];
+ char *targets;
+ struct dm_setup_target *target;
+ int target_count;
+} dm_setup_args __initdata;
+
+static __initdata int dm_early_setup;
+
+static size_t __init get_dm_option(char *str, char **next, char sep)
+{
+ size_t len = 0;
+ char *endp = NULL;
+
+ if (!str)
+ return 0;
+
+ endp = strchr(str, sep);
+ if (!endp) { /* act like strchrnul */
+ len = strlen(str);
+ endp = str + len;
+ } else {
+ len = endp - str;
+ }
+
+ if (endp == str)
+ return 0;
+
+ if (!next)
+ return len;
+
+ if (*endp == 0) {
+ /* Don't advance past the nul. */
+ *next = endp;
+ } else {
+ *next = endp + 1;
+ }
+ return len;
+}
+
+static int __init dm_setup_args_init(void)
+{
+ dm_setup_args.minor = 0;
+ dm_setup_args.ro = 0;
+ dm_setup_args.target = NULL;
+ dm_setup_args.target_count = 0;
+ return 0;
+}
+
+static int __init dm_setup_cleanup(void)
+{
+ struct dm_setup_target *target = dm_setup_args.target;
+ struct dm_setup_target *old_target = NULL;
+ while (target) {
+ kfree(target->type);
+ kfree(target->params);
+ old_target = target;
+ target = target->next;
+ kfree(old_target);
+ dm_setup_args.target_count--;
+ }
+ BUG_ON(dm_setup_args.target_count);
+ return 0;
+}
+
+static char * __init dm_setup_parse_device_args(char *str)
+{
+ char *next = NULL;
+ size_t len = 0;
+
+ /* Grab the logical name of the device to be exported to udev */
+ len = get_dm_option(str, &next, DM_FIELD_SEP);
+ if (!len) {
+ DMERR("failed to parse device name");
+ goto parse_fail;
+ }
+ len = min(len + 1, sizeof(dm_setup_args.name));
+ strlcpy(dm_setup_args.name, str, len); /* includes nul */
+ str = skip_spaces(next);
+
+ /* Grab the UUID value or "none" */
+ len = get_dm_option(str, &next, DM_FIELD_SEP);
+ if (!len) {
+ DMERR("failed to parse device uuid");
+ goto parse_fail;
+ }
+ len = min(len + 1, sizeof(dm_setup_args.uuid));
+ strlcpy(dm_setup_args.uuid, str, len);
+ str = skip_spaces(next);
+
+ /* Determine if the table/device will be read only or read-write */
+ if (!strncmp("ro,", str, 3)) {
+ dm_setup_args.ro = 1;
+ } else if (!strncmp("rw,", str, 3)) {
+ dm_setup_args.ro = 0;
+ } else {
+ DMERR("failed to parse table mode");
+ goto parse_fail;
+ }
+ str = skip_spaces(str + 3);
+
+ return str;
+
+parse_fail:
+ return NULL;
+}
+
+static void __init dm_substitute_devices(char *str, size_t str_len)
+{
+ char *candidate = str;
+ char *candidate_end = str;
+ char old_char;
+ size_t len = 0;
+ dev_t dev;
+
+ if (str_len < 3)
+ return;
+
+ while (str && *str) {
+ candidate = strchr(str, '/');
+ if (!candidate)
+ break;
+
+ /* Avoid embedded slashes */
+ if (candidate != str && *(candidate - 1) != DM_FIELD_SEP) {
+ str = strchr(candidate, DM_FIELD_SEP);
+ continue;
+ }
+
+ len = get_dm_option(candidate, &candidate_end, DM_FIELD_SEP);
+ str = skip_spaces(candidate_end);
+ if (len < 3 || len > 37) /* name_to_dev_t max; maj:mix min */
+ continue;
+
+ /* Temporarily terminate with a nul */
+ if (*candidate_end)
+ candidate_end--;
+ old_char = *candidate_end;
+ *candidate_end = '\0';
+
+ DMDEBUG("converting candidate device '%s' to dev_t", candidate);
+ /* Use the boot-time specific device naming */
+ dev = name_to_dev_t(candidate);
+ *candidate_end = old_char;
+
+ DMDEBUG(" -> %u", dev);
+ /* No suitable replacement found */
+ if (!dev)
+ continue;
+
+ /* Rewrite the /dev/path as a major:minor */
+ len = snprintf(candidate, len, "%u:%u", MAJOR(dev), MINOR(dev));
+ if (!len) {
+ DMERR("error substituting device major/minor.");
+ break;
+ }
+ candidate += len;
+ /* Pad out with spaces (fixing our nul) */
+ while (candidate < candidate_end)
+ *(candidate++) = DM_FIELD_SEP;
+ }
+}
+
+static int __init dm_setup_parse_targets(char *str)
+{
+ char *next = NULL;
+ size_t len = 0;
+ struct dm_setup_target **target = NULL;
+
+ /* Targets are defined as per the table format but with a
+ * comma as a newline separator. */
+ target = &dm_setup_args.target;
+ while (str && *str) {
+ *target = kzalloc(sizeof(struct dm_setup_target), GFP_KERNEL);
+ if (!*target) {
+ DMERR("failed to allocate memory for target %d",
+ dm_setup_args.target_count);
+ goto parse_fail;
+ }
+ dm_setup_args.target_count++;
+
+ (*target)->begin = simple_strtoull(str, &next, 10);
+ if (!next || *next != DM_FIELD_SEP) {
+ DMERR("failed to parse starting sector for target %d",
+ dm_setup_args.target_count - 1);
+ goto parse_fail;
+ }
+ str = skip_spaces(next + 1);
+
+ (*target)->length = simple_strtoull(str, &next, 10);
+ if (!next || *next != DM_FIELD_SEP) {
+ DMERR("failed to parse length for target %d",
+ dm_setup_args.target_count - 1);
+ goto parse_fail;
+ }
+ str = skip_spaces(next + 1);
+
+ len = get_dm_option(str, &next, DM_FIELD_SEP);
+ if (!len ||
+ !((*target)->type = kstrndup(str, len, GFP_KERNEL))) {
+ DMERR("failed to parse type for target %d",
+ dm_setup_args.target_count - 1);
+ goto parse_fail;
+ }
+ str = skip_spaces(next);
+
+ len = get_dm_option(str, &next, DM_LINE_SEP);
+ if (!len ||
+ !((*target)->params = kstrndup(str, len, GFP_KERNEL))) {
+ DMERR("failed to parse params for target %d",
+ dm_setup_args.target_count - 1);
+ goto parse_fail;
+ }
+ str = skip_spaces(next);
+
+ /* Before moving on, walk through the copied target and
+ * attempt to replace all /dev/xxx with the major:minor number.
+ * It may not be possible to resolve them traditionally at
+ * boot-time. */
+ dm_substitute_devices((*target)->params, len);
+
+ target = &((*target)->next);
+ }
+ DMDEBUG("parsed %d targets", dm_setup_args.target_count);
+
+ return 0;
+
+parse_fail:
+ return 1;
+}
+
+/*
+ * Parse the command-line parameters given our kernel, but do not
+ * actually try to invoke the DM device now; that is handled by
+ * dm_setup_drive after the low-level disk drivers have initialised.
+ * dm format is as follows:
+ * dm="name uuid fmode,[table line 1],[table line 2],..."
+ * May be used with root=/dev/dm-0 as it always uses the first dm minor.
+ */
+
+static int __init dm_setup(char *str)
+{
+ dm_setup_args_init();
+
+ str = dm_setup_parse_device_args(str);
+ if (!str) {
+ DMDEBUG("str is NULL");
+ goto parse_fail;
+ }
+
+ /* Target parsing is delayed until we have dynamic memory */
+ dm_setup_args.targets = str;
+
+ printk(KERN_INFO "dm: will configure '%s' on dm-%d\n",
+ dm_setup_args.name, dm_setup_args.minor);
+
+ dm_early_setup = 1;
+ return 1;
+
+parse_fail:
+ printk(KERN_WARNING "dm: Invalid arguments supplied to dm=.\n");
+ return 0;
+}
+
+
+static void __init dm_setup_drive(void)
+{
+ struct mapped_device *md = NULL;
+ struct dm_table *table = NULL;
+ struct dm_setup_target *target;
+ char *uuid = dm_setup_args.uuid;
+ fmode_t fmode = FMODE_READ;
+
+ /* Finish parsing the targets. */
+ if (dm_setup_parse_targets(dm_setup_args.targets))
+ goto parse_fail;
+
+ if (dm_create(dm_setup_args.minor, &md)) {
+ DMDEBUG("failed to create the device");
+ goto dm_create_fail;
+ }
+ DMDEBUG("created device '%s'", dm_device_name(md));
+
+ /* In addition to flagging the table below, the disk must be
+ * set explicitly ro/rw. */
+ set_disk_ro(dm_disk(md), dm_setup_args.ro);
+
+ if (!dm_setup_args.ro)
+ fmode |= FMODE_WRITE;
+ if (dm_table_create(&table, fmode, dm_setup_args.target_count, md)) {
+ DMDEBUG("failed to create the table");
+ goto dm_table_create_fail;
+ }
+
+ dm_lock_md_type(md);
+ target = dm_setup_args.target;
+ while (target) {
+ DMINFO("adding target '%llu %llu %s %s'",
+ (unsigned long long) target->begin,
+ (unsigned long long) target->length, target->type,
+ target->params);
+ if (dm_table_add_target(table, target->type, target->begin,
+ target->length, target->params)) {
+ DMDEBUG("failed to add the target to the table");
+ goto add_target_fail;
+ }
+ target = target->next;
+ }
+
+ if (dm_table_complete(table)) {
+ DMDEBUG("failed to complete the table");
+ goto table_complete_fail;
+ }
+
+ if (dm_get_md_type(md) == DM_TYPE_NONE) {
+ dm_set_md_type(md, dm_table_get_type(table));
+ if (dm_setup_md_queue(md)) {
+ DMWARN("unable to set up device queue for new table.");
+ goto setup_md_queue_fail;
+ }
+ } else if (dm_get_md_type(md) != dm_table_get_type(table)) {
+ DMWARN("can't change device type after initial table load.");
+ goto setup_md_queue_fail;
+ }
+
+ /* Suspend the device so that we can bind it to the table. */
+ if (dm_suspend(md, 0)) {
+ DMDEBUG("failed to suspend the device pre-bind");
+ goto suspend_fail;
+ }
+
+ /* Bind the table to the device. This is the only way to associate
+ * md->map with the table and set the disk capacity directly. */
+ if (dm_swap_table(md, table)) { /* should return NULL. */
+ DMDEBUG("failed to bind the device to the table");
+ goto table_bind_fail;
+ }
+
+ /* Finally, resume and the device should be ready. */
+ if (dm_resume(md)) {
+ DMDEBUG("failed to resume the device");
+ goto resume_fail;
+ }
+
+ /* Export the dm device via the ioctl interface */
+ if (!strcmp(DM_NO_UUID, dm_setup_args.uuid))
+ uuid = NULL;
+ if (dm_ioctl_export(md, dm_setup_args.name, uuid)) {
+ DMDEBUG("failed to export device with given name and uuid");
+ goto export_fail;
+ }
+ printk(KERN_INFO "dm: dm-%d is ready\n", dm_setup_args.minor);
+
+ dm_unlock_md_type(md);
+ dm_setup_cleanup();
+ return;
+
+export_fail:
+resume_fail:
+table_bind_fail:
+suspend_fail:
+setup_md_queue_fail:
+table_complete_fail:
+add_target_fail:
+ dm_unlock_md_type(md);
+dm_table_create_fail:
+ dm_put(md);
+dm_create_fail:
+ dm_setup_cleanup();
+parse_fail:
+ printk(KERN_WARNING "dm: starting dm-%d (%s) failed\n",
+ dm_setup_args.minor, dm_setup_args.name);
+}
+
+__setup("dm=", dm_setup);
+
+void __init dm_run_setup(void)
+{
+ if (!dm_early_setup)
+ return;
+ printk(KERN_INFO "dm: attempting early device configuration.\n");
+ dm_setup_drive();
+}
diff --git a/init/initramfs.c b/init/initramfs.c
index ad1bd7787bbb..d39079e690fa 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -18,6 +18,7 @@
#include <linux/dirent.h>
#include <linux/syscalls.h>
#include <linux/utime.h>
+#include <linux/initramfs.h>
static ssize_t __init xwrite(int fd, const char *p, size_t count)
{
@@ -605,9 +606,25 @@ static void __init clean_rootfs(void)
}
#endif
+static int __initdata do_skip_initramfs;
+
+static int __init skip_initramfs_param(char *str)
+{
+ if (*str)
+ return 0;
+ do_skip_initramfs = 1;
+ return 1;
+}
+__setup("skip_initramfs", skip_initramfs_param);
+
static int __init populate_rootfs(void)
{
- char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size);
+ char *err;
+
+ if (do_skip_initramfs)
+ return default_rootfs();
+
+ err = unpack_to_rootfs(__initramfs_start, __initramfs_size);
if (err)
panic("%s", err); /* Failed to decompress INTERNAL initramfs */
if (initrd_start) {
diff --git a/init/main.c b/init/main.c
index 32940a68ea48..414c3f94697e 100644
--- a/init/main.c
+++ b/init/main.c
@@ -94,9 +94,6 @@ static int kernel_init(void *);
extern void init_IRQ(void);
extern void fork_init(unsigned long);
extern void radix_tree_init(void);
-#ifndef CONFIG_DEBUG_RODATA
-static inline void mark_rodata_ro(void) { }
-#endif
/*
* Debug helper: via this flag we know that we are in 'early bootup code'
@@ -577,6 +574,10 @@ asmlinkage __visible void __init start_kernel(void)
local_irq_disable();
idr_init_cache();
rcu_init();
+
+ /* trace_printk() and trace points may be used after this */
+ trace_init();
+
context_tracking_init();
radix_tree_init();
/* init some links before init_ISA_irqs() */
@@ -928,6 +929,28 @@ static int try_to_run_init_process(const char *init_filename)
static noinline void __init kernel_init_freeable(void);
+#ifdef CONFIG_DEBUG_RODATA
+static bool rodata_enabled = true;
+static int __init set_debug_rodata(char *str)
+{
+ return strtobool(str, &rodata_enabled);
+}
+__setup("rodata=", set_debug_rodata);
+
+static void mark_readonly(void)
+{
+ if (rodata_enabled)
+ mark_rodata_ro();
+ else
+ pr_info("Kernel memory protection disabled.\n");
+}
+#else
+static inline void mark_readonly(void)
+{
+ pr_warn("This architecture does not have kernel memory protection.\n");
+}
+#endif
+
static int __ref kernel_init(void *unused)
{
int ret;
@@ -936,7 +959,7 @@ static int __ref kernel_init(void *unused)
/* need to finish all async __init code before freeing the memory */
async_synchronize_full();
free_initmem();
- mark_rodata_ro();
+ mark_readonly();
system_state = SYSTEM_RUNNING;
numa_default_policy();
diff --git a/init/noinitramfs.c b/init/noinitramfs.c
index 267739d85179..bcc8bcb053ee 100644
--- a/init/noinitramfs.c
+++ b/init/noinitramfs.c
@@ -21,11 +21,16 @@
#include <linux/stat.h>
#include <linux/kdev_t.h>
#include <linux/syscalls.h>
+#include <linux/kconfig.h>
+#include <linux/initramfs.h>
/*
* Create a simple rootfs that is similar to the default initramfs
*/
-static int __init default_rootfs(void)
+#if !IS_BUILTIN(CONFIG_BLK_DEV_INITRD)
+static
+#endif
+int __init default_rootfs(void)
{
int err;
@@ -49,4 +54,6 @@ out:
printk(KERN_WARNING "Failed to create a rootfs\n");
return err;
}
+#if !IS_BUILTIN(CONFIG_BLK_DEV_INITRD)
rootfs_initcall(default_rootfs);
+#endif
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index ebe44a53fe5a..31a913a30549 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -751,7 +751,7 @@ static struct file *do_create(struct ipc_namespace *ipc_ns, struct inode *dir,
}
mode &= ~current_umask();
- ret = vfs_create(dir, path->dentry, mode, true);
+ ret = vfs_create2(path->mnt, dir, path->dentry, mode, true);
path->dentry->d_fsdata = NULL;
if (ret)
return ERR_PTR(ret);
@@ -767,7 +767,7 @@ static struct file *do_open(struct path *path, int oflag)
if ((oflag & O_ACCMODE) == (O_RDWR | O_WRONLY))
return ERR_PTR(-EINVAL);
acc = oflag2acc[oflag & O_ACCMODE];
- if (inode_permission(path->dentry->d_inode, acc))
+ if (inode_permission2(path->mnt, path->dentry->d_inode, acc))
return ERR_PTR(-EACCES);
return dentry_open(path, oflag, current_cred());
}
@@ -800,7 +800,7 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, umode_t, mode,
ro = mnt_want_write(mnt); /* we'll drop it in any case */
error = 0;
mutex_lock(&root->d_inode->i_mutex);
- path.dentry = lookup_one_len(name->name, root, strlen(name->name));
+ path.dentry = lookup_one_len2(name->name, mnt, root, strlen(name->name));
if (IS_ERR(path.dentry)) {
error = PTR_ERR(path.dentry);
goto out_putfd;
@@ -871,7 +871,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
if (err)
goto out_name;
mutex_lock_nested(&mnt->mnt_root->d_inode->i_mutex, I_MUTEX_PARENT);
- dentry = lookup_one_len(name->name, mnt->mnt_root,
+ dentry = lookup_one_len2(name->name, mnt, mnt->mnt_root,
strlen(name->name));
if (IS_ERR(dentry)) {
err = PTR_ERR(dentry);
@@ -883,7 +883,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
err = -ENOENT;
} else {
ihold(inode);
- err = vfs_unlink(dentry->d_parent->d_inode, dentry, NULL);
+ err = vfs_unlink2(mnt, dentry->d_parent->d_inode, dentry, NULL);
}
dput(dentry);
diff --git a/kernel/Makefile b/kernel/Makefile
index 17ea6d4a9a24..fda5903dbcda 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -17,6 +17,17 @@ CFLAGS_REMOVE_cgroup-debug.o = -pg
CFLAGS_REMOVE_irq_work.o = -pg
endif
+# Prevents flicker of uninteresting __do_softirq()/__local_bh_disable_ip()
+# in coverage traces.
+KCOV_INSTRUMENT_softirq.o := n
+# These are called from save_stack_trace() on slub debug path,
+# and produce insane amounts of uninteresting coverage.
+KCOV_INSTRUMENT_module.o := n
+KCOV_INSTRUMENT_extable.o := n
+# Don't self-instrument.
+KCOV_INSTRUMENT_kcov.o := n
+KASAN_SANITIZE_kcov.o := n
+
# cond_syscall is currently not LTO compatible
CFLAGS_sys_ni.o = $(DISABLE_LTO)
@@ -65,6 +76,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o
obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
obj-$(CONFIG_GCOV_KERNEL) += gcov/
+obj-$(CONFIG_KCOV) += kcov.o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_KGDB) += debug/
obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
diff --git a/kernel/audit.c b/kernel/audit.c
index c6df9905f1c6..c64d84d09425 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -863,6 +863,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
return err;
}
if (s.mask & AUDIT_STATUS_PID) {
+ /* NOTE: we are using task_tgid_vnr() below because
+ * the s.pid value is relative to the namespace
+ * of the caller; at present this doesn't matter
+ * much since you can really only run auditd
+ * from the initial pid namespace, but something
+ * to keep in mind if this changes */
int new_pid = s.pid;
if ((!new_pid) && (task_tgid_vnr(current) != audit_pid))
@@ -1872,7 +1878,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
" euid=%u suid=%u fsuid=%u"
" egid=%u sgid=%u fsgid=%u tty=%s ses=%u",
task_ppid_nr(tsk),
- task_pid_nr(tsk),
+ task_tgid_nr(tsk),
from_kuid(&init_user_ns, audit_get_loginuid(tsk)),
from_kuid(&init_user_ns, cred->uid),
from_kgid(&init_user_ns, cred->gid),
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 3a6e01013b34..5ad4dde6aeb6 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -458,7 +458,7 @@ static int audit_filter_rules(struct task_struct *tsk,
switch (f->type) {
case AUDIT_PID:
- pid = task_pid_nr(tsk);
+ pid = task_tgid_nr(tsk);
result = audit_comparator(pid, f->op, f->val);
break;
case AUDIT_PPID:
@@ -2082,7 +2082,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
if (!ab)
return;
- audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid);
+ audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid);
audit_log_task_context(ab);
audit_log_format(ab, " old-auid=%u auid=%u old-ses=%u ses=%u res=%d",
oldloginuid, loginuid, oldsessionid, sessionid, !rc);
@@ -2307,7 +2307,7 @@ void __audit_ptrace(struct task_struct *t)
{
struct audit_context *context = current->audit_context;
- context->target_pid = task_pid_nr(t);
+ context->target_pid = task_tgid_nr(t);
context->target_auid = audit_get_loginuid(t);
context->target_uid = task_uid(t);
context->target_sessionid = audit_get_sessionid(t);
@@ -2332,7 +2332,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
if (audit_pid && t->tgid == audit_pid) {
if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
- audit_sig_pid = task_pid_nr(tsk);
+ audit_sig_pid = task_tgid_nr(tsk);
if (uid_valid(tsk->loginuid))
audit_sig_uid = tsk->loginuid;
else
@@ -2435,7 +2435,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
void __audit_log_capset(const struct cred *new, const struct cred *old)
{
struct audit_context *context = current->audit_context;
- context->capset.pid = task_pid_nr(current);
+ context->capset.pid = task_tgid_nr(current);
context->capset.cap.effective = new->cap_effective;
context->capset.cap.inheritable = new->cap_effective;
context->capset.cap.permitted = new->cap_permitted;
@@ -2468,7 +2468,7 @@ static void audit_log_task(struct audit_buffer *ab)
from_kgid(&init_user_ns, gid),
sessionid);
audit_log_task_context(ab);
- audit_log_format(ab, " pid=%d comm=", task_pid_nr(current));
+ audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current));
audit_log_untrustedstring(ab, get_task_comm(comm, current));
if (mm) {
down_read(&mm->mmap_sem);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f033fbd94ac3..a22e846b3cdf 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2364,7 +2364,8 @@ retry_find_task:
tcred = __task_cred(tsk);
if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
!uid_eq(cred->euid, tcred->uid) &&
- !uid_eq(cred->euid, tcred->suid)) {
+ !uid_eq(cred->euid, tcred->suid) &&
+ !ns_capable(tcred->user_ns, CAP_SYS_NICE)) {
rcu_read_unlock();
ret = -EACCES;
goto out_unlock_cgroup;
diff --git a/kernel/compat.c b/kernel/compat.c
index ebb3c369d03d..24f00610c575 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -276,8 +276,7 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
* core implementation decides to return random nonsense.
*/
if (ret == -ERESTART_RESTARTBLOCK) {
- struct restart_block *restart
- = &current_thread_info()->restart_block;
+ struct restart_block *restart = &current->restart_block;
restart->fn = compat_nanosleep_restart;
restart->nanosleep.compat_rmtp = rmtp;
@@ -860,7 +859,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
return -EFAULT;
if (err == -ERESTART_RESTARTBLOCK) {
- restart = &current_thread_info()->restart_block;
+ restart = &current->restart_block;
restart->fn = compat_clock_nanosleep_restart;
restart->nanosleep.compat_rmtp = rmtp;
}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 90a3d017b90c..28cb74db9646 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -22,6 +22,8 @@
#include <linux/lockdep.h>
#include <trace/events/power.h>
+#include <trace/events/sched.h>
+
#include "smpboot.h"
#ifdef CONFIG_SMP
@@ -424,6 +426,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
out_release:
cpu_hotplug_done();
+ trace_sched_cpu_hotplug(cpu, err, 0);
if (!err)
cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu);
return err;
@@ -499,6 +502,7 @@ out_notify:
__cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
out:
cpu_hotplug_done();
+ trace_sched_cpu_hotplug(cpu, ret, 1);
return ret;
}
@@ -587,6 +591,7 @@ void __weak arch_enable_nonboot_cpus_end(void)
void __ref enable_nonboot_cpus(void)
{
int cpu, error;
+ struct device *cpu_device;
/* Allow everyone to use the CPU hotplug again */
cpu_maps_update_begin();
@@ -604,6 +609,12 @@ void __ref enable_nonboot_cpus(void)
trace_suspend_resume(TPS("CPU_ON"), cpu, false);
if (!error) {
pr_info("CPU%d is up\n", cpu);
+ cpu_device = get_cpu_device(cpu);
+ if (!cpu_device)
+ pr_err("%s: failed to get cpu%d device\n",
+ __func__, cpu);
+ else
+ kobject_uevent(&cpu_device->kobj, KOBJ_ONLINE);
continue;
}
pr_warn("Error taking CPU%d up: %d\n", cpu, error);
@@ -791,3 +802,23 @@ void init_cpu_online(const struct cpumask *src)
{
cpumask_copy(to_cpumask(cpu_online_bits), src);
}
+
+static ATOMIC_NOTIFIER_HEAD(idle_notifier);
+
+void idle_notifier_register(struct notifier_block *n)
+{
+ atomic_notifier_chain_register(&idle_notifier, n);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_register);
+
+void idle_notifier_unregister(struct notifier_block *n)
+{
+ atomic_notifier_chain_unregister(&idle_notifier, n);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_unregister);
+
+void idle_notifier_call_chain(unsigned long val)
+{
+ atomic_notifier_call_chain(&idle_notifier, val, NULL);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_call_chain);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index fe33cb70d89d..4dce92070d86 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -99,6 +99,7 @@ struct cpuset {
/* user-configured CPUs and Memory Nodes allow to tasks */
cpumask_var_t cpus_allowed;
+ cpumask_var_t cpus_requested;
nodemask_t mems_allowed;
/* effective CPUs and Memory Nodes allow to tasks */
@@ -396,7 +397,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
- return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
+ return cpumask_subset(p->cpus_requested, q->cpus_requested) &&
nodes_subset(p->mems_allowed, q->mems_allowed) &&
is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
is_mem_exclusive(p) <= is_mem_exclusive(q);
@@ -495,7 +496,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
cpuset_for_each_child(c, css, par) {
if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
c != cur &&
- cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
+ cpumask_intersects(trial->cpus_requested, c->cpus_requested))
goto out;
if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
c != cur &&
@@ -934,17 +935,18 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (!*buf) {
cpumask_clear(trialcs->cpus_allowed);
} else {
- retval = cpulist_parse(buf, trialcs->cpus_allowed);
+ retval = cpulist_parse(buf, trialcs->cpus_requested);
if (retval < 0)
return retval;
- if (!cpumask_subset(trialcs->cpus_allowed,
- top_cpuset.cpus_allowed))
+ if (!cpumask_subset(trialcs->cpus_requested, cpu_present_mask))
return -EINVAL;
+
+ cpumask_and(trialcs->cpus_allowed, trialcs->cpus_requested, cpu_active_mask);
}
/* Nothing to do if the cpus didn't change */
- if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
+ if (cpumask_equal(cs->cpus_requested, trialcs->cpus_requested))
return 0;
retval = validate_change(cs, trialcs);
@@ -953,6 +955,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
mutex_lock(&callback_mutex);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
+ cpumask_copy(cs->cpus_requested, trialcs->cpus_requested);
mutex_unlock(&callback_mutex);
/* use trialcs->cpus_allowed as a temp variable */
@@ -1725,7 +1728,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
switch (type) {
case FILE_CPULIST:
- s += cpulist_scnprintf(s, count, cs->cpus_allowed);
+ s += cpulist_scnprintf(s, count, cs->cpus_requested);
break;
case FILE_MEMLIST:
s += nodelist_scnprintf(s, count, cs->mems_allowed);
@@ -1925,11 +1928,14 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
return ERR_PTR(-ENOMEM);
if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
goto free_cs;
+ if (!alloc_cpumask_var(&cs->cpus_requested, GFP_KERNEL))
+ goto free_allowed;
if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
- goto free_cpus;
+ goto free_requested;
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpumask_clear(cs->cpus_allowed);
+ cpumask_clear(cs->cpus_requested);
nodes_clear(cs->mems_allowed);
cpumask_clear(cs->effective_cpus);
nodes_clear(cs->effective_mems);
@@ -1938,7 +1944,9 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
return &cs->css;
-free_cpus:
+free_requested:
+ free_cpumask_var(cs->cpus_requested);
+free_allowed:
free_cpumask_var(cs->cpus_allowed);
free_cs:
kfree(cs);
@@ -2001,6 +2009,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cs->mems_allowed = parent->mems_allowed;
cs->effective_mems = parent->mems_allowed;
cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
+ cpumask_copy(cs->cpus_requested, parent->cpus_requested);
cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
mutex_unlock(&callback_mutex);
out_unlock:
@@ -2035,6 +2044,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
free_cpumask_var(cs->effective_cpus);
free_cpumask_var(cs->cpus_allowed);
+ free_cpumask_var(cs->cpus_requested);
kfree(cs);
}
@@ -2098,8 +2108,11 @@ int __init cpuset_init(void)
BUG();
if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
BUG();
+ if (!alloc_cpumask_var(&top_cpuset.cpus_requested, GFP_KERNEL))
+ BUG();
cpumask_setall(top_cpuset.cpus_allowed);
+ cpumask_setall(top_cpuset.cpus_requested);
nodes_setall(top_cpuset.mems_allowed);
cpumask_setall(top_cpuset.effective_cpus);
nodes_setall(top_cpuset.effective_mems);
@@ -2233,7 +2246,7 @@ retry:
goto retry;
}
- cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
+ cpumask_and(&new_cpus, cs->cpus_requested, parent_cs(cs)->effective_cpus);
nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 1adf62b39b96..af0612216811 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -87,6 +87,10 @@ static int kgdb_use_con;
bool dbg_is_early = true;
/* Next cpu to become the master debug core */
int dbg_switch_cpu;
+/* Flag for entering kdb when a panic occurs */
+static bool break_on_panic = true;
+/* Flag for entering kdb when an exception occurs */
+static bool break_on_exception = true;
/* Use kdb or gdbserver mode */
int dbg_kdb_mode = 1;
@@ -101,6 +105,8 @@ early_param("kgdbcon", opt_kgdb_con);
module_param(kgdb_use_con, int, 0644);
module_param(kgdbreboot, int, 0644);
+module_param(break_on_panic, bool, 0644);
+module_param(break_on_exception, bool, 0644);
/*
* Holds information about breakpoints in a kernel. These breakpoints are
@@ -690,6 +696,9 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
if (arch_kgdb_ops.enable_nmi)
arch_kgdb_ops.enable_nmi(0);
+ if (unlikely(signo != SIGTRAP && !break_on_exception))
+ return 1;
+
memset(ks, 0, sizeof(struct kgdb_state));
ks->cpu = raw_smp_processor_id();
ks->ex_vector = evector;
@@ -821,6 +830,9 @@ static int kgdb_panic_event(struct notifier_block *self,
unsigned long val,
void *data)
{
+ if (!break_on_panic)
+ return NOTIFY_DONE;
+
if (dbg_kdb_mode)
kdb_printf("PANIC: %s\n", (char *)data);
kgdb_breakpoint();
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index b20d544f20c2..b8a3fa141685 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -153,13 +153,11 @@ static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp)
} else {
kdb_printf("%s: failed to set breakpoint at 0x%lx\n",
__func__, bp->bp_addr);
-#ifdef CONFIG_DEBUG_RODATA
if (!bp->bp_type) {
kdb_printf("Software breakpoints are unavailable.\n"
- " Change the kernel CONFIG_DEBUG_RODATA=n\n"
+ " Boot the kernel with rodata=off\n"
" OR use hw breaks: help bph\n");
}
-#endif
return 1;
}
return 0;
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 7c70812caea5..13a1e6b5c40e 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -216,7 +216,7 @@ static char *kdb_read(char *buffer, size_t bufsize)
int i;
int diag, dtab_count;
int key;
-
+ static int last_crlf;
diag = kdbgetintenv("DTABCOUNT", &dtab_count);
if (diag)
@@ -237,6 +237,9 @@ poll_again:
return buffer;
if (key != 9)
tab = 0;
+ if (key != 10 && key != 13)
+ last_crlf = 0;
+
switch (key) {
case 8: /* backspace */
if (cp > buffer) {
@@ -254,7 +257,12 @@ poll_again:
*cp = tmp;
}
break;
- case 13: /* enter */
+ case 10: /* new line */
+ case 13: /* carriage return */
+ /* handle \n after \r */
+ if (last_crlf && last_crlf != key)
+ break;
+ last_crlf = key;
*lastchar++ = '\n';
*lastchar++ = '\0';
if (!KDB_STATE(KGDB_TRANS)) {
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4886c0e97bbd..e1eeaf0be36f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -170,8 +170,13 @@ static struct srcu_struct pmus_srcu;
* 0 - disallow raw tracepoint access for unpriv
* 1 - disallow cpu events for unpriv
* 2 - disallow kernel profiling for unpriv
+ * 3 - disallow all unpriv perf event use
*/
+#ifdef CONFIG_SECURITY_PERF_EVENTS_RESTRICT
+int sysctl_perf_event_paranoid __read_mostly = 3;
+#else
int sysctl_perf_event_paranoid __read_mostly = 1;
+#endif
/* Minimum for 512 kiB + 1 user control page */
int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
@@ -7443,6 +7448,9 @@ SYSCALL_DEFINE5(perf_event_open,
if (flags & ~PERF_FLAG_ALL)
return -EINVAL;
+ if (perf_paranoid_any() && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
err = perf_copy_attr(attr_uptr, &attr);
if (err)
return err;
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 83d4382f5699..e0a3c71f2734 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -68,7 +68,14 @@ lookup_exec_domain(unsigned int personality)
goto out;
}
-#ifdef CONFIG_MODULES
+/*
+ * Disable the request_module here to avoid trying to
+ * load the personality-8 module, which doesn't exist,
+ * and results in selinux audit noise.
+ * Disabling this here avoids folks adding module_request
+ * to their sepolicy, which is maybe too generous
+ */
+#if 0
read_unlock(&exec_domains_lock);
request_module("personality-%d", pers);
read_lock(&exec_domains_lock);
diff --git a/kernel/exit.c b/kernel/exit.c
index 654021d9f1b3..8cb2ae6841b1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -53,6 +53,9 @@
#include <linux/oom.h>
#include <linux/writeback.h>
#include <linux/shm.h>
+#include <linux/kcov.h>
+
+#include "sched/tune.h"
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -671,6 +674,7 @@ void do_exit(long code)
TASKS_RCU(int tasks_rcu_i);
profile_task_exit(tsk);
+ kcov_task_exit(tsk);
WARN_ON(blk_needs_flush_plug(tsk));
@@ -713,6 +717,9 @@ void do_exit(long code)
}
exit_signals(tsk); /* sets PF_EXITING */
+
+ schedtune_exit_task(tsk);
+
/*
* tsk->flags are checked in the futex code to protect against
* an exiting task cleaning up the robust pi futexes.
diff --git a/kernel/fork.c b/kernel/fork.c
index f45e647f8e55..3e211639e469 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -74,6 +74,7 @@
#include <linux/uprobes.h>
#include <linux/aio.h>
#include <linux/compiler.h>
+#include <linux/kcov.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -201,6 +202,9 @@ struct kmem_cache *vm_area_cachep;
/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;
+/* Notifier list called when a task struct is freed */
+static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
+
static void account_kernel_stack(struct thread_info *ti, int account)
{
struct zone *zone = page_zone(virt_to_page(ti));
@@ -234,6 +238,18 @@ static inline void put_signal_struct(struct signal_struct *sig)
free_signal_struct(sig);
}
+int task_free_register(struct notifier_block *n)
+{
+ return atomic_notifier_chain_register(&task_free_notifier, n);
+}
+EXPORT_SYMBOL(task_free_register);
+
+int task_free_unregister(struct notifier_block *n)
+{
+ return atomic_notifier_chain_unregister(&task_free_notifier, n);
+}
+EXPORT_SYMBOL(task_free_unregister);
+
void __put_task_struct(struct task_struct *tsk)
{
WARN_ON(!tsk->exit_state);
@@ -246,6 +262,7 @@ void __put_task_struct(struct task_struct *tsk)
delayacct_tsk_free(tsk);
put_signal_struct(tsk->signal);
+ atomic_notifier_call_chain(&task_free_notifier, 0, tsk);
if (!profile_handoff_task(tsk))
free_task(tsk);
}
@@ -354,6 +371,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
account_kernel_stack(ti, 1);
+ kcov_task_init(tsk);
+
return tsk;
free_ti:
@@ -735,7 +754,8 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
mm = get_task_mm(task);
if (mm && mm != current->mm &&
- !ptrace_may_access(task, mode)) {
+ !ptrace_may_access(task, mode) &&
+ !capable(CAP_SYS_RESOURCE)) {
mmput(mm);
mm = ERR_PTR(-EACCES);
}
@@ -1037,7 +1057,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
/* Thread group counters. */
thread_group_cputime_init(sig);
- cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
+ cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
if (cpu_limit != RLIM_INFINITY) {
sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
sig->cputimer.running = 1;
@@ -1287,6 +1307,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->utime = p->stime = p->gtime = 0;
p->utimescaled = p->stimescaled = 0;
+ p->cpu_power = 0;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
p->prev_cputime.utime = p->prev_cputime.stime = 0;
#endif
diff --git a/kernel/futex.c b/kernel/futex.c
index 54ebb63711f4..bd00696741ed 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2227,7 +2227,7 @@ retry:
if (!abs_time)
goto out;
- restart = &current_thread_info()->restart_block;
+ restart = &current->restart_block;
restart->fn = futex_wait_restart;
restart->futex.uaddr = uaddr;
restart->futex.val = val;
diff --git a/kernel/kcov.c b/kernel/kcov.c
new file mode 100644
index 000000000000..e228cb1894d5
--- /dev/null
+++ b/kernel/kcov.c
@@ -0,0 +1,274 @@
+#define pr_fmt(fmt) "kcov: " fmt
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/kcov.h>
+#include <linux/sched.h>
+
+/*
+ * kcov descriptor (one per opened debugfs file).
+ * State transitions of the descriptor:
+ * - initial state after open()
+ * - then there must be a single ioctl(KCOV_INIT_TRACE) call
+ * - then, mmap() call (several calls are allowed but not useful)
+ * - then, repeated enable/disable for a task (only one task a time allowed)
+ */
+struct kcov {
+ /*
+ * Reference counter. We keep one for:
+ * - opened file descriptor
+ * - task with enabled coverage (we can't unwire it from another task)
+ */
+ atomic_t refcount;
+ /* The lock protects mode, size, area and t. */
+ spinlock_t lock;
+ enum kcov_mode mode;
+ /* Size of arena (in long's for KCOV_MODE_TRACE). */
+ unsigned size;
+ /* Coverage buffer shared with user space. */
+ void *area;
+ /* Task for which we collect coverage, or NULL. */
+ struct task_struct *t;
+};
+
+/*
+ * Entry point from instrumented code.
+ * This is called once per basic-block/edge.
+ */
+void __sanitizer_cov_trace_pc(void)
+{
+ struct task_struct *t;
+ enum kcov_mode mode;
+
+ t = current;
+ /*
+ * We are interested in code coverage as a function of a syscall inputs,
+ * so we ignore code executed in interrupts.
+ */
+ if (!t || in_interrupt())
+ return;
+ mode = READ_ONCE(t->kcov_mode);
+ if (mode == KCOV_MODE_TRACE) {
+ unsigned long *area;
+ unsigned long pos;
+
+ /*
+ * There is some code that runs in interrupts but for which
+ * in_interrupt() returns false (e.g. preempt_schedule_irq()).
+ * READ_ONCE()/barrier() effectively provides load-acquire wrt
+ * interrupts, there are paired barrier()/WRITE_ONCE() in
+ * kcov_ioctl_locked().
+ */
+ barrier();
+ area = t->kcov_area;
+ /* The first word is number of subsequent PCs. */
+ pos = READ_ONCE(area[0]) + 1;
+ if (likely(pos < t->kcov_size)) {
+ area[pos] = _RET_IP_;
+ WRITE_ONCE(area[0], pos);
+ }
+ }
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_pc);
+
+static void kcov_get(struct kcov *kcov)
+{
+ atomic_inc(&kcov->refcount);
+}
+
+static void kcov_put(struct kcov *kcov)
+{
+ if (atomic_dec_and_test(&kcov->refcount)) {
+ vfree(kcov->area);
+ kfree(kcov);
+ }
+}
+
+void kcov_task_init(struct task_struct *t)
+{
+ t->kcov_mode = KCOV_MODE_DISABLED;
+ t->kcov_size = 0;
+ t->kcov_area = NULL;
+ t->kcov = NULL;
+}
+
+void kcov_task_exit(struct task_struct *t)
+{
+ struct kcov *kcov;
+
+ kcov = t->kcov;
+ if (kcov == NULL)
+ return;
+ spin_lock(&kcov->lock);
+ if (WARN_ON(kcov->t != t)) {
+ spin_unlock(&kcov->lock);
+ return;
+ }
+ /* Just to not leave dangling references behind. */
+ kcov_task_init(t);
+ kcov->t = NULL;
+ spin_unlock(&kcov->lock);
+ kcov_put(kcov);
+}
+
+static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
+{
+ int res = 0;
+ void *area;
+ struct kcov *kcov = vma->vm_file->private_data;
+ unsigned long size, off;
+ struct page *page;
+
+ area = vmalloc_user(vma->vm_end - vma->vm_start);
+ if (!area)
+ return -ENOMEM;
+
+ spin_lock(&kcov->lock);
+ size = kcov->size * sizeof(unsigned long);
+ if (kcov->mode == KCOV_MODE_DISABLED || vma->vm_pgoff != 0 ||
+ vma->vm_end - vma->vm_start != size) {
+ res = -EINVAL;
+ goto exit;
+ }
+ if (!kcov->area) {
+ kcov->area = area;
+ vma->vm_flags |= VM_DONTEXPAND;
+ spin_unlock(&kcov->lock);
+ for (off = 0; off < size; off += PAGE_SIZE) {
+ page = vmalloc_to_page(kcov->area + off);
+ if (vm_insert_page(vma, vma->vm_start + off, page))
+ WARN_ONCE(1, "vm_insert_page() failed");
+ }
+ return 0;
+ }
+exit:
+ spin_unlock(&kcov->lock);
+ vfree(area);
+ return res;
+}
+
+static int kcov_open(struct inode *inode, struct file *filep)
+{
+ struct kcov *kcov;
+
+ kcov = kzalloc(sizeof(*kcov), GFP_KERNEL);
+ if (!kcov)
+ return -ENOMEM;
+ atomic_set(&kcov->refcount, 1);
+ spin_lock_init(&kcov->lock);
+ filep->private_data = kcov;
+ return nonseekable_open(inode, filep);
+}
+
+static int kcov_close(struct inode *inode, struct file *filep)
+{
+ kcov_put(filep->private_data);
+ return 0;
+}
+
+static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
+ unsigned long arg)
+{
+ struct task_struct *t;
+ unsigned long size, unused;
+
+ switch (cmd) {
+ case KCOV_INIT_TRACE:
+ /*
+ * Enable kcov in trace mode and setup buffer size.
+ * Must happen before anything else.
+ */
+ if (kcov->mode != KCOV_MODE_DISABLED)
+ return -EBUSY;
+ /*
+ * Size must be at least 2 to hold current position and one PC.
+ * Later we allocate size * sizeof(unsigned long) memory,
+ * that must not overflow.
+ */
+ size = arg;
+ if (size < 2 || size > INT_MAX / sizeof(unsigned long))
+ return -EINVAL;
+ kcov->size = size;
+ kcov->mode = KCOV_MODE_TRACE;
+ return 0;
+ case KCOV_ENABLE:
+ /*
+ * Enable coverage for the current task.
+ * At this point user must have been enabled trace mode,
+ * and mmapped the file. Coverage collection is disabled only
+ * at task exit or voluntary by KCOV_DISABLE. After that it can
+ * be enabled for another task.
+ */
+ unused = arg;
+ if (unused != 0 || kcov->mode == KCOV_MODE_DISABLED ||
+ kcov->area == NULL)
+ return -EINVAL;
+ if (kcov->t != NULL)
+ return -EBUSY;
+ t = current;
+ /* Cache in task struct for performance. */
+ t->kcov_size = kcov->size;
+ t->kcov_area = kcov->area;
+ /* See comment in __sanitizer_cov_trace_pc(). */
+ barrier();
+ WRITE_ONCE(t->kcov_mode, kcov->mode);
+ t->kcov = kcov;
+ kcov->t = t;
+ /* This is put either in kcov_task_exit() or in KCOV_DISABLE. */
+ kcov_get(kcov);
+ return 0;
+ case KCOV_DISABLE:
+ /* Disable coverage for the current task. */
+ unused = arg;
+ if (unused != 0 || current->kcov != kcov)
+ return -EINVAL;
+ t = current;
+ if (WARN_ON(kcov->t != t))
+ return -EINVAL;
+ kcov_task_init(t);
+ kcov->t = NULL;
+ kcov_put(kcov);
+ return 0;
+ default:
+ return -ENOTTY;
+ }
+}
+
+static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
+{
+ struct kcov *kcov;
+ int res;
+
+ kcov = filep->private_data;
+ spin_lock(&kcov->lock);
+ res = kcov_ioctl_locked(kcov, cmd, arg);
+ spin_unlock(&kcov->lock);
+ return res;
+}
+
+static const struct file_operations kcov_fops = {
+ .open = kcov_open,
+ .unlocked_ioctl = kcov_ioctl,
+ .mmap = kcov_mmap,
+ .release = kcov_close,
+};
+
+static int __init kcov_init(void)
+{
+ if (!debugfs_create_file("kcov", 0600, NULL, NULL, &kcov_fops)) {
+ pr_err("failed to create kcov in debugfs\n");
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+device_initcall(kcov_init);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 10e489c448fe..7c40a189becc 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -325,16 +325,30 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
}
EXPORT_SYMBOL(kthread_create_on_node);
-static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
+static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state)
{
- /* Must have done schedule() in kthread() before we set_task_cpu */
+ unsigned long flags;
+
if (!wait_task_inactive(p, state)) {
WARN_ON(1);
return;
}
+
/* It's safe because the task is inactive. */
- do_set_cpus_allowed(p, cpumask_of(cpu));
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ do_set_cpus_allowed(p, mask);
p->flags |= PF_NO_SETAFFINITY;
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+}
+
+static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
+{
+ __kthread_bind_mask(p, cpumask_of(cpu), state);
+}
+
+void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask)
+{
+ __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE);
}
/**
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 8541bfdfd232..dc968422e477 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -1,3 +1,6 @@
+# Any varying coverage in these files is non-deterministic
+# and is generally not a function of system call inputs.
+KCOV_INSTRUMENT := n
obj-y += mutex.o semaphore.o rwsem.o mcs_spinlock.o
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index c2ae4618a159..67ecc6b59e1c 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -18,6 +18,15 @@ config SUSPEND_FREEZER
Turning OFF this setting is NOT recommended! If in doubt, say Y.
+config WAKELOCK
+ bool "Android's method of preventing suspend"
+ default y
+ ---help---
+ This allows applications to prevent the CPU from suspending while
+ they need it.
+
+ Say Y if you are running an android userspace.
+
config HIBERNATE_CALLBACKS
bool
@@ -309,3 +318,10 @@ config PM_GENERIC_DOMAINS_OF
config CPU_PM
bool
depends on SUSPEND || CPU_IDLE
+
+config SUSPEND_TIME
+ bool "Log time spent in suspend"
+ ---help---
+ Prints the time spent in suspend in the kernel log, and
+ keeps statistics on the time spent in suspend in
+ /sys/kernel/debug/suspend_time
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 29472bff11ef..299f8a4d42f7 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -13,3 +13,5 @@ obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o
obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o
obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
+
+obj-$(CONFIG_SUSPEND) += wakeup_reason.o
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 1f35a3478f3c..090020728342 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -646,7 +646,7 @@ static void power_down(void)
*/
int hibernate(void)
{
- int error;
+ int error, nr_calls = 0;
if (!hibernation_available()) {
pr_debug("PM: Hibernation not available.\n");
@@ -661,9 +661,11 @@ int hibernate(void)
}
pm_prepare_console();
- error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
- if (error)
+ error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
+ if (error) {
+ nr_calls--;
goto Exit;
+ }
printk(KERN_INFO "PM: Syncing filesystems ... ");
sys_sync();
@@ -713,7 +715,7 @@ int hibernate(void)
/* Don't bother checking whether freezer_test_done is true */
freezer_test_done = false;
Exit:
- pm_notifier_call_chain(PM_POST_HIBERNATION);
+ __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL);
pm_restore_console();
atomic_inc(&snapshot_device_available);
Unlock:
@@ -739,7 +741,7 @@ int hibernate(void)
*/
static int software_resume(void)
{
- int error;
+ int error, nr_calls = 0;
unsigned int flags;
/*
@@ -826,9 +828,11 @@ static int software_resume(void)
}
pm_prepare_console();
- error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
- if (error)
+ error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls);
+ if (error) {
+ nr_calls--;
goto Close_Finish;
+ }
pr_debug("PM: Preparing processes for restore.\n");
error = freeze_processes();
@@ -854,7 +858,7 @@ static int software_resume(void)
unlock_device_hotplug();
thaw_processes();
Finish:
- pm_notifier_call_chain(PM_POST_RESTORE);
+ __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
pm_restore_console();
atomic_inc(&snapshot_device_available);
/* For success case, the suspend path will release the lock */
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 9a59d042ea84..36368ca65c3d 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -38,12 +38,19 @@ int unregister_pm_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(unregister_pm_notifier);
-int pm_notifier_call_chain(unsigned long val)
+int __pm_notifier_call_chain(unsigned long val, int nr_to_call, int *nr_calls)
{
- int ret = blocking_notifier_call_chain(&pm_chain_head, val, NULL);
+ int ret;
+
+ ret = __blocking_notifier_call_chain(&pm_chain_head, val, NULL,
+ nr_to_call, nr_calls);
return notifier_to_errno(ret);
}
+int pm_notifier_call_chain(unsigned long val)
+{
+ return __pm_notifier_call_chain(val, -1, NULL);
+}
/* If set, devices may be suspended and resumed asynchronously. */
int pm_async_enabled = 1;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 2df883a9d3cb..26d7f74b97e6 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -201,6 +201,8 @@ static inline void suspend_test_finish(const char *label) {}
#ifdef CONFIG_PM_SLEEP
/* kernel/power/main.c */
+extern int __pm_notifier_call_chain(unsigned long val, int nr_to_call,
+ int *nr_calls);
extern int pm_notifier_call_chain(unsigned long val);
#endif
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 5a6ec8678b9a..7456dcb16240 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -18,6 +18,7 @@
#include <linux/workqueue.h>
#include <linux/kmod.h>
#include <trace/events/power.h>
+#include <linux/wakeup_reason.h>
/*
* Timeout for stopping processes
@@ -35,6 +36,9 @@ static int try_to_freeze_tasks(bool user_only)
unsigned int elapsed_msecs;
bool wakeup = false;
int sleep_usecs = USEC_PER_MSEC;
+#ifdef CONFIG_PM_SLEEP
+ char suspend_abort[MAX_SUSPEND_ABORT_LEN];
+#endif
do_gettimeofday(&start);
@@ -64,6 +68,11 @@ static int try_to_freeze_tasks(bool user_only)
break;
if (pm_wakeup_pending()) {
+#ifdef CONFIG_PM_SLEEP
+ pm_get_active_wakeup_sources(suspend_abort,
+ MAX_SUSPEND_ABORT_LEN);
+ log_suspend_abort_reason(suspend_abort);
+#endif
wakeup = true;
break;
}
@@ -83,15 +92,17 @@ static int try_to_freeze_tasks(bool user_only)
do_div(elapsed_msecs64, NSEC_PER_MSEC);
elapsed_msecs = elapsed_msecs64;
- if (todo) {
+ if (wakeup) {
printk("\n");
- printk(KERN_ERR "Freezing of tasks %s after %d.%03d seconds "
- "(%d tasks refusing to freeze, wq_busy=%d):\n",
- wakeup ? "aborted" : "failed",
+ printk(KERN_ERR "Freezing of tasks aborted after %d.%03d seconds",
+ elapsed_msecs / 1000, elapsed_msecs % 1000);
+ } else if (todo) {
+ printk("\n");
+ printk(KERN_ERR "Freezing of tasks failed after %d.%03d seconds"
+ " (%d tasks refusing to freeze, wq_busy=%d):\n",
elapsed_msecs / 1000, elapsed_msecs % 1000,
todo - wq_busy, wq_busy);
- if (!wakeup) {
read_lock(&tasklist_lock);
for_each_process_thread(g, p) {
if (p != current && !freezer_should_skip(p)
@@ -99,7 +110,6 @@ static int try_to_freeze_tasks(bool user_only)
sched_show_task(p);
}
read_unlock(&tasklist_lock);
- }
} else {
printk("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000,
elapsed_msecs % 1000);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index c347e3ce3a55..882f80e01240 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -26,8 +26,10 @@
#include <linux/suspend.h>
#include <linux/syscore_ops.h>
#include <linux/ftrace.h>
+#include <linux/rtc.h>
#include <trace/events/power.h>
#include <linux/compiler.h>
+#include <linux/wakeup_reason.h>
#include "power.h"
@@ -37,7 +39,9 @@ const char *pm_states[PM_SUSPEND_MAX];
static const struct platform_suspend_ops *suspend_ops;
static const struct platform_freeze_ops *freeze_ops;
static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
-static bool suspend_freeze_wake;
+
+enum freeze_state __read_mostly suspend_freeze_state;
+static DEFINE_SPINLOCK(suspend_freeze_lock);
void freeze_set_ops(const struct platform_freeze_ops *ops)
{
@@ -48,22 +52,49 @@ void freeze_set_ops(const struct platform_freeze_ops *ops)
static void freeze_begin(void)
{
- suspend_freeze_wake = false;
+ suspend_freeze_state = FREEZE_STATE_NONE;
}
static void freeze_enter(void)
{
- cpuidle_use_deepest_state(true);
+ spin_lock_irq(&suspend_freeze_lock);
+ if (pm_wakeup_pending())
+ goto out;
+
+ suspend_freeze_state = FREEZE_STATE_ENTER;
+ spin_unlock_irq(&suspend_freeze_lock);
+
+ get_online_cpus();
cpuidle_resume();
- wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
+
+ /* Push all the CPUs into the idle loop. */
+ wake_up_all_idle_cpus();
+ pr_debug("PM: suspend-to-idle\n");
+ /* Make the current CPU wait so it can enter the idle loop too. */
+ wait_event(suspend_freeze_wait_head,
+ suspend_freeze_state == FREEZE_STATE_WAKE);
+ pr_debug("PM: resume from suspend-to-idle\n");
+
cpuidle_pause();
- cpuidle_use_deepest_state(false);
+ put_online_cpus();
+
+ spin_lock_irq(&suspend_freeze_lock);
+
+ out:
+ suspend_freeze_state = FREEZE_STATE_NONE;
+ spin_unlock_irq(&suspend_freeze_lock);
}
void freeze_wake(void)
{
- suspend_freeze_wake = true;
- wake_up(&suspend_freeze_wait_head);
+ unsigned long flags;
+
+ spin_lock_irqsave(&suspend_freeze_lock, flags);
+ if (suspend_freeze_state > FREEZE_STATE_NONE) {
+ suspend_freeze_state = FREEZE_STATE_WAKE;
+ wake_up(&suspend_freeze_wait_head);
+ }
+ spin_unlock_irqrestore(&suspend_freeze_lock, flags);
}
EXPORT_SYMBOL_GPL(freeze_wake);
@@ -225,16 +256,18 @@ static int suspend_test(int level)
*/
static int suspend_prepare(suspend_state_t state)
{
- int error;
+ int error, nr_calls = 0;
if (!sleep_state_supported(state))
return -EPERM;
pm_prepare_console();
- error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
- if (error)
+ error = __pm_notifier_call_chain(PM_SUSPEND_PREPARE, -1, &nr_calls);
+ if (error) {
+ nr_calls--;
goto Finish;
+ }
trace_suspend_resume(TPS("freeze_processes"), 0, true);
error = suspend_freeze_processes();
@@ -245,7 +278,7 @@ static int suspend_prepare(suspend_state_t state)
suspend_stats.failed_freeze++;
dpm_save_failed_step(SUSPEND_FREEZE);
Finish:
- pm_notifier_call_chain(PM_POST_SUSPEND);
+ __pm_notifier_call_chain(PM_POST_SUSPEND, nr_calls, NULL);
pm_restore_console();
return error;
}
@@ -271,7 +304,8 @@ void __weak arch_suspend_enable_irqs(void)
*/
static int suspend_enter(suspend_state_t state, bool *wakeup)
{
- int error;
+ char suspend_abort[MAX_SUSPEND_ABORT_LEN];
+ int error, last_dev;
error = platform_suspend_prepare(state);
if (error)
@@ -279,7 +313,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
error = dpm_suspend_late(PMSG_SUSPEND);
if (error) {
+ last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
+ last_dev %= REC_FAILED_NUM;
printk(KERN_ERR "PM: late suspend of devices failed\n");
+ log_suspend_abort_reason("%s device failed to power down",
+ suspend_stats.failed_devs[last_dev]);
goto Platform_finish;
}
error = platform_suspend_prepare_late(state);
@@ -288,7 +326,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
error = dpm_suspend_noirq(PMSG_SUSPEND);
if (error) {
+ last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
+ last_dev %= REC_FAILED_NUM;
printk(KERN_ERR "PM: noirq suspend of devices failed\n");
+ log_suspend_abort_reason("noirq suspend of %s device failed",
+ suspend_stats.failed_devs[last_dev]);
goto Platform_early_resume;
}
error = platform_suspend_prepare_noirq(state);
@@ -312,8 +354,10 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
}
error = disable_nonboot_cpus();
- if (error || suspend_test(TEST_CPUS))
+ if (error || suspend_test(TEST_CPUS)) {
+ log_suspend_abort_reason("Disabling non-boot cpus failed");
goto Enable_cpus;
+ }
arch_suspend_disable_irqs();
BUG_ON(!irqs_disabled());
@@ -328,6 +372,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
trace_suspend_resume(TPS("machine_suspend"),
state, false);
events_check_enabled = false;
+ } else if (*wakeup) {
+ pm_get_active_wakeup_sources(suspend_abort,
+ MAX_SUSPEND_ABORT_LEN);
+ log_suspend_abort_reason(suspend_abort);
+ error = -EBUSY;
}
syscore_resume();
}
@@ -374,6 +423,7 @@ int suspend_devices_and_enter(suspend_state_t state)
error = dpm_suspend_start(PMSG_SUSPEND);
if (error) {
pr_err("PM: Some devices failed to suspend, or early wake event detected\n");
+ log_suspend_abort_reason("Some devices failed to suspend, or early wake event detected");
goto Recover_platform;
}
suspend_test_finish("suspend devices");
@@ -472,6 +522,18 @@ static int enter_state(suspend_state_t state)
return error;
}
+static void pm_suspend_marker(char *annotation)
+{
+ struct timespec ts;
+ struct tm tm;
+
+ getnstimeofday(&ts);
+ time_to_tm(ts.tv_sec, 0, &tm);
+ pr_info("PM: suspend %s %ld-%02d-%02d %02d:%02d:%02d.%09lu UTC\n",
+ annotation, tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
+ tm.tm_hour, tm.tm_min, tm.tm_sec, ts.tv_nsec);
+}
+
/**
* pm_suspend - Externally visible function for suspending the system.
* @state: System sleep state to enter.
@@ -486,6 +548,7 @@ int pm_suspend(suspend_state_t state)
if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
return -EINVAL;
+ pm_suspend_marker("entry");
error = enter_state(state);
if (error) {
suspend_stats.fail++;
@@ -493,6 +556,7 @@ int pm_suspend(suspend_state_t state)
} else {
suspend_stats.success++;
}
+ pm_suspend_marker("exit");
return error;
}
EXPORT_SYMBOL(pm_suspend);
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 526e8911460a..35310b627388 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -47,7 +47,7 @@ atomic_t snapshot_device_available = ATOMIC_INIT(1);
static int snapshot_open(struct inode *inode, struct file *filp)
{
struct snapshot_data *data;
- int error;
+ int error, nr_calls = 0;
if (!hibernation_available())
return -EPERM;
@@ -74,9 +74,9 @@ static int snapshot_open(struct inode *inode, struct file *filp)
swap_type_of(swsusp_resume_device, 0, NULL) : -1;
data->mode = O_RDONLY;
data->free_bitmaps = false;
- error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
+ error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
if (error)
- pm_notifier_call_chain(PM_POST_HIBERNATION);
+ __pm_notifier_call_chain(PM_POST_HIBERNATION, --nr_calls, NULL);
} else {
/*
* Resuming. We may need to wait for the image device to
@@ -86,13 +86,15 @@ static int snapshot_open(struct inode *inode, struct file *filp)
data->swap = -1;
data->mode = O_WRONLY;
- error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
+ error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls);
if (!error) {
error = create_basic_memory_bitmaps();
data->free_bitmaps = !error;
- }
+ } else
+ nr_calls--;
+
if (error)
- pm_notifier_call_chain(PM_POST_RESTORE);
+ __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
}
if (error)
atomic_inc(&snapshot_device_available);
diff --git a/kernel/power/wakeup_reason.c b/kernel/power/wakeup_reason.c
new file mode 100644
index 000000000000..8f825b9adacb
--- /dev/null
+++ b/kernel/power/wakeup_reason.c
@@ -0,0 +1,288 @@
+/*
+ * kernel/power/wakeup_reason.c
+ *
+ * Logs the reasons which caused the kernel to resume from
+ * the suspend mode.
+ *
+ * Copyright (C) 2014 Google, Inc.
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/wakeup_reason.h>
+#include <linux/kernel.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/notifier.h>
+#include <linux/suspend.h>
+#include <linux/debugfs.h>
+
+
+#define MAX_WAKEUP_REASON_IRQS 32
+static int irq_list[MAX_WAKEUP_REASON_IRQS];
+static int irqcount;
+static bool suspend_abort;
+static char abort_reason[MAX_SUSPEND_ABORT_LEN];
+static struct kobject *wakeup_reason;
+static DEFINE_SPINLOCK(resume_reason_lock);
+
+static ktime_t last_monotime; /* monotonic time before last suspend */
+static ktime_t curr_monotime; /* monotonic time after last suspend */
+static ktime_t last_stime; /* monotonic boottime offset before last suspend */
+static ktime_t curr_stime; /* monotonic boottime offset after last suspend */
+#if IS_ENABLED(CONFIG_SUSPEND_TIME)
+static unsigned int time_in_suspend_bins[32];
+#endif
+
+static ssize_t last_resume_reason_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ int irq_no, buf_offset = 0;
+ struct irq_desc *desc;
+ spin_lock(&resume_reason_lock);
+ if (suspend_abort) {
+ buf_offset = sprintf(buf, "Abort: %s", abort_reason);
+ } else {
+ for (irq_no = 0; irq_no < irqcount; irq_no++) {
+ desc = irq_to_desc(irq_list[irq_no]);
+ if (desc && desc->action && desc->action->name)
+ buf_offset += sprintf(buf + buf_offset, "%d %s\n",
+ irq_list[irq_no], desc->action->name);
+ else
+ buf_offset += sprintf(buf + buf_offset, "%d\n",
+ irq_list[irq_no]);
+ }
+ }
+ spin_unlock(&resume_reason_lock);
+ return buf_offset;
+}
+
+static ssize_t last_suspend_time_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct timespec sleep_time;
+ struct timespec total_time;
+ struct timespec suspend_resume_time;
+
+ /*
+ * total_time is calculated from monotonic bootoffsets because
+ * unlike CLOCK_MONOTONIC it include the time spent in suspend state.
+ */
+ total_time = ktime_to_timespec(ktime_sub(curr_stime, last_stime));
+
+ /*
+ * suspend_resume_time is calculated as monotonic (CLOCK_MONOTONIC)
+ * time interval before entering suspend and post suspend.
+ */
+ suspend_resume_time = ktime_to_timespec(ktime_sub(curr_monotime, last_monotime));
+
+ /* sleep_time = total_time - suspend_resume_time */
+ sleep_time = timespec_sub(total_time, suspend_resume_time);
+
+ /* Export suspend_resume_time and sleep_time in pair here. */
+ return sprintf(buf, "%lu.%09lu %lu.%09lu\n",
+ suspend_resume_time.tv_sec, suspend_resume_time.tv_nsec,
+ sleep_time.tv_sec, sleep_time.tv_nsec);
+}
+
+static struct kobj_attribute resume_reason = __ATTR_RO(last_resume_reason);
+static struct kobj_attribute suspend_time = __ATTR_RO(last_suspend_time);
+
+static struct attribute *attrs[] = {
+ &resume_reason.attr,
+ &suspend_time.attr,
+ NULL,
+};
+static struct attribute_group attr_group = {
+ .attrs = attrs,
+};
+
+/*
+ * logs all the wake up reasons to the kernel
+ * stores the irqs to expose them to the userspace via sysfs
+ */
+void log_wakeup_reason(int irq)
+{
+ struct irq_desc *desc;
+ desc = irq_to_desc(irq);
+ if (desc && desc->action && desc->action->name)
+ printk(KERN_INFO "Resume caused by IRQ %d, %s\n", irq,
+ desc->action->name);
+ else
+ printk(KERN_INFO "Resume caused by IRQ %d\n", irq);
+
+ spin_lock(&resume_reason_lock);
+ if (irqcount == MAX_WAKEUP_REASON_IRQS) {
+ spin_unlock(&resume_reason_lock);
+ printk(KERN_WARNING "Resume caused by more than %d IRQs\n",
+ MAX_WAKEUP_REASON_IRQS);
+ return;
+ }
+
+ irq_list[irqcount++] = irq;
+ spin_unlock(&resume_reason_lock);
+}
+
+int check_wakeup_reason(int irq)
+{
+ int irq_no;
+ int ret = false;
+
+ spin_lock(&resume_reason_lock);
+ for (irq_no = 0; irq_no < irqcount; irq_no++)
+ if (irq_list[irq_no] == irq) {
+ ret = true;
+ break;
+ }
+ spin_unlock(&resume_reason_lock);
+ return ret;
+}
+
+void log_suspend_abort_reason(const char *fmt, ...)
+{
+ va_list args;
+
+ spin_lock(&resume_reason_lock);
+
+ //Suspend abort reason has already been logged.
+ if (suspend_abort) {
+ spin_unlock(&resume_reason_lock);
+ return;
+ }
+
+ suspend_abort = true;
+ va_start(args, fmt);
+ vsnprintf(abort_reason, MAX_SUSPEND_ABORT_LEN, fmt, args);
+ va_end(args);
+ spin_unlock(&resume_reason_lock);
+}
+
+/* Detects a suspend and clears all the previous wake up reasons*/
+static int wakeup_reason_pm_event(struct notifier_block *notifier,
+ unsigned long pm_event, void *unused)
+{
+#if IS_ENABLED(CONFIG_SUSPEND_TIME)
+ ktime_t temp;
+ struct timespec suspend_time;
+#endif
+
+ switch (pm_event) {
+ case PM_SUSPEND_PREPARE:
+ spin_lock(&resume_reason_lock);
+ irqcount = 0;
+ suspend_abort = false;
+ spin_unlock(&resume_reason_lock);
+ /* monotonic time since boot */
+ last_monotime = ktime_get();
+ /* monotonic time since boot including the time spent in suspend */
+ last_stime = ktime_get_boottime();
+ break;
+ case PM_POST_SUSPEND:
+ /* monotonic time since boot */
+ curr_monotime = ktime_get();
+ /* monotonic time since boot including the time spent in suspend */
+ curr_stime = ktime_get_boottime();
+
+#if IS_ENABLED(CONFIG_SUSPEND_TIME)
+ temp = ktime_sub(ktime_sub(curr_stime, last_stime),
+ ktime_sub(curr_monotime, last_monotime));
+ suspend_time = ktime_to_timespec(temp);
+ time_in_suspend_bins[fls(suspend_time.tv_sec)]++;
+ pr_info("Suspended for %lu.%03lu seconds\n", suspend_time.tv_sec,
+ suspend_time.tv_nsec / NSEC_PER_MSEC);
+#endif
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block wakeup_reason_pm_notifier_block = {
+ .notifier_call = wakeup_reason_pm_event,
+};
+
+#if IS_ENABLED(CONFIG_DEBUG_FS) && IS_ENABLED(CONFIG_SUSPEND_TIME)
+static int suspend_time_debug_show(struct seq_file *s, void *data)
+{
+ int bin;
+ seq_printf(s, "time (secs) count\n");
+ seq_printf(s, "------------------\n");
+ for (bin = 0; bin < 32; bin++) {
+ if (time_in_suspend_bins[bin] == 0)
+ continue;
+ seq_printf(s, "%4d - %4d %4u\n",
+ bin ? 1 << (bin - 1) : 0, 1 << bin,
+ time_in_suspend_bins[bin]);
+ }
+ return 0;
+}
+
+static int suspend_time_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, suspend_time_debug_show, NULL);
+}
+
+static const struct file_operations suspend_time_debug_fops = {
+ .open = suspend_time_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init suspend_time_debug_init(void)
+{
+ struct dentry *d;
+
+ d = debugfs_create_file("suspend_time", 0755, NULL, NULL,
+ &suspend_time_debug_fops);
+ if (!d) {
+ pr_err("Failed to create suspend_time debug file\n");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+late_initcall(suspend_time_debug_init);
+#endif
+
+/* Initializes the sysfs parameter
+ * registers the pm_event notifier
+ */
+int __init wakeup_reason_init(void)
+{
+ int retval;
+
+ retval = register_pm_notifier(&wakeup_reason_pm_notifier_block);
+ if (retval)
+ printk(KERN_WARNING "[%s] failed to register PM notifier %d\n",
+ __func__, retval);
+
+ wakeup_reason = kobject_create_and_add("wakeup_reasons", kernel_kobj);
+ if (!wakeup_reason) {
+ printk(KERN_WARNING "[%s] failed to create a sysfs kobject\n",
+ __func__);
+ return 1;
+ }
+ retval = sysfs_create_group(wakeup_reason, &attr_group);
+ if (retval) {
+ kobject_put(wakeup_reason);
+ printk(KERN_WARNING "[%s] failed to create a sysfs group %d\n",
+ __func__, retval);
+ }
+ return 0;
+}
+
+late_initcall(wakeup_reason_init);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 3100e0dd560e..403d8781c87d 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -55,6 +55,10 @@
#include "console_cmdline.h"
#include "braille.h"
+#ifdef CONFIG_EARLY_PRINTK_DIRECT
+extern void printascii(char *);
+#endif
+
int console_printk[4] = {
CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */
MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */
@@ -1707,6 +1711,10 @@ asmlinkage int vprintk_emit(int facility, int level,
}
}
+#ifdef CONFIG_EARLY_PRINTK_DIRECT
+ printascii(text);
+#endif
+
if (level == -1)
level = default_message_loglevel;
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 807ccfbf69b3..8844d2222420 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,3 +1,7 @@
+# Any varying coverage in these files is non-deterministic
+# and is generally not a function of system call inputs.
+KCOV_INSTRUMENT := n
+
obj-y += update.o srcu.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_TREE_RCU) += tree.o
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index ab32b7b0db5c..55bb7af371b6 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -2,6 +2,10 @@ ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_clock.o = -pg
endif
+# These files are disabled because they produce non-interesting flaky coverage
+# that is not a function of syscall inputs. E.g. involuntary context switches.
+KCOV_INSTRUMENT := n
+
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
# needed for x86 only. Why this used to be enabled for all architectures is beyond
@@ -11,11 +15,14 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif
-obj-y += core.o proc.o clock.o cputime.o
+obj-y += core.o loadavg.o clock.o cputime.o
obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
obj-y += wait.o completion.o idle.o
-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
+obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o energy.o
+obj-$(CONFIG_SCHED_WALT) += walt.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
+obj-$(CONFIG_SCHED_TUNE) += tune.o
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
+obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index eae160dd669d..a8653c2e2955 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -141,7 +141,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
p->signal->autogroup = autogroup_kref_get(ag);
- if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
+ if (!READ_ONCE(sysctl_sched_autogroup_enabled))
goto out;
for_each_thread(p, t)
diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h
index 8bd047142816..890c95f2587a 100644
--- a/kernel/sched/auto_group.h
+++ b/kernel/sched/auto_group.h
@@ -29,7 +29,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
static inline struct task_group *
autogroup_task_group(struct task_struct *p, struct task_group *tg)
{
- int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
+ int enabled = READ_ONCE(sysctl_sched_autogroup_enabled);
if (enabled && task_wants_autogroup(p, tg))
return p->signal->autogroup->tg;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d0eb16373bf0..53725f94bf6d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -89,6 +89,7 @@
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
+#include "walt.h"
void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
{
@@ -349,6 +350,12 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
}
}
+struct rq *
+lock_rq_of(struct task_struct *p, unsigned long *flags)
+{
+ return task_rq_lock(p, flags);
+}
+
static void __task_rq_unlock(struct rq *rq)
__releases(rq->lock)
{
@@ -364,6 +371,12 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
}
+void
+unlock_rq_of(struct rq *rq, struct task_struct *p, unsigned long *flags)
+{
+ task_rq_unlock(rq, p, flags);
+}
+
/*
* this_rq_lock - lock this runqueue and disable interrupts.
*/
@@ -566,7 +579,7 @@ static bool set_nr_and_not_polling(struct task_struct *p)
static bool set_nr_if_polling(struct task_struct *p)
{
struct thread_info *ti = task_thread_info(p);
- typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
+ typeof(ti->flags) old, val = READ_ONCE(ti->flags);
for (;;) {
if (!(val & _TIF_POLLING_NRFLAG))
@@ -1013,6 +1026,9 @@ inline int task_curr(const struct task_struct *p)
return cpu_curr(task_cpu(p)) == p;
}
+/*
+ * Can drop rq->lock because from sched_class::switched_from() methods drop it.
+ */
static inline void check_class_changed(struct rq *rq, struct task_struct *p,
const struct sched_class *prev_class,
int oldprio)
@@ -1020,6 +1036,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
if (prev_class != p->sched_class) {
if (prev_class->switched_from)
prev_class->switched_from(rq, p);
+ /* Possble rq->lock 'hole'. */
p->sched_class->switched_to(rq, p);
} else if (oldprio != p->prio || dl_task(p))
p->sched_class->prio_changed(rq, p, oldprio);
@@ -1051,6 +1068,199 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
}
#ifdef CONFIG_SMP
+/*
+ * This is how migration works:
+ *
+ * 1) we invoke migration_cpu_stop() on the target CPU using
+ * stop_one_cpu().
+ * 2) stopper starts to run (implicitly forcing the migrated thread
+ * off the CPU)
+ * 3) it checks whether the migrated task is still in the wrong runqueue.
+ * 4) if it's in the wrong runqueue then the migration thread removes
+ * it and puts it into the right queue.
+ * 5) stopper completes and stop_one_cpu() returns and the migration
+ * is done.
+ */
+
+/*
+ * move_queued_task - move a queued task to new rq.
+ *
+ * Returns (locked) new rq. Old rq's lock is released.
+ */
+static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
+{
+ struct rq *rq = task_rq(p);
+
+ lockdep_assert_held(&rq->lock);
+
+ dequeue_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_MIGRATING;
+ double_lock_balance(rq, cpu_rq(new_cpu));
+ set_task_cpu(p, new_cpu);
+ double_unlock_balance(rq, cpu_rq(new_cpu));
+ raw_spin_unlock(&rq->lock);
+
+ rq = cpu_rq(new_cpu);
+
+ raw_spin_lock(&rq->lock);
+ BUG_ON(task_cpu(p) != new_cpu);
+ p->on_rq = TASK_ON_RQ_QUEUED;
+ enqueue_task(rq, p, 0);
+ check_preempt_curr(rq, p, 0);
+
+ return rq;
+}
+
+struct migration_arg {
+ struct task_struct *task;
+ int dest_cpu;
+};
+
+/*
+ * Move (not current) task off this cpu, onto dest cpu. We're doing
+ * this because either it can't run here any more (set_cpus_allowed()
+ * away from this CPU, or CPU going down), or because we're
+ * attempting to rebalance this task on exec (sched_exec).
+ *
+ * So we race with normal scheduler movements, but that's OK, as long
+ * as the task is no longer on this CPU.
+ *
+ * Returns non-zero if task was successfully migrated.
+ */
+static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
+{
+ struct rq *rq;
+ int ret = 0;
+
+ if (unlikely(!cpu_active(dest_cpu)))
+ return ret;
+
+ rq = cpu_rq(src_cpu);
+
+ raw_spin_lock(&p->pi_lock);
+ raw_spin_lock(&rq->lock);
+ /* Already moved. */
+ if (task_cpu(p) != src_cpu)
+ goto done;
+
+ /* Affinity changed (again). */
+ if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+ goto fail;
+
+ /*
+ * If we're not on a rq, the next wake-up will ensure we're
+ * placed properly.
+ */
+ if (task_on_rq_queued(p))
+ rq = move_queued_task(p, dest_cpu);
+done:
+ ret = 1;
+fail:
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock(&p->pi_lock);
+ return ret;
+}
+
+/*
+ * migration_cpu_stop - this will be executed by a highprio stopper thread
+ * and performs thread migration by bumping thread off CPU then
+ * 'pushing' onto another runqueue.
+ */
+static int migration_cpu_stop(void *data)
+{
+ struct migration_arg *arg = data;
+
+ /*
+ * The original target cpu might have gone down and we might
+ * be on another cpu but it doesn't matter.
+ */
+ local_irq_disable();
+ /*
+ * We need to explicitly wake pending tasks before running
+ * __migrate_task() such that we will not miss enforcing cpus_allowed
+ * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
+ */
+ sched_ttwu_pending();
+ __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
+ local_irq_enable();
+ return 0;
+}
+
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+ lockdep_assert_held(&p->pi_lock);
+
+ if (p->sched_class->set_cpus_allowed)
+ p->sched_class->set_cpus_allowed(p, new_mask);
+
+ cpumask_copy(&p->cpus_allowed, new_mask);
+ p->nr_cpus_allowed = cpumask_weight(new_mask);
+}
+
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+ const struct cpumask *new_mask, bool check)
+{
+ unsigned long flags;
+ struct rq *rq;
+ unsigned int dest_cpu;
+ int ret = 0;
+
+ rq = task_rq_lock(p, &flags);
+
+ /*
+ * Must re-check here, to close a race against __kthread_bind(),
+ * sched_setaffinity() is not guaranteed to observe the flag.
+ */
+ if (check && (p->flags & PF_NO_SETAFFINITY)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (cpumask_equal(&p->cpus_allowed, new_mask))
+ goto out;
+
+ if (!cpumask_intersects(new_mask, cpu_active_mask)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ do_set_cpus_allowed(p, new_mask);
+
+ /* Can the task run on the task's current CPU? If so, we're done */
+ if (cpumask_test_cpu(task_cpu(p), new_mask))
+ goto out;
+
+ dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
+ if (task_running(rq, p) || p->state == TASK_WAKING) {
+ struct migration_arg arg = { p, dest_cpu };
+ /* Need help from migration thread: drop lock and wait. */
+ task_rq_unlock(rq, p, &flags);
+ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+ tlb_migrate_finish(p->mm);
+ return 0;
+ } else if (task_on_rq_queued(p))
+ rq = move_queued_task(p, dest_cpu);
+out:
+ task_rq_unlock(rq, p, &flags);
+
+ return ret;
+}
+
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+ return __set_cpus_allowed_ptr(p, new_mask, false);
+}
+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
+
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
@@ -1084,6 +1294,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
+
+ walt_fixup_busy_time(p, new_cpu);
}
__set_task_cpu(p, new_cpu);
@@ -1191,13 +1403,6 @@ out:
return ret;
}
-struct migration_arg {
- struct task_struct *task;
- int dest_cpu;
-};
-
-static int migration_cpu_stop(void *data);
-
/*
* wait_task_inactive - wait for a thread to unschedule.
*
@@ -1330,9 +1535,7 @@ void kick_process(struct task_struct *p)
preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);
-#endif /* CONFIG_SMP */
-#ifdef CONFIG_SMP
/*
* ->cpus_allowed is protected by both rq->lock and p->pi_lock
*/
@@ -1436,7 +1639,16 @@ static void update_avg(u64 *avg, u64 sample)
s64 diff = sample - *avg;
*avg += diff >> 3;
}
-#endif
+
+#else
+
+static inline int __set_cpus_allowed_ptr(struct task_struct *p,
+ const struct cpumask *new_mask, bool check)
+{
+ return set_cpus_allowed_ptr(p, new_mask);
+}
+
+#endif /* CONFIG_SMP */
static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
@@ -1690,6 +1902,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
unsigned long flags;
int cpu, success = 0;
+#ifdef CONFIG_SMP
+ struct rq *rq;
+ u64 wallclock;
+#endif
/*
* If we are going to wake up a thread waiting for CONDITION we
@@ -1742,6 +1958,14 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
*/
smp_rmb();
+ rq = cpu_rq(task_cpu(p));
+
+ raw_spin_lock(&rq->lock);
+ wallclock = walt_ktime_clock();
+ walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+ walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
+ raw_spin_unlock(&rq->lock);
+
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
@@ -1749,10 +1973,12 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
p->sched_class->task_waking(p);
cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
+
if (task_cpu(p) != cpu) {
wake_flags |= WF_MIGRATED;
set_task_cpu(p, cpu);
}
+
#endif /* CONFIG_SMP */
ttwu_queue(p, cpu);
@@ -1791,8 +2017,13 @@ static void try_to_wake_up_local(struct task_struct *p)
if (!(p->state & TASK_NORMAL))
goto out;
- if (!task_on_rq_queued(p))
+ if (!task_on_rq_queued(p)) {
+ u64 wallclock = walt_ktime_clock();
+
+ walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+ walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+ }
ttwu_do_wakeup(rq, p, 0);
ttwu_stat(p, smp_processor_id(), 0);
@@ -1836,6 +2067,10 @@ void __dl_clear_params(struct task_struct *p)
dl_se->dl_period = 0;
dl_se->flags = 0;
dl_se->dl_bw = 0;
+
+ dl_se->dl_throttled = 0;
+ dl_se->dl_new = 1;
+ dl_se->dl_yielded = 0;
}
/*
@@ -1855,13 +2090,18 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.nr_migrations = 0;
p->se.vruntime = 0;
INIT_LIST_HEAD(&p->se.group_node);
+ walt_init_new_task_load(p);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ p->se.cfs_rq = NULL;
+#endif
#ifdef CONFIG_SCHEDSTATS
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
RB_CLEAR_NODE(&p->dl.rb_node);
- hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ init_dl_task_timer(&p->dl);
__dl_clear_params(p);
INIT_LIST_HEAD(&p->rt.run_list);
@@ -2092,6 +2332,9 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
* allocated bandwidth to reflect the new situation.
*
* This function is called while holding p's rq->lock.
+ *
+ * XXX we should delay bw change until the task's 0-lag point, see
+ * __setparam_dl().
*/
static int dl_overflow(struct task_struct *p, int policy,
const struct sched_attr *attr)
@@ -2146,6 +2389,11 @@ void wake_up_new_task(struct task_struct *p)
struct rq *rq;
raw_spin_lock_irqsave(&p->pi_lock, flags);
+
+ walt_init_new_task_load(p);
+
+ /* Initialize new task's runnable average */
+ init_entity_runnable_average(&p->se);
#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
@@ -2155,10 +2403,9 @@ void wake_up_new_task(struct task_struct *p)
set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
- /* Initialize new task's runnable average */
- init_task_runnable_average(p);
rq = __task_rq_lock(p);
- activate_task(rq, p, 0);
+ walt_mark_task_starting(p);
+ activate_task(rq, p, ENQUEUE_WAKEUP_NEW);
p->on_rq = TASK_ON_RQ_QUEUED;
trace_sched_wakeup_new(p, true);
check_preempt_curr(rq, p, WF_FORK);
@@ -2470,11 +2717,41 @@ unsigned long nr_iowait_cpu(int cpu)
return atomic_read(&this->nr_iowait);
}
+#ifdef CONFIG_CPU_QUIET
+u64 nr_running_integral(unsigned int cpu)
+{
+ unsigned int seqcnt;
+ u64 integral;
+ struct rq *q;
+
+ if (cpu >= nr_cpu_ids)
+ return 0;
+
+ q = cpu_rq(cpu);
+
+ /*
+ * Update average to avoid reading stalled value if there were
+ * no run-queue changes for a long time. On the other hand if
+ * the changes are happening right now, just read current value
+ * directly.
+ */
+
+ seqcnt = read_seqcount_begin(&q->ave_seqcnt);
+ integral = do_nr_running_integral(q);
+ if (read_seqcount_retry(&q->ave_seqcnt, seqcnt)) {
+ read_seqcount_begin(&q->ave_seqcnt);
+ integral = q->nr_running_integral;
+ }
+
+ return integral;
+}
+#endif
+
void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
{
- struct rq *this = this_rq();
- *nr_waiters = atomic_read(&this->nr_iowait);
- *load = this->cpu_load[0];
+ struct rq *rq = this_rq();
+ *nr_waiters = atomic_read(&rq->nr_iowait);
+ *load = rq->load.weight;
}
#ifdef CONFIG_SMP
@@ -2556,6 +2833,93 @@ unsigned long long task_sched_runtime(struct task_struct *p)
return ns;
}
+#ifdef CONFIG_CPU_FREQ_GOV_SCHED
+
+static inline
+unsigned long add_capacity_margin(unsigned long cpu_capacity)
+{
+ cpu_capacity = cpu_capacity * capacity_margin;
+ cpu_capacity /= SCHED_CAPACITY_SCALE;
+ return cpu_capacity;
+}
+
+static inline
+unsigned long sum_capacity_reqs(unsigned long cfs_cap,
+ struct sched_capacity_reqs *scr)
+{
+ unsigned long total = add_capacity_margin(cfs_cap + scr->rt);
+ return total += scr->dl;
+}
+
+static void sched_freq_tick_pelt(int cpu)
+{
+ unsigned long cpu_utilization = capacity_max;
+ unsigned long capacity_curr = capacity_curr_of(cpu);
+ struct sched_capacity_reqs *scr;
+
+ scr = &per_cpu(cpu_sched_capacity_reqs, cpu);
+ if (sum_capacity_reqs(cpu_utilization, scr) < capacity_curr)
+ return;
+
+ /*
+ * To make free room for a task that is building up its "real"
+ * utilization and to harm its performance the least, request
+ * a jump to a higher OPP as soon as the margin of free capacity
+ * is impacted (specified by capacity_margin).
+ */
+ set_cfs_cpu_capacity(cpu, true, cpu_utilization);
+}
+
+#ifdef CONFIG_SCHED_WALT
+static void sched_freq_tick_walt(int cpu)
+{
+ unsigned long cpu_utilization = cpu_util(cpu);
+ unsigned long capacity_curr = capacity_curr_of(cpu);
+
+ if (walt_disabled || !sysctl_sched_use_walt_cpu_util)
+ return sched_freq_tick_pelt(cpu);
+
+ /*
+ * Add a margin to the WALT utilization.
+ * NOTE: WALT tracks a single CPU signal for all the scheduling
+ * classes, thus this margin is going to be added to the DL class as
+ * well, which is something we do not do in sched_freq_tick_pelt case.
+ */
+ cpu_utilization = add_capacity_margin(cpu_utilization);
+ if (cpu_utilization <= capacity_curr)
+ return;
+
+ /*
+ * It is likely that the load is growing so we
+ * keep the added margin in our request as an
+ * extra boost.
+ */
+ set_cfs_cpu_capacity(cpu, true, cpu_utilization);
+
+}
+#define _sched_freq_tick(cpu) sched_freq_tick_walt(cpu)
+#else
+#define _sched_freq_tick(cpu) sched_freq_tick_pelt(cpu)
+#endif /* CONFIG_SCHED_WALT */
+
+static void sched_freq_tick(int cpu)
+{
+ unsigned long capacity_orig, capacity_curr;
+
+ if (!sched_freq())
+ return;
+
+ capacity_orig = capacity_orig_of(cpu);
+ capacity_curr = capacity_curr_of(cpu);
+ if (capacity_curr == capacity_orig)
+ return;
+
+ _sched_freq_tick(cpu);
+}
+#else
+static inline void sched_freq_tick(int cpu) { }
+#endif /* CONFIG_CPU_FREQ_GOV_SCHED */
+
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
@@ -2569,9 +2933,14 @@ void scheduler_tick(void)
sched_clock_tick();
raw_spin_lock(&rq->lock);
+ walt_set_window_start(rq);
update_rq_clock(rq);
curr->sched_class->task_tick(rq, curr, 0);
update_cpu_load_active(rq);
+ walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
+ walt_ktime_clock(), 0);
+ calc_global_load_tick(rq);
+ sched_freq_tick(cpu);
raw_spin_unlock(&rq->lock);
perf_event_task_tick();
@@ -2600,7 +2969,7 @@ void scheduler_tick(void)
u64 scheduler_tick_max_deferment(void)
{
struct rq *rq = this_rq();
- unsigned long next, now = ACCESS_ONCE(jiffies);
+ unsigned long next, now = READ_ONCE(jiffies);
next = rq->last_sched_tick + HZ;
@@ -2808,6 +3177,7 @@ static void __sched __schedule(void)
unsigned long *switch_count;
struct rq *rq;
int cpu;
+ u64 wallclock;
need_resched:
preempt_disable();
@@ -2857,6 +3227,9 @@ need_resched:
update_rq_clock(rq);
next = pick_next_task(rq, prev);
+ wallclock = walt_ktime_clock();
+ walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0);
+ walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0);
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
rq->skip_clock_update = 0;
@@ -3313,15 +3686,31 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
{
struct sched_dl_entity *dl_se = &p->dl;
- init_dl_task_timer(dl_se);
dl_se->dl_runtime = attr->sched_runtime;
dl_se->dl_deadline = attr->sched_deadline;
dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
dl_se->flags = attr->sched_flags;
dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
- dl_se->dl_throttled = 0;
- dl_se->dl_new = 1;
- dl_se->dl_yielded = 0;
+
+ /*
+ * Changing the parameters of a task is 'tricky' and we're not doing
+ * the correct thing -- also see task_dead_dl() and switched_from_dl().
+ *
+ * What we SHOULD do is delay the bandwidth release until the 0-lag
+ * point. This would include retaining the task_struct until that time
+ * and change dl_overflow() to not immediately decrement the current
+ * amount.
+ *
+ * Instead we retain the current runtime/deadline and let the new
+ * parameters take effect after the current reservation period lapses.
+ * This is safe (albeit pessimistic) because the 0-lag point is always
+ * before the current scheduling deadline.
+ *
+ * We can still have temporary overloads because we do not delay the
+ * change in bandwidth until that time; so admission control is
+ * not on the safe side. It does however guarantee tasks will never
+ * consume more than promised.
+ */
}
/*
@@ -3736,6 +4125,7 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy,
{
return _sched_setscheduler(p, policy, param, false);
}
+EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
static int
do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
@@ -4116,7 +4506,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
}
#endif
again:
- retval = set_cpus_allowed_ptr(p, new_mask);
+ retval = __set_cpus_allowed_ptr(p, new_mask, true);
if (!retval) {
cpuset_cpus_allowed(p, cpus_allowed);
@@ -4432,36 +4822,26 @@ EXPORT_SYMBOL_GPL(yield_to);
* This task is about to go to sleep on IO. Increment rq->nr_iowait so
* that process accounting knows that this is a task in IO wait state.
*/
-void __sched io_schedule(void)
-{
- struct rq *rq = raw_rq();
-
- delayacct_blkio_start();
- atomic_inc(&rq->nr_iowait);
- blk_flush_plug(current);
- current->in_iowait = 1;
- schedule();
- current->in_iowait = 0;
- atomic_dec(&rq->nr_iowait);
- delayacct_blkio_end();
-}
-EXPORT_SYMBOL(io_schedule);
-
long __sched io_schedule_timeout(long timeout)
{
- struct rq *rq = raw_rq();
+ int old_iowait = current->in_iowait;
+ struct rq *rq;
long ret;
+ current->in_iowait = 1;
+ blk_schedule_flush_plug(current);
+
delayacct_blkio_start();
+ rq = raw_rq();
atomic_inc(&rq->nr_iowait);
- blk_flush_plug(current);
- current->in_iowait = 1;
ret = schedule_timeout(timeout);
- current->in_iowait = 0;
+ current->in_iowait = old_iowait;
atomic_dec(&rq->nr_iowait);
delayacct_blkio_end();
+
return ret;
}
+EXPORT_SYMBOL(io_schedule_timeout);
/**
* sys_sched_get_priority_max - return maximum RT priority.
@@ -4657,9 +5037,11 @@ void init_idle(struct task_struct *idle, int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_lock_irqsave(&idle->pi_lock, flags);
+ raw_spin_lock(&rq->lock);
__sched_fork(0, idle);
+
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
@@ -4683,7 +5065,8 @@ void init_idle(struct task_struct *idle, int cpu)
#if defined(CONFIG_SMP)
idle->on_cpu = 1;
#endif
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
/* Set the preempt count _outside_ the spinlocks! */
init_idle_preempt_count(idle, cpu);
@@ -4700,149 +5083,6 @@ void init_idle(struct task_struct *idle, int cpu)
}
#ifdef CONFIG_SMP
-/*
- * move_queued_task - move a queued task to new rq.
- *
- * Returns (locked) new rq. Old rq's lock is released.
- */
-static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
-{
- struct rq *rq = task_rq(p);
-
- lockdep_assert_held(&rq->lock);
-
- dequeue_task(rq, p, 0);
- p->on_rq = TASK_ON_RQ_MIGRATING;
- set_task_cpu(p, new_cpu);
- raw_spin_unlock(&rq->lock);
-
- rq = cpu_rq(new_cpu);
-
- raw_spin_lock(&rq->lock);
- BUG_ON(task_cpu(p) != new_cpu);
- p->on_rq = TASK_ON_RQ_QUEUED;
- enqueue_task(rq, p, 0);
- check_preempt_curr(rq, p, 0);
-
- return rq;
-}
-
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
-{
- if (p->sched_class && p->sched_class->set_cpus_allowed)
- p->sched_class->set_cpus_allowed(p, new_mask);
-
- cpumask_copy(&p->cpus_allowed, new_mask);
- p->nr_cpus_allowed = cpumask_weight(new_mask);
-}
-
-/*
- * This is how migration works:
- *
- * 1) we invoke migration_cpu_stop() on the target CPU using
- * stop_one_cpu().
- * 2) stopper starts to run (implicitly forcing the migrated thread
- * off the CPU)
- * 3) it checks whether the migrated task is still in the wrong runqueue.
- * 4) if it's in the wrong runqueue then the migration thread removes
- * it and puts it into the right queue.
- * 5) stopper completes and stop_one_cpu() returns and the migration
- * is done.
- */
-
-/*
- * Change a given task's CPU affinity. Migrate the thread to a
- * proper CPU and schedule it away if the CPU it's executing on
- * is removed from the allowed bitmask.
- *
- * NOTE: the caller must have a valid reference to the task, the
- * task must not exit() & deallocate itself prematurely. The
- * call is not atomic; no spinlocks may be held.
- */
-int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
-{
- unsigned long flags;
- struct rq *rq;
- unsigned int dest_cpu;
- int ret = 0;
-
- rq = task_rq_lock(p, &flags);
-
- if (cpumask_equal(&p->cpus_allowed, new_mask))
- goto out;
-
- if (!cpumask_intersects(new_mask, cpu_active_mask)) {
- ret = -EINVAL;
- goto out;
- }
-
- do_set_cpus_allowed(p, new_mask);
-
- /* Can the task run on the task's current CPU? If so, we're done */
- if (cpumask_test_cpu(task_cpu(p), new_mask))
- goto out;
-
- dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
- if (task_running(rq, p) || p->state == TASK_WAKING) {
- struct migration_arg arg = { p, dest_cpu };
- /* Need help from migration thread: drop lock and wait. */
- task_rq_unlock(rq, p, &flags);
- stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
- tlb_migrate_finish(p->mm);
- return 0;
- } else if (task_on_rq_queued(p))
- rq = move_queued_task(p, dest_cpu);
-out:
- task_rq_unlock(rq, p, &flags);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
-
-/*
- * Move (not current) task off this cpu, onto dest cpu. We're doing
- * this because either it can't run here any more (set_cpus_allowed()
- * away from this CPU, or CPU going down), or because we're
- * attempting to rebalance this task on exec (sched_exec).
- *
- * So we race with normal scheduler movements, but that's OK, as long
- * as the task is no longer on this CPU.
- *
- * Returns non-zero if task was successfully migrated.
- */
-static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
-{
- struct rq *rq;
- int ret = 0;
-
- if (unlikely(!cpu_active(dest_cpu)))
- return ret;
-
- rq = cpu_rq(src_cpu);
-
- raw_spin_lock(&p->pi_lock);
- raw_spin_lock(&rq->lock);
- /* Already moved. */
- if (task_cpu(p) != src_cpu)
- goto done;
-
- /* Affinity changed (again). */
- if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
- goto fail;
-
- /*
- * If we're not on a rq, the next wake-up will ensure we're
- * placed properly.
- */
- if (task_on_rq_queued(p))
- rq = move_queued_task(p, dest_cpu);
-done:
- ret = 1;
-fail:
- raw_spin_unlock(&rq->lock);
- raw_spin_unlock(&p->pi_lock);
- return ret;
-}
#ifdef CONFIG_NUMA_BALANCING
/* Migrate current task p to target_cpu */
@@ -4890,35 +5130,9 @@ void sched_setnuma(struct task_struct *p, int nid)
enqueue_task(rq, p, 0);
task_rq_unlock(rq, p, &flags);
}
-#endif
-
-/*
- * migration_cpu_stop - this will be executed by a highprio stopper thread
- * and performs thread migration by bumping thread off CPU then
- * 'pushing' onto another runqueue.
- */
-static int migration_cpu_stop(void *data)
-{
- struct migration_arg *arg = data;
-
- /*
- * The original target cpu might have gone down and we might
- * be on another cpu but it doesn't matter.
- */
- local_irq_disable();
- /*
- * We need to explicitly wake pending tasks before running
- * __migrate_task() such that we will not miss enforcing cpus_allowed
- * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
- */
- sched_ttwu_pending();
- __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
- local_irq_enable();
- return 0;
-}
+#endif /* CONFIG_NUMA_BALANCING */
#ifdef CONFIG_HOTPLUG_CPU
-
/*
* Ensures that the idle task is using init_mm right before its cpu goes
* offline.
@@ -5021,7 +5235,6 @@ static void migrate_tasks(unsigned int dead_cpu)
rq->stop = stop;
}
-
#endif /* CONFIG_HOTPLUG_CPU */
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -5094,9 +5307,60 @@ set_table_entry(struct ctl_table *entry,
}
static struct ctl_table *
+sd_alloc_ctl_energy_table(struct sched_group_energy *sge)
+{
+ struct ctl_table *table = sd_alloc_ctl_entry(5);
+
+ if (table == NULL)
+ return NULL;
+
+ set_table_entry(&table[0], "nr_idle_states", &sge->nr_idle_states,
+ sizeof(int), 0644, proc_dointvec_minmax, false);
+ set_table_entry(&table[1], "idle_states", &sge->idle_states[0].power,
+ sge->nr_idle_states*sizeof(struct idle_state), 0644,
+ proc_doulongvec_minmax, false);
+ set_table_entry(&table[2], "nr_cap_states", &sge->nr_cap_states,
+ sizeof(int), 0644, proc_dointvec_minmax, false);
+ set_table_entry(&table[3], "cap_states", &sge->cap_states[0].cap,
+ sge->nr_cap_states*sizeof(struct capacity_state), 0644,
+ proc_doulongvec_minmax, false);
+
+ return table;
+}
+
+static struct ctl_table *
+sd_alloc_ctl_group_table(struct sched_group *sg)
+{
+ struct ctl_table *table = sd_alloc_ctl_entry(2);
+
+ if (table == NULL)
+ return NULL;
+
+ table->procname = kstrdup("energy", GFP_KERNEL);
+ table->mode = 0555;
+ table->child = sd_alloc_ctl_energy_table((struct sched_group_energy *)sg->sge);
+
+ return table;
+}
+
+static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd)
{
- struct ctl_table *table = sd_alloc_ctl_entry(14);
+ struct ctl_table *table;
+ unsigned int nr_entries = 14;
+
+ int i = 0;
+ struct sched_group *sg = sd->groups;
+
+ if (sg->sge) {
+ int nr_sgs = 0;
+
+ do {} while (nr_sgs++, sg = sg->next, sg != sd->groups);
+
+ nr_entries += nr_sgs;
+ }
+
+ table = sd_alloc_ctl_entry(nr_entries);
if (table == NULL)
return NULL;
@@ -5129,7 +5393,19 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[12], "name", sd->name,
CORENAME_MAX_SIZE, 0444, proc_dostring, false);
- /* &table[13] is terminator */
+ sg = sd->groups;
+ if (sg->sge) {
+ char buf[32];
+ struct ctl_table *entry = &table[13];
+
+ do {
+ snprintf(buf, 32, "group%d", i);
+ entry->procname = kstrdup(buf, GFP_KERNEL);
+ entry->mode = 0555;
+ entry->child = sd_alloc_ctl_group_table(sg);
+ } while (entry++, i++, sg = sg->next, sg != sd->groups);
+ }
+ /* &table[nr_entries-1] is terminator */
return table;
}
@@ -5200,7 +5476,7 @@ static void register_sched_domain_sysctl(void)
static void unregister_sched_domain_sysctl(void)
{
}
-#endif
+#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */
static void set_rq_online(struct rq *rq)
{
@@ -5246,6 +5522,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ walt_set_window_start(rq);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
rq->calc_load_update = calc_load_update;
break;
@@ -5265,6 +5544,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
sched_ttwu_pending();
/* Update our root-domain */
raw_spin_lock_irqsave(&rq->lock, flags);
+ walt_migrate_sync_cpu(cpu);
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
@@ -5378,9 +5658,6 @@ static int __init migration_init(void)
return 0;
}
early_initcall(migration_init);
-#endif
-
-#ifdef CONFIG_SMP
static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
@@ -5439,17 +5716,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break;
}
- /*
- * Even though we initialize ->capacity to something semi-sane,
- * we leave capacity_orig unset. This allows us to detect if
- * domain iteration is still funny without causing /0 traps.
- */
- if (!group->sgc->capacity_orig) {
- printk(KERN_CONT "\n");
- printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n");
- break;
- }
-
if (!cpumask_weight(sched_group_cpus(group))) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: empty group\n");
@@ -5469,7 +5735,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
printk(KERN_CONT " %s", str);
if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
- printk(KERN_CONT " (cpu_capacity = %d)",
+ printk(KERN_CONT " (cpu_capacity = %lu)",
group->sgc->capacity);
}
@@ -5530,7 +5796,8 @@ static int sd_degenerate(struct sched_domain *sd)
SD_BALANCE_EXEC |
SD_SHARE_CPUCAPACITY |
SD_SHARE_PKG_RESOURCES |
- SD_SHARE_POWERDOMAIN)) {
+ SD_SHARE_POWERDOMAIN |
+ SD_SHARE_CAP_STATES)) {
if (sd->groups != sd->groups->next)
return 0;
}
@@ -5562,7 +5829,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
SD_SHARE_CPUCAPACITY |
SD_SHARE_PKG_RESOURCES |
SD_PREFER_SIBLING |
- SD_SHARE_POWERDOMAIN);
+ SD_SHARE_POWERDOMAIN |
+ SD_SHARE_CAP_STATES);
if (nr_node_ids == 1)
pflags &= ~SD_SERIALIZE;
}
@@ -5641,6 +5909,8 @@ static int init_rootdomain(struct root_domain *rd)
if (cpupri_init(&rd->cpupri) != 0)
goto free_rto_mask;
+
+ init_max_cpu_capacity(&rd->max_cpu_capacity);
return 0;
free_rto_mask:
@@ -5746,11 +6016,13 @@ DEFINE_PER_CPU(int, sd_llc_id);
DEFINE_PER_CPU(struct sched_domain *, sd_numa);
DEFINE_PER_CPU(struct sched_domain *, sd_busy);
DEFINE_PER_CPU(struct sched_domain *, sd_asym);
+DEFINE_PER_CPU(struct sched_domain *, sd_ea);
+DEFINE_PER_CPU(struct sched_domain *, sd_scs);
static void update_top_cache_domain(int cpu)
{
struct sched_domain *sd;
- struct sched_domain *busy_sd = NULL;
+ struct sched_domain *busy_sd = NULL, *ea_sd = NULL;
int id = cpu;
int size = 1;
@@ -5771,6 +6043,17 @@ static void update_top_cache_domain(int cpu)
sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
+
+ for_each_domain(cpu, sd) {
+ if (sd->groups->sge)
+ ea_sd = sd;
+ else
+ break;
+ }
+ rcu_assign_pointer(per_cpu(sd_ea, cpu), ea_sd);
+
+ sd = highest_flag_domain(cpu, SD_SHARE_CAP_STATES);
+ rcu_assign_pointer(per_cpu(sd_scs, cpu), sd);
}
/*
@@ -5950,7 +6233,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
* die on a /0 trap.
*/
sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
- sg->sgc->capacity_orig = sg->sgc->capacity;
+ sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
/*
* Make sure the first group of this domain contains the
@@ -6080,6 +6363,66 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
}
/*
+ * Check that the per-cpu provided sd energy data is consistent for all cpus
+ * within the mask.
+ */
+static inline void check_sched_energy_data(int cpu, sched_domain_energy_f fn,
+ const struct cpumask *cpumask)
+{
+ const struct sched_group_energy * const sge = fn(cpu);
+ struct cpumask mask;
+ int i;
+
+ if (cpumask_weight(cpumask) <= 1)
+ return;
+
+ cpumask_xor(&mask, cpumask, get_cpu_mask(cpu));
+
+ for_each_cpu(i, &mask) {
+ const struct sched_group_energy * const e = fn(i);
+ int y;
+
+ BUG_ON(e->nr_idle_states != sge->nr_idle_states);
+
+ for (y = 0; y < (e->nr_idle_states); y++) {
+ BUG_ON(e->idle_states[y].power !=
+ sge->idle_states[y].power);
+ }
+
+ BUG_ON(e->nr_cap_states != sge->nr_cap_states);
+
+ for (y = 0; y < (e->nr_cap_states); y++) {
+ BUG_ON(e->cap_states[y].cap != sge->cap_states[y].cap);
+ BUG_ON(e->cap_states[y].power !=
+ sge->cap_states[y].power);
+ }
+ }
+}
+
+static void init_sched_energy(int cpu, struct sched_domain *sd,
+ sched_domain_energy_f fn)
+{
+ if (!(fn && fn(cpu)))
+ return;
+
+ if (cpu != group_balance_cpu(sd->groups))
+ return;
+
+ if (sd->child && !sd->child->groups->sge) {
+ pr_err("BUG: EAS setup broken for CPU%d\n", cpu);
+#ifdef CONFIG_SCHED_DEBUG
+ pr_err(" energy data on %s but not on %s domain\n",
+ sd->name, sd->child->name);
+#endif
+ return;
+ }
+
+ check_sched_energy_data(cpu, fn, sched_group_cpus(sd->groups));
+
+ sd->groups->sge = fn(cpu);
+}
+
+/*
* Initializers for schedule domains
* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
*/
@@ -6185,6 +6528,7 @@ static int sched_domains_curr_level;
* SD_SHARE_PKG_RESOURCES - describes shared caches
* SD_NUMA - describes NUMA topologies
* SD_SHARE_POWERDOMAIN - describes shared power domain
+ * SD_SHARE_CAP_STATES - describes shared capacity states
*
* Odd one out:
* SD_ASYM_PACKING - describes SMT quirks
@@ -6194,7 +6538,8 @@ static int sched_domains_curr_level;
SD_SHARE_PKG_RESOURCES | \
SD_NUMA | \
SD_ASYM_PACKING | \
- SD_SHARE_POWERDOMAIN)
+ SD_SHARE_POWERDOMAIN | \
+ SD_SHARE_CAP_STATES)
static struct sched_domain *
sd_init(struct sched_domain_topology_level *tl, int cpu)
@@ -6259,6 +6604,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
*/
if (sd->flags & SD_SHARE_CPUCAPACITY) {
+ sd->flags |= SD_PREFER_SIBLING;
sd->imbalance_pct = 110;
sd->smt_gain = 1178; /* ~15% */
@@ -6583,7 +6929,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
struct sched_group *sg;
struct sched_group_capacity *sgc;
- sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
+ sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
if (!sd)
return -ENOMEM;
@@ -6687,6 +7033,7 @@ static int build_sched_domains(const struct cpumask *cpu_map,
enum s_alloc alloc_state;
struct sched_domain *sd;
struct s_data d;
+ struct rq *rq = NULL;
int i, ret = -ENOMEM;
alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
@@ -6725,10 +7072,13 @@ static int build_sched_domains(const struct cpumask *cpu_map,
/* Calculate CPU capacity for physical packages and nodes */
for (i = nr_cpumask_bits-1; i >= 0; i--) {
+ struct sched_domain_topology_level *tl = sched_domain_topology;
+
if (!cpumask_test_cpu(i, cpu_map))
continue;
- for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent, tl++) {
+ init_sched_energy(i, sd, tl->energy);
claim_allocations(i, sd);
init_sched_groups_capacity(i, sd);
}
@@ -6737,6 +7087,7 @@ static int build_sched_domains(const struct cpumask *cpu_map,
/* Attach the domains */
rcu_read_lock();
for_each_cpu(i, cpu_map) {
+ rq = cpu_rq(i);
sd = *per_cpu_ptr(d.sd, i);
cpu_attach_domain(sd, d.rd, i);
}
@@ -7001,6 +7352,7 @@ void __init sched_init_smp(void)
{
cpumask_var_t non_isolated_cpus;
+ walt_init_cpu_efficiency();
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
@@ -7056,6 +7408,9 @@ int in_sched_functions(unsigned long addr)
*/
struct task_group root_task_group;
LIST_HEAD(task_groups);
+
+/* Cacheline aligned slab cache for task_group */
+static struct kmem_cache *task_group_cache __read_mostly;
#endif
DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
@@ -7116,11 +7471,12 @@ void __init sched_init(void)
#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_CGROUP_SCHED
+ task_group_cache = KMEM_CACHE(task_group, 0);
+
list_add(&root_task_group.list, &task_groups);
INIT_LIST_HEAD(&root_task_group.children);
INIT_LIST_HEAD(&root_task_group.siblings);
autogroup_init(&init_task);
-
#endif /* CONFIG_CGROUP_SCHED */
for_each_possible_cpu(i) {
@@ -7173,7 +7529,7 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
- rq->cpu_capacity = SCHED_CAPACITY_SCALE;
+ rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
rq->post_schedule = 0;
rq->active_balance = 0;
rq->next_balance = jiffies;
@@ -7183,6 +7539,11 @@ void __init sched_init(void)
rq->idle_stamp = 0;
rq->avg_idle = 2*sysctl_sched_migration_cost;
rq->max_idle_balance_cost = sysctl_sched_migration_cost;
+#ifdef CONFIG_SCHED_WALT
+ rq->cur_irqload = 0;
+ rq->avg_irqload = 0;
+ rq->irqload_ts = 0;
+#endif
INIT_LIST_HEAD(&rq->cfs_tasks);
@@ -7211,6 +7572,11 @@ void __init sched_init(void)
enter_lazy_tlb(&init_mm, current);
/*
+ * During early bootup we pretend to be a normal task:
+ */
+ current->sched_class = &fair_sched_class;
+
+ /*
* Make us the idle thread. Technically, schedule() should not be
* called from this thread, however somewhere below it might be,
* but because we are the idle thread, we just pick up running again
@@ -7220,11 +7586,6 @@ void __init sched_init(void)
calc_load_update = jiffies + LOAD_FREQ;
- /*
- * During early bootup we pretend to be a normal task:
- */
- current->sched_class = &fair_sched_class;
-
#ifdef CONFIG_SMP
zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
/* May be allocated at isolcpus cmdline parse time */
@@ -7246,14 +7607,24 @@ static inline int preempt_count_equals(int preempt_offset)
return (nested == preempt_offset);
}
+static int __might_sleep_init_called;
+int __init __might_sleep_init(void)
+{
+ __might_sleep_init_called = 1;
+ return 0;
+}
+early_initcall(__might_sleep_init);
+
void __might_sleep(const char *file, int line, int preempt_offset)
{
static unsigned long prev_jiffy; /* ratelimiting */
rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
- !is_idle_task(current)) ||
- system_state != SYSTEM_RUNNING || oops_in_progress)
+ !is_idle_task(current)) || oops_in_progress)
+ return;
+ if (system_state != SYSTEM_RUNNING &&
+ (!__might_sleep_init_called || system_state != SYSTEM_BOOTING))
return;
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
return;
@@ -7402,7 +7773,7 @@ static void free_sched_group(struct task_group *tg)
free_fair_sched_group(tg);
free_rt_sched_group(tg);
autogroup_free(tg);
- kfree(tg);
+ kmem_cache_free(task_group_cache, tg);
}
/* allocate runqueue etc for a new task group */
@@ -7410,7 +7781,7 @@ struct task_group *sched_create_group(struct task_group *parent)
{
struct task_group *tg;
- tg = kzalloc(sizeof(*tg), GFP_KERNEL);
+ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
if (!tg)
return ERR_PTR(-ENOMEM);
@@ -7505,7 +7876,7 @@ void sched_move_task(struct task_struct *tsk)
#ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->task_move_group)
- tsk->sched_class->task_move_group(tsk, queued);
+ tsk->sched_class->task_move_group(tsk);
else
#endif
set_task_rq(tsk, task_cpu(tsk));
@@ -7969,14 +8340,6 @@ static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
struct cgroup_subsys_state *old_css,
struct task_struct *task)
{
- /*
- * cgroup_exit() is called in the copy_process() failure path.
- * Ignore this case since the task hasn't ran yet, this avoids
- * trying to poke a half freed task state from generic code.
- */
- if (!(task->flags & PF_EXITING))
- return;
-
sched_move_task(task);
}
diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c
new file mode 100644
index 000000000000..8fe35577a131
--- /dev/null
+++ b/kernel/sched/cpufreq_sched.c
@@ -0,0 +1,500 @@
+/*
+ * Copyright (C) 2015 Michael Turquette <mturquette@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/cpufreq.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/percpu.h>
+#include <linux/irq_work.h>
+#include <linux/delay.h>
+#include <linux/string.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/cpufreq_sched.h>
+
+#include "sched.h"
+
+#define THROTTLE_DOWN_NSEC 50000000 /* 50ms default */
+#define THROTTLE_UP_NSEC 500000 /* 500us default */
+
+struct static_key __read_mostly __sched_freq = STATIC_KEY_INIT_FALSE;
+static bool __read_mostly cpufreq_driver_slow;
+
+#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED
+static struct cpufreq_governor cpufreq_gov_sched;
+#endif
+
+static DEFINE_PER_CPU(unsigned long, enabled);
+DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs);
+
+/**
+ * gov_data - per-policy data internal to the governor
+ * @up_throttle: next throttling period expiry if increasing OPP
+ * @down_throttle: next throttling period expiry if decreasing OPP
+ * @up_throttle_nsec: throttle period length in nanoseconds if increasing OPP
+ * @down_throttle_nsec: throttle period length in nanoseconds if decreasing OPP
+ * @task: worker thread for dvfs transition that may block/sleep
+ * @irq_work: callback used to wake up worker thread
+ * @requested_freq: last frequency requested by the sched governor
+ *
+ * struct gov_data is the per-policy cpufreq_sched-specific data structure. A
+ * per-policy instance of it is created when the cpufreq_sched governor receives
+ * the CPUFREQ_GOV_START condition and a pointer to it exists in the gov_data
+ * member of struct cpufreq_policy.
+ *
+ * Readers of this data must call down_read(policy->rwsem). Writers must
+ * call down_write(policy->rwsem).
+ */
+struct gov_data {
+ ktime_t up_throttle;
+ ktime_t down_throttle;
+ unsigned int up_throttle_nsec;
+ unsigned int down_throttle_nsec;
+ struct task_struct *task;
+ struct irq_work irq_work;
+ unsigned int requested_freq;
+};
+
+static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy,
+ unsigned int freq)
+{
+ struct gov_data *gd = policy->governor_data;
+
+ /* avoid race with cpufreq_sched_stop */
+ if (!down_write_trylock(&policy->rwsem))
+ return;
+
+ __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L);
+
+ gd->up_throttle = ktime_add_ns(ktime_get(), gd->up_throttle_nsec);
+ gd->down_throttle = ktime_add_ns(ktime_get(), gd->down_throttle_nsec);
+ up_write(&policy->rwsem);
+}
+
+static bool finish_last_request(struct gov_data *gd, unsigned int cur_freq)
+{
+ ktime_t now = ktime_get();
+
+ ktime_t throttle = gd->requested_freq < cur_freq ?
+ gd->down_throttle : gd->up_throttle;
+
+ if (ktime_after(now, throttle))
+ return false;
+
+ while (1) {
+ int usec_left = ktime_to_ns(ktime_sub(throttle, now));
+
+ usec_left /= NSEC_PER_USEC;
+ trace_cpufreq_sched_throttled(usec_left);
+ usleep_range(usec_left, usec_left + 100);
+ now = ktime_get();
+ if (ktime_after(now, throttle))
+ return true;
+ }
+}
+
+/*
+ * we pass in struct cpufreq_policy. This is safe because changing out the
+ * policy requires a call to __cpufreq_governor(policy, CPUFREQ_GOV_STOP),
+ * which tears down all of the data structures and __cpufreq_governor(policy,
+ * CPUFREQ_GOV_START) will do a full rebuild, including this kthread with the
+ * new policy pointer
+ */
+static int cpufreq_sched_thread(void *data)
+{
+ struct sched_param param;
+ struct cpufreq_policy *policy;
+ struct gov_data *gd;
+ unsigned int new_request = 0;
+ unsigned int last_request = 0;
+ int ret;
+
+ policy = (struct cpufreq_policy *) data;
+ gd = policy->governor_data;
+
+ param.sched_priority = 50;
+ ret = sched_setscheduler_nocheck(gd->task, SCHED_FIFO, &param);
+ if (ret) {
+ pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
+ do_exit(-EINVAL);
+ } else {
+ pr_debug("%s: kthread (%d) set to SCHED_FIFO\n",
+ __func__, gd->task->pid);
+ }
+
+ do {
+ new_request = gd->requested_freq;
+ if (new_request == last_request) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (kthread_should_stop())
+ break;
+ schedule();
+ } else {
+ /*
+ * if the frequency thread sleeps while waiting to be
+ * unthrottled, start over to check for a newer request
+ */
+ if (finish_last_request(gd, policy->cur))
+ continue;
+ last_request = new_request;
+ cpufreq_sched_try_driver_target(policy, new_request);
+ }
+ } while (!kthread_should_stop());
+
+ return 0;
+}
+
+static void cpufreq_sched_irq_work(struct irq_work *irq_work)
+{
+ struct gov_data *gd;
+
+ gd = container_of(irq_work, struct gov_data, irq_work);
+ if (!gd)
+ return;
+
+ wake_up_process(gd->task);
+}
+
+static void update_fdomain_capacity_request(int cpu)
+{
+ unsigned int freq_new, index_new, cpu_tmp;
+ struct cpufreq_policy *policy;
+ struct gov_data *gd;
+ unsigned long capacity = 0;
+
+ /*
+ * Avoid grabbing the policy if possible. A test is still
+ * required after locking the CPU's policy to avoid racing
+ * with the governor changing.
+ */
+ if (!per_cpu(enabled, cpu))
+ return;
+
+ policy = cpufreq_cpu_get(cpu);
+ if (IS_ERR_OR_NULL(policy))
+ return;
+
+ if (policy->governor != &cpufreq_gov_sched ||
+ !policy->governor_data)
+ goto out;
+
+ gd = policy->governor_data;
+
+ /* find max capacity requested by cpus in this policy */
+ for_each_cpu(cpu_tmp, policy->cpus) {
+ struct sched_capacity_reqs *scr;
+
+ scr = &per_cpu(cpu_sched_capacity_reqs, cpu_tmp);
+ capacity = max(capacity, scr->total);
+ }
+
+ /* Convert the new maximum capacity request into a cpu frequency */
+ freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT;
+ if (cpufreq_frequency_table_target(policy, policy->freq_table,
+ freq_new, CPUFREQ_RELATION_L,
+ &index_new))
+ goto out;
+ freq_new = policy->freq_table[index_new].frequency;
+
+ if (freq_new > policy->max)
+ freq_new = policy->max;
+
+ if (freq_new < policy->min)
+ freq_new = policy->min;
+
+ trace_cpufreq_sched_request_opp(cpu, capacity, freq_new,
+ gd->requested_freq);
+ if (freq_new == gd->requested_freq)
+ goto out;
+
+ gd->requested_freq = freq_new;
+
+ /*
+ * Throttling is not yet supported on platforms with fast cpufreq
+ * drivers.
+ */
+ if (cpufreq_driver_slow)
+ irq_work_queue_on(&gd->irq_work, cpu);
+ else
+ cpufreq_sched_try_driver_target(policy, freq_new);
+
+out:
+ cpufreq_cpu_put(policy);
+}
+
+void update_cpu_capacity_request(int cpu, bool request)
+{
+ unsigned long new_capacity;
+ struct sched_capacity_reqs *scr;
+
+ /* The rq lock serializes access to the CPU's sched_capacity_reqs. */
+ lockdep_assert_held(&cpu_rq(cpu)->lock);
+
+ scr = &per_cpu(cpu_sched_capacity_reqs, cpu);
+
+ new_capacity = scr->cfs + scr->rt;
+ new_capacity = new_capacity * capacity_margin
+ / SCHED_CAPACITY_SCALE;
+ new_capacity += scr->dl;
+
+ if (new_capacity == scr->total)
+ return;
+
+ trace_cpufreq_sched_update_capacity(cpu, request, scr, new_capacity);
+
+ scr->total = new_capacity;
+ if (request)
+ update_fdomain_capacity_request(cpu);
+}
+
+static inline void set_sched_freq(void)
+{
+ static_key_slow_inc(&__sched_freq);
+}
+
+static inline void clear_sched_freq(void)
+{
+ static_key_slow_dec(&__sched_freq);
+}
+
+static struct attribute_group sched_attr_group_gov_pol;
+static struct attribute_group *get_sysfs_attr(void)
+{
+ return &sched_attr_group_gov_pol;
+}
+
+static int cpufreq_sched_policy_init(struct cpufreq_policy *policy)
+{
+ struct gov_data *gd;
+ int cpu;
+ int rc;
+
+ for_each_cpu(cpu, policy->cpus)
+ memset(&per_cpu(cpu_sched_capacity_reqs, cpu), 0,
+ sizeof(struct sched_capacity_reqs));
+
+ gd = kzalloc(sizeof(*gd), GFP_KERNEL);
+ if (!gd)
+ return -ENOMEM;
+
+ gd->up_throttle_nsec = policy->cpuinfo.transition_latency ?
+ policy->cpuinfo.transition_latency :
+ THROTTLE_UP_NSEC;
+ gd->down_throttle_nsec = THROTTLE_DOWN_NSEC;
+ pr_debug("%s: throttle threshold = %u [ns]\n",
+ __func__, gd->up_throttle_nsec);
+
+ policy->governor_data = gd;
+
+ rc = sysfs_create_group(get_governor_parent_kobj(policy), get_sysfs_attr());
+ if (rc) {
+ pr_err("%s: couldn't create sysfs attributes: %d\n", __func__, rc);
+ goto err;
+ }
+
+ if (cpufreq_driver_is_slow()) {
+ cpufreq_driver_slow = true;
+ gd->task = kthread_create(cpufreq_sched_thread, policy,
+ "kschedfreq:%d",
+ cpumask_first(policy->related_cpus));
+ if (IS_ERR_OR_NULL(gd->task)) {
+ pr_err("%s: failed to create kschedfreq thread\n",
+ __func__);
+ goto err;
+ }
+ get_task_struct(gd->task);
+ kthread_bind_mask(gd->task, policy->related_cpus);
+ wake_up_process(gd->task);
+ init_irq_work(&gd->irq_work, cpufreq_sched_irq_work);
+ }
+
+ set_sched_freq();
+
+ return 0;
+
+err:
+ policy->governor_data = NULL;
+ kfree(gd);
+ return -ENOMEM;
+}
+
+static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy)
+{
+ struct gov_data *gd = policy->governor_data;
+
+ clear_sched_freq();
+ if (cpufreq_driver_slow) {
+ kthread_stop(gd->task);
+ put_task_struct(gd->task);
+ }
+
+ sysfs_remove_group(get_governor_parent_kobj(policy), get_sysfs_attr());
+
+ policy->governor_data = NULL;
+
+ kfree(gd);
+ return 0;
+}
+
+static int cpufreq_sched_start(struct cpufreq_policy *policy)
+{
+ int cpu;
+
+ for_each_cpu(cpu, policy->cpus)
+ per_cpu(enabled, cpu) = 1;
+
+ return 0;
+}
+
+static void cpufreq_sched_limits(struct cpufreq_policy *policy)
+{
+ unsigned int clamp_freq;
+ struct gov_data *gd = policy->governor_data;;
+
+ pr_debug("limit event for cpu %u: %u - %u kHz, currently %u kHz\n",
+ policy->cpu, policy->min, policy->max,
+ policy->cur);
+
+ clamp_freq = clamp(gd->requested_freq, policy->min, policy->max);
+
+ if (policy->cur != clamp_freq)
+ __cpufreq_driver_target(policy, clamp_freq, CPUFREQ_RELATION_L);
+}
+
+static int cpufreq_sched_stop(struct cpufreq_policy *policy)
+{
+ int cpu;
+
+ for_each_cpu(cpu, policy->cpus)
+ per_cpu(enabled, cpu) = 0;
+
+ return 0;
+}
+
+static int cpufreq_sched_setup(struct cpufreq_policy *policy,
+ unsigned int event)
+{
+ switch (event) {
+ case CPUFREQ_GOV_POLICY_INIT:
+ return cpufreq_sched_policy_init(policy);
+ case CPUFREQ_GOV_POLICY_EXIT:
+ return cpufreq_sched_policy_exit(policy);
+ case CPUFREQ_GOV_START:
+ return cpufreq_sched_start(policy);
+ case CPUFREQ_GOV_STOP:
+ return cpufreq_sched_stop(policy);
+ case CPUFREQ_GOV_LIMITS:
+ cpufreq_sched_limits(policy);
+ break;
+ }
+ return 0;
+}
+
+/* Tunables */
+static ssize_t show_up_throttle_nsec(struct gov_data *gd, char *buf)
+{
+ return sprintf(buf, "%u\n", gd->up_throttle_nsec);
+}
+
+static ssize_t store_up_throttle_nsec(struct gov_data *gd,
+ const char *buf, size_t count)
+{
+ int ret;
+ long unsigned int val;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret < 0)
+ return ret;
+ gd->up_throttle_nsec = val;
+ return count;
+}
+
+static ssize_t show_down_throttle_nsec(struct gov_data *gd, char *buf)
+{
+ return sprintf(buf, "%u\n", gd->down_throttle_nsec);
+}
+
+static ssize_t store_down_throttle_nsec(struct gov_data *gd,
+ const char *buf, size_t count)
+{
+ int ret;
+ long unsigned int val;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret < 0)
+ return ret;
+ gd->down_throttle_nsec = val;
+ return count;
+}
+
+/*
+ * Create show/store routines
+ * - sys: One governor instance for complete SYSTEM
+ * - pol: One governor instance per struct cpufreq_policy
+ */
+#define show_gov_pol_sys(file_name) \
+static ssize_t show_##file_name##_gov_pol \
+(struct cpufreq_policy *policy, char *buf) \
+{ \
+ return show_##file_name(policy->governor_data, buf); \
+}
+
+#define store_gov_pol_sys(file_name) \
+static ssize_t store_##file_name##_gov_pol \
+(struct cpufreq_policy *policy, const char *buf, size_t count) \
+{ \
+ return store_##file_name(policy->governor_data, buf, count); \
+}
+
+#define gov_pol_attr_rw(_name) \
+ static struct freq_attr _name##_gov_pol = \
+ __ATTR(_name, 0644, show_##_name##_gov_pol, store_##_name##_gov_pol)
+
+#define show_store_gov_pol_sys(file_name) \
+ show_gov_pol_sys(file_name); \
+ store_gov_pol_sys(file_name)
+#define tunable_handlers(file_name) \
+ show_gov_pol_sys(file_name); \
+ store_gov_pol_sys(file_name); \
+ gov_pol_attr_rw(file_name)
+
+tunable_handlers(down_throttle_nsec);
+tunable_handlers(up_throttle_nsec);
+
+/* Per policy governor instance */
+static struct attribute *sched_attributes_gov_pol[] = {
+ &up_throttle_nsec_gov_pol.attr,
+ &down_throttle_nsec_gov_pol.attr,
+ NULL,
+};
+
+static struct attribute_group sched_attr_group_gov_pol = {
+ .attrs = sched_attributes_gov_pol,
+ .name = "sched",
+};
+
+#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED
+static
+#endif
+struct cpufreq_governor cpufreq_gov_sched = {
+ .name = "sched",
+ .governor = cpufreq_sched_setup,
+ .owner = THIS_MODULE,
+};
+
+static int __init cpufreq_sched_init(void)
+{
+ int cpu;
+
+ for_each_cpu(cpu, cpu_possible_mask)
+ per_cpu(enabled, cpu) = 0;
+ return cpufreq_register_governor(&cpufreq_gov_sched);
+}
+
+/* Try to make this the default governor */
+fs_initcall(cpufreq_sched_init);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 87b8576cbd50..0688ab4ca094 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -1,3 +1,4 @@
+#include <linux/cpufreq.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/tsacct_kern.h>
@@ -5,6 +6,7 @@
#include <linux/static_key.h>
#include <linux/context_tracking.h>
#include "sched.h"
+#include "walt.h"
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -49,6 +51,10 @@ void irqtime_account_irq(struct task_struct *curr)
unsigned long flags;
s64 delta;
int cpu;
+#ifdef CONFIG_SCHED_WALT
+ u64 wallclock;
+ bool account = true;
+#endif
if (!sched_clock_irqtime)
return;
@@ -56,6 +62,9 @@ void irqtime_account_irq(struct task_struct *curr)
local_irq_save(flags);
cpu = smp_processor_id();
+#ifdef CONFIG_SCHED_WALT
+ wallclock = sched_clock_cpu(cpu);
+#endif
delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
__this_cpu_add(irq_start_time, delta);
@@ -70,8 +79,16 @@ void irqtime_account_irq(struct task_struct *curr)
__this_cpu_add(cpu_hardirq_time, delta);
else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
__this_cpu_add(cpu_softirq_time, delta);
+#ifdef CONFIG_SCHED_WALT
+ else
+ account = false;
+#endif
irq_time_write_end();
+#ifdef CONFIG_SCHED_WALT
+ if (account)
+ walt_account_irqtime(cpu, curr, delta, wallclock);
+#endif
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(irqtime_account_irq);
@@ -149,6 +166,11 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
/* Account for user time used */
acct_account_cputime(p);
+
+#ifdef CONFIG_CPU_FREQ_STAT
+ /* Account power usage for user time */
+ acct_update_power(p, cputime);
+#endif
}
/*
@@ -199,6 +221,11 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
/* Account for system time used */
acct_account_cputime(p);
+
+#ifdef CONFIG_CPU_FREQ_STAT
+ /* Account power usage for system time */
+ acct_update_power(p, cputime);
+#endif
}
/*
@@ -567,7 +594,7 @@ static void cputime_advance(cputime_t *counter, cputime_t new)
{
cputime_t old;
- while (new > (old = ACCESS_ONCE(*counter)))
+ while (new > (old = READ_ONCE(*counter)))
cmpxchg_cputime(counter, old, new);
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 40a97c3d8aba..a2d36e82ef00 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -43,6 +43,24 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se)
return !RB_EMPTY_NODE(&dl_se->rb_node);
}
+static void add_average_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ u64 se_bw = dl_se->dl_bw;
+
+ dl_rq->avg_bw += se_bw;
+}
+
+static void clear_average_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ u64 se_bw = dl_se->dl_bw;
+
+ dl_rq->avg_bw -= se_bw;
+ if (dl_rq->avg_bw < 0) {
+ WARN_ON(1);
+ dl_rq->avg_bw = 0;
+ }
+}
+
static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
{
struct sched_dl_entity *dl_se = &p->dl;
@@ -218,6 +236,57 @@ static inline void set_post_schedule(struct rq *rq)
rq->post_schedule = has_pushable_dl_tasks(rq);
}
+static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
+
+static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
+{
+ struct rq *later_rq = NULL;
+ bool fallback = false;
+
+ later_rq = find_lock_later_rq(p, rq);
+
+ if (!later_rq) {
+ int cpu;
+
+ /*
+ * If we cannot preempt any rq, fall back to pick any
+ * online cpu.
+ */
+ fallback = true;
+ cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p));
+ if (cpu >= nr_cpu_ids) {
+ /*
+ * Fail to find any suitable cpu.
+ * The task will never come back!
+ */
+ BUG_ON(dl_bandwidth_enabled());
+
+ /*
+ * If admission control is disabled we
+ * try a little harder to let the task
+ * run.
+ */
+ cpu = cpumask_any(cpu_active_mask);
+ }
+ later_rq = cpu_rq(cpu);
+ double_lock_balance(rq, later_rq);
+ }
+
+ /*
+ * By now the task is replenished and enqueued; migrate it.
+ */
+ deactivate_task(rq, p, 0);
+ set_task_cpu(p, later_rq->cpu);
+ activate_task(later_rq, p, 0);
+
+ if (!fallback)
+ resched_curr(later_rq);
+
+ double_unlock_balance(later_rq, rq);
+
+ return later_rq;
+}
+
#else
static inline
@@ -350,6 +419,11 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
dl_se->runtime = pi_se->dl_runtime;
}
+
+ if (dl_se->dl_yielded)
+ dl_se->dl_yielded = 0;
+ if (dl_se->dl_throttled)
+ dl_se->dl_throttled = 0;
}
/*
@@ -421,6 +495,9 @@ static void update_dl_entity(struct sched_dl_entity *dl_se,
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
+ if (dl_se->dl_new)
+ add_average_bw(dl_se, dl_rq);
+
/*
* The arrival of a new instance needs special treatment, i.e.,
* the actual scheduling parameters have to be "renewed".
@@ -447,24 +524,23 @@ static void update_dl_entity(struct sched_dl_entity *dl_se,
* actually started or not (i.e., the replenishment instant is in
* the future or in the past).
*/
-static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted)
+static int start_dl_timer(struct task_struct *p)
{
- struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
- struct rq *rq = rq_of_dl_rq(dl_rq);
+ struct sched_dl_entity *dl_se = &p->dl;
+ struct hrtimer *timer = &dl_se->dl_timer;
+ struct rq *rq = task_rq(p);
ktime_t now, act;
- ktime_t soft, hard;
- unsigned long range;
s64 delta;
- if (boosted)
- return 0;
+ lockdep_assert_held(&rq->lock);
+
/*
* We want the timer to fire at the deadline, but considering
* that it is actually coming from rq->clock and not from
* hrtimer's time base reading.
*/
act = ns_to_ktime(dl_se->deadline);
- now = hrtimer_cb_get_time(&dl_se->dl_timer);
+ now = hrtimer_cb_get_time(timer);
delta = ktime_to_ns(now) - rq_clock(rq);
act = ktime_add_ns(act, delta);
@@ -476,15 +552,21 @@ static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted)
if (ktime_us_delta(act, now) < 0)
return 0;
- hrtimer_set_expires(&dl_se->dl_timer, act);
-
- soft = hrtimer_get_softexpires(&dl_se->dl_timer);
- hard = hrtimer_get_expires(&dl_se->dl_timer);
- range = ktime_to_ns(ktime_sub(hard, soft));
- __hrtimer_start_range_ns(&dl_se->dl_timer, soft,
- range, HRTIMER_MODE_ABS, 0);
+ /*
+ * !enqueued will guarantee another callback; even if one is already in
+ * progress. This ensures a balanced {get,put}_task_struct().
+ *
+ * The race against __run_timer() clearing the enqueued state is
+ * harmless because we're holding task_rq()->lock, therefore the timer
+ * expiring after we've done the check will wait on its task_rq_lock()
+ * and observe our state.
+ */
+ if (!hrtimer_is_queued(timer)) {
+ get_task_struct(p);
+ hrtimer_start(timer, act, HRTIMER_MODE_ABS);
+ }
- return hrtimer_active(&dl_se->dl_timer);
+ return 1;
}
/*
@@ -518,44 +600,97 @@ again:
}
/*
- * We need to take care of several possible races here:
- *
- * - the task might have changed its scheduling policy
- * to something different than SCHED_DEADLINE
- * - the task might have changed its reservation parameters
- * (through sched_setattr())
- * - the task might have been boosted by someone else and
- * might be in the boosting/deboosting path
+ * The task might have changed its scheduling policy to something
+ * different than SCHED_DEADLINE (through switched_fromd_dl()).
+ */
+ if (!dl_task(p)) {
+ __dl_clear_params(p);
+ goto unlock;
+ }
+
+ /*
+ * This is possible if switched_from_dl() raced against a running
+ * callback that took the above !dl_task() path and we've since then
+ * switched back into SCHED_DEADLINE.
*
- * In all this cases we bail out, as the task is already
- * in the runqueue or is going to be enqueued back anyway.
+ * There's nothing to do except drop our task reference.
+ */
+ if (dl_se->dl_new)
+ goto unlock;
+
+ /*
+ * The task might have been boosted by someone else and might be in the
+ * boosting/deboosting path, its not throttled.
+ */
+ if (dl_se->dl_boosted)
+ goto unlock;
+
+ /*
+ * Spurious timer due to start_dl_timer() race; or we already received
+ * a replenishment from rt_mutex_setprio().
*/
- if (!dl_task(p) || dl_se->dl_new ||
- dl_se->dl_boosted || !dl_se->dl_throttled)
+ if (!dl_se->dl_throttled)
goto unlock;
sched_clock_tick();
update_rq_clock(rq);
- dl_se->dl_throttled = 0;
- dl_se->dl_yielded = 0;
- if (task_on_rq_queued(p)) {
- enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
- if (dl_task(rq->curr))
- check_preempt_curr_dl(rq, p, 0);
- else
- resched_curr(rq);
+
+ /*
+ * If the throttle happened during sched-out; like:
+ *
+ * schedule()
+ * deactivate_task()
+ * dequeue_task_dl()
+ * update_curr_dl()
+ * start_dl_timer()
+ * __dequeue_task_dl()
+ * prev->on_rq = 0;
+ *
+ * We can be both throttled and !queued. Replenish the counter
+ * but do not enqueue -- wait for our wakeup to do that.
+ */
+ if (!task_on_rq_queued(p)) {
+ replenish_dl_entity(dl_se, dl_se);
+ goto unlock;
+ }
+
+ enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
+ if (dl_task(rq->curr))
+ check_preempt_curr_dl(rq, p, 0);
+ else
+ resched_curr(rq);
+
#ifdef CONFIG_SMP
- /*
- * Queueing this task back might have overloaded rq,
- * check if we need to kick someone away.
- */
- if (has_pushable_dl_tasks(rq))
- push_dl_task(rq);
+ /*
+ * Perform balancing operations here; after the replenishments. We
+ * cannot drop rq->lock before this, otherwise the assertion in
+ * start_dl_timer() about not missing updates is not true.
+ *
+ * If we find that the rq the task was on is no longer available, we
+ * need to select a new rq.
+ *
+ * XXX figure out if select_task_rq_dl() deals with offline cpus.
+ */
+ if (unlikely(!rq->online))
+ rq = dl_task_offline_migration(rq, p);
+
+ /*
+ * Queueing this task back might have overloaded rq, check if we need
+ * to kick someone away.
+ */
+ if (has_pushable_dl_tasks(rq))
+ push_dl_task(rq);
#endif
- }
+
unlock:
raw_spin_unlock(&rq->lock);
+ /*
+ * This can free the task_struct, including this hrtimer, do not touch
+ * anything related to that after this.
+ */
+ put_task_struct(p);
+
return HRTIMER_NORESTART;
}
@@ -563,11 +698,6 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
{
struct hrtimer *timer = &dl_se->dl_timer;
- if (hrtimer_active(timer)) {
- hrtimer_try_to_cancel(timer);
- return;
- }
-
hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
timer->function = dl_task_timer;
}
@@ -614,14 +744,11 @@ static void update_curr_dl(struct rq *rq)
curr->se.exec_start = rq_clock_task(rq);
cpuacct_charge(curr, delta_exec);
- sched_rt_avg_update(rq, delta_exec);
-
- dl_se->runtime -= delta_exec;
+ dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
if (dl_runtime_exceeded(rq, dl_se)) {
+ dl_se->dl_throttled = 1;
__dequeue_task_dl(rq, curr, 0);
- if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted)))
- dl_se->dl_throttled = 1;
- else
+ if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
if (!is_leftmost(curr, &rq->dl))
@@ -858,7 +985,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
* its rq, the bandwidth timer callback (which clearly has not
* run yet) will take care of this.
*/
- if (p->dl.dl_throttled)
+ if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH))
return;
enqueue_dl_entity(&p->dl, pi_se, flags);
@@ -922,7 +1049,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
rq = cpu_rq(cpu);
rcu_read_lock();
- curr = ACCESS_ONCE(rq->curr); /* unlocked access */
+ curr = READ_ONCE(rq->curr); /* unlocked access */
/*
* If we are dealing with a -deadline task, we must
@@ -1092,17 +1219,19 @@ static void task_fork_dl(struct task_struct *p)
static void task_dead_dl(struct task_struct *p)
{
- struct hrtimer *timer = &p->dl.dl_timer;
struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+ struct dl_rq *dl_rq = dl_rq_of_se(&p->dl);
+ struct rq *rq = rq_of_dl_rq(dl_rq);
/*
* Since we are TASK_DEAD we won't slip out of the domain!
*/
raw_spin_lock_irq(&dl_b->lock);
+ /* XXX we should retain the bw until 0-lag */
dl_b->total_bw -= p->dl.dl_bw;
raw_spin_unlock_irq(&dl_b->lock);
- hrtimer_cancel(timer);
+ clear_average_bw(&p->dl, &rq->dl);
}
static void set_curr_task_dl(struct rq *rq)
@@ -1375,7 +1504,9 @@ retry:
}
deactivate_task(rq, next_task, 0);
+ clear_average_bw(&next_task->dl, &rq->dl);
set_task_cpu(next_task, later_rq->cpu);
+ add_average_bw(&next_task->dl, &later_rq->dl);
activate_task(later_rq, next_task, 0);
resched_curr(later_rq);
@@ -1461,7 +1592,9 @@ static int pull_dl_task(struct rq *this_rq)
ret = 1;
deactivate_task(src_rq, p, 0);
+ clear_average_bw(&p->dl, &src_rq->dl);
set_task_cpu(p, this_cpu);
+ add_average_bw(&p->dl, &this_rq->dl);
activate_task(this_rq, p, 0);
dmin = p->dl.deadline;
@@ -1571,20 +1704,27 @@ void init_sched_dl_class(void)
static void switched_from_dl(struct rq *rq, struct task_struct *p)
{
- if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy))
- hrtimer_try_to_cancel(&p->dl.dl_timer);
+ /*
+ * Start the deadline timer; if we switch back to dl before this we'll
+ * continue consuming our current CBS slice. If we stay outside of
+ * SCHED_DEADLINE until the deadline passes, the timer will reset the
+ * task.
+ */
+ if (!start_dl_timer(p))
+ __dl_clear_params(p);
- __dl_clear_params(p);
+ clear_average_bw(&p->dl, &rq->dl);
-#ifdef CONFIG_SMP
/*
* Since this might be the only -deadline task on the rq,
* this is the right place to try to pull some other one
* from an overloaded cpu, if any.
*/
- if (!rq->dl.dl_nr_running)
- pull_dl_task(rq);
-#endif
+ if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
+ return;
+
+ if (pull_dl_task(rq))
+ resched_curr(rq);
}
/*
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index ce33780d8f20..edb1a41c686a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -68,13 +68,8 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
#define PN(F) \
SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
- if (!se) {
- struct sched_avg *avg = &cpu_rq(cpu)->avg;
- P(avg->runnable_avg_sum);
- P(avg->runnable_avg_period);
+ if (!se)
return;
- }
-
PN(se->exec_start);
PN(se->vruntime);
@@ -93,10 +88,8 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
#endif
P(se->load.weight);
#ifdef CONFIG_SMP
- P(se->avg.runnable_avg_sum);
- P(se->avg.runnable_avg_period);
- P(se->avg.load_avg_contrib);
- P(se->avg.decay_count);
+ P(se->avg.load_avg);
+ P(se->avg.util_avg);
#endif
#undef PN
#undef P
@@ -210,19 +203,21 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_SMP
- SEQ_printf(m, " .%-30s: %ld\n", "runnable_load_avg",
+ SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
+ cfs_rq->avg.load_avg);
+ SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg",
cfs_rq->runnable_load_avg);
- SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg",
- cfs_rq->blocked_load_avg);
+ SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
+ cfs_rq->avg.util_avg);
+ SEQ_printf(m, " .%-30s: %ld\n", "removed_load_avg",
+ atomic_long_read(&cfs_rq->removed_load_avg));
+ SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg",
+ atomic_long_read(&cfs_rq->removed_util_avg));
#ifdef CONFIG_FAIR_GROUP_SCHED
- SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib",
- cfs_rq->tg_load_contrib);
- SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
- cfs_rq->tg_runnable_contrib);
+ SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib",
+ cfs_rq->tg_load_avg_contrib);
SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
atomic_long_read(&cfs_rq->tg->load_avg));
- SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg",
- atomic_read(&cfs_rq->tg->runnable_avg));
#endif
#endif
#ifdef CONFIG_CFS_BANDWIDTH
@@ -627,10 +622,11 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
P(se.load.weight);
#ifdef CONFIG_SMP
- P(se.avg.runnable_avg_sum);
- P(se.avg.runnable_avg_period);
- P(se.avg.load_avg_contrib);
- P(se.avg.decay_count);
+ P(se.avg.load_sum);
+ P(se.avg.util_sum);
+ P(se.avg.load_avg);
+ P(se.avg.util_avg);
+ P(se.avg.last_update_time);
#endif
P(policy);
P(prio);
diff --git a/kernel/sched/energy.c b/kernel/sched/energy.c
new file mode 100644
index 000000000000..b0656b7a93e3
--- /dev/null
+++ b/kernel/sched/energy.c
@@ -0,0 +1,124 @@
+/*
+ * Obtain energy cost data from DT and populate relevant scheduler data
+ * structures.
+ *
+ * Copyright (C) 2015 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#define pr_fmt(fmt) "sched-energy: " fmt
+
+#define DEBUG
+
+#include <linux/gfp.h>
+#include <linux/of.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/sched_energy.h>
+#include <linux/stddef.h>
+
+struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS];
+
+static void free_resources(void)
+{
+ int cpu, sd_level;
+ struct sched_group_energy *sge;
+
+ for_each_possible_cpu(cpu) {
+ for_each_possible_sd_level(sd_level) {
+ sge = sge_array[cpu][sd_level];
+ if (sge) {
+ kfree(sge->cap_states);
+ kfree(sge->idle_states);
+ kfree(sge);
+ }
+ }
+ }
+}
+
+void init_sched_energy_costs(void)
+{
+ struct device_node *cn, *cp;
+ struct capacity_state *cap_states;
+ struct idle_state *idle_states;
+ struct sched_group_energy *sge;
+ const struct property *prop;
+ int sd_level, i, nstates, cpu;
+ const __be32 *val;
+
+ for_each_possible_cpu(cpu) {
+ cn = of_get_cpu_node(cpu, NULL);
+ if (!cn) {
+ pr_warn("CPU device node missing for CPU %d\n", cpu);
+ return;
+ }
+
+ if (!of_find_property(cn, "sched-energy-costs", NULL)) {
+ pr_warn("CPU device node has no sched-energy-costs\n");
+ return;
+ }
+
+ for_each_possible_sd_level(sd_level) {
+ cp = of_parse_phandle(cn, "sched-energy-costs", sd_level);
+ if (!cp)
+ break;
+
+ prop = of_find_property(cp, "busy-cost-data", NULL);
+ if (!prop || !prop->value) {
+ pr_warn("No busy-cost data, skipping sched_energy init\n");
+ goto out;
+ }
+
+ sge = kcalloc(1, sizeof(struct sched_group_energy),
+ GFP_NOWAIT);
+
+ nstates = (prop->length / sizeof(u32)) / 2;
+ cap_states = kcalloc(nstates,
+ sizeof(struct capacity_state),
+ GFP_NOWAIT);
+
+ for (i = 0, val = prop->value; i < nstates; i++) {
+ cap_states[i].cap = be32_to_cpup(val++);
+ cap_states[i].power = be32_to_cpup(val++);
+ }
+
+ sge->nr_cap_states = nstates;
+ sge->cap_states = cap_states;
+
+ prop = of_find_property(cp, "idle-cost-data", NULL);
+ if (!prop || !prop->value) {
+ pr_warn("No idle-cost data, skipping sched_energy init\n");
+ goto out;
+ }
+
+ nstates = (prop->length / sizeof(u32));
+ idle_states = kcalloc(nstates,
+ sizeof(struct idle_state),
+ GFP_NOWAIT);
+
+ for (i = 0, val = prop->value; i < nstates; i++)
+ idle_states[i].power = be32_to_cpup(val++);
+
+ sge->nr_idle_states = nstates;
+ sge->idle_states = idle_states;
+
+ sge_array[cpu][sd_level] = sge;
+ }
+ }
+
+ pr_info("Sched-energy-costs installed from DT\n");
+ return;
+
+out:
+ free_resources();
+}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 07a75c150eeb..c76b4d5ee05e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -30,10 +30,13 @@
#include <linux/mempolicy.h>
#include <linux/migrate.h>
#include <linux/task_work.h>
+#include <linux/module.h>
#include <trace/events/sched.h>
#include "sched.h"
+#include "tune.h"
+#include "walt.h"
/*
* Targeted preemption latency for CPU-bound tasks:
@@ -50,6 +53,17 @@
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int normalized_sysctl_sched_latency = 6000000ULL;
+unsigned int sysctl_sched_is_big_little = 0;
+unsigned int sysctl_sched_sync_hint_enable = 1;
+unsigned int sysctl_sched_initial_task_util = 0;
+unsigned int sysctl_sched_cstate_aware = 1;
+
+#ifdef CONFIG_SCHED_WALT
+unsigned int sysctl_sched_use_walt_cpu_util = 1;
+unsigned int sysctl_sched_use_walt_task_util = 1;
+__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload =
+ (10 * NSEC_PER_MSEC);
+#endif
/*
* The initial- and re-scaling of tunables is configurable
* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
@@ -283,9 +297,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
return grp->my_q;
}
-static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
- int force_update);
-
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
if (!cfs_rq->on_list) {
@@ -305,8 +316,6 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
}
cfs_rq->on_list = 1;
- /* We should have no load, but we need to update last_decay. */
- update_cfs_rq_blocked_load(cfs_rq, 0);
}
}
@@ -616,15 +625,10 @@ static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
*/
static u64 __sched_period(unsigned long nr_running)
{
- u64 period = sysctl_sched_latency;
- unsigned long nr_latency = sched_nr_latency;
-
- if (unlikely(nr_running > nr_latency)) {
- period = sysctl_sched_min_granularity;
- period *= nr_running;
- }
-
- return period;
+ if (unlikely(nr_running > sched_nr_latency))
+ return nr_running * sysctl_sched_min_granularity;
+ else
+ return sysctl_sched_latency;
}
/*
@@ -669,21 +673,39 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
static int select_idle_sibling(struct task_struct *p, int cpu);
static unsigned long task_h_load(struct task_struct *p);
-static inline void __update_task_entity_contrib(struct sched_entity *se);
+/*
+ * We choose a half-life close to 1 scheduling period.
+ * Note: The tables below are dependent on this value.
+ */
+#define LOAD_AVG_PERIOD 32
+#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
+#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
-/* Give new task start runnable values to heavy its load in infant time */
-void init_task_runnable_average(struct task_struct *p)
+/* Give new sched_entity start runnable values to heavy its load in infant time */
+void init_entity_runnable_average(struct sched_entity *se)
{
- u32 slice;
+ struct sched_avg *sa = &se->avg;
- p->se.avg.decay_count = 0;
- slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
- p->se.avg.runnable_avg_sum = slice;
- p->se.avg.runnable_avg_period = slice;
- __update_task_entity_contrib(&p->se);
+ sa->last_update_time = 0;
+ /*
+ * sched_avg's period_contrib should be strictly less then 1024, so
+ * we give it 1023 to make sure it is almost a period (1024us), and
+ * will definitely be update (after enqueue).
+ */
+ sa->period_contrib = 1023;
+ sa->load_avg = scale_load_down(se->load.weight);
+ sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
+ sa->util_avg = sched_freq() ?
+ sysctl_sched_initial_task_util :
+ scale_load_down(SCHED_LOAD_SCALE);
+ sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+ /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
+
+static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
+static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
#else
-void init_task_runnable_average(struct task_struct *p)
+void init_entity_runnable_average(struct sched_entity *se)
{
}
#endif
@@ -833,7 +855,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
static unsigned int task_scan_min(struct task_struct *p)
{
- unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
+ unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
unsigned int scan, floor;
unsigned int windows = 1;
@@ -1570,8 +1592,8 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
delta = runtime - p->last_sum_exec_runtime;
*period = now - p->last_task_numa_placement;
} else {
- delta = p->se.avg.runnable_avg_sum;
- *period = p->se.avg.runnable_avg_period;
+ delta = p->se.avg.load_sum / p->se.load.weight;
+ *period = LOAD_AVG_MAX;
}
p->last_sum_exec_runtime = runtime;
@@ -1589,7 +1611,7 @@ static void task_numa_placement(struct task_struct *p)
u64 runtime, period;
spinlock_t *group_lock = NULL;
- seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+ seq = READ_ONCE(p->mm->numa_scan_seq);
if (p->numa_scan_seq == seq)
return;
p->numa_scan_seq = seq;
@@ -1724,7 +1746,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
}
rcu_read_lock();
- tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
+ tsk = READ_ONCE(cpu_rq(cpu)->curr);
if (!cpupid_match_pid(tsk, cpupid))
goto no_join;
@@ -1906,7 +1928,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
static void reset_ptenuma_scan(struct task_struct *p)
{
- ACCESS_ONCE(p->mm->numa_scan_seq)++;
+ WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
p->mm->numa_scan_offset = 0;
}
@@ -2122,12 +2144,12 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
long tg_weight;
/*
- * Use this CPU's actual weight instead of the last load_contribution
- * to gain a more accurate current total weight. See
- * update_cfs_rq_load_contribution().
+ * Use this CPU's real-time load instead of the last load contribution
+ * as the updating of the contribution is delayed, and we will use the
+ * the real-time load to calc the share. See update_tg_load_avg().
*/
tg_weight = atomic_long_read(&tg->load_avg);
- tg_weight -= cfs_rq->tg_load_contrib;
+ tg_weight -= cfs_rq->tg_load_avg_contrib;
tg_weight += cfs_rq->load.weight;
return tg_weight;
@@ -2200,14 +2222,6 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_SMP
-/*
- * We choose a half-life close to 1 scheduling period.
- * Note: The tables below are dependent on this value.
- */
-#define LOAD_AVG_PERIOD 32
-#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
-#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
-
/* Precomputed fixed inverse multiplies for multiplication by y^n */
static const u32 runnable_avg_yN_inv[] = {
0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
@@ -2256,9 +2270,8 @@ static __always_inline u64 decay_load(u64 val, u64 n)
local_n %= LOAD_AVG_PERIOD;
}
- val *= runnable_avg_yN_inv[local_n];
- /* We don't use SRR here since we always want to round down. */
- return val >> 32;
+ val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
+ return val;
}
/*
@@ -2289,6 +2302,12 @@ static u32 __compute_runnable_contrib(u64 n)
return contrib + runnable_avg_yN_sum[n];
}
+#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
+#error "load tracking assumes 2^10 as unit"
+#endif
+
+#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+
/*
* We can represent the historical contribution to runnable average as the
* coefficients of a geometric series. To do this we sub-divide our runnable
@@ -2317,21 +2336,22 @@ static u32 __compute_runnable_contrib(u64 n)
* load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
*/
-static __always_inline int __update_entity_runnable_avg(u64 now,
- struct sched_avg *sa,
- int runnable)
+static __always_inline int
+__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
+ unsigned long weight, int running, struct cfs_rq *cfs_rq)
{
- u64 delta, periods;
- u32 runnable_contrib;
- int delta_w, decayed = 0;
+ u64 delta, scaled_delta, periods;
+ u32 contrib;
+ unsigned int delta_w, scaled_delta_w, decayed = 0;
+ unsigned long scale_freq, scale_cpu;
- delta = now - sa->last_runnable_update;
+ delta = now - sa->last_update_time;
/*
* This should only happen when time goes backwards, which it
* unfortunately does during sched clock init when we swap over to TSC.
*/
if ((s64)delta < 0) {
- sa->last_runnable_update = now;
+ sa->last_update_time = now;
return 0;
}
@@ -2342,23 +2362,36 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
delta >>= 10;
if (!delta)
return 0;
- sa->last_runnable_update = now;
+ sa->last_update_time = now;
+
+ scale_freq = arch_scale_freq_capacity(NULL, cpu);
+ scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+ trace_sched_contrib_scale_f(cpu, scale_freq, scale_cpu);
/* delta_w is the amount already accumulated against our next period */
- delta_w = sa->runnable_avg_period % 1024;
+ delta_w = sa->period_contrib;
if (delta + delta_w >= 1024) {
- /* period roll-over */
decayed = 1;
+ /* how much left for next period will start over, we don't know yet */
+ sa->period_contrib = 0;
+
/*
* Now that we know we're crossing a period boundary, figure
* out how much from delta we need to complete the current
* period and accrue it.
*/
delta_w = 1024 - delta_w;
- if (runnable)
- sa->runnable_avg_sum += delta_w;
- sa->runnable_avg_period += delta_w;
+ scaled_delta_w = cap_scale(delta_w, scale_freq);
+ if (weight) {
+ sa->load_sum += weight * scaled_delta_w;
+ if (cfs_rq) {
+ cfs_rq->runnable_load_sum +=
+ weight * scaled_delta_w;
+ }
+ }
+ if (running)
+ sa->util_sum += scaled_delta_w * scale_cpu;
delta -= delta_w;
@@ -2366,299 +2399,298 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
periods = delta / 1024;
delta %= 1024;
- sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
- periods + 1);
- sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
- periods + 1);
+ sa->load_sum = decay_load(sa->load_sum, periods + 1);
+ if (cfs_rq) {
+ cfs_rq->runnable_load_sum =
+ decay_load(cfs_rq->runnable_load_sum, periods + 1);
+ }
+ sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
/* Efficiently calculate \sum (1..n_period) 1024*y^i */
- runnable_contrib = __compute_runnable_contrib(periods);
- if (runnable)
- sa->runnable_avg_sum += runnable_contrib;
- sa->runnable_avg_period += runnable_contrib;
+ contrib = __compute_runnable_contrib(periods);
+ contrib = cap_scale(contrib, scale_freq);
+ if (weight) {
+ sa->load_sum += weight * contrib;
+ if (cfs_rq)
+ cfs_rq->runnable_load_sum += weight * contrib;
+ }
+ if (running)
+ sa->util_sum += contrib * scale_cpu;
}
/* Remainder of delta accrued against u_0` */
- if (runnable)
- sa->runnable_avg_sum += delta;
- sa->runnable_avg_period += delta;
+ scaled_delta = cap_scale(delta, scale_freq);
+ if (weight) {
+ sa->load_sum += weight * scaled_delta;
+ if (cfs_rq)
+ cfs_rq->runnable_load_sum += weight * scaled_delta;
+ }
+ if (running)
+ sa->util_sum += scaled_delta * scale_cpu;
+
+ sa->period_contrib += delta;
+
+ if (decayed) {
+ sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+ if (cfs_rq) {
+ cfs_rq->runnable_load_avg =
+ div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
+ }
+ sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
+ }
return decayed;
}
-/* Synchronize an entity's decay with its parenting cfs_rq.*/
-static inline u64 __synchronize_entity_decay(struct sched_entity *se)
-{
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 decays = atomic64_read(&cfs_rq->decay_counter);
-
- decays -= se->avg.decay_count;
- if (!decays)
- return 0;
-
- se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
- se->avg.decay_count = 0;
-
- return decays;
-}
-
#ifdef CONFIG_FAIR_GROUP_SCHED
-static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
- int force_update)
+/*
+ * Updating tg's load_avg is necessary before update_cfs_share (which is done)
+ * and effective_load (which is not done because it is too costly).
+ */
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
{
- struct task_group *tg = cfs_rq->tg;
- long tg_contrib;
+ long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
- tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
- tg_contrib -= cfs_rq->tg_load_contrib;
-
- if (!tg_contrib)
+ /*
+ * No need to update load_avg for root_task_group as it is not used.
+ */
+ if (cfs_rq->tg == &root_task_group)
return;
- if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
- atomic_long_add(tg_contrib, &tg->load_avg);
- cfs_rq->tg_load_contrib += tg_contrib;
+ if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
+ atomic_long_add(delta, &cfs_rq->tg->load_avg);
+ cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
}
}
/*
- * Aggregate cfs_rq runnable averages into an equivalent task_group
- * representation for computing load contributions.
+ * Called within set_task_rq() right before setting a task's cpu. The
+ * caller only guarantees p->pi_lock is held; no other assumptions,
+ * including the state of rq->lock, should be made.
*/
-static inline void __update_tg_runnable_avg(struct sched_avg *sa,
- struct cfs_rq *cfs_rq)
+void set_task_rq_fair(struct sched_entity *se,
+ struct cfs_rq *prev, struct cfs_rq *next)
{
- struct task_group *tg = cfs_rq->tg;
- long contrib;
+ if (!sched_feat(ATTACH_AGE_LOAD))
+ return;
- /* The fraction of a cpu used by this cfs_rq */
- contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
- sa->runnable_avg_period + 1);
- contrib -= cfs_rq->tg_runnable_contrib;
+ /*
+ * We are supposed to update the task to "current" time, then its up to
+ * date and ready to go to new CPU/cfs_rq. But we have difficulty in
+ * getting what current time is, so simply throw away the out-of-date
+ * time. This will result in the wakee task is less decayed, but giving
+ * the wakee more load sounds not bad.
+ */
+ if (se->avg.last_update_time && prev) {
+ u64 p_last_update_time;
+ u64 n_last_update_time;
- if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
- atomic_add(contrib, &tg->runnable_avg);
- cfs_rq->tg_runnable_contrib += contrib;
- }
-}
+#ifndef CONFIG_64BIT
+ u64 p_last_update_time_copy;
+ u64 n_last_update_time_copy;
-static inline void __update_group_entity_contrib(struct sched_entity *se)
-{
- struct cfs_rq *cfs_rq = group_cfs_rq(se);
- struct task_group *tg = cfs_rq->tg;
- int runnable_avg;
+ do {
+ p_last_update_time_copy = prev->load_last_update_time_copy;
+ n_last_update_time_copy = next->load_last_update_time_copy;
- u64 contrib;
+ smp_rmb();
- contrib = cfs_rq->tg_load_contrib * tg->shares;
- se->avg.load_avg_contrib = div_u64(contrib,
- atomic_long_read(&tg->load_avg) + 1);
+ p_last_update_time = prev->avg.last_update_time;
+ n_last_update_time = next->avg.last_update_time;
- /*
- * For group entities we need to compute a correction term in the case
- * that they are consuming <1 cpu so that we would contribute the same
- * load as a task of equal weight.
- *
- * Explicitly co-ordinating this measurement would be expensive, but
- * fortunately the sum of each cpus contribution forms a usable
- * lower-bound on the true value.
- *
- * Consider the aggregate of 2 contributions. Either they are disjoint
- * (and the sum represents true value) or they are disjoint and we are
- * understating by the aggregate of their overlap.
- *
- * Extending this to N cpus, for a given overlap, the maximum amount we
- * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
- * cpus that overlap for this interval and w_i is the interval width.
- *
- * On a small machine; the first term is well-bounded which bounds the
- * total error since w_i is a subset of the period. Whereas on a
- * larger machine, while this first term can be larger, if w_i is the
- * of consequential size guaranteed to see n_i*w_i quickly converge to
- * our upper bound of 1-cpu.
- */
- runnable_avg = atomic_read(&tg->runnable_avg);
- if (runnable_avg < NICE_0_LOAD) {
- se->avg.load_avg_contrib *= runnable_avg;
- se->avg.load_avg_contrib >>= NICE_0_SHIFT;
+ } while (p_last_update_time != p_last_update_time_copy ||
+ n_last_update_time != n_last_update_time_copy);
+#else
+ p_last_update_time = prev->avg.last_update_time;
+ n_last_update_time = next->avg.last_update_time;
+#endif
+ __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
+ &se->avg, 0, 0, NULL);
+ se->avg.last_update_time = n_last_update_time;
}
}
-
-static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
-{
- __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
- __update_tg_runnable_avg(&rq->avg, &rq->cfs);
-}
#else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
- int force_update) {}
-static inline void __update_tg_runnable_avg(struct sched_avg *sa,
- struct cfs_rq *cfs_rq) {}
-static inline void __update_group_entity_contrib(struct sched_entity *se) {}
-static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
#endif /* CONFIG_FAIR_GROUP_SCHED */
-static inline void __update_task_entity_contrib(struct sched_entity *se)
-{
- u32 contrib;
-
- /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
- contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
- contrib /= (se->avg.runnable_avg_period + 1);
- se->avg.load_avg_contrib = scale_load(contrib);
-}
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
-/* Compute the current contribution to load_avg by se, return any delta */
-static long __update_entity_load_avg_contrib(struct sched_entity *se)
+/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
+static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
- long old_contrib = se->avg.load_avg_contrib;
+ struct sched_avg *sa = &cfs_rq->avg;
+ int decayed, removed = 0;
- if (entity_is_task(se)) {
- __update_task_entity_contrib(se);
- } else {
- __update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
- __update_group_entity_contrib(se);
+ if (atomic_long_read(&cfs_rq->removed_load_avg)) {
+ long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
+ sa->load_avg = max_t(long, sa->load_avg - r, 0);
+ sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
+ removed = 1;
}
- return se->avg.load_avg_contrib - old_contrib;
-}
+ if (atomic_long_read(&cfs_rq->removed_util_avg)) {
+ long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
+ sa->util_avg = max_t(long, sa->util_avg - r, 0);
+ sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
+ }
-static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
- long load_contrib)
-{
- if (likely(load_contrib < cfs_rq->blocked_load_avg))
- cfs_rq->blocked_load_avg -= load_contrib;
- else
- cfs_rq->blocked_load_avg = 0;
-}
+ decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
+ scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+#ifndef CONFIG_64BIT
+ smp_wmb();
+ cfs_rq->load_last_update_time_copy = sa->last_update_time;
+#endif
+
+ /* Trace CPU load, unless cfs_rq belongs to a non-root task_group */
+ if (cfs_rq == &rq_of(cfs_rq)->cfs)
+ trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
+
+ return decayed || removed;
+}
-/* Update a sched_entity's runnable average */
-static inline void update_entity_load_avg(struct sched_entity *se,
- int update_cfs_rq)
+/* Update task and its cfs_rq load average */
+static inline void update_load_avg(struct sched_entity *se, int update_tg)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- long contrib_delta;
- u64 now;
+ u64 now = cfs_rq_clock_task(cfs_rq);
+ int cpu = cpu_of(rq_of(cfs_rq));
/*
- * For a group entity we need to use their owned cfs_rq_clock_task() in
- * case they are the parent of a throttled hierarchy.
+ * Track task load average for carrying it to new CPU after migrated, and
+ * track group sched_entity load average for task_h_load calc in migration
*/
+ __update_load_avg(now, cpu, &se->avg,
+ se->on_rq * scale_load_down(se->load.weight),
+ cfs_rq->curr == se, NULL);
+
+ if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
+ update_tg_load_avg(cfs_rq, 0);
+
if (entity_is_task(se))
- now = cfs_rq_clock_task(cfs_rq);
- else
- now = cfs_rq_clock_task(group_cfs_rq(se));
+ trace_sched_load_avg_task(task_of(se), &se->avg);
+}
- if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
- return;
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ if (!sched_feat(ATTACH_AGE_LOAD))
+ goto skip_aging;
- contrib_delta = __update_entity_load_avg_contrib(se);
+ /*
+ * If we got migrated (either between CPUs or between cgroups) we'll
+ * have aged the average right before clearing @last_update_time.
+ */
+ if (se->avg.last_update_time) {
+ __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
+ &se->avg, 0, 0, NULL);
- if (!update_cfs_rq)
- return;
+ /*
+ * XXX: we could have just aged the entire load away if we've been
+ * absent from the fair class for too long.
+ */
+ }
- if (se->on_rq)
- cfs_rq->runnable_load_avg += contrib_delta;
- else
- subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
+skip_aging:
+ se->avg.last_update_time = cfs_rq->avg.last_update_time;
+ cfs_rq->avg.load_avg += se->avg.load_avg;
+ cfs_rq->avg.load_sum += se->avg.load_sum;
+ cfs_rq->avg.util_avg += se->avg.util_avg;
+ cfs_rq->avg.util_sum += se->avg.util_sum;
}
-/*
- * Decay the load contributed by all blocked children and account this so that
- * their contribution may appropriately discounted when they wake up.
- */
-static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
+static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
- u64 decays;
+ __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
+ &se->avg, se->on_rq * scale_load_down(se->load.weight),
+ cfs_rq->curr == se, NULL);
- decays = now - cfs_rq->last_decay;
- if (!decays && !force_update)
- return;
+ cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
+ cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
+ cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
+ cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
+}
- if (atomic_long_read(&cfs_rq->removed_load)) {
- unsigned long removed_load;
- removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);
- subtract_blocked_load_contrib(cfs_rq, removed_load);
- }
+/* Add the load generated by se into cfs_rq's load average */
+static inline void
+enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct sched_avg *sa = &se->avg;
+ u64 now = cfs_rq_clock_task(cfs_rq);
+ int migrated, decayed;
- if (decays) {
- cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
- decays);
- atomic64_add(decays, &cfs_rq->decay_counter);
- cfs_rq->last_decay = now;
+ migrated = !sa->last_update_time;
+ if (!migrated) {
+ __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
+ se->on_rq * scale_load_down(se->load.weight),
+ cfs_rq->curr == se, NULL);
}
- __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
+ decayed = update_cfs_rq_load_avg(now, cfs_rq);
+
+ cfs_rq->runnable_load_avg += sa->load_avg;
+ cfs_rq->runnable_load_sum += sa->load_sum;
+
+ if (migrated)
+ attach_entity_load_avg(cfs_rq, se);
+
+ if (decayed || migrated)
+ update_tg_load_avg(cfs_rq, 0);
}
-/* Add the load generated by se into cfs_rq's child load-average */
-static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
- struct sched_entity *se,
- int wakeup)
+/* Remove the runnable load generated by se from cfs_rq's runnable load average */
+static inline void
+dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- /*
- * We track migrations using entity decay_count <= 0, on a wake-up
- * migration we use a negative decay count to track the remote decays
- * accumulated while sleeping.
- *
- * Newly forked tasks are enqueued with se->avg.decay_count == 0, they
- * are seen by enqueue_entity_load_avg() as a migration with an already
- * constructed load_avg_contrib.
- */
- if (unlikely(se->avg.decay_count <= 0)) {
- se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));
- if (se->avg.decay_count) {
- /*
- * In a wake-up migration we have to approximate the
- * time sleeping. This is because we can't synchronize
- * clock_task between the two cpus, and it is not
- * guaranteed to be read-safe. Instead, we can
- * approximate this using our carried decays, which are
- * explicitly atomically readable.
- */
- se->avg.last_runnable_update -= (-se->avg.decay_count)
- << 20;
- update_entity_load_avg(se, 0);
- /* Indicate that we're now synchronized and on-rq */
- se->avg.decay_count = 0;
- }
- wakeup = 0;
- } else {
- __synchronize_entity_decay(se);
- }
+ update_load_avg(se, 1);
- /* migrated tasks did not contribute to our blocked load */
- if (wakeup) {
- subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
- update_entity_load_avg(se, 0);
- }
+ cfs_rq->runnable_load_avg =
+ max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
+ cfs_rq->runnable_load_sum =
+ max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
+}
+
+#ifndef CONFIG_64BIT
+static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
+{
+ u64 last_update_time_copy;
+ u64 last_update_time;
- cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
- /* we force update consideration on load-balancer moves */
- update_cfs_rq_blocked_load(cfs_rq, !wakeup);
+ do {
+ last_update_time_copy = cfs_rq->load_last_update_time_copy;
+ smp_rmb();
+ last_update_time = cfs_rq->avg.last_update_time;
+ } while (last_update_time != last_update_time_copy);
+
+ return last_update_time;
}
+#else
+static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq->avg.last_update_time;
+}
+#endif
/*
- * Remove se's load from this cfs_rq child load-average, if the entity is
- * transitioning to a blocked state we track its projected decay using
- * blocked_load_avg.
+ * Task first catches up with cfs_rq, and then subtract
+ * itself from the cfs_rq (task must be off the queue now).
*/
-static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
- struct sched_entity *se,
- int sleep)
+void remove_entity_load_avg(struct sched_entity *se)
{
- update_entity_load_avg(se, 1);
- /* we force update consideration on load-balancer moves */
- update_cfs_rq_blocked_load(cfs_rq, !sleep);
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 last_update_time;
- cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
- if (sleep) {
- cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
- se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
- } /* migrations, e.g. sleep=0 leave decay_count == 0 */
+ /*
+ * Newly created task or never used group entity should not be removed
+ * from its (source) cfs_rq
+ */
+ if (se->avg.last_update_time == 0)
+ return;
+
+ last_update_time = cfs_rq_last_update_time(cfs_rq);
+
+ __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+ atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
+ atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
}
/*
@@ -2668,7 +2700,6 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
*/
void idle_enter_fair(struct rq *this_rq)
{
- update_rq_runnable_avg(this_rq, 1);
}
/*
@@ -2678,24 +2709,33 @@ void idle_enter_fair(struct rq *this_rq)
*/
void idle_exit_fair(struct rq *this_rq)
{
- update_rq_runnable_avg(this_rq, 0);
+}
+
+static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq->runnable_load_avg;
+}
+
+static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq->avg.load_avg;
}
static int idle_balance(struct rq *this_rq);
#else /* CONFIG_SMP */
-static inline void update_entity_load_avg(struct sched_entity *se,
- int update_cfs_rq) {}
-static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
-static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
- struct sched_entity *se,
- int wakeup) {}
-static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
- struct sched_entity *se,
- int sleep) {}
-static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
- int force_update) {}
+static inline void update_load_avg(struct sched_entity *se, int update_tg) {}
+static inline void
+enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+static inline void
+dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+static inline void remove_entity_load_avg(struct sched_entity *se) {}
+
+static inline void
+attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+static inline void
+detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline int idle_balance(struct rq *rq)
{
@@ -2749,6 +2789,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
trace_sched_stat_blocked(tsk, delta);
+ trace_sched_blocked_reason(tsk);
/*
* Blocking time is in units of nanosecs, so shift by
@@ -2827,7 +2868,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
- enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
+ enqueue_entity_load_avg(cfs_rq, se);
account_entity_enqueue(cfs_rq, se);
update_cfs_shares(cfs_rq);
@@ -2902,7 +2943,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
- dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
+ dequeue_entity_load_avg(cfs_rq, se);
update_stats_dequeue(cfs_rq, se);
if (flags & DEQUEUE_SLEEP) {
@@ -2992,6 +3033,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
+ update_load_avg(se, 1);
}
update_stats_curr_start(cfs_rq, se);
@@ -3091,7 +3133,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */
- update_entity_load_avg(prev, 1);
+ update_load_avg(prev, 0);
}
cfs_rq->curr = NULL;
}
@@ -3107,8 +3149,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
/*
* Ensure that runnable average is periodically updated.
*/
- update_entity_load_avg(curr, 1);
- update_cfs_rq_blocked_load(cfs_rq, 1);
+ update_load_avg(curr, 1);
update_cfs_shares(cfs_rq);
#ifdef CONFIG_SCHED_HRTICK
@@ -3960,6 +4001,28 @@ static inline void hrtick_update(struct rq *rq)
}
#endif
+#ifdef CONFIG_SMP
+static bool cpu_overutilized(int cpu);
+static inline unsigned long boosted_cpu_util(int cpu);
+#else
+#define boosted_cpu_util(cpu) cpu_util(cpu)
+#endif
+
+#ifdef CONFIG_SMP
+static void update_capacity_of(int cpu)
+{
+ unsigned long req_cap;
+
+ if (!sched_freq())
+ return;
+
+ /* Convert scale-invariant capacity to cpu. */
+ req_cap = boosted_cpu_util(cpu);
+ req_cap = req_cap * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu);
+ set_cfs_cpu_capacity(cpu, true, req_cap);
+}
+#endif
+
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
@@ -3970,6 +4033,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
+#ifdef CONFIG_SMP
+ int task_new = flags & ENQUEUE_WAKEUP_NEW;
+ int task_wakeup = flags & ENQUEUE_WAKEUP;
+#endif
for_each_sched_entity(se) {
if (se->on_rq)
@@ -3986,6 +4053,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running++;
+ walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
flags = ENQUEUE_WAKEUP;
}
@@ -3993,18 +4061,59 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running++;
+ walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
if (cfs_rq_throttled(cfs_rq))
break;
+ update_load_avg(se, 1);
update_cfs_shares(cfs_rq);
- update_entity_load_avg(se, 1);
}
- if (!se) {
- update_rq_runnable_avg(rq, rq->nr_running);
+ if (!se)
add_nr_running(rq, 1);
+
+#ifdef CONFIG_SMP
+
+ /*
+ * Update SchedTune accounting.
+ *
+ * We do it before updating the CPU capacity to ensure the
+ * boost value of the current task is accounted for in the
+ * selection of the OPP.
+ *
+ * We do it also in the case where we enqueue a throttled task;
+ * we could argue that a throttled task should not boost a CPU,
+ * however:
+ * a) properly implementing CPU boosting considering throttled
+ * tasks will increase a lot the complexity of the solution
+ * b) it's not easy to quantify the benefits introduced by
+ * such a more complex solution.
+ * Thus, for the time being we go for the simple solution and boost
+ * also for throttled RQs.
+ */
+ schedtune_enqueue_task(p, cpu_of(rq));
+
+ if (!se) {
+ walt_inc_cumulative_runnable_avg(rq, p);
+ if (!task_new && !rq->rd->overutilized &&
+ cpu_overutilized(rq->cpu)) {
+ rq->rd->overutilized = true;
+ trace_sched_overutilized(true);
+ }
+
+ /*
+ * We want to potentially trigger a freq switch
+ * request only for tasks that are waking up; this is
+ * because we get here also during load balancing, but
+ * in these cases it seems wise to trigger as single
+ * request after load balancing is done.
+ */
+ if (task_new || task_wakeup)
+ update_capacity_of(cpu_of(rq));
}
+
+#endif /* CONFIG_SMP */
hrtick_update(rq);
}
@@ -4034,6 +4143,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running--;
+ walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
@@ -4054,26 +4164,236 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running--;
+ walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
if (cfs_rq_throttled(cfs_rq))
break;
+ update_load_avg(se, 1);
update_cfs_shares(cfs_rq);
- update_entity_load_avg(se, 1);
}
- if (!se) {
+ if (!se)
sub_nr_running(rq, 1);
- update_rq_runnable_avg(rq, 1);
+
+#ifdef CONFIG_SMP
+
+ /*
+ * Update SchedTune accounting
+ *
+ * We do it before updating the CPU capacity to ensure the
+ * boost value of the current task is accounted for in the
+ * selection of the OPP.
+ */
+ schedtune_dequeue_task(p, cpu_of(rq));
+
+ if (!se) {
+ walt_dec_cumulative_runnable_avg(rq, p);
+
+ /*
+ * We want to potentially trigger a freq switch
+ * request only for tasks that are going to sleep;
+ * this is because we get here also during load
+ * balancing, but in these cases it seems wise to
+ * trigger as single request after load balancing is
+ * done.
+ */
+ if (task_sleep) {
+ if (rq->cfs.nr_running)
+ update_capacity_of(cpu_of(rq));
+ else if (sched_freq())
+ set_cfs_cpu_capacity(cpu_of(rq), false, 0);
+ }
}
+
+#endif /* CONFIG_SMP */
+
hrtick_update(rq);
}
#ifdef CONFIG_SMP
+
+/*
+ * per rq 'load' arrray crap; XXX kill this.
+ */
+
+/*
+ * The exact cpuload calculated at every tick would be:
+ *
+ * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
+ *
+ * If a cpu misses updates for n ticks (as it was idle) and update gets
+ * called on the n+1-th tick when cpu may be busy, then we have:
+ *
+ * load_n = (1 - 1/2^i)^n * load_0
+ * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
+ *
+ * decay_load_missed() below does efficient calculation of
+ *
+ * load' = (1 - 1/2^i)^n * load
+ *
+ * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
+ * This allows us to precompute the above in said factors, thereby allowing the
+ * reduction of an arbitrary n in O(log_2 n) steps. (See also
+ * fixed_power_int())
+ *
+ * The calculation is approximated on a 128 point scale.
+ */
+#define DEGRADE_SHIFT 7
+
+static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
+static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
+ { 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 64, 32, 8, 0, 0, 0, 0, 0 },
+ { 96, 72, 40, 12, 1, 0, 0, 0 },
+ { 112, 98, 75, 43, 15, 1, 0, 0 },
+ { 120, 112, 98, 76, 45, 16, 2, 0 }
+};
+
+/*
+ * Update cpu_load for any missed ticks, due to tickless idle. The backlog
+ * would be when CPU is idle and so we just decay the old load without
+ * adding any new load.
+ */
+static unsigned long
+decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
+{
+ int j = 0;
+
+ if (!missed_updates)
+ return load;
+
+ if (missed_updates >= degrade_zero_ticks[idx])
+ return 0;
+
+ if (idx == 1)
+ return load >> missed_updates;
+
+ while (missed_updates) {
+ if (missed_updates % 2)
+ load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
+
+ missed_updates >>= 1;
+ j++;
+ }
+ return load;
+}
+
+/*
+ * Update rq->cpu_load[] statistics. This function is usually called every
+ * scheduler tick (TICK_NSEC). With tickless idle this will not be called
+ * every tick. We fix it up based on jiffies.
+ */
+static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
+ unsigned long pending_updates)
+{
+ int i, scale;
+
+ this_rq->nr_load_updates++;
+
+ /* Update our load: */
+ this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
+ for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+ unsigned long old_load, new_load;
+
+ /* scale is effectively 1 << i now, and >> i divides by scale */
+
+ old_load = this_rq->cpu_load[i];
+ old_load = decay_load_missed(old_load, pending_updates - 1, i);
+ new_load = this_load;
+ /*
+ * Round up the averaging division if load is increasing. This
+ * prevents us from getting stuck on 9 if the load is 10, for
+ * example.
+ */
+ if (new_load > old_load)
+ new_load += scale - 1;
+
+ this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
+ }
+
+ sched_avg_update(this_rq);
+}
+
/* Used instead of source_load when we know the type == 0 */
static unsigned long weighted_cpuload(const int cpu)
{
- return cpu_rq(cpu)->cfs.runnable_load_avg;
+ return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
+}
+
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * There is no sane way to deal with nohz on smp when using jiffies because the
+ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
+ *
+ * Therefore we cannot use the delta approach from the regular tick since that
+ * would seriously skew the load calculation. However we'll make do for those
+ * updates happening while idle (nohz_idle_balance) or coming out of idle
+ * (tick_nohz_idle_exit).
+ *
+ * This means we might still be one tick off for nohz periods.
+ */
+
+/*
+ * Called from nohz_idle_balance() to update the load ratings before doing the
+ * idle balance.
+ */
+static void update_idle_cpu_load(struct rq *this_rq)
+{
+ unsigned long curr_jiffies = READ_ONCE(jiffies);
+ unsigned long load = weighted_cpuload(cpu_of(this_rq));
+ unsigned long pending_updates;
+
+ /*
+ * bail if there's load or we're actually up-to-date.
+ */
+ if (load || curr_jiffies == this_rq->last_load_update_tick)
+ return;
+
+ pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+ this_rq->last_load_update_tick = curr_jiffies;
+
+ __update_cpu_load(this_rq, load, pending_updates);
+}
+
+/*
+ * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
+ */
+void update_cpu_load_nohz(void)
+{
+ struct rq *this_rq = this_rq();
+ unsigned long curr_jiffies = READ_ONCE(jiffies);
+ unsigned long pending_updates;
+
+ if (curr_jiffies == this_rq->last_load_update_tick)
+ return;
+
+ raw_spin_lock(&this_rq->lock);
+ pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+ if (pending_updates) {
+ this_rq->last_load_update_tick = curr_jiffies;
+ /*
+ * We were idle, this means load 0, the current load might be
+ * !0 due to remote wakeups and the sort.
+ */
+ __update_cpu_load(this_rq, 0, pending_updates);
+ }
+ raw_spin_unlock(&this_rq->lock);
+}
+#endif /* CONFIG_NO_HZ */
+
+/*
+ * Called from scheduler_tick()
+ */
+void update_cpu_load_active(struct rq *this_rq)
+{
+ unsigned long load = weighted_cpuload(cpu_of(this_rq));
+ /*
+ * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
+ */
+ this_rq->last_load_update_tick = jiffies;
+ __update_cpu_load(this_rq, load, 1);
}
/*
@@ -4109,16 +4429,12 @@ static unsigned long target_load(int cpu, int type)
return max(rq->cpu_load[type-1], total);
}
-static unsigned long capacity_of(int cpu)
-{
- return cpu_rq(cpu)->cpu_capacity;
-}
static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running);
- unsigned long load_avg = rq->cfs.runnable_load_avg;
+ unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
+ unsigned long load_avg = weighted_cpuload(cpu);
if (nr_running)
return load_avg / nr_running;
@@ -4237,7 +4553,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
/*
* w = rw_i + @wl
*/
- w = se->my_q->load.weight + wl;
+ w = cfs_rq_load_avg(se->my_q) + wl;
/*
* wl = S * s'_i; see (2)
@@ -4258,7 +4574,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
/*
* wl = dw_i = S * (s'_i - s_i); see (3)
*/
- wl -= se->load.weight;
+ wl -= se->avg.load_avg;
/*
* Recursively apply this logic to all parent groups to compute
@@ -4281,28 +4597,417 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
#endif
-static int wake_wide(struct task_struct *p)
+/*
+ * Returns the current capacity of cpu after applying both
+ * cpu and freq scaling.
+ */
+unsigned long capacity_curr_of(int cpu)
{
- int factor = this_cpu_read(sd_llc_size);
+ return cpu_rq(cpu)->cpu_capacity_orig *
+ arch_scale_freq_capacity(NULL, cpu)
+ >> SCHED_CAPACITY_SHIFT;
+}
+
+static inline bool energy_aware(void)
+{
+ return sched_feat(ENERGY_AWARE);
+}
+
+struct energy_env {
+ struct sched_group *sg_top;
+ struct sched_group *sg_cap;
+ int cap_idx;
+ int util_delta;
+ int src_cpu;
+ int dst_cpu;
+ int energy;
+ int payoff;
+ struct task_struct *task;
+ struct {
+ int before;
+ int after;
+ int delta;
+ int diff;
+ } nrg;
+ struct {
+ int before;
+ int after;
+ int delta;
+ } cap;
+};
+
+/*
+ * __cpu_norm_util() returns the cpu util relative to a specific capacity,
+ * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE] which is useful for
+ * energy calculations. Using the scale-invariant util returned by
+ * cpu_util() and approximating scale-invariant util by:
+ *
+ * util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time
+ *
+ * the normalized util can be found using the specific capacity.
+ *
+ * capacity = capacity_orig * curr_freq/max_freq
+ *
+ * norm_util = running_time/time ~ util/capacity
+ */
+static unsigned long __cpu_norm_util(int cpu, unsigned long capacity, int delta)
+{
+ int util = __cpu_util(cpu, delta);
+
+ if (util >= capacity)
+ return SCHED_CAPACITY_SCALE;
+
+ return (util << SCHED_CAPACITY_SHIFT)/capacity;
+}
+
+static int calc_util_delta(struct energy_env *eenv, int cpu)
+{
+ if (cpu == eenv->src_cpu)
+ return -eenv->util_delta;
+ if (cpu == eenv->dst_cpu)
+ return eenv->util_delta;
+ return 0;
+}
+
+static
+unsigned long group_max_util(struct energy_env *eenv)
+{
+ int i, delta;
+ unsigned long max_util = 0;
+
+ for_each_cpu(i, sched_group_cpus(eenv->sg_cap)) {
+ delta = calc_util_delta(eenv, i);
+ max_util = max(max_util, __cpu_util(i, delta));
+ }
+
+ return max_util;
+}
+
+/*
+ * group_norm_util() returns the approximated group util relative to it's
+ * current capacity (busy ratio) in the range [0..SCHED_LOAD_SCALE] for use in
+ * energy calculations. Since task executions may or may not overlap in time in
+ * the group the true normalized util is between max(cpu_norm_util(i)) and
+ * sum(cpu_norm_util(i)) when iterating over all cpus in the group, i. The
+ * latter is used as the estimate as it leads to a more pessimistic energy
+ * estimate (more busy).
+ */
+static unsigned
+long group_norm_util(struct energy_env *eenv, struct sched_group *sg)
+{
+ int i, delta;
+ unsigned long util_sum = 0;
+ unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap;
+
+ for_each_cpu(i, sched_group_cpus(sg)) {
+ delta = calc_util_delta(eenv, i);
+ util_sum += __cpu_norm_util(i, capacity, delta);
+ }
+
+ if (util_sum > SCHED_CAPACITY_SCALE)
+ return SCHED_CAPACITY_SCALE;
+ return util_sum;
+}
+
+static int find_new_capacity(struct energy_env *eenv,
+ const struct sched_group_energy * const sge)
+{
+ int idx;
+ unsigned long util = group_max_util(eenv);
+
+ for (idx = 0; idx < sge->nr_cap_states; idx++) {
+ if (sge->cap_states[idx].cap >= util)
+ break;
+ }
+
+ eenv->cap_idx = idx;
+
+ return idx;
+}
+
+static int group_idle_state(struct sched_group *sg)
+{
+ int i, state = INT_MAX;
+
+ /* Find the shallowest idle state in the sched group. */
+ for_each_cpu(i, sched_group_cpus(sg))
+ state = min(state, idle_get_state_idx(cpu_rq(i)));
+
+ /* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */
+ state++;
+
+ return state;
+}
+
+/*
+ * sched_group_energy(): Computes the absolute energy consumption of cpus
+ * belonging to the sched_group including shared resources shared only by
+ * members of the group. Iterates over all cpus in the hierarchy below the
+ * sched_group starting from the bottom working it's way up before going to
+ * the next cpu until all cpus are covered at all levels. The current
+ * implementation is likely to gather the same util statistics multiple times.
+ * This can probably be done in a faster but more complex way.
+ * Note: sched_group_energy() may fail when racing with sched_domain updates.
+ */
+static int sched_group_energy(struct energy_env *eenv)
+{
+ struct sched_domain *sd;
+ int cpu, total_energy = 0;
+ struct cpumask visit_cpus;
+ struct sched_group *sg;
+
+ WARN_ON(!eenv->sg_top->sge);
+
+ cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top));
+
+ while (!cpumask_empty(&visit_cpus)) {
+ struct sched_group *sg_shared_cap = NULL;
+
+ cpu = cpumask_first(&visit_cpus);
- /*
- * Yeah, it's the switching-frequency, could means many wakee or
- * rapidly switch, use factor here will just help to automatically
- * adjust the loose-degree, so bigger node will lead to more pull.
- */
- if (p->wakee_flips > factor) {
/*
- * wakee is somewhat hot, it needs certain amount of cpu
- * resource, so if waker is far more hot, prefer to leave
- * it alone.
+ * Is the group utilization affected by cpus outside this
+ * sched_group?
*/
- if (current->wakee_flips > (factor * p->wakee_flips))
- return 1;
+ sd = rcu_dereference(per_cpu(sd_scs, cpu));
+
+ if (!sd)
+ /*
+ * We most probably raced with hotplug; returning a
+ * wrong energy estimation is better than entering an
+ * infinite loop.
+ */
+ return -EINVAL;
+
+ if (sd->parent)
+ sg_shared_cap = sd->parent->groups;
+
+ for_each_domain(cpu, sd) {
+ sg = sd->groups;
+
+ /* Has this sched_domain already been visited? */
+ if (sd->child && group_first_cpu(sg) != cpu)
+ break;
+
+ do {
+ unsigned long group_util;
+ int sg_busy_energy, sg_idle_energy;
+ int cap_idx, idle_idx;
+
+ if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight)
+ eenv->sg_cap = sg_shared_cap;
+ else
+ eenv->sg_cap = sg;
+
+ cap_idx = find_new_capacity(eenv, sg->sge);
+
+ if (sg->group_weight == 1) {
+ /* Remove capacity of src CPU (before task move) */
+ if (eenv->util_delta == 0 &&
+ cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) {
+ eenv->cap.before = sg->sge->cap_states[cap_idx].cap;
+ eenv->cap.delta -= eenv->cap.before;
+ }
+ /* Add capacity of dst CPU (after task move) */
+ if (eenv->util_delta != 0 &&
+ cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) {
+ eenv->cap.after = sg->sge->cap_states[cap_idx].cap;
+ eenv->cap.delta += eenv->cap.after;
+ }
+ }
+
+ idle_idx = group_idle_state(sg);
+ group_util = group_norm_util(eenv, sg);
+ sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power)
+ >> SCHED_CAPACITY_SHIFT;
+ sg_idle_energy = ((SCHED_LOAD_SCALE-group_util)
+ * sg->sge->idle_states[idle_idx].power)
+ >> SCHED_CAPACITY_SHIFT;
+
+ total_energy += sg_busy_energy + sg_idle_energy;
+
+ if (!sd->child)
+ cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg));
+
+ if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top)))
+ goto next_cpu;
+
+ } while (sg = sg->next, sg != sd->groups);
+ }
+next_cpu:
+ cpumask_clear_cpu(cpu, &visit_cpus);
+ continue;
}
+ eenv->energy = total_energy;
return 0;
}
+static inline bool cpu_in_sg(struct sched_group *sg, int cpu)
+{
+ return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg));
+}
+
+/*
+ * energy_diff(): Estimate the energy impact of changing the utilization
+ * distribution. eenv specifies the change: utilisation amount, source, and
+ * destination cpu. Source or destination cpu may be -1 in which case the
+ * utilization is removed from or added to the system (e.g. task wake-up). If
+ * both are specified, the utilization is migrated.
+ */
+static inline int __energy_diff(struct energy_env *eenv)
+{
+ struct sched_domain *sd;
+ struct sched_group *sg;
+ int sd_cpu = -1, energy_before = 0, energy_after = 0;
+
+ struct energy_env eenv_before = {
+ .util_delta = 0,
+ .src_cpu = eenv->src_cpu,
+ .dst_cpu = eenv->dst_cpu,
+ .nrg = { 0, 0, 0, 0},
+ .cap = { 0, 0, 0 },
+ };
+
+ if (eenv->src_cpu == eenv->dst_cpu)
+ return 0;
+
+ sd_cpu = (eenv->src_cpu != -1) ? eenv->src_cpu : eenv->dst_cpu;
+ sd = rcu_dereference(per_cpu(sd_ea, sd_cpu));
+
+ if (!sd)
+ return 0; /* Error */
+
+ sg = sd->groups;
+
+ do {
+ if (cpu_in_sg(sg, eenv->src_cpu) || cpu_in_sg(sg, eenv->dst_cpu)) {
+ eenv_before.sg_top = eenv->sg_top = sg;
+
+ if (sched_group_energy(&eenv_before))
+ return 0; /* Invalid result abort */
+ energy_before += eenv_before.energy;
+
+ /* Keep track of SRC cpu (before) capacity */
+ eenv->cap.before = eenv_before.cap.before;
+ eenv->cap.delta = eenv_before.cap.delta;
+
+ if (sched_group_energy(eenv))
+ return 0; /* Invalid result abort */
+ energy_after += eenv->energy;
+ }
+ } while (sg = sg->next, sg != sd->groups);
+
+ eenv->nrg.before = energy_before;
+ eenv->nrg.after = energy_after;
+ eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before;
+ eenv->payoff = 0;
+
+ trace_sched_energy_diff(eenv->task,
+ eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
+ eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
+ eenv->cap.before, eenv->cap.after, eenv->cap.delta,
+ eenv->nrg.delta, eenv->payoff);
+
+ return eenv->nrg.diff;
+}
+
+#ifdef CONFIG_SCHED_TUNE
+
+struct target_nrg schedtune_target_nrg;
+
+/*
+ * System energy normalization
+ * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE],
+ * corresponding to the specified energy variation.
+ */
+static inline int
+normalize_energy(int energy_diff)
+{
+ u32 normalized_nrg;
+#ifdef CONFIG_SCHED_DEBUG
+ int max_delta;
+
+ /* Check for boundaries */
+ max_delta = schedtune_target_nrg.max_power;
+ max_delta -= schedtune_target_nrg.min_power;
+ WARN_ON(abs(energy_diff) >= max_delta);
+#endif
+
+ /* Do scaling using positive numbers to increase the range */
+ normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff;
+
+ /* Scale by energy magnitude */
+ normalized_nrg <<= SCHED_LOAD_SHIFT;
+
+ /* Normalize on max energy for target platform */
+ normalized_nrg = reciprocal_divide(
+ normalized_nrg, schedtune_target_nrg.rdiv);
+
+ return (energy_diff < 0) ? -normalized_nrg : normalized_nrg;
+}
+
+static inline int
+energy_diff(struct energy_env *eenv)
+{
+ int boost = schedtune_task_boost(eenv->task);
+ int nrg_delta;
+
+ /* Conpute "absolute" energy diff */
+ __energy_diff(eenv);
+
+ /* Return energy diff when boost margin is 0 */
+ if (boost == 0)
+ return eenv->nrg.diff;
+
+ /* Compute normalized energy diff */
+ nrg_delta = normalize_energy(eenv->nrg.diff);
+ eenv->nrg.delta = nrg_delta;
+
+ eenv->payoff = schedtune_accept_deltas(
+ eenv->nrg.delta,
+ eenv->cap.delta,
+ eenv->task);
+
+ /*
+ * When SchedTune is enabled, the energy_diff() function will return
+ * the computed energy payoff value. Since the energy_diff() return
+ * value is expected to be negative by its callers, this evaluation
+ * function return a negative value each time the evaluation return a
+ * positive payoff, which is the condition for the acceptance of
+ * a scheduling decision
+ */
+ return -eenv->payoff;
+}
+#else /* CONFIG_SCHED_TUNE */
+#define energy_diff(eenv) __energy_diff(eenv)
+#endif
+
+/*
+ * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
+ * A waker of many should wake a different task than the one last awakened
+ * at a frequency roughly N times higher than one of its wakees. In order
+ * to determine whether we should let the load spread vs consolodating to
+ * shared cache, we look for a minimum 'flip' frequency of llc_size in one
+ * partner, and a factor of lls_size higher frequency in the other. With
+ * both conditions met, we can be relatively sure that the relationship is
+ * non-monogamous, with partner count exceeding socket size. Waker/wakee
+ * being client/server, worker/dispatcher, interrupt source or whatever is
+ * irrelevant, spread criteria is apparent partner count exceeds socket size.
+ */
+static int wake_wide(struct task_struct *p)
+{
+ unsigned int master = current->wakee_flips;
+ unsigned int slave = p->wakee_flips;
+ int factor = this_cpu_read(sd_llc_size);
+
+ if (master < slave)
+ swap(master, slave);
+ if (slave < factor || master < slave * factor)
+ return 0;
+ return 1;
+}
+
static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
{
s64 this_load, load;
@@ -4312,13 +5017,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
unsigned long weight;
int balanced;
- /*
- * If we wake multiple tasks be careful to not bounce
- * ourselves around too much.
- */
- if (wake_wide(p))
- return 0;
-
idx = sd->wake_idx;
this_cpu = smp_processor_id();
prev_cpu = task_cpu(p);
@@ -4332,14 +5030,14 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
*/
if (sync) {
tg = task_group(current);
- weight = current->se.load.weight;
+ weight = current->se.avg.load_avg;
this_load += effective_load(tg, this_cpu, -weight, -weight);
load += effective_load(tg, prev_cpu, 0, -weight);
}
tg = task_group(p);
- weight = p->se.load.weight;
+ weight = p->se.avg.load_avg;
/*
* In low-load situations, where prev_cpu is idle and this_cpu is idle
@@ -4376,6 +5074,160 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
return 1;
}
+static inline unsigned long task_util(struct task_struct *p)
+{
+#ifdef CONFIG_SCHED_WALT
+ if (!walt_disabled && sysctl_sched_use_walt_task_util) {
+ unsigned long demand = p->ravg.demand;
+ return (demand << 10) / walt_ravg_window;
+ }
+#endif
+ return p->se.avg.util_avg;
+}
+
+unsigned int capacity_margin = 1280; /* ~20% margin */
+
+static inline unsigned long boosted_task_util(struct task_struct *task);
+
+static inline bool __task_fits(struct task_struct *p, int cpu, int util)
+{
+ unsigned long capacity = capacity_of(cpu);
+
+ util += boosted_task_util(p);
+
+ return (capacity * 1024) > (util * capacity_margin);
+}
+
+static inline bool task_fits_max(struct task_struct *p, int cpu)
+{
+ unsigned long capacity = capacity_of(cpu);
+ unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity.val;
+
+ if (capacity == max_capacity)
+ return true;
+
+ if (capacity * capacity_margin > max_capacity * 1024)
+ return true;
+
+ return __task_fits(p, cpu, 0);
+}
+
+static inline bool task_fits_spare(struct task_struct *p, int cpu)
+{
+ return __task_fits(p, cpu, cpu_util(cpu));
+}
+
+static bool cpu_overutilized(int cpu)
+{
+ return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
+}
+
+#ifdef CONFIG_SCHED_TUNE
+
+static long
+schedtune_margin(unsigned long signal, long boost)
+{
+ long long margin = 0;
+
+ /*
+ * Signal proportional compensation (SPC)
+ *
+ * The Boost (B) value is used to compute a Margin (M) which is
+ * proportional to the complement of the original Signal (S):
+ * M = B * (SCHED_LOAD_SCALE - S), if B is positive
+ * M = B * S, if B is negative
+ * The obtained M could be used by the caller to "boost" S.
+ */
+ if (boost >= 0) {
+ margin = SCHED_LOAD_SCALE - signal;
+ margin *= boost;
+ } else
+ margin = -signal * boost;
+ /*
+ * Fast integer division by constant:
+ * Constant : (C) = 100
+ * Precision : 0.1% (P) = 0.1
+ * Reference : C * 100 / P (R) = 100000
+ *
+ * Thus:
+ * Shift bits : ceil(log(R,2)) (S) = 17
+ * Mult const : round(2^S/C) (M) = 1311
+ *
+ *
+ */
+ margin *= 1311;
+ margin >>= 17;
+
+ if (boost < 0)
+ margin *= -1;
+ return margin;
+}
+
+static inline int
+schedtune_cpu_margin(unsigned long util, int cpu)
+{
+ int boost = schedtune_cpu_boost(cpu);
+
+ if (boost == 0)
+ return 0;
+
+ return schedtune_margin(util, boost);
+}
+
+static inline long
+schedtune_task_margin(struct task_struct *task)
+{
+ int boost = schedtune_task_boost(task);
+ unsigned long util;
+ long margin;
+
+ if (boost == 0)
+ return 0;
+
+ util = task_util(task);
+ margin = schedtune_margin(util, boost);
+
+ return margin;
+}
+
+#else /* CONFIG_SCHED_TUNE */
+
+static inline int
+schedtune_cpu_margin(unsigned long util, int cpu)
+{
+ return 0;
+}
+
+static inline int
+schedtune_task_margin(struct task_struct *task)
+{
+ return 0;
+}
+
+#endif /* CONFIG_SCHED_TUNE */
+
+static inline unsigned long
+boosted_cpu_util(int cpu)
+{
+ unsigned long util = cpu_util(cpu);
+ long margin = schedtune_cpu_margin(util, cpu);
+
+ trace_sched_boost_cpu(cpu, util, margin);
+
+ return util + margin;
+}
+
+static inline unsigned long
+boosted_task_util(struct task_struct *task)
+{
+ unsigned long util = task_util(task);
+ long margin = schedtune_task_margin(task);
+
+ trace_sched_boost_task(task, util, margin);
+
+ return util + margin;
+}
+
/*
* find_idlest_group finds and returns the least busy CPU group within the
* domain.
@@ -4385,7 +5237,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int sd_flag)
{
struct sched_group *idlest = NULL, *group = sd->groups;
+ struct sched_group *fit_group = NULL, *spare_group = NULL;
unsigned long min_load = ULONG_MAX, this_load = 0;
+ unsigned long fit_capacity = ULONG_MAX;
+ unsigned long max_spare_capacity = capacity_margin - SCHED_LOAD_SCALE;
int load_idx = sd->forkexec_idx;
int imbalance = 100 + (sd->imbalance_pct-100)/2;
@@ -4393,7 +5248,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
load_idx = sd->wake_idx;
do {
- unsigned long load, avg_load;
+ unsigned long load, avg_load, spare_capacity;
int local_group;
int i;
@@ -4416,6 +5271,25 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
load = target_load(i, load_idx);
avg_load += load;
+
+ /*
+ * Look for most energy-efficient group that can fit
+ * that can fit the task.
+ */
+ if (capacity_of(i) < fit_capacity && task_fits_spare(p, i)) {
+ fit_capacity = capacity_of(i);
+ fit_group = group;
+ }
+
+ /*
+ * Look for group which has most spare capacity on a
+ * single cpu.
+ */
+ spare_capacity = capacity_of(i) - cpu_util(i);
+ if (spare_capacity > max_spare_capacity) {
+ max_spare_capacity = spare_capacity;
+ spare_group = group;
+ }
}
/* Adjust by relative CPU capacity of the group */
@@ -4429,6 +5303,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
}
} while (group = group->next, group != sd->groups);
+ if (fit_group)
+ return fit_group;
+
+ if (spare_group)
+ return spare_group;
+
if (!idlest || 100*this_load < imbalance*min_load)
return NULL;
return idlest;
@@ -4449,7 +5329,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
- if (idle_cpu(i)) {
+ if (task_fits_spare(p, i)) {
struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
if (idle && idle->exit_latency < min_exit_latency) {
@@ -4461,7 +5341,8 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
min_exit_latency = idle->exit_latency;
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
- } else if ((!idle || idle->exit_latency == min_exit_latency) &&
+ } else if (idle_cpu(i) &&
+ (!idle || idle->exit_latency == min_exit_latency) &&
rq->idle_stamp > latest_idle_timestamp) {
/*
* If equal or no active idle state, then
@@ -4470,6 +5351,13 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
*/
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
+ } else if (shallowest_idle_cpu == -1) {
+ /*
+ * If we haven't found an idle CPU yet
+ * pick a non-idle one that can fit the task as
+ * fallback.
+ */
+ shallowest_idle_cpu = i;
}
} else {
load = weighted_cpuload(i);
@@ -4491,15 +5379,20 @@ static int select_idle_sibling(struct task_struct *p, int target)
struct sched_domain *sd;
struct sched_group *sg;
int i = task_cpu(p);
+ int best_idle = -1;
+ int best_idle_cstate = -1;
+ int best_idle_capacity = INT_MAX;
- if (idle_cpu(target))
- return target;
+ if (!sysctl_sched_cstate_aware) {
+ if (idle_cpu(target))
+ return target;
- /*
- * If the prevous cpu is cache affine and idle, don't be stupid.
- */
- if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
- return i;
+ /*
+ * If the prevous cpu is cache affine and idle, don't be stupid.
+ */
+ if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
+ return i;
+ }
/*
* Otherwise, iterate the domains and find an elegible idle cpu.
@@ -4512,22 +5405,264 @@ static int select_idle_sibling(struct task_struct *p, int target)
tsk_cpus_allowed(p)))
goto next;
- for_each_cpu(i, sched_group_cpus(sg)) {
- if (i == target || !idle_cpu(i))
- goto next;
- }
-
- target = cpumask_first_and(sched_group_cpus(sg),
+ if (sysctl_sched_cstate_aware) {
+ for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
+ struct rq *rq = cpu_rq(i);
+ int idle_idx = idle_get_state_idx(rq);
+ unsigned long new_usage = boosted_task_util(p);
+ unsigned long capacity_orig = capacity_orig_of(i);
+ if (new_usage > capacity_orig || !idle_cpu(i))
+ goto next;
+
+ if (i == target && new_usage <= capacity_curr_of(target))
+ return target;
+
+ if (best_idle < 0 || (idle_idx < best_idle_cstate && capacity_orig <= best_idle_capacity)) {
+ best_idle = i;
+ best_idle_cstate = idle_idx;
+ best_idle_capacity = capacity_orig;
+ }
+ }
+ } else {
+ for_each_cpu(i, sched_group_cpus(sg)) {
+ if (i == target || !idle_cpu(i))
+ goto next;
+ }
+
+ target = cpumask_first_and(sched_group_cpus(sg),
tsk_cpus_allowed(p));
- goto done;
+ goto done;
+ }
next:
sg = sg->next;
} while (sg != sd->groups);
}
+ if (best_idle > 0)
+ target = best_idle;
+
done:
return target;
}
+static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle)
+{
+ int iter_cpu;
+ int target_cpu = -1;
+ int target_util = 0;
+ int backup_capacity = 0;
+ int best_idle_cpu = -1;
+ int best_idle_cstate = INT_MAX;
+ int backup_cpu = -1;
+ unsigned long task_util_boosted, new_util;
+
+ task_util_boosted = boosted_task_util(p);
+ for (iter_cpu = 0; iter_cpu < NR_CPUS; iter_cpu++) {
+ int cur_capacity;
+ struct rq *rq;
+ int idle_idx;
+
+ /*
+ * Iterate from higher cpus for boosted tasks.
+ */
+ int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu;
+
+ if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p)))
+ continue;
+
+ /*
+ * p's blocked utilization is still accounted for on prev_cpu
+ * so prev_cpu will receive a negative bias due to the double
+ * accounting. However, the blocked utilization may be zero.
+ */
+ new_util = cpu_util(i) + task_util_boosted;
+
+ /*
+ * Ensure minimum capacity to grant the required boost.
+ * The target CPU can be already at a capacity level higher
+ * than the one required to boost the task.
+ */
+ if (new_util > capacity_orig_of(i))
+ continue;
+
+#ifdef CONFIG_SCHED_WALT
+ if (walt_cpu_high_irqload(i))
+ continue;
+#endif
+ /*
+ * Unconditionally favoring tasks that prefer idle cpus to
+ * improve latency.
+ */
+ if (idle_cpu(i) && prefer_idle) {
+ if (best_idle_cpu < 0)
+ best_idle_cpu = i;
+ continue;
+ }
+
+ cur_capacity = capacity_curr_of(i);
+ rq = cpu_rq(i);
+ idle_idx = idle_get_state_idx(rq);
+
+ if (new_util < cur_capacity) {
+ if (cpu_rq(i)->nr_running) {
+ if (prefer_idle) {
+ /* Find a target cpu with highest
+ * utilization.
+ */
+ if (target_util == 0 ||
+ target_util < new_util) {
+ target_cpu = i;
+ target_util = new_util;
+ }
+ } else {
+ /* Find a target cpu with lowest
+ * utilization.
+ */
+ if (target_util == 0 ||
+ target_util > new_util) {
+ target_cpu = i;
+ target_util = new_util;
+ }
+ }
+ } else if (!prefer_idle) {
+ if (best_idle_cpu < 0 ||
+ (sysctl_sched_cstate_aware &&
+ best_idle_cstate > idle_idx)) {
+ best_idle_cstate = idle_idx;
+ best_idle_cpu = i;
+ }
+ }
+ } else if (backup_capacity == 0 ||
+ backup_capacity > cur_capacity) {
+ // Find a backup cpu with least capacity.
+ backup_capacity = cur_capacity;
+ backup_cpu = i;
+ }
+ }
+
+ if (prefer_idle && best_idle_cpu >= 0)
+ target_cpu = best_idle_cpu;
+ else if (target_cpu < 0)
+ target_cpu = best_idle_cpu >= 0 ? best_idle_cpu : backup_cpu;
+
+ return target_cpu;
+}
+
+static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync)
+{
+ struct sched_domain *sd;
+ struct sched_group *sg, *sg_target;
+ int target_max_cap = INT_MAX;
+ int target_cpu = task_cpu(p);
+ unsigned long task_util_boosted, new_util;
+ int i;
+
+ if (sysctl_sched_sync_hint_enable && sync) {
+ int cpu = smp_processor_id();
+ cpumask_t search_cpus;
+ cpumask_and(&search_cpus, tsk_cpus_allowed(p), cpu_online_mask);
+ if (cpumask_test_cpu(cpu, &search_cpus))
+ return cpu;
+ }
+
+ sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p)));
+
+ if (!sd)
+ return target;
+
+ sg = sd->groups;
+ sg_target = sg;
+
+ if (sysctl_sched_is_big_little) {
+
+ /*
+ * Find group with sufficient capacity. We only get here if no cpu is
+ * overutilized. We may end up overutilizing a cpu by adding the task,
+ * but that should not be any worse than select_idle_sibling().
+ * load_balance() should sort it out later as we get above the tipping
+ * point.
+ */
+ do {
+ /* Assuming all cpus are the same in group */
+ int max_cap_cpu = group_first_cpu(sg);
+
+ /*
+ * Assume smaller max capacity means more energy-efficient.
+ * Ideally we should query the energy model for the right
+ * answer but it easily ends up in an exhaustive search.
+ */
+ if (capacity_of(max_cap_cpu) < target_max_cap &&
+ task_fits_max(p, max_cap_cpu)) {
+ sg_target = sg;
+ target_max_cap = capacity_of(max_cap_cpu);
+ }
+ } while (sg = sg->next, sg != sd->groups);
+
+ task_util_boosted = boosted_task_util(p);
+ /* Find cpu with sufficient capacity */
+ for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) {
+ /*
+ * p's blocked utilization is still accounted for on prev_cpu
+ * so prev_cpu will receive a negative bias due to the double
+ * accounting. However, the blocked utilization may be zero.
+ */
+ new_util = cpu_util(i) + task_util_boosted;
+
+ /*
+ * Ensure minimum capacity to grant the required boost.
+ * The target CPU can be already at a capacity level higher
+ * than the one required to boost the task.
+ */
+ if (new_util > capacity_orig_of(i))
+ continue;
+
+ if (new_util < capacity_curr_of(i)) {
+ target_cpu = i;
+ if (cpu_rq(i)->nr_running)
+ break;
+ }
+
+ /* cpu has capacity at higher OPP, keep it as fallback */
+ if (target_cpu == task_cpu(p))
+ target_cpu = i;
+ }
+ } else {
+ /*
+ * Find a cpu with sufficient capacity
+ */
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+ bool boosted = schedtune_task_boost(p) > 0;
+ bool prefer_idle = schedtune_prefer_idle(p) > 0;
+#else
+ bool boosted = 0;
+ bool prefer_idle = 0;
+#endif
+ int tmp_target = find_best_target(p, boosted, prefer_idle);
+ if (tmp_target >= 0) {
+ target_cpu = tmp_target;
+ if ((boosted || prefer_idle) && idle_cpu(target_cpu))
+ return target_cpu;
+ }
+ }
+
+ if (target_cpu != task_cpu(p)) {
+ struct energy_env eenv = {
+ .util_delta = task_util(p),
+ .src_cpu = task_cpu(p),
+ .dst_cpu = target_cpu,
+ .task = p,
+ };
+
+ /* Not enough spare capacity on previous cpu */
+ if (cpu_overutilized(task_cpu(p)))
+ return target_cpu;
+
+ if (energy_diff(&eenv) >= 0)
+ return task_cpu(p);
+ }
+
+ return target_cpu;
+}
+
/*
* select_task_rq_fair: Select target runqueue for the waking task in domains
* that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
@@ -4545,7 +5680,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
{
struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
int cpu = smp_processor_id();
- int new_cpu = cpu;
+ int new_cpu = prev_cpu;
int want_affine = 0;
int sync = wake_flags & WF_SYNC;
@@ -4553,12 +5688,14 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
return prev_cpu;
if (sd_flag & SD_BALANCE_WAKE)
- want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+ want_affine = (!wake_wide(p) && task_fits_max(p, cpu) &&
+ cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) ||
+ energy_aware();
rcu_read_lock();
for_each_domain(cpu, tmp) {
if (!(tmp->flags & SD_LOAD_BALANCE))
- continue;
+ break;
/*
* If both cpu and prev_cpu are part of this domain,
@@ -4572,17 +5709,23 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (tmp->flags & sd_flag)
sd = tmp;
+ else if (!want_affine)
+ break;
}
- if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
- prev_cpu = cpu;
-
- if (sd_flag & SD_BALANCE_WAKE) {
- new_cpu = select_idle_sibling(p, prev_cpu);
- goto unlock;
+ if (affine_sd) {
+ sd = NULL; /* Prefer wake_affine over balance flags */
+ if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+ new_cpu = cpu;
}
- while (sd) {
+ if (!sd) {
+ if (energy_aware() && !cpu_rq(cpu)->rd->overutilized)
+ new_cpu = energy_aware_wake_cpu(p, prev_cpu, sync);
+ else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
+ new_cpu = select_idle_sibling(p, new_cpu);
+
+ } else while (sd) {
struct sched_group *group;
int weight;
@@ -4616,7 +5759,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
}
/* while loop will break here if sd == NULL */
}
-unlock:
rcu_read_unlock();
return new_cpu;
@@ -4625,30 +5767,32 @@ unlock:
/*
* Called immediately before a task is migrated to a new cpu; task_cpu(p) and
* cfs_rq_of(p) references at time of call are still valid and identify the
- * previous cpu. However, the caller only guarantees p->pi_lock is held; no
- * other assumptions, including the state of rq->lock, should be made.
+ * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
*/
-static void
-migrate_task_rq_fair(struct task_struct *p, int next_cpu)
+static void migrate_task_rq_fair(struct task_struct *p, int next_cpu)
{
- struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
/*
- * Load tracking: accumulate removed load so that it can be processed
- * when we next update owning cfs_rq under rq->lock. Tasks contribute
- * to blocked load iff they have a positive decay-count. It can never
- * be negative here since on-rq tasks have decay-count == 0.
+ * We are supposed to update the task to "current" time, then its up to date
+ * and ready to go to new CPU/cfs_rq. But we have difficulty in getting
+ * what current time is, so simply throw away the out-of-date time. This
+ * will result in the wakee task is less decayed, but giving the wakee more
+ * load sounds not bad.
*/
- if (se->avg.decay_count) {
- se->avg.decay_count = -__synchronize_entity_decay(se);
- atomic_long_add(se->avg.load_avg_contrib,
- &cfs_rq->removed_load);
- }
+ remove_entity_load_avg(&p->se);
+
+ /* Tell new CPU we are migrated */
+ p->se.avg.last_update_time = 0;
/* We have migrated, no longer consider this task hot */
- se->exec_start = 0;
+ p->se.exec_start = 0;
}
+
+static void task_dead_fair(struct task_struct *p)
+{
+ remove_entity_load_avg(&p->se);
+}
+#else
+#define task_fits_max(p, cpu) true
#endif /* CONFIG_SMP */
static unsigned long
@@ -4895,6 +6039,8 @@ again:
if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);
+ rq->misfit_task = !task_fits_max(p, rq->cpu);
+
return p;
simple:
cfs_rq = &rq->cfs;
@@ -4916,9 +6062,13 @@ simple:
if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);
+ rq->misfit_task = !task_fits_max(p, rq->cpu);
+
return p;
idle:
+ rq->misfit_task = 0;
+
new_tasks = idle_balance(rq);
/*
* Because idle_balance() releases (and re-acquires) rq->lock, it is
@@ -5123,6 +6273,13 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
enum fbq_type { regular, remote, all };
+enum group_type {
+ group_other = 0,
+ group_misfit_task,
+ group_imbalanced,
+ group_overloaded,
+};
+
#define LBF_ALL_PINNED 0x01
#define LBF_NEED_BREAK 0x02
#define LBF_DST_PINNED 0x04
@@ -5141,6 +6298,7 @@ struct lb_env {
int new_dst_cpu;
enum cpu_idle_type idle;
long imbalance;
+ unsigned int src_grp_nr_running;
/* The set of CPUs under consideration for load-balancing */
struct cpumask *cpus;
@@ -5151,6 +6309,7 @@ struct lb_env {
unsigned int loop_max;
enum fbq_type fbq_type;
+ enum group_type busiest_group_type;
struct list_head tasks;
};
@@ -5365,7 +6524,9 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
deactivate_task(env->src_rq, p, 0);
p->on_rq = TASK_ON_RQ_MIGRATING;
+ double_lock_balance(env->src_rq, env->dst_rq);
set_task_cpu(p, env->dst_cpu);
+ double_unlock_balance(env->src_rq, env->dst_rq);
}
/*
@@ -5419,6 +6580,13 @@ static int detach_tasks(struct lb_env *env)
return 0;
while (!list_empty(tasks)) {
+ /*
+ * We don't want to steal all, otherwise we may be treated likewise,
+ * which could at worst lead to a livelock crash.
+ */
+ if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
+ break;
+
p = list_first_entry(tasks, struct task_struct, se.group_node);
env->loop++;
@@ -5503,6 +6671,10 @@ static void attach_one_task(struct rq *rq, struct task_struct *p)
{
raw_spin_lock(&rq->lock);
attach_task(rq, p);
+ /*
+ * We want to potentially raise target_cpu's OPP.
+ */
+ update_capacity_of(cpu_of(rq));
raw_spin_unlock(&rq->lock);
}
@@ -5524,43 +6696,15 @@ static void attach_tasks(struct lb_env *env)
attach_task(env->dst_rq, p);
}
+ /*
+ * We want to potentially raise env.dst_cpu's OPP.
+ */
+ update_capacity_of(env->dst_cpu);
+
raw_spin_unlock(&env->dst_rq->lock);
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-/*
- * update tg->load_weight by folding this cpu's load_avg
- */
-static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
-{
- struct sched_entity *se = tg->se[cpu];
- struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
-
- /* throttled entities do not contribute to load */
- if (throttled_hierarchy(cfs_rq))
- return;
-
- update_cfs_rq_blocked_load(cfs_rq, 1);
-
- if (se) {
- update_entity_load_avg(se, 1);
- /*
- * We pivot on our runnable average having decayed to zero for
- * list removal. This generally implies that all our children
- * have also been removed (modulo rounding error or bandwidth
- * control); however, such cases are rare and we can fix these
- * at enqueue.
- *
- * TODO: fix up out-of-order children on enqueue.
- */
- if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
- list_del_leaf_cfs_rq(cfs_rq);
- } else {
- struct rq *rq = rq_of(cfs_rq);
- update_rq_runnable_avg(rq, rq->nr_running);
- }
-}
-
static void update_blocked_averages(int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -5569,19 +6713,19 @@ static void update_blocked_averages(int cpu)
raw_spin_lock_irqsave(&rq->lock, flags);
update_rq_clock(rq);
+
/*
* Iterates the task_group tree in a bottom up fashion, see
* list_add_leaf_cfs_rq() for details.
*/
for_each_leaf_cfs_rq(rq, cfs_rq) {
- /*
- * Note: We may want to consider periodically releasing
- * rq->lock about these updates so that creating many task
- * groups does not result in continually extending hold time.
- */
- __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
- }
+ /* throttled entities do not contribute to load */
+ if (throttled_hierarchy(cfs_rq))
+ continue;
+ if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
+ update_tg_load_avg(cfs_rq, 0);
+ }
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -5609,14 +6753,14 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
}
if (!se) {
- cfs_rq->h_load = cfs_rq->runnable_load_avg;
+ cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
cfs_rq->last_h_load_update = now;
}
while ((se = cfs_rq->h_load_next) != NULL) {
load = cfs_rq->h_load;
- load = div64_ul(load * se->avg.load_avg_contrib,
- cfs_rq->runnable_load_avg + 1);
+ load = div64_ul(load * se->avg.load_avg,
+ cfs_rq_load_avg(cfs_rq) + 1);
cfs_rq = group_cfs_rq(se);
cfs_rq->h_load = load;
cfs_rq->last_h_load_update = now;
@@ -5628,28 +6772,30 @@ static unsigned long task_h_load(struct task_struct *p)
struct cfs_rq *cfs_rq = task_cfs_rq(p);
update_cfs_rq_h_load(cfs_rq);
- return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
- cfs_rq->runnable_load_avg + 1);
+ return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
+ cfs_rq_load_avg(cfs_rq) + 1);
}
#else
static inline void update_blocked_averages(int cpu)
{
+ struct rq *rq = cpu_rq(cpu);
+ struct cfs_rq *cfs_rq = &rq->cfs;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ update_rq_clock(rq);
+ update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
}
static unsigned long task_h_load(struct task_struct *p)
{
- return p->se.avg.load_avg_contrib;
+ return p->se.avg.load_avg;
}
#endif
/********** Helpers for find_busiest_group ************************/
-enum group_type {
- group_other = 0,
- group_imbalanced,
- group_overloaded,
-};
-
/*
* sg_lb_stats - stats of a sched_group required for load_balancing
*/
@@ -5659,12 +6805,13 @@ struct sg_lb_stats {
unsigned long sum_weighted_load; /* Weighted load of group's tasks */
unsigned long load_per_task;
unsigned long group_capacity;
+ unsigned long group_util; /* Total utilization of the group */
unsigned int sum_nr_running; /* Nr tasks running in the group */
- unsigned int group_capacity_factor;
unsigned int idle_cpus;
unsigned int group_weight;
enum group_type group_type;
- int group_has_free_capacity;
+ int group_no_capacity;
+ int group_misfit_task; /* A cpu has a task too big for its capacity */
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
@@ -5735,84 +6882,78 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
return load_idx;
}
-static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu)
-{
- return SCHED_CAPACITY_SCALE;
-}
-
-unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
-{
- return default_scale_capacity(sd, cpu);
-}
-
-static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu)
-{
- if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
- return sd->smt_gain / sd->span_weight;
-
- return SCHED_CAPACITY_SCALE;
-}
-
-unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
-{
- return default_scale_cpu_capacity(sd, cpu);
-}
-
static unsigned long scale_rt_capacity(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- u64 total, available, age_stamp, avg;
+ u64 total, used, age_stamp, avg;
s64 delta;
/*
* Since we're reading these variables without serialization make sure
* we read them once before doing sanity checks on them.
*/
- age_stamp = ACCESS_ONCE(rq->age_stamp);
- avg = ACCESS_ONCE(rq->rt_avg);
+ age_stamp = READ_ONCE(rq->age_stamp);
+ avg = READ_ONCE(rq->rt_avg);
+ delta = __rq_clock_broken(rq) - age_stamp;
- delta = rq_clock(rq) - age_stamp;
if (unlikely(delta < 0))
delta = 0;
total = sched_avg_period() + delta;
- if (unlikely(total < avg)) {
- /* Ensures that capacity won't end up being negative */
- available = 0;
- } else {
- available = total - avg;
- }
+ used = div_u64(avg, total);
- if (unlikely((s64)total < SCHED_CAPACITY_SCALE))
- total = SCHED_CAPACITY_SCALE;
+ /*
+ * deadline bandwidth is defined at system level so we must
+ * weight this bandwidth with the max capacity of the system.
+ * As a reminder, avg_bw is 20bits width and
+ * scale_cpu_capacity is 10 bits width
+ */
+ used += div_u64(rq->dl.avg_bw, arch_scale_cpu_capacity(NULL, cpu));
+
+ if (likely(used < SCHED_CAPACITY_SCALE))
+ return SCHED_CAPACITY_SCALE - used;
- total >>= SCHED_CAPACITY_SHIFT;
+ return 1;
+}
- return div_u64(available, total);
+void init_max_cpu_capacity(struct max_cpu_capacity *mcc)
+{
+ raw_spin_lock_init(&mcc->lock);
+ mcc->val = 0;
+ mcc->cpu = -1;
}
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
- unsigned long capacity = SCHED_CAPACITY_SCALE;
+ unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
struct sched_group *sdg = sd->groups;
+ struct max_cpu_capacity *mcc;
+ unsigned long max_capacity;
+ int max_cap_cpu;
+ unsigned long flags;
- if (sched_feat(ARCH_CAPACITY))
- capacity *= arch_scale_cpu_capacity(sd, cpu);
- else
- capacity *= default_scale_cpu_capacity(sd, cpu);
-
- capacity >>= SCHED_CAPACITY_SHIFT;
+ cpu_rq(cpu)->cpu_capacity_orig = capacity;
- sdg->sgc->capacity_orig = capacity;
+ mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
- if (sched_feat(ARCH_CAPACITY))
- capacity *= arch_scale_freq_capacity(sd, cpu);
- else
- capacity *= default_scale_capacity(sd, cpu);
+ raw_spin_lock_irqsave(&mcc->lock, flags);
+ max_capacity = mcc->val;
+ max_cap_cpu = mcc->cpu;
- capacity >>= SCHED_CAPACITY_SHIFT;
+ if ((max_capacity > capacity && max_cap_cpu == cpu) ||
+ (max_capacity < capacity)) {
+ mcc->val = capacity;
+ mcc->cpu = cpu;
+#ifdef CONFIG_SCHED_DEBUG
+ raw_spin_unlock_irqrestore(&mcc->lock, flags);
+ pr_info("CPU%d: update max cpu_capacity %lu\n", cpu, capacity);
+ goto skip_unlock;
+#endif
+ }
+ raw_spin_unlock_irqrestore(&mcc->lock, flags);
+skip_unlock: __attribute__ ((unused));
capacity *= scale_rt_capacity(cpu);
capacity >>= SCHED_CAPACITY_SHIFT;
@@ -5821,13 +6962,14 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
cpu_rq(cpu)->cpu_capacity = capacity;
sdg->sgc->capacity = capacity;
+ sdg->sgc->max_capacity = capacity;
}
void update_group_capacity(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
- unsigned long capacity, capacity_orig;
+ unsigned long capacity, max_capacity;
unsigned long interval;
interval = msecs_to_jiffies(sd->balance_interval);
@@ -5839,7 +6981,8 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
return;
}
- capacity_orig = capacity = 0;
+ capacity = 0;
+ max_capacity = 0;
if (child->flags & SD_OVERLAP) {
/*
@@ -5859,20 +7002,17 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
* Use capacity_of(), which is set irrespective of domains
* in update_cpu_capacity().
*
- * This avoids capacity/capacity_orig from being 0 and
+ * This avoids capacity from being 0 and
* causing divide-by-zero issues on boot.
- *
- * Runtime updates will correct capacity_orig.
*/
if (unlikely(!rq->sd)) {
- capacity_orig += capacity_of(cpu);
capacity += capacity_of(cpu);
- continue;
+ } else {
+ sgc = rq->sd->groups->sgc;
+ capacity += sgc->capacity;
}
- sgc = rq->sd->groups->sgc;
- capacity_orig += sgc->capacity_orig;
- capacity += sgc->capacity;
+ max_capacity = max(capacity, max_capacity);
}
} else {
/*
@@ -5882,39 +7022,28 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
group = child->groups;
do {
- capacity_orig += group->sgc->capacity_orig;
- capacity += group->sgc->capacity;
+ struct sched_group_capacity *sgc = group->sgc;
+
+ capacity += sgc->capacity;
+ max_capacity = max(sgc->max_capacity, max_capacity);
group = group->next;
} while (group != child->groups);
}
- sdg->sgc->capacity_orig = capacity_orig;
sdg->sgc->capacity = capacity;
+ sdg->sgc->max_capacity = max_capacity;
}
/*
- * Try and fix up capacity for tiny siblings, this is needed when
- * things like SD_ASYM_PACKING need f_b_g to select another sibling
- * which on its own isn't powerful enough.
- *
- * See update_sd_pick_busiest() and check_asym_packing().
+ * Check whether the capacity of the rq has been noticeably reduced by side
+ * activity. The imbalance_pct is used for the threshold.
+ * Return true is the capacity is reduced
*/
static inline int
-fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
+check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
{
- /*
- * Only siblings can have significantly less than SCHED_CAPACITY_SCALE
- */
- if (!(sd->flags & SD_SHARE_CPUCAPACITY))
- return 0;
-
- /*
- * If ~90% of the cpu_capacity is still there, we're good.
- */
- if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29)
- return 1;
-
- return 0;
+ return ((rq->cpu_capacity * sd->imbalance_pct) <
+ (rq->cpu_capacity_orig * 100));
}
/*
@@ -5952,42 +7081,76 @@ static inline int sg_imbalanced(struct sched_group *group)
}
/*
- * Compute the group capacity factor.
- *
- * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by
- * first dividing out the smt factor and computing the actual number of cores
- * and limit unit capacity with that.
+ * group_has_capacity returns true if the group has spare capacity that could
+ * be used by some tasks.
+ * We consider that a group has spare capacity if the * number of task is
+ * smaller than the number of CPUs or if the utilization is lower than the
+ * available capacity for CFS tasks.
+ * For the latter, we use a threshold to stabilize the state, to take into
+ * account the variance of the tasks' load and to return true if the available
+ * capacity in meaningful for the load balancer.
+ * As an example, an available capacity of 1% can appear but it doesn't make
+ * any benefit for the load balance.
+ */
+static inline bool
+group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
+{
+ if (sgs->sum_nr_running < sgs->group_weight)
+ return true;
+
+ if ((sgs->group_capacity * 100) >
+ (sgs->group_util * env->sd->imbalance_pct))
+ return true;
+
+ return false;
+}
+
+/*
+ * group_is_overloaded returns true if the group has more tasks than it can
+ * handle.
+ * group_is_overloaded is not equals to !group_has_capacity because a group
+ * with the exact right number of tasks, has no more spare capacity but is not
+ * overloaded so both group_has_capacity and group_is_overloaded return
+ * false.
*/
-static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group)
+static inline bool
+group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
{
- unsigned int capacity_factor, smt, cpus;
- unsigned int capacity, capacity_orig;
+ if (sgs->sum_nr_running <= sgs->group_weight)
+ return false;
- capacity = group->sgc->capacity;
- capacity_orig = group->sgc->capacity_orig;
- cpus = group->group_weight;
+ if ((sgs->group_capacity * 100) <
+ (sgs->group_util * env->sd->imbalance_pct))
+ return true;
- /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */
- smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig);
- capacity_factor = cpus / smt; /* cores */
+ return false;
+}
- capacity_factor = min_t(unsigned,
- capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE));
- if (!capacity_factor)
- capacity_factor = fix_small_capacity(env->sd, group);
- return capacity_factor;
+/*
+ * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
+ * per-cpu capacity than sched_group ref.
+ */
+static inline bool
+group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
+{
+ return sg->sgc->max_capacity + capacity_margin - SCHED_LOAD_SCALE <
+ ref->sgc->max_capacity;
}
-static enum group_type
-group_classify(struct sched_group *group, struct sg_lb_stats *sgs)
+static enum group_type group_classify(struct lb_env *env,
+ struct sched_group *group,
+ struct sg_lb_stats *sgs)
{
- if (sgs->sum_nr_running > sgs->group_capacity_factor)
+ if (sgs->group_no_capacity)
return group_overloaded;
if (sg_imbalanced(group))
return group_imbalanced;
+ if (sgs->group_misfit_task)
+ return group_misfit_task;
+
return group_other;
}
@@ -5999,14 +7162,15 @@ group_classify(struct sched_group *group, struct sg_lb_stats *sgs)
* @local_group: Does group contain this_cpu.
* @sgs: variable to hold the statistics for this group.
* @overload: Indicate more than one runnable task for any CPU.
+ * @overutilized: Indicate overutilization for any CPU.
*/
static inline void update_sg_lb_stats(struct lb_env *env,
struct sched_group *group, int load_idx,
int local_group, struct sg_lb_stats *sgs,
- bool *overload)
+ bool *overload, bool *overutilized)
{
unsigned long load;
- int i;
+ int i, nr_running;
memset(sgs, 0, sizeof(*sgs));
@@ -6020,9 +7184,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
load = source_load(i, load_idx);
sgs->group_load += load;
+ sgs->group_util += cpu_util(i);
sgs->sum_nr_running += rq->cfs.h_nr_running;
- if (rq->nr_running > 1)
+ nr_running = rq->nr_running;
+ if (nr_running > 1)
*overload = true;
#ifdef CONFIG_NUMA_BALANCING
@@ -6030,8 +7196,17 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->nr_preferred_running += rq->nr_preferred_running;
#endif
sgs->sum_weighted_load += weighted_cpuload(i);
- if (idle_cpu(i))
+ /*
+ * No need to call idle_cpu() if nr_running is not 0
+ */
+ if (!nr_running && idle_cpu(i))
sgs->idle_cpus++;
+
+ if (cpu_overutilized(i)) {
+ *overutilized = true;
+ if (!sgs->group_misfit_task && rq->misfit_task)
+ sgs->group_misfit_task = capacity_of(i);
+ }
}
/* Adjust by relative CPU capacity of the group */
@@ -6042,11 +7217,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
sgs->group_weight = group->group_weight;
- sgs->group_capacity_factor = sg_capacity_factor(env, group);
- sgs->group_type = group_classify(group, sgs);
- if (sgs->group_capacity_factor > sgs->sum_nr_running)
- sgs->group_has_free_capacity = 1;
+ sgs->group_no_capacity = group_is_overloaded(env, sgs);
+ sgs->group_type = group_classify(env, group, sgs);
}
/**
@@ -6075,9 +7248,25 @@ static bool update_sd_pick_busiest(struct lb_env *env,
if (sgs->group_type < busiest->group_type)
return false;
+ /*
+ * Candidate sg doesn't face any serious load-balance problems
+ * so don't pick it if the local sg is already filled up.
+ */
+ if (sgs->group_type == group_other &&
+ !group_has_capacity(env, &sds->local_stat))
+ return false;
+
if (sgs->avg_load <= busiest->avg_load)
return false;
+ /*
+ * Candiate sg has no more than one task per cpu and has higher
+ * per-cpu capacity. No reason to pull tasks to less capable cpus.
+ */
+ if (sgs->sum_nr_running <= sgs->group_weight &&
+ group_smaller_cpu_capacity(sds->local, sg))
+ return false;
+
/* This is the busiest node in its class. */
if (!(env->sd->flags & SD_ASYM_PACKING))
return true;
@@ -6139,7 +7328,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats tmp_sgs;
int load_idx, prefer_sibling = 0;
- bool overload = false;
+ bool overload = false, overutilized = false;
if (child && child->flags & SD_PREFER_SIBLING)
prefer_sibling = 1;
@@ -6161,24 +7350,36 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
}
update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
- &overload);
+ &overload, &overutilized);
if (local_group)
goto next_group;
/*
* In case the child domain prefers tasks go to siblings
- * first, lower the sg capacity factor to one so that we'll try
+ * first, lower the sg capacity so that we'll try
* and move all the excess tasks away. We lower the capacity
* of a group only if the local group has the capacity to fit
- * these excess tasks, i.e. nr_running < group_capacity_factor. The
- * extra check prevents the case where you always pull from the
- * heaviest group when it is already under-utilized (possible
- * with a large weight task outweighs the tasks on the system).
+ * these excess tasks. The extra check prevents the case where
+ * you always pull from the heaviest group when it is already
+ * under-utilized (possible with a large weight task outweighs
+ * the tasks on the system).
*/
if (prefer_sibling && sds->local &&
- sds->local_stat.group_has_free_capacity)
- sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U);
+ group_has_capacity(env, &sds->local_stat) &&
+ (sgs->sum_nr_running > 1)) {
+ sgs->group_no_capacity = 1;
+ sgs->group_type = group_overloaded;
+ }
+
+ /*
+ * Ignore task groups with misfit tasks if local group has no
+ * capacity or if per-cpu capacity isn't higher.
+ */
+ if (sgs->group_type == group_misfit_task &&
+ (!group_has_capacity(env, &sds->local_stat) ||
+ !group_smaller_cpu_capacity(sg, sds->local)))
+ sgs->group_type = group_other;
if (update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
@@ -6196,10 +7397,23 @@ next_group:
if (env->sd->flags & SD_NUMA)
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
+ env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
+
if (!env->sd->parent) {
/* update overload indicator if we are at root domain */
if (env->dst_rq->rd->overload != overload)
env->dst_rq->rd->overload = overload;
+
+ /* Update over-utilization (tipping point, U >= 0) indicator */
+ if (env->dst_rq->rd->overutilized != overutilized) {
+ env->dst_rq->rd->overutilized = overutilized;
+ trace_sched_overutilized(overutilized);
+ }
+ } else {
+ if (!env->dst_rq->rd->overutilized && overutilized) {
+ env->dst_rq->rd->overutilized = true;
+ trace_sched_overutilized(true);
+ }
}
}
@@ -6348,6 +7562,22 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
*/
if (busiest->avg_load <= sds->avg_load ||
local->avg_load >= sds->avg_load) {
+ /* Misfitting tasks should be migrated in any case */
+ if (busiest->group_type == group_misfit_task) {
+ env->imbalance = busiest->group_misfit_task;
+ return;
+ }
+
+ /*
+ * Busiest group is overloaded, local is not, use the spare
+ * cycles to maximize throughput
+ */
+ if (busiest->group_type == group_overloaded &&
+ local->group_type <= group_misfit_task) {
+ env->imbalance = busiest->load_per_task;
+ return;
+ }
+
env->imbalance = 0;
return fix_small_imbalance(env, sds);
}
@@ -6357,11 +7587,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
*/
if (busiest->group_type == group_overloaded &&
local->group_type == group_overloaded) {
- load_above_capacity =
- (busiest->sum_nr_running - busiest->group_capacity_factor);
-
- load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE);
- load_above_capacity /= busiest->group_capacity;
+ load_above_capacity = busiest->sum_nr_running *
+ SCHED_LOAD_SCALE;
+ if (load_above_capacity > busiest->group_capacity)
+ load_above_capacity -= busiest->group_capacity;
+ else
+ load_above_capacity = ~0UL;
}
/*
@@ -6380,6 +7611,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
(sds->avg_load - local->avg_load) * local->group_capacity
) / SCHED_CAPACITY_SCALE;
+ /* Boost imbalance to allow misfit task to be balanced. */
+ if (busiest->group_type == group_misfit_task)
+ env->imbalance = max_t(long, env->imbalance,
+ busiest->group_misfit_task);
+
/*
* if *imbalance is less than the average load per runnable task
* there is no guarantee that any tasks will be moved so we'll have
@@ -6421,9 +7657,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
* this level.
*/
update_sd_lb_stats(env, &sds);
+
+ if (energy_aware() && !env->dst_rq->rd->overutilized)
+ goto out_balanced;
+
local = &sds.local_stat;
busiest = &sds.busiest_stat;
+ /* ASYM feature bypasses nice load balance check */
if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
check_asym_packing(env, &sds))
return sds.busiest;
@@ -6444,9 +7685,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
goto force_balance;
/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
- if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity &&
- !busiest->group_has_free_capacity)
+ if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
+ busiest->group_no_capacity)
+ goto force_balance;
+
+ /* Misfitting tasks should be dealt with regardless of the avg load */
+ if (busiest->group_type == group_misfit_task) {
goto force_balance;
+ }
/*
* If the local group is busier than the selected busiest group
@@ -6471,7 +7717,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
* might end up to just move the imbalance on another group
*/
if ((busiest->group_type != group_overloaded) &&
- (local->idle_cpus <= (busiest->idle_cpus + 1)))
+ (local->idle_cpus <= (busiest->idle_cpus + 1)) &&
+ !group_smaller_cpu_capacity(sds.busiest, sds.local))
goto out_balanced;
} else {
/*
@@ -6484,6 +7731,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
}
force_balance:
+ env->busiest_group_type = busiest->group_type;
/* Looks like there is an imbalance. Compute it */
calculate_imbalance(env, &sds);
return sds.busiest;
@@ -6504,7 +7752,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
int i;
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
- unsigned long capacity, capacity_factor, wl;
+ unsigned long capacity, wl;
enum fbq_type rt;
rq = cpu_rq(i);
@@ -6533,9 +7781,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
continue;
capacity = capacity_of(i);
- capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE);
- if (!capacity_factor)
- capacity_factor = fix_small_capacity(env->sd, group);
wl = weighted_cpuload(i);
@@ -6543,7 +7788,10 @@ static struct rq *find_busiest_queue(struct lb_env *env,
* When comparing with imbalance, use weighted_cpuload()
* which is not scaled with the cpu capacity.
*/
- if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance)
+
+ if (rq->nr_running == 1 && wl > env->imbalance &&
+ !check_cpu_capacity(rq, env->sd) &&
+ env->busiest_group_type != group_misfit_task)
continue;
/*
@@ -6591,6 +7839,26 @@ static int need_active_balance(struct lb_env *env)
return 1;
}
+ /*
+ * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
+ * It's worth migrating the task if the src_cpu's capacity is reduced
+ * because of other sched_class or IRQs if more capacity stays
+ * available on dst_cpu.
+ */
+ if ((env->idle != CPU_NOT_IDLE) &&
+ (env->src_rq->cfs.h_nr_running == 1)) {
+ if ((check_cpu_capacity(env->src_rq, sd)) &&
+ (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
+ return 1;
+ }
+
+ if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
+ env->src_rq->cfs.h_nr_running == 1 &&
+ cpu_overutilized(env->src_cpu) &&
+ !cpu_overutilized(env->dst_cpu)) {
+ return 1;
+ }
+
return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
}
@@ -6690,6 +7958,9 @@ redo:
schedstat_add(sd, lb_imbalance[idle], env.imbalance);
+ env.src_cpu = busiest->cpu;
+ env.src_rq = busiest;
+
ld_moved = 0;
if (busiest->nr_running > 1) {
/*
@@ -6699,8 +7970,6 @@ redo:
* correctly treated as an imbalance.
*/
env.flags |= LBF_ALL_PINNED;
- env.src_cpu = busiest->cpu;
- env.src_rq = busiest;
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
more_balance:
@@ -6711,6 +7980,11 @@ more_balance:
* ld_moved - cumulative load moved across iterations
*/
cur_ld_moved = detach_tasks(&env);
+ /*
+ * We want to potentially lower env.src_cpu's OPP.
+ */
+ if (cur_ld_moved)
+ update_capacity_of(env.src_cpu);
/*
* We've detached some tasks from busiest_rq. Every
@@ -6802,7 +8076,8 @@ more_balance:
* excessive cache_hot migrations and active balances.
*/
if (idle != CPU_NEWLY_IDLE)
- sd->nr_balance_failed++;
+ if (env.src_grp_nr_running > 1)
+ sd->nr_balance_failed++;
if (need_active_balance(&env)) {
raw_spin_lock_irqsave(&busiest->lock, flags);
@@ -6934,6 +8209,7 @@ static int idle_balance(struct rq *this_rq)
struct sched_domain *sd;
int pulled_task = 0;
u64 curr_cost = 0;
+ long removed_util=0;
idle_enter_fair(this_rq);
@@ -6943,8 +8219,9 @@ static int idle_balance(struct rq *this_rq)
*/
this_rq->idle_stamp = rq_clock(this_rq);
- if (this_rq->avg_idle < sysctl_sched_migration_cost ||
- !this_rq->rd->overload) {
+ if (!energy_aware() &&
+ (this_rq->avg_idle < sysctl_sched_migration_cost ||
+ !this_rq->rd->overload)) {
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
if (sd)
@@ -6959,6 +8236,17 @@ static int idle_balance(struct rq *this_rq)
*/
raw_spin_unlock(&this_rq->lock);
+ /*
+ * If removed_util_avg is !0 we most probably migrated some task away
+ * from this_cpu. In this case we might be willing to trigger an OPP
+ * update, but we want to do so if we don't find anybody else to pull
+ * here (we will trigger an OPP update with the pulled task's enqueue
+ * anyway).
+ *
+ * Record removed_util before calling update_blocked_averages, and use
+ * it below (before returning) to see if an OPP update is required.
+ */
+ removed_util = atomic_long_read(&(this_rq->cfs).removed_util_avg);
update_blocked_averages(this_cpu);
rcu_read_lock();
for_each_domain(this_cpu, sd) {
@@ -7023,6 +8311,12 @@ out:
if (pulled_task) {
idle_exit_fair(this_rq);
this_rq->idle_stamp = 0;
+ } else if (removed_util) {
+ /*
+ * No task pulled and someone has been migrated away.
+ * Good case to trigger an OPP update.
+ */
+ update_capacity_of(this_cpu);
}
return pulled_task;
@@ -7082,8 +8376,13 @@ static int active_load_balance_cpu_stop(void *data)
schedstat_inc(sd, alb_count);
p = detach_one_task(&env);
- if (p)
+ if (p) {
schedstat_inc(sd, alb_pushed);
+ /*
+ * We want to potentially lower env.src_cpu's OPP.
+ */
+ update_capacity_of(env.src_cpu);
+ }
else
schedstat_inc(sd, alb_failed);
}
@@ -7345,8 +8644,22 @@ out:
* When the cpu is attached to null domain for ex, it will not be
* updated.
*/
- if (likely(update_next_balance))
+ if (likely(update_next_balance)) {
rq->next_balance = next_balance;
+
+#ifdef CONFIG_NO_HZ_COMMON
+ /*
+ * If this CPU has been elected to perform the nohz idle
+ * balance. Other idle CPUs have already rebalanced with
+ * nohz_idle_balance() and nohz.next_balance has been
+ * updated accordingly. This CPU is now running the idle load
+ * balance for itself and we need to update the
+ * nohz.next_balance accordingly.
+ */
+ if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
+ nohz.next_balance = rq->next_balance;
+#endif
+ }
}
#ifdef CONFIG_NO_HZ_COMMON
@@ -7359,6 +8672,9 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
int this_cpu = this_rq->cpu;
struct rq *rq;
int balance_cpu;
+ /* Earliest time when we have to do rebalance again */
+ unsigned long next_balance = jiffies + 60*HZ;
+ int update_next_balance = 0;
if (idle != CPU_IDLE ||
!test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
@@ -7390,32 +8706,44 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
rebalance_domains(rq, CPU_IDLE);
}
- if (time_after(this_rq->next_balance, rq->next_balance))
- this_rq->next_balance = rq->next_balance;
+ if (time_after(next_balance, rq->next_balance)) {
+ next_balance = rq->next_balance;
+ update_next_balance = 1;
+ }
}
- nohz.next_balance = this_rq->next_balance;
+
+ /*
+ * next_balance will be updated only when there is a need.
+ * When the CPU is attached to null domain for ex, it will not be
+ * updated.
+ */
+ if (likely(update_next_balance))
+ nohz.next_balance = next_balance;
end:
clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
}
/*
* Current heuristic for kicking the idle load balancer in the presence
- * of an idle cpu is the system.
+ * of an idle cpu in the system.
* - This rq has more than one task.
- * - At any scheduler domain level, this cpu's scheduler group has multiple
- * busy cpu's exceeding the group's capacity.
+ * - This rq has at least one CFS task and the capacity of the CPU is
+ * significantly reduced because of RT tasks or IRQs.
+ * - At parent of LLC scheduler domain level, this cpu's scheduler group has
+ * multiple busy cpu.
* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
* domain span are idle.
*/
-static inline int nohz_kick_needed(struct rq *rq)
+static inline bool nohz_kick_needed(struct rq *rq)
{
unsigned long now = jiffies;
struct sched_domain *sd;
struct sched_group_capacity *sgc;
int nr_busy, cpu = rq->cpu;
+ bool kick = false;
if (unlikely(rq->idle_balance))
- return 0;
+ return false;
/*
* We may be recently in ticked or tickless idle mode. At the first
@@ -7429,38 +8757,47 @@ static inline int nohz_kick_needed(struct rq *rq)
* balancing.
*/
if (likely(!atomic_read(&nohz.nr_cpus)))
- return 0;
+ return false;
if (time_before(now, nohz.next_balance))
- return 0;
+ return false;
- if (rq->nr_running >= 2)
- goto need_kick;
+ if (rq->nr_running >= 2 &&
+ (!energy_aware() || cpu_overutilized(cpu)))
+ return true;
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_busy, cpu));
-
- if (sd) {
+ if (sd && !energy_aware()) {
sgc = sd->groups->sgc;
nr_busy = atomic_read(&sgc->nr_busy_cpus);
- if (nr_busy > 1)
- goto need_kick_unlock;
+ if (nr_busy > 1) {
+ kick = true;
+ goto unlock;
+ }
+
}
- sd = rcu_dereference(per_cpu(sd_asym, cpu));
+ sd = rcu_dereference(rq->sd);
+ if (sd) {
+ if ((rq->cfs.h_nr_running >= 1) &&
+ check_cpu_capacity(rq, sd)) {
+ kick = true;
+ goto unlock;
+ }
+ }
+ sd = rcu_dereference(per_cpu(sd_asym, cpu));
if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
- sched_domain_span(sd)) < cpu))
- goto need_kick_unlock;
-
- rcu_read_unlock();
- return 0;
+ sched_domain_span(sd)) < cpu)) {
+ kick = true;
+ goto unlock;
+ }
-need_kick_unlock:
+unlock:
rcu_read_unlock();
-need_kick:
- return 1;
+ return kick;
}
#else
static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
@@ -7476,14 +8813,16 @@ static void run_rebalance_domains(struct softirq_action *h)
enum cpu_idle_type idle = this_rq->idle_balance ?
CPU_IDLE : CPU_NOT_IDLE;
- rebalance_domains(this_rq, idle);
-
/*
* If this cpu has a pending nohz_balance_kick, then do the
* balancing on behalf of the other idle cpus whose ticks are
- * stopped.
+ * stopped. Do nohz_idle_balance *before* rebalance_domains to
+ * give the idle cpus a chance to load balance. Else we may
+ * load balance only within the local sched_domain hierarchy
+ * and abort nohz_idle_balance altogether if we pull some load.
*/
nohz_idle_balance(this_rq, idle);
+ rebalance_domains(this_rq, idle);
}
/*
@@ -7536,7 +8875,15 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
if (numabalancing_enabled)
task_tick_numa(rq, curr);
- update_rq_runnable_avg(rq, 1);
+#ifdef CONFIG_SMP
+ if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) {
+ rq->rd->overutilized = true;
+ trace_sched_overutilized(true);
+ }
+
+ rq->misfit_task = !task_fits_max(curr, rq->cpu);
+#endif
+
}
/*
@@ -7611,21 +8958,39 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
check_preempt_curr(rq, p, 0);
}
-static void switched_from_fair(struct rq *rq, struct task_struct *p)
+static inline bool vruntime_normalized(struct task_struct *p)
{
struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
/*
- * Ensure the task's vruntime is normalized, so that when it's
- * switched back to the fair class the enqueue_entity(.flags=0) will
- * do the right thing.
+ * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
+ * the dequeue_entity(.flags=0) will already have normalized the
+ * vruntime.
+ */
+ if (p->on_rq)
+ return true;
+
+ /*
+ * When !on_rq, vruntime of the task has usually NOT been normalized.
+ * But there are some cases where it has already been normalized:
*
- * If it's queued, then the dequeue_entity(.flags=0) will already
- * have normalized the vruntime, if it's !queued, then only when
- * the task is sleeping will it still have non-normalized vruntime.
+ * - A forked child which is waiting for being woken up by
+ * wake_up_new_task().
+ * - A task which has been woken up by try_to_wake_up() and
+ * waiting for actually being woken up by sched_ttwu_pending().
*/
- if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) {
+ if (!se->sum_exec_runtime || p->state == TASK_WAKING)
+ return true;
+
+ return false;
+}
+
+static void detach_task_cfs_rq(struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ if (!vruntime_normalized(p)) {
/*
* Fix up our vruntime so that the current sleep doesn't
* cause 'unlimited' sleep bonus.
@@ -7634,44 +8999,50 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
se->vruntime -= cfs_rq->min_vruntime;
}
-#ifdef CONFIG_SMP
- /*
- * Remove our load from contribution when we leave sched_fair
- * and ensure we don't carry in an old decay_count if we
- * switch back.
- */
- if (se->avg.decay_count) {
- __synchronize_entity_decay(se);
- subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
- }
-#endif
+ /* Catch up with the cfs_rq and remove our load when we leave */
+ detach_entity_load_avg(cfs_rq, se);
}
-/*
- * We switched to the sched_fair class.
- */
-static void switched_to_fair(struct rq *rq, struct task_struct *p)
+static void attach_task_cfs_rq(struct task_struct *p)
{
-#ifdef CONFIG_FAIR_GROUP_SCHED
struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* Since the real-depth could have been changed (only FAIR
* class maintain depth value), reset depth properly.
*/
se->depth = se->parent ? se->parent->depth + 1 : 0;
#endif
- if (!task_on_rq_queued(p))
- return;
- /*
- * We were most likely switched from sched_rt, so
- * kick off the schedule if running, otherwise just see
- * if we can still preempt the current task.
- */
- if (rq->curr == p)
- resched_curr(rq);
- else
- check_preempt_curr(rq, p, 0);
+ /* Synchronize task with its cfs_rq */
+ attach_entity_load_avg(cfs_rq, se);
+
+ if (!vruntime_normalized(p))
+ se->vruntime += cfs_rq->min_vruntime;
+}
+
+static void switched_from_fair(struct rq *rq, struct task_struct *p)
+{
+ detach_task_cfs_rq(p);
+}
+
+static void switched_to_fair(struct rq *rq, struct task_struct *p)
+{
+ attach_task_cfs_rq(p);
+
+ if (task_on_rq_queued(p)) {
+ /*
+ * We were most likely switched from sched_rt, so
+ * kick off the schedule if running, otherwise just see
+ * if we can still preempt the current task.
+ */
+ if (rq->curr == p)
+ resched_curr(rq);
+ else
+ check_preempt_curr(rq, p, 0);
+ }
}
/* Account for a task changing its policy or group.
@@ -7700,62 +9071,22 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
#ifdef CONFIG_SMP
- atomic64_set(&cfs_rq->decay_counter, 1);
- atomic_long_set(&cfs_rq->removed_load, 0);
+ atomic_long_set(&cfs_rq->removed_load_avg, 0);
+ atomic_long_set(&cfs_rq->removed_util_avg, 0);
#endif
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-static void task_move_group_fair(struct task_struct *p, int queued)
+static void task_move_group_fair(struct task_struct *p)
{
- struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq;
-
- /*
- * If the task was not on the rq at the time of this cgroup movement
- * it must have been asleep, sleeping tasks keep their ->vruntime
- * absolute on their old rq until wakeup (needed for the fair sleeper
- * bonus in place_entity()).
- *
- * If it was on the rq, we've just 'preempted' it, which does convert
- * ->vruntime to a relative base.
- *
- * Make sure both cases convert their relative position when migrating
- * to another cgroup's rq. This does somewhat interfere with the
- * fair sleeper stuff for the first placement, but who cares.
- */
- /*
- * When !queued, vruntime of the task has usually NOT been normalized.
- * But there are some cases where it has already been normalized:
- *
- * - Moving a forked child which is waiting for being woken up by
- * wake_up_new_task().
- * - Moving a task which has been woken up by try_to_wake_up() and
- * waiting for actually being woken up by sched_ttwu_pending().
- *
- * To prevent boost or penalty in the new cfs_rq caused by delta
- * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
- */
- if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING))
- queued = 1;
-
- if (!queued)
- se->vruntime -= cfs_rq_of(se)->min_vruntime;
+ detach_task_cfs_rq(p);
set_task_rq(p, task_cpu(p));
- se->depth = se->parent ? se->parent->depth + 1 : 0;
- if (!queued) {
- cfs_rq = cfs_rq_of(se);
- se->vruntime += cfs_rq->min_vruntime;
+
#ifdef CONFIG_SMP
- /*
- * migrate_task_rq_fair() will have removed our previous
- * contribution, but we must synchronize for ongoing future
- * decay.
- */
- se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
- cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
+ /* Tell se's cfs_rq has been changed -- migrated */
+ p->se.avg.last_update_time = 0;
#endif
- }
+ attach_task_cfs_rq(p);
}
void free_fair_sched_group(struct task_group *tg)
@@ -7767,8 +9098,11 @@ void free_fair_sched_group(struct task_group *tg)
for_each_possible_cpu(i) {
if (tg->cfs_rq)
kfree(tg->cfs_rq[i]);
- if (tg->se)
+ if (tg->se) {
+ if (tg->se[i])
+ remove_entity_load_avg(tg->se[i]);
kfree(tg->se[i]);
+ }
}
kfree(tg->cfs_rq);
@@ -7805,6 +9139,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
init_cfs_rq(cfs_rq);
init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
+ init_entity_runnable_average(se);
}
return 1;
@@ -7954,6 +9289,7 @@ const struct sched_class fair_sched_class = {
.rq_offline = rq_offline_fair,
.task_waking = task_waking_fair,
+ .task_dead = task_dead_fair,
#endif
.set_curr_task = set_curr_task_fair,
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 90284d117fe6..1b90a408d483 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -36,11 +36,6 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
*/
SCHED_FEAT(WAKEUP_PREEMPTION, true)
-/*
- * Use arch dependent cpu capacity functions
- */
-SCHED_FEAT(ARCH_CAPACITY, true)
-
SCHED_FEAT(HRTICK, false)
SCHED_FEAT(DOUBLE_TICK, false)
SCHED_FEAT(LB_BIAS, true)
@@ -60,6 +55,8 @@ SCHED_FEAT(FORCE_SD_OVERLAP, false)
SCHED_FEAT(RT_RUNTIME_SHARE, true)
SCHED_FEAT(LB_MIN, false)
+SCHED_FEAT(ATTACH_AGE_LOAD, true)
+
/*
* Apply the automatic NUMA scheduling policy. Enabled automatically
* at runtime if running on a NUMA machine. Can be controlled via
@@ -83,3 +80,9 @@ SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
*/
SCHED_FEAT(NUMA_RESIST_LOWER, false)
#endif
+
+/*
+ * Energy aware scheduling. Use platform energy model to guide scheduling
+ * decisions optimizing for energy efficiency.
+ */
+SCHED_FEAT(ENERGY_AWARE, false)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c47fce75e666..c0f307c4734c 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -7,6 +7,7 @@
#include <linux/tick.h>
#include <linux/mm.h>
#include <linux/stackprotector.h>
+#include <linux/suspend.h>
#include <asm/tlb.h>
@@ -14,6 +15,16 @@
#include "sched.h"
+/**
+ * sched_idle_set_state - Record idle state for the current CPU.
+ * @idle_state: State to record.
+ */
+void sched_idle_set_state(struct cpuidle_state *idle_state, int index)
+{
+ idle_set_state(this_rq(), idle_state);
+ idle_set_state_idx(this_rq(), index);
+}
+
static int __read_mostly cpu_idle_force_poll;
void cpu_idle_poll_ctrl(bool enable)
@@ -104,24 +115,27 @@ static void cpuidle_idle_call(void)
rcu_idle_enter();
/*
- * Ask the cpuidle framework to choose a convenient idle state.
- * Fall back to the default arch idle method on errors.
+ * Suspend-to-idle ("freeze") is a system state in which all user space
+ * has been frozen, all I/O devices have been suspended and the only
+ * activity happens here and in iterrupts (if any). In that case bypass
+ * the cpuidle governor and go stratight for the deepest idle state
+ * available. Possibly also suspend the local tick and the entire
+ * timekeeping to prevent timer interrupts from kicking us out of idle
+ * until a proper wakeup interrupt happens.
*/
- next_state = cpuidle_select(drv, dev);
- if (next_state < 0) {
-use_default:
- /*
- * We can't use the cpuidle framework, let's use the default
- * idle routine.
- */
- if (current_clr_polling_and_test())
- local_irq_enable();
- else
- arch_cpu_idle();
-
+ if (idle_should_freeze()) {
+ cpuidle_enter_freeze();
+ local_irq_enable();
goto exit_idle;
}
+ /*
+ * Ask the cpuidle framework to choose a convenient idle state.
+ * Fall back to the default arch idle method on errors.
+ */
+ next_state = cpuidle_select(drv, dev);
+ if (next_state < 0)
+ goto use_default;
/*
* The idle task must be scheduled, it is pointless to
@@ -179,6 +193,19 @@ exit_idle:
rcu_idle_exit();
start_critical_timings();
+ return;
+
+use_default:
+ /*
+ * We can't use the cpuidle framework, let's use the default
+ * idle routine.
+ */
+ if (current_clr_polling_and_test())
+ local_irq_enable();
+ else
+ arch_cpu_idle();
+
+ goto exit_idle;
}
/*
@@ -199,6 +226,7 @@ static void cpu_idle_loop(void)
*/
__current_set_polling();
+ quiet_vmstat();
tick_nohz_idle_enter();
while (!need_resched()) {
diff --git a/kernel/sched/proc.c b/kernel/sched/loadavg.c
index 09dfe51f89b5..ef7159012cf3 100644
--- a/kernel/sched/proc.c
+++ b/kernel/sched/loadavg.c
@@ -1,7 +1,9 @@
/*
- * kernel/sched/proc.c
+ * kernel/sched/loadavg.c
*
- * Kernel load calculations, forked from sched/core.c
+ * This file contains the magic bits required to compute the global loadavg
+ * figure. Its a silly number but people think its important. We go through
+ * great pains to make it work on big machines and tickless kernels.
*/
#include <linux/export.h>
@@ -81,7 +83,7 @@ long calc_load_fold_active(struct rq *this_rq)
long nr_active, delta = 0;
nr_active = this_rq->nr_running;
- nr_active += (long) this_rq->nr_uninterruptible;
+ nr_active += (long)this_rq->nr_uninterruptible;
if (nr_active != this_rq->calc_load_active) {
delta = nr_active - this_rq->calc_load_active;
@@ -97,13 +99,10 @@ long calc_load_fold_active(struct rq *this_rq)
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
- unsigned long newload;
-
- newload = load * exp + active * (FIXED_1 - exp);
- if (active >= load)
- newload += FIXED_1-1;
-
- return newload / FIXED_1;
+ load *= exp;
+ load += active * (FIXED_1 - exp);
+ load += 1UL << (FSHIFT - 1);
+ return load >> FSHIFT;
}
#ifdef CONFIG_NO_HZ_COMMON
@@ -189,6 +188,7 @@ void calc_load_enter_idle(void)
delta = calc_load_fold_active(this_rq);
if (delta) {
int idx = calc_load_write_idx();
+
atomic_long_add(delta, &calc_load_idle[idx]);
}
}
@@ -244,18 +244,20 @@ fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
unsigned long result = 1UL << frac_bits;
- if (n) for (;;) {
- if (n & 1) {
- result *= x;
- result += 1UL << (frac_bits - 1);
- result >>= frac_bits;
+ if (n) {
+ for (;;) {
+ if (n & 1) {
+ result *= x;
+ result += 1UL << (frac_bits - 1);
+ result >>= frac_bits;
+ }
+ n >>= 1;
+ if (!n)
+ break;
+ x *= x;
+ x += 1UL << (frac_bits - 1);
+ x >>= frac_bits;
}
- n >>= 1;
- if (!n)
- break;
- x *= x;
- x += 1UL << (frac_bits - 1);
- x >>= frac_bits;
}
return result;
@@ -288,7 +290,6 @@ static unsigned long
calc_load_n(unsigned long load, unsigned long exp,
unsigned long active, unsigned int n)
{
-
return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
}
@@ -342,6 +343,8 @@ static inline void calc_global_nohz(void) { }
/*
* calc_load - update the avenrun load estimates 10 ticks after the
* CPUs have updated calc_load_tasks.
+ *
+ * Called from the global timer code.
*/
void calc_global_load(unsigned long ticks)
{
@@ -373,10 +376,10 @@ void calc_global_load(unsigned long ticks)
}
/*
- * Called from update_cpu_load() to periodically update this CPU's
+ * Called from scheduler_tick() to periodically update this CPU's
* active count.
*/
-static void calc_load_account_active(struct rq *this_rq)
+void calc_global_load_tick(struct rq *this_rq)
{
long delta;
@@ -389,199 +392,3 @@ static void calc_load_account_active(struct rq *this_rq)
this_rq->calc_load_update += LOAD_FREQ;
}
-
-/*
- * End of global load-average stuff
- */
-
-/*
- * The exact cpuload at various idx values, calculated at every tick would be
- * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
- *
- * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
- * on nth tick when cpu may be busy, then we have:
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
- *
- * decay_load_missed() below does efficient calculation of
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
- *
- * The calculation is approximated on a 128 point scale.
- * degrade_zero_ticks is the number of ticks after which load at any
- * particular idx is approximated to be zero.
- * degrade_factor is a precomputed table, a row for each load idx.
- * Each column corresponds to degradation factor for a power of two ticks,
- * based on 128 point scale.
- * Example:
- * row 2, col 3 (=12) says that the degradation at load idx 2 after
- * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
- *
- * With this power of 2 load factors, we can degrade the load n times
- * by looking at 1 bits in n and doing as many mult/shift instead of
- * n mult/shifts needed by the exact degradation.
- */
-#define DEGRADE_SHIFT 7
-static const unsigned char
- degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
-static const unsigned char
- degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
- {0, 0, 0, 0, 0, 0, 0, 0},
- {64, 32, 8, 0, 0, 0, 0, 0},
- {96, 72, 40, 12, 1, 0, 0},
- {112, 98, 75, 43, 15, 1, 0},
- {120, 112, 98, 76, 45, 16, 2} };
-
-/*
- * Update cpu_load for any missed ticks, due to tickless idle. The backlog
- * would be when CPU is idle and so we just decay the old load without
- * adding any new load.
- */
-static unsigned long
-decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
-{
- int j = 0;
-
- if (!missed_updates)
- return load;
-
- if (missed_updates >= degrade_zero_ticks[idx])
- return 0;
-
- if (idx == 1)
- return load >> missed_updates;
-
- while (missed_updates) {
- if (missed_updates % 2)
- load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
-
- missed_updates >>= 1;
- j++;
- }
- return load;
-}
-
-/*
- * Update rq->cpu_load[] statistics. This function is usually called every
- * scheduler tick (TICK_NSEC). With tickless idle this will not be called
- * every tick. We fix it up based on jiffies.
- */
-static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
- unsigned long pending_updates)
-{
- int i, scale;
-
- this_rq->nr_load_updates++;
-
- /* Update our load: */
- this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
- for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
- unsigned long old_load, new_load;
-
- /* scale is effectively 1 << i now, and >> i divides by scale */
-
- old_load = this_rq->cpu_load[i];
- old_load = decay_load_missed(old_load, pending_updates - 1, i);
- new_load = this_load;
- /*
- * Round up the averaging division if load is increasing. This
- * prevents us from getting stuck on 9 if the load is 10, for
- * example.
- */
- if (new_load > old_load)
- new_load += scale - 1;
-
- this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
- }
-
- sched_avg_update(this_rq);
-}
-
-#ifdef CONFIG_SMP
-static inline unsigned long get_rq_runnable_load(struct rq *rq)
-{
- return rq->cfs.runnable_load_avg;
-}
-#else
-static inline unsigned long get_rq_runnable_load(struct rq *rq)
-{
- return rq->load.weight;
-}
-#endif
-
-#ifdef CONFIG_NO_HZ_COMMON
-/*
- * There is no sane way to deal with nohz on smp when using jiffies because the
- * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
- * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
- *
- * Therefore we cannot use the delta approach from the regular tick since that
- * would seriously skew the load calculation. However we'll make do for those
- * updates happening while idle (nohz_idle_balance) or coming out of idle
- * (tick_nohz_idle_exit).
- *
- * This means we might still be one tick off for nohz periods.
- */
-
-/*
- * Called from nohz_idle_balance() to update the load ratings before doing the
- * idle balance.
- */
-void update_idle_cpu_load(struct rq *this_rq)
-{
- unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
- unsigned long load = get_rq_runnable_load(this_rq);
- unsigned long pending_updates;
-
- /*
- * bail if there's load or we're actually up-to-date.
- */
- if (load || curr_jiffies == this_rq->last_load_update_tick)
- return;
-
- pending_updates = curr_jiffies - this_rq->last_load_update_tick;
- this_rq->last_load_update_tick = curr_jiffies;
-
- __update_cpu_load(this_rq, load, pending_updates);
-}
-
-/*
- * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
- */
-void update_cpu_load_nohz(void)
-{
- struct rq *this_rq = this_rq();
- unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
- unsigned long pending_updates;
-
- if (curr_jiffies == this_rq->last_load_update_tick)
- return;
-
- raw_spin_lock(&this_rq->lock);
- pending_updates = curr_jiffies - this_rq->last_load_update_tick;
- if (pending_updates) {
- this_rq->last_load_update_tick = curr_jiffies;
- /*
- * We were idle, this means load 0, the current load might be
- * !0 due to remote wakeups and the sort.
- */
- __update_cpu_load(this_rq, 0, pending_updates);
- }
- raw_spin_unlock(&this_rq->lock);
-}
-#endif /* CONFIG_NO_HZ */
-
-/*
- * Called from scheduler_tick()
- */
-void update_cpu_load_active(struct rq *this_rq)
-{
- unsigned long load = get_rq_runnable_load(this_rq);
- /*
- * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
- */
- this_rq->last_load_update_tick = jiffies;
- __update_cpu_load(this_rq, load, 1);
-
- calc_load_account_active(this_rq);
-}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 20bca398084a..24ad9f1deb83 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -7,6 +7,8 @@
#include <linux/slab.h>
+#include "walt.h"
+
int sched_rr_timeslice = RR_TIMESLICE;
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
@@ -871,6 +873,51 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)
return rt_task_of(rt_se)->prio;
}
+static void dump_throttled_rt_tasks(struct rt_rq *rt_rq)
+{
+ struct rt_prio_array *array = &rt_rq->active;
+ struct sched_rt_entity *rt_se;
+ char buf[500];
+ char *pos = buf;
+ char *end = buf + sizeof(buf);
+ int idx;
+
+ pos += snprintf(pos, sizeof(buf),
+ "sched: RT throttling activated for rt_rq %p (cpu %d)\n",
+ rt_rq, cpu_of(rq_of_rt_rq(rt_rq)));
+
+ if (bitmap_empty(array->bitmap, MAX_RT_PRIO))
+ goto out;
+
+ pos += snprintf(pos, end - pos, "potential CPU hogs:\n");
+ idx = sched_find_first_bit(array->bitmap);
+ while (idx < MAX_RT_PRIO) {
+ list_for_each_entry(rt_se, array->queue + idx, run_list) {
+ struct task_struct *p;
+
+ if (!rt_entity_is_task(rt_se))
+ continue;
+
+ p = rt_task_of(rt_se);
+ if (pos < end)
+ pos += snprintf(pos, end - pos, "\t%s (%d)\n",
+ p->comm, p->pid);
+ }
+ idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx + 1);
+ }
+out:
+#ifdef CONFIG_PANIC_ON_RT_THROTTLING
+ /*
+ * Use pr_err() in the BUG() case since printk_sched() will
+ * not get flushed and deadlock is not a concern.
+ */
+ pr_err("%s", buf);
+ BUG();
+#else
+ printk_deferred("%s", buf);
+#endif
+}
+
static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
{
u64 runtime = sched_rt_runtime(rt_rq);
@@ -894,8 +941,14 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
* but accrue some time due to boosting.
*/
if (likely(rt_b->rt_runtime)) {
+ static bool once = false;
+
rt_rq->rt_throttled = 1;
- printk_deferred_once("sched: RT throttling activated\n");
+
+ if (!once) {
+ once = true;
+ dump_throttled_rt_tasks(rt_rq);
+ }
} else {
/*
* In case we did anyway, make it go away,
@@ -1243,6 +1296,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
rt_se->timeout = 0;
enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
+ walt_inc_cumulative_runnable_avg(rq, p);
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
@@ -1254,6 +1308,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
update_curr_rt(rq);
dequeue_rt_entity(rt_se);
+ walt_dec_cumulative_runnable_avg(rq, p);
dequeue_pushable_task(rq, p);
}
@@ -1311,7 +1366,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
rq = cpu_rq(cpu);
rcu_read_lock();
- curr = ACCESS_ONCE(rq->curr); /* unlocked access */
+ curr = READ_ONCE(rq->curr); /* unlocked access */
/*
* If the current task on @p's runqueue is an RT task, then
@@ -1340,7 +1395,12 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
curr->prio <= p->prio)) {
int target = find_lowest_rq(p);
- if (target != -1)
+ /*
+ * Possible race. Don't bother moving it if the
+ * destination CPU is not running a lower priority task.
+ */
+ if (target != -1 &&
+ p->prio < cpu_rq(target)->rt.highest_prio.curr)
cpu = target;
}
rcu_read_unlock();
@@ -1400,6 +1460,41 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
#endif
}
+#ifdef CONFIG_SMP
+static void sched_rt_update_capacity_req(struct rq *rq)
+{
+ u64 total, used, age_stamp, avg;
+ s64 delta;
+
+ if (!sched_freq())
+ return;
+
+ sched_avg_update(rq);
+ /*
+ * Since we're reading these variables without serialization make sure
+ * we read them once before doing sanity checks on them.
+ */
+ age_stamp = READ_ONCE(rq->age_stamp);
+ avg = READ_ONCE(rq->rt_avg);
+ delta = rq_clock(rq) - age_stamp;
+
+ if (unlikely(delta < 0))
+ delta = 0;
+
+ total = sched_avg_period() + delta;
+
+ used = div_u64(avg, total);
+ if (unlikely(used > SCHED_CAPACITY_SCALE))
+ used = SCHED_CAPACITY_SCALE;
+
+ set_rt_cpu_capacity(rq->cpu, 1, (unsigned long)(used));
+}
+#else
+static inline void sched_rt_update_capacity_req(struct rq *rq)
+{ }
+
+#endif
+
static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
struct rt_rq *rt_rq)
{
@@ -1460,8 +1555,17 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
if (prev->sched_class == &rt_sched_class)
update_curr_rt(rq);
- if (!rt_rq->rt_queued)
+ if (!rt_rq->rt_queued) {
+ /*
+ * The next task to be picked on this rq will have a lower
+ * priority than rt tasks so we can spend some time to update
+ * the capacity used by rt tasks based on the last activity.
+ * This value will be the used as an estimation of the next
+ * activity.
+ */
+ sched_rt_update_capacity_req(rq);
return NULL;
+ }
put_prev_task(rq, prev);
@@ -2051,6 +2155,9 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
update_curr_rt(rq);
+ if (rq->rt.rt_nr_running)
+ sched_rt_update_capacity_req(rq);
+
watchdog(rq, p);
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f698089e10ca..ae835091fff4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -25,8 +25,14 @@ extern __read_mostly int scheduler_running;
extern unsigned long calc_load_update;
extern atomic_long_t calc_load_tasks;
+extern void calc_global_load_tick(struct rq *this_rq);
extern long calc_load_fold_active(struct rq *this_rq);
+
+#ifdef CONFIG_SMP
extern void update_cpu_load_active(struct rq *this_rq);
+#else
+static inline void update_cpu_load_active(struct rq *this_rq) { }
+#endif
/*
* Helpers for converting nanosecond timing to jiffy resolution
@@ -217,8 +223,12 @@ struct task_group {
unsigned long shares;
#ifdef CONFIG_SMP
- atomic_long_t load_avg;
- atomic_t runnable_avg;
+ /*
+ * load_avg can be heavily contended at clock tick time, so put
+ * it in its own cacheline separated from the fields above which
+ * will also be accessed at each tick.
+ */
+ atomic_long_t load_avg ____cacheline_aligned;
#endif
#endif
@@ -305,7 +315,15 @@ extern void sched_move_task(struct task_struct *tsk);
#ifdef CONFIG_FAIR_GROUP_SCHED
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
-#endif
+
+#ifdef CONFIG_SMP
+extern void set_task_rq_fair(struct sched_entity *se,
+ struct cfs_rq *prev, struct cfs_rq *next);
+#else /* !CONFIG_SMP */
+static inline void set_task_rq_fair(struct sched_entity *se,
+ struct cfs_rq *prev, struct cfs_rq *next) { }
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
#else /* CONFIG_CGROUP_SCHED */
@@ -339,21 +357,20 @@ struct cfs_rq {
#ifdef CONFIG_SMP
/*
- * CFS Load tracking
- * Under CFS, load is tracked on a per-entity basis and aggregated up.
- * This allows for the description of both thread and group usage (in
- * the FAIR_GROUP_SCHED case).
+ * CFS load tracking
*/
- unsigned long runnable_load_avg, blocked_load_avg;
- atomic64_t decay_counter;
- u64 last_decay;
- atomic_long_t removed_load;
-
+ struct sched_avg avg;
+ u64 runnable_load_sum;
+ unsigned long runnable_load_avg;
#ifdef CONFIG_FAIR_GROUP_SCHED
- /* Required to track per-cpu representation of a task_group */
- u32 tg_runnable_contrib;
- unsigned long tg_load_contrib;
+ unsigned long tg_load_avg_contrib;
+#endif
+ atomic_long_t removed_load_avg, removed_util_avg;
+#ifndef CONFIG_64BIT
+ u64 load_last_update_time_copy;
+#endif
+#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* h_load = weight * f(tg)
*
@@ -381,6 +398,10 @@ struct cfs_rq {
struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */
+#ifdef CONFIG_SCHED_WALT
+ u64 cumulative_runnable_avg;
+#endif
+
#ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
u64 runtime_expires;
@@ -466,10 +487,18 @@ struct dl_rq {
#else
struct dl_bw dl_bw;
#endif
+ /* This is the "average utilization" for this runqueue */
+ s64 avg_bw;
};
#ifdef CONFIG_SMP
+struct max_cpu_capacity {
+ raw_spinlock_t lock;
+ unsigned long val;
+ int cpu;
+};
+
/*
* We add the notion of a root-domain which will be used to define per-domain
* variables. Each exclusive cpuset essentially defines an island domain by
@@ -488,6 +517,9 @@ struct root_domain {
/* Indicate more than one runnable task for any CPU */
bool overload;
+ /* Indicate one or more cpus over-utilized (tipping point) */
+ bool overutilized;
+
/*
* The bit corresponding to a CPU gets set here if such CPU has more
* than one runnable -deadline task (as it is below for RT tasks).
@@ -503,6 +535,9 @@ struct root_domain {
*/
cpumask_var_t rto_mask;
struct cpupri cpupri;
+
+ /* Maximum cpu capacity in the system. */
+ struct max_cpu_capacity max_cpu_capacity;
};
extern struct root_domain def_root_domain;
@@ -532,6 +567,7 @@ struct rq {
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
unsigned long last_load_update_tick;
+ unsigned int misfit_task;
#ifdef CONFIG_NO_HZ_COMMON
u64 nohz_stamp;
unsigned long nohz_flags;
@@ -541,6 +577,13 @@ struct rq {
#endif
int skip_clock_update;
+#ifdef CONFIG_CPU_QUIET
+ /* time-based average load */
+ u64 nr_last_stamp;
+ u64 nr_running_integral;
+ seqcount_t ave_seqcnt;
+#endif
+
/* capture load from *all* tasks on this cpu: */
struct load_weight load;
unsigned long nr_load_updates;
@@ -553,8 +596,6 @@ struct rq {
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
struct list_head leaf_cfs_rq_list;
-
- struct sched_avg avg;
#endif /* CONFIG_FAIR_GROUP_SCHED */
/*
@@ -579,6 +620,7 @@ struct rq {
struct sched_domain *sd;
unsigned long cpu_capacity;
+ unsigned long cpu_capacity_orig;
unsigned char idle_balance;
/* For active balancing */
@@ -601,6 +643,28 @@ struct rq {
u64 max_idle_balance_cost;
#endif
+#ifdef CONFIG_SCHED_WALT
+ /*
+ * max_freq = user or thermal defined maximum
+ * max_possible_freq = maximum supported by hardware
+ */
+ unsigned int cur_freq, max_freq, min_freq, max_possible_freq;
+ struct cpumask freq_domain_cpumask;
+
+ u64 cumulative_runnable_avg;
+ int efficiency; /* Differentiate cpus with different IPC capability */
+ int load_scale_factor;
+ int capacity;
+ int max_possible_capacity;
+ u64 window_start;
+ u64 curr_runnable_sum;
+ u64 prev_runnable_sum;
+ u64 cur_irqload;
+ u64 avg_irqload;
+ u64 irqload_ts;
+#endif /* CONFIG_SCHED_WALT */
+
+
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
#endif
@@ -648,6 +712,7 @@ struct rq {
#ifdef CONFIG_CPU_IDLE
/* Must be inspected within a rcu lock section */
struct cpuidle_state *idle_state;
+ int idle_state_idx;
#endif
};
@@ -668,13 +733,20 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define raw_rq() raw_cpu_ptr(&runqueues)
+static inline u64 __rq_clock_broken(struct rq *rq)
+{
+ return READ_ONCE(rq->clock);
+}
+
static inline u64 rq_clock(struct rq *rq)
{
+ lockdep_assert_held(&rq->lock);
return rq->clock;
}
static inline u64 rq_clock_task(struct rq *rq)
{
+ lockdep_assert_held(&rq->lock);
return rq->clock_task;
}
@@ -745,6 +817,8 @@ DECLARE_PER_CPU(int, sd_llc_id);
DECLARE_PER_CPU(struct sched_domain *, sd_numa);
DECLARE_PER_CPU(struct sched_domain *, sd_busy);
DECLARE_PER_CPU(struct sched_domain *, sd_asym);
+DECLARE_PER_CPU(struct sched_domain *, sd_ea);
+DECLARE_PER_CPU(struct sched_domain *, sd_scs);
struct sched_group_capacity {
atomic_t ref;
@@ -752,7 +826,8 @@ struct sched_group_capacity {
* CPU capacity of this group, SCHED_LOAD_SCALE being max capacity
* for a single CPU.
*/
- unsigned int capacity, capacity_orig;
+ unsigned long capacity;
+ unsigned long max_capacity; /* Max per-cpu capacity in group */
unsigned long next_update;
int imbalance; /* XXX unrelated to capacity but shared group state */
/*
@@ -769,6 +844,7 @@ struct sched_group {
unsigned int group_weight;
struct sched_group_capacity *sgc;
+ const struct sched_group_energy *sge;
/*
* The CPUs this group covers.
@@ -842,6 +918,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
+ set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]);
p->se.cfs_rq = tg->cfs_rq[cpu];
p->se.parent = tg->se[cpu];
#endif
@@ -1080,7 +1157,9 @@ static const u32 prio_to_wmult[40] = {
#else
#define ENQUEUE_WAKING 0
#endif
-#define ENQUEUE_REPLENISH 8
+#define ENQUEUE_REPLENISH 0x08
+#define ENQUEUE_RESTORE 0x10
+#define ENQUEUE_WAKEUP_NEW 0x20
#define DEQUEUE_SLEEP 1
@@ -1128,6 +1207,11 @@ struct sched_class {
void (*task_fork) (struct task_struct *p);
void (*task_dead) (struct task_struct *p);
+ /*
+ * The switched_from() call is allowed to drop rq->lock, therefore we
+ * cannot assume the switched_from/switched_to pair is serliazed by
+ * rq->lock. They are however serialized by p->pi_lock.
+ */
void (*switched_from) (struct rq *this_rq, struct task_struct *task);
void (*switched_to) (struct rq *this_rq, struct task_struct *task);
void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
@@ -1139,7 +1223,7 @@ struct sched_class {
void (*update_curr) (struct rq *rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
- void (*task_move_group) (struct task_struct *p, int on_rq);
+ void (*task_move_group) (struct task_struct *p);
#endif
};
@@ -1161,6 +1245,7 @@ extern const struct sched_class idle_sched_class;
#ifdef CONFIG_SMP
+extern void init_max_cpu_capacity(struct max_cpu_capacity *mcc);
extern void update_group_capacity(struct sched_domain *sd, int cpu);
extern void trigger_load_balance(struct rq *rq);
@@ -1187,6 +1272,17 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq)
WARN_ON(!rcu_read_lock_held());
return rq->idle_state;
}
+
+static inline void idle_set_state_idx(struct rq *rq, int idle_state_idx)
+{
+ rq->idle_state_idx = idle_state_idx;
+}
+
+static inline int idle_get_state_idx(struct rq *rq)
+{
+ WARN_ON(!rcu_read_lock_held());
+ return rq->idle_state_idx;
+}
#else
static inline void idle_set_state(struct rq *rq,
struct cpuidle_state *idle_state)
@@ -1197,6 +1293,15 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq)
{
return NULL;
}
+
+static inline void idle_set_state_idx(struct rq *rq, int idle_state_idx)
+{
+}
+
+static inline int idle_get_state_idx(struct rq *rq)
+{
+ return -1;
+}
#endif
extern void sysrq_sched_debug_show(void);
@@ -1220,11 +1325,9 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
unsigned long to_ratio(u64 period, u64 runtime);
-extern void update_idle_cpu_load(struct rq *this_rq);
-
-extern void init_task_runnable_average(struct task_struct *p);
+extern void init_entity_runnable_average(struct sched_entity *se);
-static inline void add_nr_running(struct rq *rq, unsigned count)
+static inline void __add_nr_running(struct rq *rq, unsigned count)
{
unsigned prev_nr = rq->nr_running;
@@ -1252,11 +1355,48 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
}
}
-static inline void sub_nr_running(struct rq *rq, unsigned count)
+static inline void __sub_nr_running(struct rq *rq, unsigned count)
{
rq->nr_running -= count;
}
+#ifdef CONFIG_CPU_QUIET
+#define NR_AVE_SCALE(x) ((x) << FSHIFT)
+static inline u64 do_nr_running_integral(struct rq *rq)
+{
+ s64 nr, deltax;
+ u64 nr_running_integral = rq->nr_running_integral;
+
+ deltax = rq->clock_task - rq->nr_last_stamp;
+ nr = NR_AVE_SCALE(rq->nr_running);
+
+ nr_running_integral += nr * deltax;
+
+ return nr_running_integral;
+}
+
+static inline void add_nr_running(struct rq *rq, unsigned count)
+{
+ write_seqcount_begin(&rq->ave_seqcnt);
+ rq->nr_running_integral = do_nr_running_integral(rq);
+ rq->nr_last_stamp = rq->clock_task;
+ __add_nr_running(rq, count);
+ write_seqcount_end(&rq->ave_seqcnt);
+}
+
+static inline void sub_nr_running(struct rq *rq, unsigned count)
+{
+ write_seqcount_begin(&rq->ave_seqcnt);
+ rq->nr_running_integral = do_nr_running_integral(rq);
+ rq->nr_last_stamp = rq->clock_task;
+ __sub_nr_running(rq, count);
+ write_seqcount_end(&rq->ave_seqcnt);
+}
+#else
+#define add_nr_running __add_nr_running
+#define sub_nr_running __sub_nr_running
+#endif
+
static inline void rq_last_tick_reset(struct rq *rq)
{
#ifdef CONFIG_NO_HZ_FULL
@@ -1309,10 +1449,166 @@ static inline int hrtick_enabled(struct rq *rq)
#ifdef CONFIG_SMP
extern void sched_avg_update(struct rq *rq);
+
+#ifndef arch_scale_freq_capacity
+static __always_inline
+unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
+{
+ return SCHED_CAPACITY_SCALE;
+}
+#endif
+
+#ifndef arch_scale_cpu_capacity
+static __always_inline
+unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
+{
+ if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
+ return sd->smt_gain / sd->span_weight;
+
+ return SCHED_CAPACITY_SCALE;
+}
+#endif
+
+#ifdef CONFIG_SMP
+static inline unsigned long capacity_of(int cpu)
+{
+ return cpu_rq(cpu)->cpu_capacity;
+}
+
+static inline unsigned long capacity_orig_of(int cpu)
+{
+ return cpu_rq(cpu)->cpu_capacity_orig;
+}
+
+extern unsigned int sysctl_sched_use_walt_cpu_util;
+extern unsigned int walt_ravg_window;
+extern unsigned int walt_disabled;
+
+/*
+ * cpu_util returns the amount of capacity of a CPU that is used by CFS
+ * tasks. The unit of the return value must be the one of capacity so we can
+ * compare the utilization with the capacity of the CPU that is available for
+ * CFS task (ie cpu_capacity).
+ *
+ * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
+ * recent utilization of currently non-runnable tasks on a CPU. It represents
+ * the amount of utilization of a CPU in the range [0..capacity_orig] where
+ * capacity_orig is the cpu_capacity available at the highest frequency
+ * (arch_scale_freq_capacity()).
+ * The utilization of a CPU converges towards a sum equal to or less than the
+ * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
+ * the running time on this CPU scaled by capacity_curr.
+ *
+ * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
+ * higher than capacity_orig because of unfortunate rounding in
+ * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
+ * the average stabilizes with the new running time. We need to check that the
+ * utilization stays within the range of [0..capacity_orig] and cap it if
+ * necessary. Without utilization capping, a group could be seen as overloaded
+ * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
+ * available capacity. We allow utilization to overshoot capacity_curr (but not
+ * capacity_orig) as it useful for predicting the capacity required after task
+ * migrations (scheduler-driven DVFS).
+ */
+static inline unsigned long __cpu_util(int cpu, int delta)
+{
+ unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
+ unsigned long capacity = capacity_orig_of(cpu);
+
+#ifdef CONFIG_SCHED_WALT
+ if (!walt_disabled && sysctl_sched_use_walt_cpu_util) {
+ util = cpu_rq(cpu)->prev_runnable_sum << SCHED_LOAD_SHIFT;
+ do_div(util, walt_ravg_window);
+ }
+#endif
+ delta += util;
+ if (delta < 0)
+ return 0;
+
+ return (delta >= capacity) ? capacity : delta;
+}
+
+static inline unsigned long cpu_util(int cpu)
+{
+ return __cpu_util(cpu, 0);
+}
+
+#endif
+
+#ifdef CONFIG_CPU_FREQ_GOV_SCHED
+#define capacity_max SCHED_CAPACITY_SCALE
+extern unsigned int capacity_margin;
+extern struct static_key __sched_freq;
+
+static inline bool sched_freq(void)
+{
+ return static_key_false(&__sched_freq);
+}
+
+DECLARE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs);
+void update_cpu_capacity_request(int cpu, bool request);
+
+static inline void set_cfs_cpu_capacity(int cpu, bool request,
+ unsigned long capacity)
+{
+ struct sched_capacity_reqs *scr = &per_cpu(cpu_sched_capacity_reqs, cpu);
+
+#ifdef CONFIG_SCHED_WALT
+ if (!walt_disabled && sysctl_sched_use_walt_cpu_util) {
+ int rtdl = scr->rt + scr->dl;
+ /*
+ * WALT tracks the utilization of a CPU considering the load
+ * generated by all the scheduling classes.
+ * Since the following call to:
+ * update_cpu_capacity
+ * is already adding the RT and DL utilizations let's remove
+ * these contributions from the WALT signal.
+ */
+ if (capacity > rtdl)
+ capacity -= rtdl;
+ else
+ capacity = 0;
+ }
+#endif
+ if (scr->cfs != capacity) {
+ scr->cfs = capacity;
+ update_cpu_capacity_request(cpu, request);
+ }
+}
+
+static inline void set_rt_cpu_capacity(int cpu, bool request,
+ unsigned long capacity)
+{
+ if (per_cpu(cpu_sched_capacity_reqs, cpu).rt != capacity) {
+ per_cpu(cpu_sched_capacity_reqs, cpu).rt = capacity;
+ update_cpu_capacity_request(cpu, request);
+ }
+}
+
+static inline void set_dl_cpu_capacity(int cpu, bool request,
+ unsigned long capacity)
+{
+ if (per_cpu(cpu_sched_capacity_reqs, cpu).dl != capacity) {
+ per_cpu(cpu_sched_capacity_reqs, cpu).dl = capacity;
+ update_cpu_capacity_request(cpu, request);
+ }
+}
+#else
+static inline bool sched_freq(void) { return false; }
+static inline void set_cfs_cpu_capacity(int cpu, bool request,
+ unsigned long capacity)
+{ }
+static inline void set_rt_cpu_capacity(int cpu, bool request,
+ unsigned long capacity)
+{ }
+static inline void set_dl_cpu_capacity(int cpu, bool request,
+ unsigned long capacity)
+{ }
+#endif
+
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
- rq->rt_avg += rt_delta;
- sched_avg_update(rq);
+ rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
}
#else
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
@@ -1321,6 +1617,9 @@ static inline void sched_avg_update(struct rq *rq) { }
extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
+extern struct rq *lock_rq_of(struct task_struct *p, unsigned long *flags);
+extern void unlock_rq_of(struct rq *rq, struct task_struct *p, unsigned long *flags);
+
#ifdef CONFIG_SMP
#ifdef CONFIG_PREEMPT
@@ -1393,7 +1692,8 @@ static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
__releases(busiest->lock)
{
- raw_spin_unlock(&busiest->lock);
+ if (this_rq != busiest)
+ raw_spin_unlock(&busiest->lock);
lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
}
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 79ffec45a6ac..65e8c17cc219 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -1,4 +1,5 @@
#include "sched.h"
+#include "walt.h"
/*
* stop-task scheduling class.
@@ -42,12 +43,14 @@ static void
enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
add_nr_running(rq, 1);
+ walt_inc_cumulative_runnable_avg(rq, p);
}
static void
dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
sub_nr_running(rq, 1);
+ walt_dec_cumulative_runnable_avg(rq, p);
}
static void yield_task_stop(struct rq *rq)
diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
new file mode 100644
index 000000000000..1fa48a77acc3
--- /dev/null
+++ b/kernel/sched/tune.c
@@ -0,0 +1,950 @@
+#include <linux/cgroup.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/printk.h>
+#include <linux/rcupdate.h>
+#include <linux/slab.h>
+
+#include <trace/events/sched.h>
+
+#include "sched.h"
+#include "tune.h"
+
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+static bool schedtune_initialized = false;
+#endif
+
+int sysctl_sched_cfs_boost __read_mostly;
+
+extern struct target_nrg schedtune_target_nrg;
+
+/* Performance Boost region (B) threshold params */
+static int perf_boost_idx;
+
+/* Performance Constraint region (C) threshold params */
+static int perf_constrain_idx;
+
+/**
+ * Performance-Energy (P-E) Space thresholds constants
+ */
+struct threshold_params {
+ int nrg_gain;
+ int cap_gain;
+};
+
+/*
+ * System specific P-E space thresholds constants
+ */
+static struct threshold_params
+threshold_gains[] = {
+ { 0, 5 }, /* < 10% */
+ { 1, 5 }, /* < 20% */
+ { 2, 5 }, /* < 30% */
+ { 3, 5 }, /* < 40% */
+ { 4, 5 }, /* < 50% */
+ { 5, 4 }, /* < 60% */
+ { 5, 3 }, /* < 70% */
+ { 5, 2 }, /* < 80% */
+ { 5, 1 }, /* < 90% */
+ { 5, 0 } /* <= 100% */
+};
+
+static int
+__schedtune_accept_deltas(int nrg_delta, int cap_delta,
+ int perf_boost_idx, int perf_constrain_idx)
+{
+ int payoff = -INT_MAX;
+ int gain_idx = -1;
+
+ /* Performance Boost (B) region */
+ if (nrg_delta >= 0 && cap_delta > 0)
+ gain_idx = perf_boost_idx;
+ /* Performance Constraint (C) region */
+ else if (nrg_delta < 0 && cap_delta <= 0)
+ gain_idx = perf_constrain_idx;
+
+ /* Default: reject schedule candidate */
+ if (gain_idx == -1)
+ return payoff;
+
+ /*
+ * Evaluate "Performance Boost" vs "Energy Increase"
+ *
+ * - Performance Boost (B) region
+ *
+ * Condition: nrg_delta > 0 && cap_delta > 0
+ * Payoff criteria:
+ * cap_gain / nrg_gain < cap_delta / nrg_delta =
+ * cap_gain * nrg_delta < cap_delta * nrg_gain
+ * Note that since both nrg_gain and nrg_delta are positive, the
+ * inequality does not change. Thus:
+ *
+ * payoff = (cap_delta * nrg_gain) - (cap_gain * nrg_delta)
+ *
+ * - Performance Constraint (C) region
+ *
+ * Condition: nrg_delta < 0 && cap_delta < 0
+ * payoff criteria:
+ * cap_gain / nrg_gain > cap_delta / nrg_delta =
+ * cap_gain * nrg_delta < cap_delta * nrg_gain
+ * Note that since nrg_gain > 0 while nrg_delta < 0, the
+ * inequality change. Thus:
+ *
+ * payoff = (cap_delta * nrg_gain) - (cap_gain * nrg_delta)
+ *
+ * This means that, in case of same positive defined {cap,nrg}_gain
+ * for both the B and C regions, we can use the same payoff formula
+ * where a positive value represents the accept condition.
+ */
+ payoff = cap_delta * threshold_gains[gain_idx].nrg_gain;
+ payoff -= nrg_delta * threshold_gains[gain_idx].cap_gain;
+
+ return payoff;
+}
+
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+
+/*
+ * EAS scheduler tunables for task groups.
+ */
+
+/* SchdTune tunables for a group of tasks */
+struct schedtune {
+ /* SchedTune CGroup subsystem */
+ struct cgroup_subsys_state css;
+
+ /* Boost group allocated ID */
+ int idx;
+
+ /* Boost value for tasks on that SchedTune CGroup */
+ int boost;
+
+ /* Performance Boost (B) region threshold params */
+ int perf_boost_idx;
+
+ /* Performance Constraint (C) region threshold params */
+ int perf_constrain_idx;
+
+ /* Hint to bias scheduling of tasks on that SchedTune CGroup
+ * towards idle CPUs */
+ int prefer_idle;
+};
+
+static inline struct schedtune *css_st(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct schedtune, css) : NULL;
+}
+
+static inline struct schedtune *task_schedtune(struct task_struct *tsk)
+{
+ return css_st(task_css(tsk, schedtune_cgrp_id));
+}
+
+static inline struct schedtune *parent_st(struct schedtune *st)
+{
+ return css_st(st->css.parent);
+}
+
+/*
+ * SchedTune root control group
+ * The root control group is used to defined a system-wide boosting tuning,
+ * which is applied to all tasks in the system.
+ * Task specific boost tuning could be specified by creating and
+ * configuring a child control group under the root one.
+ * By default, system-wide boosting is disabled, i.e. no boosting is applied
+ * to tasks which are not into a child control group.
+ */
+static struct schedtune
+root_schedtune = {
+ .boost = 0,
+ .perf_boost_idx = 0,
+ .perf_constrain_idx = 0,
+ .prefer_idle = 0,
+};
+
+int
+schedtune_accept_deltas(int nrg_delta, int cap_delta,
+ struct task_struct *task)
+{
+ struct schedtune *ct;
+ int perf_boost_idx;
+ int perf_constrain_idx;
+
+ /* Optimal (O) region */
+ if (nrg_delta < 0 && cap_delta > 0) {
+ trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0);
+ return INT_MAX;
+ }
+
+ /* Suboptimal (S) region */
+ if (nrg_delta > 0 && cap_delta < 0) {
+ trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5);
+ return -INT_MAX;
+ }
+
+ /* Get task specific perf Boost/Constraints indexes */
+ rcu_read_lock();
+ ct = task_schedtune(task);
+ perf_boost_idx = ct->perf_boost_idx;
+ perf_constrain_idx = ct->perf_constrain_idx;
+ rcu_read_unlock();
+
+ return __schedtune_accept_deltas(nrg_delta, cap_delta,
+ perf_boost_idx, perf_constrain_idx);
+}
+
+/*
+ * Maximum number of boost groups to support
+ * When per-task boosting is used we still allow only limited number of
+ * boost groups for two main reasons:
+ * 1. on a real system we usually have only few classes of workloads which
+ * make sense to boost with different values (e.g. background vs foreground
+ * tasks, interactive vs low-priority tasks)
+ * 2. a limited number allows for a simpler and more memory/time efficient
+ * implementation especially for the computation of the per-CPU boost
+ * value
+ */
+#define BOOSTGROUPS_COUNT 4
+
+/* Array of configured boostgroups */
+static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = {
+ &root_schedtune,
+ NULL,
+};
+
+/* SchedTune boost groups
+ * Keep track of all the boost groups which impact on CPU, for example when a
+ * CPU has two RUNNABLE tasks belonging to two different boost groups and thus
+ * likely with different boost values.
+ * Since on each system we expect only a limited number of boost groups, here
+ * we use a simple array to keep track of the metrics required to compute the
+ * maximum per-CPU boosting value.
+ */
+struct boost_groups {
+ /* Maximum boost value for all RUNNABLE tasks on a CPU */
+ bool idle;
+ int boost_max;
+ struct {
+ /* The boost for tasks on that boost group */
+ int boost;
+ /* Count of RUNNABLE tasks on that boost group */
+ unsigned tasks;
+ } group[BOOSTGROUPS_COUNT];
+ /* CPU's boost group locking */
+ raw_spinlock_t lock;
+};
+
+/* Boost groups affecting each CPU in the system */
+static DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups);
+
+static void
+schedtune_cpu_update(int cpu)
+{
+ struct boost_groups *bg;
+ int boost_max;
+ int idx;
+
+ bg = &per_cpu(cpu_boost_groups, cpu);
+
+ /* The root boost group is always active */
+ boost_max = bg->group[0].boost;
+ for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) {
+ /*
+ * A boost group affects a CPU only if it has
+ * RUNNABLE tasks on that CPU
+ */
+ if (bg->group[idx].tasks == 0)
+ continue;
+
+ boost_max = max(boost_max, bg->group[idx].boost);
+ }
+ /* Ensures boost_max is non-negative when all cgroup boost values
+ * are neagtive. Avoids under-accounting of cpu capacity which may cause
+ * task stacking and frequency spikes.*/
+ boost_max = max(boost_max, 0);
+ bg->boost_max = boost_max;
+}
+
+static int
+schedtune_boostgroup_update(int idx, int boost)
+{
+ struct boost_groups *bg;
+ int cur_boost_max;
+ int old_boost;
+ int cpu;
+
+ /* Update per CPU boost groups */
+ for_each_possible_cpu(cpu) {
+ bg = &per_cpu(cpu_boost_groups, cpu);
+
+ /*
+ * Keep track of current boost values to compute the per CPU
+ * maximum only when it has been affected by the new value of
+ * the updated boost group
+ */
+ cur_boost_max = bg->boost_max;
+ old_boost = bg->group[idx].boost;
+
+ /* Update the boost value of this boost group */
+ bg->group[idx].boost = boost;
+
+ /* Check if this update increase current max */
+ if (boost > cur_boost_max && bg->group[idx].tasks) {
+ bg->boost_max = boost;
+ trace_sched_tune_boostgroup_update(cpu, 1, bg->boost_max);
+ continue;
+ }
+
+ /* Check if this update has decreased current max */
+ if (cur_boost_max == old_boost && old_boost > boost) {
+ schedtune_cpu_update(cpu);
+ trace_sched_tune_boostgroup_update(cpu, -1, bg->boost_max);
+ continue;
+ }
+
+ trace_sched_tune_boostgroup_update(cpu, 0, bg->boost_max);
+ }
+
+ return 0;
+}
+
+#define ENQUEUE_TASK 1
+#define DEQUEUE_TASK -1
+
+static inline void
+schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count)
+{
+ struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+ int tasks = bg->group[idx].tasks + task_count;
+
+ /* Update boosted tasks count while avoiding to make it negative */
+ bg->group[idx].tasks = max(0, tasks);
+
+ trace_sched_tune_tasks_update(p, cpu, tasks, idx,
+ bg->group[idx].boost, bg->boost_max);
+
+ /* Boost group activation or deactivation on that RQ */
+ if (tasks == 1 || tasks == 0)
+ schedtune_cpu_update(cpu);
+}
+
+/*
+ * NOTE: This function must be called while holding the lock on the CPU RQ
+ */
+void schedtune_enqueue_task(struct task_struct *p, int cpu)
+{
+ struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+ unsigned long irq_flags;
+ struct schedtune *st;
+ int idx;
+
+ if (!unlikely(schedtune_initialized))
+ return;
+
+ /*
+ * When a task is marked PF_EXITING by do_exit() it's going to be
+ * dequeued and enqueued multiple times in the exit path.
+ * Thus we avoid any further update, since we do not want to change
+ * CPU boosting while the task is exiting.
+ */
+ if (p->flags & PF_EXITING)
+ return;
+
+ /*
+ * Boost group accouting is protected by a per-cpu lock and requires
+ * interrupt to be disabled to avoid race conditions for example on
+ * do_exit()::cgroup_exit() and task migration.
+ */
+ raw_spin_lock_irqsave(&bg->lock, irq_flags);
+ rcu_read_lock();
+
+ st = task_schedtune(p);
+ idx = st->idx;
+
+ schedtune_tasks_update(p, cpu, idx, ENQUEUE_TASK);
+
+ rcu_read_unlock();
+ raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
+}
+
+static int schedtune_can_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{
+ struct task_struct *task;
+ struct boost_groups *bg;
+ unsigned long irq_flags;
+ unsigned int cpu;
+ struct rq *rq;
+ int src_bg; /* Source boost group index */
+ int dst_bg; /* Destination boost group index */
+ int tasks;
+
+ if (!unlikely(schedtune_initialized))
+ return 0;
+
+ cgroup_taskset_for_each(task, tset) {
+
+ /*
+ * Lock the CPU's RQ the task is enqueued to avoid race
+ * conditions with migration code while the task is being
+ * accounted
+ */
+ rq = lock_rq_of(task, &irq_flags);
+
+ if (!task->on_rq) {
+ unlock_rq_of(rq, task, &irq_flags);
+ continue;
+ }
+
+ /*
+ * Boost group accouting is protected by a per-cpu lock and requires
+ * interrupt to be disabled to avoid race conditions on...
+ */
+ cpu = cpu_of(rq);
+ bg = &per_cpu(cpu_boost_groups, cpu);
+ raw_spin_lock(&bg->lock);
+
+ dst_bg = css_st(css)->idx;
+ src_bg = task_schedtune(task)->idx;
+
+ /*
+ * Current task is not changing boostgroup, which can
+ * happen when the new hierarchy is in use.
+ */
+ if (unlikely(dst_bg == src_bg)) {
+ raw_spin_unlock(&bg->lock);
+ unlock_rq_of(rq, task, &irq_flags);
+ continue;
+ }
+
+ /*
+ * This is the case of a RUNNABLE task which is switching its
+ * current boost group.
+ */
+
+ /* Move task from src to dst boost group */
+ tasks = bg->group[src_bg].tasks - 1;
+ bg->group[src_bg].tasks = max(0, tasks);
+ bg->group[dst_bg].tasks += 1;
+
+ raw_spin_unlock(&bg->lock);
+ unlock_rq_of(rq, task, &irq_flags);
+
+ /* Update CPU boost group */
+ if (bg->group[src_bg].tasks == 0 || bg->group[dst_bg].tasks == 1)
+ schedtune_cpu_update(task_cpu(task));
+
+ }
+
+ return 0;
+}
+
+static void schedtune_cancel_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{
+ /* This can happen only if SchedTune controller is mounted with
+ * other hierarchies ane one of them fails. Since usually SchedTune is
+ * mouted on its own hierarcy, for the time being we do not implement
+ * a proper rollback mechanism */
+ WARN(1, "SchedTune cancel attach not implemented");
+}
+
+/*
+ * NOTE: This function must be called while holding the lock on the CPU RQ
+ */
+void schedtune_dequeue_task(struct task_struct *p, int cpu)
+{
+ struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+ unsigned long irq_flags;
+ struct schedtune *st;
+ int idx;
+
+ if (!unlikely(schedtune_initialized))
+ return;
+
+ /*
+ * When a task is marked PF_EXITING by do_exit() it's going to be
+ * dequeued and enqueued multiple times in the exit path.
+ * Thus we avoid any further update, since we do not want to change
+ * CPU boosting while the task is exiting.
+ * The last dequeue is already enforce by the do_exit() code path
+ * via schedtune_exit_task().
+ */
+ if (p->flags & PF_EXITING)
+ return;
+
+ /*
+ * Boost group accouting is protected by a per-cpu lock and requires
+ * interrupt to be disabled to avoid race conditions on...
+ */
+ raw_spin_lock_irqsave(&bg->lock, irq_flags);
+ rcu_read_lock();
+
+ st = task_schedtune(p);
+ idx = st->idx;
+
+ schedtune_tasks_update(p, cpu, idx, DEQUEUE_TASK);
+
+ rcu_read_unlock();
+ raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
+}
+
+void schedtune_exit_task(struct task_struct *tsk)
+{
+ struct schedtune *st;
+ unsigned long irq_flags;
+ unsigned int cpu;
+ struct rq *rq;
+ int idx;
+
+ if (!unlikely(schedtune_initialized))
+ return;
+
+ rq = lock_rq_of(tsk, &irq_flags);
+ rcu_read_lock();
+
+ cpu = cpu_of(rq);
+ st = task_schedtune(tsk);
+ idx = st->idx;
+ schedtune_tasks_update(tsk, cpu, idx, DEQUEUE_TASK);
+
+ rcu_read_unlock();
+ unlock_rq_of(rq, tsk, &irq_flags);
+}
+
+int schedtune_cpu_boost(int cpu)
+{
+ struct boost_groups *bg;
+
+ bg = &per_cpu(cpu_boost_groups, cpu);
+ return bg->boost_max;
+}
+
+int schedtune_task_boost(struct task_struct *p)
+{
+ struct schedtune *st;
+ int task_boost;
+
+ /* Get task boost value */
+ rcu_read_lock();
+ st = task_schedtune(p);
+ task_boost = st->boost;
+ rcu_read_unlock();
+
+ return task_boost;
+}
+
+int schedtune_prefer_idle(struct task_struct *p)
+{
+ struct schedtune *st;
+ int prefer_idle;
+
+ /* Get prefer_idle value */
+ rcu_read_lock();
+ st = task_schedtune(p);
+ prefer_idle = st->prefer_idle;
+ rcu_read_unlock();
+
+ return prefer_idle;
+}
+
+static u64
+prefer_idle_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ struct schedtune *st = css_st(css);
+
+ return st->prefer_idle;
+}
+
+static int
+prefer_idle_write(struct cgroup_subsys_state *css, struct cftype *cft,
+ u64 prefer_idle)
+{
+ struct schedtune *st = css_st(css);
+ st->prefer_idle = prefer_idle;
+
+ return 0;
+}
+
+static s64
+boost_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ struct schedtune *st = css_st(css);
+
+ return st->boost;
+}
+
+static int
+boost_write(struct cgroup_subsys_state *css, struct cftype *cft,
+ s64 boost)
+{
+ struct schedtune *st = css_st(css);
+ unsigned threshold_idx;
+ int boost_pct;
+
+ if (boost < -100 || boost > 100)
+ return -EINVAL;
+ boost_pct = boost;
+
+ /*
+ * Update threshold params for Performance Boost (B)
+ * and Performance Constraint (C) regions.
+ * The current implementatio uses the same cuts for both
+ * B and C regions.
+ */
+ threshold_idx = clamp(boost_pct, 0, 99) / 10;
+ st->perf_boost_idx = threshold_idx;
+ st->perf_constrain_idx = threshold_idx;
+
+ st->boost = boost;
+ if (css == &root_schedtune.css) {
+ sysctl_sched_cfs_boost = boost;
+ perf_boost_idx = threshold_idx;
+ perf_constrain_idx = threshold_idx;
+ }
+
+ /* Update CPU boost */
+ schedtune_boostgroup_update(st->idx, st->boost);
+
+ trace_sched_tune_config(st->boost,
+ threshold_gains[st->perf_boost_idx].nrg_gain,
+ threshold_gains[st->perf_boost_idx].cap_gain,
+ threshold_gains[st->perf_constrain_idx].nrg_gain,
+ threshold_gains[st->perf_constrain_idx].cap_gain);
+
+ return 0;
+}
+
+static struct cftype files[] = {
+ {
+ .name = "boost",
+ .read_s64 = boost_read,
+ .write_s64 = boost_write,
+ },
+ {
+ .name = "prefer_idle",
+ .read_u64 = prefer_idle_read,
+ .write_u64 = prefer_idle_write,
+ },
+ { } /* terminate */
+};
+
+static int
+schedtune_boostgroup_init(struct schedtune *st)
+{
+ struct boost_groups *bg;
+ int cpu;
+
+ /* Keep track of allocated boost groups */
+ allocated_group[st->idx] = st;
+
+ /* Initialize the per CPU boost groups */
+ for_each_possible_cpu(cpu) {
+ bg = &per_cpu(cpu_boost_groups, cpu);
+ bg->group[st->idx].boost = 0;
+ bg->group[st->idx].tasks = 0;
+ }
+
+ return 0;
+}
+
+static struct cgroup_subsys_state *
+schedtune_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct schedtune *st;
+ int idx;
+
+ if (!parent_css)
+ return &root_schedtune.css;
+
+ /* Allow only single level hierachies */
+ if (parent_css != &root_schedtune.css) {
+ pr_err("Nested SchedTune boosting groups not allowed\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /* Allow only a limited number of boosting groups */
+ for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx)
+ if (!allocated_group[idx])
+ break;
+ if (idx == BOOSTGROUPS_COUNT) {
+ pr_err("Trying to create more than %d SchedTune boosting groups\n",
+ BOOSTGROUPS_COUNT);
+ return ERR_PTR(-ENOSPC);
+ }
+
+ st = kzalloc(sizeof(*st), GFP_KERNEL);
+ if (!st)
+ goto out;
+
+ /* Initialize per CPUs boost group support */
+ st->idx = idx;
+ if (schedtune_boostgroup_init(st))
+ goto release;
+
+ return &st->css;
+
+release:
+ kfree(st);
+out:
+ return ERR_PTR(-ENOMEM);
+}
+
+static void
+schedtune_boostgroup_release(struct schedtune *st)
+{
+ /* Reset this boost group */
+ schedtune_boostgroup_update(st->idx, 0);
+
+ /* Keep track of allocated boost groups */
+ allocated_group[st->idx] = NULL;
+}
+
+static void
+schedtune_css_free(struct cgroup_subsys_state *css)
+{
+ struct schedtune *st = css_st(css);
+
+ schedtune_boostgroup_release(st);
+ kfree(st);
+}
+
+struct cgroup_subsys schedtune_cgrp_subsys = {
+ .css_alloc = schedtune_css_alloc,
+ .css_free = schedtune_css_free,
+ .can_attach = schedtune_can_attach,
+ .cancel_attach = schedtune_cancel_attach,
+ .legacy_cftypes = files,
+ .early_init = 1,
+};
+
+static inline void
+schedtune_init_cgroups(void)
+{
+ struct boost_groups *bg;
+ int cpu;
+
+ /* Initialize the per CPU boost groups */
+ for_each_possible_cpu(cpu) {
+ bg = &per_cpu(cpu_boost_groups, cpu);
+ memset(bg, 0, sizeof(struct boost_groups));
+ raw_spin_lock_init(&bg->lock);
+ }
+
+ pr_info("schedtune: configured to support %d boost groups\n",
+ BOOSTGROUPS_COUNT);
+
+ schedtune_initialized = true;
+}
+
+#else /* CONFIG_CGROUP_SCHEDTUNE */
+
+int
+schedtune_accept_deltas(int nrg_delta, int cap_delta,
+ struct task_struct *task)
+{
+ /* Optimal (O) region */
+ if (nrg_delta < 0 && cap_delta > 0) {
+ trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0);
+ return INT_MAX;
+ }
+
+ /* Suboptimal (S) region */
+ if (nrg_delta > 0 && cap_delta < 0) {
+ trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5);
+ return -INT_MAX;
+ }
+
+ return __schedtune_accept_deltas(nrg_delta, cap_delta,
+ perf_boost_idx, perf_constrain_idx);
+}
+
+#endif /* CONFIG_CGROUP_SCHEDTUNE */
+
+int
+sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ unsigned threshold_idx;
+ int boost_pct;
+
+ if (ret || !write)
+ return ret;
+
+ if (sysctl_sched_cfs_boost < -100 || sysctl_sched_cfs_boost > 100)
+ return -EINVAL;
+ boost_pct = sysctl_sched_cfs_boost;
+
+ /*
+ * Update threshold params for Performance Boost (B)
+ * and Performance Constraint (C) regions.
+ * The current implementatio uses the same cuts for both
+ * B and C regions.
+ */
+ threshold_idx = clamp(boost_pct, 0, 99) / 10;
+ perf_boost_idx = threshold_idx;
+ perf_constrain_idx = threshold_idx;
+
+ return 0;
+}
+
+#ifdef CONFIG_SCHED_DEBUG
+static void
+schedtune_test_nrg(unsigned long delta_pwr)
+{
+ unsigned long test_delta_pwr;
+ unsigned long test_norm_pwr;
+ int idx;
+
+ /*
+ * Check normalization constants using some constant system
+ * energy values
+ */
+ pr_info("schedtune: verify normalization constants...\n");
+ for (idx = 0; idx < 6; ++idx) {
+ test_delta_pwr = delta_pwr >> idx;
+
+ /* Normalize on max energy for target platform */
+ test_norm_pwr = reciprocal_divide(
+ test_delta_pwr << SCHED_LOAD_SHIFT,
+ schedtune_target_nrg.rdiv);
+
+ pr_info("schedtune: max_pwr/2^%d: %4lu => norm_pwr: %5lu\n",
+ idx, test_delta_pwr, test_norm_pwr);
+ }
+}
+#else
+#define schedtune_test_nrg(delta_pwr)
+#endif
+
+/*
+ * Compute the min/max power consumption of a cluster and all its CPUs
+ */
+static void
+schedtune_add_cluster_nrg(
+ struct sched_domain *sd,
+ struct sched_group *sg,
+ struct target_nrg *ste)
+{
+ struct sched_domain *sd2;
+ struct sched_group *sg2;
+
+ struct cpumask *cluster_cpus;
+ char str[32];
+
+ unsigned long min_pwr;
+ unsigned long max_pwr;
+ int cpu;
+
+ /* Get Cluster energy using EM data for the first CPU */
+ cluster_cpus = sched_group_cpus(sg);
+ snprintf(str, 32, "CLUSTER[%*pbl]",
+ cpumask_pr_args(cluster_cpus));
+
+ min_pwr = sg->sge->idle_states[sg->sge->nr_idle_states - 1].power;
+ max_pwr = sg->sge->cap_states[sg->sge->nr_cap_states - 1].power;
+ pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
+ str, min_pwr, max_pwr);
+
+ /*
+ * Keep track of this cluster's energy in the computation of the
+ * overall system energy
+ */
+ ste->min_power += min_pwr;
+ ste->max_power += max_pwr;
+
+ /* Get CPU energy using EM data for each CPU in the group */
+ for_each_cpu(cpu, cluster_cpus) {
+ /* Get a SD view for the specific CPU */
+ for_each_domain(cpu, sd2) {
+ /* Get the CPU group */
+ sg2 = sd2->groups;
+ min_pwr = sg2->sge->idle_states[sg2->sge->nr_idle_states - 1].power;
+ max_pwr = sg2->sge->cap_states[sg2->sge->nr_cap_states - 1].power;
+
+ ste->min_power += min_pwr;
+ ste->max_power += max_pwr;
+
+ snprintf(str, 32, "CPU[%d]", cpu);
+ pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
+ str, min_pwr, max_pwr);
+
+ /*
+ * Assume we have EM data only at the CPU and
+ * the upper CLUSTER level
+ */
+ BUG_ON(!cpumask_equal(
+ sched_group_cpus(sg),
+ sched_group_cpus(sd2->parent->groups)
+ ));
+ break;
+ }
+ }
+}
+
+/*
+ * Initialize the constants required to compute normalized energy.
+ * The values of these constants depends on the EM data for the specific
+ * target system and topology.
+ * Thus, this function is expected to be called by the code
+ * that bind the EM to the topology information.
+ */
+static int
+schedtune_init(void)
+{
+ struct target_nrg *ste = &schedtune_target_nrg;
+ unsigned long delta_pwr = 0;
+ struct sched_domain *sd;
+ struct sched_group *sg;
+
+ pr_info("schedtune: init normalization constants...\n");
+ ste->max_power = 0;
+ ste->min_power = 0;
+
+ rcu_read_lock();
+
+ /*
+ * When EAS is in use, we always have a pointer to the highest SD
+ * which provides EM data.
+ */
+ sd = rcu_dereference(per_cpu(sd_ea, cpumask_first(cpu_online_mask)));
+ if (!sd) {
+ pr_info("schedtune: no energy model data\n");
+ goto nodata;
+ }
+
+ sg = sd->groups;
+ do {
+ schedtune_add_cluster_nrg(sd, sg, ste);
+ } while (sg = sg->next, sg != sd->groups);
+
+ rcu_read_unlock();
+
+ pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
+ "SYSTEM", ste->min_power, ste->max_power);
+
+ /* Compute normalization constants */
+ delta_pwr = ste->max_power - ste->min_power;
+ ste->rdiv = reciprocal_value(delta_pwr);
+ pr_info("schedtune: using normalization constants mul: %u sh1: %u sh2: %u\n",
+ ste->rdiv.m, ste->rdiv.sh1, ste->rdiv.sh2);
+
+ schedtune_test_nrg(delta_pwr);
+
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+ schedtune_init_cgroups();
+#else
+ pr_info("schedtune: configured to support global boosting only\n");
+#endif
+
+ return 0;
+
+nodata:
+ rcu_read_unlock();
+ return -EINVAL;
+}
+postcore_initcall(schedtune_init);
+
diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h
new file mode 100644
index 000000000000..a00fb326cb08
--- /dev/null
+++ b/kernel/sched/tune.h
@@ -0,0 +1,54 @@
+
+#ifdef CONFIG_SCHED_TUNE
+
+#include <linux/reciprocal_div.h>
+
+/*
+ * System energy normalization constants
+ */
+struct target_nrg {
+ unsigned long min_power;
+ unsigned long max_power;
+ struct reciprocal_value rdiv;
+};
+
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+
+int schedtune_cpu_boost(int cpu);
+int schedtune_task_boost(struct task_struct *tsk);
+
+int schedtune_prefer_idle(struct task_struct *tsk);
+
+void schedtune_exit_task(struct task_struct *tsk);
+
+void schedtune_enqueue_task(struct task_struct *p, int cpu);
+void schedtune_dequeue_task(struct task_struct *p, int cpu);
+
+#else /* CONFIG_CGROUP_SCHEDTUNE */
+
+#define schedtune_cpu_boost(cpu) get_sysctl_sched_cfs_boost()
+#define schedtune_task_boost(tsk) get_sysctl_sched_cfs_boost()
+
+#define schedtune_exit_task(task) do { } while (0)
+
+#define schedtune_enqueue_task(task, cpu) do { } while (0)
+#define schedtune_dequeue_task(task, cpu) do { } while (0)
+
+#endif /* CONFIG_CGROUP_SCHEDTUNE */
+
+int schedtune_accept_deltas(int nrg_delta, int cap_delta,
+ struct task_struct *task);
+
+#else /* CONFIG_SCHED_TUNE */
+
+#define schedtune_cpu_boost(cpu) 0
+#define schedtune_task_boost(tsk) 0
+
+#define schedtune_exit_task(task) do { } while (0)
+
+#define schedtune_enqueue_task(task, cpu) do { } while (0)
+#define schedtune_dequeue_task(task, cpu) do { } while (0)
+
+#define schedtune_accept_deltas(nrg_delta, cap_delta, task) nrg_delta
+
+#endif /* CONFIG_SCHED_TUNE */
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 4dae1885db6f..f7b22adb47f7 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -596,7 +596,7 @@ EXPORT_SYMBOL(bit_wait_io);
__sched int bit_wait_timeout(struct wait_bit_key *word)
{
- unsigned long now = ACCESS_ONCE(jiffies);
+ unsigned long now = READ_ONCE(jiffies);
if (signal_pending_state(current->state, current))
return 1;
if (time_after_eq(now, word->timeout))
@@ -608,7 +608,7 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout);
__sched int bit_wait_io_timeout(struct wait_bit_key *word)
{
- unsigned long now = ACCESS_ONCE(jiffies);
+ unsigned long now = READ_ONCE(jiffies);
if (signal_pending_state(current->state, current))
return 1;
if (time_after_eq(now, word->timeout))
diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c
new file mode 100644
index 000000000000..07b7f84b37e2
--- /dev/null
+++ b/kernel/sched/walt.c
@@ -0,0 +1,1171 @@
+/*
+ * Copyright (c) 2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ *
+ * Window Assisted Load Tracking (WALT) implementation credits:
+ * Srivatsa Vaddagiri, Steve Muckle, Syed Rameez Mustafa, Joonwoo Park,
+ * Pavan Kumar Kondeti, Olav Haugan
+ *
+ * 2016-03-06: Integration with EAS/refactoring by Vikram Mulukutla
+ * and Todd Kjos
+ */
+
+#include <linux/syscore_ops.h>
+#include <linux/cpufreq.h>
+#include <trace/events/sched.h>
+#include <clocksource/arm_arch_timer.h>
+#include "sched.h"
+#include "walt.h"
+
+#define WINDOW_STATS_RECENT 0
+#define WINDOW_STATS_MAX 1
+#define WINDOW_STATS_MAX_RECENT_AVG 2
+#define WINDOW_STATS_AVG 3
+#define WINDOW_STATS_INVALID_POLICY 4
+
+#define EXITING_TASK_MARKER 0xdeaddead
+
+static __read_mostly unsigned int walt_ravg_hist_size = 5;
+static __read_mostly unsigned int walt_window_stats_policy =
+ WINDOW_STATS_MAX_RECENT_AVG;
+static __read_mostly unsigned int walt_account_wait_time = 1;
+static __read_mostly unsigned int walt_freq_account_wait_time = 0;
+static __read_mostly unsigned int walt_io_is_busy = 0;
+
+unsigned int sysctl_sched_walt_init_task_load_pct = 15;
+
+/* 1 -> use PELT based load stats, 0 -> use window-based load stats */
+unsigned int __read_mostly walt_disabled = 0;
+
+static unsigned int max_possible_efficiency = 1024;
+static unsigned int min_possible_efficiency = 1024;
+
+/*
+ * Maximum possible frequency across all cpus. Task demand and cpu
+ * capacity (cpu_power) metrics are scaled in reference to it.
+ */
+static unsigned int max_possible_freq = 1;
+
+/*
+ * Minimum possible max_freq across all cpus. This will be same as
+ * max_possible_freq on homogeneous systems and could be different from
+ * max_possible_freq on heterogenous systems. min_max_freq is used to derive
+ * capacity (cpu_power) of cpus.
+ */
+static unsigned int min_max_freq = 1;
+
+static unsigned int max_capacity = 1024;
+static unsigned int min_capacity = 1024;
+static unsigned int max_load_scale_factor = 1024;
+static unsigned int max_possible_capacity = 1024;
+
+/* Mask of all CPUs that have max_possible_capacity */
+static cpumask_t mpc_mask = CPU_MASK_ALL;
+
+/* Window size (in ns) */
+__read_mostly unsigned int walt_ravg_window = 20000000;
+
+/* Min window size (in ns) = 10ms */
+#define MIN_SCHED_RAVG_WINDOW 10000000
+
+/* Max window size (in ns) = 1s */
+#define MAX_SCHED_RAVG_WINDOW 1000000000
+
+static unsigned int sync_cpu;
+static ktime_t ktime_last;
+static bool walt_ktime_suspended;
+
+static unsigned int task_load(struct task_struct *p)
+{
+ return p->ravg.demand;
+}
+
+void
+walt_inc_cumulative_runnable_avg(struct rq *rq,
+ struct task_struct *p)
+{
+ rq->cumulative_runnable_avg += p->ravg.demand;
+}
+
+void
+walt_dec_cumulative_runnable_avg(struct rq *rq,
+ struct task_struct *p)
+{
+ rq->cumulative_runnable_avg -= p->ravg.demand;
+ BUG_ON((s64)rq->cumulative_runnable_avg < 0);
+}
+
+static void
+fixup_cumulative_runnable_avg(struct rq *rq,
+ struct task_struct *p, s64 task_load_delta)
+{
+ rq->cumulative_runnable_avg += task_load_delta;
+ if ((s64)rq->cumulative_runnable_avg < 0)
+ panic("cra less than zero: tld: %lld, task_load(p) = %u\n",
+ task_load_delta, task_load(p));
+}
+
+u64 walt_ktime_clock(void)
+{
+ if (unlikely(walt_ktime_suspended))
+ return ktime_to_ns(ktime_last);
+ return ktime_get_ns();
+}
+
+static void walt_resume(void)
+{
+ walt_ktime_suspended = false;
+}
+
+static int walt_suspend(void)
+{
+ ktime_last = ktime_get();
+ walt_ktime_suspended = true;
+ return 0;
+}
+
+static struct syscore_ops walt_syscore_ops = {
+ .resume = walt_resume,
+ .suspend = walt_suspend
+};
+
+static int __init walt_init_ops(void)
+{
+ register_syscore_ops(&walt_syscore_ops);
+ return 0;
+}
+late_initcall(walt_init_ops);
+
+void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq,
+ struct task_struct *p)
+{
+ cfs_rq->cumulative_runnable_avg += p->ravg.demand;
+}
+
+void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq,
+ struct task_struct *p)
+{
+ cfs_rq->cumulative_runnable_avg -= p->ravg.demand;
+}
+
+static int exiting_task(struct task_struct *p)
+{
+ if (p->flags & PF_EXITING) {
+ if (p->ravg.sum_history[0] != EXITING_TASK_MARKER) {
+ p->ravg.sum_history[0] = EXITING_TASK_MARKER;
+ }
+ return 1;
+ }
+ return 0;
+}
+
+static int __init set_walt_ravg_window(char *str)
+{
+ get_option(&str, &walt_ravg_window);
+
+ walt_disabled = (walt_ravg_window < MIN_SCHED_RAVG_WINDOW ||
+ walt_ravg_window > MAX_SCHED_RAVG_WINDOW);
+ return 0;
+}
+
+early_param("walt_ravg_window", set_walt_ravg_window);
+
+static void
+update_window_start(struct rq *rq, u64 wallclock)
+{
+ s64 delta;
+ int nr_windows;
+
+ delta = wallclock - rq->window_start;
+ /* If the MPM global timer is cleared, set delta as 0 to avoid kernel BUG happening */
+ if (delta < 0) {
+ if (arch_timer_read_counter() == 0)
+ delta = 0;
+ else
+ BUG_ON(1);
+ }
+
+ if (delta < walt_ravg_window)
+ return;
+
+ nr_windows = div64_u64(delta, walt_ravg_window);
+ rq->window_start += (u64)nr_windows * (u64)walt_ravg_window;
+}
+
+static u64 scale_exec_time(u64 delta, struct rq *rq)
+{
+ unsigned int cur_freq = rq->cur_freq;
+ int sf;
+
+ if (unlikely(cur_freq > max_possible_freq))
+ cur_freq = rq->max_possible_freq;
+
+ /* round up div64 */
+ delta = div64_u64(delta * cur_freq + max_possible_freq - 1,
+ max_possible_freq);
+
+ sf = DIV_ROUND_UP(rq->efficiency * 1024, max_possible_efficiency);
+
+ delta *= sf;
+ delta >>= 10;
+
+ return delta;
+}
+
+static int cpu_is_waiting_on_io(struct rq *rq)
+{
+ if (!walt_io_is_busy)
+ return 0;
+
+ return atomic_read(&rq->nr_iowait);
+}
+
+void walt_account_irqtime(int cpu, struct task_struct *curr,
+ u64 delta, u64 wallclock)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags, nr_windows;
+ u64 cur_jiffies_ts;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+
+ /*
+ * cputime (wallclock) uses sched_clock so use the same here for
+ * consistency.
+ */
+ delta += sched_clock() - wallclock;
+ cur_jiffies_ts = get_jiffies_64();
+
+ if (is_idle_task(curr))
+ walt_update_task_ravg(curr, rq, IRQ_UPDATE, walt_ktime_clock(),
+ delta);
+
+ nr_windows = cur_jiffies_ts - rq->irqload_ts;
+
+ if (nr_windows) {
+ if (nr_windows < 10) {
+ /* Decay CPU's irqload by 3/4 for each window. */
+ rq->avg_irqload *= (3 * nr_windows);
+ rq->avg_irqload = div64_u64(rq->avg_irqload,
+ 4 * nr_windows);
+ } else {
+ rq->avg_irqload = 0;
+ }
+ rq->avg_irqload += rq->cur_irqload;
+ rq->cur_irqload = 0;
+ }
+
+ rq->cur_irqload += delta;
+ rq->irqload_ts = cur_jiffies_ts;
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+
+#define WALT_HIGH_IRQ_TIMEOUT 3
+
+u64 walt_irqload(int cpu) {
+ struct rq *rq = cpu_rq(cpu);
+ s64 delta;
+ delta = get_jiffies_64() - rq->irqload_ts;
+
+ /*
+ * Current context can be preempted by irq and rq->irqload_ts can be
+ * updated by irq context so that delta can be negative.
+ * But this is okay and we can safely return as this means there
+ * was recent irq occurrence.
+ */
+
+ if (delta < WALT_HIGH_IRQ_TIMEOUT)
+ return rq->avg_irqload;
+ else
+ return 0;
+}
+
+int walt_cpu_high_irqload(int cpu) {
+ return walt_irqload(cpu) >= sysctl_sched_walt_cpu_high_irqload;
+}
+
+static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p,
+ u64 irqtime, int event)
+{
+ if (is_idle_task(p)) {
+ /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */
+ if (event == PICK_NEXT_TASK)
+ return 0;
+
+ /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */
+ return irqtime || cpu_is_waiting_on_io(rq);
+ }
+
+ if (event == TASK_WAKE)
+ return 0;
+
+ if (event == PUT_PREV_TASK || event == IRQ_UPDATE ||
+ event == TASK_UPDATE)
+ return 1;
+
+ /* Only TASK_MIGRATE && PICK_NEXT_TASK left */
+ return walt_freq_account_wait_time;
+}
+
+/*
+ * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum)
+ */
+static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock, u64 irqtime)
+{
+ int new_window, nr_full_windows = 0;
+ int p_is_curr_task = (p == rq->curr);
+ u64 mark_start = p->ravg.mark_start;
+ u64 window_start = rq->window_start;
+ u32 window_size = walt_ravg_window;
+ u64 delta;
+
+ new_window = mark_start < window_start;
+ if (new_window) {
+ nr_full_windows = div64_u64((window_start - mark_start),
+ window_size);
+ if (p->ravg.active_windows < USHRT_MAX)
+ p->ravg.active_windows++;
+ }
+
+ /* Handle per-task window rollover. We don't care about the idle
+ * task or exiting tasks. */
+ if (new_window && !is_idle_task(p) && !exiting_task(p)) {
+ u32 curr_window = 0;
+
+ if (!nr_full_windows)
+ curr_window = p->ravg.curr_window;
+
+ p->ravg.prev_window = curr_window;
+ p->ravg.curr_window = 0;
+ }
+
+ if (!account_busy_for_cpu_time(rq, p, irqtime, event)) {
+ /* account_busy_for_cpu_time() = 0, so no update to the
+ * task's current window needs to be made. This could be
+ * for example
+ *
+ * - a wakeup event on a task within the current
+ * window (!new_window below, no action required),
+ * - switching to a new task from idle (PICK_NEXT_TASK)
+ * in a new window where irqtime is 0 and we aren't
+ * waiting on IO */
+
+ if (!new_window)
+ return;
+
+ /* A new window has started. The RQ demand must be rolled
+ * over if p is the current task. */
+ if (p_is_curr_task) {
+ u64 prev_sum = 0;
+
+ /* p is either idle task or an exiting task */
+ if (!nr_full_windows) {
+ prev_sum = rq->curr_runnable_sum;
+ }
+
+ rq->prev_runnable_sum = prev_sum;
+ rq->curr_runnable_sum = 0;
+ }
+
+ return;
+ }
+
+ if (!new_window) {
+ /* account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. No rollover
+ * since we didn't start a new window. An example of this is
+ * when a task starts execution and then sleeps within the
+ * same window. */
+
+ if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq))
+ delta = wallclock - mark_start;
+ else
+ delta = irqtime;
+ delta = scale_exec_time(delta, rq);
+ rq->curr_runnable_sum += delta;
+ if (!is_idle_task(p) && !exiting_task(p))
+ p->ravg.curr_window += delta;
+
+ return;
+ }
+
+ if (!p_is_curr_task) {
+ /* account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. A new window
+ * has also started, but p is not the current task, so the
+ * window is not rolled over - just split up and account
+ * as necessary into curr and prev. The window is only
+ * rolled over when a new window is processed for the current
+ * task.
+ *
+ * Irqtime can't be accounted by a task that isn't the
+ * currently running task. */
+
+ if (!nr_full_windows) {
+ /* A full window hasn't elapsed, account partial
+ * contribution to previous completed window. */
+ delta = scale_exec_time(window_start - mark_start, rq);
+ if (!exiting_task(p))
+ p->ravg.prev_window += delta;
+ } else {
+ /* Since at least one full window has elapsed,
+ * the contribution to the previous window is the
+ * full window (window_size). */
+ delta = scale_exec_time(window_size, rq);
+ if (!exiting_task(p))
+ p->ravg.prev_window = delta;
+ }
+ rq->prev_runnable_sum += delta;
+
+ /* Account piece of busy time in the current window. */
+ delta = scale_exec_time(wallclock - window_start, rq);
+ rq->curr_runnable_sum += delta;
+ if (!exiting_task(p))
+ p->ravg.curr_window = delta;
+
+ return;
+ }
+
+ if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) {
+ /* account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. A new window
+ * has started and p is the current task so rollover is
+ * needed. If any of these three above conditions are true
+ * then this busy time can't be accounted as irqtime.
+ *
+ * Busy time for the idle task or exiting tasks need not
+ * be accounted.
+ *
+ * An example of this would be a task that starts execution
+ * and then sleeps once a new window has begun. */
+
+ if (!nr_full_windows) {
+ /* A full window hasn't elapsed, account partial
+ * contribution to previous completed window. */
+ delta = scale_exec_time(window_start - mark_start, rq);
+ if (!is_idle_task(p) && !exiting_task(p))
+ p->ravg.prev_window += delta;
+
+ delta += rq->curr_runnable_sum;
+ } else {
+ /* Since at least one full window has elapsed,
+ * the contribution to the previous window is the
+ * full window (window_size). */
+ delta = scale_exec_time(window_size, rq);
+ if (!is_idle_task(p) && !exiting_task(p))
+ p->ravg.prev_window = delta;
+
+ }
+ /*
+ * Rollover for normal runnable sum is done here by overwriting
+ * the values in prev_runnable_sum and curr_runnable_sum.
+ * Rollover for new task runnable sum has completed by previous
+ * if-else statement.
+ */
+ rq->prev_runnable_sum = delta;
+
+ /* Account piece of busy time in the current window. */
+ delta = scale_exec_time(wallclock - window_start, rq);
+ rq->curr_runnable_sum = delta;
+ if (!is_idle_task(p) && !exiting_task(p))
+ p->ravg.curr_window = delta;
+
+ return;
+ }
+
+ if (irqtime) {
+ /* account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. A new window
+ * has started and p is the current task so rollover is
+ * needed. The current task must be the idle task because
+ * irqtime is not accounted for any other task.
+ *
+ * Irqtime will be accounted each time we process IRQ activity
+ * after a period of idleness, so we know the IRQ busy time
+ * started at wallclock - irqtime. */
+
+ BUG_ON(!is_idle_task(p));
+ mark_start = wallclock - irqtime;
+
+ /* Roll window over. If IRQ busy time was just in the current
+ * window then that is all that need be accounted. */
+ rq->prev_runnable_sum = rq->curr_runnable_sum;
+ if (mark_start > window_start) {
+ rq->curr_runnable_sum = scale_exec_time(irqtime, rq);
+ return;
+ }
+
+ /* The IRQ busy time spanned multiple windows. Process the
+ * busy time preceding the current window start first. */
+ delta = window_start - mark_start;
+ if (delta > window_size)
+ delta = window_size;
+ delta = scale_exec_time(delta, rq);
+ rq->prev_runnable_sum += delta;
+
+ /* Process the remaining IRQ busy time in the current window. */
+ delta = wallclock - window_start;
+ rq->curr_runnable_sum = scale_exec_time(delta, rq);
+
+ return;
+ }
+
+ BUG();
+}
+
+static int account_busy_for_task_demand(struct task_struct *p, int event)
+{
+ /* No need to bother updating task demand for exiting tasks
+ * or the idle task. */
+ if (exiting_task(p) || is_idle_task(p))
+ return 0;
+
+ /* When a task is waking up it is completing a segment of non-busy
+ * time. Likewise, if wait time is not treated as busy time, then
+ * when a task begins to run or is migrated, it is not running and
+ * is completing a segment of non-busy time. */
+ if (event == TASK_WAKE || (!walt_account_wait_time &&
+ (event == PICK_NEXT_TASK || event == TASK_MIGRATE)))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * Called when new window is starting for a task, to record cpu usage over
+ * recently concluded window(s). Normally 'samples' should be 1. It can be > 1
+ * when, say, a real-time task runs without preemption for several windows at a
+ * stretch.
+ */
+static void update_history(struct rq *rq, struct task_struct *p,
+ u32 runtime, int samples, int event)
+{
+ u32 *hist = &p->ravg.sum_history[0];
+ int ridx, widx;
+ u32 max = 0, avg, demand;
+ u64 sum = 0;
+
+ /* Ignore windows where task had no activity */
+ if (!runtime || is_idle_task(p) || exiting_task(p) || !samples)
+ goto done;
+
+ /* Push new 'runtime' value onto stack */
+ widx = walt_ravg_hist_size - 1;
+ ridx = widx - samples;
+ for (; ridx >= 0; --widx, --ridx) {
+ hist[widx] = hist[ridx];
+ sum += hist[widx];
+ if (hist[widx] > max)
+ max = hist[widx];
+ }
+
+ for (widx = 0; widx < samples && widx < walt_ravg_hist_size; widx++) {
+ hist[widx] = runtime;
+ sum += hist[widx];
+ if (hist[widx] > max)
+ max = hist[widx];
+ }
+
+ p->ravg.sum = 0;
+
+ if (walt_window_stats_policy == WINDOW_STATS_RECENT) {
+ demand = runtime;
+ } else if (walt_window_stats_policy == WINDOW_STATS_MAX) {
+ demand = max;
+ } else {
+ avg = div64_u64(sum, walt_ravg_hist_size);
+ if (walt_window_stats_policy == WINDOW_STATS_AVG)
+ demand = avg;
+ else
+ demand = max(avg, runtime);
+ }
+
+ /*
+ * A throttled deadline sched class task gets dequeued without
+ * changing p->on_rq. Since the dequeue decrements hmp stats
+ * avoid decrementing it here again.
+ */
+ if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||
+ !p->dl.dl_throttled))
+ fixup_cumulative_runnable_avg(rq, p, demand);
+
+ p->ravg.demand = demand;
+
+done:
+ trace_walt_update_history(rq, p, runtime, samples, event);
+ return;
+}
+
+static void add_to_task_demand(struct rq *rq, struct task_struct *p,
+ u64 delta)
+{
+ delta = scale_exec_time(delta, rq);
+ p->ravg.sum += delta;
+ if (unlikely(p->ravg.sum > walt_ravg_window))
+ p->ravg.sum = walt_ravg_window;
+}
+
+/*
+ * Account cpu demand of task and/or update task's cpu demand history
+ *
+ * ms = p->ravg.mark_start;
+ * wc = wallclock
+ * ws = rq->window_start
+ *
+ * Three possibilities:
+ *
+ * a) Task event is contained within one window.
+ * window_start < mark_start < wallclock
+ *
+ * ws ms wc
+ * | | |
+ * V V V
+ * |---------------|
+ *
+ * In this case, p->ravg.sum is updated *iff* event is appropriate
+ * (ex: event == PUT_PREV_TASK)
+ *
+ * b) Task event spans two windows.
+ * mark_start < window_start < wallclock
+ *
+ * ms ws wc
+ * | | |
+ * V V V
+ * -----|-------------------
+ *
+ * In this case, p->ravg.sum is updated with (ws - ms) *iff* event
+ * is appropriate, then a new window sample is recorded followed
+ * by p->ravg.sum being set to (wc - ws) *iff* event is appropriate.
+ *
+ * c) Task event spans more than two windows.
+ *
+ * ms ws_tmp ws wc
+ * | | | |
+ * V V V V
+ * ---|-------|-------|-------|-------|------
+ * | |
+ * |<------ nr_full_windows ------>|
+ *
+ * In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff*
+ * event is appropriate, window sample of p->ravg.sum is recorded,
+ * 'nr_full_window' samples of window_size is also recorded *iff*
+ * event is appropriate and finally p->ravg.sum is set to (wc - ws)
+ * *iff* event is appropriate.
+ *
+ * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time()
+ * depends on it!
+ */
+static void update_task_demand(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock)
+{
+ u64 mark_start = p->ravg.mark_start;
+ u64 delta, window_start = rq->window_start;
+ int new_window, nr_full_windows;
+ u32 window_size = walt_ravg_window;
+
+ new_window = mark_start < window_start;
+ if (!account_busy_for_task_demand(p, event)) {
+ if (new_window)
+ /* If the time accounted isn't being accounted as
+ * busy time, and a new window started, only the
+ * previous window need be closed out with the
+ * pre-existing demand. Multiple windows may have
+ * elapsed, but since empty windows are dropped,
+ * it is not necessary to account those. */
+ update_history(rq, p, p->ravg.sum, 1, event);
+ return;
+ }
+
+ if (!new_window) {
+ /* The simple case - busy time contained within the existing
+ * window. */
+ add_to_task_demand(rq, p, wallclock - mark_start);
+ return;
+ }
+
+ /* Busy time spans at least two windows. Temporarily rewind
+ * window_start to first window boundary after mark_start. */
+ delta = window_start - mark_start;
+ nr_full_windows = div64_u64(delta, window_size);
+ window_start -= (u64)nr_full_windows * (u64)window_size;
+
+ /* Process (window_start - mark_start) first */
+ add_to_task_demand(rq, p, window_start - mark_start);
+
+ /* Push new sample(s) into task's demand history */
+ update_history(rq, p, p->ravg.sum, 1, event);
+ if (nr_full_windows)
+ update_history(rq, p, scale_exec_time(window_size, rq),
+ nr_full_windows, event);
+
+ /* Roll window_start back to current to process any remainder
+ * in current window. */
+ window_start += (u64)nr_full_windows * (u64)window_size;
+
+ /* Process (wallclock - window_start) next */
+ mark_start = window_start;
+ add_to_task_demand(rq, p, wallclock - mark_start);
+}
+
+/* Reflect task activity on its demand and cpu's busy time statistics */
+void walt_update_task_ravg(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock, u64 irqtime)
+{
+ if (walt_disabled || !rq->window_start)
+ return;
+
+ lockdep_assert_held(&rq->lock);
+
+ update_window_start(rq, wallclock);
+
+ if (!p->ravg.mark_start)
+ goto done;
+
+ update_task_demand(p, rq, event, wallclock);
+ update_cpu_busy_time(p, rq, event, wallclock, irqtime);
+
+done:
+ trace_walt_update_task_ravg(p, rq, event, wallclock, irqtime);
+
+ p->ravg.mark_start = wallclock;
+}
+
+unsigned long __weak arch_get_cpu_efficiency(int cpu)
+{
+ return SCHED_LOAD_SCALE;
+}
+
+void walt_init_cpu_efficiency(void)
+{
+ int i, efficiency;
+ unsigned int max = 0, min = UINT_MAX;
+
+ for_each_possible_cpu(i) {
+ efficiency = arch_get_cpu_efficiency(i);
+ cpu_rq(i)->efficiency = efficiency;
+
+ if (efficiency > max)
+ max = efficiency;
+ if (efficiency < min)
+ min = efficiency;
+ }
+
+ if (max)
+ max_possible_efficiency = max;
+
+ if (min)
+ min_possible_efficiency = min;
+}
+
+static void reset_task_stats(struct task_struct *p)
+{
+ u32 sum = 0;
+
+ if (exiting_task(p))
+ sum = EXITING_TASK_MARKER;
+
+ memset(&p->ravg, 0, sizeof(struct ravg));
+ /* Retain EXITING_TASK marker */
+ p->ravg.sum_history[0] = sum;
+}
+
+void walt_mark_task_starting(struct task_struct *p)
+{
+ u64 wallclock;
+ struct rq *rq = task_rq(p);
+
+ if (!rq->window_start) {
+ reset_task_stats(p);
+ return;
+ }
+
+ wallclock = walt_ktime_clock();
+ p->ravg.mark_start = wallclock;
+}
+
+void walt_set_window_start(struct rq *rq)
+{
+ int cpu = cpu_of(rq);
+ struct rq *sync_rq = cpu_rq(sync_cpu);
+
+ if (rq->window_start)
+ return;
+
+ if (cpu == sync_cpu) {
+ rq->window_start = walt_ktime_clock();
+ } else {
+ raw_spin_unlock(&rq->lock);
+ double_rq_lock(rq, sync_rq);
+ rq->window_start = cpu_rq(sync_cpu)->window_start;
+ rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
+ raw_spin_unlock(&sync_rq->lock);
+ }
+
+ rq->curr->ravg.mark_start = rq->window_start;
+}
+
+void walt_migrate_sync_cpu(int cpu)
+{
+ if (cpu == sync_cpu)
+ sync_cpu = smp_processor_id();
+}
+
+void walt_fixup_busy_time(struct task_struct *p, int new_cpu)
+{
+ struct rq *src_rq = task_rq(p);
+ struct rq *dest_rq = cpu_rq(new_cpu);
+ u64 wallclock;
+
+ if (!p->on_rq && p->state != TASK_WAKING)
+ return;
+
+ if (exiting_task(p)) {
+ return;
+ }
+
+ if (p->state == TASK_WAKING)
+ double_rq_lock(src_rq, dest_rq);
+
+ wallclock = walt_ktime_clock();
+
+ walt_update_task_ravg(task_rq(p)->curr, task_rq(p),
+ TASK_UPDATE, wallclock, 0);
+ walt_update_task_ravg(dest_rq->curr, dest_rq,
+ TASK_UPDATE, wallclock, 0);
+
+ walt_update_task_ravg(p, task_rq(p), TASK_MIGRATE, wallclock, 0);
+
+ if (p->ravg.curr_window) {
+ src_rq->curr_runnable_sum -= p->ravg.curr_window;
+ dest_rq->curr_runnable_sum += p->ravg.curr_window;
+ }
+
+ if (p->ravg.prev_window) {
+ src_rq->prev_runnable_sum -= p->ravg.prev_window;
+ dest_rq->prev_runnable_sum += p->ravg.prev_window;
+ }
+
+ if ((s64)src_rq->prev_runnable_sum < 0) {
+ src_rq->prev_runnable_sum = 0;
+ WARN_ON(1);
+ }
+ if ((s64)src_rq->curr_runnable_sum < 0) {
+ src_rq->curr_runnable_sum = 0;
+ WARN_ON(1);
+ }
+
+ trace_walt_migration_update_sum(src_rq, p);
+ trace_walt_migration_update_sum(dest_rq, p);
+
+ if (p->state == TASK_WAKING)
+ double_rq_unlock(src_rq, dest_rq);
+}
+
+/* Keep track of max/min capacity possible across CPUs "currently" */
+static void __update_min_max_capacity(void)
+{
+ int i;
+ int max = 0, min = INT_MAX;
+
+ for_each_online_cpu(i) {
+ if (cpu_rq(i)->capacity > max)
+ max = cpu_rq(i)->capacity;
+ if (cpu_rq(i)->capacity < min)
+ min = cpu_rq(i)->capacity;
+ }
+
+ max_capacity = max;
+ min_capacity = min;
+}
+
+static void update_min_max_capacity(void)
+{
+ unsigned long flags;
+ int i;
+
+ local_irq_save(flags);
+ for_each_possible_cpu(i)
+ raw_spin_lock(&cpu_rq(i)->lock);
+
+ __update_min_max_capacity();
+
+ for_each_possible_cpu(i)
+ raw_spin_unlock(&cpu_rq(i)->lock);
+ local_irq_restore(flags);
+}
+
+/*
+ * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that
+ * least efficient cpu gets capacity of 1024
+ */
+static unsigned long capacity_scale_cpu_efficiency(int cpu)
+{
+ return (1024 * cpu_rq(cpu)->efficiency) / min_possible_efficiency;
+}
+
+/*
+ * Return 'capacity' of a cpu in reference to cpu with lowest max_freq
+ * (min_max_freq), such that one with lowest max_freq gets capacity of 1024.
+ */
+static unsigned long capacity_scale_cpu_freq(int cpu)
+{
+ return (1024 * cpu_rq(cpu)->max_freq) / min_max_freq;
+}
+
+/*
+ * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so
+ * that "most" efficient cpu gets a load_scale_factor of 1
+ */
+static unsigned long load_scale_cpu_efficiency(int cpu)
+{
+ return DIV_ROUND_UP(1024 * max_possible_efficiency,
+ cpu_rq(cpu)->efficiency);
+}
+
+/*
+ * Return load_scale_factor of a cpu in reference to cpu with best max_freq
+ * (max_possible_freq), so that one with best max_freq gets a load_scale_factor
+ * of 1.
+ */
+static unsigned long load_scale_cpu_freq(int cpu)
+{
+ return DIV_ROUND_UP(1024 * max_possible_freq, cpu_rq(cpu)->max_freq);
+}
+
+static int compute_capacity(int cpu)
+{
+ int capacity = 1024;
+
+ capacity *= capacity_scale_cpu_efficiency(cpu);
+ capacity >>= 10;
+
+ capacity *= capacity_scale_cpu_freq(cpu);
+ capacity >>= 10;
+
+ return capacity;
+}
+
+static int compute_load_scale_factor(int cpu)
+{
+ int load_scale = 1024;
+
+ /*
+ * load_scale_factor accounts for the fact that task load
+ * is in reference to "best" performing cpu. Task's load will need to be
+ * scaled (up) by a factor to determine suitability to be placed on a
+ * (little) cpu.
+ */
+ load_scale *= load_scale_cpu_efficiency(cpu);
+ load_scale >>= 10;
+
+ load_scale *= load_scale_cpu_freq(cpu);
+ load_scale >>= 10;
+
+ return load_scale;
+}
+
+static int cpufreq_notifier_policy(struct notifier_block *nb,
+ unsigned long val, void *data)
+{
+ struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
+ int i, update_max = 0;
+ u64 highest_mpc = 0, highest_mplsf = 0;
+ const struct cpumask *cpus = policy->related_cpus;
+ unsigned int orig_min_max_freq = min_max_freq;
+ unsigned int orig_max_possible_freq = max_possible_freq;
+ /* Initialized to policy->max in case policy->related_cpus is empty! */
+ unsigned int orig_max_freq = policy->max;
+
+ if (val != CPUFREQ_NOTIFY && val != CPUFREQ_REMOVE_POLICY &&
+ val != CPUFREQ_CREATE_POLICY)
+ return 0;
+
+ if (val == CPUFREQ_REMOVE_POLICY || val == CPUFREQ_CREATE_POLICY) {
+ update_min_max_capacity();
+ return 0;
+ }
+
+ for_each_cpu(i, policy->related_cpus) {
+ cpumask_copy(&cpu_rq(i)->freq_domain_cpumask,
+ policy->related_cpus);
+ orig_max_freq = cpu_rq(i)->max_freq;
+ cpu_rq(i)->min_freq = policy->min;
+ cpu_rq(i)->max_freq = policy->max;
+ cpu_rq(i)->cur_freq = policy->cur;
+ cpu_rq(i)->max_possible_freq = policy->cpuinfo.max_freq;
+ }
+
+ max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq);
+ if (min_max_freq == 1)
+ min_max_freq = UINT_MAX;
+ min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq);
+ BUG_ON(!min_max_freq);
+ BUG_ON(!policy->max);
+
+ /* Changes to policy other than max_freq don't require any updates */
+ if (orig_max_freq == policy->max)
+ return 0;
+
+ /*
+ * A changed min_max_freq or max_possible_freq (possible during bootup)
+ * needs to trigger re-computation of load_scale_factor and capacity for
+ * all possible cpus (even those offline). It also needs to trigger
+ * re-computation of nr_big_task count on all online cpus.
+ *
+ * A changed rq->max_freq otoh needs to trigger re-computation of
+ * load_scale_factor and capacity for just the cluster of cpus involved.
+ * Since small task definition depends on max_load_scale_factor, a
+ * changed load_scale_factor of one cluster could influence
+ * classification of tasks in another cluster. Hence a changed
+ * rq->max_freq will need to trigger re-computation of nr_big_task
+ * count on all online cpus.
+ *
+ * While it should be sufficient for nr_big_tasks to be
+ * re-computed for only online cpus, we have inadequate context
+ * information here (in policy notifier) with regard to hotplug-safety
+ * context in which notification is issued. As a result, we can't use
+ * get_online_cpus() here, as it can lead to deadlock. Until cpufreq is
+ * fixed up to issue notification always in hotplug-safe context,
+ * re-compute nr_big_task for all possible cpus.
+ */
+
+ if (orig_min_max_freq != min_max_freq ||
+ orig_max_possible_freq != max_possible_freq) {
+ cpus = cpu_possible_mask;
+ update_max = 1;
+ }
+
+ /*
+ * Changed load_scale_factor can trigger reclassification of tasks as
+ * big or small. Make this change "atomic" so that tasks are accounted
+ * properly due to changed load_scale_factor
+ */
+ for_each_cpu(i, cpus) {
+ struct rq *rq = cpu_rq(i);
+
+ rq->capacity = compute_capacity(i);
+ rq->load_scale_factor = compute_load_scale_factor(i);
+
+ if (update_max) {
+ u64 mpc, mplsf;
+
+ mpc = div_u64(((u64) rq->capacity) *
+ rq->max_possible_freq, rq->max_freq);
+ rq->max_possible_capacity = (int) mpc;
+
+ mplsf = div_u64(((u64) rq->load_scale_factor) *
+ rq->max_possible_freq, rq->max_freq);
+
+ if (mpc > highest_mpc) {
+ highest_mpc = mpc;
+ cpumask_clear(&mpc_mask);
+ cpumask_set_cpu(i, &mpc_mask);
+ } else if (mpc == highest_mpc) {
+ cpumask_set_cpu(i, &mpc_mask);
+ }
+
+ if (mplsf > highest_mplsf)
+ highest_mplsf = mplsf;
+ }
+ }
+
+ if (update_max) {
+ max_possible_capacity = highest_mpc;
+ max_load_scale_factor = highest_mplsf;
+ }
+
+ __update_min_max_capacity();
+
+ return 0;
+}
+
+static int cpufreq_notifier_trans(struct notifier_block *nb,
+ unsigned long val, void *data)
+{
+ struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data;
+ unsigned int cpu = freq->cpu, new_freq = freq->new;
+ unsigned long flags;
+ int i;
+
+ if (val != CPUFREQ_POSTCHANGE)
+ return 0;
+
+ BUG_ON(!new_freq);
+
+ if (cpu_rq(cpu)->cur_freq == new_freq)
+ return 0;
+
+ for_each_cpu(i, &cpu_rq(cpu)->freq_domain_cpumask) {
+ struct rq *rq = cpu_rq(i);
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
+ walt_ktime_clock(), 0);
+ rq->cur_freq = new_freq;
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ }
+
+ return 0;
+}
+
+static struct notifier_block notifier_policy_block = {
+ .notifier_call = cpufreq_notifier_policy
+};
+
+static struct notifier_block notifier_trans_block = {
+ .notifier_call = cpufreq_notifier_trans
+};
+
+static int register_sched_callback(void)
+{
+ int ret;
+
+ ret = cpufreq_register_notifier(&notifier_policy_block,
+ CPUFREQ_POLICY_NOTIFIER);
+
+ if (!ret)
+ ret = cpufreq_register_notifier(&notifier_trans_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
+
+ return 0;
+}
+
+/*
+ * cpufreq callbacks can be registered at core_initcall or later time.
+ * Any registration done prior to that is "forgotten" by cpufreq. See
+ * initialization of variable init_cpufreq_transition_notifier_list_called
+ * for further information.
+ */
+core_initcall(register_sched_callback);
+
+void walt_init_new_task_load(struct task_struct *p)
+{
+ int i;
+ u32 init_load_windows =
+ div64_u64((u64)sysctl_sched_walt_init_task_load_pct *
+ (u64)walt_ravg_window, 100);
+ u32 init_load_pct = current->init_load_pct;
+
+ p->init_load_pct = 0;
+ memset(&p->ravg, 0, sizeof(struct ravg));
+
+ if (init_load_pct) {
+ init_load_windows = div64_u64((u64)init_load_pct *
+ (u64)walt_ravg_window, 100);
+ }
+
+ p->ravg.demand = init_load_windows;
+ for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i)
+ p->ravg.sum_history[i] = init_load_windows;
+}
diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h
new file mode 100644
index 000000000000..e181c87a928d
--- /dev/null
+++ b/kernel/sched/walt.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __WALT_H
+#define __WALT_H
+
+#ifdef CONFIG_SCHED_WALT
+
+void walt_update_task_ravg(struct task_struct *p, struct rq *rq, int event,
+ u64 wallclock, u64 irqtime);
+void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p);
+void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p);
+void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+ struct task_struct *p);
+void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+ struct task_struct *p);
+void walt_fixup_busy_time(struct task_struct *p, int new_cpu);
+void walt_init_new_task_load(struct task_struct *p);
+void walt_mark_task_starting(struct task_struct *p);
+void walt_set_window_start(struct rq *rq);
+void walt_migrate_sync_cpu(int cpu);
+void walt_init_cpu_efficiency(void);
+u64 walt_ktime_clock(void);
+void walt_account_irqtime(int cpu, struct task_struct *curr, u64 delta,
+ u64 wallclock);
+
+u64 walt_irqload(int cpu);
+int walt_cpu_high_irqload(int cpu);
+
+#else /* CONFIG_SCHED_WALT */
+
+static inline void walt_update_task_ravg(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock, u64 irqtime) { }
+static inline void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { }
+static inline void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { }
+static inline void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+ struct task_struct *p) { }
+static inline void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+ struct task_struct *p) { }
+static inline void walt_fixup_busy_time(struct task_struct *p, int new_cpu) { }
+static inline void walt_init_new_task_load(struct task_struct *p) { }
+static inline void walt_mark_task_starting(struct task_struct *p) { }
+static inline void walt_set_window_start(struct rq *rq) { }
+static inline void walt_migrate_sync_cpu(int cpu) { }
+static inline void walt_init_cpu_efficiency(void) { }
+static inline u64 walt_ktime_clock(void) { return 0; }
+
+#endif /* CONFIG_SCHED_WALT */
+
+extern unsigned int walt_disabled;
+
+#endif
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index a39025b02fd5..30c682adcdeb 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -629,7 +629,9 @@ static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd)
switch (action) {
case SECCOMP_RET_ERRNO:
- /* Set the low-order 16-bits as a errno. */
+ /* Set low-order bits as an errno, capped at MAX_ERRNO. */
+ if (data > MAX_ERRNO)
+ data = MAX_ERRNO;
syscall_set_return_value(current, task_pt_regs(current),
-data, 0);
goto skip;
diff --git a/kernel/signal.c b/kernel/signal.c
index 046c7f77cdcb..e4c4f6717939 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2501,7 +2501,7 @@ EXPORT_SYMBOL(unblock_all_signals);
*/
SYSCALL_DEFINE0(restart_syscall)
{
- struct restart_block *restart = &current_thread_info()->restart_block;
+ struct restart_block *restart = &current->restart_block;
return restart->fn(restart);
}
diff --git a/kernel/sys.c b/kernel/sys.c
index 1eaa2f0b0246..38a20992febf 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -41,6 +41,9 @@
#include <linux/syscore_ops.h>
#include <linux/version.h>
#include <linux/ctype.h>
+#include <linux/mm.h>
+#include <linux/mempolicy.h>
+#include <linux/sched.h>
#include <linux/compat.h>
#include <linux/syscalls.h>
@@ -2025,10 +2028,158 @@ static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
}
#endif
+#ifdef CONFIG_MMU
+static int prctl_update_vma_anon_name(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end,
+ const char __user *name_addr)
+{
+ struct mm_struct * mm = vma->vm_mm;
+ int error = 0;
+ pgoff_t pgoff;
+
+ if (name_addr == vma_get_anon_name(vma)) {
+ *prev = vma;
+ goto out;
+ }
+
+ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+ *prev = vma_merge(mm, *prev, start, end, vma->vm_flags, vma->anon_vma,
+ vma->vm_file, pgoff, vma_policy(vma),
+ name_addr);
+ if (*prev) {
+ vma = *prev;
+ goto success;
+ }
+
+ *prev = vma;
+
+ if (start != vma->vm_start) {
+ error = split_vma(mm, vma, start, 1);
+ if (error)
+ goto out;
+ }
+
+ if (end != vma->vm_end) {
+ error = split_vma(mm, vma, end, 0);
+ if (error)
+ goto out;
+ }
+
+success:
+ if (!vma->vm_file)
+ vma->shared.anon_name = name_addr;
+
+out:
+ if (error == -ENOMEM)
+ error = -EAGAIN;
+ return error;
+}
+
+static int prctl_set_vma_anon_name(unsigned long start, unsigned long end,
+ unsigned long arg)
+{
+ unsigned long tmp;
+ struct vm_area_struct * vma, *prev;
+ int unmapped_error = 0;
+ int error = -EINVAL;
+
+ /*
+ * If the interval [start,end) covers some unmapped address
+ * ranges, just ignore them, but return -ENOMEM at the end.
+ * - this matches the handling in madvise.
+ */
+ vma = find_vma_prev(current->mm, start, &prev);
+ if (vma && start > vma->vm_start)
+ prev = vma;
+
+ for (;;) {
+ /* Still start < end. */
+ error = -ENOMEM;
+ if (!vma)
+ return error;
+
+ /* Here start < (end|vma->vm_end). */
+ if (start < vma->vm_start) {
+ unmapped_error = -ENOMEM;
+ start = vma->vm_start;
+ if (start >= end)
+ return error;
+ }
+
+ /* Here vma->vm_start <= start < (end|vma->vm_end) */
+ tmp = vma->vm_end;
+ if (end < tmp)
+ tmp = end;
+
+ /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
+ error = prctl_update_vma_anon_name(vma, &prev, start, tmp,
+ (const char __user *)arg);
+ if (error)
+ return error;
+ start = tmp;
+ if (prev && start < prev->vm_end)
+ start = prev->vm_end;
+ error = unmapped_error;
+ if (start >= end)
+ return error;
+ if (prev)
+ vma = prev->vm_next;
+ else /* madvise_remove dropped mmap_sem */
+ vma = find_vma(current->mm, start);
+ }
+}
+
+static int prctl_set_vma(unsigned long opt, unsigned long start,
+ unsigned long len_in, unsigned long arg)
+{
+ struct mm_struct *mm = current->mm;
+ int error;
+ unsigned long len;
+ unsigned long end;
+
+ if (start & ~PAGE_MASK)
+ return -EINVAL;
+ len = (len_in + ~PAGE_MASK) & PAGE_MASK;
+
+ /* Check to see whether len was rounded up from small -ve to zero */
+ if (len_in && !len)
+ return -EINVAL;
+
+ end = start + len;
+ if (end < start)
+ return -EINVAL;
+
+ if (end == start)
+ return 0;
+
+ down_write(&mm->mmap_sem);
+
+ switch (opt) {
+ case PR_SET_VMA_ANON_NAME:
+ error = prctl_set_vma_anon_name(start, end, arg);
+ break;
+ default:
+ error = -EINVAL;
+ }
+
+ up_write(&mm->mmap_sem);
+
+ return error;
+}
+#else /* CONFIG_MMU */
+static int prctl_set_vma(unsigned long opt, unsigned long start,
+ unsigned long len_in, unsigned long arg)
+{
+ return -EINVAL;
+}
+#endif
+
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
{
struct task_struct *me = current;
+ struct task_struct *tsk;
unsigned char comm[sizeof(me->comm)];
long error;
@@ -2171,6 +2322,26 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_GET_TID_ADDRESS:
error = prctl_get_tid_address(me, (int __user **)arg2);
break;
+ case PR_SET_TIMERSLACK_PID:
+ if (task_pid_vnr(current) != (pid_t)arg3 &&
+ !capable(CAP_SYS_NICE))
+ return -EPERM;
+ rcu_read_lock();
+ tsk = find_task_by_vpid((pid_t)arg3);
+ if (tsk == NULL) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+ get_task_struct(tsk);
+ rcu_read_unlock();
+ if (arg2 <= 0)
+ tsk->timer_slack_ns =
+ tsk->default_timer_slack_ns;
+ else
+ tsk->timer_slack_ns = arg2;
+ put_task_struct(tsk);
+ error = 0;
+ break;
case PR_SET_CHILD_SUBREAPER:
me->signal->is_child_subreaper = !!arg2;
break;
@@ -2203,6 +2374,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
me->mm->def_flags &= ~VM_NOHUGEPAGE;
up_write(&me->mm->mmap_sem);
break;
+ case PR_SET_VMA:
+ error = prctl_set_vma(arg2, arg3, arg4, arg5);
+ break;
default:
error = -EINVAL;
break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4e7c3dea4b85..34eeb58aaa44 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -104,6 +104,8 @@ extern char core_pattern[];
extern unsigned int core_pipe_limit;
#endif
extern int pid_max;
+extern int extra_free_kbytes;
+extern int min_free_order_shift;
extern int pid_max_min, pid_max_max;
extern int percpu_pagelist_fraction;
extern int compat_log;
@@ -304,6 +306,64 @@ static struct ctl_table kern_table[] = {
.extra2 = &max_sched_granularity_ns,
},
{
+ .procname = "sched_is_big_little",
+ .data = &sysctl_sched_is_big_little,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#ifdef CONFIG_SCHED_WALT
+ {
+ .procname = "sched_use_walt_cpu_util",
+ .data = &sysctl_sched_use_walt_cpu_util,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sched_use_walt_task_util",
+ .data = &sysctl_sched_use_walt_task_util,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sched_walt_init_task_load_pct",
+ .data = &sysctl_sched_walt_init_task_load_pct,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sched_walt_cpu_high_irqload",
+ .data = &sysctl_sched_walt_cpu_high_irqload,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+ {
+ .procname = "sched_sync_hint_enable",
+ .data = &sysctl_sched_sync_hint_enable,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sched_initial_task_util",
+ .data = &sysctl_sched_initial_task_util,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sched_cstate_aware",
+ .data = &sysctl_sched_cstate_aware,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "sched_wakeup_granularity_ns",
.data = &sysctl_sched_wakeup_granularity,
.maxlen = sizeof(unsigned int),
@@ -443,6 +503,21 @@ static struct ctl_table kern_table[] = {
.extra1 = &one,
},
#endif
+#ifdef CONFIG_SCHED_TUNE
+ {
+ .procname = "sched_cfs_boost",
+ .data = &sysctl_sched_cfs_boost,
+ .maxlen = sizeof(sysctl_sched_cfs_boost),
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+ .mode = 0444,
+#else
+ .mode = 0644,
+#endif
+ .proc_handler = &sysctl_sched_cfs_boost_handler,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
+ },
+#endif
#ifdef CONFIG_PROVE_LOCKING
{
.procname = "prove_locking",
@@ -1308,6 +1383,21 @@ static struct ctl_table vm_table[] = {
.extra1 = &zero,
},
{
+ .procname = "extra_free_kbytes",
+ .data = &extra_free_kbytes,
+ .maxlen = sizeof(extra_free_kbytes),
+ .mode = 0644,
+ .proc_handler = min_free_kbytes_sysctl_handler,
+ .extra1 = &zero,
+ },
+ {
+ .procname = "min_free_order_shift",
+ .data = &min_free_order_shift,
+ .maxlen = sizeof(min_free_order_shift),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
.procname = "percpu_pagelist_fraction",
.data = &percpu_pagelist_fraction,
.maxlen = sizeof(percpu_pagelist_fraction),
@@ -1483,6 +1573,28 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
},
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
+ {
+ .procname = "mmap_rnd_bits",
+ .data = &mmap_rnd_bits,
+ .maxlen = sizeof(mmap_rnd_bits),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = (void *)&mmap_rnd_bits_min,
+ .extra2 = (void *)&mmap_rnd_bits_max,
+ },
+#endif
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
+ {
+ .procname = "mmap_rnd_compat_bits",
+ .data = &mmap_rnd_compat_bits,
+ .maxlen = sizeof(mmap_rnd_compat_bits),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = (void *)&mmap_rnd_compat_bits_min,
+ .extra2 = (void *)&mmap_rnd_compat_bits_max,
+ },
+#endif
{ }
};
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 119847b93ba6..0993964fdd12 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -797,7 +797,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
goto out;
}
- restart = &current_thread_info()->restart_block;
+ restart = &current->restart_block;
restart->fn = alarm_timer_nsleep_restart;
restart->nanosleep.clockid = type;
restart->nanosleep.expires = exp.tv64;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 210b84882935..95d26429123b 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1593,7 +1593,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
goto out;
}
- restart = &current_thread_info()->restart_block;
+ restart = &current->restart_block;
restart->fn = hrtimer_nanosleep_restart;
restart->nanosleep.clockid = t.timer.base->clockid;
restart->nanosleep.rmtp = rmtp;
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 731d7a342670..b6e20a95b6c8 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -853,10 +853,10 @@ static void check_thread_timers(struct task_struct *tsk,
/*
* Check for the special case thread timers.
*/
- soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
+ soft = READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
if (soft != RLIM_INFINITY) {
unsigned long hard =
- ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
+ READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
if (hard != RLIM_INFINITY &&
tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
@@ -959,11 +959,11 @@ static void check_process_timers(struct task_struct *tsk,
SIGPROF);
check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
SIGVTALRM);
- soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
+ soft = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
if (soft != RLIM_INFINITY) {
unsigned long psecs = cputime_to_secs(ptime);
unsigned long hard =
- ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
+ READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
cputime_t x;
if (psecs >= hard) {
/*
@@ -1335,8 +1335,7 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
struct timespec *rqtp, struct timespec __user *rmtp)
{
- struct restart_block *restart_block =
- &current_thread_info()->restart_block;
+ struct restart_block *restart_block = &current->restart_block;
struct itimerspec it;
int error;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index c596236607ae..9ba8641e835c 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -68,8 +68,8 @@ bool __read_mostly persistent_clock_exist = false;
static inline void tk_normalize_xtime(struct timekeeper *tk)
{
- while (tk->tkr.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr.shift)) {
- tk->tkr.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr.shift;
+ while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) {
+ tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
tk->xtime_sec++;
}
}
@@ -79,20 +79,20 @@ static inline struct timespec64 tk_xtime(struct timekeeper *tk)
struct timespec64 ts;
ts.tv_sec = tk->xtime_sec;
- ts.tv_nsec = (long)(tk->tkr.xtime_nsec >> tk->tkr.shift);
+ ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
return ts;
}
static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts)
{
tk->xtime_sec = ts->tv_sec;
- tk->tkr.xtime_nsec = (u64)ts->tv_nsec << tk->tkr.shift;
+ tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift;
}
static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
{
tk->xtime_sec += ts->tv_sec;
- tk->tkr.xtime_nsec += (u64)ts->tv_nsec << tk->tkr.shift;
+ tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift;
tk_normalize_xtime(tk);
}
@@ -135,11 +135,16 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
u64 tmp, ntpinterval;
struct clocksource *old_clock;
- old_clock = tk->tkr.clock;
- tk->tkr.clock = clock;
- tk->tkr.read = clock->read;
- tk->tkr.mask = clock->mask;
- tk->tkr.cycle_last = tk->tkr.read(clock);
+ old_clock = tk->tkr_mono.clock;
+ tk->tkr_mono.clock = clock;
+ tk->tkr_mono.read = clock->read;
+ tk->tkr_mono.mask = clock->mask;
+ tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock);
+
+ tk->tkr_raw.clock = clock;
+ tk->tkr_raw.read = clock->read;
+ tk->tkr_raw.mask = clock->mask;
+ tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last;
/* Do the ns -> cycle conversion first, using original mult */
tmp = NTP_INTERVAL_LENGTH;
@@ -163,11 +168,14 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
if (old_clock) {
int shift_change = clock->shift - old_clock->shift;
if (shift_change < 0)
- tk->tkr.xtime_nsec >>= -shift_change;
+ tk->tkr_mono.xtime_nsec >>= -shift_change;
else
- tk->tkr.xtime_nsec <<= shift_change;
+ tk->tkr_mono.xtime_nsec <<= shift_change;
}
- tk->tkr.shift = clock->shift;
+ tk->tkr_raw.xtime_nsec = 0;
+
+ tk->tkr_mono.shift = clock->shift;
+ tk->tkr_raw.shift = clock->shift;
tk->ntp_error = 0;
tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
@@ -178,7 +186,8 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
* active clocksource. These value will be adjusted via NTP
* to counteract clock drifting.
*/
- tk->tkr.mult = clock->mult;
+ tk->tkr_mono.mult = clock->mult;
+ tk->tkr_raw.mult = clock->mult;
tk->ntp_err_mult = 0;
}
@@ -208,25 +217,6 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
return nsec + arch_gettimeoffset();
}
-static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
-{
- struct clocksource *clock = tk->tkr.clock;
- cycle_t cycle_now, delta;
- s64 nsec;
-
- /* read clocksource: */
- cycle_now = tk->tkr.read(clock);
-
- /* calculate the delta since the last update_wall_time: */
- delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask);
-
- /* convert delta to nanoseconds. */
- nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift);
-
- /* If arch requires, add in get_arch_timeoffset() */
- return nsec + arch_gettimeoffset();
-}
-
/**
* update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
* @tk: The timekeeper from which we take the update
@@ -276,7 +266,7 @@ static void update_fast_timekeeper(struct timekeeper *tk)
raw_write_seqcount_latch(&tk_fast_mono.seq);
/* Update base[0] */
- memcpy(base, &tk->tkr, sizeof(*base));
+ memcpy(base, &tk->tkr_mono, sizeof(*base));
/* Force readers back to base[0] */
raw_write_seqcount_latch(&tk_fast_mono.seq);
@@ -326,7 +316,7 @@ u64 notrace ktime_get_mono_fast_ns(void)
do {
seq = raw_read_seqcount(&tk_fast_mono.seq);
tkr = tk_fast_mono.base + (seq & 0x01);
- now = ktime_to_ns(tkr->base_mono) + timekeeping_get_ns(tkr);
+ now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr);
} while (read_seqcount_retry(&tk_fast_mono.seq, seq));
return now;
@@ -341,8 +331,8 @@ static inline void update_vsyscall(struct timekeeper *tk)
xt = timespec64_to_timespec(tk_xtime(tk));
wm = timespec64_to_timespec(tk->wall_to_monotonic);
- update_vsyscall_old(&xt, &wm, tk->tkr.clock, tk->tkr.mult,
- tk->tkr.cycle_last);
+ update_vsyscall_old(&xt, &wm, tk->tkr_mono.clock, tk->tkr_mono.mult,
+ tk->tkr_mono.cycle_last);
}
static inline void old_vsyscall_fixup(struct timekeeper *tk)
@@ -359,11 +349,11 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk)
* (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
* users are removed, this can be killed.
*/
- remainder = tk->tkr.xtime_nsec & ((1ULL << tk->tkr.shift) - 1);
- tk->tkr.xtime_nsec -= remainder;
- tk->tkr.xtime_nsec += 1ULL << tk->tkr.shift;
+ remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1);
+ tk->tkr_mono.xtime_nsec -= remainder;
+ tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift;
tk->ntp_error += remainder << tk->ntp_error_shift;
- tk->ntp_error -= (1ULL << tk->tkr.shift) << tk->ntp_error_shift;
+ tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift;
}
#else
#define old_vsyscall_fixup(tk)
@@ -428,10 +418,10 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)
nsec = (s64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec);
nsec *= NSEC_PER_SEC;
nsec += tk->wall_to_monotonic.tv_nsec;
- tk->tkr.base_mono = ns_to_ktime(nsec);
+ tk->tkr_mono.base = ns_to_ktime(nsec);
/* Update the monotonic raw base */
- tk->base_raw = timespec64_to_ktime(tk->raw_time);
+ tk->tkr_raw.base = timespec64_to_ktime(tk->raw_time);
}
/* must hold timekeeper_lock */
@@ -463,22 +453,23 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
*/
static void timekeeping_forward_now(struct timekeeper *tk)
{
- struct clocksource *clock = tk->tkr.clock;
+ struct clocksource *clock = tk->tkr_mono.clock;
cycle_t cycle_now, delta;
s64 nsec;
- cycle_now = tk->tkr.read(clock);
- delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask);
- tk->tkr.cycle_last = cycle_now;
+ cycle_now = tk->tkr_mono.read(clock);
+ delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
+ tk->tkr_mono.cycle_last = cycle_now;
+ tk->tkr_raw.cycle_last = cycle_now;
- tk->tkr.xtime_nsec += delta * tk->tkr.mult;
+ tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult;
/* If arch requires, add in get_arch_timeoffset() */
- tk->tkr.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr.shift;
+ tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift;
tk_normalize_xtime(tk);
- nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift);
+ nsec = clocksource_cyc2ns(delta, tk->tkr_raw.mult, tk->tkr_raw.shift);
timespec64_add_ns(&tk->raw_time, nsec);
}
@@ -499,7 +490,7 @@ int __getnstimeofday64(struct timespec64 *ts)
seq = read_seqcount_begin(&tk_core.seq);
ts->tv_sec = tk->xtime_sec;
- nsecs = timekeeping_get_ns(&tk->tkr);
+ nsecs = timekeeping_get_ns(&tk->tkr_mono);
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -539,8 +530,8 @@ ktime_t ktime_get(void)
do {
seq = read_seqcount_begin(&tk_core.seq);
- base = tk->tkr.base_mono;
- nsecs = timekeeping_get_ns(&tk->tkr);
+ base = tk->tkr_mono.base;
+ nsecs = timekeeping_get_ns(&tk->tkr_mono);
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -565,8 +556,8 @@ ktime_t ktime_get_with_offset(enum tk_offsets offs)
do {
seq = read_seqcount_begin(&tk_core.seq);
- base = ktime_add(tk->tkr.base_mono, *offset);
- nsecs = timekeeping_get_ns(&tk->tkr);
+ base = ktime_add(tk->tkr_mono.base, *offset);
+ nsecs = timekeeping_get_ns(&tk->tkr_mono);
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -607,8 +598,8 @@ ktime_t ktime_get_raw(void)
do {
seq = read_seqcount_begin(&tk_core.seq);
- base = tk->base_raw;
- nsecs = timekeeping_get_ns_raw(tk);
+ base = tk->tkr_raw.base;
+ nsecs = timekeeping_get_ns(&tk->tkr_raw);
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -636,7 +627,7 @@ void ktime_get_ts64(struct timespec64 *ts)
do {
seq = read_seqcount_begin(&tk_core.seq);
ts->tv_sec = tk->xtime_sec;
- nsec = timekeeping_get_ns(&tk->tkr);
+ nsec = timekeeping_get_ns(&tk->tkr_mono);
tomono = tk->wall_to_monotonic;
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -673,8 +664,8 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
ts_real->tv_sec = tk->xtime_sec;
ts_real->tv_nsec = 0;
- nsecs_raw = timekeeping_get_ns_raw(tk);
- nsecs_real = timekeeping_get_ns(&tk->tkr);
+ nsecs_raw = timekeeping_get_ns(&tk->tkr_raw);
+ nsecs_real = timekeeping_get_ns(&tk->tkr_mono);
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -858,7 +849,7 @@ static int change_clocksource(void *data)
*/
if (try_module_get(new->owner)) {
if (!new->enable || new->enable(new) == 0) {
- old = tk->tkr.clock;
+ old = tk->tkr_mono.clock;
tk_setup_internals(tk, new);
if (old->disable)
old->disable(old);
@@ -886,11 +877,11 @@ int timekeeping_notify(struct clocksource *clock)
{
struct timekeeper *tk = &tk_core.timekeeper;
- if (tk->tkr.clock == clock)
+ if (tk->tkr_mono.clock == clock)
return 0;
stop_machine(change_clocksource, clock, NULL);
tick_clock_notify();
- return tk->tkr.clock == clock ? 0 : -1;
+ return tk->tkr_mono.clock == clock ? 0 : -1;
}
/**
@@ -908,7 +899,7 @@ void getrawmonotonic(struct timespec *ts)
do {
seq = read_seqcount_begin(&tk_core.seq);
- nsecs = timekeeping_get_ns_raw(tk);
+ nsecs = timekeeping_get_ns(&tk->tkr_raw);
ts64 = tk->raw_time;
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -930,7 +921,7 @@ int timekeeping_valid_for_hres(void)
do {
seq = read_seqcount_begin(&tk_core.seq);
- ret = tk->tkr.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
+ ret = tk->tkr_mono.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -949,7 +940,7 @@ u64 timekeeping_max_deferment(void)
do {
seq = read_seqcount_begin(&tk_core.seq);
- ret = tk->tkr.clock->max_idle_ns;
+ ret = tk->tkr_mono.clock->max_idle_ns;
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -1028,7 +1019,6 @@ void __init timekeeping_init(void)
tk_set_xtime(tk, &now);
tk->raw_time.tv_sec = 0;
tk->raw_time.tv_nsec = 0;
- tk->base_raw.tv64 = 0;
if (boot.tv_sec == 0 && boot.tv_nsec == 0)
boot = tk_xtime(tk);
@@ -1116,7 +1106,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
static void timekeeping_resume(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
- struct clocksource *clock = tk->tkr.clock;
+ struct clocksource *clock = tk->tkr_mono.clock;
unsigned long flags;
struct timespec64 ts_new, ts_delta;
struct timespec tmp;
@@ -1144,16 +1134,16 @@ static void timekeeping_resume(void)
* The less preferred source will only be tried if there is no better
* usable source. The rtc part is handled separately in rtc core code.
*/
- cycle_now = tk->tkr.read(clock);
+ cycle_now = tk->tkr_mono.read(clock);
if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&
- cycle_now > tk->tkr.cycle_last) {
+ cycle_now > tk->tkr_mono.cycle_last) {
u64 num, max = ULLONG_MAX;
u32 mult = clock->mult;
u32 shift = clock->shift;
s64 nsec = 0;
- cycle_delta = clocksource_delta(cycle_now, tk->tkr.cycle_last,
- tk->tkr.mask);
+ cycle_delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last,
+ tk->tkr_mono.mask);
/*
* "cycle_delta * mutl" may cause 64 bits overflow, if the
@@ -1179,7 +1169,9 @@ static void timekeeping_resume(void)
__timekeeping_inject_sleeptime(tk, &ts_delta);
/* Re-base the last cycle value */
- tk->tkr.cycle_last = cycle_now;
+ tk->tkr_mono.cycle_last = cycle_now;
+ tk->tkr_raw.cycle_last = cycle_now;
+
tk->ntp_error = 0;
timekeeping_suspended = 0;
timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
@@ -1331,9 +1323,9 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
*
* XXX - TODO: Doc ntp_error calculation.
*/
- tk->tkr.mult += mult_adj;
+ tk->tkr_mono.mult += mult_adj;
tk->xtime_interval += interval;
- tk->tkr.xtime_nsec -= offset;
+ tk->tkr_mono.xtime_nsec -= offset;
tk->ntp_error -= (interval - offset) << tk->ntp_error_shift;
}
@@ -1395,12 +1387,13 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
tk->ntp_err_mult = 0;
}
- if (unlikely(tk->tkr.clock->maxadj &&
- (tk->tkr.mult > tk->tkr.clock->mult + tk->tkr.clock->maxadj))) {
+ if (unlikely(tk->tkr_mono.clock->maxadj &&
+ (abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult)
+ > tk->tkr_mono.clock->maxadj))) {
printk_once(KERN_WARNING
"Adjusting %s more than 11%% (%ld vs %ld)\n",
- tk->tkr.clock->name, (long)tk->tkr.mult,
- (long)tk->tkr.clock->mult + tk->tkr.clock->maxadj);
+ tk->tkr_mono.clock->name, (long)tk->tkr_mono.mult,
+ (long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj);
}
/*
@@ -1417,9 +1410,9 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
* We'll correct this error next time through this function, when
* xtime_nsec is not as small.
*/
- if (unlikely((s64)tk->tkr.xtime_nsec < 0)) {
- s64 neg = -(s64)tk->tkr.xtime_nsec;
- tk->tkr.xtime_nsec = 0;
+ if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) {
+ s64 neg = -(s64)tk->tkr_mono.xtime_nsec;
+ tk->tkr_mono.xtime_nsec = 0;
tk->ntp_error += neg << tk->ntp_error_shift;
}
}
@@ -1434,13 +1427,13 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
*/
static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
{
- u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr.shift;
+ u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
unsigned int clock_set = 0;
- while (tk->tkr.xtime_nsec >= nsecps) {
+ while (tk->tkr_mono.xtime_nsec >= nsecps) {
int leap;
- tk->tkr.xtime_nsec -= nsecps;
+ tk->tkr_mono.xtime_nsec -= nsecps;
tk->xtime_sec++;
/* Figure out if its a leap sec and apply if needed */
@@ -1485,9 +1478,10 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
/* Accumulate one shifted interval */
offset -= interval;
- tk->tkr.cycle_last += interval;
+ tk->tkr_mono.cycle_last += interval;
+ tk->tkr_raw.cycle_last += interval;
- tk->tkr.xtime_nsec += tk->xtime_interval << shift;
+ tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift;
*clock_set |= accumulate_nsecs_to_secs(tk);
/* Accumulate raw time */
@@ -1530,8 +1524,8 @@ void update_wall_time(void)
#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
offset = real_tk->cycle_interval;
#else
- offset = clocksource_delta(tk->tkr.read(tk->tkr.clock),
- tk->tkr.cycle_last, tk->tkr.mask);
+ offset = clocksource_delta(tk->tkr_mono.read(tk->tkr_mono.clock),
+ tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
#endif
/* Check if there's really nothing to do */
@@ -1692,8 +1686,8 @@ ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot,
do {
seq = read_seqcount_begin(&tk_core.seq);
- base = tk->tkr.base_mono;
- nsecs = tk->tkr.xtime_nsec >> tk->tkr.shift;
+ base = tk->tkr_mono.base;
+ nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
*offs_real = tk->offs_real;
*offs_boot = tk->offs_boot;
@@ -1724,8 +1718,8 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot,
do {
seq = read_seqcount_begin(&tk_core.seq);
- base = tk->tkr.base_mono;
- nsecs = timekeeping_get_ns(&tk->tkr);
+ base = tk->tkr_mono.base;
+ nsecs = timekeeping_get_ns(&tk->tkr_mono);
*offs_real = tk->offs_real;
*offs_boot = tk->offs_boot;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a5da09c899dd..37e1fb831d95 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -77,6 +77,9 @@ config EVENT_TRACING
select CONTEXT_SWITCH_TRACER
bool
+config GPU_TRACEPOINTS
+ bool
+
config CONTEXT_SWITCH_TRACER
bool
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 67d6369ddf83..5da3cfb1bb84 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -63,6 +63,7 @@ obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
endif
obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o
+obj-$(CONFIG_GPU_TRACEPOINTS) += gpu-traces.o
obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f01d4ecdf5ab..a53787c9bd51 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -18,7 +18,7 @@
#include <linux/kallsyms.h>
#include <linux/seq_file.h>
#include <linux/suspend.h>
-#include <linux/debugfs.h>
+#include <linux/tracefs.h>
#include <linux/hardirq.h>
#include <linux/kthread.h>
#include <linux/uaccess.h>
@@ -1002,7 +1002,7 @@ static struct tracer_stat function_stats __initdata = {
.stat_show = function_stat_show
};
-static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
+static __init void ftrace_profile_tracefs(struct dentry *d_tracer)
{
struct ftrace_profile_stat *stat;
struct dentry *entry;
@@ -1038,15 +1038,15 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
}
}
- entry = debugfs_create_file("function_profile_enabled", 0644,
+ entry = tracefs_create_file("function_profile_enabled", 0644,
d_tracer, NULL, &ftrace_profile_fops);
if (!entry)
- pr_warning("Could not create debugfs "
+ pr_warning("Could not create tracefs "
"'function_profile_enabled' entry\n");
}
#else /* CONFIG_FUNCTION_PROFILER */
-static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
+static __init void ftrace_profile_tracefs(struct dentry *d_tracer)
{
}
#endif /* CONFIG_FUNCTION_PROFILER */
@@ -4499,7 +4499,7 @@ void ftrace_destroy_filter_files(struct ftrace_ops *ops)
mutex_unlock(&ftrace_lock);
}
-static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
+static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer)
{
trace_create_file("available_filter_functions", 0444,
@@ -4781,7 +4781,7 @@ static int __init ftrace_nodyn_init(void)
}
core_initcall(ftrace_nodyn_init);
-static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
+static inline int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { return 0; }
static inline void ftrace_startup_enable(int command) { }
static inline void ftrace_startup_all(int command) { }
/* Keep as macros so we do not need to define the commands */
@@ -5230,24 +5230,24 @@ static const struct file_operations ftrace_pid_fops = {
.release = ftrace_pid_release,
};
-static __init int ftrace_init_debugfs(void)
+static __init int ftrace_init_tracefs(void)
{
struct dentry *d_tracer;
d_tracer = tracing_init_dentry();
- if (!d_tracer)
+ if (IS_ERR(d_tracer))
return 0;
- ftrace_init_dyn_debugfs(d_tracer);
+ ftrace_init_dyn_tracefs(d_tracer);
trace_create_file("set_ftrace_pid", 0644, d_tracer,
NULL, &ftrace_pid_fops);
- ftrace_profile_debugfs(d_tracer);
+ ftrace_profile_tracefs(d_tracer);
return 0;
}
-fs_initcall(ftrace_init_debugfs);
+fs_initcall(ftrace_init_tracefs);
/**
* ftrace_kill - kill ftrace
diff --git a/kernel/trace/gpu-traces.c b/kernel/trace/gpu-traces.c
new file mode 100644
index 000000000000..a4b3f00faee3
--- /dev/null
+++ b/kernel/trace/gpu-traces.c
@@ -0,0 +1,23 @@
+/*
+ * GPU tracepoints
+ *
+ * Copyright (C) 2013 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/module.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/gpu.h>
+
+EXPORT_TRACEPOINT_SYMBOL(gpu_sched_switch);
+EXPORT_TRACEPOINT_SYMBOL(gpu_job_enqueue);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 111dfd986dbf..edb0812e4873 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -20,6 +20,7 @@
#include <linux/notifier.h>
#include <linux/irqflags.h>
#include <linux/debugfs.h>
+#include <linux/tracefs.h>
#include <linux/pagemap.h>
#include <linux/hardirq.h>
#include <linux/linkage.h>
@@ -31,6 +32,7 @@
#include <linux/splice.h>
#include <linux/kdebug.h>
#include <linux/string.h>
+#include <linux/mount.h>
#include <linux/rwsem.h>
#include <linux/slab.h>
#include <linux/ctype.h>
@@ -812,6 +814,7 @@ static const char *trace_options[] = {
"irq-info",
"markers",
"function-trace",
+ "print-tgid",
NULL
};
@@ -1283,6 +1286,7 @@ static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
struct saved_cmdlines_buffer {
unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
unsigned *map_cmdline_to_pid;
+ unsigned *map_cmdline_to_tgid;
unsigned cmdline_num;
int cmdline_idx;
char *saved_cmdlines;
@@ -1316,12 +1320,23 @@ static int allocate_cmdlines_buffer(unsigned int val,
return -ENOMEM;
}
+ s->map_cmdline_to_tgid = kmalloc_array(val,
+ sizeof(*s->map_cmdline_to_tgid),
+ GFP_KERNEL);
+ if (!s->map_cmdline_to_tgid) {
+ kfree(s->map_cmdline_to_pid);
+ kfree(s->saved_cmdlines);
+ return -ENOMEM;
+ }
+
s->cmdline_idx = 0;
s->cmdline_num = val;
memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP,
sizeof(s->map_pid_to_cmdline));
memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP,
val * sizeof(*s->map_cmdline_to_pid));
+ memset(s->map_cmdline_to_tgid, NO_CMDLINE_MAP,
+ val * sizeof(*s->map_cmdline_to_tgid));
return 0;
}
@@ -1487,14 +1502,17 @@ static int trace_save_cmdline(struct task_struct *tsk)
if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT))
return 0;
+ preempt_disable();
/*
* It's not the end of the world if we don't get
* the lock, but we also don't want to spin
* nor do we want to disable interrupts,
* so if we miss here, then better luck next time.
*/
- if (!arch_spin_trylock(&trace_cmdline_lock))
+ if (!arch_spin_trylock(&trace_cmdline_lock)) {
+ preempt_enable();
return 0;
+ }
idx = savedcmd->map_pid_to_cmdline[tsk->pid];
if (idx == NO_CMDLINE_MAP) {
@@ -1517,8 +1535,9 @@ static int trace_save_cmdline(struct task_struct *tsk)
}
set_cmdline(idx, tsk->comm);
-
+ savedcmd->map_cmdline_to_tgid[idx] = tsk->tgid;
arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
return 1;
}
@@ -1560,6 +1579,35 @@ void trace_find_cmdline(int pid, char comm[])
preempt_enable();
}
+static int __find_tgid_locked(int pid)
+{
+ unsigned map;
+ int tgid;
+
+ map = savedcmd->map_pid_to_cmdline[pid];
+ if (map != NO_CMDLINE_MAP)
+ tgid = savedcmd->map_cmdline_to_tgid[map];
+ else
+ tgid = -1;
+
+ return tgid;
+}
+
+int trace_find_tgid(int pid)
+{
+ int tgid;
+
+ preempt_disable();
+ arch_spin_lock(&trace_cmdline_lock);
+
+ tgid = __find_tgid_locked(pid);
+
+ arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
+
+ return tgid;
+}
+
void tracing_record_cmdline(struct task_struct *tsk)
{
if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on())
@@ -2537,6 +2585,13 @@ static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m)
seq_puts(m, "# | | | | |\n");
}
+static void print_func_help_header_tgid(struct trace_buffer *buf, struct seq_file *m)
+{
+ print_event_info(buf, m);
+ seq_puts(m, "# TASK-PID TGID CPU# TIMESTAMP FUNCTION\n");
+ seq_puts(m, "# | | | | | |\n");
+}
+
static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m)
{
print_event_info(buf, m);
@@ -2549,6 +2604,18 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file
seq_puts(m, "# | | | |||| | |\n");
}
+static void print_func_help_header_irq_tgid(struct trace_buffer *buf, struct seq_file *m)
+{
+ print_event_info(buf, m);
+ seq_puts(m, "# _-----=> irqs-off\n");
+ seq_puts(m, "# / _----=> need-resched\n");
+ seq_puts(m, "# | / _---=> hardirq/softirq\n");
+ seq_puts(m, "# || / _--=> preempt-depth\n");
+ seq_puts(m, "# ||| / delay\n");
+ seq_puts(m, "# TASK-PID TGID CPU# |||| TIMESTAMP FUNCTION\n");
+ seq_puts(m, "# | | | | |||| | |\n");
+}
+
void
print_trace_header(struct seq_file *m, struct trace_iterator *iter)
{
@@ -2849,9 +2916,15 @@ void trace_default_header(struct seq_file *m)
} else {
if (!(trace_flags & TRACE_ITER_VERBOSE)) {
if (trace_flags & TRACE_ITER_IRQ_INFO)
- print_func_help_header_irq(iter->trace_buffer, m);
+ if (trace_flags & TRACE_ITER_TGID)
+ print_func_help_header_irq_tgid(iter->trace_buffer, m);
+ else
+ print_func_help_header_irq(iter->trace_buffer, m);
else
- print_func_help_header(iter->trace_buffer, m);
+ if (trace_flags & TRACE_ITER_TGID)
+ print_func_help_header_tgid(iter->trace_buffer, m);
+ else
+ print_func_help_header(iter->trace_buffer, m);
}
}
}
@@ -3829,10 +3902,15 @@ tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf,
{
char buf[64];
int r;
+ unsigned int n;
+ preempt_disable();
arch_spin_lock(&trace_cmdline_lock);
- r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num);
+ n = savedcmd->cmdline_num;
arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
+
+ r = scnprintf(buf, sizeof(buf), "%u\n", n);
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}
@@ -3841,6 +3919,7 @@ static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
{
kfree(s->saved_cmdlines);
kfree(s->map_cmdline_to_pid);
+ kfree(s->map_cmdline_to_tgid);
kfree(s);
}
@@ -3857,10 +3936,12 @@ static int tracing_resize_saved_cmdlines(unsigned int val)
return -ENOMEM;
}
+ preempt_disable();
arch_spin_lock(&trace_cmdline_lock);
savedcmd_temp = savedcmd;
savedcmd = s;
arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
free_saved_cmdlines_buffer(savedcmd_temp);
return 0;
@@ -3897,6 +3978,78 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = {
};
static ssize_t
+tracing_saved_tgids_read(struct file *file, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char *file_buf;
+ char *buf;
+ int len = 0;
+ int i;
+ int *pids;
+ int n = 0;
+
+ preempt_disable();
+ arch_spin_lock(&trace_cmdline_lock);
+
+ pids = kmalloc_array(savedcmd->cmdline_num, 2*sizeof(int), GFP_KERNEL);
+ if (!pids) {
+ arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < savedcmd->cmdline_num; i++) {
+ int pid;
+
+ pid = savedcmd->map_cmdline_to_pid[i];
+ if (pid == -1 || pid == NO_CMDLINE_MAP)
+ continue;
+
+ pids[n] = pid;
+ pids[n+1] = __find_tgid_locked(pid);
+ n += 2;
+ }
+ arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
+
+ if (n == 0) {
+ kfree(pids);
+ return 0;
+ }
+
+ /* enough to hold max pair of pids + space, lr and nul */
+ len = n * 12;
+ file_buf = kmalloc(len, GFP_KERNEL);
+ if (!file_buf) {
+ kfree(pids);
+ return -ENOMEM;
+ }
+
+ buf = file_buf;
+ for (i = 0; i < n && len > 0; i += 2) {
+ int r;
+
+ r = snprintf(buf, len, "%d %d\n", pids[i], pids[i+1]);
+ buf += r;
+ len -= r;
+ }
+
+ len = simple_read_from_buffer(ubuf, cnt, ppos,
+ file_buf, buf - file_buf);
+
+ kfree(file_buf);
+ kfree(pids);
+
+ return len;
+}
+
+static const struct file_operations tracing_saved_tgids_fops = {
+ .open = tracing_open_generic,
+ .read = tracing_saved_tgids_read,
+ .llseek = generic_file_llseek,
+};
+
+static ssize_t
tracing_set_trace_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
@@ -5832,28 +5985,19 @@ static __init int register_snapshot_cmd(void)
static inline __init int register_snapshot_cmd(void) { return 0; }
#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */
-struct dentry *tracing_init_dentry_tr(struct trace_array *tr)
+static struct dentry *tracing_get_dentry(struct trace_array *tr)
{
- if (tr->dir)
- return tr->dir;
-
- if (!debugfs_initialized())
- return NULL;
+ if (WARN_ON(!tr->dir))
+ return ERR_PTR(-ENODEV);
+ /* Top directory uses NULL as the parent */
if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
- tr->dir = debugfs_create_dir("tracing", NULL);
-
- if (!tr->dir)
- pr_warn_once("Could not create debugfs directory 'tracing'\n");
+ return NULL;
+ /* All sub buffers have a descriptor */
return tr->dir;
}
-struct dentry *tracing_init_dentry(void)
-{
- return tracing_init_dentry_tr(&global_trace);
-}
-
static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
{
struct dentry *d_tracer;
@@ -5861,14 +6005,14 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
if (tr->percpu_dir)
return tr->percpu_dir;
- d_tracer = tracing_init_dentry_tr(tr);
- if (!d_tracer)
+ d_tracer = tracing_get_dentry(tr);
+ if (IS_ERR(d_tracer))
return NULL;
- tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer);
+ tr->percpu_dir = tracefs_create_dir("per_cpu", d_tracer);
WARN_ONCE(!tr->percpu_dir,
- "Could not create debugfs directory 'per_cpu/%d'\n", cpu);
+ "Could not create tracefs directory 'per_cpu/%d'\n", cpu);
return tr->percpu_dir;
}
@@ -5885,7 +6029,7 @@ trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent,
}
static void
-tracing_init_debugfs_percpu(struct trace_array *tr, long cpu)
+tracing_init_tracefs_percpu(struct trace_array *tr, long cpu)
{
struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu);
struct dentry *d_cpu;
@@ -5895,9 +6039,9 @@ tracing_init_debugfs_percpu(struct trace_array *tr, long cpu)
return;
snprintf(cpu_dir, 30, "cpu%ld", cpu);
- d_cpu = debugfs_create_dir(cpu_dir, d_percpu);
+ d_cpu = tracefs_create_dir(cpu_dir, d_percpu);
if (!d_cpu) {
- pr_warning("Could not create debugfs '%s' entry\n", cpu_dir);
+ pr_warning("Could not create tracefs '%s' entry\n", cpu_dir);
return;
}
@@ -6049,9 +6193,9 @@ struct dentry *trace_create_file(const char *name,
{
struct dentry *ret;
- ret = debugfs_create_file(name, mode, parent, data, fops);
+ ret = tracefs_create_file(name, mode, parent, data, fops);
if (!ret)
- pr_warning("Could not create debugfs '%s' entry\n", name);
+ pr_warning("Could not create tracefs '%s' entry\n", name);
return ret;
}
@@ -6064,13 +6208,13 @@ static struct dentry *trace_options_init_dentry(struct trace_array *tr)
if (tr->options)
return tr->options;
- d_tracer = tracing_init_dentry_tr(tr);
- if (!d_tracer)
+ d_tracer = tracing_get_dentry(tr);
+ if (IS_ERR(d_tracer))
return NULL;
- tr->options = debugfs_create_dir("options", d_tracer);
+ tr->options = tracefs_create_dir("options", d_tracer);
if (!tr->options) {
- pr_warning("Could not create debugfs directory 'options'\n");
+ pr_warning("Could not create tracefs directory 'options'\n");
return NULL;
}
@@ -6139,7 +6283,7 @@ destroy_trace_option_files(struct trace_option_dentry *topts)
return;
for (cnt = 0; topts[cnt].opt; cnt++)
- debugfs_remove(topts[cnt].entry);
+ tracefs_remove(topts[cnt].entry);
kfree(topts);
}
@@ -6228,7 +6372,7 @@ static const struct file_operations rb_simple_fops = {
struct dentry *trace_instance_dir;
static void
-init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer);
+init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer);
static int
allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size)
@@ -6305,7 +6449,7 @@ static void free_trace_buffers(struct trace_array *tr)
#endif
}
-static int new_instance_create(const char *name)
+static int instance_mkdir(const char *name)
{
struct trace_array *tr;
int ret;
@@ -6344,17 +6488,17 @@ static int new_instance_create(const char *name)
if (allocate_trace_buffers(tr, trace_buf_size) < 0)
goto out_free_tr;
- tr->dir = debugfs_create_dir(name, trace_instance_dir);
+ tr->dir = tracefs_create_dir(name, trace_instance_dir);
if (!tr->dir)
goto out_free_tr;
ret = event_trace_add_tracer(tr->dir, tr);
if (ret) {
- debugfs_remove_recursive(tr->dir);
+ tracefs_remove_recursive(tr->dir);
goto out_free_tr;
}
- init_tracer_debugfs(tr, tr->dir);
+ init_tracer_tracefs(tr, tr->dir);
list_add(&tr->list, &ftrace_trace_arrays);
@@ -6375,7 +6519,7 @@ static int new_instance_create(const char *name)
}
-static int instance_delete(const char *name)
+static int instance_rmdir(const char *name)
{
struct trace_array *tr;
int found = 0;
@@ -6417,82 +6561,17 @@ static int instance_delete(const char *name)
return ret;
}
-static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t mode)
-{
- struct dentry *parent;
- int ret;
-
- /* Paranoid: Make sure the parent is the "instances" directory */
- parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
- if (WARN_ON_ONCE(parent != trace_instance_dir))
- return -ENOENT;
-
- /*
- * The inode mutex is locked, but debugfs_create_dir() will also
- * take the mutex. As the instances directory can not be destroyed
- * or changed in any other way, it is safe to unlock it, and
- * let the dentry try. If two users try to make the same dir at
- * the same time, then the new_instance_create() will determine the
- * winner.
- */
- mutex_unlock(&inode->i_mutex);
-
- ret = new_instance_create(dentry->d_iname);
-
- mutex_lock(&inode->i_mutex);
-
- return ret;
-}
-
-static int instance_rmdir(struct inode *inode, struct dentry *dentry)
-{
- struct dentry *parent;
- int ret;
-
- /* Paranoid: Make sure the parent is the "instances" directory */
- parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
- if (WARN_ON_ONCE(parent != trace_instance_dir))
- return -ENOENT;
-
- /* The caller did a dget() on dentry */
- mutex_unlock(&dentry->d_inode->i_mutex);
-
- /*
- * The inode mutex is locked, but debugfs_create_dir() will also
- * take the mutex. As the instances directory can not be destroyed
- * or changed in any other way, it is safe to unlock it, and
- * let the dentry try. If two users try to make the same dir at
- * the same time, then the instance_delete() will determine the
- * winner.
- */
- mutex_unlock(&inode->i_mutex);
-
- ret = instance_delete(dentry->d_iname);
-
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
- mutex_lock(&dentry->d_inode->i_mutex);
-
- return ret;
-}
-
-static const struct inode_operations instance_dir_inode_operations = {
- .lookup = simple_lookup,
- .mkdir = instance_mkdir,
- .rmdir = instance_rmdir,
-};
-
static __init void create_trace_instances(struct dentry *d_tracer)
{
- trace_instance_dir = debugfs_create_dir("instances", d_tracer);
+ trace_instance_dir = tracefs_create_instance_dir("instances", d_tracer,
+ instance_mkdir,
+ instance_rmdir);
if (WARN_ON(!trace_instance_dir))
return;
-
- /* Hijack the dir inode operations, to allow mkdir */
- trace_instance_dir->d_inode->i_op = &instance_dir_inode_operations;
}
static void
-init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
+init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
{
int cpu;
@@ -6526,6 +6605,9 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
trace_create_file("trace_marker", 0220, d_tracer,
tr, &tracing_mark_fops);
+ trace_create_file("saved_tgids", 0444, d_tracer,
+ tr, &tracing_saved_tgids_fops);
+
trace_create_file("trace_clock", 0644, d_tracer, tr,
&trace_clock_fops);
@@ -6546,21 +6628,77 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
#endif
for_each_tracing_cpu(cpu)
- tracing_init_debugfs_percpu(tr, cpu);
+ tracing_init_tracefs_percpu(tr, cpu);
+
+}
+
+static struct vfsmount *trace_automount(void *ingore)
+{
+ struct vfsmount *mnt;
+ struct file_system_type *type;
+
+ /*
+ * To maintain backward compatibility for tools that mount
+ * debugfs to get to the tracing facility, tracefs is automatically
+ * mounted to the debugfs/tracing directory.
+ */
+ type = get_fs_type("tracefs");
+ if (!type)
+ return NULL;
+ mnt = vfs_kern_mount(type, 0, "tracefs", NULL);
+ put_filesystem(type);
+ if (IS_ERR(mnt))
+ return NULL;
+ mntget(mnt);
+
+ return mnt;
+}
+
+/**
+ * tracing_init_dentry - initialize top level trace array
+ *
+ * This is called when creating files or directories in the tracing
+ * directory. It is called via fs_initcall() by any of the boot up code
+ * and expects to return the dentry of the top level tracing directory.
+ */
+struct dentry *tracing_init_dentry(void)
+{
+ struct trace_array *tr = &global_trace;
+
+ /* The top level trace array uses NULL as parent */
+ if (tr->dir)
+ return NULL;
+
+ if (WARN_ON(!debugfs_initialized()))
+ return ERR_PTR(-ENODEV);
+ /*
+ * As there may still be users that expect the tracing
+ * files to exist in debugfs/tracing, we must automount
+ * the tracefs file system there, so older tools still
+ * work with the newer kerenl.
+ */
+ tr->dir = debugfs_create_automount("tracing", NULL,
+ trace_automount, NULL);
+ if (!tr->dir) {
+ pr_warn_once("Could not create debugfs directory 'tracing'\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return NULL;
}
-static __init int tracer_init_debugfs(void)
+static __init int tracer_init_tracefs(void)
{
struct dentry *d_tracer;
trace_access_lock_init();
d_tracer = tracing_init_dentry();
- if (!d_tracer)
+ if (IS_ERR(d_tracer))
return 0;
- init_tracer_debugfs(&global_trace, d_tracer);
+ init_tracer_tracefs(&global_trace, d_tracer);
trace_create_file("tracing_thresh", 0644, d_tracer,
&global_trace, &tracing_thresh_fops);
@@ -6782,7 +6920,6 @@ __init static int tracer_alloc_buffers(void)
int ring_buf_size;
int ret = -ENOMEM;
-
if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
goto out;
@@ -6880,6 +7017,13 @@ out:
return ret;
}
+void __init trace_init(void)
+{
+ tracer_alloc_buffers();
+ init_ftrace_syscalls();
+ trace_event_init();
+}
+
__init static int clear_boot_tracer(void)
{
/*
@@ -6899,6 +7043,5 @@ __init static int clear_boot_tracer(void)
return 0;
}
-early_initcall(tracer_alloc_buffers);
-fs_initcall(tracer_init_debugfs);
+fs_initcall(tracer_init_tracefs);
late_initcall(clear_boot_tracer);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 5642436bf97e..9abe01196b6c 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -333,7 +333,7 @@ struct tracer_flags {
/**
- * struct tracer - a specific tracer and its callbacks to interact with debugfs
+ * struct tracer - a specific tracer and its callbacks to interact with tracefs
* @name: the name chosen to select it on the available_tracers file
* @init: called when one switches to this tracer (echo name > current_tracer)
* @reset: called when one switches to another tracer
@@ -541,7 +541,6 @@ struct dentry *trace_create_file(const char *name,
void *data,
const struct file_operations *fops);
-struct dentry *tracing_init_dentry_tr(struct trace_array *tr);
struct dentry *tracing_init_dentry(void);
struct ring_buffer_event;
@@ -659,6 +658,7 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
extern cycle_t ftrace_now(int cpu);
extern void trace_find_cmdline(int pid, char comm[]);
+extern int trace_find_tgid(int pid);
#ifdef CONFIG_DYNAMIC_FTRACE
extern unsigned long ftrace_update_tot_cnt;
@@ -939,6 +939,7 @@ enum trace_iterator_flags {
TRACE_ITER_IRQ_INFO = 0x800000,
TRACE_ITER_MARKERS = 0x1000000,
TRACE_ITER_FUNCTION = 0x2000000,
+ TRACE_ITER_TGID = 0x4000000,
};
/*
@@ -1311,4 +1312,17 @@ int perf_ftrace_event_register(struct ftrace_event_call *call,
#define perf_ftrace_event_register NULL
#endif
+#ifdef CONFIG_FTRACE_SYSCALLS
+void init_ftrace_syscalls(void);
+#else
+static inline void init_ftrace_syscalls(void) { }
+#endif
+
+#ifdef CONFIG_EVENT_TRACING
+void trace_event_init(void);
+#else
+static inline void __init trace_event_init(void) { }
+#endif
+
+
#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 51c47bc832d4..2d0b33287687 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -13,7 +13,7 @@
#include <linux/workqueue.h>
#include <linux/spinlock.h>
#include <linux/kthread.h>
-#include <linux/debugfs.h>
+#include <linux/tracefs.h>
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/ctype.h>
@@ -448,7 +448,7 @@ static void remove_subsystem(struct ftrace_subsystem_dir *dir)
return;
if (!--dir->nr_events) {
- debugfs_remove_recursive(dir->entry);
+ tracefs_remove_recursive(dir->entry);
list_del(&dir->list);
__put_system_dir(dir);
}
@@ -467,7 +467,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file)
}
spin_unlock(&dir->d_lock);
- debugfs_remove_recursive(dir);
+ tracefs_remove_recursive(dir);
}
list_del(&file->list);
@@ -1492,7 +1492,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
} else
__get_system(system);
- dir->entry = debugfs_create_dir(name, parent);
+ dir->entry = tracefs_create_dir(name, parent);
if (!dir->entry) {
pr_warn("Failed to create system directory %s\n", name);
__put_system(system);
@@ -1505,12 +1505,12 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
dir->subsystem = system;
file->system = dir;
- entry = debugfs_create_file("filter", 0644, dir->entry, dir,
+ entry = tracefs_create_file("filter", 0644, dir->entry, dir,
&ftrace_subsystem_filter_fops);
if (!entry) {
kfree(system->filter);
system->filter = NULL;
- pr_warn("Could not create debugfs '%s/filter' entry\n", name);
+ pr_warn("Could not create tracefs '%s/filter' entry\n", name);
}
trace_create_file("enable", 0644, dir->entry, dir,
@@ -1551,9 +1551,9 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
d_events = parent;
name = ftrace_event_name(call);
- file->dir = debugfs_create_dir(name, d_events);
+ file->dir = tracefs_create_dir(name, d_events);
if (!file->dir) {
- pr_warn("Could not create debugfs '%s' directory\n", name);
+ pr_warn("Could not create tracefs '%s' directory\n", name);
return -1;
}
@@ -2199,7 +2199,7 @@ static inline int register_event_cmds(void) { return 0; }
/*
* The top level array has already had its ftrace_event_file
* descriptors created in order to allow for early events to
- * be recorded. This function is called after the debugfs has been
+ * be recorded. This function is called after the tracefs has been
* initialized, and we now have to create the files associated
* to the events.
*/
@@ -2282,16 +2282,16 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
struct dentry *d_events;
struct dentry *entry;
- entry = debugfs_create_file("set_event", 0644, parent,
+ entry = tracefs_create_file("set_event", 0644, parent,
tr, &ftrace_set_event_fops);
if (!entry) {
- pr_warn("Could not create debugfs 'set_event' entry\n");
+ pr_warn("Could not create tracefs 'set_event' entry\n");
return -ENOMEM;
}
- d_events = debugfs_create_dir("events", parent);
+ d_events = tracefs_create_dir("events", parent);
if (!d_events) {
- pr_warn("Could not create debugfs 'events' directory\n");
+ pr_warn("Could not create tracefs 'events' directory\n");
return -ENOMEM;
}
@@ -2383,7 +2383,7 @@ int event_trace_del_tracer(struct trace_array *tr)
down_write(&trace_event_sem);
__trace_remove_event_dirs(tr);
- debugfs_remove_recursive(tr->event_dir);
+ tracefs_remove_recursive(tr->event_dir);
up_write(&trace_event_sem);
tr->event_dir = NULL;
@@ -2461,13 +2461,13 @@ static __init int event_trace_init(void)
return -ENODEV;
d_tracer = tracing_init_dentry();
- if (!d_tracer)
+ if (IS_ERR(d_tracer))
return 0;
- entry = debugfs_create_file("available_events", 0444, d_tracer,
+ entry = tracefs_create_file("available_events", 0444, d_tracer,
tr, &ftrace_avail_fops);
if (!entry)
- pr_warn("Could not create debugfs 'available_events' entry\n");
+ pr_warn("Could not create tracefs 'available_events' entry\n");
if (trace_define_common_fields())
pr_warn("tracing: Failed to allocate common fields");
@@ -2483,8 +2483,14 @@ static __init int event_trace_init(void)
#endif
return 0;
}
-early_initcall(event_trace_memsetup);
-core_initcall(event_trace_enable);
+
+void __init trace_event_init(void)
+{
+ event_trace_memsetup();
+ init_ftrace_syscalls();
+ event_trace_enable();
+}
+
fs_initcall(event_trace_init);
#ifdef CONFIG_FTRACE_STARTUP_TEST
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index a88178c9ca5f..c7dddcbb0d7a 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -6,7 +6,6 @@
* is Copyright (c) Steven Rostedt <srostedt@redhat.com>
*
*/
-#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
#include <linux/slab.h>
@@ -65,6 +64,9 @@ struct fgraph_data {
#define TRACE_GRAPH_INDENT 2
+/* Flag options */
+#define TRACE_GRAPH_PRINT_FLAT 0x80
+
static unsigned int max_depth;
static struct tracer_opt trace_opts[] = {
@@ -84,6 +86,8 @@ static struct tracer_opt trace_opts[] = {
{ TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) },
/* Display function name after trailing } */
{ TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) },
+ /* Use standard trace formatting rather than hierarchical */
+ { TRACER_OPT(funcgraph-flat, TRACE_GRAPH_PRINT_FLAT) },
{ } /* Empty entry */
};
@@ -151,7 +155,7 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
* The curr_ret_stack is initialized to -1 and get increased
* in this function. So it can be less than -1 only if it was
* filtered out via ftrace_graph_notrace_addr() which can be
- * set from set_graph_notrace file in debugfs by user.
+ * set from set_graph_notrace file in tracefs by user.
*/
if (current->curr_ret_stack < -1)
return -EBUSY;
@@ -1325,6 +1329,9 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
int cpu = iter->cpu;
int ret;
+ if (flags & TRACE_GRAPH_PRINT_FLAT)
+ return TRACE_TYPE_UNHANDLED;
+
if (data && per_cpu_ptr(data->cpu_data, cpu)->ignore) {
per_cpu_ptr(data->cpu_data, cpu)->ignore = 0;
return TRACE_TYPE_HANDLED;
@@ -1382,13 +1389,6 @@ print_graph_function(struct trace_iterator *iter)
return print_graph_function_flags(iter, tracer_flags.val);
}
-static enum print_line_t
-print_graph_function_event(struct trace_iterator *iter, int flags,
- struct trace_event *event)
-{
- return print_graph_function(iter);
-}
-
static void print_lat_header(struct seq_file *s, u32 flags)
{
static const char spaces[] = " " /* 16 spaces */
@@ -1455,6 +1455,11 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags)
{
struct trace_iterator *iter = s->private;
+ if (flags & TRACE_GRAPH_PRINT_FLAT) {
+ trace_default_header(s);
+ return;
+ }
+
if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
return;
@@ -1530,19 +1535,6 @@ func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
return 0;
}
-static struct trace_event_functions graph_functions = {
- .trace = print_graph_function_event,
-};
-
-static struct trace_event graph_trace_entry_event = {
- .type = TRACE_GRAPH_ENT,
- .funcs = &graph_functions,
-};
-
-static struct trace_event graph_trace_ret_event = {
- .type = TRACE_GRAPH_RET,
- .funcs = &graph_functions
-};
static struct tracer graph_trace __tracer_data = {
.name = "function_graph",
@@ -1600,12 +1592,12 @@ static const struct file_operations graph_depth_fops = {
.llseek = generic_file_llseek,
};
-static __init int init_graph_debugfs(void)
+static __init int init_graph_tracefs(void)
{
struct dentry *d_tracer;
d_tracer = tracing_init_dentry();
- if (!d_tracer)
+ if (IS_ERR(d_tracer))
return 0;
trace_create_file("max_graph_depth", 0644, d_tracer,
@@ -1613,22 +1605,12 @@ static __init int init_graph_debugfs(void)
return 0;
}
-fs_initcall(init_graph_debugfs);
+fs_initcall(init_graph_tracefs);
static __init int init_graph_trace(void)
{
max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
- if (!register_ftrace_event(&graph_trace_entry_event)) {
- pr_warning("Warning: could not register graph trace events\n");
- return 1;
- }
-
- if (!register_ftrace_event(&graph_trace_ret_event)) {
- pr_warning("Warning: could not register graph trace events\n");
- return 1;
- }
-
return register_tracer(&graph_trace);
}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5be1613af79c..5149938f6da2 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1315,7 +1315,7 @@ static int unregister_kprobe_event(struct trace_kprobe *tk)
return ret;
}
-/* Make a debugfs interface for controlling probe points */
+/* Make a tracefs interface for controlling probe points */
static __init int init_kprobe_trace(void)
{
struct dentry *d_tracer;
@@ -1325,23 +1325,23 @@ static __init int init_kprobe_trace(void)
return -EINVAL;
d_tracer = tracing_init_dentry();
- if (!d_tracer)
+ if (IS_ERR(d_tracer))
return 0;
- entry = debugfs_create_file("kprobe_events", 0644, d_tracer,
+ entry = tracefs_create_file("kprobe_events", 0644, d_tracer,
NULL, &kprobe_events_ops);
/* Event list interface */
if (!entry)
- pr_warning("Could not create debugfs "
+ pr_warning("Could not create tracefs "
"'kprobe_events' entry\n");
/* Profile interface */
- entry = debugfs_create_file("kprobe_profile", 0444, d_tracer,
+ entry = tracefs_create_file("kprobe_profile", 0444, d_tracer,
NULL, &kprobe_profile_ops);
if (!entry)
- pr_warning("Could not create debugfs "
+ pr_warning("Could not create tracefs "
"'kprobe_profile' entry\n");
return 0;
}
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 0ae06473dd8a..540110ef18e6 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -565,11 +565,25 @@ int trace_print_context(struct trace_iterator *iter)
unsigned long secs, usec_rem;
char comm[TASK_COMM_LEN];
int ret;
+ int tgid;
trace_find_cmdline(entry->pid, comm);
- ret = trace_seq_printf(s, "%16s-%-5d [%03d] ",
- comm, entry->pid, iter->cpu);
+ ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);
+ if (!ret)
+ return 0;
+
+ if (trace_flags & TRACE_ITER_TGID) {
+ tgid = trace_find_tgid(entry->pid);
+ if (tgid < 0)
+ ret = trace_seq_puts(s, "(-----) ");
+ else
+ ret = trace_seq_printf(s, "(%5d) ", tgid);
+ if (!ret)
+ return 0;
+ }
+
+ ret = trace_seq_printf(s, "[%03d] ", iter->cpu);
if (!ret)
return 0;
@@ -898,6 +912,168 @@ static struct trace_event trace_fn_event = {
.funcs = &trace_fn_funcs,
};
+/* TRACE_GRAPH_ENT */
+static enum print_line_t trace_graph_ent_trace(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct trace_seq *s = &iter->seq;
+ struct ftrace_graph_ent_entry *field;
+
+ trace_assign_type(field, iter->ent);
+
+ if (!trace_seq_puts(s, "graph_ent: func="))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ if (!seq_print_ip_sym(s, field->graph_ent.func, flags))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ if (!trace_seq_puts(s, "\n"))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_graph_ent_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct ftrace_graph_ent_entry *field;
+
+ trace_assign_type(field, iter->ent);
+
+ if (!trace_seq_printf(&iter->seq, "%lx %d\n",
+ field->graph_ent.func,
+ field->graph_ent.depth))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_graph_ent_hex(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct ftrace_graph_ent_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ SEQ_PUT_HEX_FIELD_RET(s, field->graph_ent.func);
+ SEQ_PUT_HEX_FIELD_RET(s, field->graph_ent.depth);
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_graph_ent_bin(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct ftrace_graph_ent_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ SEQ_PUT_FIELD_RET(s, field->graph_ent.func);
+ SEQ_PUT_FIELD_RET(s, field->graph_ent.depth);
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static struct trace_event_functions trace_graph_ent_funcs = {
+ .trace = trace_graph_ent_trace,
+ .raw = trace_graph_ent_raw,
+ .hex = trace_graph_ent_hex,
+ .binary = trace_graph_ent_bin,
+};
+
+static struct trace_event trace_graph_ent_event = {
+ .type = TRACE_GRAPH_ENT,
+ .funcs = &trace_graph_ent_funcs,
+};
+
+/* TRACE_GRAPH_RET */
+static enum print_line_t trace_graph_ret_trace(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct trace_seq *s = &iter->seq;
+ struct trace_entry *entry = iter->ent;
+ struct ftrace_graph_ret_entry *field;
+
+ trace_assign_type(field, entry);
+
+ if (!trace_seq_puts(s, "graph_ret: func="))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ if (!seq_print_ip_sym(s, field->ret.func, flags))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ if (!trace_seq_puts(s, "\n"))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_graph_ret_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct ftrace_graph_ret_entry *field;
+
+ trace_assign_type(field, iter->ent);
+
+ if (!trace_seq_printf(&iter->seq, "%lx %lld %lld %ld %d\n",
+ field->ret.func,
+ field->ret.calltime,
+ field->ret.rettime,
+ field->ret.overrun,
+ field->ret.depth))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_graph_ret_hex(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct ftrace_graph_ret_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ SEQ_PUT_HEX_FIELD_RET(s, field->ret.func);
+ SEQ_PUT_HEX_FIELD_RET(s, field->ret.calltime);
+ SEQ_PUT_HEX_FIELD_RET(s, field->ret.rettime);
+ SEQ_PUT_HEX_FIELD_RET(s, field->ret.overrun);
+ SEQ_PUT_HEX_FIELD_RET(s, field->ret.depth);
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_graph_ret_bin(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct ftrace_graph_ret_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ SEQ_PUT_FIELD_RET(s, field->ret.func);
+ SEQ_PUT_FIELD_RET(s, field->ret.calltime);
+ SEQ_PUT_FIELD_RET(s, field->ret.rettime);
+ SEQ_PUT_FIELD_RET(s, field->ret.overrun);
+ SEQ_PUT_FIELD_RET(s, field->ret.depth);
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static struct trace_event_functions trace_graph_ret_funcs = {
+ .trace = trace_graph_ret_trace,
+ .raw = trace_graph_ret_raw,
+ .hex = trace_graph_ret_hex,
+ .binary = trace_graph_ret_bin,
+};
+
+static struct trace_event trace_graph_ret_event = {
+ .type = TRACE_GRAPH_RET,
+ .funcs = &trace_graph_ret_funcs,
+};
+
/* TRACE_CTX an TRACE_WAKE */
static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
char *delim)
@@ -1288,6 +1464,8 @@ static struct trace_event trace_print_event = {
static struct trace_event *events[] __initdata = {
&trace_fn_event,
+ &trace_graph_ent_event,
+ &trace_graph_ret_event,
&trace_ctx_event,
&trace_wake_event,
&trace_stack_event,
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 9008103db583..fd22ea2909a0 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -357,7 +357,7 @@ static __init int init_trace_printk_function_export(void)
struct dentry *d_tracer;
d_tracer = tracing_init_dentry();
- if (!d_tracer)
+ if (IS_ERR(d_tracer))
return 0;
trace_create_file("printk_formats", 0444, d_tracer,
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 4f815fbce16d..19aff635841a 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -25,7 +25,7 @@
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/smp.h>
-#include <linux/debugfs.h>
+#include <linux/tracefs.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/ctype.h>
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 16eddb308c33..e3099ff3b61e 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -462,7 +462,7 @@ static __init int stack_trace_init(void)
struct dentry *d_tracer;
d_tracer = tracing_init_dentry();
- if (!d_tracer)
+ if (IS_ERR(d_tracer))
return 0;
trace_create_file("stack_max_size", 0644, d_tracer,
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 7af67360b330..6cf935316769 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -12,7 +12,7 @@
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
-#include <linux/debugfs.h>
+#include <linux/tracefs.h>
#include "trace_stat.h"
#include "trace.h"
@@ -65,7 +65,7 @@ static void reset_stat_session(struct stat_session *session)
static void destroy_session(struct stat_session *session)
{
- debugfs_remove(session->file);
+ tracefs_remove(session->file);
__reset_stat_session(session);
mutex_destroy(&session->stat_mutex);
kfree(session);
@@ -276,12 +276,12 @@ static int tracing_stat_init(void)
struct dentry *d_tracing;
d_tracing = tracing_init_dentry();
- if (!d_tracing)
+ if (IS_ERR(d_tracing))
return 0;
- stat_dir = debugfs_create_dir("trace_stat", d_tracing);
+ stat_dir = tracefs_create_dir("trace_stat", d_tracing);
if (!stat_dir)
- pr_warning("Could not create debugfs "
+ pr_warning("Could not create tracefs "
"'trace_stat' entry\n");
return 0;
}
@@ -291,7 +291,7 @@ static int init_stat_file(struct stat_session *session)
if (!stat_dir && tracing_stat_init())
return -ENODEV;
- session->file = debugfs_create_file(session->ts->name, 0644,
+ session->file = tracefs_create_file(session->ts->name, 0644,
stat_dir,
session, &tracing_stat_fops);
if (!session->file)
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 902391b90a22..fe3a2e4ae232 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -523,7 +523,7 @@ unsigned long __init __weak arch_syscall_addr(int nr)
return (unsigned long)sys_call_table[nr];
}
-static int __init init_ftrace_syscalls(void)
+void __init init_ftrace_syscalls(void)
{
struct syscall_metadata *meta;
unsigned long addr;
@@ -533,7 +533,7 @@ static int __init init_ftrace_syscalls(void)
GFP_KERNEL);
if (!syscalls_metadata) {
WARN_ON(1);
- return -ENOMEM;
+ return;
}
for (i = 0; i < NR_syscalls; i++) {
@@ -545,10 +545,7 @@ static int __init init_ftrace_syscalls(void)
meta->syscall_nr = i;
syscalls_metadata[i] = meta;
}
-
- return 0;
}
-early_initcall(init_ftrace_syscalls);
#ifdef CONFIG_PERF_EVENTS
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index db92b385c155..5dc27b3f3945 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1325,7 +1325,7 @@ static __init int init_uprobe_trace(void)
struct dentry *d_tracer;
d_tracer = tracing_init_dentry();
- if (!d_tracer)
+ if (IS_ERR(d_tracer))
return 0;
trace_create_file("uprobe_events", 0644, d_tracer,
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index a2e37c5d2f63..cc9689628427 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -39,6 +39,7 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
cred->cap_inheritable = CAP_EMPTY_SET;
cred->cap_permitted = CAP_FULL_SET;
cred->cap_effective = CAP_FULL_SET;
+ cred->cap_ambient = CAP_EMPTY_SET;
cred->cap_bset = CAP_FULL_SET;
#ifdef CONFIG_KEYS
key_put(cred->request_key_auth);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 70bf11815f84..7eccf9b6d8b2 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -47,6 +47,11 @@ static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
static DEFINE_PER_CPU(bool, hard_watchdog_warn);
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
+#endif
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
+static cpumask_t __read_mostly watchdog_cpus;
+#endif
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
#endif
static unsigned long soft_lockup_nmi_warn;
@@ -221,7 +226,7 @@ void touch_softlockup_watchdog_sync(void)
__this_cpu_write(watchdog_touch_ts, 0);
}
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI
/* watchdog detector functions */
static int is_hardlockup(void)
{
@@ -235,6 +240,76 @@ static int is_hardlockup(void)
}
#endif
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
+static unsigned int watchdog_next_cpu(unsigned int cpu)
+{
+ cpumask_t cpus = watchdog_cpus;
+ unsigned int next_cpu;
+
+ next_cpu = cpumask_next(cpu, &cpus);
+ if (next_cpu >= nr_cpu_ids)
+ next_cpu = cpumask_first(&cpus);
+
+ if (next_cpu == cpu)
+ return nr_cpu_ids;
+
+ return next_cpu;
+}
+
+static int is_hardlockup_other_cpu(unsigned int cpu)
+{
+ unsigned long hrint = per_cpu(hrtimer_interrupts, cpu);
+
+ if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint)
+ return 1;
+
+ per_cpu(hrtimer_interrupts_saved, cpu) = hrint;
+ return 0;
+}
+
+static void watchdog_check_hardlockup_other_cpu(void)
+{
+ unsigned int next_cpu;
+
+ /*
+ * Test for hardlockups every 3 samples. The sample period is
+ * watchdog_thresh * 2 / 5, so 3 samples gets us back to slightly over
+ * watchdog_thresh (over by 20%).
+ */
+ if (__this_cpu_read(hrtimer_interrupts) % 3 != 0)
+ return;
+
+ /* check for a hardlockup on the next cpu */
+ next_cpu = watchdog_next_cpu(smp_processor_id());
+ if (next_cpu >= nr_cpu_ids)
+ return;
+
+ smp_rmb();
+
+ if (per_cpu(watchdog_nmi_touch, next_cpu) == true) {
+ per_cpu(watchdog_nmi_touch, next_cpu) = false;
+ return;
+ }
+
+ if (is_hardlockup_other_cpu(next_cpu)) {
+ /* only warn once */
+ if (per_cpu(hard_watchdog_warn, next_cpu) == true)
+ return;
+
+ if (hardlockup_panic)
+ panic("Watchdog detected hard LOCKUP on cpu %u", next_cpu);
+ else
+ WARN(1, "Watchdog detected hard LOCKUP on cpu %u", next_cpu);
+
+ per_cpu(hard_watchdog_warn, next_cpu) = true;
+ } else {
+ per_cpu(hard_watchdog_warn, next_cpu) = false;
+ }
+}
+#else
+static inline void watchdog_check_hardlockup_other_cpu(void) { return; }
+#endif
+
static int is_softlockup(unsigned long touch_ts)
{
unsigned long now = get_timestamp();
@@ -246,7 +321,7 @@ static int is_softlockup(unsigned long touch_ts)
return 0;
}
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI
static struct perf_event_attr wd_hw_attr = {
.type = PERF_TYPE_HARDWARE,
@@ -296,7 +371,7 @@ static void watchdog_overflow_callback(struct perf_event *event,
__this_cpu_write(hard_watchdog_warn, false);
return;
}
-#endif /* CONFIG_HARDLOCKUP_DETECTOR */
+#endif /* CONFIG_HARDLOCKUP_DETECTOR_NMI */
static void watchdog_interrupt_count(void)
{
@@ -317,6 +392,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
/* kick the hardlockup detector */
watchdog_interrupt_count();
+ /* test for hardlockups on the next cpu */
+ watchdog_check_hardlockup_other_cpu();
+
/* kick the softlockup detector */
wake_up_process(__this_cpu_read(softlockup_watchdog));
@@ -479,7 +557,7 @@ static void watchdog(unsigned int cpu)
__touch_watchdog();
}
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI
/*
* People like the simple clean cpu node info on boot.
* Reduce the watchdog noise by only printing messages
@@ -568,9 +646,44 @@ static void watchdog_nmi_disable(unsigned int cpu)
}
}
#else
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
+static int watchdog_nmi_enable(unsigned int cpu)
+{
+ /*
+ * The new cpu will be marked online before the first hrtimer interrupt
+ * runs on it. If another cpu tests for a hardlockup on the new cpu
+ * before it has run its first hrtimer, it will get a false positive.
+ * Touch the watchdog on the new cpu to delay the first check for at
+ * least 3 sampling periods to guarantee one hrtimer has run on the new
+ * cpu.
+ */
+ per_cpu(watchdog_nmi_touch, cpu) = true;
+ smp_wmb();
+ cpumask_set_cpu(cpu, &watchdog_cpus);
+ return 0;
+}
+
+static void watchdog_nmi_disable(unsigned int cpu)
+{
+ unsigned int next_cpu = watchdog_next_cpu(cpu);
+
+ /*
+ * Offlining this cpu will cause the cpu before this one to start
+ * checking the one after this one. If this cpu just finished checking
+ * the next cpu and updating hrtimer_interrupts_saved, and then the
+ * previous cpu checks it within one sample period, it will trigger a
+ * false positive. Touch the watchdog on the next cpu to prevent it.
+ */
+ if (next_cpu < nr_cpu_ids)
+ per_cpu(watchdog_nmi_touch, next_cpu) = true;
+ smp_wmb();
+ cpumask_clear_cpu(cpu, &watchdog_cpus);
+}
+#else
static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
static void watchdog_nmi_disable(unsigned int cpu) { return; }
-#endif /* CONFIG_HARDLOCKUP_DETECTOR */
+#endif /* CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU */
+#endif /* CONFIG_HARDLOCKUP_DETECTOR_NMI */
static struct smp_hotplug_thread watchdog_threads = {
.store = &softlockup_watchdog,
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0963b7fbc9a6..efaae8aa1d28 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1727,9 +1727,7 @@ static struct worker *create_worker(struct worker_pool *pool)
goto fail;
set_user_nice(worker->task, pool->attrs->nice);
-
- /* prevent userland from meddling with cpumask of workqueue workers */
- worker->task->flags |= PF_NO_SETAFFINITY;
+ kthread_bind_mask(worker->task, pool->attrs->cpumask);
/* successful, attach the worker to the pool */
worker_attach_to_pool(worker, pool);
@@ -4136,7 +4134,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
}
wq->rescuer = rescuer;
- rescuer->task->flags |= PF_NO_SETAFFINITY;
+ kthread_bind_mask(rescuer->task, cpu_possible_mask);
wake_up_process(rescuer->task);
}
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index f9f6be126767..a4019a882691 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -639,6 +639,27 @@ source "lib/Kconfig.kasan"
endmenu # "Memory Debugging"
+config ARCH_HAS_KCOV
+ bool
+ help
+ KCOV does not have any arch-specific code, but currently it is enabled
+ only for x86_64. KCOV requires testing on other archs, and most likely
+ disabling of instrumentation for some early boot code.
+
+config KCOV
+ bool "Code coverage for fuzzing"
+ depends on ARCH_HAS_KCOV
+ select DEBUG_FS
+ help
+ KCOV exposes kernel code coverage information in a form suitable
+ for coverage-guided fuzzing (randomized testing).
+
+ If RANDOMIZE_BASE is enabled, PC values will not be stable across
+ different machines and across reboots. If you need stable PC values,
+ disable RANDOMIZE_BASE.
+
+ For more details, see Documentation/kcov.txt.
+
config DEBUG_SHIRQ
bool "Debug shared IRQ handlers"
depends on DEBUG_KERNEL
@@ -670,15 +691,27 @@ config LOCKUP_DETECTOR
The overhead should be minimal. A periodic hrtimer runs to
generate interrupts and kick the watchdog task every 4 seconds.
An NMI is generated every 10 seconds or so to check for hardlockups.
+ If NMIs are not available on the platform, every 12 seconds the
+ hrtimer interrupt on one cpu will be used to check for hardlockups
+ on the next cpu.
The frequency of hrtimer and NMI events and the soft and hard lockup
thresholds can be controlled through the sysctl watchdog_thresh.
-config HARDLOCKUP_DETECTOR
+config HARDLOCKUP_DETECTOR_NMI
def_bool y
depends on LOCKUP_DETECTOR && !HAVE_NMI_WATCHDOG
depends on PERF_EVENTS && HAVE_PERF_EVENTS_NMI
+config HARDLOCKUP_DETECTOR_OTHER_CPU
+ def_bool y
+ depends on LOCKUP_DETECTOR && SMP
+ depends on !HARDLOCKUP_DETECTOR_NMI && !HAVE_NMI_WATCHDOG
+
+config HARDLOCKUP_DETECTOR
+ def_bool y
+ depends on HARDLOCKUP_DETECTOR_NMI || HARDLOCKUP_DETECTOR_OTHER_CPU
+
config BOOTPARAM_HARDLOCKUP_PANIC
bool "Panic (Reboot) On Hard Lockups"
depends on HARDLOCKUP_DETECTOR
@@ -814,6 +847,15 @@ config SCHED_DEBUG
that can help debug the scheduler. The runtime overhead of this
option is minimal.
+config PANIC_ON_RT_THROTTLING
+ bool "Panic on RT throttling"
+ help
+ Say Y here to enable the kernel to panic when a realtime
+ runqueue is throttled. This may be useful for detecting
+ and debugging RT throttling issues.
+
+ Say N if unsure.
+
config SCHEDSTATS
bool "Collect scheduler statistics"
depends on DEBUG_KERNEL && PROC_FS
diff --git a/lib/Makefile b/lib/Makefile
index de5c0b494761..3cf8b8008d53 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -7,6 +7,18 @@ ORIG_CFLAGS := $(KBUILD_CFLAGS)
KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
endif
+# These files are disabled because they produce lots of non-interesting and/or
+# flaky coverage that is not a function of syscall inputs. For example,
+# rbtree can be global and individual rotations don't correlate with inputs.
+KCOV_INSTRUMENT_string.o := n
+KCOV_INSTRUMENT_rbtree.o := n
+KCOV_INSTRUMENT_list_debug.o := n
+KCOV_INSTRUMENT_debugobjects.o := n
+KCOV_INSTRUMENT_dynamic_debug.o := n
+# Kernel does not boot if we instrument this file as it uses custom calling
+# convention (see CONFIG_ARCH_HWEIGHT_CFLAGS).
+KCOV_INSTRUMENT_hweight.o := n
+
lib-y := ctype.o string.o vsprintf.o cmdline.o \
rbtree.o radix-tree.o dump_stack.o timerqueue.o\
idr.o int_sqrt.o extable.o \
diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c
index f0f5c5c3de12..6d940c72b5fc 100644
--- a/lib/lz4/lz4_decompress.c
+++ b/lib/lz4/lz4_decompress.c
@@ -47,6 +47,11 @@
#include "lz4defs.h"
+static const int dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0};
+#if LZ4_ARCH64
+static const int dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3};
+#endif
+
static int lz4_uncompress(const char *source, char *dest, int osize)
{
const BYTE *ip = (const BYTE *) source;
@@ -56,10 +61,6 @@ static int lz4_uncompress(const char *source, char *dest, int osize)
BYTE *cpy;
unsigned token;
size_t length;
- size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0};
-#if LZ4_ARCH64
- size_t dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3};
-#endif
while (1) {
@@ -116,7 +117,7 @@ static int lz4_uncompress(const char *source, char *dest, int osize)
/* copy repeated sequence */
if (unlikely((op - ref) < STEPSIZE)) {
#if LZ4_ARCH64
- size_t dec64 = dec64table[op - ref];
+ int dec64 = dec64table[op - ref];
#else
const int dec64 = 0;
#endif
@@ -139,8 +140,12 @@ static int lz4_uncompress(const char *source, char *dest, int osize)
/* Error: request to write beyond destination buffer */
if (cpy > oend)
goto _output_error;
+#if LZ4_ARCH64
+ if ((ref + COPYLENGTH) > oend)
+#else
if ((ref + COPYLENGTH) > oend ||
(op + COPYLENGTH) > oend)
+#endif
goto _output_error;
LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH));
while (op < cpy)
@@ -177,11 +182,6 @@ static int lz4_uncompress_unknownoutputsize(const char *source, char *dest,
BYTE * const oend = op + maxoutputsize;
BYTE *cpy;
- size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0};
-#if LZ4_ARCH64
- size_t dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3};
-#endif
-
/* Main Loop */
while (ip < iend) {
@@ -249,7 +249,7 @@ static int lz4_uncompress_unknownoutputsize(const char *source, char *dest,
/* copy repeated sequence */
if (unlikely((op - ref) < STEPSIZE)) {
#if LZ4_ARCH64
- size_t dec64 = dec64table[op - ref];
+ int dec64 = dec64table[op - ref];
#else
const int dec64 = 0;
#endif
@@ -270,7 +270,13 @@ static int lz4_uncompress_unknownoutputsize(const char *source, char *dest,
if (cpy > oend - COPYLENGTH) {
if (cpy > oend)
goto _output_error; /* write outside of buf */
-
+#if LZ4_ARCH64
+ if ((ref + COPYLENGTH) > oend)
+#else
+ if ((ref + COPYLENGTH) > oend ||
+ (op + COPYLENGTH) > oend)
+#endif
+ goto _output_error;
LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH));
while (op < cpy)
*op++ = *ref++;
diff --git a/lib/string.c b/lib/string.c
index 643b0a90802c..761b54f2f3c4 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -852,3 +852,20 @@ void *memchr_inv(const void *start, int c, size_t bytes)
return check_bytes8(start, value, bytes % 8);
}
EXPORT_SYMBOL(memchr_inv);
+
+/**
+ * strreplace - Replace all occurrences of character in string.
+ * @s: The string to operate on.
+ * @old: The character being replaced.
+ * @new: The character @old is replaced with.
+ *
+ * Returns pointer to the nul byte at the end of @s.
+ */
+char *strreplace(char *s, char old, char new)
+{
+ for (; *s; ++s)
+ if (*s == old)
+ *s = new;
+ return s;
+}
+EXPORT_SYMBOL(strreplace);
diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c
index bb2b201d6ad0..dfa554cc1a5f 100644
--- a/lib/strncpy_from_user.c
+++ b/lib/strncpy_from_user.c
@@ -1,4 +1,5 @@
#include <linux/module.h>
+#include <linux/thread_info.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
#include <linux/errno.h>
@@ -38,8 +39,8 @@ static inline long do_strncpy_from_user(char *dst, const char __user *src, long
unsigned long c, data;
/* Fall back to byte-at-a-time if we get a page fault */
- if (unlikely(__get_user(c,(unsigned long __user *)(src+res))))
- break;
+ unsafe_get_user(c, (unsigned long __user *)(src+res), byte_at_a_time);
+
*(unsigned long *)(dst+res) = c;
if (has_zero(c, &data, &constants)) {
data = prep_zero_mask(c, data, &constants);
@@ -54,8 +55,7 @@ byte_at_a_time:
while (max) {
char c;
- if (unlikely(__get_user(c,src+res)))
- return -EFAULT;
+ unsafe_get_user(c,src+res, efault);
dst[res] = c;
if (!c)
return res;
@@ -74,6 +74,7 @@ byte_at_a_time:
* Nope: we hit the address space limit, and we still had more
* characters the caller would have wanted. That's an EFAULT.
*/
+efault:
return -EFAULT;
}
@@ -106,7 +107,13 @@ long strncpy_from_user(char *dst, const char __user *src, long count)
src_addr = (unsigned long)src;
if (likely(src_addr < max_addr)) {
unsigned long max = max_addr - src_addr;
- return do_strncpy_from_user(dst, src, count, max);
+ long retval;
+
+ check_object_size(dst, count, false);
+ user_access_begin();
+ retval = do_strncpy_from_user(dst, src, count, max);
+ user_access_end();
+ return retval;
}
return -EFAULT;
}
diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c
index 11649615c505..02a49fccb4fe 100644
--- a/lib/strnlen_user.c
+++ b/lib/strnlen_user.c
@@ -45,8 +45,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count,
src -= align;
max += align;
- if (unlikely(__get_user(c,(unsigned long __user *)src)))
- return 0;
+ unsafe_get_user(c, (unsigned long __user *)src, efault);
c |= aligned_byte_mask(align);
for (;;) {
@@ -61,8 +60,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count,
if (unlikely(max <= sizeof(unsigned long)))
break;
max -= sizeof(unsigned long);
- if (unlikely(__get_user(c,(unsigned long __user *)(src+res))))
- return 0;
+ unsafe_get_user(c, (unsigned long __user *)(src+res), efault);
}
res -= align;
@@ -77,6 +75,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count,
* Nope: we hit the address space limit, and we still had more
* characters the caller would have wanted. That's 0.
*/
+efault:
return 0;
}
@@ -104,7 +103,12 @@ long strnlen_user(const char __user *str, long count)
src_addr = (unsigned long)str;
if (likely(src_addr < max_addr)) {
unsigned long max = max_addr - src_addr;
- return do_strnlen_user(str, count, max);
+ long retval;
+
+ user_access_begin();
+ retval = do_strnlen_user(str, count, max);
+ user_access_end();
+ return retval;
}
return 0;
}
@@ -132,7 +136,12 @@ long strlen_user(const char __user *str)
src_addr = (unsigned long)str;
if (likely(src_addr < max_addr)) {
unsigned long max = max_addr - src_addr;
- return do_strnlen_user(str, ~0ul, max);
+ long retval;
+
+ user_access_begin();
+ retval = do_strnlen_user(str, ~0ul, max);
+ user_access_end();
+ return retval;
}
return 0;
}
diff --git a/mm/Kconfig b/mm/Kconfig
index 1d1ae6b078fd..95c5728d65bc 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -601,6 +601,16 @@ config PGTABLE_MAPPING
You can check speed with zsmalloc benchmark:
https://github.com/spartacus06/zsmapbench
+config ZSMALLOC_STAT
+ bool "Export zsmalloc statistics"
+ depends on ZSMALLOC
+ select DEBUG_FS
+ help
+ This option enables code in the zsmalloc to collect various
+ statistics about whats happening in zsmalloc and exports that
+ information to userspace via debugfs.
+ If unsure, say N.
+
config GENERIC_EARLY_IOREMAP
bool
diff --git a/mm/Makefile b/mm/Makefile
index 3614adbf6d32..4e5306ee3271 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,6 +5,24 @@
KASAN_SANITIZE_slab_common.o := n
KASAN_SANITIZE_slub.o := n
+# These files are disabled because they produce non-interesting and/or
+# flaky coverage that is not a function of syscall inputs. E.g. slab is out of
+# free pages, or a task is migrated between nodes.
+KCOV_INSTRUMENT_slab_common.o := n
+KCOV_INSTRUMENT_slob.o := n
+KCOV_INSTRUMENT_slab.o := n
+KCOV_INSTRUMENT_slub.o := n
+KCOV_INSTRUMENT_page_alloc.o := n
+KCOV_INSTRUMENT_debug-pagealloc.o := n
+KCOV_INSTRUMENT_kmemleak.o := n
+KCOV_INSTRUMENT_kmemcheck.o := n
+KCOV_INSTRUMENT_memcontrol.o := n
+KCOV_INSTRUMENT_mmzone.o := n
+KCOV_INSTRUMENT_vmstat.o := n
+
+# Since __builtin_frame_address does work as used, disable the warning.
+CFLAGS_usercopy.o += $(call cc-disable-warning, frame-address)
+
mmu-y := nommu.o
mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o memory.o mincore.o \
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
@@ -73,3 +91,4 @@ obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
obj-$(CONFIG_CMA) += cma.o
obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
+obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 64710148941e..131daadf40e4 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -1,4 +1,6 @@
KASAN_SANITIZE := n
+UBSAN_SANITIZE_kasan.o := n
+KCOV_INSTRUMENT := n
CFLAGS_REMOVE_kasan.o = -pg
# Function splitter causes unnecessary splits in __asan_load1/__asan_store1
diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c
index f41ac9a551cc..3f9a41cf0ac6 100644
--- a/mm/kasan/kasan_init.c
+++ b/mm/kasan/kasan_init.c
@@ -29,10 +29,10 @@
*/
unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss;
-#if CONFIG_ARM64_PGTABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss;
#endif
-#if CONFIG_ARM64_PGTABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
pmd_t kasan_zero_pmd[PTRS_PER_PMD] __page_aligned_bss;
#endif
pte_t kasan_zero_pte[PTRS_PER_PTE] __page_aligned_bss;
diff --git a/mm/madvise.c b/mm/madvise.c
index 0938b30da4ab..dd7d354c5bfa 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -102,7 +102,8 @@ static long madvise_behavior(struct vm_area_struct *vma,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
- vma->vm_file, pgoff, vma_policy(vma));
+ vma->vm_file, pgoff, vma_policy(vma),
+ vma_get_anon_name(vma));
if (*prev) {
vma = *prev;
goto success;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 6a23ccc46f0f..e8b241972568 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -770,7 +770,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
((vmstart - vma->vm_start) >> PAGE_SHIFT);
prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff,
- new_pol);
+ new_pol, vma_get_anon_name(vma));
if (prev) {
vma = prev;
next = vma->vm_next;
diff --git a/mm/mlock.c b/mm/mlock.c
index abf1f0f51eda..8aa350f950e6 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -26,10 +26,10 @@
int can_do_mlock(void)
{
- if (capable(CAP_IPC_LOCK))
- return 1;
if (rlimit(RLIMIT_MEMLOCK) != 0)
return 1;
+ if (capable(CAP_IPC_LOCK))
+ return 1;
return 0;
}
EXPORT_SYMBOL(can_do_mlock);
@@ -567,7 +567,8 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
- vma->vm_file, pgoff, vma_policy(vma));
+ vma->vm_file, pgoff, vma_policy(vma),
+ vma_get_anon_name(vma));
if (*prev) {
vma = *prev;
goto success;
diff --git a/mm/mmap.c b/mm/mmap.c
index 954ad5a3c140..496e6d221ad8 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -57,6 +57,18 @@
#define arch_rebalance_pgtables(addr, len) (addr)
#endif
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
+const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN;
+const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX;
+int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS;
+#endif
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
+const int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN;
+const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX;
+int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
+#endif
+
+
static void unmap_region(struct mm_struct *mm,
struct vm_area_struct *vma, struct vm_area_struct *prev,
unsigned long start, unsigned long end);
@@ -946,7 +958,8 @@ again: remove_next = 1 + (end > next->vm_end);
* per-vma resources, so we don't attempt to merge those.
*/
static inline int is_mergeable_vma(struct vm_area_struct *vma,
- struct file *file, unsigned long vm_flags)
+ struct file *file, unsigned long vm_flags,
+ const char __user *anon_name)
{
/*
* VM_SOFTDIRTY should not prevent from VMA merging, if we
@@ -962,6 +975,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
return 0;
if (vma->vm_ops && vma->vm_ops->close)
return 0;
+ if (vma_get_anon_name(vma) != anon_name)
+ return 0;
return 1;
}
@@ -992,9 +1007,10 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
*/
static int
can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
- struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
+ struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff,
+ const char __user *anon_name)
{
- if (is_mergeable_vma(vma, file, vm_flags) &&
+ if (is_mergeable_vma(vma, file, vm_flags, anon_name) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
if (vma->vm_pgoff == vm_pgoff)
return 1;
@@ -1011,9 +1027,10 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
*/
static int
can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
- struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
+ struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff,
+ const char __user *anon_name)
{
- if (is_mergeable_vma(vma, file, vm_flags) &&
+ if (is_mergeable_vma(vma, file, vm_flags, anon_name) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
pgoff_t vm_pglen;
vm_pglen = vma_pages(vma);
@@ -1024,9 +1041,9 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
}
/*
- * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
- * whether that can be merged with its predecessor or its successor.
- * Or both (it neatly fills a hole).
+ * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
+ * figure out whether that can be merged with its predecessor or its
+ * successor. Or both (it neatly fills a hole).
*
* In most cases - when called for mmap, brk or mremap - [addr,end) is
* certain not to be mapped by the time vma_merge is called; but when
@@ -1055,8 +1072,9 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
struct vm_area_struct *vma_merge(struct mm_struct *mm,
struct vm_area_struct *prev, unsigned long addr,
unsigned long end, unsigned long vm_flags,
- struct anon_vma *anon_vma, struct file *file,
- pgoff_t pgoff, struct mempolicy *policy)
+ struct anon_vma *anon_vma, struct file *file,
+ pgoff_t pgoff, struct mempolicy *policy,
+ const char __user *anon_name)
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
struct vm_area_struct *area, *next;
@@ -1081,16 +1099,16 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
* Can it merge with the predecessor?
*/
if (prev && prev->vm_end == addr &&
- mpol_equal(vma_policy(prev), policy) &&
- can_vma_merge_after(prev, vm_flags,
- anon_vma, file, pgoff)) {
+ mpol_equal(vma_policy(prev), policy) &&
+ can_vma_merge_after(prev, vm_flags, anon_vma,
+ file, pgoff, anon_name)) {
/*
* OK, it can. Can we now merge in the successor as well?
*/
if (next && end == next->vm_start &&
mpol_equal(policy, vma_policy(next)) &&
- can_vma_merge_before(next, vm_flags,
- anon_vma, file, pgoff+pglen) &&
+ can_vma_merge_before(next, vm_flags, anon_vma,
+ file, pgoff+pglen, anon_name) &&
is_mergeable_anon_vma(prev->anon_vma,
next->anon_vma, NULL)) {
/* cases 1, 6 */
@@ -1109,9 +1127,9 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
* Can this new request be merged in front of next?
*/
if (next && end == next->vm_start &&
- mpol_equal(policy, vma_policy(next)) &&
- can_vma_merge_before(next, vm_flags,
- anon_vma, file, pgoff+pglen)) {
+ mpol_equal(policy, vma_policy(next)) &&
+ can_vma_merge_before(next, vm_flags, anon_vma,
+ file, pgoff+pglen, anon_name)) {
if (prev && addr < prev->vm_end) /* case 4 */
err = vma_adjust(prev, prev->vm_start,
addr, prev->vm_pgoff, NULL);
@@ -1598,7 +1616,8 @@ munmap_back:
/*
* Can we just expand an old mapping?
*/
- vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL);
+ vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff,
+ NULL, NULL);
if (vma)
goto out;
@@ -2638,6 +2657,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
return 0;
}
+EXPORT_SYMBOL(do_munmap);
int vm_munmap(unsigned long start, size_t len)
{
@@ -2723,7 +2743,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
/* Can we just expand an old private anonymous mapping? */
vma = vma_merge(mm, prev, addr, addr + len, flags,
- NULL, NULL, pgoff, NULL);
+ NULL, NULL, pgoff, NULL, NULL);
if (vma)
goto out;
@@ -2882,7 +2902,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
return NULL; /* should never get here */
new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
- vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
+ vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
+ vma_get_anon_name(vma));
if (new_vma) {
/*
* Source vma may have been merged into new_vma
diff --git a/mm/mprotect.c b/mm/mprotect.c
index ace93454ce8e..5911e603269d 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -289,7 +289,8 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
*/
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*pprev = vma_merge(mm, *pprev, start, end, newflags,
- vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
+ vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
+ vma_get_anon_name(vma));
if (*pprev) {
vma = *pprev;
goto success;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7e6f7878a2d6..be880ae2f633 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -205,8 +205,21 @@ static char * const zone_names[MAX_NR_ZONES] = {
"Movable",
};
+/*
+ * Try to keep at least this much lowmem free. Do not allow normal
+ * allocations below this point, only high priority ones. Automatically
+ * tuned according to the amount of memory in the system.
+ */
int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
+int min_free_order_shift = 1;
+
+/*
+ * Extra memory for the system to try freeing. Used to temporarily
+ * free memory, to make space for new workloads. Anyone can allocate
+ * down to the min watermarks controlled by min_free_kbytes above.
+ */
+int extra_free_kbytes = 0;
static unsigned long __meminitdata nr_kernel_pages;
static unsigned long __meminitdata nr_all_pages;
@@ -1089,39 +1102,34 @@ static void change_pageblock_range(struct page *pageblock_page,
}
/*
- * If breaking a large block of pages, move all free pages to the preferred
- * allocation list. If falling back for a reclaimable kernel allocation, be
- * more aggressive about taking ownership of free pages.
+ * When we are falling back to another migratetype during allocation, try to
+ * steal extra free pages from the same pageblocks to satisfy further
+ * allocations, instead of polluting multiple pageblocks.
*
- * On the other hand, never change migration type of MIGRATE_CMA pageblocks
- * nor move CMA pages to different free lists. We don't want unmovable pages
- * to be allocated from MIGRATE_CMA areas.
+ * If we are stealing a relatively large buddy page, it is likely there will
+ * be more free pages in the pageblock, so try to steal them all. For
+ * reclaimable and unmovable allocations, we steal regardless of page size,
+ * as fragmentation caused by those allocations polluting movable pageblocks
+ * is worse than movable allocations stealing from unmovable and reclaimable
+ * pageblocks.
*
- * Returns the allocation migratetype if free pages were stolen, or the
- * fallback migratetype if it was decided not to steal.
+ * If we claim more than half of the pageblock, change pageblock's migratetype
+ * as well.
*/
-static int try_to_steal_freepages(struct zone *zone, struct page *page,
+static void try_to_steal_freepages(struct zone *zone, struct page *page,
int start_type, int fallback_type)
{
unsigned int current_order = page_order(page);
- /*
- * When borrowing from MIGRATE_CMA, we need to release the excess
- * buddy pages to CMA itself. We also ensure the freepage_migratetype
- * is set to CMA so it is returned to the correct freelist in case
- * the page ends up being not actually allocated from the pcp lists.
- */
- if (is_migrate_cma(fallback_type))
- return fallback_type;
-
/* Take ownership for orders >= pageblock_order */
if (current_order >= pageblock_order) {
change_pageblock_range(page, current_order, start_type);
- return start_type;
+ return;
}
if (current_order >= pageblock_order / 2 ||
start_type == MIGRATE_RECLAIMABLE ||
+ start_type == MIGRATE_UNMOVABLE ||
page_group_by_mobility_disabled) {
int pages;
@@ -1131,11 +1139,7 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page,
if (pages >= (1 << (pageblock_order-1)) ||
page_group_by_mobility_disabled)
set_pageblock_migratetype(page, start_type);
-
- return start_type;
}
-
- return fallback_type;
}
/* Remove an element from the buddy allocator from the fallback list */
@@ -1145,14 +1149,15 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
struct free_area *area;
unsigned int current_order;
struct page *page;
- int migratetype, new_type, i;
/* Find the largest possible block of pages in the other list */
for (current_order = MAX_ORDER-1;
current_order >= order && current_order <= MAX_ORDER-1;
--current_order) {
+ int i;
for (i = 0;; i++) {
- migratetype = fallbacks[start_migratetype][i];
+ int migratetype = fallbacks[start_migratetype][i];
+ int buddy_type = start_migratetype;
/* MIGRATE_RESERVE handled later if necessary */
if (migratetype == MIGRATE_RESERVE)
@@ -1166,22 +1171,36 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
struct page, lru);
area->nr_free--;
- new_type = try_to_steal_freepages(zone, page,
- start_migratetype,
- migratetype);
+ if (!is_migrate_cma(migratetype)) {
+ try_to_steal_freepages(zone, page,
+ start_migratetype,
+ migratetype);
+ } else {
+ /*
+ * When borrowing from MIGRATE_CMA, we need to
+ * release the excess buddy pages to CMA
+ * itself, and we do not try to steal extra
+ * free pages.
+ */
+ buddy_type = migratetype;
+ }
/* Remove the page from the freelists */
list_del(&page->lru);
rmv_page_order(page);
expand(zone, page, order, current_order, area,
- new_type);
- /* The freepage_migratetype may differ from pageblock's
+ buddy_type);
+
+ /*
+ * The freepage_migratetype may differ from pageblock's
* migratetype depending on the decisions in
- * try_to_steal_freepages. This is OK as long as it does
- * not differ for MIGRATE_CMA type.
+ * try_to_steal_freepages(). This is OK as long as it
+ * does not differ for MIGRATE_CMA pageblocks. For CMA
+ * we need to make sure unallocated pages flushed from
+ * pcp lists are returned to the correct freelist.
*/
- set_freepage_migratetype(page, new_type);
+ set_freepage_migratetype(page, buddy_type);
trace_mm_page_alloc_extfrag(page, order, current_order,
start_migratetype, migratetype);
@@ -1750,7 +1769,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order,
free_pages -= z->free_area[o].nr_free << o;
/* Require fewer higher order pages to be free */
- min >>= 1;
+ min >>= min_free_order_shift;
if (free_pages <= min)
return false;
@@ -5681,6 +5700,7 @@ static void setup_per_zone_lowmem_reserve(void)
static void __setup_per_zone_wmarks(void)
{
unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
+ unsigned long pages_low = extra_free_kbytes >> (PAGE_SHIFT - 10);
unsigned long lowmem_pages = 0;
struct zone *zone;
unsigned long flags;
@@ -5692,11 +5712,14 @@ static void __setup_per_zone_wmarks(void)
}
for_each_zone(zone) {
- u64 tmp;
+ u64 min, low;
spin_lock_irqsave(&zone->lock, flags);
- tmp = (u64)pages_min * zone->managed_pages;
- do_div(tmp, lowmem_pages);
+ min = (u64)pages_min * zone->managed_pages;
+ do_div(min, lowmem_pages);
+ low = (u64)pages_low * zone->managed_pages;
+ do_div(low, vm_total_pages);
+
if (is_highmem(zone)) {
/*
* __GFP_HIGH and PF_MEMALLOC allocations usually don't
@@ -5717,11 +5740,13 @@ static void __setup_per_zone_wmarks(void)
* If it's a lowmem zone, reserve a number of pages
* proportionate to the zone's size.
*/
- zone->watermark[WMARK_MIN] = tmp;
+ zone->watermark[WMARK_MIN] = min;
}
- zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
- zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
+ zone->watermark[WMARK_LOW] = min_wmark_pages(zone) +
+ low + (min >> 2);
+ zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) +
+ low + (min >> 1);
__mod_zone_page_state(zone, NR_ALLOC_BATCH,
high_wmark_pages(zone) - low_wmark_pages(zone) -
@@ -5845,7 +5870,7 @@ module_init(init_per_zone_wmark_min)
/*
* min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
* that we can call two helper functions whenever min_free_kbytes
- * changes.
+ * or extra_free_kbytes changes.
*/
int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
diff --git a/mm/shmem.c b/mm/shmem.c
index fac22b578eb9..b159434edcaa 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3406,6 +3406,14 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
}
EXPORT_SYMBOL_GPL(shmem_file_setup);
+void shmem_set_file(struct vm_area_struct *vma, struct file *file)
+{
+ if (vma->vm_file)
+ fput(vma->vm_file);
+ vma->vm_file = file;
+ vma->vm_ops = &shmem_vm_ops;
+}
+
/**
* shmem_zero_setup - setup a shared anonymous mapping
* @vma: the vma to be mmapped is prepared by do_mmap_pgoff
@@ -3419,10 +3427,7 @@ int shmem_zero_setup(struct vm_area_struct *vma)
if (IS_ERR(file))
return PTR_ERR(file);
- if (vma->vm_file)
- fput(vma->vm_file);
- vma->vm_file = file;
- vma->vm_ops = &shmem_vm_ops;
+ shmem_set_file(vma, file);
return 0;
}
diff --git a/mm/slab.c b/mm/slab.c
index b7f9f6456a61..918ffe2dbb08 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -4214,6 +4214,36 @@ static int __init slab_proc_init(void)
module_init(slab_proc_init);
#endif
+#ifdef CONFIG_HARDENED_USERCOPY
+/*
+ * Rejects objects that are incorrectly sized.
+ *
+ * Returns NULL if check passes, otherwise const char * to name of cache
+ * to indicate an error.
+ */
+const char *__check_heap_object(const void *ptr, unsigned long n,
+ struct page *page)
+{
+ struct kmem_cache *cachep;
+ unsigned int objnr;
+ unsigned long offset;
+
+ /* Find and validate object. */
+ cachep = page->slab_cache;
+ objnr = obj_to_index(cachep, page, (void *)ptr);
+ BUG_ON(objnr >= cachep->num);
+
+ /* Find offset within object. */
+ offset = ptr - index_to_obj(cachep, page, objnr) - obj_offset(cachep);
+
+ /* Allow address range falling entirely within object size. */
+ if (offset <= cachep->object_size && n <= cachep->object_size - offset)
+ return NULL;
+
+ return cachep->name;
+}
+#endif /* CONFIG_HARDENED_USERCOPY */
+
/**
* ksize - get the actual amount of memory allocated for a given object
* @objp: Pointer to the object
diff --git a/mm/slub.c b/mm/slub.c
index 1256a8f603a2..8bc50c0ca2b5 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -124,6 +124,14 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
#endif
}
+static inline void *fixup_red_left(struct kmem_cache *s, void *p)
+{
+ if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE)
+ p += s->red_left_pad;
+
+ return p;
+}
+
static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_CPU_PARTIAL
@@ -224,24 +232,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si)
* Core slab cache functions
*******************************************************************/
-/* Verify that a pointer has an address that is valid within a slab page */
-static inline int check_valid_pointer(struct kmem_cache *s,
- struct page *page, const void *object)
-{
- void *base;
-
- if (!object)
- return 1;
-
- base = page_address(page);
- if (object < base || object >= base + page->objects * s->size ||
- (object - base) % s->size) {
- return 0;
- }
-
- return 1;
-}
-
static inline void *get_freepointer(struct kmem_cache *s, void *object)
{
return *(void **)(object + s->offset);
@@ -271,12 +261,14 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
/* Loop over all objects in a slab */
#define for_each_object(__p, __s, __addr, __objects) \
- for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\
- __p += (__s)->size)
+ for (__p = fixup_red_left(__s, __addr); \
+ __p < (__addr) + (__objects) * (__s)->size; \
+ __p += (__s)->size)
#define for_each_object_idx(__p, __idx, __s, __addr, __objects) \
- for (__p = (__addr), __idx = 1; __idx <= __objects;\
- __p += (__s)->size, __idx++)
+ for (__p = fixup_red_left(__s, __addr), __idx = 1; \
+ __idx <= __objects; \
+ __p += (__s)->size, __idx++)
/* Determine object index from a given position */
static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
@@ -456,6 +448,22 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
set_bit(slab_index(p, s, addr), map);
}
+static inline int size_from_object(struct kmem_cache *s)
+{
+ if (s->flags & SLAB_RED_ZONE)
+ return s->size - s->red_left_pad;
+
+ return s->size;
+}
+
+static inline void *restore_red_left(struct kmem_cache *s, void *p)
+{
+ if (s->flags & SLAB_RED_ZONE)
+ p -= s->red_left_pad;
+
+ return p;
+}
+
/*
* Debug settings:
*/
@@ -489,6 +497,26 @@ static inline void metadata_access_disable(void)
/*
* Object debugging
*/
+
+/* Verify that a pointer has an address that is valid within a slab page */
+static inline int check_valid_pointer(struct kmem_cache *s,
+ struct page *page, void *object)
+{
+ void *base;
+
+ if (!object)
+ return 1;
+
+ base = page_address(page);
+ object = restore_red_left(s, object);
+ if (object < base || object >= base + page->objects * s->size ||
+ (object - base) % s->size) {
+ return 0;
+ }
+
+ return 1;
+}
+
static void print_section(char *text, u8 *addr, unsigned int length)
{
metadata_access_enable();
@@ -628,7 +656,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
p, p - addr, get_freepointer(s, p));
- if (p > addr + 16)
+ if (s->flags & SLAB_RED_ZONE)
+ print_section("Redzone ", p - s->red_left_pad, s->red_left_pad);
+ else if (p > addr + 16)
print_section("Bytes b4 ", p - 16, 16);
print_section("Object ", p, min_t(unsigned long, s->object_size,
@@ -645,9 +675,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
if (s->flags & SLAB_STORE_USER)
off += 2 * sizeof(struct track);
- if (off != s->size)
+ if (off != size_from_object(s))
/* Beginning of the filler is the free pointer */
- print_section("Padding ", p + off, s->size - off);
+ print_section("Padding ", p + off, size_from_object(s) - off);
dump_stack();
}
@@ -677,6 +707,9 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
{
u8 *p = object;
+ if (s->flags & SLAB_RED_ZONE)
+ memset(p - s->red_left_pad, val, s->red_left_pad);
+
if (s->flags & __OBJECT_POISON) {
memset(p, POISON_FREE, s->object_size - 1);
p[s->object_size - 1] = POISON_END;
@@ -769,11 +802,11 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
/* We also have user information there */
off += 2 * sizeof(struct track);
- if (s->size == off)
+ if (size_from_object(s) == off)
return 1;
return check_bytes_and_report(s, page, p, "Object padding",
- p + off, POISON_INUSE, s->size - off);
+ p + off, POISON_INUSE, size_from_object(s) - off);
}
/* Check the pad bytes at the end of a slab page */
@@ -818,6 +851,10 @@ static int check_object(struct kmem_cache *s, struct page *page,
if (s->flags & SLAB_RED_ZONE) {
if (!check_bytes_and_report(s, page, object, "Redzone",
+ object - s->red_left_pad, val, s->red_left_pad))
+ return 0;
+
+ if (!check_bytes_and_report(s, page, object, "Redzone",
endobject, val, s->inuse - s->object_size))
return 0;
} else {
@@ -1441,7 +1478,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
set_freepointer(s, p, NULL);
}
- page->freelist = start;
+ page->freelist = fixup_red_left(s, start);
page->inuse = page->objects;
page->frozen = 1;
out:
@@ -3044,7 +3081,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
*/
size += 2 * sizeof(struct track);
- if (flags & SLAB_RED_ZONE)
+ if (flags & SLAB_RED_ZONE) {
/*
* Add some empty padding so that we can catch
* overwrites from earlier objects rather than let
@@ -3053,6 +3090,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
* of the object.
*/
size += sizeof(void *);
+
+ s->red_left_pad = sizeof(void *);
+ s->red_left_pad = ALIGN(s->red_left_pad, s->align);
+ size += s->red_left_pad;
+ }
#endif
/*
@@ -3363,6 +3405,46 @@ static size_t __ksize(const void *object)
return slab_ksize(page->slab_cache);
}
+#ifdef CONFIG_HARDENED_USERCOPY
+/*
+ * Rejects objects that are incorrectly sized.
+ *
+ * Returns NULL if check passes, otherwise const char * to name of cache
+ * to indicate an error.
+ */
+const char *__check_heap_object(const void *ptr, unsigned long n,
+ struct page *page)
+{
+ struct kmem_cache *s;
+ unsigned long offset;
+ size_t object_size;
+
+ /* Find object and usable object size. */
+ s = page->slab_cache;
+ object_size = slab_ksize(s);
+
+ /* Reject impossible pointers. */
+ if (ptr < page_address(page))
+ return s->name;
+
+ /* Find offset within object. */
+ offset = (ptr - page_address(page)) % s->size;
+
+ /* Adjust for redzone and reject if within the redzone. */
+ if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) {
+ if (offset < s->red_left_pad)
+ return s->name;
+ offset -= s->red_left_pad;
+ }
+
+ /* Allow address range falling entirely within object size. */
+ if (offset <= object_size && n <= object_size - offset)
+ return NULL;
+
+ return s->name;
+}
+#endif /* CONFIG_HARDENED_USERCOPY */
+
size_t ksize(const void *object)
{
size_t size = __ksize(object);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 14ae9e3ec728..38b66a9823b4 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -875,6 +875,48 @@ int page_swapcount(struct page *page)
}
/*
+ * How many references to @entry are currently swapped out?
+ * This considers COUNT_CONTINUED so it returns exact answer.
+ */
+int swp_swapcount(swp_entry_t entry)
+{
+ int count, tmp_count, n;
+ struct swap_info_struct *p;
+ struct page *page;
+ pgoff_t offset;
+ unsigned char *map;
+
+ p = swap_info_get(entry);
+ if (!p)
+ return 0;
+
+ count = swap_count(p->swap_map[swp_offset(entry)]);
+ if (!(count & COUNT_CONTINUED))
+ goto out;
+
+ count &= ~COUNT_CONTINUED;
+ n = SWAP_MAP_MAX + 1;
+
+ offset = swp_offset(entry);
+ page = vmalloc_to_page(p->swap_map + offset);
+ offset &= ~PAGE_MASK;
+ VM_BUG_ON(page_private(page) != SWP_CONTINUED);
+
+ do {
+ page = list_entry(page->lru.next, struct page, lru);
+ map = kmap_atomic(page);
+ tmp_count = map[offset];
+ kunmap_atomic(map);
+
+ count += (tmp_count & ~COUNT_CONTINUED) * n;
+ n *= (SWAP_CONT_MAX + 1);
+ } while (tmp_count & COUNT_CONTINUED);
+out:
+ spin_unlock(&p->lock);
+ return count;
+}
+
+/*
* We can write to an anon page without COW if there are no other references
* to it. And as a side-effect, free up its swap: because the old content
* on disk will never be read, and seeking back there to write new content
diff --git a/mm/usercopy.c b/mm/usercopy.c
new file mode 100644
index 000000000000..b34996a3860b
--- /dev/null
+++ b/mm/usercopy.c
@@ -0,0 +1,278 @@
+/*
+ * This implements the various checks for CONFIG_HARDENED_USERCOPY*,
+ * which are designed to protect kernel memory from needless exposure
+ * and overwrite under many unintended conditions. This code is based
+ * on PAX_USERCOPY, which is:
+ *
+ * Copyright (C) 2001-2016 PaX Team, Bradley Spengler, Open Source
+ * Security Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <asm/sections.h>
+
+enum {
+ BAD_STACK = -1,
+ NOT_STACK = 0,
+ GOOD_FRAME,
+ GOOD_STACK,
+};
+
+/*
+ * Checks if a given pointer and length is contained by the current
+ * stack frame (if possible).
+ *
+ * Returns:
+ * NOT_STACK: not at all on the stack
+ * GOOD_FRAME: fully within a valid stack frame
+ * GOOD_STACK: fully on the stack (when can't do frame-checking)
+ * BAD_STACK: error condition (invalid stack position or bad stack frame)
+ */
+static noinline int check_stack_object(const void *obj, unsigned long len)
+{
+ const void * const stack = task_stack_page(current);
+ const void * const stackend = stack + THREAD_SIZE;
+ int ret;
+
+ /* Object is not on the stack at all. */
+ if (obj + len <= stack || stackend <= obj)
+ return NOT_STACK;
+
+ /*
+ * Reject: object partially overlaps the stack (passing the
+ * the check above means at least one end is within the stack,
+ * so if this check fails, the other end is outside the stack).
+ */
+ if (obj < stack || stackend < obj + len)
+ return BAD_STACK;
+
+ /* Check if object is safely within a valid frame. */
+ ret = arch_within_stack_frames(stack, stackend, obj, len);
+ if (ret)
+ return ret;
+
+ return GOOD_STACK;
+}
+
+static void report_usercopy(const void *ptr, unsigned long len,
+ bool to_user, const char *type)
+{
+ pr_emerg("kernel memory %s attempt detected %s %p (%s) (%lu bytes)\n",
+ to_user ? "exposure" : "overwrite",
+ to_user ? "from" : "to", ptr, type ? : "unknown", len);
+ /*
+ * For greater effect, it would be nice to do do_group_exit(),
+ * but BUG() actually hooks all the lock-breaking and per-arch
+ * Oops code, so that is used here instead.
+ */
+ BUG();
+}
+
+/* Returns true if any portion of [ptr,ptr+n) over laps with [low,high). */
+static bool overlaps(const void *ptr, unsigned long n, unsigned long low,
+ unsigned long high)
+{
+ unsigned long check_low = (uintptr_t)ptr;
+ unsigned long check_high = check_low + n;
+
+ /* Does not overlap if entirely above or entirely below. */
+ if (check_low >= high || check_high <= low)
+ return false;
+
+ return true;
+}
+
+/* Is this address range in the kernel text area? */
+static inline const char *check_kernel_text_object(const void *ptr,
+ unsigned long n)
+{
+ unsigned long textlow = (unsigned long)_stext;
+ unsigned long texthigh = (unsigned long)_etext;
+ unsigned long textlow_linear, texthigh_linear;
+
+ if (overlaps(ptr, n, textlow, texthigh))
+ return "<kernel text>";
+
+ /*
+ * Some architectures have virtual memory mappings with a secondary
+ * mapping of the kernel text, i.e. there is more than one virtual
+ * kernel address that points to the kernel image. It is usually
+ * when there is a separate linear physical memory mapping, in that
+ * __pa() is not just the reverse of __va(). This can be detected
+ * and checked:
+ */
+ textlow_linear = (unsigned long)__va(__pa(textlow));
+ /* No different mapping: we're done. */
+ if (textlow_linear == textlow)
+ return NULL;
+
+ /* Check the secondary mapping... */
+ texthigh_linear = (unsigned long)__va(__pa(texthigh));
+ if (overlaps(ptr, n, textlow_linear, texthigh_linear))
+ return "<linear kernel text>";
+
+ return NULL;
+}
+
+static inline const char *check_bogus_address(const void *ptr, unsigned long n)
+{
+ /* Reject if object wraps past end of memory. */
+ if ((unsigned long)ptr + n < (unsigned long)ptr)
+ return "<wrapped address>";
+
+ /* Reject if NULL or ZERO-allocation. */
+ if (ZERO_OR_NULL_PTR(ptr))
+ return "<null>";
+
+ return NULL;
+}
+
+/* Checks for allocs that are marked in some way as spanning multiple pages. */
+static inline const char *check_page_span(const void *ptr, unsigned long n,
+ struct page *page, bool to_user)
+{
+#ifdef CONFIG_HARDENED_USERCOPY_PAGESPAN
+ const void *end = ptr + n - 1;
+ struct page *endpage;
+ bool is_reserved, is_cma;
+
+ /*
+ * Sometimes the kernel data regions are not marked Reserved (see
+ * check below). And sometimes [_sdata,_edata) does not cover
+ * rodata and/or bss, so check each range explicitly.
+ */
+
+ /* Allow reads of kernel rodata region (if not marked as Reserved). */
+ if (ptr >= (const void *)__start_rodata &&
+ end <= (const void *)__end_rodata) {
+ if (!to_user)
+ return "<rodata>";
+ return NULL;
+ }
+
+ /* Allow kernel data region (if not marked as Reserved). */
+ if (ptr >= (const void *)_sdata && end <= (const void *)_edata)
+ return NULL;
+
+ /* Allow kernel bss region (if not marked as Reserved). */
+ if (ptr >= (const void *)__bss_start &&
+ end <= (const void *)__bss_stop)
+ return NULL;
+
+ /* Is the object wholly within one base page? */
+ if (likely(((unsigned long)ptr & (unsigned long)PAGE_MASK) ==
+ ((unsigned long)end & (unsigned long)PAGE_MASK)))
+ return NULL;
+
+ /* Allow if fully inside the same compound (__GFP_COMP) page. */
+ endpage = virt_to_head_page(end);
+ if (likely(endpage == page))
+ return NULL;
+
+ /*
+ * Reject if range is entirely either Reserved (i.e. special or
+ * device memory), or CMA. Otherwise, reject since the object spans
+ * several independently allocated pages.
+ */
+ is_reserved = PageReserved(page);
+ is_cma = is_migrate_cma_page(page);
+ if (!is_reserved && !is_cma)
+ return "<spans multiple pages>";
+
+ for (ptr += PAGE_SIZE; ptr <= end; ptr += PAGE_SIZE) {
+ page = virt_to_head_page(ptr);
+ if (is_reserved && !PageReserved(page))
+ return "<spans Reserved and non-Reserved pages>";
+ if (is_cma && !is_migrate_cma_page(page))
+ return "<spans CMA and non-CMA pages>";
+ }
+#endif
+
+ return NULL;
+}
+
+static inline const char *check_heap_object(const void *ptr, unsigned long n,
+ bool to_user)
+{
+ struct page *page;
+
+ /*
+ * Some architectures (arm64) return true for virt_addr_valid() on
+ * vmalloced addresses. Work around this by checking for vmalloc
+ * first.
+ */
+ if (is_vmalloc_addr(ptr))
+ return NULL;
+
+ if (!virt_addr_valid(ptr))
+ return NULL;
+
+ page = virt_to_head_page(ptr);
+
+ /* Check slab allocator for flags and size. */
+ if (PageSlab(page))
+ return __check_heap_object(ptr, n, page);
+
+ /* Verify object does not incorrectly span multiple pages. */
+ return check_page_span(ptr, n, page, to_user);
+}
+
+/*
+ * Validates that the given object is:
+ * - not bogus address
+ * - known-safe heap or stack object
+ * - not in kernel text
+ */
+void __check_object_size(const void *ptr, unsigned long n, bool to_user)
+{
+ const char *err;
+
+ /* Skip all tests if size is zero. */
+ if (!n)
+ return;
+
+ /* Check for invalid addresses. */
+ err = check_bogus_address(ptr, n);
+ if (err)
+ goto report;
+
+ /* Check for bad heap object. */
+ err = check_heap_object(ptr, n, to_user);
+ if (err)
+ goto report;
+
+ /* Check for bad stack object. */
+ switch (check_stack_object(ptr, n)) {
+ case NOT_STACK:
+ /* Object is not touching the current process stack. */
+ break;
+ case GOOD_FRAME:
+ case GOOD_STACK:
+ /*
+ * Object is either in the correct frame (when it
+ * is possible to check) or just generally on the
+ * process stack (when frame checking not available).
+ */
+ return;
+ default:
+ err = "<process stack>";
+ goto report;
+ }
+
+ /* Check for object in kernel to avoid text exposure. */
+ err = check_kernel_text_object(ptr, n);
+ if (!err)
+ return;
+
+report:
+ report_usercopy(ptr, n, to_user, err);
+}
+EXPORT_SYMBOL(__check_object_size);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d48b28219edf..4b2a225d92f9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -46,6 +46,7 @@
#include <linux/oom.h>
#include <linux/prefetch.h>
#include <linux/printk.h>
+#include <linux/debugfs.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -186,6 +187,39 @@ static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru);
}
+struct dentry *debug_file;
+
+static int debug_shrinker_show(struct seq_file *s, void *unused)
+{
+ struct shrinker *shrinker;
+ struct shrink_control sc;
+
+ sc.gfp_mask = -1;
+ sc.nr_to_scan = 0;
+
+ down_read(&shrinker_rwsem);
+ list_for_each_entry(shrinker, &shrinker_list, list) {
+ int num_objs;
+
+ num_objs = shrinker->count_objects(shrinker, &sc);
+ seq_printf(s, "%pf %d\n", shrinker->scan_objects, num_objs);
+ }
+ up_read(&shrinker_rwsem);
+ return 0;
+}
+
+static int debug_shrinker_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, debug_shrinker_show, inode->i_private);
+}
+
+static const struct file_operations debug_shrinker_fops = {
+ .open = debug_shrinker_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
/*
* Add a shrinker callback to be called from the vm.
*/
@@ -215,6 +249,15 @@ int register_shrinker(struct shrinker *shrinker)
}
EXPORT_SYMBOL(register_shrinker);
+static int __init add_shrinker_debug(void)
+{
+ debugfs_create_file("shrinker", 0644, NULL, NULL,
+ &debug_shrinker_fops);
+ return 0;
+}
+
+late_initcall(add_shrinker_debug);
+
/*
* Remove one
*/
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 4590aa42b6cd..50e673be0289 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -455,7 +455,7 @@ static int fold_diff(int *diff)
*
* The function returns the number of global counters updated.
*/
-static int refresh_cpu_vm_stats(void)
+static int refresh_cpu_vm_stats(bool do_pagesets)
{
struct zone *zone;
int i;
@@ -479,33 +479,35 @@ static int refresh_cpu_vm_stats(void)
#endif
}
}
- cond_resched();
#ifdef CONFIG_NUMA
- /*
- * Deal with draining the remote pageset of this
- * processor
- *
- * Check if there are pages remaining in this pageset
- * if not then there is nothing to expire.
- */
- if (!__this_cpu_read(p->expire) ||
+ if (do_pagesets) {
+ cond_resched();
+ /*
+ * Deal with draining the remote pageset of this
+ * processor
+ *
+ * Check if there are pages remaining in this pageset
+ * if not then there is nothing to expire.
+ */
+ if (!__this_cpu_read(p->expire) ||
!__this_cpu_read(p->pcp.count))
- continue;
-
- /*
- * We never drain zones local to this processor.
- */
- if (zone_to_nid(zone) == numa_node_id()) {
- __this_cpu_write(p->expire, 0);
- continue;
- }
+ continue;
+
+ /*
+ * We never drain zones local to this processor.
+ */
+ if (zone_to_nid(zone) == numa_node_id()) {
+ __this_cpu_write(p->expire, 0);
+ continue;
+ }
- if (__this_cpu_dec_return(p->expire))
- continue;
+ if (__this_cpu_dec_return(p->expire))
+ continue;
- if (__this_cpu_read(p->pcp.count)) {
- drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
- changes++;
+ if (__this_cpu_read(p->pcp.count)) {
+ drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
+ changes++;
+ }
}
#endif
}
@@ -1259,7 +1261,7 @@ static cpumask_var_t cpu_stat_off;
static void vmstat_update(struct work_struct *w)
{
- if (refresh_cpu_vm_stats())
+ if (refresh_cpu_vm_stats(true)) {
/*
* Counters were updated so we expect more updates
* to occur in the future. Keep on running the
@@ -1267,7 +1269,7 @@ static void vmstat_update(struct work_struct *w)
*/
schedule_delayed_work(this_cpu_ptr(&vmstat_work),
round_jiffies_relative(sysctl_stat_interval));
- else {
+ } else {
/*
* We did not update any counters so the app may be in
* a mode where it does not cause counter updates.
@@ -1290,6 +1292,23 @@ static void vmstat_update(struct work_struct *w)
}
/*
+ * Switch off vmstat processing and then fold all the remaining differentials
+ * until the diffs stay at zero. The function is used by NOHZ and can only be
+ * invoked when tick processing is not active.
+ */
+void quiet_vmstat(void)
+{
+ if (system_state != SYSTEM_RUNNING)
+ return;
+
+ do {
+ if (!cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off))
+ cancel_delayed_work(this_cpu_ptr(&vmstat_work));
+
+ } while (refresh_cpu_vm_stats(false));
+}
+
+/*
* Check if the diffs for a certain cpu indicate that
* an update is needed.
*/
@@ -1321,7 +1340,7 @@ static bool need_update(int cpu)
*/
static void vmstat_shepherd(struct work_struct *w);
-static DECLARE_DELAYED_WORK(shepherd, vmstat_shepherd);
+static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd);
static void vmstat_shepherd(struct work_struct *w)
{
diff --git a/mm/zbud.c b/mm/zbud.c
index ecf1dbef6983..2010b6ecf5ce 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -130,7 +130,8 @@ static struct zbud_ops zbud_zpool_ops = {
.evict = zbud_zpool_evict
};
-static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops)
+static void *zbud_zpool_create(char *name, gfp_t gfp,
+ struct zpool_ops *zpool_ops)
{
return zbud_create_pool(gfp, &zbud_zpool_ops);
}
diff --git a/mm/zpool.c b/mm/zpool.c
index 739cdf0d183a..bacdab6e47de 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -129,6 +129,7 @@ static void zpool_put_driver(struct zpool_driver *driver)
/**
* zpool_create_pool() - Create a new zpool
* @type The type of the zpool to create (e.g. zbud, zsmalloc)
+ * @name The name of the zpool (e.g. zram0, zswap)
* @gfp The GFP flags to use when allocating the pool.
* @ops The optional ops callback.
*
@@ -140,7 +141,8 @@ static void zpool_put_driver(struct zpool_driver *driver)
*
* Returns: New zpool on success, NULL on failure.
*/
-struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops)
+struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
+ struct zpool_ops *ops)
{
struct zpool_driver *driver;
struct zpool *zpool;
@@ -168,7 +170,7 @@ struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops)
zpool->type = driver->type;
zpool->driver = driver;
- zpool->pool = driver->create(gfp, ops);
+ zpool->pool = driver->create(name, gfp, ops);
zpool->ops = ops;
if (!zpool->pool) {
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 839a48c3ca27..a8b5e749e84e 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -12,35 +12,6 @@
*/
/*
- * This allocator is designed for use with zram. Thus, the allocator is
- * supposed to work well under low memory conditions. In particular, it
- * never attempts higher order page allocation which is very likely to
- * fail under memory pressure. On the other hand, if we just use single
- * (0-order) pages, it would suffer from very high fragmentation --
- * any object of size PAGE_SIZE/2 or larger would occupy an entire page.
- * This was one of the major issues with its predecessor (xvmalloc).
- *
- * To overcome these issues, zsmalloc allocates a bunch of 0-order pages
- * and links them together using various 'struct page' fields. These linked
- * pages act as a single higher-order page i.e. an object can span 0-order
- * page boundaries. The code refers to these linked pages as a single entity
- * called zspage.
- *
- * For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE
- * since this satisfies the requirements of all its current users (in the
- * worst case, page is incompressible and is thus stored "as-is" i.e. in
- * uncompressed form). For allocation requests larger than this size, failure
- * is returned (see zs_malloc).
- *
- * Additionally, zs_malloc() does not return a dereferenceable pointer.
- * Instead, it returns an opaque handle (unsigned long) which encodes actual
- * location of the allocated object. The reason for this indirection is that
- * zsmalloc does not keep zspages permanently mapped since that would cause
- * issues on 32-bit systems where the VA region for kernel space mappings
- * is very small. So, before using the allocating memory, the object has to
- * be mapped using zs_map_object() to get a usable pointer and subsequently
- * unmapped using zs_unmap_object().
- *
* Following is how we use various fields and flags of underlying
* struct page(s) to form a zspage.
*
@@ -57,6 +28,8 @@
*
* page->private (union with page->first_page): refers to the
* component page after the first page
+ * If the page is first_page for huge object, it stores handle.
+ * Look at size_class->huge.
* page->freelist: points to the first free object in zspage.
* Free objects are linked together using in-place
* metadata.
@@ -78,6 +51,7 @@
#include <linux/module.h>
#include <linux/kernel.h>
+#include <linux/sched.h>
#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/highmem.h>
@@ -91,6 +65,7 @@
#include <linux/hardirq.h>
#include <linux/spinlock.h>
#include <linux/types.h>
+#include <linux/debugfs.h>
#include <linux/zsmalloc.h>
#include <linux/zpool.h>
@@ -109,6 +84,8 @@
#define ZS_MAX_ZSPAGE_ORDER 2
#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
+#define ZS_HANDLE_SIZE (sizeof(unsigned long))
+
/*
* Object location (<PFN>, <obj_idx>) is encoded as
* as single (unsigned long) handle value.
@@ -132,13 +109,33 @@
#endif
#endif
#define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT)
-#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS)
+
+/*
+ * Memory for allocating for handle keeps object position by
+ * encoding <page, obj_idx> and the encoded value has a room
+ * in least bit(ie, look at obj_to_location).
+ * We use the bit to synchronize between object access by
+ * user and migration.
+ */
+#define HANDLE_PIN_BIT 0
+
+/*
+ * Head in allocated object should have OBJ_ALLOCATED_TAG
+ * to identify the object was allocated or not.
+ * It's okay to add the status bit in the least bit because
+ * header keeps handle which is 4byte-aligned address so we
+ * have room for two bit at least.
+ */
+#define OBJ_ALLOCATED_TAG 1
+#define OBJ_TAG_BITS 1
+#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
#define MAX(a, b) ((a) >= (b) ? (a) : (b))
/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
#define ZS_MIN_ALLOC_SIZE \
MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
+/* each chunk includes extra space to keep handle */
#define ZS_MAX_ALLOC_SIZE PAGE_SIZE
/*
@@ -155,8 +152,6 @@
* (reason above)
*/
#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8)
-#define ZS_SIZE_CLASSES ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / \
- ZS_SIZE_CLASS_DELTA + 1)
/*
* We do not maintain any list for completely empty or full pages
@@ -170,6 +165,29 @@ enum fullness_group {
ZS_FULL
};
+enum zs_stat_type {
+ OBJ_ALLOCATED,
+ OBJ_USED,
+ CLASS_ALMOST_FULL,
+ CLASS_ALMOST_EMPTY,
+ NR_ZS_STAT_TYPE,
+};
+
+#ifdef CONFIG_ZSMALLOC_STAT
+
+static struct dentry *zs_stat_root;
+
+struct zs_size_stat {
+ unsigned long objs[NR_ZS_STAT_TYPE];
+};
+
+#endif
+
+/*
+ * number of size_classes
+ */
+static int zs_size_classes;
+
/*
* We assign a page to ZS_ALMOST_EMPTY fullness group when:
* n <= N / f, where
@@ -196,6 +214,12 @@ struct size_class {
/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
int pages_per_zspage;
+ /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
+ bool huge;
+
+#ifdef CONFIG_ZSMALLOC_STAT
+ struct zs_size_stat stats;
+#endif
spinlock_t lock;
@@ -209,15 +233,31 @@ struct size_class {
* This must be power of 2 and less than or equal to ZS_ALIGN
*/
struct link_free {
- /* Handle of next free chunk (encodes <PFN, obj_idx>) */
- void *next;
+ union {
+ /*
+ * Position of next free chunk (encodes <PFN, obj_idx>)
+ * It's valid for non-allocated object
+ */
+ void *next;
+ /*
+ * Handle of allocated object.
+ */
+ unsigned long handle;
+ };
};
struct zs_pool {
- struct size_class size_class[ZS_SIZE_CLASSES];
+ char *name;
+
+ struct size_class **size_class;
+ struct kmem_cache *handle_cachep;
gfp_t flags; /* allocation flags used when growing pool */
atomic_long_t pages_allocated;
+
+#ifdef CONFIG_ZSMALLOC_STAT
+ struct dentry *stat_dentry;
+#endif
};
/*
@@ -237,15 +277,45 @@ struct mapping_area {
#endif
char *vm_addr; /* address of kmap_atomic()'ed pages */
enum zs_mapmode vm_mm; /* mapping mode */
+ bool huge;
};
+static int create_handle_cache(struct zs_pool *pool)
+{
+ pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
+ 0, 0, NULL);
+ return pool->handle_cachep ? 0 : 1;
+}
+
+static void destroy_handle_cache(struct zs_pool *pool)
+{
+ if (pool->handle_cachep)
+ kmem_cache_destroy(pool->handle_cachep);
+}
+
+static unsigned long alloc_handle(struct zs_pool *pool)
+{
+ return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
+ pool->flags & ~__GFP_HIGHMEM);
+}
+
+static void free_handle(struct zs_pool *pool, unsigned long handle)
+{
+ kmem_cache_free(pool->handle_cachep, (void *)handle);
+}
+
+static void record_obj(unsigned long handle, unsigned long obj)
+{
+ *(unsigned long *)handle = obj;
+}
+
/* zpool driver */
#ifdef CONFIG_ZPOOL
-static void *zs_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops)
+static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops)
{
- return zs_create_pool(gfp);
+ return zs_create_pool(name, gfp);
}
static void zs_zpool_destroy(void *pool)
@@ -316,6 +386,11 @@ static struct zpool_driver zs_zpool_driver = {
MODULE_ALIAS("zpool-zsmalloc");
#endif /* CONFIG_ZPOOL */
+static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
+{
+ return pages_per_zspage * PAGE_SIZE / size;
+}
+
/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
@@ -366,9 +441,182 @@ static int get_size_class_index(int size)
idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
ZS_SIZE_CLASS_DELTA);
- return idx;
+ return min(zs_size_classes - 1, idx);
+}
+
+#ifdef CONFIG_ZSMALLOC_STAT
+
+static inline void zs_stat_inc(struct size_class *class,
+ enum zs_stat_type type, unsigned long cnt)
+{
+ class->stats.objs[type] += cnt;
+}
+
+static inline void zs_stat_dec(struct size_class *class,
+ enum zs_stat_type type, unsigned long cnt)
+{
+ class->stats.objs[type] -= cnt;
+}
+
+static inline unsigned long zs_stat_get(struct size_class *class,
+ enum zs_stat_type type)
+{
+ return class->stats.objs[type];
+}
+
+static int __init zs_stat_init(void)
+{
+ if (!debugfs_initialized())
+ return -ENODEV;
+
+ zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
+ if (!zs_stat_root)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void __exit zs_stat_exit(void)
+{
+ debugfs_remove_recursive(zs_stat_root);
}
+static int zs_stats_size_show(struct seq_file *s, void *v)
+{
+ int i;
+ struct zs_pool *pool = s->private;
+ struct size_class *class;
+ int objs_per_zspage;
+ unsigned long class_almost_full, class_almost_empty;
+ unsigned long obj_allocated, obj_used, pages_used;
+ unsigned long total_class_almost_full = 0, total_class_almost_empty = 0;
+ unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
+
+ seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s\n",
+ "class", "size", "almost_full", "almost_empty",
+ "obj_allocated", "obj_used", "pages_used",
+ "pages_per_zspage");
+
+ for (i = 0; i < zs_size_classes; i++) {
+ class = pool->size_class[i];
+
+ if (class->index != i)
+ continue;
+
+ spin_lock(&class->lock);
+ class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL);
+ class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY);
+ obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
+ obj_used = zs_stat_get(class, OBJ_USED);
+ spin_unlock(&class->lock);
+
+ objs_per_zspage = get_maxobj_per_zspage(class->size,
+ class->pages_per_zspage);
+ pages_used = obj_allocated / objs_per_zspage *
+ class->pages_per_zspage;
+
+ seq_printf(s, " %5u %5u %11lu %12lu %13lu %10lu %10lu %16d\n",
+ i, class->size, class_almost_full, class_almost_empty,
+ obj_allocated, obj_used, pages_used,
+ class->pages_per_zspage);
+
+ total_class_almost_full += class_almost_full;
+ total_class_almost_empty += class_almost_empty;
+ total_objs += obj_allocated;
+ total_used_objs += obj_used;
+ total_pages += pages_used;
+ }
+
+ seq_puts(s, "\n");
+ seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu\n",
+ "Total", "", total_class_almost_full,
+ total_class_almost_empty, total_objs,
+ total_used_objs, total_pages);
+
+ return 0;
+}
+
+static int zs_stats_size_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, zs_stats_size_show, inode->i_private);
+}
+
+static const struct file_operations zs_stat_size_ops = {
+ .open = zs_stats_size_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int zs_pool_stat_create(char *name, struct zs_pool *pool)
+{
+ struct dentry *entry;
+
+ if (!zs_stat_root)
+ return -ENODEV;
+
+ entry = debugfs_create_dir(name, zs_stat_root);
+ if (!entry) {
+ pr_warn("debugfs dir <%s> creation failed\n", name);
+ return -ENOMEM;
+ }
+ pool->stat_dentry = entry;
+
+ entry = debugfs_create_file("classes", S_IFREG | S_IRUGO,
+ pool->stat_dentry, pool, &zs_stat_size_ops);
+ if (!entry) {
+ pr_warn("%s: debugfs file entry <%s> creation failed\n",
+ name, "classes");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void zs_pool_stat_destroy(struct zs_pool *pool)
+{
+ debugfs_remove_recursive(pool->stat_dentry);
+}
+
+#else /* CONFIG_ZSMALLOC_STAT */
+
+static inline void zs_stat_inc(struct size_class *class,
+ enum zs_stat_type type, unsigned long cnt)
+{
+}
+
+static inline void zs_stat_dec(struct size_class *class,
+ enum zs_stat_type type, unsigned long cnt)
+{
+}
+
+static inline unsigned long zs_stat_get(struct size_class *class,
+ enum zs_stat_type type)
+{
+ return 0;
+}
+
+static int __init zs_stat_init(void)
+{
+ return 0;
+}
+
+static void __exit zs_stat_exit(void)
+{
+}
+
+static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
+{
+ return 0;
+}
+
+static inline void zs_pool_stat_destroy(struct zs_pool *pool)
+{
+}
+
+#endif
+
+
/*
* For each size class, zspages are divided into different groups
* depending on how "full" they are. This was done so that we could
@@ -389,7 +637,7 @@ static enum fullness_group get_fullness_group(struct page *page)
fg = ZS_EMPTY;
else if (inuse == max_objects)
fg = ZS_FULL;
- else if (inuse <= max_objects / fullness_threshold_frac)
+ else if (inuse <= 3 * max_objects / fullness_threshold_frac)
fg = ZS_ALMOST_EMPTY;
else
fg = ZS_ALMOST_FULL;
@@ -418,6 +666,8 @@ static void insert_zspage(struct page *page, struct size_class *class,
list_add_tail(&page->lru, &(*head)->lru);
*head = page;
+ zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ?
+ CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
}
/*
@@ -443,6 +693,8 @@ static void remove_zspage(struct page *page, struct size_class *class,
struct page, lru);
list_del_init(&page->lru);
+ zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ?
+ CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
}
/*
@@ -454,11 +706,10 @@ static void remove_zspage(struct page *page, struct size_class *class,
* page from the freelist of the old fullness group to that of the new
* fullness group.
*/
-static enum fullness_group fix_fullness_group(struct zs_pool *pool,
+static enum fullness_group fix_fullness_group(struct size_class *class,
struct page *page)
{
int class_idx;
- struct size_class *class;
enum fullness_group currfg, newfg;
BUG_ON(!is_first_page(page));
@@ -468,7 +719,6 @@ static enum fullness_group fix_fullness_group(struct zs_pool *pool,
if (newfg == currfg)
goto out;
- class = &pool->size_class[class_idx];
remove_zspage(page, class, currfg);
insert_zspage(page, class, newfg);
set_zspage_mapping(page, class_idx, newfg);
@@ -482,7 +732,8 @@ out:
* to form a zspage for each size class. This is important
* to reduce wastage due to unusable space left at end of
* each zspage which is given as:
- * wastage = Zp - Zp % size_class
+ * wastage = Zp % class_size
+ * usage = Zp - wastage
* where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ...
*
* For example, for size class of 3/8 * PAGE_SIZE, we should
@@ -541,35 +792,50 @@ static struct page *get_next_page(struct page *page)
/*
* Encode <page, obj_idx> as a single handle value.
- * On hardware platforms with physical memory starting at 0x0 the pfn
- * could be 0 so we ensure that the handle will never be 0 by adjusting the
- * encoded obj_idx value before encoding.
+ * We use the least bit of handle for tagging.
*/
-static void *obj_location_to_handle(struct page *page, unsigned long obj_idx)
+static void *location_to_obj(struct page *page, unsigned long obj_idx)
{
- unsigned long handle;
+ unsigned long obj;
if (!page) {
BUG_ON(obj_idx);
return NULL;
}
- handle = page_to_pfn(page) << OBJ_INDEX_BITS;
- handle |= ((obj_idx + 1) & OBJ_INDEX_MASK);
+ obj = page_to_pfn(page) << OBJ_INDEX_BITS;
+ obj |= ((obj_idx) & OBJ_INDEX_MASK);
+ obj <<= OBJ_TAG_BITS;
- return (void *)handle;
+ return (void *)obj;
}
/*
* Decode <page, obj_idx> pair from the given object handle. We adjust the
* decoded obj_idx back to its original value since it was adjusted in
- * obj_location_to_handle().
+ * location_to_obj().
*/
-static void obj_handle_to_location(unsigned long handle, struct page **page,
+static void obj_to_location(unsigned long obj, struct page **page,
unsigned long *obj_idx)
{
- *page = pfn_to_page(handle >> OBJ_INDEX_BITS);
- *obj_idx = (handle & OBJ_INDEX_MASK) - 1;
+ obj >>= OBJ_TAG_BITS;
+ *page = pfn_to_page(obj >> OBJ_INDEX_BITS);
+ *obj_idx = (obj & OBJ_INDEX_MASK);
+}
+
+static unsigned long handle_to_obj(unsigned long handle)
+{
+ return *(unsigned long *)handle;
+}
+
+static unsigned long obj_to_head(struct size_class *class, struct page *page,
+ void *obj)
+{
+ if (class->huge) {
+ VM_BUG_ON(!is_first_page(page));
+ return *(unsigned long *)page_private(page);
+ } else
+ return *(unsigned long *)obj;
}
static unsigned long obj_idx_to_offset(struct page *page,
@@ -583,6 +849,25 @@ static unsigned long obj_idx_to_offset(struct page *page,
return off + obj_idx * class_size;
}
+static inline int trypin_tag(unsigned long handle)
+{
+ unsigned long *ptr = (unsigned long *)handle;
+
+ return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr);
+}
+
+static void pin_tag(unsigned long handle)
+{
+ while (!trypin_tag(handle));
+}
+
+static void unpin_tag(unsigned long handle)
+{
+ unsigned long *ptr = (unsigned long *)handle;
+
+ clear_bit_unlock(HANDLE_PIN_BIT, ptr);
+}
+
static void reset_page(struct page *page)
{
clear_bit(PG_private, &page->flags);
@@ -629,6 +914,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
struct page *next_page;
struct link_free *link;
unsigned int i = 1;
+ void *vaddr;
/*
* page->index stores offset of first object starting
@@ -639,11 +925,11 @@ static void init_zspage(struct page *first_page, struct size_class *class)
if (page != first_page)
page->index = off;
- link = (struct link_free *)kmap_atomic(page) +
- off / sizeof(*link);
+ vaddr = kmap_atomic(page);
+ link = (struct link_free *)vaddr + off / sizeof(*link);
while ((off += class->size) < PAGE_SIZE) {
- link->next = obj_location_to_handle(page, i++);
+ link->next = location_to_obj(page, i++);
link += class->size / sizeof(*link);
}
@@ -653,8 +939,8 @@ static void init_zspage(struct page *first_page, struct size_class *class)
* page (if present)
*/
next_page = get_next_page(page);
- link->next = obj_location_to_handle(next_page, 0);
- kunmap_atomic(link);
+ link->next = location_to_obj(next_page, 0);
+ kunmap_atomic(vaddr);
page = next_page;
off %= PAGE_SIZE;
}
@@ -707,7 +993,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
init_zspage(first_page, class);
- first_page->freelist = obj_location_to_handle(first_page, 0);
+ first_page->freelist = location_to_obj(first_page, 0);
/* Maximum number of objects we can store in this zspage */
first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;
@@ -784,7 +1070,7 @@ static inline int __zs_cpu_up(struct mapping_area *area)
*/
if (area->vm_buf)
return 0;
- area->vm_buf = (char *)__get_free_page(GFP_KERNEL);
+ area->vm_buf = kmalloc(ZS_MAX_ALLOC_SIZE, GFP_KERNEL);
if (!area->vm_buf)
return -ENOMEM;
return 0;
@@ -792,8 +1078,7 @@ static inline int __zs_cpu_up(struct mapping_area *area)
static inline void __zs_cpu_down(struct mapping_area *area)
{
- if (area->vm_buf)
- free_page((unsigned long)area->vm_buf);
+ kfree(area->vm_buf);
area->vm_buf = NULL;
}
@@ -830,12 +1115,19 @@ static void __zs_unmap_object(struct mapping_area *area,
{
int sizes[2];
void *addr;
- char *buf = area->vm_buf;
+ char *buf;
/* no write fastpath */
if (area->vm_mm == ZS_MM_RO)
goto out;
+ buf = area->vm_buf;
+ if (!area->huge) {
+ buf = buf + ZS_HANDLE_SIZE;
+ size -= ZS_HANDLE_SIZE;
+ off += ZS_HANDLE_SIZE;
+ }
+
sizes[0] = PAGE_SIZE - off;
sizes[1] = size - sizes[0];
@@ -881,13 +1173,26 @@ static struct notifier_block zs_cpu_nb = {
.notifier_call = zs_cpu_notifier
};
-static void zs_exit(void)
+static int zs_register_cpu_notifier(void)
{
- int cpu;
+ int cpu, uninitialized_var(ret);
-#ifdef CONFIG_ZPOOL
- zpool_unregister_driver(&zs_zpool_driver);
-#endif
+ cpu_notifier_register_begin();
+
+ __register_cpu_notifier(&zs_cpu_nb);
+ for_each_online_cpu(cpu) {
+ ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
+ if (notifier_to_errno(ret))
+ break;
+ }
+
+ cpu_notifier_register_done();
+ return notifier_to_errno(ret);
+}
+
+static void zs_unregister_cpu_notifier(void)
+{
+ int cpu;
cpu_notifier_register_begin();
@@ -898,93 +1203,176 @@ static void zs_exit(void)
cpu_notifier_register_done();
}
-static int zs_init(void)
+static void init_zs_size_classes(void)
{
- int cpu, ret;
+ int nr;
- cpu_notifier_register_begin();
+ nr = (ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / ZS_SIZE_CLASS_DELTA + 1;
+ if ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) % ZS_SIZE_CLASS_DELTA)
+ nr += 1;
- __register_cpu_notifier(&zs_cpu_nb);
- for_each_online_cpu(cpu) {
- ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
- if (notifier_to_errno(ret)) {
- cpu_notifier_register_done();
- goto fail;
- }
- }
+ zs_size_classes = nr;
+}
- cpu_notifier_register_done();
+static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
+{
+ if (prev->pages_per_zspage != pages_per_zspage)
+ return false;
-#ifdef CONFIG_ZPOOL
- zpool_register_driver(&zs_zpool_driver);
-#endif
+ if (get_maxobj_per_zspage(prev->size, prev->pages_per_zspage)
+ != get_maxobj_per_zspage(size, pages_per_zspage))
+ return false;
- return 0;
-fail:
- zs_exit();
- return notifier_to_errno(ret);
+ return true;
+}
+
+static bool zspage_full(struct page *page)
+{
+ BUG_ON(!is_first_page(page));
+
+ return page->inuse == page->objects;
+}
+
+unsigned long zs_get_total_pages(struct zs_pool *pool)
+{
+ return atomic_long_read(&pool->pages_allocated);
}
+EXPORT_SYMBOL_GPL(zs_get_total_pages);
/**
- * zs_create_pool - Creates an allocation pool to work from.
- * @flags: allocation flags used to allocate pool metadata
+ * zs_map_object - get address of allocated object from handle.
+ * @pool: pool from which the object was allocated
+ * @handle: handle returned from zs_malloc
*
- * This function must be called before anything when using
- * the zsmalloc allocator.
+ * Before using an object allocated from zs_malloc, it must be mapped using
+ * this function. When done with the object, it must be unmapped using
+ * zs_unmap_object.
*
- * On success, a pointer to the newly created pool is returned,
- * otherwise NULL.
+ * Only one object can be mapped per cpu at a time. There is no protection
+ * against nested mappings.
+ *
+ * This function returns with preemption and page faults disabled.
*/
-struct zs_pool *zs_create_pool(gfp_t flags)
+void *zs_map_object(struct zs_pool *pool, unsigned long handle,
+ enum zs_mapmode mm)
{
- int i, ovhd_size;
- struct zs_pool *pool;
+ struct page *page;
+ unsigned long obj, obj_idx, off;
- ovhd_size = roundup(sizeof(*pool), PAGE_SIZE);
- pool = kzalloc(ovhd_size, GFP_KERNEL);
- if (!pool)
- return NULL;
+ unsigned int class_idx;
+ enum fullness_group fg;
+ struct size_class *class;
+ struct mapping_area *area;
+ struct page *pages[2];
+ void *ret;
- for (i = 0; i < ZS_SIZE_CLASSES; i++) {
- int size;
- struct size_class *class;
+ BUG_ON(!handle);
- size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
- if (size > ZS_MAX_ALLOC_SIZE)
- size = ZS_MAX_ALLOC_SIZE;
+ /*
+ * Because we use per-cpu mapping areas shared among the
+ * pools/users, we can't allow mapping in interrupt context
+ * because it can corrupt another users mappings.
+ */
+ BUG_ON(in_interrupt());
- class = &pool->size_class[i];
- class->size = size;
- class->index = i;
- spin_lock_init(&class->lock);
- class->pages_per_zspage = get_pages_per_zspage(size);
+ /* From now on, migration cannot move the object */
+ pin_tag(handle);
+
+ obj = handle_to_obj(handle);
+ obj_to_location(obj, &page, &obj_idx);
+ get_zspage_mapping(get_first_page(page), &class_idx, &fg);
+ class = pool->size_class[class_idx];
+ off = obj_idx_to_offset(page, obj_idx, class->size);
+ area = &get_cpu_var(zs_map_area);
+ area->vm_mm = mm;
+ if (off + class->size <= PAGE_SIZE) {
+ /* this object is contained entirely within a page */
+ area->vm_addr = kmap_atomic(page);
+ ret = area->vm_addr + off;
+ goto out;
}
- pool->flags = flags;
+ /* this object spans two pages */
+ pages[0] = page;
+ pages[1] = get_next_page(page);
+ BUG_ON(!pages[1]);
- return pool;
+ ret = __zs_map_object(area, pages, off, class->size);
+out:
+ if (!class->huge)
+ ret += ZS_HANDLE_SIZE;
+
+ return ret;
}
-EXPORT_SYMBOL_GPL(zs_create_pool);
+EXPORT_SYMBOL_GPL(zs_map_object);
-void zs_destroy_pool(struct zs_pool *pool)
+void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
{
- int i;
+ struct page *page;
+ unsigned long obj, obj_idx, off;
- for (i = 0; i < ZS_SIZE_CLASSES; i++) {
- int fg;
- struct size_class *class = &pool->size_class[i];
+ unsigned int class_idx;
+ enum fullness_group fg;
+ struct size_class *class;
+ struct mapping_area *area;
- for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
- if (class->fullness_list[fg]) {
- pr_info("Freeing non-empty class with size %db, fullness group %d\n",
- class->size, fg);
- }
- }
+ BUG_ON(!handle);
+
+ obj = handle_to_obj(handle);
+ obj_to_location(obj, &page, &obj_idx);
+ get_zspage_mapping(get_first_page(page), &class_idx, &fg);
+ class = pool->size_class[class_idx];
+ off = obj_idx_to_offset(page, obj_idx, class->size);
+
+ area = this_cpu_ptr(&zs_map_area);
+ if (off + class->size <= PAGE_SIZE)
+ kunmap_atomic(area->vm_addr);
+ else {
+ struct page *pages[2];
+
+ pages[0] = page;
+ pages[1] = get_next_page(page);
+ BUG_ON(!pages[1]);
+
+ __zs_unmap_object(area, pages, off, class->size);
}
- kfree(pool);
+ put_cpu_var(zs_map_area);
+ unpin_tag(handle);
}
-EXPORT_SYMBOL_GPL(zs_destroy_pool);
+EXPORT_SYMBOL_GPL(zs_unmap_object);
+
+static unsigned long obj_malloc(struct page *first_page,
+ struct size_class *class, unsigned long handle)
+{
+ unsigned long obj;
+ struct link_free *link;
+
+ struct page *m_page;
+ unsigned long m_objidx, m_offset;
+ void *vaddr;
+
+ handle |= OBJ_ALLOCATED_TAG;
+ obj = (unsigned long)first_page->freelist;
+ obj_to_location(obj, &m_page, &m_objidx);
+ m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
+
+ vaddr = kmap_atomic(m_page);
+ link = (struct link_free *)vaddr + m_offset / sizeof(*link);
+ first_page->freelist = link->next;
+ if (!class->huge)
+ /* record handle in the header of allocated chunk */
+ link->handle = handle;
+ else
+ /* record handle in first_page->private */
+ set_page_private(first_page, handle);
+ kunmap_atomic(vaddr);
+ first_page->inuse++;
+ zs_stat_inc(class, OBJ_USED, 1);
+
+ return obj;
+}
+
/**
* zs_malloc - Allocate block of given size from pool.
@@ -997,20 +1385,20 @@ EXPORT_SYMBOL_GPL(zs_destroy_pool);
*/
unsigned long zs_malloc(struct zs_pool *pool, size_t size)
{
- unsigned long obj;
- struct link_free *link;
- int class_idx;
+ unsigned long handle, obj;
struct size_class *class;
-
- struct page *first_page, *m_page;
- unsigned long m_objidx, m_offset;
+ struct page *first_page;
if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
return 0;
- class_idx = get_size_class_index(size);
- class = &pool->size_class[class_idx];
- BUG_ON(class_idx != class->index);
+ handle = alloc_handle(pool);
+ if (!handle)
+ return 0;
+
+ /* extra space in chunk to keep the handle */
+ size += ZS_HANDLE_SIZE;
+ class = pool->size_class[get_size_class_index(size)];
spin_lock(&class->lock);
first_page = find_get_zspage(class);
@@ -1018,170 +1406,539 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
if (!first_page) {
spin_unlock(&class->lock);
first_page = alloc_zspage(class, pool->flags);
- if (unlikely(!first_page))
+ if (unlikely(!first_page)) {
+ free_handle(pool, handle);
return 0;
+ }
set_zspage_mapping(first_page, class->index, ZS_EMPTY);
atomic_long_add(class->pages_per_zspage,
&pool->pages_allocated);
+
spin_lock(&class->lock);
+ zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
+ class->size, class->pages_per_zspage));
}
- obj = (unsigned long)first_page->freelist;
- obj_handle_to_location(obj, &m_page, &m_objidx);
- m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
-
- link = (struct link_free *)kmap_atomic(m_page) +
- m_offset / sizeof(*link);
- first_page->freelist = link->next;
- memset(link, POISON_INUSE, sizeof(*link));
- kunmap_atomic(link);
-
- first_page->inuse++;
+ obj = obj_malloc(first_page, class, handle);
/* Now move the zspage to another fullness group, if required */
- fix_fullness_group(pool, first_page);
+ fix_fullness_group(class, first_page);
+ record_obj(handle, obj);
spin_unlock(&class->lock);
- return obj;
+ return handle;
}
EXPORT_SYMBOL_GPL(zs_malloc);
-void zs_free(struct zs_pool *pool, unsigned long obj)
+static void obj_free(struct zs_pool *pool, struct size_class *class,
+ unsigned long obj)
{
struct link_free *link;
struct page *first_page, *f_page;
unsigned long f_objidx, f_offset;
-
+ void *vaddr;
int class_idx;
- struct size_class *class;
enum fullness_group fullness;
- if (unlikely(!obj))
- return;
+ BUG_ON(!obj);
- obj_handle_to_location(obj, &f_page, &f_objidx);
+ obj &= ~OBJ_ALLOCATED_TAG;
+ obj_to_location(obj, &f_page, &f_objidx);
first_page = get_first_page(f_page);
get_zspage_mapping(first_page, &class_idx, &fullness);
- class = &pool->size_class[class_idx];
f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
- spin_lock(&class->lock);
+ vaddr = kmap_atomic(f_page);
/* Insert this object in containing zspage's freelist */
- link = (struct link_free *)((unsigned char *)kmap_atomic(f_page)
- + f_offset);
+ link = (struct link_free *)(vaddr + f_offset);
link->next = first_page->freelist;
- kunmap_atomic(link);
+ if (class->huge)
+ set_page_private(first_page, 0);
+ kunmap_atomic(vaddr);
first_page->freelist = (void *)obj;
-
first_page->inuse--;
- fullness = fix_fullness_group(pool, first_page);
- spin_unlock(&class->lock);
+ zs_stat_dec(class, OBJ_USED, 1);
+}
+
+void zs_free(struct zs_pool *pool, unsigned long handle)
+{
+ struct page *first_page, *f_page;
+ unsigned long obj, f_objidx;
+ int class_idx;
+ struct size_class *class;
+ enum fullness_group fullness;
+
+ if (unlikely(!handle))
+ return;
+ pin_tag(handle);
+ obj = handle_to_obj(handle);
+ obj_to_location(obj, &f_page, &f_objidx);
+ first_page = get_first_page(f_page);
+
+ get_zspage_mapping(first_page, &class_idx, &fullness);
+ class = pool->size_class[class_idx];
+
+ spin_lock(&class->lock);
+ obj_free(pool, class, obj);
+ fullness = fix_fullness_group(class, first_page);
if (fullness == ZS_EMPTY) {
+ zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
+ class->size, class->pages_per_zspage));
atomic_long_sub(class->pages_per_zspage,
&pool->pages_allocated);
free_zspage(first_page);
}
+ spin_unlock(&class->lock);
+ unpin_tag(handle);
+
+ free_handle(pool, handle);
}
EXPORT_SYMBOL_GPL(zs_free);
+static void zs_object_copy(unsigned long src, unsigned long dst,
+ struct size_class *class)
+{
+ struct page *s_page, *d_page;
+ unsigned long s_objidx, d_objidx;
+ unsigned long s_off, d_off;
+ void *s_addr, *d_addr;
+ int s_size, d_size, size;
+ int written = 0;
+
+ s_size = d_size = class->size;
+
+ obj_to_location(src, &s_page, &s_objidx);
+ obj_to_location(dst, &d_page, &d_objidx);
+
+ s_off = obj_idx_to_offset(s_page, s_objidx, class->size);
+ d_off = obj_idx_to_offset(d_page, d_objidx, class->size);
+
+ if (s_off + class->size > PAGE_SIZE)
+ s_size = PAGE_SIZE - s_off;
+
+ if (d_off + class->size > PAGE_SIZE)
+ d_size = PAGE_SIZE - d_off;
+
+ s_addr = kmap_atomic(s_page);
+ d_addr = kmap_atomic(d_page);
+
+ while (1) {
+ size = min(s_size, d_size);
+ memcpy(d_addr + d_off, s_addr + s_off, size);
+ written += size;
+
+ if (written == class->size)
+ break;
+
+ s_off += size;
+ s_size -= size;
+ d_off += size;
+ d_size -= size;
+
+ if (s_off >= PAGE_SIZE) {
+ kunmap_atomic(d_addr);
+ kunmap_atomic(s_addr);
+ s_page = get_next_page(s_page);
+ BUG_ON(!s_page);
+ s_addr = kmap_atomic(s_page);
+ d_addr = kmap_atomic(d_page);
+ s_size = class->size - written;
+ s_off = 0;
+ }
+
+ if (d_off >= PAGE_SIZE) {
+ kunmap_atomic(d_addr);
+ d_page = get_next_page(d_page);
+ BUG_ON(!d_page);
+ d_addr = kmap_atomic(d_page);
+ d_size = class->size - written;
+ d_off = 0;
+ }
+ }
+
+ kunmap_atomic(d_addr);
+ kunmap_atomic(s_addr);
+}
+
+/*
+ * Find alloced object in zspage from index object and
+ * return handle.
+ */
+static unsigned long find_alloced_obj(struct page *page, int index,
+ struct size_class *class)
+{
+ unsigned long head;
+ int offset = 0;
+ unsigned long handle = 0;
+ void *addr = kmap_atomic(page);
+
+ if (!is_first_page(page))
+ offset = page->index;
+ offset += class->size * index;
+
+ while (offset < PAGE_SIZE) {
+ head = obj_to_head(class, page, addr + offset);
+ if (head & OBJ_ALLOCATED_TAG) {
+ handle = head & ~OBJ_ALLOCATED_TAG;
+ if (trypin_tag(handle))
+ break;
+ handle = 0;
+ }
+
+ offset += class->size;
+ index++;
+ }
+
+ kunmap_atomic(addr);
+ return handle;
+}
+
+struct zs_compact_control {
+ /* Source page for migration which could be a subpage of zspage. */
+ struct page *s_page;
+ /* Destination page for migration which should be a first page
+ * of zspage. */
+ struct page *d_page;
+ /* Starting object index within @s_page which used for live object
+ * in the subpage. */
+ int index;
+ /* how many of objects are migrated */
+ int nr_migrated;
+};
+
+static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
+ struct zs_compact_control *cc)
+{
+ unsigned long used_obj, free_obj;
+ unsigned long handle;
+ struct page *s_page = cc->s_page;
+ struct page *d_page = cc->d_page;
+ unsigned long index = cc->index;
+ int nr_migrated = 0;
+ int ret = 0;
+
+ while (1) {
+ handle = find_alloced_obj(s_page, index, class);
+ if (!handle) {
+ s_page = get_next_page(s_page);
+ if (!s_page)
+ break;
+ index = 0;
+ continue;
+ }
+
+ /* Stop if there is no more space */
+ if (zspage_full(d_page)) {
+ unpin_tag(handle);
+ ret = -ENOMEM;
+ break;
+ }
+
+ used_obj = handle_to_obj(handle);
+ free_obj = obj_malloc(d_page, class, handle);
+ zs_object_copy(used_obj, free_obj, class);
+ index++;
+ record_obj(handle, free_obj);
+ unpin_tag(handle);
+ obj_free(pool, class, used_obj);
+ nr_migrated++;
+ }
+
+ /* Remember last position in this iteration */
+ cc->s_page = s_page;
+ cc->index = index;
+ cc->nr_migrated = nr_migrated;
+
+ return ret;
+}
+
+static struct page *alloc_target_page(struct size_class *class)
+{
+ int i;
+ struct page *page;
+
+ for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
+ page = class->fullness_list[i];
+ if (page) {
+ remove_zspage(page, class, i);
+ break;
+ }
+ }
+
+ return page;
+}
+
+static void putback_zspage(struct zs_pool *pool, struct size_class *class,
+ struct page *first_page)
+{
+ enum fullness_group fullness;
+
+ BUG_ON(!is_first_page(first_page));
+
+ fullness = get_fullness_group(first_page);
+ insert_zspage(first_page, class, fullness);
+ set_zspage_mapping(first_page, class->index, fullness);
+
+ if (fullness == ZS_EMPTY) {
+ zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
+ class->size, class->pages_per_zspage));
+ atomic_long_sub(class->pages_per_zspage,
+ &pool->pages_allocated);
+
+ free_zspage(first_page);
+ }
+}
+
+static struct page *isolate_source_page(struct size_class *class)
+{
+ struct page *page;
+
+ page = class->fullness_list[ZS_ALMOST_EMPTY];
+ if (page)
+ remove_zspage(page, class, ZS_ALMOST_EMPTY);
+
+ return page;
+}
+
+static unsigned long __zs_compact(struct zs_pool *pool,
+ struct size_class *class)
+{
+ int nr_to_migrate;
+ struct zs_compact_control cc;
+ struct page *src_page;
+ struct page *dst_page = NULL;
+ unsigned long nr_total_migrated = 0;
+
+ spin_lock(&class->lock);
+ while ((src_page = isolate_source_page(class))) {
+
+ BUG_ON(!is_first_page(src_page));
+
+ /* The goal is to migrate all live objects in source page */
+ nr_to_migrate = src_page->inuse;
+ cc.index = 0;
+ cc.s_page = src_page;
+
+ while ((dst_page = alloc_target_page(class))) {
+ cc.d_page = dst_page;
+ /*
+ * If there is no more space in dst_page, try to
+ * allocate another zspage.
+ */
+ if (!migrate_zspage(pool, class, &cc))
+ break;
+
+ putback_zspage(pool, class, dst_page);
+ nr_total_migrated += cc.nr_migrated;
+ nr_to_migrate -= cc.nr_migrated;
+ }
+
+ /* Stop if we couldn't find slot */
+ if (dst_page == NULL)
+ break;
+
+ putback_zspage(pool, class, dst_page);
+ putback_zspage(pool, class, src_page);
+ spin_unlock(&class->lock);
+ nr_total_migrated += cc.nr_migrated;
+ cond_resched();
+ spin_lock(&class->lock);
+ }
+
+ if (src_page)
+ putback_zspage(pool, class, src_page);
+
+ spin_unlock(&class->lock);
+
+ return nr_total_migrated;
+}
+
+unsigned long zs_compact(struct zs_pool *pool)
+{
+ int i;
+ unsigned long nr_migrated = 0;
+ struct size_class *class;
+
+ for (i = zs_size_classes - 1; i >= 0; i--) {
+ class = pool->size_class[i];
+ if (!class)
+ continue;
+ if (class->index != i)
+ continue;
+ nr_migrated += __zs_compact(pool, class);
+ }
+
+ return nr_migrated;
+}
+EXPORT_SYMBOL_GPL(zs_compact);
+
/**
- * zs_map_object - get address of allocated object from handle.
- * @pool: pool from which the object was allocated
- * @handle: handle returned from zs_malloc
- *
- * Before using an object allocated from zs_malloc, it must be mapped using
- * this function. When done with the object, it must be unmapped using
- * zs_unmap_object.
+ * zs_create_pool - Creates an allocation pool to work from.
+ * @flags: allocation flags used to allocate pool metadata
*
- * Only one object can be mapped per cpu at a time. There is no protection
- * against nested mappings.
+ * This function must be called before anything when using
+ * the zsmalloc allocator.
*
- * This function returns with preemption and page faults disabled.
+ * On success, a pointer to the newly created pool is returned,
+ * otherwise NULL.
*/
-void *zs_map_object(struct zs_pool *pool, unsigned long handle,
- enum zs_mapmode mm)
+struct zs_pool *zs_create_pool(char *name, gfp_t flags)
{
- struct page *page;
- unsigned long obj_idx, off;
+ int i;
+ struct zs_pool *pool;
+ struct size_class *prev_class = NULL;
- unsigned int class_idx;
- enum fullness_group fg;
- struct size_class *class;
- struct mapping_area *area;
- struct page *pages[2];
+ pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+ if (!pool)
+ return NULL;
- BUG_ON(!handle);
+ pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
+ GFP_KERNEL);
+ if (!pool->size_class) {
+ kfree(pool);
+ return NULL;
+ }
+
+ pool->name = kstrdup(name, GFP_KERNEL);
+ if (!pool->name)
+ goto err;
+
+ if (create_handle_cache(pool))
+ goto err;
/*
- * Because we use per-cpu mapping areas shared among the
- * pools/users, we can't allow mapping in interrupt context
- * because it can corrupt another users mappings.
+ * Iterate reversly, because, size of size_class that we want to use
+ * for merging should be larger or equal to current size.
*/
- BUG_ON(in_interrupt());
+ for (i = zs_size_classes - 1; i >= 0; i--) {
+ int size;
+ int pages_per_zspage;
+ struct size_class *class;
- obj_handle_to_location(handle, &page, &obj_idx);
- get_zspage_mapping(get_first_page(page), &class_idx, &fg);
- class = &pool->size_class[class_idx];
- off = obj_idx_to_offset(page, obj_idx, class->size);
+ size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
+ if (size > ZS_MAX_ALLOC_SIZE)
+ size = ZS_MAX_ALLOC_SIZE;
+ pages_per_zspage = get_pages_per_zspage(size);
- area = &get_cpu_var(zs_map_area);
- area->vm_mm = mm;
- if (off + class->size <= PAGE_SIZE) {
- /* this object is contained entirely within a page */
- area->vm_addr = kmap_atomic(page);
- return area->vm_addr + off;
+ /*
+ * size_class is used for normal zsmalloc operation such
+ * as alloc/free for that size. Although it is natural that we
+ * have one size_class for each size, there is a chance that we
+ * can get more memory utilization if we use one size_class for
+ * many different sizes whose size_class have same
+ * characteristics. So, we makes size_class point to
+ * previous size_class if possible.
+ */
+ if (prev_class) {
+ if (can_merge(prev_class, size, pages_per_zspage)) {
+ pool->size_class[i] = prev_class;
+ continue;
+ }
+ }
+
+ class = kzalloc(sizeof(struct size_class), GFP_KERNEL);
+ if (!class)
+ goto err;
+
+ class->size = size;
+ class->index = i;
+ class->pages_per_zspage = pages_per_zspage;
+ if (pages_per_zspage == 1 &&
+ get_maxobj_per_zspage(size, pages_per_zspage) == 1)
+ class->huge = true;
+ spin_lock_init(&class->lock);
+ pool->size_class[i] = class;
+
+ prev_class = class;
}
- /* this object spans two pages */
- pages[0] = page;
- pages[1] = get_next_page(page);
- BUG_ON(!pages[1]);
+ pool->flags = flags;
+
+ if (zs_pool_stat_create(name, pool))
+ goto err;
+
+ return pool;
- return __zs_map_object(area, pages, off, class->size);
+err:
+ zs_destroy_pool(pool);
+ return NULL;
}
-EXPORT_SYMBOL_GPL(zs_map_object);
+EXPORT_SYMBOL_GPL(zs_create_pool);
-void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
+void zs_destroy_pool(struct zs_pool *pool)
{
- struct page *page;
- unsigned long obj_idx, off;
+ int i;
- unsigned int class_idx;
- enum fullness_group fg;
- struct size_class *class;
- struct mapping_area *area;
+ zs_pool_stat_destroy(pool);
- BUG_ON(!handle);
+ for (i = 0; i < zs_size_classes; i++) {
+ int fg;
+ struct size_class *class = pool->size_class[i];
- obj_handle_to_location(handle, &page, &obj_idx);
- get_zspage_mapping(get_first_page(page), &class_idx, &fg);
- class = &pool->size_class[class_idx];
- off = obj_idx_to_offset(page, obj_idx, class->size);
+ if (!class)
+ continue;
- area = this_cpu_ptr(&zs_map_area);
- if (off + class->size <= PAGE_SIZE)
- kunmap_atomic(area->vm_addr);
- else {
- struct page *pages[2];
+ if (class->index != i)
+ continue;
- pages[0] = page;
- pages[1] = get_next_page(page);
- BUG_ON(!pages[1]);
+ for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
+ if (class->fullness_list[fg]) {
+ pr_info("Freeing non-empty class with size %db, fullness group %d\n",
+ class->size, fg);
+ }
+ }
+ kfree(class);
+ }
- __zs_unmap_object(area, pages, off, class->size);
+ destroy_handle_cache(pool);
+ kfree(pool->size_class);
+ kfree(pool->name);
+ kfree(pool);
+}
+EXPORT_SYMBOL_GPL(zs_destroy_pool);
+
+static int __init zs_init(void)
+{
+ int ret = zs_register_cpu_notifier();
+
+ if (ret)
+ goto notifier_fail;
+
+ init_zs_size_classes();
+
+#ifdef CONFIG_ZPOOL
+ zpool_register_driver(&zs_zpool_driver);
+#endif
+
+ ret = zs_stat_init();
+ if (ret) {
+ pr_err("zs stat initialization failed\n");
+ goto stat_fail;
}
- put_cpu_var(zs_map_area);
+ return 0;
+
+stat_fail:
+#ifdef CONFIG_ZPOOL
+ zpool_unregister_driver(&zs_zpool_driver);
+#endif
+notifier_fail:
+ zs_unregister_cpu_notifier();
+
+ return ret;
}
-EXPORT_SYMBOL_GPL(zs_unmap_object);
-unsigned long zs_get_total_pages(struct zs_pool *pool)
+static void __exit zs_exit(void)
{
- return atomic_long_read(&pool->pages_allocated);
+#ifdef CONFIG_ZPOOL
+ zpool_unregister_driver(&zs_zpool_driver);
+#endif
+ zs_unregister_cpu_notifier();
+
+ zs_stat_exit();
}
-EXPORT_SYMBOL_GPL(zs_get_total_pages);
module_init(zs_init);
module_exit(zs_exit);
diff --git a/mm/zswap.c b/mm/zswap.c
index ea064c1a09ba..e2f8c7e3c643 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -907,11 +907,12 @@ static int __init init_zswap(void)
pr_info("loading zswap\n");
- zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, &zswap_zpool_ops);
+ zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
+ &zswap_zpool_ops);
if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
pr_info("%s zpool not available\n", zswap_zpool_type);
zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
- zswap_pool = zpool_create_pool(zswap_zpool_type, gfp,
+ zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
&zswap_zpool_ops);
}
if (!zswap_pool) {
diff --git a/net/Kconfig b/net/Kconfig
index 99815b5454bf..15a2d76703bb 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -83,6 +83,20 @@ source "net/netlabel/Kconfig"
endif # if INET
+config ANDROID_PARANOID_NETWORK
+ bool "Only allow certain groups to create sockets"
+ default ANDROID
+ help
+ none
+
+config NET_ACTIVITY_STATS
+ bool "Network activity statistics tracking"
+ default y
+ help
+ Network activity statistics are useful for tracking wireless
+ modem activity on 2G, 3G, 4G wireless networks. Counts number of
+ transmissions and groups them in specified time buckets.
+
config NETWORK_SECMARK
bool "Security Marking"
help
@@ -230,7 +244,7 @@ source "net/mpls/Kconfig"
source "net/hsr/Kconfig"
config RPS
- boolean
+ boolean "RPS"
depends on SMP && SYSFS
default y
diff --git a/net/Makefile b/net/Makefile
index 7ed1970074b0..550c1123047b 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -73,3 +73,4 @@ obj-$(CONFIG_OPENVSWITCH) += openvswitch/
obj-$(CONFIG_VSOCKETS) += vmw_vsock/
obj-$(CONFIG_NET_MPLS_GSO) += mpls/
obj-$(CONFIG_HSR) += hsr/
+obj-$(CONFIG_NET_ACTIVITY_STATS) += activity_stats.o
diff --git a/net/activity_stats.c b/net/activity_stats.c
new file mode 100644
index 000000000000..4609ce2043eb
--- /dev/null
+++ b/net/activity_stats.c
@@ -0,0 +1,119 @@
+/* net/activity_stats.c
+ *
+ * Copyright (C) 2010 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Author: Mike Chan (mike@android.com)
+ */
+
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/suspend.h>
+#include <net/net_namespace.h>
+
+/*
+ * Track transmission rates in buckets (power of 2).
+ * 1,2,4,8...512 seconds.
+ *
+ * Buckets represent the count of network transmissions at least
+ * N seconds apart, where N is 1 << bucket index.
+ */
+#define BUCKET_MAX 10
+
+/* Track network activity frequency */
+static unsigned long activity_stats[BUCKET_MAX];
+static ktime_t last_transmit;
+static ktime_t suspend_time;
+static DEFINE_SPINLOCK(activity_lock);
+
+void activity_stats_update(void)
+{
+ int i;
+ unsigned long flags;
+ ktime_t now;
+ s64 delta;
+
+ spin_lock_irqsave(&activity_lock, flags);
+ now = ktime_get();
+ delta = ktime_to_ns(ktime_sub(now, last_transmit));
+
+ for (i = BUCKET_MAX - 1; i >= 0; i--) {
+ /*
+ * Check if the time delta between network activity is within the
+ * minimum bucket range.
+ */
+ if (delta < (1000000000ULL << i))
+ continue;
+
+ activity_stats[i]++;
+ last_transmit = now;
+ break;
+ }
+ spin_unlock_irqrestore(&activity_lock, flags);
+}
+
+static int activity_stats_show(struct seq_file *m, void *v)
+{
+ int i;
+ int ret;
+
+ seq_printf(m, "Min Bucket(sec) Count\n");
+
+ for (i = 0; i < BUCKET_MAX; i++) {
+ ret = seq_printf(m, "%15d %lu\n", 1 << i, activity_stats[i]);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int activity_stats_notifier(struct notifier_block *nb,
+ unsigned long event, void *dummy)
+{
+ switch (event) {
+ case PM_SUSPEND_PREPARE:
+ suspend_time = ktime_get_real();
+ break;
+
+ case PM_POST_SUSPEND:
+ suspend_time = ktime_sub(ktime_get_real(), suspend_time);
+ last_transmit = ktime_sub(last_transmit, suspend_time);
+ }
+
+ return 0;
+}
+
+static int activity_stats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, activity_stats_show, PDE_DATA(inode));
+}
+
+static const struct file_operations activity_stats_fops = {
+ .open = activity_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static struct notifier_block activity_stats_notifier_block = {
+ .notifier_call = activity_stats_notifier,
+};
+
+static int __init activity_stats_init(void)
+{
+ proc_create("activity", S_IRUGO,
+ init_net.proc_net_stat, &activity_stats_fops);
+ return register_pm_notifier(&activity_stats_notifier_block);
+}
+
+subsys_initcall(activity_stats_init);
+
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 339c74ad4553..6ad34688a20a 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -104,11 +104,40 @@ void bt_sock_unregister(int proto)
}
EXPORT_SYMBOL(bt_sock_unregister);
+#ifdef CONFIG_PARANOID_NETWORK
+static inline int current_has_bt_admin(void)
+{
+ return !current_euid();
+}
+
+static inline int current_has_bt(void)
+{
+ return current_has_bt_admin();
+}
+# else
+static inline int current_has_bt_admin(void)
+{
+ return 1;
+}
+
+static inline int current_has_bt(void)
+{
+ return 1;
+}
+#endif
+
static int bt_sock_create(struct net *net, struct socket *sock, int proto,
int kern)
{
int err;
+ if (proto == BTPROTO_RFCOMM || proto == BTPROTO_SCO ||
+ proto == BTPROTO_L2CAP) {
+ if (!current_has_bt())
+ return -EPERM;
+ } else if (!current_has_bt_admin())
+ return -EPERM;
+
if (net != &init_net)
return -EAFNOSUPPORT;
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index ffd379db5938..b8919d01f0ff 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -44,16 +44,17 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
}
#endif
- u64_stats_update_begin(&brstats->syncp);
- brstats->tx_packets++;
- brstats->tx_bytes += skb->len;
- u64_stats_update_end(&brstats->syncp);
-
BR_INPUT_SKB_CB(skb)->brdev = dev;
skb_reset_mac_header(skb);
skb_pull(skb, ETH_HLEN);
+ u64_stats_update_begin(&brstats->syncp);
+ brstats->tx_packets++;
+ /* Exclude ETH_HLEN from byte stats for consistency with Rx chain */
+ brstats->tx_bytes += skb->len;
+ u64_stats_update_end(&brstats->syncp);
+
if (!br_allowed_ingress(br, br_get_vlan_info(br), skb, &vid))
goto out;
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 99ae718b79be..654c013bb2bc 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -17,6 +17,11 @@
#include <net/sock.h>
#include <net/fib_rules.h>
+static const struct fib_kuid_range fib_kuid_range_unset = {
+ KUIDT_INIT(0),
+ KUIDT_INIT(~0),
+};
+
int fib_default_rule_add(struct fib_rules_ops *ops,
u32 pref, u32 table, u32 flags)
{
@@ -32,6 +37,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
r->table = table;
r->flags = flags;
r->fr_net = hold_net(ops->fro_net);
+ r->uid_range = fib_kuid_range_unset;
r->suppress_prefixlen = -1;
r->suppress_ifgroup = -1;
@@ -182,6 +188,34 @@ void fib_rules_unregister(struct fib_rules_ops *ops)
}
EXPORT_SYMBOL_GPL(fib_rules_unregister);
+static int uid_range_set(struct fib_kuid_range *range)
+{
+ return uid_valid(range->start) && uid_valid(range->end);
+}
+
+static struct fib_kuid_range nla_get_kuid_range(struct nlattr **tb)
+{
+ struct fib_rule_uid_range *in;
+ struct fib_kuid_range out;
+
+ in = (struct fib_rule_uid_range *)nla_data(tb[FRA_UID_RANGE]);
+
+ out.start = make_kuid(current_user_ns(), in->start);
+ out.end = make_kuid(current_user_ns(), in->end);
+
+ return out;
+}
+
+static int nla_put_uid_range(struct sk_buff *skb, struct fib_kuid_range *range)
+{
+ struct fib_rule_uid_range out = {
+ from_kuid_munged(current_user_ns(), range->start),
+ from_kuid_munged(current_user_ns(), range->end)
+ };
+
+ return nla_put(skb, FRA_UID_RANGE, sizeof(out), &out);
+}
+
static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
struct flowi *fl, int flags)
{
@@ -196,6 +230,10 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask)
goto out;
+ if (uid_lt(fl->flowi_uid, rule->uid_range.start) ||
+ uid_gt(fl->flowi_uid, rule->uid_range.end))
+ goto out;
+
ret = ops->match(rule, fl, flags);
out:
return (rule->flags & FIB_RULE_INVERT) ? !ret : ret;
@@ -378,6 +416,21 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
} else if (rule->action == FR_ACT_GOTO)
goto errout_free;
+ if (tb[FRA_UID_RANGE]) {
+ if (current_user_ns() != net->user_ns) {
+ err = -EPERM;
+ goto errout_free;
+ }
+
+ rule->uid_range = nla_get_kuid_range(tb);
+
+ if (!uid_range_set(&rule->uid_range) ||
+ !uid_lte(rule->uid_range.start, rule->uid_range.end))
+ goto errout_free;
+ } else {
+ rule->uid_range = fib_kuid_range_unset;
+ }
+
err = ops->configure(rule, skb, frh, tb);
if (err < 0)
goto errout_free;
@@ -437,6 +490,7 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh)
struct fib_rules_ops *ops = NULL;
struct fib_rule *rule, *tmp;
struct nlattr *tb[FRA_MAX+1];
+ struct fib_kuid_range range;
int err = -EINVAL;
if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
@@ -456,6 +510,14 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh)
if (err < 0)
goto errout;
+ if (tb[FRA_UID_RANGE]) {
+ range = nla_get_kuid_range(tb);
+ if (!uid_range_set(&range))
+ goto errout;
+ } else {
+ range = fib_kuid_range_unset;
+ }
+
list_for_each_entry(rule, &ops->rules_list, list) {
if (frh->action && (frh->action != rule->action))
continue;
@@ -484,6 +546,11 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh)
(rule->mark_mask != nla_get_u32(tb[FRA_FWMASK])))
continue;
+ if (uid_range_set(&range) &&
+ (!uid_eq(rule->uid_range.start, range.start) ||
+ !uid_eq(rule->uid_range.end, range.end)))
+ continue;
+
if (!ops->compare(rule, frh, tb))
continue;
@@ -542,7 +609,8 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
+ nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */
+ nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */
+ nla_total_size(4) /* FRA_FWMARK */
- + nla_total_size(4); /* FRA_FWMASK */
+ + nla_total_size(4) /* FRA_FWMASK */
+ + nla_total_size(sizeof(struct fib_kuid_range));
if (ops->nlmsg_payload)
payload += ops->nlmsg_payload(rule);
@@ -598,7 +666,9 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
((rule->mark_mask || rule->mark) &&
nla_put_u32(skb, FRA_FWMASK, rule->mark_mask)) ||
(rule->target &&
- nla_put_u32(skb, FRA_GOTO, rule->target)))
+ nla_put_u32(skb, FRA_GOTO, rule->target)) ||
+ (uid_range_set(&rule->uid_range) &&
+ nla_put_uid_range(skb, &rule->uid_range)))
goto nla_put_failure;
if (rule->suppress_ifgroup != -1) {
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 59399fad65a2..015f5dd879d6 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -934,6 +934,7 @@ static void neigh_timer_handler(unsigned long arg)
neigh->nud_state = NUD_PROBE;
neigh->updated = jiffies;
atomic_set(&neigh->probes, 0);
+ notify = 1;
next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME);
}
} else {
@@ -1176,6 +1177,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
if (new != old) {
neigh_del_timer(neigh);
+ if (new & NUD_PROBE)
+ atomic_set(&neigh->probes, 0);
if (new & NUD_IN_TIMER)
neigh_add_timer(neigh, (jiffies +
((new & NUD_REACHABLE) ?
diff --git a/net/core/sock.c b/net/core/sock.c
index 8c8fb6ab8a26..774e9141079c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2303,8 +2303,11 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_type = sock->type;
sk->sk_wq = sock->wq;
sock->sk = sk;
- } else
+ sk->sk_uid = SOCK_INODE(sock)->i_uid;
+ } else {
sk->sk_wq = NULL;
+ sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0);
+ }
spin_lock_init(&sk->sk_dst_lock);
rwlock_init(&sk->sk_callback_lock);
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index ad704c757bb4..5bf1aebd9ab0 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -132,7 +132,7 @@ void sock_diag_unregister(const struct sock_diag_handler *hnld)
}
EXPORT_SYMBOL_GPL(sock_diag_unregister);
-static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh)
{
int err;
struct sock_diag_req *req = nlmsg_data(nlh);
@@ -152,8 +152,12 @@ static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
hndl = sock_diag_handlers[req->sdiag_family];
if (hndl == NULL)
err = -ENOENT;
- else
+ else if (nlh->nlmsg_type == SOCK_DIAG_BY_FAMILY)
err = hndl->dump(skb, nlh);
+ else if (nlh->nlmsg_type == SOCK_DESTROY_BACKPORT && hndl->destroy)
+ err = hndl->destroy(skb, nlh);
+ else
+ err = -EOPNOTSUPP;
mutex_unlock(&sock_diag_table_mutex);
return err;
@@ -179,7 +183,8 @@ static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
return ret;
case SOCK_DIAG_BY_FAMILY:
- return __sock_diag_rcv_msg(skb, nlh);
+ case SOCK_DESTROY_BACKPORT:
+ return __sock_diag_cmd(skb, nlh);
default:
return -EINVAL;
}
@@ -194,6 +199,18 @@ static void sock_diag_rcv(struct sk_buff *skb)
mutex_unlock(&sock_diag_mutex);
}
+int sock_diag_destroy(struct sock *sk, int err)
+{
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (!sk->sk_prot->diag_destroy)
+ return -EOPNOTSUPP;
+
+ return sk->sk_prot->diag_destroy(sk, err);
+}
+EXPORT_SYMBOL_GPL(sock_diag_destroy);
+
static int __net_init diag_net_init(struct net *net)
{
struct netlink_kernel_cfg cfg = {
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index ff186dac3e07..4d925dbe4bb7 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -487,6 +487,9 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
newsk->sk_backlog_rcv = dccp_v4_do_rcv;
newnp->pktoptions = NULL;
newnp->opt = NULL;
+ newnp->ipv6_mc_list = NULL;
+ newnp->ipv6_ac_list = NULL;
+ newnp->ipv6_fl_list = NULL;
newnp->mcast_oif = inet6_iif(skb);
newnp->mcast_hops = ipv6_hdr(skb)->hop_limit;
@@ -562,6 +565,10 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
/* Clone RX bits */
newnp->rxopt.all = np->rxopt.all;
+ newnp->ipv6_mc_list = NULL;
+ newnp->ipv6_ac_list = NULL;
+ newnp->ipv6_fl_list = NULL;
+
/* Clone pktoptions received with SYN */
newnp->pktoptions = NULL;
if (ireq->pktopts != NULL) {
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index e682b48e0709..022e599cfff9 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -441,6 +441,19 @@ config INET_UDP_DIAG
Support for UDP socket monitoring interface used by the ss tool.
If unsure, say Y.
+config INET_DIAG_DESTROY
+ bool "INET: allow privileged process to administratively close sockets"
+ depends on INET_DIAG
+ default n
+ ---help---
+ Provides a SOCK_DESTROY operation that allows privileged processes
+ (e.g., a connection manager or a network administration tool such as
+ ss) to close sockets opened by other processes. Closing a socket in
+ this way interrupts any blocking read/write/connect operations on
+ the socket and causes future socket calls to behave as if the socket
+ had been disconnected.
+ If unsure, say N.
+
menuconfig TCP_CONG_ADVANCED
bool "TCP: advanced congestion control"
---help---
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 518c04ed666e..591745df6d51 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -15,6 +15,7 @@ obj-y := route.o inetpeer.o protocol.o \
obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
+obj-$(CONFIG_SYSFS) += sysfs_net_ipv4.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
obj-$(CONFIG_IP_MROUTE) += ipmr.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index d2f8184fc8f4..aad6ea00142d 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -89,6 +89,7 @@
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/slab.h>
+#include <linux/netfilter/xt_qtaguid.h>
#include <asm/uaccess.h>
@@ -119,6 +120,19 @@
#include <linux/mroute.h>
#endif
+#ifdef CONFIG_ANDROID_PARANOID_NETWORK
+#include <linux/android_aid.h>
+
+static inline int current_has_network(void)
+{
+ return in_egroup_p(AID_INET) || capable(CAP_NET_RAW);
+}
+#else
+static inline int current_has_network(void)
+{
+ return 1;
+}
+#endif
/* The inetsw table contains everything that inet_create needs to
* build a new socket.
@@ -259,6 +273,9 @@ static int inet_create(struct net *net, struct socket *sock, int protocol,
int try_loading_module = 0;
int err;
+ if (!current_has_network())
+ return -EACCES;
+
if (protocol < 0 || protocol >= IPPROTO_MAX)
return -EINVAL;
@@ -310,8 +327,7 @@ lookup_protocol:
}
err = -EPERM;
- if (sock->type == SOCK_RAW && !kern &&
- !ns_capable(net->user_ns, CAP_NET_RAW))
+ if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
goto out_rcu_unlock;
sock->ops = answer->ops;
@@ -400,6 +416,9 @@ int inet_release(struct socket *sock)
if (sk) {
long timeout;
+#ifdef CONFIG_NETFILTER_XT_MATCH_QTAGUID
+ qtaguid_untag(sock, true);
+#endif
sock_rps_reset_flow(sk);
/* Applications forget to leave groups before exiting */
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 1c77a94c0cbf..97548fc15d0c 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -533,6 +533,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
[RTA_METRICS] = { .type = NLA_NESTED },
[RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
[RTA_FLOW] = { .type = NLA_U32 },
+ [RTA_UID] = { .type = NLA_U32 },
};
static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 2d6f89070373..7775a4760458 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -423,6 +423,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
fl4.daddr = daddr;
fl4.saddr = saddr;
fl4.flowi4_mark = mark;
+ fl4.flowi4_uid = sock_net_uid(net, NULL);
fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
fl4.flowi4_proto = IPPROTO_ICMP;
security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
@@ -454,6 +455,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
param->replyopts.opt.opt.faddr : iph->saddr);
fl4->saddr = saddr;
fl4->flowi4_mark = mark;
+ fl4->flowi4_uid = sock_net_uid(net, NULL);
fl4->flowi4_tos = RT_TOS(tos);
fl4->flowi4_proto = IPPROTO_ICMP;
fl4->fl4_icmp_type = type;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 5e2c4d535b26..aff19e67ffb0 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -410,7 +410,8 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
sk->sk_protocol,
flags,
(opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
- ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport);
+ ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport,
+ sk->sk_uid);
security_req_classify_flow(req, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
@@ -446,7 +447,8 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
sk->sk_protocol, inet_sk_flowi_flags(sk),
(opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
- ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport);
+ ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport,
+ sk->sk_uid);
security_req_classify_flow(req, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 4eeba4e497a0..bdbc16411caf 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -48,6 +48,8 @@ struct inet_diag_entry {
struct in6_addr saddr_storage; /* for IPv4-mapped-IPv6 addresses */
struct in6_addr daddr_storage; /* for IPv4-mapped-IPv6 addresses */
#endif
+ u32 ifindex;
+ u32 mark;
};
static DEFINE_MUTEX(inet_diag_table_mutex);
@@ -89,7 +91,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
struct sk_buff *skb, struct inet_diag_req_v2 *req,
struct user_namespace *user_ns,
u32 portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh)
+ const struct nlmsghdr *unlh, bool net_admin)
{
const struct inet_sock *inet = inet_sk(sk);
struct inet_diag_msg *r;
@@ -150,6 +152,9 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
}
#endif
+ if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, sk->sk_mark))
+ goto errout;
+
r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
r->idiag_inode = sock_i_ino(sk);
@@ -229,10 +234,12 @@ static int inet_csk_diag_fill(struct sock *sk,
struct sk_buff *skb, struct inet_diag_req_v2 *req,
struct user_namespace *user_ns,
u32 portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh)
+ const struct nlmsghdr *unlh,
+ bool net_admin)
{
return inet_sk_diag_fill(sk, inet_csk(sk),
- skb, req, user_ns, portid, seq, nlmsg_flags, unlh);
+ skb, req, user_ns, portid, seq, nlmsg_flags, unlh,
+ net_admin);
}
static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
@@ -292,25 +299,22 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
struct inet_diag_req_v2 *r,
struct user_namespace *user_ns,
u32 portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh)
+ const struct nlmsghdr *unlh, bool net_admin)
{
if (sk->sk_state == TCP_TIME_WAIT)
return inet_twsk_diag_fill(inet_twsk(sk), skb, r, portid, seq,
nlmsg_flags, unlh);
return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq,
- nlmsg_flags, unlh);
+ nlmsg_flags, unlh, net_admin);
}
-int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_skb,
- const struct nlmsghdr *nlh, struct inet_diag_req_v2 *req)
+struct sock *inet_diag_find_one_icsk(struct net *net,
+ struct inet_hashinfo *hashinfo,
+ struct inet_diag_req_v2 *req)
{
- int err;
struct sock *sk;
- struct sk_buff *rep;
- struct net *net = sock_net(in_skb->sk);
- err = -EINVAL;
if (req->sdiag_family == AF_INET) {
sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0],
req->id.idiag_dport, req->id.idiag_src[0],
@@ -318,25 +322,49 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s
}
#if IS_ENABLED(CONFIG_IPV6)
else if (req->sdiag_family == AF_INET6) {
- sk = inet6_lookup(net, hashinfo,
- (struct in6_addr *)req->id.idiag_dst,
- req->id.idiag_dport,
- (struct in6_addr *)req->id.idiag_src,
- req->id.idiag_sport,
- req->id.idiag_if);
+ if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) &&
+ ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src))
+ sk = inet_lookup(net, hashinfo, req->id.idiag_dst[3],
+ req->id.idiag_dport, req->id.idiag_src[3],
+ req->id.idiag_sport, req->id.idiag_if);
+ else
+ sk = inet6_lookup(net, hashinfo,
+ (struct in6_addr *)req->id.idiag_dst,
+ req->id.idiag_dport,
+ (struct in6_addr *)req->id.idiag_src,
+ req->id.idiag_sport,
+ req->id.idiag_if);
}
#endif
else {
- goto out_nosk;
+ return ERR_PTR(-EINVAL);
}
- err = -ENOENT;
- if (sk == NULL)
- goto out_nosk;
+ if (!sk)
+ return ERR_PTR(-ENOENT);
- err = sock_diag_check_cookie(sk, req->id.idiag_cookie);
- if (err)
- goto out;
+ if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) {
+ sock_gen_put(sk);
+ return ERR_PTR(-ENOENT);
+ }
+
+ return sk;
+}
+EXPORT_SYMBOL_GPL(inet_diag_find_one_icsk);
+
+int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo,
+ struct sk_buff *in_skb,
+ const struct nlmsghdr *nlh,
+ struct inet_diag_req_v2 *req)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct sk_buff *rep;
+ struct sock *sk;
+ int err;
+
+ sk = inet_diag_find_one_icsk(net, hashinfo, req);
+ if (IS_ERR(sk))
+ return PTR_ERR(sk);
rep = nlmsg_new(inet_sk_attr_size(), GFP_KERNEL);
if (!rep) {
@@ -347,7 +375,8 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s
err = sk_diag_fill(sk, rep, req,
sk_user_ns(NETLINK_CB(in_skb).sk),
NETLINK_CB(in_skb).portid,
- nlh->nlmsg_seq, 0, nlh);
+ nlh->nlmsg_seq, 0, nlh,
+ netlink_net_capable(in_skb, CAP_NET_ADMIN));
if (err < 0) {
WARN_ON(err == -EMSGSIZE);
nlmsg_free(rep);
@@ -362,12 +391,11 @@ out:
if (sk)
sock_gen_put(sk);
-out_nosk:
return err;
}
EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk);
-static int inet_diag_get_exact(struct sk_buff *in_skb,
+static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb,
const struct nlmsghdr *nlh,
struct inet_diag_req_v2 *req)
{
@@ -377,8 +405,12 @@ static int inet_diag_get_exact(struct sk_buff *in_skb,
handler = inet_diag_lock_handler(req->sdiag_protocol);
if (IS_ERR(handler))
err = PTR_ERR(handler);
- else
+ else if (cmd == SOCK_DIAG_BY_FAMILY)
err = handler->dump_one(in_skb, nlh, req);
+ else if (cmd == SOCK_DESTROY_BACKPORT && handler->destroy)
+ err = handler->destroy(in_skb, req);
+ else
+ err = -EOPNOTSUPP;
inet_diag_unlock_handler(handler);
return err;
@@ -483,6 +515,22 @@ static int inet_diag_bc_run(const struct nlattr *_bc,
yes = 0;
break;
}
+ case INET_DIAG_BC_DEV_COND: {
+ u32 ifindex;
+
+ ifindex = *((const u32 *)(op + 1));
+ if (ifindex != entry->ifindex)
+ yes = 0;
+ break;
+ }
+ case INET_DIAG_BC_MARK_COND: {
+ struct inet_diag_markcond *cond;
+
+ cond = (struct inet_diag_markcond *)(op + 1);
+ if ((entry->mark & cond->mask) != cond->mark)
+ yes = 0;
+ break;
+ }
}
if (yes) {
@@ -519,6 +567,8 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk)
entry.sport = inet->inet_num;
entry.dport = ntohs(inet->inet_dport);
entry.userlocks = sk->sk_userlocks;
+ entry.ifindex = sk->sk_bound_dev_if;
+ entry.mark = sk->sk_mark;
return inet_diag_bc_run(bc, &entry);
}
@@ -541,6 +591,17 @@ static int valid_cc(const void *bc, int len, int cc)
return 0;
}
+/* data is u32 ifindex */
+static bool valid_devcond(const struct inet_diag_bc_op *op, int len,
+ int *min_len)
+{
+ /* Check ifindex space. */
+ *min_len += sizeof(u32);
+ if (len < *min_len)
+ return false;
+
+ return true;
+}
/* Validate an inet_diag_hostcond. */
static bool valid_hostcond(const struct inet_diag_bc_op *op, int len,
int *min_len)
@@ -590,10 +651,25 @@ static inline bool valid_port_comparison(const struct inet_diag_bc_op *op,
return true;
}
-static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
+static bool valid_markcond(const struct inet_diag_bc_op *op, int len,
+ int *min_len)
+{
+ *min_len += sizeof(struct inet_diag_markcond);
+ return len >= *min_len;
+}
+
+static int inet_diag_bc_audit(const struct nlattr *attr,
+ const struct sk_buff *skb)
{
- const void *bc = bytecode;
- int len = bytecode_len;
+ bool net_admin = netlink_net_capable(skb, CAP_NET_ADMIN);
+ const void *bytecode, *bc;
+ int bytecode_len, len;
+
+ if (!attr || nla_len(attr) < sizeof(struct inet_diag_bc_op))
+ return -EINVAL;
+
+ bytecode = bc = nla_data(attr);
+ len = bytecode_len = nla_len(attr);
while (len > 0) {
const struct inet_diag_bc_op *op = bc;
@@ -606,6 +682,10 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
if (!valid_hostcond(bc, len, &min_len))
return -EINVAL;
break;
+ case INET_DIAG_BC_DEV_COND:
+ if (!valid_devcond(bc, len, &min_len))
+ return -EINVAL;
+ break;
case INET_DIAG_BC_S_GE:
case INET_DIAG_BC_S_LE:
case INET_DIAG_BC_D_GE:
@@ -613,6 +693,12 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
if (!valid_port_comparison(bc, len, &min_len))
return -EINVAL;
break;
+ case INET_DIAG_BC_MARK_COND:
+ if (!net_admin)
+ return -EPERM;
+ if (!valid_markcond(bc, len, &min_len))
+ return -EINVAL;
+ break;
case INET_DIAG_BC_AUTO:
case INET_DIAG_BC_JMP:
case INET_DIAG_BC_NOP:
@@ -641,7 +727,8 @@ static int inet_csk_diag_dump(struct sock *sk,
struct sk_buff *skb,
struct netlink_callback *cb,
struct inet_diag_req_v2 *r,
- const struct nlattr *bc)
+ const struct nlattr *bc,
+ bool net_admin)
{
if (!inet_diag_bc_sk(bc, sk))
return 0;
@@ -649,7 +736,8 @@ static int inet_csk_diag_dump(struct sock *sk,
return inet_csk_diag_fill(sk, skb, r,
sk_user_ns(NETLINK_CB(cb->skb).sk),
NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
+ cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh,
+ net_admin);
}
static int inet_twsk_diag_dump(struct sock *sk,
@@ -677,6 +765,7 @@ static int inet_twsk_diag_dump(struct sock *sk,
entry.sport = tw->tw_num;
entry.dport = ntohs(tw->tw_dport);
entry.userlocks = 0;
+ entry.mark = 0;
if (!inet_diag_bc_run(bc, &entry))
return 0;
@@ -803,6 +892,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
if (bc != NULL) {
entry.sport = inet->inet_num;
entry.userlocks = sk->sk_userlocks;
+ entry.mark = sk->sk_mark;
}
for (j = s_j; j < lopt->nr_table_entries; j++) {
@@ -852,6 +942,7 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
int i, num;
int s_i, s_num;
struct net *net = sock_net(skb->sk);
+ bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
s_i = cb->args[1];
s_num = num = cb->args[2];
@@ -892,7 +983,8 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
cb->args[3] > 0)
goto syn_recv;
- if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) {
+ if (inet_csk_diag_dump(sk, skb, cb, r,
+ bc, net_admin) < 0) {
spin_unlock_bh(&ilb->lock);
goto done;
}
@@ -964,7 +1056,8 @@ skip_listen_ht:
if (sk->sk_state == TCP_TIME_WAIT)
res = inet_twsk_diag_dump(sk, skb, cb, r, bc);
else
- res = inet_csk_diag_dump(sk, skb, cb, r, bc);
+ res = inet_csk_diag_dump(sk, skb, cb, r, bc,
+ net_admin);
if (res < 0) {
spin_unlock_bh(lock);
goto done;
@@ -1054,7 +1147,7 @@ static int inet_diag_get_exact_compat(struct sk_buff *in_skb,
req.idiag_states = rc->idiag_states;
req.id = rc->id;
- return inet_diag_get_exact(in_skb, nlh, &req);
+ return inet_diag_cmd_exact(SOCK_DIAG_BY_FAMILY, in_skb, nlh, &req);
}
static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
@@ -1069,13 +1162,13 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
if (nlh->nlmsg_flags & NLM_F_DUMP) {
if (nlmsg_attrlen(nlh, hdrlen)) {
struct nlattr *attr;
+ int err;
attr = nlmsg_find_attr(nlh, hdrlen,
INET_DIAG_REQ_BYTECODE);
- if (attr == NULL ||
- nla_len(attr) < sizeof(struct inet_diag_bc_op) ||
- inet_diag_bc_audit(nla_data(attr), nla_len(attr)))
- return -EINVAL;
+ err = inet_diag_bc_audit(attr, skb);
+ if (err)
+ return err;
}
{
struct netlink_dump_control c = {
@@ -1088,7 +1181,7 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
return inet_diag_get_exact_compat(skb, nlh);
}
-static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
+static int inet_diag_handler_cmd(struct sk_buff *skb, struct nlmsghdr *h)
{
int hdrlen = sizeof(struct inet_diag_req_v2);
struct net *net = sock_net(skb->sk);
@@ -1096,15 +1189,17 @@ static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
if (nlmsg_len(h) < hdrlen)
return -EINVAL;
- if (h->nlmsg_flags & NLM_F_DUMP) {
+ if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY &&
+ h->nlmsg_flags & NLM_F_DUMP) {
if (nlmsg_attrlen(h, hdrlen)) {
struct nlattr *attr;
+ int err;
+
attr = nlmsg_find_attr(h, hdrlen,
INET_DIAG_REQ_BYTECODE);
- if (attr == NULL ||
- nla_len(attr) < sizeof(struct inet_diag_bc_op) ||
- inet_diag_bc_audit(nla_data(attr), nla_len(attr)))
- return -EINVAL;
+ err = inet_diag_bc_audit(attr, skb);
+ if (err)
+ return err;
}
{
struct netlink_dump_control c = {
@@ -1114,17 +1209,19 @@ static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
}
}
- return inet_diag_get_exact(skb, h, nlmsg_data(h));
+ return inet_diag_cmd_exact(h->nlmsg_type, skb, h, nlmsg_data(h));
}
static const struct sock_diag_handler inet_diag_handler = {
.family = AF_INET,
- .dump = inet_diag_handler_dump,
+ .dump = inet_diag_handler_cmd,
+ .destroy = inet_diag_handler_cmd,
};
static const struct sock_diag_handler inet6_diag_handler = {
.family = AF_INET6,
- .dump = inet_diag_handler_dump,
+ .dump = inet_diag_handler_cmd,
+ .destroy = inet_diag_handler_cmd,
};
int inet_diag_register(const struct inet_diag_handler *h)
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 357c2a914e77..edf54b04448c 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1544,7 +1544,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
ip_reply_arg_flowi_flags(arg),
daddr, saddr,
- tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
+ tcp_hdr(skb)->source, tcp_hdr(skb)->dest,
+ arg->uid);
security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
rt = ip_route_output_key(net, &fl4);
if (IS_ERR(rt))
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 4c019d5c3f57..a6847b99586a 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -182,7 +182,8 @@ config IP_NF_MATCH_ECN
config IP_NF_MATCH_RPFILTER
tristate '"rpfilter" reverse path filter match support'
- depends on NETFILTER_ADVANCED && (IP_NF_MANGLE || IP_NF_RAW)
+ depends on NETFILTER_ADVANCED
+ depends on IP_NF_MANGLE || IP_NF_RAW
---help---
This option allows you to match packets whose replies would
go out via the interface the packet came in.
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 148678dd045c..8c37b1fc536a 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -662,6 +662,10 @@ int ping_common_sendmsg(int family, struct msghdr *msg, size_t len,
if (len > 0xFFFF)
return -EMSGSIZE;
+ /* Must have at least a full ICMP header. */
+ if (len < icmph_len)
+ return -EINVAL;
+
/*
* Check the flags.
*/
@@ -792,7 +796,8 @@ static int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m
flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
RT_SCOPE_UNIVERSE, sk->sk_protocol,
- inet_sk_flowi_flags(sk), faddr, saddr, 0, 0);
+ inet_sk_flowi_flags(sk), faddr, saddr, 0, 0,
+ sk->sk_uid);
security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
rt = ip_route_output_flow(net, &fl4, sk);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 2ec944318859..d9ce4e3583e7 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -585,7 +585,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
inet_sk_flowi_flags(sk) |
(inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
- daddr, saddr, 0, 0);
+ daddr, saddr, 0, 0, sk->sk_uid);
if (!inet->hdrincl) {
err = raw_probe_proto_opt(&fl4, msg);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 2fe9459fbe24..38b439a989b6 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -499,7 +499,8 @@ void __ip_select_ident(struct iphdr *iph, int segs)
}
EXPORT_SYMBOL(__ip_select_ident);
-static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
+static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
+ const struct sock *sk,
const struct iphdr *iph,
int oif, u8 tos,
u8 prot, u32 mark, int flow_flags)
@@ -515,19 +516,21 @@ static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
flowi4_init_output(fl4, oif, mark, tos,
RT_SCOPE_UNIVERSE, prot,
flow_flags,
- iph->daddr, iph->saddr, 0, 0);
+ iph->daddr, iph->saddr, 0, 0,
+ sock_net_uid(net, sk));
}
static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
const struct sock *sk)
{
+ const struct net *net = dev_net(skb->dev);
const struct iphdr *iph = ip_hdr(skb);
int oif = skb->dev->ifindex;
u8 tos = RT_TOS(iph->tos);
u8 prot = iph->protocol;
u32 mark = skb->mark;
- __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
+ __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}
static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
@@ -544,7 +547,7 @@ static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
inet_sk_flowi_flags(sk),
- daddr, inet->inet_saddr, 0, 0);
+ daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
rcu_read_unlock();
}
@@ -785,6 +788,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
struct rtable *rt;
struct flowi4 fl4;
const struct iphdr *iph = (const struct iphdr *) skb->data;
+ struct net *net = dev_net(skb->dev);
int oif = skb->dev->ifindex;
u8 tos = RT_TOS(iph->tos);
u8 prot = iph->protocol;
@@ -792,7 +796,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
rt = (struct rtable *) dst;
- __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
+ __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
__ip_do_redirect(rt, skb, &fl4, true);
}
@@ -1007,7 +1011,7 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
if (!mark)
mark = IP4_REPLY_MARK(net, skb->mark);
- __build_flow_key(&fl4, NULL, iph, oif,
+ __build_flow_key(net, &fl4, NULL, iph, oif,
RT_TOS(iph->tos), protocol, mark, flow_flags);
rt = __ip_route_output_key(net, &fl4);
if (!IS_ERR(rt)) {
@@ -1023,7 +1027,7 @@ static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
struct flowi4 fl4;
struct rtable *rt;
- __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
+ __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
if (!fl4.flowi4_mark)
fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
@@ -1042,6 +1046,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
struct rtable *rt;
struct dst_entry *odst = NULL;
bool new = false;
+ struct net *net = sock_net(sk);
bh_lock_sock(sk);
@@ -1055,7 +1060,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
goto out;
}
- __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
+ __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
rt = (struct rtable *)odst;
if (odst->obsolete && odst->ops->check(odst, 0) == NULL) {
@@ -1095,7 +1100,7 @@ void ipv4_redirect(struct sk_buff *skb, struct net *net,
struct flowi4 fl4;
struct rtable *rt;
- __build_flow_key(&fl4, NULL, iph, oif,
+ __build_flow_key(net, &fl4, NULL, iph, oif,
RT_TOS(iph->tos), protocol, mark, flow_flags);
rt = __ip_route_output_key(net, &fl4);
if (!IS_ERR(rt)) {
@@ -1110,9 +1115,10 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
const struct iphdr *iph = (const struct iphdr *) skb->data;
struct flowi4 fl4;
struct rtable *rt;
+ struct net *net = sock_net(sk);
- __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
- rt = __ip_route_output_key(sock_net(sk), &fl4);
+ __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
+ rt = __ip_route_output_key(net, &fl4);
if (!IS_ERR(rt)) {
__ip_do_redirect(rt, skb, &fl4, false);
ip_rt_put(rt);
@@ -2366,6 +2372,11 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
goto nla_put_failure;
+ if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
+ nla_put_u32(skb, RTA_UID,
+ from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
+ goto nla_put_failure;
+
error = rt->dst.error;
if (rt_is_input_route(rt)) {
@@ -2416,6 +2427,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
int err;
int mark;
struct sk_buff *skb;
+ kuid_t uid;
err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
if (err < 0)
@@ -2443,6 +2455,10 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
+ if (tb[RTA_UID])
+ uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
+ else
+ uid = (iif ? INVALID_UID : current_uid());
memset(&fl4, 0, sizeof(fl4));
fl4.daddr = dst;
@@ -2450,6 +2466,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
fl4.flowi4_tos = rtm->rtm_tos;
fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
fl4.flowi4_mark = mark;
+ fl4.flowi4_uid = uid;
if (iif) {
struct net_device *dev;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 32b98d0207b4..8940e739c1a1 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -337,7 +337,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP,
inet_sk_flowi_flags(sk),
opt->srr ? opt->faddr : ireq->ir_rmt_addr,
- ireq->ir_loc_addr, th->source, th->dest);
+ ireq->ir_loc_addr, th->source, th->dest, sk->sk_uid);
security_req_classify_flow(req, flowi4_to_flowi(&fl4));
rt = ip_route_output_key(sock_net(sk), &fl4);
if (IS_ERR(rt)) {
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 3d61bdbae920..4b7b86ba0661 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -145,6 +145,21 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write,
return ret;
}
+/* Validate changes from /proc interface. */
+static int proc_tcp_default_init_rwnd(struct ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int old_value = *(int *)ctl->data;
+ int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+ int new_value = *(int *)ctl->data;
+
+ if (write && ret == 0 && (new_value < 3 || new_value > 100))
+ *(int *)ctl->data = old_value;
+
+ return ret;
+}
+
static int proc_tcp_congestion_control(struct ctl_table *ctl, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -722,6 +737,13 @@ static struct ctl_table ipv4_table[] = {
.extra2 = &one,
},
{
+ .procname = "tcp_default_init_rwnd",
+ .data = &sysctl_tcp_default_init_rwnd,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_tcp_default_init_rwnd
+ },
+ {
.procname = "icmp_msgs_per_sec",
.data = &sysctl_icmp_msgs_per_sec,
.maxlen = sizeof(int),
diff --git a/net/ipv4/sysfs_net_ipv4.c b/net/ipv4/sysfs_net_ipv4.c
new file mode 100644
index 000000000000..0cbbf10026a6
--- /dev/null
+++ b/net/ipv4/sysfs_net_ipv4.c
@@ -0,0 +1,88 @@
+/*
+ * net/ipv4/sysfs_net_ipv4.c
+ *
+ * sysfs-based networking knobs (so we can, unlike with sysctl, control perms)
+ *
+ * Copyright (C) 2008 Google, Inc.
+ *
+ * Robert Love <rlove@google.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kobject.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
+#include <linux/init.h>
+#include <net/tcp.h>
+
+#define CREATE_IPV4_FILE(_name, _var) \
+static ssize_t _name##_show(struct kobject *kobj, \
+ struct kobj_attribute *attr, char *buf) \
+{ \
+ return sprintf(buf, "%d\n", _var); \
+} \
+static ssize_t _name##_store(struct kobject *kobj, \
+ struct kobj_attribute *attr, \
+ const char *buf, size_t count) \
+{ \
+ int val, ret; \
+ ret = sscanf(buf, "%d", &val); \
+ if (ret != 1) \
+ return -EINVAL; \
+ if (val < 0) \
+ return -EINVAL; \
+ _var = val; \
+ return count; \
+} \
+static struct kobj_attribute _name##_attr = \
+ __ATTR(_name, 0644, _name##_show, _name##_store)
+
+CREATE_IPV4_FILE(tcp_wmem_min, sysctl_tcp_wmem[0]);
+CREATE_IPV4_FILE(tcp_wmem_def, sysctl_tcp_wmem[1]);
+CREATE_IPV4_FILE(tcp_wmem_max, sysctl_tcp_wmem[2]);
+
+CREATE_IPV4_FILE(tcp_rmem_min, sysctl_tcp_rmem[0]);
+CREATE_IPV4_FILE(tcp_rmem_def, sysctl_tcp_rmem[1]);
+CREATE_IPV4_FILE(tcp_rmem_max, sysctl_tcp_rmem[2]);
+
+static struct attribute *ipv4_attrs[] = {
+ &tcp_wmem_min_attr.attr,
+ &tcp_wmem_def_attr.attr,
+ &tcp_wmem_max_attr.attr,
+ &tcp_rmem_min_attr.attr,
+ &tcp_rmem_def_attr.attr,
+ &tcp_rmem_max_attr.attr,
+ NULL
+};
+
+static struct attribute_group ipv4_attr_group = {
+ .attrs = ipv4_attrs,
+};
+
+static __init int sysfs_ipv4_init(void)
+{
+ struct kobject *ipv4_kobject;
+ int ret;
+
+ ipv4_kobject = kobject_create_and_add("ipv4", kernel_kobj);
+ if (!ipv4_kobject)
+ return -ENOMEM;
+
+ ret = sysfs_create_group(ipv4_kobject, &ipv4_attr_group);
+ if (ret) {
+ kobject_put(ipv4_kobject);
+ return ret;
+ }
+
+ return 0;
+}
+
+subsys_initcall(sysfs_ipv4_init);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 458a4c046784..bd42fc96e66d 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -268,6 +268,7 @@
#include <linux/crypto.h>
#include <linux/time.h>
#include <linux/slab.h>
+#include <linux/uid_stat.h>
#include <net/icmp.h>
#include <net/inet_common.h>
@@ -1307,6 +1308,10 @@ out:
tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
out_nopush:
release_sock(sk);
+
+ if (copied + copied_syn)
+ uid_stat_tcp_snd(from_kuid(&init_user_ns, current_uid()),
+ copied + copied_syn);
return copied + copied_syn;
do_fault:
@@ -1578,6 +1583,8 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
if (copied > 0) {
tcp_recv_skb(sk, seq, &offset);
tcp_cleanup_rbuf(sk, copied);
+ uid_stat_tcp_rcv(from_kuid(&init_user_ns, current_uid()),
+ copied);
}
return copied;
}
@@ -1907,6 +1914,10 @@ skip_copy:
tcp_cleanup_rbuf(sk, copied);
release_sock(sk);
+
+ if (copied > 0)
+ uid_stat_tcp_rcv(from_kuid(&init_user_ns, current_uid()),
+ copied);
return copied;
out:
@@ -1915,6 +1926,9 @@ out:
recv_urg:
err = tcp_recv_urg(sk, msg, len, flags);
+ if (err > 0)
+ uid_stat_tcp_rcv(from_kuid(&init_user_ns, current_uid()),
+ err);
goto out;
recv_sndq:
@@ -3032,6 +3046,43 @@ void tcp_done(struct sock *sk)
}
EXPORT_SYMBOL_GPL(tcp_done);
+int tcp_abort(struct sock *sk, int err)
+{
+ if (!sk_fullsock(sk)) {
+ sock_gen_put(sk);
+ return -EOPNOTSUPP;
+ }
+
+ /* Don't race with userspace socket closes such as tcp_close. */
+ lock_sock(sk);
+
+ if (sk->sk_state == TCP_LISTEN) {
+ tcp_set_state(sk, TCP_CLOSE);
+ inet_csk_listen_stop(sk);
+ }
+
+ /* Don't race with BH socket closes such as inet_csk_listen_stop. */
+ local_bh_disable();
+ bh_lock_sock(sk);
+
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ sk->sk_err = err;
+ /* This barrier is coupled with smp_rmb() in tcp_poll() */
+ smp_wmb();
+ sk->sk_error_report(sk);
+ if (tcp_need_reset(sk->sk_state))
+ tcp_send_active_reset(sk, GFP_ATOMIC);
+ tcp_done(sk);
+ }
+
+ bh_unlock_sock(sk);
+ local_bh_enable();
+ release_sock(sk);
+ sock_put(sk);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(tcp_abort);
+
extern struct tcp_congestion_ops tcp_reno;
static __initdata unsigned long thash_entries;
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 0d73f9ddb55b..1d745eac7b8f 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -10,6 +10,8 @@
*/
#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/sock_diag.h>
#include <linux/inet_diag.h>
#include <linux/tcp.h>
@@ -45,11 +47,28 @@ static int tcp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
return inet_diag_dump_one_icsk(&tcp_hashinfo, in_skb, nlh, req);
}
+#ifdef CONFIG_INET_DIAG_DESTROY
+static int tcp_diag_destroy(struct sk_buff *in_skb,
+ struct inet_diag_req_v2 *req)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct sock *sk = inet_diag_find_one_icsk(net, &tcp_hashinfo, req);
+
+ if (IS_ERR(sk))
+ return PTR_ERR(sk);
+
+ return sock_diag_destroy(sk, ECONNABORTED);
+}
+#endif
+
static const struct inet_diag_handler tcp_diag_handler = {
.dump = tcp_diag_dump,
.dump_one = tcp_diag_dump_one,
.idiag_get_info = tcp_diag_get_info,
.idiag_type = IPPROTO_TCP,
+#ifdef CONFIG_INET_DIAG_DESTROY
+ .destroy = tcp_diag_destroy,
+#endif
};
static int __init tcp_diag_init(void)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c0e026ae388e..d8057c6441b4 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -99,6 +99,7 @@ int sysctl_tcp_thin_dupack __read_mostly;
int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
int sysctl_tcp_early_retrans __read_mostly = 3;
+int sysctl_tcp_default_init_rwnd __read_mostly = TCP_INIT_CWND * 2;
#define FLAG_DATA 0x01 /* Incoming frame contained data. */
#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
@@ -3324,7 +3325,8 @@ static void tcp_send_challenge_ack(struct sock *sk)
/* unprotected vars, we dont care of overwrites */
static u32 challenge_timestamp;
static unsigned int challenge_count;
- u32 count, now = jiffies / HZ;
+ u32 now = jiffies / HZ;
+ u32 count;
if (now != challenge_timestamp) {
u32 half = (sysctl_tcp_challenge_ack_limit + 1) >> 1;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d3d62be2376b..c2db8a9803cd 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -687,6 +687,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
arg.bound_dev_if = sk->sk_bound_dev_if;
arg.tos = ip_hdr(skb)->tos;
+ arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
@@ -708,7 +709,8 @@ release_sk1:
outside socket context is ugly, certainly. What can I do?
*/
-static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
+static void tcp_v4_send_ack(const struct sock *sk, struct sk_buff *skb,
+ u32 seq, u32 ack,
u32 win, u32 tsval, u32 tsecr, int oif,
struct tcp_md5sig_key *key,
int reply_flags, u8 tos)
@@ -723,7 +725,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
];
} rep;
struct ip_reply_arg arg;
- struct net *net = dev_net(skb_dst(skb)->dev);
+ struct net *net = sock_net(sk);
memset(&rep.th, 0, sizeof(struct tcphdr));
memset(&arg, 0, sizeof(arg));
@@ -772,6 +774,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
if (oif)
arg.bound_dev_if = oif;
arg.tos = tos;
+ arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
@@ -785,7 +788,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
struct inet_timewait_sock *tw = inet_twsk(sk);
struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
- tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
+ tcp_v4_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
tcp_time_stamp + tcptw->tw_ts_offset,
tcptw->tw_ts_recent,
@@ -804,7 +807,7 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
* sk->sk_state == TCP_SYN_RECV -> for Fast Open.
*/
- tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
+ tcp_v4_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ?
tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
tcp_time_stamp,
@@ -2429,6 +2432,7 @@ struct proto tcp_prot = {
.destroy_cgroup = tcp_destroy_cgroup,
.proto_cgroup = tcp_proto_cgroup,
#endif
+ .diag_destroy = tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 5853c5a2264e..d1d509baa257 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -197,7 +197,7 @@ u32 tcp_default_init_rwnd(u32 mss)
* (RFC 3517, Section 4, NextSeg() rule (2)). Further place a
* limit when mss is larger than 1460.
*/
- u32 init_rwnd = TCP_INIT_CWND * 2;
+ u32 init_rwnd = sysctl_tcp_default_init_rwnd;
if (mss > 1460)
init_rwnd = max((1460 * init_rwnd) / mss, 2U);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index ee26711a952a..d59939ba1162 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1007,7 +1007,8 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
RT_SCOPE_UNIVERSE, sk->sk_protocol,
inet_sk_flowi_flags(sk),
- faddr, saddr, dport, inet->inet_sport);
+ faddr, saddr, dport, inet->inet_sport,
+ sk->sk_uid);
security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
@@ -2239,6 +2240,20 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
}
EXPORT_SYMBOL(udp_poll);
+int udp_abort(struct sock *sk, int err)
+{
+ lock_sock(sk);
+
+ sk->sk_err = err;
+ sk->sk_error_report(sk);
+ udp_disconnect(sk, 0);
+
+ release_sock(sk);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(udp_abort);
+
struct proto udp_prot = {
.name = "UDP",
.owner = THIS_MODULE,
@@ -2270,6 +2285,7 @@ struct proto udp_prot = {
.compat_getsockopt = compat_udp_getsockopt,
#endif
.clear_sk = sk_prot_clear_portaddr_nulls,
+ .diag_destroy = udp_abort,
};
EXPORT_SYMBOL(udp_prot);
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index 4a000f1dd757..9f5d1419f628 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -19,7 +19,7 @@
static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
struct netlink_callback *cb, struct inet_diag_req_v2 *req,
- struct nlattr *bc)
+ struct nlattr *bc, bool net_admin)
{
if (!inet_diag_bc_sk(bc, sk))
return 0;
@@ -27,7 +27,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
return inet_sk_diag_fill(sk, NULL, skb, req,
sk_user_ns(NETLINK_CB(cb->skb).sk),
NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
+ cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, net_admin);
}
static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
@@ -73,7 +73,8 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
err = inet_sk_diag_fill(sk, NULL, rep, req,
sk_user_ns(NETLINK_CB(in_skb).sk),
NETLINK_CB(in_skb).portid,
- nlh->nlmsg_seq, 0, nlh);
+ nlh->nlmsg_seq, 0, nlh,
+ netlink_net_capable(in_skb, CAP_NET_ADMIN));
if (err < 0) {
WARN_ON(err == -EMSGSIZE);
kfree_skb(rep);
@@ -95,6 +96,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlin
{
int num, s_num, slot, s_slot;
struct net *net = sock_net(skb->sk);
+ bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
s_slot = cb->args[0];
num = s_num = cb->args[1];
@@ -129,7 +131,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlin
r->id.idiag_dport)
goto next;
- if (sk_diag_dump(sk, skb, cb, r, bc) < 0) {
+ if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) {
spin_unlock_bh(&hslot->lock);
goto done;
}
@@ -162,11 +164,87 @@ static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
r->idiag_wqueue = sk_wmem_alloc_get(sk);
}
+#ifdef CONFIG_INET_DIAG_DESTROY
+static int __udp_diag_destroy(struct sk_buff *in_skb,
+ struct inet_diag_req_v2 *req,
+ struct udp_table *tbl)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct sock *sk;
+ int err;
+
+ rcu_read_lock();
+
+ if (req->sdiag_family == AF_INET)
+ sk = __udp4_lib_lookup(net,
+ req->id.idiag_dst[0], req->id.idiag_dport,
+ req->id.idiag_src[0], req->id.idiag_sport,
+ req->id.idiag_if, tbl);
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (req->sdiag_family == AF_INET6) {
+ if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) &&
+ ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src))
+ sk = __udp4_lib_lookup(net,
+ req->id.idiag_dst[3], req->id.idiag_dport,
+ req->id.idiag_src[3], req->id.idiag_sport,
+ req->id.idiag_if, tbl);
+
+ else
+ sk = __udp6_lib_lookup(net,
+ (struct in6_addr *)req->id.idiag_dst,
+ req->id.idiag_dport,
+ (struct in6_addr *)req->id.idiag_src,
+ req->id.idiag_sport,
+ req->id.idiag_if, tbl);
+ }
+#endif
+ else {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+
+ if (sk && !atomic_inc_not_zero(&sk->sk_refcnt))
+ sk = NULL;
+
+ rcu_read_unlock();
+
+ if (!sk)
+ return -ENOENT;
+
+ if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) {
+ sock_put(sk);
+ return -ENOENT;
+ }
+
+ err = sock_diag_destroy(sk, ECONNABORTED);
+
+ sock_put(sk);
+
+ return err;
+}
+
+static int udp_diag_destroy(struct sk_buff *in_skb,
+ struct inet_diag_req_v2 *req)
+{
+ return __udp_diag_destroy(in_skb, req, &udp_table);
+}
+
+static int udplite_diag_destroy(struct sk_buff *in_skb,
+ struct inet_diag_req_v2 *req)
+{
+ return __udp_diag_destroy(in_skb, req, &udplite_table);
+}
+
+#endif
+
static const struct inet_diag_handler udp_diag_handler = {
.dump = udp_diag_dump,
.dump_one = udp_diag_dump_one,
.idiag_get_info = udp_diag_get_info,
.idiag_type = IPPROTO_UDP,
+#ifdef CONFIG_INET_DIAG_DESTROY
+ .destroy = udp_diag_destroy,
+#endif
};
static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
@@ -186,6 +264,9 @@ static const struct inet_diag_handler udplite_diag_handler = {
.dump_one = udplite_diag_dump_one,
.idiag_get_info = udp_diag_get_info,
.idiag_type = IPPROTO_UDPLITE,
+#ifdef CONFIG_INET_DIAG_DESTROY
+ .destroy = udplite_diag_destroy,
+#endif
};
static int __init udp_diag_init(void)
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 7a5967bc0f38..047055dc35b4 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -107,6 +107,27 @@ static inline u32 cstamp_delta(unsigned long cstamp)
return (cstamp - INITIAL_JIFFIES) * 100UL / HZ;
}
+static inline s32 rfc3315_s14_backoff_init(s32 irt)
+{
+ /* multiply 'initial retransmission time' by 0.9 .. 1.1 */
+ u64 tmp = (900000 + prandom_u32() % 200001) * (u64)irt;
+ do_div(tmp, 1000000);
+ return (s32)tmp;
+}
+
+static inline s32 rfc3315_s14_backoff_update(s32 rt, s32 mrt)
+{
+ /* multiply 'retransmission timeout' by 1.9 .. 2.1 */
+ u64 tmp = (1900000 + prandom_u32() % 200001) * (u64)rt;
+ do_div(tmp, 1000000);
+ if ((s32)tmp > mrt) {
+ /* multiply 'maximum retransmission time' by 0.9 .. 1.1 */
+ tmp = (900000 + prandom_u32() % 200001) * (u64)mrt;
+ do_div(tmp, 1000000);
+ }
+ return (s32)tmp;
+}
+
#ifdef CONFIG_SYSCTL
static int addrconf_sysctl_register(struct inet6_dev *idev);
static void addrconf_sysctl_unregister(struct inet6_dev *idev);
@@ -179,6 +200,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
.dad_transmits = 1,
.rtr_solicits = MAX_RTR_SOLICITATIONS,
.rtr_solicit_interval = RTR_SOLICITATION_INTERVAL,
+ .rtr_solicit_max_interval = RTR_SOLICITATION_MAX_INTERVAL,
.rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY,
.use_tempaddr = 0,
.temp_valid_lft = TEMP_VALID_LIFETIME,
@@ -193,14 +215,17 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
.accept_ra_rtr_pref = 1,
.rtr_probe_interval = 60 * HZ,
#ifdef CONFIG_IPV6_ROUTE_INFO
+ .accept_ra_rt_info_min_plen = 0,
.accept_ra_rt_info_max_plen = 0,
#endif
#endif
+ .accept_ra_rt_table = 0,
.proxy_ndp = 0,
.accept_source_route = 0, /* we do not accept RH0 by default. */
.disable_ipv6 = 0,
.accept_dad = 1,
.suppress_frag_ndisc = 1,
+ .use_oif_addrs_only = 0,
};
static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -216,6 +241,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
.dad_transmits = 1,
.rtr_solicits = MAX_RTR_SOLICITATIONS,
.rtr_solicit_interval = RTR_SOLICITATION_INTERVAL,
+ .rtr_solicit_max_interval = RTR_SOLICITATION_MAX_INTERVAL,
.rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY,
.use_tempaddr = 0,
.temp_valid_lft = TEMP_VALID_LIFETIME,
@@ -230,14 +256,17 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
.accept_ra_rtr_pref = 1,
.rtr_probe_interval = 60 * HZ,
#ifdef CONFIG_IPV6_ROUTE_INFO
+ .accept_ra_rt_info_min_plen = 0,
.accept_ra_rt_info_max_plen = 0,
#endif
#endif
+ .accept_ra_rt_table = 0,
.proxy_ndp = 0,
.accept_source_route = 0, /* we do not accept RH0 by default. */
.disable_ipv6 = 0,
.accept_dad = 1,
.suppress_frag_ndisc = 1,
+ .use_oif_addrs_only = 0,
};
/* Check if a valid qdisc is available */
@@ -1171,6 +1200,9 @@ enum {
IPV6_SADDR_RULE_PRIVACY,
IPV6_SADDR_RULE_ORCHID,
IPV6_SADDR_RULE_PREFIX,
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+ IPV6_SADDR_RULE_NOT_OPTIMISTIC,
+#endif
IPV6_SADDR_RULE_MAX
};
@@ -1198,6 +1230,15 @@ static inline int ipv6_saddr_preferred(int type)
return 0;
}
+static inline bool ipv6_use_optimistic_addr(struct inet6_dev *idev)
+{
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+ return idev && idev->cnf.optimistic_dad && idev->cnf.use_optimistic;
+#else
+ return false;
+#endif
+}
+
static int ipv6_get_saddr_eval(struct net *net,
struct ipv6_saddr_score *score,
struct ipv6_saddr_dst *dst,
@@ -1258,10 +1299,16 @@ static int ipv6_get_saddr_eval(struct net *net,
score->scopedist = ret;
break;
case IPV6_SADDR_RULE_PREFERRED:
+ {
/* Rule 3: Avoid deprecated and optimistic addresses */
+ u8 avoid = IFA_F_DEPRECATED;
+
+ if (!ipv6_use_optimistic_addr(score->ifa->idev))
+ avoid |= IFA_F_OPTIMISTIC;
ret = ipv6_saddr_preferred(score->addr_type) ||
- !(score->ifa->flags & (IFA_F_DEPRECATED|IFA_F_OPTIMISTIC));
+ !(score->ifa->flags & avoid);
break;
+ }
#ifdef CONFIG_IPV6_MIP6
case IPV6_SADDR_RULE_HOA:
{
@@ -1307,6 +1354,14 @@ static int ipv6_get_saddr_eval(struct net *net,
ret = score->ifa->prefix_len;
score->matchlen = ret;
break;
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+ case IPV6_SADDR_RULE_NOT_OPTIMISTIC:
+ /* Optimistic addresses still have lower precedence than other
+ * preferred addresses.
+ */
+ ret = !(score->ifa->flags & IFA_F_OPTIMISTIC);
+ break;
+#endif
default:
ret = 0;
}
@@ -1318,15 +1373,96 @@ out:
return ret;
}
+static int __ipv6_dev_get_saddr(struct net *net,
+ struct ipv6_saddr_dst *dst,
+ struct inet6_dev *idev,
+ struct ipv6_saddr_score *scores,
+ int hiscore_idx)
+{
+ struct ipv6_saddr_score *score = &scores[1 - hiscore_idx], *hiscore = &scores[hiscore_idx];
+
+ read_lock_bh(&idev->lock);
+ list_for_each_entry(score->ifa, &idev->addr_list, if_list) {
+ int i;
+
+ /*
+ * - Tentative Address (RFC2462 section 5.4)
+ * - A tentative address is not considered
+ * "assigned to an interface" in the traditional
+ * sense, unless it is also flagged as optimistic.
+ * - Candidate Source Address (section 4)
+ * - In any case, anycast addresses, multicast
+ * addresses, and the unspecified address MUST
+ * NOT be included in a candidate set.
+ */
+ if ((score->ifa->flags & IFA_F_TENTATIVE) &&
+ (!(score->ifa->flags & IFA_F_OPTIMISTIC)))
+ continue;
+
+ score->addr_type = __ipv6_addr_type(&score->ifa->addr);
+
+ if (unlikely(score->addr_type == IPV6_ADDR_ANY ||
+ score->addr_type & IPV6_ADDR_MULTICAST)) {
+ net_dbg_ratelimited("ADDRCONF: unspecified / multicast address assigned as unicast address on %s",
+ idev->dev->name);
+ continue;
+ }
+
+ score->rule = -1;
+ bitmap_zero(score->scorebits, IPV6_SADDR_RULE_MAX);
+
+ for (i = 0; i < IPV6_SADDR_RULE_MAX; i++) {
+ int minihiscore, miniscore;
+
+ minihiscore = ipv6_get_saddr_eval(net, hiscore, dst, i);
+ miniscore = ipv6_get_saddr_eval(net, score, dst, i);
+
+ if (minihiscore > miniscore) {
+ if (i == IPV6_SADDR_RULE_SCOPE &&
+ score->scopedist > 0) {
+ /*
+ * special case:
+ * each remaining entry
+ * has too small (not enough)
+ * scope, because ifa entries
+ * are sorted by their scope
+ * values.
+ */
+ goto out;
+ }
+ break;
+ } else if (minihiscore < miniscore) {
+ if (hiscore->ifa)
+ in6_ifa_put(hiscore->ifa);
+
+ in6_ifa_hold(score->ifa);
+
+ swap(hiscore, score);
+ hiscore_idx = 1 - hiscore_idx;
+
+ /* restore our iterator */
+ score->ifa = hiscore->ifa;
+
+ break;
+ }
+ }
+ }
+out:
+ read_unlock_bh(&idev->lock);
+ return hiscore_idx;
+}
+
int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev,
const struct in6_addr *daddr, unsigned int prefs,
struct in6_addr *saddr)
{
- struct ipv6_saddr_score scores[2],
- *score = &scores[0], *hiscore = &scores[1];
+ struct ipv6_saddr_score scores[2], *hiscore;
struct ipv6_saddr_dst dst;
+ struct inet6_dev *idev;
struct net_device *dev;
int dst_type;
+ bool use_oif_addr = false;
+ int hiscore_idx = 0;
dst_type = __ipv6_addr_type(daddr);
dst.addr = daddr;
@@ -1335,107 +1471,50 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev,
dst.label = ipv6_addr_label(net, daddr, dst_type, dst.ifindex);
dst.prefs = prefs;
- hiscore->rule = -1;
- hiscore->ifa = NULL;
+ scores[hiscore_idx].rule = -1;
+ scores[hiscore_idx].ifa = NULL;
rcu_read_lock();
- for_each_netdev_rcu(net, dev) {
- struct inet6_dev *idev;
-
- /* Candidate Source Address (section 4)
- * - multicast and link-local destination address,
- * the set of candidate source address MUST only
- * include addresses assigned to interfaces
- * belonging to the same link as the outgoing
- * interface.
- * (- For site-local destination addresses, the
- * set of candidate source addresses MUST only
- * include addresses assigned to interfaces
- * belonging to the same site as the outgoing
- * interface.)
- */
- if (((dst_type & IPV6_ADDR_MULTICAST) ||
- dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL) &&
- dst.ifindex && dev->ifindex != dst.ifindex)
- continue;
-
- idev = __in6_dev_get(dev);
- if (!idev)
- continue;
-
- read_lock_bh(&idev->lock);
- list_for_each_entry(score->ifa, &idev->addr_list, if_list) {
- int i;
-
- /*
- * - Tentative Address (RFC2462 section 5.4)
- * - A tentative address is not considered
- * "assigned to an interface" in the traditional
- * sense, unless it is also flagged as optimistic.
- * - Candidate Source Address (section 4)
- * - In any case, anycast addresses, multicast
- * addresses, and the unspecified address MUST
- * NOT be included in a candidate set.
- */
- if ((score->ifa->flags & IFA_F_TENTATIVE) &&
- (!(score->ifa->flags & IFA_F_OPTIMISTIC)))
- continue;
-
- score->addr_type = __ipv6_addr_type(&score->ifa->addr);
+ /* Candidate Source Address (section 4)
+ * - multicast and link-local destination address,
+ * the set of candidate source address MUST only
+ * include addresses assigned to interfaces
+ * belonging to the same link as the outgoing
+ * interface.
+ * (- For site-local destination addresses, the
+ * set of candidate source addresses MUST only
+ * include addresses assigned to interfaces
+ * belonging to the same site as the outgoing
+ * interface.)
+ * - "It is RECOMMENDED that the candidate source addresses
+ * be the set of unicast addresses assigned to the
+ * interface that will be used to send to the destination
+ * (the 'outgoing' interface)." (RFC 6724)
+ */
+ if (dst_dev) {
+ idev = __in6_dev_get(dst_dev);
+ if ((dst_type & IPV6_ADDR_MULTICAST) ||
+ dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL ||
+ (idev && idev->cnf.use_oif_addrs_only)) {
+ use_oif_addr = true;
+ }
+ }
- if (unlikely(score->addr_type == IPV6_ADDR_ANY ||
- score->addr_type & IPV6_ADDR_MULTICAST)) {
- LIMIT_NETDEBUG(KERN_DEBUG
- "ADDRCONF: unspecified / multicast address "
- "assigned as unicast address on %s",
- dev->name);
+ if (use_oif_addr) {
+ if (idev)
+ hiscore_idx = __ipv6_dev_get_saddr(net, &dst, idev, scores, hiscore_idx);
+ } else {
+ for_each_netdev_rcu(net, dev) {
+ idev = __in6_dev_get(dev);
+ if (!idev)
continue;
- }
-
- score->rule = -1;
- bitmap_zero(score->scorebits, IPV6_SADDR_RULE_MAX);
-
- for (i = 0; i < IPV6_SADDR_RULE_MAX; i++) {
- int minihiscore, miniscore;
-
- minihiscore = ipv6_get_saddr_eval(net, hiscore, &dst, i);
- miniscore = ipv6_get_saddr_eval(net, score, &dst, i);
-
- if (minihiscore > miniscore) {
- if (i == IPV6_SADDR_RULE_SCOPE &&
- score->scopedist > 0) {
- /*
- * special case:
- * each remaining entry
- * has too small (not enough)
- * scope, because ifa entries
- * are sorted by their scope
- * values.
- */
- goto try_nextdev;
- }
- break;
- } else if (minihiscore < miniscore) {
- if (hiscore->ifa)
- in6_ifa_put(hiscore->ifa);
-
- in6_ifa_hold(score->ifa);
-
- swap(hiscore, score);
-
- /* restore our iterator */
- score->ifa = hiscore->ifa;
-
- break;
- }
- }
+ hiscore_idx = __ipv6_dev_get_saddr(net, &dst, idev, scores, hiscore_idx);
}
-try_nextdev:
- read_unlock_bh(&idev->lock);
}
rcu_read_unlock();
+ hiscore = &scores[hiscore_idx];
if (!hiscore->ifa)
return -EADDRNOTAVAIL;
@@ -1496,15 +1575,30 @@ static int ipv6_count_addresses(struct inet6_dev *idev)
int ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
const struct net_device *dev, int strict)
{
+ return ipv6_chk_addr_and_flags(net, addr, dev, strict, IFA_F_TENTATIVE);
+}
+EXPORT_SYMBOL(ipv6_chk_addr);
+
+int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
+ const struct net_device *dev, int strict,
+ u32 banned_flags)
+{
struct inet6_ifaddr *ifp;
unsigned int hash = inet6_addr_hash(addr);
+ u32 ifp_flags;
rcu_read_lock_bh();
hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) {
if (!net_eq(dev_net(ifp->idev->dev), net))
continue;
+ /* Decouple optimistic from tentative for evaluation here.
+ * Ban optimistic addresses explicitly, when required.
+ */
+ ifp_flags = (ifp->flags&IFA_F_OPTIMISTIC)
+ ? (ifp->flags&~IFA_F_TENTATIVE)
+ : ifp->flags;
if (ipv6_addr_equal(&ifp->addr, addr) &&
- !(ifp->flags&IFA_F_TENTATIVE) &&
+ !(ifp_flags&banned_flags) &&
(dev == NULL || ifp->idev->dev == dev ||
!(ifp->scope&(IFA_LINK|IFA_HOST) || strict))) {
rcu_read_unlock_bh();
@@ -1515,7 +1609,7 @@ int ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
rcu_read_unlock_bh();
return 0;
}
-EXPORT_SYMBOL(ipv6_chk_addr);
+EXPORT_SYMBOL(ipv6_chk_addr_and_flags);
static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr,
struct net_device *dev)
@@ -1968,6 +2062,31 @@ static void __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmp
__ipv6_regen_rndid(idev);
}
+u32 addrconf_rt_table(const struct net_device *dev, u32 default_table) {
+ /* Determines into what table to put autoconf PIO/RIO/default routes
+ * learned on this device.
+ *
+ * - If 0, use the same table for every device. This puts routes into
+ * one of RT_TABLE_{PREFIX,INFO,DFLT} depending on the type of route
+ * (but note that these three are currently all equal to
+ * RT6_TABLE_MAIN).
+ * - If > 0, use the specified table.
+ * - If < 0, put routes into table dev->ifindex + (-rt_table).
+ */
+ struct inet6_dev *idev = in6_dev_get(dev);
+ u32 table;
+ int sysctl = idev->cnf.accept_ra_rt_table;
+ if (sysctl == 0) {
+ table = default_table;
+ } else if (sysctl > 0) {
+ table = (u32) sysctl;
+ } else {
+ table = (unsigned) dev->ifindex + (-sysctl);
+ }
+ in6_dev_put(idev);
+ return table;
+}
+
/*
* Add prefix route.
*/
@@ -1977,7 +2096,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev,
unsigned long expires, u32 flags)
{
struct fib6_config cfg = {
- .fc_table = RT6_TABLE_PREFIX,
+ .fc_table = addrconf_rt_table(dev, RT6_TABLE_PREFIX),
.fc_metric = IP6_RT_PRIO_ADDRCONF,
.fc_ifindex = dev->ifindex,
.fc_expires = expires,
@@ -2011,7 +2130,8 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
struct rt6_info *rt = NULL;
struct fib6_table *table;
- table = fib6_get_table(dev_net(dev), RT6_TABLE_PREFIX);
+ table = fib6_get_table(dev_net(dev),
+ addrconf_rt_table(dev, RT6_TABLE_PREFIX));
if (table == NULL)
return NULL;
@@ -3138,7 +3258,7 @@ static void addrconf_rs_timer(unsigned long data)
if (idev->if_flags & IF_RA_RCVD)
goto out;
- if (idev->rs_probes++ < idev->cnf.rtr_solicits) {
+ if (idev->rs_probes++ < idev->cnf.rtr_solicits || idev->cnf.rtr_solicits < 0) {
write_unlock(&idev->lock);
if (!ipv6_get_lladdr(dev, &lladdr, IFA_F_TENTATIVE))
ndisc_send_rs(dev, &lladdr,
@@ -3147,11 +3267,13 @@ static void addrconf_rs_timer(unsigned long data)
goto put;
write_lock(&idev->lock);
+ idev->rs_interval = rfc3315_s14_backoff_update(
+ idev->rs_interval, idev->cnf.rtr_solicit_max_interval);
/* The wait after the last probe can be shorter */
addrconf_mod_rs_timer(idev, (idev->rs_probes ==
idev->cnf.rtr_solicits) ?
idev->cnf.rtr_solicit_delay :
- idev->cnf.rtr_solicit_interval);
+ idev->rs_interval);
} else {
/*
* Note: we do not support deprecated "all on-link"
@@ -3226,8 +3348,15 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
* Optimistic nodes can start receiving
* Frames right away
*/
- if (ifp->flags & IFA_F_OPTIMISTIC)
+ if (ifp->flags & IFA_F_OPTIMISTIC) {
ip6_ins_rt(ifp->rt);
+ if (ipv6_use_optimistic_addr(idev)) {
+ /* Because optimistic nodes can use this address,
+ * notify listeners. If DAD fails, RTM_DELADDR is sent.
+ */
+ ipv6_ifa_notify(RTM_NEWADDR, ifp);
+ }
+ }
addrconf_dad_kick(ifp);
out:
@@ -3369,7 +3498,7 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp)
send_mld = ifp->scope == IFA_LINK && ipv6_lonely_lladdr(ifp);
send_rs = send_mld &&
ipv6_accept_ra(ifp->idev) &&
- ifp->idev->cnf.rtr_solicits > 0 &&
+ ifp->idev->cnf.rtr_solicits != 0 &&
(dev->flags&IFF_LOOPBACK) == 0;
read_unlock_bh(&ifp->idev->lock);
@@ -3391,10 +3520,11 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp)
write_lock_bh(&ifp->idev->lock);
spin_lock(&ifp->lock);
+ ifp->idev->rs_interval = rfc3315_s14_backoff_init(
+ ifp->idev->cnf.rtr_solicit_interval);
ifp->idev->rs_probes = 1;
ifp->idev->if_flags |= IF_RS_SENT;
- addrconf_mod_rs_timer(ifp->idev,
- ifp->idev->cnf.rtr_solicit_interval);
+ addrconf_mod_rs_timer(ifp->idev, ifp->idev->rs_interval);
spin_unlock(&ifp->lock);
write_unlock_bh(&ifp->idev->lock);
}
@@ -4308,6 +4438,8 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
array[DEVCONF_RTR_SOLICITS] = cnf->rtr_solicits;
array[DEVCONF_RTR_SOLICIT_INTERVAL] =
jiffies_to_msecs(cnf->rtr_solicit_interval);
+ array[DEVCONF_RTR_SOLICIT_MAX_INTERVAL] =
+ jiffies_to_msecs(cnf->rtr_solicit_max_interval);
array[DEVCONF_RTR_SOLICIT_DELAY] =
jiffies_to_msecs(cnf->rtr_solicit_delay);
array[DEVCONF_FORCE_MLD_VERSION] = cnf->force_mld_version;
@@ -4328,13 +4460,16 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
array[DEVCONF_RTR_PROBE_INTERVAL] =
jiffies_to_msecs(cnf->rtr_probe_interval);
#ifdef CONFIG_IPV6_ROUTE_INFO
+ array[DEVCONF_ACCEPT_RA_RT_INFO_MIN_PLEN] = cnf->accept_ra_rt_info_min_plen;
array[DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN] = cnf->accept_ra_rt_info_max_plen;
#endif
#endif
+ array[DEVCONF_ACCEPT_RA_RT_TABLE] = cnf->accept_ra_rt_table;
array[DEVCONF_PROXY_NDP] = cnf->proxy_ndp;
array[DEVCONF_ACCEPT_SOURCE_ROUTE] = cnf->accept_source_route;
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
array[DEVCONF_OPTIMISTIC_DAD] = cnf->optimistic_dad;
+ array[DEVCONF_USE_OPTIMISTIC] = cnf->use_optimistic;
#endif
#ifdef CONFIG_IPV6_MROUTE
array[DEVCONF_MC_FORWARDING] = cnf->mc_forwarding;
@@ -4345,6 +4480,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
array[DEVCONF_NDISC_NOTIFY] = cnf->ndisc_notify;
array[DEVCONF_SUPPRESS_FRAG_NDISC] = cnf->suppress_frag_ndisc;
array[DEVCONF_ACCEPT_RA_FROM_LOCAL] = cnf->accept_ra_from_local;
+ array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only;
}
static inline size_t inet6_ifla6_size(void)
@@ -4496,7 +4632,7 @@ static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token)
return -EINVAL;
if (!ipv6_accept_ra(idev))
return -EINVAL;
- if (idev->cnf.rtr_solicits <= 0)
+ if (idev->cnf.rtr_solicits == 0)
return -EINVAL;
write_lock_bh(&idev->lock);
@@ -4521,8 +4657,10 @@ static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token)
if (update_rs) {
idev->if_flags |= IF_RS_SENT;
+ idev->rs_interval = rfc3315_s14_backoff_init(
+ idev->cnf.rtr_solicit_interval);
idev->rs_probes = 1;
- addrconf_mod_rs_timer(idev, idev->cnf.rtr_solicit_interval);
+ addrconf_mod_rs_timer(idev, idev->rs_interval);
}
/* Well, that's kinda nasty ... */
@@ -5059,6 +5197,13 @@ static struct addrconf_sysctl_table
.proc_handler = proc_dointvec_jiffies,
},
{
+ .procname = "router_solicitation_max_interval",
+ .data = &ipv6_devconf.rtr_solicit_max_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
.procname = "router_solicitation_delay",
.data = &ipv6_devconf.rtr_solicit_delay,
.maxlen = sizeof(int),
@@ -5161,6 +5306,13 @@ static struct addrconf_sysctl_table
},
#ifdef CONFIG_IPV6_ROUTE_INFO
{
+ .procname = "accept_ra_rt_info_min_plen",
+ .data = &ipv6_devconf.accept_ra_rt_info_min_plen,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "accept_ra_rt_info_max_plen",
.data = &ipv6_devconf.accept_ra_rt_info_max_plen,
.maxlen = sizeof(int),
@@ -5170,6 +5322,13 @@ static struct addrconf_sysctl_table
#endif
#endif
{
+ .procname = "accept_ra_rt_table",
+ .data = &ipv6_devconf.accept_ra_rt_table,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "proxy_ndp",
.data = &ipv6_devconf.proxy_ndp,
.maxlen = sizeof(int),
@@ -5192,6 +5351,14 @@ static struct addrconf_sysctl_table
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "use_optimistic",
+ .data = &ipv6_devconf.use_optimistic,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+
+ },
#endif
#ifdef CONFIG_IPV6_MROUTE
{
@@ -5245,6 +5412,14 @@ static struct addrconf_sysctl_table
.proc_handler = proc_dointvec,
},
{
+ .procname = "use_oif_addrs_only",
+ .data = &ipv6_devconf.use_oif_addrs_only,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+
+ },
+ {
/* sentinel */
}
},
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index ad95905e7a70..476257accf8f 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -64,6 +64,20 @@
#include <asm/uaccess.h>
#include <linux/mroute6.h>
+#ifdef CONFIG_ANDROID_PARANOID_NETWORK
+#include <linux/android_aid.h>
+
+static inline int current_has_network(void)
+{
+ return in_egroup_p(AID_INET) || capable(CAP_NET_RAW);
+}
+#else
+static inline int current_has_network(void)
+{
+ return 1;
+}
+#endif
+
MODULE_AUTHOR("Cast of dozens");
MODULE_DESCRIPTION("IPv6 protocol stack for Linux");
MODULE_LICENSE("GPL");
@@ -109,6 +123,9 @@ static int inet6_create(struct net *net, struct socket *sock, int protocol,
int try_loading_module = 0;
int err;
+ if (!current_has_network())
+ return -EACCES;
+
if (protocol < 0 || protocol >= IPPROTO_MAX)
return -EINVAL;
@@ -158,8 +175,7 @@ lookup_protocol:
}
err = -EPERM;
- if (sock->type == SOCK_RAW && !kern &&
- !ns_capable(net->user_ns, CAP_NET_RAW))
+ if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
goto out_rcu_unlock;
sock->ops = answer->ops;
@@ -659,6 +675,7 @@ int inet6_sk_rebuild_header(struct sock *sk)
fl6.flowi6_mark = sk->sk_mark;
fl6.fl6_dport = inet->inet_dport;
fl6.fl6_sport = inet->inet_sport;
+ fl6.flowi6_uid = sk->sk_uid;
security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
rcu_read_lock();
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index 6d16eb0e0c7f..bacb85c8c897 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -660,9 +660,10 @@ static int ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
return 0;
if (type == NDISC_REDIRECT)
- ip6_redirect(skb, net, skb->dev->ifindex, 0);
+ ip6_redirect(skb, net, skb->dev->ifindex, 0,
+ sock_net_uid(net, NULL));
else
- ip6_update_pmtu(skb, net, info, 0, 0);
+ ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
xfrm_state_put(x);
return 0;
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index ee7a8983b173..6c87d58e5324 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -165,6 +165,7 @@ ipv4_connected:
fl6.flowi6_mark = sk->sk_mark;
fl6.fl6_dport = inet->inet_dport;
fl6.fl6_sport = inet->inet_sport;
+ fl6.flowi6_uid = sk->sk_uid;
if (!fl6.flowi6_oif && (addr_type&IPV6_ADDR_MULTICAST))
fl6.flowi6_oif = np->mcast_oif;
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 83fc3a385a26..6c51fbd062fd 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -439,9 +439,10 @@ static int esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
return 0;
if (type == NDISC_REDIRECT)
- ip6_redirect(skb, net, skb->dev->ifindex, 0);
+ ip6_redirect(skb, net, skb->dev->ifindex, 0,
+ sock_net_uid(net, NULL));
else
- ip6_update_pmtu(skb, net, info, 0, 0);
+ ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
xfrm_state_put(x);
return 0;
diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c
index c7c8f71d0d48..462ccfe2c755 100644
--- a/net/ipv6/exthdrs_core.c
+++ b/net/ipv6/exthdrs_core.c
@@ -166,15 +166,15 @@ EXPORT_SYMBOL_GPL(ipv6_find_tlv);
* to explore inner IPv6 header, eg. ICMPv6 error messages.
*
* If target header is found, its offset is set in *offset and return protocol
- * number. Otherwise, return -1.
+ * number. Otherwise, return -ENOENT or -EBADMSG.
*
* If the first fragment doesn't contain the final protocol header or
* NEXTHDR_NONE it is considered invalid.
*
* Note that non-1st fragment is special case that "the protocol number
* of last header" is "next header" field in Fragment header. In this case,
- * *offset is meaningless and fragment offset is stored in *fragoff if fragoff
- * isn't NULL.
+ * *offset is meaningless. If fragoff is not NULL, the fragment offset is
+ * stored in *fragoff; if it is NULL, return -EINVAL.
*
* if flags is not NULL and it's a fragment, then the frag flag
* IP6_FH_F_FRAG will be set. If it's an AH header, the
@@ -253,9 +253,12 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
if (target < 0 &&
((!ipv6_ext_hdr(hp->nexthdr)) ||
hp->nexthdr == NEXTHDR_NONE)) {
- if (fragoff)
+ if (fragoff) {
*fragoff = _frag_off;
- return hp->nexthdr;
+ return hp->nexthdr;
+ } else {
+ return -EINVAL;
+ }
}
if (!found)
return -ENOENT;
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 97ae70077a4f..439aaaff0d23 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -91,13 +91,14 @@ static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
struct net *net = dev_net(skb->dev);
if (type == ICMPV6_PKT_TOOBIG)
- ip6_update_pmtu(skb, net, info, 0, 0);
+ ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
else if (type == NDISC_REDIRECT)
- ip6_redirect(skb, net, skb->dev->ifindex, 0);
+ ip6_redirect(skb, net, skb->dev->ifindex, 0,
+ sock_net_uid(net, NULL));
if (!(type & ICMPV6_INFOMSG_MASK))
if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST)
- ping_err(skb, offset, info);
+ ping_err(skb, offset, ntohl(info));
}
static int icmpv6_rcv(struct sk_buff *skb);
@@ -475,6 +476,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
fl6.flowi6_oif = iif;
fl6.fl6_icmp_type = type;
fl6.fl6_icmp_code = code;
+ fl6.flowi6_uid = sock_net_uid(net, NULL);
security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
sk = icmpv6_xmit_lock(net);
@@ -578,6 +580,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
fl6.flowi6_oif = skb->dev->ifindex;
fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY;
fl6.flowi6_mark = mark;
+ fl6.flowi6_uid = sock_net_uid(net, NULL);
security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
sk = icmpv6_xmit_lock(net);
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 6cc516c825b6..fd119da3cda9 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -85,6 +85,7 @@ struct dst_entry *inet6_csk_route_req(struct sock *sk,
fl6->flowi6_mark = ireq->ir_mark;
fl6->fl6_dport = ireq->ir_rmt_port;
fl6->fl6_sport = htons(ireq->ir_num);
+ fl6->flowi6_uid = sk->sk_uid;
security_req_classify_flow(req, flowi6_to_flowi(fl6));
dst = ip6_dst_lookup_flow(sk, fl6, final_p);
@@ -208,6 +209,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk,
fl6->flowi6_mark = sk->sk_mark;
fl6->fl6_sport = inet->inet_sport;
fl6->fl6_dport = inet->inet_dport;
+ fl6->flowi6_uid = sk->sk_uid;
security_sk_classify_flow(sk, flowi6_to_flowi(fl6));
rcu_read_lock();
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index df3633e20458..140f44b94afb 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -800,6 +800,8 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev)
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
fl6.flowi6_mark = skb->mark;
+ fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+
err = ip6gre_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu);
if (err != 0) {
/* XXX: send ICMP error even if DF is not set. */
@@ -850,6 +852,8 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev)
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
fl6.flowi6_mark = skb->mark;
+ fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+
err = ip6gre_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu);
if (err != 0) {
if (err == -EMSGSIZE)
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 8afea07e53a2..6d5499064019 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1097,6 +1097,8 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
fl6.flowi6_proto = IPPROTO_IPIP;
+ fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+
dsfield = ipv4_get_dsfield(iph);
if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
@@ -1148,6 +1150,7 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
fl6.flowi6_proto = IPPROTO_IPV6;
+ fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
dsfield = ipv6_get_dsfield(ipv6h);
if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index 591e2355cc9e..754c41c9eed5 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -560,9 +560,10 @@ static int vti6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
return 0;
if (type == NDISC_REDIRECT)
- ip6_redirect(skb, net, skb->dev->ifindex, 0);
+ ip6_redirect(skb, net, skb->dev->ifindex, 0,
+ sock_net_uid(net, NULL));
else
- ip6_update_pmtu(skb, net, info, 0, 0);
+ ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
xfrm_state_put(x);
return 0;
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 1b9316e1386a..54d165b9845a 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -74,9 +74,10 @@ static int ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
return 0;
if (type == NDISC_REDIRECT)
- ip6_redirect(skb, net, skb->dev->ifindex, 0);
+ ip6_redirect(skb, net, skb->dev->ifindex, 0,
+ sock_net_uid(net, NULL));
else
- ip6_update_pmtu(skb, net, info, 0, 0);
+ ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
xfrm_state_put(x);
return 0;
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index a46c50423aec..a3033cddd09c 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -654,7 +654,9 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb)
struct in6_addr *target = (struct in6_addr *)&neigh->primary_key;
int probes = atomic_read(&neigh->probes);
- if (skb && ipv6_chk_addr(dev_net(dev), &ipv6_hdr(skb)->saddr, dev, 1))
+ if (skb && ipv6_chk_addr_and_flags(dev_net(dev), &ipv6_hdr(skb)->saddr,
+ dev, 1,
+ IFA_F_TENTATIVE|IFA_F_OPTIMISTIC))
saddr = &ipv6_hdr(skb)->saddr;
if ((probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES)) < 0) {
@@ -1323,6 +1325,8 @@ skip_linkparms:
if (ri->prefix_len == 0 &&
!in6_dev->cnf.accept_ra_defrtr)
continue;
+ if (ri->prefix_len < in6_dev->cnf.accept_ra_rt_info_min_plen)
+ continue;
if (ri->prefix_len > in6_dev->cnf.accept_ra_rt_info_max_plen)
continue;
rt6_route_rcv(skb->dev, (u8 *)p, (p->nd_opt_len) << 3,
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index d38e6a8d8b9f..e48d26beda46 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -27,6 +27,7 @@ int ip6_route_me_harder(struct sk_buff *skb)
struct flowi6 fl6 = {
.flowi6_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
.flowi6_mark = skb->mark,
+ .flowi6_uid = sock_net_uid(net, skb->sk),
.daddr = iph->daddr,
.saddr = iph->saddr,
};
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 6af874fc187f..5ea61508bc6a 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -173,7 +173,8 @@ config IP6_NF_MATCH_MH
config IP6_NF_MATCH_RPFILTER
tristate '"rpfilter" reverse path filter match support'
- depends on NETFILTER_ADVANCED && (IP6_NF_MANGLE || IP6_NF_RAW)
+ depends on NETFILTER_ADVANCED
+ depends on IP6_NF_MANGLE || IP6_NF_RAW
---help---
This option allows you to match packets whose replies would
go out via the interface the packet came in.
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 50bb5e5c99c1..592ce12840fd 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -85,7 +85,7 @@ int ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
struct icmp6hdr user_icmph;
int addr_type;
struct in6_addr *daddr;
- int iif = 0;
+ int oif = 0;
struct flowi6 fl6;
int err;
int hlimit;
@@ -107,25 +107,30 @@ int ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
if (u->sin6_family != AF_INET6) {
return -EAFNOSUPPORT;
}
- if (sk->sk_bound_dev_if &&
- sk->sk_bound_dev_if != u->sin6_scope_id) {
- return -EINVAL;
- }
daddr = &(u->sin6_addr);
- iif = u->sin6_scope_id;
+ if (__ipv6_addr_needs_scope_id(ipv6_addr_type(daddr)))
+ oif = u->sin6_scope_id;
} else {
if (sk->sk_state != TCP_ESTABLISHED)
return -EDESTADDRREQ;
daddr = &sk->sk_v6_daddr;
}
- if (!iif)
- iif = sk->sk_bound_dev_if;
+ if (!oif)
+ oif = sk->sk_bound_dev_if;
+
+ if (!oif)
+ oif = np->sticky_pktinfo.ipi6_ifindex;
+
+ if (!oif && ipv6_addr_is_multicast(daddr))
+ oif = np->mcast_oif;
+ else if (!oif)
+ oif = np->ucast_oif;
addr_type = ipv6_addr_type(daddr);
- if (__ipv6_addr_needs_scope_id(addr_type) && !iif)
- return -EINVAL;
- if (addr_type & IPV6_ADDR_MAPPED)
+ if ((__ipv6_addr_needs_scope_id(addr_type) && !oif) ||
+ (addr_type & IPV6_ADDR_MAPPED) ||
+ (oif && sk->sk_bound_dev_if && oif != sk->sk_bound_dev_if))
return -EINVAL;
/* TODO: use ip6_datagram_send_ctl to get options from cmsg */
@@ -135,16 +140,13 @@ int ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
fl6.flowi6_proto = IPPROTO_ICMPV6;
fl6.saddr = np->saddr;
fl6.daddr = *daddr;
+ fl6.flowi6_oif = oif;
fl6.flowi6_mark = sk->sk_mark;
+ fl6.flowi6_uid = sk->sk_uid;
fl6.fl6_icmp_type = user_icmph.icmp6_type;
fl6.fl6_icmp_code = user_icmph.icmp6_code;
security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
- if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
- fl6.flowi6_oif = np->mcast_oif;
- else if (!fl6.flowi6_oif)
- fl6.flowi6_oif = np->ucast_oif;
-
dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr);
if (IS_ERR(dst))
return PTR_ERR(dst);
@@ -154,11 +156,6 @@ int ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
if (!np)
return -EBADF;
- if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
- fl6.flowi6_oif = np->mcast_oif;
- else if (!fl6.flowi6_oif)
- fl6.flowi6_oif = np->ucast_oif;
-
pfh.icmph.type = user_icmph.icmp6_type;
pfh.icmph.code = user_icmph.icmp6_code;
pfh.icmph.checksum = 0;
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 0bf9f589fb27..0fb940d584b4 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -771,6 +771,7 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
memset(&fl6, 0, sizeof(fl6));
fl6.flowi6_mark = sk->sk_mark;
+ fl6.flowi6_uid = sk->sk_uid;
if (sin6) {
if (addr_len < SIN6_LEN_RFC2133)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 5ded6638a5e4..223aa4292c1b 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -95,13 +95,12 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
#ifdef CONFIG_IPV6_ROUTE_INFO
-static struct rt6_info *rt6_add_route_info(struct net *net,
+static struct rt6_info *rt6_add_route_info(struct net_device *dev,
const struct in6_addr *prefix, int prefixlen,
- const struct in6_addr *gwaddr, int ifindex,
- unsigned int pref);
-static struct rt6_info *rt6_get_route_info(struct net *net,
+ const struct in6_addr *gwaddr, unsigned int pref);
+static struct rt6_info *rt6_get_route_info(struct net_device *dev,
const struct in6_addr *prefix, int prefixlen,
- const struct in6_addr *gwaddr, int ifindex);
+ const struct in6_addr *gwaddr);
#endif
static void rt6_bind_peer(struct rt6_info *rt, int create)
@@ -700,7 +699,6 @@ static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
const struct in6_addr *gwaddr)
{
- struct net *net = dev_net(dev);
struct route_info *rinfo = (struct route_info *) opt;
struct in6_addr prefix_buf, *prefix;
unsigned int pref;
@@ -745,8 +743,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
if (rinfo->prefix_len == 0)
rt = rt6_get_dflt_router(gwaddr, dev);
else
- rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
- gwaddr, dev->ifindex);
+ rt = rt6_get_route_info(dev, prefix, rinfo->prefix_len, gwaddr);
if (rt && !lifetime) {
ip6_del_rt(rt);
@@ -754,8 +751,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
}
if (!rt && lifetime)
- rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
- pref);
+ rt = rt6_add_route_info(dev, prefix, rinfo->prefix_len, gwaddr, pref);
else if (rt)
rt->rt6i_flags = RTF_ROUTEINFO |
(rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
@@ -1159,7 +1155,7 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
}
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
- int oif, u32 mark)
+ int oif, u32 mark, kuid_t uid)
{
const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
struct dst_entry *dst;
@@ -1171,6 +1167,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
fl6.daddr = iph->daddr;
fl6.saddr = iph->saddr;
fl6.flowlabel = ip6_flowinfo(iph);
+ fl6.flowi6_uid = uid;
dst = ip6_route_output(net, NULL, &fl6);
if (!dst->error)
@@ -1182,7 +1179,7 @@ EXPORT_SYMBOL_GPL(ip6_update_pmtu);
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
ip6_update_pmtu(skb, sock_net(sk), mtu,
- sk->sk_bound_dev_if, sk->sk_mark);
+ sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
@@ -1257,7 +1254,8 @@ static struct dst_entry *ip6_route_redirect(struct net *net,
flags, __ip6_route_redirect);
}
-void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
+void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
+ kuid_t uid)
{
const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
struct dst_entry *dst;
@@ -1270,6 +1268,7 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
fl6.daddr = iph->daddr;
fl6.saddr = iph->saddr;
fl6.flowlabel = ip6_flowinfo(iph);
+ fl6.flowi6_uid = uid;
dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
rt6_do_redirect(dst, NULL, skb);
@@ -1291,6 +1290,7 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
fl6.flowi6_mark = mark;
fl6.daddr = msg->dest;
fl6.saddr = iph->daddr;
+ fl6.flowi6_uid = sock_net_uid(net, NULL);
dst = ip6_route_redirect(net, &fl6, &iph->saddr);
rt6_do_redirect(dst, NULL, skb);
@@ -1299,7 +1299,8 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
- ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
+ ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
+ sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);
@@ -1904,15 +1905,16 @@ static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
}
#ifdef CONFIG_IPV6_ROUTE_INFO
-static struct rt6_info *rt6_get_route_info(struct net *net,
+static struct rt6_info *rt6_get_route_info(struct net_device *dev,
const struct in6_addr *prefix, int prefixlen,
- const struct in6_addr *gwaddr, int ifindex)
+ const struct in6_addr *gwaddr)
{
struct fib6_node *fn;
struct rt6_info *rt = NULL;
struct fib6_table *table;
- table = fib6_get_table(net, RT6_TABLE_INFO);
+ table = fib6_get_table(dev_net(dev),
+ addrconf_rt_table(dev, RT6_TABLE_INFO));
if (!table)
return NULL;
@@ -1922,7 +1924,7 @@ static struct rt6_info *rt6_get_route_info(struct net *net,
goto out;
for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
- if (rt->dst.dev->ifindex != ifindex)
+ if (rt->dst.dev->ifindex != dev->ifindex)
continue;
if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
continue;
@@ -1936,21 +1938,20 @@ out:
return rt;
}
-static struct rt6_info *rt6_add_route_info(struct net *net,
+static struct rt6_info *rt6_add_route_info(struct net_device *dev,
const struct in6_addr *prefix, int prefixlen,
- const struct in6_addr *gwaddr, int ifindex,
- unsigned int pref)
+ const struct in6_addr *gwaddr, unsigned int pref)
{
struct fib6_config cfg = {
- .fc_table = RT6_TABLE_INFO,
+ .fc_table = addrconf_rt_table(dev, RT6_TABLE_INFO),
.fc_metric = IP6_RT_PRIO_USER,
- .fc_ifindex = ifindex,
+ .fc_ifindex = dev->ifindex,
.fc_dst_len = prefixlen,
.fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
RTF_UP | RTF_PREF(pref),
.fc_nlinfo.portid = 0,
.fc_nlinfo.nlh = NULL,
- .fc_nlinfo.nl_net = net,
+ .fc_nlinfo.nl_net = dev_net(dev),
};
cfg.fc_dst = *prefix;
@@ -1962,7 +1963,7 @@ static struct rt6_info *rt6_add_route_info(struct net *net,
ip6_route_add(&cfg);
- return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
+ return rt6_get_route_info(dev, prefix, prefixlen, gwaddr);
}
#endif
@@ -1971,7 +1972,8 @@ struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_dev
struct rt6_info *rt;
struct fib6_table *table;
- table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
+ table = fib6_get_table(dev_net(dev),
+ addrconf_rt_table(dev, RT6_TABLE_MAIN));
if (!table)
return NULL;
@@ -1993,7 +1995,7 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
unsigned int pref)
{
struct fib6_config cfg = {
- .fc_table = RT6_TABLE_DFLT,
+ .fc_table = addrconf_rt_table(dev, RT6_TABLE_DFLT),
.fc_metric = IP6_RT_PRIO_USER,
.fc_ifindex = dev->ifindex,
.fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
@@ -2010,28 +2012,17 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
return rt6_get_dflt_router(gwaddr, dev);
}
-void rt6_purge_dflt_routers(struct net *net)
-{
- struct rt6_info *rt;
- struct fib6_table *table;
- /* NOTE: Keep consistent with rt6_get_dflt_router */
- table = fib6_get_table(net, RT6_TABLE_DFLT);
- if (!table)
- return;
+int rt6_addrconf_purge(struct rt6_info *rt, void *arg) {
+ if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
+ (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2))
+ return -1;
+ return 0;
+}
-restart:
- read_lock_bh(&table->tb6_lock);
- for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
- if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
- (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
- dst_hold(&rt->dst);
- read_unlock_bh(&table->tb6_lock);
- ip6_del_rt(rt);
- goto restart;
- }
- }
- read_unlock_bh(&table->tb6_lock);
+void rt6_purge_dflt_routers(struct net *net)
+{
+ fib6_clean_all(net, rt6_addrconf_purge, NULL);
}
static void rtmsg_to_fib6_config(struct net *net,
@@ -2337,6 +2328,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
[RTA_PRIORITY] = { .type = NLA_U32 },
[RTA_METRICS] = { .type = NLA_NESTED },
[RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
+ [RTA_UID] = { .type = NLA_U32 },
};
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -2728,6 +2720,12 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
if (tb[RTA_MARK])
fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
+ if (tb[RTA_UID])
+ fl6.flowi6_uid = make_kuid(current_user_ns(),
+ nla_get_u32(tb[RTA_UID]));
+ else
+ fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
+
if (iif) {
struct net_device *dev;
int flags = 0;
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index aa9699301ea8..4218d9555ac2 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -247,6 +247,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
fl6.flowi6_mark = ireq->ir_mark;
fl6.fl6_dport = ireq->ir_rmt_port;
fl6.fl6_sport = inet_sk(sk)->inet_sport;
+ fl6.flowi6_uid = sk->sk_uid;
security_req_classify_flow(req, flowi6_to_flowi(&fl6));
dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 19fe7b789a72..ee4968737746 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -257,6 +257,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
fl6.flowi6_mark = sk->sk_mark;
fl6.fl6_dport = usin->sin6_port;
fl6.fl6_sport = inet->inet_sport;
+ fl6.flowi6_uid = sk->sk_uid;
opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk));
final_p = fl6_update_dst(&fl6, opt, &final);
@@ -878,6 +879,7 @@ static void tcp_v6_send_response(struct sock *sk, struct sk_buff *skb, u32 seq,
fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark);
fl6.fl6_dport = t1->dest;
fl6.fl6_sport = t1->source;
+ fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
/* Pass a socket to ip6_dst_lookup either it is for RST
@@ -1113,6 +1115,7 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newtp->af_specific = &tcp_sock_ipv6_mapped_specific;
#endif
+ newnp->ipv6_mc_list = NULL;
newnp->ipv6_ac_list = NULL;
newnp->ipv6_fl_list = NULL;
newnp->pktoptions = NULL;
@@ -1184,6 +1187,7 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
First: no IPv4 options.
*/
newinet->inet_opt = NULL;
+ newnp->ipv6_mc_list = NULL;
newnp->ipv6_ac_list = NULL;
newnp->ipv6_fl_list = NULL;
@@ -1945,6 +1949,7 @@ struct proto tcpv6_prot = {
.proto_cgroup = tcp_proto_cgroup,
#endif
.clear_sk = tcp_v6_clear_sk,
+ .diag_destroy = tcp_abort,
};
static const struct inet6_protocol tcpv6_protocol = {
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index a774afa4ce52..41778da4a032 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1218,6 +1218,7 @@ do_udp_sendmsg:
fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
fl6.flowi6_mark = sk->sk_mark;
+ fl6.flowi6_uid = sk->sk_uid;
if (msg->msg_controllen) {
opt = &opt_space;
@@ -1510,6 +1511,7 @@ struct proto udpv6_prot = {
.compat_getsockopt = compat_udpv6_getsockopt,
#endif
.clear_sk = udp_v6_clear_sk,
+ .diag_destroy = udp_abort,
};
static struct inet_protosw udpv6_protosw = {
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index cf0958712058..0bd7b17f4553 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -519,6 +519,7 @@ static int l2tp_ip6_sendmsg(struct kiocb *iocb, struct sock *sk,
memset(&fl6, 0, sizeof(fl6));
fl6.flowi6_mark = sk->sk_mark;
+ fl6.flowi6_uid = sk->sk_uid;
if (lsa) {
if (addr_len < SIN6_LEN_RFC2133)
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index ae5096ab65eb..cd5511ada239 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -198,7 +198,7 @@ config NF_CONNTRACK_FTP
config NF_CONNTRACK_H323
tristate "H.323 protocol support"
- depends on (IPV6 || IPV6=n)
+ depends on IPV6 || IPV6=n
depends on NETFILTER_ADVANCED
help
H.323 is a VoIP signalling protocol from ITU-T. As one of the most
@@ -705,7 +705,7 @@ config NETFILTER_XT_TARGET_HL
config NETFILTER_XT_TARGET_HMARK
tristate '"HMARK" target support'
- depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n)
+ depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n
depends on NETFILTER_ADVANCED
---help---
This option adds the "HMARK" target.
@@ -846,7 +846,7 @@ config NETFILTER_XT_TARGET_REDIRECT
config NETFILTER_XT_TARGET_TEE
tristate '"TEE" - packet cloning to alternate destination'
depends on NETFILTER_ADVANCED
- depends on (IPV6 || IPV6=n)
+ depends on IPV6 || IPV6=n
depends on !NF_CONNTRACK || NF_CONNTRACK
---help---
This option adds a "TEE" target with which a packet can be cloned and
@@ -856,7 +856,7 @@ config NETFILTER_XT_TARGET_TPROXY
tristate '"TPROXY" target transparent proxying support'
depends on NETFILTER_XTABLES
depends on NETFILTER_ADVANCED
- depends on (IPV6 || IPV6=n)
+ depends on IPV6 || IPV6=n
depends on IP_NF_MANGLE
select NF_DEFRAG_IPV4
select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES
@@ -895,7 +895,7 @@ config NETFILTER_XT_TARGET_SECMARK
config NETFILTER_XT_TARGET_TCPMSS
tristate '"TCPMSS" target support'
- depends on (IPV6 || IPV6=n)
+ depends on IPV6 || IPV6=n
default m if NETFILTER_ADVANCED=n
---help---
This option adds a `TCPMSS' target, which allows you to alter the
@@ -1107,7 +1107,7 @@ config NETFILTER_XT_MATCH_ESP
config NETFILTER_XT_MATCH_HASHLIMIT
tristate '"hashlimit" match support'
- depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n)
+ depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n
depends on NETFILTER_ADVANCED
help
This option adds a `hashlimit' match.
@@ -1255,6 +1255,8 @@ config NETFILTER_XT_MATCH_OWNER
based on who created the socket: the user or group. It is also
possible to check whether a socket actually exists.
+ Conflicts with '"quota, tag, uid" match'
+
config NETFILTER_XT_MATCH_POLICY
tristate 'IPsec "policy" match support'
depends on XFRM
@@ -1288,6 +1290,22 @@ config NETFILTER_XT_MATCH_PKTTYPE
To compile it as a module, choose M here. If unsure, say N.
+config NETFILTER_XT_MATCH_QTAGUID
+ bool '"quota, tag, owner" match and stats support'
+ depends on NETFILTER_XT_MATCH_SOCKET
+ depends on NETFILTER_XT_MATCH_OWNER=n
+ help
+ This option replaces the `owner' match. In addition to matching
+ on uid, it keeps stats based on a tag assigned to a socket.
+ The full tag is comprised of a UID and an accounting tag.
+ The tags are assignable to sockets from user space (e.g. a download
+ manager can assign the socket to another UID for accounting).
+ Stats and control are done via /proc/net/xt_qtaguid/.
+ It replaces owner as it takes the same arguments, but should
+ really be recognized by the iptables tool.
+
+ If unsure, say `N'.
+
config NETFILTER_XT_MATCH_QUOTA
tristate '"quota" match support'
depends on NETFILTER_ADVANCED
@@ -1298,6 +1316,29 @@ config NETFILTER_XT_MATCH_QUOTA
If you want to compile it as a module, say M here and read
<file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+config NETFILTER_XT_MATCH_QUOTA2
+ tristate '"quota2" match support'
+ depends on NETFILTER_ADVANCED
+ help
+ This option adds a `quota2' match, which allows to match on a
+ byte counter correctly and not per CPU.
+ It allows naming the quotas.
+ This is based on http://xtables-addons.git.sourceforge.net
+
+ If you want to compile it as a module, say M here and read
+ <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+
+config NETFILTER_XT_MATCH_QUOTA2_LOG
+ bool '"quota2" Netfilter LOG support'
+ depends on NETFILTER_XT_MATCH_QUOTA2
+ default n
+ help
+ This option allows `quota2' to log ONCE when a quota limit
+ is passed. It logs via NETLINK using the NETLINK_NFLOG family.
+ It logs similarly to how ipt_ULOG would without data.
+
+ If unsure, say `N'.
+
config NETFILTER_XT_MATCH_RATEEST
tristate '"rateest" match support'
depends on NETFILTER_ADVANCED
@@ -1349,7 +1390,7 @@ config NETFILTER_XT_MATCH_SOCKET
depends on NETFILTER_XTABLES
depends on NETFILTER_ADVANCED
depends on !NF_CONNTRACK || NF_CONNTRACK
- depends on (IPV6 || IPV6=n)
+ depends on IPV6 || IPV6=n
select NF_DEFRAG_IPV4
select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES
help
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index a9571be3f791..c07bfc8c77c0 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -155,7 +155,9 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_CGROUP) += xt_cgroup.o
obj-$(CONFIG_NETFILTER_XT_MATCH_PHYSDEV) += xt_physdev.o
obj-$(CONFIG_NETFILTER_XT_MATCH_PKTTYPE) += xt_pkttype.o
obj-$(CONFIG_NETFILTER_XT_MATCH_POLICY) += xt_policy.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_QTAGUID) += xt_qtaguid_print.o xt_qtaguid.o
obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA) += xt_quota.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA2) += xt_quota2.o
obj-$(CONFIG_NETFILTER_XT_MATCH_RATEEST) += xt_rateest.o
obj-$(CONFIG_NETFILTER_XT_MATCH_REALM) += xt_realm.o
obj-$(CONFIG_NETFILTER_XT_MATCH_RECENT) += xt_recent.o
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 98cd0e78c94c..7a775ab9551b 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -235,7 +235,7 @@ EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);
static void
clean_from_lists(struct nf_conn *ct)
{
- pr_debug("clean_from_lists(%p)\n", ct);
+ pr_debug("clean_from_lists(%pK)\n", ct);
hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);
@@ -294,7 +294,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
struct net *net = nf_ct_net(ct);
struct nf_conntrack_l4proto *l4proto;
- pr_debug("destroy_conntrack(%p)\n", ct);
+ pr_debug("destroy_conntrack(%pK)\n", ct);
NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
NF_CT_ASSERT(!timer_pending(&ct->timeout));
@@ -321,7 +321,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
if (ct->master)
nf_ct_put(ct->master);
- pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct);
+ pr_debug("destroy_conntrack: returning ct=%pK to slab\n", ct);
nf_conntrack_free(ct);
}
@@ -952,7 +952,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
spin_lock(&nf_conntrack_expect_lock);
exp = nf_ct_find_expectation(net, zone, tuple);
if (exp) {
- pr_debug("conntrack: expectation arrives ct=%p exp=%p\n",
+ pr_debug("conntrack: expectation arrives ct=%pK exp=%pK\n",
ct, exp);
/* Welcome, Mr. Bond. We've been expecting you... */
__set_bit(IPS_EXPECTED_BIT, &ct->status);
@@ -1041,14 +1041,14 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
} else {
/* Once we've had two way comms, always ESTABLISHED. */
if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
- pr_debug("nf_conntrack_in: normal packet for %p\n", ct);
+ pr_debug("nf_conntrack_in: normal packet for %pK\n", ct);
*ctinfo = IP_CT_ESTABLISHED;
} else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
- pr_debug("nf_conntrack_in: related packet for %p\n",
+ pr_debug("nf_conntrack_in: related packet for %pK\n",
ct);
*ctinfo = IP_CT_RELATED;
} else {
- pr_debug("nf_conntrack_in: new packet for %p\n", ct);
+ pr_debug("nf_conntrack_in: new packet for %pK\n", ct);
*ctinfo = IP_CT_NEW;
}
*set_reply = 0;
@@ -1190,7 +1190,7 @@ void nf_conntrack_alter_reply(struct nf_conn *ct,
/* Should be unconfirmed, so not in hash table yet */
NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
- pr_debug("Altering reply tuple of %p to ", ct);
+ pr_debug("Altering reply tuple of %pK to ", ct);
nf_ct_dump_tuple(newreply);
ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index e41bab38a3ca..daec7d6957a9 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -321,10 +321,12 @@ replay:
nlh = nlmsg_hdr(skb);
err = 0;
- if (nlmsg_len(nlh) < sizeof(struct nfgenmsg) ||
- skb->len < nlh->nlmsg_len) {
- err = -EINVAL;
- goto ack;
+ if (nlh->nlmsg_len < NLMSG_HDRLEN ||
+ skb->len < nlh->nlmsg_len ||
+ nlmsg_len(nlh) < sizeof(struct nfgenmsg)) {
+ nfnl_err_reset(&err_list);
+ success = false;
+ goto done;
}
/* Only requests are handled by the kernel */
diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c
index f407ebc13481..0975c993a94e 100644
--- a/net/netfilter/xt_IDLETIMER.c
+++ b/net/netfilter/xt_IDLETIMER.c
@@ -5,6 +5,7 @@
* After timer expires a kevent will be sent.
*
* Copyright (C) 2004, 2010 Nokia Corporation
+ *
* Written by Timo Teras <ext-timo.teras@nokia.com>
*
* Converted to x_tables and reworked for upstream inclusion
@@ -38,8 +39,16 @@
#include <linux/netfilter/xt_IDLETIMER.h>
#include <linux/kdev_t.h>
#include <linux/kobject.h>
+#include <linux/skbuff.h>
#include <linux/workqueue.h>
#include <linux/sysfs.h>
+#include <linux/rtc.h>
+#include <linux/time.h>
+#include <linux/math64.h>
+#include <linux/suspend.h>
+#include <linux/notifier.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
struct idletimer_tg_attr {
struct attribute attr;
@@ -55,14 +64,110 @@ struct idletimer_tg {
struct kobject *kobj;
struct idletimer_tg_attr attr;
+ struct timespec delayed_timer_trigger;
+ struct timespec last_modified_timer;
+ struct timespec last_suspend_time;
+ struct notifier_block pm_nb;
+
+ int timeout;
unsigned int refcnt;
+ bool work_pending;
+ bool send_nl_msg;
+ bool active;
+ uid_t uid;
};
static LIST_HEAD(idletimer_tg_list);
static DEFINE_MUTEX(list_mutex);
+static DEFINE_SPINLOCK(timestamp_lock);
static struct kobject *idletimer_tg_kobj;
+static bool check_for_delayed_trigger(struct idletimer_tg *timer,
+ struct timespec *ts)
+{
+ bool state;
+ struct timespec temp;
+ spin_lock_bh(&timestamp_lock);
+ timer->work_pending = false;
+ if ((ts->tv_sec - timer->last_modified_timer.tv_sec) > timer->timeout ||
+ timer->delayed_timer_trigger.tv_sec != 0) {
+ state = false;
+ temp.tv_sec = timer->timeout;
+ temp.tv_nsec = 0;
+ if (timer->delayed_timer_trigger.tv_sec != 0) {
+ temp = timespec_add(timer->delayed_timer_trigger, temp);
+ ts->tv_sec = temp.tv_sec;
+ ts->tv_nsec = temp.tv_nsec;
+ timer->delayed_timer_trigger.tv_sec = 0;
+ timer->work_pending = true;
+ schedule_work(&timer->work);
+ } else {
+ temp = timespec_add(timer->last_modified_timer, temp);
+ ts->tv_sec = temp.tv_sec;
+ ts->tv_nsec = temp.tv_nsec;
+ }
+ } else {
+ state = timer->active;
+ }
+ spin_unlock_bh(&timestamp_lock);
+ return state;
+}
+
+static void notify_netlink_uevent(const char *iface, struct idletimer_tg *timer)
+{
+ char iface_msg[NLMSG_MAX_SIZE];
+ char state_msg[NLMSG_MAX_SIZE];
+ char timestamp_msg[NLMSG_MAX_SIZE];
+ char uid_msg[NLMSG_MAX_SIZE];
+ char *envp[] = { iface_msg, state_msg, timestamp_msg, uid_msg, NULL };
+ int res;
+ struct timespec ts;
+ uint64_t time_ns;
+ bool state;
+
+ res = snprintf(iface_msg, NLMSG_MAX_SIZE, "INTERFACE=%s",
+ iface);
+ if (NLMSG_MAX_SIZE <= res) {
+ pr_err("message too long (%d)", res);
+ return;
+ }
+
+ get_monotonic_boottime(&ts);
+ state = check_for_delayed_trigger(timer, &ts);
+ res = snprintf(state_msg, NLMSG_MAX_SIZE, "STATE=%s",
+ state ? "active" : "inactive");
+
+ if (NLMSG_MAX_SIZE <= res) {
+ pr_err("message too long (%d)", res);
+ return;
+ }
+
+ if (state) {
+ res = snprintf(uid_msg, NLMSG_MAX_SIZE, "UID=%u", timer->uid);
+ if (NLMSG_MAX_SIZE <= res)
+ pr_err("message too long (%d)", res);
+ } else {
+ res = snprintf(uid_msg, NLMSG_MAX_SIZE, "UID=");
+ if (NLMSG_MAX_SIZE <= res)
+ pr_err("message too long (%d)", res);
+ }
+
+ time_ns = timespec_to_ns(&ts);
+ res = snprintf(timestamp_msg, NLMSG_MAX_SIZE, "TIME_NS=%llu", time_ns);
+ if (NLMSG_MAX_SIZE <= res) {
+ timestamp_msg[0] = '\0';
+ pr_err("message too long (%d)", res);
+ }
+
+ pr_debug("putting nlmsg: <%s> <%s> <%s> <%s>\n", iface_msg, state_msg,
+ timestamp_msg, uid_msg);
+ kobject_uevent_env(idletimer_tg_kobj, KOBJ_CHANGE, envp);
+ return;
+
+
+}
+
static
struct idletimer_tg *__idletimer_tg_find_by_label(const char *label)
{
@@ -83,6 +188,7 @@ static ssize_t idletimer_tg_show(struct kobject *kobj, struct attribute *attr,
{
struct idletimer_tg *timer;
unsigned long expires = 0;
+ unsigned long now = jiffies;
mutex_lock(&list_mutex);
@@ -92,11 +198,15 @@ static ssize_t idletimer_tg_show(struct kobject *kobj, struct attribute *attr,
mutex_unlock(&list_mutex);
- if (time_after(expires, jiffies))
+ if (time_after(expires, now))
return sprintf(buf, "%u\n",
- jiffies_to_msecs(expires - jiffies) / 1000);
+ jiffies_to_msecs(expires - now) / 1000);
- return sprintf(buf, "0\n");
+ if (timer->send_nl_msg)
+ return sprintf(buf, "0 %d\n",
+ jiffies_to_msecs(now - expires) / 1000);
+ else
+ return sprintf(buf, "0\n");
}
static void idletimer_tg_work(struct work_struct *work)
@@ -105,6 +215,9 @@ static void idletimer_tg_work(struct work_struct *work)
work);
sysfs_notify(idletimer_tg_kobj, NULL, timer->attr.attr.name);
+
+ if (timer->send_nl_msg)
+ notify_netlink_uevent(timer->attr.attr.name, timer);
}
static void idletimer_tg_expired(unsigned long data)
@@ -112,8 +225,55 @@ static void idletimer_tg_expired(unsigned long data)
struct idletimer_tg *timer = (struct idletimer_tg *) data;
pr_debug("timer %s expired\n", timer->attr.attr.name);
-
+ spin_lock_bh(&timestamp_lock);
+ timer->active = false;
+ timer->work_pending = true;
schedule_work(&timer->work);
+ spin_unlock_bh(&timestamp_lock);
+}
+
+static int idletimer_resume(struct notifier_block *notifier,
+ unsigned long pm_event, void *unused)
+{
+ struct timespec ts;
+ unsigned long time_diff, now = jiffies;
+ struct idletimer_tg *timer = container_of(notifier,
+ struct idletimer_tg, pm_nb);
+ if (!timer)
+ return NOTIFY_DONE;
+ switch (pm_event) {
+ case PM_SUSPEND_PREPARE:
+ get_monotonic_boottime(&timer->last_suspend_time);
+ break;
+ case PM_POST_SUSPEND:
+ spin_lock_bh(&timestamp_lock);
+ if (!timer->active) {
+ spin_unlock_bh(&timestamp_lock);
+ break;
+ }
+ /* since jiffies are not updated when suspended now represents
+ * the time it would have suspended */
+ if (time_after(timer->timer.expires, now)) {
+ get_monotonic_boottime(&ts);
+ ts = timespec_sub(ts, timer->last_suspend_time);
+ time_diff = timespec_to_jiffies(&ts);
+ if (timer->timer.expires > (time_diff + now)) {
+ mod_timer_pending(&timer->timer,
+ (timer->timer.expires - time_diff));
+ } else {
+ del_timer(&timer->timer);
+ timer->timer.expires = 0;
+ timer->active = false;
+ timer->work_pending = true;
+ schedule_work(&timer->work);
+ }
+ }
+ spin_unlock_bh(&timestamp_lock);
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_DONE;
}
static int idletimer_tg_create(struct idletimer_tg_info *info)
@@ -126,6 +286,7 @@ static int idletimer_tg_create(struct idletimer_tg_info *info)
goto out;
}
+ sysfs_attr_init(&info->timer->attr.attr);
info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL);
if (!info->timer->attr.attr.name) {
ret = -ENOMEM;
@@ -145,6 +306,21 @@ static int idletimer_tg_create(struct idletimer_tg_info *info)
setup_timer(&info->timer->timer, idletimer_tg_expired,
(unsigned long) info->timer);
info->timer->refcnt = 1;
+ info->timer->send_nl_msg = (info->send_nl_msg == 0) ? false : true;
+ info->timer->active = true;
+ info->timer->timeout = info->timeout;
+
+ info->timer->delayed_timer_trigger.tv_sec = 0;
+ info->timer->delayed_timer_trigger.tv_nsec = 0;
+ info->timer->work_pending = false;
+ info->timer->uid = 0;
+ get_monotonic_boottime(&info->timer->last_modified_timer);
+
+ info->timer->pm_nb.notifier_call = idletimer_resume;
+ ret = register_pm_notifier(&info->timer->pm_nb);
+ if (ret)
+ printk(KERN_WARNING "[%s] Failed to register pm notifier %d\n",
+ __func__, ret);
mod_timer(&info->timer->timer,
msecs_to_jiffies(info->timeout * 1000) + jiffies);
@@ -161,6 +337,42 @@ out:
return ret;
}
+static void reset_timer(const struct idletimer_tg_info *info,
+ struct sk_buff *skb)
+{
+ unsigned long now = jiffies;
+ struct idletimer_tg *timer = info->timer;
+ bool timer_prev;
+
+ spin_lock_bh(&timestamp_lock);
+ timer_prev = timer->active;
+ timer->active = true;
+ /* timer_prev is used to guard overflow problem in time_before*/
+ if (!timer_prev || time_before(timer->timer.expires, now)) {
+ pr_debug("Starting Checkentry timer (Expired, Jiffies): %lu, %lu\n",
+ timer->timer.expires, now);
+
+ /* Stores the uid resposible for waking up the radio */
+ if (skb && (skb->sk)) {
+ timer->uid = from_kuid_munged(current_user_ns(),
+ sock_i_uid(skb->sk));
+ }
+
+ /* checks if there is a pending inactive notification*/
+ if (timer->work_pending)
+ timer->delayed_timer_trigger = timer->last_modified_timer;
+ else {
+ timer->work_pending = true;
+ schedule_work(&timer->work);
+ }
+ }
+
+ get_monotonic_boottime(&timer->last_modified_timer);
+ mod_timer(&timer->timer,
+ msecs_to_jiffies(info->timeout * 1000) + now);
+ spin_unlock_bh(&timestamp_lock);
+}
+
/*
* The actual xt_tables plugin.
*/
@@ -168,15 +380,23 @@ static unsigned int idletimer_tg_target(struct sk_buff *skb,
const struct xt_action_param *par)
{
const struct idletimer_tg_info *info = par->targinfo;
+ unsigned long now = jiffies;
pr_debug("resetting timer %s, timeout period %u\n",
info->label, info->timeout);
BUG_ON(!info->timer);
- mod_timer(&info->timer->timer,
- msecs_to_jiffies(info->timeout * 1000) + jiffies);
+ info->timer->active = true;
+ if (time_before(info->timer->timer.expires, now)) {
+ schedule_work(&info->timer->work);
+ pr_debug("Starting timer %s (Expired, Jiffies): %lu, %lu\n",
+ info->label, info->timer->timer.expires, now);
+ }
+
+ /* TODO: Avoid modifying timers on each packet */
+ reset_timer(info, skb);
return XT_CONTINUE;
}
@@ -185,7 +405,7 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
struct idletimer_tg_info *info = par->targinfo;
int ret;
- pr_debug("checkentry targinfo%s\n", info->label);
+ pr_debug("checkentry targinfo %s\n", info->label);
if (info->timeout == 0) {
pr_debug("timeout value is zero\n");
@@ -204,9 +424,7 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
info->timer = __idletimer_tg_find_by_label(info->label);
if (info->timer) {
info->timer->refcnt++;
- mod_timer(&info->timer->timer,
- msecs_to_jiffies(info->timeout * 1000) + jiffies);
-
+ reset_timer(info, NULL);
pr_debug("increased refcnt of timer %s to %u\n",
info->label, info->timer->refcnt);
} else {
@@ -219,6 +437,7 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
}
mutex_unlock(&list_mutex);
+
return 0;
}
@@ -236,11 +455,12 @@ static void idletimer_tg_destroy(const struct xt_tgdtor_param *par)
list_del(&info->timer->entry);
del_timer_sync(&info->timer->timer);
sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr);
+ unregister_pm_notifier(&info->timer->pm_nb);
kfree(info->timer->attr.attr.name);
kfree(info->timer);
} else {
pr_debug("decreased refcnt of timer %s to %u\n",
- info->label, info->timer->refcnt);
+ info->label, info->timer->refcnt);
}
mutex_unlock(&list_mutex);
@@ -248,6 +468,7 @@ static void idletimer_tg_destroy(const struct xt_tgdtor_param *par)
static struct xt_target idletimer_tg __read_mostly = {
.name = "IDLETIMER",
+ .revision = 1,
.family = NFPROTO_UNSPEC,
.target = idletimer_tg_target,
.targetsize = sizeof(struct idletimer_tg_info),
@@ -313,3 +534,4 @@ MODULE_DESCRIPTION("Xtables: idle time monitor");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS("ipt_IDLETIMER");
MODULE_ALIAS("ip6t_IDLETIMER");
+MODULE_ALIAS("arpt_IDLETIMER");
diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c
new file mode 100644
index 000000000000..0d9aee7b09c8
--- /dev/null
+++ b/net/netfilter/xt_qtaguid.c
@@ -0,0 +1,3038 @@
+/*
+ * Kernel iptables module to track stats for packets based on user tags.
+ *
+ * (C) 2011 Google, Inc
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * There are run-time debug flags enabled via the debug_mask module param, or
+ * via the DEFAULT_DEBUG_MASK. See xt_qtaguid_internal.h.
+ */
+#define DEBUG
+
+#include <linux/file.h>
+#include <linux/inetdevice.h>
+#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_qtaguid.h>
+#include <linux/ratelimit.h>
+#include <linux/seq_file.h>
+#include <linux/skbuff.h>
+#include <linux/workqueue.h>
+#include <net/addrconf.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+
+#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#endif
+
+#include <linux/netfilter/xt_socket.h>
+#include "xt_qtaguid_internal.h"
+#include "xt_qtaguid_print.h"
+#include "../../fs/proc/internal.h"
+
+/*
+ * We only use the xt_socket funcs within a similar context to avoid unexpected
+ * return values.
+ */
+#define XT_SOCKET_SUPPORTED_HOOKS \
+ ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN))
+
+
+static const char *module_procdirname = "xt_qtaguid";
+static struct proc_dir_entry *xt_qtaguid_procdir;
+
+static unsigned int proc_iface_perms = S_IRUGO;
+module_param_named(iface_perms, proc_iface_perms, uint, S_IRUGO | S_IWUSR);
+
+static struct proc_dir_entry *xt_qtaguid_stats_file;
+static unsigned int proc_stats_perms = S_IRUGO;
+module_param_named(stats_perms, proc_stats_perms, uint, S_IRUGO | S_IWUSR);
+
+static struct proc_dir_entry *xt_qtaguid_ctrl_file;
+
+/* Everybody can write. But proc_ctrl_write_limited is true by default which
+ * limits what can be controlled. See the can_*() functions.
+ */
+static unsigned int proc_ctrl_perms = S_IRUGO | S_IWUGO;
+module_param_named(ctrl_perms, proc_ctrl_perms, uint, S_IRUGO | S_IWUSR);
+
+/* Limited by default, so the gid of the ctrl and stats proc entries
+ * will limit what can be done. See the can_*() functions.
+ */
+static bool proc_stats_readall_limited = true;
+static bool proc_ctrl_write_limited = true;
+
+module_param_named(stats_readall_limited, proc_stats_readall_limited, bool,
+ S_IRUGO | S_IWUSR);
+module_param_named(ctrl_write_limited, proc_ctrl_write_limited, bool,
+ S_IRUGO | S_IWUSR);
+
+/*
+ * Limit the number of active tags (via socket tags) for a given UID.
+ * Multiple processes could share the UID.
+ */
+static int max_sock_tags = DEFAULT_MAX_SOCK_TAGS;
+module_param(max_sock_tags, int, S_IRUGO | S_IWUSR);
+
+/*
+ * After the kernel has initiallized this module, it is still possible
+ * to make it passive.
+ * Setting passive to Y:
+ * - the iface stats handling will not act on notifications.
+ * - iptables matches will never match.
+ * - ctrl commands silently succeed.
+ * - stats are always empty.
+ * This is mostly usefull when a bug is suspected.
+ */
+static bool module_passive;
+module_param_named(passive, module_passive, bool, S_IRUGO | S_IWUSR);
+
+/*
+ * Control how qtaguid data is tracked per proc/uid.
+ * Setting tag_tracking_passive to Y:
+ * - don't create proc specific structs to track tags
+ * - don't check that active tag stats exceed some limits.
+ * - don't clean up socket tags on process exits.
+ * This is mostly usefull when a bug is suspected.
+ */
+static bool qtu_proc_handling_passive;
+module_param_named(tag_tracking_passive, qtu_proc_handling_passive, bool,
+ S_IRUGO | S_IWUSR);
+
+#define QTU_DEV_NAME "xt_qtaguid"
+
+uint qtaguid_debug_mask = DEFAULT_DEBUG_MASK;
+module_param_named(debug_mask, qtaguid_debug_mask, uint, S_IRUGO | S_IWUSR);
+
+/*---------------------------------------------------------------------------*/
+static const char *iface_stat_procdirname = "iface_stat";
+static struct proc_dir_entry *iface_stat_procdir;
+/*
+ * The iface_stat_all* will go away once userspace gets use to the new fields
+ * that have a format line.
+ */
+static const char *iface_stat_all_procfilename = "iface_stat_all";
+static struct proc_dir_entry *iface_stat_all_procfile;
+static const char *iface_stat_fmt_procfilename = "iface_stat_fmt";
+static struct proc_dir_entry *iface_stat_fmt_procfile;
+
+
+static LIST_HEAD(iface_stat_list);
+static DEFINE_SPINLOCK(iface_stat_list_lock);
+
+static struct rb_root sock_tag_tree = RB_ROOT;
+static DEFINE_SPINLOCK(sock_tag_list_lock);
+
+static struct rb_root tag_counter_set_tree = RB_ROOT;
+static DEFINE_SPINLOCK(tag_counter_set_list_lock);
+
+static struct rb_root uid_tag_data_tree = RB_ROOT;
+static DEFINE_SPINLOCK(uid_tag_data_tree_lock);
+
+static struct rb_root proc_qtu_data_tree = RB_ROOT;
+/* No proc_qtu_data_tree_lock; use uid_tag_data_tree_lock */
+
+static struct qtaguid_event_counts qtu_events;
+/*----------------------------------------------*/
+static bool can_manipulate_uids(void)
+{
+ /* root pwnd */
+ return in_egroup_p(xt_qtaguid_ctrl_file->gid)
+ || unlikely(!from_kuid(&init_user_ns, current_fsuid())) || unlikely(!proc_ctrl_write_limited)
+ || unlikely(uid_eq(current_fsuid(), xt_qtaguid_ctrl_file->uid));
+}
+
+static bool can_impersonate_uid(kuid_t uid)
+{
+ return uid_eq(uid, current_fsuid()) || can_manipulate_uids();
+}
+
+static bool can_read_other_uid_stats(kuid_t uid)
+{
+ /* root pwnd */
+ return in_egroup_p(xt_qtaguid_stats_file->gid)
+ || unlikely(!from_kuid(&init_user_ns, current_fsuid())) || uid_eq(uid, current_fsuid())
+ || unlikely(!proc_stats_readall_limited)
+ || unlikely(uid_eq(current_fsuid(), xt_qtaguid_ctrl_file->uid));
+}
+
+static inline void dc_add_byte_packets(struct data_counters *counters, int set,
+ enum ifs_tx_rx direction,
+ enum ifs_proto ifs_proto,
+ int bytes,
+ int packets)
+{
+ counters->bpc[set][direction][ifs_proto].bytes += bytes;
+ counters->bpc[set][direction][ifs_proto].packets += packets;
+}
+
+static struct tag_node *tag_node_tree_search(struct rb_root *root, tag_t tag)
+{
+ struct rb_node *node = root->rb_node;
+
+ while (node) {
+ struct tag_node *data = rb_entry(node, struct tag_node, node);
+ int result;
+ RB_DEBUG("qtaguid: tag_node_tree_search(0x%llx): "
+ " node=%p data=%p\n", tag, node, data);
+ result = tag_compare(tag, data->tag);
+ RB_DEBUG("qtaguid: tag_node_tree_search(0x%llx): "
+ " data.tag=0x%llx (uid=%u) res=%d\n",
+ tag, data->tag, get_uid_from_tag(data->tag), result);
+ if (result < 0)
+ node = node->rb_left;
+ else if (result > 0)
+ node = node->rb_right;
+ else
+ return data;
+ }
+ return NULL;
+}
+
+static void tag_node_tree_insert(struct tag_node *data, struct rb_root *root)
+{
+ struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+ /* Figure out where to put new node */
+ while (*new) {
+ struct tag_node *this = rb_entry(*new, struct tag_node,
+ node);
+ int result = tag_compare(data->tag, this->tag);
+ RB_DEBUG("qtaguid: %s(): tag=0x%llx"
+ " (uid=%u)\n", __func__,
+ this->tag,
+ get_uid_from_tag(this->tag));
+ parent = *new;
+ if (result < 0)
+ new = &((*new)->rb_left);
+ else if (result > 0)
+ new = &((*new)->rb_right);
+ else
+ BUG();
+ }
+
+ /* Add new node and rebalance tree. */
+ rb_link_node(&data->node, parent, new);
+ rb_insert_color(&data->node, root);
+}
+
+static void tag_stat_tree_insert(struct tag_stat *data, struct rb_root *root)
+{
+ tag_node_tree_insert(&data->tn, root);
+}
+
+static struct tag_stat *tag_stat_tree_search(struct rb_root *root, tag_t tag)
+{
+ struct tag_node *node = tag_node_tree_search(root, tag);
+ if (!node)
+ return NULL;
+ return rb_entry(&node->node, struct tag_stat, tn.node);
+}
+
+static void tag_counter_set_tree_insert(struct tag_counter_set *data,
+ struct rb_root *root)
+{
+ tag_node_tree_insert(&data->tn, root);
+}
+
+static struct tag_counter_set *tag_counter_set_tree_search(struct rb_root *root,
+ tag_t tag)
+{
+ struct tag_node *node = tag_node_tree_search(root, tag);
+ if (!node)
+ return NULL;
+ return rb_entry(&node->node, struct tag_counter_set, tn.node);
+
+}
+
+static void tag_ref_tree_insert(struct tag_ref *data, struct rb_root *root)
+{
+ tag_node_tree_insert(&data->tn, root);
+}
+
+static struct tag_ref *tag_ref_tree_search(struct rb_root *root, tag_t tag)
+{
+ struct tag_node *node = tag_node_tree_search(root, tag);
+ if (!node)
+ return NULL;
+ return rb_entry(&node->node, struct tag_ref, tn.node);
+}
+
+static struct sock_tag *sock_tag_tree_search(struct rb_root *root,
+ const struct sock *sk)
+{
+ struct rb_node *node = root->rb_node;
+
+ while (node) {
+ struct sock_tag *data = rb_entry(node, struct sock_tag,
+ sock_node);
+ if (sk < data->sk)
+ node = node->rb_left;
+ else if (sk > data->sk)
+ node = node->rb_right;
+ else
+ return data;
+ }
+ return NULL;
+}
+
+static void sock_tag_tree_insert(struct sock_tag *data, struct rb_root *root)
+{
+ struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+ /* Figure out where to put new node */
+ while (*new) {
+ struct sock_tag *this = rb_entry(*new, struct sock_tag,
+ sock_node);
+ parent = *new;
+ if (data->sk < this->sk)
+ new = &((*new)->rb_left);
+ else if (data->sk > this->sk)
+ new = &((*new)->rb_right);
+ else
+ BUG();
+ }
+
+ /* Add new node and rebalance tree. */
+ rb_link_node(&data->sock_node, parent, new);
+ rb_insert_color(&data->sock_node, root);
+}
+
+static void sock_tag_tree_erase(struct rb_root *st_to_free_tree)
+{
+ struct rb_node *node;
+ struct sock_tag *st_entry;
+
+ node = rb_first(st_to_free_tree);
+ while (node) {
+ st_entry = rb_entry(node, struct sock_tag, sock_node);
+ node = rb_next(node);
+ CT_DEBUG("qtaguid: %s(): "
+ "erase st: sk=%p tag=0x%llx (uid=%u)\n", __func__,
+ st_entry->sk,
+ st_entry->tag,
+ get_uid_from_tag(st_entry->tag));
+ rb_erase(&st_entry->sock_node, st_to_free_tree);
+ sock_put(st_entry->sk);
+ kfree(st_entry);
+ }
+}
+
+static struct proc_qtu_data *proc_qtu_data_tree_search(struct rb_root *root,
+ const pid_t pid)
+{
+ struct rb_node *node = root->rb_node;
+
+ while (node) {
+ struct proc_qtu_data *data = rb_entry(node,
+ struct proc_qtu_data,
+ node);
+ if (pid < data->pid)
+ node = node->rb_left;
+ else if (pid > data->pid)
+ node = node->rb_right;
+ else
+ return data;
+ }
+ return NULL;
+}
+
+static void proc_qtu_data_tree_insert(struct proc_qtu_data *data,
+ struct rb_root *root)
+{
+ struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+ /* Figure out where to put new node */
+ while (*new) {
+ struct proc_qtu_data *this = rb_entry(*new,
+ struct proc_qtu_data,
+ node);
+ parent = *new;
+ if (data->pid < this->pid)
+ new = &((*new)->rb_left);
+ else if (data->pid > this->pid)
+ new = &((*new)->rb_right);
+ else
+ BUG();
+ }
+
+ /* Add new node and rebalance tree. */
+ rb_link_node(&data->node, parent, new);
+ rb_insert_color(&data->node, root);
+}
+
+static void uid_tag_data_tree_insert(struct uid_tag_data *data,
+ struct rb_root *root)
+{
+ struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+ /* Figure out where to put new node */
+ while (*new) {
+ struct uid_tag_data *this = rb_entry(*new,
+ struct uid_tag_data,
+ node);
+ parent = *new;
+ if (data->uid < this->uid)
+ new = &((*new)->rb_left);
+ else if (data->uid > this->uid)
+ new = &((*new)->rb_right);
+ else
+ BUG();
+ }
+
+ /* Add new node and rebalance tree. */
+ rb_link_node(&data->node, parent, new);
+ rb_insert_color(&data->node, root);
+}
+
+static struct uid_tag_data *uid_tag_data_tree_search(struct rb_root *root,
+ uid_t uid)
+{
+ struct rb_node *node = root->rb_node;
+
+ while (node) {
+ struct uid_tag_data *data = rb_entry(node,
+ struct uid_tag_data,
+ node);
+ if (uid < data->uid)
+ node = node->rb_left;
+ else if (uid > data->uid)
+ node = node->rb_right;
+ else
+ return data;
+ }
+ return NULL;
+}
+
+/*
+ * Allocates a new uid_tag_data struct if needed.
+ * Returns a pointer to the found or allocated uid_tag_data.
+ * Returns a PTR_ERR on failures, and lock is not held.
+ * If found is not NULL:
+ * sets *found to true if not allocated.
+ * sets *found to false if allocated.
+ */
+struct uid_tag_data *get_uid_data(uid_t uid, bool *found_res)
+{
+ struct uid_tag_data *utd_entry;
+
+ /* Look for top level uid_tag_data for the UID */
+ utd_entry = uid_tag_data_tree_search(&uid_tag_data_tree, uid);
+ DR_DEBUG("qtaguid: get_uid_data(%u) utd=%p\n", uid, utd_entry);
+
+ if (found_res)
+ *found_res = utd_entry;
+ if (utd_entry)
+ return utd_entry;
+
+ utd_entry = kzalloc(sizeof(*utd_entry), GFP_ATOMIC);
+ if (!utd_entry) {
+ pr_err("qtaguid: get_uid_data(%u): "
+ "tag data alloc failed\n", uid);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ utd_entry->uid = uid;
+ utd_entry->tag_ref_tree = RB_ROOT;
+ uid_tag_data_tree_insert(utd_entry, &uid_tag_data_tree);
+ DR_DEBUG("qtaguid: get_uid_data(%u) new utd=%p\n", uid, utd_entry);
+ return utd_entry;
+}
+
+/* Never returns NULL. Either PTR_ERR or a valid ptr. */
+static struct tag_ref *new_tag_ref(tag_t new_tag,
+ struct uid_tag_data *utd_entry)
+{
+ struct tag_ref *tr_entry;
+ int res;
+
+ if (utd_entry->num_active_tags + 1 > max_sock_tags) {
+ pr_info("qtaguid: new_tag_ref(0x%llx): "
+ "tag ref alloc quota exceeded. max=%d\n",
+ new_tag, max_sock_tags);
+ res = -EMFILE;
+ goto err_res;
+
+ }
+
+ tr_entry = kzalloc(sizeof(*tr_entry), GFP_ATOMIC);
+ if (!tr_entry) {
+ pr_err("qtaguid: new_tag_ref(0x%llx): "
+ "tag ref alloc failed\n",
+ new_tag);
+ res = -ENOMEM;
+ goto err_res;
+ }
+ tr_entry->tn.tag = new_tag;
+ /* tr_entry->num_sock_tags handled by caller */
+ utd_entry->num_active_tags++;
+ tag_ref_tree_insert(tr_entry, &utd_entry->tag_ref_tree);
+ DR_DEBUG("qtaguid: new_tag_ref(0x%llx): "
+ " inserted new tag ref %p\n",
+ new_tag, tr_entry);
+ return tr_entry;
+
+err_res:
+ return ERR_PTR(res);
+}
+
+static struct tag_ref *lookup_tag_ref(tag_t full_tag,
+ struct uid_tag_data **utd_res)
+{
+ struct uid_tag_data *utd_entry;
+ struct tag_ref *tr_entry;
+ bool found_utd;
+ uid_t uid = get_uid_from_tag(full_tag);
+
+ DR_DEBUG("qtaguid: lookup_tag_ref(tag=0x%llx (uid=%u))\n",
+ full_tag, uid);
+
+ utd_entry = get_uid_data(uid, &found_utd);
+ if (IS_ERR_OR_NULL(utd_entry)) {
+ if (utd_res)
+ *utd_res = utd_entry;
+ return NULL;
+ }
+
+ tr_entry = tag_ref_tree_search(&utd_entry->tag_ref_tree, full_tag);
+ if (utd_res)
+ *utd_res = utd_entry;
+ DR_DEBUG("qtaguid: lookup_tag_ref(0x%llx) utd_entry=%p tr_entry=%p\n",
+ full_tag, utd_entry, tr_entry);
+ return tr_entry;
+}
+
+/* Never returns NULL. Either PTR_ERR or a valid ptr. */
+static struct tag_ref *get_tag_ref(tag_t full_tag,
+ struct uid_tag_data **utd_res)
+{
+ struct uid_tag_data *utd_entry;
+ struct tag_ref *tr_entry;
+
+ DR_DEBUG("qtaguid: get_tag_ref(0x%llx)\n",
+ full_tag);
+ spin_lock_bh(&uid_tag_data_tree_lock);
+ tr_entry = lookup_tag_ref(full_tag, &utd_entry);
+ BUG_ON(IS_ERR_OR_NULL(utd_entry));
+ if (!tr_entry)
+ tr_entry = new_tag_ref(full_tag, utd_entry);
+
+ spin_unlock_bh(&uid_tag_data_tree_lock);
+ if (utd_res)
+ *utd_res = utd_entry;
+ DR_DEBUG("qtaguid: get_tag_ref(0x%llx) utd=%p tr=%p\n",
+ full_tag, utd_entry, tr_entry);
+ return tr_entry;
+}
+
+/* Checks and maybe frees the UID Tag Data entry */
+static void put_utd_entry(struct uid_tag_data *utd_entry)
+{
+ /* Are we done with the UID tag data entry? */
+ if (RB_EMPTY_ROOT(&utd_entry->tag_ref_tree) &&
+ !utd_entry->num_pqd) {
+ DR_DEBUG("qtaguid: %s(): "
+ "erase utd_entry=%p uid=%u "
+ "by pid=%u tgid=%u uid=%u\n", __func__,
+ utd_entry, utd_entry->uid,
+ current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()));
+ BUG_ON(utd_entry->num_active_tags);
+ rb_erase(&utd_entry->node, &uid_tag_data_tree);
+ kfree(utd_entry);
+ } else {
+ DR_DEBUG("qtaguid: %s(): "
+ "utd_entry=%p still has %d tags %d proc_qtu_data\n",
+ __func__, utd_entry, utd_entry->num_active_tags,
+ utd_entry->num_pqd);
+ BUG_ON(!(utd_entry->num_active_tags ||
+ utd_entry->num_pqd));
+ }
+}
+
+/*
+ * If no sock_tags are using this tag_ref,
+ * decrements refcount of utd_entry, removes tr_entry
+ * from utd_entry->tag_ref_tree and frees.
+ */
+static void free_tag_ref_from_utd_entry(struct tag_ref *tr_entry,
+ struct uid_tag_data *utd_entry)
+{
+ DR_DEBUG("qtaguid: %s(): %p tag=0x%llx (uid=%u)\n", __func__,
+ tr_entry, tr_entry->tn.tag,
+ get_uid_from_tag(tr_entry->tn.tag));
+ if (!tr_entry->num_sock_tags) {
+ BUG_ON(!utd_entry->num_active_tags);
+ utd_entry->num_active_tags--;
+ rb_erase(&tr_entry->tn.node, &utd_entry->tag_ref_tree);
+ DR_DEBUG("qtaguid: %s(): erased %p\n", __func__, tr_entry);
+ kfree(tr_entry);
+ }
+}
+
+static void put_tag_ref_tree(tag_t full_tag, struct uid_tag_data *utd_entry)
+{
+ struct rb_node *node;
+ struct tag_ref *tr_entry;
+ tag_t acct_tag;
+
+ DR_DEBUG("qtaguid: %s(tag=0x%llx (uid=%u))\n", __func__,
+ full_tag, get_uid_from_tag(full_tag));
+ acct_tag = get_atag_from_tag(full_tag);
+ node = rb_first(&utd_entry->tag_ref_tree);
+ while (node) {
+ tr_entry = rb_entry(node, struct tag_ref, tn.node);
+ node = rb_next(node);
+ if (!acct_tag || tr_entry->tn.tag == full_tag)
+ free_tag_ref_from_utd_entry(tr_entry, utd_entry);
+ }
+}
+
+static ssize_t read_proc_u64(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ uint64_t *valuep = PDE_DATA(file_inode(file));
+ char tmp[24];
+ size_t tmp_size;
+
+ tmp_size = scnprintf(tmp, sizeof(tmp), "%llu\n", *valuep);
+ return simple_read_from_buffer(buf, size, ppos, tmp, tmp_size);
+}
+
+static ssize_t read_proc_bool(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ bool *valuep = PDE_DATA(file_inode(file));
+ char tmp[24];
+ size_t tmp_size;
+
+ tmp_size = scnprintf(tmp, sizeof(tmp), "%u\n", *valuep);
+ return simple_read_from_buffer(buf, size, ppos, tmp, tmp_size);
+}
+
+static int get_active_counter_set(tag_t tag)
+{
+ int active_set = 0;
+ struct tag_counter_set *tcs;
+
+ MT_DEBUG("qtaguid: get_active_counter_set(tag=0x%llx)"
+ " (uid=%u)\n",
+ tag, get_uid_from_tag(tag));
+ /* For now we only handle UID tags for active sets */
+ tag = get_utag_from_tag(tag);
+ spin_lock_bh(&tag_counter_set_list_lock);
+ tcs = tag_counter_set_tree_search(&tag_counter_set_tree, tag);
+ if (tcs)
+ active_set = tcs->active_set;
+ spin_unlock_bh(&tag_counter_set_list_lock);
+ return active_set;
+}
+
+/*
+ * Find the entry for tracking the specified interface.
+ * Caller must hold iface_stat_list_lock
+ */
+static struct iface_stat *get_iface_entry(const char *ifname)
+{
+ struct iface_stat *iface_entry;
+
+ /* Find the entry for tracking the specified tag within the interface */
+ if (ifname == NULL) {
+ pr_info("qtaguid: iface_stat: get() NULL device name\n");
+ return NULL;
+ }
+
+ /* Iterate over interfaces */
+ list_for_each_entry(iface_entry, &iface_stat_list, list) {
+ if (!strcmp(ifname, iface_entry->ifname))
+ goto done;
+ }
+ iface_entry = NULL;
+done:
+ return iface_entry;
+}
+
+/* This is for fmt2 only */
+static void pp_iface_stat_header(struct seq_file *m)
+{
+ seq_puts(m,
+ "ifname "
+ "total_skb_rx_bytes total_skb_rx_packets "
+ "total_skb_tx_bytes total_skb_tx_packets "
+ "rx_tcp_bytes rx_tcp_packets "
+ "rx_udp_bytes rx_udp_packets "
+ "rx_other_bytes rx_other_packets "
+ "tx_tcp_bytes tx_tcp_packets "
+ "tx_udp_bytes tx_udp_packets "
+ "tx_other_bytes tx_other_packets\n"
+ );
+}
+
+static void pp_iface_stat_line(struct seq_file *m,
+ struct iface_stat *iface_entry)
+{
+ struct data_counters *cnts;
+ int cnt_set = 0; /* We only use one set for the device */
+ cnts = &iface_entry->totals_via_skb;
+ seq_printf(m, "%s %llu %llu %llu %llu %llu %llu %llu %llu "
+ "%llu %llu %llu %llu %llu %llu %llu %llu\n",
+ iface_entry->ifname,
+ dc_sum_bytes(cnts, cnt_set, IFS_RX),
+ dc_sum_packets(cnts, cnt_set, IFS_RX),
+ dc_sum_bytes(cnts, cnt_set, IFS_TX),
+ dc_sum_packets(cnts, cnt_set, IFS_TX),
+ cnts->bpc[cnt_set][IFS_RX][IFS_TCP].bytes,
+ cnts->bpc[cnt_set][IFS_RX][IFS_TCP].packets,
+ cnts->bpc[cnt_set][IFS_RX][IFS_UDP].bytes,
+ cnts->bpc[cnt_set][IFS_RX][IFS_UDP].packets,
+ cnts->bpc[cnt_set][IFS_RX][IFS_PROTO_OTHER].bytes,
+ cnts->bpc[cnt_set][IFS_RX][IFS_PROTO_OTHER].packets,
+ cnts->bpc[cnt_set][IFS_TX][IFS_TCP].bytes,
+ cnts->bpc[cnt_set][IFS_TX][IFS_TCP].packets,
+ cnts->bpc[cnt_set][IFS_TX][IFS_UDP].bytes,
+ cnts->bpc[cnt_set][IFS_TX][IFS_UDP].packets,
+ cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].bytes,
+ cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].packets);
+}
+
+struct proc_iface_stat_fmt_info {
+ int fmt;
+};
+
+static void *iface_stat_fmt_proc_start(struct seq_file *m, loff_t *pos)
+{
+ struct proc_iface_stat_fmt_info *p = m->private;
+ loff_t n = *pos;
+
+ /*
+ * This lock will prevent iface_stat_update() from changing active,
+ * and in turn prevent an interface from unregistering itself.
+ */
+ spin_lock_bh(&iface_stat_list_lock);
+
+ if (unlikely(module_passive))
+ return NULL;
+
+ if (!n && p->fmt == 2)
+ pp_iface_stat_header(m);
+
+ return seq_list_start(&iface_stat_list, n);
+}
+
+static void *iface_stat_fmt_proc_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ return seq_list_next(p, &iface_stat_list, pos);
+}
+
+static void iface_stat_fmt_proc_stop(struct seq_file *m, void *p)
+{
+ spin_unlock_bh(&iface_stat_list_lock);
+}
+
+static int iface_stat_fmt_proc_show(struct seq_file *m, void *v)
+{
+ struct proc_iface_stat_fmt_info *p = m->private;
+ struct iface_stat *iface_entry;
+ struct rtnl_link_stats64 dev_stats, *stats;
+ struct rtnl_link_stats64 no_dev_stats = {0};
+
+
+ CT_DEBUG("qtaguid:proc iface_stat_fmt pid=%u tgid=%u uid=%u\n",
+ current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()));
+
+ iface_entry = list_entry(v, struct iface_stat, list);
+
+ if (iface_entry->active) {
+ stats = dev_get_stats(iface_entry->net_dev,
+ &dev_stats);
+ } else {
+ stats = &no_dev_stats;
+ }
+ /*
+ * If the meaning of the data changes, then update the fmtX
+ * string.
+ */
+ if (p->fmt == 1) {
+ seq_printf(m, "%s %d %llu %llu %llu %llu %llu %llu %llu %llu\n",
+ iface_entry->ifname,
+ iface_entry->active,
+ iface_entry->totals_via_dev[IFS_RX].bytes,
+ iface_entry->totals_via_dev[IFS_RX].packets,
+ iface_entry->totals_via_dev[IFS_TX].bytes,
+ iface_entry->totals_via_dev[IFS_TX].packets,
+ stats->rx_bytes, stats->rx_packets,
+ stats->tx_bytes, stats->tx_packets
+ );
+ } else {
+ pp_iface_stat_line(m, iface_entry);
+ }
+ return 0;
+}
+
+static const struct file_operations read_u64_fops = {
+ .read = read_proc_u64,
+ .llseek = default_llseek,
+};
+
+static const struct file_operations read_bool_fops = {
+ .read = read_proc_bool,
+ .llseek = default_llseek,
+};
+
+static void iface_create_proc_worker(struct work_struct *work)
+{
+ struct proc_dir_entry *proc_entry;
+ struct iface_stat_work *isw = container_of(work, struct iface_stat_work,
+ iface_work);
+ struct iface_stat *new_iface = isw->iface_entry;
+
+ /* iface_entries are not deleted, so safe to manipulate. */
+ proc_entry = proc_mkdir(new_iface->ifname, iface_stat_procdir);
+ if (IS_ERR_OR_NULL(proc_entry)) {
+ pr_err("qtaguid: iface_stat: create_proc(): alloc failed.\n");
+ kfree(isw);
+ return;
+ }
+
+ new_iface->proc_ptr = proc_entry;
+
+ proc_create_data("tx_bytes", proc_iface_perms, proc_entry,
+ &read_u64_fops,
+ &new_iface->totals_via_dev[IFS_TX].bytes);
+ proc_create_data("rx_bytes", proc_iface_perms, proc_entry,
+ &read_u64_fops,
+ &new_iface->totals_via_dev[IFS_RX].bytes);
+ proc_create_data("tx_packets", proc_iface_perms, proc_entry,
+ &read_u64_fops,
+ &new_iface->totals_via_dev[IFS_TX].packets);
+ proc_create_data("rx_packets", proc_iface_perms, proc_entry,
+ &read_u64_fops,
+ &new_iface->totals_via_dev[IFS_RX].packets);
+ proc_create_data("active", proc_iface_perms, proc_entry,
+ &read_bool_fops, &new_iface->active);
+
+ IF_DEBUG("qtaguid: iface_stat: create_proc(): done "
+ "entry=%p dev=%s\n", new_iface, new_iface->ifname);
+ kfree(isw);
+}
+
+/*
+ * Will set the entry's active state, and
+ * update the net_dev accordingly also.
+ */
+static void _iface_stat_set_active(struct iface_stat *entry,
+ struct net_device *net_dev,
+ bool activate)
+{
+ if (activate) {
+ entry->net_dev = net_dev;
+ entry->active = true;
+ IF_DEBUG("qtaguid: %s(%s): "
+ "enable tracking. rfcnt=%d\n", __func__,
+ entry->ifname,
+ __this_cpu_read(*net_dev->pcpu_refcnt));
+ } else {
+ entry->active = false;
+ entry->net_dev = NULL;
+ IF_DEBUG("qtaguid: %s(%s): "
+ "disable tracking. rfcnt=%d\n", __func__,
+ entry->ifname,
+ __this_cpu_read(*net_dev->pcpu_refcnt));
+
+ }
+}
+
+/* Caller must hold iface_stat_list_lock */
+static struct iface_stat *iface_alloc(struct net_device *net_dev)
+{
+ struct iface_stat *new_iface;
+ struct iface_stat_work *isw;
+
+ new_iface = kzalloc(sizeof(*new_iface), GFP_ATOMIC);
+ if (new_iface == NULL) {
+ pr_err("qtaguid: iface_stat: create(%s): "
+ "iface_stat alloc failed\n", net_dev->name);
+ return NULL;
+ }
+ new_iface->ifname = kstrdup(net_dev->name, GFP_ATOMIC);
+ if (new_iface->ifname == NULL) {
+ pr_err("qtaguid: iface_stat: create(%s): "
+ "ifname alloc failed\n", net_dev->name);
+ kfree(new_iface);
+ return NULL;
+ }
+ spin_lock_init(&new_iface->tag_stat_list_lock);
+ new_iface->tag_stat_tree = RB_ROOT;
+ _iface_stat_set_active(new_iface, net_dev, true);
+
+ /*
+ * ipv6 notifier chains are atomic :(
+ * No create_proc_read_entry() for you!
+ */
+ isw = kmalloc(sizeof(*isw), GFP_ATOMIC);
+ if (!isw) {
+ pr_err("qtaguid: iface_stat: create(%s): "
+ "work alloc failed\n", new_iface->ifname);
+ _iface_stat_set_active(new_iface, net_dev, false);
+ kfree(new_iface->ifname);
+ kfree(new_iface);
+ return NULL;
+ }
+ isw->iface_entry = new_iface;
+ INIT_WORK(&isw->iface_work, iface_create_proc_worker);
+ schedule_work(&isw->iface_work);
+ list_add(&new_iface->list, &iface_stat_list);
+ return new_iface;
+}
+
+static void iface_check_stats_reset_and_adjust(struct net_device *net_dev,
+ struct iface_stat *iface)
+{
+ struct rtnl_link_stats64 dev_stats, *stats;
+ bool stats_rewound;
+
+ stats = dev_get_stats(net_dev, &dev_stats);
+ /* No empty packets */
+ stats_rewound =
+ (stats->rx_bytes < iface->last_known[IFS_RX].bytes)
+ || (stats->tx_bytes < iface->last_known[IFS_TX].bytes);
+
+ IF_DEBUG("qtaguid: %s(%s): iface=%p netdev=%p "
+ "bytes rx/tx=%llu/%llu "
+ "active=%d last_known=%d "
+ "stats_rewound=%d\n", __func__,
+ net_dev ? net_dev->name : "?",
+ iface, net_dev,
+ stats->rx_bytes, stats->tx_bytes,
+ iface->active, iface->last_known_valid, stats_rewound);
+
+ if (iface->active && iface->last_known_valid && stats_rewound) {
+ pr_warn_once("qtaguid: iface_stat: %s(%s): "
+ "iface reset its stats unexpectedly\n", __func__,
+ net_dev->name);
+
+ iface->totals_via_dev[IFS_TX].bytes +=
+ iface->last_known[IFS_TX].bytes;
+ iface->totals_via_dev[IFS_TX].packets +=
+ iface->last_known[IFS_TX].packets;
+ iface->totals_via_dev[IFS_RX].bytes +=
+ iface->last_known[IFS_RX].bytes;
+ iface->totals_via_dev[IFS_RX].packets +=
+ iface->last_known[IFS_RX].packets;
+ iface->last_known_valid = false;
+ IF_DEBUG("qtaguid: %s(%s): iface=%p "
+ "used last known bytes rx/tx=%llu/%llu\n", __func__,
+ iface->ifname, iface, iface->last_known[IFS_RX].bytes,
+ iface->last_known[IFS_TX].bytes);
+ }
+}
+
+/*
+ * Create a new entry for tracking the specified interface.
+ * Do nothing if the entry already exists.
+ * Called when an interface is configured with a valid IP address.
+ */
+static void iface_stat_create(struct net_device *net_dev,
+ struct in_ifaddr *ifa)
+{
+ struct in_device *in_dev = NULL;
+ const char *ifname;
+ struct iface_stat *entry;
+ __be32 ipaddr = 0;
+ struct iface_stat *new_iface;
+
+ IF_DEBUG("qtaguid: iface_stat: create(%s): ifa=%p netdev=%p\n",
+ net_dev ? net_dev->name : "?",
+ ifa, net_dev);
+ if (!net_dev) {
+ pr_err("qtaguid: iface_stat: create(): no net dev\n");
+ return;
+ }
+
+ ifname = net_dev->name;
+ if (!ifa) {
+ in_dev = in_dev_get(net_dev);
+ if (!in_dev) {
+ pr_err("qtaguid: iface_stat: create(%s): no inet dev\n",
+ ifname);
+ return;
+ }
+ IF_DEBUG("qtaguid: iface_stat: create(%s): in_dev=%p\n",
+ ifname, in_dev);
+ for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+ IF_DEBUG("qtaguid: iface_stat: create(%s): "
+ "ifa=%p ifa_label=%s\n",
+ ifname, ifa,
+ ifa->ifa_label ? ifa->ifa_label : "(null)");
+ if (ifa->ifa_label && !strcmp(ifname, ifa->ifa_label))
+ break;
+ }
+ }
+
+ if (!ifa) {
+ IF_DEBUG("qtaguid: iface_stat: create(%s): no matching IP\n",
+ ifname);
+ goto done_put;
+ }
+ ipaddr = ifa->ifa_local;
+
+ spin_lock_bh(&iface_stat_list_lock);
+ entry = get_iface_entry(ifname);
+ if (entry != NULL) {
+ IF_DEBUG("qtaguid: iface_stat: create(%s): entry=%p\n",
+ ifname, entry);
+ iface_check_stats_reset_and_adjust(net_dev, entry);
+ _iface_stat_set_active(entry, net_dev, true);
+ IF_DEBUG("qtaguid: %s(%s): "
+ "tracking now %d on ip=%pI4\n", __func__,
+ entry->ifname, true, &ipaddr);
+ goto done_unlock_put;
+ }
+
+ new_iface = iface_alloc(net_dev);
+ IF_DEBUG("qtaguid: iface_stat: create(%s): done "
+ "entry=%p ip=%pI4\n", ifname, new_iface, &ipaddr);
+done_unlock_put:
+ spin_unlock_bh(&iface_stat_list_lock);
+done_put:
+ if (in_dev)
+ in_dev_put(in_dev);
+}
+
+static void iface_stat_create_ipv6(struct net_device *net_dev,
+ struct inet6_ifaddr *ifa)
+{
+ struct in_device *in_dev;
+ const char *ifname;
+ struct iface_stat *entry;
+ struct iface_stat *new_iface;
+ int addr_type;
+
+ IF_DEBUG("qtaguid: iface_stat: create6(): ifa=%p netdev=%p->name=%s\n",
+ ifa, net_dev, net_dev ? net_dev->name : "");
+ if (!net_dev) {
+ pr_err("qtaguid: iface_stat: create6(): no net dev!\n");
+ return;
+ }
+ ifname = net_dev->name;
+
+ in_dev = in_dev_get(net_dev);
+ if (!in_dev) {
+ pr_err("qtaguid: iface_stat: create6(%s): no inet dev\n",
+ ifname);
+ return;
+ }
+
+ IF_DEBUG("qtaguid: iface_stat: create6(%s): in_dev=%p\n",
+ ifname, in_dev);
+
+ if (!ifa) {
+ IF_DEBUG("qtaguid: iface_stat: create6(%s): no matching IP\n",
+ ifname);
+ goto done_put;
+ }
+ addr_type = ipv6_addr_type(&ifa->addr);
+
+ spin_lock_bh(&iface_stat_list_lock);
+ entry = get_iface_entry(ifname);
+ if (entry != NULL) {
+ IF_DEBUG("qtaguid: %s(%s): entry=%p\n", __func__,
+ ifname, entry);
+ iface_check_stats_reset_and_adjust(net_dev, entry);
+ _iface_stat_set_active(entry, net_dev, true);
+ IF_DEBUG("qtaguid: %s(%s): "
+ "tracking now %d on ip=%pI6c\n", __func__,
+ entry->ifname, true, &ifa->addr);
+ goto done_unlock_put;
+ }
+
+ new_iface = iface_alloc(net_dev);
+ IF_DEBUG("qtaguid: iface_stat: create6(%s): done "
+ "entry=%p ip=%pI6c\n", ifname, new_iface, &ifa->addr);
+
+done_unlock_put:
+ spin_unlock_bh(&iface_stat_list_lock);
+done_put:
+ in_dev_put(in_dev);
+}
+
+static struct sock_tag *get_sock_stat_nl(const struct sock *sk)
+{
+ MT_DEBUG("qtaguid: get_sock_stat_nl(sk=%p)\n", sk);
+ return sock_tag_tree_search(&sock_tag_tree, sk);
+}
+
+static struct sock_tag *get_sock_stat(const struct sock *sk)
+{
+ struct sock_tag *sock_tag_entry;
+ MT_DEBUG("qtaguid: get_sock_stat(sk=%p)\n", sk);
+ if (!sk)
+ return NULL;
+ spin_lock_bh(&sock_tag_list_lock);
+ sock_tag_entry = get_sock_stat_nl(sk);
+ spin_unlock_bh(&sock_tag_list_lock);
+ return sock_tag_entry;
+}
+
+static int ipx_proto(const struct sk_buff *skb,
+ struct xt_action_param *par)
+{
+ int thoff = 0, tproto;
+
+ switch (par->family) {
+ case NFPROTO_IPV6:
+ tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL);
+ if (tproto < 0)
+ MT_DEBUG("%s(): transport header not found in ipv6"
+ " skb=%p\n", __func__, skb);
+ break;
+ case NFPROTO_IPV4:
+ tproto = ip_hdr(skb)->protocol;
+ break;
+ default:
+ tproto = IPPROTO_RAW;
+ }
+ return tproto;
+}
+
+static void
+data_counters_update(struct data_counters *dc, int set,
+ enum ifs_tx_rx direction, int proto, int bytes)
+{
+ switch (proto) {
+ case IPPROTO_TCP:
+ dc_add_byte_packets(dc, set, direction, IFS_TCP, bytes, 1);
+ break;
+ case IPPROTO_UDP:
+ dc_add_byte_packets(dc, set, direction, IFS_UDP, bytes, 1);
+ break;
+ case IPPROTO_IP:
+ default:
+ dc_add_byte_packets(dc, set, direction, IFS_PROTO_OTHER, bytes,
+ 1);
+ break;
+ }
+}
+
+/*
+ * Update stats for the specified interface. Do nothing if the entry
+ * does not exist (when a device was never configured with an IP address).
+ * Called when an device is being unregistered.
+ */
+static void iface_stat_update(struct net_device *net_dev, bool stash_only)
+{
+ struct rtnl_link_stats64 dev_stats, *stats;
+ struct iface_stat *entry;
+
+ stats = dev_get_stats(net_dev, &dev_stats);
+ spin_lock_bh(&iface_stat_list_lock);
+ entry = get_iface_entry(net_dev->name);
+ if (entry == NULL) {
+ IF_DEBUG("qtaguid: iface_stat: update(%s): not tracked\n",
+ net_dev->name);
+ spin_unlock_bh(&iface_stat_list_lock);
+ return;
+ }
+
+ IF_DEBUG("qtaguid: %s(%s): entry=%p\n", __func__,
+ net_dev->name, entry);
+ if (!entry->active) {
+ IF_DEBUG("qtaguid: %s(%s): already disabled\n", __func__,
+ net_dev->name);
+ spin_unlock_bh(&iface_stat_list_lock);
+ return;
+ }
+
+ if (stash_only) {
+ entry->last_known[IFS_TX].bytes = stats->tx_bytes;
+ entry->last_known[IFS_TX].packets = stats->tx_packets;
+ entry->last_known[IFS_RX].bytes = stats->rx_bytes;
+ entry->last_known[IFS_RX].packets = stats->rx_packets;
+ entry->last_known_valid = true;
+ IF_DEBUG("qtaguid: %s(%s): "
+ "dev stats stashed rx/tx=%llu/%llu\n", __func__,
+ net_dev->name, stats->rx_bytes, stats->tx_bytes);
+ spin_unlock_bh(&iface_stat_list_lock);
+ return;
+ }
+ entry->totals_via_dev[IFS_TX].bytes += stats->tx_bytes;
+ entry->totals_via_dev[IFS_TX].packets += stats->tx_packets;
+ entry->totals_via_dev[IFS_RX].bytes += stats->rx_bytes;
+ entry->totals_via_dev[IFS_RX].packets += stats->rx_packets;
+ /* We don't need the last_known[] anymore */
+ entry->last_known_valid = false;
+ _iface_stat_set_active(entry, net_dev, false);
+ IF_DEBUG("qtaguid: %s(%s): "
+ "disable tracking. rx/tx=%llu/%llu\n", __func__,
+ net_dev->name, stats->rx_bytes, stats->tx_bytes);
+ spin_unlock_bh(&iface_stat_list_lock);
+}
+
+/*
+ * Update stats for the specified interface from the skb.
+ * Do nothing if the entry
+ * does not exist (when a device was never configured with an IP address).
+ * Called on each sk.
+ */
+static void iface_stat_update_from_skb(const struct sk_buff *skb,
+ struct xt_action_param *par)
+{
+ struct iface_stat *entry;
+ const struct net_device *el_dev;
+ enum ifs_tx_rx direction = par->in ? IFS_RX : IFS_TX;
+ int bytes = skb->len;
+ int proto;
+
+ if (!skb->dev) {
+ MT_DEBUG("qtaguid[%d]: no skb->dev\n", par->hooknum);
+ el_dev = par->in ? : par->out;
+ } else {
+ const struct net_device *other_dev;
+ el_dev = skb->dev;
+ other_dev = par->in ? : par->out;
+ if (el_dev != other_dev) {
+ MT_DEBUG("qtaguid[%d]: skb->dev=%p %s vs "
+ "par->(in/out)=%p %s\n",
+ par->hooknum, el_dev, el_dev->name, other_dev,
+ other_dev->name);
+ }
+ }
+
+ if (unlikely(!el_dev)) {
+ pr_err_ratelimited("qtaguid[%d]: %s(): no par->in/out?!!\n",
+ par->hooknum, __func__);
+ BUG();
+ } else if (unlikely(!el_dev->name)) {
+ pr_err_ratelimited("qtaguid[%d]: %s(): no dev->name?!!\n",
+ par->hooknum, __func__);
+ BUG();
+ } else {
+ proto = ipx_proto(skb, par);
+ MT_DEBUG("qtaguid[%d]: dev name=%s type=%d fam=%d proto=%d\n",
+ par->hooknum, el_dev->name, el_dev->type,
+ par->family, proto);
+ }
+
+ spin_lock_bh(&iface_stat_list_lock);
+ entry = get_iface_entry(el_dev->name);
+ if (entry == NULL) {
+ IF_DEBUG("qtaguid: iface_stat: %s(%s): not tracked\n",
+ __func__, el_dev->name);
+ spin_unlock_bh(&iface_stat_list_lock);
+ return;
+ }
+
+ IF_DEBUG("qtaguid: %s(%s): entry=%p\n", __func__,
+ el_dev->name, entry);
+
+ data_counters_update(&entry->totals_via_skb, 0, direction, proto,
+ bytes);
+ spin_unlock_bh(&iface_stat_list_lock);
+}
+
+static void tag_stat_update(struct tag_stat *tag_entry,
+ enum ifs_tx_rx direction, int proto, int bytes)
+{
+ int active_set;
+ active_set = get_active_counter_set(tag_entry->tn.tag);
+ MT_DEBUG("qtaguid: tag_stat_update(tag=0x%llx (uid=%u) set=%d "
+ "dir=%d proto=%d bytes=%d)\n",
+ tag_entry->tn.tag, get_uid_from_tag(tag_entry->tn.tag),
+ active_set, direction, proto, bytes);
+ data_counters_update(&tag_entry->counters, active_set, direction,
+ proto, bytes);
+ if (tag_entry->parent_counters)
+ data_counters_update(tag_entry->parent_counters, active_set,
+ direction, proto, bytes);
+}
+
+/*
+ * Create a new entry for tracking the specified {acct_tag,uid_tag} within
+ * the interface.
+ * iface_entry->tag_stat_list_lock should be held.
+ */
+static struct tag_stat *create_if_tag_stat(struct iface_stat *iface_entry,
+ tag_t tag)
+{
+ struct tag_stat *new_tag_stat_entry = NULL;
+ IF_DEBUG("qtaguid: iface_stat: %s(): ife=%p tag=0x%llx"
+ " (uid=%u)\n", __func__,
+ iface_entry, tag, get_uid_from_tag(tag));
+ new_tag_stat_entry = kzalloc(sizeof(*new_tag_stat_entry), GFP_ATOMIC);
+ if (!new_tag_stat_entry) {
+ pr_err("qtaguid: iface_stat: tag stat alloc failed\n");
+ goto done;
+ }
+ new_tag_stat_entry->tn.tag = tag;
+ tag_stat_tree_insert(new_tag_stat_entry, &iface_entry->tag_stat_tree);
+done:
+ return new_tag_stat_entry;
+}
+
+static void if_tag_stat_update(const char *ifname, uid_t uid,
+ const struct sock *sk, enum ifs_tx_rx direction,
+ int proto, int bytes)
+{
+ struct tag_stat *tag_stat_entry;
+ tag_t tag, acct_tag;
+ tag_t uid_tag;
+ struct data_counters *uid_tag_counters;
+ struct sock_tag *sock_tag_entry;
+ struct iface_stat *iface_entry;
+ struct tag_stat *new_tag_stat = NULL;
+ MT_DEBUG("qtaguid: if_tag_stat_update(ifname=%s "
+ "uid=%u sk=%p dir=%d proto=%d bytes=%d)\n",
+ ifname, uid, sk, direction, proto, bytes);
+
+ spin_lock_bh(&iface_stat_list_lock);
+ iface_entry = get_iface_entry(ifname);
+ if (!iface_entry) {
+ pr_err_ratelimited("qtaguid: iface_stat: stat_update() "
+ "%s not found\n", ifname);
+ spin_unlock_bh(&iface_stat_list_lock);
+ return;
+ }
+ /* It is ok to process data when an iface_entry is inactive */
+
+ MT_DEBUG("qtaguid: iface_stat: stat_update() dev=%s entry=%p\n",
+ ifname, iface_entry);
+
+ /*
+ * Look for a tagged sock.
+ * It will have an acct_uid.
+ */
+ sock_tag_entry = get_sock_stat(sk);
+ if (sock_tag_entry) {
+ tag = sock_tag_entry->tag;
+ acct_tag = get_atag_from_tag(tag);
+ uid_tag = get_utag_from_tag(tag);
+ } else {
+ acct_tag = make_atag_from_value(0);
+ tag = combine_atag_with_uid(acct_tag, uid);
+ uid_tag = make_tag_from_uid(uid);
+ }
+ MT_DEBUG("qtaguid: iface_stat: stat_update(): "
+ " looking for tag=0x%llx (uid=%u) in ife=%p\n",
+ tag, get_uid_from_tag(tag), iface_entry);
+ /* Loop over tag list under this interface for {acct_tag,uid_tag} */
+ spin_lock_bh(&iface_entry->tag_stat_list_lock);
+
+ tag_stat_entry = tag_stat_tree_search(&iface_entry->tag_stat_tree,
+ tag);
+ if (tag_stat_entry) {
+ /*
+ * Updating the {acct_tag, uid_tag} entry handles both stats:
+ * {0, uid_tag} will also get updated.
+ */
+ tag_stat_update(tag_stat_entry, direction, proto, bytes);
+ goto unlock;
+ }
+
+ /* Loop over tag list under this interface for {0,uid_tag} */
+ tag_stat_entry = tag_stat_tree_search(&iface_entry->tag_stat_tree,
+ uid_tag);
+ if (!tag_stat_entry) {
+ /* Here: the base uid_tag did not exist */
+ /*
+ * No parent counters. So
+ * - No {0, uid_tag} stats and no {acc_tag, uid_tag} stats.
+ */
+ new_tag_stat = create_if_tag_stat(iface_entry, uid_tag);
+ if (!new_tag_stat)
+ goto unlock;
+ uid_tag_counters = &new_tag_stat->counters;
+ } else {
+ uid_tag_counters = &tag_stat_entry->counters;
+ }
+
+ if (acct_tag) {
+ /* Create the child {acct_tag, uid_tag} and hook up parent. */
+ new_tag_stat = create_if_tag_stat(iface_entry, tag);
+ if (!new_tag_stat)
+ goto unlock;
+ new_tag_stat->parent_counters = uid_tag_counters;
+ } else {
+ /*
+ * For new_tag_stat to be still NULL here would require:
+ * {0, uid_tag} exists
+ * and {acct_tag, uid_tag} doesn't exist
+ * AND acct_tag == 0.
+ * Impossible. This reassures us that new_tag_stat
+ * below will always be assigned.
+ */
+ BUG_ON(!new_tag_stat);
+ }
+ tag_stat_update(new_tag_stat, direction, proto, bytes);
+unlock:
+ spin_unlock_bh(&iface_entry->tag_stat_list_lock);
+ spin_unlock_bh(&iface_stat_list_lock);
+}
+
+static int iface_netdev_event_handler(struct notifier_block *nb,
+ unsigned long event, void *ptr) {
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ if (unlikely(module_passive))
+ return NOTIFY_DONE;
+
+ IF_DEBUG("qtaguid: iface_stat: netdev_event(): "
+ "ev=0x%lx/%s netdev=%p->name=%s\n",
+ event, netdev_evt_str(event), dev, dev ? dev->name : "");
+
+ switch (event) {
+ case NETDEV_UP:
+ iface_stat_create(dev, NULL);
+ atomic64_inc(&qtu_events.iface_events);
+ break;
+ case NETDEV_DOWN:
+ case NETDEV_UNREGISTER:
+ iface_stat_update(dev, event == NETDEV_DOWN);
+ atomic64_inc(&qtu_events.iface_events);
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static int iface_inet6addr_event_handler(struct notifier_block *nb,
+ unsigned long event, void *ptr)
+{
+ struct inet6_ifaddr *ifa = ptr;
+ struct net_device *dev;
+
+ if (unlikely(module_passive))
+ return NOTIFY_DONE;
+
+ IF_DEBUG("qtaguid: iface_stat: inet6addr_event(): "
+ "ev=0x%lx/%s ifa=%p\n",
+ event, netdev_evt_str(event), ifa);
+
+ switch (event) {
+ case NETDEV_UP:
+ BUG_ON(!ifa || !ifa->idev);
+ dev = (struct net_device *)ifa->idev->dev;
+ iface_stat_create_ipv6(dev, ifa);
+ atomic64_inc(&qtu_events.iface_events);
+ break;
+ case NETDEV_DOWN:
+ case NETDEV_UNREGISTER:
+ BUG_ON(!ifa || !ifa->idev);
+ dev = (struct net_device *)ifa->idev->dev;
+ iface_stat_update(dev, event == NETDEV_DOWN);
+ atomic64_inc(&qtu_events.iface_events);
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static int iface_inetaddr_event_handler(struct notifier_block *nb,
+ unsigned long event, void *ptr)
+{
+ struct in_ifaddr *ifa = ptr;
+ struct net_device *dev;
+
+ if (unlikely(module_passive))
+ return NOTIFY_DONE;
+
+ IF_DEBUG("qtaguid: iface_stat: inetaddr_event(): "
+ "ev=0x%lx/%s ifa=%p\n",
+ event, netdev_evt_str(event), ifa);
+
+ switch (event) {
+ case NETDEV_UP:
+ BUG_ON(!ifa || !ifa->ifa_dev);
+ dev = ifa->ifa_dev->dev;
+ iface_stat_create(dev, ifa);
+ atomic64_inc(&qtu_events.iface_events);
+ break;
+ case NETDEV_DOWN:
+ case NETDEV_UNREGISTER:
+ BUG_ON(!ifa || !ifa->ifa_dev);
+ dev = ifa->ifa_dev->dev;
+ iface_stat_update(dev, event == NETDEV_DOWN);
+ atomic64_inc(&qtu_events.iface_events);
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block iface_netdev_notifier_blk = {
+ .notifier_call = iface_netdev_event_handler,
+};
+
+static struct notifier_block iface_inetaddr_notifier_blk = {
+ .notifier_call = iface_inetaddr_event_handler,
+};
+
+static struct notifier_block iface_inet6addr_notifier_blk = {
+ .notifier_call = iface_inet6addr_event_handler,
+};
+
+static const struct seq_operations iface_stat_fmt_proc_seq_ops = {
+ .start = iface_stat_fmt_proc_start,
+ .next = iface_stat_fmt_proc_next,
+ .stop = iface_stat_fmt_proc_stop,
+ .show = iface_stat_fmt_proc_show,
+};
+
+static int proc_iface_stat_fmt_open(struct inode *inode, struct file *file)
+{
+ struct proc_iface_stat_fmt_info *s;
+
+ s = __seq_open_private(file, &iface_stat_fmt_proc_seq_ops,
+ sizeof(struct proc_iface_stat_fmt_info));
+ if (!s)
+ return -ENOMEM;
+
+ s->fmt = (uintptr_t)PDE_DATA(inode);
+ return 0;
+}
+
+static const struct file_operations proc_iface_stat_fmt_fops = {
+ .open = proc_iface_stat_fmt_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+static int __init iface_stat_init(struct proc_dir_entry *parent_procdir)
+{
+ int err;
+
+ iface_stat_procdir = proc_mkdir(iface_stat_procdirname, parent_procdir);
+ if (!iface_stat_procdir) {
+ pr_err("qtaguid: iface_stat: init failed to create proc entry\n");
+ err = -1;
+ goto err;
+ }
+
+ iface_stat_all_procfile = proc_create_data(iface_stat_all_procfilename,
+ proc_iface_perms,
+ parent_procdir,
+ &proc_iface_stat_fmt_fops,
+ (void *)1 /* fmt1 */);
+ if (!iface_stat_all_procfile) {
+ pr_err("qtaguid: iface_stat: init "
+ " failed to create stat_old proc entry\n");
+ err = -1;
+ goto err_zap_entry;
+ }
+
+ iface_stat_fmt_procfile = proc_create_data(iface_stat_fmt_procfilename,
+ proc_iface_perms,
+ parent_procdir,
+ &proc_iface_stat_fmt_fops,
+ (void *)2 /* fmt2 */);
+ if (!iface_stat_fmt_procfile) {
+ pr_err("qtaguid: iface_stat: init "
+ " failed to create stat_all proc entry\n");
+ err = -1;
+ goto err_zap_all_stats_entry;
+ }
+
+
+ err = register_netdevice_notifier(&iface_netdev_notifier_blk);
+ if (err) {
+ pr_err("qtaguid: iface_stat: init "
+ "failed to register dev event handler\n");
+ goto err_zap_all_stats_entries;
+ }
+ err = register_inetaddr_notifier(&iface_inetaddr_notifier_blk);
+ if (err) {
+ pr_err("qtaguid: iface_stat: init "
+ "failed to register ipv4 dev event handler\n");
+ goto err_unreg_nd;
+ }
+
+ err = register_inet6addr_notifier(&iface_inet6addr_notifier_blk);
+ if (err) {
+ pr_err("qtaguid: iface_stat: init "
+ "failed to register ipv6 dev event handler\n");
+ goto err_unreg_ip4_addr;
+ }
+ return 0;
+
+err_unreg_ip4_addr:
+ unregister_inetaddr_notifier(&iface_inetaddr_notifier_blk);
+err_unreg_nd:
+ unregister_netdevice_notifier(&iface_netdev_notifier_blk);
+err_zap_all_stats_entries:
+ remove_proc_entry(iface_stat_fmt_procfilename, parent_procdir);
+err_zap_all_stats_entry:
+ remove_proc_entry(iface_stat_all_procfilename, parent_procdir);
+err_zap_entry:
+ remove_proc_entry(iface_stat_procdirname, parent_procdir);
+err:
+ return err;
+}
+
+static struct sock *qtaguid_find_sk(const struct sk_buff *skb,
+ struct xt_action_param *par)
+{
+ struct sock *sk;
+ unsigned int hook_mask = (1 << par->hooknum);
+
+ MT_DEBUG("qtaguid: find_sk(skb=%p) hooknum=%d family=%d\n", skb,
+ par->hooknum, par->family);
+
+ /*
+ * Let's not abuse the the xt_socket_get*_sk(), or else it will
+ * return garbage SKs.
+ */
+ if (!(hook_mask & XT_SOCKET_SUPPORTED_HOOKS))
+ return NULL;
+
+ switch (par->family) {
+ case NFPROTO_IPV6:
+ sk = xt_socket_get6_sk(skb, par);
+ break;
+ case NFPROTO_IPV4:
+ sk = xt_socket_get4_sk(skb, par);
+ break;
+ default:
+ return NULL;
+ }
+
+ if (sk) {
+ MT_DEBUG("qtaguid: %p->sk_proto=%u "
+ "->sk_state=%d\n", sk, sk->sk_protocol, sk->sk_state);
+ /*
+ * When in TCP_TIME_WAIT the sk is not a "struct sock" but
+ * "struct inet_timewait_sock" which is missing fields.
+ */
+ if (sk->sk_state == TCP_TIME_WAIT) {
+ if (sk != skb->sk)
+ sock_gen_put(sk);
+ sk = NULL;
+ }
+ }
+ return sk;
+}
+
+static void account_for_uid(const struct sk_buff *skb,
+ const struct sock *alternate_sk, uid_t uid,
+ struct xt_action_param *par)
+{
+ const struct net_device *el_dev;
+
+ if (!skb->dev) {
+ MT_DEBUG("qtaguid[%d]: no skb->dev\n", par->hooknum);
+ el_dev = par->in ? : par->out;
+ } else {
+ const struct net_device *other_dev;
+ el_dev = skb->dev;
+ other_dev = par->in ? : par->out;
+ if (el_dev != other_dev) {
+ MT_DEBUG("qtaguid[%d]: skb->dev=%p %s vs "
+ "par->(in/out)=%p %s\n",
+ par->hooknum, el_dev, el_dev->name, other_dev,
+ other_dev->name);
+ }
+ }
+
+ if (unlikely(!el_dev)) {
+ pr_info("qtaguid[%d]: no par->in/out?!!\n", par->hooknum);
+ } else if (unlikely(!el_dev->name)) {
+ pr_info("qtaguid[%d]: no dev->name?!!\n", par->hooknum);
+ } else {
+ int proto = ipx_proto(skb, par);
+ MT_DEBUG("qtaguid[%d]: dev name=%s type=%d fam=%d proto=%d\n",
+ par->hooknum, el_dev->name, el_dev->type,
+ par->family, proto);
+
+ if_tag_stat_update(el_dev->name, uid,
+ skb->sk ? skb->sk : alternate_sk,
+ par->in ? IFS_RX : IFS_TX,
+ proto, skb->len);
+ }
+}
+
+static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_qtaguid_match_info *info = par->matchinfo;
+ const struct file *filp;
+ bool got_sock = false;
+ struct sock *sk;
+ kuid_t sock_uid;
+ bool res;
+ bool set_sk_callback_lock = false;
+
+ if (unlikely(module_passive))
+ return (info->match ^ info->invert) == 0;
+
+ MT_DEBUG("qtaguid[%d]: entered skb=%p par->in=%p/out=%p fam=%d\n",
+ par->hooknum, skb, par->in, par->out, par->family);
+
+ atomic64_inc(&qtu_events.match_calls);
+ if (skb == NULL) {
+ res = (info->match ^ info->invert) == 0;
+ goto ret_res;
+ }
+
+ switch (par->hooknum) {
+ case NF_INET_PRE_ROUTING:
+ case NF_INET_POST_ROUTING:
+ atomic64_inc(&qtu_events.match_calls_prepost);
+ iface_stat_update_from_skb(skb, par);
+ /*
+ * We are done in pre/post. The skb will get processed
+ * further alter.
+ */
+ res = (info->match ^ info->invert);
+ goto ret_res;
+ break;
+ /* default: Fall through and do UID releated work */
+ }
+
+ sk = skb->sk;
+ /*
+ * When in TCP_TIME_WAIT the sk is not a "struct sock" but
+ * "struct inet_timewait_sock" which is missing fields.
+ * So we ignore it.
+ */
+ if (sk && sk->sk_state == TCP_TIME_WAIT)
+ sk = NULL;
+ if (sk == NULL) {
+ /*
+ * A missing sk->sk_socket happens when packets are in-flight
+ * and the matching socket is already closed and gone.
+ */
+ sk = qtaguid_find_sk(skb, par);
+ /*
+ * If we got the socket from the find_sk(), we will need to put
+ * it back, as nf_tproxy_get_sock_v4() got it.
+ */
+ got_sock = sk;
+ if (sk)
+ atomic64_inc(&qtu_events.match_found_sk_in_ct);
+ else
+ atomic64_inc(&qtu_events.match_found_no_sk_in_ct);
+ } else {
+ atomic64_inc(&qtu_events.match_found_sk);
+ }
+ MT_DEBUG("qtaguid[%d]: sk=%p got_sock=%d fam=%d proto=%d\n",
+ par->hooknum, sk, got_sock, par->family, ipx_proto(skb, par));
+ if (sk != NULL) {
+ set_sk_callback_lock = true;
+ read_lock_bh(&sk->sk_callback_lock);
+ MT_DEBUG("qtaguid[%d]: sk=%p->sk_socket=%p->file=%p\n",
+ par->hooknum, sk, sk->sk_socket,
+ sk->sk_socket ? sk->sk_socket->file : (void *)-1LL);
+ filp = sk->sk_socket ? sk->sk_socket->file : NULL;
+ MT_DEBUG("qtaguid[%d]: filp...uid=%u\n",
+ par->hooknum, filp ? from_kuid(&init_user_ns, filp->f_cred->fsuid) : -1);
+ }
+
+ if (sk == NULL || sk->sk_socket == NULL) {
+ /*
+ * Here, the qtaguid_find_sk() using connection tracking
+ * couldn't find the owner, so for now we just count them
+ * against the system.
+ */
+ /*
+ * TODO: unhack how to force just accounting.
+ * For now we only do iface stats when the uid-owner is not
+ * requested.
+ */
+ if (!(info->match & XT_QTAGUID_UID))
+ account_for_uid(skb, sk, 0, par);
+ MT_DEBUG("qtaguid[%d]: leaving (sk?sk->sk_socket)=%p\n",
+ par->hooknum,
+ sk ? sk->sk_socket : NULL);
+ res = (info->match ^ info->invert) == 0;
+ atomic64_inc(&qtu_events.match_no_sk);
+ goto put_sock_ret_res;
+ } else if (info->match & info->invert & XT_QTAGUID_SOCKET) {
+ res = false;
+ goto put_sock_ret_res;
+ }
+ filp = sk->sk_socket->file;
+ if (filp == NULL) {
+ MT_DEBUG("qtaguid[%d]: leaving filp=NULL\n", par->hooknum);
+ account_for_uid(skb, sk, 0, par);
+ res = ((info->match ^ info->invert) &
+ (XT_QTAGUID_UID | XT_QTAGUID_GID)) == 0;
+ atomic64_inc(&qtu_events.match_no_sk_file);
+ goto put_sock_ret_res;
+ }
+ sock_uid = filp->f_cred->fsuid;
+ /*
+ * TODO: unhack how to force just accounting.
+ * For now we only do iface stats when the uid-owner is not requested
+ */
+ if (!(info->match & XT_QTAGUID_UID))
+ account_for_uid(skb, sk, from_kuid(&init_user_ns, sock_uid), par);
+
+ /*
+ * The following two tests fail the match when:
+ * id not in range AND no inverted condition requested
+ * or id in range AND inverted condition requested
+ * Thus (!a && b) || (a && !b) == a ^ b
+ */
+ if (info->match & XT_QTAGUID_UID) {
+ kuid_t uid_min = make_kuid(&init_user_ns, info->uid_min);
+ kuid_t uid_max = make_kuid(&init_user_ns, info->uid_max);
+
+ if ((uid_gte(filp->f_cred->fsuid, uid_min) &&
+ uid_lte(filp->f_cred->fsuid, uid_max)) ^
+ !(info->invert & XT_QTAGUID_UID)) {
+ MT_DEBUG("qtaguid[%d]: leaving uid not matching\n",
+ par->hooknum);
+ res = false;
+ goto put_sock_ret_res;
+ }
+ }
+ if (info->match & XT_QTAGUID_GID) {
+ kgid_t gid_min = make_kgid(&init_user_ns, info->gid_min);
+ kgid_t gid_max = make_kgid(&init_user_ns, info->gid_max);
+
+ if ((gid_gte(filp->f_cred->fsgid, gid_min) &&
+ gid_lte(filp->f_cred->fsgid, gid_max)) ^
+ !(info->invert & XT_QTAGUID_GID)) {
+ MT_DEBUG("qtaguid[%d]: leaving gid not matching\n",
+ par->hooknum);
+ res = false;
+ goto put_sock_ret_res;
+ }
+ }
+ MT_DEBUG("qtaguid[%d]: leaving matched\n", par->hooknum);
+ res = true;
+
+put_sock_ret_res:
+ if (got_sock)
+ sock_gen_put(sk);
+ if (set_sk_callback_lock)
+ read_unlock_bh(&sk->sk_callback_lock);
+ret_res:
+ MT_DEBUG("qtaguid[%d]: left %d\n", par->hooknum, res);
+ return res;
+}
+
+#ifdef DDEBUG
+/*
+ * This function is not in xt_qtaguid_print.c because of locks visibility.
+ * The lock of sock_tag_list must be aquired before calling this function
+ */
+static void prdebug_full_state_locked(int indent_level, const char *fmt, ...)
+{
+ va_list args;
+ char *fmt_buff;
+ char *buff;
+
+ if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
+ return;
+
+ fmt_buff = kasprintf(GFP_ATOMIC,
+ "qtaguid: %s(): %s {\n", __func__, fmt);
+ BUG_ON(!fmt_buff);
+ va_start(args, fmt);
+ buff = kvasprintf(GFP_ATOMIC,
+ fmt_buff, args);
+ BUG_ON(!buff);
+ pr_debug("%s", buff);
+ kfree(fmt_buff);
+ kfree(buff);
+ va_end(args);
+
+ prdebug_sock_tag_tree(indent_level, &sock_tag_tree);
+
+ spin_lock_bh(&uid_tag_data_tree_lock);
+ prdebug_uid_tag_data_tree(indent_level, &uid_tag_data_tree);
+ prdebug_proc_qtu_data_tree(indent_level, &proc_qtu_data_tree);
+ spin_unlock_bh(&uid_tag_data_tree_lock);
+
+ spin_lock_bh(&iface_stat_list_lock);
+ prdebug_iface_stat_list(indent_level, &iface_stat_list);
+ spin_unlock_bh(&iface_stat_list_lock);
+
+ pr_debug("qtaguid: %s(): }\n", __func__);
+}
+#else
+static void prdebug_full_state_locked(int indent_level, const char *fmt, ...) {}
+#endif
+
+struct proc_ctrl_print_info {
+ struct sock *sk; /* socket found by reading to sk_pos */
+ loff_t sk_pos;
+};
+
+static void *qtaguid_ctrl_proc_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct proc_ctrl_print_info *pcpi = m->private;
+ struct sock_tag *sock_tag_entry = v;
+ struct rb_node *node;
+
+ (*pos)++;
+
+ if (!v || v == SEQ_START_TOKEN)
+ return NULL;
+
+ node = rb_next(&sock_tag_entry->sock_node);
+ if (!node) {
+ pcpi->sk = NULL;
+ sock_tag_entry = SEQ_START_TOKEN;
+ } else {
+ sock_tag_entry = rb_entry(node, struct sock_tag, sock_node);
+ pcpi->sk = sock_tag_entry->sk;
+ }
+ pcpi->sk_pos = *pos;
+ return sock_tag_entry;
+}
+
+static void *qtaguid_ctrl_proc_start(struct seq_file *m, loff_t *pos)
+{
+ struct proc_ctrl_print_info *pcpi = m->private;
+ struct sock_tag *sock_tag_entry;
+ struct rb_node *node;
+
+ spin_lock_bh(&sock_tag_list_lock);
+
+ if (unlikely(module_passive))
+ return NULL;
+
+ if (*pos == 0) {
+ pcpi->sk_pos = 0;
+ node = rb_first(&sock_tag_tree);
+ if (!node) {
+ pcpi->sk = NULL;
+ return SEQ_START_TOKEN;
+ }
+ sock_tag_entry = rb_entry(node, struct sock_tag, sock_node);
+ pcpi->sk = sock_tag_entry->sk;
+ } else {
+ sock_tag_entry = (pcpi->sk ? get_sock_stat_nl(pcpi->sk) :
+ NULL) ?: SEQ_START_TOKEN;
+ if (*pos != pcpi->sk_pos) {
+ /* seq_read skipped a next call */
+ *pos = pcpi->sk_pos;
+ return qtaguid_ctrl_proc_next(m, sock_tag_entry, pos);
+ }
+ }
+ return sock_tag_entry;
+}
+
+static void qtaguid_ctrl_proc_stop(struct seq_file *m, void *v)
+{
+ spin_unlock_bh(&sock_tag_list_lock);
+}
+
+/*
+ * Procfs reader to get all active socket tags using style "1)" as described in
+ * fs/proc/generic.c
+ */
+static int qtaguid_ctrl_proc_show(struct seq_file *m, void *v)
+{
+ struct sock_tag *sock_tag_entry = v;
+ uid_t uid;
+
+ CT_DEBUG("qtaguid: proc ctrl pid=%u tgid=%u uid=%u\n",
+ current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()));
+
+ if (sock_tag_entry != SEQ_START_TOKEN) {
+ int sk_ref_count;
+ uid = get_uid_from_tag(sock_tag_entry->tag);
+ CT_DEBUG("qtaguid: proc_read(): sk=%p tag=0x%llx (uid=%u) "
+ "pid=%u\n",
+ sock_tag_entry->sk,
+ sock_tag_entry->tag,
+ uid,
+ sock_tag_entry->pid
+ );
+ sk_ref_count = atomic_read(
+ &sock_tag_entry->sk->sk_refcnt);
+ seq_printf(m, "sock=%pK tag=0x%llx (uid=%u) pid=%u "
+ "f_count=%d\n",
+ sock_tag_entry->sk,
+ sock_tag_entry->tag, uid,
+ sock_tag_entry->pid, sk_ref_count);
+ } else {
+ seq_printf(m, "events: sockets_tagged=%llu "
+ "sockets_untagged=%llu "
+ "counter_set_changes=%llu "
+ "delete_cmds=%llu "
+ "iface_events=%llu "
+ "match_calls=%llu "
+ "match_calls_prepost=%llu "
+ "match_found_sk=%llu "
+ "match_found_sk_in_ct=%llu "
+ "match_found_no_sk_in_ct=%llu "
+ "match_no_sk=%llu "
+ "match_no_sk_file=%llu\n",
+ (u64)atomic64_read(&qtu_events.sockets_tagged),
+ (u64)atomic64_read(&qtu_events.sockets_untagged),
+ (u64)atomic64_read(&qtu_events.counter_set_changes),
+ (u64)atomic64_read(&qtu_events.delete_cmds),
+ (u64)atomic64_read(&qtu_events.iface_events),
+ (u64)atomic64_read(&qtu_events.match_calls),
+ (u64)atomic64_read(&qtu_events.match_calls_prepost),
+ (u64)atomic64_read(&qtu_events.match_found_sk),
+ (u64)atomic64_read(&qtu_events.match_found_sk_in_ct),
+ (u64)atomic64_read(&qtu_events.match_found_no_sk_in_ct),
+ (u64)atomic64_read(&qtu_events.match_no_sk),
+ (u64)atomic64_read(&qtu_events.match_no_sk_file));
+
+ /* Count the following as part of the last item_index. No need
+ * to lock the sock_tag_list here since it is already locked when
+ * starting the seq_file operation
+ */
+ prdebug_full_state_locked(0, "proc ctrl");
+ }
+
+ return 0;
+}
+
+/*
+ * Delete socket tags, and stat tags associated with a given
+ * accouting tag and uid.
+ */
+static int ctrl_cmd_delete(const char *input)
+{
+ char cmd;
+ int uid_int;
+ kuid_t uid;
+ uid_t entry_uid;
+ tag_t acct_tag;
+ tag_t tag;
+ int res, argc;
+ struct iface_stat *iface_entry;
+ struct rb_node *node;
+ struct sock_tag *st_entry;
+ struct rb_root st_to_free_tree = RB_ROOT;
+ struct tag_stat *ts_entry;
+ struct tag_counter_set *tcs_entry;
+ struct tag_ref *tr_entry;
+ struct uid_tag_data *utd_entry;
+
+ argc = sscanf(input, "%c %llu %u", &cmd, &acct_tag, &uid_int);
+ uid = make_kuid(&init_user_ns, uid_int);
+ CT_DEBUG("qtaguid: ctrl_delete(%s): argc=%d cmd=%c "
+ "user_tag=0x%llx uid=%u\n", input, argc, cmd,
+ acct_tag, uid_int);
+ if (argc < 2) {
+ res = -EINVAL;
+ goto err;
+ }
+ if (!valid_atag(acct_tag)) {
+ pr_info("qtaguid: ctrl_delete(%s): invalid tag\n", input);
+ res = -EINVAL;
+ goto err;
+ }
+ if (argc < 3) {
+ uid = current_fsuid();
+ uid_int = from_kuid(&init_user_ns, uid);
+ } else if (!can_impersonate_uid(uid)) {
+ pr_info("qtaguid: ctrl_delete(%s): "
+ "insufficient priv from pid=%u tgid=%u uid=%u\n",
+ input, current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()));
+ res = -EPERM;
+ goto err;
+ }
+
+ tag = combine_atag_with_uid(acct_tag, uid_int);
+ CT_DEBUG("qtaguid: ctrl_delete(%s): "
+ "looking for tag=0x%llx (uid=%u)\n",
+ input, tag, uid_int);
+
+ /* Delete socket tags */
+ spin_lock_bh(&sock_tag_list_lock);
+ node = rb_first(&sock_tag_tree);
+ while (node) {
+ st_entry = rb_entry(node, struct sock_tag, sock_node);
+ entry_uid = get_uid_from_tag(st_entry->tag);
+ node = rb_next(node);
+ if (entry_uid != uid_int)
+ continue;
+
+ CT_DEBUG("qtaguid: ctrl_delete(%s): st tag=0x%llx (uid=%u)\n",
+ input, st_entry->tag, entry_uid);
+
+ if (!acct_tag || st_entry->tag == tag) {
+ rb_erase(&st_entry->sock_node, &sock_tag_tree);
+ /* Can't sockfd_put() within spinlock, do it later. */
+ sock_tag_tree_insert(st_entry, &st_to_free_tree);
+ tr_entry = lookup_tag_ref(st_entry->tag, NULL);
+ BUG_ON(tr_entry->num_sock_tags <= 0);
+ tr_entry->num_sock_tags--;
+ /*
+ * TODO: remove if, and start failing.
+ * This is a hack to work around the fact that in some
+ * places we have "if (IS_ERR_OR_NULL(pqd_entry))"
+ * and are trying to work around apps
+ * that didn't open the /dev/xt_qtaguid.
+ */
+ if (st_entry->list.next && st_entry->list.prev)
+ list_del(&st_entry->list);
+ }
+ }
+ spin_unlock_bh(&sock_tag_list_lock);
+
+ sock_tag_tree_erase(&st_to_free_tree);
+
+ /* Delete tag counter-sets */
+ spin_lock_bh(&tag_counter_set_list_lock);
+ /* Counter sets are only on the uid tag, not full tag */
+ tcs_entry = tag_counter_set_tree_search(&tag_counter_set_tree, tag);
+ if (tcs_entry) {
+ CT_DEBUG("qtaguid: ctrl_delete(%s): "
+ "erase tcs: tag=0x%llx (uid=%u) set=%d\n",
+ input,
+ tcs_entry->tn.tag,
+ get_uid_from_tag(tcs_entry->tn.tag),
+ tcs_entry->active_set);
+ rb_erase(&tcs_entry->tn.node, &tag_counter_set_tree);
+ kfree(tcs_entry);
+ }
+ spin_unlock_bh(&tag_counter_set_list_lock);
+
+ /*
+ * If acct_tag is 0, then all entries belonging to uid are
+ * erased.
+ */
+ spin_lock_bh(&iface_stat_list_lock);
+ list_for_each_entry(iface_entry, &iface_stat_list, list) {
+ spin_lock_bh(&iface_entry->tag_stat_list_lock);
+ node = rb_first(&iface_entry->tag_stat_tree);
+ while (node) {
+ ts_entry = rb_entry(node, struct tag_stat, tn.node);
+ entry_uid = get_uid_from_tag(ts_entry->tn.tag);
+ node = rb_next(node);
+
+ CT_DEBUG("qtaguid: ctrl_delete(%s): "
+ "ts tag=0x%llx (uid=%u)\n",
+ input, ts_entry->tn.tag, entry_uid);
+
+ if (entry_uid != uid_int)
+ continue;
+ if (!acct_tag || ts_entry->tn.tag == tag) {
+ CT_DEBUG("qtaguid: ctrl_delete(%s): "
+ "erase ts: %s 0x%llx %u\n",
+ input, iface_entry->ifname,
+ get_atag_from_tag(ts_entry->tn.tag),
+ entry_uid);
+ rb_erase(&ts_entry->tn.node,
+ &iface_entry->tag_stat_tree);
+ kfree(ts_entry);
+ }
+ }
+ spin_unlock_bh(&iface_entry->tag_stat_list_lock);
+ }
+ spin_unlock_bh(&iface_stat_list_lock);
+
+ /* Cleanup the uid_tag_data */
+ spin_lock_bh(&uid_tag_data_tree_lock);
+ node = rb_first(&uid_tag_data_tree);
+ while (node) {
+ utd_entry = rb_entry(node, struct uid_tag_data, node);
+ entry_uid = utd_entry->uid;
+ node = rb_next(node);
+
+ CT_DEBUG("qtaguid: ctrl_delete(%s): "
+ "utd uid=%u\n",
+ input, entry_uid);
+
+ if (entry_uid != uid_int)
+ continue;
+ /*
+ * Go over the tag_refs, and those that don't have
+ * sock_tags using them are freed.
+ */
+ put_tag_ref_tree(tag, utd_entry);
+ put_utd_entry(utd_entry);
+ }
+ spin_unlock_bh(&uid_tag_data_tree_lock);
+
+ atomic64_inc(&qtu_events.delete_cmds);
+ res = 0;
+
+err:
+ return res;
+}
+
+static int ctrl_cmd_counter_set(const char *input)
+{
+ char cmd;
+ uid_t uid = 0;
+ tag_t tag;
+ int res, argc;
+ struct tag_counter_set *tcs;
+ int counter_set;
+
+ argc = sscanf(input, "%c %d %u", &cmd, &counter_set, &uid);
+ CT_DEBUG("qtaguid: ctrl_counterset(%s): argc=%d cmd=%c "
+ "set=%d uid=%u\n", input, argc, cmd,
+ counter_set, uid);
+ if (argc != 3) {
+ res = -EINVAL;
+ goto err;
+ }
+ if (counter_set < 0 || counter_set >= IFS_MAX_COUNTER_SETS) {
+ pr_info("qtaguid: ctrl_counterset(%s): invalid counter_set range\n",
+ input);
+ res = -EINVAL;
+ goto err;
+ }
+ if (!can_manipulate_uids()) {
+ pr_info("qtaguid: ctrl_counterset(%s): "
+ "insufficient priv from pid=%u tgid=%u uid=%u\n",
+ input, current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()));
+ res = -EPERM;
+ goto err;
+ }
+
+ tag = make_tag_from_uid(uid);
+ spin_lock_bh(&tag_counter_set_list_lock);
+ tcs = tag_counter_set_tree_search(&tag_counter_set_tree, tag);
+ if (!tcs) {
+ tcs = kzalloc(sizeof(*tcs), GFP_ATOMIC);
+ if (!tcs) {
+ spin_unlock_bh(&tag_counter_set_list_lock);
+ pr_err("qtaguid: ctrl_counterset(%s): "
+ "failed to alloc counter set\n",
+ input);
+ res = -ENOMEM;
+ goto err;
+ }
+ tcs->tn.tag = tag;
+ tag_counter_set_tree_insert(tcs, &tag_counter_set_tree);
+ CT_DEBUG("qtaguid: ctrl_counterset(%s): added tcs tag=0x%llx "
+ "(uid=%u) set=%d\n",
+ input, tag, get_uid_from_tag(tag), counter_set);
+ }
+ tcs->active_set = counter_set;
+ spin_unlock_bh(&tag_counter_set_list_lock);
+ atomic64_inc(&qtu_events.counter_set_changes);
+ res = 0;
+
+err:
+ return res;
+}
+
+static int ctrl_cmd_tag(const char *input)
+{
+ char cmd;
+ int sock_fd = 0;
+ kuid_t uid;
+ unsigned int uid_int = 0;
+ tag_t acct_tag = make_atag_from_value(0);
+ tag_t full_tag;
+ struct socket *el_socket;
+ int res, argc;
+ struct sock_tag *sock_tag_entry;
+ struct tag_ref *tag_ref_entry;
+ struct uid_tag_data *uid_tag_data_entry;
+ struct proc_qtu_data *pqd_entry;
+
+ /* Unassigned args will get defaulted later. */
+ argc = sscanf(input, "%c %d %llu %u", &cmd, &sock_fd, &acct_tag, &uid_int);
+ uid = make_kuid(&init_user_ns, uid_int);
+ CT_DEBUG("qtaguid: ctrl_tag(%s): argc=%d cmd=%c sock_fd=%d "
+ "acct_tag=0x%llx uid=%u\n", input, argc, cmd, sock_fd,
+ acct_tag, uid_int);
+ if (argc < 2) {
+ res = -EINVAL;
+ goto err;
+ }
+ el_socket = sockfd_lookup(sock_fd, &res); /* This locks the file */
+ if (!el_socket) {
+ pr_info("qtaguid: ctrl_tag(%s): failed to lookup"
+ " sock_fd=%d err=%d pid=%u tgid=%u uid=%u\n",
+ input, sock_fd, res, current->pid, current->tgid,
+ from_kuid(&init_user_ns, current_fsuid()));
+ goto err;
+ }
+ CT_DEBUG("qtaguid: ctrl_tag(%s): socket->...->sk_refcnt=%d ->sk=%p\n",
+ input, atomic_read(&el_socket->sk->sk_refcnt),
+ el_socket->sk);
+ if (argc < 3) {
+ acct_tag = make_atag_from_value(0);
+ } else if (!valid_atag(acct_tag)) {
+ pr_info("qtaguid: ctrl_tag(%s): invalid tag\n", input);
+ res = -EINVAL;
+ goto err_put;
+ }
+ CT_DEBUG("qtaguid: ctrl_tag(%s): "
+ "pid=%u tgid=%u uid=%u euid=%u fsuid=%u "
+ "ctrl.gid=%u in_group()=%d in_egroup()=%d\n",
+ input, current->pid, current->tgid,
+ from_kuid(&init_user_ns, current_uid()),
+ from_kuid(&init_user_ns, current_euid()),
+ from_kuid(&init_user_ns, current_fsuid()),
+ from_kgid(&init_user_ns, xt_qtaguid_ctrl_file->gid),
+ in_group_p(xt_qtaguid_ctrl_file->gid),
+ in_egroup_p(xt_qtaguid_ctrl_file->gid));
+ if (argc < 4) {
+ uid = current_fsuid();
+ uid_int = from_kuid(&init_user_ns, uid);
+ } else if (!can_impersonate_uid(uid)) {
+ pr_info("qtaguid: ctrl_tag(%s): "
+ "insufficient priv from pid=%u tgid=%u uid=%u\n",
+ input, current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()));
+ res = -EPERM;
+ goto err_put;
+ }
+ full_tag = combine_atag_with_uid(acct_tag, uid_int);
+
+ spin_lock_bh(&sock_tag_list_lock);
+ sock_tag_entry = get_sock_stat_nl(el_socket->sk);
+ tag_ref_entry = get_tag_ref(full_tag, &uid_tag_data_entry);
+ if (IS_ERR(tag_ref_entry)) {
+ res = PTR_ERR(tag_ref_entry);
+ spin_unlock_bh(&sock_tag_list_lock);
+ goto err_put;
+ }
+ tag_ref_entry->num_sock_tags++;
+ if (sock_tag_entry) {
+ struct tag_ref *prev_tag_ref_entry;
+
+ CT_DEBUG("qtaguid: ctrl_tag(%s): retag for sk=%p "
+ "st@%p ...->sk_refcnt=%d\n",
+ input, el_socket->sk, sock_tag_entry,
+ atomic_read(&el_socket->sk->sk_refcnt));
+ prev_tag_ref_entry = lookup_tag_ref(sock_tag_entry->tag,
+ &uid_tag_data_entry);
+ BUG_ON(IS_ERR_OR_NULL(prev_tag_ref_entry));
+ BUG_ON(prev_tag_ref_entry->num_sock_tags <= 0);
+ prev_tag_ref_entry->num_sock_tags--;
+ sock_tag_entry->tag = full_tag;
+ } else {
+ CT_DEBUG("qtaguid: ctrl_tag(%s): newtag for sk=%p\n",
+ input, el_socket->sk);
+ sock_tag_entry = kzalloc(sizeof(*sock_tag_entry),
+ GFP_ATOMIC);
+ if (!sock_tag_entry) {
+ pr_err("qtaguid: ctrl_tag(%s): "
+ "socket tag alloc failed\n",
+ input);
+ spin_unlock_bh(&sock_tag_list_lock);
+ res = -ENOMEM;
+ goto err_tag_unref_put;
+ }
+ /*
+ * Hold the sk refcount here to make sure the sk pointer cannot
+ * be freed and reused
+ */
+ sock_hold(el_socket->sk);
+ sock_tag_entry->sk = el_socket->sk;
+ sock_tag_entry->pid = current->tgid;
+ sock_tag_entry->tag = combine_atag_with_uid(acct_tag, uid_int);
+ spin_lock_bh(&uid_tag_data_tree_lock);
+ pqd_entry = proc_qtu_data_tree_search(
+ &proc_qtu_data_tree, current->tgid);
+ /*
+ * TODO: remove if, and start failing.
+ * At first, we want to catch user-space code that is not
+ * opening the /dev/xt_qtaguid.
+ */
+ if (IS_ERR_OR_NULL(pqd_entry))
+ pr_warn_once(
+ "qtaguid: %s(): "
+ "User space forgot to open /dev/xt_qtaguid? "
+ "pid=%u tgid=%u uid=%u\n", __func__,
+ current->pid, current->tgid,
+ from_kuid(&init_user_ns, current_fsuid()));
+ else
+ list_add(&sock_tag_entry->list,
+ &pqd_entry->sock_tag_list);
+ spin_unlock_bh(&uid_tag_data_tree_lock);
+
+ sock_tag_tree_insert(sock_tag_entry, &sock_tag_tree);
+ atomic64_inc(&qtu_events.sockets_tagged);
+ }
+ spin_unlock_bh(&sock_tag_list_lock);
+ /* We keep the ref to the sk until it is untagged */
+ CT_DEBUG("qtaguid: ctrl_tag(%s): done st@%p ...->sk_refcnt=%d\n",
+ input, sock_tag_entry,
+ atomic_read(&el_socket->sk->sk_refcnt));
+ sockfd_put(el_socket);
+ return 0;
+
+err_tag_unref_put:
+ BUG_ON(tag_ref_entry->num_sock_tags <= 0);
+ tag_ref_entry->num_sock_tags--;
+ free_tag_ref_from_utd_entry(tag_ref_entry, uid_tag_data_entry);
+err_put:
+ CT_DEBUG("qtaguid: ctrl_tag(%s): done. ...->sk_refcnt=%d\n",
+ input, atomic_read(&el_socket->sk->sk_refcnt) - 1);
+ /* Release the sock_fd that was grabbed by sockfd_lookup(). */
+ sockfd_put(el_socket);
+ return res;
+
+err:
+ CT_DEBUG("qtaguid: ctrl_tag(%s): done.\n", input);
+ return res;
+}
+
+static int ctrl_cmd_untag(const char *input)
+{
+ char cmd;
+ int sock_fd = 0;
+ struct socket *el_socket;
+ int res, argc;
+
+ argc = sscanf(input, "%c %d", &cmd, &sock_fd);
+ CT_DEBUG("qtaguid: ctrl_untag(%s): argc=%d cmd=%c sock_fd=%d\n",
+ input, argc, cmd, sock_fd);
+ if (argc < 2) {
+ res = -EINVAL;
+ return res;
+ }
+ el_socket = sockfd_lookup(sock_fd, &res); /* This locks the file */
+ if (!el_socket) {
+ pr_info("qtaguid: ctrl_untag(%s): failed to lookup"
+ " sock_fd=%d err=%d pid=%u tgid=%u uid=%u\n",
+ input, sock_fd, res, current->pid, current->tgid,
+ from_kuid(&init_user_ns, current_fsuid()));
+ return res;
+ }
+ CT_DEBUG("qtaguid: ctrl_untag(%s): socket->...->f_count=%ld ->sk=%p\n",
+ input, atomic_long_read(&el_socket->file->f_count),
+ el_socket->sk);
+ res = qtaguid_untag(el_socket, false);
+ sockfd_put(el_socket);
+ return res;
+}
+
+int qtaguid_untag(struct socket *el_socket, bool kernel)
+{
+ int res;
+ pid_t pid;
+ struct sock_tag *sock_tag_entry;
+ struct tag_ref *tag_ref_entry;
+ struct uid_tag_data *utd_entry;
+ struct proc_qtu_data *pqd_entry;
+
+ spin_lock_bh(&sock_tag_list_lock);
+ sock_tag_entry = get_sock_stat_nl(el_socket->sk);
+ if (!sock_tag_entry) {
+ spin_unlock_bh(&sock_tag_list_lock);
+ res = -EINVAL;
+ return res;
+ }
+ /*
+ * The socket already belongs to the current process
+ * so it can do whatever it wants to it.
+ */
+ rb_erase(&sock_tag_entry->sock_node, &sock_tag_tree);
+
+ tag_ref_entry = lookup_tag_ref(sock_tag_entry->tag, &utd_entry);
+ BUG_ON(!tag_ref_entry);
+ BUG_ON(tag_ref_entry->num_sock_tags <= 0);
+ spin_lock_bh(&uid_tag_data_tree_lock);
+ if (kernel)
+ pid = sock_tag_entry->pid;
+ else
+ pid = current->tgid;
+ pqd_entry = proc_qtu_data_tree_search(
+ &proc_qtu_data_tree, pid);
+ /*
+ * TODO: remove if, and start failing.
+ * At first, we want to catch user-space code that is not
+ * opening the /dev/xt_qtaguid.
+ */
+ if (IS_ERR_OR_NULL(pqd_entry) || !sock_tag_entry->list.next) {
+ pr_warn_once("qtaguid: %s(): "
+ "User space forgot to open /dev/xt_qtaguid? "
+ "pid=%u tgid=%u sk_pid=%u, uid=%u\n", __func__,
+ current->pid, current->tgid, sock_tag_entry->pid,
+ from_kuid(&init_user_ns, current_fsuid()));
+ } else {
+ list_del(&sock_tag_entry->list);
+ }
+ spin_unlock_bh(&uid_tag_data_tree_lock);
+ /*
+ * We don't free tag_ref from the utd_entry here,
+ * only during a cmd_delete().
+ */
+ tag_ref_entry->num_sock_tags--;
+ spin_unlock_bh(&sock_tag_list_lock);
+ /*
+ * Release the sock_fd that was grabbed at tag time.
+ */
+ sock_put(sock_tag_entry->sk);
+ CT_DEBUG("qtaguid: done. st@%p ...->sk_refcnt=%d\n",
+ sock_tag_entry,
+ atomic_read(&el_socket->sk->sk_refcnt));
+
+ kfree(sock_tag_entry);
+ atomic64_inc(&qtu_events.sockets_untagged);
+
+ return 0;
+}
+
+static ssize_t qtaguid_ctrl_parse(const char *input, size_t count)
+{
+ char cmd;
+ ssize_t res;
+
+ CT_DEBUG("qtaguid: ctrl(%s): pid=%u tgid=%u uid=%u\n",
+ input, current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()));
+
+ cmd = input[0];
+ /* Collect params for commands */
+ switch (cmd) {
+ case 'd':
+ res = ctrl_cmd_delete(input);
+ break;
+
+ case 's':
+ res = ctrl_cmd_counter_set(input);
+ break;
+
+ case 't':
+ res = ctrl_cmd_tag(input);
+ break;
+
+ case 'u':
+ res = ctrl_cmd_untag(input);
+ break;
+
+ default:
+ res = -EINVAL;
+ goto err;
+ }
+ if (!res)
+ res = count;
+err:
+ CT_DEBUG("qtaguid: ctrl(%s): res=%zd\n", input, res);
+ return res;
+}
+
+#define MAX_QTAGUID_CTRL_INPUT_LEN 255
+static ssize_t qtaguid_ctrl_proc_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *offp)
+{
+ char input_buf[MAX_QTAGUID_CTRL_INPUT_LEN];
+
+ if (unlikely(module_passive))
+ return count;
+
+ if (count >= MAX_QTAGUID_CTRL_INPUT_LEN)
+ return -EINVAL;
+
+ if (copy_from_user(input_buf, buffer, count))
+ return -EFAULT;
+
+ input_buf[count] = '\0';
+ return qtaguid_ctrl_parse(input_buf, count);
+}
+
+struct proc_print_info {
+ struct iface_stat *iface_entry;
+ int item_index;
+ tag_t tag; /* tag found by reading to tag_pos */
+ off_t tag_pos;
+ int tag_item_index;
+};
+
+static void pp_stats_header(struct seq_file *m)
+{
+ seq_puts(m,
+ "idx iface acct_tag_hex uid_tag_int cnt_set "
+ "rx_bytes rx_packets "
+ "tx_bytes tx_packets "
+ "rx_tcp_bytes rx_tcp_packets "
+ "rx_udp_bytes rx_udp_packets "
+ "rx_other_bytes rx_other_packets "
+ "tx_tcp_bytes tx_tcp_packets "
+ "tx_udp_bytes tx_udp_packets "
+ "tx_other_bytes tx_other_packets\n");
+}
+
+static int pp_stats_line(struct seq_file *m, struct tag_stat *ts_entry,
+ int cnt_set)
+{
+ int ret;
+ struct data_counters *cnts;
+ tag_t tag = ts_entry->tn.tag;
+ uid_t stat_uid = get_uid_from_tag(tag);
+ struct proc_print_info *ppi = m->private;
+ /* Detailed tags are not available to everybody */
+ if (!can_read_other_uid_stats(make_kuid(&init_user_ns,stat_uid))) {
+ CT_DEBUG("qtaguid: stats line: "
+ "%s 0x%llx %u: insufficient priv "
+ "from pid=%u tgid=%u uid=%u stats.gid=%u\n",
+ ppi->iface_entry->ifname,
+ get_atag_from_tag(tag), stat_uid,
+ current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()),
+ from_kgid(&init_user_ns,xt_qtaguid_stats_file->gid));
+ return 0;
+ }
+ ppi->item_index++;
+ cnts = &ts_entry->counters;
+ ret = seq_printf(m, "%d %s 0x%llx %u %u "
+ "%llu %llu "
+ "%llu %llu "
+ "%llu %llu "
+ "%llu %llu "
+ "%llu %llu "
+ "%llu %llu "
+ "%llu %llu "
+ "%llu %llu\n",
+ ppi->item_index,
+ ppi->iface_entry->ifname,
+ get_atag_from_tag(tag),
+ stat_uid,
+ cnt_set,
+ dc_sum_bytes(cnts, cnt_set, IFS_RX),
+ dc_sum_packets(cnts, cnt_set, IFS_RX),
+ dc_sum_bytes(cnts, cnt_set, IFS_TX),
+ dc_sum_packets(cnts, cnt_set, IFS_TX),
+ cnts->bpc[cnt_set][IFS_RX][IFS_TCP].bytes,
+ cnts->bpc[cnt_set][IFS_RX][IFS_TCP].packets,
+ cnts->bpc[cnt_set][IFS_RX][IFS_UDP].bytes,
+ cnts->bpc[cnt_set][IFS_RX][IFS_UDP].packets,
+ cnts->bpc[cnt_set][IFS_RX][IFS_PROTO_OTHER].bytes,
+ cnts->bpc[cnt_set][IFS_RX][IFS_PROTO_OTHER].packets,
+ cnts->bpc[cnt_set][IFS_TX][IFS_TCP].bytes,
+ cnts->bpc[cnt_set][IFS_TX][IFS_TCP].packets,
+ cnts->bpc[cnt_set][IFS_TX][IFS_UDP].bytes,
+ cnts->bpc[cnt_set][IFS_TX][IFS_UDP].packets,
+ cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].bytes,
+ cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].packets);
+ return ret ?: 1;
+}
+
+static bool pp_sets(struct seq_file *m, struct tag_stat *ts_entry)
+{
+ int ret;
+ int counter_set;
+ for (counter_set = 0; counter_set < IFS_MAX_COUNTER_SETS;
+ counter_set++) {
+ ret = pp_stats_line(m, ts_entry, counter_set);
+ if (ret < 0)
+ return false;
+ }
+ return true;
+}
+
+static int qtaguid_stats_proc_iface_stat_ptr_valid(struct iface_stat *ptr)
+{
+ struct iface_stat *iface_entry;
+
+ if (!ptr)
+ return false;
+
+ list_for_each_entry(iface_entry, &iface_stat_list, list)
+ if (iface_entry == ptr)
+ return true;
+ return false;
+}
+
+static void qtaguid_stats_proc_next_iface_entry(struct proc_print_info *ppi)
+{
+ spin_unlock_bh(&ppi->iface_entry->tag_stat_list_lock);
+ list_for_each_entry_continue(ppi->iface_entry, &iface_stat_list, list) {
+ spin_lock_bh(&ppi->iface_entry->tag_stat_list_lock);
+ return;
+ }
+ ppi->iface_entry = NULL;
+}
+
+static void *qtaguid_stats_proc_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct proc_print_info *ppi = m->private;
+ struct tag_stat *ts_entry;
+ struct rb_node *node;
+
+ if (!v) {
+ pr_err("qtaguid: %s(): unexpected v: NULL\n", __func__);
+ return NULL;
+ }
+
+ (*pos)++;
+
+ if (!ppi->iface_entry || unlikely(module_passive))
+ return NULL;
+
+ if (v == SEQ_START_TOKEN)
+ node = rb_first(&ppi->iface_entry->tag_stat_tree);
+ else
+ node = rb_next(&((struct tag_stat *)v)->tn.node);
+
+ while (!node) {
+ qtaguid_stats_proc_next_iface_entry(ppi);
+ if (!ppi->iface_entry)
+ return NULL;
+ node = rb_first(&ppi->iface_entry->tag_stat_tree);
+ }
+
+ ts_entry = rb_entry(node, struct tag_stat, tn.node);
+ ppi->tag = ts_entry->tn.tag;
+ ppi->tag_pos = *pos;
+ ppi->tag_item_index = ppi->item_index;
+ return ts_entry;
+}
+
+static void *qtaguid_stats_proc_start(struct seq_file *m, loff_t *pos)
+{
+ struct proc_print_info *ppi = m->private;
+ struct tag_stat *ts_entry = NULL;
+
+ spin_lock_bh(&iface_stat_list_lock);
+
+ if (*pos == 0) {
+ ppi->item_index = 1;
+ ppi->tag_pos = 0;
+ if (list_empty(&iface_stat_list)) {
+ ppi->iface_entry = NULL;
+ } else {
+ ppi->iface_entry = list_first_entry(&iface_stat_list,
+ struct iface_stat,
+ list);
+ spin_lock_bh(&ppi->iface_entry->tag_stat_list_lock);
+ }
+ return SEQ_START_TOKEN;
+ }
+ if (!qtaguid_stats_proc_iface_stat_ptr_valid(ppi->iface_entry)) {
+ if (ppi->iface_entry) {
+ pr_err("qtaguid: %s(): iface_entry %p not found\n",
+ __func__, ppi->iface_entry);
+ ppi->iface_entry = NULL;
+ }
+ return NULL;
+ }
+
+ spin_lock_bh(&ppi->iface_entry->tag_stat_list_lock);
+
+ if (!ppi->tag_pos) {
+ /* seq_read skipped first next call */
+ ts_entry = SEQ_START_TOKEN;
+ } else {
+ ts_entry = tag_stat_tree_search(
+ &ppi->iface_entry->tag_stat_tree, ppi->tag);
+ if (!ts_entry) {
+ pr_info("qtaguid: %s(): tag_stat.tag 0x%llx not found. Abort.\n",
+ __func__, ppi->tag);
+ return NULL;
+ }
+ }
+
+ if (*pos == ppi->tag_pos) { /* normal resume */
+ ppi->item_index = ppi->tag_item_index;
+ } else {
+ /* seq_read skipped a next call */
+ *pos = ppi->tag_pos;
+ ts_entry = qtaguid_stats_proc_next(m, ts_entry, pos);
+ }
+
+ return ts_entry;
+}
+
+static void qtaguid_stats_proc_stop(struct seq_file *m, void *v)
+{
+ struct proc_print_info *ppi = m->private;
+ if (ppi->iface_entry)
+ spin_unlock_bh(&ppi->iface_entry->tag_stat_list_lock);
+ spin_unlock_bh(&iface_stat_list_lock);
+}
+
+/*
+ * Procfs reader to get all tag stats using style "1)" as described in
+ * fs/proc/generic.c
+ * Groups all protocols tx/rx bytes.
+ */
+static int qtaguid_stats_proc_show(struct seq_file *m, void *v)
+{
+ struct tag_stat *ts_entry = v;
+
+ if (v == SEQ_START_TOKEN)
+ pp_stats_header(m);
+ else
+ pp_sets(m, ts_entry);
+
+ return 0;
+}
+
+/*------------------------------------------*/
+static int qtudev_open(struct inode *inode, struct file *file)
+{
+ struct uid_tag_data *utd_entry;
+ struct proc_qtu_data *pqd_entry;
+ struct proc_qtu_data *new_pqd_entry;
+ int res;
+ bool utd_entry_found;
+
+ if (unlikely(qtu_proc_handling_passive))
+ return 0;
+
+ DR_DEBUG("qtaguid: qtudev_open(): pid=%u tgid=%u uid=%u\n",
+ current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()));
+
+ spin_lock_bh(&uid_tag_data_tree_lock);
+
+ /* Look for existing uid data, or alloc one. */
+ utd_entry = get_uid_data(from_kuid(&init_user_ns, current_fsuid()), &utd_entry_found);
+ if (IS_ERR_OR_NULL(utd_entry)) {
+ res = PTR_ERR(utd_entry);
+ goto err_unlock;
+ }
+
+ /* Look for existing PID based proc_data */
+ pqd_entry = proc_qtu_data_tree_search(&proc_qtu_data_tree,
+ current->tgid);
+ if (pqd_entry) {
+ pr_err("qtaguid: qtudev_open(): %u/%u %u "
+ "%s already opened\n",
+ current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()),
+ QTU_DEV_NAME);
+ res = -EBUSY;
+ goto err_unlock_free_utd;
+ }
+
+ new_pqd_entry = kzalloc(sizeof(*new_pqd_entry), GFP_ATOMIC);
+ if (!new_pqd_entry) {
+ pr_err("qtaguid: qtudev_open(): %u/%u %u: "
+ "proc data alloc failed\n",
+ current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()));
+ res = -ENOMEM;
+ goto err_unlock_free_utd;
+ }
+ new_pqd_entry->pid = current->tgid;
+ INIT_LIST_HEAD(&new_pqd_entry->sock_tag_list);
+ new_pqd_entry->parent_tag_data = utd_entry;
+ utd_entry->num_pqd++;
+
+ proc_qtu_data_tree_insert(new_pqd_entry,
+ &proc_qtu_data_tree);
+
+ spin_unlock_bh(&uid_tag_data_tree_lock);
+ DR_DEBUG("qtaguid: tracking data for uid=%u in pqd=%p\n",
+ from_kuid(&init_user_ns, current_fsuid()), new_pqd_entry);
+ file->private_data = new_pqd_entry;
+ return 0;
+
+err_unlock_free_utd:
+ if (!utd_entry_found) {
+ rb_erase(&utd_entry->node, &uid_tag_data_tree);
+ kfree(utd_entry);
+ }
+err_unlock:
+ spin_unlock_bh(&uid_tag_data_tree_lock);
+ return res;
+}
+
+static int qtudev_release(struct inode *inode, struct file *file)
+{
+ struct proc_qtu_data *pqd_entry = file->private_data;
+ struct uid_tag_data *utd_entry = pqd_entry->parent_tag_data;
+ struct sock_tag *st_entry;
+ struct rb_root st_to_free_tree = RB_ROOT;
+ struct list_head *entry, *next;
+ struct tag_ref *tr;
+
+ if (unlikely(qtu_proc_handling_passive))
+ return 0;
+
+ /*
+ * Do not trust the current->pid, it might just be a kworker cleaning
+ * up after a dead proc.
+ */
+ DR_DEBUG("qtaguid: qtudev_release(): "
+ "pid=%u tgid=%u uid=%u "
+ "pqd_entry=%p->pid=%u utd_entry=%p->active_tags=%d\n",
+ current->pid, current->tgid, pqd_entry->parent_tag_data->uid,
+ pqd_entry, pqd_entry->pid, utd_entry,
+ utd_entry->num_active_tags);
+
+ spin_lock_bh(&sock_tag_list_lock);
+ spin_lock_bh(&uid_tag_data_tree_lock);
+
+ list_for_each_safe(entry, next, &pqd_entry->sock_tag_list) {
+ st_entry = list_entry(entry, struct sock_tag, list);
+ DR_DEBUG("qtaguid: %s(): "
+ "erase sock_tag=%p->sk=%p pid=%u tgid=%u uid=%u\n",
+ __func__,
+ st_entry, st_entry->sk,
+ current->pid, current->tgid,
+ pqd_entry->parent_tag_data->uid);
+
+ utd_entry = uid_tag_data_tree_search(
+ &uid_tag_data_tree,
+ get_uid_from_tag(st_entry->tag));
+ BUG_ON(IS_ERR_OR_NULL(utd_entry));
+ DR_DEBUG("qtaguid: %s(): "
+ "looking for tag=0x%llx in utd_entry=%p\n", __func__,
+ st_entry->tag, utd_entry);
+ tr = tag_ref_tree_search(&utd_entry->tag_ref_tree,
+ st_entry->tag);
+ BUG_ON(!tr);
+ BUG_ON(tr->num_sock_tags <= 0);
+ tr->num_sock_tags--;
+ free_tag_ref_from_utd_entry(tr, utd_entry);
+
+ rb_erase(&st_entry->sock_node, &sock_tag_tree);
+ list_del(&st_entry->list);
+ /* Can't sockfd_put() within spinlock, do it later. */
+ sock_tag_tree_insert(st_entry, &st_to_free_tree);
+
+ /*
+ * Try to free the utd_entry if no other proc_qtu_data is
+ * using it (num_pqd is 0) and it doesn't have active tags
+ * (num_active_tags is 0).
+ */
+ put_utd_entry(utd_entry);
+ }
+
+ rb_erase(&pqd_entry->node, &proc_qtu_data_tree);
+ BUG_ON(pqd_entry->parent_tag_data->num_pqd < 1);
+ pqd_entry->parent_tag_data->num_pqd--;
+ put_utd_entry(pqd_entry->parent_tag_data);
+ kfree(pqd_entry);
+ file->private_data = NULL;
+
+ spin_unlock_bh(&uid_tag_data_tree_lock);
+ spin_unlock_bh(&sock_tag_list_lock);
+
+
+ sock_tag_tree_erase(&st_to_free_tree);
+
+ spin_lock_bh(&sock_tag_list_lock);
+ prdebug_full_state_locked(0, "%s(): pid=%u tgid=%u", __func__,
+ current->pid, current->tgid);
+ spin_unlock_bh(&sock_tag_list_lock);
+ return 0;
+}
+
+/*------------------------------------------*/
+static const struct file_operations qtudev_fops = {
+ .owner = THIS_MODULE,
+ .open = qtudev_open,
+ .release = qtudev_release,
+};
+
+static struct miscdevice qtu_device = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = QTU_DEV_NAME,
+ .fops = &qtudev_fops,
+ /* How sad it doesn't allow for defaults: .mode = S_IRUGO | S_IWUSR */
+};
+
+static const struct seq_operations proc_qtaguid_ctrl_seqops = {
+ .start = qtaguid_ctrl_proc_start,
+ .next = qtaguid_ctrl_proc_next,
+ .stop = qtaguid_ctrl_proc_stop,
+ .show = qtaguid_ctrl_proc_show,
+};
+
+static int proc_qtaguid_ctrl_open(struct inode *inode, struct file *file)
+{
+ return seq_open_private(file, &proc_qtaguid_ctrl_seqops,
+ sizeof(struct proc_ctrl_print_info));
+}
+
+static const struct file_operations proc_qtaguid_ctrl_fops = {
+ .open = proc_qtaguid_ctrl_open,
+ .read = seq_read,
+ .write = qtaguid_ctrl_proc_write,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+static const struct seq_operations proc_qtaguid_stats_seqops = {
+ .start = qtaguid_stats_proc_start,
+ .next = qtaguid_stats_proc_next,
+ .stop = qtaguid_stats_proc_stop,
+ .show = qtaguid_stats_proc_show,
+};
+
+static int proc_qtaguid_stats_open(struct inode *inode, struct file *file)
+{
+ return seq_open_private(file, &proc_qtaguid_stats_seqops,
+ sizeof(struct proc_print_info));
+}
+
+static const struct file_operations proc_qtaguid_stats_fops = {
+ .open = proc_qtaguid_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+/*------------------------------------------*/
+static int __init qtaguid_proc_register(struct proc_dir_entry **res_procdir)
+{
+ int ret;
+ *res_procdir = proc_mkdir(module_procdirname, init_net.proc_net);
+ if (!*res_procdir) {
+ pr_err("qtaguid: failed to create proc/.../xt_qtaguid\n");
+ ret = -ENOMEM;
+ goto no_dir;
+ }
+
+ xt_qtaguid_ctrl_file = proc_create_data("ctrl", proc_ctrl_perms,
+ *res_procdir,
+ &proc_qtaguid_ctrl_fops,
+ NULL);
+ if (!xt_qtaguid_ctrl_file) {
+ pr_err("qtaguid: failed to create xt_qtaguid/ctrl "
+ " file\n");
+ ret = -ENOMEM;
+ goto no_ctrl_entry;
+ }
+
+ xt_qtaguid_stats_file = proc_create_data("stats", proc_stats_perms,
+ *res_procdir,
+ &proc_qtaguid_stats_fops,
+ NULL);
+ if (!xt_qtaguid_stats_file) {
+ pr_err("qtaguid: failed to create xt_qtaguid/stats "
+ "file\n");
+ ret = -ENOMEM;
+ goto no_stats_entry;
+ }
+ /*
+ * TODO: add support counter hacking
+ * xt_qtaguid_stats_file->write_proc = qtaguid_stats_proc_write;
+ */
+ return 0;
+
+no_stats_entry:
+ remove_proc_entry("ctrl", *res_procdir);
+no_ctrl_entry:
+ remove_proc_entry("xt_qtaguid", NULL);
+no_dir:
+ return ret;
+}
+
+static struct xt_match qtaguid_mt_reg __read_mostly = {
+ /*
+ * This module masquerades as the "owner" module so that iptables
+ * tools can deal with it.
+ */
+ .name = "owner",
+ .revision = 1,
+ .family = NFPROTO_UNSPEC,
+ .match = qtaguid_mt,
+ .matchsize = sizeof(struct xt_qtaguid_match_info),
+ .me = THIS_MODULE,
+};
+
+static int __init qtaguid_mt_init(void)
+{
+ if (qtaguid_proc_register(&xt_qtaguid_procdir)
+ || iface_stat_init(xt_qtaguid_procdir)
+ || xt_register_match(&qtaguid_mt_reg)
+ || misc_register(&qtu_device))
+ return -1;
+ return 0;
+}
+
+/*
+ * TODO: allow unloading of the module.
+ * For now stats are permanent.
+ * Kconfig forces'y/n' and never an 'm'.
+ */
+
+module_init(qtaguid_mt_init);
+MODULE_AUTHOR("jpa <jpa@google.com>");
+MODULE_DESCRIPTION("Xtables: socket owner+tag matching and associated stats");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_owner");
+MODULE_ALIAS("ip6t_owner");
+MODULE_ALIAS("ipt_qtaguid");
+MODULE_ALIAS("ip6t_qtaguid");
diff --git a/net/netfilter/xt_qtaguid_internal.h b/net/netfilter/xt_qtaguid_internal.h
new file mode 100644
index 000000000000..8178fbdfb036
--- /dev/null
+++ b/net/netfilter/xt_qtaguid_internal.h
@@ -0,0 +1,350 @@
+/*
+ * Kernel iptables module to track stats for packets based on user tags.
+ *
+ * (C) 2011 Google, Inc
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __XT_QTAGUID_INTERNAL_H__
+#define __XT_QTAGUID_INTERNAL_H__
+
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock_types.h>
+#include <linux/workqueue.h>
+
+/* Iface handling */
+#define IDEBUG_MASK (1<<0)
+/* Iptable Matching. Per packet. */
+#define MDEBUG_MASK (1<<1)
+/* Red-black tree handling. Per packet. */
+#define RDEBUG_MASK (1<<2)
+/* procfs ctrl/stats handling */
+#define CDEBUG_MASK (1<<3)
+/* dev and resource tracking */
+#define DDEBUG_MASK (1<<4)
+
+/* E.g (IDEBUG_MASK | CDEBUG_MASK | DDEBUG_MASK) */
+#define DEFAULT_DEBUG_MASK 0
+
+/*
+ * (Un)Define these *DEBUG to compile out/in the pr_debug calls.
+ * All undef: text size ~ 0x3030; all def: ~ 0x4404.
+ */
+#define IDEBUG
+#define MDEBUG
+#define RDEBUG
+#define CDEBUG
+#define DDEBUG
+
+#define MSK_DEBUG(mask, ...) do { \
+ if (unlikely(qtaguid_debug_mask & (mask))) \
+ pr_debug(__VA_ARGS__); \
+ } while (0)
+#ifdef IDEBUG
+#define IF_DEBUG(...) MSK_DEBUG(IDEBUG_MASK, __VA_ARGS__)
+#else
+#define IF_DEBUG(...) no_printk(__VA_ARGS__)
+#endif
+#ifdef MDEBUG
+#define MT_DEBUG(...) MSK_DEBUG(MDEBUG_MASK, __VA_ARGS__)
+#else
+#define MT_DEBUG(...) no_printk(__VA_ARGS__)
+#endif
+#ifdef RDEBUG
+#define RB_DEBUG(...) MSK_DEBUG(RDEBUG_MASK, __VA_ARGS__)
+#else
+#define RB_DEBUG(...) no_printk(__VA_ARGS__)
+#endif
+#ifdef CDEBUG
+#define CT_DEBUG(...) MSK_DEBUG(CDEBUG_MASK, __VA_ARGS__)
+#else
+#define CT_DEBUG(...) no_printk(__VA_ARGS__)
+#endif
+#ifdef DDEBUG
+#define DR_DEBUG(...) MSK_DEBUG(DDEBUG_MASK, __VA_ARGS__)
+#else
+#define DR_DEBUG(...) no_printk(__VA_ARGS__)
+#endif
+
+extern uint qtaguid_debug_mask;
+
+/*---------------------------------------------------------------------------*/
+/*
+ * Tags:
+ *
+ * They represent what the data usage counters will be tracked against.
+ * By default a tag is just based on the UID.
+ * The UID is used as the base for policing, and can not be ignored.
+ * So a tag will always at least represent a UID (uid_tag).
+ *
+ * A tag can be augmented with an "accounting tag" which is associated
+ * with a UID.
+ * User space can set the acct_tag portion of the tag which is then used
+ * with sockets: all data belonging to that socket will be counted against the
+ * tag. The policing is then based on the tag's uid_tag portion,
+ * and stats are collected for the acct_tag portion separately.
+ *
+ * There could be
+ * a: {acct_tag=1, uid_tag=10003}
+ * b: {acct_tag=2, uid_tag=10003}
+ * c: {acct_tag=3, uid_tag=10003}
+ * d: {acct_tag=0, uid_tag=10003}
+ * a, b, and c represent tags associated with specific sockets.
+ * d is for the totals for that uid, including all untagged traffic.
+ * Typically d is used with policing/quota rules.
+ *
+ * We want tag_t big enough to distinguish uid_t and acct_tag.
+ * It might become a struct if needed.
+ * Nothing should be using it as an int.
+ */
+typedef uint64_t tag_t; /* Only used via accessors */
+
+#define TAG_UID_MASK 0xFFFFFFFFULL
+#define TAG_ACCT_MASK (~0xFFFFFFFFULL)
+
+static inline int tag_compare(tag_t t1, tag_t t2)
+{
+ return t1 < t2 ? -1 : t1 == t2 ? 0 : 1;
+}
+
+static inline tag_t combine_atag_with_uid(tag_t acct_tag, uid_t uid)
+{
+ return acct_tag | uid;
+}
+static inline tag_t make_tag_from_uid(uid_t uid)
+{
+ return uid;
+}
+static inline uid_t get_uid_from_tag(tag_t tag)
+{
+ return tag & TAG_UID_MASK;
+}
+static inline tag_t get_utag_from_tag(tag_t tag)
+{
+ return tag & TAG_UID_MASK;
+}
+static inline tag_t get_atag_from_tag(tag_t tag)
+{
+ return tag & TAG_ACCT_MASK;
+}
+
+static inline bool valid_atag(tag_t tag)
+{
+ return !(tag & TAG_UID_MASK);
+}
+static inline tag_t make_atag_from_value(uint32_t value)
+{
+ return (uint64_t)value << 32;
+}
+/*---------------------------------------------------------------------------*/
+
+/*
+ * Maximum number of socket tags that a UID is allowed to have active.
+ * Multiple processes belonging to the same UID contribute towards this limit.
+ * Special UIDs that can impersonate a UID also contribute (e.g. download
+ * manager, ...)
+ */
+#define DEFAULT_MAX_SOCK_TAGS 1024
+
+/*
+ * For now we only track 2 sets of counters.
+ * The default set is 0.
+ * Userspace can activate another set for a given uid being tracked.
+ */
+#define IFS_MAX_COUNTER_SETS 2
+
+enum ifs_tx_rx {
+ IFS_TX,
+ IFS_RX,
+ IFS_MAX_DIRECTIONS
+};
+
+/* For now, TCP, UDP, the rest */
+enum ifs_proto {
+ IFS_TCP,
+ IFS_UDP,
+ IFS_PROTO_OTHER,
+ IFS_MAX_PROTOS
+};
+
+struct byte_packet_counters {
+ uint64_t bytes;
+ uint64_t packets;
+};
+
+struct data_counters {
+ struct byte_packet_counters bpc[IFS_MAX_COUNTER_SETS][IFS_MAX_DIRECTIONS][IFS_MAX_PROTOS];
+};
+
+static inline uint64_t dc_sum_bytes(struct data_counters *counters,
+ int set,
+ enum ifs_tx_rx direction)
+{
+ return counters->bpc[set][direction][IFS_TCP].bytes
+ + counters->bpc[set][direction][IFS_UDP].bytes
+ + counters->bpc[set][direction][IFS_PROTO_OTHER].bytes;
+}
+
+static inline uint64_t dc_sum_packets(struct data_counters *counters,
+ int set,
+ enum ifs_tx_rx direction)
+{
+ return counters->bpc[set][direction][IFS_TCP].packets
+ + counters->bpc[set][direction][IFS_UDP].packets
+ + counters->bpc[set][direction][IFS_PROTO_OTHER].packets;
+}
+
+
+/* Generic X based nodes used as a base for rb_tree ops */
+struct tag_node {
+ struct rb_node node;
+ tag_t tag;
+};
+
+struct tag_stat {
+ struct tag_node tn;
+ struct data_counters counters;
+ /*
+ * If this tag is acct_tag based, we need to count against the
+ * matching parent uid_tag.
+ */
+ struct data_counters *parent_counters;
+};
+
+struct iface_stat {
+ struct list_head list; /* in iface_stat_list */
+ char *ifname;
+ bool active;
+ /* net_dev is only valid for active iface_stat */
+ struct net_device *net_dev;
+
+ struct byte_packet_counters totals_via_dev[IFS_MAX_DIRECTIONS];
+ struct data_counters totals_via_skb;
+ /*
+ * We keep the last_known, because some devices reset their counters
+ * just before NETDEV_UP, while some will reset just before
+ * NETDEV_REGISTER (which is more normal).
+ * So now, if the device didn't do a NETDEV_UNREGISTER and we see
+ * its current dev stats smaller that what was previously known, we
+ * assume an UNREGISTER and just use the last_known.
+ */
+ struct byte_packet_counters last_known[IFS_MAX_DIRECTIONS];
+ /* last_known is usable when last_known_valid is true */
+ bool last_known_valid;
+
+ struct proc_dir_entry *proc_ptr;
+
+ struct rb_root tag_stat_tree;
+ spinlock_t tag_stat_list_lock;
+};
+
+/* This is needed to create proc_dir_entries from atomic context. */
+struct iface_stat_work {
+ struct work_struct iface_work;
+ struct iface_stat *iface_entry;
+};
+
+/*
+ * Track tag that this socket is transferring data for, and not necessarily
+ * the uid that owns the socket.
+ * This is the tag against which tag_stat.counters will be billed.
+ * These structs need to be looked up by sock and pid.
+ */
+struct sock_tag {
+ struct rb_node sock_node;
+ struct sock *sk; /* Only used as a number, never dereferenced */
+ /* Used to associate with a given pid */
+ struct list_head list; /* in proc_qtu_data.sock_tag_list */
+ pid_t pid;
+
+ tag_t tag;
+};
+
+struct qtaguid_event_counts {
+ /* Various successful events */
+ atomic64_t sockets_tagged;
+ atomic64_t sockets_untagged;
+ atomic64_t counter_set_changes;
+ atomic64_t delete_cmds;
+ atomic64_t iface_events; /* Number of NETDEV_* events handled */
+
+ atomic64_t match_calls; /* Number of times iptables called mt */
+ /* Number of times iptables called mt from pre or post routing hooks */
+ atomic64_t match_calls_prepost;
+ /*
+ * match_found_sk_*: numbers related to the netfilter matching
+ * function finding a sock for the sk_buff.
+ * Total skbs processed is sum(match_found*).
+ */
+ atomic64_t match_found_sk; /* An sk was already in the sk_buff. */
+ /* The connection tracker had or didn't have the sk. */
+ atomic64_t match_found_sk_in_ct;
+ atomic64_t match_found_no_sk_in_ct;
+ /*
+ * No sk could be found. No apparent owner. Could happen with
+ * unsolicited traffic.
+ */
+ atomic64_t match_no_sk;
+ /*
+ * The file ptr in the sk_socket wasn't there.
+ * This might happen for traffic while the socket is being closed.
+ */
+ atomic64_t match_no_sk_file;
+};
+
+/* Track the set active_set for the given tag. */
+struct tag_counter_set {
+ struct tag_node tn;
+ int active_set;
+};
+
+/*----------------------------------------------*/
+/*
+ * The qtu uid data is used to track resources that are created directly or
+ * indirectly by processes (uid tracked).
+ * It is shared by the processes with the same uid.
+ * Some of the resource will be counted to prevent further rogue allocations,
+ * some will need freeing once the owner process (uid) exits.
+ */
+struct uid_tag_data {
+ struct rb_node node;
+ uid_t uid;
+
+ /*
+ * For the uid, how many accounting tags have been set.
+ */
+ int num_active_tags;
+ /* Track the number of proc_qtu_data that reference it */
+ int num_pqd;
+ struct rb_root tag_ref_tree;
+ /* No tag_node_tree_lock; use uid_tag_data_tree_lock */
+};
+
+struct tag_ref {
+ struct tag_node tn;
+
+ /*
+ * This tracks the number of active sockets that have a tag on them
+ * which matches this tag_ref.tn.tag.
+ * A tag ref can live on after the sockets are untagged.
+ * A tag ref can only be removed during a tag delete command.
+ */
+ int num_sock_tags;
+};
+
+struct proc_qtu_data {
+ struct rb_node node;
+ pid_t pid;
+
+ struct uid_tag_data *parent_tag_data;
+
+ /* Tracks the sock_tags that need freeing upon this proc's death */
+ struct list_head sock_tag_list;
+ /* No spinlock_t sock_tag_list_lock; use the global one. */
+};
+
+/*----------------------------------------------*/
+#endif /* ifndef __XT_QTAGUID_INTERNAL_H__ */
diff --git a/net/netfilter/xt_qtaguid_print.c b/net/netfilter/xt_qtaguid_print.c
new file mode 100644
index 000000000000..2a7190d285e6
--- /dev/null
+++ b/net/netfilter/xt_qtaguid_print.c
@@ -0,0 +1,566 @@
+/*
+ * Pretty printing Support for iptables xt_qtaguid module.
+ *
+ * (C) 2011 Google, Inc
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * Most of the functions in this file just waste time if DEBUG is not defined.
+ * The matching xt_qtaguid_print.h will static inline empty funcs if the needed
+ * debug flags ore not defined.
+ * Those funcs that fail to allocate memory will panic as there is no need to
+ * hobble allong just pretending to do the requested work.
+ */
+
+#define DEBUG
+
+#include <linux/fs.h>
+#include <linux/gfp.h>
+#include <linux/net.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/spinlock_types.h>
+#include <net/sock.h>
+
+#include "xt_qtaguid_internal.h"
+#include "xt_qtaguid_print.h"
+
+#ifdef DDEBUG
+
+static void _bug_on_err_or_null(void *ptr)
+{
+ if (IS_ERR_OR_NULL(ptr)) {
+ pr_err("qtaguid: kmalloc failed\n");
+ BUG();
+ }
+}
+
+char *pp_tag_t(tag_t *tag)
+{
+ char *res;
+
+ if (!tag)
+ res = kasprintf(GFP_ATOMIC, "tag_t@null{}");
+ else
+ res = kasprintf(GFP_ATOMIC,
+ "tag_t@%p{tag=0x%llx, uid=%u}",
+ tag, *tag, get_uid_from_tag(*tag));
+ _bug_on_err_or_null(res);
+ return res;
+}
+
+char *pp_data_counters(struct data_counters *dc, bool showValues)
+{
+ char *res;
+
+ if (!dc)
+ res = kasprintf(GFP_ATOMIC, "data_counters@null{}");
+ else if (showValues)
+ res = kasprintf(
+ GFP_ATOMIC, "data_counters@%p{"
+ "set0{"
+ "rx{"
+ "tcp{b=%llu, p=%llu}, "
+ "udp{b=%llu, p=%llu},"
+ "other{b=%llu, p=%llu}}, "
+ "tx{"
+ "tcp{b=%llu, p=%llu}, "
+ "udp{b=%llu, p=%llu},"
+ "other{b=%llu, p=%llu}}}, "
+ "set1{"
+ "rx{"
+ "tcp{b=%llu, p=%llu}, "
+ "udp{b=%llu, p=%llu},"
+ "other{b=%llu, p=%llu}}, "
+ "tx{"
+ "tcp{b=%llu, p=%llu}, "
+ "udp{b=%llu, p=%llu},"
+ "other{b=%llu, p=%llu}}}}",
+ dc,
+ dc->bpc[0][IFS_RX][IFS_TCP].bytes,
+ dc->bpc[0][IFS_RX][IFS_TCP].packets,
+ dc->bpc[0][IFS_RX][IFS_UDP].bytes,
+ dc->bpc[0][IFS_RX][IFS_UDP].packets,
+ dc->bpc[0][IFS_RX][IFS_PROTO_OTHER].bytes,
+ dc->bpc[0][IFS_RX][IFS_PROTO_OTHER].packets,
+ dc->bpc[0][IFS_TX][IFS_TCP].bytes,
+ dc->bpc[0][IFS_TX][IFS_TCP].packets,
+ dc->bpc[0][IFS_TX][IFS_UDP].bytes,
+ dc->bpc[0][IFS_TX][IFS_UDP].packets,
+ dc->bpc[0][IFS_TX][IFS_PROTO_OTHER].bytes,
+ dc->bpc[0][IFS_TX][IFS_PROTO_OTHER].packets,
+ dc->bpc[1][IFS_RX][IFS_TCP].bytes,
+ dc->bpc[1][IFS_RX][IFS_TCP].packets,
+ dc->bpc[1][IFS_RX][IFS_UDP].bytes,
+ dc->bpc[1][IFS_RX][IFS_UDP].packets,
+ dc->bpc[1][IFS_RX][IFS_PROTO_OTHER].bytes,
+ dc->bpc[1][IFS_RX][IFS_PROTO_OTHER].packets,
+ dc->bpc[1][IFS_TX][IFS_TCP].bytes,
+ dc->bpc[1][IFS_TX][IFS_TCP].packets,
+ dc->bpc[1][IFS_TX][IFS_UDP].bytes,
+ dc->bpc[1][IFS_TX][IFS_UDP].packets,
+ dc->bpc[1][IFS_TX][IFS_PROTO_OTHER].bytes,
+ dc->bpc[1][IFS_TX][IFS_PROTO_OTHER].packets);
+ else
+ res = kasprintf(GFP_ATOMIC, "data_counters@%p{...}", dc);
+ _bug_on_err_or_null(res);
+ return res;
+}
+
+char *pp_tag_node(struct tag_node *tn)
+{
+ char *tag_str;
+ char *res;
+
+ if (!tn) {
+ res = kasprintf(GFP_ATOMIC, "tag_node@null{}");
+ _bug_on_err_or_null(res);
+ return res;
+ }
+ tag_str = pp_tag_t(&tn->tag);
+ res = kasprintf(GFP_ATOMIC,
+ "tag_node@%p{tag=%s}",
+ tn, tag_str);
+ _bug_on_err_or_null(res);
+ kfree(tag_str);
+ return res;
+}
+
+char *pp_tag_ref(struct tag_ref *tr)
+{
+ char *tn_str;
+ char *res;
+
+ if (!tr) {
+ res = kasprintf(GFP_ATOMIC, "tag_ref@null{}");
+ _bug_on_err_or_null(res);
+ return res;
+ }
+ tn_str = pp_tag_node(&tr->tn);
+ res = kasprintf(GFP_ATOMIC,
+ "tag_ref@%p{%s, num_sock_tags=%d}",
+ tr, tn_str, tr->num_sock_tags);
+ _bug_on_err_or_null(res);
+ kfree(tn_str);
+ return res;
+}
+
+char *pp_tag_stat(struct tag_stat *ts)
+{
+ char *tn_str;
+ char *counters_str;
+ char *parent_counters_str;
+ char *res;
+
+ if (!ts) {
+ res = kasprintf(GFP_ATOMIC, "tag_stat@null{}");
+ _bug_on_err_or_null(res);
+ return res;
+ }
+ tn_str = pp_tag_node(&ts->tn);
+ counters_str = pp_data_counters(&ts->counters, true);
+ parent_counters_str = pp_data_counters(ts->parent_counters, false);
+ res = kasprintf(GFP_ATOMIC,
+ "tag_stat@%p{%s, counters=%s, parent_counters=%s}",
+ ts, tn_str, counters_str, parent_counters_str);
+ _bug_on_err_or_null(res);
+ kfree(tn_str);
+ kfree(counters_str);
+ kfree(parent_counters_str);
+ return res;
+}
+
+char *pp_iface_stat(struct iface_stat *is)
+{
+ char *res;
+ if (!is) {
+ res = kasprintf(GFP_ATOMIC, "iface_stat@null{}");
+ } else {
+ struct data_counters *cnts = &is->totals_via_skb;
+ res = kasprintf(GFP_ATOMIC, "iface_stat@%p{"
+ "list=list_head{...}, "
+ "ifname=%s, "
+ "total_dev={rx={bytes=%llu, "
+ "packets=%llu}, "
+ "tx={bytes=%llu, "
+ "packets=%llu}}, "
+ "total_skb={rx={bytes=%llu, "
+ "packets=%llu}, "
+ "tx={bytes=%llu, "
+ "packets=%llu}}, "
+ "last_known_valid=%d, "
+ "last_known={rx={bytes=%llu, "
+ "packets=%llu}, "
+ "tx={bytes=%llu, "
+ "packets=%llu}}, "
+ "active=%d, "
+ "net_dev=%p, "
+ "proc_ptr=%p, "
+ "tag_stat_tree=rb_root{...}}",
+ is,
+ is->ifname,
+ is->totals_via_dev[IFS_RX].bytes,
+ is->totals_via_dev[IFS_RX].packets,
+ is->totals_via_dev[IFS_TX].bytes,
+ is->totals_via_dev[IFS_TX].packets,
+ dc_sum_bytes(cnts, 0, IFS_RX),
+ dc_sum_packets(cnts, 0, IFS_RX),
+ dc_sum_bytes(cnts, 0, IFS_TX),
+ dc_sum_packets(cnts, 0, IFS_TX),
+ is->last_known_valid,
+ is->last_known[IFS_RX].bytes,
+ is->last_known[IFS_RX].packets,
+ is->last_known[IFS_TX].bytes,
+ is->last_known[IFS_TX].packets,
+ is->active,
+ is->net_dev,
+ is->proc_ptr);
+ }
+ _bug_on_err_or_null(res);
+ return res;
+}
+
+char *pp_sock_tag(struct sock_tag *st)
+{
+ char *tag_str;
+ char *res;
+
+ if (!st) {
+ res = kasprintf(GFP_ATOMIC, "sock_tag@null{}");
+ _bug_on_err_or_null(res);
+ return res;
+ }
+ tag_str = pp_tag_t(&st->tag);
+ res = kasprintf(GFP_ATOMIC, "sock_tag@%p{"
+ "sock_node=rb_node{...}, "
+ "sk=%p (f_count=%d), list=list_head{...}, "
+ "pid=%u, tag=%s}",
+ st, st->sk, atomic_read(
+ &st->sk->sk_refcnt),
+ st->pid, tag_str);
+ _bug_on_err_or_null(res);
+ kfree(tag_str);
+ return res;
+}
+
+char *pp_uid_tag_data(struct uid_tag_data *utd)
+{
+ char *res;
+
+ if (!utd)
+ res = kasprintf(GFP_ATOMIC, "uid_tag_data@null{}");
+ else
+ res = kasprintf(GFP_ATOMIC, "uid_tag_data@%p{"
+ "uid=%u, num_active_acct_tags=%d, "
+ "num_pqd=%d, "
+ "tag_node_tree=rb_root{...}, "
+ "proc_qtu_data_tree=rb_root{...}}",
+ utd, utd->uid,
+ utd->num_active_tags, utd->num_pqd);
+ _bug_on_err_or_null(res);
+ return res;
+}
+
+char *pp_proc_qtu_data(struct proc_qtu_data *pqd)
+{
+ char *parent_tag_data_str;
+ char *res;
+
+ if (!pqd) {
+ res = kasprintf(GFP_ATOMIC, "proc_qtu_data@null{}");
+ _bug_on_err_or_null(res);
+ return res;
+ }
+ parent_tag_data_str = pp_uid_tag_data(pqd->parent_tag_data);
+ res = kasprintf(GFP_ATOMIC, "proc_qtu_data@%p{"
+ "node=rb_node{...}, pid=%u, "
+ "parent_tag_data=%s, "
+ "sock_tag_list=list_head{...}}",
+ pqd, pqd->pid, parent_tag_data_str
+ );
+ _bug_on_err_or_null(res);
+ kfree(parent_tag_data_str);
+ return res;
+}
+
+/*------------------------------------------*/
+void prdebug_sock_tag_tree(int indent_level,
+ struct rb_root *sock_tag_tree)
+{
+ struct rb_node *node;
+ struct sock_tag *sock_tag_entry;
+ char *str;
+
+ if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
+ return;
+
+ if (RB_EMPTY_ROOT(sock_tag_tree)) {
+ str = "sock_tag_tree=rb_root{}";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+ return;
+ }
+
+ str = "sock_tag_tree=rb_root{";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+ indent_level++;
+ for (node = rb_first(sock_tag_tree);
+ node;
+ node = rb_next(node)) {
+ sock_tag_entry = rb_entry(node, struct sock_tag, sock_node);
+ str = pp_sock_tag(sock_tag_entry);
+ pr_debug("%*d: %s,\n", indent_level*2, indent_level, str);
+ kfree(str);
+ }
+ indent_level--;
+ str = "}";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+}
+
+void prdebug_sock_tag_list(int indent_level,
+ struct list_head *sock_tag_list)
+{
+ struct sock_tag *sock_tag_entry;
+ char *str;
+
+ if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
+ return;
+
+ if (list_empty(sock_tag_list)) {
+ str = "sock_tag_list=list_head{}";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+ return;
+ }
+
+ str = "sock_tag_list=list_head{";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+ indent_level++;
+ list_for_each_entry(sock_tag_entry, sock_tag_list, list) {
+ str = pp_sock_tag(sock_tag_entry);
+ pr_debug("%*d: %s,\n", indent_level*2, indent_level, str);
+ kfree(str);
+ }
+ indent_level--;
+ str = "}";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+}
+
+void prdebug_proc_qtu_data_tree(int indent_level,
+ struct rb_root *proc_qtu_data_tree)
+{
+ char *str;
+ struct rb_node *node;
+ struct proc_qtu_data *proc_qtu_data_entry;
+
+ if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
+ return;
+
+ if (RB_EMPTY_ROOT(proc_qtu_data_tree)) {
+ str = "proc_qtu_data_tree=rb_root{}";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+ return;
+ }
+
+ str = "proc_qtu_data_tree=rb_root{";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+ indent_level++;
+ for (node = rb_first(proc_qtu_data_tree);
+ node;
+ node = rb_next(node)) {
+ proc_qtu_data_entry = rb_entry(node,
+ struct proc_qtu_data,
+ node);
+ str = pp_proc_qtu_data(proc_qtu_data_entry);
+ pr_debug("%*d: %s,\n", indent_level*2, indent_level,
+ str);
+ kfree(str);
+ indent_level++;
+ prdebug_sock_tag_list(indent_level,
+ &proc_qtu_data_entry->sock_tag_list);
+ indent_level--;
+
+ }
+ indent_level--;
+ str = "}";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+}
+
+void prdebug_tag_ref_tree(int indent_level, struct rb_root *tag_ref_tree)
+{
+ char *str;
+ struct rb_node *node;
+ struct tag_ref *tag_ref_entry;
+
+ if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
+ return;
+
+ if (RB_EMPTY_ROOT(tag_ref_tree)) {
+ str = "tag_ref_tree{}";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+ return;
+ }
+
+ str = "tag_ref_tree{";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+ indent_level++;
+ for (node = rb_first(tag_ref_tree);
+ node;
+ node = rb_next(node)) {
+ tag_ref_entry = rb_entry(node,
+ struct tag_ref,
+ tn.node);
+ str = pp_tag_ref(tag_ref_entry);
+ pr_debug("%*d: %s,\n", indent_level*2, indent_level,
+ str);
+ kfree(str);
+ }
+ indent_level--;
+ str = "}";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+}
+
+void prdebug_uid_tag_data_tree(int indent_level,
+ struct rb_root *uid_tag_data_tree)
+{
+ char *str;
+ struct rb_node *node;
+ struct uid_tag_data *uid_tag_data_entry;
+
+ if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
+ return;
+
+ if (RB_EMPTY_ROOT(uid_tag_data_tree)) {
+ str = "uid_tag_data_tree=rb_root{}";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+ return;
+ }
+
+ str = "uid_tag_data_tree=rb_root{";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+ indent_level++;
+ for (node = rb_first(uid_tag_data_tree);
+ node;
+ node = rb_next(node)) {
+ uid_tag_data_entry = rb_entry(node, struct uid_tag_data,
+ node);
+ str = pp_uid_tag_data(uid_tag_data_entry);
+ pr_debug("%*d: %s,\n", indent_level*2, indent_level, str);
+ kfree(str);
+ if (!RB_EMPTY_ROOT(&uid_tag_data_entry->tag_ref_tree)) {
+ indent_level++;
+ prdebug_tag_ref_tree(indent_level,
+ &uid_tag_data_entry->tag_ref_tree);
+ indent_level--;
+ }
+ }
+ indent_level--;
+ str = "}";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+}
+
+void prdebug_tag_stat_tree(int indent_level,
+ struct rb_root *tag_stat_tree)
+{
+ char *str;
+ struct rb_node *node;
+ struct tag_stat *ts_entry;
+
+ if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
+ return;
+
+ if (RB_EMPTY_ROOT(tag_stat_tree)) {
+ str = "tag_stat_tree{}";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+ return;
+ }
+
+ str = "tag_stat_tree{";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+ indent_level++;
+ for (node = rb_first(tag_stat_tree);
+ node;
+ node = rb_next(node)) {
+ ts_entry = rb_entry(node, struct tag_stat, tn.node);
+ str = pp_tag_stat(ts_entry);
+ pr_debug("%*d: %s\n", indent_level*2, indent_level,
+ str);
+ kfree(str);
+ }
+ indent_level--;
+ str = "}";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+}
+
+void prdebug_iface_stat_list(int indent_level,
+ struct list_head *iface_stat_list)
+{
+ char *str;
+ struct iface_stat *iface_entry;
+
+ if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
+ return;
+
+ if (list_empty(iface_stat_list)) {
+ str = "iface_stat_list=list_head{}";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+ return;
+ }
+
+ str = "iface_stat_list=list_head{";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+ indent_level++;
+ list_for_each_entry(iface_entry, iface_stat_list, list) {
+ str = pp_iface_stat(iface_entry);
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+ kfree(str);
+
+ spin_lock_bh(&iface_entry->tag_stat_list_lock);
+ if (!RB_EMPTY_ROOT(&iface_entry->tag_stat_tree)) {
+ indent_level++;
+ prdebug_tag_stat_tree(indent_level,
+ &iface_entry->tag_stat_tree);
+ indent_level--;
+ }
+ spin_unlock_bh(&iface_entry->tag_stat_list_lock);
+ }
+ indent_level--;
+ str = "}";
+ pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+}
+
+#endif /* ifdef DDEBUG */
+/*------------------------------------------*/
+static const char * const netdev_event_strings[] = {
+ "netdev_unknown",
+ "NETDEV_UP",
+ "NETDEV_DOWN",
+ "NETDEV_REBOOT",
+ "NETDEV_CHANGE",
+ "NETDEV_REGISTER",
+ "NETDEV_UNREGISTER",
+ "NETDEV_CHANGEMTU",
+ "NETDEV_CHANGEADDR",
+ "NETDEV_GOING_DOWN",
+ "NETDEV_CHANGENAME",
+ "NETDEV_FEAT_CHANGE",
+ "NETDEV_BONDING_FAILOVER",
+ "NETDEV_PRE_UP",
+ "NETDEV_PRE_TYPE_CHANGE",
+ "NETDEV_POST_TYPE_CHANGE",
+ "NETDEV_POST_INIT",
+ "NETDEV_UNREGISTER_BATCH",
+ "NETDEV_RELEASE",
+ "NETDEV_NOTIFY_PEERS",
+ "NETDEV_JOIN",
+};
+
+const char *netdev_evt_str(int netdev_event)
+{
+ if (netdev_event < 0
+ || netdev_event >= ARRAY_SIZE(netdev_event_strings))
+ return "bad event num";
+ return netdev_event_strings[netdev_event];
+}
diff --git a/net/netfilter/xt_qtaguid_print.h b/net/netfilter/xt_qtaguid_print.h
new file mode 100644
index 000000000000..b63871a0be5a
--- /dev/null
+++ b/net/netfilter/xt_qtaguid_print.h
@@ -0,0 +1,120 @@
+/*
+ * Pretty printing Support for iptables xt_qtaguid module.
+ *
+ * (C) 2011 Google, Inc
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __XT_QTAGUID_PRINT_H__
+#define __XT_QTAGUID_PRINT_H__
+
+#include "xt_qtaguid_internal.h"
+
+#ifdef DDEBUG
+
+char *pp_tag_t(tag_t *tag);
+char *pp_data_counters(struct data_counters *dc, bool showValues);
+char *pp_tag_node(struct tag_node *tn);
+char *pp_tag_ref(struct tag_ref *tr);
+char *pp_tag_stat(struct tag_stat *ts);
+char *pp_iface_stat(struct iface_stat *is);
+char *pp_sock_tag(struct sock_tag *st);
+char *pp_uid_tag_data(struct uid_tag_data *qtd);
+char *pp_proc_qtu_data(struct proc_qtu_data *pqd);
+
+/*------------------------------------------*/
+void prdebug_sock_tag_list(int indent_level,
+ struct list_head *sock_tag_list);
+void prdebug_sock_tag_tree(int indent_level,
+ struct rb_root *sock_tag_tree);
+void prdebug_proc_qtu_data_tree(int indent_level,
+ struct rb_root *proc_qtu_data_tree);
+void prdebug_tag_ref_tree(int indent_level, struct rb_root *tag_ref_tree);
+void prdebug_uid_tag_data_tree(int indent_level,
+ struct rb_root *uid_tag_data_tree);
+void prdebug_tag_stat_tree(int indent_level,
+ struct rb_root *tag_stat_tree);
+void prdebug_iface_stat_list(int indent_level,
+ struct list_head *iface_stat_list);
+
+#else
+
+/*------------------------------------------*/
+static inline char *pp_tag_t(tag_t *tag)
+{
+ return NULL;
+}
+static inline char *pp_data_counters(struct data_counters *dc, bool showValues)
+{
+ return NULL;
+}
+static inline char *pp_tag_node(struct tag_node *tn)
+{
+ return NULL;
+}
+static inline char *pp_tag_ref(struct tag_ref *tr)
+{
+ return NULL;
+}
+static inline char *pp_tag_stat(struct tag_stat *ts)
+{
+ return NULL;
+}
+static inline char *pp_iface_stat(struct iface_stat *is)
+{
+ return NULL;
+}
+static inline char *pp_sock_tag(struct sock_tag *st)
+{
+ return NULL;
+}
+static inline char *pp_uid_tag_data(struct uid_tag_data *qtd)
+{
+ return NULL;
+}
+static inline char *pp_proc_qtu_data(struct proc_qtu_data *pqd)
+{
+ return NULL;
+}
+
+/*------------------------------------------*/
+static inline
+void prdebug_sock_tag_list(int indent_level,
+ struct list_head *sock_tag_list)
+{
+}
+static inline
+void prdebug_sock_tag_tree(int indent_level,
+ struct rb_root *sock_tag_tree)
+{
+}
+static inline
+void prdebug_proc_qtu_data_tree(int indent_level,
+ struct rb_root *proc_qtu_data_tree)
+{
+}
+static inline
+void prdebug_tag_ref_tree(int indent_level, struct rb_root *tag_ref_tree)
+{
+}
+static inline
+void prdebug_uid_tag_data_tree(int indent_level,
+ struct rb_root *uid_tag_data_tree)
+{
+}
+static inline
+void prdebug_tag_stat_tree(int indent_level,
+ struct rb_root *tag_stat_tree)
+{
+}
+static inline
+void prdebug_iface_stat_list(int indent_level,
+ struct list_head *iface_stat_list)
+{
+}
+#endif
+/*------------------------------------------*/
+const char *netdev_evt_str(int netdev_event);
+#endif /* ifndef __XT_QTAGUID_PRINT_H__ */
diff --git a/net/netfilter/xt_quota2.c b/net/netfilter/xt_quota2.c
new file mode 100644
index 000000000000..834594aa0085
--- /dev/null
+++ b/net/netfilter/xt_quota2.c
@@ -0,0 +1,401 @@
+/*
+ * xt_quota2 - enhanced xt_quota that can count upwards and in packets
+ * as a minimal accounting match.
+ * by Jan Engelhardt <jengelh@medozas.de>, 2008
+ *
+ * Originally based on xt_quota.c:
+ * netfilter module to enforce network quotas
+ * Sam Johnston <samj@samj.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License; either
+ * version 2 of the License, as published by the Free Software Foundation.
+ */
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <asm/atomic.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_quota2.h>
+
+#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG
+/* For compatibility, these definitions are copied from the
+ * deprecated header file <linux/netfilter_ipv4/ipt_ULOG.h> */
+#define ULOG_MAC_LEN 80
+#define ULOG_PREFIX_LEN 32
+
+/* Format of the ULOG packets passed through netlink */
+typedef struct ulog_packet_msg {
+ unsigned long mark;
+ long timestamp_sec;
+ long timestamp_usec;
+ unsigned int hook;
+ char indev_name[IFNAMSIZ];
+ char outdev_name[IFNAMSIZ];
+ size_t data_len;
+ char prefix[ULOG_PREFIX_LEN];
+ unsigned char mac_len;
+ unsigned char mac[ULOG_MAC_LEN];
+ unsigned char payload[0];
+} ulog_packet_msg_t;
+#endif
+
+/**
+ * @lock: lock to protect quota writers from each other
+ */
+struct xt_quota_counter {
+ u_int64_t quota;
+ spinlock_t lock;
+ struct list_head list;
+ atomic_t ref;
+ char name[sizeof(((struct xt_quota_mtinfo2 *)NULL)->name)];
+ struct proc_dir_entry *procfs_entry;
+};
+
+#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG
+/* Harald's favorite number +1 :D From ipt_ULOG.C */
+static int qlog_nl_event = 112;
+module_param_named(event_num, qlog_nl_event, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(event_num,
+ "Event number for NETLINK_NFLOG message. 0 disables log."
+ "111 is what ipt_ULOG uses.");
+static struct sock *nflognl;
+#endif
+
+static LIST_HEAD(counter_list);
+static DEFINE_SPINLOCK(counter_list_lock);
+
+static struct proc_dir_entry *proc_xt_quota;
+static unsigned int quota_list_perms = S_IRUGO | S_IWUSR;
+static kuid_t quota_list_uid = KUIDT_INIT(0);
+static kgid_t quota_list_gid = KGIDT_INIT(0);
+module_param_named(perms, quota_list_perms, uint, S_IRUGO | S_IWUSR);
+
+#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG
+static void quota2_log(unsigned int hooknum,
+ const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const char *prefix)
+{
+ ulog_packet_msg_t *pm;
+ struct sk_buff *log_skb;
+ size_t size;
+ struct nlmsghdr *nlh;
+
+ if (!qlog_nl_event)
+ return;
+
+ size = NLMSG_SPACE(sizeof(*pm));
+ size = max(size, (size_t)NLMSG_GOODSIZE);
+ log_skb = alloc_skb(size, GFP_ATOMIC);
+ if (!log_skb) {
+ pr_err("xt_quota2: cannot alloc skb for logging\n");
+ return;
+ }
+
+ nlh = nlmsg_put(log_skb, /*pid*/0, /*seq*/0, qlog_nl_event,
+ sizeof(*pm), 0);
+ if (!nlh) {
+ pr_err("xt_quota2: nlmsg_put failed\n");
+ kfree_skb(log_skb);
+ return;
+ }
+ pm = nlmsg_data(nlh);
+ if (skb->tstamp.tv64 == 0)
+ __net_timestamp((struct sk_buff *)skb);
+ pm->data_len = 0;
+ pm->hook = hooknum;
+ if (prefix != NULL)
+ strlcpy(pm->prefix, prefix, sizeof(pm->prefix));
+ else
+ *(pm->prefix) = '\0';
+ if (in)
+ strlcpy(pm->indev_name, in->name, sizeof(pm->indev_name));
+ else
+ pm->indev_name[0] = '\0';
+
+ if (out)
+ strlcpy(pm->outdev_name, out->name, sizeof(pm->outdev_name));
+ else
+ pm->outdev_name[0] = '\0';
+
+ NETLINK_CB(log_skb).dst_group = 1;
+ pr_debug("throwing 1 packets to netlink group 1\n");
+ netlink_broadcast(nflognl, log_skb, 0, 1, GFP_ATOMIC);
+}
+#else
+static void quota2_log(unsigned int hooknum,
+ const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const char *prefix)
+{
+}
+#endif /* if+else CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG */
+
+static ssize_t quota_proc_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct xt_quota_counter *e = PDE_DATA(file_inode(file));
+ char tmp[24];
+ size_t tmp_size;
+
+ spin_lock_bh(&e->lock);
+ tmp_size = scnprintf(tmp, sizeof(tmp), "%llu\n", e->quota);
+ spin_unlock_bh(&e->lock);
+ return simple_read_from_buffer(buf, size, ppos, tmp, tmp_size);
+}
+
+static ssize_t quota_proc_write(struct file *file, const char __user *input,
+ size_t size, loff_t *ppos)
+{
+ struct xt_quota_counter *e = PDE_DATA(file_inode(file));
+ char buf[sizeof("18446744073709551616")];
+
+ if (size > sizeof(buf))
+ size = sizeof(buf);
+ if (copy_from_user(buf, input, size) != 0)
+ return -EFAULT;
+ buf[sizeof(buf)-1] = '\0';
+
+ spin_lock_bh(&e->lock);
+ e->quota = simple_strtoull(buf, NULL, 0);
+ spin_unlock_bh(&e->lock);
+ return size;
+}
+
+static const struct file_operations q2_counter_fops = {
+ .read = quota_proc_read,
+ .write = quota_proc_write,
+ .llseek = default_llseek,
+};
+
+static struct xt_quota_counter *
+q2_new_counter(const struct xt_quota_mtinfo2 *q, bool anon)
+{
+ struct xt_quota_counter *e;
+ unsigned int size;
+
+ /* Do not need all the procfs things for anonymous counters. */
+ size = anon ? offsetof(typeof(*e), list) : sizeof(*e);
+ e = kmalloc(size, GFP_KERNEL);
+ if (e == NULL)
+ return NULL;
+
+ e->quota = q->quota;
+ spin_lock_init(&e->lock);
+ if (!anon) {
+ INIT_LIST_HEAD(&e->list);
+ atomic_set(&e->ref, 1);
+ strlcpy(e->name, q->name, sizeof(e->name));
+ }
+ return e;
+}
+
+/**
+ * q2_get_counter - get ref to counter or create new
+ * @name: name of counter
+ */
+static struct xt_quota_counter *
+q2_get_counter(const struct xt_quota_mtinfo2 *q)
+{
+ struct proc_dir_entry *p;
+ struct xt_quota_counter *e = NULL;
+ struct xt_quota_counter *new_e;
+
+ if (*q->name == '\0')
+ return q2_new_counter(q, true);
+
+ /* No need to hold a lock while getting a new counter */
+ new_e = q2_new_counter(q, false);
+ if (new_e == NULL)
+ goto out;
+
+ spin_lock_bh(&counter_list_lock);
+ list_for_each_entry(e, &counter_list, list)
+ if (strcmp(e->name, q->name) == 0) {
+ atomic_inc(&e->ref);
+ spin_unlock_bh(&counter_list_lock);
+ kfree(new_e);
+ pr_debug("xt_quota2: old counter name=%s", e->name);
+ return e;
+ }
+ e = new_e;
+ pr_debug("xt_quota2: new_counter name=%s", e->name);
+ list_add_tail(&e->list, &counter_list);
+ /* The entry having a refcount of 1 is not directly destructible.
+ * This func has not yet returned the new entry, thus iptables
+ * has not references for destroying this entry.
+ * For another rule to try to destroy it, it would 1st need for this
+ * func* to be re-invoked, acquire a new ref for the same named quota.
+ * Nobody will access the e->procfs_entry either.
+ * So release the lock. */
+ spin_unlock_bh(&counter_list_lock);
+
+ /* create_proc_entry() is not spin_lock happy */
+ p = e->procfs_entry = proc_create_data(e->name, quota_list_perms,
+ proc_xt_quota, &q2_counter_fops, e);
+
+ if (IS_ERR_OR_NULL(p)) {
+ spin_lock_bh(&counter_list_lock);
+ list_del(&e->list);
+ spin_unlock_bh(&counter_list_lock);
+ goto out;
+ }
+ proc_set_user(p, quota_list_uid, quota_list_gid);
+ return e;
+
+ out:
+ kfree(e);
+ return NULL;
+}
+
+static int quota_mt2_check(const struct xt_mtchk_param *par)
+{
+ struct xt_quota_mtinfo2 *q = par->matchinfo;
+
+ pr_debug("xt_quota2: check() flags=0x%04x", q->flags);
+
+ if (q->flags & ~XT_QUOTA_MASK)
+ return -EINVAL;
+
+ q->name[sizeof(q->name)-1] = '\0';
+ if (*q->name == '.' || strchr(q->name, '/') != NULL) {
+ printk(KERN_ERR "xt_quota.3: illegal name\n");
+ return -EINVAL;
+ }
+
+ q->master = q2_get_counter(q);
+ if (q->master == NULL) {
+ printk(KERN_ERR "xt_quota.3: memory alloc failure\n");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void quota_mt2_destroy(const struct xt_mtdtor_param *par)
+{
+ struct xt_quota_mtinfo2 *q = par->matchinfo;
+ struct xt_quota_counter *e = q->master;
+
+ if (*q->name == '\0') {
+ kfree(e);
+ return;
+ }
+
+ spin_lock_bh(&counter_list_lock);
+ if (!atomic_dec_and_test(&e->ref)) {
+ spin_unlock_bh(&counter_list_lock);
+ return;
+ }
+
+ list_del(&e->list);
+ remove_proc_entry(e->name, proc_xt_quota);
+ spin_unlock_bh(&counter_list_lock);
+ kfree(e);
+}
+
+static bool
+quota_mt2(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ struct xt_quota_mtinfo2 *q = (void *)par->matchinfo;
+ struct xt_quota_counter *e = q->master;
+ bool ret = q->flags & XT_QUOTA_INVERT;
+
+ spin_lock_bh(&e->lock);
+ if (q->flags & XT_QUOTA_GROW) {
+ /*
+ * While no_change is pointless in "grow" mode, we will
+ * implement it here simply to have a consistent behavior.
+ */
+ if (!(q->flags & XT_QUOTA_NO_CHANGE)) {
+ e->quota += (q->flags & XT_QUOTA_PACKET) ? 1 : skb->len;
+ }
+ ret = true;
+ } else {
+ if (e->quota >= skb->len) {
+ if (!(q->flags & XT_QUOTA_NO_CHANGE))
+ e->quota -= (q->flags & XT_QUOTA_PACKET) ? 1 : skb->len;
+ ret = !ret;
+ } else {
+ /* We are transitioning, log that fact. */
+ if (e->quota) {
+ quota2_log(par->hooknum,
+ skb,
+ par->in,
+ par->out,
+ q->name);
+ }
+ /* we do not allow even small packets from now on */
+ e->quota = 0;
+ }
+ }
+ spin_unlock_bh(&e->lock);
+ return ret;
+}
+
+static struct xt_match quota_mt2_reg[] __read_mostly = {
+ {
+ .name = "quota2",
+ .revision = 3,
+ .family = NFPROTO_IPV4,
+ .checkentry = quota_mt2_check,
+ .match = quota_mt2,
+ .destroy = quota_mt2_destroy,
+ .matchsize = sizeof(struct xt_quota_mtinfo2),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "quota2",
+ .revision = 3,
+ .family = NFPROTO_IPV6,
+ .checkentry = quota_mt2_check,
+ .match = quota_mt2,
+ .destroy = quota_mt2_destroy,
+ .matchsize = sizeof(struct xt_quota_mtinfo2),
+ .me = THIS_MODULE,
+ },
+};
+
+static int __init quota_mt2_init(void)
+{
+ int ret;
+ pr_debug("xt_quota2: init()");
+
+#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG
+ nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, NULL);
+ if (!nflognl)
+ return -ENOMEM;
+#endif
+
+ proc_xt_quota = proc_mkdir("xt_quota", init_net.proc_net);
+ if (proc_xt_quota == NULL)
+ return -EACCES;
+
+ ret = xt_register_matches(quota_mt2_reg, ARRAY_SIZE(quota_mt2_reg));
+ if (ret < 0)
+ remove_proc_entry("xt_quota", init_net.proc_net);
+ pr_debug("xt_quota2: init() %d", ret);
+ return ret;
+}
+
+static void __exit quota_mt2_exit(void)
+{
+ xt_unregister_matches(quota_mt2_reg, ARRAY_SIZE(quota_mt2_reg));
+ remove_proc_entry("xt_quota", init_net.proc_net);
+}
+
+module_init(quota_mt2_init);
+module_exit(quota_mt2_exit);
+MODULE_DESCRIPTION("Xtables: countdown quota match; up counter");
+MODULE_AUTHOR("Sam Johnston <samj@samj.net>");
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_quota2");
+MODULE_ALIAS("ip6t_quota2");
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 1ba67931eb1b..a3b25b227dfe 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -129,9 +129,10 @@ xt_socket_get_sock_v4(struct net *net, const u8 protocol,
return NULL;
}
-static bool
-socket_match(const struct sk_buff *skb, struct xt_action_param *par,
- const struct xt_socket_mtinfo1 *info)
+
+
+struct sock*
+xt_socket_get4_sk(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct iphdr *iph = ip_hdr(skb);
struct udphdr _hdr, *hp = NULL;
@@ -148,7 +149,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
hp = skb_header_pointer(skb, ip_hdrlen(skb),
sizeof(_hdr), &_hdr);
if (hp == NULL)
- return false;
+ return NULL;
protocol = iph->protocol;
saddr = iph->saddr;
@@ -159,9 +160,9 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
} else if (iph->protocol == IPPROTO_ICMP) {
if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr,
&sport, &dport))
- return false;
+ return NULL;
} else {
- return false;
+ return NULL;
}
#ifdef XT_SOCKET_HAVE_CONNTRACK
@@ -183,10 +184,30 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
}
#endif
- if (!sk)
+ if (sk)
+ atomic_inc(&sk->sk_refcnt);
+ else
sk = xt_socket_get_sock_v4(dev_net(skb->dev), protocol,
saddr, daddr, sport, dport,
par->in);
+
+ pr_debug("proto %hhu %pI4:%hu -> %pI4:%hu (orig %pI4:%hu) sock %p\n",
+ protocol, &saddr, ntohs(sport),
+ &daddr, ntohs(dport),
+ &iph->daddr, hp ? ntohs(hp->dest) : 0, sk);
+
+ return sk;
+}
+EXPORT_SYMBOL(xt_socket_get4_sk);
+
+static bool
+socket_match(const struct sk_buff *skb, struct xt_action_param *par,
+ const struct xt_socket_mtinfo1 *info)
+{
+ struct sock *sk;
+
+ sk = xt_socket_get4_sk(skb, par);
+
if (sk) {
bool wildcard;
bool transparent = true;
@@ -206,18 +227,12 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
(sk->sk_state == TCP_TIME_WAIT &&
inet_twsk(sk)->tw_transparent));
- if (sk != skb->sk)
- sock_gen_put(sk);
+ sock_gen_put(sk);
if (wildcard || !transparent)
sk = NULL;
}
- pr_debug("proto %hhu %pI4:%hu -> %pI4:%hu (orig %pI4:%hu) sock %p\n",
- protocol, &saddr, ntohs(sport),
- &daddr, ntohs(dport),
- &iph->daddr, hp ? ntohs(hp->dest) : 0, sk);
-
return (sk != NULL);
}
@@ -312,8 +327,8 @@ xt_socket_get_sock_v6(struct net *net, const u8 protocol,
return NULL;
}
-static bool
-socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par)
+struct sock*
+xt_socket_get6_sk(const struct sk_buff *skb, struct xt_action_param *par)
{
struct ipv6hdr *iph = ipv6_hdr(skb);
struct udphdr _hdr, *hp = NULL;
@@ -321,7 +336,6 @@ socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par)
struct in6_addr *daddr = NULL, *saddr = NULL;
__be16 uninitialized_var(dport), uninitialized_var(sport);
int thoff = 0, uninitialized_var(tproto);
- const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo;
tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL);
if (tproto < 0) {
@@ -333,7 +347,7 @@ socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par)
hp = skb_header_pointer(skb, thoff,
sizeof(_hdr), &_hdr);
if (hp == NULL)
- return false;
+ return NULL;
saddr = &iph->saddr;
sport = hp->source;
@@ -343,15 +357,37 @@ socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par)
} else if (tproto == IPPROTO_ICMPV6) {
if (extract_icmp6_fields(skb, thoff, &tproto, &saddr, &daddr,
&sport, &dport))
- return false;
+ return NULL;
} else {
- return false;
+ return NULL;
}
- if (!sk)
+ if (sk)
+ atomic_inc(&sk->sk_refcnt);
+ else
sk = xt_socket_get_sock_v6(dev_net(skb->dev), tproto,
saddr, daddr, sport, dport,
par->in);
+
+ pr_debug("proto %hhd %pI6:%hu -> %pI6:%hu "
+ "(orig %pI6:%hu) sock %p\n",
+ tproto, saddr, ntohs(sport),
+ daddr, ntohs(dport),
+ &iph->daddr, hp ? ntohs(hp->dest) : 0, sk);
+ return sk;
+}
+EXPORT_SYMBOL(xt_socket_get6_sk);
+
+static bool
+socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ struct sock *sk;
+ const struct xt_socket_mtinfo1 *info;
+
+ info = (struct xt_socket_mtinfo1 *) par->matchinfo;
+
+ sk = xt_socket_get6_sk(skb, par);
+
if (sk) {
bool wildcard;
bool transparent = true;
@@ -378,12 +414,6 @@ socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par)
sk = NULL;
}
- pr_debug("proto %hhd %pI6:%hu -> %pI6:%hu "
- "(orig %pI6:%hu) sock %p\n",
- tproto, saddr, ntohs(sport),
- daddr, ntohs(dport),
- &iph->daddr, hp ? ntohs(hp->dest) : 0, sk);
-
return (sk != NULL);
}
#endif
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 05cfee756898..2ae5ae23e073 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1429,13 +1429,16 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
return -EINVAL;
}
+ mutex_lock(&fanout_mutex);
+
+ err = -EINVAL;
if (!po->running)
- return -EINVAL;
+ goto out;
+ err = -EALREADY;
if (po->fanout)
- return -EALREADY;
+ goto out;
- mutex_lock(&fanout_mutex);
match = NULL;
list_for_each_entry(f, &fanout_list, list) {
if (f->id == id &&
@@ -1491,17 +1494,16 @@ static void fanout_release(struct sock *sk)
struct packet_sock *po = pkt_sk(sk);
struct packet_fanout *f;
- f = po->fanout;
- if (!f)
- return;
-
mutex_lock(&fanout_mutex);
- po->fanout = NULL;
+ f = po->fanout;
+ if (f) {
+ po->fanout = NULL;
- if (atomic_dec_and_test(&f->sk_ref)) {
- list_del(&f->list);
- dev_remove_pack(&f->prot_hook);
- kfree(f);
+ if (atomic_dec_and_test(&f->sk_ref)) {
+ list_del(&f->list);
+ dev_remove_pack(&f->prot_hook);
+ kfree(f);
+ }
}
mutex_unlock(&fanout_mutex);
}
diff --git a/net/rfkill/Kconfig b/net/rfkill/Kconfig
index 4c10e7e6c9f6..77aa1dfc3f88 100644
--- a/net/rfkill/Kconfig
+++ b/net/rfkill/Kconfig
@@ -10,6 +10,11 @@ menuconfig RFKILL
To compile this driver as a module, choose M here: the
module will be called rfkill.
+config RFKILL_PM
+ bool "Power off on suspend"
+ depends on RFKILL && PM
+ default y
+
# LED trigger support
config RFKILL_LEDS
bool
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index a97bb7332607..f9f430a98140 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -794,7 +794,7 @@ void rfkill_resume_polling(struct rfkill *rfkill)
}
EXPORT_SYMBOL(rfkill_resume_polling);
-static int rfkill_suspend(struct device *dev, pm_message_t state)
+static __maybe_unused int rfkill_suspend(struct device *dev, pm_message_t state)
{
struct rfkill *rfkill = to_rfkill(dev);
@@ -803,7 +803,7 @@ static int rfkill_suspend(struct device *dev, pm_message_t state)
return 0;
}
-static int rfkill_resume(struct device *dev)
+static __maybe_unused int rfkill_resume(struct device *dev)
{
struct rfkill *rfkill = to_rfkill(dev);
bool cur;
@@ -818,13 +818,14 @@ static int rfkill_resume(struct device *dev)
return 0;
}
+
static struct class rfkill_class = {
.name = "rfkill",
.dev_release = rfkill_release,
.dev_groups = rfkill_dev_groups,
.dev_uevent = rfkill_dev_uevent,
- .suspend = rfkill_suspend,
- .resume = rfkill_resume,
+ .suspend = IS_ENABLED(CONFIG_RFKILL_PM) ? rfkill_suspend : NULL,
+ .resume = IS_ENABLED(CONFIG_RFKILL_PM) ? rfkill_resume : NULL,
};
bool rfkill_blocked(struct rfkill *rfkill)
diff --git a/net/socket.c b/net/socket.c
index e72371710fe3..9af54e7bb138 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -524,9 +524,23 @@ static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer,
return used;
}
+static int sockfs_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+ int err = simple_setattr(dentry, iattr);
+
+ if (!err && (iattr->ia_valid & ATTR_UID)) {
+ struct socket *sock = SOCKET_I(dentry->d_inode);
+
+ sock->sk->sk_uid = iattr->ia_uid;
+ }
+
+ return err;
+}
+
static const struct inode_operations sockfs_inode_ops = {
.getxattr = sockfs_getxattr,
.listxattr = sockfs_listxattr,
+ .setattr = sockfs_setattr,
};
/**
@@ -1790,6 +1804,8 @@ SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len,
if (len > INT_MAX)
len = INT_MAX;
+ if (unlikely(!access_ok(VERIFY_READ, buff, len)))
+ return -EFAULT;
sock = sockfd_lookup_light(fd, &err, &fput_needed);
if (!sock)
goto out;
@@ -1849,6 +1865,8 @@ SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size,
if (size > INT_MAX)
size = INT_MAX;
+ if (unlikely(!access_ok(VERIFY_WRITE, ubuf, size)))
+ return -EFAULT;
sock = sockfd_lookup_light(fd, &err, &fput_needed);
if (!sock)
goto out;
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 6c23065490d3..174811243249 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -307,6 +307,20 @@ static void cfg80211_destroy_iface_wk(struct work_struct *work)
rtnl_unlock();
}
+static void cfg80211_sched_scan_stop_wk(struct work_struct *work)
+{
+ struct cfg80211_registered_device *rdev;
+
+ rdev = container_of(work, struct cfg80211_registered_device,
+ sched_scan_stop_wk);
+
+ rtnl_lock();
+
+ __cfg80211_stop_sched_scan(rdev, false);
+
+ rtnl_unlock();
+}
+
/* exported functions */
struct wiphy *wiphy_new(const struct cfg80211_ops *ops, int sizeof_priv)
@@ -368,6 +382,7 @@ struct wiphy *wiphy_new(const struct cfg80211_ops *ops, int sizeof_priv)
INIT_LIST_HEAD(&rdev->destroy_list);
spin_lock_init(&rdev->destroy_list_lock);
INIT_WORK(&rdev->destroy_work, cfg80211_destroy_iface_wk);
+ INIT_WORK(&rdev->sched_scan_stop_wk, cfg80211_sched_scan_stop_wk);
#ifdef CONFIG_CFG80211_DEFAULT_PS
rdev->wiphy.flags |= WIPHY_FLAG_PS_ON_BY_DEFAULT;
@@ -722,6 +737,7 @@ void wiphy_unregister(struct wiphy *wiphy)
flush_work(&rdev->event_work);
cancel_delayed_work_sync(&rdev->dfs_update_channels_wk);
flush_work(&rdev->destroy_work);
+ flush_work(&rdev->sched_scan_stop_wk);
#ifdef CONFIG_PM
if (rdev->wiphy.wowlan_config && rdev->ops->set_wakeup)
diff --git a/net/wireless/core.h b/net/wireless/core.h
index 7e3a3cef7df9..8177b2d9c3f1 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -84,6 +84,8 @@ struct cfg80211_registered_device {
struct list_head destroy_list;
struct work_struct destroy_work;
+ struct work_struct sched_scan_stop_wk;
+
/* must be last because of the way we do wiphy_priv(),
* and it should at least be aligned to NETDEV_ALIGN */
struct wiphy wiphy __aligned(NETDEV_ALIGN);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 9fb1dd399788..000771ae061c 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -5848,6 +5848,9 @@ static int nl80211_start_sched_scan(struct sk_buff *skb,
err = rdev_sched_scan_start(rdev, dev, request);
if (!err) {
+ if (info->attrs[NL80211_ATTR_IFACE_SOCKET_OWNER])
+ request->owner_nlportid = info->snd_portid;
+
rdev->sched_scan_req = request;
nl80211_send_sched_scan(rdev, dev,
NL80211_CMD_START_SCHED_SCAN);
@@ -11962,6 +11965,13 @@ static int nl80211_netlink_notify(struct notifier_block * nb,
list_for_each_entry_rcu(rdev, &cfg80211_rdev_list, list) {
bool schedule_destroy_work = false;
+ bool schedule_scan_stop = false;
+ struct cfg80211_sched_scan_request *sched_scan_req =
+ rcu_dereference(rdev->sched_scan_req);
+
+ if (sched_scan_req && notify->portid &&
+ sched_scan_req->owner_nlportid == notify->portid)
+ schedule_scan_stop = true;
list_for_each_entry_rcu(wdev, &rdev->wdev_list, list) {
cfg80211_mlme_unregister_socket(wdev, notify->portid);
@@ -11992,6 +12002,12 @@ static int nl80211_netlink_notify(struct notifier_block * nb,
spin_unlock(&rdev->destroy_list_lock);
schedule_work(&rdev->destroy_work);
}
+ } else if (schedule_scan_stop) {
+ sched_scan_req->owner_nlportid = 0;
+
+ if (rdev->ops->sched_scan_stop &&
+ rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN)
+ schedule_work(&rdev->sched_scan_stop_wk);
}
}
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index bda39f149810..9e62e9cd7146 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -56,7 +56,7 @@
* also linked into the probe response struct.
*/
-#define IEEE80211_SCAN_RESULT_EXPIRE (30 * HZ)
+#define IEEE80211_SCAN_RESULT_EXPIRE (7 * HZ)
static void bss_free(struct cfg80211_internal_bss *bss)
{
diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c
index debe733386f8..ca21ba7a0716 100644
--- a/net/xfrm/xfrm_algo.c
+++ b/net/xfrm/xfrm_algo.c
@@ -220,7 +220,7 @@ static struct xfrm_algo_desc aalg_list[] = {
.uinfo = {
.auth = {
- .icv_truncbits = 96,
+ .icv_truncbits = 128,
.icv_fullbits = 256,
}
},
diff --git a/scripts/Makefile.clean b/scripts/Makefile.clean
index a609552a86dc..cec78c48e1cf 100644
--- a/scripts/Makefile.clean
+++ b/scripts/Makefile.clean
@@ -14,7 +14,7 @@ clean := -f $(srctree)/scripts/Makefile.clean obj
# The filename Kbuild has precedence over Makefile
kbuild-dir := $(if $(filter /%,$(src)),$(src),$(srctree)/$(src))
-include $(if $(wildcard $(kbuild-dir)/Kbuild), $(kbuild-dir)/Kbuild, $(kbuild-dir)/Makefile)
+-include $(if $(wildcard $(kbuild-dir)/Kbuild), $(kbuild-dir)/Kbuild, $(kbuild-dir)/Makefile)
# Figure out what we need to build from the various variables
# ==========================================================================
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index 044eb4f89a91..cea530c7cdea 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -125,10 +125,22 @@ endif
#
ifeq ($(CONFIG_KASAN),y)
_c_flags += $(if $(patsubst n%,, \
- $(KASAN_SANITIZE_$(basetarget).o)$(KASAN_SANITIZE)y), \
+ $(KASAN_SANITIZE_$(basetarget).o)$(KASAN_SANITIZE)$(CONFIG_KASAN_SANITIZE_ALL)), \
$(CFLAGS_KASAN))
endif
+ifeq ($(CONFIG_UBSAN),y)
+_c_flags += $(if $(patsubst n%,, \
+ $(UBSAN_SANITIZE_$(basetarget).o)$(UBSAN_SANITIZE)$(CONFIG_UBSAN_SANITIZE_ALL)), \
+ $(CFLAGS_UBSAN))
+endif
+
+ifeq ($(CONFIG_KCOV),y)
+_c_flags += $(if $(patsubst n%,, \
+ $(KCOV_INSTRUMENT_$(basetarget).o)$(KCOV_INSTRUMENT)y), \
+ $(CFLAGS_KCOV))
+endif
+
# If building the kernel in a separate objtree expand all occurrences
# of -Idir to -I$(srctree)/dir except for absolute paths (starting with '/').
@@ -293,6 +305,12 @@ $(obj)/%.dtb: $(src)/%.dts FORCE
dtc-tmp = $(subst $(comma),_,$(dot-target).dts.tmp)
+# cat
+# ---------------------------------------------------------------------------
+# Concatentate multiple files together
+quiet_cmd_cat = CAT $@
+cmd_cat = (cat $(filter-out FORCE,$^) > $@) || (rm -f $@; false)
+
# Bzip2
# ---------------------------------------------------------------------------
diff --git a/scripts/Makefile.modinst b/scripts/Makefile.modinst
index e48a4e9d8868..9b7b2804e50d 100644
--- a/scripts/Makefile.modinst
+++ b/scripts/Makefile.modinst
@@ -29,7 +29,7 @@ quiet_cmd_modules_install = INSTALL $@
INSTALL_MOD_DIR ?= extra
ext-mod-dir = $(INSTALL_MOD_DIR)$(subst $(patsubst %/,%,$(KBUILD_EXTMOD)),,$(@D))
-modinst_dir = $(if $(KBUILD_EXTMOD),$(ext-mod-dir),kernel/$(@D))
+modinst_dir ?= $(if $(KBUILD_EXTMOD),$(ext-mod-dir),kernel/$(@D))
$(modules):
$(call cmd,modules_install,$(MODLIB)/$(modinst_dir))
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 51c267cf2a2f..56e7ff67b418 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -2162,6 +2162,7 @@ sub process {
# Check for improperly formed commit descriptions
if ($in_commit_log &&
+ $line !~ /^This reverts commit [0-9a-f]{7,40}/ &&
$line =~ /\bcommit\s+[0-9a-f]{5,}/i &&
!($line =~ /\b[Cc]ommit [0-9a-f]{12,40} \("/ ||
($line =~ /\b[Cc]ommit [0-9a-f]{12,40}\s*$/ &&
diff --git a/security/Kconfig b/security/Kconfig
index beb86b500adf..601882ce946a 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -18,6 +18,15 @@ config SECURITY_DMESG_RESTRICT
If you are unsure how to answer this question, answer N.
+config SECURITY_PERF_EVENTS_RESTRICT
+ bool "Restrict unprivileged use of performance events"
+ depends on PERF_EVENTS
+ help
+ If you say Y here, the kernel.perf_event_paranoid sysctl
+ will be set to 3 by default, and no unprivileged use of the
+ perf_event_open syscall will be permitted unless it is
+ changed.
+
config SECURITY
bool "Enable different security models"
depends on SYSFS
@@ -117,6 +126,46 @@ config LSM_MMAP_MIN_ADDR
this low address space will need the permission specific to the
systems running LSM.
+config HAVE_HARDENED_USERCOPY_ALLOCATOR
+ bool
+ help
+ The heap allocator implements __check_heap_object() for
+ validating memory ranges against heap object sizes in
+ support of CONFIG_HARDENED_USERCOPY.
+
+config HAVE_ARCH_HARDENED_USERCOPY
+ bool
+ help
+ The architecture supports CONFIG_HARDENED_USERCOPY by
+ calling check_object_size() just before performing the
+ userspace copies in the low level implementation of
+ copy_to_user() and copy_from_user().
+
+config HARDENED_USERCOPY
+ bool "Harden memory copies between kernel and userspace"
+ depends on HAVE_ARCH_HARDENED_USERCOPY
+ depends on HAVE_HARDENED_USERCOPY_ALLOCATOR
+ select BUG
+ help
+ This option checks for obviously wrong memory regions when
+ copying memory to/from the kernel (via copy_to_user() and
+ copy_from_user() functions) by rejecting memory ranges that
+ are larger than the specified heap object, span multiple
+ separately allocates pages, are not on the process stack,
+ or are part of the kernel text. This kills entire classes
+ of heap overflow exploits and similar kernel memory exposures.
+
+config HARDENED_USERCOPY_PAGESPAN
+ bool "Refuse to copy allocations that span multiple pages"
+ depends on HARDENED_USERCOPY
+ depends on !COMPILE_TEST
+ help
+ When a multi-page allocation is done without __GFP_COMP,
+ hardened usercopy will reject attempts to copy it. There are,
+ however, several cases of this in the kernel that have not all
+ been removed. This config is intended to be used only while
+ trying to find such users.
+
source security/selinux/Kconfig
source security/smack/Kconfig
source security/tomoyo/Kconfig
diff --git a/security/capability.c b/security/capability.c
index d68c57a62bcf..ef951b004140 100644
--- a/security/capability.c
+++ b/security/capability.c
@@ -12,6 +12,26 @@
#include <linux/security.h>
+static int cap_binder_set_context_mgr(struct task_struct *mgr)
+{
+ return 0;
+}
+
+static int cap_binder_transaction(struct task_struct *from, struct task_struct *to)
+{
+ return 0;
+}
+
+static int cap_binder_transfer_binder(struct task_struct *from, struct task_struct *to)
+{
+ return 0;
+}
+
+static int cap_binder_transfer_file(struct task_struct *from, struct task_struct *to, struct file *file)
+{
+ return 0;
+}
+
static int cap_syslog(int type)
{
return 0;
@@ -930,6 +950,10 @@ static void cap_audit_rule_free(void *lsmrule)
void __init security_fixup_ops(struct security_operations *ops)
{
+ set_to_cap_if_null(ops, binder_set_context_mgr);
+ set_to_cap_if_null(ops, binder_transaction);
+ set_to_cap_if_null(ops, binder_transfer_binder);
+ set_to_cap_if_null(ops, binder_transfer_file);
set_to_cap_if_null(ops, ptrace_access_check);
set_to_cap_if_null(ops, ptrace_traceme);
set_to_cap_if_null(ops, capget);
diff --git a/security/commoncap.c b/security/commoncap.c
index 6849e6c12987..740966aeb0bb 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -31,6 +31,10 @@
#include <linux/binfmts.h>
#include <linux/personality.h>
+#ifdef CONFIG_ANDROID_PARANOID_NETWORK
+#include <linux/android_aid.h>
+#endif
+
/*
* If a non-root user executes a setuid-root binary in
* !secure(SECURE_NOROOT) mode, then we raise capabilities.
@@ -78,6 +82,13 @@ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
{
struct user_namespace *ns = targ_ns;
+#ifdef CONFIG_ANDROID_PARANOID_NETWORK
+ if (cap == CAP_NET_RAW && in_egroup_p(AID_NET_RAW))
+ return 0;
+ if (cap == CAP_NET_ADMIN && in_egroup_p(AID_NET_ADMIN))
+ return 0;
+#endif
+
/* See if cred has the capability in the target user namespace
* by examining the target user namespace and all of the target
* user namespace's parents.
@@ -277,6 +288,16 @@ int cap_capset(struct cred *new,
new->cap_effective = *effective;
new->cap_inheritable = *inheritable;
new->cap_permitted = *permitted;
+
+ /*
+ * Mask off ambient bits that are no longer both permitted and
+ * inheritable.
+ */
+ new->cap_ambient = cap_intersect(new->cap_ambient,
+ cap_intersect(*permitted,
+ *inheritable));
+ if (WARN_ON(!cap_ambient_invariant_ok(new)))
+ return -EINVAL;
return 0;
}
@@ -357,6 +378,7 @@ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps,
/*
* pP' = (X & fP) | (pI & fI)
+ * The addition of pA' is handled later.
*/
new->cap_permitted.cap[i] =
(new->cap_bset.cap[i] & permitted) |
@@ -488,10 +510,13 @@ int cap_bprm_set_creds(struct linux_binprm *bprm)
{
const struct cred *old = current_cred();
struct cred *new = bprm->cred;
- bool effective, has_cap = false;
+ bool effective, has_cap = false, is_setid;
int ret;
kuid_t root_uid;
+ if (WARN_ON(!cap_ambient_invariant_ok(old)))
+ return -EPERM;
+
effective = false;
ret = get_file_caps(bprm, &effective, &has_cap);
if (ret < 0)
@@ -536,8 +561,9 @@ skip:
*
* In addition, if NO_NEW_PRIVS, then ensure we get no new privs.
*/
- if ((!uid_eq(new->euid, old->uid) ||
- !gid_eq(new->egid, old->gid) ||
+ is_setid = !uid_eq(new->euid, old->uid) || !gid_eq(new->egid, old->gid);
+
+ if ((is_setid ||
!cap_issubset(new->cap_permitted, old->cap_permitted)) &&
bprm->unsafe & ~LSM_UNSAFE_PTRACE_CAP) {
/* downgrade; they get no more than they had, and maybe less */
@@ -553,10 +579,28 @@ skip:
new->suid = new->fsuid = new->euid;
new->sgid = new->fsgid = new->egid;
+ /* File caps or setid cancels ambient. */
+ if (has_cap || is_setid)
+ cap_clear(new->cap_ambient);
+
+ /*
+ * Now that we've computed pA', update pP' to give:
+ * pP' = (X & fP) | (pI & fI) | pA'
+ */
+ new->cap_permitted = cap_combine(new->cap_permitted, new->cap_ambient);
+
+ /*
+ * Set pE' = (fE ? pP' : pA'). Because pA' is zero if fE is set,
+ * this is the same as pE' = (fE ? pP' : 0) | pA'.
+ */
if (effective)
new->cap_effective = new->cap_permitted;
else
- cap_clear(new->cap_effective);
+ new->cap_effective = new->cap_ambient;
+
+ if (WARN_ON(!cap_ambient_invariant_ok(new)))
+ return -EPERM;
+
bprm->cap_effective = effective;
/*
@@ -571,7 +615,7 @@ skip:
* Number 1 above might fail if you don't have a full bset, but I think
* that is interesting information to audit.
*/
- if (!cap_isclear(new->cap_effective)) {
+ if (!cap_issubset(new->cap_effective, new->cap_ambient)) {
if (!cap_issubset(CAP_FULL_SET, new->cap_effective) ||
!uid_eq(new->euid, root_uid) || !uid_eq(new->uid, root_uid) ||
issecure(SECURE_NOROOT)) {
@@ -582,6 +626,10 @@ skip:
}
new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
+
+ if (WARN_ON(!cap_ambient_invariant_ok(new)))
+ return -EPERM;
+
return 0;
}
@@ -603,7 +651,7 @@ int cap_bprm_secureexec(struct linux_binprm *bprm)
if (!uid_eq(cred->uid, root_uid)) {
if (bprm->cap_effective)
return 1;
- if (!cap_isclear(cred->cap_permitted))
+ if (!cap_issubset(cred->cap_permitted, cred->cap_ambient))
return 1;
}
@@ -705,10 +753,18 @@ static inline void cap_emulate_setxuid(struct cred *new, const struct cred *old)
uid_eq(old->suid, root_uid)) &&
(!uid_eq(new->uid, root_uid) &&
!uid_eq(new->euid, root_uid) &&
- !uid_eq(new->suid, root_uid)) &&
- !issecure(SECURE_KEEP_CAPS)) {
- cap_clear(new->cap_permitted);
- cap_clear(new->cap_effective);
+ !uid_eq(new->suid, root_uid))) {
+ if (!issecure(SECURE_KEEP_CAPS)) {
+ cap_clear(new->cap_permitted);
+ cap_clear(new->cap_effective);
+ }
+
+ /*
+ * Pre-ambient programs expect setresuid to nonroot followed
+ * by exec to drop capabilities. We should make sure that
+ * this remains the case.
+ */
+ cap_clear(new->cap_ambient);
}
if (uid_eq(old->euid, root_uid) && !uid_eq(new->euid, root_uid))
cap_clear(new->cap_effective);
@@ -938,6 +994,43 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
return commit_creds(new);
+ case PR_CAP_AMBIENT:
+ if (arg2 == PR_CAP_AMBIENT_CLEAR_ALL) {
+ if (arg3 | arg4 | arg5)
+ return -EINVAL;
+
+ new = prepare_creds();
+ if (!new)
+ return -ENOMEM;
+ cap_clear(new->cap_ambient);
+ return commit_creds(new);
+ }
+
+ if (((!cap_valid(arg3)) | arg4 | arg5))
+ return -EINVAL;
+
+ if (arg2 == PR_CAP_AMBIENT_IS_SET) {
+ return !!cap_raised(current_cred()->cap_ambient, arg3);
+ } else if (arg2 != PR_CAP_AMBIENT_RAISE &&
+ arg2 != PR_CAP_AMBIENT_LOWER) {
+ return -EINVAL;
+ } else {
+ if (arg2 == PR_CAP_AMBIENT_RAISE &&
+ (!cap_raised(current_cred()->cap_permitted, arg3) ||
+ !cap_raised(current_cred()->cap_inheritable,
+ arg3)))
+ return -EPERM;
+
+ new = prepare_creds();
+ if (!new)
+ return -ENOMEM;
+ if (arg2 == PR_CAP_AMBIENT_RAISE)
+ cap_raise(new->cap_ambient, arg3);
+ else
+ cap_lower(new->cap_ambient, arg3);
+ return commit_creds(new);
+ }
+
default:
/* No functionality available - continue with default */
return -ENOSYS;
diff --git a/security/inode.c b/security/inode.c
index 8e7ca62078ab..b5f9474e58d8 100644
--- a/security/inode.c
+++ b/security/inode.c
@@ -105,7 +105,7 @@ struct dentry *securityfs_create_file(const char *name, umode_t mode,
dir = parent->d_inode;
mutex_lock(&dir->i_mutex);
- dentry = lookup_one_len(name, parent, strlen(name));
+ dentry = lookup_one_len2(name, mount, parent, strlen(name));
if (IS_ERR(dentry))
goto out;
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index 162077db5f81..01507eaa730c 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -859,6 +859,7 @@ void key_change_session_keyring(struct callback_head *twork)
new->cap_inheritable = old->cap_inheritable;
new->cap_permitted = old->cap_permitted;
new->cap_effective = old->cap_effective;
+ new->cap_ambient = old->cap_ambient;
new->cap_bset = old->cap_bset;
new->jit_keyring = old->jit_keyring;
diff --git a/security/lsm_audit.c b/security/lsm_audit.c
index 69fdf3bc765b..0099443e439a 100644
--- a/security/lsm_audit.c
+++ b/security/lsm_audit.c
@@ -220,7 +220,7 @@ static void dump_common_audit_data(struct audit_buffer *ab,
*/
BUILD_BUG_ON(sizeof(a->u) > sizeof(void *)*2);
- audit_log_format(ab, " pid=%d comm=", task_pid_nr(tsk));
+ audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current));
audit_log_untrustedstring(ab, tsk->comm);
switch (a->type) {
@@ -245,6 +245,21 @@ static void dump_common_audit_data(struct audit_buffer *ab,
}
break;
}
+ case LSM_AUDIT_DATA_IOCTL_OP: {
+ struct inode *inode;
+
+ audit_log_d_path(ab, " path=", &a->u.op->path);
+
+ inode = a->u.op->path.dentry->d_inode;
+ if (inode) {
+ audit_log_format(ab, " dev=");
+ audit_log_untrustedstring(ab, inode->i_sb->s_id);
+ audit_log_format(ab, " ino=%lu", inode->i_ino);
+ }
+
+ audit_log_format(ab, " ioctlcmd=%hx", a->u.op->cmd);
+ break;
+ }
case LSM_AUDIT_DATA_DENTRY: {
struct inode *inode;
@@ -279,7 +294,7 @@ static void dump_common_audit_data(struct audit_buffer *ab,
case LSM_AUDIT_DATA_TASK:
tsk = a->u.tsk;
if (tsk) {
- pid_t pid = task_pid_nr(tsk);
+ pid_t pid = task_tgid_nr(tsk);
if (pid) {
audit_log_format(ab, " pid=%d comm=", pid);
audit_log_untrustedstring(ab, tsk->comm);
diff --git a/security/security.c b/security/security.c
index 18b35c63fc0c..a00df76114bc 100644
--- a/security/security.c
+++ b/security/security.c
@@ -135,6 +135,26 @@ int __init register_security(struct security_operations *ops)
/* Security operations */
+int security_binder_set_context_mgr(struct task_struct *mgr)
+{
+ return security_ops->binder_set_context_mgr(mgr);
+}
+
+int security_binder_transaction(struct task_struct *from, struct task_struct *to)
+{
+ return security_ops->binder_transaction(from, to);
+}
+
+int security_binder_transfer_binder(struct task_struct *from, struct task_struct *to)
+{
+ return security_ops->binder_transfer_binder(from, to);
+}
+
+int security_binder_transfer_file(struct task_struct *from, struct task_struct *to, struct file *file)
+{
+ return security_ops->binder_transfer_file(from, to, file);
+}
+
int security_ptrace_access_check(struct task_struct *child, unsigned int mode)
{
#ifdef CONFIG_SECURITY_YAMA_STACKED
@@ -472,6 +492,7 @@ int security_path_chown(struct path *path, kuid_t uid, kgid_t gid)
return 0;
return security_ops->path_chown(path, uid, gid);
}
+EXPORT_SYMBOL(security_path_chown);
int security_path_chroot(struct path *path)
{
diff --git a/security/selinux/avc.c b/security/selinux/avc.c
index a18f1fa6440b..fd33dbce9d8a 100644
--- a/security/selinux/avc.c
+++ b/security/selinux/avc.c
@@ -22,6 +22,7 @@
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/percpu.h>
+#include <linux/list.h>
#include <net/sock.h>
#include <linux/un.h>
#include <net/af_unix.h>
@@ -48,6 +49,7 @@ struct avc_entry {
u32 tsid;
u16 tclass;
struct av_decision avd;
+ struct avc_xperms_node *xp_node;
};
struct avc_node {
@@ -56,6 +58,16 @@ struct avc_node {
struct rcu_head rhead;
};
+struct avc_xperms_decision_node {
+ struct extended_perms_decision xpd;
+ struct list_head xpd_list; /* list of extended_perms_decision */
+};
+
+struct avc_xperms_node {
+ struct extended_perms xp;
+ struct list_head xpd_head; /* list head of extended_perms_decision */
+};
+
struct avc_cache {
struct hlist_head slots[AVC_CACHE_SLOTS]; /* head for avc_node->list */
spinlock_t slots_lock[AVC_CACHE_SLOTS]; /* lock for writes */
@@ -80,6 +92,9 @@ DEFINE_PER_CPU(struct avc_cache_stats, avc_cache_stats) = { 0 };
static struct avc_cache avc_cache;
static struct avc_callback_node *avc_callbacks;
static struct kmem_cache *avc_node_cachep;
+static struct kmem_cache *avc_xperms_data_cachep;
+static struct kmem_cache *avc_xperms_decision_cachep;
+static struct kmem_cache *avc_xperms_cachep;
static inline int avc_hash(u32 ssid, u32 tsid, u16 tclass)
{
@@ -170,7 +185,17 @@ void __init avc_init(void)
atomic_set(&avc_cache.lru_hint, 0);
avc_node_cachep = kmem_cache_create("avc_node", sizeof(struct avc_node),
- 0, SLAB_PANIC, NULL);
+ 0, SLAB_PANIC, NULL);
+ avc_xperms_cachep = kmem_cache_create("avc_xperms_node",
+ sizeof(struct avc_xperms_node),
+ 0, SLAB_PANIC, NULL);
+ avc_xperms_decision_cachep = kmem_cache_create(
+ "avc_xperms_decision_node",
+ sizeof(struct avc_xperms_decision_node),
+ 0, SLAB_PANIC, NULL);
+ avc_xperms_data_cachep = kmem_cache_create("avc_xperms_data",
+ sizeof(struct extended_perms_data),
+ 0, SLAB_PANIC, NULL);
audit_log(current->audit_context, GFP_KERNEL, AUDIT_KERNEL, "AVC INITIALIZED\n");
}
@@ -205,9 +230,261 @@ int avc_get_hash_stats(char *page)
slots_used, AVC_CACHE_SLOTS, max_chain_len);
}
+/*
+ * using a linked list for extended_perms_decision lookup because the list is
+ * always small. i.e. less than 5, typically 1
+ */
+static struct extended_perms_decision *avc_xperms_decision_lookup(u8 driver,
+ struct avc_xperms_node *xp_node)
+{
+ struct avc_xperms_decision_node *xpd_node;
+
+ list_for_each_entry(xpd_node, &xp_node->xpd_head, xpd_list) {
+ if (xpd_node->xpd.driver == driver)
+ return &xpd_node->xpd;
+ }
+ return NULL;
+}
+
+static inline unsigned int
+avc_xperms_has_perm(struct extended_perms_decision *xpd,
+ u8 perm, u8 which)
+{
+ unsigned int rc = 0;
+
+ if ((which == XPERMS_ALLOWED) &&
+ (xpd->used & XPERMS_ALLOWED))
+ rc = security_xperm_test(xpd->allowed->p, perm);
+ else if ((which == XPERMS_AUDITALLOW) &&
+ (xpd->used & XPERMS_AUDITALLOW))
+ rc = security_xperm_test(xpd->auditallow->p, perm);
+ else if ((which == XPERMS_DONTAUDIT) &&
+ (xpd->used & XPERMS_DONTAUDIT))
+ rc = security_xperm_test(xpd->dontaudit->p, perm);
+ return rc;
+}
+
+static void avc_xperms_allow_perm(struct avc_xperms_node *xp_node,
+ u8 driver, u8 perm)
+{
+ struct extended_perms_decision *xpd;
+ security_xperm_set(xp_node->xp.drivers.p, driver);
+ xpd = avc_xperms_decision_lookup(driver, xp_node);
+ if (xpd && xpd->allowed)
+ security_xperm_set(xpd->allowed->p, perm);
+}
+
+static void avc_xperms_decision_free(struct avc_xperms_decision_node *xpd_node)
+{
+ struct extended_perms_decision *xpd;
+
+ xpd = &xpd_node->xpd;
+ if (xpd->allowed)
+ kmem_cache_free(avc_xperms_data_cachep, xpd->allowed);
+ if (xpd->auditallow)
+ kmem_cache_free(avc_xperms_data_cachep, xpd->auditallow);
+ if (xpd->dontaudit)
+ kmem_cache_free(avc_xperms_data_cachep, xpd->dontaudit);
+ kmem_cache_free(avc_xperms_decision_cachep, xpd_node);
+}
+
+static void avc_xperms_free(struct avc_xperms_node *xp_node)
+{
+ struct avc_xperms_decision_node *xpd_node, *tmp;
+
+ if (!xp_node)
+ return;
+
+ list_for_each_entry_safe(xpd_node, tmp, &xp_node->xpd_head, xpd_list) {
+ list_del(&xpd_node->xpd_list);
+ avc_xperms_decision_free(xpd_node);
+ }
+ kmem_cache_free(avc_xperms_cachep, xp_node);
+}
+
+static void avc_copy_xperms_decision(struct extended_perms_decision *dest,
+ struct extended_perms_decision *src)
+{
+ dest->driver = src->driver;
+ dest->used = src->used;
+ if (dest->used & XPERMS_ALLOWED)
+ memcpy(dest->allowed->p, src->allowed->p,
+ sizeof(src->allowed->p));
+ if (dest->used & XPERMS_AUDITALLOW)
+ memcpy(dest->auditallow->p, src->auditallow->p,
+ sizeof(src->auditallow->p));
+ if (dest->used & XPERMS_DONTAUDIT)
+ memcpy(dest->dontaudit->p, src->dontaudit->p,
+ sizeof(src->dontaudit->p));
+}
+
+/*
+ * similar to avc_copy_xperms_decision, but only copy decision
+ * information relevant to this perm
+ */
+static inline void avc_quick_copy_xperms_decision(u8 perm,
+ struct extended_perms_decision *dest,
+ struct extended_perms_decision *src)
+{
+ /*
+ * compute index of the u32 of the 256 bits (8 u32s) that contain this
+ * command permission
+ */
+ u8 i = perm >> 5;
+
+ dest->used = src->used;
+ if (dest->used & XPERMS_ALLOWED)
+ dest->allowed->p[i] = src->allowed->p[i];
+ if (dest->used & XPERMS_AUDITALLOW)
+ dest->auditallow->p[i] = src->auditallow->p[i];
+ if (dest->used & XPERMS_DONTAUDIT)
+ dest->dontaudit->p[i] = src->dontaudit->p[i];
+}
+
+static struct avc_xperms_decision_node
+ *avc_xperms_decision_alloc(u8 which)
+{
+ struct avc_xperms_decision_node *xpd_node;
+ struct extended_perms_decision *xpd;
+
+ xpd_node = kmem_cache_zalloc(avc_xperms_decision_cachep,
+ GFP_ATOMIC | __GFP_NOMEMALLOC);
+ if (!xpd_node)
+ return NULL;
+
+ xpd = &xpd_node->xpd;
+ if (which & XPERMS_ALLOWED) {
+ xpd->allowed = kmem_cache_zalloc(avc_xperms_data_cachep,
+ GFP_ATOMIC | __GFP_NOMEMALLOC);
+ if (!xpd->allowed)
+ goto error;
+ }
+ if (which & XPERMS_AUDITALLOW) {
+ xpd->auditallow = kmem_cache_zalloc(avc_xperms_data_cachep,
+ GFP_ATOMIC | __GFP_NOMEMALLOC);
+ if (!xpd->auditallow)
+ goto error;
+ }
+ if (which & XPERMS_DONTAUDIT) {
+ xpd->dontaudit = kmem_cache_zalloc(avc_xperms_data_cachep,
+ GFP_ATOMIC | __GFP_NOMEMALLOC);
+ if (!xpd->dontaudit)
+ goto error;
+ }
+ return xpd_node;
+error:
+ avc_xperms_decision_free(xpd_node);
+ return NULL;
+}
+
+static int avc_add_xperms_decision(struct avc_node *node,
+ struct extended_perms_decision *src)
+{
+ struct avc_xperms_decision_node *dest_xpd;
+
+ node->ae.xp_node->xp.len++;
+ dest_xpd = avc_xperms_decision_alloc(src->used);
+ if (!dest_xpd)
+ return -ENOMEM;
+ avc_copy_xperms_decision(&dest_xpd->xpd, src);
+ list_add(&dest_xpd->xpd_list, &node->ae.xp_node->xpd_head);
+ return 0;
+}
+
+static struct avc_xperms_node *avc_xperms_alloc(void)
+{
+ struct avc_xperms_node *xp_node;
+
+ xp_node = kmem_cache_zalloc(avc_xperms_cachep,
+ GFP_ATOMIC|__GFP_NOMEMALLOC);
+ if (!xp_node)
+ return xp_node;
+ INIT_LIST_HEAD(&xp_node->xpd_head);
+ return xp_node;
+}
+
+static int avc_xperms_populate(struct avc_node *node,
+ struct avc_xperms_node *src)
+{
+ struct avc_xperms_node *dest;
+ struct avc_xperms_decision_node *dest_xpd;
+ struct avc_xperms_decision_node *src_xpd;
+
+ if (src->xp.len == 0)
+ return 0;
+ dest = avc_xperms_alloc();
+ if (!dest)
+ return -ENOMEM;
+
+ memcpy(dest->xp.drivers.p, src->xp.drivers.p, sizeof(dest->xp.drivers.p));
+ dest->xp.len = src->xp.len;
+
+ /* for each source xpd allocate a destination xpd and copy */
+ list_for_each_entry(src_xpd, &src->xpd_head, xpd_list) {
+ dest_xpd = avc_xperms_decision_alloc(src_xpd->xpd.used);
+ if (!dest_xpd)
+ goto error;
+ avc_copy_xperms_decision(&dest_xpd->xpd, &src_xpd->xpd);
+ list_add(&dest_xpd->xpd_list, &dest->xpd_head);
+ }
+ node->ae.xp_node = dest;
+ return 0;
+error:
+ avc_xperms_free(dest);
+ return -ENOMEM;
+
+}
+
+static inline u32 avc_xperms_audit_required(u32 requested,
+ struct av_decision *avd,
+ struct extended_perms_decision *xpd,
+ u8 perm,
+ int result,
+ u32 *deniedp)
+{
+ u32 denied, audited;
+
+ denied = requested & ~avd->allowed;
+ if (unlikely(denied)) {
+ audited = denied & avd->auditdeny;
+ if (audited && xpd) {
+ if (avc_xperms_has_perm(xpd, perm, XPERMS_DONTAUDIT))
+ audited &= ~requested;
+ }
+ } else if (result) {
+ audited = denied = requested;
+ } else {
+ audited = requested & avd->auditallow;
+ if (audited && xpd) {
+ if (!avc_xperms_has_perm(xpd, perm, XPERMS_AUDITALLOW))
+ audited &= ~requested;
+ }
+ }
+
+ *deniedp = denied;
+ return audited;
+}
+
+static inline int avc_xperms_audit(u32 ssid, u32 tsid, u16 tclass,
+ u32 requested, struct av_decision *avd,
+ struct extended_perms_decision *xpd,
+ u8 perm, int result,
+ struct common_audit_data *ad)
+{
+ u32 audited, denied;
+
+ audited = avc_xperms_audit_required(
+ requested, avd, xpd, perm, result, &denied);
+ if (likely(!audited))
+ return 0;
+ return slow_avc_audit(ssid, tsid, tclass, requested,
+ audited, denied, result, ad, 0);
+}
+
static void avc_node_free(struct rcu_head *rhead)
{
struct avc_node *node = container_of(rhead, struct avc_node, rhead);
+ avc_xperms_free(node->ae.xp_node);
kmem_cache_free(avc_node_cachep, node);
avc_cache_stats_incr(frees);
}
@@ -221,6 +498,7 @@ static void avc_node_delete(struct avc_node *node)
static void avc_node_kill(struct avc_node *node)
{
+ avc_xperms_free(node->ae.xp_node);
kmem_cache_free(avc_node_cachep, node);
avc_cache_stats_incr(frees);
atomic_dec(&avc_cache.active_nodes);
@@ -367,6 +645,7 @@ static int avc_latest_notif_update(int seqno, int is_insert)
* @tsid: target security identifier
* @tclass: target security class
* @avd: resulting av decision
+ * @xp_node: resulting extended permissions
*
* Insert an AVC entry for the SID pair
* (@ssid, @tsid) and class @tclass.
@@ -378,7 +657,9 @@ static int avc_latest_notif_update(int seqno, int is_insert)
* the access vectors into a cache entry, returns
* avc_node inserted. Otherwise, this function returns NULL.
*/
-static struct avc_node *avc_insert(u32 ssid, u32 tsid, u16 tclass, struct av_decision *avd)
+static struct avc_node *avc_insert(u32 ssid, u32 tsid, u16 tclass,
+ struct av_decision *avd,
+ struct avc_xperms_node *xp_node)
{
struct avc_node *pos, *node = NULL;
int hvalue;
@@ -391,10 +672,15 @@ static struct avc_node *avc_insert(u32 ssid, u32 tsid, u16 tclass, struct av_dec
if (node) {
struct hlist_head *head;
spinlock_t *lock;
+ int rc = 0;
hvalue = avc_hash(ssid, tsid, tclass);
avc_node_populate(node, ssid, tsid, tclass, avd);
-
+ rc = avc_xperms_populate(node, xp_node);
+ if (rc) {
+ kmem_cache_free(avc_node_cachep, node);
+ return NULL;
+ }
head = &avc_cache.slots[hvalue];
lock = &avc_cache.slots_lock[hvalue];
@@ -528,14 +814,17 @@ static inline int avc_sidcmp(u32 x, u32 y)
* @perms : Permission mask bits
* @ssid,@tsid,@tclass : identifier of an AVC entry
* @seqno : sequence number when decision was made
+ * @xpd: extended_perms_decision to be added to the node
*
* if a valid AVC entry doesn't exist,this function returns -ENOENT.
* if kmalloc() called internal returns NULL, this function returns -ENOMEM.
* otherwise, this function updates the AVC entry. The original AVC-entry object
* will release later by RCU.
*/
-static int avc_update_node(u32 event, u32 perms, u32 ssid, u32 tsid, u16 tclass,
- u32 seqno)
+static int avc_update_node(u32 event, u32 perms, u8 driver, u8 xperm, u32 ssid,
+ u32 tsid, u16 tclass, u32 seqno,
+ struct extended_perms_decision *xpd,
+ u32 flags)
{
int hvalue, rc = 0;
unsigned long flag;
@@ -579,9 +868,19 @@ static int avc_update_node(u32 event, u32 perms, u32 ssid, u32 tsid, u16 tclass,
avc_node_populate(node, ssid, tsid, tclass, &orig->ae.avd);
+ if (orig->ae.xp_node) {
+ rc = avc_xperms_populate(node, orig->ae.xp_node);
+ if (rc) {
+ kmem_cache_free(avc_node_cachep, node);
+ goto out_unlock;
+ }
+ }
+
switch (event) {
case AVC_CALLBACK_GRANT:
node->ae.avd.allowed |= perms;
+ if (node->ae.xp_node && (flags & AVC_EXTENDED_PERMS))
+ avc_xperms_allow_perm(node->ae.xp_node, driver, xperm);
break;
case AVC_CALLBACK_TRY_REVOKE:
case AVC_CALLBACK_REVOKE:
@@ -599,6 +898,9 @@ static int avc_update_node(u32 event, u32 perms, u32 ssid, u32 tsid, u16 tclass,
case AVC_CALLBACK_AUDITDENY_DISABLE:
node->ae.avd.auditdeny &= ~perms;
break;
+ case AVC_CALLBACK_ADD_XPERMS:
+ avc_add_xperms_decision(node, xpd);
+ break;
}
avc_node_replace(node, orig);
out_unlock:
@@ -670,18 +972,20 @@ int avc_ss_reset(u32 seqno)
* results in a bigger stack frame.
*/
static noinline struct avc_node *avc_compute_av(u32 ssid, u32 tsid,
- u16 tclass, struct av_decision *avd)
+ u16 tclass, struct av_decision *avd,
+ struct avc_xperms_node *xp_node)
{
rcu_read_unlock();
- security_compute_av(ssid, tsid, tclass, avd);
+ INIT_LIST_HEAD(&xp_node->xpd_head);
+ security_compute_av(ssid, tsid, tclass, avd, &xp_node->xp);
rcu_read_lock();
- return avc_insert(ssid, tsid, tclass, avd);
+ return avc_insert(ssid, tsid, tclass, avd, xp_node);
}
static noinline int avc_denied(u32 ssid, u32 tsid,
- u16 tclass, u32 requested,
- unsigned flags,
- struct av_decision *avd)
+ u16 tclass, u32 requested,
+ u8 driver, u8 xperm, unsigned flags,
+ struct av_decision *avd)
{
if (flags & AVC_STRICT)
return -EACCES;
@@ -689,11 +993,91 @@ static noinline int avc_denied(u32 ssid, u32 tsid,
if (selinux_enforcing && !(avd->flags & AVD_FLAGS_PERMISSIVE))
return -EACCES;
- avc_update_node(AVC_CALLBACK_GRANT, requested, ssid,
- tsid, tclass, avd->seqno);
+ avc_update_node(AVC_CALLBACK_GRANT, requested, driver, xperm, ssid,
+ tsid, tclass, avd->seqno, NULL, flags);
return 0;
}
+/*
+ * The avc extended permissions logic adds an additional 256 bits of
+ * permissions to an avc node when extended permissions for that node are
+ * specified in the avtab. If the additional 256 permissions is not adequate,
+ * as-is the case with ioctls, then multiple may be chained together and the
+ * driver field is used to specify which set contains the permission.
+ */
+int avc_has_extended_perms(u32 ssid, u32 tsid, u16 tclass, u32 requested,
+ u8 driver, u8 xperm, struct common_audit_data *ad)
+{
+ struct avc_node *node;
+ struct av_decision avd;
+ u32 denied;
+ struct extended_perms_decision local_xpd;
+ struct extended_perms_decision *xpd = NULL;
+ struct extended_perms_data allowed;
+ struct extended_perms_data auditallow;
+ struct extended_perms_data dontaudit;
+ struct avc_xperms_node local_xp_node;
+ struct avc_xperms_node *xp_node;
+ int rc = 0, rc2;
+
+ xp_node = &local_xp_node;
+ BUG_ON(!requested);
+
+ rcu_read_lock();
+
+ node = avc_lookup(ssid, tsid, tclass);
+ if (unlikely(!node)) {
+ node = avc_compute_av(ssid, tsid, tclass, &avd, xp_node);
+ } else {
+ memcpy(&avd, &node->ae.avd, sizeof(avd));
+ xp_node = node->ae.xp_node;
+ }
+ /* if extended permissions are not defined, only consider av_decision */
+ if (!xp_node || !xp_node->xp.len)
+ goto decision;
+
+ local_xpd.allowed = &allowed;
+ local_xpd.auditallow = &auditallow;
+ local_xpd.dontaudit = &dontaudit;
+
+ xpd = avc_xperms_decision_lookup(driver, xp_node);
+ if (unlikely(!xpd)) {
+ /*
+ * Compute the extended_perms_decision only if the driver
+ * is flagged
+ */
+ if (!security_xperm_test(xp_node->xp.drivers.p, driver)) {
+ avd.allowed &= ~requested;
+ goto decision;
+ }
+ rcu_read_unlock();
+ security_compute_xperms_decision(ssid, tsid, tclass, driver,
+ &local_xpd);
+ rcu_read_lock();
+ avc_update_node(AVC_CALLBACK_ADD_XPERMS, requested, driver, xperm,
+ ssid, tsid, tclass, avd.seqno, &local_xpd, 0);
+ } else {
+ avc_quick_copy_xperms_decision(xperm, &local_xpd, xpd);
+ }
+ xpd = &local_xpd;
+
+ if (!avc_xperms_has_perm(xpd, xperm, XPERMS_ALLOWED))
+ avd.allowed &= ~requested;
+
+decision:
+ denied = requested & ~(avd.allowed);
+ if (unlikely(denied))
+ rc = avc_denied(ssid, tsid, tclass, requested, driver, xperm,
+ AVC_EXTENDED_PERMS, &avd);
+
+ rcu_read_unlock();
+
+ rc2 = avc_xperms_audit(ssid, tsid, tclass, requested,
+ &avd, xpd, xperm, rc, ad);
+ if (rc2)
+ return rc2;
+ return rc;
+}
/**
* avc_has_perm_noaudit - Check permissions but perform no auditing.
@@ -721,6 +1105,7 @@ inline int avc_has_perm_noaudit(u32 ssid, u32 tsid,
struct av_decision *avd)
{
struct avc_node *node;
+ struct avc_xperms_node xp_node;
int rc = 0;
u32 denied;
@@ -729,16 +1114,14 @@ inline int avc_has_perm_noaudit(u32 ssid, u32 tsid,
rcu_read_lock();
node = avc_lookup(ssid, tsid, tclass);
- if (unlikely(!node)) {
- node = avc_compute_av(ssid, tsid, tclass, avd);
- } else {
+ if (unlikely(!node))
+ node = avc_compute_av(ssid, tsid, tclass, avd, &xp_node);
+ else
memcpy(avd, &node->ae.avd, sizeof(*avd));
- avd = &node->ae.avd;
- }
denied = requested & ~(avd->allowed);
if (unlikely(denied))
- rc = avc_denied(ssid, tsid, tclass, requested, flags, avd);
+ rc = avc_denied(ssid, tsid, tclass, requested, 0, 0, flags, avd);
rcu_read_unlock();
return rc;
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index be04b056c1bd..c49ed3df3f32 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -403,24 +403,15 @@ static int selinux_is_sblabel_mnt(struct super_block *sb)
{
struct superblock_security_struct *sbsec = sb->s_security;
- if (sbsec->behavior == SECURITY_FS_USE_XATTR ||
- sbsec->behavior == SECURITY_FS_USE_TRANS ||
- sbsec->behavior == SECURITY_FS_USE_TASK ||
- sbsec->behavior == SECURITY_FS_USE_NATIVE)
- return 1;
-
- /* Special handling for sysfs. Is genfs but also has setxattr handler*/
- if (strncmp(sb->s_type->name, "sysfs", sizeof("sysfs")) == 0)
- return 1;
-
- /*
- * Special handling for rootfs. Is genfs but supports
- * setting SELinux context on in-core inodes.
- */
- if (strncmp(sb->s_type->name, "rootfs", sizeof("rootfs")) == 0)
- return 1;
-
- return 0;
+ return sbsec->behavior == SECURITY_FS_USE_XATTR ||
+ sbsec->behavior == SECURITY_FS_USE_TRANS ||
+ sbsec->behavior == SECURITY_FS_USE_TASK ||
+ sbsec->behavior == SECURITY_FS_USE_NATIVE ||
+ /* Special handling. Genfs but also in-core setxattr handler */
+ !strcmp(sb->s_type->name, "sysfs") ||
+ !strcmp(sb->s_type->name, "pstore") ||
+ !strcmp(sb->s_type->name, "debugfs") ||
+ !strcmp(sb->s_type->name, "rootfs");
}
static int sb_finish_set_opts(struct super_block *sb)
@@ -741,7 +732,13 @@ static int selinux_set_mnt_opts(struct super_block *sb,
}
if (strcmp(sb->s_type->name, "proc") == 0)
- sbsec->flags |= SE_SBPROC;
+ sbsec->flags |= SE_SBPROC | SE_SBGENFS;
+
+ if (!strcmp(sb->s_type->name, "debugfs") ||
+ !strcmp(sb->s_type->name, "tracefs") ||
+ !strcmp(sb->s_type->name, "sysfs") ||
+ !strcmp(sb->s_type->name, "pstore"))
+ sbsec->flags |= SE_SBGENFS;
if (!sbsec->behavior) {
/*
@@ -1237,12 +1234,13 @@ static inline u16 socket_type_to_security_class(int family, int type, int protoc
return SECCLASS_SOCKET;
}
-#ifdef CONFIG_PROC_FS
-static int selinux_proc_get_sid(struct dentry *dentry,
- u16 tclass,
- u32 *sid)
+static int selinux_genfs_get_sid(struct dentry *dentry,
+ u16 tclass,
+ u16 flags,
+ u32 *sid)
{
int rc;
+ struct super_block *sb = dentry->d_inode->i_sb;
char *buffer, *path;
buffer = (char *)__get_free_page(GFP_KERNEL);
@@ -1253,26 +1251,20 @@ static int selinux_proc_get_sid(struct dentry *dentry,
if (IS_ERR(path))
rc = PTR_ERR(path);
else {
- /* each process gets a /proc/PID/ entry. Strip off the
- * PID part to get a valid selinux labeling.
- * e.g. /proc/1/net/rpc/nfs -> /net/rpc/nfs */
- while (path[1] >= '0' && path[1] <= '9') {
- path[1] = '/';
- path++;
+ if (flags & SE_SBPROC) {
+ /* each process gets a /proc/PID/ entry. Strip off the
+ * PID part to get a valid selinux labeling.
+ * e.g. /proc/1/net/rpc/nfs -> /net/rpc/nfs */
+ while (path[1] >= '0' && path[1] <= '9') {
+ path[1] = '/';
+ path++;
+ }
}
- rc = security_genfs_sid("proc", path, tclass, sid);
+ rc = security_genfs_sid(sb->s_type->name, path, tclass, sid);
}
free_page((unsigned long)buffer);
return rc;
}
-#else
-static int selinux_proc_get_sid(struct dentry *dentry,
- u16 tclass,
- u32 *sid)
-{
- return -EINVAL;
-}
-#endif
/* The inode's security attributes must be initialized before first use. */
static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dentry)
@@ -1429,7 +1421,7 @@ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dent
/* Default to the fs superblock SID. */
isec->sid = sbsec->sid;
- if ((sbsec->flags & SE_SBPROC) && !S_ISLNK(inode->i_mode)) {
+ if ((sbsec->flags & SE_SBGENFS) && !S_ISLNK(inode->i_mode)) {
/* We must have a dentry to determine the label on
* procfs inodes */
if (opt_dentry)
@@ -1452,7 +1444,8 @@ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dent
if (!dentry)
goto out_unlock;
isec->sclass = inode_mode_to_security_class(inode->i_mode);
- rc = selinux_proc_get_sid(dentry, isec->sclass, &sid);
+ rc = selinux_genfs_get_sid(dentry, isec->sclass,
+ sbsec->flags, &sid);
dput(dentry);
if (rc)
goto out_unlock;
@@ -1936,6 +1929,65 @@ static inline u32 open_file_to_av(struct file *file)
/* Hook functions begin here. */
+static int selinux_binder_set_context_mgr(struct task_struct *mgr)
+{
+ u32 mysid = current_sid();
+ u32 mgrsid = task_sid(mgr);
+
+ return avc_has_perm(mysid, mgrsid, SECCLASS_BINDER, BINDER__SET_CONTEXT_MGR, NULL);
+}
+
+static int selinux_binder_transaction(struct task_struct *from, struct task_struct *to)
+{
+ u32 mysid = current_sid();
+ u32 fromsid = task_sid(from);
+ u32 tosid = task_sid(to);
+ int rc;
+
+ if (mysid != fromsid) {
+ rc = avc_has_perm(mysid, fromsid, SECCLASS_BINDER, BINDER__IMPERSONATE, NULL);
+ if (rc)
+ return rc;
+ }
+
+ return avc_has_perm(fromsid, tosid, SECCLASS_BINDER, BINDER__CALL, NULL);
+}
+
+static int selinux_binder_transfer_binder(struct task_struct *from, struct task_struct *to)
+{
+ u32 fromsid = task_sid(from);
+ u32 tosid = task_sid(to);
+ return avc_has_perm(fromsid, tosid, SECCLASS_BINDER, BINDER__TRANSFER, NULL);
+}
+
+static int selinux_binder_transfer_file(struct task_struct *from, struct task_struct *to, struct file *file)
+{
+ u32 sid = task_sid(to);
+ struct file_security_struct *fsec = file->f_security;
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode_security_struct *isec = inode->i_security;
+ struct common_audit_data ad;
+ int rc;
+
+ ad.type = LSM_AUDIT_DATA_PATH;
+ ad.u.path = file->f_path;
+
+ if (sid != fsec->sid) {
+ rc = avc_has_perm(sid, fsec->sid,
+ SECCLASS_FD,
+ FD__USE,
+ &ad);
+ if (rc)
+ return rc;
+ }
+
+ if (unlikely(IS_PRIVATE(inode)))
+ return 0;
+
+ return avc_has_perm(sid, isec->sid, isec->sclass, file_to_av(file),
+ &ad);
+}
+
static int selinux_ptrace_access_check(struct task_struct *child,
unsigned int mode)
{
@@ -2896,7 +2948,8 @@ static int selinux_inode_setattr(struct dentry *dentry, struct iattr *iattr)
ATTR_ATIME_SET | ATTR_MTIME_SET | ATTR_TIMES_SET))
return dentry_has_perm(cred, dentry, FILE__SETATTR);
- if (selinux_policycap_openperm && (ia_valid & ATTR_SIZE))
+ if (selinux_policycap_openperm && (ia_valid & ATTR_SIZE)
+ && !(ia_valid & ATTR_FILE))
av |= FILE__OPEN;
return dentry_has_perm(cred, dentry, av);
@@ -3187,6 +3240,46 @@ static void selinux_file_free_security(struct file *file)
file_free_security(file);
}
+/*
+ * Check whether a task has the ioctl permission and cmd
+ * operation to an inode.
+ */
+int ioctl_has_perm(const struct cred *cred, struct file *file,
+ u32 requested, u16 cmd)
+{
+ struct common_audit_data ad;
+ struct file_security_struct *fsec = file->f_security;
+ struct inode *inode = file_inode(file);
+ struct inode_security_struct *isec = inode->i_security;
+ struct lsm_ioctlop_audit ioctl;
+ u32 ssid = cred_sid(cred);
+ int rc;
+ u8 driver = cmd >> 8;
+ u8 xperm = cmd & 0xff;
+
+ ad.type = LSM_AUDIT_DATA_IOCTL_OP;
+ ad.u.op = &ioctl;
+ ad.u.op->cmd = cmd;
+ ad.u.op->path = file->f_path;
+
+ if (ssid != fsec->sid) {
+ rc = avc_has_perm(ssid, fsec->sid,
+ SECCLASS_FD,
+ FD__USE,
+ &ad);
+ if (rc)
+ goto out;
+ }
+
+ if (unlikely(IS_PRIVATE(inode)))
+ return 0;
+
+ rc = avc_has_extended_perms(ssid, isec->sid, isec->sclass,
+ requested, driver, xperm, &ad);
+out:
+ return rc;
+}
+
static int selinux_file_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
@@ -3229,7 +3322,7 @@ static int selinux_file_ioctl(struct file *file, unsigned int cmd,
* to the file's ioctl() function.
*/
default:
- error = file_has_perm(cred, file, FILE__IOCTL);
+ error = ioctl_has_perm(cred, file, FILE__IOCTL, (u16) cmd);
}
return error;
}
@@ -3572,6 +3665,38 @@ static int selinux_kernel_module_request(char *kmod_name)
SYSTEM__MODULE_REQUEST, &ad);
}
+static int selinux_kernel_module_from_file(struct file *file)
+{
+ struct common_audit_data ad;
+ struct inode_security_struct *isec;
+ struct file_security_struct *fsec;
+ struct inode *inode;
+ u32 sid = current_sid();
+ int rc;
+
+ /* init_module */
+ if (file == NULL)
+ return avc_has_perm(sid, sid, SECCLASS_SYSTEM,
+ SYSTEM__MODULE_LOAD, NULL);
+
+ /* finit_module */
+ ad.type = LSM_AUDIT_DATA_PATH;
+ ad.u.path = file->f_path;
+
+ inode = file_inode(file);
+ isec = inode->i_security;
+ fsec = file->f_security;
+
+ if (sid != fsec->sid) {
+ rc = avc_has_perm(sid, fsec->sid, SECCLASS_FD, FD__USE, &ad);
+ if (rc)
+ return rc;
+ }
+
+ return avc_has_perm(sid, isec->sid, SECCLASS_SYSTEM,
+ SYSTEM__MODULE_LOAD, &ad);
+}
+
static int selinux_task_setpgid(struct task_struct *p, pid_t pgid)
{
return current_has_perm(p, PROCESS__SETPGID);
@@ -5813,6 +5938,11 @@ static int selinux_key_getsecurity(struct key *key, char **_buffer)
static struct security_operations selinux_ops = {
.name = "selinux",
+ .binder_set_context_mgr = selinux_binder_set_context_mgr,
+ .binder_transaction = selinux_binder_transaction,
+ .binder_transfer_binder = selinux_binder_transfer_binder,
+ .binder_transfer_file = selinux_binder_transfer_file,
+
.ptrace_access_check = selinux_ptrace_access_check,
.ptrace_traceme = selinux_ptrace_traceme,
.capget = selinux_capget,
@@ -5894,6 +6024,7 @@ static struct security_operations selinux_ops = {
.kernel_act_as = selinux_kernel_act_as,
.kernel_create_files_as = selinux_kernel_create_files_as,
.kernel_module_request = selinux_kernel_module_request,
+ .kernel_module_from_file = selinux_kernel_module_from_file,
.task_setpgid = selinux_task_setpgid,
.task_getpgid = selinux_task_getpgid,
.task_getsid = selinux_task_getsid,
diff --git a/security/selinux/include/avc.h b/security/selinux/include/avc.h
index ddf8eec03f21..03a9f08053e3 100644
--- a/security/selinux/include/avc.h
+++ b/security/selinux/include/avc.h
@@ -142,6 +142,7 @@ static inline int avc_audit(u32 ssid, u32 tsid,
}
#define AVC_STRICT 1 /* Ignore permissive mode. */
+#define AVC_EXTENDED_PERMS 2 /* update extended permissions */
int avc_has_perm_noaudit(u32 ssid, u32 tsid,
u16 tclass, u32 requested,
unsigned flags,
@@ -151,6 +152,9 @@ int avc_has_perm(u32 ssid, u32 tsid,
u16 tclass, u32 requested,
struct common_audit_data *auditdata);
+int avc_has_extended_perms(u32 ssid, u32 tsid, u16 tclass, u32 requested,
+ u8 driver, u8 perm, struct common_audit_data *ad);
+
u32 avc_policy_seqno(void);
#define AVC_CALLBACK_GRANT 1
@@ -161,6 +165,7 @@ u32 avc_policy_seqno(void);
#define AVC_CALLBACK_AUDITALLOW_DISABLE 32
#define AVC_CALLBACK_AUDITDENY_ENABLE 64
#define AVC_CALLBACK_AUDITDENY_DISABLE 128
+#define AVC_CALLBACK_ADD_XPERMS 256
int avc_add_callback(int (*callback)(u32 event), u32 events);
diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h
index be491a74c1ed..3c7b236005e8 100644
--- a/security/selinux/include/classmap.h
+++ b/security/selinux/include/classmap.h
@@ -32,7 +32,7 @@ struct security_class_mapping secclass_map[] = {
"setsockcreate", NULL } },
{ "system",
{ "ipc_info", "syslog_read", "syslog_mod",
- "syslog_console", "module_request", NULL } },
+ "syslog_console", "module_request", "module_load", NULL } },
{ "capability",
{ "chown", "dac_override", "dac_read_search",
"fowner", "fsetid", "kill", "setgid", "setuid", "setpcap",
@@ -151,5 +151,6 @@ struct security_class_mapping secclass_map[] = {
{ "kernel_service", { "use_as_override", "create_files_as", NULL } },
{ "tun_socket",
{ COMMON_SOCK_PERMS, "attach_queue", NULL } },
+ { "binder", { "impersonate", "call", "set_context_mgr", "transfer", NULL } },
{ NULL }
};
diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h
index d1e0b239b602..6a681d26bf20 100644
--- a/security/selinux/include/security.h
+++ b/security/selinux/include/security.h
@@ -35,13 +35,14 @@
#define POLICYDB_VERSION_NEW_OBJECT_DEFAULTS 27
#define POLICYDB_VERSION_DEFAULT_TYPE 28
#define POLICYDB_VERSION_CONSTRAINT_NAMES 29
+#define POLICYDB_VERSION_XPERMS_IOCTL 30
/* Range of policy versions we understand*/
#define POLICYDB_VERSION_MIN POLICYDB_VERSION_BASE
#ifdef CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX
#define POLICYDB_VERSION_MAX CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX_VALUE
#else
-#define POLICYDB_VERSION_MAX POLICYDB_VERSION_CONSTRAINT_NAMES
+#define POLICYDB_VERSION_MAX POLICYDB_VERSION_XPERMS_IOCTL
#endif
/* Mask for just the mount related flags */
@@ -56,6 +57,7 @@
/* Non-mount related flags */
#define SE_SBINITIALIZED 0x0100
#define SE_SBPROC 0x0200
+#define SE_SBGENFS 0x0400
#define CONTEXT_STR "context="
#define FSCONTEXT_STR "fscontext="
@@ -108,11 +110,38 @@ struct av_decision {
u32 flags;
};
+#define XPERMS_ALLOWED 1
+#define XPERMS_AUDITALLOW 2
+#define XPERMS_DONTAUDIT 4
+
+#define security_xperm_set(perms, x) (perms[x >> 5] |= 1 << (x & 0x1f))
+#define security_xperm_test(perms, x) (1 & (perms[x >> 5] >> (x & 0x1f)))
+struct extended_perms_data {
+ u32 p[8];
+};
+
+struct extended_perms_decision {
+ u8 used;
+ u8 driver;
+ struct extended_perms_data *allowed;
+ struct extended_perms_data *auditallow;
+ struct extended_perms_data *dontaudit;
+};
+
+struct extended_perms {
+ u16 len; /* length associated decision chain */
+ struct extended_perms_data drivers; /* flag drivers that are used */
+};
+
/* definitions of av_decision.flags */
#define AVD_FLAGS_PERMISSIVE 0x0001
void security_compute_av(u32 ssid, u32 tsid,
- u16 tclass, struct av_decision *avd);
+ u16 tclass, struct av_decision *avd,
+ struct extended_perms *xperms);
+
+void security_compute_xperms_decision(u32 ssid, u32 tsid, u16 tclass,
+ u8 driver, struct extended_perms_decision *xpermd);
void security_compute_av_user(u32 ssid, u32 tsid,
u16 tclass, struct av_decision *avd);
diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c
index 902b5e9cec7e..053210a559ad 100644
--- a/security/selinux/nlmsgtab.c
+++ b/security/selinux/nlmsgtab.c
@@ -77,9 +77,10 @@ static struct nlmsg_perm nlmsg_route_perms[] =
static struct nlmsg_perm nlmsg_tcpdiag_perms[] =
{
- { TCPDIAG_GETSOCK, NETLINK_TCPDIAG_SOCKET__NLMSG_READ },
- { DCCPDIAG_GETSOCK, NETLINK_TCPDIAG_SOCKET__NLMSG_READ },
- { SOCK_DIAG_BY_FAMILY, NETLINK_TCPDIAG_SOCKET__NLMSG_READ },
+ { TCPDIAG_GETSOCK, NETLINK_TCPDIAG_SOCKET__NLMSG_READ },
+ { DCCPDIAG_GETSOCK, NETLINK_TCPDIAG_SOCKET__NLMSG_READ },
+ { SOCK_DIAG_BY_FAMILY, NETLINK_TCPDIAG_SOCKET__NLMSG_READ },
+ { SOCK_DESTROY_BACKPORT, NETLINK_TCPDIAG_SOCKET__NLMSG_WRITE },
};
static struct nlmsg_perm nlmsg_xfrm_perms[] =
diff --git a/security/selinux/ss/avtab.c b/security/selinux/ss/avtab.c
index a3dd9faa19c0..640c16b9d3fb 100644
--- a/security/selinux/ss/avtab.c
+++ b/security/selinux/ss/avtab.c
@@ -24,6 +24,7 @@
#include "policydb.h"
static struct kmem_cache *avtab_node_cachep;
+static struct kmem_cache *avtab_xperms_cachep;
static inline int avtab_hash(struct avtab_key *keyp, u16 mask)
{
@@ -37,11 +38,24 @@ avtab_insert_node(struct avtab *h, int hvalue,
struct avtab_key *key, struct avtab_datum *datum)
{
struct avtab_node *newnode;
+ struct avtab_extended_perms *xperms;
newnode = kmem_cache_zalloc(avtab_node_cachep, GFP_KERNEL);
if (newnode == NULL)
return NULL;
newnode->key = *key;
- newnode->datum = *datum;
+
+ if (key->specified & AVTAB_XPERMS) {
+ xperms = kmem_cache_zalloc(avtab_xperms_cachep, GFP_KERNEL);
+ if (xperms == NULL) {
+ kmem_cache_free(avtab_node_cachep, newnode);
+ return NULL;
+ }
+ *xperms = *(datum->u.xperms);
+ newnode->datum.u.xperms = xperms;
+ } else {
+ newnode->datum.u.data = datum->u.data;
+ }
+
if (prev) {
newnode->next = prev->next;
prev->next = newnode;
@@ -70,8 +84,12 @@ static int avtab_insert(struct avtab *h, struct avtab_key *key, struct avtab_dat
if (key->source_type == cur->key.source_type &&
key->target_type == cur->key.target_type &&
key->target_class == cur->key.target_class &&
- (specified & cur->key.specified))
+ (specified & cur->key.specified)) {
+ /* extended perms may not be unique */
+ if (specified & AVTAB_XPERMS)
+ break;
return -EEXIST;
+ }
if (key->source_type < cur->key.source_type)
break;
if (key->source_type == cur->key.source_type &&
@@ -232,6 +250,9 @@ void avtab_destroy(struct avtab *h)
while (cur) {
temp = cur;
cur = cur->next;
+ if (temp->key.specified & AVTAB_XPERMS)
+ kmem_cache_free(avtab_xperms_cachep,
+ temp->datum.u.xperms);
kmem_cache_free(avtab_node_cachep, temp);
}
h->htable[i] = NULL;
@@ -314,13 +335,42 @@ void avtab_hash_eval(struct avtab *h, char *tag)
chain2_len_sum);
}
+/*
+ * extended permissions compatibility. Make ToT Android kernels compatible
+ * with Android M releases
+ */
+#define AVTAB_OPTYPE_ALLOWED 0x1000
+#define AVTAB_OPTYPE_AUDITALLOW 0x2000
+#define AVTAB_OPTYPE_DONTAUDIT 0x4000
+#define AVTAB_OPTYPE (AVTAB_OPTYPE_ALLOWED | \
+ AVTAB_OPTYPE_AUDITALLOW | \
+ AVTAB_OPTYPE_DONTAUDIT)
+#define AVTAB_XPERMS_OPTYPE 4
+
+#define avtab_xperms_to_optype(x) (x << AVTAB_XPERMS_OPTYPE)
+#define avtab_optype_to_xperms(x) (x >> AVTAB_XPERMS_OPTYPE)
+
+static unsigned int avtab_android_m_compat;
+
+static void avtab_android_m_compat_set(void)
+{
+ if (!avtab_android_m_compat) {
+ pr_info("SELinux: Android master kernel running Android"
+ " M policy in compatibility mode.\n");
+ avtab_android_m_compat = 1;
+ }
+}
+
static uint16_t spec_order[] = {
AVTAB_ALLOWED,
AVTAB_AUDITDENY,
AVTAB_AUDITALLOW,
AVTAB_TRANSITION,
AVTAB_CHANGE,
- AVTAB_MEMBER
+ AVTAB_MEMBER,
+ AVTAB_XPERMS_ALLOWED,
+ AVTAB_XPERMS_AUDITALLOW,
+ AVTAB_XPERMS_DONTAUDIT
};
int avtab_read_item(struct avtab *a, void *fp, struct policydb *pol,
@@ -330,10 +380,12 @@ int avtab_read_item(struct avtab *a, void *fp, struct policydb *pol,
{
__le16 buf16[4];
u16 enabled;
- __le32 buf32[7];
u32 items, items2, val, vers = pol->policyvers;
struct avtab_key key;
struct avtab_datum datum;
+ struct avtab_extended_perms xperms;
+ __le32 buf32[ARRAY_SIZE(xperms.perms.p)];
+ unsigned int android_m_compat_optype = 0;
int i, rc;
unsigned set;
@@ -390,11 +442,15 @@ int avtab_read_item(struct avtab *a, void *fp, struct policydb *pol,
printk(KERN_ERR "SELinux: avtab: entry has both access vectors and types\n");
return -EINVAL;
}
+ if (val & AVTAB_XPERMS) {
+ printk(KERN_ERR "SELinux: avtab: entry has extended permissions\n");
+ return -EINVAL;
+ }
for (i = 0; i < ARRAY_SIZE(spec_order); i++) {
if (val & spec_order[i]) {
key.specified = spec_order[i] | enabled;
- datum.data = le32_to_cpu(buf32[items++]);
+ datum.u.data = le32_to_cpu(buf32[items++]);
rc = insertf(a, &key, &datum, p);
if (rc)
return rc;
@@ -420,6 +476,13 @@ int avtab_read_item(struct avtab *a, void *fp, struct policydb *pol,
key.target_class = le16_to_cpu(buf16[items++]);
key.specified = le16_to_cpu(buf16[items++]);
+ if ((key.specified & AVTAB_OPTYPE) &&
+ (vers == POLICYDB_VERSION_XPERMS_IOCTL)) {
+ key.specified = avtab_optype_to_xperms(key.specified);
+ android_m_compat_optype = 1;
+ avtab_android_m_compat_set();
+ }
+
if (!policydb_type_isvalid(pol, key.source_type) ||
!policydb_type_isvalid(pol, key.target_type) ||
!policydb_class_isvalid(pol, key.target_class)) {
@@ -437,14 +500,54 @@ int avtab_read_item(struct avtab *a, void *fp, struct policydb *pol,
return -EINVAL;
}
- rc = next_entry(buf32, fp, sizeof(u32));
- if (rc) {
- printk(KERN_ERR "SELinux: avtab: truncated entry\n");
- return rc;
+ if ((vers < POLICYDB_VERSION_XPERMS_IOCTL) &&
+ (key.specified & AVTAB_XPERMS)) {
+ printk(KERN_ERR "SELinux: avtab: policy version %u does not "
+ "support extended permissions rules and one "
+ "was specified\n", vers);
+ return -EINVAL;
+ } else if (key.specified & AVTAB_XPERMS) {
+ memset(&xperms, 0, sizeof(struct avtab_extended_perms));
+ rc = next_entry(&xperms.specified, fp, sizeof(u8));
+ if (rc) {
+ printk(KERN_ERR "SELinux: avtab: truncated entry\n");
+ return rc;
+ }
+ if (avtab_android_m_compat ||
+ ((xperms.specified != AVTAB_XPERMS_IOCTLFUNCTION) &&
+ (xperms.specified != AVTAB_XPERMS_IOCTLDRIVER) &&
+ (vers == POLICYDB_VERSION_XPERMS_IOCTL))) {
+ xperms.driver = xperms.specified;
+ if (android_m_compat_optype)
+ xperms.specified = AVTAB_XPERMS_IOCTLDRIVER;
+ else
+ xperms.specified = AVTAB_XPERMS_IOCTLFUNCTION;
+ avtab_android_m_compat_set();
+ } else {
+ rc = next_entry(&xperms.driver, fp, sizeof(u8));
+ if (rc) {
+ printk(KERN_ERR "SELinux: avtab: truncated entry\n");
+ return rc;
+ }
+ }
+ rc = next_entry(buf32, fp, sizeof(u32)*ARRAY_SIZE(xperms.perms.p));
+ if (rc) {
+ printk(KERN_ERR "SELinux: avtab: truncated entry\n");
+ return rc;
+ }
+ for (i = 0; i < ARRAY_SIZE(xperms.perms.p); i++)
+ xperms.perms.p[i] = le32_to_cpu(buf32[i]);
+ datum.u.xperms = &xperms;
+ } else {
+ rc = next_entry(buf32, fp, sizeof(u32));
+ if (rc) {
+ printk(KERN_ERR "SELinux: avtab: truncated entry\n");
+ return rc;
+ }
+ datum.u.data = le32_to_cpu(*buf32);
}
- datum.data = le32_to_cpu(*buf32);
if ((key.specified & AVTAB_TYPE) &&
- !policydb_type_isvalid(pol, datum.data)) {
+ !policydb_type_isvalid(pol, datum.u.data)) {
printk(KERN_ERR "SELinux: avtab: invalid type\n");
return -EINVAL;
}
@@ -504,18 +607,40 @@ bad:
int avtab_write_item(struct policydb *p, struct avtab_node *cur, void *fp)
{
__le16 buf16[4];
- __le32 buf32[1];
+ __le32 buf32[ARRAY_SIZE(cur->datum.u.xperms->perms.p)];
int rc;
+ unsigned int i;
buf16[0] = cpu_to_le16(cur->key.source_type);
buf16[1] = cpu_to_le16(cur->key.target_type);
buf16[2] = cpu_to_le16(cur->key.target_class);
- buf16[3] = cpu_to_le16(cur->key.specified);
+ if (avtab_android_m_compat && (cur->key.specified & AVTAB_XPERMS) &&
+ (cur->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLDRIVER))
+ buf16[3] = cpu_to_le16(avtab_xperms_to_optype(cur->key.specified));
+ else
+ buf16[3] = cpu_to_le16(cur->key.specified);
rc = put_entry(buf16, sizeof(u16), 4, fp);
if (rc)
return rc;
- buf32[0] = cpu_to_le32(cur->datum.data);
- rc = put_entry(buf32, sizeof(u32), 1, fp);
+
+ if (cur->key.specified & AVTAB_XPERMS) {
+ if (avtab_android_m_compat == 0) {
+ rc = put_entry(&cur->datum.u.xperms->specified,
+ sizeof(u8), 1, fp);
+ if (rc)
+ return rc;
+ }
+ rc = put_entry(&cur->datum.u.xperms->driver, sizeof(u8), 1, fp);
+ if (rc)
+ return rc;
+ for (i = 0; i < ARRAY_SIZE(cur->datum.u.xperms->perms.p); i++)
+ buf32[i] = cpu_to_le32(cur->datum.u.xperms->perms.p[i]);
+ rc = put_entry(buf32, sizeof(u32),
+ ARRAY_SIZE(cur->datum.u.xperms->perms.p), fp);
+ } else {
+ buf32[0] = cpu_to_le32(cur->datum.u.data);
+ rc = put_entry(buf32, sizeof(u32), 1, fp);
+ }
if (rc)
return rc;
return 0;
@@ -548,9 +673,13 @@ void avtab_cache_init(void)
avtab_node_cachep = kmem_cache_create("avtab_node",
sizeof(struct avtab_node),
0, SLAB_PANIC, NULL);
+ avtab_xperms_cachep = kmem_cache_create("avtab_extended_perms",
+ sizeof(struct avtab_extended_perms),
+ 0, SLAB_PANIC, NULL);
}
void avtab_cache_destroy(void)
{
kmem_cache_destroy(avtab_node_cachep);
+ kmem_cache_destroy(avtab_xperms_cachep);
}
diff --git a/security/selinux/ss/avtab.h b/security/selinux/ss/avtab.h
index 63ce2f9e441d..8133523ca679 100644
--- a/security/selinux/ss/avtab.h
+++ b/security/selinux/ss/avtab.h
@@ -23,6 +23,8 @@
#ifndef _SS_AVTAB_H_
#define _SS_AVTAB_H_
+#include "security.h"
+
struct avtab_key {
u16 source_type; /* source type */
u16 target_type; /* target type */
@@ -35,13 +37,43 @@ struct avtab_key {
#define AVTAB_MEMBER 0x0020
#define AVTAB_CHANGE 0x0040
#define AVTAB_TYPE (AVTAB_TRANSITION | AVTAB_MEMBER | AVTAB_CHANGE)
+/* extended permissions */
+#define AVTAB_XPERMS_ALLOWED 0x0100
+#define AVTAB_XPERMS_AUDITALLOW 0x0200
+#define AVTAB_XPERMS_DONTAUDIT 0x0400
+#define AVTAB_XPERMS (AVTAB_XPERMS_ALLOWED | \
+ AVTAB_XPERMS_AUDITALLOW | \
+ AVTAB_XPERMS_DONTAUDIT)
#define AVTAB_ENABLED_OLD 0x80000000 /* reserved for used in cond_avtab */
#define AVTAB_ENABLED 0x8000 /* reserved for used in cond_avtab */
u16 specified; /* what field is specified */
};
+/*
+ * For operations that require more than the 32 permissions provided by the avc
+ * extended permissions may be used to provide 256 bits of permissions.
+ */
+struct avtab_extended_perms {
+/* These are not flags. All 256 values may be used */
+#define AVTAB_XPERMS_IOCTLFUNCTION 0x01
+#define AVTAB_XPERMS_IOCTLDRIVER 0x02
+ /* extension of the avtab_key specified */
+ u8 specified; /* ioctl, netfilter, ... */
+ /*
+ * if 256 bits is not adequate as is often the case with ioctls, then
+ * multiple extended perms may be used and the driver field
+ * specifies which permissions are included.
+ */
+ u8 driver;
+ /* 256 bits of permissions */
+ struct extended_perms_data perms;
+};
+
struct avtab_datum {
- u32 data; /* access vector or type value */
+ union {
+ u32 data; /* access vector or type value */
+ struct avtab_extended_perms *xperms;
+ } u;
};
struct avtab_node {
diff --git a/security/selinux/ss/conditional.c b/security/selinux/ss/conditional.c
index 62c6773be0b7..456e1a9bcfde 100644
--- a/security/selinux/ss/conditional.c
+++ b/security/selinux/ss/conditional.c
@@ -15,6 +15,7 @@
#include "security.h"
#include "conditional.h"
+#include "services.h"
/*
* cond_evaluate_expr evaluates a conditional expr
@@ -612,10 +613,28 @@ int cond_write_list(struct policydb *p, struct cond_node *list, void *fp)
return 0;
}
+
+void cond_compute_xperms(struct avtab *ctab, struct avtab_key *key,
+ struct extended_perms_decision *xpermd)
+{
+ struct avtab_node *node;
+
+ if (!ctab || !key || !xpermd)
+ return;
+
+ for (node = avtab_search_node(ctab, key); node;
+ node = avtab_search_node_next(node, key->specified)) {
+ if (node->key.specified & AVTAB_ENABLED)
+ services_compute_xperms_decision(xpermd, node);
+ }
+ return;
+
+}
/* Determine whether additional permissions are granted by the conditional
* av table, and if so, add them to the result
*/
-void cond_compute_av(struct avtab *ctab, struct avtab_key *key, struct av_decision *avd)
+void cond_compute_av(struct avtab *ctab, struct avtab_key *key,
+ struct av_decision *avd, struct extended_perms *xperms)
{
struct avtab_node *node;
@@ -626,7 +645,7 @@ void cond_compute_av(struct avtab *ctab, struct avtab_key *key, struct av_decisi
node = avtab_search_node_next(node, key->specified)) {
if ((u16)(AVTAB_ALLOWED|AVTAB_ENABLED) ==
(node->key.specified & (AVTAB_ALLOWED|AVTAB_ENABLED)))
- avd->allowed |= node->datum.data;
+ avd->allowed |= node->datum.u.data;
if ((u16)(AVTAB_AUDITDENY|AVTAB_ENABLED) ==
(node->key.specified & (AVTAB_AUDITDENY|AVTAB_ENABLED)))
/* Since a '0' in an auditdeny mask represents a
@@ -634,10 +653,13 @@ void cond_compute_av(struct avtab *ctab, struct avtab_key *key, struct av_decisi
* the '&' operand to ensure that all '0's in the mask
* are retained (much unlike the allow and auditallow cases).
*/
- avd->auditdeny &= node->datum.data;
+ avd->auditdeny &= node->datum.u.data;
if ((u16)(AVTAB_AUDITALLOW|AVTAB_ENABLED) ==
(node->key.specified & (AVTAB_AUDITALLOW|AVTAB_ENABLED)))
- avd->auditallow |= node->datum.data;
+ avd->auditallow |= node->datum.u.data;
+ if (xperms && (node->key.specified & AVTAB_ENABLED) &&
+ (node->key.specified & AVTAB_XPERMS))
+ services_compute_xperms_drivers(xperms, node);
}
return;
}
diff --git a/security/selinux/ss/conditional.h b/security/selinux/ss/conditional.h
index 4d1f87466508..ddb43e7e1c75 100644
--- a/security/selinux/ss/conditional.h
+++ b/security/selinux/ss/conditional.h
@@ -73,8 +73,10 @@ int cond_read_list(struct policydb *p, void *fp);
int cond_write_bool(void *key, void *datum, void *ptr);
int cond_write_list(struct policydb *p, struct cond_node *list, void *fp);
-void cond_compute_av(struct avtab *ctab, struct avtab_key *key, struct av_decision *avd);
-
+void cond_compute_av(struct avtab *ctab, struct avtab_key *key,
+ struct av_decision *avd, struct extended_perms *xperms);
+void cond_compute_xperms(struct avtab *ctab, struct avtab_key *key,
+ struct extended_perms_decision *xpermd);
int evaluate_cond_node(struct policydb *p, struct cond_node *node);
#endif /* _CONDITIONAL_H_ */
diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c
index bc2a586f095c..7e5deafc150c 100644
--- a/security/selinux/ss/policydb.c
+++ b/security/selinux/ss/policydb.c
@@ -148,6 +148,11 @@ static struct policydb_compat_info policydb_compat[] = {
.sym_num = SYM_NUM,
.ocon_num = OCON_NUM,
},
+ {
+ .version = POLICYDB_VERSION_XPERMS_IOCTL,
+ .sym_num = SYM_NUM,
+ .ocon_num = OCON_NUM,
+ },
};
static struct policydb_compat_info *policydb_lookup_compat(int version)
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index a1d3944751b9..4719b109eaa0 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -93,9 +93,10 @@ static int context_struct_to_string(struct context *context, char **scontext,
u32 *scontext_len);
static void context_struct_compute_av(struct context *scontext,
- struct context *tcontext,
- u16 tclass,
- struct av_decision *avd);
+ struct context *tcontext,
+ u16 tclass,
+ struct av_decision *avd,
+ struct extended_perms *xperms);
struct selinux_mapping {
u16 value; /* policy value */
@@ -565,7 +566,8 @@ static void type_attribute_bounds_av(struct context *scontext,
context_struct_compute_av(&lo_scontext,
tcontext,
tclass,
- &lo_avd);
+ &lo_avd,
+ NULL);
if ((lo_avd.allowed & avd->allowed) == avd->allowed)
return; /* no masked permission */
masked = ~lo_avd.allowed & avd->allowed;
@@ -580,7 +582,8 @@ static void type_attribute_bounds_av(struct context *scontext,
context_struct_compute_av(scontext,
&lo_tcontext,
tclass,
- &lo_avd);
+ &lo_avd,
+ NULL);
if ((lo_avd.allowed & avd->allowed) == avd->allowed)
return; /* no masked permission */
masked = ~lo_avd.allowed & avd->allowed;
@@ -596,7 +599,8 @@ static void type_attribute_bounds_av(struct context *scontext,
context_struct_compute_av(&lo_scontext,
&lo_tcontext,
tclass,
- &lo_avd);
+ &lo_avd,
+ NULL);
if ((lo_avd.allowed & avd->allowed) == avd->allowed)
return; /* no masked permission */
masked = ~lo_avd.allowed & avd->allowed;
@@ -613,13 +617,39 @@ static void type_attribute_bounds_av(struct context *scontext,
}
/*
- * Compute access vectors based on a context structure pair for
- * the permissions in a particular class.
+ * flag which drivers have permissions
+ * only looking for ioctl based extended permssions
+ */
+void services_compute_xperms_drivers(
+ struct extended_perms *xperms,
+ struct avtab_node *node)
+{
+ unsigned int i;
+
+ if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLDRIVER) {
+ /* if one or more driver has all permissions allowed */
+ for (i = 0; i < ARRAY_SIZE(xperms->drivers.p); i++)
+ xperms->drivers.p[i] |= node->datum.u.xperms->perms.p[i];
+ } else if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLFUNCTION) {
+ /* if allowing permissions within a driver */
+ security_xperm_set(xperms->drivers.p,
+ node->datum.u.xperms->driver);
+ }
+
+ /* If no ioctl commands are allowed, ignore auditallow and auditdeny */
+ if (node->key.specified & AVTAB_XPERMS_ALLOWED)
+ xperms->len = 1;
+}
+
+/*
+ * Compute access vectors and extended permissions based on a context
+ * structure pair for the permissions in a particular class.
*/
static void context_struct_compute_av(struct context *scontext,
- struct context *tcontext,
- u16 tclass,
- struct av_decision *avd)
+ struct context *tcontext,
+ u16 tclass,
+ struct av_decision *avd,
+ struct extended_perms *xperms)
{
struct constraint_node *constraint;
struct role_allow *ra;
@@ -633,6 +663,10 @@ static void context_struct_compute_av(struct context *scontext,
avd->allowed = 0;
avd->auditallow = 0;
avd->auditdeny = 0xffffffff;
+ if (xperms) {
+ memset(&xperms->drivers, 0, sizeof(xperms->drivers));
+ xperms->len = 0;
+ }
if (unlikely(!tclass || tclass > policydb.p_classes.nprim)) {
if (printk_ratelimit())
@@ -647,7 +681,7 @@ static void context_struct_compute_av(struct context *scontext,
* this permission check, then use it.
*/
avkey.target_class = tclass;
- avkey.specified = AVTAB_AV;
+ avkey.specified = AVTAB_AV | AVTAB_XPERMS;
sattr = flex_array_get(policydb.type_attr_map_array, scontext->type - 1);
BUG_ON(!sattr);
tattr = flex_array_get(policydb.type_attr_map_array, tcontext->type - 1);
@@ -660,15 +694,18 @@ static void context_struct_compute_av(struct context *scontext,
node;
node = avtab_search_node_next(node, avkey.specified)) {
if (node->key.specified == AVTAB_ALLOWED)
- avd->allowed |= node->datum.data;
+ avd->allowed |= node->datum.u.data;
else if (node->key.specified == AVTAB_AUDITALLOW)
- avd->auditallow |= node->datum.data;
+ avd->auditallow |= node->datum.u.data;
else if (node->key.specified == AVTAB_AUDITDENY)
- avd->auditdeny &= node->datum.data;
+ avd->auditdeny &= node->datum.u.data;
+ else if (xperms && (node->key.specified & AVTAB_XPERMS))
+ services_compute_xperms_drivers(xperms, node);
}
/* Check conditional av table for additional permissions */
- cond_compute_av(&policydb.te_cond_avtab, &avkey, avd);
+ cond_compute_av(&policydb.te_cond_avtab, &avkey,
+ avd, xperms);
}
}
@@ -899,6 +936,139 @@ static void avd_init(struct av_decision *avd)
avd->flags = 0;
}
+void services_compute_xperms_decision(struct extended_perms_decision *xpermd,
+ struct avtab_node *node)
+{
+ unsigned int i;
+
+ if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLFUNCTION) {
+ if (xpermd->driver != node->datum.u.xperms->driver)
+ return;
+ } else if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLDRIVER) {
+ if (!security_xperm_test(node->datum.u.xperms->perms.p,
+ xpermd->driver))
+ return;
+ } else {
+ BUG();
+ }
+
+ if (node->key.specified == AVTAB_XPERMS_ALLOWED) {
+ xpermd->used |= XPERMS_ALLOWED;
+ if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLDRIVER) {
+ memset(xpermd->allowed->p, 0xff,
+ sizeof(xpermd->allowed->p));
+ }
+ if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLFUNCTION) {
+ for (i = 0; i < ARRAY_SIZE(xpermd->allowed->p); i++)
+ xpermd->allowed->p[i] |=
+ node->datum.u.xperms->perms.p[i];
+ }
+ } else if (node->key.specified == AVTAB_XPERMS_AUDITALLOW) {
+ xpermd->used |= XPERMS_AUDITALLOW;
+ if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLDRIVER) {
+ memset(xpermd->auditallow->p, 0xff,
+ sizeof(xpermd->auditallow->p));
+ }
+ if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLFUNCTION) {
+ for (i = 0; i < ARRAY_SIZE(xpermd->auditallow->p); i++)
+ xpermd->auditallow->p[i] |=
+ node->datum.u.xperms->perms.p[i];
+ }
+ } else if (node->key.specified == AVTAB_XPERMS_DONTAUDIT) {
+ xpermd->used |= XPERMS_DONTAUDIT;
+ if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLDRIVER) {
+ memset(xpermd->dontaudit->p, 0xff,
+ sizeof(xpermd->dontaudit->p));
+ }
+ if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLFUNCTION) {
+ for (i = 0; i < ARRAY_SIZE(xpermd->dontaudit->p); i++)
+ xpermd->dontaudit->p[i] |=
+ node->datum.u.xperms->perms.p[i];
+ }
+ } else {
+ BUG();
+ }
+}
+
+void security_compute_xperms_decision(u32 ssid,
+ u32 tsid,
+ u16 orig_tclass,
+ u8 driver,
+ struct extended_perms_decision *xpermd)
+{
+ u16 tclass;
+ struct context *scontext, *tcontext;
+ struct avtab_key avkey;
+ struct avtab_node *node;
+ struct ebitmap *sattr, *tattr;
+ struct ebitmap_node *snode, *tnode;
+ unsigned int i, j;
+
+ xpermd->driver = driver;
+ xpermd->used = 0;
+ memset(xpermd->allowed->p, 0, sizeof(xpermd->allowed->p));
+ memset(xpermd->auditallow->p, 0, sizeof(xpermd->auditallow->p));
+ memset(xpermd->dontaudit->p, 0, sizeof(xpermd->dontaudit->p));
+
+ read_lock(&policy_rwlock);
+ if (!ss_initialized)
+ goto allow;
+
+ scontext = sidtab_search(&sidtab, ssid);
+ if (!scontext) {
+ printk(KERN_ERR "SELinux: %s: unrecognized SID %d\n",
+ __func__, ssid);
+ goto out;
+ }
+
+ tcontext = sidtab_search(&sidtab, tsid);
+ if (!tcontext) {
+ printk(KERN_ERR "SELinux: %s: unrecognized SID %d\n",
+ __func__, tsid);
+ goto out;
+ }
+
+ tclass = unmap_class(orig_tclass);
+ if (unlikely(orig_tclass && !tclass)) {
+ if (policydb.allow_unknown)
+ goto allow;
+ goto out;
+ }
+
+
+ if (unlikely(!tclass || tclass > policydb.p_classes.nprim)) {
+ pr_warn_ratelimited("SELinux: Invalid class %hu\n", tclass);
+ goto out;
+ }
+
+ avkey.target_class = tclass;
+ avkey.specified = AVTAB_XPERMS;
+ sattr = flex_array_get(policydb.type_attr_map_array,
+ scontext->type - 1);
+ BUG_ON(!sattr);
+ tattr = flex_array_get(policydb.type_attr_map_array,
+ tcontext->type - 1);
+ BUG_ON(!tattr);
+ ebitmap_for_each_positive_bit(sattr, snode, i) {
+ ebitmap_for_each_positive_bit(tattr, tnode, j) {
+ avkey.source_type = i + 1;
+ avkey.target_type = j + 1;
+ for (node = avtab_search_node(&policydb.te_avtab, &avkey);
+ node;
+ node = avtab_search_node_next(node, avkey.specified))
+ services_compute_xperms_decision(xpermd, node);
+
+ cond_compute_xperms(&policydb.te_cond_avtab,
+ &avkey, xpermd);
+ }
+ }
+out:
+ read_unlock(&policy_rwlock);
+ return;
+allow:
+ memset(xpermd->allowed->p, 0xff, sizeof(xpermd->allowed->p));
+ goto out;
+}
/**
* security_compute_av - Compute access vector decisions.
@@ -906,6 +1076,7 @@ static void avd_init(struct av_decision *avd)
* @tsid: target security identifier
* @tclass: target security class
* @avd: access vector decisions
+ * @xperms: extended permissions
*
* Compute a set of access vector decisions based on the
* SID pair (@ssid, @tsid) for the permissions in @tclass.
@@ -913,13 +1084,15 @@ static void avd_init(struct av_decision *avd)
void security_compute_av(u32 ssid,
u32 tsid,
u16 orig_tclass,
- struct av_decision *avd)
+ struct av_decision *avd,
+ struct extended_perms *xperms)
{
u16 tclass;
struct context *scontext = NULL, *tcontext = NULL;
read_lock(&policy_rwlock);
avd_init(avd);
+ xperms->len = 0;
if (!ss_initialized)
goto allow;
@@ -947,7 +1120,7 @@ void security_compute_av(u32 ssid,
goto allow;
goto out;
}
- context_struct_compute_av(scontext, tcontext, tclass, avd);
+ context_struct_compute_av(scontext, tcontext, tclass, avd, xperms);
map_decision(orig_tclass, avd, policydb.allow_unknown);
out:
read_unlock(&policy_rwlock);
@@ -993,7 +1166,7 @@ void security_compute_av_user(u32 ssid,
goto out;
}
- context_struct_compute_av(scontext, tcontext, tclass, avd);
+ context_struct_compute_av(scontext, tcontext, tclass, avd, NULL);
out:
read_unlock(&policy_rwlock);
return;
@@ -1515,7 +1688,7 @@ static int security_compute_sid(u32 ssid,
if (avdatum) {
/* Use the type from the type transition/member/change rule. */
- newcontext.type = avdatum->data;
+ newcontext.type = avdatum->u.data;
}
/* if we have a objname this is a file trans check so check those rules */
diff --git a/security/selinux/ss/services.h b/security/selinux/ss/services.h
index e8d907e903cd..6abcd8729ec3 100644
--- a/security/selinux/ss/services.h
+++ b/security/selinux/ss/services.h
@@ -11,5 +11,11 @@
extern struct policydb policydb;
+void services_compute_xperms_drivers(struct extended_perms *xperms,
+ struct avtab_node *node);
+
+void services_compute_xperms_decision(struct extended_perms_decision *xpermd,
+ struct avtab_node *node);
+
#endif /* _SS_SERVICES_H_ */
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 2f9e68025ede..02333234a096 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -2067,12 +2067,15 @@ int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target,
case EPERM:
case EACCES:
return scnprintf(msg, size,
- "You may not have permission to collect %sstats.\n"
- "Consider tweaking /proc/sys/kernel/perf_event_paranoid:\n"
- " -1 - Not paranoid at all\n"
- " 0 - Disallow raw tracepoint access for unpriv\n"
- " 1 - Disallow cpu events for unpriv\n"
- " 2 - Disallow kernel profiling for unpriv",
+ "You may not have permission to collect %sstats.\n\n"
+ "Consider tweaking /proc/sys/kernel/perf_event_paranoid,\n"
+ "which controls use of the performance events system by\n"
+ "unprivileged users (without CAP_SYS_ADMIN).\n\n"
+ "The default value is 1:\n\n"
+ " -1: Allow use of (almost) all events by all users\n"
+ ">= 0: Disallow raw tracepoint access by users without CAP_IOC_LOCK\n"
+ ">= 1: Disallow CPU event access by users without CAP_SYS_ADMIN\n"
+ ">= 2: Disallow kernel profiling by users without CAP_SYS_ADMIN",
target->system_wide ? "system-wide " : "");
case ENOENT:
return scnprintf(msg, size, "The %s event is not supported.",