aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/ABI/testing/sysfs-class-dual-role-usb71
-rw-r--r--Documentation/ABI/testing/sysfs-kernel-wakeup_reasons16
-rw-r--r--Documentation/android.txt121
-rw-r--r--Documentation/block/00-INDEX6
-rw-r--r--Documentation/block/mmc-max-speed.txt38
-rw-r--r--Documentation/cpu-freq/governors.txt86
-rw-r--r--Documentation/device-mapper/boot.txt42
-rw-r--r--Documentation/devicetree/bindings/misc/memory-state-time.txt8
-rw-r--r--Documentation/filesystems/proc.txt6
-rw-r--r--Documentation/gpu/drm-kms.rst6
-rw-r--r--Documentation/kernel-parameters.txt6
-rw-r--r--Documentation/networking/ip-sysctl.txt23
-rw-r--r--Documentation/scheduler/sched-energy.txt362
-rw-r--r--Documentation/scheduler/sched-tune.txt366
-rw-r--r--Documentation/sync.txt75
-rw-r--r--Documentation/sysctl/kernel.txt4
-rw-r--r--Documentation/trace/events-power.txt1
-rw-r--r--Documentation/trace/ftrace.txt49
-rw-r--r--arch/alpha/include/uapi/asm/socket.h2
-rw-r--r--arch/arm/Kconfig24
-rw-r--r--arch/arm/Kconfig.debug8
-rw-r--r--arch/arm/Makefile5
-rw-r--r--arch/arm/boot/.gitignore1
-rw-r--r--arch/arm/boot/Makefile13
-rw-r--r--arch/arm/boot/compressed/head.S2
-rw-r--r--arch/arm/boot/dts/Makefile12
-rw-r--r--arch/arm/common/Kconfig4
-rw-r--r--arch/arm/common/Makefile1
-rw-r--r--arch/arm/common/fiq_glue.S118
-rw-r--r--arch/arm/common/fiq_glue_setup.c147
-rw-r--r--arch/arm/configs/ranchu_defconfig316
-rw-r--r--arch/arm/include/asm/elf.h8
-rw-r--r--arch/arm/include/asm/fiq_glue.h33
-rw-r--r--arch/arm/include/asm/topology.h7
-rw-r--r--arch/arm/kernel/kgdb.c4
-rw-r--r--arch/arm/kernel/process.c75
-rw-r--r--arch/arm/kernel/reboot.c30
-rw-r--r--arch/arm/kernel/topology.c149
-rw-r--r--arch/arm/mm/cache-v6.S17
-rw-r--r--arch/arm/mm/fault.c4
-rw-r--r--arch/arm64/Kconfig61
-rw-r--r--arch/arm64/Makefile9
-rw-r--r--arch/arm64/boot/.gitignore2
-rw-r--r--arch/arm64/boot/Makefile16
-rw-r--r--arch/arm64/boot/dts/Makefile14
-rw-r--r--arch/arm64/configs/ranchu64_defconfig312
-rw-r--r--arch/arm64/include/asm/assembler.h16
-rw-r--r--arch/arm64/include/asm/cpufeature.h6
-rw-r--r--arch/arm64/include/asm/efi.h28
-rw-r--r--arch/arm64/include/asm/elf.h2
-rw-r--r--arch/arm64/include/asm/esr.h6
-rw-r--r--arch/arm64/include/asm/futex.h17
-rw-r--r--arch/arm64/include/asm/hw_breakpoint.h6
-rw-r--r--arch/arm64/include/asm/kernel-pgtable.h19
-rw-r--r--arch/arm64/include/asm/mmu.h2
-rw-r--r--arch/arm64/include/asm/mmu_context.h55
-rw-r--r--arch/arm64/include/asm/thread_info.h3
-rw-r--r--arch/arm64/include/asm/topology.h8
-rw-r--r--arch/arm64/include/asm/uaccess.h203
-rw-r--r--arch/arm64/kernel/armv8_deprecated.c11
-rw-r--r--arch/arm64/kernel/asm-offsets.c3
-rw-r--r--arch/arm64/kernel/entry.S83
-rw-r--r--arch/arm64/kernel/head.S6
-rw-r--r--arch/arm64/kernel/hw_breakpoint.c153
-rw-r--r--arch/arm64/kernel/process.c66
-rw-r--r--arch/arm64/kernel/ptrace.c7
-rw-r--r--arch/arm64/kernel/setup.c9
-rw-r--r--arch/arm64/kernel/topology.c86
-rw-r--r--arch/arm64/kernel/traps.c41
-rw-r--r--arch/arm64/kernel/vmlinux.lds.S5
-rw-r--r--arch/arm64/lib/clear_user.S11
-rw-r--r--arch/arm64/lib/copy_from_user.S11
-rw-r--r--arch/arm64/lib/copy_in_user.S11
-rw-r--r--arch/arm64/lib/copy_to_user.S11
-rw-r--r--arch/arm64/mm/cache.S6
-rw-r--r--arch/arm64/mm/context.c7
-rw-r--r--arch/arm64/mm/dma-mapping.c2
-rw-r--r--arch/arm64/mm/fault.c14
-rw-r--r--arch/arm64/mm/proc.S3
-rw-r--r--arch/arm64/xen/hypercall.S15
-rw-r--r--arch/avr32/include/uapi/asm/socket.h2
-rw-r--r--arch/frv/include/uapi/asm/socket.h2
-rw-r--r--arch/ia64/include/uapi/asm/socket.h2
-rw-r--r--arch/m32r/include/uapi/asm/socket.h2
-rw-r--r--arch/mips/include/uapi/asm/socket.h2
-rw-r--r--arch/mn10300/include/uapi/asm/socket.h2
-rw-r--r--arch/parisc/include/uapi/asm/socket.h2
-rw-r--r--arch/powerpc/include/uapi/asm/socket.h2
-rw-r--r--arch/s390/include/uapi/asm/socket.h2
-rw-r--r--arch/sparc/include/uapi/asm/socket.h2
-rw-r--r--arch/x86/Makefile2
-rw-r--r--arch/x86/configs/i386_ranchu_defconfig424
-rw-r--r--arch/x86/configs/x86_64_cuttlefish_defconfig454
-rw-r--r--arch/x86/configs/x86_64_ranchu_defconfig419
-rw-r--r--arch/x86/include/asm/idle.h7
-rw-r--r--arch/x86/kernel/process.c17
-rw-r--r--arch/xtensa/include/uapi/asm/socket.h2
-rw-r--r--block/blk-core.c84
-rw-r--r--build.config.cuttlefish.x86_6415
-rw-r--r--build.config.goldfish.arm12
-rw-r--r--build.config.goldfish.arm6412
-rw-r--r--build.config.goldfish.mips11
-rw-r--r--build.config.goldfish.mips6411
-rw-r--r--build.config.goldfish.x8612
-rw-r--r--build.config.goldfish.x86_6412
-rw-r--r--drivers/android/Kconfig22
-rw-r--r--drivers/android/Makefile3
-rw-r--r--drivers/android/binder.c4863
-rw-r--r--drivers/android/binder_alloc.c854
-rw-r--r--drivers/android/binder_alloc.h167
-rw-r--r--drivers/android/binder_alloc_selftest.c270
-rw-r--r--drivers/android/binder_trace.h41
-rw-r--r--drivers/base/power/main.c5
-rw-r--r--drivers/base/power/wakeup.c36
-rw-r--r--drivers/base/syscore.c3
-rw-r--r--drivers/clocksource/Kconfig8
-rw-r--r--drivers/clocksource/arm_arch_timer.c5
-rw-r--r--drivers/cpufreq/Kconfig51
-rw-r--r--drivers/cpufreq/Makefile1
-rw-r--r--drivers/cpufreq/cpufreq.c65
-rw-r--r--drivers/cpufreq/cpufreq_conservative.c5
-rw-r--r--drivers/cpufreq/cpufreq_interactive.c1411
-rw-r--r--drivers/cpufreq/cpufreq_performance.c5
-rw-r--r--drivers/cpufreq/cpufreq_powersave.c5
-rw-r--r--drivers/cpufreq/cpufreq_userspace.c5
-rw-r--r--drivers/cpuidle/cpuidle.c4
-rw-r--r--drivers/cpuidle/governors/menu.c7
-rw-r--r--drivers/dma-buf/fence.c11
-rw-r--r--drivers/dma-buf/reservation.c117
-rw-r--r--drivers/dma-buf/sw_sync.c10
-rw-r--r--drivers/dma-buf/sync_file.c5
-rw-r--r--drivers/gpu/drm/Kconfig1
-rw-r--r--drivers/gpu/drm/drm_atomic.c326
-rw-r--r--drivers/gpu/drm/drm_atomic_helper.c3
-rw-r--r--drivers/gpu/drm/drm_crtc.c68
-rw-r--r--drivers/gpu/drm/drm_crtc_internal.h2
-rw-r--r--drivers/gpu/drm/drm_fb_cma_helper.c35
-rw-r--r--drivers/gpu/drm/drm_fops.c4
-rw-r--r--drivers/gpu/drm/drm_plane.c1
-rw-r--r--drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c5
-rw-r--r--drivers/gpu/drm/ttm/ttm_bo.c9
-rw-r--r--drivers/hid/uhid.c17
-rw-r--r--drivers/input/Kconfig13
-rw-r--r--drivers/input/Makefile2
-rw-r--r--drivers/input/keyboard/goldfish_events.c28
-rw-r--r--drivers/input/keycombo.c261
-rw-r--r--drivers/input/keyreset.c144
-rw-r--r--drivers/input/misc/Kconfig16
-rw-r--r--drivers/input/misc/Makefile2
-rw-r--r--drivers/input/misc/gpio_axis.c192
-rw-r--r--drivers/input/misc/gpio_event.c228
-rw-r--r--drivers/input/misc/gpio_input.c390
-rw-r--r--drivers/input/misc/gpio_matrix.c440
-rw-r--r--drivers/input/misc/gpio_output.c97
-rw-r--r--drivers/input/misc/keychord.c467
-rw-r--r--drivers/md/Kconfig33
-rw-r--r--drivers/md/Makefile1
-rw-r--r--drivers/md/dm-android-verity.c947
-rw-r--r--drivers/md/dm-android-verity.h123
-rw-r--r--drivers/md/dm-crypt.c14
-rw-r--r--drivers/md/dm-ioctl.c39
-rw-r--r--drivers/md/dm-linear.c35
-rw-r--r--drivers/md/dm-table.c1
-rw-r--r--drivers/md/dm-verity-fec.c46
-rw-r--r--drivers/md/dm-verity-fec.h4
-rw-r--r--drivers/md/dm-verity-target.c30
-rw-r--r--drivers/md/dm-verity.h10
-rw-r--r--drivers/md/dm.h2
-rw-r--r--drivers/misc/Kconfig21
-rw-r--r--drivers/misc/Makefile3
-rw-r--r--drivers/misc/memory_state_time.c462
-rw-r--r--drivers/misc/uid_sys_stats.c701
-rw-r--r--drivers/mmc/card/Kconfig12
-rw-r--r--drivers/mmc/card/block.c300
-rw-r--r--drivers/mmc/card/queue.c6
-rw-r--r--drivers/mmc/card/queue.h8
-rw-r--r--drivers/mmc/core/Kconfig15
-rw-r--r--drivers/mmc/core/core.c91
-rw-r--r--drivers/mmc/core/host.c16
-rw-r--r--drivers/mmc/core/host.h5
-rw-r--r--drivers/mmc/core/mmc.c14
-rw-r--r--drivers/mmc/core/sd.c83
-rw-r--r--drivers/mmc/core/sdio.c115
-rw-r--r--drivers/mmc/core/sdio_bus.c13
-rw-r--r--drivers/mmc/core/sdio_io.c33
-rw-r--r--drivers/mtd/nand/Kconfig10
-rw-r--r--drivers/net/ppp/Kconfig17
-rw-r--r--drivers/net/ppp/Makefile2
-rw-r--r--drivers/net/ppp/pppolac.c450
-rw-r--r--drivers/net/ppp/pppopns.c429
-rw-r--r--drivers/net/tun.c6
-rw-r--r--drivers/net/wireless/ti/wlcore/init.c5
-rw-r--r--drivers/nfc/fdp/i2c.c10
-rw-r--r--drivers/nfc/st21nfca/dep.c3
-rw-r--r--drivers/of/fdt.c74
-rw-r--r--drivers/platform/goldfish/Makefile3
-rw-r--r--drivers/platform/goldfish/goldfish_pipe.c273
-rw-r--r--drivers/platform/goldfish/goldfish_pipe.h92
-rw-r--r--drivers/platform/goldfish/goldfish_pipe_v2.c889
-rw-r--r--drivers/power/supply/power_supply_sysfs.c11
-rw-r--r--drivers/rtc/rtc-palmas.c44
-rw-r--r--drivers/scsi/ufs/ufshcd.c81
-rw-r--r--drivers/scsi/ufs/ufshcd.h3
-rw-r--r--drivers/staging/android/Kconfig20
-rw-r--r--drivers/staging/android/Makefile2
-rw-r--r--drivers/staging/android/TODO9
-rw-r--r--drivers/staging/android/ashmem.c26
-rw-r--r--drivers/staging/android/fiq_debugger/Kconfig58
-rw-r--r--drivers/staging/android/fiq_debugger/Makefile4
-rw-r--r--drivers/staging/android/fiq_debugger/fiq_debugger.c1246
-rw-r--r--drivers/staging/android/fiq_debugger/fiq_debugger.h64
-rw-r--r--drivers/staging/android/fiq_debugger/fiq_debugger_arm.c240
-rw-r--r--drivers/staging/android/fiq_debugger/fiq_debugger_arm64.c202
-rw-r--r--drivers/staging/android/fiq_debugger/fiq_debugger_priv.h37
-rw-r--r--drivers/staging/android/fiq_debugger/fiq_debugger_ringbuf.h94
-rw-r--r--drivers/staging/android/fiq_debugger/fiq_watchdog.c56
-rw-r--r--drivers/staging/android/fiq_debugger/fiq_watchdog.h20
-rw-r--r--drivers/staging/android/lowmemorykiller.c102
-rw-r--r--drivers/staging/android/trace/lowmemorykiller.h41
-rw-r--r--drivers/staging/android/uapi/vsoc_shm.h303
-rw-r--r--drivers/staging/android/vsoc.c1165
-rw-r--r--drivers/staging/goldfish/Kconfig8
-rw-r--r--drivers/staging/goldfish/Makefile6
-rw-r--r--drivers/staging/goldfish/goldfish_audio.c13
-rw-r--r--drivers/staging/goldfish/goldfish_sync_timeline.c962
-rw-r--r--drivers/staging/goldfish/goldfish_sync_timeline_fence.c254
-rw-r--r--drivers/staging/goldfish/goldfish_sync_timeline_fence.h58
-rw-r--r--drivers/tty/serial/serial_core.c3
-rw-r--r--drivers/usb/gadget/Kconfig50
-rw-r--r--drivers/usb/gadget/composite.c6
-rw-r--r--drivers/usb/gadget/configfs.c268
-rw-r--r--drivers/usb/gadget/function/Makefile8
-rw-r--r--drivers/usb/gadget/function/f_accessory.c1352
-rw-r--r--drivers/usb/gadget/function/f_audio_source.c1071
-rw-r--r--drivers/usb/gadget/function/f_midi.c66
-rw-r--r--drivers/usb/gadget/function/f_mtp.c1552
-rw-r--r--drivers/usb/gadget/function/f_mtp.h18
-rw-r--r--drivers/usb/gadget/function/f_ptp.c38
-rw-r--r--drivers/usb/phy/Kconfig17
-rw-r--r--drivers/usb/phy/Makefile2
-rw-r--r--drivers/usb/phy/class-dual-role.c529
-rw-r--r--drivers/usb/phy/otg-wakelock.c170
-rw-r--r--drivers/video/fbdev/goldfishfb.c18
-rw-r--r--drivers/w1/masters/ds2482.c47
-rw-r--r--fs/Kconfig1
-rw-r--r--fs/Makefile5
-rw-r--r--fs/attr.c14
-rw-r--r--fs/coredump.c2
-rw-r--r--fs/dcache.c1
-rw-r--r--fs/eventpoll.c4
-rw-r--r--fs/exec.c2
-rw-r--r--fs/ext4/ext4.h3
-rw-r--r--fs/ext4/inline.c14
-rw-r--r--fs/ext4/inode.c55
-rw-r--r--fs/ext4/ioctl.c7
-rw-r--r--fs/ext4/mballoc.c28
-rw-r--r--fs/ext4/readpage.c47
-rw-r--r--fs/f2fs/data.c42
-rw-r--r--fs/f2fs/inline.c18
-rw-r--r--fs/f2fs/super.c8
-rw-r--r--fs/fs-writeback.c2
-rw-r--r--fs/fs_struct.c3
-rw-r--r--fs/fuse/dev.c10
-rw-r--r--fs/fuse/dir.c46
-rw-r--r--fs/fuse/fuse_i.h3
-rw-r--r--fs/inode.c6
-rw-r--r--fs/internal.h4
-rw-r--r--fs/mpage.c36
-rw-r--r--fs/namei.c176
-rw-r--r--fs/namespace.c30
-rw-r--r--fs/notify/fanotify/fanotify_user.c2
-rw-r--r--fs/notify/inotify/inotify_user.c17
-rw-r--r--fs/open.c37
-rw-r--r--fs/pnode.c34
-rw-r--r--fs/pnode.h1
-rw-r--r--fs/proc/base.c8
-rw-r--r--fs/proc/task_mmu.c65
-rw-r--r--fs/proc_namespace.c8
-rw-r--r--fs/pstore/ram.c6
-rw-r--r--fs/sdcardfs/Kconfig13
-rw-r--r--fs/sdcardfs/Makefile7
-rw-r--r--fs/sdcardfs/dentry.c193
-rw-r--r--fs/sdcardfs/derived_perm.c471
-rw-r--r--fs/sdcardfs/file.c461
-rw-r--r--fs/sdcardfs/inode.c808
-rw-r--r--fs/sdcardfs/lookup.c471
-rw-r--r--fs/sdcardfs/main.c479
-rw-r--r--fs/sdcardfs/mmap.c88
-rw-r--r--fs/sdcardfs/multiuser.h53
-rw-r--r--fs/sdcardfs/packagelist.c881
-rw-r--r--fs/sdcardfs/sdcardfs.h638
-rw-r--r--fs/sdcardfs/super.c324
-rw-r--r--fs/super.c30
-rw-r--r--fs/sync.c1
-rw-r--r--fs/userfaultfd.c9
-rw-r--r--fs/utimes.c2
-rw-r--r--include/drm/drm_atomic.h3
-rw-r--r--include/drm/drm_crtc.h40
-rw-r--r--include/drm/drm_fb_cma_helper.h5
-rw-r--r--include/drm/drm_plane.h2
-rw-r--r--include/linux/Kbuild2
-rw-r--r--include/linux/amba/mmci.h10
-rw-r--r--include/linux/android_aid.h28
-rw-r--r--include/linux/blkdev.h76
-rw-r--r--include/linux/bpf-cgroup.h77
-rw-r--r--include/linux/cgroup-defs.h4
-rw-r--r--include/linux/cgroup_subsys.h4
-rw-r--r--include/linux/cpu.h7
-rw-r--r--include/linux/cpufreq.h39
-rw-r--r--include/linux/cpuidle.h2
-rw-r--r--include/linux/dcache.h1
-rw-r--r--include/linux/device-mapper.h13
-rw-r--r--include/linux/fence.h58
-rw-r--r--include/linux/fs.h22
-rw-r--r--include/linux/gpio_event.h170
-rw-r--r--include/linux/if_pppolac.h23
-rw-r--r--include/linux/if_pppopns.h23
-rw-r--r--include/linux/if_pppox.h21
-rw-r--r--include/linux/initramfs.h32
-rw-r--r--include/linux/ipv6.h2
-rw-r--r--include/linux/keychord.h23
-rw-r--r--include/linux/keycombo.h36
-rw-r--r--include/linux/keyreset.h29
-rw-r--r--include/linux/memory-state-time.h42
-rw-r--r--include/linux/mm.h3
-rw-r--r--include/linux/mm_types.h24
-rw-r--r--include/linux/mmc/card.h3
-rw-r--r--include/linux/mmc/core.h4
-rw-r--r--include/linux/mmc/host.h23
-rw-r--r--include/linux/mmc/mmc.h3
-rw-r--r--include/linux/mmc/pm.h1
-rw-r--r--include/linux/mmc/sdio_func.h10
-rw-r--r--include/linux/mount.h1
-rw-r--r--include/linux/namei.h3
-rw-r--r--include/linux/netfilter/xt_qtaguid.h14
-rw-r--r--include/linux/netfilter/xt_quota2.h25
-rw-r--r--include/linux/of_fdt.h21
-rw-r--r--include/linux/perf_event.h5
-rw-r--r--include/linux/platform_data/ds2482.h21
-rw-r--r--include/linux/power_supply.h8
-rw-r--r--include/linux/pstore_ram.h2
-rw-r--r--include/linux/reservation.h15
-rw-r--r--include/linux/sched.h156
-rw-r--r--include/linux/sched/sysctl.h25
-rw-r--r--include/linux/sched_energy.h44
-rw-r--r--include/linux/serial_core.h1
-rw-r--r--include/linux/sock_diag.h1
-rw-r--r--include/linux/suspend.h1
-rw-r--r--include/linux/task_io_accounting.h2
-rw-r--r--include/linux/task_io_accounting_ops.h1
-rw-r--r--include/linux/tick.h1
-rw-r--r--include/linux/timekeeping.h1
-rw-r--r--include/linux/usb/class-dual-role.h129
-rw-r--r--include/linux/usb/composite.h1
-rw-r--r--include/linux/usb/f_accessory.h23
-rw-r--r--include/linux/usb/f_mtp.h23
-rw-r--r--include/linux/wakeup_reason.h32
-rw-r--r--include/linux/wlan_plat.h30
-rw-r--r--include/net/addrconf.h2
-rw-r--r--include/net/fib_rules.h9
-rw-r--r--include/net/flow.h9
-rw-r--r--include/net/ip.h1
-rw-r--r--include/net/ip6_route.h5
-rw-r--r--include/net/route.h5
-rw-r--r--include/net/sock.h7
-rw-r--r--include/net/tcp.h1
-rw-r--r--include/trace/events/android_fs.h65
-rw-r--r--include/trace/events/android_fs_template.h64
-rw-r--r--include/trace/events/cpufreq_interactive.h112
-rw-r--r--include/trace/events/cpufreq_sched.h87
-rw-r--r--include/trace/events/gpu.h143
-rw-r--r--include/trace/events/net.h8
-rw-r--r--include/trace/events/power.h51
-rw-r--r--include/trace/events/sched.h601
-rw-r--r--include/uapi/asm-generic/socket.h2
-rw-r--r--include/uapi/linux/android/binder.h169
-rw-r--r--include/uapi/linux/bpf.h85
-rw-r--r--include/uapi/linux/fib_rules.h6
-rw-r--r--include/uapi/linux/fs.h2
-rw-r--r--include/uapi/linux/fuse.h1
-rw-r--r--include/uapi/linux/hw_breakpoint.h4
-rw-r--r--include/uapi/linux/if_pppolac.h33
-rw-r--r--include/uapi/linux/if_pppopns.h32
-rw-r--r--include/uapi/linux/if_pppox.h6
-rw-r--r--include/uapi/linux/ipv6.h7
-rw-r--r--include/uapi/linux/keychord.h52
-rw-r--r--include/uapi/linux/magic.h2
-rw-r--r--include/uapi/linux/netfilter/xt_IDLETIMER.h8
-rw-r--r--include/uapi/linux/netfilter/xt_socket.h7
-rw-r--r--include/uapi/linux/prctl.h3
-rw-r--r--include/uapi/linux/rtnetlink.h1
-rw-r--r--include/uapi/linux/sysctl.h1
-rw-r--r--include/uapi/linux/usb/f_accessory.h146
-rw-r--r--include/uapi/linux/usb/f_mtp.h61
-rw-r--r--init/Kconfig139
-rw-r--r--init/Makefile4
-rw-r--r--init/do_mounts.c1
-rw-r--r--init/do_mounts.h10
-rw-r--r--init/do_mounts_dm.c470
-rw-r--r--init/initramfs.c19
-rw-r--r--init/noinitramfs.c9
-rw-r--r--ipc/mqueue.c10
-rw-r--r--kernel/bpf/Makefile1
-rw-r--r--kernel/bpf/cgroup.c205
-rw-r--r--kernel/bpf/syscall.c93
-rw-r--r--kernel/cgroup.c22
-rw-r--r--kernel/configs/README.android15
-rw-r--r--kernel/configs/android-base-arm64.cfg5
-rw-r--r--kernel/configs/android-base.config36
-rw-r--r--kernel/configs/android-recommended.config12
-rw-r--r--kernel/cpu.c27
-rw-r--r--kernel/cpuset.c33
-rw-r--r--kernel/debug/kdb/kdb_io.c12
-rw-r--r--kernel/events/core.c8
-rw-r--r--kernel/exit.c5
-rw-r--r--kernel/power/Kconfig1
-rw-r--r--kernel/power/Makefile2
-rw-r--r--kernel/power/process.c34
-rw-r--r--kernel/power/suspend.c35
-rw-r--r--kernel/power/wakeup_reason.c225
-rw-r--r--kernel/printk/printk.c8
-rw-r--r--kernel/sched/Makefile5
-rw-r--r--kernel/sched/core.c333
-rw-r--r--kernel/sched/cpufreq_sched.c499
-rw-r--r--kernel/sched/cpufreq_schedutil.c354
-rw-r--r--kernel/sched/cputime.c16
-rw-r--r--kernel/sched/debug.c95
-rw-r--r--kernel/sched/energy.c124
-rw-r--r--kernel/sched/fair.c2184
-rw-r--r--kernel/sched/features.h9
-rw-r--r--kernel/sched/idle.c3
-rw-r--r--kernel/sched/rt.c57
-rw-r--r--kernel/sched/sched.h272
-rw-r--r--kernel/sched/stats.c26
-rw-r--r--kernel/sched/stop_task.c3
-rw-r--r--kernel/sched/tune.c956
-rw-r--r--kernel/sched/tune.h55
-rw-r--r--kernel/sched/walt.c1133
-rw-r--r--kernel/sched/walt.h64
-rw-r--r--kernel/sys.c152
-rw-r--r--kernel/sysctl.c66
-rw-r--r--kernel/time/tick-sched.c12
-rw-r--r--kernel/time/timekeeping.c29
-rw-r--r--kernel/trace/Kconfig3
-rw-r--r--kernel/trace/Makefile1
-rw-r--r--kernel/trace/gpu-traces.c23
-rw-r--r--kernel/trace/trace.c99
-rw-r--r--kernel/trace/trace.h4
-rw-r--r--kernel/trace/trace_functions_graph.c43
-rw-r--r--kernel/trace/trace_output.c184
-rw-r--r--lib/Kconfig.debug9
-rw-r--r--mm/madvise.c2
-rw-r--r--mm/mempolicy.c3
-rw-r--r--mm/mlock.c2
-rw-r--r--mm/mmap.c40
-rw-r--r--mm/mprotect.c2
-rw-r--r--mm/shmem.c13
-rw-r--r--net/Kconfig10
-rw-r--r--net/bluetooth/af_bluetooth.c29
-rw-r--r--net/bridge/br_device.c11
-rw-r--r--net/core/fib_rules.c78
-rw-r--r--net/core/filter.c69
-rw-r--r--net/core/sock.c13
-rw-r--r--net/core/sock_diag.c2
-rw-r--r--net/ipv4/Makefile1
-rw-r--r--net/ipv4/af_inet.c23
-rw-r--r--net/ipv4/fib_frontend.c1
-rw-r--r--net/ipv4/icmp.c2
-rw-r--r--net/ipv4/inet_connection_sock.c4
-rw-r--r--net/ipv4/ip_output.c29
-rw-r--r--net/ipv4/ping.c3
-rw-r--r--net/ipv4/raw.c2
-rw-r--r--net/ipv4/route.c39
-rw-r--r--net/ipv4/syncookies.c2
-rw-r--r--net/ipv4/sysctl_net_ipv4.c22
-rw-r--r--net/ipv4/sysfs_net_ipv4.c88
-rw-r--r--net/ipv4/tcp_input.c1
-rw-r--r--net/ipv4/tcp_ipv4.c9
-rw-r--r--net/ipv4/tcp_output.c2
-rw-r--r--net/ipv4/udp.c3
-rw-r--r--net/ipv6/addrconf.c49
-rw-r--r--net/ipv6/af_inet6.c21
-rw-r--r--net/ipv6/ah6.c5
-rw-r--r--net/ipv6/datagram.c1
-rw-r--r--net/ipv6/esp6.c5
-rw-r--r--net/ipv6/exthdrs_core.c13
-rw-r--r--net/ipv6/icmp.c7
-rw-r--r--net/ipv6/inet6_connection_sock.c2
-rw-r--r--net/ipv6/ip6_gre.c4
-rw-r--r--net/ipv6/ip6_output.c16
-rw-r--r--net/ipv6/ip6_tunnel.c4
-rw-r--r--net/ipv6/ip6_vti.c5
-rw-r--r--net/ipv6/ipcomp6.c5
-rw-r--r--net/ipv6/ndisc.c2
-rw-r--r--net/ipv6/netfilter.c1
-rw-r--r--net/ipv6/ping.c1
-rw-r--r--net/ipv6/raw.c1
-rw-r--r--net/ipv6/route.c70
-rw-r--r--net/ipv6/syncookies.c1
-rw-r--r--net/ipv6/tcp_ipv6.c5
-rw-r--r--net/ipv6/udp.c1
-rw-r--r--net/l2tp/l2tp_ip6.c1
-rw-r--r--net/netfilter/Kconfig41
-rw-r--r--net/netfilter/Makefile2
-rw-r--r--net/netfilter/xt_IDLETIMER.c246
-rw-r--r--net/netfilter/xt_qtaguid.c3031
-rw-r--r--net/netfilter/xt_qtaguid_internal.h350
-rw-r--r--net/netfilter/xt_qtaguid_print.c566
-rw-r--r--net/netfilter/xt_qtaguid_print.h120
-rw-r--r--net/netfilter/xt_quota2.c401
-rw-r--r--net/netfilter/xt_socket.c31
-rw-r--r--net/rfkill/Kconfig5
-rw-r--r--net/rfkill/core.c11
-rw-r--r--net/socket.c14
-rw-r--r--net/wireless/scan.c2
-rw-r--r--net/xfrm/xfrm_algo.c2
-rw-r--r--net/xfrm/xfrm_user.c18
-rw-r--r--samples/bpf/Makefile2
-rw-r--r--samples/bpf/libbpf.c23
-rw-r--r--samples/bpf/libbpf.h4
-rw-r--r--samples/bpf/test_cgrp2_attach.c147
-rw-r--r--scripts/Makefile.clean2
-rw-r--r--scripts/Makefile.lib6
-rw-r--r--scripts/Makefile.modinst2
-rwxr-xr-xscripts/checkpatch.pl1
-rw-r--r--security/Kconfig9
-rw-r--r--security/commoncap.c11
-rw-r--r--security/inode.c2
-rw-r--r--security/security.c1
-rw-r--r--security/selinux/hooks.c2
-rw-r--r--tools/include/uapi/linux/bpf.h85
-rw-r--r--tools/include/uapi/linux/hw_breakpoint.h4
532 files changed, 51885 insertions, 2804 deletions
diff --git a/Documentation/ABI/testing/sysfs-class-dual-role-usb b/Documentation/ABI/testing/sysfs-class-dual-role-usb
new file mode 100644
index 000000000000..a900fd75430c
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-dual-role-usb
@@ -0,0 +1,71 @@
+What: /sys/class/dual_role_usb/.../
+Date: June 2015
+Contact: Badhri Jagan Sridharan<badhri@google.com>
+Description:
+ Provide a generic interface to monitor and change
+ the state of dual role usb ports. The name here
+ refers to the name mentioned in the
+ dual_role_phy_desc that is passed while registering
+ the dual_role_phy_instance through
+ devm_dual_role_instance_register.
+
+What: /sys/class/dual_role_usb/.../supported_modes
+Date: June 2015
+Contact: Badhri Jagan Sridharan<badhri@google.com>
+Description:
+ This is a static node, once initialized this
+ is not expected to change during runtime. "dfp"
+ refers to "downstream facing port" i.e. port can
+ only act as host. "ufp" refers to "upstream
+ facing port" i.e. port can only act as device.
+ "dfp ufp" refers to "dual role port" i.e. the port
+ can either be a host port or a device port.
+
+What: /sys/class/dual_role_usb/.../mode
+Date: June 2015
+Contact: Badhri Jagan Sridharan<badhri@google.com>
+Description:
+ The mode node refers to the current mode in which the
+ port is operating. "dfp" for host ports. "ufp" for device
+ ports and "none" when cable is not connected.
+
+ On devices where the USB mode is software-controllable,
+ userspace can change the mode by writing "dfp" or "ufp".
+ On devices where the USB mode is fixed in hardware,
+ this attribute is read-only.
+
+What: /sys/class/dual_role_usb/.../power_role
+Date: June 2015
+Contact: Badhri Jagan Sridharan<badhri@google.com>
+Description:
+ The power_role node mentions whether the port
+ is "sink"ing or "source"ing power. "none" if
+ they are not connected.
+
+ On devices implementing USB Power Delivery,
+ userspace can control the power role by writing "sink" or
+ "source". On devices without USB-PD, this attribute is
+ read-only.
+
+What: /sys/class/dual_role_usb/.../data_role
+Date: June 2015
+Contact: Badhri Jagan Sridharan<badhri@google.com>
+Description:
+ The data_role node mentions whether the port
+ is acting as "host" or "device" for USB data connection.
+ "none" if there is no active data link.
+
+ On devices implementing USB Power Delivery, userspace
+ can control the data role by writing "host" or "device".
+ On devices without USB-PD, this attribute is read-only.
+
+What: /sys/class/dual_role_usb/.../powers_vconn
+Date: June 2015
+Contact: Badhri Jagan Sridharan<badhri@google.com>
+Description:
+ The powers_vconn node mentions whether the port
+ is supplying power for VCONN pin.
+
+ On devices with software control of VCONN,
+ userspace can disable the power supply to VCONN by writing "n",
+ or enable the power supply by writing "y".
diff --git a/Documentation/ABI/testing/sysfs-kernel-wakeup_reasons b/Documentation/ABI/testing/sysfs-kernel-wakeup_reasons
new file mode 100644
index 000000000000..acb19b91c192
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-wakeup_reasons
@@ -0,0 +1,16 @@
+What: /sys/kernel/wakeup_reasons/last_resume_reason
+Date: February 2014
+Contact: Ruchi Kandoi <kandoiruchi@google.com>
+Description:
+ The /sys/kernel/wakeup_reasons/last_resume_reason is
+ used to report wakeup reasons after system exited suspend.
+
+What: /sys/kernel/wakeup_reasons/last_suspend_time
+Date: March 2015
+Contact: jinqian <jinqian@google.com>
+Description:
+ The /sys/kernel/wakeup_reasons/last_suspend_time is
+ used to report time spent in last suspend cycle. It contains
+ two numbers (in seconds) separated by space. First number is
+ the time spent in suspend and resume processes. Second number
+ is the time spent in sleep state.
\ No newline at end of file
diff --git a/Documentation/android.txt b/Documentation/android.txt
new file mode 100644
index 000000000000..0f40a78b045f
--- /dev/null
+++ b/Documentation/android.txt
@@ -0,0 +1,121 @@
+ =============
+ A N D R O I D
+ =============
+
+Copyright (C) 2009 Google, Inc.
+Written by Mike Chan <mike@android.com>
+
+CONTENTS:
+---------
+
+1. Android
+ 1.1 Required enabled config options
+ 1.2 Required disabled config options
+ 1.3 Recommended enabled config options
+2. Contact
+
+
+1. Android
+==========
+
+Android (www.android.com) is an open source operating system for mobile devices.
+This document describes configurations needed to run the Android framework on
+top of the Linux kernel.
+
+To see a working defconfig look at msm_defconfig or goldfish_defconfig
+which can be found at http://android.git.kernel.org in kernel/common.git
+and kernel/msm.git
+
+
+1.1 Required enabled config options
+-----------------------------------
+After building a standard defconfig, ensure that these options are enabled in
+your .config or defconfig if they are not already. Based off the msm_defconfig.
+You should keep the rest of the default options enabled in the defconfig
+unless you know what you are doing.
+
+ANDROID_PARANOID_NETWORK
+ASHMEM
+CONFIG_FB_MODE_HELPERS
+CONFIG_FONT_8x16
+CONFIG_FONT_8x8
+CONFIG_YAFFS_SHORT_NAMES_IN_RAM
+DAB
+EARLYSUSPEND
+FB
+FB_CFB_COPYAREA
+FB_CFB_FILLRECT
+FB_CFB_IMAGEBLIT
+FB_DEFERRED_IO
+FB_TILEBLITTING
+HIGH_RES_TIMERS
+INOTIFY
+INOTIFY_USER
+INPUT_EVDEV
+INPUT_GPIO
+INPUT_MISC
+LEDS_CLASS
+LEDS_GPIO
+LOCK_KERNEL
+LOGGER
+LOW_MEMORY_KILLER
+MISC_DEVICES
+NEW_LEDS
+NO_HZ
+POWER_SUPPLY
+PREEMPT
+RAMFS
+RTC_CLASS
+RTC_LIB
+SWITCH
+SWITCH_GPIO
+TMPFS
+UID_STAT
+UID16
+USB_FUNCTION
+USB_FUNCTION_ADB
+USER_WAKELOCK
+VIDEO_OUTPUT_CONTROL
+WAKELOCK
+YAFFS_AUTO_YAFFS2
+YAFFS_FS
+YAFFS_YAFFS1
+YAFFS_YAFFS2
+
+
+1.2 Required disabled config options
+------------------------------------
+CONFIG_YAFFS_DISABLE_LAZY_LOAD
+DNOTIFY
+
+
+1.3 Recommended enabled config options
+--------------------------------------
+ANDROID_PMEM
+PSTORE_CONSOLE
+PSTORE_RAM
+SCHEDSTATS
+DEBUG_PREEMPT
+DEBUG_MUTEXES
+DEBUG_SPINLOCK_SLEEP
+DEBUG_INFO
+FRAME_POINTER
+CPU_FREQ
+CPU_FREQ_TABLE
+CPU_FREQ_DEFAULT_GOV_ONDEMAND
+CPU_FREQ_GOV_ONDEMAND
+CRC_CCITT
+EMBEDDED
+INPUT_TOUCHSCREEN
+I2C
+I2C_BOARDINFO
+LOG_BUF_SHIFT=17
+SERIAL_CORE
+SERIAL_CORE_CONSOLE
+
+
+2. Contact
+==========
+website: http://android.git.kernel.org
+
+mailing-lists: android-kernel@googlegroups.com
diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX
index e55103ace382..a542b9f2a30d 100644
--- a/Documentation/block/00-INDEX
+++ b/Documentation/block/00-INDEX
@@ -30,3 +30,9 @@ switching-sched.txt
- Switching I/O schedulers at runtime
writeback_cache_control.txt
- Control of volatile write back caches
+mmc-max-speed.txt
+ - eMMC layer speed simulation, related to /sys/block/mmcblk*/
+ attributes:
+ max_read_speed
+ max_write_speed
+ cache_size
diff --git a/Documentation/block/mmc-max-speed.txt b/Documentation/block/mmc-max-speed.txt
new file mode 100644
index 000000000000..3f052b9fb999
--- /dev/null
+++ b/Documentation/block/mmc-max-speed.txt
@@ -0,0 +1,38 @@
+eMMC Block layer simulation speed controls in /sys/block/mmcblk*/
+===============================================
+
+Turned on with CONFIG_MMC_SIMULATE_MAX_SPEED which enables MMC device speed
+limiting. Used to test and simulate the behavior of the system when
+confronted with a slow MMC.
+
+Enables max_read_speed, max_write_speed and cache_size attributes and module
+default parameters to control the write or read maximum KB/second speed
+behaviors.
+
+NB: There is room for improving the algorithm for aspects tied directly to
+eMMC specific behavior. For instance, wear leveling and stalls from an
+exhausted erase pool. We would expect that if there was a need to provide
+similar speed simulation controls to other types of block devices, aspects of
+their behavior are modelled separately (e.g. head seek times, heat assist,
+shingling and rotational latency).
+
+/sys/block/mmcblk0/max_read_speed:
+
+Number of KB/second reads allowed to the block device. Used to test and
+simulate the behavior of the system when confronted with a slow reading MMC.
+Set to 0 or "off" to place no speed limit.
+
+/sys/block/mmcblk0/max_write_speed:
+
+Number of KB/second writes allowed to the block device. Used to test and
+simulate the behavior of the system when confronted with a slow writing MMC.
+Set to 0 or "off" to place no speed limit.
+
+/sys/block/mmcblk0/cache_size:
+
+Number of MB of high speed memory or high speed SLC cache expected on the
+eMMC device being simulated. Used to help simulate the write-back behavior
+more accurately. The assumption is the cache has no delay, but draws down
+in the background to the MLC/TLC primary store at the max_write_speed rate.
+Any write speed delays will show up when the cache is full, or when an I/O
+request to flush is issued.
diff --git a/Documentation/cpu-freq/governors.txt b/Documentation/cpu-freq/governors.txt
index c15aa75f5227..0cf9a6bff6a5 100644
--- a/Documentation/cpu-freq/governors.txt
+++ b/Documentation/cpu-freq/governors.txt
@@ -28,6 +28,7 @@ Contents:
2.3 Userspace
2.4 Ondemand
2.5 Conservative
+2.6 Interactive
3. The Governor Interface in the CPUfreq Core
@@ -218,6 +219,91 @@ a decision on when to decrease the frequency while running in any
speed. Load for frequency increase is still evaluated every
sampling rate.
+2.6 Interactive
+---------------
+
+The CPUfreq governor "interactive" is designed for latency-sensitive,
+interactive workloads. This governor sets the CPU speed depending on
+usage, similar to "ondemand" and "conservative" governors, but with a
+different set of configurable behaviors.
+
+The tunable values for this governor are:
+
+above_hispeed_delay: When speed is at or above hispeed_freq, wait for
+this long before raising speed in response to continued high load.
+The format is a single delay value, optionally followed by pairs of
+CPU speeds and the delay to use at or above those speeds. Colons can
+be used between the speeds and associated delays for readability. For
+example:
+
+ 80000 1300000:200000 1500000:40000
+
+uses delay 80000 uS until CPU speed 1.3 GHz, at which speed delay
+200000 uS is used until speed 1.5 GHz, at which speed (and above)
+delay 40000 uS is used. If speeds are specified these must appear in
+ascending order. Default is 20000 uS.
+
+boost: If non-zero, immediately boost speed of all CPUs to at least
+hispeed_freq until zero is written to this attribute. If zero, allow
+CPU speeds to drop below hispeed_freq according to load as usual.
+Default is zero.
+
+boostpulse: On each write, immediately boost speed of all CPUs to
+hispeed_freq for at least the period of time specified by
+boostpulse_duration, after which speeds are allowed to drop below
+hispeed_freq according to load as usual. It's a write-only file.
+
+boostpulse_duration: Length of time to hold CPU speed at hispeed_freq
+on a write to boostpulse, before allowing speed to drop according to
+load as usual. Default is 80000 uS.
+
+go_hispeed_load: The CPU load at which to ramp to hispeed_freq.
+Default is 99%.
+
+hispeed_freq: An intermediate "high speed" at which to initially ramp
+when CPU load hits the value specified in go_hispeed_load. If load
+stays high for the amount of time specified in above_hispeed_delay,
+then speed may be bumped higher. Default is the maximum speed allowed
+by the policy at governor initialization time.
+
+io_is_busy: If set, the governor accounts IO time as CPU busy time.
+
+min_sample_time: The minimum amount of time to spend at the current
+frequency before ramping down. Default is 80000 uS.
+
+target_loads: CPU load values used to adjust speed to influence the
+current CPU load toward that value. In general, the lower the target
+load, the more often the governor will raise CPU speeds to bring load
+below the target. The format is a single target load, optionally
+followed by pairs of CPU speeds and CPU loads to target at or above
+those speeds. Colons can be used between the speeds and associated
+target loads for readability. For example:
+
+ 85 1000000:90 1700000:99
+
+targets CPU load 85% below speed 1GHz, 90% at or above 1GHz, until
+1.7GHz and above, at which load 99% is targeted. If speeds are
+specified these must appear in ascending order. Higher target load
+values are typically specified for higher speeds, that is, target load
+values also usually appear in an ascending order. The default is
+target load 90% for all speeds.
+
+timer_rate: Sample rate for reevaluating CPU load when the CPU is not
+idle. A deferrable timer is used, such that the CPU will not be woken
+from idle to service this timer until something else needs to run.
+(The maximum time to allow deferring this timer when not running at
+minimum speed is configurable via timer_slack.) Default is 20000 uS.
+
+timer_slack: Maximum additional time to defer handling the governor
+sampling timer beyond timer_rate when running at speeds above the
+minimum. For platforms that consume additional power at idle when
+CPUs are running at speeds greater than minimum, this places an upper
+bound on how long the timer will be deferred prior to re-evaluating
+load and dropping speed. For example, if timer_rate is 20000uS and
+timer_slack is 10000uS then timers will be deferred for up to 30msec
+when not at lowest speed. A value of -1 means defer timers
+indefinitely at all speeds. Default is 80000 uS.
+
3. The Governor Interface in the CPUfreq Core
=============================================
diff --git a/Documentation/device-mapper/boot.txt b/Documentation/device-mapper/boot.txt
new file mode 100644
index 000000000000..adcaad5e5e32
--- /dev/null
+++ b/Documentation/device-mapper/boot.txt
@@ -0,0 +1,42 @@
+Boot time creation of mapped devices
+===================================
+
+It is possible to configure a device mapper device to act as the root
+device for your system in two ways.
+
+The first is to build an initial ramdisk which boots to a minimal
+userspace which configures the device, then pivot_root(8) into it.
+
+For simple device mapper configurations, it is possible to boot directly
+using the following kernel command line:
+
+dm="<name> <uuid> <ro>,table line 1,...,table line n"
+
+name = the name to associate with the device
+ after boot, udev, if used, will use that name to label
+ the device node.
+uuid = may be 'none' or the UUID desired for the device.
+ro = may be "ro" or "rw". If "ro", the device and device table will be
+ marked read-only.
+
+Each table line may be as normal when using the dmsetup tool except for
+two variations:
+1. Any use of commas will be interpreted as a newline
+2. Quotation marks cannot be escaped and cannot be used without
+ terminating the dm= argument.
+
+Unless renamed by udev, the device node created will be dm-0 as the
+first minor number for the device-mapper is used during early creation.
+
+Example
+=======
+
+- Booting to a linear array made up of user-mode linux block devices:
+
+ dm="lroot none 0, 0 4096 linear 98:16 0, 4096 4096 linear 98:32 0" \
+ root=/dev/dm-0
+
+Will boot to a rw dm-linear target of 8192 sectors split across two
+block devices identified by their major:minor numbers. After boot, udev
+will rename this target to /dev/mapper/lroot (depending on the rules).
+No uuid was assigned.
diff --git a/Documentation/devicetree/bindings/misc/memory-state-time.txt b/Documentation/devicetree/bindings/misc/memory-state-time.txt
new file mode 100644
index 000000000000..c99a506c030d
--- /dev/null
+++ b/Documentation/devicetree/bindings/misc/memory-state-time.txt
@@ -0,0 +1,8 @@
+Memory bandwidth and frequency state tracking
+
+Required properties:
+- compatible : should be:
+ "memory-state-time"
+- freq-tbl: Should contain entries with each frequency in Hz.
+- bw-buckets: Should contain upper-bound limits for each bandwidth bucket in Mbps.
+ Must match the framework power_profile.xml for the device.
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 74329fd0add2..6e027ae50d7e 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -392,6 +392,8 @@ is not associated with a file:
[stack] = the stack of the main process
[vdso] = the "virtual dynamic shared object",
the kernel system call handler
+ [anon:<name>] = an anonymous mapping that has been
+ named by userspace
or if empty, the mapping is anonymous.
@@ -419,6 +421,7 @@ KernelPageSize: 4 kB
MMUPageSize: 4 kB
Locked: 0 kB
VmFlags: rd ex mr mw me dw
+Name: name from userspace
the first of these lines shows the same information as is displayed for the
mapping in /proc/PID/maps. The remaining lines show the size of the mapping
@@ -486,6 +489,9 @@ Note that there is no guarantee that every flag and associated mnemonic will
be present in all further kernel releases. Things get changed, the flags may
be vanished or the reverse -- new added.
+The "Name" field will only be present on a mapping that has been named by
+userspace, and will show the name passed in by userspace.
+
This file is only present if the CONFIG_MMU kernel configuration option is
enabled.
diff --git a/Documentation/gpu/drm-kms.rst b/Documentation/gpu/drm-kms.rst
index 53b872c105d2..db86cda6fa34 100644
--- a/Documentation/gpu/drm-kms.rst
+++ b/Documentation/gpu/drm-kms.rst
@@ -308,6 +308,12 @@ Color Management Properties
.. kernel-doc:: drivers/gpu/drm/drm_color_mgmt.c
:export:
+Explicit Fencing Properties
+---------------------------
+
+.. kernel-doc:: drivers/gpu/drm/drm_atomic.c
+ :doc: explicit fencing properties
+
Existing KMS Properties
-----------------------
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index c708a50b060e..02869140671e 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -87,6 +87,7 @@ parameter is applicable:
BLACKFIN Blackfin architecture is enabled.
CLK Common clock infrastructure is enabled.
CMA Contiguous Memory Area support is enabled.
+ DM Device mapper support is enabled.
DRM Direct Rendering Management support is enabled.
DYNAMIC_DEBUG Build in debug messages and enable them at runtime
EDD BIOS Enhanced Disk Drive Services (EDD) is enabled
@@ -1034,6 +1035,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
dis_ucode_ldr [X86] Disable the microcode loader.
+ dm= [DM] Allows early creation of a device-mapper device.
+ See Documentation/device-mapper/boot.txt.
+
+ dmasound= [HW,OSS] Sound subsystem buffers
+
dma_debug=off If the kernel is compiled with DMA_API_DEBUG support,
this option disables the debugging code at boot.
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index dbdc4130e149..8b93b3e79639 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -600,6 +600,16 @@ tcp_fastopen - INTEGER
Note that that additional client or server features are only
effective if the basic support (0x1 and 0x2) are enabled respectively.
+tcp_fwmark_accept - BOOLEAN
+ If set, incoming connections to listening sockets that do not have a
+ socket mark will set the mark of the accepting socket to the fwmark of
+ the incoming SYN packet. This will cause all packets on that connection
+ (starting from the first SYNACK) to be sent with that fwmark. The
+ listening socket's mark is unchanged. Listening sockets that already
+ have a fwmark set via setsockopt(SOL_SOCKET, SO_MARK, ...) are
+ unaffected.
+ Default: 0
+
tcp_syn_retries - INTEGER
Number of times initial SYNs for an active TCP connection attempt
will be retransmitted. Should not be higher than 127. Default value
@@ -1439,11 +1449,20 @@ accept_ra_pinfo - BOOLEAN
Functional default: enabled if accept_ra is enabled.
disabled if accept_ra is disabled.
+accept_ra_rt_info_min_plen - INTEGER
+ Minimum prefix length of Route Information in RA.
+
+ Route Information w/ prefix smaller than this variable shall
+ be ignored.
+
+ Functional default: 0 if accept_ra_rtr_pref is enabled.
+ -1 if accept_ra_rtr_pref is disabled.
+
accept_ra_rt_info_max_plen - INTEGER
Maximum prefix length of Route Information in RA.
- Route Information w/ prefix larger than or equal to this
- variable shall be ignored.
+ Route Information w/ prefix larger than this variable shall
+ be ignored.
Functional default: 0 if accept_ra_rtr_pref is enabled.
-1 if accept_ra_rtr_pref is disabled.
diff --git a/Documentation/scheduler/sched-energy.txt b/Documentation/scheduler/sched-energy.txt
new file mode 100644
index 000000000000..dab2f9088b33
--- /dev/null
+++ b/Documentation/scheduler/sched-energy.txt
@@ -0,0 +1,362 @@
+Energy cost model for energy-aware scheduling (EXPERIMENTAL)
+
+Introduction
+=============
+
+The basic energy model uses platform energy data stored in sched_group_energy
+data structures attached to the sched_groups in the sched_domain hierarchy. The
+energy cost model offers two functions that can be used to guide scheduling
+decisions:
+
+1. static unsigned int sched_group_energy(struct energy_env *eenv)
+2. static int energy_diff(struct energy_env *eenv)
+
+sched_group_energy() estimates the energy consumed by all cpus in a specific
+sched_group including any shared resources owned exclusively by this group of
+cpus. Resources shared with other cpus are excluded (e.g. later level caches).
+
+energy_diff() estimates the total energy impact of a utilization change. That
+is, adding, removing, or migrating utilization (tasks).
+
+Both functions use a struct energy_env to specify the scenario to be evaluated:
+
+ struct energy_env {
+ struct sched_group *sg_top;
+ struct sched_group *sg_cap;
+ int cap_idx;
+ int util_delta;
+ int src_cpu;
+ int dst_cpu;
+ int energy;
+ };
+
+sg_top: sched_group to be evaluated. Not used by energy_diff().
+
+sg_cap: sched_group covering the cpus in the same frequency domain. Set by
+sched_group_energy().
+
+cap_idx: Capacity state to be used for energy calculations. Set by
+find_new_capacity().
+
+util_delta: Amount of utilization to be added, removed, or migrated.
+
+src_cpu: Source cpu from where 'util_delta' utilization is removed. Should be
+-1 if no source (e.g. task wake-up).
+
+dst_cpu: Destination cpu where 'util_delta' utilization is added. Should be -1
+if utilization is removed (e.g. terminating tasks).
+
+energy: Result of sched_group_energy().
+
+The metric used to represent utilization is the actual per-entity running time
+averaged over time using a geometric series. Very similar to the existing
+per-entity load-tracking, but _not_ scaled by task priority and capped by the
+capacity of the cpu. The latter property does mean that utilization may
+underestimate the compute requirements for tasks on fully/over utilized cpus.
+The greatest potential for energy savings without affecting performance too much
+is in scenarios where the system isn't fully utilized. If the system is deemed
+fully utilized load-balancing should be done with task load (includes task
+priority) instead in the interest of fairness and performance.
+
+
+Background and Terminology
+===========================
+
+To make it clear from the start:
+
+energy = [joule] (resource like a battery on powered devices)
+power = energy/time = [joule/second] = [watt]
+
+The goal of energy-aware scheduling is to minimize energy, while still getting
+the job done. That is, we want to maximize:
+
+ performance [inst/s]
+ --------------------
+ power [W]
+
+which is equivalent to minimizing:
+
+ energy [J]
+ -----------
+ instruction
+
+while still getting 'good' performance. It is essentially an alternative
+optimization objective to the current performance-only objective for the
+scheduler. This alternative considers two objectives: energy-efficiency and
+performance. Hence, there needs to be a user controllable knob to switch the
+objective. Since it is early days, this is currently a sched_feature
+(ENERGY_AWARE).
+
+The idea behind introducing an energy cost model is to allow the scheduler to
+evaluate the implications of its decisions rather than applying energy-saving
+techniques blindly that may only have positive effects on some platforms. At
+the same time, the energy cost model must be as simple as possible to minimize
+the scheduler latency impact.
+
+Platform topology
+------------------
+
+The system topology (cpus, caches, and NUMA information, not peripherals) is
+represented in the scheduler by the sched_domain hierarchy which has
+sched_groups attached at each level that covers one or more cpus (see
+sched-domains.txt for more details). To add energy awareness to the scheduler
+we need to consider power and frequency domains.
+
+Power domain:
+
+A power domain is a part of the system that can be powered on/off
+independently. Power domains are typically organized in a hierarchy where you
+may be able to power down just a cpu or a group of cpus along with any
+associated resources (e.g. shared caches). Powering up a cpu means that all
+power domains it is a part of in the hierarchy must be powered up. Hence, it is
+more expensive to power up the first cpu that belongs to a higher level power
+domain than powering up additional cpus in the same high level domain. Two
+level power domain hierarchy example:
+
+ Power source
+ +-------------------------------+----...
+per group PD G G
+ | +----------+ |
+ +--------+-------| Shared | (other groups)
+per-cpu PD G G | resource |
+ | | +----------+
+ +-------+ +-------+
+ | CPU 0 | | CPU 1 |
+ +-------+ +-------+
+
+Frequency domain:
+
+Frequency domains (P-states) typically cover the same group of cpus as one of
+the power domain levels. That is, there might be several smaller power domains
+sharing the same frequency (P-state) or there might be a power domain spanning
+multiple frequency domains.
+
+From a scheduling point of view there is no need to know the actual frequencies
+[Hz]. All the scheduler cares about is the compute capacity available at the
+current state (P-state) the cpu is in and any other available states. For that
+reason, and to also factor in any cpu micro-architecture differences, compute
+capacity scaling states are called 'capacity states' in this document. For SMP
+systems this is equivalent to P-states. For mixed micro-architecture systems
+(like ARM big.LITTLE) it is P-states scaled according to the micro-architecture
+performance relative to the other cpus in the system.
+
+Energy modelling:
+------------------
+
+Due to the hierarchical nature of the power domains, the most obvious way to
+model energy costs is therefore to associate power and energy costs with
+domains (groups of cpus). Energy costs of shared resources are associated with
+the group of cpus that share the resources; only the cost of powering the
+cpu itself and any private resources (e.g. private L1 caches) is associated
+with the per-cpu groups (lowest level).
+
+For example, for an SMP system with per-cpu power domains and a cluster level
+(group of cpus) power domain we get the overall energy costs to be:
+
+ energy = energy_cluster + n * energy_cpu
+
+where 'n' is the number of cpus powered up and energy_cluster is the cost paid
+as soon as any cpu in the cluster is powered up.
+
+The power and frequency domains can naturally be mapped onto the existing
+sched_domain hierarchy and sched_groups by adding the necessary data to the
+existing data structures.
+
+The energy model considers energy consumption from two contributors (shown in
+the illustration below):
+
+1. Busy energy: Energy consumed while a cpu and the higher level groups that it
+belongs to are busy running tasks. Busy energy is associated with the state of
+the cpu, not an event. The time the cpu spends in this state varies. Thus, the
+most obvious platform parameter for this contribution is busy power
+(energy/time).
+
+2. Idle energy: Energy consumed while a cpu and higher level groups that it
+belongs to are idle (in a C-state). Like busy energy, idle energy is associated
+with the state of the cpu. Thus, the platform parameter for this contribution
+is idle power (energy/time).
+
+Energy consumed during transitions from an idle-state (C-state) to a busy state
+(P-state) or going the other way is ignored by the model to simplify the energy
+model calculations.
+
+
+ Power
+ ^
+ | busy->idle idle->busy
+ | transition transition
+ |
+ | _ __
+ | / \ / \__________________
+ |______________/ \ /
+ | \ /
+ | Busy \ Idle / Busy
+ | low P-state \____________/ high P-state
+ |
+ +------------------------------------------------------------> time
+
+Busy |--------------| |-----------------|
+
+Wakeup |------| |------|
+
+Idle |------------|
+
+
+The basic algorithm
+====================
+
+The basic idea is to determine the total energy impact when utilization is
+added or removed by estimating the impact at each level in the sched_domain
+hierarchy starting from the bottom (sched_group contains just a single cpu).
+The energy cost comes from busy time (sched_group is awake because one or more
+cpus are busy) and idle time (in an idle-state). Energy model numbers account
+for energy costs associated with all cpus in the sched_group as a group.
+
+ for_each_domain(cpu, sd) {
+ sg = sched_group_of(cpu)
+ energy_before = curr_util(sg) * busy_power(sg)
+ + (1-curr_util(sg)) * idle_power(sg)
+ energy_after = new_util(sg) * busy_power(sg)
+ + (1-new_util(sg)) * idle_power(sg)
+ energy_diff += energy_before - energy_after
+
+ }
+
+ return energy_diff
+
+{curr, new}_util: The cpu utilization at the lowest level and the overall
+non-idle time for the entire group for higher levels. Utilization is in the
+range 0.0 to 1.0 in the pseudo-code.
+
+busy_power: The power consumption of the sched_group.
+
+idle_power: The power consumption of the sched_group when idle.
+
+Note: It is a fundamental assumption that the utilization is (roughly) scale
+invariant. Task utilization tracking factors in any frequency scaling and
+performance scaling differences due to different cpu microarchitectures such
+that task utilization can be used across the entire system.
+
+
+Platform energy data
+=====================
+
+struct sched_group_energy can be attached to sched_groups in the sched_domain
+hierarchy and has the following members:
+
+cap_states:
+ List of struct capacity_state representing the supported capacity states
+ (P-states). struct capacity_state has two members: cap and power, which
+ represents the compute capacity and the busy_power of the state. The
+ list must be ordered by capacity low->high.
+
+nr_cap_states:
+ Number of capacity states in cap_states list.
+
+idle_states:
+ List of struct idle_state containing idle_state power cost for each
+ idle-state supported by the system ordered by shallowest state first.
+ All states must be included at all levels in the hierarchy, i.e. a
+ sched_group spanning just a single cpu must also include coupled
+ idle-states (cluster states). In addition to the cpuidle idle-states,
+ the list must also contain an entry for idling using the arch
+ default idle (arch_idle_cpu()). Although this state may not be a true
+ hardware idle-state, it is considered the shallowest idle-state in the
+ energy model and must be the first entry. cpus may enter this state
+ (possibly 'active idling') if cpuidle decides not to enter a cpuidle
+ idle-state. Default idle may not be used when cpuidle is enabled.
+ In this case, it should just be a copy of the first cpuidle idle-state.
+
+nr_idle_states:
+ Number of idle states in idle_states list.
+
+There are no unit requirements for the energy cost data. Data can be normalized
+with any reference, however, the normalization must be consistent across all
+energy cost data. That is, one bogo-joule/watt must be the same quantity for
+all data, but we don't care what it is.
+
+A recipe for platform characterization
+=======================================
+
+Obtaining the actual model data for a particular platform requires some way of
+measuring power/energy. There isn't a tool to help with this (yet). This
+section provides a recipe for use as reference. It covers the steps used to
+characterize the ARM TC2 development platform. This sort of measurement is
+expected to be done anyway when tuning cpuidle and cpufreq for a given
+platform.
+
+The energy model needs two types of data (struct sched_group_energy holds
+these) for each sched_group where energy costs should be taken into account:
+
+1. Capacity state information
+
+A list containing the compute capacity and power consumption when fully
+utilized attributed to the group as a whole for each available capacity state.
+At the lowest level (group contains just a single cpu) this is the power of the
+cpu alone without including power consumed by resources shared with other cpus.
+It basically needs to fit the basic modelling approach described in the "Background
+and Terminology" section:
+
+ energy_system = energy_shared + n * energy_cpu
+
+for a system containing 'n' busy cpus. Only 'energy_cpu' should be included at
+the lowest level. 'energy_shared' is included at the next level which
+represents the group of cpus among which the resources are shared.
+
+This model is, of course, a simplification of reality. Thus, power/energy
+attributions might not always exactly represent how the hardware is designed.
+Also, busy power is likely to depend on the workload. It is therefore
+recommended to use a representative mix of workloads when characterizing the
+capacity states.
+
+If the group has no capacity scaling support, the list will contain a single
+state where power is the busy power attributed to the group. The capacity
+should be set to a default value (1024).
+
+When frequency domains include multiple power domains, the group representing
+the frequency domain and all child groups share capacity states. This must be
+indicated by setting the SD_SHARE_CAP_STATES sched_domain flag. All groups at
+all levels that share the capacity state must have the list of capacity states
+with the power set to the contribution of the individual group.
+
+2. Idle power information
+
+Stored in the idle_states list. The power number is the group idle power
+consumption in each idle state as well as when the group is idle but has not
+entered an idle-state ('active idle' as mentioned earlier). Due to the way the
+energy model is defined, the idle power of the deepest group idle state can
+alternatively be accounted for in the parent group busy power. In that case the
+group idle state power values are offset such that the idle power of the
+deepest state is zero. It is less intuitive, but it is easier to measure as
+idle power consumed by the group and the busy/idle power of the parent group
+cannot be distinguished without per group measurement points.
+
+Measuring capacity states and idle power:
+
+The capacity states' capacity and power can be estimated by running a benchmark
+workload at each available capacity state. By restricting the benchmark to run
+on subsets of cpus it is possible to extrapolate the power consumption of
+shared resources.
+
+ARM TC2 has two clusters of two and three cpus respectively. Each cluster has a
+shared L2 cache. TC2 has on-chip energy counters per cluster. Running a
+benchmark workload on just one cpu in a cluster means that power is consumed in
+the cluster (higher level group) and a single cpu (lowest level group). Adding
+another benchmark task to another cpu increases the power consumption by the
+amount consumed by the additional cpu. Hence, it is possible to extrapolate the
+cluster busy power.
+
+For platforms that don't have energy counters or equivalent instrumentation
+built-in, it may be possible to use an external DAQ to acquire similar data.
+
+If the benchmark includes some performance score (for example sysbench cpu
+benchmark), this can be used to record the compute capacity.
+
+Measuring idle power requires insight into the idle state implementation on the
+particular platform, specifically whether the platform has coupled idle-states
+(or package states). To measure non-coupled per-cpu idle-states it is necessary to
+keep one cpu busy to keep any shared resources alive to isolate the idle power
+of the cpu from idle/busy power of the shared resources. The cpu can be tricked
+into different per-cpu idle states by disabling the other states. Based on
+various combinations of measurements with specific cpus busy and disabling
+idle-states it is possible to extrapolate the idle-state power.
diff --git a/Documentation/scheduler/sched-tune.txt b/Documentation/scheduler/sched-tune.txt
new file mode 100644
index 000000000000..9bd2231c01b1
--- /dev/null
+++ b/Documentation/scheduler/sched-tune.txt
@@ -0,0 +1,366 @@
+ Central, scheduler-driven, power-performance control
+ (EXPERIMENTAL)
+
+Abstract
+========
+
+The topic of a single simple power-performance tunable, that is wholly
+scheduler centric, and has well defined and predictable properties has come up
+on several occasions in the past [1,2]. With techniques such as a scheduler
+driven DVFS [3], we now have a good framework for implementing such a tunable.
+This document describes the overall ideas behind its design and implementation.
+
+
+Table of Contents
+=================
+
+1. Motivation
+2. Introduction
+3. Signal Boosting Strategy
+4. OPP selection using boosted CPU utilization
+5. Per task group boosting
+6. Question and Answers
+ - What about "auto" mode?
+ - What about boosting on a congested system?
+ - How CPUs are boosted when we have tasks with multiple boost values?
+7. References
+
+
+1. Motivation
+=============
+
+Sched-DVFS [3] is a new event-driven cpufreq governor which allows the
+scheduler to select the optimal DVFS operating point (OPP) for running a task
+allocated to a CPU. The introduction of sched-DVFS enables running workloads at
+the most energy efficient OPPs.
+
+However, sometimes it may be desired to intentionally boost the performance of
+a workload even if that could imply a reasonable increase in energy
+consumption. For example, in order to reduce the response time of a task, we
+may want to run the task at a higher OPP than the one that is actually required
+by its CPU bandwidth demand.
+
+This last requirement is especially important if we consider that one of the
+main goals of the sched-DVFS component is to replace all currently available
+CPUFreq policies. Since sched-DVFS is event based, as opposed to the sampling
+driven governors we currently have, it is already more responsive at selecting
+the optimal OPP to run tasks allocated to a CPU. However, just tracking the
+actual task load demand may not be enough from a performance standpoint. For
+example, it is not possible to get behaviors similar to those provided by the
+"performance" and "interactive" CPUFreq governors.
+
+This document describes an implementation of a tunable, stacked on top of the
+sched-DVFS which extends its functionality to support task performance
+boosting.
+
+By "performance boosting" we mean the reduction of the time required to
+complete a task activation, i.e. the time elapsed from a task wakeup to its
+next deactivation (e.g. because it goes back to sleep or it terminates). For
+example, if we consider a simple periodic task which executes the same workload
+for 5[s] every 20[s] while running at a certain OPP, a boosted execution of
+that task must complete each of its activations in less than 5[s].
+
+A previous attempt [5] to introduce such a boosting feature has not been
+successful mainly because of the complexity of the proposed solution. The
+approach described in this document exposes a single simple interface to
+user-space. This single tunable knob allows the tuning of system wide
+scheduler behaviours ranging from energy efficiency at one end through to
+incremental performance boosting at the other end. This first tunable affects
+all tasks. However, a more advanced extension of the concept is also provided
+which uses CGroups to boost the performance of only selected tasks while using
+the energy efficient default for all others.
+
+The rest of this document introduces in more detail the proposed solution
+which has been named SchedTune.
+
+
+2. Introduction
+===============
+
+SchedTune exposes a simple user-space interface with a single power-performance
+tunable:
+
+ /proc/sys/kernel/sched_cfs_boost
+
+This permits expressing a boost value as an integer in the range [0..100].
+
+A value of 0 (default) configures the CFS scheduler for maximum energy
+efficiency. This means that sched-DVFS runs the tasks at the minimum OPP
+required to satisfy their workload demand.
+A value of 100 configures the scheduler for maximum performance, which translates
+to the selection of the maximum OPP on that CPU.
+
+The range between 0 and 100 can be set to satisfy other scenarios suitably. For
+example to satisfy interactive response or depending on other system events
+(battery level etc).
+
+A CGroup based extension is also provided, which permits further user-space
+defined task classification to tune the scheduler for different goals depending
+on the specific nature of the task, e.g. background vs interactive vs
+low-priority.
+
+The overall design of the SchedTune module is built on top of "Per-Entity Load
+Tracking" (PELT) signals and sched-DVFS by introducing a bias on the Operating
+Performance Point (OPP) selection.
+Each time a task is allocated on a CPU, sched-DVFS has the opportunity to tune
+the operating frequency of that CPU to better match the workload demand. The
+selection of the actual OPP being activated is influenced by the global boost
+value, or the boost value for the task CGroup when in use.
+
+This simple biasing approach leverages existing frameworks, which means minimal
+modifications to the scheduler, and yet it allows achieving a range of
+different behaviours all from a single simple tunable knob.
+The only new concept introduced is that of signal boosting.
+
+
+3. Signal Boosting Strategy
+===========================
+
+The whole PELT machinery works based on the value of a few load tracking signals
+which basically track the CPU bandwidth requirements for tasks and the capacity
+of CPUs. The basic idea behind the SchedTune knob is to artificially inflate
+some of these load tracking signals to make a task or RQ appear more demanding
+than it actually is.
+
+Which signals have to be inflated depends on the specific "consumer". However,
+independently from the specific (signal, consumer) pair, it is important to
+define a simple and possibly consistent strategy for the concept of boosting a
+signal.
+
+A boosting strategy defines how the "abstract" user-space defined
+sched_cfs_boost value is translated into an internal "margin" value to be added
+to a signal to get its inflated value:
+
+ margin := boosting_strategy(sched_cfs_boost, signal)
+ boosted_signal := signal + margin
+
+Different boosting strategies were identified and analyzed before selecting the
+one found to be most effective.
+
+Signal Proportional Compensation (SPC)
+--------------------------------------
+
+In this boosting strategy the sched_cfs_boost value is used to compute a
+margin which is proportional to the complement of the original signal.
+When a signal has a maximum possible value, its complement is defined as
+the delta from the actual value and its possible maximum.
+
+Since the tunable implementation uses signals which have SCHED_LOAD_SCALE as
+the maximum possible value, the margin becomes:
+
+ margin := sched_cfs_boost * (SCHED_LOAD_SCALE - signal)
+
+Using this boosting strategy:
+- a 100% sched_cfs_boost means that the signal is scaled to the maximum value
+- each value in the range of sched_cfs_boost effectively inflates the signal in
+ question by a quantity which is proportional to the maximum value.
+
+For example, by applying the SPC boosting strategy to the selection of the OPP
+to run a task it is possible to achieve these behaviors:
+
+- 0% boosting: run the task at the minimum OPP required by its workload
+- 100% boosting: run the task at the maximum OPP available for the CPU
+- 50% boosting: run at the half-way OPP between minimum and maximum
+
+Which means that, at 50% boosting, a task will be scheduled to run at half of
+the maximum theoretically achievable performance on the specific target
+platform.
+
+A graphical representation of an SPC boosted signal is represented in the
+following figure where:
+ a) "-" represents the original signal
+ b) "b" represents a 50% boosted signal
+ c) "p" represents a 100% boosted signal
+
+
+ ^
+ | SCHED_LOAD_SCALE
+ +-----------------------------------------------------------------+
+ |pppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp
+ |
+ | boosted_signal
+ | bbbbbbbbbbbbbbbbbbbbbbbb
+ |
+ | original signal
+ | bbbbbbbbbbbbbbbbbbbbbbbb+----------------------+
+ | |
+ |bbbbbbbbbbbbbbbbbb |
+ | |
+ | |
+ | |
+ | +-----------------------+
+ | |
+ | |
+ | |
+ |------------------+
+ |
+ |
+ +----------------------------------------------------------------------->
+
+The plot above shows a ramped load signal (titled 'original_signal') and its
+boosted equivalent. For each step of the original signal the boosted signal
+corresponding to a 50% boost is midway from the original signal and the upper
+bound. Boosting by 100% generates a boosted signal which is always saturated to
+the upper bound.
+
+
+4. OPP selection using boosted CPU utilization
+==============================================
+
+It is worth calling out that the implementation does not introduce any new load
+signals. Instead, it provides an API to tune existing signals. This tuning is
+done on demand and only in scheduler code paths where it is sensible to do so.
+The new API calls are defined to return either the default signal or a boosted
+one, depending on the value of sched_cfs_boost. This is a clean and non-invasive
+modification of the existing code paths.
+
+The signal representing a CPU's utilization is boosted according to the
+previously described SPC boosting strategy. To sched-DVFS, this allows a CPU
+(i.e. a CFS run-queue) to appear more used than it actually is.
+
+Thus, with the sched_cfs_boost enabled we have the following main functions to
+get the current utilization of a CPU:
+
+ cpu_util()
+ boosted_cpu_util()
+
+The new boosted_cpu_util() is similar to the first but returns a boosted
+utilization signal which is a function of the sched_cfs_boost value.
+
+This function is used in the CFS scheduler code paths where sched-DVFS needs to
+decide the OPP to run a CPU at.
+For example, this allows selecting the highest OPP for a CPU which has
+the boost value set to 100%.
+
+
+5. Per task group boosting
+==========================
+
+The availability of a single knob which is used to boost all tasks in the
+system is certainly a simple solution but it quite likely doesn't fit many
+utilization scenarios, especially in the mobile device space.
+
+For example, on battery powered devices there usually are many background
+services which are long running and need energy efficient scheduling. On the
+other hand, some applications are more performance sensitive and require an
+interactive response and/or maximum performance, regardless of the energy cost.
+To better service such scenarios, the SchedTune implementation has an extension
+that provides a more fine grained boosting interface.
+
+A new CGroup controller, namely "schedtune", can be enabled, which allows
+defining and configuring task groups with different boosting values.
+Tasks that require special performance can be put into separate CGroups.
+The value of the boost associated with the tasks in this group can be specified
+using a single knob exposed by the CGroup controller:
+
+ schedtune.boost
+
+This knob allows the definition of a boost value that is to be used for
+SPC boosting of all tasks attached to this group.
+
+The current schedtune controller implementation is really simple and has these
+main characteristics:
+
+ 1) It is only possible to create 1 level depth hierarchies
+
+ The root control group defines the system-wide boost value to be applied
+ by default to all tasks. Its direct subgroups are named "boost groups" and
+ they define the boost value for a specific set of tasks.
+ Further nested subgroups are not allowed since they do not have a sensible
+ meaning from a user-space standpoint.
+
+ 2) It is possible to define only a limited number of "boost groups"
+
+ This number is defined at compile time and by default configured to 16.
+ This is a design decision motivated by two main reasons:
+ a) In a real system we do not expect utilization scenarios with more than a few
+ boost groups. For example, a reasonable collection of groups could be
+ just "background", "interactive" and "performance".
+ b) It simplifies the implementation considerably, especially for the code
+ which has to compute the per CPU boosting once there are multiple
+ RUNNABLE tasks with different boost values.
+
+Such a simple design should allow servicing the main utilization scenarios identified
+so far. It provides a simple interface which can be used to manage the
+power-performance of all tasks or only selected tasks.
+Moreover, this interface can be easily integrated by user-space run-times (e.g.
+Android, ChromeOS) to implement a QoS solution for task boosting based on tasks
+classification, which has been a long standing requirement.
+
+Setup and usage
+---------------
+
+0. Use a kernel with CGROUP_SCHEDTUNE support enabled
+
+1. Check that the "schedtune" CGroup controller is available:
+
+ root@linaro-nano:~# cat /proc/cgroups
+ #subsys_name hierarchy num_cgroups enabled
+ cpuset 0 1 1
+ cpu 0 1 1
+ schedtune 0 1 1
+
+2. Mount a tmpfs to create the CGroups mount point (Optional)
+
+ root@linaro-nano:~# sudo mount -t tmpfs cgroups /sys/fs/cgroup
+
+3. Mount the "schedtune" controller
+
+ root@linaro-nano:~# mkdir /sys/fs/cgroup/stune
+ root@linaro-nano:~# sudo mount -t cgroup -o schedtune stune /sys/fs/cgroup/stune
+
+4. Setup the system-wide boost value (Optional)
+
+ If not configured the root control group has a 0% boost value, which
+ basically disables boosting for all tasks in the system thus running in
+ an energy-efficient mode.
+
+ root@linaro-nano:~# echo $SYSBOOST > /sys/fs/cgroup/stune/schedtune.boost
+
+5. Create task groups and configure their specific boost value (Optional)
+
+ For example, here we create a "performance" boost group configured to boost
+ all its tasks to 100%
+
+ root@linaro-nano:~# mkdir /sys/fs/cgroup/stune/performance
+ root@linaro-nano:~# echo 100 > /sys/fs/cgroup/stune/performance/schedtune.boost
+
+6. Move tasks into the boost group
+
+ For example, the following moves the tasks with PID $TASKPID (and all its
+ threads) into the "performance" boost group.
+
+ root@linaro-nano:~# echo $TASKPID > /sys/fs/cgroup/stune/performance/cgroup.procs
+
+This simple configuration allows only the threads of the $TASKPID task to run,
+when needed, at the highest OPP in the most capable CPU of the system.
+
+
+6. Questions and Answers
+========================
+
+What about "auto" mode?
+-----------------------
+
+The 'auto' mode as described in [5] can be implemented by interfacing SchedTune
+with some suitable user-space element. This element could use the exposed
+system-wide or cgroup based interface.
+
+How are multiple groups of tasks with different boost values managed?
+---------------------------------------------------------------------
+
+The current SchedTune implementation keeps track of the boosted RUNNABLE tasks
+on a CPU. Once sched-DVFS selects the OPP to run a CPU at, the CPU utilization
+is boosted with a value which is the maximum of the boost values of the
+currently RUNNABLE tasks in its RQ.
+
+This allows sched-DVFS to boost a CPU only while there are boosted tasks ready
+to run and switch back to the energy efficient mode as soon as the last boosted
+task is dequeued.
+
+
+7. References
+=============
+[1] http://lwn.net/Articles/552889
+[2] http://lkml.org/lkml/2012/5/18/91
+[3] http://lkml.org/lkml/2015/6/26/620
diff --git a/Documentation/sync.txt b/Documentation/sync.txt
new file mode 100644
index 000000000000..a2d05e7fa193
--- /dev/null
+++ b/Documentation/sync.txt
@@ -0,0 +1,75 @@
+Motivation:
+
+In complicated DMA pipelines such as graphics (multimedia, camera, gpu, display)
+a consumer of a buffer needs to know when the producer has finished producing
+it. Likewise the producer needs to know when the consumer is finished with the
+buffer so it can reuse it. A particular buffer may be consumed by multiple
+consumers which will retain the buffer for different amounts of time. In
+addition, a consumer may consume multiple buffers atomically.
+The sync framework adds an API which allows synchronization between the
+producers and consumers in a generic way while also allowing platforms which
+have shared hardware synchronization primitives to exploit them.
+
+Goals:
+ * provide a generic API for expressing synchronization dependencies
+ * allow drivers to exploit hardware synchronization between hardware
+ blocks
+ * provide a userspace API that allows a compositor to manage
+ dependencies.
+ * provide rich telemetry data to allow debugging slowdowns and stalls of
+ the graphics pipeline.
+
+Objects:
+ * sync_timeline
+ * sync_pt
+ * sync_fence
+
+sync_timeline:
+
+A sync_timeline is an abstract monotonically increasing counter. In general,
+each driver/hardware block context will have one of these. They can be backed
+by the appropriate hardware or rely on the generic sw_sync implementation.
+Timelines are only ever created through their specific implementations
+(i.e. sw_sync.)
+
+sync_pt:
+
+A sync_pt is an abstract value which marks a point on a sync_timeline. Sync_pts
+have a single timeline parent. They have 3 states: active, signaled, and error.
+They start in active state and transition, once, to either signaled (when the
+timeline counter advances beyond the sync_pt’s value) or error state.
+
+sync_fence:
+
+Sync_fences are the primary primitives used by drivers to coordinate
+synchronization of their buffers. They are a collection of sync_pts which may
+or may not have the same timeline parent. A sync_pt can only exist in one fence
+and the fence's list of sync_pts is immutable once created. Fences can be
+waited on synchronously or asynchronously. Two fences can also be merged to
+create a third fence containing a copy of the two fences’ sync_pts. Fences are
+backed by file descriptors to allow userspace to coordinate the display pipeline
+dependencies.
+
+Use:
+
+A driver implementing sync support should have a work submission function which:
+ * takes a fence argument specifying when to begin work
+ * asynchronously queues that work to kick off when the fence is signaled
+ * returns a fence to indicate when its work will be done.
+ * signals the returned fence once the work is completed.
+
+Consider an imaginary display driver that has the following API:
+/*
+ * assumes buf is ready to be displayed.
+ * blocks until the buffer is on screen.
+ */
+ void display_buffer(struct dma_buf *buf);
+
+The new API will become:
+/*
+ * will display buf when fence is signaled.
+ * returns immediately with a fence that will signal when buf
+ * is no longer displayed.
+ */
+struct sync_fence* display_buffer(struct dma_buf *buf,
+ struct sync_fence *fence);
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index ffab8b5caa60..52daff6d09fb 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -659,12 +659,14 @@ allowed to execute.
perf_event_paranoid:
Controls use of the performance events system by unprivileged
-users (without CAP_SYS_ADMIN). The default value is 2.
+users (without CAP_SYS_ADMIN). The default value is 3 if
+CONFIG_SECURITY_PERF_EVENTS_RESTRICT is set, or 2 otherwise.
-1: Allow use of (almost) all events by all users
>=0: Disallow raw tracepoint access by users without CAP_IOC_LOCK
>=1: Disallow CPU event access by users without CAP_SYS_ADMIN
>=2: Disallow kernel profiling by users without CAP_SYS_ADMIN
+>=3: Disallow all event access by users without CAP_SYS_ADMIN
==============================================================
diff --git a/Documentation/trace/events-power.txt b/Documentation/trace/events-power.txt
index 21d514ced212..4d817d5acc40 100644
--- a/Documentation/trace/events-power.txt
+++ b/Documentation/trace/events-power.txt
@@ -25,6 +25,7 @@ cpufreq.
cpu_idle "state=%lu cpu_id=%lu"
cpu_frequency "state=%lu cpu_id=%lu"
+cpu_frequency_limits "min=%lu max=%lu cpu_id=%lu"
A suspend event is used to indicate the system going in and out of the
suspend mode:
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index 185c39fea2a0..91723ed53470 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -362,6 +362,26 @@ of ftrace. Here is a list of some of the key files:
to correlate events across hypervisor/guest if
tb_offset is known.
+ mono: This uses the fast monotonic clock (CLOCK_MONOTONIC)
+ which is monotonic and is subject to NTP rate adjustments.
+
+ mono_raw:
+ This is the raw monotonic clock (CLOCK_MONOTONIC_RAW)
+ which is monotonic but is not subject to any rate adjustments
+ and ticks at the same rate as the hardware clocksource.
+
+ boot: This is the boot clock (CLOCK_BOOTTIME) and is based on the
+ fast monotonic clock, but also accounts for time spent in
+ suspend. Since the clock access is designed for use in
+ tracing in the suspend path, some side effects are possible
+ if clock is accessed after the suspend time is accounted before
+ the fast mono clock is updated. In this case, the clock update
+ appears to happen slightly sooner than it normally would have.
+ Also on 32-bit systems, it's possible that the 64-bit boot offset
+ sees a partial update. These effects are rare and post
+ processing should be able to handle them. See comments in the
+ ktime_get_boot_fast_ns() function for more information.
+
To set a clock, simply echo the clock name into this file.
echo global > trace_clock
@@ -2102,6 +2122,35 @@ will produce:
1) 1.449 us | }
+You can disable the hierarchical function call formatting and instead print a
+flat list of function entry and return events. This uses the format described
+in the Output Formatting section and respects all the trace options that
+control that formatting. Hierarchical formatting is the default.
+
+ hierarchical: echo nofuncgraph-flat > trace_options
+ flat: echo funcgraph-flat > trace_options
+
+ ie:
+
+ # tracer: function_graph
+ #
+ # entries-in-buffer/entries-written: 68355/68355 #P:2
+ #
+ # _-----=> irqs-off
+ # / _----=> need-resched
+ # | / _---=> hardirq/softirq
+ # || / _--=> preempt-depth
+ # ||| / delay
+ # TASK-PID CPU# |||| TIMESTAMP FUNCTION
+ # | | | |||| | |
+ sh-1806 [001] d... 198.843443: graph_ent: func=_raw_spin_lock
+ sh-1806 [001] d... 198.843445: graph_ent: func=__raw_spin_lock
+ sh-1806 [001] d..1 198.843447: graph_ret: func=__raw_spin_lock
+ sh-1806 [001] d..1 198.843449: graph_ret: func=_raw_spin_lock
+ sh-1806 [001] d..1 198.843451: graph_ent: func=_raw_spin_unlock_irqrestore
+ sh-1806 [001] d... 198.843453: graph_ret: func=_raw_spin_unlock_irqrestore
+
+
You might find other useful features for this tracer in the
following "dynamic ftrace" section such as tracing only specific
functions or tasks.
diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index 9e46d6e656d9..fa47df6a953a 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -97,4 +97,6 @@
#define SO_CNX_ADVICE 53
+#define SO_COOKIE 57
+
#endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index b5d529fdffab..00be82f3929f 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1836,6 +1836,15 @@ config XEN
help
Say Y if you want to run Linux in a Virtual Machine on Xen on ARM.
+config ARM_FLUSH_CONSOLE_ON_RESTART
+ bool "Force flush the console on restart"
+ help
+ If the console is locked while the system is rebooted, the messages
+ in the temporary logbuffer would not have propagated to all the
+ console drivers. This option forces the console lock to be
+ released if it failed to be acquired, which will cause all the
+ pending messages to be flushed.
+
endmenu
menu "Boot options"
@@ -1864,6 +1873,21 @@ config DEPRECATED_PARAM_STRUCT
This was deprecated in 2001 and announced to live on for 5 years.
Some old boot loaders still use this way.
+config BUILD_ARM_APPENDED_DTB_IMAGE
+ bool "Build a concatenated zImage/dtb by default"
+ depends on OF
+ help
+ Enabling this option will cause a concatenated zImage and list of
+ DTBs to be built by default (instead of a standalone zImage.)
+ The image will be built in arch/arm/boot/zImage-dtb
+
+config BUILD_ARM_APPENDED_DTB_IMAGE_NAMES
+ string "Default dtb names"
+ depends on BUILD_ARM_APPENDED_DTB_IMAGE
+ help
+ Space separated list of names of dtbs to append when
+ building a concatenated zImage-dtb.
+
# Compressed boot loader in ROM. Yes, we really want to ask about
# TEXT and BSS so we preserve their values in the config files.
config ZBOOT_ROM_TEXT
diff --git a/arch/arm/Kconfig.debug b/arch/arm/Kconfig.debug
index d83f7c369e51..17dcd9416db3 100644
--- a/arch/arm/Kconfig.debug
+++ b/arch/arm/Kconfig.debug
@@ -1723,6 +1723,14 @@ config EARLY_PRINTK
kernel low-level debugging functions. Add earlyprintk to your
kernel parameters to enable this console.
+config EARLY_PRINTK_DIRECT
+ bool "Early printk direct"
+ depends on DEBUG_LL
+ help
+ Say Y here if you want to have an early console using the
+ kernel low-level debugging functions and EARLY_PRINTK is
+ not early enough.
+
config ARM_KPROBES_TEST
tristate "Kprobes test module"
depends on KPROBES && MODULES
diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index e14ddca59d02..9ced939c495d 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -298,6 +298,8 @@ libs-y := arch/arm/lib/ $(libs-y)
# Default target when executing plain make
ifeq ($(CONFIG_XIP_KERNEL),y)
KBUILD_IMAGE := xipImage
+else ifeq ($(CONFIG_BUILD_ARM_APPENDED_DTB_IMAGE),y)
+KBUILD_IMAGE := zImage-dtb
else
KBUILD_IMAGE := zImage
endif
@@ -349,6 +351,9 @@ ifeq ($(CONFIG_VDSO),y)
$(Q)$(MAKE) $(build)=arch/arm/vdso $@
endif
+zImage-dtb: vmlinux scripts dtbs
+ $(Q)$(MAKE) $(build)=$(boot) MACHINE=$(MACHINE) $(boot)/$@
+
# We use MRPROPER_FILES and CLEAN_FILES now
archclean:
$(Q)$(MAKE) $(clean)=$(boot)
diff --git a/arch/arm/boot/.gitignore b/arch/arm/boot/.gitignore
index 3c79f85975aa..ad7a0253ea96 100644
--- a/arch/arm/boot/.gitignore
+++ b/arch/arm/boot/.gitignore
@@ -4,3 +4,4 @@ xipImage
bootpImage
uImage
*.dtb
+zImage-dtb \ No newline at end of file
diff --git a/arch/arm/boot/Makefile b/arch/arm/boot/Makefile
index 50f8d1be7fcb..da75630c440d 100644
--- a/arch/arm/boot/Makefile
+++ b/arch/arm/boot/Makefile
@@ -16,6 +16,7 @@ OBJCOPYFLAGS :=-O binary -R .comment -S
ifneq ($(MACHINE),)
include $(MACHINE)/Makefile.boot
endif
+include $(srctree)/arch/arm/boot/dts/Makefile
# Note: the following conditions must always be true:
# ZRELADDR == virt_to_phys(PAGE_OFFSET + TEXT_OFFSET)
@@ -29,6 +30,14 @@ export ZRELADDR INITRD_PHYS PARAMS_PHYS
targets := Image zImage xipImage bootpImage uImage
+DTB_NAMES := $(subst $\",,$(CONFIG_BUILD_ARM_APPENDED_DTB_IMAGE_NAMES))
+ifneq ($(DTB_NAMES),)
+DTB_LIST := $(addsuffix .dtb,$(DTB_NAMES))
+else
+DTB_LIST := $(dtb-y)
+endif
+DTB_OBJS := $(addprefix $(obj)/dts/,$(DTB_LIST))
+
ifeq ($(CONFIG_XIP_KERNEL),y)
$(obj)/xipImage: vmlinux FORCE
@@ -55,6 +64,10 @@ $(obj)/compressed/vmlinux: $(obj)/Image FORCE
$(obj)/zImage: $(obj)/compressed/vmlinux FORCE
$(call if_changed,objcopy)
+$(obj)/zImage-dtb: $(obj)/zImage $(DTB_OBJS) FORCE
+ $(call if_changed,cat)
+ @echo ' Kernel: $@ is ready'
+
endif
ifneq ($(LOADADDR),)
diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S
index 2d7f2bb0d66a..b114faa4679c 100644
--- a/arch/arm/boot/compressed/head.S
+++ b/arch/arm/boot/compressed/head.S
@@ -784,6 +784,8 @@ __armv7_mmu_cache_on:
bic r6, r6, #1 << 31 @ 32-bit translation system
bic r6, r6, #(7 << 0) | (1 << 4) @ use only ttbr0
mcrne p15, 0, r3, c2, c0, 0 @ load page table pointer
+ mcrne p15, 0, r0, c8, c7, 0 @ flush I,D TLBs
+ mcr p15, 0, r0, c7, c5, 4 @ ISB
mcrne p15, 0, r1, c3, c0, 0 @ load domain access control
mcrne p15, 0, r6, c2, c0, 2 @ load ttb control
#endif
diff --git a/arch/arm/boot/dts/Makefile b/arch/arm/boot/dts/Makefile
index 7037201c5e3a..54f95d365bf7 100644
--- a/arch/arm/boot/dts/Makefile
+++ b/arch/arm/boot/dts/Makefile
@@ -960,5 +960,15 @@ endif
dtstree := $(srctree)/$(src)
dtb-$(CONFIG_OF_ALL_DTBS) := $(patsubst $(dtstree)/%.dts,%.dtb, $(wildcard $(dtstree)/*.dts))
-always := $(dtb-y)
+DTB_NAMES := $(subst $\",,$(CONFIG_BUILD_ARM_APPENDED_DTB_IMAGE_NAMES))
+ifneq ($(DTB_NAMES),)
+DTB_LIST := $(addsuffix .dtb,$(DTB_NAMES))
+else
+DTB_LIST := $(dtb-y)
+endif
+
+targets += dtbs dtbs_install
+targets += $(DTB_LIST)
+
+always := $(DTB_LIST)
clean-files := *.dtb
diff --git a/arch/arm/common/Kconfig b/arch/arm/common/Kconfig
index 9353184d730d..ce01364a96e3 100644
--- a/arch/arm/common/Kconfig
+++ b/arch/arm/common/Kconfig
@@ -17,3 +17,7 @@ config SHARP_PARAM
config SHARP_SCOOP
bool
+
+config FIQ_GLUE
+ bool
+ select FIQ
diff --git a/arch/arm/common/Makefile b/arch/arm/common/Makefile
index 27f23b15b1ea..04aca896b338 100644
--- a/arch/arm/common/Makefile
+++ b/arch/arm/common/Makefile
@@ -4,6 +4,7 @@
obj-y += firmware.o
+obj-$(CONFIG_FIQ_GLUE) += fiq_glue.o fiq_glue_setup.o
obj-$(CONFIG_ICST) += icst.o
obj-$(CONFIG_SA1111) += sa1111.o
obj-$(CONFIG_DMABOUNCE) += dmabounce.o
diff --git a/arch/arm/common/fiq_glue.S b/arch/arm/common/fiq_glue.S
new file mode 100644
index 000000000000..24b42cec4813
--- /dev/null
+++ b/arch/arm/common/fiq_glue.S
@@ -0,0 +1,118 @@
+/*
+ * Copyright (C) 2008 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .text
+
+ .global fiq_glue_end
+
+ /* fiq stack: r0-r15,cpsr,spsr of interrupted mode */
+
+/*
+ * fiq_glue - FIQ entry code, bounded by the fiq_glue_end label.
+ *
+ * Relies on the FIQ-banked registers loaded by fiq_glue_setup:
+ * r8 = func, r9 = data, r10 = entry gate (1 when func is set, 0 otherwise),
+ * r11 = optional custom return function (0 for a plain exception return),
+ * sp = FIQ stack pointer passed to fiq_glue_setup.
+ *
+ * Builds a register frame on the FIQ stack (r0-r15, cpsr, spsr of the
+ * interrupted mode), calls func from SVC mode with IRQs and FIQs masked,
+ * then restores the interrupted context.
+ */
+ENTRY(fiq_glue)
+ /* store pc, cpsr from previous mode, reserve space for spsr */
+ mrs r12, spsr
+ sub lr, lr, #4
+ /* only proceed when r10 was exactly 1; otherwise take the nested_fiq path */
+ subs r10, #1
+ bne nested_fiq
+
+ /* frame slot 16 = cpsr; the word skipped above it is slot 17 (spsr) */
+ str r12, [sp, #-8]!
+ /* frame slot 15 = pc of the interrupted context */
+ str lr, [sp, #-4]!
+
+ /* store r8-r14 from previous mode */
+ sub sp, sp, #(7 * 4)
+ /* '^' stores the User-mode bank; sp/lr of other modes are patched below */
+ stmia sp, {r8-r14}^
+ nop
+
+ /* store r0-r7 from previous mode */
+ stmfd sp!, {r0-r7}
+
+ /* setup func(data,regs) arguments */
+ mov r0, r9
+ mov r1, sp
+ /* r3 = func, used by the blx below */
+ mov r3, r8
+
+ /* r7 = base of the saved register frame */
+ mov r7, sp
+
+ /* Get sp and lr from non-user modes */
+ and r4, r12, #MODE_MASK
+ cmp r4, #USR_MODE
+ beq fiq_from_usr_mode
+
+ mov r7, sp
+ /* enter the interrupted mode with IRQs and FIQs masked to read its banked regs */
+ orr r4, r4, #(PSR_I_BIT | PSR_F_BIT)
+ msr cpsr_c, r4
+ str sp, [r7, #(4 * 13)]
+ str lr, [r7, #(4 * 14)]
+ /* r5 = spsr of the interrupted mode, also saved in frame slot 17 */
+ mrs r5, spsr
+ str r5, [r7, #(4 * 17)]
+
+ cmp r4, #(SVC_MODE | PSR_I_BIT | PSR_F_BIT)
+ /* use fiq stack if we reenter this mode */
+ subne sp, r7, #(4 * 3)
+
+fiq_from_usr_mode:
+ msr cpsr_c, #(SVC_MODE | PSR_I_BIT | PSR_F_BIT)
+ /* r2 = SVC-mode sp on entry; third argument to func and restored after the call */
+ mov r2, sp
+ /* run func on the FIQ stack, just below the saved frame */
+ sub sp, r7, #12
+ stmfd sp!, {r2, ip, lr}
+ /* call func(data,regs) */
+ blx r3
+ /* no writeback: sp is reloaded from the saved r2 instead */
+ ldmfd sp, {r2, ip, lr}
+ mov sp, r2
+
+ /* restore/discard saved state */
+ cmp r4, #USR_MODE
+ beq fiq_from_usr_mode_exit
+
+ /* r4 still holds the interrupted mode with I and F set; restore its sp, lr and spsr */
+ msr cpsr_c, r4
+ ldr sp, [r7, #(4 * 13)]
+ ldr lr, [r7, #(4 * 14)]
+ msr spsr_cxsf, r5
+
+fiq_from_usr_mode_exit:
+ msr cpsr_c, #(FIQ_MODE | PSR_I_BIT | PSR_F_BIT)
+
+ ldmfd sp!, {r0-r7}
+ /* sp now points at frame slot 8: +7 words is the saved pc, +8 the saved cpsr */
+ ldr lr, [sp, #(4 * 7)]
+ ldr r12, [sp, #(4 * 8)]
+ /* drop the remaining 10 frame words (slots 8-17) */
+ add sp, sp, #(10 * 4)
+exit_fiq:
+ msr spsr_cxsf, r12
+ /* undo the decrement done on entry */
+ add r10, #1
+ cmp r11, #0
+ /* no custom return function: exception return to the interrupted context */
+ moveqs pc, lr
+ bx r11 /* jump to custom fiq return function */
+
+nested_fiq:
+ /* set the F bit in the saved PSR so the return path resumes with FIQs masked */
+ orr r12, r12, #(PSR_F_BIT)
+ b exit_fiq
+
+fiq_glue_end:
+
+/*
+ * fiq_glue_setup - load the FIQ-banked registers used by fiq_glue.
+ *
+ * r0 = func, r1 = data, r2 = FIQ stack pointer, r3 = fiq return handler.
+ * Temporarily switches to FIQ mode with IRQs and FIQs masked, then returns
+ * in the caller's original mode.
+ */
+ENTRY(fiq_glue_setup) /* func, data, sp, fiq return handler */
+ stmfd sp!, {r4}
+ /* r4 = caller's cpsr, restored before returning */
+ mrs r4, cpsr
+ msr cpsr_c, #(FIQ_MODE | PSR_I_BIT | PSR_F_BIT)
+ /* movs sets the flags on func so r10 can be chosen below */
+ movs r8, r0
+ mov r9, r1
+ mov sp, r2
+ mov r11, r3
+ /* r10 = 0 when func is NULL, 1 otherwise (entry gate tested by fiq_glue) */
+ moveq r10, #0
+ movne r10, #1
+ msr cpsr_c, r4
+ ldmfd sp!, {r4}
+ bx lr
+
diff --git a/arch/arm/common/fiq_glue_setup.c b/arch/arm/common/fiq_glue_setup.c
new file mode 100644
index 000000000000..8cb1b611c6d5
--- /dev/null
+++ b/arch/arm/common/fiq_glue_setup.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (C) 2010 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <asm/fiq.h>
+#include <asm/fiq_glue.h>
+
+/* Start and end symbols of the FIQ entry code defined in fiq_glue.S. */
+extern unsigned char fiq_glue, fiq_glue_end;
+/* Assembly helper from fiq_glue.S that loads the FIQ-banked registers. */
+extern void fiq_glue_setup(void *func, void *data, void *sp,
+ fiq_return_handler_t fiq_return_handler);
+
+/* Descriptor handed to claim_fiq() when a handler is registered. */
+static struct fiq_handler fiq_debbuger_fiq_handler = {
+ .name = "fiq_glue",
+};
+/* Per-CPU base address of the pages allocated for the FIQ stack. */
+DEFINE_PER_CPU(void *, fiq_stack);
+/* Handler installed by fiq_glue_register_handler(); NULL until then. */
+static struct fiq_glue_handler *current_handler;
+/* Optional custom FIQ return function; NULL when none is set. */
+static fiq_return_handler_t fiq_return_handler;
+/* Serialises handler registration and return-handler updates. */
+static DEFINE_MUTEX(fiq_glue_lock);
+
+/*
+ * Program the FIQ-banked registers of the CPU this runs on.
+ *
+ * @info: the struct fiq_glue_handler to install. It is passed to the assembly
+ *        helper both as the source of the callback and as the callback's data
+ *        argument.
+ *
+ * Used as an on_each_cpu() callback, so it reads this CPU's own fiq_stack.
+ */
+static void fiq_glue_setup_helper(void *info)
+{
+	struct fiq_glue_handler *handler = info;
+	/* __get_cpu_var() no longer exists; this is its direct equivalent. */
+	void *stack = *this_cpu_ptr(&fiq_stack);
+
+	fiq_glue_setup(handler->fiq, handler, stack + THREAD_START_SP,
+		       fiq_return_handler);
+}
+
+/**
+ * fiq_glue_register_handler - install @handler as the FIQ handler
+ * @handler: handler descriptor; @handler->fiq must be non-NULL
+ *
+ * Allocates a THREAD_SIZE_ORDER stack for every possible CPU, claims the FIQ,
+ * programs the FIQ-banked registers on each CPU and installs the fiq_glue
+ * entry code. Only one handler can be registered.
+ *
+ * Returns 0 on success, -EINVAL if @handler or its fiq callback is missing,
+ * -EBUSY if a handler is already registered, -ENOMEM if a stack cannot be
+ * allocated, or the error returned by claim_fiq().
+ */
+int fiq_glue_register_handler(struct fiq_glue_handler *handler)
+{
+	int ret;
+	int cpu;
+
+	if (!handler || !handler->fiq)
+		return -EINVAL;
+
+	mutex_lock(&fiq_glue_lock);
+	/*
+	 * fiq_stack is a per-cpu variable and must not be tested directly.
+	 * current_handler is only set once a registration has succeeded, so
+	 * it is the reliable "already registered" indicator.
+	 */
+	if (current_handler) {
+		ret = -EBUSY;
+		goto err_busy;
+	}
+
+	for_each_possible_cpu(cpu) {
+		void *stack;
+
+		stack = (void *)__get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER);
+		if (WARN_ON(!stack)) {
+			ret = -ENOMEM;
+			goto err_alloc_fiq_stack;
+		}
+		per_cpu(fiq_stack, cpu) = stack;
+	}
+
+	ret = claim_fiq(&fiq_debbuger_fiq_handler);
+	if (WARN_ON(ret))
+		goto err_claim_fiq;
+
+	current_handler = handler;
+	on_each_cpu(fiq_glue_setup_helper, handler, true);
+	set_fiq_handler(&fiq_glue, &fiq_glue_end - &fiq_glue);
+
+	mutex_unlock(&fiq_glue_lock);
+	return 0;
+
+err_claim_fiq:
+err_alloc_fiq_stack:
+	for_each_possible_cpu(cpu) {
+		/*
+		 * The stacks were obtained with __get_free_pages(), so they
+		 * are released by address with free_pages(), which ignores
+		 * slots that were never allocated (address 0).
+		 */
+		free_pages((unsigned long)per_cpu(fiq_stack, cpu),
+			   THREAD_SIZE_ORDER);
+		per_cpu(fiq_stack, cpu) = NULL;
+	}
+err_busy:
+	mutex_unlock(&fiq_glue_lock);
+	return ret;
+}
+
+/*
+ * Record @fiq_return as the FIQ return function and, when a handler is
+ * already installed, re-run the per-CPU setup so every CPU picks it up.
+ */
+static void fiq_glue_update_return_handler(void (*fiq_return)(void))
+{
+	fiq_return_handler = fiq_return;
+
+	/* Nothing to reprogram until a handler has been registered. */
+	if (!current_handler)
+		return;
+
+	on_each_cpu(fiq_glue_setup_helper, current_handler, true);
+}
+
+/*
+ * Install @fiq_return as the custom FIQ return function.
+ *
+ * Returns 0 on success, or -EBUSY when a return function is already set.
+ */
+int fiq_glue_set_return_handler(void (*fiq_return)(void))
+{
+	int ret = -EBUSY;
+
+	mutex_lock(&fiq_glue_lock);
+	if (!fiq_return_handler) {
+		fiq_glue_update_return_handler(fiq_return);
+		ret = 0;
+	}
+	mutex_unlock(&fiq_glue_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(fiq_glue_set_return_handler);
+
+/*
+ * Remove the custom FIQ return function.
+ *
+ * @fiq_return must be the function that is currently installed; otherwise a
+ * warning is emitted and -EINVAL is returned. Returns 0 on success.
+ */
+int fiq_glue_clear_return_handler(void (*fiq_return)(void))
+{
+	int ret = 0;
+
+	mutex_lock(&fiq_glue_lock);
+	if (WARN_ON(fiq_return_handler != fiq_return))
+		ret = -EINVAL;
+	else
+		fiq_glue_update_return_handler(NULL);
+	mutex_unlock(&fiq_glue_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(fiq_glue_clear_return_handler);
+
+/**
+ * fiq_glue_resume - Restore fiqs after suspend or low power idle states
+ *
+ * This must be called before calling local_fiq_enable after returning from a
+ * power state where the fiq mode registers were lost. If a driver provided
+ * a resume hook when it registered the handler it will be called.
+ *
+ * Does nothing when no handler has been registered. Only the FIQ registers of
+ * the CPU this runs on are reprogrammed.
+ */
+void fiq_glue_resume(void)
+{
+	void *stack;
+
+	if (!current_handler)
+		return;
+
+	/* __get_cpu_var() no longer exists; this is its direct equivalent. */
+	stack = *this_cpu_ptr(&fiq_stack);
+	fiq_glue_setup(current_handler->fiq, current_handler,
+		       stack + THREAD_START_SP, fiq_return_handler);
+	if (current_handler->resume)
+		current_handler->resume(current_handler);
+}
+
diff --git a/arch/arm/configs/ranchu_defconfig b/arch/arm/configs/ranchu_defconfig
new file mode 100644
index 000000000000..49e7bbd5825a
--- /dev/null
+++ b/arch/arm/configs/ranchu_defconfig
@@ -0,0 +1,316 @@
+# CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_AUDIT=y
+CONFIG_NO_HZ=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
+CONFIG_TASK_XACCT=y
+CONFIG_TASK_IO_ACCOUNTING=y
+CONFIG_IKCONFIG=y
+CONFIG_IKCONFIG_PROC=y
+CONFIG_LOG_BUF_SHIFT=14
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_DEBUG=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CPUSETS=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_RT_GROUP_SCHED=y
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_KALLSYMS_ALL=y
+CONFIG_EMBEDDED=y
+CONFIG_PROFILING=y
+CONFIG_OPROFILE=y
+CONFIG_ARCH_MMAP_RND_BITS=16
+# CONFIG_BLK_DEV_BSG is not set
+# CONFIG_IOSCHED_DEADLINE is not set
+# CONFIG_IOSCHED_CFQ is not set
+CONFIG_ARCH_VIRT=y
+CONFIG_ARM_KERNMEM_PERMS=y
+CONFIG_SMP=y
+CONFIG_PREEMPT=y
+CONFIG_AEABI=y
+CONFIG_HIGHMEM=y
+CONFIG_KSM=y
+CONFIG_SECCOMP=y
+CONFIG_CMDLINE="console=ttyAMA0"
+CONFIG_VFP=y
+CONFIG_NEON=y
+# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
+CONFIG_PM_AUTOSLEEP=y
+CONFIG_PM_WAKELOCKS=y
+CONFIG_PM_WAKELOCKS_LIMIT=0
+# CONFIG_PM_WAKELOCKS_GC is not set
+CONFIG_PM_DEBUG=y
+CONFIG_NET=y
+CONFIG_PACKET=y
+CONFIG_UNIX=y
+CONFIG_XFRM_USER=y
+CONFIG_NET_KEY=y
+CONFIG_INET=y
+CONFIG_INET_DIAG_DESTROY=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_PNP=y
+CONFIG_IP_PNP_DHCP=y
+CONFIG_IP_PNP_BOOTP=y
+CONFIG_INET_ESP=y
+# CONFIG_INET_LRO is not set
+CONFIG_IPV6_ROUTER_PREF=y
+CONFIG_IPV6_ROUTE_INFO=y
+CONFIG_IPV6_OPTIMISTIC_DAD=y
+CONFIG_INET6_AH=y
+CONFIG_INET6_ESP=y
+CONFIG_INET6_IPCOMP=y
+CONFIG_IPV6_MIP6=y
+CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_NETFILTER=y
+CONFIG_NF_CONNTRACK=y
+CONFIG_NF_CONNTRACK_SECMARK=y
+CONFIG_NF_CONNTRACK_EVENTS=y
+CONFIG_NF_CT_PROTO_DCCP=y
+CONFIG_NF_CT_PROTO_SCTP=y
+CONFIG_NF_CT_PROTO_UDPLITE=y
+CONFIG_NF_CONNTRACK_AMANDA=y
+CONFIG_NF_CONNTRACK_FTP=y
+CONFIG_NF_CONNTRACK_H323=y
+CONFIG_NF_CONNTRACK_IRC=y
+CONFIG_NF_CONNTRACK_NETBIOS_NS=y
+CONFIG_NF_CONNTRACK_PPTP=y
+CONFIG_NF_CONNTRACK_SANE=y
+CONFIG_NF_CONNTRACK_TFTP=y
+CONFIG_NF_CT_NETLINK=y
+CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y
+CONFIG_NETFILTER_XT_TARGET_CONNMARK=y
+CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
+CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y
+CONFIG_NETFILTER_XT_TARGET_MARK=y
+CONFIG_NETFILTER_XT_TARGET_NFLOG=y
+CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y
+CONFIG_NETFILTER_XT_TARGET_TPROXY=y
+CONFIG_NETFILTER_XT_TARGET_TRACE=y
+CONFIG_NETFILTER_XT_TARGET_SECMARK=y
+CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
+CONFIG_NETFILTER_XT_MATCH_COMMENT=y
+CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_CONNMARK=y
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
+CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_HELPER=y
+CONFIG_NETFILTER_XT_MATCH_IPRANGE=y
+CONFIG_NETFILTER_XT_MATCH_LENGTH=y
+CONFIG_NETFILTER_XT_MATCH_LIMIT=y
+CONFIG_NETFILTER_XT_MATCH_MAC=y
+CONFIG_NETFILTER_XT_MATCH_MARK=y
+CONFIG_NETFILTER_XT_MATCH_POLICY=y
+CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y
+CONFIG_NETFILTER_XT_MATCH_QTAGUID=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA2=y
+CONFIG_NETFILTER_XT_MATCH_SOCKET=y
+CONFIG_NETFILTER_XT_MATCH_STATE=y
+CONFIG_NETFILTER_XT_MATCH_STATISTIC=y
+CONFIG_NETFILTER_XT_MATCH_STRING=y
+CONFIG_NETFILTER_XT_MATCH_TIME=y
+CONFIG_NETFILTER_XT_MATCH_U32=y
+CONFIG_NF_CONNTRACK_IPV4=y
+CONFIG_IP_NF_IPTABLES=y
+CONFIG_IP_NF_MATCH_AH=y
+CONFIG_IP_NF_MATCH_ECN=y
+CONFIG_IP_NF_MATCH_TTL=y
+CONFIG_IP_NF_FILTER=y
+CONFIG_IP_NF_TARGET_REJECT=y
+CONFIG_IP_NF_MANGLE=y
+CONFIG_IP_NF_RAW=y
+CONFIG_IP_NF_SECURITY=y
+CONFIG_IP_NF_ARPTABLES=y
+CONFIG_IP_NF_ARPFILTER=y
+CONFIG_IP_NF_ARP_MANGLE=y
+CONFIG_NF_CONNTRACK_IPV6=y
+CONFIG_IP6_NF_IPTABLES=y
+CONFIG_IP6_NF_FILTER=y
+CONFIG_IP6_NF_TARGET_REJECT=y
+CONFIG_IP6_NF_MANGLE=y
+CONFIG_IP6_NF_RAW=y
+CONFIG_BRIDGE=y
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_HTB=y
+CONFIG_NET_CLS_U32=y
+CONFIG_NET_EMATCH=y
+CONFIG_NET_EMATCH_U32=y
+CONFIG_NET_CLS_ACT=y
+# CONFIG_WIRELESS is not set
+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
+CONFIG_MTD=y
+CONFIG_MTD_CMDLINE_PARTS=y
+CONFIG_MTD_BLOCK=y
+CONFIG_MTD_CFI=y
+CONFIG_MTD_CFI_INTELEXT=y
+CONFIG_MTD_CFI_AMDSTD=y
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=8192
+CONFIG_VIRTIO_BLK=y
+CONFIG_MD=y
+CONFIG_BLK_DEV_DM=y
+CONFIG_DM_CRYPT=y
+CONFIG_DM_UEVENT=y
+CONFIG_DM_VERITY=y
+CONFIG_DM_VERITY_FEC=y
+CONFIG_NETDEVICES=y
+CONFIG_TUN=y
+CONFIG_VIRTIO_NET=y
+CONFIG_SMSC911X=y
+CONFIG_PPP=y
+CONFIG_PPP_BSDCOMP=y
+CONFIG_PPP_DEFLATE=y
+CONFIG_PPP_MPPE=y
+CONFIG_PPPOLAC=y
+CONFIG_PPPOPNS=y
+CONFIG_USB_USBNET=y
+# CONFIG_WLAN is not set
+CONFIG_INPUT_EVDEV=y
+CONFIG_INPUT_KEYRESET=y
+CONFIG_KEYBOARD_GOLDFISH_EVENTS=y
+# CONFIG_INPUT_MOUSE is not set
+CONFIG_INPUT_JOYSTICK=y
+CONFIG_JOYSTICK_XPAD=y
+CONFIG_JOYSTICK_XPAD_FF=y
+CONFIG_JOYSTICK_XPAD_LEDS=y
+CONFIG_INPUT_TABLET=y
+CONFIG_TABLET_USB_ACECAD=y
+CONFIG_TABLET_USB_AIPTEK=y
+CONFIG_TABLET_USB_GTCO=y
+CONFIG_TABLET_USB_HANWANG=y
+CONFIG_TABLET_USB_KBTAB=y
+CONFIG_INPUT_MISC=y
+CONFIG_INPUT_KEYCHORD=y
+CONFIG_INPUT_UINPUT=y
+CONFIG_INPUT_GPIO=y
+# CONFIG_SERIO_SERPORT is not set
+CONFIG_SERIO_AMBAKMI=y
+# CONFIG_VT is not set
+# CONFIG_LEGACY_PTYS is not set
+# CONFIG_DEVMEM is not set
+# CONFIG_DEVKMEM is not set
+CONFIG_SERIAL_AMBA_PL011=y
+CONFIG_SERIAL_AMBA_PL011_CONSOLE=y
+CONFIG_VIRTIO_CONSOLE=y
+# CONFIG_HW_RANDOM is not set
+# CONFIG_HWMON is not set
+CONFIG_MEDIA_SUPPORT=y
+CONFIG_FB=y
+CONFIG_FB_GOLDFISH=y
+CONFIG_FB_SIMPLE=y
+CONFIG_BACKLIGHT_LCD_SUPPORT=y
+CONFIG_LOGO=y
+# CONFIG_LOGO_LINUX_MONO is not set
+# CONFIG_LOGO_LINUX_VGA16 is not set
+CONFIG_SOUND=y
+CONFIG_SND=y
+CONFIG_HIDRAW=y
+CONFIG_UHID=y
+CONFIG_HID_A4TECH=y
+CONFIG_HID_ACRUX=y
+CONFIG_HID_ACRUX_FF=y
+CONFIG_HID_APPLE=y
+CONFIG_HID_BELKIN=y
+CONFIG_HID_CHERRY=y
+CONFIG_HID_CHICONY=y
+CONFIG_HID_PRODIKEYS=y
+CONFIG_HID_CYPRESS=y
+CONFIG_HID_DRAGONRISE=y
+CONFIG_DRAGONRISE_FF=y
+CONFIG_HID_EMS_FF=y
+CONFIG_HID_ELECOM=y
+CONFIG_HID_EZKEY=y
+CONFIG_HID_HOLTEK=y
+CONFIG_HID_KEYTOUCH=y
+CONFIG_HID_KYE=y
+CONFIG_HID_UCLOGIC=y
+CONFIG_HID_WALTOP=y
+CONFIG_HID_GYRATION=y
+CONFIG_HID_TWINHAN=y
+CONFIG_HID_KENSINGTON=y
+CONFIG_HID_LCPOWER=y
+CONFIG_HID_LOGITECH=y
+CONFIG_HID_LOGITECH_DJ=y
+CONFIG_LOGITECH_FF=y
+CONFIG_LOGIRUMBLEPAD2_FF=y
+CONFIG_LOGIG940_FF=y
+CONFIG_HID_MAGICMOUSE=y
+CONFIG_HID_MICROSOFT=y
+CONFIG_HID_MONTEREY=y
+CONFIG_HID_MULTITOUCH=y
+CONFIG_HID_NTRIG=y
+CONFIG_HID_ORTEK=y
+CONFIG_HID_PANTHERLORD=y
+CONFIG_PANTHERLORD_FF=y
+CONFIG_HID_PETALYNX=y
+CONFIG_HID_PICOLCD=y
+CONFIG_HID_PRIMAX=y
+CONFIG_HID_ROCCAT=y
+CONFIG_HID_SAITEK=y
+CONFIG_HID_SAMSUNG=y
+CONFIG_HID_SONY=y
+CONFIG_HID_SPEEDLINK=y
+CONFIG_HID_SUNPLUS=y
+CONFIG_HID_GREENASIA=y
+CONFIG_GREENASIA_FF=y
+CONFIG_HID_SMARTJOYPLUS=y
+CONFIG_SMARTJOYPLUS_FF=y
+CONFIG_HID_TIVO=y
+CONFIG_HID_TOPSEED=y
+CONFIG_HID_THRUSTMASTER=y
+CONFIG_HID_WACOM=y
+CONFIG_HID_WIIMOTE=y
+CONFIG_HID_ZEROPLUS=y
+CONFIG_HID_ZYDACRON=y
+CONFIG_USB_HIDDEV=y
+CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
+CONFIG_USB_EHCI_HCD=y
+CONFIG_USB_OTG_WAKELOCK=y
+CONFIG_RTC_CLASS=y
+CONFIG_RTC_DRV_PL031=y
+CONFIG_VIRTIO_MMIO=y
+CONFIG_STAGING=y
+CONFIG_ASHMEM=y
+CONFIG_ANDROID_LOW_MEMORY_KILLER=y
+CONFIG_SYNC=y
+CONFIG_SW_SYNC=y
+CONFIG_SW_SYNC_USER=y
+CONFIG_ION=y
+CONFIG_GOLDFISH_AUDIO=y
+CONFIG_GOLDFISH=y
+CONFIG_GOLDFISH_PIPE=y
+CONFIG_ANDROID=y
+CONFIG_ANDROID_BINDER_IPC=y
+CONFIG_EXT4_FS=y
+CONFIG_EXT4_FS_SECURITY=y
+CONFIG_QUOTA=y
+CONFIG_FUSE_FS=y
+CONFIG_CUSE=y
+CONFIG_MSDOS_FS=y
+CONFIG_VFAT_FS=y
+CONFIG_TMPFS=y
+CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_PSTORE=y
+CONFIG_PSTORE_CONSOLE=y
+CONFIG_PSTORE_RAM=y
+CONFIG_NFS_FS=y
+CONFIG_ROOT_NFS=y
+CONFIG_NLS_CODEPAGE_437=y
+CONFIG_NLS_ISO8859_1=y
+CONFIG_DEBUG_INFO=y
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_DETECT_HUNG_TASK=y
+CONFIG_PANIC_TIMEOUT=5
+# CONFIG_SCHED_DEBUG is not set
+CONFIG_SCHEDSTATS=y
+CONFIG_TIMER_STATS=y
+CONFIG_ENABLE_DEFAULT_TRACERS=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_SELINUX=y
+CONFIG_VIRTUALIZATION=y
diff --git a/arch/arm/include/asm/elf.h b/arch/arm/include/asm/elf.h
index f13ae153fb24..d2315ffd8f12 100644
--- a/arch/arm/include/asm/elf.h
+++ b/arch/arm/include/asm/elf.h
@@ -112,8 +112,12 @@ int dump_task_regs(struct task_struct *t, elf_gregset_t *elfregs);
#define CORE_DUMP_USE_REGSET
#define ELF_EXEC_PAGESIZE 4096
-/* This is the base location for PIE (ET_DYN with INTERP) loads. */
-#define ELF_ET_DYN_BASE 0x400000UL
+/* This is the location that an ET_DYN program is loaded if exec'ed. Typical
+ use of this is to invoke "./ld.so someprog" to test out a new version of
+ the loader. We need to make sure that it is out of the way of the program
+ that it will "exec", and that there is sufficient room for the brk. */
+
+#define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2)
/* When the program starts, a1 contains a pointer to a function to be
registered with atexit, as per the SVR4 ABI. A value of 0 means we
diff --git a/arch/arm/include/asm/fiq_glue.h b/arch/arm/include/asm/fiq_glue.h
new file mode 100644
index 000000000000..a9e244f9f197
--- /dev/null
+++ b/arch/arm/include/asm/fiq_glue.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2010 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __ASM_FIQ_GLUE_H
+#define __ASM_FIQ_GLUE_H
+
+/*
+ * Set of callbacks a client hands to the FIQ glue layer.
+ * Both callbacks take a pointer to a struct fiq_glue_handler as their first
+ * argument (presumably the handler that was registered -- TODO confirm
+ * against the glue implementation, which is not visible from this header).
+ */
+struct fiq_glue_handler {
+ void (*fiq)(struct fiq_glue_handler *h, void *regs, void *svc_sp);
+ void (*resume)(struct fiq_glue_handler *h);
+};
+/* Parameterless callback type used by the set/clear return handler calls. */
+typedef void (*fiq_return_handler_t)(void);
+
+/*
+ * Registration entry points; each returns an int.
+ * NOTE(review): the return value convention (0 / -errno?) is not shown in
+ * this header -- confirm before relying on it.
+ */
+int fiq_glue_register_handler(struct fiq_glue_handler *handler);
+int fiq_glue_set_return_handler(fiq_return_handler_t fiq_return);
+int fiq_glue_clear_return_handler(fiq_return_handler_t fiq_return);
+
+/*
+ * When CONFIG_FIQ_GLUE is not set, fiq_glue_resume() is an empty inline
+ * stub, so callers do not need their own #ifdef.
+ */
+#ifdef CONFIG_FIQ_GLUE
+void fiq_glue_resume(void);
+#else
+static inline void fiq_glue_resume(void) {}
+#endif
+
+#endif
diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index 370f7a732900..d06064120694 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -3,6 +3,7 @@
#ifdef CONFIG_ARM_CPU_TOPOLOGY
+#include <linux/cpufreq.h>
#include <linux/cpumask.h>
struct cputopo_arm {
@@ -24,6 +25,12 @@ void init_cpu_topology(void);
void store_cpu_topology(unsigned int cpuid);
const struct cpumask *cpu_coregroup_mask(int cpu);
+#ifdef CONFIG_CPU_FREQ
+#define arch_scale_freq_capacity cpufreq_scale_freq_capacity
+#endif
+#define arch_scale_cpu_capacity scale_cpu_capacity
+extern unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu);
+
#else
static inline void init_cpu_topology(void) { }
diff --git a/arch/arm/kernel/kgdb.c b/arch/arm/kernel/kgdb.c
index 9232caee7060..f3c662299531 100644
--- a/arch/arm/kernel/kgdb.c
+++ b/arch/arm/kernel/kgdb.c
@@ -140,6 +140,8 @@ int kgdb_arch_handle_exception(int exception_vector, int signo,
static int kgdb_brk_fn(struct pt_regs *regs, unsigned int instr)
{
+ if (user_mode(regs))
+ return -1;
kgdb_handle_exception(1, SIGTRAP, 0, regs);
return 0;
@@ -147,6 +149,8 @@ static int kgdb_brk_fn(struct pt_regs *regs, unsigned int instr)
static int kgdb_compiled_brk_fn(struct pt_regs *regs, unsigned int instr)
{
+ if (user_mode(regs))
+ return -1;
compiled_break = 1;
kgdb_handle_exception(1, SIGTRAP, 0, regs);
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index 91d2d5b01414..c6324b534b9d 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -80,6 +80,7 @@ void arch_cpu_idle_prepare(void)
void arch_cpu_idle_enter(void)
{
+ idle_notifier_call_chain(IDLE_START);
ledtrig_cpu(CPU_LED_IDLE_START);
#ifdef CONFIG_PL310_ERRATA_769419
wmb();
@@ -89,6 +90,78 @@ void arch_cpu_idle_enter(void)
void arch_cpu_idle_exit(void)
{
ledtrig_cpu(CPU_LED_IDLE_END);
+ idle_notifier_call_chain(IDLE_END);
+}
+
+/*
+ * Dump a block of kernel memory from around the given address.
+ *
+ * @addr:   first address to dump
+ * @nbytes: number of bytes to dump starting at @addr
+ * @name:   label printed in the header line of the dump
+ *
+ * Nothing is printed for addresses below PAGE_OFFSET or within the last
+ * 256 bytes of the address space.  Words that cannot be read are shown
+ * as "********".
+ */
+static void show_data(unsigned long addr, int nbytes, const char *name)
+{
+	int	i, j;
+	int	nlines;
+	u32	*p;
+
+	/*
+	 * don't attempt to dump non-kernel addresses or
+	 * values that are probably just small negative numbers
+	 */
+	if (addr < PAGE_OFFSET || addr > -256UL)
+		return;
+
+	printk("\n%s: %#lx:\n", name, addr);
+
+	/*
+	 * round address down to a 32 bit boundary
+	 * and always dump a multiple of 32 bytes
+	 */
+	p = (u32 *)(addr & ~(sizeof(u32) - 1));
+	nbytes += (addr & (sizeof(u32) - 1));
+	nlines = (nbytes + 31) / 32;
+
+	for (i = 0; i < nlines; i++) {
+		/*
+		 * just display low 16 bits of address to keep
+		 * each line of the dump < 80 characters
+		 */
+		printk("%04lx ", (unsigned long)p & 0xffff);
+		for (j = 0; j < 8; j++) {
+			u32	data;
+
+			/*
+			 * The data words and the closing newline continue
+			 * the line started above, so emit them with
+			 * pr_cont(); a bare printk() may be logged as a
+			 * separate record and break the dump layout.
+			 */
+			if (probe_kernel_address(p, data)) {
+				pr_cont(" ********");
+			} else {
+				pr_cont(" %08x", data);
+			}
+			++p;
+		}
+		pr_cont("\n");
+	}
+}
+
+/*
+ * Dump the memory surrounding the addresses held in PC, LR, SP, IP, FP and
+ * R0-R10 of @regs.  For every register the dumped window starts @nbytes
+ * before the register value and is 2 * @nbytes long; show_data() itself
+ * decides whether a given value is worth dumping.
+ *
+ * The address limit is switched to KERNEL_DS for the duration of the dump
+ * and restored to its previous value afterwards (presumably so that the
+ * probing done by show_data() is allowed to touch kernel addresses).
+ */
+static void show_extra_register_data(struct pt_regs *regs, int nbytes)
+{
+ mm_segment_t fs;
+
+ fs = get_fs();
+ set_fs(KERNEL_DS);
+ show_data(regs->ARM_pc - nbytes, nbytes * 2, "PC");
+ show_data(regs->ARM_lr - nbytes, nbytes * 2, "LR");
+ show_data(regs->ARM_sp - nbytes, nbytes * 2, "SP");
+ show_data(regs->ARM_ip - nbytes, nbytes * 2, "IP");
+ show_data(regs->ARM_fp - nbytes, nbytes * 2, "FP");
+ show_data(regs->ARM_r0 - nbytes, nbytes * 2, "R0");
+ show_data(regs->ARM_r1 - nbytes, nbytes * 2, "R1");
+ show_data(regs->ARM_r2 - nbytes, nbytes * 2, "R2");
+ show_data(regs->ARM_r3 - nbytes, nbytes * 2, "R3");
+ show_data(regs->ARM_r4 - nbytes, nbytes * 2, "R4");
+ show_data(regs->ARM_r5 - nbytes, nbytes * 2, "R5");
+ show_data(regs->ARM_r6 - nbytes, nbytes * 2, "R6");
+ show_data(regs->ARM_r7 - nbytes, nbytes * 2, "R7");
+ show_data(regs->ARM_r8 - nbytes, nbytes * 2, "R8");
+ show_data(regs->ARM_r9 - nbytes, nbytes * 2, "R9");
+ show_data(regs->ARM_r10 - nbytes, nbytes * 2, "R10");
+ set_fs(fs);
}
void __show_regs(struct pt_regs *regs)
@@ -182,6 +255,8 @@ void __show_regs(struct pt_regs *regs)
printk("Control: %08x%s\n", ctrl, buf);
}
#endif
+
+ show_extra_register_data(regs, 128);
}
void show_regs(struct pt_regs * regs)
diff --git a/arch/arm/kernel/reboot.c b/arch/arm/kernel/reboot.c
index 3fa867a2aae6..d704df89a546 100644
--- a/arch/arm/kernel/reboot.c
+++ b/arch/arm/kernel/reboot.c
@@ -6,6 +6,7 @@
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
+#include <linux/console.h>
#include <linux/cpu.h>
#include <linux/delay.h>
#include <linux/reboot.h>
@@ -122,6 +123,31 @@ void machine_power_off(void)
pm_power_off();
}
+#ifdef CONFIG_ARM_FLUSH_CONSOLE_ON_RESTART
+/*
+ * Called on the restart path to get pending console output released.
+ *
+ * Prints a "Restarting" banner, then tries to take the console lock.  If
+ * that works the lock is dropped again straight away and we are done.
+ * Otherwise we wait 50ms, disable local interrupts and try once more; the
+ * lock is then released unconditionally, even when it could not be taken
+ * ("busting" it), after logging which of the two cases occurred.
+ *
+ * NOTE(review): relies on console_unlock() pushing buffered messages out to
+ * the console drivers -- confirm against kernel/printk.
+ */
+void arm_machine_flush_console(void)
+{
+ printk("\n");
+ pr_emerg("Restarting %s\n", linux_banner);
+ /* Fast path: nobody holds the console, a lock/unlock cycle suffices. */
+ if (console_trylock()) {
+ console_unlock();
+ return;
+ }
+
+ /* Give the current holder a chance to finish before forcing the issue. */
+ mdelay(50);
+
+ local_irq_disable();
+ if (!console_trylock())
+ pr_emerg("arm_restart: Console was locked! Busting\n");
+ else
+ pr_emerg("arm_restart: Console was locked!\n");
+ /* Released in both cases, including when the trylock above failed. */
+ console_unlock();
+}
+#else
+/* Feature disabled: nothing to do before restart. */
+void arm_machine_flush_console(void)
+{
+}
+#endif
+
/*
* Restart requires that the secondary CPUs stop performing any activity
* while the primary CPU resets the system. Systems with a single CPU can
@@ -138,6 +164,10 @@ void machine_restart(char *cmd)
local_irq_disable();
smp_send_stop();
+ /* Flush the console to make sure all the relevant messages make it
+ * out to the console drivers */
+ arm_machine_flush_console();
+
if (arm_pm_restart)
arm_pm_restart(reboot_mode, cmd);
else
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index ec279d161b32..df3020071f32 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -42,9 +42,15 @@
*/
static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;
-unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
+unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu)
{
+#ifdef CONFIG_CPU_FREQ
+ unsigned long max_freq_scale = cpufreq_scale_max_freq_capacity(cpu);
+
+ return per_cpu(cpu_scale, cpu) * max_freq_scale >> SCHED_CAPACITY_SHIFT;
+#else
return per_cpu(cpu_scale, cpu);
+#endif
}
static void set_capacity_scale(unsigned int cpu, unsigned long capacity)
@@ -153,6 +159,8 @@ static void __init parse_dt_topology(void)
}
+static const struct sched_group_energy * const cpu_core_energy(int cpu);
+
/*
* Look for a customed capacity of a CPU in the cpu_capacity table during the
* boot. The update of all CPUs is in O(n^2) for heteregeneous system but the
@@ -160,10 +168,14 @@ static void __init parse_dt_topology(void)
*/
static void update_cpu_capacity(unsigned int cpu)
{
- if (!cpu_capacity(cpu))
- return;
+ unsigned long capacity = SCHED_CAPACITY_SCALE;
+
+ if (cpu_core_energy(cpu)) {
+ int max_cap_idx = cpu_core_energy(cpu)->nr_cap_states - 1;
+ capacity = cpu_core_energy(cpu)->cap_states[max_cap_idx].cap;
+ }
- set_capacity_scale(cpu, cpu_capacity(cpu) / middle_capacity);
+ set_capacity_scale(cpu, capacity);
pr_info("CPU%u: update cpu_capacity %lu\n",
cpu, arch_scale_cpu_capacity(NULL, cpu));
@@ -275,17 +287,138 @@ void store_cpu_topology(unsigned int cpuid)
cpu_topology[cpuid].socket_id, mpidr);
}
+/*
+ * ARM TC2 specific energy cost model data. There are no unit requirements for
+ * the data. Data can be normalized to any reference point, but the
+ * normalization must be consistent. That is, one bogo-joule/watt must be the
+ * same quantity for all data, but we don't care what it is.
+ */
+static struct idle_state idle_states_cluster_a7[] = {
+ { .power = 25 }, /* arch_cpu_idle() (active idle) = WFI */
+ { .power = 25 }, /* WFI */
+ { .power = 10 }, /* cluster-sleep-l */
+ };
+
+static struct idle_state idle_states_cluster_a15[] = {
+ { .power = 70 }, /* arch_cpu_idle() (active idle) = WFI */
+ { .power = 70 }, /* WFI */
+ { .power = 25 }, /* cluster-sleep-b */
+ };
+
+static struct capacity_state cap_states_cluster_a7[] = {
+ /* Cluster only power */
+ { .cap = 150, .power = 2967, }, /* 350 MHz */
+ { .cap = 172, .power = 2792, }, /* 400 MHz */
+ { .cap = 215, .power = 2810, }, /* 500 MHz */
+ { .cap = 258, .power = 2815, }, /* 600 MHz */
+ { .cap = 301, .power = 2919, }, /* 700 MHz */
+ { .cap = 344, .power = 2847, }, /* 800 MHz */
+ { .cap = 387, .power = 3917, }, /* 900 MHz */
+ { .cap = 430, .power = 4905, }, /* 1000 MHz */
+ };
+
+static struct capacity_state cap_states_cluster_a15[] = {
+ /* Cluster only power */
+ { .cap = 426, .power = 7920, }, /* 500 MHz */
+ { .cap = 512, .power = 8165, }, /* 600 MHz */
+ { .cap = 597, .power = 8172, }, /* 700 MHz */
+ { .cap = 682, .power = 8195, }, /* 800 MHz */
+ { .cap = 768, .power = 8265, }, /* 900 MHz */
+ { .cap = 853, .power = 8446, }, /* 1000 MHz */
+ { .cap = 938, .power = 11426, }, /* 1100 MHz */
+ { .cap = 1024, .power = 15200, }, /* 1200 MHz */
+ };
+
+static struct sched_group_energy energy_cluster_a7 = {
+ .nr_idle_states = ARRAY_SIZE(idle_states_cluster_a7),
+ .idle_states = idle_states_cluster_a7,
+ .nr_cap_states = ARRAY_SIZE(cap_states_cluster_a7),
+ .cap_states = cap_states_cluster_a7,
+};
+
+static struct sched_group_energy energy_cluster_a15 = {
+ .nr_idle_states = ARRAY_SIZE(idle_states_cluster_a15),
+ .idle_states = idle_states_cluster_a15,
+ .nr_cap_states = ARRAY_SIZE(cap_states_cluster_a15),
+ .cap_states = cap_states_cluster_a15,
+};
+
+static struct idle_state idle_states_core_a7[] = {
+ { .power = 0 }, /* arch_cpu_idle (active idle) = WFI */
+ { .power = 0 }, /* WFI */
+ { .power = 0 }, /* cluster-sleep-l */
+ };
+
+static struct idle_state idle_states_core_a15[] = {
+ { .power = 0 }, /* arch_cpu_idle (active idle) = WFI */
+ { .power = 0 }, /* WFI */
+ { .power = 0 }, /* cluster-sleep-b */
+ };
+
+static struct capacity_state cap_states_core_a7[] = {
+ /* Power per cpu */
+ { .cap = 150, .power = 187, }, /* 350 MHz */
+ { .cap = 172, .power = 275, }, /* 400 MHz */
+ { .cap = 215, .power = 334, }, /* 500 MHz */
+ { .cap = 258, .power = 407, }, /* 600 MHz */
+ { .cap = 301, .power = 447, }, /* 700 MHz */
+ { .cap = 344, .power = 549, }, /* 800 MHz */
+ { .cap = 387, .power = 761, }, /* 900 MHz */
+ { .cap = 430, .power = 1024, }, /* 1000 MHz */
+ };
+
+static struct capacity_state cap_states_core_a15[] = {
+ /* Power per cpu */
+ { .cap = 426, .power = 2021, }, /* 500 MHz */
+ { .cap = 512, .power = 2312, }, /* 600 MHz */
+ { .cap = 597, .power = 2756, }, /* 700 MHz */
+ { .cap = 682, .power = 3125, }, /* 800 MHz */
+ { .cap = 768, .power = 3524, }, /* 900 MHz */
+ { .cap = 853, .power = 3846, }, /* 1000 MHz */
+ { .cap = 938, .power = 5177, }, /* 1100 MHz */
+ { .cap = 1024, .power = 6997, }, /* 1200 MHz */
+ };
+
+static struct sched_group_energy energy_core_a7 = {
+ .nr_idle_states = ARRAY_SIZE(idle_states_core_a7),
+ .idle_states = idle_states_core_a7,
+ .nr_cap_states = ARRAY_SIZE(cap_states_core_a7),
+ .cap_states = cap_states_core_a7,
+};
+
+static struct sched_group_energy energy_core_a15 = {
+ .nr_idle_states = ARRAY_SIZE(idle_states_core_a15),
+ .idle_states = idle_states_core_a15,
+ .nr_cap_states = ARRAY_SIZE(cap_states_core_a15),
+ .cap_states = cap_states_core_a15,
+};
+
+/* sd energy functions */
+
+/*
+ * Return the cluster-level energy table for @cpu: a non-zero socket_id in
+ * cpu_topology[] selects the A7 table, socket_id 0 selects the A15 table.
+ */
+static inline
+const struct sched_group_energy * const cpu_cluster_energy(int cpu)
+{
+ return cpu_topology[cpu].socket_id ? &energy_cluster_a7 :
+ &energy_cluster_a15;
+}
+
+/*
+ * Return the per-core energy table for @cpu, using the same socket_id based
+ * selection as cpu_cluster_energy(): non-zero -> A7, zero -> A15.
+ * The result is always the address of a static table.
+ */
+static inline
+const struct sched_group_energy * const cpu_core_energy(int cpu)
+{
+ return cpu_topology[cpu].socket_id ? &energy_core_a7 :
+ &energy_core_a15;
+}
+
static inline int cpu_corepower_flags(void)
{
- return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN;
+ return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN | \
+ SD_SHARE_CAP_STATES;
}
static struct sched_domain_topology_level arm_topology[] = {
#ifdef CONFIG_SCHED_MC
- { cpu_corepower_mask, cpu_corepower_flags, SD_INIT_NAME(GMC) },
- { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+ { cpu_coregroup_mask, cpu_corepower_flags, cpu_core_energy, SD_INIT_NAME(MC) },
#endif
- { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+ { cpu_cpu_mask, NULL, cpu_cluster_energy, SD_INIT_NAME(DIE) },
{ NULL, },
};
diff --git a/arch/arm/mm/cache-v6.S b/arch/arm/mm/cache-v6.S
index 24659952c278..11da0f50a1fe 100644
--- a/arch/arm/mm/cache-v6.S
+++ b/arch/arm/mm/cache-v6.S
@@ -270,6 +270,11 @@ v6_dma_clean_range:
* - end - virtual end address of region
*/
ENTRY(v6_dma_flush_range)
+#ifdef CONFIG_CACHE_FLUSH_RANGE_LIMIT
+ sub r2, r1, r0
+ cmp r2, #CONFIG_CACHE_FLUSH_RANGE_LIMIT
+ bhi v6_dma_flush_dcache_all
+#endif
#ifdef CONFIG_DMA_CACHE_RWFO
ldrb r2, [r0] @ read for ownership
strb r2, [r0] @ write for ownership
@@ -292,6 +297,18 @@ ENTRY(v6_dma_flush_range)
mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
ret lr
+#ifdef CONFIG_CACHE_FLUSH_RANGE_LIMIT
+/*
+ *	v6_dma_flush_dcache_all
+ *
+ *	Branch target of v6_dma_flush_range for ranges larger than
+ *	CONFIG_CACHE_FLUSH_RANGE_LIMIT: clean+invalidate the cache with a
+ *	single operation instead of walking the range, then drain the
+ *	write buffer.  Clobbers r0.
+ */
+v6_dma_flush_dcache_all:
+	mov	r0, #0
+#ifdef HARVARD_CACHE
+	mcr	p15, 0, r0, c7, c14, 0		@ D cache clean+invalidate
+#else
+	mcr	p15, 0, r0, c7, c15, 0		@ Cache clean+invalidate
+#endif
+	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
+	ret	lr				@ same return form as the rest of this file
+#endif
+
/*
* dma_map_area(start, size, dir)
* - start - kernel virtual start address
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index 5ca207ada852..d6c9dee1b252 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -276,10 +276,10 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
local_irq_enable();
/*
- * If we're in an interrupt or have no user
+ * If we're in an interrupt, or have irqs disabled, or have no user
* context, we must not take the fault..
*/
- if (faulthandler_disabled() || !mm)
+ if (faulthandler_disabled() || irqs_disabled() || !mm)
goto no_context;
if (user_mode(regs))
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 3e43874568f9..96be0221ab3e 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -852,6 +852,14 @@ config SETEND_EMULATION
If unsure, say Y
endif
+config ARM64_SW_TTBR0_PAN
+ bool "Emulate Privileged Access Never using TTBR0_EL1 switching"
+ help
+ Enabling this option prevents the kernel from accessing
+ user-space memory directly by pointing TTBR0_EL1 to a reserved
+ zeroed area and reserved ASID. The user access routines
+ restore the valid TTBR0_EL1 temporarily.
+
menu "ARMv8.1 architectural features"
config ARM64_HW_AFDBM
@@ -1011,6 +1019,23 @@ config CMDLINE
entering them here. As a minimum, you should specify the the
root device (e.g. root=/dev/nfs).
+choice
+ prompt "Kernel command line type" if CMDLINE != ""
+ default CMDLINE_FROM_BOOTLOADER
+
+config CMDLINE_FROM_BOOTLOADER
+ bool "Use bootloader kernel arguments if available"
+ help
+ Uses the command-line options passed by the boot loader. If
+ the boot loader doesn't provide any, the default kernel command
+ string provided in CMDLINE will be used.
+
+config CMDLINE_EXTEND
+ bool "Extend bootloader kernel arguments"
+ help
+ The command-line arguments provided by the boot loader will be
+ appended to the default kernel command string.
+
config CMDLINE_FORCE
bool "Always use the default kernel command string"
help
@@ -1018,6 +1043,7 @@ config CMDLINE_FORCE
loader passes other arguments to the kernel.
This is useful if you cannot or don't want to change the
command-line options your boot loader passes to the kernel.
+endchoice
config EFI_STUB
bool
@@ -1050,6 +1076,41 @@ config DMI
However, even with this option, the resultant kernel should
continue to boot on existing non-UEFI platforms.
+config BUILD_ARM64_APPENDED_DTB_IMAGE
+ bool "Build a concatenated Image.gz/dtb by default"
+ depends on OF
+ help
+ Enabling this option will cause a concatenated Image.gz and list of
+ DTBs to be built by default (instead of a standalone Image.gz.)
+ The image will be built in arch/arm64/boot/Image.gz-dtb
+
+choice
+ prompt "Appended DTB Kernel Image name"
+ depends on BUILD_ARM64_APPENDED_DTB_IMAGE
+ help
+ Enabling this option will cause a specific kernel image Image or
+ Image.gz to be used for final image creation.
+ The image will be built in arch/arm64/boot/IMAGE-NAME-dtb
+
+ config IMG_GZ_DTB
+ bool "Image.gz-dtb"
+ config IMG_DTB
+ bool "Image-dtb"
+endchoice
+
+config BUILD_ARM64_APPENDED_KERNEL_IMAGE_NAME
+ string
+ depends on BUILD_ARM64_APPENDED_DTB_IMAGE
+ default "Image.gz-dtb" if IMG_GZ_DTB
+ default "Image-dtb" if IMG_DTB
+
+config BUILD_ARM64_APPENDED_DTB_IMAGE_NAMES
+ string "Default dtb names"
+ depends on BUILD_ARM64_APPENDED_DTB_IMAGE
+ help
+ Space separated list of names of dtbs to append when
+ building a concatenated Image.gz-dtb.
+
endmenu
menu "Userspace binary formats"
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index ee94597773fa..180e9be33a17 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -44,6 +44,7 @@ endif
KBUILD_CFLAGS += -mgeneral-regs-only $(lseinstr)
KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
KBUILD_CFLAGS += $(call cc-option, -mpc-relative-literal-loads)
+KBUILD_CFLAGS += -fno-pic
KBUILD_AFLAGS += $(lseinstr)
ifeq ($(CONFIG_CPU_BIG_ENDIAN), y)
@@ -98,7 +99,12 @@ libs-y := arch/arm64/lib/ $(libs-y)
core-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a
# Default target when executing plain make
+ifeq ($(CONFIG_BUILD_ARM64_APPENDED_DTB_IMAGE),y)
+KBUILD_IMAGE := $(subst $\",,$(CONFIG_BUILD_ARM64_APPENDED_KERNEL_IMAGE_NAME))
+else
KBUILD_IMAGE := Image.gz
+endif
+
KBUILD_DTBS := dtbs
all: $(KBUILD_IMAGE) $(KBUILD_DTBS)
@@ -125,6 +131,9 @@ dtbs: prepare scripts
dtbs_install:
$(Q)$(MAKE) $(dtbinst)=$(boot)/dts
+Image-dtb Image.gz-dtb: vmlinux scripts dtbs
+ $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@
+
PHONY += vdso_install
vdso_install:
$(Q)$(MAKE) $(build)=arch/arm64/kernel/vdso $@
diff --git a/arch/arm64/boot/.gitignore b/arch/arm64/boot/.gitignore
index 8dab0bb6ae66..34e35209fc2e 100644
--- a/arch/arm64/boot/.gitignore
+++ b/arch/arm64/boot/.gitignore
@@ -1,2 +1,4 @@
Image
+Image-dtb
Image.gz
+Image.gz-dtb
diff --git a/arch/arm64/boot/Makefile b/arch/arm64/boot/Makefile
index 1f012c506434..2c8cb864315e 100644
--- a/arch/arm64/boot/Makefile
+++ b/arch/arm64/boot/Makefile
@@ -14,16 +14,29 @@
# Based on the ia64 boot/Makefile.
#
+include $(srctree)/arch/arm64/boot/dts/Makefile
+
OBJCOPYFLAGS_Image :=-O binary -R .note -R .note.gnu.build-id -R .comment -S
targets := Image Image.gz
+DTB_NAMES := $(subst $\",,$(CONFIG_BUILD_ARM64_APPENDED_DTB_IMAGE_NAMES))
+ifneq ($(DTB_NAMES),)
+DTB_LIST := $(addsuffix .dtb,$(DTB_NAMES))
+else
+DTB_LIST := $(dtb-y)
+endif
+DTB_OBJS := $(addprefix $(obj)/dts/,$(DTB_LIST))
+
$(obj)/Image: vmlinux FORCE
$(call if_changed,objcopy)
$(obj)/Image.bz2: $(obj)/Image FORCE
$(call if_changed,bzip2)
+$(obj)/Image-dtb: $(obj)/Image $(DTB_OBJS) FORCE
+ $(call if_changed,cat)
+
$(obj)/Image.gz: $(obj)/Image FORCE
$(call if_changed,gzip)
@@ -36,6 +49,9 @@ $(obj)/Image.lzma: $(obj)/Image FORCE
$(obj)/Image.lzo: $(obj)/Image FORCE
$(call if_changed,lzo)
+$(obj)/Image.gz-dtb: $(obj)/Image.gz $(DTB_OBJS) FORCE
+ $(call if_changed,cat)
+
install:
$(CONFIG_SHELL) $(srctree)/$(src)/install.sh $(KERNELRELEASE) \
$(obj)/Image System.map "$(INSTALL_PATH)"
diff --git a/arch/arm64/boot/dts/Makefile b/arch/arm64/boot/dts/Makefile
index 6684f97c2722..7ad2cf0a607b 100644
--- a/arch/arm64/boot/dts/Makefile
+++ b/arch/arm64/boot/dts/Makefile
@@ -28,3 +28,17 @@ dtstree := $(srctree)/$(src)
dtb-$(CONFIG_OF_ALL_DTBS) := $(patsubst $(dtstree)/%.dts,%.dtb, $(foreach d,$(dts-dirs), $(wildcard $(dtstree)/$(d)/*.dts)))
always := $(dtb-y)
+
+targets += dtbs
+
+DTB_NAMES := $(subst $\",,$(CONFIG_BUILD_ARM64_APPENDED_DTB_IMAGE_NAMES))
+ifneq ($(DTB_NAMES),)
+DTB_LIST := $(addsuffix .dtb,$(DTB_NAMES))
+else
+DTB_LIST := $(dtb-y)
+endif
+targets += $(DTB_LIST)
+
+dtbs: $(addprefix $(obj)/, $(DTB_LIST))
+
+clean-files := dts/*.dtb *.dtb
diff --git a/arch/arm64/configs/ranchu64_defconfig b/arch/arm64/configs/ranchu64_defconfig
new file mode 100644
index 000000000000..fc55008d8c4c
--- /dev/null
+++ b/arch/arm64/configs/ranchu64_defconfig
@@ -0,0 +1,312 @@
+# CONFIG_LOCALVERSION_AUTO is not set
+# CONFIG_SWAP is not set
+CONFIG_POSIX_MQUEUE=y
+CONFIG_AUDIT=y
+CONFIG_NO_HZ=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_BSD_PROCESS_ACCT=y
+CONFIG_BSD_PROCESS_ACCT_V3=y
+CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
+CONFIG_TASK_XACCT=y
+CONFIG_TASK_IO_ACCOUNTING=y
+CONFIG_IKCONFIG=y
+CONFIG_IKCONFIG_PROC=y
+CONFIG_LOG_BUF_SHIFT=14
+CONFIG_CGROUP_DEBUG=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_RT_GROUP_SCHED=y
+CONFIG_SCHED_AUTOGROUP=y
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_KALLSYMS_ALL=y
+CONFIG_EMBEDDED=y
+# CONFIG_COMPAT_BRK is not set
+CONFIG_PROFILING=y
+CONFIG_ARCH_MMAP_RND_BITS=24
+CONFIG_ARCH_MMAP_RND_COMPAT_BITS=16
+# CONFIG_BLK_DEV_BSG is not set
+# CONFIG_IOSCHED_DEADLINE is not set
+CONFIG_ARCH_VEXPRESS=y
+CONFIG_NR_CPUS=4
+CONFIG_PREEMPT=y
+CONFIG_KSM=y
+CONFIG_SECCOMP=y
+CONFIG_ARMV8_DEPRECATED=y
+CONFIG_SWP_EMULATION=y
+CONFIG_CP15_BARRIER_EMULATION=y
+CONFIG_SETEND_EMULATION=y
+CONFIG_CMDLINE="console=ttyAMA0"
+# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
+CONFIG_COMPAT=y
+CONFIG_PM_AUTOSLEEP=y
+CONFIG_PM_WAKELOCKS=y
+CONFIG_PM_WAKELOCKS_LIMIT=0
+# CONFIG_PM_WAKELOCKS_GC is not set
+CONFIG_PM_DEBUG=y
+CONFIG_NET=y
+CONFIG_PACKET=y
+CONFIG_UNIX=y
+CONFIG_XFRM_USER=y
+CONFIG_NET_KEY=y
+CONFIG_INET=y
+CONFIG_INET_DIAG_DESTROY=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_PNP=y
+CONFIG_IP_PNP_DHCP=y
+CONFIG_IP_PNP_BOOTP=y
+CONFIG_INET_ESP=y
+# CONFIG_INET_LRO is not set
+CONFIG_IPV6_ROUTER_PREF=y
+CONFIG_IPV6_ROUTE_INFO=y
+CONFIG_IPV6_OPTIMISTIC_DAD=y
+CONFIG_INET6_AH=y
+CONFIG_INET6_ESP=y
+CONFIG_INET6_IPCOMP=y
+CONFIG_IPV6_MIP6=y
+CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_NETFILTER=y
+CONFIG_NF_CONNTRACK=y
+CONFIG_NF_CONNTRACK_SECMARK=y
+CONFIG_NF_CONNTRACK_EVENTS=y
+CONFIG_NF_CT_PROTO_DCCP=y
+CONFIG_NF_CT_PROTO_SCTP=y
+CONFIG_NF_CT_PROTO_UDPLITE=y
+CONFIG_NF_CONNTRACK_AMANDA=y
+CONFIG_NF_CONNTRACK_FTP=y
+CONFIG_NF_CONNTRACK_H323=y
+CONFIG_NF_CONNTRACK_IRC=y
+CONFIG_NF_CONNTRACK_NETBIOS_NS=y
+CONFIG_NF_CONNTRACK_PPTP=y
+CONFIG_NF_CONNTRACK_SANE=y
+CONFIG_NF_CONNTRACK_TFTP=y
+CONFIG_NF_CT_NETLINK=y
+CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y
+CONFIG_NETFILTER_XT_TARGET_CONNMARK=y
+CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
+CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y
+CONFIG_NETFILTER_XT_TARGET_MARK=y
+CONFIG_NETFILTER_XT_TARGET_NFLOG=y
+CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y
+CONFIG_NETFILTER_XT_TARGET_TPROXY=y
+CONFIG_NETFILTER_XT_TARGET_TRACE=y
+CONFIG_NETFILTER_XT_TARGET_SECMARK=y
+CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
+CONFIG_NETFILTER_XT_MATCH_COMMENT=y
+CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_CONNMARK=y
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
+CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_HELPER=y
+CONFIG_NETFILTER_XT_MATCH_IPRANGE=y
+CONFIG_NETFILTER_XT_MATCH_LENGTH=y
+CONFIG_NETFILTER_XT_MATCH_LIMIT=y
+CONFIG_NETFILTER_XT_MATCH_MAC=y
+CONFIG_NETFILTER_XT_MATCH_MARK=y
+CONFIG_NETFILTER_XT_MATCH_POLICY=y
+CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y
+CONFIG_NETFILTER_XT_MATCH_QTAGUID=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA2=y
+CONFIG_NETFILTER_XT_MATCH_SOCKET=y
+CONFIG_NETFILTER_XT_MATCH_STATE=y
+CONFIG_NETFILTER_XT_MATCH_STATISTIC=y
+CONFIG_NETFILTER_XT_MATCH_STRING=y
+CONFIG_NETFILTER_XT_MATCH_TIME=y
+CONFIG_NETFILTER_XT_MATCH_U32=y
+CONFIG_NF_CONNTRACK_IPV4=y
+CONFIG_IP_NF_IPTABLES=y
+CONFIG_IP_NF_MATCH_AH=y
+CONFIG_IP_NF_MATCH_ECN=y
+CONFIG_IP_NF_MATCH_RPFILTER=y
+CONFIG_IP_NF_MATCH_TTL=y
+CONFIG_IP_NF_FILTER=y
+CONFIG_IP_NF_TARGET_REJECT=y
+CONFIG_IP_NF_MANGLE=y
+CONFIG_IP_NF_TARGET_ECN=y
+CONFIG_IP_NF_TARGET_TTL=y
+CONFIG_IP_NF_RAW=y
+CONFIG_IP_NF_SECURITY=y
+CONFIG_IP_NF_ARPTABLES=y
+CONFIG_IP_NF_ARPFILTER=y
+CONFIG_IP_NF_ARP_MANGLE=y
+CONFIG_NF_CONNTRACK_IPV6=y
+CONFIG_IP6_NF_IPTABLES=y
+CONFIG_IP6_NF_MATCH_AH=y
+CONFIG_IP6_NF_MATCH_EUI64=y
+CONFIG_IP6_NF_MATCH_FRAG=y
+CONFIG_IP6_NF_MATCH_OPTS=y
+CONFIG_IP6_NF_MATCH_HL=y
+CONFIG_IP6_NF_MATCH_IPV6HEADER=y
+CONFIG_IP6_NF_MATCH_MH=y
+CONFIG_IP6_NF_MATCH_RT=y
+CONFIG_IP6_NF_TARGET_HL=y
+CONFIG_IP6_NF_FILTER=y
+CONFIG_IP6_NF_TARGET_REJECT=y
+CONFIG_IP6_NF_MANGLE=y
+CONFIG_IP6_NF_RAW=y
+CONFIG_BRIDGE=y
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_HTB=y
+CONFIG_NET_CLS_U32=y
+CONFIG_NET_EMATCH=y
+CONFIG_NET_EMATCH_U32=y
+CONFIG_NET_CLS_ACT=y
+# CONFIG_WIRELESS is not set
+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=8192
+CONFIG_VIRTIO_BLK=y
+CONFIG_SCSI=y
+# CONFIG_SCSI_PROC_FS is not set
+CONFIG_BLK_DEV_SD=y
+# CONFIG_SCSI_LOWLEVEL is not set
+CONFIG_MD=y
+CONFIG_BLK_DEV_DM=y
+CONFIG_DM_CRYPT=y
+CONFIG_DM_UEVENT=y
+CONFIG_DM_VERITY=y
+CONFIG_DM_VERITY_FEC=y
+CONFIG_NETDEVICES=y
+CONFIG_TUN=y
+CONFIG_VIRTIO_NET=y
+CONFIG_SMC91X=y
+CONFIG_PPP=y
+CONFIG_PPP_BSDCOMP=y
+CONFIG_PPP_DEFLATE=y
+CONFIG_PPP_MPPE=y
+CONFIG_PPPOLAC=y
+CONFIG_PPPOPNS=y
+# CONFIG_WLAN is not set
+CONFIG_INPUT_EVDEV=y
+CONFIG_INPUT_KEYRESET=y
+CONFIG_KEYBOARD_GOLDFISH_EVENTS=y
+# CONFIG_INPUT_MOUSE is not set
+CONFIG_INPUT_JOYSTICK=y
+CONFIG_INPUT_TABLET=y
+CONFIG_INPUT_MISC=y
+CONFIG_INPUT_KEYCHORD=y
+CONFIG_INPUT_UINPUT=y
+CONFIG_INPUT_GPIO=y
+# CONFIG_SERIO_SERPORT is not set
+# CONFIG_VT is not set
+# CONFIG_LEGACY_PTYS is not set
+# CONFIG_DEVMEM is not set
+# CONFIG_DEVKMEM is not set
+CONFIG_SERIAL_AMBA_PL011=y
+CONFIG_SERIAL_AMBA_PL011_CONSOLE=y
+CONFIG_VIRTIO_CONSOLE=y
+# CONFIG_HW_RANDOM is not set
+CONFIG_BATTERY_GOLDFISH=y
+# CONFIG_HWMON is not set
+CONFIG_MEDIA_SUPPORT=y
+CONFIG_FB=y
+CONFIG_FB_GOLDFISH=y
+CONFIG_FB_SIMPLE=y
+CONFIG_BACKLIGHT_LCD_SUPPORT=y
+CONFIG_LOGO=y
+# CONFIG_LOGO_LINUX_MONO is not set
+# CONFIG_LOGO_LINUX_VGA16 is not set
+CONFIG_SOUND=y
+CONFIG_SND=y
+CONFIG_HIDRAW=y
+CONFIG_UHID=y
+CONFIG_HID_A4TECH=y
+CONFIG_HID_ACRUX=y
+CONFIG_HID_ACRUX_FF=y
+CONFIG_HID_APPLE=y
+CONFIG_HID_BELKIN=y
+CONFIG_HID_CHERRY=y
+CONFIG_HID_CHICONY=y
+CONFIG_HID_PRODIKEYS=y
+CONFIG_HID_CYPRESS=y
+CONFIG_HID_DRAGONRISE=y
+CONFIG_DRAGONRISE_FF=y
+CONFIG_HID_EMS_FF=y
+CONFIG_HID_ELECOM=y
+CONFIG_HID_EZKEY=y
+CONFIG_HID_KEYTOUCH=y
+CONFIG_HID_KYE=y
+CONFIG_HID_WALTOP=y
+CONFIG_HID_GYRATION=y
+CONFIG_HID_TWINHAN=y
+CONFIG_HID_KENSINGTON=y
+CONFIG_HID_LCPOWER=y
+CONFIG_HID_LOGITECH=y
+CONFIG_HID_LOGITECH_DJ=y
+CONFIG_LOGITECH_FF=y
+CONFIG_LOGIRUMBLEPAD2_FF=y
+CONFIG_LOGIG940_FF=y
+CONFIG_HID_MAGICMOUSE=y
+CONFIG_HID_MICROSOFT=y
+CONFIG_HID_MONTEREY=y
+CONFIG_HID_MULTITOUCH=y
+CONFIG_HID_ORTEK=y
+CONFIG_HID_PANTHERLORD=y
+CONFIG_PANTHERLORD_FF=y
+CONFIG_HID_PETALYNX=y
+CONFIG_HID_PICOLCD=y
+CONFIG_HID_PRIMAX=y
+CONFIG_HID_SAITEK=y
+CONFIG_HID_SAMSUNG=y
+CONFIG_HID_SPEEDLINK=y
+CONFIG_HID_SUNPLUS=y
+CONFIG_HID_GREENASIA=y
+CONFIG_GREENASIA_FF=y
+CONFIG_HID_SMARTJOYPLUS=y
+CONFIG_SMARTJOYPLUS_FF=y
+CONFIG_HID_TIVO=y
+CONFIG_HID_TOPSEED=y
+CONFIG_HID_THRUSTMASTER=y
+CONFIG_HID_WACOM=y
+CONFIG_HID_WIIMOTE=y
+CONFIG_HID_ZEROPLUS=y
+CONFIG_HID_ZYDACRON=y
+# CONFIG_USB_SUPPORT is not set
+CONFIG_RTC_CLASS=y
+CONFIG_VIRTIO_MMIO=y
+CONFIG_STAGING=y
+CONFIG_ASHMEM=y
+CONFIG_ANDROID_TIMED_GPIO=y
+CONFIG_ANDROID_LOW_MEMORY_KILLER=y
+CONFIG_SYNC=y
+CONFIG_SW_SYNC=y
+CONFIG_SW_SYNC_USER=y
+CONFIG_ION=y
+CONFIG_GOLDFISH_AUDIO=y
+CONFIG_GOLDFISH=y
+CONFIG_GOLDFISH_PIPE=y
+# CONFIG_IOMMU_SUPPORT is not set
+CONFIG_ANDROID=y
+CONFIG_ANDROID_BINDER_IPC=y
+CONFIG_EXT2_FS=y
+CONFIG_EXT4_FS=y
+CONFIG_EXT4_FS_SECURITY=y
+CONFIG_QUOTA=y
+CONFIG_FUSE_FS=y
+CONFIG_CUSE=y
+CONFIG_MSDOS_FS=y
+CONFIG_VFAT_FS=y
+CONFIG_TMPFS=y
+CONFIG_TMPFS_POSIX_ACL=y
+# CONFIG_MISC_FILESYSTEMS is not set
+CONFIG_NFS_FS=y
+CONFIG_ROOT_NFS=y
+CONFIG_NLS_CODEPAGE_437=y
+CONFIG_NLS_ISO8859_1=y
+CONFIG_DEBUG_INFO=y
+CONFIG_DEBUG_FS=y
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_PANIC_TIMEOUT=5
+# CONFIG_SCHED_DEBUG is not set
+CONFIG_SCHEDSTATS=y
+CONFIG_TIMER_STATS=y
+# CONFIG_FTRACE is not set
+CONFIG_ATOMIC64_SELFTEST=y
+CONFIG_DEBUG_RODATA=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_SELINUX=y
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index 3f85bbcd7e40..148849c10604 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -42,6 +42,15 @@
msr daifclr, #2
.endm
+/*
+ * Copy the current DAIF value into \flags, then write #2 to daifset
+ * (masking IRQs, going by the macro name).  Pair with restore_irq.
+ */
+ .macro save_and_disable_irq, flags
+ mrs \flags, daif
+ msr daifset, #2
+ .endm
+
+/*
+ * Write \flags back to DAIF, i.e. restore the value previously captured
+ * by save_and_disable_irq.
+ */
+ .macro restore_irq, flags
+ msr daif, \flags
+ .endm
+
/*
* Enable and disable debug exceptions.
*/
@@ -451,6 +460,13 @@ alternative_endif
movk \reg, :abs_g0_nc:\val
.endm
+/*
+ * Return the current thread_info.
+ */
+ .macro get_thread_info, rd
+ mrs \rd, sp_el0
+ .endm
+
.macro pte_to_phys, phys, pte
and \phys, \pte, #(((1 << (48 - PAGE_SHIFT)) - 1) << PAGE_SHIFT)
.endm
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 15868eca58de..1dc16f5b54ec 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -221,6 +221,12 @@ static inline bool system_supports_mixed_endian_el0(void)
return id_aa64mmfr0_mixed_endian_el0(read_system_reg(SYS_ID_AA64MMFR0_EL1));
}
+/*
+ * True when the TTBR0-switching PAN emulation is in effect: the kernel was
+ * built with CONFIG_ARM64_SW_TTBR0_PAN and the CPUs do not report the
+ * ARM64_HAS_PAN capability.  With the config option off this is a
+ * compile-time false.
+ */
+static inline bool system_uses_ttbr0_pan(void)
+{
+ return IS_ENABLED(CONFIG_ARM64_SW_TTBR0_PAN) &&
+ !cpus_have_cap(ARM64_HAS_PAN);
+}
+
#define ARM64_SSBD_UNKNOWN -1
#define ARM64_SSBD_FORCE_DISABLE 0
#define ARM64_SSBD_KERNEL 1
diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h
index 65615820155e..6c4f9e8febf3 100644
--- a/arch/arm64/include/asm/efi.h
+++ b/arch/arm64/include/asm/efi.h
@@ -1,6 +1,7 @@
#ifndef _ASM_EFI_H
#define _ASM_EFI_H
+#include <asm/cpufeature.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/neon.h>
@@ -78,7 +79,32 @@ static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt)
static inline void efi_set_pgd(struct mm_struct *mm)
{
- switch_mm(NULL, mm, NULL);
+ __switch_mm(mm);
+
+ if (system_uses_ttbr0_pan()) {
+ if (mm != current->active_mm) {
+ /*
+ * Update the current thread's saved ttbr0 since it is
+ * restored as part of a return from exception. Enable
+ * access to the valid TTBR0_EL1 and invoke the errata
+ * workaround directly since there is no return from
+ * exception when invoking the EFI run-time services.
+ */
+ update_saved_ttbr0(current, mm);
+ uaccess_ttbr0_enable();
+ post_ttbr_update_workaround();
+ } else {
+ /*
+ * Defer the switch to the current thread's TTBR0_EL1
+ * until uaccess_enable(). Restore the current
+ * thread's saved ttbr0 corresponding to its active_mm
+ * (if different from init_mm).
+ */
+ uaccess_ttbr0_disable();
+ if (current->active_mm != &init_mm)
+ update_saved_ttbr0(current, current->active_mm);
+ }
+ }
}
void efi_virtmap_load(void);
diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h
index 1fb023076dfc..40a8a94db23b 100644
--- a/arch/arm64/include/asm/elf.h
+++ b/arch/arm64/include/asm/elf.h
@@ -169,7 +169,7 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm,
#ifdef CONFIG_COMPAT
/* PIE load location for compat arm. Must match ARM ELF_ET_DYN_BASE. */
-#define COMPAT_ELF_ET_DYN_BASE 0x000400000UL
+#define COMPAT_ELF_ET_DYN_BASE (2 * TASK_SIZE_32 / 3)
/* AArch32 registers. */
#define COMPAT_ELF_NGREG 18
diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h
index d14c478976d0..85997c0e5443 100644
--- a/arch/arm64/include/asm/esr.h
+++ b/arch/arm64/include/asm/esr.h
@@ -175,6 +175,12 @@
#define ESR_ELx_SYS64_ISS_SYS_CTR_READ (ESR_ELx_SYS64_ISS_SYS_CTR | \
ESR_ELx_SYS64_ISS_DIR_READ)
+#define ESR_ELx_SYS64_ISS_SYS_CNTVCT (ESR_ELx_SYS64_ISS_SYS_VAL(3, 3, 2, 14, 0) | \
+ ESR_ELx_SYS64_ISS_DIR_READ)
+
+#define ESR_ELx_SYS64_ISS_SYS_CNTFRQ (ESR_ELx_SYS64_ISS_SYS_VAL(3, 3, 0, 14, 0) | \
+ ESR_ELx_SYS64_ISS_DIR_READ)
+
#ifndef __ASSEMBLY__
#include <asm/types.h>
diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
index 2a5090fb9113..a891bb6bb3d4 100644
--- a/arch/arm64/include/asm/futex.h
+++ b/arch/arm64/include/asm/futex.h
@@ -21,15 +21,12 @@
#include <linux/futex.h>
#include <linux/uaccess.h>
-#include <asm/alternative.h>
-#include <asm/cpufeature.h>
#include <asm/errno.h>
-#include <asm/sysreg.h>
#define __futex_atomic_op(insn, ret, oldval, uaddr, tmp, oparg) \
+do { \
+ uaccess_enable(); \
asm volatile( \
- ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \
- CONFIG_ARM64_PAN) \
" prfm pstl1strm, %2\n" \
"1: ldxr %w1, %2\n" \
insn "\n" \
@@ -44,11 +41,11 @@
" .popsection\n" \
_ASM_EXTABLE(1b, 4b) \
_ASM_EXTABLE(2b, 4b) \
- ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \
- CONFIG_ARM64_PAN) \
: "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp) \
: "r" (oparg), "Ir" (-EFAULT) \
- : "memory")
+ : "memory"); \
+ uaccess_disable(); \
+} while (0)
static inline int
arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
@@ -102,8 +99,8 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *_uaddr,
return -EFAULT;
uaddr = __uaccess_mask_ptr(_uaddr);
+ uaccess_enable();
asm volatile("// futex_atomic_cmpxchg_inatomic\n"
-ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN)
" prfm pstl1strm, %2\n"
"1: ldxr %w1, %2\n"
" sub %w3, %w1, %w4\n"
@@ -118,10 +115,10 @@ ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN)
" .popsection\n"
_ASM_EXTABLE(1b, 4b)
_ASM_EXTABLE(2b, 4b)
-ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN)
: "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp)
: "r" (oldval), "r" (newval), "Ir" (-EFAULT)
: "memory");
+ uaccess_disable();
*uval = val;
return ret;
diff --git a/arch/arm64/include/asm/hw_breakpoint.h b/arch/arm64/include/asm/hw_breakpoint.h
index 9510ace570e2..b6b167ac082b 100644
--- a/arch/arm64/include/asm/hw_breakpoint.h
+++ b/arch/arm64/include/asm/hw_breakpoint.h
@@ -77,7 +77,11 @@ static inline void decode_ctrl_reg(u32 reg,
/* Lengths */
#define ARM_BREAKPOINT_LEN_1 0x1
#define ARM_BREAKPOINT_LEN_2 0x3
+#define ARM_BREAKPOINT_LEN_3 0x7
#define ARM_BREAKPOINT_LEN_4 0xf
+#define ARM_BREAKPOINT_LEN_5 0x1f
+#define ARM_BREAKPOINT_LEN_6 0x3f
+#define ARM_BREAKPOINT_LEN_7 0x7f
#define ARM_BREAKPOINT_LEN_8 0xff
/* Kernel stepping */
@@ -119,7 +123,7 @@ struct perf_event;
struct pmu;
extern int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl,
- int *gen_len, int *gen_type);
+ int *gen_len, int *gen_type, int *offset);
extern int arch_check_bp_in_kernelspace(struct perf_event *bp);
extern int arch_validate_hwbkpt_settings(struct perf_event *bp);
extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused,
diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h
index 7e51d1b57c0c..77a27af01371 100644
--- a/arch/arm64/include/asm/kernel-pgtable.h
+++ b/arch/arm64/include/asm/kernel-pgtable.h
@@ -19,6 +19,7 @@
#ifndef __ASM_KERNEL_PGTABLE_H
#define __ASM_KERNEL_PGTABLE_H
+#include <asm/pgtable.h>
#include <asm/sparsemem.h>
/*
@@ -54,6 +55,12 @@
#define SWAPPER_DIR_SIZE (SWAPPER_PGTABLE_LEVELS * PAGE_SIZE)
#define IDMAP_DIR_SIZE (IDMAP_PGTABLE_LEVELS * PAGE_SIZE)
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+#define RESERVED_TTBR0_SIZE (PAGE_SIZE)
+#else
+#define RESERVED_TTBR0_SIZE (0)
+#endif
+
/* Initial memory map size */
#if ARM64_SWAPPER_USES_SECTION_MAPS
#define SWAPPER_BLOCK_SHIFT SECTION_SHIFT
@@ -71,8 +78,16 @@
/*
* Initial memory map attributes.
*/
-#define SWAPPER_PTE_FLAGS (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED)
-#define SWAPPER_PMD_FLAGS (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S)
+#define _SWAPPER_PTE_FLAGS (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED)
+#define _SWAPPER_PMD_FLAGS (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S)
+
+#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
+#define SWAPPER_PTE_FLAGS (_SWAPPER_PTE_FLAGS | PTE_NG)
+#define SWAPPER_PMD_FLAGS (_SWAPPER_PMD_FLAGS | PMD_SECT_NG)
+#else
+#define SWAPPER_PTE_FLAGS _SWAPPER_PTE_FLAGS
+#define SWAPPER_PMD_FLAGS _SWAPPER_PMD_FLAGS
+#endif
#if ARM64_SWAPPER_USES_SECTION_MAPS
#define SWAPPER_MM_MMUFLAGS (PMD_ATTRINDX(MT_NORMAL) | SWAPPER_PMD_FLAGS)
diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index 6ac34c75f4e1..18b9f159afcd 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -16,7 +16,9 @@
#ifndef __ASM_MMU_H
#define __ASM_MMU_H
+
#define USER_ASID_FLAG (UL(1) << 48)
+#define TTBR_ASID_MASK (UL(0xffff) << 48)
#ifndef __ASSEMBLY__
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index b96c4799f881..1ce6f3c92389 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -23,6 +23,7 @@
#include <linux/sched.h>
#include <asm/cacheflush.h>
+#include <asm/cpufeature.h>
#include <asm/proc-fns.h>
#include <asm-generic/mm_hooks.h>
#include <asm/cputype.h>
@@ -110,7 +111,7 @@ static inline void cpu_uninstall_idmap(void)
local_flush_tlb_all();
cpu_set_default_tcr_t0sz();
- if (mm != &init_mm)
+ if (mm != &init_mm && !system_uses_ttbr0_pan())
cpu_switch_mm(mm->pgd, mm);
}
@@ -170,20 +171,27 @@ enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
}
-/*
- * This is the actual mm switch as far as the scheduler
- * is concerned. No registers are touched. We avoid
- * calling the CPU specific function when the mm hasn't
- * actually changed.
- */
-static inline void
-switch_mm(struct mm_struct *prev, struct mm_struct *next,
- struct task_struct *tsk)
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+static inline void update_saved_ttbr0(struct task_struct *tsk,
+ struct mm_struct *mm)
{
- unsigned int cpu = smp_processor_id();
+ if (system_uses_ttbr0_pan()) {
+ u64 ttbr;
+ BUG_ON(mm->pgd == swapper_pg_dir);
+ ttbr = virt_to_phys(mm->pgd) | ASID(mm) << 48;
+ WRITE_ONCE(task_thread_info(tsk)->ttbr0, ttbr);
+ }
+}
+#else
+static inline void update_saved_ttbr0(struct task_struct *tsk,
+ struct mm_struct *mm)
+{
+}
+#endif
- if (prev == next)
- return;
+static inline void __switch_mm(struct mm_struct *next)
+{
+ unsigned int cpu = smp_processor_id();
/*
* init_mm.pgd does not contain any user mappings and it is always
@@ -197,9 +205,28 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next,
check_and_switch_context(next, cpu);
}
+static inline void
+switch_mm(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk)
+{
+ if (prev != next)
+ __switch_mm(next);
+
+ /*
+ * Update the saved TTBR0_EL1 of the scheduled-in task as the previous
+ * value may not have been initialised yet (activate_mm caller) or the
+ * ASID has changed since the last run (following the context switch
+ * of another thread of the same process). Avoid setting the reserved
+ * TTBR0_EL1 to swapper_pg_dir (init_mm; e.g. via idle_task_exit).
+ */
+ if (next != &init_mm)
+ update_saved_ttbr0(tsk, next);
+}
+
#define deactivate_mm(tsk,mm) do { } while (0)
-#define activate_mm(prev,next) switch_mm(prev, next, NULL)
+#define activate_mm(prev,next) switch_mm(prev, next, current)
void verify_cpu_asid_bits(void);
+void post_ttbr_update_workaround(void);
#endif
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index 0dd1bc13f942..7ca27e515115 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -48,6 +48,9 @@ struct thread_info {
unsigned long flags; /* low level flags */
mm_segment_t addr_limit; /* address limit */
struct task_struct *task; /* main task structure */
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+ u64 ttbr0; /* saved TTBR0_EL1 */
+#endif
int preempt_count; /* 0 => preemptable, <0 => bug */
int cpu; /* cpu */
};
diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
index 8b57339823e9..7ec84d0191c8 100644
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -31,6 +31,14 @@ int pcibus_to_node(struct pci_bus *bus);
cpumask_of_node(pcibus_to_node(bus)))
#endif /* CONFIG_NUMA */
+struct sched_domain;
+#ifdef CONFIG_CPU_FREQ
+#define arch_scale_freq_capacity cpufreq_scale_freq_capacity
+extern unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu);
+extern unsigned long cpufreq_scale_max_freq_capacity(int cpu);
+#endif
+#define arch_scale_cpu_capacity scale_cpu_capacity
+extern unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu);
#include <asm-generic/topology.h>
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index f5cd96c60eb9..ed846aff603c 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -18,6 +18,13 @@
#ifndef __ASM_UACCESS_H
#define __ASM_UACCESS_H
+#include <asm/alternative.h>
+#include <asm/kernel-pgtable.h>
+#include <asm/mmu.h>
+#include <asm/sysreg.h>
+
+#ifndef __ASSEMBLY__
+
/*
* User space memory access functions
*/
@@ -26,11 +33,9 @@
#include <linux/string.h>
#include <linux/thread_info.h>
-#include <asm/alternative.h>
#include <asm/cpufeature.h>
#include <asm/processor.h>
#include <asm/ptrace.h>
-#include <asm/sysreg.h>
#include <asm/errno.h>
#include <asm/memory.h>
#include <asm/compiler.h>
@@ -136,6 +141,115 @@ static inline unsigned long __range_ok(unsigned long addr, unsigned long size)
" .popsection\n"
/*
+ * User access enabling/disabling.
+ */
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+static inline void __uaccess_ttbr0_disable(void)
+{
+ unsigned long flags, ttbr;
+
+ local_irq_save(flags);
+ ttbr = read_sysreg(ttbr1_el1);
+ ttbr &= ~TTBR_ASID_MASK;
+ /* reserved_ttbr0 placed at the end of swapper_pg_dir */
+ write_sysreg(ttbr + SWAPPER_DIR_SIZE, ttbr0_el1);
+ isb();
+ /* Set reserved ASID */
+ write_sysreg(ttbr, ttbr1_el1);
+ isb();
+ local_irq_restore(flags);
+}
+
+static inline void __uaccess_ttbr0_enable(void)
+{
+ unsigned long flags, ttbr0, ttbr1;
+
+ /*
+ * Disable interrupts to avoid preemption between reading the 'ttbr0'
+ * variable and the MSR. A context switch could trigger an ASID
+ * roll-over and an update of 'ttbr0'.
+ */
+ local_irq_save(flags);
+ ttbr0 = READ_ONCE(current_thread_info()->ttbr0);
+
+ /* Restore active ASID */
+ ttbr1 = read_sysreg(ttbr1_el1);
+ ttbr1 &= ~TTBR_ASID_MASK; /* safety measure */
+ ttbr1 |= ttbr0 & TTBR_ASID_MASK;
+ write_sysreg(ttbr1, ttbr1_el1);
+ isb();
+
+ /* Restore user page table */
+ write_sysreg(ttbr0, ttbr0_el1);
+ isb();
+ local_irq_restore(flags);
+}
+
+static inline bool uaccess_ttbr0_disable(void)
+{
+ if (!system_uses_ttbr0_pan())
+ return false;
+ __uaccess_ttbr0_disable();
+ return true;
+}
+
+static inline bool uaccess_ttbr0_enable(void)
+{
+ if (!system_uses_ttbr0_pan())
+ return false;
+ __uaccess_ttbr0_enable();
+ return true;
+}
+#else
+static inline bool uaccess_ttbr0_disable(void)
+{
+ return false;
+}
+
+static inline bool uaccess_ttbr0_enable(void)
+{
+ return false;
+}
+#endif
+
+#define __uaccess_disable(alt) \
+do { \
+ if (!uaccess_ttbr0_disable()) \
+ asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), alt, \
+ CONFIG_ARM64_PAN)); \
+} while (0)
+
+#define __uaccess_enable(alt) \
+do { \
+ if (!uaccess_ttbr0_enable()) \
+ asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), alt, \
+ CONFIG_ARM64_PAN)); \
+} while (0)
+
+static inline void uaccess_disable(void)
+{
+ __uaccess_disable(ARM64_HAS_PAN);
+}
+
+static inline void uaccess_enable(void)
+{
+ __uaccess_enable(ARM64_HAS_PAN);
+}
+
+/*
+ * These functions are no-ops when UAO is present.
+ */
+static inline void uaccess_disable_not_uao(void)
+{
+ __uaccess_disable(ARM64_ALT_PAN_NOT_UAO);
+}
+
+static inline void uaccess_enable_not_uao(void)
+{
+ __uaccess_enable(ARM64_ALT_PAN_NOT_UAO);
+}
+
+/*
* Sanitise a uaccess pointer such that it becomes NULL if above the
* current addr_limit.
*/
@@ -182,8 +296,7 @@ static inline void __user *__uaccess_mask_ptr(const void __user *ptr)
do { \
unsigned long __gu_val; \
__chk_user_ptr(ptr); \
- asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\
- CONFIG_ARM64_PAN)); \
+ uaccess_enable_not_uao(); \
switch (sizeof(*(ptr))) { \
case 1: \
__get_user_asm("ldrb", "ldtrb", "%w", __gu_val, (ptr), \
@@ -204,9 +317,8 @@ do { \
default: \
BUILD_BUG(); \
} \
+ uaccess_disable_not_uao(); \
(x) = (__force __typeof__(*(ptr)))__gu_val; \
- asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\
- CONFIG_ARM64_PAN)); \
} while (0)
#define __get_user_check(x, ptr, err) \
@@ -256,8 +368,7 @@ do { \
do { \
__typeof__(*(ptr)) __pu_val = (x); \
__chk_user_ptr(ptr); \
- asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\
- CONFIG_ARM64_PAN)); \
+ uaccess_enable_not_uao(); \
switch (sizeof(*(ptr))) { \
case 1: \
__put_user_asm("strb", "sttrb", "%w", __pu_val, (ptr), \
@@ -278,8 +389,7 @@ do { \
default: \
BUILD_BUG(); \
} \
- asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\
- CONFIG_ARM64_PAN)); \
+ uaccess_disable_not_uao(); \
} while (0)
#define __put_user_check(x, ptr, err) \
@@ -379,4 +489,77 @@ extern long strncpy_from_user(char *dest, const char __user *src, long count);
extern __must_check long strlen_user(const char __user *str);
extern __must_check long strnlen_user(const char __user *str, long n);
+#else /* __ASSEMBLY__ */
+
+#include <asm/assembler.h>
+
+/*
+ * User access enabling/disabling macros.
+ */
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+ .macro __uaccess_ttbr0_disable, tmp1
+ mrs \tmp1, ttbr1_el1 // swapper_pg_dir
+ bic \tmp1, \tmp1, #TTBR_ASID_MASK
+ add \tmp1, \tmp1, #SWAPPER_DIR_SIZE // reserved_ttbr0 at the end of swapper_pg_dir
+ msr ttbr0_el1, \tmp1 // set reserved TTBR0_EL1
+ isb
+ sub \tmp1, \tmp1, #SWAPPER_DIR_SIZE
+ msr ttbr1_el1, \tmp1 // set reserved ASID
+ isb
+ .endm
+
+ .macro __uaccess_ttbr0_enable, tmp1, tmp2
+ get_thread_info \tmp1
+ ldr \tmp1, [\tmp1, #TSK_TI_TTBR0] // load saved TTBR0_EL1
+ mrs \tmp2, ttbr1_el1
+ extr \tmp2, \tmp2, \tmp1, #48
+ ror \tmp2, \tmp2, #16
+ msr ttbr1_el1, \tmp2 // set the active ASID
+ isb
+ msr ttbr0_el1, \tmp1 // set the non-PAN TTBR0_EL1
+ isb
+ .endm
+
+ .macro uaccess_ttbr0_disable, tmp1, tmp2
+alternative_if_not ARM64_HAS_PAN
+ save_and_disable_irq \tmp2 // avoid preemption
+ __uaccess_ttbr0_disable \tmp1
+ restore_irq \tmp2
+alternative_else_nop_endif
+ .endm
+
+ .macro uaccess_ttbr0_enable, tmp1, tmp2, tmp3
+alternative_if_not ARM64_HAS_PAN
+ save_and_disable_irq \tmp3 // avoid preemption
+ __uaccess_ttbr0_enable \tmp1, \tmp2
+ restore_irq \tmp3
+alternative_else_nop_endif
+ .endm
+#else
+ .macro uaccess_ttbr0_disable, tmp1, tmp2
+ .endm
+
+ .macro uaccess_ttbr0_enable, tmp1, tmp2, tmp3
+ .endm
+#endif
+
+/*
+ * These macros are no-ops when UAO is present.
+ */
+ .macro uaccess_disable_not_uao, tmp1, tmp2
+ uaccess_ttbr0_disable \tmp1, \tmp2
+alternative_if ARM64_ALT_PAN_NOT_UAO
+ SET_PSTATE_PAN(1)
+alternative_else_nop_endif
+ .endm
+
+ .macro uaccess_enable_not_uao, tmp1, tmp2, tmp3
+ uaccess_ttbr0_enable \tmp1, \tmp2, \tmp3
+alternative_if ARM64_ALT_PAN_NOT_UAO
+ SET_PSTATE_PAN(0)
+alternative_else_nop_endif
+ .endm
+
+#endif /* __ASSEMBLY__ */
+
#endif /* __ASM_UACCESS_H */
diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c
index c0ede237c14b..29d2ad8844a5 100644
--- a/arch/arm64/kernel/armv8_deprecated.c
+++ b/arch/arm64/kernel/armv8_deprecated.c
@@ -14,7 +14,6 @@
#include <linux/slab.h>
#include <linux/sysctl.h>
-#include <asm/alternative.h>
#include <asm/cpufeature.h>
#include <asm/insn.h>
#include <asm/opcodes.h>
@@ -285,10 +284,10 @@ static void __init register_insn_emulation_sysctl(struct ctl_table *table)
#define __SWP_LL_SC_LOOPS 4
#define __user_swpX_asm(data, addr, res, temp, temp2, B) \
+do { \
+ uaccess_enable(); \
__asm__ __volatile__( \
" mov %w3, %w7\n" \
- ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \
- CONFIG_ARM64_PAN) \
"0: ldxr"B" %w2, [%4]\n" \
"1: stxr"B" %w0, %w1, [%4]\n" \
" cbz %w0, 2f\n" \
@@ -306,13 +305,13 @@ static void __init register_insn_emulation_sysctl(struct ctl_table *table)
" .popsection" \
_ASM_EXTABLE(0b, 4b) \
_ASM_EXTABLE(1b, 4b) \
- ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \
- CONFIG_ARM64_PAN) \
: "=&r" (res), "+r" (data), "=&r" (temp), "=&r" (temp2) \
: "r" ((unsigned long)addr), "i" (-EAGAIN), \
"i" (-EFAULT), \
"i" (__SWP_LL_SC_LOOPS) \
- : "memory")
+ : "memory"); \
+ uaccess_disable(); \
+} while (0)
#define __user_swp_asm(data, addr, res, temp, temp2) \
__user_swpX_asm(data, addr, res, temp, temp2, "")
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index bd239b1b7a68..ee768f414077 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -42,6 +42,9 @@ int main(void)
DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit));
DEFINE(TI_TASK, offsetof(struct thread_info, task));
DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+ DEFINE(TSK_TI_TTBR0, offsetof(struct thread_info, ttbr0));
+#endif
BLANK();
DEFINE(THREAD_CPU_CONTEXT, offsetof(struct task_struct, thread.cpu_context));
BLANK();
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index ca978d7d98eb..35d674773852 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -32,7 +32,9 @@
#include <asm/memory.h>
#include <asm/mmu.h>
#include <asm/processor.h>
+#include <asm/ptrace.h>
#include <asm/thread_info.h>
+#include <asm/uaccess.h>
#include <asm/asm-uaccess.h>
#include <asm/unistd.h>
#include <asm/kernel-pgtable.h>
@@ -164,6 +166,32 @@ alternative_cb_end
mrs x22, elr_el1
mrs x23, spsr_el1
stp lr, x21, [sp, #S_LR]
+
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+ /*
+ * Set the TTBR0 PAN bit in SPSR. When the exception is taken from
+ * EL0, there is no need to check the state of TTBR0_EL1 since
+ * accesses are always enabled.
+ * Note that the meaning of this bit differs from the ARMv8.1 PAN
+ * feature as all TTBR0_EL1 accesses are disabled, not just those to
+ * user mappings.
+ */
+alternative_if ARM64_HAS_PAN
+ b 1f // skip TTBR0 PAN
+alternative_else_nop_endif
+
+ .if \el != 0
+ mrs x21, ttbr0_el1
+ tst x21, #TTBR_ASID_MASK // Check for the reserved ASID
+ orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR
+ b.eq 1f // TTBR0 access already disabled
+ and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR
+ .endif
+
+ __uaccess_ttbr0_disable x21
+1:
+#endif
+
stp x22, x23, [sp, #S_PC]
/*
@@ -202,6 +230,40 @@ alternative_cb_end
ldp x21, x22, [sp, #S_PC] // load ELR, SPSR
.if \el == 0
ct_user_enter
+ .endif
+
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+ /*
+ * Restore access to TTBR0_EL1. If returning to EL0, no need for SPSR
+ * PAN bit checking.
+ */
+alternative_if ARM64_HAS_PAN
+ b 2f // skip TTBR0 PAN
+alternative_else_nop_endif
+
+ .if \el != 0
+ tbnz x22, #22, 1f // Skip re-enabling TTBR0 access if the PSR_PAN_BIT is set
+ .endif
+
+ __uaccess_ttbr0_enable x0, x1
+
+ .if \el == 0
+ /*
+ * Enable errata workarounds only if returning to user. The only
+ * workaround currently required for TTBR0_EL1 changes is for the
+ * Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache
+ * corruption).
+ */
+ bl post_ttbr_update_workaround
+ .endif
+1:
+ .if \el != 0
+ and x22, x22, #~PSR_PAN_BIT // ARMv8.0 CPUs do not understand this bit
+ .endif
+2:
+#endif
+
+ .if \el == 0
ldr x23, [sp, #S_SP] // load return stack pointer
msr sp_el0, x23
tst x22, #PSR_MODE32_BIT // native task?
@@ -221,6 +283,7 @@ alternative_else_nop_endif
apply_ssbd 0, 5f, x0, x1
5:
.endif
+
msr elr_el1, x21 // set up the return data
msr spsr_el1, x22
ldp x0, x1, [sp, #16 * 0]
@@ -257,10 +320,6 @@ alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0
.endif
.endm
- .macro get_thread_info, rd
- mrs \rd, sp_el0
- .endm
-
.macro irq_stack_entry
mov x19, sp // preserve the original sp
@@ -891,14 +950,24 @@ __ni_sys_trace:
.macro tramp_map_kernel, tmp
mrs \tmp, ttbr1_el1
- sub \tmp, \tmp, #SWAPPER_DIR_SIZE
+ sub \tmp, \tmp, #(SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE)
bic \tmp, \tmp, #USER_ASID_FLAG
msr ttbr1_el1, \tmp
+#ifdef CONFIG_ARCH_MSM8996
+ /* ASID already in \tmp[63:48] */
+ movk \tmp, #:abs_g2_nc:(TRAMP_VALIAS >> 12)
+ movk \tmp, #:abs_g1_nc:(TRAMP_VALIAS >> 12)
+ /* 2MB boundary containing the vectors, so we nobble the walk cache */
+ movk \tmp, #:abs_g0_nc:((TRAMP_VALIAS & ~(SZ_2M - 1)) >> 12)
+ isb
+ tlbi vae1, \tmp
+ dsb nsh
+#endif /* CONFIG_ARCH_MSM8996 */
.endm
.macro tramp_unmap_kernel, tmp
mrs \tmp, ttbr1_el1
- add \tmp, \tmp, #SWAPPER_DIR_SIZE
+ add \tmp, \tmp, #(SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE)
orr \tmp, \tmp, #USER_ASID_FLAG
msr ttbr1_el1, \tmp
/*
@@ -925,7 +994,9 @@ __ni_sys_trace:
tramp_map_kernel x30
#ifdef CONFIG_RANDOMIZE_BASE
adr x30, tramp_vectors + PAGE_SIZE
+#ifndef CONFIG_ARCH_MSM8996
isb
+#endif
ldr x30, [x30]
#else
ldr x30, =vectors
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index fa52817d84c5..6bf32265909d 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -326,14 +326,14 @@ __create_page_tables:
* dirty cache lines being evicted.
*/
adrp x0, idmap_pg_dir
- adrp x1, swapper_pg_dir + SWAPPER_DIR_SIZE
+ adrp x1, swapper_pg_dir + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE
bl __inval_cache_range
/*
* Clear the idmap and swapper page tables.
*/
adrp x0, idmap_pg_dir
- adrp x6, swapper_pg_dir + SWAPPER_DIR_SIZE
+ adrp x6, swapper_pg_dir + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE
1: stp xzr, xzr, [x0], #16
stp xzr, xzr, [x0], #16
stp xzr, xzr, [x0], #16
@@ -412,7 +412,7 @@ __create_page_tables:
* tables again to remove any speculatively loaded cache lines.
*/
adrp x0, idmap_pg_dir
- adrp x1, swapper_pg_dir + SWAPPER_DIR_SIZE
+ adrp x1, swapper_pg_dir + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE
dmb sy
bl __inval_cache_range
diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c
index 0b9e5f6290f9..fb0082ab40a7 100644
--- a/arch/arm64/kernel/hw_breakpoint.c
+++ b/arch/arm64/kernel/hw_breakpoint.c
@@ -318,9 +318,21 @@ static int get_hbp_len(u8 hbp_len)
case ARM_BREAKPOINT_LEN_2:
len_in_bytes = 2;
break;
+ case ARM_BREAKPOINT_LEN_3:
+ len_in_bytes = 3;
+ break;
case ARM_BREAKPOINT_LEN_4:
len_in_bytes = 4;
break;
+ case ARM_BREAKPOINT_LEN_5:
+ len_in_bytes = 5;
+ break;
+ case ARM_BREAKPOINT_LEN_6:
+ len_in_bytes = 6;
+ break;
+ case ARM_BREAKPOINT_LEN_7:
+ len_in_bytes = 7;
+ break;
case ARM_BREAKPOINT_LEN_8:
len_in_bytes = 8;
break;
@@ -350,7 +362,7 @@ int arch_check_bp_in_kernelspace(struct perf_event *bp)
* to generic breakpoint descriptions.
*/
int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl,
- int *gen_len, int *gen_type)
+ int *gen_len, int *gen_type, int *offset)
{
/* Type */
switch (ctrl.type) {
@@ -370,17 +382,33 @@ int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl,
return -EINVAL;
}
+ if (!ctrl.len)
+ return -EINVAL;
+ *offset = __ffs(ctrl.len);
+
/* Len */
- switch (ctrl.len) {
+ switch (ctrl.len >> *offset) {
case ARM_BREAKPOINT_LEN_1:
*gen_len = HW_BREAKPOINT_LEN_1;
break;
case ARM_BREAKPOINT_LEN_2:
*gen_len = HW_BREAKPOINT_LEN_2;
break;
+ case ARM_BREAKPOINT_LEN_3:
+ *gen_len = HW_BREAKPOINT_LEN_3;
+ break;
case ARM_BREAKPOINT_LEN_4:
*gen_len = HW_BREAKPOINT_LEN_4;
break;
+ case ARM_BREAKPOINT_LEN_5:
+ *gen_len = HW_BREAKPOINT_LEN_5;
+ break;
+ case ARM_BREAKPOINT_LEN_6:
+ *gen_len = HW_BREAKPOINT_LEN_6;
+ break;
+ case ARM_BREAKPOINT_LEN_7:
+ *gen_len = HW_BREAKPOINT_LEN_7;
+ break;
case ARM_BREAKPOINT_LEN_8:
*gen_len = HW_BREAKPOINT_LEN_8;
break;
@@ -424,9 +452,21 @@ static int arch_build_bp_info(struct perf_event *bp)
case HW_BREAKPOINT_LEN_2:
info->ctrl.len = ARM_BREAKPOINT_LEN_2;
break;
+ case HW_BREAKPOINT_LEN_3:
+ info->ctrl.len = ARM_BREAKPOINT_LEN_3;
+ break;
case HW_BREAKPOINT_LEN_4:
info->ctrl.len = ARM_BREAKPOINT_LEN_4;
break;
+ case HW_BREAKPOINT_LEN_5:
+ info->ctrl.len = ARM_BREAKPOINT_LEN_5;
+ break;
+ case HW_BREAKPOINT_LEN_6:
+ info->ctrl.len = ARM_BREAKPOINT_LEN_6;
+ break;
+ case HW_BREAKPOINT_LEN_7:
+ info->ctrl.len = ARM_BREAKPOINT_LEN_7;
+ break;
case HW_BREAKPOINT_LEN_8:
info->ctrl.len = ARM_BREAKPOINT_LEN_8;
break;
@@ -518,18 +558,17 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
default:
return -EINVAL;
}
-
- info->address &= ~alignment_mask;
- info->ctrl.len <<= offset;
} else {
if (info->ctrl.type == ARM_BREAKPOINT_EXECUTE)
alignment_mask = 0x3;
else
alignment_mask = 0x7;
- if (info->address & alignment_mask)
- return -EINVAL;
+ offset = info->address & alignment_mask;
}
+ info->address &= ~alignment_mask;
+ info->ctrl.len <<= offset;
+
/*
* Disallow per-task kernel breakpoints since these would
* complicate the stepping code.
@@ -662,12 +701,47 @@ unlock:
}
NOKPROBE_SYMBOL(breakpoint_handler);
+/*
+ * Arm64 hardware does not always report a watchpoint hit address that matches
+ * one of the watchpoints set. It can also report an address "near" the
+ * watchpoint if a single instruction accesses both watched and unwatched
+ * addresses. There is no straightforward way, short of disassembling the
+ * offending instruction, to map that address back to the watchpoint. This
+ * function computes the distance of the memory access from the watchpoint as a
+ * heuristic for the likelihood that a given access triggered the watchpoint.
+ *
+ * See Section D2.10.5 "Determining the memory location that caused a Watchpoint
+ * exception" of ARMv8 Architecture Reference Manual for details.
+ *
+ * The function returns the distance of the address from the bytes watched by
+ * the watchpoint. In case of an exact match, it returns 0.
+ */
+static u64 get_distance_from_watchpoint(unsigned long addr, u64 val,
+ struct arch_hw_breakpoint_ctrl *ctrl)
+{
+ u64 wp_low, wp_high;
+ u32 lens, lene;
+
+ lens = __ffs(ctrl->len);
+ lene = __fls(ctrl->len);
+
+ wp_low = val + lens;
+ wp_high = val + lene;
+ if (addr < wp_low)
+ return wp_low - addr;
+ else if (addr > wp_high)
+ return addr - wp_high;
+ else
+ return 0;
+}
+
static int watchpoint_handler(unsigned long addr, unsigned int esr,
struct pt_regs *regs)
{
- int i, step = 0, *kernel_step, access;
+ int i, step = 0, *kernel_step, access, closest_match = 0;
+ u64 min_dist = -1, dist;
u32 ctrl_reg;
- u64 val, alignment_mask;
+ u64 val;
struct perf_event *wp, **slots;
struct debug_info *debug_info;
struct arch_hw_breakpoint *info;
@@ -676,35 +750,15 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr,
slots = this_cpu_ptr(wp_on_reg);
debug_info = &current->thread.debug;
+ /*
+ * Find all watchpoints that match the reported address. If no exact
+ * match is found, attribute the hit to the closest watchpoint.
+ */
+ rcu_read_lock();
for (i = 0; i < core_num_wrps; ++i) {
- rcu_read_lock();
-
wp = slots[i];
-
if (wp == NULL)
- goto unlock;
-
- info = counter_arch_bp(wp);
- /* AArch32 watchpoints are either 4 or 8 bytes aligned. */
- if (is_compat_task()) {
- if (info->ctrl.len == ARM_BREAKPOINT_LEN_8)
- alignment_mask = 0x7;
- else
- alignment_mask = 0x3;
- } else {
- alignment_mask = 0x7;
- }
-
- /* Check if the watchpoint value matches. */
- val = read_wb_reg(AARCH64_DBG_REG_WVR, i);
- if (val != (untagged_addr(addr) & ~alignment_mask))
- goto unlock;
-
- /* Possible match, check the byte address select to confirm. */
- ctrl_reg = read_wb_reg(AARCH64_DBG_REG_WCR, i);
- decode_ctrl_reg(ctrl_reg, &ctrl);
- if (!((1 << (addr & alignment_mask)) & ctrl.len))
- goto unlock;
+ continue;
/*
* Check that the access type matches.
@@ -713,18 +767,41 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr,
access = (esr & AARCH64_ESR_ACCESS_MASK) ? HW_BREAKPOINT_W :
HW_BREAKPOINT_R;
if (!(access & hw_breakpoint_type(wp)))
- goto unlock;
+ continue;
+ /* Check if the watchpoint value and byte select match. */
+ val = read_wb_reg(AARCH64_DBG_REG_WVR, i);
+ ctrl_reg = read_wb_reg(AARCH64_DBG_REG_WCR, i);
+ decode_ctrl_reg(ctrl_reg, &ctrl);
+ dist = get_distance_from_watchpoint(addr, val, &ctrl);
+ if (dist < min_dist) {
+ min_dist = dist;
+ closest_match = i;
+ }
+ /* Is this an exact match? */
+ if (dist != 0)
+ continue;
+
+ info = counter_arch_bp(wp);
info->trigger = addr;
perf_bp_event(wp, regs);
/* Do we need to handle the stepping? */
if (is_default_overflow_handler(wp))
step = 1;
+ }
+ if (min_dist > 0 && min_dist != -1) {
+ /* No exact match found. */
+ wp = slots[closest_match];
+ info = counter_arch_bp(wp);
+ info->trigger = addr;
+ perf_bp_event(wp, regs);
-unlock:
- rcu_read_unlock();
+ /* Do we need to handle the stepping? */
+ if (is_default_overflow_handler(wp))
+ step = 1;
}
+ rcu_read_unlock();
if (!step)
return 0;
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index e917d119490c..d3e6f1bbc9fa 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -166,6 +166,70 @@ void machine_restart(char *cmd)
while (1);
}
+/*
+ * dump a block of kernel memory from around the given address
+ */
+static void show_data(unsigned long addr, int nbytes, const char *name)
+{
+ int i, j;
+ int nlines;
+ u32 *p;
+
+ /*
+ * don't attempt to dump non-kernel addresses or
+ * values that are probably just small negative numbers
+ */
+ if (addr < PAGE_OFFSET || addr > -256UL)
+ return;
+
+ printk("\n%s: %#lx:\n", name, addr);
+
+ /*
+ * round address down to a 32 bit boundary
+ * and always dump a multiple of 32 bytes
+ */
+ p = (u32 *)(addr & ~(sizeof(u32) - 1));
+ nbytes += (addr & (sizeof(u32) - 1));
+ nlines = (nbytes + 31) / 32;
+
+
+ for (i = 0; i < nlines; i++) {
+ /*
+ * just display low 16 bits of address to keep
+ * each line of the dump < 80 characters
+ */
+ printk("%04lx ", (unsigned long)p & 0xffff);
+ for (j = 0; j < 8; j++) {
+ u32 data;
+ if (probe_kernel_address(p, data)) {
+ printk(" ********");
+ } else {
+ printk(" %08x", data);
+ }
+ ++p;
+ }
+ printk("\n");
+ }
+}
+
+static void show_extra_register_data(struct pt_regs *regs, int nbytes)
+{
+ mm_segment_t fs;
+ unsigned int i;
+
+ fs = get_fs();
+ set_fs(KERNEL_DS);
+ show_data(regs->pc - nbytes, nbytes * 2, "PC");
+ show_data(regs->regs[30] - nbytes, nbytes * 2, "LR");
+ show_data(regs->sp - nbytes, nbytes * 2, "SP");
+ for (i = 0; i < 30; i++) {
+ char name[4];
+ snprintf(name, sizeof(name), "X%u", i);
+ show_data(regs->regs[i] - nbytes, nbytes * 2, name);
+ }
+ set_fs(fs);
+}
+
void __show_regs(struct pt_regs *regs)
{
int i, top_reg;
@@ -201,6 +265,8 @@ void __show_regs(struct pt_regs *regs)
pr_cont("\n");
}
+ if (!user_mode(regs))
+ show_extra_register_data(regs, 128);
printk("\n");
}
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 8eedeef375d6..a22161ccf447 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -327,13 +327,13 @@ static int ptrace_hbp_fill_attr_ctrl(unsigned int note_type,
struct arch_hw_breakpoint_ctrl ctrl,
struct perf_event_attr *attr)
{
- int err, len, type, disabled = !ctrl.enabled;
+ int err, len, type, offset, disabled = !ctrl.enabled;
attr->disabled = disabled;
if (disabled)
return 0;
- err = arch_bp_generic_fields(ctrl, &len, &type);
+ err = arch_bp_generic_fields(ctrl, &len, &type, &offset);
if (err)
return err;
@@ -352,6 +352,7 @@ static int ptrace_hbp_fill_attr_ctrl(unsigned int note_type,
attr->bp_len = len;
attr->bp_type = type;
+ attr->bp_addr += offset;
return 0;
}
@@ -404,7 +405,7 @@ static int ptrace_hbp_get_addr(unsigned int note_type,
if (IS_ERR(bp))
return PTR_ERR(bp);
- *addr = bp ? bp->attr.bp_addr : 0;
+ *addr = bp ? counter_arch_bp(bp)->address : 0;
return 0;
}
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index f534f492a268..f7545a7f6f29 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -291,6 +291,15 @@ void __init setup_arch(char **cmdline_p)
smp_init_cpus();
smp_build_mpidr_hash();
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+ /*
+ * Make sure init_thread_info.ttbr0 always generates translation
+ * faults in case uaccess_enable() is inadvertently called by the init
+ * thread.
+ */
+ init_thread_info.ttbr0 = virt_to_phys(empty_zero_page);
+#endif
+
#ifdef CONFIG_VT
#if defined(CONFIG_VGA_CONSOLE)
conswitchp = &vga_con;
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index 694f6deedbab..7758f7ff131b 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -19,10 +19,30 @@
#include <linux/nodemask.h>
#include <linux/of.h>
#include <linux/sched.h>
+#include <linux/sched.h>
+#include <linux/sched_energy.h>
#include <asm/cputype.h>
#include <asm/topology.h>
+static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;
+
+/*
+ * Return the capacity of @cpu taken from the cpu_scale per-CPU variable.
+ * With CONFIG_CPU_FREQ the value is additionally multiplied by
+ * cpufreq_scale_max_freq_capacity(cpu) and shifted right by
+ * SCHED_CAPACITY_SHIFT. @sd is not used.
+ */
+unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu)
+{
+	unsigned long capacity;
+
+#ifdef CONFIG_CPU_FREQ
+	capacity = cpufreq_scale_max_freq_capacity(cpu);
+	capacity *= per_cpu(cpu_scale, cpu);
+	capacity >>= SCHED_CAPACITY_SHIFT;
+#else
+	capacity = per_cpu(cpu_scale, cpu);
+#endif
+
+	return capacity;
+}
+
+/* Store @capacity in the cpu_scale per-CPU variable of @cpu. */
+static void set_capacity_scale(unsigned int cpu, unsigned long capacity)
+{
+ per_cpu(cpu_scale, cpu) = capacity;
+}
+
static int __init get_cpu_for_node(struct device_node *node)
{
struct device_node *cpu_node;
@@ -206,11 +226,72 @@ out:
struct cpu_topology cpu_topology[NR_CPUS];
EXPORT_SYMBOL_GPL(cpu_topology);
+/* sd energy functions */
+/*
+ * Return the SD_LEVEL1 entry of sge_array[] for @cpu, or NULL (after
+ * logging a warning) when that entry is not set.
+ */
+static inline
+const struct sched_group_energy * const cpu_cluster_energy(int cpu)
+{
+	struct sched_group_energy *cluster_sge = sge_array[cpu][SD_LEVEL1];
+
+	if (cluster_sge)
+		return cluster_sge;
+
+	pr_warn("Invalid sched_group_energy for Cluster%d\n", cpu);
+	return NULL;
+}
+
+/*
+ * Return the SD_LEVEL0 entry of sge_array[] for @cpu, or NULL (after
+ * logging a warning) when that entry is not set.
+ */
+static inline
+const struct sched_group_energy * const cpu_core_energy(int cpu)
+{
+	struct sched_group_energy *core_sge = sge_array[cpu][SD_LEVEL0];
+
+	if (core_sge)
+		return core_sge;
+
+	pr_warn("Invalid sched_group_energy for CPU%d\n", cpu);
+	return NULL;
+}
+
const struct cpumask *cpu_coregroup_mask(int cpu)
{
return &cpu_topology[cpu].core_sibling;
}
+/* Flags callback used by the DIE entry of arm64_topology[]. */
+static int cpu_cpu_flags(void)
+{
+ return SD_ASYM_CPUCAPACITY;
+}
+
+/* Flags callback used by the MC entry of arm64_topology[]. */
+static inline int cpu_corepower_flags(void)
+{
+ return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN | \
+ SD_SHARE_CAP_STATES;
+}
+
+/*
+ * Scheduler topology table handed to set_sched_topology() by
+ * init_cpu_topology(). Each entry supplies a cpumask callback, a flags
+ * callback, an energy callback and a level name. The MC entry is only
+ * present with CONFIG_SCHED_MC; the table ends with a NULL entry.
+ */
+static struct sched_domain_topology_level arm64_topology[] = {
+#ifdef CONFIG_SCHED_MC
+ { cpu_coregroup_mask, cpu_corepower_flags, cpu_core_energy, SD_INIT_NAME(MC) },
+#endif
+ { cpu_cpu_mask, cpu_cpu_flags, cpu_cluster_energy, SD_INIT_NAME(DIE) },
+ { NULL, },
+};
+
+/*
+ * update_cpu_capacity() - set the capacity scale of @cpu
+ * @cpu: CPU whose cpu_scale value is updated
+ *
+ * The capacity is the .cap value of the last cap_states[] entry of the
+ * CPU's core-level energy data. When cpu_core_energy() returns NULL or the
+ * table has no entries, SCHED_CAPACITY_SCALE is used instead. The resulting
+ * arch_scale_cpu_capacity() value is logged.
+ */
+static void update_cpu_capacity(unsigned int cpu)
+{
+	const struct sched_group_energy *sge = cpu_core_energy(cpu);
+	unsigned long capacity = SCHED_CAPACITY_SCALE;
+
+	/*
+	 * Look the energy data up once, and only index cap_states[] when it
+	 * has at least one entry; otherwise nr_cap_states - 1 would be an
+	 * out-of-range index.
+	 */
+	if (sge && sge->nr_cap_states > 0)
+		capacity = sge->cap_states[sge->nr_cap_states - 1].cap;
+
+	set_capacity_scale(cpu, capacity);
+
+	pr_info("CPU%d: update cpu_capacity %lu\n",
+		cpu, arch_scale_cpu_capacity(NULL, cpu));
+}
+
static void update_siblings_masks(unsigned int cpuid)
{
struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid];
@@ -272,6 +353,7 @@ void store_cpu_topology(unsigned int cpuid)
topology_populated:
update_siblings_masks(cpuid);
+ update_cpu_capacity(cpuid);
}
static void __init reset_cpu_topology(void)
@@ -302,4 +384,8 @@ void __init init_cpu_topology(void)
*/
if (of_have_populated_dt() && parse_dt_topology())
reset_cpu_topology();
+ else
+ set_sched_topology(arm64_topology);
+
+ init_sched_energy_costs();
}
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 5963be2e05f0..d210bbfcd09c 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -33,6 +33,7 @@
#include <linux/syscalls.h>
#include <asm/atomic.h>
+#include <asm/barrier.h>
#include <asm/bug.h>
#include <asm/debug-monitors.h>
#include <asm/esr.h>
@@ -435,9 +436,10 @@ int cpu_enable_cache_maint_trap(void *__unused)
}
#define __user_cache_maint(insn, address, res) \
- if (address >= user_addr_max()) \
+ if (address >= user_addr_max()) { \
res = -EFAULT; \
- else \
+ } else { \
+ uaccess_ttbr0_enable(); \
asm volatile ( \
"1: " insn ", %1\n" \
" mov %w0, #0\n" \
@@ -449,7 +451,9 @@ int cpu_enable_cache_maint_trap(void *__unused)
" .popsection\n" \
_ASM_EXTABLE(1b, 3b) \
: "=r" (res) \
- : "r" (address), "i" (-EFAULT) )
+ : "r" (address), "i" (-EFAULT)); \
+ uaccess_ttbr0_disable(); \
+ }
static void user_cache_maint_handler(unsigned int esr, struct pt_regs *regs)
{
@@ -492,6 +496,25 @@ static void ctr_read_handler(unsigned int esr, struct pt_regs *regs)
regs->pc += 4;
}
+/*
+ * Handler for the CNTVCT_EL0 read hook in sys64_hooks[]: after an isb(),
+ * store arch_counter_get_cntvct() in the register whose number is encoded
+ * in the RT field of @esr (register number 31 is not written), then
+ * advance the PC by 4.
+ */
+static void cntvct_read_handler(unsigned int esr, struct pt_regs *regs)
+{
+	const int dest = (esr & ESR_ELx_SYS64_ISS_RT_MASK) >>
+			 ESR_ELx_SYS64_ISS_RT_SHIFT;
+
+	isb();
+	if (dest != 31)
+		regs->regs[dest] = arch_counter_get_cntvct();
+
+	regs->pc += 4;
+}
+
+/*
+ * Handler for the CNTFRQ_EL0 read hook in sys64_hooks[]: store the value
+ * of cntfrq_el0 in the register whose number is encoded in the RT field of
+ * @esr (register number 31 is not written), then advance the PC by 4.
+ */
+static void cntfrq_read_handler(unsigned int esr, struct pt_regs *regs)
+{
+	const int dest = (esr & ESR_ELx_SYS64_ISS_RT_MASK) >>
+			 ESR_ELx_SYS64_ISS_RT_SHIFT;
+
+	if (dest != 31)
+		regs->regs[dest] = read_sysreg(cntfrq_el0);
+
+	regs->pc += 4;
+}
+
struct sys64_hook {
unsigned int esr_mask;
unsigned int esr_val;
@@ -510,6 +533,18 @@ static struct sys64_hook sys64_hooks[] = {
.esr_val = ESR_ELx_SYS64_ISS_SYS_CTR_READ,
.handler = ctr_read_handler,
},
+ {
+ /* Trap read access to CNTVCT_EL0 */
+ .esr_mask = ESR_ELx_SYS64_ISS_SYS_OP_MASK,
+ .esr_val = ESR_ELx_SYS64_ISS_SYS_CNTVCT,
+ .handler = cntvct_read_handler,
+ },
+ {
+ /* Trap read access to CNTFRQ_EL0 */
+ .esr_mask = ESR_ELx_SYS64_ISS_SYS_OP_MASK,
+ .esr_val = ESR_ELx_SYS64_ISS_SYS_CNTFRQ,
+ .handler = cntfrq_read_handler,
+ },
{},
};
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 6a584558b29d..34d3ed64fe8e 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -228,6 +228,11 @@ SECTIONS
swapper_pg_dir = .;
. += SWAPPER_DIR_SIZE;
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+ reserved_ttbr0 = .;
+ . += RESERVED_TTBR0_SIZE;
+#endif
+
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
tramp_pg_dir = .;
. += PAGE_SIZE;
diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S
index efbf610eaf4e..b581e16320dd 100644
--- a/arch/arm64/lib/clear_user.S
+++ b/arch/arm64/lib/clear_user.S
@@ -17,10 +17,7 @@
*/
#include <linux/linkage.h>
-#include <asm/alternative.h>
-#include <asm/assembler.h>
-#include <asm/cpufeature.h>
-#include <asm/sysreg.h>
+#include <asm/uaccess.h>
.text
@@ -33,8 +30,7 @@
* Alignment fixed up by hardware.
*/
ENTRY(__arch_clear_user)
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \
- CONFIG_ARM64_PAN)
+ uaccess_enable_not_uao x2, x3, x4
mov x2, x1 // save the size for fixup return
subs x1, x1, #8
b.mi 2f
@@ -54,8 +50,7 @@ uao_user_alternative 9f, strh, sttrh, wzr, x0, 2
b.mi 5f
uao_user_alternative 9f, strb, sttrb, wzr, x0, 0
5: mov x0, #0
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \
- CONFIG_ARM64_PAN)
+ uaccess_disable_not_uao x2, x3
ret
ENDPROC(__arch_clear_user)
diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
index 4fd67ea03bb0..c7a7d9689e8f 100644
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -16,11 +16,8 @@
#include <linux/linkage.h>
-#include <asm/alternative.h>
-#include <asm/assembler.h>
#include <asm/cache.h>
-#include <asm/cpufeature.h>
-#include <asm/sysreg.h>
+#include <asm/uaccess.h>
/*
* Copy from user space to a kernel buffer (alignment handled by the hardware)
@@ -67,12 +64,10 @@
end .req x5
ENTRY(__arch_copy_from_user)
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \
- CONFIG_ARM64_PAN)
+ uaccess_enable_not_uao x3, x4, x5
add end, x0, x2
#include "copy_template.S"
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \
- CONFIG_ARM64_PAN)
+ uaccess_disable_not_uao x3, x4
mov x0, #0 // Nothing to copy
ret
ENDPROC(__arch_copy_from_user)
diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S
index 841bf8f7fab7..800779eb3079 100644
--- a/arch/arm64/lib/copy_in_user.S
+++ b/arch/arm64/lib/copy_in_user.S
@@ -18,11 +18,8 @@
#include <linux/linkage.h>
-#include <asm/alternative.h>
-#include <asm/assembler.h>
#include <asm/cache.h>
-#include <asm/cpufeature.h>
-#include <asm/sysreg.h>
+#include <asm/uaccess.h>
/*
* Copy from user space to user space (alignment handled by the hardware)
@@ -68,12 +65,10 @@
end .req x5
ENTRY(__arch_copy_in_user)
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \
- CONFIG_ARM64_PAN)
+ uaccess_enable_not_uao x3, x4, x5
add end, x0, x2
#include "copy_template.S"
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \
- CONFIG_ARM64_PAN)
+ uaccess_disable_not_uao x3, x4
mov x0, #0
ret
ENDPROC(__arch_copy_in_user)
diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
index 7a7efe255034..f6cfcc0441de 100644
--- a/arch/arm64/lib/copy_to_user.S
+++ b/arch/arm64/lib/copy_to_user.S
@@ -16,11 +16,8 @@
#include <linux/linkage.h>
-#include <asm/alternative.h>
-#include <asm/assembler.h>
#include <asm/cache.h>
-#include <asm/cpufeature.h>
-#include <asm/sysreg.h>
+#include <asm/uaccess.h>
/*
* Copy to user space from a kernel buffer (alignment handled by the hardware)
@@ -66,12 +63,10 @@
end .req x5
ENTRY(__arch_copy_to_user)
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \
- CONFIG_ARM64_PAN)
+ uaccess_enable_not_uao x3, x4, x5
add end, x0, x2
#include "copy_template.S"
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \
- CONFIG_ARM64_PAN)
+ uaccess_disable_not_uao x3, x4
mov x0, #0
ret
ENDPROC(__arch_copy_to_user)
diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
index 58b5a906ff78..82ecb6c5f015 100644
--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -23,6 +23,7 @@
#include <asm/assembler.h>
#include <asm/cpufeature.h>
#include <asm/alternative.h>
+#include <asm/uaccess.h>
/*
* flush_icache_range(start,end)
@@ -48,6 +49,7 @@ ENTRY(flush_icache_range)
* - end - virtual end address of region
*/
ENTRY(__flush_cache_user_range)
+ uaccess_ttbr0_enable x2, x3, x4
dcache_line_size x2, x3
sub x3, x2, #1
bic x4, x0, x3
@@ -69,10 +71,12 @@ USER(9f, ic ivau, x4 ) // invalidate I line PoU
dsb ish
isb
mov x0, #0
+1:
+ uaccess_ttbr0_disable x1, x2
ret
9:
mov x0, #-EFAULT
- ret
+ b 1b
ENDPROC(flush_icache_range)
ENDPROC(__flush_cache_user_range)
diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index 62d976e843fc..c841ccecbc4d 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -233,7 +233,12 @@ switch_mm_fastpath:
arm64_apply_bp_hardening();
- cpu_switch_mm(mm->pgd, mm);
+ /*
+ * Defer TTBR0_EL1 setting for user threads to uaccess_enable() when
+ * emulating PAN.
+ */
+ if (!system_uses_ttbr0_pan())
+ cpu_switch_mm(mm->pgd, mm);
}
/* Errata workaround post TTBRx_EL1 update. */
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index cab3574ab7d9..6dda5ff63930 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -174,7 +174,7 @@ static void *__dma_alloc(struct device *dev, size_t size,
/* create a coherent mapping */
page = virt_to_page(ptr);
coherent_ptr = dma_common_contiguous_remap(page, size, VM_USERMAP,
- prot, NULL);
+ prot, __builtin_return_address(0));
if (!coherent_ptr)
goto no_map;
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index ad49ae8f3967..b9bff2258e59 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -286,13 +286,19 @@ out:
return fault;
}
-static inline bool is_permission_fault(unsigned int esr)
+static inline bool is_permission_fault(unsigned int esr, struct pt_regs *regs)
{
unsigned int ec = ESR_ELx_EC(esr);
unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE;
- return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM) ||
- (ec == ESR_ELx_EC_IABT_CUR && fsc_type == ESR_ELx_FSC_PERM);
+ if (ec != ESR_ELx_EC_DABT_CUR && ec != ESR_ELx_EC_IABT_CUR)
+ return false;
+
+ if (system_uses_ttbr0_pan())
+ return fsc_type == ESR_ELx_FSC_FAULT &&
+ (regs->pstate & PSR_PAN_BIT);
+ else
+ return fsc_type == ESR_ELx_FSC_PERM;
}
static bool is_el0_instruction_abort(unsigned int esr)
@@ -332,7 +338,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
mm_flags |= FAULT_FLAG_WRITE;
}
- if (is_permission_fault(esr) && (addr < TASK_SIZE)) {
+ if (addr < TASK_SIZE && is_permission_fault(esr, regs)) {
/* regs->orig_addr_limit may be 0 if we entered from EL0 */
if (regs->orig_addr_limit == KERNEL_DS)
die("Accessing user space memory with fs=KERNEL_DS", regs, esr);
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index 18d96d349a8b..d8fd0d0acea5 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -134,6 +134,9 @@ ENDPROC(cpu_do_resume)
ENTRY(cpu_do_switch_mm)
mrs x2, ttbr1_el1
mmid x1, x1 // get mm->context.id
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+ bfi x0, x1, #48, #16 // set the ASID field in TTBR0
+#endif
bfi x2, x1, #48, #16 // set the ASID
msr ttbr1_el1, x2 // in TTBR1 (since TCR.A1 is set)
isb
diff --git a/arch/arm64/xen/hypercall.S b/arch/arm64/xen/hypercall.S
index 329c8027b0a9..69711f24b743 100644
--- a/arch/arm64/xen/hypercall.S
+++ b/arch/arm64/xen/hypercall.S
@@ -49,6 +49,7 @@
#include <linux/linkage.h>
#include <asm/assembler.h>
+#include <asm/uaccess.h>
#include <xen/interface/xen.h>
@@ -91,6 +92,20 @@ ENTRY(privcmd_call)
mov x2, x3
mov x3, x4
mov x4, x5
+ /*
+ * Privcmd calls are issued by userspace. The kernel needs to
+ * enable access to TTBR0_EL1 as the hypervisor would issue stage 1
+ * translations to user memory via AT instructions. Since AT
+ * instructions are not affected by the PAN bit (ARMv8.1), we only
+ * need the explicit uaccess_enable/disable if the TTBR0 PAN emulation
+ * is enabled (it implies that hardware UAO and PAN are disabled).
+ */
+ uaccess_ttbr0_enable x6, x7, x8
hvc XEN_IMM
+
+ /*
+ * Disable userspace access from kernel once the hyp call has completed.
+ */
+ uaccess_ttbr0_disable x6, x7
ret
ENDPROC(privcmd_call);
diff --git a/arch/avr32/include/uapi/asm/socket.h b/arch/avr32/include/uapi/asm/socket.h
index 1fd147f09a38..5f10f9bcd417 100644
--- a/arch/avr32/include/uapi/asm/socket.h
+++ b/arch/avr32/include/uapi/asm/socket.h
@@ -90,4 +90,6 @@
#define SO_CNX_ADVICE 53
+#define SO_COOKIE 57
+
#endif /* _UAPI__ASM_AVR32_SOCKET_H */
diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h
index afbc98f02d27..ed960d3af35d 100644
--- a/arch/frv/include/uapi/asm/socket.h
+++ b/arch/frv/include/uapi/asm/socket.h
@@ -90,5 +90,7 @@
#define SO_CNX_ADVICE 53
+#define SO_COOKIE 57
+
#endif /* _ASM_SOCKET_H */
diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h
index 0018fad9039f..9790d139f1c9 100644
--- a/arch/ia64/include/uapi/asm/socket.h
+++ b/arch/ia64/include/uapi/asm/socket.h
@@ -99,4 +99,6 @@
#define SO_CNX_ADVICE 53
+#define SO_COOKIE 57
+
#endif /* _ASM_IA64_SOCKET_H */
diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h
index 5fe42fc7b6c5..ad2567655e65 100644
--- a/arch/m32r/include/uapi/asm/socket.h
+++ b/arch/m32r/include/uapi/asm/socket.h
@@ -90,4 +90,6 @@
#define SO_CNX_ADVICE 53
+#define SO_COOKIE 57
+
#endif /* _ASM_M32R_SOCKET_H */
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index 2027240aafbb..2f106d0357f4 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -108,4 +108,6 @@
#define SO_CNX_ADVICE 53
+#define SO_COOKIE 57
+
#endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h
index 5129f23a9ee1..69f96180a3f4 100644
--- a/arch/mn10300/include/uapi/asm/socket.h
+++ b/arch/mn10300/include/uapi/asm/socket.h
@@ -90,4 +90,6 @@
#define SO_CNX_ADVICE 53
+#define SO_COOKIE 57
+
#endif /* _ASM_SOCKET_H */
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index 9c935d717df9..b96a193a2a4d 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -89,4 +89,6 @@
#define SO_CNX_ADVICE 0x402E
+#define SO_COOKIE 0x4032
+
#endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/powerpc/include/uapi/asm/socket.h b/arch/powerpc/include/uapi/asm/socket.h
index 1672e3398270..e78550f71833 100644
--- a/arch/powerpc/include/uapi/asm/socket.h
+++ b/arch/powerpc/include/uapi/asm/socket.h
@@ -97,4 +97,6 @@
#define SO_CNX_ADVICE 53
+#define SO_COOKIE 57
+
#endif /* _ASM_POWERPC_SOCKET_H */
diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h
index 41b51c2f4f1b..04fe908755b5 100644
--- a/arch/s390/include/uapi/asm/socket.h
+++ b/arch/s390/include/uapi/asm/socket.h
@@ -96,4 +96,6 @@
#define SO_CNX_ADVICE 53
+#define SO_COOKIE 57
+
#endif /* _ASM_SOCKET_H */
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index 31aede3af088..de15f0a09b32 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -86,6 +86,8 @@
#define SO_CNX_ADVICE 0x0037
+#define SO_COOKIE 0x003b
+
/* Security levels - as per NRL IPv6 - don't actually do anything */
#define SO_SECURITY_AUTHENTICATION 0x5001
#define SO_SECURITY_ENCRYPTION_TRANSPORT 0x5002
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index b5226a009973..93b170ea2480 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -110,6 +110,8 @@ else
KBUILD_CFLAGS += $(call cc-option,-mno-80387)
KBUILD_CFLAGS += $(call cc-option,-mno-fp-ret-in-387)
+ KBUILD_CFLAGS += -fno-pic
+
# By default gcc and clang use a stack alignment of 16 bytes for x86.
# However the standard kernel entry on x86-64 leaves the stack on an
# 8-byte boundary. If the compiler isn't informed about the actual
diff --git a/arch/x86/configs/i386_ranchu_defconfig b/arch/x86/configs/i386_ranchu_defconfig
new file mode 100644
index 000000000000..a1c83c4e78ae
--- /dev/null
+++ b/arch/x86/configs/i386_ranchu_defconfig
@@ -0,0 +1,424 @@
+# CONFIG_64BIT is not set
+# CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_POSIX_MQUEUE=y
+CONFIG_AUDIT=y
+CONFIG_NO_HZ=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_BSD_PROCESS_ACCT=y
+CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
+CONFIG_TASK_XACCT=y
+CONFIG_TASK_IO_ACCOUNTING=y
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_DEBUG=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_RT_GROUP_SCHED=y
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_SIZE=y
+CONFIG_SYSCTL_SYSCALL=y
+CONFIG_KALLSYMS_ALL=y
+CONFIG_EMBEDDED=y
+# CONFIG_COMPAT_BRK is not set
+CONFIG_ARCH_MMAP_RND_BITS=16
+CONFIG_PARTITION_ADVANCED=y
+CONFIG_OSF_PARTITION=y
+CONFIG_AMIGA_PARTITION=y
+CONFIG_MAC_PARTITION=y
+CONFIG_BSD_DISKLABEL=y
+CONFIG_MINIX_SUBPARTITION=y
+CONFIG_SOLARIS_X86_PARTITION=y
+CONFIG_UNIXWARE_DISKLABEL=y
+CONFIG_SGI_PARTITION=y
+CONFIG_SUN_PARTITION=y
+CONFIG_KARMA_PARTITION=y
+CONFIG_SMP=y
+CONFIG_X86_BIGSMP=y
+CONFIG_MCORE2=y
+CONFIG_X86_GENERIC=y
+CONFIG_HPET_TIMER=y
+CONFIG_NR_CPUS=512
+CONFIG_PREEMPT=y
+# CONFIG_X86_MCE is not set
+CONFIG_X86_REBOOTFIXUPS=y
+CONFIG_X86_MSR=y
+CONFIG_X86_CPUID=y
+CONFIG_KSM=y
+CONFIG_CMA=y
+# CONFIG_MTRR_SANITIZER is not set
+CONFIG_EFI=y
+CONFIG_EFI_STUB=y
+CONFIG_HZ_100=y
+CONFIG_PHYSICAL_START=0x100000
+CONFIG_PM_AUTOSLEEP=y
+CONFIG_PM_WAKELOCKS=y
+CONFIG_PM_WAKELOCKS_LIMIT=0
+# CONFIG_PM_WAKELOCKS_GC is not set
+CONFIG_PM_DEBUG=y
+CONFIG_CPU_FREQ=y
+# CONFIG_CPU_FREQ_STAT is not set
+CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
+CONFIG_CPU_FREQ_GOV_USERSPACE=y
+CONFIG_PCIEPORTBUS=y
+# CONFIG_PCIEASPM is not set
+CONFIG_PCCARD=y
+CONFIG_YENTA=y
+CONFIG_HOTPLUG_PCI=y
+# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
+CONFIG_BINFMT_MISC=y
+CONFIG_NET=y
+CONFIG_PACKET=y
+CONFIG_UNIX=y
+CONFIG_XFRM_USER=y
+CONFIG_NET_KEY=y
+CONFIG_INET=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_ROUTE_MULTIPATH=y
+CONFIG_IP_ROUTE_VERBOSE=y
+CONFIG_IP_PNP=y
+CONFIG_IP_PNP_DHCP=y
+CONFIG_IP_PNP_BOOTP=y
+CONFIG_IP_PNP_RARP=y
+CONFIG_IP_MROUTE=y
+CONFIG_IP_PIMSM_V1=y
+CONFIG_IP_PIMSM_V2=y
+CONFIG_SYN_COOKIES=y
+CONFIG_INET_ESP=y
+# CONFIG_INET_XFRM_MODE_BEET is not set
+# CONFIG_INET_LRO is not set
+CONFIG_INET_DIAG_DESTROY=y
+CONFIG_IPV6_ROUTER_PREF=y
+CONFIG_IPV6_ROUTE_INFO=y
+CONFIG_IPV6_OPTIMISTIC_DAD=y
+CONFIG_INET6_AH=y
+CONFIG_INET6_ESP=y
+CONFIG_INET6_IPCOMP=y
+CONFIG_IPV6_MIP6=y
+CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_NETLABEL=y
+CONFIG_NETFILTER=y
+CONFIG_NF_CONNTRACK=y
+CONFIG_NF_CONNTRACK_SECMARK=y
+CONFIG_NF_CONNTRACK_EVENTS=y
+CONFIG_NF_CT_PROTO_DCCP=y
+CONFIG_NF_CT_PROTO_SCTP=y
+CONFIG_NF_CT_PROTO_UDPLITE=y
+CONFIG_NF_CONNTRACK_AMANDA=y
+CONFIG_NF_CONNTRACK_FTP=y
+CONFIG_NF_CONNTRACK_H323=y
+CONFIG_NF_CONNTRACK_IRC=y
+CONFIG_NF_CONNTRACK_NETBIOS_NS=y
+CONFIG_NF_CONNTRACK_PPTP=y
+CONFIG_NF_CONNTRACK_SANE=y
+CONFIG_NF_CONNTRACK_TFTP=y
+CONFIG_NF_CT_NETLINK=y
+CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y
+CONFIG_NETFILTER_XT_TARGET_CONNMARK=y
+CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
+CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y
+CONFIG_NETFILTER_XT_TARGET_MARK=y
+CONFIG_NETFILTER_XT_TARGET_NFLOG=y
+CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y
+CONFIG_NETFILTER_XT_TARGET_TPROXY=y
+CONFIG_NETFILTER_XT_TARGET_TRACE=y
+CONFIG_NETFILTER_XT_TARGET_SECMARK=y
+CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
+CONFIG_NETFILTER_XT_MATCH_COMMENT=y
+CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_CONNMARK=y
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
+CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_HELPER=y
+CONFIG_NETFILTER_XT_MATCH_IPRANGE=y
+CONFIG_NETFILTER_XT_MATCH_LENGTH=y
+CONFIG_NETFILTER_XT_MATCH_LIMIT=y
+CONFIG_NETFILTER_XT_MATCH_MAC=y
+CONFIG_NETFILTER_XT_MATCH_MARK=y
+CONFIG_NETFILTER_XT_MATCH_POLICY=y
+CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y
+CONFIG_NETFILTER_XT_MATCH_QTAGUID=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA2=y
+CONFIG_NETFILTER_XT_MATCH_SOCKET=y
+CONFIG_NETFILTER_XT_MATCH_STATE=y
+CONFIG_NETFILTER_XT_MATCH_STATISTIC=y
+CONFIG_NETFILTER_XT_MATCH_STRING=y
+CONFIG_NETFILTER_XT_MATCH_TIME=y
+CONFIG_NETFILTER_XT_MATCH_U32=y
+CONFIG_NF_CONNTRACK_IPV4=y
+CONFIG_IP_NF_IPTABLES=y
+CONFIG_IP_NF_MATCH_AH=y
+CONFIG_IP_NF_MATCH_ECN=y
+CONFIG_IP_NF_MATCH_TTL=y
+CONFIG_IP_NF_FILTER=y
+CONFIG_IP_NF_TARGET_REJECT=y
+CONFIG_IP_NF_MANGLE=y
+CONFIG_IP_NF_RAW=y
+CONFIG_IP_NF_SECURITY=y
+CONFIG_IP_NF_ARPTABLES=y
+CONFIG_IP_NF_ARPFILTER=y
+CONFIG_IP_NF_ARP_MANGLE=y
+CONFIG_NF_CONNTRACK_IPV6=y
+CONFIG_IP6_NF_IPTABLES=y
+CONFIG_IP6_NF_FILTER=y
+CONFIG_IP6_NF_TARGET_REJECT=y
+CONFIG_IP6_NF_MANGLE=y
+CONFIG_IP6_NF_RAW=y
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_HTB=y
+CONFIG_NET_CLS_U32=y
+CONFIG_NET_EMATCH=y
+CONFIG_NET_EMATCH_U32=y
+CONFIG_NET_CLS_ACT=y
+CONFIG_CFG80211=y
+CONFIG_MAC80211=y
+CONFIG_MAC80211_LEDS=y
+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
+CONFIG_DMA_CMA=y
+CONFIG_CMA_SIZE_MBYTES=16
+CONFIG_CONNECTOR=y
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=8192
+CONFIG_VIRTIO_BLK=y
+CONFIG_BLK_DEV_SD=y
+CONFIG_BLK_DEV_SR=y
+CONFIG_BLK_DEV_SR_VENDOR=y
+CONFIG_CHR_DEV_SG=y
+CONFIG_SCSI_CONSTANTS=y
+CONFIG_SCSI_SPI_ATTRS=y
+CONFIG_SCSI_ISCSI_ATTRS=y
+# CONFIG_SCSI_LOWLEVEL is not set
+CONFIG_ATA=y
+CONFIG_SATA_AHCI=y
+CONFIG_ATA_PIIX=y
+CONFIG_PATA_AMD=y
+CONFIG_PATA_OLDPIIX=y
+CONFIG_PATA_SCH=y
+CONFIG_PATA_MPIIX=y
+CONFIG_ATA_GENERIC=y
+CONFIG_MD=y
+CONFIG_BLK_DEV_MD=y
+CONFIG_BLK_DEV_DM=y
+CONFIG_DM_DEBUG=y
+CONFIG_DM_CRYPT=y
+CONFIG_DM_MIRROR=y
+CONFIG_DM_ZERO=y
+CONFIG_DM_UEVENT=y
+CONFIG_DM_VERITY=y
+CONFIG_DM_VERITY_FEC=y
+CONFIG_NETDEVICES=y
+CONFIG_NETCONSOLE=y
+CONFIG_TUN=y
+CONFIG_VIRTIO_NET=y
+CONFIG_BNX2=y
+CONFIG_TIGON3=y
+CONFIG_NET_TULIP=y
+CONFIG_E100=y
+CONFIG_E1000=y
+CONFIG_E1000E=y
+CONFIG_SKY2=y
+CONFIG_NE2K_PCI=y
+CONFIG_FORCEDETH=y
+CONFIG_8139TOO=y
+# CONFIG_8139TOO_PIO is not set
+CONFIG_R8169=y
+CONFIG_FDDI=y
+CONFIG_PPP=y
+CONFIG_PPP_BSDCOMP=y
+CONFIG_PPP_DEFLATE=y
+CONFIG_PPP_MPPE=y
+CONFIG_PPPOLAC=y
+CONFIG_PPPOPNS=y
+CONFIG_USB_USBNET=y
+CONFIG_INPUT_POLLDEV=y
+# CONFIG_INPUT_MOUSEDEV_PSAUX is not set
+CONFIG_INPUT_EVDEV=y
+CONFIG_INPUT_KEYRESET=y
+# CONFIG_KEYBOARD_ATKBD is not set
+CONFIG_KEYBOARD_GOLDFISH_EVENTS=y
+# CONFIG_INPUT_MOUSE is not set
+CONFIG_INPUT_JOYSTICK=y
+CONFIG_JOYSTICK_XPAD=y
+CONFIG_JOYSTICK_XPAD_FF=y
+CONFIG_JOYSTICK_XPAD_LEDS=y
+CONFIG_INPUT_TABLET=y
+CONFIG_TABLET_USB_ACECAD=y
+CONFIG_TABLET_USB_AIPTEK=y
+CONFIG_TABLET_USB_GTCO=y
+CONFIG_TABLET_USB_HANWANG=y
+CONFIG_TABLET_USB_KBTAB=y
+CONFIG_INPUT_TOUCHSCREEN=y
+CONFIG_INPUT_MISC=y
+CONFIG_INPUT_KEYCHORD=y
+CONFIG_INPUT_UINPUT=y
+CONFIG_INPUT_GPIO=y
+# CONFIG_SERIO is not set
+# CONFIG_VT is not set
+# CONFIG_LEGACY_PTYS is not set
+CONFIG_SERIAL_NONSTANDARD=y
+# CONFIG_DEVMEM is not set
+# CONFIG_DEVKMEM is not set
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_VIRTIO_CONSOLE=y
+CONFIG_NVRAM=y
+CONFIG_I2C_I801=y
+CONFIG_BATTERY_GOLDFISH=y
+CONFIG_WATCHDOG=y
+CONFIG_MEDIA_SUPPORT=y
+CONFIG_AGP=y
+CONFIG_AGP_AMD64=y
+CONFIG_AGP_INTEL=y
+CONFIG_DRM=y
+CONFIG_FB_MODE_HELPERS=y
+CONFIG_FB_TILEBLITTING=y
+CONFIG_FB_EFI=y
+CONFIG_FB_GOLDFISH=y
+CONFIG_BACKLIGHT_LCD_SUPPORT=y
+# CONFIG_LCD_CLASS_DEVICE is not set
+CONFIG_SOUND=y
+CONFIG_SND=y
+CONFIG_HIDRAW=y
+CONFIG_UHID=y
+CONFIG_HID_A4TECH=y
+CONFIG_HID_ACRUX=y
+CONFIG_HID_ACRUX_FF=y
+CONFIG_HID_APPLE=y
+CONFIG_HID_BELKIN=y
+CONFIG_HID_CHERRY=y
+CONFIG_HID_CHICONY=y
+CONFIG_HID_PRODIKEYS=y
+CONFIG_HID_CYPRESS=y
+CONFIG_HID_DRAGONRISE=y
+CONFIG_DRAGONRISE_FF=y
+CONFIG_HID_EMS_FF=y
+CONFIG_HID_ELECOM=y
+CONFIG_HID_EZKEY=y
+CONFIG_HID_HOLTEK=y
+CONFIG_HID_KEYTOUCH=y
+CONFIG_HID_KYE=y
+CONFIG_HID_UCLOGIC=y
+CONFIG_HID_WALTOP=y
+CONFIG_HID_GYRATION=y
+CONFIG_HID_TWINHAN=y
+CONFIG_HID_KENSINGTON=y
+CONFIG_HID_LCPOWER=y
+CONFIG_HID_LOGITECH=y
+CONFIG_HID_LOGITECH_DJ=y
+CONFIG_LOGITECH_FF=y
+CONFIG_LOGIRUMBLEPAD2_FF=y
+CONFIG_LOGIG940_FF=y
+CONFIG_HID_MAGICMOUSE=y
+CONFIG_HID_MICROSOFT=y
+CONFIG_HID_MONTEREY=y
+CONFIG_HID_MULTITOUCH=y
+CONFIG_HID_NTRIG=y
+CONFIG_HID_ORTEK=y
+CONFIG_HID_PANTHERLORD=y
+CONFIG_PANTHERLORD_FF=y
+CONFIG_HID_PETALYNX=y
+CONFIG_HID_PICOLCD=y
+CONFIG_HID_PRIMAX=y
+CONFIG_HID_ROCCAT=y
+CONFIG_HID_SAITEK=y
+CONFIG_HID_SAMSUNG=y
+CONFIG_HID_SONY=y
+CONFIG_HID_SPEEDLINK=y
+CONFIG_HID_SUNPLUS=y
+CONFIG_HID_GREENASIA=y
+CONFIG_GREENASIA_FF=y
+CONFIG_HID_SMARTJOYPLUS=y
+CONFIG_SMARTJOYPLUS_FF=y
+CONFIG_HID_TIVO=y
+CONFIG_HID_TOPSEED=y
+CONFIG_HID_THRUSTMASTER=y
+CONFIG_HID_WACOM=y
+CONFIG_HID_WIIMOTE=y
+CONFIG_HID_ZEROPLUS=y
+CONFIG_HID_ZYDACRON=y
+CONFIG_HID_PID=y
+CONFIG_USB_HIDDEV=y
+CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
+CONFIG_USB_MON=y
+CONFIG_USB_EHCI_HCD=y
+# CONFIG_USB_EHCI_TT_NEWSCHED is not set
+CONFIG_USB_OHCI_HCD=y
+CONFIG_USB_UHCI_HCD=y
+CONFIG_USB_PRINTER=y
+CONFIG_USB_STORAGE=y
+CONFIG_USB_OTG_WAKELOCK=y
+CONFIG_EDAC=y
+CONFIG_RTC_CLASS=y
+# CONFIG_RTC_HCTOSYS is not set
+CONFIG_DMADEVICES=y
+CONFIG_VIRTIO_PCI=y
+CONFIG_STAGING=y
+CONFIG_ASHMEM=y
+CONFIG_ANDROID_LOW_MEMORY_KILLER=y
+CONFIG_SYNC=y
+CONFIG_SW_SYNC=y
+CONFIG_SYNC_FILE=y
+CONFIG_ION=y
+CONFIG_GOLDFISH_AUDIO=y
+CONFIG_SND_HDA_INTEL=y
+CONFIG_GOLDFISH=y
+CONFIG_GOLDFISH_PIPE=y
+CONFIG_GOLDFISH_SYNC=y
+CONFIG_ANDROID=y
+CONFIG_ANDROID_BINDER_IPC=y
+CONFIG_ISCSI_IBFT_FIND=y
+CONFIG_EXT4_FS=y
+CONFIG_EXT4_FS_SECURITY=y
+CONFIG_QUOTA=y
+CONFIG_QUOTA_NETLINK_INTERFACE=y
+# CONFIG_PRINT_QUOTA_WARNING is not set
+CONFIG_FUSE_FS=y
+CONFIG_ISO9660_FS=y
+CONFIG_JOLIET=y
+CONFIG_ZISOFS=y
+CONFIG_MSDOS_FS=y
+CONFIG_VFAT_FS=y
+CONFIG_PROC_KCORE=y
+CONFIG_TMPFS=y
+CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_HUGETLBFS=y
+CONFIG_PSTORE=y
+CONFIG_PSTORE_CONSOLE=y
+CONFIG_PSTORE_RAM=y
+# CONFIG_NETWORK_FILESYSTEMS is not set
+CONFIG_NLS_DEFAULT="utf8"
+CONFIG_NLS_CODEPAGE_437=y
+CONFIG_NLS_ASCII=y
+CONFIG_NLS_ISO8859_1=y
+CONFIG_NLS_UTF8=y
+CONFIG_PRINTK_TIME=y
+CONFIG_DEBUG_INFO=y
+# CONFIG_ENABLE_WARN_DEPRECATED is not set
+# CONFIG_ENABLE_MUST_CHECK is not set
+CONFIG_FRAME_WARN=2048
+# CONFIG_UNUSED_SYMBOLS is not set
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_DEBUG_MEMORY_INIT=y
+CONFIG_PANIC_TIMEOUT=5
+CONFIG_SCHEDSTATS=y
+CONFIG_TIMER_STATS=y
+CONFIG_SCHED_TRACER=y
+CONFIG_BLK_DEV_IO_TRACE=y
+CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
+CONFIG_KEYS=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_SELINUX=y
+CONFIG_CRYPTO_AES_586=y
+CONFIG_CRYPTO_TWOFISH=y
+CONFIG_ASYMMETRIC_KEY_TYPE=y
+CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y
+CONFIG_X509_CERTIFICATE_PARSER=y
+CONFIG_PKCS7_MESSAGE_PARSER=y
+CONFIG_PKCS7_TEST_KEY=y
+# CONFIG_VIRTUALIZATION is not set
+CONFIG_CRC_T10DIF=y
diff --git a/arch/x86/configs/x86_64_cuttlefish_defconfig b/arch/x86/configs/x86_64_cuttlefish_defconfig
new file mode 100644
index 000000000000..5b06edda428f
--- /dev/null
+++ b/arch/x86/configs/x86_64_cuttlefish_defconfig
@@ -0,0 +1,454 @@
+CONFIG_POSIX_MQUEUE=y
+# CONFIG_FHANDLE is not set
+# CONFIG_USELIB is not set
+CONFIG_AUDIT=y
+CONFIG_NO_HZ=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_BSD_PROCESS_ACCT=y
+CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
+CONFIG_TASK_XACCT=y
+CONFIG_TASK_IO_ACCOUNTING=y
+CONFIG_IKCONFIG=y
+CONFIG_IKCONFIG_PROC=y
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_MEMCG=y
+CONFIG_MEMCG_SWAP=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_RT_GROUP_SCHED=y
+CONFIG_CGROUP_BPF=y
+CONFIG_NAMESPACES=y
+CONFIG_BLK_DEV_INITRD=y
+# CONFIG_RD_LZ4 is not set
+CONFIG_KALLSYMS_ALL=y
+# CONFIG_PCSPKR_PLATFORM is not set
+CONFIG_BPF_SYSCALL=y
+CONFIG_EMBEDDED=y
+# CONFIG_COMPAT_BRK is not set
+CONFIG_PROFILING=y
+CONFIG_OPROFILE=y
+CONFIG_KPROBES=y
+CONFIG_JUMP_LABEL=y
+CONFIG_CC_STACKPROTECTOR_STRONG=y
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+CONFIG_MODVERSIONS=y
+CONFIG_PARTITION_ADVANCED=y
+CONFIG_SMP=y
+CONFIG_HYPERVISOR_GUEST=y
+CONFIG_PARAVIRT=y
+CONFIG_PARAVIRT_SPINLOCKS=y
+CONFIG_MCORE2=y
+CONFIG_PROCESSOR_SELECT=y
+# CONFIG_CPU_SUP_CENTAUR is not set
+CONFIG_NR_CPUS=8
+CONFIG_PREEMPT=y
+# CONFIG_MICROCODE is not set
+CONFIG_X86_MSR=y
+CONFIG_X86_CPUID=y
+CONFIG_KSM=y
+CONFIG_DEFAULT_MMAP_MIN_ADDR=65536
+CONFIG_TRANSPARENT_HUGEPAGE=y
+# CONFIG_MTRR is not set
+CONFIG_HZ_100=y
+CONFIG_KEXEC=y
+CONFIG_CRASH_DUMP=y
+CONFIG_PHYSICAL_START=0x200000
+CONFIG_RANDOMIZE_BASE=y
+CONFIG_PHYSICAL_ALIGN=0x1000000
+CONFIG_CMDLINE_BOOL=y
+CONFIG_CMDLINE="console=ttyS0 reboot=p nopti"
+CONFIG_PM_WAKELOCKS=y
+CONFIG_PM_WAKELOCKS_LIMIT=0
+# CONFIG_PM_WAKELOCKS_GC is not set
+CONFIG_PM_DEBUG=y
+CONFIG_ACPI_PROCFS_POWER=y
+# CONFIG_ACPI_FAN is not set
+# CONFIG_ACPI_THERMAL is not set
+# CONFIG_X86_PM_TIMER is not set
+CONFIG_CPU_FREQ=y
+CONFIG_CPU_FREQ_GOV_ONDEMAND=y
+CONFIG_X86_ACPI_CPUFREQ=y
+# CONFIG_X86_ACPI_CPUFREQ_CPB is not set
+CONFIG_PCI_MMCONFIG=y
+CONFIG_PCI_MSI=y
+# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
+CONFIG_BINFMT_MISC=y
+CONFIG_IA32_EMULATION=y
+CONFIG_NET=y
+CONFIG_PACKET=y
+CONFIG_UNIX=y
+CONFIG_XFRM_USER=y
+CONFIG_NET_KEY=y
+CONFIG_INET=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_ROUTE_MULTIPATH=y
+CONFIG_IP_ROUTE_VERBOSE=y
+CONFIG_IP_MROUTE=y
+CONFIG_IP_PIMSM_V1=y
+CONFIG_IP_PIMSM_V2=y
+CONFIG_SYN_COOKIES=y
+CONFIG_INET_ESP=y
+# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
+# CONFIG_INET_XFRM_MODE_BEET is not set
+CONFIG_INET_DIAG_DESTROY=y
+CONFIG_TCP_CONG_ADVANCED=y
+# CONFIG_TCP_CONG_BIC is not set
+# CONFIG_TCP_CONG_WESTWOOD is not set
+# CONFIG_TCP_CONG_HTCP is not set
+CONFIG_TCP_MD5SIG=y
+CONFIG_IPV6_ROUTER_PREF=y
+CONFIG_IPV6_ROUTE_INFO=y
+CONFIG_IPV6_OPTIMISTIC_DAD=y
+CONFIG_INET6_AH=y
+CONFIG_INET6_ESP=y
+CONFIG_INET6_IPCOMP=y
+CONFIG_IPV6_MIP6=y
+CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_NETLABEL=y
+CONFIG_NETFILTER=y
+CONFIG_NF_CONNTRACK=y
+CONFIG_NF_CONNTRACK_SECMARK=y
+CONFIG_NF_CONNTRACK_EVENTS=y
+CONFIG_NF_CT_PROTO_DCCP=y
+CONFIG_NF_CT_PROTO_SCTP=y
+CONFIG_NF_CT_PROTO_UDPLITE=y
+CONFIG_NF_CONNTRACK_AMANDA=y
+CONFIG_NF_CONNTRACK_FTP=y
+CONFIG_NF_CONNTRACK_H323=y
+CONFIG_NF_CONNTRACK_IRC=y
+CONFIG_NF_CONNTRACK_NETBIOS_NS=y
+CONFIG_NF_CONNTRACK_PPTP=y
+CONFIG_NF_CONNTRACK_SANE=y
+CONFIG_NF_CONNTRACK_TFTP=y
+CONFIG_NF_CT_NETLINK=y
+CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y
+CONFIG_NETFILTER_XT_TARGET_CONNMARK=y
+CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
+CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y
+CONFIG_NETFILTER_XT_TARGET_MARK=y
+CONFIG_NETFILTER_XT_TARGET_NFLOG=y
+CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y
+CONFIG_NETFILTER_XT_TARGET_TPROXY=y
+CONFIG_NETFILTER_XT_TARGET_TRACE=y
+CONFIG_NETFILTER_XT_TARGET_SECMARK=y
+CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
+CONFIG_NETFILTER_XT_MATCH_COMMENT=y
+CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_CONNMARK=y
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
+CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_HELPER=y
+CONFIG_NETFILTER_XT_MATCH_IPRANGE=y
+CONFIG_NETFILTER_XT_MATCH_LENGTH=y
+CONFIG_NETFILTER_XT_MATCH_LIMIT=y
+CONFIG_NETFILTER_XT_MATCH_MAC=y
+CONFIG_NETFILTER_XT_MATCH_MARK=y
+CONFIG_NETFILTER_XT_MATCH_POLICY=y
+CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y
+CONFIG_NETFILTER_XT_MATCH_QTAGUID=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA2=y
+CONFIG_NETFILTER_XT_MATCH_SOCKET=y
+CONFIG_NETFILTER_XT_MATCH_STATE=y
+CONFIG_NETFILTER_XT_MATCH_STATISTIC=y
+CONFIG_NETFILTER_XT_MATCH_STRING=y
+CONFIG_NETFILTER_XT_MATCH_TIME=y
+CONFIG_NETFILTER_XT_MATCH_U32=y
+CONFIG_NF_CONNTRACK_IPV4=y
+CONFIG_IP_NF_IPTABLES=y
+CONFIG_IP_NF_MATCH_AH=y
+CONFIG_IP_NF_MATCH_ECN=y
+CONFIG_IP_NF_MATCH_TTL=y
+CONFIG_IP_NF_FILTER=y
+CONFIG_IP_NF_TARGET_REJECT=y
+CONFIG_IP_NF_NAT=y
+CONFIG_IP_NF_TARGET_MASQUERADE=y
+CONFIG_IP_NF_TARGET_NETMAP=y
+CONFIG_IP_NF_TARGET_REDIRECT=y
+CONFIG_IP_NF_MANGLE=y
+CONFIG_IP_NF_RAW=y
+CONFIG_IP_NF_SECURITY=y
+CONFIG_IP_NF_ARPTABLES=y
+CONFIG_IP_NF_ARPFILTER=y
+CONFIG_IP_NF_ARP_MANGLE=y
+CONFIG_NF_CONNTRACK_IPV6=y
+CONFIG_IP6_NF_IPTABLES=y
+CONFIG_IP6_NF_MATCH_IPV6HEADER=y
+CONFIG_IP6_NF_MATCH_RPFILTER=y
+CONFIG_IP6_NF_FILTER=y
+CONFIG_IP6_NF_TARGET_REJECT=y
+CONFIG_IP6_NF_MANGLE=y
+CONFIG_IP6_NF_RAW=y
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_HTB=y
+CONFIG_NET_CLS_U32=y
+CONFIG_NET_EMATCH=y
+CONFIG_NET_EMATCH_U32=y
+CONFIG_NET_CLS_ACT=y
+CONFIG_CFG80211=y
+CONFIG_MAC80211=y
+CONFIG_RFKILL=y
+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
+CONFIG_DEVTMPFS=y
+CONFIG_DEBUG_DEVRES=y
+CONFIG_OF=y
+CONFIG_OF_UNITTEST=y
+# CONFIG_PNP_DEBUG_MESSAGES is not set
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=8192
+CONFIG_VIRTIO_BLK=y
+CONFIG_UID_SYS_STATS=y
+CONFIG_MEMORY_STATE_TIME=y
+CONFIG_SCSI=y
+CONFIG_BLK_DEV_SD=y
+CONFIG_BLK_DEV_SR=y
+CONFIG_BLK_DEV_SR_VENDOR=y
+CONFIG_CHR_DEV_SG=y
+CONFIG_SCSI_CONSTANTS=y
+CONFIG_SCSI_SPI_ATTRS=y
+CONFIG_SCSI_VIRTIO=y
+CONFIG_MD=y
+CONFIG_BLK_DEV_DM=y
+CONFIG_DM_CRYPT=y
+CONFIG_DM_MIRROR=y
+CONFIG_DM_ZERO=y
+CONFIG_DM_UEVENT=y
+CONFIG_DM_VERITY=y
+CONFIG_DM_VERITY_FEC=y
+CONFIG_NETDEVICES=y
+CONFIG_NETCONSOLE=y
+CONFIG_NETCONSOLE_DYNAMIC=y
+CONFIG_TUN=y
+CONFIG_VIRTIO_NET=y
+# CONFIG_ETHERNET is not set
+CONFIG_PPP=y
+CONFIG_PPP_BSDCOMP=y
+CONFIG_PPP_DEFLATE=y
+CONFIG_PPP_MPPE=y
+CONFIG_PPPOLAC=y
+CONFIG_PPPOPNS=y
+CONFIG_USB_USBNET=y
+# CONFIG_USB_NET_AX8817X is not set
+# CONFIG_USB_NET_AX88179_178A is not set
+# CONFIG_USB_NET_CDCETHER is not set
+# CONFIG_USB_NET_CDC_NCM is not set
+# CONFIG_USB_NET_NET1080 is not set
+# CONFIG_USB_NET_CDC_SUBSET is not set
+# CONFIG_USB_NET_ZAURUS is not set
+# CONFIG_WLAN_VENDOR_ADMTEK is not set
+# CONFIG_WLAN_VENDOR_ATH is not set
+# CONFIG_WLAN_VENDOR_ATMEL is not set
+# CONFIG_WLAN_VENDOR_BROADCOM is not set
+# CONFIG_WLAN_VENDOR_CISCO is not set
+# CONFIG_WLAN_VENDOR_INTEL is not set
+# CONFIG_WLAN_VENDOR_INTERSIL is not set
+# CONFIG_WLAN_VENDOR_MARVELL is not set
+# CONFIG_WLAN_VENDOR_MEDIATEK is not set
+# CONFIG_WLAN_VENDOR_RALINK is not set
+# CONFIG_WLAN_VENDOR_REALTEK is not set
+# CONFIG_WLAN_VENDOR_RSI is not set
+# CONFIG_WLAN_VENDOR_ST is not set
+# CONFIG_WLAN_VENDOR_TI is not set
+# CONFIG_WLAN_VENDOR_ZYDAS is not set
+CONFIG_MAC80211_HWSIM=y
+CONFIG_INPUT_EVDEV=y
+CONFIG_INPUT_KEYRESET=y
+# CONFIG_INPUT_KEYBOARD is not set
+# CONFIG_INPUT_MOUSE is not set
+CONFIG_INPUT_JOYSTICK=y
+CONFIG_JOYSTICK_XPAD=y
+CONFIG_JOYSTICK_XPAD_FF=y
+CONFIG_JOYSTICK_XPAD_LEDS=y
+CONFIG_INPUT_TABLET=y
+CONFIG_TABLET_USB_ACECAD=y
+CONFIG_TABLET_USB_AIPTEK=y
+CONFIG_TABLET_USB_GTCO=y
+CONFIG_TABLET_USB_HANWANG=y
+CONFIG_TABLET_USB_KBTAB=y
+CONFIG_INPUT_MISC=y
+CONFIG_INPUT_KEYCHORD=y
+CONFIG_INPUT_UINPUT=y
+CONFIG_INPUT_GPIO=y
+# CONFIG_SERIO_I8042 is not set
+# CONFIG_VT is not set
+# CONFIG_LEGACY_PTYS is not set
+# CONFIG_DEVMEM is not set
+# CONFIG_DEVKMEM is not set
+CONFIG_SERIAL_8250=y
+# CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_SERIAL_8250_NR_UARTS=48
+CONFIG_SERIAL_8250_EXTENDED=y
+CONFIG_SERIAL_8250_MANY_PORTS=y
+CONFIG_SERIAL_8250_SHARE_IRQ=y
+CONFIG_VIRTIO_CONSOLE=y
+CONFIG_HW_RANDOM=y
+# CONFIG_HW_RANDOM_INTEL is not set
+# CONFIG_HW_RANDOM_AMD is not set
+# CONFIG_HW_RANDOM_VIA is not set
+CONFIG_HW_RANDOM_VIRTIO=y
+CONFIG_HPET=y
+# CONFIG_HPET_MMAP_DEFAULT is not set
+# CONFIG_DEVPORT is not set
+# CONFIG_ACPI_I2C_OPREGION is not set
+# CONFIG_I2C_COMPAT is not set
+# CONFIG_I2C_HELPER_AUTO is not set
+CONFIG_PTP_1588_CLOCK=y
+# CONFIG_HWMON is not set
+# CONFIG_X86_PKG_TEMP_THERMAL is not set
+CONFIG_WATCHDOG=y
+CONFIG_SOFT_WATCHDOG=y
+CONFIG_MEDIA_SUPPORT=y
+# CONFIG_DVB_TUNER_DIB0070 is not set
+# CONFIG_DVB_TUNER_DIB0090 is not set
+# CONFIG_VGA_ARB is not set
+CONFIG_DRM=y
+# CONFIG_DRM_FBDEV_EMULATION is not set
+CONFIG_DRM_VIRTIO_GPU=y
+CONFIG_SOUND=y
+CONFIG_SND=y
+CONFIG_HIDRAW=y
+CONFIG_UHID=y
+# CONFIG_HID_GENERIC is not set
+CONFIG_HID_A4TECH=y
+CONFIG_HID_ACRUX=y
+CONFIG_HID_ACRUX_FF=y
+CONFIG_HID_APPLE=y
+CONFIG_HID_BELKIN=y
+CONFIG_HID_CHERRY=y
+CONFIG_HID_CHICONY=y
+CONFIG_HID_PRODIKEYS=y
+CONFIG_HID_CYPRESS=y
+CONFIG_HID_DRAGONRISE=y
+CONFIG_DRAGONRISE_FF=y
+CONFIG_HID_EMS_FF=y
+CONFIG_HID_ELECOM=y
+CONFIG_HID_EZKEY=y
+CONFIG_HID_HOLTEK=y
+CONFIG_HID_KEYTOUCH=y
+CONFIG_HID_KYE=y
+CONFIG_HID_UCLOGIC=y
+CONFIG_HID_WALTOP=y
+CONFIG_HID_GYRATION=y
+CONFIG_HID_TWINHAN=y
+CONFIG_HID_KENSINGTON=y
+CONFIG_HID_LCPOWER=y
+CONFIG_HID_LOGITECH=y
+CONFIG_HID_LOGITECH_DJ=y
+CONFIG_LOGITECH_FF=y
+CONFIG_LOGIRUMBLEPAD2_FF=y
+CONFIG_LOGIG940_FF=y
+CONFIG_HID_MAGICMOUSE=y
+CONFIG_HID_MICROSOFT=y
+CONFIG_HID_MONTEREY=y
+CONFIG_HID_MULTITOUCH=y
+CONFIG_HID_NTRIG=y
+CONFIG_HID_ORTEK=y
+CONFIG_HID_PANTHERLORD=y
+CONFIG_PANTHERLORD_FF=y
+CONFIG_HID_PETALYNX=y
+CONFIG_HID_PICOLCD=y
+CONFIG_HID_PRIMAX=y
+CONFIG_HID_ROCCAT=y
+CONFIG_HID_SAITEK=y
+CONFIG_HID_SAMSUNG=y
+CONFIG_HID_SONY=y
+CONFIG_HID_SPEEDLINK=y
+CONFIG_HID_SUNPLUS=y
+CONFIG_HID_GREENASIA=y
+CONFIG_GREENASIA_FF=y
+CONFIG_HID_SMARTJOYPLUS=y
+CONFIG_SMARTJOYPLUS_FF=y
+CONFIG_HID_TIVO=y
+CONFIG_HID_TOPSEED=y
+CONFIG_HID_THRUSTMASTER=y
+CONFIG_HID_WACOM=y
+CONFIG_HID_WIIMOTE=y
+CONFIG_HID_ZEROPLUS=y
+CONFIG_HID_ZYDACRON=y
+CONFIG_USB_HIDDEV=y
+CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
+CONFIG_USB_EHCI_HCD=y
+CONFIG_USB_GADGET=y
+CONFIG_USB_DUMMY_HCD=y
+CONFIG_USB_CONFIGFS=y
+CONFIG_USB_CONFIGFS_F_FS=y
+CONFIG_USB_CONFIGFS_F_ACC=y
+CONFIG_USB_CONFIGFS_F_AUDIO_SRC=y
+CONFIG_USB_CONFIGFS_UEVENT=y
+CONFIG_USB_CONFIGFS_F_MIDI=y
+CONFIG_RTC_CLASS=y
+# CONFIG_RTC_HCTOSYS is not set
+CONFIG_SW_SYNC=y
+CONFIG_VIRTIO_PCI=y
+CONFIG_VIRTIO_BALLOON=y
+CONFIG_VIRTIO_MMIO=y
+CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES=y
+CONFIG_STAGING=y
+CONFIG_ASHMEM=y
+CONFIG_ANDROID_VSOC=y
+CONFIG_ION=y
+# CONFIG_X86_PLATFORM_DEVICES is not set
+# CONFIG_IOMMU_SUPPORT is not set
+CONFIG_ANDROID=y
+CONFIG_ANDROID_BINDER_IPC=y
+# CONFIG_FIRMWARE_MEMMAP is not set
+CONFIG_EXT4_FS=y
+CONFIG_EXT4_FS_POSIX_ACL=y
+CONFIG_EXT4_FS_SECURITY=y
+CONFIG_EXT4_ENCRYPTION=y
+CONFIG_QUOTA=y
+CONFIG_QUOTA_NETLINK_INTERFACE=y
+# CONFIG_PRINT_QUOTA_WARNING is not set
+CONFIG_QFMT_V2=y
+CONFIG_AUTOFS4_FS=y
+CONFIG_FUSE_FS=y
+CONFIG_MSDOS_FS=y
+CONFIG_VFAT_FS=y
+CONFIG_PROC_KCORE=y
+CONFIG_TMPFS=y
+CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_HUGETLBFS=y
+CONFIG_SDCARD_FS=y
+CONFIG_PSTORE=y
+CONFIG_PSTORE_CONSOLE=y
+CONFIG_PSTORE_RAM=y
+CONFIG_NLS_DEFAULT="utf8"
+CONFIG_NLS_CODEPAGE_437=y
+CONFIG_NLS_ASCII=y
+CONFIG_NLS_ISO8859_1=y
+CONFIG_NLS_UTF8=y
+CONFIG_PRINTK_TIME=y
+CONFIG_DEBUG_INFO=y
+# CONFIG_ENABLE_WARN_DEPRECATED is not set
+# CONFIG_ENABLE_MUST_CHECK is not set
+CONFIG_FRAME_WARN=1024
+# CONFIG_UNUSED_SYMBOLS is not set
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_DEBUG_STACK_USAGE=y
+CONFIG_DEBUG_MEMORY_INIT=y
+CONFIG_DEBUG_STACKOVERFLOW=y
+CONFIG_LOCKUP_DETECTOR=y
+CONFIG_PANIC_TIMEOUT=5
+# CONFIG_SCHED_DEBUG is not set
+CONFIG_SCHEDSTATS=y
+CONFIG_TIMER_STATS=y
+CONFIG_RCU_CPU_STALL_TIMEOUT=60
+CONFIG_ENABLE_DEFAULT_TRACERS=y
+CONFIG_IO_DELAY_NONE=y
+CONFIG_DEBUG_BOOT_PARAMS=y
+CONFIG_OPTIMIZE_INLINING=y
+CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_PATH=y
+CONFIG_HARDENED_USERCOPY=y
+CONFIG_SECURITY_SELINUX=y
+CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
+# CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set
diff --git a/arch/x86/configs/x86_64_ranchu_defconfig b/arch/x86/configs/x86_64_ranchu_defconfig
new file mode 100644
index 000000000000..d50434f501fb
--- /dev/null
+++ b/arch/x86/configs/x86_64_ranchu_defconfig
@@ -0,0 +1,419 @@
+# CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_POSIX_MQUEUE=y
+CONFIG_AUDIT=y
+CONFIG_NO_HZ=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_BSD_PROCESS_ACCT=y
+CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
+CONFIG_TASK_XACCT=y
+CONFIG_TASK_IO_ACCOUNTING=y
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_DEBUG=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_RT_GROUP_SCHED=y
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_SIZE=y
+CONFIG_SYSCTL_SYSCALL=y
+CONFIG_KALLSYMS_ALL=y
+CONFIG_EMBEDDED=y
+# CONFIG_COMPAT_BRK is not set
+CONFIG_ARCH_MMAP_RND_BITS=32
+CONFIG_ARCH_MMAP_RND_COMPAT_BITS=16
+CONFIG_PARTITION_ADVANCED=y
+CONFIG_OSF_PARTITION=y
+CONFIG_AMIGA_PARTITION=y
+CONFIG_MAC_PARTITION=y
+CONFIG_BSD_DISKLABEL=y
+CONFIG_MINIX_SUBPARTITION=y
+CONFIG_SOLARIS_X86_PARTITION=y
+CONFIG_UNIXWARE_DISKLABEL=y
+CONFIG_SGI_PARTITION=y
+CONFIG_SUN_PARTITION=y
+CONFIG_KARMA_PARTITION=y
+CONFIG_SMP=y
+CONFIG_MCORE2=y
+CONFIG_MAXSMP=y
+CONFIG_PREEMPT=y
+# CONFIG_X86_MCE is not set
+CONFIG_X86_MSR=y
+CONFIG_X86_CPUID=y
+CONFIG_KSM=y
+CONFIG_CMA=y
+# CONFIG_MTRR_SANITIZER is not set
+CONFIG_EFI=y
+CONFIG_EFI_STUB=y
+CONFIG_HZ_100=y
+CONFIG_PHYSICAL_START=0x100000
+CONFIG_PM_AUTOSLEEP=y
+CONFIG_PM_WAKELOCKS=y
+CONFIG_PM_WAKELOCKS_LIMIT=0
+# CONFIG_PM_WAKELOCKS_GC is not set
+CONFIG_PM_DEBUG=y
+CONFIG_CPU_FREQ=y
+# CONFIG_CPU_FREQ_STAT is not set
+CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
+CONFIG_CPU_FREQ_GOV_USERSPACE=y
+CONFIG_PCI_MMCONFIG=y
+CONFIG_PCIEPORTBUS=y
+# CONFIG_PCIEASPM is not set
+CONFIG_PCCARD=y
+CONFIG_YENTA=y
+CONFIG_HOTPLUG_PCI=y
+# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
+CONFIG_BINFMT_MISC=y
+CONFIG_IA32_EMULATION=y
+CONFIG_NET=y
+CONFIG_PACKET=y
+CONFIG_UNIX=y
+CONFIG_XFRM_USER=y
+CONFIG_NET_KEY=y
+CONFIG_INET=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_ROUTE_MULTIPATH=y
+CONFIG_IP_ROUTE_VERBOSE=y
+CONFIG_IP_PNP=y
+CONFIG_IP_PNP_DHCP=y
+CONFIG_IP_PNP_BOOTP=y
+CONFIG_IP_PNP_RARP=y
+CONFIG_IP_MROUTE=y
+CONFIG_IP_PIMSM_V1=y
+CONFIG_IP_PIMSM_V2=y
+CONFIG_SYN_COOKIES=y
+CONFIG_INET_ESP=y
+# CONFIG_INET_XFRM_MODE_BEET is not set
+# CONFIG_INET_LRO is not set
+CONFIG_INET_DIAG_DESTROY=y
+CONFIG_IPV6_ROUTER_PREF=y
+CONFIG_IPV6_ROUTE_INFO=y
+CONFIG_IPV6_OPTIMISTIC_DAD=y
+CONFIG_INET6_AH=y
+CONFIG_INET6_ESP=y
+CONFIG_INET6_IPCOMP=y
+CONFIG_IPV6_MIP6=y
+CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_NETLABEL=y
+CONFIG_NETFILTER=y
+CONFIG_NF_CONNTRACK=y
+CONFIG_NF_CONNTRACK_SECMARK=y
+CONFIG_NF_CONNTRACK_EVENTS=y
+CONFIG_NF_CT_PROTO_DCCP=y
+CONFIG_NF_CT_PROTO_SCTP=y
+CONFIG_NF_CT_PROTO_UDPLITE=y
+CONFIG_NF_CONNTRACK_AMANDA=y
+CONFIG_NF_CONNTRACK_FTP=y
+CONFIG_NF_CONNTRACK_H323=y
+CONFIG_NF_CONNTRACK_IRC=y
+CONFIG_NF_CONNTRACK_NETBIOS_NS=y
+CONFIG_NF_CONNTRACK_PPTP=y
+CONFIG_NF_CONNTRACK_SANE=y
+CONFIG_NF_CONNTRACK_TFTP=y
+CONFIG_NF_CT_NETLINK=y
+CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y
+CONFIG_NETFILTER_XT_TARGET_CONNMARK=y
+CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
+CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y
+CONFIG_NETFILTER_XT_TARGET_MARK=y
+CONFIG_NETFILTER_XT_TARGET_NFLOG=y
+CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y
+CONFIG_NETFILTER_XT_TARGET_TPROXY=y
+CONFIG_NETFILTER_XT_TARGET_TRACE=y
+CONFIG_NETFILTER_XT_TARGET_SECMARK=y
+CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
+CONFIG_NETFILTER_XT_MATCH_COMMENT=y
+CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_CONNMARK=y
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
+CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_HELPER=y
+CONFIG_NETFILTER_XT_MATCH_IPRANGE=y
+CONFIG_NETFILTER_XT_MATCH_LENGTH=y
+CONFIG_NETFILTER_XT_MATCH_LIMIT=y
+CONFIG_NETFILTER_XT_MATCH_MAC=y
+CONFIG_NETFILTER_XT_MATCH_MARK=y
+CONFIG_NETFILTER_XT_MATCH_POLICY=y
+CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y
+CONFIG_NETFILTER_XT_MATCH_QTAGUID=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA2=y
+CONFIG_NETFILTER_XT_MATCH_SOCKET=y
+CONFIG_NETFILTER_XT_MATCH_STATE=y
+CONFIG_NETFILTER_XT_MATCH_STATISTIC=y
+CONFIG_NETFILTER_XT_MATCH_STRING=y
+CONFIG_NETFILTER_XT_MATCH_TIME=y
+CONFIG_NETFILTER_XT_MATCH_U32=y
+CONFIG_NF_CONNTRACK_IPV4=y
+CONFIG_IP_NF_IPTABLES=y
+CONFIG_IP_NF_MATCH_AH=y
+CONFIG_IP_NF_MATCH_ECN=y
+CONFIG_IP_NF_MATCH_TTL=y
+CONFIG_IP_NF_FILTER=y
+CONFIG_IP_NF_TARGET_REJECT=y
+CONFIG_IP_NF_MANGLE=y
+CONFIG_IP_NF_RAW=y
+CONFIG_IP_NF_SECURITY=y
+CONFIG_IP_NF_ARPTABLES=y
+CONFIG_IP_NF_ARPFILTER=y
+CONFIG_IP_NF_ARP_MANGLE=y
+CONFIG_NF_CONNTRACK_IPV6=y
+CONFIG_IP6_NF_IPTABLES=y
+CONFIG_IP6_NF_FILTER=y
+CONFIG_IP6_NF_TARGET_REJECT=y
+CONFIG_IP6_NF_MANGLE=y
+CONFIG_IP6_NF_RAW=y
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_HTB=y
+CONFIG_NET_CLS_U32=y
+CONFIG_NET_EMATCH=y
+CONFIG_NET_EMATCH_U32=y
+CONFIG_NET_CLS_ACT=y
+CONFIG_CFG80211=y
+CONFIG_MAC80211=y
+CONFIG_MAC80211_LEDS=y
+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
+CONFIG_DMA_CMA=y
+CONFIG_CONNECTOR=y
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=8192
+CONFIG_VIRTIO_BLK=y
+CONFIG_BLK_DEV_SD=y
+CONFIG_BLK_DEV_SR=y
+CONFIG_BLK_DEV_SR_VENDOR=y
+CONFIG_CHR_DEV_SG=y
+CONFIG_SCSI_CONSTANTS=y
+CONFIG_SCSI_SPI_ATTRS=y
+CONFIG_SCSI_ISCSI_ATTRS=y
+# CONFIG_SCSI_LOWLEVEL is not set
+CONFIG_ATA=y
+CONFIG_SATA_AHCI=y
+CONFIG_ATA_PIIX=y
+CONFIG_PATA_AMD=y
+CONFIG_PATA_OLDPIIX=y
+CONFIG_PATA_SCH=y
+CONFIG_PATA_MPIIX=y
+CONFIG_ATA_GENERIC=y
+CONFIG_MD=y
+CONFIG_BLK_DEV_MD=y
+CONFIG_BLK_DEV_DM=y
+CONFIG_DM_DEBUG=y
+CONFIG_DM_CRYPT=y
+CONFIG_DM_MIRROR=y
+CONFIG_DM_ZERO=y
+CONFIG_DM_UEVENT=y
+CONFIG_DM_VERITY=y
+CONFIG_DM_VERITY_FEC=y
+CONFIG_NETDEVICES=y
+CONFIG_NETCONSOLE=y
+CONFIG_TUN=y
+CONFIG_VIRTIO_NET=y
+CONFIG_BNX2=y
+CONFIG_TIGON3=y
+CONFIG_NET_TULIP=y
+CONFIG_E100=y
+CONFIG_E1000=y
+CONFIG_E1000E=y
+CONFIG_SKY2=y
+CONFIG_NE2K_PCI=y
+CONFIG_FORCEDETH=y
+CONFIG_8139TOO=y
+# CONFIG_8139TOO_PIO is not set
+CONFIG_R8169=y
+CONFIG_FDDI=y
+CONFIG_PPP=y
+CONFIG_PPP_BSDCOMP=y
+CONFIG_PPP_DEFLATE=y
+CONFIG_PPP_MPPE=y
+CONFIG_PPPOLAC=y
+CONFIG_PPPOPNS=y
+CONFIG_USB_USBNET=y
+CONFIG_INPUT_POLLDEV=y
+# CONFIG_INPUT_MOUSEDEV_PSAUX is not set
+CONFIG_INPUT_EVDEV=y
+CONFIG_INPUT_KEYRESET=y
+# CONFIG_KEYBOARD_ATKBD is not set
+CONFIG_KEYBOARD_GOLDFISH_EVENTS=y
+# CONFIG_INPUT_MOUSE is not set
+CONFIG_INPUT_JOYSTICK=y
+CONFIG_JOYSTICK_XPAD=y
+CONFIG_JOYSTICK_XPAD_FF=y
+CONFIG_JOYSTICK_XPAD_LEDS=y
+CONFIG_INPUT_TABLET=y
+CONFIG_TABLET_USB_ACECAD=y
+CONFIG_TABLET_USB_AIPTEK=y
+CONFIG_TABLET_USB_GTCO=y
+CONFIG_TABLET_USB_HANWANG=y
+CONFIG_TABLET_USB_KBTAB=y
+CONFIG_INPUT_TOUCHSCREEN=y
+CONFIG_INPUT_MISC=y
+CONFIG_INPUT_KEYCHORD=y
+CONFIG_INPUT_UINPUT=y
+CONFIG_INPUT_GPIO=y
+# CONFIG_SERIO is not set
+# CONFIG_VT is not set
+# CONFIG_LEGACY_PTYS is not set
+CONFIG_SERIAL_NONSTANDARD=y
+# CONFIG_DEVMEM is not set
+# CONFIG_DEVKMEM is not set
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_VIRTIO_CONSOLE=y
+CONFIG_NVRAM=y
+CONFIG_I2C_I801=y
+CONFIG_BATTERY_GOLDFISH=y
+CONFIG_WATCHDOG=y
+CONFIG_MEDIA_SUPPORT=y
+CONFIG_AGP=y
+CONFIG_AGP_AMD64=y
+CONFIG_AGP_INTEL=y
+CONFIG_DRM=y
+CONFIG_FB_MODE_HELPERS=y
+CONFIG_FB_TILEBLITTING=y
+CONFIG_FB_EFI=y
+CONFIG_FB_GOLDFISH=y
+CONFIG_BACKLIGHT_LCD_SUPPORT=y
+# CONFIG_LCD_CLASS_DEVICE is not set
+CONFIG_SOUND=y
+CONFIG_SND=y
+CONFIG_HIDRAW=y
+CONFIG_UHID=y
+CONFIG_HID_A4TECH=y
+CONFIG_HID_ACRUX=y
+CONFIG_HID_ACRUX_FF=y
+CONFIG_HID_APPLE=y
+CONFIG_HID_BELKIN=y
+CONFIG_HID_CHERRY=y
+CONFIG_HID_CHICONY=y
+CONFIG_HID_PRODIKEYS=y
+CONFIG_HID_CYPRESS=y
+CONFIG_HID_DRAGONRISE=y
+CONFIG_DRAGONRISE_FF=y
+CONFIG_HID_EMS_FF=y
+CONFIG_HID_ELECOM=y
+CONFIG_HID_EZKEY=y
+CONFIG_HID_HOLTEK=y
+CONFIG_HID_KEYTOUCH=y
+CONFIG_HID_KYE=y
+CONFIG_HID_UCLOGIC=y
+CONFIG_HID_WALTOP=y
+CONFIG_HID_GYRATION=y
+CONFIG_HID_TWINHAN=y
+CONFIG_HID_KENSINGTON=y
+CONFIG_HID_LCPOWER=y
+CONFIG_HID_LOGITECH=y
+CONFIG_HID_LOGITECH_DJ=y
+CONFIG_LOGITECH_FF=y
+CONFIG_LOGIRUMBLEPAD2_FF=y
+CONFIG_LOGIG940_FF=y
+CONFIG_HID_MAGICMOUSE=y
+CONFIG_HID_MICROSOFT=y
+CONFIG_HID_MONTEREY=y
+CONFIG_HID_MULTITOUCH=y
+CONFIG_HID_NTRIG=y
+CONFIG_HID_ORTEK=y
+CONFIG_HID_PANTHERLORD=y
+CONFIG_PANTHERLORD_FF=y
+CONFIG_HID_PETALYNX=y
+CONFIG_HID_PICOLCD=y
+CONFIG_HID_PRIMAX=y
+CONFIG_HID_ROCCAT=y
+CONFIG_HID_SAITEK=y
+CONFIG_HID_SAMSUNG=y
+CONFIG_HID_SONY=y
+CONFIG_HID_SPEEDLINK=y
+CONFIG_HID_SUNPLUS=y
+CONFIG_HID_GREENASIA=y
+CONFIG_GREENASIA_FF=y
+CONFIG_HID_SMARTJOYPLUS=y
+CONFIG_SMARTJOYPLUS_FF=y
+CONFIG_HID_TIVO=y
+CONFIG_HID_TOPSEED=y
+CONFIG_HID_THRUSTMASTER=y
+CONFIG_HID_WACOM=y
+CONFIG_HID_WIIMOTE=y
+CONFIG_HID_ZEROPLUS=y
+CONFIG_HID_ZYDACRON=y
+CONFIG_HID_PID=y
+CONFIG_USB_HIDDEV=y
+CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
+CONFIG_USB_MON=y
+CONFIG_USB_EHCI_HCD=y
+# CONFIG_USB_EHCI_TT_NEWSCHED is not set
+CONFIG_USB_OHCI_HCD=y
+CONFIG_USB_UHCI_HCD=y
+CONFIG_USB_PRINTER=y
+CONFIG_USB_STORAGE=y
+CONFIG_USB_OTG_WAKELOCK=y
+CONFIG_EDAC=y
+CONFIG_RTC_CLASS=y
+# CONFIG_RTC_HCTOSYS is not set
+CONFIG_DMADEVICES=y
+CONFIG_VIRTIO_PCI=y
+CONFIG_STAGING=y
+CONFIG_ASHMEM=y
+CONFIG_ANDROID_LOW_MEMORY_KILLER=y
+CONFIG_SYNC=y
+CONFIG_SW_SYNC=y
+CONFIG_SYNC_FILE=y
+CONFIG_ION=y
+CONFIG_GOLDFISH_AUDIO=y
+CONFIG_SND_HDA_INTEL=y
+CONFIG_GOLDFISH=y
+CONFIG_GOLDFISH_PIPE=y
+CONFIG_GOLDFISH_SYNC=y
+CONFIG_ANDROID=y
+CONFIG_ANDROID_BINDER_IPC=y
+CONFIG_ISCSI_IBFT_FIND=y
+CONFIG_EXT4_FS=y
+CONFIG_EXT4_FS_SECURITY=y
+CONFIG_QUOTA=y
+CONFIG_QUOTA_NETLINK_INTERFACE=y
+# CONFIG_PRINT_QUOTA_WARNING is not set
+CONFIG_FUSE_FS=y
+CONFIG_ISO9660_FS=y
+CONFIG_JOLIET=y
+CONFIG_ZISOFS=y
+CONFIG_MSDOS_FS=y
+CONFIG_VFAT_FS=y
+CONFIG_PROC_KCORE=y
+CONFIG_TMPFS=y
+CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_HUGETLBFS=y
+CONFIG_PSTORE=y
+CONFIG_PSTORE_CONSOLE=y
+CONFIG_PSTORE_RAM=y
+# CONFIG_NETWORK_FILESYSTEMS is not set
+CONFIG_NLS_DEFAULT="utf8"
+CONFIG_NLS_CODEPAGE_437=y
+CONFIG_NLS_ASCII=y
+CONFIG_NLS_ISO8859_1=y
+CONFIG_NLS_UTF8=y
+CONFIG_PRINTK_TIME=y
+CONFIG_DEBUG_INFO=y
+# CONFIG_ENABLE_WARN_DEPRECATED is not set
+# CONFIG_ENABLE_MUST_CHECK is not set
+# CONFIG_UNUSED_SYMBOLS is not set
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_DEBUG_MEMORY_INIT=y
+CONFIG_PANIC_TIMEOUT=5
+CONFIG_SCHEDSTATS=y
+CONFIG_TIMER_STATS=y
+CONFIG_SCHED_TRACER=y
+CONFIG_BLK_DEV_IO_TRACE=y
+CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
+CONFIG_KEYS=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_SELINUX=y
+CONFIG_CRYPTO_TWOFISH=y
+CONFIG_ASYMMETRIC_KEY_TYPE=y
+CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y
+CONFIG_X509_CERTIFICATE_PARSER=y
+CONFIG_PKCS7_MESSAGE_PARSER=y
+CONFIG_PKCS7_TEST_KEY=y
+# CONFIG_VIRTUALIZATION is not set
+CONFIG_CRC_T10DIF=y
diff --git a/arch/x86/include/asm/idle.h b/arch/x86/include/asm/idle.h
index c5d1785373ed..02bab09707f2 100644
--- a/arch/x86/include/asm/idle.h
+++ b/arch/x86/include/asm/idle.h
@@ -1,13 +1,6 @@
#ifndef _ASM_X86_IDLE_H
#define _ASM_X86_IDLE_H
-#define IDLE_START 1
-#define IDLE_END 2
-
-struct notifier_block;
-void idle_notifier_register(struct notifier_block *n);
-void idle_notifier_unregister(struct notifier_block *n);
-
#ifdef CONFIG_X86_64
void enter_idle(void);
void exit_idle(void);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 00a9047539d7..e9195a139d4e 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -68,19 +68,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_tss);
#ifdef CONFIG_X86_64
static DEFINE_PER_CPU(unsigned char, is_idle);
-static ATOMIC_NOTIFIER_HEAD(idle_notifier);
-
-void idle_notifier_register(struct notifier_block *n)
-{
- atomic_notifier_chain_register(&idle_notifier, n);
-}
-EXPORT_SYMBOL_GPL(idle_notifier_register);
-
-void idle_notifier_unregister(struct notifier_block *n)
-{
- atomic_notifier_chain_unregister(&idle_notifier, n);
-}
-EXPORT_SYMBOL_GPL(idle_notifier_unregister);
#endif
/*
@@ -397,14 +384,14 @@ static inline void play_dead(void)
void enter_idle(void)
{
this_cpu_write(is_idle, 1);
- atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
+ idle_notifier_call_chain(IDLE_START);
}
static void __exit_idle(void)
{
if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
return;
- atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
+ idle_notifier_call_chain(IDLE_END);
}
/* Called from interrupts to signify idle end */
diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h
index 81435d995e11..fc7ca2841206 100644
--- a/arch/xtensa/include/uapi/asm/socket.h
+++ b/arch/xtensa/include/uapi/asm/socket.h
@@ -101,4 +101,6 @@
#define SO_CNX_ADVICE 53
+#define SO_COOKIE 57
+
#endif /* _XTENSA_SOCKET_H */
diff --git a/block/blk-core.c b/block/blk-core.c
index 77b99bf16c83..4c50e5768efa 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -40,6 +40,8 @@
#include "blk.h"
#include "blk-mq.h"
+#include <linux/math64.h>
+
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
@@ -3569,3 +3571,85 @@ int __init blk_dev_init(void)
return 0;
}
+
+/*
+ * Blk IO latency support. We want this to be as cheap as possible, so we do
+ * this locklessly (and avoid atomics); a few counts being off by a few in
+ * this code is not harmful, and we don't want to do anything that is
+ * perf-impactful.
+ * TODO: If necessary, we can make the histograms per-cpu and aggregate
+ * them when printing them out.
+ */
+void
+blk_zero_latency_hist(struct io_latency_state *s)
+{
+ memset(s->latency_y_axis_read, 0,
+ sizeof(s->latency_y_axis_read));
+ memset(s->latency_y_axis_write, 0,
+ sizeof(s->latency_y_axis_write));
+ s->latency_reads_elems = 0;
+ s->latency_writes_elems = 0;
+}
+EXPORT_SYMBOL(blk_zero_latency_hist);
+
+ssize_t
+blk_latency_hist_show(struct io_latency_state *s, char *buf)
+{
+ int i;
+ int bytes_written = 0;
+ u_int64_t num_elem, elem;
+ int pct;
+
+ num_elem = s->latency_reads_elems;
+ if (num_elem > 0) {
+ bytes_written += scnprintf(buf + bytes_written,
+ PAGE_SIZE - bytes_written,
+ "IO svc_time Read Latency Histogram (n = %llu):\n",
+ num_elem);
+ for (i = 0;
+ i < ARRAY_SIZE(latency_x_axis_us);
+ i++) {
+ elem = s->latency_y_axis_read[i];
+ pct = div64_u64(elem * 100, num_elem);
+ bytes_written += scnprintf(buf + bytes_written,
+ PAGE_SIZE - bytes_written,
+ "\t< %5lluus%15llu%15d%%\n",
+ latency_x_axis_us[i],
+ elem, pct);
+ }
+ /* Last element in y-axis table is overflow */
+ elem = s->latency_y_axis_read[i];
+ pct = div64_u64(elem * 100, num_elem);
+ bytes_written += scnprintf(buf + bytes_written,
+ PAGE_SIZE - bytes_written,
+ "\t> %5dms%15llu%15d%%\n", 10,
+ elem, pct);
+ }
+ num_elem = s->latency_writes_elems;
+ if (num_elem > 0) {
+ bytes_written += scnprintf(buf + bytes_written,
+ PAGE_SIZE - bytes_written,
+ "IO svc_time Write Latency Histogram (n = %llu):\n",
+ num_elem);
+ for (i = 0;
+ i < ARRAY_SIZE(latency_x_axis_us);
+ i++) {
+ elem = s->latency_y_axis_write[i];
+ pct = div64_u64(elem * 100, num_elem);
+ bytes_written += scnprintf(buf + bytes_written,
+ PAGE_SIZE - bytes_written,
+ "\t< %5lluus%15llu%15d%%\n",
+ latency_x_axis_us[i],
+ elem, pct);
+ }
+ /* Last element in y-axis table is overflow */
+ elem = s->latency_y_axis_write[i];
+ pct = div64_u64(elem * 100, num_elem);
+ bytes_written += scnprintf(buf + bytes_written,
+ PAGE_SIZE - bytes_written,
+ "\t> %5dms%15llu%15d%%\n", 10,
+ elem, pct);
+ }
+ return bytes_written;
+}
+EXPORT_SYMBOL(blk_latency_hist_show);
diff --git a/build.config.cuttlefish.x86_64 b/build.config.cuttlefish.x86_64
new file mode 100644
index 000000000000..5a9656359f0b
--- /dev/null
+++ b/build.config.cuttlefish.x86_64
@@ -0,0 +1,15 @@
+ARCH=x86_64
+BRANCH=android-4.9
+CLANG_TRIPLE=x86_64-linux-gnu-
+CROSS_COMPILE=x86_64-linux-androidkernel-
+DEFCONFIG=x86_64_cuttlefish_defconfig
+EXTRA_CMDS=''
+KERNEL_DIR=common
+POST_DEFCONFIG_CMDS="check_defconfig"
+CLANG_PREBUILT_BIN=prebuilts/clang/host/linux-x86/clang-4630689/bin
+LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/x86/x86_64-linux-android-4.9/bin
+FILES="
+arch/x86/boot/bzImage
+vmlinux
+System.map
+"
diff --git a/build.config.goldfish.arm b/build.config.goldfish.arm
new file mode 100644
index 000000000000..866da9361b71
--- /dev/null
+++ b/build.config.goldfish.arm
@@ -0,0 +1,12 @@
+ARCH=arm
+BRANCH=android-4.4
+CROSS_COMPILE=arm-linux-androidkernel-
+DEFCONFIG=ranchu_defconfig
+EXTRA_CMDS=''
+KERNEL_DIR=common
+LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/arm/arm-linux-androideabi-4.9/bin
+FILES="
+arch/arm/boot/zImage
+vmlinux
+System.map
+"
diff --git a/build.config.goldfish.arm64 b/build.config.goldfish.arm64
new file mode 100644
index 000000000000..9c963cf4a3d8
--- /dev/null
+++ b/build.config.goldfish.arm64
@@ -0,0 +1,12 @@
+ARCH=arm64
+BRANCH=android-4.4
+CROSS_COMPILE=aarch64-linux-android-
+DEFCONFIG=ranchu64_defconfig
+EXTRA_CMDS=''
+KERNEL_DIR=common
+LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/aarch64/aarch64-linux-android-4.9/bin
+FILES="
+arch/arm64/boot/Image
+vmlinux
+System.map
+"
diff --git a/build.config.goldfish.mips b/build.config.goldfish.mips
new file mode 100644
index 000000000000..8af53d2c2940
--- /dev/null
+++ b/build.config.goldfish.mips
@@ -0,0 +1,11 @@
+ARCH=mips
+BRANCH=android-4.4
+CROSS_COMPILE=mips64el-linux-android-
+DEFCONFIG=ranchu_defconfig
+EXTRA_CMDS=''
+KERNEL_DIR=common
+LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/mips/mips64el-linux-android-4.9/bin
+FILES="
+vmlinux
+System.map
+"
diff --git a/build.config.goldfish.mips64 b/build.config.goldfish.mips64
new file mode 100644
index 000000000000..2a33d36dc4c8
--- /dev/null
+++ b/build.config.goldfish.mips64
@@ -0,0 +1,11 @@
+ARCH=mips
+BRANCH=android-4.4
+CROSS_COMPILE=mips64el-linux-android-
+DEFCONFIG=ranchu64_defconfig
+EXTRA_CMDS=''
+KERNEL_DIR=common
+LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/mips/mips64el-linux-android-4.9/bin
+FILES="
+vmlinux
+System.map
+"
diff --git a/build.config.goldfish.x86 b/build.config.goldfish.x86
new file mode 100644
index 000000000000..f86253f58d4d
--- /dev/null
+++ b/build.config.goldfish.x86
@@ -0,0 +1,12 @@
+ARCH=x86
+BRANCH=android-4.4
+CROSS_COMPILE=x86_64-linux-android-
+DEFCONFIG=i386_ranchu_defconfig
+EXTRA_CMDS=''
+KERNEL_DIR=common
+LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/x86/x86_64-linux-android-4.9/bin
+FILES="
+arch/x86/boot/bzImage
+vmlinux
+System.map
+"
diff --git a/build.config.goldfish.x86_64 b/build.config.goldfish.x86_64
new file mode 100644
index 000000000000..e1738861ec5c
--- /dev/null
+++ b/build.config.goldfish.x86_64
@@ -0,0 +1,12 @@
+ARCH=x86_64
+BRANCH=android-4.4
+CROSS_COMPILE=x86_64-linux-android-
+DEFCONFIG=x86_64_ranchu_defconfig
+EXTRA_CMDS=''
+KERNEL_DIR=common
+LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/x86/x86_64-linux-android-4.9/bin
+FILES="
+arch/x86/boot/bzImage
+vmlinux
+System.map
+"
diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig
index bdfc6c6f4f5a..01de42c8b74b 100644
--- a/drivers/android/Kconfig
+++ b/drivers/android/Kconfig
@@ -19,6 +19,18 @@ config ANDROID_BINDER_IPC
Android process, using Binder to identify, invoke and pass arguments
between said processes.
+config ANDROID_BINDER_DEVICES
+ string "Android Binder devices"
+ depends on ANDROID_BINDER_IPC
+ default "binder,hwbinder,vndbinder"
+ ---help---
+ Default value for the binder.devices parameter.
+
+ The binder.devices parameter is a comma-separated list of strings
+ that specifies the names of the binder device nodes that will be
+ created. Each binder device has its own context manager, and is
+ therefore logically separated from the other devices.
+
config ANDROID_BINDER_IPC_32BIT
bool
depends on !64BIT && ANDROID_BINDER_IPC
@@ -32,6 +44,16 @@ config ANDROID_BINDER_IPC_32BIT
Note that enabling this will break newer Android user-space.
+config ANDROID_BINDER_IPC_SELFTEST
+ bool "Android Binder IPC Driver Selftest"
+ depends on ANDROID_BINDER_IPC
+ ---help---
+ This feature allows binder selftest to run.
+
+ Binder selftest checks the allocation and free of binder buffers
+ exhaustively with combinations of various buffer sizes and
+ alignments.
+
endif # if ANDROID
endmenu
diff --git a/drivers/android/Makefile b/drivers/android/Makefile
index 3b7e4b072c58..a01254c43ee3 100644
--- a/drivers/android/Makefile
+++ b/drivers/android/Makefile
@@ -1,3 +1,4 @@
ccflags-y += -I$(src) # needed for trace events
-obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o
+obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o binder_alloc.o
+obj-$(CONFIG_ANDROID_BINDER_IPC_SELFTEST) += binder_alloc_selftest.o
diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 49199bd2ab93..bc7df9c33a7b 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -15,6 +15,40 @@
*
*/
+/*
+ * Locking overview
+ *
+ * There are 3 main spinlocks which must be acquired in the
+ * order shown:
+ *
+ * 1) proc->outer_lock : protects binder_ref
+ * binder_proc_lock() and binder_proc_unlock() are
+ * used to acq/rel.
+ * 2) node->lock : protects most fields of binder_node.
+ * binder_node_lock() and binder_node_unlock() are
+ * used to acq/rel
+ * 3) proc->inner_lock : protects the thread and node lists
+ * (proc->threads, proc->waiting_threads, proc->nodes)
+ * and all todo lists associated with the binder_proc
+ * (proc->todo, thread->todo, proc->delivered_death and
+ * node->async_todo), as well as thread->transaction_stack
+ * binder_inner_proc_lock() and binder_inner_proc_unlock()
+ * are used to acq/rel
+ *
+ * Any lock under procA must never be nested under any lock at the same
+ * level or below on procB.
+ *
+ * Functions that require a lock held on entry indicate which lock
+ * in the suffix of the function name:
+ *
+ * foo_olocked() : requires proc->outer_lock
+ * foo_nlocked() : requires node->lock
+ * foo_ilocked() : requires proc->inner_lock
+ * foo_oilocked(): requires proc->outer_lock and proc->inner_lock
+ * foo_nilocked(): requires node->lock and proc->inner_lock
+ * ...
+ */
+
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <asm/cacheflush.h>
@@ -24,7 +58,6 @@
#include <linux/fs.h>
#include <linux/list.h>
#include <linux/miscdevice.h>
-#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/nsproxy.h>
@@ -34,31 +67,31 @@
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/uaccess.h>
-#include <linux/vmalloc.h>
-#include <linux/slab.h>
#include <linux/pid_namespace.h>
#include <linux/security.h>
+#include <linux/spinlock.h>
#ifdef CONFIG_ANDROID_BINDER_IPC_32BIT
#define BINDER_IPC_32BIT 1
#endif
#include <uapi/linux/android/binder.h>
+#include "binder_alloc.h"
#include "binder_trace.h"
-static DEFINE_MUTEX(binder_main_lock);
+static HLIST_HEAD(binder_deferred_list);
static DEFINE_MUTEX(binder_deferred_lock);
-static DEFINE_MUTEX(binder_mmap_lock);
+static HLIST_HEAD(binder_devices);
static HLIST_HEAD(binder_procs);
-static HLIST_HEAD(binder_deferred_list);
+static DEFINE_MUTEX(binder_procs_lock);
+
static HLIST_HEAD(binder_dead_nodes);
+static DEFINE_SPINLOCK(binder_dead_nodes_lock);
static struct dentry *binder_debugfs_dir_entry_root;
static struct dentry *binder_debugfs_dir_entry_proc;
-static struct binder_node *binder_context_mgr_node;
-static kuid_t binder_context_mgr_uid = INVALID_UID;
-static int binder_last_id;
+static atomic_t binder_last_id;
#define BINDER_DEBUG_ENTRY(name) \
static int binder_##name##_open(struct inode *inode, struct file *file) \
@@ -104,16 +137,15 @@ enum {
BINDER_DEBUG_TRANSACTION_COMPLETE = 1U << 10,
BINDER_DEBUG_FREE_BUFFER = 1U << 11,
BINDER_DEBUG_INTERNAL_REFS = 1U << 12,
- BINDER_DEBUG_BUFFER_ALLOC = 1U << 13,
- BINDER_DEBUG_PRIORITY_CAP = 1U << 14,
- BINDER_DEBUG_BUFFER_ALLOC_ASYNC = 1U << 15,
+ BINDER_DEBUG_PRIORITY_CAP = 1U << 13,
+ BINDER_DEBUG_SPINLOCKS = 1U << 14,
};
static uint32_t binder_debug_mask = BINDER_DEBUG_USER_ERROR |
BINDER_DEBUG_FAILED_TRANSACTION | BINDER_DEBUG_DEAD_TRANSACTION;
module_param_named(debug_mask, binder_debug_mask, uint, S_IWUSR | S_IRUGO);
-static bool binder_debug_no_lock;
-module_param_named(proc_no_lock, binder_debug_no_lock, bool, S_IWUSR | S_IRUGO);
+static char *binder_devices_param = CONFIG_ANDROID_BINDER_DEVICES;
+module_param_named(devices, binder_devices_param, charp, S_IRUGO);
static DECLARE_WAIT_QUEUE_HEAD(binder_user_error_wait);
static int binder_stop_on_user_error;
@@ -145,6 +177,17 @@ module_param_call(stop_on_user_error, binder_set_stop_on_user_error,
binder_stop_on_user_error = 2; \
} while (0)
+#define to_flat_binder_object(hdr) \
+ container_of(hdr, struct flat_binder_object, hdr)
+
+#define to_binder_fd_object(hdr) container_of(hdr, struct binder_fd_object, hdr)
+
+#define to_binder_buffer_object(hdr) \
+ container_of(hdr, struct binder_buffer_object, hdr)
+
+#define to_binder_fd_array_object(hdr) \
+ container_of(hdr, struct binder_fd_array_object, hdr)
+
enum binder_stat_types {
BINDER_STAT_PROC,
BINDER_STAT_THREAD,
@@ -157,26 +200,27 @@ enum binder_stat_types {
};
struct binder_stats {
- int br[_IOC_NR(BR_FAILED_REPLY) + 1];
- int bc[_IOC_NR(BC_DEAD_BINDER_DONE) + 1];
- int obj_created[BINDER_STAT_COUNT];
- int obj_deleted[BINDER_STAT_COUNT];
+ atomic_t br[_IOC_NR(BR_FAILED_REPLY) + 1];
+ atomic_t bc[_IOC_NR(BC_REPLY_SG) + 1];
+ atomic_t obj_created[BINDER_STAT_COUNT];
+ atomic_t obj_deleted[BINDER_STAT_COUNT];
};
static struct binder_stats binder_stats;
static inline void binder_stats_deleted(enum binder_stat_types type)
{
- binder_stats.obj_deleted[type]++;
+ atomic_inc(&binder_stats.obj_deleted[type]);
}
static inline void binder_stats_created(enum binder_stat_types type)
{
- binder_stats.obj_created[type]++;
+ atomic_inc(&binder_stats.obj_created[type]);
}
struct binder_transaction_log_entry {
int debug_id;
+ int debug_id_done;
int call_type;
int from_proc;
int from_thread;
@@ -186,10 +230,14 @@ struct binder_transaction_log_entry {
int to_node;
int data_size;
int offsets_size;
+ int return_error_line;
+ uint32_t return_error;
+ uint32_t return_error_param;
+ const char *context_name;
};
struct binder_transaction_log {
- int next;
- int full;
+ atomic_t cur;
+ bool full;
struct binder_transaction_log_entry entry[32];
};
static struct binder_transaction_log binder_transaction_log;
@@ -199,22 +247,50 @@ static struct binder_transaction_log_entry *binder_transaction_log_add(
struct binder_transaction_log *log)
{
struct binder_transaction_log_entry *e;
+ unsigned int cur = atomic_inc_return(&log->cur);
- e = &log->entry[log->next];
- memset(e, 0, sizeof(*e));
- log->next++;
- if (log->next == ARRAY_SIZE(log->entry)) {
- log->next = 0;
+ if (cur >= ARRAY_SIZE(log->entry))
log->full = 1;
- }
+ e = &log->entry[cur % ARRAY_SIZE(log->entry)];
+ WRITE_ONCE(e->debug_id_done, 0);
+ /*
+ * write-barrier to synchronize access to e->debug_id_done.
+ * We make sure the initialized 0 value is seen before
+ * the other fields are zeroed by memset().
+ */
+ smp_wmb();
+ memset(e, 0, sizeof(*e));
return e;
}
+struct binder_context {
+ struct binder_node *binder_context_mgr_node;
+ struct mutex context_mgr_node_lock;
+
+ kuid_t binder_context_mgr_uid;
+ const char *name;
+};
+
+struct binder_device {
+ struct hlist_node hlist;
+ struct miscdevice miscdev;
+ struct binder_context context;
+};
+
+/**
+ * struct binder_work - work enqueued on a worklist
+ * @entry: node enqueued on list
+ * @type: type of work to be performed
+ *
+ * There are separate work lists for proc, thread, and node (async).
+ */
struct binder_work {
struct list_head entry;
+
enum {
BINDER_WORK_TRANSACTION = 1,
BINDER_WORK_TRANSACTION_COMPLETE,
+ BINDER_WORK_RETURN_ERROR,
BINDER_WORK_NODE,
BINDER_WORK_DEAD_BINDER,
BINDER_WORK_DEAD_BINDER_AND_CLEAR,
@@ -222,8 +298,76 @@ struct binder_work {
} type;
};
+struct binder_error {
+ struct binder_work work;
+ uint32_t cmd;
+};
+
+/**
+ * struct binder_node - binder node bookkeeping
+ * @debug_id: unique ID for debugging
+ * (invariant after initialized)
+ * @lock: lock for node fields
+ * @work: worklist element for node work
+ * (protected by @proc->inner_lock)
+ * @rb_node: element for proc->nodes tree
+ * (protected by @proc->inner_lock)
+ * @dead_node: element for binder_dead_nodes list
+ * (protected by binder_dead_nodes_lock)
+ * @proc: binder_proc that owns this node
+ * (invariant after initialized)
+ * @refs: list of references on this node
+ * (protected by @lock)
+ * @internal_strong_refs: used to take strong references when
+ * initiating a transaction
+ * (protected by @proc->inner_lock if @proc
+ * and by @lock)
+ * @local_weak_refs: weak user refs from local process
+ * (protected by @proc->inner_lock if @proc
+ * and by @lock)
+ * @local_strong_refs: strong user refs from local process
+ * (protected by @proc->inner_lock if @proc
+ * and by @lock)
+ * @tmp_refs: temporary kernel refs
+ * (protected by @proc->inner_lock while @proc
+ * is valid, and by binder_dead_nodes_lock
+ * if @proc is NULL. During inc/dec and node release
+ * it is also protected by @lock to provide safety
+ * as the node dies and @proc becomes NULL)
+ * @ptr: userspace pointer for node
+ * (invariant, no lock needed)
+ * @cookie: userspace cookie for node
+ * (invariant, no lock needed)
+ * @has_strong_ref: userspace notified of strong ref
+ * (protected by @proc->inner_lock if @proc
+ * and by @lock)
+ * @pending_strong_ref: userspace has acked notification of strong ref
+ * (protected by @proc->inner_lock if @proc
+ * and by @lock)
+ * @has_weak_ref: userspace notified of weak ref
+ * (protected by @proc->inner_lock if @proc
+ * and by @lock)
+ * @pending_weak_ref: userspace has acked notification of weak ref
+ * (protected by @proc->inner_lock if @proc
+ * and by @lock)
+ * @has_async_transaction: async transaction to node in progress
+ * (protected by @lock)
+ * @sched_policy: minimum scheduling policy for node
+ * (invariant after initialized)
+ * @accept_fds: file descriptor operations supported for node
+ * (invariant after initialized)
+ * @min_priority: minimum scheduling priority
+ * (invariant after initialized)
+ * @inherit_rt: inherit RT scheduling policy from caller
+ * (invariant after initialized)
+ * @async_todo: list of async work items
+ * (protected by @proc->inner_lock)
+ *
+ * Bookkeeping structure for binder nodes.
+ */
struct binder_node {
int debug_id;
+ spinlock_t lock;
struct binder_work work;
union {
struct rb_node rb_node;
@@ -234,87 +378,181 @@ struct binder_node {
int internal_strong_refs;
int local_weak_refs;
int local_strong_refs;
+ int tmp_refs;
binder_uintptr_t ptr;
binder_uintptr_t cookie;
- unsigned has_strong_ref:1;
- unsigned pending_strong_ref:1;
- unsigned has_weak_ref:1;
- unsigned pending_weak_ref:1;
- unsigned has_async_transaction:1;
- unsigned accept_fds:1;
- unsigned min_priority:8;
+ struct {
+ /*
+ * bitfield elements protected by
+ * proc inner_lock
+ */
+ u8 has_strong_ref:1;
+ u8 pending_strong_ref:1;
+ u8 has_weak_ref:1;
+ u8 pending_weak_ref:1;
+ };
+ struct {
+ /*
+ * invariant after initialization
+ */
+ u8 sched_policy:2;
+ u8 inherit_rt:1;
+ u8 accept_fds:1;
+ u8 min_priority;
+ };
+ bool has_async_transaction;
struct list_head async_todo;
};
struct binder_ref_death {
+ /**
+ * @work: worklist element for death notifications
+ * (protected by inner_lock of the proc that
+ * this ref belongs to)
+ */
struct binder_work work;
binder_uintptr_t cookie;
};
+/**
+ * struct binder_ref_data - binder_ref counts and id
+ * @debug_id: unique ID for the ref
+ * @desc: unique userspace handle for ref
+ * @strong: strong ref count (debugging only if not locked)
+ * @weak: weak ref count (debugging only if not locked)
+ *
+ * Structure to hold ref count and ref id information. Since
+ * the actual ref can only be accessed with a lock, this structure
+ * is used to return information about the ref to callers of
+ * ref inc/dec functions.
+ */
+struct binder_ref_data {
+ int debug_id;
+ uint32_t desc;
+ int strong;
+ int weak;
+};
+
+/**
+ * struct binder_ref - struct to track references on nodes
+ * @data: binder_ref_data containing id, handle, and current refcounts
+ * @rb_node_desc: node for lookup by @data.desc in proc's rb_tree
+ * @rb_node_node: node for lookup by @node in proc's rb_tree
+ * @node_entry: list entry for node->refs list in target node
+ * (protected by @node->lock)
+ * @proc: binder_proc containing ref
+ * @node: binder_node of target node. When cleaning up a
+ * ref for deletion in binder_cleanup_ref, a non-NULL
+ * @node indicates the node must be freed
+ * @death: pointer to death notification (ref_death) if requested
+ * (protected by @node->lock)
+ *
+ * Structure to track references from procA to target node (on procB). This
+ * structure is unsafe to access without holding @proc->outer_lock.
+ */
struct binder_ref {
/* Lookups needed: */
/* node + proc => ref (transaction) */
/* desc + proc => ref (transaction, inc/dec ref) */
/* node => refs + procs (proc exit) */
- int debug_id;
+ struct binder_ref_data data;
struct rb_node rb_node_desc;
struct rb_node rb_node_node;
struct hlist_node node_entry;
struct binder_proc *proc;
struct binder_node *node;
- uint32_t desc;
- int strong;
- int weak;
struct binder_ref_death *death;
};
-struct binder_buffer {
- struct list_head entry; /* free and allocated entries by address */
- struct rb_node rb_node; /* free entry by size or allocated entry */
- /* by address */
- unsigned free:1;
- unsigned allow_user_free:1;
- unsigned async_transaction:1;
- unsigned debug_id:29;
-
- struct binder_transaction *transaction;
-
- struct binder_node *target_node;
- size_t data_size;
- size_t offsets_size;
- uint8_t data[0];
+enum binder_deferred_state {
+ BINDER_DEFERRED_FLUSH = 0x01,
+ BINDER_DEFERRED_RELEASE = 0x02,
};
-enum binder_deferred_state {
- BINDER_DEFERRED_PUT_FILES = 0x01,
- BINDER_DEFERRED_FLUSH = 0x02,
- BINDER_DEFERRED_RELEASE = 0x04,
+/**
+ * struct binder_priority - scheduler policy and priority
+ * @sched_policy: scheduler policy
+ * @prio: [100..139] for SCHED_NORMAL, [0..99] for FIFO/RT
+ *
+ * The binder driver supports inheriting the following scheduler policies:
+ * SCHED_NORMAL
+ * SCHED_BATCH
+ * SCHED_FIFO
+ * SCHED_RR
+ */
+struct binder_priority {
+ unsigned int sched_policy;
+ int prio;
};
+/**
+ * struct binder_proc - binder process bookkeeping
+ * @proc_node: element for binder_procs list
+ * @threads: rbtree of binder_threads in this proc
+ * (protected by @inner_lock)
+ * @nodes: rbtree of binder nodes associated with
+ * this proc ordered by node->ptr
+ * (protected by @inner_lock)
+ * @refs_by_desc: rbtree of refs ordered by ref->desc
+ * (protected by @outer_lock)
+ * @refs_by_node: rbtree of refs ordered by ref->node
+ * (protected by @outer_lock)
+ * @waiting_threads: threads currently waiting for proc work
+ * (protected by @inner_lock)
+ * @pid: PID of group_leader of process
+ * (invariant after initialized)
+ * @tsk: task_struct for group_leader of process
+ * (invariant after initialized)
+ * @deferred_work_node: element for binder_deferred_list
+ * (protected by binder_deferred_lock)
+ * @deferred_work: bitmap of deferred work to perform
+ * (protected by binder_deferred_lock)
+ * @is_dead: process is dead and awaiting free
+ * when outstanding transactions are cleaned up
+ * (protected by @inner_lock)
+ * @todo: list of work for this process
+ * (protected by @inner_lock)
+ * @wait: wait queue head to wait for proc work
+ * (invariant after initialized)
+ * @stats: per-process binder statistics
+ * (atomics, no lock needed)
+ * @delivered_death: list of delivered death notification
+ * (protected by @inner_lock)
+ * @max_threads: cap on number of binder threads
+ * (protected by @inner_lock)
+ * @requested_threads: number of binder threads requested but not
+ * yet started. In current implementation, can
+ * only be 0 or 1.
+ * (protected by @inner_lock)
+ * @requested_threads_started: number of binder threads started
+ * (protected by @inner_lock)
+ * @tmp_ref: temporary reference to indicate proc is in use
+ * (protected by @inner_lock)
+ * @default_priority: default scheduler priority
+ * (invariant after initialized)
+ * @debugfs_entry: debugfs node
+ * @alloc: binder allocator bookkeeping
+ * @context: binder_context for this proc
+ * (invariant after initialized)
+ * @inner_lock: can nest under outer_lock and/or node lock
+ * @outer_lock: no nesting under inner or node lock
+ * Lock order: 1) outer, 2) node, 3) inner
+ *
+ * Bookkeeping structure for binder processes
+ */
struct binder_proc {
struct hlist_node proc_node;
struct rb_root threads;
struct rb_root nodes;
struct rb_root refs_by_desc;
struct rb_root refs_by_node;
+ struct list_head waiting_threads;
int pid;
- struct vm_area_struct *vma;
- struct mm_struct *vma_vm_mm;
struct task_struct *tsk;
- struct files_struct *files;
struct hlist_node deferred_work_node;
int deferred_work;
- void *buffer;
- ptrdiff_t user_buffer_offset;
+ bool is_dead;
- struct list_head buffers;
- struct rb_root free_buffers;
- struct rb_root allocated_buffers;
- size_t free_async_space;
-
- struct page **pages;
- size_t buffer_size;
- uint32_t buffer_free;
struct list_head todo;
wait_queue_head_t wait;
struct binder_stats stats;
@@ -322,9 +560,13 @@ struct binder_proc {
int max_threads;
int requested_threads;
int requested_threads_started;
- int ready_threads;
- long default_priority;
+ int tmp_ref;
+ struct binder_priority default_priority;
struct dentry *debugfs_entry;
+ struct binder_alloc alloc;
+ struct binder_context *context;
+ spinlock_t inner_lock;
+ spinlock_t outer_lock;
};
enum {
@@ -333,22 +575,60 @@ enum {
BINDER_LOOPER_STATE_EXITED = 0x04,
BINDER_LOOPER_STATE_INVALID = 0x08,
BINDER_LOOPER_STATE_WAITING = 0x10,
- BINDER_LOOPER_STATE_NEED_RETURN = 0x20
+ BINDER_LOOPER_STATE_POLL = 0x20,
};
+/**
+ * struct binder_thread - binder thread bookkeeping
+ * @proc: binder process for this thread
+ * (invariant after initialization)
+ * @rb_node: element for proc->threads rbtree
+ * (protected by @proc->inner_lock)
+ * @waiting_thread_node: element for @proc->waiting_threads list
+ * (protected by @proc->inner_lock)
+ * @pid: PID for this thread
+ * (invariant after initialization)
+ * @looper: bitmap of looping state
+ * (only accessed by this thread)
+ * @looper_need_return: looping thread needs to exit driver
+ * (no lock needed)
+ * @transaction_stack: stack of in-progress transactions for this thread
+ * (protected by @proc->inner_lock)
+ * @todo: list of work to do for this thread
+ * (protected by @proc->inner_lock)
+ * @return_error: transaction errors reported by this thread
+ * (only accessed by this thread)
+ * @reply_error: transaction errors reported by target thread
+ * (protected by @proc->inner_lock)
+ * @wait: wait queue for thread work
+ * @stats: per-thread statistics
+ * (atomics, no lock needed)
+ * @tmp_ref: temporary reference to indicate thread is in use
+ * (atomic since @proc->inner_lock cannot
+ * always be acquired)
+ * @is_dead: thread is dead and awaiting free
+ * when outstanding transactions are cleaned up
+ * (protected by @proc->inner_lock)
+ * @task: struct task_struct for this thread
+ *
+ * Bookkeeping structure for binder threads.
+ */
struct binder_thread {
struct binder_proc *proc;
struct rb_node rb_node;
+ struct list_head waiting_thread_node;
int pid;
- int looper;
+ int looper; /* only modified by this thread */
+ bool looper_need_return; /* can be written by other thread */
struct binder_transaction *transaction_stack;
struct list_head todo;
- uint32_t return_error; /* Write failed, return error code in read buf */
- uint32_t return_error2; /* Write failed, return error code in read */
- /* buffer. Used when sending a reply to a dead process that */
- /* we are also waiting on */
+ struct binder_error return_error;
+ struct binder_error reply_error;
wait_queue_head_t wait;
struct binder_stats stats;
+ atomic_t tmp_ref;
+ bool is_dead;
+ struct task_struct *task;
};
struct binder_transaction {
@@ -365,30 +645,286 @@ struct binder_transaction {
struct binder_buffer *buffer;
unsigned int code;
unsigned int flags;
- long priority;
- long saved_priority;
+ struct binder_priority priority;
+ struct binder_priority saved_priority;
+ bool set_priority_called;
kuid_t sender_euid;
+ /**
+ * @lock: protects @from, @to_proc, and @to_thread
+ *
+ * @from, @to_proc, and @to_thread can be set to NULL
+ * during thread teardown
+ */
+ spinlock_t lock;
};
+/**
+ * binder_proc_lock() - Acquire outer lock for given binder_proc
+ * @proc: struct binder_proc to acquire
+ *
+ * Acquires proc->outer_lock. Used to protect binder_ref
+ * structures associated with the given proc.
+ */
+#define binder_proc_lock(proc) _binder_proc_lock(proc, __LINE__)
+static void
+_binder_proc_lock(struct binder_proc *proc, int line)
+{
+ binder_debug(BINDER_DEBUG_SPINLOCKS,
+ "%s: line=%d\n", __func__, line);
+ spin_lock(&proc->outer_lock);
+}
+
+/**
+ * binder_proc_unlock() - Release spinlock for given binder_proc
+ * @proc: struct binder_proc to release
+ *
+ * Release lock acquired via binder_proc_lock()
+ */
+#define binder_proc_unlock(_proc) _binder_proc_unlock(_proc, __LINE__)
+static void
+_binder_proc_unlock(struct binder_proc *proc, int line)
+{
+ binder_debug(BINDER_DEBUG_SPINLOCKS,
+ "%s: line=%d\n", __func__, line);
+ spin_unlock(&proc->outer_lock);
+}
+
+/**
+ * binder_inner_proc_lock() - Acquire inner lock for given binder_proc
+ * @proc: struct binder_proc to acquire
+ *
+ * Acquires proc->inner_lock. Used to protect todo lists
+ */
+#define binder_inner_proc_lock(proc) _binder_inner_proc_lock(proc, __LINE__)
+static void
+_binder_inner_proc_lock(struct binder_proc *proc, int line)
+{
+ binder_debug(BINDER_DEBUG_SPINLOCKS,
+ "%s: line=%d\n", __func__, line);
+ spin_lock(&proc->inner_lock);
+}
+
+/**
+ * binder_inner_proc_unlock() - Release inner lock for given binder_proc
+ * @proc: struct binder_proc to release
+ *
+ * Release lock acquired via binder_inner_proc_lock()
+ */
+#define binder_inner_proc_unlock(proc) _binder_inner_proc_unlock(proc, __LINE__)
+static void
+_binder_inner_proc_unlock(struct binder_proc *proc, int line)
+{
+ binder_debug(BINDER_DEBUG_SPINLOCKS,
+ "%s: line=%d\n", __func__, line);
+ spin_unlock(&proc->inner_lock);
+}
+
+/**
+ * binder_node_lock() - Acquire spinlock for given binder_node
+ * @node: struct binder_node to acquire
+ *
+ * Acquires node->lock. Used to protect binder_node fields
+ */
+#define binder_node_lock(node) _binder_node_lock(node, __LINE__)
+static void
+_binder_node_lock(struct binder_node *node, int line)
+{
+ binder_debug(BINDER_DEBUG_SPINLOCKS,
+ "%s: line=%d\n", __func__, line);
+ spin_lock(&node->lock);
+}
+
+/**
+ * binder_node_unlock() - Release spinlock for given binder_node
+ * @node: struct binder_node to release
+ *
+ * Release lock acquired via binder_node_lock()
+ */
+#define binder_node_unlock(node) _binder_node_unlock(node, __LINE__)
+static void
+_binder_node_unlock(struct binder_node *node, int line)
+{
+ binder_debug(BINDER_DEBUG_SPINLOCKS,
+ "%s: line=%d\n", __func__, line);
+ spin_unlock(&node->lock);
+}
+
+/**
+ * binder_node_inner_lock() - Acquire node and inner locks
+ * @node: struct binder_node to acquire
+ *
+ * Acquires node->lock. If node->proc is non-NULL, also acquires
+ * proc->inner_lock. Used to protect binder_node fields
+ */
+#define binder_node_inner_lock(node) _binder_node_inner_lock(node, __LINE__)
+static void
+_binder_node_inner_lock(struct binder_node *node, int line)
+{
+ binder_debug(BINDER_DEBUG_SPINLOCKS,
+ "%s: line=%d\n", __func__, line);
+ spin_lock(&node->lock);
+ if (node->proc)
+ binder_inner_proc_lock(node->proc);
+}
+
+/**
+ * binder_node_inner_unlock() - Release node and inner locks
+ * @node: struct binder_node to release
+ *
+ * Release locks acquired via binder_node_inner_lock()
+ */
+#define binder_node_inner_unlock(node) _binder_node_inner_unlock(node, __LINE__)
+static void
+_binder_node_inner_unlock(struct binder_node *node, int line)
+{
+ struct binder_proc *proc = node->proc;
+
+ binder_debug(BINDER_DEBUG_SPINLOCKS,
+ "%s: line=%d\n", __func__, line);
+ if (proc)
+ binder_inner_proc_unlock(proc);
+ spin_unlock(&node->lock);
+}
+
+static bool binder_worklist_empty_ilocked(struct list_head *list)
+{
+ return list_empty(list);
+}
+
+/**
+ * binder_worklist_empty() - Check if no items on the work list
+ * @proc: binder_proc associated with list
+ * @list: list to check
+ *
+ * Return: true if there are no items on list, else false
+ */
+static bool binder_worklist_empty(struct binder_proc *proc,
+ struct list_head *list)
+{
+ bool ret;
+
+ binder_inner_proc_lock(proc);
+ ret = binder_worklist_empty_ilocked(list);
+ binder_inner_proc_unlock(proc);
+ return ret;
+}
+
+static void
+binder_enqueue_work_ilocked(struct binder_work *work,
+ struct list_head *target_list)
+{
+ BUG_ON(target_list == NULL);
+ BUG_ON(work->entry.next && !list_empty(&work->entry));
+ list_add_tail(&work->entry, target_list);
+}
+
+/**
+ * binder_enqueue_work() - Add an item to the work list
+ * @proc: binder_proc associated with list
+ * @work: struct binder_work to add to list
+ * @target_list: list to add work to
+ *
+ * Adds the work to the specified list. Asserts that work
+ * is not already on a list.
+ */
+static void
+binder_enqueue_work(struct binder_proc *proc,
+ struct binder_work *work,
+ struct list_head *target_list)
+{
+ binder_inner_proc_lock(proc);
+ binder_enqueue_work_ilocked(work, target_list);
+ binder_inner_proc_unlock(proc);
+}
+
+static void
+binder_dequeue_work_ilocked(struct binder_work *work)
+{
+ list_del_init(&work->entry);
+}
+
+/**
+ * binder_dequeue_work() - Removes an item from the work list
+ * @proc: binder_proc associated with list
+ * @work: struct binder_work to remove from list
+ *
+ * Removes the specified work item from whatever list it is on.
+ * Can safely be called if work is not on any list.
+ */
+static void
+binder_dequeue_work(struct binder_proc *proc, struct binder_work *work)
+{
+ binder_inner_proc_lock(proc);
+ binder_dequeue_work_ilocked(work);
+ binder_inner_proc_unlock(proc);
+}
+
+static struct binder_work *binder_dequeue_work_head_ilocked(
+ struct list_head *list)
+{
+ struct binder_work *w;
+
+ w = list_first_entry_or_null(list, struct binder_work, entry);
+ if (w)
+ list_del_init(&w->entry);
+ return w;
+}
+
+/**
+ * binder_dequeue_work_head() - Dequeues the item at head of list
+ * @proc: binder_proc associated with list
+ * @list: list to dequeue head
+ *
+ * Removes the head of the list if there are items on the list
+ *
+ * Return: pointer to dequeued binder_work, NULL if list was empty
+ */
+static struct binder_work *binder_dequeue_work_head(
+ struct binder_proc *proc,
+ struct list_head *list)
+{
+ struct binder_work *w;
+
+ binder_inner_proc_lock(proc);
+ w = binder_dequeue_work_head_ilocked(list);
+ binder_inner_proc_unlock(proc);
+ return w;
+}
+
static void
binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer);
+static void binder_free_thread(struct binder_thread *thread);
+static void binder_free_proc(struct binder_proc *proc);
+static void binder_inc_node_tmpref_ilocked(struct binder_node *node);
+
+struct files_struct *binder_get_files_struct(struct binder_proc *proc)
+{
+ return get_files_struct(proc->tsk);
+}
static int task_get_unused_fd_flags(struct binder_proc *proc, int flags)
{
- struct files_struct *files = proc->files;
+ struct files_struct *files;
unsigned long rlim_cur;
unsigned long irqs;
+ int ret;
+ files = binder_get_files_struct(proc);
if (files == NULL)
return -ESRCH;
- if (!lock_task_sighand(proc->tsk, &irqs))
- return -EMFILE;
+ if (!lock_task_sighand(proc->tsk, &irqs)) {
+ ret = -EMFILE;
+ goto err;
+ }
rlim_cur = task_rlimit(proc->tsk, RLIMIT_NOFILE);
unlock_task_sighand(proc->tsk, &irqs);
- return __alloc_fd(files, 0, rlim_cur, flags);
+ ret = __alloc_fd(files, 0, rlim_cur, flags);
+err:
+ put_files_struct(files);
+ return ret;
}
/*
@@ -397,8 +933,12 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags)
static void task_fd_install(
struct binder_proc *proc, unsigned int fd, struct file *file)
{
- if (proc->files)
- __fd_install(proc->files, fd, file);
+ struct files_struct *files = binder_get_files_struct(proc);
+
+ if (files) {
+ __fd_install(files, fd, file);
+ put_files_struct(files);
+ }
}
/*
@@ -406,469 +946,299 @@ static void task_fd_install(
*/
static long task_close_fd(struct binder_proc *proc, unsigned int fd)
{
+ struct files_struct *files = binder_get_files_struct(proc);
int retval;
- if (proc->files == NULL)
+ if (files == NULL)
return -ESRCH;
- retval = __close_fd(proc->files, fd);
+ retval = __close_fd(files, fd);
/* can't restart close syscall because file table entry was cleared */
if (unlikely(retval == -ERESTARTSYS ||
retval == -ERESTARTNOINTR ||
retval == -ERESTARTNOHAND ||
retval == -ERESTART_RESTARTBLOCK))
retval = -EINTR;
+ put_files_struct(files);
return retval;
}
-static inline void binder_lock(const char *tag)
+static bool binder_has_work_ilocked(struct binder_thread *thread,
+ bool do_proc_work)
{
- trace_binder_lock(tag);
- mutex_lock(&binder_main_lock);
- trace_binder_locked(tag);
+ return !binder_worklist_empty_ilocked(&thread->todo) ||
+ thread->looper_need_return ||
+ (do_proc_work &&
+ !binder_worklist_empty_ilocked(&thread->proc->todo));
}
-static inline void binder_unlock(const char *tag)
+static bool binder_has_work(struct binder_thread *thread, bool do_proc_work)
{
- trace_binder_unlock(tag);
- mutex_unlock(&binder_main_lock);
-}
+ bool has_work;
-static void binder_set_nice(long nice)
-{
- long min_nice;
+ binder_inner_proc_lock(thread->proc);
+ has_work = binder_has_work_ilocked(thread, do_proc_work);
+ binder_inner_proc_unlock(thread->proc);
- if (can_nice(current, nice)) {
- set_user_nice(current, nice);
- return;
- }
- min_nice = rlimit_to_nice(current->signal->rlim[RLIMIT_NICE].rlim_cur);
- binder_debug(BINDER_DEBUG_PRIORITY_CAP,
- "%d: nice value %ld not allowed use %ld instead\n",
- current->pid, nice, min_nice);
- set_user_nice(current, min_nice);
- if (min_nice <= MAX_NICE)
- return;
- binder_user_error("%d RLIMIT_NICE not set\n", current->pid);
+ return has_work;
}
-static size_t binder_buffer_size(struct binder_proc *proc,
- struct binder_buffer *buffer)
+static bool binder_available_for_proc_work_ilocked(struct binder_thread *thread)
{
- if (list_is_last(&buffer->entry, &proc->buffers))
- return proc->buffer + proc->buffer_size - (void *)buffer->data;
- return (size_t)list_entry(buffer->entry.next,
- struct binder_buffer, entry) - (size_t)buffer->data;
+ return !thread->transaction_stack &&
+ binder_worklist_empty_ilocked(&thread->todo) &&
+ (thread->looper & (BINDER_LOOPER_STATE_ENTERED |
+ BINDER_LOOPER_STATE_REGISTERED));
}
-static void binder_insert_free_buffer(struct binder_proc *proc,
- struct binder_buffer *new_buffer)
+static void binder_wakeup_poll_threads_ilocked(struct binder_proc *proc,
+ bool sync)
{
- struct rb_node **p = &proc->free_buffers.rb_node;
- struct rb_node *parent = NULL;
- struct binder_buffer *buffer;
- size_t buffer_size;
- size_t new_buffer_size;
-
- BUG_ON(!new_buffer->free);
-
- new_buffer_size = binder_buffer_size(proc, new_buffer);
-
- binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
- "%d: add free buffer, size %zd, at %p\n",
- proc->pid, new_buffer_size, new_buffer);
-
- while (*p) {
- parent = *p;
- buffer = rb_entry(parent, struct binder_buffer, rb_node);
- BUG_ON(!buffer->free);
-
- buffer_size = binder_buffer_size(proc, buffer);
+ struct rb_node *n;
+ struct binder_thread *thread;
- if (new_buffer_size < buffer_size)
- p = &parent->rb_left;
- else
- p = &parent->rb_right;
+ for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) {
+ thread = rb_entry(n, struct binder_thread, rb_node);
+ if (thread->looper & BINDER_LOOPER_STATE_POLL &&
+ binder_available_for_proc_work_ilocked(thread)) {
+ if (sync)
+ wake_up_interruptible_sync(&thread->wait);
+ else
+ wake_up_interruptible(&thread->wait);
+ }
}
- rb_link_node(&new_buffer->rb_node, parent, p);
- rb_insert_color(&new_buffer->rb_node, &proc->free_buffers);
}
-static void binder_insert_allocated_buffer(struct binder_proc *proc,
- struct binder_buffer *new_buffer)
+/**
+ * binder_select_thread_ilocked() - selects a thread for doing proc work.
+ * @proc: process to select a thread from
+ *
+ * Note that calling this function moves the thread off the waiting_threads
+ * list, so it can only be woken up by the caller of this function, or a
+ * signal. Therefore, callers *should* always wake up the thread this function
+ * returns.
+ *
+ * Return: If there's a thread currently waiting for process work,
+ * returns that thread. Otherwise returns NULL.
+ */
+static struct binder_thread *
+binder_select_thread_ilocked(struct binder_proc *proc)
{
- struct rb_node **p = &proc->allocated_buffers.rb_node;
- struct rb_node *parent = NULL;
- struct binder_buffer *buffer;
+ struct binder_thread *thread;
- BUG_ON(new_buffer->free);
+ assert_spin_locked(&proc->inner_lock);
+ thread = list_first_entry_or_null(&proc->waiting_threads,
+ struct binder_thread,
+ waiting_thread_node);
- while (*p) {
- parent = *p;
- buffer = rb_entry(parent, struct binder_buffer, rb_node);
- BUG_ON(buffer->free);
+ if (thread)
+ list_del_init(&thread->waiting_thread_node);
- if (new_buffer < buffer)
- p = &parent->rb_left;
- else if (new_buffer > buffer)
- p = &parent->rb_right;
- else
- BUG();
- }
- rb_link_node(&new_buffer->rb_node, parent, p);
- rb_insert_color(&new_buffer->rb_node, &proc->allocated_buffers);
+ return thread;
}
-static struct binder_buffer *binder_buffer_lookup(struct binder_proc *proc,
- uintptr_t user_ptr)
+/**
+ * binder_wakeup_thread_ilocked() - wakes up a thread for doing proc work.
+ * @proc: process to wake up a thread in
+ * @thread: specific thread to wake-up (may be NULL)
+ * @sync: whether to do a synchronous wake-up
+ *
+ * This function wakes up a thread in the @proc process.
+ * The caller may provide a specific thread to wake-up in
+ * the @thread parameter. If @thread is NULL, this function
+ * will wake up threads that have called poll().
+ *
+ * Note that for this function to work as expected, callers
+ * should first call binder_select_thread_ilocked() to find a
+ * thread to handle the work (if they don't have a thread
+ * already), and pass the result into the @thread parameter.
+ */
+static void binder_wakeup_thread_ilocked(struct binder_proc *proc,
+ struct binder_thread *thread,
+ bool sync)
{
- struct rb_node *n = proc->allocated_buffers.rb_node;
- struct binder_buffer *buffer;
- struct binder_buffer *kern_ptr;
-
- kern_ptr = (struct binder_buffer *)(user_ptr - proc->user_buffer_offset
- - offsetof(struct binder_buffer, data));
-
- while (n) {
- buffer = rb_entry(n, struct binder_buffer, rb_node);
- BUG_ON(buffer->free);
+ assert_spin_locked(&proc->inner_lock);
- if (kern_ptr < buffer)
- n = n->rb_left;
- else if (kern_ptr > buffer)
- n = n->rb_right;
+ if (thread) {
+ if (sync)
+ wake_up_interruptible_sync(&thread->wait);
else
- return buffer;
+ wake_up_interruptible(&thread->wait);
+ return;
}
- return NULL;
+
+ /* Didn't find a thread waiting for proc work; this can happen
+ * in two scenarios:
+ * 1. All threads are busy handling transactions
+ * In that case, one of those threads should call back into
+ * the kernel driver soon and pick up this work.
+ * 2. Threads are using the (e)poll interface, in which case
+ * they may be blocked on the waitqueue without having been
+ * added to waiting_threads. For this case, we just iterate
+ * over all threads not handling transaction work, and
+ * wake them all up. We wake all because we don't know whether
+ * a thread that called into (e)poll is handling non-binder
+ * work currently.
+ */
+ binder_wakeup_poll_threads_ilocked(proc, sync);
}
-static int binder_update_page_range(struct binder_proc *proc, int allocate,
- void *start, void *end,
- struct vm_area_struct *vma)
+static void binder_wakeup_proc_ilocked(struct binder_proc *proc)
{
- void *page_addr;
- unsigned long user_page_addr;
- struct page **page;
- struct mm_struct *mm;
+ struct binder_thread *thread = binder_select_thread_ilocked(proc);
- binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
- "%d: %s pages %p-%p\n", proc->pid,
- allocate ? "allocate" : "free", start, end);
-
- if (end <= start)
- return 0;
+ binder_wakeup_thread_ilocked(proc, thread, /* sync = */false);
+}
- trace_binder_update_page_range(proc, allocate, start, end);
+static bool is_rt_policy(int policy)
+{
+ return policy == SCHED_FIFO || policy == SCHED_RR;
+}
- if (vma)
- mm = NULL;
- else
- mm = get_task_mm(proc->tsk);
-
- if (mm) {
- down_write(&mm->mmap_sem);
- vma = proc->vma;
- if (vma && mm != proc->vma_vm_mm) {
- pr_err("%d: vma mm and task mm mismatch\n",
- proc->pid);
- vma = NULL;
- }
- }
+static bool is_fair_policy(int policy)
+{
+ return policy == SCHED_NORMAL || policy == SCHED_BATCH;
+}
- if (allocate == 0)
- goto free_range;
+static bool binder_supported_policy(int policy)
+{
+ return is_fair_policy(policy) || is_rt_policy(policy);
+}
- if (vma == NULL) {
- pr_err("%d: binder_alloc_buf failed to map pages in userspace, no vma\n",
- proc->pid);
- goto err_no_vma;
- }
+/*
+ * Convert a kernel priority into the user-space value for @policy:
+ * a nice value for the fair policies, otherwise the priority mirrored
+ * around MAX_USER_RT_PRIO - 1.
+ */
+static int to_userspace_prio(int policy, int kernel_priority)
+{
+ if (!is_fair_policy(policy))
+ return MAX_USER_RT_PRIO - 1 - kernel_priority;
+
+ return PRIO_TO_NICE(kernel_priority);
+}
- for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) {
- int ret;
+/*
+ * Convert a user-space priority value into a kernel priority for @policy:
+ * NICE_TO_PRIO() for the fair policies, otherwise the value mirrored
+ * around MAX_USER_RT_PRIO - 1.
+ */
+static int to_kernel_prio(int policy, int user_priority)
+{
+ if (!is_fair_policy(policy))
+ return MAX_USER_RT_PRIO - 1 - user_priority;
+
+ return NICE_TO_PRIO(user_priority);
+}
- page = &proc->pages[(page_addr - proc->buffer) / PAGE_SIZE];
+static void binder_do_set_priority(struct task_struct *task,
+ struct binder_priority desired,
+ bool verify)
+{
+ int priority; /* user-space prio value */
+ bool has_cap_nice;
+ unsigned int policy = desired.sched_policy;
- BUG_ON(*page);
- *page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
- if (*page == NULL) {
- pr_err("%d: binder_alloc_buf failed for page at %p\n",
- proc->pid, page_addr);
- goto err_alloc_page_failed;
- }
- ret = map_kernel_range_noflush((unsigned long)page_addr,
- PAGE_SIZE, PAGE_KERNEL, page);
- flush_cache_vmap((unsigned long)page_addr,
- (unsigned long)page_addr + PAGE_SIZE);
- if (ret != 1) {
- pr_err("%d: binder_alloc_buf failed to map page at %p in kernel\n",
- proc->pid, page_addr);
- goto err_map_kernel_failed;
- }
- user_page_addr =
- (uintptr_t)page_addr + proc->user_buffer_offset;
- ret = vm_insert_page(vma, user_page_addr, page[0]);
- if (ret) {
- pr_err("%d: binder_alloc_buf failed to map page at %lx in userspace\n",
- proc->pid, user_page_addr);
- goto err_vm_insert_page_failed;
- }
- /* vm_insert_page does not seem to increment the refcount */
- }
- if (mm) {
- up_write(&mm->mmap_sem);
- mmput(mm);
- }
- return 0;
+ if (task->policy == policy && task->normal_prio == desired.prio)
+ return;
-free_range:
- for (page_addr = end - PAGE_SIZE; page_addr >= start;
- page_addr -= PAGE_SIZE) {
- page = &proc->pages[(page_addr - proc->buffer) / PAGE_SIZE];
- if (vma)
- zap_page_range(vma, (uintptr_t)page_addr +
- proc->user_buffer_offset, PAGE_SIZE, NULL);
-err_vm_insert_page_failed:
- unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE);
-err_map_kernel_failed:
- __free_page(*page);
- *page = NULL;
-err_alloc_page_failed:
- ;
- }
-err_no_vma:
- if (mm) {
- up_write(&mm->mmap_sem);
- mmput(mm);
- }
- return -ENOMEM;
-}
-
-static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc,
- size_t data_size,
- size_t offsets_size, int is_async)
-{
- struct rb_node *n = proc->free_buffers.rb_node;
- struct binder_buffer *buffer;
- size_t buffer_size;
- struct rb_node *best_fit = NULL;
- void *has_page_addr;
- void *end_page_addr;
- size_t size;
-
- if (proc->vma == NULL) {
- pr_err("%d: binder_alloc_buf, no vma\n",
- proc->pid);
- return NULL;
- }
+ has_cap_nice = has_capability_noaudit(task, CAP_SYS_NICE);
- size = ALIGN(data_size, sizeof(void *)) +
- ALIGN(offsets_size, sizeof(void *));
+ priority = to_userspace_prio(policy, desired.prio);
- if (size < data_size || size < offsets_size) {
- binder_user_error("%d: got transaction with invalid size %zd-%zd\n",
- proc->pid, data_size, offsets_size);
- return NULL;
- }
+ if (verify && is_rt_policy(policy) && !has_cap_nice) {
+ long max_rtprio = task_rlimit(task, RLIMIT_RTPRIO);
- if (is_async &&
- proc->free_async_space < size + sizeof(struct binder_buffer)) {
- binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
- "%d: binder_alloc_buf size %zd failed, no async space left\n",
- proc->pid, size);
- return NULL;
+ if (max_rtprio == 0) {
+ policy = SCHED_NORMAL;
+ priority = MIN_NICE;
+ } else if (priority > max_rtprio) {
+ priority = max_rtprio;
+ }
}
- while (n) {
- buffer = rb_entry(n, struct binder_buffer, rb_node);
- BUG_ON(!buffer->free);
- buffer_size = binder_buffer_size(proc, buffer);
+ if (verify && is_fair_policy(policy) && !has_cap_nice) {
+ long min_nice = rlimit_to_nice(task_rlimit(task, RLIMIT_NICE));
- if (size < buffer_size) {
- best_fit = n;
- n = n->rb_left;
- } else if (size > buffer_size)
- n = n->rb_right;
- else {
- best_fit = n;
- break;
+ if (min_nice > MAX_NICE) {
+ binder_user_error("%d RLIMIT_NICE not set\n",
+ task->pid);
+ return;
+ } else if (priority < min_nice) {
+ priority = min_nice;
}
}
- if (best_fit == NULL) {
- pr_err("%d: binder_alloc_buf size %zd failed, no address space\n",
- proc->pid, size);
- return NULL;
- }
- if (n == NULL) {
- buffer = rb_entry(best_fit, struct binder_buffer, rb_node);
- buffer_size = binder_buffer_size(proc, buffer);
- }
- binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
- "%d: binder_alloc_buf size %zd got buffer %p size %zd\n",
- proc->pid, size, buffer, buffer_size);
+ if (policy != desired.sched_policy ||
+ to_kernel_prio(policy, priority) != desired.prio)
+ binder_debug(BINDER_DEBUG_PRIORITY_CAP,
+ "%d: priority %d not allowed, using %d instead\n",
+ task->pid, desired.prio,
+ to_kernel_prio(policy, priority));
- has_page_addr =
- (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK);
- if (n == NULL) {
- if (size + sizeof(struct binder_buffer) + 4 >= buffer_size)
- buffer_size = size; /* no room for other buffers */
- else
- buffer_size = size + sizeof(struct binder_buffer);
- }
- end_page_addr =
- (void *)PAGE_ALIGN((uintptr_t)buffer->data + buffer_size);
- if (end_page_addr > has_page_addr)
- end_page_addr = has_page_addr;
- if (binder_update_page_range(proc, 1,
- (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr, NULL))
- return NULL;
+ /* Set the actual priority */
+ if (task->policy != policy || is_rt_policy(policy)) {
+ struct sched_param params;
- rb_erase(best_fit, &proc->free_buffers);
- buffer->free = 0;
- binder_insert_allocated_buffer(proc, buffer);
- if (buffer_size != size) {
- struct binder_buffer *new_buffer = (void *)buffer->data + size;
+ params.sched_priority = is_rt_policy(policy) ? priority : 0;
- list_add(&new_buffer->entry, &buffer->entry);
- new_buffer->free = 1;
- binder_insert_free_buffer(proc, new_buffer);
+ sched_setscheduler_nocheck(task,
+ policy | SCHED_RESET_ON_FORK,
+ &params);
}
- binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
- "%d: binder_alloc_buf size %zd got %p\n",
- proc->pid, size, buffer);
- buffer->data_size = data_size;
- buffer->offsets_size = offsets_size;
- buffer->async_transaction = is_async;
- if (is_async) {
- proc->free_async_space -= size + sizeof(struct binder_buffer);
- binder_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC,
- "%d: binder_alloc_buf size %zd async free %zd\n",
- proc->pid, size, proc->free_async_space);
- }
-
- return buffer;
+ if (is_fair_policy(policy))
+ set_user_nice(task, priority);
}
-static void *buffer_start_page(struct binder_buffer *buffer)
+static void binder_set_priority(struct task_struct *task,
+ struct binder_priority desired)
{
- return (void *)((uintptr_t)buffer & PAGE_MASK);
+ binder_do_set_priority(task, desired, /* verify = */ true);
}
-static void *buffer_end_page(struct binder_buffer *buffer)
+static void binder_restore_priority(struct task_struct *task,
+ struct binder_priority desired)
{
- return (void *)(((uintptr_t)(buffer + 1) - 1) & PAGE_MASK);
+ binder_do_set_priority(task, desired, /* verify = */ false);
}
-static void binder_delete_free_buffer(struct binder_proc *proc,
- struct binder_buffer *buffer)
+/**
+ * binder_transaction_priority() - apply a transaction's priority to a task
+ * @task: task whose scheduling policy/priority is adjusted
+ * @t: transaction supplying the requested priority; also receives
+ * the task's current policy/priority in t->saved_priority
+ * @node_prio: minimum priority configured on the target node
+ * @inherit_rt: whether a real-time policy requested by @t may be used
+ *
+ * Does nothing if the priority was already set for @t
+ * (t->set_priority_called). Otherwise records the task's current
+ * policy and normal_prio in @t, picks the desired priority and hands
+ * it to binder_set_priority().
+ */
+static void binder_transaction_priority(struct task_struct *task,
+ struct binder_transaction *t,
+ struct binder_priority node_prio,
+ bool inherit_rt)
{
- struct binder_buffer *prev, *next = NULL;
- int free_page_end = 1;
- int free_page_start = 1;
-
- BUG_ON(proc->buffers.next == &buffer->entry);
- prev = list_entry(buffer->entry.prev, struct binder_buffer, entry);
- BUG_ON(!prev->free);
- if (buffer_end_page(prev) == buffer_start_page(buffer)) {
- free_page_start = 0;
- if (buffer_end_page(prev) == buffer_end_page(buffer))
- free_page_end = 0;
- binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
- "%d: merge free, buffer %p share page with %p\n",
- proc->pid, buffer, prev);
- }
+ struct binder_priority desired_prio;
- if (!list_is_last(&buffer->entry, &proc->buffers)) {
- next = list_entry(buffer->entry.next,
- struct binder_buffer, entry);
- if (buffer_start_page(next) == buffer_end_page(buffer)) {
- free_page_end = 0;
- if (buffer_start_page(next) ==
- buffer_start_page(buffer))
- free_page_start = 0;
- binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
- "%d: merge free, buffer %p share page with %p\n",
- proc->pid, buffer, prev);
- }
- }
- list_del(&buffer->entry);
- if (free_page_start || free_page_end) {
- binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
- "%d: merge free, buffer %p do not share page%s%s with %p or %p\n",
- proc->pid, buffer, free_page_start ? "" : " end",
- free_page_end ? "" : " start", prev, next);
- binder_update_page_range(proc, 0, free_page_start ?
- buffer_start_page(buffer) : buffer_end_page(buffer),
- (free_page_end ? buffer_end_page(buffer) :
- buffer_start_page(buffer)) + PAGE_SIZE, NULL);
- }
-}
-
-static void binder_free_buf(struct binder_proc *proc,
- struct binder_buffer *buffer)
-{
- size_t size, buffer_size;
-
- buffer_size = binder_buffer_size(proc, buffer);
-
- size = ALIGN(buffer->data_size, sizeof(void *)) +
- ALIGN(buffer->offsets_size, sizeof(void *));
-
- binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
- "%d: binder_free_buf %p size %zd buffer_size %zd\n",
- proc->pid, buffer, size, buffer_size);
-
- BUG_ON(buffer->free);
- BUG_ON(size > buffer_size);
- BUG_ON(buffer->transaction != NULL);
- BUG_ON((void *)buffer < proc->buffer);
- BUG_ON((void *)buffer > proc->buffer + proc->buffer_size);
+ if (t->set_priority_called)
+ return;
- if (buffer->async_transaction) {
- proc->free_async_space += size + sizeof(struct binder_buffer);
+ t->set_priority_called = true;
+ t->saved_priority.sched_policy = task->policy;
+ t->saved_priority.prio = task->normal_prio;
- binder_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC,
- "%d: binder_free_buf size %zd async free %zd\n",
- proc->pid, size, proc->free_async_space);
+ /*
+ * Test the policy requested by the transaction: desired_prio has
+ * not been assigned yet at this point, so it must not be read.
+ * A real-time request that may not be inherited falls back to
+ * SCHED_NORMAL at nice 0.
+ */
+ if (!inherit_rt && is_rt_policy(t->priority.sched_policy)) {
+ desired_prio.prio = NICE_TO_PRIO(0);
+ desired_prio.sched_policy = SCHED_NORMAL;
+ } else {
+ desired_prio.prio = t->priority.prio;
+ desired_prio.sched_policy = t->priority.sched_policy;
}
- binder_update_page_range(proc, 0,
- (void *)PAGE_ALIGN((uintptr_t)buffer->data),
- (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK),
- NULL);
- rb_erase(&buffer->rb_node, &proc->allocated_buffers);
- buffer->free = 1;
- if (!list_is_last(&buffer->entry, &proc->buffers)) {
- struct binder_buffer *next = list_entry(buffer->entry.next,
- struct binder_buffer, entry);
-
- if (next->free) {
- rb_erase(&next->rb_node, &proc->free_buffers);
- binder_delete_free_buffer(proc, next);
- }
+ if (node_prio.prio < t->priority.prio ||
+ (node_prio.prio == t->priority.prio &&
+ node_prio.sched_policy == SCHED_FIFO)) {
+ /*
+ * In case the minimum priority on the node is
+ * higher (lower value), use that priority. If
+ * the priority is the same, but the node uses
+ * SCHED_FIFO, prefer SCHED_FIFO, since it can
+ * run unbounded, unlike SCHED_RR.
+ */
+ desired_prio = node_prio;
}
- if (proc->buffers.next != &buffer->entry) {
- struct binder_buffer *prev = list_entry(buffer->entry.prev,
- struct binder_buffer, entry);
- if (prev->free) {
- binder_delete_free_buffer(proc, buffer);
- rb_erase(&prev->rb_node, &proc->free_buffers);
- buffer = prev;
- }
- }
- binder_insert_free_buffer(proc, buffer);
+ binder_set_priority(task, desired_prio);
}
-static struct binder_node *binder_get_node(struct binder_proc *proc,
- binder_uintptr_t ptr)
+static struct binder_node *binder_get_node_ilocked(struct binder_proc *proc,
+ binder_uintptr_t ptr)
{
struct rb_node *n = proc->nodes.rb_node;
struct binder_node *node;
+ assert_spin_locked(&proc->inner_lock);
+
while (n) {
node = rb_entry(n, struct binder_node, rb_node);
@@ -876,21 +1246,47 @@ static struct binder_node *binder_get_node(struct binder_proc *proc,
n = n->rb_left;
else if (ptr > node->ptr)
n = n->rb_right;
- else
+ else {
+ /*
+ * take an implicit weak reference
+ * to ensure node stays alive until
+ * call to binder_put_node()
+ */
+ binder_inc_node_tmpref_ilocked(node);
return node;
+ }
}
return NULL;
}
-static struct binder_node *binder_new_node(struct binder_proc *proc,
- binder_uintptr_t ptr,
- binder_uintptr_t cookie)
+/*
+ * binder_get_node() - look up the node for @ptr in @proc
+ * @proc: process whose node tree is searched
+ * @ptr: userspace pointer value identifying the node
+ *
+ * Locked wrapper around binder_get_node_ilocked(): takes the inner proc
+ * lock for the duration of the lookup. A node that is found is returned
+ * with a temporary reference taken by the lookup, which keeps it alive
+ * until the caller invokes binder_put_node().
+ *
+ * Return: the matching node, or NULL if there is none
+ */
+static struct binder_node *binder_get_node(struct binder_proc *proc,
+ binder_uintptr_t ptr)
+{
+ struct binder_node *node;
+
+ binder_inner_proc_lock(proc);
+ node = binder_get_node_ilocked(proc, ptr);
+ binder_inner_proc_unlock(proc);
+ return node;
+}
+
+/**
+ * binder_init_node_ilocked() - insert a pre-allocated node into @proc
+ * @proc: process that will own the node; its inner lock must be held
+ * @new_node: zeroed node allocated by the caller
+ * @fp: flat object supplying ptr, cookie and flags; when NULL all
+ * three are taken as 0
+ *
+ * Searches proc->nodes for a node with the same ptr. If one exists it
+ * gets a temporary reference and is returned, and @new_node is left
+ * untouched for the caller to free. Otherwise @new_node is linked into
+ * the tree, initialized from @fp and returned with tmp_refs incremented.
+ *
+ * Return: the node now associated with the ptr (either an existing one
+ * or @new_node)
+ */
+static struct binder_node *binder_init_node_ilocked(
+ struct binder_proc *proc,
+ struct binder_node *new_node,
+ struct flat_binder_object *fp)
{
struct rb_node **p = &proc->nodes.rb_node;
struct rb_node *parent = NULL;
struct binder_node *node;
+ binder_uintptr_t ptr = fp ? fp->binder : 0;
+ binder_uintptr_t cookie = fp ? fp->cookie : 0;
+ __u32 flags = fp ? fp->flags : 0;
+ s8 priority;
+
+ assert_spin_locked(&proc->inner_lock);
while (*p) {
+
parent = *p;
node = rb_entry(parent, struct binder_node, rb_node);
@@ -898,39 +1294,86 @@ static struct binder_node *binder_new_node(struct binder_proc *proc,
p = &(*p)->rb_left;
else if (ptr > node->ptr)
p = &(*p)->rb_right;
- else
- return NULL;
+ else {
+ /*
+ * A matching node is already in
+ * the rb tree. Abandon the init
+ * and return it.
+ */
+ binder_inc_node_tmpref_ilocked(node);
+ return node;
+ }
}
-
- node = kzalloc(sizeof(*node), GFP_KERNEL);
- if (node == NULL)
- return NULL;
+ node = new_node;
binder_stats_created(BINDER_STAT_NODE);
+ node->tmp_refs++;
rb_link_node(&node->rb_node, parent, p);
rb_insert_color(&node->rb_node, &proc->nodes);
- node->debug_id = ++binder_last_id;
+ node->debug_id = atomic_inc_return(&binder_last_id);
node->proc = proc;
node->ptr = ptr;
node->cookie = cookie;
node->work.type = BINDER_WORK_NODE;
+ priority = flags & FLAT_BINDER_FLAG_PRIORITY_MASK;
+ /*
+ * The scheduling policy lives in its own bit field of the flags;
+ * isolate it with the policy mask before shifting it down.
+ */
+ node->sched_policy = (flags & FLAT_BINDER_FLAG_SCHED_POLICY_MASK) >>
+ FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT;
+ node->min_priority = to_kernel_prio(node->sched_policy, priority);
+ node->accept_fds = !!(flags & FLAT_BINDER_FLAG_ACCEPTS_FDS);
+ node->inherit_rt = !!(flags & FLAT_BINDER_FLAG_INHERIT_RT);
+ spin_lock_init(&node->lock);
INIT_LIST_HEAD(&node->work.entry);
INIT_LIST_HEAD(&node->async_todo);
binder_debug(BINDER_DEBUG_INTERNAL_REFS,
"%d:%d node %d u%016llx c%016llx created\n",
proc->pid, current->pid, node->debug_id,
(u64)node->ptr, (u64)node->cookie);
+
return node;
}
-static int binder_inc_node(struct binder_node *node, int strong, int internal,
- struct list_head *target_list)
+/*
+ * binder_new_node() - allocate a node and register it with @proc
+ * @proc: process that will own the node
+ * @fp: flat object describing the node, passed through to
+ * binder_init_node_ilocked()
+ *
+ * The allocation is done before the inner proc lock is taken. If the
+ * init step hands back a different node (one was already added by
+ * another thread), the fresh allocation is freed.
+ *
+ * Return: the node for @fp, or NULL if the allocation failed
+ */
+static struct binder_node *binder_new_node(struct binder_proc *proc,
+ struct flat_binder_object *fp)
{
+ struct binder_node *candidate;
+ struct binder_node *result;
+
+ candidate = kzalloc(sizeof(*candidate), GFP_KERNEL);
+ if (candidate == NULL)
+ return NULL;
+
+ binder_inner_proc_lock(proc);
+ result = binder_init_node_ilocked(proc, candidate, fp);
+ binder_inner_proc_unlock(proc);
+
+ /* The node was already added by another thread */
+ if (result != candidate)
+ kfree(candidate);
+
+ return result;
+}
+
+/*
+ * binder_free_node() - release the memory of a node
+ * @node: node to free; must not be used by the caller afterwards
+ *
+ * Frees @node and records the deletion in the BINDER_STAT_NODE
+ * statistics.
+ */
+static void binder_free_node(struct binder_node *node)
+{
+ kfree(node);
+ binder_stats_deleted(BINDER_STAT_NODE);
+}
+
+static int binder_inc_node_nilocked(struct binder_node *node, int strong,
+ int internal,
+ struct list_head *target_list)
+{
+ struct binder_proc *proc = node->proc;
+
+ assert_spin_locked(&node->lock);
+ if (proc)
+ assert_spin_locked(&proc->inner_lock);
if (strong) {
if (internal) {
if (target_list == NULL &&
node->internal_strong_refs == 0 &&
- !(node == binder_context_mgr_node &&
- node->has_strong_ref)) {
+ !(node->proc &&
+ node == node->proc->context->
+ binder_context_mgr_node &&
+ node->has_strong_ref)) {
pr_err("invalid inc strong node for %d\n",
node->debug_id);
return -EINVAL;
@@ -939,8 +1382,8 @@ static int binder_inc_node(struct binder_node *node, int strong, int internal,
} else
node->local_strong_refs++;
if (!node->has_strong_ref && target_list) {
- list_del_init(&node->work.entry);
- list_add_tail(&node->work.entry, target_list);
+ binder_dequeue_work_ilocked(&node->work);
+ binder_enqueue_work_ilocked(&node->work, target_list);
}
} else {
if (!internal)
@@ -951,58 +1394,169 @@ static int binder_inc_node(struct binder_node *node, int strong, int internal,
node->debug_id);
return -EINVAL;
}
- list_add_tail(&node->work.entry, target_list);
+ binder_enqueue_work_ilocked(&node->work, target_list);
}
}
return 0;
}
-static int binder_dec_node(struct binder_node *node, int strong, int internal)
+static int binder_inc_node(struct binder_node *node, int strong, int internal,
+ struct list_head *target_list)
+{
+ int ret;
+
+ binder_node_inner_lock(node);
+ ret = binder_inc_node_nilocked(node, strong, internal, target_list);
+ binder_node_inner_unlock(node);
+
+ return ret;
+}
+
+static bool binder_dec_node_nilocked(struct binder_node *node,
+ int strong, int internal)
{
+ struct binder_proc *proc = node->proc;
+
+ assert_spin_locked(&node->lock);
+ if (proc)
+ assert_spin_locked(&proc->inner_lock);
if (strong) {
if (internal)
node->internal_strong_refs--;
else
node->local_strong_refs--;
if (node->local_strong_refs || node->internal_strong_refs)
- return 0;
+ return false;
} else {
if (!internal)
node->local_weak_refs--;
- if (node->local_weak_refs || !hlist_empty(&node->refs))
- return 0;
+ if (node->local_weak_refs || node->tmp_refs ||
+ !hlist_empty(&node->refs))
+ return false;
}
- if (node->proc && (node->has_strong_ref || node->has_weak_ref)) {
+
+ if (proc && (node->has_strong_ref || node->has_weak_ref)) {
if (list_empty(&node->work.entry)) {
- list_add_tail(&node->work.entry, &node->proc->todo);
- wake_up_interruptible(&node->proc->wait);
+ binder_enqueue_work_ilocked(&node->work, &proc->todo);
+ binder_wakeup_proc_ilocked(proc);
}
} else {
if (hlist_empty(&node->refs) && !node->local_strong_refs &&
- !node->local_weak_refs) {
- list_del_init(&node->work.entry);
- if (node->proc) {
- rb_erase(&node->rb_node, &node->proc->nodes);
+ !node->local_weak_refs && !node->tmp_refs) {
+ if (proc) {
+ binder_dequeue_work_ilocked(&node->work);
+ rb_erase(&node->rb_node, &proc->nodes);
binder_debug(BINDER_DEBUG_INTERNAL_REFS,
"refless node %d deleted\n",
node->debug_id);
} else {
+ BUG_ON(!list_empty(&node->work.entry));
+ spin_lock(&binder_dead_nodes_lock);
+ /*
+ * tmp_refs could have changed so
+ * check it again
+ */
+ if (node->tmp_refs) {
+ spin_unlock(&binder_dead_nodes_lock);
+ return false;
+ }
hlist_del(&node->dead_node);
+ spin_unlock(&binder_dead_nodes_lock);
binder_debug(BINDER_DEBUG_INTERNAL_REFS,
"dead node %d deleted\n",
node->debug_id);
}
- kfree(node);
- binder_stats_deleted(BINDER_STAT_NODE);
+ return true;
}
}
+ return false;
+}
- return 0;
+/*
+ * binder_dec_node() - drop a reference on a node
+ * @node: node whose count is decremented
+ * @strong: non-zero to drop a strong reference, zero for a weak one
+ * @internal: non-zero for an internal reference, zero for a local one
+ *
+ * Locked wrapper around binder_dec_node_nilocked(). When that helper
+ * reports that the node can be freed, the free happens here, after the
+ * locks have been released.
+ */
+static void binder_dec_node(struct binder_node *node, int strong, int internal)
+{
+ bool free_node;
+
+ binder_node_inner_lock(node);
+ free_node = binder_dec_node_nilocked(node, strong, internal);
+ binder_node_inner_unlock(node);
+ if (free_node)
+ binder_free_node(node);
}
+static void binder_inc_node_tmpref_ilocked(struct binder_node *node)
+{
+ /*
+ * No call to binder_inc_node() is needed since we
+ * don't need to inform userspace of any changes to
+ * tmp_refs
+ */
+ node->tmp_refs++;
+}
-static struct binder_ref *binder_get_ref(struct binder_proc *proc,
- u32 desc, bool need_strong_ref)
+/**
+ * binder_inc_node_tmpref() - take a temporary reference on node
+ * @node: node to reference
+ *
+ * Take reference on node to prevent the node from being freed
+ * while referenced only by a local variable. The inner lock is
+ * needed to serialize with the node work on the queue (which
+ * isn't needed after the node is dead). If the node is dead
+ * (node->proc is NULL), use binder_dead_nodes_lock to protect
+ * node->tmp_refs against dead-node-only cases where the node
+ * lock cannot be acquired (eg traversing the dead node list to
+ * print nodes)
+ */
+static void binder_inc_node_tmpref(struct binder_node *node)
+{
+ binder_node_lock(node);
+ if (node->proc)
+ binder_inner_proc_lock(node->proc);
+ else
+ spin_lock(&binder_dead_nodes_lock);
+ binder_inc_node_tmpref_ilocked(node);
+ if (node->proc)
+ binder_inner_proc_unlock(node->proc);
+ else
+ spin_unlock(&binder_dead_nodes_lock);
+ binder_node_unlock(node);
+}
+
+/**
+ * binder_dec_node_tmpref() - remove a temporary reference on node
+ * @node: node to reference
+ *
+ * Release temporary reference on node taken via binder_inc_node_tmpref()
+ */
+static void binder_dec_node_tmpref(struct binder_node *node)
+{
+ bool free_node;
+
+ binder_node_inner_lock(node);
+ if (!node->proc)
+ spin_lock(&binder_dead_nodes_lock);
+ node->tmp_refs--;
+ BUG_ON(node->tmp_refs < 0);
+ if (!node->proc)
+ spin_unlock(&binder_dead_nodes_lock);
+ /*
+ * Call binder_dec_node() to check if all refcounts are 0
+ * and cleanup is needed. Calling with strong=0 and internal=1
+ * causes no actual reference to be released in binder_dec_node().
+ * If that changes, a change is needed here too.
+ */
+ free_node = binder_dec_node_nilocked(node, 0, 1);
+ binder_node_inner_unlock(node);
+ if (free_node)
+ binder_free_node(node);
+}
+
+static void binder_put_node(struct binder_node *node)
+{
+ binder_dec_node_tmpref(node);
+}
+
+static struct binder_ref *binder_get_ref_olocked(struct binder_proc *proc,
+ u32 desc, bool need_strong_ref)
{
struct rb_node *n = proc->refs_by_desc.rb_node;
struct binder_ref *ref;
@@ -1010,11 +1564,11 @@ static struct binder_ref *binder_get_ref(struct binder_proc *proc,
while (n) {
ref = rb_entry(n, struct binder_ref, rb_node_desc);
- if (desc < ref->desc) {
+ if (desc < ref->data.desc) {
n = n->rb_left;
- } else if (desc > ref->desc) {
+ } else if (desc > ref->data.desc) {
n = n->rb_right;
- } else if (need_strong_ref && !ref->strong) {
+ } else if (need_strong_ref && !ref->data.strong) {
binder_user_error("tried to use weak ref as strong ref\n");
return NULL;
} else {
@@ -1024,13 +1578,34 @@ static struct binder_ref *binder_get_ref(struct binder_proc *proc,
return NULL;
}
-static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc,
- struct binder_node *node)
+/**
+ * binder_get_ref_for_node_olocked() - get the ref associated with given node
+ * @proc: binder_proc that owns the ref
+ * @node: binder_node of target
+ * @new_ref: newly allocated binder_ref to be initialized or %NULL
+ *
+ * Look up the ref for the given node and return it if it exists
+ *
+ * If it doesn't exist and the caller provides a newly allocated
+ * ref, initialize the fields of the newly allocated ref and insert
+ * into the given proc rb_trees and node refs list.
+ *
+ * Return: the ref for node. It is possible that another thread
+ * allocated/initialized the ref first in which case the
+ * returned ref would be different than the passed-in
+ * new_ref. new_ref must be kfree'd by the caller in
+ * this case.
+ */
+static struct binder_ref *binder_get_ref_for_node_olocked(
+ struct binder_proc *proc,
+ struct binder_node *node,
+ struct binder_ref *new_ref)
{
- struct rb_node *n;
+ struct binder_context *context = proc->context;
struct rb_node **p = &proc->refs_by_node.rb_node;
struct rb_node *parent = NULL;
- struct binder_ref *ref, *new_ref;
+ struct binder_ref *ref;
+ struct rb_node *n;
while (*p) {
parent = *p;
@@ -1043,22 +1618,22 @@ static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc,
else
return ref;
}
- new_ref = kzalloc(sizeof(*ref), GFP_KERNEL);
- if (new_ref == NULL)
+ if (!new_ref)
return NULL;
+
binder_stats_created(BINDER_STAT_REF);
- new_ref->debug_id = ++binder_last_id;
+ new_ref->data.debug_id = atomic_inc_return(&binder_last_id);
new_ref->proc = proc;
new_ref->node = node;
rb_link_node(&new_ref->rb_node_node, parent, p);
rb_insert_color(&new_ref->rb_node_node, &proc->refs_by_node);
- new_ref->desc = (node == binder_context_mgr_node) ? 0 : 1;
+ new_ref->data.desc = (node == context->binder_context_mgr_node) ? 0 : 1;
for (n = rb_first(&proc->refs_by_desc); n != NULL; n = rb_next(n)) {
ref = rb_entry(n, struct binder_ref, rb_node_desc);
- if (ref->desc > new_ref->desc)
+ if (ref->data.desc > new_ref->data.desc)
break;
- new_ref->desc = ref->desc + 1;
+ new_ref->data.desc = ref->data.desc + 1;
}
p = &proc->refs_by_desc.rb_node;
@@ -1066,121 +1641,423 @@ static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc,
parent = *p;
ref = rb_entry(parent, struct binder_ref, rb_node_desc);
- if (new_ref->desc < ref->desc)
+ if (new_ref->data.desc < ref->data.desc)
p = &(*p)->rb_left;
- else if (new_ref->desc > ref->desc)
+ else if (new_ref->data.desc > ref->data.desc)
p = &(*p)->rb_right;
else
BUG();
}
rb_link_node(&new_ref->rb_node_desc, parent, p);
rb_insert_color(&new_ref->rb_node_desc, &proc->refs_by_desc);
- if (node) {
- hlist_add_head(&new_ref->node_entry, &node->refs);
- binder_debug(BINDER_DEBUG_INTERNAL_REFS,
- "%d new ref %d desc %d for node %d\n",
- proc->pid, new_ref->debug_id, new_ref->desc,
- node->debug_id);
- } else {
- binder_debug(BINDER_DEBUG_INTERNAL_REFS,
- "%d new ref %d desc %d for dead node\n",
- proc->pid, new_ref->debug_id, new_ref->desc);
- }
+ binder_node_lock(node);
+ hlist_add_head(&new_ref->node_entry, &node->refs);
+
+ binder_debug(BINDER_DEBUG_INTERNAL_REFS,
+ "%d new ref %d desc %d for node %d\n",
+ proc->pid, new_ref->data.debug_id, new_ref->data.desc,
+ node->debug_id);
+ binder_node_unlock(node);
return new_ref;
}
-static void binder_delete_ref(struct binder_ref *ref)
+static void binder_cleanup_ref_olocked(struct binder_ref *ref)
{
+ bool delete_node = false;
+
binder_debug(BINDER_DEBUG_INTERNAL_REFS,
"%d delete ref %d desc %d for node %d\n",
- ref->proc->pid, ref->debug_id, ref->desc,
+ ref->proc->pid, ref->data.debug_id, ref->data.desc,
ref->node->debug_id);
rb_erase(&ref->rb_node_desc, &ref->proc->refs_by_desc);
rb_erase(&ref->rb_node_node, &ref->proc->refs_by_node);
- if (ref->strong)
- binder_dec_node(ref->node, 1, 1);
+
+ binder_node_inner_lock(ref->node);
+ if (ref->data.strong)
+ binder_dec_node_nilocked(ref->node, 1, 1);
+
hlist_del(&ref->node_entry);
- binder_dec_node(ref->node, 0, 1);
+ delete_node = binder_dec_node_nilocked(ref->node, 0, 1);
+ binder_node_inner_unlock(ref->node);
+ /*
+ * Clear ref->node unless we want the caller to free the node
+ */
+ if (!delete_node) {
+ /*
+ * The caller uses ref->node to determine
+ * whether the node needs to be freed. Clear
+ * it since the node is still alive.
+ */
+ ref->node = NULL;
+ }
+
if (ref->death) {
binder_debug(BINDER_DEBUG_DEAD_BINDER,
"%d delete ref %d desc %d has death notification\n",
- ref->proc->pid, ref->debug_id, ref->desc);
- list_del(&ref->death->work.entry);
- kfree(ref->death);
+ ref->proc->pid, ref->data.debug_id,
+ ref->data.desc);
+ binder_dequeue_work(ref->proc, &ref->death->work);
binder_stats_deleted(BINDER_STAT_DEATH);
}
- kfree(ref);
binder_stats_deleted(BINDER_STAT_REF);
}
-static int binder_inc_ref(struct binder_ref *ref, int strong,
- struct list_head *target_list)
+/**
+ * binder_inc_ref_olocked() - increment the ref for given handle
+ * @ref: ref to be incremented
+ * @strong: if true, strong increment, else weak
+ * @target_list: list to queue node work on
+ *
+ * Increment the ref. @ref->proc->outer_lock must be held on entry
+ *
+ * Return: 0, if successful, else errno
+ */
+static int binder_inc_ref_olocked(struct binder_ref *ref, int strong,
+ struct list_head *target_list)
{
int ret;
if (strong) {
- if (ref->strong == 0) {
+ if (ref->data.strong == 0) {
ret = binder_inc_node(ref->node, 1, 1, target_list);
if (ret)
return ret;
}
- ref->strong++;
+ ref->data.strong++;
} else {
- if (ref->weak == 0) {
+ if (ref->data.weak == 0) {
ret = binder_inc_node(ref->node, 0, 1, target_list);
if (ret)
return ret;
}
- ref->weak++;
+ ref->data.weak++;
}
return 0;
}
-
-static int binder_dec_ref(struct binder_ref *ref, int strong)
+/**
+ * binder_dec_ref_olocked() - dec the ref for given handle
+ * @ref: ref to be decremented
+ * @strong: if true, strong decrement, else weak
+ *
+ * Decrement the ref.
+ *
+ * Return: true if ref is cleaned up and ready to be freed
+ */
+static bool binder_dec_ref_olocked(struct binder_ref *ref, int strong)
{
if (strong) {
- if (ref->strong == 0) {
+ if (ref->data.strong == 0) {
binder_user_error("%d invalid dec strong, ref %d desc %d s %d w %d\n",
- ref->proc->pid, ref->debug_id,
- ref->desc, ref->strong, ref->weak);
- return -EINVAL;
- }
- ref->strong--;
- if (ref->strong == 0) {
- int ret;
-
- ret = binder_dec_node(ref->node, strong, 1);
- if (ret)
- return ret;
+ ref->proc->pid, ref->data.debug_id,
+ ref->data.desc, ref->data.strong,
+ ref->data.weak);
+ return false;
}
+ ref->data.strong--;
+ if (ref->data.strong == 0)
+ binder_dec_node(ref->node, strong, 1);
} else {
- if (ref->weak == 0) {
+ if (ref->data.weak == 0) {
binder_user_error("%d invalid dec weak, ref %d desc %d s %d w %d\n",
- ref->proc->pid, ref->debug_id,
- ref->desc, ref->strong, ref->weak);
- return -EINVAL;
+ ref->proc->pid, ref->data.debug_id,
+ ref->data.desc, ref->data.strong,
+ ref->data.weak);
+ return false;
}
- ref->weak--;
+ ref->data.weak--;
}
- if (ref->strong == 0 && ref->weak == 0)
- binder_delete_ref(ref);
- return 0;
+ if (ref->data.strong == 0 && ref->data.weak == 0) {
+ binder_cleanup_ref_olocked(ref);
+ return true;
+ }
+ return false;
}
-static void binder_pop_transaction(struct binder_thread *target_thread,
- struct binder_transaction *t)
+/**
+ * binder_get_node_from_ref() - look up the node behind a proc's handle
+ * @proc:		proc that owns the ref
+ * @desc:		handle (descriptor) identifying the ref in @proc
+ * @need_strong_ref:	if true, the lookup fails unless the ref is strong
+ * @rdata:		optional; receives a copy of the ref's id/refcount data
+ *
+ * The lookup and the copy into @rdata are done with the proc lock held.
+ * On success a temporary reference is taken on the node via
+ * binder_inc_node_tmpref(); the caller drops it with binder_put_node().
+ *
+ * Return: the binder_node, or NULL if no matching ref was found (or it
+ *	   was not strong while a strong ref was required)
+ */
+static struct binder_node *binder_get_node_from_ref(
+		struct binder_proc *proc,
+		u32 desc, bool need_strong_ref,
+		struct binder_ref_data *rdata)
+{
+	struct binder_node *found = NULL;
+	struct binder_ref *ref;
+
+	binder_proc_lock(proc);
+	ref = binder_get_ref_olocked(proc, desc, need_strong_ref);
+	if (ref) {
+		found = ref->node;
+		/* keep the node alive once the proc lock is dropped */
+		binder_inc_node_tmpref(found);
+		if (rdata)
+			*rdata = ref->data;
+	}
+	binder_proc_unlock(proc);
+
+	return found;
+}
+
+
+/**
+ * binder_free_ref() - free the binder_ref
+ * @ref: ref to free
+ *
+ * Free the binder_ref. Free the binder_node indicated by ref->node
+ * (if non-NULL) and the binder_ref_death indicated by ref->death.
+ *
+ * binder_cleanup_ref_olocked() clears ref->node when the node is still
+ * alive, so a non-NULL ref->node here means the node is to be freed
+ * together with the ref. ref->death is passed to kfree() without a
+ * NULL check.
+ */
+static void binder_free_ref(struct binder_ref *ref)
+{
+ if (ref->node)
+ binder_free_node(ref->node);
+ kfree(ref->death);
+ kfree(ref);
+}
+
+/**
+ * binder_update_ref_for_handle() - inc or dec the ref behind a handle
+ * @proc:	proc that owns the ref
+ * @desc:	handle (descriptor) identifying the ref in @proc
+ * @increment:	true to increment the reference, false to decrement it
+ * @strong:	true for a strong reference, false for a weak one
+ * @rdata:	optional; receives a copy of the ref's id/refcount data
+ *
+ * The ref is looked up and updated with the proc lock held. @strong is
+ * also used as the "need strong ref" argument of the lookup, so a strong
+ * update on a ref without a strong count fails the lookup. If a
+ * decrement cleaned the ref up, it is freed after the lock is released.
+ *
+ * Return: 0 if successful, -EINVAL if the handle was not found, or the
+ *	   error returned by the increment
+ */
+static int binder_update_ref_for_handle(struct binder_proc *proc,
+		uint32_t desc, bool increment, bool strong,
+		struct binder_ref_data *rdata)
+{
+	struct binder_ref *ref;
+	bool free_ref = false;
+	int status = 0;
+
+	binder_proc_lock(proc);
+	ref = binder_get_ref_olocked(proc, desc, strong);
+	if (!ref) {
+		binder_proc_unlock(proc);
+		return -EINVAL;
+	}
+
+	if (increment)
+		status = binder_inc_ref_olocked(ref, strong, NULL);
+	else
+		free_ref = binder_dec_ref_olocked(ref, strong);
+
+	/* snapshot while the ref is still protected by the proc lock */
+	if (rdata)
+		*rdata = ref->data;
+	binder_proc_unlock(proc);
+
+	if (free_ref)
+		binder_free_ref(ref);
+	return status;
+}
+
+
+/**
+ * binder_dec_ref_for_handle() - dec the ref for given handle
+ * @proc: proc containing the ref
+ * @desc: the handle associated with the ref
+ * @strong: true=strong reference, false=weak reference
+ * @rdata: the id/refcount data for the ref
+ *
+ * Just calls binder_update_ref_for_handle() to decrement the ref
+ * (i.e. with its "increment" argument set to false); @rdata is passed
+ * through unchanged.
+ *
+ * Return: 0 if successful, else errno
+ */
+static int binder_dec_ref_for_handle(struct binder_proc *proc,
+ uint32_t desc, bool strong, struct binder_ref_data *rdata)
+{
+ return binder_update_ref_for_handle(proc, desc, false, strong, rdata);
+}
+
+
+/**
+ * binder_inc_ref_for_node() - increment the ref for given proc/node
+ * @proc:	 proc containing the ref
+ * @node:	 target node
+ * @strong:	 true=strong reference, false=weak reference
+ * @target_list: worklist to use if node is incremented
+ * @rdata:	 the id/refcount data for the ref (may be %NULL)
+ *
+ * Given a proc and node, increment the ref. Create the ref if it
+ * doesn't already exist. The allocation is done with the proc lock
+ * dropped, so the lookup is repeated afterwards; if another thread
+ * inserted a ref in the meantime, that ref is used and the local
+ * allocation is freed.
+ *
+ * If the increment fails on a ref that was created by this call, the
+ * ref is unlinked and freed again so that no ref with zero strong and
+ * zero weak counts is left behind in @proc.
+ *
+ * Return: 0 if successful, else errno
+ */
+static int binder_inc_ref_for_node(struct binder_proc *proc,
+			struct binder_node *node,
+			bool strong,
+			struct list_head *target_list,
+			struct binder_ref_data *rdata)
+{
+	struct binder_ref *ref;
+	struct binder_ref *new_ref = NULL;
+	int ret = 0;
+
+	binder_proc_lock(proc);
+	ref = binder_get_ref_for_node_olocked(proc, node, NULL);
+	if (!ref) {
+		/* GFP_KERNEL allocation: done without the proc lock held */
+		binder_proc_unlock(proc);
+		new_ref = kzalloc(sizeof(*ref), GFP_KERNEL);
+		if (!new_ref)
+			return -ENOMEM;
+		binder_proc_lock(proc);
+		ref = binder_get_ref_for_node_olocked(proc, node, new_ref);
+	}
+	ret = binder_inc_ref_olocked(ref, strong, target_list);
+	if (rdata)
+		*rdata = ref->data;
+	if (ret && ref == new_ref) {
+		/*
+		 * The ref we just inserted could not be incremented.
+		 * Unlink it from the proc and the node now instead of
+		 * leaving a zero-count ref behind.
+		 * NOTE(review): assumes the caller still holds its own
+		 * reference on @node, so the cleanup does not hand the
+		 * node back to us for freeing -- confirm against callers.
+		 */
+		binder_cleanup_ref_olocked(new_ref);
+		ref = NULL;
+	}
+	binder_proc_unlock(proc);
+	if (new_ref && ref != new_ref) {
+		/*
+		 * Either another thread created the ref first or the
+		 * new ref was cleaned up above; in both cases free the
+		 * one we allocated.
+		 */
+		kfree(new_ref);
+	}
+	return ret;
+}
+
+static void binder_pop_transaction_ilocked(struct binder_thread *target_thread,
+ struct binder_transaction *t)
+{
+ BUG_ON(!target_thread);
+ assert_spin_locked(&target_thread->proc->inner_lock);
+ BUG_ON(target_thread->transaction_stack != t);
+ BUG_ON(target_thread->transaction_stack->from != target_thread);
+ target_thread->transaction_stack =
+ target_thread->transaction_stack->from_parent;
+ t->from = NULL;
+}
+
+/**
+ * binder_thread_dec_tmpref() - drop a temporary reference on a thread
+ * @thread: thread whose tmp_ref is decremented
+ *
+ * Counterpart of the tmp_ref increment done by binder_get_txn_from(),
+ * which keeps the thread named by t->from from being freed while it is
+ * in use. The decrement is performed with the proc inner lock held.
+ * If the thread is marked dead and the count is zero afterwards, the
+ * thread is freed once the lock has been released.
+ */
+static void binder_thread_dec_tmpref(struct binder_thread *thread)
+{
+	bool release;
+
+	/*
+	 * atomic is used to protect the counter value while
+	 * it cannot reach zero or thread->is_dead is false
+	 */
+	binder_inner_proc_lock(thread->proc);
+	atomic_dec(&thread->tmp_ref);
+	release = thread->is_dead && !atomic_read(&thread->tmp_ref);
+	binder_inner_proc_unlock(thread->proc);
+
+	if (release)
+		binder_free_thread(thread);
+}
+
+
+/**
+ * binder_proc_dec_tmpref() - decrement proc->tmp_ref
+ * @proc: proc to decrement
+ *
+ * A binder_proc needs to be kept alive while being used to create or
+ * handle a transaction. proc->tmp_ref is incremented when
+ * creating a new transaction or the binder_proc is currently in-use
+ * by threads that are being released. When done with the binder_proc,
+ * this function is called to decrement the counter and free the
+ * proc if appropriate (proc has been released, all threads have
+ * been released and not currently in-use to process a transaction).
+ */
+static void binder_proc_dec_tmpref(struct binder_proc *proc)
+{
+	bool release;
+
+	binder_inner_proc_lock(proc);
+	proc->tmp_ref--;
+	/* free only a dead proc with no threads and no temporary users left */
+	release = proc->is_dead && RB_EMPTY_ROOT(&proc->threads) &&
+		  !proc->tmp_ref;
+	binder_inner_proc_unlock(proc);
+
+	if (release)
+		binder_free_proc(proc);
+}
+
+/**
+ * binder_get_txn_from() - safely fetch the sending thread of a transaction
+ * @t: binder transaction whose t->from is read
+ *
+ * Reads t->from under t->lock and, if it is set, increments that
+ * thread's tmp_ref under the same lock so the thread stays alive until
+ * binder_thread_dec_tmpref() is called.
+ *
+ * Return: the value of t->from (may be NULL)
+ */
+static struct binder_thread *binder_get_txn_from(
+		struct binder_transaction *t)
+{
+	struct binder_thread *sender;
+
+	spin_lock(&t->lock);
+	sender = t->from;
+	if (sender != NULL)
+		atomic_inc(&sender->tmp_ref);
+	spin_unlock(&t->lock);
+
+	return sender;
+}
+
+
+/**
+ * binder_get_txn_from_and_acq_inner() - get t->from and acquire inner lock
+ * @t: binder transaction for t->from
+ *
+ * Same as binder_get_txn_from() except it also acquires the proc->inner_lock
+ * to guarantee that the thread cannot be released while operating on it.
+ * The caller must call binder_inner_proc_unlock() to release the inner lock
+ * as well as call binder_thread_dec_tmpref() to release the reference.
+ *
+ * Return: the value of t->from
+ */
+static struct binder_thread *binder_get_txn_from_and_acq_inner(
+		struct binder_transaction *t)
+{
+	struct binder_thread *sender = binder_get_txn_from(t);
+
+	if (!sender)
+		return NULL;
+
+	binder_inner_proc_lock(sender->proc);
+	if (!t->from) {
+		/*
+		 * t->from was cleared after we sampled it: drop the
+		 * inner lock and the temporary reference again.
+		 */
+		binder_inner_proc_unlock(sender->proc);
+		binder_thread_dec_tmpref(sender);
+		return NULL;
+	}
+	BUG_ON(sender != t->from);
+	/* returns with sender->proc's inner lock held */
+	return sender;
+}
+
+static void binder_free_transaction(struct binder_transaction *t)
+{
if (t->buffer)
t->buffer->transaction = NULL;
kfree(t);
@@ -1195,30 +2072,28 @@ static void binder_send_failed_reply(struct binder_transaction *t,
BUG_ON(t->flags & TF_ONE_WAY);
while (1) {
- target_thread = t->from;
+ target_thread = binder_get_txn_from_and_acq_inner(t);
if (target_thread) {
- if (target_thread->return_error != BR_OK &&
- target_thread->return_error2 == BR_OK) {
- target_thread->return_error2 =
- target_thread->return_error;
- target_thread->return_error = BR_OK;
- }
- if (target_thread->return_error == BR_OK) {
- binder_debug(BINDER_DEBUG_FAILED_TRANSACTION,
- "send failed reply for transaction %d to %d:%d\n",
- t->debug_id,
- target_thread->proc->pid,
- target_thread->pid);
-
- binder_pop_transaction(target_thread, t);
- target_thread->return_error = error_code;
+ binder_debug(BINDER_DEBUG_FAILED_TRANSACTION,
+ "send failed reply for transaction %d to %d:%d\n",
+ t->debug_id,
+ target_thread->proc->pid,
+ target_thread->pid);
+
+ binder_pop_transaction_ilocked(target_thread, t);
+ if (target_thread->reply_error.cmd == BR_OK) {
+ target_thread->reply_error.cmd = error_code;
+ binder_enqueue_work_ilocked(
+ &target_thread->reply_error.work,
+ &target_thread->todo);
wake_up_interruptible(&target_thread->wait);
} else {
- pr_err("reply failed, target thread, %d:%d, has error code %d already\n",
- target_thread->proc->pid,
- target_thread->pid,
- target_thread->return_error);
+ WARN(1, "Unexpected reply error: %u\n",
+ target_thread->reply_error.cmd);
}
+ binder_inner_proc_unlock(target_thread->proc);
+ binder_thread_dec_tmpref(target_thread);
+ binder_free_transaction(t);
return;
}
next = t->from_parent;
@@ -1227,7 +2102,7 @@ static void binder_send_failed_reply(struct binder_transaction *t,
"send failed reply for transaction %d, target dead\n",
t->debug_id);
- binder_pop_transaction(target_thread, t);
+ binder_free_transaction(t);
if (next == NULL) {
binder_debug(BINDER_DEBUG_DEAD_BINDER,
"reply failed, no target thread at root\n");
@@ -1240,11 +2115,158 @@ static void binder_send_failed_reply(struct binder_transaction *t,
}
}
+/**
+ * binder_validate_object() - size of the metadata object at a buffer offset
+ * @buffer:	binder_buffer being parsed
+ * @offset:	offset into @buffer->data at which an object is expected
+ *
+ * The offset must be u32-aligned and leave room for an object header;
+ * the header type then selects the object size, which must also fit
+ * inside @buffer->data_size.
+ *
+ * Return: the size of the object if a complete object of a known type
+ *	   lies at @offset, otherwise zero
+ */
+static size_t binder_validate_object(struct binder_buffer *buffer, u64 offset)
+{
+	struct binder_object_header *hdr;
+	size_t object_size;
+
+	/* a header must fit before hdr->type can be read */
+	if (buffer->data_size < sizeof(*hdr) ||
+	    offset > buffer->data_size - sizeof(*hdr) ||
+	    !IS_ALIGNED(offset, sizeof(u32)))
+		return 0;
+
+	hdr = (struct binder_object_header *)(buffer->data + offset);
+	switch (hdr->type) {
+	case BINDER_TYPE_BINDER:
+	case BINDER_TYPE_WEAK_BINDER:
+	case BINDER_TYPE_HANDLE:
+	case BINDER_TYPE_WEAK_HANDLE:
+		object_size = sizeof(struct flat_binder_object);
+		break;
+	case BINDER_TYPE_FD:
+		object_size = sizeof(struct binder_fd_object);
+		break;
+	case BINDER_TYPE_PTR:
+		object_size = sizeof(struct binder_buffer_object);
+		break;
+	case BINDER_TYPE_FDA:
+		object_size = sizeof(struct binder_fd_array_object);
+		break;
+	default:
+		return 0;
+	}
+
+	/* the complete object, not just its header, must fit */
+	if (buffer->data_size < object_size ||
+	    offset > buffer->data_size - object_size)
+		return 0;
+
+	return object_size;
+}
+
+
+/**
+ * binder_validate_ptr() - fetch a binder_buffer_object via the offset array
+ * @b:		binder_buffer containing the object
+ * @index:	index into the offset array of the wanted object
+ * @start:	start of the offset array
+ * @num_valid:	number of entries from @start that may be used
+ *
+ * The offset stored at @index is not re-checked here; the function
+ * relies on the first @num_valid entries from @start having been
+ * verified earlier.
+ *
+ * Return: the object at the offset stored in entry @index, provided
+ *	   @index < @num_valid and the object's header type is
+ *	   BINDER_TYPE_PTR; otherwise %NULL
+ */
+static struct binder_buffer_object *binder_validate_ptr(struct binder_buffer *b,
+							binder_size_t index,
+							binder_size_t *start,
+							binder_size_t num_valid)
+{
+	struct binder_buffer_object *candidate;
+
+	if (index >= num_valid)
+		return NULL;
+
+	candidate = (struct binder_buffer_object *)(b->data + start[index]);
+
+	return candidate->hdr.type == BINDER_TYPE_PTR ? candidate : NULL;
+}
+
+
+/**
+ * binder_validate_fixup() - validates pointer/fd fixups happen in order.
+ * @b: transaction buffer
+ * @objects_start: start of objects buffer
+ * @buffer: binder_buffer_object in which to fix up
+ * @fixup_offset: start offset in @buffer to fix up
+ * @last_obj: last binder_buffer_object that we fixed up in
+ * @last_min_offset: minimum fixup offset in @last_obj
+ *
+ * Return: %true if a fixup in buffer @buffer at offset @fixup_offset is
+ * allowed.
+ *
+ * For safety reasons, we only allow fixups inside a buffer to happen
+ * at increasing offsets; additionally, we only allow fixup on the last
+ * buffer object that was verified, or one of its parents.
+ *
+ * Example of what is allowed:
+ *
+ * A
+ * B (parent = A, offset = 0)
+ * C (parent = A, offset = 16)
+ * D (parent = C, offset = 0)
+ * E (parent = A, offset = 32) // min_offset is 16 (C.parent_offset)
+ *
+ * Examples of what is not allowed:
+ *
+ * Decreasing offsets within the same parent:
+ * A
+ * C (parent = A, offset = 16)
+ * B (parent = A, offset = 0) // decreasing offset within A
+ *
+ * Referring to a parent that wasn't the last object or any of its parents:
+ * A
+ * B (parent = A, offset = 0)
+ * C (parent = A, offset = 0)
+ * C (parent = A, offset = 16)
+ * D (parent = B, offset = 0) // B is not A or any of A's parents
+ */
+static bool binder_validate_fixup(struct binder_buffer *b,
+ binder_size_t *objects_start,
+ struct binder_buffer_object *buffer,
+ binder_size_t fixup_offset,
+ struct binder_buffer_object *last_obj,
+ binder_size_t last_min_offset)
+{
+ if (!last_obj) {
+ /* Nothing to fix up in */
+ return false;
+ }
+
+ while (last_obj != buffer) {
+ /*
+ * Safe to retrieve the parent of last_obj, since it
+ * was already previously verified by the driver.
+ *
+ * Each step moves from last_obj to its parent and raises
+ * the minimum allowed offset to just past the pointer slot
+ * (parent_offset + sizeof(uintptr_t)) that last_obj occupies
+ * in that parent. Reaching an object without a parent before
+ * finding @buffer means @buffer is not an ancestor.
+ */
+ if ((last_obj->flags & BINDER_BUFFER_FLAG_HAS_PARENT) == 0)
+ return false;
+ last_min_offset = last_obj->parent_offset + sizeof(uintptr_t);
+ last_obj = (struct binder_buffer_object *)
+ (b->data + *(objects_start + last_obj->parent));
+ }
+ return (fixup_offset >= last_min_offset); /* offsets may not go backwards */
+}
+
static void binder_transaction_buffer_release(struct binder_proc *proc,
struct binder_buffer *buffer,
binder_size_t *failed_at)
{
- binder_size_t *offp, *off_end;
+ binder_size_t *offp, *off_start, *off_end;
int debug_id = buffer->debug_id;
binder_debug(BINDER_DEBUG_TRANSACTION,
@@ -1255,28 +2277,30 @@ static void binder_transaction_buffer_release(struct binder_proc *proc,
if (buffer->target_node)
binder_dec_node(buffer->target_node, 1, 0);
- offp = (binder_size_t *)(buffer->data +
- ALIGN(buffer->data_size, sizeof(void *)));
+ off_start = (binder_size_t *)(buffer->data +
+ ALIGN(buffer->data_size, sizeof(void *)));
if (failed_at)
off_end = failed_at;
else
- off_end = (void *)offp + buffer->offsets_size;
- for (; offp < off_end; offp++) {
- struct flat_binder_object *fp;
+ off_end = (void *)off_start + buffer->offsets_size;
+ for (offp = off_start; offp < off_end; offp++) {
+ struct binder_object_header *hdr;
+ size_t object_size = binder_validate_object(buffer, *offp);
- if (*offp > buffer->data_size - sizeof(*fp) ||
- buffer->data_size < sizeof(*fp) ||
- !IS_ALIGNED(*offp, sizeof(u32))) {
- pr_err("transaction release %d bad offset %lld, size %zd\n",
+ if (object_size == 0) {
+ pr_err("transaction release %d bad object at offset %lld, size %zd\n",
debug_id, (u64)*offp, buffer->data_size);
continue;
}
- fp = (struct flat_binder_object *)(buffer->data + *offp);
- switch (fp->type) {
+ hdr = (struct binder_object_header *)(buffer->data + *offp);
+ switch (hdr->type) {
case BINDER_TYPE_BINDER:
case BINDER_TYPE_WEAK_BINDER: {
- struct binder_node *node = binder_get_node(proc, fp->binder);
+ struct flat_binder_object *fp;
+ struct binder_node *node;
+ fp = to_flat_binder_object(hdr);
+ node = binder_get_node(proc, fp->binder);
if (node == NULL) {
pr_err("transaction release %d bad node %016llx\n",
debug_id, (u64)fp->binder);
@@ -1285,90 +2309,564 @@ static void binder_transaction_buffer_release(struct binder_proc *proc,
binder_debug(BINDER_DEBUG_TRANSACTION,
" node %d u%016llx\n",
node->debug_id, (u64)node->ptr);
- binder_dec_node(node, fp->type == BINDER_TYPE_BINDER, 0);
+ binder_dec_node(node, hdr->type == BINDER_TYPE_BINDER,
+ 0);
+ binder_put_node(node);
} break;
case BINDER_TYPE_HANDLE:
case BINDER_TYPE_WEAK_HANDLE: {
- struct binder_ref *ref;
+ struct flat_binder_object *fp;
+ struct binder_ref_data rdata;
+ int ret;
- ref = binder_get_ref(proc, fp->handle,
- fp->type == BINDER_TYPE_HANDLE);
+ fp = to_flat_binder_object(hdr);
+ ret = binder_dec_ref_for_handle(proc, fp->handle,
+ hdr->type == BINDER_TYPE_HANDLE, &rdata);
- if (ref == NULL) {
- pr_err("transaction release %d bad handle %d\n",
- debug_id, fp->handle);
+ if (ret) {
+ pr_err("transaction release %d bad handle %d, ret = %d\n",
+ debug_id, fp->handle, ret);
break;
}
binder_debug(BINDER_DEBUG_TRANSACTION,
- " ref %d desc %d (node %d)\n",
- ref->debug_id, ref->desc, ref->node->debug_id);
- binder_dec_ref(ref, fp->type == BINDER_TYPE_HANDLE);
+ " ref %d desc %d\n",
+ rdata.debug_id, rdata.desc);
} break;
- case BINDER_TYPE_FD:
+ case BINDER_TYPE_FD: {
+ struct binder_fd_object *fp = to_binder_fd_object(hdr);
+
binder_debug(BINDER_DEBUG_TRANSACTION,
- " fd %d\n", fp->handle);
+ " fd %d\n", fp->fd);
if (failed_at)
- task_close_fd(proc, fp->handle);
+ task_close_fd(proc, fp->fd);
+ } break;
+ case BINDER_TYPE_PTR:
+ /*
+ * Nothing to do here, this will get cleaned up when the
+ * transaction buffer gets freed
+ */
break;
-
+		case BINDER_TYPE_FDA: {
+			/*
+			 * fd array: close every fd stored in the array that
+			 * lives inside the (already fixed-up) parent buffer
+			 * object. Malformed arrays are logged and skipped.
+			 */
+			struct binder_fd_array_object *fda;
+			struct binder_buffer_object *parent;
+			uintptr_t parent_buffer;
+			u32 *fd_array;
+			size_t fd_index;
+			binder_size_t fd_buf_size;
+
+			fda = to_binder_fd_array_object(hdr);
+			/* only objects before the current one may be a parent */
+			parent = binder_validate_ptr(buffer, fda->parent,
+						     off_start,
+						     offp - off_start);
+			if (!parent) {
+				pr_err("transaction release %d bad parent offset\n",
+				       debug_id);
+				continue;
+			}
+			/*
+			 * Since the parent was already fixed up, convert it
+			 * back to kernel address space to access it
+			 */
+			parent_buffer = parent->buffer -
+				binder_alloc_get_user_buffer_offset(
+						&proc->alloc);
+
+			/* reject counts whose byte size could overflow first */
+			if (fda->num_fds >= SIZE_MAX / sizeof(u32)) {
+				pr_err("transaction release %d invalid number of fds (%lld)\n",
+				       debug_id, (u64)fda->num_fds);
+				continue;
+			}
+			fd_buf_size = sizeof(u32) * fda->num_fds;
+			if (fd_buf_size > parent->length ||
+			    fda->parent_offset > parent->length - fd_buf_size) {
+				/* No space for all file descriptors here. */
+				pr_err("transaction release %d not enough space for %lld fds in buffer\n",
+				       debug_id, (u64)fda->num_fds);
+				continue;
+			}
+			fd_array = (u32 *)(parent_buffer + fda->parent_offset);
+			for (fd_index = 0; fd_index < fda->num_fds; fd_index++)
+				task_close_fd(proc, fd_array[fd_index]);
+		} break;
default:
pr_err("transaction release %d bad object type %x\n",
- debug_id, fp->type);
+ debug_id, hdr->type);
break;
}
}
}
+/**
+ * binder_translate_binder() - turn a sender-side binder object into a handle
+ * @fp: flat_binder_object to translate in place
+ * @t: transaction being built; t->to_proc is the target proc
+ * @thread: sending thread; thread->proc is the sending proc
+ *
+ * Finds the node for fp->binder in the sending proc (creating it if it
+ * does not exist), takes a ref on it in the target proc and rewrites @fp
+ * into a BINDER_TYPE_HANDLE / BINDER_TYPE_WEAK_HANDLE object carrying the
+ * target-side descriptor.
+ *
+ * Return: 0 on success, -ENOMEM if a node could not be created, -EINVAL
+ * on cookie mismatch, -EPERM if the security hook refuses the transfer,
+ * or the error from binder_inc_ref_for_node()
+ */
+static int binder_translate_binder(struct flat_binder_object *fp,
+ struct binder_transaction *t,
+ struct binder_thread *thread)
+{
+ struct binder_node *node;
+ struct binder_proc *proc = thread->proc;
+ struct binder_proc *target_proc = t->to_proc;
+ struct binder_ref_data rdata;
+ int ret = 0;
+
+ node = binder_get_node(proc, fp->binder);
+ if (!node) {
+ node = binder_new_node(proc, fp);
+ if (!node)
+ return -ENOMEM;
+ }
+ if (fp->cookie != node->cookie) {
+ binder_user_error("%d:%d sending u%016llx node %d, cookie mismatch %016llx != %016llx\n",
+ proc->pid, thread->pid, (u64)fp->binder,
+ node->debug_id, (u64)fp->cookie,
+ (u64)node->cookie);
+ ret = -EINVAL;
+ goto done;
+ }
+ if (security_binder_transfer_binder(proc->tsk, target_proc->tsk)) {
+ ret = -EPERM;
+ goto done;
+ }
+
+ ret = binder_inc_ref_for_node(target_proc, node,
+ fp->hdr.type == BINDER_TYPE_BINDER,
+ &thread->todo, &rdata);
+ if (ret)
+ goto done;
+
+ if (fp->hdr.type == BINDER_TYPE_BINDER)
+ fp->hdr.type = BINDER_TYPE_HANDLE;
+ else
+ fp->hdr.type = BINDER_TYPE_WEAK_HANDLE;
+ fp->binder = 0; /* sender-side pointer and cookie are not passed on */
+ fp->handle = rdata.desc;
+ fp->cookie = 0;
+
+ trace_binder_transaction_node_to_ref(t, node, &rdata);
+ binder_debug(BINDER_DEBUG_TRANSACTION,
+ " node %d u%016llx -> ref %d desc %d\n",
+ node->debug_id, (u64)node->ptr,
+ rdata.debug_id, rdata.desc);
+done:
+ binder_put_node(node); /* presumably drops the reference held since lookup/creation -- confirm */
+ return ret;
+}
+
+/**
+ * binder_translate_handle() - translate a sender-side handle for the target
+ * @fp: flat_binder_object to translate in place
+ * @t: transaction being built; t->to_proc is the target proc
+ * @thread: sending thread; thread->proc is the sending proc
+ *
+ * Resolves fp->handle to its node in the sending proc. If the node
+ * belongs to the target proc, @fp is rewritten into a
+ * BINDER_TYPE_BINDER / BINDER_TYPE_WEAK_BINDER object with the node's
+ * ptr and cookie; otherwise a ref is taken in the target proc and @fp
+ * gets the target-side descriptor.
+ *
+ * Return: 0 on success, -EINVAL if the handle is invalid, -EPERM if the
+ * security hook refuses the transfer, or the error from
+ * binder_inc_ref_for_node()
+ */
+static int binder_translate_handle(struct flat_binder_object *fp,
+ struct binder_transaction *t,
+ struct binder_thread *thread)
+{
+ struct binder_proc *proc = thread->proc;
+ struct binder_proc *target_proc = t->to_proc;
+ struct binder_node *node;
+ struct binder_ref_data src_rdata;
+ int ret = 0;
+
+ node = binder_get_node_from_ref(proc, fp->handle,
+ fp->hdr.type == BINDER_TYPE_HANDLE, &src_rdata);
+ if (!node) {
+ binder_user_error("%d:%d got transaction with invalid handle, %d\n",
+ proc->pid, thread->pid, fp->handle);
+ return -EINVAL;
+ }
+ if (security_binder_transfer_binder(proc->tsk, target_proc->tsk)) {
+ ret = -EPERM;
+ goto done;
+ }
+
+ binder_node_lock(node);
+ if (node->proc == target_proc) {
+ if (fp->hdr.type == BINDER_TYPE_HANDLE)
+ fp->hdr.type = BINDER_TYPE_BINDER;
+ else
+ fp->hdr.type = BINDER_TYPE_WEAK_BINDER;
+ fp->binder = node->ptr;
+ fp->cookie = node->cookie;
+ if (node->proc)
+ binder_inner_proc_lock(node->proc);
+ binder_inc_node_nilocked(node,
+ fp->hdr.type == BINDER_TYPE_BINDER, /* type was already rewritten above */
+ 0, NULL);
+ if (node->proc)
+ binder_inner_proc_unlock(node->proc);
+ trace_binder_transaction_ref_to_node(t, node, &src_rdata);
+ binder_debug(BINDER_DEBUG_TRANSACTION,
+ " ref %d desc %d -> node %d u%016llx\n",
+ src_rdata.debug_id, src_rdata.desc, node->debug_id,
+ (u64)node->ptr);
+ binder_node_unlock(node);
+ } else {
+ struct binder_ref_data dest_rdata;
+
+ binder_node_unlock(node); /* released before binder_inc_ref_for_node(), which takes the proc lock */
+ ret = binder_inc_ref_for_node(target_proc, node,
+ fp->hdr.type == BINDER_TYPE_HANDLE,
+ NULL, &dest_rdata);
+ if (ret)
+ goto done;
+
+ fp->binder = 0;
+ fp->handle = dest_rdata.desc;
+ fp->cookie = 0;
+ trace_binder_transaction_ref_to_ref(t, node, &src_rdata,
+ &dest_rdata);
+ binder_debug(BINDER_DEBUG_TRANSACTION,
+ " ref %d desc %d -> ref %d desc %d (node %d)\n",
+ src_rdata.debug_id, src_rdata.desc,
+ dest_rdata.debug_id, dest_rdata.desc,
+ node->debug_id);
+ }
+done:
+ binder_put_node(node); /* drops the temporary reference taken by binder_get_node_from_ref() */
+ return ret;
+}
+
+/**
+ * binder_translate_fd() - install a sender's file in the target proc
+ * @fd: file descriptor number in the calling context
+ * @t: transaction being built; t->to_proc is the target proc
+ * @thread: sending thread
+ * @in_reply_to: transaction being replied to, or %NULL
+ *
+ * For a reply, fds are allowed if TF_ACCEPT_FDS is set in
+ * @in_reply_to->flags; otherwise the target node's accept_fds decides.
+ *
+ * Return: the new fd number in the target proc on success, else a
+ * negative errno: -EPERM (target does not accept fds, or the security
+ * hook refused), -EBADF (@fd is not an open file), -ENOMEM (no unused
+ * fd could be obtained in the target)
+ */
+static int binder_translate_fd(int fd,
+ struct binder_transaction *t,
+ struct binder_thread *thread,
+ struct binder_transaction *in_reply_to)
+{
+ struct binder_proc *proc = thread->proc;
+ struct binder_proc *target_proc = t->to_proc;
+ int target_fd;
+ struct file *file;
+ int ret;
+ bool target_allows_fd;
+
+ if (in_reply_to)
+ target_allows_fd = !!(in_reply_to->flags & TF_ACCEPT_FDS);
+ else
+ target_allows_fd = t->buffer->target_node->accept_fds;
+ if (!target_allows_fd) {
+ binder_user_error("%d:%d got %s with fd, %d, but target does not allow fds\n",
+ proc->pid, thread->pid,
+ in_reply_to ? "reply" : "transaction",
+ fd);
+ ret = -EPERM;
+ goto err_fd_not_accepted;
+ }
+
+ file = fget(fd);
+ if (!file) {
+ binder_user_error("%d:%d got transaction with invalid fd, %d\n",
+ proc->pid, thread->pid, fd);
+ ret = -EBADF;
+ goto err_fget;
+ }
+ ret = security_binder_transfer_file(proc->tsk, target_proc->tsk, file);
+ if (ret < 0) {
+ ret = -EPERM; /* the hook's own error code is replaced */
+ goto err_security;
+ }
+
+ target_fd = task_get_unused_fd_flags(target_proc, O_CLOEXEC);
+ if (target_fd < 0) {
+ ret = -ENOMEM;
+ goto err_get_unused_fd;
+ }
+ task_fd_install(target_proc, target_fd, file);
+ trace_binder_transaction_fd(t, fd, target_fd);
+ binder_debug(BINDER_DEBUG_TRANSACTION, " fd %d -> %d\n",
+ fd, target_fd);
+
+ return target_fd;
+
+err_get_unused_fd:
+err_security:
+ fput(file); /* drop the reference taken by fget() */
+err_fget:
+err_fd_not_accepted:
+ return ret;
+}
+
+/**
+ * binder_translate_fd_array() - translate every fd of an fd array object
+ * @fda: fd array object describing count and position of the fds
+ * @parent: buffer object that contains the array
+ * @t: transaction being built; t->to_proc is the target proc
+ * @thread: sending thread
+ * @in_reply_to: transaction being replied to, or %NULL
+ *
+ * Validates that the array fits inside @parent and is u32-aligned, then
+ * replaces each entry with the fd returned by binder_translate_fd(). If
+ * one translation fails, the fds installed so far are closed in the
+ * target proc.
+ *
+ * Return: 0 on success, -EINVAL for an invalid count, size or alignment,
+ * or the negative value returned by the failing binder_translate_fd()
+ */
+static int binder_translate_fd_array(struct binder_fd_array_object *fda,
+ struct binder_buffer_object *parent,
+ struct binder_transaction *t,
+ struct binder_thread *thread,
+ struct binder_transaction *in_reply_to)
+{
+ binder_size_t fdi, fd_buf_size, num_installed_fds;
+ int target_fd;
+ uintptr_t parent_buffer;
+ u32 *fd_array;
+ struct binder_proc *proc = thread->proc;
+ struct binder_proc *target_proc = t->to_proc;
+
+ fd_buf_size = sizeof(u32) * fda->num_fds; /* only used after the overflow check below */
+ if (fda->num_fds >= SIZE_MAX / sizeof(u32)) {
+ binder_user_error("%d:%d got transaction with invalid number of fds (%lld)\n",
+ proc->pid, thread->pid, (u64)fda->num_fds);
+ return -EINVAL;
+ }
+ if (fd_buf_size > parent->length ||
+ fda->parent_offset > parent->length - fd_buf_size) {
+ /* No space for all file descriptors here. */
+ binder_user_error("%d:%d not enough space to store %lld fds in buffer\n",
+ proc->pid, thread->pid, (u64)fda->num_fds);
+ return -EINVAL;
+ }
+ /*
+ * Since the parent was already fixed up, convert it
+ * back to the kernel address space to access it
+ */
+ parent_buffer = parent->buffer -
+ binder_alloc_get_user_buffer_offset(&target_proc->alloc);
+ fd_array = (u32 *)(parent_buffer + fda->parent_offset);
+ if (!IS_ALIGNED((unsigned long)fd_array, sizeof(u32))) {
+ binder_user_error("%d:%d parent offset not aligned correctly.\n",
+ proc->pid, thread->pid);
+ return -EINVAL;
+ }
+ for (fdi = 0; fdi < fda->num_fds; fdi++) {
+ target_fd = binder_translate_fd(fd_array[fdi], t, thread,
+ in_reply_to);
+ if (target_fd < 0)
+ goto err_translate_fd_failed;
+ fd_array[fdi] = target_fd; /* entry now holds the target-side fd */
+ }
+ return 0;
+
+err_translate_fd_failed:
+ /*
+ * Failed to allocate fd or security error, free fds
+ * installed so far.
+ */
+ num_installed_fds = fdi; /* entries [0, fdi) were already translated */
+ for (fdi = 0; fdi < num_installed_fds; fdi++)
+ task_close_fd(target_proc, fd_array[fdi]);
+ return target_fd;
+}
+
+/**
+ * binder_fixup_parent() - write a buffer object's address into its parent
+ * @t: transaction being built; supplies the buffer and the target process
+ * @thread: thread sending the transaction (used here only to identify the
+ *          sender in error messages)
+ * @bp: buffer object whose @bp->buffer value is to be stored in its parent
+ * @off_start: start of the offsets array, passed to binder_validate_ptr()
+ *             and binder_validate_fixup()
+ * @num_valid: passed to binder_validate_ptr() together with @off_start
+ * @last_fixup_obj: passed to binder_validate_fixup()
+ * @last_fixup_min_off: passed to binder_validate_fixup()
+ *
+ * Does nothing when @bp does not carry BINDER_BUFFER_FLAG_HAS_PARENT.
+ * Otherwise the parent object is looked up and validated, and @bp->buffer
+ * is stored at @bp->parent_offset bytes into the parent's data.
+ *
+ * Return: 0 if there is no parent or the fixup was written, -EINVAL if the
+ * parent cannot be validated, the fixup is out of order, or a pointer-sized
+ * value does not fit at @bp->parent_offset inside the parent.
+ */
+static int binder_fixup_parent(struct binder_transaction *t,
+ struct binder_thread *thread,
+ struct binder_buffer_object *bp,
+ binder_size_t *off_start,
+ binder_size_t num_valid,
+ struct binder_buffer_object *last_fixup_obj,
+ binder_size_t last_fixup_min_off)
+{
+ struct binder_buffer_object *parent;
+ u8 *parent_buffer;
+ struct binder_buffer *b = t->buffer;
+ struct binder_proc *proc = thread->proc;
+ struct binder_proc *target_proc = t->to_proc;
+
+ /* Nothing to fix up for a buffer without a parent. */
+ if (!(bp->flags & BINDER_BUFFER_FLAG_HAS_PARENT))
+ return 0;
+
+ parent = binder_validate_ptr(b, bp->parent, off_start, num_valid);
+ if (!parent) {
+ binder_user_error("%d:%d got transaction with invalid parent offset or type\n",
+ proc->pid, thread->pid);
+ return -EINVAL;
+ }
+
+ if (!binder_validate_fixup(b, off_start,
+ parent, bp->parent_offset,
+ last_fixup_obj,
+ last_fixup_min_off)) {
+ binder_user_error("%d:%d got transaction with out-of-order buffer fixup\n",
+ proc->pid, thread->pid);
+ return -EINVAL;
+ }
+
+ /*
+ * The length test runs first, so the subtraction in the second test
+ * cannot underflow.
+ */
+ if (parent->length < sizeof(binder_uintptr_t) ||
+ bp->parent_offset > parent->length - sizeof(binder_uintptr_t)) {
+ /* No space for a pointer here! */
+ binder_user_error("%d:%d got transaction with invalid parent offset\n",
+ proc->pid, thread->pid);
+ return -EINVAL;
+ }
+ /*
+ * Subtract the target's user buffer offset from parent->buffer to get
+ * an address the kernel can write through.
+ */
+ parent_buffer = (u8 *)(parent->buffer -
+ binder_alloc_get_user_buffer_offset(
+ &target_proc->alloc));
+ *(binder_uintptr_t *)(parent_buffer + bp->parent_offset) = bp->buffer;
+
+ return 0;
+}
+
+/**
+ * binder_proc_transaction() - sends a transaction to a process and wakes it up
+ * @t: transaction to send
+ * @proc: process to send the transaction to
+ * @thread: thread in @proc to send the transaction to (may be NULL)
+ *
+ * This function queues a transaction to the specified process. It will try
+ * to find a thread in the target process to handle the transaction and
+ * wake it up. If no thread is found, the work is queued to the proc
+ * waitqueue.
+ *
+ * If the @thread parameter is not NULL, the transaction is always queued
+ * to the waitlist of that specific thread.
+ *
+ * A oneway transaction whose target node already has an async transaction
+ * pending is queued on that node's async_todo list instead, and no thread
+ * is woken up.
+ *
+ * Return: true if the transaction was successfully queued
+ * false if the target process or thread is dead
+ */
+static bool binder_proc_transaction(struct binder_transaction *t,
+ struct binder_proc *proc,
+ struct binder_thread *thread)
+{
+ struct list_head *target_list = NULL;
+ struct binder_node *node = t->buffer->target_node;
+ struct binder_priority node_prio;
+ bool oneway = !!(t->flags & TF_ONE_WAY);
+ bool wakeup = true;
+
+ BUG_ON(!node);
+ /* Read the node's priority settings while holding the node lock. */
+ binder_node_lock(node);
+ node_prio.prio = node->min_priority;
+ node_prio.sched_policy = node->sched_policy;
+
+ if (oneway) {
+ /* A oneway transaction must not come with a target thread. */
+ BUG_ON(thread);
+ if (node->has_async_transaction) {
+ /* Defer behind the async work already pending on the node. */
+ target_list = &node->async_todo;
+ wakeup = false;
+ } else {
+ node->has_async_transaction = 1;
+ }
+ }
+
+ /* The inner proc lock is taken while the node lock is still held. */
+ binder_inner_proc_lock(proc);
+
+ if (proc->is_dead || (thread && thread->is_dead)) {
+ binder_inner_proc_unlock(proc);
+ binder_node_unlock(node);
+ return false;
+ }
+
+ /* Only pick a thread if none was given and the work is not deferred. */
+ if (!thread && !target_list)
+ thread = binder_select_thread_ilocked(proc);
+
+ if (thread) {
+ target_list = &thread->todo;
+ binder_transaction_priority(thread->task, t, node_prio,
+ node->inherit_rt);
+ } else if (!target_list) {
+ /* No thread available: fall back to the process-wide todo list. */
+ target_list = &proc->todo;
+ } else {
+ /* The only other list chosen above is the node's async_todo. */
+ BUG_ON(target_list != &node->async_todo);
+ }
+
+ binder_enqueue_work_ilocked(&t->work, target_list);
+
+ if (wakeup)
+ binder_wakeup_thread_ilocked(proc, thread, !oneway /* sync */);
+
+ /* Release the locks in the reverse order of acquisition. */
+ binder_inner_proc_unlock(proc);
+ binder_node_unlock(node);
+
+ return true;
+}
+
+/**
+ * binder_get_node_refs_for_txn() - Get required refs on node for txn
+ * @node: struct binder_node for which to get refs
+ * @procp: returns @node->proc if valid
+ * @error: if no @node->proc then returns BR_DEAD_REPLY
+ *
+ * User-space normally keeps the node alive when creating a transaction
+ * since it has a reference to the target. The local strong ref keeps it
+ * alive if the sending process dies before the target process processes
+ * the transaction. If the source process is malicious or has a reference
+ * counting bug, relying on the local strong ref can fail.
+ *
+ * Since user-space can cause the local strong ref to go away, we also take
+ * a tmpref on the node to ensure it survives while we are constructing
+ * the transaction. We also need a tmpref on the proc while we are
+ * constructing the transaction, so we take that here as well.
+ *
+ * Return: The target_node with refs taken, or NULL if @node->proc is NULL.
+ * Also sets @procp if valid. If @node->proc is NULL, indicating that the
+ * target proc has died, @error is set to BR_DEAD_REPLY.
+ */
+static struct binder_node *binder_get_node_refs_for_txn(
+ struct binder_node *node,
+ struct binder_proc **procp,
+ uint32_t *error)
+{
+ struct binder_node *target_node = NULL;
+
+ binder_node_inner_lock(node);
+ if (node->proc) {
+ target_node = node;
+ /* Local strong ref on the node, as described above. */
+ binder_inc_node_nilocked(node, 1, 0, NULL);
+ /* Temporary ref on the node while the transaction is built. */
+ binder_inc_node_tmpref_ilocked(node);
+ /* Temporary ref on the owning proc for the same period. */
+ node->proc->tmp_ref++;
+ *procp = node->proc;
+ } else
+ *error = BR_DEAD_REPLY;
+ binder_node_inner_unlock(node);
+
+ return target_node;
+}
+
static void binder_transaction(struct binder_proc *proc,
struct binder_thread *thread,
- struct binder_transaction_data *tr, int reply)
+ struct binder_transaction_data *tr, int reply,
+ binder_size_t extra_buffers_size)
{
+ int ret;
struct binder_transaction *t;
struct binder_work *tcomplete;
- binder_size_t *offp, *off_end;
+ binder_size_t *offp, *off_end, *off_start;
binder_size_t off_min;
- struct binder_proc *target_proc;
+ u8 *sg_bufp, *sg_buf_end;
+ struct binder_proc *target_proc = NULL;
struct binder_thread *target_thread = NULL;
struct binder_node *target_node = NULL;
- struct list_head *target_list;
- wait_queue_head_t *target_wait;
struct binder_transaction *in_reply_to = NULL;
struct binder_transaction_log_entry *e;
- uint32_t return_error;
+ uint32_t return_error = 0;
+ uint32_t return_error_param = 0;
+ uint32_t return_error_line = 0;
+ struct binder_buffer_object *last_fixup_obj = NULL;
+ binder_size_t last_fixup_min_off = 0;
+ struct binder_context *context = proc->context;
+ int t_debug_id = atomic_inc_return(&binder_last_id);
e = binder_transaction_log_add(&binder_transaction_log);
+ e->debug_id = t_debug_id;
e->call_type = reply ? 2 : !!(tr->flags & TF_ONE_WAY);
e->from_proc = proc->pid;
e->from_thread = thread->pid;
e->target_handle = tr->target.handle;
e->data_size = tr->data_size;
e->offsets_size = tr->offsets_size;
+ e->context_name = proc->context->name;
if (reply) {
+ binder_inner_proc_lock(proc);
in_reply_to = thread->transaction_stack;
if (in_reply_to == NULL) {
+ binder_inner_proc_unlock(proc);
binder_user_error("%d:%d got reply transaction with no transaction stack\n",
proc->pid, thread->pid);
return_error = BR_FAILED_REPLY;
+ return_error_param = -EPROTO;
+ return_error_line = __LINE__;
goto err_empty_call_stack;
}
- binder_set_nice(in_reply_to->saved_priority);
if (in_reply_to->to_thread != thread) {
+ spin_lock(&in_reply_to->lock);
binder_user_error("%d:%d got reply transaction with bad transaction stack, transaction %d has target %d:%d\n",
proc->pid, thread->pid, in_reply_to->debug_id,
in_reply_to->to_proc ?
in_reply_to->to_proc->pid : 0,
in_reply_to->to_thread ?
in_reply_to->to_thread->pid : 0);
+ spin_unlock(&in_reply_to->lock);
+ binder_inner_proc_unlock(proc);
return_error = BR_FAILED_REPLY;
+ return_error_param = -EPROTO;
+ return_error_line = __LINE__;
in_reply_to = NULL;
goto err_bad_call_stack;
}
thread->transaction_stack = in_reply_to->to_parent;
- target_thread = in_reply_to->from;
+ binder_inner_proc_unlock(proc);
+ target_thread = binder_get_txn_from_and_acq_inner(in_reply_to);
if (target_thread == NULL) {
return_error = BR_DEAD_REPLY;
+ return_error_line = __LINE__;
goto err_dead_binder;
}
if (target_thread->transaction_stack != in_reply_to) {
@@ -1377,106 +2875,148 @@ static void binder_transaction(struct binder_proc *proc,
target_thread->transaction_stack ?
target_thread->transaction_stack->debug_id : 0,
in_reply_to->debug_id);
+ binder_inner_proc_unlock(target_thread->proc);
return_error = BR_FAILED_REPLY;
+ return_error_param = -EPROTO;
+ return_error_line = __LINE__;
in_reply_to = NULL;
target_thread = NULL;
goto err_dead_binder;
}
target_proc = target_thread->proc;
+ target_proc->tmp_ref++;
+ binder_inner_proc_unlock(target_thread->proc);
} else {
if (tr->target.handle) {
struct binder_ref *ref;
- ref = binder_get_ref(proc, tr->target.handle, true);
- if (ref == NULL) {
+ /*
+ * There must already be a strong ref
+ * on this node. If so, do a strong
+ * increment on the node to ensure it
+ * stays alive until the transaction is
+ * done.
+ */
+ binder_proc_lock(proc);
+ ref = binder_get_ref_olocked(proc, tr->target.handle,
+ true);
+ if (ref) {
+ target_node = binder_get_node_refs_for_txn(
+ ref->node, &target_proc,
+ &return_error);
+ } else {
binder_user_error("%d:%d got transaction to invalid handle\n",
- proc->pid, thread->pid);
+ proc->pid, thread->pid);
return_error = BR_FAILED_REPLY;
- goto err_invalid_target_handle;
}
- target_node = ref->node;
+ binder_proc_unlock(proc);
} else {
- target_node = binder_context_mgr_node;
- if (target_node == NULL) {
+ mutex_lock(&context->context_mgr_node_lock);
+ target_node = context->binder_context_mgr_node;
+ if (target_node)
+ target_node = binder_get_node_refs_for_txn(
+ target_node, &target_proc,
+ &return_error);
+ else
return_error = BR_DEAD_REPLY;
- goto err_no_context_mgr_node;
- }
+ mutex_unlock(&context->context_mgr_node_lock);
}
- e->to_node = target_node->debug_id;
- target_proc = target_node->proc;
- if (target_proc == NULL) {
- return_error = BR_DEAD_REPLY;
+ if (!target_node) {
+ /*
+ * return_error is set above
+ */
+ return_error_param = -EINVAL;
+ return_error_line = __LINE__;
goto err_dead_binder;
}
+ e->to_node = target_node->debug_id;
if (security_binder_transaction(proc->tsk,
target_proc->tsk) < 0) {
return_error = BR_FAILED_REPLY;
+ return_error_param = -EPERM;
+ return_error_line = __LINE__;
goto err_invalid_target_handle;
}
+ binder_inner_proc_lock(proc);
if (!(tr->flags & TF_ONE_WAY) && thread->transaction_stack) {
struct binder_transaction *tmp;
tmp = thread->transaction_stack;
if (tmp->to_thread != thread) {
+ spin_lock(&tmp->lock);
binder_user_error("%d:%d got new transaction with bad transaction stack, transaction %d has target %d:%d\n",
proc->pid, thread->pid, tmp->debug_id,
tmp->to_proc ? tmp->to_proc->pid : 0,
tmp->to_thread ?
tmp->to_thread->pid : 0);
+ spin_unlock(&tmp->lock);
+ binder_inner_proc_unlock(proc);
return_error = BR_FAILED_REPLY;
+ return_error_param = -EPROTO;
+ return_error_line = __LINE__;
goto err_bad_call_stack;
}
while (tmp) {
- if (tmp->from && tmp->from->proc == target_proc)
- target_thread = tmp->from;
+ struct binder_thread *from;
+
+ spin_lock(&tmp->lock);
+ from = tmp->from;
+ if (from && from->proc == target_proc) {
+ atomic_inc(&from->tmp_ref);
+ target_thread = from;
+ spin_unlock(&tmp->lock);
+ break;
+ }
+ spin_unlock(&tmp->lock);
tmp = tmp->from_parent;
}
}
+ binder_inner_proc_unlock(proc);
}
- if (target_thread) {
+ if (target_thread)
e->to_thread = target_thread->pid;
- target_list = &target_thread->todo;
- target_wait = &target_thread->wait;
- } else {
- target_list = &target_proc->todo;
- target_wait = &target_proc->wait;
- }
e->to_proc = target_proc->pid;
/* TODO: reuse incoming transaction for reply */
t = kzalloc(sizeof(*t), GFP_KERNEL);
if (t == NULL) {
return_error = BR_FAILED_REPLY;
+ return_error_param = -ENOMEM;
+ return_error_line = __LINE__;
goto err_alloc_t_failed;
}
binder_stats_created(BINDER_STAT_TRANSACTION);
+ spin_lock_init(&t->lock);
tcomplete = kzalloc(sizeof(*tcomplete), GFP_KERNEL);
if (tcomplete == NULL) {
return_error = BR_FAILED_REPLY;
+ return_error_param = -ENOMEM;
+ return_error_line = __LINE__;
goto err_alloc_tcomplete_failed;
}
binder_stats_created(BINDER_STAT_TRANSACTION_COMPLETE);
- t->debug_id = ++binder_last_id;
- e->debug_id = t->debug_id;
+ t->debug_id = t_debug_id;
if (reply)
binder_debug(BINDER_DEBUG_TRANSACTION,
- "%d:%d BC_REPLY %d -> %d:%d, data %016llx-%016llx size %lld-%lld\n",
+ "%d:%d BC_REPLY %d -> %d:%d, data %016llx-%016llx size %lld-%lld-%lld\n",
proc->pid, thread->pid, t->debug_id,
target_proc->pid, target_thread->pid,
(u64)tr->data.ptr.buffer,
(u64)tr->data.ptr.offsets,
- (u64)tr->data_size, (u64)tr->offsets_size);
+ (u64)tr->data_size, (u64)tr->offsets_size,
+ (u64)extra_buffers_size);
else
binder_debug(BINDER_DEBUG_TRANSACTION,
- "%d:%d BC_TRANSACTION %d -> %d - node %d, data %016llx-%016llx size %lld-%lld\n",
+ "%d:%d BC_TRANSACTION %d -> %d - node %d, data %016llx-%016llx size %lld-%lld-%lld\n",
proc->pid, thread->pid, t->debug_id,
target_proc->pid, target_node->debug_id,
(u64)tr->data.ptr.buffer,
(u64)tr->data.ptr.offsets,
- (u64)tr->data_size, (u64)tr->offsets_size);
+ (u64)tr->data_size, (u64)tr->offsets_size,
+ (u64)extra_buffers_size);
if (!reply && !(tr->flags & TF_ONE_WAY))
t->from = thread;
@@ -1487,32 +3027,47 @@ static void binder_transaction(struct binder_proc *proc,
t->to_thread = target_thread;
t->code = tr->code;
t->flags = tr->flags;
- t->priority = task_nice(current);
+ if (!(t->flags & TF_ONE_WAY) &&
+ binder_supported_policy(current->policy)) {
+ /* Inherit supported policies for synchronous transactions */
+ t->priority.sched_policy = current->policy;
+ t->priority.prio = current->normal_prio;
+ } else {
+ /* Otherwise, fall back to the default priority */
+ t->priority = target_proc->default_priority;
+ }
trace_binder_transaction(reply, t, target_node);
- t->buffer = binder_alloc_buf(target_proc, tr->data_size,
- tr->offsets_size, !reply && (t->flags & TF_ONE_WAY));
- if (t->buffer == NULL) {
- return_error = BR_FAILED_REPLY;
+ t->buffer = binder_alloc_new_buf(&target_proc->alloc, tr->data_size,
+ tr->offsets_size, extra_buffers_size,
+ !reply && (t->flags & TF_ONE_WAY));
+ if (IS_ERR(t->buffer)) {
+ /*
+ * -ESRCH indicates VMA cleared. The target is dying.
+ */
+ return_error_param = PTR_ERR(t->buffer);
+ return_error = return_error_param == -ESRCH ?
+ BR_DEAD_REPLY : BR_FAILED_REPLY;
+ return_error_line = __LINE__;
+ t->buffer = NULL;
goto err_binder_alloc_buf_failed;
}
- t->buffer->allow_user_free = 0;
t->buffer->debug_id = t->debug_id;
t->buffer->transaction = t;
t->buffer->target_node = target_node;
trace_binder_transaction_alloc_buf(t->buffer);
- if (target_node)
- binder_inc_node(target_node, 1, 0, NULL);
-
- offp = (binder_size_t *)(t->buffer->data +
- ALIGN(tr->data_size, sizeof(void *)));
+ off_start = (binder_size_t *)(t->buffer->data +
+ ALIGN(tr->data_size, sizeof(void *)));
+ offp = off_start;
if (copy_from_user(t->buffer->data, (const void __user *)(uintptr_t)
tr->data.ptr.buffer, tr->data_size)) {
binder_user_error("%d:%d got transaction with invalid data ptr\n",
proc->pid, thread->pid);
return_error = BR_FAILED_REPLY;
+ return_error_param = -EFAULT;
+ return_error_line = __LINE__;
goto err_copy_data_failed;
}
if (copy_from_user(offp, (const void __user *)(uintptr_t)
@@ -1520,231 +3075,244 @@ static void binder_transaction(struct binder_proc *proc,
binder_user_error("%d:%d got transaction with invalid offsets ptr\n",
proc->pid, thread->pid);
return_error = BR_FAILED_REPLY;
+ return_error_param = -EFAULT;
+ return_error_line = __LINE__;
goto err_copy_data_failed;
}
if (!IS_ALIGNED(tr->offsets_size, sizeof(binder_size_t))) {
binder_user_error("%d:%d got transaction with invalid offsets size, %lld\n",
proc->pid, thread->pid, (u64)tr->offsets_size);
return_error = BR_FAILED_REPLY;
+ return_error_param = -EINVAL;
+ return_error_line = __LINE__;
+ goto err_bad_offset;
+ }
+ if (!IS_ALIGNED(extra_buffers_size, sizeof(u64))) {
+ binder_user_error("%d:%d got transaction with unaligned buffers size, %lld\n",
+ proc->pid, thread->pid,
+ (u64)extra_buffers_size);
+ return_error = BR_FAILED_REPLY;
+ return_error_param = -EINVAL;
+ return_error_line = __LINE__;
goto err_bad_offset;
}
- off_end = (void *)offp + tr->offsets_size;
+ off_end = (void *)off_start + tr->offsets_size;
+ sg_bufp = (u8 *)(PTR_ALIGN(off_end, sizeof(void *)));
+ sg_buf_end = sg_bufp + extra_buffers_size;
off_min = 0;
for (; offp < off_end; offp++) {
- struct flat_binder_object *fp;
+ struct binder_object_header *hdr;
+ size_t object_size = binder_validate_object(t->buffer, *offp);
- if (*offp > t->buffer->data_size - sizeof(*fp) ||
- *offp < off_min ||
- t->buffer->data_size < sizeof(*fp) ||
- !IS_ALIGNED(*offp, sizeof(u32))) {
- binder_user_error("%d:%d got transaction with invalid offset, %lld (min %lld, max %lld)\n",
+ if (object_size == 0 || *offp < off_min) {
+ binder_user_error("%d:%d got transaction with invalid offset (%lld, min %lld max %lld) or object.\n",
proc->pid, thread->pid, (u64)*offp,
(u64)off_min,
- (u64)(t->buffer->data_size -
- sizeof(*fp)));
+ (u64)t->buffer->data_size);
return_error = BR_FAILED_REPLY;
+ return_error_param = -EINVAL;
+ return_error_line = __LINE__;
goto err_bad_offset;
}
- fp = (struct flat_binder_object *)(t->buffer->data + *offp);
- off_min = *offp + sizeof(struct flat_binder_object);
- switch (fp->type) {
+
+ hdr = (struct binder_object_header *)(t->buffer->data + *offp);
+ off_min = *offp + object_size;
+ switch (hdr->type) {
case BINDER_TYPE_BINDER:
case BINDER_TYPE_WEAK_BINDER: {
- struct binder_ref *ref;
- struct binder_node *node = binder_get_node(proc, fp->binder);
+ struct flat_binder_object *fp;
- if (node == NULL) {
- node = binder_new_node(proc, fp->binder, fp->cookie);
- if (node == NULL) {
- return_error = BR_FAILED_REPLY;
- goto err_binder_new_node_failed;
- }
- node->min_priority = fp->flags & FLAT_BINDER_FLAG_PRIORITY_MASK;
- node->accept_fds = !!(fp->flags & FLAT_BINDER_FLAG_ACCEPTS_FDS);
- }
- if (fp->cookie != node->cookie) {
- binder_user_error("%d:%d sending u%016llx node %d, cookie mismatch %016llx != %016llx\n",
- proc->pid, thread->pid,
- (u64)fp->binder, node->debug_id,
- (u64)fp->cookie, (u64)node->cookie);
+ fp = to_flat_binder_object(hdr);
+ ret = binder_translate_binder(fp, t, thread);
+ if (ret < 0) {
return_error = BR_FAILED_REPLY;
- goto err_binder_get_ref_for_node_failed;
+ return_error_param = ret;
+ return_error_line = __LINE__;
+ goto err_translate_failed;
}
- if (security_binder_transfer_binder(proc->tsk,
- target_proc->tsk)) {
- return_error = BR_FAILED_REPLY;
- goto err_binder_get_ref_for_node_failed;
- }
- ref = binder_get_ref_for_node(target_proc, node);
- if (ref == NULL) {
- return_error = BR_FAILED_REPLY;
- goto err_binder_get_ref_for_node_failed;
- }
- if (fp->type == BINDER_TYPE_BINDER)
- fp->type = BINDER_TYPE_HANDLE;
- else
- fp->type = BINDER_TYPE_WEAK_HANDLE;
- fp->binder = 0;
- fp->handle = ref->desc;
- fp->cookie = 0;
- binder_inc_ref(ref, fp->type == BINDER_TYPE_HANDLE,
- &thread->todo);
-
- trace_binder_transaction_node_to_ref(t, node, ref);
- binder_debug(BINDER_DEBUG_TRANSACTION,
- " node %d u%016llx -> ref %d desc %d\n",
- node->debug_id, (u64)node->ptr,
- ref->debug_id, ref->desc);
} break;
case BINDER_TYPE_HANDLE:
case BINDER_TYPE_WEAK_HANDLE: {
- struct binder_ref *ref;
+ struct flat_binder_object *fp;
- ref = binder_get_ref(proc, fp->handle,
- fp->type == BINDER_TYPE_HANDLE);
+ fp = to_flat_binder_object(hdr);
+ ret = binder_translate_handle(fp, t, thread);
+ if (ret < 0) {
+ return_error = BR_FAILED_REPLY;
+ return_error_param = ret;
+ return_error_line = __LINE__;
+ goto err_translate_failed;
+ }
+ } break;
- if (ref == NULL) {
- binder_user_error("%d:%d got transaction with invalid handle, %d\n",
- proc->pid,
- thread->pid, fp->handle);
+ case BINDER_TYPE_FD: {
+ struct binder_fd_object *fp = to_binder_fd_object(hdr);
+ int target_fd = binder_translate_fd(fp->fd, t, thread,
+ in_reply_to);
+
+ if (target_fd < 0) {
return_error = BR_FAILED_REPLY;
- goto err_binder_get_ref_failed;
+ return_error_param = target_fd;
+ return_error_line = __LINE__;
+ goto err_translate_failed;
}
- if (security_binder_transfer_binder(proc->tsk,
- target_proc->tsk)) {
+ fp->pad_binder = 0;
+ fp->fd = target_fd;
+ } break;
+ case BINDER_TYPE_FDA: {
+ struct binder_fd_array_object *fda =
+ to_binder_fd_array_object(hdr);
+ struct binder_buffer_object *parent =
+ binder_validate_ptr(t->buffer, fda->parent,
+ off_start,
+ offp - off_start);
+ if (!parent) {
+ binder_user_error("%d:%d got transaction with invalid parent offset or type\n",
+ proc->pid, thread->pid);
return_error = BR_FAILED_REPLY;
- goto err_binder_get_ref_failed;
+ return_error_param = -EINVAL;
+ return_error_line = __LINE__;
+ goto err_bad_parent;
}
- if (ref->node->proc == target_proc) {
- if (fp->type == BINDER_TYPE_HANDLE)
- fp->type = BINDER_TYPE_BINDER;
- else
- fp->type = BINDER_TYPE_WEAK_BINDER;
- fp->binder = ref->node->ptr;
- fp->cookie = ref->node->cookie;
- binder_inc_node(ref->node, fp->type == BINDER_TYPE_BINDER, 0, NULL);
- trace_binder_transaction_ref_to_node(t, ref);
- binder_debug(BINDER_DEBUG_TRANSACTION,
- " ref %d desc %d -> node %d u%016llx\n",
- ref->debug_id, ref->desc, ref->node->debug_id,
- (u64)ref->node->ptr);
- } else {
- struct binder_ref *new_ref;
-
- new_ref = binder_get_ref_for_node(target_proc, ref->node);
- if (new_ref == NULL) {
- return_error = BR_FAILED_REPLY;
- goto err_binder_get_ref_for_node_failed;
- }
- fp->binder = 0;
- fp->handle = new_ref->desc;
- fp->cookie = 0;
- binder_inc_ref(new_ref, fp->type == BINDER_TYPE_HANDLE, NULL);
- trace_binder_transaction_ref_to_ref(t, ref,
- new_ref);
- binder_debug(BINDER_DEBUG_TRANSACTION,
- " ref %d desc %d -> ref %d desc %d (node %d)\n",
- ref->debug_id, ref->desc, new_ref->debug_id,
- new_ref->desc, ref->node->debug_id);
+ if (!binder_validate_fixup(t->buffer, off_start,
+ parent, fda->parent_offset,
+ last_fixup_obj,
+ last_fixup_min_off)) {
+ binder_user_error("%d:%d got transaction with out-of-order buffer fixup\n",
+ proc->pid, thread->pid);
+ return_error = BR_FAILED_REPLY;
+ return_error_param = -EINVAL;
+ return_error_line = __LINE__;
+ goto err_bad_parent;
}
- } break;
-
- case BINDER_TYPE_FD: {
- int target_fd;
- struct file *file;
-
- if (reply) {
- if (!(in_reply_to->flags & TF_ACCEPT_FDS)) {
- binder_user_error("%d:%d got reply with fd, %d, but target does not allow fds\n",
- proc->pid, thread->pid, fp->handle);
- return_error = BR_FAILED_REPLY;
- goto err_fd_not_allowed;
- }
- } else if (!target_node->accept_fds) {
- binder_user_error("%d:%d got transaction with fd, %d, but target does not allow fds\n",
- proc->pid, thread->pid, fp->handle);
+ ret = binder_translate_fd_array(fda, parent, t, thread,
+ in_reply_to);
+ if (ret < 0) {
return_error = BR_FAILED_REPLY;
- goto err_fd_not_allowed;
+ return_error_param = ret;
+ return_error_line = __LINE__;
+ goto err_translate_failed;
}
-
- file = fget(fp->handle);
- if (file == NULL) {
- binder_user_error("%d:%d got transaction with invalid fd, %d\n",
- proc->pid, thread->pid, fp->handle);
+ last_fixup_obj = parent;
+ last_fixup_min_off =
+ fda->parent_offset + sizeof(u32) * fda->num_fds;
+ } break;
+ case BINDER_TYPE_PTR: {
+ struct binder_buffer_object *bp =
+ to_binder_buffer_object(hdr);
+ size_t buf_left = sg_buf_end - sg_bufp;
+
+ if (bp->length > buf_left) {
+ binder_user_error("%d:%d got transaction with too large buffer\n",
+ proc->pid, thread->pid);
return_error = BR_FAILED_REPLY;
- goto err_fget_failed;
+ return_error_param = -EINVAL;
+ return_error_line = __LINE__;
+ goto err_bad_offset;
}
- if (security_binder_transfer_file(proc->tsk,
- target_proc->tsk,
- file) < 0) {
- fput(file);
+ if (copy_from_user(sg_bufp,
+ (const void __user *)(uintptr_t)
+ bp->buffer, bp->length)) {
+ binder_user_error("%d:%d got transaction with invalid offsets ptr\n",
+ proc->pid, thread->pid);
+ return_error_param = -EFAULT;
return_error = BR_FAILED_REPLY;
- goto err_get_unused_fd_failed;
+ return_error_line = __LINE__;
+ goto err_copy_data_failed;
}
- target_fd = task_get_unused_fd_flags(target_proc, O_CLOEXEC);
- if (target_fd < 0) {
- fput(file);
+ /* Fixup buffer pointer to target proc address space */
+ bp->buffer = (uintptr_t)sg_bufp +
+ binder_alloc_get_user_buffer_offset(
+ &target_proc->alloc);
+ sg_bufp += ALIGN(bp->length, sizeof(u64));
+
+ ret = binder_fixup_parent(t, thread, bp, off_start,
+ offp - off_start,
+ last_fixup_obj,
+ last_fixup_min_off);
+ if (ret < 0) {
return_error = BR_FAILED_REPLY;
- goto err_get_unused_fd_failed;
+ return_error_param = ret;
+ return_error_line = __LINE__;
+ goto err_translate_failed;
}
- task_fd_install(target_proc, target_fd, file);
- trace_binder_transaction_fd(t, fp->handle, target_fd);
- binder_debug(BINDER_DEBUG_TRANSACTION,
- " fd %d -> %d\n", fp->handle, target_fd);
- /* TODO: fput? */
- fp->binder = 0;
- fp->handle = target_fd;
+ last_fixup_obj = bp;
+ last_fixup_min_off = 0;
} break;
-
default:
binder_user_error("%d:%d got transaction with invalid object type, %x\n",
- proc->pid, thread->pid, fp->type);
+ proc->pid, thread->pid, hdr->type);
return_error = BR_FAILED_REPLY;
+ return_error_param = -EINVAL;
+ return_error_line = __LINE__;
goto err_bad_object_type;
}
}
+ tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE;
+ binder_enqueue_work(proc, tcomplete, &thread->todo);
+ t->work.type = BINDER_WORK_TRANSACTION;
+
if (reply) {
+ binder_inner_proc_lock(target_proc);
+ if (target_thread->is_dead) {
+ binder_inner_proc_unlock(target_proc);
+ goto err_dead_proc_or_thread;
+ }
BUG_ON(t->buffer->async_transaction != 0);
- binder_pop_transaction(target_thread, in_reply_to);
+ binder_pop_transaction_ilocked(target_thread, in_reply_to);
+ binder_enqueue_work_ilocked(&t->work, &target_thread->todo);
+ binder_inner_proc_unlock(target_proc);
+ wake_up_interruptible_sync(&target_thread->wait);
+ binder_restore_priority(current, in_reply_to->saved_priority);
+ binder_free_transaction(in_reply_to);
} else if (!(t->flags & TF_ONE_WAY)) {
BUG_ON(t->buffer->async_transaction != 0);
+ binder_inner_proc_lock(proc);
t->need_reply = 1;
t->from_parent = thread->transaction_stack;
thread->transaction_stack = t;
+ binder_inner_proc_unlock(proc);
+ if (!binder_proc_transaction(t, target_proc, target_thread)) {
+ binder_inner_proc_lock(proc);
+ binder_pop_transaction_ilocked(thread, t);
+ binder_inner_proc_unlock(proc);
+ goto err_dead_proc_or_thread;
+ }
} else {
BUG_ON(target_node == NULL);
BUG_ON(t->buffer->async_transaction != 1);
- if (target_node->has_async_transaction) {
- target_list = &target_node->async_todo;
- target_wait = NULL;
- } else
- target_node->has_async_transaction = 1;
- }
- t->work.type = BINDER_WORK_TRANSACTION;
- list_add_tail(&t->work.entry, target_list);
- tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE;
- list_add_tail(&tcomplete->entry, &thread->todo);
- if (target_wait) {
- if (reply || !(t->flags & TF_ONE_WAY))
- wake_up_interruptible_sync(target_wait);
- else
- wake_up_interruptible(target_wait);
+ if (!binder_proc_transaction(t, target_proc, NULL))
+ goto err_dead_proc_or_thread;
}
+ if (target_thread)
+ binder_thread_dec_tmpref(target_thread);
+ binder_proc_dec_tmpref(target_proc);
+ if (target_node)
+ binder_dec_node_tmpref(target_node);
+ /*
+ * write barrier to synchronize with initialization
+ * of log entry
+ */
+ smp_wmb();
+ WRITE_ONCE(e->debug_id_done, t_debug_id);
return;
-err_get_unused_fd_failed:
-err_fget_failed:
-err_fd_not_allowed:
-err_binder_get_ref_for_node_failed:
-err_binder_get_ref_failed:
-err_binder_new_node_failed:
+err_dead_proc_or_thread:
+ return_error = BR_DEAD_REPLY;
+ return_error_line = __LINE__;
+ binder_dequeue_work(proc, tcomplete);
+err_translate_failed:
err_bad_object_type:
err_bad_offset:
+err_bad_parent:
err_copy_data_failed:
trace_binder_transaction_failed_buffer_release(t->buffer);
binder_transaction_buffer_release(target_proc, t->buffer, offp);
+ if (target_node)
+ binder_dec_node_tmpref(target_node);
+ target_node = NULL;
t->buffer->transaction = NULL;
- binder_free_buf(target_proc, t->buffer);
+ binder_alloc_free_buf(&target_proc->alloc, t->buffer);
err_binder_alloc_buf_failed:
kfree(tcomplete);
binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE);
@@ -1756,25 +3324,52 @@ err_bad_call_stack:
err_empty_call_stack:
err_dead_binder:
err_invalid_target_handle:
-err_no_context_mgr_node:
+ if (target_thread)
+ binder_thread_dec_tmpref(target_thread);
+ if (target_proc)
+ binder_proc_dec_tmpref(target_proc);
+ if (target_node) {
+ binder_dec_node(target_node, 1, 0);
+ binder_dec_node_tmpref(target_node);
+ }
+
binder_debug(BINDER_DEBUG_FAILED_TRANSACTION,
- "%d:%d transaction failed %d, size %lld-%lld\n",
- proc->pid, thread->pid, return_error,
- (u64)tr->data_size, (u64)tr->offsets_size);
+ "%d:%d transaction failed %d/%d, size %lld-%lld line %d\n",
+ proc->pid, thread->pid, return_error, return_error_param,
+ (u64)tr->data_size, (u64)tr->offsets_size,
+ return_error_line);
{
struct binder_transaction_log_entry *fe;
+ e->return_error = return_error;
+ e->return_error_param = return_error_param;
+ e->return_error_line = return_error_line;
fe = binder_transaction_log_add(&binder_transaction_log_failed);
*fe = *e;
+ /*
+ * write barrier to synchronize with initialization
+ * of log entry
+ */
+ smp_wmb();
+ WRITE_ONCE(e->debug_id_done, t_debug_id);
+ WRITE_ONCE(fe->debug_id_done, t_debug_id);
}
- BUG_ON(thread->return_error != BR_OK);
+ BUG_ON(thread->return_error.cmd != BR_OK);
if (in_reply_to) {
- thread->return_error = BR_TRANSACTION_COMPLETE;
+ binder_restore_priority(current, in_reply_to->saved_priority);
+ thread->return_error.cmd = BR_TRANSACTION_COMPLETE;
+ binder_enqueue_work(thread->proc,
+ &thread->return_error.work,
+ &thread->todo);
binder_send_failed_reply(in_reply_to, return_error);
- } else
- thread->return_error = return_error;
+ } else {
+ thread->return_error.cmd = return_error;
+ binder_enqueue_work(thread->proc,
+ &thread->return_error.work,
+ &thread->todo);
+ }
}
static int binder_thread_write(struct binder_proc *proc,
@@ -1783,19 +3378,22 @@ static int binder_thread_write(struct binder_proc *proc,
binder_size_t *consumed)
{
uint32_t cmd;
+ struct binder_context *context = proc->context;
void __user *buffer = (void __user *)(uintptr_t)binder_buffer;
void __user *ptr = buffer + *consumed;
void __user *end = buffer + size;
- while (ptr < end && thread->return_error == BR_OK) {
+ while (ptr < end && thread->return_error.cmd == BR_OK) {
+ int ret;
+
if (get_user(cmd, (uint32_t __user *)ptr))
return -EFAULT;
ptr += sizeof(uint32_t);
trace_binder_command(cmd);
if (_IOC_NR(cmd) < ARRAY_SIZE(binder_stats.bc)) {
- binder_stats.bc[_IOC_NR(cmd)]++;
- proc->stats.bc[_IOC_NR(cmd)]++;
- thread->stats.bc[_IOC_NR(cmd)]++;
+ atomic_inc(&binder_stats.bc[_IOC_NR(cmd)]);
+ atomic_inc(&proc->stats.bc[_IOC_NR(cmd)]);
+ atomic_inc(&thread->stats.bc[_IOC_NR(cmd)]);
}
switch (cmd) {
case BC_INCREFS:
@@ -1803,53 +3401,61 @@ static int binder_thread_write(struct binder_proc *proc,
case BC_RELEASE:
case BC_DECREFS: {
uint32_t target;
- struct binder_ref *ref;
const char *debug_string;
+ bool strong = cmd == BC_ACQUIRE || cmd == BC_RELEASE;
+ bool increment = cmd == BC_INCREFS || cmd == BC_ACQUIRE;
+ struct binder_ref_data rdata;
if (get_user(target, (uint32_t __user *)ptr))
return -EFAULT;
+
ptr += sizeof(uint32_t);
- if (target == 0 && binder_context_mgr_node &&
- (cmd == BC_INCREFS || cmd == BC_ACQUIRE)) {
- ref = binder_get_ref_for_node(proc,
- binder_context_mgr_node);
- if (ref->desc != target) {
- binder_user_error("%d:%d tried to acquire reference to desc 0, got %d instead\n",
- proc->pid, thread->pid,
- ref->desc);
- }
- } else
- ref = binder_get_ref(proc, target,
- cmd == BC_ACQUIRE ||
- cmd == BC_RELEASE);
- if (ref == NULL) {
- binder_user_error("%d:%d refcount change on invalid ref %d\n",
- proc->pid, thread->pid, target);
- break;
+ ret = -1;
+ if (increment && !target) {
+ struct binder_node *ctx_mgr_node;
+ mutex_lock(&context->context_mgr_node_lock);
+ ctx_mgr_node = context->binder_context_mgr_node;
+ if (ctx_mgr_node)
+ ret = binder_inc_ref_for_node(
+ proc, ctx_mgr_node,
+ strong, NULL, &rdata);
+ mutex_unlock(&context->context_mgr_node_lock);
+ }
+ if (ret)
+ ret = binder_update_ref_for_handle(
+ proc, target, increment, strong,
+ &rdata);
+ if (!ret && rdata.desc != target) {
+ binder_user_error("%d:%d tried to acquire reference to desc %d, got %d instead\n",
+ proc->pid, thread->pid,
+ target, rdata.desc);
}
switch (cmd) {
case BC_INCREFS:
debug_string = "IncRefs";
- binder_inc_ref(ref, 0, NULL);
break;
case BC_ACQUIRE:
debug_string = "Acquire";
- binder_inc_ref(ref, 1, NULL);
break;
case BC_RELEASE:
debug_string = "Release";
- binder_dec_ref(ref, 1);
break;
case BC_DECREFS:
default:
debug_string = "DecRefs";
- binder_dec_ref(ref, 0);
+ break;
+ }
+ if (ret) {
+ binder_user_error("%d:%d %s %d refcount change on invalid ref %d ret %d\n",
+ proc->pid, thread->pid, debug_string,
+ strong, target, ret);
break;
}
binder_debug(BINDER_DEBUG_USER_REFS,
- "%d:%d %s ref %d desc %d s %d w %d for node %d\n",
- proc->pid, thread->pid, debug_string, ref->debug_id,
- ref->desc, ref->strong, ref->weak, ref->node->debug_id);
+ "%d:%d %s ref %d desc %d s %d w %d\n",
+ proc->pid, thread->pid, debug_string,
+ rdata.debug_id, rdata.desc, rdata.strong,
+ rdata.weak);
break;
}
case BC_INCREFS_DONE:
@@ -1857,6 +3463,7 @@ static int binder_thread_write(struct binder_proc *proc,
binder_uintptr_t node_ptr;
binder_uintptr_t cookie;
struct binder_node *node;
+ bool free_node;
if (get_user(node_ptr, (binder_uintptr_t __user *)ptr))
return -EFAULT;
@@ -1881,13 +3488,17 @@ static int binder_thread_write(struct binder_proc *proc,
"BC_INCREFS_DONE" : "BC_ACQUIRE_DONE",
(u64)node_ptr, node->debug_id,
(u64)cookie, (u64)node->cookie);
+ binder_put_node(node);
break;
}
+ binder_node_inner_lock(node);
if (cmd == BC_ACQUIRE_DONE) {
if (node->pending_strong_ref == 0) {
binder_user_error("%d:%d BC_ACQUIRE_DONE node %d has no pending acquire request\n",
proc->pid, thread->pid,
node->debug_id);
+ binder_node_inner_unlock(node);
+ binder_put_node(node);
break;
}
node->pending_strong_ref = 0;
@@ -1896,16 +3507,23 @@ static int binder_thread_write(struct binder_proc *proc,
binder_user_error("%d:%d BC_INCREFS_DONE node %d has no pending increfs request\n",
proc->pid, thread->pid,
node->debug_id);
+ binder_node_inner_unlock(node);
+ binder_put_node(node);
break;
}
node->pending_weak_ref = 0;
}
- binder_dec_node(node, cmd == BC_ACQUIRE_DONE, 0);
+ free_node = binder_dec_node_nilocked(node,
+ cmd == BC_ACQUIRE_DONE, 0);
+ WARN_ON(free_node);
binder_debug(BINDER_DEBUG_USER_REFS,
- "%d:%d %s node %d ls %d lw %d\n",
+ "%d:%d %s node %d ls %d lw %d tr %d\n",
proc->pid, thread->pid,
cmd == BC_INCREFS_DONE ? "BC_INCREFS_DONE" : "BC_ACQUIRE_DONE",
- node->debug_id, node->local_strong_refs, node->local_weak_refs);
+ node->debug_id, node->local_strong_refs,
+ node->local_weak_refs, node->tmp_refs);
+ binder_node_inner_unlock(node);
+ binder_put_node(node);
break;
}
case BC_ATTEMPT_ACQUIRE:
@@ -1923,15 +3541,20 @@ static int binder_thread_write(struct binder_proc *proc,
return -EFAULT;
ptr += sizeof(binder_uintptr_t);
- buffer = binder_buffer_lookup(proc, data_ptr);
- if (buffer == NULL) {
- binder_user_error("%d:%d BC_FREE_BUFFER u%016llx no match\n",
- proc->pid, thread->pid, (u64)data_ptr);
- break;
- }
- if (!buffer->allow_user_free) {
- binder_user_error("%d:%d BC_FREE_BUFFER u%016llx matched unreturned buffer\n",
- proc->pid, thread->pid, (u64)data_ptr);
+ buffer = binder_alloc_prepare_to_free(&proc->alloc,
+ data_ptr);
+ if (IS_ERR_OR_NULL(buffer)) {
+ if (PTR_ERR(buffer) == -EPERM) {
+ binder_user_error(
+ "%d:%d BC_FREE_BUFFER u%016llx matched unreturned or currently freeing buffer\n",
+ proc->pid, thread->pid,
+ (u64)data_ptr);
+ } else {
+ binder_user_error(
+ "%d:%d BC_FREE_BUFFER u%016llx no match\n",
+ proc->pid, thread->pid,
+ (u64)data_ptr);
+ }
break;
}
binder_debug(BINDER_DEBUG_FREE_BUFFER,
@@ -1945,18 +3568,41 @@ static int binder_thread_write(struct binder_proc *proc,
buffer->transaction = NULL;
}
if (buffer->async_transaction && buffer->target_node) {
- BUG_ON(!buffer->target_node->has_async_transaction);
- if (list_empty(&buffer->target_node->async_todo))
- buffer->target_node->has_async_transaction = 0;
- else
- list_move_tail(buffer->target_node->async_todo.next, &thread->todo);
+ struct binder_node *buf_node;
+ struct binder_work *w;
+
+ buf_node = buffer->target_node;
+ binder_node_inner_lock(buf_node);
+ BUG_ON(!buf_node->has_async_transaction);
+ BUG_ON(buf_node->proc != proc);
+ w = binder_dequeue_work_head_ilocked(
+ &buf_node->async_todo);
+ if (!w) {
+ buf_node->has_async_transaction = 0;
+ } else {
+ binder_enqueue_work_ilocked(
+ w, &proc->todo);
+ binder_wakeup_proc_ilocked(proc);
+ }
+ binder_node_inner_unlock(buf_node);
}
trace_binder_transaction_buffer_release(buffer);
binder_transaction_buffer_release(proc, buffer, NULL);
- binder_free_buf(proc, buffer);
+ binder_alloc_free_buf(&proc->alloc, buffer);
break;
}
+ case BC_TRANSACTION_SG:
+ case BC_REPLY_SG: {
+ struct binder_transaction_data_sg tr;
+
+ if (copy_from_user(&tr, ptr, sizeof(tr)))
+ return -EFAULT;
+ ptr += sizeof(tr);
+ binder_transaction(proc, thread, &tr.transaction_data,
+ cmd == BC_REPLY_SG, tr.buffers_size);
+ break;
+ }
case BC_TRANSACTION:
case BC_REPLY: {
struct binder_transaction_data tr;
@@ -1964,7 +3610,8 @@ static int binder_thread_write(struct binder_proc *proc,
if (copy_from_user(&tr, ptr, sizeof(tr)))
return -EFAULT;
ptr += sizeof(tr);
- binder_transaction(proc, thread, &tr, cmd == BC_REPLY);
+ binder_transaction(proc, thread, &tr,
+ cmd == BC_REPLY, 0);
break;
}
@@ -1972,6 +3619,7 @@ static int binder_thread_write(struct binder_proc *proc,
binder_debug(BINDER_DEBUG_THREADS,
"%d:%d BC_REGISTER_LOOPER\n",
proc->pid, thread->pid);
+ binder_inner_proc_lock(proc);
if (thread->looper & BINDER_LOOPER_STATE_ENTERED) {
thread->looper |= BINDER_LOOPER_STATE_INVALID;
binder_user_error("%d:%d ERROR: BC_REGISTER_LOOPER called after BC_ENTER_LOOPER\n",
@@ -1985,6 +3633,7 @@ static int binder_thread_write(struct binder_proc *proc,
proc->requested_threads_started++;
}
thread->looper |= BINDER_LOOPER_STATE_REGISTERED;
+ binder_inner_proc_unlock(proc);
break;
case BC_ENTER_LOOPER:
binder_debug(BINDER_DEBUG_THREADS,
@@ -2009,7 +3658,7 @@ static int binder_thread_write(struct binder_proc *proc,
uint32_t target;
binder_uintptr_t cookie;
struct binder_ref *ref;
- struct binder_ref_death *death;
+ struct binder_ref_death *death = NULL;
if (get_user(target, (uint32_t __user *)ptr))
return -EFAULT;
@@ -2017,7 +3666,29 @@ static int binder_thread_write(struct binder_proc *proc,
if (get_user(cookie, (binder_uintptr_t __user *)ptr))
return -EFAULT;
ptr += sizeof(binder_uintptr_t);
- ref = binder_get_ref(proc, target, false);
+ if (cmd == BC_REQUEST_DEATH_NOTIFICATION) {
+ /*
+ * Allocate memory for death notification
+ * before taking lock
+ */
+ death = kzalloc(sizeof(*death), GFP_KERNEL);
+ if (death == NULL) {
+ WARN_ON(thread->return_error.cmd !=
+ BR_OK);
+ thread->return_error.cmd = BR_ERROR;
+ binder_enqueue_work(
+ thread->proc,
+ &thread->return_error.work,
+ &thread->todo);
+ binder_debug(
+ BINDER_DEBUG_FAILED_TRANSACTION,
+ "%d:%d BC_REQUEST_DEATH_NOTIFICATION failed\n",
+ proc->pid, thread->pid);
+ break;
+ }
+ }
+ binder_proc_lock(proc);
+ ref = binder_get_ref_olocked(proc, target, false);
if (ref == NULL) {
binder_user_error("%d:%d %s invalid ref %d\n",
proc->pid, thread->pid,
@@ -2025,6 +3696,8 @@ static int binder_thread_write(struct binder_proc *proc,
"BC_REQUEST_DEATH_NOTIFICATION" :
"BC_CLEAR_DEATH_NOTIFICATION",
target);
+ binder_proc_unlock(proc);
+ kfree(death);
break;
}
@@ -2034,21 +3707,18 @@ static int binder_thread_write(struct binder_proc *proc,
cmd == BC_REQUEST_DEATH_NOTIFICATION ?
"BC_REQUEST_DEATH_NOTIFICATION" :
"BC_CLEAR_DEATH_NOTIFICATION",
- (u64)cookie, ref->debug_id, ref->desc,
- ref->strong, ref->weak, ref->node->debug_id);
+ (u64)cookie, ref->data.debug_id,
+ ref->data.desc, ref->data.strong,
+ ref->data.weak, ref->node->debug_id);
+ binder_node_lock(ref->node);
if (cmd == BC_REQUEST_DEATH_NOTIFICATION) {
if (ref->death) {
binder_user_error("%d:%d BC_REQUEST_DEATH_NOTIFICATION death notification already set\n",
proc->pid, thread->pid);
- break;
- }
- death = kzalloc(sizeof(*death), GFP_KERNEL);
- if (death == NULL) {
- thread->return_error = BR_ERROR;
- binder_debug(BINDER_DEBUG_FAILED_TRANSACTION,
- "%d:%d BC_REQUEST_DEATH_NOTIFICATION failed\n",
- proc->pid, thread->pid);
+ binder_node_unlock(ref->node);
+ binder_proc_unlock(proc);
+ kfree(death);
break;
}
binder_stats_created(BINDER_STAT_DEATH);
@@ -2057,17 +3727,19 @@ static int binder_thread_write(struct binder_proc *proc,
ref->death = death;
if (ref->node->proc == NULL) {
ref->death->work.type = BINDER_WORK_DEAD_BINDER;
- if (thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED)) {
- list_add_tail(&ref->death->work.entry, &thread->todo);
- } else {
- list_add_tail(&ref->death->work.entry, &proc->todo);
- wake_up_interruptible(&proc->wait);
- }
+
+ binder_inner_proc_lock(proc);
+ binder_enqueue_work_ilocked(
+ &ref->death->work, &proc->todo);
+ binder_wakeup_proc_ilocked(proc);
+ binder_inner_proc_unlock(proc);
}
} else {
if (ref->death == NULL) {
binder_user_error("%d:%d BC_CLEAR_DEATH_NOTIFICATION death notification not active\n",
proc->pid, thread->pid);
+ binder_node_unlock(ref->node);
+ binder_proc_unlock(proc);
break;
}
death = ref->death;
@@ -2076,22 +3748,35 @@ static int binder_thread_write(struct binder_proc *proc,
proc->pid, thread->pid,
(u64)death->cookie,
(u64)cookie);
+ binder_node_unlock(ref->node);
+ binder_proc_unlock(proc);
break;
}
ref->death = NULL;
+ binder_inner_proc_lock(proc);
if (list_empty(&death->work.entry)) {
death->work.type = BINDER_WORK_CLEAR_DEATH_NOTIFICATION;
- if (thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED)) {
- list_add_tail(&death->work.entry, &thread->todo);
- } else {
- list_add_tail(&death->work.entry, &proc->todo);
- wake_up_interruptible(&proc->wait);
+ if (thread->looper &
+ (BINDER_LOOPER_STATE_REGISTERED |
+ BINDER_LOOPER_STATE_ENTERED))
+ binder_enqueue_work_ilocked(
+ &death->work,
+ &thread->todo);
+ else {
+ binder_enqueue_work_ilocked(
+ &death->work,
+ &proc->todo);
+ binder_wakeup_proc_ilocked(
+ proc);
}
} else {
BUG_ON(death->work.type != BINDER_WORK_DEAD_BINDER);
death->work.type = BINDER_WORK_DEAD_BINDER_AND_CLEAR;
}
+ binder_inner_proc_unlock(proc);
}
+ binder_node_unlock(ref->node);
+ binder_proc_unlock(proc);
} break;
case BC_DEAD_BINDER_DONE: {
struct binder_work *w;
@@ -2102,8 +3787,13 @@ static int binder_thread_write(struct binder_proc *proc,
return -EFAULT;
ptr += sizeof(cookie);
- list_for_each_entry(w, &proc->delivered_death, entry) {
- struct binder_ref_death *tmp_death = container_of(w, struct binder_ref_death, work);
+ binder_inner_proc_lock(proc);
+ list_for_each_entry(w, &proc->delivered_death,
+ entry) {
+ struct binder_ref_death *tmp_death =
+ container_of(w,
+ struct binder_ref_death,
+ work);
if (tmp_death->cookie == cookie) {
death = tmp_death;
@@ -2117,19 +3807,25 @@ static int binder_thread_write(struct binder_proc *proc,
if (death == NULL) {
binder_user_error("%d:%d BC_DEAD_BINDER_DONE %016llx not found\n",
proc->pid, thread->pid, (u64)cookie);
+ binder_inner_proc_unlock(proc);
break;
}
-
- list_del_init(&death->work.entry);
+ binder_dequeue_work_ilocked(&death->work);
if (death->work.type == BINDER_WORK_DEAD_BINDER_AND_CLEAR) {
death->work.type = BINDER_WORK_CLEAR_DEATH_NOTIFICATION;
- if (thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED)) {
- list_add_tail(&death->work.entry, &thread->todo);
- } else {
- list_add_tail(&death->work.entry, &proc->todo);
- wake_up_interruptible(&proc->wait);
+ if (thread->looper &
+ (BINDER_LOOPER_STATE_REGISTERED |
+ BINDER_LOOPER_STATE_ENTERED))
+ binder_enqueue_work_ilocked(
+ &death->work, &thread->todo);
+ else {
+ binder_enqueue_work_ilocked(
+ &death->work,
+ &proc->todo);
+ binder_wakeup_proc_ilocked(proc);
}
}
+ binder_inner_proc_unlock(proc);
} break;
default:
@@ -2147,23 +3843,73 @@ static void binder_stat_br(struct binder_proc *proc,
{
trace_binder_return(cmd);
if (_IOC_NR(cmd) < ARRAY_SIZE(binder_stats.br)) {
- binder_stats.br[_IOC_NR(cmd)]++;
- proc->stats.br[_IOC_NR(cmd)]++;
- thread->stats.br[_IOC_NR(cmd)]++;
+ atomic_inc(&binder_stats.br[_IOC_NR(cmd)]);
+ atomic_inc(&proc->stats.br[_IOC_NR(cmd)]);
+ atomic_inc(&thread->stats.br[_IOC_NR(cmd)]);
}
}
-static int binder_has_proc_work(struct binder_proc *proc,
- struct binder_thread *thread)
+static int binder_put_node_cmd(struct binder_proc *proc, /* emits one node command record (cmd, node_ptr, node_cookie) to user space */
+ struct binder_thread *thread,
+ void __user **ptrp, /* in/out: user-buffer cursor, advanced only on success */
+ binder_uintptr_t node_ptr,
+ binder_uintptr_t node_cookie,
+ int node_debug_id, /* used only in the debug message below */
+ uint32_t cmd, const char *cmd_name) /* returns 0, or -EFAULT if any put_user() fails */
{
- return !list_empty(&proc->todo) ||
- (thread->looper & BINDER_LOOPER_STATE_NEED_RETURN);
+ void __user *ptr = *ptrp; /* local cursor; *ptrp stays untouched on the error paths */
+
+ if (put_user(cmd, (uint32_t __user *)ptr))
+ return -EFAULT;
+ ptr += sizeof(uint32_t);
+
+ if (put_user(node_ptr, (binder_uintptr_t __user *)ptr))
+ return -EFAULT;
+ ptr += sizeof(binder_uintptr_t);
+
+ if (put_user(node_cookie, (binder_uintptr_t __user *)ptr))
+ return -EFAULT;
+ ptr += sizeof(binder_uintptr_t);
+
+ binder_stat_br(proc, thread, cmd); /* account the command only after all three writes succeeded */
+ binder_debug(BINDER_DEBUG_USER_REFS, "%d:%d %s %d u%016llx c%016llx\n",
+ proc->pid, thread->pid, cmd_name, node_debug_id,
+ (u64)node_ptr, (u64)node_cookie);
+
+ *ptrp = ptr; /* publish the advanced cursor to the caller */
+ return 0;
}
-static int binder_has_thread_work(struct binder_thread *thread)
+static int binder_wait_for_work(struct binder_thread *thread, /* sleeps on thread->wait until binder_has_work_ilocked() reports work */
+ bool do_proc_work) /* returns 0 when work is available, -ERESTARTSYS when a signal is pending */
{
- return !list_empty(&thread->todo) || thread->return_error != BR_OK ||
- (thread->looper & BINDER_LOOPER_STATE_NEED_RETURN);
+ DEFINE_WAIT(wait);
+ struct binder_proc *proc = thread->proc;
+ int ret = 0;
+
+ freezer_do_not_count();
+ binder_inner_proc_lock(proc);
+ for (;;) {
+ prepare_to_wait(&thread->wait, &wait, TASK_INTERRUPTIBLE); /* queue on the waitqueue before checking for work */
+ if (binder_has_work_ilocked(thread, do_proc_work))
+ break;
+ if (do_proc_work)
+ list_add(&thread->waiting_thread_node, /* list this thread on proc->waiting_threads while it sleeps */
+ &proc->waiting_threads);
+ binder_inner_proc_unlock(proc); /* the inner lock is not held across schedule() */
+ schedule();
+ binder_inner_proc_lock(proc);
+ list_del_init(&thread->waiting_thread_node); /* executed even when do_proc_work is false and the node was never added */
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+ }
+ finish_wait(&thread->wait, &wait);
+ binder_inner_proc_unlock(proc);
+ freezer_count(); /* pairs with freezer_do_not_count() above */
+
+ return ret;
}
static int binder_thread_read(struct binder_proc *proc,
@@ -2185,37 +3931,15 @@ static int binder_thread_read(struct binder_proc *proc,
}
retry:
- wait_for_proc_work = thread->transaction_stack == NULL &&
- list_empty(&thread->todo);
-
- if (thread->return_error != BR_OK && ptr < end) {
- if (thread->return_error2 != BR_OK) {
- if (put_user(thread->return_error2, (uint32_t __user *)ptr))
- return -EFAULT;
- ptr += sizeof(uint32_t);
- binder_stat_br(proc, thread, thread->return_error2);
- if (ptr == end)
- goto done;
- thread->return_error2 = BR_OK;
- }
- if (put_user(thread->return_error, (uint32_t __user *)ptr))
- return -EFAULT;
- ptr += sizeof(uint32_t);
- binder_stat_br(proc, thread, thread->return_error);
- thread->return_error = BR_OK;
- goto done;
- }
-
+ binder_inner_proc_lock(proc);
+ wait_for_proc_work = binder_available_for_proc_work_ilocked(thread);
+ binder_inner_proc_unlock(proc);
thread->looper |= BINDER_LOOPER_STATE_WAITING;
- if (wait_for_proc_work)
- proc->ready_threads++;
-
- binder_unlock(__func__);
trace_binder_wait_for_work(wait_for_proc_work,
!!thread->transaction_stack,
- !list_empty(&thread->todo));
+ !binder_worklist_empty(proc, &thread->todo));
if (wait_for_proc_work) {
if (!(thread->looper & (BINDER_LOOPER_STATE_REGISTERED |
BINDER_LOOPER_STATE_ENTERED))) {
@@ -2224,24 +3948,16 @@ retry:
wait_event_interruptible(binder_user_error_wait,
binder_stop_on_user_error < 2);
}
- binder_set_nice(proc->default_priority);
- if (non_block) {
- if (!binder_has_proc_work(proc, thread))
- ret = -EAGAIN;
- } else
- ret = wait_event_freezable_exclusive(proc->wait, binder_has_proc_work(proc, thread));
- } else {
- if (non_block) {
- if (!binder_has_thread_work(thread))
- ret = -EAGAIN;
- } else
- ret = wait_event_freezable(thread->wait, binder_has_thread_work(thread));
+ binder_restore_priority(current, proc->default_priority);
}
- binder_lock(__func__);
+ if (non_block) {
+ if (!binder_has_work(thread, wait_for_proc_work))
+ ret = -EAGAIN;
+ } else {
+ ret = binder_wait_for_work(thread, wait_for_proc_work);
+ }
- if (wait_for_proc_work)
- proc->ready_threads--;
thread->looper &= ~BINDER_LOOPER_STATE_WAITING;
if (ret)
@@ -2250,31 +3966,52 @@ retry:
while (1) {
uint32_t cmd;
struct binder_transaction_data tr;
- struct binder_work *w;
+ struct binder_work *w = NULL;
+ struct list_head *list = NULL;
struct binder_transaction *t = NULL;
+ struct binder_thread *t_from;
+
+ binder_inner_proc_lock(proc);
+ if (!binder_worklist_empty_ilocked(&thread->todo))
+ list = &thread->todo;
+ else if (!binder_worklist_empty_ilocked(&proc->todo) &&
+ wait_for_proc_work)
+ list = &proc->todo;
+ else {
+ binder_inner_proc_unlock(proc);
- if (!list_empty(&thread->todo)) {
- w = list_first_entry(&thread->todo, struct binder_work,
- entry);
- } else if (!list_empty(&proc->todo) && wait_for_proc_work) {
- w = list_first_entry(&proc->todo, struct binder_work,
- entry);
- } else {
/* no data added */
- if (ptr - buffer == 4 &&
- !(thread->looper & BINDER_LOOPER_STATE_NEED_RETURN))
+ if (ptr - buffer == 4 && !thread->looper_need_return)
goto retry;
break;
}
- if (end - ptr < sizeof(tr) + 4)
+ if (end - ptr < sizeof(tr) + 4) {
+ binder_inner_proc_unlock(proc);
break;
+ }
+ w = binder_dequeue_work_head_ilocked(list);
switch (w->type) {
case BINDER_WORK_TRANSACTION: {
+ binder_inner_proc_unlock(proc);
t = container_of(w, struct binder_transaction, work);
} break;
+ case BINDER_WORK_RETURN_ERROR: { /* deliver a queued binder_error code to user space */
+ struct binder_error *e = container_of(
+ w, struct binder_error, work);
+
+ WARN_ON(e->cmd == BR_OK); /* a queued error entry is expected to carry a real code */
+ binder_inner_proc_unlock(proc); /* drop the inner lock before touching user memory */
+ if (put_user(e->cmd, (uint32_t __user *)ptr))
+ return -EFAULT;
+ cmd = e->cmd; /* latch the delivered code: the loop-local cmd is otherwise uninitialized in this arm */
+ e->cmd = BR_OK; /* mark the error slot as consumed */
+ ptr += sizeof(uint32_t);
+ binder_stat_br(proc, thread, cmd); /* account the code that was actually written */
+ } break;
case BINDER_WORK_TRANSACTION_COMPLETE: {
+ binder_inner_proc_unlock(proc);
cmd = BR_TRANSACTION_COMPLETE;
if (put_user(cmd, (uint32_t __user *)ptr))
return -EFAULT;
@@ -2284,113 +4021,134 @@ retry:
binder_debug(BINDER_DEBUG_TRANSACTION_COMPLETE,
"%d:%d BR_TRANSACTION_COMPLETE\n",
proc->pid, thread->pid);
-
- list_del(&w->entry);
kfree(w);
binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE);
} break;
case BINDER_WORK_NODE: {
struct binder_node *node = container_of(w, struct binder_node, work);
- uint32_t cmd = BR_NOOP;
- const char *cmd_name;
- int strong = node->internal_strong_refs || node->local_strong_refs;
- int weak = !hlist_empty(&node->refs) || node->local_weak_refs || strong;
-
- if (weak && !node->has_weak_ref) {
- cmd = BR_INCREFS;
- cmd_name = "BR_INCREFS";
+ int strong, weak;
+ binder_uintptr_t node_ptr = node->ptr;
+ binder_uintptr_t node_cookie = node->cookie;
+ int node_debug_id = node->debug_id;
+ int has_weak_ref;
+ int has_strong_ref;
+ void __user *orig_ptr = ptr;
+
+ BUG_ON(proc != node->proc);
+ strong = node->internal_strong_refs ||
+ node->local_strong_refs;
+ weak = !hlist_empty(&node->refs) ||
+ node->local_weak_refs ||
+ node->tmp_refs || strong;
+ has_strong_ref = node->has_strong_ref;
+ has_weak_ref = node->has_weak_ref;
+
+ if (weak && !has_weak_ref) {
node->has_weak_ref = 1;
node->pending_weak_ref = 1;
node->local_weak_refs++;
- } else if (strong && !node->has_strong_ref) {
- cmd = BR_ACQUIRE;
- cmd_name = "BR_ACQUIRE";
+ }
+ if (strong && !has_strong_ref) {
node->has_strong_ref = 1;
node->pending_strong_ref = 1;
node->local_strong_refs++;
- } else if (!strong && node->has_strong_ref) {
- cmd = BR_RELEASE;
- cmd_name = "BR_RELEASE";
+ }
+ if (!strong && has_strong_ref)
node->has_strong_ref = 0;
- } else if (!weak && node->has_weak_ref) {
- cmd = BR_DECREFS;
- cmd_name = "BR_DECREFS";
+ if (!weak && has_weak_ref)
node->has_weak_ref = 0;
- }
- if (cmd != BR_NOOP) {
- if (put_user(cmd, (uint32_t __user *)ptr))
- return -EFAULT;
- ptr += sizeof(uint32_t);
- if (put_user(node->ptr,
- (binder_uintptr_t __user *)ptr))
- return -EFAULT;
- ptr += sizeof(binder_uintptr_t);
- if (put_user(node->cookie,
- (binder_uintptr_t __user *)ptr))
- return -EFAULT;
- ptr += sizeof(binder_uintptr_t);
-
- binder_stat_br(proc, thread, cmd);
- binder_debug(BINDER_DEBUG_USER_REFS,
- "%d:%d %s %d u%016llx c%016llx\n",
- proc->pid, thread->pid, cmd_name,
- node->debug_id,
- (u64)node->ptr, (u64)node->cookie);
- } else {
- list_del_init(&w->entry);
- if (!weak && !strong) {
- binder_debug(BINDER_DEBUG_INTERNAL_REFS,
- "%d:%d node %d u%016llx c%016llx deleted\n",
- proc->pid, thread->pid,
- node->debug_id,
- (u64)node->ptr,
- (u64)node->cookie);
- rb_erase(&node->rb_node, &proc->nodes);
- kfree(node);
- binder_stats_deleted(BINDER_STAT_NODE);
- } else {
- binder_debug(BINDER_DEBUG_INTERNAL_REFS,
- "%d:%d node %d u%016llx c%016llx state unchanged\n",
- proc->pid, thread->pid,
- node->debug_id,
- (u64)node->ptr,
- (u64)node->cookie);
- }
- }
+ if (!weak && !strong) {
+ binder_debug(BINDER_DEBUG_INTERNAL_REFS,
+ "%d:%d node %d u%016llx c%016llx deleted\n",
+ proc->pid, thread->pid,
+ node_debug_id,
+ (u64)node_ptr,
+ (u64)node_cookie);
+ rb_erase(&node->rb_node, &proc->nodes);
+ binder_inner_proc_unlock(proc);
+ binder_node_lock(node);
+ /*
+ * Acquire the node lock before freeing the
+ * node to serialize with other threads that
+ * may have been holding the node lock while
+ * decrementing this node (avoids race where
+ * this thread frees while the other thread
+ * is unlocking the node after the final
+ * decrement)
+ */
+ binder_node_unlock(node);
+ binder_free_node(node);
+ } else
+ binder_inner_proc_unlock(proc);
+
+ if (weak && !has_weak_ref)
+ ret = binder_put_node_cmd(
+ proc, thread, &ptr, node_ptr,
+ node_cookie, node_debug_id,
+ BR_INCREFS, "BR_INCREFS");
+ if (!ret && strong && !has_strong_ref)
+ ret = binder_put_node_cmd(
+ proc, thread, &ptr, node_ptr,
+ node_cookie, node_debug_id,
+ BR_ACQUIRE, "BR_ACQUIRE");
+ if (!ret && !strong && has_strong_ref)
+ ret = binder_put_node_cmd(
+ proc, thread, &ptr, node_ptr,
+ node_cookie, node_debug_id,
+ BR_RELEASE, "BR_RELEASE");
+ if (!ret && !weak && has_weak_ref)
+ ret = binder_put_node_cmd(
+ proc, thread, &ptr, node_ptr,
+ node_cookie, node_debug_id,
+ BR_DECREFS, "BR_DECREFS");
+ if (orig_ptr == ptr)
+ binder_debug(BINDER_DEBUG_INTERNAL_REFS,
+ "%d:%d node %d u%016llx c%016llx state unchanged\n",
+ proc->pid, thread->pid,
+ node_debug_id,
+ (u64)node_ptr,
+ (u64)node_cookie);
+ if (ret)
+ return ret;
} break;
case BINDER_WORK_DEAD_BINDER:
case BINDER_WORK_DEAD_BINDER_AND_CLEAR:
case BINDER_WORK_CLEAR_DEATH_NOTIFICATION: {
struct binder_ref_death *death;
uint32_t cmd;
+ binder_uintptr_t cookie;
death = container_of(w, struct binder_ref_death, work);
if (w->type == BINDER_WORK_CLEAR_DEATH_NOTIFICATION)
cmd = BR_CLEAR_DEATH_NOTIFICATION_DONE;
else
cmd = BR_DEAD_BINDER;
- if (put_user(cmd, (uint32_t __user *)ptr))
- return -EFAULT;
- ptr += sizeof(uint32_t);
- if (put_user(death->cookie,
- (binder_uintptr_t __user *)ptr))
- return -EFAULT;
- ptr += sizeof(binder_uintptr_t);
- binder_stat_br(proc, thread, cmd);
+ cookie = death->cookie;
+
binder_debug(BINDER_DEBUG_DEATH_NOTIFICATION,
"%d:%d %s %016llx\n",
proc->pid, thread->pid,
cmd == BR_DEAD_BINDER ?
"BR_DEAD_BINDER" :
"BR_CLEAR_DEATH_NOTIFICATION_DONE",
- (u64)death->cookie);
-
+ (u64)cookie);
if (w->type == BINDER_WORK_CLEAR_DEATH_NOTIFICATION) {
- list_del(&w->entry);
+ binder_inner_proc_unlock(proc);
kfree(death);
binder_stats_deleted(BINDER_STAT_DEATH);
- } else
- list_move(&w->entry, &proc->delivered_death);
+ } else {
+ binder_enqueue_work_ilocked(
+ w, &proc->delivered_death);
+ binder_inner_proc_unlock(proc);
+ }
+ if (put_user(cmd, (uint32_t __user *)ptr))
+ return -EFAULT;
+ ptr += sizeof(uint32_t);
+ if (put_user(cookie,
+ (binder_uintptr_t __user *)ptr))
+ return -EFAULT;
+ ptr += sizeof(binder_uintptr_t);
+ binder_stat_br(proc, thread, cmd);
if (cmd == BR_DEAD_BINDER)
goto done; /* DEAD_BINDER notifications can cause transactions */
} break;
@@ -2402,16 +4160,14 @@ retry:
BUG_ON(t->buffer == NULL);
if (t->buffer->target_node) {
struct binder_node *target_node = t->buffer->target_node;
+ struct binder_priority node_prio;
tr.target.ptr = target_node->ptr;
tr.cookie = target_node->cookie;
- t->saved_priority = task_nice(current);
- if (t->priority < target_node->min_priority &&
- !(t->flags & TF_ONE_WAY))
- binder_set_nice(t->priority);
- else if (!(t->flags & TF_ONE_WAY) ||
- t->saved_priority > target_node->min_priority)
- binder_set_nice(target_node->min_priority);
+ node_prio.sched_policy = target_node->sched_policy;
+ node_prio.prio = target_node->min_priority;
+ binder_transaction_priority(current, t, node_prio,
+ target_node->inherit_rt);
cmd = BR_TRANSACTION;
} else {
tr.target.ptr = 0;
@@ -2422,8 +4178,9 @@ retry:
tr.flags = t->flags;
tr.sender_euid = from_kuid(current_user_ns(), t->sender_euid);
- if (t->from) {
- struct task_struct *sender = t->from->proc->tsk;
+ t_from = binder_get_txn_from(t);
+ if (t_from) {
+ struct task_struct *sender = t_from->proc->tsk;
tr.sender_pid = task_tgid_nr_ns(sender,
task_active_pid_ns(current));
@@ -2433,18 +4190,24 @@ retry:
tr.data_size = t->buffer->data_size;
tr.offsets_size = t->buffer->offsets_size;
- tr.data.ptr.buffer = (binder_uintptr_t)(
- (uintptr_t)t->buffer->data +
- proc->user_buffer_offset);
+ tr.data.ptr.buffer = (binder_uintptr_t)
+ ((uintptr_t)t->buffer->data +
+ binder_alloc_get_user_buffer_offset(&proc->alloc));
tr.data.ptr.offsets = tr.data.ptr.buffer +
ALIGN(t->buffer->data_size,
sizeof(void *));
- if (put_user(cmd, (uint32_t __user *)ptr))
+ if (put_user(cmd, (uint32_t __user *)ptr)) {
+ if (t_from)
+ binder_thread_dec_tmpref(t_from);
return -EFAULT;
+ }
ptr += sizeof(uint32_t);
- if (copy_to_user(ptr, &tr, sizeof(tr)))
+ if (copy_to_user(ptr, &tr, sizeof(tr))) {
+ if (t_from)
+ binder_thread_dec_tmpref(t_from);
return -EFAULT;
+ }
ptr += sizeof(tr);
trace_binder_transaction_received(t);
@@ -2454,21 +4217,22 @@ retry:
proc->pid, thread->pid,
(cmd == BR_TRANSACTION) ? "BR_TRANSACTION" :
"BR_REPLY",
- t->debug_id, t->from ? t->from->proc->pid : 0,
- t->from ? t->from->pid : 0, cmd,
+ t->debug_id, t_from ? t_from->proc->pid : 0,
+ t_from ? t_from->pid : 0, cmd,
t->buffer->data_size, t->buffer->offsets_size,
(u64)tr.data.ptr.buffer, (u64)tr.data.ptr.offsets);
- list_del(&t->work.entry);
+ if (t_from)
+ binder_thread_dec_tmpref(t_from);
t->buffer->allow_user_free = 1;
if (cmd == BR_TRANSACTION && !(t->flags & TF_ONE_WAY)) {
+ binder_inner_proc_lock(thread->proc);
t->to_parent = thread->transaction_stack;
t->to_thread = thread;
thread->transaction_stack = t;
+ binder_inner_proc_unlock(thread->proc);
} else {
- t->buffer->transaction = NULL;
- kfree(t);
- binder_stats_deleted(BINDER_STAT_TRANSACTION);
+ binder_free_transaction(t);
}
break;
}
@@ -2476,29 +4240,36 @@ retry:
done:
*consumed = ptr - buffer;
- if (proc->requested_threads + proc->ready_threads == 0 &&
+ binder_inner_proc_lock(proc);
+ if (proc->requested_threads == 0 &&
+ list_empty(&thread->proc->waiting_threads) &&
proc->requested_threads_started < proc->max_threads &&
(thread->looper & (BINDER_LOOPER_STATE_REGISTERED |
BINDER_LOOPER_STATE_ENTERED)) /* the user-space code fails to */
/*spawn a new thread if we leave this out */) {
proc->requested_threads++;
+ binder_inner_proc_unlock(proc);
binder_debug(BINDER_DEBUG_THREADS,
"%d:%d BR_SPAWN_LOOPER\n",
proc->pid, thread->pid);
if (put_user(BR_SPAWN_LOOPER, (uint32_t __user *)buffer))
return -EFAULT;
binder_stat_br(proc, thread, BR_SPAWN_LOOPER);
- }
+ } else
+ binder_inner_proc_unlock(proc);
return 0;
}
-static void binder_release_work(struct list_head *list)
+static void binder_release_work(struct binder_proc *proc,
+ struct list_head *list)
{
struct binder_work *w;
- while (!list_empty(list)) {
- w = list_first_entry(list, struct binder_work, entry);
- list_del_init(&w->entry);
+ while (1) {
+ w = binder_dequeue_work_head(proc, list);
+ if (!w)
+ return;
+
switch (w->type) {
case BINDER_WORK_TRANSACTION: {
struct binder_transaction *t;
@@ -2511,11 +4282,17 @@ static void binder_release_work(struct list_head *list)
binder_debug(BINDER_DEBUG_DEAD_TRANSACTION,
"undelivered transaction %d\n",
t->debug_id);
- t->buffer->transaction = NULL;
- kfree(t);
- binder_stats_deleted(BINDER_STAT_TRANSACTION);
+ binder_free_transaction(t);
}
} break;
+ case BINDER_WORK_RETURN_ERROR: {
+ struct binder_error *e = container_of(
+ w, struct binder_error, work);
+
+ binder_debug(BINDER_DEBUG_DEAD_TRANSACTION,
+ "undelivered TRANSACTION_ERROR: %u\n",
+ e->cmd);
+ } break;
case BINDER_WORK_TRANSACTION_COMPLETE: {
binder_debug(BINDER_DEBUG_DEAD_TRANSACTION,
"undelivered TRANSACTION_COMPLETE\n");
@@ -2542,7 +4319,8 @@ static void binder_release_work(struct list_head *list)
}
-static struct binder_thread *binder_get_thread(struct binder_proc *proc)
+static struct binder_thread *binder_get_thread_ilocked(
+ struct binder_proc *proc, struct binder_thread *new_thread)
{
struct binder_thread *thread = NULL;
struct rb_node *parent = NULL;
@@ -2557,38 +4335,102 @@ static struct binder_thread *binder_get_thread(struct binder_proc *proc)
else if (current->pid > thread->pid)
p = &(*p)->rb_right;
else
- break;
+ return thread;
}
- if (*p == NULL) {
- thread = kzalloc(sizeof(*thread), GFP_KERNEL);
- if (thread == NULL)
+ if (!new_thread)
+ return NULL;
+ thread = new_thread;
+ binder_stats_created(BINDER_STAT_THREAD);
+ thread->proc = proc;
+ thread->pid = current->pid;
+ get_task_struct(current);
+ thread->task = current;
+ atomic_set(&thread->tmp_ref, 0);
+ init_waitqueue_head(&thread->wait);
+ INIT_LIST_HEAD(&thread->todo);
+ rb_link_node(&thread->rb_node, parent, p);
+ rb_insert_color(&thread->rb_node, &proc->threads);
+ thread->looper_need_return = true;
+ thread->return_error.work.type = BINDER_WORK_RETURN_ERROR;
+ thread->return_error.cmd = BR_OK;
+ thread->reply_error.work.type = BINDER_WORK_RETURN_ERROR;
+ thread->reply_error.cmd = BR_OK;
+ INIT_LIST_HEAD(&new_thread->waiting_thread_node);
+ return thread;
+}
+
+static struct binder_thread *binder_get_thread(struct binder_proc *proc) /* returns the binder_thread for the calling task, creating it if absent; NULL on allocation failure */
+{
+ struct binder_thread *thread;
+ struct binder_thread *new_thread;
+
+ binder_inner_proc_lock(proc);
+ thread = binder_get_thread_ilocked(proc, NULL); /* lookup only: NULL means "do not insert" */
+ binder_inner_proc_unlock(proc);
+ if (!thread) {
+ new_thread = kzalloc(sizeof(*thread), GFP_KERNEL); /* allocate with the inner lock dropped */
+ if (new_thread == NULL)
return NULL;
- binder_stats_created(BINDER_STAT_THREAD);
- thread->proc = proc;
- thread->pid = current->pid;
- init_waitqueue_head(&thread->wait);
- INIT_LIST_HEAD(&thread->todo);
- rb_link_node(&thread->rb_node, parent, p);
- rb_insert_color(&thread->rb_node, &proc->threads);
- thread->looper |= BINDER_LOOPER_STATE_NEED_RETURN;
- thread->return_error = BR_OK;
- thread->return_error2 = BR_OK;
+ binder_inner_proc_lock(proc);
+ thread = binder_get_thread_ilocked(proc, new_thread); /* repeat the lookup under the lock; new_thread is inserted if still absent */
+ binder_inner_proc_unlock(proc);
+ if (thread != new_thread)
+ kfree(new_thread); /* an existing entry was returned, so the spare allocation is unused */
}
return thread;
}
-static int binder_free_thread(struct binder_proc *proc,
- struct binder_thread *thread)
+static void binder_free_proc(struct binder_proc *proc) /* final teardown of a binder_proc; frees proc itself */
+{
+ BUG_ON(!list_empty(&proc->todo)); /* no work may still be queued on the proc */
+ BUG_ON(!list_empty(&proc->delivered_death)); /* no delivered death notifications may remain */
+ binder_alloc_deferred_release(&proc->alloc);
+ put_task_struct(proc->tsk); /* NOTE(review): presumably pairs with a get_task_struct() at open time - not visible here, confirm */
+ binder_stats_deleted(BINDER_STAT_PROC);
+ kfree(proc);
+}
+
+static void binder_free_thread(struct binder_thread *thread) /* final teardown of a binder_thread; frees thread itself */
+{
+ BUG_ON(!list_empty(&thread->todo)); /* thread work list must be empty by now */
+ binder_stats_deleted(BINDER_STAT_THREAD);
+ binder_proc_dec_tmpref(thread->proc); /* drops the proc reference taken in binder_thread_release() */
+ put_task_struct(thread->task); /* pairs with get_task_struct(current) in binder_get_thread_ilocked() */
+ kfree(thread);
+}
+
+static int binder_thread_release(struct binder_proc *proc,
+ struct binder_thread *thread)
{
struct binder_transaction *t;
struct binder_transaction *send_reply = NULL;
int active_transactions = 0;
-
+ struct binder_transaction *last_t = NULL;
+
+ binder_inner_proc_lock(thread->proc);
+ /*
+ * take a ref on the proc so it survives
+ * after we remove this thread from proc->threads.
+ * The corresponding dec is when we actually
+ * free the thread in binder_free_thread()
+ */
+ proc->tmp_ref++;
+ /*
+ * take a ref on this thread to ensure it
+ * survives while we are releasing it
+ */
+ atomic_inc(&thread->tmp_ref);
rb_erase(&thread->rb_node, &proc->threads);
t = thread->transaction_stack;
- if (t && t->to_thread == thread)
- send_reply = t;
+ if (t) {
+ spin_lock(&t->lock);
+ if (t->to_thread == thread)
+ send_reply = t;
+ }
+ thread->is_dead = true;
+
while (t) {
+ last_t = t;
active_transactions++;
binder_debug(BINDER_DEBUG_DEAD_TRANSACTION,
"release %d:%d transaction %d %s, still active\n",
@@ -2609,12 +4451,16 @@ static int binder_free_thread(struct binder_proc *proc,
t = t->from_parent;
} else
BUG();
+ spin_unlock(&last_t->lock);
+ if (t)
+ spin_lock(&t->lock);
}
+ binder_inner_proc_unlock(thread->proc);
+
if (send_reply)
binder_send_failed_reply(send_reply, BR_DEAD_REPLY);
- binder_release_work(&thread->todo);
- kfree(thread);
- binder_stats_deleted(BINDER_STAT_THREAD);
+ binder_release_work(proc, &thread->todo);
+ binder_thread_dec_tmpref(thread);
return active_transactions;
}
@@ -2623,34 +4469,23 @@ static unsigned int binder_poll(struct file *filp,
{
struct binder_proc *proc = filp->private_data;
struct binder_thread *thread = NULL;
- int wait_for_proc_work;
-
- binder_lock(__func__);
+ bool wait_for_proc_work;
thread = binder_get_thread(proc);
- if (!thread) {
- binder_unlock(__func__);
+ if (!thread)
return POLLERR;
- }
- wait_for_proc_work = thread->transaction_stack == NULL &&
- list_empty(&thread->todo) && thread->return_error == BR_OK;
+ binder_inner_proc_lock(thread->proc);
+ thread->looper |= BINDER_LOOPER_STATE_POLL;
+ wait_for_proc_work = binder_available_for_proc_work_ilocked(thread);
- binder_unlock(__func__);
+ binder_inner_proc_unlock(thread->proc);
+
+ poll_wait(filp, &thread->wait, wait);
+
+ if (binder_has_work(thread, wait_for_proc_work))
+ return POLLIN;
- if (wait_for_proc_work) {
- if (binder_has_proc_work(proc, thread))
- return POLLIN;
- poll_wait(filp, &proc->wait, wait);
- if (binder_has_proc_work(proc, thread))
- return POLLIN;
- } else {
- if (binder_has_thread_work(thread))
- return POLLIN;
- poll_wait(filp, &thread->wait, wait);
- if (binder_has_thread_work(thread))
- return POLLIN;
- }
return 0;
}
@@ -2697,8 +4532,10 @@ static int binder_ioctl_write_read(struct file *filp,
&bwr.read_consumed,
filp->f_flags & O_NONBLOCK);
trace_binder_read_done(ret);
- if (!list_empty(&proc->todo))
- wake_up_interruptible(&proc->wait);
+ binder_inner_proc_lock(proc);
+ if (!binder_worklist_empty_ilocked(&proc->todo))
+ binder_wakeup_proc_ilocked(proc);
+ binder_inner_proc_unlock(proc);
if (ret < 0) {
if (copy_to_user(ubuf, &bwr, sizeof(bwr)))
ret = -EFAULT;
@@ -2722,9 +4559,12 @@ static int binder_ioctl_set_ctx_mgr(struct file *filp)
{
int ret = 0;
struct binder_proc *proc = filp->private_data;
+ struct binder_context *context = proc->context;
+ struct binder_node *new_node;
kuid_t curr_euid = current_euid();
- if (binder_context_mgr_node != NULL) {
+ mutex_lock(&context->context_mgr_node_lock);
+ if (context->binder_context_mgr_node) {
pr_err("BINDER_SET_CONTEXT_MGR already set\n");
ret = -EBUSY;
goto out;
@@ -2732,31 +4572,60 @@ static int binder_ioctl_set_ctx_mgr(struct file *filp)
ret = security_binder_set_context_mgr(proc->tsk);
if (ret < 0)
goto out;
- if (uid_valid(binder_context_mgr_uid)) {
- if (!uid_eq(binder_context_mgr_uid, curr_euid)) {
+ if (uid_valid(context->binder_context_mgr_uid)) {
+ if (!uid_eq(context->binder_context_mgr_uid, curr_euid)) {
pr_err("BINDER_SET_CONTEXT_MGR bad uid %d != %d\n",
from_kuid(&init_user_ns, curr_euid),
from_kuid(&init_user_ns,
- binder_context_mgr_uid));
+ context->binder_context_mgr_uid));
ret = -EPERM;
goto out;
}
} else {
- binder_context_mgr_uid = curr_euid;
+ context->binder_context_mgr_uid = curr_euid;
}
- binder_context_mgr_node = binder_new_node(proc, 0, 0);
- if (binder_context_mgr_node == NULL) {
+ new_node = binder_new_node(proc, NULL);
+ if (!new_node) {
ret = -ENOMEM;
goto out;
}
- binder_context_mgr_node->local_weak_refs++;
- binder_context_mgr_node->local_strong_refs++;
- binder_context_mgr_node->has_strong_ref = 1;
- binder_context_mgr_node->has_weak_ref = 1;
+ binder_node_lock(new_node);
+ new_node->local_weak_refs++;
+ new_node->local_strong_refs++;
+ new_node->has_strong_ref = 1;
+ new_node->has_weak_ref = 1;
+ context->binder_context_mgr_node = new_node;
+ binder_node_unlock(new_node);
+ binder_put_node(new_node);
out:
+ mutex_unlock(&context->context_mgr_node_lock);
return ret;
}
+static int binder_ioctl_get_node_debug_info(struct binder_proc *proc,
+ struct binder_node_debug_info *info) {
+ struct rb_node *n;
+ binder_uintptr_t ptr = info->ptr;
+
+ memset(info, 0, sizeof(*info));
+
+ binder_inner_proc_lock(proc);
+ for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) {
+ struct binder_node *node = rb_entry(n, struct binder_node,
+ rb_node);
+ if (node->ptr > ptr) {
+ info->ptr = node->ptr;
+ info->cookie = node->cookie;
+ info->has_strong_ref = node->has_strong_ref;
+ info->has_weak_ref = node->has_weak_ref;
+ break;
+ }
+ }
+ binder_inner_proc_unlock(proc);
+
+ return 0;
+}
+
static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
int ret;
@@ -2768,13 +4637,14 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
/*pr_info("binder_ioctl: %d:%d %x %lx\n",
proc->pid, current->pid, cmd, arg);*/
+ binder_selftest_alloc(&proc->alloc);
+
trace_binder_ioctl(cmd, arg);
ret = wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2);
if (ret)
goto err_unlocked;
- binder_lock(__func__);
thread = binder_get_thread(proc);
if (thread == NULL) {
ret = -ENOMEM;
@@ -2787,12 +4657,19 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (ret)
goto err;
break;
- case BINDER_SET_MAX_THREADS:
- if (copy_from_user(&proc->max_threads, ubuf, sizeof(proc->max_threads))) {
+ case BINDER_SET_MAX_THREADS: {
+ int max_threads;
+
+ if (copy_from_user(&max_threads, ubuf,
+ sizeof(max_threads))) {
ret = -EINVAL;
goto err;
}
+ binder_inner_proc_lock(proc);
+ proc->max_threads = max_threads;
+ binder_inner_proc_unlock(proc);
break;
+ }
case BINDER_SET_CONTEXT_MGR:
ret = binder_ioctl_set_ctx_mgr(filp);
if (ret)
@@ -2801,7 +4678,7 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case BINDER_THREAD_EXIT:
binder_debug(BINDER_DEBUG_THREADS, "%d:%d exit\n",
proc->pid, thread->pid);
- binder_free_thread(proc, thread);
+ binder_thread_release(proc, thread);
thread = NULL;
break;
case BINDER_VERSION: {
@@ -2818,6 +4695,24 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
}
break;
}
+ case BINDER_GET_NODE_DEBUG_INFO: {
+ struct binder_node_debug_info info;
+
+ if (copy_from_user(&info, ubuf, sizeof(info))) {
+ ret = -EFAULT;
+ goto err;
+ }
+
+ ret = binder_ioctl_get_node_debug_info(proc, &info);
+ if (ret < 0)
+ goto err;
+
+ if (copy_to_user(ubuf, &info, sizeof(info))) {
+ ret = -EFAULT;
+ goto err;
+ }
+ break;
+ }
default:
ret = -EINVAL;
goto err;
@@ -2825,8 +4720,7 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
ret = 0;
err:
if (thread)
- thread->looper &= ~BINDER_LOOPER_STATE_NEED_RETURN;
- binder_unlock(__func__);
+ thread->looper_need_return = false;
wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2);
if (ret && ret != -ERESTARTSYS)
pr_info("%d:%d ioctl %x %lx returned %d\n", proc->pid, current->pid, cmd, arg, ret);
@@ -2855,9 +4749,7 @@ static void binder_vma_close(struct vm_area_struct *vma)
proc->pid, vma->vm_start, vma->vm_end,
(vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags,
(unsigned long)pgprot_val(vma->vm_page_prot));
- proc->vma = NULL;
- proc->vma_vm_mm = NULL;
- binder_defer_work(proc, BINDER_DEFERRED_PUT_FILES);
+ binder_alloc_vma_close(&proc->alloc);
}
static int binder_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
@@ -2874,10 +4766,8 @@ static const struct vm_operations_struct binder_vm_ops = {
static int binder_mmap(struct file *filp, struct vm_area_struct *vma)
{
int ret;
- struct vm_struct *area;
struct binder_proc *proc = filp->private_data;
const char *failure_string;
- struct binder_buffer *buffer;
if (proc->tsk != current->group_leader)
return -EINVAL;
@@ -2886,8 +4776,8 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma)
vma->vm_end = vma->vm_start + SZ_4M;
binder_debug(BINDER_DEBUG_OPEN_CLOSE,
- "binder_mmap: %d %lx-%lx (%ld K) vma %lx pagep %lx\n",
- proc->pid, vma->vm_start, vma->vm_end,
+ "%s: %d %lx-%lx (%ld K) vma %lx pagep %lx\n",
+ __func__, proc->pid, vma->vm_start, vma->vm_end,
(vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags,
(unsigned long)pgprot_val(vma->vm_page_prot));
@@ -2897,73 +4787,13 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma)
goto err_bad_arg;
}
vma->vm_flags = (vma->vm_flags | VM_DONTCOPY) & ~VM_MAYWRITE;
-
- mutex_lock(&binder_mmap_lock);
- if (proc->buffer) {
- ret = -EBUSY;
- failure_string = "already mapped";
- goto err_already_mapped;
- }
-
- area = get_vm_area(vma->vm_end - vma->vm_start, VM_IOREMAP);
- if (area == NULL) {
- ret = -ENOMEM;
- failure_string = "get_vm_area";
- goto err_get_vm_area_failed;
- }
- proc->buffer = area->addr;
- proc->user_buffer_offset = vma->vm_start - (uintptr_t)proc->buffer;
- mutex_unlock(&binder_mmap_lock);
-
-#ifdef CONFIG_CPU_CACHE_VIPT
- if (cache_is_vipt_aliasing()) {
- while (CACHE_COLOUR((vma->vm_start ^ (uint32_t)proc->buffer))) {
- pr_info("binder_mmap: %d %lx-%lx maps %p bad alignment\n", proc->pid, vma->vm_start, vma->vm_end, proc->buffer);
- vma->vm_start += PAGE_SIZE;
- }
- }
-#endif
- proc->pages = kzalloc(sizeof(proc->pages[0]) * ((vma->vm_end - vma->vm_start) / PAGE_SIZE), GFP_KERNEL);
- if (proc->pages == NULL) {
- ret = -ENOMEM;
- failure_string = "alloc page array";
- goto err_alloc_pages_failed;
- }
- proc->buffer_size = vma->vm_end - vma->vm_start;
-
vma->vm_ops = &binder_vm_ops;
vma->vm_private_data = proc;
- if (binder_update_page_range(proc, 1, proc->buffer, proc->buffer + PAGE_SIZE, vma)) {
- ret = -ENOMEM;
- failure_string = "alloc small buf";
- goto err_alloc_small_buf_failed;
- }
- buffer = proc->buffer;
- INIT_LIST_HEAD(&proc->buffers);
- list_add(&buffer->entry, &proc->buffers);
- buffer->free = 1;
- binder_insert_free_buffer(proc, buffer);
- proc->free_async_space = proc->buffer_size / 2;
- barrier();
- proc->files = get_files_struct(current);
- proc->vma = vma;
- proc->vma_vm_mm = vma->vm_mm;
-
- /*pr_info("binder_mmap: %d %lx-%lx maps %p\n",
- proc->pid, vma->vm_start, vma->vm_end, proc->buffer);*/
- return 0;
+ ret = binder_alloc_mmap_handler(&proc->alloc, vma);
+
+ return ret;
-err_alloc_small_buf_failed:
- kfree(proc->pages);
- proc->pages = NULL;
-err_alloc_pages_failed:
- mutex_lock(&binder_mmap_lock);
- vfree(proc->buffer);
- proc->buffer = NULL;
-err_get_vm_area_failed:
-err_already_mapped:
- mutex_unlock(&binder_mmap_lock);
err_bad_arg:
pr_err("binder_mmap: %d %lx-%lx %s failed %d\n",
proc->pid, vma->vm_start, vma->vm_end, failure_string, ret);
@@ -2973,6 +4803,7 @@ err_bad_arg:
static int binder_open(struct inode *nodp, struct file *filp)
{
struct binder_proc *proc;
+ struct binder_device *binder_dev;
binder_debug(BINDER_DEBUG_OPEN_CLOSE, "binder_open: %d:%d\n",
current->group_leader->pid, current->pid);
@@ -2980,28 +4811,49 @@ static int binder_open(struct inode *nodp, struct file *filp)
proc = kzalloc(sizeof(*proc), GFP_KERNEL);
if (proc == NULL)
return -ENOMEM;
+ spin_lock_init(&proc->inner_lock);
+ spin_lock_init(&proc->outer_lock);
get_task_struct(current->group_leader);
proc->tsk = current->group_leader;
INIT_LIST_HEAD(&proc->todo);
- init_waitqueue_head(&proc->wait);
- proc->default_priority = task_nice(current);
+ if (binder_supported_policy(current->policy)) {
+ proc->default_priority.sched_policy = current->policy;
+ proc->default_priority.prio = current->normal_prio;
+ } else {
+ proc->default_priority.sched_policy = SCHED_NORMAL;
+ proc->default_priority.prio = NICE_TO_PRIO(0);
+ }
- binder_lock(__func__);
+ binder_dev = container_of(filp->private_data, struct binder_device,
+ miscdev);
+ proc->context = &binder_dev->context;
+ binder_alloc_init(&proc->alloc);
binder_stats_created(BINDER_STAT_PROC);
- hlist_add_head(&proc->proc_node, &binder_procs);
proc->pid = current->group_leader->pid;
INIT_LIST_HEAD(&proc->delivered_death);
+ INIT_LIST_HEAD(&proc->waiting_threads);
filp->private_data = proc;
- binder_unlock(__func__);
+ mutex_lock(&binder_procs_lock);
+ hlist_add_head(&proc->proc_node, &binder_procs);
+ mutex_unlock(&binder_procs_lock);
if (binder_debugfs_dir_entry_proc) {
char strbuf[11];
snprintf(strbuf, sizeof(strbuf), "%u", proc->pid);
+ /*
+ * proc debug entries are shared between contexts, so
+ * this will fail if the process tries to open the driver
+ * again with a different context. The printing code will
+ * anyway print all contexts that a given PID has, so this
+ * is not a problem.
+ */
proc->debugfs_entry = debugfs_create_file(strbuf, S_IRUGO,
- binder_debugfs_dir_entry_proc, proc, &binder_proc_fops);
+ binder_debugfs_dir_entry_proc,
+ (void *)(unsigned long)proc->pid,
+ &binder_proc_fops);
}
return 0;
@@ -3021,16 +4873,17 @@ static void binder_deferred_flush(struct binder_proc *proc)
struct rb_node *n;
int wake_count = 0;
+ binder_inner_proc_lock(proc);
for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) {
struct binder_thread *thread = rb_entry(n, struct binder_thread, rb_node);
- thread->looper |= BINDER_LOOPER_STATE_NEED_RETURN;
+ thread->looper_need_return = true;
if (thread->looper & BINDER_LOOPER_STATE_WAITING) {
wake_up_interruptible(&thread->wait);
wake_count++;
}
}
- wake_up_interruptible_all(&proc->wait);
+ binder_inner_proc_unlock(proc);
binder_debug(BINDER_DEBUG_OPEN_CLOSE,
"binder_flush: %d woke %d threads\n", proc->pid,
@@ -3051,13 +4904,21 @@ static int binder_node_release(struct binder_node *node, int refs)
{
struct binder_ref *ref;
int death = 0;
-
- list_del_init(&node->work.entry);
- binder_release_work(&node->async_todo);
-
- if (hlist_empty(&node->refs)) {
- kfree(node);
- binder_stats_deleted(BINDER_STAT_NODE);
+ struct binder_proc *proc = node->proc;
+
+ binder_release_work(proc, &node->async_todo);
+
+ binder_node_lock(node);
+ binder_inner_proc_lock(proc);
+ binder_dequeue_work_ilocked(&node->work);
+ /*
+ * The caller must have taken a temporary ref on the node.
+ */
+ BUG_ON(!node->tmp_refs);
+ if (hlist_empty(&node->refs) && node->tmp_refs == 1) {
+ binder_inner_proc_unlock(proc);
+ binder_node_unlock(node);
+ binder_free_node(node);
return refs;
}
@@ -3065,59 +4926,82 @@ static int binder_node_release(struct binder_node *node, int refs)
node->proc = NULL;
node->local_strong_refs = 0;
node->local_weak_refs = 0;
+ binder_inner_proc_unlock(proc);
+
+ spin_lock(&binder_dead_nodes_lock);
hlist_add_head(&node->dead_node, &binder_dead_nodes);
+ spin_unlock(&binder_dead_nodes_lock);
hlist_for_each_entry(ref, &node->refs, node_entry) {
refs++;
-
- if (!ref->death)
+ /*
+ * Need the node lock to synchronize
+ * with new notification requests and the
+ * inner lock to synchronize with queued
+ * death notifications.
+ */
+ binder_inner_proc_lock(ref->proc);
+ if (!ref->death) {
+ binder_inner_proc_unlock(ref->proc);
continue;
+ }
death++;
- if (list_empty(&ref->death->work.entry)) {
- ref->death->work.type = BINDER_WORK_DEAD_BINDER;
- list_add_tail(&ref->death->work.entry,
- &ref->proc->todo);
- wake_up_interruptible(&ref->proc->wait);
- } else
- BUG();
+ BUG_ON(!list_empty(&ref->death->work.entry));
+ ref->death->work.type = BINDER_WORK_DEAD_BINDER;
+ binder_enqueue_work_ilocked(&ref->death->work,
+ &ref->proc->todo);
+ binder_wakeup_proc_ilocked(ref->proc);
+ binder_inner_proc_unlock(ref->proc);
}
binder_debug(BINDER_DEBUG_DEAD_BINDER,
"node %d now dead, refs %d, death %d\n",
node->debug_id, refs, death);
+ binder_node_unlock(node);
+ binder_put_node(node);
return refs;
}
static void binder_deferred_release(struct binder_proc *proc)
{
- struct binder_transaction *t;
+ struct binder_context *context = proc->context;
struct rb_node *n;
- int threads, nodes, incoming_refs, outgoing_refs, buffers,
- active_transactions, page_count;
-
- BUG_ON(proc->vma);
- BUG_ON(proc->files);
+ int threads, nodes, incoming_refs, outgoing_refs, active_transactions;
+ mutex_lock(&binder_procs_lock);
hlist_del(&proc->proc_node);
+ mutex_unlock(&binder_procs_lock);
- if (binder_context_mgr_node && binder_context_mgr_node->proc == proc) {
+ mutex_lock(&context->context_mgr_node_lock);
+ if (context->binder_context_mgr_node &&
+ context->binder_context_mgr_node->proc == proc) {
binder_debug(BINDER_DEBUG_DEAD_BINDER,
"%s: %d context_mgr_node gone\n",
__func__, proc->pid);
- binder_context_mgr_node = NULL;
+ context->binder_context_mgr_node = NULL;
}
-
+ mutex_unlock(&context->context_mgr_node_lock);
+ binder_inner_proc_lock(proc);
+ /*
+ * Make sure proc stays alive after we
+ * remove all the threads
+ */
+ proc->tmp_ref++;
+
+ proc->is_dead = true;
threads = 0;
active_transactions = 0;
while ((n = rb_first(&proc->threads))) {
struct binder_thread *thread;
thread = rb_entry(n, struct binder_thread, rb_node);
+ binder_inner_proc_unlock(proc);
threads++;
- active_transactions += binder_free_thread(proc, thread);
+ active_transactions += binder_thread_release(proc, thread);
+ binder_inner_proc_lock(proc);
}
nodes = 0;
@@ -3127,84 +5011,50 @@ static void binder_deferred_release(struct binder_proc *proc)
node = rb_entry(n, struct binder_node, rb_node);
nodes++;
+ /*
+ * take a temporary ref on the node before
+ * calling binder_node_release() which will either
+ * kfree() the node or call binder_put_node()
+ */
+ binder_inc_node_tmpref_ilocked(node);
rb_erase(&node->rb_node, &proc->nodes);
+ binder_inner_proc_unlock(proc);
incoming_refs = binder_node_release(node, incoming_refs);
+ binder_inner_proc_lock(proc);
}
+ binder_inner_proc_unlock(proc);
outgoing_refs = 0;
+ binder_proc_lock(proc);
while ((n = rb_first(&proc->refs_by_desc))) {
struct binder_ref *ref;
ref = rb_entry(n, struct binder_ref, rb_node_desc);
outgoing_refs++;
- binder_delete_ref(ref);
- }
-
- binder_release_work(&proc->todo);
- binder_release_work(&proc->delivered_death);
-
- buffers = 0;
- while ((n = rb_first(&proc->allocated_buffers))) {
- struct binder_buffer *buffer;
-
- buffer = rb_entry(n, struct binder_buffer, rb_node);
-
- t = buffer->transaction;
- if (t) {
- t->buffer = NULL;
- buffer->transaction = NULL;
- pr_err("release proc %d, transaction %d, not freed\n",
- proc->pid, t->debug_id);
- /*BUG();*/
- }
-
- binder_free_buf(proc, buffer);
- buffers++;
- }
-
- binder_stats_deleted(BINDER_STAT_PROC);
-
- page_count = 0;
- if (proc->pages) {
- int i;
-
- for (i = 0; i < proc->buffer_size / PAGE_SIZE; i++) {
- void *page_addr;
-
- if (!proc->pages[i])
- continue;
-
- page_addr = proc->buffer + i * PAGE_SIZE;
- binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
- "%s: %d: page %d at %p not freed\n",
- __func__, proc->pid, i, page_addr);
- unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE);
- __free_page(proc->pages[i]);
- page_count++;
- }
- kfree(proc->pages);
- vfree(proc->buffer);
+ binder_cleanup_ref_olocked(ref);
+ binder_proc_unlock(proc);
+ binder_free_ref(ref);
+ binder_proc_lock(proc);
}
+ binder_proc_unlock(proc);
- put_task_struct(proc->tsk);
+ binder_release_work(proc, &proc->todo);
+ binder_release_work(proc, &proc->delivered_death);
binder_debug(BINDER_DEBUG_OPEN_CLOSE,
- "%s: %d threads %d, nodes %d (ref %d), refs %d, active transactions %d, buffers %d, pages %d\n",
+ "%s: %d threads %d, nodes %d (ref %d), refs %d, active transactions %d\n",
__func__, proc->pid, threads, nodes, incoming_refs,
- outgoing_refs, active_transactions, buffers, page_count);
+ outgoing_refs, active_transactions);
- kfree(proc);
+ binder_proc_dec_tmpref(proc);
}
static void binder_deferred_func(struct work_struct *work)
{
struct binder_proc *proc;
- struct files_struct *files;
-
int defer;
do {
- binder_lock(__func__);
mutex_lock(&binder_deferred_lock);
if (!hlist_empty(&binder_deferred_list)) {
proc = hlist_entry(binder_deferred_list.first,
@@ -3218,22 +5068,11 @@ static void binder_deferred_func(struct work_struct *work)
}
mutex_unlock(&binder_deferred_lock);
- files = NULL;
- if (defer & BINDER_DEFERRED_PUT_FILES) {
- files = proc->files;
- if (files)
- proc->files = NULL;
- }
-
if (defer & BINDER_DEFERRED_FLUSH)
binder_deferred_flush(proc);
if (defer & BINDER_DEFERRED_RELEASE)
binder_deferred_release(proc); /* frees proc */
-
- binder_unlock(__func__);
- if (files)
- put_files_struct(files);
} while (proc);
}
static DECLARE_WORK(binder_deferred_work, binder_deferred_func);
@@ -3251,41 +5090,52 @@ binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer)
mutex_unlock(&binder_deferred_lock);
}
-static void print_binder_transaction(struct seq_file *m, const char *prefix,
- struct binder_transaction *t)
+static void print_binder_transaction_ilocked(struct seq_file *m,
+ struct binder_proc *proc,
+ const char *prefix,
+ struct binder_transaction *t)
{
+ struct binder_proc *to_proc;
+ struct binder_buffer *buffer = t->buffer;
+
+ spin_lock(&t->lock);
+ to_proc = t->to_proc;
seq_printf(m,
- "%s %d: %p from %d:%d to %d:%d code %x flags %x pri %ld r%d",
+ "%s %d: %p from %d:%d to %d:%d code %x flags %x pri %d:%d r%d",
prefix, t->debug_id, t,
t->from ? t->from->proc->pid : 0,
t->from ? t->from->pid : 0,
- t->to_proc ? t->to_proc->pid : 0,
+ to_proc ? to_proc->pid : 0,
t->to_thread ? t->to_thread->pid : 0,
- t->code, t->flags, t->priority, t->need_reply);
- if (t->buffer == NULL) {
+ t->code, t->flags, t->priority.sched_policy,
+ t->priority.prio, t->need_reply);
+ spin_unlock(&t->lock);
+
+ if (proc != to_proc) {
+ /*
+ * Can only safely deref buffer if we are holding the
+ * correct proc inner lock for this node
+ */
+ seq_puts(m, "\n");
+ return;
+ }
+
+ if (buffer == NULL) {
seq_puts(m, " buffer free\n");
return;
}
- if (t->buffer->target_node)
- seq_printf(m, " node %d",
- t->buffer->target_node->debug_id);
+ if (buffer->target_node)
+ seq_printf(m, " node %d", buffer->target_node->debug_id);
seq_printf(m, " size %zd:%zd data %p\n",
- t->buffer->data_size, t->buffer->offsets_size,
- t->buffer->data);
-}
-
-static void print_binder_buffer(struct seq_file *m, const char *prefix,
- struct binder_buffer *buffer)
-{
- seq_printf(m, "%s %d: %p size %zd:%zd %s\n",
- prefix, buffer->debug_id, buffer->data,
buffer->data_size, buffer->offsets_size,
- buffer->transaction ? "active" : "delivered");
+ buffer->data);
}
-static void print_binder_work(struct seq_file *m, const char *prefix,
- const char *transaction_prefix,
- struct binder_work *w)
+static void print_binder_work_ilocked(struct seq_file *m,
+ struct binder_proc *proc,
+ const char *prefix,
+ const char *transaction_prefix,
+ struct binder_work *w)
{
struct binder_node *node;
struct binder_transaction *t;
@@ -3293,8 +5143,16 @@ static void print_binder_work(struct seq_file *m, const char *prefix,
switch (w->type) {
case BINDER_WORK_TRANSACTION:
t = container_of(w, struct binder_transaction, work);
- print_binder_transaction(m, transaction_prefix, t);
+ print_binder_transaction_ilocked(
+ m, proc, transaction_prefix, t);
break;
+ case BINDER_WORK_RETURN_ERROR: {
+ struct binder_error *e = container_of(
+ w, struct binder_error, work);
+
+ seq_printf(m, "%stransaction error: %u\n",
+ prefix, e->cmd);
+ } break;
case BINDER_WORK_TRANSACTION_COMPLETE:
seq_printf(m, "%stransaction complete\n", prefix);
break;
@@ -3319,40 +5177,46 @@ static void print_binder_work(struct seq_file *m, const char *prefix,
}
}
-static void print_binder_thread(struct seq_file *m,
- struct binder_thread *thread,
- int print_always)
+static void print_binder_thread_ilocked(struct seq_file *m,
+ struct binder_thread *thread,
+ int print_always)
{
struct binder_transaction *t;
struct binder_work *w;
size_t start_pos = m->count;
size_t header_pos;
- seq_printf(m, " thread %d: l %02x\n", thread->pid, thread->looper);
+ seq_printf(m, " thread %d: l %02x need_return %d tr %d\n",
+ thread->pid, thread->looper,
+ thread->looper_need_return,
+ atomic_read(&thread->tmp_ref));
header_pos = m->count;
t = thread->transaction_stack;
while (t) {
if (t->from == thread) {
- print_binder_transaction(m,
- " outgoing transaction", t);
+ print_binder_transaction_ilocked(m, thread->proc,
+ " outgoing transaction", t);
t = t->from_parent;
} else if (t->to_thread == thread) {
- print_binder_transaction(m,
+ print_binder_transaction_ilocked(m, thread->proc,
" incoming transaction", t);
t = t->to_parent;
} else {
- print_binder_transaction(m, " bad transaction", t);
+ print_binder_transaction_ilocked(m, thread->proc,
+ " bad transaction", t);
t = NULL;
}
}
list_for_each_entry(w, &thread->todo, entry) {
- print_binder_work(m, " ", " pending transaction", w);
+ print_binder_work_ilocked(m, thread->proc, " ",
+ " pending transaction", w);
}
if (!print_always && m->count == header_pos)
m->count = start_pos;
}
-static void print_binder_node(struct seq_file *m, struct binder_node *node)
+static void print_binder_node_nilocked(struct seq_file *m,
+ struct binder_node *node)
{
struct binder_ref *ref;
struct binder_work *w;
@@ -3362,27 +5226,35 @@ static void print_binder_node(struct seq_file *m, struct binder_node *node)
hlist_for_each_entry(ref, &node->refs, node_entry)
count++;
- seq_printf(m, " node %d: u%016llx c%016llx hs %d hw %d ls %d lw %d is %d iw %d",
+ seq_printf(m, " node %d: u%016llx c%016llx pri %d:%d hs %d hw %d ls %d lw %d is %d iw %d tr %d",
node->debug_id, (u64)node->ptr, (u64)node->cookie,
+ node->sched_policy, node->min_priority,
node->has_strong_ref, node->has_weak_ref,
node->local_strong_refs, node->local_weak_refs,
- node->internal_strong_refs, count);
+ node->internal_strong_refs, count, node->tmp_refs);
if (count) {
seq_puts(m, " proc");
hlist_for_each_entry(ref, &node->refs, node_entry)
seq_printf(m, " %d", ref->proc->pid);
}
seq_puts(m, "\n");
- list_for_each_entry(w, &node->async_todo, entry)
- print_binder_work(m, " ",
- " pending async transaction", w);
+ if (node->proc) {
+ list_for_each_entry(w, &node->async_todo, entry)
+ print_binder_work_ilocked(m, node->proc, " ",
+ " pending async transaction", w);
+ }
}
-static void print_binder_ref(struct seq_file *m, struct binder_ref *ref)
+static void print_binder_ref_olocked(struct seq_file *m,
+ struct binder_ref *ref)
{
- seq_printf(m, " ref %d: desc %d %snode %d s %d w %d d %p\n",
- ref->debug_id, ref->desc, ref->node->proc ? "" : "dead ",
- ref->node->debug_id, ref->strong, ref->weak, ref->death);
+ binder_node_lock(ref->node);
+ seq_printf(m, " ref %d: desc %d %snode %d s %d w %d d %pK\n",
+ ref->data.debug_id, ref->data.desc,
+ ref->node->proc ? "" : "dead ",
+ ref->node->debug_id, ref->data.strong,
+ ref->data.weak, ref->death);
+ binder_node_unlock(ref->node);
}
static void print_binder_proc(struct seq_file *m,
@@ -3392,35 +5264,60 @@ static void print_binder_proc(struct seq_file *m,
struct rb_node *n;
size_t start_pos = m->count;
size_t header_pos;
+ struct binder_node *last_node = NULL;
seq_printf(m, "proc %d\n", proc->pid);
+ seq_printf(m, "context %s\n", proc->context->name);
header_pos = m->count;
+ binder_inner_proc_lock(proc);
for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n))
- print_binder_thread(m, rb_entry(n, struct binder_thread,
+ print_binder_thread_ilocked(m, rb_entry(n, struct binder_thread,
rb_node), print_all);
+
for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) {
struct binder_node *node = rb_entry(n, struct binder_node,
rb_node);
- if (print_all || node->has_async_transaction)
- print_binder_node(m, node);
+ /*
+ * take a temporary reference on the node so it
+ * survives and isn't removed from the tree
+ * while we print it.
+ */
+ binder_inc_node_tmpref_ilocked(node);
+ /* Need to drop inner lock to take node lock */
+ binder_inner_proc_unlock(proc);
+ if (last_node)
+ binder_put_node(last_node);
+ binder_node_inner_lock(node);
+ print_binder_node_nilocked(m, node);
+ binder_node_inner_unlock(node);
+ last_node = node;
+ binder_inner_proc_lock(proc);
}
+ binder_inner_proc_unlock(proc);
+ if (last_node)
+ binder_put_node(last_node);
+
if (print_all) {
+ binder_proc_lock(proc);
for (n = rb_first(&proc->refs_by_desc);
n != NULL;
n = rb_next(n))
- print_binder_ref(m, rb_entry(n, struct binder_ref,
- rb_node_desc));
+ print_binder_ref_olocked(m, rb_entry(n,
+ struct binder_ref,
+ rb_node_desc));
+ binder_proc_unlock(proc);
}
- for (n = rb_first(&proc->allocated_buffers); n != NULL; n = rb_next(n))
- print_binder_buffer(m, " buffer",
- rb_entry(n, struct binder_buffer, rb_node));
+ binder_alloc_print_allocated(m, &proc->alloc);
+ binder_inner_proc_lock(proc);
list_for_each_entry(w, &proc->todo, entry)
- print_binder_work(m, " ", " pending transaction", w);
+ print_binder_work_ilocked(m, proc, " ",
+ " pending transaction", w);
list_for_each_entry(w, &proc->delivered_death, entry) {
seq_puts(m, " has delivered dead binder\n");
break;
}
+ binder_inner_proc_unlock(proc);
if (!print_all && m->count == header_pos)
m->count = start_pos;
}
@@ -3463,7 +5360,9 @@ static const char * const binder_command_strings[] = {
"BC_EXIT_LOOPER",
"BC_REQUEST_DEATH_NOTIFICATION",
"BC_CLEAR_DEATH_NOTIFICATION",
- "BC_DEAD_BINDER_DONE"
+ "BC_DEAD_BINDER_DONE",
+ "BC_TRANSACTION_SG",
+ "BC_REPLY_SG",
};
static const char * const binder_objstat_strings[] = {
@@ -3484,17 +5383,21 @@ static void print_binder_stats(struct seq_file *m, const char *prefix,
BUILD_BUG_ON(ARRAY_SIZE(stats->bc) !=
ARRAY_SIZE(binder_command_strings));
for (i = 0; i < ARRAY_SIZE(stats->bc); i++) {
- if (stats->bc[i])
+ int temp = atomic_read(&stats->bc[i]);
+
+ if (temp)
seq_printf(m, "%s%s: %d\n", prefix,
- binder_command_strings[i], stats->bc[i]);
+ binder_command_strings[i], temp);
}
BUILD_BUG_ON(ARRAY_SIZE(stats->br) !=
ARRAY_SIZE(binder_return_strings));
for (i = 0; i < ARRAY_SIZE(stats->br); i++) {
- if (stats->br[i])
+ int temp = atomic_read(&stats->br[i]);
+
+ if (temp)
seq_printf(m, "%s%s: %d\n", prefix,
- binder_return_strings[i], stats->br[i]);
+ binder_return_strings[i], temp);
}
BUILD_BUG_ON(ARRAY_SIZE(stats->obj_created) !=
@@ -3502,11 +5405,15 @@ static void print_binder_stats(struct seq_file *m, const char *prefix,
BUILD_BUG_ON(ARRAY_SIZE(stats->obj_created) !=
ARRAY_SIZE(stats->obj_deleted));
for (i = 0; i < ARRAY_SIZE(stats->obj_created); i++) {
- if (stats->obj_created[i] || stats->obj_deleted[i])
- seq_printf(m, "%s%s: active %d total %d\n", prefix,
+ int created = atomic_read(&stats->obj_created[i]);
+ int deleted = atomic_read(&stats->obj_deleted[i]);
+
+ if (created || deleted)
+ seq_printf(m, "%s%s: active %d total %d\n",
+ prefix,
binder_objstat_strings[i],
- stats->obj_created[i] - stats->obj_deleted[i],
- stats->obj_created[i]);
+ created - deleted,
+ created);
}
}
@@ -3514,50 +5421,59 @@ static void print_binder_proc_stats(struct seq_file *m,
struct binder_proc *proc)
{
struct binder_work *w;
+ struct binder_thread *thread;
struct rb_node *n;
- int count, strong, weak;
+ int count, strong, weak, ready_threads;
+ size_t free_async_space =
+ binder_alloc_get_free_async_space(&proc->alloc);
seq_printf(m, "proc %d\n", proc->pid);
+ seq_printf(m, "context %s\n", proc->context->name);
count = 0;
+ ready_threads = 0;
+ binder_inner_proc_lock(proc);
for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n))
count++;
+
+ list_for_each_entry(thread, &proc->waiting_threads, waiting_thread_node)
+ ready_threads++;
+
seq_printf(m, " threads: %d\n", count);
seq_printf(m, " requested threads: %d+%d/%d\n"
" ready threads %d\n"
" free async space %zd\n", proc->requested_threads,
proc->requested_threads_started, proc->max_threads,
- proc->ready_threads, proc->free_async_space);
+ ready_threads,
+ free_async_space);
count = 0;
for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n))
count++;
+ binder_inner_proc_unlock(proc);
seq_printf(m, " nodes: %d\n", count);
count = 0;
strong = 0;
weak = 0;
+ binder_proc_lock(proc);
for (n = rb_first(&proc->refs_by_desc); n != NULL; n = rb_next(n)) {
struct binder_ref *ref = rb_entry(n, struct binder_ref,
rb_node_desc);
count++;
- strong += ref->strong;
- weak += ref->weak;
+ strong += ref->data.strong;
+ weak += ref->data.weak;
}
+ binder_proc_unlock(proc);
seq_printf(m, " refs: %d s %d w %d\n", count, strong, weak);
- count = 0;
- for (n = rb_first(&proc->allocated_buffers); n != NULL; n = rb_next(n))
- count++;
+ count = binder_alloc_get_allocated_count(&proc->alloc);
seq_printf(m, " buffers: %d\n", count);
count = 0;
+ binder_inner_proc_lock(proc);
list_for_each_entry(w, &proc->todo, entry) {
- switch (w->type) {
- case BINDER_WORK_TRANSACTION:
+ if (w->type == BINDER_WORK_TRANSACTION)
count++;
- break;
- default:
- break;
- }
}
+ binder_inner_proc_unlock(proc);
seq_printf(m, " pending transactions: %d\n", count);
print_binder_stats(m, " ", &proc->stats);
@@ -3568,107 +5484,131 @@ static int binder_state_show(struct seq_file *m, void *unused)
{
struct binder_proc *proc;
struct binder_node *node;
- int do_lock = !binder_debug_no_lock;
-
- if (do_lock)
- binder_lock(__func__);
+ struct binder_node *last_node = NULL;
seq_puts(m, "binder state:\n");
+ spin_lock(&binder_dead_nodes_lock);
if (!hlist_empty(&binder_dead_nodes))
seq_puts(m, "dead nodes:\n");
- hlist_for_each_entry(node, &binder_dead_nodes, dead_node)
- print_binder_node(m, node);
+ hlist_for_each_entry(node, &binder_dead_nodes, dead_node) {
+ /*
+ * take a temporary reference on the node so it
+ * survives and isn't removed from the list
+ * while we print it.
+ */
+ node->tmp_refs++;
+ spin_unlock(&binder_dead_nodes_lock);
+ if (last_node)
+ binder_put_node(last_node);
+ binder_node_lock(node);
+ print_binder_node_nilocked(m, node);
+ binder_node_unlock(node);
+ last_node = node;
+ spin_lock(&binder_dead_nodes_lock);
+ }
+ spin_unlock(&binder_dead_nodes_lock);
+ if (last_node)
+ binder_put_node(last_node);
+ mutex_lock(&binder_procs_lock);
hlist_for_each_entry(proc, &binder_procs, proc_node)
print_binder_proc(m, proc, 1);
- if (do_lock)
- binder_unlock(__func__);
+ mutex_unlock(&binder_procs_lock);
+
return 0;
}
static int binder_stats_show(struct seq_file *m, void *unused)
{
struct binder_proc *proc;
- int do_lock = !binder_debug_no_lock;
-
- if (do_lock)
- binder_lock(__func__);
seq_puts(m, "binder stats:\n");
print_binder_stats(m, "", &binder_stats);
+ mutex_lock(&binder_procs_lock);
hlist_for_each_entry(proc, &binder_procs, proc_node)
print_binder_proc_stats(m, proc);
- if (do_lock)
- binder_unlock(__func__);
+ mutex_unlock(&binder_procs_lock);
+
return 0;
}
static int binder_transactions_show(struct seq_file *m, void *unused)
{
struct binder_proc *proc;
- int do_lock = !binder_debug_no_lock;
-
- if (do_lock)
- binder_lock(__func__);
seq_puts(m, "binder transactions:\n");
+ mutex_lock(&binder_procs_lock);
hlist_for_each_entry(proc, &binder_procs, proc_node)
print_binder_proc(m, proc, 0);
- if (do_lock)
- binder_unlock(__func__);
+ mutex_unlock(&binder_procs_lock);
+
return 0;
}
static int binder_proc_show(struct seq_file *m, void *unused)
{
struct binder_proc *itr;
- struct binder_proc *proc = m->private;
- int do_lock = !binder_debug_no_lock;
- bool valid_proc = false;
-
- if (do_lock)
- binder_lock(__func__);
+ int pid = (unsigned long)m->private;
+ mutex_lock(&binder_procs_lock);
hlist_for_each_entry(itr, &binder_procs, proc_node) {
- if (itr == proc) {
- valid_proc = true;
- break;
+ if (itr->pid == pid) {
+ seq_puts(m, "binder proc state:\n");
+ print_binder_proc(m, itr, 1);
}
}
- if (valid_proc) {
- seq_puts(m, "binder proc state:\n");
- print_binder_proc(m, proc, 1);
- }
- if (do_lock)
- binder_unlock(__func__);
+ mutex_unlock(&binder_procs_lock);
+
return 0;
}
static void print_binder_transaction_log_entry(struct seq_file *m,
struct binder_transaction_log_entry *e)
{
+ int debug_id = READ_ONCE(e->debug_id_done);
+ /*
+ * read barrier to guarantee debug_id_done read before
+ * we print the log values
+ */
+ smp_rmb();
seq_printf(m,
- "%d: %s from %d:%d to %d:%d node %d handle %d size %d:%d\n",
+ "%d: %s from %d:%d to %d:%d context %s node %d handle %d size %d:%d ret %d/%d l=%d",
e->debug_id, (e->call_type == 2) ? "reply" :
((e->call_type == 1) ? "async" : "call "), e->from_proc,
- e->from_thread, e->to_proc, e->to_thread, e->to_node,
- e->target_handle, e->data_size, e->offsets_size);
+ e->from_thread, e->to_proc, e->to_thread, e->context_name,
+ e->to_node, e->target_handle, e->data_size, e->offsets_size,
+ e->return_error, e->return_error_param,
+ e->return_error_line);
+ /*
+ * read-barrier to guarantee read of debug_id_done after
+ * done printing the fields of the entry
+ */
+ smp_rmb();
+ seq_printf(m, debug_id && debug_id == READ_ONCE(e->debug_id_done) ?
+ "\n" : " (incomplete)\n");
}
static int binder_transaction_log_show(struct seq_file *m, void *unused)
{
struct binder_transaction_log *log = m->private;
+ unsigned int log_cur = atomic_read(&log->cur);
+ unsigned int count;
+ unsigned int cur;
int i;
- if (log->full) {
- for (i = log->next; i < ARRAY_SIZE(log->entry); i++)
- print_binder_transaction_log_entry(m, &log->entry[i]);
+ count = log_cur + 1;
+ cur = count < ARRAY_SIZE(log->entry) && !log->full ?
+ 0 : count % ARRAY_SIZE(log->entry);
+ if (count > ARRAY_SIZE(log->entry) || log->full)
+ count = ARRAY_SIZE(log->entry);
+ for (i = 0; i < count; i++) {
+ unsigned int index = cur++ % ARRAY_SIZE(log->entry);
+
+ print_binder_transaction_log_entry(m, &log->entry[index]);
}
- for (i = 0; i < log->next; i++)
- print_binder_transaction_log_entry(m, &log->entry[i]);
return 0;
}
@@ -3683,26 +5623,54 @@ static const struct file_operations binder_fops = {
.release = binder_release,
};
-static struct miscdevice binder_miscdev = {
- .minor = MISC_DYNAMIC_MINOR,
- .name = "binder",
- .fops = &binder_fops
-};
-
BINDER_DEBUG_ENTRY(state);
BINDER_DEBUG_ENTRY(stats);
BINDER_DEBUG_ENTRY(transactions);
BINDER_DEBUG_ENTRY(transaction_log);
+/*
+ * Allocate one binder_device called @name, fill in its misc device and
+ * binder context, register the misc device and link the new device onto
+ * binder_devices.  @name is stored by reference, not copied.
+ *
+ * Returns the result of misc_register(), or -ENOMEM if the device
+ * structure cannot be allocated.
+ */
+static int __init init_binder_device(const char *name)
+{
+	struct binder_device *dev;
+	int err;
+
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+	if (dev == NULL)
+		return -ENOMEM;
+
+	dev->miscdev.fops = &binder_fops;
+	dev->miscdev.minor = MISC_DYNAMIC_MINOR;
+	dev->miscdev.name = name;
+
+	dev->context.binder_context_mgr_uid = INVALID_UID;
+	dev->context.name = name;
+	mutex_init(&dev->context.context_mgr_node_lock);
+
+	err = misc_register(&dev->miscdev);
+	if (err >= 0) {
+		/* Registered: make the device visible on the global list. */
+		hlist_add_head(&dev->hlist, &binder_devices);
+		return err;
+	}
+
+	kfree(dev);
+	return err;
+}
+
+/*
+ * Module init: set up the debugfs entries, then register one binder
+ * device per comma-separated name in binder_devices_param.
+ */
static int __init binder_init(void)
{
int ret;
+ char *device_name, *device_names, *device_tmp;
+ struct binder_device *device;
+ struct hlist_node *tmp;
+
+ atomic_set(&binder_transaction_log.cur, ~0U);
+ atomic_set(&binder_transaction_log_failed.cur, ~0U);
binder_debugfs_dir_entry_root = debugfs_create_dir("binder", NULL);
if (binder_debugfs_dir_entry_root)
binder_debugfs_dir_entry_proc = debugfs_create_dir("proc",
binder_debugfs_dir_entry_root);
- ret = misc_register(&binder_miscdev);
+
if (binder_debugfs_dir_entry_root) {
debugfs_create_file("state",
S_IRUGO,
@@ -3730,6 +5698,35 @@ static int __init binder_init(void)
&binder_transaction_log_failed,
&binder_transaction_log_fops);
}
+
+ /*
+ * Copy the module_parameter string, because we don't want to
+ * tokenize it in-place.
+ */
+ device_names = kzalloc(strlen(binder_devices_param) + 1, GFP_KERNEL);
+ if (!device_names) {
+ ret = -ENOMEM;
+ goto err_alloc_device_names_failed;
+ }
+ strcpy(device_names, binder_devices_param);
+
+ /*
+ * strsep() advances the pointer it is handed, so tokenize through
+ * device_tmp and keep device_names pointing at the allocation.
+ * On success the copy stays allocated: every registered device
+ * keeps a pointer into it as its name.
+ */
+ device_tmp = device_names;
+ while ((device_name = strsep(&device_tmp, ","))) {
+ ret = init_binder_device(device_name);
+ if (ret)
+ goto err_init_binder_device_failed;
+ }
+
+ return ret;
+
+err_init_binder_device_failed:
+ hlist_for_each_entry_safe(device, tmp, &binder_devices, hlist) {
+ misc_deregister(&device->miscdev);
+ hlist_del(&device->hlist);
+ kfree(device);
+ }
+ /* No device references the name copy any more; release it. */
+ kfree(device_names);
+err_alloc_device_names_failed:
+ debugfs_remove_recursive(binder_debugfs_dir_entry_root);
+
return ret;
}
diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c
new file mode 100644
index 000000000000..50c19a97ef6b
--- /dev/null
+++ b/drivers/android/binder_alloc.c
@@ -0,0 +1,854 @@
+/* binder_alloc.c
+ *
+ * Android IPC Subsystem
+ *
+ * Copyright (C) 2007-2017 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <asm/cacheflush.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/rtmutex.h>
+#include <linux/rbtree.h>
+#include <linux/seq_file.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include "binder_alloc.h"
+#include "binder_trace.h"
+
+#define BINDER_MIN_ALLOC (1 * PAGE_SIZE)
+
+static DEFINE_MUTEX(binder_alloc_mmap_lock);
+
+enum {
+ BINDER_DEBUG_OPEN_CLOSE = 1U << 1,
+ BINDER_DEBUG_BUFFER_ALLOC = 1U << 2,
+ BINDER_DEBUG_BUFFER_ALLOC_ASYNC = 1U << 3,
+};
+static uint32_t binder_alloc_debug_mask;
+
+module_param_named(debug_mask, binder_alloc_debug_mask,
+ uint, 0644);
+
+#define binder_alloc_debug(mask, x...) \
+ do { \
+ if (binder_alloc_debug_mask & mask) \
+ pr_info(x); \
+ } while (0)
+
+/* Return the buffer linked after @buffer through its entry list node. */
+static struct binder_buffer *binder_buffer_next(struct binder_buffer *buffer)
+{
+	struct list_head *following = buffer->entry.next;
+
+	return list_entry(following, struct binder_buffer, entry);
+}
+
+/* Return the buffer linked before @buffer through its entry list node. */
+static struct binder_buffer *binder_buffer_prev(struct binder_buffer *buffer)
+{
+	struct list_head *preceding = buffer->entry.prev;
+
+	return list_entry(preceding, struct binder_buffer, entry);
+}
+
+/*
+ * Size of @buffer in bytes: the distance from its data pointer to the
+ * data pointer of the next buffer on alloc->buffers, or to the end of
+ * the mapped area when @buffer is the last entry.
+ */
+static size_t binder_alloc_buffer_size(struct binder_alloc *alloc,
+				       struct binder_buffer *buffer)
+{
+	u8 *limit;
+
+	if (list_is_last(&buffer->entry, &alloc->buffers))
+		limit = (u8 *)alloc->buffer + alloc->buffer_size;
+	else
+		limit = (u8 *)binder_buffer_next(buffer)->data;
+
+	return limit - (u8 *)buffer->data;
+}
+
+/*
+ * Link @new_buffer, which must be marked free, into alloc->free_buffers.
+ * The tree is ordered by buffer size; a buffer that is not smaller than
+ * the node it is compared against descends to the right.
+ */
+static void binder_insert_free_buffer(struct binder_alloc *alloc,
+				      struct binder_buffer *new_buffer)
+{
+	struct rb_node **link = &alloc->free_buffers.rb_node;
+	struct rb_node *parent = NULL;
+	size_t new_size;
+
+	BUG_ON(!new_buffer->free);
+
+	new_size = binder_alloc_buffer_size(alloc, new_buffer);
+
+	binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
+			   "%d: add free buffer, size %zd, at %pK\n",
+			   alloc->pid, new_size, new_buffer);
+
+	while (*link != NULL) {
+		struct binder_buffer *cur;
+
+		parent = *link;
+		cur = rb_entry(parent, struct binder_buffer, rb_node);
+		BUG_ON(!cur->free);
+
+		link = new_size < binder_alloc_buffer_size(alloc, cur) ?
+			&parent->rb_left : &parent->rb_right;
+	}
+	rb_link_node(&new_buffer->rb_node, parent, link);
+	rb_insert_color(&new_buffer->rb_node, &alloc->free_buffers);
+}
+
+/*
+ * Link @new_buffer, which must not be marked free, into
+ * alloc->allocated_buffers.  The tree is ordered by data address;
+ * finding a node with the same address is treated as a fatal bug.
+ */
+static void binder_insert_allocated_buffer_locked(
+		struct binder_alloc *alloc, struct binder_buffer *new_buffer)
+{
+	struct rb_node **link = &alloc->allocated_buffers.rb_node;
+	struct rb_node *parent = NULL;
+
+	BUG_ON(new_buffer->free);
+
+	while (*link != NULL) {
+		struct binder_buffer *cur;
+
+		parent = *link;
+		cur = rb_entry(parent, struct binder_buffer, rb_node);
+		BUG_ON(cur->free);
+
+		/* Two allocated buffers must never start at the same address. */
+		BUG_ON(new_buffer->data == cur->data);
+		link = new_buffer->data < cur->data ?
+			&parent->rb_left : &parent->rb_right;
+	}
+	rb_link_node(&new_buffer->rb_node, parent, link);
+	rb_insert_color(&new_buffer->rb_node, &alloc->allocated_buffers);
+}
+
+/*
+ * Translate @user_ptr to its kernel address and search
+ * alloc->allocated_buffers for a buffer whose data starts exactly there.
+ *
+ * Returns the buffer (after clearing its allow_user_free flag), NULL if
+ * no buffer matches, or ERR_PTR(-EPERM) if the matching buffer does not
+ * have allow_user_free set.
+ */
+static struct binder_buffer *binder_alloc_prepare_to_free_locked(
+				struct binder_alloc *alloc,
+				uintptr_t user_ptr)
+{
+	void *kern_ptr = (void *)(user_ptr - alloc->user_buffer_offset);
+	struct rb_node *node = alloc->allocated_buffers.rb_node;
+
+	while (node != NULL) {
+		struct binder_buffer *candidate;
+
+		candidate = rb_entry(node, struct binder_buffer, rb_node);
+		BUG_ON(candidate->free);
+
+		if (kern_ptr < candidate->data) {
+			node = node->rb_left;
+			continue;
+		}
+		if (kern_ptr > candidate->data) {
+			node = node->rb_right;
+			continue;
+		}
+
+		/*
+		 * Exact match.  Refuse the request while the kernel still
+		 * uses the buffer or when it has already been handed back;
+		 * otherwise clear the flag so a second attempt is rejected.
+		 */
+		if (!candidate->allow_user_free)
+			return ERR_PTR(-EPERM);
+		candidate->allow_user_free = 0;
+		return candidate;
+	}
+	return NULL;
+}
+
+/**
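+ * binder_alloc_prepare_to_free() - get buffer given user ptr
+ * @alloc: binder_alloc for this proc
+ * @user_ptr: User pointer to buffer data
+ *
+ * Validate userspace pointer to buffer data and return buffer corresponding to
+ * that user pointer. Search the rb tree for buffer that matches user data
+ * pointer. A buffer that is found is marked as no longer freeable by the
+ * user, so a second call for the same pointer fails.
+ *
+ * Return: Pointer to buffer, NULL if no allocated buffer matches, or
+ * ERR_PTR(-EPERM) if the buffer may not be freed by the user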
+ */
+struct binder_buffer *binder_alloc_prepare_to_free(struct binder_alloc *alloc,
+						   uintptr_t user_ptr)
+{
+	struct binder_buffer *found;
+
+	/* The lookup walks and updates allocator state; hold the mutex. */
+	mutex_lock(&alloc->mutex);
+	found = binder_alloc_prepare_to_free_locked(alloc, user_ptr);
+	mutex_unlock(&alloc->mutex);
+
+	return found;
+}
+
+/*
+ * Allocate and map (@allocate != 0) or unmap and free (@allocate == 0)
+ * the pages backing the kernel address range [@start, @end).
+ *
+ * If @vma is NULL, the vma is taken from alloc->vma while holding
+ * mmap_sem of the mm obtained from alloc->tsk for writing.
+ *
+ * Returns 0 when the range is empty or when every page was allocated and
+ * mapped.  Every other exit returns -ENOMEM if a vma was available and
+ * -ESRCH if not.  The free path shares that exit, so it also returns a
+ * negative value; the callers in this file that free pages ignore it.
+ */
+static int __binder_update_page_range(struct binder_alloc *alloc, int allocate,
+ void *start, void *end,
+ struct vm_area_struct *vma)
+{
+ void *page_addr;
+ unsigned long user_page_addr;
+ struct page **page;
+ struct mm_struct *mm;
+
+ binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
+ "%d: %s pages %pK-%pK\n", alloc->pid,
+ allocate ? "allocate" : "free", start, end);
+
+ if (end <= start)
+ return 0;
+
+ trace_binder_update_page_range(alloc, allocate, start, end);
+
+ /* A caller-supplied vma means no mm reference or mmap_sem is taken here */
+ if (vma)
+ mm = NULL;
+ else
+ mm = get_task_mm(alloc->tsk);
+
+ if (mm) {
+ down_write(&mm->mmap_sem);
+ vma = alloc->vma;
+ if (vma && mm != alloc->vma_vm_mm) {
+ pr_err("%d: vma mm and task mm mismatch\n",
+ alloc->pid);
+ vma = NULL;
+ }
+ }
+
+ if (allocate == 0)
+ goto free_range;
+
+ if (vma == NULL) {
+ pr_err("%d: binder_alloc_buf failed to map pages in userspace, no vma\n",
+ alloc->pid);
+ goto err_no_vma;
+ }
+
+ for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) {
+ int ret;
+
+ page = &alloc->pages[(page_addr - alloc->buffer) / PAGE_SIZE];
+
+ BUG_ON(*page);
+ *page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
+ if (*page == NULL) {
+ pr_err("%d: binder_alloc_buf failed for page at %pK\n",
+ alloc->pid, page_addr);
+ goto err_alloc_page_failed;
+ }
+ ret = map_kernel_range_noflush((unsigned long)page_addr,
+ PAGE_SIZE, PAGE_KERNEL, page);
+ flush_cache_vmap((unsigned long)page_addr,
+ (unsigned long)page_addr + PAGE_SIZE);
+ if (ret != 1) {
+ pr_err("%d: binder_alloc_buf failed to map page at %pK in kernel\n",
+ alloc->pid, page_addr);
+ goto err_map_kernel_failed;
+ }
+ user_page_addr =
+ (uintptr_t)page_addr + alloc->user_buffer_offset;
+ ret = vm_insert_page(vma, user_page_addr, page[0]);
+ if (ret) {
+ pr_err("%d: binder_alloc_buf failed to map page at %lx in userspace\n",
+ alloc->pid, user_page_addr);
+ goto err_vm_insert_page_failed;
+ }
+ /* vm_insert_page does not seem to increment the refcount */
+ }
+ if (mm) {
+ up_write(&mm->mmap_sem);
+ mmput(mm);
+ }
+ return 0;
+
+ /*
+ * The error labels below sit inside the free loop: a failed allocation
+ * jumps in at the step matching how far the current page got, undoes
+ * that page, and the loop then walks downwards to @start releasing
+ * the pages that were set up before it.
+ */
+free_range:
+ for (page_addr = end - PAGE_SIZE; page_addr >= start;
+ page_addr -= PAGE_SIZE) {
+ page = &alloc->pages[(page_addr - alloc->buffer) / PAGE_SIZE];
+ if (vma)
+ zap_page_range(vma, (uintptr_t)page_addr +
+ alloc->user_buffer_offset, PAGE_SIZE, NULL);
+err_vm_insert_page_failed:
+ unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE);
+err_map_kernel_failed:
+ __free_page(*page);
+ *page = NULL;
+err_alloc_page_failed:
+ ;
+ }
+err_no_vma:
+ if (mm) {
+ up_write(&mm->mmap_sem);
+ mmput(mm);
+ }
+ return vma ? -ENOMEM : -ESRCH;
+}
+
+/*
+ * Wrapper around __binder_update_page_range() for regular updates.  The
+ * first BINDER_MIN_ALLOC bytes of the area always stay mapped, so a
+ * range that starts inside them is trimmed to begin right after them.
+ */
+static int binder_update_page_range(struct binder_alloc *alloc, int allocate,
+				    void *start, void *end,
+				    struct vm_area_struct *vma)
+{
+	void *first = start - alloc->buffer < BINDER_MIN_ALLOC ?
+			alloc->buffer + BINDER_MIN_ALLOC : start;
+
+	return __binder_update_page_range(alloc, allocate, first, end, vma);
+}
+
+/*
+ * Carve a buffer for a transaction out of the free space of @alloc.
+ *
+ * The requested size is the sum of @data_size, @offsets_size and
+ * @extra_buffers_size, each rounded up to pointer size.  The smallest
+ * free buffer that is large enough is taken from alloc->free_buffers,
+ * its pages are mapped, and any space left over is split off into a new
+ * free buffer.
+ *
+ * Returns the buffer, or an ERR_PTR(): -ESRCH without a vma, -EINVAL if
+ * the sizes overflow, -ENOSPC if async space or address space is
+ * exhausted, the error from binder_update_page_range(), or -ENOMEM if
+ * the bookkeeping struct for the split-off remainder cannot be allocated.
+ */
+struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc,
+ size_t data_size,
+ size_t offsets_size,
+ size_t extra_buffers_size,
+ int is_async)
+{
+ struct rb_node *n = alloc->free_buffers.rb_node;
+ struct binder_buffer *buffer;
+ size_t buffer_size;
+ struct rb_node *best_fit = NULL;
+ void *has_page_addr;
+ void *end_page_addr;
+ size_t size, data_offsets_size;
+ int ret;
+
+ if (alloc->vma == NULL) {
+ pr_err("%d: binder_alloc_buf, no vma\n",
+ alloc->pid);
+ return ERR_PTR(-ESRCH);
+ }
+
+ data_offsets_size = ALIGN(data_size, sizeof(void *)) +
+ ALIGN(offsets_size, sizeof(void *));
+
+ /* A sum smaller than either operand means the addition wrapped */
+ if (data_offsets_size < data_size || data_offsets_size < offsets_size) {
+ binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
+ "%d: got transaction with invalid size %zd-%zd\n",
+ alloc->pid, data_size, offsets_size);
+ return ERR_PTR(-EINVAL);
+ }
+ size = data_offsets_size + ALIGN(extra_buffers_size, sizeof(void *));
+ if (size < data_offsets_size || size < extra_buffers_size) {
+ binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
+ "%d: got transaction with invalid extra_buffers_size %zd\n",
+ alloc->pid, extra_buffers_size);
+ return ERR_PTR(-EINVAL);
+ }
+ if (is_async &&
+ alloc->free_async_space < size + sizeof(struct binder_buffer)) {
+ binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
+ "%d: binder_alloc_buf size %zd failed, no async space left\n",
+ alloc->pid, size);
+ return ERR_PTR(-ENOSPC);
+ }
+
+ /* Pad 0-size buffers so they get assigned unique addresses */
+ size = max(size, sizeof(void *));
+
+ /* Best-fit search: remember the smallest free buffer that is big enough */
+ while (n) {
+ buffer = rb_entry(n, struct binder_buffer, rb_node);
+ BUG_ON(!buffer->free);
+ buffer_size = binder_alloc_buffer_size(alloc, buffer);
+
+ if (size < buffer_size) {
+ best_fit = n;
+ n = n->rb_left;
+ } else if (size > buffer_size)
+ n = n->rb_right;
+ else {
+ best_fit = n;
+ break;
+ }
+ }
+ if (best_fit == NULL) {
+ /* Nothing fits: gather usage statistics for the error message */
+ size_t allocated_buffers = 0;
+ size_t largest_alloc_size = 0;
+ size_t total_alloc_size = 0;
+ size_t free_buffers = 0;
+ size_t largest_free_size = 0;
+ size_t total_free_size = 0;
+
+ for (n = rb_first(&alloc->allocated_buffers); n != NULL;
+ n = rb_next(n)) {
+ buffer = rb_entry(n, struct binder_buffer, rb_node);
+ buffer_size = binder_alloc_buffer_size(alloc, buffer);
+ allocated_buffers++;
+ total_alloc_size += buffer_size;
+ if (buffer_size > largest_alloc_size)
+ largest_alloc_size = buffer_size;
+ }
+ for (n = rb_first(&alloc->free_buffers); n != NULL;
+ n = rb_next(n)) {
+ buffer = rb_entry(n, struct binder_buffer, rb_node);
+ buffer_size = binder_alloc_buffer_size(alloc, buffer);
+ free_buffers++;
+ total_free_size += buffer_size;
+ if (buffer_size > largest_free_size)
+ largest_free_size = buffer_size;
+ }
+ pr_err("%d: binder_alloc_buf size %zd failed, no address space\n",
+ alloc->pid, size);
+ pr_err("allocated: %zd (num: %zd largest: %zd), free: %zd (num: %zd largest: %zd)\n",
+ total_alloc_size, allocated_buffers, largest_alloc_size,
+ total_free_size, free_buffers, largest_free_size);
+ return ERR_PTR(-ENOSPC);
+ }
+ /*
+ * n is NULL when the search ended without an exact match; buffer and
+ * buffer_size then describe the last node visited, not best_fit.
+ */
+ if (n == NULL) {
+ buffer = rb_entry(best_fit, struct binder_buffer, rb_node);
+ buffer_size = binder_alloc_buffer_size(alloc, buffer);
+ }
+
+ binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
+ "%d: binder_alloc_buf size %zd got buffer %pK size %zd\n",
+ alloc->pid, size, buffer, buffer_size);
+
+ /*
+ * has_page_addr is the start of the page holding the end of the free
+ * buffer; the range handed to binder_update_page_range() is capped
+ * there.
+ */
+ has_page_addr =
+ (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK);
+ WARN_ON(n && buffer_size != size);
+ end_page_addr =
+ (void *)PAGE_ALIGN((uintptr_t)buffer->data + size);
+ if (end_page_addr > has_page_addr)
+ end_page_addr = has_page_addr;
+ ret = binder_update_page_range(alloc, 1,
+ (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr, NULL);
+ if (ret)
+ return ERR_PTR(ret);
+
+ /* Split the unused tail off into its own free buffer */
+ if (buffer_size != size) {
+ struct binder_buffer *new_buffer;
+
+ new_buffer = kzalloc(sizeof(*buffer), GFP_KERNEL);
+ if (!new_buffer) {
+ pr_err("%s: %d failed to alloc new buffer struct\n",
+ __func__, alloc->pid);
+ goto err_alloc_buf_struct_failed;
+ }
+ new_buffer->data = (u8 *)buffer->data + size;
+ list_add(&new_buffer->entry, &buffer->entry);
+ new_buffer->free = 1;
+ binder_insert_free_buffer(alloc, new_buffer);
+ }
+
+ /* Move the chosen buffer from the free tree to the allocated tree */
+ rb_erase(best_fit, &alloc->free_buffers);
+ buffer->free = 0;
+ buffer->allow_user_free = 0;
+ binder_insert_allocated_buffer_locked(alloc, buffer);
+ binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
+ "%d: binder_alloc_buf size %zd got %pK\n",
+ alloc->pid, size, buffer);
+ buffer->data_size = data_size;
+ buffer->offsets_size = offsets_size;
+ buffer->async_transaction = is_async;
+ buffer->extra_buffers_size = extra_buffers_size;
+ if (is_async) {
+ alloc->free_async_space -= size + sizeof(struct binder_buffer);
+ binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC,
+ "%d: binder_alloc_buf size %zd async free %zd\n",
+ alloc->pid, size, alloc->free_async_space);
+ }
+ return buffer;
+
+err_alloc_buf_struct_failed:
+ /* Undo the page mapping done above; the buffer itself is still free */
+ binder_update_page_range(alloc, 0,
+ (void *)PAGE_ALIGN((uintptr_t)buffer->data),
+ end_page_addr, NULL);
+ return ERR_PTR(-ENOMEM);
+}
+
+/**
+ * binder_alloc_new_buf() - Allocate a new binder buffer
+ * @alloc: binder_alloc for this proc
+ * @data_size: size of user data buffer
+ * @offsets_size: user specified buffer offset
+ * @extra_buffers_size: size of extra space for meta-data (eg, security context)
+ * @is_async: buffer for async transaction
+ *
+ * Allocate a new buffer given the requested sizes. Returns
+ * the kernel version of the buffer pointer. The size allocated
+ * is the sum of the three given sizes (each rounded up to
+ * pointer-sized boundary)
+ *
+ * Return: The allocated buffer or an ERR_PTR()-encoded errno on error
+ */
+struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc,
+					   size_t data_size,
+					   size_t offsets_size,
+					   size_t extra_buffers_size,
+					   int is_async)
+{
+	struct binder_buffer *result;
+
+	/* Serialize against other users of this allocator's state. */
+	mutex_lock(&alloc->mutex);
+	result = binder_alloc_new_buf_locked(alloc, data_size, offsets_size,
+					     extra_buffers_size, is_async);
+	mutex_unlock(&alloc->mutex);
+
+	return result;
+}
+
+/* Address of the start of the page that contains buffer->data. */
+static void *buffer_start_page(struct binder_buffer *buffer)
+{
+	uintptr_t data_addr = (uintptr_t)buffer->data;
+
+	return (void *)(data_addr & PAGE_MASK);
+}
+
+/* Address of the start of the page that contains the byte before buffer->data. */
+static void *prev_buffer_end_page(struct binder_buffer *buffer)
+{
+	uintptr_t last_byte = (uintptr_t)(buffer->data) - 1;
+
+	return (void *)(last_byte & PAGE_MASK);
+}
+
+/*
+ * Drop the bookkeeping struct of @buffer, whose predecessor on
+ * alloc->buffers must be free, by unlinking it from the list and
+ * freeing it.  @buffer must not be the first entry on the list.
+ *
+ * The page containing the start of @buffer is released only if it is
+ * not needed by a neighbour: it is kept when it matches the page
+ * computed by prev_buffer_end_page() for the previous buffer, when the
+ * next buffer starts in it, or when @buffer's data is page aligned.
+ */
+static void binder_delete_free_buffer(struct binder_alloc *alloc,
+ struct binder_buffer *buffer)
+{
+ struct binder_buffer *prev, *next = NULL;
+ bool to_free = true;
+ BUG_ON(alloc->buffers.next == &buffer->entry);
+ prev = binder_buffer_prev(buffer);
+ BUG_ON(!prev->free);
+ if (prev_buffer_end_page(prev) == buffer_start_page(buffer)) {
+ to_free = false;
+ binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
+ "%d: merge free, buffer %pK share page with %pK\n",
+ alloc->pid, buffer->data, prev->data);
+ }
+
+ if (!list_is_last(&buffer->entry, &alloc->buffers)) {
+ next = binder_buffer_next(buffer);
+ if (buffer_start_page(next) == buffer_start_page(buffer)) {
+ to_free = false;
+ binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
+ "%d: merge free, buffer %pK share page with %pK\n",
+ alloc->pid,
+ buffer->data,
+ next->data);
+ }
+ }
+
+ if (PAGE_ALIGNED(buffer->data)) {
+ binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
+ "%d: merge free, buffer start %pK is page aligned\n",
+ alloc->pid, buffer->data);
+ to_free = false;
+ }
+
+ if (to_free) {
+ binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
+ "%d: merge free, buffer %pK do not share page with %pK or %pK\n",
+ alloc->pid, buffer->data,
+ prev->data, next ? next->data : NULL);
+ /* Release exactly the one page that holds the start of the buffer */
+ binder_update_page_range(alloc, 0, buffer_start_page(buffer),
+ buffer_start_page(buffer) + PAGE_SIZE,
+ NULL);
+ }
+ list_del(&buffer->entry);
+ kfree(buffer);
+}
+
+/*
+ * Return allocated @buffer to the free pool of @alloc: give back its
+ * async space accounting, release the pages that lie entirely inside
+ * it, move it from the allocated tree to the free tree and merge it
+ * with free neighbours on alloc->buffers.
+ */
+static void binder_free_buf_locked(struct binder_alloc *alloc,
+ struct binder_buffer *buffer)
+{
+ size_t size, buffer_size;
+
+ buffer_size = binder_alloc_buffer_size(alloc, buffer);
+
+ size = ALIGN(buffer->data_size, sizeof(void *)) +
+ ALIGN(buffer->offsets_size, sizeof(void *)) +
+ ALIGN(buffer->extra_buffers_size, sizeof(void *));
+
+ binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
+ "%d: binder_free_buf %pK size %zd buffer_size %zd\n",
+ alloc->pid, buffer, size, buffer_size);
+
+ /* Sanity checks: allocated, within the area, no transaction attached */
+ BUG_ON(buffer->free);
+ BUG_ON(size > buffer_size);
+ BUG_ON(buffer->transaction != NULL);
+ BUG_ON(buffer->data < alloc->buffer);
+ BUG_ON(buffer->data > alloc->buffer + alloc->buffer_size);
+
+ if (buffer->async_transaction) {
+ alloc->free_async_space += size + sizeof(struct binder_buffer);
+
+ binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC,
+ "%d: binder_free_buf size %zd async free %zd\n",
+ alloc->pid, size, alloc->free_async_space);
+ }
+
+ /* Only whole pages between the rounded-up start and rounded-down end */
+ binder_update_page_range(alloc, 0,
+ (void *)PAGE_ALIGN((uintptr_t)buffer->data),
+ (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK),
+ NULL);
+
+ rb_erase(&buffer->rb_node, &alloc->allocated_buffers);
+ buffer->free = 1;
+ /* A free successor is absorbed: its struct goes away, this one grows */
+ if (!list_is_last(&buffer->entry, &alloc->buffers)) {
+ struct binder_buffer *next = binder_buffer_next(buffer);
+
+ if (next->free) {
+ rb_erase(&next->rb_node, &alloc->free_buffers);
+ binder_delete_free_buffer(alloc, next);
+ }
+ }
+ /*
+ * A free predecessor absorbs this buffer instead.  It is taken out of
+ * the size-ordered free tree because its size has changed, and is
+ * re-inserted below.
+ */
+ if (alloc->buffers.next != &buffer->entry) {
+ struct binder_buffer *prev = binder_buffer_prev(buffer);
+
+ if (prev->free) {
+ binder_delete_free_buffer(alloc, buffer);
+ rb_erase(&prev->rb_node, &alloc->free_buffers);
+ buffer = prev;
+ }
+ }
+ binder_insert_free_buffer(alloc, buffer);
+}
+
+/**
+ * binder_alloc_free_buf() - free a binder buffer
+ * @alloc: binder_alloc for this proc
+ * @buffer: kernel pointer to buffer
+ *
+ * Free the buffer allocated via binder_alloc_new_buf()
+ */
+void binder_alloc_free_buf(struct binder_alloc *alloc,
+			   struct binder_buffer *buffer)
+{
+	/* All tree and list updates happen under the allocator mutex. */
+	mutex_lock(&alloc->mutex);
+
+	binder_free_buf_locked(alloc, buffer);
+
+	mutex_unlock(&alloc->mutex);
+}
+
+/**
+ * binder_alloc_mmap_handler() - map virtual address space for proc
+ * @alloc: alloc structure for this proc
+ * @vma: vma passed to mmap()
+ *
+ * Called by binder_mmap() to initialize the space specified in
+ * vma for allocating binder buffers
+ *
+ * Return:
+ * 0 = success
+ * -EBUSY = address space already mapped
+ * -ENOMEM = failed to map memory to given address space
+ */
+int binder_alloc_mmap_handler(struct binder_alloc *alloc,
+ struct vm_area_struct *vma)
+{
+ int ret;
+ struct vm_struct *area;
+ const char *failure_string;
+ struct binder_buffer *buffer;
+
+ /* binder_alloc_mmap_lock guards the test-and-set of alloc->buffer */
+ mutex_lock(&binder_alloc_mmap_lock);
+ if (alloc->buffer) {
+ ret = -EBUSY;
+ failure_string = "already mapped";
+ goto err_already_mapped;
+ }
+
+ area = get_vm_area(vma->vm_end - vma->vm_start, VM_IOREMAP);
+ if (area == NULL) {
+ ret = -ENOMEM;
+ failure_string = "get_vm_area";
+ goto err_get_vm_area_failed;
+ }
+ alloc->buffer = area->addr;
+ alloc->user_buffer_offset =
+ vma->vm_start - (uintptr_t)alloc->buffer;
+ mutex_unlock(&binder_alloc_mmap_lock);
+
+#ifdef CONFIG_CPU_CACHE_VIPT
+ if (cache_is_vipt_aliasing()) {
+ while (CACHE_COLOUR(
+ (vma->vm_start ^ (uint32_t)alloc->buffer))) {
+ pr_info("%s: %d %lx-%lx maps %pK bad alignment\n",
+ __func__, alloc->pid, vma->vm_start,
+ vma->vm_end, alloc->buffer);
+ vma->vm_start += PAGE_SIZE;
+ }
+ }
+#endif
+ /* One page pointer per page of the mapping; kcalloc checks the product */
+ alloc->pages = kcalloc((vma->vm_end - vma->vm_start) / PAGE_SIZE,
+ sizeof(alloc->pages[0]),
+ GFP_KERNEL);
+ if (alloc->pages == NULL) {
+ ret = -ENOMEM;
+ failure_string = "alloc page array";
+ goto err_alloc_pages_failed;
+ }
+ alloc->buffer_size = vma->vm_end - vma->vm_start;
+
+ buffer = kzalloc(sizeof(*buffer), GFP_KERNEL);
+ if (!buffer) {
+ ret = -ENOMEM;
+ failure_string = "alloc buffer struct";
+ goto err_alloc_buf_struct_failed;
+ }
+
+ /* Map the first BINDER_MIN_ALLOC bytes up front using the caller's vma */
+ if (__binder_update_page_range(alloc, 1, alloc->buffer,
+ alloc->buffer + BINDER_MIN_ALLOC, vma)) {
+ ret = -ENOMEM;
+ failure_string = "alloc small buf";
+ goto err_alloc_small_buf_failed;
+ }
+ /* The whole area starts out as a single free buffer */
+ buffer->data = alloc->buffer;
+ list_add(&buffer->entry, &alloc->buffers);
+ buffer->free = 1;
+ binder_insert_free_buffer(alloc, buffer);
+ alloc->free_async_space = alloc->buffer_size / 2;
+ /* Compiler barrier: keep the vma stores after the setup above */
+ barrier();
+ alloc->vma = vma;
+ alloc->vma_vm_mm = vma->vm_mm;
+
+ return 0;
+
+err_alloc_small_buf_failed:
+ kfree(buffer);
+err_alloc_buf_struct_failed:
+ kfree(alloc->pages);
+ alloc->pages = NULL;
+err_alloc_pages_failed:
+ mutex_lock(&binder_alloc_mmap_lock);
+ vfree(alloc->buffer);
+ alloc->buffer = NULL;
+err_get_vm_area_failed:
+err_already_mapped:
+ mutex_unlock(&binder_alloc_mmap_lock);
+ pr_err("%s: %d %lx-%lx %s failed %d\n", __func__,
+ alloc->pid, vma->vm_start, vma->vm_end, failure_string, ret);
+ return ret;
+}
+
+
+/*
+ * Tear down the allocator state of @alloc.  Must only be called once
+ * alloc->vma has been cleared.  Frees every buffer still on the
+ * allocated tree, the remaining buffer bookkeeping structs, any pages
+ * that are still present, the page pointer array and the kernel
+ * address area.
+ */
+void binder_alloc_deferred_release(struct binder_alloc *alloc)
+{
+ struct rb_node *n;
+ int buffers, page_count;
+ struct binder_buffer *buffer;
+
+ BUG_ON(alloc->vma);
+
+ buffers = 0;
+ mutex_lock(&alloc->mutex);
+ /* Freeing removes the node from the tree, so keep taking the first one */
+ while ((n = rb_first(&alloc->allocated_buffers))) {
+ buffer = rb_entry(n, struct binder_buffer, rb_node);
+
+ /* Transaction should already have been freed */
+ BUG_ON(buffer->transaction);
+
+ binder_free_buf_locked(alloc, buffer);
+ buffers++;
+ }
+
+ /*
+ * Free whatever is left on the list.  More than one remaining entry,
+ * or an entry not marked free, triggers a warning.
+ */
+ while (!list_empty(&alloc->buffers)) {
+ buffer = list_first_entry(&alloc->buffers,
+ struct binder_buffer, entry);
+ WARN_ON(!buffer->free);
+
+ list_del(&buffer->entry);
+ WARN_ON_ONCE(!list_empty(&alloc->buffers));
+ kfree(buffer);
+ }
+
+ page_count = 0;
+ if (alloc->pages) {
+ int i;
+
+ /* Unmap and free every page that is still present */
+ for (i = 0; i < alloc->buffer_size / PAGE_SIZE; i++) {
+ void *page_addr;
+
+ if (!alloc->pages[i])
+ continue;
+
+ page_addr = alloc->buffer + i * PAGE_SIZE;
+ binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
+ "%s: %d: page %d at %pK not freed\n",
+ __func__, alloc->pid, i, page_addr);
+ unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE);
+ __free_page(alloc->pages[i]);
+ page_count++;
+ }
+ kfree(alloc->pages);
+ vfree(alloc->buffer);
+ }
+ mutex_unlock(&alloc->mutex);
+
+ binder_alloc_debug(BINDER_DEBUG_OPEN_CLOSE,
+ "%s: %d buffers %d, pages %d\n",
+ __func__, alloc->pid, buffers, page_count);
+}
+
+/*
+ * Emit one line describing @buffer to @m: @prefix, debug id, data
+ * pointer, the three sizes, and whether a transaction is still attached.
+ */
+static void print_binder_buffer(struct seq_file *m, const char *prefix,
+				struct binder_buffer *buffer)
+{
+	const char *state = buffer->transaction ? "active" : "delivered";
+
+	seq_printf(m, "%s %d: %pK size %zd:%zd:%zd %s\n",
+		   prefix, buffer->debug_id, buffer->data,
+		   buffer->data_size, buffer->offsets_size,
+		   buffer->extra_buffers_size, state);
+}
+
+/**
+ * binder_alloc_print_allocated() - print buffer info
+ * @m: seq_file for output via seq_printf()
+ * @alloc: binder_alloc for this proc
+ *
+ * Prints information about every buffer associated with
+ * the binder_alloc state to the given seq_file
+ */
+void binder_alloc_print_allocated(struct seq_file *m,
+				  struct binder_alloc *alloc)
+{
+	struct rb_node *node;
+
+	mutex_lock(&alloc->mutex);
+	/* Walk the allocated tree in order, one output line per buffer. */
+	node = rb_first(&alloc->allocated_buffers);
+	while (node != NULL) {
+		print_binder_buffer(m, " buffer",
+				    rb_entry(node, struct binder_buffer,
+					     rb_node));
+		node = rb_next(node);
+	}
+	mutex_unlock(&alloc->mutex);
+}
+
+/**
+ * binder_alloc_get_allocated_count() - return count of buffers
+ * @alloc: binder_alloc for this proc
+ *
+ * Return: count of allocated buffers
+ */
+int binder_alloc_get_allocated_count(struct binder_alloc *alloc)
+{
+	struct rb_node *node;
+	int total = 0;
+
+	mutex_lock(&alloc->mutex);
+	/* Count the nodes of the allocated tree under the allocator mutex. */
+	node = rb_first(&alloc->allocated_buffers);
+	while (node != NULL) {
+		total++;
+		node = rb_next(node);
+	}
+	mutex_unlock(&alloc->mutex);
+
+	return total;
+}
+
+
+/**
+ * binder_alloc_vma_close() - invalidate address space
+ * @alloc: binder_alloc for this proc
+ *
+ * Called from binder_vma_close() when releasing address space.
+ * Clears alloc->vma to prevent new incoming transactions from
+ * allocating more buffers.
+ */
+void binder_alloc_vma_close(struct binder_alloc *alloc)
+{
+	/* Clear the vma first, then the mm it belonged to. */
+	WRITE_ONCE(alloc->vma, NULL);
+
+	WRITE_ONCE(alloc->vma_vm_mm, NULL);
+}
+
+/**
+ * binder_alloc_init() - called by binder_open() for per-proc initialization
+ * @alloc: binder_alloc for this proc
+ *
+ * Called from binder_open() to initialize binder_alloc fields for
+ * new binder proc
+ */
+void binder_alloc_init(struct binder_alloc *alloc)
+{
+	struct task_struct *leader = current->group_leader;
+
+	/* The allocator is tied to the thread group leader of the caller. */
+	alloc->tsk = leader;
+	alloc->pid = leader->pid;
+
+	mutex_init(&alloc->mutex);
+	INIT_LIST_HEAD(&alloc->buffers);
+}
+
diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h
new file mode 100644
index 000000000000..395d9b56bbf1
--- /dev/null
+++ b/drivers/android/binder_alloc.h
@@ -0,0 +1,167 @@
+/*
+ * Copyright (C) 2017 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _LINUX_BINDER_ALLOC_H
+#define _LINUX_BINDER_ALLOC_H
+
+#include <linux/rbtree.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/rtmutex.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+
+struct binder_transaction;
+
+/**
+ * struct binder_buffer - buffer used for binder transactions
+ * @entry: entry alloc->buffers
+ * @rb_node: node for allocated_buffers/free_buffers rb trees
+ * @free: true if buffer is free
+ * @allow_user_free: 1-bit flag; presumably set once userspace is allowed
+ *                   to free the buffer (TODO confirm against binder.c)
+ * @async_transaction: 1-bit flag; presumably set when the buffer belongs
+ *                   to an async transaction (TODO confirm against binder.c)
+ * @debug_id: 29-bit identifier; presumably used in debug/trace output
+ *                   (TODO confirm)
+ * @transaction: pointer to a struct binder_transaction; presumably the
+ *                   transaction currently using this buffer (TODO confirm)
+ * @target_node: pointer to a struct binder_node; presumably the node the
+ *                   transaction is addressed to (TODO confirm)
+ * @data_size: presumably the data_size passed to binder_alloc_new_buf()
+ * @offsets_size: presumably the offsets_size passed to
+ *                   binder_alloc_new_buf()
+ * @extra_buffers_size: presumably the extra_buffers_size passed to
+ *                   binder_alloc_new_buf()
+ * @data: kernel address of the start of the buffer's data area inside the
+ *                   alloc->buffer mapping
+ *
+ * Bookkeeping structure for binder transaction buffers
+ */
+struct binder_buffer {
+ struct list_head entry; /* free and allocated entries by address */
+ struct rb_node rb_node; /* free entry by size or allocated entry */
+ /* by address */
+ unsigned free:1;
+ unsigned allow_user_free:1;
+ unsigned async_transaction:1;
+ unsigned debug_id:29;
+
+ struct binder_transaction *transaction;
+
+ struct binder_node *target_node;
+ size_t data_size;
+ size_t offsets_size;
+ size_t extra_buffers_size;
+ void *data;
+};
+
+/**
+ * struct binder_alloc - per-binder proc state for binder allocator
+ * @mutex: serializes access to the allocator state (held while
+ * walking allocated_buffers and while reading
+ * free_async_space)
+ * @vma: vm_area_struct passed to mmap_handler
+ * (invariant after mmap)
+ * @tsk: task_struct of the thread group leader of the task
+ * that called binder_alloc_init() for this proc
+ * (invariant after init)
+ * @vma_vm_mm: copy of vma->vm_mm (invariant after mmap)
+ * @buffer: base of per-proc address space mapped via mmap
+ * @user_buffer_offset: offset between user and kernel VAs for buffer
+ * @buffers: list of all buffers for this proc
+ * @free_buffers: rb tree of buffers available for allocation
+ * sorted by size
+ * @allocated_buffers: rb tree of allocated buffers sorted by address
+ * @free_async_space: VA space available for async buffers. This is
+ * initialized at mmap time to 1/2 the full VA space
+ * @pages: array of struct page pointers, one for each
+ * page of mmap'd space
+ * @buffer_size: size of address space specified via mmap
+ * @buffer_free: NOTE(review): not described by the original author
+ * and not used in the code visible here -- confirm
+ * whether it is still needed
+ * @pid: pid for associated binder_proc (invariant after init)
+ *
+ * Bookkeeping structure for per-proc address space management for binder
+ * buffers. It is normally initialized during the binder_alloc_init() and
+ * binder_alloc_mmap_handler() calls. The address space is used for both
+ * user-visible buffers and for struct binder_buffer objects used to track
+ * the user buffers
+ */
+struct binder_alloc {
+ struct mutex mutex;
+ struct task_struct *tsk;
+ struct vm_area_struct *vma;
+ struct mm_struct *vma_vm_mm;
+ void *buffer;
+ ptrdiff_t user_buffer_offset;
+ struct list_head buffers;
+ struct rb_root free_buffers;
+ struct rb_root allocated_buffers;
+ size_t free_async_space;
+ struct page **pages;
+ size_t buffer_size;
+ uint32_t buffer_free;
+ int pid;
+};
+
+#ifdef CONFIG_ANDROID_BINDER_IPC_SELFTEST
+void binder_selftest_alloc(struct binder_alloc *alloc);
+#else
+static inline void binder_selftest_alloc(struct binder_alloc *alloc) {}
+#endif
+extern struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc,
+ size_t data_size,
+ size_t offsets_size,
+ size_t extra_buffers_size,
+ int is_async);
+extern void binder_alloc_init(struct binder_alloc *alloc);
+extern void binder_alloc_vma_close(struct binder_alloc *alloc);
+extern struct binder_buffer *
+binder_alloc_prepare_to_free(struct binder_alloc *alloc,
+ uintptr_t user_ptr);
+extern void binder_alloc_free_buf(struct binder_alloc *alloc,
+ struct binder_buffer *buffer);
+extern int binder_alloc_mmap_handler(struct binder_alloc *alloc,
+ struct vm_area_struct *vma);
+extern void binder_alloc_deferred_release(struct binder_alloc *alloc);
+extern int binder_alloc_get_allocated_count(struct binder_alloc *alloc);
+extern void binder_alloc_print_allocated(struct seq_file *m,
+ struct binder_alloc *alloc);
+
+/**
+ * binder_alloc_get_free_async_space() - get free space available for async
+ * @alloc: binder_alloc for this proc
+ *
+ * Takes a snapshot of alloc->free_async_space under alloc->mutex.
+ *
+ * Return: the bytes remaining in the address-space for async transactions
+ */
+static inline size_t
+binder_alloc_get_free_async_space(struct binder_alloc *alloc)
+{
+	size_t remaining;
+
+	mutex_lock(&alloc->mutex);
+	remaining = alloc->free_async_space;
+	mutex_unlock(&alloc->mutex);
+
+	return remaining;
+}
+
+/**
+ * binder_alloc_get_user_buffer_offset() - get offset between kernel/user addrs
+ * @alloc: binder_alloc for this proc
+ *
+ * Return: the offset between kernel and user-space addresses to use for
+ * virtual address conversion
+ */
+static inline ptrdiff_t
+binder_alloc_get_user_buffer_offset(struct binder_alloc *alloc)
+{
+ /*
+ * user_buffer_offset is constant if vma is set and
+ * undefined if vma is not set. It is possible to
+ * get here with !alloc->vma if the target process
+ * is dying while a transaction is being initiated.
+ * Returning the old value is ok in this case and
+ * the transaction will fail.
+ */
+ return alloc->user_buffer_offset;
+}
+
+#endif /* _LINUX_BINDER_ALLOC_H */
+
diff --git a/drivers/android/binder_alloc_selftest.c b/drivers/android/binder_alloc_selftest.c
new file mode 100644
index 000000000000..0bf72079a9da
--- /dev/null
+++ b/drivers/android/binder_alloc_selftest.c
@@ -0,0 +1,270 @@
+/* binder_alloc_selftest.c
+ *
+ * Android IPC Subsystem
+ *
+ * Copyright (C) 2017 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/mm_types.h>
+#include <linux/err.h>
+#include "binder_alloc.h"
+
+#define BUFFER_NUM 5
+#define BUFFER_MIN_SIZE (PAGE_SIZE / 8)
+
+static bool binder_selftest_run = true;
+static int binder_selftest_failures;
+static DEFINE_MUTEX(binder_selftest_lock);
+
+/**
+ * enum buf_end_align_type - Page alignment of a buffer
+ * end with regard to the end of the previous buffer.
+ *
+ * In the pictures below, buf2 refers to the buffer we
+ * are aligning. buf1 refers to previous buffer by addr.
+ * Symbol [ means the start of a buffer, ] means the end
+ * of a buffer, and | means page boundaries.
+ */
+enum buf_end_align_type {
+ /**
+ * @SAME_PAGE_UNALIGNED: The end of this buffer is on
+ * the same page as the end of the previous buffer and
+ * is not page aligned. Examples:
+ * buf1 ][ buf2 ][ ...
+ * buf1 ]|[ buf2 ][ ...
+ */
+ SAME_PAGE_UNALIGNED = 0,
+ /**
+ * @SAME_PAGE_ALIGNED: When the end of the previous buffer
+ * is not page aligned, the end of this buffer is on the
+ * same page as the end of the previous buffer and is page
+ * aligned. When the previous buffer is page aligned, the
+ * end of this buffer is aligned to the next page boundary.
+ * Examples:
+ * buf1 ][ buf2 ]| ...
+ * buf1 ]|[ buf2 ]| ...
+ */
+ SAME_PAGE_ALIGNED,
+ /**
+ * @NEXT_PAGE_UNALIGNED: The end of this buffer is on
+ * the page next to the end of the previous buffer and
+ * is not page aligned. Examples:
+ * buf1 ][ buf2 | buf2 ][ ...
+ * buf1 ]|[ buf2 | buf2 ][ ...
+ */
+ NEXT_PAGE_UNALIGNED,
+ /**
+ * @NEXT_PAGE_ALIGNED: The end of this buffer is on
+ * the page next to the end of the previous buffer and
+ * is page aligned. Examples:
+ * buf1 ][ buf2 | buf2 ]| ...
+ * buf1 ]|[ buf2 | buf2 ]| ...
+ */
+ NEXT_PAGE_ALIGNED,
+ /**
+ * @NEXT_NEXT_UNALIGNED: The end of this buffer is on
+ * the page that follows the page after the end of the
+ * previous buffer and is not page aligned. Examples:
+ * buf1 ][ buf2 | buf2 | buf2 ][ ...
+ * buf1 ]|[ buf2 | buf2 | buf2 ][ ...
+ */
+ NEXT_NEXT_UNALIGNED,
+ LOOP_END,
+};
+
+/* Log the buffer sizes and the free order of the test case being reported. */
+static void pr_err_size_seq(size_t *sizes, int *seq)
+{
+	int idx;
+
+	pr_err("alloc sizes: ");
+	for (idx = 0; idx != BUFFER_NUM; idx++)
+		pr_cont("[%zu]", sizes[idx]);
+	pr_cont("\n");
+
+	pr_err("free seq: ");
+	idx = 0;
+	while (idx < BUFFER_NUM) {
+		pr_cont("[%d]", seq[idx]);
+		idx++;
+	}
+	pr_cont("\n");
+}
+
+/*
+ * Check that every page touched by [buffer->data, buffer->data + size) has
+ * an entry in alloc->pages. Logs the first missing page index and returns
+ * false; returns true when all pages are present.
+ */
+static bool check_buffer_pages_allocated(struct binder_alloc *alloc,
+					 struct binder_buffer *buffer,
+					 size_t size)
+{
+	void *limit = (void *)PAGE_ALIGN((uintptr_t)buffer->data + size);
+	void *cursor;
+
+	for (cursor = buffer->data; cursor < limit; cursor += PAGE_SIZE) {
+		int idx = (cursor - alloc->buffer) / PAGE_SIZE;
+
+		if (alloc->pages[idx])
+			continue;
+		pr_err("incorrect alloc state at page index %d\n", idx);
+		return false;
+	}
+	return true;
+}
+
+/*
+ * Allocate BUFFER_NUM buffers with the given sizes and store the results in
+ * @buffers. An error return or a buffer whose pages are not all present is
+ * logged (sizes and free order) and counted as a selftest failure.
+ */
+static void binder_selftest_alloc_buf(struct binder_alloc *alloc,
+				      struct binder_buffer *buffers[],
+				      size_t *sizes, int *seq)
+{
+	int i;
+
+	for (i = 0; i < BUFFER_NUM; i++) {
+		struct binder_buffer *buf;
+
+		buf = binder_alloc_new_buf(alloc, sizes[i], 0, 0, 0);
+		buffers[i] = buf;
+		if (!IS_ERR(buf) &&
+		    check_buffer_pages_allocated(alloc, buf, sizes[i]))
+			continue;
+		pr_err_size_seq(sizes, seq);
+		binder_selftest_failures++;
+	}
+}
+
+/*
+ * Free the BUFFER_NUM buffers in the order given by @seq, then verify the
+ * page array: page 0 is expected to still be present and every other page
+ * is expected to have been released. Each mismatch is logged and counted
+ * as a selftest failure.
+ */
+static void binder_selftest_free_buf(struct binder_alloc *alloc,
+				     struct binder_buffer *buffers[],
+				     size_t *sizes, int *seq)
+{
+	int i;
+
+	for (i = 0; i < BUFFER_NUM; i++) {
+		struct binder_buffer *buf = buffers[seq[i]];
+
+		/*
+		 * binder_selftest_alloc_buf() stores whatever
+		 * binder_alloc_new_buf() returned, including ERR_PTR()
+		 * values. Those were already counted as failures and must
+		 * not be handed to the free path.
+		 */
+		if (IS_ERR(buf))
+			continue;
+		binder_alloc_free_buf(alloc, buf);
+	}
+
+	for (i = 0; i < (alloc->buffer_size / PAGE_SIZE); i++) {
+		if ((!alloc->pages[i]) == (i == 0)) {
+			pr_err("incorrect free state at page index %d\n", i);
+			binder_selftest_failures++;
+		}
+	}
+}
+
+/*
+ * Run one test case: allocate BUFFER_NUM buffers with the given @sizes,
+ * then free them in the order given by @seq. Both helpers perform their
+ * own page-state checks and failure accounting.
+ */
+static void binder_selftest_alloc_free(struct binder_alloc *alloc,
+ size_t *sizes, int *seq)
+{
+ struct binder_buffer *buffers[BUFFER_NUM];
+
+ binder_selftest_alloc_buf(alloc, buffers, sizes, seq);
+ binder_selftest_free_buf(alloc, buffers, sizes, seq);
+}
+
+/* Return true if @val appears among the first @index entries of @seq. */
+static bool is_dup(int *seq, int index, int val)
+{
+	/* Scan direction does not matter; walk the prefix back to front. */
+	while (index-- > 0) {
+		if (seq[index] == val)
+			return true;
+	}
+	return false;
+}
+
+/*
+ * Recursively build every ordering of 0..BUFFER_NUM-1 in @seq (BUFFER_NUM
+ * factorial free orders) and run one alloc/free pass per complete ordering.
+ * @index is the position of @seq being filled in by this level.
+ */
+static void binder_selftest_free_seq(struct binder_alloc *alloc,
+				     size_t *sizes, int *seq, int index)
+{
+	int candidate;
+
+	if (index != BUFFER_NUM) {
+		for (candidate = 0; candidate < BUFFER_NUM; candidate++) {
+			if (!is_dup(seq, index, candidate)) {
+				seq[index] = candidate;
+				binder_selftest_free_seq(alloc, sizes, seq,
+							 index + 1);
+			}
+		}
+		return;
+	}
+
+	binder_selftest_alloc_free(alloc, sizes, seq);
+}
+
+/*
+ * Turn the cumulative @end_offset values into per-buffer sizes and run
+ * every free order for two layouts:
+ *  - front_sizes: buffer i spans from the previous end offset to
+ *    end_offset[i];
+ *  - back_sizes: the same sizes in reverse order, with the first buffer
+ *    enlarged by the part of the mapping beyond the last end offset.
+ */
+static void binder_selftest_alloc_size(struct binder_alloc *alloc,
+ size_t *end_offset)
+{
+ int i;
+ int seq[BUFFER_NUM] = {0};
+ size_t front_sizes[BUFFER_NUM];
+ size_t back_sizes[BUFFER_NUM];
+ size_t last_offset, offset = 0;
+
+ for (i = 0; i < BUFFER_NUM; i++) {
+ last_offset = offset;
+ offset = end_offset[i];
+ /* size of buffer i is the distance between consecutive end offsets */
+ front_sizes[i] = offset - last_offset;
+ back_sizes[BUFFER_NUM - i - 1] = front_sizes[i];
+ }
+ /*
+ * Buffers share the first or last few pages.
+ * Only BUFFER_NUM - 1 buffer sizes are adjustable since
+ * we need one giant buffer before getting to the last page.
+ */
+ back_sizes[0] += alloc->buffer_size - end_offset[BUFFER_NUM - 1];
+ binder_selftest_free_seq(alloc, front_sizes, seq, 0);
+ binder_selftest_free_seq(alloc, back_sizes, seq, 0);
+}
+
+/*
+ * Recursively choose an end offset for buffer @index for each
+ * enum buf_end_align_type value, then recurse for the next buffer. Once
+ * all BUFFER_NUM end offsets are chosen, run the size/free-order tests.
+ */
+static void binder_selftest_alloc_offset(struct binder_alloc *alloc,
+ size_t *end_offset, int index)
+{
+ int align;
+ size_t end, prev;
+
+ if (index == BUFFER_NUM) {
+ binder_selftest_alloc_size(alloc, end_offset);
+ return;
+ }
+ /* start from the end of the previous buffer (0 for the first one) */
+ prev = index == 0 ? 0 : end_offset[index - 1];
+ end = prev;
+
+ /* BUFFER_NUM minimum-size buffers must fit in less than one page */
+ BUILD_BUG_ON(BUFFER_MIN_SIZE * BUFFER_NUM >= PAGE_SIZE);
+
+ /*
+ * end is not reset between iterations: odd align values round the
+ * running end up to a page boundary, even values advance it by
+ * BUFFER_MIN_SIZE.
+ */
+ for (align = SAME_PAGE_UNALIGNED; align < LOOP_END; align++) {
+ if (align % 2)
+ end = ALIGN(end, PAGE_SIZE);
+ else
+ end += BUFFER_MIN_SIZE;
+ end_offset[index] = end;
+ binder_selftest_alloc_offset(alloc, end_offset, index + 1);
+ }
+}
+
+/**
+ * binder_selftest_alloc() - Test alloc and free of buffer pages.
+ * @alloc: Pointer to alloc struct.
+ *
+ * Allocate BUFFER_NUM buffers to cover all page alignment cases,
+ * then free them in all orders possible. Check that pages are
+ * allocated after buffer alloc and freed after freeing buffer.
+ *
+ * The test body runs at most once: binder_selftest_run is cleared after
+ * the first run, and nothing is done when @alloc has no vma.
+ */
+void binder_selftest_alloc(struct binder_alloc *alloc)
+{
+ size_t end_offset[BUFFER_NUM];
+
+ /* cheap check without the lock; repeated below with the lock held */
+ if (!binder_selftest_run)
+ return;
+ mutex_lock(&binder_selftest_lock);
+ if (!binder_selftest_run || !alloc->vma)
+ goto done;
+ pr_info("STARTED\n");
+ binder_selftest_alloc_offset(alloc, end_offset, 0);
+ binder_selftest_run = false;
+ if (binder_selftest_failures > 0)
+ pr_info("%d tests FAILED\n", binder_selftest_failures);
+ else
+ pr_info("PASSED\n");
+
+done:
+ mutex_unlock(&binder_selftest_lock);
+}
diff --git a/drivers/android/binder_trace.h b/drivers/android/binder_trace.h
index 7f20f3dc8369..7967db16ba5a 100644
--- a/drivers/android/binder_trace.h
+++ b/drivers/android/binder_trace.h
@@ -23,7 +23,8 @@
struct binder_buffer;
struct binder_node;
struct binder_proc;
-struct binder_ref;
+struct binder_alloc;
+struct binder_ref_data;
struct binder_thread;
struct binder_transaction;
@@ -146,8 +147,8 @@ TRACE_EVENT(binder_transaction_received,
TRACE_EVENT(binder_transaction_node_to_ref,
TP_PROTO(struct binder_transaction *t, struct binder_node *node,
- struct binder_ref *ref),
- TP_ARGS(t, node, ref),
+ struct binder_ref_data *rdata),
+ TP_ARGS(t, node, rdata),
TP_STRUCT__entry(
__field(int, debug_id)
@@ -160,8 +161,8 @@ TRACE_EVENT(binder_transaction_node_to_ref,
__entry->debug_id = t->debug_id;
__entry->node_debug_id = node->debug_id;
__entry->node_ptr = node->ptr;
- __entry->ref_debug_id = ref->debug_id;
- __entry->ref_desc = ref->desc;
+ __entry->ref_debug_id = rdata->debug_id;
+ __entry->ref_desc = rdata->desc;
),
TP_printk("transaction=%d node=%d src_ptr=0x%016llx ==> dest_ref=%d dest_desc=%d",
__entry->debug_id, __entry->node_debug_id,
@@ -170,8 +171,9 @@ TRACE_EVENT(binder_transaction_node_to_ref,
);
TRACE_EVENT(binder_transaction_ref_to_node,
- TP_PROTO(struct binder_transaction *t, struct binder_ref *ref),
- TP_ARGS(t, ref),
+ TP_PROTO(struct binder_transaction *t, struct binder_node *node,
+ struct binder_ref_data *rdata),
+ TP_ARGS(t, node, rdata),
TP_STRUCT__entry(
__field(int, debug_id)
@@ -182,10 +184,10 @@ TRACE_EVENT(binder_transaction_ref_to_node,
),
TP_fast_assign(
__entry->debug_id = t->debug_id;
- __entry->ref_debug_id = ref->debug_id;
- __entry->ref_desc = ref->desc;
- __entry->node_debug_id = ref->node->debug_id;
- __entry->node_ptr = ref->node->ptr;
+ __entry->ref_debug_id = rdata->debug_id;
+ __entry->ref_desc = rdata->desc;
+ __entry->node_debug_id = node->debug_id;
+ __entry->node_ptr = node->ptr;
),
TP_printk("transaction=%d node=%d src_ref=%d src_desc=%d ==> dest_ptr=0x%016llx",
__entry->debug_id, __entry->node_debug_id,
@@ -194,9 +196,10 @@ TRACE_EVENT(binder_transaction_ref_to_node,
);
TRACE_EVENT(binder_transaction_ref_to_ref,
- TP_PROTO(struct binder_transaction *t, struct binder_ref *src_ref,
- struct binder_ref *dest_ref),
- TP_ARGS(t, src_ref, dest_ref),
+ TP_PROTO(struct binder_transaction *t, struct binder_node *node,
+ struct binder_ref_data *src_ref,
+ struct binder_ref_data *dest_ref),
+ TP_ARGS(t, node, src_ref, dest_ref),
TP_STRUCT__entry(
__field(int, debug_id)
@@ -208,7 +211,7 @@ TRACE_EVENT(binder_transaction_ref_to_ref,
),
TP_fast_assign(
__entry->debug_id = t->debug_id;
- __entry->node_debug_id = src_ref->node->debug_id;
+ __entry->node_debug_id = node->debug_id;
__entry->src_ref_debug_id = src_ref->debug_id;
__entry->src_ref_desc = src_ref->desc;
__entry->dest_ref_debug_id = dest_ref->debug_id;
@@ -268,9 +271,9 @@ DEFINE_EVENT(binder_buffer_class, binder_transaction_failed_buffer_release,
TP_ARGS(buffer));
TRACE_EVENT(binder_update_page_range,
- TP_PROTO(struct binder_proc *proc, bool allocate,
+ TP_PROTO(struct binder_alloc *alloc, bool allocate,
void *start, void *end),
- TP_ARGS(proc, allocate, start, end),
+ TP_ARGS(alloc, allocate, start, end),
TP_STRUCT__entry(
__field(int, proc)
__field(bool, allocate)
@@ -278,9 +281,9 @@ TRACE_EVENT(binder_update_page_range,
__field(size_t, size)
),
TP_fast_assign(
- __entry->proc = proc->pid;
+ __entry->proc = alloc->pid;
__entry->allocate = allocate;
- __entry->offset = start - proc->buffer;
+ __entry->offset = start - alloc->buffer;
__entry->size = end - start;
),
TP_printk("proc=%d allocate=%d offset=%zu size=%zu",
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 98517216879d..574d08f1673e 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -33,6 +33,7 @@
#include <linux/cpufreq.h>
#include <linux/cpuidle.h>
#include <linux/timer.h>
+#include <linux/wakeup_reason.h>
#include "../base.h"
#include "power.h"
@@ -1353,6 +1354,7 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async)
pm_callback_t callback = NULL;
char *info = NULL;
int error = 0;
+ char suspend_abort[MAX_SUSPEND_ABORT_LEN];
DECLARE_DPM_WATCHDOG_ON_STACK(wd);
TRACE_DEVICE(dev);
@@ -1375,6 +1377,9 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async)
pm_wakeup_event(dev, 0);
if (pm_wakeup_pending()) {
+		pm_get_active_wakeup_sources(suspend_abort,
+			MAX_SUSPEND_ABORT_LEN);
+		/*
+		 * suspend_abort contains wakeup source names; pass it as
+		 * data so a '%' in a name is never parsed as a conversion
+		 * by the printf-style log_suspend_abort_reason().
+		 */
+		log_suspend_abort_reason("%s", suspend_abort);
dev->power.direct_complete = false;
async_error = -EBUSY;
goto Complete;
diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index f98121f11f7c..90c16d8952aa 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -15,6 +15,7 @@
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/pm_wakeirq.h>
+#include <linux/types.h>
#include <trace/events/power.h>
#include "power.h"
@@ -804,6 +805,37 @@ void pm_wakeup_event(struct device *dev, unsigned int msec)
}
EXPORT_SYMBOL_GPL(pm_wakeup_event);
+/**
+ * pm_get_active_wakeup_sources - describe the wakeup sources blocking suspend
+ * @pending_wakeup_source: output buffer for the description
+ * @max: size of @pending_wakeup_source in bytes
+ *
+ * Writes "Pending Wakeup Sources: " followed by the names of all active
+ * wakeup sources that fit. If no wakeup source is active, writes
+ * "Last active Wakeup Source: " and the name of the source with the most
+ * recent last_time. When @max is non-zero the buffer is always left
+ * NUL-terminated; it is an empty string if no wakeup source is registered.
+ */
+void pm_get_active_wakeup_sources(char *pending_wakeup_source, size_t max)
+{
+	struct wakeup_source *ws, *last_active_ws = NULL;
+	int srcuidx, len = 0;
+	bool active = false;
+
+	if (!pending_wakeup_source || !max)
+		return;
+
+	/*
+	 * Callers such as __device_suspend() pass an uninitialized stack
+	 * buffer and use it unconditionally afterwards, so make sure it is
+	 * a valid string even if nothing below writes to it.
+	 */
+	pending_wakeup_source[0] = '\0';
+
+	/* The wakeup_sources list is protected by SRCU in this file. */
+	srcuidx = srcu_read_lock(&wakeup_srcu);
+	list_for_each_entry_rcu(ws, &wakeup_sources, entry) {
+		if (ws->active && len < max) {
+			if (!active)
+				len += scnprintf(pending_wakeup_source, max,
+						"Pending Wakeup Sources: ");
+			len += scnprintf(pending_wakeup_source + len, max - len,
+				"%s ", ws->name);
+			active = true;
+		} else if (!active &&
+			   (!last_active_ws ||
+			    ktime_to_ns(ws->last_time) >
+			    ktime_to_ns(last_active_ws->last_time))) {
+			last_active_ws = ws;
+		}
+	}
+	if (!active && last_active_ws) {
+		scnprintf(pending_wakeup_source, max,
+				"Last active Wakeup Source: %s",
+				last_active_ws->name);
+	}
+	srcu_read_unlock(&wakeup_srcu, srcuidx);
+}
+EXPORT_SYMBOL_GPL(pm_get_active_wakeup_sources);
+
void pm_print_active_wakeup_sources(void)
{
struct wakeup_source *ws;
@@ -1011,7 +1043,7 @@ static int print_wakeup_source_stats(struct seq_file *m,
active_time = ktime_set(0, 0);
}
- seq_printf(m, "%-12s\t%lu\t\t%lu\t\t%lu\t\t%lu\t\t%lld\t\t%lld\t\t%lld\t\t%lld\t\t%lld\n",
+ seq_printf(m, "%-32s\t%lu\t\t%lu\t\t%lu\t\t%lu\t\t%lld\t\t%lld\t\t%lld\t\t%lld\t\t%lld\n",
ws->name, active_count, ws->event_count,
ws->wakeup_count, ws->expire_count,
ktime_to_ms(active_time), ktime_to_ms(total_time),
@@ -1032,7 +1064,7 @@ static int wakeup_sources_stats_show(struct seq_file *m, void *unused)
struct wakeup_source *ws;
int srcuidx;
- seq_puts(m, "name\t\tactive_count\tevent_count\twakeup_count\t"
+ seq_puts(m, "name\t\t\t\t\tactive_count\tevent_count\twakeup_count\t"
"expire_count\tactive_since\ttotal_time\tmax_time\t"
"last_change\tprevent_suspend_time\n");
diff --git a/drivers/base/syscore.c b/drivers/base/syscore.c
index 8d98a329f6ea..96c34a95cc62 100644
--- a/drivers/base/syscore.c
+++ b/drivers/base/syscore.c
@@ -11,6 +11,7 @@
#include <linux/module.h>
#include <linux/suspend.h>
#include <trace/events/power.h>
+#include <linux/wakeup_reason.h>
static LIST_HEAD(syscore_ops_list);
static DEFINE_MUTEX(syscore_ops_lock);
@@ -75,6 +76,8 @@ int syscore_suspend(void)
return 0;
err_out:
+ log_suspend_abort_reason("System core suspend callback %pF failed",
+ ops->suspend);
pr_err("PM: System core suspend callback %pF failed.\n", ops->suspend);
list_for_each_entry_continue(ops, &syscore_ops_list, node)
diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig
index e2c6e43cf8ca..0bed22ff57a8 100644
--- a/drivers/clocksource/Kconfig
+++ b/drivers/clocksource/Kconfig
@@ -305,6 +305,14 @@ config ARM_ARCH_TIMER_EVTSTREAM
This must be disabled for hardware validation purposes to detect any
hardware anomalies of missing events.
+config ARM_ARCH_TIMER_VCT_ACCESS
+ bool "Support for ARM architected timer virtual counter access in userspace"
+ default !ARM64
+ depends on ARM_ARCH_TIMER
+ help
+ This option enables support for reading the ARM architected timer's
+ virtual counter in userspace.
+
config FSL_ERRATUM_A008585
bool "Workaround for Freescale/NXP Erratum A-008585"
default y
diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c
index a2503db7e533..e3bc592b1055 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -449,7 +449,10 @@ static void arch_counter_set_user_access(void)
| ARCH_TIMER_USR_PCT_ACCESS_EN);
/* Enable user access to the virtual counter */
- cntkctl |= ARCH_TIMER_USR_VCT_ACCESS_EN;
+ if (IS_ENABLED(CONFIG_ARM_ARCH_TIMER_VCT_ACCESS))
+ cntkctl |= ARCH_TIMER_USR_VCT_ACCESS_EN;
+ else
+ cntkctl &= ~ARCH_TIMER_USR_VCT_ACCESS_EN;
arch_timer_set_cntkctl(cntkctl);
}
diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index cac26fb22891..bc6afa3f04d6 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -102,6 +102,24 @@ config CPU_FREQ_DEFAULT_GOV_CONSERVATIVE
governor. If unsure have a look at the help section of the
driver. Fallback governor will be the performance governor.
+config CPU_FREQ_DEFAULT_GOV_SCHED
+ bool "sched"
+ select CPU_FREQ_GOV_SCHED
+ help
+ Use the CPUfreq governor 'sched' as default. This scales
+ cpu frequency using CPU utilization estimates from the
+ scheduler.
+
+config CPU_FREQ_DEFAULT_GOV_INTERACTIVE
+ bool "interactive"
+ select CPU_FREQ_GOV_INTERACTIVE
+ select CPU_FREQ_GOV_PERFORMANCE
+ help
+ Use the CPUFreq governor 'interactive' as default. This allows
+ you to get a full dynamic cpu frequency capable system by simply
+ loading your cpufreq low-level hardware driver, using the
+ 'interactive' governor for latency-sensitive workloads.
+
config CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
bool "schedutil"
depends on SMP
@@ -193,6 +211,39 @@ config CPU_FREQ_GOV_CONSERVATIVE
If in doubt, say N.
+config CPU_FREQ_GOV_SCHED
+ bool "'sched' cpufreq governor"
+ depends on CPU_FREQ
+ depends on SMP
+ select CPU_FREQ_GOV_COMMON
+ help
+ 'sched' - this governor scales cpu frequency from the
+ scheduler as a function of cpu capacity utilization. It does
+ not evaluate utilization on a periodic basis (as ondemand
+ does) but instead is event-driven by the scheduler.
+
+ If in doubt, say N.
+
+config CPU_FREQ_GOV_INTERACTIVE
+ tristate "'interactive' cpufreq policy governor"
+ depends on CPU_FREQ
+ select CPU_FREQ_GOV_ATTR_SET
+ select IRQ_WORK
+ help
+ 'interactive' - This driver adds a dynamic cpufreq policy governor
+ designed for latency-sensitive workloads.
+
+ This governor attempts to reduce the latency of clock
+ increases so that the system is more responsive to
+ interactive workloads.
+
+ To compile this driver as a module, choose M here: the
+ module will be called cpufreq_interactive.
+
+ For details, take a look at linux/Documentation/cpu-freq.
+
+ If in doubt, say N.
+
config CPU_FREQ_GOV_SCHEDUTIL
bool "'schedutil' cpufreq policy governor"
depends on CPU_FREQ && SMP
diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
index 0a9b6a093646..f0c9905d68a5 100644
--- a/drivers/cpufreq/Makefile
+++ b/drivers/cpufreq/Makefile
@@ -10,6 +10,7 @@ obj-$(CONFIG_CPU_FREQ_GOV_POWERSAVE) += cpufreq_powersave.o
obj-$(CONFIG_CPU_FREQ_GOV_USERSPACE) += cpufreq_userspace.o
obj-$(CONFIG_CPU_FREQ_GOV_ONDEMAND) += cpufreq_ondemand.o
obj-$(CONFIG_CPU_FREQ_GOV_CONSERVATIVE) += cpufreq_conservative.o
+obj-$(CONFIG_CPU_FREQ_GOV_INTERACTIVE) += cpufreq_interactive.o
obj-$(CONFIG_CPU_FREQ_GOV_COMMON) += cpufreq_governor.o
obj-$(CONFIG_CPU_FREQ_GOV_ATTR_SET) += cpufreq_governor_attr_set.o
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index d6d91e8afa9e..2beaa17df130 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -29,6 +29,9 @@
#include <linux/suspend.h>
#include <linux/syscore_ops.h>
#include <linux/tick.h>
+#ifdef CONFIG_SMP
+#include <linux/sched.h>
+#endif
#include <trace/events/power.h>
static LIST_HEAD(cpufreq_policy_list);
@@ -117,6 +120,12 @@ bool have_governor_per_policy(void)
}
EXPORT_SYMBOL_GPL(have_governor_per_policy);
+/*
+ * Return true unless the registered cpufreq driver sets the
+ * CPUFREQ_DRIVER_FAST flag.
+ *
+ * NOTE(review): cpufreq_driver is dereferenced without a NULL check --
+ * confirm callers only run while a driver is registered.
+ */
+bool cpufreq_driver_is_slow(void)
+{
+ return !(cpufreq_driver->flags & CPUFREQ_DRIVER_FAST);
+}
+
struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy)
{
if (have_governor_per_policy())
@@ -301,6 +310,50 @@ static void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci)
#endif
}
+/*********************************************************************
+ * FREQUENCY INVARIANT CPU CAPACITY *
+ *********************************************************************/
+
+static DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE;
+static DEFINE_PER_CPU(unsigned long, max_freq_scale) = SCHED_CAPACITY_SCALE;
+
+/*
+ * Update the per-CPU frequency scale factors for every CPU in policy->cpus.
+ *
+ * freq_scale is set to (cur << SCHED_CAPACITY_SHIFT) / policy->max, where
+ * cur is freqs->new when @freqs is given and policy->cur otherwise.
+ * When @freqs is NULL, max_freq_scale is additionally set to
+ * (policy->max << SCHED_CAPACITY_SHIFT) / cpuinfo->max_freq.
+ */
+static void
+scale_freq_capacity(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs)
+{
+	unsigned long cur = freqs ? freqs->new : policy->cur;
+	unsigned long scale = (cur << SCHED_CAPACITY_SHIFT) / policy->max;
+	struct cpufreq_cpuinfo *cpuinfo = &policy->cpuinfo;
+	int cpu;
+
+	pr_debug("cpus %*pbl cur/cur max freq %lu/%u kHz freq scale %lu\n",
+		 cpumask_pr_args(policy->cpus), cur, policy->max, scale);
+
+	for_each_cpu(cpu, policy->cpus)
+		per_cpu(freq_scale, cpu) = scale;
+
+	if (freqs)
+		return;
+
+	/*
+	 * policy->max is an unsigned int; widen it before shifting so the
+	 * intermediate value is computed in unsigned long like the
+	 * freq_scale computation above instead of wrapping at 32 bits.
+	 */
+	scale = ((unsigned long)policy->max << SCHED_CAPACITY_SHIFT) /
+		cpuinfo->max_freq;
+
+	pr_debug("cpus %*pbl cur max/max freq %u/%u kHz max freq scale %lu\n",
+		 cpumask_pr_args(policy->cpus), policy->max, cpuinfo->max_freq,
+		 scale);
+
+	for_each_cpu(cpu, policy->cpus)
+		per_cpu(max_freq_scale, cpu) = scale;
+}
+
+/*
+ * Return the freq_scale value last stored for @cpu by
+ * scale_freq_capacity(). @sd is not used.
+ */
+unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu)
+{
+ return per_cpu(freq_scale, cpu);
+}
+
+/*
+ * Return the max_freq_scale value last stored for @cpu by
+ * scale_freq_capacity().
+ */
+unsigned long cpufreq_scale_max_freq_capacity(int cpu)
+{
+ return per_cpu(max_freq_scale, cpu);
+}
+
static void __cpufreq_notify_transition(struct cpufreq_policy *policy,
struct cpufreq_freqs *freqs, unsigned int state)
{
@@ -378,6 +431,9 @@ static void cpufreq_notify_post_transition(struct cpufreq_policy *policy,
void cpufreq_freq_transition_begin(struct cpufreq_policy *policy,
struct cpufreq_freqs *freqs)
{
+#ifdef CONFIG_SMP
+ int cpu;
+#endif
/*
* Catch double invocations of _begin() which lead to self-deadlock.
@@ -405,6 +461,12 @@ wait:
spin_unlock(&policy->transition_lock);
+ scale_freq_capacity(policy, freqs);
+#ifdef CONFIG_SMP
+ for_each_cpu(cpu, policy->cpus)
+ trace_cpu_capacity(capacity_curr_of(cpu), cpu);
+#endif
+
cpufreq_notify_transition(policy, freqs, CPUFREQ_PRECHANGE);
}
EXPORT_SYMBOL_GPL(cpufreq_freq_transition_begin);
@@ -2201,8 +2263,11 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
CPUFREQ_NOTIFY, new_policy);
+ scale_freq_capacity(new_policy, NULL);
+
policy->min = new_policy->min;
policy->max = new_policy->max;
+ trace_cpu_frequency_limits(policy->max, policy->min, policy->cpu);
policy->cached_target_freq = UINT_MAX;
diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index 00a74351f623..0fe251865ac6 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -302,7 +302,10 @@ static void cs_start(struct cpufreq_policy *policy)
dbs_info->requested_freq = policy->cur;
}
-static struct dbs_governor cs_governor = {
+#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE
+static
+#endif
+struct dbs_governor cs_governor = {
.gov = CPUFREQ_DBS_GOVERNOR_INITIALIZER("conservative"),
.kobj_type = { .default_attrs = cs_attributes },
.gov_dbs_timer = cs_dbs_timer,
diff --git a/drivers/cpufreq/cpufreq_interactive.c b/drivers/cpufreq/cpufreq_interactive.c
new file mode 100644
index 000000000000..5a77d9129611
--- /dev/null
+++ b/drivers/cpufreq/cpufreq_interactive.c
@@ -0,0 +1,1411 @@
+/*
+ * drivers/cpufreq/cpufreq_interactive.c
+ *
+ * Copyright (C) 2010-2016 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Author: Mike Chan (mike@android.com)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/cpufreq.h>
+#include <linux/irq_work.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/rwsem.h>
+#include <linux/sched.h>
+#include <linux/sched/rt.h>
+#include <linux/tick.h>
+#include <linux/time.h>
+#include <linux/timer.h>
+#include <linux/kthread.h>
+#include <linux/slab.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/cpufreq_interactive.h>
+
+/*
+ * Helpers that define a file-scope 'struct governor_attr' called _name
+ * through __ATTR(), wired to show_##_name and/or store_##_name:
+ *   gov_attr_ro - mode 0444, show handler only
+ *   gov_attr_wo - mode 0200, store handler only
+ *   gov_attr_rw - mode 0644, both handlers
+ * The referenced handlers must already be declared where the macro is used.
+ */
+#define gov_attr_ro(_name) \
+static struct governor_attr _name = \
+__ATTR(_name, 0444, show_##_name, NULL)
+
+#define gov_attr_wo(_name) \
+static struct governor_attr _name = \
+__ATTR(_name, 0200, NULL, store_##_name)
+
+#define gov_attr_rw(_name) \
+static struct governor_attr _name = \
+__ATTR(_name, 0644, show_##_name, store_##_name)
+
+/*
+ * Governor tunables exposed through sysfs. A separate instance is required
+ * for each 'interactive' directory in sysfs; one instance may be shared by
+ * several policies through attr_set.
+ */
+struct interactive_tunables {
+ struct gov_attr_set attr_set;
+
+ /* Hi speed to bump to from lo speed when load burst (default max) */
+ unsigned int hispeed_freq;
+
+ /* Go to hi speed when CPU load at or above this value. */
+#define DEFAULT_GO_HISPEED_LOAD 99
+ unsigned long go_hispeed_load;
+
+ /*
+ * Target load. Lower values result in higher CPU speeds.
+ * The array alternates load, frequency, load, frequency, ..., load:
+ * the load at an even index applies below the frequency that follows it.
+ * A load of zero must never be stored, it is used as a divisor.
+ */
+ spinlock_t target_loads_lock;
+ unsigned int *target_loads;
+ int ntarget_loads;
+
+ /*
+ * The minimum amount of time to spend at a frequency before we can ramp
+ * down.
+ */
+#define DEFAULT_MIN_SAMPLE_TIME (80 * USEC_PER_MSEC)
+ unsigned long min_sample_time;
+
+ /* The sample rate of the timer used to increase frequency, in usecs */
+ unsigned long sampling_rate;
+
+ /*
+ * Wait this long before raising speed above hispeed, by default a
+ * single timer interval.
+ * Same alternating delay, frequency, delay, ... layout as target_loads.
+ */
+ spinlock_t above_hispeed_delay_lock;
+ unsigned int *above_hispeed_delay;
+ int nabove_hispeed_delay;
+
+ /* Non-zero means indefinite speed boost active */
+ int boost;
+ /* Duration of a boost pulse in usecs */
+ int boostpulse_duration;
+ /* End time of boost pulse in ktime converted to usecs */
+ u64 boostpulse_endtime;
+ /* True while either the indefinite boost or a boost pulse is in effect */
+ bool boosted;
+
+ /*
+ * Max additional time to wait in idle, beyond sampling_rate, at speeds
+ * above minimum before wakeup to reduce speed, or -1 if unnecessary.
+ * NOTE(review): timer_slack is unsigned, so -1 is held as ULONG_MAX.
+ * timer_slack_delay caches (timer_slack + sampling_rate) in jiffies.
+ */
+#define DEFAULT_TIMER_SLACK (4 * DEFAULT_SAMPLING_RATE)
+ unsigned long timer_slack_delay;
+ unsigned long timer_slack;
+ /* Passed to get_cpu_idle_time() when sampling idle time */
+ bool io_is_busy;
+};
+
+/*
+ * Governor state attached to one cpufreq policy (stored in
+ * policy->governor_data). Separate instance required for each
+ * 'struct cpufreq_policy'.
+ */
+struct interactive_policy {
+ /* The cpufreq policy this instance governs */
+ struct cpufreq_policy *policy;
+ /* Tunables in use; may be the shared global_tunables */
+ struct interactive_tunables *tunables;
+ /* Link in tunables->attr_set.policy_list, walked by for_each_ipolicy() */
+ struct list_head tunables_hook;
+};
+
+/* Per-CPU sampling state. Separate instance required for each CPU */
+struct interactive_cpu {
+ /* Hook registered with cpufreq_add_update_util_hook() */
+ struct update_util_data update_util;
+ /* Owning policy state; NULL checks on it are done under enable_sem */
+ struct interactive_policy *ipolicy;
+
+ /* Deferred sampling work queued by update_util_handler() */
+ struct irq_work irq_work;
+ /* 'time' value of the last sample accepted by update_util_handler() */
+ u64 last_sample_time;
+ /* jiffies value after which idle exit forces a new sample */
+ unsigned long next_sample_jiffies;
+ /* Set while irq_work is queued or running */
+ bool work_in_progress;
+
+ /* Taken for write when ipolicy is changed, try-locked for read by users */
+ struct rw_semaphore enable_sem;
+ struct timer_list slack_timer;
+
+ spinlock_t load_lock; /* protects the next 4 fields */
+ u64 time_in_idle;
+ u64 time_in_idle_timestamp;
+ /* Sum of (active time * current frequency) since the last reset */
+ u64 cputime_speedadj;
+ u64 cputime_speedadj_timestamp;
+
+ spinlock_t target_freq_lock; /*protects target freq */
+ unsigned int target_freq;
+
+ /* Frequency below which we do not drop until min_sample_time elapsed */
+ unsigned int floor_freq;
+ u64 pol_floor_val_time; /* policy floor_validate_time */
+ u64 loc_floor_val_time; /* per-cpu floor_validate_time */
+ u64 pol_hispeed_val_time; /* policy hispeed_validate_time */
+ u64 loc_hispeed_val_time; /* per-cpu hispeed_validate_time */
+};
+
+/* One sampling-state instance per CPU */
+static DEFINE_PER_CPU(struct interactive_cpu, interactive_cpu);
+
+/* Realtime thread handles frequency scaling */
+static struct task_struct *speedchange_task;
+/* CPUs whose target_freq changed and still need the task to act on them */
+static cpumask_t speedchange_cpumask;
+/*
+ * Protects speedchange_cpumask.
+ * NOTE(review): not statically initialised here; presumably set up with
+ * spin_lock_init() in code outside this chunk - confirm.
+ */
+static spinlock_t speedchange_cpumask_lock;
+
+/* Target load. Lower values result in higher CPU speeds. */
+#define DEFAULT_TARGET_LOAD 90
+static unsigned int default_target_loads[] = {DEFAULT_TARGET_LOAD};
+
+#define DEFAULT_SAMPLING_RATE (20 * USEC_PER_MSEC)
+#define DEFAULT_ABOVE_HISPEED_DELAY DEFAULT_SAMPLING_RATE
+static unsigned int default_above_hispeed_delay[] = {
+ DEFAULT_ABOVE_HISPEED_DELAY
+};
+
+/*
+ * Iterate over interactive policies for tunables.
+ * Relies on a variable named 'tunables' being in scope at the call site.
+ */
+#define for_each_ipolicy(__ip) \
+ list_for_each_entry(__ip, &tunables->attr_set.policy_list, tunables_hook)
+
+/* Shared tunables when the driver does not use per-policy governors */
+static struct interactive_tunables *global_tunables;
+/* Serialises governor init/exit and access to global_tunables */
+static DEFINE_MUTEX(global_tunables_lock);
+
+/*
+ * Recompute the cached slack-timer delay (in jiffies) from the current
+ * timer_slack and sampling_rate tunables, both expressed in usecs.
+ */
+static inline void update_slack_delay(struct interactive_tunables *tunables)
+{
+	unsigned long delay_us = tunables->timer_slack + tunables->sampling_rate;
+
+	tunables->timer_slack_delay = usecs_to_jiffies(delay_us);
+}
+
+/*
+ * Tell whether the slack timer has to be armed for this CPU.
+ *
+ * Returns false when the timer_slack tunable is negative (slack disabled)
+ * or when the CPU's target frequency is not above the policy minimum, as
+ * there is then no speed to reduce on wakeup.
+ */
+static bool timer_slack_required(struct interactive_cpu *icpu)
+{
+	struct interactive_policy *ipolicy = icpu->ipolicy;
+	struct interactive_tunables *tunables = ipolicy->tunables;
+
+	/*
+	 * timer_slack is stored as unsigned long, so "-1 means unnecessary"
+	 * can only be detected by looking at the value as a signed quantity.
+	 * A plain "< 0" on the unsigned field is always false.
+	 */
+	if ((long)tunables->timer_slack < 0)
+		return false;
+
+	return icpu->target_freq > ipolicy->policy->min;
+}
+
+/* Arm the CPU's slack timer on @cpu, timer_slack_delay jiffies from now. */
+static void gov_slack_timer_start(struct interactive_cpu *icpu, int cpu)
+{
+	struct timer_list *timer = &icpu->slack_timer;
+
+	timer->expires = jiffies + icpu->ipolicy->tunables->timer_slack_delay;
+	add_timer_on(timer, cpu);
+}
+
+/* Push the CPU's slack timer out to timer_slack_delay jiffies from now. */
+static void gov_slack_timer_modify(struct interactive_cpu *icpu)
+{
+	unsigned long delay = icpu->ipolicy->tunables->timer_slack_delay;
+
+	mod_timer(&icpu->slack_timer, jiffies + delay);
+}
+
+/*
+ * Restart load accounting for @cpu and, if slack is required, (re)arm its
+ * slack timer.
+ *
+ * @modify selects mod_timer() on an existing timer (true) or add_timer_on()
+ * for a timer that is not pending (false). Everything runs under load_lock.
+ */
+static void slack_timer_resched(struct interactive_cpu *icpu, int cpu,
+				bool modify)
+{
+	struct interactive_tunables *tunables = icpu->ipolicy->tunables;
+	unsigned long irq_flags;
+	bool arm_timer;
+
+	spin_lock_irqsave(&icpu->load_lock, irq_flags);
+
+	/* Take a fresh idle-time baseline and reset the accumulated load. */
+	icpu->time_in_idle = get_cpu_idle_time(cpu,
+					       &icpu->time_in_idle_timestamp,
+					       tunables->io_is_busy);
+	icpu->cputime_speedadj = 0;
+	icpu->cputime_speedadj_timestamp = icpu->time_in_idle_timestamp;
+
+	arm_timer = timer_slack_required(icpu);
+	if (arm_timer && modify)
+		gov_slack_timer_modify(icpu);
+	else if (arm_timer)
+		gov_slack_timer_start(icpu, cpu);
+
+	spin_unlock_irqrestore(&icpu->load_lock, irq_flags);
+}
+
+/*
+ * Look up the above_hispeed_delay that applies at @freq.
+ *
+ * The table alternates delay, frequency, delay, ...; the delay at an even
+ * index is used until @freq reaches the frequency stored right after it.
+ */
+static unsigned int
+freq_to_above_hispeed_delay(struct interactive_tunables *tunables,
+			    unsigned int freq)
+{
+	unsigned long irq_flags;
+	unsigned int delay;
+	int idx = 0;
+
+	spin_lock_irqsave(&tunables->above_hispeed_delay_lock, irq_flags);
+
+	while (idx < tunables->nabove_hispeed_delay - 1 &&
+	       freq >= tunables->above_hispeed_delay[idx + 1])
+		idx += 2;
+
+	delay = tunables->above_hispeed_delay[idx];
+	spin_unlock_irqrestore(&tunables->above_hispeed_delay_lock, irq_flags);
+
+	return delay;
+}
+
+/*
+ * Look up the target load that applies at @freq.
+ *
+ * The table alternates load, frequency, load, ...; the load at an even
+ * index is used until @freq reaches the frequency stored right after it.
+ */
+static unsigned int freq_to_targetload(struct interactive_tunables *tunables,
+				       unsigned int freq)
+{
+	unsigned long irq_flags;
+	unsigned int load;
+	int idx = 0;
+
+	spin_lock_irqsave(&tunables->target_loads_lock, irq_flags);
+
+	while (idx < tunables->ntarget_loads - 1 &&
+	       freq >= tunables->target_loads[idx + 1])
+		idx += 2;
+
+	load = tunables->target_loads[idx];
+	spin_unlock_irqrestore(&tunables->target_loads_lock, irq_flags);
+
+	return load;
+}
+
+/*
+ * If increasing frequencies never map to a lower target load then
+ * choose_freq() will find the minimum frequency that does not exceed its
+ * target load given the current load.
+ *
+ * @loadadjfreq is the load-adjusted frequency (average frequency * 100 as
+ * computed by the caller). The search starts at policy->cur and narrows
+ * the window (freqmin, freqmax): freqmin is the highest frequency found to
+ * be too slow, freqmax the lowest found to be fast enough. Returns a
+ * frequency taken from the policy's frequency table.
+ */
+static unsigned int choose_freq(struct interactive_cpu *icpu,
+ unsigned int loadadjfreq)
+{
+ struct cpufreq_policy *policy = icpu->ipolicy->policy;
+ struct cpufreq_frequency_table *freq_table = policy->freq_table;
+ unsigned int prevfreq, freqmin = 0, freqmax = UINT_MAX, tl;
+ unsigned int freq = policy->cur;
+ int index;
+
+ do {
+ prevfreq = freq;
+ tl = freq_to_targetload(icpu->ipolicy->tunables, freq);
+
+ /*
+ * Find the lowest frequency where the computed load is less
+ * than or equal to the target load.
+ */
+
+ /*
+ * tl comes from the target_loads tunable; a zero entry would
+ * divide by zero here, so the table must never hold a zero load.
+ */
+ index = cpufreq_frequency_table_target(policy, loadadjfreq / tl,
+ CPUFREQ_RELATION_L);
+
+ freq = freq_table[index].frequency;
+
+ if (freq > prevfreq) {
+ /* The previous frequency is too low */
+ freqmin = prevfreq;
+
+ if (freq < freqmax)
+ continue;
+
+ /* Find highest frequency that is less than freqmax */
+ index = cpufreq_frequency_table_target(policy,
+ freqmax - 1, CPUFREQ_RELATION_H);
+
+ freq = freq_table[index].frequency;
+
+ if (freq == freqmin) {
+ /*
+ * The first frequency below freqmax has already
+ * been found to be too low. freqmax is the
+ * lowest speed we found that is fast enough.
+ */
+ freq = freqmax;
+ break;
+ }
+ } else if (freq < prevfreq) {
+ /* The previous frequency is high enough. */
+ freqmax = prevfreq;
+
+ if (freq > freqmin)
+ continue;
+
+ /* Find lowest frequency that is higher than freqmin */
+ index = cpufreq_frequency_table_target(policy,
+ freqmin + 1, CPUFREQ_RELATION_L);
+
+ freq = freq_table[index].frequency;
+
+ /*
+ * If freqmax is the first frequency above
+ * freqmin then we have already found that
+ * this speed is fast enough.
+ */
+ if (freq == freqmax)
+ break;
+ }
+
+ /* If same frequency chosen as previous then done. */
+ } while (freq != prevfreq);
+
+ return freq;
+}
+
+/*
+ * Fold the activity since the previous sample into cputime_speedadj.
+ *
+ * Active time is wall time minus idle time (clamped at zero) and is
+ * weighted by the policy's current frequency. The idle baseline is then
+ * moved to "now". Callers in this file hold icpu->load_lock.
+ *
+ * Returns the timestamp reported by get_cpu_idle_time().
+ */
+static u64 update_load(struct interactive_cpu *icpu, int cpu)
+{
+	struct interactive_tunables *tunables = icpu->ipolicy->tunables;
+	u64 wall, idle, wall_delta, idle_delta, busy;
+
+	idle = get_cpu_idle_time(cpu, &wall, tunables->io_is_busy);
+	idle_delta = idle - icpu->time_in_idle;
+	wall_delta = wall - icpu->time_in_idle_timestamp;
+
+	busy = (wall_delta <= idle_delta) ? 0 : wall_delta - idle_delta;
+	icpu->cputime_speedadj += busy * icpu->ipolicy->policy->cur;
+
+	icpu->time_in_idle = idle;
+	icpu->time_in_idle_timestamp = wall;
+
+	return wall;
+}
+
+/*
+ * Re-evaluate load to see if a frequency change is required or not.
+ *
+ * Samples the load with smp_processor_id() as the CPU, so icpu is assumed
+ * to belong to the executing CPU. When a new target frequency is selected
+ * it is stored in icpu->target_freq, the CPU is flagged in
+ * speedchange_cpumask and the speedchange task is woken to apply it.
+ * All "not yet"/"already" outcomes leave through the exit label, which
+ * only drops target_freq_lock.
+ */
+static void eval_target_freq(struct interactive_cpu *icpu)
+{
+ struct interactive_tunables *tunables = icpu->ipolicy->tunables;
+ struct cpufreq_policy *policy = icpu->ipolicy->policy;
+ struct cpufreq_frequency_table *freq_table = policy->freq_table;
+ u64 cputime_speedadj, now, max_fvtime;
+ unsigned int new_freq, loadadjfreq, index, delta_time;
+ unsigned long flags;
+ int cpu_load;
+ int cpu = smp_processor_id();
+
+ /* Snapshot the accumulated load under load_lock */
+ spin_lock_irqsave(&icpu->load_lock, flags);
+ now = update_load(icpu, smp_processor_id());
+ delta_time = (unsigned int)(now - icpu->cputime_speedadj_timestamp);
+ cputime_speedadj = icpu->cputime_speedadj;
+ spin_unlock_irqrestore(&icpu->load_lock, flags);
+
+ /* delta_time is the divisor below */
+ if (WARN_ON_ONCE(!delta_time))
+ return;
+
+ spin_lock_irqsave(&icpu->target_freq_lock, flags);
+ /* Average frequency-weighted activity over the sampling window */
+ do_div(cputime_speedadj, delta_time);
+ loadadjfreq = (unsigned int)cputime_speedadj * 100;
+ cpu_load = loadadjfreq / policy->cur;
+ tunables->boosted = tunables->boost ||
+ now < tunables->boostpulse_endtime;
+
+ if (cpu_load >= tunables->go_hispeed_load || tunables->boosted) {
+ if (policy->cur < tunables->hispeed_freq) {
+ new_freq = tunables->hispeed_freq;
+ } else {
+ new_freq = choose_freq(icpu, loadadjfreq);
+
+ if (new_freq < tunables->hispeed_freq)
+ new_freq = tunables->hispeed_freq;
+ }
+ } else {
+ new_freq = choose_freq(icpu, loadadjfreq);
+ /* Do not jump over hispeed_freq in one step from below it */
+ if (new_freq > tunables->hispeed_freq &&
+ policy->cur < tunables->hispeed_freq)
+ new_freq = tunables->hispeed_freq;
+ }
+
+ /* Hold at or above hispeed until above_hispeed_delay has elapsed */
+ if (policy->cur >= tunables->hispeed_freq &&
+ new_freq > policy->cur &&
+ now - icpu->pol_hispeed_val_time < freq_to_above_hispeed_delay(tunables, policy->cur)) {
+ trace_cpufreq_interactive_notyet(cpu, cpu_load,
+ icpu->target_freq, policy->cur, new_freq);
+ goto exit;
+ }
+
+ icpu->loc_hispeed_val_time = now;
+
+ /* Snap the request to an entry of the frequency table */
+ index = cpufreq_frequency_table_target(policy, new_freq,
+ CPUFREQ_RELATION_L);
+ new_freq = freq_table[index].frequency;
+
+ /*
+ * Do not scale below floor_freq unless we have been at or above the
+ * floor frequency for the minimum sample time since last validated.
+ */
+ max_fvtime = max(icpu->pol_floor_val_time, icpu->loc_floor_val_time);
+ if (new_freq < icpu->floor_freq && icpu->target_freq >= policy->cur) {
+ if (now - max_fvtime < tunables->min_sample_time) {
+ trace_cpufreq_interactive_notyet(cpu, cpu_load,
+ icpu->target_freq, policy->cur, new_freq);
+ goto exit;
+ }
+ }
+
+ /*
+ * Update the timestamp for checking whether speed has been held at
+ * or above the selected frequency for a minimum of min_sample_time,
+ * if not boosted to hispeed_freq. If boosted to hispeed_freq then we
+ * allow the speed to drop as soon as the boostpulse duration expires
+ * (or the indefinite boost is turned off).
+ */
+
+ if (!tunables->boosted || new_freq > tunables->hispeed_freq) {
+ icpu->floor_freq = new_freq;
+ if (icpu->target_freq >= policy->cur || new_freq >= policy->cur)
+ icpu->loc_floor_val_time = now;
+ }
+
+ /* Nothing to do when the same target is already requested */
+ if (icpu->target_freq == new_freq &&
+ icpu->target_freq <= policy->cur) {
+ trace_cpufreq_interactive_already(cpu, cpu_load,
+ icpu->target_freq, policy->cur, new_freq);
+ goto exit;
+ }
+
+ trace_cpufreq_interactive_target(cpu, cpu_load, icpu->target_freq,
+ policy->cur, new_freq);
+
+ icpu->target_freq = new_freq;
+ spin_unlock_irqrestore(&icpu->target_freq_lock, flags);
+
+ /* Hand the actual frequency change over to the speedchange task */
+ spin_lock_irqsave(&speedchange_cpumask_lock, flags);
+ cpumask_set_cpu(cpu, &speedchange_cpumask);
+ spin_unlock_irqrestore(&speedchange_cpumask_lock, flags);
+
+ wake_up_process(speedchange_task);
+ return;
+
+exit:
+ spin_unlock_irqrestore(&icpu->target_freq_lock, flags);
+}
+
+/*
+ * Take one load sample for the executing CPU: re-evaluate the target
+ * frequency, then restart load accounting and push the slack timer out.
+ */
+static void cpufreq_interactive_update(struct interactive_cpu *icpu)
+{
+ eval_target_freq(icpu);
+ slack_timer_resched(icpu, smp_processor_id(), true);
+}
+
+/*
+ * Idle-exit hook for the executing CPU: if the governor is active on it
+ * and the next sample is already due, sample the load immediately.
+ * Does nothing when enable_sem cannot be read-locked without waiting.
+ */
+static void cpufreq_interactive_idle_end(void)
+{
+	struct interactive_cpu *icpu = &per_cpu(interactive_cpu,
+						smp_processor_id());
+
+	if (!down_read_trylock(&icpu->enable_sem))
+		return;
+
+	/*
+	 * We haven't sampled load for more than sampling_rate time, do
+	 * it right now.
+	 */
+	if (icpu->ipolicy && time_after_eq(jiffies, icpu->next_sample_jiffies))
+		cpufreq_interactive_update(icpu);
+
+	up_read(&icpu->enable_sem);
+}
+
+/*
+ * Aggregate per-CPU state over all CPUs of @policy.
+ *
+ * @pmax_freq: receives the highest target_freq among the CPUs.
+ * @phvt:      receives the earliest loc_hispeed_val_time among the CPUs
+ *             that request that highest frequency.
+ * @pfvt:      receives the latest loc_floor_val_time among all CPUs.
+ */
+static void cpufreq_interactive_get_policy_info(struct cpufreq_policy *policy,
+ unsigned int *pmax_freq,
+ u64 *phvt, u64 *pfvt)
+{
+ struct interactive_cpu *icpu;
+ u64 hvt = ~0ULL, fvt = 0;
+ unsigned int max_freq = 0, i;
+
+ for_each_cpu(i, policy->cpus) {
+ icpu = &per_cpu(interactive_cpu, i);
+
+ fvt = max(fvt, icpu->loc_floor_val_time);
+ if (icpu->target_freq > max_freq) {
+ /* New maximum: restart the hispeed timestamp from this CPU */
+ max_freq = icpu->target_freq;
+ hvt = icpu->loc_hispeed_val_time;
+ } else if (icpu->target_freq == max_freq) {
+ hvt = min(hvt, icpu->loc_hispeed_val_time);
+ }
+ }
+
+ *pmax_freq = max_freq;
+ *phvt = hvt;
+ *pfvt = fvt;
+}
+
+/*
+ * Apply the aggregated request of all CPUs in @policy.
+ *
+ * Publishes the policy-wide floor validation time to every CPU and, when
+ * the highest requested frequency differs from policy->cur, asks the driver
+ * to switch to it and publishes the hispeed validation time as well.
+ * @cpu is only used for the trace event.
+ */
+static void cpufreq_interactive_adjust_cpu(unsigned int cpu,
+ struct cpufreq_policy *policy)
+{
+ struct interactive_cpu *icpu;
+ u64 hvt, fvt;
+ unsigned int max_freq;
+ int i;
+
+ cpufreq_interactive_get_policy_info(policy, &max_freq, &hvt, &fvt);
+
+ for_each_cpu(i, policy->cpus) {
+ icpu = &per_cpu(interactive_cpu, i);
+ icpu->pol_floor_val_time = fvt;
+ }
+
+ if (max_freq != policy->cur) {
+ __cpufreq_driver_target(policy, max_freq, CPUFREQ_RELATION_H);
+ for_each_cpu(i, policy->cpus) {
+ icpu = &per_cpu(interactive_cpu, i);
+ icpu->pol_hispeed_val_time = hvt;
+ }
+ }
+
+ trace_cpufreq_interactive_setspeed(cpu, max_freq, policy->cur);
+}
+
+/*
+ * Thread body of the speedchange task.
+ *
+ * Sleeps until speedchange_cpumask is non-empty, takes a private copy of
+ * the mask, clears the shared one and applies the pending frequency change
+ * for every CPU in the copy. kthread_should_stop() is only consulted right
+ * after waking from schedule(); returns 0 when asked to stop. @data is
+ * unused.
+ */
+static int cpufreq_interactive_speedchange_task(void *data)
+{
+ unsigned int cpu;
+ cpumask_t tmp_mask;
+ unsigned long flags;
+
+again:
+ /* Mark ourselves sleeping before testing the mask to not miss a wakeup */
+ set_current_state(TASK_INTERRUPTIBLE);
+ spin_lock_irqsave(&speedchange_cpumask_lock, flags);
+
+ if (cpumask_empty(&speedchange_cpumask)) {
+ spin_unlock_irqrestore(&speedchange_cpumask_lock, flags);
+ schedule();
+
+ if (kthread_should_stop())
+ return 0;
+
+ spin_lock_irqsave(&speedchange_cpumask_lock, flags);
+ }
+
+ set_current_state(TASK_RUNNING);
+ /* Work on a snapshot so the lock is not held across driver calls */
+ tmp_mask = speedchange_cpumask;
+ cpumask_clear(&speedchange_cpumask);
+ spin_unlock_irqrestore(&speedchange_cpumask_lock, flags);
+
+ for_each_cpu(cpu, &tmp_mask) {
+ struct interactive_cpu *icpu = &per_cpu(interactive_cpu, cpu);
+ struct cpufreq_policy *policy;
+
+ /* Skip CPUs whose enable_sem is currently held for write */
+ if (unlikely(!down_read_trylock(&icpu->enable_sem)))
+ continue;
+
+ if (likely(icpu->ipolicy)) {
+ policy = icpu->ipolicy->policy;
+ cpufreq_interactive_adjust_cpu(cpu, policy);
+ }
+
+ up_read(&icpu->enable_sem);
+ }
+
+ goto again;
+}
+
+/*
+ * Start a boost for every CPU of every policy using @tunables.
+ *
+ * Each CPU whose target_freq is below hispeed_freq is raised to
+ * hispeed_freq and flagged in speedchange_cpumask; the speedchange task is
+ * woken if at least one CPU was raised. CPUs on which the governor is not
+ * active, or whose enable_sem cannot be read-locked without waiting, are
+ * skipped.
+ */
+static void cpufreq_interactive_boost(struct interactive_tunables *tunables)
+{
+ struct interactive_policy *ipolicy;
+ struct cpufreq_policy *policy;
+ struct interactive_cpu *icpu;
+ /* flags[0]: speedchange_cpumask_lock, flags[1]: target_freq_lock */
+ unsigned long flags[2];
+ bool wakeup = false;
+ int i;
+
+ tunables->boosted = true;
+
+ spin_lock_irqsave(&speedchange_cpumask_lock, flags[0]);
+
+ /* for_each_ipolicy() picks up the local 'tunables' */
+ for_each_ipolicy(ipolicy) {
+ policy = ipolicy->policy;
+
+ for_each_cpu(i, policy->cpus) {
+ icpu = &per_cpu(interactive_cpu, i);
+
+ if (!down_read_trylock(&icpu->enable_sem))
+ continue;
+
+ if (!icpu->ipolicy) {
+ up_read(&icpu->enable_sem);
+ continue;
+ }
+
+ spin_lock_irqsave(&icpu->target_freq_lock, flags[1]);
+ if (icpu->target_freq < tunables->hispeed_freq) {
+ icpu->target_freq = tunables->hispeed_freq;
+ cpumask_set_cpu(i, &speedchange_cpumask);
+ icpu->pol_hispeed_val_time = ktime_to_us(ktime_get());
+ wakeup = true;
+ }
+ spin_unlock_irqrestore(&icpu->target_freq_lock, flags[1]);
+
+ up_read(&icpu->enable_sem);
+ }
+ }
+
+ spin_unlock_irqrestore(&speedchange_cpumask_lock, flags[0]);
+
+ if (wakeup)
+ wake_up_process(speedchange_task);
+}
+
+/*
+ * cpufreq transition notifier: on CPUFREQ_POSTCHANGE, account the load
+ * accumulated so far on the reported CPU, provided the governor is active
+ * there and enable_sem can be read-locked without waiting. Always returns 0.
+ */
+static int cpufreq_interactive_notifier(struct notifier_block *nb,
+					unsigned long val, void *data)
+{
+	struct cpufreq_freqs *freqs = data;
+	struct interactive_cpu *icpu = &per_cpu(interactive_cpu, freqs->cpu);
+	unsigned long irq_flags;
+
+	if (val == CPUFREQ_POSTCHANGE && down_read_trylock(&icpu->enable_sem)) {
+		if (icpu->ipolicy) {
+			spin_lock_irqsave(&icpu->load_lock, irq_flags);
+			update_load(icpu, freqs->cpu);
+			spin_unlock_irqrestore(&icpu->load_lock, irq_flags);
+		}
+
+		up_read(&icpu->enable_sem);
+	}
+
+	return 0;
+}
+
+/* Registered as a CPUFREQ_TRANSITION_NOTIFIER by cpufreq_interactive_init() */
+static struct notifier_block cpufreq_notifier_block = {
+ .notifier_call = cpufreq_interactive_notifier,
+};
+
+/*
+ * Parse a tunable string of unsigned integers separated by ' ' or ':'
+ * (e.g. "85 1000000:90 1700000:99") into a newly allocated array.
+ *
+ * The number of tokens must be odd. Each token is converted on its own
+ * with kstrtouint() (base auto-detected); running kstrtouint() on the
+ * remainder of the string would reject every input with more than one
+ * token, because it requires the whole string to be a single number.
+ *
+ * @buf:        NUL-terminated input.
+ * @num_tokens: set to the number of parsed values on success.
+ *
+ * Returns the array, to be released with kfree() by the caller, or
+ * ERR_PTR(-EINVAL) for malformed input / ERR_PTR(-ENOMEM) on allocation
+ * failure.
+ */
+static unsigned int *get_tokenized_data(const char *buf, int *num_tokens)
+{
+	const char *cp = buf;
+	int ntokens = 1, i = 0;
+	unsigned int *tokenized_data;
+	int err = -EINVAL;
+
+	/* The scan below starts at cp + 1, which is out of bounds for "" */
+	if (!*buf)
+		goto err;
+
+	while ((cp = strpbrk(cp + 1, " :")))
+		ntokens++;
+
+	if (!(ntokens & 0x1))
+		goto err;
+
+	tokenized_data = kcalloc(ntokens, sizeof(*tokenized_data), GFP_KERNEL);
+	if (!tokenized_data) {
+		err = -ENOMEM;
+		goto err;
+	}
+
+	cp = buf;
+	while (i < ntokens) {
+		size_t len = strcspn(cp, " :");
+		char *token;
+		int ret;
+
+		/* Isolate the token so kstrtouint() sees nothing but it */
+		token = kstrndup(cp, len, GFP_KERNEL);
+		if (!token) {
+			err = -ENOMEM;
+			goto err_kfree;
+		}
+
+		ret = kstrtouint(token, 0, &tokenized_data[i++]);
+		kfree(token);
+		if (ret < 0)
+			goto err_kfree;
+
+		cp += len;
+		if (!*cp)
+			break;
+		cp++;
+	}
+
+	if (i != ntokens)
+		goto err_kfree;
+
+	*num_tokens = ntokens;
+	return tokenized_data;
+
+err_kfree:
+	kfree(tokenized_data);
+err:
+	return ERR_PTR(err);
+}
+
+/* Interactive governor sysfs interface */
+/* Map a sysfs attribute set back to the tunables structure embedding it. */
+static struct interactive_tunables *to_tunables(struct gov_attr_set *attr_set)
+{
+	struct interactive_tunables *owner;
+
+	owner = container_of(attr_set, struct interactive_tunables, attr_set);
+
+	return owner;
+}
+
+/*
+ * Generate show_<file_name>(), a sysfs show handler that prints the
+ * tunables field called file_name followed by a newline. 'type' is the
+ * printf conversion (a string literal such as "%u") matching that field.
+ */
+#define show_one(file_name, type) \
+static ssize_t show_##file_name(struct gov_attr_set *attr_set, char *buf) \
+{ \
+ struct interactive_tunables *tunables = to_tunables(attr_set); \
+ return sprintf(buf, type "\n", tunables->file_name); \
+}
+
+/*
+ * sysfs show handler for target_loads: prints the table in the format
+ * accepted on write, i.e. "load freq:load freq:load", newline terminated.
+ * Returns the number of bytes placed in @buf.
+ */
+static ssize_t show_target_loads(struct gov_attr_set *attr_set, char *buf)
+{
+	struct interactive_tunables *tunables = to_tunables(attr_set);
+	unsigned long irq_flags;
+	ssize_t len = 0;
+	int idx;
+
+	spin_lock_irqsave(&tunables->target_loads_lock, irq_flags);
+
+	for (idx = 0; idx < tunables->ntarget_loads; idx++) {
+		/* A frequency (odd index) is followed by ':', a load by ' ' */
+		const char *sep = (idx & 0x1) ? ":" : " ";
+
+		len += sprintf(buf + len, "%u%s", tunables->target_loads[idx],
+			       sep);
+	}
+
+	/* Replace the trailing separator with the final newline */
+	sprintf(buf + len - 1, "\n");
+	spin_unlock_irqrestore(&tunables->target_loads_lock, irq_flags);
+
+	return len;
+}
+
+/*
+ * sysfs store handler for target_loads.
+ *
+ * Parses @buf with get_tokenized_data() and installs the result as the new
+ * table, freeing the previous one unless it is the built-in default.
+ * Loads (the values at even positions) are used as divisors by
+ * choose_freq(), so a table containing a zero load is rejected.
+ *
+ * Returns @count on success, -EINVAL for malformed input or a zero load,
+ * or the error reported by get_tokenized_data().
+ */
+static ssize_t store_target_loads(struct gov_attr_set *attr_set,
+				  const char *buf, size_t count)
+{
+	struct interactive_tunables *tunables = to_tunables(attr_set);
+	unsigned int *new_target_loads;
+	unsigned long flags;
+	int ntokens;
+	int i;
+
+	new_target_loads = get_tokenized_data(buf, &ntokens);
+	if (IS_ERR(new_target_loads))
+		return PTR_ERR(new_target_loads);
+
+	/* Even indices hold loads, odd indices hold frequencies */
+	for (i = 0; i < ntokens; i += 2) {
+		if (!new_target_loads[i]) {
+			kfree(new_target_loads);
+			return -EINVAL;
+		}
+	}
+
+	spin_lock_irqsave(&tunables->target_loads_lock, flags);
+	if (tunables->target_loads != default_target_loads)
+		kfree(tunables->target_loads);
+	tunables->target_loads = new_target_loads;
+	tunables->ntarget_loads = ntokens;
+	spin_unlock_irqrestore(&tunables->target_loads_lock, flags);
+
+	return count;
+}
+
+/*
+ * sysfs show handler for above_hispeed_delay: prints the table in the
+ * format accepted on write, i.e. "delay freq:delay freq:delay", newline
+ * terminated. Returns the number of bytes placed in @buf.
+ */
+static ssize_t show_above_hispeed_delay(struct gov_attr_set *attr_set,
+					char *buf)
+{
+	struct interactive_tunables *tunables = to_tunables(attr_set);
+	unsigned long irq_flags;
+	ssize_t len = 0;
+	int idx;
+
+	spin_lock_irqsave(&tunables->above_hispeed_delay_lock, irq_flags);
+
+	for (idx = 0; idx < tunables->nabove_hispeed_delay; idx++) {
+		/* A frequency (odd index) is followed by ':', a delay by ' ' */
+		const char *sep = (idx & 0x1) ? ":" : " ";
+
+		len += sprintf(buf + len, "%u%s",
+			       tunables->above_hispeed_delay[idx], sep);
+	}
+
+	/* Replace the trailing separator with the final newline */
+	sprintf(buf + len - 1, "\n");
+	spin_unlock_irqrestore(&tunables->above_hispeed_delay_lock, irq_flags);
+
+	return len;
+}
+
+/*
+ * sysfs store handler for above_hispeed_delay: parses @buf with
+ * get_tokenized_data() and installs the result as the new table, freeing
+ * the previous one unless it is the built-in default. Returns @count on
+ * success or the error reported by the parser.
+ */
+static ssize_t store_above_hispeed_delay(struct gov_attr_set *attr_set,
+					 const char *buf, size_t count)
+{
+	struct interactive_tunables *tunables = to_tunables(attr_set);
+	unsigned int *table, *old_table;
+	unsigned long irq_flags;
+	int nr;
+
+	table = get_tokenized_data(buf, &nr);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+
+	spin_lock_irqsave(&tunables->above_hispeed_delay_lock, irq_flags);
+	old_table = tunables->above_hispeed_delay;
+	if (old_table != default_above_hispeed_delay)
+		kfree(old_table);
+	tunables->above_hispeed_delay = table;
+	tunables->nabove_hispeed_delay = nr;
+	spin_unlock_irqrestore(&tunables->above_hispeed_delay_lock, irq_flags);
+
+	return count;
+}
+
+/*
+ * sysfs store handler for hispeed_freq.
+ *
+ * The field is an unsigned int, so the input is parsed with kstrtouint():
+ * values that do not fit are rejected with -ERANGE instead of being
+ * silently truncated. Returns @count on success or the kstrtouint() error.
+ */
+static ssize_t store_hispeed_freq(struct gov_attr_set *attr_set,
+				  const char *buf, size_t count)
+{
+	struct interactive_tunables *tunables = to_tunables(attr_set);
+	unsigned int val;
+	int ret;
+
+	ret = kstrtouint(buf, 0, &val);
+	if (ret < 0)
+		return ret;
+
+	tunables->hispeed_freq = val;
+
+	return count;
+}
+
+/*
+ * sysfs store handler for go_hispeed_load. Returns @count on success or
+ * the kstrtoul() error for unparsable input.
+ */
+static ssize_t store_go_hispeed_load(struct gov_attr_set *attr_set,
+				     const char *buf, size_t count)
+{
+	unsigned long load;
+	int err = kstrtoul(buf, 0, &load);
+
+	if (err < 0)
+		return err;
+
+	to_tunables(attr_set)->go_hispeed_load = load;
+
+	return count;
+}
+
+/*
+ * sysfs store handler for min_sample_time. Returns @count on success or
+ * the kstrtoul() error for unparsable input.
+ */
+static ssize_t store_min_sample_time(struct gov_attr_set *attr_set,
+				     const char *buf, size_t count)
+{
+	unsigned long sample_time;
+	int err = kstrtoul(buf, 0, &sample_time);
+
+	if (err < 0)
+		return err;
+
+	to_tunables(attr_set)->min_sample_time = sample_time;
+
+	return count;
+}
+
+/* sysfs show handler for timer_rate, which is backed by sampling_rate. */
+static ssize_t show_timer_rate(struct gov_attr_set *attr_set, char *buf)
+{
+	unsigned long rate = to_tunables(attr_set)->sampling_rate;
+
+	return sprintf(buf, "%lu\n", rate);
+}
+
+/*
+ * sysfs store handler for timer_rate (backed by sampling_rate, in usecs).
+ *
+ * The value is rounded to a whole number of jiffies; a warning is logged
+ * when that changes it. The cached slack-timer delay is derived from
+ * sampling_rate, so it is recomputed here as well; otherwise it would keep
+ * reflecting the old rate until timer_slack is next written.
+ *
+ * Returns @count on success or the kstrtoul() error for unparsable input.
+ */
+static ssize_t store_timer_rate(struct gov_attr_set *attr_set, const char *buf,
+				size_t count)
+{
+	struct interactive_tunables *tunables = to_tunables(attr_set);
+	unsigned long val, val_round;
+	int ret;
+
+	ret = kstrtoul(buf, 0, &val);
+	if (ret < 0)
+		return ret;
+
+	val_round = jiffies_to_usecs(usecs_to_jiffies(val));
+	if (val != val_round)
+		pr_warn("timer_rate not aligned to jiffy. Rounded up to %lu\n",
+			val_round);
+
+	tunables->sampling_rate = val_round;
+	update_slack_delay(tunables);
+
+	return count;
+}
+
+/*
+ * sysfs store handler for timer_slack (usecs, or a negative value such as
+ * -1 to disable the slack timer).
+ *
+ * kstrtol() takes a 'long *', so the value is parsed into a signed long
+ * and only then stored in the unsigned field. The cached slack-timer delay
+ * is recomputed afterwards.
+ *
+ * Returns @count on success or the kstrtol() error for unparsable input.
+ */
+static ssize_t store_timer_slack(struct gov_attr_set *attr_set, const char *buf,
+				 size_t count)
+{
+	struct interactive_tunables *tunables = to_tunables(attr_set);
+	long val;
+	int ret;
+
+	ret = kstrtol(buf, 10, &val);
+	if (ret < 0)
+		return ret;
+
+	tunables->timer_slack = val;
+	update_slack_delay(tunables);
+
+	return count;
+}
+
+/*
+ * sysfs store handler for boost. A non-zero value turns the indefinite
+ * boost on and, unless already boosted, raises the CPUs right away; zero
+ * turns it off and ends any running boost pulse at the current time.
+ * Returns @count on success or the kstrtoul() error for unparsable input.
+ */
+static ssize_t store_boost(struct gov_attr_set *attr_set, const char *buf,
+			   size_t count)
+{
+	struct interactive_tunables *tunables = to_tunables(attr_set);
+	unsigned long requested;
+	int err;
+
+	err = kstrtoul(buf, 0, &requested);
+	if (err < 0)
+		return err;
+
+	tunables->boost = requested;
+
+	if (!tunables->boost) {
+		tunables->boostpulse_endtime = ktime_to_us(ktime_get());
+		trace_cpufreq_interactive_unboost("off");
+		return count;
+	}
+
+	trace_cpufreq_interactive_boost("on");
+	if (!tunables->boosted)
+		cpufreq_interactive_boost(tunables);
+
+	return count;
+}
+
+/*
+ * sysfs store handler for boostpulse (write-only trigger).
+ *
+ * Any value that parses as an unsigned long starts a pulse: the pulse end
+ * time becomes now + boostpulse_duration and, unless already boosted, the
+ * CPUs are raised right away. Returns @count on success or the kstrtoul()
+ * error for unparsable input.
+ */
+static ssize_t store_boostpulse(struct gov_attr_set *attr_set, const char *buf,
+ size_t count)
+{
+ struct interactive_tunables *tunables = to_tunables(attr_set);
+ unsigned long val;
+ int ret;
+
+ /* val is parsed for validation only; its value is not used */
+ ret = kstrtoul(buf, 0, &val);
+ if (ret < 0)
+ return ret;
+
+ tunables->boostpulse_endtime = ktime_to_us(ktime_get()) +
+ tunables->boostpulse_duration;
+ trace_cpufreq_interactive_boost("pulse");
+ if (!tunables->boosted)
+ cpufreq_interactive_boost(tunables);
+
+ return count;
+}
+
+/*
+ * sysfs store handler for boostpulse_duration (usecs). Returns @count on
+ * success or the kstrtoul() error for unparsable input.
+ */
+static ssize_t store_boostpulse_duration(struct gov_attr_set *attr_set,
+					 const char *buf, size_t count)
+{
+	unsigned long duration;
+	int err = kstrtoul(buf, 0, &duration);
+
+	if (err < 0)
+		return err;
+
+	to_tunables(attr_set)->boostpulse_duration = duration;
+
+	return count;
+}
+
+/*
+ * sysfs store handler for io_is_busy; any non-zero value enables it.
+ * Returns @count on success or the kstrtoul() error for unparsable input.
+ */
+static ssize_t store_io_is_busy(struct gov_attr_set *attr_set, const char *buf,
+				size_t count)
+{
+	unsigned long enable;
+	int err = kstrtoul(buf, 0, &enable);
+
+	if (err < 0)
+		return err;
+
+	to_tunables(attr_set)->io_is_busy = enable;
+
+	return count;
+}
+
+/* Generated show handlers for the tunables that print as a single value */
+show_one(hispeed_freq, "%u");
+show_one(go_hispeed_load, "%lu");
+show_one(min_sample_time, "%lu");
+show_one(timer_slack, "%lu");
+show_one(boost, "%u");
+show_one(boostpulse_duration, "%u");
+show_one(io_is_busy, "%u");
+
+/* Attribute objects; boostpulse is the only write-only one */
+gov_attr_rw(target_loads);
+gov_attr_rw(above_hispeed_delay);
+gov_attr_rw(hispeed_freq);
+gov_attr_rw(go_hispeed_load);
+gov_attr_rw(min_sample_time);
+gov_attr_rw(timer_rate);
+gov_attr_rw(timer_slack);
+gov_attr_rw(boost);
+gov_attr_wo(boostpulse);
+gov_attr_rw(boostpulse_duration);
+gov_attr_rw(io_is_busy);
+
+/* NULL-terminated list of the files created in the governor's directory */
+static struct attribute *interactive_attributes[] = {
+ &target_loads.attr,
+ &above_hispeed_delay.attr,
+ &hispeed_freq.attr,
+ &go_hispeed_load.attr,
+ &min_sample_time.attr,
+ &timer_rate.attr,
+ &timer_slack.attr,
+ &boost.attr,
+ &boostpulse.attr,
+ &boostpulse_duration.attr,
+ &io_is_busy.attr,
+ NULL
+};
+
+/* kobject type used for the tunables kobject in cpufreq_interactive_init() */
+static struct kobj_type interactive_tunables_ktype = {
+ .default_attrs = interactive_attributes,
+ .sysfs_ops = &governor_sysfs_ops,
+};
+
+/*
+ * Idle notifier callback: forwards IDLE_END events to
+ * cpufreq_interactive_idle_end() and ignores everything else.
+ * Always returns 0.
+ */
+static int cpufreq_interactive_idle_notifier(struct notifier_block *nb,
+					     unsigned long val, void *data)
+{
+	if (val != IDLE_END)
+		return 0;
+
+	cpufreq_interactive_idle_end();
+
+	return 0;
+}
+
+/* Registered with idle_notifier_register() by cpufreq_interactive_init() */
+static struct notifier_block cpufreq_interactive_idle_nb = {
+ .notifier_call = cpufreq_interactive_idle_notifier,
+};
+
+/* Interactive Governor callbacks */
+struct interactive_governor {
+ struct cpufreq_governor gov;
+ /*
+ * Counts governor users; the notifiers are registered when it goes
+ * from zero to one and unregistered when it drops back to zero.
+ */
+ unsigned int usage_count;
+};
+
+static struct interactive_governor interactive_gov;
+
+/* The embedded 'struct cpufreq_governor' of the single governor instance */
+#define CPU_FREQ_GOV_INTERACTIVE (&interactive_gov.gov)
+
+/*
+ * irq_work callback queued by update_util_handler(): takes one load sample
+ * for the owning CPU and then clears work_in_progress so that the next
+ * sample can be queued.
+ */
+static void irq_work(struct irq_work *irq_work)
+{
+ struct interactive_cpu *icpu = container_of(irq_work, struct
+ interactive_cpu, irq_work);
+
+ cpufreq_interactive_update(icpu);
+ icpu->work_in_progress = false;
+}
+
+/*
+ * Utilization-update hook installed by gov_set_update_util().
+ *
+ * Queues the per-CPU irq_work to take a load sample when none is pending
+ * and at least sampling_rate usecs have passed since the previous sample.
+ *
+ * @data:  the update_util member embedded in the CPU's interactive_cpu.
+ * @time:  current time in nanoseconds as passed by the caller of the hook.
+ * @flags: unused.
+ */
+static void update_util_handler(struct update_util_data *data, u64 time,
+				unsigned int flags)
+{
+	struct interactive_cpu *icpu = container_of(data,
+					struct interactive_cpu, update_util);
+	struct interactive_policy *ipolicy = icpu->ipolicy;
+	struct interactive_tunables *tunables = ipolicy->tunables;
+	s64 delta_ns;
+
+	/*
+	 * The irq-work may not be allowed to be queued up right now.
+	 * Possible reasons:
+	 * - Work has already been queued up or is in progress.
+	 * - It is too early (too little time from the previous sample).
+	 */
+	if (icpu->work_in_progress)
+		return;
+
+	/*
+	 * Keep both sides of the comparison signed 64-bit. Comparing against
+	 * an 'unsigned long' product would turn a negative delta into a huge
+	 * positive one on 64-bit and could overflow the product on 32-bit.
+	 */
+	delta_ns = time - icpu->last_sample_time;
+	if (delta_ns < (s64)tunables->sampling_rate * NSEC_PER_USEC)
+		return;
+
+	icpu->last_sample_time = time;
+	icpu->next_sample_jiffies = usecs_to_jiffies(tunables->sampling_rate) +
+				    jiffies;
+
+	icpu->work_in_progress = true;
+	irq_work_queue(&icpu->irq_work);
+}
+
+/*
+ * Reset the sampling timestamps of every CPU in the policy and install
+ * update_util_handler() as its utilization-update hook.
+ */
+static void gov_set_update_util(struct interactive_policy *ipolicy)
+{
+	int cpu;
+
+	for_each_cpu(cpu, ipolicy->policy->cpus) {
+		struct interactive_cpu *icpu = &per_cpu(interactive_cpu, cpu);
+
+		icpu->last_sample_time = 0;
+		icpu->next_sample_jiffies = 0;
+		cpufreq_add_update_util_hook(cpu, &icpu->update_util,
+					     update_util_handler);
+	}
+}
+
+/*
+ * Remove the utilization-update hook from every CPU in @policy, then call
+ * synchronize_sched() before returning.
+ * NOTE(review): presumably the wait lets hook invocations that are already
+ * running finish - confirm against the scheduler's hook documentation.
+ */
+static inline void gov_clear_update_util(struct cpufreq_policy *policy)
+{
+ int i;
+
+ for_each_cpu(i, policy->cpus)
+ cpufreq_remove_update_util_hook(i);
+
+ synchronize_sched();
+}
+
+/*
+ * Quiesce a CPU's deferred work: wait for its irq_work, clear
+ * work_in_progress and synchronously delete its slack timer.
+ */
+static void icpu_cancel_work(struct interactive_cpu *icpu)
+{
+ irq_work_sync(&icpu->irq_work);
+ icpu->work_in_progress = false;
+ del_timer_sync(&icpu->slack_timer);
+}
+
+/*
+ * Allocate zeroed governor state for @policy and link it to the policy.
+ * Returns NULL when the allocation fails.
+ */
+static struct interactive_policy *
+interactive_policy_alloc(struct cpufreq_policy *policy)
+{
+	struct interactive_policy *ipolicy = kzalloc(sizeof(*ipolicy),
+						     GFP_KERNEL);
+
+	if (ipolicy)
+		ipolicy->policy = policy;
+
+	return ipolicy;
+}
+
+/* Release governor state obtained from interactive_policy_alloc(). */
+static void interactive_policy_free(struct interactive_policy *ipolicy)
+{
+ kfree(ipolicy);
+}
+
+/*
+ * Allocate zeroed tunables for @ipolicy, initialise their attribute set
+ * with the policy as first user and attach them to the policy. When the
+ * driver does not use per-policy governors they also become the shared
+ * global_tunables. Returns NULL when the allocation fails.
+ */
+static struct interactive_tunables *
+interactive_tunables_alloc(struct interactive_policy *ipolicy)
+{
+	struct interactive_tunables *tunables = kzalloc(sizeof(*tunables),
+							GFP_KERNEL);
+
+	if (tunables) {
+		gov_attr_set_init(&tunables->attr_set,
+				  &ipolicy->tunables_hook);
+		if (!have_governor_per_policy())
+			global_tunables = tunables;
+
+		ipolicy->tunables = tunables;
+	}
+
+	return tunables;
+}
+
+/*
+ * Free @tunables. When the driver does not use per-policy governors the
+ * shared global_tunables pointer is cleared first.
+ */
+static void interactive_tunables_free(struct interactive_tunables *tunables)
+{
+ if (!have_governor_per_policy())
+ global_tunables = NULL;
+
+ kfree(tunables);
+}
+
+/*
+ * Governor ->init callback: set up governor state for @policy.
+ *
+ * Allocates the per-policy state and either attaches it to the existing
+ * shared tunables or creates new tunables with default values and their
+ * sysfs directory. The first successful user also registers the idle and
+ * cpufreq transition notifiers.
+ *
+ * usage_count is incremented for every successful init, including the
+ * shared-tunables path, because cpufreq_interactive_exit() decrements it
+ * for every policy. Skipping the increment for sharers would unregister
+ * the notifiers while policies are still active and underflow the counter.
+ *
+ * Returns 0 on success, -EBUSY if the policy already has governor data,
+ * -ENOMEM on allocation failure, -EINVAL if shared tunables exist although
+ * per-policy governors are in use, or the kobject_init_and_add() error.
+ */
+int cpufreq_interactive_init(struct cpufreq_policy *policy)
+{
+	struct interactive_policy *ipolicy;
+	struct interactive_tunables *tunables;
+	int ret;
+
+	/* State should be equivalent to EXIT */
+	if (policy->governor_data)
+		return -EBUSY;
+
+	ipolicy = interactive_policy_alloc(policy);
+	if (!ipolicy)
+		return -ENOMEM;
+
+	mutex_lock(&global_tunables_lock);
+
+	if (global_tunables) {
+		if (WARN_ON(have_governor_per_policy())) {
+			ret = -EINVAL;
+			goto free_int_policy;
+		}
+
+		policy->governor_data = ipolicy;
+		ipolicy->tunables = global_tunables;
+
+		gov_attr_set_get(&global_tunables->attr_set,
+				 &ipolicy->tunables_hook);
+		goto out;
+	}
+
+	tunables = interactive_tunables_alloc(ipolicy);
+	if (!tunables) {
+		ret = -ENOMEM;
+		goto free_int_policy;
+	}
+
+	tunables->hispeed_freq = policy->max;
+	tunables->above_hispeed_delay = default_above_hispeed_delay;
+	tunables->nabove_hispeed_delay =
+		ARRAY_SIZE(default_above_hispeed_delay);
+	tunables->go_hispeed_load = DEFAULT_GO_HISPEED_LOAD;
+	tunables->target_loads = default_target_loads;
+	tunables->ntarget_loads = ARRAY_SIZE(default_target_loads);
+	tunables->min_sample_time = DEFAULT_MIN_SAMPLE_TIME;
+	tunables->boostpulse_duration = DEFAULT_MIN_SAMPLE_TIME;
+	tunables->sampling_rate = DEFAULT_SAMPLING_RATE;
+	tunables->timer_slack = DEFAULT_TIMER_SLACK;
+	update_slack_delay(tunables);
+
+	spin_lock_init(&tunables->target_loads_lock);
+	spin_lock_init(&tunables->above_hispeed_delay_lock);
+
+	policy->governor_data = ipolicy;
+
+	ret = kobject_init_and_add(&tunables->attr_set.kobj,
+				   &interactive_tunables_ktype,
+				   get_governor_parent_kobj(policy), "%s",
+				   interactive_gov.gov.name);
+	if (ret)
+		goto fail;
+
+ out:
+	/* One time initialization for governor, counted for every user */
+	if (!interactive_gov.usage_count++) {
+		idle_notifier_register(&cpufreq_interactive_idle_nb);
+		cpufreq_register_notifier(&cpufreq_notifier_block,
+					  CPUFREQ_TRANSITION_NOTIFIER);
+	}
+
+	mutex_unlock(&global_tunables_lock);
+	return 0;
+
+ fail:
+	policy->governor_data = NULL;
+	interactive_tunables_free(tunables);
+
+ free_int_policy:
+	mutex_unlock(&global_tunables_lock);
+
+	interactive_policy_free(ipolicy);
+	pr_err("governor initialization failed (%d)\n", ret);
+
+	return ret;
+}
+/*
+ * Governor ->exit callback: tear down what cpufreq_interactive_init() set up
+ * for @policy.
+ *
+ * Under global_tunables_lock this drops the governor usage count
+ * (unregistering the transition and idle notifiers when it reaches zero),
+ * releases this policy's hook on the tunables attribute set and frees the
+ * tunables when gov_attr_set_put() returns 0. The per-policy data is freed
+ * after the lock has been released.
+ */
+void cpufreq_interactive_exit(struct cpufreq_policy *policy)
+{
+ struct interactive_policy *ipolicy = policy->governor_data;
+ struct interactive_tunables *tunables = ipolicy->tunables;
+ unsigned int count;
+
+ mutex_lock(&global_tunables_lock);
+
+ /* Last policy using the governor ? */
+ if (!--interactive_gov.usage_count) {
+ cpufreq_unregister_notifier(&cpufreq_notifier_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
+ idle_notifier_unregister(&cpufreq_interactive_idle_nb);
+ }
+
+ count = gov_attr_set_put(&tunables->attr_set, &ipolicy->tunables_hook);
+ policy->governor_data = NULL;
+ if (!count)
+ interactive_tunables_free(tunables);
+
+ mutex_unlock(&global_tunables_lock);
+
+ interactive_policy_free(ipolicy);
+}
+/*
+ * Governor ->start callback.
+ *
+ * For every CPU in policy->cpus: seed target_freq/floor_freq from
+ * policy->cur, stamp all four validation times with the current time
+ * (ktime_get() converted to microseconds), publish the policy pointer under
+ * enable_sem and call slack_timer_resched(). Finally gov_set_update_util()
+ * is called for the policy. Always returns 0.
+ */
+int cpufreq_interactive_start(struct cpufreq_policy *policy)
+{
+ struct interactive_policy *ipolicy = policy->governor_data;
+ struct interactive_cpu *icpu;
+ unsigned int cpu;
+
+ for_each_cpu(cpu, policy->cpus) {
+ icpu = &per_cpu(interactive_cpu, cpu);
+
+ icpu->target_freq = policy->cur;
+ icpu->floor_freq = icpu->target_freq;
+ icpu->pol_floor_val_time = ktime_to_us(ktime_get());
+ icpu->loc_floor_val_time = icpu->pol_floor_val_time;
+ icpu->pol_hispeed_val_time = icpu->pol_floor_val_time;
+ icpu->loc_hispeed_val_time = icpu->pol_floor_val_time;
+
+ down_write(&icpu->enable_sem);
+ icpu->ipolicy = ipolicy;
+ up_write(&icpu->enable_sem);
+
+ slack_timer_resched(icpu, cpu, false);
+ }
+
+ gov_set_update_util(ipolicy);
+ return 0;
+}
+/*
+ * Governor ->stop callback, the counterpart of cpufreq_interactive_start().
+ *
+ * gov_clear_update_util() is called first; then, for every CPU of the
+ * policy, pending work is cancelled with icpu_cancel_work() and the
+ * per-CPU policy pointer is cleared under enable_sem.
+ */
+void cpufreq_interactive_stop(struct cpufreq_policy *policy)
+{
+ struct interactive_policy *ipolicy = policy->governor_data;
+ struct interactive_cpu *icpu;
+ unsigned int cpu;
+
+ gov_clear_update_util(ipolicy->policy);
+
+ for_each_cpu(cpu, policy->cpus) {
+ icpu = &per_cpu(interactive_cpu, cpu);
+
+ icpu_cancel_work(icpu);
+
+ down_write(&icpu->enable_sem);
+ icpu->ipolicy = NULL;
+ up_write(&icpu->enable_sem);
+ }
+}
+/*
+ * Governor ->limits callback.
+ *
+ * Applies the policy limits through cpufreq_policy_apply_limits() and then,
+ * for every CPU of the policy, clamps the cached target_freq into
+ * [policy->min, policy->max] while holding target_freq_lock.
+ */
+void cpufreq_interactive_limits(struct cpufreq_policy *policy)
+{
+ struct interactive_cpu *icpu;
+ unsigned int cpu;
+ unsigned long flags;
+
+ cpufreq_policy_apply_limits(policy);
+
+ for_each_cpu(cpu, policy->cpus) {
+ icpu = &per_cpu(interactive_cpu, cpu);
+
+ spin_lock_irqsave(&icpu->target_freq_lock, flags);
+
+ if (policy->max < icpu->target_freq)
+ icpu->target_freq = policy->max;
+ else if (policy->min > icpu->target_freq)
+ icpu->target_freq = policy->min;
+
+ spin_unlock_irqrestore(&icpu->target_freq_lock, flags);
+ }
+}
+/*
+ * The governor object: names the governor "interactive" and wires the
+ * init/exit/start/stop/limits callbacks defined above into its embedded
+ * struct cpufreq_governor.
+ */
+static struct interactive_governor interactive_gov = {
+ .gov = {
+ .name = "interactive",
+ .max_transition_latency = TRANSITION_LATENCY_LIMIT,
+ .owner = THIS_MODULE,
+ .init = cpufreq_interactive_init,
+ .exit = cpufreq_interactive_exit,
+ .start = cpufreq_interactive_start,
+ .stop = cpufreq_interactive_stop,
+ .limits = cpufreq_interactive_limits,
+ }
+};
+/* Handler of the per-cpu slack timer; the body is intentionally empty. */
+static void cpufreq_interactive_nop_timer(unsigned long data)
+{
+ /*
+ * The purpose of slack-timer is to wake up the CPU from IDLE, in order
+ * to decrease its frequency if it is not set to minimum already.
+ *
+ * This is important for platforms where CPU with higher frequencies
+ * consume higher power even at IDLE.
+ */
+}
+
+/*
+ * One-time initialisation of the governor.
+ *
+ * Initialises the per-CPU state (irq work, locks, enable semaphore and the
+ * pinned slack timer) for every possible CPU, creates the "cfinteractive"
+ * speed-change kthread as a SCHED_FIFO task with priority MAX_RT_PRIO - 1,
+ * and registers the governor with the cpufreq core.
+ *
+ * Returns 0 on success or a negative errno. If registering the governor
+ * fails, the kthread is stopped and the task reference taken here is dropped
+ * again, so nothing is left behind.
+ */
+static int __init cpufreq_interactive_gov_init(void)
+{
+	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+	struct interactive_cpu *icpu;
+	unsigned int cpu;
+	int ret;
+
+	for_each_possible_cpu(cpu) {
+		icpu = &per_cpu(interactive_cpu, cpu);
+
+		init_irq_work(&icpu->irq_work, irq_work);
+		spin_lock_init(&icpu->load_lock);
+		spin_lock_init(&icpu->target_freq_lock);
+		init_rwsem(&icpu->enable_sem);
+
+		/* Initialize per-cpu slack-timer */
+		init_timer_pinned(&icpu->slack_timer);
+		icpu->slack_timer.function = cpufreq_interactive_nop_timer;
+	}
+
+	spin_lock_init(&speedchange_cpumask_lock);
+	speedchange_task = kthread_create(cpufreq_interactive_speedchange_task,
+					  NULL, "cfinteractive");
+	if (IS_ERR(speedchange_task))
+		return PTR_ERR(speedchange_task);
+
+	sched_setscheduler_nocheck(speedchange_task, SCHED_FIFO, &param);
+	get_task_struct(speedchange_task);
+
+	/* wake up so the thread does not look hung to the freezer */
+	wake_up_process(speedchange_task);
+
+	ret = cpufreq_register_governor(CPU_FREQ_GOV_INTERACTIVE);
+	if (ret) {
+		/* Same teardown as cpufreq_interactive_gov_exit() */
+		kthread_stop(speedchange_task);
+		put_task_struct(speedchange_task);
+	}
+
+	return ret;
+}
+/*
+ * When interactive is configured as the default governor it is returned by
+ * cpufreq_default_governor() and initialised with fs_initcall(); otherwise
+ * the init function is hooked up with module_init().
+ */
+#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_INTERACTIVE
+struct cpufreq_governor *cpufreq_default_governor(void)
+{
+ return CPU_FREQ_GOV_INTERACTIVE;
+}
+
+fs_initcall(cpufreq_interactive_gov_init);
+#else
+module_init(cpufreq_interactive_gov_init);
+#endif
+/*
+ * Module exit: unregister the governor, then stop the speed-change kthread
+ * and drop the task reference taken with get_task_struct() in
+ * cpufreq_interactive_gov_init().
+ */
+static void __exit cpufreq_interactive_gov_exit(void)
+{
+ cpufreq_unregister_governor(CPU_FREQ_GOV_INTERACTIVE);
+ kthread_stop(speedchange_task);
+ put_task_struct(speedchange_task);
+}
+module_exit(cpufreq_interactive_gov_exit);
+
+MODULE_AUTHOR("Mike Chan <mike@android.com>");
+MODULE_DESCRIPTION("'cpufreq_interactive' - A dynamic cpufreq governor for Latency sensitive workloads");
+MODULE_LICENSE("GPL");
diff --git a/drivers/cpufreq/cpufreq_performance.c b/drivers/cpufreq/cpufreq_performance.c
index dafb679adc58..399428e40e89 100644
--- a/drivers/cpufreq/cpufreq_performance.c
+++ b/drivers/cpufreq/cpufreq_performance.c
@@ -22,7 +22,10 @@ static void cpufreq_gov_performance_limits(struct cpufreq_policy *policy)
__cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H);
}
-static struct cpufreq_governor cpufreq_gov_performance = {
+#ifdef CONFIG_CPU_FREQ_GOV_PERFORMANCE_MODULE
+static
+#endif
+struct cpufreq_governor cpufreq_gov_performance = {
.name = "performance",
.owner = THIS_MODULE,
.limits = cpufreq_gov_performance_limits,
diff --git a/drivers/cpufreq/cpufreq_powersave.c b/drivers/cpufreq/cpufreq_powersave.c
index 78a651038faf..5daa500fb0a9 100644
--- a/drivers/cpufreq/cpufreq_powersave.c
+++ b/drivers/cpufreq/cpufreq_powersave.c
@@ -22,7 +22,10 @@ static void cpufreq_gov_powersave_limits(struct cpufreq_policy *policy)
__cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L);
}
-static struct cpufreq_governor cpufreq_gov_powersave = {
+#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE
+static
+#endif
+struct cpufreq_governor cpufreq_gov_powersave = {
.name = "powersave",
.limits = cpufreq_gov_powersave_limits,
.owner = THIS_MODULE,
diff --git a/drivers/cpufreq/cpufreq_userspace.c b/drivers/cpufreq/cpufreq_userspace.c
index bd897e3e134d..765166d881bb 100644
--- a/drivers/cpufreq/cpufreq_userspace.c
+++ b/drivers/cpufreq/cpufreq_userspace.c
@@ -118,7 +118,10 @@ static void cpufreq_userspace_policy_limits(struct cpufreq_policy *policy)
mutex_unlock(&userspace_mutex);
}
-static struct cpufreq_governor cpufreq_gov_userspace = {
+#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE
+static
+#endif
+struct cpufreq_governor cpufreq_gov_userspace = {
.name = "userspace",
.init = cpufreq_userspace_policy_init,
.exit = cpufreq_userspace_policy_exit,
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 35237c8d5206..439f460ac082 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -193,7 +193,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
}
/* Take note of the planned idle state. */
- sched_idle_set_state(target_state);
+ sched_idle_set_state(target_state, index);
trace_cpu_idle_rcuidle(index, dev->cpu);
time_start = ns_to_ktime(local_clock());
@@ -206,7 +206,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
/* The cpu is no longer idle or about to enter idle. */
- sched_idle_set_state(NULL);
+ sched_idle_set_state(NULL, -1);
if (broadcast) {
if (WARN_ON_ONCE(!irqs_disabled()))
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 03d38c291de6..65bb6fd70439 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -178,7 +178,12 @@ static inline int performance_multiplier(unsigned long nr_iowaiters, unsigned lo
/* for higher loadavg, we are more reluctant */
- mult += 2 * get_loadavg(load);
+ /*
+ * this doesn't work as intended - it is almost always 0, but can
+ * sometimes, depending on workload, spike very high into the hundreds
+ * even when the average cpu load is under 10%.
+ */
+ /* mult += 2 * get_loadavg(); */
/* for IO wait tasks (per cpu!) we add 5x each */
mult += 10 * nr_iowaiters;
diff --git a/drivers/dma-buf/fence.c b/drivers/dma-buf/fence.c
index 04bf29808200..883b3bea143c 100644
--- a/drivers/dma-buf/fence.c
+++ b/drivers/dma-buf/fence.c
@@ -68,6 +68,8 @@ int fence_signal_locked(struct fence *fence)
struct fence_cb *cur, *tmp;
int ret = 0;
+ lockdep_assert_held(fence->lock);
+
if (WARN_ON(!fence))
return -EINVAL;
@@ -159,9 +161,6 @@ fence_wait_timeout(struct fence *fence, bool intr, signed long timeout)
if (WARN_ON(timeout < 0))
return -EINVAL;
- if (timeout == 0)
- return fence_is_signaled(fence);
-
trace_fence_wait_start(fence);
ret = fence->ops->wait(fence, intr, timeout);
trace_fence_wait_end(fence);
@@ -329,8 +328,12 @@ fence_remove_callback(struct fence *fence, struct fence_cb *cb)
spin_lock_irqsave(fence->lock, flags);
ret = !list_empty(&cb->node);
- if (ret)
+ if (ret) {
list_del_init(&cb->node);
+ if (list_empty(&fence->cb_list))
+ if (fence->ops->disable_signaling)
+ fence->ops->disable_signaling(fence);
+ }
spin_unlock_irqrestore(fence->lock, flags);
diff --git a/drivers/dma-buf/reservation.c b/drivers/dma-buf/reservation.c
index 723d8af988e5..82f35a4ab390 100644
--- a/drivers/dma-buf/reservation.c
+++ b/drivers/dma-buf/reservation.c
@@ -280,18 +280,24 @@ int reservation_object_get_fences_rcu(struct reservation_object *obj,
unsigned *pshared_count,
struct fence ***pshared)
{
- unsigned shared_count = 0;
- unsigned retry = 1;
- struct fence **shared = NULL, *fence_excl = NULL;
- int ret = 0;
+ struct fence **shared = NULL;
+ struct fence *fence_excl;
+ unsigned int shared_count;
+ int ret = 1;
- while (retry) {
+ do {
struct reservation_object_list *fobj;
unsigned seq;
+ unsigned int i;
- seq = read_seqcount_begin(&obj->seq);
+ shared_count = i = 0;
rcu_read_lock();
+ seq = read_seqcount_begin(&obj->seq);
+
+ fence_excl = rcu_dereference(obj->fence_excl);
+ if (fence_excl && !fence_get_rcu(fence_excl))
+ goto unlock;
fobj = rcu_dereference(obj->fence);
if (fobj) {
@@ -309,52 +315,37 @@ int reservation_object_get_fences_rcu(struct reservation_object *obj,
}
ret = -ENOMEM;
- shared_count = 0;
break;
}
shared = nshared;
- memcpy(shared, fobj->shared, sz);
shared_count = fobj->shared_count;
- } else
- shared_count = 0;
- fence_excl = rcu_dereference(obj->fence_excl);
-
- retry = read_seqcount_retry(&obj->seq, seq);
- if (retry)
- goto unlock;
-
- if (!fence_excl || fence_get_rcu(fence_excl)) {
- unsigned i;
for (i = 0; i < shared_count; ++i) {
- if (fence_get_rcu(shared[i]))
- continue;
-
- /* uh oh, refcount failed, abort and retry */
- while (i--)
- fence_put(shared[i]);
-
- if (fence_excl) {
- fence_put(fence_excl);
- fence_excl = NULL;
- }
-
- retry = 1;
- break;
+ shared[i] = rcu_dereference(fobj->shared[i]);
+ if (!fence_get_rcu(shared[i]))
+ break;
}
- } else
- retry = 1;
+ }
+ if (i != shared_count || read_seqcount_retry(&obj->seq, seq)) {
+ while (i--)
+ fence_put(shared[i]);
+ fence_put(fence_excl);
+ goto unlock;
+ }
+
+ ret = 0;
unlock:
rcu_read_unlock();
- }
- *pshared_count = shared_count;
- if (shared_count)
- *pshared = shared;
- else {
- *pshared = NULL;
+ } while (ret);
+
+ if (!shared_count) {
kfree(shared);
+ shared = NULL;
}
+
+ *pshared_count = shared_count;
+ *pshared = shared;
*pfence_excl = fence_excl;
return ret;
@@ -379,10 +370,7 @@ long reservation_object_wait_timeout_rcu(struct reservation_object *obj,
{
struct fence *fence;
unsigned seq, shared_count, i = 0;
- long ret = timeout;
-
- if (!timeout)
- return reservation_object_test_signaled_rcu(obj, wait_all);
+ long ret = timeout ? timeout : 1;
retry:
fence = NULL;
@@ -397,9 +385,6 @@ retry:
if (fobj)
shared_count = fobj->shared_count;
- if (read_seqcount_retry(&obj->seq, seq))
- goto unlock_retry;
-
for (i = 0; i < shared_count; ++i) {
struct fence *lfence = rcu_dereference(fobj->shared[i]);
@@ -422,9 +407,6 @@ retry:
if (!shared_count) {
struct fence *fence_excl = rcu_dereference(obj->fence_excl);
- if (read_seqcount_retry(&obj->seq, seq))
- goto unlock_retry;
-
if (fence_excl &&
!test_bit(FENCE_FLAG_SIGNALED_BIT, &fence_excl->flags)) {
if (!fence_get_rcu(fence_excl))
@@ -439,6 +421,11 @@ retry:
rcu_read_unlock();
if (fence) {
+ if (read_seqcount_retry(&obj->seq, seq)) {
+ fence_put(fence);
+ goto retry;
+ }
+
ret = fence_wait_timeout(fence, intr, ret);
fence_put(fence);
if (ret > 0 && wait_all && (i + 1 < shared_count))
@@ -484,12 +471,13 @@ bool reservation_object_test_signaled_rcu(struct reservation_object *obj,
bool test_all)
{
unsigned seq, shared_count;
- int ret = true;
+ int ret;
+ rcu_read_lock();
retry:
+ ret = true;
shared_count = 0;
seq = read_seqcount_begin(&obj->seq);
- rcu_read_lock();
if (test_all) {
unsigned i;
@@ -500,46 +488,35 @@ retry:
if (fobj)
shared_count = fobj->shared_count;
- if (read_seqcount_retry(&obj->seq, seq))
- goto unlock_retry;
-
for (i = 0; i < shared_count; ++i) {
struct fence *fence = rcu_dereference(fobj->shared[i]);
ret = reservation_object_test_signaled_single(fence);
if (ret < 0)
- goto unlock_retry;
+ goto retry;
else if (!ret)
break;
}
- /*
- * There could be a read_seqcount_retry here, but nothing cares
- * about whether it's the old or newer fence pointers that are
- * signaled. That race could still have happened after checking
- * read_seqcount_retry. If you care, use ww_mutex_lock.
- */
+ if (read_seqcount_retry(&obj->seq, seq))
+ goto retry;
}
if (!shared_count) {
struct fence *fence_excl = rcu_dereference(obj->fence_excl);
- if (read_seqcount_retry(&obj->seq, seq))
- goto unlock_retry;
-
if (fence_excl) {
ret = reservation_object_test_signaled_single(
fence_excl);
if (ret < 0)
- goto unlock_retry;
+ goto retry;
+
+ if (read_seqcount_retry(&obj->seq, seq))
+ goto retry;
}
}
rcu_read_unlock();
return ret;
-
-unlock_retry:
- rcu_read_unlock();
- goto retry;
}
EXPORT_SYMBOL_GPL(reservation_object_test_signaled_rcu);
diff --git a/drivers/dma-buf/sw_sync.c b/drivers/dma-buf/sw_sync.c
index 4f3511415b29..9dc86d3303ae 100644
--- a/drivers/dma-buf/sw_sync.c
+++ b/drivers/dma-buf/sw_sync.c
@@ -169,6 +169,13 @@ static bool timeline_fence_enable_signaling(struct fence *fence)
return true;
}
+static void timeline_fence_disable_signaling(struct fence *fence)
+{
+ struct sync_pt *pt = container_of(fence, struct sync_pt, base);
+ /* ->disable_signaling hook: unlink the sync_pt embedding @fence from the list its link node is on and re-initialise that node. */
+ list_del_init(&pt->link);
+}
+
static void timeline_fence_value_str(struct fence *fence,
char *str, int size)
{
@@ -187,6 +194,7 @@ static const struct fence_ops timeline_fence_ops = {
.get_driver_name = timeline_fence_get_driver_name,
.get_timeline_name = timeline_fence_get_timeline_name,
.enable_signaling = timeline_fence_enable_signaling,
+ .disable_signaling = timeline_fence_disable_signaling,
.signaled = timeline_fence_signaled,
.wait = fence_default_wait,
.release = timeline_fence_release,
@@ -360,8 +368,8 @@ static long sw_sync_ioctl_create_fence(struct sync_timeline *obj,
}
sync_file = sync_file_create(&pt->base);
+ fence_put(&pt->base);
if (!sync_file) {
- fence_put(&pt->base);
err = -ENOMEM;
goto err;
}
diff --git a/drivers/dma-buf/sync_file.c b/drivers/dma-buf/sync_file.c
index f0c374d6ab40..c835f6216922 100644
--- a/drivers/dma-buf/sync_file.c
+++ b/drivers/dma-buf/sync_file.c
@@ -299,10 +299,9 @@ static unsigned int sync_file_poll(struct file *file, poll_table *wait)
poll_wait(file, &sync_file->wq, wait);
- if (!poll_does_not_wait(wait) &&
- !test_and_set_bit(POLL_ENABLED, &sync_file->fence->flags)) {
+ if (!test_and_set_bit(POLL_ENABLED, &sync_file->fence->flags)) {
if (fence_add_callback(sync_file->fence, &sync_file->cb,
- fence_check_cb_func) < 0)
+ fence_check_cb_func) < 0)
wake_up_all(&sync_file->wq);
}
diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig
index 483059a22b1b..43cb33dc8333 100644
--- a/drivers/gpu/drm/Kconfig
+++ b/drivers/gpu/drm/Kconfig
@@ -12,6 +12,7 @@ menuconfig DRM
select I2C
select I2C_ALGOBIT
select DMA_SHARED_BUFFER
+ select SYNC_FILE
help
Kernel-level support for the Direct Rendering Infrastructure (DRI)
introduced in XFree86 4.0. If you say Y here, you need to select
diff --git a/drivers/gpu/drm/drm_atomic.c b/drivers/gpu/drm/drm_atomic.c
index dd6fff1c98d6..d6f57c4bb413 100644
--- a/drivers/gpu/drm/drm_atomic.c
+++ b/drivers/gpu/drm/drm_atomic.c
@@ -30,6 +30,7 @@
#include <drm/drm_atomic.h>
#include <drm/drm_mode.h>
#include <drm/drm_plane_helper.h>
+#include <linux/sync_file.h>
#include "drm_crtc_internal.h"
@@ -292,6 +293,23 @@ drm_atomic_get_crtc_state(struct drm_atomic_state *state,
}
EXPORT_SYMBOL(drm_atomic_get_crtc_state);
+static void set_out_fence_for_crtc(struct drm_atomic_state *state,
+ struct drm_crtc *crtc, s32 __user *fence_ptr)
+{ /* Remember the user pointer for @crtc in its slot of state->crtcs[], indexed by drm_crtc_index(). */
+ state->crtcs[drm_crtc_index(crtc)].out_fence_ptr = fence_ptr;
+}
+/*
+ * Fetch the out-fence user pointer stored for @crtc in @state and reset the
+ * stored value to NULL, so the pointer is handed out only once per
+ * set_out_fence_for_crtc() call. Returns the pointer that was stored.
+ */
+static s32 __user *get_out_fence_for_crtc(struct drm_atomic_state *state,
+ struct drm_crtc *crtc)
+{
+ s32 __user *fence_ptr;
+
+ fence_ptr = state->crtcs[drm_crtc_index(crtc)].out_fence_ptr;
+ state->crtcs[drm_crtc_index(crtc)].out_fence_ptr = NULL;
+
+ return fence_ptr;
+}
+
/**
* drm_atomic_set_mode_for_crtc - set mode for CRTC
* @state: the CRTC whose incoming state to update
@@ -496,6 +514,16 @@ int drm_atomic_crtc_set_property(struct drm_crtc *crtc,
&replaced);
state->color_mgmt_changed |= replaced;
return ret;
+ } else if (property == config->prop_out_fence_ptr) {
+ s32 __user *fence_ptr = u64_to_user_ptr(val);
+
+ if (!fence_ptr)
+ return 0;
+
+ if (put_user(-1, fence_ptr))
+ return -EFAULT;
+
+ set_out_fence_for_crtc(state->state, crtc, fence_ptr);
} else if (crtc->funcs->atomic_set_property)
return crtc->funcs->atomic_set_property(crtc, state, property, val);
else
@@ -538,6 +566,8 @@ drm_atomic_crtc_get_property(struct drm_crtc *crtc,
*val = (state->ctm) ? state->ctm->base.id : 0;
else if (property == config->gamma_lut_property)
*val = (state->gamma_lut) ? state->gamma_lut->base.id : 0;
+ else if (property == config->prop_out_fence_ptr)
+ *val = 0;
else if (crtc->funcs->atomic_get_property)
return crtc->funcs->atomic_get_property(crtc, state, property, val);
else
@@ -693,6 +723,17 @@ int drm_atomic_plane_set_property(struct drm_plane *plane,
drm_atomic_set_fb_for_plane(state, fb);
if (fb)
drm_framebuffer_unreference(fb);
+ } else if (property == config->prop_in_fence_fd) {
+ if (state->fence)
+ return -EINVAL;
+
+ if (U642I64(val) == -1)
+ return 0;
+
+ state->fence = sync_file_get_fence(val);
+ if (!state->fence)
+ return -EINVAL;
+
} else if (property == config->prop_crtc_id) {
struct drm_crtc *crtc = drm_crtc_find(dev, val);
return drm_atomic_set_crtc_for_plane(state, crtc);
@@ -752,6 +793,8 @@ drm_atomic_plane_get_property(struct drm_plane *plane,
if (property == config->prop_fb_id) {
*val = (state->fb) ? state->fb->base.id : 0;
+ } else if (property == config->prop_in_fence_fd) {
+ *val = -1;
} else if (property == config->prop_crtc_id) {
*val = (state->crtc) ? state->crtc->base.id : 0;
} else if (property == config->prop_crtc_x) {
@@ -1154,6 +1197,36 @@ drm_atomic_set_fb_for_plane(struct drm_plane_state *plane_state,
EXPORT_SYMBOL(drm_atomic_set_fb_for_plane);
/**
+ * drm_atomic_set_fence_for_plane - set fence for plane
+ * @plane_state: atomic state object for the plane
+ * @fence: fence to use for the plane
+ *
+ * Helper to set up the plane_state fence in case it is not set yet.
+ * By using this, drivers don't need to worry about whether the user chose
+ * implicit or explicit fencing.
+ *
+ * This function will not set the fence on the state if one was already set
+ * via the explicit fencing interfaces of the atomic ioctl. In that case it
+ * also drops the reference to @fence, as it is not stored
+ * anywhere.
+ *
+ * Otherwise, if plane_state->fence is not set, this function
+ * just sets it to the received implicit fence.
+ */
+void
+drm_atomic_set_fence_for_plane(struct drm_plane_state *plane_state,
+ struct fence *fence)
+{
+ if (plane_state->fence) {
+ fence_put(fence);
+ return;
+ }
+
+ plane_state->fence = fence;
+}
+EXPORT_SYMBOL(drm_atomic_set_fence_for_plane);
+
+/**
* drm_atomic_set_crtc_for_connector - set crtc for connector
* @conn_state: atomic state object for the connector
* @crtc: crtc to use for the connector
@@ -1472,11 +1545,9 @@ EXPORT_SYMBOL(drm_atomic_nonblocking_commit);
*/
static struct drm_pending_vblank_event *create_vblank_event(
- struct drm_device *dev, struct drm_file *file_priv,
- struct fence *fence, uint64_t user_data)
+ struct drm_device *dev, uint64_t user_data)
{
struct drm_pending_vblank_event *e = NULL;
- int ret;
e = kzalloc(sizeof *e, GFP_KERNEL);
if (!e)
@@ -1486,17 +1557,6 @@ static struct drm_pending_vblank_event *create_vblank_event(
e->event.base.length = sizeof(e->event);
e->event.user_data = user_data;
- if (file_priv) {
- ret = drm_event_reserve_init(dev, file_priv, &e->base,
- &e->event.base);
- if (ret) {
- kfree(e);
- return NULL;
- }
- }
-
- e->base.fence = fence;
-
return e;
}
@@ -1601,6 +1661,206 @@ void drm_atomic_clean_old_fb(struct drm_device *dev,
}
EXPORT_SYMBOL(drm_atomic_clean_old_fb);
+/**
+ * DOC: explicit fencing properties
+ *
+ * Explicit fencing allows userspace to control the buffer synchronization
+ * between devices. A Fence or a group of fences are transferred to/from
+ * userspace using Sync File fds and there are two DRM properties for that.
+ * IN_FENCE_FD on each DRM Plane to send fences to the kernel and
+ * OUT_FENCE_PTR on each DRM CRTC to receive fences from the kernel.
+ *
+ * As a contrast, with implicit fencing the kernel keeps track of any
+ * ongoing rendering, and automatically ensures that the atomic update waits
+ * for any pending rendering to complete. For shared buffers represented with
+ * a struct &dma_buf this is tracked in &reservation_object structures.
+ * Implicit syncing is how Linux traditionally worked (e.g. DRI2/3 on X.org),
+ * whereas explicit fencing is what Android wants.
+ *
+ * "IN_FENCE_FD":
+ * Use this property to pass a fence that DRM should wait on before
+ * proceeding with the Atomic Commit request and show the framebuffer for
+ * the plane on the screen. The fence can be either a normal fence or a
+ * merged one, the sync_file framework will handle both cases and use a
+ * fence_array if a merged fence is received. Passing -1 here means no
+ * fences to wait on.
+ *
+ * If the Atomic Commit request has the DRM_MODE_ATOMIC_TEST_ONLY flag
+ * it will only check if the Sync File is a valid one.
+ *
+ * On the driver side the fence is stored on the @fence parameter of
+ * struct &drm_plane_state. Drivers which also support implicit fencing
+ * should set the implicit fence using drm_atomic_set_fence_for_plane(),
+ * to make sure there's consistent behaviour between drivers in precedence
+ * of implicit vs. explicit fencing.
+ *
+ * "OUT_FENCE_PTR":
+ * Use this property to pass a file descriptor pointer to DRM. Once the
+ * Atomic Commit request call returns OUT_FENCE_PTR will be filled with
+ * the file descriptor number of a Sync File. This Sync File contains the
+ * CRTC fence that will be signaled when all framebuffers present on the
+ * Atomic Commit request for that given CRTC are scanned out on the
+ * screen.
+ *
+ * The Atomic Commit request fails if an invalid pointer is passed. If the
+ * Atomic Commit request fails for any other reason the out fence fd
+ * returned will be -1. On an Atomic Commit with the
+ * DRM_MODE_ATOMIC_TEST_ONLY flag the out fence will also be set to -1.
+ *
+ * Note that out-fences don't have a special interface to drivers and are
+ * internally represented by a struct &drm_pending_vblank_event in struct
+ * &drm_crtc_state, which is also used by the nonblocking atomic commit
+ * helpers and for the DRM event handling for existing userspace.
+ */
+/*
+ * Bookkeeping for one requested out-fence: the user pointer the fd number is
+ * written to, the sync_file created for the fence, and the fd obtained from
+ * get_unused_fd_flags() (see setup_out_fence()).
+ */
+struct drm_out_fence_state {
+ s32 __user *out_fence_ptr;
+ struct sync_file *sync_file;
+ int fd;
+};
+/*
+ * Reserve an O_CLOEXEC fd, copy its number to fence_state->out_fence_ptr and
+ * create a sync_file for @fence. The fd is not installed here; that is done
+ * (or undone) by complete_crtc_signaling().
+ *
+ * Returns 0 on success, the negative value from get_unused_fd_flags(),
+ * -EFAULT if the user copy fails or -ENOMEM if no sync_file could be created.
+ * Nothing is rolled back on failure: what was set up so far stays recorded in
+ * @fence_state.
+ */
+static int setup_out_fence(struct drm_out_fence_state *fence_state,
+ struct fence *fence)
+{
+ fence_state->fd = get_unused_fd_flags(O_CLOEXEC);
+ if (fence_state->fd < 0)
+ return fence_state->fd;
+
+ if (put_user(fence_state->fd, fence_state->out_fence_ptr))
+ return -EFAULT;
+
+ fence_state->sync_file = sync_file_create(fence);
+ if (!fence_state->sync_file)
+ return -ENOMEM;
+
+ return 0;
+}
+/*
+ * Set up completion signalling for every CRTC in @state. Returns 0 right away
+ * for DRM_MODE_ATOMIC_TEST_ONLY.
+ *
+ * For each CRTC:
+ * - a vblank event is allocated if DRM_MODE_PAGE_FLIP_EVENT is set or an
+ *   out-fence pointer was stored for the CRTC;
+ * - with DRM_MODE_PAGE_FLIP_EVENT and a non-NULL @file_priv, event space is
+ *   reserved with drm_event_reserve_init();
+ * - with an out-fence pointer, *@fence_state is grown by one zeroed entry
+ *   (krealloc), a CRTC fence is created, setup_out_fence() is called and the
+ *   fence is attached to the event.
+ *
+ * *@fence_state and *@num_fences are updated in place and are not released
+ * here, not even on error; complete_crtc_signaling() takes care of that.
+ *
+ * Returns 0 on success or a negative errno.
+ *
+ * NOTE(review): with DRM_MODE_PAGE_FLIP_EVENT set and a NULL @file_priv the
+ * "continue" below also skips the out-fence setup for that CRTC - confirm
+ * that this is intended.
+ */
+static int prepare_crtc_signaling(struct drm_device *dev,
+ struct drm_atomic_state *state,
+ struct drm_mode_atomic *arg,
+ struct drm_file *file_priv,
+ struct drm_out_fence_state **fence_state,
+ unsigned int *num_fences)
+{
+ struct drm_crtc *crtc;
+ struct drm_crtc_state *crtc_state;
+ int i, ret;
+
+ if (arg->flags & DRM_MODE_ATOMIC_TEST_ONLY)
+ return 0;
+
+ for_each_crtc_in_state(state, crtc, crtc_state, i) {
+ s32 __user *fence_ptr;
+
+ fence_ptr = get_out_fence_for_crtc(crtc_state->state, crtc);
+
+ if (arg->flags & DRM_MODE_PAGE_FLIP_EVENT || fence_ptr) {
+ struct drm_pending_vblank_event *e;
+
+ e = create_vblank_event(dev, arg->user_data);
+ if (!e)
+ return -ENOMEM;
+
+ crtc_state->event = e;
+ }
+
+ if (arg->flags & DRM_MODE_PAGE_FLIP_EVENT) {
+ struct drm_pending_vblank_event *e = crtc_state->event;
+
+ if (!file_priv)
+ continue;
+
+ ret = drm_event_reserve_init(dev, file_priv, &e->base,
+ &e->event.base);
+ if (ret) {
+ kfree(e);
+ crtc_state->event = NULL;
+ return ret;
+ }
+ }
+
+ if (fence_ptr) {
+ struct fence *fence;
+ struct drm_out_fence_state *f;
+
+ f = krealloc(*fence_state, sizeof(**fence_state) *
+ (*num_fences + 1), GFP_KERNEL);
+ if (!f)
+ return -ENOMEM;
+
+ memset(&f[*num_fences], 0, sizeof(*f));
+
+ f[*num_fences].out_fence_ptr = fence_ptr;
+ *fence_state = f;
+
+ fence = drm_crtc_create_fence(crtc);
+ if (!fence)
+ return -ENOMEM;
+
+ ret = setup_out_fence(&f[(*num_fences)++], fence); /* entry is counted even if this fails, so cleanup sees it */
+ if (ret) {
+ fence_put(fence);
+ return ret;
+ }
+
+ crtc_state->event->base.fence = fence;
+ }
+ }
+
+ return 0;
+}
+/*
+ * Finish what prepare_crtc_signaling() started and free @fence_state.
+ *
+ * With @install_fds true, every reserved fd is installed on the file of its
+ * sync_file and the array is freed.
+ *
+ * Otherwise the work is undone: events that carry a fence or a file_priv are
+ * cancelled and freed, and for every out-fence entry the sync_file reference
+ * is dropped, the reserved fd is released and -1 is written back to the user
+ * pointer (a failing write is only logged).
+ */
+static void complete_crtc_signaling(struct drm_device *dev,
+ struct drm_atomic_state *state,
+ struct drm_out_fence_state *fence_state,
+ unsigned int num_fences,
+ bool install_fds)
+{
+ struct drm_crtc *crtc;
+ struct drm_crtc_state *crtc_state;
+ int i;
+
+ if (install_fds) {
+ for (i = 0; i < num_fences; i++)
+ fd_install(fence_state[i].fd,
+ fence_state[i].sync_file->file);
+
+ kfree(fence_state);
+ return;
+ }
+
+ for_each_crtc_in_state(state, crtc, crtc_state, i) {
+ struct drm_pending_vblank_event *event = crtc_state->event;
+ /*
+ * Free the allocated event. drm_atomic_helper_setup_commit
+ * can allocate an event too, so only free it if it's ours
+ * to prevent a double free in drm_atomic_state_clear.
+ */
+ if (event && (event->base.fence || event->base.file_priv)) {
+ drm_event_cancel_free(dev, &event->base);
+ crtc_state->event = NULL;
+ }
+ }
+
+ if (!fence_state)
+ return;
+
+ for (i = 0; i < num_fences; i++) {
+ if (fence_state[i].sync_file)
+ fput(fence_state[i].sync_file->file);
+ if (fence_state[i].fd >= 0)
+ put_unused_fd(fence_state[i].fd);
+
+ /* If this fails log error to the user */
+ if (fence_state[i].out_fence_ptr &&
+ put_user(-1, fence_state[i].out_fence_ptr))
+ DRM_DEBUG_ATOMIC("Couldn't clear out_fence_ptr\n");
+ }
+
+ kfree(fence_state);
+}
+
int drm_mode_atomic_ioctl(struct drm_device *dev,
void *data, struct drm_file *file_priv)
{
@@ -1613,11 +1873,10 @@ int drm_mode_atomic_ioctl(struct drm_device *dev,
struct drm_atomic_state *state;
struct drm_modeset_acquire_ctx ctx;
struct drm_plane *plane;
- struct drm_crtc *crtc;
- struct drm_crtc_state *crtc_state;
+ struct drm_out_fence_state *fence_state = NULL;
unsigned plane_mask;
int ret = 0;
- unsigned int i, j;
+ unsigned int i, j, num_fences = 0;
/* disallow for drivers not supporting atomic: */
if (!drm_core_check_feature(dev, DRIVER_ATOMIC))
@@ -1732,20 +1991,10 @@ retry:
drm_mode_object_unreference(obj);
}
- if (arg->flags & DRM_MODE_PAGE_FLIP_EVENT) {
- for_each_crtc_in_state(state, crtc, crtc_state, i) {
- struct drm_pending_vblank_event *e;
-
- e = create_vblank_event(dev, file_priv, NULL,
- arg->user_data);
- if (!e) {
- ret = -ENOMEM;
- goto out;
- }
-
- crtc_state->event = e;
- }
- }
+ ret = prepare_crtc_signaling(dev, state, arg, file_priv, &fence_state,
+ &num_fences);
+ if (ret)
+ goto out;
if (arg->flags & DRM_MODE_ATOMIC_TEST_ONLY) {
/*
@@ -1762,20 +2011,7 @@ retry:
out:
drm_atomic_clean_old_fb(dev, plane_mask, ret);
- if (ret && arg->flags & DRM_MODE_PAGE_FLIP_EVENT) {
- /*
- * Free the allocated event. drm_atomic_helper_setup_commit
- * can allocate an event too, so only free it if it's ours
- * to prevent a double free in drm_atomic_state_clear.
- */
- for_each_crtc_in_state(state, crtc, crtc_state, i) {
- struct drm_pending_vblank_event *event = crtc_state->event;
- if (event && (event->base.fence || event->base.file_priv)) {
- drm_event_cancel_free(dev, &event->base);
- crtc_state->event = NULL;
- }
- }
- }
+ complete_crtc_signaling(dev, state, fence_state, num_fences, !ret);
if (ret == -EDEADLK) {
drm_atomic_state_clear(state);
diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c
index 50acd799babe..f34b4e8455a6 100644
--- a/drivers/gpu/drm/drm_atomic_helper.c
+++ b/drivers/gpu/drm/drm_atomic_helper.c
@@ -3166,6 +3166,9 @@ void __drm_atomic_helper_plane_destroy_state(struct drm_plane_state *state)
{
if (state->fb)
drm_framebuffer_unreference(state->fb);
+
+ if (state->fence)
+ fence_put(state->fence);
}
EXPORT_SYMBOL(__drm_atomic_helper_plane_destroy_state);
diff --git a/drivers/gpu/drm/drm_crtc.c b/drivers/gpu/drm/drm_crtc.c
index 2d7bedf28647..79b3d521c388 100644
--- a/drivers/gpu/drm/drm_crtc.c
+++ b/drivers/gpu/drm/drm_crtc.c
@@ -33,6 +33,7 @@
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/export.h>
+#include <linux/fence.h>
#include <drm/drmP.h>
#include <drm/drm_crtc.h>
#include <drm/drm_edid.h>
@@ -141,6 +142,54 @@ static void drm_crtc_unregister_all(struct drm_device *dev)
}
}
+static const struct fence_ops drm_crtc_fence_ops;
+/*
+ * Map a CRTC fence back to its CRTC. This works because
+ * drm_crtc_create_fence() initialises such fences with &crtc->fence_lock as
+ * their lock. Triggers BUG() for a fence that does not use
+ * drm_crtc_fence_ops.
+ */
+static struct drm_crtc *fence_to_crtc(struct fence *fence)
+{
+ BUG_ON(fence->ops != &drm_crtc_fence_ops);
+ return container_of(fence->lock, struct drm_crtc, fence_lock);
+}
+/* ->get_driver_name hook: name of the DRM driver of the fence's CRTC. */
+static const char *drm_crtc_fence_get_driver_name(struct fence *fence)
+{
+ struct drm_crtc *crtc = fence_to_crtc(fence);
+
+ return crtc->dev->driver->name;
+}
+/* ->get_timeline_name hook: the timeline_name string of the fence's CRTC. */
+static const char *drm_crtc_fence_get_timeline_name(struct fence *fence)
+{
+ struct drm_crtc *crtc = fence_to_crtc(fence);
+
+ return crtc->timeline_name;
+}
+/* ->enable_signaling hook: does no work and always reports true. */
+static bool drm_crtc_fence_enable_signaling(struct fence *fence)
+{
+ return true;
+}
+/* fence_ops for CRTC out-fences; waiting uses fence_default_wait(). */
+static const struct fence_ops drm_crtc_fence_ops = {
+ .get_driver_name = drm_crtc_fence_get_driver_name,
+ .get_timeline_name = drm_crtc_fence_get_timeline_name,
+ .enable_signaling = drm_crtc_fence_enable_signaling,
+ .wait = fence_default_wait,
+};
+/*
+ * Allocate a fence for @crtc and initialise it with drm_crtc_fence_ops,
+ * &crtc->fence_lock, crtc->fence_context and the pre-incremented
+ * crtc->fence_seqno. Returns the new fence, or NULL if the allocation fails.
+ */
+struct fence *drm_crtc_create_fence(struct drm_crtc *crtc)
+{
+ struct fence *fence;
+
+ fence = kzalloc(sizeof(*fence), GFP_KERNEL);
+ if (!fence)
+ return NULL;
+
+ fence_init(fence, &drm_crtc_fence_ops, &crtc->fence_lock,
+ crtc->fence_context, ++crtc->fence_seqno);
+
+ return fence;
+}
+
/**
* drm_crtc_init_with_planes - Initialise a new CRTC object with
* specified primary and cursor planes.
@@ -198,6 +247,11 @@ int drm_crtc_init_with_planes(struct drm_device *dev, struct drm_crtc *crtc,
return -ENOMEM;
}
+ crtc->fence_context = fence_context_alloc(1);
+ spin_lock_init(&crtc->fence_lock);
+ snprintf(crtc->timeline_name, sizeof(crtc->timeline_name),
+ "CRTC:%d-%s", crtc->base.id, crtc->name);
+
crtc->base.properties = &crtc->properties;
list_add_tail(&crtc->head, &config->crtc_list);
@@ -213,6 +267,8 @@ int drm_crtc_init_with_planes(struct drm_device *dev, struct drm_crtc *crtc,
if (drm_core_check_feature(dev, DRIVER_ATOMIC)) {
drm_object_attach_property(&crtc->base, config->prop_active, 0);
drm_object_attach_property(&crtc->base, config->prop_mode_id, 0);
+ drm_object_attach_property(&crtc->base,
+ config->prop_out_fence_ptr, 0);
}
return 0;
@@ -365,6 +421,18 @@ static int drm_mode_create_standard_properties(struct drm_device *dev)
return -ENOMEM;
dev->mode_config.prop_fb_id = prop;
+ prop = drm_property_create_signed_range(dev, DRM_MODE_PROP_ATOMIC,
+ "IN_FENCE_FD", -1, INT_MAX);
+ if (!prop)
+ return -ENOMEM;
+ dev->mode_config.prop_in_fence_fd = prop;
+
+ prop = drm_property_create_range(dev, DRM_MODE_PROP_ATOMIC,
+ "OUT_FENCE_PTR", 0, U64_MAX);
+ if (!prop)
+ return -ENOMEM;
+ dev->mode_config.prop_out_fence_ptr = prop;
+
prop = drm_property_create_object(dev, DRM_MODE_PROP_ATOMIC,
"CRTC_ID", DRM_MODE_OBJECT_CRTC);
if (!prop)
diff --git a/drivers/gpu/drm/drm_crtc_internal.h b/drivers/gpu/drm/drm_crtc_internal.h
index c48ba02c5365..df2b51a4f75e 100644
--- a/drivers/gpu/drm/drm_crtc_internal.h
+++ b/drivers/gpu/drm/drm_crtc_internal.h
@@ -41,6 +41,8 @@ int drm_crtc_check_viewport(const struct drm_crtc *crtc,
const struct drm_display_mode *mode,
const struct drm_framebuffer *fb);
+struct fence *drm_crtc_create_fence(struct drm_crtc *crtc);
+
void drm_fb_release(struct drm_file *file_priv);
/* dumb buffer support IOCTLs */
diff --git a/drivers/gpu/drm/drm_fb_cma_helper.c b/drivers/gpu/drm/drm_fb_cma_helper.c
index 1fd6eac1400c..52629b62b002 100644
--- a/drivers/gpu/drm/drm_fb_cma_helper.c
+++ b/drivers/gpu/drm/drm_fb_cma_helper.c
@@ -18,13 +18,16 @@
*/
#include <drm/drmP.h>
+#include <drm/drm_atomic.h>
#include <drm/drm_crtc.h>
#include <drm/drm_fb_helper.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_gem_cma_helper.h>
#include <drm/drm_fb_cma_helper.h>
+#include <linux/dma-buf.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
+#include <linux/reservation.h>
#define DEFAULT_FBDEFIO_DELAY_MS 50
@@ -265,6 +268,38 @@ struct drm_gem_cma_object *drm_fb_cma_get_gem_obj(struct drm_framebuffer *fb,
}
EXPORT_SYMBOL_GPL(drm_fb_cma_get_gem_obj);
+/**
+ * drm_fb_cma_prepare_fb() - Prepare a CMA framebuffer for scanout
+ * @plane: Plane the framebuffer is going to be shown on
+ * @state: New plane state to attach the fence to
+ *
+ * This should be used as the prepare_fb hook of
+ * struct &drm_plane_helper_funcs.
+ *
+ * If the new framebuffer differs from the current one and its first GEM
+ * object has a dma-buf attached, the dma-buf's exclusive fence is extracted
+ * and attached to @state for the atomic helper to wait on.
+ *
+ * There is no need for cleanup_fb for CMA based framebuffer drivers.
+ *
+ * Return: always 0.
+ */
+int drm_fb_cma_prepare_fb(struct drm_plane *plane,
+			  struct drm_plane_state *state)
+{
+	struct dma_buf *buf;
+
+	/* Nothing to do when the framebuffer is unchanged ... */
+	if (plane->state->fb == state->fb)
+		return 0;
+	/* ... or when the plane is being disabled. */
+	if (!state->fb)
+		return 0;
+
+	buf = drm_fb_cma_get_gem_obj(state->fb, 0)->base.dma_buf;
+	if (!buf)
+		return 0;
+
+	drm_atomic_set_fence_for_plane(state,
+			reservation_object_get_excl_rcu(buf->resv));
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(drm_fb_cma_prepare_fb);
+
#ifdef CONFIG_DEBUG_FS
static void drm_fb_cma_describe(struct drm_framebuffer *fb, struct seq_file *m)
{
diff --git a/drivers/gpu/drm/drm_fops.c b/drivers/gpu/drm/drm_fops.c
index c37b7b5f1dd3..129b80b12984 100644
--- a/drivers/gpu/drm/drm_fops.c
+++ b/drivers/gpu/drm/drm_fops.c
@@ -664,6 +664,10 @@ void drm_event_cancel_free(struct drm_device *dev,
list_del(&p->pending_link);
}
spin_unlock_irqrestore(&dev->event_lock, flags);
+
+ if (p->fence)
+ fence_put(p->fence);
+
kfree(p);
}
EXPORT_SYMBOL(drm_event_cancel_free);
diff --git a/drivers/gpu/drm/drm_plane.c b/drivers/gpu/drm/drm_plane.c
index 249c0ae52c6d..3957ef8f026b 100644
--- a/drivers/gpu/drm/drm_plane.c
+++ b/drivers/gpu/drm/drm_plane.c
@@ -137,6 +137,7 @@ int drm_universal_plane_init(struct drm_device *dev, struct drm_plane *plane,
if (drm_core_check_feature(dev, DRIVER_ATOMIC)) {
drm_object_attach_property(&plane->base, config->prop_fb_id, 0);
+ drm_object_attach_property(&plane->base, config->prop_in_fence_fd, -1);
drm_object_attach_property(&plane->base, config->prop_crtc_id, 0);
drm_object_attach_property(&plane->base, config->prop_crtc_x, 0);
drm_object_attach_property(&plane->base, config->prop_crtc_y, 0);
diff --git a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c
index 83bf997dda03..5e67e8b2b685 100644
--- a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c
+++ b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c
@@ -218,9 +218,10 @@ mdp5_plane_duplicate_state(struct drm_plane *plane)
mdp5_state = kmemdup(to_mdp5_plane_state(plane->state),
sizeof(*mdp5_state), GFP_KERNEL);
+ if (!mdp5_state)
+ return NULL;
- if (mdp5_state && mdp5_state->base.fb)
- drm_framebuffer_reference(mdp5_state->base.fb);
+ __drm_atomic_helper_plane_duplicate_state(plane, &mdp5_state->base);
mdp5_state->mode_changed = false;
mdp5_state->pending = false;
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 52a2a1a75682..5a184089dfa9 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -1606,7 +1606,14 @@ EXPORT_SYMBOL(ttm_bo_unmap_virtual);
int ttm_bo_wait(struct ttm_buffer_object *bo,
bool interruptible, bool no_wait)
{
- long timeout = no_wait ? 0 : 15 * HZ;
+ long timeout = 15 * HZ;
+
+ if (no_wait) {
+ if (reservation_object_test_signaled_rcu(bo->resv, true))
+ return 0;
+ else
+ return -EBUSY;
+ }
timeout = reservation_object_wait_timeout_rcu(bo->resv, true,
interruptible, timeout);
diff --git a/drivers/hid/uhid.c b/drivers/hid/uhid.c
index d02ee5304217..7256647d9418 100644
--- a/drivers/hid/uhid.c
+++ b/drivers/hid/uhid.c
@@ -30,6 +30,8 @@
#define UHID_NAME "uhid"
#define UHID_BUFSIZE 32
+static DEFINE_MUTEX(uhid_open_mutex);
+
struct uhid_device {
struct mutex devlock;
bool running;
@@ -144,15 +146,26 @@ static void uhid_hid_stop(struct hid_device *hid)
+/*
+ * Open callback.  Openers are counted in hid->open under uhid_open_mutex;
+ * only the first opener queues UHID_OPEN, and the count is rolled back if
+ * queueing that event fails.  Returns 0 or the uhid_queue_event() error.
+ */
static int uhid_hid_open(struct hid_device *hid)
{
struct uhid_device *uhid = hid->driver_data;
+ int retval = 0;
- return uhid_queue_event(uhid, UHID_OPEN);
+ mutex_lock(&uhid_open_mutex);
+ if (!hid->open++) {
+ retval = uhid_queue_event(uhid, UHID_OPEN);
+ if (retval)
+ hid->open--;
+ }
+ mutex_unlock(&uhid_open_mutex);
+ return retval;
}
+/*
+ * Close callback.  Drops one opener from hid->open under uhid_open_mutex and
+ * queues UHID_CLOSE only when the count reaches zero.
+ */
static void uhid_hid_close(struct hid_device *hid)
{
struct uhid_device *uhid = hid->driver_data;
- uhid_queue_event(uhid, UHID_CLOSE);
+ mutex_lock(&uhid_open_mutex);
+ if (!--hid->open)
+ uhid_queue_event(uhid, UHID_CLOSE);
+ mutex_unlock(&uhid_open_mutex);
}
static int uhid_hid_parse(struct hid_device *hid)
diff --git a/drivers/input/Kconfig b/drivers/input/Kconfig
index 6261874c07c9..34ffa0257b4b 100644
--- a/drivers/input/Kconfig
+++ b/drivers/input/Kconfig
@@ -187,6 +187,19 @@ config INPUT_APMPOWER
To compile this driver as a module, choose M here: the
module will be called apm-power.
+config INPUT_KEYRESET
+ bool "Reset key"
+ depends on INPUT
+ select INPUT_KEYCOMBO
+ ---help---
+ Say Y here if you want the system to reboot when a configured
+ combination of keys is pressed.
+
+config INPUT_KEYCOMBO
+ bool "Key combo"
+ depends on INPUT
+ ---help---
+ Say Y here if you want an action to be taken when a configured
+ combination of keys is pressed.
+
comment "Input Device Drivers"
source "drivers/input/keyboard/Kconfig"
diff --git a/drivers/input/Makefile b/drivers/input/Makefile
index 595820bbabe9..6a3281ca3306 100644
--- a/drivers/input/Makefile
+++ b/drivers/input/Makefile
@@ -26,5 +26,7 @@ obj-$(CONFIG_INPUT_TOUCHSCREEN) += touchscreen/
obj-$(CONFIG_INPUT_MISC) += misc/
obj-$(CONFIG_INPUT_APMPOWER) += apm-power.o
+obj-$(CONFIG_INPUT_KEYRESET) += keyreset.o
+obj-$(CONFIG_INPUT_KEYCOMBO) += keycombo.o
obj-$(CONFIG_RMI4_CORE) += rmi4/
diff --git a/drivers/input/keyboard/goldfish_events.c b/drivers/input/keyboard/goldfish_events.c
index f6e643b589b6..c877e56a9bd5 100644
--- a/drivers/input/keyboard/goldfish_events.c
+++ b/drivers/input/keyboard/goldfish_events.c
@@ -17,6 +17,7 @@
#include <linux/interrupt.h>
#include <linux/types.h>
#include <linux/input.h>
+#include <linux/input/mt.h>
#include <linux/kernel.h>
#include <linux/platform_device.h>
#include <linux/slab.h>
@@ -24,6 +25,8 @@
#include <linux/io.h>
#include <linux/acpi.h>
+#define GOLDFISH_MAX_FINGERS 5
+
enum {
REG_READ = 0x00,
REG_SET_PAGE = 0x00,
@@ -52,7 +55,21 @@ static irqreturn_t events_interrupt(int irq, void *dev_id)
value = __raw_readl(edev->addr + REG_READ);
input_event(edev->input, type, code, value);
- input_sync(edev->input);
+ // Send an extra (EV_SYN, SYN_REPORT, 0x0) event
+ // if a key was pressed. Some keyboard device
+ // drivers may only send the EV_KEY event and
+ // not EV_SYN.
+ // Note that sending an extra SYN_REPORT is neither
+ // necessary nor correct protocol with other
+ // devices such as touchscreens, which will send
+ // their own SYN_REPORT's when sufficient event
+ // information has been collected (e.g., for
+ // touchscreens, when pressure and X/Y coordinates
+ // have been received). Hence, we will only send
+ // this extra SYN_REPORT if type == EV_KEY.
+ if (type == EV_KEY) {
+ input_sync(edev->input);
+ }
return IRQ_HANDLED;
}
@@ -154,6 +171,15 @@ static int events_probe(struct platform_device *pdev)
input_dev->name = edev->name;
input_dev->id.bustype = BUS_HOST;
+ // Set the Goldfish Device to be multi-touch.
+ // In the Ranchu kernel, there is multi-touch-specific
+ // code for handling ABS_MT_SLOT events.
+ // See drivers/input/input.c:input_handle_abs_event.
+ // If we do not issue input_mt_init_slots,
+ // the kernel will filter out needed ABS_MT_SLOT
+ // events when we touch the screen in more than one place,
+ // preventing multi-touch with more than one finger from working.
+ input_mt_init_slots(input_dev, GOLDFISH_MAX_FINGERS, 0);
events_import_bits(edev, input_dev->evbit, EV_SYN, EV_MAX);
events_import_bits(edev, input_dev->keybit, EV_KEY, KEY_MAX);
diff --git a/drivers/input/keycombo.c b/drivers/input/keycombo.c
new file mode 100644
index 000000000000..2fba451b91d5
--- /dev/null
+++ b/drivers/input/keycombo.c
@@ -0,0 +1,261 @@
+/* drivers/input/keycombo.c
+ *
+ * Copyright (C) 2014 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/input.h>
+#include <linux/keycombo.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/reboot.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+/* Per-device state of one key combination watcher. */
+struct keycombo_state {
+ /* Handler registered with the input core; connect() recovers the state from it. */
+ struct input_handler input_handler;
+ /* Every key this combo cares about (keys_down and keys_up). */
+ unsigned long keybit[BITS_TO_LONGS(KEY_CNT)];
+ /* Subset of keybit taken from keys_up. */
+ unsigned long upbit[BITS_TO_LONGS(KEY_CNT)];
+ /* Current pressed/released state of the watched keys. */
+ unsigned long key[BITS_TO_LONGS(KEY_CNT)];
+ /* Taken in keycombo_event() around the key bitmap and counters. */
+ spinlock_t lock;
+ /* Ordered workqueue that runs key_down_work and key_up_work. */
+ struct workqueue_struct *wq;
+ /* Number of keys_down entries; combo is active when key_down reaches it. */
+ int key_down_target;
+ /* Count of keys_down keys currently pressed. */
+ int key_down;
+ /* Count of keys_up keys currently pressed; must be 0 for the combo to be active. */
+ int key_up;
+ /* Delayed work that calls key_down_fn. */
+ struct delayed_work key_down_work;
+ /* Delay before key_down_work runs, in jiffies. */
+ int delay;
+ /* Work that calls key_up_fn and then relaxes combo_up_wake_source. */
+ struct work_struct key_up_work;
+ /* Optional callbacks copied from platform data; both receive priv. */
+ void (*key_up_fn)(void *);
+ void (*key_down_fn)(void *);
+ void *priv;
+ /* Non-zero while the combo is considered held. */
+ int key_is_down;
+ /* Kept awake from combo activation until the combo is broken. */
+ struct wakeup_source combo_held_wake_source;
+ /* Kept awake from queueing key_up_work until do_key_up() finishes. */
+ struct wakeup_source combo_up_wake_source;
+};
+
+/* Delayed-work handler: run the optional key-down callback for the combo. */
+static void do_key_down(struct work_struct *work)
+{
+	struct keycombo_state *combo =
+		container_of(work, struct keycombo_state, key_down_work.work);
+
+	if (!combo->key_down_fn)
+		return;
+
+	combo->key_down_fn(combo->priv);
+}
+
+/*
+ * Work handler: run the optional key-up callback, then release the wakeup
+ * source that was taken when this work was queued.
+ */
+static void do_key_up(struct work_struct *work)
+{
+	struct keycombo_state *combo;
+
+	combo = container_of(work, struct keycombo_state, key_up_work);
+
+	if (combo->key_up_fn != NULL)
+		combo->key_up_fn(combo->priv);
+
+	__pm_relax(&combo->combo_up_wake_source);
+}
+
+/*
+ * Input event callback: track the watched keys and arm/disarm the combo.
+ *
+ * Only EV_KEY events for keys in state->keybit are considered.  Under
+ * state->lock the key bitmap and the key_down/key_up counters are updated.
+ * When every keys_down key is held and no keys_up key is held, the delayed
+ * key-down work is queued and combo_held_wake_source is held.  When that
+ * condition stops being true, the pending key-down work is cancelled; if it
+ * could not be cancelled (it has already started or finished), the key-up
+ * work is queued instead.
+ */
+static void keycombo_event(struct input_handle *handle, unsigned int type,
+			   unsigned int code, int value)
+{
+	unsigned long flags;
+	struct keycombo_state *state = handle->private;
+
+	if (type != EV_KEY)
+		return;
+
+	if (code >= KEY_MAX)
+		return;
+
+	if (!test_bit(code, state->keybit))
+		return;
+
+	spin_lock_irqsave(&state->lock, flags);
+	/* Ignore events that do not change the recorded state (e.g. repeats). */
+	if (!test_bit(code, state->key) == !value)
+		goto done;
+	__change_bit(code, state->key);
+	if (test_bit(code, state->upbit)) {
+		if (value)
+			state->key_up++;
+		else
+			state->key_up--;
+	} else {
+		if (value)
+			state->key_down++;
+		else
+			state->key_down--;
+	}
+	if (state->key_down == state->key_down_target && state->key_up == 0) {
+		__pm_stay_awake(&state->combo_held_wake_source);
+		state->key_is_down = 1;
+		/*
+		 * queue_delayed_work() returns false when the work was
+		 * already pending, so that is the case worth reporting.
+		 */
+		if (!queue_delayed_work(state->wq, &state->key_down_work,
+					state->delay))
+			pr_debug("Key down work already queued!\n");
+	} else if (state->key_is_down) {
+		/* Too late to cancel: key-down ran, so deliver key-up. */
+		if (!cancel_delayed_work(&state->key_down_work)) {
+			__pm_stay_awake(&state->combo_up_wake_source);
+			queue_work(state->wq, &state->key_up_work);
+		}
+		__pm_relax(&state->combo_held_wake_source);
+		state->key_is_down = 0;
+	}
+done:
+	spin_unlock_irqrestore(&state->lock, flags);
+}
+
+/*
+ * Input-handler connect callback.
+ *
+ * Attaches to @dev only if it can generate at least one of the keys this
+ * combo watches; otherwise returns -ENODEV.  On success a freshly allocated
+ * handle is registered and the device opened.  Returns -ENOMEM if the handle
+ * cannot be allocated, or the error from registering/opening.
+ */
+static int keycombo_connect(struct input_handler *handler,
+			    struct input_dev *dev,
+			    const struct input_device_id *id)
+{
+	struct keycombo_state *state =
+		container_of(handler, struct keycombo_state, input_handler);
+	struct input_handle *handle;
+	int key = 0;
+	int err;
+
+	/* Look for the first key that is both watched and provided by dev. */
+	while (key < KEY_MAX &&
+	       !(test_bit(key, state->keybit) && test_bit(key, dev->keybit)))
+		key++;
+	if (key == KEY_MAX)
+		return -ENODEV;
+
+	handle = kzalloc(sizeof(*handle), GFP_KERNEL);
+	if (handle == NULL)
+		return -ENOMEM;
+
+	handle->dev = dev;
+	handle->handler = handler;
+	handle->name = KEYCOMBO_NAME;
+	handle->private = state;
+
+	err = input_register_handle(handle);
+	if (err) {
+		kfree(handle);
+		return err;
+	}
+
+	err = input_open_device(handle);
+	if (err) {
+		input_unregister_handle(handle);
+		kfree(handle);
+		return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Input-handler disconnect callback: undo keycombo_connect() in reverse
+ * order (close, unregister, free the handle).
+ */
+static void keycombo_disconnect(struct input_handle *handle)
+{
+ input_close_device(handle);
+ input_unregister_handle(handle);
+ kfree(handle);
+}
+
+/* Match any input device that advertises EV_KEY; connect() filters further. */
+static const struct input_device_id keycombo_ids[] = {
+ {
+ .flags = INPUT_DEVICE_ID_MATCH_EVBIT,
+ .evbit = { BIT_MASK(EV_KEY) },
+ },
+ { },
+};
+MODULE_DEVICE_TABLE(input, keycombo_ids);
+
+/*
+ * Platform probe: build a keycombo_state from the platform data and register
+ * an input handler for it.
+ *
+ * keys_down and the optional keys_up are zero-terminated key lists; entries
+ * >= KEY_MAX are ignored.  Returns 0 on success, -EINVAL without platform
+ * data, -ENOMEM on allocation failure, or the input_register_handler()
+ * error.  Every failure path releases what was set up before it; in
+ * particular the wakeup sources are unregistered before the state that
+ * embeds them is freed.
+ */
+static int keycombo_probe(struct platform_device *pdev)
+{
+	int ret;
+	int key, *keyp;
+	struct keycombo_state *state;
+	struct keycombo_platform_data *pdata = pdev->dev.platform_data;
+
+	if (!pdata)
+		return -EINVAL;
+
+	state = kzalloc(sizeof(*state), GFP_KERNEL);
+	if (!state)
+		return -ENOMEM;
+
+	spin_lock_init(&state->lock);
+	keyp = pdata->keys_down;
+	while ((key = *keyp++)) {
+		if (key >= KEY_MAX)
+			continue;
+		state->key_down_target++;
+		__set_bit(key, state->keybit);
+	}
+	if (pdata->keys_up) {
+		keyp = pdata->keys_up;
+		while ((key = *keyp++)) {
+			if (key >= KEY_MAX)
+				continue;
+			__set_bit(key, state->keybit);
+			__set_bit(key, state->upbit);
+		}
+	}
+
+	state->wq = alloc_ordered_workqueue("keycombo", 0);
+	if (!state->wq) {
+		ret = -ENOMEM;
+		goto err_free_state;
+	}
+
+	state->priv = pdata->priv;
+
+	/* state is zeroed, so a NULL callback in pdata simply stays NULL. */
+	state->key_down_fn = pdata->key_down_fn;
+	INIT_DELAYED_WORK(&state->key_down_work, do_key_down);
+
+	state->key_up_fn = pdata->key_up_fn;
+	INIT_WORK(&state->key_up_work, do_key_up);
+
+	wakeup_source_init(&state->combo_held_wake_source, "key combo");
+	wakeup_source_init(&state->combo_up_wake_source, "key combo up");
+	state->delay = msecs_to_jiffies(pdata->key_down_delay);
+
+	state->input_handler.event = keycombo_event;
+	state->input_handler.connect = keycombo_connect;
+	state->input_handler.disconnect = keycombo_disconnect;
+	state->input_handler.name = KEYCOMBO_NAME;
+	state->input_handler.id_table = keycombo_ids;
+	ret = input_register_handler(&state->input_handler);
+	if (ret)
+		goto err_trash_wakeup_sources;
+
+	platform_set_drvdata(pdev, state);
+	return 0;
+
+err_trash_wakeup_sources:
+	/* Must be unregistered before the memory holding them is freed. */
+	wakeup_source_trash(&state->combo_up_wake_source);
+	wakeup_source_trash(&state->combo_held_wake_source);
+	destroy_workqueue(state->wq);
+err_free_state:
+	kfree(state);
+	return ret;
+}
+
+/*
+ * Platform remove: tear down everything keycombo_probe() set up.
+ *
+ * After the handler is unregistered no new events arrive, so the delayed
+ * key-down work is cancelled (its timer could otherwise fire after the state
+ * is freed), the workqueue is destroyed, and the wakeup sources are
+ * unregistered before the state that embeds them is freed.
+ */
+int keycombo_remove(struct platform_device *pdev)
+{
+	struct keycombo_state *state = platform_get_drvdata(pdev);
+
+	input_unregister_handler(&state->input_handler);
+	cancel_delayed_work_sync(&state->key_down_work);
+	destroy_workqueue(state->wq);
+	wakeup_source_trash(&state->combo_up_wake_source);
+	wakeup_source_trash(&state->combo_held_wake_source);
+	kfree(state);
+	return 0;
+}
+
+
+/* Platform driver bound to devices named KEYCOMBO_NAME. */
+struct platform_driver keycombo_driver = {
+ .driver.name = KEYCOMBO_NAME,
+ .probe = keycombo_probe,
+ .remove = keycombo_remove,
+};
+
+/* Init entry point: register the keycombo platform driver. */
+static int __init keycombo_init(void)
+{
+ return platform_driver_register(&keycombo_driver);
+}
+
+/* Exit entry point: unregister the keycombo platform driver. */
+static void __exit keycombo_exit(void)
+{
+	platform_driver_unregister(&keycombo_driver);
+}
+
+module_init(keycombo_init);
+module_exit(keycombo_exit);
diff --git a/drivers/input/keyreset.c b/drivers/input/keyreset.c
new file mode 100644
index 000000000000..7e5222aec7c1
--- /dev/null
+++ b/drivers/input/keyreset.c
@@ -0,0 +1,144 @@
+/* drivers/input/keyreset.c
+ *
+ * Copyright (C) 2014 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/input.h>
+#include <linux/keyreset.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/reboot.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+#include <linux/keycombo.h>
+
+/* Per-device state of the key-reset driver. */
+struct keyreset_state {
+ /* Non-zero once a reset has been requested; a second trigger panics. */
+ int restart_requested;
+ /* Optional board reset hook from platform data; its return value is stored in restart_requested. */
+ int (*reset_fn)(void);
+ /* Child keycombo platform device that detects the key combination. */
+ struct platform_device *pdev_child;
+ /* Work item that performs the orderly reboot when no reset_fn is set. */
+ struct work_struct restart_work;
+};
+
+/* Work handler for restart_work: request an orderly reboot. */
+static void do_restart(struct work_struct *unused)
+{
+ orderly_reboot();
+}
+
+/*
+ * Key-down callback handed to the child keycombo device.
+ *
+ * The first trigger either calls the board-supplied reset_fn (recording its
+ * return value) or schedules an orderly reboot.  If a reset was already
+ * requested and the combo fires again, the kernel panics.
+ */
+static void do_reset_fn(void *priv)
+{
+	struct keyreset_state *ks = priv;
+
+	if (ks->restart_requested)
+		panic("keyboard reset failed, %d", ks->restart_requested);
+
+	if (!ks->reset_fn) {
+		pr_info("keyboard reset\n");
+		schedule_work(&ks->restart_work);
+		ks->restart_requested = 1;
+		return;
+	}
+
+	ks->restart_requested = ks->reset_fn();
+}
+
+/*
+ * Platform probe: create a child keycombo device that calls do_reset_fn()
+ * when the configured key combination is held.
+ *
+ * The key lists from the keyreset platform data are zero-terminated; entries
+ * >= KEY_MAX are dropped while building the child's platform data.  The
+ * child's lists are allocated zeroed with one extra slot, which provides the
+ * terminator.
+ *
+ * Returns 0 on success, -EINVAL without platform data, -ENOMEM on allocation
+ * failure, or the error from adding the child's data or device.  On any
+ * failure after the child was allocated, its reference is dropped.
+ */
+static int keyreset_probe(struct platform_device *pdev)
+{
+	int ret = -ENOMEM;
+	struct keycombo_platform_data *pdata_child;
+	struct keyreset_platform_data *pdata = pdev->dev.platform_data;
+	int up_size = 0, down_size = 0, size;
+	int key, *keyp;
+	int n;
+	struct keyreset_state *state;
+
+	if (!pdata)
+		return -EINVAL;
+	state = devm_kzalloc(&pdev->dev, sizeof(*state), GFP_KERNEL);
+	if (!state)
+		return -ENOMEM;
+
+	state->pdev_child = platform_device_alloc(KEYCOMBO_NAME,
+						  PLATFORM_DEVID_AUTO);
+	if (!state->pdev_child)
+		return -ENOMEM;
+	state->pdev_child->dev.parent = &pdev->dev;
+	INIT_WORK(&state->restart_work, do_restart);
+
+	/* First pass: count the valid keys in each list. */
+	keyp = pdata->keys_down;
+	while ((key = *keyp++)) {
+		if (key >= KEY_MAX)
+			continue;
+		down_size++;
+	}
+	if (pdata->keys_up) {
+		keyp = pdata->keys_up;
+		while ((key = *keyp++)) {
+			if (key >= KEY_MAX)
+				continue;
+			up_size++;
+		}
+	}
+	size = sizeof(struct keycombo_platform_data)
+			+ sizeof(int) * (down_size + 1);
+	pdata_child = devm_kzalloc(&pdev->dev, size, GFP_KERNEL);
+	if (!pdata_child)
+		goto error;
+
+	/*
+	 * Second pass: copy exactly the keys that were counted, so that an
+	 * invalid entry in the middle of a list cannot push a valid key out.
+	 */
+	n = 0;
+	keyp = pdata->keys_down;
+	while ((key = *keyp++)) {
+		if (key >= KEY_MAX)
+			continue;
+		pdata_child->keys_down[n++] = key;
+	}
+	if (up_size > 0) {
+		/* Size in ints, not bytes: up_size keys plus the terminator. */
+		pdata_child->keys_up = devm_kzalloc(&pdev->dev,
+						    sizeof(int) * (up_size + 1),
+						    GFP_KERNEL);
+		if (!pdata_child->keys_up)
+			goto error;
+		n = 0;
+		keyp = pdata->keys_up;
+		while ((key = *keyp++)) {
+			if (key >= KEY_MAX)
+				continue;
+			pdata_child->keys_up[n++] = key;
+		}
+	}
+	state->reset_fn = pdata->reset_fn;
+	pdata_child->key_down_fn = do_reset_fn;
+	pdata_child->priv = state;
+	pdata_child->key_down_delay = pdata->key_down_delay;
+	ret = platform_device_add_data(state->pdev_child, pdata_child, size);
+	if (ret)
+		goto error;
+	platform_set_drvdata(pdev, state);
+	ret = platform_device_add(state->pdev_child);
+	if (ret)
+		goto error;
+	return 0;
+error:
+	platform_device_put(state->pdev_child);
+	return ret;
+}
+
+/*
+ * Platform remove: take down the child keycombo device.
+ *
+ * The child was added with platform_device_add() in probe, so it has to be
+ * unregistered (deleted from the bus and then released), not merely put.
+ */
+int keyreset_remove(struct platform_device *pdev)
+{
+	struct keyreset_state *state = platform_get_drvdata(pdev);
+
+	platform_device_unregister(state->pdev_child);
+	return 0;
+}
+
+
+/* Platform driver bound to devices named KEYRESET_NAME. */
+struct platform_driver keyreset_driver = {
+ .driver.name = KEYRESET_NAME,
+ .probe = keyreset_probe,
+ .remove = keyreset_remove,
+};
+
+/* Init entry point: register the keyreset platform driver. */
+static int __init keyreset_init(void)
+{
+ return platform_driver_register(&keyreset_driver);
+}
+
+/* Exit entry point: unregister the keyreset platform driver. */
+static void __exit keyreset_exit(void)
+{
+	platform_driver_unregister(&keyreset_driver);
+}
+
+module_init(keyreset_init);
+module_exit(keyreset_exit);
diff --git a/drivers/input/misc/Kconfig b/drivers/input/misc/Kconfig
index 7ffb614ce566..94360fe63f41 100644
--- a/drivers/input/misc/Kconfig
+++ b/drivers/input/misc/Kconfig
@@ -367,6 +367,17 @@ config INPUT_ATI_REMOTE2
To compile this driver as a module, choose M here: the module will be
called ati_remote2.
+config INPUT_KEYCHORD
+ tristate "Key chord input driver support"
+ help
+ Say Y here if you want to enable the key chord driver
+ accessible at /dev/keychord. This driver can be used
+ for receiving notifications when client specified key
+ combinations are pressed.
+
+ To compile this driver as a module, choose M here: the
+ module will be called keychord.
+
config INPUT_KEYSPAN_REMOTE
tristate "Keyspan DMR USB remote control"
depends on USB_ARCH_HAS_HCD
@@ -535,6 +546,11 @@ config INPUT_SGI_BTNS
To compile this driver as a module, choose M here: the
module will be called sgi_btns.
+config INPUT_GPIO
+ tristate "GPIO driver support"
+ help
+ Say Y here if you want to support gpio based keys, wheels etc...
+
config HP_SDC_RTC
tristate "HP SDC Real Time Clock"
depends on (GSC || HP300) && SERIO
diff --git a/drivers/input/misc/Makefile b/drivers/input/misc/Makefile
index 0b6d025f0487..64bf231faf8c 100644
--- a/drivers/input/misc/Makefile
+++ b/drivers/input/misc/Makefile
@@ -36,10 +36,12 @@ obj-$(CONFIG_INPUT_GP2A) += gp2ap002a00f.o
obj-$(CONFIG_INPUT_GPIO_BEEPER) += gpio-beeper.o
obj-$(CONFIG_INPUT_GPIO_TILT_POLLED) += gpio_tilt_polled.o
obj-$(CONFIG_INPUT_GPIO_DECODER) += gpio_decoder.o
+obj-$(CONFIG_INPUT_GPIO) += gpio_event.o gpio_matrix.o gpio_input.o gpio_output.o gpio_axis.o
obj-$(CONFIG_INPUT_HISI_POWERKEY) += hisi_powerkey.o
obj-$(CONFIG_HP_SDC_RTC) += hp_sdc_rtc.o
obj-$(CONFIG_INPUT_IMS_PCU) += ims-pcu.o
obj-$(CONFIG_INPUT_IXP4XX_BEEPER) += ixp4xx-beeper.o
+obj-$(CONFIG_INPUT_KEYCHORD) += keychord.o
obj-$(CONFIG_INPUT_KEYSPAN_REMOTE) += keyspan_remote.o
obj-$(CONFIG_INPUT_KXTJ9) += kxtj9.o
obj-$(CONFIG_INPUT_M68K_BEEP) += m68kspkr.o
diff --git a/drivers/input/misc/gpio_axis.c b/drivers/input/misc/gpio_axis.c
new file mode 100644
index 000000000000..0acf4a576f53
--- /dev/null
+++ b/drivers/input/misc/gpio_axis.c
@@ -0,0 +1,192 @@
+/* drivers/input/misc/gpio_axis.c
+ *
+ * Copyright (C) 2007 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/gpio.h>
+#include <linux/gpio_event.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+
+/* Runtime state of one GPIO-encoded axis. */
+struct gpio_axis_state {
+ /* Input devices events are reported on; indexed by info->dev. */
+ struct gpio_event_input_devs *input_devs;
+ /* Static axis description (GPIO list, decode map, flags, event type/code). */
+ struct gpio_event_axis_info *info;
+ /* Last decoded position, updated on every gpio_event_update_axis() call. */
+ uint32_t pos;
+};
+
+/* Raw 4-bit Gray code (index) to linear position (value). */
+uint16_t gpio_axis_4bit_gray_map_table[] = {
+	[0x0] = 0x0, [0x1] = 0x1, /* 0000 0001 */
+	[0x3] = 0x2, [0x2] = 0x3, /* 0011 0010 */
+	[0x6] = 0x4, [0x7] = 0x5, /* 0110 0111 */
+	[0x5] = 0x6, [0x4] = 0x7, /* 0101 0100 */
+	[0xc] = 0x8, [0xd] = 0x9, /* 1100 1101 */
+	[0xf] = 0xa, [0xe] = 0xb, /* 1111 1110 */
+	[0xa] = 0xc, [0xb] = 0xd, /* 1010 1011 */
+	[0x9] = 0xe, [0x8] = 0xf, /* 1001 1000 */
+};
+
+/*
+ * Decode a raw 4-bit Gray-coded reading into a position.
+ * @info: axis description (unused).
+ * @in:   raw GPIO state, one bit per line.
+ *
+ * Readings outside the table (an axis configured with more than four lines)
+ * decode to 0 instead of indexing past the end of the table.
+ */
+uint16_t gpio_axis_4bit_gray_map(struct gpio_event_axis_info *info, uint16_t in)
+{
+	if (in >= ARRAY_SIZE(gpio_axis_4bit_gray_map_table))
+		return 0;
+	return gpio_axis_4bit_gray_map_table[in];
+}
+
+/*
+ * Raw 5-bit single-track code (index) to linear position (value).  The
+ * highest designated index is 0x1e, so the table has 31 entries; codes that
+ * are not listed decode to 0.
+ */
+uint16_t gpio_axis_5bit_singletrack_map_table[] = {
+	[0x10] = 0x00, [0x14] = 0x01, [0x1c] = 0x02, /* 10000 10100 11100 */
+	[0x1e] = 0x03, [0x1a] = 0x04, [0x18] = 0x05, /* 11110 11010 11000 */
+	[0x08] = 0x06, [0x0a] = 0x07, [0x0e] = 0x08, /* 01000 01010 01110 */
+	[0x0f] = 0x09, [0x0d] = 0x0a, [0x0c] = 0x0b, /* 01111 01101 01100 */
+	[0x04] = 0x0c, [0x05] = 0x0d, [0x07] = 0x0e, /* 00100 00101 00111 */
+	[0x17] = 0x0f, [0x16] = 0x10, [0x06] = 0x11, /* 10111 10110 00110 */
+	[0x02] = 0x12, [0x12] = 0x13, [0x13] = 0x14, /* 00010 10010 10011 */
+	[0x1b] = 0x15, [0x0b] = 0x16, [0x03] = 0x17, /* 11011 01011 00011 */
+	[0x01] = 0x18, [0x09] = 0x19, [0x19] = 0x1a, /* 00001 01001 11001 */
+	[0x1d] = 0x1b, [0x15] = 0x1c, [0x11] = 0x1d, /* 11101 10101 10001 */
+};
+
+/*
+ * Decode a raw 5-bit single-track reading into a position.
+ * @info: axis description (unused).
+ * @in:   raw GPIO state, one bit per line.
+ *
+ * The all-ones reading 0x1f (and anything larger) is not in the table; it
+ * decodes to 0, like the other unlisted codes, rather than reading past the
+ * end of the table.
+ */
+uint16_t gpio_axis_5bit_singletrack_map(
+			struct gpio_event_axis_info *info, uint16_t in)
+{
+	if (in >= ARRAY_SIZE(gpio_axis_5bit_singletrack_map_table))
+		return 0;
+	return gpio_axis_5bit_singletrack_map_table[in];
+}
+
+/*
+ * Sample the axis GPIOs, decode the position and optionally report it.
+ * @as:     axis state; as->pos is updated to the decoded position.
+ * @report: when non-zero and the position changed, emit an input event.
+ *
+ * The raw state is assembled with gpio[0] as the least significant bit and
+ * decoded through ai->map().  For EV_REL axes the reported value is the
+ * shortest signed distance modulo decoded_size; a move of exactly half the
+ * range has no closest direction and is reported as 0.  Other axis types
+ * report the absolute position.
+ */
+static void gpio_event_update_axis(struct gpio_axis_state *as, int report)
+{
+	struct gpio_event_axis_info *ai = as->info;
+	uint16_t old_pos = as->pos;
+	uint16_t state = 0;
+	uint16_t pos;
+	int change;
+	int i;
+
+	for (i = ai->count - 1; i >= 0; i--)
+		state = (state << 1) | gpio_get_value(ai->gpio[i]);
+	pos = ai->map(ai, state);
+
+	if (ai->flags & GPIOEAF_PRINT_RAW)
+		pr_info("axis %d-%d raw %x, pos %d -> %d\n",
+			ai->type, ai->code, state, old_pos, pos);
+
+	if (!report || pos == old_pos) {
+		as->pos = pos;
+		return;
+	}
+
+	if (ai->type != EV_REL) {
+		if (ai->flags & GPIOEAF_PRINT_EVENT)
+			pr_info("axis %d-%d now %d\n",
+				ai->type, ai->code, pos);
+		input_event(as->input_devs->dev[ai->dev],
+			    ai->type, ai->code, pos);
+	} else {
+		change = (ai->decoded_size + pos - old_pos) %
+			  ai->decoded_size;
+		if (change > ai->decoded_size / 2)
+			change -= ai->decoded_size;
+		if (change == ai->decoded_size / 2) {
+			if (ai->flags & GPIOEAF_PRINT_EVENT)
+				pr_info("axis %d-%d unknown direction, pos %d -> %d\n",
+					ai->type, ai->code, old_pos, pos);
+			change = 0; /* no closest direction */
+		}
+		if (ai->flags & GPIOEAF_PRINT_EVENT)
+			pr_info("axis %d-%d change %d\n",
+				ai->type, ai->code, change);
+		input_report_rel(as->input_devs->dev[ai->dev],
+				 ai->code, change);
+	}
+	input_sync(as->input_devs->dev[ai->dev]);
+
+	as->pos = pos;
+}
+
+/* IRQ handler for any axis GPIO edge: re-sample the axis and report changes. */
+static irqreturn_t gpio_axis_irq_handler(int irq, void *dev_id)
+{
+	gpio_event_update_axis(dev_id, 1);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * gpio_event callback implementing the axis lifecycle.
+ * @input_devs: input devices the axis may report on.
+ * @info:       embedded in a struct gpio_event_axis_info.
+ * @data:       per-entry slot; set to the axis state on INIT, NULL on teardown.
+ * @func:       GPIO_EVENT_FUNC_* operation.
+ *
+ * SUSPEND/RESUME disable/enable the IRQ of every axis GPIO.  INIT allocates
+ * the state, declares the event capability, claims each GPIO as an input with
+ * a both-edge IRQ, and takes an initial (unreported) reading.  Any other
+ * value of @func tears everything down.
+ *
+ * The state is allocated zeroed so that an interrupt arriving before the
+ * initial reading sees a defined pos instead of uninitialised memory.
+ *
+ * INIT failures jump into the teardown loop at the point matching how far
+ * setup of GPIO i got, and the loop then unwinds GPIOs i-1..0.
+ *
+ * Returns 0 on success or a negative errno from INIT.
+ */
+int gpio_event_axis_func(struct gpio_event_input_devs *input_devs,
+			 struct gpio_event_info *info, void **data, int func)
+{
+	int ret;
+	int i;
+	int irq;
+	struct gpio_event_axis_info *ai;
+	struct gpio_axis_state *as;
+
+	ai = container_of(info, struct gpio_event_axis_info, info);
+	if (func == GPIO_EVENT_FUNC_SUSPEND) {
+		for (i = 0; i < ai->count; i++)
+			disable_irq(gpio_to_irq(ai->gpio[i]));
+		return 0;
+	}
+	if (func == GPIO_EVENT_FUNC_RESUME) {
+		for (i = 0; i < ai->count; i++)
+			enable_irq(gpio_to_irq(ai->gpio[i]));
+		return 0;
+	}
+
+	if (func == GPIO_EVENT_FUNC_INIT) {
+		*data = as = kzalloc(sizeof(*as), GFP_KERNEL);
+		if (as == NULL) {
+			ret = -ENOMEM;
+			goto err_alloc_axis_state_failed;
+		}
+		as->input_devs = input_devs;
+		as->info = ai;
+		if (ai->dev >= input_devs->count) {
+			pr_err("gpio_event_axis: bad device index %d >= %d "
+				"for %d:%d\n", ai->dev, input_devs->count,
+				ai->type, ai->code);
+			ret = -EINVAL;
+			goto err_bad_device_index;
+		}
+
+		input_set_capability(input_devs->dev[ai->dev],
+				     ai->type, ai->code);
+		if (ai->type == EV_ABS) {
+			input_set_abs_params(input_devs->dev[ai->dev], ai->code,
+					     0, ai->decoded_size - 1, 0, 0);
+		}
+		for (i = 0; i < ai->count; i++) {
+			ret = gpio_request(ai->gpio[i], "gpio_event_axis");
+			if (ret < 0)
+				goto err_request_gpio_failed;
+			ret = gpio_direction_input(ai->gpio[i]);
+			if (ret < 0)
+				goto err_gpio_direction_input_failed;
+			ret = irq = gpio_to_irq(ai->gpio[i]);
+			if (ret < 0)
+				goto err_get_irq_num_failed;
+			ret = request_irq(irq, gpio_axis_irq_handler,
+					  IRQF_TRIGGER_RISING |
+					  IRQF_TRIGGER_FALLING,
+					  "gpio_event_axis", as);
+			if (ret < 0)
+				goto err_request_irq_failed;
+		}
+		gpio_event_update_axis(as, 0);
+		return 0;
+	}
+
+	/* Teardown; also the unwind target for INIT failures. */
+	ret = 0;
+	as = *data;
+	for (i = ai->count - 1; i >= 0; i--) {
+		free_irq(gpio_to_irq(ai->gpio[i]), as);
+err_request_irq_failed:
+err_get_irq_num_failed:
+err_gpio_direction_input_failed:
+		gpio_free(ai->gpio[i]);
+err_request_gpio_failed:
+		;
+	}
+err_bad_device_index:
+	kfree(as);
+	*data = NULL;
+err_alloc_axis_state_failed:
+	return ret;
+}
diff --git a/drivers/input/misc/gpio_event.c b/drivers/input/misc/gpio_event.c
new file mode 100644
index 000000000000..90f07eba3ce9
--- /dev/null
+++ b/drivers/input/misc/gpio_event.c
@@ -0,0 +1,228 @@
+/* drivers/input/misc/gpio_event.c
+ *
+ * Copyright (C) 2007 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/input.h>
+#include <linux/gpio_event.h>
+#include <linux/hrtimer.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+
+/*
+ * Driver-private data for one gpio_event platform device.  Allocated in one
+ * chunk by gpio_event_probe(): this struct, then state[], then the
+ * gpio_event_input_devs block that input_devs points at.
+ */
+struct gpio_event {
+ /* Input devices owned by this instance (lives after state[] in the same allocation). */
+ struct gpio_event_input_devs *input_devs;
+ /* Platform data describing the event sources. */
+ const struct gpio_event_platform_data *info;
+ /* One private pointer per info entry, handed to that entry's func/event callbacks. */
+ void *state[0];
+};
+
+/*
+ * input_dev->event hook: fan an event written to one of our input devices
+ * out to every info entry that provides an event() callback.
+ *
+ * Returns -EIO if @dev is not one of this instance's devices; otherwise the
+ * last non-zero value returned by a callback, or 0.
+ */
+static int gpio_input_event(
+	struct input_dev *dev, unsigned int type, unsigned int code, int value)
+{
+	struct gpio_event *ip = input_get_drvdata(dev);
+	struct gpio_event_info **ii;
+	int devnr = 0;
+	int ret = 0;
+	int i;
+
+	/* Translate the device pointer into its index. */
+	while (devnr < ip->input_devs->count &&
+	       ip->input_devs->dev[devnr] != dev)
+		devnr++;
+	if (devnr == ip->input_devs->count) {
+		pr_err("gpio_input_event: unknown device %p\n", dev);
+		return -EIO;
+	}
+
+	ii = ip->info->info;
+	for (i = 0; i < ip->info->info_count; i++) {
+		struct gpio_event_info *entry = ii[i];
+		int tmp_ret;
+
+		if (!entry->event)
+			continue;
+
+		tmp_ret = entry->event(ip->input_devs, entry, &ip->state[i],
+				       devnr, type, code, value);
+		if (tmp_ret)
+			ret = tmp_ret;
+	}
+	return ret;
+}
+
+/*
+ * Invoke @func on every info entry.
+ *
+ * INIT and RESUME walk the entries forwards and stop at the first failure
+ * (or at an entry without a func callback, which yields -ENODEV).  On failure
+ * control jumps into the reverse loop below, which then undoes the entries
+ * before the failing one.  All other values of @func walk the entries in
+ * reverse.  Entries flagged no_suspend are skipped for SUSPEND and RESUME.
+ *
+ * Returns 0, or the error that aborted a forward walk.
+ */
+static int gpio_event_call_all_func(struct gpio_event *ip, int func)
+{
+ int i;
+ int ret;
+ struct gpio_event_info **ii;
+
+ if (func == GPIO_EVENT_FUNC_INIT || func == GPIO_EVENT_FUNC_RESUME) {
+ ii = ip->info->info;
+ for (i = 0; i < ip->info->info_count; i++, ii++) {
+ if ((*ii)->func == NULL) {
+ ret = -ENODEV;
+ pr_err("gpio_event_probe: Incomplete pdata, "
+ "no function\n");
+ goto err_no_func;
+ }
+ if (func == GPIO_EVENT_FUNC_RESUME && (*ii)->no_suspend)
+ continue;
+ ret = (*ii)->func(ip->input_devs, *ii, &ip->state[i],
+ func);
+ if (ret) {
+ pr_err("gpio_event_probe: function failed\n");
+ goto err_func_failed;
+ }
+ }
+ return 0;
+ }
+
+ ret = 0;
+ i = ip->info->info_count;
+ ii = ip->info->info + i;
+ while (i > 0) {
+ i--;
+ ii--;
+ /*
+ * NOTE(review): "func & ~1" presumably maps INIT to UNINIT and RESUME to
+ * SUSPEND when unwinding a failed forward walk -- confirm against the
+ * GPIO_EVENT_FUNC_* values in gpio_event.h.
+ */
+ if ((func & ~1) == GPIO_EVENT_FUNC_SUSPEND && (*ii)->no_suspend)
+ continue;
+ (*ii)->func(ip->input_devs, *ii, &ip->state[i], func & ~1);
+ /* Failure entry points: i/ii identify the failed entry, which is skipped. */
+err_func_failed:
+err_no_func:
+ ;
+ }
+ return ret;
+}
+
+/* Suspend every event source, then switch the hardware off if a power hook exists. */
+static void __maybe_unused gpio_event_suspend(struct gpio_event *ip)
+{
+	gpio_event_call_all_func(ip, GPIO_EVENT_FUNC_SUSPEND);
+
+	if (!ip->info->power)
+		return;
+
+	ip->info->power(ip->info, 0);
+}
+
+/* Switch the hardware back on if a power hook exists, then resume every event source. */
+static void __maybe_unused gpio_event_resume(struct gpio_event *ip)
+{
+	if (ip->info->power != NULL)
+		ip->info->power(ip->info, 1);
+
+	gpio_event_call_all_func(ip, GPIO_EVENT_FUNC_RESUME);
+}
+
+/*
+ * Platform probe: allocate the private data and one input device per name,
+ * power the hardware on, initialise every event source and register the
+ * input devices.
+ *
+ * Returns 0 on success, -ENODEV for missing or incomplete platform data,
+ * -ENOMEM on allocation failure, or the error from source initialisation or
+ * input device registration.
+ */
+static int gpio_event_probe(struct platform_device *pdev)
+{
+ int err;
+ struct gpio_event *ip;
+ struct gpio_event_platform_data *event_info;
+ int dev_count = 1;
+ int i;
+ int registered = 0;
+
+ event_info = pdev->dev.platform_data;
+ if (event_info == NULL) {
+ pr_err("gpio_event_probe: No pdata\n");
+ return -ENODEV;
+ }
+ if ((!event_info->name && !event_info->names[0]) ||
+ !event_info->info || !event_info->info_count) {
+ pr_err("gpio_event_probe: Incomplete pdata\n");
+ return -ENODEV;
+ }
+ /* With no single name, names[] is a NULL-terminated list: one device each. */
+ if (!event_info->name)
+ while (event_info->names[dev_count])
+ dev_count++;
+ /* One allocation: struct, per-entry state slots, then the input_devs block. */
+ ip = kzalloc(sizeof(*ip) +
+ sizeof(ip->state[0]) * event_info->info_count +
+ sizeof(*ip->input_devs) +
+ sizeof(ip->input_devs->dev[0]) * dev_count, GFP_KERNEL);
+ if (ip == NULL) {
+ err = -ENOMEM;
+ pr_err("gpio_event_probe: Failed to allocate private data\n");
+ goto err_kp_alloc_failed;
+ }
+ ip->input_devs = (void*)&ip->state[event_info->info_count];
+ platform_set_drvdata(pdev, ip);
+
+ for (i = 0; i < dev_count; i++) {
+ struct input_dev *input_dev = input_allocate_device();
+ if (input_dev == NULL) {
+ err = -ENOMEM;
+ pr_err("gpio_event_probe: "
+ "Failed to allocate input device\n");
+ goto err_input_dev_alloc_failed;
+ }
+ input_set_drvdata(input_dev, ip);
+ input_dev->name = event_info->name ?
+ event_info->name : event_info->names[i];
+ input_dev->event = gpio_input_event;
+ ip->input_devs->dev[i] = input_dev;
+ }
+ ip->input_devs->count = dev_count;
+ ip->info = event_info;
+ if (event_info->power)
+ ip->info->power(ip->info, 1);
+
+ err = gpio_event_call_all_func(ip, GPIO_EVENT_FUNC_INIT);
+ if (err)
+ goto err_call_all_func_failed;
+
+ for (i = 0; i < dev_count; i++) {
+ err = input_register_device(ip->input_devs->dev[i]);
+ if (err) {
+ pr_err("gpio_event_probe: Unable to register %s "
+ "input device\n", ip->input_devs->dev[i]->name);
+ goto err_input_register_device_failed;
+ }
+ registered++;
+ }
+
+ return 0;
+
+err_input_register_device_failed:
+ gpio_event_call_all_func(ip, GPIO_EVENT_FUNC_UNINIT);
+err_call_all_func_failed:
+ if (event_info->power)
+ ip->info->power(ip->info, 0);
+ /* Registered devices are unregistered; the remaining ones are only freed. */
+ for (i = 0; i < registered; i++)
+ input_unregister_device(ip->input_devs->dev[i]);
+ for (i = dev_count - 1; i >= registered; i--) {
+ input_free_device(ip->input_devs->dev[i]);
+ /*
+ * Allocation-failure entry point: i is the index that failed to allocate,
+ * so the loop continues by freeing devices i-1..0.
+ */
+err_input_dev_alloc_failed:
+ ;
+ }
+ kfree(ip);
+err_kp_alloc_failed:
+ return err;
+}
+
+/*
+ * Platform remove: uninitialise every event source, power the hardware off
+ * if a power hook exists, unregister all input devices and free the private
+ * data.  Always returns 0.
+ */
+static int gpio_event_remove(struct platform_device *pdev)
+{
+	struct gpio_event *ip = platform_get_drvdata(pdev);
+	int i = 0;
+
+	gpio_event_call_all_func(ip, GPIO_EVENT_FUNC_UNINIT);
+
+	if (ip->info->power != NULL)
+		ip->info->power(ip->info, 0);
+
+	while (i < ip->input_devs->count) {
+		input_unregister_device(ip->input_devs->dev[i]);
+		i++;
+	}
+
+	kfree(ip);
+	return 0;
+}
+
+/* Platform driver bound to devices named GPIO_EVENT_DEV_NAME. */
+static struct platform_driver gpio_event_driver = {
+ .probe = gpio_event_probe,
+ .remove = gpio_event_remove,
+ .driver = {
+ .name = GPIO_EVENT_DEV_NAME,
+ },
+};
+
+/* Generates the module init/exit functions that register/unregister the driver. */
+module_platform_driver(gpio_event_driver);
+
+MODULE_DESCRIPTION("GPIO Event Driver");
+MODULE_LICENSE("GPL");
+
diff --git a/drivers/input/misc/gpio_input.c b/drivers/input/misc/gpio_input.c
new file mode 100644
index 000000000000..eefd02725aff
--- /dev/null
+++ b/drivers/input/misc/gpio_input.c
@@ -0,0 +1,390 @@
+/* drivers/input/misc/gpio_input.c
+ *
+ * Copyright (C) 2007 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/gpio.h>
+#include <linux/gpio_event.h>
+#include <linux/hrtimer.h>
+#include <linux/input.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/pm_wakeup.h>
+/* Per-key debounce state bits stored in gpio_key_state.debounce. */
+enum {
+ DEBOUNCE_UNSTABLE = BIT(0), /* Got irq, while debouncing */
+ DEBOUNCE_PRESSED = BIT(1), /* debouncing; last sample read as pressed */
+ DEBOUNCE_NOTPRESSED = BIT(2), /* debouncing; last sample read as not pressed */
+ DEBOUNCE_WAIT_IRQ = BIT(3), /* Stable irq state */
+ DEBOUNCE_POLL = BIT(4), /* Stable polling state */
+
+ DEBOUNCE_UNKNOWN = /* debounce just (re)started: both sample bits set, so the next sample of either level is recorded */
+ DEBOUNCE_PRESSED | DEBOUNCE_NOTPRESSED,
+};
+/* Debounce bookkeeping for one keymap entry; its address is also the dev_id passed to request_irq(). */
+struct gpio_key_state {
+ struct gpio_input_state *ds; /* back-pointer to the owning state, set during INIT */
+ uint8_t debounce; /* combination of DEBOUNCE_* bits */
+};
+/* Run-time state for one gpio_event_input_info instance, allocated with one key_state slot per keymap entry. */
+struct gpio_input_state {
+ struct gpio_event_input_devs *input_devs; /* input devices indexed by keymap[].dev */
+ const struct gpio_event_input_info *info; /* platform description (keymap, flags, timings) */
+ struct hrtimer timer; /* runs gpio_event_input_timer_func() */
+ int use_irq; /* non-zero when gpio_event_input_request_irqs() succeeded; otherwise polling */
+ int debounce_count; /* number of keys that are currently not in a stable state */
+ spinlock_t irq_lock; /* taken by the timer callback and the IRQ handler around debounce state */
+ struct wakeup_source *ws; /* held from the first debouncing IRQ until all keys are stable */
+ struct gpio_key_state key_state[0]; /* trailing array, keymap_size entries */
+};
+
+/*
+ * hrtimer callback: sample every direct-input GPIO and advance its debounce
+ * state.  The whole scan runs under ds->irq_lock.
+ *
+ * Keys parked in DEBOUNCE_WAIT_IRQ are skipped.  For every other key:
+ *  - DEBOUNCE_UNSTABLE (the IRQ handler saw another edge while the key was
+ *    debouncing and disabled the IRQ) resets the key to DEBOUNCE_UNKNOWN
+ *    and re-enables its IRQ;
+ *  - a key in DEBOUNCE_POLL whose level changed restarts debouncing;
+ *  - a debouncing key whose sample differs from the recorded one only
+ *    records the new sample;
+ *  - a debouncing key whose sample agrees with the recorded one is declared
+ *    stable and reported through input_event().
+ *
+ * The timer is re-armed with debounce_time while any key is debouncing, or
+ * with poll_time in polling mode.  In IRQ mode with every key stable the
+ * wakeup source is released instead.
+ *
+ * Always returns HRTIMER_NORESTART; re-arming is done via hrtimer_start().
+ */
+static enum hrtimer_restart gpio_event_input_timer_func(struct hrtimer *timer)
+{
+	int i;
+	int pressed;
+	struct gpio_input_state *ds =
+		container_of(timer, struct gpio_input_state, timer);
+	unsigned gpio_flags = ds->info->flags;
+	unsigned npolarity;
+	int nkeys = ds->info->keymap_size;
+	const struct gpio_event_direct_entry *key_entry;
+	struct gpio_key_state *key_state;
+	unsigned long irqflags;
+	uint8_t debounce;
+	bool sync_needed;
+
+	key_entry = ds->info->keymap;
+	key_state = ds->key_state;
+	sync_needed = false;
+	spin_lock_irqsave(&ds->irq_lock, irqflags);
+	for (i = 0; i < nkeys; i++, key_entry++, key_state++) {
+		debounce = key_state->debounce;
+		/* Stable and waiting for its interrupt: nothing to sample. */
+		if (debounce & DEBOUNCE_WAIT_IRQ)
+			continue;
+		if (key_state->debounce & DEBOUNCE_UNSTABLE) {
+			/* Handler disabled the IRQ; restart debounce and re-enable. */
+			debounce = key_state->debounce = DEBOUNCE_UNKNOWN;
+			enable_irq(gpio_to_irq(key_entry->gpio));
+			if (gpio_flags & GPIOEDF_PRINT_KEY_UNSTABLE)
+				pr_info("gpio_keys_scan_keys: key %x-%x, %d "
+					"(%d) continue debounce\n",
+					ds->info->type, key_entry->code,
+					i, key_entry->gpio);
+		}
+		/* Normalise the raw level so that non-zero means "pressed". */
+		npolarity = !(gpio_flags & GPIOEDF_ACTIVE_HIGH);
+		pressed = gpio_get_value(key_entry->gpio) ^ npolarity;
+		if (debounce & DEBOUNCE_POLL) {
+			/* Stable polled key: a level change starts a debounce. */
+			if (pressed == !(debounce & DEBOUNCE_PRESSED)) {
+				ds->debounce_count++;
+				key_state->debounce = DEBOUNCE_UNKNOWN;
+				if (gpio_flags & GPIOEDF_PRINT_KEY_DEBOUNCE)
+					pr_info("gpio_keys_scan_keys: key %x-"
+						"%x, %d (%d) start debounce\n",
+						ds->info->type, key_entry->code,
+						i, key_entry->gpio);
+			}
+			continue;
+		}
+		if (pressed && (debounce & DEBOUNCE_NOTPRESSED)) {
+			if (gpio_flags & GPIOEDF_PRINT_KEY_DEBOUNCE)
+				pr_info("gpio_keys_scan_keys: key %x-%x, %d "
+					"(%d) debounce pressed 1\n",
+					ds->info->type, key_entry->code,
+					i, key_entry->gpio);
+			key_state->debounce = DEBOUNCE_PRESSED;
+			continue;
+		}
+		if (!pressed && (debounce & DEBOUNCE_PRESSED)) {
+			if (gpio_flags & GPIOEDF_PRINT_KEY_DEBOUNCE)
+				pr_info("gpio_keys_scan_keys: key %x-%x, %d "
+					"(%d) debounce pressed 0\n",
+					ds->info->type, key_entry->code,
+					i, key_entry->gpio);
+			key_state->debounce = DEBOUNCE_NOTPRESSED;
+			continue;
+		}
+		/* key is stable */
+		ds->debounce_count--;
+		if (ds->use_irq)
+			key_state->debounce |= DEBOUNCE_WAIT_IRQ;
+		else
+			key_state->debounce |= DEBOUNCE_POLL;
+		if (gpio_flags & GPIOEDF_PRINT_KEYS)
+			pr_info("gpio_keys_scan_keys: key %x-%x, %d (%d) "
+				"changed to %d\n", ds->info->type,
+				key_entry->code, i, key_entry->gpio, pressed);
+		input_event(ds->input_devs->dev[key_entry->dev], ds->info->type,
+			    key_entry->code, pressed);
+		sync_needed = true;
+	}
+	if (sync_needed) {
+		for (i = 0; i < ds->input_devs->count; i++)
+			input_sync(ds->input_devs->dev[i]);
+	}
+
+	if (ds->debounce_count)
+		hrtimer_start(timer, ds->info->debounce_time, HRTIMER_MODE_REL);
+	else if (!ds->use_irq)
+		hrtimer_start(timer, ds->info->poll_time, HRTIMER_MODE_REL);
+	else
+		__pm_relax(ds->ws);
+
+	spin_unlock_irqrestore(&ds->irq_lock, irqflags);
+
+	return HRTIMER_NORESTART;
+}
+
+/*
+ * Edge interrupt handler for one direct-input key; dev_id is the key's
+ * gpio_key_state slot.
+ *
+ * Interrupts that arrive while use_irq is clear are ignored.  Without a
+ * debounce time the current level is reported immediately.  With one, a key
+ * in DEBOUNCE_WAIT_IRQ enters DEBOUNCE_UNKNOWN (the first such key also
+ * takes the wakeup source and starts the timer).  A key that is already
+ * debouncing gets its IRQ disabled and is marked DEBOUNCE_UNSTABLE for the
+ * timer callback to sort out.
+ *
+ * Always returns IRQ_HANDLED.
+ */
+static irqreturn_t gpio_event_input_irq_handler(int irq, void *dev_id)
+{
+	struct gpio_key_state *state = dev_id;
+	struct gpio_input_state *ds = state->ds;
+	int idx = state - ds->key_state;
+	const struct gpio_event_direct_entry *entry;
+	unsigned long lockflags;
+	int level;
+
+	if (!ds->use_irq)
+		return IRQ_HANDLED;
+
+	entry = &ds->info->keymap[idx];
+
+	if (!ds->info->debounce_time.tv64) {
+		/* No debouncing configured: report the level straight away. */
+		level = gpio_get_value(entry->gpio) ^
+			!(ds->info->flags & GPIOEDF_ACTIVE_HIGH);
+		if (ds->info->flags & GPIOEDF_PRINT_KEYS)
+			pr_info("gpio_event_input_irq_handler: key %x-%x, %d "
+				"(%d) changed to %d\n",
+				ds->info->type, entry->code, idx,
+				entry->gpio, level);
+		input_event(ds->input_devs->dev[entry->dev], ds->info->type,
+			    entry->code, level);
+		input_sync(ds->input_devs->dev[entry->dev]);
+		return IRQ_HANDLED;
+	}
+
+	spin_lock_irqsave(&ds->irq_lock, lockflags);
+	if (!(state->debounce & DEBOUNCE_WAIT_IRQ)) {
+		/* Edge during debounce: silence the line until the next scan. */
+		disable_irq_nosync(irq);
+		state->debounce = DEBOUNCE_UNSTABLE;
+	} else {
+		state->debounce = DEBOUNCE_UNKNOWN;
+		if (ds->debounce_count++ == 0) {
+			__pm_stay_awake(ds->ws);
+			hrtimer_start(&ds->timer, ds->info->debounce_time,
+				      HRTIMER_MODE_REL);
+		}
+		if (ds->info->flags & GPIOEDF_PRINT_KEY_DEBOUNCE)
+			pr_info("gpio_event_input_irq_handler: "
+				"key %x-%x, %d (%d) start debounce\n",
+				ds->info->type, entry->code,
+				idx, entry->gpio);
+	}
+	spin_unlock_irqrestore(&ds->irq_lock, lockflags);
+
+	return IRQ_HANDLED;
+}
+/*
+ * Request a rising+falling edge interrupt for every GPIO in the keymap,
+ * passing the matching key_state slot as dev_id, and enable it as a wake
+ * IRQ when info.no_suspend is set.
+ *
+ * Returns 0 on success.  On failure the IRQs requested so far are released
+ * and the error from gpio_to_irq(), request_irq() or enable_irq_wake() is
+ * returned.
+ */
+static int gpio_event_input_request_irqs(struct gpio_input_state *ds)
+{
+ int i;
+ int err;
+ unsigned int irq;
+ unsigned long req_flags = IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING;
+
+ for (i = 0; i < ds->info->keymap_size; i++) {
+ err = irq = gpio_to_irq(ds->info->keymap[i].gpio); /* err keeps the signed result so a negative value can be detected */
+ if (err < 0)
+ goto err_gpio_get_irq_num_failed;
+ err = request_irq(irq, gpio_event_input_irq_handler,
+ req_flags, "gpio_keys", &ds->key_state[i]);
+ if (err) {
+ pr_err("gpio_event_input_request_irqs: request_irq "
+ "failed for input %d, irq %d\n",
+ ds->info->keymap[i].gpio, irq);
+ goto err_request_irq_failed;
+ }
+ if (ds->info->info.no_suspend) {
+ err = enable_irq_wake(irq);
+ if (err) {
+ pr_err("gpio_event_input_request_irqs: "
+ "enable_irq_wake failed for input %d, "
+ "irq %d\n",
+ ds->info->keymap[i].gpio, irq);
+ goto err_enable_irq_wake_failed;
+ }
+ }
+ }
+ return 0;
+
+ for (i = ds->info->keymap_size - 1; i >= 0; i--) { /* unwind loop: it follows the return, so it is only entered through the labels below, with i still at the failing entry */
+ irq = gpio_to_irq(ds->info->keymap[i].gpio);
+ if (ds->info->info.no_suspend)
+ disable_irq_wake(irq);
+err_enable_irq_wake_failed: /* the failing entry already owns its IRQ, so free it but skip disable_irq_wake() */
+ free_irq(irq, &ds->key_state[i]);
+err_request_irq_failed:
+err_gpio_get_irq_num_failed: /* nothing to undo for the failing entry; the loop step moves on to the earlier ones */
+ ;
+ }
+ return err;
+}
+/*
+ * gpio_event handler for directly connected inputs (one GPIO per key);
+ * dispatches on func.
+ *
+ * input_devs: input devices that keymap entries refer to by index
+ * info:       member of a struct gpio_event_input_info (see container_of)
+ * data:       slot holding the struct gpio_input_state pointer; written
+ *             by INIT and read by every other call
+ * func:       GPIO_EVENT_FUNC_SUSPEND, _RESUME or _INIT; any other value
+ *             takes the teardown path at the bottom
+ *
+ * SUSPEND disables the key IRQs (IRQ mode only) and cancels the timer.
+ * RESUME re-enables them and starts an immediate scan.  INIT allocates the
+ * state, registers a wakeup source, declares the key capabilities, claims
+ * every GPIO as an input, tries to request IRQs (falling back to polling
+ * when that fails) and starts the first scan.  The teardown path releases
+ * all of that; INIT failures jump into the tail of the same sequence.
+ *
+ * Returns 0 on success, or a negative errno when INIT fails.
+ *
+ * NOTE(review): the teardown path calls hrtimer_cancel() and free_irq()
+ * while holding irq_lock with interrupts disabled.  The timer callback takes
+ * the same lock and free_irq() is documented as potentially sleeping -
+ * confirm this cannot deadlock or sleep in atomic context.
+ */
+int gpio_event_input_func(struct gpio_event_input_devs *input_devs,
+ struct gpio_event_info *info, void **data, int func)
+{
+ int ret;
+ int i;
+ unsigned long irqflags;
+ struct gpio_event_input_info *di;
+ struct gpio_input_state *ds = *data;
+ char *wlname;
+
+ di = container_of(info, struct gpio_event_input_info, info);
+
+ if (func == GPIO_EVENT_FUNC_SUSPEND) {
+ if (ds->use_irq)
+ for (i = 0; i < di->keymap_size; i++)
+ disable_irq(gpio_to_irq(di->keymap[i].gpio));
+ hrtimer_cancel(&ds->timer);
+ return 0;
+ }
+ if (func == GPIO_EVENT_FUNC_RESUME) {
+ spin_lock_irqsave(&ds->irq_lock, irqflags);
+ if (ds->use_irq)
+ for (i = 0; i < di->keymap_size; i++)
+ enable_irq(gpio_to_irq(di->keymap[i].gpio));
+ hrtimer_start(&ds->timer, ktime_set(0, 0), HRTIMER_MODE_REL); /* zero delay: rescan as soon as possible */
+ spin_unlock_irqrestore(&ds->irq_lock, irqflags);
+ return 0;
+ }
+
+ if (func == GPIO_EVENT_FUNC_INIT) {
+ if (ktime_to_ns(di->poll_time) <= 0) /* no usable poll interval supplied: default to 20 ms */
+ di->poll_time = ktime_set(0, 20 * NSEC_PER_MSEC);
+
+ *data = ds = kzalloc(sizeof(*ds) + sizeof(ds->key_state[0]) *
+ di->keymap_size, GFP_KERNEL);
+ if (ds == NULL) {
+ ret = -ENOMEM;
+ pr_err("gpio_event_input_func: "
+ "Failed to allocate private data\n");
+ goto err_ds_alloc_failed;
+ }
+ ds->debounce_count = di->keymap_size; /* every key starts out debouncing */
+ ds->input_devs = input_devs;
+ ds->info = di;
+ wlname = kasprintf(GFP_KERNEL, "gpio_input:%s%s", /* NOTE(review): result is not checked for NULL before use */
+ input_devs->dev[0]->name,
+ (input_devs->count > 1) ? "..." : "");
+
+ ds->ws = wakeup_source_register(wlname);
+ kfree(wlname);
+ if (!ds->ws) {
+ ret = -ENOMEM;
+ pr_err("gpio_event_input_func: "
+ "Failed to allocate wakeup source\n");
+ goto err_ws_failed;
+ }
+
+ spin_lock_init(&ds->irq_lock);
+
+ for (i = 0; i < di->keymap_size; i++) { /* pass 1: validate device indices and declare capabilities */
+ int dev = di->keymap[i].dev;
+ if (dev >= input_devs->count) {
+ pr_err("gpio_event_input_func: bad device "
+ "index %d >= %d for key code %d\n",
+ dev, input_devs->count,
+ di->keymap[i].code);
+ ret = -EINVAL;
+ goto err_bad_keymap;
+ }
+ input_set_capability(input_devs->dev[dev], di->type,
+ di->keymap[i].code);
+ ds->key_state[i].ds = ds;
+ ds->key_state[i].debounce = DEBOUNCE_UNKNOWN;
+ }
+
+ for (i = 0; i < di->keymap_size; i++) { /* pass 2: claim each GPIO and switch it to input */
+ ret = gpio_request(di->keymap[i].gpio, "gpio_kp_in");
+ if (ret) {
+ pr_err("gpio_event_input_func: gpio_request "
+ "failed for %d\n", di->keymap[i].gpio);
+ goto err_gpio_request_failed;
+ }
+ ret = gpio_direction_input(di->keymap[i].gpio);
+ if (ret) {
+ pr_err("gpio_event_input_func: "
+ "gpio_direction_input failed for %d\n",
+ di->keymap[i].gpio);
+ goto err_gpio_configure_failed;
+ }
+ }
+
+ ret = gpio_event_input_request_irqs(ds); /* failure is not fatal: it selects polling mode below */
+
+ spin_lock_irqsave(&ds->irq_lock, irqflags);
+ ds->use_irq = ret == 0;
+
+ pr_info("GPIO Input Driver: Start gpio inputs for %s%s in %s "
+ "mode\n", input_devs->dev[0]->name,
+ (input_devs->count > 1) ? "..." : "",
+ ret == 0 ? "interrupt" : "polling");
+
+ hrtimer_init(&ds->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ ds->timer.function = gpio_event_input_timer_func;
+ hrtimer_start(&ds->timer, ktime_set(0, 0), HRTIMER_MODE_REL); /* kick off the initial scan */
+ spin_unlock_irqrestore(&ds->irq_lock, irqflags);
+ return 0;
+ }
+
+ ret = 0; /* teardown path: reached for any func value not handled above */
+ spin_lock_irqsave(&ds->irq_lock, irqflags);
+ hrtimer_cancel(&ds->timer);
+ if (ds->use_irq) {
+ for (i = di->keymap_size - 1; i >= 0; i--) {
+ int irq = gpio_to_irq(di->keymap[i].gpio);
+ if (ds->info->info.no_suspend)
+ disable_irq_wake(irq);
+ free_irq(irq, &ds->key_state[i]);
+ }
+ }
+ spin_unlock_irqrestore(&ds->irq_lock, irqflags);
+
+ for (i = di->keymap_size - 1; i >= 0; i--) { /* INIT errors jump into this loop with i at the failing entry */
+err_gpio_configure_failed:
+ gpio_free(di->keymap[i].gpio);
+err_gpio_request_failed:
+ ;
+ }
+err_bad_keymap:
+ wakeup_source_unregister(ds->ws);
+err_ws_failed:
+ kfree(ds);
+err_ds_alloc_failed:
+ return ret;
+}
diff --git a/drivers/input/misc/gpio_matrix.c b/drivers/input/misc/gpio_matrix.c
new file mode 100644
index 000000000000..08769dd88f56
--- /dev/null
+++ b/drivers/input/misc/gpio_matrix.c
@@ -0,0 +1,440 @@
+/* drivers/input/misc/gpio_matrix.c
+ *
+ * Copyright (C) 2007 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/gpio.h>
+#include <linux/gpio_event.h>
+#include <linux/hrtimer.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+/* Run-time state of one scanned key matrix, allocated with a trailing bitmap of ninputs * noutputs key bits. */
+struct gpio_kp {
+ struct gpio_event_input_devs *input_devs; /* input devices selected by the upper bits of each keymap entry */
+ struct gpio_event_matrix_info *keypad_info; /* platform description (GPIO lists, keymap, flags, timings) */
+ struct hrtimer timer; /* runs gpio_keypad_timer_func() */
+ struct wakeup_source wake_src; /* held from a key interrupt until a scan finds no key pressed */
+ int current_output; /* output currently being driven; equal to noutputs between scans */
+ unsigned int use_irq:1; /* set when gpio_keypad_request_irqs() succeeded */
+ unsigned int key_state_changed:1; /* some key bit changed during the current scan */
+ unsigned int last_key_state_changed:1; /* key_state_changed as it was when the current scan started */
+ unsigned int some_keys_pressed:2; /* pressed samples seen in this scan, saturating at 3 */
+ unsigned int disabled_irq:1; /* handler disabled an IRQ that fired before use_irq was set */
+ unsigned long keys_pressed[0]; /* bitmap indexed by out * ninputs + in */
+};
+
+/*
+ * Drop the key at matrix position (out, in) from keys_pressed, unless the
+ * bit for its keycode is already set in the input device's key bitmap.
+ * Either outcome is logged when GPIOKPF_PRINT_PHANTOM_KEYS is set.
+ */
+static void clear_phantom_key(struct gpio_kp *kp, int out, int in)
+{
+	struct gpio_event_matrix_info *mi = kp->keypad_info;
+	int bit = out * mi->ninputs + in;
+	unsigned short entry = mi->keymap[bit];
+	unsigned short code = entry & MATRIX_KEY_MASK;
+	unsigned short devidx = entry >> MATRIX_CODE_BITS;
+
+	if (test_bit(code, kp->input_devs->dev[devidx]->key)) {
+		/* Key is already set in the device bitmap: keep it. */
+		if (mi->flags & GPIOKPF_PRINT_PHANTOM_KEYS)
+			pr_info("gpiomatrix: phantom key %x, %d-%d (%d-%d) "
+				"not cleared\n", code, out, in,
+				mi->output_gpios[out], mi->input_gpios[in]);
+		return;
+	}
+
+	if (mi->flags & GPIOKPF_PRINT_PHANTOM_KEYS)
+		pr_info("gpiomatrix: phantom key %x, %d-%d (%d-%d) "
+			"cleared\n", code, out, in,
+			mi->output_gpios[out], mi->input_gpios[in]);
+	__clear_bit(bit, kp->keys_pressed);
+}
+
+/*
+ * Walk input column 'in' from output row 'out' to the last row and call
+ * clear_phantom_key() on every position whose bit is set in keys_pressed.
+ *
+ * Returns 1 if at least one such position was found, 0 otherwise.
+ */
+static int restore_keys_for_input(struct gpio_kp *kp, int out, int in)
+{
+	int found = 0;
+	int row;
+
+	for (row = out; row < kp->keypad_info->noutputs; row++) {
+		if (!test_bit(row * kp->keypad_info->ninputs + in,
+			      kp->keys_pressed))
+			continue;
+		found = 1;
+		clear_phantom_key(kp, row, in);
+	}
+	return found;
+}
+/*
+ * Post-scan filter for ghost keys.  Does nothing unless the scan counted
+ * at least three pressed samples.  For every output row with more than one
+ * pressed input, the affected positions are handed to clear_phantom_key(),
+ * directly or through restore_keys_for_input().
+ */
+static void remove_phantom_keys(struct gpio_kp *kp)
+{
+ int out, in, inp; /* inp: -1 = no pressed input seen in this row yet, >= 0 = first pressed input, -2 = first input already processed */
+ int key_index;
+
+ if (kp->some_keys_pressed < 3)
+ return;
+
+ for (out = 0; out < kp->keypad_info->noutputs; out++) {
+ inp = -1;
+ key_index = out * kp->keypad_info->ninputs;
+ for (in = 0; in < kp->keypad_info->ninputs; in++, key_index++) {
+ if (test_bit(key_index, kp->keys_pressed)) {
+ if (inp == -1) {
+ inp = in;
+ continue;
+ }
+ if (inp >= 0) { /* second pressed input in this row */
+ if (!restore_keys_for_input(kp, out + 1, /* no later row has the first input's column pressed: stop scanning this row */
+ inp))
+ break;
+ clear_phantom_key(kp, out, inp);
+ inp = -2;
+ }
+ restore_keys_for_input(kp, out, in); /* runs for the second and every later pressed input of the row */
+ }
+ }
+ }
+}
+
+/*
+ * Report the key at matrix position (out, in) when its bit in keys_pressed
+ * differs from the bit for its keycode in the input device's key bitmap.
+ * Positions mapped to KEY_RESERVED are only logged, never reported.
+ */
+static void report_key(struct gpio_kp *kp, int key_index, int out, int in)
+{
+	struct gpio_event_matrix_info *mi = kp->keypad_info;
+	unsigned short entry = mi->keymap[key_index];
+	unsigned short code = entry & MATRIX_KEY_MASK;
+	unsigned short devidx = entry >> MATRIX_CODE_BITS;
+	int pressed = test_bit(key_index, kp->keys_pressed);
+
+	/* Nothing to do when the device bitmap already matches. */
+	if (pressed == test_bit(code, kp->input_devs->dev[devidx]->key))
+		return;
+
+	if (code == KEY_RESERVED) {
+		if (mi->flags & GPIOKPF_PRINT_UNMAPPED_KEYS)
+			pr_info("gpiomatrix: unmapped key, %d-%d "
+				"(%d-%d) changed to %d\n",
+				out, in, mi->output_gpios[out],
+				mi->input_gpios[in], pressed);
+		return;
+	}
+
+	if (mi->flags & GPIOKPF_PRINT_MAPPED_KEYS)
+		pr_info("gpiomatrix: key %x, %d-%d (%d-%d) "
+			"changed to %d\n", code,
+			out, in, mi->output_gpios[out],
+			mi->input_gpios[in], pressed);
+	input_report_key(kp->input_devs->dev[devidx], code, pressed);
+}
+
+/* Send input_sync() to every input device attached to this keypad. */
+static void report_sync(struct gpio_kp *kp)
+{
+	int idx = 0;
+
+	while (idx < kp->input_devs->count) {
+		input_sync(kp->input_devs->dev[idx]);
+		idx++;
+	}
+}
+/*
+ * hrtimer callback that scans the matrix one output row per invocation.
+ *
+ * When current_output equals noutputs a new scan begins and the per-scan
+ * flags are reset.  Otherwise the inputs are sampled for the row that is
+ * currently driven, keys_pressed is updated and the row is deactivated.
+ * While rows remain, the next one is driven and the timer re-armed with
+ * settle_time.  Once the last row has been read, the scan is optionally
+ * debounced, changes are reported, and the timer is re-armed with
+ * poll_time - unless the driver is in IRQ mode with no key pressed, in
+ * which case the input interrupts are re-enabled and the wakeup source is
+ * released.
+ *
+ * Always returns HRTIMER_NORESTART; re-arming is done via hrtimer_start().
+ */
+static enum hrtimer_restart gpio_keypad_timer_func(struct hrtimer *timer)
+{
+ int out, in;
+ int key_index;
+ int gpio;
+ struct gpio_kp *kp = container_of(timer, struct gpio_kp, timer);
+ struct gpio_event_matrix_info *mi = kp->keypad_info;
+ unsigned gpio_keypad_flags = mi->flags;
+ unsigned polarity = !!(gpio_keypad_flags & GPIOKPF_ACTIVE_HIGH); /* level that means "active" */
+
+ out = kp->current_output;
+ if (out == mi->noutputs) { /* start of a new scan */
+ out = 0;
+ kp->last_key_state_changed = kp->key_state_changed;
+ kp->key_state_changed = 0;
+ kp->some_keys_pressed = 0;
+ } else { /* row 'out' has been driven for settle_time: sample its inputs */
+ key_index = out * mi->ninputs;
+ for (in = 0; in < mi->ninputs; in++, key_index++) {
+ gpio = mi->input_gpios[in];
+ if (gpio_get_value(gpio) ^ !polarity) {
+ if (kp->some_keys_pressed < 3) /* saturate: the field is two bits wide */
+ kp->some_keys_pressed++;
+ kp->key_state_changed |= !__test_and_set_bit(
+ key_index, kp->keys_pressed);
+ } else
+ kp->key_state_changed |= __test_and_clear_bit(
+ key_index, kp->keys_pressed);
+ }
+ gpio = mi->output_gpios[out];
+ if (gpio_keypad_flags & GPIOKPF_DRIVE_INACTIVE) /* deactivate the row: drive it inactive, or float it as an input */
+ gpio_set_value(gpio, !polarity);
+ else
+ gpio_direction_input(gpio);
+ out++;
+ }
+ kp->current_output = out;
+ if (out < mi->noutputs) { /* more rows to go: activate the next one and wait for it to settle */
+ gpio = mi->output_gpios[out];
+ if (gpio_keypad_flags & GPIOKPF_DRIVE_INACTIVE)
+ gpio_set_value(gpio, polarity);
+ else
+ gpio_direction_output(gpio, polarity);
+ hrtimer_start(timer, mi->settle_time, HRTIMER_MODE_REL);
+ return HRTIMER_NORESTART;
+ }
+ if (gpio_keypad_flags & GPIOKPF_DEBOUNCE) {
+ if (kp->key_state_changed) { /* state still moving: rescan after debounce_delay before reporting */
+ hrtimer_start(&kp->timer, mi->debounce_delay,
+ HRTIMER_MODE_REL);
+ return HRTIMER_NORESTART;
+ }
+ kp->key_state_changed = kp->last_key_state_changed; /* this scan was quiet: report if the previous one saw a change */
+ }
+ if (kp->key_state_changed) {
+ if (gpio_keypad_flags & GPIOKPF_REMOVE_SOME_PHANTOM_KEYS)
+ remove_phantom_keys(kp);
+ key_index = 0;
+ for (out = 0; out < mi->noutputs; out++)
+ for (in = 0; in < mi->ninputs; in++, key_index++)
+ report_key(kp, key_index, out, in);
+ report_sync(kp);
+ }
+ if (!kp->use_irq || kp->some_keys_pressed) { /* keep polling in polling mode or while any key is held */
+ hrtimer_start(timer, mi->poll_time, HRTIMER_MODE_REL);
+ return HRTIMER_NORESTART;
+ }
+
+ /* No keys are pressed, reenable interrupt */
+ for (out = 0; out < mi->noutputs; out++) { /* drive every row active before the input IRQs are re-enabled */
+ if (gpio_keypad_flags & GPIOKPF_DRIVE_INACTIVE)
+ gpio_set_value(mi->output_gpios[out], polarity);
+ else
+ gpio_direction_output(mi->output_gpios[out], polarity);
+ }
+ for (in = 0; in < mi->ninputs; in++)
+ enable_irq(gpio_to_irq(mi->input_gpios[in]));
+ __pm_relax(&kp->wake_src);
+ return HRTIMER_NORESTART;
+}
+
+/*
+ * Interrupt handler shared by all matrix input lines; dev_id is the
+ * struct gpio_kp.
+ *
+ * Before use_irq is set, the interrupt is only disabled and the fact is
+ * recorded in disabled_irq.  Afterwards every input IRQ is disabled, all
+ * output rows are deactivated, the wakeup source is taken and an immediate
+ * matrix scan is scheduled.  Always returns IRQ_HANDLED.
+ */
+static irqreturn_t gpio_keypad_irq_handler(int irq_in, void *dev_id)
+{
+	struct gpio_kp *kp = dev_id;
+	struct gpio_event_matrix_info *mi = kp->keypad_info;
+	unsigned flags = mi->flags;
+	int n;
+
+	if (!kp->use_irq) {
+		/* ignore interrupt while registering the handler */
+		kp->disabled_irq = 1;
+		disable_irq_nosync(irq_in);
+		return IRQ_HANDLED;
+	}
+
+	n = 0;
+	while (n < mi->ninputs) {
+		disable_irq_nosync(gpio_to_irq(mi->input_gpios[n]));
+		n++;
+	}
+
+	for (n = 0; n < mi->noutputs; n++) {
+		if (!(flags & GPIOKPF_DRIVE_INACTIVE)) {
+			gpio_direction_input(mi->output_gpios[n]);
+			continue;
+		}
+		gpio_set_value(mi->output_gpios[n],
+			       !(flags & GPIOKPF_ACTIVE_HIGH));
+	}
+
+	__pm_stay_awake(&kp->wake_src);
+	hrtimer_start(&kp->timer, ktime_set(0, 0), HRTIMER_MODE_REL);
+	return IRQ_HANDLED;
+}
+/*
+ * Request one interrupt per matrix input GPIO, all sharing
+ * gpio_keypad_irq_handler() with kp as dev_id.  The trigger type is chosen
+ * from the GPIOKPF_ACTIVE_HIGH / GPIOKPF_LEVEL_TRIGGERED_IRQ flags.  Each
+ * IRQ is left disabled on return; the timer callback enables them once a
+ * scan finds no key pressed.
+ *
+ * Returns 0 on success, or the error from gpio_to_irq() / request_irq()
+ * after freeing the IRQs requested so far.  An enable_irq_wake() failure
+ * is only logged.
+ *
+ * NOTE(review): the unwind loop header uses noutputs although it walks
+ * input_gpios[]; the header's initialiser never runs because the loop is
+ * only entered through the labels, but the bound is misleading.  The unwind
+ * also does not call disable_irq_wake() - confirm whether that is intended.
+ */
+static int gpio_keypad_request_irqs(struct gpio_kp *kp)
+{
+ int i;
+ int err;
+ unsigned int irq;
+ unsigned long request_flags;
+ struct gpio_event_matrix_info *mi = kp->keypad_info;
+
+ switch (mi->flags & (GPIOKPF_ACTIVE_HIGH|GPIOKPF_LEVEL_TRIGGERED_IRQ)) {
+ default: /* neither flag set: active low, edge triggered */
+ request_flags = IRQF_TRIGGER_FALLING;
+ break;
+ case GPIOKPF_ACTIVE_HIGH:
+ request_flags = IRQF_TRIGGER_RISING;
+ break;
+ case GPIOKPF_LEVEL_TRIGGERED_IRQ:
+ request_flags = IRQF_TRIGGER_LOW;
+ break;
+ case GPIOKPF_LEVEL_TRIGGERED_IRQ | GPIOKPF_ACTIVE_HIGH:
+ request_flags = IRQF_TRIGGER_HIGH;
+ break;
+ }
+
+ for (i = 0; i < mi->ninputs; i++) {
+ err = irq = gpio_to_irq(mi->input_gpios[i]); /* err keeps the signed result so a negative value can be detected */
+ if (err < 0)
+ goto err_gpio_get_irq_num_failed;
+ err = request_irq(irq, gpio_keypad_irq_handler, request_flags,
+ "gpio_kp", kp);
+ if (err) {
+ pr_err("gpiomatrix: request_irq failed for input %d, "
+ "irq %d\n", mi->input_gpios[i], irq);
+ goto err_request_irq_failed;
+ }
+ err = enable_irq_wake(irq);
+ if (err) {
+ pr_err("gpiomatrix: set_irq_wake failed for input %d, "
+ "irq %d\n", mi->input_gpios[i], irq);
+ }
+ disable_irq(irq);
+ if (kp->disabled_irq) { /* the handler already disabled this IRQ once: undo that extra disable */
+ kp->disabled_irq = 0;
+ enable_irq(irq);
+ }
+ }
+ return 0;
+
+ for (i = mi->noutputs - 1; i >= 0; i--) { /* unwind loop: only entered through the labels below, with i at the failing input */
+ free_irq(gpio_to_irq(mi->input_gpios[i]), kp);
+err_request_irq_failed:
+err_gpio_get_irq_num_failed:
+ ;
+ }
+ return err;
+}
+
+/*
+ * gpio_event handler for a scanned key matrix; dispatches on func.
+ *
+ * input_devs: input devices selected by the upper bits of keymap entries
+ * info:       member of a struct gpio_event_matrix_info (see container_of)
+ * data:       slot holding the struct gpio_kp pointer; written by INIT
+ * func:       GPIO_EVENT_FUNC_SUSPEND / _RESUME (no-ops), _INIT; any other
+ *             value takes the teardown path at the bottom
+ *
+ * INIT validates the platform data, allocates the state with its key
+ * bitmap, declares key capabilities, claims and configures the output and
+ * input GPIOs, tries to request input IRQs (falling back to polling when
+ * that fails) and starts the first scan.  The teardown path releases the
+ * IRQs, timer, wakeup source, GPIOs and state; INIT failures jump into the
+ * tail of the same sequence with i at the failing index.
+ *
+ * Returns 0 on success, or a negative errno when INIT fails.
+ */
+int gpio_event_matrix_func(struct gpio_event_input_devs *input_devs,
+	struct gpio_event_info *info, void **data, int func)
+{
+	int i;
+	int err;
+	int key_count;
+	struct gpio_kp *kp;
+	struct gpio_event_matrix_info *mi;
+
+	mi = container_of(info, struct gpio_event_matrix_info, info);
+	if (func == GPIO_EVENT_FUNC_SUSPEND || func == GPIO_EVENT_FUNC_RESUME) {
+		/* TODO: disable scanning */
+		return 0;
+	}
+
+	if (func == GPIO_EVENT_FUNC_INIT) {
+		if (mi->keymap == NULL ||
+		   mi->input_gpios == NULL ||
+		   mi->output_gpios == NULL) {
+			err = -ENODEV;
+			pr_err("gpiomatrix: Incomplete pdata\n");
+			goto err_invalid_platform_data;
+		}
+		key_count = mi->ninputs * mi->noutputs;
+
+		*data = kp = kzalloc(sizeof(*kp) + sizeof(kp->keys_pressed[0]) *
+				     BITS_TO_LONGS(key_count), GFP_KERNEL);
+		if (kp == NULL) {
+			err = -ENOMEM;
+			pr_err("gpiomatrix: Failed to allocate private data\n");
+			goto err_kp_alloc_failed;
+		}
+		kp->input_devs = input_devs;
+		kp->keypad_info = mi;
+		/* Validate device indices and declare every mapped key. */
+		for (i = 0; i < key_count; i++) {
+			unsigned short keyentry = mi->keymap[i];
+			unsigned short keycode = keyentry & MATRIX_KEY_MASK;
+			unsigned short dev = keyentry >> MATRIX_CODE_BITS;
+			if (dev >= input_devs->count) {
+				pr_err("gpiomatrix: bad device index %d >= "
+					"%d for key code %d\n",
+					dev, input_devs->count, keycode);
+				err = -EINVAL;
+				goto err_bad_keymap;
+			}
+			if (keycode && keycode <= KEY_MAX)
+				input_set_capability(input_devs->dev[dev],
+							EV_KEY, keycode);
+		}
+
+		for (i = 0; i < mi->noutputs; i++) {
+			err = gpio_request(mi->output_gpios[i], "gpio_kp_out");
+			if (err) {
+				pr_err("gpiomatrix: gpio_request failed for "
+					"output %d\n", mi->output_gpios[i]);
+				goto err_request_output_gpio_failed;
+			}
+			/* Outputs are toggled from the hrtimer callback. */
+			if (gpio_cansleep(mi->output_gpios[i])) {
+				pr_err("gpiomatrix: unsupported output gpio %d,"
+					" can sleep\n", mi->output_gpios[i]);
+				err = -EINVAL;
+				goto err_output_gpio_configure_failed;
+			}
+			if (mi->flags & GPIOKPF_DRIVE_INACTIVE)
+				err = gpio_direction_output(mi->output_gpios[i],
+					!(mi->flags & GPIOKPF_ACTIVE_HIGH));
+			else
+				err = gpio_direction_input(mi->output_gpios[i]);
+			if (err) {
+				pr_err("gpiomatrix: gpio_configure failed for "
+					"output %d\n", mi->output_gpios[i]);
+				goto err_output_gpio_configure_failed;
+			}
+		}
+		for (i = 0; i < mi->ninputs; i++) {
+			err = gpio_request(mi->input_gpios[i], "gpio_kp_in");
+			if (err) {
+				pr_err("gpiomatrix: gpio_request failed for "
+					"input %d\n", mi->input_gpios[i]);
+				goto err_request_input_gpio_failed;
+			}
+			err = gpio_direction_input(mi->input_gpios[i]);
+			if (err) {
+				pr_err("gpiomatrix: gpio_direction_input failed"
+					" for input %d\n", mi->input_gpios[i]);
+				goto err_gpio_direction_input_failed;
+			}
+		}
+		/* current_output == noutputs makes the first tick start a scan. */
+		kp->current_output = mi->noutputs;
+		kp->key_state_changed = 1;
+
+		hrtimer_init(&kp->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+		kp->timer.function = gpio_keypad_timer_func;
+		wakeup_source_init(&kp->wake_src, "gpio_kp");
+		/* An IRQ request failure is not fatal: it selects polling mode. */
+		err = gpio_keypad_request_irqs(kp);
+		kp->use_irq = err == 0;
+
+		pr_info("GPIO Matrix Keypad Driver: Start keypad matrix for "
+			"%s%s in %s mode\n", input_devs->dev[0]->name,
+			(input_devs->count > 1) ? "..." : "",
+			kp->use_irq ? "interrupt" : "polling");
+
+		if (kp->use_irq)
+			__pm_stay_awake(&kp->wake_src);
+		hrtimer_start(&kp->timer, ktime_set(0, 0), HRTIMER_MODE_REL);
+
+		return 0;
+	}
+
+	err = 0;
+	kp = *data;
+
+	/*
+	 * One IRQ was requested per *input* GPIO, so walk ninputs entries.
+	 * Using noutputs here would leak IRQs or index past input_gpios[]
+	 * whenever the matrix is not square.
+	 */
+	if (kp->use_irq)
+		for (i = mi->ninputs - 1; i >= 0; i--)
+			free_irq(gpio_to_irq(mi->input_gpios[i]), kp);
+
+	hrtimer_cancel(&kp->timer);
+	wakeup_source_trash(&kp->wake_src);
+	/* Same reasoning: input GPIOs are counted by ninputs. */
+	for (i = mi->ninputs - 1; i >= 0; i--) {
+err_gpio_direction_input_failed:
+		gpio_free(mi->input_gpios[i]);
+err_request_input_gpio_failed:
+		;
+	}
+	for (i = mi->noutputs - 1; i >= 0; i--) {
+err_output_gpio_configure_failed:
+		gpio_free(mi->output_gpios[i]);
+err_request_output_gpio_failed:
+		;
+	}
+err_bad_keymap:
+	kfree(kp);
+err_kp_alloc_failed:
+err_invalid_platform_data:
+	return err;
+}
diff --git a/drivers/input/misc/gpio_output.c b/drivers/input/misc/gpio_output.c
new file mode 100644
index 000000000000..2aac2fad0a17
--- /dev/null
+++ b/drivers/input/misc/gpio_output.c
@@ -0,0 +1,97 @@
+/* drivers/input/misc/gpio_output.c
+ *
+ * Copyright (C) 2007 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/gpio.h>
+#include <linux/gpio_event.h>
+
+/*
+ * Input-event hook for GPIO outputs: events whose type differs from
+ * oi->type are ignored.  Otherwise every keymap entry matching (dev, code)
+ * has its GPIO set to value, inverted when GPIOEDF_ACTIVE_HIGH is not set.
+ * Always returns 0.
+ */
+int gpio_event_output_event(
+	struct gpio_event_input_devs *input_devs, struct gpio_event_info *info,
+	void **data, unsigned int dev, unsigned int type,
+	unsigned int code, int value)
+{
+	struct gpio_event_output_info *oi =
+		container_of(info, struct gpio_event_output_info, info);
+	int level;
+	int idx;
+
+	if (type != oi->type)
+		return 0;
+
+	/* Translate the logical value into the electrical level. */
+	level = (oi->flags & GPIOEDF_ACTIVE_HIGH) ? value : !value;
+
+	for (idx = 0; idx < oi->keymap_size; idx++) {
+		if (dev != oi->keymap[idx].dev || code != oi->keymap[idx].code)
+			continue;
+		gpio_set_value(oi->keymap[idx].gpio, level);
+	}
+	return 0;
+}
+/*
+ * gpio_event handler for GPIO outputs; dispatches on func.
+ *
+ * SUSPEND and RESUME are no-ops.  INIT validates the device index of every
+ * keymap entry, declares the matching capability on the input device, then
+ * claims each GPIO and drives it to the inactive level.  Any other func
+ * value frees all GPIOs; INIT failures jump into that same loop with i at
+ * the failing entry.
+ *
+ * Returns 0 on success, or a negative errno when INIT fails.
+ */
+int gpio_event_output_func(
+ struct gpio_event_input_devs *input_devs, struct gpio_event_info *info,
+ void **data, int func)
+{
+ int ret;
+ int i;
+ struct gpio_event_output_info *oi;
+ oi = container_of(info, struct gpio_event_output_info, info);
+
+ if (func == GPIO_EVENT_FUNC_SUSPEND || func == GPIO_EVENT_FUNC_RESUME)
+ return 0;
+
+ if (func == GPIO_EVENT_FUNC_INIT) {
+ int output_level = !(oi->flags & GPIOEDF_ACTIVE_HIGH); /* inactive level: 1 when active-low, 0 when active-high */
+
+ for (i = 0; i < oi->keymap_size; i++) { /* pass 1: validate device indices and declare capabilities */
+ int dev = oi->keymap[i].dev;
+ if (dev >= input_devs->count) {
+ pr_err("gpio_event_output_func: bad device "
+ "index %d >= %d for key code %d\n",
+ dev, input_devs->count,
+ oi->keymap[i].code);
+ ret = -EINVAL;
+ goto err_bad_keymap;
+ }
+ input_set_capability(input_devs->dev[dev], oi->type,
+ oi->keymap[i].code);
+ }
+
+ for (i = 0; i < oi->keymap_size; i++) { /* pass 2: claim each GPIO and drive it inactive */
+ ret = gpio_request(oi->keymap[i].gpio,
+ "gpio_event_output");
+ if (ret) {
+ pr_err("gpio_event_output_func: gpio_request "
+ "failed for %d\n", oi->keymap[i].gpio);
+ goto err_gpio_request_failed;
+ }
+ ret = gpio_direction_output(oi->keymap[i].gpio,
+ output_level);
+ if (ret) {
+ pr_err("gpio_event_output_func: "
+ "gpio_direction_output failed for %d\n",
+ oi->keymap[i].gpio);
+ goto err_gpio_direction_output_failed;
+ }
+ }
+ return 0;
+ }
+
+ ret = 0; /* teardown path: reached for any func value not handled above */
+ for (i = oi->keymap_size - 1; i >= 0; i--) { /* INIT errors enter through the labels with i at the failing entry */
+err_gpio_direction_output_failed:
+ gpio_free(oi->keymap[i].gpio);
+err_gpio_request_failed:
+ ;
+ }
+err_bad_keymap:
+ return ret;
+}
+
diff --git a/drivers/input/misc/keychord.c b/drivers/input/misc/keychord.c
new file mode 100644
index 000000000000..fdcc14653b64
--- /dev/null
+++ b/drivers/input/misc/keychord.c
@@ -0,0 +1,467 @@
+/*
+ * drivers/input/misc/keychord.c
+ *
+ * Copyright (C) 2008 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+*/
+
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/keychord.h>
+#include <linux/sched.h>
+
+#define KEYCHORD_NAME "keychord"
+#define BUFFER_SIZE 16
+
+MODULE_AUTHOR("Mike Lockwood <lockwood@android.com>");
+MODULE_DESCRIPTION("Key chord input driver");
+MODULE_SUPPORTED_DEVICE("keychord");
+MODULE_LICENSE("GPL");
+
+/*
+ * Step from keychord entry @kc to the entry that follows it in a packed
+ * buffer: skip the fixed header plus kc->count trailing keycodes.
+ * The argument is parenthesized so that any pointer expression can be passed.
+ */
+#define NEXT_KEYCHORD(kc) ((struct input_keychord *) \
+		((char *)(kc) + sizeof(struct input_keychord) + \
+		(kc)->count * sizeof((kc)->keycodes[0])))
+
+/*
+ * Per-open state of the keychord device; allocated in keychord_open() and
+ * stored in file->private_data.
+ */
+struct keychord_device {
+ struct input_handler input_handler;
+ /* non-zero while input_handler is registered with the input core */
+ int registered;
+
+ /* list of keychords to monitor */
+ struct input_keychord *keychords;
+ int keychord_count;
+
+ /* bitmask of keys contained in our keychords */
+ unsigned long keybit[BITS_TO_LONGS(KEY_CNT)];
+ /* current state of the keys */
+ unsigned long keystate[BITS_TO_LONGS(KEY_CNT)];
+ /* number of keys that are currently pressed */
+ int key_down;
+
+ /* second input_device_id is needed for null termination */
+ struct input_device_id device_ids[2];
+
+ /* protects the key state, the keychord list and the id queue below */
+ spinlock_t lock;
+ /* readers sleep here until the id queue is non-empty */
+ wait_queue_head_t waitq;
+ /* ring buffer of matched keychord ids: head is written, tail is read */
+ unsigned char head;
+ unsigned char tail;
+ __u16 buff[BUFFER_SIZE];
+ /* Bit to serialize writes to this device */
+#define KEYCHORD_BUSY 0x01
+ unsigned long flags;
+ /* writers sleep here while KEYCHORD_BUSY is set */
+ wait_queue_head_t write_waitq;
+};
+
+/*
+ * Return 1 when as many keys are down as @keychord lists and every keycode
+ * it lists is currently pressed; return 0 otherwise.
+ */
+static int check_keychord(struct keychord_device *kdev,
+		struct input_keychord *keychord)
+{
+	int idx = 0;
+
+	/* a different number of held keys can never be a match */
+	if (kdev->key_down != keychord->count)
+		return 0;
+
+	while (idx < keychord->count) {
+		if (!test_bit(keychord->keycodes[idx], kdev->keystate))
+			return 0;
+		idx++;
+	}
+
+	/* every listed key is down */
+	return 1;
+}
+
+/*
+ * Input-core event callback. Tracks which keys are down and, on a key press
+ * of a monitored key, queues the id of the first configured keychord that
+ * matches the current key state and wakes up readers.
+ */
+static void keychord_event(struct input_handle *handle, unsigned int type,
+			   unsigned int code, int value)
+{
+	struct keychord_device *kdev = handle->private;
+	struct input_keychord *keychord;
+	unsigned long flags;
+	int remaining, got_chord = 0;
+	int pressed = !!value;
+
+	if (type != EV_KEY || code >= KEY_MAX)
+		return;
+
+	spin_lock_irqsave(&kdev->lock, flags);
+
+	/* nothing to do when the key is already in the reported state */
+	if (!!test_bit(code, kdev->keystate) == pressed)
+		goto done;
+
+	__change_bit(code, kdev->keystate);
+	kdev->key_down += pressed ? 1 : -1;
+
+	/* only a press of a key we monitor can complete a chord */
+	if (!pressed || !test_bit(code, kdev->keybit))
+		goto done;
+
+	keychord = kdev->keychords;
+	if (!keychord)
+		goto done;
+
+	/* walk the packed keychord list looking for the first match */
+	for (remaining = kdev->keychord_count; remaining > 0; remaining--) {
+		if (check_keychord(kdev, keychord)) {
+			kdev->buff[kdev->head] = keychord->id;
+			kdev->head = (kdev->head + 1) % BUFFER_SIZE;
+			got_chord = 1;
+			break;
+		}
+		keychord = NEXT_KEYCHORD(keychord);
+	}
+
+done:
+	spin_unlock_irqrestore(&kdev->lock, flags);
+
+	if (!got_chord)
+		return;
+
+	pr_info("keychord: got keychord id %d. Any tasks: %d\n",
+		keychord->id,
+		!list_empty_careful(&kdev->waitq.task_list));
+	wake_up_interruptible(&kdev->waitq);
+}
+
+/*
+ * Input-core connect callback: attach to @dev if it can generate at least
+ * one of the keycodes used by the configured keychords.
+ *
+ * Returns 0 on success, -ENODEV when the device shares no keycode with us,
+ * -ENOMEM on allocation failure, or the error from the input core.
+ */
+static int keychord_connect(struct input_handler *handler,
+			    struct input_dev *dev,
+			    const struct input_device_id *id)
+{
+	struct keychord_device *kdev =
+		container_of(handler, struct keychord_device, input_handler);
+	struct input_handle *handle;
+	int key, shared = 0;
+	int ret;
+
+	/* look for any keycode present both in our keychords and on @dev */
+	for (key = 0; key < KEY_MAX && !shared; key++)
+		shared = test_bit(key, kdev->keybit) &&
+			 test_bit(key, dev->keybit);
+	if (!shared)
+		return -ENODEV;
+
+	handle = kzalloc(sizeof(*handle), GFP_KERNEL);
+	if (!handle)
+		return -ENOMEM;
+
+	handle->private = kdev;
+	handle->name = KEYCHORD_NAME;
+	handle->handler = handler;
+	handle->dev = dev;
+
+	ret = input_register_handle(handle);
+	if (ret) {
+		kfree(handle);
+		return ret;
+	}
+
+	ret = input_open_device(handle);
+	if (ret) {
+		input_unregister_handle(handle);
+		kfree(handle);
+		return ret;
+	}
+
+	pr_info("keychord: using input dev %s for fevent\n", dev->name);
+	return 0;
+}
+
+/*
+ * Input-core disconnect callback: undo keychord_connect() in reverse order
+ * (close the device, unregister the handle, free it).
+ */
+static void keychord_disconnect(struct input_handle *handle)
+{
+ input_close_device(handle);
+ input_unregister_handle(handle);
+ kfree(handle);
+}
+
+/*
+ * keychord_read is used to read keychord events from the driver
+ *
+ * Copies one queued keychord id (a __u16) to @buffer. Blocks until an id is
+ * available unless the file was opened with O_NONBLOCK.
+ *
+ * Returns sizeof(__u16) on success, -EINVAL if @count is too small for one
+ * id, -EAGAIN if the queue is empty in non-blocking mode, the error from
+ * wait_event_interruptible() if the wait is interrupted, or -EFAULT if the
+ * copy to user space fails.
+ */
+static ssize_t keychord_read(struct file *file, char __user *buffer,
+		size_t count, loff_t *ppos)
+{
+	struct keychord_device *kdev = file->private_data;
+	__u16 id;
+	int retval;
+	unsigned long flags;
+
+	if (count < sizeof(id))
+		return -EINVAL;
+	count = sizeof(id);
+
+	/*
+	 * The emptiness check must be made under the lock that also covers
+	 * the pop. Otherwise two readers can both see a single queued id,
+	 * and the second one pops a stale slot and moves tail past head.
+	 */
+	for (;;) {
+		spin_lock_irqsave(&kdev->lock, flags);
+		if (kdev->head != kdev->tail)
+			break;
+		spin_unlock_irqrestore(&kdev->lock, flags);
+
+		if (file->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+
+		retval = wait_event_interruptible(kdev->waitq,
+				kdev->head != kdev->tail);
+		if (retval)
+			return retval;
+	}
+
+	/* pop a keychord ID off the queue (lock is still held here) */
+	id = kdev->buff[kdev->tail];
+	kdev->tail = (kdev->tail + 1) % BUFFER_SIZE;
+	spin_unlock_irqrestore(&kdev->lock, flags);
+
+	if (copy_to_user(buffer, &id, count))
+		return -EFAULT;
+
+	return count;
+}
+
+/*
+ * Take the per-device write "lock" (the KEYCHORD_BUSY flag), sleeping
+ * interruptibly while another writer holds it. A mutex taken with
+ * mutex_lock_interruptible() would serve this use case equally well.
+ *
+ * Returns 0 once the flag is owned by the caller, or the error from
+ * wait_event_interruptible() if the sleep is interrupted.
+ */
+static int
+keychord_write_lock(struct keychord_device *kdev)
+{
+	unsigned long flags;
+	int ret;
+
+	for (;;) {
+		spin_lock_irqsave(&kdev->lock, flags);
+		if (!(kdev->flags & KEYCHORD_BUSY))
+			break;
+		spin_unlock_irqrestore(&kdev->lock, flags);
+
+		ret = wait_event_interruptible(kdev->write_waitq,
+				((kdev->flags & KEYCHORD_BUSY) == 0));
+		if (ret)
+			return ret;
+	}
+
+	/* spinlock still held: claim the flag before anyone else can */
+	kdev->flags |= KEYCHORD_BUSY;
+	spin_unlock_irqrestore(&kdev->lock, flags);
+	return 0;
+}
+
+/*
+ * Release the write "lock" taken by keychord_write_lock(): clear
+ * KEYCHORD_BUSY under the spinlock, then wake any writer waiting for it.
+ */
+static void
+keychord_write_unlock(struct keychord_device *kdev)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&kdev->lock, flags);
+ kdev->flags &= ~KEYCHORD_BUSY;
+ spin_unlock_irqrestore(&kdev->lock, flags);
+ wake_up_interruptible(&kdev->write_waitq);
+}
+
+/*
+ * keychord_write is used to configure the driver
+ *
+ * The user buffer is a packed sequence of struct input_keychord entries,
+ * each followed by its keycodes. The previous configuration is always
+ * discarded; the new one is installed only if every entry validates.
+ *
+ * Returns @count on success, -EINVAL for a malformed buffer, -ENOMEM or
+ * -EFAULT if the buffer cannot be copied in, the error from
+ * keychord_write_lock() if interrupted, or the error from
+ * input_register_handler().
+ */
+static ssize_t keychord_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ struct keychord_device *kdev = file->private_data;
+ struct input_keychord *keychords = 0;
+ struct input_keychord *keychord;
+ int ret, i, key;
+ unsigned long flags;
+ size_t resid = count;
+ size_t key_bytes;
+
+ if (count < sizeof(struct input_keychord))
+ return -EINVAL;
+ keychords = kzalloc(count, GFP_KERNEL);
+ if (!keychords)
+ return -ENOMEM;
+
+ /* read list of keychords from userspace */
+ if (copy_from_user(keychords, buffer, count)) {
+ kfree(keychords);
+ return -EFAULT;
+ }
+
+ /*
+ * Serialize writes to this device to prevent various races.
+ * 1) writers racing here could do duplicate input_unregister_handler()
+ * calls, resulting in attempting to unlink a node from a list that
+ * does not exist.
+ * 2) writers racing here could do duplicate input_register_handler() calls
+ * below, resulting in a duplicate insertion of a node into the list.
+ * 3) a double kfree of keychords can occur (in the event that
+ * input_register_handler() fails below.
+ */
+ ret = keychord_write_lock(kdev);
+ if (ret) {
+ kfree(keychords);
+ return ret;
+ }
+
+ /* unregister handler before changing configuration */
+ if (kdev->registered) {
+ input_unregister_handler(&kdev->input_handler);
+ kdev->registered = 0;
+ }
+
+ spin_lock_irqsave(&kdev->lock, flags);
+ /* clear any existing configuration */
+ kfree(kdev->keychords);
+ kdev->keychords = 0;
+ kdev->keychord_count = 0;
+ kdev->key_down = 0;
+ memset(kdev->keybit, 0, sizeof(kdev->keybit));
+ memset(kdev->keystate, 0, sizeof(kdev->keystate));
+ kdev->head = kdev->tail = 0;
+
+ keychord = keychords;
+
+ /* validate entries one by one; resid counts the bytes not yet consumed */
+ while (resid > 0) {
+ /* Is the entire keychord entry header present ? */
+ if (resid < sizeof(struct input_keychord)) {
+ pr_err("keychord: Insufficient bytes present for header %zu\n",
+ resid);
+ goto err_unlock_return;
+ }
+ resid -= sizeof(struct input_keychord);
+ /* NOTE(review): if count is an unsigned field this only rejects 0 - confirm */
+ if (keychord->count <= 0) {
+ pr_err("keychord: invalid keycode count %d\n",
+ keychord->count);
+ goto err_unlock_return;
+ }
+ key_bytes = keychord->count * sizeof(keychord->keycodes[0]);
+ /* Do we have all the expected keycodes ? */
+ if (resid < key_bytes) {
+ pr_err("keychord: Insufficient bytes present for keycount %zu\n",
+ resid);
+ goto err_unlock_return;
+ }
+ resid -= key_bytes;
+
+ if (keychord->version != KEYCHORD_VERSION) {
+ pr_err("keychord: unsupported version %d\n",
+ keychord->version);
+ goto err_unlock_return;
+ }
+
+ /* keep track of the keys we are monitoring in keybit */
+ for (i = 0; i < keychord->count; i++) {
+ key = keychord->keycodes[i];
+ if (key < 0 || key >= KEY_CNT) {
+ pr_err("keychord: keycode %d out of range\n",
+ key);
+ goto err_unlock_return;
+ }
+ __set_bit(key, kdev->keybit);
+ }
+
+ kdev->keychord_count++;
+ keychord = NEXT_KEYCHORD(keychord);
+ }
+
+ /* all entries valid: kdev now owns the buffer */
+ kdev->keychords = keychords;
+ spin_unlock_irqrestore(&kdev->lock, flags);
+
+ ret = input_register_handler(&kdev->input_handler);
+ if (ret) {
+ kfree(keychords);
+ kdev->keychords = 0;
+ keychord_write_unlock(kdev);
+ return ret;
+ }
+ kdev->registered = 1;
+
+ keychord_write_unlock(kdev);
+
+ return count;
+
+ /*
+ * Validation failed with the spinlock held. kdev->keychords is already
+ * NULL here, so the new buffer is still owned by this function.
+ */
+err_unlock_return:
+ spin_unlock_irqrestore(&kdev->lock, flags);
+ kfree(keychords);
+ keychord_write_unlock(kdev);
+ return -EINVAL;
+}
+
+/*
+ * poll() support: the device is readable whenever the keychord id queue is
+ * non-empty.
+ */
+static unsigned int keychord_poll(struct file *file, poll_table *wait)
+{
+	struct keychord_device *kdev = file->private_data;
+	unsigned int mask = 0;
+
+	poll_wait(file, &kdev->waitq, wait);
+
+	if (kdev->head != kdev->tail)
+		mask = POLLIN | POLLRDNORM;
+
+	return mask;
+}
+
+/*
+ * open() handler: allocate a fresh keychord_device for this file and fill in
+ * the input handler callbacks. The handler is not registered here.
+ *
+ * Returns 0 on success or -ENOMEM.
+ */
+static int keychord_open(struct inode *inode, struct file *file)
+{
+	struct keychord_device *kdev = kzalloc(sizeof(*kdev), GFP_KERNEL);
+	struct input_handler *handler;
+
+	if (!kdev)
+		return -ENOMEM;
+
+	init_waitqueue_head(&kdev->write_waitq);
+	init_waitqueue_head(&kdev->waitq);
+	spin_lock_init(&kdev->lock);
+
+	/* match every input device that reports EV_KEY events */
+	kdev->device_ids[0].flags = INPUT_DEVICE_ID_MATCH_EVBIT;
+	__set_bit(EV_KEY, kdev->device_ids[0].evbit);
+
+	handler = &kdev->input_handler;
+	handler->name = KEYCHORD_NAME;
+	handler->id_table = kdev->device_ids;
+	handler->connect = keychord_connect;
+	handler->disconnect = keychord_disconnect;
+	handler->event = keychord_event;
+
+	file->private_data = kdev;
+
+	return 0;
+}
+
+/*
+ * release() handler: unregister the input handler if it is registered, then
+ * free the keychord list and the device state allocated in keychord_open().
+ */
+static int keychord_release(struct inode *inode, struct file *file)
+{
+ struct keychord_device *kdev = file->private_data;
+
+ if (kdev->registered)
+ input_unregister_handler(&kdev->input_handler);
+ kfree(kdev->keychords);
+ kfree(kdev);
+
+ return 0;
+}
+
+/* file operations of the keychord misc device */
+static const struct file_operations keychord_fops = {
+ .owner = THIS_MODULE,
+ .open = keychord_open,
+ .release = keychord_release,
+ .read = keychord_read,
+ .write = keychord_write,
+ .poll = keychord_poll,
+};
+
+/* misc device named KEYCHORD_NAME with a dynamically assigned minor */
+static struct miscdevice keychord_misc = {
+ .fops = &keychord_fops,
+ .name = KEYCHORD_NAME,
+ .minor = MISC_DYNAMIC_MINOR,
+};
+
+/* module init: register the misc device */
+static int __init keychord_init(void)
+{
+ return misc_register(&keychord_misc);
+}
+
+/* module exit: remove the misc device */
+static void __exit keychord_exit(void)
+{
+ misc_deregister(&keychord_misc);
+}
+
+module_init(keychord_init);
+module_exit(keychord_exit);
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 197e29d1c2e6..e7b8f49e060f 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -459,6 +459,21 @@ config DM_VERITY
If unsure, say N.
+config DM_VERITY_HASH_PREFETCH_MIN_SIZE_128
+ bool "Prefetch size 128"
+
+config DM_VERITY_HASH_PREFETCH_MIN_SIZE
+ int "Verity hash prefetch minimum size"
+ depends on DM_VERITY
+ range 1 4096
+ default 128 if DM_VERITY_HASH_PREFETCH_MIN_SIZE_128
+ default 1
+ ---help---
+ This sets the minimum number of hash blocks to prefetch for dm-verity.
+ For devices like eMMC, a larger prefetch size such as 128 can improve
+ performance, at the cost of increased memory consumption for keeping
+ more hashes in RAM.
+
config DM_VERITY_FEC
bool "Verity forward error correction support"
depends on DM_VERITY
@@ -501,4 +516,22 @@ config DM_LOG_WRITES
If unsure, say N.
+config DM_ANDROID_VERITY
+ bool "Android verity target support"
+ depends on DM_VERITY=y
+ depends on X509_CERTIFICATE_PARSER
+ depends on SYSTEM_TRUSTED_KEYRING
+ depends on PUBLIC_KEY_ALGO_RSA
+ depends on KEYS
+ depends on ASYMMETRIC_KEY_TYPE
+ depends on ASYMMETRIC_PUBLIC_KEY_SUBTYPE
+ depends on MD_LINEAR=y
+ select DM_VERITY_HASH_PREFETCH_MIN_SIZE_128
+ ---help---
+ This device-mapper target is virtually a VERITY target. This
+ target is set up by reading the metadata contents piggybacked
+ onto the actual data blocks in the block device. The signature
+ of the metadata contents is verified against the key included
+ in the system keyring. Upon success, the underlying verity
+ target is set up.
endif # MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 3cbda1af87a0..f26ce41af389 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -59,6 +59,7 @@ obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o
obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o
obj-$(CONFIG_DM_ERA) += dm-era.o
obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o
+obj-$(CONFIG_DM_ANDROID_VERITY) += dm-android-verity.o
ifeq ($(CONFIG_DM_UEVENT),y)
dm-mod-objs += dm-uevent.o
diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c
new file mode 100644
index 000000000000..eb4bdf66087c
--- /dev/null
+++ b/drivers/md/dm-android-verity.c
@@ -0,0 +1,947 @@
+/*
+ * Copyright (C) 2015 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/device-mapper.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/fcntl.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/key.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/of.h>
+#include <linux/reboot.h>
+#include <linux/string.h>
+#include <linux/vmalloc.h>
+
+#include <asm/setup.h>
+#include <crypto/hash.h>
+#include <crypto/public_key.h>
+#include <crypto/sha.h>
+#include <keys/asymmetric-type.h>
+#include <keys/system_keyring.h>
+
+#include "dm-verity.h"
+#include "dm-android-verity.h"
+
+/* values captured from the kernel command line by the __setup handlers */
+static char verifiedbootstate[VERITY_COMMANDLINE_PARAM_LENGTH];
+static char veritymode[VERITY_COMMANDLINE_PARAM_LENGTH];
+static char veritykeyid[VERITY_DEFAULT_KEY_ID_LENGTH];
+static char buildvariant[BUILD_VARIANT];
+
+/* set once the target has been added as a linear device */
+static bool target_added;
+/* cleared when the target falls back to a plain linear mapping */
+static bool verity_enabled = true;
+struct dentry *debug_dir;
+static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv);
+
+/*
+ * The android-verity target. It starts out with the verity callbacks;
+ * add_as_linear_device() overwrites them with the dm-linear ones.
+ */
+static struct target_type android_verity_target = {
+ .name = "android-verity",
+ .version = {1, 0, 0},
+ .module = THIS_MODULE,
+ .ctr = android_verity_ctr,
+ .dtr = verity_dtr,
+ .map = verity_map,
+ .status = verity_status,
+ .prepare_ioctl = verity_prepare_ioctl,
+ .iterate_devices = verity_iterate_devices,
+ .io_hints = verity_io_hints,
+};
+
+/* store the value of "androidboot.verifiedbootstate=" (truncated to fit) */
+static int __init verified_boot_state_param(char *line)
+{
+ strlcpy(verifiedbootstate, line, sizeof(verifiedbootstate));
+ return 1;
+}
+
+__setup("androidboot.verifiedbootstate=", verified_boot_state_param);
+
+/* store the value of "androidboot.veritymode=" (truncated to fit) */
+static int __init verity_mode_param(char *line)
+{
+ strlcpy(veritymode, line, sizeof(veritymode));
+ return 1;
+}
+
+__setup("androidboot.veritymode=", verity_mode_param);
+
+/* store the value of "veritykeyid=" (truncated to fit) */
+static int __init verity_keyid_param(char *line)
+{
+ strlcpy(veritykeyid, line, sizeof(veritykeyid));
+ return 1;
+}
+
+__setup("veritykeyid=", verity_keyid_param);
+
+/* store the value of "buildvariant=" (truncated to fit) */
+static int __init verity_buildvariant(char *line)
+{
+ strlcpy(buildvariant, line, sizeof(buildvariant));
+ return 1;
+}
+
+__setup("buildvariant=", verity_buildvariant);
+
+/* True when a non-empty key id was captured from "veritykeyid=". */
+static inline bool default_verity_key_id(void)
+{
+	return *veritykeyid != '\0';
+}
+
+/*
+ * The three predicates below compare sizeof(literal) bytes, i.e. including
+ * the terminating NUL, so each one is an exact match rather than a prefix
+ * match.
+ */
+
+/* True when buildvariant is "eng". */
+static inline bool is_eng(void)
+{
+	static const char typeeng[] = "eng";
+
+	return strncmp(buildvariant, typeeng, sizeof(typeeng)) == 0;
+}
+
+/* True when buildvariant is "userdebug". */
+static inline bool is_userdebug(void)
+{
+	static const char typeuserdebug[] = "userdebug";
+
+	return strncmp(buildvariant, typeuserdebug,
+			sizeof(typeuserdebug)) == 0;
+}
+
+/* True when verifiedbootstate is "orange". */
+static inline bool is_unlocked(void)
+{
+	static const char unlocked[] = "orange";
+
+	return strncmp(verifiedbootstate, unlocked, sizeof(unlocked)) == 0;
+}
+
+/*
+ * Convert the raw signature bytes in @data (@len bytes) into an MPI and
+ * store it as the only MPI of @pks.
+ *
+ * Returns 0 on success or -ENOMEM if the MPI cannot be created.
+ */
+static int table_extract_mpi_array(struct public_key_signature *pks,
+				const void *data, size_t len)
+{
+	MPI sig = mpi_read_raw_data(data, len);
+
+	if (sig) {
+		pks->mpi[0] = sig;
+		pks->nr_mpi = 1;
+		return 0;
+	}
+
+	DMERR("Error while allocating mpi array");
+	return -ENOMEM;
+}
+
+/*
+ * Hash @table (@table_len bytes) with algorithm @hash and return a newly
+ * allocated public_key_signature whose ->digest holds the result.
+ *
+ * Returns the signature object (to be released with kfree()) or an ERR_PTR.
+ */
+static struct public_key_signature *table_make_digest(
+						enum hash_algo hash,
+						const void *table,
+						unsigned long table_len)
+{
+	struct public_key_signature *pks = NULL;
+	struct crypto_shash *tfm;
+	struct shash_desc *desc;
+	size_t digest_size, desc_size;
+	int ret;
+
+	/* Allocate the hashing algorithm we're going to need and find out how
+	 * big the hash operational data will be.
+	 */
+	tfm = crypto_alloc_shash(hash_algo_name[hash], 0, 0);
+	if (IS_ERR(tfm))
+		return ERR_CAST(tfm);
+
+	digest_size = crypto_shash_digestsize(tfm);
+	desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
+
+	/* One allocation, laid out as: signature struct, then the hash
+	 * operational data, then the digest output buffer.
+	 */
+	pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL);
+	if (!pks) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	pks->pkey_hash_algo = hash;
+	pks->digest_size = digest_size;
+	pks->digest = (u8 *)pks + sizeof(*pks) + desc_size;
+
+	desc = (struct shash_desc *)(pks + 1);
+	desc->tfm = tfm;
+	desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+	ret = crypto_shash_init(desc);
+	if (ret >= 0)
+		ret = crypto_shash_finup(desc, table, table_len, pks->digest);
+
+out:
+	/* the transform is only needed while hashing */
+	crypto_free_shash(tfm);
+	if (ret < 0) {
+		kfree(pks);
+		return ERR_PTR(ret);
+	}
+	return pks;
+}
+
+/*
+ * Synchronously read @length bytes (rounded up to whole pages) from @bdev
+ * starting at sector @offset.
+ *
+ * On success the data pages are left in @payload->page_io, with
+ * @payload->number_of_pages entries, and the caller must free both the pages
+ * and the array. On failure everything allocated here is released.
+ *
+ * Returns 0 on success, -ENOMEM or -EIO on failure.
+ */
+static int read_block_dev(struct bio_read *payload, struct block_device *bdev,
+		sector_t offset, int length)
+{
+	struct bio *bio;
+	struct page *page;
+	int err = 0, i;
+
+	payload->number_of_pages = DIV_ROUND_UP(length, PAGE_SIZE);
+
+	bio = bio_alloc(GFP_KERNEL, payload->number_of_pages);
+	if (!bio) {
+		DMERR("Error while allocating bio");
+		return -ENOMEM;
+	}
+
+	bio->bi_iter.bi_sector = offset;
+	bio->bi_bdev = bdev;
+
+	/* zeroed, so the cleanup loop can tell which slots were filled */
+	payload->page_io = kzalloc(sizeof(struct page *) *
+		payload->number_of_pages, GFP_KERNEL);
+	if (!payload->page_io) {
+		DMERR("page_io array alloc failed");
+		err = -ENOMEM;
+		goto free_bio;
+	}
+
+	for (i = 0; i < payload->number_of_pages; i++) {
+		page = alloc_page(GFP_KERNEL);
+		payload->page_io[i] = page;
+		if (!page) {
+			DMERR("alloc_page failed");
+			err = -ENOMEM;
+			goto free_pages;
+		}
+		if (!bio_add_page(bio, page, PAGE_SIZE, 0)) {
+			DMERR("bio_add_page error");
+			err = -EIO;
+			goto free_pages;
+		}
+	}
+
+	if (submit_bio_wait(READ, bio)) {
+		DMERR("bio read failed");
+		err = -EIO;
+		goto free_pages;
+	}
+
+	/* success: keep the pages for the caller, drop only the bio */
+	goto free_bio;
+
+free_pages:
+	for (i = 0; i < payload->number_of_pages; i++)
+		if (payload->page_io[i])
+			__free_page(payload->page_io[i]);
+	kfree(payload->page_io);
+free_bio:
+	bio_put(bio);
+	return err;
+}
+
+/*
+ * Return @x divided by @y, rounded up.
+ *
+ * The division is a separate statement on purpose: C does not order the
+ * operands of '+', so reading the remainder in the same expression as the
+ * call that stores it could read it before it is written.
+ */
+static inline u64 fec_div_round_up(u64 x, u64 y)
+{
+	u64 remainder;
+	u64 quotient = div64_u64_rem(x, y, &remainder);
+
+	return quotient + (remainder > 0 ? 1 : 0);
+}
+
+/*
+ * Fill @ecc from the on-disk FEC @header: start offset, number of
+ * FEC_BLOCK_SIZE blocks covering inp_size (rounded up), and root count.
+ */
+static inline void populate_fec_metadata(struct fec_header *header,
+				struct fec_ecc_metadata *ecc)
+{
+	u64 inp_size = le64_to_cpu(header->inp_size);
+
+	ecc->start = inp_size;
+	ecc->blocks = fec_div_round_up(inp_size, FEC_BLOCK_SIZE);
+	ecc->roots = le32_to_cpu(header->roots);
+}
+
+/*
+ * Sanity-check an on-disk FEC header: magic, version, header size, and a
+ * root count in the open range (0, FEC_RSM).
+ *
+ * Returns 0 when the header looks valid, -EINVAL otherwise.
+ */
+static inline int validate_fec_header(struct fec_header *header, u64 offset)
+{
+	u32 roots = le32_to_cpu(header->roots);
+
+	/* Align the offset down to a FEC block so the same check would work
+	 * for the backup header. NOTE(review): none of the checks below read
+	 * the result.
+	 */
+	offset -= offset % FEC_BLOCK_SIZE;
+
+	if (le32_to_cpu(header->magic) != FEC_MAGIC)
+		return -EINVAL;
+	if (le32_to_cpu(header->version) != FEC_VERSION)
+		return -EINVAL;
+	if (le32_to_cpu(header->size) != sizeof(struct fec_header))
+		return -EINVAL;
+	if (roots == 0 || roots >= FEC_RSM)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Read the last FEC_BLOCK_SIZE bytes of @dev and look for a FEC header,
+ * first at the start of that block and then, as a backup, at its very end.
+ *
+ * @fec receives the last header copy examined. @ecc->valid says whether a
+ * valid header was found; the remaining @ecc fields are filled only then.
+ *
+ * Returns 0 when the block was read (even if no valid header was found),
+ * or an error from opening or reading the device.
+ */
+static int extract_fec_header(dev_t dev, struct fec_header *fec,
+				struct fec_ecc_metadata *ecc)
+{
+	u64 device_size, block_start;
+	struct bio_read payload;
+	struct block_device *bdev;
+	void *block;
+	int i, err = 0;
+
+	bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);
+
+	if (IS_ERR_OR_NULL(bdev)) {
+		DMERR("bdev get error");
+		return PTR_ERR(bdev);
+	}
+
+	device_size = i_size_read(bdev->bd_inode);
+	block_start = device_size - FEC_BLOCK_SIZE;
+
+	/* fec metadata size is a power of 2 and PAGE_SIZE
+	 * is a power of 2 as well.
+	 */
+	BUG_ON(FEC_BLOCK_SIZE > PAGE_SIZE);
+	/* 512 byte sector alignment */
+	BUG_ON((block_start % (1 << SECTOR_SHIFT)) != 0);
+
+	err = read_block_dev(&payload, bdev,
+			block_start / (1 << SECTOR_SHIFT), FEC_BLOCK_SIZE);
+	if (err) {
+		DMERR("Error while reading verity metadata");
+		goto error;
+	}
+
+	BUG_ON(sizeof(struct fec_header) > PAGE_SIZE);
+	block = page_address(payload.page_io[0]);
+
+	/* primary header: start of the last FEC block */
+	memcpy(fec, block, sizeof(*fec));
+	ecc->valid = true;
+
+	if (validate_fec_header(fec, block_start)) {
+		/* Try the backup header */
+		memcpy(fec, block + FEC_BLOCK_SIZE - sizeof(*fec),
+			sizeof(*fec));
+		if (validate_fec_header(fec, device_size -
+				sizeof(struct fec_header)))
+			ecc->valid = false;
+	}
+
+	if (ecc->valid)
+		populate_fec_metadata(fec, ecc);
+
+	for (i = 0; i < payload.number_of_pages; i++)
+		__free_page(payload.page_io[i]);
+	kfree(payload.page_io);
+
+error:
+	blkdev_put(bdev, FMODE_READ);
+	return err;
+}
+/*
+ * Compute the byte offset of the verity metadata on @bdev. With a FEC header
+ * present (magic matches) the metadata ends at fec->inp_size; otherwise it
+ * ends at the end of the device.
+ */
+static void find_metadata_offset(struct fec_header *fec,
+		struct block_device *bdev, u64 *metadata_offset)
+{
+	u64 device_size = i_size_read(bdev->bd_inode);
+	u64 metadata_end;
+
+	if (le32_to_cpu(fec->magic) == FEC_MAGIC)
+		metadata_end = le64_to_cpu(fec->inp_size);
+	else
+		metadata_end = device_size;
+
+	*metadata_offset = metadata_end - VERITY_METADATA_SIZE;
+}
+
+/*
+ * Store the size of block device @dev, in sectors, in @device_size.
+ *
+ * Returns 0 on success or the error from opening the device.
+ */
+static int find_size(dev_t dev, u64 *device_size)
+{
+	struct block_device *bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);
+
+	if (IS_ERR_OR_NULL(bdev)) {
+		DMERR("blkdev_get_by_dev failed");
+		return PTR_ERR(bdev);
+	}
+
+	/* bytes -> sectors */
+	*device_size = i_size_read(bdev->bd_inode);
+	*device_size >>= SECTOR_SHIFT;
+
+	DMINFO("blkdev size in sectors: %llu", *device_size);
+	blkdev_put(bdev, FMODE_READ);
+	return 0;
+}
+
+/*
+ * Check the magic number and protocol version of a verity metadata header.
+ *
+ * Returns 0 for a supported header, VERITY_STATE_DISABLE when the header
+ * carries the "disable" magic on a userdebug build, and -EINVAL otherwise.
+ */
+static int verify_header(struct android_metadata_header *header)
+{
+	u32 magic = le32_to_cpu(header->magic_number);
+	u32 version = le32_to_cpu(header->protocol_version);
+
+	/* the disable magic is honoured only on userdebug builds */
+	if (is_userdebug() && magic == VERITY_METADATA_MAGIC_DISABLE)
+		return VERITY_STATE_DISABLE;
+
+	/* NOTE(review): the second clause looks redundant if the two magic
+	 * values differ - confirm before simplifying.
+	 */
+	if (!(magic == VERITY_METADATA_MAGIC_NUMBER) ||
+			(magic == VERITY_METADATA_MAGIC_DISABLE)) {
+		DMERR("Incorrect magic number");
+		return -EINVAL;
+	}
+
+	if (version != VERITY_METADATA_VERSION) {
+		DMERR("Unsupported version %u", version);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Read the Android verity metadata block from @dev and hand its header and
+ * verity table back through @metadata.
+ *
+ * @dev:            device holding the metadata
+ * @fec:            FEC header, used to locate the metadata
+ * @metadata:       on success receives a newly allocated android_metadata
+ *                  whose ->header and ->verity_table are allocated here too.
+ *                  If the object is allocated and then rolled back, this is
+ *                  reset to NULL so the caller never sees a freed pointer.
+ *                  Failures before that allocation leave it untouched.
+ * @verity_enabled: set to false when verify_header() reports
+ *                  VERITY_STATE_DISABLE; otherwise left untouched
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int extract_metadata(dev_t dev, struct fec_header *fec,
+				struct android_metadata **metadata,
+				bool *verity_enabled)
+{
+	struct block_device *bdev;
+	struct android_metadata_header *header;
+	int i;
+	u32 table_length, remaining, copy_length, offset, page_offset;
+	u64 metadata_offset;
+	struct bio_read payload;
+	int err = 0;
+
+	bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);
+
+	if (IS_ERR_OR_NULL(bdev)) {
+		DMERR("blkdev_get_by_dev failed");
+		return -ENODEV;
+	}
+
+	find_metadata_offset(fec, bdev, &metadata_offset);
+
+	/* Verity metadata size is a power of 2 and PAGE_SIZE
+	 * is a power of 2 as well.
+	 * PAGE_SIZE is also a multiple of 512 bytes.
+	 */
+	if (VERITY_METADATA_SIZE > PAGE_SIZE)
+		BUG_ON(VERITY_METADATA_SIZE % PAGE_SIZE != 0);
+	/* 512 byte sector alignment */
+	BUG_ON(metadata_offset % (1 << SECTOR_SHIFT) != 0);
+
+	err = read_block_dev(&payload, bdev, metadata_offset /
+		(1 << SECTOR_SHIFT), VERITY_METADATA_SIZE);
+	if (err) {
+		DMERR("Error while reading verity metadata");
+		goto blkdev_release;
+	}
+
+	header = kzalloc(sizeof(*header), GFP_KERNEL);
+	if (!header) {
+		DMERR("kzalloc failed for header");
+		err = -ENOMEM;
+		goto free_payload;
+	}
+
+	memcpy(header, page_address(payload.page_io[0]),
+		sizeof(*header));
+
+	DMINFO("bio magic_number:%u protocol_version:%d table_length:%u",
+		le32_to_cpu(header->magic_number),
+		le32_to_cpu(header->protocol_version),
+		le32_to_cpu(header->table_length));
+
+	err = verify_header(header);
+
+	if (err == VERITY_STATE_DISABLE) {
+		DMERR("Mounting root with verity disabled");
+		*verity_enabled = false;
+		/* we would still have to read the metadata to figure out
+		 * the data blocks size. Or may be could map the entire
+		 * partition similar to mounting the device.
+		 *
+		 * Reset error as well as the verity_enabled flag is changed.
+		 */
+		err = 0;
+	} else if (err)
+		goto free_header;
+
+	*metadata = kzalloc(sizeof(**metadata), GFP_KERNEL);
+	if (!*metadata) {
+		DMERR("kzalloc for metadata failed");
+		err = -ENOMEM;
+		goto free_header;
+	}
+
+	(*metadata)->header = header;
+	table_length = le32_to_cpu(header->table_length);
+
+	if (table_length == 0 ||
+		table_length > (VERITY_METADATA_SIZE -
+			sizeof(struct android_metadata_header))) {
+		DMERR("table_length too long");
+		err = -EINVAL;
+		goto free_metadata;
+	}
+
+	(*metadata)->verity_table = kzalloc(table_length + 1, GFP_KERNEL);
+
+	if (!(*metadata)->verity_table) {
+		DMERR("kzalloc verity_table failed");
+		err = -ENOMEM;
+		goto free_metadata;
+	}
+
+	/*
+	 * The table follows the header in the first page and may continue
+	 * into later pages. Copy it page by page using a separate countdown,
+	 * so that table_length still holds the full length when the
+	 * terminator is written below.
+	 */
+	remaining = table_length;
+	offset = 0;
+	page_offset = sizeof(struct android_metadata_header);
+	for (i = 0; remaining != 0; i++) {
+		copy_length = PAGE_SIZE - page_offset;
+		if (copy_length > remaining)
+			copy_length = remaining;
+		memcpy((*metadata)->verity_table + offset,
+			page_address(payload.page_io[i]) + page_offset,
+			copy_length);
+		offset += copy_length;
+		remaining -= copy_length;
+		/* only the first page starts with the header */
+		page_offset = 0;
+	}
+	(*metadata)->verity_table[table_length] = '\0';
+
+	DMINFO("verity_table: %s", (*metadata)->verity_table);
+	/* success: header and table are now owned by *metadata */
+	goto free_payload;
+
+free_metadata:
+	kfree(*metadata);
+	/* do not leave the caller holding a freed pointer */
+	*metadata = NULL;
+free_header:
+	kfree(header);
+free_payload:
+	for (i = 0; i < payload.number_of_pages; i++)
+		if (payload.page_io[i])
+			__free_page(payload.page_io[i]);
+	kfree(payload.page_io);
+blkdev_release:
+	blkdev_put(bdev, FMODE_READ);
+	return err;
+}
+
+/*
+ * Look up property @name of the "/firmware/android" device tree node.
+ * Returns the property value, or NULL when the node or the property does
+ * not exist.
+ */
+const char *find_dt_value(const char *name)
+{
+	const char *value = NULL;
+	struct device_node *firmware =
+		of_find_node_by_path("/firmware/android");
+
+	if (firmware) {
+		value = of_get_property(firmware, name, NULL);
+		of_node_put(firmware);
+	}
+
+	return value;
+}
+
+/*
+ * Decide how verity errors are handled. The "veritymode" device tree
+ * property takes precedence over the androidboot.veritymode= command line
+ * value. A value starting with "enforcing" selects restart mode, anything
+ * else selects EIO mode.
+ */
+static int verity_mode(void)
+{
+	static const char verified_mode_prop[] = "veritymode";
+	static const char enforcing[] = "enforcing";
+	const char *value = find_dt_value(verified_mode_prop);
+
+	if (!value)
+		value = veritymode;
+
+	/* sizeof() - 1 excludes the NUL, so this is a prefix match */
+	return strncmp(value, enforcing, sizeof(enforcing) - 1) == 0 ?
+		DM_VERITY_MODE_RESTART : DM_VERITY_MODE_EIO;
+}
+
+/*
+ * Verify the signature stored in @metadata's header over the SHA-256 digest
+ * of the verity table, using the asymmetric key @key_id from the system
+ * trusted keyring.
+ *
+ * Returns the result of verify_signature(), -ENOKEY when the key is not
+ * found, or the error from hashing or MPI extraction.
+ */
+static int verify_verity_signature(char *key_id,
+		struct android_metadata *metadata)
+{
+	struct public_key_signature *pks;
+	struct key *key;
+	key_ref_t key_ref;
+	int retval;
+
+	key_ref = keyring_search(make_key_ref(system_trusted_keyring, 1),
+		&key_type_asymmetric, key_id);
+	if (IS_ERR(key_ref)) {
+		DMERR("keyring: key not found");
+		return -ENOKEY;
+	}
+	key = key_ref_to_ptr(key_ref);
+
+	pks = table_make_digest(HASH_ALGO_SHA256,
+			(const void *)metadata->verity_table,
+			le32_to_cpu(metadata->header->table_length));
+	if (IS_ERR(pks)) {
+		DMERR("hashing failed");
+		key_put(key);
+		return PTR_ERR(pks);
+	}
+
+	retval = table_extract_mpi_array(pks, &metadata->header->signature[0],
+				RSANUMBYTES);
+	if (retval < 0) {
+		DMERR("Error extracting mpi %d", retval);
+	} else {
+		retval = verify_signature(key, pks);
+		/* release the MPI created by table_extract_mpi_array() */
+		mpi_free(pks->rsa.s);
+	}
+
+	kfree(pks);
+	key_put(key);
+
+	return retval;
+}
+
+/*
+ * React to a fatal setup error according to the verity mode: restart the
+ * machine in restart mode, otherwise only log the failure.
+ */
+static void handle_error(void)
+{
+	if (verity_mode() != DM_VERITY_MODE_RESTART) {
+		DMERR("Mounting verity root failed");
+		return;
+	}
+
+	DMERR("triggering restart");
+	kernel_restart("dm-verity device corrupted");
+}
+
+/*
+ * Return true when @a * @b would not fit in a sector_t, i.e. when @a is
+ * larger than the largest sector_t value divided by @b.
+ */
+static inline bool test_mult_overflow(sector_t a, u32 b)
+{
+	sector_t limit = (sector_t)~0ULL;
+
+	/* sector_div() leaves the quotient in its first argument */
+	sector_div(limit, b);
+	return limit < a;
+}
+
+/*
+ * Turn the android-verity target into a plain linear mapping of @dev: the
+ * target callbacks are replaced with the dm-linear ones, the mapped disk is
+ * made writable, and the linear constructor is run on @ti.
+ *
+ * Sets target_added on success. Returns 0 or the error from dm_linear_ctr().
+ */
+static int add_as_linear_device(struct dm_target *ti, char *dev)
+{
+	/*Move to linear mapping defines*/
+	char *linear_table_args[DM_LINEAR_ARGS] = {dev,
+					DM_LINEAR_TARGET_OFFSET};
+	int err = 0;
+
+	/* each assignment is its own statement (no comma operators) */
+	android_verity_target.dtr = dm_linear_dtr;
+	android_verity_target.map = dm_linear_map;
+	android_verity_target.status = dm_linear_status;
+	android_verity_target.prepare_ioctl = dm_linear_prepare_ioctl;
+	android_verity_target.iterate_devices = dm_linear_iterate_devices;
+	android_verity_target.direct_access = dm_linear_direct_access;
+	android_verity_target.io_hints = NULL;
+
+	set_disk_ro(dm_disk(dm_table_get_md(ti->table)), 0);
+
+	err = dm_linear_ctr(ti, DM_LINEAR_ARGS, linear_table_args);
+
+	if (!err) {
+		DMINFO("Added android-verity as a linear target");
+		target_added = true;
+	} else
+		DMERR("Failed to add android-verity as linear target");
+
+	return err;
+}
+
+/*
+ * Map the whole of @dev through @ti as a linear target and clear
+ * verity_enabled. Any failure is reported through handle_error().
+ *
+ * Returns 0 on success or the error from find_size() or
+ * add_as_linear_device().
+ */
+static int create_linear_device(struct dm_target *ti, dev_t dev,
+				char *target_device)
+{
+	u64 device_size = 0;
+	int err;
+
+	err = find_size(dev, &device_size);
+	if (err) {
+		DMERR("error finding bdev size");
+		goto fail;
+	}
+
+	ti->len = device_size;
+	err = add_as_linear_device(ti, target_device);
+	if (err)
+		goto fail;
+
+	verity_enabled = false;
+	return 0;
+
+fail:
+	handle_error();
+	return err;
+}
+
+/*
+ * Constructor for the android-verity target.
+ *
+ * Target parameters, in the order they are consumed from argv:
+ *	<block device>	The block device for which dm-verity is being setup.
+ *	<key id>	Optional.  Key id of the public key in the system
+ *			keyring.  Verity metadata's signature would be
+ *			verified against this.  If the key id contains spaces,
+ *			replace them with '#'.  When omitted, the default key
+ *			id (veritykeyid) is used.
+ *
+ * Depending on the build variant, the lock state and the on-disk metadata the
+ * device ends up either as a plain linear mapping or as a verity target whose
+ * table is taken from the metadata (plus optional FEC / mode arguments).
+ *
+ * Returns 0 on success and a non-zero error code otherwise.
+ */
+static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+	dev_t uninitialized_var(dev);
+	struct android_metadata *metadata = NULL;
+	int err = 0, i, mode;
+	char *key_id = NULL, *table_ptr, dummy, *target_device,
+	*verity_table_args[VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS];
+	/* One for specifying number of opt args and one for mode */
+	sector_t data_sectors;
+	u32 data_block_size;
+	unsigned int no_of_args = VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS;
+	struct fec_header uninitialized_var(fec);
+	struct fec_ecc_metadata uninitialized_var(ecc);
+	char buf[FEC_ARG_LENGTH], *buf_ptr;
+	unsigned long long tmpll;
+
+	if (argc == 1) {
+		/* Use the default keyid */
+		if (default_verity_key_id())
+			key_id = veritykeyid;
+		else if (!is_eng()) {
+			DMERR("veritykeyid= is not set");
+			handle_error();
+			return -EINVAL;
+		}
+		/* eng builds without a key fall through: key_id is unused there */
+	} else if (argc == 2)
+		key_id = argv[1];
+	else {
+		DMERR("Incorrect number of arguments");
+		handle_error();
+		return -EINVAL;
+	}
+
+	target_device = argv[0];
+
+	dev = name_to_dev_t(target_device);
+	if (!dev) {
+		DMERR("no dev found for %s", target_device);
+		handle_error();
+		return -EINVAL;
+	}
+
+	/* eng builds never set up verity; map the device linearly */
+	if (is_eng())
+		return create_linear_device(ti, dev, target_device);
+
+	/* '#' stands in for spaces in the key id (see parameters above) */
+	strreplace(key_id, '#', ' ');
+
+	DMINFO("key:%s dev:%s", key_id, target_device);
+
+	if (extract_fec_header(dev, &fec, &ecc)) {
+		DMERR("Error while extracting fec header");
+		handle_error();
+		return -EINVAL;
+	}
+
+	err = extract_metadata(dev, &fec, &metadata, &verity_enabled);
+
+	if (err) {
+		/* Allow invalid metadata when the device is unlocked */
+		if (is_unlocked()) {
+			DMWARN("Allow invalid metadata when unlocked");
+			return create_linear_device(ti, dev, target_device);
+		}
+		DMERR("Error while extracting metadata");
+		handle_error();
+		goto free_metadata;
+	}
+
+	if (verity_enabled) {
+		err = verify_verity_signature(key_id, metadata);
+
+		if (err) {
+			DMERR("Signature verification failed");
+			handle_error();
+			goto free_metadata;
+		} else
+			DMINFO("Signature verification success");
+	}
+
+	table_ptr = metadata->verity_table;
+
+	/* Split the space separated verity table into its mandatory fields */
+	for (i = 0; i < VERITY_TABLE_ARGS; i++) {
+		verity_table_args[i] = strsep(&table_ptr, " ");
+		if (verity_table_args[i] == NULL)
+			break;
+	}
+
+	if (i != VERITY_TABLE_ARGS) {
+		DMERR("Verity table not in the expected format");
+		err = -EINVAL;
+		handle_error();
+		goto free_metadata;
+	}
+
+	/* Field 5: number of data blocks; trailing characters are rejected */
+	if (sscanf(verity_table_args[5], "%llu%c", &tmpll, &dummy)
+							!= 1) {
+		DMERR("Verity table not in the expected format");
+		handle_error();
+		err = -EINVAL;
+		goto free_metadata;
+	}
+
+	if (tmpll > ULONG_MAX) {
+		DMERR("<num_data_blocks> too large. Forgot to turn on CONFIG_LBDAF?");
+		handle_error();
+		err = -EINVAL;
+		goto free_metadata;
+	}
+
+	data_sectors = tmpll;
+
+	/* Field 3: data block size in bytes */
+	if (sscanf(verity_table_args[3], "%u%c", &data_block_size, &dummy)
+								!= 1) {
+		DMERR("Verity table not in the expected format");
+		handle_error();
+		err = -EINVAL;
+		goto free_metadata;
+	}
+
+	if (test_mult_overflow(data_sectors, data_block_size >>
+							SECTOR_SHIFT)) {
+		DMERR("data_sectors too large");
+		handle_error();
+		err = -EOVERFLOW;
+		goto free_metadata;
+	}
+
+	/* Convert the block count into a sector count */
+	data_sectors *= data_block_size >> SECTOR_SHIFT;
+	DMINFO("Data sectors %llu", (unsigned long long)data_sectors);
+
+	/* update target length */
+	ti->len = data_sectors;
+
+	/* Setup linear target and free */
+	if (!verity_enabled) {
+		err = add_as_linear_device(ti, target_device);
+		goto free_metadata;
+	}
+
+	/*substitute data_dev and hash_dev*/
+	verity_table_args[1] = target_device;
+	verity_table_args[2] = target_device;
+
+	mode = verity_mode();
+
+	/* Build the optional-argument string: "<count> <opt> ..." */
+	if (ecc.valid && IS_BUILTIN(CONFIG_DM_VERITY_FEC)) {
+		if (mode) {
+			err = snprintf(buf, FEC_ARG_LENGTH,
+				"%u %s " VERITY_TABLE_OPT_FEC_FORMAT,
+				1 + VERITY_TABLE_OPT_FEC_ARGS,
+				mode == DM_VERITY_MODE_RESTART ?
+					VERITY_TABLE_OPT_RESTART :
+					VERITY_TABLE_OPT_LOGGING,
+				target_device,
+				ecc.start / FEC_BLOCK_SIZE, ecc.blocks,
+				ecc.roots);
+		} else {
+			err = snprintf(buf, FEC_ARG_LENGTH,
+				"%u " VERITY_TABLE_OPT_FEC_FORMAT,
+				VERITY_TABLE_OPT_FEC_ARGS, target_device,
+				ecc.start / FEC_BLOCK_SIZE, ecc.blocks,
+				ecc.roots);
+		}
+	} else if (mode) {
+		err = snprintf(buf, FEC_ARG_LENGTH,
+			"2 " VERITY_TABLE_OPT_IGNZERO " %s",
+			mode == DM_VERITY_MODE_RESTART ?
+			VERITY_TABLE_OPT_RESTART : VERITY_TABLE_OPT_LOGGING);
+	} else {
+		err = snprintf(buf, FEC_ARG_LENGTH, "1 %s",
+				 "ignore_zero_blocks");
+	}
+
+	/*
+	 * snprintf() returns the length it wanted to write.  A truncated
+	 * string must fail the constructor with a real errno rather than
+	 * leaking that positive length to the caller as the result.
+	 */
+	if (err < 0 || err >= FEC_ARG_LENGTH) {
+		DMERR("Verity table optional arguments too long");
+		err = -EINVAL;
+		goto free_metadata;
+	}
+
+	buf_ptr = buf;
+
+	/* Append the optional arguments after the mandatory table fields */
+	for (i = VERITY_TABLE_ARGS; i < (VERITY_TABLE_ARGS +
+		VERITY_TABLE_OPT_FEC_ARGS + 2); i++) {
+		verity_table_args[i] = strsep(&buf_ptr, " ");
+		if (verity_table_args[i] == NULL) {
+			no_of_args = i;
+			break;
+		}
+	}
+
+	err = verity_ctr(ti, no_of_args, verity_table_args);
+
+	if (err)
+		DMERR("android-verity failed to mount as verity target");
+	else {
+		target_added = true;
+		DMINFO("android-verity mounted as verity target");
+	}
+
+free_metadata:
+	if (metadata) {
+		kfree(metadata->header);
+		kfree(metadata->verity_table);
+	}
+	kfree(metadata);
+	return err;
+}
+
+/*
+ * Module init: register the android-verity target and publish its state
+ * (target_added, verity_enabled) as read-only booleans under the
+ * "android_verity" debugfs directory.
+ *
+ * A debugfs failure is logged but does not fail initialisation; the
+ * result of dm_register_target() is what gets returned.
+ */
+static int __init dm_android_verity_init(void)
+{
+	int r;
+	struct dentry *file;
+
+	r = dm_register_target(&android_verity_target);
+	if (r < 0) {
+		DMERR("register failed %d", r);
+		/* Nothing to report on: do not leave debugfs entries behind */
+		return r;
+	}
+
+	/* Tracks the status of the last added target */
+	debug_dir = debugfs_create_dir("android_verity", NULL);
+
+	if (IS_ERR_OR_NULL(debug_dir)) {
+		DMERR("Cannot create android_verity debugfs directory: %ld",
+			PTR_ERR(debug_dir));
+		goto end;
+	}
+
+	file = debugfs_create_bool("target_added", S_IRUGO, debug_dir,
+				&target_added);
+
+	if (IS_ERR_OR_NULL(file)) {
+		DMERR("Cannot create android_verity debugfs file target_added: %ld",
+			PTR_ERR(file));
+		goto remove_dir;
+	}
+
+	file = debugfs_create_bool("verity_enabled", S_IRUGO, debug_dir,
+				&verity_enabled);
+
+	if (IS_ERR_OR_NULL(file)) {
+		DMERR("Cannot create android_verity debugfs file verity_enabled: %ld",
+			PTR_ERR(file));
+		goto remove_dir;
+	}
+
+end:
+	return r;
+
+remove_dir:
+	debugfs_remove_recursive(debug_dir);
+	/* Clear the pointer so the exit path does not remove it again */
+	debug_dir = NULL;
+	return r;
+}
+
+/*
+ * Module exit: remove the android_verity debugfs directory when debug_dir
+ * holds a usable dentry, then unregister the android-verity target.
+ */
+static void __exit dm_android_verity_exit(void)
+{
+ if (!IS_ERR_OR_NULL(debug_dir))
+ debugfs_remove_recursive(debug_dir);
+
+ dm_unregister_target(&android_verity_target);
+}
+
+module_init(dm_android_verity_init);
+module_exit(dm_android_verity_exit);
diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h
new file mode 100644
index 000000000000..c8d7ab642780
--- /dev/null
+++ b/drivers/md/dm-android-verity.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (C) 2015 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef DM_ANDROID_VERITY_H
+#define DM_ANDROID_VERITY_H
+
+#include <crypto/sha.h>
+
+#define RSANUMBYTES 256
+#define VERITY_METADATA_MAGIC_NUMBER 0xb001b001
+#define VERITY_METADATA_MAGIC_DISABLE 0x46464f56
+#define VERITY_METADATA_VERSION 0
+#define VERITY_STATE_DISABLE 1
+#define DATA_BLOCK_SIZE (4 * 1024)
+#define VERITY_METADATA_SIZE (8 * DATA_BLOCK_SIZE)
+#define VERITY_TABLE_ARGS 10
+#define VERITY_COMMANDLINE_PARAM_LENGTH 20
+#define BUILD_VARIANT 20
+
+/*
+ * <subject>:<sha1-id> is the format for the identifier.
+ * subject can either be the Common Name(CN) + Organization Name(O) or
+ * just the CN if it is prefixed with O
+ * From https://tools.ietf.org/html/rfc5280#appendix-A
+ * ub-organization-name-length INTEGER ::= 64
+ * ub-common-name-length INTEGER ::= 64
+ *
+ * http://lxr.free-electrons.com/source/crypto/asymmetric_keys/x509_cert_parser.c?v=3.9#L278
+ * ctx->o_size + 2 + ctx->cn_size + 1
+ * + 41 characters for ":" and sha1 id
+ * 64 + 2 + 64 + 1 + 1 + 40 (172)
+ * setting VERITY_DEFAULT_KEY_ID_LENGTH to 200 characters.
+ */
+#define VERITY_DEFAULT_KEY_ID_LENGTH 200
+
+#define FEC_MAGIC 0xFECFECFE
+#define FEC_BLOCK_SIZE (4 * 1024)
+#define FEC_VERSION 0
+#define FEC_RSM 255
+#define FEC_ARG_LENGTH 300
+
+#define VERITY_TABLE_OPT_RESTART "restart_on_corruption"
+#define VERITY_TABLE_OPT_LOGGING "ignore_corruption"
+#define VERITY_TABLE_OPT_IGNZERO "ignore_zero_blocks"
+
+#define VERITY_TABLE_OPT_FEC_FORMAT \
+ "use_fec_from_device %s fec_start %llu fec_blocks %llu fec_roots %u ignore_zero_blocks"
+#define VERITY_TABLE_OPT_FEC_ARGS 9
+
+#define VERITY_DEBUG 0
+
+#define DM_MSG_PREFIX "android-verity"
+
+#define DM_LINEAR_ARGS 2
+#define DM_LINEAR_TARGET_OFFSET "0"
+
+/*
+ * There can be two formats.
+ * if fec is present
+ * <data_blocks> <verity_tree> <verity_metadata_32K> <fec_data> <fec_data_4K>
+ * if fec is not present
+ * <data_blocks> <verity_tree> <verity_metadata_32K>
+ */
+/*
+ * FEC header as stored on the block device: a packed structure whose
+ * integer fields are little-endian.
+ * NOTE(review): presumably magic/version are checked against FEC_MAGIC and
+ * FEC_VERSION by the parsing code - confirm in extract_fec_header().
+ */
+struct fec_header {
+ __le32 magic;
+ __le32 version;
+ __le32 size;
+ __le32 roots;
+ __le32 fec_size;
+ __le64 inp_size;
+ u8 hash[SHA256_DIGEST_SIZE];
+} __attribute__((packed));
+
+/*
+ * Header of the Android verity metadata: little-endian magic number and
+ * protocol version, an RSANUMBYTES-byte signature, and the length of the
+ * table that goes with it.
+ * NOTE(review): presumably table_length is the byte length of
+ * android_metadata.verity_table - confirm against extract_metadata().
+ */
+struct android_metadata_header {
+ __le32 magic_number;
+ __le32 protocol_version;
+ char signature[RSANUMBYTES];
+ __le32 table_length;
+};
+
+/*
+ * Parsed verity metadata.  Both members are separately allocated:
+ * android_verity_ctr() kfree()s header and verity_table before freeing
+ * the structure itself.  verity_table is a space separated string that the
+ * constructor splits in place with strsep().
+ */
+struct android_metadata {
+ struct android_metadata_header *header;
+ char *verity_table;
+};
+
+/*
+ * FEC parameters derived from the FEC header.  When valid is set,
+ * android_verity_ctr() passes start / FEC_BLOCK_SIZE, blocks and roots to
+ * the verity table as fec_start, fec_blocks and fec_roots.
+ */
+struct fec_ecc_metadata {
+ bool valid;
+ u32 roots;
+ u64 blocks;
+ u64 rounds;
+ u64 start;
+};
+
+/*
+ * An array of page pointers together with its element count.
+ * NOTE(review): presumably holds the pages filled by a metadata read bio -
+ * confirm against the users in dm-android-verity.c.
+ */
+struct bio_read {
+ struct page **page_io;
+ int number_of_pages;
+};
+
+extern struct target_type linear_target;
+
+extern void dm_linear_dtr(struct dm_target *ti);
+extern int dm_linear_map(struct dm_target *ti, struct bio *bio);
+extern void dm_linear_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen);
+extern int dm_linear_prepare_ioctl(struct dm_target *ti,
+ struct block_device **bdev, fmode_t *mode);
+extern int dm_linear_iterate_devices(struct dm_target *ti,
+ iterate_devices_callout_fn fn, void *data);
+extern int dm_linear_ctr(struct dm_target *ti, unsigned int argc, char **argv);
+extern long dm_linear_direct_access(struct dm_target *ti, sector_t sector,
+ void **kaddr, pfn_t *pfn, long size);
+#endif /* DM_ANDROID_VERITY_H */
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 0aedd0ebccec..7a5b75fd39d6 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1866,16 +1866,24 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
ret = -ENOMEM;
- cc->io_queue = alloc_workqueue("kcryptd_io", WQ_MEM_RECLAIM, 1);
+ cc->io_queue = alloc_workqueue("kcryptd_io",
+ WQ_HIGHPRI |
+ WQ_MEM_RECLAIM,
+ 1);
if (!cc->io_queue) {
ti->error = "Couldn't create kcryptd io queue";
goto bad;
}
if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags))
- cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1);
+ cc->crypt_queue = alloc_workqueue("kcryptd",
+ WQ_HIGHPRI |
+ WQ_MEM_RECLAIM, 1);
else
- cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND,
+ cc->crypt_queue = alloc_workqueue("kcryptd",
+ WQ_HIGHPRI |
+ WQ_MEM_RECLAIM |
+ WQ_UNBOUND,
num_online_cpus());
if (!cc->crypt_queue) {
ti->error = "Couldn't create kcryptd queue";
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 6964b252952a..446d76e14c58 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1921,6 +1921,45 @@ void dm_interface_exit(void)
dm_hash_exit();
}
+
+/**
+ * dm_ioctl_export - Permanently export a mapped device via the ioctl interface
+ * @md: Pointer to mapped_device
+ * @name: Buffer (size DM_NAME_LEN) for name
+ * @uuid: Buffer (size DM_UUID_LEN) for uuid or NULL if not desired
+ *
+ * Return: 0 on success, -ENXIO when @md is NULL or already has a hash cell
+ * attached, otherwise the error reported by dm_hash_insert().
+ */
+int dm_ioctl_export(struct mapped_device *md, const char *name,
+		    const char *uuid)
+{
+	struct hash_cell *cell;
+	int ret;
+
+	if (!md)
+		return -ENXIO;
+
+	/* The name and uuid can only be set once. */
+	mutex_lock(&dm_hash_cells_mutex);
+	cell = dm_get_mdptr(md);
+	mutex_unlock(&dm_hash_cells_mutex);
+
+	if (cell) {
+		DMERR("%s: already exported", dm_device_name(md));
+		return -ENXIO;
+	}
+
+	ret = dm_hash_insert(name, uuid, md);
+	if (ret) {
+		DMERR("%s: could not bind to '%s'", dm_device_name(md), name);
+		return ret;
+	}
+
+	/* Let udev know we've changed. */
+	dm_kobject_uevent(md, KOBJ_CHANGE, dm_get_event_nr(md));
+	return 0;
+}
/**
* dm_copy_name_and_uuid - Copy mapped device name & uuid into supplied buffers
* @md: Pointer to mapped_device
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 4788b0b989a9..4ad62d680547 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -25,7 +25,7 @@ struct linear_c {
/*
* Construct a linear mapping: <dev_path> <offset>
*/
-static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+int dm_linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
struct linear_c *lc;
unsigned long long tmp;
@@ -66,14 +66,16 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
kfree(lc);
return ret;
}
+EXPORT_SYMBOL_GPL(dm_linear_ctr);
-static void linear_dtr(struct dm_target *ti)
+void dm_linear_dtr(struct dm_target *ti)
{
struct linear_c *lc = (struct linear_c *) ti->private;
dm_put_device(ti, lc->dev);
kfree(lc);
}
+EXPORT_SYMBOL_GPL(dm_linear_dtr);
static sector_t linear_map_sector(struct dm_target *ti, sector_t bi_sector)
{
@@ -92,14 +94,15 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio)
linear_map_sector(ti, bio->bi_iter.bi_sector);
}
-static int linear_map(struct dm_target *ti, struct bio *bio)
+int dm_linear_map(struct dm_target *ti, struct bio *bio)
{
linear_map_bio(ti, bio);
return DM_MAPIO_REMAPPED;
}
+EXPORT_SYMBOL_GPL(dm_linear_map);
-static void linear_status(struct dm_target *ti, status_type_t type,
+void dm_linear_status(struct dm_target *ti, status_type_t type,
unsigned status_flags, char *result, unsigned maxlen)
{
struct linear_c *lc = (struct linear_c *) ti->private;
@@ -115,8 +118,9 @@ static void linear_status(struct dm_target *ti, status_type_t type,
break;
}
}
+EXPORT_SYMBOL_GPL(dm_linear_status);
-static int linear_prepare_ioctl(struct dm_target *ti,
+int dm_linear_prepare_ioctl(struct dm_target *ti,
struct block_device **bdev, fmode_t *mode)
{
struct linear_c *lc = (struct linear_c *) ti->private;
@@ -132,16 +136,18 @@ static int linear_prepare_ioctl(struct dm_target *ti,
return 1;
return 0;
}
+EXPORT_SYMBOL_GPL(dm_linear_prepare_ioctl);
-static int linear_iterate_devices(struct dm_target *ti,
+int dm_linear_iterate_devices(struct dm_target *ti,
iterate_devices_callout_fn fn, void *data)
{
struct linear_c *lc = ti->private;
return fn(ti, lc->dev, lc->start, ti->len, data);
}
+EXPORT_SYMBOL_GPL(dm_linear_iterate_devices);
-static long linear_direct_access(struct dm_target *ti, sector_t sector,
+long dm_linear_direct_access(struct dm_target *ti, sector_t sector,
void **kaddr, pfn_t *pfn, long size)
{
struct linear_c *lc = ti->private;
@@ -158,18 +164,19 @@ static long linear_direct_access(struct dm_target *ti, sector_t sector,
return ret;
}
+EXPORT_SYMBOL_GPL(dm_linear_direct_access);
static struct target_type linear_target = {
.name = "linear",
.version = {1, 3, 0},
.module = THIS_MODULE,
- .ctr = linear_ctr,
- .dtr = linear_dtr,
- .map = linear_map,
- .status = linear_status,
- .prepare_ioctl = linear_prepare_ioctl,
- .iterate_devices = linear_iterate_devices,
- .direct_access = linear_direct_access,
+ .ctr = dm_linear_ctr,
+ .dtr = dm_linear_dtr,
+ .map = dm_linear_map,
+ .status = dm_linear_status,
+ .prepare_ioctl = dm_linear_prepare_ioctl,
+ .iterate_devices = dm_linear_iterate_devices,
+ .direct_access = dm_linear_direct_access,
};
int __init dm_linear_init(void)
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 5ac239d0f787..d837a289429a 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -11,6 +11,7 @@
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/namei.h>
+#include <linux/mount.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/slab.h>
diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
index 78f36012eaca..3b6231596284 100644
--- a/drivers/md/dm-verity-fec.c
+++ b/drivers/md/dm-verity-fec.c
@@ -11,6 +11,7 @@
#include "dm-verity-fec.h"
#include <linux/math64.h>
+#include <linux/sysfs.h>
#define DM_MSG_PREFIX "verity-fec"
@@ -175,9 +176,11 @@ error:
if (r < 0 && neras)
DMERR_LIMIT("%s: FEC %llu: failed to correct: %d",
v->data_dev->name, (unsigned long long)rsb, r);
- else if (r > 0)
+ else if (r > 0) {
DMWARN_LIMIT("%s: FEC %llu: corrected %d errors",
v->data_dev->name, (unsigned long long)rsb, r);
+ atomic_add_unless(&v->fec->corrected, 1, INT_MAX);
+ }
return r;
}
@@ -556,6 +559,7 @@ unsigned verity_fec_status_table(struct dm_verity *v, unsigned sz,
void verity_fec_dtr(struct dm_verity *v)
{
struct dm_verity_fec *f = v->fec;
+ struct kobject *kobj = &f->kobj_holder.kobj;
if (!verity_fec_is_enabled(v))
goto out;
@@ -572,6 +576,12 @@ void verity_fec_dtr(struct dm_verity *v)
if (f->dev)
dm_put_device(v->ti, f->dev);
+
+ if (kobj->state_initialized) {
+ kobject_put(kobj);
+ wait_for_completion(dm_get_completion_from_kobject(kobj));
+ }
+
out:
kfree(f);
v->fec = NULL;
@@ -660,6 +670,28 @@ int verity_fec_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v,
return 0;
}
+/*
+ * sysfs show handler for the "corrected" attribute: prints the current
+ * value of the FEC corrected counter followed by a newline.
+ */
+static ssize_t corrected_show(struct kobject *kobj, struct kobj_attribute *attr,
+			      char *buf)
+{
+	struct dm_verity_fec *fec;
+	int count;
+
+	/* The kobject is embedded in dm_verity_fec via kobj_holder */
+	fec = container_of(kobj, struct dm_verity_fec, kobj_holder.kobj);
+	count = atomic_read(&fec->corrected);
+
+	return sprintf(buf, "%d\n", count);
+}
+
+/* Read-only "corrected" attribute, served by corrected_show() */
+static struct kobj_attribute attr_corrected = __ATTR_RO(corrected);
+
+/* NULL-terminated list of the attributes attached to the fec kobject */
+static struct attribute *fec_attrs[] = {
+ &attr_corrected.attr,
+ NULL
+};
+
+/*
+ * kobject type for the fec kobject: generic kobj_attribute sysfs ops, the
+ * attribute list above, and dm_kobject_release as the release callback.
+ */
+static struct kobj_type fec_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_attrs = fec_attrs,
+ .release = dm_kobject_release
+};
+
/*
* Allocate dm_verity_fec for v->fec. Must be called before verity_fec_ctr.
*/
@@ -683,8 +715,10 @@ int verity_fec_ctr_alloc(struct dm_verity *v)
*/
int verity_fec_ctr(struct dm_verity *v)
{
+ int r;
struct dm_verity_fec *f = v->fec;
struct dm_target *ti = v->ti;
+ struct mapped_device *md = dm_table_get_md(ti->table);
u64 hash_blocks;
if (!verity_fec_is_enabled(v)) {
@@ -692,6 +726,16 @@ int verity_fec_ctr(struct dm_verity *v)
return 0;
}
+ /* Create a kobject and sysfs attributes */
+ init_completion(&f->kobj_holder.completion);
+
+ r = kobject_init_and_add(&f->kobj_holder.kobj, &fec_ktype,
+ &disk_to_dev(dm_disk(md))->kobj, "%s", "fec");
+ if (r) {
+ ti->error = "Cannot create kobject";
+ return r;
+ }
+
/*
* FEC is computed over data blocks, possible metadata, and
* hash blocks. In other words, FEC covers total of fec_blocks
diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h
index bb31ce87a933..4db0cae262eb 100644
--- a/drivers/md/dm-verity-fec.h
+++ b/drivers/md/dm-verity-fec.h
@@ -12,6 +12,8 @@
#ifndef DM_VERITY_FEC_H
#define DM_VERITY_FEC_H
+#include "dm.h"
+#include "dm-core.h"
#include "dm-verity.h"
#include <linux/rslib.h>
@@ -51,6 +53,8 @@ struct dm_verity_fec {
mempool_t *extra_pool; /* mempool for extra buffers */
mempool_t *output_pool; /* mempool for output */
struct kmem_cache *cache; /* cache for buffers */
+ atomic_t corrected; /* corrected errors */
+ struct dm_kobject_holder kobj_holder; /* for sysfs attributes */
};
/* per-bio data */
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index 0aba34a7b3b3..5d0a9963b108 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -501,6 +501,7 @@ static void verity_prefetch_io(struct work_struct *work)
container_of(work, struct dm_verity_prefetch_work, work);
struct dm_verity *v = pw->v;
int i;
+ sector_t prefetch_size;
for (i = v->levels - 2; i >= 0; i--) {
sector_t hash_block_start;
@@ -523,8 +524,14 @@ static void verity_prefetch_io(struct work_struct *work)
hash_block_end = v->hash_blocks - 1;
}
no_prefetch_cluster:
+ // for emmc, it is more efficient to send bigger read
+ prefetch_size = max((sector_t)CONFIG_DM_VERITY_HASH_PREFETCH_MIN_SIZE,
+ hash_block_end - hash_block_start + 1);
+ if ((hash_block_start + prefetch_size) >= (v->hash_start + v->hash_blocks)) {
+ prefetch_size = hash_block_end - hash_block_start + 1;
+ }
dm_bufio_prefetch(v->bufio, hash_block_start,
- hash_block_end - hash_block_start + 1);
+ prefetch_size);
}
kfree(pw);
@@ -551,7 +558,7 @@ static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io)
* Bio map function. It allocates dm_verity_io structure and bio vector and
* fills them. Then it issues prefetches and the I/O.
*/
-static int verity_map(struct dm_target *ti, struct bio *bio)
+int verity_map(struct dm_target *ti, struct bio *bio)
{
struct dm_verity *v = ti->private;
struct dm_verity_io *io;
@@ -592,11 +599,12 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_SUBMITTED;
}
+EXPORT_SYMBOL_GPL(verity_map);
/*
* Status: V (valid) or C (corruption found)
*/
-static void verity_status(struct dm_target *ti, status_type_t type,
+void verity_status(struct dm_target *ti, status_type_t type,
unsigned status_flags, char *result, unsigned maxlen)
{
struct dm_verity *v = ti->private;
@@ -655,8 +663,9 @@ static void verity_status(struct dm_target *ti, status_type_t type,
break;
}
}
+EXPORT_SYMBOL_GPL(verity_status);
-static int verity_prepare_ioctl(struct dm_target *ti,
+int verity_prepare_ioctl(struct dm_target *ti,
struct block_device **bdev, fmode_t *mode)
{
struct dm_verity *v = ti->private;
@@ -668,16 +677,18 @@ static int verity_prepare_ioctl(struct dm_target *ti,
return 1;
return 0;
}
+EXPORT_SYMBOL_GPL(verity_prepare_ioctl);
-static int verity_iterate_devices(struct dm_target *ti,
+int verity_iterate_devices(struct dm_target *ti,
iterate_devices_callout_fn fn, void *data)
{
struct dm_verity *v = ti->private;
return fn(ti, v->data_dev, v->data_start, ti->len, data);
}
+EXPORT_SYMBOL_GPL(verity_iterate_devices);
-static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits)
+void verity_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
struct dm_verity *v = ti->private;
@@ -689,8 +700,9 @@ static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits)
blk_limits_io_min(limits, limits->logical_block_size);
}
+EXPORT_SYMBOL_GPL(verity_io_hints);
-static void verity_dtr(struct dm_target *ti)
+void verity_dtr(struct dm_target *ti)
{
struct dm_verity *v = ti->private;
@@ -719,6 +731,7 @@ static void verity_dtr(struct dm_target *ti)
kfree(v);
}
+EXPORT_SYMBOL_GPL(verity_dtr);
static int verity_alloc_zero_digest(struct dm_verity *v)
{
@@ -817,7 +830,7 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v)
* <digest>
* <salt> Hex string or "-" if no salt.
*/
-static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
+int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
struct dm_verity *v;
struct dm_arg_set as;
@@ -1053,6 +1066,7 @@ bad:
return r;
}
+EXPORT_SYMBOL_GPL(verity_ctr);
static struct target_type verity_target = {
.name = "verity",
diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h
index fb419f422d73..75effca400a3 100644
--- a/drivers/md/dm-verity.h
+++ b/drivers/md/dm-verity.h
@@ -126,4 +126,14 @@ extern int verity_hash(struct dm_verity *v, struct shash_desc *desc,
extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io,
sector_t block, u8 *digest, bool *is_zero);
+extern void verity_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen);
+extern int verity_prepare_ioctl(struct dm_target *ti,
+ struct block_device **bdev, fmode_t *mode);
+extern int verity_iterate_devices(struct dm_target *ti,
+ iterate_devices_callout_fn fn, void *data);
+extern void verity_io_hints(struct dm_target *ti, struct queue_limits *limits);
+extern void verity_dtr(struct dm_target *ti);
+extern int verity_ctr(struct dm_target *ti, unsigned argc, char **argv);
+extern int verity_map(struct dm_target *ti, struct bio *bio);
#endif /* DM_VERITY_H */
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index f0aad08b9654..ed25f30a7550 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -80,8 +80,6 @@ void dm_set_md_type(struct mapped_device *md, unsigned type);
unsigned dm_get_md_type(struct mapped_device *md);
struct target_type *dm_get_immutable_target_type(struct mapped_device *md);
-int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t);
-
/*
* To check the return value from dm_table_find_target().
*/
diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index 64971baf11fa..9360e6ebb4ea 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -766,6 +766,27 @@ config PANEL_BOOT_MESSAGE
An empty message will only clear the display at driver init time. Any other
printf()-formatted message is valid with newline and escape codes.
+config UID_SYS_STATS
+ bool "Per-UID statistics"
+ depends on PROFILING && TASK_XACCT && TASK_IO_ACCOUNTING
+ help
+ Per UID based cpu time statistics exported to /proc/uid_cputime
+ Per UID based io statistics exported to /proc/uid_io
+ Per UID based procstat control in /proc/uid_procstat
+
+config UID_SYS_STATS_DEBUG
+ bool "Per-TASK statistics"
+ depends on UID_SYS_STATS
+ default n
+ help
+ Per TASK based io statistics exported to /proc/uid_io
+
+config MEMORY_STATE_TIME
+ tristate "Memory freq/bandwidth time statistics"
+ depends on PROFILING
+ help
+ Memory time statistics exported to /sys/kernel/memory_state_time
+
source "drivers/misc/c2port/Kconfig"
source "drivers/misc/eeprom/Kconfig"
source "drivers/misc/cb710/Kconfig"
diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
index 2bf79ba4a39e..234754db7773 100644
--- a/drivers/misc/Makefile
+++ b/drivers/misc/Makefile
@@ -54,6 +54,9 @@ obj-$(CONFIG_VEXPRESS_SYSCFG) += vexpress-syscfg.o
obj-$(CONFIG_CXL_BASE) += cxl/
obj-$(CONFIG_PANEL) += panel.o
+obj-$(CONFIG_UID_SYS_STATS) += uid_sys_stats.o
+obj-$(CONFIG_MEMORY_STATE_TIME) += memory_state_time.o
+
lkdtm-$(CONFIG_LKDTM) += lkdtm_core.o
lkdtm-$(CONFIG_LKDTM) += lkdtm_bugs.o
lkdtm-$(CONFIG_LKDTM) += lkdtm_heap.o
diff --git a/drivers/misc/memory_state_time.c b/drivers/misc/memory_state_time.c
new file mode 100644
index 000000000000..ba94dcf09169
--- /dev/null
+++ b/drivers/misc/memory_state_time.c
@@ -0,0 +1,462 @@
+/* drivers/misc/memory_state_time.c
+ *
+ * Copyright (C) 2016 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/hashtable.h>
+#include <linux/kconfig.h>
+#include <linux/kernel.h>
+#include <linux/kobject.h>
+#include <linux/memory-state-time.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/of_platform.h>
+#include <linux/slab.h>
+#include <linux/sysfs.h>
+#include <linux/time.h>
+#include <linux/timekeeping.h>
+#include <linux/workqueue.h>
+
+#define KERNEL_ATTR_RO(_name) \
+static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+
+#define KERNEL_ATTR_RW(_name) \
+static struct kobj_attribute _name##_attr = \
+ __ATTR(_name, 0644, _name##_show, _name##_store)
+
+#define FREQ_HASH_BITS 4
+DECLARE_HASHTABLE(freq_hash_table, FREQ_HASH_BITS);
+
+static DEFINE_MUTEX(mem_lock);
+
+#define TAG "memory_state_time"
+#define BW_NODE "/soc/memory-state-time"
+#define FREQ_TBL "freq-tbl"
+#define BW_TBL "bw-buckets"
+#define NUM_SOURCES "num-sources"
+
+#define LOWEST_FREQ 2
+
+static int curr_bw;
+static int curr_freq;
+static u32 *bw_buckets;
+static u32 *freq_buckets;
+static int num_freqs;
+static int num_buckets;
+static int registered_bw_sources;
+static u64 last_update;
+static bool init_success;
+static struct workqueue_struct *memory_wq;
+static u32 num_sources = 10;
+static int *bandwidths;
+
+/*
+ * One entry of freq_hash_table.  freq is matched against curr_freq and the
+ * freq_buckets[] values; buckets[] is indexed by find_bucket() and
+ * accumulates the time deltas returned by get_time_diff().
+ */
+struct freq_entry {
+ int freq;
+ u64 *buckets; /* Bandwidth buckets. */
+ struct hlist_node hash;
+};
+
+/*
+ * A queued frequency or bandwidth update.  Allocated by the update
+ * callbacks, queued on memory_wq and freed by the work handler.
+ */
+struct queue_container {
+ struct work_struct update_state; /* work item run on memory_wq */
+ int value; /* new frequency or bandwidth value */
+ u64 time_now; /* ktime_get_boot_ns() taken when the update was queued */
+ int id; /* bandwidth source id; only set for bandwidth updates */
+ struct mutex *lock; /* NOTE(review): not used by the code visible here */
+};
+
+/*
+ * Map a bandwidth value to a bucket index: the first bucket whose limit is
+ * greater than @bw, or the last bucket when no limit exceeds it.  Returns 0
+ * when no bucket table has been set up.
+ */
+static int find_bucket(int bw)
+{
+	int idx;
+
+	if (bw_buckets == NULL)
+		return 0;
+
+	for (idx = 0; idx < num_buckets; idx++) {
+		if (bw_buckets[idx] <= bw)
+			continue;
+
+		pr_debug("Found bucket %d for bandwidth %d\n",
+				idx, bw);
+		return idx;
+	}
+
+	return num_buckets - 1;
+}
+
+/*
+ * Return the time elapsed since the previous call (time_now - last_update)
+ * and remember @time_now as the new reference point.  The result is in the
+ * same unit as @time_now.
+ */
+static u64 get_time_diff(u64 time_now)
+{
+	u64 elapsed = time_now - last_update;
+
+	last_update = time_now;
+	return elapsed;
+}
+
+/*
+ * sysfs show handler for "show_stat".
+ *
+ * Emits one line per known frequency: the frequency followed by the
+ * accumulated time of each bandwidth bucket, all space separated.  Output
+ * is bounded to PAGE_SIZE by scnprintf(), which never reports more than it
+ * actually wrote, so the running length cannot reach PAGE_SIZE.
+ *
+ * The table is read under mem_lock, the same lock the work handlers hold
+ * while updating the bucket counters, so a line is never built from
+ * half-updated 64-bit values.
+ */
+static ssize_t show_stat_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	int i, j;
+	int len = 0;
+	struct freq_entry *freq_entry;
+
+	mutex_lock(&mem_lock);
+	for (i = 0; i < num_freqs; i++) {
+		hash_for_each_possible(freq_hash_table, freq_entry, hash,
+				freq_buckets[i]) {
+			/* Skip other frequencies sharing this hash bucket */
+			if (freq_entry->freq != freq_buckets[i])
+				continue;
+
+			len += scnprintf(buf + len, PAGE_SIZE - len,
+					"%d ", freq_buckets[i]);
+			for (j = 0; j < num_buckets; j++) {
+				len += scnprintf(buf + len,
+						PAGE_SIZE - len,
+						"%llu ",
+						freq_entry->buckets[j]);
+			}
+			len += scnprintf(buf + len, PAGE_SIZE - len,
+					"\n");
+		}
+	}
+	mutex_unlock(&mem_lock);
+
+	pr_debug("Current Time: %llu\n", ktime_get_boot_ns());
+	return len;
+}
+KERNEL_ATTR_RO(show_stat);
+
+/*
+ * Charge the time elapsed since the last update to the bucket selected by
+ * the current frequency and bandwidth.  Nothing is charged (and the
+ * reference time is left untouched) when curr_freq has no table entry.
+ */
+static void update_table(u64 time_now)
+{
+	struct freq_entry *entry;
+
+	pr_debug("Last known bw %d freq %d\n", curr_bw, curr_freq);
+	hash_for_each_possible(freq_hash_table, entry, hash, curr_freq) {
+		if (entry->freq != curr_freq)
+			continue;
+
+		entry->buckets[find_bucket(curr_bw)]
+				+= get_time_diff(time_now);
+		break;
+	}
+}
+
+/* Return true when @freq is one of the num_freqs values in freq_buckets[]. */
+static bool freq_exists(int freq)
+{
+	int idx = 0;
+
+	while (idx < num_freqs) {
+		if (freq_buckets[idx] == freq)
+			return true;
+		idx++;
+	}
+
+	return false;
+}
+
+/*
+ * Record @bw as the latest bandwidth of source @index and return the sum
+ * over all registered bandwidth sources.
+ */
+static int calculate_total_bw(int bw, int index)
+{
+	int src;
+	int sum;
+
+	pr_debug("memory_state_time New bw %d for id %d\n", bw, index);
+	bandwidths[index] = bw;
+
+	sum = 0;
+	for (src = 0; src < registered_bw_sources; src++)
+		sum += bandwidths[src];
+
+	return sum;
+}
+
+/*
+ * Work handler for a queued frequency change: under mem_lock, account the
+ * elapsed time to the old state, then switch curr_freq to the new value.
+ * Frees the request afterwards.
+ */
+static void freq_update_do_work(struct work_struct *work)
+{
+	struct queue_container *req;
+
+	req = container_of(work, struct queue_container, update_state);
+	if (!req)
+		return;
+
+	mutex_lock(&mem_lock);
+	update_table(req->time_now);
+	curr_freq = req->value;
+	mutex_unlock(&mem_lock);
+
+	kfree(req);
+}
+
+/*
+ * Work handler for a queued bandwidth change: under mem_lock, account the
+ * elapsed time to the old state, then recompute curr_bw with the new value
+ * of the reporting source.  Frees the request afterwards.
+ */
+static void bw_update_do_work(struct work_struct *work)
+{
+	struct queue_container *req;
+
+	req = container_of(work, struct queue_container, update_state);
+	if (!req)
+		return;
+
+	mutex_lock(&mem_lock);
+	update_table(req->time_now);
+	curr_bw = calculate_total_bw(req->value, req->id);
+	mutex_unlock(&mem_lock);
+
+	kfree(req);
+}
+
+/*
+ * Update callback for a frequency source.  When @value is a known
+ * frequency and initialisation succeeded, timestamp the change and queue
+ * it on memory_wq; otherwise only emit a debug message.  An allocation
+ * failure silently drops the update.
+ */
+static void memory_state_freq_update(struct memory_state_update_block *ub,
+		int value)
+{
+	struct queue_container *req;
+
+	if (!IS_ENABLED(CONFIG_MEMORY_STATE_TIME))
+		return;
+
+	if (!freq_exists(value) || !init_success) {
+		pr_debug("Freq does not exist.\n");
+		return;
+	}
+
+	req = kmalloc(sizeof(*req), GFP_KERNEL);
+	if (!req)
+		return;
+
+	INIT_WORK(&req->update_state, freq_update_do_work);
+	req->time_now = ktime_get_boot_ns();
+	req->value = value;
+	pr_debug("Scheduling freq update in work queue\n");
+	queue_work(memory_wq, &req->update_state);
+}
+
+/*
+ * Update callback for a bandwidth source.  Once initialisation has
+ * succeeded, timestamp the new value together with the source id and queue
+ * it on memory_wq.  An allocation failure silently drops the update.
+ */
+static void memory_state_bw_update(struct memory_state_update_block *ub,
+		int value)
+{
+	struct queue_container *req;
+
+	if (!IS_ENABLED(CONFIG_MEMORY_STATE_TIME))
+		return;
+
+	if (!init_success)
+		return;
+
+	req = kmalloc(sizeof(*req), GFP_KERNEL);
+	if (!req)
+		return;
+
+	INIT_WORK(&req->update_state, bw_update_do_work);
+	req->time_now = ktime_get_boot_ns();
+	req->value = value;
+	req->id = ub->id;
+	pr_debug("Scheduling bandwidth update in work queue\n");
+	queue_work(memory_wq, &req->update_state);
+}
+
+/*
+ * Allocate an update block whose update_call reports frequency changes.
+ * Returns NULL when the config option is disabled or the allocation fails.
+ * The block is allocated with kmalloc(); nothing in this function frees it.
+ */
+struct memory_state_update_block *memory_state_register_frequency_source(void)
+{
+	struct memory_state_update_block *blk;
+
+	if (!IS_ENABLED(CONFIG_MEMORY_STATE_TIME)) {
+		pr_err("Config option disabled.\n");
+		return NULL;
+	}
+
+	pr_debug("Allocating frequency source\n");
+	blk = kmalloc(sizeof(*blk), GFP_KERNEL);
+	if (blk)
+		blk->update_call = memory_state_freq_update;
+
+	return blk;
+}
+EXPORT_SYMBOL_GPL(memory_state_register_frequency_source);
+
+/*
+ * Allocates an update block whose update_call is memory_state_bw_update()
+ * and assigns it the next free source id (registered_bw_sources, which is
+ * then incremented).
+ *
+ * Returns the new block, or NULL when the allocation fails, when
+ * num_sources ids have already been handed out, or when
+ * CONFIG_MEMORY_STATE_TIME is disabled.
+ */
+struct memory_state_update_block *memory_state_register_bandwidth_source(void)
+{
+	struct memory_state_update_block *block;
+
+	if (!IS_ENABLED(CONFIG_MEMORY_STATE_TIME)) {
+		pr_err("Config option disabled.\n");
+		return NULL;
+	}
+
+	pr_debug("Allocating bandwidth source %d\n",
+			registered_bw_sources);
+	block = kmalloc(sizeof(struct memory_state_update_block), GFP_KERNEL);
+	if (!block)
+		return NULL;
+
+	block->update_call = memory_state_bw_update;
+	if (registered_bw_sources >= num_sources) {
+		pr_err("Unable to allocate source; max number reached\n");
+		kfree(block);
+		return NULL;
+	}
+
+	block->id = registered_bw_sources++;
+	return block;
+}
+EXPORT_SYMBOL_GPL(memory_state_register_bandwidth_source);
+
+/*
+ * Reads the bandwidth configuration from the device-tree node of @dev.
+ * Buckets are designated by their maximum.
+ *
+ * NUM_SOURCES is read into num_sources without checking the result, so a
+ * missing property leaves num_sources at whatever value it already had.
+ * BW_TBL is mandatory: it is copied into a newly allocated bw_buckets
+ * array, num_buckets is set to its element count and curr_bw is reset
+ * to 0.  A zeroed bandwidths array with num_sources elements is
+ * allocated as well.
+ *
+ * Returns 0 on success, -ENODATA when BW_TBL is missing, -ENOMEM on
+ * allocation failure, or the error from of_property_read_u32_array().
+ */
+static int get_bw_buckets(struct device *dev)
+{
+	int ret, lenb;
+	struct device_node *node = dev->of_node;
+
+	of_property_read_u32(node, NUM_SOURCES, &num_sources);
+	if (!of_find_property(node, BW_TBL, &lenb)) {
+		pr_err("Missing %s property\n", BW_TBL);
+		return -ENODATA;
+	}
+
+	/* devm_kcalloc() zeroes the memory and rejects n * size overflow */
+	bandwidths = devm_kcalloc(dev, num_sources, sizeof(*bandwidths),
+			GFP_KERNEL);
+	if (!bandwidths)
+		return -ENOMEM;
+	/* lenb arrives in bytes; convert it to an element count */
+	lenb /= sizeof(*bw_buckets);
+	bw_buckets = devm_kcalloc(dev, lenb, sizeof(*bw_buckets),
+			GFP_KERNEL);
+	if (!bw_buckets) {
+		devm_kfree(dev, bandwidths);
+		return -ENOMEM;
+	}
+	ret = of_property_read_u32_array(node, BW_TBL, bw_buckets,
+			lenb);
+	if (ret < 0) {
+		devm_kfree(dev, bandwidths);
+		devm_kfree(dev, bw_buckets);
+		pr_err("Unable to read bandwidth table from device tree.\n");
+		return ret;
+	}
+
+	curr_bw = 0;
+	num_buckets = lenb;
+	return 0;
+}
+
+/*
+ * Reads FREQ_TBL from the device-tree node of @dev into freq_buckets and
+ * adds one struct freq_entry (with num_buckets u64 counters) to
+ * freq_hash_table for each listed frequency.  num_freqs is set to the
+ * table length and curr_freq to freq_buckets[LOWEST_FREQ].
+ *
+ * Returns 0 on success, -ENODATA when FREQ_TBL is missing or empty,
+ * -ENOMEM on allocation failure, or the error from
+ * of_property_read_u32_array().
+ *
+ * On an allocation failure inside the loop every entry is removed from
+ * freq_hash_table again: the entries are devm allocations, so once the
+ * probe fails they would otherwise stay reachable through the table
+ * after being released.  This empties the whole table, which assumes a
+ * single device instance (all state here is file-global).
+ */
+static int freq_buckets_init(struct device *dev)
+{
+	struct freq_entry *freq_entry;
+	struct hlist_node *tmp;
+	int i, bkt;
+	int ret, lenf;
+	struct device_node *node = dev->of_node;
+
+	if (!of_find_property(node, FREQ_TBL, &lenf)) {
+		pr_err("Missing %s property\n", FREQ_TBL);
+		return -ENODATA;
+	}
+
+	/* lenf arrives in bytes; convert it to an element count */
+	lenf /= sizeof(*freq_buckets);
+	if (lenf <= 0) {
+		/* freq_buckets[LOWEST_FREQ] below needs at least one entry */
+		pr_err("Empty %s property\n", FREQ_TBL);
+		return -ENODATA;
+	}
+	freq_buckets = devm_kcalloc(dev, lenf, sizeof(*freq_buckets),
+			GFP_KERNEL);
+	if (!freq_buckets)
+		return -ENOMEM;
+	pr_debug("freqs found len %d\n", lenf);
+	ret = of_property_read_u32_array(node, FREQ_TBL, freq_buckets,
+			lenf);
+	if (ret < 0) {
+		devm_kfree(dev, freq_buckets);
+		pr_err("Unable to read frequency table from device tree.\n");
+		return ret;
+	}
+	pr_debug("ret freq %d\n", ret);
+
+	num_freqs = lenf;
+	curr_freq = freq_buckets[LOWEST_FREQ];
+
+	for (i = 0; i < num_freqs; i++) {
+		freq_entry = devm_kzalloc(dev, sizeof(*freq_entry),
+				GFP_KERNEL);
+		if (!freq_entry) {
+			ret = -ENOMEM;
+			goto unhash;
+		}
+		freq_entry->buckets = devm_kcalloc(dev, num_buckets,
+				sizeof(u64), GFP_KERNEL);
+		if (!freq_entry->buckets) {
+			devm_kfree(dev, freq_entry);
+			ret = -ENOMEM;
+			goto unhash;
+		}
+		pr_debug("memory_state_time Adding freq to ht %d\n",
+				freq_buckets[i]);
+		freq_entry->freq = freq_buckets[i];
+		hash_add(freq_hash_table, &freq_entry->hash, freq_buckets[i]);
+	}
+	return 0;
+
+unhash:
+	hash_for_each_safe(freq_hash_table, bkt, tmp, freq_entry, hash)
+		hash_del(&freq_entry->hash);
+	return ret;
+}
+
+struct kobject *memory_kobj;
+EXPORT_SYMBOL_GPL(memory_kobj);
+
+static struct attribute *memory_attrs[] = {
+ &show_stat_attr.attr,
+ NULL
+};
+
+static struct attribute_group memory_attr_group = {
+ .attrs = memory_attrs,
+};
+
+/*
+ * Platform probe: loads the bandwidth and frequency tables for the
+ * device, records the current boot-time timestamp in last_update and
+ * sets init_success.  Returns 0, or the first error reported by
+ * get_bw_buckets() / freq_buckets_init().
+ */
+static int memory_state_time_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	int error;
+
+	error = get_bw_buckets(dev);
+	if (!error)
+		error = freq_buckets_init(dev);
+	if (error)
+		return error;
+
+	last_update = ktime_get_boot_ns();
+	init_success = true;
+
+	pr_debug("memory_state_time initialized with num_freqs %d\n",
+			num_freqs);
+	return 0;
+}
+
+static const struct of_device_id match_table[] = {
+ { .compatible = "memory-state-time" },
+ {}
+};
+
+static struct platform_driver memory_state_time_driver = {
+ .probe = memory_state_time_probe,
+ .driver = {
+ .name = "memory-state-time",
+ .of_match_table = match_table,
+ .owner = THIS_MODULE,
+ },
+};
+
+/*
+ * Module init: initialises freq_hash_table, creates the single-threaded
+ * workqueue memory_wq, creates the TAG directory under /sys/kernel with
+ * its attribute group, and registers the platform driver.
+ *
+ * Returns 0 on success.  On failure everything created so far is torn
+ * down in reverse order and a negative errno is returned (-ENOMEM when
+ * the workqueue or the kobject cannot be allocated).
+ */
+static int __init memory_state_time_init(void)
+{
+	int error;
+
+	hash_init(freq_hash_table);
+	memory_wq = create_singlethread_workqueue("memory_wq");
+	if (!memory_wq) {
+		pr_err("Unable to create workqueue.\n");
+		/* a NULL workqueue is an allocation failure, not bad input */
+		return -ENOMEM;
+	}
+	/*
+	 * Create sys/kernel directory for memory_state_time.
+	 */
+	memory_kobj = kobject_create_and_add(TAG, kernel_kobj);
+	if (!memory_kobj) {
+		pr_err("Unable to allocate memory_kobj for sysfs directory.\n");
+		error = -ENOMEM;
+		goto wq;
+	}
+	error = sysfs_create_group(memory_kobj, &memory_attr_group);
+	if (error) {
+		pr_err("Unable to create sysfs folder.\n");
+		goto kobj;
+	}
+
+	error = platform_driver_register(&memory_state_time_driver);
+	if (error) {
+		pr_err("Unable to register memory_state_time platform driver.\n");
+		goto group;
+	}
+	return 0;
+
+group:
+	sysfs_remove_group(memory_kobj, &memory_attr_group);
+kobj:
+	kobject_put(memory_kobj);
+wq:
+	destroy_workqueue(memory_wq);
+	return error;
+}
+module_init(memory_state_time_init);
diff --git a/drivers/misc/uid_sys_stats.c b/drivers/misc/uid_sys_stats.c
new file mode 100644
index 000000000000..456406b46347
--- /dev/null
+++ b/drivers/misc/uid_sys_stats.c
@@ -0,0 +1,701 @@
+/* drivers/misc/uid_sys_stats.c
+ *
+ * Copyright (C) 2014 - 2015 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/atomic.h>
+#include <linux/err.h>
+#include <linux/hashtable.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/proc_fs.h>
+#include <linux/profile.h>
+#include <linux/rtmutex.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+
+#define UID_HASH_BITS 10
+DECLARE_HASHTABLE(hash_table, UID_HASH_BITS);
+
+static DEFINE_RT_MUTEX(uid_lock);
+static struct proc_dir_entry *cpu_parent;
+static struct proc_dir_entry *io_parent;
+static struct proc_dir_entry *proc_parent;
+
+struct io_stats {
+ u64 read_bytes;
+ u64 write_bytes;
+ u64 rchar;
+ u64 wchar;
+ u64 fsync;
+};
+
+#define UID_STATE_FOREGROUND 0
+#define UID_STATE_BACKGROUND 1
+#define UID_STATE_BUCKET_SIZE 2
+
+#define UID_STATE_TOTAL_CURR 2
+#define UID_STATE_TOTAL_LAST 3
+#define UID_STATE_DEAD_TASKS 4
+#define UID_STATE_SIZE 5
+
+#define MAX_TASK_COMM_LEN 256
+
+struct task_entry {
+ char comm[MAX_TASK_COMM_LEN];
+ pid_t pid;
+ struct io_stats io[UID_STATE_SIZE];
+ struct hlist_node hash;
+};
+
+struct uid_entry {
+ uid_t uid;
+ cputime_t utime;
+ cputime_t stime;
+ cputime_t active_utime;
+ cputime_t active_stime;
+ int state;
+ struct io_stats io[UID_STATE_SIZE];
+ struct hlist_node hash;
+#ifdef CONFIG_UID_SYS_STATS_DEBUG
+ DECLARE_HASHTABLE(task_entries, UID_HASH_BITS);
+#endif
+};
+
+/*
+ * Returns the task's write_bytes minus its cancelled_write_bytes,
+ * clamped to 0 when the cancelled amount is not smaller.
+ */
+static u64 compute_write_bytes(struct task_struct *task)
+{
+	u64 written = task->ioac.write_bytes;
+	u64 cancelled = task->ioac.cancelled_write_bytes;
+
+	return written > cancelled ? written - cancelled : 0;
+}
+
+/*
+ * Amount one counter grew since the last snapshot: curr + dead - last,
+ * interpreted as a signed 64-bit value and clamped to 0 when it is not
+ * positive.
+ */
+static u64 io_stat_growth(u64 curr, u64 dead, u64 last)
+{
+	int64_t delta = curr + dead - last;
+
+	return delta > 0 ? delta : 0;
+}
+
+/*
+ * Adds the growth of every counter (current + dead-task totals, minus
+ * the previous snapshot) to @io_bucket, then copies @io_curr into
+ * @io_last and zeroes @io_dead.
+ *
+ * Tasks can move to another uid group while their io_last in the old
+ * group is still positive, so a negative difference is possible; such
+ * differences contribute nothing.
+ */
+static void compute_io_bucket_stats(struct io_stats *io_bucket,
+					struct io_stats *io_curr,
+					struct io_stats *io_last,
+					struct io_stats *io_dead)
+{
+	io_bucket->read_bytes += io_stat_growth(io_curr->read_bytes,
+			io_dead->read_bytes, io_last->read_bytes);
+	io_bucket->write_bytes += io_stat_growth(io_curr->write_bytes,
+			io_dead->write_bytes, io_last->write_bytes);
+	io_bucket->rchar += io_stat_growth(io_curr->rchar,
+			io_dead->rchar, io_last->rchar);
+	io_bucket->wchar += io_stat_growth(io_curr->wchar,
+			io_dead->wchar, io_last->wchar);
+	io_bucket->fsync += io_stat_growth(io_curr->fsync,
+			io_dead->fsync, io_last->fsync);
+
+	/* struct io_stats holds exactly the five counters handled above */
+	*io_last = *io_curr;
+
+	memset(io_dead, 0, sizeof(*io_dead));
+}
+
+#ifdef CONFIG_UID_SYS_STATS_DEBUG
+/*
+ * Fills task_entry->comm with a description of @task:
+ *   - the thread name, space-padded to TASK_COMM_LEN bytes,
+ *   - the path of the executable (when the task has an mm with an
+ *     exe_file and d_path() succeeds), followed by one space,
+ *   - the command line, with NUL and newline bytes turned into spaces.
+ * Trailing spaces are removed and the result is NUL-terminated.
+ *
+ * NOTE(review): down_read() on mmap_sem and get_cmdline() may sleep,
+ * while callers in this file reach this function from inside
+ * rcu_read_lock() sections -- confirm that is acceptable.
+ */
+static void get_full_task_comm(struct task_entry *task_entry,
+		struct task_struct *task)
+{
+	int i = 0, offset = 0, len = 0;
+	/* save one byte for terminating null character */
+	int unused_len = MAX_TASK_COMM_LEN - TASK_COMM_LEN - 1;
+	/* same capacity as the initial unused_len, but not a VLA */
+	char buf[MAX_TASK_COMM_LEN - TASK_COMM_LEN - 1];
+	struct mm_struct *mm = task->mm;
+
+	/* fill the first TASK_COMM_LEN bytes with thread name */
+	__get_task_comm(task_entry->comm, TASK_COMM_LEN, task);
+	i = strlen(task_entry->comm);
+	while (i < TASK_COMM_LEN)
+		task_entry->comm[i++] = ' ';
+
+	/* next the executable file name */
+	if (mm) {
+		down_read(&mm->mmap_sem);
+		if (mm->exe_file) {
+			char *pathname = d_path(&mm->exe_file->f_path, buf,
+					unused_len);
+
+			if (!IS_ERR(pathname)) {
+				/*
+				 * pathname lives inside buf, so it is shorter
+				 * than unused_len and is never truncated here
+				 */
+				len = strlcpy(task_entry->comm + i, pathname,
+						unused_len);
+				i += len;
+				task_entry->comm[i++] = ' ';
+				unused_len--;
+			}
+		}
+		up_read(&mm->mmap_sem);
+	}
+	unused_len -= len;
+
+	/* fill the rest with command line argument
+	 * replace each null or new line character
+	 * between args in argv with whitespace */
+	len = get_cmdline(task, buf, unused_len);
+	while (offset < len) {
+		if (buf[offset] != '\0' && buf[offset] != '\n')
+			task_entry->comm[i++] = buf[offset];
+		else
+			task_entry->comm[i++] = ' ';
+		offset++;
+	}
+
+	/* get rid of trailing whitespaces in case when arg is memset to
+	 * zero before being reset in userspace; stop at the start of the
+	 * buffer so an all-space string cannot make us read comm[-1]
+	 */
+	while (i > 0 && task_entry->comm[i - 1] == ' ')
+		i--;
+	task_entry->comm[i] = '\0';
+}
+
+/*
+ * Looks up the task_entry for @task (keyed by pid) in
+ * uid_entry->task_entries.  When the stored thread name no longer
+ * matches task->comm, the full command text is regenerated.
+ *
+ * Returns the entry, or NULL when the pid is not in the table.
+ */
+static struct task_entry *find_task_entry(struct uid_entry *uid_entry,
+		struct task_struct *task)
+{
+	struct task_entry *task_entry;
+
+	hash_for_each_possible(uid_entry->task_entries, task_entry, hash,
+			task->pid) {
+		const char *sep;
+		size_t len;
+
+		if (task->pid != task_entry->pid)
+			continue;
+
+		/*
+		 * The stored thread name ends at the first space.
+		 * get_full_task_comm() trims trailing spaces, so an entry
+		 * with no executable path or command line has no space at
+		 * all; fall back to the string length in that case instead
+		 * of doing pointer arithmetic on NULL.
+		 */
+		sep = strnchr(task_entry->comm, ' ', TASK_COMM_LEN);
+		len = sep ? (size_t)(sep - task_entry->comm) :
+			strnlen(task_entry->comm, TASK_COMM_LEN);
+
+		/* if thread name changed, update the entire command */
+		if (strncmp(task_entry->comm, task->comm, len))
+			get_full_task_comm(task_entry, task);
+		return task_entry;
+	}
+	return NULL;
+}
+
+/*
+ * Returns the task_entry for @task under @uid_entry, creating a zeroed
+ * one (with its command text filled in) and hashing it by pid when none
+ * exists yet.  Returns NULL when the allocation fails.
+ */
+static struct task_entry *find_or_register_task(struct uid_entry *uid_entry,
+		struct task_struct *task)
+{
+	pid_t pid = task->pid;
+	struct task_entry *entry = find_task_entry(uid_entry, task);
+
+	if (entry)
+		return entry;
+
+	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+	if (!entry)
+		return NULL;
+
+	get_full_task_comm(entry, task);
+	entry->pid = pid;
+	hash_add(uid_entry->task_entries, &entry->hash, (unsigned int)pid);
+
+	return entry;
+}
+
+/* Unhashes and frees every task_entry attached to @uid_entry. */
+static void remove_uid_tasks(struct uid_entry *uid_entry)
+{
+	struct task_entry *entry;
+	struct hlist_node *next;
+	unsigned long bucket;
+
+	hash_for_each_safe(uid_entry->task_entries, bucket, next, entry,
+			hash) {
+		hash_del(&entry->hash);
+		kfree(entry);
+	}
+}
+
+/* Zeroes the UID_STATE_TOTAL_CURR slot of every task under @uid_entry. */
+static void set_io_uid_tasks_zero(struct uid_entry *uid_entry)
+{
+	struct task_entry *entry;
+	unsigned long bucket;
+
+	hash_for_each(uid_entry->task_entries, bucket, entry, hash) {
+		struct io_stats *curr = &entry->io[UID_STATE_TOTAL_CURR];
+
+		memset(curr, 0, sizeof(*curr));
+	}
+}
+
+/*
+ * Adds @task's I/O accounting counters to slot @slot of its per-task
+ * entry under @uid_entry, creating the entry on first use.
+ *
+ * find_or_register_task() returns NULL when its allocation fails; the
+ * sample is then dropped instead of dereferencing a NULL entry.
+ */
+static void add_uid_tasks_io_stats(struct uid_entry *uid_entry,
+		struct task_struct *task, int slot)
+{
+	struct task_entry *task_entry = find_or_register_task(uid_entry, task);
+	struct io_stats *task_io_slot;
+
+	if (!task_entry)
+		return;
+
+	task_io_slot = &task_entry->io[slot];
+	task_io_slot->read_bytes += task->ioac.read_bytes;
+	task_io_slot->write_bytes += compute_write_bytes(task);
+	task_io_slot->rchar += task->ioac.rchar;
+	task_io_slot->wchar += task->ioac.wchar;
+	task_io_slot->fsync += task->ioac.syscfs;
+}
+
+/*
+ * Runs compute_io_bucket_stats() for every task under @uid_entry,
+ * crediting the slot selected by the uid's current state.
+ */
+static void compute_io_uid_tasks(struct uid_entry *uid_entry)
+{
+	struct task_entry *entry;
+	unsigned long bucket;
+
+	hash_for_each(uid_entry->task_entries, bucket, entry, hash) {
+		struct io_stats *io = entry->io;
+
+		compute_io_bucket_stats(&io[uid_entry->state],
+					&io[UID_STATE_TOTAL_CURR],
+					&io[UID_STATE_TOTAL_LAST],
+					&io[UID_STATE_DEAD_TASKS]);
+	}
+}
+
+/*
+ * Emits one "task,..." line per task under @uid_entry: command text,
+ * pid, then foreground rchar/wchar/read_bytes/write_bytes, the same
+ * four background counters, and finally foreground and background
+ * fsync.
+ */
+static void show_io_uid_tasks(struct seq_file *m, struct uid_entry *uid_entry)
+{
+	struct task_entry *entry;
+	unsigned long bucket;
+
+	hash_for_each(uid_entry->task_entries, bucket, entry, hash) {
+		const struct io_stats *fg = &entry->io[UID_STATE_FOREGROUND];
+		const struct io_stats *bg = &entry->io[UID_STATE_BACKGROUND];
+
+		/* Separated by comma because space exists in task comm */
+		seq_printf(m, "task,%s,%lu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu\n",
+				entry->comm,
+				(unsigned long)entry->pid,
+				fg->rchar, fg->wchar,
+				fg->read_bytes, fg->write_bytes,
+				bg->rchar, bg->wchar,
+				bg->read_bytes, bg->write_bytes,
+				fg->fsync, bg->fsync);
+	}
+}
+#else
+/*
+ * No-op stand-ins used when CONFIG_UID_SYS_STATS_DEBUG is off, so the
+ * callers need no #ifdefs.  (No ';' after the bodies: at file scope that
+ * is a stray empty declaration.)
+ */
+static void remove_uid_tasks(struct uid_entry *uid_entry) {}
+static void set_io_uid_tasks_zero(struct uid_entry *uid_entry) {}
+static void add_uid_tasks_io_stats(struct uid_entry *uid_entry,
+		struct task_struct *task, int slot) {}
+static void compute_io_uid_tasks(struct uid_entry *uid_entry) {}
+static void show_io_uid_tasks(struct seq_file *m,
+		struct uid_entry *uid_entry) {}
+#endif
+
+/* Returns the hash_table entry whose uid equals @uid, or NULL. */
+static struct uid_entry *find_uid_entry(uid_t uid)
+{
+	struct uid_entry *candidate;
+
+	hash_for_each_possible(hash_table, candidate, hash, uid)
+		if (candidate->uid == uid)
+			return candidate;
+
+	return NULL;
+}
+
+/*
+ * Returns the entry for @uid, allocating a zeroed one with GFP_ATOMIC
+ * and adding it to hash_table when none exists.  Returns NULL when the
+ * allocation fails.
+ */
+static struct uid_entry *find_or_register_uid(uid_t uid)
+{
+	struct uid_entry *entry = find_uid_entry(uid);
+
+	if (entry)
+		return entry;
+
+	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+	if (!entry)
+		return NULL;
+
+	entry->uid = uid;
+#ifdef CONFIG_UID_SYS_STATS_DEBUG
+	hash_init(entry->task_entries);
+#endif
+	hash_add(hash_table, &entry->hash, uid);
+
+	return entry;
+}
+
+/*
+ * seq_file show handler: prints "<uid>: <utime> <stime>" for every known
+ * uid.  Each total is the time accumulated from exited tasks
+ * (utime/stime) plus the time of all currently existing threads, which
+ * is re-sampled on every read under tasklist_lock.
+ *
+ * Returns 0, or -ENOMEM when an entry for a new uid cannot be allocated.
+ */
+static int uid_cputime_show(struct seq_file *m, void *v)
+{
+	struct user_namespace *user_ns = current_user_ns();
+	struct uid_entry *uid_entry = NULL;
+	struct task_struct *task, *temp;
+	cputime_t utime;
+	cputime_t stime;
+	unsigned long bkt;
+	uid_t uid;
+
+	rt_mutex_lock(&uid_lock);
+
+	/* the active_* sums are rebuilt from scratch below */
+	hash_for_each(hash_table, bkt, uid_entry, hash) {
+		uid_entry->active_utime = 0;
+		uid_entry->active_stime = 0;
+	}
+
+	read_lock(&tasklist_lock);
+	do_each_thread(temp, task) {
+		uid = from_kuid_munged(user_ns, task_uid(task));
+		/* reuse the previous lookup while the uid stays the same */
+		if (!uid_entry || uid_entry->uid != uid)
+			uid_entry = find_or_register_uid(uid);
+		if (!uid_entry) {
+			read_unlock(&tasklist_lock);
+			rt_mutex_unlock(&uid_lock);
+			pr_err("%s: failed to find the uid_entry for uid %d\n",
+				__func__, uid);
+			return -ENOMEM;
+		}
+		task_cputime_adjusted(task, &utime, &stime);
+		uid_entry->active_utime += utime;
+		uid_entry->active_stime += stime;
+	} while_each_thread(temp, task);
+	read_unlock(&tasklist_lock);
+
+	hash_for_each(hash_table, bkt, uid_entry, hash) {
+		cputime_t total_utime = uid_entry->utime +
+					uid_entry->active_utime;
+		cputime_t total_stime = uid_entry->stime +
+					uid_entry->active_stime;
+		unsigned long long user_usec, sys_usec;
+
+		user_usec = (unsigned long long)jiffies_to_msecs(
+				cputime_to_jiffies(total_utime)) * USEC_PER_MSEC;
+		sys_usec = (unsigned long long)jiffies_to_msecs(
+				cputime_to_jiffies(total_stime)) * USEC_PER_MSEC;
+
+		seq_printf(m, "%d: %llu %llu\n", uid_entry->uid,
+				user_usec, sys_usec);
+	}
+
+	rt_mutex_unlock(&uid_lock);
+	return 0;
+}
+
+static int uid_cputime_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, uid_cputime_show, PDE_DATA(inode));
+}
+
+static const struct file_operations uid_cputime_fops = {
+ .open = uid_cputime_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int uid_remove_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, NULL, NULL);
+}
+
+/*
+ * Write handler taking "<start>-<end>" (decimal): removes every uid
+ * entry in that inclusive range, together with its per-task entries.
+ * Input longer than the local buffer is truncated rather than rejected.
+ *
+ * Returns the (possibly truncated) byte count, -EFAULT when the user
+ * buffer cannot be copied, or -EINVAL when the text does not parse.
+ */
+static ssize_t uid_remove_write(struct file *file,
+			const char __user *buffer, size_t count, loff_t *ppos)
+{
+	struct uid_entry *uid_entry;
+	struct hlist_node *tmp;
+	char uids[128];
+	char *start_uid, *end_uid = NULL;
+	long int uid_start = 0, uid_end = 0;
+
+	if (count >= sizeof(uids))
+		count = sizeof(uids) - 1;
+
+	if (copy_from_user(uids, buffer, count))
+		return -EFAULT;
+	uids[count] = '\0';
+
+	/* split at the first '-': start_uid gets the head, end_uid the tail */
+	end_uid = uids;
+	start_uid = strsep(&end_uid, "-");
+	if (!start_uid || !end_uid)
+		return -EINVAL;
+
+	if (kstrtol(start_uid, 10, &uid_start) != 0)
+		return -EINVAL;
+	if (kstrtol(end_uid, 10, &uid_end) != 0)
+		return -EINVAL;
+
+	rt_mutex_lock(&uid_lock);
+
+	while (uid_start <= uid_end) {
+		hash_for_each_possible_safe(hash_table, uid_entry, tmp,
+							hash, (uid_t)uid_start) {
+			if (uid_start == uid_entry->uid) {
+				remove_uid_tasks(uid_entry);
+				hash_del(&uid_entry->hash);
+				kfree(uid_entry);
+			}
+		}
+		uid_start++;
+	}
+
+	rt_mutex_unlock(&uid_lock);
+	return count;
+}
+
+static const struct file_operations uid_remove_fops = {
+ .open = uid_remove_open,
+ .release = single_release,
+ .write = uid_remove_write,
+};
+
+
+/*
+ * Adds @task's I/O accounting counters to slot @slot of @uid_entry and
+ * forwards the same sample to the per-task accounting.
+ */
+static void add_uid_io_stats(struct uid_entry *uid_entry,
+			struct task_struct *task, int slot)
+{
+	struct io_stats *acc = &uid_entry->io[slot];
+
+	acc->read_bytes += task->ioac.read_bytes;
+	acc->write_bytes += compute_write_bytes(task);
+	acc->rchar += task->ioac.rchar;
+	acc->wchar += task->ioac.wchar;
+	acc->fsync += task->ioac.syscfs;
+
+	add_uid_tasks_io_stats(uid_entry, task, slot);
+}
+
+/*
+ * Re-samples I/O counters for every uid: clears each TOTAL_CURR slot,
+ * walks all threads under rcu_read_lock() adding their counters to the
+ * owning uid, then folds the growth into the bucket selected by each
+ * uid's state.  Threads whose uid entry cannot be allocated are skipped.
+ * The _locked suffix suggests the caller holds uid_lock; uid_io_show()
+ * does.
+ */
+static void update_io_stats_all_locked(void)
+{
+	struct user_namespace *user_ns = current_user_ns();
+	struct uid_entry *uid_entry = NULL;
+	struct task_struct *task, *temp;
+	unsigned long bkt;
+	uid_t uid;
+
+	hash_for_each(hash_table, bkt, uid_entry, hash) {
+		memset(&uid_entry->io[UID_STATE_TOTAL_CURR], 0,
+			sizeof(struct io_stats));
+		set_io_uid_tasks_zero(uid_entry);
+	}
+
+	rcu_read_lock();
+	do_each_thread(temp, task) {
+		uid = from_kuid_munged(user_ns, task_uid(task));
+		/* reuse the previous lookup while the uid stays the same */
+		if (!uid_entry || uid_entry->uid != uid)
+			uid_entry = find_or_register_uid(uid);
+		if (uid_entry)
+			add_uid_io_stats(uid_entry, task,
+					UID_STATE_TOTAL_CURR);
+	} while_each_thread(temp, task);
+	rcu_read_unlock();
+
+	hash_for_each(hash_table, bkt, uid_entry, hash) {
+		struct io_stats *io = uid_entry->io;
+
+		compute_io_bucket_stats(&io[uid_entry->state],
+					&io[UID_STATE_TOTAL_CURR],
+					&io[UID_STATE_TOTAL_LAST],
+					&io[UID_STATE_DEAD_TASKS]);
+		compute_io_uid_tasks(uid_entry);
+	}
+}
+
+/*
+ * Single-uid variant of the full re-sample: clears @uid_entry's
+ * TOTAL_CURR slot, adds the counters of every thread owned by that uid,
+ * then folds the growth into the bucket selected by the uid's state.
+ */
+static void update_io_stats_uid_locked(struct uid_entry *uid_entry)
+{
+	struct user_namespace *user_ns = current_user_ns();
+	struct task_struct *task, *temp;
+	struct io_stats *io = uid_entry->io;
+
+	memset(&io[UID_STATE_TOTAL_CURR], 0, sizeof(struct io_stats));
+	set_io_uid_tasks_zero(uid_entry);
+
+	rcu_read_lock();
+	do_each_thread(temp, task) {
+		if (from_kuid_munged(user_ns, task_uid(task)) ==
+				uid_entry->uid)
+			add_uid_io_stats(uid_entry, task,
+					UID_STATE_TOTAL_CURR);
+	} while_each_thread(temp, task);
+	rcu_read_unlock();
+
+	compute_io_bucket_stats(&io[uid_entry->state],
+				&io[UID_STATE_TOTAL_CURR],
+				&io[UID_STATE_TOTAL_LAST],
+				&io[UID_STATE_DEAD_TASKS]);
+	compute_io_uid_tasks(uid_entry);
+}
+
+
+/*
+ * seq_file show handler: refreshes all I/O statistics, then prints one
+ * line per uid with foreground rchar/wchar/read_bytes/write_bytes, the
+ * same four background counters, and foreground and background fsync,
+ * followed by that uid's per-task lines.  Always returns 0.
+ */
+static int uid_io_show(struct seq_file *m, void *v)
+{
+	struct uid_entry *uid_entry;
+	unsigned long bkt;
+
+	rt_mutex_lock(&uid_lock);
+
+	update_io_stats_all_locked();
+
+	hash_for_each(hash_table, bkt, uid_entry, hash) {
+		const struct io_stats *fg =
+				&uid_entry->io[UID_STATE_FOREGROUND];
+		const struct io_stats *bg =
+				&uid_entry->io[UID_STATE_BACKGROUND];
+
+		seq_printf(m, "%d %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+				uid_entry->uid,
+				fg->rchar, fg->wchar,
+				fg->read_bytes, fg->write_bytes,
+				bg->rchar, bg->wchar,
+				bg->read_bytes, bg->write_bytes,
+				fg->fsync, bg->fsync);
+
+		show_io_uid_tasks(m, uid_entry);
+	}
+
+	rt_mutex_unlock(&uid_lock);
+	return 0;
+}
+
+static int uid_io_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, uid_io_show, PDE_DATA(inode));
+}
+
+static const struct file_operations uid_io_fops = {
+ .open = uid_io_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int uid_procstat_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, NULL, NULL);
+}
+
+/*
+ * Write handler taking "<uid> <state>", where state is
+ * UID_STATE_FOREGROUND or UID_STATE_BACKGROUND.  When the state actually
+ * changes, the uid's I/O statistics are brought up to date first so the
+ * I/O done so far is credited to the old state.
+ *
+ * Returns @count on success, -EINVAL for over-long or malformed input,
+ * -EFAULT when the user buffer cannot be copied, and -ENOMEM when a new
+ * uid entry cannot be allocated (matching uid_cputime_show(); allocation
+ * failure is the only way find_or_register_uid() returns NULL).
+ */
+static ssize_t uid_procstat_write(struct file *file,
+			const char __user *buffer, size_t count, loff_t *ppos)
+{
+	struct uid_entry *uid_entry;
+	uid_t uid;
+	int argc, state;
+	char input[128];
+
+	if (count >= sizeof(input))
+		return -EINVAL;
+
+	if (copy_from_user(input, buffer, count))
+		return -EFAULT;
+
+	input[count] = '\0';
+
+	argc = sscanf(input, "%u %d", &uid, &state);
+	if (argc != 2)
+		return -EINVAL;
+
+	if (state != UID_STATE_BACKGROUND && state != UID_STATE_FOREGROUND)
+		return -EINVAL;
+
+	rt_mutex_lock(&uid_lock);
+
+	uid_entry = find_or_register_uid(uid);
+	if (!uid_entry) {
+		rt_mutex_unlock(&uid_lock);
+		return -ENOMEM;
+	}
+
+	if (uid_entry->state != state) {
+		update_io_stats_uid_locked(uid_entry);
+		uid_entry->state = state;
+	}
+
+	rt_mutex_unlock(&uid_lock);
+
+	return count;
+}
+
+static const struct file_operations uid_procstat_fops = {
+ .open = uid_procstat_open,
+ .release = single_release,
+ .write = uid_procstat_write,
+};
+
+/*
+ * Notifier callback registered for PROFILE_TASK_EXIT: adds the task's
+ * adjusted CPU times to its uid's totals and its I/O counters to the
+ * uid's DEAD_TASKS slot.  Always returns NOTIFY_OK; an allocation
+ * failure is only logged.
+ */
+static int process_notifier(struct notifier_block *self,
+			unsigned long cmd, void *v)
+{
+	struct task_struct *task = v;
+	struct uid_entry *uid_entry;
+	cputime_t utime, stime;
+	uid_t uid;
+
+	if (!task)
+		return NOTIFY_OK;
+
+	rt_mutex_lock(&uid_lock);
+	uid = from_kuid_munged(current_user_ns(), task_uid(task));
+	uid_entry = find_or_register_uid(uid);
+	if (uid_entry) {
+		task_cputime_adjusted(task, &utime, &stime);
+		uid_entry->utime += utime;
+		uid_entry->stime += stime;
+
+		add_uid_io_stats(uid_entry, task, UID_STATE_DEAD_TASKS);
+	} else {
+		pr_err("%s: failed to find uid %d\n", __func__, uid);
+	}
+	rt_mutex_unlock(&uid_lock);
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block process_notifier_block = {
+ .notifier_call = process_notifier,
+};
+
+/*
+ * Early initcall: initialises hash_table, creates the proc directories
+ * uid_cputime, uid_io and uid_procstat with their files, and registers
+ * the task-exit notifier.
+ *
+ * Returns 0 on success.  When a directory cannot be created, all three
+ * subtrees are removed and -ENOMEM is returned.  The results of
+ * proc_create_data() and profile_event_register() are not checked.
+ */
+static int __init proc_uid_sys_stats_init(void)
+{
+	hash_init(hash_table);
+
+	cpu_parent = proc_mkdir("uid_cputime", NULL);
+	if (!cpu_parent) {
+		pr_err("%s: failed to create uid_cputime proc entry\n",
+			__func__);
+		goto err;
+	}
+	proc_create_data("remove_uid_range", 0222, cpu_parent,
+			&uid_remove_fops, NULL);
+	proc_create_data("show_uid_stat", 0444, cpu_parent,
+			&uid_cputime_fops, NULL);
+
+	io_parent = proc_mkdir("uid_io", NULL);
+	if (!io_parent) {
+		pr_err("%s: failed to create uid_io proc entry\n",
+			__func__);
+		goto err;
+	}
+	proc_create_data("stats", 0444, io_parent,
+			&uid_io_fops, NULL);
+
+	proc_parent = proc_mkdir("uid_procstat", NULL);
+	if (!proc_parent) {
+		pr_err("%s: failed to create uid_procstat proc entry\n",
+			__func__);
+		goto err;
+	}
+	proc_create_data("set", 0222, proc_parent,
+			&uid_procstat_fops, NULL);
+
+	profile_event_register(PROFILE_TASK_EXIT, &process_notifier_block);
+
+	return 0;
+
+err:
+	/* each subtree is removed whether or not it was created above */
+	remove_proc_subtree("uid_cputime", NULL);
+	remove_proc_subtree("uid_io", NULL);
+	remove_proc_subtree("uid_procstat", NULL);
+	return -ENOMEM;
+}
+
+early_initcall(proc_uid_sys_stats_init);
diff --git a/drivers/mmc/card/Kconfig b/drivers/mmc/card/Kconfig
index 5562308699bc..6142ec1b9dfb 100644
--- a/drivers/mmc/card/Kconfig
+++ b/drivers/mmc/card/Kconfig
@@ -68,3 +68,15 @@ config MMC_TEST
This driver is only of interest to those developing or
testing a host driver. Most people should say N here.
+
+config MMC_SIMULATE_MAX_SPEED
+ bool "Turn on maximum speed control per block device"
+ depends on MMC_BLOCK
+ help
+ Say Y here to enable MMC device speed limiting. Used to test and
+ simulate the behavior of the system when confronted with a slow MMC.
+
+ Enables max_read_speed, max_write_speed and cache_size attributes to
+ control the write or read maximum KB/second speed behaviors.
+
+ If unsure, say N here.
diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c
index 709a872ed484..817fcf8c0ac6 100644
--- a/drivers/mmc/card/block.c
+++ b/drivers/mmc/card/block.c
@@ -287,6 +287,250 @@ out:
return ret;
}
+#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED
+
+static int max_read_speed, max_write_speed, cache_size = 4;
+
+module_param(max_read_speed, int, S_IRUSR | S_IRGRP);
+MODULE_PARM_DESC(max_read_speed, "maximum KB/s read speed 0=off");
+module_param(max_write_speed, int, S_IRUSR | S_IRGRP);
+MODULE_PARM_DESC(max_write_speed, "maximum KB/s write speed 0=off");
+module_param(cache_size, int, S_IRUSR | S_IRGRP);
+MODULE_PARM_DESC(cache_size, "MB high speed memory or SLC cache");
+
+/*
+ * helper macros and expectations:
+ * size - unsigned long number of bytes
+ * jiffies - unsigned long HZ timestamp difference
+ * speed - unsigned KB/s transfer rate
+ */
+#define size_and_speed_to_jiffies(size, speed) \
+ ((size) * HZ / (speed) / 1024UL)
+#define jiffies_and_speed_to_size(jiffies, speed) \
+ (((speed) * (jiffies) * 1024UL) / HZ)
+#define jiffies_and_size_to_speed(jiffies, size) \
+ ((size) * HZ / (jiffies) / 1024UL)
+
+/* Limits to report warning */
+/* jiffies_and_size_to_speed(10*HZ, queue_max_hw_sectors(q) * 512UL) ~ 25 */
+#define MIN_SPEED(q) 250 /* 10 times faster than a floppy disk */
+#define MAX_SPEED(q) jiffies_and_size_to_speed(1, queue_max_sectors(q) * 512UL)
+
+#define speed_valid(speed) ((speed) > 0)
+
+static const char off[] = "off\n";
+
+/*
+ * Formats a speed limit into @buf: "<n>KB/s\n" for a non-zero @speed,
+ * "off\n" for zero.  Returns the number of characters written.
+ */
+static int max_speed_show(int speed, char *buf)
+{
+	if (!speed)
+		return scnprintf(buf, PAGE_SIZE, off);
+
+	return scnprintf(buf, PAGE_SIZE, "%uKB/s\n", speed);
+}
+
+/*
+ * Parses a speed limit written by the user.  "off" (any case) and 0 both
+ * mean no limit.  Values above MAX_SPEED(q) or below MIN_SPEED(q) are
+ * accepted but produce a warning.
+ *
+ * Returns the parsed KB/s value (0 = off), or -EINVAL when @buf is not
+ * an unsigned number that fits in an int.
+ */
+static int max_speed_store(const char *buf, struct request_queue *q)
+{
+	unsigned int bound, requested = 0;
+
+	/* sizeof(off) - 2 drops the '\n' and the NUL: compare "off" only */
+	if (strncasecmp(off, buf, sizeof(off) - 2) == 0)
+		return requested;
+	if (kstrtouint(buf, 0, &requested) || (requested > INT_MAX))
+		return -EINVAL;
+	if (!requested)
+		return requested;
+
+	bound = MAX_SPEED(q);
+	if (requested > bound)
+		pr_warn("max speed %u ineffective above %u\n", requested, bound);
+	bound = MIN_SPEED(q);
+	if (requested < bound)
+		pr_warn("max speed %u painful below %u\n", requested, bound);
+
+	return requested;
+}
+
+/* sysfs show: prints the queue's current max_write_speed setting. */
+static ssize_t max_write_speed_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev));
+	int speed = atomic_read(&md->queue.max_write_speed);
+	int len = max_speed_show(speed, buf);
+
+	mmc_blk_put(md);
+	return len;
+}
+
+/*
+ * sysfs store: parses @buf with max_speed_store() and, when valid,
+ * updates the queue's max_write_speed.  Returns @count on success or
+ * the negative error from the parser.
+ */
+static ssize_t max_write_speed_store(struct device *dev,
+		struct device_attribute *attr,
+		const char *buf, size_t count)
+{
+	struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev));
+	int set = max_speed_store(buf, md->queue.queue);
+	ssize_t ret = count;
+
+	if (set < 0)
+		ret = set;
+	else
+		atomic_set(&md->queue.max_write_speed, set);
+
+	mmc_blk_put(md);
+	return ret;
+}
+
+static const DEVICE_ATTR(max_write_speed, S_IRUGO | S_IWUSR,
+ max_write_speed_show, max_write_speed_store);
+
+/* sysfs show: prints the queue's current max_read_speed setting. */
+static ssize_t max_read_speed_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev));
+	int speed = atomic_read(&md->queue.max_read_speed);
+	int len = max_speed_show(speed, buf);
+
+	mmc_blk_put(md);
+	return len;
+}
+
+/*
+ * sysfs store: parses @buf with max_speed_store() and, when valid,
+ * updates the queue's max_read_speed.  Returns @count on success or
+ * the negative error from the parser.
+ */
+static ssize_t max_read_speed_store(struct device *dev,
+		struct device_attribute *attr,
+		const char *buf, size_t count)
+{
+	struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev));
+	int set = max_speed_store(buf, md->queue.queue);
+	ssize_t ret = count;
+
+	if (set < 0)
+		ret = set;
+	else
+		atomic_set(&md->queue.max_read_speed, set);
+
+	mmc_blk_put(md);
+	return ret;
+}
+
+static const DEVICE_ATTR(max_read_speed, S_IRUGO | S_IWUSR,
+ max_read_speed_show, max_read_speed_store);
+
+/*
+ * sysfs show for the simulated cache:
+ *   "off\n"             when the cache size is 0,
+ *   "<n>MB\n"           when no valid write speed limit is set,
+ *   "<n>MB <p>% used\n" otherwise, where the used amount is cache_used
+ *                       minus what could have drained at the write
+ *                       speed since cache_jiffies.
+ */
+static ssize_t cache_size_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev));
+	struct mmc_queue *mq = &md->queue;
+	int cache_mb = atomic_read(&mq->cache_size);
+	unsigned long size;
+	long used;
+	int speed;
+	int ret;
+
+	if (!cache_mb) {
+		ret = scnprintf(buf, PAGE_SIZE, off);
+		goto out;
+	}
+
+	speed = atomic_read(&mq->max_write_speed);
+	if (!speed_valid(speed)) {
+		ret = scnprintf(buf, PAGE_SIZE, "%uMB\n", cache_mb);
+		goto out;
+	}
+
+	/* We accept race between cache_jiffies and cache_used */
+	size = jiffies_and_speed_to_size(jiffies - mq->cache_jiffies, speed);
+	used = atomic_long_read(&mq->cache_used);
+
+	if (size >= used)
+		size = 0;
+	else
+		size = (used - size) * 100 / cache_mb
+			/ 1024UL / 1024UL;
+
+	ret = scnprintf(buf, PAGE_SIZE, "%uMB %lu%% used\n",
+			cache_mb, size);
+out:
+	mmc_blk_put(md);
+	return ret;
+}
+
+/*
+ * sysfs store for the simulated cache size.  "off" (any case) stores 0;
+ * otherwise @buf must be an unsigned number that fits in an int.
+ * Returns @count, or -EINVAL for unparsable input.
+ */
+static ssize_t cache_size_store(struct device *dev,
+		struct device_attribute *attr,
+		const char *buf, size_t count)
+{
+	struct mmc_blk_data *md;
+	unsigned int set = 0;
+
+	/* sizeof(off) - 2 drops the '\n' and the NUL: compare "off" only */
+	if (strncasecmp(off, buf, sizeof(off) - 2) != 0) {
+		if (kstrtouint(buf, 0, &set) || (set > INT_MAX))
+			return -EINVAL;
+	}
+
+	md = mmc_blk_get(dev_to_disk(dev));
+	atomic_set(&md->queue.cache_size, set);
+	mmc_blk_put(md);
+	return count;
+}
+
+static const DEVICE_ATTR(cache_size, S_IRUGO | S_IWUSR,
+ cache_size_show, cache_size_store);
+
+/*
+ * Correct for write-back: reduces cache_used by the amount that could
+ * have drained at the write speed limit between cache_jiffies and
+ * @waitfor (never below 0), stores the result together with @waitfor as
+ * the new cache_jiffies, and returns it.  Without a valid write speed
+ * limit the stored and returned value is 0.
+ */
+static long mmc_blk_cache_used(struct mmc_queue *mq, unsigned long waitfor)
+{
+	int speed = atomic_read(&mq->max_write_speed);
+	long used = 0;
+
+	if (speed_valid(speed)) {
+		unsigned long drained = jiffies_and_speed_to_size(
+				waitfor - mq->cache_jiffies, speed);
+
+		used = atomic_long_read(&mq->cache_used);
+		if (drained >= used)
+			used = 0;
+		else
+			used -= drained;
+	}
+
+	atomic_long_set(&mq->cache_used, used);
+	mq->cache_jiffies = waitfor;
+
+	return used;
+}
+
+/*
+ * Sleeps long enough that @req appears to have been served at the
+ * configured maximum read or write speed.  @waitfor is the jiffies value
+ * taken when the request was issued.
+ *
+ * For writes with a non-zero cache size, the request's bytes are first
+ * added to the simulated cache and only the part exceeding the cache
+ * capacity is charged at the write speed.  Nothing happens when @req is
+ * NULL or no valid limit is set for the request's direction.
+ */
+static void mmc_blk_simulate_delay(
+	struct mmc_queue *mq,
+	struct request *req,
+	unsigned long waitfor)
+{
+	unsigned long bytes;
+	int max_speed;
+
+	if (!req)
+		return;
+
+	if (rq_data_dir(req) == READ)
+		max_speed = atomic_read(&mq->max_read_speed);
+	else
+		max_speed = atomic_read(&mq->max_write_speed);
+	if (!speed_valid(max_speed))
+		return;
+
+	bytes = blk_rq_bytes(req);
+
+	if (rq_data_dir(req) != READ) {
+		int cache_mb = atomic_read(&mq->cache_size);
+
+		if (cache_mb) {
+			unsigned long size = cache_mb * 1024L * 1024L;
+			long used = mmc_blk_cache_used(mq, waitfor);
+
+			used += bytes;
+			atomic_long_set(&mq->cache_used, used);
+			/* only the overflow beyond the cache is throttled */
+			bytes = 0;
+			if (used > size)
+				bytes = used - size;
+		}
+	}
+
+	waitfor += size_and_speed_to_jiffies(bytes, max_speed);
+	if (time_is_after_jiffies(waitfor)) {
+		long msecs = jiffies_to_msecs(waitfor - jiffies);
+
+		if (likely(msecs > 0))
+			msleep(msecs);
+	}
+}
+
+#else
+
+#define mmc_blk_simulate_delay(mq, req, waitfor)
+
+#endif
+
static int mmc_blk_open(struct block_device *bdev, fmode_t mode)
{
struct mmc_blk_data *md = mmc_blk_get(bdev->bd_disk);
@@ -1284,6 +1528,23 @@ static int mmc_blk_issue_flush(struct mmc_queue *mq, struct request *req)
if (ret)
ret = -EIO;
+#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED
+ else if (atomic_read(&mq->cache_size)) {
+ long used = mmc_blk_cache_used(mq, jiffies);
+
+ if (used) {
+ int speed = atomic_read(&mq->max_write_speed);
+
+ if (speed_valid(speed)) {
+ unsigned long msecs = jiffies_to_msecs(
+ size_and_speed_to_jiffies(
+ used, speed));
+ if (msecs)
+ msleep(msecs);
+ }
+ }
+ }
+#endif
blk_end_request_all(req, ret);
return ret ? 0 : 1;
@@ -1965,6 +2226,9 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *rqc)
struct mmc_async_req *areq;
const u8 packed_nr = 2;
u8 reqs = 0;
+#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED
+ unsigned long waitfor = jiffies;
+#endif
if (!rqc && !mq->mqrq_prev->req)
return 0;
@@ -2015,6 +2279,8 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *rqc)
*/
mmc_blk_reset_success(md, type);
+ mmc_blk_simulate_delay(mq, rqc, waitfor);
+
if (mmc_packed_cmd(mq_rq->cmd_type)) {
ret = mmc_blk_end_packed_req(mq_rq);
break;
@@ -2437,6 +2703,14 @@ static void mmc_blk_remove_req(struct mmc_blk_data *md)
card->ext_csd.boot_ro_lockable)
device_remove_file(disk_to_dev(md->disk),
&md->power_ro_lock);
+#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED
+ device_remove_file(disk_to_dev(md->disk),
+ &dev_attr_max_write_speed);
+ device_remove_file(disk_to_dev(md->disk),
+ &dev_attr_max_read_speed);
+ device_remove_file(disk_to_dev(md->disk),
+ &dev_attr_cache_size);
+#endif
del_gendisk(md->disk);
}
@@ -2471,6 +2745,24 @@ static int mmc_add_disk(struct mmc_blk_data *md)
ret = device_create_file(disk_to_dev(md->disk), &md->force_ro);
if (ret)
goto force_ro_fail;
+#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED
+ atomic_set(&md->queue.max_write_speed, max_write_speed);
+ ret = device_create_file(disk_to_dev(md->disk),
+ &dev_attr_max_write_speed);
+ if (ret)
+ goto max_write_speed_fail;
+ atomic_set(&md->queue.max_read_speed, max_read_speed);
+ ret = device_create_file(disk_to_dev(md->disk),
+ &dev_attr_max_read_speed);
+ if (ret)
+ goto max_read_speed_fail;
+ atomic_set(&md->queue.cache_size, cache_size);
+ atomic_long_set(&md->queue.cache_used, 0);
+ md->queue.cache_jiffies = jiffies;
+ ret = device_create_file(disk_to_dev(md->disk), &dev_attr_cache_size);
+ if (ret)
+ goto cache_size_fail;
+#endif
if ((md->area_type & MMC_BLK_DATA_AREA_BOOT) &&
card->ext_csd.boot_ro_lockable) {
@@ -2495,6 +2787,14 @@ static int mmc_add_disk(struct mmc_blk_data *md)
return ret;
power_ro_lock_fail:
+#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED
+ device_remove_file(disk_to_dev(md->disk), &dev_attr_cache_size);
+cache_size_fail:
+ device_remove_file(disk_to_dev(md->disk), &dev_attr_max_read_speed);
+max_read_speed_fail:
+ device_remove_file(disk_to_dev(md->disk), &dev_attr_max_write_speed);
+max_write_speed_fail:
+#endif
device_remove_file(disk_to_dev(md->disk), &md->force_ro);
force_ro_fail:
del_gendisk(md->disk);
diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c
index 8037f73a109a..1810f765f0d1 100644
--- a/drivers/mmc/card/queue.c
+++ b/drivers/mmc/card/queue.c
@@ -19,6 +19,7 @@
#include <linux/mmc/card.h>
#include <linux/mmc/host.h>
+#include <linux/sched/rt.h>
#include "queue.h"
#include "block.h"
@@ -53,6 +54,11 @@ static int mmc_queue_thread(void *d)
{
struct mmc_queue *mq = d;
struct request_queue *q = mq->queue;
+ struct sched_param scheduler_params = {0};
+
+ scheduler_params.sched_priority = 1;
+
+ sched_setscheduler(current, SCHED_FIFO, &scheduler_params);
current->flags |= PF_MEMALLOC;
diff --git a/drivers/mmc/card/queue.h b/drivers/mmc/card/queue.h
index 342f1e3f301e..fe58d31cbc7e 100644
--- a/drivers/mmc/card/queue.h
+++ b/drivers/mmc/card/queue.h
@@ -62,6 +62,14 @@ struct mmc_queue {
struct mmc_queue_req mqrq[2];
struct mmc_queue_req *mqrq_cur;
struct mmc_queue_req *mqrq_prev;
+#ifdef CONFIG_MMC_SIMULATE_MAX_SPEED
+ atomic_t max_write_speed;
+ atomic_t max_read_speed;
+ atomic_t cache_size;
+ /* i/o tracking */
+ atomic_long_t cache_used;
+ unsigned long cache_jiffies;
+#endif
};
extern int mmc_init_queue(struct mmc_queue *, struct mmc_card *, spinlock_t *,
diff --git a/drivers/mmc/core/Kconfig b/drivers/mmc/core/Kconfig
index 250f223aaa80..daad32f85033 100644
--- a/drivers/mmc/core/Kconfig
+++ b/drivers/mmc/core/Kconfig
@@ -22,3 +22,18 @@ config PWRSEQ_SIMPLE
This driver can also be built as a module. If so, the module
will be called pwrseq_simple.
+
+config MMC_EMBEDDED_SDIO
+ bool "MMC embedded SDIO device support (EXPERIMENTAL)"
+ help
+ If you say Y here, support will be added for embedded SDIO
+ devices which do not contain the necessary enumeration
+ support in hardware to be properly detected.
+
+config MMC_PARANOID_SD_INIT
+ bool "Enable paranoid SD card initialization (EXPERIMENTAL)"
+ help
+ If you say Y here, the MMC layer will be extra paranoid
+ about re-trying SD init requests. This can be a useful
+ work-around for buggy controllers and hardware. Enable
+ if you are experiencing issues with SD detection.
diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index cff5829790c9..c82c203c5d26 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -201,6 +201,19 @@ void mmc_request_done(struct mmc_host *host, struct mmc_request *mrq)
pr_debug("%s: %d bytes transferred: %d\n",
mmc_hostname(host),
mrq->data->bytes_xfered, mrq->data->error);
+#ifdef CONFIG_BLOCK
+ if (mrq->lat_hist_enabled) {
+ ktime_t completion;
+ u_int64_t delta_us;
+
+ completion = ktime_get();
+ delta_us = ktime_us_delta(completion,
+ mrq->io_start);
+ blk_update_latency_hist(&host->io_lat_s,
+ (mrq->data->flags & MMC_DATA_READ),
+ delta_us);
+ }
+#endif
}
if (mrq->stop) {
@@ -699,8 +712,16 @@ struct mmc_async_req *mmc_start_req(struct mmc_host *host,
}
}
- if (!err && areq)
+ if (!err && areq) {
+#ifdef CONFIG_BLOCK
+ if (host->latency_hist_enabled) {
+ areq->mrq->io_start = ktime_get();
+ areq->mrq->lat_hist_enabled = 1;
+ } else
+ areq->mrq->lat_hist_enabled = 0;
+#endif
start_err = __mmc_start_data_req(host, areq->mrq);
+ }
if (host->areq)
mmc_post_req(host, host->areq->mrq, 0);
@@ -2051,7 +2072,7 @@ void mmc_init_erase(struct mmc_card *card)
}
static unsigned int mmc_mmc_erase_timeout(struct mmc_card *card,
- unsigned int arg, unsigned int qty)
+ unsigned int arg, unsigned int qty)
{
unsigned int erase_timeout;
@@ -3034,6 +3055,22 @@ void mmc_init_context_info(struct mmc_host *host)
init_waitqueue_head(&host->context_info.wait);
}
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+/*
+ * Store caller-supplied SDIO description tables in host->embedded_sdio_data.
+ *
+ * @host: host whose embedded_sdio_data is filled in
+ * @cis: CIS data pointer to record (stored as-is, not copied)
+ * @cccr: CCCR data pointer to record (stored as-is, not copied)
+ * @funcs: per-function description array pointer (stored as-is)
+ * @num_funcs: number of entries in @funcs
+ */
+void mmc_set_embedded_sdio_data(struct mmc_host *host,
+ struct sdio_cis *cis,
+ struct sdio_cccr *cccr,
+ struct sdio_embedded_func *funcs,
+ int num_funcs)
+{
+ host->embedded_sdio_data.cis = cis;
+ host->embedded_sdio_data.cccr = cccr;
+ host->embedded_sdio_data.funcs = funcs;
+ host->embedded_sdio_data.num_funcs = num_funcs;
+}
+
+EXPORT_SYMBOL(mmc_set_embedded_sdio_data);
+#endif
+
static int __init mmc_init(void)
{
int ret;
@@ -3066,6 +3103,56 @@ static void __exit mmc_exit(void)
mmc_unregister_bus();
}
+#ifdef CONFIG_BLOCK
+/*
+ * sysfs show handler for the "latency_hist" attribute: formats the I/O
+ * latency histogram of the host owning @dev into @buf.
+ */
+static ssize_t
+latency_hist_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return blk_latency_hist_show(&cls_dev_to_mmc_host(dev)->io_lat_s, buf);
+}
+
+/*
+ * sysfs store handler for the "latency_hist" attribute.
+ *
+ * Values permitted 0, 1, 2.
+ * 0 -> Disable IO latency histograms (default)
+ * 1 -> Enable IO latency histograms
+ * 2 -> Zero out IO latency histograms
+ *
+ * Returns @count on success, or -EINVAL when @buf is not an integer or is
+ * not one of the permitted values.
+ */
+static ssize_t
+latency_hist_store(struct device *dev, struct device_attribute *attr,
+		   const char *buf, size_t count)
+{
+	struct mmc_host *host = cls_dev_to_mmc_host(dev);
+	long value;
+
+	if (kstrtol(buf, 0, &value))
+		return -EINVAL;
+
+	if (value == BLK_IO_LAT_HIST_ZERO)
+		blk_zero_latency_hist(&host->io_lat_s);
+	else if (value == BLK_IO_LAT_HIST_ENABLE ||
+		 value == BLK_IO_LAT_HIST_DISABLE)
+		host->latency_hist_enabled = value;
+	else
+		return -EINVAL;	/* do not report success for unknown values */
+
+	return count;
+}
+
+static DEVICE_ATTR(latency_hist, S_IRUGO | S_IWUSR,
+ latency_hist_show, latency_hist_store);
+
+/*
+ * Create the "latency_hist" attribute on the host's class device.  A
+ * failure is only logged; nothing is reported to the caller.
+ */
+void
+mmc_latency_hist_sysfs_init(struct mmc_host *host)
+{
+	int ret = device_create_file(&host->class_dev, &dev_attr_latency_hist);
+
+	if (!ret)
+		return;
+
+	dev_err(&host->class_dev,
+		"Failed to create latency_hist sysfs entry\n");
+}
+
+/* Remove the "latency_hist" attribute from the host's class device. */
+void
+mmc_latency_hist_sysfs_exit(struct mmc_host *host)
+{
+ device_remove_file(&host->class_dev, &dev_attr_latency_hist);
+}
+#endif
+
subsys_initcall(mmc_init);
module_exit(mmc_exit);
diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c
index 848b3453517e..07f88919693d 100644
--- a/drivers/mmc/core/host.c
+++ b/drivers/mmc/core/host.c
@@ -31,8 +31,6 @@
#include "slot-gpio.h"
#include "pwrseq.h"
-#define cls_dev_to_mmc_host(d) container_of(d, struct mmc_host, class_dev)
-
static DEFINE_IDA(mmc_host_ida);
static DEFINE_SPINLOCK(mmc_host_lock);
@@ -428,8 +426,13 @@ int mmc_add_host(struct mmc_host *host)
mmc_add_host_debugfs(host);
#endif
+#ifdef CONFIG_BLOCK
+ mmc_latency_hist_sysfs_init(host);
+#endif
+
mmc_start_host(host);
- mmc_register_pm_notifier(host);
+ if (!(host->pm_flags & MMC_PM_IGNORE_PM_NOTIFY))
+ mmc_register_pm_notifier(host);
return 0;
}
@@ -446,13 +449,18 @@ EXPORT_SYMBOL(mmc_add_host);
*/
void mmc_remove_host(struct mmc_host *host)
{
- mmc_unregister_pm_notifier(host);
+ if (!(host->pm_flags & MMC_PM_IGNORE_PM_NOTIFY))
+ mmc_unregister_pm_notifier(host);
mmc_stop_host(host);
#ifdef CONFIG_DEBUG_FS
mmc_remove_host_debugfs(host);
#endif
+#ifdef CONFIG_BLOCK
+ mmc_latency_hist_sysfs_exit(host);
+#endif
+
device_del(&host->class_dev);
led_trigger_unregister_simple(host->led);
diff --git a/drivers/mmc/core/host.h b/drivers/mmc/core/host.h
index 992bf5397633..bf38533406fd 100644
--- a/drivers/mmc/core/host.h
+++ b/drivers/mmc/core/host.h
@@ -12,6 +12,8 @@
#define _MMC_CORE_HOST_H
#include <linux/mmc/host.h>
+#define cls_dev_to_mmc_host(d) container_of(d, struct mmc_host, class_dev)
+
int mmc_register_host_class(void);
void mmc_unregister_host_class(void);
@@ -21,5 +23,8 @@ void mmc_retune_hold(struct mmc_host *host);
void mmc_retune_release(struct mmc_host *host);
int mmc_retune(struct mmc_host *host);
+void mmc_latency_hist_sysfs_init(struct mmc_host *host);
+void mmc_latency_hist_sysfs_exit(struct mmc_host *host);
+
#endif
diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
index 0c6de9f12ee8..3e5954f8734d 100644
--- a/drivers/mmc/core/mmc.c
+++ b/drivers/mmc/core/mmc.c
@@ -617,6 +617,12 @@ static int mmc_decode_ext_csd(struct mmc_card *card, u8 *ext_csd)
card->ext_csd.ffu_capable =
(ext_csd[EXT_CSD_SUPPORTED_MODE] & 0x1) &&
!(ext_csd[EXT_CSD_FW_CONFIG] & 0x1);
+
+ card->ext_csd.pre_eol_info = ext_csd[EXT_CSD_PRE_EOL_INFO];
+ card->ext_csd.device_life_time_est_typ_a =
+ ext_csd[EXT_CSD_DEVICE_LIFE_TIME_EST_TYP_A];
+ card->ext_csd.device_life_time_est_typ_b =
+ ext_csd[EXT_CSD_DEVICE_LIFE_TIME_EST_TYP_B];
}
out:
return err;
@@ -746,6 +752,11 @@ MMC_DEV_ATTR(manfid, "0x%06x\n", card->cid.manfid);
MMC_DEV_ATTR(name, "%s\n", card->cid.prod_name);
MMC_DEV_ATTR(oemid, "0x%04x\n", card->cid.oemid);
MMC_DEV_ATTR(prv, "0x%x\n", card->cid.prv);
+MMC_DEV_ATTR(rev, "0x%x\n", card->ext_csd.rev);
+MMC_DEV_ATTR(pre_eol_info, "%02x\n", card->ext_csd.pre_eol_info);
+MMC_DEV_ATTR(life_time, "0x%02x 0x%02x\n",
+ card->ext_csd.device_life_time_est_typ_a,
+ card->ext_csd.device_life_time_est_typ_b);
MMC_DEV_ATTR(serial, "0x%08x\n", card->cid.serial);
MMC_DEV_ATTR(enhanced_area_offset, "%llu\n",
card->ext_csd.enhanced_area_offset);
@@ -799,6 +810,9 @@ static struct attribute *mmc_std_attrs[] = {
&dev_attr_name.attr,
&dev_attr_oemid.attr,
&dev_attr_prv.attr,
+ &dev_attr_rev.attr,
+ &dev_attr_pre_eol_info.attr,
+ &dev_attr_life_time.attr,
&dev_attr_serial.attr,
&dev_attr_enhanced_area_offset.attr,
&dev_attr_enhanced_area_size.attr,
diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
index f09148a4ab55..ad709340e663 100644
--- a/drivers/mmc/core/sd.c
+++ b/drivers/mmc/core/sd.c
@@ -847,6 +847,9 @@ int mmc_sd_setup_card(struct mmc_host *host, struct mmc_card *card,
bool reinit)
{
int err;
+#ifdef CONFIG_MMC_PARANOID_SD_INIT
+ int retries;
+#endif
if (!reinit) {
/*
@@ -873,7 +876,26 @@ int mmc_sd_setup_card(struct mmc_host *host, struct mmc_card *card,
/*
* Fetch switch information from card.
*/
+#ifdef CONFIG_MMC_PARANOID_SD_INIT
+ for (retries = 1; retries <= 3; retries++) {
+ err = mmc_read_switch(card);
+ if (!err) {
+ if (retries > 1) {
+ printk(KERN_WARNING
+ "%s: recovered\n",
+ mmc_hostname(host));
+ }
+ break;
+ } else {
+ printk(KERN_WARNING
+ "%s: read switch failed (attempt %d)\n",
+ mmc_hostname(host), retries);
+ }
+ }
+#else
err = mmc_read_switch(card);
+#endif
+
if (err)
return err;
}
@@ -1071,7 +1093,10 @@ static int mmc_sd_alive(struct mmc_host *host)
*/
static void mmc_sd_detect(struct mmc_host *host)
{
- int err;
+ int err = 0;
+#ifdef CONFIG_MMC_PARANOID_SD_INIT
+ int retries = 5;
+#endif
BUG_ON(!host);
BUG_ON(!host->card);
@@ -1081,7 +1106,23 @@ static void mmc_sd_detect(struct mmc_host *host)
/*
* Just check if our card has been removed.
*/
+#ifdef CONFIG_MMC_PARANOID_SD_INIT
+ while(retries) {
+ err = mmc_send_status(host->card, NULL);
+ if (err) {
+ retries--;
+ udelay(5);
+ continue;
+ }
+ break;
+ }
+ if (!retries) {
+ printk(KERN_ERR "%s(%s): Unable to re-detect card (%d)\n",
+ __func__, mmc_hostname(host), err);
+ }
+#else
err = _mmc_detect_card_removed(host);
+#endif
mmc_put_card(host->card);
@@ -1143,6 +1184,9 @@ static int mmc_sd_suspend(struct mmc_host *host)
static int _mmc_sd_resume(struct mmc_host *host)
{
int err = 0;
+#ifdef CONFIG_MMC_PARANOID_SD_INIT
+ int retries;
+#endif
BUG_ON(!host);
BUG_ON(!host->card);
@@ -1153,7 +1197,23 @@ static int _mmc_sd_resume(struct mmc_host *host)
goto out;
mmc_power_up(host, host->card->ocr);
+#ifdef CONFIG_MMC_PARANOID_SD_INIT
+ retries = 5;
+ while (retries) {
+ err = mmc_sd_init_card(host, host->card->ocr, host->card);
+
+ if (err) {
+ printk(KERN_ERR "%s: Re-init card rc = %d (retries = %d)\n",
+ mmc_hostname(host), err, retries);
+ mdelay(5);
+ retries--;
+ continue;
+ }
+ break;
+ }
+#else
err = mmc_sd_init_card(host, host->card->ocr, host->card);
+#endif
mmc_card_clr_suspended(host->card);
out:
@@ -1228,6 +1288,9 @@ int mmc_attach_sd(struct mmc_host *host)
{
int err;
u32 ocr, rocr;
+#ifdef CONFIG_MMC_PARANOID_SD_INIT
+ int retries;
+#endif
BUG_ON(!host);
WARN_ON(!host->claimed);
@@ -1264,9 +1327,27 @@ int mmc_attach_sd(struct mmc_host *host)
/*
* Detect and init the card.
*/
+#ifdef CONFIG_MMC_PARANOID_SD_INIT
+ retries = 5;
+ while (retries) {
+ err = mmc_sd_init_card(host, rocr, NULL);
+ if (err) {
+ retries--;
+ continue;
+ }
+ break;
+ }
+
+ if (!retries) {
+ printk(KERN_ERR "%s: mmc_sd_init_card() failure (err = %d)\n",
+ mmc_hostname(host), err);
+ goto err;
+ }
+#else
err = mmc_sd_init_card(host, rocr, NULL);
if (err)
goto err;
+#endif
mmc_release_host(host);
err = mmc_add_card(host->card);
diff --git a/drivers/mmc/core/sdio.c b/drivers/mmc/core/sdio.c
index bd44ba8116d1..b5ec3c8cf580 100644
--- a/drivers/mmc/core/sdio.c
+++ b/drivers/mmc/core/sdio.c
@@ -10,6 +10,7 @@
*/
#include <linux/err.h>
+#include <linux/module.h>
#include <linux/pm_runtime.h>
#include <linux/mmc/host.h>
@@ -21,6 +22,7 @@
#include "core.h"
#include "bus.h"
+#include "host.h"
#include "sd.h"
#include "sdio_bus.h"
#include "mmc_ops.h"
@@ -28,6 +30,10 @@
#include "sdio_ops.h"
#include "sdio_cis.h"
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+#include <linux/mmc/sdio_ids.h>
+#endif
+
static int sdio_read_fbr(struct sdio_func *func)
{
int ret;
@@ -697,19 +703,35 @@ try_again:
goto finish;
}
- /*
- * Read the common registers.
- */
- err = sdio_read_cccr(card, ocr);
- if (err)
- goto remove;
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+ if (host->embedded_sdio_data.cccr)
+ memcpy(&card->cccr, host->embedded_sdio_data.cccr, sizeof(struct sdio_cccr));
+ else {
+#endif
+ /*
+ * Read the common registers.
+ */
+ err = sdio_read_cccr(card, ocr);
+ if (err)
+ goto remove;
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+ }
+#endif
- /*
- * Read the common CIS tuples.
- */
- err = sdio_read_common_cis(card);
- if (err)
- goto remove;
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+ if (host->embedded_sdio_data.cis)
+ memcpy(&card->cis, host->embedded_sdio_data.cis, sizeof(struct sdio_cis));
+ else {
+#endif
+ /*
+ * Read the common CIS tuples.
+ */
+ err = sdio_read_common_cis(card);
+ if (err)
+ goto remove;
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+ }
+#endif
if (oldcard) {
int same = (card->cis.vendor == oldcard->cis.vendor &&
@@ -1118,14 +1140,36 @@ int mmc_attach_sdio(struct mmc_host *host)
funcs = (ocr & 0x70000000) >> 28;
card->sdio_funcs = 0;
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+ if (host->embedded_sdio_data.funcs)
+ card->sdio_funcs = funcs = host->embedded_sdio_data.num_funcs;
+#endif
+
/*
* Initialize (but don't add) all present functions.
*/
for (i = 0; i < funcs; i++, card->sdio_funcs++) {
- err = sdio_init_func(host->card, i + 1);
- if (err)
- goto remove;
-
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+ if (host->embedded_sdio_data.funcs) {
+ struct sdio_func *tmp;
+
+ tmp = sdio_alloc_func(host->card);
+ if (IS_ERR(tmp))
+ goto remove;
+ tmp->num = (i + 1);
+ card->sdio_func[i] = tmp;
+ tmp->class = host->embedded_sdio_data.funcs[i].f_class;
+ tmp->max_blksize = host->embedded_sdio_data.funcs[i].f_maxblksize;
+ tmp->vendor = card->cis.vendor;
+ tmp->device = card->cis.device;
+ } else {
+#endif
+ err = sdio_init_func(host->card, i + 1);
+ if (err)
+ goto remove;
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+ }
+#endif
/*
* Enable Runtime PM for this func (if supported)
*/
@@ -1173,3 +1217,42 @@ err:
return err;
}
+/**
+ * sdio_reset_comm - re-run SDIO initialisation on an existing card
+ * @card: card to reinitialise
+ *
+ * Claims the host, disables retuning, sends the card to idle, sets the
+ * clock to host->f_min and repeats the I/O operating-condition query and
+ * voltage selection before calling mmc_sdio_init_card() with @card as the
+ * old card.
+ *
+ * Returns 0 on success or a negative errno (-EINVAL when no common voltage
+ * is found).  The host is released on both paths.
+ */
+int sdio_reset_comm(struct mmc_card *card)
+{
+	struct mmc_host *host = card->host;
+	u32 ocr;
+	u32 rocr;
+	int err;
+
+	/* explicit log levels: a bare printk() uses the default level */
+	printk(KERN_DEBUG "%s():\n", __func__);
+	mmc_claim_host(host);
+
+	mmc_retune_disable(host);
+
+	mmc_go_idle(host);
+
+	mmc_set_clock(host, host->f_min);
+
+	err = mmc_send_io_op_cond(host, 0, &ocr);
+	if (err)
+		goto err;
+
+	rocr = mmc_select_voltage(host, ocr);
+	if (!rocr) {
+		err = -EINVAL;
+		goto err;
+	}
+
+	err = mmc_sdio_init_card(host, rocr, card, 0);
+	if (err)
+		goto err;
+
+	mmc_release_host(host);
+	return 0;
+err:
+	printk(KERN_ERR "%s: Error resetting SDIO communications (%d)\n",
+	       mmc_hostname(host), err);
+	mmc_release_host(host);
+	return err;
+}
+EXPORT_SYMBOL(sdio_reset_comm);
diff --git a/drivers/mmc/core/sdio_bus.c b/drivers/mmc/core/sdio_bus.c
index d56a3b6c2fb9..528524a22c80 100644
--- a/drivers/mmc/core/sdio_bus.c
+++ b/drivers/mmc/core/sdio_bus.c
@@ -28,6 +28,10 @@
#include "sdio_cis.h"
#include "sdio_bus.h"
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+#include <linux/mmc/host.h>
+#endif
+
#define to_sdio_driver(d) container_of(d, struct sdio_driver, drv)
/* show configuration fields */
@@ -263,7 +267,14 @@ static void sdio_release_func(struct device *dev)
{
struct sdio_func *func = dev_to_sdio_func(dev);
- sdio_free_func_cis(func);
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+ /*
+ * If this device is embedded then we never allocated
+ * cis tables for this func
+ */
+ if (!func->card->host->embedded_sdio_data.funcs)
+#endif
+ sdio_free_func_cis(func);
kfree(func->info);
kfree(func->tmpbuf);
diff --git a/drivers/mmc/core/sdio_io.c b/drivers/mmc/core/sdio_io.c
index 406e5f037e32..3734cba53dbb 100644
--- a/drivers/mmc/core/sdio_io.c
+++ b/drivers/mmc/core/sdio_io.c
@@ -390,6 +390,39 @@ u8 sdio_readb(struct sdio_func *func, unsigned int addr, int *err_ret)
EXPORT_SYMBOL_GPL(sdio_readb);
/**
+ * sdio_readb_ext - read a single byte from a SDIO function, supplying an
+ * input byte with the request
+ * @func: SDIO function to access
+ * @addr: address to read
+ * @err_ret: optional status value from transfer
+ * @in: value passed, truncated to 8 bits, as the input byte argument of
+ * the underlying mmc_io_rw_direct() call
+ *
+ * Reads a single byte from the address space of a given SDIO
+ * function. If there is a problem reading the address, 0xff
+ * is returned and @err_ret will contain the error code.
+ * When @err_ret is given it is set to 0 on success.
+ */
+unsigned char sdio_readb_ext(struct sdio_func *func, unsigned int addr,
+ int *err_ret, unsigned in)
+{
+ int ret;
+ unsigned char val;
+
+ BUG_ON(!func);
+
+ if (err_ret)
+ *err_ret = 0;
+
+ ret = mmc_io_rw_direct(func->card, 0, func->num, addr, (u8)in, &val);
+ if (ret) {
+ if (err_ret)
+ *err_ret = ret;
+ return 0xFF;
+ }
+
+ return val;
+}
+EXPORT_SYMBOL_GPL(sdio_readb_ext);
+
+/**
* sdio_writeb - write a single byte to a SDIO function
* @func: SDIO function to access
* @b: byte to write
diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig
index b254090b8a1b..50ee1bad3690 100644
--- a/drivers/mtd/nand/Kconfig
+++ b/drivers/mtd/nand/Kconfig
@@ -1,3 +1,10 @@
+config MTD_NAND_IDS
+ tristate "Include chip ids for known NAND devices."
+ depends on MTD
+ help
+ Useful for NAND drivers that do not use the NAND subsystem but
+ still like to take advantage of the known chip information.
+
config MTD_NAND_ECC
tristate
@@ -109,9 +116,6 @@ config MTD_NAND_OMAP_BCH
config MTD_NAND_OMAP_BCH_BUILD
def_tristate MTD_NAND_OMAP2 && MTD_NAND_OMAP_BCH
-config MTD_NAND_IDS
- tristate
-
config MTD_NAND_RICOH
tristate "Ricoh xD card reader"
default n
diff --git a/drivers/net/ppp/Kconfig b/drivers/net/ppp/Kconfig
index 1373c6d7278d..282aec4860eb 100644
--- a/drivers/net/ppp/Kconfig
+++ b/drivers/net/ppp/Kconfig
@@ -149,6 +149,23 @@ config PPPOL2TP
tunnels. L2TP is replacing PPTP for VPN uses.
if TTY
+config PPPOLAC
+ tristate "PPP on L2TP Access Concentrator"
+ depends on PPP && INET
+ help
+ L2TP (RFC 2661) is a tunneling protocol widely used in virtual private
+ networks. This driver handles L2TP data packets between a UDP socket
+ and a PPP channel, but only permits one session per socket. Thus it is
+ fairly simple and suited for clients.
+
+config PPPOPNS
+ tristate "PPP on PPTP Network Server"
+ depends on PPP && INET
+ help
+ PPTP (RFC 2637) is a tunneling protocol widely used in virtual private
+ networks. This driver handles PPTP data packets between a RAW socket
+ and a PPP channel. It is fairly simple and easy to use.
+
config PPP_ASYNC
tristate "PPP support for async serial ports"
depends on PPP
diff --git a/drivers/net/ppp/Makefile b/drivers/net/ppp/Makefile
index a6b6297b0066..d283d03c4683 100644
--- a/drivers/net/ppp/Makefile
+++ b/drivers/net/ppp/Makefile
@@ -11,3 +11,5 @@ obj-$(CONFIG_PPP_SYNC_TTY) += ppp_synctty.o
obj-$(CONFIG_PPPOE) += pppox.o pppoe.o
obj-$(CONFIG_PPPOL2TP) += pppox.o
obj-$(CONFIG_PPTP) += pppox.o pptp.o
+obj-$(CONFIG_PPPOLAC) += pppox.o pppolac.o
+obj-$(CONFIG_PPPOPNS) += pppox.o pppopns.o
diff --git a/drivers/net/ppp/pppolac.c b/drivers/net/ppp/pppolac.c
new file mode 100644
index 000000000000..3a45cf805288
--- /dev/null
+++ b/drivers/net/ppp/pppolac.c
@@ -0,0 +1,450 @@
+/* drivers/net/ppp/pppolac.c
+ *
+ * Driver for PPP on L2TP Access Concentrator / PPPoLAC Socket (RFC 2661)
+ *
+ * Copyright (C) 2009 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+/* This driver handles L2TP data packets between a UDP socket and a PPP channel.
+ * The socket must keep connected, and only one session per socket is permitted.
+ * Sequencing of outgoing packets is controlled by LNS. Incoming packets with
+ * sequences are reordered within a sliding window of one second. Currently
+ * reordering only happens when a packet is received. It is done for simplicity
+ * since no additional locks or threads are required. This driver only works on
+ * IPv4 due to the lack of UDP encapsulation support in IPv6. */
+
+#include <linux/module.h>
+#include <linux/jiffies.h>
+#include <linux/workqueue.h>
+#include <linux/skbuff.h>
+#include <linux/file.h>
+#include <linux/netdevice.h>
+#include <linux/net.h>
+#include <linux/udp.h>
+#include <linux/ppp_defs.h>
+#include <linux/if_ppp.h>
+#include <linux/if_pppox.h>
+#include <linux/ppp_channel.h>
+#include <net/tcp_states.h>
+#include <asm/uaccess.h>
+
+#define L2TP_CONTROL_BIT 0x80
+#define L2TP_LENGTH_BIT 0x40
+#define L2TP_SEQUENCE_BIT 0x08
+#define L2TP_OFFSET_BIT 0x02
+#define L2TP_VERSION 0x02
+#define L2TP_VERSION_MASK 0x0F
+
+#define PPP_ADDR 0xFF
+#define PPP_CTRL 0x03
+
+/* Packed wrapper so a 32-bit value can be accessed at any byte address. */
+union unaligned {
+ __u32 u32;
+} __attribute__((packed));
+
+/* View @ptr as a union unaligned so its u32 member can be read or written. */
+static inline union unaligned *unaligned(void *ptr)
+{
+ return (union unaligned *)ptr;
+}
+
+/* Per-packet bookkeeping kept in skb->cb while a packet waits for reordering. */
+struct meta {
+ __u32 sequence; /* sequence number taken from the L2TP header */
+ __u32 timestamp; /* jiffies value assigned when the packet was queued */
+};
+
+/* Return the struct meta stored in the control buffer (skb->cb) of @skb. */
+static inline struct meta *skb_meta(struct sk_buff *skb)
+{
+ return (struct meta *)skb->cb;
+}
+
+/******************************************************************************/
+
+/*
+ * Backlog receive handler installed on the UDP socket by pppolac_connect().
+ *
+ * Control packets are handed to the socket's saved backlog handler.  Data
+ * packets are validated (minimum size, version, optional length field,
+ * tunnel/session ids), stripped down to the PPP frame and passed to
+ * ppp_input().  Packets carrying a sequence number first go through the
+ * PPPoLAC socket's receive queue, which is used as the reordering window.
+ *
+ * Returns NET_RX_SUCCESS when the packet was consumed, NET_RX_DROP when it
+ * was freed, or the saved backlog handler's result for control packets.
+ */
+static int pppolac_recv_core(struct sock *sk_udp, struct sk_buff *skb)
+{
+ struct sock *sk = (struct sock *)sk_udp->sk_user_data;
+ struct pppolac_opt *opt = &pppox_sk(sk)->proto.lac;
+ struct meta *meta = skb_meta(skb);
+ __u32 now = jiffies;
+ __u8 bits;
+ __u8 *ptr;
+
+ /* Drop the packet if L2TP header is missing. */
+ if (skb->len < sizeof(struct udphdr) + 6)
+ goto drop;
+
+ /* Put it back if it is a control packet. */
+ if (skb->data[sizeof(struct udphdr)] & L2TP_CONTROL_BIT)
+ return opt->backlog_rcv(sk_udp, skb);
+
+ /* Skip UDP header. */
+ skb_pull(skb, sizeof(struct udphdr));
+
+ /* Check the version. */
+ if ((skb->data[1] & L2TP_VERSION_MASK) != L2TP_VERSION)
+ goto drop;
+ bits = skb->data[0];
+ ptr = &skb->data[2];
+
+ /* Check the length if it is present. */
+ if (bits & L2TP_LENGTH_BIT) {
+ if ((ptr[0] << 8 | ptr[1]) != skb->len)
+ goto drop;
+ ptr += 2;
+ }
+
+ /* Skip all fields including optional ones. */
+ if (!skb_pull(skb, 6 + (bits & L2TP_SEQUENCE_BIT ? 4 : 0) +
+ (bits & L2TP_LENGTH_BIT ? 2 : 0) +
+ (bits & L2TP_OFFSET_BIT ? 2 : 0)))
+ goto drop;
+
+ /* Skip the offset padding if it is present. */
+ if (bits & L2TP_OFFSET_BIT &&
+ !skb_pull(skb, skb->data[-2] << 8 | skb->data[-1]))
+ goto drop;
+
+ /* Check the tunnel and the session. */
+ if (unaligned(ptr)->u32 != opt->local)
+ goto drop;
+
+ /* Check the sequence if it is present. */
+ if (bits & L2TP_SEQUENCE_BIT) {
+ meta->sequence = ptr[4] << 8 | ptr[5];
+ /* signed 16-bit difference: negative means older than expected */
+ if ((__s16)(meta->sequence - opt->recv_sequence) < 0)
+ goto drop;
+ }
+
+ /* Skip PPP address and control if they are present. */
+ if (skb->len >= 2 && skb->data[0] == PPP_ADDR &&
+ skb->data[1] == PPP_CTRL)
+ skb_pull(skb, 2);
+
+ /* Fix PPP protocol if it is compressed. */
+ if (skb->len >= 1 && skb->data[0] & 1)
+ skb_push(skb, 1)[0] = 0;
+
+ /* Drop the packet if PPP protocol is missing. */
+ if (skb->len < 2)
+ goto drop;
+
+ /* Perform reordering if sequencing is enabled. */
+ atomic_set(&opt->sequencing, bits & L2TP_SEQUENCE_BIT);
+ if (bits & L2TP_SEQUENCE_BIT) {
+ struct sk_buff *skb1;
+
+ /* Insert the packet into receive queue in order. */
+ skb_set_owner_r(skb, sk);
+ skb_queue_walk(&sk->sk_receive_queue, skb1) {
+ struct meta *meta1 = skb_meta(skb1);
+ __s16 order = meta->sequence - meta1->sequence;
+ if (order == 0)
+ goto drop;
+ if (order < 0) {
+ meta->timestamp = meta1->timestamp;
+ skb_insert(skb1, skb, &sk->sk_receive_queue);
+ skb = NULL;
+ break;
+ }
+ }
+ if (skb) {
+ meta->timestamp = now;
+ skb_queue_tail(&sk->sk_receive_queue, skb);
+ }
+
+ /* Remove packets from receive queue as long as
+ * 1. the receive buffer is full,
+ * 2. they are queued longer than one second, or
+ * 3. there are no missing packets before them. */
+ skb_queue_walk_safe(&sk->sk_receive_queue, skb, skb1) {
+ meta = skb_meta(skb);
+ if (atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf &&
+ now - meta->timestamp < HZ &&
+ meta->sequence != opt->recv_sequence)
+ break;
+ skb_unlink(skb, &sk->sk_receive_queue);
+ opt->recv_sequence = (__u16)(meta->sequence + 1);
+ skb_orphan(skb);
+ ppp_input(&pppox_sk(sk)->chan, skb);
+ }
+ return NET_RX_SUCCESS;
+ }
+
+ /* Flush receive queue if sequencing is disabled. */
+ skb_queue_purge(&sk->sk_receive_queue);
+ skb_orphan(skb);
+ ppp_input(&pppox_sk(sk)->chan, skb);
+ return NET_RX_SUCCESS;
+drop:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+/*
+ * encap_rcv hook installed on the UDP socket by pppolac_connect().
+ *
+ * Takes a reference on @sk_udp and passes the packet to sk_receive_skb();
+ * always returns 0.
+ */
+static int pppolac_recv(struct sock *sk_udp, struct sk_buff *skb)
+{
+ sock_hold(sk_udp);
+ sk_receive_skb(sk_udp, skb, 0);
+ return 0;
+}
+
+static struct sk_buff_head delivery_queue;
+
+/*
+ * Work handler for delivery_work: drains delivery_queue and sends each
+ * packet through the sendmsg operation of the socket recorded in skb->sk.
+ *
+ * The address limit is switched to KERNEL_DS around the loop and restored
+ * afterwards.  The sendmsg result is not checked; every skb is freed after
+ * the attempt whether or not it was sent.
+ */
+static void pppolac_xmit_core(struct work_struct *delivery_work)
+{
+ mm_segment_t old_fs = get_fs();
+ struct sk_buff *skb;
+
+ set_fs(KERNEL_DS);
+ while ((skb = skb_dequeue(&delivery_queue))) {
+ struct sock *sk_udp = skb->sk;
+ struct kvec iov = {.iov_base = skb->data, .iov_len = skb->len};
+ struct msghdr msg = {
+ .msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT,
+ };
+
+ iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, &iov, 1,
+ skb->len);
+ sk_udp->sk_prot->sendmsg(sk_udp, &msg, skb->len);
+ kfree_skb(skb);
+ }
+ set_fs(old_fs);
+}
+
+static DECLARE_WORK(delivery_work, pppolac_xmit_core);
+
+/*
+ * start_xmit operation of the PPP channel.
+ *
+ * Prepends the PPP address/control bytes and an L2TP data header (10 bytes
+ * with a sequence number when opt->sequencing is set, 6 bytes otherwise),
+ * then queues the packet on delivery_queue and schedules delivery_work to
+ * send it.  Always returns 1.
+ */
+static int pppolac_xmit(struct ppp_channel *chan, struct sk_buff *skb)
+{
+ struct sock *sk_udp = (struct sock *)chan->private;
+ struct pppolac_opt *opt = &pppox_sk(sk_udp->sk_user_data)->proto.lac;
+
+ /* Install PPP address and control. */
+ skb_push(skb, 2);
+ skb->data[0] = PPP_ADDR;
+ skb->data[1] = PPP_CTRL;
+
+ /* Install L2TP header. */
+ if (atomic_read(&opt->sequencing)) {
+ skb_push(skb, 10);
+ skb->data[0] = L2TP_SEQUENCE_BIT;
+ skb->data[6] = opt->xmit_sequence >> 8;
+ skb->data[7] = opt->xmit_sequence;
+ skb->data[8] = 0;
+ skb->data[9] = 0;
+ opt->xmit_sequence++;
+ } else {
+ skb_push(skb, 6);
+ skb->data[0] = 0;
+ }
+ skb->data[1] = L2TP_VERSION;
+ unaligned(&skb->data[2])->u32 = opt->remote;
+
+ /* Now send the packet via the delivery queue. */
+ skb_set_owner_w(skb, sk_udp);
+ skb_queue_tail(&delivery_queue, skb);
+ schedule_work(&delivery_work);
+ return 1;
+}
+
+/******************************************************************************/
+
+static struct ppp_channel_ops pppolac_channel_ops = {
+ .start_xmit = pppolac_xmit,
+};
+
+/*
+ * Attach the PPPoLAC socket to a UDP socket and register the PPP channel.
+ *
+ * @useraddr must be a struct sockaddr_pppolac of exactly that size with
+ * non-zero local and remote tunnel and session ids; addr->udp_socket is the
+ * file descriptor of the UDP socket to use.  That socket must be AF_INET,
+ * IPPROTO_UDP, in TCP_ESTABLISHED state and not already carrying an
+ * encapsulation type or user data.
+ *
+ * On success the UDP socket's encap_rcv and backlog receive hooks point at
+ * this driver and the reference taken by sockfd_lookup() is kept.  On any
+ * failure after the lookup that reference is dropped again.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int pppolac_connect(struct socket *sock, struct sockaddr *useraddr,
+ int addrlen, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct pppox_sock *po = pppox_sk(sk);
+ struct sockaddr_pppolac *addr = (struct sockaddr_pppolac *)useraddr;
+ struct socket *sock_udp = NULL;
+ struct sock *sk_udp;
+ int error;
+
+ if (addrlen != sizeof(struct sockaddr_pppolac) ||
+ !addr->local.tunnel || !addr->local.session ||
+ !addr->remote.tunnel || !addr->remote.session) {
+ return -EINVAL;
+ }
+
+ lock_sock(sk);
+ error = -EALREADY;
+ if (sk->sk_state != PPPOX_NONE)
+ goto out;
+
+ sock_udp = sockfd_lookup(addr->udp_socket, &error);
+ if (!sock_udp)
+ goto out;
+ sk_udp = sock_udp->sk;
+ lock_sock(sk_udp);
+
+ /* Remove this check when IPv6 supports UDP encapsulation. */
+ error = -EAFNOSUPPORT;
+ if (sk_udp->sk_family != AF_INET)
+ goto out;
+ error = -EPROTONOSUPPORT;
+ if (sk_udp->sk_protocol != IPPROTO_UDP)
+ goto out;
+ error = -EDESTADDRREQ;
+ if (sk_udp->sk_state != TCP_ESTABLISHED)
+ goto out;
+ error = -EBUSY;
+ if (udp_sk(sk_udp)->encap_type || sk_udp->sk_user_data)
+ goto out;
+ /* Pin the socket to the device of its current route if it is unbound. */
+ if (!sk_udp->sk_bound_dev_if) {
+ struct dst_entry *dst = sk_dst_get(sk_udp);
+ error = -ENODEV;
+ if (!dst)
+ goto out;
+ sk_udp->sk_bound_dev_if = dst->dev->ifindex;
+ dst_release(dst);
+ }
+
+ po->chan.hdrlen = 12; /* 2 (PPP addr/ctrl) + 10 (largest L2TP header pushed by pppolac_xmit) */
+ po->chan.private = sk_udp;
+ po->chan.ops = &pppolac_channel_ops;
+ po->chan.mtu = PPP_MRU - 80;
+ po->proto.lac.local = unaligned(&addr->local)->u32;
+ po->proto.lac.remote = unaligned(&addr->remote)->u32;
+ atomic_set(&po->proto.lac.sequencing, 1);
+ /* Saved so control packets can be passed on and release can restore it. */
+ po->proto.lac.backlog_rcv = sk_udp->sk_backlog_rcv;
+
+ error = ppp_register_channel(&po->chan);
+ if (error)
+ goto out;
+
+ sk->sk_state = PPPOX_CONNECTED;
+ udp_sk(sk_udp)->encap_type = UDP_ENCAP_L2TPINUDP;
+ udp_sk(sk_udp)->encap_rcv = pppolac_recv;
+ sk_udp->sk_backlog_rcv = pppolac_recv_core;
+ sk_udp->sk_user_data = sk;
+out:
+ if (sock_udp) {
+ release_sock(sk_udp);
+ if (error)
+ sockfd_put(sock_udp);
+ }
+ release_sock(sk);
+ return error;
+}
+
+/*
+ * Release a PPPoLAC socket.
+ *
+ * If the socket was connected, its receive queue is purged, the PPP channel
+ * is unbound, the UDP socket's encapsulation settings, backlog handler and
+ * user data are restored, and the file reference on the UDP socket is
+ * dropped.  The socket is then orphaned and its reference put.
+ *
+ * Returns 0, or -EBADF if the socket is already marked SOCK_DEAD.
+ */
+static int pppolac_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+
+ if (!sk)
+ return 0;
+
+ lock_sock(sk);
+ if (sock_flag(sk, SOCK_DEAD)) {
+ release_sock(sk);
+ return -EBADF;
+ }
+
+ if (sk->sk_state != PPPOX_NONE) {
+ struct sock *sk_udp = (struct sock *)pppox_sk(sk)->chan.private;
+ lock_sock(sk_udp);
+ skb_queue_purge(&sk->sk_receive_queue);
+ pppox_unbind_sock(sk);
+ udp_sk(sk_udp)->encap_type = 0;
+ udp_sk(sk_udp)->encap_rcv = NULL;
+ sk_udp->sk_backlog_rcv = pppox_sk(sk)->proto.lac.backlog_rcv;
+ sk_udp->sk_user_data = NULL;
+ release_sock(sk_udp);
+ sockfd_put(sk_udp->sk_socket);
+ }
+
+ sock_orphan(sk);
+ sock->sk = NULL;
+ release_sock(sk);
+ sock_put(sk);
+ return 0;
+}
+
+/******************************************************************************/
+
+static struct proto pppolac_proto = {
+ .name = "PPPOLAC",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct pppox_sock),
+};
+
+static struct proto_ops pppolac_proto_ops = {
+ .family = PF_PPPOX,
+ .owner = THIS_MODULE,
+ .release = pppolac_release,
+ .bind = sock_no_bind,
+ .connect = pppolac_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = sock_no_getname,
+ .poll = sock_no_poll,
+ .ioctl = pppox_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = sock_no_setsockopt,
+ .getsockopt = sock_no_getsockopt,
+ .sendmsg = sock_no_sendmsg,
+ .recvmsg = sock_no_recvmsg,
+ .mmap = sock_no_mmap,
+};
+
+/*
+ * Socket creation callback for PX_PROTO_OLAC: allocates the sock, attaches
+ * it to @sock and leaves it unconnected (PPPOX_NONE).
+ *
+ * Returns 0 on success or -ENOMEM if the sock cannot be allocated.
+ */
+static int pppolac_create(struct net *net, struct socket *sock, int kern)
+{
+	struct sock *sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL,
+				   &pppolac_proto, kern);
+
+	if (!sk)
+		return -ENOMEM;
+
+	sock_init_data(sock, sk);
+
+	sk->sk_protocol = PX_PROTO_OLAC;
+	sk->sk_state = PPPOX_NONE;
+
+	sock->ops = &pppolac_proto_ops;
+	sock->state = SS_UNCONNECTED;
+
+	return 0;
+}
+
+/******************************************************************************/
+
+static struct pppox_proto pppolac_pppox_proto = {
+ .create = pppolac_create,
+ .owner = THIS_MODULE,
+};
+
+/*
+ * Module init: registers the PPPOLAC proto and the PX_PROTO_OLAC pppox
+ * protocol.  Returns 0 on success or the error of the failing registration.
+ */
+static int __init pppolac_init(void)
+{
+	int error;
+
+	/*
+	 * Initialise the delivery queue before the protocol is registered:
+	 * once register_pppox_proto() has succeeded, sockets can be created
+	 * and pppolac_xmit() may queue packets on it.
+	 */
+	skb_queue_head_init(&delivery_queue);
+
+	error = proto_register(&pppolac_proto, 0);
+	if (error)
+		return error;
+
+	error = register_pppox_proto(PX_PROTO_OLAC, &pppolac_pppox_proto);
+	if (error)
+		proto_unregister(&pppolac_proto);
+	return error;
+}
+
+/*
+ * Module exit: unregisters the pppox protocol, waits for the delivery work
+ * item, frees anything still queued and unregisters the proto.
+ */
+static void __exit pppolac_exit(void)
+{
+	unregister_pppox_proto(PX_PROTO_OLAC);
+
+	/*
+	 * pppolac_xmit_core() lives in this module, so it must not be pending
+	 * or running once the module is gone.  The purge is defensive: the
+	 * work handler normally leaves the queue empty.
+	 */
+	flush_work(&delivery_work);
+	skb_queue_purge(&delivery_queue);
+
+	proto_unregister(&pppolac_proto);
+}
+
+module_init(pppolac_init);
+module_exit(pppolac_exit);
+
+MODULE_DESCRIPTION("PPP on L2TP Access Concentrator (PPPoLAC)");
+MODULE_AUTHOR("Chia-chi Yeh <chiachi@android.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/net/ppp/pppopns.c b/drivers/net/ppp/pppopns.c
new file mode 100644
index 000000000000..cdb4fa1af734
--- /dev/null
+++ b/drivers/net/ppp/pppopns.c
@@ -0,0 +1,429 @@
+/* drivers/net/ppp/pppopns.c
+ *
+ * Driver for PPP on PPTP Network Server / PPPoPNS Socket (RFC 2637)
+ *
+ * Copyright (C) 2009 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+/* This driver handles PPTP data packets between a RAW socket and a PPP channel.
+ * The socket is created in the kernel space and connected to the same address
+ * of the control socket. Outgoing packets are always sent with sequences but
+ * without acknowledgements. Incoming packets with sequences are reordered
+ * within a sliding window of one second. Currently reordering only happens when
+ * a packet is received. It is done for simplicity since no additional locks or
+ * threads are required. This driver should work on both IPv4 and IPv6. */
+
+#include <linux/module.h>
+#include <linux/jiffies.h>
+#include <linux/workqueue.h>
+#include <linux/skbuff.h>
+#include <linux/file.h>
+#include <linux/netdevice.h>
+#include <linux/net.h>
+#include <linux/ppp_defs.h>
+#include <linux/if.h>
+#include <linux/if_ppp.h>
+#include <linux/if_pppox.h>
+#include <linux/ppp_channel.h>
+#include <asm/uaccess.h>
+
+#define GRE_HEADER_SIZE 8 /* fixed part of struct header: bits, type, length, call */
+
+#define PPTP_GRE_BITS htons(0x2001) /* required flags/version after masking: key present, version 1 (RFC 2637) */
+#define PPTP_GRE_BITS_MASK htons(0xEF7F) /* every bit except the SEQ (0x1000) and ACK (0x0080) flags */
+#define PPTP_GRE_SEQ_BIT htons(0x1000) /* set when the 4-byte sequence number field is present */
+#define PPTP_GRE_ACK_BIT htons(0x0080) /* set when the 4-byte acknowledgement field is present */
+#define PPTP_GRE_TYPE htons(0x880B) /* value header.type must carry */
+
+#define PPP_ADDR 0xFF /* PPP address byte, stripped on receive and installed on transmit */
+#define PPP_CTRL 0x03 /* PPP control byte, handled together with PPP_ADDR */
+
+struct header { /* PPTP GRE header as read from and written to the wire by this driver */
+ __u16 bits; /* flags and version, network byte order (see PPTP_GRE_* masks) */
+ __u16 type; /* protocol type, must equal PPTP_GRE_TYPE */
+ __u16 length; /* length of the payload following the header, network byte order */
+ __u16 call; /* call ID; compared and stored without byte swapping */
+ __u32 sequence; /* sequence number, network byte order; only read when the SEQ bit is set */
+} __attribute__((packed));
+
+struct meta { /* per-packet bookkeeping overlaid on skb->cb, see skb_meta() */
+ __u32 sequence; /* GRE sequence number converted to host byte order */
+ __u32 timestamp; /* jiffies value checked against the one-second (HZ) reorder limit */
+};
+
+/*
+ * Return the driver-private metadata kept in the control buffer of @skb.
+ *
+ * struct meta is overlaid directly on skb->cb, so the build is refused if
+ * the structure ever grows beyond the space the control buffer provides.
+ */
+static inline struct meta *skb_meta(struct sk_buff *skb)
+{
+	BUILD_BUG_ON(sizeof(struct meta) > sizeof(skb->cb));
+	return (struct meta *)skb->cb;
+}
+
+/******************************************************************************/
+/*
+ * Backlog receive handler installed on the raw GRE socket by
+ * pppopns_connect(). Validates the PPTP GRE header of @skb, normalises the
+ * PPP framing and hands the packet to the PPP channel; packets carrying a
+ * sequence number are reordered through the receive queue of the PPPoPNS
+ * socket first. @skb is consumed on every path.
+ *
+ * Returns NET_RX_SUCCESS when the packet was queued or delivered and
+ * NET_RX_DROP when it was discarded.
+ */
+static int pppopns_recv_core(struct sock *sk_raw, struct sk_buff *skb)
+{
+ struct sock *sk = (struct sock *)sk_raw->sk_user_data; /* PPPoPNS socket set by pppopns_connect() */
+ struct pppopns_opt *opt = &pppox_sk(sk)->proto.pns;
+ struct meta *meta = skb_meta(skb);
+ __u32 now = jiffies;
+ struct header *hdr;
+
+ /* Advance skb->data to the transport header, which is where the GRE
+  * header is expected to start. */
+ skb_pull(skb, skb_transport_header(skb) - skb->data);
+
+ /* Drop the packet if GRE header is missing.
+  * NOTE(review): the header is read straight from skb->data; presumably
+  * it always lies in the linear area of the skb - confirm. */
+ if (skb->len < GRE_HEADER_SIZE)
+ goto drop;
+ hdr = (struct header *)skb->data;
+
+ /* Check the header: GRE type, our call ID, and the flag/version bits
+  * with the SEQ and ACK flags masked out. */
+ if (hdr->type != PPTP_GRE_TYPE || hdr->call != opt->local ||
+ (hdr->bits & PPTP_GRE_BITS_MASK) != PPTP_GRE_BITS)
+ goto drop;
+
+ /* Skip all fields including optional ones. hdr is still used below;
+  * skb_pull() fails (returns NULL) when the packet is too short. */
+ if (!skb_pull(skb, GRE_HEADER_SIZE +
+ (hdr->bits & PPTP_GRE_SEQ_BIT ? 4 : 0) +
+ (hdr->bits & PPTP_GRE_ACK_BIT ? 4 : 0)))
+ goto drop;
+
+ /* Check the length: what remains must match the advertised payload. */
+ if (skb->len != ntohs(hdr->length))
+ goto drop;
+
+ /* Check the sequence if it is present. The signed difference keeps the
+  * comparison valid across 32-bit wraparound; anything older than the
+  * next expected sequence is dropped. */
+ if (hdr->bits & PPTP_GRE_SEQ_BIT) {
+ meta->sequence = ntohl(hdr->sequence);
+ if ((__s32)(meta->sequence - opt->recv_sequence) < 0)
+ goto drop;
+ }
+
+ /* Skip PPP address and control if they are present. */
+ if (skb->len >= 2 && skb->data[0] == PPP_ADDR &&
+ skb->data[1] == PPP_CTRL)
+ skb_pull(skb, 2);
+
+ /* Fix PPP protocol if it is compressed: an odd first byte is treated
+  * as a one-byte protocol field and a zero byte is put in front of it. */
+ if (skb->len >= 1 && skb->data[0] & 1)
+ skb_push(skb, 1)[0] = 0;
+
+ /* Drop the packet if PPP protocol is missing. */
+ if (skb->len < 2)
+ goto drop;
+
+ /* Perform reordering if sequencing is enabled. */
+ if (hdr->bits & PPTP_GRE_SEQ_BIT) {
+ struct sk_buff *skb1;
+
+ /* Insert the packet into receive queue in order. */
+ skb_set_owner_r(skb, sk);
+ skb_queue_walk(&sk->sk_receive_queue, skb1) {
+ struct meta *meta1 = skb_meta(skb1);
+ __s32 order = meta->sequence - meta1->sequence;
+ if (order == 0) /* duplicate of a queued packet */
+ goto drop;
+ if (order < 0) {
+ meta->timestamp = meta1->timestamp; /* inherit the age of the packet it is placed before */
+ skb_insert(skb1, skb, &sk->sk_receive_queue);
+ skb = NULL; /* marks the packet as already queued */
+ break;
+ }
+ }
+ if (skb) { /* newest sequence so far: append */
+ meta->timestamp = now;
+ skb_queue_tail(&sk->sk_receive_queue, skb);
+ }
+
+ /* Remove packets from receive queue as long as
+ * 1. the receive buffer is full,
+ * 2. they are queued longer than one second, or
+ * 3. there are no missing packets before them. */
+ skb_queue_walk_safe(&sk->sk_receive_queue, skb, skb1) {
+ meta = skb_meta(skb);
+ if (atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf &&
+ now - meta->timestamp < HZ &&
+ meta->sequence != opt->recv_sequence)
+ break;
+ skb_unlink(skb, &sk->sk_receive_queue);
+ opt->recv_sequence = meta->sequence + 1; /* next sequence expected */
+ skb_orphan(skb); /* detach from sk before handing the packet to PPP */
+ ppp_input(&pppox_sk(sk)->chan, skb);
+ }
+ return NET_RX_SUCCESS;
+ }
+
+ /* Flush receive queue if sequencing is disabled. */
+ skb_queue_purge(&sk->sk_receive_queue);
+ skb_orphan(skb);
+ ppp_input(&pppox_sk(sk)->chan, skb);
+ return NET_RX_SUCCESS;
+drop:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+/*
+ * Data-ready callback installed on the raw GRE socket: empty its receive
+ * queue, passing every packet to sk_receive_skb() on the same socket.
+ */
+static void pppopns_recv(struct sock *sk_raw)
+{
+	struct sk_buff *pkt;
+
+	for (;;) {
+		pkt = skb_dequeue(&sk_raw->sk_receive_queue);
+		if (!pkt)
+			break;
+
+		/* Take a reference for sk_receive_skb(), which releases one. */
+		sock_hold(sk_raw);
+		sk_receive_skb(sk_raw, pkt, 0);
+	}
+}
+
+static struct sk_buff_head delivery_queue; /* packets queued by pppopns_xmit() for the worker below */
+/*
+ * Work handler: take every packet waiting on delivery_queue, send its data
+ * through the sendmsg operation of the socket recorded in skb->sk, and free
+ * it. The result of sendmsg is not checked.
+ */
+static void pppopns_xmit_core(struct work_struct *delivery_work)
+{
+ mm_segment_t old_fs = get_fs();
+ struct sk_buff *skb;
+
+ set_fs(KERNEL_DS); /* the buffer handed to sendmsg below is kernel memory */
+ while ((skb = skb_dequeue(&delivery_queue))) {
+ struct sock *sk_raw = skb->sk; /* owner set by pppopns_xmit() */
+ struct kvec iov = {.iov_base = skb->data, .iov_len = skb->len};
+ struct msghdr msg = {
+ .msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT,
+ };
+
+ iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, &iov, 1,
+ skb->len);
+ sk_raw->sk_prot->sendmsg(sk_raw, &msg, skb->len);
+ kfree_skb(skb);
+ }
+ set_fs(old_fs); /* restore the address limit saved on entry */
+}
+
+static DECLARE_WORK(delivery_work, pppopns_xmit_core); /* scheduled by pppopns_xmit() */
+
+/*
+ * ppp_channel start_xmit hook: prepend the PPP address/control bytes and a
+ * PPTP GRE header (always with a sequence number, never with an
+ * acknowledgement) to @skb and queue it for the delivery worker.
+ *
+ * The headers are written into the skb head, so the head is first made
+ * private and large enough; if that fails the frame is dropped instead of
+ * tripping skb_push().
+ *
+ * Always returns 1: the frame has been consumed, either queued or dropped.
+ */
+static int pppopns_xmit(struct ppp_channel *chan, struct sk_buff *skb)
+{
+	struct sock *sk_raw = (struct sock *)chan->private;
+	struct pppopns_opt *opt = &pppox_sk(sk_raw->sk_user_data)->proto.pns;
+	struct header *hdr;
+	__u16 length;
+
+	/* Room for PPP address/control (2) plus the GRE header. */
+	if (skb_cow_head(skb, 2 + sizeof(struct header))) {
+		kfree_skb(skb);
+		return 1;
+	}
+
+	/* Install PPP address and control. */
+	skb_push(skb, 2);
+	skb->data[0] = PPP_ADDR;
+	skb->data[1] = PPP_CTRL;
+	length = skb->len;
+
+	/* Install PPTP GRE header. */
+	hdr = (struct header *)skb_push(skb, sizeof(struct header));
+	hdr->bits = PPTP_GRE_BITS | PPTP_GRE_SEQ_BIT;
+	hdr->type = PPTP_GRE_TYPE;
+	hdr->length = htons(length);
+	hdr->call = opt->remote;
+	hdr->sequence = htonl(opt->xmit_sequence);
+	opt->xmit_sequence++;
+
+	/* Now send the packet via the delivery queue. */
+	skb_set_owner_w(skb, sk_raw);
+	skb_queue_tail(&delivery_queue, skb);
+	schedule_work(&delivery_work);
+	return 1;
+}
+
+/******************************************************************************/
+
+/*
+ * PPP channel operations: only the transmit hook is provided. The table is
+ * never modified, hence const.
+ */
+static const struct ppp_channel_ops pppopns_channel_ops = {
+	.start_xmit = pppopns_xmit,
+};
+/*
+ * connect() handler for PPPoPNS sockets.
+ *
+ * @useraddr is a struct sockaddr_pppopns naming the TCP control socket (by
+ * file descriptor) and the local/remote call IDs. A raw GRE socket is
+ * created, connected to the peer address of the control socket, registered
+ * as a PPP channel and has its data_ready/backlog_rcv callbacks replaced by
+ * the handlers of this driver.
+ *
+ * Returns 0 on success; -EINVAL for a wrong address length, -EALREADY if
+ * the socket is not in PPPOX_NONE state, -EPROTONOSUPPORT if the control
+ * socket is not TCP, -ENODEV if it has no route, or the error of the
+ * failing helper call.
+ */
+static int pppopns_connect(struct socket *sock, struct sockaddr *useraddr,
+ int addrlen, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct pppox_sock *po = pppox_sk(sk);
+ struct sockaddr_pppopns *addr = (struct sockaddr_pppopns *)useraddr;
+ struct sockaddr_storage ss;
+ struct socket *sock_tcp = NULL;
+ struct socket *sock_raw = NULL;
+ struct sock *sk_tcp;
+ struct sock *sk_raw;
+ int error;
+
+ if (addrlen != sizeof(struct sockaddr_pppopns))
+ return -EINVAL;
+
+ lock_sock(sk);
+ error = -EALREADY;
+ if (sk->sk_state != PPPOX_NONE) /* connect was already done on this socket */
+ goto out;
+
+ sock_tcp = sockfd_lookup(addr->tcp_socket, &error); /* control socket named by the caller */
+ if (!sock_tcp)
+ goto out;
+ sk_tcp = sock_tcp->sk;
+ error = -EPROTONOSUPPORT;
+ if (sk_tcp->sk_protocol != IPPROTO_TCP)
+ goto out;
+ addrlen = sizeof(struct sockaddr_storage);
+ error = kernel_getpeername(sock_tcp, (struct sockaddr *)&ss, &addrlen); /* peer address reused for the raw socket */
+ if (error)
+ goto out;
+ if (!sk_tcp->sk_bound_dev_if) { /* not bound yet: bind to the device of its current route */
+ struct dst_entry *dst = sk_dst_get(sk_tcp);
+ error = -ENODEV;
+ if (!dst)
+ goto out;
+ sk_tcp->sk_bound_dev_if = dst->dev->ifindex;
+ dst_release(dst);
+ }
+
+ error = sock_create(ss.ss_family, SOCK_RAW, IPPROTO_GRE, &sock_raw);
+ if (error)
+ goto out;
+ sk_raw = sock_raw->sk;
+ sk_raw->sk_bound_dev_if = sk_tcp->sk_bound_dev_if; /* same device as the control socket */
+ error = kernel_connect(sock_raw, (struct sockaddr *)&ss, addrlen, 0);
+ if (error)
+ goto out;
+
+ po->chan.hdrlen = 14; /* 12-byte GRE header plus 2 bytes of PPP address/control pushed by pppopns_xmit() */
+ po->chan.private = sk_raw;
+ po->chan.ops = &pppopns_channel_ops;
+ po->chan.mtu = PPP_MRU - 80;
+ po->proto.pns.local = addr->local;
+ po->proto.pns.remote = addr->remote;
+ po->proto.pns.data_ready = sk_raw->sk_data_ready; /* originals saved for pppopns_release() */
+ po->proto.pns.backlog_rcv = sk_raw->sk_backlog_rcv;
+
+ error = ppp_register_channel(&po->chan);
+ if (error)
+ goto out;
+
+ sk->sk_state = PPPOX_CONNECTED;
+ lock_sock(sk_raw);
+ sk_raw->sk_data_ready = pppopns_recv; /* divert incoming GRE packets to this driver */
+ sk_raw->sk_backlog_rcv = pppopns_recv_core;
+ sk_raw->sk_user_data = sk; /* lets the receive path find the PPPoPNS socket */
+ release_sock(sk_raw);
+out:
+ if (sock_tcp)
+ sockfd_put(sock_tcp);
+ if (error && sock_raw) /* on success the raw socket stays alive until pppopns_release() */
+ sock_release(sock_raw);
+ release_sock(sk);
+ return error;
+}
+/*
+ * release() handler for PPPoPNS sockets. For a connected socket the queued
+ * packets are purged, pppox_unbind_sock() is called, the raw GRE socket gets
+ * back the callbacks saved by pppopns_connect() and is released. The
+ * PPPoPNS socket is then orphaned and its reference dropped.
+ *
+ * Returns 0, or -EBADF if the socket is already flagged SOCK_DEAD.
+ */
+static int pppopns_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+
+ if (!sk)
+ return 0;
+
+ lock_sock(sk);
+ if (sock_flag(sk, SOCK_DEAD)) {
+ release_sock(sk);
+ return -EBADF;
+ }
+
+ if (sk->sk_state != PPPOX_NONE) { /* connected: undo pppopns_connect() */
+ struct sock *sk_raw = (struct sock *)pppox_sk(sk)->chan.private;
+ lock_sock(sk_raw);
+ skb_queue_purge(&sk->sk_receive_queue); /* packets still waiting for reordering */
+ pppox_unbind_sock(sk);
+ sk_raw->sk_data_ready = pppox_sk(sk)->proto.pns.data_ready; /* restore the saved callbacks */
+ sk_raw->sk_backlog_rcv = pppox_sk(sk)->proto.pns.backlog_rcv;
+ sk_raw->sk_user_data = NULL;
+ release_sock(sk_raw);
+ sock_release(sk_raw->sk_socket);
+ }
+
+ sock_orphan(sk);
+ sock->sk = NULL;
+ release_sock(sk);
+ sock_put(sk);
+ return 0;
+}
+
+/******************************************************************************/
+
+static struct proto pppopns_proto = { /* protocol descriptor used by proto_register() and sk_alloc() */
+ .name = "PPPOPNS",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct pppox_sock), /* each sock is allocated as a struct pppox_sock */
+};
+
+/*
+ * Socket-layer operations for PPPoPNS sockets. Only release, connect and
+ * ioctl are implemented here; every other operation is routed to the
+ * generic sock_no_*() stub. The table is never written after build time,
+ * so it is declared const.
+ */
+static const struct proto_ops pppopns_proto_ops = {
+	.family = PF_PPPOX,
+	.owner = THIS_MODULE,
+	.release = pppopns_release,
+	.bind = sock_no_bind,
+	.connect = pppopns_connect,
+	.socketpair = sock_no_socketpair,
+	.accept = sock_no_accept,
+	.getname = sock_no_getname,
+	.poll = sock_no_poll,
+	.ioctl = pppox_ioctl,
+	.listen = sock_no_listen,
+	.shutdown = sock_no_shutdown,
+	.setsockopt = sock_no_setsockopt,
+	.getsockopt = sock_no_getsockopt,
+	.sendmsg = sock_no_sendmsg,
+	.recvmsg = sock_no_recvmsg,
+	.mmap = sock_no_mmap,
+};
+
+/*
+ * Create hook for PX_PROTO_OPNS: allocate the sock, attach it to @sock and
+ * put both in their initial, unconnected state.
+ * Returns 0 on success or -ENOMEM if the sock cannot be allocated.
+ */
+static int pppopns_create(struct net *net, struct socket *sock, int kern)
+{
+	struct sock *new_sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL,
+				       &pppopns_proto, kern);
+
+	if (new_sk == NULL)
+		return -ENOMEM;
+
+	sock_init_data(sock, new_sk);
+
+	/* Protocol-level state of the new sock. */
+	new_sk->sk_state = PPPOX_NONE;
+	new_sk->sk_protocol = PX_PROTO_OPNS;
+
+	/* Socket-level state and operations. */
+	sock->ops = &pppopns_proto_ops;
+	sock->state = SS_UNCONNECTED;
+
+	return 0;
+}
+
+/******************************************************************************/
+
+/*
+ * PPPoX protocol descriptor handed to register_pppox_proto(); it only
+ * supplies the socket create hook and the owning module, and is never
+ * modified, hence const.
+ */
+static const struct pppox_proto pppopns_pppox_proto = {
+	.create = pppopns_create,
+	.owner = THIS_MODULE,
+};
+
+/*
+ * Module entry point: register the PPPOPNS protocol and hook it into the
+ * PPPoX family.
+ *
+ * The delivery queue is initialised before anything is registered so that
+ * it is already valid by the time register_pppox_proto() makes socket
+ * creation, and therefore pppopns_xmit(), possible.
+ *
+ * Returns 0 on success or the error reported by the failing registration.
+ */
+static int __init pppopns_init(void)
+{
+	int error;
+
+	skb_queue_head_init(&delivery_queue);
+
+	error = proto_register(&pppopns_proto, 0);
+	if (error)
+		return error;
+
+	error = register_pppox_proto(PX_PROTO_OPNS, &pppopns_pppox_proto);
+	if (error)
+		proto_unregister(&pppopns_proto);
+	return error;
+}
+
+/*
+ * Module exit: undo pppopns_init() in reverse order, then wait for a
+ * delivery work item that may still be pending or running so that
+ * pppopns_xmit_core() cannot execute after the module is gone.
+ */
+static void __exit pppopns_exit(void)
+{
+	unregister_pppox_proto(PX_PROTO_OPNS);
+	proto_unregister(&pppopns_proto);
+	flush_work(&delivery_work);
+}
+
+module_init(pppopns_init);
+module_exit(pppopns_exit);
+
+MODULE_DESCRIPTION("PPP on PPTP Network Server (PPPoPNS)");
+MODULE_AUTHOR("Chia-chi Yeh <chiachi@android.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 0260bc15bc0c..e92e86c7748e 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2021,6 +2021,12 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
int le;
int ret;
+#ifdef CONFIG_ANDROID_PARANOID_NETWORK
+ if (cmd != TUNGETIFF && !capable(CAP_NET_ADMIN)) {
+ return -EPERM;
+ }
+#endif
+
if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || _IOC_TYPE(cmd) == 0x89) {
if (copy_from_user(&ifr, argp, ifreq_len))
return -EFAULT;
diff --git a/drivers/net/wireless/ti/wlcore/init.c b/drivers/net/wireless/ti/wlcore/init.c
index d0b7734030ef..b7974b4dbb34 100644
--- a/drivers/net/wireless/ti/wlcore/init.c
+++ b/drivers/net/wireless/ti/wlcore/init.c
@@ -549,6 +549,11 @@ static int wl12xx_init_ap_role(struct wl1271 *wl, struct wl12xx_vif *wlvif)
{
int ret;
+ /* Disable filtering */
+ ret = wl1271_acx_group_address_tbl(wl, wlvif, false, NULL, 0);
+ if (ret < 0)
+ return ret;
+
ret = wl1271_acx_ap_max_tx_retry(wl, wlvif);
if (ret < 0)
return ret;
diff --git a/drivers/nfc/fdp/i2c.c b/drivers/nfc/fdp/i2c.c
index 712936f5d2d6..fbd26ecbf4a4 100644
--- a/drivers/nfc/fdp/i2c.c
+++ b/drivers/nfc/fdp/i2c.c
@@ -177,6 +177,16 @@ static int fdp_nci_i2c_read(struct fdp_i2c_phy *phy, struct sk_buff **skb)
/* Packet that contains a length */
if (tmp[0] == 0 && tmp[1] == 0) {
phy->next_read_size = (tmp[2] << 8) + tmp[3] + 3;
+ /*
+ * Ensure next_read_size does not exceed sizeof(tmp)
+ * for reading that many bytes during next iteration
+ */
+ if (phy->next_read_size > FDP_NCI_I2C_MAX_PAYLOAD) {
+ dev_dbg(&client->dev, "%s: corrupted packet\n",
+ __func__);
+ phy->next_read_size = 5;
+ goto flush;
+ }
} else {
phy->next_read_size = FDP_NCI_I2C_MIN_PAYLOAD;
diff --git a/drivers/nfc/st21nfca/dep.c b/drivers/nfc/st21nfca/dep.c
index 798a32bbac5d..206285210ab5 100644
--- a/drivers/nfc/st21nfca/dep.c
+++ b/drivers/nfc/st21nfca/dep.c
@@ -217,7 +217,8 @@ static int st21nfca_tm_recv_atr_req(struct nfc_hci_dev *hdev,
atr_req = (struct st21nfca_atr_req *)skb->data;
- if (atr_req->length < sizeof(struct st21nfca_atr_req)) {
+ if (atr_req->length < sizeof(struct st21nfca_atr_req) ||
+ atr_req->length > skb->len) {
r = -EPROTO;
goto exit;
}
diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
index e9360d5cbcba..e37a2a597b2d 100644
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -1063,42 +1063,66 @@ int __init early_init_dt_scan_memory(unsigned long node, const char *uname,
return 0;
}
+/*
+ * Convert configs to something easy to use in C code
+ */
+#if defined(CONFIG_CMDLINE_FORCE)
+static const int overwrite_incoming_cmdline = 1;
+static const int read_dt_cmdline;
+static const int concat_cmdline;
+#elif defined(CONFIG_CMDLINE_EXTEND)
+static const int overwrite_incoming_cmdline;
+static const int read_dt_cmdline = 1;
+static const int concat_cmdline = 1;
+#else /* CMDLINE_FROM_BOOTLOADER */
+static const int overwrite_incoming_cmdline;
+static const int read_dt_cmdline = 1;
+static const int concat_cmdline;
+#endif
+
+#ifdef CONFIG_CMDLINE
+static const char *config_cmdline = CONFIG_CMDLINE;
+#else
+static const char *config_cmdline = "";
+#endif
+
int __init early_init_dt_scan_chosen(unsigned long node, const char *uname,
int depth, void *data)
{
- int l;
- const char *p;
+ int l = 0;
+ const char *p = NULL;
+ char *cmdline = data;
pr_debug("search \"chosen\", depth: %d, uname: %s\n", depth, uname);
- if (depth != 1 || !data ||
+ if (depth != 1 || !cmdline ||
(strcmp(uname, "chosen") != 0 && strcmp(uname, "chosen@0") != 0))
return 0;
early_init_dt_check_for_initrd(node);
- /* Retrieve command line */
- p = of_get_flat_dt_prop(node, "bootargs", &l);
- if (p != NULL && l > 0)
- strlcpy(data, p, min((int)l, COMMAND_LINE_SIZE));
-
- /*
- * CONFIG_CMDLINE is meant to be a default in case nothing else
- * managed to set the command line, unless CONFIG_CMDLINE_FORCE
- * is set in which case we override whatever was found earlier.
- */
-#ifdef CONFIG_CMDLINE
-#if defined(CONFIG_CMDLINE_EXTEND)
- strlcat(data, " ", COMMAND_LINE_SIZE);
- strlcat(data, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
-#elif defined(CONFIG_CMDLINE_FORCE)
- strlcpy(data, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
-#else
- /* No arguments from boot loader, use kernel's cmdl*/
- if (!((char *)data)[0])
- strlcpy(data, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
-#endif
-#endif /* CONFIG_CMDLINE */
+ /* Put CONFIG_CMDLINE in if forced or if data had nothing in it to start */
+ if (overwrite_incoming_cmdline || !cmdline[0])
+ strlcpy(cmdline, config_cmdline, COMMAND_LINE_SIZE);
+
+ /* Retrieve command line unless forcing */
+ if (read_dt_cmdline)
+ p = of_get_flat_dt_prop(node, "bootargs", &l);
+
+ if (p != NULL && l > 0) {
+ if (concat_cmdline) {
+ int cmdline_len;
+ int copy_len;
+ strlcat(cmdline, " ", COMMAND_LINE_SIZE);
+ cmdline_len = strlen(cmdline);
+ copy_len = COMMAND_LINE_SIZE - cmdline_len - 1;
+ copy_len = min((int)l, copy_len);
+ strncpy(cmdline + cmdline_len, p, copy_len);
+ cmdline[cmdline_len + copy_len] = '\0';
+ } else {
+ strlcpy(cmdline, p, min((int)l, COMMAND_LINE_SIZE));
+ }
+ }
pr_debug("Command line is: %s\n", (char*)data);
diff --git a/drivers/platform/goldfish/Makefile b/drivers/platform/goldfish/Makefile
index d3487125838c..277a820ee4e1 100644
--- a/drivers/platform/goldfish/Makefile
+++ b/drivers/platform/goldfish/Makefile
@@ -2,4 +2,5 @@
# Makefile for Goldfish platform specific drivers
#
obj-$(CONFIG_GOLDFISH_BUS) += pdev_bus.o
-obj-$(CONFIG_GOLDFISH_PIPE) += goldfish_pipe.o
+obj-$(CONFIG_GOLDFISH_PIPE) += goldfish_pipe_all.o
+goldfish_pipe_all-objs := goldfish_pipe.o goldfish_pipe_v2.o
diff --git a/drivers/platform/goldfish/goldfish_pipe.c b/drivers/platform/goldfish/goldfish_pipe.c
index 1aba2c74160e..91e0a5645799 100644
--- a/drivers/platform/goldfish/goldfish_pipe.c
+++ b/drivers/platform/goldfish/goldfish_pipe.c
@@ -15,52 +15,11 @@
*
*/
-/* This source file contains the implementation of a special device driver
- * that intends to provide a *very* fast communication channel between the
- * guest system and the QEMU emulator.
- *
- * Usage from the guest is simply the following (error handling simplified):
- *
- * int fd = open("/dev/qemu_pipe",O_RDWR);
- * .... write() or read() through the pipe.
- *
- * This driver doesn't deal with the exact protocol used during the session.
- * It is intended to be as simple as something like:
- *
- * // do this _just_ after opening the fd to connect to a specific
- * // emulator service.
- * const char* msg = "<pipename>";
- * if (write(fd, msg, strlen(msg)+1) < 0) {
- * ... could not connect to <pipename> service
- * close(fd);
- * }
- *
- * // after this, simply read() and write() to communicate with the
- * // service. Exact protocol details left as an exercise to the reader.
- *
- * This driver is very fast because it doesn't copy any data through
- * intermediate buffers, since the emulator is capable of translating
- * guest user addresses into host ones.
- *
- * Note that we must however ensure that each user page involved in the
- * exchange is properly mapped during a transfer.
+/* This source file contains the implementation of the legacy version of
+ * a goldfish pipe device driver. See goldfish_pipe_v2.c for the current
+ * version.
*/
-
-#include <linux/module.h>
-#include <linux/interrupt.h>
-#include <linux/kernel.h>
-#include <linux/spinlock.h>
-#include <linux/miscdevice.h>
-#include <linux/platform_device.h>
-#include <linux/poll.h>
-#include <linux/sched.h>
-#include <linux/bitops.h>
-#include <linux/slab.h>
-#include <linux/io.h>
-#include <linux/goldfish.h>
-#include <linux/dma-mapping.h>
-#include <linux/mm.h>
-#include <linux/acpi.h>
+#include "goldfish_pipe.h"
/*
* IMPORTANT: The following constants must match the ones used and defined
@@ -110,29 +69,15 @@
#define PIPE_WAKE_READ (1 << 1) /* pipe can now be read from */
#define PIPE_WAKE_WRITE (1 << 2) /* pipe can now be written to */
-struct access_params {
- unsigned long channel;
- u32 size;
- unsigned long address;
- u32 cmd;
- u32 result;
- /* reserved for future extension */
- u32 flags;
-};
+#define MAX_PAGES_TO_GRAB 32
-/* The global driver data. Holds a reference to the i/o page used to
- * communicate with the emulator, and a wake queue for blocked tasks
- * waiting to be awoken.
- */
-struct goldfish_pipe_dev {
- spinlock_t lock;
- unsigned char __iomem *base;
- struct access_params *aps;
- int irq;
- u32 version;
-};
+#define DEBUG 0
-static struct goldfish_pipe_dev pipe_dev[1];
+#if DEBUG
+#define DPRINT(...) { printk(KERN_ERR __VA_ARGS__); }
+#else
+#define DPRINT(...)
+#endif
/* This data type models a given pipe instance */
struct goldfish_pipe {
@@ -142,6 +87,15 @@ struct goldfish_pipe {
wait_queue_head_t wake_queue;
};
+struct access_params {
+ unsigned long channel;
+ u32 size;
+ unsigned long address;
+ u32 cmd;
+ u32 result;
+ /* reserved for future extension */
+ u32 flags;
+};
/* Bit flags for the 'flags' field */
enum {
@@ -231,8 +185,10 @@ static int setup_access_params_addr(struct platform_device *pdev,
if (valid_batchbuffer_addr(dev, aps)) {
dev->aps = aps;
return 0;
- } else
+ } else {
+ devm_kfree(&pdev->dev, aps);
return -1;
+ }
}
/* A value that will not be set by qemu emulator */
@@ -269,6 +225,7 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer,
struct goldfish_pipe *pipe = filp->private_data;
struct goldfish_pipe_dev *dev = pipe->dev;
unsigned long address, address_end;
+ struct page* pages[MAX_PAGES_TO_GRAB] = {};
int count = 0, ret = -EINVAL;
/* If the emulator already closed the pipe, no need to go further */
@@ -293,45 +250,61 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer,
while (address < address_end) {
unsigned long page_end = (address & PAGE_MASK) + PAGE_SIZE;
- unsigned long next = page_end < address_end ? page_end
- : address_end;
- unsigned long avail = next - address;
- int status, wakeBit;
- struct page *page;
-
- /* Either vaddr or paddr depending on the device version */
- unsigned long xaddr;
+ unsigned long next, avail;
+ int status, wakeBit, page_i, num_contiguous_pages;
+ long first_page, last_page, requested_pages;
+ unsigned long xaddr, xaddr_prev, xaddr_i;
/*
- * We grab the pages on a page-by-page basis in case user
- * space gives us a potentially huge buffer but the read only
- * returns a small amount, then there's no need to pin that
- * much memory to the process.
+ * Attempt to grab multiple physically contiguous pages.
*/
- down_read(&current->mm->mmap_sem);
- ret = get_user_pages(address, 1, is_write ? 0 : FOLL_WRITE,
- &page, NULL);
- up_read(&current->mm->mmap_sem);
- if (ret < 0)
- break;
+ first_page = address & PAGE_MASK;
+ last_page = (address_end - 1) & PAGE_MASK;
+ requested_pages = ((last_page - first_page) >> PAGE_SHIFT) + 1;
+ if (requested_pages > MAX_PAGES_TO_GRAB) {
+ requested_pages = MAX_PAGES_TO_GRAB;
+ }
+ ret = get_user_pages_fast(first_page, requested_pages,
+ !is_write, pages);
+
+ DPRINT("%s: requested pages: %d %d %p\n", __FUNCTION__,
+ ret, requested_pages, first_page);
+ if (ret == 0) {
+ DPRINT("%s: error: (requested pages == 0) (wanted %d)\n",
+ __FUNCTION__, requested_pages);
+ mutex_unlock(&pipe->lock);
+ return ret;
+ }
+ if (ret < 0) {
+ DPRINT("%s: (requested pages < 0) %d \n",
+ __FUNCTION__, requested_pages);
+ mutex_unlock(&pipe->lock);
+ return ret;
+ }
- if (dev->version) {
- /* Device version 1 or newer (qemu-android) expects the
- * physical address.
- */
- xaddr = page_to_phys(page) | (address & ~PAGE_MASK);
- } else {
- /* Device version 0 (classic emulator) expects the
- * virtual address.
- */
- xaddr = address;
+ xaddr = page_to_phys(pages[0]) | (address & ~PAGE_MASK);
+ xaddr_prev = xaddr;
+ num_contiguous_pages = ret == 0 ? 0 : 1;
+ for (page_i = 1; page_i < ret; page_i++) {
+ xaddr_i = page_to_phys(pages[page_i]) | (address & ~PAGE_MASK);
+ if (xaddr_i == xaddr_prev + PAGE_SIZE) {
+ page_end += PAGE_SIZE;
+ xaddr_prev = xaddr_i;
+ num_contiguous_pages++;
+ } else {
+ DPRINT("%s: discontinuous page boundary: %d pages instead\n",
+ __FUNCTION__, page_i);
+ break;
+ }
}
+ next = page_end < address_end ? page_end : address_end;
+ avail = next - address;
/* Now, try to transfer the bytes in the current page */
spin_lock_irqsave(&dev->lock, irq_flags);
if (access_with_param(dev,
- is_write ? CMD_WRITE_BUFFER : CMD_READ_BUFFER,
- xaddr, avail, pipe, &status)) {
+ is_write ? CMD_WRITE_BUFFER : CMD_READ_BUFFER,
+ xaddr, avail, pipe, &status)) {
gf_write_ptr(pipe, dev->base + PIPE_REG_CHANNEL,
dev->base + PIPE_REG_CHANNEL_HIGH);
writel(avail, dev->base + PIPE_REG_SIZE);
@@ -344,9 +317,13 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer,
}
spin_unlock_irqrestore(&dev->lock, irq_flags);
- if (status > 0 && !is_write)
- set_page_dirty(page);
- put_page(page);
+ for (page_i = 0; page_i < ret; page_i++) {
+ if (status > 0 && !is_write &&
+ page_i < num_contiguous_pages) {
+ set_page_dirty(pages[page_i]);
+ }
+ put_page(pages[page_i]);
+ }
if (status > 0) { /* Correct transfer */
count += status;
@@ -368,7 +345,7 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer,
*/
if (status != PIPE_ERROR_AGAIN)
pr_info_ratelimited("goldfish_pipe: backend returned error %d on %s\n",
- status, is_write ? "write" : "read");
+ status, is_write ? "write" : "read");
ret = 0;
break;
}
@@ -378,7 +355,7 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer,
* non-blocking mode, just return the error code.
*/
if (status != PIPE_ERROR_AGAIN ||
- (filp->f_flags & O_NONBLOCK) != 0) {
+ (filp->f_flags & O_NONBLOCK) != 0) {
ret = goldfish_pipe_error_convert(status);
break;
}
@@ -392,7 +369,7 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer,
/* Tell the emulator we're going to wait for a wake event */
goldfish_cmd(pipe,
- is_write ? CMD_WAKE_ON_WRITE : CMD_WAKE_ON_READ);
+ is_write ? CMD_WAKE_ON_WRITE : CMD_WAKE_ON_READ);
/* Unlock the pipe, then wait for the wake signal */
mutex_unlock(&pipe->lock);
@@ -538,6 +515,8 @@ static int goldfish_pipe_open(struct inode *inode, struct file *file)
pipe->dev = dev;
mutex_init(&pipe->lock);
+ DPRINT("%s: call. pipe_dev pipe_dev=0x%lx new_pipe_addr=0x%lx file=0x%lx\n", __FUNCTION__, pipe_dev, pipe, file);
+ // spin lock init, write head of list, i guess
init_waitqueue_head(&pipe->wake_queue);
/*
@@ -560,6 +539,7 @@ static int goldfish_pipe_release(struct inode *inode, struct file *filp)
{
struct goldfish_pipe *pipe = filp->private_data;
+ DPRINT("%s: call. pipe=0x%lx file=0x%lx\n", __FUNCTION__, pipe, filp);
/* The guest is closing the channel, so tell the emulator right now */
goldfish_cmd(pipe, CMD_CLOSE);
kfree(pipe);
@@ -576,98 +556,33 @@ static const struct file_operations goldfish_pipe_fops = {
.release = goldfish_pipe_release,
};
-static struct miscdevice goldfish_pipe_device = {
+static struct miscdevice goldfish_pipe_dev = {
.minor = MISC_DYNAMIC_MINOR,
.name = "goldfish_pipe",
.fops = &goldfish_pipe_fops,
};
-static int goldfish_pipe_probe(struct platform_device *pdev)
+int goldfish_pipe_device_init_v1(struct platform_device *pdev)
{
- int err;
- struct resource *r;
struct goldfish_pipe_dev *dev = pipe_dev;
-
- /* not thread safe, but this should not happen */
- WARN_ON(dev->base != NULL);
-
- spin_lock_init(&dev->lock);
-
- r = platform_get_resource(pdev, IORESOURCE_MEM, 0);
- if (r == NULL || resource_size(r) < PAGE_SIZE) {
- dev_err(&pdev->dev, "can't allocate i/o page\n");
- return -EINVAL;
- }
- dev->base = devm_ioremap(&pdev->dev, r->start, PAGE_SIZE);
- if (dev->base == NULL) {
- dev_err(&pdev->dev, "ioremap failed\n");
- return -EINVAL;
- }
-
- r = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
- if (r == NULL) {
- err = -EINVAL;
- goto error;
- }
- dev->irq = r->start;
-
- err = devm_request_irq(&pdev->dev, dev->irq, goldfish_pipe_interrupt,
+ int err = devm_request_irq(&pdev->dev, dev->irq, goldfish_pipe_interrupt,
IRQF_SHARED, "goldfish_pipe", dev);
if (err) {
- dev_err(&pdev->dev, "unable to allocate IRQ\n");
- goto error;
+ dev_err(&pdev->dev, "unable to allocate IRQ for v1\n");
+ return err;
}
- err = misc_register(&goldfish_pipe_device);
+ err = misc_register(&goldfish_pipe_dev);
if (err) {
- dev_err(&pdev->dev, "unable to register device\n");
- goto error;
+ dev_err(&pdev->dev, "unable to register v1 device\n");
+ return err;
}
- setup_access_params_addr(pdev, dev);
- /* Although the pipe device in the classic Android emulator does not
- * recognize the 'version' register, it won't treat this as an error
- * either and will simply return 0, which is fine.
- */
- dev->version = readl(dev->base + PIPE_REG_VERSION);
+ setup_access_params_addr(pdev, dev);
return 0;
-
-error:
- dev->base = NULL;
- return err;
}
-static int goldfish_pipe_remove(struct platform_device *pdev)
+void goldfish_pipe_device_deinit_v1(struct platform_device *pdev)
{
- struct goldfish_pipe_dev *dev = pipe_dev;
- misc_deregister(&goldfish_pipe_device);
- dev->base = NULL;
- return 0;
+ misc_deregister(&goldfish_pipe_dev);
}
-
-static const struct acpi_device_id goldfish_pipe_acpi_match[] = {
- { "GFSH0003", 0 },
- { },
-};
-MODULE_DEVICE_TABLE(acpi, goldfish_pipe_acpi_match);
-
-static const struct of_device_id goldfish_pipe_of_match[] = {
- { .compatible = "google,android-pipe", },
- {},
-};
-MODULE_DEVICE_TABLE(of, goldfish_pipe_of_match);
-
-static struct platform_driver goldfish_pipe = {
- .probe = goldfish_pipe_probe,
- .remove = goldfish_pipe_remove,
- .driver = {
- .name = "goldfish_pipe",
- .owner = THIS_MODULE,
- .of_match_table = goldfish_pipe_of_match,
- .acpi_match_table = ACPI_PTR(goldfish_pipe_acpi_match),
- }
-};
-
-module_platform_driver(goldfish_pipe);
-MODULE_AUTHOR("David Turner <digit@google.com>");
-MODULE_LICENSE("GPL");
diff --git a/drivers/platform/goldfish/goldfish_pipe.h b/drivers/platform/goldfish/goldfish_pipe.h
new file mode 100644
index 000000000000..6cd1b63be8c9
--- /dev/null
+++ b/drivers/platform/goldfish/goldfish_pipe.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2016 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+#ifndef GOLDFISH_PIPE_H
+#define GOLDFISH_PIPE_H
+
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/miscdevice.h>
+#include <linux/platform_device.h>
+#include <linux/poll.h>
+#include <linux/sched.h>
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/io.h>
+#include <linux/goldfish.h>
+#include <linux/dma-mapping.h>
+#include <linux/mm.h>
+#include <linux/acpi.h>
+
+
+/* Initialize the legacy version of the pipe device driver */
+int goldfish_pipe_device_init_v1(struct platform_device *pdev);
+
+/* Deinitialize the legacy version of the pipe device driver */
+void goldfish_pipe_device_deinit_v1(struct platform_device *pdev);
+
+/*
+ * Forward declarations for the types struct goldfish_pipe_dev refers to
+ * by pointer only. The names match the member declarations in that struct.
+ */
+struct goldfish_pipe;
+struct goldfish_pipe_dev_buffers;
+
+/* The global driver data. Holds the mapping of the i/o page used to
+ * communicate with the emulator, the table of pipes, the list of signalled
+ * pipes and the device's IRQ and version.
+ */
+struct goldfish_pipe_dev {
+ /*
+ * Global device spinlock. Protects the following members:
+ * - pipes, pipes_capacity
+ * - [*pipes, *pipes + pipes_capacity) - array data
+ * - first_signalled_pipe,
+ * goldfish_pipe::prev_signalled,
+ * goldfish_pipe::next_signalled,
+ * goldfish_pipe::signalled_flags - all signalled-related fields,
+ * in all allocated pipes
+ * - open_command_params - PIPE_CMD_OPEN-related buffers
+ *
+ * It looks like a lot of different fields, but the trick is that the only
+ * operation that happens often is the signalled pipes array manipulation.
+ * That's why it's OK for now to keep the rest of the fields under the same
+ * lock. If we notice too much contention because of PIPE_CMD_OPEN,
+ * then we should add a separate lock there.
+ */
+ spinlock_t lock;
+
+ /*
+ * Array of the pipes of |pipes_capacity| elements,
+ * indexed by goldfish_pipe::id
+ */
+ struct goldfish_pipe **pipes;
+ u32 pipes_capacity;
+
+ /* Pointers to the buffers host uses for interaction with this driver */
+ struct goldfish_pipe_dev_buffers *buffers;
+
+ /* Head of a doubly linked list of signalled pipes */
+ struct goldfish_pipe *first_signalled_pipe;
+
+ /* Some device-specific data */
+ int irq;
+ int version;
+ unsigned char __iomem *base;
+
+ /* v1-specific access parameters */
+ struct access_params *aps;
+};
+
+extern struct goldfish_pipe_dev pipe_dev[1];
+
+#endif /* GOLDFISH_PIPE_H */
diff --git a/drivers/platform/goldfish/goldfish_pipe_v2.c b/drivers/platform/goldfish/goldfish_pipe_v2.c
new file mode 100644
index 000000000000..ad373ed36555
--- /dev/null
+++ b/drivers/platform/goldfish/goldfish_pipe_v2.c
@@ -0,0 +1,889 @@
+/*
+ * Copyright (C) 2012 Intel, Inc.
+ * Copyright (C) 2013 Intel, Inc.
+ * Copyright (C) 2014 Linaro Limited
+ * Copyright (C) 2011-2016 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+/* This source file contains the implementation of a special device driver
+ * that intends to provide a *very* fast communication channel between the
+ * guest system and the QEMU emulator.
+ *
+ * Usage from the guest is simply the following (error handling simplified):
+ *
+ * int fd = open("/dev/qemu_pipe",O_RDWR);
+ * .... write() or read() through the pipe.
+ *
+ * This driver doesn't deal with the exact protocol used during the session.
+ * It is intended to be as simple as something like:
+ *
+ * // do this _just_ after opening the fd to connect to a specific
+ * // emulator service.
+ * const char* msg = "<pipename>";
+ * if (write(fd, msg, strlen(msg)+1) < 0) {
+ * ... could not connect to <pipename> service
+ * close(fd);
+ * }
+ *
+ * // after this, simply read() and write() to communicate with the
+ * // service. Exact protocol details left as an exercise to the reader.
+ *
+ * This driver is very fast because it doesn't copy any data through
+ * intermediate buffers, since the emulator is capable of translating
+ * guest user addresses into host ones.
+ *
+ * Note that we must however ensure that each user page involved in the
+ * exchange is properly mapped during a transfer.
+ */
+
+#include "goldfish_pipe.h"
+
+
+/*
+ * Update this when something changes in the driver's behavior so the host
+ * can benefit from knowing it.
+ *
+ * PIPE_DRIVER_VERSION is what probe writes to PIPE_REG_VERSION.
+ * PIPE_CURRENT_DEVICE_VERSION is the lowest device version handled by the
+ * v2 code in this file; a device reporting less is set up through the
+ * v1 init/deinit functions instead.
+ */
+enum {
+ PIPE_DRIVER_VERSION = 2,
+ PIPE_CURRENT_DEVICE_VERSION = 2
+};
+
+/*
+ * IMPORTANT: The following constants must match the ones used and defined
+ * in external/qemu/hw/goldfish_pipe.c in the Android source tree.
+ */
+
+/*
+ * List of bitflags returned in status of CMD_POLL command.
+ * goldfish_pipe_poll() translates them to POLLIN|POLLRDNORM,
+ * POLLOUT|POLLWRNORM and POLLHUP respectively.
+ */
+enum PipePollFlags {
+ PIPE_POLL_IN = 1 << 0,
+ PIPE_POLL_OUT = 1 << 1,
+ PIPE_POLL_HUP = 1 << 2
+};
+
+/*
+ * Possible status values used to signal errors - see
+ * goldfish_pipe_error_convert, which maps AGAIN/NOMEM/IO to
+ * -EAGAIN/-ENOMEM/-EIO and everything else to -EINVAL.
+ */
+enum PipeErrors {
+ PIPE_ERROR_INVAL = -1,
+ PIPE_ERROR_AGAIN = -2,
+ PIPE_ERROR_NOMEM = -3,
+ PIPE_ERROR_IO = -4
+};
+
+/*
+ * Bit-flags used to signal events from the emulator. They arrive in
+ * signalled_pipe_buffer::flags, are accumulated into
+ * goldfish_pipe::signalled_flags and are acted upon in
+ * goldfish_interrupt_task().
+ */
+enum PipeWakeFlags {
+ PIPE_WAKE_CLOSED = 1 << 0, /* emulator closed pipe */
+ PIPE_WAKE_READ = 1 << 1, /* pipe can now be read from */
+ PIPE_WAKE_WRITE = 1 << 2 /* pipe can now be written to */
+};
+
+/*
+ * Bit flags for the 'flags' field of struct goldfish_pipe.
+ * These are bit *indices* (not masks): they are passed to
+ * set_bit()/test_bit()/clear_bit() on goldfish_pipe::flags.
+ */
+enum PipeFlagsBits {
+ BIT_CLOSED_ON_HOST = 0, /* pipe closed by host */
+ BIT_WAKE_ON_WRITE = 1, /* want to be woken on writes */
+ BIT_WAKE_ON_READ = 2, /* want to be woken on reads */
+};
+
+/* Byte offsets of the device registers, relative to goldfish_pipe_dev::base */
+enum PipeRegs {
+ /* Writing a pipe id here asks the host to run that pipe's command buffer */
+ PIPE_REG_CMD = 0,
+
+ /* Physical address (high/low 32 bits) and capacity of the signalled-pipes buffer */
+ PIPE_REG_SIGNAL_BUFFER_HIGH = 4,
+ PIPE_REG_SIGNAL_BUFFER = 8,
+ PIPE_REG_SIGNAL_BUFFER_COUNT = 12,
+
+ /* Physical address (high/low 32 bits) of the open_command_params buffer */
+ PIPE_REG_OPEN_BUFFER_HIGH = 20,
+ PIPE_REG_OPEN_BUFFER = 24,
+
+ /* Driver writes its version here, then reads back the device version */
+ PIPE_REG_VERSION = 36,
+
+ /* Reading returns the number of entries the host put in the signalled buffer */
+ PIPE_REG_GET_SIGNALLED = 48,
+};
+
+/*
+ * Command codes stored in goldfish_pipe_command::cmd before the pipe id is
+ * written to PIPE_REG_CMD. Values are consecutive, starting at 1.
+ */
+enum PipeCmdCode {
+ PIPE_CMD_OPEN = 1, /* to be used by the pipe device itself */
+ PIPE_CMD_CLOSE,
+ PIPE_CMD_POLL,
+ PIPE_CMD_WRITE,
+ PIPE_CMD_WAKE_ON_WRITE,
+ PIPE_CMD_READ,
+ PIPE_CMD_WAKE_ON_READ,
+
+ /*
+ * TODO(zyy): implement a deferred read/write execution to allow parallel
+ * processing of pipe operations on the host.
+ * (PIPE_CMD_WAKE_ON_DONE_IO is not issued anywhere in this file yet.)
+ */
+ PIPE_CMD_WAKE_ON_DONE_IO,
+};
+
+/*
+ * Driver limits.
+ * - MAX_BUFFERS_PER_COMMAND: number of ptr/size slots in one read/write
+ *   command; it sizes struct goldfish_pipe_command, which probe asserts
+ *   does not exceed PAGE_SIZE.
+ * - MAX_SIGNALLED_PIPES: entries in the signalled-pipes buffer shared
+ *   with the host (one interrupt handles at most this many).
+ * - INITIAL_PIPES_CAPACITY: starting size of goldfish_pipe_dev::pipes;
+ *   the array is doubled when it fills up.
+ */
+enum {
+ MAX_BUFFERS_PER_COMMAND = 336,
+ MAX_SIGNALLED_PIPES = 64,
+ INITIAL_PIPES_CAPACITY = 64
+};
+
+struct goldfish_pipe_dev;
+struct goldfish_pipe;
+struct goldfish_pipe_command;
+
+/*
+ * A per-pipe command structure, shared with the host.
+ *
+ * One instance is allocated per pipe on its own page in
+ * goldfish_pipe_open(); its physical address is handed to the host through
+ * open_command_param::command_buffer_ptr. The field layout is part of the
+ * guest/host protocol and must not be rearranged.
+ */
+struct goldfish_pipe_command {
+ s32 cmd; /* PipeCmdCode, guest -> host */
+ s32 id; /* pipe id, guest -> host */
+ s32 status; /* command execution status, host -> guest */
+ s32 reserved; /* to pad to 64-bit boundary */
+ union {
+ /* Parameters for PIPE_CMD_{READ,WRITE} */
+ struct {
+ u32 buffers_count; /* number of buffers, guest -> host */
+ s32 consumed_size; /* number of consumed bytes, host -> guest */
+ u64 ptrs[MAX_BUFFERS_PER_COMMAND]; /* buffer pointers, guest -> host */
+ u32 sizes[MAX_BUFFERS_PER_COMMAND]; /* buffer sizes, guest -> host */
+ } rw_params;
+ };
+};
+
+/* A single signalled pipe information, filled in by the host */
+struct signalled_pipe_buffer {
+ u32 id; /* pipe id - index into goldfish_pipe_dev::pipes */
+ u32 flags; /* PipeWakeFlags bits for that pipe */
+};
+
+/* Parameters for the PIPE_CMD_OPEN command, filled in by goldfish_pipe_open() */
+struct open_command_param {
+ u64 command_buffer_ptr; /* physical address of the new pipe's command buffer */
+ u32 rw_params_max_count; /* set to MAX_BUFFERS_PER_COMMAND */
+};
+
+/*
+ * Device-level set of buffers shared with the host.
+ * Allocated as a single page in goldfish_pipe_device_init_v2(), which then
+ * writes the physical address of each member to the device registers.
+ */
+struct goldfish_pipe_dev_buffers {
+ struct open_command_param open_command_params;
+ struct signalled_pipe_buffer signalled_pipe_buffers[MAX_SIGNALLED_PIPES];
+};
+
+/*
+ * This data type models a given pipe instance.
+ * Created in goldfish_pipe_open() and stored in file->private_data;
+ * destroyed in goldfish_pipe_release().
+ */
+struct goldfish_pipe {
+ u32 id; /* pipe ID - index into goldfish_pipe_dev::pipes array */
+ unsigned long flags; /* The wake flags pipe is waiting for
+ * (bit indices from enum PipeFlagsBits).
+ * Note: not protected with any lock, uses atomic operations
+ * and barriers to make it thread-safe.
+ */
+ unsigned long signalled_flags; /* wake flags host has signalled
+ * (PipeWakeFlags bits),
+ * - protected by goldfish_pipe_dev::lock */
+
+ struct goldfish_pipe_command *command_buffer; /* A pointer to command buffer */
+
+ /* doubly linked list of signalled pipes, protected by goldfish_pipe_dev::lock */
+ struct goldfish_pipe *prev_signalled;
+ struct goldfish_pipe *next_signalled;
+
+ /*
+ * A pipe's own lock. Protects the following:
+ * - *command_buffer - makes sure a command can safely write its parameters
+ * to the host and read the results back.
+ */
+ struct mutex lock;
+
+ wait_queue_head_t wake_queue; /* A wake queue for sleeping until host signals an event */
+ struct goldfish_pipe_dev *dev; /* Pointer to the parent goldfish_pipe_dev instance */
+};
+
+struct goldfish_pipe_dev pipe_dev[1] = {};
+
+/*
+ * Run |cmd| on |pipe|: fill in the command buffer, kick the host by writing
+ * the pipe id to PIPE_REG_CMD and return the status found in the buffer
+ * afterwards. Takes no lock itself; callers serialize access to the
+ * command buffer.
+ */
+static int goldfish_cmd_locked(struct goldfish_pipe *pipe, enum PipeCmdCode cmd)
+{
+	struct goldfish_pipe_command *cmd_buf = pipe->command_buffer;
+
+	cmd_buf->cmd = cmd;
+	/* Stays "invalid" unless the host overwrites it */
+	cmd_buf->status = PIPE_ERROR_INVAL;
+	writel(pipe->id, pipe->dev->base + PIPE_REG_CMD);
+
+	return cmd_buf->status;
+}
+
+/*
+ * Same as goldfish_cmd_locked(), but takes pipe->lock around the command.
+ * Returns PIPE_ERROR_IO if waiting for the lock was interrupted.
+ */
+static int goldfish_cmd(struct goldfish_pipe *pipe, enum PipeCmdCode cmd)
+{
+	int result = PIPE_ERROR_IO;
+
+	if (!mutex_lock_interruptible(&pipe->lock)) {
+		result = goldfish_cmd_locked(pipe, cmd);
+		mutex_unlock(&pipe->lock);
+	}
+	return result;
+}
+
+/*
+ * Translate a negative PipeErrors status reported by the emulator into the
+ * matching negative errno value. Anything that is not AGAIN, NOMEM or IO
+ * (including PIPE_ERROR_INVAL) becomes -EINVAL.
+ */
+static int goldfish_pipe_error_convert(int status)
+{
+	if (status == PIPE_ERROR_AGAIN)
+		return -EAGAIN;
+	if (status == PIPE_ERROR_NOMEM)
+		return -ENOMEM;
+	if (status == PIPE_ERROR_IO)
+		return -EIO;
+	return -EINVAL;
+}
+
+/*
+ * Pin the user pages spanning [first_page, last_page], at most
+ * MAX_BUFFERS_PER_COMMAND of them.
+ *
+ * On return *iter_last_page_size holds the number of bytes usable on the
+ * last pinned page: |last_page_size| when the whole range was pinned,
+ * PAGE_SIZE when the range was cut short.
+ *
+ * Returns the number of pages pinned, or -EFAULT if none could be pinned.
+ */
+static int pin_user_pages(unsigned long first_page, unsigned long last_page,
+		unsigned last_page_size, int is_write,
+		struct page *pages[MAX_BUFFERS_PER_COMMAND], unsigned *iter_last_page_size)
+{
+	int wanted = ((last_page - first_page) >> PAGE_SHIFT) + 1;
+	int pinned;
+
+	*iter_last_page_size = last_page_size;
+	if (wanted > MAX_BUFFERS_PER_COMMAND) {
+		wanted = MAX_BUFFERS_PER_COMMAND;
+		*iter_last_page_size = PAGE_SIZE;
+	}
+
+	/* A pipe read stores into user memory, hence the inverted flag */
+	pinned = get_user_pages_fast(first_page, wanted, !is_write, pages);
+	if (pinned <= 0)
+		return -EFAULT;
+
+	if (pinned < wanted)
+		*iter_last_page_size = PAGE_SIZE;
+	return pinned;
+}
+
+/*
+ * Drop the references taken by pin_user_pages(). When the transfer was a
+ * pipe read that consumed data, every page is marked dirty first.
+ */
+static void release_user_pages(struct page **pages, int pages_count,
+		int is_write, s32 consumed_size)
+{
+	const int mark_dirty = !is_write && consumed_size > 0;
+	int idx = 0;
+
+	while (idx < pages_count) {
+		struct page *pg = pages[idx++];
+
+		if (mark_dirty)
+			set_page_dirty(pg);
+		put_page(pg);
+	}
+}
+
+/*
+ * Populate the call parameters, merging adjacent pages together.
+ *
+ * Fills command->rw_params.ptrs[]/sizes[] with the physical address and
+ * length of each run of physically consecutive pinned pages, and sets
+ * buffers_count to the number of runs.
+ *
+ * @pages/@pages_count: pinned pages covering the start of the user range
+ * @address/@address_end: user virtual range still to transfer
+ * @first_page/@last_page: page-aligned addresses of the first and last
+ *   page of that range
+ * @iter_last_page_size: bytes to use on the last pinned page
+ * @is_write: not used by this function
+ */
+static void populate_rw_params(
+ struct page **pages, int pages_count,
+ unsigned long address, unsigned long address_end,
+ unsigned long first_page, unsigned long last_page,
+ unsigned iter_last_page_size, int is_write,
+ struct goldfish_pipe_command *command)
+{
+ /*
+ * Process the first page separately - it's the only page that
+ * needs special handling for its start address.
+ */
+ unsigned long xaddr = page_to_phys(pages[0]);
+ unsigned long xaddr_prev = xaddr;
+ int buffer_idx = 0;
+ int i = 1;
+ /* Whole request on one page: exact length; otherwise up to the page end */
+ int size_on_page = first_page == last_page
+ ? (int)(address_end - address)
+ : (PAGE_SIZE - (address & ~PAGE_MASK));
+ command->rw_params.ptrs[0] = (u64)(xaddr | (address & ~PAGE_MASK));
+ command->rw_params.sizes[0] = size_on_page;
+ for (; i < pages_count; ++i) {
+ xaddr = page_to_phys(pages[i]);
+ size_on_page = (i == pages_count - 1) ? iter_last_page_size : PAGE_SIZE;
+ if (xaddr == xaddr_prev + PAGE_SIZE) {
+ /* Physically follows the previous page: extend the current run */
+ command->rw_params.sizes[buffer_idx] += size_on_page;
+ } else {
+ /* Discontiguous: start a new run */
+ ++buffer_idx;
+ command->rw_params.ptrs[buffer_idx] = (u64)xaddr;
+ command->rw_params.sizes[buffer_idx] = size_on_page;
+ }
+ xaddr_prev = xaddr;
+ }
+ command->rw_params.buffers_count = buffer_idx + 1;
+}
+
+/*
+ * Transfer as much of [address, address_end) as fits in one command.
+ *
+ * Pins the user pages, describes them in the pipe's command buffer, issues
+ * PIPE_CMD_WRITE or PIPE_CMD_READ and unpins the pages again.
+ *
+ * On success returns 0 and stores the host's status in *status and the
+ * number of bytes it consumed in *consumed_size. Returns a negative errno
+ * (-EFAULT if no page could be pinned, -ERESTARTSYS if waiting for the pipe
+ * lock was interrupted) without touching the out parameters otherwise.
+ */
+static int transfer_max_buffers(struct goldfish_pipe *pipe,
+		unsigned long address, unsigned long address_end, int is_write,
+		unsigned long last_page, unsigned int last_page_size,
+		s32 *consumed_size, int *status)
+{
+	struct page *pages[MAX_BUFFERS_PER_COMMAND];
+	unsigned long first_page = address & PAGE_MASK;
+	unsigned int iter_last_page_size;
+	int pages_count = pin_user_pages(first_page, last_page,
+			last_page_size, is_write,
+			pages, &iter_last_page_size);
+	if (pages_count < 0)
+		return pages_count;
+
+	/* Serialize access to the pipe command buffers */
+	if (mutex_lock_interruptible(&pipe->lock)) {
+		/*
+		 * Nothing was transferred: drop the page references taken
+		 * above (without dirtying them) so they are not leaked.
+		 */
+		release_user_pages(pages, pages_count, is_write, 0);
+		return -ERESTARTSYS;
+	}
+
+	populate_rw_params(pages, pages_count, address, address_end,
+			first_page, last_page, iter_last_page_size, is_write,
+			pipe->command_buffer);
+
+	/* Transfer the data */
+	*status = goldfish_cmd_locked(pipe,
+			is_write ? PIPE_CMD_WRITE : PIPE_CMD_READ);
+
+	*consumed_size = pipe->command_buffer->rw_params.consumed_size;
+
+	mutex_unlock(&pipe->lock);
+
+	release_user_pages(pages, pages_count, is_write, *consumed_size);
+
+	return 0;
+}
+
+/*
+ * Sleep until the host signals that |pipe| is ready for the requested
+ * direction.
+ *
+ * Returns 0 once the wake bit has been cleared, -ERESTARTSYS if the sleep
+ * was interrupted, or -EIO if the pipe is found closed on the host after
+ * a wake-up.
+ */
+static int wait_for_host_signal(struct goldfish_pipe *pipe, int is_write)
+{
+ u32 wakeBit = is_write ? BIT_WAKE_ON_WRITE : BIT_WAKE_ON_READ;
+ /* Arm the bit first; goldfish_interrupt_task() clears it on wake */
+ set_bit(wakeBit, &pipe->flags);
+
+ /* Tell the emulator we're going to wait for a wake event */
+ /* (the command's status is deliberately ignored) */
+ (void)goldfish_cmd(pipe,
+ is_write ? PIPE_CMD_WAKE_ON_WRITE : PIPE_CMD_WAKE_ON_READ);
+
+ while (test_bit(wakeBit, &pipe->flags)) {
+ if (wait_event_interruptible(
+ pipe->wake_queue,
+ !test_bit(wakeBit, &pipe->flags)))
+ return -ERESTARTSYS;
+
+ if (test_bit(BIT_CLOSED_ON_HOST, &pipe->flags))
+ return -EIO;
+ }
+
+ return 0;
+}
+
+/*
+ * Common implementation of read() and write() on a pipe.
+ *
+ * Transfers the user buffer in chunks via transfer_max_buffers(), sleeping
+ * for a host wake-up when the host reports PIPE_ERROR_AGAIN, nothing has
+ * been transferred yet and the file is in blocking mode.
+ *
+ * Returns the number of bytes transferred if any, 0 for an empty request
+ * or when the host reports status 0, otherwise a negative errno.
+ */
+static ssize_t goldfish_pipe_read_write(struct file *filp,
+ char __user *buffer, size_t bufflen, int is_write)
+{
+ struct goldfish_pipe *pipe = filp->private_data;
+ int count = 0, ret = -EINVAL;
+ unsigned long address, address_end, last_page;
+ unsigned int last_page_size;
+
+ /* If the emulator already closed the pipe, no need to go further */
+ if (unlikely(test_bit(BIT_CLOSED_ON_HOST, &pipe->flags)))
+ return -EIO;
+ /* Zero-length reads or writes succeed */
+ if (unlikely(bufflen == 0))
+ return 0;
+ /* Check the buffer range for access */
+ if (unlikely(!access_ok(is_write ? VERIFY_WRITE : VERIFY_READ,
+ buffer, bufflen)))
+ return -EFAULT;
+
+ address = (unsigned long)buffer;
+ address_end = address + bufflen;
+ /* Page-aligned address of the last page, and bytes used on it */
+ last_page = (address_end - 1) & PAGE_MASK;
+ last_page_size = ((address_end - 1) & ~PAGE_MASK) + 1;
+
+ while (address < address_end) {
+ s32 consumed_size;
+ int status;
+ ret = transfer_max_buffers(pipe, address, address_end, is_write,
+ last_page, last_page_size, &consumed_size, &status);
+ if (ret < 0)
+ break;
+
+ if (consumed_size > 0) {
+ /* No matter what the status is, we've transferred something */
+ count += consumed_size;
+ address += consumed_size;
+ }
+ /* Positive status: go round again for the rest of the buffer */
+ if (status > 0)
+ continue;
+ if (status == 0) {
+ /* EOF */
+ ret = 0;
+ break;
+ }
+ if (count > 0) {
+ /*
+ * An error occurred, but we already transferred
+ * something on one of the previous iterations.
+ * Just return what we already copied and log this
+ * err.
+ */
+ if (status != PIPE_ERROR_AGAIN)
+ pr_info_ratelimited("goldfish_pipe: backend error %d on %s\n",
+ status, is_write ? "write" : "read");
+ break;
+ }
+
+ /*
+ * If the error is not PIPE_ERROR_AGAIN, or if we are in
+ * non-blocking mode, just return the error code.
+ */
+ if (status != PIPE_ERROR_AGAIN || (filp->f_flags & O_NONBLOCK) != 0) {
+ ret = goldfish_pipe_error_convert(status);
+ break;
+ }
+
+ /* Blocking mode and nothing transferred yet: sleep, then retry */
+ status = wait_for_host_signal(pipe, is_write);
+ if (status < 0)
+ return status;
+ }
+
+ if (count > 0)
+ return count;
+ return ret;
+}
+
+/* file_operations::read - forwards to the shared read/write path */
+static ssize_t goldfish_pipe_read(struct file *filp, char __user *buffer,
+		size_t bufflen, loff_t *ppos)
+{
+	const int is_write = 0;
+
+	return goldfish_pipe_read_write(filp, buffer, bufflen, is_write);
+}
+
+/* file_operations::write - forwards to the shared read/write path */
+static ssize_t goldfish_pipe_write(struct file *filp,
+		const char __user *buffer, size_t bufflen,
+		loff_t *ppos)
+{
+	const int is_write = 1;
+	/* The shared helper takes a non-const pointer; drop the qualifier */
+	char __user *user_buf = (char __user *)buffer;
+
+	return goldfish_pipe_read_write(filp, user_buf, bufflen, is_write);
+}
+
+/*
+ * file_operations::poll - ask the host for the pipe's readiness and
+ * translate it into a poll event mask.
+ *
+ * The return value is a bitmask of POLL* events, never an errno. A failed
+ * PIPE_CMD_POLL is therefore reported as POLLERR. If waiting for the pipe
+ * lock is interrupted, no events are reported and the pending signal is
+ * left for the poll core to handle.
+ */
+static unsigned int goldfish_pipe_poll(struct file *filp, poll_table *wait)
+{
+	struct goldfish_pipe *pipe = filp->private_data;
+	unsigned int mask = 0;
+	int status;
+
+	poll_wait(filp, &pipe->wake_queue, wait);
+
+	if (mutex_lock_interruptible(&pipe->lock))
+		return 0;
+	status = goldfish_cmd_locked(pipe, PIPE_CMD_POLL);
+	mutex_unlock(&pipe->lock);
+
+	if (status < 0)
+		return POLLERR;
+
+	if (status & PIPE_POLL_IN)
+		mask |= POLLIN | POLLRDNORM;
+	if (status & PIPE_POLL_OUT)
+		mask |= POLLOUT | POLLWRNORM;
+	if (status & PIPE_POLL_HUP)
+		mask |= POLLHUP;
+	if (test_bit(BIT_CLOSED_ON_HOST, &pipe->flags))
+		mask |= POLLERR;
+
+	return mask;
+}
+
+/*
+ * Record that the host signalled |flags| for pipe |id| and queue the pipe
+ * at the head of the device's signalled list if it is not already on it.
+ * Must be called with dev->lock held.
+ *
+ * |id| comes from a buffer written by the host, so an out-of-range value
+ * is warned about and ignored rather than crashing the guest. Ids of pipes
+ * that are no longer open are ignored silently.
+ */
+static void signalled_pipes_add_locked(struct goldfish_pipe_dev *dev,
+		u32 id, u32 flags)
+{
+	struct goldfish_pipe *pipe;
+
+	if (WARN_ON(id >= dev->pipes_capacity))
+		return;
+
+	pipe = dev->pipes[id];
+	if (!pipe)
+		return;
+	pipe->signalled_flags |= flags;
+
+	if (pipe->prev_signalled || pipe->next_signalled
+			|| dev->first_signalled_pipe == pipe)
+		return;	/* already in the list */
+	pipe->next_signalled = dev->first_signalled_pipe;
+	if (dev->first_signalled_pipe)
+		dev->first_signalled_pipe->prev_signalled = pipe;
+	dev->first_signalled_pipe = pipe;
+}
+
+/*
+ * Unlink |pipe| from the device's signalled list (a no-op for the list if
+ * the pipe is not on it) and reset its link pointers.
+ * Must be called with dev->lock held.
+ */
+static void signalled_pipes_remove_locked(struct goldfish_pipe_dev *dev,
+		struct goldfish_pipe *pipe)
+{
+	struct goldfish_pipe *before = pipe->prev_signalled;
+	struct goldfish_pipe *after = pipe->next_signalled;
+
+	if (before)
+		before->next_signalled = after;
+	if (after)
+		after->prev_signalled = before;
+	if (dev->first_signalled_pipe == pipe)
+		dev->first_signalled_pipe = after;
+
+	pipe->prev_signalled = NULL;
+	pipe->next_signalled = NULL;
+}
+
+/*
+ * Detach the first pipe from the device's signalled list.
+ *
+ * Takes dev->lock itself. Returns the detached pipe with its accumulated
+ * signalled flags stored in *wakes (and cleared on the pipe), or NULL if
+ * the list is empty, in which case *wakes is left untouched.
+ */
+static struct goldfish_pipe *signalled_pipes_pop_front(struct goldfish_pipe_dev *dev,
+ int *wakes)
+{
+ struct goldfish_pipe *pipe;
+ unsigned long flags;
+ spin_lock_irqsave(&dev->lock, flags);
+
+ pipe = dev->first_signalled_pipe;
+ if (pipe) {
+ *wakes = pipe->signalled_flags;
+ pipe->signalled_flags = 0;
+ /*
+ * This is an optimized version of signalled_pipes_remove_locked() -
+ * we want to make it as fast as possible to wake the sleeping pipe
+ * operations faster
+ */
+ dev->first_signalled_pipe = pipe->next_signalled;
+ if (dev->first_signalled_pipe)
+ dev->first_signalled_pipe->prev_signalled = NULL;
+ pipe->next_signalled = NULL;
+ }
+
+ spin_unlock_irqrestore(&dev->lock, flags);
+ return pipe;
+}
+
+/*
+ * Tasklet body: drain the device's signalled-pipes list, update each
+ * pipe's flags according to what the host signalled and wake its sleepers.
+ */
+static void goldfish_interrupt_task(unsigned long unused)
+{
+	struct goldfish_pipe_dev *dev = pipe_dev;
+	struct goldfish_pipe *pipe;
+	int wakes;
+
+	for (;;) {
+		pipe = signalled_pipes_pop_front(dev, &wakes);
+		if (pipe == NULL)
+			break;
+
+		if (wakes & PIPE_WAKE_CLOSED) {
+			/* Closed on the host: replaces all other flag bits */
+			pipe->flags = 1 << BIT_CLOSED_ON_HOST;
+		} else {
+			if (wakes & PIPE_WAKE_READ)
+				clear_bit(BIT_WAKE_ON_READ, &pipe->flags);
+			if (wakes & PIPE_WAKE_WRITE)
+				clear_bit(BIT_WAKE_ON_WRITE, &pipe->flags);
+		}
+		/*
+		 * wake_up_interruptible() implies a write barrier, so don't explicitly
+		 * add another one here.
+		 */
+		wake_up_interruptible(&pipe->wake_queue);
+	}
+}
+DECLARE_TASKLET(goldfish_interrupt_tasklet, goldfish_interrupt_task, 0);
+
+/*
+ * The general idea of the interrupt handling:
+ *
+ * 1. device raises an interrupt if there's at least one signalled pipe
+ * 2. IRQ handler reads the signalled pipes and their count from the device
+ * 3. device writes them into a shared buffer and returns the count
+ * it only resets the IRQ if it has returned all signalled pipes,
+ * otherwise it leaves it raised, so IRQ handler will be called
+ * again for the next chunk
+ * 4. IRQ handler adds all returned pipes to the device's signalled pipes list
+ * 5. IRQ handler launches a tasklet to process the signalled pipes from the
+ * list in a separate context
+ *
+ * Returns IRQ_NONE when |dev_id| is not this driver's device or the device
+ * reports no signalled pipes, IRQ_HANDLED otherwise.
+ */
+static irqreturn_t goldfish_pipe_interrupt(int irq, void *dev_id)
+{
+ u32 count;
+ u32 i;
+ unsigned long flags;
+ struct goldfish_pipe_dev *dev = dev_id;
+ if (dev != pipe_dev)
+ return IRQ_NONE;
+
+ /* Request the signalled pipes from the device */
+ spin_lock_irqsave(&dev->lock, flags);
+
+ count = readl(dev->base + PIPE_REG_GET_SIGNALLED);
+ if (count == 0) {
+ spin_unlock_irqrestore(&dev->lock, flags);
+ return IRQ_NONE;
+ }
+ /* Never read past the end of the shared buffer */
+ if (count > MAX_SIGNALLED_PIPES)
+ count = MAX_SIGNALLED_PIPES;
+
+ for (i = 0; i < count; ++i)
+ signalled_pipes_add_locked(dev,
+ dev->buffers->signalled_pipe_buffers[i].id,
+ dev->buffers->signalled_pipe_buffers[i].flags);
+
+ spin_unlock_irqrestore(&dev->lock, flags);
+
+ /* The actual wake-ups happen in goldfish_interrupt_task() */
+ tasklet_schedule(&goldfish_interrupt_tasklet);
+ return IRQ_HANDLED;
+}
+
+/*
+ * Find an unused slot in dev->pipes, doubling the array when it is full.
+ * Must be called with dev->lock held, which is why the reallocation uses
+ * GFP_ATOMIC.
+ *
+ * Returns the free pipe id, or -ENOMEM if the array could not be grown.
+ */
+static int get_free_pipe_id_locked(struct goldfish_pipe_dev *dev)
+{
+	struct goldfish_pipe **grown;
+	u32 grown_capacity;
+	int slot;
+
+	for (slot = 0; slot < dev->pipes_capacity; ++slot) {
+		if (!dev->pipes[slot])
+			return slot;
+	}
+
+	/* Every slot is taken: move to an array twice as large */
+	grown_capacity = 2 * dev->pipes_capacity;
+	grown = kcalloc(grown_capacity, sizeof(*grown), GFP_ATOMIC);
+	if (!grown)
+		return -ENOMEM;
+
+	memcpy(grown, dev->pipes, sizeof(*grown) * dev->pipes_capacity);
+	kfree(dev->pipes);
+	dev->pipes = grown;
+
+	/* The first slot of the new half is the free one */
+	slot = dev->pipes_capacity;
+	dev->pipes_capacity = grown_capacity;
+	return slot;
+}
+
+/**
+ * goldfish_pipe_open - open a channel to the AVD
+ * @inode: inode of device
+ * @file: file struct of opener
+ *
+ * Create a new pipe link between the emulator and the user application.
+ * Each new request produces a new pipe.
+ *
+ * Returns 0 on success (the pipe is then stored in file->private_data),
+ * -ENOMEM if an allocation fails, or the negative status reported for
+ * PIPE_CMD_OPEN.
+ *
+ * Note: we use the pipe ID as a mux. All goldfish emulations are 32bit
+ * right now so this is fine. A move to 64bit will need this addressing
+ */
+static int goldfish_pipe_open(struct inode *inode, struct file *file)
+{
+ struct goldfish_pipe_dev *dev = pipe_dev;
+ unsigned long flags;
+ int id;
+ int status;
+
+ /* Allocate new pipe kernel object */
+ struct goldfish_pipe *pipe = kzalloc(sizeof(*pipe), GFP_KERNEL);
+ if (pipe == NULL)
+ return -ENOMEM;
+
+ pipe->dev = dev;
+ mutex_init(&pipe->lock);
+ init_waitqueue_head(&pipe->wake_queue);
+
+ /*
+ * Command buffer needs to be allocated on its own page to make sure it is
+ * physically contiguous in host's address space.
+ */
+ pipe->command_buffer =
+ (struct goldfish_pipe_command*)__get_free_page(GFP_KERNEL);
+ if (!pipe->command_buffer) {
+ status = -ENOMEM;
+ goto err_pipe;
+ }
+
+ /* dev->lock covers the pipes table and the shared open_command_params */
+ spin_lock_irqsave(&dev->lock, flags);
+
+ id = get_free_pipe_id_locked(dev);
+ if (id < 0) {
+ status = id;
+ goto err_id_locked;
+ }
+
+ dev->pipes[id] = pipe;
+ pipe->id = id;
+ pipe->command_buffer->id = id;
+
+ /* Now tell the emulator we're opening a new pipe. */
+ dev->buffers->open_command_params.rw_params_max_count =
+ MAX_BUFFERS_PER_COMMAND;
+ dev->buffers->open_command_params.command_buffer_ptr =
+ (u64)(unsigned long)__pa(pipe->command_buffer);
+ status = goldfish_cmd_locked(pipe, PIPE_CMD_OPEN);
+ spin_unlock_irqrestore(&dev->lock, flags);
+ if (status < 0)
+ goto err_cmd;
+ /* All is done, save the pipe into the file's private data field */
+ file->private_data = pipe;
+ return 0;
+
+err_cmd:
+ /* Re-take the lock to release the id; err_id_locked drops it again */
+ spin_lock_irqsave(&dev->lock, flags);
+ dev->pipes[id] = NULL;
+err_id_locked:
+ spin_unlock_irqrestore(&dev->lock, flags);
+ free_page((unsigned long)pipe->command_buffer);
+err_pipe:
+ kfree(pipe);
+ return status;
+}
+
+/*
+ * file_operations::release - close the pipe on the host, remove it from
+ * the device's tables and free it. Always returns 0.
+ */
+static int goldfish_pipe_release(struct inode *inode, struct file *filp)
+{
+	struct goldfish_pipe *pipe = filp->private_data;
+	struct goldfish_pipe_dev *dev = pipe->dev;
+	unsigned long irq_flags;
+
+	/* The guest is closing the channel, so tell the emulator right now */
+	(void)goldfish_cmd(pipe, PIPE_CMD_CLOSE);
+
+	/* Free the id and make sure the pipe is off the signalled list */
+	spin_lock_irqsave(&dev->lock, irq_flags);
+	dev->pipes[pipe->id] = NULL;
+	signalled_pipes_remove_locked(dev, pipe);
+	spin_unlock_irqrestore(&dev->lock, irq_flags);
+
+	filp->private_data = NULL;
+
+	free_page((unsigned long)pipe->command_buffer);
+	kfree(pipe);
+
+	return 0;
+}
+
+/* File operations of the pipe character device registered below */
+static const struct file_operations goldfish_pipe_fops = {
+ .owner = THIS_MODULE,
+ .read = goldfish_pipe_read,
+ .write = goldfish_pipe_write,
+ .poll = goldfish_pipe_poll,
+ .open = goldfish_pipe_open,
+ .release = goldfish_pipe_release,
+};
+
+/*
+ * The misc device exposing the pipes to user space; registered in
+ * goldfish_pipe_device_init_v2() with a dynamically assigned minor.
+ * Note: this variable shares its name with struct goldfish_pipe_dev
+ * but is a struct miscdevice.
+ */
+static struct miscdevice goldfish_pipe_dev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "goldfish_pipe",
+ .fops = &goldfish_pipe_fops,
+};
+
+/*
+ * Initialize the v2 version of the pipe device.
+ *
+ * Requests the (device-managed) IRQ, registers the misc device, allocates
+ * the pipes table and the page of buffers shared with the host, and tells
+ * the host where those buffers are.
+ *
+ * Returns 0 on success or a negative errno. On failure everything set up
+ * here is undone, including the misc device registration.
+ */
+static int goldfish_pipe_device_init_v2(struct platform_device *pdev)
+{
+	char *page;
+	struct goldfish_pipe_dev *dev = pipe_dev;
+	int err = devm_request_irq(&pdev->dev, dev->irq, goldfish_pipe_interrupt,
+				IRQF_SHARED, "goldfish_pipe", dev);
+	if (err) {
+		dev_err(&pdev->dev, "unable to allocate IRQ for v2\n");
+		return err;
+	}
+
+	err = misc_register(&goldfish_pipe_dev);
+	if (err) {
+		dev_err(&pdev->dev, "unable to register v2 device\n");
+		return err;
+	}
+
+	dev->first_signalled_pipe = NULL;
+	dev->pipes_capacity = INITIAL_PIPES_CAPACITY;
+	dev->pipes = kcalloc(dev->pipes_capacity, sizeof(*dev->pipes), GFP_KERNEL);
+	if (!dev->pipes) {
+		err = -ENOMEM;
+		goto err_deregister;
+	}
+
+	/*
+	 * We're going to pass two buffers, open_command_params and
+	 * signalled_pipe_buffers, to the host. This means each of those buffers
+	 * needs to be contained in a single physical page. The easiest choice is
+	 * to just allocate a page and place the buffers in it.
+	 */
+	BUG_ON(sizeof(*dev->buffers) > PAGE_SIZE);
+	page = (char *)__get_free_page(GFP_KERNEL);
+	if (!page) {
+		err = -ENOMEM;
+		goto err_free_pipes;
+	}
+	dev->buffers = (struct goldfish_pipe_dev_buffers *)page;
+
+	/* Send the buffer addresses to the host */
+	{
+		u64 paddr = __pa(&dev->buffers->signalled_pipe_buffers);
+		writel((u32)(unsigned long)(paddr >> 32), dev->base + PIPE_REG_SIGNAL_BUFFER_HIGH);
+		writel((u32)(unsigned long)paddr, dev->base + PIPE_REG_SIGNAL_BUFFER);
+		writel((u32)MAX_SIGNALLED_PIPES, dev->base + PIPE_REG_SIGNAL_BUFFER_COUNT);
+
+		paddr = __pa(&dev->buffers->open_command_params);
+		writel((u32)(unsigned long)(paddr >> 32), dev->base + PIPE_REG_OPEN_BUFFER_HIGH);
+		writel((u32)(unsigned long)paddr, dev->base + PIPE_REG_OPEN_BUFFER);
+	}
+	return 0;
+
+err_free_pipes:
+	kfree(dev->pipes);
+	dev->pipes = NULL;
+err_deregister:
+	/* Don't leave a device node behind for a driver that failed to probe */
+	misc_deregister(&goldfish_pipe_dev);
+	return err;
+}
+
+/*
+ * Tear down the v2 device: unregister the misc device, then free the
+ * pipes table and the page of buffers shared with the host.
+ */
+static void goldfish_pipe_device_deinit_v2(struct platform_device *pdev)
+{
+	struct goldfish_pipe_dev *dev = pipe_dev;
+	unsigned long buffers_page = (unsigned long)dev->buffers;
+
+	misc_deregister(&goldfish_pipe_dev);
+	kfree(dev->pipes);
+	free_page(buffers_page);
+}
+
+/*
+ * Platform driver probe: map the device's register page, look up its IRQ,
+ * exchange versions with the host and hand over to the v1 or v2
+ * initialization depending on the device version.
+ *
+ * Returns 0 on success or a negative errno; dev->base is reset to NULL
+ * when the IRQ lookup or the version-specific init fails.
+ */
+static int goldfish_pipe_probe(struct platform_device *pdev)
+{
+ int err;
+ struct resource *r;
+ struct goldfish_pipe_dev *dev = pipe_dev;
+
+ /* Each pipe's command buffer is allocated as exactly one page */
+ BUG_ON(sizeof(struct goldfish_pipe_command) > PAGE_SIZE);
+
+ /* not thread safe, but this should not happen */
+ WARN_ON(dev->base != NULL);
+
+ spin_lock_init(&dev->lock);
+
+ r = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ if (r == NULL || resource_size(r) < PAGE_SIZE) {
+ dev_err(&pdev->dev, "can't allocate i/o page\n");
+ return -EINVAL;
+ }
+ dev->base = devm_ioremap(&pdev->dev, r->start, PAGE_SIZE);
+ if (dev->base == NULL) {
+ dev_err(&pdev->dev, "ioremap failed\n");
+ return -EINVAL;
+ }
+
+ r = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
+ if (r == NULL) {
+ err = -EINVAL;
+ goto error;
+ }
+ dev->irq = r->start;
+
+ /*
+ * Exchange the versions with the host device
+ *
+ * Note: v1 driver used to not report its version, so we write it before
+ * reading device version back: this allows the host implementation to
+ * detect the old driver (if there was no version write before read).
+ */
+ writel((u32)PIPE_DRIVER_VERSION, dev->base + PIPE_REG_VERSION);
+ dev->version = readl(dev->base + PIPE_REG_VERSION);
+ if (dev->version < PIPE_CURRENT_DEVICE_VERSION) {
+ /* initialize the old device version */
+ err = goldfish_pipe_device_init_v1(pdev);
+ } else {
+ /* Host device supports the new interface */
+ err = goldfish_pipe_device_init_v2(pdev);
+ }
+ if (!err)
+ return 0;
+
+error:
+ /* Mark the device as not probed; the mapping itself is device-managed */
+ dev->base = NULL;
+ return err;
+}
+
+/*
+ * Platform driver remove: run the deinit matching the device version that
+ * probe detected and mark the device as unprobed. Always returns 0.
+ */
+static int goldfish_pipe_remove(struct platform_device *pdev)
+{
+	struct goldfish_pipe_dev *dev = pipe_dev;
+	const int is_legacy = dev->version < PIPE_CURRENT_DEVICE_VERSION;
+
+	if (is_legacy) {
+		goldfish_pipe_device_deinit_v1(pdev);
+	} else {
+		goldfish_pipe_device_deinit_v2(pdev);
+	}
+
+	dev->base = NULL;
+	return 0;
+}
+
+/* ACPI ids this driver binds to; the empty entry terminates the table */
+static const struct acpi_device_id goldfish_pipe_acpi_match[] = {
+ { "GFSH0003", 0 },
+ { },
+};
+MODULE_DEVICE_TABLE(acpi, goldfish_pipe_acpi_match);
+
+/* Device-tree compatibles this driver binds to; the empty entry terminates the table */
+static const struct of_device_id goldfish_pipe_of_match[] = {
+ { .compatible = "google,android-pipe", },
+ {},
+};
+MODULE_DEVICE_TABLE(of, goldfish_pipe_of_match);
+
+/* Platform driver glue: probe/remove callbacks plus the OF and ACPI match tables */
+static struct platform_driver goldfish_pipe_driver = {
+ .probe = goldfish_pipe_probe,
+ .remove = goldfish_pipe_remove,
+ .driver = {
+ .name = "goldfish_pipe",
+ .of_match_table = goldfish_pipe_of_match,
+ .acpi_match_table = ACPI_PTR(goldfish_pipe_acpi_match),
+ }
+};
+
+module_platform_driver(goldfish_pipe_driver);
+MODULE_AUTHOR("David Turner <digit@google.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/power/supply/power_supply_sysfs.c b/drivers/power/supply/power_supply_sysfs.c
index bcde8d13476a..fdb824fdf6c1 100644
--- a/drivers/power/supply/power_supply_sysfs.c
+++ b/drivers/power/supply/power_supply_sysfs.c
@@ -107,7 +107,10 @@ static ssize_t power_supply_show_property(struct device *dev,
else if (off >= POWER_SUPPLY_PROP_MODEL_NAME)
return sprintf(buf, "%s\n", value.strval);
- return sprintf(buf, "%d\n", value.intval);
+ if (off == POWER_SUPPLY_PROP_CHARGE_COUNTER_EXT)
+ return sprintf(buf, "%lld\n", value.int64val);
+ else
+ return sprintf(buf, "%d\n", value.intval);
}
static ssize_t power_supply_store_property(struct device *dev,
@@ -198,6 +201,12 @@ static struct device_attribute power_supply_attrs[] = {
POWER_SUPPLY_ATTR(scope),
POWER_SUPPLY_ATTR(charge_term_current),
POWER_SUPPLY_ATTR(calibrate),
+ /* Local extensions */
+ POWER_SUPPLY_ATTR(usb_hc),
+ POWER_SUPPLY_ATTR(usb_otg),
+ POWER_SUPPLY_ATTR(charge_enabled),
+ /* Local extensions of type int64_t */
+ POWER_SUPPLY_ATTR(charge_counter_ext),
/* Properties of type `const char *' */
POWER_SUPPLY_ATTR(model_name),
POWER_SUPPLY_ATTR(manufacturer),
diff --git a/drivers/rtc/rtc-palmas.c b/drivers/rtc/rtc-palmas.c
index 4bcfb88674d3..34aea38ebfa6 100644
--- a/drivers/rtc/rtc-palmas.c
+++ b/drivers/rtc/rtc-palmas.c
@@ -45,6 +45,42 @@ struct palmas_rtc {
/* Total number of RTC registers needed to set time*/
#define PALMAS_NUM_TIME_REGS (PALMAS_YEARS_REG - PALMAS_SECONDS_REG + 1)
+/*
+ * Encode tm_year (years since 1900) into the RTC's year register.
+ *
+ * The register holds BCD, with the top bit used here to reach years
+ * outside 2000-2099:
+ *
+ *   0-69                  -> 0xD0 (clamped)
+ *   70-99   (1970 - 1999) -> 0xD0 - 0xF9 (correctly rolls to 0x00)
+ *   100-199 (2000 - 2099) -> 0x00 - 0x99 (does not roll to 0xA0 :-( )
+ *   200-229 (2100 - 2129) -> 0xA0 - 0xC9 (really for completeness)
+ *   230-                  -> 0xC9 (clamped)
+ *
+ * Confirmed: the only transition this rtc clock gets wrong is 2099 to
+ * 2100, which proceeds to 2000 instead. That is accepted, since the clock
+ * retains and transitions the year correctly in all other conditions.
+ */
+static unsigned char year_bin2bcd(int val)
+{
+	if (val >= 230)
+		return 0xC9;
+	if (val >= 200)
+		return bin2bcd(val - 180) | 0x80;
+	if (val >= 100)
+		return bin2bcd(val - 100);
+	if (val >= 70)
+		return bin2bcd(val - 20) | 0x80; /* KISS leverage of bin2bcd */
+	return 0xD0;
+}
+
+/*
+ * Decode the RTC's year register back into tm_year (years since 1900);
+ * the inverse of year_bin2bcd() for the values that function produces.
+ */
+static int year_bcd2bin(unsigned char val)
+{
+	if (val < 0xA0)
+		return bcd2bin(val) + 100;	/* 2000 - 2099 */
+	if (val < 0xD0)
+		return bcd2bin(val & 0x7F) + 180;	/* 2100 - 2129 */
+	return bcd2bin(val & 0x7F) + 20;	/* 1970 - 1999 */
+}
+
static int palmas_rtc_read_time(struct device *dev, struct rtc_time *tm)
{
unsigned char rtc_data[PALMAS_NUM_TIME_REGS];
@@ -71,7 +107,7 @@ static int palmas_rtc_read_time(struct device *dev, struct rtc_time *tm)
tm->tm_hour = bcd2bin(rtc_data[2]);
tm->tm_mday = bcd2bin(rtc_data[3]);
tm->tm_mon = bcd2bin(rtc_data[4]) - 1;
- tm->tm_year = bcd2bin(rtc_data[5]) + 100;
+ tm->tm_year = year_bcd2bin(rtc_data[5]);
return ret;
}
@@ -87,7 +123,7 @@ static int palmas_rtc_set_time(struct device *dev, struct rtc_time *tm)
rtc_data[2] = bin2bcd(tm->tm_hour);
rtc_data[3] = bin2bcd(tm->tm_mday);
rtc_data[4] = bin2bcd(tm->tm_mon + 1);
- rtc_data[5] = bin2bcd(tm->tm_year - 100);
+ rtc_data[5] = year_bin2bcd(tm->tm_year);
/* Stop RTC while updating the RTC time registers */
ret = palmas_update_bits(palmas, PALMAS_RTC_BASE, PALMAS_RTC_CTRL_REG,
@@ -142,7 +178,7 @@ static int palmas_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alm)
alm->time.tm_hour = bcd2bin(alarm_data[2]);
alm->time.tm_mday = bcd2bin(alarm_data[3]);
alm->time.tm_mon = bcd2bin(alarm_data[4]) - 1;
- alm->time.tm_year = bcd2bin(alarm_data[5]) + 100;
+ alm->time.tm_year = year_bcd2bin(alarm_data[5]);
ret = palmas_read(palmas, PALMAS_RTC_BASE, PALMAS_RTC_INTERRUPTS_REG,
&int_val);
@@ -173,7 +209,7 @@ static int palmas_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alm)
alarm_data[2] = bin2bcd(alm->time.tm_hour);
alarm_data[3] = bin2bcd(alm->time.tm_mday);
alarm_data[4] = bin2bcd(alm->time.tm_mon + 1);
- alarm_data[5] = bin2bcd(alm->time.tm_year - 100);
+ alarm_data[5] = year_bin2bcd(alm->time.tm_year);
ret = palmas_bulk_write(palmas, PALMAS_RTC_BASE,
PALMAS_ALARM_SECONDS_REG, alarm_data, PALMAS_NUM_TIME_REGS);
diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index 5cfd56f08ffb..526c271dc790 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -41,6 +41,7 @@
#include <linux/devfreq.h>
#include <linux/nls.h>
#include <linux/of.h>
+#include <linux/blkdev.h>
#include "ufshcd.h"
#include "ufs_quirks.h"
#include "unipro.h"
@@ -1488,6 +1489,17 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
clear_bit_unlock(tag, &hba->lrb_in_use);
goto out;
}
+
+ /* IO svc time latency histogram */
+ if (hba != NULL && cmd->request != NULL) {
+ if (hba->latency_hist_enabled &&
+ (cmd->request->cmd_type == REQ_TYPE_FS)) {
+ cmd->request->lat_hist_io_start = ktime_get();
+ cmd->request->lat_hist_enabled = 1;
+ } else
+ cmd->request->lat_hist_enabled = 0;
+ }
+
WARN_ON(hba->clk_gating.state != CLKS_ON);
lrbp = &hba->lrb[tag];
@@ -3696,6 +3708,7 @@ static void __ufshcd_transfer_req_compl(struct ufs_hba *hba,
struct scsi_cmnd *cmd;
int result;
int index;
+ struct request *req;
for_each_set_bit(index, &completed_reqs, hba->nutrs) {
lrbp = &hba->lrb[index];
@@ -3707,6 +3720,22 @@ static void __ufshcd_transfer_req_compl(struct ufs_hba *hba,
/* Mark completed command as NULL in LRB */
lrbp->cmd = NULL;
clear_bit_unlock(index, &hba->lrb_in_use);
+ req = cmd->request;
+ if (req) {
+ /* Update IO svc time latency histogram */
+ if (req->lat_hist_enabled) {
+ ktime_t completion;
+ u_int64_t delta_us;
+
+ completion = ktime_get();
+ delta_us = ktime_us_delta(completion,
+ req->lat_hist_io_start);
+ /* rq_data_dir() => true if WRITE */
+ blk_update_latency_hist(&hba->io_lat_s,
+ (rq_data_dir(req) == READ),
+ delta_us);
+ }
+ }
/* Do not touch lrbp after scsi done */
cmd->scsi_done(cmd);
__ufshcd_release(hba);
@@ -6495,6 +6524,54 @@ out:
}
EXPORT_SYMBOL(ufshcd_shutdown);
+/*
+ * sysfs store handler for the "latency_hist" attribute.
+ *
+ * Values permitted 0, 1, 2.
+ * 0 -> Disable IO latency histograms (default)
+ * 1 -> Enable IO latency histograms
+ * 2 -> Zero out IO latency histograms
+ *
+ * Any other integer is consumed without effect; input that does not parse
+ * as an integer yields -EINVAL.
+ */
+static ssize_t
+latency_hist_store(struct device *dev, struct device_attribute *attr,
+		   const char *buf, size_t count)
+{
+	struct ufs_hba *hba = dev_get_drvdata(dev);
+	long req;
+	int err;
+
+	err = kstrtol(buf, 0, &req);
+	if (err)
+		return -EINVAL;
+
+	if (req == BLK_IO_LAT_HIST_ZERO) {
+		blk_zero_latency_hist(&hba->io_lat_s);
+		return count;
+	}
+
+	if (req == BLK_IO_LAT_HIST_DISABLE || req == BLK_IO_LAT_HIST_ENABLE)
+		hba->latency_hist_enabled = req;
+
+	return count;
+}
+
+/*
+ * sysfs show handler for the "latency_hist" attribute: formats the host's
+ * IO latency histogram into @buf via blk_latency_hist_show().
+ *
+ * Kept static like latency_hist_store(): it is only referenced through
+ * DEVICE_ATTR() in this file, and its generic name should not leak into the
+ * global symbol namespace.
+ */
+static ssize_t
+latency_hist_show(struct device *dev, struct device_attribute *attr,
+		  char *buf)
+{
+	struct ufs_hba *hba = dev_get_drvdata(dev);
+
+	return blk_latency_hist_show(&hba->io_lat_s, buf);
+}
+
+/* "latency_hist" device attribute: readable by everyone, writable by owner. */
+static DEVICE_ATTR(latency_hist, S_IRUGO | S_IWUSR,
+ latency_hist_show, latency_hist_store);
+
+/* Create the latency_hist sysfs file on the host device; log on failure. */
+static void
+ufshcd_init_latency_hist(struct ufs_hba *hba)
+{
+	int err = device_create_file(hba->dev, &dev_attr_latency_hist);
+
+	if (err)
+		dev_err(hba->dev, "Failed to create latency_hist sysfs entry\n");
+}
+
+/*
+ * Tear down what ufshcd_init_latency_hist() set up.
+ *
+ * This must remove the sysfs file; the previous code called
+ * device_create_file() again, which left the attribute behind on removal
+ * and on the ufshcd_init() error path.
+ */
+static void
+ufshcd_exit_latency_hist(struct ufs_hba *hba)
+{
+	device_remove_file(hba->dev, &dev_attr_latency_hist);
+}
+
/**
* ufshcd_remove - de-allocate SCSI host and host memory space
* data structure memory
@@ -6508,6 +6585,7 @@ void ufshcd_remove(struct ufs_hba *hba)
ufshcd_hba_stop(hba, true);
ufshcd_exit_clk_gating(hba);
+ ufshcd_exit_latency_hist(hba);
if (ufshcd_is_clkscaling_enabled(hba))
devfreq_remove_device(hba->devfreq);
ufshcd_hba_exit(hba);
@@ -6856,6 +6934,8 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq)
/* Hold auto suspend until async scan completes */
pm_runtime_get_sync(dev);
+ ufshcd_init_latency_hist(hba);
+
/*
* We are assuming that device wasn't put in sleep/power-down
* state exclusively during the boot stage before kernel.
@@ -6872,6 +6952,7 @@ out_remove_scsi_host:
scsi_remove_host(hba->host);
exit_gating:
ufshcd_exit_clk_gating(hba);
+ ufshcd_exit_latency_hist(hba);
out_disable:
hba->is_irq_enabled = false;
ufshcd_hba_exit(hba);
diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
index 6dbd2e176333..845812d01fd3 100644
--- a/drivers/scsi/ufs/ufshcd.h
+++ b/drivers/scsi/ufs/ufshcd.h
@@ -575,6 +575,9 @@ struct ufs_hba {
bool is_urgent_bkops_lvl_checked;
struct ufs_desc_size desc_size;
+
+ int latency_hist_enabled;
+ struct io_latency_state io_lat_s;
};
/* Returns true if clocks can be gated. Otherwise false */
diff --git a/drivers/staging/android/Kconfig b/drivers/staging/android/Kconfig
index 6c00d6f765c6..a17c483f906e 100644
--- a/drivers/staging/android/Kconfig
+++ b/drivers/staging/android/Kconfig
@@ -24,8 +24,28 @@ config ANDROID_LOW_MEMORY_KILLER
scripts (/init.rc), and it defines priority values with minimum free memory size
for each priority.
+config ANDROID_LOW_MEMORY_KILLER_AUTODETECT_OOM_ADJ_VALUES
+ bool "Android Low Memory Killer: detect oom_adj values"
+ depends on ANDROID_LOW_MEMORY_KILLER
+ default y
+ ---help---
+ Detect oom_adj values written to
+ /sys/module/lowmemorykiller/parameters/adj and convert them
+ to oom_score_adj values.
+
+config ANDROID_VSOC
+ tristate "Android Virtual SoC support"
+ default n
+ depends on PCI_MSI
+ ---help---
+ This option adds support for the Virtual SoC driver needed to boot
+ a 'cuttlefish' Android image inside QEmu. The driver interacts with
+ a QEmu ivshmem device. If built as a module, it will be called vsoc.
+
source "drivers/staging/android/ion/Kconfig"
+source "drivers/staging/android/fiq_debugger/Kconfig"
+
endif # if ANDROID
endmenu
diff --git a/drivers/staging/android/Makefile b/drivers/staging/android/Makefile
index 7ed1be798909..93c5f5a7390a 100644
--- a/drivers/staging/android/Makefile
+++ b/drivers/staging/android/Makefile
@@ -1,6 +1,8 @@
ccflags-y += -I$(src) # needed for trace events
obj-y += ion/
+obj-$(CONFIG_FIQ_DEBUGGER) += fiq_debugger/
obj-$(CONFIG_ASHMEM) += ashmem.o
obj-$(CONFIG_ANDROID_LOW_MEMORY_KILLER) += lowmemorykiller.o
+obj-$(CONFIG_ANDROID_VSOC) += vsoc.o
diff --git a/drivers/staging/android/TODO b/drivers/staging/android/TODO
index 64d8c8720960..edfb6809eb9e 100644
--- a/drivers/staging/android/TODO
+++ b/drivers/staging/android/TODO
@@ -33,5 +33,14 @@ sync framework:
- clean up and ABI check for security issues
- move it to drivers/base/dma-buf
+vsoc.c, uapi/vsoc_shm.h
+ - The current driver uses the same wait queue for all of the futexes in a
+ region. This will cause false wakeups in regions with a large number of
+ waiting threads. We should eventually use multiple queues and select the
+ queue based on the region.
+ - Add debugfs support for examining the permissions of regions.
+ - Remove VSOC_WAIT_FOR_INCOMING_INTERRUPT ioctl. This functionality has been
+ superseded by the futex and is there for legacy reasons.
+
Please send patches to Greg Kroah-Hartman <greg@kroah.com> and Cc:
Arve Hjønnevåg <arve@android.com> and Riley Andrews <riandrews@android.com>
diff --git a/drivers/staging/android/ashmem.c b/drivers/staging/android/ashmem.c
index c6314d1552ea..5af176b707a3 100644
--- a/drivers/staging/android/ashmem.c
+++ b/drivers/staging/android/ashmem.c
@@ -415,22 +415,14 @@ static int ashmem_mmap(struct file *file, struct vm_area_struct *vma)
}
get_file(asma->file);
- /*
- * XXX - Reworked to use shmem_zero_setup() instead of
- * shmem_set_file while we're in staging. -jstultz
- */
- if (vma->vm_flags & VM_SHARED) {
- ret = shmem_zero_setup(vma);
- if (ret) {
- fput(asma->file);
- goto out;
- }
+ if (vma->vm_flags & VM_SHARED)
+ shmem_set_file(vma, asma->file);
+ else {
+ if (vma->vm_file)
+ fput(vma->vm_file);
+ vma->vm_file = asma->file;
}
- if (vma->vm_file)
- fput(vma->vm_file);
- vma->vm_file = asma->file;
-
out:
mutex_unlock(&ashmem_mutex);
return ret;
@@ -467,9 +459,9 @@ ashmem_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
loff_t start = range->pgstart * PAGE_SIZE;
loff_t end = (range->pgend + 1) * PAGE_SIZE;
- vfs_fallocate(range->asma->file,
- FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
- start, end - start);
+ range->asma->file->f_op->fallocate(range->asma->file,
+ FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+ start, end - start);
range->purged = ASHMEM_WAS_PURGED;
lru_del(range);
diff --git a/drivers/staging/android/fiq_debugger/Kconfig b/drivers/staging/android/fiq_debugger/Kconfig
new file mode 100644
index 000000000000..60fc224d4efc
--- /dev/null
+++ b/drivers/staging/android/fiq_debugger/Kconfig
@@ -0,0 +1,58 @@
+config FIQ_DEBUGGER
+ bool "FIQ Mode Serial Debugger"
+ default n
+ depends on ARM || ARM64
+ help
+ The FIQ serial debugger can accept commands even when the
+ kernel is unresponsive due to being stuck with interrupts
+ disabled.
+
+config FIQ_DEBUGGER_NO_SLEEP
+ bool "Keep serial debugger active"
+ depends on FIQ_DEBUGGER
+ default n
+ help
+ Enables the serial debugger at boot. Passing
+ fiq_debugger.no_sleep on the kernel commandline will
+ override this config option.
+
+config FIQ_DEBUGGER_WAKEUP_IRQ_ALWAYS_ON
+ bool "Don't disable wakeup IRQ when debugger is active"
+ depends on FIQ_DEBUGGER
+ default n
+ help
+ Don't disable the wakeup irq when enabling the uart clock. This will
+ cause extra interrupts, but it makes the serial debugger usable
+ on some MSM radio builds that ignore the uart clock request in power
+ collapse.
+
+config FIQ_DEBUGGER_CONSOLE
+ bool "Console on FIQ Serial Debugger port"
+ depends on FIQ_DEBUGGER
+ default n
+ help
+ Enables a console so that printk messages are displayed on
+ the debugger serial port as they occur.
+
+config FIQ_DEBUGGER_CONSOLE_DEFAULT_ENABLE
+ bool "Put the FIQ debugger into console mode by default"
+ depends on FIQ_DEBUGGER_CONSOLE
+ default n
+ help
+ If enabled, this puts the fiq debugger into console mode by default.
+ Otherwise, the fiq debugger will start out in debug mode.
+
+config FIQ_DEBUGGER_UART_OVERLAY
+ bool "Install uart DT overlay"
+ depends on FIQ_DEBUGGER
+ select OF_OVERLAY
+ default n
+ help
+ If enabled, the fiq debugger calls fiq_debugger_uart_overlay(),
+ which applies the overlay uart_overlay@0 to disable the proper uart.
+
+config FIQ_WATCHDOG
+ bool
+ select FIQ_DEBUGGER
+ select PSTORE_RAM
+ default n
diff --git a/drivers/staging/android/fiq_debugger/Makefile b/drivers/staging/android/fiq_debugger/Makefile
new file mode 100644
index 000000000000..a7ca4871cad3
--- /dev/null
+++ b/drivers/staging/android/fiq_debugger/Makefile
@@ -0,0 +1,4 @@
+obj-y += fiq_debugger.o
+obj-$(CONFIG_ARM) += fiq_debugger_arm.o
+obj-$(CONFIG_ARM64) += fiq_debugger_arm64.o
+obj-$(CONFIG_FIQ_WATCHDOG) += fiq_watchdog.o
diff --git a/drivers/staging/android/fiq_debugger/fiq_debugger.c b/drivers/staging/android/fiq_debugger/fiq_debugger.c
new file mode 100644
index 000000000000..675b974b2a6e
--- /dev/null
+++ b/drivers/staging/android/fiq_debugger/fiq_debugger.c
@@ -0,0 +1,1246 @@
+/*
+ * drivers/staging/android/fiq_debugger/fiq_debugger.c
+ *
+ * Serial Debugger Interface accessed through an FIQ interrupt.
+ *
+ * Copyright (C) 2008 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <stdarg.h>
+#include <linux/module.h>
+#include <linux/io.h>
+#include <linux/console.h>
+#include <linux/interrupt.h>
+#include <linux/clk.h>
+#include <linux/platform_device.h>
+#include <linux/kernel_stat.h>
+#include <linux/kmsg_dump.h>
+#include <linux/irq.h>
+#include <linux/delay.h>
+#include <linux/reboot.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/timer.h>
+#include <linux/tty.h>
+#include <linux/tty_flip.h>
+
+#ifdef CONFIG_FIQ_GLUE
+#include <asm/fiq_glue.h>
+#endif
+
+#ifdef CONFIG_FIQ_DEBUGGER_UART_OVERLAY
+#include <linux/of.h>
+#endif
+
+#include <linux/uaccess.h>
+
+#include "fiq_debugger.h"
+#include "fiq_debugger_priv.h"
+#include "fiq_debugger_ringbuf.h"
+
+#define DEBUG_MAX 64
+#define MAX_UNHANDLED_FIQ_COUNT 1000000
+
+#define MAX_FIQ_DEBUGGER_PORTS 4
+
+/*
+ * Per-port debugger state.  The embedded handler, output, work and console
+ * members are used with container_of() to get back to this structure.
+ */
+struct fiq_debugger_state {
+#ifdef CONFIG_FIQ_GLUE
+ struct fiq_glue_handler handler;
+#endif
+ struct fiq_debugger_output output;
+
+ int fiq; /* negative when no FIQ is used, see fiq_debugger_have_fiq() */
+ int uart_irq;
+ int signal_irq; /* raised from the FIQ handler to finish work in IRQ context */
+ int wakeup_irq; /* negative when there is no wakeup irq */
+ bool wakeup_irq_no_set_wake; /* skip enable/disable_irq_wake() on wakeup_irq */
+ struct clk *clk; /* optional, may be NULL */
+ struct fiq_debugger_pdata *pdata;
+ struct platform_device *pdev;
+
+ /* Command handed from fiq_debugger_fiq_exec() to IRQ context. */
+ char debug_cmd[DEBUG_MAX];
+ int debug_busy; /* set while debug_cmd is waiting to be executed */
+ int debug_abort; /* set to -1 when a command arrives while busy */
+
+ /* Line currently being typed and the number of characters in it. */
+ char debug_buf[DEBUG_MAX];
+ int debug_count;
+
+ bool no_sleep;
+ bool debug_enable;
+ bool ignore_next_wakeup_irq;
+ struct timer_list sleep_timer;
+ spinlock_t sleep_timer_lock;
+ bool uart_enabled;
+ struct wakeup_source debugger_wake_src;
+ bool console_enable;
+ int current_cpu; /* CPU whose interrupts are currently processed */
+ atomic_t unhandled_fiq_count; /* interrupts seen on CPUs other than current_cpu */
+ bool in_fiq;
+
+ struct work_struct work;
+ spinlock_t work_lock; /* protects work_cmd */
+ char work_cmd[DEBUG_MAX]; /* empty string when no work is pending */
+
+#ifdef CONFIG_FIQ_DEBUGGER_CONSOLE
+ spinlock_t console_lock;
+ struct console console;
+ struct tty_port tty_port;
+ struct fiq_debugger_ringbuf *tty_rbuf;
+ bool syslog_dumping;
+#endif
+
+ /* kstat_irqs() value of each irq at the previous "irqs" dump. */
+ unsigned int last_irqs[NR_IRQS];
+ unsigned int last_local_timer_irqs[NR_CPUS];
+};
+
+#ifdef CONFIG_FIQ_DEBUGGER_CONSOLE
+struct tty_driver *fiq_tty_driver;
+#endif
+
+/* Default for the no_sleep parameter follows CONFIG_FIQ_DEBUGGER_NO_SLEEP. */
+#ifdef CONFIG_FIQ_DEBUGGER_NO_SLEEP
+static bool initial_no_sleep = true;
+#else
+static bool initial_no_sleep;
+#endif
+
+/*
+ * With CONFIG_FIQ_DEBUGGER_CONSOLE_DEFAULT_ENABLE both the debugger and its
+ * console mode default to enabled; otherwise both default to false.
+ */
+#ifdef CONFIG_FIQ_DEBUGGER_CONSOLE_DEFAULT_ENABLE
+static bool initial_debug_enable = true;
+static bool initial_console_enable = true;
+#else
+static bool initial_debug_enable;
+static bool initial_console_enable;
+#endif
+
+static bool fiq_kgdb_enable;
+static bool fiq_debugger_disable;
+
+/* Module parameters (mode 0644): no_sleep, debug_enable, console_enable, kgdb_enable, disable. */
+module_param_named(no_sleep, initial_no_sleep, bool, 0644);
+module_param_named(debug_enable, initial_debug_enable, bool, 0644);
+module_param_named(console_enable, initial_console_enable, bool, 0644);
+module_param_named(kgdb_enable, fiq_kgdb_enable, bool, 0644);
+module_param_named(disable, fiq_debugger_disable, bool, 0644);
+
+#ifdef CONFIG_FIQ_DEBUGGER_WAKEUP_IRQ_ALWAYS_ON
+/* The wakeup irq is never toggled in this configuration: both are no-ops. */
+static inline
+void fiq_debugger_enable_wakeup_irq(struct fiq_debugger_state *state) {}
+static inline
+void fiq_debugger_disable_wakeup_irq(struct fiq_debugger_state *state) {}
+#else
+/* Enable the wakeup irq, and its wake capability unless told not to. */
+static inline
+void fiq_debugger_enable_wakeup_irq(struct fiq_debugger_state *state)
+{
+	if (state->wakeup_irq >= 0) {
+		enable_irq(state->wakeup_irq);
+		if (!state->wakeup_irq_no_set_wake)
+			enable_irq_wake(state->wakeup_irq);
+	}
+}
+
+/* Disable the wakeup irq (without waiting for running handlers). */
+static inline
+void fiq_debugger_disable_wakeup_irq(struct fiq_debugger_state *state)
+{
+	if (state->wakeup_irq >= 0) {
+		disable_irq_nosync(state->wakeup_irq);
+		if (!state->wakeup_irq_no_set_wake)
+			disable_irq_wake(state->wakeup_irq);
+	}
+}
+#endif
+
+/* Returns true when a FIQ number was configured (state->fiq is not negative). */
+static inline bool fiq_debugger_have_fiq(struct fiq_debugger_state *state)
+{
+ return (state->fiq >= 0);
+}
+
+#ifdef CONFIG_FIQ_GLUE
+/*
+ * Trigger the signal irq.  The platform's force_irq hook is preferred; when
+ * it is absent the irq chip's retrigger callback is used, if it has one.
+ * Warns and does nothing when no FIQ is configured.
+ */
+static void fiq_debugger_force_irq(struct fiq_debugger_state *state)
+{
+	unsigned int irq = state->signal_irq;
+	struct irq_chip *chip;
+
+	if (WARN_ON(!fiq_debugger_have_fiq(state)))
+		return;
+
+	if (state->pdata->force_irq) {
+		state->pdata->force_irq(state->pdev, irq);
+		return;
+	}
+
+	chip = irq_get_chip(irq);
+	if (chip && chip->irq_retrigger)
+		chip->irq_retrigger(irq_get_irq_data(irq));
+}
+#endif
+
+/* Power up the UART: clock first (when present), then the platform hook. */
+static void fiq_debugger_uart_enable(struct fiq_debugger_state *state)
+{
+	struct fiq_debugger_pdata *pdata = state->pdata;
+
+	if (state->clk)
+		clk_enable(state->clk);
+	if (pdata->uart_enable)
+		pdata->uart_enable(state->pdev);
+}
+
+/* Reverse of fiq_debugger_uart_enable(): platform hook first, then clock. */
+static void fiq_debugger_uart_disable(struct fiq_debugger_state *state)
+{
+	struct fiq_debugger_pdata *pdata = state->pdata;
+
+	if (pdata->uart_disable)
+		pdata->uart_disable(state->pdev);
+	if (state->clk)
+		clk_disable(state->clk);
+}
+
+/* Invoke the platform's optional uart_flush hook. */
+static void fiq_debugger_uart_flush(struct fiq_debugger_state *state)
+{
+	struct fiq_debugger_pdata *pdata = state->pdata;
+
+	if (!pdata->uart_flush)
+		return;
+	pdata->uart_flush(state->pdev);
+}
+
+/* Emit one character through the platform's uart_putc hook (called unconditionally). */
+static void fiq_debugger_putc(struct fiq_debugger_state *state, char c)
+{
+ state->pdata->uart_putc(state->pdev, c);
+}
+
+/* Write a NUL-terminated string, expanding each '\n' to "\r\n". */
+static void fiq_debugger_puts(struct fiq_debugger_state *state, char *s)
+{
+	unsigned ch;
+
+	for (ch = *s++; ch; ch = *s++) {
+		if (ch == '\n')
+			fiq_debugger_putc(state, '\r');
+		fiq_debugger_putc(state, ch);
+	}
+}
+
+/* Print the interactive "debug> " prompt. */
+static void fiq_debugger_prompt(struct fiq_debugger_state *state)
+{
+ fiq_debugger_puts(state, "debug> ");
+}
+
+/*
+ * Rewind a local kmsg dumper and print every line it returns to the
+ * debugger UART.
+ */
+static void fiq_debugger_dump_kernel_log(struct fiq_debugger_state *state)
+{
+	struct kmsg_dumper dumper = { .active = true };
+	char line[512];
+	size_t n;
+
+	kmsg_dump_rewind_nolock(&dumper);
+	for (;;) {
+		/* Leave one byte spare for the terminator added below. */
+		if (!kmsg_dump_get_line_nolock(&dumper, true, line,
+					       sizeof(line) - 1, &n))
+			break;
+		line[n] = 0;
+		fiq_debugger_puts(state, line);
+	}
+}
+
+/*
+ * printf-style output to the debugger UART.  The formatted text is limited
+ * to the size of the on-stack buffer; longer output is truncated.
+ */
+static void fiq_debugger_printf(struct fiq_debugger_output *output,
+			       const char *fmt, ...)
+{
+	struct fiq_debugger_state *state =
+		container_of(output, struct fiq_debugger_state, output);
+	char text[256];
+	va_list args;
+
+	va_start(args, fmt);
+	vsnprintf(text, sizeof(text), fmt, args);
+	va_end(args);
+
+	fiq_debugger_puts(state, text);
+}
+
+/*
+ * Safe outside fiq context.
+ *
+ * Formats the message, then prints and flushes it with local interrupts
+ * disabled.  @cookie is the struct fiq_debugger_state.  Returns the current
+ * value of state->debug_abort.
+ */
+static int fiq_debugger_printf_nfiq(void *cookie, const char *fmt, ...)
+{
+	struct fiq_debugger_state *state = cookie;
+	char buf[256];
+	va_list ap;
+	unsigned long irq_flags;
+
+	va_start(ap, fmt);
+	/*
+	 * Bound by the real buffer size; the old hard-coded 128 truncated
+	 * messages at half the buffer, unlike fiq_debugger_printf().
+	 */
+	vsnprintf(buf, sizeof(buf), fmt, ap);
+	va_end(ap);
+
+	local_irq_save(irq_flags);
+	fiq_debugger_puts(state, buf);
+	fiq_debugger_uart_flush(state);
+	local_irq_restore(irq_flags);
+	return state->debug_abort;
+}
+
+/*
+ * Print one line per interrupt that has an action or a non-zero count:
+ * irq number, total count, count since the previous dump, status and name.
+ *
+ * state->last_irqs[] only has NR_IRQS entries, while for_each_irq_desc()
+ * may yield irq numbers beyond that (sparse irqs).  Such irqs are still
+ * listed, but no snapshot is kept for them, so their "since-last" column
+ * shows the total.  The old code indexed the array unconditionally.
+ */
+static void fiq_debugger_dump_irqs(struct fiq_debugger_state *state)
+{
+	int n;
+	struct irq_desc *desc;
+
+	fiq_debugger_printf(&state->output,
+			"irqnr total since-last status name\n");
+	for_each_irq_desc(n, desc) {
+		struct irqaction *act = desc->action;
+		unsigned int total = kstat_irqs(n);
+		bool tracked = (unsigned int)n < ARRAY_SIZE(state->last_irqs);
+		unsigned int last = tracked ? state->last_irqs[n] : 0;
+
+		if (!act && !total)
+			continue;
+		fiq_debugger_printf(&state->output, "%5d: %10u %11u %8x %s\n", n,
+			total,
+			total - last,
+			desc->status_use_accessors,
+			(act && act->name) ? act->name : "???");
+		if (tracked)
+			state->last_irqs[n] = total;
+	}
+}
+
+/*
+ * Print a process list under tasklist_lock: pid, parent pid, priority,
+ * command name, a one-letter state code and either "running" or the saved pc.
+ *
+ * The state letter is picked from stat_nam by the lowest set bit of
+ * p->state (0 when no bit is set); states without a letter print '?'.
+ */
+static void fiq_debugger_do_ps(struct fiq_debugger_state *state)
+{
+	struct task_struct *g;
+	struct task_struct *p;
+	unsigned task_state;
+	static const char stat_nam[] = "RSDTtZX";
+
+	fiq_debugger_printf(&state->output, "pid ppid prio task pc\n");
+	read_lock(&tasklist_lock);
+	do_each_thread(g, p) {
+		task_state = p->state ? __ffs(p->state) + 1 : 0;
+		fiq_debugger_printf(&state->output,
+			     "%5d %5d %4d ", p->pid, p->parent->pid, p->prio);
+		/*
+		 * sizeof(stat_nam) includes the terminating NUL, so the last
+		 * valid letter index is sizeof - 2.  Comparing against
+		 * sizeof alone let index 7 print a NUL byte instead of '?'.
+		 */
+		fiq_debugger_printf(&state->output, "%-13.13s %c", p->comm,
+			task_state >= sizeof(stat_nam) - 1 ?
+				'?' : stat_nam[task_state]);
+		if (task_state == TASK_RUNNING)
+			fiq_debugger_printf(&state->output, " running\n");
+		else
+			fiq_debugger_printf(&state->output, " %08lx\n",
+					thread_saved_pc(p));
+	} while_each_thread(g, p);
+	read_unlock(&tasklist_lock);
+}
+
+/*
+ * Bracket a sysrq invocation so that its kernel-log output reaches the
+ * debugger port.
+ *
+ * With the FIQ console built in, the syslog_dumping flag makes
+ * fiq_debugger_console_write() emit messages even when console mode is off.
+ * Without it, the log is cleared via do_syslog() beforehand and dumped
+ * afterwards.
+ */
+#ifdef CONFIG_FIQ_DEBUGGER_CONSOLE
+static void fiq_debugger_begin_syslog_dump(struct fiq_debugger_state *state)
+{
+ state->syslog_dumping = true;
+}
+
+static void fiq_debugger_end_syslog_dump(struct fiq_debugger_state *state)
+{
+ state->syslog_dumping = false;
+}
+#else
+extern int do_syslog(int type, char __user *bug, int count);
+static void fiq_debugger_begin_syslog_dump(struct fiq_debugger_state *state)
+{
+ do_syslog(5 /* clear */, NULL, 0);
+}
+
+static void fiq_debugger_end_syslog_dump(struct fiq_debugger_state *state)
+{
+ fiq_debugger_dump_kernel_log(state);
+}
+#endif
+
+/*
+ * Run sysrq key @rq with its log output routed to the debugger.  The 'g'
+ * key (either case) is refused unless the kgdb_enable parameter is set.
+ */
+static void fiq_debugger_do_sysrq(struct fiq_debugger_state *state, char rq)
+{
+	bool is_kgdb_key = (rq == 'g' || rq == 'G');
+
+	if (is_kgdb_key && !fiq_kgdb_enable) {
+		fiq_debugger_printf(&state->output, "sysrq-g blocked\n");
+		return;
+	}
+
+	fiq_debugger_begin_syslog_dump(state);
+	handle_sysrq(rq);
+	fiq_debugger_end_syslog_dump(state);
+}
+
+#ifdef CONFIG_KGDB
+/*
+ * "kgdb" command: when permitted by the kgdb_enable parameter, switch the
+ * port to console mode and trigger sysrq-g.
+ */
+static void fiq_debugger_do_kgdb(struct fiq_debugger_state *state)
+{
+	if (fiq_kgdb_enable) {
+		fiq_debugger_printf(&state->output, "enabling console and triggering kgdb\n");
+		state->console_enable = true;
+		handle_sysrq('g');
+	} else {
+		fiq_debugger_printf(&state->output, "kgdb through fiq debugger not enabled\n");
+	}
+}
+#endif
+
+/*
+ * Queue @cmd for execution by fiq_debugger_work().
+ *
+ * Only one command can be pending: if work_cmd is still non-empty the
+ * request is dropped with a "busy" message.  work_cmd is accessed under
+ * work_lock.
+ */
+static void fiq_debugger_schedule_work(struct fiq_debugger_state *state,
+ char *cmd)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&state->work_lock, flags);
+ if (state->work_cmd[0] != '\0') {
+ fiq_debugger_printf(&state->output, "work command processor busy\n");
+ spin_unlock_irqrestore(&state->work_lock, flags);
+ return;
+ }
+
+ strlcpy(state->work_cmd, cmd, sizeof(state->work_cmd));
+ spin_unlock_irqrestore(&state->work_lock, flags);
+
+ schedule_work(&state->work);
+}
+
+/*
+ * Work handler for commands queued by fiq_debugger_schedule_work().
+ *
+ * Takes a private copy of the pending command under work_lock and clears
+ * the shared slot so a new command can be queued.  Only "reboot [<arg>]" is
+ * understood: the kernel is restarted with <arg>, or with NULL when no
+ * argument follows.
+ */
+static void fiq_debugger_work(struct work_struct *work)
+{
+	struct fiq_debugger_state *state;
+	char work_cmd[DEBUG_MAX];
+	char *cmd;
+	unsigned long flags;
+
+	state = container_of(work, struct fiq_debugger_state, work);
+
+	spin_lock_irqsave(&state->work_lock, flags);
+
+	strlcpy(work_cmd, state->work_cmd, sizeof(work_cmd));
+	state->work_cmd[0] = '\0';
+
+	spin_unlock_irqrestore(&state->work_lock, flags);
+
+	cmd = work_cmd;
+	if (!strncmp(cmd, "reboot", 6)) {
+		cmd += 6;
+		while (*cmd == ' ')
+			cmd++;
+		/*
+		 * Test the character, not the pointer: "cmd != '\0'" was
+		 * always true, so a bare "reboot" passed "" instead of NULL.
+		 */
+		if (*cmd != '\0')
+			kernel_restart(cmd);
+		else
+			kernel_restart(NULL);
+	} else {
+		fiq_debugger_printf(&state->output, "unknown work command '%s'\n",
+				work_cmd);
+	}
+}
+
+/*
+ * This function CANNOT be called in FIQ context.
+ *
+ * Executes the commands that fiq_debugger_fiq_exec() does not handle
+ * itself: "ps", "sysrq" (shows sysrq help via key 'h'), "sysrq <key>",
+ * "kgdb" (CONFIG_KGDB only) and "reboot ...", which is passed on to the
+ * work queue.  Unrecognised commands are silently ignored.
+ */
+static void fiq_debugger_irq_exec(struct fiq_debugger_state *state, char *cmd)
+{
+ if (!strcmp(cmd, "ps"))
+ fiq_debugger_do_ps(state);
+ if (!strcmp(cmd, "sysrq"))
+ fiq_debugger_do_sysrq(state, 'h');
+ if (!strncmp(cmd, "sysrq ", 6))
+ fiq_debugger_do_sysrq(state, cmd[6]);
+#ifdef CONFIG_KGDB
+ if (!strcmp(cmd, "kgdb"))
+ fiq_debugger_do_kgdb(state);
+#endif
+ if (!strncmp(cmd, "reboot", 6))
+ fiq_debugger_schedule_work(state, cmd);
+}
+
+/*
+ * Print the command summary.  The text is emitted in several
+ * fiq_debugger_printf() calls, each of which formats into a 256-byte buffer.
+ */
+static void fiq_debugger_help(struct fiq_debugger_state *state)
+{
+ fiq_debugger_printf(&state->output,
+ "FIQ Debugger commands:\n"
+ " pc PC status\n"
+ " regs Register dump\n"
+ " allregs Extended Register dump\n"
+ " bt Stack trace\n"
+ " reboot [<c>] Reboot with command <c>\n"
+ " reset [<c>] Hard reset with command <c>\n"
+ " irqs Interupt status\n"
+ " kmsg Kernel log\n"
+ " version Kernel version\n");
+ fiq_debugger_printf(&state->output,
+ " sleep Allow sleep while in FIQ\n"
+ " nosleep Disable sleep while in FIQ\n"
+ " console Switch terminal to console\n"
+ " cpu Current CPU\n"
+ " cpu <number> Switch to CPU<number>\n");
+ fiq_debugger_printf(&state->output,
+ " ps Process list\n"
+ " sysrq sysrq options\n"
+ " sysrq <param> Execute sysrq with <param>\n");
+#ifdef CONFIG_KGDB
+ fiq_debugger_printf(&state->output,
+ " kgdb Enter kernel debugger\n");
+#endif
+}
+
+/*
+ * Cross-call target used by fiq_debugger_switch_cpu(): routes the debugger
+ * UART interrupt to the CPU this function is executing on.
+ * @info is the struct fiq_debugger_state.
+ */
+static void fiq_debugger_take_affinity(void *info)
+{
+	struct fiq_debugger_state *state = info;
+	struct cpumask cpumask;
+
+	cpumask_clear(&cpumask);
+	/*
+	 * smp_processor_id() instead of get_cpu(): get_cpu() also disables
+	 * preemption and needs a matching put_cpu(), which was never issued.
+	 */
+	cpumask_set_cpu(smp_processor_id(), &cpumask);
+
+	irq_set_affinity(state->uart_irq, &cpumask);
+}
+
+/*
+ * Make @cpu the CPU whose debugger interrupts are processed.
+ *
+ * Without a FIQ the UART irq affinity is moved to @cpu as well.  @cpu comes
+ * from user input ("cpu <number>"), so a number that is out of range or not
+ * online is ignored and current_cpu is left untouched; otherwise the
+ * debugger would wait for input on a CPU that can never deliver it.
+ */
+static void fiq_debugger_switch_cpu(struct fiq_debugger_state *state, int cpu)
+{
+	/* The unsigned cast also rejects negative values. */
+	if ((unsigned int)cpu >= nr_cpu_ids || !cpu_online(cpu))
+		return;
+
+	if (!fiq_debugger_have_fiq(state))
+		smp_call_function_single(cpu, fiq_debugger_take_affinity, state,
+				false);
+	state->current_cpu = cpu;
+}
+
+/*
+ * Execute one command line in the interrupt handler itself.
+ *
+ * Commands known here are run immediately and followed by a new prompt
+ * (unless console mode is on).  Anything else is copied to debug_cmd for
+ * fiq_debugger_irq_exec(), or, if a previous command is still pending,
+ * an abort is requested instead.
+ *
+ * Returns true when the command was deferred (the caller then requests
+ * IRQ-context processing), false otherwise.
+ */
+static bool fiq_debugger_fiq_exec(struct fiq_debugger_state *state,
+ const char *cmd, const struct pt_regs *regs,
+ void *svc_sp)
+{
+ /* Never set to true below; only the deferred path returns true. */
+ bool signal_helper = false;
+
+ if (!strcmp(cmd, "help") || !strcmp(cmd, "?")) {
+ fiq_debugger_help(state);
+ } else if (!strcmp(cmd, "pc")) {
+ fiq_debugger_dump_pc(&state->output, regs);
+ } else if (!strcmp(cmd, "regs")) {
+ fiq_debugger_dump_regs(&state->output, regs);
+ } else if (!strcmp(cmd, "allregs")) {
+ fiq_debugger_dump_allregs(&state->output, regs);
+ } else if (!strcmp(cmd, "bt")) {
+ fiq_debugger_dump_stacktrace(&state->output, regs, 100, svc_sp);
+ } else if (!strncmp(cmd, "reset", 5)) {
+ /* Optional argument after "reset" is passed to machine_restart(). */
+ cmd += 5;
+ while (*cmd == ' ')
+ cmd++;
+ if (*cmd) {
+ char tmp_cmd[32];
+ strlcpy(tmp_cmd, cmd, sizeof(tmp_cmd));
+ machine_restart(tmp_cmd);
+ } else {
+ machine_restart(NULL);
+ }
+ } else if (!strcmp(cmd, "irqs")) {
+ fiq_debugger_dump_irqs(state);
+ } else if (!strcmp(cmd, "kmsg")) {
+ fiq_debugger_dump_kernel_log(state);
+ } else if (!strcmp(cmd, "version")) {
+ fiq_debugger_printf(&state->output, "%s\n", linux_banner);
+ } else if (!strcmp(cmd, "sleep")) {
+ state->no_sleep = false;
+ fiq_debugger_printf(&state->output, "enabling sleep\n");
+ } else if (!strcmp(cmd, "nosleep")) {
+ state->no_sleep = true;
+ fiq_debugger_printf(&state->output, "disabling sleep\n");
+ } else if (!strcmp(cmd, "console")) {
+ fiq_debugger_printf(&state->output, "console mode\n");
+ fiq_debugger_uart_flush(state);
+ state->console_enable = true;
+ } else if (!strcmp(cmd, "cpu")) {
+ fiq_debugger_printf(&state->output, "cpu %d\n", state->current_cpu);
+ } else if (!strncmp(cmd, "cpu ", 4)) {
+ unsigned long cpu = 0;
+ if (kstrtoul(cmd + 4, 10, &cpu) == 0)
+ fiq_debugger_switch_cpu(state, cpu);
+ else
+ fiq_debugger_printf(&state->output, "invalid cpu\n");
+ /* Always report the CPU that is in effect after the attempt. */
+ fiq_debugger_printf(&state->output, "cpu %d\n", state->current_cpu);
+ } else {
+ if (state->debug_busy) {
+ fiq_debugger_printf(&state->output,
+ "command processor busy. trying to abort.\n");
+ state->debug_abort = -1;
+ } else {
+ /*
+ * cmd is state->debug_buf in the caller visible here, which
+ * has the same DEBUG_MAX size as debug_cmd.
+ */
+ strcpy(state->debug_cmd, cmd);
+ state->debug_busy = 1;
+ }
+
+ return true;
+ }
+ if (!state->console_enable)
+ fiq_debugger_prompt(state);
+
+ return signal_helper;
+}
+
+/*
+ * Sleep timer callback; @data is the struct fiq_debugger_state.
+ *
+ * If the UART is on and sleeping is allowed, the UART is switched off and
+ * the wakeup irq re-armed.  In debug (non-console) mode the debugger is
+ * also suspended.  The wakeup source is released in every case.  Runs
+ * under sleep_timer_lock.
+ */
+static void fiq_debugger_sleep_timer_expired(unsigned long data)
+{
+ struct fiq_debugger_state *state = (struct fiq_debugger_state *)data;
+ unsigned long flags;
+
+ spin_lock_irqsave(&state->sleep_timer_lock, flags);
+ if (state->uart_enabled && !state->no_sleep) {
+ if (state->debug_enable && !state->console_enable) {
+ state->debug_enable = false;
+ fiq_debugger_printf_nfiq(state,
+ "suspending fiq debugger\n");
+ }
+ /* The next wakeup irq is swallowed by fiq_debugger_handle_wakeup(). */
+ state->ignore_next_wakeup_irq = true;
+ fiq_debugger_uart_disable(state);
+ state->uart_enabled = false;
+ fiq_debugger_enable_wakeup_irq(state);
+ }
+ __pm_relax(&state->debugger_wake_src);
+ spin_unlock_irqrestore(&state->sleep_timer_lock, flags);
+}
+
+/*
+ * Bring the UART back up after a wakeup event.
+ *
+ * When a wakeup irq exists and ignore_next_wakeup_irq is set, the event is
+ * consumed by clearing that flag.  Otherwise, if the UART is off, it is
+ * enabled, the wakeup irq is disabled, the wakeup source is held and the
+ * sleep timer is armed for HZ / 2 jiffies.  Runs under sleep_timer_lock.
+ */
+static void fiq_debugger_handle_wakeup(struct fiq_debugger_state *state)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&state->sleep_timer_lock, flags);
+ if (state->wakeup_irq >= 0 && state->ignore_next_wakeup_irq) {
+ state->ignore_next_wakeup_irq = false;
+ } else if (!state->uart_enabled) {
+ __pm_stay_awake(&state->debugger_wake_src);
+ fiq_debugger_uart_enable(state);
+ state->uart_enabled = true;
+ fiq_debugger_disable_wakeup_irq(state);
+ mod_timer(&state->sleep_timer, jiffies + HZ / 2);
+ }
+ spin_unlock_irqrestore(&state->sleep_timer_lock, flags);
+}
+
+/*
+ * Handler for the wakeup irq; @dev is the struct fiq_debugger_state.
+ * Prints "WAKEUP" when sleeping is allowed, then runs the wakeup logic.
+ */
+static irqreturn_t fiq_debugger_wakeup_irq_handler(int irq, void *dev)
+{
+ struct fiq_debugger_state *state = dev;
+
+ if (!state->no_sleep)
+ fiq_debugger_puts(state, "WAKEUP\n");
+ fiq_debugger_handle_wakeup(state);
+
+ return IRQ_HANDLED;
+}
+
+/*
+ * Move the characters queued in tty_rbuf into the tty flip buffer and push
+ * them to the tty layer.  Does nothing when the tty port has no ops set or
+ * when the FIQ console is not built in.
+ */
+static
+void fiq_debugger_handle_console_irq_context(struct fiq_debugger_state *state)
+{
+#if defined(CONFIG_FIQ_DEBUGGER_CONSOLE)
+ if (state->tty_port.ops) {
+ int i;
+ /* Snapshot the fill level once; drain exactly that many bytes. */
+ int count = fiq_debugger_ringbuf_level(state->tty_rbuf);
+ for (i = 0; i < count; i++) {
+ int c = fiq_debugger_ringbuf_peek(state->tty_rbuf, 0);
+ tty_insert_flip_char(&state->tty_port, c, TTY_NORMAL);
+ if (!fiq_debugger_ringbuf_consume(state->tty_rbuf, 1))
+ pr_warn("fiq tty failed to consume byte\n");
+ }
+ tty_flip_buffer_push(&state->tty_port);
+ }
+#endif
+}
+
+/*
+ * IRQ-context half of the debugger.
+ *
+ * Re-arms the sleep timer for 5 seconds (when sleeping is allowed),
+ * forwards queued console input to the tty layer, and runs the deferred
+ * command, if any, clearing debug_busy when done.
+ */
+static void fiq_debugger_handle_irq_context(struct fiq_debugger_state *state)
+{
+ if (!state->no_sleep) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&state->sleep_timer_lock, flags);
+ __pm_stay_awake(&state->debugger_wake_src);
+ mod_timer(&state->sleep_timer, jiffies + HZ * 5);
+ spin_unlock_irqrestore(&state->sleep_timer_lock, flags);
+ }
+ fiq_debugger_handle_console_irq_context(state);
+ if (state->debug_busy) {
+ fiq_debugger_irq_exec(state, state->debug_cmd);
+ if (!state->console_enable)
+ fiq_debugger_prompt(state);
+ state->debug_busy = 0;
+ }
+}
+
+/* Fetch one value from the platform's uart_getc hook (called unconditionally). */
+static int fiq_debugger_getc(struct fiq_debugger_state *state)
+{
+ return state->pdata->uart_getc(state->pdev);
+}
+
+/*
+ * Core input handler, shared by the FIQ and the plain-IRQ entry points.
+ *
+ * Drains the UART and feeds each character to the line editor, the console
+ * ring buffer, or the enable/break logic, depending on the current mode.
+ *
+ * Returns true when the caller must arrange for
+ * fiq_debugger_handle_irq_context() to run.
+ */
+static bool fiq_debugger_handle_uart_interrupt(struct fiq_debugger_state *state,
+ int this_cpu, const struct pt_regs *regs, void *svc_sp)
+{
+ int c;
+ static int last_c;
+ int count = 0;
+ bool signal_helper = false;
+
+ /*
+ * Interrupts arriving on a CPU other than current_cpu are only counted.
+ * Once MAX_UNHANDLED_FIQ_COUNT of them pile up without current_cpu
+ * handling one, the debugger moves over to this CPU.
+ */
+ if (this_cpu != state->current_cpu) {
+ if (state->in_fiq)
+ return false;
+
+ if (atomic_inc_return(&state->unhandled_fiq_count) !=
+ MAX_UNHANDLED_FIQ_COUNT)
+ return false;
+
+ fiq_debugger_printf(&state->output,
+ "fiq_debugger: cpu %d not responding, "
+ "reverting to cpu %d\n", state->current_cpu,
+ this_cpu);
+
+ atomic_set(&state->unhandled_fiq_count, 0);
+ fiq_debugger_switch_cpu(state, this_cpu);
+ return false;
+ }
+
+ state->in_fiq = true;
+
+ while ((c = fiq_debugger_getc(state)) != FIQ_DEBUGGER_NO_CHAR) {
+ count++;
+ if (!state->debug_enable) {
+ /* Debugger suspended: only CR (13) or LF (10) wakes it up. */
+ if ((c == 13) || (c == 10)) {
+ state->debug_enable = true;
+ state->debug_count = 0;
+ fiq_debugger_prompt(state);
+ }
+ } else if (c == FIQ_DEBUGGER_BREAK) {
+ /* A break leaves console mode and starts a fresh command line. */
+ state->console_enable = false;
+ fiq_debugger_puts(state, "fiq debugger mode\n");
+ state->debug_count = 0;
+ fiq_debugger_prompt(state);
+#ifdef CONFIG_FIQ_DEBUGGER_CONSOLE
+ } else if (state->console_enable && state->tty_rbuf) {
+ /* Console mode: queue the byte for the tty, delivered in IRQ context. */
+ fiq_debugger_ringbuf_push(state->tty_rbuf, c);
+ signal_helper = true;
+#endif
+ } else if ((c >= ' ') && (c < 127)) {
+ /* Printable: append and echo, keeping room for the terminator. */
+ if (state->debug_count < (DEBUG_MAX - 1)) {
+ state->debug_buf[state->debug_count++] = c;
+ fiq_debugger_putc(state, c);
+ }
+ } else if ((c == 8) || (c == 127)) {
+ /* Backspace or DEL: drop one character and erase it on screen. */
+ if (state->debug_count > 0) {
+ state->debug_count--;
+ fiq_debugger_putc(state, 8);
+ fiq_debugger_putc(state, ' ');
+ fiq_debugger_putc(state, 8);
+ }
+ } else if ((c == 13) || (c == 10)) {
+ /* Echo a single newline for CR, LF or a CR LF pair. */
+ if (c == '\r' || (c == '\n' && last_c != '\r')) {
+ fiq_debugger_putc(state, '\r');
+ fiq_debugger_putc(state, '\n');
+ }
+ if (state->debug_count) {
+ state->debug_buf[state->debug_count] = 0;
+ state->debug_count = 0;
+ signal_helper |=
+ fiq_debugger_fiq_exec(state,
+ state->debug_buf,
+ regs, svc_sp);
+ } else {
+ fiq_debugger_prompt(state);
+ }
+ }
+ last_c = c;
+ }
+ if (!state->console_enable)
+ fiq_debugger_uart_flush(state);
+ if (state->pdata->fiq_ack)
+ state->pdata->fiq_ack(state->pdev, state->fiq);
+
+ /* poke sleep timer if necessary */
+ if (state->debug_enable && !state->no_sleep)
+ signal_helper = true;
+
+ atomic_set(&state->unhandled_fiq_count, 0);
+ state->in_fiq = false;
+
+ return signal_helper;
+}
+
+#ifdef CONFIG_FIQ_GLUE
+/*
+ * FIQ glue callback.  Processes pending UART input and, when follow-up work
+ * is required, forces the signal irq so it runs outside FIQ context.
+ */
+static void fiq_debugger_fiq(struct fiq_glue_handler *h,
+		const struct pt_regs *regs, void *svc_sp)
+{
+	struct fiq_debugger_state *state;
+	unsigned int cpu;
+
+	state = container_of(h, struct fiq_debugger_state, handler);
+	cpu = THREAD_INFO(svc_sp)->cpu;
+
+	if (fiq_debugger_handle_uart_interrupt(state, cpu, regs, svc_sp))
+		fiq_debugger_force_irq(state);
+}
+#endif
+
+/*
+ * Entry point when no FIQ is used: this handler owns the UART interrupt
+ * and performs both the input processing and the IRQ-context follow-up
+ * directly.
+ */
+static irqreturn_t fiq_debugger_uart_irq(int irq, void *dev)
+{
+	struct fiq_debugger_state *state = dev;
+
+	fiq_debugger_handle_wakeup(state);
+
+	/* handle the debugger irq in regular context */
+	if (fiq_debugger_handle_uart_interrupt(state, smp_processor_id(),
+					       get_irq_regs(),
+					       current_thread_info()))
+		fiq_debugger_handle_irq_context(state);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Companion interrupt for FIQ mode.  The FIQ handler cannot do everything
+ * itself, so it raises this irq to complete the job in irq context.
+ */
+static irqreturn_t fiq_debugger_signal_irq(int irq, void *dev)
+{
+	struct fiq_debugger_state *state = dev;
+	struct fiq_debugger_pdata *pdata = state->pdata;
+
+	/* Let the platform acknowledge the forced irq, if it needs to. */
+	if (pdata->force_irq_ack)
+		pdata->force_irq_ack(state->pdev, state->signal_irq);
+
+	fiq_debugger_handle_irq_context(state);
+
+	return IRQ_HANDLED;
+}
+
+#ifdef CONFIG_FIQ_GLUE
+/* FIQ glue resume callback: forwards to the platform's uart_resume hook. */
+static void fiq_debugger_resume(struct fiq_glue_handler *h)
+{
+	struct fiq_debugger_state *state;
+
+	state = container_of(h, struct fiq_debugger_state, handler);
+	if (!state->pdata->uart_resume)
+		return;
+	state->pdata->uart_resume(state->pdev);
+}
+#endif
+
+#if defined(CONFIG_FIQ_DEBUGGER_CONSOLE)
+/* console->device callback: reports the console's index and the FIQ tty driver. */
+struct tty_driver *fiq_debugger_console_device(struct console *co, int *index)
+{
+ *index = co->index;
+ return fiq_tty_driver;
+}
+
+/*
+ * console->write callback: print @count bytes of @s on the debugger UART,
+ * expanding '\n' to "\r\n".
+ *
+ * Output is dropped unless console mode is on or a syslog dump is in
+ * progress.  The UART is enabled for the duration of the write and the
+ * characters are sent under console_lock.
+ */
+static void fiq_debugger_console_write(struct console *co,
+ const char *s, unsigned int count)
+{
+ struct fiq_debugger_state *state;
+ unsigned long flags;
+
+ state = container_of(co, struct fiq_debugger_state, console);
+
+ if (!state->console_enable && !state->syslog_dumping)
+ return;
+
+ fiq_debugger_uart_enable(state);
+ spin_lock_irqsave(&state->console_lock, flags);
+ while (count--) {
+ if (*s == '\n')
+ fiq_debugger_putc(state, '\r');
+ fiq_debugger_putc(state, *s++);
+ }
+ fiq_debugger_uart_flush(state);
+ spin_unlock_irqrestore(&state->console_lock, flags);
+ fiq_debugger_uart_disable(state);
+}
+
+/* Console definition for the "ttyFIQ" device, wired to the callbacks above. */
+static struct console fiq_debugger_console = {
+ .name = "ttyFIQ",
+ .device = fiq_debugger_console_device,
+ .write = fiq_debugger_console_write,
+ .flags = CON_PRINTBUFFER | CON_ANYTIME | CON_ENABLED,
+};
+
+/*
+ * tty open callback: looks up the debugger state for this tty line in the
+ * driver's state table and opens its tty port.
+ */
+int fiq_tty_open(struct tty_struct *tty, struct file *filp)
+{
+	struct fiq_debugger_state **table = tty->driver->driver_state;
+	struct fiq_debugger_state *state = table[tty->index];
+
+	return tty_port_open(&state->tty_port, tty, filp);
+}
+
+/* tty close callback: closes the port attached to @tty. */
+void fiq_tty_close(struct tty_struct *tty, struct file *filp)
+{
+ tty_port_close(tty->port, tty, filp);
+}
+
+/*
+ * tty write callback: sends @count bytes from @buf to the UART under
+ * console_lock, with the UART enabled for the duration.
+ *
+ * Always returns @count.  When console mode is off the data is discarded
+ * but still reported as written.
+ */
+int fiq_tty_write(struct tty_struct *tty, const unsigned char *buf, int count)
+{
+ int i;
+ int line = tty->index;
+ struct fiq_debugger_state **states = tty->driver->driver_state;
+ struct fiq_debugger_state *state = states[line];
+
+ if (!state->console_enable)
+ return count;
+
+ fiq_debugger_uart_enable(state);
+ spin_lock_irq(&state->console_lock);
+ for (i = 0; i < count; i++)
+ fiq_debugger_putc(state, *buf++);
+ spin_unlock_irq(&state->console_lock);
+ fiq_debugger_uart_disable(state);
+
+ return count;
+}
+
+/* tty ->write_room: always advertises a fixed 16 bytes of room. */
+int fiq_tty_write_room(struct tty_struct *tty)
+{
+ return 16;
+}
+
+#ifdef CONFIG_CONSOLE_POLL
+/* Polling-console init hook: nothing to set up, always succeeds. */
+static int fiq_tty_poll_init(struct tty_driver *driver, int line, char *options)
+{
+ return 0;
+}
+
+/*
+ * Polling-console read: return one input character for @line, or
+ * NO_POLL_CHAR when none is available.
+ *
+ * When the debugger runs in FIQ mode the character is taken from the
+ * state's tty ring buffer (presumably filled by the FIQ handler, which is
+ * not part of this block); otherwise the uart is read directly and
+ * FIQ_DEBUGGER_NO_CHAR is translated to NO_POLL_CHAR.
+ */
+static int fiq_tty_poll_get_char(struct tty_driver *driver, int line)
+{
+ struct fiq_debugger_state **states = driver->driver_state;
+ struct fiq_debugger_state *state = states[line];
+ int c = NO_POLL_CHAR;
+
+ fiq_debugger_uart_enable(state);
+ if (fiq_debugger_have_fiq(state)) {
+ int count = fiq_debugger_ringbuf_level(state->tty_rbuf);
+ if (count > 0) {
+ c = fiq_debugger_ringbuf_peek(state->tty_rbuf, 0);
+ fiq_debugger_ringbuf_consume(state->tty_rbuf, 1);
+ }
+ } else {
+ c = fiq_debugger_getc(state);
+ if (c == FIQ_DEBUGGER_NO_CHAR)
+ c = NO_POLL_CHAR;
+ }
+ fiq_debugger_uart_disable(state);
+
+ return c;
+}
+
+/*
+ * Polling-console write: emit one character on the uart belonging to @line,
+ * enabling the uart around the access.
+ */
+static void fiq_tty_poll_put_char(struct tty_driver *driver, int line, char ch)
+{
+	struct fiq_debugger_state **table = driver->driver_state;
+	struct fiq_debugger_state *state = table[line];
+
+	fiq_debugger_uart_enable(state);
+	fiq_debugger_putc(state, ch);
+	fiq_debugger_uart_disable(state);
+}
+#endif
+
+static const struct tty_port_operations fiq_tty_port_ops;
+
+/*
+ * tty operations of the ttyFIQ driver.  The polling hooks are only present
+ * when CONFIG_CONSOLE_POLL is enabled.
+ */
+static const struct tty_operations fiq_tty_driver_ops = {
+ .write = fiq_tty_write,
+ .write_room = fiq_tty_write_room,
+ .open = fiq_tty_open,
+ .close = fiq_tty_close,
+#ifdef CONFIG_CONSOLE_POLL
+ .poll_init = fiq_tty_poll_init,
+ .poll_get_char = fiq_tty_poll_get_char,
+ .poll_put_char = fiq_tty_poll_put_char,
+#endif
+};
+
+/*
+ * Allocate and register the tty driver shared by all fiq debugger ports.
+ *
+ * A table with one state pointer per possible port is allocated and hung
+ * off the driver's driver_state; individual ports fill in their slot later.
+ * Devices are registered dynamically (TTY_DRIVER_DYNAMIC_DEV), one per
+ * probed port.
+ *
+ * Returns 0 on success or a negative errno.  On failure the state table is
+ * freed; if the driver had been allocated its reference is dropped and
+ * fiq_tty_driver is reset to NULL.
+ */
+static int fiq_debugger_tty_init(void)
+{
+	int ret;
+	struct fiq_debugger_state **states;
+
+	/* kcalloc() zeroes the table and checks the size multiplication. */
+	states = kcalloc(MAX_FIQ_DEBUGGER_PORTS, sizeof(*states), GFP_KERNEL);
+	if (!states) {
+		pr_err("Failed to allocate fiq debugger state structres\n");
+		return -ENOMEM;
+	}
+
+	fiq_tty_driver = alloc_tty_driver(MAX_FIQ_DEBUGGER_PORTS);
+	if (!fiq_tty_driver) {
+		pr_err("Failed to allocate fiq debugger tty\n");
+		ret = -ENOMEM;
+		goto err_free_state;
+	}
+
+	fiq_tty_driver->owner = THIS_MODULE;
+	fiq_tty_driver->driver_name = "fiq-debugger";
+	fiq_tty_driver->name = "ttyFIQ";
+	fiq_tty_driver->type = TTY_DRIVER_TYPE_SERIAL;
+	fiq_tty_driver->subtype = SERIAL_TYPE_NORMAL;
+	fiq_tty_driver->init_termios = tty_std_termios;
+	fiq_tty_driver->flags = TTY_DRIVER_REAL_RAW |
+				TTY_DRIVER_DYNAMIC_DEV;
+	fiq_tty_driver->driver_state = states;
+
+	/* Default line settings: 115200 baud, 8 data bits. */
+	fiq_tty_driver->init_termios.c_cflag =
+		B115200 | CS8 | CREAD | HUPCL | CLOCAL;
+	fiq_tty_driver->init_termios.c_ispeed = 115200;
+	fiq_tty_driver->init_termios.c_ospeed = 115200;
+
+	tty_set_operations(fiq_tty_driver, &fiq_tty_driver_ops);
+
+	ret = tty_register_driver(fiq_tty_driver);
+	if (ret) {
+		pr_err("Failed to register fiq tty: %d\n", ret);
+		goto err_free_tty;
+	}
+
+	pr_info("Registered FIQ tty driver\n");
+	return 0;
+
+err_free_tty:
+	put_tty_driver(fiq_tty_driver);
+	fiq_tty_driver = NULL;
+err_free_state:
+	kfree(states);
+	return ret;
+}
+
+/*
+ * Create the tty device for one debugger port.
+ *
+ * Publishes @state in the driver's state table (indexed by the platform
+ * device id), allocates the 1024-byte input ring buffer, initialises the
+ * tty_port and registers the tty device, which is marked wakeup capable.
+ *
+ * Returns 0 on success or a negative errno.  On failure the ring buffer is
+ * released and state->tty_rbuf is reset to NULL.
+ */
+static int fiq_debugger_tty_init_one(struct fiq_debugger_state *state)
+{
+	int ret;
+	struct device *tty_dev;
+	struct fiq_debugger_state **states;
+
+	/*
+	 * fiq_debugger_tty_init() leaves fiq_tty_driver NULL when it fails and
+	 * its result is not checked by the init code, so the driver may be
+	 * missing here.
+	 */
+	if (!fiq_tty_driver)
+		return -ENODEV;
+
+	states = fiq_tty_driver->driver_state;
+	states[state->pdev->id] = state;
+
+	state->tty_rbuf = fiq_debugger_ringbuf_alloc(1024);
+	if (!state->tty_rbuf) {
+		pr_err("Failed to allocate fiq debugger ringbuf\n");
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	tty_port_init(&state->tty_port);
+	state->tty_port.ops = &fiq_tty_port_ops;
+
+	tty_dev = tty_port_register_device(&state->tty_port, fiq_tty_driver,
+					   state->pdev->id, &state->pdev->dev);
+	if (IS_ERR(tty_dev)) {
+		pr_err("Failed to register fiq debugger tty device\n");
+		ret = PTR_ERR(tty_dev);
+		goto err;
+	}
+
+	device_set_wakeup_capable(tty_dev, 1);
+
+	pr_info("Registered fiq debugger ttyFIQ%d\n", state->pdev->id);
+
+	return 0;
+
+err:
+	/* tty_rbuf is NULL when the allocation itself failed; freeing is safe. */
+	fiq_debugger_ringbuf_free(state->tty_rbuf);
+	state->tty_rbuf = NULL;
+	return ret;
+}
+#endif
+
+/*
+ * PM suspend callback: forward to the platform's uart_dev_suspend hook when
+ * one is provided, otherwise report success.
+ */
+static int fiq_debugger_dev_suspend(struct device *dev)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct fiq_debugger_state *state = platform_get_drvdata(pdev);
+
+	if (!state->pdata->uart_dev_suspend)
+		return 0;
+
+	return state->pdata->uart_dev_suspend(pdev);
+}
+
+/*
+ * PM resume callback: forward to the platform's uart_dev_resume hook when
+ * one is provided, otherwise report success.
+ */
+static int fiq_debugger_dev_resume(struct device *dev)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct fiq_debugger_state *state = platform_get_drvdata(pdev);
+
+	if (!state->pdata->uart_dev_resume)
+		return 0;
+
+	return state->pdata->uart_dev_resume(pdev);
+}
+
+/*
+ * Bind one fiq debugger platform device.
+ *
+ * Validates the platform data and interrupt resources, allocates and
+ * initialises the per-device state, runs the optional uart_init hook with
+ * the clock enabled, installs either the FIQ handler or the uart IRQ
+ * handler, then the optional "signal" and "wakeup" interrupts, and finally
+ * (with CONFIG_FIQ_DEBUGGER_CONSOLE) registers the console and tty device.
+ *
+ * Returns 0 on success.  Returns -EINVAL for missing or inconsistent
+ * platform data or interrupt resources, -ENOMEM when the state cannot be
+ * allocated, or the error from uart_init / handler registration.  Failures
+ * to install the signal or wakeup interrupts are logged but not fatal.
+ */
+static int fiq_debugger_probe(struct platform_device *pdev)
+{
+	int ret;
+	struct fiq_debugger_pdata *pdata = dev_get_platdata(&pdev->dev);
+	struct fiq_debugger_state *state;
+	int fiq;
+	int uart_irq;
+
+	if (pdev->id >= MAX_FIQ_DEBUGGER_PORTS)
+		return -EINVAL;
+
+	/* Every check below dereferences the platform data. */
+	if (!pdata)
+		return -EINVAL;
+
+	if (!pdata->uart_getc || !pdata->uart_putc)
+		return -EINVAL;
+	/* uart_enable and uart_disable must be provided as a pair. */
+	if ((pdata->uart_enable && !pdata->uart_disable) ||
+	    (!pdata->uart_enable && pdata->uart_disable))
+		return -EINVAL;
+
+	fiq = platform_get_irq_byname(pdev, "fiq");
+	uart_irq = platform_get_irq_byname(pdev, "uart_irq");
+
+	/* uart_irq mode and fiq mode are mutually exclusive, but one of them
+	 * is required */
+	if ((uart_irq < 0 && fiq < 0) || (uart_irq >= 0 && fiq >= 0))
+		return -EINVAL;
+	if (fiq >= 0 && !pdata->fiq_enable)
+		return -EINVAL;
+
+	state = kzalloc(sizeof(*state), GFP_KERNEL);
+	if (!state)
+		return -ENOMEM;
+
+	state->output.printf = fiq_debugger_printf;
+	setup_timer(&state->sleep_timer, fiq_debugger_sleep_timer_expired,
+		    (unsigned long)state);
+	state->pdata = pdata;
+	state->pdev = pdev;
+	state->no_sleep = initial_no_sleep;
+	state->debug_enable = initial_debug_enable;
+	state->console_enable = initial_console_enable;
+
+	state->fiq = fiq;
+	state->uart_irq = uart_irq;
+	state->signal_irq = platform_get_irq_byname(pdev, "signal");
+	state->wakeup_irq = platform_get_irq_byname(pdev, "wakeup");
+
+	INIT_WORK(&state->work, fiq_debugger_work);
+	spin_lock_init(&state->work_lock);
+
+	platform_set_drvdata(pdev, state);
+
+	spin_lock_init(&state->sleep_timer_lock);
+
+	/* Without a wakeup irq a FIQ-mode debugger must never go to sleep. */
+	if (state->wakeup_irq < 0 && fiq_debugger_have_fiq(state))
+		state->no_sleep = true;
+	state->ignore_next_wakeup_irq = !state->no_sleep;
+
+	wakeup_source_init(&state->debugger_wake_src, "serial-debug");
+
+	/* The clock is optional: treat a lookup failure as "no clock". */
+	state->clk = clk_get(&pdev->dev, NULL);
+	if (IS_ERR(state->clk))
+		state->clk = NULL;
+
+	/* do not call pdata->uart_enable here since uart_init may still
+	 * need to do some initialization before uart_enable can work.
+	 * So, only try to manage the clock during init.
+	 */
+	if (state->clk)
+		clk_enable(state->clk);
+
+	if (pdata->uart_init) {
+		ret = pdata->uart_init(pdev);
+		if (ret)
+			goto err_uart_init;
+	}
+
+	fiq_debugger_printf_nfiq(state,
+				 "<hit enter %sto activate fiq debugger>\n",
+				 state->no_sleep ? "" : "twice ");
+
+#ifdef CONFIG_FIQ_GLUE
+	if (fiq_debugger_have_fiq(state)) {
+		state->handler.fiq = fiq_debugger_fiq;
+		state->handler.resume = fiq_debugger_resume;
+		ret = fiq_glue_register_handler(&state->handler);
+		if (ret) {
+			pr_err("%s: could not install fiq handler\n", __func__);
+			goto err_register_irq;
+		}
+
+		pdata->fiq_enable(pdev, state->fiq, 1);
+	} else
+#endif
+	{
+		ret = request_irq(state->uart_irq, fiq_debugger_uart_irq,
+				  IRQF_NO_SUSPEND, "debug", state);
+		if (ret) {
+			pr_err("%s: could not install irq handler\n", __func__);
+			goto err_register_irq;
+		}
+
+		/* for irq-only mode, we want this irq to wake us up, if it
+		 * can.
+		 */
+		enable_irq_wake(state->uart_irq);
+	}
+
+	if (state->clk)
+		clk_disable(state->clk);
+
+	if (state->signal_irq >= 0) {
+		ret = request_irq(state->signal_irq, fiq_debugger_signal_irq,
+				  IRQF_TRIGGER_RISING, "debug-signal", state);
+		if (ret)
+			pr_err("serial_debugger: could not install signal_irq\n");
+	}
+
+	if (state->wakeup_irq >= 0) {
+		ret = request_irq(state->wakeup_irq,
+				  fiq_debugger_wakeup_irq_handler,
+				  IRQF_TRIGGER_FALLING,
+				  "debug-wakeup", state);
+		if (ret) {
+			pr_err("serial_debugger: "
+				"could not install wakeup irq\n");
+			state->wakeup_irq = -1;
+		} else {
+			ret = enable_irq_wake(state->wakeup_irq);
+			if (ret) {
+				pr_err("serial_debugger: "
+					"could not enable wakeup\n");
+				state->wakeup_irq_no_set_wake = true;
+			}
+		}
+	}
+	if (state->no_sleep)
+		fiq_debugger_handle_wakeup(state);
+
+#if defined(CONFIG_FIQ_DEBUGGER_CONSOLE)
+	spin_lock_init(&state->console_lock);
+	state->console = fiq_debugger_console;
+	state->console.index = pdev->id;
+	if (!console_set_on_cmdline)
+		add_preferred_console(state->console.name,
+				      state->console.index, NULL);
+	register_console(&state->console);
+	fiq_debugger_tty_init_one(state);
+#endif
+	return 0;
+
+err_register_irq:
+	if (pdata->uart_free)
+		pdata->uart_free(pdev);
+err_uart_init:
+	/* The clock is still enabled on both error paths. */
+	if (state->clk)
+		clk_disable(state->clk);
+	if (state->clk)
+		clk_put(state->clk);
+	wakeup_source_trash(&state->debugger_wake_src);
+	platform_set_drvdata(pdev, NULL);
+	kfree(state);
+	return ret;
+}
+
+/* Device PM callbacks; both forward to optional platform hooks. */
+static const struct dev_pm_ops fiq_debugger_dev_pm_ops = {
+ .suspend = fiq_debugger_dev_suspend,
+ .resume = fiq_debugger_dev_resume,
+};
+
+/* Platform driver matched by the device name "fiq_debugger". */
+static struct platform_driver fiq_debugger_driver = {
+ .probe = fiq_debugger_probe,
+ .driver = {
+ .name = "fiq_debugger",
+ .pm = &fiq_debugger_dev_pm_ops,
+ },
+};
+
+#if defined(CONFIG_FIQ_DEBUGGER_UART_OVERLAY)
+/*
+ * Apply the device tree overlay found at "/uart_overlay@0".
+ *
+ * Returns 0 on success, -ENODEV when the node does not exist, or the
+ * negative error returned by of_overlay_create().
+ *
+ * NOTE(review): the node reference obtained from of_find_node_by_path() is
+ * dropped only on the failure path; confirm whether the success path is
+ * expected to keep it.
+ */
+int fiq_debugger_uart_overlay(void)
+{
+ struct device_node *onp = of_find_node_by_path("/uart_overlay@0");
+ int ret;
+
+ if (!onp) {
+ pr_err("serial_debugger: uart overlay not found\n");
+ return -ENODEV;
+ }
+
+ ret = of_overlay_create(onp);
+ if (ret < 0) {
+ pr_err("serial_debugger: fail to create overlay: %d\n", ret);
+ of_node_put(onp);
+ return ret;
+ }
+
+ pr_info("serial_debugger: uart overlay applied\n");
+ return 0;
+}
+#endif
+
+/*
+ * Driver entry point.
+ *
+ * Bails out with -ENODEV when the debugger has been disabled.  Otherwise
+ * sets up the shared tty driver and the uart overlay (each only when the
+ * corresponding config option is enabled) and registers the platform
+ * driver, whose result is returned.
+ *
+ * Note that the results of fiq_debugger_tty_init() and
+ * fiq_debugger_uart_overlay() are ignored here.
+ */
+static int __init fiq_debugger_init(void)
+{
+ if (fiq_debugger_disable) {
+ pr_err("serial_debugger: disabled\n");
+ return -ENODEV;
+ }
+#if defined(CONFIG_FIQ_DEBUGGER_CONSOLE)
+ fiq_debugger_tty_init();
+#endif
+#if defined(CONFIG_FIQ_DEBUGGER_UART_OVERLAY)
+ fiq_debugger_uart_overlay();
+#endif
+ return platform_driver_register(&fiq_debugger_driver);
+}
+
+postcore_initcall(fiq_debugger_init);
diff --git a/drivers/staging/android/fiq_debugger/fiq_debugger.h b/drivers/staging/android/fiq_debugger/fiq_debugger.h
new file mode 100644
index 000000000000..c9ec4f8db086
--- /dev/null
+++ b/drivers/staging/android/fiq_debugger/fiq_debugger.h
@@ -0,0 +1,64 @@
+/*
+ * drivers/staging/android/fiq_debugger/fiq_debugger.h
+ *
+ * Copyright (C) 2010 Google, Inc.
+ * Author: Colin Cross <ccross@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _ARCH_ARM_MACH_TEGRA_FIQ_DEBUGGER_H_
+#define _ARCH_ARM_MACH_TEGRA_FIQ_DEBUGGER_H_
+
+#include <linux/serial_core.h>
+
+#define FIQ_DEBUGGER_NO_CHAR NO_POLL_CHAR
+#define FIQ_DEBUGGER_BREAK 0x00ff0100
+
+#define FIQ_DEBUGGER_FIQ_IRQ_NAME "fiq"
+#define FIQ_DEBUGGER_SIGNAL_IRQ_NAME "signal"
+#define FIQ_DEBUGGER_WAKEUP_IRQ_NAME "wakeup"
+
+/**
+ * struct fiq_debugger_pdata - fiq debugger platform data
+ * @uart_init: optional; called once while the device is probed. A
+ * non-zero return value aborts the probe.
+ * @uart_free: optional; called when the probe fails after uart_init
+ * because the fiq/irq handler could not be installed.
+ * @uart_resume: used to restore uart state right before enabling
+ * the fiq.
+ * @uart_getc: required; the probe fails with -EINVAL without it.
+ * @uart_putc: required; the probe fails with -EINVAL without it.
+ * @uart_enable: Do the work necessary to communicate with the uart
+ * hw (enable clocks, etc.). This must be ref-counted.
+ * Must be provided together with @uart_disable or not
+ * at all.
+ * @uart_disable: Do the work necessary to disable the uart hw
+ * (disable clocks, etc.). This must be ref-counted.
+ * Must be provided together with @uart_enable or not
+ * at all.
+ * @uart_dev_suspend: called during PM suspend, generally not needed
+ * for real fiq mode debugger.
+ * @uart_dev_resume: called during PM resume, generally not needed
+ * for real fiq mode debugger.
+ * @fiq_enable: required when the device has a "fiq" interrupt
+ * resource; called with enable set once the fiq handler
+ * has been registered.
+ * @force_irq_ack: optional; called from the "signal" interrupt handler
+ * with the signal irq number.
+ */
+struct fiq_debugger_pdata {
+ int (*uart_init)(struct platform_device *pdev);
+ void (*uart_free)(struct platform_device *pdev);
+ int (*uart_resume)(struct platform_device *pdev);
+ int (*uart_getc)(struct platform_device *pdev);
+ void (*uart_putc)(struct platform_device *pdev, unsigned int c);
+ void (*uart_flush)(struct platform_device *pdev);
+ void (*uart_enable)(struct platform_device *pdev);
+ void (*uart_disable)(struct platform_device *pdev);
+
+ int (*uart_dev_suspend)(struct platform_device *pdev);
+ int (*uart_dev_resume)(struct platform_device *pdev);
+
+ void (*fiq_enable)(struct platform_device *pdev, unsigned int fiq,
+ bool enable);
+ void (*fiq_ack)(struct platform_device *pdev, unsigned int fiq);
+
+ void (*force_irq)(struct platform_device *pdev, unsigned int irq);
+ void (*force_irq_ack)(struct platform_device *pdev, unsigned int irq);
+};
+
+#endif
diff --git a/drivers/staging/android/fiq_debugger/fiq_debugger_arm.c b/drivers/staging/android/fiq_debugger/fiq_debugger_arm.c
new file mode 100644
index 000000000000..8b3e0137be1a
--- /dev/null
+++ b/drivers/staging/android/fiq_debugger/fiq_debugger_arm.c
@@ -0,0 +1,240 @@
+/*
+ * Copyright (C) 2014 Google, Inc.
+ * Author: Colin Cross <ccross@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/ptrace.h>
+#include <linux/uaccess.h>
+
+#include <asm/stacktrace.h>
+
+#include "fiq_debugger_priv.h"
+
+/*
+ * Translate the mode field of an ARM cpsr value into a three-letter name;
+ * unrecognised modes yield "???".
+ */
+static char *mode_name(unsigned cpsr)
+{
+	char *name;
+
+	switch (cpsr & MODE_MASK) {
+	case USR_MODE:
+		name = "USR";
+		break;
+	case FIQ_MODE:
+		name = "FIQ";
+		break;
+	case IRQ_MODE:
+		name = "IRQ";
+		break;
+	case SVC_MODE:
+		name = "SVC";
+		break;
+	case ABT_MODE:
+		name = "ABT";
+		break;
+	case UND_MODE:
+		name = "UND";
+		break;
+	case SYSTEM_MODE:
+		name = "SYS";
+		break;
+	default:
+		name = "???";
+		break;
+	}
+
+	return name;
+}
+
+/* Print the pc, the cpsr and the name of the cpsr's mode on one line. */
+void fiq_debugger_dump_pc(struct fiq_debugger_output *output,
+ const struct pt_regs *regs)
+{
+ output->printf(output, " pc %08x cpsr %08x mode %s\n",
+ regs->ARM_pc, regs->ARM_cpsr, mode_name(regs->ARM_cpsr));
+}
+
+/*
+ * Print the general purpose registers saved in @regs, four per line,
+ * together with the mode name and the cpsr.
+ */
+void fiq_debugger_dump_regs(struct fiq_debugger_output *output,
+ const struct pt_regs *regs)
+{
+ output->printf(output,
+ " r0 %08x r1 %08x r2 %08x r3 %08x\n",
+ regs->ARM_r0, regs->ARM_r1, regs->ARM_r2, regs->ARM_r3);
+ output->printf(output,
+ " r4 %08x r5 %08x r6 %08x r7 %08x\n",
+ regs->ARM_r4, regs->ARM_r5, regs->ARM_r6, regs->ARM_r7);
+ output->printf(output,
+ " r8 %08x r9 %08x r10 %08x r11 %08x mode %s\n",
+ regs->ARM_r8, regs->ARM_r9, regs->ARM_r10, regs->ARM_fp,
+ mode_name(regs->ARM_cpsr));
+ output->printf(output,
+ " ip %08x sp %08x lr %08x pc %08x cpsr %08x\n",
+ regs->ARM_ip, regs->ARM_sp, regs->ARM_lr, regs->ARM_pc,
+ regs->ARM_cpsr);
+}
+
+/*
+ * Banked registers of each privileged mode.
+ *
+ * get_mode_regs() fills this structure with consecutive stores, so the
+ * order of the members must stay in step with its assembly.
+ */
+struct mode_regs {
+ unsigned long sp_svc;
+ unsigned long lr_svc;
+ unsigned long spsr_svc;
+
+ unsigned long sp_abt;
+ unsigned long lr_abt;
+ unsigned long spsr_abt;
+
+ unsigned long sp_und;
+ unsigned long lr_und;
+ unsigned long spsr_und;
+
+ unsigned long sp_irq;
+ unsigned long lr_irq;
+ unsigned long spsr_irq;
+
+ unsigned long r8_fiq;
+ unsigned long r9_fiq;
+ unsigned long r10_fiq;
+ unsigned long r11_fiq;
+ unsigned long r12_fiq;
+ unsigned long sp_fiq;
+ unsigned long lr_fiq;
+ unsigned long spsr_fiq;
+};
+
+/*
+ * Snapshot the banked registers of every privileged mode into *regs.
+ *
+ * Naked function written entirely in assembly.  r0 is used as the
+ * destination pointer and is advanced by every stmia; r1 keeps the caller's
+ * cpsr; r2 carries the spsr read in one mode into the store performed after
+ * switching to the next mode.  The code switches to SVC, ABT, UND, IRQ and
+ * FIQ mode in turn (with the I and F bits set, as noted next to each msr),
+ * and restores the original cpsr before returning.  The sequence of stores
+ * matches the member order of struct mode_regs.
+ */
+static void __naked get_mode_regs(struct mode_regs *regs)
+{
+ asm volatile (
+ "mrs r1, cpsr\n"
+ "msr cpsr_c, #0xd3 @(SVC_MODE | PSR_I_BIT | PSR_F_BIT)\n"
+ "stmia r0!, {r13 - r14}\n"
+ "mrs r2, spsr\n"
+ "msr cpsr_c, #0xd7 @(ABT_MODE | PSR_I_BIT | PSR_F_BIT)\n"
+ "stmia r0!, {r2, r13 - r14}\n"
+ "mrs r2, spsr\n"
+ "msr cpsr_c, #0xdb @(UND_MODE | PSR_I_BIT | PSR_F_BIT)\n"
+ "stmia r0!, {r2, r13 - r14}\n"
+ "mrs r2, spsr\n"
+ "msr cpsr_c, #0xd2 @(IRQ_MODE | PSR_I_BIT | PSR_F_BIT)\n"
+ "stmia r0!, {r2, r13 - r14}\n"
+ "mrs r2, spsr\n"
+ "msr cpsr_c, #0xd1 @(FIQ_MODE | PSR_I_BIT | PSR_F_BIT)\n"
+ "stmia r0!, {r2, r8 - r14}\n"
+ "mrs r2, spsr\n"
+ "stmia r0!, {r2}\n"
+ "msr cpsr_c, r1\n"
+ "bx lr\n");
+}
+
+
+/*
+ * Print the registers saved in @regs followed by the banked registers of
+ * the svc, abt, und, irq and fiq modes as currently held by the CPU.
+ *
+ * The line belonging to the mode recorded in the saved cpsr is marked with
+ * a leading '*'.
+ */
+void fiq_debugger_dump_allregs(struct fiq_debugger_output *output,
+ const struct pt_regs *regs)
+{
+ struct mode_regs mode_regs;
+ unsigned long mode = regs->ARM_cpsr & MODE_MASK;
+
+ fiq_debugger_dump_regs(output, regs);
+ get_mode_regs(&mode_regs);
+
+ output->printf(output,
+ "%csvc: sp %08x lr %08x spsr %08x\n",
+ mode == SVC_MODE ? '*' : ' ',
+ mode_regs.sp_svc, mode_regs.lr_svc, mode_regs.spsr_svc);
+ output->printf(output,
+ "%cabt: sp %08x lr %08x spsr %08x\n",
+ mode == ABT_MODE ? '*' : ' ',
+ mode_regs.sp_abt, mode_regs.lr_abt, mode_regs.spsr_abt);
+ output->printf(output,
+ "%cund: sp %08x lr %08x spsr %08x\n",
+ mode == UND_MODE ? '*' : ' ',
+ mode_regs.sp_und, mode_regs.lr_und, mode_regs.spsr_und);
+ output->printf(output,
+ "%cirq: sp %08x lr %08x spsr %08x\n",
+ mode == IRQ_MODE ? '*' : ' ',
+ mode_regs.sp_irq, mode_regs.lr_irq, mode_regs.spsr_irq);
+ output->printf(output,
+ "%cfiq: r8 %08x r9 %08x r10 %08x r11 %08x r12 %08x\n",
+ mode == FIQ_MODE ? '*' : ' ',
+ mode_regs.r8_fiq, mode_regs.r9_fiq, mode_regs.r10_fiq,
+ mode_regs.r11_fiq, mode_regs.r12_fiq);
+ output->printf(output,
+ " fiq: sp %08x lr %08x spsr %08x\n",
+ mode_regs.sp_fiq, mode_regs.lr_fiq, mode_regs.spsr_fiq);
+}
+
+/* Cookie handed to report_trace() while walking a kernel stack. */
+struct stacktrace_state {
+ struct fiq_debugger_output *output; /* where the frames are printed */
+ unsigned int depth; /* frames that may still be printed */
+};
+
+/*
+ * Per-frame callback for walk_stackframe().
+ *
+ * While the depth budget lasts, prints the frame and returns 0.  Once the
+ * budget is used up it prints " ..." and returns 1 (depth is 0 at that
+ * point), which presumably tells the walker to stop - confirm against
+ * walk_stackframe().
+ */
+static int report_trace(struct stackframe *frame, void *d)
+{
+ struct stacktrace_state *sts = d;
+
+ if (sts->depth) {
+ sts->output->printf(sts->output,
+ " pc: %p (%pF), lr %p (%pF), sp %p, fp %p\n",
+ frame->pc, frame->pc, frame->lr, frame->lr,
+ frame->sp, frame->fp);
+ sts->depth--;
+ return 0;
+ }
+ sts->output->printf(sts->output, " ...\n");
+
+ return sts->depth == 0;
+}
+
+/*
+ * Three-word record read from a user stack by user_backtrace(): the next
+ * frame pointer, a stack pointer and a return address.
+ */
+struct frame_tail {
+ struct frame_tail *fp;
+ unsigned long sp;
+ unsigned long lr;
+} __attribute__((packed));
+
+/*
+ * Print one user-space stack frame and step to the next one.
+ *
+ * Copies two frame_tail records starting at @tail from user memory, prints
+ * the lr of the first one and returns the location of the next record
+ * (one frame_tail below the saved fp).
+ *
+ * Returns NULL, ending the walk, when the memory cannot be accessed or
+ * copied, or when the saved frame pointer does not lie above @tail.
+ */
+static struct frame_tail *user_backtrace(struct fiq_debugger_output *output,
+ struct frame_tail *tail)
+{
+ struct frame_tail buftail[2];
+
+ /* Also check accessibility of one struct frame_tail beyond */
+ if (!access_ok(VERIFY_READ, tail, sizeof(buftail))) {
+ output->printf(output, " invalid frame pointer %p\n",
+ tail);
+ return NULL;
+ }
+ if (__copy_from_user_inatomic(buftail, tail, sizeof(buftail))) {
+ output->printf(output,
+ " failed to copy frame pointer %p\n", tail);
+ return NULL;
+ }
+
+ output->printf(output, " %p\n", buftail[0].lr);
+
+ /* frame pointers should strictly progress back up the stack
+ * (towards higher addresses) */
+ if (tail >= buftail[0].fp)
+ return NULL;
+
+ return buftail[0].fp-1;
+}
+
+/*
+ * Print the task, the registers and a backtrace for the context in @regs.
+ *
+ * @output: sink for the text.
+ * @regs: registers of the interrupted context.
+ * @depth: maximum number of frames to print.
+ * @ssp: a stack pointer inside the interrupted context's kernel stack;
+ * only used to locate that stack's thread_info.
+ *
+ * Kernel-mode contexts are unwound with walk_stackframe(); user-mode
+ * contexts are followed through the frame pointer chain in user memory for
+ * as long as the pointer stays non-NULL and 4-byte aligned.
+ */
+void fiq_debugger_dump_stacktrace(struct fiq_debugger_output *output,
+ const struct pt_regs *regs, unsigned int depth, void *ssp)
+{
+ struct frame_tail *tail;
+ struct thread_info *real_thread_info = THREAD_INFO(ssp);
+ struct stacktrace_state sts;
+
+ sts.depth = depth;
+ sts.output = output;
+ /*
+ * Overwrite the thread_info of the stack we are running on with the
+ * interrupted one - presumably so that "current" below refers to the
+ * interrupted task; confirm.
+ */
+ *current_thread_info() = *real_thread_info;
+
+ if (!current)
+ output->printf(output, "current NULL\n");
+ else
+ output->printf(output, "pid: %d comm: %s\n",
+ current->pid, current->comm);
+ fiq_debugger_dump_regs(output, regs);
+
+ if (!user_mode(regs)) {
+ struct stackframe frame;
+ frame.fp = regs->ARM_fp;
+ frame.sp = regs->ARM_sp;
+ frame.lr = regs->ARM_lr;
+ frame.pc = regs->ARM_pc;
+ output->printf(output,
+ " pc: %p (%pF), lr %p (%pF), sp %p, fp %p\n",
+ regs->ARM_pc, regs->ARM_pc, regs->ARM_lr, regs->ARM_lr,
+ regs->ARM_sp, regs->ARM_fp);
+ walk_stackframe(&frame, report_trace, &sts);
+ return;
+ }
+
+ tail = ((struct frame_tail *) regs->ARM_fp) - 1;
+ while (depth-- && tail && !((unsigned long) tail & 3))
+ tail = user_backtrace(output, tail);
+}
diff --git a/drivers/staging/android/fiq_debugger/fiq_debugger_arm64.c b/drivers/staging/android/fiq_debugger/fiq_debugger_arm64.c
new file mode 100644
index 000000000000..97246bcbcd62
--- /dev/null
+++ b/drivers/staging/android/fiq_debugger/fiq_debugger_arm64.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright (C) 2014 Google, Inc.
+ * Author: Colin Cross <ccross@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/ptrace.h>
+#include <asm/stacktrace.h>
+
+#include "fiq_debugger_priv.h"
+
+/*
+ * Name the processor mode recorded in @regs: "USR" for a compat (32-bit)
+ * user context, otherwise the exception level and stack selection, or
+ * "???" when the mode is not one of the listed values.
+ */
+static char *mode_name(const struct pt_regs *regs)
+{
+	if (compat_user_mode(regs))
+		return "USR";
+
+	switch (processor_mode(regs)) {
+	case PSR_MODE_EL0t:
+		return "EL0t";
+	case PSR_MODE_EL1t:
+		return "EL1t";
+	case PSR_MODE_EL1h:
+		return "EL1h";
+	case PSR_MODE_EL2t:
+		return "EL2t";
+	case PSR_MODE_EL2h:
+		return "EL2h";
+	default:
+		return "???";
+	}
+}
+
+/* Print the pc, the pstate and the mode name of @regs on one line. */
+void fiq_debugger_dump_pc(struct fiq_debugger_output *output,
+ const struct pt_regs *regs)
+{
+ output->printf(output, " pc %016lx cpsr %08lx mode %s\n",
+ regs->pc, regs->pstate, mode_name(regs));
+}
+
+/*
+ * Print the registers of a compat (AArch32) context using the 32-bit
+ * register names.
+ *
+ * The values come from the 64-bit pt_regs fields, so they are printed with
+ * a long-sized conversion (as fiq_debugger_dump_pc() does) instead of "%x",
+ * which expects a 32-bit argument.  The minimum width stays at 8 digits.
+ */
+void fiq_debugger_dump_regs_aarch32(struct fiq_debugger_output *output,
+		const struct pt_regs *regs)
+{
+	output->printf(output, " r0 %08lx r1 %08lx r2 %08lx r3 %08lx\n",
+			regs->compat_usr(0), regs->compat_usr(1),
+			regs->compat_usr(2), regs->compat_usr(3));
+	output->printf(output, " r4 %08lx r5 %08lx r6 %08lx r7 %08lx\n",
+			regs->compat_usr(4), regs->compat_usr(5),
+			regs->compat_usr(6), regs->compat_usr(7));
+	output->printf(output, " r8 %08lx r9 %08lx r10 %08lx r11 %08lx\n",
+			regs->compat_usr(8), regs->compat_usr(9),
+			regs->compat_usr(10), regs->compat_usr(11));
+	output->printf(output, " ip %08lx sp %08lx lr %08lx pc %08lx\n",
+			regs->compat_usr(12), regs->compat_sp,
+			regs->compat_lr, regs->pc);
+	output->printf(output, " cpsr %08lx (%s)\n",
+			regs->pstate, mode_name(regs));
+}
+
+/*
+ * Print x0-x30, sp, pc and pstate of an AArch64 context, two registers per
+ * line.
+ *
+ * pstate is printed with the same long-sized conversion that
+ * fiq_debugger_dump_pc() uses for it, rather than "%x".
+ */
+void fiq_debugger_dump_regs_aarch64(struct fiq_debugger_output *output,
+		const struct pt_regs *regs)
+{
+
+	output->printf(output, " x0 %016lx x1 %016lx\n",
+			regs->regs[0], regs->regs[1]);
+	output->printf(output, " x2 %016lx x3 %016lx\n",
+			regs->regs[2], regs->regs[3]);
+	output->printf(output, " x4 %016lx x5 %016lx\n",
+			regs->regs[4], regs->regs[5]);
+	output->printf(output, " x6 %016lx x7 %016lx\n",
+			regs->regs[6], regs->regs[7]);
+	output->printf(output, " x8 %016lx x9 %016lx\n",
+			regs->regs[8], regs->regs[9]);
+	output->printf(output, " x10 %016lx x11 %016lx\n",
+			regs->regs[10], regs->regs[11]);
+	output->printf(output, " x12 %016lx x13 %016lx\n",
+			regs->regs[12], regs->regs[13]);
+	output->printf(output, " x14 %016lx x15 %016lx\n",
+			regs->regs[14], regs->regs[15]);
+	output->printf(output, " x16 %016lx x17 %016lx\n",
+			regs->regs[16], regs->regs[17]);
+	output->printf(output, " x18 %016lx x19 %016lx\n",
+			regs->regs[18], regs->regs[19]);
+	output->printf(output, " x20 %016lx x21 %016lx\n",
+			regs->regs[20], regs->regs[21]);
+	output->printf(output, " x22 %016lx x23 %016lx\n",
+			regs->regs[22], regs->regs[23]);
+	output->printf(output, " x24 %016lx x25 %016lx\n",
+			regs->regs[24], regs->regs[25]);
+	output->printf(output, " x26 %016lx x27 %016lx\n",
+			regs->regs[26], regs->regs[27]);
+	output->printf(output, " x28 %016lx x29 %016lx\n",
+			regs->regs[28], regs->regs[29]);
+	output->printf(output, " x30 %016lx sp %016lx\n",
+			regs->regs[30], regs->sp);
+	output->printf(output, " pc %016lx cpsr %08lx (%s)\n",
+			regs->pc, regs->pstate, mode_name(regs));
+}
+
+/*
+ * Print the registers in @regs, choosing the 32-bit layout for compat user
+ * contexts and the 64-bit layout for everything else.
+ */
+void fiq_debugger_dump_regs(struct fiq_debugger_output *output,
+		const struct pt_regs *regs)
+{
+	if (!compat_user_mode(regs)) {
+		fiq_debugger_dump_regs_aarch64(output, regs);
+		return;
+	}
+
+	fiq_debugger_dump_regs_aarch32(output, regs);
+}
+
+/*
+ * Read the system register named by @x with an mrs instruction and yield
+ * its value as a u64.  @x is pasted into the instruction as text, so it
+ * must be a bare register name.
+ */
+#define READ_SPECIAL_REG(x) ({ \
+ u64 val; \
+ asm volatile ("mrs %0, " # x : "=r"(val)); \
+ val; \
+})
+
+/*
+ * Print the registers in @regs followed by a set of system registers read
+ * from the CPU: sp_el0, elr_el1 and spsr_el1 always, and sp_el1, the
+ * AArch32 banked spsr registers, elr_el2 and spsr_el2 only when the code
+ * is itself running at EL2 (as determined from CurrentEl).
+ */
+void fiq_debugger_dump_allregs(struct fiq_debugger_output *output,
+		const struct pt_regs *regs)
+{
+	u32 pstate = READ_SPECIAL_REG(CurrentEl);
+	bool in_el2 = (pstate & PSR_MODE_MASK) >= PSR_MODE_EL2t;
+
+	fiq_debugger_dump_regs(output, regs);
+
+	output->printf(output, " sp_el0 %016lx\n",
+			READ_SPECIAL_REG(sp_el0));
+
+	if (in_el2)
+		output->printf(output, " sp_el1 %016lx\n",
+				READ_SPECIAL_REG(sp_el1));
+
+	output->printf(output, " elr_el1 %016lx\n",
+			READ_SPECIAL_REG(elr_el1));
+
+	output->printf(output, " spsr_el1 %08lx\n",
+			READ_SPECIAL_REG(spsr_el1));
+
+	if (in_el2) {
+		output->printf(output, " spsr_irq %08lx\n",
+				READ_SPECIAL_REG(spsr_irq));
+		output->printf(output, " spsr_abt %08lx\n",
+				READ_SPECIAL_REG(spsr_abt));
+		output->printf(output, " spsr_und %08lx\n",
+				READ_SPECIAL_REG(spsr_und));
+		output->printf(output, " spsr_fiq %08lx\n",
+				READ_SPECIAL_REG(spsr_fiq));
+		/* elr_el2 is an address: label and width follow elr_el1. */
+		output->printf(output, " elr_el2 %016lx\n",
+				READ_SPECIAL_REG(elr_el2));
+		output->printf(output, " spsr_el2 %08lx\n",
+				READ_SPECIAL_REG(spsr_el2));
+	}
+}
+
+/* Cookie handed to report_trace() while walking a kernel stack. */
+struct stacktrace_state {
+ struct fiq_debugger_output *output; /* where the frames are printed */
+ unsigned int depth; /* frames that may still be printed */
+};
+
+/*
+ * Per-frame callback for walk_stackframe().
+ *
+ * While the depth budget lasts, prints the symbol for the frame's pc and
+ * its pc/sp/fp values and returns 0.  Once the budget is used up it prints
+ * " ..." and returns 1 (depth is 0 at that point), which presumably tells
+ * the walker to stop - confirm against walk_stackframe().
+ */
+static int report_trace(struct stackframe *frame, void *d)
+{
+ struct stacktrace_state *sts = d;
+
+ if (sts->depth) {
+ sts->output->printf(sts->output, "%pF:\n", frame->pc);
+ sts->output->printf(sts->output,
+ " pc %016lx sp %016lx fp %016lx\n",
+ frame->pc, frame->sp, frame->fp);
+ sts->depth--;
+ return 0;
+ }
+ sts->output->printf(sts->output, " ...\n");
+
+ return sts->depth == 0;
+}
+
+/*
+ * Print the task, the registers and, for kernel-mode contexts, a backtrace
+ * of at most @depth frames for the context in @regs.
+ *
+ * @ssp is a stack pointer inside the interrupted context's kernel stack; it
+ * is only used to locate that stack's thread_info.  User-mode contexts get
+ * no backtrace here.
+ */
+void fiq_debugger_dump_stacktrace(struct fiq_debugger_output *output,
+ const struct pt_regs *regs, unsigned int depth, void *ssp)
+{
+ struct thread_info *real_thread_info = THREAD_INFO(ssp);
+ struct stacktrace_state sts;
+
+ sts.depth = depth;
+ sts.output = output;
+ /*
+ * Overwrite the thread_info of the stack we are running on with the
+ * interrupted one - presumably so that "current" below refers to the
+ * interrupted task; confirm.
+ */
+ *current_thread_info() = *real_thread_info;
+
+ if (!current)
+ output->printf(output, "current NULL\n");
+ else
+ output->printf(output, "pid: %d comm: %s\n",
+ current->pid, current->comm);
+ fiq_debugger_dump_regs(output, regs);
+
+ if (!user_mode(regs)) {
+ struct stackframe frame;
+ frame.fp = regs->regs[29];
+ frame.sp = regs->sp;
+ frame.pc = regs->pc;
+ output->printf(output, "\n");
+ walk_stackframe(current, &frame, report_trace, &sts);
+ }
+}
diff --git a/drivers/staging/android/fiq_debugger/fiq_debugger_priv.h b/drivers/staging/android/fiq_debugger/fiq_debugger_priv.h
new file mode 100644
index 000000000000..d5d051f727a8
--- /dev/null
+++ b/drivers/staging/android/fiq_debugger/fiq_debugger_priv.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) 2014 Google, Inc.
+ * Author: Colin Cross <ccross@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _FIQ_DEBUGGER_PRIV_H_
+#define _FIQ_DEBUGGER_PRIV_H_
+
+/*
+ * Round a stack pointer down to a THREAD_SIZE boundary and treat the
+ * result as the thread_info of that stack.
+ */
+#define THREAD_INFO(sp) ((struct thread_info *) \
+ ((unsigned long)(sp) & ~(THREAD_SIZE - 1)))
+
+/*
+ * Output sink used by the dump helpers.  The printf callback receives the
+ * sink itself as its first argument, followed by a printf-style format.
+ */
+struct fiq_debugger_output {
+ void (*printf)(struct fiq_debugger_output *output, const char *fmt, ...);
+};
+
+struct pt_regs;
+
+void fiq_debugger_dump_pc(struct fiq_debugger_output *output,
+ const struct pt_regs *regs);
+void fiq_debugger_dump_regs(struct fiq_debugger_output *output,
+ const struct pt_regs *regs);
+void fiq_debugger_dump_allregs(struct fiq_debugger_output *output,
+ const struct pt_regs *regs);
+void fiq_debugger_dump_stacktrace(struct fiq_debugger_output *output,
+ const struct pt_regs *regs, unsigned int depth, void *ssp);
+
+#endif
diff --git a/drivers/staging/android/fiq_debugger/fiq_debugger_ringbuf.h b/drivers/staging/android/fiq_debugger/fiq_debugger_ringbuf.h
new file mode 100644
index 000000000000..10c3c5d09098
--- /dev/null
+++ b/drivers/staging/android/fiq_debugger/fiq_debugger_ringbuf.h
@@ -0,0 +1,94 @@
+/*
+ * drivers/staging/android/fiq_debugger/fiq_debugger_ringbuf.h
+ *
+ * simple lockless ringbuffer
+ *
+ * Copyright (C) 2010 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+/* Byte ring buffer with its storage allocated inline after the header. */
+struct fiq_debugger_ringbuf {
+ int len; /* size of buf[] in bytes */
+ int head; /* index where the next byte is stored */
+ int tail; /* index of the oldest unread byte */
+ u8 buf[];
+};
+
+
+/*
+ * Allocate an empty ring buffer with @len bytes of storage.
+ * Returns NULL when the allocation fails.
+ */
+static inline struct fiq_debugger_ringbuf *fiq_debugger_ringbuf_alloc(int len)
+{
+ struct fiq_debugger_ringbuf *rbuf;
+
+ rbuf = kzalloc(sizeof(*rbuf) + len, GFP_KERNEL);
+ if (rbuf == NULL)
+ return NULL;
+
+ rbuf->len = len;
+ rbuf->head = 0;
+ rbuf->tail = 0;
+ /* Order the initialisation before the buffer is handed out. */
+ smp_mb();
+
+ return rbuf;
+}
+
+/* Release a ring buffer obtained from fiq_debugger_ringbuf_alloc(). */
+static inline void fiq_debugger_ringbuf_free(struct fiq_debugger_ringbuf *rbuf)
+{
+ kfree(rbuf);
+}
+
+/* Number of bytes currently stored in the ring buffer. */
+static inline int fiq_debugger_ringbuf_level(struct fiq_debugger_ringbuf *rbuf)
+{
+	int used = rbuf->head - rbuf->tail;
+
+	/* A negative difference means head has wrapped around behind tail. */
+	return used < 0 ? rbuf->len + used : used;
+}
+
+/*
+ * Number of bytes that can still be pushed.  One slot is always left
+ * unused, so the usable capacity is len - 1.
+ */
+static inline int fiq_debugger_ringbuf_room(struct fiq_debugger_ringbuf *rbuf)
+{
+	int used = fiq_debugger_ringbuf_level(rbuf);
+
+	return rbuf->len - used - 1;
+}
+
+/* Return the byte @i positions past the tail without consuming it. */
+static inline u8
+fiq_debugger_ringbuf_peek(struct fiq_debugger_ringbuf *rbuf, int i)
+{
+	int pos = (rbuf->tail + i) % rbuf->len;
+
+	return rbuf->buf[pos];
+}
+
+/*
+ * Drop up to @count bytes from the tail of the buffer.
+ * Returns the number of bytes actually dropped, which is limited to the
+ * number of bytes stored.
+ */
+static inline int
+fiq_debugger_ringbuf_consume(struct fiq_debugger_ringbuf *rbuf, int count)
+{
+ count = min(count, fiq_debugger_ringbuf_level(rbuf));
+
+ rbuf->tail = (rbuf->tail + count) % rbuf->len;
+ smp_mb();
+
+ return count;
+}
+
+/*
+ * Append one byte to the buffer.
+ * Returns 1 when the byte was stored, 0 when the buffer is full.
+ */
+static inline int
+fiq_debugger_ringbuf_push(struct fiq_debugger_ringbuf *rbuf, u8 datum)
+{
+ if (fiq_debugger_ringbuf_room(rbuf) == 0)
+ return 0;
+
+ rbuf->buf[rbuf->head] = datum;
+ /* The data must be written before head is advanced past it. */
+ smp_mb();
+ rbuf->head = (rbuf->head + 1) % rbuf->len;
+ smp_mb();
+
+ return 1;
+}
diff --git a/drivers/staging/android/fiq_debugger/fiq_watchdog.c b/drivers/staging/android/fiq_debugger/fiq_watchdog.c
new file mode 100644
index 000000000000..194b54138417
--- /dev/null
+++ b/drivers/staging/android/fiq_debugger/fiq_watchdog.c
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) 2014 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/pstore_ram.h>
+
+#include "fiq_watchdog.h"
+#include "fiq_debugger_priv.h"
+
+static DEFINE_RAW_SPINLOCK(fiq_watchdog_lock);
+
+/*
+ * printf callback of the watchdog output sink: format into a 256-byte
+ * stack buffer (longer output is truncated) and hand the text to
+ * ramoops_console_write_buf().
+ */
+static void fiq_watchdog_printf(struct fiq_debugger_output *output,
+ const char *fmt, ...)
+{
+ char buf[256];
+ va_list ap;
+ int len;
+
+ va_start(ap, fmt);
+ len = vscnprintf(buf, sizeof(buf), fmt, ap);
+ va_end(ap);
+
+ ramoops_console_write_buf(buf, len);
+}
+
+/* Output sink that routes the dump helpers' text through fiq_watchdog_printf(). */
+struct fiq_debugger_output fiq_watchdog_output = {
+ .printf = fiq_watchdog_printf,
+};
+
+/*
+ * Record a watchdog FIQ: write a header naming the CPU (taken from the
+ * thread_info located through @svc_sp) followed by a stack trace of up to
+ * 100 frames for @regs, all through the watchdog output sink.
+ *
+ * The whole report is written under fiq_watchdog_lock.
+ */
+void fiq_watchdog_triggered(const struct pt_regs *regs, void *svc_sp)
+{
+ char msg[24];
+ int len;
+
+ raw_spin_lock(&fiq_watchdog_lock);
+
+ len = scnprintf(msg, sizeof(msg), "watchdog fiq cpu %d\n",
+ THREAD_INFO(svc_sp)->cpu);
+ ramoops_console_write_buf(msg, len);
+
+ fiq_debugger_dump_stacktrace(&fiq_watchdog_output, regs, 100, svc_sp);
+
+ raw_spin_unlock(&fiq_watchdog_lock);
+}
diff --git a/drivers/staging/android/fiq_debugger/fiq_watchdog.h b/drivers/staging/android/fiq_debugger/fiq_watchdog.h
new file mode 100644
index 000000000000..c6b507f8d976
--- /dev/null
+++ b/drivers/staging/android/fiq_debugger/fiq_watchdog.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (C) 2014 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _FIQ_WATCHDOG_H_
+#define _FIQ_WATCHDOG_H_
+
+void fiq_watchdog_triggered(const struct pt_regs *regs, void *svc_sp);
+
+#endif
diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c
index ec3b66561412..687be3615ff7 100644
--- a/drivers/staging/android/lowmemorykiller.c
+++ b/drivers/staging/android/lowmemorykiller.c
@@ -43,6 +43,9 @@
#include <linux/profile.h>
#include <linux/notifier.h>
+#define CREATE_TRACE_POINTS
+#include "trace/lowmemorykiller.h"
+
static u32 lowmem_debug_level = 1;
static short lowmem_adj[6] = {
0,
@@ -93,6 +96,7 @@ static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc)
int other_free = global_page_state(NR_FREE_PAGES) - totalreserve_pages;
int other_file = global_node_page_state(NR_FILE_PAGES) -
global_node_page_state(NR_SHMEM) -
+ global_node_page_state(NR_UNEVICTABLE) -
total_swapcache_pages();
if (lowmem_adj_size < array_size)
@@ -160,23 +164,27 @@ static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc)
p->comm, p->pid, oom_score_adj, tasksize);
}
if (selected) {
+ long cache_size = other_file * (long)(PAGE_SIZE / 1024);
+ long cache_limit = minfree * (long)(PAGE_SIZE / 1024);
+ long free = other_free * (long)(PAGE_SIZE / 1024);
+
task_lock(selected);
send_sig(SIGKILL, selected, 0);
if (selected->mm)
task_set_lmk_waiting(selected);
task_unlock(selected);
- lowmem_print(1, "Killing '%s' (%d), adj %hd,\n"
+ trace_lowmemory_kill(selected, cache_size, cache_limit, free);
+ lowmem_print(1, "Killing '%s' (%d) (tgid %d), adj %hd,\n"
" to free %ldkB on behalf of '%s' (%d) because\n"
" cache %ldkB is below limit %ldkB for oom_score_adj %hd\n"
" Free memory is %ldkB above reserved\n",
- selected->comm, selected->pid,
+ selected->comm, selected->pid, selected->tgid,
selected_oom_score_adj,
selected_tasksize * (long)(PAGE_SIZE / 1024),
current->comm, current->pid,
- other_file * (long)(PAGE_SIZE / 1024),
- minfree * (long)(PAGE_SIZE / 1024),
+ cache_size, cache_limit,
min_score_adj,
- other_free * (long)(PAGE_SIZE / 1024));
+ free);
lowmem_deathpending_timeout = jiffies + HZ;
rem += selected_tasksize;
}
@@ -200,12 +208,96 @@ static int __init lowmem_init(void)
}
device_initcall(lowmem_init);
+#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER_AUTODETECT_OOM_ADJ_VALUES
+/*
+ * Convert a legacy oom_adj value to an oom_score_adj value.  OOM_ADJUST_MAX
+ * maps exactly to OOM_SCORE_ADJ_MAX; any other value is multiplied by
+ * OOM_SCORE_ADJ_MAX and then divided by -OOM_DISABLE (integer arithmetic).
+ */
+static short lowmem_oom_adj_to_oom_score_adj(short oom_adj)
+{
+	if (oom_adj != OOM_ADJUST_MAX)
+		return (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
+
+	return OOM_SCORE_ADJ_MAX;
+}
+
+/*
+ * Detect whether lowmem_adj[] currently holds legacy oom_adj values and, if
+ * so, rewrite every populated entry in place as an oom_score_adj value.
+ *
+ * The decision is made from the last populated entry only: the array is
+ * converted when that entry is no larger than OOM_ADJUST_MAX and its converted
+ * value is larger than OOM_ADJUST_MAX.  Nothing happens for an empty array.
+ */
+static void lowmem_autodetect_oom_adj_values(void)
+{
+	int count = ARRAY_SIZE(lowmem_adj);
+	short last_adj;
+	int idx;
+
+	if (lowmem_adj_size < count)
+		count = lowmem_adj_size;
+	if (count <= 0)
+		return;
+
+	last_adj = lowmem_adj[count - 1];
+	if (last_adj > OOM_ADJUST_MAX ||
+	    lowmem_oom_adj_to_oom_score_adj(last_adj) <= OOM_ADJUST_MAX)
+		return;
+
+	lowmem_print(1, "lowmem_shrink: convert oom_adj to oom_score_adj:\n");
+	for (idx = 0; idx < count; idx++) {
+		short old_adj = lowmem_adj[idx];
+		short new_adj = lowmem_oom_adj_to_oom_score_adj(old_adj);
+
+		lowmem_adj[idx] = new_adj;
+		lowmem_print(1, "oom_adj %d => oom_score_adj %d\n",
+			     old_adj, new_adj);
+	}
+}
+
+/*
+ * "set" handler for the adj parameter: stores the array through the stock
+ * param_array_ops.set() and then runs the oom_adj autodetection.  The
+ * autodetection runs regardless of the store result; the store result is
+ * what gets returned.
+ */
+static int lowmem_adj_array_set(const char *val, const struct kernel_param *kp)
+{
+	int store_result = param_array_ops.set(val, kp);
+
+	/* HACK: Autodetect oom_adj values in lowmem_adj array */
+	lowmem_autodetect_oom_adj_values();
+
+	return store_result;
+}
+
+/* "get" handler for the adj parameter: defers to param_array_ops.get(). */
+static int lowmem_adj_array_get(char *buffer, const struct kernel_param *kp)
+{
+ return param_array_ops.get(buffer, kp);
+}
+
+/* "free" handler for the adj parameter: defers to param_array_ops.free(). */
+static void lowmem_adj_array_free(void *arg)
+{
+ param_array_ops.free(arg);
+}
+
+/*
+ * Parameter operations for "adj": the stock array handlers, with oom_adj
+ * autodetection added on the set path.  The table is never modified, so it
+ * is const.
+ */
+static const struct kernel_param_ops lowmem_adj_array_ops = {
+	.set = lowmem_adj_array_set,
+	.get = lowmem_adj_array_get,
+	.free = lowmem_adj_array_free,
+};
+
+/*
+ * Array descriptor for the "adj" parameter: elements live in lowmem_adj,
+ * at most ARRAY_SIZE(lowmem_adj) of them, each parsed with param_ops_short;
+ * the element count is kept in lowmem_adj_size.
+ */
+static const struct kparam_array __param_arr_adj = {
+ .max = ARRAY_SIZE(lowmem_adj),
+ .num = &lowmem_adj_size,
+ .ops = &param_ops_short,
+ .elemsize = sizeof(lowmem_adj[0]),
+ .elem = lowmem_adj,
+};
+#endif
+
/*
* not really modular, but the easiest way to keep compat with existing
* bootargs behaviour is to continue using module_param here.
*/
module_param_named(cost, lowmem_shrinker.seeks, int, 0644);
+#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER_AUTODETECT_OOM_ADJ_VALUES
+module_param_cb(adj, &lowmem_adj_array_ops,
+ .arr = &__param_arr_adj,
+ 0644);
+__MODULE_PARM_TYPE(adj, "array of short");
+#else
module_param_array_named(adj, lowmem_adj, short, &lowmem_adj_size, 0644);
+#endif
module_param_array_named(minfree, lowmem_minfree, uint, &lowmem_minfree_size,
0644);
module_param_named(debug_level, lowmem_debug_level, uint, 0644);
diff --git a/drivers/staging/android/trace/lowmemorykiller.h b/drivers/staging/android/trace/lowmemorykiller.h
new file mode 100644
index 000000000000..f43d3fae75ee
--- /dev/null
+++ b/drivers/staging/android/trace/lowmemorykiller.h
@@ -0,0 +1,41 @@
+#undef TRACE_SYSTEM
+#define TRACE_INCLUDE_PATH ../../drivers/staging/android/trace
+#define TRACE_SYSTEM lowmemorykiller
+
+#if !defined(_TRACE_LOWMEMORYKILLER_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_LOWMEMORYKILLER_H
+
+#include <linux/tracepoint.h>
+
+/*
+ * lowmemory_kill: records the comm and pid of @killed_task together with the
+ * cache_size, cache_limit and free values supplied by the caller.  The three
+ * values are printed with a kilobyte suffix.
+ */
+TRACE_EVENT(lowmemory_kill,
+ TP_PROTO(struct task_struct *killed_task, long cache_size, \
+ long cache_limit, long free),
+
+ TP_ARGS(killed_task, cache_size, cache_limit, free),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN)
+ __field(pid_t, pid)
+ __field(long, pagecache_size)
+ __field(long, pagecache_limit)
+ __field(long, free)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, killed_task->comm, TASK_COMM_LEN);
+ __entry->pid = killed_task->pid;
+ __entry->pagecache_size = cache_size;
+ __entry->pagecache_limit = cache_limit;
+ __entry->free = free;
+ ),
+
+ TP_printk("%s (%d), page cache %ldkB (limit %ldkB), free %ldKb",
+ __entry->comm, __entry->pid, __entry->pagecache_size,
+ __entry->pagecache_limit, __entry->free)
+);
+
+
+#endif /* if !defined(_TRACE_LOWMEMORYKILLER_H) || defined(TRACE_HEADER_MULTI_READ) */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/drivers/staging/android/uapi/vsoc_shm.h b/drivers/staging/android/uapi/vsoc_shm.h
new file mode 100644
index 000000000000..741b1387c25b
--- /dev/null
+++ b/drivers/staging/android/uapi/vsoc_shm.h
@@ -0,0 +1,303 @@
+/*
+ * Copyright (C) 2017 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _UAPI_LINUX_VSOC_SHM_H
+#define _UAPI_LINUX_VSOC_SHM_H
+
+#include <linux/types.h>
+
+/**
+ * A permission is a token that permits a receiver to read and/or write an area
+ * of memory within a Vsoc region.
+ *
+ * An fd_scoped permission grants both read and write access, and can be
+ * attached to a file description (see open(2)).
+ * Ownership of the area can then be shared by passing a file descriptor
+ * among processes.
+ *
+ * begin_offset and end_offset define the area of memory that is controlled by
+ * the permission. owner_offset points to a word, also in shared memory, that
+ * controls ownership of the area.
+ *
+ * ownership of the region expires when the associated file description is
+ * released.
+ *
+ * At most one permission can be attached to each file description.
+ *
+ * This is useful when implementing HALs like gralloc that scope and pass
+ * ownership of shared resources via file descriptors.
+ *
+ * The caller is responsible for doing any fencing.
+ *
+ * The calling process will normally identify a currently free area of
+ * memory. It will construct a proposed fd_scoped_permission_arg structure:
+ *
+ * begin_offset and end_offset describe the area being claimed
+ *
+ * owner_offset points to the location in shared memory that indicates the
+ * owner of the area.
+ *
+ * owned_value is the value that will be stored in owner_offset iff the
+ * permission can be granted. It must be different than VSOC_REGION_FREE.
+ *
+ * Two fd_scoped_permission structures are compatible if they vary only by
+ * their owned_value fields.
+ *
+ * The driver ensures that, for any group of simultaneous callers proposing
+ * compatible fd_scoped_permissions, it will accept exactly one of the
+ * proposals. The other callers will get a failure with errno of EAGAIN.
+ *
+ * A process receiving a file descriptor can identify the region being
+ * granted using the VSOC_GET_FD_SCOPED_PERMISSION ioctl.
+ */
+struct fd_scoped_permission {
+ __u32 begin_offset;
+ __u32 end_offset;
+ __u32 owner_offset;
+ __u32 owned_value;
+};
+
+/*
+ * This value represents a free area of memory. The driver expects to see this
+ * value at owner_offset when creating a permission otherwise it will not do it,
+ * and will write this value back once the permission is no longer needed.
+ */
+#define VSOC_REGION_FREE ((__u32)0)
+
+/**
+ * ioctl argument for VSOC_CREATE_FD_SCOPED_PERMISSION
+ */
+struct fd_scoped_permission_arg {
+ struct fd_scoped_permission perm;
+ __s32 managed_region_fd;
+};
+
+#define VSOC_NODE_FREE ((__u32)0)
+
+/*
+ * Describes a signal table in shared memory. Each non-zero entry in the
+ * table indicates that the receiver should signal the futex at the given
+ * offset. Offsets are relative to the region, not the shared memory window.
+ *
+ * interrupt_signalled_offset is used to reliably signal interrupts across the
+ * vmm boundary. There are two roles: transmitter and receiver. For example,
+ * in the host_to_guest_signal_table the host is the transmitter and the
+ * guest is the receiver. The protocol is as follows:
+ *
+ * 1. The transmitter should convert the offset of the futex to an offset
+ * in the signal table [0, (1 << num_nodes_lg2))
+ * The transmitter can choose any appropriate hashing algorithm, including
+ * hash = futex_offset & ((1 << num_nodes_lg2) - 1)
+ *
+ * 2. The transmitter should atomically compare and swap futex_offset with 0
+ * at hash. There are 3 possible outcomes
+ * a. The swap fails because the futex_offset is already in the table.
+ * The transmitter should stop.
+ * b. Some other offset is in the table. This is a hash collision. The
+ * transmitter should move to another table slot and try again. One
+ * possible algorithm:
+ * hash = (hash + 1) & ((1 << num_nodes_lg2) - 1)
+ * c. The swap worked. Continue below.
+ *
+ * 3. The transmitter atomically swaps 1 with the value at the
+ * interrupt_signalled_offset. There are two outcomes:
+ * a. The prior value was 1. In this case an interrupt has already been
+ * posted. The transmitter is done.
+ * b. The prior value was 0, indicating that the receiver may be sleeping.
+ * The transmitter will issue an interrupt.
+ *
+ * 4. On waking the receiver immediately exchanges a 0 with the
+ * interrupt_signalled_offset. If it receives a 0 then this is a spurious
+ * interrupt. That may occasionally happen in the current protocol, but
+ * should be rare.
+ *
+ * 5. The receiver scans the signal table by atomically exchanging 0 at each
+ * location. If a non-zero offset is returned from the exchange the
+ * receiver wakes all sleepers at the given offset:
+ * futex((int*)(region_base + old_value), FUTEX_WAKE, MAX_INT);
+ *
+ * 6. The receiver thread then does a conditional wait, waking immediately
+ * if the value at interrupt_signalled_offset is non-zero. This catches cases
+ * where additional signals were posted while the table was being scanned.
+ * On the guest the wait is handled via the VSOC_WAIT_FOR_INCOMING_INTERRUPT
+ * ioctl.
+ */
+struct vsoc_signal_table_layout {
+ /* log_2(Number of signal table entries) */
+ __u32 num_nodes_lg2;
+ /*
+ * Offset to the first signal table entry relative to the start of the
+ * region
+ */
+ __u32 futex_uaddr_table_offset;
+ /*
+ * Offset to an atomic_t / atomic uint32_t. A non-zero value indicates
+ * that one or more offsets are currently posted in the table.
+ * semi-unique access to an entry in the table
+ */
+ __u32 interrupt_signalled_offset;
+};
+
+#define VSOC_REGION_WHOLE ((__s32)0)
+#define VSOC_DEVICE_NAME_SZ 16
+
+/**
+ * Each HAL would (usually) talk to a single device region
+ * Multiple entities care about these regions:
+ * - The ivshmem_server will populate the regions in shared memory
+ * - The guest kernel will read the region, create minor device nodes, and
+ * allow interested parties to register for FUTEX_WAKE events in the region
+ * - HALs will access via the minor device nodes published by the guest kernel
+ * - Host side processes will access the region via the ivshmem_server:
+ * 1. Pass name to ivshmem_server at a UNIX socket
+ * 2. ivshmemserver will reply with 2 fds:
+ * - host->guest doorbell fd
+ * - guest->host doorbell fd
+ * - fd for the shared memory region
+ * - region offset
+ * 3. Start a futex receiver thread on the doorbell fd pointed at the
+ * signal_nodes
+ */
+struct vsoc_device_region {
+ __u16 current_version;
+ __u16 min_compatible_version;
+ __u32 region_begin_offset;
+ __u32 region_end_offset;
+ __u32 offset_of_region_data;
+ struct vsoc_signal_table_layout guest_to_host_signal_table;
+ struct vsoc_signal_table_layout host_to_guest_signal_table;
+ /* Name of the device. Must always be terminated with a '\0', so
+ * the longest supported device name is 15 characters.
+ */
+ char device_name[VSOC_DEVICE_NAME_SZ];
+ /* There are two ways that permissions to access regions are handled:
+ * - When managed_by is VSOC_REGION_WHOLE, any process that can
+ * open the device node for the region gains complete access to it.
+ * - When managed_by is set, processes that open the region cannot
+ * access it. Access to a sub-region must be established by invoking
+ * the VSOC_CREATE_FD_SCOPED_PERMISSION ioctl on the region
+ * referenced in managed_by, providing a file instance
+ * (represented by a fd) opened on this region.
+ */
+ __u32 managed_by;
+};
+
+/*
+ * The vsoc layout descriptor.
+ * The first 4K should be reserved for the shm header and region descriptors.
+ * The regions should be page aligned.
+ */
+
+struct vsoc_shm_layout_descriptor {
+ __u16 major_version;
+ __u16 minor_version;
+
+ /* size of the shm. This may be redundant but nice to have */
+ __u32 size;
+
+ /* number of shared memory regions */
+ __u32 region_count;
+
+ /* The offset to the start of region descriptors */
+ __u32 vsoc_region_desc_offset;
+};
+
+/*
+ * This specifies the current version that should be stored in
+ * vsoc_shm_layout_descriptor.major_version and
+ * vsoc_shm_layout_descriptor.minor_version.
+ * It should be updated only if the vsoc_device_region and
+ * vsoc_shm_layout_descriptor structures have changed.
+ * Versioning within each region is transferred
+ * via the min_compatible_version and current_version fields in
+ * vsoc_device_region. The driver does not consult these fields: they are left
+ * for the HALs and host processes and will change independently of the layout
+ * version.
+ */
+#define CURRENT_VSOC_LAYOUT_MAJOR_VERSION 2
+#define CURRENT_VSOC_LAYOUT_MINOR_VERSION 0
+
+#define VSOC_CREATE_FD_SCOPED_PERMISSION \
+ _IOW(0xF5, 0, struct fd_scoped_permission)
+#define VSOC_GET_FD_SCOPED_PERMISSION _IOR(0xF5, 1, struct fd_scoped_permission)
+
+/*
+ * This is used to signal the host to scan the guest_to_host_signal_table
+ * for new futexes to wake. This sends an interrupt if one is not already
+ * in flight.
+ */
+#define VSOC_MAYBE_SEND_INTERRUPT_TO_HOST _IO(0xF5, 2)
+
+/*
+ * When this returns the guest will scan host_to_guest_signal_table to
+ * check for new futexes to wake.
+ */
+/* TODO(ghartman): Consider moving this to the bottom half */
+#define VSOC_WAIT_FOR_INCOMING_INTERRUPT _IO(0xF5, 3)
+
+/*
+ * Guest HALs will use this to retrieve the region description after
+ * opening their device node.
+ */
+#define VSOC_DESCRIBE_REGION _IOR(0xF5, 4, struct vsoc_device_region)
+
+/*
+ * Wake any threads that may be waiting for a host interrupt on this region.
+ * This is mostly used during shutdown.
+ */
+#define VSOC_SELF_INTERRUPT _IO(0xF5, 5)
+
+/*
+ * This is used to signal the host to scan the guest_to_host_signal_table
+ * for new futexes to wake. This sends an interrupt unconditionally.
+ */
+#define VSOC_SEND_INTERRUPT_TO_HOST _IO(0xF5, 6)
+
+enum wait_types {
+ VSOC_WAIT_UNDEFINED = 0,
+ VSOC_WAIT_IF_EQUAL = 1,
+ VSOC_WAIT_IF_EQUAL_TIMEOUT = 2
+};
+
+/*
+ * Wait for a condition to be true
+ *
+ * Note, this is sized and aligned so the 32 bit and 64 bit layouts are
+ * identical.
+ */
+struct vsoc_cond_wait {
+ /* Input: Offset of the 32 bit word to check */
+ __u32 offset;
+ /* Input: Value that will be compared with the offset */
+ __u32 value;
+ /* Input: Monotonic time to wake at, seconds part */
+ __u64 wake_time_sec;
+ /* Input: Monotonic time to wake at, nanoseconds part */
+ __u32 wake_time_nsec;
+ /* Input: Type of wait (one of enum wait_types) */
+ __u32 wait_type;
+ /* Output: Number of times the thread woke before returning. */
+ __u32 wakes;
+ /* Ensure that we're 8-byte aligned and 8 byte length for 32/64 bit
+ * compatibility.
+ */
+ __u32 reserved_1;
+};
+
+#define VSOC_COND_WAIT _IOWR(0xF5, 7, struct vsoc_cond_wait)
+
+/* Wake any local threads waiting at the offset given in arg */
+#define VSOC_COND_WAKE _IO(0xF5, 8)
+
+#endif /* _UAPI_LINUX_VSOC_SHM_H */
diff --git a/drivers/staging/android/vsoc.c b/drivers/staging/android/vsoc.c
new file mode 100644
index 000000000000..954ed2c5d807
--- /dev/null
+++ b/drivers/staging/android/vsoc.c
@@ -0,0 +1,1165 @@
+/*
+ * drivers/staging/android/vsoc.c
+ *
+ * Android Virtual System on a Chip (VSoC) driver
+ *
+ * Copyright (C) 2017 Google, Inc.
+ *
+ * Author: ghartman@google.com
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ *
+ * Based on drivers/char/kvm_ivshmem.c - driver for KVM Inter-VM shared memory
+ * Copyright 2009 Cam Macdonell <cam@cs.ualberta.ca>
+ *
+ * Based on cirrusfb.c and 8139cp.c:
+ * Copyright 1999-2001 Jeff Garzik
+ * Copyright 2001-2004 Jeff Garzik
+ */
+
+#include <linux/dma-mapping.h>
+#include <linux/freezer.h>
+#include <linux/futex.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/syscalls.h>
+#include <linux/uaccess.h>
+#include <linux/interrupt.h>
+#include <linux/mutex.h>
+#include <linux/cdev.h>
+#include <linux/file.h>
+#include "uapi/vsoc_shm.h"
+
+#define VSOC_DEV_NAME "vsoc"
+
+/*
+ * Description of the ivshmem-doorbell PCI device used by QEmu. These
+ * constants follow docs/specs/ivshmem-spec.txt, which can be found in
+ * the QEmu repository. This was last reconciled with the version that
+ * came out with 2.8
+ */
+
+/*
+ * These constants are the KVM Inter-VM shared memory device
+ * register offsets
+ */
+enum {
+ INTR_MASK = 0x00, /* Interrupt Mask */
+ INTR_STATUS = 0x04, /* Interrupt Status */
+ IV_POSITION = 0x08, /* VM ID */
+ DOORBELL = 0x0c, /* Doorbell */
+};
+
+static const int REGISTER_BAR; /* Equal to 0 */
+static const int MAX_REGISTER_BAR_LEN = 0x100;
+/*
+ * The MSI-x BAR is not used directly.
+ *
+ * static const int MSI_X_BAR = 1;
+ */
+static const int SHARED_MEMORY_BAR = 2;
+
+struct vsoc_region_data {
+ char name[VSOC_DEVICE_NAME_SZ + 1];
+ wait_queue_head_t interrupt_wait_queue;
+ /* TODO(b/73664181): Use multiple futex wait queues */
+ wait_queue_head_t futex_wait_queue;
+ /* Flag indicating that an interrupt has been signalled by the host. */
+ atomic_t *incoming_signalled;
+ /* Flag indicating the guest has signalled the host. */
+ atomic_t *outgoing_signalled;
+ bool irq_requested;
+ bool device_created;
+};
+
+struct vsoc_device {
+ /* Kernel virtual address of REGISTER_BAR. */
+ void __iomem *regs;
+ /* Physical address of SHARED_MEMORY_BAR. */
+ phys_addr_t shm_phys_start;
+ /* Kernel virtual address of SHARED_MEMORY_BAR. */
+ void __iomem *kernel_mapped_shm;
+ /* Size of the entire shared memory window in bytes. */
+ size_t shm_size;
+ /*
+ * Pointer to the virtual address of the shared memory layout structure.
+ * This is probably identical to kernel_mapped_shm, but saving this
+ * here saves a lot of annoying casts.
+ */
+ struct vsoc_shm_layout_descriptor *layout;
+ /*
+ * Points to a table of region descriptors in the kernel's virtual
+ * address space. Calculated from
+ * vsoc_shm_layout_descriptor.vsoc_region_desc_offset
+ */
+ struct vsoc_device_region *regions;
+ /* Head of a list of permissions that have been granted. */
+ struct list_head permissions;
+ struct pci_dev *dev;
+ /* Per-region (and therefore per-interrupt) information. */
+ struct vsoc_region_data *regions_data;
+ /*
+ * Table of msi-x entries. This has to be separated from struct
+ * vsoc_region_data because the kernel deals with them as an array.
+ */
+ struct msix_entry *msix_entries;
+ /* Mutex that protects the permission list */
+ struct mutex mtx;
+ /* Major number assigned by the kernel */
+ int major;
+ /* Character device assigned by the kernel */
+ struct cdev cdev;
+ /* Device class assigned by the kernel */
+ struct class *class;
+ /*
+ * Flags that indicate what we've initialized. These are used to do an
+ * orderly cleanup of the device.
+ */
+ bool enabled_device;
+ bool requested_regions;
+ bool cdev_added;
+ bool class_added;
+ bool msix_enabled;
+};
+
+static struct vsoc_device vsoc_dev;
+
+/*
+ * TODO(ghartman): Add a /sys filesystem entry that summarizes the permissions.
+ */
+
+struct fd_scoped_permission_node {
+ struct fd_scoped_permission permission;
+ struct list_head list;
+};
+
+struct vsoc_private_data {
+ struct fd_scoped_permission_node *fd_scoped_permission_node;
+};
+
+static long vsoc_ioctl(struct file *, unsigned int, unsigned long);
+static int vsoc_mmap(struct file *, struct vm_area_struct *);
+static int vsoc_open(struct inode *, struct file *);
+static int vsoc_release(struct inode *, struct file *);
+static ssize_t vsoc_read(struct file *, char __user *, size_t, loff_t *);
+static ssize_t vsoc_write(struct file *, const char __user *, size_t, loff_t *);
+static loff_t vsoc_lseek(struct file *filp, loff_t offset, int origin);
+static int do_create_fd_scoped_permission(
+ struct vsoc_device_region *region_p,
+ struct fd_scoped_permission_node *np,
+ struct fd_scoped_permission_arg __user *arg);
+static void do_destroy_fd_scoped_permission(
+ struct vsoc_device_region *owner_region_p,
+ struct fd_scoped_permission *perm);
+static long do_vsoc_describe_region(struct file *,
+ struct vsoc_device_region __user *);
+static ssize_t vsoc_get_area(struct file *filp, __u32 *perm_off);
+
+/**
+ * Validate arguments on entry points to the driver.
+ *
+ * vsoc_validate_inode() accepts an inode only when its minor number is below
+ * the region_count published in the shared-memory layout.  Returns 0 on
+ * success; otherwise logs an error and returns -ENODEV.
+ *
+ * The helper is static: no declaration of it is visible outside this file,
+ * and a bare "inline" definition is not guaranteed to emit an out-of-line
+ * copy under C99 inline semantics.
+ */
+static inline int vsoc_validate_inode(struct inode *inode)
+{
+	if (iminor(inode) >= vsoc_dev.layout->region_count) {
+		dev_err(&vsoc_dev.dev->dev,
+			"describe_region: invalid region %d\n", iminor(inode));
+		return -ENODEV;
+	}
+	return 0;
+}
+
+/*
+ * Validate a file handed to a driver entry point: its inode must name a
+ * valid region (see vsoc_validate_inode()) and private_data must be set.
+ * Returns 0 on success, the inode validation error, or -EBADFD when the file
+ * carries no private data.
+ *
+ * The helper is static: no declaration of it is visible outside this file,
+ * and a bare "inline" definition is not guaranteed to emit an out-of-line
+ * copy under C99 inline semantics.
+ */
+static inline int vsoc_validate_filep(struct file *filp)
+{
+	int ret = vsoc_validate_inode(file_inode(filp));
+
+	if (ret)
+		return ret;
+	if (!filp->private_data) {
+		dev_err(&vsoc_dev.dev->dev,
+			"No private data on fd, region %d\n",
+			iminor(file_inode(filp)));
+		return -EBADFD;
+	}
+	return 0;
+}
+
+/*
+ * Converts from shared memory offset to virtual address.
+ * @offset is added to vsoc_dev.kernel_mapped_shm; no bounds check is done
+ * here, so callers must validate the offset first.
+ */
+static inline void *shm_off_to_virtual_addr(__u32 offset)
+{
+ return (void __force *)vsoc_dev.kernel_mapped_shm + offset;
+}
+
+/*
+ * Converts from shared memory offset to physical address.
+ * @offset is added to vsoc_dev.shm_phys_start; no bounds check is done here.
+ */
+static inline phys_addr_t shm_off_to_phys_addr(__u32 offset)
+{
+ return vsoc_dev.shm_phys_start + offset;
+}
+
+/**
+ * Convenience functions to obtain the region from the inode or file.
+ * Dangerous to call before validating the inode/file.
+ *
+ * vsoc_region_from_inode() indexes vsoc_dev.regions with the inode's minor
+ * number and performs no range check of its own.
+ */
+static inline struct vsoc_device_region *vsoc_region_from_inode(
+ struct inode *inode)
+{
+ return &vsoc_dev.regions[iminor(inode)];
+}
+
+/*
+ * Same lookup as vsoc_region_from_inode(), starting from a struct file.
+ * NOTE(review): the parameter is named "inode" but is a struct file pointer.
+ */
+static inline struct vsoc_device_region *vsoc_region_from_filep(
+ struct file *inode)
+{
+ return vsoc_region_from_inode(file_inode(inode));
+}
+
+/* Size of a region in bytes: region_end_offset minus region_begin_offset. */
+static inline uint32_t vsoc_device_region_size(struct vsoc_device_region *r)
+{
+	uint32_t first = r->region_begin_offset;
+	uint32_t limit = r->region_end_offset;
+
+	return limit - first;
+}
+
+/*
+ * File operations for the vsoc device nodes.  The same handler, vsoc_ioctl,
+ * serves both unlocked_ioctl and compat_ioctl.
+ */
+static const struct file_operations vsoc_ops = {
+ .owner = THIS_MODULE,
+ .open = vsoc_open,
+ .mmap = vsoc_mmap,
+ .read = vsoc_read,
+ .unlocked_ioctl = vsoc_ioctl,
+ .compat_ioctl = vsoc_ioctl,
+ .write = vsoc_write,
+ .llseek = vsoc_lseek,
+ .release = vsoc_release,
+};
+
+/*
+ * PCI ids this driver binds to: vendor 0x1af4, device 0x1110, any subsystem
+ * ids.  The table is read-only, so it is declared const.
+ */
+static const struct pci_device_id vsoc_id_table[] = {
+	{0x1af4, 0x1110, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+	{0},
+};
+
+MODULE_DEVICE_TABLE(pci, vsoc_id_table);
+
+static void vsoc_remove_device(struct pci_dev *pdev);
+static int vsoc_probe_device(struct pci_dev *pdev,
+ const struct pci_device_id *ent);
+
+/* PCI driver glue: matches vsoc_id_table and wires up probe/remove. */
+static struct pci_driver vsoc_pci_driver = {
+ .name = "vsoc",
+ .id_table = vsoc_id_table,
+ .probe = vsoc_probe_device,
+ .remove = vsoc_remove_device,
+};
+
+/*
+ * Range and alignment checks on a proposed permission.
+ * Returns 0 when the request is acceptable, otherwise -EINVAL or -ERANGE.
+ */
+static int vsoc_check_fd_scoped_permission(
+	struct vsoc_device_region *region_p,
+	struct vsoc_device_region *managed_region_p,
+	const struct fd_scoped_permission *perm)
+{
+	/* The area must be well formed and have non-zero size */
+	if (perm->begin_offset >= perm->end_offset)
+		return -EINVAL;
+	/* The area must fit in the memory window */
+	if (perm->end_offset > vsoc_device_region_size(managed_region_p))
+		return -ERANGE;
+	/* The area must be in the region data section */
+	if (perm->begin_offset < managed_region_p->offset_of_region_data)
+		return -ERANGE;
+	/* The area must be page aligned */
+	if (!PAGE_ALIGNED(perm->begin_offset) ||
+	    !PAGE_ALIGNED(perm->end_offset))
+		return -EINVAL;
+	/* Owner offset must be naturally aligned in the window */
+	if (perm->owner_offset & (sizeof(perm->owner_offset) - 1))
+		return -EINVAL;
+	/* The owner flag must reside in the owner memory */
+	if (perm->owner_offset + sizeof(perm->owner_offset) >
+	    vsoc_device_region_size(region_p))
+		return -ERANGE;
+	/* The owner flag must reside in the data section */
+	if (perm->owner_offset < region_p->offset_of_region_data)
+		return -EINVAL;
+	/* The owner value must change to claim the memory */
+	if (perm->owned_value == VSOC_REGION_FREE)
+		return -EINVAL;
+	return 0;
+}
+
+/*
+ * do_create_fd_scoped_permission - claim an area of a managed region
+ * @region_p: the owner region the request was issued on
+ * @np:       caller-provided node; np->permission is filled in from @arg
+ * @arg:      userspace request: the permission plus the fd of the managed
+ *            region
+ *
+ * Validates the request, atomically changes the owner word from
+ * VSOC_REGION_FREE to the requested owned_value, attaches @np to the managed
+ * file's private data and rebases that file's offset onto the new area.
+ *
+ * Returns 0 on success or a negative errno: -EFAULT (copy from userspace
+ * failed), -EPERM (bad fd, or the fd's region is not managed by @region_p),
+ * -EEXIST (the fd already has a permission), -EINVAL / -ERANGE (malformed
+ * area or owner offset), -EBUSY (the owner word was not free).
+ */
+static int do_create_fd_scoped_permission(
+	struct vsoc_device_region *region_p,
+	struct fd_scoped_permission_node *np,
+	struct fd_scoped_permission_arg __user *arg)
+{
+	struct fd_scoped_permission *perm = &np->permission;
+	struct vsoc_device_region *managed_region_p;
+	struct vsoc_private_data *managed_priv;
+	struct file *managed_filp;
+	struct fd managed_f;
+	atomic_t *owner_ptr = NULL;
+	s32 managed_fd;
+	int retval;
+
+	/*
+	 * Copy exactly one permission.  Using sizeof(*np) here would read
+	 * past arg->perm and overwrite np->list with user data.
+	 */
+	if (copy_from_user(perm, &arg->perm, sizeof(*perm)) ||
+	    copy_from_user(&managed_fd,
+			   &arg->managed_region_fd, sizeof(managed_fd)))
+		return -EFAULT;
+
+	/* Keep the struct fd so the reference can be dropped on every path. */
+	managed_f = fdget(managed_fd);
+	managed_filp = managed_f.file;
+	/* Check that it's a valid fd, */
+	if (!managed_filp || vsoc_validate_filep(managed_filp)) {
+		retval = -EPERM;
+		goto out;
+	}
+	managed_priv = managed_filp->private_data;
+	/* EEXIST if the given fd already has a permission. */
+	if (managed_priv->fd_scoped_permission_node) {
+		retval = -EEXIST;
+		goto out;
+	}
+	managed_region_p = vsoc_region_from_filep(managed_filp);
+	/* Check that the provided region is managed by this one */
+	if (&vsoc_dev.regions[managed_region_p->managed_by] != region_p) {
+		retval = -EPERM;
+		goto out;
+	}
+	retval = vsoc_check_fd_scoped_permission(region_p, managed_region_p,
+						 perm);
+	if (retval)
+		goto out;
+
+	owner_ptr =
+	    (atomic_t *)shm_off_to_virtual_addr(region_p->region_begin_offset +
+						perm->owner_offset);
+	/* We've already verified that this is in the shared memory window, so
+	 * it should be safe to write to this address.
+	 */
+	if (atomic_cmpxchg(owner_ptr,
+			   VSOC_REGION_FREE,
+			   perm->owned_value) != VSOC_REGION_FREE) {
+		retval = -EBUSY;
+		goto out;
+	}
+	managed_priv->fd_scoped_permission_node = np;
+	/* The file offset needs to be adjusted if the calling
+	 * process did any read/write operations on the fd
+	 * before creating the permission.
+	 */
+	if (managed_filp->f_pos) {
+		if (managed_filp->f_pos > perm->end_offset) {
+			/* If the offset is beyond the permission end, set it
+			 * to the end.
+			 */
+			managed_filp->f_pos = perm->end_offset;
+		} else if (managed_filp->f_pos < perm->begin_offset) {
+			/* Before the permission interval: reset to zero. */
+			managed_filp->f_pos = 0;
+		} else {
+			/* Inside the interval: make it interval-relative. */
+			managed_filp->f_pos -= perm->begin_offset;
+		}
+	}
+	retval = 0;
+out:
+	/* fdput() is a no-op when fdget() found no file. */
+	fdput(managed_f);
+	return retval;
+}
+
+/*
+ * Tear down a granted permission node: release ownership of its area, unlink
+ * the node from the permission list under vsoc_dev.mtx, and free it.
+ * A NULL @node is ignored.
+ */
+static void do_destroy_fd_scoped_permission_node(
+	struct vsoc_device_region *owner_region_p,
+	struct fd_scoped_permission_node *node)
+{
+	if (!node)
+		return;
+
+	do_destroy_fd_scoped_permission(owner_region_p, &node->permission);
+
+	mutex_lock(&vsoc_dev.mtx);
+	list_del(&node->list);
+	mutex_unlock(&vsoc_dev.mtx);
+
+	kfree(node);
+}
+
+/*
+ * Release ownership of the area described by @perm by atomically writing
+ * VSOC_REGION_FREE to the owner word inside @owner_region_p.  If the word did
+ * not hold perm->owned_value beforehand, an error is logged; the word is
+ * still reset.  A NULL @perm is ignored.
+ */
+static void do_destroy_fd_scoped_permission(
+	struct vsoc_device_region *owner_region_p,
+	struct fd_scoped_permission *perm)
+{
+	atomic_t *owner_ptr = NULL;
+	int prev = 0;
+
+	if (!perm)
+		return;
+	owner_ptr = (atomic_t *)shm_off_to_virtual_addr(
+		owner_region_p->region_begin_offset + perm->owner_offset);
+	prev = atomic_xchg(owner_ptr, VSOC_REGION_FREE);
+	/* Terminate the message with a newline like the other dev_err calls. */
+	if (prev != perm->owned_value)
+		dev_err(&vsoc_dev.dev->dev,
+			"%x-%x: owner (%s) %x: expected to be %x was %x\n",
+			perm->begin_offset, perm->end_offset,
+			owner_region_p->device_name, perm->owner_offset,
+			perm->owned_value, prev);
+}
+
+/*
+ * Copy the region descriptor belonging to @filp out to userspace.
+ * Returns 0 on success, the validation error for a bad file, or -EFAULT when
+ * the copy to @dest fails.
+ */
+static long do_vsoc_describe_region(struct file *filp,
+				    struct vsoc_device_region __user *dest)
+{
+	struct vsoc_device_region *region;
+	int err;
+
+	err = vsoc_validate_filep(filp);
+	if (err)
+		return err;
+
+	region = vsoc_region_from_filep(filp);
+	return copy_to_user(dest, region, sizeof(*region)) ? -EFAULT : 0;
+}
+
+/**
+ * Implements the inner logic of cond_wait. Copies to and from userspace are
+ * done in the helper function below.
+ *
+ * Sleeps on the region's futex_wait_queue while the 32 bit word at
+ * arg->offset (relative to the start of the region) equals arg->value.
+ * arg->wakes is incremented once per wake-up that did not end the wait.
+ *
+ * Returns 0 once the word differs from arg->value, -EADDRNOTAVAIL for a
+ * misaligned offset, -E2BIG for an offset outside the region, -EINVAL for an
+ * unknown wait type or invalid timespec, -ETIMEDOUT when the absolute
+ * deadline passes, and -EINTR when a signal is pending.
+ */
+static int handle_vsoc_cond_wait(struct file *filp, struct vsoc_cond_wait *arg)
+{
+ DEFINE_WAIT(wait);
+ u32 region_number = iminor(file_inode(filp));
+ struct vsoc_region_data *data = vsoc_dev.regions_data + region_number;
+ /* "to" stays NULL unless a timed wait was requested. */
+ struct hrtimer_sleeper timeout, *to = NULL;
+ int ret = 0;
+ struct vsoc_device_region *region_p = vsoc_region_from_filep(filp);
+ atomic_t *address = NULL;
+ struct timespec ts;
+
+ /* Ensure that the offset is aligned */
+ if (arg->offset & (sizeof(uint32_t) - 1))
+ return -EADDRNOTAVAIL;
+ /* Ensure that the offset is within shared memory */
+ if (((uint64_t)arg->offset) + region_p->region_begin_offset +
+ sizeof(uint32_t) > region_p->region_end_offset)
+ return -E2BIG;
+ address = shm_off_to_virtual_addr(region_p->region_begin_offset +
+ arg->offset);
+
+ /* Ensure that the type of wait is valid */
+ switch (arg->wait_type) {
+ case VSOC_WAIT_IF_EQUAL:
+ break;
+ case VSOC_WAIT_IF_EQUAL_TIMEOUT:
+ to = &timeout;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (to) {
+ /* Copy the user-supplied timespec into the kernel structure.
+ * We do things this way to flatten differences between 32 bit
+ * and 64 bit timespecs.
+ */
+ ts.tv_sec = arg->wake_time_sec;
+ ts.tv_nsec = arg->wake_time_nsec;
+
+ if (!timespec_valid(&ts))
+ return -EINVAL;
+ /* The deadline is an absolute CLOCK_MONOTONIC time. */
+ hrtimer_init_on_stack(&to->timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS);
+ hrtimer_set_expires_range_ns(&to->timer, timespec_to_ktime(ts),
+ current->timer_slack_ns);
+
+ hrtimer_init_sleeper(to, current);
+ }
+
+ while (1) {
+ prepare_to_wait(&data->futex_wait_queue, &wait,
+ TASK_INTERRUPTIBLE);
+ /*
+ * Check the sentinel value after prepare_to_wait. If the value
+ * changes after this check the writer will call signal,
+ * changing the task state from INTERRUPTIBLE to RUNNING. That
+ * will ensure that schedule() will eventually schedule this
+ * task.
+ */
+ if (atomic_read(address) != arg->value) {
+ ret = 0;
+ break;
+ }
+ if (to) {
+ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
+ if (likely(to->task))
+ freezable_schedule();
+ hrtimer_cancel(&to->timer);
+ /* A NULL to->task here is treated as "deadline expired". */
+ if (!to->task) {
+ ret = -ETIMEDOUT;
+ break;
+ }
+ } else {
+ freezable_schedule();
+ }
+ /* Count the number of times that we woke up. This is useful
+ * for unit testing.
+ */
+ ++arg->wakes;
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+ }
+ finish_wait(&data->futex_wait_queue, &wait);
+ if (to)
+ destroy_hrtimer_on_stack(&to->timer);
+ return ret;
+}
+
+/**
+ * Userspace-facing half of cond_wait. The argument block is copied in, handed
+ * to handle_vsoc_cond_wait() and copied back out on every return path of the
+ * wait, so that the wake count reaches the caller even when the wait fails.
+ *
+ * Returns -EFAULT if either copy fails, otherwise the result of the wait.
+ */
+static int do_vsoc_cond_wait(struct file *filp,
+			     struct vsoc_cond_wait __user *untrusted_in)
+{
+	struct vsoc_cond_wait karg;
+	int status;
+
+	if (copy_from_user(&karg, untrusted_in, sizeof(karg)))
+		return -EFAULT;
+
+	/* wakes is an out parameter: discard whatever userspace stored. */
+	karg.wakes = 0;
+	status = handle_vsoc_cond_wait(filp, &karg);
+
+	/* A failed copy-out takes precedence over the wait's own result. */
+	if (copy_to_user(untrusted_in, &karg, sizeof(karg)))
+		status = -EFAULT;
+	return status;
+}
+
+/*
+ * Wakes every task sleeping on the futex wait queue of the region behind filp.
+ *
+ * offset is only validated: it must be 4-byte aligned (-EADDRNOTAVAIL
+ * otherwise) and the 32 bit word it names must lie inside the region (-E2BIG
+ * otherwise). Returns 0 once the wake-up has been issued.
+ */
+static int do_vsoc_cond_wake(struct file *filp, uint32_t offset)
+{
+	const struct vsoc_device_region *region = vsoc_region_from_filep(filp);
+	struct vsoc_region_data *rdata =
+		&vsoc_dev.regions_data[iminor(file_inode(filp))];
+	/* 64 bit arithmetic so that the end of the word cannot wrap around */
+	uint64_t word_end = (uint64_t)offset + region->region_begin_offset +
+			    sizeof(uint32_t);
+
+	if (offset % sizeof(uint32_t))
+		return -EADDRNOTAVAIL;
+	if (word_end > region->region_end_offset)
+		return -E2BIG;
+
+	/*
+	 * TODO(b/73664181): Use multiple futex wait queues.
+	 * We need to wake every sleeper when the condition changes. Typically
+	 * only a single thread will be waiting on the condition, but there
+	 * are exceptions. The worst case is about 10 threads.
+	 */
+	wake_up_interruptible_all(&rdata->futex_wait_queue);
+	return 0;
+}
+
+/*
+ * ioctl entry point for a region device node.
+ *
+ * filp is validated first; the region descriptor and the per-region data are
+ * then looked up from the minor number of the inode. Unknown commands yield
+ * -EINVAL. Commands that do not return from inside the switch return 0.
+ */
+static long vsoc_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ int rv = 0;
+ struct vsoc_device_region *region_p;
+ u32 reg_num;
+ struct vsoc_region_data *reg_data;
+ int retval = vsoc_validate_filep(filp);
+
+ if (retval)
+ return retval;
+ region_p = vsoc_region_from_filep(filp);
+ reg_num = iminor(file_inode(filp));
+ reg_data = vsoc_dev.regions_data + reg_num;
+ switch (cmd) {
+ case VSOC_CREATE_FD_SCOPED_PERMISSION:
+ {
+ struct fd_scoped_permission_node *node = NULL;
+
+ node = kzalloc(sizeof(*node), GFP_KERNEL);
+ /* We can't allocate memory for the permission */
+ if (!node)
+ return -ENOMEM;
+ INIT_LIST_HEAD(&node->list);
+ rv = do_create_fd_scoped_permission(
+ region_p,
+ node,
+ (struct fd_scoped_permission_arg __user *)arg);
+ if (!rv) {
+ /* Success: publish the node on the device-wide list. */
+ mutex_lock(&vsoc_dev.mtx);
+ list_add(&node->list, &vsoc_dev.permissions);
+ mutex_unlock(&vsoc_dev.mtx);
+ } else {
+ /* Failure: the node was never published, free it here. */
+ kfree(node);
+ return rv;
+ }
+ }
+ break;
+
+ case VSOC_GET_FD_SCOPED_PERMISSION:
+ {
+ /* Report the permission attached to this fd, if there is one. */
+ struct fd_scoped_permission_node *node =
+ ((struct vsoc_private_data *)filp->private_data)->
+ fd_scoped_permission_node;
+ if (!node)
+ return -ENOENT;
+ if (copy_to_user
+ ((struct fd_scoped_permission __user *)arg,
+ &node->permission, sizeof(node->permission)))
+ return -EFAULT;
+ }
+ break;
+
+ case VSOC_MAYBE_SEND_INTERRUPT_TO_HOST:
+ /*
+ * Ring the doorbell only if the outgoing flag was previously
+ * clear; otherwise report -EBUSY.
+ */
+ if (!atomic_xchg(
+ reg_data->outgoing_signalled,
+ 1)) {
+ writel(reg_num, vsoc_dev.regs + DOORBELL);
+ return 0;
+ } else {
+ return -EBUSY;
+ }
+ /* Not reached: both branches above return. */
+ break;
+
+ case VSOC_SEND_INTERRUPT_TO_HOST:
+ /* Unconditionally ring the doorbell for this region. */
+ writel(reg_num, vsoc_dev.regs + DOORBELL);
+ return 0;
+
+ case VSOC_WAIT_FOR_INCOMING_INTERRUPT:
+ /*
+ * NOTE(review): the result of wait_event_interruptible() is
+ * discarded, so a wait that was cut short by a signal also
+ * returns 0 to the caller.
+ */
+ wait_event_interruptible(
+ reg_data->interrupt_wait_queue,
+ (atomic_read(reg_data->incoming_signalled) != 0));
+ break;
+
+ case VSOC_DESCRIBE_REGION:
+ return do_vsoc_describe_region(
+ filp,
+ (struct vsoc_device_region __user *)arg);
+
+ case VSOC_SELF_INTERRUPT:
+ /* Set the incoming flag and wake a waiter on this region. */
+ atomic_set(reg_data->incoming_signalled, 1);
+ wake_up_interruptible(&reg_data->interrupt_wait_queue);
+ break;
+
+ case VSOC_COND_WAIT:
+ return do_vsoc_cond_wait(filp,
+ (struct vsoc_cond_wait __user *)arg);
+ case VSOC_COND_WAKE:
+ /* arg carries the offset itself rather than a user pointer. */
+ return do_vsoc_cond_wake(filp, arg);
+
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/*
+ * read() handler: copies up to len bytes, starting at *poffset within the
+ * area returned by vsoc_get_area(), into the user buffer.
+ *
+ * Returns the number of bytes copied (0 when *poffset is at or beyond the end
+ * of the area), the vsoc_validate_filep() error for an invalid filp, or
+ * -EFAULT if the copy to userspace fails. *poffset advances by the number of
+ * bytes copied.
+ */
+static ssize_t vsoc_read(struct file *filp, char __user *buffer, size_t len,
+			 loff_t *poffset)
+{
+	__u32 base_off;
+	ssize_t remaining;
+	const void *src;
+	int err;
+
+	err = vsoc_validate_filep(filp);
+	if (err)
+		return err;
+
+	remaining = vsoc_get_area(filp, &base_off);
+	src = shm_off_to_virtual_addr(base_off);
+	src += *poffset;
+	remaining -= *poffset;
+	if (remaining <= 0)
+		return 0;
+
+	/* Clamp the request to what is left of the area. */
+	len = (remaining < len) ? remaining : len;
+	if (copy_to_user(buffer, src, len))
+		return -EFAULT;
+
+	*poffset += len;
+	return len;
+}
+
+/*
+ * llseek() handler. The seekable range is [0, length of the area returned by
+ * vsoc_get_area()]; the whole area is treated as data with a single hole at
+ * its end.
+ *
+ * Returns the new file position, -EOVERFLOW if a relative seek overflows,
+ * -EINVAL for an unknown origin or a position outside the area, or the
+ * vsoc_validate_filep() error for an invalid filp.
+ */
+static loff_t vsoc_lseek(struct file *filp, loff_t offset, int origin)
+{
+	ssize_t limit;
+	int err = vsoc_validate_filep(filp);
+
+	if (err)
+		return err;
+	limit = vsoc_get_area(filp, NULL);
+
+	switch (origin) {
+	case SEEK_SET:
+		/* Absolute position: nothing to adjust. */
+		break;
+
+	case SEEK_CUR:
+		if (offset > 0 && offset + filp->f_pos < 0)
+			return -EOVERFLOW;
+		offset += filp->f_pos;
+		break;
+
+	case SEEK_END:
+		if (offset > 0 && offset + limit < 0)
+			return -EOVERFLOW;
+		offset += limit;
+		break;
+
+	case SEEK_DATA:
+		/* No data at or past the end; data starts at 0 otherwise. */
+		if (offset >= limit)
+			return -EINVAL;
+		offset = (offset < 0) ? 0 : offset;
+		break;
+
+	case SEEK_HOLE:
+		/* Next hole is always the end of the region, unless offset is
+		 * beyond that
+		 */
+		offset = (offset < limit) ? limit : offset;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	if (offset >= 0 && offset <= limit) {
+		filp->f_pos = offset;
+		return offset;
+	}
+	return -EINVAL;
+}
+
+/*
+ * write() handler: copies up to len bytes from the user buffer into the area
+ * returned by vsoc_get_area(), starting at *poffset.
+ *
+ * Returns the number of bytes written (0 when *poffset is at or beyond the
+ * end of the area), the vsoc_validate_filep() error for an invalid filp, or
+ * -EFAULT if the copy from userspace fails. *poffset advances by the number
+ * of bytes written.
+ */
+static ssize_t vsoc_write(struct file *filp, const char __user *buffer,
+			  size_t len, loff_t *poffset)
+{
+	__u32 base_off;
+	ssize_t remaining;
+	void *dst;
+	int err;
+
+	err = vsoc_validate_filep(filp);
+	if (err)
+		return err;
+
+	remaining = vsoc_get_area(filp, &base_off);
+	dst = shm_off_to_virtual_addr(base_off);
+	dst += *poffset;
+	remaining -= *poffset;
+	if (remaining <= 0)
+		return 0;
+
+	/* Clamp the request to what is left of the area. */
+	len = (remaining < len) ? remaining : len;
+	if (copy_from_user(dst, buffer, len))
+		return -EFAULT;
+
+	*poffset += len;
+	return len;
+}
+
+/*
+ * Interrupt handler shared by all region vectors.
+ *
+ * region_data_v is the dev_id registered with request_irq(), i.e. a pointer
+ * to the region's entry in vsoc_dev.regions_data. The pointer is sanity
+ * checked (non-NULL, inside the array, aligned to an array element) before
+ * the waiters on the region's interrupt wait queue are woken.
+ *
+ * Returns IRQ_HANDLED after waking the waiters, IRQ_NONE if the cookie fails
+ * any of the checks.
+ */
+static irqreturn_t vsoc_interrupt(int irq, void *region_data_v)
+{
+	struct vsoc_region_data *region_data = region_data_v;
+	int reg_num;
+
+	if (unlikely(!region_data))
+		return IRQ_NONE;
+
+	/* Only derive the index once the pointer is known to be non-NULL. */
+	reg_num = region_data - vsoc_dev.regions_data;
+	if (unlikely(reg_num < 0 ||
+		     reg_num >= vsoc_dev.layout->region_count)) {
+		dev_err(&vsoc_dev.dev->dev,
+			"invalid irq @%p reg_num=0x%04x\n",
+			region_data, reg_num);
+		return IRQ_NONE;
+	}
+	/* Catches a cookie that points into the middle of an element. */
+	if (unlikely(vsoc_dev.regions_data + reg_num != region_data)) {
+		dev_err(&vsoc_dev.dev->dev,
+			"irq not aligned @%p reg_num=0x%04x\n",
+			region_data, reg_num);
+		return IRQ_NONE;
+	}
+	wake_up_interruptible(&region_data->interrupt_wait_queue);
+	return IRQ_HANDLED;
+}
+
+/*
+ * PCI probe callback.
+ *
+ * Enables the device, maps the register BAR and the shared memory BAR, reads
+ * the layout descriptor found at the start of shared memory, registers one
+ * character device per region and wires one MSI-X vector per region to
+ * vsoc_interrupt().
+ *
+ * Every failure after pci_enable_device() unwinds through
+ * vsoc_remove_device(), which relies on the state flags in vsoc_dev. Each
+ * flag is therefore set as soon as the matching resource has been acquired.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int vsoc_probe_device(struct pci_dev *pdev,
+			     const struct pci_device_id *ent)
+{
+	int result;
+	int i;
+	resource_size_t reg_size;
+	dev_t devt;
+
+	vsoc_dev.dev = pdev;
+	result = pci_enable_device(pdev);
+	if (result) {
+		dev_err(&pdev->dev,
+			"pci_enable_device failed %s: error %d\n",
+			pci_name(pdev), result);
+		return result;
+	}
+	vsoc_dev.enabled_device = true;
+	result = pci_request_regions(pdev, "vsoc");
+	if (result < 0) {
+		dev_err(&pdev->dev, "pci_request_regions failed\n");
+		vsoc_remove_device(pdev);
+		return -EBUSY;
+	}
+	vsoc_dev.requested_regions = true;
+	/* Set up the control registers in BAR 0 */
+	reg_size = pci_resource_len(pdev, REGISTER_BAR);
+	if (reg_size > MAX_REGISTER_BAR_LEN)
+		vsoc_dev.regs =
+		    pci_iomap(pdev, REGISTER_BAR, MAX_REGISTER_BAR_LEN);
+	else
+		vsoc_dev.regs = pci_iomap(pdev, REGISTER_BAR, reg_size);
+
+	if (!vsoc_dev.regs) {
+		dev_err(&pdev->dev,
+			"cannot map registers of size %zu\n",
+			(size_t)reg_size);
+		vsoc_remove_device(pdev);
+		return -EBUSY;
+	}
+
+	/* Map the shared memory in BAR 2 */
+	vsoc_dev.shm_phys_start = pci_resource_start(pdev, SHARED_MEMORY_BAR);
+	vsoc_dev.shm_size = pci_resource_len(pdev, SHARED_MEMORY_BAR);
+
+	dev_info(&pdev->dev, "shared memory @ DMA %pa size=0x%zx\n",
+		 &vsoc_dev.shm_phys_start, vsoc_dev.shm_size);
+	vsoc_dev.kernel_mapped_shm = pci_iomap_wc(pdev, SHARED_MEMORY_BAR, 0);
+	if (!vsoc_dev.kernel_mapped_shm) {
+		dev_err(&vsoc_dev.dev->dev, "cannot iomap region\n");
+		vsoc_remove_device(pdev);
+		return -EBUSY;
+	}
+
+	/* The layout descriptor sits at the very start of shared memory. */
+	vsoc_dev.layout = (struct vsoc_shm_layout_descriptor __force *)
+				vsoc_dev.kernel_mapped_shm;
+	dev_info(&pdev->dev, "major_version: %d\n",
+		 vsoc_dev.layout->major_version);
+	dev_info(&pdev->dev, "minor_version: %d\n",
+		 vsoc_dev.layout->minor_version);
+	dev_info(&pdev->dev, "size: 0x%x\n", vsoc_dev.layout->size);
+	dev_info(&pdev->dev, "regions: %d\n", vsoc_dev.layout->region_count);
+	if (vsoc_dev.layout->major_version !=
+	    CURRENT_VSOC_LAYOUT_MAJOR_VERSION) {
+		dev_err(&vsoc_dev.dev->dev,
+			"driver supports only major_version %d\n",
+			CURRENT_VSOC_LAYOUT_MAJOR_VERSION);
+		vsoc_remove_device(pdev);
+		return -EBUSY;
+	}
+	result = alloc_chrdev_region(&devt, 0, vsoc_dev.layout->region_count,
+				     VSOC_DEV_NAME);
+	if (result) {
+		dev_err(&vsoc_dev.dev->dev, "alloc_chrdev_region failed\n");
+		vsoc_remove_device(pdev);
+		return -EBUSY;
+	}
+	vsoc_dev.major = MAJOR(devt);
+	cdev_init(&vsoc_dev.cdev, &vsoc_ops);
+	vsoc_dev.cdev.owner = THIS_MODULE;
+	result = cdev_add(&vsoc_dev.cdev, devt, vsoc_dev.layout->region_count);
+	if (result) {
+		dev_err(&vsoc_dev.dev->dev, "cdev_add error\n");
+		vsoc_remove_device(pdev);
+		return -EBUSY;
+	}
+	vsoc_dev.cdev_added = true;
+	vsoc_dev.class = class_create(THIS_MODULE, VSOC_DEV_NAME);
+	if (IS_ERR(vsoc_dev.class)) {
+		dev_err(&vsoc_dev.dev->dev, "class_create failed\n");
+		vsoc_remove_device(pdev);
+		return PTR_ERR(vsoc_dev.class);
+	}
+	vsoc_dev.class_added = true;
+	vsoc_dev.regions = (struct vsoc_device_region __force *)
+		((void *)vsoc_dev.layout +
+		 vsoc_dev.layout->vsoc_region_desc_offset);
+	vsoc_dev.msix_entries = kcalloc(
+			vsoc_dev.layout->region_count,
+			sizeof(vsoc_dev.msix_entries[0]), GFP_KERNEL);
+	if (!vsoc_dev.msix_entries) {
+		dev_err(&vsoc_dev.dev->dev,
+			"unable to allocate msix_entries\n");
+		vsoc_remove_device(pdev);
+		return -ENOSPC;
+	}
+	vsoc_dev.regions_data = kcalloc(
+			vsoc_dev.layout->region_count,
+			sizeof(vsoc_dev.regions_data[0]), GFP_KERNEL);
+	if (!vsoc_dev.regions_data) {
+		dev_err(&vsoc_dev.dev->dev,
+			"unable to allocate regions' data\n");
+		vsoc_remove_device(pdev);
+		return -ENOSPC;
+	}
+	for (i = 0; i < vsoc_dev.layout->region_count; ++i)
+		vsoc_dev.msix_entries[i].entry = i;
+
+	result = pci_enable_msix_exact(vsoc_dev.dev, vsoc_dev.msix_entries,
+				       vsoc_dev.layout->region_count);
+	if (result) {
+		dev_info(&pdev->dev, "pci_enable_msix failed: %d\n", result);
+		vsoc_remove_device(pdev);
+		return -ENOSPC;
+	}
+	/*
+	 * Record the MSI-X state immediately: the validation loop below
+	 * unwinds through vsoc_remove_device(), which only disables MSI-X
+	 * when this flag is set.
+	 */
+	vsoc_dev.msix_enabled = true;
+	/* Check that all regions are well formed */
+	for (i = 0; i < vsoc_dev.layout->region_count; ++i) {
+		const struct vsoc_device_region *region = vsoc_dev.regions + i;
+
+		if (!PAGE_ALIGNED(region->region_begin_offset) ||
+		    !PAGE_ALIGNED(region->region_end_offset)) {
+			dev_err(&vsoc_dev.dev->dev,
+				"region %d not aligned (%x:%x)", i,
+				region->region_begin_offset,
+				region->region_end_offset);
+			vsoc_remove_device(pdev);
+			return -EFAULT;
+		}
+		if (region->region_begin_offset >= region->region_end_offset ||
+		    region->region_end_offset > vsoc_dev.shm_size) {
+			dev_err(&vsoc_dev.dev->dev,
+				"region %d offsets are wrong: %x %x %zx",
+				i, region->region_begin_offset,
+				region->region_end_offset, vsoc_dev.shm_size);
+			vsoc_remove_device(pdev);
+			return -EFAULT;
+		}
+		if (region->managed_by >= vsoc_dev.layout->region_count) {
+			dev_err(&vsoc_dev.dev->dev,
+				"region %d has invalid owner: %u",
+				i, region->managed_by);
+			vsoc_remove_device(pdev);
+			return -EFAULT;
+		}
+	}
+	for (i = 0; i < vsoc_dev.layout->region_count; ++i) {
+		const struct vsoc_device_region *region = vsoc_dev.regions + i;
+		size_t name_sz = sizeof(vsoc_dev.regions_data[i].name) - 1;
+		const struct vsoc_signal_table_layout *h_to_g_signal_table =
+			&region->host_to_guest_signal_table;
+		const struct vsoc_signal_table_layout *g_to_h_signal_table =
+			&region->guest_to_host_signal_table;
+		struct device *region_dev;
+
+		/* Terminate first so the copied name is always a string. */
+		vsoc_dev.regions_data[i].name[name_sz] = '\0';
+		memcpy(vsoc_dev.regions_data[i].name, region->device_name,
+		       name_sz);
+		dev_info(&pdev->dev, "region %d name=%s\n",
+			 i, vsoc_dev.regions_data[i].name);
+		init_waitqueue_head(
+				&vsoc_dev.regions_data[i].interrupt_wait_queue);
+		init_waitqueue_head(&vsoc_dev.regions_data[i].futex_wait_queue);
+		vsoc_dev.regions_data[i].incoming_signalled =
+			shm_off_to_virtual_addr(region->region_begin_offset) +
+			h_to_g_signal_table->interrupt_signalled_offset;
+		vsoc_dev.regions_data[i].outgoing_signalled =
+			shm_off_to_virtual_addr(region->region_begin_offset) +
+			g_to_h_signal_table->interrupt_signalled_offset;
+		/* dev_id is the region's data; free_irq() must match it. */
+		result = request_irq(
+				vsoc_dev.msix_entries[i].vector,
+				vsoc_interrupt, 0,
+				vsoc_dev.regions_data[i].name,
+				vsoc_dev.regions_data + i);
+		if (result) {
+			dev_info(&pdev->dev,
+				 "request_irq failed irq=%d vector=%d\n",
+				 i, vsoc_dev.msix_entries[i].vector);
+			vsoc_remove_device(pdev);
+			return -ENOSPC;
+		}
+		vsoc_dev.regions_data[i].irq_requested = true;
+		/* device_create() reports failure as ERR_PTR, never NULL. */
+		region_dev = device_create(vsoc_dev.class, NULL,
+					   MKDEV(vsoc_dev.major, i),
+					   NULL, vsoc_dev.regions_data[i].name);
+		if (IS_ERR(region_dev)) {
+			dev_err(&vsoc_dev.dev->dev, "device_create failed\n");
+			vsoc_remove_device(pdev);
+			return -EBUSY;
+		}
+		vsoc_dev.regions_data[i].device_created = true;
+	}
+	return 0;
+}
+
+/*
+ * This should undo all of the allocations in the probe function in reverse
+ * order.
+ *
+ * Notes:
+ *
+ *   The device may have been partially initialized, so double check
+ *   that the allocations happened.
+ *
+ *   This function may be called multiple times, so mark resources as freed
+ *   as they are deallocated.
+ *
+ *   Each IRQ is released with the same dev_id cookie that the probe function
+ *   handed to request_irq() (the region's entry in vsoc_dev.regions_data).
+ */
+static void vsoc_remove_device(struct pci_dev *pdev)
+{
+	int i;
+	/*
+	 * pdev is the first thing to be set on probe and the last thing
+	 * to be cleared here. If it's NULL then there is no cleanup.
+	 */
+	if (!pdev || !vsoc_dev.dev)
+		return;
+	dev_info(&pdev->dev, "remove_device\n");
+	if (vsoc_dev.regions_data) {
+		for (i = 0; i < vsoc_dev.layout->region_count; ++i) {
+			if (vsoc_dev.regions_data[i].device_created) {
+				device_destroy(vsoc_dev.class,
+					       MKDEV(vsoc_dev.major, i));
+				vsoc_dev.regions_data[i].device_created = false;
+			}
+			if (vsoc_dev.regions_data[i].irq_requested) {
+				/* dev_id must match the request_irq() call */
+				free_irq(vsoc_dev.msix_entries[i].vector,
+					 vsoc_dev.regions_data + i);
+				vsoc_dev.regions_data[i].irq_requested = false;
+			}
+		}
+		kfree(vsoc_dev.regions_data);
+		vsoc_dev.regions_data = NULL;
+	}
+	if (vsoc_dev.msix_enabled) {
+		pci_disable_msix(pdev);
+		vsoc_dev.msix_enabled = false;
+	}
+	kfree(vsoc_dev.msix_entries);
+	vsoc_dev.msix_entries = NULL;
+	vsoc_dev.regions = NULL;
+	if (vsoc_dev.class_added) {
+		class_destroy(vsoc_dev.class);
+		vsoc_dev.class_added = false;
+	}
+	if (vsoc_dev.cdev_added) {
+		cdev_del(&vsoc_dev.cdev);
+		vsoc_dev.cdev_added = false;
+	}
+	/* layout is needed for the region count, so clear it afterwards. */
+	if (vsoc_dev.major && vsoc_dev.layout) {
+		unregister_chrdev_region(MKDEV(vsoc_dev.major, 0),
+					 vsoc_dev.layout->region_count);
+		vsoc_dev.major = 0;
+	}
+	vsoc_dev.layout = NULL;
+	if (vsoc_dev.kernel_mapped_shm) {
+		pci_iounmap(pdev, vsoc_dev.kernel_mapped_shm);
+		vsoc_dev.kernel_mapped_shm = NULL;
+	}
+	if (vsoc_dev.regs) {
+		pci_iounmap(pdev, vsoc_dev.regs);
+		vsoc_dev.regs = NULL;
+	}
+	if (vsoc_dev.requested_regions) {
+		pci_release_regions(pdev);
+		vsoc_dev.requested_regions = false;
+	}
+	if (vsoc_dev.enabled_device) {
+		pci_disable_device(pdev);
+		vsoc_dev.enabled_device = false;
+	}
+	/* Do this last: it indicates that the device is not initialized. */
+	vsoc_dev.dev = NULL;
+}
+
+/*
+ * Module exit: tears down the currently bound device (a no-op when none is
+ * bound, see the NULL check in vsoc_remove_device()) and then unregisters the
+ * PCI driver.
+ */
+static void __exit vsoc_cleanup_module(void)
+{
+	struct pci_dev *bound = vsoc_dev.dev;
+
+	vsoc_remove_device(bound);
+	pci_unregister_driver(&vsoc_pci_driver);
+}
+
+/*
+ * Module init: prepares the global permission list and its mutex, then
+ * registers the PCI driver. Returns 0 on success or the negative error from
+ * pci_register_driver().
+ */
+static int __init vsoc_init_module(void)
+{
+	int status;
+
+	INIT_LIST_HEAD(&vsoc_dev.permissions);
+	mutex_init(&vsoc_dev.mtx);
+
+	status = pci_register_driver(&vsoc_pci_driver);
+	return (status < 0) ? status : 0;
+}
+
+/*
+ * open() handler: validates the inode and attaches a zeroed
+ * vsoc_private_data to the file.
+ *
+ * Returns 0 on success, the vsoc_validate_inode() error for an invalid inode,
+ * or -ENOMEM if the private data cannot be allocated.
+ */
+static int vsoc_open(struct inode *inode, struct file *filp)
+{
+	struct vsoc_private_data *priv;
+	int err;
+
+	/* Can't use vsoc_validate_filep because filp is still incomplete */
+	err = vsoc_validate_inode(inode);
+	if (err)
+		return err;
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	filp->private_data = priv;
+	return priv ? 0 : -ENOMEM;
+}
+
+/*
+ * release() handler: destroys the fd scoped permission attached to the file,
+ * if there is one, and frees the per-file private data.
+ *
+ * Returns the vsoc_validate_filep() error for an invalid filp, 0 otherwise.
+ */
+static int vsoc_release(struct inode *inode, struct file *filp)
+{
+	struct vsoc_private_data *priv;
+	struct fd_scoped_permission_node *perm_node;
+	int err;
+
+	err = vsoc_validate_filep(filp);
+	if (err)
+		return err;
+
+	priv = filp->private_data;
+	if (!priv)
+		return 0;
+
+	perm_node = priv->fd_scoped_permission_node;
+	if (perm_node) {
+		struct vsoc_device_region *owner =
+			vsoc_region_from_inode(inode);
+
+		/* A managed region hands the job to the region managing it. */
+		if (owner->managed_by != VSOC_REGION_WHOLE)
+			owner = vsoc_dev.regions + owner->managed_by;
+		do_destroy_fd_scoped_permission_node(owner, perm_node);
+		priv->fd_scoped_permission_node = NULL;
+	}
+
+	kfree(priv);
+	filp->private_data = NULL;
+	return 0;
+}
+
+/*
+ * Returns the device relative offset and length of the area specified by the
+ * fd scoped permission. If there is no fd scoped permission set, a default
+ * permission covering the entire region is assumed, unless the region is owned
+ * by another one, in which case the default is a permission with zero size.
+ *
+ * area_offset may be NULL when the caller only needs the length.
+ */
+static ssize_t vsoc_get_area(struct file *filp, __u32 *area_offset)
+{
+	__u32 off = 0;
+	ssize_t length = 0;
+	struct vsoc_device_region *region_p;
+	struct vsoc_private_data *private_data = filp->private_data;
+	struct fd_scoped_permission_node *node;
+
+	region_p = vsoc_region_from_filep(filp);
+	off = region_p->region_begin_offset;
+	/*
+	 * Test the node pointer itself. Taking the address of a member
+	 * through a NULL node and testing that address only works when the
+	 * member happens to sit at offset zero.
+	 */
+	node = private_data->fd_scoped_permission_node;
+	if (node) {
+		const struct fd_scoped_permission *perm = &node->permission;
+
+		off += perm->begin_offset;
+		length = perm->end_offset - perm->begin_offset;
+	} else if (region_p->managed_by == VSOC_REGION_WHOLE) {
+		/* No permission set and the regions is not owned by another,
+		 * default to full region access.
+		 */
+		length = vsoc_device_region_size(region_p);
+	} else {
+		/* return zero length, access is denied. */
+		length = 0;
+	}
+	if (area_offset)
+		*area_offset = off;
+	return length;
+}
+
+/*
+ * mmap() handler: maps part of the area returned by vsoc_get_area() into the
+ * caller's address space, uncached.
+ *
+ * The requested window (vm_pgoff pages into the area, vm_end - vm_start bytes
+ * long) must lie entirely inside the area. The bounds checks are done on
+ * unsigned values: comparing a negative ssize_t remainder against the
+ * unsigned mapping length would promote it to a huge value and let an
+ * out-of-range offset through.
+ *
+ * Returns 0 on success, -EINVAL if the window does not fit, -EAGAIN if the
+ * remap fails, or the vsoc_validate_filep() error for an invalid filp.
+ */
+static int vsoc_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	unsigned long len = vma->vm_end - vma->vm_start;
+	unsigned long req_off;
+	__u32 area_off;
+	phys_addr_t mem_off;
+	ssize_t area_len;
+	int retval = vsoc_validate_filep(filp);
+
+	if (retval)
+		return retval;
+	area_len = vsoc_get_area(filp, &area_off);
+	/* Reject offsets past the area before converting pages to bytes. */
+	if (area_len < 0 ||
+	    vma->vm_pgoff > ((unsigned long)area_len >> PAGE_SHIFT))
+		return -EINVAL;
+	/* Cannot overflow: vm_pgoff is bounded by the area length above. */
+	req_off = vma->vm_pgoff << PAGE_SHIFT;
+	/* Add the requested offset */
+	area_off += req_off;
+	if ((unsigned long)area_len - req_off < len)
+		return -EINVAL;
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+	mem_off = shm_off_to_phys_addr(area_off);
+	if (io_remap_pfn_range(vma, vma->vm_start, mem_off >> PAGE_SHIFT,
+			       len, vma->vm_page_prot))
+		return -EAGAIN;
+	return 0;
+}
+
+module_init(vsoc_init_module);
+module_exit(vsoc_cleanup_module);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Greg Hartman <ghartman@google.com>");
+MODULE_DESCRIPTION("VSoC interpretation of QEmu's ivshmem device");
+MODULE_VERSION("1.0");
diff --git a/drivers/staging/goldfish/Kconfig b/drivers/staging/goldfish/Kconfig
index 4e094602437c..d293bbc22c79 100644
--- a/drivers/staging/goldfish/Kconfig
+++ b/drivers/staging/goldfish/Kconfig
@@ -4,6 +4,14 @@ config GOLDFISH_AUDIO
---help---
Emulated audio channel for the Goldfish Android Virtual Device
+config GOLDFISH_SYNC
+ tristate "Goldfish AVD Sync Driver"
+ depends on GOLDFISH
+ depends on SW_SYNC
+ depends on SYNC_FILE
+ ---help---
+ Emulated sync fences for the Goldfish Android Virtual Device
+
config MTD_GOLDFISH_NAND
tristate "Goldfish NAND device"
depends on GOLDFISH
diff --git a/drivers/staging/goldfish/Makefile b/drivers/staging/goldfish/Makefile
index dec34ad58162..3313fce4e940 100644
--- a/drivers/staging/goldfish/Makefile
+++ b/drivers/staging/goldfish/Makefile
@@ -4,3 +4,9 @@
obj-$(CONFIG_GOLDFISH_AUDIO) += goldfish_audio.o
obj-$(CONFIG_MTD_GOLDFISH_NAND) += goldfish_nand.o
+
+# and sync
+
+ccflags-y := -Idrivers/staging/android
+goldfish_sync-objs := goldfish_sync_timeline_fence.o goldfish_sync_timeline.o
+obj-$(CONFIG_GOLDFISH_SYNC) += goldfish_sync.o
diff --git a/drivers/staging/goldfish/goldfish_audio.c b/drivers/staging/goldfish/goldfish_audio.c
index bd559956f199..0bb0ee2e691f 100644
--- a/drivers/staging/goldfish/goldfish_audio.c
+++ b/drivers/staging/goldfish/goldfish_audio.c
@@ -28,6 +28,7 @@
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/goldfish.h>
+#include <linux/acpi.h>
MODULE_AUTHOR("Google, Inc.");
MODULE_DESCRIPTION("Android QEMU Audio Driver");
@@ -116,6 +117,7 @@ static ssize_t goldfish_audio_read(struct file *fp, char __user *buf,
size_t count, loff_t *pos)
{
struct goldfish_audio *data = fp->private_data;
+ unsigned long irq_flags;
int length;
int result = 0;
@@ -129,6 +131,10 @@ static ssize_t goldfish_audio_read(struct file *fp, char __user *buf,
wait_event_interruptible(data->wait, data->buffer_status &
AUDIO_INT_READ_BUFFER_FULL);
+ spin_lock_irqsave(&data->lock, irq_flags);
+ data->buffer_status &= ~AUDIO_INT_READ_BUFFER_FULL;
+ spin_unlock_irqrestore(&data->lock, irq_flags);
+
length = AUDIO_READ(data, AUDIO_READ_BUFFER_AVAILABLE);
/* copy data to user space */
@@ -351,12 +357,19 @@ static const struct of_device_id goldfish_audio_of_match[] = {
};
MODULE_DEVICE_TABLE(of, goldfish_audio_of_match);
+static const struct acpi_device_id goldfish_audio_acpi_match[] = {
+ { "GFSH0005", 0 },
+ { },
+};
+MODULE_DEVICE_TABLE(acpi, goldfish_audio_acpi_match);
+
static struct platform_driver goldfish_audio_driver = {
.probe = goldfish_audio_probe,
.remove = goldfish_audio_remove,
.driver = {
.name = "goldfish_audio",
.of_match_table = goldfish_audio_of_match,
+ .acpi_match_table = ACPI_PTR(goldfish_audio_acpi_match),
}
};
diff --git a/drivers/staging/goldfish/goldfish_sync_timeline.c b/drivers/staging/goldfish/goldfish_sync_timeline.c
new file mode 100644
index 000000000000..5bef4c6c0283
--- /dev/null
+++ b/drivers/staging/goldfish/goldfish_sync_timeline.c
@@ -0,0 +1,962 @@
+/*
+ * Copyright (C) 2016 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/fdtable.h>
+#include <linux/file.h>
+#include <linux/init.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/platform_device.h>
+
+#include <linux/interrupt.h>
+#include <linux/kref.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
+#include <linux/io.h>
+#include <linux/mm.h>
+#include <linux/acpi.h>
+
+#include <linux/string.h>
+
+#include <linux/fs.h>
+#include <linux/syscalls.h>
+#include <linux/sync_file.h>
+#include <linux/fence.h>
+
+#include "goldfish_sync_timeline_fence.h"
+
+/*
+ * Logging helpers.
+ *
+ * NOTE(review): ERR, INFO and DPRINT expand with a trailing semicolon, so
+ * "ERR(...);" produces an extra empty statement. Using one of them as the
+ * unbraced body of an if that is followed by an else will not compile; keep
+ * such uses inside braces.
+ */
+/* Logs at KERN_ERR level. */
+#define ERR(...) printk(KERN_ERR __VA_ARGS__);
+
+/* Logs at KERN_INFO level. */
+#define INFO(...) printk(KERN_INFO __VA_ARGS__);
+
+/* Debug output, forwarded to pr_debug(). */
+#define DPRINT(...) pr_debug(__VA_ARGS__);
+
+/* Traces entry into the calling function via DPRINT. */
+#define DTRACE() DPRINT("%s: enter", __func__)
+
+/* The Goldfish sync driver is designed to provide an interface
+ * between the underlying host's sync device and the kernel's
+ * fence sync framework.
+ * The purpose of the device/driver is to enable lightweight
+ * creation and signaling of timelines and fences
+ * in order to synchronize the guest with host-side graphics events.
+ *
+ * Each time the interrupt trips, the driver
+ * may perform a sync operation.
+ */
+
+/* The operations are: */
+
+/* Ready signal - used to mark when irq should lower */
+#define CMD_SYNC_READY 0
+
+/* Create a new timeline. writes timeline handle */
+#define CMD_CREATE_SYNC_TIMELINE 1
+
+/* Create a fence object. reads timeline handle and time argument.
+ * Writes fence fd to the SYNC_REG_HANDLE register. */
+#define CMD_CREATE_SYNC_FENCE 2
+
+/* Increments timeline. reads timeline handle and time argument */
+#define CMD_SYNC_TIMELINE_INC 3
+
+/* Destroys a timeline. reads timeline handle */
+#define CMD_DESTROY_SYNC_TIMELINE 4
+
+/* Starts a wait on the host with
+ * the given glsync object and sync thread handle. */
+#define CMD_TRIGGER_HOST_WAIT 5
+
+/* The register layout is: */
+
+#define SYNC_REG_BATCH_COMMAND 0x00 /* host->guest batch commands */
+#define SYNC_REG_BATCH_GUESTCOMMAND 0x04 /* guest->host batch commands */
+#define SYNC_REG_BATCH_COMMAND_ADDR 0x08 /* communicate physical address of host->guest batch commands */
+#define SYNC_REG_BATCH_COMMAND_ADDR_HIGH 0x0c /* 64-bit part */
+#define SYNC_REG_BATCH_GUESTCOMMAND_ADDR 0x10 /* communicate physical address of guest->host commands */
+#define SYNC_REG_BATCH_GUESTCOMMAND_ADDR_HIGH 0x14 /* 64-bit part */
+#define SYNC_REG_INIT 0x18 /* signals that the device has been probed */
+
+/* There is an ioctl associated with goldfish sync driver.
+ * Make it conflict with ioctls that are not likely to be used
+ * in the emulator.
+ *
+ * '@' 00-0F linux/radeonfb.h conflict!
+ * '@' 00-0F drivers/video/aty/aty128fb.c conflict!
+ */
+#define GOLDFISH_SYNC_IOC_MAGIC '@'
+
+#define GOLDFISH_SYNC_IOC_QUEUE_WORK _IOWR(GOLDFISH_SYNC_IOC_MAGIC, 0, struct goldfish_sync_ioctl_info)
+
+/* The above definitions (command codes, register layout, ioctl definitions)
+ * need to be in sync with the following files:
+ *
+ * Host-side (emulator):
+ * external/qemu/android/emulation/goldfish_sync.h
+ * external/qemu-android/hw/misc/goldfish_sync.c
+ *
+ * Guest-side (system image):
+ * device/generic/goldfish-opengl/system/egl/goldfish_sync.h
+ * device/generic/goldfish/ueventd.ranchu.rc
+ * platform/build/target/board/generic/sepolicy/file_contexts
+ */
+/*
+ * One host->guest command. The host writes records of this type directly
+ * into |batch_hostcmd| of struct goldfish_sync_state, so the field order and
+ * sizes must not be changed on the guest side alone.
+ */
+struct goldfish_sync_hostcmd {
+ /* sorted for alignment */
+ /* NOTE(review): presumably the timeline handle the command acts on. */
+ uint64_t handle;
+ uint64_t hostcmd_handle;
+ /* NOTE(review): presumably one of the CMD_* codes defined above. */
+ uint32_t cmd;
+ uint32_t time_arg;
+};
+
+/*
+ * One guest->host command, issued through |batch_guestcmd| of
+ * struct goldfish_sync_state. All fields are 64 bit wide.
+ */
+struct goldfish_sync_guestcmd {
+ uint64_t host_command; /* uint64_t for alignment */
+ uint64_t glsync_handle;
+ uint64_t thread_handle;
+ uint64_t guest_timeline_handle;
+};
+
+#define GOLDFISH_SYNC_MAX_CMDS 32
+
+/*
+ * Driver-wide state. A single instance exists, |global_sync_state|, defined
+ * right after this struct.
+ */
+struct goldfish_sync_state {
+ /* NOTE(review): presumably the mapped base of the SYNC_REG_* registers. */
+ char __iomem *reg_base;
+ int irq;
+
+ /* Spinlock protects |to_do| / |to_do_end|. */
+ spinlock_t lock;
+ /* |mutex_lock| protects all concurrent access
+ * to timelines for both kernel and user space. */
+ struct mutex mutex_lock;
+
+ /* Buffer holding commands issued from host. */
+ struct goldfish_sync_hostcmd to_do[GOLDFISH_SYNC_MAX_CMDS];
+ /* NOTE(review): presumably the number of valid entries in |to_do|. */
+ uint32_t to_do_end;
+
+ /* Addresses for the reading or writing
+ * of individual commands. The host can directly write
+ * to |batch_hostcmd| (and then this driver immediately
+ * copies contents to |to_do|). This driver either replies
+ * through |batch_hostcmd| or simply issues a
+ * guest->host command through |batch_guestcmd|.
+ */
+ struct goldfish_sync_hostcmd *batch_hostcmd;
+ struct goldfish_sync_guestcmd *batch_guestcmd;
+
+ /* Used to give this struct itself to a work queue
+ * function for executing actual sync commands. */
+ struct work_struct work_item;
+};
+
+static struct goldfish_sync_state global_sync_state[1];
+
+/*
+ * Reference counted wrapper around a |goldfish_sync_timeline|. The wrapper
+ * is freed by |delete_timeline_obj| once the last reference on |kref| is
+ * dropped.
+ */
+struct goldfish_sync_timeline_obj {
+ struct goldfish_sync_timeline *sync_tl;
+ uint32_t current_time;
+ /* We need to be careful about when we deallocate
+ * this |goldfish_sync_timeline_obj| struct.
+ * In order to ensure proper cleanup, we need to
+ * consider the triggered host-side wait that may
+ * still be in flight when the guest close()'s a
+ * goldfish_sync device's sync context fd (and
+ * destroys the |sync_tl| field above).
+ * The host-side wait may raise IRQ
+ * and tell the kernel to increment the timeline _after_
+ * the |sync_tl| has already been set to null.
+ *
+ * From observations on OpenGL apps and CTS tests, this
+ * happens at some very low probability upon context
+ * destruction or process close, but it does happen
+ * and it needs to be handled properly. Otherwise,
+ * if we clean up the surrounding |goldfish_sync_timeline_obj|
+ * too early, any |handle| field of any host->guest command
+ * might not even point to a null |sync_tl| field,
+ * but to garbage memory or even a reclaimed |sync_tl|.
+ * If we do not count such "pending waits" and kfree the object
+ * immediately upon |goldfish_sync_timeline_destroy|,
+ * we might get mysterious RCU stalls after running a long
+ * time because the garbage memory that is being read
+ * happens to be interpretable as a |spinlock_t| struct
+ * that is currently in the locked state.
+ *
+ * To track when to free the |goldfish_sync_timeline_obj|
+ * itself, we maintain a kref.
+ * The kref essentially counts the timeline itself plus
+ * the number of waits in flight. kref_init/kref_put
+ * are issued on
+ * |goldfish_sync_timeline_create|/|goldfish_sync_timeline_destroy|
+ * and kref_get/kref_put are issued on
+ * |goldfish_sync_fence_create|/|goldfish_sync_timeline_inc|.
+ *
+ * The timeline is destroyed after reference count
+ * reaches zero, which would happen after
+ * |goldfish_sync_timeline_destroy| and all pending
+ * |goldfish_sync_timeline_inc|'s are fulfilled.
+ *
+ * NOTE (1): We assume that |fence_create| and
+ * |timeline_inc| calls are 1:1, otherwise the kref scheme
+ * will not work. This is a valid assumption as long
+ * as the host-side virtual device implementation
+ * does not insert any timeline increments
+ * that we did not trigger from here.
+ *
+ * NOTE (2): The use of kref by itself requires no locks,
+ * but this does not mean everything works without locks.
+ * Related timeline operations do require a lock of some sort,
+ * or at least are not proven to work without it.
+ * In particular, we assume that all the operations
+ * done on the |kref| field above are done in contexts where
+ * |global_sync_state->mutex_lock| is held. Do not
+ * remove that lock until everything is proven to work
+ * without it!!! */
+ struct kref kref;
+};
+
+/* kref release callback for |goldfish_sync_timeline_obj|.
+ * Invoked when the final reference is dropped: releases the wrapped
+ * sync timeline and then frees the wrapper itself. */
+static void delete_timeline_obj(struct kref *kref)
+{
+	struct goldfish_sync_timeline_obj *wrapper;
+
+	wrapper = container_of(kref, struct goldfish_sync_timeline_obj, kref);
+
+	goldfish_sync_timeline_put_internal(wrapper->sync_tl);
+	wrapper->sync_tl = NULL;
+	kfree(wrapper);
+}
+
+/* Counter backing |gensym|; holds the suffix of the next generated name. */
+static uint64_t gensym_ctr;
+
+/* Writes the next name of the form "goldfish_sync:gensym:<n>" to |dst|.
+ * |dst| must be large enough for the name; no bound is enforced here. */
+static void gensym(char *dst)
+{
+	/* Post-increment: the name carries the value before the bump. */
+	sprintf(dst, "goldfish_sync:gensym:%llu", gensym_ctr++);
+}
+
+/* |goldfish_sync_timeline_create| assumes that |global_sync_state->mutex_lock|
+ * is held.
+ *
+ * Creates a sync timeline with a generated name and wraps it in a newly
+ * allocated |goldfish_sync_timeline_obj| whose kref starts out initialized.
+ *
+ * Returns the wrapper, or NULL if either the timeline or the wrapper could
+ * not be allocated. In the latter case the timeline is released again. */
+static struct goldfish_sync_timeline_obj*
+goldfish_sync_timeline_create(void)
+{
+
+	char timeline_name[256];
+	struct goldfish_sync_timeline *res_sync_tl = NULL;
+	struct goldfish_sync_timeline_obj *res;
+
+	DTRACE();
+
+	gensym(timeline_name);
+
+	res_sync_tl = goldfish_sync_timeline_create_internal(timeline_name);
+	if (!res_sync_tl) {
+		ERR("Failed to create goldfish_sw_sync timeline.");
+		return NULL;
+	}
+
+	res = kzalloc(sizeof(struct goldfish_sync_timeline_obj), GFP_KERNEL);
+	if (!res) {
+		/* Do not leak the timeline when the wrapper cannot be made. */
+		goldfish_sync_timeline_put_internal(res_sync_tl);
+		return NULL;
+	}
+	res->sync_tl = res_sync_tl;
+	res->current_time = 0;
+	kref_init(&res->kref);
+
+	DPRINT("new timeline_obj=0x%p", res);
+	return res;
+}
+
+/* Creates a sync point with value |val| on |obj|'s timeline, wraps it in a
+ * sync file and installs that file into a fresh O_CLOEXEC fd.
+ *
+ * |goldfish_sync_fence_create| assumes that |global_sync_state->mutex_lock|
+ * is held.
+ *
+ * On success a reference is taken on |obj| (released later by
+ * |goldfish_sync_timeline_inc|) and the fd is returned. Returns -1 when
+ * |obj| is NULL or when any step fails. */
+static int
+goldfish_sync_fence_create(struct goldfish_sync_timeline_obj *obj,
+			   uint32_t val)
+{
+	char fence_name[256];
+	struct goldfish_sync_timeline *tl;
+	struct sync_pt *pt;
+	struct sync_file *sfile;
+	int fd;
+
+	DTRACE();
+
+	if (obj == NULL)
+		return -1;
+
+	tl = obj->sync_tl;
+
+	pt = goldfish_sync_pt_create_internal(tl,
+					      sizeof(struct sync_pt) + 4,
+					      val);
+	if (pt == NULL) {
+		ERR("could not create sync point! "
+		    "goldfish_sync_timeline=0x%p val=%d",
+		    tl, val);
+		return -1;
+	}
+
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0) {
+		ERR("could not get unused fd for sync fence. "
+		    "errno=%d", fd);
+		goto err_put_fence;
+	}
+
+	/* The generated name is not used below; the call still advances the
+	 * shared gensym counter. */
+	gensym(fence_name);
+
+	sfile = sync_file_create(&pt->base);
+	if (sfile == NULL) {
+		ERR("could not create sync fence! "
+		    "goldfish_sync_timeline=0x%p val=%d sync_pt=0x%p",
+		    tl, val, pt);
+		goto err_release_fd;
+	}
+
+	DPRINT("installing sync fence into fd %d sync_file_obj=0x%p",
+	       fd, sfile);
+	fd_install(fd, sfile->file);
+	kref_get(&obj->kref);
+
+	return fd;
+
+err_release_fd:
+	put_unused_fd(fd);
+err_put_fence:
+	fence_put(&pt->base);
+	return -1;
+}
+
+/* Advances |obj|'s timeline by |inc| and releases the reference that the
+ * matching |goldfish_sync_fence_create| took.
+ *
+ * |goldfish_sync_timeline_inc| assumes that |global_sync_state->mutex_lock|
+ * is held. */
+static void
+goldfish_sync_timeline_inc(struct goldfish_sync_timeline_obj *obj, uint32_t inc)
+{
+	DTRACE();
+
+	/* A NULL timeline means somebody already nuked it; whoever that was
+	 * won't care that nothing gets signaled. */
+	if (obj == NULL)
+		return;
+
+	DPRINT("timeline_obj=0x%p", obj);
+	goldfish_sync_timeline_signal_internal(obj->sync_tl, inc);
+	DPRINT("incremented timeline. increment max_time");
+	obj->current_time += inc;
+
+	/* If |goldfish_sync_timeline_destroy| has already run and this was
+	 * the last pending increment, this put deletes the timeline object. */
+	kref_put(&obj->kref, delete_timeline_obj);
+	DPRINT("done");
+}
+
+/* Drops the creation reference on |obj|. The object is only freed once
+ * every pending |goldfish_sync_timeline_inc| has dropped its reference too.
+ *
+ * |goldfish_sync_timeline_destroy| assumes
+ * that |global_sync_state->mutex_lock| is held.
+ *
+ * A NULL |obj| is ignored, matching |goldfish_sync_timeline_inc|: the work
+ * item passes a host-supplied handle here, and kref_put on a NULL-based
+ * address would fault. */
+static void
+goldfish_sync_timeline_destroy(struct goldfish_sync_timeline_obj *obj)
+{
+	DTRACE();
+
+	if (!obj)
+		return;
+
+	/* See description of |goldfish_sync_timeline_obj| for why we
+	 * should not immediately destroy |obj| */
+	kref_put(&obj->kref, delete_timeline_obj);
+}
+
+/* Appends one host command to |sync_state->to_do|. Hits BUG_ON when the
+ * queue already holds GOLDFISH_SYNC_MAX_CMDS entries. No locking is done
+ * here. */
+static inline void
+goldfish_sync_cmd_queue(struct goldfish_sync_state *sync_state,
+			uint32_t cmd,
+			uint64_t handle,
+			uint32_t time_arg,
+			uint64_t hostcmd_handle)
+{
+	struct goldfish_sync_hostcmd *slot;
+
+	DTRACE();
+
+	BUG_ON(sync_state->to_do_end == GOLDFISH_SYNC_MAX_CMDS);
+
+	slot = &sync_state->to_do[sync_state->to_do_end++];
+
+	slot->cmd = cmd;
+	slot->handle = handle;
+	slot->time_arg = time_arg;
+	slot->hostcmd_handle = hostcmd_handle;
+}
+
+/* Fills the shared |batch_hostcmd| buffer with a reply to a host command and
+ * writes SYNC_REG_BATCH_COMMAND, all under |sync_state->lock|.
+ * NOTE(review): the register write presumably tells the host to consume the
+ * buffer - confirm against the device model. */
+static inline void
+goldfish_sync_hostcmd_reply(struct goldfish_sync_state *sync_state,
+			    uint32_t cmd,
+			    uint64_t handle,
+			    uint32_t time_arg,
+			    uint64_t hostcmd_handle)
+{
+	struct goldfish_sync_hostcmd *reply = sync_state->batch_hostcmd;
+	unsigned long flags;
+
+	DTRACE();
+
+	spin_lock_irqsave(&sync_state->lock, flags);
+
+	reply->cmd = cmd;
+	reply->handle = handle;
+	reply->time_arg = time_arg;
+	reply->hostcmd_handle = hostcmd_handle;
+	writel(0, sync_state->reg_base + SYNC_REG_BATCH_COMMAND);
+
+	spin_unlock_irqrestore(&sync_state->lock, flags);
+}
+
+/* Fills the shared |batch_guestcmd| buffer with a guest->host command and
+ * writes SYNC_REG_BATCH_GUESTCOMMAND, all under |sync_state->lock|.
+ * NOTE(review): the register write presumably tells the host to consume the
+ * buffer - confirm against the device model. */
+static inline void
+goldfish_sync_send_guestcmd(struct goldfish_sync_state *sync_state,
+			    uint32_t cmd,
+			    uint64_t glsync_handle,
+			    uint64_t thread_handle,
+			    uint64_t timeline_handle)
+{
+	struct goldfish_sync_guestcmd *out = sync_state->batch_guestcmd;
+	unsigned long flags;
+
+	DTRACE();
+
+	spin_lock_irqsave(&sync_state->lock, flags);
+
+	out->host_command = (uint64_t)cmd;
+	out->glsync_handle = (uint64_t)glsync_handle;
+	out->thread_handle = (uint64_t)thread_handle;
+	out->guest_timeline_handle = (uint64_t)timeline_handle;
+	writel(0, sync_state->reg_base + SYNC_REG_BATCH_GUESTCOMMAND);
+
+	spin_unlock_irqrestore(&sync_state->lock, flags);
+}
+
+/* |goldfish_sync_interrupt| handles IRQ raises from the virtual device.
+ * In the context of OpenGL, the interrupt fires whenever a fence fd in the
+ * guest needs signaling, with the command |CMD_SYNC_TIMELINE_INC|.
+ *
+ * This runs in interrupt context, so the handler only drains the pending
+ * host commands into |to_do| under the spinlock. The commands themselves are
+ * executed later by |goldfish_sync_work_item_fn|, which is queued on the
+ * shared work queue through |schedule_work| once the drain loop is done.
+ *
+ * Returns IRQ_HANDLED if at least one command was queued, else IRQ_NONE.
+ */
+static irqreturn_t goldfish_sync_interrupt(int irq, void *dev_id)
+{
+	struct goldfish_sync_state *sync_state = dev_id;
+	struct goldfish_sync_hostcmd *incoming;
+	uint32_t nextcmd;
+	int count = 0;
+
+	DTRACE();
+
+	spin_lock(&sync_state->lock);
+
+	for (;;) {
+		/* NOTE(review): this read presumably makes the host fill
+		 * |batch_hostcmd| with the next command - confirm. */
+		readl(sync_state->reg_base + SYNC_REG_BATCH_COMMAND);
+
+		incoming = sync_state->batch_hostcmd;
+		nextcmd = incoming->cmd;
+
+		/* Command 0 ends the drain loop. */
+		if (nextcmd == 0)
+			break;
+
+		goldfish_sync_cmd_queue(sync_state,
+					nextcmd,
+					incoming->handle,
+					incoming->time_arg,
+					incoming->hostcmd_handle);
+		count++;
+	}
+
+	spin_unlock(&sync_state->lock);
+
+	schedule_work(&sync_state->work_item);
+
+	return count ? IRQ_HANDLED : IRQ_NONE;
+}
+
+/* |goldfish_sync_work_item_fn| does the actual work of servicing
+ * host->guest sync commands. It is the work function scheduled by
+ * |goldfish_sync_interrupt|. While holding |mutex_lock| for the whole
+ * run, it first copies the queued commands out of |to_do| into a local
+ * array under the spinlock and resets the queue, and then
+ * runs all of them one after the other outside the spinlock (there may
+ * be multiple, because our IRQ is active high and not edge triggered).
+ */
+static void goldfish_sync_work_item_fn(struct work_struct *input)
+{
+
+ struct goldfish_sync_state *sync_state;
+ int sync_fence_fd;
+
+ struct goldfish_sync_timeline_obj *timeline;
+ uint64_t timeline_ptr;
+
+ uint64_t hostcmd_handle;
+
+ uint32_t cmd;
+ uint64_t handle;
+ uint32_t time_arg;
+
+ struct goldfish_sync_hostcmd *todo;
+ uint32_t todo_end;
+
+ unsigned long irq_flags;
+
+ struct goldfish_sync_hostcmd to_run[GOLDFISH_SYNC_MAX_CMDS]; /* local snapshot of |to_do| */
+ uint32_t i = 0;
+
+ sync_state = container_of(input, struct goldfish_sync_state, work_item);
+
+ mutex_lock(&sync_state->mutex_lock);
+
+ spin_lock_irqsave(&sync_state->lock, irq_flags); {
+
+ todo_end = sync_state->to_do_end;
+
+ DPRINT("num sync todos: %u", sync_state->to_do_end);
+
+ for (i = 0; i < todo_end; i++)
+ to_run[i] = sync_state->to_do[i];
+
+ /* We expect that commands will come in at a slow enough rate
+ * so that incoming items will not be more than
+ * GOLDFISH_SYNC_MAX_CMDS.
+ *
+ * This is because the way the sync device is used,
+ * it's only for managing buffer data transfers per frame,
+ * with a sequential dependency between putting things in
+ * to_do and taking them out. Once a set of commands is
+ * queued up in to_do, the user of the device waits for
+ * them to be processed before queuing additional commands,
+ * which limits the rate at which commands come in
+ * to the rate at which we take them out here.
+ *
+ * We also don't expect more than MAX_CMDS to be issued
+ * at once; there is a correspondence between
+ * which buffers need swapping to the (display / buffer queue)
+ * to particular commands, and we don't expect there to be
+ * enough display or buffer queues in operation at once
+ * to overrun GOLDFISH_SYNC_MAX_CMDS.
+ */
+ sync_state->to_do_end = 0;
+
+ } spin_unlock_irqrestore(&sync_state->lock, irq_flags);
+
+ for (i = 0; i < todo_end; i++) {
+ DPRINT("todo index: %u", i);
+
+ todo = &to_run[i];
+
+ cmd = todo->cmd;
+
+ handle = (uint64_t)todo->handle;
+ time_arg = todo->time_arg;
+ hostcmd_handle = (uint64_t)todo->hostcmd_handle;
+
+ DTRACE();
+
+ timeline = (struct goldfish_sync_timeline_obj *)(uintptr_t)handle; /* |handle| is used as a kernel pointer; the handles this driver sends to the host are such pointers */
+
+ switch (cmd) { /* no default case: unrecognized commands are ignored */
+ case CMD_SYNC_READY:
+ break;
+ case CMD_CREATE_SYNC_TIMELINE:
+ DPRINT("exec CMD_CREATE_SYNC_TIMELINE: "
+ "handle=0x%llx time_arg=%d",
+ handle, time_arg);
+ timeline = goldfish_sync_timeline_create();
+ timeline_ptr = (uintptr_t)timeline; /* 0 if creation failed */
+ goldfish_sync_hostcmd_reply(sync_state, CMD_CREATE_SYNC_TIMELINE,
+ timeline_ptr,
+ 0,
+ hostcmd_handle);
+ DPRINT("sync timeline created: %p", timeline);
+ break;
+ case CMD_CREATE_SYNC_FENCE:
+ DPRINT("exec CMD_CREATE_SYNC_FENCE: "
+ "handle=0x%llx time_arg=%d",
+ handle, time_arg);
+ sync_fence_fd = goldfish_sync_fence_create(timeline, time_arg); /* -1 on failure; replied to the host as-is */
+ goldfish_sync_hostcmd_reply(sync_state, CMD_CREATE_SYNC_FENCE,
+ sync_fence_fd,
+ 0,
+ hostcmd_handle);
+ break;
+ case CMD_SYNC_TIMELINE_INC:
+ DPRINT("exec CMD_SYNC_TIMELINE_INC: "
+ "handle=0x%llx time_arg=%d",
+ handle, time_arg);
+ goldfish_sync_timeline_inc(timeline, time_arg);
+ break;
+ case CMD_DESTROY_SYNC_TIMELINE:
+ DPRINT("exec CMD_DESTROY_SYNC_TIMELINE: "
+ "handle=0x%llx time_arg=%d",
+ handle, time_arg);
+ goldfish_sync_timeline_destroy(timeline);
+ break;
+ }
+ DPRINT("Done executing sync command");
+ }
+ mutex_unlock(&sync_state->mutex_lock);
+}
+
+/* Guest-side interface: file operations */
+
+/* Goldfish sync context and ioctl info.
+ *
+ * When a sync context is created by open()-ing the goldfish sync device, we
+ * create a sync context (|goldfish_sync_context|).
+ *
+ * Currently, the only data required to track is the sync timeline itself
+ * along with the current time, which are all packed up in the
+ * |goldfish_sync_timeline_obj| field. We use a |goldfish_sync_context|
+ * as the filp->private_data.
+ *
+ * Next, when a sync context user requests that work be queued and a fence
+ * fd provided, we use the |goldfish_sync_ioctl_info| struct, which holds
+ * information about which host handles to touch for this particular
+ * queue-work operation. We need to know about the host-side sync thread
+ * and the particular host-side GLsync object. We also possibly write out
+ * a file descriptor.
+ */
+struct goldfish_sync_context {
+ struct goldfish_sync_timeline_obj *timeline; /* NULL until the first QUEUE_WORK ioctl creates it */
+};
+
+struct goldfish_sync_ioctl_info { /* copied from and back to user space by GOLDFISH_SYNC_IOC_QUEUE_WORK */
+ uint64_t host_glsync_handle_in; /* in: forwarded to the host as the GLsync handle */
+ uint64_t host_syncthread_handle_in; /* in: forwarded to the host as the sync thread handle; zero is rejected */
+ int fence_fd_out; /* out: fd of the fence created for this request */
+};
+
+/* open() handler: allocates an empty |goldfish_sync_context| (no timeline
+ * yet) and stores it in |file->private_data|, all under the global mutex.
+ * Returns 0 on success or -ENOMEM if the allocation fails. */
+static int goldfish_sync_open(struct inode *inode, struct file *file)
+{
+	struct goldfish_sync_context *ctx;
+	int ret = 0;
+
+	DTRACE();
+
+	mutex_lock(&global_sync_state->mutex_lock);
+
+	ctx = kzalloc(sizeof(struct goldfish_sync_context), GFP_KERNEL);
+	if (!ctx) {
+		ERR("Creation of goldfish sync context failed!");
+		ret = -ENOMEM;
+		goto out_unlock;
+	}
+
+	ctx->timeline = NULL;
+	file->private_data = ctx;
+
+	DPRINT("successfully create a sync context @0x%p", ctx);
+
+out_unlock:
+	mutex_unlock(&global_sync_state->mutex_lock);
+	return ret;
+}
+
+/* release() handler: under the global mutex, drops the context's reference
+ * on its timeline (if one was ever created) and frees the context.
+ * Always returns 0. */
+static int goldfish_sync_release(struct inode *inode, struct file *file)
+{
+	struct goldfish_sync_context *ctx;
+
+	DTRACE();
+
+	mutex_lock(&global_sync_state->mutex_lock);
+
+	ctx = file->private_data;
+
+	if (ctx->timeline != NULL) {
+		goldfish_sync_timeline_destroy(ctx->timeline);
+		ctx->timeline = NULL;
+	}
+
+	kfree(ctx);
+
+	mutex_unlock(&global_sync_state->mutex_lock);
+
+	return 0;
+}
+
+/* |goldfish_sync_ioctl| is the guest-facing interface of goldfish sync
+ * and is used in conjunction with eglCreateSyncKHR to queue up the
+ * actual work of waiting for the EGL sync command to complete,
+ * possibly returning a fence fd to the guest.
+ *
+ * GOLDFISH_SYNC_IOC_QUEUE_WORK copies a |goldfish_sync_ioctl_info| from
+ * |arg|, creates this context's timeline on first use, creates a fence fd
+ * at |current_time + 1|, writes the fd back through |arg| and finally sends
+ * CMD_TRIGGER_HOST_WAIT to the host.
+ *
+ * Returns 0 on success; -EFAULT if the user buffer cannot be copied or the
+ * host syncthread handle is zero; -ENOMEM if the timeline or the fence
+ * cannot be created; -ENOTTY for any other |cmd|.
+ */
+static long goldfish_sync_ioctl(struct file *file,
+				unsigned int cmd,
+				unsigned long arg)
+{
+	struct goldfish_sync_context *sync_context_data;
+	struct goldfish_sync_timeline_obj *timeline;
+	struct goldfish_sync_ioctl_info ioctl_data;
+	int fd_out;
+	long ret = 0;
+
+	DTRACE();
+
+	if (cmd != GOLDFISH_SYNC_IOC_QUEUE_WORK)
+		return -ENOTTY;
+
+	sync_context_data = file->private_data;
+
+	DPRINT("exec GOLDFISH_SYNC_IOC_QUEUE_WORK");
+
+	mutex_lock(&global_sync_state->mutex_lock);
+
+	if (copy_from_user(&ioctl_data,
+			   (void __user *)arg,
+			   sizeof(ioctl_data))) {
+		ERR("Failed to copy memory for ioctl_data from user.");
+		ret = -EFAULT;
+		goto out_unlock;
+	}
+
+	if (ioctl_data.host_syncthread_handle_in == 0) {
+		DPRINT("Error: zero host syncthread handle!!!");
+		ret = -EFAULT;
+		goto out_unlock;
+	}
+
+	if (!sync_context_data->timeline) {
+		DPRINT("no timeline yet, create one.");
+		sync_context_data->timeline = goldfish_sync_timeline_create();
+		if (!sync_context_data->timeline) {
+			/* Creation can fail; never dereference NULL below. */
+			ERR("Failed to create timeline for sync context.");
+			ret = -ENOMEM;
+			goto out_unlock;
+		}
+		DPRINT("timeline: 0x%p", sync_context_data->timeline);
+	}
+
+	timeline = sync_context_data->timeline;
+	fd_out = goldfish_sync_fence_create(timeline,
+					    timeline->current_time + 1);
+	if (fd_out < 0) {
+		/* No fence means no reference was taken on the timeline.
+		 * Triggering a host wait anyway would later produce a
+		 * timeline increment whose kref_put has no matching
+		 * kref_get. The helper does not report why it failed, so
+		 * -ENOMEM is returned as a generic resource error. */
+		ERR("Failed to create fence for timeline 0x%p", timeline);
+		ret = -ENOMEM;
+		goto out_unlock;
+	}
+	DPRINT("Created fence with fd %d and current time %u (timeline: 0x%p)",
+	       fd_out,
+	       sync_context_data->timeline->current_time + 1,
+	       sync_context_data->timeline);
+
+	ioctl_data.fence_fd_out = fd_out;
+
+	if (copy_to_user((void __user *)arg,
+			 &ioctl_data,
+			 sizeof(ioctl_data))) {
+		DPRINT("Error, could not copy to user!!!");
+
+		sys_close(fd_out);
+		/* We won't be doing an increment, kref_put immediately. */
+		kref_put(&timeline->kref, delete_timeline_obj);
+		ret = -EFAULT;
+		goto out_unlock;
+	}
+
+	/* We are now about to trigger a host-side wait;
+	 * accumulate on |pending_waits|. */
+	goldfish_sync_send_guestcmd(global_sync_state,
+				    CMD_TRIGGER_HOST_WAIT,
+				    ioctl_data.host_glsync_handle_in,
+				    ioctl_data.host_syncthread_handle_in,
+				    (uint64_t)(uintptr_t)(sync_context_data->timeline));
+
+out_unlock:
+	mutex_unlock(&global_sync_state->mutex_lock);
+	return ret;
+}
+
+/* File operations of the goldfish_sync device node. The same handler
+ * serves both the native and the compat ioctl entry points. */
+static const struct file_operations goldfish_sync_fops = {
+ .owner = THIS_MODULE,
+ .open = goldfish_sync_open,
+ .release = goldfish_sync_release,
+ .unlocked_ioctl = goldfish_sync_ioctl,
+ .compat_ioctl = goldfish_sync_ioctl,
+};
+
+/* Misc device registered by |goldfish_sync_probe|.
+ * NOTE(review): |.minor| is not set here, so it is 0 rather than
+ * MISC_DYNAMIC_MINOR - confirm that a fixed minor of 0 is intended. */
+static struct miscdevice goldfish_sync_device = {
+ .name = "goldfish_sync",
+ .fops = &goldfish_sync_fops,
+};
+
+
+/* Writes the physical address of |batch_addr| to the device as two 32-bit
+ * halves (|addr_offset| = low, |addr_offset_high| = high), then reads both
+ * registers back and checks that they hold the same address.
+ *
+ * Returns false if |batch_addr| is NULL or the read-back differs, true
+ * otherwise. */
+static bool setup_verify_batch_cmd_addr(struct goldfish_sync_state *sync_state,
+					void *batch_addr,
+					uint32_t addr_offset,
+					uint32_t addr_offset_high)
+{
+	uint64_t phys;
+	uint64_t readback;
+	uint32_t lo;
+	uint32_t hi;
+
+	if (batch_addr == NULL) {
+		ERR("Could not use batch command address!");
+		return false;
+	}
+
+	phys = virt_to_phys(batch_addr);
+
+	writel((uint32_t)phys, sync_state->reg_base + addr_offset);
+	writel((uint32_t)(phys >> 32), sync_state->reg_base + addr_offset_high);
+
+	lo = readl(sync_state->reg_base + addr_offset);
+	hi = readl(sync_state->reg_base + addr_offset_high);
+	readback = ((uint64_t)hi << 32) | lo;
+
+	if (readback != phys) {
+		ERR("Invalid batch command address!");
+		return false;
+	}
+
+	return true;
+}
+
+/* Platform probe: maps the device registers, hands the host the two batch
+ * command buffers, installs the interrupt handler, registers the misc
+ * device and finally writes SYNC_REG_INIT.
+ *
+ * Ordering matters: the work item and the batch buffers are set up before
+ * the (shared) IRQ is requested and before the device node is registered,
+ * because both the interrupt handler and the ioctl path dereference them.
+ *
+ * Returns 0 on success, -ENODEV / -ENOMEM on resource failures, or the
+ * error from misc_register(). Device-managed resources are released by the
+ * driver core when probe fails.
+ */
+int goldfish_sync_probe(struct platform_device *pdev)
+{
+	struct resource *ioresource;
+	struct goldfish_sync_state *sync_state = global_sync_state;
+	struct goldfish_sync_hostcmd *batch_addr_hostcmd;
+	struct goldfish_sync_guestcmd *batch_addr_guestcmd;
+	int status;
+
+	DTRACE();
+
+	sync_state->to_do_end = 0;
+
+	spin_lock_init(&sync_state->lock);
+	mutex_init(&sync_state->mutex_lock);
+
+	/* Must be ready before the interrupt handler can schedule it. */
+	INIT_WORK(&sync_state->work_item,
+		  goldfish_sync_work_item_fn);
+
+	platform_set_drvdata(pdev, sync_state);
+
+	ioresource = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (ioresource == NULL) {
+		ERR("platform_get_resource failed");
+		return -ENODEV;
+	}
+
+	sync_state->reg_base =
+		devm_ioremap(&pdev->dev, ioresource->start, PAGE_SIZE);
+	if (sync_state->reg_base == NULL) {
+		ERR("Could not ioremap");
+		return -ENOMEM;
+	}
+
+	/* Obtain addresses for batch send/recv of commands. A failed
+	 * allocation is caught by the NULL check in
+	 * setup_verify_batch_cmd_addr(). */
+	batch_addr_hostcmd =
+		devm_kzalloc(&pdev->dev, sizeof(struct goldfish_sync_hostcmd),
+			     GFP_KERNEL);
+	batch_addr_guestcmd =
+		devm_kzalloc(&pdev->dev, sizeof(struct goldfish_sync_guestcmd),
+			     GFP_KERNEL);
+
+	if (!setup_verify_batch_cmd_addr(sync_state,
+					 batch_addr_hostcmd,
+					 SYNC_REG_BATCH_COMMAND_ADDR,
+					 SYNC_REG_BATCH_COMMAND_ADDR_HIGH)) {
+		ERR("goldfish_sync: Could not setup batch command address");
+		return -ENODEV;
+	}
+
+	if (!setup_verify_batch_cmd_addr(sync_state,
+					 batch_addr_guestcmd,
+					 SYNC_REG_BATCH_GUESTCOMMAND_ADDR,
+					 SYNC_REG_BATCH_GUESTCOMMAND_ADDR_HIGH)) {
+		ERR("goldfish_sync: Could not setup batch guest command address");
+		return -ENODEV;
+	}
+
+	sync_state->batch_hostcmd = batch_addr_hostcmd;
+	sync_state->batch_guestcmd = batch_addr_guestcmd;
+
+	sync_state->irq = platform_get_irq(pdev, 0);
+	if (sync_state->irq < 0) {
+		ERR("Could not platform_get_irq");
+		return -ENODEV;
+	}
+
+	status = devm_request_irq(&pdev->dev,
+				  sync_state->irq,
+				  goldfish_sync_interrupt,
+				  IRQF_SHARED,
+				  pdev->name,
+				  sync_state);
+	if (status) {
+		ERR("request_irq failed");
+		return -ENODEV;
+	}
+
+	/* Expose the device node only once everything it uses exists. */
+	status = misc_register(&goldfish_sync_device);
+	if (status) {
+		ERR("misc_register failed");
+		return status;
+	}
+
+	INFO("goldfish_sync: Initialized goldfish sync device");
+
+	writel(0, sync_state->reg_base + SYNC_REG_INIT);
+
+	return 0;
+}
+
+/* Platform remove: unregisters the device node, quiesces the interrupt
+ * handler and the work item, then clears the global driver state.
+ *
+ * The IRQ was requested with devm_request_irq(), so it would otherwise
+ * stay installed until after this function returns; the handler and the
+ * work item must not run against the zeroed state (lock, register base,
+ * queue). Always returns 0.
+ */
+static int goldfish_sync_remove(struct platform_device *pdev)
+{
+	struct goldfish_sync_state *sync_state = global_sync_state;
+
+	DTRACE();
+
+	misc_deregister(&goldfish_sync_device);
+
+	/* Free the IRQ first so nothing can reschedule the work item,
+	 * then wait for any queued or running instance to finish. */
+	devm_free_irq(&pdev->dev, sync_state->irq, sync_state);
+	cancel_work_sync(&sync_state->work_item);
+
+	memset(sync_state, 0, sizeof(struct goldfish_sync_state));
+	return 0;
+}
+
+/* Device-tree match table: binds to "google,goldfish-sync" nodes. */
+static const struct of_device_id goldfish_sync_of_match[] = {
+ { .compatible = "google,goldfish-sync", },
+ {},
+};
+MODULE_DEVICE_TABLE(of, goldfish_sync_of_match);
+
+/* ACPI match table: binds to the "GFSH0006" device ID. */
+static const struct acpi_device_id goldfish_sync_acpi_match[] = {
+ { "GFSH0006", 0 },
+ { },
+};
+
+MODULE_DEVICE_TABLE(acpi, goldfish_sync_acpi_match);
+
+/* Platform driver definition tying probe/remove to both match tables. */
+static struct platform_driver goldfish_sync = {
+ .probe = goldfish_sync_probe,
+ .remove = goldfish_sync_remove,
+ .driver = {
+ .name = "goldfish_sync",
+ .of_match_table = goldfish_sync_of_match,
+ .acpi_match_table = ACPI_PTR(goldfish_sync_acpi_match),
+ }
+};
+
+module_platform_driver(goldfish_sync);
+
+MODULE_AUTHOR("Google, Inc.");
+MODULE_DESCRIPTION("Android QEMU Sync Driver");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("1.0");
diff --git a/drivers/staging/goldfish/goldfish_sync_timeline_fence.c b/drivers/staging/goldfish/goldfish_sync_timeline_fence.c
new file mode 100644
index 000000000000..e671618cf888
--- /dev/null
+++ b/drivers/staging/goldfish/goldfish_sync_timeline_fence.c
@@ -0,0 +1,254 @@
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/syscalls.h>
+#include <linux/sync_file.h>
+#include <linux/fence.h>
+
+#include "goldfish_sync_timeline_fence.h"
+
+/*
+ * Timeline-based sync for Goldfish Sync
+ * Based on "Sync File validation framework"
+ * (drivers/dma-buf/sw_sync.c)
+ *
+ * Copyright (C) 2017 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+/**
+ * struct goldfish_sync_timeline - sync object
+ * @kref: reference count on the timeline; one reference is held by the
+ *	creator and one by every sync_pt created on it.
+ * @name: name of the goldfish_sync_timeline. Useful for debugging
+ * @context: fence context allocated for this timeline and passed to
+ *	every fence created on it
+ * @value: current timeline value; a fence counts as signaled once its
+ *	seqno is not greater than this value
+ * @child_list_head: list of children sync_pts for this goldfish_sync_timeline
+ * @child_list_lock: lock protecting @child_list_head and fence.status;
+ *	it is also the lock handed to every child fence
+ * @active_list_head: list of active (unsignaled/errored) sync_pts
+ */
+struct goldfish_sync_timeline {
+ struct kref kref;
+ char name[32];
+
+ /* protected by child_list_lock */
+ u64 context;
+ int value;
+
+ struct list_head child_list_head;
+ spinlock_t child_list_lock;
+
+ struct list_head active_list_head;
+};
+
+/* Recovers the owning timeline from a fence. Every fence created here uses
+ * its timeline's |child_list_lock| as |fence->lock|, so the timeline is the
+ * structure that contains that lock. */
+static inline struct goldfish_sync_timeline *fence_parent(struct fence *fence)
+{
+	spinlock_t *tl_lock = fence->lock;
+
+	return container_of(tl_lock, struct goldfish_sync_timeline,
+			    child_list_lock);
+}
+
+static const struct fence_ops goldfish_sync_timeline_fence_ops;
+
+/* Converts a fence to its enclosing sync_pt, or returns NULL when the fence
+ * does not use this file's fence ops and so was not created here. */
+static inline struct sync_pt *goldfish_sync_fence_to_sync_pt(struct fence *fence)
+{
+	return (fence->ops == &goldfish_sync_timeline_fence_ops) ?
+		container_of(fence, struct sync_pt, base) : NULL;
+}
+
+/**
+ * goldfish_sync_timeline_create_internal() - creates a sync object
+ * @name: sync_timeline name; copied into a 32-byte buffer, so longer names
+ *	are truncated
+ *
+ * Allocates a zeroed goldfish_sync_timeline holding one reference, gives it
+ * a fresh fence context and empty child/active lists. Returns the new
+ * timeline or NULL if the allocation fails.
+ */
+struct goldfish_sync_timeline
+*goldfish_sync_timeline_create_internal(const char *name)
+{
+	struct goldfish_sync_timeline *tl = kzalloc(sizeof(*tl), GFP_KERNEL);
+
+	if (tl == NULL)
+		return NULL;
+
+	spin_lock_init(&tl->child_list_lock);
+	INIT_LIST_HEAD(&tl->child_list_head);
+	INIT_LIST_HEAD(&tl->active_list_head);
+
+	kref_init(&tl->kref);
+	tl->context = fence_context_alloc(1);
+	strlcpy(tl->name, name, sizeof(tl->name));
+
+	return tl;
+}
+
+/* kref release callback: frees the timeline that embeds |kref|. */
+static void goldfish_sync_timeline_free_internal(struct kref *kref)
+{
+	kfree(container_of(kref, struct goldfish_sync_timeline, kref));
+}
+
+/* Takes an additional reference on the timeline |obj|. */
+static void goldfish_sync_timeline_get_internal(
+ struct goldfish_sync_timeline *obj)
+{
+ kref_get(&obj->kref);
+}
+
+/* Drops one reference on the timeline |obj|; the timeline is freed by
+ * |goldfish_sync_timeline_free_internal| when the last one goes away. */
+void goldfish_sync_timeline_put_internal(struct goldfish_sync_timeline *obj)
+{
+ kref_put(&obj->kref, goldfish_sync_timeline_free_internal);
+}
+
+/**
+ * goldfish_sync_timeline_signal_internal() -
+ * signal a status change on a goldfish_sync_timeline
+ * @obj: sync_timeline to signal
+ * @inc: num to increment on timeline->value
+ *
+ * Adds @inc to the timeline value and then, still under the timeline lock,
+ * removes every sync_pt that now reports as signaled from the active list.
+ * A sync implementation should call this any time one of its fences
+ * has signaled or has an error condition.
+ */
+void goldfish_sync_timeline_signal_internal(struct goldfish_sync_timeline *obj,
+					    unsigned int inc)
+{
+	struct sync_pt *cur, *tmp;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&obj->child_list_lock, irq_flags);
+
+	obj->value += inc;
+
+	/* The _safe iterator is required: entries are unlinked mid-walk. */
+	list_for_each_entry_safe(cur, tmp, &obj->active_list_head,
+				 active_list) {
+		if (!fence_is_signaled_locked(&cur->base))
+			continue;
+		list_del_init(&cur->active_list);
+	}
+
+	spin_unlock_irqrestore(&obj->child_list_lock, irq_flags);
+}
+
+/**
+ * goldfish_sync_pt_create_internal() - creates a sync pt
+ * @obj: fence's parent sync_timeline
+ * @size: size to allocate for this pt; must be at least sizeof(struct sync_pt)
+ * @value: value of the fence
+ *
+ * Creates a new sync_pt as a child of @obj. @size bytes will be
+ * allocated allowing for implementation specific data to be kept after
+ * the generic sync_pt struct. The new point holds a reference on @obj and
+ * is linked into its child list. Returns the sync_pt object or
+ * NULL in case of error.
+ */
+struct sync_pt *goldfish_sync_pt_create_internal(
+		struct goldfish_sync_timeline *obj, int size,
+		unsigned int value)
+{
+	struct sync_pt *pt;
+	unsigned long irq_flags;
+
+	if (size < sizeof(*pt))
+		return NULL;
+
+	pt = kzalloc(size, GFP_KERNEL);
+	if (pt == NULL)
+		return NULL;
+
+	spin_lock_irqsave(&obj->child_list_lock, irq_flags);
+
+	/* Every point keeps its timeline alive until the fence is released. */
+	goldfish_sync_timeline_get_internal(obj);
+	fence_init(&pt->base, &goldfish_sync_timeline_fence_ops,
+		   &obj->child_list_lock, obj->context, value);
+	list_add_tail(&pt->child_list, &obj->child_list_head);
+	INIT_LIST_HEAD(&pt->active_list);
+
+	spin_unlock_irqrestore(&obj->child_list_lock, irq_flags);
+
+	return pt;
+}
+
+/* fence_ops.get_driver_name: reports the fixed driver name "sw_sync". */
+static const char *goldfish_sync_timeline_fence_get_driver_name(
+ struct fence *fence)
+{
+ return "sw_sync";
+}
+
+/* fence_ops.get_timeline_name: returns the name of the fence's timeline. */
+static const char *goldfish_sync_timeline_fence_get_timeline_name(
+		struct fence *fence)
+{
+	return fence_parent(fence)->name;
+}
+
+/* fence_ops.release: unlinks the sync_pt from its timeline's child list
+ * (and from the active list if it is still on it) under the timeline lock,
+ * drops the point's reference on the timeline and frees the fence. */
+static void goldfish_sync_timeline_fence_release(struct fence *fence)
+{
+	struct goldfish_sync_timeline *tl = fence_parent(fence);
+	struct sync_pt *pt = goldfish_sync_fence_to_sync_pt(fence);
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(fence->lock, irq_flags);
+
+	list_del(&pt->child_list);
+	if (!list_empty(&pt->active_list))
+		list_del(&pt->active_list);
+
+	spin_unlock_irqrestore(fence->lock, irq_flags);
+
+	goldfish_sync_timeline_put_internal(tl);
+	fence_free(fence);
+}
+
+/* fence_ops.signaled: a fence is signaled unless its seqno is still greater
+ * than the current value of its timeline. */
+static bool goldfish_sync_timeline_fence_signaled(struct fence *fence)
+{
+	struct goldfish_sync_timeline *tl = fence_parent(fence);
+
+	return !(fence->seqno > tl->value);
+}
+
+/* fence_ops.enable_signaling: returns false if the fence is already
+ * signaled; otherwise puts its sync_pt on the timeline's active list and
+ * returns true. */
+static bool goldfish_sync_timeline_fence_enable_signaling(struct fence *fence)
+{
+	struct sync_pt *pt;
+
+	if (goldfish_sync_timeline_fence_signaled(fence))
+		return false;
+
+	pt = goldfish_sync_fence_to_sync_pt(fence);
+	list_add_tail(&pt->active_list,
+		      &fence_parent(fence)->active_list_head);
+	return true;
+}
+
+/* fence_ops.disable_signaling: takes the fence's sync_pt off the active
+ * list, leaving its list node re-initialized. */
+static void goldfish_sync_timeline_fence_disable_signaling(struct fence *fence)
+{
+	list_del_init(&container_of(fence, struct sync_pt, base)->active_list);
+}
+
+/* fence_ops.fence_value_str: formats the fence's sequence number into
+ * |str| (at most |size| bytes). The sequence number is unsigned (it comes
+ * from the unsigned |value| passed at creation), so it is printed with %u;
+ * %d would show values above INT_MAX as negative. */
+static void goldfish_sync_timeline_fence_value_str(struct fence *fence,
+						   char *str, int size)
+{
+	snprintf(str, size, "%u", fence->seqno);
+}
+
+/* fence_ops.timeline_value_str: formats the current value of the fence's
+ * timeline into |str| (at most |size| bytes). */
+static void goldfish_sync_timeline_fence_timeline_value_str(
+ struct fence *fence,
+ char *str, int size)
+{
+ struct goldfish_sync_timeline *parent = fence_parent(fence);
+
+ snprintf(str, size, "%d", parent->value);
+}
+
+/* Fence operations for goldfish sync points. Waiting uses the generic
+ * fence_default_wait; everything else is implemented in this file. The
+ * address of this table is also how |goldfish_sync_fence_to_sync_pt|
+ * recognizes fences created here. */
+static const struct fence_ops goldfish_sync_timeline_fence_ops = {
+ .get_driver_name = goldfish_sync_timeline_fence_get_driver_name,
+ .get_timeline_name = goldfish_sync_timeline_fence_get_timeline_name,
+ .enable_signaling = goldfish_sync_timeline_fence_enable_signaling,
+ .disable_signaling = goldfish_sync_timeline_fence_disable_signaling,
+ .signaled = goldfish_sync_timeline_fence_signaled,
+ .wait = fence_default_wait,
+ .release = goldfish_sync_timeline_fence_release,
+ .fence_value_str = goldfish_sync_timeline_fence_value_str,
+ .timeline_value_str = goldfish_sync_timeline_fence_timeline_value_str,
+};
diff --git a/drivers/staging/goldfish/goldfish_sync_timeline_fence.h b/drivers/staging/goldfish/goldfish_sync_timeline_fence.h
new file mode 100644
index 000000000000..fc25924652c1
--- /dev/null
+++ b/drivers/staging/goldfish/goldfish_sync_timeline_fence.h
@@ -0,0 +1,58 @@
+#include <linux/sync_file.h>
+#include <linux/fence.h>
+
+/**
+ * struct sync_pt - sync_pt object
+ * @base: base fence object; its lock is the parent timeline's child list lock
+ * @child_list: sync timeline child's list
+ * @active_list: sync timeline active child's list
+ */
+struct sync_pt {
+ struct fence base;
+ struct list_head child_list;
+ struct list_head active_list;
+};
+
+/**
+ * goldfish_sync_timeline_create_internal() - creates a sync object
+ * @name: goldfish_sync_timeline name
+ *
+ * Creates a new goldfish_sync_timeline.
+ * Returns the goldfish_sync_timeline object or NULL in case of error.
+ */
+struct goldfish_sync_timeline
+*goldfish_sync_timeline_create_internal(const char *name);
+
+/**
+ * goldfish_sync_pt_create_internal() - creates a sync pt
+ * @obj: fence's parent goldfish_sync_timeline
+ * @size: size to allocate for this pt
+ * @value: value of the fence
+ *
+ * Creates a new sync_pt as a child of @obj. @size bytes will be
+ * allocated allowing for implementation specific data to be kept after
+ * the generic sync_pt struct. Returns the sync_pt object or
+ * NULL in case of error.
+ */
+struct sync_pt
+*goldfish_sync_pt_create_internal(struct goldfish_sync_timeline *obj,
+ int size, unsigned int value);
+
+/**
+ * goldfish_sync_timeline_signal_internal() -
+ * signal a status change on a sync_timeline
+ * @obj: goldfish_sync_timeline to signal
+ * @inc: num to increment on timeline->value
+ *
+ * A sync implementation should call this any time one of its fences
+ * has signaled or has an error condition.
+ */
+void goldfish_sync_timeline_signal_internal(struct goldfish_sync_timeline *obj,
+ unsigned int inc);
+
+/**
+ * goldfish_sync_timeline_put_internal() - dec refcount of a sync_timeline
+ * and clean up memory if it was the last ref.
+ * @obj: goldfish_sync_timeline to decref
+ */
+void goldfish_sync_timeline_put_internal(struct goldfish_sync_timeline *obj);
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index 53e6db8b0330..17c2ee2acb65 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -131,6 +131,9 @@ static void __uart_start(struct tty_struct *tty)
struct uart_state *state = tty->driver_data;
struct uart_port *port = state->uart_port;
+ if (port && port->ops->wake_peer)
+ port->ops->wake_peer(port);
+
if (port && !uart_tx_stopped(port))
port->ops->start_tx(port);
}
diff --git a/drivers/usb/gadget/Kconfig b/drivers/usb/gadget/Kconfig
index f3ee80ece682..f6cce5ac69c5 100644
--- a/drivers/usb/gadget/Kconfig
+++ b/drivers/usb/gadget/Kconfig
@@ -209,6 +209,18 @@ config USB_F_PRINTER
config USB_F_TCM
tristate
+config USB_F_MTP
+ tristate
+
+config USB_F_PTP
+ tristate
+
+config USB_F_AUDIO_SRC
+ tristate
+
+config USB_F_ACC
+ tristate
+
# this first set of drivers all depend on bulk-capable hardware.
config USB_CONFIGFS
@@ -362,6 +374,44 @@ config USB_CONFIGFS_F_FS
implemented in kernel space (for instance Ethernet, serial or
mass storage) and other are implemented in user space.
+config USB_CONFIGFS_F_MTP
+ boolean "MTP gadget"
+ depends on USB_CONFIGFS
+ select USB_F_MTP
+ help
+ USB gadget MTP support
+
+config USB_CONFIGFS_F_PTP
+ boolean "PTP gadget"
+ depends on USB_CONFIGFS && USB_CONFIGFS_F_MTP
+ select USB_F_PTP
+ help
+ USB gadget PTP support
+
+config USB_CONFIGFS_F_ACC
+ boolean "Accessory gadget"
+ depends on USB_CONFIGFS
+ select USB_F_ACC
+ help
+ USB gadget Accessory support
+
+config USB_CONFIGFS_F_AUDIO_SRC
+ boolean "Audio Source gadget"
+ depends on USB_CONFIGFS && USB_CONFIGFS_F_ACC
+ depends on SND
+ select SND_PCM
+ select USB_F_AUDIO_SRC
+ help
+ USB gadget Audio Source support
+
+config USB_CONFIGFS_UEVENT
+ boolean "Uevent notification of Gadget state"
+ depends on USB_CONFIGFS
+ help
+ Enable uevent notifications to userspace when the gadget
+ state changes. The gadget can be in any of the following
+ three states: "CONNECTED/DISCONNECTED/CONFIGURED"
+
config USB_CONFIGFS_F_UAC1
bool "Audio Class 1.0"
depends on USB_CONFIGFS
diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c
index 2c022a08f163..a8b4ca04cc50 100644
--- a/drivers/usb/gadget/composite.c
+++ b/drivers/usb/gadget/composite.c
@@ -1996,6 +1996,12 @@ void composite_disconnect(struct usb_gadget *gadget)
struct usb_composite_dev *cdev = get_gadget_data(gadget);
unsigned long flags;
+ if (cdev == NULL) {
+ WARN(1, "%s: Calling disconnect on a Gadget that is \
+ not connected\n", __func__);
+ return;
+ }
+
/* REVISIT: should we have config and device level
* disconnect callbacks?
*/
diff --git a/drivers/usb/gadget/configfs.c b/drivers/usb/gadget/configfs.c
index a5ca409dc97e..b1d22d8c9f7e 100644
--- a/drivers/usb/gadget/configfs.c
+++ b/drivers/usb/gadget/configfs.c
@@ -9,6 +9,31 @@
#include "u_f.h"
#include "u_os_desc.h"
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+#include <linux/platform_device.h>
+#include <linux/kdev_t.h>
+#include <linux/usb/ch9.h>
+
+#ifdef CONFIG_USB_CONFIGFS_F_ACC
+extern int acc_ctrlrequest(struct usb_composite_dev *cdev,
+ const struct usb_ctrlrequest *ctrl);
+void acc_disconnect(void);
+#endif
+static struct class *android_class;
+static struct device *android_device;
+static int index;
+
+/*
+ * Create a device called @name in android_class, parented to the shared
+ * android device. Each call consumes the next minor number from the
+ * file-scope counter. Returns the new device (or the error pointer from
+ * device_create()), or ERR_PTR(-EINVAL) when no usable parent exists.
+ */
+struct device *create_function_device(char *name)
+{
+	/* The parent is either unset or holds an error from a failed create. */
+	if (!android_device || IS_ERR(android_device))
+		return ERR_PTR(-EINVAL);
+
+	return device_create(android_class, android_device,
+			     MKDEV(0, index++), NULL, name);
+}
+EXPORT_SYMBOL_GPL(create_function_device);
+#endif
+
int check_user_usb_string(const char *name,
struct usb_gadget_strings *stringtab_dev)
{
@@ -60,6 +85,12 @@ struct gadget_info {
bool use_os_desc;
char b_vendor_code;
char qw_sign[OS_STRING_QW_SIGN_LEN];
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+ bool connected;
+ bool sw_connected;
+ struct work_struct work;
+ struct device *dev;
+#endif
};
static inline struct gadget_info *to_gadget_info(struct config_item *item)
@@ -265,7 +296,7 @@ static ssize_t gadget_dev_desc_UDC_store(struct config_item *item,
mutex_lock(&gi->lock);
- if (!strlen(name)) {
+ if (!strlen(name) || strcmp(name, "none") == 0) {
ret = unregister_gadget(gi);
if (ret)
goto err;
@@ -1369,6 +1400,60 @@ err_comp_cleanup:
return ret;
}
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+/*
+ * Work handler that reports the gadget state to userspace through
+ * KOBJ_CHANGE uevents on the android device.
+ *
+ * CONFIGURED is reported whenever a configuration is active; CONNECTED or
+ * DISCONNECTED is reported only when gi->connected differs from the last
+ * value that was reported (gi->sw_connected).
+ */
+static void android_work(struct work_struct *data)
+{
+	struct gadget_info *gi = container_of(data, struct gadget_info, work);
+	struct usb_composite_dev *cdev = &gi->cdev;
+	char *connected[2] = { "USB_STATE=CONNECTED", NULL };
+	char *configured[2] = { "USB_STATE=CONFIGURED", NULL };
+	char *disconnected[2] = { "USB_STATE=DISCONNECTED", NULL };
+	bool send_connected = false;
+	bool send_configured;
+	bool send_disconnected = false;
+	unsigned long flags;
+
+	/* Take a snapshot under the lock; the uevents are sent unlocked. */
+	spin_lock_irqsave(&cdev->lock, flags);
+	send_configured = cdev->config != NULL;
+	if (gi->connected != gi->sw_connected) {
+		send_connected = gi->connected;
+		send_disconnected = !gi->connected;
+		gi->sw_connected = gi->connected;
+	}
+	spin_unlock_irqrestore(&cdev->lock, flags);
+
+	if (send_connected) {
+		kobject_uevent_env(&android_device->kobj,
+					KOBJ_CHANGE, connected);
+		pr_info("%s: sent uevent %s\n", __func__, connected[0]);
+	}
+
+	if (send_configured) {
+		kobject_uevent_env(&android_device->kobj,
+					KOBJ_CHANGE, configured);
+		pr_info("%s: sent uevent %s\n", __func__, configured[0]);
+	}
+
+	if (send_disconnected) {
+		kobject_uevent_env(&android_device->kobj,
+					KOBJ_CHANGE, disconnected);
+		pr_info("%s: sent uevent %s\n", __func__, disconnected[0]);
+	}
+
+	/* Nothing was reported: log the state that led to that decision. */
+	if (!send_connected && !send_configured && !send_disconnected) {
+		pr_info("%s: did not send uevent (%d %d %p)\n", __func__,
+			gi->connected, gi->sw_connected, cdev->config);
+	}
+}
+#endif
+
static void configfs_composite_unbind(struct usb_gadget *gadget)
{
struct usb_composite_dev *cdev;
@@ -1388,14 +1473,91 @@ static void configfs_composite_unbind(struct usb_gadget *gadget)
set_gadget_data(gadget, NULL);
}
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+/*
+ * ep0 setup handler used when CONFIG_USB_CONFIGFS_UEVENT is enabled.
+ *
+ * The first setup request seen marks the gadget as connected and schedules
+ * the uevent work. The request is then offered, in order, to every function
+ * on gi->available_func that has a setup handler, to the accessory handler
+ * (when CONFIG_USB_CONFIGFS_F_ACC is set) and finally to composite_setup();
+ * each later stage only runs while the result so far is negative.
+ *
+ * Returns the value of the last handler that was tried.
+ */
+static int android_setup(struct usb_gadget *gadget,
+ const struct usb_ctrlrequest *c)
+{
+ struct usb_composite_dev *cdev = get_gadget_data(gadget);
+ unsigned long flags;
+ struct gadget_info *gi = container_of(cdev, struct gadget_info, cdev);
+ int value = -EOPNOTSUPP;
+ struct usb_function_instance *fi;
+
+ /* Any setup traffic means a host is attached: report CONNECTED once. */
+ spin_lock_irqsave(&cdev->lock, flags);
+ if (!gi->connected) {
+ gi->connected = 1;
+ schedule_work(&gi->work);
+ }
+ spin_unlock_irqrestore(&cdev->lock, flags);
+ /* Stop at the first function whose handler returns a non-negative value. */
+ list_for_each_entry(fi, &gi->available_func, cfs_list) {
+ if (fi != NULL && fi->f != NULL && fi->f->setup != NULL) {
+ value = fi->f->setup(fi->f, c);
+ if (value >= 0)
+ break;
+ }
+ }
+
+#ifdef CONFIG_USB_CONFIGFS_F_ACC
+ if (value < 0)
+ value = acc_ctrlrequest(cdev, c);
+#endif
+
+ if (value < 0)
+ value = composite_setup(gadget, c);
+
+ /* A SET_CONFIGURATION that left a config active triggers the uevent work. */
+ spin_lock_irqsave(&cdev->lock, flags);
+ if (c->bRequest == USB_REQ_SET_CONFIGURATION &&
+ cdev->config) {
+ schedule_work(&gi->work);
+ }
+ spin_unlock_irqrestore(&cdev->lock, flags);
+
+ return value;
+}
+
+/*
+ * Disconnect/reset handler used when CONFIG_USB_CONFIGFS_UEVENT is enabled:
+ * notifies the accessory function, records the disconnected state, schedules
+ * the uevent work and then runs the regular composite disconnect.
+ */
+static void android_disconnect(struct usb_gadget *gadget)
+{
+	struct usb_composite_dev *cdev = get_gadget_data(gadget);
+	struct gadget_info *gi;
+
+	/*
+	 * FIXME: There's a race between usb_gadget_udc_stop() which is likely
+	 * to set the gadget driver to NULL in the udc driver and this drivers
+	 * gadget disconnect fn which likely checks for the gadget driver to
+	 * be a null ptr. It happens that unbind (doing set_gadget_data(NULL))
+	 * is called before the gadget driver is set to NULL and the udc driver
+	 * calls disconnect fn which results in cdev being a null ptr.
+	 */
+	if (!cdev) {
+		WARN(1, "%s: gadget driver already disconnected\n", __func__);
+		return;
+	}
+
+	gi = container_of(cdev, struct gadget_info, cdev);
+
+#ifdef CONFIG_USB_CONFIGFS_F_ACC
+	/*
+	 * Accessory HID support can be active while the accessory function
+	 * is not actually enabled, so it has to be told about the disconnect.
+	 */
+	acc_disconnect();
+#endif
+	gi->connected = 0;
+	schedule_work(&gi->work);
+	composite_disconnect(gadget);
+}
+#endif
+
static const struct usb_gadget_driver configfs_driver_template = {
.bind = configfs_composite_bind,
.unbind = configfs_composite_unbind,
-
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+ .setup = android_setup,
+ .reset = android_disconnect,
+ .disconnect = android_disconnect,
+#else
.setup = composite_setup,
.reset = composite_disconnect,
.disconnect = composite_disconnect,
-
+#endif
.suspend = composite_suspend,
.resume = composite_resume,
@@ -1407,6 +1569,89 @@ static const struct usb_gadget_driver configfs_driver_template = {
.match_existing_only = 1,
};
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+/*
+ * sysfs "state" attribute: prints "CONFIGURED", "CONNECTED" or
+ * "DISCONNECTED" followed by a newline.
+ *
+ * "DISCONNECTED" is also reported when no gadget_info is attached to the
+ * device. The address of the embedded cdev member can never be NULL, so the
+ * former check on it was dead code and has been removed.
+ */
+static ssize_t state_show(struct device *pdev, struct device_attribute *attr,
+		char *buf)
+{
+	struct gadget_info *dev = dev_get_drvdata(pdev);
+	const char *state = "DISCONNECTED";
+
+	if (dev) {
+		struct usb_composite_dev *cdev = &dev->cdev;
+		unsigned long flags;
+
+		/* An active configuration takes precedence over "connected". */
+		spin_lock_irqsave(&cdev->lock, flags);
+		if (cdev->config)
+			state = "CONFIGURED";
+		else if (dev->connected)
+			state = "CONNECTED";
+		spin_unlock_irqrestore(&cdev->lock, flags);
+	}
+
+	return sprintf(buf, "%s\n", state);
+}
+
+static DEVICE_ATTR(state, S_IRUGO, state_show, NULL);
+
+static struct device_attribute *android_usb_attributes[] = {
+ &dev_attr_state,
+ NULL
+};
+
+/*
+ * Create the "android0" device in android_class for @gi, attach @gi as its
+ * driver data and create the sysfs attributes from android_usb_attributes.
+ *
+ * The global android_device is only updated once everything succeeded. On
+ * failure it keeps its previous value instead of being left pointing at an
+ * error pointer or at a device that was just destroyed, both of which other
+ * users of android_device would otherwise dereference.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int android_device_create(struct gadget_info *gi)
+{
+	struct device_attribute **attrs;
+	struct device_attribute *attr;
+	struct device *dev;
+
+	INIT_WORK(&gi->work, android_work);
+	dev = device_create(android_class, NULL,
+				MKDEV(0, 0), NULL, "android0");
+	if (IS_ERR(dev))
+		return PTR_ERR(dev);
+
+	dev_set_drvdata(dev, gi);
+
+	attrs = android_usb_attributes;
+	while ((attr = *attrs++)) {
+		int err;
+
+		err = device_create_file(dev, attr);
+		if (err) {
+			device_destroy(dev->class, dev->devt);
+			return err;
+		}
+	}
+
+	/* Publish the device only when it is fully set up. */
+	android_device = dev;
+	return 0;
+}
+
+/*
+ * Remove the sysfs attributes of the android device and destroy it.
+ *
+ * Does nothing when no valid device exists. The global pointer is cleared
+ * afterwards so that later users (e.g. create_function_device()) see "no
+ * device" rather than a pointer to freed memory.
+ */
+static void android_device_destroy(void)
+{
+	struct device_attribute **attrs;
+	struct device_attribute *attr;
+
+	if (IS_ERR_OR_NULL(android_device))
+		return;
+
+	attrs = android_usb_attributes;
+	while ((attr = *attrs++))
+		device_remove_file(android_device, attr);
+	device_destroy(android_device->class, android_device->devt);
+	android_device = NULL;
+}
+#else
+static inline int android_device_create(struct gadget_info *gi)
+{
+ return 0;
+}
+
+static inline void android_device_destroy(void)
+{
+}
+#endif
+
static struct config_group *gadgets_make(
struct config_group *group,
const char *name)
@@ -1458,7 +1703,11 @@ static struct config_group *gadgets_make(
if (!gi->composite.gadget_driver.function)
goto err;
+ if (android_device_create(gi) < 0)
+ goto err;
+
return &gi->group;
+
err:
kfree(gi);
return ERR_PTR(-ENOMEM);
@@ -1467,6 +1716,7 @@ err:
static void gadgets_drop(struct config_group *group, struct config_item *item)
{
config_item_put(item);
+ android_device_destroy();
}
static struct configfs_group_operations gadgets_ops = {
@@ -1506,6 +1756,13 @@ static int __init gadget_cfs_init(void)
config_group_init(&gadget_subsys.su_group);
ret = configfs_register_subsystem(&gadget_subsys);
+
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+	/* Subsystem registration failed: do not create a class that leaks. */
+	if (ret)
+		return ret;
+
+	android_class = class_create(THIS_MODULE, "android_usb");
+	if (IS_ERR(android_class)) {
+		/* Init is about to fail, so undo the registration done above. */
+		configfs_unregister_subsystem(&gadget_subsys);
+		return PTR_ERR(android_class);
+	}
+#endif
+
return ret;
}
module_init(gadget_cfs_init);
@@ -1513,5 +1770,10 @@ module_init(gadget_cfs_init);
static void __exit gadget_cfs_exit(void)
{
configfs_unregister_subsystem(&gadget_subsys);
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+ if (!IS_ERR(android_class))
+ class_destroy(android_class);
+#endif
+
}
module_exit(gadget_cfs_exit);
diff --git a/drivers/usb/gadget/function/Makefile b/drivers/usb/gadget/function/Makefile
index cb8c225e8549..78682d5e4dc7 100644
--- a/drivers/usb/gadget/function/Makefile
+++ b/drivers/usb/gadget/function/Makefile
@@ -46,3 +46,11 @@ usb_f_printer-y := f_printer.o
obj-$(CONFIG_USB_F_PRINTER) += usb_f_printer.o
usb_f_tcm-y := f_tcm.o
obj-$(CONFIG_USB_F_TCM) += usb_f_tcm.o
+usb_f_mtp-y := f_mtp.o
+obj-$(CONFIG_USB_F_MTP) += usb_f_mtp.o
+usb_f_ptp-y := f_ptp.o
+obj-$(CONFIG_USB_F_PTP) += usb_f_ptp.o
+usb_f_audio_source-y := f_audio_source.o
+obj-$(CONFIG_USB_F_AUDIO_SRC) += usb_f_audio_source.o
+usb_f_accessory-y := f_accessory.o
+obj-$(CONFIG_USB_F_ACC) += usb_f_accessory.o
diff --git a/drivers/usb/gadget/function/f_accessory.c b/drivers/usb/gadget/function/f_accessory.c
new file mode 100644
index 000000000000..7aa2656a2328
--- /dev/null
+++ b/drivers/usb/gadget/function/f_accessory.c
@@ -0,0 +1,1352 @@
+/*
+ * Gadget Function Driver for Android USB accessories
+ *
+ * Copyright (C) 2011 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+/* #define DEBUG */
+/* #define VERBOSE_DEBUG */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/delay.h>
+#include <linux/wait.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
+#include <linux/types.h>
+#include <linux/file.h>
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+
+#include <linux/hid.h>
+#include <linux/hiddev.h>
+#include <linux/usb.h>
+#include <linux/usb/ch9.h>
+#include <linux/usb/f_accessory.h>
+
+#include <linux/configfs.h>
+#include <linux/usb/composite.h>
+
+#define MAX_INST_NAME_LEN 40
+#define BULK_BUFFER_SIZE 16384
+#define ACC_STRING_SIZE 256
+
+#define PROTOCOL_VERSION 2
+
+/* String IDs */
+#define INTERFACE_STRING_INDEX 0
+
+/* number of tx and rx requests to allocate */
+#define TX_REQ_MAX 4
+#define RX_REQ_MAX 2
+
+struct acc_hid_dev {
+ struct list_head list;
+ struct hid_device *hid;
+ struct acc_dev *dev;
+ /* accessory defined ID */
+ int id;
+ /* HID report descriptor */
+ u8 *report_desc;
+ /* length of HID report descriptor */
+ int report_desc_len;
+ /* number of bytes of report_desc we have received so far */
+ int report_desc_offset;
+};
+
+/*
+ * State of the accessory function: endpoints, the strings received from
+ * the host, request bookkeeping for the device file, and the lists used to
+ * register and unregister accessory HID devices.
+ */
+struct acc_dev {
+	struct usb_function function;
+	struct usb_composite_dev *cdev;
+	spinlock_t lock;
+
+	struct usb_ep *ep_in;
+	struct usb_ep *ep_out;
+
+	/* online indicates state of function_set_alt & function_unbind
+	 * set to 1 when we connect
+	 *
+	 * Unsigned on purpose: a signed one-bit field can only hold 0 and
+	 * -1, so storing 1 would read back as -1.
+	 */
+	unsigned int online:1;
+
+	/* disconnected indicates state of open & release
+	 * Set to 1 when we disconnect.
+	 * Not cleared until our file is closed.
+	 */
+	unsigned int disconnected:1;
+
+	/* strings sent by the host */
+	char manufacturer[ACC_STRING_SIZE];
+	char model[ACC_STRING_SIZE];
+	char description[ACC_STRING_SIZE];
+	char version[ACC_STRING_SIZE];
+	char uri[ACC_STRING_SIZE];
+	char serial[ACC_STRING_SIZE];
+
+	/* for acc_complete_set_string */
+	int string_index;
+
+	/* set to 1 if we have a pending start request */
+	int start_requested;
+
+	int audio_mode;
+
+	/* synchronize access to our device file */
+	atomic_t open_excl;
+
+	/* idle tx requests, available to acc_write */
+	struct list_head tx_idle;
+
+	wait_queue_head_t read_wq;
+	wait_queue_head_t write_wq;
+	struct usb_request *rx_req[RX_REQ_MAX];
+	/* set by the OUT completion handler, cleared by acc_read */
+	int rx_done;
+
+	/* delayed work for handling ACCESSORY_START */
+	struct delayed_work start_work;
+
+	/* worker for registering and unregistering hid devices */
+	struct work_struct hid_work;
+
+	/* list of active HID devices */
+	struct list_head hid_list;
+
+	/* list of new HID devices to register */
+	struct list_head new_hid_list;
+
+	/* list of dead HID devices to unregister */
+	struct list_head dead_hid_list;
+};
+
+static struct usb_interface_descriptor acc_interface_desc = {
+ .bLength = USB_DT_INTERFACE_SIZE,
+ .bDescriptorType = USB_DT_INTERFACE,
+ .bInterfaceNumber = 0,
+ .bNumEndpoints = 2,
+ .bInterfaceClass = USB_CLASS_VENDOR_SPEC,
+ .bInterfaceSubClass = USB_SUBCLASS_VENDOR_SPEC,
+ .bInterfaceProtocol = 0,
+};
+
+static struct usb_endpoint_descriptor acc_highspeed_in_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_IN,
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+ .wMaxPacketSize = __constant_cpu_to_le16(512),
+};
+
+static struct usb_endpoint_descriptor acc_highspeed_out_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_OUT,
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+ .wMaxPacketSize = __constant_cpu_to_le16(512),
+};
+
+static struct usb_endpoint_descriptor acc_fullspeed_in_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_IN,
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+};
+
+static struct usb_endpoint_descriptor acc_fullspeed_out_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_OUT,
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+};
+
+static struct usb_descriptor_header *fs_acc_descs[] = {
+ (struct usb_descriptor_header *) &acc_interface_desc,
+ (struct usb_descriptor_header *) &acc_fullspeed_in_desc,
+ (struct usb_descriptor_header *) &acc_fullspeed_out_desc,
+ NULL,
+};
+
+static struct usb_descriptor_header *hs_acc_descs[] = {
+ (struct usb_descriptor_header *) &acc_interface_desc,
+ (struct usb_descriptor_header *) &acc_highspeed_in_desc,
+ (struct usb_descriptor_header *) &acc_highspeed_out_desc,
+ NULL,
+};
+
+static struct usb_string acc_string_defs[] = {
+ [INTERFACE_STRING_INDEX].s = "Android Accessory Interface",
+ { }, /* end of list */
+};
+
+static struct usb_gadget_strings acc_string_table = {
+ .language = 0x0409, /* en-US */
+ .strings = acc_string_defs,
+};
+
+static struct usb_gadget_strings *acc_strings[] = {
+ &acc_string_table,
+ NULL,
+};
+
+/* temporary variable used between acc_open() and acc_gadget_bind() */
+static struct acc_dev *_acc_dev;
+
+struct acc_instance {
+ struct usb_function_instance func_inst;
+ const char *name;
+};
+
+static inline struct acc_dev *func_to_dev(struct usb_function *f)
+{
+ return container_of(f, struct acc_dev, function);
+}
+
+/*
+ * Allocate a request for @ep together with a data buffer of @buffer_size
+ * bytes. Returns NULL when either allocation fails; a request whose buffer
+ * could not be allocated is released again before returning.
+ */
+static struct usb_request *acc_request_new(struct usb_ep *ep, int buffer_size)
+{
+	struct usb_request *req;
+
+	req = usb_ep_alloc_request(ep, GFP_KERNEL);
+	if (req) {
+		/* the request is only useful with a data buffer attached */
+		req->buf = kmalloc(buffer_size, GFP_KERNEL);
+		if (req->buf)
+			return req;
+		usb_ep_free_request(ep, req);
+	}
+
+	return NULL;
+}
+
+/* Free @req and its data buffer; a NULL @req is ignored. */
+static void acc_request_free(struct usb_request *req, struct usb_ep *ep)
+{
+	if (!req)
+		return;
+
+	kfree(req->buf);
+	usb_ep_free_request(ep, req);
+}
+
+/* add a request to the tail of a list */
+static void req_put(struct acc_dev *dev, struct list_head *head,
+ struct usb_request *req)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->lock, flags);
+ list_add_tail(&req->list, head);
+ spin_unlock_irqrestore(&dev->lock, flags);
+}
+
+/*
+ * Detach and return the first request on @head, or NULL when the list is
+ * empty. The list is accessed under dev->lock.
+ */
+static struct usb_request *req_get(struct acc_dev *dev, struct list_head *head)
+{
+	struct usb_request *first = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev->lock, flags);
+	if (!list_empty(head)) {
+		first = list_first_entry(head, struct usb_request, list);
+		list_del(&first->list);
+	}
+	spin_unlock_irqrestore(&dev->lock, flags);
+
+	return first;
+}
+
+static void acc_set_disconnected(struct acc_dev *dev)
+{
+ dev->disconnected = 1;
+}
+
+/*
+ * Completion handler for IN (tx) requests: returns the request to the idle
+ * list and wakes up writers waiting for a free request. A status of
+ * -ESHUTDOWN marks the device as disconnected.
+ */
+static void acc_complete_in(struct usb_ep *ep, struct usb_request *req)
+{
+	struct acc_dev *dev = _acc_dev;
+
+	if (req->status == -ESHUTDOWN) {
+		/* trailing newline added so the message ends its log line */
+		pr_debug("acc_complete_in set disconnected\n");
+		acc_set_disconnected(dev);
+	}
+
+	req_put(dev, &dev->tx_idle, req);
+
+	wake_up(&dev->write_wq);
+}
+
+/*
+ * Completion handler for OUT (rx) requests: flags the receive as done and
+ * wakes up the reader. A status of -ESHUTDOWN marks the device as
+ * disconnected.
+ */
+static void acc_complete_out(struct usb_ep *ep, struct usb_request *req)
+{
+	struct acc_dev *dev = _acc_dev;
+
+	dev->rx_done = 1;
+	if (req->status == -ESHUTDOWN) {
+		/* trailing newline added so the message ends its log line */
+		pr_debug("acc_complete_out set disconnected\n");
+		acc_set_disconnected(dev);
+	}
+
+	wake_up(&dev->read_wq);
+}
+
+/*
+ * Completion handler for the data stage of ACCESSORY_SEND_STRING: copies the
+ * received bytes into the string selected by dev->string_index, truncated to
+ * ACC_STRING_SIZE - 1 bytes and always NUL-terminated.
+ */
+static void acc_complete_set_string(struct usb_ep *ep, struct usb_request *req)
+{
+	struct acc_dev *dev = ep->driver_data;
+	int length = req->actual;
+	unsigned long flags;
+	char *string_dest;
+
+	if (req->status != 0) {
+		pr_err("acc_complete_set_string, err %d\n", req->status);
+		return;
+	}
+
+	switch (dev->string_index) {
+	case ACCESSORY_STRING_MANUFACTURER:
+		string_dest = dev->manufacturer;
+		break;
+	case ACCESSORY_STRING_MODEL:
+		string_dest = dev->model;
+		break;
+	case ACCESSORY_STRING_DESCRIPTION:
+		string_dest = dev->description;
+		break;
+	case ACCESSORY_STRING_VERSION:
+		string_dest = dev->version;
+		break;
+	case ACCESSORY_STRING_URI:
+		string_dest = dev->uri;
+		break;
+	case ACCESSORY_STRING_SERIAL:
+		string_dest = dev->serial;
+		break;
+	default:
+		pr_err("unknown accessory string index %d\n",
+			dev->string_index);
+		return;
+	}
+
+	/* leave room for the terminator */
+	if (length >= ACC_STRING_SIZE)
+		length = ACC_STRING_SIZE - 1;
+
+	spin_lock_irqsave(&dev->lock, flags);
+	memcpy(string_dest, req->buf, length);
+	/* ensure zero termination */
+	string_dest[length] = 0;
+	spin_unlock_irqrestore(&dev->lock, flags);
+}
+
+/*
+ * Completion handler for the data stage of ACCESSORY_SET_HID_REPORT_DESC.
+ *
+ * Appends the received bytes to the report descriptor of the HID device
+ * stored in req->context and advances its write offset. Once the offset
+ * reaches the announced descriptor length, the HID work is scheduled.
+ * Nothing is copied when the request completed with an error.
+ */
+static void acc_complete_set_hid_report_desc(struct usb_ep *ep,
+ struct usb_request *req)
+{
+ struct acc_hid_dev *hid = req->context;
+ struct acc_dev *dev = hid->dev;
+ int length = req->actual;
+
+ if (req->status != 0) {
+ pr_err("acc_complete_set_hid_report_desc, err %d\n",
+ req->status);
+ return;
+ }
+
+ /* NOTE(review): relies on the range check done when the request was set up */
+ memcpy(hid->report_desc + hid->report_desc_offset, req->buf, length);
+ hid->report_desc_offset += length;
+ if (hid->report_desc_offset == hid->report_desc_len) {
+ /* After we have received the entire report descriptor
+ * we schedule work to initialize the HID device
+ */
+ schedule_work(&dev->hid_work);
+ }
+}
+
+static void acc_complete_send_hid_event(struct usb_ep *ep,
+ struct usb_request *req)
+{
+ struct acc_hid_dev *hid = req->context;
+ int length = req->actual;
+
+ if (req->status != 0) {
+ pr_err("acc_complete_send_hid_event, err %d\n", req->status);
+ return;
+ }
+
+ hid_report_raw_event(hid->hid, HID_INPUT_REPORT, req->buf, length, 1);
+}
+
+/*
+ * hid_ll_driver .parse callback: hands the report descriptor received from
+ * the accessory to the HID core.
+ *
+ * Returns the result of hid_parse_report() so that a failure is reported to
+ * the caller instead of being silently turned into success.
+ */
+static int acc_hid_parse(struct hid_device *hid)
+{
+	struct acc_hid_dev *hdev = hid->driver_data;
+
+	return hid_parse_report(hid, hdev->report_desc,
+				hdev->report_desc_len);
+}
+
+static int acc_hid_start(struct hid_device *hid)
+{
+ return 0;
+}
+
+static void acc_hid_stop(struct hid_device *hid)
+{
+}
+
+static int acc_hid_open(struct hid_device *hid)
+{
+ return 0;
+}
+
+static void acc_hid_close(struct hid_device *hid)
+{
+}
+
+static int acc_hid_raw_request(struct hid_device *hid, unsigned char reportnum,
+ __u8 *buf, size_t len, unsigned char rtype, int reqtype)
+{
+ return 0;
+}
+
+static struct hid_ll_driver acc_hid_ll_driver = {
+ .parse = acc_hid_parse,
+ .start = acc_hid_start,
+ .stop = acc_hid_stop,
+ .open = acc_hid_open,
+ .close = acc_hid_close,
+ .raw_request = acc_hid_raw_request,
+};
+
+/*
+ * Allocate an accessory HID record for @id with a zeroed report descriptor
+ * buffer of @desc_len bytes. Both allocations use GFP_ATOMIC. Returns NULL
+ * when either allocation fails.
+ */
+static struct acc_hid_dev *acc_hid_new(struct acc_dev *dev,
+		int id, int desc_len)
+{
+	struct acc_hid_dev *entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+
+	if (!entry)
+		return NULL;
+
+	entry->report_desc = kzalloc(desc_len, GFP_ATOMIC);
+	if (entry->report_desc) {
+		entry->dev = dev;
+		entry->id = id;
+		entry->report_desc_len = desc_len;
+		return entry;
+	}
+
+	/* no descriptor buffer: drop the half-built record */
+	kfree(entry);
+	return NULL;
+}
+
+static struct acc_hid_dev *acc_hid_get(struct list_head *list, int id)
+{
+ struct acc_hid_dev *hid;
+
+ list_for_each_entry(hid, list, list) {
+ if (hid->id == id)
+ return hid;
+ }
+ return NULL;
+}
+
+/*
+ * Queue a new accessory HID device with identifier @id whose report
+ * descriptor will be @desc_length bytes long.
+ *
+ * An entry with the same id found on the active list (or, failing that, on
+ * the new list) is moved to the dead list first. The new entry is added to
+ * the new list and the HID work is scheduled.
+ *
+ * Returns 0 on success, -EINVAL for a non-positive @desc_length and -ENOMEM
+ * when the entry cannot be allocated.
+ */
+static int acc_register_hid(struct acc_dev *dev, int id, int desc_length)
+{
+ struct acc_hid_dev *hid;
+ unsigned long flags;
+
+ /* report descriptor length must be > 0 */
+ if (desc_length <= 0)
+ return -EINVAL;
+
+ spin_lock_irqsave(&dev->lock, flags);
+ /* replace HID if one already exists with this ID */
+ hid = acc_hid_get(&dev->hid_list, id);
+ if (!hid)
+ hid = acc_hid_get(&dev->new_hid_list, id);
+ if (hid)
+ list_move(&hid->list, &dev->dead_hid_list);
+
+ /* allocated while holding the spinlock */
+ hid = acc_hid_new(dev, id, desc_length);
+ if (!hid) {
+ spin_unlock_irqrestore(&dev->lock, flags);
+ return -ENOMEM;
+ }
+
+ list_add(&hid->list, &dev->new_hid_list);
+ spin_unlock_irqrestore(&dev->lock, flags);
+
+ /* schedule work to register the HID device */
+ schedule_work(&dev->hid_work);
+ return 0;
+}
+
+/*
+ * Move the HID device with identifier @id from the active list (or, failing
+ * that, the new list) to the dead list and schedule the HID work.
+ * Returns 0 on success or -EINVAL when no such device is known.
+ */
+static int acc_unregister_hid(struct acc_dev *dev, int id)
+{
+	struct acc_hid_dev *victim;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev->lock, flags);
+	victim = acc_hid_get(&dev->hid_list, id);
+	if (!victim)
+		victim = acc_hid_get(&dev->new_hid_list, id);
+	if (victim)
+		list_move(&victim->list, &dev->dead_hid_list);
+	spin_unlock_irqrestore(&dev->lock, flags);
+
+	if (!victim)
+		return -EINVAL;
+
+	schedule_work(&dev->hid_work);
+	return 0;
+}
+
+/*
+ * Claim one bulk IN and one bulk OUT endpoint matching @in_desc/@out_desc
+ * and allocate the TX_REQ_MAX tx and RX_REQ_MAX rx requests used by the
+ * device file.
+ *
+ * Returns 0 on success, -ENODEV when no suitable endpoint is available and
+ * -ENOMEM when a request cannot be allocated. On allocation failure every
+ * request allocated so far is freed and the rx_req[] slots are cleared so
+ * that no stale pointers are left behind.
+ */
+static int create_bulk_endpoints(struct acc_dev *dev,
+				struct usb_endpoint_descriptor *in_desc,
+				struct usb_endpoint_descriptor *out_desc)
+{
+	struct usb_composite_dev *cdev = dev->cdev;
+	struct usb_request *req;
+	struct usb_ep *ep;
+	int i;
+
+	DBG(cdev, "create_bulk_endpoints dev: %p\n", dev);
+
+	ep = usb_ep_autoconfig(cdev->gadget, in_desc);
+	if (!ep) {
+		DBG(cdev, "usb_ep_autoconfig for ep_in failed\n");
+		return -ENODEV;
+	}
+	DBG(cdev, "usb_ep_autoconfig for ep_in got %s\n", ep->name);
+	ep->driver_data = dev;		/* claim the endpoint */
+	dev->ep_in = ep;
+
+	ep = usb_ep_autoconfig(cdev->gadget, out_desc);
+	if (!ep) {
+		DBG(cdev, "usb_ep_autoconfig for ep_out failed\n");
+		return -ENODEV;
+	}
+	DBG(cdev, "usb_ep_autoconfig for ep_out got %s\n", ep->name);
+	ep->driver_data = dev;		/* claim the endpoint */
+	dev->ep_out = ep;
+
+	/* now allocate requests for our endpoints */
+	for (i = 0; i < TX_REQ_MAX; i++) {
+		req = acc_request_new(dev->ep_in, BULK_BUFFER_SIZE);
+		if (!req)
+			goto fail;
+		req->complete = acc_complete_in;
+		req_put(dev, &dev->tx_idle, req);
+	}
+	for (i = 0; i < RX_REQ_MAX; i++) {
+		req = acc_request_new(dev->ep_out, BULK_BUFFER_SIZE);
+		if (!req)
+			goto fail;
+		req->complete = acc_complete_out;
+		dev->rx_req[i] = req;
+	}
+
+	return 0;
+
+fail:
+	pr_err("acc_bind() could not allocate requests\n");
+	while ((req = req_get(dev, &dev->tx_idle)))
+		acc_request_free(req, dev->ep_in);
+	for (i = 0; i < RX_REQ_MAX; i++) {
+		acc_request_free(dev->rx_req[i], dev->ep_out);
+		/* do not keep a pointer to the request that was just freed */
+		dev->rx_req[i] = NULL;
+	}
+	return -ENOMEM;
+}
+
+/*
+ * read() handler for the accessory device file.
+ *
+ * Blocks until the function is online, queues rx_req[0] on the OUT endpoint
+ * for at most BULK_BUFFER_SIZE bytes and waits for it to complete. Zero
+ * length packets are skipped by queueing the request again.
+ *
+ * Returns the number of bytes copied to @buf, -ENODEV when the device is
+ * marked disconnected, -EIO when queueing fails or the function is no
+ * longer online, -EFAULT when the copy to user space fails, or the error
+ * from an interrupted wait.
+ */
+static ssize_t acc_read(struct file *fp, char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct acc_dev *dev = fp->private_data;
+ struct usb_request *req;
+ ssize_t r = count;
+ unsigned xfer;
+ int ret = 0;
+
+ pr_debug("acc_read(%zu)\n", count);
+
+ if (dev->disconnected) {
+ pr_debug("acc_read disconnected");
+ return -ENODEV;
+ }
+
+ /* a single request never carries more than one bulk buffer */
+ if (count > BULK_BUFFER_SIZE)
+ count = BULK_BUFFER_SIZE;
+
+ /* we will block until we're online */
+ pr_debug("acc_read: waiting for online\n");
+ ret = wait_event_interruptible(dev->read_wq, dev->online);
+ if (ret < 0) {
+ r = ret;
+ goto done;
+ }
+
+ if (dev->rx_done) {
+ /*
+ * rx_done is still set from an earlier request (see the failed
+ * dequeue below), so use that request's data instead of queueing.
+ */
+ req = dev->rx_req[0];
+ goto copy_data;
+ }
+
+requeue_req:
+ /* queue a request */
+ req = dev->rx_req[0];
+ req->length = count;
+ dev->rx_done = 0;
+ ret = usb_ep_queue(dev->ep_out, req, GFP_KERNEL);
+ if (ret < 0) {
+ r = -EIO;
+ goto done;
+ } else {
+ pr_debug("rx %p queue\n", req);
+ }
+
+ /* wait for a request to complete */
+ ret = wait_event_interruptible(dev->read_wq, dev->rx_done);
+ if (ret < 0) {
+ r = ret;
+ ret = usb_ep_dequeue(dev->ep_out, req);
+ if (ret != 0) {
+ /*
+ * Cancel failed. Data may already have been received;
+ * it will be retrieved by the next read.
+ */
+ pr_debug("acc_read: cancelling failed %d", ret);
+ }
+ goto done;
+ }
+
+copy_data:
+ dev->rx_done = 0;
+ if (dev->online) {
+ /* If we got a 0-len packet, throw it back and try again. */
+ if (req->actual == 0)
+ goto requeue_req;
+
+ pr_debug("rx %p %u\n", req, req->actual);
+ /* hand out no more than the caller asked for */
+ xfer = (req->actual < count) ? req->actual : count;
+ r = xfer;
+ if (copy_to_user(buf, req->buf, xfer))
+ r = -EFAULT;
+ } else
+ r = -EIO;
+
+done:
+ pr_debug("acc_read returning %zd\n", r);
+ return r;
+}
+
+/*
+ * write() handler for the accessory device file.
+ *
+ * Splits the user data into chunks of at most BULK_BUFFER_SIZE bytes, copies
+ * each chunk into an idle tx request and queues it on the IN endpoint. The
+ * last chunk requests a zero length packet when its size is a multiple of
+ * the endpoint's maxpacket size.
+ *
+ * Returns @count when everything was queued, -ENODEV when the device is not
+ * online or is marked disconnected on entry, -EIO when the function goes
+ * offline or queueing fails, -EFAULT when the copy from user space fails, or
+ * the result of the wait when no request could be obtained.
+ */
+static ssize_t acc_write(struct file *fp, const char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct acc_dev *dev = fp->private_data;
+ struct usb_request *req = 0;
+ ssize_t r = count;
+ unsigned xfer;
+ int ret;
+
+ pr_debug("acc_write(%zu)\n", count);
+
+ if (!dev->online || dev->disconnected) {
+ pr_debug("acc_write disconnected or not online");
+ return -ENODEV;
+ }
+
+ while (count > 0) {
+ if (!dev->online) {
+ pr_debug("acc_write dev->error\n");
+ r = -EIO;
+ break;
+ }
+
+ /* get an idle tx request to use */
+ req = 0;
+ ret = wait_event_interruptible(dev->write_wq,
+ ((req = req_get(dev, &dev->tx_idle)) || !dev->online));
+ if (!req) {
+ /*
+ * NOTE(review): when the wait ends because the function went
+ * offline, ret is 0 and the write returns 0 rather than an
+ * error - confirm this is intended.
+ */
+ r = ret;
+ break;
+ }
+
+ if (count > BULK_BUFFER_SIZE) {
+ xfer = BULK_BUFFER_SIZE;
+ /* ZLP, They will be more TX requests so not yet. */
+ req->zero = 0;
+ } else {
+ xfer = count;
+ /* If the data length is a multple of the
+ * maxpacket size then send a zero length packet(ZLP).
+ */
+ req->zero = ((xfer % dev->ep_in->maxpacket) == 0);
+ }
+ if (copy_from_user(req->buf, buf, xfer)) {
+ r = -EFAULT;
+ break;
+ }
+
+ req->length = xfer;
+ ret = usb_ep_queue(dev->ep_in, req, GFP_KERNEL);
+ if (ret < 0) {
+ pr_debug("acc_write: xfer error %d\n", ret);
+ r = -EIO;
+ break;
+ }
+
+ buf += xfer;
+ count -= xfer;
+
+ /* zero this so we don't try to free it on error exit */
+ req = 0;
+ }
+
+ /* a request taken but not queued goes back to the idle list */
+ if (req)
+ req_put(dev, &dev->tx_idle, req);
+
+ pr_debug("acc_write returning %zd\n", r);
+ return r;
+}
+
+/*
+ * ioctl() handler for the accessory device file.
+ *
+ * The two state queries return their value directly. The string queries
+ * copy the selected NUL-terminated string to the user buffer at @value and
+ * return the number of bytes copied, including the terminator. Unknown
+ * codes yield -EINVAL; a failed copy yields -EFAULT.
+ */
+static long acc_ioctl(struct file *fp, unsigned code, unsigned long value)
+{
+	struct acc_dev *dev = fp->private_data;
+	char *src;
+	int len;
+
+	switch (code) {
+	case ACCESSORY_IS_START_REQUESTED:
+		return dev->start_requested;
+	case ACCESSORY_GET_AUDIO_MODE:
+		return dev->audio_mode;
+	case ACCESSORY_GET_STRING_MANUFACTURER:
+		src = dev->manufacturer;
+		break;
+	case ACCESSORY_GET_STRING_MODEL:
+		src = dev->model;
+		break;
+	case ACCESSORY_GET_STRING_DESCRIPTION:
+		src = dev->description;
+		break;
+	case ACCESSORY_GET_STRING_VERSION:
+		src = dev->version;
+		break;
+	case ACCESSORY_GET_STRING_URI:
+		src = dev->uri;
+		break;
+	case ACCESSORY_GET_STRING_SERIAL:
+		src = dev->serial;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* the terminator is part of what is handed to user space */
+	len = strlen(src) + 1;
+	if (copy_to_user((void __user *)value, src, len))
+		return -EFAULT;
+
+	return len;
+}
+
+static int acc_open(struct inode *ip, struct file *fp)
+{
+ printk(KERN_INFO "acc_open\n");
+ if (atomic_xchg(&_acc_dev->open_excl, 1))
+ return -EBUSY;
+
+ _acc_dev->disconnected = 0;
+ fp->private_data = _acc_dev;
+ return 0;
+}
+
+static int acc_release(struct inode *ip, struct file *fp)
+{
+ printk(KERN_INFO "acc_release\n");
+
+ WARN_ON(!atomic_xchg(&_acc_dev->open_excl, 0));
+ /* indicate that we are disconnected
+ * still could be online so don't touch online flag
+ */
+ _acc_dev->disconnected = 1;
+ return 0;
+}
+
+/* file operations for /dev/usb_accessory */
+static const struct file_operations acc_fops = {
+ .owner = THIS_MODULE,
+ .read = acc_read,
+ .write = acc_write,
+ .unlocked_ioctl = acc_ioctl,
+ .open = acc_open,
+ .release = acc_release,
+};
+
+static int acc_hid_probe(struct hid_device *hdev,
+ const struct hid_device_id *id)
+{
+ int ret;
+
+ ret = hid_parse(hdev);
+ if (ret)
+ return ret;
+ return hid_hw_start(hdev, HID_CONNECT_DEFAULT);
+}
+
+static struct miscdevice acc_device = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "usb_accessory",
+ .fops = &acc_fops,
+};
+
+static const struct hid_device_id acc_hid_table[] = {
+ { HID_USB_DEVICE(HID_ANY_ID, HID_ANY_ID) },
+ { }
+};
+
+static struct hid_driver acc_hid_driver = {
+ .name = "USB accessory",
+ .id_table = acc_hid_table,
+ .probe = acc_hid_probe,
+};
+
+static void acc_complete_setup_noop(struct usb_ep *ep, struct usb_request *req)
+{
+ /*
+ * Default no-op function when nothing needs to be done for the
+ * setup request
+ */
+}
+
+/*
+ * Handle the vendor-specific control requests of the Android accessory
+ * protocol on ep0.
+ *
+ * OUT requests: ACCESSORY_START, ACCESSORY_SEND_STRING,
+ * ACCESSORY_SET_AUDIO_MODE, ACCESSORY_REGISTER_HID,
+ * ACCESSORY_UNREGISTER_HID, ACCESSORY_SET_HID_REPORT_DESC and
+ * ACCESSORY_SEND_HID_EVENT. IN requests: ACCESSORY_GET_PROTOCOL.
+ *
+ * Returns the result of queueing the ep0 response for a handled request,
+ * -EOPNOTSUPP for a request that is not recognised, -ENODEV when no
+ * accessory device exists and -EINVAL for invalid parameters.
+ */
+int acc_ctrlrequest(struct usb_composite_dev *cdev,
+				const struct usb_ctrlrequest *ctrl)
+{
+	struct acc_dev	*dev = _acc_dev;
+	int	value = -EOPNOTSUPP;
+	struct acc_hid_dev *hid;
+	int offset;
+	u8 b_requestType = ctrl->bRequestType;
+	u8 b_request = ctrl->bRequest;
+	u16	w_index = le16_to_cpu(ctrl->wIndex);
+	u16	w_value = le16_to_cpu(ctrl->wValue);
+	u16	w_length = le16_to_cpu(ctrl->wLength);
+	unsigned long flags;
+
+	/*
+	 * This function is exported and reads a file-scope pointer; bail
+	 * out rather than dereference it when no accessory device has been
+	 * set up (assumption: _acc_dev is assigned elsewhere in this file).
+	 */
+	if (!dev)
+		return -ENODEV;
+
+	if (b_requestType == (USB_DIR_OUT | USB_TYPE_VENDOR)) {
+		if (b_request == ACCESSORY_START) {
+			dev->start_requested = 1;
+			schedule_delayed_work(
+				&dev->start_work, msecs_to_jiffies(10));
+			value = 0;
+			cdev->req->complete = acc_complete_setup_noop;
+		} else if (b_request == ACCESSORY_SEND_STRING) {
+			dev->string_index = w_index;
+			cdev->gadget->ep0->driver_data = dev;
+			cdev->req->complete = acc_complete_set_string;
+			value = w_length;
+		} else if (b_request == ACCESSORY_SET_AUDIO_MODE &&
+				w_index == 0 && w_length == 0) {
+			dev->audio_mode = w_value;
+			cdev->req->complete = acc_complete_setup_noop;
+			value = 0;
+		} else if (b_request == ACCESSORY_REGISTER_HID) {
+			cdev->req->complete = acc_complete_setup_noop;
+			value = acc_register_hid(dev, w_value, w_index);
+		} else if (b_request == ACCESSORY_UNREGISTER_HID) {
+			cdev->req->complete = acc_complete_setup_noop;
+			value = acc_unregister_hid(dev, w_value);
+		} else if (b_request == ACCESSORY_SET_HID_REPORT_DESC) {
+			spin_lock_irqsave(&dev->lock, flags);
+			hid = acc_hid_get(&dev->new_hid_list, w_value);
+			spin_unlock_irqrestore(&dev->lock, flags);
+			if (!hid) {
+				value = -EINVAL;
+				goto err;
+			}
+			/* chunks must arrive in order and fit the descriptor */
+			offset = w_index;
+			if (offset != hid->report_desc_offset
+				|| offset + w_length > hid->report_desc_len) {
+				value = -EINVAL;
+				goto err;
+			}
+			cdev->req->context = hid;
+			cdev->req->complete = acc_complete_set_hid_report_desc;
+			value = w_length;
+		} else if (b_request == ACCESSORY_SEND_HID_EVENT) {
+			spin_lock_irqsave(&dev->lock, flags);
+			hid = acc_hid_get(&dev->hid_list, w_value);
+			spin_unlock_irqrestore(&dev->lock, flags);
+			if (!hid) {
+				value = -EINVAL;
+				goto err;
+			}
+			cdev->req->context = hid;
+			cdev->req->complete = acc_complete_send_hid_event;
+			value = w_length;
+		}
+	} else if (b_requestType == (USB_DIR_IN | USB_TYPE_VENDOR)) {
+		if (b_request == ACCESSORY_GET_PROTOCOL) {
+			/* values on the wire are little-endian */
+			*((__le16 *)cdev->req->buf) =
+				cpu_to_le16(PROTOCOL_VERSION);
+			value = sizeof(u16);
+			cdev->req->complete = acc_complete_setup_noop;
+			/* clear any string left over from a previous session */
+			memset(dev->manufacturer, 0, sizeof(dev->manufacturer));
+			memset(dev->model, 0, sizeof(dev->model));
+			memset(dev->description, 0, sizeof(dev->description));
+			memset(dev->version, 0, sizeof(dev->version));
+			memset(dev->uri, 0, sizeof(dev->uri));
+			memset(dev->serial, 0, sizeof(dev->serial));
+			dev->start_requested = 0;
+			dev->audio_mode = 0;
+		}
+	}
+
+	if (value >= 0) {
+		/*
+		 * The length comes from the host's wLength. Refuse anything
+		 * larger than the ep0 request buffer (USB_COMP_EP0_BUFSIZ,
+		 * see linux/usb/composite.h) so the data stage cannot
+		 * overrun cdev->req->buf.
+		 */
+		if (value > USB_COMP_EP0_BUFSIZ) {
+			value = -EINVAL;
+			goto err;
+		}
+		cdev->req->zero = 0;
+		cdev->req->length = value;
+		value = usb_ep_queue(cdev->gadget->ep0, cdev->req, GFP_ATOMIC);
+		if (value < 0)
+			ERROR(cdev, "%s setup response queue error\n",
+				__func__);
+	}
+
+err:
+	if (value == -EOPNOTSUPP)
+		VDBG(cdev,
+			"unknown class-specific control req "
+			"%02x.%02x v%04x i%04x l%u\n",
+			ctrl->bRequestType, ctrl->bRequest,
+			w_value, w_index, w_length);
+	return value;
+}
+EXPORT_SYMBOL_GPL(acc_ctrlrequest);
+
+/*
+ * Common bind path for the accessory function.
+ *
+ * When @configfs is true the interface string ID is allocated on first use
+ * and the composite device is recorded in the acc_dev. The HID driver is
+ * then registered, an interface ID is allocated and the bulk endpoints are
+ * created.
+ *
+ * Returns 0 on success or a negative errno. If a step fails after the HID
+ * driver was registered, the driver is unregistered again so that a failed
+ * bind does not leave it behind.
+ */
+static int
+__acc_function_bind(struct usb_configuration *c,
+			struct usb_function *f, bool configfs)
+{
+	struct usb_composite_dev *cdev = c->cdev;
+	struct acc_dev *dev = func_to_dev(f);
+	int id;
+	int ret;
+
+	DBG(cdev, "acc_function_bind dev: %p\n", dev);
+
+	if (configfs) {
+		if (acc_string_defs[INTERFACE_STRING_INDEX].id == 0) {
+			ret = usb_string_id(c->cdev);
+			if (ret < 0)
+				return ret;
+			acc_string_defs[INTERFACE_STRING_INDEX].id = ret;
+			acc_interface_desc.iInterface = ret;
+		}
+		dev->cdev = c->cdev;
+	}
+	ret = hid_register_driver(&acc_hid_driver);
+	if (ret)
+		return ret;
+
+	dev->start_requested = 0;
+
+	/* allocate interface ID(s) */
+	id = usb_interface_id(c, f);
+	if (id < 0) {
+		ret = id;
+		goto err_unregister_hid;
+	}
+	acc_interface_desc.bInterfaceNumber = id;
+
+	/* allocate endpoints */
+	ret = create_bulk_endpoints(dev, &acc_fullspeed_in_desc,
+			&acc_fullspeed_out_desc);
+	if (ret)
+		goto err_unregister_hid;
+
+	/* support high speed hardware */
+	if (gadget_is_dualspeed(c->cdev->gadget)) {
+		acc_highspeed_in_desc.bEndpointAddress =
+			acc_fullspeed_in_desc.bEndpointAddress;
+		acc_highspeed_out_desc.bEndpointAddress =
+			acc_fullspeed_out_desc.bEndpointAddress;
+	}
+
+	DBG(cdev, "%s speed %s: IN/%s, OUT/%s\n",
+			gadget_is_dualspeed(c->cdev->gadget) ? "dual" : "full",
+			f->name, dev->ep_in->name, dev->ep_out->name);
+	return 0;
+
+err_unregister_hid:
+	/* undo hid_register_driver(); unbind is not reached after a failed bind */
+	hid_unregister_driver(&acc_hid_driver);
+	return ret;
+}
+
+/* bind callback used by acc_alloc(): runs the common bind path in configfs mode */
+static int
+acc_function_bind_configfs(struct usb_configuration *c,
+			struct usb_function *f)
+{
+	const bool via_configfs = true;
+
+	return __acc_function_bind(c, f, via_configfs);
+}
+
+/*
+ * Move every entry of dev->hid_list and dev->new_hid_list onto
+ * dev->dead_hid_list under dev->lock, then schedule dev->hid_work so the
+ * worker can dispose of them. Does nothing when @dev is NULL.
+ */
+static void
+kill_all_hid_devices(struct acc_dev *dev)
+{
+	struct acc_hid_dev *cur, *next;
+	unsigned long irq_flags;
+
+	/* no accessory device, nothing to retire */
+	if (dev == NULL)
+		return;
+
+	spin_lock_irqsave(&dev->lock, irq_flags);
+	list_for_each_entry_safe(cur, next, &dev->hid_list, list)
+		list_move(&cur->list, &dev->dead_hid_list);
+	list_for_each_entry_safe(cur, next, &dev->new_hid_list, list)
+		list_move(&cur->list, &dev->dead_hid_list);
+	spin_unlock_irqrestore(&dev->lock, irq_flags);
+
+	schedule_work(&dev->hid_work);
+}
+
+/*
+ * HID part of unbind: unregisters acc_hid_driver (registered in the bind
+ * path) and queues all of @dev's HID devices for removal.
+ */
+static void
+acc_hid_unbind(struct acc_dev *dev)
+{
+ hid_unregister_driver(&acc_hid_driver);
+ kill_all_hid_devices(dev);
+}
+
+/*
+ * unbind callback installed by acc_alloc().
+ *
+ * Marks the device offline, wakes sleepers on the read and write wait
+ * queues, frees every request on the tx_idle list and all RX_REQ_MAX
+ * entries of rx_req[], then tears down the HID side.
+ */
+static void
+acc_function_unbind(struct usb_configuration *c, struct usb_function *f)
+{
+ struct acc_dev *dev = func_to_dev(f);
+ struct usb_request *req;
+ int i;
+
+ dev->online = 0; /* clear online flag */
+ wake_up(&dev->read_wq); /* unblock reads on closure */
+ wake_up(&dev->write_wq); /* likewise for writes */
+
+ while ((req = req_get(dev, &dev->tx_idle)))
+ acc_request_free(req, dev->ep_in);
+ /* NOTE(review): rx_req[] entries are not reset after being freed */
+ for (i = 0; i < RX_REQ_MAX; i++)
+ acc_request_free(dev->rx_req[i], dev->ep_out);
+
+ acc_hid_unbind(dev);
+}
+
+/*
+ * Work handler for acc_dev.start_work (see acc_setup()): emits a
+ * KOBJ_CHANGE uevent carrying ACCESSORY=START on the misc device's kobject.
+ */
+static void acc_start_work(struct work_struct *data)
+{
+ char *envp[2] = { "ACCESSORY=START", NULL };
+
+ kobject_uevent_env(&acc_device.this_device->kobj, KOBJ_CHANGE, envp);
+}
+
+/*
+ * Allocate a hid_device for @hdev, fill in its identity and low-level
+ * driver, and add it to the HID core.
+ *
+ * On success the new device is stored in hdev->hid and 0 is returned.
+ * On failure a negative errno is returned and hdev->hid is left untouched.
+ */
+static int acc_hid_init(struct acc_hid_dev *hdev)
+{
+	struct hid_device *hid_dev = hid_allocate_device();
+	int err;
+
+	if (IS_ERR(hid_dev))
+		return PTR_ERR(hid_dev);
+
+	hid_dev->ll_driver = &acc_hid_ll_driver;
+	hid_dev->dev.parent = acc_device.this_device;
+	hid_dev->bus = BUS_USB;
+	hid_dev->vendor = HID_ANY_ID;
+	hid_dev->product = HID_ANY_ID;
+	hid_dev->driver_data = hdev;
+
+	err = hid_add_device(hid_dev);
+	if (!err) {
+		hdev->hid = hid_dev;
+		return 0;
+	}
+
+	pr_err("can't add hid device: %d\n", err);
+	hid_destroy_device(hid_dev);
+	return err;
+}
+
+/*
+ * Free an acc_hid_dev together with its report descriptor buffer.
+ * Does not unlink @hid from any list and does not touch hid->hid.
+ */
+static void acc_hid_delete(struct acc_hid_dev *hid)
+{
+ kfree(hid->report_desc);
+ kfree(hid);
+}
+
+/*
+ * Work handler for acc_dev.hid_work.
+ *
+ * Under dev->lock it collects the new HID devices whose report descriptor
+ * is complete (report_desc_offset == report_desc_len) and takes over the
+ * whole dead_hid_list. With the lock dropped it registers the collected
+ * devices with the HID core and destroys the dead ones.
+ */
+static void acc_hid_work(struct work_struct *data)
+{
+	struct acc_dev *dev = _acc_dev;
+	struct acc_hid_dev *hid, *next;
+	LIST_HEAD(new_list);
+	LIST_HEAD(dead_list);
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev->lock, flags);
+
+	/* copy hids that are ready for initialization to new_list */
+	list_for_each_entry_safe(hid, next, &dev->new_hid_list, list) {
+		if (hid->report_desc_offset == hid->report_desc_len)
+			list_move(&hid->list, &new_list);
+	}
+
+	/* move all of dev->dead_hid_list to dead_list */
+	list_splice_init(&dev->dead_hid_list, &dead_list);
+
+	spin_unlock_irqrestore(&dev->lock, flags);
+
+	/* register new HID devices */
+	list_for_each_entry_safe(hid, next, &new_list, list) {
+		if (acc_hid_init(hid)) {
+			pr_err("can't add HID device %p\n", hid);
+			/*
+			 * Unlink before freeing: otherwise the neighbouring
+			 * entries keep pointing at freed memory and the next
+			 * list_move() would write into it.
+			 */
+			list_del(&hid->list);
+			acc_hid_delete(hid);
+		} else {
+			spin_lock_irqsave(&dev->lock, flags);
+			list_move(&hid->list, &dev->hid_list);
+			spin_unlock_irqrestore(&dev->lock, flags);
+		}
+	}
+
+	/* remove dead HID devices */
+	list_for_each_entry_safe(hid, next, &dead_list, list) {
+		list_del(&hid->list);
+		if (hid->hid)
+			hid_destroy_device(hid->hid);
+		acc_hid_delete(hid);
+	}
+}
+
+/*
+ * set_alt callback: configures and enables both bulk endpoints, marks the
+ * device online and wakes readers waiting for that.
+ *
+ * Returns 0 on success or the negative errno of the failing step. If a
+ * step for ep_out fails after ep_in was enabled, ep_in is disabled again
+ * so that no endpoint stays enabled on error.
+ */
+static int acc_function_set_alt(struct usb_function *f,
+		unsigned intf, unsigned alt)
+{
+	struct acc_dev *dev = func_to_dev(f);
+	struct usb_composite_dev *cdev = f->config->cdev;
+	int ret;
+
+	DBG(cdev, "acc_function_set_alt intf: %d alt: %d\n", intf, alt);
+
+	ret = config_ep_by_speed(cdev->gadget, f, dev->ep_in);
+	if (ret)
+		return ret;
+
+	ret = usb_ep_enable(dev->ep_in);
+	if (ret)
+		return ret;
+
+	ret = config_ep_by_speed(cdev->gadget, f, dev->ep_out);
+	if (ret)
+		goto err_disable_in;
+
+	ret = usb_ep_enable(dev->ep_out);
+	if (ret)
+		goto err_disable_in;
+
+	dev->online = 1;
+	dev->disconnected = 0; /* if online then not disconnected */
+
+	/* readers may be blocked waiting for us to go online */
+	wake_up(&dev->read_wq);
+	return 0;
+
+err_disable_in:
+	usb_ep_disable(dev->ep_in);
+	return ret;
+}
+
+/*
+ * disable callback: flags the device as disconnected and offline, disables
+ * both bulk endpoints and wakes anything sleeping on the read wait queue.
+ */
+static void acc_function_disable(struct usb_function *f)
+{
+ struct acc_dev *dev = func_to_dev(f);
+ struct usb_composite_dev *cdev = dev->cdev;
+
+ DBG(cdev, "acc_function_disable\n");
+ acc_set_disconnected(dev); /* this now only sets disconnected */
+ dev->online = 0; /* so now need to clear online flag here too */
+ usb_ep_disable(dev->ep_in);
+ usb_ep_disable(dev->ep_out);
+
+ /* readers may be blocked waiting for us to go online */
+ wake_up(&dev->read_wq);
+
+ VDBG(cdev, "%s disabled\n", dev->function.name);
+}
+
+/*
+ * Allocate and initialise the single acc_dev, publish it through the
+ * global _acc_dev pointer and register the misc device.
+ *
+ * Returns 0 on success, -ENOMEM if the allocation fails, or the error from
+ * misc_register(). On failure _acc_dev is reset to NULL so it never points
+ * at freed memory.
+ */
+static int acc_setup(void)
+{
+	struct acc_dev *dev;
+	int ret;
+
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+	if (!dev)
+		return -ENOMEM;
+
+	spin_lock_init(&dev->lock);
+	init_waitqueue_head(&dev->read_wq);
+	init_waitqueue_head(&dev->write_wq);
+	atomic_set(&dev->open_excl, 0);
+	INIT_LIST_HEAD(&dev->tx_idle);
+	INIT_LIST_HEAD(&dev->hid_list);
+	INIT_LIST_HEAD(&dev->new_hid_list);
+	INIT_LIST_HEAD(&dev->dead_hid_list);
+	INIT_DELAYED_WORK(&dev->start_work, acc_start_work);
+	INIT_WORK(&dev->hid_work, acc_hid_work);
+
+	/* _acc_dev must be set before calling usb_gadget_register_driver */
+	_acc_dev = dev;
+
+	ret = misc_register(&acc_device);
+	if (ret)
+		goto err;
+
+	return 0;
+
+err:
+	/* unpublish before freeing so the global never dangles */
+	_acc_dev = NULL;
+	kfree(dev);
+	pr_err("USB accessory gadget driver failed to initialize\n");
+	return ret;
+}
+
+/*
+ * Exported disconnect hook: queues every HID device of the global
+ * accessory device for removal. Safe to call when _acc_dev is NULL,
+ * because kill_all_hid_devices() checks for that.
+ */
+void acc_disconnect(void)
+{
+ /* unregister all HID devices if USB is disconnected */
+ kill_all_hid_devices(_acc_dev);
+}
+EXPORT_SYMBOL_GPL(acc_disconnect);
+
+/*
+ * Undo acc_setup(): deregister the misc device, free the global acc_dev
+ * and clear the _acc_dev pointer.
+ *
+ * NOTE(review): start_work and hid_work are not cancelled or flushed here;
+ * confirm that neither can still be pending when the device is freed.
+ */
+static void acc_cleanup(void)
+{
+ misc_deregister(&acc_device);
+ kfree(_acc_dev);
+ _acc_dev = NULL;
+}
+
+/* Map a configfs item to the acc_instance whose func_inst.group embeds it. */
+static struct acc_instance *to_acc_instance(struct config_item *item)
+{
+ return container_of(to_config_group(item), struct acc_instance,
+ func_inst.group);
+}
+
+/* configfs release callback: drops the reference on the function instance. */
+static void acc_attr_release(struct config_item *item)
+{
+ struct acc_instance *fi_acc = to_acc_instance(item);
+
+ usb_put_function_instance(&fi_acc->func_inst);
+}
+
+static struct configfs_item_operations acc_item_ops = {
+ .release = acc_attr_release,
+};
+
+static struct config_item_type acc_func_type = {
+ .ct_item_ops = &acc_item_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+/* Map a usb_function_instance to the acc_instance that embeds it. */
+static struct acc_instance *to_fi_acc(struct usb_function_instance *fi)
+{
+ return container_of(fi, struct acc_instance, func_inst);
+}
+
+/*
+ * set_inst_name callback: stores a heap copy of @name in the instance.
+ *
+ * Returns 0 on success, -ENAMETOOLONG when the name including its
+ * terminator exceeds MAX_INST_NAME_LEN, or -ENOMEM when the copy cannot be
+ * allocated.
+ */
+static int acc_set_inst_name(struct usb_function_instance *fi, const char *name)
+{
+	struct acc_instance *fi_acc;
+	int len_with_nul = strlen(name) + 1;
+	char *copy;
+
+	if (len_with_nul > MAX_INST_NAME_LEN)
+		return -ENAMETOOLONG;
+
+	copy = kstrndup(name, len_with_nul, GFP_KERNEL);
+	if (copy == NULL)
+		return -ENOMEM;
+
+	fi_acc = to_fi_acc(fi);
+	fi_acc->name = copy;
+	return 0;
+}
+
+/*
+ * free_func_inst callback: releases everything acc_alloc_inst() and
+ * acc_set_inst_name() allocated for this instance -- the name copy, the
+ * global accessory device (via acc_cleanup()) and the acc_instance itself.
+ */
+static void acc_free_inst(struct usb_function_instance *fi)
+{
+	struct acc_instance *fi_acc;
+
+	fi_acc = to_fi_acc(fi);
+	kfree(fi_acc->name);
+	acc_cleanup();
+	/* the instance was kzalloc()ed in acc_alloc_inst(); nothing else frees it */
+	kfree(fi_acc);
+}
+
+/*
+ * alloc_inst callback: allocates the acc_instance, wires up its name and
+ * free callbacks, creates the global accessory device through acc_setup()
+ * and initialises the instance's configfs group.
+ *
+ * Returns the embedded usb_function_instance, or an ERR_PTR() carrying
+ * -ENOMEM or the acc_setup() error.
+ */
+static struct usb_function_instance *acc_alloc_inst(void)
+{
+	struct acc_instance *fi_acc;
+	int err;
+
+	fi_acc = kzalloc(sizeof(*fi_acc), GFP_KERNEL);
+	if (!fi_acc)
+		return ERR_PTR(-ENOMEM);
+	fi_acc->func_inst.set_inst_name = acc_set_inst_name;
+	fi_acc->func_inst.free_func_inst = acc_free_inst;
+
+	err = acc_setup();
+	if (err) {
+		kfree(fi_acc);
+		pr_err("Error setting ACCESSORY\n");
+		return ERR_PTR(err);
+	}
+
+	config_group_init_type_name(&fi_acc->func_inst.group,
+					"", &acc_func_type);
+	return &fi_acc->func_inst;
+}
+
+/* free_func callback installed by acc_alloc(). */
+static void acc_free(struct usb_function *f)
+{
+/* NO-OP: no function specific resource allocation in acc_alloc */
+}
+
+/*
+ * setup callback installed by acc_alloc(): forwards the control request to
+ * acc_ctrlrequest() using the function's composite device. Returns -1 when
+ * the function has no configuration or composite device attached.
+ */
+int acc_ctrlrequest_configfs(struct usb_function *f,
+			const struct usb_ctrlrequest *ctrl)
+{
+	if (f->config == NULL || f->config->cdev == NULL)
+		return -1;
+
+	return acc_ctrlrequest(f->config->cdev, ctrl);
+}
+
+/*
+ * alloc_func callback: fills in the usb_function embedded in the global
+ * accessory device (name, strings, descriptors and callbacks) and returns
+ * it. @fi is not used.
+ */
+static struct usb_function *acc_alloc(struct usb_function_instance *fi)
+{
+	struct acc_dev *dev = _acc_dev;
+
+	pr_info("acc_alloc\n");
+
+	dev->function.name = "accessory";
+	/* was terminated with ',' (comma operator) instead of ';' */
+	dev->function.strings = acc_strings;
+	dev->function.fs_descriptors = fs_acc_descs;
+	dev->function.hs_descriptors = hs_acc_descs;
+	dev->function.bind = acc_function_bind_configfs;
+	dev->function.unbind = acc_function_unbind;
+	dev->function.set_alt = acc_function_set_alt;
+	dev->function.disable = acc_function_disable;
+	dev->function.free_func = acc_free;
+	dev->function.setup = acc_ctrlrequest_configfs;
+
+	return &dev->function;
+}
+DECLARE_USB_FUNCTION_INIT(accessory, acc_alloc_inst, acc_alloc);
+MODULE_LICENSE("GPL");
diff --git a/drivers/usb/gadget/function/f_audio_source.c b/drivers/usb/gadget/function/f_audio_source.c
new file mode 100644
index 000000000000..8124af33b738
--- /dev/null
+++ b/drivers/usb/gadget/function/f_audio_source.c
@@ -0,0 +1,1071 @@
+/*
+ * Gadget Function Driver for USB audio source device
+ *
+ * Copyright (C) 2012 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/device.h>
+#include <linux/usb/audio.h>
+#include <linux/wait.h>
+#include <linux/pm_qos.h>
+#include <sound/core.h>
+#include <sound/initval.h>
+#include <sound/pcm.h>
+
+#include <linux/usb.h>
+#include <linux/usb_usual.h>
+#include <linux/usb/ch9.h>
+#include <linux/configfs.h>
+#include <linux/usb/composite.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#define SAMPLE_RATE 44100
+#define FRAMES_PER_MSEC (SAMPLE_RATE / 1000)
+
+#define IN_EP_MAX_PACKET_SIZE 256
+
+/* Number of requests to allocate */
+#define IN_EP_REQ_COUNT 4
+
+#define AUDIO_AC_INTERFACE 0
+#define AUDIO_AS_INTERFACE 1
+#define AUDIO_NUM_INTERFACES 2
+#define MAX_INST_NAME_LEN 40
+
+/* B.3.1 Standard AC Interface Descriptor */
+static struct usb_interface_descriptor ac_interface_desc = {
+ .bLength = USB_DT_INTERFACE_SIZE,
+ .bDescriptorType = USB_DT_INTERFACE,
+ .bNumEndpoints = 0,
+ .bInterfaceClass = USB_CLASS_AUDIO,
+ .bInterfaceSubClass = USB_SUBCLASS_AUDIOCONTROL,
+};
+
+DECLARE_UAC_AC_HEADER_DESCRIPTOR(2);
+
+#define UAC_DT_AC_HEADER_LENGTH UAC_DT_AC_HEADER_SIZE(AUDIO_NUM_INTERFACES)
+/* 1 input terminal, 1 output terminal and 1 feature unit */
+#define UAC_DT_TOTAL_LENGTH (UAC_DT_AC_HEADER_LENGTH \
+ + UAC_DT_INPUT_TERMINAL_SIZE + UAC_DT_OUTPUT_TERMINAL_SIZE \
+ + UAC_DT_FEATURE_UNIT_SIZE(0))
+/* B.3.2 Class-Specific AC Interface Descriptor */
+static struct uac1_ac_header_descriptor_2 ac_header_desc = {
+ .bLength = UAC_DT_AC_HEADER_LENGTH,
+ .bDescriptorType = USB_DT_CS_INTERFACE,
+ .bDescriptorSubtype = UAC_HEADER,
+ .bcdADC = __constant_cpu_to_le16(0x0100),
+ .wTotalLength = __constant_cpu_to_le16(UAC_DT_TOTAL_LENGTH),
+ .bInCollection = AUDIO_NUM_INTERFACES,
+ .baInterfaceNr = {
+ [0] = AUDIO_AC_INTERFACE,
+ [1] = AUDIO_AS_INTERFACE,
+ }
+};
+
+#define INPUT_TERMINAL_ID 1
+static struct uac_input_terminal_descriptor input_terminal_desc = {
+ .bLength = UAC_DT_INPUT_TERMINAL_SIZE,
+ .bDescriptorType = USB_DT_CS_INTERFACE,
+ .bDescriptorSubtype = UAC_INPUT_TERMINAL,
+ .bTerminalID = INPUT_TERMINAL_ID,
+ .wTerminalType = UAC_INPUT_TERMINAL_MICROPHONE,
+ .bAssocTerminal = 0,
+ .wChannelConfig = 0x3,
+};
+
+DECLARE_UAC_FEATURE_UNIT_DESCRIPTOR(0);
+
+#define FEATURE_UNIT_ID 2
+static struct uac_feature_unit_descriptor_0 feature_unit_desc = {
+ .bLength = UAC_DT_FEATURE_UNIT_SIZE(0),
+ .bDescriptorType = USB_DT_CS_INTERFACE,
+ .bDescriptorSubtype = UAC_FEATURE_UNIT,
+ .bUnitID = FEATURE_UNIT_ID,
+ .bSourceID = INPUT_TERMINAL_ID,
+ .bControlSize = 2,
+};
+
+#define OUTPUT_TERMINAL_ID 3
+static struct uac1_output_terminal_descriptor output_terminal_desc = {
+ .bLength = UAC_DT_OUTPUT_TERMINAL_SIZE,
+ .bDescriptorType = USB_DT_CS_INTERFACE,
+ .bDescriptorSubtype = UAC_OUTPUT_TERMINAL,
+ .bTerminalID = OUTPUT_TERMINAL_ID,
+ .wTerminalType = UAC_TERMINAL_STREAMING,
+ .bAssocTerminal = FEATURE_UNIT_ID,
+ .bSourceID = FEATURE_UNIT_ID,
+};
+
+/* B.4.1 Standard AS Interface Descriptor */
+static struct usb_interface_descriptor as_interface_alt_0_desc = {
+ .bLength = USB_DT_INTERFACE_SIZE,
+ .bDescriptorType = USB_DT_INTERFACE,
+ .bAlternateSetting = 0,
+ .bNumEndpoints = 0,
+ .bInterfaceClass = USB_CLASS_AUDIO,
+ .bInterfaceSubClass = USB_SUBCLASS_AUDIOSTREAMING,
+};
+
+static struct usb_interface_descriptor as_interface_alt_1_desc = {
+ .bLength = USB_DT_INTERFACE_SIZE,
+ .bDescriptorType = USB_DT_INTERFACE,
+ .bAlternateSetting = 1,
+ .bNumEndpoints = 1,
+ .bInterfaceClass = USB_CLASS_AUDIO,
+ .bInterfaceSubClass = USB_SUBCLASS_AUDIOSTREAMING,
+};
+
+/* B.4.2 Class-Specific AS Interface Descriptor */
+static struct uac1_as_header_descriptor as_header_desc = {
+ .bLength = UAC_DT_AS_HEADER_SIZE,
+ .bDescriptorType = USB_DT_CS_INTERFACE,
+ .bDescriptorSubtype = UAC_AS_GENERAL,
+ .bTerminalLink = INPUT_TERMINAL_ID,
+ .bDelay = 1,
+ .wFormatTag = UAC_FORMAT_TYPE_I_PCM,
+};
+
+DECLARE_UAC_FORMAT_TYPE_I_DISCRETE_DESC(1);
+
+static struct uac_format_type_i_discrete_descriptor_1 as_type_i_desc = {
+ .bLength = UAC_FORMAT_TYPE_I_DISCRETE_DESC_SIZE(1),
+ .bDescriptorType = USB_DT_CS_INTERFACE,
+ .bDescriptorSubtype = UAC_FORMAT_TYPE,
+ .bFormatType = UAC_FORMAT_TYPE_I,
+ .bSubframeSize = 2,
+ .bBitResolution = 16,
+ .bSamFreqType = 1,
+};
+
+/* Standard ISO IN Endpoint Descriptor for highspeed */
+static struct usb_endpoint_descriptor hs_as_in_ep_desc = {
+ .bLength = USB_DT_ENDPOINT_AUDIO_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_IN,
+ .bmAttributes = USB_ENDPOINT_SYNC_SYNC
+ | USB_ENDPOINT_XFER_ISOC,
+ .wMaxPacketSize = __constant_cpu_to_le16(IN_EP_MAX_PACKET_SIZE),
+ .bInterval = 4, /* poll 1 per millisecond */
+};
+
+/* Standard ISO IN Endpoint Descriptor for fullspeed */
+static struct usb_endpoint_descriptor fs_as_in_ep_desc = {
+ .bLength = USB_DT_ENDPOINT_AUDIO_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_IN,
+ .bmAttributes = USB_ENDPOINT_SYNC_SYNC
+ | USB_ENDPOINT_XFER_ISOC,
+ .wMaxPacketSize = __constant_cpu_to_le16(IN_EP_MAX_PACKET_SIZE),
+ .bInterval = 1, /* poll 1 per millisecond */
+};
+
+/* Class-specific AS ISO IN Endpoint Descriptor */
+static struct uac_iso_endpoint_descriptor as_iso_in_desc = {
+ .bLength = UAC_ISO_ENDPOINT_DESC_SIZE,
+ .bDescriptorType = USB_DT_CS_ENDPOINT,
+ .bDescriptorSubtype = UAC_EP_GENERAL,
+ .bmAttributes = 1,
+ .bLockDelayUnits = 1,
+ .wLockDelay = __constant_cpu_to_le16(1),
+};
+
+static struct usb_descriptor_header *hs_audio_desc[] = {
+ (struct usb_descriptor_header *)&ac_interface_desc,
+ (struct usb_descriptor_header *)&ac_header_desc,
+
+ (struct usb_descriptor_header *)&input_terminal_desc,
+ (struct usb_descriptor_header *)&output_terminal_desc,
+ (struct usb_descriptor_header *)&feature_unit_desc,
+
+ (struct usb_descriptor_header *)&as_interface_alt_0_desc,
+ (struct usb_descriptor_header *)&as_interface_alt_1_desc,
+ (struct usb_descriptor_header *)&as_header_desc,
+
+ (struct usb_descriptor_header *)&as_type_i_desc,
+
+ (struct usb_descriptor_header *)&hs_as_in_ep_desc,
+ (struct usb_descriptor_header *)&as_iso_in_desc,
+ NULL,
+};
+
+static struct usb_descriptor_header *fs_audio_desc[] = {
+ (struct usb_descriptor_header *)&ac_interface_desc,
+ (struct usb_descriptor_header *)&ac_header_desc,
+
+ (struct usb_descriptor_header *)&input_terminal_desc,
+ (struct usb_descriptor_header *)&output_terminal_desc,
+ (struct usb_descriptor_header *)&feature_unit_desc,
+
+ (struct usb_descriptor_header *)&as_interface_alt_0_desc,
+ (struct usb_descriptor_header *)&as_interface_alt_1_desc,
+ (struct usb_descriptor_header *)&as_header_desc,
+
+ (struct usb_descriptor_header *)&as_type_i_desc,
+
+ (struct usb_descriptor_header *)&fs_as_in_ep_desc,
+ (struct usb_descriptor_header *)&as_iso_in_desc,
+ NULL,
+};
+
+static struct snd_pcm_hardware audio_hw_info = {
+ .info = SNDRV_PCM_INFO_MMAP |
+ SNDRV_PCM_INFO_MMAP_VALID |
+ SNDRV_PCM_INFO_BATCH |
+ SNDRV_PCM_INFO_INTERLEAVED |
+ SNDRV_PCM_INFO_BLOCK_TRANSFER,
+
+ .formats = SNDRV_PCM_FMTBIT_S16_LE,
+ .channels_min = 2,
+ .channels_max = 2,
+ .rate_min = SAMPLE_RATE,
+ .rate_max = SAMPLE_RATE,
+
+ .buffer_bytes_max = 1024 * 1024,
+ .period_bytes_min = 64,
+ .period_bytes_max = 512 * 1024,
+ .periods_min = 2,
+ .periods_max = 1024,
+};
+
+/*-------------------------------------------------------------------------*/
+
+struct audio_source_config {
+ int card;
+ int device;
+};
+
+/* Per-function state of the USB audio source gadget. */
+struct audio_dev {
+ /* embedded USB function; func_to_audio() maps it back to this struct */
+ struct usb_function func;
+ struct snd_card *card;
+ struct snd_pcm *pcm;
+ /* set in audio_pcm_open(), cleared in audio_pcm_close() */
+ struct snd_pcm_substream *substream;
+
+ /* requests not currently queued on in_ep; guarded by lock */
+ struct list_head idle_reqs;
+ /* isochronous IN endpoint claimed in audio_bind() */
+ struct usb_ep *in_ep;
+
+ spinlock_t lock;
+
+ /* beginning, end and current position in our buffer */
+ void *buffer_start;
+ void *buffer_end;
+ void *buffer_pos;
+
+ /* byte size of a "period" */
+ unsigned int period;
+ /* bytes sent since last call to snd_pcm_period_elapsed */
+ unsigned int period_offset;
+ /* time we started playing */
+ ktime_t start_time;
+ /* number of frames sent since start_time */
+ s64 frames_sent;
+ struct audio_source_config *config;
+ /* for creating and issuing QoS requests */
+ struct pm_qos_request pm_qos;
+};
+
+/* Map a usb_function to the audio_dev that embeds it as its func member. */
+static inline struct audio_dev *func_to_audio(struct usb_function *f)
+{
+ return container_of(f, struct audio_dev, func);
+}
+
+/*-------------------------------------------------------------------------*/
+
+struct audio_source_instance {
+ struct usb_function_instance func_inst;
+ const char *name;
+ struct audio_source_config *config;
+ struct device *audio_device;
+};
+
+static void audio_source_attr_release(struct config_item *item);
+
+static struct configfs_item_operations audio_source_item_ops = {
+ .release = audio_source_attr_release,
+};
+
+static struct config_item_type audio_source_func_type = {
+ .ct_item_ops = &audio_source_item_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+static ssize_t audio_source_pcm_show(struct device *dev,
+ struct device_attribute *attr, char *buf);
+
+static DEVICE_ATTR(pcm, S_IRUGO, audio_source_pcm_show, NULL);
+
+static struct device_attribute *audio_source_function_attributes[] = {
+ &dev_attr_pcm,
+ NULL
+};
+
+/*--------------------------------------------------------------------------*/
+
+/*
+ * Allocate a usb_request on @ep together with a data buffer of
+ * @buffer_size bytes; req->length is preset to @buffer_size.
+ * Returns NULL if either allocation fails.
+ */
+static struct usb_request *audio_request_new(struct usb_ep *ep, int buffer_size)
+{
+	struct usb_request *request;
+
+	request = usb_ep_alloc_request(ep, GFP_KERNEL);
+	if (request == NULL)
+		return NULL;
+
+	request->buf = kmalloc(buffer_size, GFP_KERNEL);
+	if (request->buf != NULL) {
+		request->length = buffer_size;
+		return request;
+	}
+
+	/* buffer allocation failed: give the request back */
+	usb_ep_free_request(ep, request);
+	return NULL;
+}
+
+/* Release a request from audio_request_new() and its buffer; NULL is a no-op. */
+static void audio_request_free(struct usb_request *req, struct usb_ep *ep)
+{
+	if (!req)
+		return;
+
+	kfree(req->buf);
+	usb_ep_free_request(ep, req);
+}
+
+/* Append @req to the tail of the idle request list, under audio->lock. */
+static void audio_req_put(struct audio_dev *audio, struct usb_request *req)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&audio->lock, flags);
+ list_add_tail(&req->list, &audio->idle_reqs);
+ spin_unlock_irqrestore(&audio->lock, flags);
+}
+
+/*
+ * Detach and return the request at the head of the idle list, under
+ * audio->lock. Returns NULL when the list is empty.
+ */
+static struct usb_request *audio_req_get(struct audio_dev *audio)
+{
+	struct usb_request *head = NULL;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&audio->lock, irq_flags);
+	if (!list_empty(&audio->idle_reqs)) {
+		head = list_first_entry(&audio->idle_reqs,
+				struct usb_request, list);
+		list_del(&head->list);
+	}
+	spin_unlock_irqrestore(&audio->lock, irq_flags);
+
+	return head;
+}
+
+/*
+ * send the appropriate number of packets to match our bitrate
+ *
+ * The target frame count is derived from the milliseconds elapsed since
+ * audio->start_time at SAMPLE_RATE, minus audio->frames_sent. Data is
+ * copied out of the PCM buffer [buffer_start, buffer_end) starting at
+ * buffer_pos, wrapping at the end, in packets of at most
+ * IN_EP_MAX_PACKET_SIZE bytes. The loop stops early when no idle request
+ * is available or usb_ep_queue() fails.
+ */
+static void audio_send(struct audio_dev *audio)
+{
+ struct snd_pcm_runtime *runtime;
+ struct usb_request *req;
+ int length, length1, length2, ret;
+ s64 msecs;
+ s64 frames;
+ ktime_t now;
+
+ /* audio->substream will be null if we have been closed */
+ if (!audio->substream)
+ return;
+ /* audio->buffer_pos will be null if we have been stopped */
+ if (!audio->buffer_pos)
+ return;
+
+ runtime = audio->substream->runtime;
+
+ /* compute number of frames to send */
+ now = ktime_get();
+ msecs = div_s64((ktime_to_ns(now) - ktime_to_ns(audio->start_time)),
+ 1000000);
+ frames = div_s64((msecs * SAMPLE_RATE), 1000);
+
+ /* Readjust our frames_sent if we fall too far behind.
+ * If we get too far behind it is better to drop some frames than
+ * to keep sending data too fast in an attempt to catch up.
+ */
+ if (frames - audio->frames_sent > 10 * FRAMES_PER_MSEC)
+ audio->frames_sent = frames - FRAMES_PER_MSEC;
+
+ frames -= audio->frames_sent;
+
+ /* We need to send something to keep the pipeline going */
+ if (frames <= 0)
+ frames = FRAMES_PER_MSEC;
+
+ while (frames > 0) {
+ req = audio_req_get(audio);
+ if (!req)
+ break;
+
+ /* one packet's worth, capped at the endpoint's packet size */
+ length = frames_to_bytes(runtime, frames);
+ if (length > IN_EP_MAX_PACKET_SIZE)
+ length = IN_EP_MAX_PACKET_SIZE;
+
+ /* length1: bytes available before the end of the buffer */
+ if (audio->buffer_pos + length > audio->buffer_end)
+ length1 = audio->buffer_end - audio->buffer_pos;
+ else
+ length1 = length;
+ memcpy(req->buf, audio->buffer_pos, length1);
+ if (length1 < length) {
+ /* Wrap around and copy remaining length
+ * at beginning of buffer.
+ */
+ length2 = length - length1;
+ memcpy(req->buf + length1, audio->buffer_start,
+ length2);
+ audio->buffer_pos = audio->buffer_start + length2;
+ } else {
+ audio->buffer_pos += length1;
+ if (audio->buffer_pos >= audio->buffer_end)
+ audio->buffer_pos = audio->buffer_start;
+ }
+
+ req->length = length;
+ ret = usb_ep_queue(audio->in_ep, req, GFP_ATOMIC);
+ if (ret < 0) {
+ pr_err("usb_ep_queue failed ret: %d\n", ret);
+ /* return the unused request to the idle list */
+ audio_req_put(audio, req);
+ break;
+ }
+
+ frames -= bytes_to_frames(runtime, length);
+ audio->frames_sent += bytes_to_frames(runtime, length);
+ }
+}
+
+/* Completion callback set on the ep0 request by audio_setup(). */
+static void audio_control_complete(struct usb_ep *ep, struct usb_request *req)
+{
+ /* nothing to do here */
+}
+
+/*
+ * Completion callback for the IN data requests (installed in audio_bind()).
+ *
+ * Returns the request to the idle list. If playback has a buffer and the
+ * request completed without error, it accounts the transferred bytes
+ * towards the current period, signals an elapsed period when the period
+ * size is reached, and queues more data via audio_send().
+ */
+static void audio_data_complete(struct usb_ep *ep, struct usb_request *req)
+{
+ struct audio_dev *audio = req->context;
+
+ pr_debug("audio_data_complete req->status %d req->actual %d\n",
+ req->status, req->actual);
+
+ audio_req_put(audio, req);
+
+ /* buffer_start is cleared by audio_pcm_playback_stop() */
+ if (!audio->buffer_start || req->status)
+ return;
+
+ audio->period_offset += req->actual;
+ if (audio->period_offset >= audio->period) {
+ snd_pcm_period_elapsed(audio->substream);
+ audio->period_offset = 0;
+ }
+ audio_send(audio);
+}
+
+/*
+ * Handle a class-specific OUT request addressed to an endpoint.
+ *
+ * For UAC_SET_CUR/MIN/MAX/RES it returns wLength, so the caller queues a
+ * data stage of that size; the received data is not interpreted here.
+ * Any other request yields -EOPNOTSUPP.
+ */
+static int audio_set_endpoint_req(struct usb_function *f,
+ const struct usb_ctrlrequest *ctrl)
+{
+ int value = -EOPNOTSUPP;
+ u16 ep = le16_to_cpu(ctrl->wIndex);
+ u16 len = le16_to_cpu(ctrl->wLength);
+ u16 w_value = le16_to_cpu(ctrl->wValue);
+
+ pr_debug("bRequest 0x%x, w_value 0x%04x, len %d, endpoint %d\n",
+ ctrl->bRequest, w_value, len, ep);
+
+ switch (ctrl->bRequest) {
+ case UAC_SET_CUR:
+ case UAC_SET_MIN:
+ case UAC_SET_MAX:
+ case UAC_SET_RES:
+ value = len;
+ break;
+ default:
+ break;
+ }
+
+ return value;
+}
+
+/*
+ * Handle a class-specific IN request addressed to an endpoint.
+ *
+ * When wValue selects the sample-rate control, UAC_GET_CUR/MIN/MAX/RES all
+ * answer with SAMPLE_RATE as three bytes, least significant first, written
+ * into the composite device's ep0 buffer. Returns the response length (3)
+ * or -EOPNOTSUPP for anything else.
+ */
+static int audio_get_endpoint_req(struct usb_function *f,
+ const struct usb_ctrlrequest *ctrl)
+{
+ struct usb_composite_dev *cdev = f->config->cdev;
+ int value = -EOPNOTSUPP;
+ u8 ep = ((le16_to_cpu(ctrl->wIndex) >> 8) & 0xFF);
+ u16 len = le16_to_cpu(ctrl->wLength);
+ u16 w_value = le16_to_cpu(ctrl->wValue);
+ u8 *buf = cdev->req->buf;
+
+ pr_debug("bRequest 0x%x, w_value 0x%04x, len %d, endpoint %d\n",
+ ctrl->bRequest, w_value, len, ep);
+
+ if (w_value == UAC_EP_CS_ATTR_SAMPLE_RATE << 8) {
+ switch (ctrl->bRequest) {
+ case UAC_GET_CUR:
+ case UAC_GET_MIN:
+ case UAC_GET_MAX:
+ case UAC_GET_RES:
+ /* return our sample rate */
+ buf[0] = (u8)SAMPLE_RATE;
+ buf[1] = (u8)(SAMPLE_RATE >> 8);
+ buf[2] = (u8)(SAMPLE_RATE >> 16);
+ value = 3;
+ break;
+ default:
+ break;
+ }
+ }
+
+ return value;
+}
+
+/*
+ * Control request handler for the audio function.
+ *
+ * Dispatches class requests addressed to an endpoint to the set/get
+ * helpers. When a helper returns a non-negative length, the shared ep0
+ * request is queued with that length. Returns the usb_ep_queue() result in
+ * that case, otherwise the negative value that makes the caller stall.
+ */
+static int
+audio_setup(struct usb_function *f, const struct usb_ctrlrequest *ctrl)
+{
+ struct usb_composite_dev *cdev = f->config->cdev;
+ struct usb_request *req = cdev->req;
+ int value = -EOPNOTSUPP;
+ u16 w_index = le16_to_cpu(ctrl->wIndex);
+ u16 w_value = le16_to_cpu(ctrl->wValue);
+ u16 w_length = le16_to_cpu(ctrl->wLength);
+
+ /* composite driver infrastructure handles everything; interface
+ * activation uses set_alt().
+ */
+ switch (ctrl->bRequestType) {
+ case USB_DIR_OUT | USB_TYPE_CLASS | USB_RECIP_ENDPOINT:
+ value = audio_set_endpoint_req(f, ctrl);
+ break;
+
+ case USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_ENDPOINT:
+ value = audio_get_endpoint_req(f, ctrl);
+ break;
+ }
+
+ /* respond with data transfer or status phase? */
+ if (value >= 0) {
+ pr_debug("audio req%02x.%02x v%04x i%04x l%d\n",
+ ctrl->bRequestType, ctrl->bRequest,
+ w_value, w_index, w_length);
+ req->zero = 0;
+ req->length = value;
+ req->complete = audio_control_complete;
+ value = usb_ep_queue(cdev->gadget->ep0, req, GFP_ATOMIC);
+ if (value < 0)
+ pr_err("audio response on err %d\n", value);
+ }
+
+ /* device either stalls (value < 0) or reports success */
+ return value;
+}
+
+/*
+ * set_alt callback: configures the IN endpoint for the current speed and
+ * enables it. @intf and @alt are only logged; every call takes the same
+ * path.
+ *
+ * Returns the config_ep_by_speed() error, otherwise 0.
+ */
+static int audio_set_alt(struct usb_function *f, unsigned intf, unsigned alt)
+{
+ struct audio_dev *audio = func_to_audio(f);
+ struct usb_composite_dev *cdev = f->config->cdev;
+ int ret;
+
+ pr_debug("audio_set_alt intf %d, alt %d\n", intf, alt);
+
+ ret = config_ep_by_speed(cdev->gadget, f, audio->in_ep);
+ if (ret)
+ return ret;
+
+ /* NOTE(review): the usb_ep_enable() result is ignored -- confirm this is intentional */
+ usb_ep_enable(audio->in_ep);
+ return 0;
+}
+
+/* disable callback: disables the IN endpoint. */
+static void audio_disable(struct usb_function *f)
+{
+ struct audio_dev *audio = func_to_audio(f);
+
+ pr_debug("audio_disable\n");
+ usb_ep_disable(audio->in_ep);
+}
+
+/* Intentionally empty: nothing is released here. */
+static void audio_free_func(struct usb_function *f)
+{
+ /* no-op */
+}
+
+/*-------------------------------------------------------------------------*/
+
+/*
+ * Fill in the run-time parts of the static descriptors: two channels on
+ * the input terminal and the type I format descriptor, and SAMPLE_RATE as
+ * the single discrete sampling frequency.
+ *
+ * The 3-byte tSamFreq field is written byte by byte, least significant
+ * first, so the result does not depend on host endianness (matching what
+ * audio_get_endpoint_req() reports). @audio is not used.
+ */
+static void audio_build_desc(struct audio_dev *audio)
+{
+	u8 *sam_freq;
+
+	/* Set channel numbers */
+	input_terminal_desc.bNrChannels = 2;
+	as_type_i_desc.bNrChannels = 2;
+
+	/* Set sample rates */
+	sam_freq = as_type_i_desc.tSamFreq[0];
+	sam_freq[0] = (u8)SAMPLE_RATE;
+	sam_freq[1] = (u8)(SAMPLE_RATE >> 8);
+	sam_freq[2] = (u8)(SAMPLE_RATE >> 16);
+}
+
+
+static int snd_card_setup(struct usb_configuration *c,
+ struct audio_source_config *config);
+static struct audio_source_instance *to_fi_audio_source(
+ const struct usb_function_instance *fi);
+
+
+/*
+ * audio function driver setup/binding
+ *
+ * With CONFIG_USB_CONFIGFS enabled the sound card is set up first. Then
+ * the descriptors are completed, two interface IDs are allocated and
+ * patched into the descriptors, the isochronous IN endpoint is claimed and
+ * IN_EP_REQ_COUNT requests are allocated onto the idle list.
+ *
+ * Returns 0 on success or a negative errno. If a request allocation fails,
+ * the requests already placed on the idle list are freed again.
+ *
+ * NOTE(review): a failure after snd_card_setup() succeeded does not undo
+ * the card setup here -- confirm whether the caller handles that.
+ */
+static int
+audio_bind(struct usb_configuration *c, struct usb_function *f)
+{
+	struct usb_composite_dev *cdev = c->cdev;
+	struct audio_dev *audio = func_to_audio(f);
+	int status;
+	struct usb_ep *ep;
+	struct usb_request *req;
+	int i;
+	int err;
+
+	if (IS_ENABLED(CONFIG_USB_CONFIGFS)) {
+		struct audio_source_instance *fi_audio =
+				to_fi_audio_source(f->fi);
+		struct audio_source_config *config =
+				fi_audio->config;
+
+		err = snd_card_setup(c, config);
+		if (err)
+			return err;
+	}
+
+	audio_build_desc(audio);
+
+	/* allocate instance-specific interface IDs, and patch descriptors */
+	status = usb_interface_id(c, f);
+	if (status < 0)
+		goto fail;
+	ac_interface_desc.bInterfaceNumber = status;
+
+	/* AUDIO_AC_INTERFACE */
+	ac_header_desc.baInterfaceNr[0] = status;
+
+	status = usb_interface_id(c, f);
+	if (status < 0)
+		goto fail;
+	as_interface_alt_0_desc.bInterfaceNumber = status;
+	as_interface_alt_1_desc.bInterfaceNumber = status;
+
+	/* AUDIO_AS_INTERFACE */
+	ac_header_desc.baInterfaceNr[1] = status;
+
+	status = -ENODEV;
+
+	/* allocate our endpoint */
+	ep = usb_ep_autoconfig(cdev->gadget, &fs_as_in_ep_desc);
+	if (!ep)
+		goto fail;
+	audio->in_ep = ep;
+	ep->driver_data = audio; /* claim */
+
+	if (gadget_is_dualspeed(c->cdev->gadget))
+		hs_as_in_ep_desc.bEndpointAddress =
+			fs_as_in_ep_desc.bEndpointAddress;
+
+	f->fs_descriptors = fs_audio_desc;
+	f->hs_descriptors = hs_audio_desc;
+
+	for (i = 0; i < IN_EP_REQ_COUNT; i++) {
+		req = audio_request_new(ep, IN_EP_MAX_PACKET_SIZE);
+		if (!req) {
+			status = -ENOMEM;
+			goto fail_free_reqs;
+		}
+		req->context = audio;
+		req->complete = audio_data_complete;
+		audio_req_put(audio, req);
+	}
+
+	return 0;
+
+fail_free_reqs:
+	/* release the requests queued before the failing allocation */
+	while ((req = audio_req_get(audio)))
+		audio_request_free(req, ep);
+fail:
+	return status;
+}
+
+/*
+ * unbind callback: frees every request on the idle list, releases the
+ * sound card once it is closed and clears the card/pcm/substream/endpoint
+ * pointers. With CONFIG_USB_CONFIGFS enabled the instance's card and
+ * device numbers are reset to -1.
+ */
+static void
+audio_unbind(struct usb_configuration *c, struct usb_function *f)
+{
+ struct audio_dev *audio = func_to_audio(f);
+ struct usb_request *req;
+
+ while ((req = audio_req_get(audio)))
+ audio_request_free(req, audio->in_ep);
+
+ snd_card_free_when_closed(audio->card);
+ audio->card = NULL;
+ audio->pcm = NULL;
+ audio->substream = NULL;
+ audio->in_ep = NULL;
+
+ if (IS_ENABLED(CONFIG_USB_CONFIGFS)) {
+ struct audio_source_instance *fi_audio =
+ to_fi_audio_source(f->fi);
+ struct audio_source_config *config =
+ fi_audio->config;
+
+ config->card = -1;
+ config->device = -1;
+ }
+}
+
+/*
+ * Start playback: reset the timing reference (start_time, frames_sent)
+ * that audio_send() paces itself against, then send the first data.
+ */
+static void audio_pcm_playback_start(struct audio_dev *audio)
+{
+ audio->start_time = ktime_get();
+ audio->frames_sent = 0;
+ audio_send(audio);
+}
+
+/*
+ * Stop playback by clearing the buffer pointers under audio->lock;
+ * audio_send() and audio_data_complete() stop once they see them NULL.
+ */
+static void audio_pcm_playback_stop(struct audio_dev *audio)
+{
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&audio->lock, irq_flags);
+	audio->buffer_start = NULL;
+	audio->buffer_end = NULL;
+	audio->buffer_pos = NULL;
+	spin_unlock_irqrestore(&audio->lock, irq_flags);
+}
+
+/*
+ * PCM open: attaches the audio_dev to the runtime, installs the hardware
+ * capabilities from audio_hw_info, records the substream and adds a
+ * CPU/DMA latency QoS request. Always returns 0.
+ */
+static int audio_pcm_open(struct snd_pcm_substream *substream)
+{
+ struct snd_pcm_runtime *runtime = substream->runtime;
+ struct audio_dev *audio = substream->private_data;
+
+ runtime->private_data = audio;
+ runtime->hw = audio_hw_info;
+ snd_pcm_limit_hw_rates(runtime);
+ runtime->hw.channels_max = 2;
+
+ audio->substream = substream;
+
+ /* Add the QoS request and set the latency to 0 */
+ pm_qos_add_request(&audio->pm_qos, PM_QOS_CPU_DMA_LATENCY, 0);
+
+ return 0;
+}
+
+/*
+ * PCM close callback: forget the substream and drop the PM QoS request
+ * added in audio_pcm_open().  Always returns 0.
+ */
+static int audio_pcm_close(struct snd_pcm_substream *substream)
+{
+	struct audio_dev *audio = substream->private_data;
+	unsigned long flags;
+
+	/* only the substream pointer needs the lock */
+	spin_lock_irqsave(&audio->lock, flags);
+	audio->substream = NULL;
+	spin_unlock_irqrestore(&audio->lock, flags);
+
+	/*
+	 * Remove the QoS request outside the spinlock:
+	 * pm_qos_remove_request() may sleep, which is not allowed with
+	 * interrupts disabled.
+	 */
+	pm_qos_remove_request(&audio->pm_qos);
+
+	return 0;
+}
+
+/*
+ * PCM hw_params callback.  Only SAMPLE_RATE with exactly two channels is
+ * accepted (-EINVAL otherwise); on success a vmalloc buffer of the
+ * requested size is allocated for the substream.
+ */
+static int audio_pcm_hw_params(struct snd_pcm_substream *substream,
+				struct snd_pcm_hw_params *params)
+{
+	if (params_rate(params) != SAMPLE_RATE ||
+	    params_channels(params) != 2)
+		return -EINVAL;
+
+	return snd_pcm_lib_alloc_vmalloc_buffer(substream,
+					params_buffer_bytes(params));
+}
+
+/* PCM hw_free callback: release the buffer obtained in hw_params. */
+static int audio_pcm_hw_free(struct snd_pcm_substream *substream)
+{
+	int ret = snd_pcm_lib_free_vmalloc_buffer(substream);
+
+	return ret;
+}
+
+/*
+ * PCM prepare callback: cache the period size and point the buffer
+ * cursors (start, end, current position) at the runtime's DMA area.
+ * Always returns 0.
+ */
+static int audio_pcm_prepare(struct snd_pcm_substream *substream)
+{
+	struct snd_pcm_runtime *rt = substream->runtime;
+	struct audio_dev *adev = rt->private_data;
+
+	adev->period = snd_pcm_lib_period_bytes(substream);
+	adev->period_offset = 0;
+
+	adev->buffer_start = rt->dma_area;
+	adev->buffer_pos = adev->buffer_start;
+	adev->buffer_end = adev->buffer_start +
+				snd_pcm_lib_buffer_bytes(substream);
+
+	return 0;
+}
+
+/*
+ * PCM pointer callback: report, in frames, how far buffer_pos has
+ * advanced from buffer_start (the offset of the next frame to fill).
+ */
+static snd_pcm_uframes_t audio_pcm_pointer(struct snd_pcm_substream *substream)
+{
+	struct snd_pcm_runtime *rt = substream->runtime;
+	struct audio_dev *adev = rt->private_data;
+	ssize_t offset_bytes;
+
+	offset_bytes = adev->buffer_pos - adev->buffer_start;
+	return bytes_to_frames(rt, offset_bytes);
+}
+
+/*
+ * PCM trigger callback.  START/RESUME begin playback, STOP/SUSPEND end
+ * it; any other command is rejected with -EINVAL.
+ */
+static int audio_pcm_playback_trigger(struct snd_pcm_substream *substream,
+					int cmd)
+{
+	struct audio_dev *adev = substream->runtime->private_data;
+
+	if (cmd == SNDRV_PCM_TRIGGER_START ||
+	    cmd == SNDRV_PCM_TRIGGER_RESUME) {
+		audio_pcm_playback_start(adev);
+		return 0;
+	}
+
+	if (cmd == SNDRV_PCM_TRIGGER_STOP ||
+	    cmd == SNDRV_PCM_TRIGGER_SUSPEND) {
+		audio_pcm_playback_stop(adev);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+static struct audio_dev _audio_dev = { /* statically allocated audio source state; audio_source_alloc() hands out its .func */
+ .func = {
+ .name = "audio_source",
+ .bind = audio_bind,
+ .unbind = audio_unbind,
+ .set_alt = audio_set_alt,
+ .setup = audio_setup,
+ .disable = audio_disable,
+ .free_func = audio_free_func,
+ },
+ .lock = __SPIN_LOCK_UNLOCKED(_audio_dev.lock),
+ .idle_reqs = LIST_HEAD_INIT(_audio_dev.idle_reqs), /* list starts out empty */
+};
+
+static struct snd_pcm_ops audio_playback_ops = { /* PCM callbacks installed on the playback stream by snd_card_setup() */
+ .open = audio_pcm_open,
+ .close = audio_pcm_close,
+ .ioctl = snd_pcm_lib_ioctl,
+ .hw_params = audio_pcm_hw_params,
+ .hw_free = audio_pcm_hw_free,
+ .prepare = audio_pcm_prepare,
+ .trigger = audio_pcm_playback_trigger,
+ .pointer = audio_pcm_pointer,
+};
+
+/*
+ * Create the sound card for the audio source and add the function to
+ * configuration @c.
+ *
+ * @config receives the PCM card/device numbers; both are -1 whenever
+ * this function fails.  Returns 0 on success or a negative errno.
+ */
+int audio_source_bind_config(struct usb_configuration *c,
+		struct audio_source_config *config)
+{
+	struct audio_dev *audio = &_audio_dev;
+	int err;
+
+	config->card = -1;
+	config->device = -1;
+
+	err = snd_card_setup(c, config);
+	if (err)
+		return err;
+
+	err = usb_add_function(c, &audio->func);
+	if (err)
+		goto add_fail;
+
+	return 0;
+
+add_fail:
+	snd_card_free(audio->card);
+	/*
+	 * The card (and the PCM that belongs to it) is gone: do not leave
+	 * stale pointers in the static device or stale numbers in @config.
+	 */
+	audio->card = NULL;
+	audio->pcm = NULL;
+	config->card = -1;
+	config->device = -1;
+	return err;
+}
+
+/*
+ * Allocate and register the ALSA card with a single playback PCM for the
+ * audio source function.
+ *
+ * On success audio->card / audio->pcm are set and @config carries the
+ * card and device numbers.  On failure the new card is freed, no pointer
+ * to it is left behind, and a negative errno is returned.
+ */
+static int snd_card_setup(struct usb_configuration *c,
+		struct audio_source_config *config)
+{
+	struct audio_dev *audio = &_audio_dev;
+	struct snd_card *card;
+	struct snd_pcm *pcm;
+	int err;
+
+	err = snd_card_new(&c->cdev->gadget->dev,
+			SNDRV_DEFAULT_IDX1, SNDRV_DEFAULT_STR1,
+			THIS_MODULE, 0, &card);
+	if (err)
+		return err;
+
+	err = snd_pcm_new(card, "USB audio source", 0, 1, 0, &pcm);
+	if (err)
+		goto fail;
+
+	pcm->private_data = audio;
+	pcm->info_flags = 0;
+	audio->pcm = pcm;
+
+	strlcpy(pcm->name, "USB gadget audio", sizeof(pcm->name));
+
+	snd_pcm_set_ops(pcm, SNDRV_PCM_STREAM_PLAYBACK, &audio_playback_ops);
+	snd_pcm_lib_preallocate_pages_for_all(pcm, SNDRV_DMA_TYPE_DEV,
+				NULL, 0, 64 * 1024);
+
+	strlcpy(card->driver, "audio_source", sizeof(card->driver));
+	strlcpy(card->shortname, card->driver, sizeof(card->shortname));
+	strlcpy(card->longname, "USB accessory audio source",
+		sizeof(card->longname));
+
+	err = snd_card_register(card);
+	if (err)
+		goto fail;
+
+	config->card = pcm->card->number;
+	config->device = pcm->device;
+	audio->card = card;
+	return 0;
+
+fail:
+	/*
+	 * Free the card created above.  audio->card has not been assigned
+	 * yet on this path, so it must not be the pointer passed here.
+	 */
+	snd_card_free(card);
+	audio->pcm = NULL;
+	return err;
+}
+
+/* Map a configfs item back to the audio_source_instance that embeds it. */
+static struct audio_source_instance *to_audio_source_instance(
+					struct config_item *item)
+{
+	struct config_group *group = to_config_group(item);
+
+	return container_of(group, struct audio_source_instance,
+			func_inst.group);
+}
+
+/* Map a usb_function_instance back to its enclosing audio_source_instance. */
+static struct audio_source_instance *to_fi_audio_source(
+				const struct usb_function_instance *fi)
+{
+	struct audio_source_instance *fi_audio;
+
+	fi_audio = container_of(fi, struct audio_source_instance, func_inst);
+	return fi_audio;
+}
+
+/* configfs release hook: drop the reference on the function instance. */
+static void audio_source_attr_release(struct config_item *item)
+{
+	usb_put_function_instance(&to_audio_source_instance(item)->func_inst);
+}
+
+/*
+ * Store a private copy of @name in the instance.
+ * Returns -ENAMETOOLONG if the name plus its terminator exceeds
+ * MAX_INST_NAME_LEN, -ENOMEM if the copy cannot be allocated, else 0.
+ */
+static int audio_source_set_inst_name(struct usb_function_instance *fi,
+		const char *name)
+{
+	struct audio_source_instance *fi_audio = to_fi_audio_source(fi);
+	int len_with_nul = strlen(name) + 1;
+	char *copy;
+
+	if (len_with_nul > MAX_INST_NAME_LEN)
+		return -ENAMETOOLONG;
+
+	copy = kstrndup(name, len_with_nul, GFP_KERNEL);
+	if (!copy)
+		return -ENOMEM;
+
+	fi_audio->name = copy;
+	return 0;
+}
+
+/*
+ * free_func_inst hook: destroy the sysfs device and release everything
+ * audio_source_alloc_inst() and audio_source_set_inst_name() allocated,
+ * including the instance structure itself.
+ */
+static void audio_source_free_inst(struct usb_function_instance *fi)
+{
+	struct audio_source_instance *fi_audio;
+
+	fi_audio = to_fi_audio_source(fi);
+	device_destroy(fi_audio->audio_device->class,
+			fi_audio->audio_device->devt);
+	kfree(fi_audio->name);
+	kfree(fi_audio->config);
+	/* @fi is embedded in fi_audio, so this must be the last access */
+	kfree(fi_audio);
+}
+
+/* sysfs show: emit "<card> <device>\n" from the instance's config. */
+static ssize_t audio_source_pcm_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct audio_source_instance *inst = dev_get_drvdata(dev);
+	struct audio_source_config *cfg = inst->config;
+	int card = cfg->card;
+	int device = cfg->device;
+
+	return sprintf(buf, "%d %d\n", card, device);
+}
+
+struct device *create_function_device(char *name);
+
+/*
+ * Allocate a function instance together with its config and the
+ * "f_audio_source" device carrying the attribute files.
+ *
+ * Returns the embedded usb_function_instance, or an ERR_PTR() after
+ * releasing everything allocated so far.
+ */
+static struct usb_function_instance *audio_source_alloc_inst(void)
+{
+	struct audio_source_instance *fi_audio;
+	struct device_attribute **attrs;
+	struct device_attribute *attr;
+	struct device *dev;
+	void *err_ptr;
+	int err = 0;
+
+	fi_audio = kzalloc(sizeof(*fi_audio), GFP_KERNEL);
+	if (!fi_audio)
+		return ERR_PTR(-ENOMEM);
+
+	fi_audio->func_inst.set_inst_name = audio_source_set_inst_name;
+	fi_audio->func_inst.free_func_inst = audio_source_free_inst;
+
+	fi_audio->config = kzalloc(sizeof(struct audio_source_config),
+							GFP_KERNEL);
+	if (!fi_audio->config) {
+		err_ptr = ERR_PTR(-ENOMEM);
+		goto fail_audio;
+	}
+
+	config_group_init_type_name(&fi_audio->func_inst.group, "",
+						&audio_source_func_type);
+	dev = create_function_device("f_audio_source");
+
+	if (IS_ERR(dev)) {
+		err_ptr = dev;
+		goto fail_audio_config;
+	}
+
+	fi_audio->config->card = -1;
+	fi_audio->config->device = -1;
+	fi_audio->audio_device = dev;
+
+	/*
+	 * Set drvdata before any attribute file exists: the show handler
+	 * dereferences it without a NULL check and can run as soon as
+	 * device_create_file() returns.
+	 */
+	dev_set_drvdata(dev, fi_audio);
+
+	attrs = audio_source_function_attributes;
+	if (attrs) {
+		while ((attr = *attrs++) && !err)
+			err = device_create_file(dev, attr);
+		if (err) {
+			err_ptr = ERR_PTR(-EINVAL);
+			goto fail_device;
+		}
+	}
+
+	_audio_dev.config = fi_audio->config;
+
+	return &fi_audio->func_inst;
+
+fail_device:
+	device_destroy(dev->class, dev->devt);
+fail_audio_config:
+	kfree(fi_audio->config);
+fail_audio:
+	kfree(fi_audio);
+	return err_ptr;
+
+}
+
+/* alloc hook: every instance shares the usb_function inside _audio_dev. */
+static struct usb_function *audio_source_alloc(struct usb_function_instance *fi)
+{
+	struct audio_dev *audio = &_audio_dev;
+
+	return &audio->func;
+}
+
+DECLARE_USB_FUNCTION_INIT(audio_source, audio_source_alloc_inst,
+ audio_source_alloc);
+MODULE_LICENSE("GPL");
diff --git a/drivers/usb/gadget/function/f_midi.c b/drivers/usb/gadget/function/f_midi.c
index 70ac1963b598..7d6a48c5ca40 100644
--- a/drivers/usb/gadget/function/f_midi.c
+++ b/drivers/usb/gadget/function/f_midi.c
@@ -1168,6 +1168,65 @@ static void f_midi_free_inst(struct usb_function_instance *f)
kfree(opts);
}
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+extern struct device *create_function_device(char *name);
+/*
+ * sysfs show for the "alsa" attribute: print the rawmidi card and device
+ * numbers as "<card> <device>\n", or "-1 -1\n" while no function (or no
+ * rawmidi card) is available.
+ */
+static ssize_t alsa_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct usb_function_instance *fi_midi = dev_get_drvdata(dev);
+	struct f_midi *midi;
+
+	/* check the instance itself before looking at its function */
+	if (!fi_midi || !fi_midi->f) {
+		dev_warn(dev, "f_midi: function not set\n");
+		return sprintf(buf, "%d %d\n", -1, -1);
+	}
+
+	midi = func_to_midi(fi_midi->f);
+	if (midi->rmidi && midi->rmidi->card)
+		return sprintf(buf, "%d %d\n",
+			midi->rmidi->card->number, midi->rmidi->device);
+
+	return sprintf(buf, "%d %d\n", -1, -1);
+}
+
+static DEVICE_ATTR(alsa, S_IRUGO, alsa_show, NULL); /* read-only attribute, no store handler */
+
+static struct device_attribute *alsa_function_attributes[] = { /* NULL-terminated; walked by create_alsa_device() */
+ &dev_attr_alsa,
+ NULL
+};
+
+/*
+ * Create the "f_midi" function device and its attribute files for @fi.
+ * Returns 0 on success, the create_function_device() error, or -EINVAL
+ * if an attribute file cannot be created.
+ */
+static int create_alsa_device(struct usb_function_instance *fi)
+{
+	struct device *dev;
+	struct device_attribute **attrs;
+	struct device_attribute *attr;
+	int err = 0;
+
+	dev = create_function_device("f_midi");
+	if (IS_ERR(dev))
+		return PTR_ERR(dev);
+
+	/*
+	 * Set drvdata before the files exist so that a show handler can
+	 * never run with a NULL instance pointer.
+	 */
+	dev_set_drvdata(dev, fi);
+
+	attrs = alsa_function_attributes;
+	if (attrs) {
+		while ((attr = *attrs++) && !err)
+			err = device_create_file(dev, attr);
+		if (err) {
+			device_destroy(dev->class, dev->devt);
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+#else
+static int create_alsa_device(struct usb_function_instance *fi)
+{ /* stub used when CONFIG_USB_CONFIGFS_UEVENT is not set: nothing to create */
+ return 0;
+}
+#endif
+
static struct usb_function_instance *f_midi_alloc_inst(void)
{
struct f_midi_opts *opts;
@@ -1185,6 +1244,11 @@ static struct usb_function_instance *f_midi_alloc_inst(void)
opts->in_ports = 1;
opts->out_ports = 1;
+ if (create_alsa_device(&opts->func_inst)) {
+ kfree(opts);
+ return ERR_PTR(-ENODEV);
+ }
+
config_group_init_type_name(&opts->func_inst.group, "",
&midi_func_type);
@@ -1202,6 +1266,7 @@ static void f_midi_free(struct usb_function *f)
mutex_lock(&opts->lock);
kfifo_free(&midi->in_req_fifo);
kfree(midi);
+ opts->func_inst.f = NULL;
--opts->refcnt;
mutex_unlock(&opts->lock);
}
@@ -1281,6 +1346,7 @@ static struct usb_function *f_midi_alloc(struct usb_function_instance *fi)
midi->func.disable = f_midi_disable;
midi->func.free_func = f_midi_free;
+ fi->f = &midi->func;
return &midi->func;
setup_fail:
diff --git a/drivers/usb/gadget/function/f_mtp.c b/drivers/usb/gadget/function/f_mtp.c
new file mode 100644
index 000000000000..9515b2a7d0e0
--- /dev/null
+++ b/drivers/usb/gadget/function/f_mtp.c
@@ -0,0 +1,1552 @@
+/*
+ * Gadget Function Driver for MTP
+ *
+ * Copyright (C) 2010 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+/* #define DEBUG */
+/* #define VERBOSE_DEBUG */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/delay.h>
+#include <linux/wait.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+
+#include <linux/types.h>
+#include <linux/file.h>
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+
+#include <linux/usb.h>
+#include <linux/usb_usual.h>
+#include <linux/usb/ch9.h>
+#include <linux/usb/f_mtp.h>
+#include <linux/configfs.h>
+#include <linux/usb/composite.h>
+
+#include "configfs.h"
+
+#define MTP_BULK_BUFFER_SIZE 16384
+#define INTR_BUFFER_SIZE 28
+#define MAX_INST_NAME_LEN 40
+#define MTP_MAX_FILE_SIZE 0xFFFFFFFFL
+
+/* String IDs */
+#define INTERFACE_STRING_INDEX 0
+
+/* values for mtp_dev.state */
+#define STATE_OFFLINE 0 /* initial state, disconnected */
+#define STATE_READY 1 /* ready for userspace calls */
+#define STATE_BUSY 2 /* processing userspace calls */
+#define STATE_CANCELED 3 /* transaction canceled by host */
+#define STATE_ERROR 4 /* error from completion routine */
+
+/* number of tx and rx requests to allocate */
+#define TX_REQ_MAX 4
+#define RX_REQ_MAX 2
+#define INTR_REQ_MAX 5
+
+/* ID for Microsoft MTP OS String */
+#define MTP_OS_STRING_ID 0xEE
+
+/* MTP class requests */
+#define MTP_REQ_CANCEL 0x64
+#define MTP_REQ_GET_EXT_EVENT_DATA 0x65
+#define MTP_REQ_RESET 0x66
+#define MTP_REQ_GET_DEVICE_STATUS 0x67
+
+/* constants for device status */
+#define MTP_RESPONSE_OK 0x2001
+#define MTP_RESPONSE_DEVICE_BUSY 0x2019
+#define DRIVER_NAME "mtp"
+
+static const char mtp_shortname[] = DRIVER_NAME "_usb";
+
+struct mtp_dev { /* state of the MTP/PTP gadget function */
+ struct usb_function function;
+ struct usb_composite_dev *cdev;
+ spinlock_t lock; /* taken around request-list updates and the state changes in mtp_read()/mtp_write() */
+
+ struct usb_ep *ep_in; /* endpoints claimed in mtp_create_bulk_endpoints() */
+ struct usb_ep *ep_out;
+ struct usb_ep *ep_intr;
+
+ int state; /* one of the STATE_* values */
+
+ /* synchronize access to our device file */
+ atomic_t open_excl;
+ /* to enforce only one ioctl at a time */
+ atomic_t ioctl_excl;
+
+ struct list_head tx_idle; /* idle requests for ep_in */
+ struct list_head intr_idle; /* idle requests for ep_intr */
+
+ wait_queue_head_t read_wq; /* woken by mtp_complete_out() */
+ wait_queue_head_t write_wq; /* woken by mtp_complete_in() */
+ wait_queue_head_t intr_wq; /* woken by mtp_complete_intr() */
+ struct usb_request *rx_req[RX_REQ_MAX]; /* requests for ep_out */
+ int rx_done; /* cleared before an rx request is queued, set by mtp_complete_out() */
+
+ /* for processing MTP_SEND_FILE, MTP_RECEIVE_FILE and
+ * MTP_SEND_FILE_WITH_HEADER ioctls on a work queue
+ */
+ struct workqueue_struct *wq;
+ struct work_struct send_file_work;
+ struct work_struct receive_file_work;
+ struct file *xfer_file; /* parameters read by the work functions */
+ loff_t xfer_file_offset;
+ int64_t xfer_file_length;
+ unsigned xfer_send_header; /* non-zero: send_file_work() prepends an mtp_data_header */
+ uint16_t xfer_command;
+ uint32_t xfer_transaction_id;
+ int xfer_result; /* written by the work functions when they finish */
+};
+
+static struct usb_interface_descriptor mtp_interface_desc = { /* MTP flavour: vendor-specific class and subclass */
+ .bLength = USB_DT_INTERFACE_SIZE,
+ .bDescriptorType = USB_DT_INTERFACE,
+ .bInterfaceNumber = 0,
+ .bNumEndpoints = 3,
+ .bInterfaceClass = USB_CLASS_VENDOR_SPEC,
+ .bInterfaceSubClass = USB_SUBCLASS_VENDOR_SPEC,
+ .bInterfaceProtocol = 0,
+};
+
+static struct usb_interface_descriptor ptp_interface_desc = { /* PTP flavour: still-image class, subclass 1, protocol 1 */
+ .bLength = USB_DT_INTERFACE_SIZE,
+ .bDescriptorType = USB_DT_INTERFACE,
+ .bInterfaceNumber = 0,
+ .bNumEndpoints = 3,
+ .bInterfaceClass = USB_CLASS_STILL_IMAGE,
+ .bInterfaceSubClass = 1,
+ .bInterfaceProtocol = 1,
+};
+
+static struct usb_endpoint_descriptor mtp_ss_in_desc = { /* bulk IN, 1024-byte max packet (ss_*_descs tables) */
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_IN,
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+ .wMaxPacketSize = __constant_cpu_to_le16(1024),
+};
+
+static struct usb_ss_ep_comp_descriptor mtp_ss_in_comp_desc = { /* companion of mtp_ss_in_desc */
+ .bLength = sizeof(mtp_ss_in_comp_desc),
+ .bDescriptorType = USB_DT_SS_ENDPOINT_COMP,
+ /* .bMaxBurst = DYNAMIC, */
+};
+
+static struct usb_endpoint_descriptor mtp_ss_out_desc = { /* bulk OUT, 1024-byte max packet (ss_*_descs tables) */
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_OUT,
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+ .wMaxPacketSize = __constant_cpu_to_le16(1024),
+};
+
+static struct usb_ss_ep_comp_descriptor mtp_ss_out_comp_desc = { /* companion of mtp_ss_out_desc */
+ .bLength = sizeof(mtp_ss_out_comp_desc),
+ .bDescriptorType = USB_DT_SS_ENDPOINT_COMP,
+ /* .bMaxBurst = DYNAMIC, */
+};
+
+static struct usb_endpoint_descriptor mtp_highspeed_in_desc = { /* bulk IN, 512-byte max packet (hs_*_descs tables) */
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_IN,
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+ .wMaxPacketSize = __constant_cpu_to_le16(512),
+};
+
+static struct usb_endpoint_descriptor mtp_highspeed_out_desc = { /* bulk OUT, 512-byte max packet (hs_*_descs tables) */
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_OUT,
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+ .wMaxPacketSize = __constant_cpu_to_le16(512),
+};
+
+static struct usb_endpoint_descriptor mtp_fullspeed_in_desc = { /* bulk IN; wMaxPacketSize is not set here */
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_IN,
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+};
+
+static struct usb_endpoint_descriptor mtp_fullspeed_out_desc = { /* bulk OUT; wMaxPacketSize is not set here */
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_OUT,
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+};
+
+static struct usb_endpoint_descriptor mtp_intr_desc = { /* interrupt IN; listed in every descriptor table below */
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+ .bEndpointAddress = USB_DIR_IN,
+ .bmAttributes = USB_ENDPOINT_XFER_INT,
+ .wMaxPacketSize = __constant_cpu_to_le16(INTR_BUFFER_SIZE),
+ .bInterval = 6,
+};
+
+static struct usb_ss_ep_comp_descriptor mtp_intr_ss_comp_desc = { /* companion of mtp_intr_desc */
+ .bLength = sizeof(mtp_intr_ss_comp_desc),
+ .bDescriptorType = USB_DT_SS_ENDPOINT_COMP,
+ .wBytesPerInterval = cpu_to_le16(INTR_BUFFER_SIZE),
+};
+
+static struct usb_descriptor_header *fs_mtp_descs[] = { /* MTP interface + fullspeed endpoints, NULL-terminated */
+ (struct usb_descriptor_header *) &mtp_interface_desc,
+ (struct usb_descriptor_header *) &mtp_fullspeed_in_desc,
+ (struct usb_descriptor_header *) &mtp_fullspeed_out_desc,
+ (struct usb_descriptor_header *) &mtp_intr_desc,
+ NULL,
+};
+
+static struct usb_descriptor_header *hs_mtp_descs[] = { /* MTP interface + highspeed endpoints */
+ (struct usb_descriptor_header *) &mtp_interface_desc,
+ (struct usb_descriptor_header *) &mtp_highspeed_in_desc,
+ (struct usb_descriptor_header *) &mtp_highspeed_out_desc,
+ (struct usb_descriptor_header *) &mtp_intr_desc,
+ NULL,
+};
+
+static struct usb_descriptor_header *ss_mtp_descs[] = { /* MTP interface + ss endpoints, each followed by its companion */
+ (struct usb_descriptor_header *) &mtp_interface_desc,
+ (struct usb_descriptor_header *) &mtp_ss_in_desc,
+ (struct usb_descriptor_header *) &mtp_ss_in_comp_desc,
+ (struct usb_descriptor_header *) &mtp_ss_out_desc,
+ (struct usb_descriptor_header *) &mtp_ss_out_comp_desc,
+ (struct usb_descriptor_header *) &mtp_intr_desc,
+ (struct usb_descriptor_header *) &mtp_intr_ss_comp_desc,
+ NULL,
+};
+
+static struct usb_descriptor_header *fs_ptp_descs[] = { /* same as fs_mtp_descs but with the PTP interface */
+ (struct usb_descriptor_header *) &ptp_interface_desc,
+ (struct usb_descriptor_header *) &mtp_fullspeed_in_desc,
+ (struct usb_descriptor_header *) &mtp_fullspeed_out_desc,
+ (struct usb_descriptor_header *) &mtp_intr_desc,
+ NULL,
+};
+
+static struct usb_descriptor_header *hs_ptp_descs[] = { /* same as hs_mtp_descs but with the PTP interface */
+ (struct usb_descriptor_header *) &ptp_interface_desc,
+ (struct usb_descriptor_header *) &mtp_highspeed_in_desc,
+ (struct usb_descriptor_header *) &mtp_highspeed_out_desc,
+ (struct usb_descriptor_header *) &mtp_intr_desc,
+ NULL,
+};
+
+static struct usb_descriptor_header *ss_ptp_descs[] = { /* same as ss_mtp_descs but with the PTP interface */
+ (struct usb_descriptor_header *) &ptp_interface_desc,
+ (struct usb_descriptor_header *) &mtp_ss_in_desc,
+ (struct usb_descriptor_header *) &mtp_ss_in_comp_desc,
+ (struct usb_descriptor_header *) &mtp_ss_out_desc,
+ (struct usb_descriptor_header *) &mtp_ss_out_comp_desc,
+ (struct usb_descriptor_header *) &mtp_intr_desc,
+ (struct usb_descriptor_header *) &mtp_intr_ss_comp_desc,
+ NULL,
+};
+
+static struct usb_string mtp_string_defs[] = { /* single interface string plus terminator */
+ /* Naming interface "MTP" so libmtp will recognize us */
+ [INTERFACE_STRING_INDEX].s = "MTP",
+ { }, /* end of list */
+};
+
+static struct usb_gadget_strings mtp_string_table = { /* wraps mtp_string_defs for one language */
+ .language = 0x0409, /* en-US */
+ .strings = mtp_string_defs,
+};
+
+static struct usb_gadget_strings *mtp_strings[] = { /* NULL-terminated list of string tables */
+ &mtp_string_table,
+ NULL,
+};
+
+/* Microsoft MTP OS String: a raw string descriptor, spelled out byte by
+ * byte with each signature character followed by a zero byte
+ */
+static u8 mtp_os_string[] = {
+ 18, /* sizeof(mtp_os_string) */
+ USB_DT_STRING,
+ /* Signature field: "MSFT100" */
+ 'M', 0, 'S', 0, 'F', 0, 'T', 0, '1', 0, '0', 0, '0', 0,
+ /* vendor code */
+ 1,
+ /* padding */
+ 0
+};
+
+/*
+ * Microsoft Extended Configuration Descriptor Header Section.
+ * The multi-byte fields are filled in with cpu_to_le*() values, so they
+ * all carry little-endian types.
+ */
+struct mtp_ext_config_desc_header {
+	__le32	dwLength;
+	__le16	bcdVersion;	/* was __u16; initialised via cpu_to_le16 */
+	__le16	wIndex;
+	__u8	bCount;
+	__u8	reserved[7];
+};
+
+/* Microsoft Extended Configuration Descriptor Function Section:
+ * follows the header section inside mtp_ext_config_desc
+ */
+struct mtp_ext_config_desc_function {
+ __u8 bFirstInterfaceNumber;
+ __u8 bInterfaceCount;
+ __u8 compatibleID[8];
+ __u8 subCompatibleID[8];
+ __u8 reserved[6];
+};
+
+/* MTP Extended Configuration Descriptor: one header followed by one
+ * function section whose compatible ID is "MTP"
+ */
+struct {
+ struct mtp_ext_config_desc_header header;
+ struct mtp_ext_config_desc_function function;
+} mtp_ext_config_desc = {
+ .header = {
+ .dwLength = __constant_cpu_to_le32(sizeof(mtp_ext_config_desc)),
+ .bcdVersion = __constant_cpu_to_le16(0x0100),
+ .wIndex = __constant_cpu_to_le16(4),
+ .bCount = 1, /* one function section follows */
+ },
+ .function = {
+ .bFirstInterfaceNumber = 0,
+ .bInterfaceCount = 1,
+ .compatibleID = { 'M', 'T', 'P' }, /* remaining bytes are zero-filled */
+ },
+};
+
+struct mtp_device_status { /* two little-endian 16-bit fields: length and code */
+ __le16 wLength;
+ __le16 wCode;
+};
+
+struct mtp_data_header { /* header that send_file_work() writes at the start of the first buffer */
+ /* length of packet, including this header */
+ __le32 length;
+ /* container type (2 for data packet) */
+ __le16 type;
+ /* MTP command code */
+ __le16 command;
+ /* MTP transaction ID */
+ __le32 transaction_id;
+};
+
+struct mtp_instance { /* function instance wrapper around struct mtp_dev */
+ struct usb_function_instance func_inst;
+ const char *name;
+ struct mtp_dev *dev;
+ char mtp_ext_compat_id[16];
+ struct usb_os_desc mtp_os_desc;
+};
+
+/* temporary variable used between mtp_open() and mtp_gadget_bind();
+ * the request completion callbacks below also read the device from it
+ */
+static struct mtp_dev *_mtp_dev;
+
+/* Recover the mtp_dev that embeds usb_function @f. */
+static inline struct mtp_dev *func_to_mtp(struct usb_function *f)
+{
+	struct mtp_dev *dev = container_of(f, struct mtp_dev, function);
+
+	return dev;
+}
+
+/*
+ * Allocate a request on @ep together with a @buffer_size byte data
+ * buffer.  Returns NULL if either allocation fails; nothing is leaked.
+ */
+static struct usb_request *mtp_request_new(struct usb_ep *ep, int buffer_size)
+{
+	struct usb_request *request;
+
+	request = usb_ep_alloc_request(ep, GFP_KERNEL);
+	if (request) {
+		request->buf = kmalloc(buffer_size, GFP_KERNEL);
+		if (request->buf)
+			return request;
+
+		/* no buffer: give the bare request back */
+		usb_ep_free_request(ep, request);
+	}
+
+	return NULL;
+}
+
+/* Free @req and its data buffer; a NULL @req is ignored. */
+static void mtp_request_free(struct usb_request *req, struct usb_ep *ep)
+{
+	if (!req)
+		return;
+
+	kfree(req->buf);
+	usb_ep_free_request(ep, req);
+}
+
+/*
+ * Try to take the exclusion counter @excl.
+ * Returns 0 if this caller raised it from 0 to 1, otherwise undoes the
+ * increment and returns -1.
+ */
+static inline int mtp_lock(atomic_t *excl)
+{
+	if (atomic_inc_return(excl) != 1) {
+		atomic_dec(excl);
+		return -1;
+	}
+
+	return 0;
+}
+
+static inline void mtp_unlock(atomic_t *excl)
+{
+ atomic_dec(excl); /* undo the increment kept by a successful mtp_lock() */
+}
+
+/* Append @req to the tail of list @head, holding dev->lock. */
+static void mtp_req_put(struct mtp_dev *dev, struct list_head *head,
+		struct usb_request *req)
+{
+	unsigned long irq_state;
+
+	spin_lock_irqsave(&dev->lock, irq_state);
+	list_add_tail(&req->list, head);
+	spin_unlock_irqrestore(&dev->lock, irq_state);
+}
+
+/*
+ * Detach and return the request at the head of list @head, holding
+ * dev->lock; returns NULL when the list is empty.
+ */
+static struct usb_request
+*mtp_req_get(struct mtp_dev *dev, struct list_head *head)
+{
+	struct usb_request *first = NULL;
+	unsigned long irq_state;
+
+	spin_lock_irqsave(&dev->lock, irq_state);
+	if (!list_empty(head)) {
+		first = list_first_entry(head, struct usb_request, list);
+		list_del(&first->list);
+	}
+	spin_unlock_irqrestore(&dev->lock, irq_state);
+
+	return first;
+}
+
+/*
+ * Completion handler for ep_in requests: flag an error state on a failed
+ * transfer, return the request to tx_idle and wake waiting writers.
+ */
+static void mtp_complete_in(struct usb_ep *ep, struct usb_request *req)
+{
+	struct mtp_dev *mtp = _mtp_dev;
+
+	if (req->status)
+		mtp->state = STATE_ERROR;
+
+	mtp_req_put(mtp, &mtp->tx_idle, req);
+	wake_up(&mtp->write_wq);
+}
+
+/*
+ * Completion handler for ep_out requests: mark the read as done, flag an
+ * error state on a failed transfer and wake waiting readers.
+ */
+static void mtp_complete_out(struct usb_ep *ep, struct usb_request *req)
+{
+	struct mtp_dev *mtp = _mtp_dev;
+
+	mtp->rx_done = 1;
+	if (req->status)
+		mtp->state = STATE_ERROR;
+
+	wake_up(&mtp->read_wq);
+}
+
+/*
+ * Completion handler for ep_intr requests: flag an error state on a
+ * failed transfer, return the request to intr_idle and wake waiters.
+ */
+static void mtp_complete_intr(struct usb_ep *ep, struct usb_request *req)
+{
+	struct mtp_dev *mtp = _mtp_dev;
+
+	if (req->status)
+		mtp->state = STATE_ERROR;
+
+	mtp_req_put(mtp, &mtp->intr_idle, req);
+	wake_up(&mtp->intr_wq);
+}
+
+/*
+ * Claim the IN, OUT and interrupt endpoints described by @in_desc,
+ * @out_desc and @intr_desc and allocate the request pools for them
+ * (TX_REQ_MAX, RX_REQ_MAX and INTR_REQ_MAX requests respectively).
+ *
+ * Returns 0 on success, -ENODEV if an endpoint cannot be configured, or
+ * -ENOMEM if a request cannot be allocated; in the latter case every
+ * request allocated so far is released again.
+ */
+static int mtp_create_bulk_endpoints(struct mtp_dev *dev,
+				struct usb_endpoint_descriptor *in_desc,
+				struct usb_endpoint_descriptor *out_desc,
+				struct usb_endpoint_descriptor *intr_desc)
+{
+	struct usb_composite_dev *cdev = dev->cdev;
+	struct usb_request *req;
+	struct usb_ep *ep;
+	int rx_allocated = 0;
+	int i;
+
+	DBG(cdev, "create_bulk_endpoints dev: %p\n", dev);
+
+	ep = usb_ep_autoconfig(cdev->gadget, in_desc);
+	if (!ep) {
+		DBG(cdev, "usb_ep_autoconfig for ep_in failed\n");
+		return -ENODEV;
+	}
+	DBG(cdev, "usb_ep_autoconfig for ep_in got %s\n", ep->name);
+	ep->driver_data = dev;		/* claim the endpoint */
+	dev->ep_in = ep;
+
+	ep = usb_ep_autoconfig(cdev->gadget, out_desc);
+	if (!ep) {
+		DBG(cdev, "usb_ep_autoconfig for ep_out failed\n");
+		return -ENODEV;
+	}
+	DBG(cdev, "usb_ep_autoconfig for mtp ep_out got %s\n", ep->name);
+	ep->driver_data = dev;		/* claim the endpoint */
+	dev->ep_out = ep;
+
+	ep = usb_ep_autoconfig(cdev->gadget, intr_desc);
+	if (!ep) {
+		DBG(cdev, "usb_ep_autoconfig for ep_intr failed\n");
+		return -ENODEV;
+	}
+	DBG(cdev, "usb_ep_autoconfig for mtp ep_intr got %s\n", ep->name);
+	ep->driver_data = dev;		/* claim the endpoint */
+	dev->ep_intr = ep;
+
+	/* now allocate requests for our endpoints */
+	for (i = 0; i < TX_REQ_MAX; i++) {
+		req = mtp_request_new(dev->ep_in, MTP_BULK_BUFFER_SIZE);
+		if (!req)
+			goto fail;
+		req->complete = mtp_complete_in;
+		mtp_req_put(dev, &dev->tx_idle, req);
+	}
+	for (i = 0; i < RX_REQ_MAX; i++) {
+		req = mtp_request_new(dev->ep_out, MTP_BULK_BUFFER_SIZE);
+		if (!req)
+			goto fail;
+		req->complete = mtp_complete_out;
+		dev->rx_req[i] = req;
+		rx_allocated++;
+	}
+	for (i = 0; i < INTR_REQ_MAX; i++) {
+		req = mtp_request_new(dev->ep_intr, INTR_BUFFER_SIZE);
+		if (!req)
+			goto fail;
+		req->complete = mtp_complete_intr;
+		mtp_req_put(dev, &dev->intr_idle, req);
+	}
+
+	return 0;
+
+fail:
+	pr_err("mtp_bind() could not allocate requests\n");
+
+	/* give back whatever was allocated before the failure */
+	while ((req = mtp_req_get(dev, &dev->tx_idle)))
+		mtp_request_free(req, dev->ep_in);
+	while ((req = mtp_req_get(dev, &dev->intr_idle)))
+		mtp_request_free(req, dev->ep_intr);
+	/* only the rx slots filled by this call are known to be valid */
+	while (rx_allocated > 0) {
+		rx_allocated--;
+		mtp_request_free(dev->rx_req[rx_allocated], dev->ep_out);
+		dev->rx_req[rx_allocated] = NULL;
+	}
+
+	return -ENOMEM;
+}
+
+static ssize_t mtp_read(struct file *fp, char __user *buf,
+ size_t count, loff_t *pos)
+{ /*
+ * Read one OUT transfer from the host into buf.
+ * Returns the number of bytes copied, -EINVAL when the aligned length
+ * exceeds MTP_BULK_BUFFER_SIZE, -ECANCELED when the state is
+ * STATE_CANCELED, -EFAULT on a failed copy to user space, -EIO when
+ * queueing fails or the state left STATE_BUSY, or the error from an
+ * interrupted wait.
+ */
+ struct mtp_dev *dev = fp->private_data;
+ struct usb_composite_dev *cdev = dev->cdev;
+ struct usb_request *req;
+ ssize_t r = count;
+ unsigned xfer;
+ int ret = 0;
+ size_t len = 0;
+
+ DBG(cdev, "mtp_read(%zu)\n", count);
+
+ /* we will block until we're online */
+ DBG(cdev, "mtp_read: waiting for online state\n");
+ ret = wait_event_interruptible(dev->read_wq,
+ dev->state != STATE_OFFLINE);
+ if (ret < 0) {
+ r = ret;
+ goto done;
+ }
+ spin_lock_irq(&dev->lock);
+ if (dev->ep_out->desc) { /* len stays 0 when the endpoint has no descriptor */
+ len = usb_ep_align_maybe(cdev->gadget, dev->ep_out, count);
+ if (len > MTP_BULK_BUFFER_SIZE) {
+ spin_unlock_irq(&dev->lock);
+ return -EINVAL;
+ }
+ }
+
+ if (dev->state == STATE_CANCELED) {
+ /* report cancelation to userspace */
+ dev->state = STATE_READY;
+ spin_unlock_irq(&dev->lock);
+ return -ECANCELED;
+ }
+ dev->state = STATE_BUSY;
+ spin_unlock_irq(&dev->lock);
+
+requeue_req:
+ /* queue a request */
+ req = dev->rx_req[0]; /* this path always uses the first rx request */
+ req->length = len;
+ dev->rx_done = 0;
+ ret = usb_ep_queue(dev->ep_out, req, GFP_KERNEL);
+ if (ret < 0) {
+ r = -EIO;
+ goto done;
+ } else {
+ DBG(cdev, "rx %p queue\n", req);
+ }
+
+ /* wait for a request to complete */
+ ret = wait_event_interruptible(dev->read_wq, dev->rx_done);
+ if (ret < 0) {
+ r = ret;
+ usb_ep_dequeue(dev->ep_out, req); /* interrupted: take the request back off the endpoint */
+ goto done;
+ }
+ if (dev->state == STATE_BUSY) {
+ /* If we got a 0-len packet, throw it back and try again. */
+ if (req->actual == 0)
+ goto requeue_req;
+
+ DBG(cdev, "rx %p %d\n", req, req->actual);
+ xfer = (req->actual < count) ? req->actual : count; /* never copy more than the caller asked for */
+ r = xfer;
+ if (copy_to_user(buf, req->buf, xfer))
+ r = -EFAULT;
+ } else
+ r = -EIO;
+
+done:
+ spin_lock_irq(&dev->lock);
+ if (dev->state == STATE_CANCELED)
+ r = -ECANCELED;
+ else if (dev->state != STATE_OFFLINE)
+ dev->state = STATE_READY; /* STATE_OFFLINE is left untouched */
+ spin_unlock_irq(&dev->lock);
+
+ DBG(cdev, "mtp_read returning %zd\n", r);
+ return r;
+}
+
+static ssize_t mtp_write(struct file *fp, const char __user *buf,
+ size_t count, loff_t *pos)
+{ /*
+ * Send count bytes from buf to the host over ep_in, in chunks of at
+ * most MTP_BULK_BUFFER_SIZE, followed by a zero length packet when
+ * count is a multiple of the endpoint's maxpacket.
+ * Returns count when the loop runs to completion, -ECANCELED when
+ * the state is STATE_CANCELED, -ENODEV when it is STATE_OFFLINE on
+ * entry, -EFAULT on a failed copy from user space, -EIO on transfer
+ * errors, or the result of the wait when no idle request was obtained.
+ */
+ struct mtp_dev *dev = fp->private_data;
+ struct usb_composite_dev *cdev = dev->cdev;
+ struct usb_request *req = 0;
+ ssize_t r = count;
+ unsigned xfer;
+ int sendZLP = 0;
+ int ret;
+
+ DBG(cdev, "mtp_write(%zu)\n", count);
+
+ spin_lock_irq(&dev->lock);
+ if (dev->state == STATE_CANCELED) {
+ /* report cancelation to userspace */
+ dev->state = STATE_READY;
+ spin_unlock_irq(&dev->lock);
+ return -ECANCELED;
+ }
+ if (dev->state == STATE_OFFLINE) {
+ spin_unlock_irq(&dev->lock);
+ return -ENODEV;
+ }
+ dev->state = STATE_BUSY;
+ spin_unlock_irq(&dev->lock);
+
+ /* we need to send a zero length packet to signal the end of transfer
+ * if the transfer size is aligned to a packet boundary.
+ */
+ if ((count & (dev->ep_in->maxpacket - 1)) == 0)
+ sendZLP = 1;
+
+ while (count > 0 || sendZLP) {
+ /* so we exit after sending ZLP */
+ if (count == 0)
+ sendZLP = 0;
+
+ if (dev->state != STATE_BUSY) {
+ DBG(cdev, "mtp_write dev->error\n");
+ r = -EIO;
+ break;
+ }
+
+ /* get an idle tx request to use */
+ req = 0;
+ ret = wait_event_interruptible(dev->write_wq,
+ ((req = mtp_req_get(dev, &dev->tx_idle))
+ || dev->state != STATE_BUSY));
+ if (!req) { /* interrupted, or the state left STATE_BUSY */
+ r = ret;
+ break;
+ }
+
+ if (count > MTP_BULK_BUFFER_SIZE)
+ xfer = MTP_BULK_BUFFER_SIZE;
+ else
+ xfer = count;
+ if (xfer && copy_from_user(req->buf, buf, xfer)) {
+ r = -EFAULT;
+ break;
+ }
+
+ req->length = xfer;
+ ret = usb_ep_queue(dev->ep_in, req, GFP_KERNEL);
+ if (ret < 0) {
+ DBG(cdev, "mtp_write: xfer error %d\n", ret);
+ r = -EIO;
+ break;
+ }
+
+ buf += xfer;
+ count -= xfer;
+
+ /* zero this so we don't try to free it on error exit */
+ req = 0;
+ }
+
+ if (req) /* a request taken but not queued goes back to the idle list */
+ mtp_req_put(dev, &dev->tx_idle, req);
+
+ spin_lock_irq(&dev->lock);
+ if (dev->state == STATE_CANCELED)
+ r = -ECANCELED;
+ else if (dev->state != STATE_OFFLINE)
+ dev->state = STATE_READY; /* STATE_OFFLINE is left untouched */
+ spin_unlock_irq(&dev->lock);
+
+ DBG(cdev, "mtp_write returning %zd\n", r);
+ return r;
+}
+
+/*
+ * Work function: read from a local file and write to USB.
+ *
+ * Takes the file, offset and length from dev->xfer_*, optionally
+ * prepends an mtp_data_header to the first buffer, and streams the data
+ * over ep_in in MTP_BULK_BUFFER_SIZE chunks.  The outcome (0 or a
+ * negative errno) is stored in dev->xfer_result.
+ */
+static void send_file_work(struct work_struct *data)
+{
+	struct mtp_dev *dev = container_of(data, struct mtp_dev,
+						send_file_work);
+	struct usb_composite_dev *cdev = dev->cdev;
+	struct usb_request *req = NULL;
+	struct mtp_data_header *header;
+	struct file *filp;
+	loff_t offset;
+	int64_t count;
+	int xfer, ret, hdr_size;
+	int r = 0;
+	int sendZLP = 0;
+
+	/* read our parameters */
+	smp_rmb();
+	filp = dev->xfer_file;
+	offset = dev->xfer_file_offset;
+	count = dev->xfer_file_length;
+
+	DBG(cdev, "send_file_work(%lld %lld)\n", offset, count);
+
+	if (dev->xfer_send_header) {
+		hdr_size = sizeof(struct mtp_data_header);
+		count += hdr_size;
+	} else {
+		hdr_size = 0;
+	}
+
+	/* we need to send a zero length packet to signal the end of transfer
+	 * if the transfer size is aligned to a packet boundary.
+	 */
+	if ((count & (dev->ep_in->maxpacket - 1)) == 0)
+		sendZLP = 1;
+
+	while (count > 0 || sendZLP) {
+		/* so we exit after sending ZLP */
+		if (count == 0)
+			sendZLP = 0;
+
+		/* get an idle tx request to use */
+		req = NULL;
+		ret = wait_event_interruptible(dev->write_wq,
+			(req = mtp_req_get(dev, &dev->tx_idle))
+			|| dev->state != STATE_BUSY);
+		if (dev->state == STATE_CANCELED) {
+			r = -ECANCELED;
+			break;
+		}
+		if (!req) {
+			r = ret;
+			break;
+		}
+
+		if (count > MTP_BULK_BUFFER_SIZE)
+			xfer = MTP_BULK_BUFFER_SIZE;
+		else
+			xfer = count;
+
+		if (hdr_size) {
+			/* prepend MTP data header */
+			header = (struct mtp_data_header *)req->buf;
+			/*
+			 * set file size with header according to
+			 * MTP Specification v1.0
+			 */
+			header->length = (count > MTP_MAX_FILE_SIZE) ?
+				MTP_MAX_FILE_SIZE : __cpu_to_le32(count);
+			header->type = __cpu_to_le16(2); /* data packet */
+			header->command = __cpu_to_le16(dev->xfer_command);
+			header->transaction_id =
+					__cpu_to_le32(dev->xfer_transaction_id);
+		}
+
+		ret = vfs_read(filp, req->buf + hdr_size, xfer - hdr_size,
+								&offset);
+		if (ret < 0) {
+			r = ret;
+			break;
+		}
+		if (ret == 0 && xfer > hdr_size) {
+			/*
+			 * The file ended before the announced length.
+			 * Without this check count would never reach zero
+			 * and the loop would keep queueing empty packets.
+			 */
+			r = -EIO;
+			break;
+		}
+		xfer = ret + hdr_size;
+		hdr_size = 0;	/* the header goes out once only */
+
+		req->length = xfer;
+		ret = usb_ep_queue(dev->ep_in, req, GFP_KERNEL);
+		if (ret < 0) {
+			DBG(cdev, "send_file_work: xfer error %d\n", ret);
+			dev->state = STATE_ERROR;
+			r = -EIO;
+			break;
+		}
+
+		count -= xfer;
+
+		/* zero this so we don't try to free it on error exit */
+		req = NULL;
+	}
+
+	/* a request taken but not queued goes back to the idle list */
+	if (req)
+		mtp_req_put(dev, &dev->tx_idle, req);
+
+	DBG(cdev, "send_file_work returning %d\n", r);
+	/* write the result */
+	dev->xfer_result = r;
+	smp_wmb();
+}
+
+/*
+ * Work function: read from USB and write to a local file.
+ *
+ * Alternates between the rx requests so that the next USB read is
+ * already queued while the previous buffer is written to the file.
+ * The outcome (0 or a negative errno) is stored in dev->xfer_result.
+ */
+static void receive_file_work(struct work_struct *data)
+{
+	struct mtp_dev *dev = container_of(data, struct mtp_dev,
+						receive_file_work);
+	struct usb_composite_dev *cdev = dev->cdev;
+	struct usb_request *read_req = NULL, *write_req = NULL;
+	struct file *filp;
+	loff_t offset;
+	int64_t count;
+	int ret, cur_buf = 0;
+	int r = 0;
+
+	/* read our parameters */
+	smp_rmb();
+	filp = dev->xfer_file;
+	offset = dev->xfer_file_offset;
+	count = dev->xfer_file_length;
+
+	DBG(cdev, "receive_file_work(%lld)\n", count);
+
+	while (count > 0 || write_req) {
+		if (count > 0) {
+			/* queue a request */
+			read_req = dev->rx_req[cur_buf];
+			cur_buf = (cur_buf + 1) % RX_REQ_MAX;
+
+			read_req->length = (count > MTP_BULK_BUFFER_SIZE
+					? MTP_BULK_BUFFER_SIZE : count);
+			dev->rx_done = 0;
+			ret = usb_ep_queue(dev->ep_out, read_req, GFP_KERNEL);
+			if (ret < 0) {
+				r = -EIO;
+				dev->state = STATE_ERROR;
+				break;
+			}
+		}
+
+		if (write_req) {
+			DBG(cdev, "rx %p %d\n", write_req, write_req->actual);
+			ret = vfs_write(filp, write_req->buf, write_req->actual,
+				&offset);
+			DBG(cdev, "vfs_write %d\n", ret);
+			if (ret != write_req->actual) {
+				r = -EIO;
+				dev->state = STATE_ERROR;
+				/*
+				 * A read may have been queued above; do not
+				 * leave it in flight when bailing out, or the
+				 * same request could be queued twice later.
+				 */
+				if (read_req && !dev->rx_done)
+					usb_ep_dequeue(dev->ep_out, read_req);
+				break;
+			}
+			write_req = NULL;
+		}
+
+		if (read_req) {
+			/* wait for our last read to complete */
+			ret = wait_event_interruptible(dev->read_wq,
+				dev->rx_done || dev->state != STATE_BUSY);
+			if (dev->state == STATE_CANCELED) {
+				r = -ECANCELED;
+				if (!dev->rx_done)
+					usb_ep_dequeue(dev->ep_out, read_req);
+				break;
+			}
+			if (read_req->status) {
+				r = read_req->status;
+				break;
+			}
+			/* if xfer_file_length is 0xFFFFFFFF, then we read until
+			 * we get a zero length packet
+			 */
+			if (count != 0xFFFFFFFF)
+				count -= read_req->actual;
+			if (read_req->actual < read_req->length) {
+				/*
+				 * short packet is used to signal EOF for
+				 * sizes > 4 gig
+				 */
+				DBG(cdev, "got short packet\n");
+				count = 0;
+			}
+
+			/* the buffer just filled is written next iteration */
+			write_req = read_req;
+			read_req = NULL;
+		}
+	}
+
+	DBG(cdev, "receive_file_work returning %d\n", r);
+	/* write the result */
+	dev->xfer_result = r;
+	smp_wmb();
+}
+
+/*
+ * Copy an event payload from userspace into an idle interrupt request and
+ * queue it on the interrupt endpoint.
+ *
+ * Returns 0 on success, -EINVAL for a bad length, -ENODEV when offline,
+ * -ETIME when no idle request became available within one second,
+ * -EFAULT when the user buffer cannot be read, or the usb_ep_queue() error.
+ */
+static int mtp_send_event(struct mtp_dev *dev, struct mtp_event *event)
+{
+	struct usb_request *intr_req = NULL;
+	int len = event->length;
+	int rc;
+
+	DBG(dev->cdev, "mtp_send_event(%zu)\n", event->length);
+
+	if (len < 0 || len > INTR_BUFFER_SIZE)
+		return -EINVAL;
+
+	if (dev->state == STATE_OFFLINE)
+		return -ENODEV;
+
+	/* only the request pointer matters; the wait result is not used */
+	rc = wait_event_interruptible_timeout(dev->intr_wq,
+			(intr_req = mtp_req_get(dev, &dev->intr_idle)),
+			msecs_to_jiffies(1000));
+	if (!intr_req)
+		return -ETIME;
+
+	if (copy_from_user(intr_req->buf, (void __user *)event->data, len)) {
+		rc = -EFAULT;
+		goto put_req;
+	}
+
+	intr_req->length = len;
+	rc = usb_ep_queue(dev->ep_intr, intr_req, GFP_KERNEL);
+	if (!rc)
+		return 0;
+
+put_req:
+	/* request was not handed to the controller; return it to the pool */
+	mtp_req_put(dev, &dev->intr_idle, intr_req);
+	return rc;
+}
+
+/*
+ * ioctl handler for the MTP character device.
+ *
+ * MTP_SEND_FILE, MTP_RECEIVE_FILE and MTP_SEND_FILE_WITH_HEADER copy a
+ * struct mtp_file_range from userspace, publish the transfer parameters in
+ * the device, run the matching work item on dev->wq and wait for it to
+ * finish; the work item's dev->xfer_result becomes the return value.
+ * MTP_SEND_EVENT copies a struct mtp_event and forwards it to
+ * mtp_send_event() without touching dev->state.
+ *
+ * Only one ioctl may run at a time; a second caller gets -EBUSY.
+ * Unrecognised codes return -EINVAL.
+ */
+static long mtp_ioctl(struct file *fp, unsigned code, unsigned long value)
+{
+ struct mtp_dev *dev = fp->private_data;
+ struct file *filp = NULL;
+ int ret = -EINVAL;
+
+ if (mtp_lock(&dev->ioctl_excl))
+ return -EBUSY;
+
+ switch (code) {
+ case MTP_SEND_FILE:
+ case MTP_RECEIVE_FILE:
+ case MTP_SEND_FILE_WITH_HEADER:
+ {
+ struct mtp_file_range mfr;
+ struct work_struct *work;
+
+ /* state transitions are made under dev->lock */
+ spin_lock_irq(&dev->lock);
+ if (dev->state == STATE_CANCELED) {
+ /* report cancelation to userspace */
+ dev->state = STATE_READY;
+ spin_unlock_irq(&dev->lock);
+ ret = -ECANCELED;
+ goto out;
+ }
+ if (dev->state == STATE_OFFLINE) {
+ spin_unlock_irq(&dev->lock);
+ ret = -ENODEV;
+ goto out;
+ }
+ dev->state = STATE_BUSY;
+ spin_unlock_irq(&dev->lock);
+
+ if (copy_from_user(&mfr, (void __user *)value, sizeof(mfr))) {
+ ret = -EFAULT;
+ goto fail;
+ }
+ /* hold a reference to the file while we are working with it */
+ filp = fget(mfr.fd);
+ if (!filp) {
+ ret = -EBADF;
+ goto fail;
+ }
+
+ /* write the parameters */
+ dev->xfer_file = filp;
+ dev->xfer_file_offset = mfr.offset;
+ dev->xfer_file_length = mfr.length;
+ smp_wmb();
+
+ /* pick the work item; only the header variant uses command/transaction id */
+ if (code == MTP_SEND_FILE_WITH_HEADER) {
+ work = &dev->send_file_work;
+ dev->xfer_send_header = 1;
+ dev->xfer_command = mfr.command;
+ dev->xfer_transaction_id = mfr.transaction_id;
+ } else if (code == MTP_SEND_FILE) {
+ work = &dev->send_file_work;
+ dev->xfer_send_header = 0;
+ } else {
+ work = &dev->receive_file_work;
+ }
+
+ /* We do the file transfer on a work queue so it will run
+ * in kernel context, which is necessary for vfs_read and
+ * vfs_write to use our buffers in the kernel address space.
+ */
+ queue_work(dev->wq, work);
+ /* wait for operation to complete */
+ flush_workqueue(dev->wq);
+ fput(filp);
+
+ /* read the result */
+ smp_rmb();
+ ret = dev->xfer_result;
+ break;
+ }
+ case MTP_SEND_EVENT:
+ {
+ struct mtp_event event;
+ /* return here so we don't change dev->state below,
+ * which would interfere with bulk transfer state.
+ */
+ if (copy_from_user(&event, (void __user *)value, sizeof(event)))
+ ret = -EFAULT;
+ else
+ ret = mtp_send_event(dev, &event);
+ goto out;
+ }
+ }
+
+ /*
+ * Reached after a transfer (successful or not) and for unrecognised
+ * codes: a cancel overrides the result, otherwise the state returns to
+ * READY unless the device went offline.
+ */
+fail:
+ spin_lock_irq(&dev->lock);
+ if (dev->state == STATE_CANCELED)
+ ret = -ECANCELED;
+ else if (dev->state != STATE_OFFLINE)
+ dev->state = STATE_READY;
+ spin_unlock_irq(&dev->lock);
+out:
+ mtp_unlock(&dev->ioctl_excl);
+ DBG(dev->cdev, "ioctl returning %d\n", ret);
+ return ret;
+}
+
+/*
+ * open() handler: grants exclusive access to the single MTP device and
+ * stores it in the file's private data. Returns -EBUSY if already open.
+ */
+static int mtp_open(struct inode *ip, struct file *fp)
+{
+	struct mtp_dev *mtp;
+
+	printk(KERN_INFO "mtp_open\n");
+
+	mtp = _mtp_dev;
+	if (mtp_lock(&mtp->open_excl))
+		return -EBUSY;
+
+	/* a fresh open clears any stale error, but never hides "offline" */
+	if (mtp->state != STATE_OFFLINE)
+		mtp->state = STATE_READY;
+
+	fp->private_data = mtp;
+	return 0;
+}
+
+/* release() handler: drops the exclusive-open lock taken in mtp_open(). */
+static int mtp_release(struct inode *ip, struct file *fp)
+{
+ printk(KERN_INFO "mtp_release\n");
+
+ mtp_unlock(&_mtp_dev->open_excl);
+ return 0;
+}
+
+/*
+ * file operations for /dev/mtp_usb
+ *
+ * Only unlocked_ioctl is provided; no compat_ioctl handler is registered.
+ */
+static const struct file_operations mtp_fops = {
+ .owner = THIS_MODULE,
+ .read = mtp_read,
+ .write = mtp_write,
+ .unlocked_ioctl = mtp_ioctl,
+ .open = mtp_open,
+ .release = mtp_release,
+};
+
+/* misc character device backing the MTP function; minor is assigned dynamically */
+static struct miscdevice mtp_device = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = mtp_shortname,
+ .fops = &mtp_fops,
+};
+
+/*
+ * Handle ep0 control requests addressed to the MTP function.
+ *
+ * Recognised requests:
+ *  - standard GET_DESCRIPTOR for string MTP_OS_STRING_ID: returns
+ *    mtp_os_string;
+ *  - vendor IN request 1 with wIndex 4 or 5: returns mtp_ext_config_desc;
+ *  - class MTP_REQ_CANCEL: moves a BUSY device to STATE_CANCELED and wakes
+ *    both wait queues;
+ *  - class MTP_REQ_GET_DEVICE_STATUS: reports DEVICE_BUSY while a cancel is
+ *    pending, OK otherwise.
+ *
+ * Returns the number of bytes queued on ep0, or -EOPNOTSUPP when the request
+ * is not handled. A failure of usb_ep_queue() is only logged.
+ */
+static int mtp_ctrlrequest(struct usb_composite_dev *cdev,
+ const struct usb_ctrlrequest *ctrl)
+{
+ struct mtp_dev *dev = _mtp_dev;
+ int value = -EOPNOTSUPP;
+ u16 w_index = le16_to_cpu(ctrl->wIndex);
+ u16 w_value = le16_to_cpu(ctrl->wValue);
+ u16 w_length = le16_to_cpu(ctrl->wLength);
+ unsigned long flags;
+
+ VDBG(cdev, "mtp_ctrlrequest "
+ "%02x.%02x v%04x i%04x l%u\n",
+ ctrl->bRequestType, ctrl->bRequest,
+ w_value, w_index, w_length);
+
+ /* Handle MTP OS string */
+ if (ctrl->bRequestType ==
+ (USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE)
+ && ctrl->bRequest == USB_REQ_GET_DESCRIPTOR
+ && (w_value >> 8) == USB_DT_STRING
+ && (w_value & 0xFF) == MTP_OS_STRING_ID) {
+ /* never return more than the host asked for */
+ value = (w_length < sizeof(mtp_os_string)
+ ? w_length : sizeof(mtp_os_string));
+ memcpy(cdev->req->buf, mtp_os_string, value);
+ } else if ((ctrl->bRequestType & USB_TYPE_MASK) == USB_TYPE_VENDOR) {
+ /* Handle MTP OS descriptor */
+ DBG(cdev, "vendor request: %d index: %d value: %d length: %d\n",
+ ctrl->bRequest, w_index, w_value, w_length);
+
+ if (ctrl->bRequest == 1
+ && (ctrl->bRequestType & USB_DIR_IN)
+ && (w_index == 4 || w_index == 5)) {
+ value = (w_length < sizeof(mtp_ext_config_desc) ?
+ w_length : sizeof(mtp_ext_config_desc));
+ memcpy(cdev->req->buf, &mtp_ext_config_desc, value);
+ }
+ } else if ((ctrl->bRequestType & USB_TYPE_MASK) == USB_TYPE_CLASS) {
+ DBG(cdev, "class request: %d index: %d value: %d length: %d\n",
+ ctrl->bRequest, w_index, w_value, w_length);
+
+ if (ctrl->bRequest == MTP_REQ_CANCEL && w_index == 0
+ && w_value == 0) {
+ DBG(cdev, "MTP_REQ_CANCEL\n");
+
+ /* only a transfer in progress can be canceled */
+ spin_lock_irqsave(&dev->lock, flags);
+ if (dev->state == STATE_BUSY) {
+ dev->state = STATE_CANCELED;
+ wake_up(&dev->read_wq);
+ wake_up(&dev->write_wq);
+ }
+ spin_unlock_irqrestore(&dev->lock, flags);
+
+ /* We need to queue a request to read the remaining
+ * bytes, but we don't actually need to look at
+ * the contents.
+ */
+ value = w_length;
+ } else if (ctrl->bRequest == MTP_REQ_GET_DEVICE_STATUS
+ && w_index == 0 && w_value == 0) {
+ struct mtp_device_status *status = cdev->req->buf;
+
+ status->wLength =
+ __constant_cpu_to_le16(sizeof(*status));
+
+ DBG(cdev, "MTP_REQ_GET_DEVICE_STATUS\n");
+ spin_lock_irqsave(&dev->lock, flags);
+ /* device status is "busy" until we report
+ * the cancelation to userspace
+ */
+ if (dev->state == STATE_CANCELED)
+ status->wCode =
+ __cpu_to_le16(MTP_RESPONSE_DEVICE_BUSY);
+ else
+ status->wCode =
+ __cpu_to_le16(MTP_RESPONSE_OK);
+ spin_unlock_irqrestore(&dev->lock, flags);
+ value = sizeof(*status);
+ }
+ }
+
+ /* respond with data transfer or status phase? */
+ if (value >= 0) {
+ int rc;
+
+ /* request a zero-length packet when the reply is shorter than wLength */
+ cdev->req->zero = value < w_length;
+ cdev->req->length = value;
+ rc = usb_ep_queue(cdev->gadget->ep0, cdev->req, GFP_ATOMIC);
+ if (rc < 0)
+ ERROR(cdev, "%s: response queue error\n", __func__);
+ }
+ return value;
+}
+
+/*
+ * Bind the MTP function to a configuration: allocate the interface id and
+ * interface string id, set up the OS descriptor table when the gadget uses
+ * OS strings, allocate the endpoints and propagate the endpoint addresses
+ * to the high-speed and super-speed descriptors.
+ *
+ * Returns 0 on success or a negative errno. The OS descriptor table
+ * allocated here is released again if endpoint allocation fails, since the
+ * table is otherwise only freed from the unbind callback.
+ */
+static int
+mtp_function_bind(struct usb_configuration *c, struct usb_function *f)
+{
+	struct usb_composite_dev *cdev = c->cdev;
+	struct mtp_dev *dev = func_to_mtp(f);
+	int id;
+	int ret;
+	struct mtp_instance *fi_mtp;
+	bool os_desc_allocated = false;
+
+	dev->cdev = cdev;
+	DBG(cdev, "mtp_function_bind dev: %p\n", dev);
+
+	/* allocate interface ID(s) */
+	id = usb_interface_id(c, f);
+	if (id < 0)
+		return id;
+	mtp_interface_desc.bInterfaceNumber = id;
+
+	if (mtp_string_defs[INTERFACE_STRING_INDEX].id == 0) {
+		ret = usb_string_id(c->cdev);
+		if (ret < 0)
+			return ret;
+		mtp_string_defs[INTERFACE_STRING_INDEX].id = ret;
+		mtp_interface_desc.iInterface = ret;
+	}
+
+	fi_mtp = container_of(f->fi, struct mtp_instance, func_inst);
+
+	if (cdev->use_os_string) {
+		f->os_desc_table = kzalloc(sizeof(*f->os_desc_table),
+					GFP_KERNEL);
+		if (!f->os_desc_table)
+			return -ENOMEM;
+		f->os_desc_n = 1;
+		f->os_desc_table[0].os_desc = &fi_mtp->mtp_os_desc;
+		os_desc_allocated = true;
+	}
+
+	/* allocate endpoints */
+	ret = mtp_create_bulk_endpoints(dev, &mtp_fullspeed_in_desc,
+			&mtp_fullspeed_out_desc, &mtp_intr_desc);
+	if (ret) {
+		/* only undo what this call allocated */
+		if (os_desc_allocated) {
+			kfree(f->os_desc_table);
+			f->os_desc_table = NULL;
+			f->os_desc_n = 0;
+		}
+		return ret;
+	}
+
+	/* support high speed hardware */
+	if (gadget_is_dualspeed(c->cdev->gadget)) {
+		mtp_highspeed_in_desc.bEndpointAddress =
+			mtp_fullspeed_in_desc.bEndpointAddress;
+		mtp_highspeed_out_desc.bEndpointAddress =
+			mtp_fullspeed_out_desc.bEndpointAddress;
+	}
+	/* support super speed hardware */
+	if (gadget_is_superspeed(c->cdev->gadget)) {
+		unsigned max_burst;
+
+		/* Calculate bMaxBurst, we know packet size is 1024 */
+		max_burst = min_t(unsigned, MTP_BULK_BUFFER_SIZE / 1024, 15);
+		mtp_ss_in_desc.bEndpointAddress =
+			mtp_fullspeed_in_desc.bEndpointAddress;
+		mtp_ss_in_comp_desc.bMaxBurst = max_burst;
+		mtp_ss_out_desc.bEndpointAddress =
+			mtp_fullspeed_out_desc.bEndpointAddress;
+		mtp_ss_out_comp_desc.bMaxBurst = max_burst;
+	}
+
+	DBG(cdev, "%s speed %s: IN/%s, OUT/%s\n",
+		gadget_is_superspeed(c->cdev->gadget) ? "super" :
+		(gadget_is_dualspeed(c->cdev->gadget) ? "dual" : "full"),
+		f->name, dev->ep_in->name, dev->ep_out->name);
+	return 0;
+}
+
+/*
+ * Undo mtp_function_bind(): release the interface string id, free every
+ * idle tx/interrupt request and all rx requests, mark the device offline
+ * and free the OS descriptor table.
+ */
+static void
+mtp_function_unbind(struct usb_configuration *c, struct usb_function *f)
+{
+	struct mtp_dev *dev = func_to_mtp(f);
+	struct usb_request *req;
+	int i;
+
+	mtp_string_defs[INTERFACE_STRING_INDEX].id = 0;
+	while ((req = mtp_req_get(dev, &dev->tx_idle)))
+		mtp_request_free(req, dev->ep_in);
+	for (i = 0; i < RX_REQ_MAX; i++)
+		mtp_request_free(dev->rx_req[i], dev->ep_out);
+	while ((req = mtp_req_get(dev, &dev->intr_idle)))
+		mtp_request_free(req, dev->ep_intr);
+	dev->state = STATE_OFFLINE;
+	kfree(f->os_desc_table);
+	/*
+	 * The usb_function outlives this bind/unbind cycle and bind only
+	 * reallocates the table when OS strings are in use, so clear the
+	 * pointer to keep a later unbind from freeing it a second time.
+	 */
+	f->os_desc_table = NULL;
+	f->os_desc_n = 0;
+}
+
+/*
+ * Activate the interface: configure and enable the bulk IN, bulk OUT and
+ * interrupt endpoints for the current speed, then mark the device ready.
+ *
+ * Returns 0 on success or a negative errno. On any failure every endpoint
+ * that was already enabled by this call is disabled again.
+ */
+static int mtp_function_set_alt(struct usb_function *f,
+		unsigned intf, unsigned alt)
+{
+	struct mtp_dev *dev = func_to_mtp(f);
+	struct usb_composite_dev *cdev = f->config->cdev;
+	int ret;
+
+	DBG(cdev, "mtp_function_set_alt intf: %d alt: %d\n", intf, alt);
+
+	ret = config_ep_by_speed(cdev->gadget, f, dev->ep_in);
+	if (ret)
+		return ret;
+
+	ret = usb_ep_enable(dev->ep_in);
+	if (ret)
+		return ret;
+
+	ret = config_ep_by_speed(cdev->gadget, f, dev->ep_out);
+	if (ret)
+		goto err_disable_in;
+
+	ret = usb_ep_enable(dev->ep_out);
+	if (ret)
+		goto err_disable_in;
+
+	ret = config_ep_by_speed(cdev->gadget, f, dev->ep_intr);
+	if (ret)
+		goto err_disable_out;
+
+	ret = usb_ep_enable(dev->ep_intr);
+	if (ret)
+		goto err_disable_out;
+
+	dev->state = STATE_READY;
+
+	/* readers may be blocked waiting for us to go online */
+	wake_up(&dev->read_wq);
+	return 0;
+
+err_disable_out:
+	usb_ep_disable(dev->ep_out);
+err_disable_in:
+	usb_ep_disable(dev->ep_in);
+	return ret;
+}
+
+/*
+ * Deactivate the interface: mark the device offline, disable all three
+ * endpoints and wake anyone sleeping on the read wait queue.
+ */
+static void mtp_function_disable(struct usb_function *f)
+{
+	struct mtp_dev *mtp = func_to_mtp(f);
+	struct usb_composite_dev *cdev = mtp->cdev;
+
+	DBG(cdev, "mtp_function_disable\n");
+
+	/* go offline first so woken waiters observe the new state */
+	mtp->state = STATE_OFFLINE;
+
+	usb_ep_disable(mtp->ep_in);
+	usb_ep_disable(mtp->ep_out);
+	usb_ep_disable(mtp->ep_intr);
+
+	/* readers may be blocked waiting for us to go online */
+	wake_up(&mtp->read_wq);
+
+	VDBG(cdev, "%s disabled\n", mtp->function.name);
+}
+
+/*
+ * Allocate and initialise the global MTP device, create its workqueue and
+ * register the misc character device.
+ *
+ * @fi_mtp: optional function instance; on success its ->dev points at the
+ *          new device, on failure it is reset to NULL so that it never
+ *          refers to freed memory.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int __mtp_setup(struct mtp_instance *fi_mtp)
+{
+	struct mtp_dev *dev;
+	int ret;
+
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+
+	if (fi_mtp != NULL)
+		fi_mtp->dev = dev;
+
+	if (!dev)
+		return -ENOMEM;
+
+	spin_lock_init(&dev->lock);
+	init_waitqueue_head(&dev->read_wq);
+	init_waitqueue_head(&dev->write_wq);
+	init_waitqueue_head(&dev->intr_wq);
+	atomic_set(&dev->open_excl, 0);
+	atomic_set(&dev->ioctl_excl, 0);
+	INIT_LIST_HEAD(&dev->tx_idle);
+	INIT_LIST_HEAD(&dev->intr_idle);
+
+	dev->wq = create_singlethread_workqueue("f_mtp");
+	if (!dev->wq) {
+		ret = -ENOMEM;
+		goto err1;
+	}
+	INIT_WORK(&dev->send_file_work, send_file_work);
+	INIT_WORK(&dev->receive_file_work, receive_file_work);
+
+	/* must be set before misc_register(): mtp_open() reads _mtp_dev */
+	_mtp_dev = dev;
+
+	ret = misc_register(&mtp_device);
+	if (ret)
+		goto err2;
+
+	return 0;
+
+err2:
+	destroy_workqueue(dev->wq);
+err1:
+	_mtp_dev = NULL;
+	/* do not leave the instance pointing at the device we free below */
+	if (fi_mtp != NULL)
+		fi_mtp->dev = NULL;
+	kfree(dev);
+	printk(KERN_ERR "mtp gadget driver failed to initialize\n");
+	return ret;
+}
+
+/* configfs entry point for device setup; forwards to __mtp_setup(). */
+static int mtp_setup_configfs(struct mtp_instance *fi_mtp)
+{
+ return __mtp_setup(fi_mtp);
+}
+
+
+/*
+ * Tear down the global MTP device created by __mtp_setup(), if there is
+ * one: unregister the misc device, destroy the workqueue and free the
+ * device structure.
+ */
+static void mtp_cleanup(void)
+{
+	struct mtp_dev *mtp = _mtp_dev;
+
+	if (mtp) {
+		misc_deregister(&mtp_device);
+		destroy_workqueue(mtp->wq);
+		_mtp_dev = NULL;
+		kfree(mtp);
+	}
+}
+
+/* Map a configfs item back to the mtp_instance that embeds its group. */
+static struct mtp_instance *to_mtp_instance(struct config_item *item)
+{
+ return container_of(to_config_group(item), struct mtp_instance,
+ func_inst.group);
+}
+
+/* configfs release callback: drops the reference on the function instance. */
+static void mtp_attr_release(struct config_item *item)
+{
+ struct mtp_instance *fi_mtp = to_mtp_instance(item);
+
+ usb_put_function_instance(&fi_mtp->func_inst);
+}
+
+/* configfs item operations for an MTP/PTP function instance */
+static struct configfs_item_operations mtp_item_ops = {
+ .release = mtp_attr_release,
+};
+
+/* configfs item type used when the instance's config group is initialised */
+static struct config_item_type mtp_func_type = {
+ .ct_item_ops = &mtp_item_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+
+/* Map a usb_function_instance back to the mtp_instance that embeds it. */
+static struct mtp_instance *to_fi_mtp(struct usb_function_instance *fi)
+{
+ return container_of(fi, struct mtp_instance, func_inst);
+}
+
+/*
+ * Store a private copy of the instance name.
+ *
+ * Returns 0 on success, -ENAMETOOLONG when the name (including its
+ * terminator) exceeds MAX_INST_NAME_LEN, or -ENOMEM.
+ */
+static int mtp_set_inst_name(struct usb_function_instance *fi, const char *name)
+{
+	struct mtp_instance *inst = to_fi_mtp(fi);
+	int size = strlen(name) + 1;	/* bytes needed, terminator included */
+	char *copy;
+
+	if (size > MAX_INST_NAME_LEN)
+		return -ENAMETOOLONG;
+
+	copy = kstrndup(name, size, GFP_KERNEL);
+	if (!copy)
+		return -ENOMEM;
+
+	inst->name = copy;
+	return 0;
+}
+
+/*
+ * free_func_inst callback: releases the instance name, tears down the
+ * global MTP device and frees the instance itself.
+ *
+ * NOTE(review): this callback is installed for instances created with
+ * mtp_config == false as well, and those only borrow _mtp_dev; confirm that
+ * tearing the shared device down from such an instance is intended.
+ */
+static void mtp_free_inst(struct usb_function_instance *fi)
+{
+	struct mtp_instance *inst = to_fi_mtp(fi);
+
+	kfree(inst->name);
+	mtp_cleanup();
+	kfree(inst);
+}
+
+/*
+ * Allocate a function instance shared by the MTP and PTP functions.
+ *
+ * @mtp_config: true to create the underlying MTP device via
+ *              mtp_setup_configfs(); false to reuse the existing global
+ *              device (which may be NULL at this point).
+ *
+ * Returns the embedded usb_function_instance or an ERR_PTR().
+ */
+struct usb_function_instance *alloc_inst_mtp_ptp(bool mtp_config)
+{
+	struct mtp_instance *inst;
+	struct usb_os_desc *os_descs[1];
+	char *os_names[1];
+	int err = 0;
+
+	inst = kzalloc(sizeof(*inst), GFP_KERNEL);
+	if (!inst)
+		return ERR_PTR(-ENOMEM);
+
+	inst->func_inst.set_inst_name = mtp_set_inst_name;
+	inst->func_inst.free_func_inst = mtp_free_inst;
+
+	/* single OS descriptor, exposed under the name "MTP" */
+	inst->mtp_os_desc.ext_compat_id = inst->mtp_ext_compat_id;
+	INIT_LIST_HEAD(&inst->mtp_os_desc.ext_prop);
+	os_descs[0] = &inst->mtp_os_desc;
+	os_names[0] = "MTP";
+
+	if (!mtp_config) {
+		inst->dev = _mtp_dev;
+	} else {
+		err = mtp_setup_configfs(inst);
+		if (err) {
+			kfree(inst);
+			pr_err("Error setting MTP\n");
+			return ERR_PTR(err);
+		}
+	}
+
+	config_group_init_type_name(&inst->func_inst.group,
+					"", &mtp_func_type);
+	usb_os_desc_prepare_interf_dir(&inst->func_inst.group, 1,
+					os_descs, os_names, THIS_MODULE);
+
+	return &inst->func_inst;
+}
+EXPORT_SYMBOL_GPL(alloc_inst_mtp_ptp);
+
+/* Instance allocator for the "mtp" function; requests creation of the MTP device. */
+static struct usb_function_instance *mtp_alloc_inst(void)
+{
+ return alloc_inst_mtp_ptp(true);
+}
+
+/* usb_function setup callback; forwards to mtp_ctrlrequest() with the function's composite device. */
+static int mtp_ctrlreq_configfs(struct usb_function *f,
+ const struct usb_ctrlrequest *ctrl)
+{
+ return mtp_ctrlrequest(f->config->cdev, ctrl);
+}
+
+/* free_func callback; intentionally empty because the usb_function is embedded in the mtp_dev. */
+static void mtp_free(struct usb_function *f)
+{
+ /*NO-OP: no function specific resource allocation in mtp_alloc*/
+}
+
+/*
+ * Fill in the usb_function embedded in the instance's MTP device.
+ *
+ * @fi:         function instance created by alloc_inst_mtp_ptp()
+ * @mtp_config: true selects the MTP descriptor sets, false the PTP ones
+ *
+ * Returns the usb_function, or ERR_PTR(-EINVAL) when the instance has no
+ * MTP device behind it.
+ */
+struct usb_function *function_alloc_mtp_ptp(struct usb_function_instance *fi,
+					bool mtp_config)
+{
+	struct mtp_instance *inst = to_fi_mtp(fi);
+	struct usb_function *func;
+
+	/*
+	 * PTP piggybacks on MTP function so make sure we have
+	 * created MTP function before we associate this PTP
+	 * function with a gadget configuration.
+	 */
+	if (inst->dev == NULL) {
+		pr_err("Error: Create MTP function before linking"
+				" PTP function with a gadget configuration\n");
+		pr_err("\t1: Delete existing PTP function if any\n");
+		pr_err("\t2: Create MTP function\n");
+		pr_err("\t3: Create and symlink PTP function"
+				" with a gadget configuration\n");
+		return ERR_PTR(-EINVAL); /* Invalid Configuration */
+	}
+
+	func = &inst->dev->function;
+
+	func->name = DRIVER_NAME;
+	func->strings = mtp_strings;
+
+	/* MTP and PTP differ only in the descriptor sets they expose */
+	func->fs_descriptors = mtp_config ? fs_mtp_descs : fs_ptp_descs;
+	func->hs_descriptors = mtp_config ? hs_mtp_descs : hs_ptp_descs;
+	func->ss_descriptors = mtp_config ? ss_mtp_descs : ss_ptp_descs;
+
+	func->bind = mtp_function_bind;
+	func->unbind = mtp_function_unbind;
+	func->set_alt = mtp_function_set_alt;
+	func->disable = mtp_function_disable;
+	func->setup = mtp_ctrlreq_configfs;
+	func->free_func = mtp_free;
+
+	return func;
+}
+EXPORT_SYMBOL_GPL(function_alloc_mtp_ptp);
+
+/* Function allocator for the "mtp" function; selects the MTP descriptor sets. */
+static struct usb_function *mtp_alloc(struct usb_function_instance *fi)
+{
+ return function_alloc_mtp_ptp(fi, true);
+}
+
+DECLARE_USB_FUNCTION_INIT(mtp, mtp_alloc_inst, mtp_alloc);
+MODULE_LICENSE("GPL");
diff --git a/drivers/usb/gadget/function/f_mtp.h b/drivers/usb/gadget/function/f_mtp.h
new file mode 100644
index 000000000000..7adb1ff08eff
--- /dev/null
+++ b/drivers/usb/gadget/function/f_mtp.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright (C) 2014 Google, Inc.
+ * Author: Badhri Jagan Sridharan <badhri@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+extern struct usb_function_instance *alloc_inst_mtp_ptp(bool mtp_config);
+extern struct usb_function *function_alloc_mtp_ptp(
+ struct usb_function_instance *fi, bool mtp_config);
diff --git a/drivers/usb/gadget/function/f_ptp.c b/drivers/usb/gadget/function/f_ptp.c
new file mode 100644
index 000000000000..da3e4d53e085
--- /dev/null
+++ b/drivers/usb/gadget/function/f_ptp.c
@@ -0,0 +1,38 @@
+/*
+ * Gadget Function Driver for PTP
+ *
+ * Copyright (C) 2014 Google, Inc.
+ * Author: Badhri Jagan Sridharan <badhri@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+
+#include <linux/configfs.h>
+#include <linux/usb/composite.h>
+
+#include "f_mtp.h"
+
+/* Instance allocator for the "ptp" function; asks the shared allocator not to create an MTP device. */
+static struct usb_function_instance *ptp_alloc_inst(void)
+{
+ return alloc_inst_mtp_ptp(false);
+}
+
+/* Function allocator for the "ptp" function; passes mtp_config == false to the shared allocator. */
+static struct usb_function *ptp_alloc(struct usb_function_instance *fi)
+{
+ return function_alloc_mtp_ptp(fi, false);
+}
+
+DECLARE_USB_FUNCTION_INIT(ptp, ptp_alloc_inst, ptp_alloc);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Badhri Jagan Sridharan");
diff --git a/drivers/usb/phy/Kconfig b/drivers/usb/phy/Kconfig
index 125cea1c3c8d..e47bb366877d 100644
--- a/drivers/usb/phy/Kconfig
+++ b/drivers/usb/phy/Kconfig
@@ -6,6 +6,14 @@ menu "USB Physical Layer drivers"
config USB_PHY
def_bool n
+config USB_OTG_WAKELOCK
+ bool "Hold a wakelock when USB connected"
+ depends on PM_WAKELOCKS
+ select USB_OTG_UTILS
+ help
+ Select this to automatically hold a wakelock when USB is
+ connected, preventing suspend.
+
#
# USB Transceiver Drivers
#
@@ -209,4 +217,13 @@ config USB_ULPI_VIEWPORT
Provides read/write operations to the ULPI phy register set for
controllers with a viewport register (e.g. Chipidea/ARC controllers).
+config DUAL_ROLE_USB_INTF
+ bool "Generic DUAL ROLE sysfs interface"
+ depends on SYSFS && USB_PHY
+ help
+ A generic sysfs interface to track and change the state of
+ dual role USB PHYs. USB PHY drivers can register with
+ this interface to expose their capabilities to userspace,
+ thereby allowing userspace to change the port mode.
+
endmenu
diff --git a/drivers/usb/phy/Makefile b/drivers/usb/phy/Makefile
index b433e5d89be4..f65ac3e1fc07 100644
--- a/drivers/usb/phy/Makefile
+++ b/drivers/usb/phy/Makefile
@@ -3,6 +3,8 @@
#
obj-$(CONFIG_USB_PHY) += phy.o
obj-$(CONFIG_OF) += of.o
+obj-$(CONFIG_USB_OTG_WAKELOCK) += otg-wakelock.o
+obj-$(CONFIG_DUAL_ROLE_USB_INTF) += class-dual-role.o
# transceiver drivers, keep the list sorted
diff --git a/drivers/usb/phy/class-dual-role.c b/drivers/usb/phy/class-dual-role.c
new file mode 100644
index 000000000000..51fcb545a9d5
--- /dev/null
+++ b/drivers/usb/phy/class-dual-role.c
@@ -0,0 +1,529 @@
+/*
+ * class-dual-role.c
+ *
+ * Copyright (C) 2015 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/ctype.h>
+#include <linux/device.h>
+#include <linux/usb/class-dual-role.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/types.h>
+
+#define DUAL_ROLE_NOTIFICATION_TIMEOUT 2000
+
+static ssize_t dual_role_store_property(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count);
+static ssize_t dual_role_show_property(struct device *dev,
+ struct device_attribute *attr,
+ char *buf);
+
+/*
+ * Build a device_attribute named after _name that is served by the common
+ * show/store handlers. No mode is set here.
+ */
+#define DUAL_ROLE_ATTR(_name) \
+{ \
+ .attr = { .name = #_name }, \
+ .show = dual_role_show_property, \
+ .store = dual_role_store_property, \
+}
+
+/*
+ * The position of an entry in this array is used as the property number:
+ * the handlers compute "attr - dual_role_attrs" and compare the result with
+ * the DUAL_ROLE_PROP_* constants, so the order here must not change.
+ */
+static struct device_attribute dual_role_attrs[] = {
+ DUAL_ROLE_ATTR(supported_modes),
+ DUAL_ROLE_ATTR(mode),
+ DUAL_ROLE_ATTR(power_role),
+ DUAL_ROLE_ATTR(data_role),
+ DUAL_ROLE_ATTR(powers_vconn),
+};
+
+struct class *dual_role_class;
+EXPORT_SYMBOL_GPL(dual_role_class);
+
+static struct device_type dual_role_dev_type;
+
+/*
+ * Duplicate @str into freshly allocated memory, converting every character
+ * to upper case when @to_upper is true and to lower case otherwise.
+ *
+ * Returns the new string (to be released with kfree()) or NULL when the
+ * allocation fails.
+ */
+static char *kstrdupcase(const char *str, gfp_t gfp, bool to_upper)
+{
+	size_t len = strlen(str);
+	size_t i;
+	char *copy;
+
+	copy = kmalloc(len + 1, gfp);
+	if (!copy)
+		return NULL;
+
+	for (i = 0; i < len; i++)
+		copy[i] = to_upper ? toupper(str[i]) : tolower(str[i]);
+	copy[len] = 0;
+
+	return copy;
+}
+
+/* Work handler: emits a KOBJ_CHANGE uevent for the instance's device. */
+static void dual_role_changed_work(struct work_struct *work)
+{
+ struct dual_role_phy_instance *dual_role =
+ container_of(work, struct dual_role_phy_instance,
+ changed_work);
+
+ dev_dbg(&dual_role->dev, "%s\n", __func__);
+ kobject_uevent(&dual_role->dev.kobj, KOBJ_CHANGE);
+}
+
+/*
+ * Report that the state of @dual_role changed: registers a wakeup event
+ * with a DUAL_ROLE_NOTIFICATION_TIMEOUT timeout and schedules the work item
+ * that sends the change uevent.
+ */
+void dual_role_instance_changed(struct dual_role_phy_instance *dual_role)
+{
+ dev_dbg(&dual_role->dev, "%s\n", __func__);
+ pm_wakeup_event(&dual_role->dev, DUAL_ROLE_NOTIFICATION_TIMEOUT);
+ schedule_work(&dual_role->changed_work);
+}
+EXPORT_SYMBOL_GPL(dual_role_instance_changed);
+
+/*
+ * Read property @prop of @dual_role into @val through the driver's
+ * get_property callback.
+ *
+ * Returns the callback's result, or -ENODEV when the driver did not supply
+ * a get_property callback (the same convention the set and is-writeable
+ * helpers use for a missing callback).
+ */
+int dual_role_get_property(struct dual_role_phy_instance *dual_role,
+			   enum dual_role_property prop,
+			   unsigned int *val)
+{
+	if (!dual_role->desc->get_property)
+		return -ENODEV;
+
+	return dual_role->desc->get_property(dual_role, prop, val);
+}
+EXPORT_SYMBOL_GPL(dual_role_get_property);
+
+/*
+ * Set property @prop of @dual_role to *@val through the driver's
+ * set_property callback. Returns -ENODEV when no callback is provided,
+ * otherwise the callback's result.
+ */
+int dual_role_set_property(struct dual_role_phy_instance *dual_role,
+ enum dual_role_property prop,
+ const unsigned int *val)
+{
+ if (!dual_role->desc->set_property)
+ return -ENODEV;
+
+ return dual_role->desc->set_property(dual_role, prop, val);
+}
+EXPORT_SYMBOL_GPL(dual_role_set_property);
+
+/*
+ * Ask the driver whether property @prop may be written. Returns -ENODEV
+ * when the driver has no property_is_writeable callback, otherwise the
+ * callback's result.
+ */
+int dual_role_property_is_writeable(struct dual_role_phy_instance *dual_role,
+ enum dual_role_property prop)
+{
+ if (!dual_role->desc->property_is_writeable)
+ return -ENODEV;
+
+ return dual_role->desc->property_is_writeable(dual_role, prop);
+}
+EXPORT_SYMBOL_GPL(dual_role_property_is_writeable);
+
+/* Device release callback: frees the dual_role_phy_instance that embeds @dev. */
+static void dual_role_dev_release(struct device *dev)
+{
+ struct dual_role_phy_instance *dual_role =
+ container_of(dev, struct dual_role_phy_instance, dev);
+ pr_debug("device: '%s': %s\n", dev_name(dev), __func__);
+ kfree(dual_role);
+}
+
+/*
+ * Allocate a dual_role_phy_instance, initialise its embedded device as a
+ * member of dual_role_class and add it to the system.
+ *
+ * @parent: parent device for the new class device
+ * @desc:   driver-supplied description; desc->name becomes the device name
+ *
+ * Returns the new instance or an ERR_PTR(). Once device_initialize() has
+ * run, the instance is owned by the device reference count and is freed by
+ * dual_role_dev_release(); the error paths therefore only drop the
+ * reference and must not free the memory themselves.
+ */
+static struct dual_role_phy_instance *__must_check
+__dual_role_register(struct device *parent,
+		     const struct dual_role_phy_desc *desc)
+{
+	struct device *dev;
+	struct dual_role_phy_instance *dual_role;
+	int rc;
+
+	dual_role = kzalloc(sizeof(*dual_role), GFP_KERNEL);
+	if (!dual_role)
+		return ERR_PTR(-ENOMEM);
+
+	dev = &dual_role->dev;
+
+	device_initialize(dev);
+
+	dev->class = dual_role_class;
+	dev->type = &dual_role_dev_type;
+	dev->parent = parent;
+	dev->release = dual_role_dev_release;
+	dev_set_drvdata(dev, dual_role);
+	dual_role->desc = desc;
+
+	rc = dev_set_name(dev, "%s", desc->name);
+	if (rc)
+		goto dev_set_name_failed;
+
+	INIT_WORK(&dual_role->changed_work, dual_role_changed_work);
+
+	rc = device_init_wakeup(dev, true);
+	if (rc)
+		goto wakeup_init_failed;
+
+	rc = device_add(dev);
+	if (rc)
+		goto device_add_failed;
+
+	/* announce the initial state */
+	dual_role_instance_changed(dual_role);
+
+	return dual_role;
+
+device_add_failed:
+	device_init_wakeup(dev, false);
+wakeup_init_failed:
+dev_set_name_failed:
+	/*
+	 * Dropping the last reference invokes dual_role_dev_release(),
+	 * which frees dual_role; freeing it here as well would be a
+	 * double free.
+	 */
+	put_device(dev);
+
+	return ERR_PTR(rc);
+}
+
+/*
+ * Undo __dual_role_register(): wait for any pending change work, disable
+ * wakeup and unregister the device.
+ */
+static void dual_role_instance_unregister(struct dual_role_phy_instance
+ *dual_role)
+{
+ cancel_work_sync(&dual_role->changed_work);
+ device_init_wakeup(&dual_role->dev, false);
+ device_unregister(&dual_role->dev);
+}
+
+/* devres release callback: @res holds a pointer to the instance to unregister. */
+static void devm_dual_role_release(struct device *dev, void *res)
+{
+ struct dual_role_phy_instance **dual_role = res;
+
+ dual_role_instance_unregister(*dual_role);
+}
+
+/*
+ * Managed variant of instance registration: the instance is unregistered
+ * through devm_dual_role_release() when the devres entry attached to
+ * @parent is released.
+ *
+ * Returns the new instance or an ERR_PTR().
+ */
+struct dual_role_phy_instance *__must_check
+devm_dual_role_instance_register(struct device *parent,
+				 const struct dual_role_phy_desc *desc)
+{
+	struct dual_role_phy_instance **slot;
+	struct dual_role_phy_instance *inst;
+
+	slot = devres_alloc(devm_dual_role_release, sizeof(*slot), GFP_KERNEL);
+	if (!slot)
+		return ERR_PTR(-ENOMEM);
+
+	inst = __dual_role_register(parent, desc);
+	if (IS_ERR(inst)) {
+		/* nothing to manage; drop the unused devres entry */
+		devres_free(slot);
+		return inst;
+	}
+
+	*slot = inst;
+	devres_add(parent, slot);
+	return inst;
+}
+EXPORT_SYMBOL_GPL(devm_dual_role_instance_register);
+
+/* devres match callback: true when the entry @res stores the instance @data. */
+static int devm_dual_role_match(struct device *dev, void *res, void *data)
+{
+ struct dual_role_phy_instance **r = res;
+
+ if (WARN_ON(!r || !*r))
+ return 0;
+
+ return *r == data;
+}
+
+/*
+ * Explicitly release the devres entry on @dev that holds @dual_role, which
+ * unregisters the instance. Warns if devres_release() reports failure.
+ */
+void devm_dual_role_instance_unregister(struct device *dev,
+ struct dual_role_phy_instance
+ *dual_role)
+{
+ int rc;
+
+ rc = devres_release(dev, devm_dual_role_release,
+ devm_dual_role_match, dual_role);
+ WARN_ON(rc);
+}
+EXPORT_SYMBOL_GPL(devm_dual_role_instance_unregister);
+
+/*
+ * Return the instance's drv_data pointer.
+ * NOTE(review): nothing in this file assigns drv_data; presumably the
+ * registering driver sets it - confirm against callers.
+ */
+void *dual_role_get_drvdata(struct dual_role_phy_instance *dual_role)
+{
+ return dual_role->drv_data;
+}
+EXPORT_SYMBOL_GPL(dual_role_get_drvdata);
+
+/***************** Device attribute functions **************************/
+
+/*
+ * Text tables for the sysfs attributes. Each table is indexed by the
+ * numeric property value when showing, and searched for a matching prefix
+ * when storing, so the entry order defines the value <-> text mapping.
+ */
+
+/* port type */
+static char *supported_modes_text[] = {
+ "ufp dfp", "dfp", "ufp"
+};
+
+/* current mode */
+static char *mode_text[] = {
+ "ufp", "dfp", "none"
+};
+
+/* Power role */
+static char *pr_text[] = {
+ "source", "sink", "none"
+};
+
+/* Data role */
+static char *dr_text[] = {
+ "host", "device", "none"
+};
+
+/* Vconn supply */
+static char *vconn_supply_text[] = {
+ "n", "y"
+};
+
+/*
+ * sysfs show handler shared by all dual-role attributes.
+ *
+ * The property is identified by the attribute's index in dual_role_attrs.
+ * supported_modes comes straight from the descriptor; every other property
+ * is fetched with dual_role_get_property(). The numeric value is then
+ * translated through the matching text table.
+ *
+ * Returns the number of characters written, the error from the property
+ * getter, or -EIO for an unknown property or out-of-range value.
+ */
+static ssize_t dual_role_show_property(struct device *dev,
+				       struct device_attribute *attr, char *buf)
+{
+	struct dual_role_phy_instance *dual_role = dev_get_drvdata(dev);
+	const ptrdiff_t off = attr - dual_role_attrs;
+	unsigned int value, limit;
+	char **names;
+	ssize_t ret = 0;
+
+	if (off != DUAL_ROLE_PROP_SUPPORTED_MODES) {
+		ret = dual_role_get_property(dual_role, off, &value);
+		if (ret < 0) {
+			if (ret == -ENODATA)
+				dev_dbg(dev,
+					"driver has no data for `%s' property\n",
+					attr->attr.name);
+			else if (ret != -ENODEV)
+				dev_err(dev,
+					"driver failed to report `%s' property: %zd\n",
+					attr->attr.name, ret);
+			return ret;
+		}
+	} else {
+		value = dual_role->desc->supported_modes;
+	}
+
+	/* pick the text table and its size for this property */
+	switch (off) {
+	case DUAL_ROLE_PROP_SUPPORTED_MODES:
+		BUILD_BUG_ON(DUAL_ROLE_PROP_SUPPORTED_MODES_TOTAL !=
+			     ARRAY_SIZE(supported_modes_text));
+		names = supported_modes_text;
+		limit = DUAL_ROLE_PROP_SUPPORTED_MODES_TOTAL;
+		break;
+	case DUAL_ROLE_PROP_MODE:
+		BUILD_BUG_ON(DUAL_ROLE_PROP_MODE_TOTAL !=
+			     ARRAY_SIZE(mode_text));
+		names = mode_text;
+		limit = DUAL_ROLE_PROP_MODE_TOTAL;
+		break;
+	case DUAL_ROLE_PROP_PR:
+		BUILD_BUG_ON(DUAL_ROLE_PROP_PR_TOTAL != ARRAY_SIZE(pr_text));
+		names = pr_text;
+		limit = DUAL_ROLE_PROP_PR_TOTAL;
+		break;
+	case DUAL_ROLE_PROP_DR:
+		BUILD_BUG_ON(DUAL_ROLE_PROP_DR_TOTAL != ARRAY_SIZE(dr_text));
+		names = dr_text;
+		limit = DUAL_ROLE_PROP_DR_TOTAL;
+		break;
+	case DUAL_ROLE_PROP_VCONN_SUPPLY:
+		BUILD_BUG_ON(DUAL_ROLE_PROP_VCONN_SUPPLY_TOTAL !=
+			     ARRAY_SIZE(vconn_supply_text));
+		names = vconn_supply_text;
+		limit = DUAL_ROLE_PROP_VCONN_SUPPLY_TOTAL;
+		break;
+	default:
+		return -EIO;
+	}
+
+	if (value >= limit)
+		return -EIO;
+
+	return snprintf(buf, PAGE_SIZE, "%s\n", names[value]);
+}
+
+/*
+ * sysfs store handler shared by all dual-role attributes.
+ *
+ * The input is lower-cased and, for mode/power_role/data_role, matched
+ * against the entries of the property's text table (an entry matches when
+ * the input starts with it, which tolerates a trailing newline). For
+ * powers_vconn the input is parsed as a boolean. The resulting value is
+ * handed to the driver through dual_role_set_property().
+ *
+ * Returns @count on success, -ENOMEM when the input cannot be duplicated,
+ * -EINVAL for a read-only/unknown property or unparsable boolean,
+ * -ENOTSUPP when the text matches no table entry, -ENODEV when the driver
+ * has no set_property callback, or the driver's error.
+ */
+static ssize_t dual_role_store_property(struct device *dev,
+					struct device_attribute *attr,
+					const char *buf, size_t count)
+{
+	ssize_t ret;
+	struct dual_role_phy_instance *dual_role = dev_get_drvdata(dev);
+	const ptrdiff_t off = attr - dual_role_attrs;
+	unsigned int value;
+	int total, i;
+	char *dup_buf, **text_array;
+	bool result = false;
+
+	dup_buf = kstrdupcase(buf, GFP_KERNEL, false);
+	if (!dup_buf)
+		return -ENOMEM;
+
+	switch (off) {
+	case DUAL_ROLE_PROP_MODE:
+		total = DUAL_ROLE_PROP_MODE_TOTAL;
+		text_array = mode_text;
+		break;
+	case DUAL_ROLE_PROP_PR:
+		total = DUAL_ROLE_PROP_PR_TOTAL;
+		text_array = pr_text;
+		break;
+	case DUAL_ROLE_PROP_DR:
+		total = DUAL_ROLE_PROP_DR_TOTAL;
+		text_array = dr_text;
+		break;
+	case DUAL_ROLE_PROP_VCONN_SUPPLY:
+		ret = strtobool(dup_buf, &result);
+		value = result;
+		if (!ret)
+			goto setprop;
+		/* fall through - an unparsable boolean is rejected below */
+	default:
+		ret = -EINVAL;
+		goto error;
+	}
+
+	for (i = 0; i < total; i++) {
+		if (!strncmp(text_array[i], dup_buf, strlen(text_array[i])))
+			break;
+	}
+	if (i == total) {
+		ret = -ENOTSUPP;
+		goto error;
+	}
+	value = i;
+
+setprop:
+	/* the helper copes with drivers that have no set_property callback */
+	ret = dual_role_set_property(dual_role, off, &value);
+
+error:
+	kfree(dup_buf);
+
+	if (ret < 0)
+		return ret;
+
+	return count;
+}
+
+/*
+ * Decide the sysfs mode of attribute number @attrno.
+ *
+ * supported_modes is always world-readable. Any other attribute is exposed
+ * only if the driver lists it in desc->properties; it is additionally
+ * owner-writable when the driver reports the property as writeable.
+ * Attributes the driver does not list are hidden (mode 0).
+ */
+static umode_t dual_role_attr_is_visible(struct kobject *kobj,
+					 struct attribute *attr, int attrno)
+{
+	struct device *dev = container_of(kobj, struct device, kobj);
+	struct dual_role_phy_instance *dual_role = dev_get_drvdata(dev);
+	umode_t mode = S_IRUSR | S_IRGRP | S_IROTH;
+	int i;
+
+	if (attrno == DUAL_ROLE_PROP_SUPPORTED_MODES)
+		return mode;
+
+	for (i = 0; i < dual_role->desc->num_properties; i++) {
+		int property = dual_role->desc->properties[i];
+
+		if (property != attrno)
+			continue;
+
+		if (dual_role->desc->property_is_writeable &&
+		    dual_role_property_is_writeable(dual_role, property) > 0)
+			mode |= S_IWUSR;
+
+		return mode;
+	}
+
+	return 0;
+}
+
+/* NULL-terminated attribute pointer list; filled in by dual_role_init_attrs() */
+static struct attribute *__dual_role_attrs[ARRAY_SIZE(dual_role_attrs) + 1];
+
+/* single attribute group whose per-attribute visibility is decided at runtime */
+static struct attribute_group dual_role_attr_group = {
+ .attrs = __dual_role_attrs,
+ .is_visible = dual_role_attr_is_visible,
+};
+
+static const struct attribute_group *dual_role_attr_groups[] = {
+ &dual_role_attr_group,
+ NULL,
+};
+
+/*
+ * Attach the dual-role attribute groups to @dev_type and populate the
+ * attribute pointer list from dual_role_attrs. The final slot of the list
+ * is left untouched and so stays NULL as the terminator.
+ */
+void dual_role_init_attrs(struct device_type *dev_type)
+{
+ int i;
+
+ dev_type->groups = dual_role_attr_groups;
+
+ for (i = 0; i < ARRAY_SIZE(dual_role_attrs); i++)
+ __dual_role_attrs[i] = &dual_role_attrs[i].attr;
+}
+
+/*
+ * Class uevent callback: adds DUAL_ROLE_NAME plus one
+ * DUAL_ROLE_<ATTRIBUTE>=<value> variable for every property the driver
+ * lists in desc->properties.
+ *
+ * Properties for which the driver reports -ENODEV or -ENODATA are skipped.
+ * Returns 0 on success or a negative errno; an allocation failure while
+ * building a variable name aborts the event with -ENOMEM.
+ */
+int dual_role_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+	struct dual_role_phy_instance *dual_role = dev_get_drvdata(dev);
+	int ret = 0, j;
+	char *prop_buf;
+	char *attrname;
+
+	dev_dbg(dev, "uevent\n");
+
+	if (!dual_role || !dual_role->desc) {
+		dev_dbg(dev, "No dual_role phy yet\n");
+		return ret;
+	}
+
+	dev_dbg(dev, "DUAL_ROLE_NAME=%s\n", dual_role->desc->name);
+
+	ret = add_uevent_var(env, "DUAL_ROLE_NAME=%s", dual_role->desc->name);
+	if (ret)
+		return ret;
+
+	/* scratch page that the show handler formats each value into */
+	prop_buf = (char *)get_zeroed_page(GFP_KERNEL);
+	if (!prop_buf)
+		return -ENOMEM;
+
+	for (j = 0; j < dual_role->desc->num_properties; j++) {
+		struct device_attribute *attr;
+		char *line;
+
+		attr = &dual_role_attrs[dual_role->desc->properties[j]];
+
+		ret = dual_role_show_property(dev, attr, prop_buf);
+		if (ret == -ENODEV || ret == -ENODATA) {
+			ret = 0;
+			continue;
+		}
+
+		if (ret < 0)
+			goto out;
+
+		/* the show handler appends a newline; strip it */
+		line = strnchr(prop_buf, PAGE_SIZE, '\n');
+		if (line)
+			*line = 0;
+
+		attrname = kstrdupcase(attr->attr.name, GFP_KERNEL, true);
+		if (!attrname) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		dev_dbg(dev, "prop %s=%s\n", attrname, prop_buf);
+
+		ret = add_uevent_var(env, "DUAL_ROLE_%s=%s", attrname,
+				     prop_buf);
+		kfree(attrname);
+		if (ret)
+			goto out;
+	}
+
+out:
+	free_page((unsigned long)prop_buf);
+
+	return ret;
+}
+
+/******************* Module Init ***********************************/
+
+/*
+ * Create the "dual_role_usb" class, install the uevent callback and set up
+ * the attribute groups of the device type. Returns 0 or the class_create()
+ * error.
+ */
+static int __init dual_role_class_init(void)
+{
+ dual_role_class = class_create(THIS_MODULE, "dual_role_usb");
+
+ if (IS_ERR(dual_role_class))
+ return PTR_ERR(dual_role_class);
+
+ dual_role_class->dev_uevent = dual_role_uevent;
+ dual_role_init_attrs(&dual_role_dev_type);
+
+ return 0;
+}
+
+/* Destroy the class created by dual_role_class_init(). */
+static void __exit dual_role_class_exit(void)
+{
+ class_destroy(dual_role_class);
+}
+
+subsys_initcall(dual_role_class_init);
+module_exit(dual_role_class_exit);
diff --git a/drivers/usb/phy/otg-wakelock.c b/drivers/usb/phy/otg-wakelock.c
new file mode 100644
index 000000000000..ecd741027f53
--- /dev/null
+++ b/drivers/usb/phy/otg-wakelock.c
@@ -0,0 +1,170 @@
+/*
+ * otg-wakelock.c
+ *
+ * Copyright (C) 2011 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/spinlock.h>
+#include <linux/usb/otg.h>
+
+#define TEMPORARY_HOLD_TIME 2000
+
+static bool enabled = true;
+static struct usb_phy *otgwl_xceiv;
+static struct notifier_block otgwl_nb;
+
+/*
+ * otgwl_spinlock is held while the VBUS lock is grabbed or dropped and the
+ * held field is updated to match.
+ */
+
+static DEFINE_SPINLOCK(otgwl_spinlock);
+
+/*
+ * Only one lock, but since these 3 fields are associated with each other...
+ */
+
+struct otgwl_lock {
+ /* Name handed to wakeup_source_init(); formatted in otg_wakelock_init(). */
+ char name[40];
+ /* Wakeup source that is held, timed or relaxed by the helpers below. */
+ struct wakeup_source wakesrc;
+ /* True while wakesrc is held through __pm_stay_awake(). */
+ bool held;
+};
+
+/*
+ * VBUS present lock. Also used as a timed lock on charger
+ * connect/disconnect and USB host disconnect, to allow the system
+ * to react to the change in power.
+ */
+
+static struct otgwl_lock vbus_lock;
+
+/* Take the wakeup source unless this lock is already marked as held. */
+static void otgwl_hold(struct otgwl_lock *lock)
+{
+	if (lock->held)
+		return;
+
+	__pm_stay_awake(&lock->wakesrc);
+	lock->held = true;
+}
+
+/*
+ * Report a wakeup event with TEMPORARY_HOLD_TIME as its timeout and mark the
+ * lock as not held, so a later otgwl_hold() calls __pm_stay_awake() again.
+ */
+static void otgwl_temporary_hold(struct otgwl_lock *lock)
+{
+ __pm_wakeup_event(&lock->wakesrc, TEMPORARY_HOLD_TIME);
+ lock->held = false;
+}
+
+/* Relax the wakeup source if this lock is currently marked as held. */
+static void otgwl_drop(struct otgwl_lock *lock)
+{
+	if (!lock->held)
+		return;
+
+	__pm_relax(&lock->wakesrc);
+	lock->held = false;
+}
+
+/*
+ * Apply a USB PHY event to the VBUS lock while holding otgwl_spinlock.
+ *
+ * With the "enabled" parameter off the lock is simply dropped. Otherwise
+ * VBUS/ENUMERATED hold the lock, NONE/ID/CHARGER convert it to a temporary
+ * hold, and any other event leaves it untouched.
+ */
+static void otgwl_handle_event(unsigned long event)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&otgwl_spinlock, flags);
+
+	if (!enabled) {
+		otgwl_drop(&vbus_lock);
+	} else if (event == USB_EVENT_VBUS ||
+		   event == USB_EVENT_ENUMERATED) {
+		otgwl_hold(&vbus_lock);
+	} else if (event == USB_EVENT_NONE ||
+		   event == USB_EVENT_ID ||
+		   event == USB_EVENT_CHARGER) {
+		otgwl_temporary_hold(&vbus_lock);
+	}
+
+	spin_unlock_irqrestore(&otgwl_spinlock, flags);
+}
+
+/*
+ * Notifier callback installed in otgwl_nb by otg_wakelock_init(): forwards
+ * the event code to otgwl_handle_event() and always returns NOTIFY_OK. The
+ * notifier block and data pointer are not used.
+ */
+static int otgwl_otg_notifications(struct notifier_block *nb,
+ unsigned long event, void *unused)
+{
+ otgwl_handle_event(event);
+ return NOTIFY_OK;
+}
+
+/*
+ * Setter for the "enabled" module parameter: store the new boolean and, if a
+ * transceiver has been found, re-evaluate the lock against its last event.
+ *
+ * Returns 0 on success or the error from param_set_bool().
+ */
+static int set_enabled(const char *val, const struct kernel_param *kp)
+{
+	int rv;
+
+	rv = param_set_bool(val, kp);
+	if (rv == 0 && otgwl_xceiv)
+		otgwl_handle_event(otgwl_xceiv->last_event);
+
+	return rv;
+}
+
+/*
+ * Parameter ops for "enabled": custom setter so a change takes effect
+ * immediately, stock boolean getter. Never modified, hence const.
+ */
+static const struct kernel_param_ops enabled_param_ops = {
+	.set = set_enabled,
+	.get = param_get_bool,
+};
+
+module_param_cb(enabled, &enabled_param_ops, &enabled, 0644);
+MODULE_PARM_DESC(enabled, "enable wakelock when VBUS present");
+
+/*
+ * otg_wakelock_init() - attach the VBUS wakeup source to the USB2 PHY
+ *
+ * Looks up the USB2 transceiver, creates a wakeup source named
+ * "vbus-<phy device name>", registers for PHY notifications and finally
+ * applies the PHY's last reported event so the lock matches the current
+ * state.
+ *
+ * Returns 0 on success, or the error from usb_get_phy() or
+ * usb_register_notifier(). On failure nothing is left allocated or
+ * referenced.
+ */
+static int __init otg_wakelock_init(void)
+{
+	int ret;
+	struct usb_phy *phy;
+
+	phy = usb_get_phy(USB_PHY_TYPE_USB2);
+
+	if (IS_ERR(phy)) {
+		pr_err("%s: No USB transceiver found\n", __func__);
+		return PTR_ERR(phy);
+	}
+	otgwl_xceiv = phy;
+
+	snprintf(vbus_lock.name, sizeof(vbus_lock.name), "vbus-%s",
+		 dev_name(otgwl_xceiv->dev));
+	wakeup_source_init(&vbus_lock.wakesrc, vbus_lock.name);
+
+	otgwl_nb.notifier_call = otgwl_otg_notifications;
+	ret = usb_register_notifier(otgwl_xceiv, &otgwl_nb);
+
+	if (ret) {
+		pr_err("%s: usb_register_notifier on transceiver %s failed\n",
+		       __func__, dev_name(otgwl_xceiv->dev));
+		/* Unpublish first so set_enabled() stops using the PHY. */
+		otgwl_xceiv = NULL;
+		wakeup_source_trash(&vbus_lock.wakesrc);
+		/* Drop the reference obtained from usb_get_phy() above. */
+		usb_put_phy(phy);
+		return ret;
+	}
+
+	otgwl_handle_event(otgwl_xceiv->last_event);
+	return ret;
+}
+
+late_initcall(otg_wakelock_init);
diff --git a/drivers/video/fbdev/goldfishfb.c b/drivers/video/fbdev/goldfishfb.c
index 14a93cb21310..8c93ad1dd9cc 100644
--- a/drivers/video/fbdev/goldfishfb.c
+++ b/drivers/video/fbdev/goldfishfb.c
@@ -26,6 +26,7 @@
#include <linux/interrupt.h>
#include <linux/ioport.h>
#include <linux/platform_device.h>
+#include <linux/acpi.h>
enum {
FB_GET_WIDTH = 0x00,
@@ -234,7 +235,7 @@ static int goldfish_fb_probe(struct platform_device *pdev)
fb->fb.var.activate = FB_ACTIVATE_NOW;
fb->fb.var.height = readl(fb->reg_base + FB_GET_PHYS_HEIGHT);
fb->fb.var.width = readl(fb->reg_base + FB_GET_PHYS_WIDTH);
- fb->fb.var.pixclock = 10000;
+ fb->fb.var.pixclock = 0;
fb->fb.var.red.offset = 11;
fb->fb.var.red.length = 5;
@@ -305,12 +306,25 @@ static int goldfish_fb_remove(struct platform_device *pdev)
return 0;
}
+/* Device-tree match table, referenced by goldfish_fb_driver.of_match_table. */
+static const struct of_device_id goldfish_fb_of_match[] = {
+ { .compatible = "google,goldfish-fb", },
+ {},
+};
+MODULE_DEVICE_TABLE(of, goldfish_fb_of_match);
+
+/* ACPI match table, referenced by goldfish_fb_driver.acpi_match_table. */
+static const struct acpi_device_id goldfish_fb_acpi_match[] = {
+ { "GFSH0004", 0 },
+ { },
+};
+MODULE_DEVICE_TABLE(acpi, goldfish_fb_acpi_match);
static struct platform_driver goldfish_fb_driver = {
.probe = goldfish_fb_probe,
.remove = goldfish_fb_remove,
.driver = {
- .name = "goldfish_fb"
+ .name = "goldfish_fb",
+ .of_match_table = goldfish_fb_of_match,
+ .acpi_match_table = ACPI_PTR(goldfish_fb_acpi_match),
}
};
diff --git a/drivers/w1/masters/ds2482.c b/drivers/w1/masters/ds2482.c
index 2e30db1b1a43..fa13fa8c81af 100644
--- a/drivers/w1/masters/ds2482.c
+++ b/drivers/w1/masters/ds2482.c
@@ -18,6 +18,8 @@
#include <linux/slab.h>
#include <linux/i2c.h>
#include <linux/delay.h>
+#include <linux/gpio.h>
+#include <linux/platform_data/ds2482.h>
#include <asm/delay.h>
#include "../w1.h"
@@ -97,7 +99,8 @@ static const u8 ds2482_chan_rd[8] =
static int ds2482_probe(struct i2c_client *client,
const struct i2c_device_id *id);
static int ds2482_remove(struct i2c_client *client);
-
+static int ds2482_suspend(struct device *dev);
+static int ds2482_resume(struct device *dev);
/**
* Driver data (common to all clients)
@@ -108,9 +111,15 @@ static const struct i2c_device_id ds2482_id[] = {
};
MODULE_DEVICE_TABLE(i2c, ds2482_id);
+/* Suspend/resume callbacks, attached to ds2482_driver through .driver.pm. */
+static const struct dev_pm_ops ds2482_pm_ops = {
+ .suspend = ds2482_suspend,
+ .resume = ds2482_resume,
+};
+
static struct i2c_driver ds2482_driver = {
.driver = {
.name = "ds2482",
+ .pm = &ds2482_pm_ops,
},
.probe = ds2482_probe,
.remove = ds2482_remove,
@@ -132,6 +141,7 @@ struct ds2482_w1_chan {
struct ds2482_data {
struct i2c_client *client;
struct mutex access_lock;
+ int slpz_gpio;
/* 1-wire interface(s) */
int w1_count; /* 1 or 8 */
@@ -460,11 +470,31 @@ static u8 ds2482_w1_set_pullup(void *data, int delay)
return retval;
}
+/*
+ * Suspend callback: drive the slpz GPIO to 0 when one is configured
+ * (slpz_gpio >= 0). Always returns 0.
+ * NOTE(review): presumably this puts the chip into its sleep mode; confirm
+ * against the DS2482 datasheet.
+ */
+static int ds2482_suspend(struct device *dev)
+{
+	struct ds2482_data *data = i2c_get_clientdata(to_i2c_client(dev));
+
+	if (data->slpz_gpio < 0)
+		return 0;
+
+	gpio_set_value(data->slpz_gpio, 0);
+	return 0;
+}
+
+/*
+ * Resume callback: drive the slpz GPIO back to 1 when one is configured
+ * (slpz_gpio >= 0). Always returns 0.
+ */
+static int ds2482_resume(struct device *dev)
+{
+	struct ds2482_data *data = i2c_get_clientdata(to_i2c_client(dev));
+
+	if (data->slpz_gpio < 0)
+		return 0;
+
+	gpio_set_value(data->slpz_gpio, 1);
+	return 0;
+}
static int ds2482_probe(struct i2c_client *client,
const struct i2c_device_id *id)
{
struct ds2482_data *data;
+ struct ds2482_platform_data *pdata;
int err = -ENODEV;
int temp1;
int idx;
@@ -531,6 +561,16 @@ static int ds2482_probe(struct i2c_client *client,
}
}
+ pdata = client->dev.platform_data;
+ data->slpz_gpio = pdata ? pdata->slpz_gpio : -1;
+
+ if (data->slpz_gpio >= 0) {
+ err = gpio_request_one(data->slpz_gpio, GPIOF_OUT_INIT_HIGH,
+ "ds2482.slpz");
+ if (err < 0)
+ goto exit_w1_remove;
+ }
+
return 0;
exit_w1_remove:
@@ -555,6 +595,11 @@ static int ds2482_remove(struct i2c_client *client)
w1_remove_master_device(&data->w1_ch[idx].w1_bm);
}
+ if (data->slpz_gpio >= 0) {
+ gpio_set_value(data->slpz_gpio, 0);
+ gpio_free(data->slpz_gpio);
+ }
+
/* Free the memory */
kfree(data);
return 0;
diff --git a/fs/Kconfig b/fs/Kconfig
index 4bd03a2b0518..20a8d95b8b3d 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -227,6 +227,7 @@ source "fs/orangefs/Kconfig"
source "fs/adfs/Kconfig"
source "fs/affs/Kconfig"
source "fs/ecryptfs/Kconfig"
+source "fs/sdcardfs/Kconfig"
source "fs/hfs/Kconfig"
source "fs/hfsplus/Kconfig"
source "fs/befs/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index ed2b63257ba9..f207d4303052 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -3,7 +3,7 @@
#
# 14 Sep 2000, Christoph Hellwig <hch@infradead.org>
# Rewritten to use lists instead of if-statements.
-#
+#
obj-y := open.o read_write.o file_table.o super.o \
char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
@@ -61,7 +61,7 @@ obj-y += devpts/
obj-$(CONFIG_PROFILING) += dcookies.o
obj-$(CONFIG_DLM) += dlm/
-
+
# Do not add any filesystems before this line
obj-$(CONFIG_FSCACHE) += fscache/
obj-$(CONFIG_REISERFS_FS) += reiserfs/
@@ -83,6 +83,7 @@ obj-$(CONFIG_ISO9660_FS) += isofs/
obj-$(CONFIG_HFSPLUS_FS) += hfsplus/ # Before hfs to find wrapped HFS+
obj-$(CONFIG_HFS_FS) += hfs/
obj-$(CONFIG_ECRYPT_FS) += ecryptfs/
+obj-$(CONFIG_SDCARD_FS) += sdcardfs/
obj-$(CONFIG_VXFS_FS) += freevxfs/
obj-$(CONFIG_NFS_FS) += nfs/
obj-$(CONFIG_EXPORTFS) += exportfs/
diff --git a/fs/attr.c b/fs/attr.c
index c902b3d53508..c4093c5196be 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -200,7 +200,7 @@ EXPORT_SYMBOL(setattr_copy);
* the file open for write, as there can be no conflicting delegation in
* that case.
*/
-int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **delegated_inode)
+int notify_change2(struct vfsmount *mnt, struct dentry * dentry, struct iattr * attr, struct inode **delegated_inode)
{
struct inode *inode = dentry->d_inode;
umode_t mode = inode->i_mode;
@@ -224,7 +224,7 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
return -EPERM;
if (!inode_owner_or_capable(inode)) {
- error = inode_permission(inode, MAY_WRITE);
+ error = inode_permission2(mnt, inode, MAY_WRITE);
if (error)
return error;
}
@@ -307,7 +307,9 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
if (error)
return error;
- if (inode->i_op->setattr)
+ if (mnt && inode->i_op->setattr2)
+ error = inode->i_op->setattr2(mnt, dentry, attr);
+ else if (inode->i_op->setattr)
error = inode->i_op->setattr(dentry, attr);
else
error = simple_setattr(dentry, attr);
@@ -320,4 +322,10 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
return error;
}
+EXPORT_SYMBOL(notify_change2);
+
+/*
+ * Wrapper kept for existing callers that have no vfsmount: identical to
+ * notify_change2() called with a NULL mnt.
+ */
+int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **delegated_inode)
+{
+ return notify_change2(NULL, dentry, attr, delegated_inode);
+}
EXPORT_SYMBOL(notify_change);
diff --git a/fs/coredump.c b/fs/coredump.c
index 4407e27beca9..00a900a51a8b 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -744,7 +744,7 @@ void do_coredump(const siginfo_t *siginfo)
goto close_fail;
if (!(cprm.file->f_mode & FMODE_CAN_WRITE))
goto close_fail;
- if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
+ if (do_truncate2(cprm.file->f_path.mnt, cprm.file->f_path.dentry, 0, 0, cprm.file))
goto close_fail;
}
diff --git a/fs/dcache.c b/fs/dcache.c
index f903b86b06e5..183e0ae89c2b 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -3257,6 +3257,7 @@ char *d_absolute_path(const struct path *path,
return ERR_PTR(error);
return res;
}
+EXPORT_SYMBOL(d_absolute_path);
/*
* same as __d_path but appends "(deleted)" for unlinked files.
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 3cbc30413add..5b96ba77e623 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -34,6 +34,7 @@
#include <linux/mutex.h>
#include <linux/anon_inodes.h>
#include <linux/device.h>
+#include <linux/freezer.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/mman.h>
@@ -1673,7 +1674,8 @@ fetch_events:
}
spin_unlock_irqrestore(&ep->lock, flags);
- if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+ if (!freezable_schedule_hrtimeout_range(to, slack,
+ HRTIMER_MODE_ABS))
timed_out = 1;
spin_lock_irqsave(&ep->lock, flags);
diff --git a/fs/exec.c b/fs/exec.c
index fcd8642ef2d2..df13a1945c70 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1303,7 +1303,7 @@ EXPORT_SYMBOL(flush_old_exec);
void would_dump(struct linux_binprm *bprm, struct file *file)
{
struct inode *inode = file_inode(file);
- if (inode_permission(inode, MAY_READ) < 0) {
+ if (inode_permission2(file->f_path.mnt, inode, MAY_READ) < 0) {
struct user_namespace *old, *user_ns;
bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 567a6c7af677..0877f0b1facb 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2445,7 +2445,8 @@ extern int ext4_mb_add_groupinfo(struct super_block *sb,
ext4_group_t i, struct ext4_group_desc *desc);
extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
ext4_fsblk_t block, unsigned long count);
-extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
+extern int ext4_trim_fs(struct super_block *, struct fstrim_range *,
+ unsigned long blkdev_flags);
/* inode.c */
int ext4_inode_is_fast_symlink(struct inode *inode);
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index d06cfe372609..e09a61dc236b 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -18,6 +18,7 @@
#include "ext4.h"
#include "xattr.h"
#include "truncate.h"
+#include <trace/events/android_fs.h>
#define EXT4_XATTR_SYSTEM_DATA "data"
#define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS))
@@ -502,6 +503,17 @@ int ext4_readpage_inline(struct inode *inode, struct page *page)
return -EAGAIN;
}
+ if (trace_android_fs_dataread_start_enabled()) {
+ char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+ path = android_fstrace_get_pathname(pathbuf,
+ MAX_TRACE_PATHBUF_LEN,
+ inode);
+ trace_android_fs_dataread_start(inode, page_offset(page),
+ PAGE_SIZE, current->pid,
+ path, current->comm);
+ }
+
/*
* Current inline data can only exist in the 1st page,
* So for all the other pages, just set them uptodate.
@@ -513,6 +525,8 @@ int ext4_readpage_inline(struct inode *inode, struct page *page)
SetPageUptodate(page);
}
+ trace_android_fs_dataread_end(inode, page_offset(page), PAGE_SIZE);
+
up_read(&EXT4_I(inode)->xattr_sem);
unlock_page(page);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f62eca8cbde0..76647a7d22e1 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -44,6 +44,7 @@
#include "truncate.h"
#include <trace/events/ext4.h>
+#include <trace/events/android_fs.h>
#define MPAGE_DA_EXTENT_TAIL 0x01
@@ -1182,6 +1183,16 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
pgoff_t index;
unsigned from, to;
+ if (trace_android_fs_datawrite_start_enabled()) {
+ char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+ path = android_fstrace_get_pathname(pathbuf,
+ MAX_TRACE_PATHBUF_LEN,
+ inode);
+ trace_android_fs_datawrite_start(inode, pos, len,
+ current->pid, path,
+ current->comm);
+ }
trace_ext4_write_begin(inode, pos, len, flags);
/*
* Reserve one block more for addition to orphan list in case
@@ -1320,6 +1331,7 @@ static int ext4_write_end(struct file *file,
int i_size_changed = 0;
int inline_data = ext4_has_inline_data(inode);
+ trace_android_fs_datawrite_end(inode, pos, len);
trace_ext4_write_end(inode, pos, len, copied);
if (inline_data) {
ret = ext4_write_inline_data_end(inode, pos, len,
@@ -1425,6 +1437,7 @@ static int ext4_journalled_write_end(struct file *file,
int size_changed = 0;
int inline_data = ext4_has_inline_data(inode);
+ trace_android_fs_datawrite_end(inode, pos, len);
trace_ext4_journalled_write_end(inode, pos, len, copied);
from = pos & (PAGE_SIZE - 1);
to = from + len;
@@ -2921,6 +2934,16 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
len, flags, pagep, fsdata);
}
*fsdata = (void *)0;
+ if (trace_android_fs_datawrite_start_enabled()) {
+ char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+ path = android_fstrace_get_pathname(pathbuf,
+ MAX_TRACE_PATHBUF_LEN,
+ inode);
+ trace_android_fs_datawrite_start(inode, pos, len,
+ current->pid,
+ path, current->comm);
+ }
trace_ext4_da_write_begin(inode, pos, len, flags);
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
@@ -3039,6 +3062,7 @@ static int ext4_da_write_end(struct file *file,
return ext4_write_end(file, mapping, pos,
len, copied, page, fsdata);
+ trace_android_fs_datawrite_end(inode, pos, len);
trace_ext4_da_write_end(inode, pos, len, copied);
start = pos & (PAGE_SIZE - 1);
end = start + copied - 1;
@@ -3602,6 +3626,7 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
size_t count = iov_iter_count(iter);
loff_t offset = iocb->ki_pos;
ssize_t ret;
+ int rw = iov_iter_rw(iter);
#ifdef CONFIG_EXT4_FS_ENCRYPTION
if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode))
@@ -3618,12 +3643,42 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
if (ext4_has_inline_data(inode))
return 0;
+ if (trace_android_fs_dataread_start_enabled() &&
+ (rw == READ)) {
+ char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+ path = android_fstrace_get_pathname(pathbuf,
+ MAX_TRACE_PATHBUF_LEN,
+ inode);
+ trace_android_fs_dataread_start(inode, offset, count,
+ current->pid, path,
+ current->comm);
+ }
+ if (trace_android_fs_datawrite_start_enabled() &&
+ (rw == WRITE)) {
+ char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+ path = android_fstrace_get_pathname(pathbuf,
+ MAX_TRACE_PATHBUF_LEN,
+ inode);
+ trace_android_fs_datawrite_start(inode, offset, count,
+ current->pid, path,
+ current->comm);
+ }
trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
if (iov_iter_rw(iter) == READ)
ret = ext4_direct_IO_read(iocb, iter);
else
ret = ext4_direct_IO_write(iocb, iter);
trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
+
+ if (trace_android_fs_dataread_start_enabled() &&
+ (rw == READ))
+ trace_android_fs_dataread_end(inode, offset, count);
+ if (trace_android_fs_datawrite_start_enabled() &&
+ (rw == WRITE))
+ trace_android_fs_datawrite_end(inode, offset, count);
+
return ret;
}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 2880e017cd0a..e3ad9d47dd80 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -737,11 +737,13 @@ resizefs_out:
return err;
}
+ case FIDTRIM:
case FITRIM:
{
struct request_queue *q = bdev_get_queue(sb->s_bdev);
struct fstrim_range range;
int ret = 0;
+ int flags = cmd == FIDTRIM ? BLKDEV_DISCARD_SECURE : 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -749,13 +751,16 @@ resizefs_out:
if (!blk_queue_discard(q))
return -EOPNOTSUPP;
+ if ((flags & BLKDEV_DISCARD_SECURE) && !blk_queue_secure_erase(q))
+ return -EOPNOTSUPP;
+
if (copy_from_user(&range, (struct fstrim_range __user *)arg,
sizeof(range)))
return -EFAULT;
range.minlen = max((unsigned int)range.minlen,
q->limits.discard_granularity);
- ret = ext4_trim_fs(sb, &range);
+ ret = ext4_trim_fs(sb, &range, flags);
if (ret < 0)
return ret;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a49d0e5d7baf..3d6f73e38873 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2775,7 +2775,8 @@ int ext4_mb_release(struct super_block *sb)
}
static inline int ext4_issue_discard(struct super_block *sb,
- ext4_group_t block_group, ext4_grpblk_t cluster, int count)
+ ext4_group_t block_group, ext4_grpblk_t cluster, int count,
+ unsigned long flags)
{
ext4_fsblk_t discard_block;
@@ -2784,7 +2785,7 @@ static inline int ext4_issue_discard(struct super_block *sb,
count = EXT4_C2B(EXT4_SB(sb), count);
trace_ext4_discard_blocks(sb,
(unsigned long long) discard_block, count);
- return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
+ return sb_issue_discard(sb, discard_block, count, GFP_NOFS, flags);
}
/*
@@ -2806,7 +2807,7 @@ static void ext4_free_data_callback(struct super_block *sb,
if (test_opt(sb, DISCARD)) {
err = ext4_issue_discard(sb, entry->efd_group,
entry->efd_start_cluster,
- entry->efd_count);
+ entry->efd_count, 0);
if (err && err != -EOPNOTSUPP)
ext4_msg(sb, KERN_WARNING, "discard request in"
" group:%d block:%d count:%d failed"
@@ -4865,7 +4866,8 @@ do_more:
* them with group lock_held
*/
if (test_opt(sb, DISCARD)) {
- err = ext4_issue_discard(sb, block_group, bit, count);
+ err = ext4_issue_discard(sb, block_group, bit, count,
+ 0);
if (err && err != -EOPNOTSUPP)
ext4_msg(sb, KERN_WARNING, "discard request in"
" group:%d block:%d count:%lu failed"
@@ -5061,13 +5063,15 @@ error_return:
* @count: number of blocks to TRIM
* @group: alloc. group we are working with
* @e4b: ext4 buddy for the group
+ * @blkdev_flags: flags for the block device
*
* Trim "count" blocks starting at "start" in the "group". To assure that no
* one will allocate those blocks, mark it as used in buddy bitmap. This must
* be called with under the group lock.
*/
static int ext4_trim_extent(struct super_block *sb, int start, int count,
- ext4_group_t group, struct ext4_buddy *e4b)
+ ext4_group_t group, struct ext4_buddy *e4b,
+ unsigned long blkdev_flags)
__releases(bitlock)
__acquires(bitlock)
{
@@ -5088,7 +5092,7 @@ __acquires(bitlock)
*/
mb_mark_used(e4b, &ex);
ext4_unlock_group(sb, group);
- ret = ext4_issue_discard(sb, group, start, count);
+ ret = ext4_issue_discard(sb, group, start, count, blkdev_flags);
ext4_lock_group(sb, group);
mb_free_blocks(NULL, e4b, start, ex.fe_len);
return ret;
@@ -5101,6 +5105,7 @@ __acquires(bitlock)
* @start: first group block to examine
* @max: last group block to examine
* @minblocks: minimum extent block count
+ * @blkdev_flags: flags for the block device
*
* ext4_trim_all_free walks through group's buddy bitmap searching for free
* extents. When the free block is found, ext4_trim_extent is called to TRIM
@@ -5115,7 +5120,7 @@ __acquires(bitlock)
static ext4_grpblk_t
ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
ext4_grpblk_t start, ext4_grpblk_t max,
- ext4_grpblk_t minblocks)
+ ext4_grpblk_t minblocks, unsigned long blkdev_flags)
{
void *bitmap;
ext4_grpblk_t next, count = 0, free_count = 0;
@@ -5148,7 +5153,8 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
if ((next - start) >= minblocks) {
ret = ext4_trim_extent(sb, start,
- next - start, group, &e4b);
+ next - start, group, &e4b,
+ blkdev_flags);
if (ret && ret != -EOPNOTSUPP)
break;
ret = 0;
@@ -5190,6 +5196,7 @@ out:
* ext4_trim_fs() -- trim ioctl handle function
* @sb: superblock for filesystem
* @range: fstrim_range structure
+ * @blkdev_flags: flags for the block device
*
* start: First Byte to trim
* len: number of Bytes to trim from start
@@ -5198,7 +5205,8 @@ out:
* start to start+len. For each such a group ext4_trim_all_free function
* is invoked to trim all free space.
*/
-int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
+int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range,
+ unsigned long blkdev_flags)
{
struct ext4_group_info *grp;
ext4_group_t group, first_group, last_group;
@@ -5254,7 +5262,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
if (grp->bb_free >= minlen) {
cnt = ext4_trim_all_free(sb, group, first_cluster,
- end, minlen);
+ end, minlen, blkdev_flags);
if (cnt < 0) {
ret = cnt;
break;
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index a81b829d56de..2531cc1df4bd 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -45,6 +45,7 @@
#include <linux/cleancache.h>
#include "ext4.h"
+#include <trace/events/android_fs.h>
static inline bool ext4_bio_encrypted(struct bio *bio)
{
@@ -55,6 +56,17 @@ static inline bool ext4_bio_encrypted(struct bio *bio)
#endif
}
+/*
+ * Emit the android_fs dataread_end trace event for a finished read bio,
+ * keyed on the inode and file offset of the bio's first page. Nothing is
+ * traced when the first bio_vec has no page.
+ */
+static void
+ext4_trace_read_completion(struct bio *bio)
+{
+	struct page *pg = bio->bi_io_vec[0].bv_page;
+
+	if (!pg)
+		return;
+
+	trace_android_fs_dataread_end(pg->mapping->host, page_offset(pg),
+				      bio->bi_iter.bi_size);
+}
+
/*
* I/O completion handler for multipage BIOs.
*
@@ -72,6 +84,9 @@ static void mpage_end_io(struct bio *bio)
struct bio_vec *bv;
int i;
+ if (trace_android_fs_dataread_start_enabled())
+ ext4_trace_read_completion(bio);
+
if (ext4_bio_encrypted(bio)) {
if (bio->bi_error) {
fscrypt_release_ctx(bio->bi_private);
@@ -95,6 +110,30 @@ static void mpage_end_io(struct bio *bio)
bio_put(bio);
}
+/*
+ * Submit a read bio, first emitting the android_fs dataread_start trace
+ * event (inode, offset of the first page, bio size, pid, path, comm) when
+ * that tracepoint is enabled and the first bio_vec has a page.
+ */
+static void
+ext4_submit_bio_read(struct bio *bio)
+{
+	struct page *pg = NULL;
+
+	/* Only look at the bio's pages when somebody is listening. */
+	if (trace_android_fs_dataread_start_enabled())
+		pg = bio->bi_io_vec[0].bv_page;
+
+	if (pg) {
+		struct inode *host = pg->mapping->host;
+		char pathbuf[MAX_TRACE_PATHBUF_LEN];
+		char *path;
+
+		path = android_fstrace_get_pathname(pathbuf,
+						    MAX_TRACE_PATHBUF_LEN,
+						    host);
+		trace_android_fs_dataread_start(host, page_offset(pg),
+						bio->bi_iter.bi_size,
+						current->pid, path,
+						current->comm);
+	}
+
+	submit_bio(bio);
+}
+
int ext4_mpage_readpages(struct address_space *mapping,
struct list_head *pages, struct page *page,
unsigned nr_pages)
@@ -235,7 +274,7 @@ int ext4_mpage_readpages(struct address_space *mapping,
*/
if (bio && (last_block_in_bio != blocks[0] - 1)) {
submit_and_realloc:
- submit_bio(bio);
+ ext4_submit_bio_read(bio);
bio = NULL;
}
if (bio == NULL) {
@@ -268,14 +307,14 @@ int ext4_mpage_readpages(struct address_space *mapping,
if (((map.m_flags & EXT4_MAP_BOUNDARY) &&
(relative_block == map.m_len)) ||
(first_hole != blocks_per_page)) {
- submit_bio(bio);
+ ext4_submit_bio_read(bio);
bio = NULL;
} else
last_block_in_bio = blocks[blocks_per_page - 1];
goto next_page;
confused:
if (bio) {
- submit_bio(bio);
+ ext4_submit_bio_read(bio);
bio = NULL;
}
if (!PageUptodate(page))
@@ -288,6 +327,6 @@ int ext4_mpage_readpages(struct address_space *mapping,
}
BUG_ON(pages && !list_empty(pages));
if (bio)
- submit_bio(bio);
+ ext4_submit_bio_read(bio);
return 0;
}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index ae354ac67da1..b7f31547fadc 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -28,6 +28,7 @@
#include "segment.h"
#include "trace.h"
#include <trace/events/f2fs.h>
+#include <trace/events/android_fs.h>
static void f2fs_read_end_io(struct bio *bio)
{
@@ -1606,6 +1607,16 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
block_t blkaddr = NULL_ADDR;
int err = 0;
+ if (trace_android_fs_datawrite_start_enabled()) {
+ char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+ path = android_fstrace_get_pathname(pathbuf,
+ MAX_TRACE_PATHBUF_LEN,
+ inode);
+ trace_android_fs_datawrite_start(inode, pos, len,
+ current->pid, path,
+ current->comm);
+ }
trace_f2fs_write_begin(inode, pos, len, flags);
/*
@@ -1702,6 +1713,7 @@ static int f2fs_write_end(struct file *file,
{
struct inode *inode = page->mapping->host;
+ trace_android_fs_datawrite_end(inode, pos, len);
trace_f2fs_write_end(inode, pos, len, copied);
/*
@@ -1763,6 +1775,29 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
trace_f2fs_direct_IO_enter(inode, offset, count, rw);
+ if (trace_android_fs_dataread_start_enabled() &&
+ (rw == READ)) {
+ char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+ path = android_fstrace_get_pathname(pathbuf,
+ MAX_TRACE_PATHBUF_LEN,
+ inode);
+ trace_android_fs_dataread_start(inode, offset,
+ count, current->pid, path,
+ current->comm);
+ }
+ if (trace_android_fs_datawrite_start_enabled() &&
+ (rw == WRITE)) {
+ char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+ path = android_fstrace_get_pathname(pathbuf,
+ MAX_TRACE_PATHBUF_LEN,
+ inode);
+ trace_android_fs_datawrite_start(inode, offset, count,
+ current->pid, path,
+ current->comm);
+ }
+
down_read(&F2FS_I(inode)->dio_rwsem[rw]);
err = blockdev_direct_IO(iocb, inode, iter, get_data_block_dio);
up_read(&F2FS_I(inode)->dio_rwsem[rw]);
@@ -1774,6 +1809,13 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
f2fs_write_failed(mapping, offset + count);
}
+ if (trace_android_fs_dataread_start_enabled() &&
+ (rw == READ))
+ trace_android_fs_dataread_end(inode, offset, count);
+ if (trace_android_fs_datawrite_start_enabled() &&
+ (rw == WRITE))
+ trace_android_fs_datawrite_end(inode, offset, count);
+
trace_f2fs_direct_IO_exit(inode, offset, count, rw, err);
return err;
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 482888ee8942..066201ab7a6e 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -13,6 +13,7 @@
#include "f2fs.h"
#include "node.h"
+#include <trace/events/android_fs.h>
bool f2fs_may_inline_data(struct inode *inode)
{
@@ -82,14 +83,29 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page)
{
struct page *ipage;
+ if (trace_android_fs_dataread_start_enabled()) {
+ char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+ path = android_fstrace_get_pathname(pathbuf,
+ MAX_TRACE_PATHBUF_LEN,
+ inode);
+ trace_android_fs_dataread_start(inode, page_offset(page),
+ PAGE_SIZE, current->pid,
+ path, current->comm);
+ }
+
ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
if (IS_ERR(ipage)) {
+ trace_android_fs_dataread_end(inode, page_offset(page),
+ PAGE_SIZE);
unlock_page(page);
return PTR_ERR(ipage);
}
if (!f2fs_has_inline_data(inode)) {
f2fs_put_page(ipage, 1);
+ trace_android_fs_dataread_end(inode, page_offset(page),
+ PAGE_SIZE);
return -EAGAIN;
}
@@ -101,6 +117,8 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page)
if (!PageUptodate(page))
SetPageUptodate(page);
f2fs_put_page(ipage, 1);
+ trace_android_fs_dataread_end(inode, page_offset(page),
+ PAGE_SIZE);
unlock_page(page);
return 0;
}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 91bf72334722..d5e9fce24aa0 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1442,18 +1442,20 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi)
if (unlikely(fsmeta >= total))
return 1;
- main_segs = le32_to_cpu(raw_super->segment_count_main);
+ main_segs = le32_to_cpu(sbi->raw_super->segment_count_main);
blocks_per_seg = sbi->blocks_per_seg;
for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) {
if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs ||
- le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg)
+ le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg) {
return 1;
+ }
}
for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) {
if (le32_to_cpu(ckpt->cur_data_segno[i]) >= main_segs ||
- le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg)
+ le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg) {
return 1;
+ }
}
sit_bitmap_size = le32_to_cpu(ckpt->sit_ver_bitmap_bytesize);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index f3aea1b8702c..17ad41d88bea 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -2112,7 +2112,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
(dirtytime && (inode->i_state & I_DIRTY_INODE)))
return;
- if (unlikely(block_dump))
+ if (unlikely(block_dump > 1))
block_dump___mark_inode_dirty(inode);
spin_lock(&inode->i_lock);
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 7dca743b2ce1..940c683561dd 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -44,6 +44,7 @@ void set_fs_pwd(struct fs_struct *fs, const struct path *path)
if (old_pwd.dentry)
path_put(&old_pwd);
}
+EXPORT_SYMBOL(set_fs_pwd);
static inline int replace_path(struct path *p, const struct path *old, const struct path *new)
{
@@ -89,6 +90,7 @@ void free_fs_struct(struct fs_struct *fs)
path_put(&fs->pwd);
kmem_cache_free(fs_cachep, fs);
}
+EXPORT_SYMBOL(free_fs_struct);
void exit_fs(struct task_struct *tsk)
{
@@ -127,6 +129,7 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
}
return fs;
}
+EXPORT_SYMBOL_GPL(copy_fs_struct);
int unshare_fs_struct(void)
{
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index b4253181b5d4..09f0de209a16 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -13,12 +13,14 @@
#include <linux/poll.h>
#include <linux/uio.h>
#include <linux/miscdevice.h>
+#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/pipe_fs_i.h>
#include <linux/swap.h>
#include <linux/splice.h>
+#include <linux/freezer.h>
MODULE_ALIAS_MISCDEV(FUSE_MINOR);
MODULE_ALIAS("devname:fuse");
@@ -471,7 +473,9 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
* Either request is already in userspace, or it was forced.
* Wait it out.
*/
- wait_event(req->waitq, test_bit(FR_FINISHED, &req->flags));
+ while (!test_bit(FR_FINISHED, &req->flags))
+ wait_event_freezable(req->waitq,
+ test_bit(FR_FINISHED, &req->flags));
}
static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
@@ -1906,6 +1910,10 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
cs->move_pages = 0;
err = copy_out_args(cs, &req->out, nbytes);
+ if (req->in.h.opcode == FUSE_CANONICAL_PATH) {
+ req->out.h.error = kern_path((char *)req->out.args[0].value, 0,
+ req->canonical_path);
+ }
fuse_copy_finish(cs);
spin_lock(&fpq->lock);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 60dd2bc10776..547a324d45fd 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -262,6 +262,50 @@ invalid:
goto out;
}
+/*
+ * Get the canonical path. Since we must translate to a path, this must be done
+ * in the context of the userspace daemon, however, the userspace daemon cannot
+ * look up paths on its own. Instead, we handle the lookup as a special case
+ * inside of the write request.
+ *
+ * @path:           path on the FUSE mount to canonicalise
+ * @canonical_path: receives a referenced path that the caller must path_put()
+ *
+ * A FUSE_CANONICAL_PATH request carrying @canonical_path is sent to the
+ * daemon; fuse_dev_do_write() runs kern_path() on the string the daemon
+ * writes back and stores the result in @canonical_path.  If the request
+ * cannot be set up, or completes with an error, @canonical_path instead
+ * becomes a new reference to @path itself.
+ */
+static void fuse_dentry_canonical_path(const struct path *path,
+				       struct path *canonical_path)
+{
+	struct inode *inode = path->dentry->d_inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_req *req;
+	char *path_name;
+	int err;
+
+	req = fuse_get_req(fc, 1);
+	if (IS_ERR(req))
+		goto default_path;
+
+	/*
+	 * The reply buffer is handed to kern_path() as a C string even when
+	 * the daemon answers with an error or with unterminated data.  Use a
+	 * zeroed page so it never holds stale memory.
+	 */
+	path_name = (char *)get_zeroed_page(GFP_KERNEL);
+	if (!path_name) {
+		fuse_put_request(fc, req);
+		goto default_path;
+	}
+
+	req->in.h.opcode = FUSE_CANONICAL_PATH;
+	req->in.h.nodeid = get_node_id(inode);
+	req->in.numargs = 0;
+	req->out.numargs = 1;
+	/* Never let the reply fill the page: the last byte stays NUL. */
+	req->out.args[0].size = min_t(unsigned int, PATH_MAX, PAGE_SIZE - 1);
+	req->out.args[0].value = path_name;
+	req->canonical_path = canonical_path;
+	req->out.argvar = 1;
+	fuse_request_send(fc, req);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+	free_page((unsigned long)path_name);
+	if (!err)
+		return;
+default_path:
+	canonical_path->dentry = path->dentry;
+	canonical_path->mnt = path->mnt;
+	path_get(canonical_path);
+}
+
static int invalid_nodeid(u64 nodeid)
{
return !nodeid || nodeid == FUSE_ROOT_ID;
@@ -284,11 +328,13 @@ const struct dentry_operations fuse_dentry_operations = {
.d_revalidate = fuse_dentry_revalidate,
.d_init = fuse_dentry_init,
.d_release = fuse_dentry_release,
+ .d_canonical_path = fuse_dentry_canonical_path,
};
const struct dentry_operations fuse_root_dentry_operations = {
.d_init = fuse_dentry_init,
.d_release = fuse_dentry_release,
+ .d_canonical_path = fuse_dentry_canonical_path, /* same resolver as fuse_dentry_operations */
};
int fuse_valid_type(int m)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 1c905c7666de..a10c56e2147b 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -368,6 +368,9 @@ struct fuse_req {
/** Inode used in the request or NULL */
struct inode *inode;
+ /** Path used for completing d_canonical_path */
+ struct path *canonical_path;
+
/** AIO control block */
struct fuse_io_priv *io;
diff --git a/fs/inode.c b/fs/inode.c
index 2071ff5343c5..1d1a9573ca70 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1781,7 +1781,7 @@ int dentry_needs_remove_privs(struct dentry *dentry)
return mask;
}
-static int __remove_privs(struct dentry *dentry, int kill)
+static int __remove_privs(struct vfsmount *mnt, struct dentry *dentry, int kill)
{
struct iattr newattrs;
@@ -1790,7 +1790,7 @@ static int __remove_privs(struct dentry *dentry, int kill)
* Note we call this on write, so notify_change will not
* encounter any conflicting delegations:
*/
- return notify_change(dentry, &newattrs, NULL);
+ return notify_change2(mnt, dentry, &newattrs, NULL);
}
/*
@@ -1812,7 +1812,7 @@ int file_remove_privs(struct file *file)
if (kill < 0)
return kill;
if (kill)
- error = __remove_privs(dentry, kill);
+ error = __remove_privs(file->f_path.mnt, dentry, kill);
if (!error)
inode_has_no_xattr(inode);
diff --git a/fs/internal.h b/fs/internal.h
index 8b7143b0211c..3e58863de514 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -88,9 +88,11 @@ extern struct file *get_empty_filp(void);
* super.c
*/
extern int do_remount_sb(struct super_block *, int, void *, int);
+extern int do_remount_sb2(struct vfsmount *, struct super_block *, int,
+ void *, int);
extern bool trylock_super(struct super_block *sb);
extern struct dentry *mount_fs(struct file_system_type *,
- int, const char *, void *);
+ int, const char *, struct vfsmount *, void *);
extern struct super_block *user_get_super(dev_t);
/*
diff --git a/fs/mpage.c b/fs/mpage.c
index e2ea442bb9e1..d4e17c88ce08 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -31,6 +31,14 @@
#include <linux/cleancache.h>
#include "internal.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/android_fs.h>
+
+EXPORT_TRACEPOINT_SYMBOL(android_fs_datawrite_start);
+EXPORT_TRACEPOINT_SYMBOL(android_fs_datawrite_end);
+EXPORT_TRACEPOINT_SYMBOL(android_fs_dataread_start);
+EXPORT_TRACEPOINT_SYMBOL(android_fs_dataread_end);
+
/*
* I/O completion handler for multipage BIOs.
*
@@ -48,6 +56,16 @@ static void mpage_end_io(struct bio *bio)
struct bio_vec *bv;
int i;
+ if (trace_android_fs_dataread_end_enabled() &&
+ (bio_data_dir(bio) == READ)) {
+ struct page *first_page = bio->bi_io_vec[0].bv_page;
+
+ if (first_page != NULL)
+ trace_android_fs_dataread_end(first_page->mapping->host,
+ page_offset(first_page),
+ bio->bi_iter.bi_size);
+ }
+
bio_for_each_segment_all(bv, bio, i) {
struct page *page = bv->bv_page;
page_endio(page, op_is_write(bio_op(bio)), bio->bi_error);
@@ -58,6 +76,24 @@ static void mpage_end_io(struct bio *bio)
static struct bio *mpage_bio_submit(int op, int op_flags, struct bio *bio)
{
+ if (trace_android_fs_dataread_start_enabled() && (op == REQ_OP_READ)) {
+ struct page *first_page = bio->bi_io_vec[0].bv_page;
+
+ if (first_page != NULL) {
+ char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+ path = android_fstrace_get_pathname(pathbuf,
+ MAX_TRACE_PATHBUF_LEN,
+ first_page->mapping->host);
+ trace_android_fs_dataread_start(
+ first_page->mapping->host,
+ page_offset(first_page),
+ bio->bi_iter.bi_size,
+ current->pid,
+ path,
+ current->comm);
+ }
+ }
bio->bi_end_io = mpage_end_io;
bio_set_op_attrs(bio, op, op_flags);
guard_bio_eod(op, bio);
diff --git a/fs/namei.c b/fs/namei.c
index eb4626bad88a..88eb41cdd639 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -376,9 +376,11 @@ EXPORT_SYMBOL(generic_permission);
* flag in inode->i_opflags, that says "this has not special
* permission function, use the fast case".
*/
-static inline int do_inode_permission(struct inode *inode, int mask)
+static inline int do_inode_permission(struct vfsmount *mnt, struct inode *inode, int mask)
{
if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
+ if (likely(mnt && inode->i_op->permission2))
+ return inode->i_op->permission2(mnt, inode, mask);
if (likely(inode->i_op->permission))
return inode->i_op->permission(inode, mask);
@@ -402,7 +404,7 @@ static inline int do_inode_permission(struct inode *inode, int mask)
* This does not check for a read-only file system. You probably want
* inode_permission().
*/
-int __inode_permission(struct inode *inode, int mask)
+int __inode_permission2(struct vfsmount *mnt, struct inode *inode, int mask)
{
int retval;
@@ -422,7 +424,7 @@ int __inode_permission(struct inode *inode, int mask)
return -EACCES;
}
- retval = do_inode_permission(inode, mask);
+ retval = do_inode_permission(mnt, inode, mask);
if (retval)
return retval;
@@ -430,7 +432,14 @@ int __inode_permission(struct inode *inode, int mask)
if (retval)
return retval;
- return security_inode_permission(inode, mask);
+ retval = security_inode_permission(inode, mask);
+ return retval;
+}
+EXPORT_SYMBOL(__inode_permission2);
+
+int __inode_permission(struct inode *inode, int mask)
+{
+ return __inode_permission2(NULL, inode, mask); /* NULL mnt: i_op->permission2 is never consulted */
}
EXPORT_SYMBOL(__inode_permission);
@@ -466,14 +475,20 @@ static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
*
* When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
*/
-int inode_permission(struct inode *inode, int mask)
+int inode_permission2(struct vfsmount *mnt, struct inode *inode, int mask)
{
int retval;
retval = sb_permission(inode->i_sb, inode, mask);
if (retval)
return retval;
- return __inode_permission(inode, mask);
+ return __inode_permission2(mnt, inode, mask);
+}
+EXPORT_SYMBOL(inode_permission2);
+
+int inode_permission(struct inode *inode, int mask)
+{
+ return inode_permission2(NULL, inode, mask); /* mount-less form: forwards with a NULL vfsmount */
}
EXPORT_SYMBOL(inode_permission);
@@ -1706,13 +1721,13 @@ out:
static inline int may_lookup(struct nameidata *nd)
{
if (nd->flags & LOOKUP_RCU) {
- int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
+ int err = inode_permission2(nd->path.mnt, nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
if (err != -ECHILD)
return err;
if (unlazy_walk(nd, NULL, 0))
return -ECHILD;
}
- return inode_permission(nd->inode, MAY_EXEC);
+ return inode_permission2(nd->path.mnt, nd->inode, MAY_EXEC);
}
static inline int handle_dots(struct nameidata *nd, int type)
@@ -2186,11 +2201,12 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
nd->depth = 0;
if (flags & LOOKUP_ROOT) {
struct dentry *root = nd->root.dentry;
+ struct vfsmount *mnt = nd->root.mnt;
struct inode *inode = root->d_inode;
if (*s) {
if (!d_can_lookup(root))
return ERR_PTR(-ENOTDIR);
- retval = inode_permission(inode, MAY_EXEC);
+ retval = inode_permission2(mnt, inode, MAY_EXEC);
if (retval)
return ERR_PTR(retval);
}
@@ -2455,6 +2471,7 @@ EXPORT_SYMBOL(vfs_path_lookup);
/**
* lookup_one_len - filesystem helper to lookup single pathname component
* @name: pathname component to lookup
+ * @mnt: mount we are looking up on
* @base: base directory to lookup from
* @len: maximum length @len should be interpreted to
*
@@ -2463,7 +2480,7 @@ EXPORT_SYMBOL(vfs_path_lookup);
*
* The caller must hold base->i_mutex.
*/
-struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
+struct dentry *lookup_one_len2(const char *name, struct vfsmount *mnt, struct dentry *base, int len)
{
struct qstr this;
unsigned int c;
@@ -2497,12 +2514,18 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
return ERR_PTR(err);
}
- err = inode_permission(base->d_inode, MAY_EXEC);
+ err = inode_permission2(mnt, base->d_inode, MAY_EXEC);
if (err)
return ERR_PTR(err);
return __lookup_hash(&this, base, 0);
}
+EXPORT_SYMBOL(lookup_one_len2);
+
+struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
+{
+ return lookup_one_len2(name, NULL, base, len); /* MAY_EXEC check on @base runs with a NULL mnt */
+}
EXPORT_SYMBOL(lookup_one_len);
/**
@@ -2805,7 +2828,7 @@ EXPORT_SYMBOL(__check_sticky);
* 11. We don't allow removal of NFS sillyrenamed files; it's handled by
* nfs_async_unlink().
*/
-static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
+static int may_delete(struct vfsmount *mnt, struct inode *dir, struct dentry *victim, bool isdir)
{
struct inode *inode = d_backing_inode(victim);
int error;
@@ -2817,7 +2840,7 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
BUG_ON(victim->d_parent->d_inode != dir);
audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
- error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+ error = inode_permission2(mnt, dir, MAY_WRITE | MAY_EXEC);
if (error)
return error;
if (IS_APPEND(dir))
@@ -2849,7 +2872,7 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
* 4. We should have write and exec permissions on dir
* 5. We can't do it if dir is immutable (done in permission())
*/
-static inline int may_create(struct inode *dir, struct dentry *child)
+static inline int may_create(struct vfsmount *mnt, struct inode *dir, struct dentry *child)
{
struct user_namespace *s_user_ns;
audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
@@ -2861,7 +2884,7 @@ static inline int may_create(struct inode *dir, struct dentry *child)
if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
!kgid_has_mapping(s_user_ns, current_fsgid()))
return -EOVERFLOW;
- return inode_permission(dir, MAY_WRITE | MAY_EXEC);
+ return inode_permission2(mnt, dir, MAY_WRITE | MAY_EXEC);
}
/*
@@ -2908,10 +2931,10 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
}
EXPORT_SYMBOL(unlock_rename);
-int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- bool want_excl)
+int vfs_create2(struct vfsmount *mnt, struct inode *dir, struct dentry *dentry,
+ umode_t mode, bool want_excl)
{
- int error = may_create(dir, dentry);
+ int error = may_create(mnt, dir, dentry);
if (error)
return error;
@@ -2927,6 +2950,13 @@ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
fsnotify_create(dir, dentry);
return error;
}
+EXPORT_SYMBOL(vfs_create2);
+
+int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+ bool want_excl)
+{
+ return vfs_create2(NULL, dir, dentry, mode, want_excl); /* may_create() sees a NULL mnt */
+}
EXPORT_SYMBOL(vfs_create);
bool may_open_dev(const struct path *path)
@@ -2938,6 +2968,7 @@ bool may_open_dev(const struct path *path)
static int may_open(struct path *path, int acc_mode, int flag)
{
struct dentry *dentry = path->dentry;
+ struct vfsmount *mnt = path->mnt;
struct inode *inode = dentry->d_inode;
int error;
@@ -2962,7 +2993,7 @@ static int may_open(struct path *path, int acc_mode, int flag)
break;
}
- error = inode_permission(inode, MAY_OPEN | acc_mode);
+ error = inode_permission2(mnt, inode, MAY_OPEN | acc_mode);
if (error)
return error;
@@ -2997,7 +3028,7 @@ static int handle_truncate(struct file *filp)
if (!error)
error = security_path_truncate(path);
if (!error) {
- error = do_truncate(path->dentry, 0,
+ error = do_truncate2(path->mnt, path->dentry, 0,
ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
filp);
}
@@ -3024,7 +3055,7 @@ static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t m
!kgid_has_mapping(s_user_ns, current_fsgid()))
return -EOVERFLOW;
- error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC);
+ error = inode_permission2(dir->mnt, dir->dentry->d_inode, MAY_WRITE | MAY_EXEC);
if (error)
return error;
@@ -3461,7 +3492,7 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags,
goto out;
dir = path.dentry->d_inode;
/* we want directory to be writable */
- error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+ error = inode_permission2(nd->path.mnt, dir, MAY_WRITE | MAY_EXEC);
if (error)
goto out2;
if (!dir->i_op->tmpfile) {
@@ -3714,9 +3745,9 @@ inline struct dentry *user_path_create(int dfd, const char __user *pathname,
}
EXPORT_SYMBOL(user_path_create);
-int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+int vfs_mknod2(struct vfsmount *mnt, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
{
- int error = may_create(dir, dentry);
+ int error = may_create(mnt, dir, dentry);
if (error)
return error;
@@ -3740,6 +3771,12 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
fsnotify_create(dir, dentry);
return error;
}
+EXPORT_SYMBOL(vfs_mknod2);
+
+int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+{
+ return vfs_mknod2(NULL, dir, dentry, mode, dev); /* may_create() sees a NULL mnt */
+}
EXPORT_SYMBOL(vfs_mknod);
static int may_mknod(umode_t mode)
@@ -3782,12 +3819,12 @@ retry:
goto out;
switch (mode & S_IFMT) {
case 0: case S_IFREG:
- error = vfs_create(path.dentry->d_inode,dentry,mode,true);
+ error = vfs_create2(path.mnt, path.dentry->d_inode,dentry,mode,true);
if (!error)
ima_post_path_mknod(dentry);
break;
case S_IFCHR: case S_IFBLK:
- error = vfs_mknod(path.dentry->d_inode,dentry,mode,
+ error = vfs_mknod2(path.mnt, path.dentry->d_inode,dentry,mode,
new_decode_dev(dev));
break;
case S_IFIFO: case S_IFSOCK:
@@ -3808,9 +3845,9 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d
return sys_mknodat(AT_FDCWD, filename, mode, dev);
}
-int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+int vfs_mkdir2(struct vfsmount *mnt, struct inode *dir, struct dentry *dentry, umode_t mode)
{
- int error = may_create(dir, dentry);
+ int error = may_create(mnt, dir, dentry);
unsigned max_links = dir->i_sb->s_max_links;
if (error)
@@ -3832,6 +3869,12 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
fsnotify_mkdir(dir, dentry);
return error;
}
+EXPORT_SYMBOL(vfs_mkdir2);
+
+int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ return vfs_mkdir2(NULL, dir, dentry, mode); /* may_create() sees a NULL mnt */
+}
EXPORT_SYMBOL(vfs_mkdir);
SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
@@ -3850,7 +3893,7 @@ retry:
mode &= ~current_umask();
error = security_path_mkdir(&path, dentry, mode);
if (!error)
- error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
+ error = vfs_mkdir2(path.mnt, path.dentry->d_inode, dentry, mode);
done_path_create(&path, dentry);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
@@ -3864,9 +3907,9 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
return sys_mkdirat(AT_FDCWD, pathname, mode);
}
-int vfs_rmdir(struct inode *dir, struct dentry *dentry)
+int vfs_rmdir2(struct vfsmount *mnt, struct inode *dir, struct dentry *dentry)
{
- int error = may_delete(dir, dentry, 1);
+ int error = may_delete(mnt, dir, dentry, 1);
if (error)
return error;
@@ -3901,6 +3944,12 @@ out:
d_delete(dentry);
return error;
}
+EXPORT_SYMBOL(vfs_rmdir2);
+
+int vfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ return vfs_rmdir2(NULL, dir, dentry); /* may_delete() sees a NULL mnt */
+}
EXPORT_SYMBOL(vfs_rmdir);
static long do_rmdir(int dfd, const char __user *pathname)
@@ -3946,7 +3995,7 @@ retry:
error = security_path_rmdir(&path, dentry);
if (error)
goto exit3;
- error = vfs_rmdir(path.dentry->d_inode, dentry);
+ error = vfs_rmdir2(path.mnt, path.dentry->d_inode, dentry);
exit3:
dput(dentry);
exit2:
@@ -3985,10 +4034,10 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
* be appropriate for callers that expect the underlying filesystem not
* to be NFS exported.
*/
-int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode)
+int vfs_unlink2(struct vfsmount *mnt, struct inode *dir, struct dentry *dentry, struct inode **delegated_inode)
{
struct inode *target = dentry->d_inode;
- int error = may_delete(dir, dentry, 0);
+ int error = may_delete(mnt, dir, dentry, 0);
if (error)
return error;
@@ -4023,6 +4072,12 @@ out:
return error;
}
+EXPORT_SYMBOL(vfs_unlink2);
+
+int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode)
+{ /* mount-less form: may_delete() in vfs_unlink2() sees a NULL mnt */
+ return vfs_unlink2(NULL, dir, dentry, delegated_inode);
+}
EXPORT_SYMBOL(vfs_unlink);
/*
@@ -4070,7 +4125,7 @@ retry_deleg:
error = security_path_unlink(&path, dentry);
if (error)
goto exit2;
- error = vfs_unlink(path.dentry->d_inode, dentry, &delegated_inode);
+ error = vfs_unlink2(path.mnt, path.dentry->d_inode, dentry, &delegated_inode);
exit2:
dput(dentry);
}
@@ -4120,9 +4175,9 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname)
return do_unlinkat(AT_FDCWD, pathname);
}
-int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
+int vfs_symlink2(struct vfsmount *mnt, struct inode *dir, struct dentry *dentry, const char *oldname)
{
- int error = may_create(dir, dentry);
+ int error = may_create(mnt, dir, dentry);
if (error)
return error;
@@ -4139,6 +4194,12 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
fsnotify_create(dir, dentry);
return error;
}
+EXPORT_SYMBOL(vfs_symlink2);
+
+int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
+{
+ return vfs_symlink2(NULL, dir, dentry, oldname); /* may_create() sees a NULL mnt */
+}
EXPORT_SYMBOL(vfs_symlink);
SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
@@ -4161,7 +4222,7 @@ retry:
error = security_path_symlink(&path, dentry, from->name);
if (!error)
- error = vfs_symlink(path.dentry->d_inode, dentry, from->name);
+ error = vfs_symlink2(path.mnt, path.dentry->d_inode, dentry, from->name);
done_path_create(&path, dentry);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
@@ -4196,7 +4257,7 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn
* be appropriate for callers that expect the underlying filesystem not
* to be NFS exported.
*/
-int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode)
+int vfs_link2(struct vfsmount *mnt, struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode)
{
struct inode *inode = old_dentry->d_inode;
unsigned max_links = dir->i_sb->s_max_links;
@@ -4205,7 +4266,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
if (!inode)
return -ENOENT;
- error = may_create(dir, new_dentry);
+ error = may_create(mnt, dir, new_dentry);
if (error)
return error;
@@ -4255,6 +4316,12 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
fsnotify_link(dir, inode, new_dentry);
return error;
}
+EXPORT_SYMBOL(vfs_link2);
+
+int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode)
+{ /* mount-less form: may_create() in vfs_link2() sees a NULL mnt */
+ return vfs_link2(NULL, old_dentry, dir, new_dentry, delegated_inode);
+}
EXPORT_SYMBOL(vfs_link);
/*
@@ -4310,7 +4377,7 @@ retry:
error = security_path_link(old_path.dentry, &new_path, new_dentry);
if (error)
goto out_dput;
- error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode);
+ error = vfs_link2(old_path.mnt, old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode);
out_dput:
done_path_create(&new_path, new_dentry);
if (delegated_inode) {
@@ -4385,7 +4452,8 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
* ->i_mutex on parents, which works but leads to some truly excessive
* locking].
*/
-int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+int vfs_rename2(struct vfsmount *mnt,
+ struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
struct inode **delegated_inode, unsigned int flags)
{
@@ -4404,19 +4472,19 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (d_real_inode(old_dentry) == d_real_inode(new_dentry))
return 0;
- error = may_delete(old_dir, old_dentry, is_dir);
+ error = may_delete(mnt, old_dir, old_dentry, is_dir);
if (error)
return error;
if (!target) {
- error = may_create(new_dir, new_dentry);
+ error = may_create(mnt, new_dir, new_dentry);
} else {
new_is_dir = d_is_dir(new_dentry);
if (!(flags & RENAME_EXCHANGE))
- error = may_delete(new_dir, new_dentry, is_dir);
+ error = may_delete(mnt, new_dir, new_dentry, is_dir);
else
- error = may_delete(new_dir, new_dentry, new_is_dir);
+ error = may_delete(mnt, new_dir, new_dentry, new_is_dir);
}
if (error)
return error;
@@ -4430,12 +4498,12 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
*/
if (new_dir != old_dir) {
if (is_dir) {
- error = inode_permission(source, MAY_WRITE);
+ error = inode_permission2(mnt, source, MAY_WRITE);
if (error)
return error;
}
if ((flags & RENAME_EXCHANGE) && new_is_dir) {
- error = inode_permission(target, MAY_WRITE);
+ error = inode_permission2(mnt, target, MAY_WRITE);
if (error)
return error;
}
@@ -4512,6 +4580,14 @@ out:
return error;
}
+EXPORT_SYMBOL(vfs_rename2);
+
+int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ struct inode **delegated_inode, unsigned int flags)
+{ /* mount-less form: every permission check in vfs_rename2() sees a NULL mnt */
+ return vfs_rename2(NULL, old_dir, old_dentry, new_dir, new_dentry, delegated_inode, flags);
+}
EXPORT_SYMBOL(vfs_rename);
SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
@@ -4625,7 +4701,7 @@ retry_deleg:
&new_path, new_dentry, flags);
if (error)
goto exit5;
- error = vfs_rename(old_path.dentry->d_inode, old_dentry,
+ error = vfs_rename2(old_path.mnt, old_path.dentry->d_inode, old_dentry,
new_path.dentry->d_inode, new_dentry,
&delegated_inode, flags);
exit5:
@@ -4670,7 +4746,7 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna
int vfs_whiteout(struct inode *dir, struct dentry *dentry)
{
- int error = may_create(dir, dentry);
+ int error = may_create(NULL, dir, dentry);
if (error)
return error;
diff --git a/fs/namespace.c b/fs/namespace.c
index 41f906a6f5d9..77b46bf40f09 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -227,6 +227,7 @@ static struct mount *alloc_vfsmnt(const char *name)
mnt->mnt_count = 1;
mnt->mnt_writers = 0;
#endif
+ mnt->mnt.data = NULL;
INIT_HLIST_NODE(&mnt->mnt_hash);
INIT_LIST_HEAD(&mnt->mnt_child);
@@ -581,6 +582,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
static void free_vfsmnt(struct mount *mnt)
{
+ kfree(mnt->mnt.data);
kfree_const(mnt->mnt_devname);
#ifdef CONFIG_SMP
free_percpu(mnt->mnt_pcp);
@@ -984,10 +986,18 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
if (!mnt)
return ERR_PTR(-ENOMEM);
+ if (type->alloc_mnt_data) {
+ mnt->mnt.data = type->alloc_mnt_data();
+ if (!mnt->mnt.data) {
+ mnt_free_id(mnt);
+ free_vfsmnt(mnt);
+ return ERR_PTR(-ENOMEM);
+ }
+ }
if (flags & MS_KERNMOUNT)
mnt->mnt.mnt_flags = MNT_INTERNAL;
- root = mount_fs(type, flags, name, data);
+ root = mount_fs(type, flags, name, &mnt->mnt, data);
if (IS_ERR(root)) {
mnt_free_id(mnt);
free_vfsmnt(mnt);
@@ -1031,6 +1041,14 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
if (!mnt)
return ERR_PTR(-ENOMEM);
+ if (sb->s_op->clone_mnt_data) {
+ mnt->mnt.data = sb->s_op->clone_mnt_data(old->mnt.data);
+ if (!mnt->mnt.data) {
+ err = -ENOMEM;
+ goto out_free;
+ }
+ }
+
if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
mnt->mnt_group_id = 0; /* not a peer of original */
else
@@ -2342,8 +2360,14 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
err = change_mount_flags(path->mnt, flags);
else if (!capable(CAP_SYS_ADMIN))
err = -EPERM;
- else
- err = do_remount_sb(sb, flags, data, 0);
+ else {
+ err = do_remount_sb2(path->mnt, sb, flags, data, 0);
+ namespace_lock();
+ lock_mount_hash();
+ propagate_remount(mnt);
+ unlock_mount_hash();
+ namespace_unlock();
+ }
if (!err) {
lock_mount_hash();
mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 258e8f635148..cef9885de214 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -498,7 +498,7 @@ static int fanotify_find_path(int dfd, const char __user *filename,
}
/* you can only watch an inode if you have read permissions on it */
- ret = inode_permission(path->dentry->d_inode, MAY_READ);
+ ret = inode_permission2(path->mnt, path->dentry->d_inode, MAY_READ);
if (ret)
path_put(path);
out:
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 69d1ea3d292a..4da5c6a1134f 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -337,7 +337,7 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns
if (error)
return error;
/* you can only watch an inode if you have read permissions on it */
- error = inode_permission(path->dentry->d_inode, MAY_READ);
+ error = inode_permission2(path->mnt, path->dentry->d_inode, MAY_READ);
if (error)
path_put(path);
return error;
@@ -702,6 +702,8 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
struct fsnotify_group *group;
struct inode *inode;
struct path path;
+ struct path alteredpath;
+ struct path *canonical_path = &path;
struct fd f;
int ret;
unsigned flags = 0;
@@ -741,13 +743,22 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
if (ret)
goto fput_and_out;
+ /* support stacked filesystems */
+ if(path.dentry && path.dentry->d_op) {
+ if (path.dentry->d_op->d_canonical_path) {
+ path.dentry->d_op->d_canonical_path(&path, &alteredpath);
+ canonical_path = &alteredpath;
+ path_put(&path);
+ }
+ }
+
/* inode held in place by reference to path; group by fget on fd */
- inode = path.dentry->d_inode;
+ inode = canonical_path->dentry->d_inode;
group = f.file->private_data;
/* create/update an inode mark */
ret = inotify_update_watch(group, inode, mask);
- path_put(&path);
+ path_put(canonical_path);
fput_and_out:
fdput(f);
return ret;
diff --git a/fs/open.c b/fs/open.c
index a6c6244f4993..73b7d19129a1 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -34,8 +34,8 @@
#include "internal.h"
-int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
- struct file *filp)
+int do_truncate2(struct vfsmount *mnt, struct dentry *dentry, loff_t length,
+ unsigned int time_attrs, struct file *filp)
{
int ret;
struct iattr newattrs;
@@ -60,18 +60,25 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
inode_lock(dentry->d_inode);
/* Note any delegations or leases have already been broken: */
- ret = notify_change(dentry, &newattrs, NULL);
+ ret = notify_change2(mnt, dentry, &newattrs, NULL);
inode_unlock(dentry->d_inode);
return ret;
}
+int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
+ struct file *filp)
+{
+ return do_truncate2(NULL, dentry, length, time_attrs, filp); /* NULL mnt is handed on to notify_change2() */
+}
long vfs_truncate(const struct path *path, loff_t length)
{
struct inode *inode;
+ struct vfsmount *mnt;
struct dentry *upperdentry;
long error;
inode = path->dentry->d_inode;
+ mnt = path->mnt;
/* For directories it's -EISDIR, for other non-regulars - -EINVAL */
if (S_ISDIR(inode->i_mode))
@@ -83,7 +90,7 @@ long vfs_truncate(const struct path *path, loff_t length)
if (error)
goto out;
- error = inode_permission(inode, MAY_WRITE);
+ error = inode_permission2(mnt, inode, MAY_WRITE);
if (error)
goto mnt_drop_write_and_out;
@@ -117,7 +124,7 @@ long vfs_truncate(const struct path *path, loff_t length)
if (!error)
error = security_path_truncate(path);
if (!error)
- error = do_truncate(path->dentry, length, 0, NULL);
+ error = do_truncate2(mnt, path->dentry, length, 0, NULL);
put_write_and_out:
put_write_access(upperdentry->d_inode);
@@ -166,6 +173,7 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
{
struct inode *inode;
struct dentry *dentry;
+ struct vfsmount *mnt;
struct fd f;
int error;
@@ -182,6 +190,7 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
small = 0;
dentry = f.file->f_path.dentry;
+ mnt = f.file->f_path.mnt;
inode = dentry->d_inode;
error = -EINVAL;
if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE))
@@ -201,7 +210,7 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
if (!error)
error = security_path_truncate(&f.file->f_path);
if (!error)
- error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, f.file);
+ error = do_truncate2(mnt, dentry, length, ATTR_MTIME|ATTR_CTIME, f.file);
sb_end_write(inode->i_sb);
out_putf:
fdput(f);
@@ -357,6 +366,7 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
struct cred *override_cred;
struct path path;
struct inode *inode;
+ struct vfsmount *mnt;
int res;
unsigned int lookup_flags = LOOKUP_FOLLOW;
@@ -387,6 +397,7 @@ retry:
goto out;
inode = d_backing_inode(path.dentry);
+ mnt = path.mnt;
if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) {
/*
@@ -398,7 +409,7 @@ retry:
goto out_path_release;
}
- res = inode_permission(inode, mode | MAY_ACCESS);
+ res = inode_permission2(mnt, inode, mode | MAY_ACCESS);
/* SuS v2 requires we report a read only fs too */
if (res || !(mode & S_IWOTH) || special_file(inode->i_mode))
goto out_path_release;
@@ -442,7 +453,7 @@ retry:
if (error)
goto out;
- error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
+ error = inode_permission2(path.mnt, path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
if (error)
goto dput_and_out;
@@ -462,6 +473,7 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd)
{
struct fd f = fdget_raw(fd);
struct inode *inode;
+ struct vfsmount *mnt;
int error = -EBADF;
error = -EBADF;
@@ -469,12 +481,13 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd)
goto out;
inode = file_inode(f.file);
+ mnt = f.file->f_path.mnt;
error = -ENOTDIR;
if (!S_ISDIR(inode->i_mode))
goto out_putf;
- error = inode_permission(inode, MAY_EXEC | MAY_CHDIR);
+ error = inode_permission2(mnt, inode, MAY_EXEC | MAY_CHDIR);
if (!error)
set_fs_pwd(current->fs, &f.file->f_path);
out_putf:
@@ -493,7 +506,7 @@ retry:
if (error)
goto out;
- error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
+ error = inode_permission2(path.mnt, path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
if (error)
goto dput_and_out;
@@ -533,7 +546,7 @@ retry_deleg:
goto out_unlock;
newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
- error = notify_change(path->dentry, &newattrs, &delegated_inode);
+ error = notify_change2(path->mnt, path->dentry, &newattrs, &delegated_inode);
out_unlock:
inode_unlock(inode);
if (delegated_inode) {
@@ -613,7 +626,7 @@ retry_deleg:
inode_lock(inode);
error = security_path_chown(path, uid, gid);
if (!error)
- error = notify_change(path->dentry, &newattrs, &delegated_inode);
+ error = notify_change2(path->mnt, path->dentry, &newattrs, &delegated_inode);
inode_unlock(inode);
if (delegated_inode) {
error = break_deleg_wait(&delegated_inode);
diff --git a/fs/pnode.c b/fs/pnode.c
index d15c63e97ef1..ddb846f878b8 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -609,3 +609,37 @@ int propagate_umount(struct list_head *list)
return 0;
}
+
+/*
+ * Iterates over all slaves, and slaves of slaves.
+ */
+static struct mount *next_descendent(struct mount *root, struct mount *cur)
+{
+ if (!IS_MNT_NEW(cur) && !list_empty(&cur->mnt_slave_list))
+ return first_slave(cur);
+ do {
+ struct mount *master = cur->mnt_master;
+
+ if (!master || cur->mnt_slave.next != &master->mnt_slave_list) {
+ struct mount *next = next_slave(cur);
+
+ return (next == root) ? NULL : next;
+ }
+ cur = master;
+ } while (cur != root);
+ return NULL;
+}
+
+void propagate_remount(struct mount *mnt)
+{
+ struct mount *m = mnt;
+ struct super_block *sb = mnt->mnt.mnt_sb;
+
+ if (sb->s_op->copy_mnt_data) {
+ m = next_descendent(mnt, m);
+ while (m) {
+ sb->s_op->copy_mnt_data(m->mnt.data, mnt->mnt.data);
+ m = next_descendent(mnt, m);
+ }
+ }
+}
diff --git a/fs/pnode.h b/fs/pnode.h
index dc87e65becd2..a9a6576540ad 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -44,6 +44,7 @@ int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
int propagate_umount(struct list_head *);
int propagate_mount_busy(struct mount *, int);
void propagate_mount_unlock(struct mount *);
+void propagate_remount(struct mount *);
void mnt_release_group_id(struct mount *);
int get_dominating_id(struct mount *mnt, const struct path *root);
unsigned int mnt_get_count(struct mount *mnt);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 79702d405ba7..0fffb9b9071a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2953,8 +2953,8 @@ static const struct pid_entry tgid_base_stuff[] = {
ONE("cgroup", S_IRUGO, proc_cgroup_show),
#endif
ONE("oom_score", S_IRUGO, proc_oom_score),
- REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
- REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
+ REG("oom_adj", S_IRUSR, proc_oom_adj_operations),
+ REG("oom_score_adj", S_IRUSR, proc_oom_score_adj_operations),
#ifdef CONFIG_AUDITSYSCALL
REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
REG("sessionid", S_IRUGO, proc_sessionid_operations),
@@ -3344,8 +3344,8 @@ static const struct pid_entry tid_base_stuff[] = {
ONE("cgroup", S_IRUGO, proc_cgroup_show),
#endif
ONE("oom_score", S_IRUGO, proc_oom_score),
- REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
- REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
+ REG("oom_adj", S_IRUSR, proc_oom_adj_operations),
+ REG("oom_score_adj", S_IRUSR, proc_oom_score_adj_operations),
#ifdef CONFIG_AUDITSYSCALL
REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
REG("sessionid", S_IRUGO, proc_sessionid_operations),
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 5138e781737a..9182f84efa9a 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -127,6 +127,56 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
}
#endif
+static void seq_print_vma_name(struct seq_file *m, struct vm_area_struct *vma)
+{
+ const char __user *name = vma_get_anon_name(vma);
+ struct mm_struct *mm = vma->vm_mm;
+
+ unsigned long page_start_vaddr;
+ unsigned long page_offset;
+ unsigned long num_pages;
+ unsigned long max_len = NAME_MAX;
+ int i;
+
+ page_start_vaddr = (unsigned long)name & PAGE_MASK;
+ page_offset = (unsigned long)name - page_start_vaddr;
+ num_pages = DIV_ROUND_UP(page_offset + max_len, PAGE_SIZE);
+
+ seq_puts(m, "[anon:");
+
+ for (i = 0; i < num_pages; i++) {
+ int len;
+ int write_len;
+ const char *kaddr;
+ long pages_pinned;
+ struct page *page;
+
+ pages_pinned = get_user_pages_remote(current, mm,
+ page_start_vaddr, 1, 0, &page, NULL);
+ if (pages_pinned < 1) {
+ seq_puts(m, "<fault>]");
+ return;
+ }
+
+ kaddr = (const char *)kmap(page);
+ len = min(max_len, PAGE_SIZE - page_offset);
+ write_len = strnlen(kaddr + page_offset, len);
+ seq_write(m, kaddr + page_offset, write_len);
+ kunmap(page);
+ put_page(page);
+
+ /* if strnlen hit a null terminator then we're done */
+ if (write_len != len)
+ break;
+
+ max_len -= len;
+ page_offset = 0;
+ page_start_vaddr += PAGE_SIZE;
+ }
+
+ seq_putc(m, ']');
+}
+
static void vma_stop(struct proc_maps_private *priv)
{
struct mm_struct *mm = priv->mm;
@@ -341,8 +391,15 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
goto done;
}
- if (is_stack(priv, vma))
+ if (is_stack(priv, vma)) {
name = "[stack]";
+ goto done;
+ }
+
+ if (vma_get_anon_name(vma)) {
+ seq_pad(m, ' ');
+ seq_print_vma_name(m, vma);
+ }
}
done:
@@ -756,6 +813,12 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
show_map_vma(m, vma, is_pid);
+ if (vma_get_anon_name(vma)) {
+ seq_puts(m, "Name: ");
+ seq_print_vma_name(m, vma);
+ seq_putc(m, '\n');
+ }
+
seq_printf(m,
"Size: %8lu kB\n"
"Rss: %8lu kB\n"
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 3f1190d18991..6863773aff25 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -118,7 +118,9 @@ static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt)
if (err)
goto out;
show_mnt_opts(m, mnt);
- if (sb->s_op->show_options)
+ if (sb->s_op->show_options2)
+ err = sb->s_op->show_options2(mnt, m, mnt_path.dentry);
+ else if (sb->s_op->show_options)
err = sb->s_op->show_options(m, mnt_path.dentry);
seq_puts(m, " 0 0\n");
out:
@@ -180,7 +182,9 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
err = show_sb_opts(m, sb);
if (err)
goto out;
- if (sb->s_op->show_options)
+ if (sb->s_op->show_options2) {
+ err = sb->s_op->show_options2(mnt, m, mnt->mnt_root);
+ } else if (sb->s_op->show_options)
err = sb->s_op->show_options(m, mnt->mnt_root);
seq_putc(m, '\n');
out:
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 8b09271e5d66..8e151fb9bb76 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -550,6 +550,12 @@ static int ramoops_parse_dt(struct platform_device *pdev,
return 0;
}
+void notrace ramoops_console_write_buf(const char *buf, size_t size)
+{
+ struct ramoops_context *cxt = &oops_cxt;
+ persistent_ram_write(cxt->cprz, buf, size);
+}
+
static int ramoops_probe(struct platform_device *pdev)
{
struct device *dev = &pdev->dev;
diff --git a/fs/sdcardfs/Kconfig b/fs/sdcardfs/Kconfig
new file mode 100644
index 000000000000..a1c103316ac7
--- /dev/null
+++ b/fs/sdcardfs/Kconfig
@@ -0,0 +1,13 @@
+config SDCARD_FS
+ tristate "sdcard file system"
+ depends on CONFIGFS_FS
+ default n
+ help
+ Sdcardfs is based on the Wrapfs file system.
+
+config SDCARD_FS_FADV_NOACTIVE
+ bool "sdcardfs fadvise noactive support"
+ depends on FADV_NOACTIVE
+ default y
+ help
+ Sdcardfs supports fadvise noactive mode.
diff --git a/fs/sdcardfs/Makefile b/fs/sdcardfs/Makefile
new file mode 100644
index 000000000000..b84fbb2b45a4
--- /dev/null
+++ b/fs/sdcardfs/Makefile
@@ -0,0 +1,7 @@
+SDCARDFS_VERSION="0.1"
+
+EXTRA_CFLAGS += -DSDCARDFS_VERSION=\"$(SDCARDFS_VERSION)\"
+
+obj-$(CONFIG_SDCARD_FS) += sdcardfs.o
+
+sdcardfs-y := dentry.o file.o inode.o main.o super.o lookup.o mmap.o packagelist.o derived_perm.o
diff --git a/fs/sdcardfs/dentry.c b/fs/sdcardfs/dentry.c
new file mode 100644
index 000000000000..e9426a61d04a
--- /dev/null
+++ b/fs/sdcardfs/dentry.c
@@ -0,0 +1,193 @@
+/*
+ * fs/sdcardfs/dentry.c
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ * Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009 Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed. It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#include "sdcardfs.h"
+#include "linux/ctype.h"
+
+/*
+ * returns: -ERRNO if error (returned to user)
+ * 0: tell VFS to invalidate dentry
+ * 1: dentry is valid
+ */
+static int sdcardfs_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ int err = 1;
+ struct path parent_lower_path, lower_path;
+ struct dentry *parent_dentry = NULL;
+ struct dentry *parent_lower_dentry = NULL;
+ struct dentry *lower_cur_parent_dentry = NULL;
+ struct dentry *lower_dentry = NULL;
+ struct inode *inode;
+ struct sdcardfs_inode_data *data;
+
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
+ spin_lock(&dentry->d_lock);
+ if (IS_ROOT(dentry)) {
+ spin_unlock(&dentry->d_lock);
+ return 1;
+ }
+ spin_unlock(&dentry->d_lock);
+
+ /* Check for an uninitialized obb dentry and
+ * whether the base obbpath has been changed.
+ */
+ if (is_obbpath_invalid(dentry)) {
+ d_drop(dentry);
+ return 0;
+ }
+
+ parent_dentry = dget_parent(dentry);
+ sdcardfs_get_lower_path(parent_dentry, &parent_lower_path);
+ sdcardfs_get_real_lower(dentry, &lower_path);
+ parent_lower_dentry = parent_lower_path.dentry;
+ lower_dentry = lower_path.dentry;
+ lower_cur_parent_dentry = dget_parent(lower_dentry);
+
+ if ((lower_dentry->d_flags & DCACHE_OP_REVALIDATE)) {
+ err = lower_dentry->d_op->d_revalidate(lower_dentry, flags);
+ if (err == 0) {
+ d_drop(dentry);
+ goto out;
+ }
+ }
+
+ spin_lock(&lower_dentry->d_lock);
+ if (d_unhashed(lower_dentry)) {
+ spin_unlock(&lower_dentry->d_lock);
+ d_drop(dentry);
+ err = 0;
+ goto out;
+ }
+ spin_unlock(&lower_dentry->d_lock);
+
+ if (parent_lower_dentry != lower_cur_parent_dentry) {
+ d_drop(dentry);
+ err = 0;
+ goto out;
+ }
+
+ if (dentry < lower_dentry) {
+ spin_lock(&dentry->d_lock);
+ spin_lock_nested(&lower_dentry->d_lock, DENTRY_D_LOCK_NESTED);
+ } else {
+ spin_lock(&lower_dentry->d_lock);
+ spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+ }
+
+ if (!qstr_case_eq(&dentry->d_name, &lower_dentry->d_name)) {
+ __d_drop(dentry);
+ err = 0;
+ }
+
+ if (dentry < lower_dentry) {
+ spin_unlock(&lower_dentry->d_lock);
+ spin_unlock(&dentry->d_lock);
+ } else {
+ spin_unlock(&dentry->d_lock);
+ spin_unlock(&lower_dentry->d_lock);
+ }
+ if (!err)
+ goto out;
+
+ /* If our top's inode is gone, we may be out of date */
+ inode = igrab(d_inode(dentry));
+ if (inode) {
+ data = top_data_get(SDCARDFS_I(inode));
+ if (!data || data->abandoned) {
+ d_drop(dentry);
+ err = 0;
+ }
+ if (data)
+ data_put(data);
+ iput(inode);
+ }
+
+out:
+ dput(parent_dentry);
+ dput(lower_cur_parent_dentry);
+ sdcardfs_put_lower_path(parent_dentry, &parent_lower_path);
+ sdcardfs_put_real_lower(dentry, &lower_path);
+ return err;
+}
+
+static void sdcardfs_d_release(struct dentry *dentry)
+{
+ /* release and reset the lower paths */
+ if (has_graft_path(dentry))
+ sdcardfs_put_reset_orig_path(dentry);
+ sdcardfs_put_reset_lower_path(dentry);
+ free_dentry_private_data(dentry);
+}
+
+static int sdcardfs_hash_ci(const struct dentry *dentry,
+ struct qstr *qstr)
+{
+ /*
+ * This function is a copy of vfat_hashi().
+ * FIXME: Should we support national languages?
+ * Refer to vfat_hashi():
+ * struct nls_table *t = MSDOS_SB(dentry->d_sb)->nls_io;
+ */
+ const unsigned char *name;
+ unsigned int len;
+ unsigned long hash;
+
+ name = qstr->name;
+ len = qstr->len;
+
+ hash = init_name_hash(dentry);
+ while (len--)
+ hash = partial_name_hash(tolower(*name++), hash);
+ qstr->hash = end_name_hash(hash);
+
+ return 0;
+}
+
+/*
+ * Case-insensitive comparison of two names (returns 0 on a match, 1 otherwise).
+ */
+static int sdcardfs_cmp_ci(const struct dentry *dentry,
+ unsigned int len, const char *str, const struct qstr *name)
+{
+ /* FIXME Should we support national language? */
+
+ if (name->len == len) {
+ if (str_n_case_eq(name->name, str, len))
+ return 0;
+ }
+ return 1;
+}
+
+static void sdcardfs_canonical_path(const struct path *path,
+ struct path *actual_path)
+{
+ sdcardfs_get_real_lower(path->dentry, actual_path);
+}
+
+const struct dentry_operations sdcardfs_ci_dops = {
+ .d_revalidate = sdcardfs_d_revalidate,
+ .d_release = sdcardfs_d_release,
+ .d_hash = sdcardfs_hash_ci,
+ .d_compare = sdcardfs_cmp_ci,
+ .d_canonical_path = sdcardfs_canonical_path,
+};
+
diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c
new file mode 100644
index 000000000000..1239d1cd208b
--- /dev/null
+++ b/fs/sdcardfs/derived_perm.c
@@ -0,0 +1,471 @@
+/*
+ * fs/sdcardfs/derived_perm.c
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ * Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009 Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed. It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#include "sdcardfs.h"
+
+/* copy derived state from parent inode */
+static void inherit_derived_state(struct inode *parent, struct inode *child)
+{
+ struct sdcardfs_inode_info *pi = SDCARDFS_I(parent);
+ struct sdcardfs_inode_info *ci = SDCARDFS_I(child);
+
+ ci->data->perm = PERM_INHERIT;
+ ci->data->userid = pi->data->userid;
+ ci->data->d_uid = pi->data->d_uid;
+ ci->data->under_android = pi->data->under_android;
+ ci->data->under_cache = pi->data->under_cache;
+ ci->data->under_obb = pi->data->under_obb;
+ set_top(ci, pi->top_data);
+}
+
+/* helper function for derived state */
+void setup_derived_state(struct inode *inode, perm_t perm, userid_t userid,
+ uid_t uid, bool under_android,
+ struct sdcardfs_inode_data *top)
+{
+ struct sdcardfs_inode_info *info = SDCARDFS_I(inode);
+
+ info->data->perm = perm;
+ info->data->userid = userid;
+ info->data->d_uid = uid;
+ info->data->under_android = under_android;
+ info->data->under_cache = false;
+ info->data->under_obb = false;
+ set_top(info, top);
+}
+
+/* While renaming, there is a point where we want the path from dentry,
+ * but the name from newdentry
+ */
+void get_derived_permission_new(struct dentry *parent, struct dentry *dentry,
+ const struct qstr *name)
+{
+ struct sdcardfs_inode_info *info = SDCARDFS_I(d_inode(dentry));
+ struct sdcardfs_inode_data *parent_data =
+ SDCARDFS_I(d_inode(parent))->data;
+ appid_t appid;
+ unsigned long user_num;
+ int err;
+ struct qstr q_Android = QSTR_LITERAL("Android");
+ struct qstr q_data = QSTR_LITERAL("data");
+ struct qstr q_obb = QSTR_LITERAL("obb");
+ struct qstr q_media = QSTR_LITERAL("media");
+ struct qstr q_cache = QSTR_LITERAL("cache");
+
+ /* By default, each inode inherits from its parent.
+ * The properties are maintained in its private fields
+ * because the inode attributes will be overwritten with those of
+ * its lower inode.
+ * These values are used by our custom permission call instead
+ * of the inode permissions.
+ */
+
+ inherit_derived_state(d_inode(parent), d_inode(dentry));
+
+ /* Files don't get special labels */
+ if (!S_ISDIR(d_inode(dentry)->i_mode))
+ return;
+ /* Derive custom permissions based on parent and current node */
+ switch (parent_data->perm) {
+ case PERM_INHERIT:
+ case PERM_ANDROID_PACKAGE_CACHE:
+ /* Already inherited above */
+ break;
+ case PERM_PRE_ROOT:
+ /* Legacy internal layout places users at top level */
+ info->data->perm = PERM_ROOT;
+ err = kstrtoul(name->name, 10, &user_num);
+ if (err)
+ info->data->userid = 0;
+ else
+ info->data->userid = user_num;
+ set_top(info, info->data);
+ break;
+ case PERM_ROOT:
+ /* Assume masked off by default. */
+ if (qstr_case_eq(name, &q_Android)) {
+ /* App-specific directories inside; let anyone traverse */
+ info->data->perm = PERM_ANDROID;
+ info->data->under_android = true;
+ set_top(info, info->data);
+ }
+ break;
+ case PERM_ANDROID:
+ if (qstr_case_eq(name, &q_data)) {
+ /* App-specific directories inside; let anyone traverse */
+ info->data->perm = PERM_ANDROID_DATA;
+ set_top(info, info->data);
+ } else if (qstr_case_eq(name, &q_obb)) {
+ /* App-specific directories inside; let anyone traverse */
+ info->data->perm = PERM_ANDROID_OBB;
+ info->data->under_obb = true;
+ set_top(info, info->data);
+ /* Single OBB directory is always shared */
+ } else if (qstr_case_eq(name, &q_media)) {
+ /* App-specific directories inside; let anyone traverse */
+ info->data->perm = PERM_ANDROID_MEDIA;
+ set_top(info, info->data);
+ }
+ break;
+ case PERM_ANDROID_OBB:
+ case PERM_ANDROID_DATA:
+ case PERM_ANDROID_MEDIA:
+ info->data->perm = PERM_ANDROID_PACKAGE;
+ appid = get_appid(name->name);
+ if (appid != 0 && !is_excluded(name->name, parent_data->userid))
+ info->data->d_uid =
+ multiuser_get_uid(parent_data->userid, appid);
+ set_top(info, info->data);
+ break;
+ case PERM_ANDROID_PACKAGE:
+ if (qstr_case_eq(name, &q_cache)) {
+ info->data->perm = PERM_ANDROID_PACKAGE_CACHE;
+ info->data->under_cache = true;
+ }
+ break;
+ }
+}
+
+void get_derived_permission(struct dentry *parent, struct dentry *dentry)
+{
+ get_derived_permission_new(parent, dentry, &dentry->d_name);
+}
+
+static appid_t get_type(const char *name)
+{
+ const char *ext = strrchr(name, '.');
+ appid_t id;
+
+ if (ext && ext[0]) {
+ ext = &ext[1];
+ id = get_ext_gid(ext);
+ return id?:AID_MEDIA_RW;
+ }
+ return AID_MEDIA_RW;
+}
+
+void fixup_lower_ownership(struct dentry *dentry, const char *name)
+{
+ struct path path;
+ struct inode *inode;
+ struct inode *delegated_inode = NULL;
+ int error;
+ struct sdcardfs_inode_info *info;
+ struct sdcardfs_inode_data *info_d;
+ struct sdcardfs_inode_data *info_top;
+ perm_t perm;
+ struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+ uid_t uid = sbi->options.fs_low_uid;
+ gid_t gid = sbi->options.fs_low_gid;
+ struct iattr newattrs;
+
+ info = SDCARDFS_I(d_inode(dentry));
+ info_d = info->data;
+ perm = info_d->perm;
+ if (info_d->under_obb) {
+ perm = PERM_ANDROID_OBB;
+ } else if (info_d->under_cache) {
+ perm = PERM_ANDROID_PACKAGE_CACHE;
+ } else if (perm == PERM_INHERIT) {
+ info_top = top_data_get(info);
+ perm = info_top->perm;
+ data_put(info_top);
+ }
+
+ switch (perm) {
+ case PERM_ROOT:
+ case PERM_ANDROID:
+ case PERM_ANDROID_DATA:
+ case PERM_ANDROID_MEDIA:
+ case PERM_ANDROID_PACKAGE:
+ case PERM_ANDROID_PACKAGE_CACHE:
+ uid = multiuser_get_uid(info_d->userid, uid);
+ break;
+ case PERM_ANDROID_OBB:
+ uid = AID_MEDIA_OBB;
+ break;
+ case PERM_PRE_ROOT:
+ default:
+ break;
+ }
+ switch (perm) {
+ case PERM_ROOT:
+ case PERM_ANDROID:
+ case PERM_ANDROID_DATA:
+ case PERM_ANDROID_MEDIA:
+ if (S_ISDIR(d_inode(dentry)->i_mode))
+ gid = multiuser_get_uid(info_d->userid, AID_MEDIA_RW);
+ else
+ gid = multiuser_get_uid(info_d->userid, get_type(name));
+ break;
+ case PERM_ANDROID_OBB:
+ gid = AID_MEDIA_OBB;
+ break;
+ case PERM_ANDROID_PACKAGE:
+ if (uid_is_app(info_d->d_uid))
+ gid = multiuser_get_ext_gid(info_d->d_uid);
+ else
+ gid = multiuser_get_uid(info_d->userid, AID_MEDIA_RW);
+ break;
+ case PERM_ANDROID_PACKAGE_CACHE:
+ if (uid_is_app(info_d->d_uid))
+ gid = multiuser_get_ext_cache_gid(info_d->d_uid);
+ else
+ gid = multiuser_get_uid(info_d->userid, AID_MEDIA_RW);
+ break;
+ case PERM_PRE_ROOT:
+ default:
+ break;
+ }
+
+ sdcardfs_get_lower_path(dentry, &path);
+ inode = d_inode(path.dentry);
+ if (d_inode(path.dentry)->i_gid.val != gid || d_inode(path.dentry)->i_uid.val != uid) {
+retry_deleg:
+ newattrs.ia_valid = ATTR_GID | ATTR_UID | ATTR_FORCE;
+ newattrs.ia_uid = make_kuid(current_user_ns(), uid);
+ newattrs.ia_gid = make_kgid(current_user_ns(), gid);
+ if (!S_ISDIR(inode->i_mode))
+ newattrs.ia_valid |=
+ ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
+ inode_lock(inode);
+ error = security_path_chown(&path, newattrs.ia_uid, newattrs.ia_gid);
+ if (!error)
+ error = notify_change2(path.mnt, path.dentry, &newattrs, &delegated_inode);
+ inode_unlock(inode);
+ if (delegated_inode) {
+ error = break_deleg_wait(&delegated_inode);
+ if (!error)
+ goto retry_deleg;
+ }
+ if (error)
+ pr_debug("sdcardfs: Failed to touch up lower fs gid/uid for %s\n", name);
+ }
+ sdcardfs_put_lower_path(dentry, &path);
+}
+
+static int descendant_may_need_fixup(struct sdcardfs_inode_data *data,
+ struct limit_search *limit)
+{
+ if (data->perm == PERM_ROOT)
+ return (limit->flags & BY_USERID) ?
+ data->userid == limit->userid : 1;
+ if (data->perm == PERM_PRE_ROOT || data->perm == PERM_ANDROID)
+ return 1;
+ return 0;
+}
+
+static int needs_fixup(perm_t perm)
+{
+ if (perm == PERM_ANDROID_DATA || perm == PERM_ANDROID_OBB
+ || perm == PERM_ANDROID_MEDIA)
+ return 1;
+ return 0;
+}
+
+static void __fixup_perms_recursive(struct dentry *dentry, struct limit_search *limit, int depth)
+{
+ struct dentry *child;
+ struct sdcardfs_inode_info *info;
+
+ /*
+ * All paths will terminate their recursion on hitting PERM_ANDROID_OBB,
+ * PERM_ANDROID_MEDIA, or PERM_ANDROID_DATA. This happens at a depth of
+ * at most 3.
+ */
+ WARN(depth > 3, "%s: Max expected depth exceeded!\n", __func__);
+ spin_lock_nested(&dentry->d_lock, depth);
+ if (!d_inode(dentry)) {
+ spin_unlock(&dentry->d_lock);
+ return;
+ }
+ info = SDCARDFS_I(d_inode(dentry));
+
+ if (needs_fixup(info->data->perm)) {
+ list_for_each_entry(child, &dentry->d_subdirs, d_child) {
+ spin_lock_nested(&child->d_lock, depth + 1);
+ if (!(limit->flags & BY_NAME) || qstr_case_eq(&child->d_name, &limit->name)) {
+ if (d_inode(child)) {
+ get_derived_permission(dentry, child);
+ fixup_tmp_permissions(d_inode(child));
+ spin_unlock(&child->d_lock);
+ break;
+ }
+ }
+ spin_unlock(&child->d_lock);
+ }
+ } else if (descendant_may_need_fixup(info->data, limit)) {
+ list_for_each_entry(child, &dentry->d_subdirs, d_child) {
+ __fixup_perms_recursive(child, limit, depth + 1);
+ }
+ }
+ spin_unlock(&dentry->d_lock);
+}
+
+void fixup_perms_recursive(struct dentry *dentry, struct limit_search *limit)
+{
+ __fixup_perms_recursive(dentry, limit, 0);
+}
+
+/* main function for updating derived permission */
+inline void update_derived_permission_lock(struct dentry *dentry)
+{
+ struct dentry *parent;
+
+ if (!dentry || !d_inode(dentry)) {
+ pr_err("sdcardfs: %s: invalid dentry\n", __func__);
+ return;
+ }
+ /* FIXME:
+ * 1. need to check whether the dentry is updated or not
+ * 2. remove the root dentry update
+ */
+ if (!IS_ROOT(dentry)) {
+ parent = dget_parent(dentry);
+ if (parent) {
+ get_derived_permission(parent, dentry);
+ dput(parent);
+ }
+ }
+ fixup_tmp_permissions(d_inode(dentry));
+}
+
+int need_graft_path(struct dentry *dentry)
+{
+ int ret = 0;
+ struct dentry *parent = dget_parent(dentry);
+ struct sdcardfs_inode_info *parent_info = SDCARDFS_I(d_inode(parent));
+ struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+ struct qstr obb = QSTR_LITERAL("obb");
+
+ if (parent_info->data->perm == PERM_ANDROID &&
+ qstr_case_eq(&dentry->d_name, &obb)) {
+
+ /* /Android/obb is the base obbpath of DERIVED_UNIFIED */
+ if (!(sbi->options.multiuser == false
+ && parent_info->data->userid == 0)) {
+ ret = 1;
+ }
+ }
+ dput(parent);
+ return ret;
+}
+
+int is_obbpath_invalid(struct dentry *dent)
+{
+ int ret = 0;
+ struct sdcardfs_dentry_info *di = SDCARDFS_D(dent);
+ struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dent->d_sb);
+ char *path_buf, *obbpath_s;
+ int need_put = 0;
+ struct path lower_path;
+
+ /* Check whether the base obbpath has been changed.
+ * This routine can check an uninitialized obb dentry as well.
+ * Regarding the uninitialized obb, refer to sdcardfs_mkdir().
+ */
+ spin_lock(&di->lock);
+ if (di->orig_path.dentry) {
+ if (!di->lower_path.dentry) {
+ ret = 1;
+ } else {
+ path_get(&di->lower_path);
+
+ path_buf = kmalloc(PATH_MAX, GFP_ATOMIC);
+ if (!path_buf) {
+ ret = 1;
+ pr_err("sdcardfs: fail to allocate path_buf in %s.\n", __func__);
+ } else {
+ obbpath_s = d_path(&di->lower_path, path_buf, PATH_MAX);
+ if (d_unhashed(di->lower_path.dentry) ||
+ !str_case_eq(sbi->obbpath_s, obbpath_s)) {
+ ret = 1;
+ }
+ kfree(path_buf);
+ }
+
+ pathcpy(&lower_path, &di->lower_path);
+ need_put = 1;
+ }
+ }
+ spin_unlock(&di->lock);
+ if (need_put)
+ path_put(&lower_path);
+ return ret;
+}
+
+int is_base_obbpath(struct dentry *dentry)
+{
+ int ret = 0;
+ struct dentry *parent = dget_parent(dentry);
+ struct sdcardfs_inode_info *parent_info = SDCARDFS_I(d_inode(parent));
+ struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+ struct qstr q_obb = QSTR_LITERAL("obb");
+
+ spin_lock(&SDCARDFS_D(dentry)->lock);
+ if (sbi->options.multiuser) {
+ if (parent_info->data->perm == PERM_PRE_ROOT &&
+ qstr_case_eq(&dentry->d_name, &q_obb)) {
+ ret = 1;
+ }
+ } else if (parent_info->data->perm == PERM_ANDROID &&
+ qstr_case_eq(&dentry->d_name, &q_obb)) {
+ ret = 1;
+ }
+ spin_unlock(&SDCARDFS_D(dentry)->lock);
+ return ret;
+}
+
+/* The lower_path will be stored in the dentry's orig_path
+ * and the base obbpath will be copied to the lower_path variable.
+ * If an error is returned, lower_path is left unchanged.
+ * Returns: -ERRNO on error (0: no error)
+ */
+int setup_obb_dentry(struct dentry *dentry, struct path *lower_path)
+{
+ int err = 0;
+ struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+ struct path obbpath;
+
+ /* A local obb dentry must have its own orig_path to support rmdir
+ * and mkdir of itself. Usually, we expect that the sbi->obbpath
+ * is available at this stage.
+ */
+ sdcardfs_set_orig_path(dentry, lower_path);
+
+ err = kern_path(sbi->obbpath_s,
+ LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &obbpath);
+
+ if (!err) {
+ /* the obbpath base has been found */
+ pathcpy(lower_path, &obbpath);
+ } else {
+ /* If the sbi->obbpath is not available, we could optionally
+ * set up the lower_path with its orig_path.
+ * However, the current implementation just returns an error
+ * because the sdcard daemon also regards this case as
+ * a lookup failure.
+ */
+ pr_info("sdcardfs: the sbi->obbpath is not available\n");
+ }
+ return err;
+}
+
+
diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c
new file mode 100644
index 000000000000..399531bc1f6f
--- /dev/null
+++ b/fs/sdcardfs/file.c
@@ -0,0 +1,461 @@
+/*
+ * fs/sdcardfs/file.c
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ * Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009 Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed. It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#include "sdcardfs.h"
+#ifdef CONFIG_SDCARD_FS_FADV_NOACTIVE
+#include <linux/backing-dev.h>
+#endif
+
+/*
+ * ->read: forward the request to the lower file and, unless the lower
+ * read returned an error, copy the lower inode's atime to our inode.
+ */
+static ssize_t sdcardfs_read(struct file *file, char __user *buf,
+			     size_t count, loff_t *ppos)
+{
+	struct dentry *upper_dentry = file->f_path.dentry;
+	struct file *lower = sdcardfs_lower_file(file);
+	int nread;
+#ifdef CONFIG_SDCARD_FS_FADV_NOACTIVE
+	struct backing_dev_info *lower_bdi;
+
+	/* mirror FMODE_NOACTIVE onto the lower file if it is not set yet */
+	if ((file->f_mode & FMODE_NOACTIVE) &&
+	    !(lower->f_mode & FMODE_NOACTIVE)) {
+		lower_bdi = lower->f_mapping->backing_dev_info;
+		lower->f_ra.ra_pages = lower_bdi->ra_pages * 2;
+		spin_lock(&lower->f_lock);
+		lower->f_mode |= FMODE_NOACTIVE;
+		spin_unlock(&lower->f_lock);
+	}
+#endif
+
+	nread = vfs_read(lower, buf, count, ppos);
+	if (nread < 0)
+		return nread;
+
+	/* lower read did not fail: refresh our atime */
+	fsstack_copy_attr_atime(d_inode(upper_dentry), file_inode(lower));
+	return nread;
+}
+
+/*
+ * ->write: refuse with -ENOSPC when check_min_free_space() fails,
+ * otherwise forward the write to the lower file and, unless it returned
+ * an error, copy the lower inode's size and timestamps to our inode.
+ */
+static ssize_t sdcardfs_write(struct file *file, const char __user *buf,
+			      size_t count, loff_t *ppos)
+{
+	struct dentry *upper_dentry = file->f_path.dentry;
+	struct file *lower;
+	int nwritten;
+
+	/* check disk space */
+	if (!check_min_free_space(upper_dentry, count, 0)) {
+		pr_err("No minimum free space.\n");
+		return -ENOSPC;
+	}
+
+	lower = sdcardfs_lower_file(file);
+	nwritten = vfs_write(lower, buf, count, ppos);
+	if (nwritten < 0)
+		return nwritten;
+
+	/* lower write did not fail: sync size, then times */
+	fsstack_copy_inode_size(d_inode(upper_dentry), file_inode(lower));
+	fsstack_copy_attr_times(d_inode(upper_dentry), file_inode(lower));
+	return nwritten;
+}
+
+/*
+ * ->iterate: run iterate_dir() on the lower directory, keeping the
+ * upper and lower file positions in step, and copy the lower atime to
+ * our inode unless iterate_dir() returned an error.
+ */
+static int sdcardfs_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct dentry *upper_dentry = file->f_path.dentry;
+	struct file *lower = sdcardfs_lower_file(file);
+	int rc;
+
+	lower->f_pos = file->f_pos;
+	rc = iterate_dir(lower, ctx);
+	file->f_pos = lower->f_pos;
+
+	if (rc < 0)
+		return rc;
+
+	/* copy the atime */
+	fsstack_copy_attr_atime(d_inode(upper_dentry), file_inode(lower));
+	return rc;
+}
+
+/*
+ * ->unlocked_ioctl: pass the ioctl to the lower file while running with
+ * the credentials installed by override_fsids().
+ *
+ * Returns -ENOTTY when there is no lower file, no lower f_op, or no
+ * lower ->unlocked_ioctl; -ENOMEM when the credential override fails;
+ * otherwise the lower handler's result. On a zero result the upper
+ * inode attributes are refreshed from the lower inode.
+ */
+static long sdcardfs_unlocked_ioctl(struct file *file, unsigned int cmd,
+				    unsigned long arg)
+{
+	struct sdcardfs_sb_info *sb_info =
+		SDCARDFS_SB(file->f_path.dentry->d_sb);
+	struct file *lower = sdcardfs_lower_file(file);
+	const struct cred *orig_cred;
+	long rc = -ENOTTY;
+
+	/* XXX: use vfs_ioctl if/when VFS exports it */
+	if (!lower || !lower->f_op)
+		return rc;
+
+	/* save current_cred and override it */
+	orig_cred = override_fsids(sb_info, SDCARDFS_I(file_inode(file))->data);
+	if (!orig_cred)
+		return -ENOMEM;
+
+	if (lower->f_op->unlocked_ioctl)
+		rc = lower->f_op->unlocked_ioctl(lower, cmd, arg);
+
+	/* some ioctls can change inode attributes (EXT2_IOC_SETFLAGS) */
+	if (rc == 0)
+		sdcardfs_copy_and_fix_attrs(file_inode(file),
+					    file_inode(lower));
+
+	revert_fsids(orig_cred);
+	return rc;
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * ->compat_ioctl: pass the ioctl to the lower file's ->compat_ioctl
+ * while running with the credentials installed by override_fsids().
+ *
+ * Returns -ENOTTY when there is no lower file, no lower f_op, or no
+ * lower ->compat_ioctl; -ENOMEM when the credential override fails;
+ * otherwise the lower handler's result.
+ */
+static long sdcardfs_compat_ioctl(struct file *file, unsigned int cmd,
+				  unsigned long arg)
+{
+	struct sdcardfs_sb_info *sb_info =
+		SDCARDFS_SB(file->f_path.dentry->d_sb);
+	struct file *lower = sdcardfs_lower_file(file);
+	const struct cred *orig_cred;
+	long rc = -ENOTTY;
+
+	/* XXX: use vfs_ioctl if/when VFS exports it */
+	if (!lower || !lower->f_op)
+		return rc;
+
+	/* save current_cred and override it */
+	orig_cred = override_fsids(sb_info, SDCARDFS_I(file_inode(file))->data);
+	if (!orig_cred)
+		return -ENOMEM;
+
+	if (lower->f_op->compat_ioctl)
+		rc = lower->f_op->compat_ioctl(lower, cmd, arg);
+
+	revert_fsids(orig_cred);
+	return rc;
+}
+#endif
+
+/*
+ * ->mmap: map the file through the lower file system.
+ *
+ * The first mmap on a given sdcardfs file calls the lower ->mmap to
+ * discover the lower vm_ops, which are cached in the sdcardfs file info.
+ * The vma is then switched to sdcardfs_vm_ops, the upper file is stored
+ * in vm_private_data, and vma->vm_file is replaced by the lower file
+ * (with an extra reference taken on it).
+ *
+ * Returns 0 on success, -EINVAL if a shared writable mapping is
+ * requested but the lower address space has no ->writepage, -ENODEV if
+ * the lower file has no ->mmap, or the error from the lower ->mmap.
+ */
+static int sdcardfs_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	int err = 0;
+	bool willwrite;
+	struct file *lower_file;
+	const struct vm_operations_struct *saved_vm_ops = NULL;
+
+	/*
+	 * True only when both VM_SHARED and VM_WRITE are set.
+	 * This might be deferred to mmap's writepage.
+	 */
+	willwrite = ((vma->vm_flags | VM_SHARED | VM_WRITE) == vma->vm_flags);
+
+	/*
+	 * File systems which do not implement ->writepage may use
+	 * generic_file_readonly_mmap as their ->mmap op. If you call
+	 * generic_file_readonly_mmap with VM_WRITE, you'd get an -EINVAL.
+	 * But we cannot call the lower ->mmap op, so we can't tell that
+	 * writeable mappings won't work. Therefore, our only choice is to
+	 * check if the lower file system supports the ->writepage, and if
+	 * not, return EINVAL (the same error that
+	 * generic_file_readonly_mmap returns in that case).
+	 */
+	lower_file = sdcardfs_lower_file(file);
+	if (willwrite && !lower_file->f_mapping->a_ops->writepage) {
+		err = -EINVAL;
+		pr_err("sdcardfs: lower file system does not support writeable mmap\n");
+		goto out;
+	}
+
+	/*
+	 * find and save lower vm_ops.
+	 *
+	 * XXX: the VFS should have a cleaner way of finding the lower vm_ops
+	 */
+	if (!SDCARDFS_F(file)->lower_vm_ops) {
+		/* a lower file without ->mmap cannot be mapped at all */
+		if (!lower_file->f_op->mmap) {
+			err = -ENODEV;
+			goto out;
+		}
+		err = lower_file->f_op->mmap(lower_file, vma);
+		if (err) {
+			pr_err("sdcardfs: lower mmap failed %d\n", err);
+			goto out;
+		}
+		saved_vm_ops = vma->vm_ops; /* save: came from lower ->mmap */
+	}
+
+	/*
+	 * Next 3 lines are all I need from generic_file_mmap. I definitely
+	 * don't want its test for ->readpage which returns -ENOEXEC.
+	 */
+	file_accessed(file);
+	vma->vm_ops = &sdcardfs_vm_ops;
+
+	file->f_mapping->a_ops = &sdcardfs_aops; /* set our aops */
+	if (!SDCARDFS_F(file)->lower_vm_ops) /* save for our ->fault */
+		SDCARDFS_F(file)->lower_vm_ops = saved_vm_ops;
+	vma->vm_private_data = file;
+	get_file(lower_file);
+	vma->vm_file = lower_file;
+
+out:
+	return err;
+}
+
+/*
+ * ->open: open the lower object and attach it to the sdcardfs file.
+ *
+ * Returns 0 on success, -ENOENT for an unhashed dentry, -EACCES when
+ * check_caller_access_to_name() rejects the caller, -ENOMEM when the
+ * credential override or the file-info allocation fails, or the error
+ * from dentry_open() on the lower path.
+ *
+ * The parent reference taken by dget_parent() is dropped on every exit
+ * path (out_err).
+ */
+static int sdcardfs_open(struct inode *inode, struct file *file)
+{
+ int err = 0;
+ struct file *lower_file = NULL;
+ struct path lower_path;
+ struct dentry *dentry = file->f_path.dentry;
+ struct dentry *parent = dget_parent(dentry);
+ struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+ const struct cred *saved_cred = NULL;
+
+ /* don't open unhashed/deleted files */
+ if (d_unhashed(dentry)) {
+ err = -ENOENT;
+ goto out_err;
+ }
+
+ /* access is checked against the parent directory's inode and our name */
+ if (!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) {
+ err = -EACCES;
+ goto out_err;
+ }
+
+ /* save current_cred and override it */
+ saved_cred = override_fsids(sbi, SDCARDFS_I(inode)->data);
+ if (!saved_cred) {
+ err = -ENOMEM;
+ goto out_err;
+ }
+
+ /* zeroed per-file info; freed below on error or later in ->release */
+ file->private_data =
+ kzalloc(sizeof(struct sdcardfs_file_info), GFP_KERNEL);
+ if (!SDCARDFS_F(file)) {
+ err = -ENOMEM;
+ goto out_revert_cred;
+ }
+
+ /* open lower object and link sdcardfs's file struct to lower's */
+ sdcardfs_get_lower_path(file->f_path.dentry, &lower_path);
+ /* current_cred() here is the overridden credential set installed above */
+ lower_file = dentry_open(&lower_path, file->f_flags, current_cred());
+ path_put(&lower_path);
+ if (IS_ERR(lower_file)) {
+ err = PTR_ERR(lower_file);
+ /*
+ * NOTE(review): the file info was zero-allocated just above, so
+ * this lookup presumably always yields NULL here - confirm.
+ */
+ lower_file = sdcardfs_lower_file(file);
+ if (lower_file) {
+ sdcardfs_set_lower_file(file, NULL);
+ fput(lower_file); /* fput calls dput for lower_dentry */
+ }
+ } else {
+ sdcardfs_set_lower_file(file, lower_file);
+ }
+
+ /* on failure release the file info; on success refresh our attributes */
+ if (err)
+ kfree(SDCARDFS_F(file));
+ else
+ sdcardfs_copy_and_fix_attrs(inode, sdcardfs_lower_inode(inode));
+
+out_revert_cred:
+ revert_fsids(saved_cred);
+out_err:
+ dput(parent);
+ return err;
+}
+
+/*
+ * ->flush: when the lower file provides ->flush, write back and wait on
+ * our own mapping and then call the lower ->flush. Returns 0 when there
+ * is nothing to forward to, otherwise the lower ->flush result.
+ */
+static int sdcardfs_flush(struct file *file, fl_owner_t id)
+{
+	struct file *lower = sdcardfs_lower_file(file);
+
+	if (!lower || !lower->f_op || !lower->f_op->flush)
+		return 0;
+
+	filemap_write_and_wait(file->f_mapping);
+	return lower->f_op->flush(lower, id);
+}
+
+/*
+ * ->release: detach and fput() the lower file, if any, then free the
+ * sdcardfs file info structure. Always returns 0.
+ */
+static int sdcardfs_file_release(struct inode *inode, struct file *file)
+{
+	struct file *lower = sdcardfs_lower_file(file);
+
+	if (lower != NULL) {
+		sdcardfs_set_lower_file(file, NULL);
+		fput(lower);
+	}
+
+	kfree(SDCARDFS_F(file));
+	return 0;
+}
+
+/*
+ * ->fsync: sync the upper file with __generic_file_fsync() and, if that
+ * succeeds, sync the same range on the lower file. Returns the first
+ * error encountered, or 0.
+ */
+static int sdcardfs_fsync(struct file *file, loff_t start, loff_t end,
+			  int datasync)
+{
+	struct dentry *upper_dentry = file->f_path.dentry;
+	struct path lower_path;
+	struct file *lower;
+	int rc;
+
+	rc = __generic_file_fsync(file, start, end, datasync);
+	if (rc)
+		return rc;
+
+	lower = sdcardfs_lower_file(file);
+	/* the lower path is held for the duration of the lower sync */
+	sdcardfs_get_lower_path(upper_dentry, &lower_path);
+	rc = vfs_fsync_range(lower, start, end, datasync);
+	sdcardfs_put_lower_path(upper_dentry, &lower_path);
+
+	return rc;
+}
+
+/*
+ * ->fasync: forward to the lower file's ->fasync when it has one;
+ * otherwise report success (0).
+ */
+static int sdcardfs_fasync(int fd, struct file *file, int flag)
+{
+	struct file *lower = sdcardfs_lower_file(file);
+
+	if (!lower->f_op || !lower->f_op->fasync)
+		return 0;
+
+	return lower->f_op->fasync(fd, lower, flag);
+}
+
+/*
+ * Sdcardfs cannot use generic_file_llseek as ->llseek, because it would
+ * only set the offset of the upper file. So we have to implement our
+ * own method to set both the upper and lower file offsets
+ * consistently.
+ *
+ * The upper file is repositioned first; if that fails the error is
+ * returned and the lower file is left alone. Otherwise the result of
+ * seeking the lower file is returned.
+ *
+ * The result is kept in a loff_t: storing it in an int would truncate
+ * positions that do not fit in 32 bits and could turn a valid offset
+ * into a bogus negative "error".
+ */
+static loff_t sdcardfs_file_llseek(struct file *file, loff_t offset, int whence)
+{
+	loff_t pos;
+	struct file *lower_file;
+
+	pos = generic_file_llseek(file, offset, whence);
+	if (pos < 0)
+		return pos;
+
+	lower_file = sdcardfs_lower_file(file);
+	return generic_file_llseek(lower_file, offset, whence);
+}
+
+/*
+ * Sdcardfs read_iter: temporarily retarget the iocb at the lower file
+ * and call the lower ->read_iter. Returns -EINVAL if the lower file has
+ * no ->read_iter, otherwise the lower result. The upper atime is
+ * refreshed when the result is non-negative or -EIOCBQUEUED.
+ */
+ssize_t sdcardfs_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+	struct file *upper = iocb->ki_filp;
+	struct file *lower = sdcardfs_lower_file(upper);
+	int rc;
+
+	if (!lower->f_op->read_iter)
+		return -EINVAL;
+
+	/* prevent lower_file from being released while the iocb uses it */
+	get_file(lower);
+	iocb->ki_filp = lower;
+	rc = lower->f_op->read_iter(iocb, iter);
+	iocb->ki_filp = upper;
+	fput(lower);
+
+	/* update upper inode atime as needed */
+	if (rc >= 0 || rc == -EIOCBQUEUED)
+		fsstack_copy_attr_atime(upper->f_path.dentry->d_inode,
+					file_inode(lower));
+
+	return rc;
+}
+
+/*
+ * Sdcardfs write_iter: temporarily retarget the iocb at the lower file
+ * and call the lower ->write_iter. Returns -EINVAL if the lower file
+ * has no ->write_iter, otherwise the lower result. The upper inode's
+ * size and times are refreshed when the result is non-negative or
+ * -EIOCBQUEUED.
+ */
+ssize_t sdcardfs_write_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+	struct file *upper = iocb->ki_filp;
+	struct file *lower = sdcardfs_lower_file(upper);
+	int rc;
+
+	if (!lower->f_op->write_iter)
+		return -EINVAL;
+
+	/* prevent lower_file from being released while the iocb uses it */
+	get_file(lower);
+	iocb->ki_filp = lower;
+	rc = lower->f_op->write_iter(iocb, iter);
+	iocb->ki_filp = upper;
+	fput(lower);
+
+	/* update upper inode times/sizes as needed */
+	if (rc >= 0 || rc == -EIOCBQUEUED) {
+		fsstack_copy_inode_size(upper->f_path.dentry->d_inode,
+					file_inode(lower));
+		fsstack_copy_attr_times(upper->f_path.dentry->d_inode,
+					file_inode(lower));
+	}
+
+	return rc;
+}
+
+/* file operations installed on non-directory sdcardfs files */
+const struct file_operations sdcardfs_main_fops = {
+	/* lifetime */
+	.open		= sdcardfs_open,
+	.release	= sdcardfs_file_release,
+	/* positioning and data transfer */
+	.llseek		= generic_file_llseek,
+	.read		= sdcardfs_read,
+	.read_iter	= sdcardfs_read_iter,
+	.write		= sdcardfs_write,
+	.write_iter	= sdcardfs_write_iter,
+	.mmap		= sdcardfs_mmap,
+	/* synchronisation */
+	.flush		= sdcardfs_flush,
+	.fsync		= sdcardfs_fsync,
+	.fasync		= sdcardfs_fasync,
+	/* ioctl pass-through */
+	.unlocked_ioctl	= sdcardfs_unlocked_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= sdcardfs_compat_ioctl,
+#endif
+};
+
+/* trimmed-down file operations for sdcardfs directories */
+const struct file_operations sdcardfs_dir_fops = {
+	/* lifetime */
+	.open		= sdcardfs_open,
+	.release	= sdcardfs_file_release,
+	/* positioning and directory iteration */
+	.llseek		= sdcardfs_file_llseek,
+	.read		= generic_read_dir,
+	.iterate	= sdcardfs_readdir,
+	/* synchronisation */
+	.flush		= sdcardfs_flush,
+	.fsync		= sdcardfs_fsync,
+	.fasync		= sdcardfs_fasync,
+	/* ioctl pass-through */
+	.unlocked_ioctl	= sdcardfs_unlocked_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= sdcardfs_compat_ioctl,
+#endif
+};
diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c
new file mode 100644
index 000000000000..5318bc6bbc68
--- /dev/null
+++ b/fs/sdcardfs/inode.c
@@ -0,0 +1,808 @@
+/*
+ * fs/sdcardfs/inode.c
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ * Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009 Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed. It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#include "sdcardfs.h"
+#include <linux/fs_struct.h>
+#include <linux/ratelimit.h>
+
+/*
+ * override_fsids - run the current task with sdcardfs lower-fs ids.
+ *
+ * Prepares a new credential set whose fsuid is AID_MEDIA_OBB when
+ * @data->under_obb is set, or otherwise the uid derived by
+ * multiuser_get_uid() from @data->userid and the mount's fs_low_uid,
+ * and whose fsgid is the mount's fs_low_gid. The new set is installed
+ * with override_creds().
+ *
+ * Returns the previous credentials (to be handed to revert_fsids()),
+ * or NULL if prepare_creds() fails.
+ */
+const struct cred *override_fsids(struct sdcardfs_sb_info *sbi,
+		struct sdcardfs_inode_data *data)
+{
+	struct cred *new_cred = prepare_creds();
+	uid_t fs_uid;
+
+	if (!new_cred)
+		return NULL;
+
+	fs_uid = data->under_obb
+		? AID_MEDIA_OBB
+		: multiuser_get_uid(data->userid, sbi->options.fs_low_uid);
+
+	new_cred->fsuid = make_kuid(&init_user_ns, fs_uid);
+	new_cred->fsgid = make_kgid(&init_user_ns, sbi->options.fs_low_gid);
+
+	return override_creds(new_cred);
+}
+
+/*
+ * revert_fsids - undo override_fsids().
+ *
+ * Reinstalls @old_cred with revert_creds() and then drops a reference
+ * on the credentials that were active when this function was entered.
+ */
+void revert_fsids(const struct cred *old_cred)
+{
+	const struct cred *overriding = current->cred;
+
+	revert_creds(old_cred);
+	put_cred(overriding);
+}
+
+/*
+ * ->create: create a regular file in the lower file system and
+ * interpose an sdcardfs inode on top of it.
+ *
+ * The requested permission bits are ignored: the lower file is always
+ * created with mode 0664, using a temporary fs_struct whose umask is 0.
+ *
+ * Returns 0 on success, -EACCES when check_caller_access_to_name()
+ * rejects the caller, -ENOMEM when the credential override or the
+ * fs_struct copy fails, or the error from vfs_create2() /
+ * sdcardfs_interpose().
+ */
+static int sdcardfs_create(struct inode *dir, struct dentry *dentry,
+ umode_t mode, bool want_excl)
+{
+ int err;
+ struct dentry *lower_dentry;
+ struct vfsmount *lower_dentry_mnt;
+ struct dentry *lower_parent_dentry = NULL;
+ struct path lower_path;
+ const struct cred *saved_cred = NULL;
+ struct fs_struct *saved_fs;
+ struct fs_struct *copied_fs;
+
+ if (!check_caller_access_to_name(dir, &dentry->d_name)) {
+ err = -EACCES;
+ goto out_eacces;
+ }
+
+ /* save current_cred and override it */
+ saved_cred = override_fsids(SDCARDFS_SB(dir->i_sb),
+ SDCARDFS_I(dir)->data);
+ if (!saved_cred)
+ return -ENOMEM;
+
+ sdcardfs_get_lower_path(dentry, &lower_path);
+ lower_dentry = lower_path.dentry;
+ lower_dentry_mnt = lower_path.mnt;
+ lower_parent_dentry = lock_parent(lower_dentry);
+
+ /* keep only the file-type bits of mode and force permissions to 0664 */
+ mode = (mode & S_IFMT) | 00664;
+
+ /* temporarily change umask for lower fs write */
+ saved_fs = current->fs;
+ copied_fs = copy_fs_struct(current->fs);
+ if (!copied_fs) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+ copied_fs->umask = 0;
+ /* swap in the umask-0 copy; the original is restored at the out label */
+ task_lock(current);
+ current->fs = copied_fs;
+ task_unlock(current);
+
+ err = vfs_create2(lower_dentry_mnt, d_inode(lower_parent_dentry), lower_dentry, mode, want_excl);
+ if (err)
+ goto out;
+
+ err = sdcardfs_interpose(dentry, dir->i_sb, &lower_path,
+ SDCARDFS_I(dir)->data->userid);
+ if (err)
+ goto out;
+ /* refresh the upper directory's times and size from the lower side */
+ fsstack_copy_attr_times(dir, sdcardfs_lower_inode(dir));
+ fsstack_copy_inode_size(dir, d_inode(lower_parent_dentry));
+ fixup_lower_ownership(dentry, dentry->d_name.name);
+
+out:
+ /* put the task's original fs_struct back and discard the copy */
+ task_lock(current);
+ current->fs = saved_fs;
+ task_unlock(current);
+ free_fs_struct(copied_fs);
+out_unlock:
+ unlock_dir(lower_parent_dentry);
+ sdcardfs_put_lower_path(dentry, &lower_path);
+ revert_fsids(saved_cred);
+out_eacces:
+ return err;
+}
+
+/*
+ * ->unlink: remove the lower file and update the upper directory and
+ * inode to match.
+ *
+ * Returns 0 on success, -EACCES when check_caller_access_to_name()
+ * rejects the caller, -ENOMEM when the credential override fails, or
+ * the error from vfs_unlink2() (except the NFS silly-rename -EBUSY
+ * case, which is treated as success).
+ */
+static int sdcardfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+ int err;
+ struct dentry *lower_dentry;
+ struct vfsmount *lower_mnt;
+ struct inode *lower_dir_inode = sdcardfs_lower_inode(dir);
+ struct dentry *lower_dir_dentry;
+ struct path lower_path;
+ const struct cred *saved_cred = NULL;
+
+ if (!check_caller_access_to_name(dir, &dentry->d_name)) {
+ err = -EACCES;
+ goto out_eacces;
+ }
+
+ /* save current_cred and override it */
+ saved_cred = override_fsids(SDCARDFS_SB(dir->i_sb),
+ SDCARDFS_I(dir)->data);
+ if (!saved_cred)
+ return -ENOMEM;
+
+ sdcardfs_get_lower_path(dentry, &lower_path);
+ lower_dentry = lower_path.dentry;
+ lower_mnt = lower_path.mnt;
+ /* extra reference on the lower dentry, dropped by dput() at out */
+ dget(lower_dentry);
+ lower_dir_dentry = lock_parent(lower_dentry);
+
+ err = vfs_unlink2(lower_mnt, lower_dir_inode, lower_dentry, NULL);
+
+ /*
+ * Note: unlinking on top of NFS can cause silly-renamed files.
+ * Trying to delete such files results in EBUSY from NFS
+ * below. Silly-renamed files will get deleted by NFS later on, so
+ * we just need to detect them here and treat such EBUSY errors as
+ * if the upper file was successfully deleted.
+ */
+ if (err == -EBUSY && lower_dentry->d_flags & DCACHE_NFSFS_RENAMED)
+ err = 0;
+ if (err)
+ goto out;
+ /* mirror the lower directory's times/size and the lower link count */
+ fsstack_copy_attr_times(dir, lower_dir_inode);
+ fsstack_copy_inode_size(dir, lower_dir_inode);
+ set_nlink(d_inode(dentry),
+ sdcardfs_lower_inode(d_inode(dentry))->i_nlink);
+ d_inode(dentry)->i_ctime = dir->i_ctime;
+ d_drop(dentry); /* this is needed, else LTP fails (VFS won't do it) */
+out:
+ unlock_dir(lower_dir_dentry);
+ dput(lower_dentry);
+ sdcardfs_put_lower_path(dentry, &lower_path);
+ revert_fsids(saved_cred);
+out_eacces:
+ return err;
+}
+
+/*
+ * touch - make sure a file exists at @abs_path.
+ *
+ * Tries an exclusive create (O_CREAT|O_EXCL|O_NOFOLLOW) with @mode and
+ * closes the file again. An already existing file (-EEXIST) counts as
+ * success. Returns 0 on success or the negative errno from filp_open().
+ */
+static int touch(char *abs_path, mode_t mode)
+{
+	struct file *created;
+	long open_err;
+
+	created = filp_open(abs_path, O_RDWR|O_CREAT|O_EXCL|O_NOFOLLOW, mode);
+	if (!IS_ERR(created)) {
+		filp_close(created, current->files);
+		return 0;
+	}
+
+	open_err = PTR_ERR(created);
+	if (open_err == -EEXIST)
+		return 0;
+
+	pr_err("sdcardfs: failed to open(%s): %ld\n",
+	       abs_path, open_err);
+	return open_err;
+}
+
+/*
+ * ->mkdir: create a directory in the lower file system and interpose an
+ * sdcardfs inode on top of it.
+ *
+ * The requested permission bits are ignored: the lower directory is
+ * always created with mode 0775, using a temporary fs_struct whose
+ * umask is 0. When need_graft_path() says the new dentry is a local obb
+ * directory, it is set up on the base obb path. For the obb and
+ * Android/data directories a ".nomedia" file is additionally created on
+ * a best-effort basis: a failure there is logged but does not change
+ * the return value.
+ *
+ * Returns 0 on success, -EACCES when check_caller_access_to_name()
+ * rejects the caller, -ENOMEM when the credential override or the
+ * fs_struct copy fails, -ENOSPC when check_min_free_space() fails, or
+ * the error from vfs_mkdir2() / sdcardfs_interpose().
+ */
+static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	int err;
+	int make_nomedia_in_obb = 0;
+	struct dentry *lower_dentry;
+	struct vfsmount *lower_mnt;
+	struct dentry *lower_parent_dentry = NULL;
+	struct path lower_path;
+	struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+	const struct cred *saved_cred = NULL;
+	struct sdcardfs_inode_data *pd = SDCARDFS_I(dir)->data;
+	int touch_err = 0;
+	struct fs_struct *saved_fs;
+	struct fs_struct *copied_fs;
+	struct qstr q_obb = QSTR_LITERAL("obb");
+	struct qstr q_data = QSTR_LITERAL("data");
+
+	if (!check_caller_access_to_name(dir, &dentry->d_name)) {
+		err = -EACCES;
+		goto out_eacces;
+	}
+
+	/* save current_cred and override it */
+	saved_cred = override_fsids(SDCARDFS_SB(dir->i_sb),
+				    SDCARDFS_I(dir)->data);
+	if (!saved_cred)
+		return -ENOMEM;
+
+	/* check disk space */
+	if (!check_min_free_space(dentry, 0, 1)) {
+		pr_err("sdcardfs: No minimum free space.\n");
+		err = -ENOSPC;
+		goto out_revert;
+	}
+
+	/* the lower_dentry is negative here */
+	sdcardfs_get_lower_path(dentry, &lower_path);
+	lower_dentry = lower_path.dentry;
+	lower_mnt = lower_path.mnt;
+	lower_parent_dentry = lock_parent(lower_dentry);
+
+	/* keep only the file-type bits of mode and force permissions to 0775 */
+	mode = (mode & S_IFMT) | 00775;
+
+	/* temporarily change umask for lower fs write */
+	saved_fs = current->fs;
+	copied_fs = copy_fs_struct(current->fs);
+	if (!copied_fs) {
+		err = -ENOMEM;
+		unlock_dir(lower_parent_dentry);
+		goto out_unlock;
+	}
+	copied_fs->umask = 0;
+	task_lock(current);
+	current->fs = copied_fs;
+	task_unlock(current);
+
+	err = vfs_mkdir2(lower_mnt, d_inode(lower_parent_dentry), lower_dentry, mode);
+
+	if (err) {
+		unlock_dir(lower_parent_dentry);
+		goto out;
+	}
+
+	/* if it is a local obb dentry, setup it with the base obbpath */
+	if (need_graft_path(dentry)) {
+
+		err = setup_obb_dentry(dentry, &lower_path);
+		if (err) {
+			/* if the sbi->obbpath is not available, the lower_path won't be
+			 * changed by setup_obb_dentry() but the lower path is saved to
+			 * its orig_path. this dentry will be revalidated later.
+			 * but now, the lower_path should be NULL
+			 */
+			sdcardfs_put_reset_lower_path(dentry);
+
+			/* the newly created lower path which saved to its orig_path or
+			 * the lower_path is the base obbpath.
+			 * therefore, an additional path_get is required
+			 */
+			path_get(&lower_path);
+		} else
+			make_nomedia_in_obb = 1;
+	}
+
+	err = sdcardfs_interpose(dentry, dir->i_sb, &lower_path, pd->userid);
+	if (err) {
+		unlock_dir(lower_parent_dentry);
+		goto out;
+	}
+
+	fsstack_copy_attr_times(dir, sdcardfs_lower_inode(dir));
+	fsstack_copy_inode_size(dir, d_inode(lower_parent_dentry));
+	/* update number of links on parent directory */
+	set_nlink(dir, sdcardfs_lower_inode(dir)->i_nlink);
+	fixup_lower_ownership(dentry, dentry->d_name.name);
+	unlock_dir(lower_parent_dentry);
+	if ((!sbi->options.multiuser) && (qstr_case_eq(&dentry->d_name, &q_obb))
+		&& (pd->perm == PERM_ANDROID) && (pd->userid == 0))
+		make_nomedia_in_obb = 1;
+
+	/* When creating /Android/data and /Android/obb, mark them as .nomedia */
+	if (make_nomedia_in_obb ||
+		((pd->perm == PERM_ANDROID)
+				&& (qstr_case_eq(&dentry->d_name, &q_data)))) {
+		/* switch to the credentials derived from the new directory */
+		revert_fsids(saved_cred);
+		saved_cred = override_fsids(sbi,
+					SDCARDFS_I(d_inode(dentry))->data);
+		if (!saved_cred) {
+			/*
+			 * The task is already back on its original
+			 * credentials; saved_cred stays NULL so that the
+			 * final revert at out_revert is skipped.
+			 */
+			pr_err("sdcardfs: failed to set up .nomedia in %s: %d\n",
+						lower_path.dentry->d_name.name,
+						-ENOMEM);
+			goto out;
+		}
+		/* current->fs is the private copy here, so only its pwd changes */
+		set_fs_pwd(current->fs, &lower_path);
+		touch_err = touch(".nomedia", 0664);
+		if (touch_err) {
+			pr_err("sdcardfs: failed to create .nomedia in %s: %d\n",
+						lower_path.dentry->d_name.name,
+						touch_err);
+			goto out;
+		}
+	}
+out:
+	task_lock(current);
+	current->fs = saved_fs;
+	task_unlock(current);
+
+	free_fs_struct(copied_fs);
+out_unlock:
+	sdcardfs_put_lower_path(dentry, &lower_path);
+out_revert:
+	/* saved_cred is NULL only when the .nomedia re-override failed */
+	if (saved_cred)
+		revert_fsids(saved_cred);
+out_eacces:
+	return err;
+}
+
+/*
+ * ->rmdir: remove the lower directory and update the upper directory.
+ *
+ * Returns 0 on success, -EACCES when check_caller_access_to_name()
+ * rejects the caller, -ENOMEM when the credential override fails, or
+ * the error from vfs_rmdir2().
+ */
+static int sdcardfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ struct dentry *lower_dentry;
+ struct dentry *lower_dir_dentry;
+ struct vfsmount *lower_mnt;
+ int err;
+ struct path lower_path;
+ const struct cred *saved_cred = NULL;
+
+ if (!check_caller_access_to_name(dir, &dentry->d_name)) {
+ err = -EACCES;
+ goto out_eacces;
+ }
+
+ /* save current_cred and override it */
+ saved_cred = override_fsids(SDCARDFS_SB(dir->i_sb),
+ SDCARDFS_I(dir)->data);
+ if (!saved_cred)
+ return -ENOMEM;
+
+ /* sdcardfs_get_real_lower(): in case of remove an user's obb dentry
+ * the dentry on the original path should be deleted.
+ */
+ sdcardfs_get_real_lower(dentry, &lower_path);
+
+ lower_dentry = lower_path.dentry;
+ lower_mnt = lower_path.mnt;
+ lower_dir_dentry = lock_parent(lower_dentry);
+
+ err = vfs_rmdir2(lower_mnt, d_inode(lower_dir_dentry), lower_dentry);
+ if (err)
+ goto out;
+
+ d_drop(dentry); /* drop our dentry on success (why not VFS's job?) */
+ if (d_inode(dentry))
+ clear_nlink(d_inode(dentry));
+ /* mirror the lower parent's times, size and link count onto dir */
+ fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry));
+ fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry));
+ set_nlink(dir, d_inode(lower_dir_dentry)->i_nlink);
+
+out:
+ unlock_dir(lower_dir_dentry);
+ sdcardfs_put_real_lower(dentry, &lower_path);
+ revert_fsids(saved_cred);
+out_eacces:
+ return err;
+}
+
+/*
+ * The locking rules in sdcardfs_rename are complex. We could use a simpler
+ * superblock-level name-space lock for renames and copy-ups.
+ *
+ * ->rename: rename the lower object and refresh the upper directories
+ * and derived permissions.
+ *
+ * Returns 0 on success, -EINVAL when any rename flag is set or the
+ * source is an ancestor of the target, -ENOTEMPTY when the target is an
+ * ancestor of the source, -EACCES when check_caller_access_to_name()
+ * rejects either name, -ENOMEM when the credential override fails, or
+ * the error from vfs_rename2().
+ */
+static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
+{
+ int err = 0;
+ struct dentry *lower_old_dentry = NULL;
+ struct dentry *lower_new_dentry = NULL;
+ struct dentry *lower_old_dir_dentry = NULL;
+ struct dentry *lower_new_dir_dentry = NULL;
+ struct vfsmount *lower_mnt = NULL;
+ struct dentry *trap = NULL;
+ struct path lower_old_path, lower_new_path;
+ const struct cred *saved_cred = NULL;
+
+ /* no rename flags are supported */
+ if (flags)
+ return -EINVAL;
+
+ if (!check_caller_access_to_name(old_dir, &old_dentry->d_name) ||
+ !check_caller_access_to_name(new_dir, &new_dentry->d_name)) {
+ err = -EACCES;
+ goto out_eacces;
+ }
+
+ /* save current_cred and override it */
+ /* note: the ids are derived from the destination directory (new_dir) */
+ saved_cred = override_fsids(SDCARDFS_SB(old_dir->i_sb),
+ SDCARDFS_I(new_dir)->data);
+ if (!saved_cred)
+ return -ENOMEM;
+
+ sdcardfs_get_real_lower(old_dentry, &lower_old_path);
+ sdcardfs_get_lower_path(new_dentry, &lower_new_path);
+ lower_old_dentry = lower_old_path.dentry;
+ lower_new_dentry = lower_new_path.dentry;
+ lower_mnt = lower_old_path.mnt;
+ /* parent references are dropped by the dput() calls at out */
+ lower_old_dir_dentry = dget_parent(lower_old_dentry);
+ lower_new_dir_dentry = dget_parent(lower_new_dentry);
+
+ trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
+ /* source should not be ancestor of target */
+ if (trap == lower_old_dentry) {
+ err = -EINVAL;
+ goto out;
+ }
+ /* target should not be ancestor of source */
+ if (trap == lower_new_dentry) {
+ err = -ENOTEMPTY;
+ goto out;
+ }
+
+ err = vfs_rename2(lower_mnt,
+ d_inode(lower_old_dir_dentry), lower_old_dentry,
+ d_inode(lower_new_dir_dentry), lower_new_dentry,
+ NULL, 0);
+ if (err)
+ goto out;
+
+ /* Copy attrs from lower dir, but i_uid/i_gid */
+ sdcardfs_copy_and_fix_attrs(new_dir, d_inode(lower_new_dir_dentry));
+ fsstack_copy_inode_size(new_dir, d_inode(lower_new_dir_dentry));
+
+ /* a cross-directory rename also changes the source directory */
+ if (new_dir != old_dir) {
+ sdcardfs_copy_and_fix_attrs(old_dir, d_inode(lower_old_dir_dentry));
+ fsstack_copy_inode_size(old_dir, d_inode(lower_old_dir_dentry));
+ }
+ get_derived_permission_new(new_dentry->d_parent, old_dentry, &new_dentry->d_name);
+ fixup_tmp_permissions(d_inode(old_dentry));
+ fixup_lower_ownership(old_dentry, new_dentry->d_name.name);
+ d_invalidate(old_dentry); /* Can't fixup ownership recursively :( */
+out:
+ unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
+ dput(lower_old_dir_dentry);
+ dput(lower_new_dir_dentry);
+ sdcardfs_put_real_lower(old_dentry, &lower_old_path);
+ sdcardfs_put_lower_path(new_dentry, &lower_new_path);
+ revert_fsids(saved_cred);
+out_eacces:
+ return err;
+}
+
+/*
+ * Compiled out (#if 0): readlink pass-through to the lower inode's
+ * ->readlink. Returns -EINVAL when the lower inode has no ->readlink,
+ * otherwise the lower result; on a non-negative result the upper atime
+ * is refreshed from the lower dentry's inode.
+ */
+#if 0
+static int sdcardfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
+{
+ int err;
+ struct dentry *lower_dentry;
+ struct path lower_path;
+ /* XXX readlink does not requires overriding credential */
+
+ sdcardfs_get_lower_path(dentry, &lower_path);
+ lower_dentry = lower_path.dentry;
+ if (!d_inode(lower_dentry)->i_op ||
+ !d_inode(lower_dentry)->i_op->readlink) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = d_inode(lower_dentry)->i_op->readlink(lower_dentry,
+ buf, bufsiz);
+ if (err < 0)
+ goto out;
+ fsstack_copy_attr_atime(d_inode(dentry), d_inode(lower_dentry));
+
+out:
+ sdcardfs_put_lower_path(dentry, &lower_path);
+ return err;
+}
+#endif
+
+/*
+ * Compiled out (#if 0): follow_link implementation that reads the link
+ * target into a PAGE_SIZE kmalloc buffer via sdcardfs_readlink(). The
+ * buffer (or an ERR_PTR on failure) is both stored in *cookie and
+ * returned.
+ */
+#if 0
+static const char *sdcardfs_follow_link(struct dentry *dentry, void **cookie)
+{
+ char *buf;
+ int len = PAGE_SIZE, err;
+ mm_segment_t old_fs;
+
+ /* This is freed by the put_link method assuming a successful call. */
+ buf = kmalloc(len, GFP_KERNEL);
+ if (!buf) {
+ buf = ERR_PTR(-ENOMEM);
+ return buf;
+ }
+
+ /* read the symlink, and then we will follow it */
+ /* KERNEL_DS lets the __user buffer parameter accept our kernel buffer */
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ err = sdcardfs_readlink(dentry, buf, len);
+ set_fs(old_fs);
+ if (err < 0) {
+ kfree(buf);
+ buf = ERR_PTR(err);
+ } else {
+ /* NOTE(review): if err == len this writes one byte past buf - confirm */
+ buf[err] = '\0';
+ }
+ return *cookie = buf;
+}
+#endif
+
+/*
+ * Placeholder ->permission: sdcardfs only implements the permission2
+ * variant, so this emits a rate-limited warning and fails with -EINVAL.
+ */
+static int sdcardfs_permission_wrn(struct inode *inode, int mask)
+{
+	const int rc = -EINVAL;
+
+	WARN_RATELIMIT(1, "sdcardfs does not support permission. Use permission2.\n");
+
+	return rc;
+}
+
+/*
+ * copy_attrs - copy a fixed set of attribute fields from @src to @dest.
+ *
+ * Copies mode, owner, group, rdev, block bits, flags and the three
+ * timestamps, plus the ACL and security pointers when the matching
+ * config options are enabled. The pointers are copied as-is; no
+ * references are taken here.
+ */
+void copy_attrs(struct inode *dest, const struct inode *src)
+{
+	/* identity and type */
+	dest->i_mode = src->i_mode;
+	dest->i_uid = src->i_uid;
+	dest->i_gid = src->i_gid;
+	dest->i_rdev = src->i_rdev;
+
+	/* layout and flags */
+	dest->i_blkbits = src->i_blkbits;
+	dest->i_flags = src->i_flags;
+
+	/* timestamps */
+	dest->i_atime = src->i_atime;
+	dest->i_mtime = src->i_mtime;
+	dest->i_ctime = src->i_ctime;
+
+#ifdef CONFIG_FS_POSIX_ACL
+	dest->i_acl = src->i_acl;
+#endif
+#ifdef CONFIG_SECURITY
+	dest->i_security = src->i_security;
+#endif
+}
+
+/*
+ * ->permission2: run generic_permission() against a fake on-stack inode
+ * that carries the derived sdcardfs uid, gid and mode.
+ *
+ * Returns -EINVAL when no top data can be obtained for @inode,
+ * otherwise the result of generic_permission().
+ */
+static int sdcardfs_permission(struct vfsmount *mnt, struct inode *inode, int mask)
+{
+	struct sdcardfs_inode_data *top_data = top_data_get(SDCARDFS_I(inode));
+	struct inode fake;
+
+	if (!top_data)
+		return -EINVAL;
+
+	/*
+	 * Permission check on sdcardfs inode.
+	 * Calling process should have AID_SDCARD_RW permission.
+	 * generic_permission only needs i_mode, i_uid, i_gid and i_sb,
+	 * so a fake inode is enough to pass that information down.
+	 *
+	 * The underlying code may attempt to take locks in some
+	 * cases for features we're not using, but if that changes,
+	 * locks must be dealt with to avoid undefined behavior.
+	 */
+	copy_attrs(&fake, inode);
+	fake.i_uid = make_kuid(&init_user_ns, top_data->d_uid);
+	fake.i_gid = make_kgid(&init_user_ns, get_gid(mnt, top_data));
+	fake.i_mode = (inode->i_mode & S_IFMT)
+			| get_mode(mnt, SDCARDFS_I(inode), top_data);
+	data_put(top_data);
+	fake.i_sb = inode->i_sb;
+
+	if (IS_POSIXACL(inode))
+		pr_warn("%s: This may be undefined behavior...\n", __func__);
+
+	return generic_permission(&fake, mask);
+}
+
+/*
+ * Placeholder ->setattr: sdcardfs only implements the setattr2 variant,
+ * so this emits a rate-limited warning and fails with -EINVAL.
+ */
+static int sdcardfs_setattr_wrn(struct dentry *dentry, struct iattr *ia)
+{
+	const int rc = -EINVAL;
+
+	WARN_RATELIMIT(1, "sdcardfs does not support setattr. User setattr2.\n");
+
+	return rc;
+}
+
+/*
+ * ->setattr2: validate an attribute change against the derived sdcardfs
+ * permissions and apply it to the lower inode.
+ *
+ * Changes to uid, gid and mode are stripped from the request passed to
+ * the lower file system. On success the upper inode's attributes are
+ * refreshed from the lower inode.
+ *
+ * Returns 0 on success, -EINVAL when no top data can be obtained,
+ * -EACCES when check_caller_access_to_name() rejects the caller,
+ * -ENOMEM when the credential override fails, or the error from
+ * setattr_prepare(), inode_newsize_ok() or notify_change2().
+ */
+static int sdcardfs_setattr(struct vfsmount *mnt, struct dentry *dentry, struct iattr *ia)
+{
+ int err;
+ struct dentry *lower_dentry;
+ struct vfsmount *lower_mnt;
+ struct inode *inode;
+ struct inode *lower_inode;
+ struct path lower_path;
+ struct iattr lower_ia;
+ struct dentry *parent;
+ struct inode tmp;
+ struct dentry tmp_d;
+ struct sdcardfs_inode_data *top;
+
+ const struct cred *saved_cred = NULL;
+
+ inode = d_inode(dentry);
+ top = top_data_get(SDCARDFS_I(inode));
+
+ if (!top)
+ return -EINVAL;
+
+ /*
+ * Permission check on sdcardfs inode.
+ * Calling process should have AID_SDCARD_RW permission
+ * Since generic_permission only needs i_mode, i_uid,
+ * i_gid, and i_sb, we can create a fake inode to pass
+ * this information down in.
+ *
+ * The underlying code may attempt to take locks in some
+ * cases for features we're not using, but if that changes,
+ * locks must be dealt with to avoid undefined behavior.
+ *
+ */
+ copy_attrs(&tmp, inode);
+ tmp.i_uid = make_kuid(&init_user_ns, top->d_uid);
+ tmp.i_gid = make_kgid(&init_user_ns, get_gid(mnt, top));
+ tmp.i_mode = (inode->i_mode & S_IFMT)
+ | get_mode(mnt, SDCARDFS_I(inode), top);
+ tmp.i_size = i_size_read(inode);
+ data_put(top);
+ tmp.i_sb = inode->i_sb;
+ /* fake dentry: only d_inode is initialised */
+ tmp_d.d_inode = &tmp;
+
+ /*
+ * Check if user has permission to change dentry. We don't check if
+ * this user can change the lower inode: that should happen when
+ * calling notify_change on the lower inode.
+ */
+ /* prepare our own lower struct iattr (with the lower file) */
+ /* this snapshot is taken before ATTR_FORCE is added to *ia below */
+ memcpy(&lower_ia, ia, sizeof(lower_ia));
+ /* Allow touch updating timestamps. A previous permission check ensures
+ * we have write access. Changes to mode, owner, and group are ignored
+ */
+ ia->ia_valid |= ATTR_FORCE;
+ err = setattr_prepare(&tmp_d, ia);
+
+ if (!err) {
+ /* check the Android group ID */
+ parent = dget_parent(dentry);
+ if (!check_caller_access_to_name(d_inode(parent), &dentry->d_name))
+ err = -EACCES;
+ dput(parent);
+ }
+
+ if (err)
+ goto out_err;
+
+ /* save current_cred and override it */
+ saved_cred = override_fsids(SDCARDFS_SB(dentry->d_sb),
+ SDCARDFS_I(inode)->data);
+ if (!saved_cred)
+ return -ENOMEM;
+
+ sdcardfs_get_lower_path(dentry, &lower_path);
+ lower_dentry = lower_path.dentry;
+ lower_mnt = lower_path.mnt;
+ lower_inode = sdcardfs_lower_inode(inode);
+
+ /* hand the lower file, not ours, to the lower file system */
+ if (ia->ia_valid & ATTR_FILE)
+ lower_ia.ia_file = sdcardfs_lower_file(ia->ia_file);
+
+ /* owner, group and mode changes are never passed down */
+ lower_ia.ia_valid &= ~(ATTR_UID | ATTR_GID | ATTR_MODE);
+
+ /*
+ * If shrinking, first truncate upper level to cancel writing dirty
+ * pages beyond the new eof; and also if its' maxbytes is more
+ * limiting (fail with -EFBIG before making any change to the lower
+ * level). There is no need to vmtruncate the upper level
+ * afterwards in the other cases: we fsstack_copy_inode_size from
+ * the lower level.
+ */
+ if (ia->ia_valid & ATTR_SIZE) {
+ err = inode_newsize_ok(&tmp, ia->ia_size);
+ if (err) {
+ goto out;
+ }
+ truncate_setsize(inode, ia->ia_size);
+ }
+
+ /*
+ * mode change is for clearing setuid/setgid bits. Allow lower fs
+ * to interpret this in its own way.
+ */
+ if (lower_ia.ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID))
+ lower_ia.ia_valid &= ~ATTR_MODE;
+
+ /* notify the (possibly copied-up) lower inode */
+ /*
+ * Note: we use d_inode(lower_dentry), because lower_inode may be
+ * unlinked (no inode->i_sb and i_ino==0. This happens if someone
+ * tries to open(), unlink(), then ftruncate() a file.
+ */
+ inode_lock(d_inode(lower_dentry));
+ err = notify_change2(lower_mnt, lower_dentry, &lower_ia, /* note: lower_ia */
+ NULL);
+ inode_unlock(d_inode(lower_dentry));
+ if (err)
+ goto out;
+
+ /* get attributes from the lower inode and update derived permissions */
+ sdcardfs_copy_and_fix_attrs(inode, lower_inode);
+
+ /*
+ * Not running fsstack_copy_inode_size(inode, lower_inode), because
+ * VFS should update our inode size, and notify_change on
+ * lower_inode should update its size.
+ */
+
+out:
+ sdcardfs_put_lower_path(dentry, &lower_path);
+ revert_fsids(saved_cred);
+out_err:
+ return err;
+}
+
+/*
+ * Populate @stat from the sdcardfs inode, reporting the derived owner,
+ * group and permission bits instead of the raw ones.
+ *
+ * Returns 0 on success, or -EINVAL when top_data_get() yields nothing.
+ */
+static int sdcardfs_fillattr(struct vfsmount *mnt,
+				struct inode *inode, struct kstat *stat)
+{
+	struct sdcardfs_inode_info *sdi = SDCARDFS_I(inode);
+	struct sdcardfs_inode_data *top_data = top_data_get(sdi);
+
+	if (top_data == NULL)
+		return -EINVAL;
+
+	/* fields that depend on the derived-permission data */
+	stat->mode = (inode->i_mode & S_IFMT) | get_mode(mnt, sdi, top_data);
+	stat->uid = make_kuid(&init_user_ns, top_data->d_uid);
+	stat->gid = make_kgid(&init_user_ns, get_gid(mnt, top_data));
+	data_put(top_data);
+
+	/* fields copied straight from the inode */
+	stat->dev = inode->i_sb->s_dev;
+	stat->ino = inode->i_ino;
+	stat->nlink = inode->i_nlink;
+	stat->rdev = inode->i_rdev;
+	stat->size = i_size_read(inode);
+	stat->atime = inode->i_atime;
+	stat->mtime = inode->i_mtime;
+	stat->ctime = inode->i_ctime;
+	stat->blksize = (1 << inode->i_blkbits);
+	stat->blocks = inode->i_blocks;
+
+	return 0;
+}
+
+/*
+ * ->getattr: refuse callers that may not see this name, then refresh our
+ * inode from the lower one and report the derived attributes.  The block
+ * count is taken from the lower file system's answer.
+ */
+static int sdcardfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
+				struct kstat *stat)
+{
+	struct path lower;
+	struct kstat lower_stat;
+	struct dentry *dir;
+	int allowed;
+	int ret;
+
+	dir = dget_parent(dentry);
+	allowed = check_caller_access_to_name(d_inode(dir), &dentry->d_name);
+	dput(dir);
+	if (!allowed)
+		return -EACCES;
+
+	sdcardfs_get_lower_path(dentry, &lower);
+	ret = vfs_getattr(&lower, &lower_stat);
+	if (!ret) {
+		sdcardfs_copy_and_fix_attrs(d_inode(dentry),
+					    d_inode(lower.dentry));
+		ret = sdcardfs_fillattr(mnt, d_inode(dentry), stat);
+		stat->blocks = lower_stat.blocks;
+	}
+	sdcardfs_put_lower_path(dentry, &lower);
+	return ret;
+}
+
+/*
+ * Inode operations installed on symlinks by sdcardfs_iget().  Only the
+ * permission2 and setattr2 hooks are wired up; the link-reading hooks
+ * listed in the comment below are intentionally left disabled.
+ */
+const struct inode_operations sdcardfs_symlink_iops = {
+ .permission2 = sdcardfs_permission,
+ .setattr2 = sdcardfs_setattr,
+ /* XXX Following operations are implemented,
+ * but FUSE(sdcard) or FAT does not support them
+ * These methods are *NOT* perfectly tested.
+ .readlink = sdcardfs_readlink,
+ .follow_link = sdcardfs_follow_link,
+ .put_link = kfree_put_link,
+ */
+};
+
+/*
+ * Inode operations installed on directories by sdcardfs_iget().  The
+ * plain .permission/.setattr slots point at the *_wrn variants; the
+ * vfsmount-aware work is done through .permission2/.setattr2.
+ */
+const struct inode_operations sdcardfs_dir_iops = {
+ .create = sdcardfs_create,
+ .lookup = sdcardfs_lookup,
+ .permission = sdcardfs_permission_wrn,
+ .permission2 = sdcardfs_permission,
+ .unlink = sdcardfs_unlink,
+ .mkdir = sdcardfs_mkdir,
+ .rmdir = sdcardfs_rmdir,
+ .rename = sdcardfs_rename,
+ .setattr = sdcardfs_setattr_wrn,
+ .setattr2 = sdcardfs_setattr,
+ .getattr = sdcardfs_getattr,
+};
+
+/*
+ * Inode operations for everything that is neither a directory nor a
+ * symlink (see the i_op selection in sdcardfs_iget()).
+ */
+const struct inode_operations sdcardfs_main_iops = {
+ .permission = sdcardfs_permission_wrn,
+ .permission2 = sdcardfs_permission,
+ .setattr = sdcardfs_setattr_wrn,
+ .setattr2 = sdcardfs_setattr,
+ .getattr = sdcardfs_getattr,
+};
diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c
new file mode 100644
index 000000000000..e1bff0da1925
--- /dev/null
+++ b/fs/sdcardfs/lookup.c
@@ -0,0 +1,471 @@
+/*
+ * fs/sdcardfs/lookup.c
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ * Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009 Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed. It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#include "sdcardfs.h"
+#include "linux/delay.h"
+
+/* The dentry cache is just so we have properly sized dentries */
+static struct kmem_cache *sdcardfs_dentry_cachep;
+
+/*
+ * Create the slab cache backing struct sdcardfs_dentry_info.
+ * Returns 0 on success or -ENOMEM if the cache could not be created.
+ */
+int sdcardfs_init_dentry_cache(void)
+{
+	struct kmem_cache *cache;
+
+	cache = kmem_cache_create("sdcardfs_dentry",
+				  sizeof(struct sdcardfs_dentry_info),
+				  0, SLAB_RECLAIM_ACCOUNT, NULL);
+	sdcardfs_dentry_cachep = cache;
+	if (!cache)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/* Tear down the slab cache created by sdcardfs_init_dentry_cache(). */
+void sdcardfs_destroy_dentry_cache(void)
+{
+ kmem_cache_destroy(sdcardfs_dentry_cachep);
+}
+
+/*
+ * Return a dentry's private info to the slab cache and clear d_fsdata.
+ * A NULL dentry or a dentry without private data is silently ignored.
+ */
+void free_dentry_private_data(struct dentry *dentry)
+{
+	if (dentry && dentry->d_fsdata) {
+		kmem_cache_free(sdcardfs_dentry_cachep, dentry->d_fsdata);
+		dentry->d_fsdata = NULL;
+	}
+}
+
+/*
+ * Allocate zeroed private data for @dentry and attach it as d_fsdata.
+ *
+ * Returns 0 on success or -ENOMEM if the slab allocation fails, in which
+ * case d_fsdata is left untouched.
+ */
+int new_dentry_private_data(struct dentry *dentry)
+{
+	struct sdcardfs_dentry_info *info;
+
+	/* use zalloc to init dentry_info.lower_path */
+	info = kmem_cache_zalloc(sdcardfs_dentry_cachep, GFP_ATOMIC);
+	if (!info)
+		return -ENOMEM;
+
+	spin_lock_init(&info->lock);
+	dentry->d_fsdata = info;
+
+	return 0;
+}
+
+/*
+ * Lookup key handed to iget5_locked(): an sdcardfs inode matches when both
+ * its lower inode and its userid equal these (see sdcardfs_inode_test()).
+ */
+struct inode_data {
+ struct inode *lower_inode;
+ userid_t id;
+};
+
+/*
+ * iget5_locked() comparison callback: report 1 when @inode wraps the same
+ * lower inode for the same userid as the candidate key, 0 otherwise.
+ */
+static int sdcardfs_inode_test(struct inode *inode, void *candidate_data)
+{
+	const struct inode_data *wanted = candidate_data;
+	struct inode *have_lower = sdcardfs_lower_inode(inode);
+	userid_t have_id = SDCARDFS_I(inode)->data->userid;
+
+	if (have_lower != wanted->lower_inode)
+		return 0; /* different lower inode */
+	if (have_id != wanted->id)
+		return 0; /* same lower inode, different user */
+	return 1;
+}
+
+/*
+ * iget5_locked() init callback.  Deliberately a no-op that always returns
+ * 0; the new inode is filled in by sdcardfs_iget() itself.
+ */
+static int sdcardfs_inode_set(struct inode *inode, void *lower_inode)
+{
+ /* we do actual inode initialization in sdcardfs_iget */
+ return 0;
+}
+
+/*
+ * Find or create the sdcardfs inode that wraps @lower_inode for user @id.
+ *
+ * A reference is taken on @lower_inode with igrab(); it is dropped again
+ * when a cached inode is found or when iget5_locked() fails, and is kept
+ * (stored via sdcardfs_set_lower_inode()) for a newly created inode.
+ *
+ * Returns the inode, ERR_PTR(-ESTALE) if the lower inode could not be
+ * grabbed, or ERR_PTR(-ENOMEM) if no inode could be obtained.
+ */
+struct inode *sdcardfs_iget(struct super_block *sb, struct inode *lower_inode, userid_t id)
+{
+ struct sdcardfs_inode_info *info;
+ struct inode_data data;
+ struct inode *inode; /* the new inode to return */
+
+ if (!igrab(lower_inode))
+ return ERR_PTR(-ESTALE);
+
+ data.id = id;
+ data.lower_inode = lower_inode;
+ inode = iget5_locked(sb, /* our superblock */
+ /*
+ * hashval: we use inode number, but we can
+ * also use "(unsigned long)lower_inode"
+ * instead.
+ */
+ lower_inode->i_ino, /* hashval */
+ sdcardfs_inode_test, /* inode comparison function */
+ sdcardfs_inode_set, /* inode init function */
+ &data); /* data passed to test+set fxns */
+ if (!inode) {
+ iput(lower_inode);
+ return ERR_PTR(-ENOMEM);
+ }
+ /* if found a cached inode, then just return it (after iput) */
+ if (!(inode->i_state & I_NEW)) {
+ iput(lower_inode);
+ return inode;
+ }
+
+ /* initialize new inode */
+ /* NOTE(review): 'info' is assigned here but not read again in this function */
+ info = SDCARDFS_I(inode);
+
+ inode->i_ino = lower_inode->i_ino;
+ sdcardfs_set_lower_inode(inode, lower_inode);
+
+ inode->i_version++;
+
+ /* use different set of inode ops for symlinks & directories */
+ if (S_ISDIR(lower_inode->i_mode))
+ inode->i_op = &sdcardfs_dir_iops;
+ else if (S_ISLNK(lower_inode->i_mode))
+ inode->i_op = &sdcardfs_symlink_iops;
+ else
+ inode->i_op = &sdcardfs_main_iops;
+
+ /* use different set of file ops for directories */
+ if (S_ISDIR(lower_inode->i_mode))
+ inode->i_fop = &sdcardfs_dir_fops;
+ else
+ inode->i_fop = &sdcardfs_main_fops;
+
+ inode->i_mapping->a_ops = &sdcardfs_aops;
+
+ /* start with zeroed timestamps */
+ inode->i_atime.tv_sec = 0;
+ inode->i_atime.tv_nsec = 0;
+ inode->i_mtime.tv_sec = 0;
+ inode->i_mtime.tv_nsec = 0;
+ inode->i_ctime.tv_sec = 0;
+ inode->i_ctime.tv_nsec = 0;
+
+ /* properly initialize special inodes */
+ if (S_ISBLK(lower_inode->i_mode) || S_ISCHR(lower_inode->i_mode) ||
+ S_ISFIFO(lower_inode->i_mode) || S_ISSOCK(lower_inode->i_mode))
+ init_special_inode(inode, lower_inode->i_mode,
+ lower_inode->i_rdev);
+
+ /* all well, copy inode attributes */
+ sdcardfs_copy_and_fix_attrs(inode, lower_inode);
+ fsstack_copy_inode_size(inode, lower_inode);
+
+ /* clears I_NEW so other lookups of this inode may proceed */
+ unlock_new_inode(inode);
+ return inode;
+}
+
+/*
+ * Helper interpose routine, called directly by ->lookup to handle
+ * spliced dentries.
+ *
+ * Obtains the sdcardfs inode for the lower inode behind @lower_path and
+ * splices it onto @dentry.  Returns whatever d_splice_alias() returns
+ * (NULL, an alternative dentry, or an ERR_PTR), ERR_PTR(-EXDEV) if the
+ * lower inode belongs to a different superblock than the one we stack on,
+ * or the error from sdcardfs_iget().
+ */
+static struct dentry *__sdcardfs_interpose(struct dentry *dentry,
+ struct super_block *sb,
+ struct path *lower_path,
+ userid_t id)
+{
+ struct inode *inode;
+ struct inode *lower_inode;
+ struct super_block *lower_sb;
+ struct dentry *ret_dentry;
+
+ lower_inode = d_inode(lower_path->dentry);
+ lower_sb = sdcardfs_lower_super(sb);
+
+ /* check that the lower file system didn't cross a mount point */
+ if (lower_inode->i_sb != lower_sb) {
+ ret_dentry = ERR_PTR(-EXDEV);
+ goto out;
+ }
+
+ /*
+ * We allocate our new inode below by calling sdcardfs_iget,
+ * which will initialize some of the new inode's fields
+ */
+
+ /* inherit lower inode number for sdcardfs's inode */
+ inode = sdcardfs_iget(sb, lower_inode, id);
+ if (IS_ERR(inode)) {
+ ret_dentry = ERR_CAST(inode);
+ goto out;
+ }
+
+ ret_dentry = d_splice_alias(inode, dentry);
+ /* refresh permissions on the spliced alias if there is one, else on @dentry */
+ dentry = ret_dentry ?: dentry;
+ if (!IS_ERR(dentry))
+ update_derived_permission_lock(dentry);
+out:
+ return ret_dentry;
+}
+
+/*
+ * Connect an sdcardfs inode dentry/inode with several lower ones. This is
+ * the classic stackable file system "vnode interposition" action.
+ *
+ * @dentry: sdcardfs's dentry which interposes on lower one
+ * @sb: sdcardfs's super_block
+ * @lower_path: the lower path (caller does path_get/put)
+ * @id: userid passed through to sdcardfs_iget()
+ *
+ * Returns PTR_ERR() of the value produced by __sdcardfs_interpose(), i.e.
+ * 0 when that value is NULL and a negative errno when it is an ERR_PTR.
+ *
+ * NOTE(review): __sdcardfs_interpose() can also hand back a valid non-NULL
+ * dentry from d_splice_alias(); PTR_ERR() of that is not a meaningful
+ * error code and the dentry is not released here.  Confirm that callers
+ * can never hit that case.
+ */
+int sdcardfs_interpose(struct dentry *dentry, struct super_block *sb,
+ struct path *lower_path, userid_t id)
+{
+ struct dentry *ret_dentry;
+
+ ret_dentry = __sdcardfs_interpose(dentry, sb, lower_path, id);
+ return PTR_ERR(ret_dentry);
+}
+
+/*
+ * State for the case-insensitive directory scan in __sdcardfs_lookup();
+ * sdcardfs_name_match() recovers it from the embedded dir_context.
+ */
+struct sdcardfs_name_data {
+ struct dir_context ctx; /* must stay embedded: container_of() is used on it */
+ const struct qstr *to_find; /* name being searched for */
+ char *name; /* output buffer receiving the matching on-disk name */
+ bool found; /* set once a match has been copied into name */
+};
+
+/*
+ * Directory iteration actor: when the entry name equals the wanted name
+ * under qstr_case_eq(), copy it (NUL-terminated) into the search buffer,
+ * flag the hit and return 1; otherwise return 0.
+ */
+static int sdcardfs_name_match(struct dir_context *ctx, const char *name,
+		int namelen, loff_t offset, u64 ino, unsigned int d_type)
+{
+	struct sdcardfs_name_data *search =
+		container_of(ctx, struct sdcardfs_name_data, ctx);
+	struct qstr entry = QSTR_INIT(name, namelen);
+
+	if (!qstr_case_eq(search->to_find, &entry))
+		return 0;
+
+	memcpy(search->name, name, namelen);
+	search->name[namelen] = 0;
+	search->found = true;
+	return 1;
+}
+
+/*
+ * Main driver function for sdcardfs's lookup.
+ *
+ * @dentry: the sdcardfs dentry being looked up
+ * @flags: lookup flags; LOOKUP_CREATE / LOOKUP_RENAME_TARGET suppress the
+ * -ENOENT result for a negative dentry
+ * @lower_parent_path: lower <dentry,mnt> of the parent directory, read
+ * here to locate the lower object
+ * @id: userid handed to __sdcardfs_interpose()
+ *
+ * The name is first looked up as given; on -ENOENT the lower directory is
+ * scanned for a name that matches under qstr_case_eq() and the lookup is
+ * retried with that spelling.
+ *
+ * Returns: NULL or the dentry returned by __sdcardfs_interpose() on
+ * success, ERR_PTR if an error occurred.
+ */
+static struct dentry *__sdcardfs_lookup(struct dentry *dentry,
+ unsigned int flags, struct path *lower_parent_path, userid_t id)
+{
+ int err = 0;
+ struct vfsmount *lower_dir_mnt;
+ struct dentry *lower_dir_dentry = NULL;
+ struct dentry *lower_dentry;
+ const struct qstr *name;
+ struct path lower_path;
+ struct qstr dname;
+ struct dentry *ret_dentry = NULL;
+ struct sdcardfs_sb_info *sbi;
+
+ sbi = SDCARDFS_SB(dentry->d_sb);
+ /* must initialize dentry operations */
+ d_set_d_op(dentry, &sdcardfs_ci_dops);
+
+ if (IS_ROOT(dentry))
+ goto out;
+
+ name = &dentry->d_name;
+
+ /* now start the actual lookup procedure */
+ lower_dir_dentry = lower_parent_path->dentry;
+ lower_dir_mnt = lower_parent_path->mnt;
+
+ /* Use vfs_path_lookup to check if the dentry exists or not */
+ err = vfs_path_lookup(lower_dir_dentry, lower_dir_mnt, name->name, 0,
+ &lower_path);
+ /* check for other cases */
+ if (err == -ENOENT) {
+ struct file *file;
+ const struct cred *cred = current_cred();
+
+ struct sdcardfs_name_data buffer = {
+ .ctx.actor = sdcardfs_name_match,
+ .to_find = name,
+ .name = __getname(),
+ .found = false,
+ };
+
+ if (!buffer.name) {
+ err = -ENOMEM;
+ goto out;
+ }
+ /* scan the lower parent directory for a differently-cased match */
+ file = dentry_open(lower_parent_path, O_RDONLY, cred);
+ if (IS_ERR(file)) {
+ err = PTR_ERR(file);
+ goto put_name;
+ }
+ err = iterate_dir(file, &buffer.ctx);
+ fput(file);
+ if (err)
+ goto put_name;
+
+ /* retry with the spelling found in the directory, if any */
+ if (buffer.found)
+ err = vfs_path_lookup(lower_dir_dentry,
+ lower_dir_mnt,
+ buffer.name, 0,
+ &lower_path);
+ else
+ err = -ENOENT;
+put_name:
+ __putname(buffer.name);
+ }
+
+ /* no error: handle positive dentries */
+ if (!err) {
+ /* check if the dentry is an obb dentry
+ * if true, the lower_inode must be replaced with
+ * the inode of the graft path
+ */
+
+ if (need_graft_path(dentry)) {
+
+ /* setup_obb_dentry()
+ * The lower_path will be stored to the dentry's orig_path
+ * and the base obbpath will be copied to the lower_path variable.
+ * if an error returned, there's no change in the lower_path
+ * returns: -ERRNO if error (0: no error)
+ */
+ err = setup_obb_dentry(dentry, &lower_path);
+
+ if (err) {
+ /* if the sbi->obbpath is not available, we can optionally
+ * setup the lower_path with its orig_path.
+ * but, the current implementation just returns an error
+ * because the sdcard daemon also regards this case as
+ * a lookup fail.
+ */
+ pr_info("sdcardfs: base obbpath is not available\n");
+ sdcardfs_put_reset_orig_path(dentry);
+ goto out;
+ }
+ }
+
+ sdcardfs_set_lower_path(dentry, &lower_path);
+ ret_dentry =
+ __sdcardfs_interpose(dentry, dentry->d_sb, &lower_path, id);
+ if (IS_ERR(ret_dentry)) {
+ err = PTR_ERR(ret_dentry);
+ /* path_put underlying path on error */
+ sdcardfs_put_reset_lower_path(dentry);
+ }
+ goto out;
+ }
+
+ /*
+ * We don't consider ENOENT an error, and we want to return a
+ * negative dentry.
+ */
+ if (err && err != -ENOENT)
+ goto out;
+
+ /* instantiate a new negative dentry */
+ dname.name = name->name;
+ dname.len = name->len;
+
+ /* See if the low-level filesystem might want
+ * to use its own hash
+ */
+ lower_dentry = d_hash_and_lookup(lower_dir_dentry, &dname);
+ if (IS_ERR(lower_dentry))
+ return lower_dentry;
+ if (!lower_dentry) {
+ /* We called vfs_path_lookup earlier, and did not get a negative
+ * dentry then. Don't confuse the lower filesystem by forcing
+ * one on it now...
+ */
+ err = -ENOENT;
+ goto out;
+ }
+
+ /* remember the negative lower dentry; err is still -ENOENT here */
+ lower_path.dentry = lower_dentry;
+ lower_path.mnt = mntget(lower_dir_mnt);
+ sdcardfs_set_lower_path(dentry, &lower_path);
+
+ /*
+ * If the intent is to create a file, then don't return an error, so
+ * the VFS will continue the process of making this negative dentry
+ * into a positive one.
+ */
+ if (flags & (LOOKUP_CREATE|LOOKUP_RENAME_TARGET))
+ err = 0;
+
+out:
+ if (err)
+ return ERR_PTR(err);
+ return ret_dentry;
+}
+
+/*
+ * ->lookup entry point.
+ *
+ * On success:
+ * fills dentry object appropriate values and returns NULL, or returns
+ * the dentry produced by __sdcardfs_lookup() when there is one.
+ * On fail (== error)
+ * returns error ptr (-EACCES if the caller may not access the name,
+ * -ENOMEM if credentials or private data cannot be set up, or the
+ * error from __sdcardfs_lookup()).
+ *
+ * @dir : Parent inode.
+ * @dentry : Target dentry to lookup. we should set each of fields.
+ * (dentry->d_name is initialized already)
+ * @flags : lookup flags, passed through to __sdcardfs_lookup()
+ */
+struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ struct dentry *ret = NULL, *parent;
+ struct path lower_parent_path;
+ int err = 0;
+ const struct cred *saved_cred = NULL;
+
+ parent = dget_parent(dentry);
+
+ if (!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) {
+ ret = ERR_PTR(-EACCES);
+ goto out_err;
+ }
+
+ /* save current_cred and override it */
+ saved_cred = override_fsids(SDCARDFS_SB(dir->i_sb),
+ SDCARDFS_I(dir)->data);
+ if (!saved_cred) {
+ ret = ERR_PTR(-ENOMEM);
+ goto out_err;
+ }
+
+ sdcardfs_get_lower_path(parent, &lower_parent_path);
+
+ /* allocate dentry private data. We free it in ->d_release */
+ err = new_dentry_private_data(dentry);
+ if (err) {
+ ret = ERR_PTR(err);
+ goto out;
+ }
+
+ ret = __sdcardfs_lookup(dentry, flags, &lower_parent_path,
+ SDCARDFS_I(dir)->data->userid);
+ if (IS_ERR(ret))
+ goto out;
+ /* continue with the dentry the lookup handed back, if any */
+ if (ret)
+ dentry = ret;
+ if (d_inode(dentry)) {
+ fsstack_copy_attr_times(d_inode(dentry),
+ sdcardfs_lower_inode(d_inode(dentry)));
+ /* get derived permission */
+ get_derived_permission(parent, dentry);
+ fixup_tmp_permissions(d_inode(dentry));
+ fixup_lower_ownership(dentry, dentry->d_name.name);
+ }
+ /* update parent directory's atime */
+ fsstack_copy_attr_atime(d_inode(parent),
+ sdcardfs_lower_inode(d_inode(parent)));
+
+out:
+ sdcardfs_put_lower_path(parent, &lower_parent_path);
+ revert_fsids(saved_cred);
+out_err:
+ dput(parent);
+ return ret;
+}
diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c
new file mode 100644
index 000000000000..80825b287836
--- /dev/null
+++ b/fs/sdcardfs/main.c
@@ -0,0 +1,479 @@
+/*
+ * fs/sdcardfs/main.c
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ * Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009 Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed. It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#include "sdcardfs.h"
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/parser.h>
+
+/* Token ids for the mount options recognised by match_token(). */
+enum {
+ Opt_fsuid,
+ Opt_fsgid,
+ Opt_gid,
+ Opt_debug,
+ Opt_mask,
+ Opt_multiuser,
+ Opt_userid,
+ Opt_reserved_mb,
+ Opt_err,
+};
+
+/*
+ * Option patterns, shared by parse_options() and parse_options_remount().
+ * The table is terminated by the Opt_err entry with a NULL pattern.
+ */
+static const match_table_t sdcardfs_tokens = {
+ {Opt_fsuid, "fsuid=%u"},
+ {Opt_fsgid, "fsgid=%u"},
+ {Opt_gid, "gid=%u"},
+ {Opt_debug, "debug"},
+ {Opt_mask, "mask=%u"},
+ {Opt_userid, "userid=%d"},
+ {Opt_multiuser, "multiuser"},
+ {Opt_reserved_mb, "reserved_mb=%u"},
+ {Opt_err, NULL}
+};
+
+/*
+ * Parse the comma-separated mount option string.
+ *
+ * @sb: superblock being mounted (unused here)
+ * @options: option string, modified in place by strsep(); may be NULL
+ * @silent: when non-zero, suppress the error message for bad options
+ * @debug: out: set to 1 when "debug" was given, else 0
+ * @vfsopts: out: per-mount options (gid, mask)
+ * @opts: out: per-superblock options
+ *
+ * All outputs are reset to their defaults before parsing.
+ *
+ * Returns 0 on success, or -EINVAL for an unknown option or an option
+ * whose numeric value cannot be converted.  A bad value used to return 0,
+ * which reported success while silently dropping the remaining options.
+ */
+static int parse_options(struct super_block *sb, char *options, int silent,
+				int *debug, struct sdcardfs_vfsmount_options *vfsopts,
+				struct sdcardfs_mount_options *opts)
+{
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+
+	/* by default, we use AID_MEDIA_RW as uid, gid */
+	opts->fs_low_uid = AID_MEDIA_RW;
+	opts->fs_low_gid = AID_MEDIA_RW;
+	vfsopts->mask = 0;
+	opts->multiuser = false;
+	opts->fs_user_id = 0;
+	vfsopts->gid = 0;
+	/* by default, 0MB is reserved */
+	opts->reserved_mb = 0;
+
+	*debug = 0;
+
+	if (!options)
+		return 0;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+
+		/* skip empty fields such as ",," */
+		if (!*p)
+			continue;
+
+		token = match_token(p, sdcardfs_tokens, args);
+
+		switch (token) {
+		case Opt_debug:
+			*debug = 1;
+			break;
+		case Opt_fsuid:
+			if (match_int(&args[0], &option))
+				goto bad_value;
+			opts->fs_low_uid = option;
+			break;
+		case Opt_fsgid:
+			if (match_int(&args[0], &option))
+				goto bad_value;
+			opts->fs_low_gid = option;
+			break;
+		case Opt_gid:
+			if (match_int(&args[0], &option))
+				goto bad_value;
+			vfsopts->gid = option;
+			break;
+		case Opt_userid:
+			if (match_int(&args[0], &option))
+				goto bad_value;
+			opts->fs_user_id = option;
+			break;
+		case Opt_mask:
+			if (match_int(&args[0], &option))
+				goto bad_value;
+			vfsopts->mask = option;
+			break;
+		case Opt_multiuser:
+			opts->multiuser = true;
+			break;
+		case Opt_reserved_mb:
+			if (match_int(&args[0], &option))
+				goto bad_value;
+			opts->reserved_mb = option;
+			break;
+		/* unknown option */
+		default:
+			if (!silent)
+				pr_err("Unrecognized mount option \"%s\" or missing value\n", p);
+			return -EINVAL;
+		}
+	}
+
+	if (*debug) {
+		pr_info("sdcardfs : options - debug:%d\n", *debug);
+		pr_info("sdcardfs : options - uid:%d\n",
+							opts->fs_low_uid);
+		pr_info("sdcardfs : options - gid:%d\n",
+							opts->fs_low_gid);
+	}
+
+	return 0;
+
+bad_value:
+	/* the pattern matched but the number could not be converted */
+	if (!silent)
+		pr_err("sdcardfs: invalid value in mount option \"%s\"\n", p);
+	return -EINVAL;
+}
+
+/*
+ * Parse the option string supplied on remount.
+ *
+ * Only "gid", "mask" and "debug" take effect; the other known options
+ * are reported as unchangeable and otherwise ignored.
+ *
+ * @sb: superblock being remounted (unused here)
+ * @options: option string, modified in place by strsep(); may be NULL
+ * @silent: when non-zero, suppress the error message for bad options
+ * @vfsopts: per-mount options updated in place
+ *
+ * Returns 0 on success, or -EINVAL for an unknown option or an option
+ * whose numeric value cannot be converted.
+ */
+int parse_options_remount(struct super_block *sb, char *options, int silent,
+		struct sdcardfs_vfsmount_options *vfsopts)
+{
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+	/* must start at 0: it is tested below even if "debug" never appears */
+	int debug = 0;
+
+	if (!options)
+		return 0;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+
+		/* skip empty fields such as ",," */
+		if (!*p)
+			continue;
+
+		token = match_token(p, sdcardfs_tokens, args);
+
+		switch (token) {
+		case Opt_debug:
+			debug = 1;
+			break;
+		case Opt_gid:
+			if (match_int(&args[0], &option))
+				goto bad_value;
+			vfsopts->gid = option;
+			break;
+		case Opt_mask:
+			if (match_int(&args[0], &option))
+				goto bad_value;
+			vfsopts->mask = option;
+			break;
+		case Opt_multiuser:
+		case Opt_userid:
+		case Opt_fsuid:
+		case Opt_fsgid:
+		case Opt_reserved_mb:
+			pr_warn("Option \"%s\" can't be changed during remount\n", p);
+			break;
+		/* unknown option */
+		default:
+			if (!silent)
+				pr_err("Unrecognized mount option \"%s\" or missing value\n", p);
+			return -EINVAL;
+		}
+	}
+
+	if (debug) {
+		pr_info("sdcardfs : options - debug:%d\n", debug);
+		pr_info("sdcardfs : options - gid:%d\n", vfsopts->gid);
+		pr_info("sdcardfs : options - mask:%d\n", vfsopts->mask);
+	}
+
+	return 0;
+
+bad_value:
+	/* the pattern matched but the number could not be converted */
+	if (!silent)
+		pr_err("sdcardfs: invalid value in mount option \"%s\"\n", p);
+	return -EINVAL;
+}
+
+#if 0
+/*
+ * NOTE: compiled out by the surrounding "#if 0"; sdcardfs_read_super()
+ * uses d_make_root() instead.
+ *
+ * our custom d_alloc_root work-alike
+ *
+ * we can't use d_alloc_root if we want to use our own interpose function
+ * unchanged, so we simply call our own "fake" d_alloc_root
+ */
+static struct dentry *sdcardfs_d_alloc_root(struct super_block *sb)
+{
+ struct dentry *ret = NULL;
+
+ if (sb) {
+ static const struct qstr name = {
+ .name = "/",
+ .len = 1
+ };
+
+ ret = d_alloc(NULL, &name);
+ if (ret) {
+ d_set_d_op(ret, &sdcardfs_ci_dops);
+ ret->d_sb = sb;
+ ret->d_parent = ret;
+ }
+ }
+ return ret;
+}
+#endif
+
+/*
+ * List of sdcardfs superblock infos: entries are added in
+ * sdcardfs_read_super() and removed in sdcardfs_kill_sb(), both while
+ * holding sdcardfs_super_list_lock.
+ */
+DEFINE_MUTEX(sdcardfs_super_list_lock);
+EXPORT_SYMBOL_GPL(sdcardfs_super_list_lock);
+LIST_HEAD(sdcardfs_super_list);
+EXPORT_SYMBOL_GPL(sdcardfs_super_list);
+
+/*
+ * Fill in a new sdcardfs superblock stacked on the directory @dev_name.
+ *
+ * @mnt: vfsmount being set up; mnt->data receives the per-mount options
+ * @sb: superblock to initialise
+ * @dev_name: path of the lower directory (must not be NULL)
+ * @raw_data: mount option string, may be NULL
+ * @silent: suppress non-critical messages when non-zero
+ *
+ * Returns 0 on success or a negative errno.  On failure every reference
+ * and allocation made here is released and sb->s_fs_info and sb->s_root
+ * are left NULL.
+ *
+ * There is no need to lock the sdcardfs_super_info's rwsem as there is no
+ * way anyone can have a reference to the superblock at this point in time.
+ */
+static int sdcardfs_read_super(struct vfsmount *mnt, struct super_block *sb,
+		const char *dev_name, void *raw_data, int silent)
+{
+	int err = 0;
+	int debug;
+	struct super_block *lower_sb;
+	struct path lower_path;
+	struct sdcardfs_sb_info *sb_info;
+	struct sdcardfs_vfsmount_options *mnt_opt = mnt->data;
+	struct inode *inode;
+
+	pr_info("sdcardfs version 2.0\n");
+
+	if (!dev_name) {
+		pr_err("sdcardfs: read_super: missing dev_name argument\n");
+		err = -EINVAL;
+		goto out;
+	}
+
+	pr_info("sdcardfs: dev_name -> %s\n", dev_name);
+	pr_info("sdcardfs: options -> %s\n", (char *)raw_data);
+	pr_info("sdcardfs: mnt -> %p\n", mnt);
+
+	/* parse lower path */
+	err = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY,
+			&lower_path);
+	if (err) {
+		pr_err("sdcardfs: error accessing lower directory '%s'\n", dev_name);
+		goto out;
+	}
+
+	/* allocate superblock private data */
+	sb->s_fs_info = kzalloc(sizeof(struct sdcardfs_sb_info), GFP_KERNEL);
+	if (!SDCARDFS_SB(sb)) {
+		pr_crit("sdcardfs: read_super: out of memory\n");
+		err = -ENOMEM;
+		goto out_free;
+	}
+
+	sb_info = sb->s_fs_info;
+	/* parse options */
+	err = parse_options(sb, raw_data, silent, &debug, mnt_opt, &sb_info->options);
+	if (err) {
+		pr_err("sdcardfs: invalid options\n");
+		goto out_freesbi;
+	}
+
+	/*
+	 * Allocate the obb path buffer now, while failure is still trivial
+	 * to unwind; it is filled in further down.  The result used to be
+	 * passed to snprintf() unchecked.
+	 */
+	sb_info->obbpath_s = kzalloc(PATH_MAX, GFP_KERNEL);
+	if (!sb_info->obbpath_s) {
+		pr_crit("sdcardfs: read_super: out of memory\n");
+		err = -ENOMEM;
+		goto out_freesbi;
+	}
+
+	/* set the lower superblock field of upper superblock */
+	lower_sb = lower_path.dentry->d_sb;
+	atomic_inc(&lower_sb->s_active);
+	sdcardfs_set_lower_super(sb, lower_sb);
+
+	/* inherit maxbytes from lower file system */
+	sb->s_maxbytes = lower_sb->s_maxbytes;
+
+	/*
+	 * Our c/m/atime granularity is 1 ns because we may stack on file
+	 * systems whose granularity is as good.
+	 */
+	sb->s_time_gran = 1;
+
+	sb->s_magic = SDCARDFS_SUPER_MAGIC;
+	sb->s_op = &sdcardfs_sops;
+
+	/* get a new inode and allocate our root dentry */
+	inode = sdcardfs_iget(sb, d_inode(lower_path.dentry), 0);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out_sput;
+	}
+	sb->s_root = d_make_root(inode);
+	if (!sb->s_root) {
+		/* d_make_root() has already dropped the inode reference */
+		err = -ENOMEM;
+		goto out_sput;
+	}
+	d_set_d_op(sb->s_root, &sdcardfs_ci_dops);
+
+	/* link the upper and lower dentries */
+	sb->s_root->d_fsdata = NULL;
+	err = new_dentry_private_data(sb->s_root);
+	if (err)
+		goto out_freeroot;
+
+	/* set the lower dentries for s_root */
+	sdcardfs_set_lower_path(sb->s_root, &lower_path);
+
+	/*
+	 * No need to call interpose because we already have a positive
+	 * dentry, which was instantiated by d_make_root. Just need to
+	 * d_rehash it.
+	 */
+	d_rehash(sb->s_root);
+
+	/* setup permission policy */
+	mutex_lock(&sdcardfs_super_list_lock);
+	if (sb_info->options.multiuser) {
+		setup_derived_state(d_inode(sb->s_root), PERM_PRE_ROOT,
+				sb_info->options.fs_user_id, AID_ROOT,
+				false, SDCARDFS_I(d_inode(sb->s_root))->data);
+		snprintf(sb_info->obbpath_s, PATH_MAX, "%s/obb", dev_name);
+	} else {
+		setup_derived_state(d_inode(sb->s_root), PERM_ROOT,
+				sb_info->options.fs_user_id, AID_ROOT,
+				false, SDCARDFS_I(d_inode(sb->s_root))->data);
+		snprintf(sb_info->obbpath_s, PATH_MAX, "%s/Android/obb", dev_name);
+	}
+	fixup_tmp_permissions(d_inode(sb->s_root));
+	sb_info->sb = sb;
+	list_add(&sb_info->list, &sdcardfs_super_list);
+	mutex_unlock(&sdcardfs_super_list_lock);
+
+	if (!silent)
+		pr_info("sdcardfs: mounted on top of %s type %s\n",
+				dev_name, lower_sb->s_type->name);
+	goto out; /* all is well */
+
+	/* no longer needed: free_dentry_private_data(sb->s_root); */
+out_freeroot:
+	/*
+	 * The root dentry owns the inode reference, so dput() releases the
+	 * inode as well; a further iput() here would be a double put.
+	 * Clear s_root so nothing later touches the freed dentry.
+	 */
+	dput(sb->s_root);
+	sb->s_root = NULL;
+out_sput:
+	/* drop refs we took earlier */
+	atomic_dec(&lower_sb->s_active);
+out_freesbi:
+	/* obbpath_s is NULL (kzalloc'ed sb_info) if it was never allocated */
+	kfree(sb_info->obbpath_s);
+	kfree(SDCARDFS_SB(sb));
+	sb->s_fs_info = NULL;
+out_free:
+	path_put(&lower_path);
+
+out:
+	return err;
+}
+
+/*
+ * Arguments bundled by sdcardfs_mount() so they can travel through
+ * mount_nodev()'s single data pointer to __sdcardfs_fill_super().
+ */
+struct sdcardfs_mount_private {
+ struct vfsmount *mnt;
+ const char *dev_name;
+ void *raw_data;
+};
+
+/*
+ * fill_super callback for mount_nodev(): unpack the bundled mount
+ * arguments and hand them to sdcardfs_read_super().
+ */
+static int __sdcardfs_fill_super(
+	struct super_block *sb,
+	void *_priv, int silent)
+{
+	struct sdcardfs_mount_private *args = _priv;
+	struct vfsmount *mnt = args->mnt;
+	const char *dev_name = args->dev_name;
+	void *raw_data = args->raw_data;
+
+	return sdcardfs_read_super(mnt, sb, dev_name, raw_data, silent);
+}
+
+/*
+ * ->mount2 implementation: bundle the arguments and let mount_nodev()
+ * create the superblock through __sdcardfs_fill_super().
+ */
+static struct dentry *sdcardfs_mount(struct vfsmount *mnt,
+		struct file_system_type *fs_type, int flags,
+			    const char *dev_name, void *raw_data)
+{
+	struct sdcardfs_mount_private bundle;
+
+	bundle.mnt = mnt;
+	bundle.dev_name = dev_name;
+	bundle.raw_data = raw_data;
+
+	return mount_nodev(fs_type, flags, &bundle, __sdcardfs_fill_super);
+}
+
+/*
+ * Placeholder for the plain ->mount hook: always warns and fails with
+ * -EINVAL, since sdcardfs is mounted through ->mount2 only.
+ */
+static struct dentry *sdcardfs_mount_wrn(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *raw_data)
+{
+ WARN(1, "sdcardfs does not support mount. Use mount2.\n");
+ return ERR_PTR(-EINVAL);
+}
+
+/*
+ * ->alloc_mnt_data hook: allocate the per-mount option block.  The memory
+ * comes from kmalloc() and is therefore not zeroed; may return NULL.
+ */
+void *sdcardfs_alloc_mnt_data(void)
+{
+ return kmalloc(sizeof(struct sdcardfs_vfsmount_options), GFP_KERNEL);
+}
+
+/*
+ * ->kill_sb: unlink the superblock info from sdcardfs_super_list, then
+ * let kill_anon_super() tear the superblock down.
+ *
+ * sdcardfs_read_super() can fail after it has set s_magic; in that case
+ * it frees the private data and leaves s_fs_info NULL, so there is no
+ * list entry to remove and the pointer must not be dereferenced.
+ */
+void sdcardfs_kill_sb(struct super_block *sb)
+{
+	struct sdcardfs_sb_info *sbi;
+
+	if (sb->s_magic == SDCARDFS_SUPER_MAGIC) {
+		sbi = SDCARDFS_SB(sb);
+		if (sbi) {
+			mutex_lock(&sdcardfs_super_list_lock);
+			list_del(&sbi->list);
+			mutex_unlock(&sdcardfs_super_list_lock);
+		}
+	}
+	kill_anon_super(sb);
+}
+
+/*
+ * File system type registered by init_sdcardfs_fs().  Mounting goes
+ * through .mount2; the plain .mount hook only warns and fails.
+ */
+static struct file_system_type sdcardfs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = SDCARDFS_NAME,
+ .mount = sdcardfs_mount_wrn,
+ .mount2 = sdcardfs_mount,
+ .alloc_mnt_data = sdcardfs_alloc_mnt_data,
+ .kill_sb = sdcardfs_kill_sb,
+ .fs_flags = 0,
+};
+MODULE_ALIAS_FS(SDCARDFS_NAME);
+
+/*
+ * Module init: create the inode and dentry caches, initialise the package
+ * list and register the file system type.
+ *
+ * Returns 0 on success or the error of the first step that failed.  On
+ * failure the caches are destroyed; packagelist_exit() is called only if
+ * packagelist_init() had succeeded, so it never runs against state that
+ * was not set up.
+ */
+static int __init init_sdcardfs_fs(void)
+{
+	int err;
+
+	pr_info("Registering sdcardfs " SDCARDFS_VERSION "\n");
+
+	err = sdcardfs_init_inode_cache();
+	if (err)
+		goto out_caches;
+	err = sdcardfs_init_dentry_cache();
+	if (err)
+		goto out_caches;
+	err = packagelist_init();
+	if (err)
+		goto out_caches;
+	err = register_filesystem(&sdcardfs_fs_type);
+	if (!err)
+		return 0;
+
+	/* registration failed after the package list was initialised */
+	packagelist_exit();
+out_caches:
+	sdcardfs_destroy_inode_cache();
+	sdcardfs_destroy_dentry_cache();
+	return err;
+}
+
+/*
+ * Module exit: destroy the caches, shut down the package list and
+ * unregister the file system type.
+ *
+ * NOTE(review): the file system is unregistered only after its caches
+ * are destroyed, the reverse of the usual teardown order; confirm this
+ * is intentional.
+ */
+static void __exit exit_sdcardfs_fs(void)
+{
+ sdcardfs_destroy_inode_cache();
+ sdcardfs_destroy_dentry_cache();
+ packagelist_exit();
+ unregister_filesystem(&sdcardfs_fs_type);
+ pr_info("Completed sdcardfs module unload\n");
+}
+
+/* Original wrapfs authors */
+MODULE_AUTHOR("Erez Zadok, Filesystems and Storage Lab, Stony Brook University (http://www.fsl.cs.sunysb.edu/)");
+
+/* Original sdcardfs authors */
+MODULE_AUTHOR("Woojoong Lee, Daeho Jeong, Kitae Lee, Yeongjin Gil System Memory Lab., Samsung Electronics");
+
+/* Current maintainer */
+MODULE_AUTHOR("Daniel Rosenberg, Google");
+MODULE_DESCRIPTION("Sdcardfs " SDCARDFS_VERSION);
+MODULE_LICENSE("GPL");
+
+module_init(init_sdcardfs_fs);
+module_exit(exit_sdcardfs_fs);
diff --git a/fs/sdcardfs/mmap.c b/fs/sdcardfs/mmap.c
new file mode 100644
index 000000000000..391d2a7d10e9
--- /dev/null
+++ b/fs/sdcardfs/mmap.c
@@ -0,0 +1,88 @@
+/*
+ * fs/sdcardfs/mmap.c
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ * Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009 Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed. It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#include "sdcardfs.h"
+
+/*
+ * ->fault: forward the page fault to the lower file's vm_ops, which are
+ * looked up through the file stored in vma->vm_private_data.
+ */
+static int sdcardfs_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct file *upper = (struct file *)vma->vm_private_data;
+	const struct vm_operations_struct *ops = SDCARDFS_F(upper)->lower_vm_ops;
+
+	BUG_ON(!ops);
+	return ops->fault(vma, vmf);
+}
+
+/* ->open: take a reference on the file stored in vma->vm_private_data. */
+static void sdcardfs_vm_open(struct vm_area_struct *vma)
+{
+	struct file *held = (struct file *)vma->vm_private_data;
+
+	get_file(held);
+}
+
+/* ->close: drop a reference on the file stored in vma->vm_private_data. */
+static void sdcardfs_vm_close(struct vm_area_struct *vma)
+{
+	struct file *held = (struct file *)vma->vm_private_data;
+
+	fput(held);
+}
+
+/*
+ * ->page_mkwrite: forward to the lower file's handler when it has one;
+ * otherwise report success (0).
+ */
+static int sdcardfs_page_mkwrite(struct vm_area_struct *vma,
+			       struct vm_fault *vmf)
+{
+	struct file *upper = (struct file *)vma->vm_private_data;
+	const struct vm_operations_struct *ops = SDCARDFS_F(upper)->lower_vm_ops;
+
+	BUG_ON(!ops);
+	if (ops->page_mkwrite)
+		return ops->page_mkwrite(vma, vmf);
+
+	return 0;
+}
+
+/* Stub ->direct_IO: always returns -EINVAL (see the comment below). */
+static ssize_t sdcardfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+{
+ /*
+ * This function should never be called directly. We need it
+ * to exist, to get past a check in open_check_o_direct(),
+ * which is called from do_last().
+ */
+ return -EINVAL;
+}
+
+/* Address space operations: only the stub direct_IO hook is provided. */
+const struct address_space_operations sdcardfs_aops = {
+ .direct_IO = sdcardfs_direct_IO,
+};
+
+/*
+ * VM operations for sdcardfs mappings: fault and page_mkwrite forward to
+ * the lower file's vm_ops, open/close adjust the file reference count.
+ */
+const struct vm_operations_struct sdcardfs_vm_ops = {
+ .fault = sdcardfs_fault,
+ .page_mkwrite = sdcardfs_page_mkwrite,
+ .open = sdcardfs_vm_open,
+ .close = sdcardfs_vm_close,
+};
diff --git a/fs/sdcardfs/multiuser.h b/fs/sdcardfs/multiuser.h
new file mode 100644
index 000000000000..85341e753f8c
--- /dev/null
+++ b/fs/sdcardfs/multiuser.h
@@ -0,0 +1,53 @@
+/*
+ * fs/sdcardfs/multiuser.h
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ * Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * WrapFS, which was written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009 Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed. It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+/*
+ * Android ID (AID) ranges. A uid is composed of a per-user block of
+ * AID_USER_OFFSET ids plus an app id inside that block (see
+ * multiuser_get_uid()); the gid ranges below are offsets applied to app ids.
+ */
+#define AID_USER_OFFSET 100000 /* offset for uid ranges for each user */
+#define AID_APP_START 10000 /* first app user */
+#define AID_APP_END 19999 /* last app user */
+#define AID_CACHE_GID_START 20000 /* start of gids for apps to mark cached data */
+#define AID_EXT_GID_START 30000 /* start of gids for apps to mark external data */
+#define AID_EXT_CACHE_GID_START 40000 /* start of gids for apps to mark external cached data */
+#define AID_EXT_CACHE_GID_END 49999 /* end of gids for apps to mark external cached data */
+#define AID_SHARED_GID_START 50000 /* start of gids for apps in each user to share */
+
+/* Both ids are plain uid_t values; the aliases only document intent. */
+typedef uid_t userid_t;
+typedef uid_t appid_t;
+
+/*
+ * Build a uid from a user id and an app id: the user's block base
+ * (user_id * AID_USER_OFFSET) plus the app id reduced modulo AID_USER_OFFSET.
+ */
+static inline uid_t multiuser_get_uid(userid_t user_id, appid_t app_id)
+{
+	const uid_t user_base = user_id * AID_USER_OFFSET;
+	const uid_t app_part = app_id % AID_USER_OFFSET;
+
+	return user_base + app_part;
+}
+
+/*
+ * Return true when the app-id part of @uid (uid modulo AID_USER_OFFSET)
+ * lies in the inclusive range [AID_APP_START, AID_APP_END].
+ */
+static inline bool uid_is_app(uid_t uid)
+{
+	const appid_t app = uid % AID_USER_OFFSET;
+
+	if (app < AID_APP_START)
+		return false;
+	return app <= AID_APP_END;
+}
+
+/*
+ * Map @uid to its external-cache gid by shifting it from the app range
+ * (AID_APP_START) into the range starting at AID_EXT_CACHE_GID_START.
+ * No range check is performed on @uid.
+ */
+static inline gid_t multiuser_get_ext_cache_gid(uid_t uid)
+{
+	const uid_t app_index = uid - AID_APP_START;
+
+	return app_index + AID_EXT_CACHE_GID_START;
+}
+
+/*
+ * Map @uid to its external-data gid by shifting it from the app range
+ * (AID_APP_START) into the range starting at AID_EXT_GID_START.
+ * No range check is performed on @uid.
+ */
+static inline gid_t multiuser_get_ext_gid(uid_t uid)
+{
+	const uid_t app_index = uid - AID_APP_START;
+
+	return app_index + AID_EXT_GID_START;
+}
diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c
new file mode 100644
index 000000000000..6da0c2186d39
--- /dev/null
+++ b/fs/sdcardfs/packagelist.c
@@ -0,0 +1,881 @@
+/*
+ * fs/sdcardfs/packagelist.c
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ * Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * WrapFS, which was written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009 Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed. It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#include "sdcardfs.h"
+#include <linux/hashtable.h>
+#include <linux/ctype.h>
+#include <linux/delay.h>
+#include <linux/radix-tree.h>
+#include <linux/dcache.h>
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/configfs.h>
+
+/*
+ * One entry of the package/extension hash tables in this file.
+ * The key string is owned by the entry (duplicated in qstr_copy(), released
+ * in free_hashtable_entry()).
+ */
+struct hashtable_entry {
+ struct hlist_node hlist; /* linkage in one of the hash tables */
+ struct hlist_node dlist; /* for deletion cleanup */
+ struct qstr key; /* name, compared case-insensitively via qstr_case_eq() */
+ atomic_t value; /* appid, excluded userid or gid, depending on the table */
+};
+
+/*
+ * Lookup tables, each declared with 8 hash bits:
+ *  - package_to_appid:  package name -> appid
+ *  - package_to_userid: package name -> excluded userid (one entry per pair)
+ *  - ext_to_groupid:    extension name -> gid
+ * Readers in this file walk them under rcu_read_lock(); writers run with
+ * sdcardfs_super_list_lock held.
+ */
+static DEFINE_HASHTABLE(package_to_appid, 8);
+static DEFINE_HASHTABLE(package_to_userid, 8);
+static DEFINE_HASHTABLE(ext_to_groupid, 8);
+
+
+/* Slab cache backing struct hashtable_entry; created in packagelist_init(). */
+static struct kmem_cache *hashtable_entry_cachep;
+
+/*
+ * Case-insensitive name hash: every byte of @name is lowered with tolower()
+ * before being folded in with partial_name_hash(). @salt seeds the hash via
+ * init_name_hash(); @len is the number of bytes hashed.
+ */
+static unsigned int full_name_case_hash(const void *salt, const unsigned char *name, unsigned int len)
+{
+	unsigned long acc = init_name_hash(salt);
+	unsigned int idx;
+
+	for (idx = 0; idx < len; idx++)
+		acc = partial_name_hash(tolower(name[idx]), acc);
+	return end_name_hash(acc);
+}
+
+/*
+ * Initialise @q to describe the NUL-terminated string @name.
+ *
+ * The string is not copied: q->name aliases @name, so @name must outlive @q.
+ * The length comes from strlen() and the hash is the case-insensitive hash
+ * computed with no salt.
+ */
+static inline void qstr_init(struct qstr *q, const char *name)
+{
+	q->name = name;
+	q->len = strlen(q->name);
+	/* The salt parameter is a pointer: pass NULL rather than integer 0 */
+	q->hash = full_name_case_hash(NULL, q->name, q->len);
+}
+
+/*
+ * Deep-copy @src into @dest: the name is duplicated with kstrdup() and the
+ * combined hash/length word is copied as is.
+ *
+ * Returns 1 on success, 0 if the name could not be allocated (dest->name is
+ * NULL in that case). The caller owns dest->name and must kfree() it.
+ */
+static inline int qstr_copy(const struct qstr *src, struct qstr *dest)
+{
+	char *dup = kstrdup(src->name, GFP_KERNEL);
+
+	dest->name = dup;
+	dest->hash_len = src->hash_len;
+	return dup != NULL;
+}
+
+
+/*
+ * Look up the appid recorded for package name @key in package_to_appid.
+ * The walk runs under rcu_read_lock(); names compare case-insensitively.
+ * Returns the stored value, or 0 when the package is not present.
+ */
+static appid_t __get_appid(const struct qstr *key)
+{
+	struct hashtable_entry *entry;
+	appid_t found = 0;
+
+	rcu_read_lock();
+	hash_for_each_possible_rcu(package_to_appid, entry, hlist, key->hash) {
+		if (!qstr_case_eq(key, &entry->key))
+			continue;
+		found = atomic_read(&entry->value);
+		break;
+	}
+	rcu_read_unlock();
+	return found;
+}
+
+/*
+ * Return the appid recorded for the NUL-terminated package name @key, or 0
+ * when it is unknown. Wraps __get_appid() with a temporary qstr.
+ */
+appid_t get_appid(const char *key)
+{
+ struct qstr q;
+
+ qstr_init(&q, key);
+ return __get_appid(&q);
+}
+
+/*
+ * Look up the gid recorded for extension @key in ext_to_groupid.
+ * The walk runs under rcu_read_lock(); names compare case-insensitively.
+ * Returns the stored value, or 0 when the extension is not present.
+ */
+static appid_t __get_ext_gid(const struct qstr *key)
+{
+	struct hashtable_entry *entry;
+	appid_t found = 0;
+
+	rcu_read_lock();
+	hash_for_each_possible_rcu(ext_to_groupid, entry, hlist, key->hash) {
+		if (!qstr_case_eq(key, &entry->key))
+			continue;
+		found = atomic_read(&entry->value);
+		break;
+	}
+	rcu_read_unlock();
+	return found;
+}
+
+/*
+ * Return the gid recorded for the NUL-terminated extension @key, or 0 when
+ * it is unknown. Wraps __get_ext_gid() with a temporary qstr.
+ */
+appid_t get_ext_gid(const char *key)
+{
+ struct qstr q;
+
+ qstr_init(&q, key);
+ return __get_ext_gid(&q);
+}
+
+/*
+ * Report whether package_to_userid holds an entry pairing @app_name with
+ * @user. The walk runs under rcu_read_lock().
+ * Returns 1 when such an entry exists, 0 otherwise.
+ */
+static appid_t __is_excluded(const struct qstr *app_name, userid_t user)
+{
+	struct hashtable_entry *entry;
+	appid_t excluded = 0;
+
+	rcu_read_lock();
+	hash_for_each_possible_rcu(package_to_userid, entry, hlist, app_name->hash) {
+		if (atomic_read(&entry->value) != user)
+			continue;
+		if (!qstr_case_eq(app_name, &entry->key))
+			continue;
+		excluded = 1;
+		break;
+	}
+	rcu_read_unlock();
+	return excluded;
+}
+
+/*
+ * Return 1 when the NUL-terminated package name @key has an exclusion entry
+ * for @user, 0 otherwise. Wraps __is_excluded() with a temporary qstr.
+ */
+appid_t is_excluded(const char *key, userid_t user)
+{
+ struct qstr q;
+ qstr_init(&q, key);
+ return __is_excluded(&q, user);
+}
+
+/*
+ * Final name-based access check. The kernel has already enforced everything
+ * returned through derive_permissions_locked(); this hook exists to lock
+ * access down even further, such as enforcing that apps hold sdcard_rw.
+ *
+ * Returns 0 when @name must be blocked and 1 otherwise. At present only a
+ * few security-sensitive names directly under a PERM_ROOT parent are blocked.
+ */
+int check_caller_access_to_name(struct inode *parent_node, const struct qstr *name)
+{
+	struct qstr autorun = QSTR_LITERAL("autorun.inf");
+	struct qstr dot_secure = QSTR_LITERAL(".android_secure");
+	struct qstr plain_secure = QSTR_LITERAL("android_secure");
+	bool parent_is_root;
+
+	parent_is_root = parent_node &&
+		SDCARDFS_I(parent_node)->data->perm == PERM_ROOT;
+
+	/* Always block security-sensitive files at root */
+	if (parent_is_root &&
+	    (qstr_case_eq(name, &autorun) ||
+	     qstr_case_eq(name, &dot_secure) ||
+	     qstr_case_eq(name, &plain_secure)))
+		return 0;
+
+	/*
+	 * Root always has access; access for any other UIDs should always
+	 * be controlled through packages.list.
+	 */
+	if (from_kuid(&init_user_ns, current_fsuid()) == 0)
+		return 1;
+
+	/* No extra permissions to enforce */
+	return 1;
+}
+
+/*
+ * Allocate a hashtable_entry from the slab cache, with both list nodes
+ * initialised, a private copy of @key and @value stored in ->value.
+ * Returns NULL if either the entry or the key copy cannot be allocated.
+ * The entry is not linked into any table; release it with
+ * free_hashtable_entry().
+ */
+static struct hashtable_entry *alloc_hashtable_entry(const struct qstr *key,
+		appid_t value)
+{
+	struct hashtable_entry *entry;
+
+	entry = kmem_cache_alloc(hashtable_entry_cachep, GFP_KERNEL);
+	if (!entry)
+		return NULL;
+	INIT_HLIST_NODE(&entry->dlist);
+	INIT_HLIST_NODE(&entry->hlist);
+
+	if (qstr_copy(key, &entry->key)) {
+		atomic_set(&entry->value, value);
+		return entry;
+	}
+
+	kmem_cache_free(hashtable_entry_cachep, entry);
+	return NULL;
+}
+
+/*
+ * Set the appid for package @key in package_to_appid: an existing entry has
+ * its value overwritten in place, otherwise a new entry is added.
+ * Returns 0 on success or -ENOMEM. The visible caller holds
+ * sdcardfs_super_list_lock.
+ */
+static int insert_packagelist_appid_entry_locked(const struct qstr *key, appid_t value)
+{
+	struct hashtable_entry *entry;
+
+	hash_for_each_possible_rcu(package_to_appid, entry, hlist, key->hash) {
+		if (!qstr_case_eq(key, &entry->key))
+			continue;
+		atomic_set(&entry->value, value);
+		return 0;
+	}
+
+	entry = alloc_hashtable_entry(key, value);
+	if (!entry)
+		return -ENOMEM;
+	hash_add_rcu(package_to_appid, &entry->hlist, key->hash);
+	return 0;
+}
+
+/*
+ * Add extension @key with gid @value to ext_to_groupid.
+ * An extension can only belong to one gid, so an already-present key is
+ * rejected with -EINVAL. Returns 0 on success or -ENOMEM. The visible caller
+ * holds sdcardfs_super_list_lock.
+ */
+static int insert_ext_gid_entry_locked(const struct qstr *key, appid_t value)
+{
+	struct hashtable_entry *entry;
+
+	hash_for_each_possible_rcu(ext_to_groupid, entry, hlist, key->hash) {
+		if (qstr_case_eq(key, &entry->key))
+			return -EINVAL;
+	}
+
+	entry = alloc_hashtable_entry(key, value);
+	if (!entry)
+		return -ENOMEM;
+	hash_add_rcu(ext_to_groupid, &entry->hlist, key->hash);
+	return 0;
+}
+
+/*
+ * Record in package_to_userid that package @key is excluded for userid
+ * @value. An identical (name, userid) pair that is already present is left
+ * alone. Returns 0 on success or -ENOMEM. The visible caller holds
+ * sdcardfs_super_list_lock.
+ */
+static int insert_userid_exclude_entry_locked(const struct qstr *key, userid_t value)
+{
+	struct hashtable_entry *entry;
+
+	/* Only insert if not already present */
+	hash_for_each_possible_rcu(package_to_userid, entry, hlist, key->hash) {
+		if (atomic_read(&entry->value) != value)
+			continue;
+		if (qstr_case_eq(key, &entry->key))
+			return 0;
+	}
+
+	entry = alloc_hashtable_entry(key, value);
+	if (!entry)
+		return -ENOMEM;
+	hash_add_rcu(package_to_userid, &entry->hlist, key->hash);
+	return 0;
+}
+
+/*
+ * Re-run fixup_perms_recursive() from the root of every superblock on
+ * sdcardfs_super_list that passes sbinfo_has_sdcard_magic(), limiting the
+ * search by name (@key). Every caller in this file holds
+ * sdcardfs_super_list_lock around the call.
+ */
+static void fixup_all_perms_name(const struct qstr *key)
+{
+	struct limit_search limit = {
+		.flags = BY_NAME,
+		.name = QSTR_INIT(key->name, key->len),
+	};
+	struct sdcardfs_sb_info *sbi;
+
+	list_for_each_entry(sbi, &sdcardfs_super_list, list) {
+		if (!sbinfo_has_sdcard_magic(sbi))
+			continue;
+		fixup_perms_recursive(sbi->sb->s_root, &limit);
+	}
+}
+
+/*
+ * Re-run fixup_perms_recursive() from the root of every superblock on
+ * sdcardfs_super_list that passes sbinfo_has_sdcard_magic(), limiting the
+ * search by both name (@key) and @userid. Every caller in this file holds
+ * sdcardfs_super_list_lock around the call.
+ */
+static void fixup_all_perms_name_userid(const struct qstr *key, userid_t userid)
+{
+	struct limit_search limit = {
+		.flags = BY_NAME | BY_USERID,
+		.name = QSTR_INIT(key->name, key->len),
+		.userid = userid,
+	};
+	struct sdcardfs_sb_info *sbi;
+
+	list_for_each_entry(sbi, &sdcardfs_super_list, list) {
+		if (!sbinfo_has_sdcard_magic(sbi))
+			continue;
+		fixup_perms_recursive(sbi->sb->s_root, &limit);
+	}
+}
+
+/*
+ * Re-run fixup_perms_recursive() from the root of every superblock on
+ * sdcardfs_super_list that passes sbinfo_has_sdcard_magic(), limiting the
+ * search by @userid only. The caller in this file holds
+ * sdcardfs_super_list_lock around the call.
+ */
+static void fixup_all_perms_userid(userid_t userid)
+{
+	struct limit_search limit = {
+		.flags = BY_USERID,
+		.userid = userid,
+	};
+	struct sdcardfs_sb_info *sbi;
+
+	list_for_each_entry(sbi, &sdcardfs_super_list, list) {
+		if (!sbinfo_has_sdcard_magic(sbi))
+			continue;
+		fixup_perms_recursive(sbi->sb->s_root, &limit);
+	}
+}
+
+/*
+ * Set the appid of package @key and, when that succeeds, refresh permissions
+ * for that name on all mounts. Both steps run under
+ * sdcardfs_super_list_lock. Returns 0 or the error from the insert.
+ */
+static int insert_packagelist_entry(const struct qstr *key, appid_t value)
+{
+ int err;
+
+ mutex_lock(&sdcardfs_super_list_lock);
+ err = insert_packagelist_appid_entry_locked(key, value);
+ if (!err)
+ fixup_all_perms_name(key);
+ mutex_unlock(&sdcardfs_super_list_lock);
+
+ return err;
+}
+
+/*
+ * Add the extension -> gid mapping under sdcardfs_super_list_lock.
+ * No permission fixup is triggered here. Returns 0 or the error from the
+ * insert (-EINVAL for a duplicate extension, -ENOMEM).
+ */
+static int insert_ext_gid_entry(const struct qstr *key, appid_t value)
+{
+ int err;
+
+ mutex_lock(&sdcardfs_super_list_lock);
+ err = insert_ext_gid_entry_locked(key, value);
+ mutex_unlock(&sdcardfs_super_list_lock);
+
+ return err;
+}
+
+/*
+ * Exclude package @key for userid @value and, when that succeeds, refresh
+ * permissions for that name/userid on all mounts. Both steps run under
+ * sdcardfs_super_list_lock. Returns 0 or the error from the insert.
+ */
+static int insert_userid_exclude_entry(const struct qstr *key, userid_t value)
+{
+ int err;
+
+ mutex_lock(&sdcardfs_super_list_lock);
+ err = insert_userid_exclude_entry_locked(key, value);
+ if (!err)
+ fixup_all_perms_name_userid(key, value);
+ mutex_unlock(&sdcardfs_super_list_lock);
+
+ return err;
+}
+
+/*
+ * Release an entry: free its duplicated key string, then return the entry
+ * to the slab cache. The entry must already be unlinked from its table.
+ */
+static void free_hashtable_entry(struct hashtable_entry *entry)
+{
+ kfree(entry->key.name);
+ kmem_cache_free(hashtable_entry_cachep, entry);
+}
+
+/*
+ * Remove everything recorded for package @key: every matching entry in
+ * package_to_userid and the first matching entry in package_to_appid.
+ *
+ * Entries are first unlinked with hash_del_rcu() and collected on a private
+ * list through ->dlist; they are freed only after synchronize_rcu() returns,
+ * so RCU readers that were still walking the buckets are done with them.
+ * The visible caller holds sdcardfs_super_list_lock.
+ */
+static void remove_packagelist_entry_locked(const struct qstr *key)
+{
+ struct hashtable_entry *hash_cur;
+ unsigned int hash = key->hash;
+ struct hlist_node *h_t;
+ HLIST_HEAD(free_list);
+
+ /* A package may have several exclusion entries: collect all of them */
+ hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) {
+ if (qstr_case_eq(key, &hash_cur->key)) {
+ hash_del_rcu(&hash_cur->hlist);
+ hlist_add_head(&hash_cur->dlist, &free_list);
+ }
+ }
+ /* Stop at the first appid match */
+ hash_for_each_possible_rcu(package_to_appid, hash_cur, hlist, hash) {
+ if (qstr_case_eq(key, &hash_cur->key)) {
+ hash_del_rcu(&hash_cur->hlist);
+ hlist_add_head(&hash_cur->dlist, &free_list);
+ break;
+ }
+ }
+ /* One grace period covers every entry collected above */
+ synchronize_rcu();
+ hlist_for_each_entry_safe(hash_cur, h_t, &free_list, dlist)
+ free_hashtable_entry(hash_cur);
+}
+
+/*
+ * Drop all table entries for package @key and then refresh permissions for
+ * that name on all mounts, both under sdcardfs_super_list_lock.
+ */
+static void remove_packagelist_entry(const struct qstr *key)
+{
+ mutex_lock(&sdcardfs_super_list_lock);
+ remove_packagelist_entry_locked(key);
+ fixup_all_perms_name(key);
+ mutex_unlock(&sdcardfs_super_list_lock);
+}
+
+/*
+ * Remove the ext_to_groupid entry whose name matches @key and whose value
+ * equals @group, if there is one. The entry is unlinked, an RCU grace period
+ * is awaited, and only then is it freed; the loop stops after the first
+ * match. The visible caller holds sdcardfs_super_list_lock.
+ */
+static void remove_ext_gid_entry_locked(const struct qstr *key, gid_t group)
+{
+ struct hashtable_entry *hash_cur;
+ unsigned int hash = key->hash;
+
+ hash_for_each_possible_rcu(ext_to_groupid, hash_cur, hlist, hash) {
+ if (qstr_case_eq(key, &hash_cur->key) && atomic_read(&hash_cur->value) == group) {
+ hash_del_rcu(&hash_cur->hlist);
+ synchronize_rcu();
+ free_hashtable_entry(hash_cur);
+ /* hash_cur is gone: leave the loop without touching it again */
+ break;
+ }
+ }
+}
+
+/* Remove the (@key, @group) extension mapping under sdcardfs_super_list_lock. */
+static void remove_ext_gid_entry(const struct qstr *key, gid_t group)
+{
+ mutex_lock(&sdcardfs_super_list_lock);
+ remove_ext_gid_entry_locked(key, group);
+ mutex_unlock(&sdcardfs_super_list_lock);
+}
+
+/*
+ * Remove every package_to_userid entry whose value equals @userid, across
+ * all buckets. Entries are unlinked and gathered on a private list via
+ * ->dlist, then freed after a single synchronize_rcu(). The visible caller
+ * holds sdcardfs_super_list_lock.
+ */
+static void remove_userid_all_entry_locked(userid_t userid)
+{
+ struct hashtable_entry *hash_cur;
+ struct hlist_node *h_t;
+ HLIST_HEAD(free_list);
+ int i;
+
+ hash_for_each_rcu(package_to_userid, i, hash_cur, hlist) {
+ if (atomic_read(&hash_cur->value) == userid) {
+ hash_del_rcu(&hash_cur->hlist);
+ hlist_add_head(&hash_cur->dlist, &free_list);
+ }
+ }
+ /* Wait for readers before any of the unlinked entries is freed */
+ synchronize_rcu();
+ hlist_for_each_entry_safe(hash_cur, h_t, &free_list, dlist) {
+ free_hashtable_entry(hash_cur);
+ }
+}
+
+/*
+ * Drop every exclusion entry for @userid and then refresh permissions for
+ * that userid on all mounts, both under sdcardfs_super_list_lock.
+ */
+static void remove_userid_all_entry(userid_t userid)
+{
+ mutex_lock(&sdcardfs_super_list_lock);
+ remove_userid_all_entry_locked(userid);
+ fixup_all_perms_userid(userid);
+ mutex_unlock(&sdcardfs_super_list_lock);
+}
+
+/*
+ * Remove the package_to_userid entry pairing package @key with @userid, if
+ * there is one. The entry is unlinked, an RCU grace period is awaited, and
+ * only then is it freed; the loop stops after the first match. The visible
+ * caller holds sdcardfs_super_list_lock.
+ */
+static void remove_userid_exclude_entry_locked(const struct qstr *key, userid_t userid)
+{
+ struct hashtable_entry *hash_cur;
+ unsigned int hash = key->hash;
+
+ hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) {
+ if (qstr_case_eq(key, &hash_cur->key) &&
+ atomic_read(&hash_cur->value) == userid) {
+ hash_del_rcu(&hash_cur->hlist);
+ synchronize_rcu();
+ free_hashtable_entry(hash_cur);
+ /* hash_cur is gone: leave the loop without touching it again */
+ break;
+ }
+ }
+}
+
+/*
+ * Remove the exclusion of package @key for @userid and then refresh
+ * permissions for that name/userid on all mounts, both under
+ * sdcardfs_super_list_lock.
+ */
+static void remove_userid_exclude_entry(const struct qstr *key, userid_t userid)
+{
+ mutex_lock(&sdcardfs_super_list_lock);
+ remove_userid_exclude_entry_locked(key, userid);
+ fixup_all_perms_name_userid(key, userid);
+ mutex_unlock(&sdcardfs_super_list_lock);
+}
+
+/*
+ * Empty package_to_appid and package_to_userid: every entry is unlinked and
+ * gathered on a private list, then freed after one synchronize_rcu(). Runs
+ * under sdcardfs_super_list_lock.
+ *
+ * ext_to_groupid is not walked here.
+ * NOTE(review): presumably its entries are expected to be gone already via
+ * extension_details_release() -- confirm.
+ */
+static void packagelist_destroy(void)
+{
+ struct hashtable_entry *hash_cur;
+ struct hlist_node *h_t;
+ HLIST_HEAD(free_list);
+ int i;
+
+ mutex_lock(&sdcardfs_super_list_lock);
+ hash_for_each_rcu(package_to_appid, i, hash_cur, hlist) {
+ hash_del_rcu(&hash_cur->hlist);
+ hlist_add_head(&hash_cur->dlist, &free_list);
+ }
+ hash_for_each_rcu(package_to_userid, i, hash_cur, hlist) {
+ hash_del_rcu(&hash_cur->hlist);
+ hlist_add_head(&hash_cur->dlist, &free_list);
+ }
+ /* Wait for readers before any of the unlinked entries is freed */
+ synchronize_rcu();
+ hlist_for_each_entry_safe(hash_cur, h_t, &free_list, dlist)
+ free_hashtable_entry(hash_cur);
+ mutex_unlock(&sdcardfs_super_list_lock);
+ pr_info("sdcardfs: destroyed packagelist pkgld\n");
+}
+
+/*
+ * Define a read/write configfs attribute named <_pfx>attr_<_name>, wired to
+ * <_pfx><_name>_show() and <_pfx><_name>_store(), which must already be
+ * declared. The mode is read and write for user, group and other.
+ * NOTE(review): S_IWUGO makes the attribute world-writable -- confirm that
+ * this is intended.
+ */
+#define SDCARDFS_CONFIGFS_ATTR(_pfx, _name) \
+static struct configfs_attribute _pfx##attr_##_name = { \
+ .ca_name = __stringify(_name), \
+ .ca_mode = S_IRUGO | S_IWUGO, \
+ .ca_owner = THIS_MODULE, \
+ .show = _pfx##_name##_show, \
+ .store = _pfx##_name##_store, \
+}
+
+/*
+ * Define a read-only configfs attribute named <_pfx>attr_<_name>, wired to
+ * <_pfx><_name>_show(), which must already be declared.
+ */
+#define SDCARDFS_CONFIGFS_ATTR_RO(_pfx, _name) \
+static struct configfs_attribute _pfx##attr_##_name = { \
+ .ca_name = __stringify(_name), \
+ .ca_mode = S_IRUGO, \
+ .ca_owner = THIS_MODULE, \
+ .show = _pfx##_name##_show, \
+}
+
+/*
+ * Define a write-only configfs attribute named <_pfx>attr_<_name>, wired to
+ * <_pfx><_name>_store(), which must already be declared. The mode is write
+ * for user, group and other.
+ * NOTE(review): S_IWUGO makes the attribute world-writable -- confirm that
+ * this is intended.
+ */
+#define SDCARDFS_CONFIGFS_ATTR_WO(_pfx, _name) \
+static struct configfs_attribute _pfx##attr_##_name = { \
+ .ca_name = __stringify(_name), \
+ .ca_mode = S_IWUGO, \
+ .ca_owner = THIS_MODULE, \
+ .store = _pfx##_name##_store, \
+}
+
+/*
+ * configfs item representing one package directory. Created by
+ * packages_make_item() and freed by package_details_release().
+ */
+struct package_details {
+ struct config_item item; /* embedded configfs item */
+ struct qstr name; /* package name; string is kstrdup()ed and owned here */
+};
+
+/* Map a config_item back to its enclosing package_details; NULL maps to NULL. */
+static inline struct package_details *to_package_details(struct config_item *item)
+{
+	if (!item)
+		return NULL;
+	return container_of(item, struct package_details, item);
+}
+
+/*
+ * show() for the per-package "appid" attribute: prints the appid currently
+ * recorded for the package (0 when none) followed by a newline.
+ */
+static ssize_t package_details_appid_show(struct config_item *item, char *page)
+{
+	struct package_details *details = to_package_details(item);
+	appid_t appid = __get_appid(&details->name);
+
+	return scnprintf(page, PAGE_SIZE, "%u\n", appid);
+}
+
+/*
+ * store() for the per-package "appid" attribute: parses @page as an
+ * unsigned decimal number and records it as the package's appid.
+ * Returns @count on success, or the error from parsing or inserting.
+ */
+static ssize_t package_details_appid_store(struct config_item *item,
+					   const char *page, size_t count)
+{
+	unsigned int appid;
+	int err;
+
+	err = kstrtouint(page, 10, &appid);
+	if (err)
+		return err;
+
+	err = insert_packagelist_entry(&to_package_details(item)->name, appid);
+	if (err)
+		return err;
+
+	return count;
+}
+
+/*
+ * show() for the per-package "excluded_userids" attribute: prints the
+ * userids excluded for this package as a space-separated list terminated by
+ * a newline (just a newline when there are none). Output is bounded by
+ * PAGE_SIZE through scnprintf().
+ */
+static ssize_t package_details_excluded_userids_show(struct config_item *item,
+						     char *page)
+{
+	struct package_details *details = to_package_details(item);
+	struct hashtable_entry *entry;
+	int len = 0;
+
+	rcu_read_lock();
+	hash_for_each_possible_rcu(package_to_userid, entry, hlist, details->name.hash) {
+		if (!qstr_case_eq(&details->name, &entry->key))
+			continue;
+		len += scnprintf(page + len, PAGE_SIZE - len,
+				 "%d ", atomic_read(&entry->value));
+	}
+	rcu_read_unlock();
+
+	/* Step back over the trailing separator so the newline replaces it */
+	if (len)
+		len--;
+	len += scnprintf(page + len, PAGE_SIZE - len, "\n");
+	return len;
+}
+
+/*
+ * store() for the per-package "excluded_userids" attribute: parses @page as
+ * an unsigned decimal userid and excludes the package for that user.
+ * Returns @count on success, or the error from parsing or inserting.
+ */
+static ssize_t package_details_excluded_userids_store(struct config_item *item,
+						      const char *page, size_t count)
+{
+	unsigned int userid;
+	int err;
+
+	err = kstrtouint(page, 10, &userid);
+	if (err)
+		return err;
+
+	err = insert_userid_exclude_entry(&to_package_details(item)->name, userid);
+	if (err)
+		return err;
+
+	return count;
+}
+
+/*
+ * store() for the per-package "clear_userid" attribute: parses @page as an
+ * unsigned decimal userid and removes the package's exclusion for that user.
+ * Returns @count on success, or the parse error.
+ */
+static ssize_t package_details_clear_userid_store(struct config_item *item,
+						  const char *page, size_t count)
+{
+	unsigned int userid;
+	int err = kstrtouint(page, 10, &userid);
+
+	if (err)
+		return err;
+
+	remove_userid_exclude_entry(&to_package_details(item)->name, userid);
+	return count;
+}
+
+/*
+ * configfs ->release for a package item: drops every table entry recorded
+ * for the package, then frees the name string and the item itself.
+ */
+static void package_details_release(struct config_item *item)
+{
+ struct package_details *package_details = to_package_details(item);
+
+ pr_info("sdcardfs: removing %s\n", package_details->name.name);
+ /* The name is still needed as the lookup key, so free it afterwards */
+ remove_packagelist_entry(&package_details->name);
+ kfree(package_details->name.name);
+ kfree(package_details);
+}
+
+/* Per-package attributes: appid and excluded_userids (rw), clear_userid (wo). */
+SDCARDFS_CONFIGFS_ATTR(package_details_, appid);
+SDCARDFS_CONFIGFS_ATTR(package_details_, excluded_userids);
+SDCARDFS_CONFIGFS_ATTR_WO(package_details_, clear_userid);
+
+/* NULL-terminated attribute list exposed in every package directory. */
+static struct configfs_attribute *package_details_attrs[] = {
+ &package_details_attr_appid,
+ &package_details_attr_excluded_userids,
+ &package_details_attr_clear_userid,
+ NULL,
+};
+
+static struct configfs_item_operations package_details_item_ops = {
+ .release = package_details_release,
+};
+
+/* Item type assigned to package items by packages_make_item(). */
+static struct config_item_type package_appid_type = {
+ .ct_item_ops = &package_details_item_ops,
+ .ct_attrs = package_details_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+/*
+ * configfs group created by extensions_make_group(); its directory name is
+ * parsed as an unsigned decimal number and kept in @num.
+ */
+struct extensions_value {
+ struct config_group group; /* embedded configfs group */
+ unsigned int num; /* gid applied to extensions created inside the group */
+};
+
+/*
+ * configfs item for one extension, created by extension_details_make_item()
+ * and freed by extension_details_release().
+ */
+struct extension_details {
+ struct config_item item; /* embedded configfs item */
+ struct qstr name; /* extension name; string is kstrdup()ed and owned here */
+ unsigned int num; /* gid passed to remove_ext_gid_entry() on release */
+};
+
+/* Map a config_item back to its enclosing extensions_value; NULL maps to NULL. */
+static inline struct extensions_value *to_extensions_value(struct config_item *item)
+{
+	if (!item)
+		return NULL;
+	return container_of(to_config_group(item), struct extensions_value, group);
+}
+
+/* Map a config_item back to its enclosing extension_details; NULL maps to NULL. */
+static inline struct extension_details *to_extension_details(struct config_item *item)
+{
+	if (!item)
+		return NULL;
+	return container_of(item, struct extension_details, item);
+}
+
+/*
+ * configfs ->release for an extension item: removes the (extension, gid)
+ * pair from the lookup table, then frees the name string and the item.
+ */
+static void extension_details_release(struct config_item *item)
+{
+	struct extension_details *extension_details = to_extension_details(item);
+
+	/* num is an unsigned int, so it is printed with %u rather than %d */
+	pr_info("sdcardfs: No longer mapping %s files to gid %u\n",
+		extension_details->name.name, extension_details->num);
+	/* The name is still needed as the lookup key, so free it afterwards */
+	remove_ext_gid_entry(&extension_details->name, extension_details->num);
+	kfree(extension_details->name.name);
+	kfree(extension_details);
+}
+
+static struct configfs_item_operations extension_details_item_ops = {
+ .release = extension_details_release,
+};
+
+/* Item type for extension items; it exposes no attributes. */
+static struct config_item_type extension_details_type = {
+ .ct_item_ops = &extension_details_item_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+/*
+ * configfs ->make_item for a gid group: registers extension @name for the
+ * gid held by the parent group and returns the new item.
+ *
+ * Returns ERR_PTR(-ENOMEM) on allocation failure or the ERR_PTR-encoded
+ * error from insert_ext_gid_entry(); nothing is left allocated on failure.
+ */
+static struct config_item *extension_details_make_item(struct config_group *group, const char *name)
+{
+	struct extensions_value *parent = to_extensions_value(&group->cg_item);
+	struct extension_details *details;
+	const char *name_copy;
+	int err;
+
+	details = kzalloc(sizeof(struct extension_details), GFP_KERNEL);
+	if (!details)
+		return ERR_PTR(-ENOMEM);
+
+	name_copy = kstrdup(name, GFP_KERNEL);
+	if (!name_copy) {
+		err = -ENOMEM;
+		goto free_details;
+	}
+	qstr_init(&details->name, name_copy);
+
+	err = insert_ext_gid_entry(&details->name, parent->num);
+	if (err)
+		goto free_name;
+
+	config_item_init_type_name(&details->item, name, &extension_details_type);
+	return &details->item;
+
+free_name:
+	kfree(details->name.name);
+free_details:
+	kfree(details);
+	return ERR_PTR(err);
+}
+
+/* Group operations for a gid directory: creating an entry adds an extension. */
+static struct configfs_group_operations extensions_value_group_ops = {
+ .make_item = extension_details_make_item,
+};
+
+/*
+ * configfs ->release for a gid group: runs once the last reference to the
+ * item is dropped and frees the containing extensions_value.
+ */
+static void extensions_value_release(struct config_item *item)
+{
+	kfree(to_extensions_value(item));
+}
+
+static struct configfs_item_operations extensions_value_item_ops = {
+	.release = extensions_value_release,
+};
+
+/* Item type for gid groups created under "extensions". */
+static struct config_item_type extensions_name_type = {
+	.ct_item_ops = &extensions_value_item_ops,
+	.ct_group_ops = &extensions_value_group_ops,
+	.ct_owner = THIS_MODULE,
+};
+
+/*
+ * configfs ->make_group for "extensions": @name must be an unsigned decimal
+ * number, which becomes the gid of the new group.
+ *
+ * Returns the new group, ERR_PTR(-ENOMEM) on allocation failure, or the
+ * ERR_PTR-encoded kstrtouint() error when @name is not a valid number.
+ */
+static struct config_group *extensions_make_group(struct config_group *group, const char *name)
+{
+	struct extensions_value *extensions_value;
+	unsigned int tmp;
+	int ret;
+
+	extensions_value = kzalloc(sizeof(struct extensions_value), GFP_KERNEL);
+	if (!extensions_value)
+		return ERR_PTR(-ENOMEM);
+	ret = kstrtouint(name, 10, &tmp);
+	if (ret) {
+		kfree(extensions_value);
+		return ERR_PTR(ret);
+	}
+
+	extensions_value->num = tmp;
+	config_group_init_type_name(&extensions_value->group, name,
+				    &extensions_name_type);
+	return &extensions_value->group;
+}
+
+/*
+ * configfs ->drop_item for "extensions".
+ *
+ * configfs still holds references to the item when this is called and drops
+ * its own afterwards, so the object must not be freed here. Only the
+ * reference handed out by ->make_group() is released; the memory is freed
+ * from extensions_value_release() once the count reaches zero.
+ */
+static void extensions_drop_group(struct config_group *group, struct config_item *item)
+{
+	struct extensions_value *value = to_extensions_value(item);
+
+	pr_info("sdcardfs: No longer mapping any files to gid %u\n", value->num);
+	config_item_put(item);
+}
+
+/* Group operations for the "extensions" directory: children are gid groups. */
+static struct configfs_group_operations extensions_group_ops = {
+ .make_group = extensions_make_group,
+ .drop_item = extensions_drop_group,
+};
+
+static struct config_item_type extensions_type = {
+ .ct_group_ops = &extensions_group_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+/*
+ * The "extensions" group; listed in sd_default_groups and attached to the
+ * sdcardfs subsystem in configfs_sdcardfs_init().
+ */
+struct config_group extension_group = {
+ .cg_item = {
+ .ci_namebuf = "extensions",
+ .ci_type = &extensions_type,
+ },
+};
+
+/*
+ * configfs ->make_item for the sdcardfs subsystem root: creates the item
+ * for package @name. No table entry is added here; that happens when the
+ * item's attributes are written.
+ *
+ * Returns the new item or ERR_PTR(-ENOMEM).
+ */
+static struct config_item *packages_make_item(struct config_group *group, const char *name)
+{
+	struct package_details *details;
+	const char *name_copy;
+
+	details = kzalloc(sizeof(struct package_details), GFP_KERNEL);
+	if (!details)
+		return ERR_PTR(-ENOMEM);
+
+	name_copy = kstrdup(name, GFP_KERNEL);
+	if (name_copy) {
+		qstr_init(&details->name, name_copy);
+		config_item_init_type_name(&details->item, name,
+					   &package_appid_type);
+		return &details->item;
+	}
+
+	kfree(details);
+	return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * show() for the "packages_gid.list" attribute.
+ *
+ * Emits one line per package_to_appid entry, "<name> <appid>", followed on
+ * the same line by every userid recorded for that name in package_to_userid.
+ * sizeof(errormsg) bytes of the page are kept in reserve; when a line fills
+ * the remaining space exactly, it is replaced by the "<truncated>" marker
+ * and the listing stops. Both tables are walked under rcu_read_lock().
+ *
+ * Returns the number of bytes written to @page.
+ */
+static ssize_t packages_list_show(struct config_item *item, char *page)
+{
+	struct hashtable_entry *hash_cur_app;
+	struct hashtable_entry *hash_cur_user;
+	int i;
+	int count = 0, written = 0;
+	const char errormsg[] = "<truncated>\n";
+	unsigned int hash;
+
+	rcu_read_lock();
+	hash_for_each_rcu(package_to_appid, i, hash_cur_app, hlist) {
+		written = scnprintf(page + count, PAGE_SIZE - sizeof(errormsg) - count, "%s %d\n",
+				    hash_cur_app->key.name, atomic_read(&hash_cur_app->value));
+		hash = hash_cur_app->key.hash;
+		hash_for_each_possible_rcu(package_to_userid, hash_cur_user, hlist, hash) {
+			if (qstr_case_eq(&hash_cur_app->key, &hash_cur_user->key)) {
+				/*
+				 * Start one byte early so " <userid>\n" overwrites
+				 * the newline that currently ends the line.
+				 */
+				written += scnprintf(page + count + written - 1,
+						     PAGE_SIZE - sizeof(errormsg) - count - written + 1,
+						     " %d\n", atomic_read(&hash_cur_user->value)) - 1;
+			}
+		}
+		if (count + written == PAGE_SIZE - sizeof(errormsg) - 1) {
+			/*
+			 * The line used up all available space: overwrite it
+			 * with the marker. The marker goes through "%s" so it
+			 * is never interpreted as a format string.
+			 */
+			count += scnprintf(page + count, PAGE_SIZE - count, "%s", errormsg);
+			break;
+		}
+		count += written;
+	}
+	rcu_read_unlock();
+
+	return count;
+}
+
+/*
+ * store() for the subsystem-level "remove_userid" attribute: parses @page as
+ * an unsigned decimal userid and removes every exclusion entry for it.
+ * Returns @count on success, or the parse error.
+ */
+static ssize_t packages_remove_userid_store(struct config_item *item,
+					    const char *page, size_t count)
+{
+	unsigned int userid;
+	int err = kstrtouint(page, 10, &userid);
+
+	if (err)
+		return err;
+
+	remove_userid_all_entry(userid);
+	return count;
+}
+
+/*
+ * Read-only listing attribute. It is spelled out by hand rather than via
+ * SDCARDFS_CONFIGFS_ATTR_RO because its file name, "packages_gid.list",
+ * contains a dot.
+ */
+static struct configfs_attribute packages_attr_packages_gid_list = {
+ .ca_name = "packages_gid.list",
+ .ca_mode = S_IRUGO,
+ .ca_owner = THIS_MODULE,
+ .show = packages_list_show,
+};
+
+SDCARDFS_CONFIGFS_ATTR_WO(packages_, remove_userid);
+
+/* NULL-terminated attribute list of the subsystem root directory. */
+static struct configfs_attribute *packages_attrs[] = {
+ &packages_attr_packages_gid_list,
+ &packages_attr_remove_userid,
+ NULL,
+};
+
+/*
+ * Note that, since no extra work is required on ->drop_item(),
+ * no ->drop_item() is provided.
+ */
+static struct configfs_group_operations packages_group_ops = {
+ .make_item = packages_make_item,
+};
+
+/* Item type of the subsystem root: package items plus the root attributes. */
+static struct config_item_type packages_type = {
+ .ct_group_ops = &packages_group_ops,
+ .ct_attrs = packages_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+/*
+ * NULL-terminated list of groups that configfs_sdcardfs_init() attaches to
+ * the subsystem as default groups.
+ */
+struct config_group *sd_default_groups[] = {
+ &extension_group,
+ NULL,
+};
+
+/* The "sdcardfs" configfs subsystem. */
+static struct configfs_subsystem sdcardfs_packages = {
+ .su_group = {
+ .cg_item = {
+ .ci_namebuf = "sdcardfs",
+ .ci_type = &packages_type,
+ },
+ },
+};
+
+/*
+ * Initialise the "sdcardfs" configfs subsystem, attach every group listed
+ * in sd_default_groups to it and register it.
+ * Returns 0 or the error from configfs_register_subsystem(), which is also
+ * logged.
+ */
+static int configfs_sdcardfs_init(void)
+{
+	struct configfs_subsystem *subsys = &sdcardfs_packages;
+	struct config_group **grp;
+	int ret;
+
+	config_group_init(&subsys->su_group);
+	for (grp = sd_default_groups; *grp; grp++) {
+		config_group_init(*grp);
+		configfs_add_default_group(*grp, &subsys->su_group);
+	}
+	mutex_init(&subsys->su_mutex);
+
+	ret = configfs_register_subsystem(subsys);
+	if (ret)
+		pr_err("Error %d while registering subsystem %s\n",
+		       ret,
+		       subsys->su_group.cg_item.ci_namebuf);
+	return ret;
+}
+
+/* Unregister the "sdcardfs" configfs subsystem. */
+static void configfs_sdcardfs_exit(void)
+{
+ configfs_unregister_subsystem(&sdcardfs_packages);
+}
+
+/*
+ * Set up the package list: create the slab cache for hashtable entries and
+ * register the configfs subsystem.
+ *
+ * Returns 0 on success, -ENOMEM if the cache cannot be created, or the
+ * error from configfs registration. On failure nothing is left allocated
+ * or registered, so packagelist_exit() must not be called.
+ */
+int packagelist_init(void)
+{
+	int err;
+
+	hashtable_entry_cachep =
+		kmem_cache_create("packagelist_hashtable_entry",
+				  sizeof(struct hashtable_entry), 0, 0, NULL);
+	if (!hashtable_entry_cachep) {
+		pr_err("sdcardfs: failed creating pkgl_hashtable entry slab cache\n");
+		return -ENOMEM;
+	}
+
+	/* Do not report success when the configfs interface is unavailable */
+	err = configfs_sdcardfs_init();
+	if (err) {
+		kmem_cache_destroy(hashtable_entry_cachep);
+		hashtable_entry_cachep = NULL;
+		return err;
+	}
+	return 0;
+}
+
+/*
+ * Tear down the package list: unregister the configfs subsystem, free the
+ * remaining table entries, then destroy the slab cache they came from.
+ */
+void packagelist_exit(void)
+{
+ configfs_sdcardfs_exit();
+ packagelist_destroy();
+ /* Entries were freed above, so the cache can be destroyed now */
+ kmem_cache_destroy(hashtable_entry_cachep);
+}
diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h
new file mode 100644
index 000000000000..c6f63deb15e7
--- /dev/null
+++ b/fs/sdcardfs/sdcardfs.h
@@ -0,0 +1,638 @@
+/*
+ * fs/sdcardfs/sdcardfs.h
+ *
+ * The sdcardfs v2.0
+ * This file system replaces the sdcard daemon on Android
+ * On version 2.0, some of the daemon functions have been ported
+ * to support the multi-user concepts of Android 4.4
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ * Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009 Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed. It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#ifndef _SDCARDFS_H_
+#define _SDCARDFS_H_
+
+#include <linux/dcache.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/aio.h>
+#include <linux/kref.h>
+#include <linux/mm.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/seq_file.h>
+#include <linux/statfs.h>
+#include <linux/fs_stack.h>
+#include <linux/magic.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/security.h>
+#include <linux/string.h>
+#include <linux/list.h>
+#include "multiuser.h"
+
+/* the file system name */
+#define SDCARDFS_NAME "sdcardfs"
+
+/* sdcardfs root inode number */
+#define SDCARDFS_ROOT_INO 1
+
+/*
+ * Useful for tracking code reachability: logs file, function and line at the
+ * default log level. (There is no pr_default() helper, so printk() is called
+ * with KERN_DEFAULT directly.)
+ */
+#define UDBG printk(KERN_DEFAULT "DBG:%s:%s:%d\n", __FILE__, __func__, __LINE__)
+
+#define SDCARDFS_DIRENT_SIZE 256
+
+/* temporary static uid settings for development */
+#define AID_ROOT 0 /* uid for accessing /mnt/sdcard & extSdcard */
+#define AID_MEDIA_RW 1023 /* internal media storage write access */
+
+#define AID_SDCARD_RW 1015 /* external storage write access */
+#define AID_SDCARD_R 1028 /* external storage read access */
+#define AID_SDCARD_PICS 1033 /* external storage photos access */
+#define AID_SDCARD_AV 1034 /* external storage audio/video access */
+#define AID_SDCARD_ALL 1035 /* access all users external storage */
+#define AID_MEDIA_OBB 1059 /* obb files */
+
+#define AID_SDCARD_IMAGE 1057
+
+#define AID_PACKAGE_INFO 1027
+
+
+/*
+ * Permissions are handled by our permission function.
+ * We don't want anyone who happens to look at our inode value to prematurely
+ * block access, so store more permissive values. These are probably never
+ * used.
+ *
+ * Sets i_uid from the inode's derived d_uid, i_gid to AID_SDCARD_RW, and the
+ * permission bits to 0775 while keeping the file-type bits (S_IFMT).
+ * This is a macro: @x is evaluated several times, so pass a plain inode
+ * pointer without side effects.
+ */
+#define fixup_tmp_permissions(x) \
+ do { \
+ (x)->i_uid = make_kuid(&init_user_ns, \
+ SDCARDFS_I(x)->data->d_uid); \
+ (x)->i_gid = make_kgid(&init_user_ns, AID_SDCARD_RW); \
+ (x)->i_mode = ((x)->i_mode & S_IFMT) | 0775;\
+ } while (0)
+
+/* Android 5.0 support */
+
+/* Permission mode for a specific node. Controls how file permissions
+ * are derived for children nodes.
+ */
+typedef enum {
+ /* Nothing special; this node should just inherit from its parent. */
+ PERM_INHERIT,
+ /* This node is one level above a normal root; used for legacy layouts
+ * which use the first level to represent user_id.
+ */
+ PERM_PRE_ROOT,
+ /* This node is "/" */
+ PERM_ROOT,
+ /* This node is "/Android" */
+ PERM_ANDROID,
+ /* This node is "/Android/data" */
+ PERM_ANDROID_DATA,
+ /* This node is "/Android/obb" */
+ PERM_ANDROID_OBB,
+ /* This node is "/Android/media" */
+ PERM_ANDROID_MEDIA,
+ /* This node is "/Android/[data|media|obb]/[package]" */
+ PERM_ANDROID_PACKAGE,
+ /* This node is "/Android/[data|media|obb]/[package]/cache" */
+ PERM_ANDROID_PACKAGE_CACHE,
+} perm_t;
+
+struct sdcardfs_sb_info;
+struct sdcardfs_mount_options;
+struct sdcardfs_inode_info;
+struct sdcardfs_inode_data;
+
+/* Do not directly use this function. Use OVERRIDE_CRED() instead. */
+const struct cred *override_fsids(struct sdcardfs_sb_info *sbi,
+ struct sdcardfs_inode_data *data);
+/* Do not directly use this function, use REVERT_CRED() instead. */
+void revert_fsids(const struct cred *old_cred);
+
+/* operations vectors defined in specific files */
+extern const struct file_operations sdcardfs_main_fops;
+extern const struct file_operations sdcardfs_dir_fops;
+extern const struct inode_operations sdcardfs_main_iops;
+extern const struct inode_operations sdcardfs_dir_iops;
+extern const struct inode_operations sdcardfs_symlink_iops;
+extern const struct super_operations sdcardfs_sops;
+extern const struct dentry_operations sdcardfs_ci_dops;
+extern const struct address_space_operations sdcardfs_aops, sdcardfs_dummy_aops;
+extern const struct vm_operations_struct sdcardfs_vm_ops;
+
+extern int sdcardfs_init_inode_cache(void);
+extern void sdcardfs_destroy_inode_cache(void);
+extern int sdcardfs_init_dentry_cache(void);
+extern void sdcardfs_destroy_dentry_cache(void);
+extern int new_dentry_private_data(struct dentry *dentry);
+extern void free_dentry_private_data(struct dentry *dentry);
+extern struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags);
+extern struct inode *sdcardfs_iget(struct super_block *sb,
+ struct inode *lower_inode, userid_t id);
+extern int sdcardfs_interpose(struct dentry *dentry, struct super_block *sb,
+ struct path *lower_path, userid_t id);
+
+/* file private data */
+struct sdcardfs_file_info {
+ struct file *lower_file;
+ const struct vm_operations_struct *lower_vm_ops;
+};
+
+/*
+ * Reference-counted derived state of an inode. It is allocated separately
+ * from the inode so other inodes can keep pointing at it as their "top".
+ */
+struct sdcardfs_inode_data {
+ struct kref refcount; /* managed via data_get()/data_put() */
+ bool abandoned; /* set by release_own_data() when the owning inode goes away */
+
+ perm_t perm; /* position of this node in the hierarchy, see perm_t */
+ userid_t userid;
+ uid_t d_uid; /* uid reported as the inode's i_uid */
+ bool under_android;
+ bool under_cache;
+ bool under_obb;
+};
+
+/* sdcardfs inode data in memory */
+struct sdcardfs_inode_info {
+ struct inode *lower_inode;
+ /* state derived based on current position in hierarchy */
+ struct sdcardfs_inode_data *data;
+
+ /* top folder for ownership */
+ struct sdcardfs_inode_data *top_data;
+
+ struct inode vfs_inode;
+};
+
+
+/* sdcardfs dentry data in memory */
+struct sdcardfs_dentry_info {
+ spinlock_t lock; /* protects lower_path */
+ struct path lower_path;
+ struct path orig_path;
+};
+
+/*
+ * Per-superblock mount options (stored in sdcardfs_sb_info.options).
+ * The option names in the comments are the ones printed by show_options.
+ */
+struct sdcardfs_mount_options {
+ uid_t fs_low_uid; /* "fsuid" */
+ gid_t fs_low_gid; /* "fsgid" */
+ userid_t fs_user_id; /* "userid" */
+ bool multiuser; /* "multiuser" */
+ unsigned int reserved_mb; /* "reserved": space in MB hidden from statfs */
+};
+
+/* Per-vfsmount options, reached through vfsmount->data. */
+struct sdcardfs_vfsmount_options {
+ gid_t gid; /* "gid" option; consulted by get_gid()/get_mode() */
+ mode_t mask; /* "mask" option; bits cleared from the visible mode */
+};
+
+extern int parse_options_remount(struct super_block *sb, char *options, int silent,
+ struct sdcardfs_vfsmount_options *vfsopts);
+
+/* sdcardfs super-block data in memory */
+struct sdcardfs_sb_info {
+ struct super_block *sb;
+ struct super_block *lower_sb;
+ /* derived perm policy : some of options have been added
+ * to sdcardfs_mount_options (Android 4.4 support)
+ */
+ struct sdcardfs_mount_options options;
+ spinlock_t lock; /* protects obbpath */
+ char *obbpath_s;
+ struct path obbpath;
+ void *pkgl_id;
+ struct list_head list;
+};
+
+/*
+ * Map a VFS inode to the sdcardfs_inode_info that embeds it.
+ *
+ * The struct inode lives _inside_ sdcardfs_inode_info (member vfs_inode), so
+ * for any non-NULL inode pointer the result is a valid non-NULL pointer.
+ */
+static inline struct sdcardfs_inode_info *SDCARDFS_I(const struct inode *inode)
+{
+	struct sdcardfs_inode_info *info;
+
+	info = container_of(inode, struct sdcardfs_inode_info, vfs_inode);
+	return info;
+}
+
+/* dentry to private data */
+#define SDCARDFS_D(dent) ((struct sdcardfs_dentry_info *)(dent)->d_fsdata)
+
+/* superblock to private data */
+#define SDCARDFS_SB(super) ((struct sdcardfs_sb_info *)(super)->s_fs_info)
+
+/* file to private Data */
+#define SDCARDFS_F(file) ((struct sdcardfs_file_info *)((file)->private_data))
+
+/* Return the lower file recorded in @f's private data. */
+static inline struct file *sdcardfs_lower_file(const struct file *f)
+{
+	struct sdcardfs_file_info *finfo = SDCARDFS_F(f);
+
+	return finfo->lower_file;
+}
+
+/* Record @val as the lower file in @f's private data. */
+static inline void sdcardfs_set_lower_file(struct file *f, struct file *val)
+{
+	struct sdcardfs_file_info *finfo = SDCARDFS_F(f);
+
+	finfo->lower_file = val;
+}
+
+/* Return the lower inode recorded for sdcardfs inode @i. */
+static inline struct inode *sdcardfs_lower_inode(const struct inode *i)
+{
+	struct sdcardfs_inode_info *info = SDCARDFS_I(i);
+
+	return info->lower_inode;
+}
+
+/* Record @val as the lower inode of sdcardfs inode @i. */
+static inline void sdcardfs_set_lower_inode(struct inode *i, struct inode *val)
+{
+	struct sdcardfs_inode_info *info = SDCARDFS_I(i);
+
+	info->lower_inode = val;
+}
+
+/* Return the lower superblock recorded in @sb's private data. */
+static inline struct super_block *sdcardfs_lower_super(
+		const struct super_block *sb)
+{
+	struct sdcardfs_sb_info *sbi = SDCARDFS_SB(sb);
+
+	return sbi->lower_sb;
+}
+
+/* Record @val as the lower superblock in @sb's private data. */
+static inline void sdcardfs_set_lower_super(struct super_block *sb,
+		struct super_block *val)
+{
+	struct sdcardfs_sb_info *sbi = SDCARDFS_SB(sb);
+
+	sbi->lower_sb = val;
+}
+
+/*
+ * path based (dentry/mnt) helpers
+ *
+ * pathcpy() copies the mnt and dentry pointers of @src into @dst. It takes no
+ * reference on either of them.
+ */
+static inline void pathcpy(struct path *dst, const struct path *src)
+{
+	dst->mnt = src->mnt;
+	dst->dentry = src->dentry;
+}
+
+/*
+ * SDCARDFS_DENT_FUNC(pname) generates accessors for the struct path member
+ * @pname of sdcardfs_dentry_info. All accesses to the stored path are made
+ * under the dentry info spinlock.
+ *
+ * sdcardfs_get_<pname>(): copy the stored path to the caller and call
+ * path_get() on it; the caller must release it with the matching
+ * sdcardfs_put_<pname>().
+ * sdcardfs_put_<pname>(): path_put() a path obtained from get.
+ * sdcardfs_set_<pname>(): store the given path; no reference is taken here.
+ * sdcardfs_reset_<pname>(): clear the stored path without dropping a
+ * reference.
+ * sdcardfs_put_reset_<pname>(): clear the stored path and, if it was set,
+ * path_put() it after the lock has been released.
+ */
+#define SDCARDFS_DENT_FUNC(pname) \
+static inline void sdcardfs_get_##pname(const struct dentry *dent, \
+ struct path *pname) \
+{ \
+ spin_lock(&SDCARDFS_D(dent)->lock); \
+ pathcpy(pname, &SDCARDFS_D(dent)->pname); \
+ path_get(pname); \
+ spin_unlock(&SDCARDFS_D(dent)->lock); \
+ return; \
+} \
+static inline void sdcardfs_put_##pname(const struct dentry *dent, \
+ struct path *pname) \
+{ \
+ path_put(pname); \
+ return; \
+} \
+static inline void sdcardfs_set_##pname(const struct dentry *dent, \
+ struct path *pname) \
+{ \
+ spin_lock(&SDCARDFS_D(dent)->lock); \
+ pathcpy(&SDCARDFS_D(dent)->pname, pname); \
+ spin_unlock(&SDCARDFS_D(dent)->lock); \
+ return; \
+} \
+static inline void sdcardfs_reset_##pname(const struct dentry *dent) \
+{ \
+ spin_lock(&SDCARDFS_D(dent)->lock); \
+ SDCARDFS_D(dent)->pname.dentry = NULL; \
+ SDCARDFS_D(dent)->pname.mnt = NULL; \
+ spin_unlock(&SDCARDFS_D(dent)->lock); \
+ return; \
+} \
+static inline void sdcardfs_put_reset_##pname(const struct dentry *dent) \
+{ \
+ struct path pname; \
+ spin_lock(&SDCARDFS_D(dent)->lock); \
+ if (SDCARDFS_D(dent)->pname.dentry) { \
+ pathcpy(&pname, &SDCARDFS_D(dent)->pname); \
+ SDCARDFS_D(dent)->pname.dentry = NULL; \
+ SDCARDFS_D(dent)->pname.mnt = NULL; \
+ spin_unlock(&SDCARDFS_D(dent)->lock); \
+ path_put(&pname); \
+ } else \
+ spin_unlock(&SDCARDFS_D(dent)->lock); \
+ return; \
+}
+
+SDCARDFS_DENT_FUNC(lower_path)
+SDCARDFS_DENT_FUNC(orig_path)
+
+/*
+ * True when @sbinfo is non-NULL, has a superblock attached, and that
+ * superblock carries SDCARDFS_SUPER_MAGIC.
+ */
+static inline bool sbinfo_has_sdcard_magic(struct sdcardfs_sb_info *sbinfo)
+{
+	if (!sbinfo)
+		return false;
+	if (!sbinfo->sb)
+		return false;
+	return sbinfo->sb->s_magic == SDCARDFS_SUPER_MAGIC;
+}
+
+/*
+ * Take a reference on @data and return it. A NULL @data is passed through
+ * unchanged.
+ */
+static inline struct sdcardfs_inode_data *data_get(
+		struct sdcardfs_inode_data *data)
+{
+	if (!data)
+		return NULL;
+
+	kref_get(&data->refcount);
+	return data;
+}
+
+/* Take a reference on @info's top data (which may be NULL) and return it. */
+static inline struct sdcardfs_inode_data *top_data_get(
+		struct sdcardfs_inode_info *info)
+{
+	struct sdcardfs_inode_data *top = info->top_data;
+
+	return data_get(top);
+}
+
+extern void data_release(struct kref *ref);
+
+/*
+ * Drop a reference on @data; data_release() runs when the last one goes.
+ * Unlike data_get(), this does not check for NULL: @data must be valid.
+ */
+static inline void data_put(struct sdcardfs_inode_data *data)
+{
+ kref_put(&data->refcount, data_release);
+}
+
+/*
+ * Give up the inode's own reference to its data.
+ *
+ * This happens exactly once per inode. At this point, the inode that
+ * originally held this data is about to be freed; any remaining references
+ * are held as a top value and will likely be released soon. The data is
+ * flagged as abandoned before the reference is dropped.
+ */
+static inline void release_own_data(struct sdcardfs_inode_info *info)
+{
+	struct sdcardfs_inode_data *own = info->data;
+
+	own->abandoned = true;
+	data_put(own);
+}
+
+/*
+ * Point @info's top at @top (which may be NULL). The reference on the new
+ * top is taken before the reference on the previous one is dropped.
+ */
+static inline void set_top(struct sdcardfs_inode_info *info,
+		struct sdcardfs_inode_data *top)
+{
+	struct sdcardfs_inode_data *prev = info->top_data;
+
+	/* data_get() passes NULL through, so no explicit check is needed */
+	info->top_data = data_get(top);
+	if (prev)
+		data_put(prev);
+}
+
+/*
+ * Compute the gid to report for a node seen through mount @mnt.
+ *
+ * For any mount gid other than AID_SDCARD_RW the result is derived from the
+ * node's userid and the mount gid via multiuser_get_uid().
+ */
+static inline int get_gid(struct vfsmount *mnt,
+		struct sdcardfs_inode_data *data)
+{
+	struct sdcardfs_vfsmount_options *opts = mnt->data;
+
+	if (opts->gid != AID_SDCARD_RW)
+		return multiuser_get_uid(data->userid, opts->gid);
+
+	/* As an optimization, certain trusted system components only run
+	 * as owner but operate across all users. Since we're now handing
+	 * out the sdcard_rw GID only to trusted apps, we're okay relaxing
+	 * the user boundary enforcement for the default view. The UIDs
+	 * assigned to app directories are still multiuser aware.
+	 */
+	return AID_SDCARD_RW;
+}
+
+/*
+ * Compute the permission bits to report for a node seen through mount @mnt.
+ *
+ * The visible mode starts as 0775 minus the mount mask and is then adjusted
+ * by the node's position in the hierarchy. Finally it is filtered by the
+ * lower inode's owner bits, replicated to the group and other positions.
+ */
+static inline int get_mode(struct vfsmount *mnt,
+		struct sdcardfs_inode_info *info,
+		struct sdcardfs_inode_data *data)
+{
+	struct sdcardfs_vfsmount_options *opts = mnt->data;
+	int visible_mode = 0775 & ~opts->mask;
+	int owner_mode = info->lower_inode->i_mode & 0700;
+
+	if (data->perm == PERM_PRE_ROOT) {
+		/* Top of multi-user view should always be visible to ensure
+		 * secondary users can traverse inside.
+		 */
+		visible_mode = 0711;
+	} else if (data->under_android) {
+		/* Block "other" access to Android directories, since only apps
+		 * belonging to a specific user should be in there; we still
+		 * leave +x open for the default view.
+		 */
+		visible_mode &= (opts->gid == AID_SDCARD_RW) ? ~0006 : ~0007;
+	}
+
+	return visible_mode &
+		(owner_mode | (owner_mode >> 3) | (owner_mode >> 6));
+}
+
+/* Return 1 if @dent has an orig_path set (checked under the lock), else 0. */
+static inline int has_graft_path(const struct dentry *dent)
+{
+	struct sdcardfs_dentry_info *dinfo = SDCARDFS_D(dent);
+	int grafted;
+
+	spin_lock(&dinfo->lock);
+	grafted = (dinfo->orig_path.dentry != NULL) ? 1 : 0;
+	spin_unlock(&dinfo->lock);
+
+	return grafted;
+}
+
+/*
+ * Fetch the "real" lower path of @dent into @real_lower, with a reference.
+ * In case of a local obb dentry (one with a graft path) the orig_path is
+ * returned; otherwise the lower_path is.
+ */
+static inline void sdcardfs_get_real_lower(const struct dentry *dent,
+		struct path *real_lower)
+{
+	if (!has_graft_path(dent)) {
+		sdcardfs_get_lower_path(dent, real_lower);
+		return;
+	}
+	sdcardfs_get_orig_path(dent, real_lower);
+}
+
+/* Release a path obtained from sdcardfs_get_real_lower(). */
+static inline void sdcardfs_put_real_lower(const struct dentry *dent,
+		struct path *real_lower)
+{
+	if (!has_graft_path(dent)) {
+		sdcardfs_put_lower_path(dent, real_lower);
+		return;
+	}
+	sdcardfs_put_orig_path(dent, real_lower);
+}
+
+extern struct mutex sdcardfs_super_list_lock;
+extern struct list_head sdcardfs_super_list;
+
+/* for packagelist.c */
+extern appid_t get_appid(const char *app_name);
+extern appid_t get_ext_gid(const char *app_name);
+extern appid_t is_excluded(const char *app_name, userid_t userid);
+extern int check_caller_access_to_name(struct inode *parent_node, const struct qstr *name);
+extern int packagelist_init(void);
+extern void packagelist_exit(void);
+
+/* for derived_perm.c */
+#define BY_NAME (1 << 0)
+#define BY_USERID (1 << 1)
+struct limit_search {
+ unsigned int flags;
+ struct qstr name;
+ userid_t userid;
+};
+
+extern void setup_derived_state(struct inode *inode, perm_t perm,
+ userid_t userid, uid_t uid, bool under_android,
+ struct sdcardfs_inode_data *top);
+extern void get_derived_permission(struct dentry *parent, struct dentry *dentry);
+extern void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, const struct qstr *name);
+extern void fixup_perms_recursive(struct dentry *dentry, struct limit_search *limit);
+
+extern void update_derived_permission_lock(struct dentry *dentry);
+void fixup_lower_ownership(struct dentry *dentry, const char *name);
+extern int need_graft_path(struct dentry *dentry);
+extern int is_base_obbpath(struct dentry *dentry);
+extern int is_obbpath_invalid(struct dentry *dentry);
+extern int setup_obb_dentry(struct dentry *dentry, struct path *lower_path);
+
+/* locking helpers */
+
+/*
+ * Take a reference on @dentry's parent and lock the parent's inode
+ * (I_MUTEX_PARENT class). Undo with unlock_dir().
+ */
+static inline struct dentry *lock_parent(struct dentry *dentry)
+{
+	struct dentry *parent = dget_parent(dentry);
+
+	inode_lock_nested(d_inode(parent), I_MUTEX_PARENT);
+	return parent;
+}
+
+/* Unlock the inode of @dir and drop the dentry reference. */
+static inline void unlock_dir(struct dentry *dir)
+{
+	struct inode *dir_inode = d_inode(dir);
+
+	inode_unlock(dir_inode);
+	dput(dir);
+}
+
+/*
+ * Make sure the directory @path_s exists.
+ *
+ * If the directory has to be created, it is created with @mode and then
+ * chowned to @uid:@gid. An already existing directory is left untouched and
+ * counts as success.
+ *
+ * Returns 0 on success (including "already exists") or a negative errno from
+ * the path lookup or the mkdir. The result of the ownership change is not
+ * reported.
+ */
+static inline int prepare_dir(const char *path_s, uid_t uid, gid_t gid, mode_t mode)
+{
+	int err;
+	struct dentry *dent;
+	struct iattr attrs;
+	struct path parent;
+
+	dent = kern_path_locked(path_s, &parent);
+	if (IS_ERR(dent)) {
+		/*
+		 * On failure kern_path_locked() has already unlocked and
+		 * released the parent (or never acquired it), so @parent
+		 * must not be unlocked or put again here.
+		 */
+		err = PTR_ERR(dent);
+		if (err == -EEXIST)
+			err = 0;
+		return err;
+	}
+
+	err = vfs_mkdir2(parent.mnt, d_inode(parent.dentry), dent, mode);
+	if (err) {
+		if (err == -EEXIST)
+			err = 0;
+		goto out_dput;
+	}
+
+	attrs.ia_uid = make_kuid(&init_user_ns, uid);
+	attrs.ia_gid = make_kgid(&init_user_ns, gid);
+	attrs.ia_valid = ATTR_UID | ATTR_GID;
+	inode_lock(d_inode(dent));
+	/* best effort: a failed chown does not fail the whole operation */
+	notify_change2(parent.mnt, dent, &attrs, NULL);
+	inode_unlock(d_inode(dent));
+
+out_dput:
+	dput(dent);
+
+	/* parent inode was locked by kern_path_locked() */
+	inode_unlock(d_inode(parent.dentry));
+	path_put(&parent);
+	return err;
+}
+
+/*
+ * Return 1 if the lower filesystem has room for @size more bytes while
+ * keeping more than the configured reserve (reserved_mb) free, otherwise 0.
+ * When no reserve is configured the answer is always 1.
+ *
+ * @dir: non-zero when checking for a directory; @size is then replaced by
+ *       the filesystem block size.
+ *
+ * We assume that any files can not be overwritten.
+ */
+static inline int check_min_free_space(struct dentry *dentry, size_t size, int dir)
+{
+	int err;
+	struct path lower_path;
+	struct kstatfs statfs;
+	u64 avail;
+	u64 reserved;
+	struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+
+	if (!sbi->options.reserved_mb)
+		return 1;
+
+	/* Get fs stat of lower filesystem. */
+	sdcardfs_get_lower_path(dentry, &lower_path);
+	err = vfs_statfs(&lower_path, &statfs);
+	sdcardfs_put_lower_path(dentry, &lower_path);
+
+	if (unlikely(err))
+		return 0;
+
+	/* Invalid statfs information. */
+	if (unlikely(statfs.f_bsize == 0))
+		return 0;
+
+	/* if you are checking directory, set size to f_bsize. */
+	if (unlikely(dir))
+		size = statfs.f_bsize;
+
+	/* available size */
+	avail = statfs.f_bavail * statfs.f_bsize;
+
+	/* not enough space */
+	if ((u64)size > avail)
+		return 0;
+
+	/*
+	 * Widen before multiplying: reserved_mb is an unsigned int, and
+	 * reserved_mb * 1024 * 1024 would wrap for reserves of 4096 MB or more.
+	 */
+	reserved = (u64)sbi->options.reserved_mb * 1024 * 1024;
+
+	/* enough space only if more than the reserve is left afterwards */
+	return (avail - size) > reserved ? 1 : 0;
+}
+
+/*
+ * Copy attributes from the lower inode @src to @dest while keeping the
+ * attributes sdcardfs manages itself. Our permission check handles all
+ * special permissions, so mode, uid and gid are set to open values: mode is
+ * the file type of @src plus 0775, uid is the derived d_uid and gid is
+ * AID_SDCARD_RW.
+ */
+static inline void sdcardfs_copy_and_fix_attrs(struct inode *dest, const struct inode *src)
+{
+	/* sdcardfs-managed attributes */
+	dest->i_mode = (src->i_mode & S_IFMT) |
+			(S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); /* 0775 */
+	dest->i_uid = make_kuid(&init_user_ns, SDCARDFS_I(dest)->data->d_uid);
+	dest->i_gid = make_kgid(&init_user_ns, AID_SDCARD_RW);
+
+	/* attributes taken over from the lower inode as they are */
+	dest->i_rdev = src->i_rdev;
+	dest->i_atime = src->i_atime;
+	dest->i_mtime = src->i_mtime;
+	dest->i_ctime = src->i_ctime;
+	dest->i_blkbits = src->i_blkbits;
+	dest->i_flags = src->i_flags;
+	set_nlink(dest, src->i_nlink);
+}
+
+/* True if the NUL-terminated strings @s1 and @s2 are equal ignoring case. */
+static inline bool str_case_eq(const char *s1, const char *s2)
+{
+	return strcasecmp(s1, s2) == 0;
+}
+
+/* True if the first @len characters of @s1 and @s2 are equal ignoring case. */
+static inline bool str_n_case_eq(const char *s1, const char *s2, size_t len)
+{
+	return strncasecmp(s1, s2, len) == 0;
+}
+
+/*
+ * True if @q1 and @q2 name the same string ignoring case.
+ *
+ * A qstr carries an explicit length, so once the lengths match the comparison
+ * is bounded by that length instead of relying on both names being
+ * NUL-terminated at exactly that offset.
+ */
+static inline bool qstr_case_eq(const struct qstr *q1, const struct qstr *q2)
+{
+	if (q1->len != q2->len)
+		return false;
+	return !strncasecmp(q1->name, q2->name, q2->len);
+}
+
+#define QSTR_LITERAL(string) QSTR_INIT(string, sizeof(string)-1)
+
+#endif /* not _SDCARDFS_H_ */
diff --git a/fs/sdcardfs/super.c b/fs/sdcardfs/super.c
new file mode 100644
index 000000000000..7f4539b4b249
--- /dev/null
+++ b/fs/sdcardfs/super.c
@@ -0,0 +1,324 @@
+/*
+ * fs/sdcardfs/super.c
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ * Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ * Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009 Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed. It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#include <linux/math64.h>
+#include "sdcardfs.h"
+
+/*
+ * The inode cache is used with alloc_inode for both our inode info and the
+ * vfs inode.
+ */
+static struct kmem_cache *sdcardfs_inode_cachep;
+
+/*
+ * To support the top references, we must track some data separately.
+ * An sdcardfs_inode_info always has a reference to its data, and once set up,
+ * also has a reference to its top. The top may be itself, in which case it
+ * holds two references to its data. When top is changed, it takes a ref to the
+ * new data and then drops the ref to the old data.
+ */
+static struct kmem_cache *sdcardfs_inode_data_cachep;
+
+/*
+ * kref release callback for sdcardfs_inode_data: returns the object that
+ * embeds @ref to its slab cache.
+ */
+void data_release(struct kref *ref)
+{
+	struct sdcardfs_inode_data *dead;
+
+	dead = container_of(ref, struct sdcardfs_inode_data, refcount);
+	kmem_cache_free(sdcardfs_inode_data_cachep, dead);
+}
+
+/*
+ * final actions when unmounting a file system
+ *
+ * Frees the obb path string and drops the obb path (if set), drops the
+ * reference on the lower superblock and frees the private sb info.
+ * Does nothing if the superblock has no private info attached.
+ */
+static void sdcardfs_put_super(struct super_block *sb)
+{
+ struct sdcardfs_sb_info *spd;
+ struct super_block *s;
+
+ spd = SDCARDFS_SB(sb);
+ if (!spd)
+ return;
+
+ if (spd->obbpath_s) {
+ kfree(spd->obbpath_s);
+ path_put(&spd->obbpath);
+ }
+
+ /* decrement lower super references */
+ s = sdcardfs_lower_super(sb);
+ sdcardfs_set_lower_super(sb, NULL);
+ atomic_dec(&s->s_active);
+
+ kfree(spd);
+ sb->s_fs_info = NULL;
+}
+
+/*
+ * Report filesystem statistics of the lower filesystem, with the configured
+ * reserve (reserved_mb) hidden from the total, free and available block
+ * counts, and with f_type set to our own magic.
+ *
+ * Returns 0 on success, the error from vfs_statfs() on the lower path, or
+ * -EINVAL if a reserve is configured and the lower filesystem reports a
+ * block size of zero.
+ */
+static int sdcardfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	int err;
+	struct path lower_path;
+	u64 min_blocks;
+	struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+
+	sdcardfs_get_lower_path(dentry, &lower_path);
+	err = vfs_statfs(&lower_path, buf);
+	sdcardfs_put_lower_path(dentry, &lower_path);
+
+	/* do not post-process a buffer the lower filesystem failed to fill */
+	if (err)
+		return err;
+
+	if (sbi->options.reserved_mb) {
+		/* Invalid statfs information. */
+		if (buf->f_bsize == 0) {
+			pr_err("Returned block size is zero.\n");
+			return -EINVAL;
+		}
+
+		/*
+		 * Compute the reserve in 64 bits: reserved_mb * 1024 * 1024
+		 * wraps in unsigned int for reserves of 4096 MB or more.
+		 */
+		min_blocks = div64_u64((u64)sbi->options.reserved_mb * 1024 * 1024,
+				       buf->f_bsize);
+
+		/* clamp instead of wrapping when the reserve exceeds the fs */
+		if (buf->f_blocks > min_blocks)
+			buf->f_blocks -= min_blocks;
+		else
+			buf->f_blocks = 0;
+
+		if (buf->f_bavail > min_blocks)
+			buf->f_bavail -= min_blocks;
+		else
+			buf->f_bavail = 0;
+
+		/* Make reserved blocks invisible to media storage */
+		buf->f_bfree = buf->f_bavail;
+	}
+
+	/* set return buf to our f/s to avoid confusing user-level utils */
+	buf->f_type = SDCARDFS_SUPER_MAGIC;
+
+	return 0;
+}
+
+/*
+ * Remount without a vfsmount: only validates the flags.
+ *
+ * @flags: numeric mount options
+ * @options: mount options string
+ *
+ * Returns 0 if only supported flags are set, -EINVAL otherwise.
+ */
+static int sdcardfs_remount_fs(struct super_block *sb, int *flags, char *options)
+{
+	/*
+	 * The VFS will take care of "ro" and "rw" flags among others. We
+	 * can safely accept a few flags (RDONLY, MANDLOCK), and honor
+	 * SILENT, but anything else left over is an error.
+	 */
+	if ((*flags & ~(MS_RDONLY | MS_MANDLOCK | MS_SILENT)) == 0)
+		return 0;
+
+	pr_err("sdcardfs: remount flags 0x%x unsupported\n", *flags);
+	return -EINVAL;
+}
+
+/*
+ * Remount for a specific vfsmount: re-parses the option string into the
+ * per-mount options stored in mnt->data.
+ *
+ * @mnt: mount point we are remounting
+ * @sb: superblock we are remounting
+ * @flags: numeric mount options
+ * @options: mount options string
+ *
+ * Returns the result of parse_options_remount().
+ */
+static int sdcardfs_remount_fs2(struct vfsmount *mnt, struct super_block *sb,
+ int *flags, char *options)
+{
+ int err = 0;
+
+ /*
+ * The VFS will take care of "ro" and "rw" flags among others. We
+ * can safely accept a few flags (RDONLY, MANDLOCK), and honor
+ * SILENT, but anything else left over is an error.
+ *
+ * NOTE(review): the -EINVAL assigned below is unconditionally
+ * overwritten by the parse_options_remount() result further down, so
+ * unsupported flags are only logged, never rejected - confirm whether
+ * that is intended before tightening it.
+ */
+ if ((*flags & ~(MS_RDONLY | MS_MANDLOCK | MS_SILENT | MS_REMOUNT)) != 0) {
+ pr_err("sdcardfs: remount flags 0x%x unsupported\n", *flags);
+ err = -EINVAL;
+ }
+ pr_info("Remount options were %s for vfsmnt %p.\n", options, mnt);
+ /*
+ * NOTE(review): the third argument is named "silent" in the prototype
+ * but receives every flag except MS_SILENT; presumably
+ * "*flags & MS_SILENT" was meant - verify against parse_options_remount().
+ */
+ err = parse_options_remount(sb, options, *flags & ~MS_SILENT, mnt->data);
+
+
+ return err;
+}
+
+/*
+ * Allocate a copy of the per-mount options in @data.
+ * Returns the new object, or NULL if the allocation fails.
+ */
+static void *sdcardfs_clone_mnt_data(void *data)
+{
+	struct sdcardfs_vfsmount_options *old = data;
+	struct sdcardfs_vfsmount_options *opt;
+
+	opt = kmalloc(sizeof(struct sdcardfs_vfsmount_options), GFP_KERNEL);
+	if (opt) {
+		opt->gid = old->gid;
+		opt->mask = old->mask;
+	}
+	return opt;
+}
+
+/* Overwrite the per-mount options in @data with the values from @newdata. */
+static void sdcardfs_copy_mnt_data(void *data, void *newdata)
+{
+	struct sdcardfs_vfsmount_options *dst = data;
+	struct sdcardfs_vfsmount_options *src = newdata;
+
+	dst->gid = src->gid;
+	dst->mask = src->mask;
+}
+
+/*
+ * Called by iput() when the inode reference count reached zero
+ * and the inode is not hashed anywhere. Used to clear anything
+ * that needs to be, before the inode is completely destroyed and put
+ * on the inode free list.
+ *
+ * Here: drops the page cache pages, releases the reference on the top
+ * data and releases the lower inode.
+ */
+static void sdcardfs_evict_inode(struct inode *inode)
+{
+ struct inode *lower_inode;
+
+ truncate_inode_pages(&inode->i_data, 0);
+ /* drop our reference on the top data */
+ set_top(SDCARDFS_I(inode), NULL);
+ clear_inode(inode);
+ /*
+ * Decrement a reference to a lower_inode, which was incremented
+ * by our read_inode when it was created initially.
+ */
+ lower_inode = sdcardfs_lower_inode(inode);
+ sdcardfs_set_lower_inode(inode, NULL);
+ iput(lower_inode);
+}
+
+/*
+ * Allocate an sdcardfs inode together with its separately tracked data.
+ *
+ * The inode info is zeroed up to (not including) the embedded VFS inode,
+ * the data object is allocated zeroed and starts with one reference, which
+ * is owned by the inode.
+ *
+ * Returns the embedded VFS inode, or NULL if either allocation fails.
+ */
+static struct inode *sdcardfs_alloc_inode(struct super_block *sb)
+{
+	struct sdcardfs_inode_info *info;
+	struct sdcardfs_inode_data *data;
+
+	info = kmem_cache_alloc(sdcardfs_inode_cachep, GFP_KERNEL);
+	if (!info)
+		return NULL;
+
+	/* memset everything up to the inode to 0 */
+	memset(info, 0, offsetof(struct sdcardfs_inode_info, vfs_inode));
+
+	data = kmem_cache_alloc(sdcardfs_inode_data_cachep,
+				GFP_KERNEL | __GFP_ZERO);
+	if (!data) {
+		kmem_cache_free(sdcardfs_inode_cachep, info);
+		return NULL;
+	}
+
+	info->data = data;
+	kref_init(&data->refcount);
+
+	info->vfs_inode.i_version = 1;
+	return &info->vfs_inode;
+}
+
+/*
+ * RCU callback queued by sdcardfs_destroy_inode(): drops the inode's own
+ * data reference and frees the inode info.
+ */
+static void i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	struct sdcardfs_inode_info *info = SDCARDFS_I(inode);
+
+	release_own_data(info);
+	kmem_cache_free(sdcardfs_inode_cachep, info);
+}
+
+/* Defer freeing of the inode to i_callback() via call_rcu(). */
+static void sdcardfs_destroy_inode(struct inode *inode)
+{
+ call_rcu(&inode->i_rcu, i_callback);
+}
+
+/*
+ * sdcardfs inode cache constructor: runs inode_init_once() on the VFS inode
+ * embedded in each sdcardfs_inode_info.
+ */
+static void init_once(void *obj)
+{
+ struct sdcardfs_inode_info *i = obj;
+
+ inode_init_once(&i->vfs_inode);
+}
+
+/*
+ * Create the slab caches for sdcardfs_inode_info and sdcardfs_inode_data.
+ *
+ * Returns 0 on success or -ENOMEM; if the second cache cannot be created the
+ * first one is destroyed again.
+ */
+int sdcardfs_init_inode_cache(void)
+{
+	sdcardfs_inode_cachep =
+		kmem_cache_create("sdcardfs_inode_cache",
+				  sizeof(struct sdcardfs_inode_info), 0,
+				  SLAB_RECLAIM_ACCOUNT, init_once);
+	if (!sdcardfs_inode_cachep)
+		return -ENOMEM;
+
+	sdcardfs_inode_data_cachep =
+		kmem_cache_create("sdcardfs_inode_data_cache",
+				  sizeof(struct sdcardfs_inode_data), 0,
+				  SLAB_RECLAIM_ACCOUNT, NULL);
+	if (sdcardfs_inode_data_cachep)
+		return 0;
+
+	kmem_cache_destroy(sdcardfs_inode_cachep);
+	return -ENOMEM;
+}
+
+/*
+ * sdcardfs inode cache destructor: destroys both slab caches created by
+ * sdcardfs_init_inode_cache().
+ */
+void sdcardfs_destroy_inode_cache(void)
+{
+ kmem_cache_destroy(sdcardfs_inode_data_cachep);
+ kmem_cache_destroy(sdcardfs_inode_cachep);
+}
+
+/*
+ * Forward ->umount_begin() to the lower superblock, if it has one.
+ *
+ * Used only in nfs, to kill any pending RPC tasks, so that subsequent
+ * code can actually succeed and won't leave tasks that need handling.
+ */
+static void sdcardfs_umount_begin(struct super_block *sb)
+{
+	struct super_block *lower_sb = sdcardfs_lower_super(sb);
+
+	if (!lower_sb || !lower_sb->s_op)
+		return;
+	if (!lower_sb->s_op->umount_begin)
+		return;
+
+	lower_sb->s_op->umount_begin(lower_sb);
+}
+
+/*
+ * Print the mount options of @mnt to @m. Superblock-wide options come from
+ * the sb info, gid and mask come from the per-mount options in mnt->data.
+ * Options that are zero/unset are omitted. Always returns 0.
+ */
+static int sdcardfs_show_options(struct vfsmount *mnt, struct seq_file *m,
+		struct dentry *root)
+{
+	struct sdcardfs_sb_info *sbi = SDCARDFS_SB(root->d_sb);
+	struct sdcardfs_mount_options *opts = &sbi->options;
+	struct sdcardfs_vfsmount_options *vfsopts = mnt->data;
+
+	if (opts->fs_low_uid != 0)
+		seq_printf(m, ",fsuid=%u", opts->fs_low_uid);
+	if (opts->fs_low_gid != 0)
+		seq_printf(m, ",fsgid=%u", opts->fs_low_gid);
+	if (vfsopts->gid != 0)
+		seq_printf(m, ",gid=%u", vfsopts->gid);
+	if (opts->multiuser)
+		seq_puts(m, ",multiuser");
+	if (vfsopts->mask)
+		seq_printf(m, ",mask=%u", vfsopts->mask);
+	if (opts->fs_user_id)
+		seq_printf(m, ",userid=%u", opts->fs_user_id);
+	if (opts->reserved_mb != 0)
+		seq_printf(m, ",reserved=%uMB", opts->reserved_mb);
+
+	return 0;
+}
+
+/*
+ * Superblock operations of sdcardfs. Both the plain remount_fs and the
+ * vfsmount-aware remount_fs2 hooks are provided; show_options2,
+ * clone_mnt_data and copy_mnt_data deal with the per-mount options.
+ */
+const struct super_operations sdcardfs_sops = {
+ .put_super = sdcardfs_put_super,
+ .statfs = sdcardfs_statfs,
+ .remount_fs = sdcardfs_remount_fs,
+ .remount_fs2 = sdcardfs_remount_fs2,
+ .clone_mnt_data = sdcardfs_clone_mnt_data,
+ .copy_mnt_data = sdcardfs_copy_mnt_data,
+ .evict_inode = sdcardfs_evict_inode,
+ .umount_begin = sdcardfs_umount_begin,
+ .show_options2 = sdcardfs_show_options,
+ .alloc_inode = sdcardfs_alloc_inode,
+ .destroy_inode = sdcardfs_destroy_inode,
+ .drop_inode = generic_delete_inode,
+};
diff --git a/fs/super.c b/fs/super.c
index 7e9beab77259..d80a1d2d491b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -761,7 +761,8 @@ rescan:
}
/**
- * do_remount_sb - asks filesystem to change mount options.
+ * do_remount_sb2 - asks filesystem to change mount options.
+ * @mnt: mount we are looking at
* @sb: superblock in question
* @flags: numeric part of options
* @data: the rest of options
@@ -769,7 +770,7 @@ rescan:
*
* Alters the mount options of a mounted file system.
*/
-int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
+int do_remount_sb2(struct vfsmount *mnt, struct super_block *sb, int flags, void *data, int force)
{
int retval;
int remount_ro;
@@ -811,7 +812,16 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
}
}
- if (sb->s_op->remount_fs) {
+ if (mnt && sb->s_op->remount_fs2) {
+ retval = sb->s_op->remount_fs2(mnt, sb, &flags, data);
+ if (retval) {
+ if (!force)
+ goto cancel_readonly;
+ /* If forced remount, go ahead despite any errors */
+ WARN(1, "forced remount of a %s fs returned %i\n",
+ sb->s_type->name, retval);
+ }
+ } else if (sb->s_op->remount_fs) {
retval = sb->s_op->remount_fs(sb, &flags, data);
if (retval) {
if (!force)
@@ -843,12 +853,17 @@ cancel_readonly:
return retval;
}
+int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
+{
+ return do_remount_sb2(NULL, sb, flags, data, force);
+}
+
static void do_emergency_remount(struct work_struct *work)
{
struct super_block *sb, *p = NULL;
spin_lock(&sb_lock);
- list_for_each_entry(sb, &super_blocks, s_list) {
+ list_for_each_entry_reverse(sb, &super_blocks, s_list) {
if (hlist_unhashed(&sb->s_instances))
continue;
sb->s_count++;
@@ -1168,7 +1183,7 @@ struct dentry *mount_single(struct file_system_type *fs_type,
EXPORT_SYMBOL(mount_single);
struct dentry *
-mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
+mount_fs(struct file_system_type *type, int flags, const char *name, struct vfsmount *mnt, void *data)
{
struct dentry *root;
struct super_block *sb;
@@ -1185,7 +1200,10 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
goto out_free_secdata;
}
- root = type->mount(type, flags, name, data);
+ if (type->mount2)
+ root = type->mount2(mnt, type, flags, name, data);
+ else
+ root = type->mount(type, flags, name, data);
if (IS_ERR(root)) {
error = PTR_ERR(root);
goto out_free_secdata;
diff --git a/fs/sync.c b/fs/sync.c
index 2a54c1f22035..5c2420c3eb17 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -218,6 +218,7 @@ static int do_fsync(unsigned int fd, int datasync)
if (f.file) {
ret = vfs_fsync(f.file, datasync);
fdput(f);
+ inc_syscfs(current);
}
return ret;
}
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 784d667475ae..9d9c032d5fe5 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -493,7 +493,8 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
new_flags, vma->anon_vma,
vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
- NULL_VM_UFFD_CTX);
+ NULL_VM_UFFD_CTX,
+ vma_get_anon_name(vma));
if (prev)
vma = prev;
else
@@ -872,7 +873,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
prev = vma_merge(mm, prev, start, vma_end, new_flags,
vma->anon_vma, vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
- ((struct vm_userfaultfd_ctx){ ctx }));
+ ((struct vm_userfaultfd_ctx){ ctx }),
+ vma_get_anon_name(vma));
if (prev) {
vma = prev;
goto next;
@@ -1009,7 +1011,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
prev = vma_merge(mm, prev, start, vma_end, new_flags,
vma->anon_vma, vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
- NULL_VM_UFFD_CTX);
+ NULL_VM_UFFD_CTX,
+ vma_get_anon_name(vma));
if (prev) {
vma = prev;
goto next;
diff --git a/fs/utimes.c b/fs/utimes.c
index 22307cdf7014..87ce37bcaa84 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -91,7 +91,7 @@ static int utimes_common(struct path *path, struct timespec *times)
}
retry_deleg:
inode_lock(inode);
- error = notify_change(path->dentry, &newattrs, &delegated_inode);
+ error = notify_change2(path->mnt, path->dentry, &newattrs, &delegated_inode);
inode_unlock(inode);
if (delegated_inode) {
error = break_deleg_wait(&delegated_inode);
diff --git a/include/drm/drm_atomic.h b/include/drm/drm_atomic.h
index 9701f2dfb784..a5696c1806c9 100644
--- a/include/drm/drm_atomic.h
+++ b/include/drm/drm_atomic.h
@@ -144,6 +144,7 @@ struct __drm_crtcs_state {
struct drm_crtc *ptr;
struct drm_crtc_state *state;
struct drm_crtc_commit *commit;
+ s32 __user *out_fence_ptr;
};
struct __drm_connnectors_state {
@@ -316,6 +317,8 @@ drm_atomic_set_crtc_for_plane(struct drm_plane_state *plane_state,
struct drm_crtc *crtc);
void drm_atomic_set_fb_for_plane(struct drm_plane_state *plane_state,
struct drm_framebuffer *fb);
+void drm_atomic_set_fence_for_plane(struct drm_plane_state *plane_state,
+ struct fence *fence);
int __must_check
drm_atomic_set_crtc_for_connector(struct drm_connector_state *conn_state,
struct drm_crtc *crtc);
diff --git a/include/drm/drm_crtc.h b/include/drm/drm_crtc.h
index 0aa292526567..f3d58c7eb97e 100644
--- a/include/drm/drm_crtc.h
+++ b/include/drm/drm_crtc.h
@@ -680,6 +680,35 @@ struct drm_crtc {
* context.
*/
struct drm_modeset_acquire_ctx *acquire_ctx;
+
+ /**
+ * @fence_context:
+ *
+ * timeline context used for fence operations.
+ */
+ unsigned int fence_context;
+
+ /**
+ * @fence_lock:
+ *
+ * spinlock to protect the fences in the fence_context.
+ */
+
+ spinlock_t fence_lock;
+ /**
+ * @fence_seqno:
+ *
+ * Seqno variable used as monotonic counter for the fences
+ * created on the CRTC's timeline.
+ */
+ unsigned long fence_seqno;
+
+ /**
+ * @timeline_name:
+ *
+ * The name of the CRTC's fence timeline.
+ */
+ char timeline_name[32];
};
/**
@@ -1160,6 +1189,17 @@ struct drm_mode_config {
*/
struct drm_property *prop_fb_id;
/**
+ * @prop_in_fence_fd: Sync File fd representing the incoming fences
+ * for a Plane.
+ */
+ struct drm_property *prop_in_fence_fd;
+ /**
+ * @prop_out_fence_ptr: Sync File fd pointer representing the
+ * outgoing fences for a CRTC. Userspace should provide a pointer to a
+ * value of type s32, and then cast that pointer to u64.
+ */
+ struct drm_property *prop_out_fence_ptr;
+ /**
* @prop_crtc_id: Default atomic plane property to specify the
* &drm_crtc.
*/
diff --git a/include/drm/drm_fb_cma_helper.h b/include/drm/drm_fb_cma_helper.h
index f313211f8ed5..3b00f6480b83 100644
--- a/include/drm/drm_fb_cma_helper.h
+++ b/include/drm/drm_fb_cma_helper.h
@@ -12,6 +12,8 @@ struct drm_fb_helper;
struct drm_device;
struct drm_file;
struct drm_mode_fb_cmd2;
+struct drm_plane;
+struct drm_plane_state;
struct drm_fbdev_cma *drm_fbdev_cma_init_with_funcs(struct drm_device *dev,
unsigned int preferred_bpp, unsigned int num_crtc,
@@ -41,6 +43,9 @@ struct drm_framebuffer *drm_fb_cma_create(struct drm_device *dev,
struct drm_gem_cma_object *drm_fb_cma_get_gem_obj(struct drm_framebuffer *fb,
unsigned int plane);
+int drm_fb_cma_prepare_fb(struct drm_plane *plane,
+ struct drm_plane_state *state);
+
#ifdef CONFIG_DEBUG_FS
struct seq_file;
diff --git a/include/drm/drm_plane.h b/include/drm/drm_plane.h
index 8b4dc62470ff..952ef84dc046 100644
--- a/include/drm/drm_plane.h
+++ b/include/drm/drm_plane.h
@@ -65,7 +65,7 @@ struct drm_plane_state {
struct drm_crtc *crtc; /* do not write directly, use drm_atomic_set_crtc_for_plane() */
struct drm_framebuffer *fb; /* do not write directly, use drm_atomic_set_fb_for_plane() */
- struct fence *fence;
+ struct fence *fence; /* do not write directly, use drm_atomic_set_fence_for_plane() */
/* Signed dest location allows it to be partially off screen */
int32_t crtc_x, crtc_y;
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
new file mode 100644
index 000000000000..a4608897a5f3
--- /dev/null
+++ b/include/linux/Kbuild
@@ -0,0 +1,2 @@
+header-y += if_pppolac.h
+header-y += if_pppopns.h
diff --git a/include/linux/amba/mmci.h b/include/linux/amba/mmci.h
index 8c98113069ce..eff56cb0016a 100644
--- a/include/linux/amba/mmci.h
+++ b/include/linux/amba/mmci.h
@@ -5,6 +5,15 @@
#define AMBA_MMCI_H
#include <linux/mmc/host.h>
+#include <linux/mmc/card.h>
+#include <linux/mmc/sdio_func.h>
+
+struct embedded_sdio_data {
+ struct sdio_cis cis;
+ struct sdio_cccr cccr;
+ struct sdio_embedded_func *funcs;
+ int num_funcs;
+};
/**
* struct mmci_platform_data - platform configuration for the MMCI
@@ -31,6 +40,7 @@ struct mmci_platform_data {
int gpio_wp;
int gpio_cd;
bool cd_invert;
+ struct embedded_sdio_data *embedded_sdio;
};
#endif
diff --git a/include/linux/android_aid.h b/include/linux/android_aid.h
new file mode 100644
index 000000000000..6f1fa1792dfc
--- /dev/null
+++ b/include/linux/android_aid.h
@@ -0,0 +1,28 @@
+/* include/linux/android_aid.h
+ *
+ * Copyright (C) 2008 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _LINUX_ANDROID_AID_H
+#define _LINUX_ANDROID_AID_H
+
+/* AIDs that the kernel treats differently */
+#define AID_OBSOLETE_000 KGIDT_INIT(3001) /* was NET_BT_ADMIN */
+#define AID_OBSOLETE_001 KGIDT_INIT(3002) /* was NET_BT */
+#define AID_INET KGIDT_INIT(3003)
+#define AID_NET_RAW KGIDT_INIT(3004)
+#define AID_NET_ADMIN KGIDT_INIT(3005)
+#define AID_NET_BW_STATS KGIDT_INIT(3006) /* read bandwidth statistics */
+#define AID_NET_BW_ACCT KGIDT_INIT(3007) /* change bandwidth statistics accounting */
+
+#endif
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index bd738aafd432..238261e80ed5 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -196,6 +196,9 @@ struct request {
/* for bidi */
struct request *next_rq;
+
+ ktime_t lat_hist_io_start;
+ int lat_hist_enabled;
};
#define REQ_OP_SHIFT (8 * sizeof(u64) - REQ_OP_BITS)
@@ -1700,6 +1703,79 @@ extern int bdev_write_page(struct block_device *, sector_t, struct page *,
extern long bdev_direct_access(struct block_device *, struct blk_dax_ctl *);
extern int bdev_dax_supported(struct super_block *, int);
extern bool bdev_dax_capable(struct block_device *);
+
+/*
+ * X-axis for IO latency histogram support.
+ */
+static const u_int64_t latency_x_axis_us[] = {
+ 100,
+ 200,
+ 300,
+ 400,
+ 500,
+ 600,
+ 700,
+ 800,
+ 900,
+ 1000,
+ 1200,
+ 1400,
+ 1600,
+ 1800,
+ 2000,
+ 2500,
+ 3000,
+ 4000,
+ 5000,
+ 6000,
+ 7000,
+ 9000,
+ 10000
+};
+
+#define BLK_IO_LAT_HIST_DISABLE 0
+#define BLK_IO_LAT_HIST_ENABLE 1
+#define BLK_IO_LAT_HIST_ZERO 2
+
+struct io_latency_state {
+ u_int64_t latency_y_axis_read[ARRAY_SIZE(latency_x_axis_us) + 1];
+ u_int64_t latency_reads_elems;
+ u_int64_t latency_y_axis_write[ARRAY_SIZE(latency_x_axis_us) + 1];
+ u_int64_t latency_writes_elems;
+};
+
+static inline void
+blk_update_latency_hist(struct io_latency_state *s,
+ int read,
+ u_int64_t delta_us)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(latency_x_axis_us); i++) {
+ if (delta_us < (u_int64_t)latency_x_axis_us[i]) {
+ if (read)
+ s->latency_y_axis_read[i]++;
+ else
+ s->latency_y_axis_write[i]++;
+ break;
+ }
+ }
+ if (i == ARRAY_SIZE(latency_x_axis_us)) {
+ /* Overflowed the histogram */
+ if (read)
+ s->latency_y_axis_read[i]++;
+ else
+ s->latency_y_axis_write[i]++;
+ }
+ if (read)
+ s->latency_reads_elems++;
+ else
+ s->latency_writes_elems++;
+}
+
+void blk_zero_latency_hist(struct io_latency_state *s);
+ssize_t blk_latency_hist_show(struct io_latency_state *s, char *buf);
+
#else /* CONFIG_BLOCK */
struct block_device;
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
new file mode 100644
index 000000000000..ace92fce296d
--- /dev/null
+++ b/include/linux/bpf-cgroup.h
@@ -0,0 +1,77 @@
+#ifndef _BPF_CGROUP_H
+#define _BPF_CGROUP_H
+
+#include <linux/jump_label.h>
+#include <uapi/linux/bpf.h>
+
+struct sock;
+struct cgroup;
+struct sk_buff;
+
+#ifdef CONFIG_CGROUP_BPF
+
+extern struct static_key_false cgroup_bpf_enabled_key;
+#define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)
+
+struct cgroup_bpf {
+ /*
+ * Store two sets of bpf_prog pointers, one for programs that are
+ * pinned directly to this cgroup, and one for those that are effective
+ * when this cgroup is accessed.
+ */
+ struct bpf_prog *prog[MAX_BPF_ATTACH_TYPE];
+ struct bpf_prog *effective[MAX_BPF_ATTACH_TYPE];
+ bool disallow_override[MAX_BPF_ATTACH_TYPE];
+};
+
+void cgroup_bpf_put(struct cgroup *cgrp);
+void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent);
+
+int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent,
+ struct bpf_prog *prog, enum bpf_attach_type type,
+ bool overridable);
+
+/* Wrapper for __cgroup_bpf_update() protected by cgroup_mutex */
+int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type, bool overridable);
+
+int __cgroup_bpf_run_filter(struct sock *sk,
+ struct sk_buff *skb,
+ enum bpf_attach_type type);
+
+/* Wrappers for __cgroup_bpf_run_filter() guarded by cgroup_bpf_enabled. */
+#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) \
+({ \
+ int __ret = 0; \
+ if (cgroup_bpf_enabled) \
+ __ret = __cgroup_bpf_run_filter(sk, skb, \
+ BPF_CGROUP_INET_INGRESS); \
+ \
+ __ret; \
+})
+
+#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) \
+({ \
+ int __ret = 0; \
+ if (cgroup_bpf_enabled && sk && sk == skb->sk) { \
+ typeof(sk) __sk = sk_to_full_sk(sk); \
+ if (sk_fullsock(__sk)) \
+ __ret = __cgroup_bpf_run_filter(__sk, skb, \
+ BPF_CGROUP_INET_EGRESS); \
+ } \
+ __ret; \
+})
+
+#else
+
+struct cgroup_bpf {};
+static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
+static inline void cgroup_bpf_inherit(struct cgroup *cgrp,
+ struct cgroup *parent) {}
+
+#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
+
+#endif /* CONFIG_CGROUP_BPF */
+
+#endif /* _BPF_CGROUP_H */
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 1619a3213af5..9c72b21d4b39 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -16,6 +16,7 @@
#include <linux/percpu-refcount.h>
#include <linux/percpu-rwsem.h>
#include <linux/workqueue.h>
+#include <linux/bpf-cgroup.h>
#ifdef CONFIG_CGROUPS
@@ -301,6 +302,9 @@ struct cgroup {
/* used to schedule release agent */
struct work_struct release_agent_work;
+ /* used to store eBPF programs */
+ struct cgroup_bpf bpf;
+
/* ids of the ancestors at each level including self */
int ancestor_ids[];
};
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 0df0336acee9..7f4a2a5a2a77 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -20,6 +20,10 @@ SUBSYS(cpu)
SUBSYS(cpuacct)
#endif
+#if IS_ENABLED(CONFIG_CGROUP_SCHEDTUNE)
+SUBSYS(schedtune)
+#endif
+
#if IS_ENABLED(CONFIG_BLK_CGROUP)
SUBSYS(io)
#endif
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index ae5ac89324df..fdf5be472eff 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -276,4 +276,11 @@ static inline void cpu_smt_check_topology_early(void) { }
static inline void cpu_smt_check_topology(void) { }
#endif
+#define IDLE_START 1
+#define IDLE_END 2
+
+void idle_notifier_register(struct notifier_block *n);
+void idle_notifier_unregister(struct notifier_block *n);
+void idle_notifier_call_chain(unsigned long val);
+
#endif /* _LINUX_CPU_H_ */
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 32dc0cbd51ca..cc57986d3bfe 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -177,6 +177,7 @@ u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy);
int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu);
int cpufreq_update_policy(unsigned int cpu);
bool have_governor_per_policy(void);
+bool cpufreq_driver_is_slow(void);
struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy);
void cpufreq_enable_fast_switch(struct cpufreq_policy *policy);
void cpufreq_disable_fast_switch(struct cpufreq_policy *policy);
@@ -359,6 +360,14 @@ struct cpufreq_driver {
*/
#define CPUFREQ_NEED_INITIAL_FREQ_CHECK (1 << 5)
+/*
+ * Indicates that it is safe to call cpufreq_driver_target from
+ * non-interruptible context in scheduler hot paths. Drivers must
+ * opt-in to this flag, as the safe default is that they might sleep
+ * or be too slow for hot path use.
+ */
+#define CPUFREQ_DRIVER_FAST (1 << 6)
+
int cpufreq_register_driver(struct cpufreq_driver *driver_data);
int cpufreq_unregister_driver(struct cpufreq_driver *driver_data);
@@ -553,6 +562,32 @@ struct governor_attr {
ssize_t (*store)(struct gov_attr_set *attr_set, const char *buf,
size_t count);
};
+/* CPUFREQ DEFAULT GOVERNOR */
+/*
+ * Performance governor is fallback governor if any other gov failed to auto
+ * load due to latency restrictions
+ */
+#ifdef CONFIG_CPU_FREQ_GOV_PERFORMANCE
+extern struct cpufreq_governor cpufreq_gov_performance;
+#endif
+#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE
+#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_performance)
+#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE)
+extern struct cpufreq_governor cpufreq_gov_powersave;
+#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_powersave)
+#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE)
+extern struct cpufreq_governor cpufreq_gov_userspace;
+#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_userspace)
+#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND)
+extern struct cpufreq_governor cpufreq_gov_ondemand;
+#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_ondemand)
+#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE)
+extern struct cpufreq_governor cpufreq_gov_conservative;
+#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_conservative)
+#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED)
+extern struct cpufreq_governor cpufreq_gov_sched;
+#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_sched)
+#endif
/*********************************************************************
* FREQUENCY TABLE HELPERS *
@@ -886,4 +921,8 @@ unsigned int cpufreq_generic_get(unsigned int cpu);
int cpufreq_generic_init(struct cpufreq_policy *policy,
struct cpufreq_frequency_table *table,
unsigned int transition_latency);
+
+struct sched_domain;
+unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu);
+unsigned long cpufreq_scale_max_freq_capacity(int cpu);
#endif /* _LINUX_CPUFREQ_H */
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index bb31373c3478..9a8eec9e59b2 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -207,7 +207,7 @@ static inline int cpuidle_enter_freeze(struct cpuidle_driver *drv,
#endif
/* kernel/sched/idle.c */
-extern void sched_idle_set_state(struct cpuidle_state *idle_state);
+extern void sched_idle_set_state(struct cpuidle_state *idle_state, int index);
extern void default_idle_call(void);
#ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index b757ee42bc63..014d7f9ac615 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -142,6 +142,7 @@ struct dentry_operations {
int (*d_manage)(struct dentry *, bool);
struct dentry *(*d_real)(struct dentry *, const struct inode *,
unsigned int);
+ void (*d_canonical_path)(const struct path *, struct path *);
} ____cacheline_aligned;
/*
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index ef7962e84444..cf86f528e615 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -408,6 +408,12 @@ void dm_set_mdptr(struct mapped_device *md, void *ptr);
void *dm_get_mdptr(struct mapped_device *md);
/*
+ * Export the device via the ioctl interface (uses mdptr).
+ */
+int dm_ioctl_export(struct mapped_device *md, const char *name,
+ const char *uuid);
+
+/*
* A device can still be used while suspended, but I/O is deferred.
*/
int dm_suspend(struct mapped_device *md, unsigned suspend_flags);
@@ -434,6 +440,13 @@ union map_info *dm_get_rq_mapinfo(struct request *rq);
struct queue_limits *dm_get_queue_limits(struct mapped_device *md);
+void dm_lock_md_type(struct mapped_device *md);
+void dm_unlock_md_type(struct mapped_device *md);
+void dm_set_md_type(struct mapped_device *md, unsigned type);
+unsigned dm_get_md_type(struct mapped_device *md);
+int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t);
+unsigned dm_table_get_type(struct dm_table *t);
+
/*
* Geometry functions.
*/
diff --git a/include/linux/fence.h b/include/linux/fence.h
index 9bb2c0c97a21..7c9b78cf2a7e 100644
--- a/include/linux/fence.h
+++ b/include/linux/fence.h
@@ -108,6 +108,7 @@ struct fence_cb {
* @get_driver_name: returns the driver name.
* @get_timeline_name: return the name of the context this fence belongs to.
* @enable_signaling: enable software signaling of fence.
+ * @disable_signaling: disable software signaling of fence (optional).
* @signaled: [optional] peek whether the fence is signaled, can be null.
* @wait: custom wait implementation, or fence_default_wait.
* @release: [optional] called on destruction of fence, can be null
@@ -167,6 +168,7 @@ struct fence_ops {
const char * (*get_driver_name)(struct fence *fence);
const char * (*get_timeline_name)(struct fence *fence);
bool (*enable_signaling)(struct fence *fence);
+ void (*disable_signaling)(struct fence *fence);
bool (*signaled)(struct fence *fence);
signed long (*wait)(struct fence *fence, bool intr, signed long timeout);
void (*release)(struct fence *fence);
@@ -183,6 +185,16 @@ void fence_release(struct kref *kref);
void fence_free(struct fence *fence);
/**
+ * fence_put - decreases refcount of the fence
+ * @fence: [in] fence to reduce refcount of
+ */
+static inline void fence_put(struct fence *fence)
+{
+ if (fence)
+ kref_put(&fence->refcount, fence_release);
+}
+
+/**
* fence_get - increases refcount of the fence
* @fence: [in] fence to increase refcount of
*
@@ -210,13 +222,49 @@ static inline struct fence *fence_get_rcu(struct fence *fence)
}
/**
- * fence_put - decreases refcount of the fence
- * @fence: [in] fence to reduce refcount of
+ * fence_get_rcu_safe - acquire a reference to an RCU tracked fence
+ * @fence: [in] pointer to fence to increase refcount of
+ *
+ * Function returns NULL if no refcount could be obtained, or the fence.
+ * This function handles acquiring a reference to a fence that may be
+ * reallocated within the RCU grace period (such as with SLAB_DESTROY_BY_RCU),
+ * so long as the caller is using RCU on the pointer to the fence.
+ *
+ * An alternative mechanism is to employ a seqlock to protect a bunch of
+ * fences, such as used by struct reservation_object. When using a seqlock,
+ * the seqlock must be taken before and checked after a reference to the
+ * fence is acquired (as shown here).
+ *
+ * The caller is required to hold the RCU read lock.
*/
-static inline void fence_put(struct fence *fence)
+static inline struct fence *fence_get_rcu_safe(struct fence * __rcu *fencep)
{
- if (fence)
- kref_put(&fence->refcount, fence_release);
+ do {
+ struct fence *fence;
+
+ fence = rcu_dereference(*fencep);
+ if (!fence || !fence_get_rcu(fence))
+ return NULL;
+
+ /* The atomic_inc_not_zero() inside fence_get_rcu()
+ * provides a full memory barrier upon success (such as now).
+ * This is paired with the write barrier from assigning
+ * to the __rcu protected fence pointer so that if that
+ * pointer still matches the current fence, we know we
+ * have successfully acquired a reference to it. If it no
+ * longer matches, we are holding a reference to some other
+ * reallocated pointer. This is possible if the allocator
+ * is using a freelist like SLAB_DESTROY_BY_RCU where the
+ * fence remains valid for the RCU grace period, but it
+ * may be reallocated. When using such allocators, we are
+ * responsible for ensuring the reference we get is to
+ * the right fence, as below.
+ */
+ if (fence == rcu_access_pointer(*fencep))
+ return rcu_pointer_handoff(fence);
+
+ fence_put(fence);
+ } while (1);
}
int fence_signal(struct fence *fence);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bcad2b963296..ba61781e1b54 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1612,13 +1612,21 @@ extern bool inode_owner_or_capable(const struct inode *inode);
* VFS helper functions..
*/
extern int vfs_create(struct inode *, struct dentry *, umode_t, bool);
+extern int vfs_create2(struct vfsmount *, struct inode *, struct dentry *, umode_t, bool);
extern int vfs_mkdir(struct inode *, struct dentry *, umode_t);
+extern int vfs_mkdir2(struct vfsmount *, struct inode *, struct dentry *, umode_t);
extern int vfs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
+extern int vfs_mknod2(struct vfsmount *, struct inode *, struct dentry *, umode_t, dev_t);
extern int vfs_symlink(struct inode *, struct dentry *, const char *);
+extern int vfs_symlink2(struct vfsmount *, struct inode *, struct dentry *, const char *);
extern int vfs_link(struct dentry *, struct inode *, struct dentry *, struct inode **);
+extern int vfs_link2(struct vfsmount *, struct dentry *, struct inode *, struct dentry *, struct inode **);
extern int vfs_rmdir(struct inode *, struct dentry *);
+extern int vfs_rmdir2(struct vfsmount *, struct inode *, struct dentry *);
extern int vfs_unlink(struct inode *, struct dentry *, struct inode **);
+extern int vfs_unlink2(struct vfsmount *, struct inode *, struct dentry *, struct inode **);
extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int);
+extern int vfs_rename2(struct vfsmount *, struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int);
extern int vfs_whiteout(struct inode *, struct dentry *);
/*
@@ -1746,6 +1754,7 @@ struct inode_operations {
struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *);
int (*permission) (struct inode *, int);
+ int (*permission2) (struct vfsmount *, struct inode *, int);
struct posix_acl * (*get_acl)(struct inode *, int);
int (*readlink) (struct dentry *, char __user *,int);
@@ -1760,6 +1769,7 @@ struct inode_operations {
int (*rename) (struct inode *, struct dentry *,
struct inode *, struct dentry *, unsigned int);
int (*setattr) (struct dentry *, struct iattr *);
+ int (*setattr2) (struct vfsmount *, struct dentry *, struct iattr *);
int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *);
ssize_t (*listxattr) (struct dentry *, char *, size_t);
int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
@@ -1808,9 +1818,13 @@ struct super_operations {
int (*unfreeze_fs) (struct super_block *);
int (*statfs) (struct dentry *, struct kstatfs *);
int (*remount_fs) (struct super_block *, int *, char *);
+ int (*remount_fs2) (struct vfsmount *, struct super_block *, int *, char *);
+ void *(*clone_mnt_data) (void *);
+ void (*copy_mnt_data) (void *, void *);
void (*umount_begin) (struct super_block *);
int (*show_options)(struct seq_file *, struct dentry *);
+ int (*show_options2)(struct vfsmount *,struct seq_file *, struct dentry *);
int (*show_devname)(struct seq_file *, struct dentry *);
int (*show_path)(struct seq_file *, struct dentry *);
int (*show_stats)(struct seq_file *, struct dentry *);
@@ -2044,6 +2058,9 @@ struct file_system_type {
#define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */
struct dentry *(*mount) (struct file_system_type *, int,
const char *, void *);
+ struct dentry *(*mount2) (struct vfsmount *, struct file_system_type *, int,
+ const char *, void *);
+ void *(*alloc_mnt_data) (void);
void (*kill_sb) (struct super_block *);
struct module *owner;
struct file_system_type * next;
@@ -2342,6 +2359,8 @@ struct filename {
extern long vfs_truncate(const struct path *, loff_t);
extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs,
struct file *filp);
+extern int do_truncate2(struct vfsmount *, struct dentry *, loff_t start,
+ unsigned int time_attrs, struct file *filp);
extern int vfs_fallocate(struct file *file, int mode, loff_t offset,
loff_t len);
extern long do_sys_open(int dfd, const char __user *filename, int flags,
@@ -2584,8 +2603,11 @@ extern void emergency_remount(void);
extern sector_t bmap(struct inode *, sector_t);
#endif
extern int notify_change(struct dentry *, struct iattr *, struct inode **);
+extern int notify_change2(struct vfsmount *, struct dentry *, struct iattr *, struct inode **);
extern int inode_permission(struct inode *, int);
+extern int inode_permission2(struct vfsmount *, struct inode *, int);
extern int __inode_permission(struct inode *, int);
+extern int __inode_permission2(struct vfsmount *, struct inode *, int);
extern int generic_permission(struct inode *, int);
extern int __check_sticky(struct inode *dir, struct inode *inode);
diff --git a/include/linux/gpio_event.h b/include/linux/gpio_event.h
new file mode 100644
index 000000000000..2613fc5e4a93
--- /dev/null
+++ b/include/linux/gpio_event.h
@@ -0,0 +1,170 @@
+/* include/linux/gpio_event.h
+ *
+ * Copyright (C) 2007 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _LINUX_GPIO_EVENT_H
+#define _LINUX_GPIO_EVENT_H
+
+#include <linux/input.h>
+
+struct gpio_event_input_devs {
+ int count;
+ struct input_dev *dev[];
+};
+enum {
+ GPIO_EVENT_FUNC_UNINIT = 0x0,
+ GPIO_EVENT_FUNC_INIT = 0x1,
+ GPIO_EVENT_FUNC_SUSPEND = 0x2,
+ GPIO_EVENT_FUNC_RESUME = 0x3,
+};
+struct gpio_event_info {
+ int (*func)(struct gpio_event_input_devs *input_devs,
+ struct gpio_event_info *info,
+ void **data, int func);
+ int (*event)(struct gpio_event_input_devs *input_devs,
+ struct gpio_event_info *info,
+ void **data, unsigned int dev, unsigned int type,
+ unsigned int code, int value); /* out events */
+ bool no_suspend;
+};
+
+struct gpio_event_platform_data {
+ const char *name;
+ struct gpio_event_info **info;
+ size_t info_count;
+ int (*power)(const struct gpio_event_platform_data *pdata, bool on);
+ const char *names[]; /* If name is NULL, names contain a NULL */
+ /* terminated list of input devices to create */
+};
+
+#define GPIO_EVENT_DEV_NAME "gpio-event"
+
+/* Key matrix */
+
+enum gpio_event_matrix_flags {
+ /* unset: drive active output low, set: drive active output high */
+ GPIOKPF_ACTIVE_HIGH = 1U << 0,
+ GPIOKPF_DEBOUNCE = 1U << 1,
+ GPIOKPF_REMOVE_SOME_PHANTOM_KEYS = 1U << 2,
+ GPIOKPF_REMOVE_PHANTOM_KEYS = GPIOKPF_REMOVE_SOME_PHANTOM_KEYS |
+ GPIOKPF_DEBOUNCE,
+ GPIOKPF_DRIVE_INACTIVE = 1U << 3,
+ GPIOKPF_LEVEL_TRIGGERED_IRQ = 1U << 4,
+ GPIOKPF_PRINT_UNMAPPED_KEYS = 1U << 16,
+ GPIOKPF_PRINT_MAPPED_KEYS = 1U << 17,
+ GPIOKPF_PRINT_PHANTOM_KEYS = 1U << 18,
+};
+
+#define MATRIX_CODE_BITS (10)
+#define MATRIX_KEY_MASK ((1U << MATRIX_CODE_BITS) - 1)
+#define MATRIX_KEY(dev, code) \
+ (((dev) << MATRIX_CODE_BITS) | ((code) & MATRIX_KEY_MASK))
+
+extern int gpio_event_matrix_func(struct gpio_event_input_devs *input_devs,
+ struct gpio_event_info *info, void **data, int func);
+struct gpio_event_matrix_info {
+ /* initialize to gpio_event_matrix_func */
+ struct gpio_event_info info;
+ /* size must be ninputs * noutputs */
+ const unsigned short *keymap;
+ unsigned int *input_gpios;
+ unsigned int *output_gpios;
+ unsigned int ninputs;
+ unsigned int noutputs;
+ /* time to wait before reading inputs after driving each output */
+ ktime_t settle_time;
+ /* time to wait before scanning the keypad a second time */
+ ktime_t debounce_delay;
+ ktime_t poll_time;
+ unsigned flags;
+};
+
+/* Directly connected inputs and outputs */
+
+enum gpio_event_direct_flags {
+ GPIOEDF_ACTIVE_HIGH = 1U << 0,
+/* GPIOEDF_USE_DOWN_IRQ = 1U << 1, */
+/* GPIOEDF_USE_IRQ = (1U << 2) | GPIOEDF_USE_DOWN_IRQ, */
+ GPIOEDF_PRINT_KEYS = 1U << 8,
+ GPIOEDF_PRINT_KEY_DEBOUNCE = 1U << 9,
+ GPIOEDF_PRINT_KEY_UNSTABLE = 1U << 10,
+};
+
+struct gpio_event_direct_entry {
+ uint32_t gpio:16;
+ uint32_t code:10;
+ uint32_t dev:6;
+};
+
+/* inputs */
+extern int gpio_event_input_func(struct gpio_event_input_devs *input_devs,
+ struct gpio_event_info *info, void **data, int func);
+struct gpio_event_input_info {
+ /* initialize to gpio_event_input_func */
+ struct gpio_event_info info;
+ ktime_t debounce_time;
+ ktime_t poll_time;
+ uint16_t flags;
+ uint16_t type;
+ const struct gpio_event_direct_entry *keymap;
+ size_t keymap_size;
+};
+
+/* outputs */
+extern int gpio_event_output_func(struct gpio_event_input_devs *input_devs,
+ struct gpio_event_info *info, void **data, int func);
+extern int gpio_event_output_event(struct gpio_event_input_devs *input_devs,
+ struct gpio_event_info *info, void **data,
+ unsigned int dev, unsigned int type,
+ unsigned int code, int value);
+struct gpio_event_output_info {
+ /* initialize to gpio_event_output_func and gpio_event_output_event */
+ struct gpio_event_info info;
+ uint16_t flags;
+ uint16_t type;
+ const struct gpio_event_direct_entry *keymap;
+ size_t keymap_size;
+};
+
+
+/* axes */
+
+enum gpio_event_axis_flags {
+ GPIOEAF_PRINT_UNKNOWN_DIRECTION = 1U << 16,
+ GPIOEAF_PRINT_RAW = 1U << 17,
+ GPIOEAF_PRINT_EVENT = 1U << 18,
+};
+
+extern int gpio_event_axis_func(struct gpio_event_input_devs *input_devs,
+ struct gpio_event_info *info, void **data, int func);
+struct gpio_event_axis_info {
+ /* initialize to gpio_event_axis_func */
+ struct gpio_event_info info;
+ uint8_t count; /* number of gpios for this axis */
+ uint8_t dev; /* device index when using multiple input devices */
+ uint8_t type; /* EV_REL or EV_ABS */
+ uint16_t code;
+ uint16_t decoded_size;
+ uint16_t (*map)(struct gpio_event_axis_info *info, uint16_t in);
+ uint32_t *gpio;
+ uint32_t flags;
+};
+#define gpio_axis_2bit_gray_map gpio_axis_4bit_gray_map
+#define gpio_axis_3bit_gray_map gpio_axis_4bit_gray_map
+uint16_t gpio_axis_4bit_gray_map(
+ struct gpio_event_axis_info *info, uint16_t in);
+uint16_t gpio_axis_5bit_singletrack_map(
+ struct gpio_event_axis_info *info, uint16_t in);
+
+#endif
diff --git a/include/linux/if_pppolac.h b/include/linux/if_pppolac.h
new file mode 100644
index 000000000000..e40aa1075a30
--- /dev/null
+++ b/include/linux/if_pppolac.h
@@ -0,0 +1,23 @@
+/* include/linux/if_pppolac.h
+ *
+ * Header for PPP on L2TP Access Concentrator / PPPoLAC Socket (RFC 2661)
+ *
+ * Copyright (C) 2009 Google, Inc.
+ * Author: Chia-chi Yeh <chiachi@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __LINUX_IF_PPPOLAC_H
+#define __LINUX_IF_PPPOLAC_H
+
+#include <uapi/linux/if_pppolac.h>
+
+#endif /* __LINUX_IF_PPPOLAC_H */
diff --git a/include/linux/if_pppopns.h b/include/linux/if_pppopns.h
new file mode 100644
index 000000000000..4ac621a9ce7c
--- /dev/null
+++ b/include/linux/if_pppopns.h
@@ -0,0 +1,23 @@
+/* include/linux/if_pppopns.h
+ *
+ * Header for PPP on PPTP Network Server / PPPoPNS Socket (RFC 2637)
+ *
+ * Copyright (C) 2009 Google, Inc.
+ * Author: Chia-chi Yeh <chiachi@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __LINUX_IF_PPPOPNS_H
+#define __LINUX_IF_PPPOPNS_H
+
+#include <uapi/linux/if_pppopns.h>
+
+#endif /* __LINUX_IF_PPPOPNS_H */
diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h
index ba7a9b0c7c57..325727a7096a 100644
--- a/include/linux/if_pppox.h
+++ b/include/linux/if_pppox.h
@@ -43,6 +43,25 @@ struct pptp_opt {
u32 seq_sent, seq_recv;
int ppp_flags;
};
+
+struct pppolac_opt {
+ __u32 local;
+ __u32 remote;
+ __u32 recv_sequence;
+ __u32 xmit_sequence;
+ atomic_t sequencing;
+ int (*backlog_rcv)(struct sock *sk_udp, struct sk_buff *skb);
+};
+
+struct pppopns_opt {
+ __u16 local;
+ __u16 remote;
+ __u32 recv_sequence;
+ __u32 xmit_sequence;
+ void (*data_ready)(struct sock *sk_raw);
+ int (*backlog_rcv)(struct sock *sk_raw, struct sk_buff *skb);
+};
+
#include <net/sock.h>
struct pppox_sock {
@@ -53,6 +72,8 @@ struct pppox_sock {
union {
struct pppoe_opt pppoe;
struct pptp_opt pptp;
+ struct pppolac_opt lac;
+ struct pppopns_opt pns;
} proto;
__be16 num;
};
diff --git a/include/linux/initramfs.h b/include/linux/initramfs.h
new file mode 100644
index 000000000000..fc7da63b125b
--- /dev/null
+++ b/include/linux/initramfs.h
@@ -0,0 +1,32 @@
+/*
+ * include/linux/initramfs.h
+ *
+ * Copyright (C) 2015, Google
+ * Rom Lemarchand <romlem@android.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _LINUX_INITRAMFS_H
+#define _LINUX_INITRAMFS_H
+
+#include <linux/kconfig.h>
+
+#if IS_BUILTIN(CONFIG_BLK_DEV_INITRD)
+
+int __init default_rootfs(void); /* declared only when CONFIG_BLK_DEV_INITRD is built in */
+
+#endif /* IS_BUILTIN(CONFIG_BLK_DEV_INITRD) */
+
+#endif /* _LINUX_INITRAMFS_H */
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index b9dfca557a6c..11ff75140361 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -37,9 +37,11 @@ struct ipv6_devconf {
__s32 accept_ra_rtr_pref;
__s32 rtr_probe_interval;
#ifdef CONFIG_IPV6_ROUTE_INFO
+ __s32 accept_ra_rt_info_min_plen;
__s32 accept_ra_rt_info_max_plen;
#endif
#endif
+ __s32 accept_ra_rt_table;
__s32 proxy_ndp;
__s32 accept_source_route;
__s32 accept_ra_from_local;
diff --git a/include/linux/keychord.h b/include/linux/keychord.h
new file mode 100644
index 000000000000..08cf5402102c
--- /dev/null
+++ b/include/linux/keychord.h
@@ -0,0 +1,23 @@
+/*
+ * Key chord input driver
+ *
+ * Copyright (C) 2008 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+*/
+
+#ifndef __LINUX_KEYCHORD_H_
+#define __LINUX_KEYCHORD_H_
+
+#include <uapi/linux/keychord.h>
+
+#endif /* __LINUX_KEYCHORD_H_ */
diff --git a/include/linux/keycombo.h b/include/linux/keycombo.h
new file mode 100644
index 000000000000..c6db2626b0d3
--- /dev/null
+++ b/include/linux/keycombo.h
@@ -0,0 +1,36 @@
+/*
+ * include/linux/keycombo.h - platform data structure for keycombo driver
+ *
+ * Copyright (C) 2014 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _LINUX_KEYCOMBO_H
+#define _LINUX_KEYCOMBO_H
+
+#define KEYCOMBO_NAME "keycombo"
+
+/*
+ * if key_down_fn and key_up_fn are both present, you are guaranteed that
+ * key_down_fn will return before key_up_fn is called, and that key_up_fn
+ * is called iff key_down_fn is called.
+ */
+struct keycombo_platform_data {
+ void (*key_down_fn)(void *);
+ void (*key_up_fn)(void *);
+ void *priv;
+ int key_down_delay; /* Time in ms */
+ int *keys_up;
+ int keys_down[]; /* 0 terminated */
+};
+
+#endif /* _LINUX_KEYCOMBO_H */
diff --git a/include/linux/keyreset.h b/include/linux/keyreset.h
new file mode 100644
index 000000000000..2e34afab65e4
--- /dev/null
+++ b/include/linux/keyreset.h
@@ -0,0 +1,29 @@
+/*
+ * include/linux/keyreset.h - platform data structure for keyreset driver
+ *
+ * Copyright (C) 2014 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _LINUX_KEYRESET_H
+#define _LINUX_KEYRESET_H
+
+#define KEYRESET_NAME "keyreset"
+
+struct keyreset_platform_data {
+ int (*reset_fn)(void);
+ int key_down_delay;
+ int *keys_up;
+ int keys_down[]; /* 0 terminated */
+};
+
+#endif /* _LINUX_KEYRESET_H */
diff --git a/include/linux/memory-state-time.h b/include/linux/memory-state-time.h
new file mode 100644
index 000000000000..d2212b027866
--- /dev/null
+++ b/include/linux/memory-state-time.h
@@ -0,0 +1,42 @@
+/* include/linux/memory-state-time.h
+ *
+ * Copyright (C) 2016 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/workqueue.h>
+
+#define UPDATE_MEMORY_STATE(BLOCK, VALUE) (BLOCK)->update_call((BLOCK), (VALUE)) /* NB: evaluates BLOCK twice */
+
+struct memory_state_update_block;
+
+typedef void (*memory_state_update_fn_t)(struct memory_state_update_block *ub,
+ int value);
+
+/* This struct is populated by registration: the memory_state_register_*
+ * functions take no arguments and return a pointer to it. update_call is the
+ * callback used to report an update; its type is memory_state_update_fn_t.
+ */
+struct memory_state_update_block {
+ memory_state_update_fn_t update_call;
+ int id;
+};
+
+/* Register a frequency struct memory_state_update_block to provide updates to
+ * memory_state_time about frequency changes using its update_call function.
+ */
+struct memory_state_update_block *memory_state_register_frequency_source(void);
+
+/* Register a bandwidth struct memory_state_update_block to provide updates to
+ * memory_state_time about bandwidth changes using its update_call function.
+ */
+struct memory_state_update_block *memory_state_register_bandwidth_source(void);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 11a5a46ce72b..5013b3a7ebcb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1143,6 +1143,7 @@ extern void pagefault_out_of_memory(void);
extern void show_free_areas(unsigned int flags);
extern bool skip_free_areas_node(unsigned int flags, int nid);
+void shmem_set_file(struct vm_area_struct *vma, struct file *file);
int shmem_zero_setup(struct vm_area_struct *);
#ifdef CONFIG_SHMEM
bool shmem_mapping(struct address_space *mapping);
@@ -1966,7 +1967,7 @@ static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start,
extern struct vm_area_struct *vma_merge(struct mm_struct *,
struct vm_area_struct *prev, unsigned long addr, unsigned long end,
unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
- struct mempolicy *, struct vm_userfaultfd_ctx);
+ struct mempolicy *, struct vm_userfaultfd_ctx, const char __user *);
extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
extern int split_vma(struct mm_struct *,
struct vm_area_struct *, unsigned long addr, int new_below);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 8d6decd50220..35daed7743ee 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -326,11 +326,18 @@ struct vm_area_struct {
/*
* For areas with an address space and backing store,
* linkage into the address_space->i_mmap interval tree.
+ *
+ * For private anonymous mappings, a pointer to a null-terminated string
+ * in the user process containing the name given to the vma, or NULL
+ * if unnamed.
*/
- struct {
- struct rb_node rb;
- unsigned long rb_subtree_last;
- } shared;
+ union {
+ struct {
+ struct rb_node rb;
+ unsigned long rb_subtree_last;
+ } shared;
+ const char __user *anon_name;
+ };
/*
* A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
@@ -620,4 +627,13 @@ typedef struct {
unsigned long val;
} swp_entry_t;
+/* Return the name for an anonymous mapping or NULL for a file-backed mapping */
+static inline const char __user *vma_get_anon_name(struct vm_area_struct *vma)
+{
+ if (vma->vm_file) /* file-backed: anon_name shares a union with 'shared', so it holds no name */
+ return NULL;
+
+ return vma->anon_name; /* user-space string pointer, or NULL if the vma is unnamed */
+}
+
#endif /* _LINUX_MM_TYPES_H */
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 73fad83acbcb..510a73a7a3d2 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -119,6 +119,9 @@ struct mmc_ext_csd {
u8 raw_pwr_cl_ddr_200_360; /* 253 */
u8 raw_bkops_status; /* 246 */
u8 raw_sectors[4]; /* 212 - 4 bytes */
+ u8 pre_eol_info; /* 267 */
+ u8 device_life_time_est_typ_a; /* 268 */
+ u8 device_life_time_est_typ_b; /* 269 */
unsigned int feature_support;
#define MMC_DISCARD_FEATURE BIT(0) /* CMD38 feature */
diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h
index 2b953eb8ceae..46a4b798c7cf 100644
--- a/include/linux/mmc/core.h
+++ b/include/linux/mmc/core.h
@@ -142,6 +142,10 @@ struct mmc_request {
/* Allow other commands during this ongoing data transfer or busy wait */
bool cap_cmd_during_tfr;
+ ktime_t io_start;
+#ifdef CONFIG_BLOCK
+ int lat_hist_enabled;
+#endif
};
struct mmc_card;
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 0b2439441cc8..fac3b5c27f4f 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -16,6 +16,7 @@
#include <linux/sched.h>
#include <linux/device.h>
#include <linux/fault-inject.h>
+#include <linux/blkdev.h>
#include <linux/mmc/core.h>
#include <linux/mmc/card.h>
@@ -397,6 +398,20 @@ struct mmc_host {
int dsr_req; /* DSR value is valid */
u32 dsr; /* optional driver stage (DSR) value */
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+ struct {
+ struct sdio_cis *cis;
+ struct sdio_cccr *cccr;
+ struct sdio_embedded_func *funcs;
+ int num_funcs;
+ } embedded_sdio_data;
+#endif
+
+#ifdef CONFIG_BLOCK
+ int latency_hist_enabled;
+ struct io_latency_state io_lat_s;
+#endif
+
unsigned long private[0] ____cacheline_aligned;
};
@@ -406,6 +421,14 @@ void mmc_remove_host(struct mmc_host *);
void mmc_free_host(struct mmc_host *);
int mmc_of_parse(struct mmc_host *host);
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+extern void mmc_set_embedded_sdio_data(struct mmc_host *host,
+ struct sdio_cis *cis,
+ struct sdio_cccr *cccr,
+ struct sdio_embedded_func *funcs,
+ int num_funcs);
+#endif
+
static inline void *mmc_priv(struct mmc_host *host)
{
return (void *)host->private;
diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h
index c376209c70ef..a034d07c218d 100644
--- a/include/linux/mmc/mmc.h
+++ b/include/linux/mmc/mmc.h
@@ -331,6 +331,9 @@ struct _mmc_csd {
#define EXT_CSD_CACHE_SIZE 249 /* RO, 4 bytes */
#define EXT_CSD_PWR_CL_DDR_200_360 253 /* RO */
#define EXT_CSD_FIRMWARE_VERSION 254 /* RO, 8 bytes */
+#define EXT_CSD_PRE_EOL_INFO 267 /* RO */
+#define EXT_CSD_DEVICE_LIFE_TIME_EST_TYP_A 268 /* RO */
+#define EXT_CSD_DEVICE_LIFE_TIME_EST_TYP_B 269 /* RO */
#define EXT_CSD_SUPPORTED_MODE 493 /* RO */
#define EXT_CSD_TAG_UNIT_SIZE 498 /* RO */
#define EXT_CSD_DATA_TAG_SUPPORT 499 /* RO */
diff --git a/include/linux/mmc/pm.h b/include/linux/mmc/pm.h
index 4a139204c20c..6e2d6a135c7e 100644
--- a/include/linux/mmc/pm.h
+++ b/include/linux/mmc/pm.h
@@ -26,5 +26,6 @@ typedef unsigned int mmc_pm_flag_t;
#define MMC_PM_KEEP_POWER (1 << 0) /* preserve card power during suspend */
#define MMC_PM_WAKE_SDIO_IRQ (1 << 1) /* wake up host system on SDIO IRQ assertion */
+#define MMC_PM_IGNORE_PM_NOTIFY (1 << 2) /* ignore mmc pm notify */
#endif /* LINUX_MMC_PM_H */
diff --git a/include/linux/mmc/sdio_func.h b/include/linux/mmc/sdio_func.h
index 97ca105347a6..f563bcf55e7f 100644
--- a/include/linux/mmc/sdio_func.h
+++ b/include/linux/mmc/sdio_func.h
@@ -23,6 +23,14 @@ struct sdio_func;
typedef void (sdio_irq_handler_t)(struct sdio_func *);
/*
+ * Structure used to hold embedded SDIO device data from platform layer
+ */
+struct sdio_embedded_func {
+ uint8_t f_class;
+ uint32_t f_maxblksize;
+};
+
+/*
* SDIO function CIS tuple (unknown to the core)
*/
struct sdio_func_tuple {
@@ -128,6 +136,8 @@ extern int sdio_release_irq(struct sdio_func *func);
extern unsigned int sdio_align_size(struct sdio_func *func, unsigned int sz);
extern u8 sdio_readb(struct sdio_func *func, unsigned int addr, int *err_ret);
+extern u8 sdio_readb_ext(struct sdio_func *func, unsigned int addr, int *err_ret,
+ unsigned in);
extern u16 sdio_readw(struct sdio_func *func, unsigned int addr, int *err_ret);
extern u32 sdio_readl(struct sdio_func *func, unsigned int addr, int *err_ret);
diff --git a/include/linux/mount.h b/include/linux/mount.h
index e0f3a82eee6d..5615a9eb59db 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -67,6 +67,7 @@ struct vfsmount {
struct dentry *mnt_root; /* root of the mounted tree */
struct super_block *mnt_sb; /* pointer to superblock */
int mnt_flags;
+ void *data;
};
struct file; /* forward dec */
diff --git a/include/linux/namei.h b/include/linux/namei.h
index f29abda31e6d..cf437f56baf4 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -78,8 +78,11 @@ extern struct dentry *user_path_create(int, const char __user *, struct path *,
extern void done_path_create(struct path *, struct dentry *);
extern struct dentry *kern_path_locked(const char *, struct path *);
extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
+extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
+ const char *, unsigned int, struct path *);
extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
+extern struct dentry *lookup_one_len2(const char *, struct vfsmount *mnt, struct dentry *, int);
extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
extern int follow_down_one(struct path *);
diff --git a/include/linux/netfilter/xt_qtaguid.h b/include/linux/netfilter/xt_qtaguid.h
new file mode 100644
index 000000000000..1c671552ec37
--- /dev/null
+++ b/include/linux/netfilter/xt_qtaguid.h
@@ -0,0 +1,14 @@
+#ifndef _XT_QTAGUID_MATCH_H
+#define _XT_QTAGUID_MATCH_H
+
+/* For now we just replace the xt_owner.
+ * FIXME: make iptables aware of qtaguid. */
+#include <linux/netfilter/xt_owner.h>
+
+#define XT_QTAGUID_UID XT_OWNER_UID
+#define XT_QTAGUID_GID XT_OWNER_GID
+#define XT_QTAGUID_SOCKET XT_OWNER_SOCKET
+#define xt_qtaguid_match_info xt_owner_match_info
+
+int qtaguid_untag(struct socket *sock, bool kernel);
+#endif /* _XT_QTAGUID_MATCH_H */
diff --git a/include/linux/netfilter/xt_quota2.h b/include/linux/netfilter/xt_quota2.h
new file mode 100644
index 000000000000..eadc6903314e
--- /dev/null
+++ b/include/linux/netfilter/xt_quota2.h
@@ -0,0 +1,25 @@
+#ifndef _XT_QUOTA_H
+#define _XT_QUOTA_H
+
+enum xt_quota_flags {
+ XT_QUOTA_INVERT = 1 << 0,
+ XT_QUOTA_GROW = 1 << 1,
+ XT_QUOTA_PACKET = 1 << 2,
+ XT_QUOTA_NO_CHANGE = 1 << 3,
+ XT_QUOTA_MASK = 0x0F,
+};
+
+struct xt_quota_counter;
+
+struct xt_quota_mtinfo2 {
+ char name[15];
+ u_int8_t flags;
+
+ /* Comparison-invariant */
+ aligned_u64 quota;
+
+ /* Used internally by the kernel */
+ struct xt_quota_counter *master __attribute__((aligned(8)));
+};
+
+#endif /* _XT_QUOTA_H */
diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h
index 4341f32516d8..501d461a6a1d 100644
--- a/include/linux/of_fdt.h
+++ b/include/linux/of_fdt.h
@@ -63,6 +63,27 @@ extern int of_flat_dt_match(unsigned long node, const char *const *matches);
extern unsigned long of_get_flat_dt_root(void);
extern int of_get_flat_dt_size(void);
+/*
+ * early_init_dt_scan_chosen - scan the device tree for ramdisk and bootargs
+ *
+ * The boot arguments will be placed into the memory pointed to by @data.
+ * That memory should be COMMAND_LINE_SIZE big and initialized to be a valid
+ * (possibly empty) string. Logic for what will be in @data after this
+ * function finishes:
+ *
+ * - CONFIG_CMDLINE_FORCE=true
+ * CONFIG_CMDLINE
+ * - CONFIG_CMDLINE_EXTEND=true, @data is non-empty string
+ * @data + dt bootargs (even if dt bootargs are empty)
+ * - CONFIG_CMDLINE_EXTEND=true, @data is empty string
+ * CONFIG_CMDLINE + dt bootargs (even if dt bootargs are empty)
+ * - CMDLINE_FROM_BOOTLOADER=true, dt bootargs=non-empty:
+ * dt bootargs
+ * - CMDLINE_FROM_BOOTLOADER=true, dt bootargs=empty, @data is non-empty string
+ * @data is left unchanged
+ * - CMDLINE_FROM_BOOTLOADER=true, dt bootargs=empty, @data is empty string
+ * CONFIG_CMDLINE (or "" if that's not defined)
+ */
extern int early_init_dt_scan_chosen(unsigned long node, const char *uname,
int depth, void *data);
extern int early_init_dt_scan_memory(unsigned long node, const char *uname,
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 78ed8105e64d..61ab566856e6 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1168,6 +1168,11 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
int perf_event_max_stack_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
+static inline bool perf_paranoid_any(void)
+{
+ return sysctl_perf_event_paranoid > 2;
+}
+
static inline bool perf_paranoid_tracepoint_raw(void)
{
return sysctl_perf_event_paranoid > -1;
diff --git a/include/linux/platform_data/ds2482.h b/include/linux/platform_data/ds2482.h
new file mode 100644
index 000000000000..5a6879e2a09a
--- /dev/null
+++ b/include/linux/platform_data/ds2482.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (C) 2012 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __PLATFORM_DATA_DS2482__
+#define __PLATFORM_DATA_DS2482__
+
+struct ds2482_platform_data {
+ int slpz_gpio;
+};
+
+#endif /* __PLATFORM_DATA_DS2482__ */
diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index ad97baf7b8de..29fc01af3b37 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -18,6 +18,7 @@
#include <linux/leds.h>
#include <linux/spinlock.h>
#include <linux/notifier.h>
+#include <linux/types.h>
/*
* All voltages, currents, charges, energies, time and temperatures in uV,
@@ -148,6 +149,12 @@ enum power_supply_property {
POWER_SUPPLY_PROP_SCOPE,
POWER_SUPPLY_PROP_CHARGE_TERM_CURRENT,
POWER_SUPPLY_PROP_CALIBRATE,
+ /* Local extensions */
+ POWER_SUPPLY_PROP_USB_HC,
+ POWER_SUPPLY_PROP_USB_OTG,
+ POWER_SUPPLY_PROP_CHARGE_ENABLED,
+ /* Local extensions of type int64_t */
+ POWER_SUPPLY_PROP_CHARGE_COUNTER_EXT,
/* Properties of type `const char *' */
POWER_SUPPLY_PROP_MODEL_NAME,
POWER_SUPPLY_PROP_MANUFACTURER,
@@ -175,6 +182,7 @@ enum power_supply_notifier_events {
union power_supply_propval {
int intval;
const char *strval;
+ int64_t int64val; /* presumably for the int64_t-typed properties (POWER_SUPPLY_PROP_CHARGE_COUNTER_EXT) -- confirm */
};
struct device_node;
diff --git a/include/linux/pstore_ram.h b/include/linux/pstore_ram.h
index 4058bf991868..cb5edd64a3d3 100644
--- a/include/linux/pstore_ram.h
+++ b/include/linux/pstore_ram.h
@@ -80,6 +80,8 @@ void persistent_ram_free_old(struct persistent_ram_zone *prz);
ssize_t persistent_ram_ecc_string(struct persistent_ram_zone *prz,
char *str, size_t len);
+void ramoops_console_write_buf(const char *buf, size_t size);
+
/*
* Ramoops platform data
* @mem_size memory size for ramoops
diff --git a/include/linux/reservation.h b/include/linux/reservation.h
index b0f305e77b7f..bad7710866af 100644
--- a/include/linux/reservation.h
+++ b/include/linux/reservation.h
@@ -177,17 +177,14 @@ static inline struct fence *
reservation_object_get_excl_rcu(struct reservation_object *obj)
{
struct fence *fence;
- unsigned seq;
-retry:
- seq = read_seqcount_begin(&obj->seq);
+
+ if (!rcu_access_pointer(obj->fence_excl))
+ return NULL;
+
rcu_read_lock();
- fence = rcu_dereference(obj->fence_excl);
- if (read_seqcount_retry(&obj->seq, seq)) {
- rcu_read_unlock();
- goto retry;
- }
- fence = fence_get(fence);
+ fence = fence_get_rcu_safe(&obj->fence_excl);
rcu_read_unlock();
+
return fence;
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f4a551a5482c..1d499e8e2e91 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -173,6 +173,9 @@ extern bool single_task_running(void);
extern unsigned long nr_iowait(void);
extern unsigned long nr_iowait_cpu(int cpu);
extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
+#ifdef CONFIG_CPU_QUIET
+extern u64 nr_running_integral(unsigned int cpu);
+#endif
extern void calc_global_load(unsigned long ticks);
@@ -315,6 +318,15 @@ extern char ___assert_task_state[1 - 2*!!(
/* Task command name length */
#define TASK_COMM_LEN 16
+enum task_event {
+ PUT_PREV_TASK = 0,
+ PICK_NEXT_TASK = 1,
+ TASK_WAKE = 2,
+ TASK_MIGRATE = 3,
+ TASK_UPDATE = 4,
+ IRQ_UPDATE = 5,
+};
+
#include <linux/spinlock.h>
/*
@@ -982,6 +994,14 @@ enum cpu_idle_type {
#define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT
#define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT)
+struct sched_capacity_reqs {
+ unsigned long cfs;
+ unsigned long rt;
+ unsigned long dl;
+
+ unsigned long total;
+};
+
/*
* Wake-queues are lists of tasks with a pending wakeup, whose
* callers have already marked the task as woken internally,
@@ -1045,6 +1065,7 @@ extern void wake_up_q(struct wake_q_head *head);
#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */
#define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */
#define SD_NUMA 0x4000 /* cross-node balancing */
+#define SD_SHARE_CAP_STATES 0x8000 /* Domain members share capacity state */
#ifdef CONFIG_SCHED_SMT
static inline int cpu_smt_flags(void)
@@ -1077,8 +1098,57 @@ struct sched_domain_attr {
extern int sched_domain_level_max;
+struct capacity_state {
+ unsigned long cap; /* compute capacity */
+ unsigned long power; /* power consumption at this compute capacity */
+};
+
+struct idle_state {
+ unsigned long power; /* power consumption in this idle state */
+};
+
+struct sched_group_energy {
+ unsigned int nr_idle_states; /* number of idle states */
+ struct idle_state *idle_states; /* ptr to idle state array */
+ unsigned int nr_cap_states; /* number of capacity states */
+ struct capacity_state *cap_states; /* ptr to capacity state array */
+};
+
+unsigned long capacity_curr_of(int cpu);
+
struct sched_group;
+struct eas_stats {
+ /* select_idle_sibling() stats */
+ u64 sis_attempts;
+ u64 sis_idle;
+ u64 sis_cache_affine;
+ u64 sis_suff_cap;
+ u64 sis_idle_cpu;
+ u64 sis_count;
+
+ /* select_energy_cpu_brute() stats */
+ u64 secb_attempts;
+ u64 secb_sync;
+ u64 secb_idle_bt;
+ u64 secb_insuff_cap;
+ u64 secb_no_nrg_sav;
+ u64 secb_nrg_sav;
+ u64 secb_count;
+
+ /* find_best_target() stats */
+ u64 fbt_attempts;
+ u64 fbt_no_cpu;
+ u64 fbt_no_sd;
+ u64 fbt_pref_idle;
+ u64 fbt_count;
+
+ /* cas */
+ /* select_task_rq_fair() stats */
+ u64 cas_attempts;
+ u64 cas_count;
+};
+
struct sched_domain_shared {
atomic_t ref;
atomic_t nr_busy_cpus;
@@ -1147,6 +1217,8 @@ struct sched_domain {
unsigned int ttwu_wake_remote;
unsigned int ttwu_move_affine;
unsigned int ttwu_move_balance;
+
+ struct eas_stats eas_stats;
#endif
#ifdef CONFIG_SCHED_DEBUG
char *name;
@@ -1184,6 +1256,8 @@ bool cpus_share_cache(int this_cpu, int that_cpu);
typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
typedef int (*sched_domain_flags_f)(void);
+typedef
+const struct sched_group_energy * const(*sched_domain_energy_f)(int cpu);
#define SDTL_OVERLAP 0x01
@@ -1197,6 +1271,7 @@ struct sd_data {
struct sched_domain_topology_level {
sched_domain_mask_f mask;
sched_domain_flags_f sd_flags;
+ sched_domain_energy_f energy;
int flags;
int numa_level;
struct sd_data data;
@@ -1342,6 +1417,70 @@ struct sched_statistics {
u64 nr_wakeups_affine_attempts;
u64 nr_wakeups_passive;
u64 nr_wakeups_idle;
+
+ /* select_idle_sibling() */
+ u64 nr_wakeups_sis_attempts;
+ u64 nr_wakeups_sis_idle;
+ u64 nr_wakeups_sis_cache_affine;
+ u64 nr_wakeups_sis_suff_cap;
+ u64 nr_wakeups_sis_idle_cpu;
+ u64 nr_wakeups_sis_count;
+
+ /* select_energy_cpu_brute() */
+ u64 nr_wakeups_secb_attempts;
+ u64 nr_wakeups_secb_sync;
+ u64 nr_wakeups_secb_idle_bt;
+ u64 nr_wakeups_secb_insuff_cap;
+ u64 nr_wakeups_secb_no_nrg_sav;
+ u64 nr_wakeups_secb_nrg_sav;
+ u64 nr_wakeups_secb_count;
+
+ /* find_best_target() */
+ u64 nr_wakeups_fbt_attempts;
+ u64 nr_wakeups_fbt_no_cpu;
+ u64 nr_wakeups_fbt_no_sd;
+ u64 nr_wakeups_fbt_pref_idle;
+ u64 nr_wakeups_fbt_count;
+
+ /* cas */
+ /* select_task_rq_fair() */
+ u64 nr_wakeups_cas_attempts;
+ u64 nr_wakeups_cas_count;
+};
+#endif
+
+#ifdef CONFIG_SCHED_WALT
+#define RAVG_HIST_SIZE_MAX 5
+
+/* ravg represents frequency scaled cpu-demand of tasks */
+struct ravg {
+ /*
+ * 'mark_start' marks the beginning of an event (task waking up, task
+ * starting to execute, task being preempted) within a window
+ *
+ * 'sum' represents how runnable a task has been within current
+ * window. It incorporates both running time and wait time and is
+ * frequency scaled.
+ *
+ * 'sum_history' keeps track of history of 'sum' seen over previous
+ * RAVG_HIST_SIZE windows. Windows where task was entirely sleeping are
+ * ignored.
+ *
+ * 'demand' represents maximum sum seen over previous
+ * sysctl_sched_ravg_hist_size windows. 'demand' could drive frequency
+ * demand for tasks.
+ *
+ * 'curr_window' represents task's contribution to cpu busy time
+ * statistics (rq->curr_runnable_sum) in current window
+ *
+ * 'prev_window' represents task's contribution to cpu busy time
+ * statistics (rq->prev_runnable_sum) in previous window
+ */
+ u64 mark_start;
+ u32 sum, demand;
+ u32 sum_history[RAVG_HIST_SIZE_MAX];
+ u32 curr_window, prev_window;
+ u16 active_windows;
};
#endif
@@ -1516,6 +1655,15 @@ struct task_struct {
const struct sched_class *sched_class;
struct sched_entity se;
struct sched_rt_entity rt;
+#ifdef CONFIG_SCHED_WALT
+ struct ravg ravg;
+ /*
+ * 'init_load_pct' represents the initial task load assigned to children
+ * of this task
+ */
+ u32 init_load_pct;
+#endif
+
#ifdef CONFIG_CGROUP_SCHED
struct task_group *sched_task_group;
#endif
@@ -3571,6 +3719,11 @@ static inline void inc_syscw(struct task_struct *tsk)
{
tsk->ioac.syscw++;
}
+
+static inline void inc_syscfs(struct task_struct *tsk)
+{
+ tsk->ioac.syscfs++;
+}
#else
static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
{
@@ -3587,6 +3740,9 @@ static inline void inc_syscr(struct task_struct *tsk)
static inline void inc_syscw(struct task_struct *tsk)
{
}
+static inline void inc_syscfs(struct task_struct *tsk)
+{
+}
#endif
#ifndef TASK_SIZE_OF
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 22db1e63707e..48dfcce44234 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -18,6 +18,15 @@ extern unsigned int sysctl_sched_latency;
extern unsigned int sysctl_sched_min_granularity;
extern unsigned int sysctl_sched_wakeup_granularity;
extern unsigned int sysctl_sched_child_runs_first;
+extern unsigned int sysctl_sched_sync_hint_enable;
+extern unsigned int sysctl_sched_initial_task_util;
+extern unsigned int sysctl_sched_cstate_aware;
+#ifdef CONFIG_SCHED_WALT
+extern unsigned int sysctl_sched_use_walt_cpu_util;
+extern unsigned int sysctl_sched_use_walt_task_util;
+extern unsigned int sysctl_sched_walt_init_task_load_pct;
+extern unsigned int sysctl_sched_walt_cpu_high_irqload;
+#endif
enum sched_tunable_scaling {
SCHED_TUNABLESCALING_NONE,
@@ -56,6 +65,22 @@ extern int sysctl_sched_rt_runtime;
extern unsigned int sysctl_sched_cfs_bandwidth_slice;
#endif
+#ifdef CONFIG_SCHED_TUNE
+extern unsigned int sysctl_sched_cfs_boost;
+int sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length,
+ loff_t *ppos);
+static inline unsigned int get_sysctl_sched_cfs_boost(void)
+{
+ return sysctl_sched_cfs_boost; /* current value of the sysctl variable */
+}
+#else /* !CONFIG_SCHED_TUNE */
+static inline unsigned int get_sysctl_sched_cfs_boost(void)
+{
+ return 0; /* stub: always 0 when CONFIG_SCHED_TUNE is not set */
+}
+#endif /* CONFIG_SCHED_TUNE */
+
#ifdef CONFIG_SCHED_AUTOGROUP
extern unsigned int sysctl_sched_autogroup_enabled;
#endif
diff --git a/include/linux/sched_energy.h b/include/linux/sched_energy.h
new file mode 100644
index 000000000000..1daf3e1f98a7
--- /dev/null
+++ b/include/linux/sched_energy.h
@@ -0,0 +1,44 @@
+#ifndef _LINUX_SCHED_ENERGY_H
+#define _LINUX_SCHED_ENERGY_H
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+/*
+ * There doesn't seem to be an NR_CPUS style max number of sched domain
+ * levels so here's an arbitrary constant one for the moment.
+ *
+ * The levels alluded to here correspond to entries in struct
+ * sched_domain_topology_level that are meant to be populated by arch
+ * specific code (topology.c).
+ */
+#define NR_SD_LEVELS 8
+
+#define SD_LEVEL0 0
+#define SD_LEVEL1 1
+#define SD_LEVEL2 2
+#define SD_LEVEL3 3
+#define SD_LEVEL4 4
+#define SD_LEVEL5 5
+#define SD_LEVEL6 6
+#define SD_LEVEL7 7
+
+/*
+ * Iterate @level (an integer lvalue) over 0 .. NR_SD_LEVELS - 1.
+ */
+#define for_each_possible_sd_level(level) \
+ for ((level) = 0; (level) < NR_SD_LEVELS; (level)++)
+
+#ifdef CONFIG_SMP
+
+extern struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS];
+
+void init_sched_energy_costs(void);
+
+#else
+
+#define init_sched_energy_costs() do { } while (0)
+
+#endif /* CONFIG_SMP */
+
+#endif
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index eb4f6456521e..c7dff69a4103 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -66,6 +66,7 @@ struct uart_ops {
void (*set_ldisc)(struct uart_port *, struct ktermios *);
void (*pm)(struct uart_port *, unsigned int state,
unsigned int oldstate);
+ void (*wake_peer)(struct uart_port *);
/*
* Return a string describing the type of the port
diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h
index a0596ca0e80a..a2f8109bb215 100644
--- a/include/linux/sock_diag.h
+++ b/include/linux/sock_diag.h
@@ -24,6 +24,7 @@ void sock_diag_unregister(const struct sock_diag_handler *h);
void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh));
void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh));
+u64 sock_gen_cookie(struct sock *sk);
int sock_diag_check_cookie(struct sock *sk, const __u32 *cookie);
void sock_diag_save_cookie(struct sock *sk, __u32 *cookie);
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 249dafce2788..90d85690ab7b 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -436,6 +436,7 @@ extern bool pm_get_wakeup_count(unsigned int *count, bool block);
extern bool pm_save_wakeup_count(unsigned int count);
extern void pm_wakep_autosleep_enabled(bool set);
extern void pm_print_active_wakeup_sources(void);
+extern void pm_get_active_wakeup_sources(char *pending_sources, size_t max);
static inline void lock_system_sleep(void)
{
diff --git a/include/linux/task_io_accounting.h b/include/linux/task_io_accounting.h
index bdf855c2856f..2dd338fdf881 100644
--- a/include/linux/task_io_accounting.h
+++ b/include/linux/task_io_accounting.h
@@ -18,6 +18,8 @@ struct task_io_accounting {
u64 syscr;
/* # of write syscalls */
u64 syscw;
+ /* # of fsync syscalls */
+ u64 syscfs;
#endif /* CONFIG_TASK_XACCT */
#ifdef CONFIG_TASK_IO_ACCOUNTING
diff --git a/include/linux/task_io_accounting_ops.h b/include/linux/task_io_accounting_ops.h
index 4d090f9ee608..1b505c804af3 100644
--- a/include/linux/task_io_accounting_ops.h
+++ b/include/linux/task_io_accounting_ops.h
@@ -96,6 +96,7 @@ static inline void task_chr_io_accounting_add(struct task_io_accounting *dst,
dst->wchar += src->wchar;
dst->syscr += src->syscr;
dst->syscw += src->syscw;
+ dst->syscfs += src->syscfs;
}
#else
static inline void task_chr_io_accounting_add(struct task_io_accounting *dst,
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 62be0786d6d0..78ec2eb4d340 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -117,6 +117,7 @@ extern void tick_nohz_idle_enter(void);
extern void tick_nohz_idle_exit(void);
extern void tick_nohz_irq_exit(void);
extern ktime_t tick_nohz_get_sleep_length(void);
+extern unsigned long tick_nohz_get_idle_calls(void);
extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
#else /* !CONFIG_NO_HZ_COMMON */
diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index 09168c52ab64..361f8bf1429d 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -249,6 +249,7 @@ static inline u64 ktime_get_raw_ns(void)
extern u64 ktime_get_mono_fast_ns(void);
extern u64 ktime_get_raw_fast_ns(void);
+extern u64 ktime_get_boot_fast_ns(void);
/*
* Timespec interfaces utilizing the ktime based ones
diff --git a/include/linux/usb/class-dual-role.h b/include/linux/usb/class-dual-role.h
new file mode 100644
index 000000000000..c6df2238012e
--- /dev/null
+++ b/include/linux/usb/class-dual-role.h
@@ -0,0 +1,129 @@
+#ifndef __LINUX_CLASS_DUAL_ROLE_H__
+#define __LINUX_CLASS_DUAL_ROLE_H__
+
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+
+struct device;
+
+enum dual_role_supported_modes {
+ DUAL_ROLE_SUPPORTED_MODES_DFP_AND_UFP = 0,
+ DUAL_ROLE_SUPPORTED_MODES_DFP,
+ DUAL_ROLE_SUPPORTED_MODES_UFP,
+/*The following should be the last element*/
+ DUAL_ROLE_PROP_SUPPORTED_MODES_TOTAL,
+};
+
+enum {
+ DUAL_ROLE_PROP_MODE_UFP = 0,
+ DUAL_ROLE_PROP_MODE_DFP,
+ DUAL_ROLE_PROP_MODE_NONE,
+/*The following should be the last element*/
+ DUAL_ROLE_PROP_MODE_TOTAL,
+};
+
+enum {
+ DUAL_ROLE_PROP_PR_SRC = 0,
+ DUAL_ROLE_PROP_PR_SNK,
+ DUAL_ROLE_PROP_PR_NONE,
+/*The following should be the last element*/
+ DUAL_ROLE_PROP_PR_TOTAL,
+
+};
+
+enum {
+ DUAL_ROLE_PROP_DR_HOST = 0,
+ DUAL_ROLE_PROP_DR_DEVICE,
+ DUAL_ROLE_PROP_DR_NONE,
+/*The following should be the last element*/
+ DUAL_ROLE_PROP_DR_TOTAL,
+};
+
+enum {
+ DUAL_ROLE_PROP_VCONN_SUPPLY_NO = 0,
+ DUAL_ROLE_PROP_VCONN_SUPPLY_YES,
+/*The following should be the last element*/
+ DUAL_ROLE_PROP_VCONN_SUPPLY_TOTAL,
+};
+
+enum dual_role_property {
+ DUAL_ROLE_PROP_SUPPORTED_MODES = 0,
+ DUAL_ROLE_PROP_MODE,
+ DUAL_ROLE_PROP_PR,
+ DUAL_ROLE_PROP_DR,
+ DUAL_ROLE_PROP_VCONN_SUPPLY,
+};
+
+struct dual_role_phy_instance;
+
+/* Description of typec port */
+struct dual_role_phy_desc {
+ /* /sys/class/dual_role_usb/<name>/ */
+ const char *name;
+ enum dual_role_supported_modes supported_modes;
+ enum dual_role_property *properties;
+ size_t num_properties;
+
+ /* Callback for "cat /sys/class/dual_role_usb/<name>/<property>" */
+ int (*get_property)(struct dual_role_phy_instance *dual_role,
+ enum dual_role_property prop,
+ unsigned int *val);
+ /* Callback for "echo <value> >
+ * /sys/class/dual_role_usb/<name>/<property>" */
+ int (*set_property)(struct dual_role_phy_instance *dual_role,
+ enum dual_role_property prop,
+ const unsigned int *val);
+ /* Decides whether userspace can change a specific property */
+ int (*property_is_writeable)(struct dual_role_phy_instance *dual_role,
+ enum dual_role_property prop);
+};
+
+struct dual_role_phy_instance {
+ const struct dual_role_phy_desc *desc;
+
+ /* Driver private data */
+ void *drv_data;
+
+ struct device dev;
+ struct work_struct changed_work;
+};
+
+/*
+ * Entry points of the dual-role class.
+ *
+ * When CONFIG_DUAL_ROLE_USB_INTF is not enabled, every entry point is
+ * replaced by an inline stub so that callers build without #ifdefs:
+ * - stubs returning a pointer yield ERR_PTR(-ENOSYS);
+ * - dual_role_get_property() / dual_role_set_property() yield -ENOSYS;
+ * - dual_role_property_is_writeable() yields 0, i.e. nothing is writeable;
+ * - stubs returning void do nothing.
+ */
+#if IS_ENABLED(CONFIG_DUAL_ROLE_USB_INTF)
+extern void dual_role_instance_changed(struct dual_role_phy_instance
+ *dual_role);
+extern struct dual_role_phy_instance *__must_check
+devm_dual_role_instance_register(struct device *parent,
+ const struct dual_role_phy_desc *desc);
+extern void devm_dual_role_instance_unregister(struct device *dev,
+ struct dual_role_phy_instance
+ *dual_role);
+extern int dual_role_get_property(struct dual_role_phy_instance *dual_role,
+ enum dual_role_property prop,
+ unsigned int *val);
+extern int dual_role_set_property(struct dual_role_phy_instance *dual_role,
+ enum dual_role_property prop,
+ const unsigned int *val);
+extern int dual_role_property_is_writeable(struct dual_role_phy_instance
+ *dual_role,
+ enum dual_role_property prop);
+extern void *dual_role_get_drvdata(struct dual_role_phy_instance *dual_role);
+#else /* CONFIG_DUAL_ROLE_USB_INTF */
+static inline void dual_role_instance_changed(struct dual_role_phy_instance
+ *dual_role){}
+static inline struct dual_role_phy_instance *__must_check
+devm_dual_role_instance_register(struct device *parent,
+ const struct dual_role_phy_desc *desc)
+{
+ return ERR_PTR(-ENOSYS);
+}
+static inline void devm_dual_role_instance_unregister(struct device *dev,
+ struct dual_role_phy_instance
+ *dual_role){}
+static inline int dual_role_get_property(struct dual_role_phy_instance
+ *dual_role,
+ enum dual_role_property prop,
+ unsigned int *val)
+{
+ return -ENOSYS;
+}
+static inline int dual_role_set_property(struct dual_role_phy_instance
+ *dual_role,
+ enum dual_role_property prop,
+ const unsigned int *val)
+{
+ return -ENOSYS;
+}
+static inline int dual_role_property_is_writeable(struct dual_role_phy_instance
+ *dual_role,
+ enum dual_role_property prop)
+{
+ /* Without the class there is nothing userspace could write to. */
+ return 0;
+}
+static inline void *dual_role_get_drvdata(struct dual_role_phy_instance
+ *dual_role)
+{
+ return ERR_PTR(-ENOSYS);
+}
+#endif /* CONFIG_DUAL_ROLE_USB_INTF */
+#endif /* __LINUX_CLASS_DUAL_ROLE_H__ */
diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h
index 667d20454a21..74f97cee3051 100644
--- a/include/linux/usb/composite.h
+++ b/include/linux/usb/composite.h
@@ -584,6 +584,7 @@ struct usb_function_instance {
struct config_group group;
struct list_head cfs_list;
struct usb_function_driver *fd;
+ struct usb_function *f;
int (*set_inst_name)(struct usb_function_instance *inst,
const char *name);
void (*free_func_inst)(struct usb_function_instance *inst);
diff --git a/include/linux/usb/f_accessory.h b/include/linux/usb/f_accessory.h
new file mode 100644
index 000000000000..ebe3c4d59309
--- /dev/null
+++ b/include/linux/usb/f_accessory.h
@@ -0,0 +1,23 @@
+/*
+ * Gadget Function Driver for Android USB accessories
+ *
+ * Copyright (C) 2011 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef __LINUX_USB_F_ACCESSORY_H
+#define __LINUX_USB_F_ACCESSORY_H
+
+#include <uapi/linux/usb/f_accessory.h>
+
+#endif /* __LINUX_USB_F_ACCESSORY_H */
diff --git a/include/linux/usb/f_mtp.h b/include/linux/usb/f_mtp.h
new file mode 100644
index 000000000000..4e8417791bea
--- /dev/null
+++ b/include/linux/usb/f_mtp.h
@@ -0,0 +1,23 @@
+/*
+ * Gadget Function Driver for MTP
+ *
+ * Copyright (C) 2010 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef __LINUX_USB_F_MTP_H
+#define __LINUX_USB_F_MTP_H
+
+#include <uapi/linux/usb/f_mtp.h>
+
+#endif /* __LINUX_USB_F_MTP_H */
diff --git a/include/linux/wakeup_reason.h b/include/linux/wakeup_reason.h
new file mode 100644
index 000000000000..d84d8c301546
--- /dev/null
+++ b/include/linux/wakeup_reason.h
@@ -0,0 +1,32 @@
+/*
+ * include/linux/wakeup_reason.h
+ *
+ * Logs the reason which caused the kernel to resume
+ * from the suspend mode.
+ *
+ * Copyright (C) 2014 Google, Inc.
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _LINUX_WAKEUP_REASON_H
+#define _LINUX_WAKEUP_REASON_H
+
+#define MAX_SUSPEND_ABORT_LEN 256
+
+void log_wakeup_reason(int irq);
+int check_wakeup_reason(int irq);
+
+#ifdef CONFIG_SUSPEND
+void log_suspend_abort_reason(const char *fmt, ...);
+#else
+static inline void log_suspend_abort_reason(const char *fmt, ...) { }
+#endif
+
+#endif /* _LINUX_WAKEUP_REASON_H */
diff --git a/include/linux/wlan_plat.h b/include/linux/wlan_plat.h
new file mode 100644
index 000000000000..8e8b06f1ba4a
--- /dev/null
+++ b/include/linux/wlan_plat.h
@@ -0,0 +1,30 @@
+/* include/linux/wlan_plat.h
+ *
+ * Copyright (C) 2010 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+#ifndef _LINUX_WLAN_PLAT_H_
+#define _LINUX_WLAN_PLAT_H_
+
+#include <linux/types.h> /* u32, used by get_country_code() below */
+
+#define WLAN_PLAT_NODFS_FLAG 0x01
+
+/*
+ * Table of WLAN platform callbacks. Every member is a function pointer;
+ * what each hook does is defined by whoever fills the table in, which is
+ * not visible from this header.
+ * NOTE(review): WLAN_PLAT_NODFS_FLAG looks like a bit for the @flags
+ * argument of get_country_code() - confirm against the implementation.
+ */
+struct wifi_platform_data {
+ int (*set_power)(int val);
+ int (*set_reset)(int val);
+ int (*set_carddetect)(int val);
+ void *(*mem_prealloc)(int section, unsigned long size);
+ int (*get_mac_addr)(unsigned char *buf);
+ int (*get_wake_irq)(void);
+ void *(*get_country_code)(char *ccode, u32 flags);
+};
+
+#endif
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index b8ee8a113e32..858f308d69ca 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -244,6 +244,8 @@ static inline bool ipv6_is_mld(struct sk_buff *skb, int nexthdr, int offset)
void addrconf_prefix_rcv(struct net_device *dev,
u8 *opt, int len, bool sllao);
+u32 addrconf_rt_table(const struct net_device *dev, u32 default_table);
+
/*
* anycast prototypes (anycast.c)
*/
diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index 456e4a6006ab..8dbfdf728cd8 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -8,6 +8,11 @@
#include <net/flow.h>
#include <net/rtnetlink.h>
+struct fib_kuid_range {
+ kuid_t start;
+ kuid_t end;
+};
+
struct fib_rule {
struct list_head list;
int iifindex;
@@ -30,6 +35,7 @@ struct fib_rule {
int suppress_prefixlen;
char iifname[IFNAMSIZ];
char oifname[IFNAMSIZ];
+ struct fib_kuid_range uid_range;
struct rcu_head rcu;
};
@@ -92,7 +98,8 @@ struct fib_rules_ops {
[FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, \
[FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, \
[FRA_GOTO] = { .type = NLA_U32 }, \
- [FRA_L3MDEV] = { .type = NLA_U8 }
+ [FRA_L3MDEV] = { .type = NLA_U8 }, \
+ [FRA_UID_RANGE] = { .len = sizeof(struct fib_rule_uid_range) }
static inline void fib_rule_get(struct fib_rule *rule)
{
diff --git a/include/net/flow.h b/include/net/flow.h
index 035aa7716967..6bbbca8af8e3 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -11,6 +11,7 @@
#include <linux/in6.h>
#include <linux/atomic.h>
#include <net/flow_dissector.h>
+#include <linux/uidgid.h>
/*
* ifindex generation is per-net namespace, and loopback is
@@ -37,6 +38,7 @@ struct flowi_common {
#define FLOWI_FLAG_SKIP_NH_OIF 0x04
__u32 flowic_secid;
struct flowi_tunnel flowic_tun_key;
+ kuid_t flowic_uid;
};
union flowi_uli {
@@ -74,6 +76,7 @@ struct flowi4 {
#define flowi4_flags __fl_common.flowic_flags
#define flowi4_secid __fl_common.flowic_secid
#define flowi4_tun_key __fl_common.flowic_tun_key
+#define flowi4_uid __fl_common.flowic_uid
/* (saddr,daddr) must be grouped, same order as in IP header */
__be32 saddr;
@@ -93,7 +96,8 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif,
__u32 mark, __u8 tos, __u8 scope,
__u8 proto, __u8 flags,
__be32 daddr, __be32 saddr,
- __be16 dport, __be16 sport)
+ __be16 dport, __be16 sport,
+ kuid_t uid)
{
fl4->flowi4_oif = oif;
fl4->flowi4_iif = LOOPBACK_IFINDEX;
@@ -104,6 +108,7 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif,
fl4->flowi4_flags = flags;
fl4->flowi4_secid = 0;
fl4->flowi4_tun_key.tun_id = 0;
+ fl4->flowi4_uid = uid;
fl4->daddr = daddr;
fl4->saddr = saddr;
fl4->fl4_dport = dport;
@@ -131,6 +136,7 @@ struct flowi6 {
#define flowi6_flags __fl_common.flowic_flags
#define flowi6_secid __fl_common.flowic_secid
#define flowi6_tun_key __fl_common.flowic_tun_key
+#define flowi6_uid __fl_common.flowic_uid
struct in6_addr daddr;
struct in6_addr saddr;
/* Note: flowi6_tos is encoded in flowlabel, too. */
@@ -176,6 +182,7 @@ struct flowi {
#define flowi_flags u.__fl_common.flowic_flags
#define flowi_secid u.__fl_common.flowic_secid
#define flowi_tun_key u.__fl_common.flowic_tun_key
+#define flowi_uid u.__fl_common.flowic_uid
} __attribute__((__aligned__(BITS_PER_LONG/8)));
static inline struct flowi *flowi4_to_flowi(struct flowi4 *fl4)
diff --git a/include/net/ip.h b/include/net/ip.h
index 8646da034851..eacff8e14f29 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -180,6 +180,7 @@ struct ip_reply_arg {
/* -1 if not needed */
int bound_dev_if;
u8 tos;
+ kuid_t uid;
};
#define IP_REPLY_ARG_NOSRCCHECK 1
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 2c43993e079c..4341731f39a5 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -142,9 +142,10 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
const struct in6_addr *gwaddr);
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, int oif,
- u32 mark);
+ u32 mark, kuid_t uid);
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu);
-void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark);
+void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
+ kuid_t uid);
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
u32 mark);
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk);
diff --git a/include/net/route.h b/include/net/route.h
index b8488efef920..2702b7ac9a29 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -154,7 +154,7 @@ static inline struct rtable *ip_route_output_ports(struct net *net, struct flowi
flowi4_init_output(fl4, oif, sk ? sk->sk_mark : 0, tos,
RT_SCOPE_UNIVERSE, proto,
sk ? inet_sk_flowi_flags(sk) : 0,
- daddr, saddr, dport, sport);
+ daddr, saddr, dport, sport, sock_net_uid(net, sk));
if (sk)
security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
return ip_route_output_flow(net, fl4, sk);
@@ -270,7 +270,8 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32
flow_flags |= FLOWI_FLAG_ANYSRC;
flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE,
- protocol, flow_flags, dst, src, dport, sport);
+ protocol, flow_flags, dst, src, dport, sport,
+ sk->sk_uid);
}
static inline struct rtable *ip_route_connect(struct flowi4 *fl4,
diff --git a/include/net/sock.h b/include/net/sock.h
index 6d42ed883bf9..badd144fca1c 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -419,6 +419,7 @@ struct sock {
u32 sk_max_ack_backlog;
__u32 sk_priority;
__u32 sk_mark;
+ kuid_t sk_uid;
struct pid *sk_peer_pid;
const struct cred *sk_peer_cred;
long sk_rcvtimeo;
@@ -1647,6 +1648,7 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
sk->sk_wq = parent->wq;
parent->sk = sk;
sk_set_socket(sk, parent);
+ sk->sk_uid = SOCK_INODE(parent)->i_uid;
security_sock_graft(sk, parent);
write_unlock_bh(&sk->sk_callback_lock);
}
@@ -1654,6 +1656,11 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
kuid_t sock_i_uid(struct sock *sk);
unsigned long sock_i_ino(struct sock *sk);
+static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk)
+{
+ return sk ? sk->sk_uid : make_kuid(net->user_ns, 0);
+}
+
static inline u32 net_tx_rndhash(void)
{
u32 v = prandom_u32();
diff --git a/include/net/tcp.h b/include/net/tcp.h
index c3f4f6a9e6c3..d5eeb2b98129 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -271,6 +271,7 @@ extern int sysctl_tcp_autocorking;
extern int sysctl_tcp_invalid_ratelimit;
extern int sysctl_tcp_pacing_ss_ratio;
extern int sysctl_tcp_pacing_ca_ratio;
+extern int sysctl_tcp_default_init_rwnd;
extern atomic_long_t tcp_memory_allocated;
extern struct percpu_counter tcp_sockets_allocated;
diff --git a/include/trace/events/android_fs.h b/include/trace/events/android_fs.h
new file mode 100644
index 000000000000..49509533d3fa
--- /dev/null
+++ b/include/trace/events/android_fs.h
@@ -0,0 +1,65 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM android_fs
+
+#if !defined(_TRACE_ANDROID_FS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_ANDROID_FS_H
+
+#include <linux/tracepoint.h>
+#include <trace/events/android_fs_template.h>
+
+DEFINE_EVENT(android_fs_data_start_template, android_fs_dataread_start,
+ TP_PROTO(struct inode *inode, loff_t offset, int bytes,
+ pid_t pid, char *pathname, char *command),
+ TP_ARGS(inode, offset, bytes, pid, pathname, command));
+
+DEFINE_EVENT(android_fs_data_end_template, android_fs_dataread_end,
+ TP_PROTO(struct inode *inode, loff_t offset, int bytes),
+ TP_ARGS(inode, offset, bytes));
+
+DEFINE_EVENT(android_fs_data_start_template, android_fs_datawrite_start,
+ TP_PROTO(struct inode *inode, loff_t offset, int bytes,
+ pid_t pid, char *pathname, char *command),
+ TP_ARGS(inode, offset, bytes, pid, pathname, command));
+
+DEFINE_EVENT(android_fs_data_end_template, android_fs_datawrite_end,
+ TP_PROTO(struct inode *inode, loff_t offset, int bytes),
+ TP_ARGS(inode, offset, bytes));
+
+#endif /* _TRACE_ANDROID_FS_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
+
+#ifndef ANDROID_FSTRACE_GET_PATHNAME
+#define ANDROID_FSTRACE_GET_PATHNAME
+
+/* This sizes an on-stack array, so be careful when increasing it! */
+#define MAX_TRACE_PATHBUF_LEN 256
+
+/*
+ * Build a path string for @inode in the caller supplied buffer.
+ *
+ * @buf: scratch buffer that receives the path
+ * @buflen: size of @buf in bytes
+ * @inode: inode to resolve
+ *
+ * Returns a pointer to the path as produced by dentry_path_raw(). If no
+ * dentry can be obtained, or the path cannot be built, the literal "ERROR"
+ * is copied into @buf (truncated to @buflen, never written past it) and
+ * @buf itself is returned.
+ */
+static inline char *
+android_fstrace_get_pathname(char *buf, int buflen, struct inode *inode)
+{
+ char *path = NULL;
+ struct dentry *d;
+
+ /*
+ * d_obtain_alias() will either iput() if it locates an existing
+ * dentry or transfer the reference to the new dentry created.
+ * So get an extra reference here.
+ */
+ ihold(inode);
+ d = d_obtain_alias(inode);
+ if (likely(!IS_ERR(d))) {
+ path = dentry_path_raw(d, buf, buflen);
+ dput(d);
+ }
+ /* Single fallback for both failure modes: no dentry, or no path. */
+ if (unlikely(IS_ERR_OR_NULL(path))) {
+ /* Bounded copy: the original strcpy() ignored buflen. */
+ strscpy(buf, "ERROR", buflen);
+ path = buf;
+ }
+ return path;
+}
+#endif
diff --git a/include/trace/events/android_fs_template.h b/include/trace/events/android_fs_template.h
new file mode 100644
index 000000000000..b23d17b56c63
--- /dev/null
+++ b/include/trace/events/android_fs_template.h
@@ -0,0 +1,64 @@
+#if !defined(_TRACE_ANDROID_FS_TEMPLATE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_ANDROID_FS_TEMPLATE_H
+
+#include <linux/tracepoint.h>
+
+/*
+ * Event class for the start of a data read/write. Records the path and
+ * command strings (with spaces replaced by '_'), the offset and byte count
+ * of the access, the pid, and the inode's number and current size.
+ * offset and i_size are loff_t, a signed type, so they are printed with
+ * %lld.
+ */
+DECLARE_EVENT_CLASS(android_fs_data_start_template,
+ TP_PROTO(struct inode *inode, loff_t offset, int bytes,
+ pid_t pid, char *pathname, char *command),
+ TP_ARGS(inode, offset, bytes, pid, pathname, command),
+ TP_STRUCT__entry(
+ __string(pathbuf, pathname);
+ __field(loff_t, offset);
+ __field(int, bytes);
+ __field(loff_t, i_size);
+ __string(cmdline, command);
+ __field(pid_t, pid);
+ __field(ino_t, ino);
+ ),
+ TP_fast_assign(
+ {
+ /*
+ * Replace the spaces in filenames and cmdlines
+ * because this screws up the tooling that parses
+ * the traces.
+ */
+ __assign_str(pathbuf, pathname);
+ (void)strreplace(__get_str(pathbuf), ' ', '_');
+ __entry->offset = offset;
+ __entry->bytes = bytes;
+ __entry->i_size = i_size_read(inode);
+ __assign_str(cmdline, command);
+ (void)strreplace(__get_str(cmdline), ' ', '_');
+ __entry->pid = pid;
+ __entry->ino = inode->i_ino;
+ }
+ ),
+ TP_printk("entry_name %s, offset %lld, bytes %d, cmdline %s,"
+ " pid %d, i_size %lld, ino %lu",
+ __get_str(pathbuf), __entry->offset, __entry->bytes,
+ __get_str(cmdline), __entry->pid, __entry->i_size,
+ (unsigned long) __entry->ino)
+);
+
+/*
+ * Event class for the end of a data read/write. Records the inode number
+ * together with the offset and byte count of the access. offset is loff_t,
+ * a signed type, so it is printed with %lld.
+ */
+DECLARE_EVENT_CLASS(android_fs_data_end_template,
+ TP_PROTO(struct inode *inode, loff_t offset, int bytes),
+ TP_ARGS(inode, offset, bytes),
+ TP_STRUCT__entry(
+ __field(ino_t, ino);
+ __field(loff_t, offset);
+ __field(int, bytes);
+ ),
+ TP_fast_assign(
+ {
+ __entry->ino = inode->i_ino;
+ __entry->offset = offset;
+ __entry->bytes = bytes;
+ }
+ ),
+ TP_printk("ino %lu, offset %lld, bytes %d",
+ (unsigned long) __entry->ino,
+ __entry->offset, __entry->bytes)
+);
+
+#endif /* _TRACE_ANDROID_FS_TEMPLATE_H */
diff --git a/include/trace/events/cpufreq_interactive.h b/include/trace/events/cpufreq_interactive.h
new file mode 100644
index 000000000000..faecc0bfdeff
--- /dev/null
+++ b/include/trace/events/cpufreq_interactive.h
@@ -0,0 +1,112 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM cpufreq_interactive
+
+#if !defined(_TRACE_CPUFREQ_INTERACTIVE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_CPUFREQ_INTERACTIVE_H
+
+#include <linux/tracepoint.h>
+
+DECLARE_EVENT_CLASS(set,
+ TP_PROTO(u32 cpu_id, unsigned long targfreq,
+ unsigned long actualfreq),
+ TP_ARGS(cpu_id, targfreq, actualfreq),
+
+ TP_STRUCT__entry(
+ __field(u32, cpu_id)
+ __field(unsigned long, targfreq)
+ __field(unsigned long, actualfreq)
+ ),
+
+ TP_fast_assign(
+ __entry->cpu_id = (u32)cpu_id;
+ __entry->targfreq = targfreq;
+ __entry->actualfreq = actualfreq;
+ ),
+
+ TP_printk("cpu=%u targ=%lu actual=%lu",
+ __entry->cpu_id, __entry->targfreq,
+ __entry->actualfreq)
+);
+
+DEFINE_EVENT(set, cpufreq_interactive_setspeed,
+ TP_PROTO(u32 cpu_id, unsigned long targfreq,
+ unsigned long actualfreq),
+ TP_ARGS(cpu_id, targfreq, actualfreq)
+);
+
+DECLARE_EVENT_CLASS(loadeval,
+ TP_PROTO(unsigned long cpu_id, unsigned long load,
+ unsigned long curtarg, unsigned long curactual,
+ unsigned long newtarg),
+ TP_ARGS(cpu_id, load, curtarg, curactual, newtarg),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, cpu_id)
+ __field(unsigned long, load)
+ __field(unsigned long, curtarg)
+ __field(unsigned long, curactual)
+ __field(unsigned long, newtarg)
+ ),
+
+ TP_fast_assign(
+ __entry->cpu_id = cpu_id;
+ __entry->load = load;
+ __entry->curtarg = curtarg;
+ __entry->curactual = curactual;
+ __entry->newtarg = newtarg;
+ ),
+
+ TP_printk("cpu=%lu load=%lu cur=%lu actual=%lu targ=%lu",
+ __entry->cpu_id, __entry->load, __entry->curtarg,
+ __entry->curactual, __entry->newtarg)
+);
+
+DEFINE_EVENT(loadeval, cpufreq_interactive_target,
+ TP_PROTO(unsigned long cpu_id, unsigned long load,
+ unsigned long curtarg, unsigned long curactual,
+ unsigned long newtarg),
+ TP_ARGS(cpu_id, load, curtarg, curactual, newtarg)
+);
+
+DEFINE_EVENT(loadeval, cpufreq_interactive_already,
+ TP_PROTO(unsigned long cpu_id, unsigned long load,
+ unsigned long curtarg, unsigned long curactual,
+ unsigned long newtarg),
+ TP_ARGS(cpu_id, load, curtarg, curactual, newtarg)
+);
+
+DEFINE_EVENT(loadeval, cpufreq_interactive_notyet,
+ TP_PROTO(unsigned long cpu_id, unsigned long load,
+ unsigned long curtarg, unsigned long curactual,
+ unsigned long newtarg),
+ TP_ARGS(cpu_id, load, curtarg, curactual, newtarg)
+);
+
+TRACE_EVENT(cpufreq_interactive_boost,
+ TP_PROTO(const char *s),
+ TP_ARGS(s),
+ TP_STRUCT__entry(
+ __string(s, s)
+ ),
+ TP_fast_assign(
+ __assign_str(s, s);
+ ),
+ TP_printk("%s", __get_str(s))
+);
+
+TRACE_EVENT(cpufreq_interactive_unboost,
+ TP_PROTO(const char *s),
+ TP_ARGS(s),
+ TP_STRUCT__entry(
+ __string(s, s)
+ ),
+ TP_fast_assign(
+ __assign_str(s, s);
+ ),
+ TP_printk("%s", __get_str(s))
+);
+
+#endif /* _TRACE_CPUFREQ_INTERACTIVE_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/cpufreq_sched.h b/include/trace/events/cpufreq_sched.h
new file mode 100644
index 000000000000..a46cd088e969
--- /dev/null
+++ b/include/trace/events/cpufreq_sched.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (C) 2015 Steve Muckle <smuckle@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM cpufreq_sched
+
+#if !defined(_TRACE_CPUFREQ_SCHED_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_CPUFREQ_SCHED_H
+
+#include <linux/sched.h>
+#include <linux/tracepoint.h>
+
+/*
+ * Records @rem; per the message text this is the number of microseconds
+ * of throttling that remain. @rem is unsigned int, so it is printed with %u.
+ */
+TRACE_EVENT(cpufreq_sched_throttled,
+ TP_PROTO(unsigned int rem),
+ TP_ARGS(rem),
+ TP_STRUCT__entry(
+ __field( unsigned int, rem)
+ ),
+ TP_fast_assign(
+ __entry->rem = rem;
+ ),
+ TP_printk("throttled - %u usec remaining", __entry->rem)
+);
+
+/*
+ * Records a capacity request for @cpu: the requested @capacity, the new
+ * frequency @freq_new and the value passed as @requested_freq (printed as
+ * "cur"). All three are unsigned, so they are printed with %lu / %u.
+ */
+TRACE_EVENT(cpufreq_sched_request_opp,
+ TP_PROTO(int cpu,
+ unsigned long capacity,
+ unsigned int freq_new,
+ unsigned int requested_freq),
+ TP_ARGS(cpu, capacity, freq_new, requested_freq),
+ TP_STRUCT__entry(
+ __field( int, cpu)
+ __field( unsigned long, capacity)
+ __field( unsigned int, freq_new)
+ __field( unsigned int, requested_freq)
+ ),
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ __entry->capacity = capacity;
+ __entry->freq_new = freq_new;
+ __entry->requested_freq = requested_freq;
+ ),
+ TP_printk("cpu %d cap change, cluster cap request %lu => OPP %u "
+ "(cur %u)",
+ __entry->cpu, __entry->capacity, __entry->freq_new,
+ __entry->requested_freq)
+);
+
+/*
+ * Records a capacity update for @cpu: the @request flag, a snapshot of the
+ * cfs/rt/dl/total members of @scr taken when the event fires, and
+ * @new_capacity (printed as "new_tot"). The capacity fields are unsigned
+ * long, so they are printed with %lu.
+ */
+TRACE_EVENT(cpufreq_sched_update_capacity,
+ TP_PROTO(int cpu,
+ bool request,
+ struct sched_capacity_reqs *scr,
+ unsigned long new_capacity),
+ TP_ARGS(cpu, request, scr, new_capacity),
+ TP_STRUCT__entry(
+ __field( int, cpu)
+ __field( bool, request)
+ __field( unsigned long, cfs)
+ __field( unsigned long, rt)
+ __field( unsigned long, dl)
+ __field( unsigned long, total)
+ __field( unsigned long, new_total)
+ ),
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ __entry->request = request;
+ __entry->cfs = scr->cfs;
+ __entry->rt = scr->rt;
+ __entry->dl = scr->dl;
+ __entry->total = scr->total;
+ __entry->new_total = new_capacity;
+ ),
+ TP_printk("cpu=%d set_cap=%d cfs=%lu rt=%lu dl=%lu old_tot=%lu "
+ "new_tot=%lu",
+ __entry->cpu, __entry->request, __entry->cfs, __entry->rt,
+ __entry->dl, __entry->total, __entry->new_total)
+);
+
+#endif /* _TRACE_CPUFREQ_SCHED_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/gpu.h b/include/trace/events/gpu.h
new file mode 100644
index 000000000000..7e15cdfafe5a
--- /dev/null
+++ b/include/trace/events/gpu.h
@@ -0,0 +1,143 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM gpu
+
+#if !defined(_TRACE_GPU_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_GPU_H
+
+#include <linux/tracepoint.h>
+#include <linux/time.h>
+
+/*
+ * Split a nanosecond count into whole seconds (show_secs_from_ns) and the
+ * left-over microseconds (show_usecs_from_ns), e.g. for "%llu.%06lu".
+ * Both first add NSEC_PER_USEC / 2, i.e. round to the nearest microsecond,
+ * so the two halves always describe the same rounded value.
+ *
+ * @ns is parenthesized so that any expression may be passed, and the
+ * locals carry a __ prefix so they cannot capture a caller's "t" or "rem".
+ */
+#define show_secs_from_ns(ns) \
+ ({ \
+ u64 __t = (ns) + (NSEC_PER_USEC / 2); \
+ do_div(__t, NSEC_PER_SEC); \
+ __t; \
+ })
+
+#define show_usecs_from_ns(ns) \
+ ({ \
+ u64 __t = (ns) + (NSEC_PER_USEC / 2); \
+ u32 __rem; \
+ do_div(__t, NSEC_PER_USEC); \
+ __rem = do_div(__t, USEC_PER_SEC); \
+ __rem; \
+ })
+
+/*
+ * The gpu_sched_switch event indicates that a switch from one GPU context to
+ * another occurred on one of the GPU hardware blocks.
+ *
+ * The gpu_name argument identifies the GPU hardware block. Each independently
+ * scheduled GPU hardware block should have a different name. This may be used
+ * in different ways for different GPUs. For example, if a GPU includes
+ * multiple processing cores it may use names "GPU 0", "GPU 1", etc. If a GPU
+ * includes a separately scheduled 2D and 3D hardware block, it might use the
+ * names "2D" and "3D".
+ *
+ * The timestamp argument is the timestamp at which the switch occurred on the
+ * GPU. These timestamps are in units of nanoseconds and must use
+ * approximately the same time as sched_clock, though they need not come from
+ * any CPU clock. The timestamps for a single hardware block must be
+ * monotonically nondecreasing. This means that if a variable compensation
+ * offset is used to translate from some other clock to the sched_clock, then
+ * care must be taken when increasing that offset, and doing so may result in
+ * multiple events with the same timestamp.
+ *
+ * The next_ctx_id argument identifies the next context that was running on
+ * the GPU hardware block. A value of 0 indicates that the hardware block
+ * will be idle.
+ *
+ * The next_prio argument indicates the priority of the next context at the
+ * time of the event. The exact numeric values may mean different things for
+ * different GPUs, but they should follow the rule that lower values indicate a
+ * higher priority.
+ *
+ * The next_job_id argument identifies the batch of work that the GPU will be
+ * working on. This should correspond to a job_id that was previously traced
+ * as a gpu_job_enqueue event when the batch of work was created.
+ */
+TRACE_EVENT(gpu_sched_switch,
+
+ TP_PROTO(const char *gpu_name, u64 timestamp,
+ u32 next_ctx_id, s32 next_prio, u32 next_job_id),
+
+ TP_ARGS(gpu_name, timestamp, next_ctx_id, next_prio, next_job_id),
+
+ TP_STRUCT__entry(
+ __string( gpu_name, gpu_name )
+ __field( u64, timestamp )
+ __field( u32, next_ctx_id )
+ __field( s32, next_prio )
+ __field( u32, next_job_id )
+ ),
+
+ TP_fast_assign(
+ __assign_str(gpu_name, gpu_name);
+ __entry->timestamp = timestamp;
+ __entry->next_ctx_id = next_ctx_id;
+ __entry->next_prio = next_prio;
+ __entry->next_job_id = next_job_id;
+ ),
+
+ TP_printk("gpu_name=%s ts=%llu.%06lu next_ctx_id=%lu next_prio=%ld "
+ "next_job_id=%lu",
+ __get_str(gpu_name),
+ (unsigned long long)show_secs_from_ns(__entry->timestamp),
+ (unsigned long)show_usecs_from_ns(__entry->timestamp),
+ (unsigned long)__entry->next_ctx_id,
+ (long)__entry->next_prio,
+ (unsigned long)__entry->next_job_id)
+);
+
+/*
+ * The gpu_job_enqueue event indicates that a batch of work has been queued up
+ * to be processed by the GPU. This event is not intended to indicate that
+ * the batch of work has been submitted to the GPU hardware, but rather that
+ * it has been submitted to the GPU kernel driver.
+ *
+ * This event should be traced on the thread that initiated the work being
+ * queued. For example, if a batch of work is submitted to the kernel by a
+ * userland thread, the event should be traced on that thread.
+ *
+ * The ctx_id field identifies the GPU context in which the batch of work
+ * being queued is to be run.
+ *
+ * The job_id field identifies the batch of work being queued within the given
+ * GPU context. The first batch of work submitted for a given GPU context
+ * should have a job_id of 0, and each subsequent batch of work should
+ * increment the job_id by 1.
+ *
+ * The type field identifies the type of the job being enqueued. The job
+ * types may be different for different GPU hardware. For example, a GPU may
+ * differentiate between "2D", "3D", and "compute" jobs.
+ */
+TRACE_EVENT(gpu_job_enqueue,
+
+ TP_PROTO(u32 ctx_id, u32 job_id, const char *type),
+
+ TP_ARGS(ctx_id, job_id, type),
+
+ TP_STRUCT__entry(
+ __field( u32, ctx_id )
+ __field( u32, job_id )
+ __string( type, type )
+ ),
+
+ TP_fast_assign(
+ __entry->ctx_id = ctx_id;
+ __entry->job_id = job_id;
+ __assign_str(type, type);
+ ),
+
+ TP_printk("ctx_id=%lu job_id=%lu type=%s",
+ (unsigned long)__entry->ctx_id,
+ (unsigned long)__entry->job_id,
+ __get_str(type))
+);
+
+#undef show_secs_from_ns
+#undef show_usecs_from_ns
+
+#endif /* _TRACE_GPU_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/net.h b/include/trace/events/net.h
index 49cc7c3de252..89d009e10938 100644
--- a/include/trace/events/net.h
+++ b/include/trace/events/net.h
@@ -57,7 +57,7 @@ TRACE_EVENT(net_dev_start_xmit,
__entry->gso_type = skb_shinfo(skb)->gso_type;
),
- TP_printk("dev=%s queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d len=%u data_len=%u network_offset=%d transport_offset_valid=%d transport_offset=%d tx_flags=%d gso_size=%d gso_segs=%d gso_type=%#x",
+ TP_printk("dev=%s queue_mapping=%u skbaddr=%pK vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d len=%u data_len=%u network_offset=%d transport_offset_valid=%d transport_offset=%d tx_flags=%d gso_size=%d gso_segs=%d gso_type=%#x",
__get_str(name), __entry->queue_mapping, __entry->skbaddr,
__entry->vlan_tagged, __entry->vlan_proto, __entry->vlan_tci,
__entry->protocol, __entry->ip_summed, __entry->len,
@@ -90,7 +90,7 @@ TRACE_EVENT(net_dev_xmit,
__assign_str(name, dev->name);
),
- TP_printk("dev=%s skbaddr=%p len=%u rc=%d",
+ TP_printk("dev=%s skbaddr=%pK len=%u rc=%d",
__get_str(name), __entry->skbaddr, __entry->len, __entry->rc)
);
@@ -112,7 +112,7 @@ DECLARE_EVENT_CLASS(net_dev_template,
__assign_str(name, skb->dev->name);
),
- TP_printk("dev=%s skbaddr=%p len=%u",
+ TP_printk("dev=%s skbaddr=%pK len=%u",
__get_str(name), __entry->skbaddr, __entry->len)
)
@@ -191,7 +191,7 @@ DECLARE_EVENT_CLASS(net_dev_rx_verbose_template,
__entry->gso_type = skb_shinfo(skb)->gso_type;
),
- TP_printk("dev=%s napi_id=%#x queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d hash=0x%08x l4_hash=%d len=%u data_len=%u truesize=%u mac_header_valid=%d mac_header=%d nr_frags=%d gso_size=%d gso_type=%#x",
+ TP_printk("dev=%s napi_id=%#x queue_mapping=%u skbaddr=%pK vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d hash=0x%08x l4_hash=%d len=%u data_len=%u truesize=%u mac_header_valid=%d mac_header=%d nr_frags=%d gso_size=%d gso_type=%#x",
__get_str(name), __entry->napi_id, __entry->queue_mapping,
__entry->skbaddr, __entry->vlan_tagged, __entry->vlan_proto,
__entry->vlan_tci, __entry->protocol, __entry->ip_summed,
diff --git a/include/trace/events/power.h b/include/trace/events/power.h
index 54e3aad32806..ec6f81561558 100644
--- a/include/trace/events/power.h
+++ b/include/trace/events/power.h
@@ -147,6 +147,38 @@ DEFINE_EVENT(cpu, cpu_frequency,
TP_ARGS(frequency, cpu_id)
);
+TRACE_EVENT(cpu_frequency_limits,
+
+ TP_PROTO(unsigned int max_freq, unsigned int min_freq,
+ unsigned int cpu_id),
+
+ TP_ARGS(max_freq, min_freq, cpu_id),
+
+ TP_STRUCT__entry(
+ __field( u32, min_freq )
+ __field( u32, max_freq )
+ __field( u32, cpu_id )
+ ),
+
+ TP_fast_assign(
+ __entry->min_freq = min_freq;
+ __entry->max_freq = max_freq;
+ __entry->cpu_id = cpu_id;
+ ),
+
+ TP_printk("min=%lu max=%lu cpu_id=%lu",
+ (unsigned long)__entry->min_freq,
+ (unsigned long)__entry->max_freq,
+ (unsigned long)__entry->cpu_id)
+);
+
+DEFINE_EVENT(cpu, cpu_capacity,
+
+ TP_PROTO(unsigned int capacity, unsigned int cpu_id),
+
+ TP_ARGS(capacity, cpu_id)
+);
+
TRACE_EVENT(device_pm_callback_start,
TP_PROTO(struct device *dev, const char *pm_ops, int event),
@@ -300,6 +332,25 @@ DEFINE_EVENT(clock, clock_set_rate,
TP_ARGS(name, state, cpu_id)
);
+TRACE_EVENT(clock_set_parent,
+
+ TP_PROTO(const char *name, const char *parent_name),
+
+ TP_ARGS(name, parent_name),
+
+ TP_STRUCT__entry(
+ __string( name, name )
+ __string( parent_name, parent_name )
+ ),
+
+ TP_fast_assign(
+ __assign_str(name, name);
+ __assign_str(parent_name, parent_name);
+ ),
+
+ TP_printk("%s parent=%s", __get_str(name), __get_str(parent_name))
+);
+
/*
* The power domain events are used for power domains transitions
*/
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 9b90c57517a9..da97ab53fc61 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -219,7 +219,7 @@ DECLARE_EVENT_CLASS(sched_process_template,
DEFINE_EVENT(sched_process_template, sched_process_free,
TP_PROTO(struct task_struct *p),
TP_ARGS(p));
-
+
/*
* Tracepoint for a task exiting:
@@ -374,6 +374,30 @@ DEFINE_EVENT(sched_stat_template, sched_stat_blocked,
TP_ARGS(tsk, delay));
/*
+ * Tracepoint for recording the cause of uninterruptible sleep.
+ */
+TRACE_EVENT(sched_blocked_reason,
+
+ TP_PROTO(struct task_struct *tsk),
+
+ TP_ARGS(tsk),
+
+ TP_STRUCT__entry(
+ __field( pid_t, pid )
+ __field( void*, caller )
+ __field( bool, io_wait )
+ ),
+
+ TP_fast_assign(
+ __entry->pid = tsk->pid;
+ __entry->caller = (void*)get_wchan(tsk);
+ __entry->io_wait = tsk->in_iowait;
+ ),
+
+ TP_printk("pid=%d iowait=%d caller=%pS", __entry->pid, __entry->io_wait, __entry->caller)
+);
+
+/*
* Tracepoint for accounting runtime (time the task is executing
* on a CPU).
*/
@@ -562,6 +586,581 @@ TRACE_EVENT(sched_wake_idle_without_ipi,
TP_printk("cpu=%d", __entry->cpu)
);
+
+TRACE_EVENT(sched_contrib_scale_f,
+
+ TP_PROTO(int cpu, unsigned long freq_scale_factor,
+ unsigned long cpu_scale_factor),
+
+ TP_ARGS(cpu, freq_scale_factor, cpu_scale_factor),
+
+ TP_STRUCT__entry(
+ __field(int, cpu)
+ __field(unsigned long, freq_scale_factor)
+ __field(unsigned long, cpu_scale_factor)
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ __entry->freq_scale_factor = freq_scale_factor;
+ __entry->cpu_scale_factor = cpu_scale_factor;
+ ),
+
+ TP_printk("cpu=%d freq_scale_factor=%lu cpu_scale_factor=%lu",
+ __entry->cpu, __entry->freq_scale_factor,
+ __entry->cpu_scale_factor)
+);
+
+#ifdef CONFIG_SMP
+
+#ifdef CONFIG_SCHED_WALT
+extern unsigned int sysctl_sched_use_walt_cpu_util;
+extern unsigned int sysctl_sched_use_walt_task_util;
+extern unsigned int walt_ravg_window;
+extern unsigned int walt_disabled;
+#endif
+
+/*
+ * Tracepoint for accounting sched averages for tasks: records the PELT values from @avg and, under CONFIG_SCHED_WALT, a WALT utilization computed from @_ravg (which may then replace util_avg).
+ */
+TRACE_EVENT(sched_load_avg_task,
+
+ TP_PROTO(struct task_struct *tsk, struct sched_avg *avg, void *_ravg),
+
+ TP_ARGS(tsk, avg, _ravg),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field( int, cpu )
+ __field( unsigned long, load_avg )
+ __field( unsigned long, util_avg )
+ __field( unsigned long, util_avg_pelt )
+ __field( unsigned long, util_avg_walt )
+ __field( u64, load_sum )
+ __field( u32, util_sum )
+ __field( u32, period_contrib )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->cpu = task_cpu(tsk);
+ __entry->load_avg = avg->load_avg;
+ __entry->util_avg = avg->util_avg;
+ __entry->load_sum = avg->load_sum;
+ __entry->util_sum = avg->util_sum;
+ __entry->period_contrib = avg->period_contrib;
+ __entry->util_avg_pelt = avg->util_avg;
+ __entry->util_avg_walt = 0;
+#ifdef CONFIG_SCHED_WALT
+ __entry->util_avg_walt = div_u64((u64)((struct ravg *)_ravg)->demand << NICE_0_LOAD_SHIFT,
+ walt_ravg_window); /* div_u64(), not do_div(): the field is unsigned long, which is not a 64-bit lvalue on 32-bit builds */
+ if (!walt_disabled && sysctl_sched_use_walt_task_util)
+ __entry->util_avg = __entry->util_avg_walt;
+#endif
+ ),
+ TP_printk("comm=%s pid=%d cpu=%d load_avg=%lu util_avg=%lu "
+ "util_avg_pelt=%lu util_avg_walt=%lu load_sum=%llu"
+ " util_sum=%u period_contrib=%u",
+ __entry->comm,
+ __entry->pid,
+ __entry->cpu,
+ __entry->load_avg,
+ __entry->util_avg,
+ __entry->util_avg_pelt,
+ __entry->util_avg_walt,
+ (u64)__entry->load_sum,
+ (u32)__entry->util_sum,
+ (u32)__entry->period_contrib)
+);
+
+/*
+ * Tracepoint for accounting sched averages for cpus: records the cfs_rq PELT values and, under CONFIG_SCHED_WALT, a WALT utilization computed from the rq's prev_runnable_sum (which may then replace util_avg).
+ */
+TRACE_EVENT(sched_load_avg_cpu,
+
+ TP_PROTO(int cpu, struct cfs_rq *cfs_rq),
+
+ TP_ARGS(cpu, cfs_rq),
+
+ TP_STRUCT__entry(
+ __field( int, cpu )
+ __field( unsigned long, load_avg )
+ __field( unsigned long, util_avg )
+ __field( unsigned long, util_avg_pelt )
+ __field( unsigned long, util_avg_walt )
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ __entry->load_avg = cfs_rq->avg.load_avg;
+ __entry->util_avg = cfs_rq->avg.util_avg;
+ __entry->util_avg_pelt = cfs_rq->avg.util_avg;
+ __entry->util_avg_walt = 0;
+#ifdef CONFIG_SCHED_WALT
+ /* Divide in 64 bits with div_u64(); do_div() needs a 64-bit lvalue and the field is unsigned long. */
+ __entry->util_avg_walt = div_u64((u64)cpu_rq(cpu)->prev_runnable_sum << NICE_0_LOAD_SHIFT,
+ walt_ravg_window);
+ if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
+ __entry->util_avg = __entry->util_avg_walt;
+#endif
+ ),
+
+ TP_printk("cpu=%d load_avg=%lu util_avg=%lu "
+ "util_avg_pelt=%lu util_avg_walt=%lu",
+ __entry->cpu, __entry->load_avg, __entry->util_avg,
+ __entry->util_avg_pelt, __entry->util_avg_walt)
+);
+
+/*
+ * Tracepoint for sched_tune_config settings
+ */
+TRACE_EVENT(sched_tune_config,
+
+ TP_PROTO(int boost),
+
+ TP_ARGS(boost),
+
+ TP_STRUCT__entry(
+ __field( int, boost )
+ ),
+
+ TP_fast_assign(
+ __entry->boost = boost;
+ ),
+
+ TP_printk("boost=%d ", __entry->boost)
+);
+
+/*
+ * Tracepoint for accounting CPU boosted utilization
+ */
+TRACE_EVENT(sched_boost_cpu,
+
+ TP_PROTO(int cpu, unsigned long util, long margin),
+
+ TP_ARGS(cpu, util, margin),
+
+ TP_STRUCT__entry(
+ __field( int, cpu )
+ __field( unsigned long, util )
+ __field(long, margin )
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ __entry->util = util;
+ __entry->margin = margin;
+ ),
+
+ TP_printk("cpu=%d util=%lu margin=%ld",
+ __entry->cpu,
+ __entry->util,
+ __entry->margin)
+);
+
+/*
+ * Tracepoint for schedtune_tasks_update
+ */
+TRACE_EVENT(sched_tune_tasks_update,
+
+ TP_PROTO(struct task_struct *tsk, int cpu, int tasks, int idx,
+ int boost, int max_boost),
+
+ TP_ARGS(tsk, cpu, tasks, idx, boost, max_boost),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field( int, cpu )
+ __field( int, tasks )
+ __field( int, idx )
+ __field( int, boost )
+ __field( int, max_boost )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->cpu = cpu;
+ __entry->tasks = tasks;
+ __entry->idx = idx;
+ __entry->boost = boost;
+ __entry->max_boost = max_boost;
+ ),
+
+ TP_printk("pid=%d comm=%s "
+ "cpu=%d tasks=%d idx=%d boost=%d max_boost=%d",
+ __entry->pid, __entry->comm,
+ __entry->cpu, __entry->tasks, __entry->idx,
+ __entry->boost, __entry->max_boost)
+);
+
+/*
+ * Tracepoint for schedtune_boostgroup_update
+ */
+TRACE_EVENT(sched_tune_boostgroup_update,
+
+ TP_PROTO(int cpu, int variation, int max_boost),
+
+ TP_ARGS(cpu, variation, max_boost),
+
+ TP_STRUCT__entry(
+ __field( int, cpu )
+ __field( int, variation )
+ __field( int, max_boost )
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ __entry->variation = variation;
+ __entry->max_boost = max_boost;
+ ),
+
+ TP_printk("cpu=%d variation=%d max_boost=%d",
+ __entry->cpu, __entry->variation, __entry->max_boost)
+);
+
+/*
+ * Tracepoint for accounting task boosted utilization
+ */
+TRACE_EVENT(sched_boost_task,
+
+ TP_PROTO(struct task_struct *tsk, unsigned long util, long margin),
+
+ TP_ARGS(tsk, util, margin),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field( unsigned long, util )
+ __field( long, margin )
+
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->util = util;
+ __entry->margin = margin;
+ ),
+
+ TP_printk("comm=%s pid=%d util=%lu margin=%ld",
+ __entry->comm, __entry->pid,
+ __entry->util,
+ __entry->margin)
+);
+
+/*
+ * Tracepoint for find_best_target
+ */
+TRACE_EVENT(sched_find_best_target,
+
+ TP_PROTO(struct task_struct *tsk, bool prefer_idle,
+ unsigned long min_util, int start_cpu,
+ int best_idle, int best_active, int target),
+
+ TP_ARGS(tsk, prefer_idle, min_util, start_cpu,
+ best_idle, best_active, target),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field( unsigned long, min_util )
+ __field( bool, prefer_idle )
+ __field( int, start_cpu )
+ __field( int, best_idle )
+ __field( int, best_active )
+ __field( int, target )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->min_util = min_util;
+ __entry->prefer_idle = prefer_idle;
+ __entry->start_cpu = start_cpu;
+ __entry->best_idle = best_idle;
+ __entry->best_active = best_active;
+ __entry->target = target;
+ ),
+
+ TP_printk("pid=%d comm=%s prefer_idle=%d start_cpu=%d "
+ "best_idle=%d best_active=%d target=%d",
+ __entry->pid, __entry->comm,
+ __entry->prefer_idle, __entry->start_cpu,
+ __entry->best_idle, __entry->best_active,
+ __entry->target)
+);
+
+/*
+ * Tracepoint for accounting sched group energy
+ */
+TRACE_EVENT(sched_energy_diff,
+
+ TP_PROTO(struct task_struct *tsk, int scpu, int dcpu, int udelta,
+ int nrgb, int nrga, int nrgd, int capb, int capa, int capd,
+ int nrgn, int nrgp),
+
+ TP_ARGS(tsk, scpu, dcpu, udelta,
+ nrgb, nrga, nrgd, capb, capa, capd,
+ nrgn, nrgp),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field( int, scpu )
+ __field( int, dcpu )
+ __field( int, udelta )
+ __field( int, nrgb )
+ __field( int, nrga )
+ __field( int, nrgd )
+ __field( int, capb )
+ __field( int, capa )
+ __field( int, capd )
+ __field( int, nrgn )
+ __field( int, nrgp )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = tsk->pid;
+ __entry->scpu = scpu;
+ __entry->dcpu = dcpu;
+ __entry->udelta = udelta;
+ __entry->nrgb = nrgb;
+ __entry->nrga = nrga;
+ __entry->nrgd = nrgd;
+ __entry->capb = capb;
+ __entry->capa = capa;
+ __entry->capd = capd;
+ __entry->nrgn = nrgn;
+ __entry->nrgp = nrgp;
+ ),
+
+ TP_printk("pid=%d comm=%s "
+ "src_cpu=%d dst_cpu=%d usage_delta=%d "
+ "nrg_before=%d nrg_after=%d nrg_diff=%d "
+ "cap_before=%d cap_after=%d cap_delta=%d "
+ "nrg_delta=%d nrg_payoff=%d",
+ __entry->pid, __entry->comm,
+ __entry->scpu, __entry->dcpu, __entry->udelta,
+ __entry->nrgb, __entry->nrga, __entry->nrgd,
+ __entry->capb, __entry->capa, __entry->capd,
+ __entry->nrgn, __entry->nrgp)
+);
+
+/*
+ * Tracepoint for sched_tune_filter (nrg/cap deltas, gains, payoff and region)
+ */
+TRACE_EVENT(sched_tune_filter,
+
+ TP_PROTO(int nrg_delta, int cap_delta,
+ int nrg_gain, int cap_gain,
+ int payoff, int region),
+
+ TP_ARGS(nrg_delta, cap_delta, nrg_gain, cap_gain, payoff, region),
+
+ TP_STRUCT__entry(
+ __field( int, nrg_delta )
+ __field( int, cap_delta )
+ __field( int, nrg_gain )
+ __field( int, cap_gain )
+ __field( int, payoff )
+ __field( int, region )
+ ),
+
+ TP_fast_assign(
+ __entry->nrg_delta = nrg_delta;
+ __entry->cap_delta = cap_delta;
+ __entry->nrg_gain = nrg_gain;
+ __entry->cap_gain = cap_gain;
+ __entry->payoff = payoff;
+ __entry->region = region;
+ ),
+
+ TP_printk("nrg_delta=%d cap_delta=%d nrg_gain=%d cap_gain=%d payoff=%d region=%d",
+ __entry->nrg_delta, __entry->cap_delta,
+ __entry->nrg_gain, __entry->cap_gain,
+ __entry->payoff, __entry->region)
+);
+
+/*
+ * Tracepoint for system overutilized flag
+ */
+TRACE_EVENT(sched_overutilized,
+
+ TP_PROTO(bool overutilized),
+
+ TP_ARGS(overutilized),
+
+ TP_STRUCT__entry(
+ __field( bool, overutilized )
+ ),
+
+ TP_fast_assign(
+ __entry->overutilized = overutilized;
+ ),
+
+ TP_printk("overutilized=%d",
+ __entry->overutilized ? 1 : 0)
+);
+#ifdef CONFIG_SCHED_WALT
+struct rq;
+
+TRACE_EVENT(walt_update_task_ravg,
+ /* Snapshot of task @p's ravg state and @rq's window sums at @wallclock; nt_cs/nt_ps are recorded but not part of the printed format. */
+ TP_PROTO(struct task_struct *p, struct rq *rq, int evt,
+ u64 wallclock, u64 irqtime),
+
+ TP_ARGS(p, rq, evt, wallclock, irqtime),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field( pid_t, cur_pid )
+ __field(unsigned int, cur_freq )
+ __field( u64, wallclock )
+ __field( u64, mark_start )
+ __field( u64, delta_m )
+ __field( u64, win_start )
+ __field( u64, delta )
+ __field( u64, irqtime )
+ __field( int, evt )
+ __field(unsigned int, demand )
+ __field(unsigned int, sum )
+ __field( int, cpu )
+ __field( u64, cs )
+ __field( u64, ps )
+ __field(unsigned long, util )
+ __field( u32, curr_window )
+ __field( u32, prev_window )
+ __field( u64, nt_cs )
+ __field( u64, nt_ps )
+ __field( u32, active_windows )
+ ),
+
+ TP_fast_assign(
+ __entry->wallclock = wallclock;
+ __entry->win_start = rq->window_start;
+ __entry->delta = (wallclock - rq->window_start);
+ __entry->evt = evt;
+ __entry->cpu = rq->cpu;
+ __entry->cur_pid = rq->curr->pid;
+ __entry->cur_freq = rq->cur_freq;
+ memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+ __entry->pid = p->pid;
+ __entry->mark_start = p->ravg.mark_start;
+ __entry->delta_m = (wallclock - p->ravg.mark_start);
+ __entry->demand = p->ravg.demand;
+ __entry->sum = p->ravg.sum;
+ __entry->irqtime = irqtime;
+ __entry->cs = rq->curr_runnable_sum;
+ __entry->ps = rq->prev_runnable_sum;
+ __entry->util = div_u64((u64)rq->prev_runnable_sum << NICE_0_LOAD_SHIFT,
+ walt_ravg_window); /* div_u64(), not do_div(): util is unsigned long, not a 64-bit lvalue on 32-bit builds */
+ __entry->curr_window = p->ravg.curr_window;
+ __entry->prev_window = p->ravg.prev_window;
+ __entry->nt_cs = rq->nt_curr_runnable_sum;
+ __entry->nt_ps = rq->nt_prev_runnable_sum;
+ __entry->active_windows = p->ravg.active_windows;
+ ),
+
+ TP_printk("wc %llu ws %llu delta %llu event %d cpu %d cur_freq %u cur_pid %d task %d (%s) ms %llu delta %llu demand %u sum %u irqtime %llu"
+ " cs %llu ps %llu util %lu cur_window %u prev_window %u active_wins %u"
+ , __entry->wallclock, __entry->win_start, __entry->delta,
+ __entry->evt, __entry->cpu,
+ __entry->cur_freq, __entry->cur_pid,
+ __entry->pid, __entry->comm, __entry->mark_start,
+ __entry->delta_m, __entry->demand,
+ __entry->sum, __entry->irqtime,
+ __entry->cs, __entry->ps, __entry->util,
+ __entry->curr_window, __entry->prev_window,
+ __entry->active_windows
+ )
+);
+
+TRACE_EVENT(walt_update_history,
+
+ TP_PROTO(struct rq *rq, struct task_struct *p, u32 runtime, int samples,
+ int evt),
+
+ TP_ARGS(rq, p, runtime, samples, evt),
+
+ TP_STRUCT__entry(
+ __array( char, comm, TASK_COMM_LEN )
+ __field( pid_t, pid )
+ __field(unsigned int, runtime )
+ __field( int, samples )
+ __field( int, evt )
+ __field( u64, demand )
+ __field( u64, walt_avg )
+ __field(unsigned int, pelt_avg )
+ __array( u32, hist, RAVG_HIST_SIZE_MAX)
+ __field( int, cpu )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+ __entry->pid = p->pid;
+ __entry->runtime = runtime;
+ __entry->samples = samples;
+ __entry->evt = evt;
+ __entry->demand = p->ravg.demand;
+ __entry->walt_avg = (__entry->demand << 10);
+ __entry->walt_avg = div_u64(__entry->walt_avg,
+ walt_ravg_window);
+ __entry->pelt_avg = p->se.avg.util_avg;
+ memcpy(__entry->hist, p->ravg.sum_history,
+ RAVG_HIST_SIZE_MAX * sizeof(u32));
+ __entry->cpu = rq->cpu;
+ ),
+
+ TP_printk("%d (%s): runtime %u samples %d event %d demand %llu"
+ " walt %llu pelt %u (hist: %u %u %u %u %u) cpu %d",
+ __entry->pid, __entry->comm,
+ __entry->runtime, __entry->samples, __entry->evt,
+ __entry->demand,
+ __entry->walt_avg,
+ __entry->pelt_avg,
+ __entry->hist[0], __entry->hist[1],
+ __entry->hist[2], __entry->hist[3],
+ __entry->hist[4], __entry->cpu)
+);
+
+TRACE_EVENT(walt_migration_update_sum,
+
+ TP_PROTO(struct rq *rq, struct task_struct *p),
+
+ TP_ARGS(rq, p),
+
+ TP_STRUCT__entry(
+ __field(int, cpu )
+ __field(int, pid )
+ __field( u64, cs )
+ __field( u64, ps )
+ __field( s64, nt_cs )
+ __field( s64, nt_ps )
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu_of(rq);
+ __entry->cs = rq->curr_runnable_sum;
+ __entry->ps = rq->prev_runnable_sum;
+ __entry->nt_cs = (s64)rq->nt_curr_runnable_sum;
+ __entry->nt_ps = (s64)rq->nt_prev_runnable_sum;
+ __entry->pid = p->pid;
+ ),
+
+ TP_printk("cpu %d: cs %llu ps %llu nt_cs %lld nt_ps %lld pid %d",
+ __entry->cpu, __entry->cs, __entry->ps,
+ __entry->nt_cs, __entry->nt_ps, __entry->pid)
+);
+#endif /* CONFIG_SCHED_WALT */
+
+#endif /* CONFIG_SMP */
+
#endif /* _TRACE_SCHED_H */
/* This part must be outside protection */
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 67d632f1743d..2d078c20abcb 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -92,4 +92,6 @@
#define SO_CNX_ADVICE 53
+#define SO_COOKIE 57
+
#endif /* __ASM_GENERIC_SOCKET_H */
diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h
index 41420e341e75..5539933b3491 100644
--- a/include/uapi/linux/android/binder.h
+++ b/include/uapi/linux/android/binder.h
@@ -33,11 +33,60 @@ enum {
BINDER_TYPE_HANDLE = B_PACK_CHARS('s', 'h', '*', B_TYPE_LARGE),
BINDER_TYPE_WEAK_HANDLE = B_PACK_CHARS('w', 'h', '*', B_TYPE_LARGE),
BINDER_TYPE_FD = B_PACK_CHARS('f', 'd', '*', B_TYPE_LARGE),
+ BINDER_TYPE_FDA = B_PACK_CHARS('f', 'd', 'a', B_TYPE_LARGE),
+ BINDER_TYPE_PTR = B_PACK_CHARS('p', 't', '*', B_TYPE_LARGE),
};
-enum {
+/**
+ * enum flat_binder_object_shifts: shift values for flat_binder_object_flags
+ * @FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT: shift for getting scheduler policy.
+ *
+ */
+enum flat_binder_object_shifts {
+ FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT = 9,
+};
+
+/**
+ * enum flat_binder_object_flags - flags for use in flat_binder_object.flags
+ */
+enum flat_binder_object_flags {
+ /**
+ * @FLAT_BINDER_FLAG_PRIORITY_MASK: bit-mask for min scheduler priority
+ *
+ * These bits can be used to set the minimum scheduler priority
+ * at which transactions into this node should run. Valid values
+ * in these bits depend on the scheduler policy encoded in
+ * @FLAT_BINDER_FLAG_SCHED_POLICY_MASK.
+ *
+ * For SCHED_NORMAL/SCHED_BATCH, the valid range is between [-20..19]
+ * For SCHED_FIFO/SCHED_RR, the value can run between [1..99]
+ */
FLAT_BINDER_FLAG_PRIORITY_MASK = 0xff,
+ /**
+ * @FLAT_BINDER_FLAG_ACCEPTS_FDS: whether the node accepts fds.
+ */
FLAT_BINDER_FLAG_ACCEPTS_FDS = 0x100,
+ /**
+ * @FLAT_BINDER_FLAG_SCHED_POLICY_MASK: bit-mask for scheduling policy
+ *
+ * These two bits can be used to set the min scheduling policy at which
+ * transactions on this node should run. These match the UAPI
+ * scheduler policy values, eg:
+ * 00b: SCHED_NORMAL
+ * 01b: SCHED_FIFO
+ * 10b: SCHED_RR
+ * 11b: SCHED_BATCH
+ */
+ FLAT_BINDER_FLAG_SCHED_POLICY_MASK =
+ 3U << FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT,
+
+ /**
+ * @FLAT_BINDER_FLAG_INHERIT_RT: whether the node inherits RT policy
+ *
+ * Only when set, calls into this node will inherit a real-time
+ * scheduling policy from the caller (for synchronous transactions).
+ */
+ FLAT_BINDER_FLAG_INHERIT_RT = 0x800,
};
#ifdef BINDER_IPC_32BIT
@@ -48,6 +97,14 @@ typedef __u64 binder_size_t;
typedef __u64 binder_uintptr_t;
#endif
+/**
+ * struct binder_object_header - header shared by all binder metadata objects.
+ * @type: type of the object
+ */
+struct binder_object_header {
+ __u32 type;
+};
+
/*
* This is the flattened representation of a Binder object for transfer
* between processes. The 'offsets' supplied as part of a binder transaction
@@ -56,9 +113,8 @@ typedef __u64 binder_uintptr_t;
* between processes.
*/
struct flat_binder_object {
- /* 8 bytes for large_flat_header. */
- __u32 type;
- __u32 flags;
+ struct binder_object_header hdr;
+ __u32 flags;
/* 8 bytes of data. */
union {
@@ -70,6 +126,86 @@ struct flat_binder_object {
binder_uintptr_t cookie;
};
+/**
+ * struct binder_fd_object - describes a filedescriptor to be fixed up.
+ * @hdr: common header structure
+ * @pad_flags: padding to remain compatible with old userspace code
+ * @pad_binder: padding to remain compatible with old userspace code
+ * @fd: file descriptor
+ * @cookie: opaque data, used by user-space
+ */
+struct binder_fd_object {
+ struct binder_object_header hdr;
+ __u32 pad_flags;
+ union {
+ binder_uintptr_t pad_binder;
+ __u32 fd;
+ };
+
+ binder_uintptr_t cookie;
+};
+
+/* struct binder_buffer_object - object describing a userspace buffer
+ * @hdr: common header structure
+ * @flags: one or more BINDER_BUFFER_* flags
+ * @buffer: address of the buffer
+ * @length: length of the buffer
+ * @parent: index in offset array pointing to parent buffer
+ * @parent_offset: offset in @parent pointing to this buffer
+ *
+ * A binder_buffer object represents an object that the
+ * binder kernel driver can copy verbatim to the target
+ * address space. A buffer itself may be pointed to from
+ * within another buffer, meaning that the pointer inside
+ * that other buffer needs to be fixed up as well. This
+ * can be done by setting the BINDER_BUFFER_FLAG_HAS_PARENT
+ * flag in @flags, by setting @parent buffer to the index
+ * in the offset array pointing to the parent binder_buffer_object,
+ * and by setting @parent_offset to the offset in the parent buffer
+ * at which the pointer to this buffer is located.
+ */
+struct binder_buffer_object {
+ struct binder_object_header hdr;
+ __u32 flags;
+ binder_uintptr_t buffer;
+ binder_size_t length;
+ binder_size_t parent;
+ binder_size_t parent_offset;
+};
+
+enum {
+ BINDER_BUFFER_FLAG_HAS_PARENT = 0x01,
+};
+
+/* struct binder_fd_array_object - object describing an array of fds in a buffer
+ * @hdr: common header structure
+ * @pad: padding to ensure correct alignment
+ * @num_fds: number of file descriptors in the buffer
+ * @parent: index in offset array to buffer holding the fd array
+ * @parent_offset: start offset of fd array in the buffer
+ *
+ * A binder_fd_array object represents an array of file
+ * descriptors embedded in a binder_buffer_object. It is
+ * different from a regular binder_buffer_object because it
+ * describes a list of file descriptors to fix up, not an opaque
+ * blob of memory, and hence the kernel needs to treat it differently.
+ *
+ * An example of how this would be used is with Android's
+ * native_handle_t object, which is a struct with a list of integers
+ * and a list of file descriptors. The native_handle_t struct itself
+ * will be represented by a struct binder_buffer_object, whereas the
+ * embedded list of file descriptors is represented by a
+ * struct binder_fd_array_object with that binder_buffer_object as
+ * a parent.
+ */
+struct binder_fd_array_object {
+ struct binder_object_header hdr;
+ __u32 pad;
+ binder_size_t num_fds;
+ binder_size_t parent;
+ binder_size_t parent_offset;
+};
+
/*
* On 64-bit platforms where user code may run in 32-bits the driver must
* translate the buffer (and local binder) addresses appropriately.
@@ -97,6 +233,19 @@ struct binder_version {
#define BINDER_CURRENT_PROTOCOL_VERSION 8
#endif
+/*
+ * Use with BINDER_GET_NODE_DEBUG_INFO, driver reads ptr, writes to all fields.
+ * Set ptr to NULL for the first call to get the info for the first node, and
+ * then repeat the call passing the previously returned value to get the next
+ * nodes. ptr will be 0 when there are no more nodes.
+ */
+struct binder_node_debug_info {
+ binder_uintptr_t ptr;
+ binder_uintptr_t cookie;
+ __u32 has_strong_ref;
+ __u32 has_weak_ref;
+};
+
#define BINDER_WRITE_READ _IOWR('b', 1, struct binder_write_read)
#define BINDER_SET_IDLE_TIMEOUT _IOW('b', 3, __s64)
#define BINDER_SET_MAX_THREADS _IOW('b', 5, __u32)
@@ -104,6 +253,7 @@ struct binder_version {
#define BINDER_SET_CONTEXT_MGR _IOW('b', 7, __s32)
#define BINDER_THREAD_EXIT _IOW('b', 8, __s32)
#define BINDER_VERSION _IOWR('b', 9, struct binder_version)
+#define BINDER_GET_NODE_DEBUG_INFO _IOWR('b', 11, struct binder_node_debug_info)
/*
* NOTE: Two special error codes you should check for when calling
@@ -162,6 +312,11 @@ struct binder_transaction_data {
} data;
};
+struct binder_transaction_data_sg {
+ struct binder_transaction_data transaction_data;
+ binder_size_t buffers_size;
+};
+
struct binder_ptr_cookie {
binder_uintptr_t ptr;
binder_uintptr_t cookie;
@@ -346,6 +501,12 @@ enum binder_driver_command_protocol {
/*
* void *: cookie
*/
+
+ BC_TRANSACTION_SG = _IOW('c', 17, struct binder_transaction_data_sg),
+ BC_REPLY_SG = _IOW('c', 18, struct binder_transaction_data_sg),
+ /*
+ * binder_transaction_data_sg: the sent command.
+ */
};
#endif /* _UAPI_LINUX_BINDER_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f09c70b97eca..b2d5be9fc909 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -73,6 +73,8 @@ enum bpf_cmd {
BPF_PROG_LOAD,
BPF_OBJ_PIN,
BPF_OBJ_GET,
+ BPF_PROG_ATTACH,
+ BPF_PROG_DETACH,
};
enum bpf_map_type {
@@ -96,8 +98,23 @@ enum bpf_prog_type {
BPF_PROG_TYPE_TRACEPOINT,
BPF_PROG_TYPE_XDP,
BPF_PROG_TYPE_PERF_EVENT,
+ BPF_PROG_TYPE_CGROUP_SKB,
};
+enum bpf_attach_type {
+ BPF_CGROUP_INET_INGRESS,
+ BPF_CGROUP_INET_EGRESS,
+ __MAX_BPF_ATTACH_TYPE
+};
+
+#define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE
+
+/* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command
+ * to the given target_fd cgroup the descendent cgroup will be able to
+ * override effective bpf program that was inherited from this cgroup
+ */
+#define BPF_F_ALLOW_OVERRIDE (1U << 0)
+
#define BPF_PSEUDO_MAP_FD 1
/* flags for BPF_MAP_UPDATE_ELEM command */
@@ -141,6 +158,13 @@ union bpf_attr {
__aligned_u64 pathname;
__u32 bpf_fd;
};
+
+ struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */
+ __u32 target_fd; /* container object to attach to */
+ __u32 attach_bpf_fd; /* eBPF program to attach */
+ __u32 attach_type;
+ __u32 attach_flags;
+ };
} __attribute__((aligned(8)));
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -426,6 +450,67 @@ enum bpf_func_id {
*/
BPF_FUNC_set_hash_invalid,
+ /**
+ * int bpf_get_numa_node_id()
+ * Return: Id of current NUMA node.
+ */
+ BPF_FUNC_get_numa_node_id,
+
+ /**
+ * int bpf_skb_change_head()
+ * Grows headroom of skb and adjusts MAC header offset accordingly.
+ * Will extend/reallocate as required automatically.
+ * May change skb data pointer and will thus invalidate any check
+ * performed for direct packet access.
+ * @skb: pointer to skb
+ * @len: length of header to be pushed in front
+ * @flags: Flags (unused for now)
+ * Return: 0 on success or negative error
+ */
+ BPF_FUNC_skb_change_head,
+
+ /**
+ * int bpf_xdp_adjust_head(xdp_md, delta)
+ * Adjust the xdp_md.data by delta
+ * @xdp_md: pointer to xdp_md
+ * @delta: A positive/negative integer to be added to xdp_md.data
+ * Return: 0 on success or negative on error
+ */
+ BPF_FUNC_xdp_adjust_head,
+
+ /**
+ * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr)
+ * Copy a NUL terminated string from unsafe address. In case the string
+ * length is smaller than size, the target is not padded with further NUL
+ * bytes. In case the string length is larger than size, just size-1
+ * bytes are copied and the last byte is set to NUL.
+ * @dst: destination address
+ * @size: maximum number of bytes to copy, including the trailing NUL
+ * @unsafe_ptr: unsafe address
+ * Return:
+ * > 0 length of the string including the trailing NUL on success
+ * < 0 error
+ */
+ BPF_FUNC_probe_read_str,
+
+ /**
+ * u64 bpf_get_socket_cookie(skb)
+ * Get the cookie for the socket stored inside sk_buff.
+ * @skb: pointer to skb
+ * Return: 8 Bytes non-decreasing number on success or 0 if the socket
+ * field is missing inside sk_buff
+ */
+ BPF_FUNC_get_socket_cookie,
+
+ /**
+ * u32 bpf_get_socket_uid(skb)
+ * Get the owner uid of the socket stored inside sk_buff.
+ * @skb: pointer to skb
+ * Return: uid of the socket owner on success or 0 if the socket pointer
+ * inside sk_buff is NULL
+ */
+ BPF_FUNC_get_socket_uid,
+
__BPF_FUNC_MAX_ID,
};
diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h
index 14404b3ebb89..bbf02a63a011 100644
--- a/include/uapi/linux/fib_rules.h
+++ b/include/uapi/linux/fib_rules.h
@@ -29,6 +29,11 @@ struct fib_rule_hdr {
__u32 flags;
};
+struct fib_rule_uid_range {
+ __u32 start;
+ __u32 end;
+};
+
enum {
FRA_UNSPEC,
FRA_DST, /* destination address */
@@ -51,6 +56,7 @@ enum {
FRA_OIFNAME,
FRA_PAD,
FRA_L3MDEV, /* iif or oif is l3mdev goto its table */
+ FRA_UID_RANGE, /* UID range */
__FRA_MAX
};
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 474995568f35..11c030782426 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -237,6 +237,8 @@ struct fsxattr {
#define FICLONERANGE _IOW(0x94, 13, struct file_clone_range)
#define FIDEDUPERANGE _IOWR(0x94, 54, struct file_dedupe_range)
+#define FIDTRIM _IOWR('f', 128, struct fstrim_range) /* Deep discard trim */
+
#define FS_IOC_GETFLAGS _IOR('f', 1, long)
#define FS_IOC_SETFLAGS _IOW('f', 2, long)
#define FS_IOC_GETVERSION _IOR('v', 1, long)
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 42fa977e3b14..093237817ed0 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -375,6 +375,7 @@ enum fuse_opcode {
FUSE_READDIRPLUS = 44,
FUSE_RENAME2 = 45,
FUSE_LSEEK = 46,
+ FUSE_CANONICAL_PATH= 2016,
/* CUSE specific operations */
CUSE_INIT = 4096,
diff --git a/include/uapi/linux/hw_breakpoint.h b/include/uapi/linux/hw_breakpoint.h
index b04000a2296a..2b65efd19a46 100644
--- a/include/uapi/linux/hw_breakpoint.h
+++ b/include/uapi/linux/hw_breakpoint.h
@@ -4,7 +4,11 @@
enum {
HW_BREAKPOINT_LEN_1 = 1,
HW_BREAKPOINT_LEN_2 = 2,
+ HW_BREAKPOINT_LEN_3 = 3,
HW_BREAKPOINT_LEN_4 = 4,
+ HW_BREAKPOINT_LEN_5 = 5,
+ HW_BREAKPOINT_LEN_6 = 6,
+ HW_BREAKPOINT_LEN_7 = 7,
HW_BREAKPOINT_LEN_8 = 8,
};
diff --git a/include/uapi/linux/if_pppolac.h b/include/uapi/linux/if_pppolac.h
new file mode 100644
index 000000000000..b7eb8153ef66
--- /dev/null
+++ b/include/uapi/linux/if_pppolac.h
@@ -0,0 +1,33 @@
+/* include/uapi/linux/if_pppolac.h
+ *
+ * Header for PPP on L2TP Access Concentrator / PPPoLAC Socket (RFC 2661)
+ *
+ * Copyright (C) 2009 Google, Inc.
+ * Author: Chia-chi Yeh <chiachi@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _UAPI_LINUX_IF_PPPOLAC_H
+#define _UAPI_LINUX_IF_PPPOLAC_H
+
+#include <linux/socket.h>
+#include <linux/types.h>
+
+struct sockaddr_pppolac { /* socket address for PPPoLAC (PPP on L2TP Access Concentrator) sockets; packed, so no padding between members */
+ sa_family_t sa_family; /* AF_PPPOX */
+ unsigned int sa_protocol; /* PX_PROTO_OLAC */
+ int udp_socket; /* presumably the fd of the UDP socket carrying the L2TP traffic -- TODO confirm against the driver */
+ struct __attribute__((packed)) {
+ __u16 tunnel, session; /* presumably L2TP tunnel and session IDs; byte order is not visible here -- confirm */
+ } local, remote; /* one tunnel/session pair for each end */
+} __attribute__((packed));
+
+#endif /* _UAPI_LINUX_IF_PPPOLAC_H */
diff --git a/include/uapi/linux/if_pppopns.h b/include/uapi/linux/if_pppopns.h
new file mode 100644
index 000000000000..a392b52ea6ec
--- /dev/null
+++ b/include/uapi/linux/if_pppopns.h
@@ -0,0 +1,32 @@
+/* include/uapi/linux/if_pppopns.h
+ *
+ * Header for PPP on PPTP Network Server / PPPoPNS Socket (RFC 2637)
+ *
+ * Copyright (C) 2009 Google, Inc.
+ * Author: Chia-chi Yeh <chiachi@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _UAPI_LINUX_IF_PPPOPNS_H
+#define _UAPI_LINUX_IF_PPPOPNS_H
+
+#include <linux/socket.h>
+#include <linux/types.h>
+
+struct sockaddr_pppopns { /* socket address for PPPoPNS (PPP on PPTP Network Server) sockets; packed, so no padding between members */
+ sa_family_t sa_family; /* AF_PPPOX */
+ unsigned int sa_protocol; /* PX_PROTO_OPNS */
+ int tcp_socket; /* presumably the fd of the TCP socket of the PPTP control connection -- TODO confirm against the driver */
+ __u16 local; /* NOTE(review): presumably the local PPTP call ID -- confirm */
+ __u16 remote; /* NOTE(review): presumably the peer's PPTP call ID -- confirm */
+} __attribute__((packed));
+
+#endif /* _UAPI_LINUX_IF_PPPOPNS_H */
diff --git a/include/uapi/linux/if_pppox.h b/include/uapi/linux/if_pppox.h
index d37bbb17a007..6aad18a517d3 100644
--- a/include/uapi/linux/if_pppox.h
+++ b/include/uapi/linux/if_pppox.h
@@ -24,6 +24,8 @@
#include <linux/if.h>
#include <linux/if_ether.h>
#include <linux/if_pppol2tp.h>
+#include <linux/if_pppolac.h>
+#include <linux/if_pppopns.h>
#include <linux/in.h>
#include <linux/in6.h>
@@ -59,7 +61,9 @@ struct pptp_addr {
#define PX_PROTO_OE 0 /* Currently just PPPoE */
#define PX_PROTO_OL2TP 1 /* Now L2TP also */
#define PX_PROTO_PPTP 2
-#define PX_MAX_PROTO 3
+#define PX_PROTO_OLAC 3
+#define PX_PROTO_OPNS 4
+#define PX_MAX_PROTO 5
struct sockaddr_pppox {
__kernel_sa_family_t sa_family; /* address family, AF_PPPOX */
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index 8c2772340c3f..c462f1dc175e 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -164,6 +164,7 @@ enum {
DEVCONF_ACCEPT_DAD,
DEVCONF_FORCE_TLLAO,
DEVCONF_NDISC_NOTIFY,
+ DEVCONF_ACCEPT_RA_RT_TABLE,
DEVCONF_MLDV1_UNSOLICITED_REPORT_INTERVAL,
DEVCONF_MLDV2_UNSOLICITED_REPORT_INTERVAL,
DEVCONF_SUPPRESS_FRAG_NDISC,
@@ -178,6 +179,12 @@ enum {
DEVCONF_DROP_UNSOLICITED_NA,
DEVCONF_KEEP_ADDR_ON_DOWN,
DEVCONF_RTR_SOLICIT_MAX_INTERVAL,
+ DEVCONF_SEG6_ENABLED,
+ DEVCONF_SEG6_REQUIRE_HMAC,
+ DEVCONF_ENHANCED_DAD,
+ DEVCONF_ADDR_GEN_MODE,
+ DEVCONF_DISABLE_POLICY,
+ DEVCONF_ACCEPT_RA_RT_INFO_MIN_PLEN,
DEVCONF_MAX
};
diff --git a/include/uapi/linux/keychord.h b/include/uapi/linux/keychord.h
new file mode 100644
index 000000000000..ea7cf4d27bbd
--- /dev/null
+++ b/include/uapi/linux/keychord.h
@@ -0,0 +1,52 @@
+/*
+ * Key chord input driver
+ *
+ * Copyright (C) 2008 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+*/
+
+#ifndef _UAPI_LINUX_KEYCHORD_H_
+#define _UAPI_LINUX_KEYCHORD_H_
+
+#include <linux/input.h>
+
+#define KEYCHORD_VERSION 1
+
+/*
+ * One or more input_keychord structs are written to /dev/keychord
+ * at once to specify the list of keychords to monitor.
+ * Reading /dev/keychord returns the id of a keychord when the
+ * keychord combination is pressed. A keychord is signalled when
+ * all of the keys in the keycode list are in the pressed state.
+ * The order in which the keys are pressed does not matter.
+ * The keychord will not be signalled if keys not in the keycode
+ * list are pressed.
+ * Keychords will not be signalled on key release events.
+ */
+struct input_keychord { /* one variable-length keychord record as written to /dev/keychord */
+ /* should be KEYCHORD_VERSION (currently 1) */
+ __u16 version;
+ /*
+ * client specified ID, returned from read()
+ * when this keychord is pressed.
+ */
+ __u16 id;
+
+ /* number of keycodes in this keychord, i.e. entries in keycodes[] */
+ __u16 count;
+
+ /* variable length array of keycodes (flexible array member, so it adds nothing to sizeof) */
+ __u16 keycodes[];
+};
+
+#endif /* _UAPI_LINUX_KEYCHORD_H_ */
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index 9bd559472c92..bd017699420e 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -53,6 +53,8 @@
#define REISER2FS_SUPER_MAGIC_STRING "ReIsEr2Fs"
#define REISER2FS_JR_SUPER_MAGIC_STRING "ReIsEr3Fs"
+#define SDCARDFS_SUPER_MAGIC 0x5dca2df5
+
#define SMB_SUPER_MAGIC 0x517B
#define CGROUP_SUPER_MAGIC 0x27e0eb
#define CGROUP2_SUPER_MAGIC 0x63677270
diff --git a/include/uapi/linux/netfilter/xt_IDLETIMER.h b/include/uapi/linux/netfilter/xt_IDLETIMER.h
index 208ae9387331..faaa28b3d061 100644
--- a/include/uapi/linux/netfilter/xt_IDLETIMER.h
+++ b/include/uapi/linux/netfilter/xt_IDLETIMER.h
@@ -4,6 +4,7 @@
* Header file for Xtables timer target module.
*
* Copyright (C) 2004, 2010 Nokia Corporation
+ *
* Written by Timo Teras <ext-timo.teras@nokia.com>
*
* Converted to x_tables and forward-ported to 2.6.34
@@ -32,12 +33,19 @@
#include <linux/types.h>
#define MAX_IDLETIMER_LABEL_SIZE 28
+#define NLMSG_MAX_SIZE 64
+
+#define NL_EVENT_TYPE_INACTIVE 0
+#define NL_EVENT_TYPE_ACTIVE 1
struct idletimer_tg_info {
__u32 timeout;
char label[MAX_IDLETIMER_LABEL_SIZE];
+ /* Use netlink messages for notification in addition to sysfs */
+ __u8 send_nl_msg;
+
/* for kernel module internal use only */
struct idletimer_tg *timer __attribute__((aligned(8)));
};
diff --git a/include/uapi/linux/netfilter/xt_socket.h b/include/uapi/linux/netfilter/xt_socket.h
index 87644f832494..7f00df6cd897 100644
--- a/include/uapi/linux/netfilter/xt_socket.h
+++ b/include/uapi/linux/netfilter/xt_socket.h
@@ -26,4 +26,11 @@ struct xt_socket_mtinfo3 {
| XT_SOCKET_NOWILDCARD \
| XT_SOCKET_RESTORESKMARK)
+struct sock *xt_socket_lookup_slow_v4(struct net *net,
+ const struct sk_buff *skb,
+ const struct net_device *indev);
+struct sock *xt_socket_lookup_slow_v6(struct net *net,
+ const struct sk_buff *skb,
+ const struct net_device *indev);
+
#endif /* _XT_SOCKET_H */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 64776b72e1eb..8b8a5e9934af 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -209,4 +209,7 @@ struct prctl_mm_map {
# define PR_SPEC_DISABLE (1UL << 2)
# define PR_SPEC_FORCE_DISABLE (1UL << 3)
+#define PR_SET_VMA 0x53564d41
+# define PR_SET_VMA_ANON_NAME 0
+
#endif /* _LINUX_PRCTL_H */
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 5a78be518101..e14377f2ec27 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -318,6 +318,7 @@ enum rtattr_type_t {
RTA_ENCAP,
RTA_EXPIRES,
RTA_PAD,
+ RTA_UID,
__RTA_MAX
};
diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h
index d2b12152e358..e13d48058b8d 100644
--- a/include/uapi/linux/sysctl.h
+++ b/include/uapi/linux/sysctl.h
@@ -568,6 +568,7 @@ enum {
NET_IPV6_PROXY_NDP=23,
NET_IPV6_ACCEPT_SOURCE_ROUTE=25,
NET_IPV6_ACCEPT_RA_FROM_LOCAL=26,
+ NET_IPV6_ACCEPT_RA_RT_INFO_MIN_PLEN=27,
__NET_IPV6_MAX
};
diff --git a/include/uapi/linux/usb/f_accessory.h b/include/uapi/linux/usb/f_accessory.h
new file mode 100644
index 000000000000..0baeb7d0d74c
--- /dev/null
+++ b/include/uapi/linux/usb/f_accessory.h
@@ -0,0 +1,146 @@
+/*
+ * Gadget Function Driver for Android USB accessories
+ *
+ * Copyright (C) 2011 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _UAPI_LINUX_USB_F_ACCESSORY_H
+#define _UAPI_LINUX_USB_F_ACCESSORY_H
+
+/* Use Google Vendor ID when in accessory mode */
+#define USB_ACCESSORY_VENDOR_ID 0x18D1
+
+
+/* Product ID to use when in accessory mode */
+#define USB_ACCESSORY_PRODUCT_ID 0x2D00
+
+/* Product ID to use when in accessory mode and adb is enabled */
+#define USB_ACCESSORY_ADB_PRODUCT_ID 0x2D01
+
+/* Indexes for strings sent by the host via ACCESSORY_SEND_STRING */
+#define ACCESSORY_STRING_MANUFACTURER 0
+#define ACCESSORY_STRING_MODEL 1
+#define ACCESSORY_STRING_DESCRIPTION 2
+#define ACCESSORY_STRING_VERSION 3
+#define ACCESSORY_STRING_URI 4
+#define ACCESSORY_STRING_SERIAL 5
+
+/* Control request for retrieving device's protocol version
+ *
+ * requestType: USB_DIR_IN | USB_TYPE_VENDOR
+ * request: ACCESSORY_GET_PROTOCOL
+ * value: 0
+ * index: 0
+ * data version number (16 bits little endian)
+ * 1 for original accessory support
+ * 2 adds HID and device to host audio support
+ */
+#define ACCESSORY_GET_PROTOCOL 51
+
+/* Control request for host to send a string to the device
+ *
+ * requestType: USB_DIR_OUT | USB_TYPE_VENDOR
+ * request: ACCESSORY_SEND_STRING
+ * value: 0
+ * index: string ID
+ * data zero terminated UTF8 string
+ *
+ * The device can later retrieve these strings via the
+ * ACCESSORY_GET_STRING_* ioctls
+ */
+#define ACCESSORY_SEND_STRING 52
+
+/* Control request for starting device in accessory mode.
+ * The host sends this after setting all its strings to the device.
+ *
+ * requestType: USB_DIR_OUT | USB_TYPE_VENDOR
+ * request: ACCESSORY_START
+ * value: 0
+ * index: 0
+ * data none
+ */
+#define ACCESSORY_START 53
+
+/* Control request for registering a HID device.
+ * Upon registering, a unique ID is sent by the accessory in the
+ * value parameter. This ID will be used for future commands for
+ * the device
+ *
+ * requestType: USB_DIR_OUT | USB_TYPE_VENDOR
+ * request: ACCESSORY_REGISTER_HID
+ * value: Accessory assigned ID for the HID device
+ * index: total length of the HID report descriptor
+ * data none
+ */
+#define ACCESSORY_REGISTER_HID 54
+
+/* Control request for unregistering a HID device.
+ *
+ * requestType: USB_DIR_OUT | USB_TYPE_VENDOR
+ * request: ACCESSORY_UNREGISTER_HID
+ * value: Accessory assigned ID for the HID device
+ * index: 0
+ * data none
+ */
+#define ACCESSORY_UNREGISTER_HID 55
+
+/* Control request for sending the HID report descriptor.
+ * If the HID descriptor is longer than the endpoint zero max packet size,
+ * the descriptor will be sent in multiple ACCESSORY_SET_HID_REPORT_DESC
+ * commands. The data for the descriptor must be sent sequentially
+ * if multiple packets are needed.
+ *
+ * requestType: USB_DIR_OUT | USB_TYPE_VENDOR
+ * request: ACCESSORY_SET_HID_REPORT_DESC
+ * value: Accessory assigned ID for the HID device
+ * index: offset of data in descriptor
+ * (needed when HID descriptor is too big for one packet)
+ * data the HID report descriptor
+ */
+#define ACCESSORY_SET_HID_REPORT_DESC 56
+
+/* Control request for sending HID events.
+ *
+ * requestType: USB_DIR_OUT | USB_TYPE_VENDOR
+ * request: ACCESSORY_SEND_HID_EVENT
+ * value: Accessory assigned ID for the HID device
+ * index: 0
+ * data the HID report for the event
+ */
+#define ACCESSORY_SEND_HID_EVENT 57
+
+/* Control request for setting the audio mode.
+ *
+ * requestType: USB_DIR_OUT | USB_TYPE_VENDOR
+ * request: ACCESSORY_SET_AUDIO_MODE
+ * value: 0 - no audio
+ * 1 - device to host, 44100 16-bit stereo PCM
+ * index: 0
+ * data none
+ */
+#define ACCESSORY_SET_AUDIO_MODE 58
+
+/* ioctls for retrieving strings set by the host */
+#define ACCESSORY_GET_STRING_MANUFACTURER _IOW('M', 1, char[256])
+#define ACCESSORY_GET_STRING_MODEL _IOW('M', 2, char[256])
+#define ACCESSORY_GET_STRING_DESCRIPTION _IOW('M', 3, char[256])
+#define ACCESSORY_GET_STRING_VERSION _IOW('M', 4, char[256])
+#define ACCESSORY_GET_STRING_URI _IOW('M', 5, char[256])
+#define ACCESSORY_GET_STRING_SERIAL _IOW('M', 6, char[256])
+/* returns 1 if there is a start request pending */
+#define ACCESSORY_IS_START_REQUESTED _IO('M', 7)
+/* returns audio mode (set via the ACCESSORY_SET_AUDIO_MODE control request) */
+#define ACCESSORY_GET_AUDIO_MODE _IO('M', 8)
+
+#endif /* _UAPI_LINUX_USB_F_ACCESSORY_H */
diff --git a/include/uapi/linux/usb/f_mtp.h b/include/uapi/linux/usb/f_mtp.h
new file mode 100644
index 000000000000..503291855abd
--- /dev/null
+++ b/include/uapi/linux/usb/f_mtp.h
@@ -0,0 +1,61 @@
+/*
+ * Gadget Function Driver for MTP
+ *
+ * Copyright (C) 2010 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _UAPI_LINUX_USB_F_MTP_H
+#define _UAPI_LINUX_USB_F_MTP_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+struct mtp_file_range { /* argument of the MTP_SEND_FILE, MTP_RECEIVE_FILE and MTP_SEND_FILE_WITH_HEADER ioctls */
+ /* file descriptor for file to transfer */
+ int fd;
+ /* offset in file for start of transfer */
+ loff_t offset;
+ /* number of bytes to transfer */
+ int64_t length;
+ /* MTP command ID for data header,
+ * used only for MTP_SEND_FILE_WITH_HEADER
+ */
+ uint16_t command;
+ /* MTP transaction ID for data header,
+ * used only for MTP_SEND_FILE_WITH_HEADER
+ */
+ uint32_t transaction_id;
+}; /* NOTE(review): uses loff_t/int64_t/uintN_t rather than the __s64/__uN names; confirm these resolve for userspace includers of this header */
+
+struct mtp_event { /* argument of the MTP_SEND_EVENT ioctl */
+ /* size of the event */
+ size_t length; /* NOTE(review): size_t width depends on the caller's ABI (32- vs 64-bit) -- confirm compat handling */
+ /* event data to send */
+ void *data; /* pointer width likewise depends on the caller's ABI */
+};
+
+/* Sends the specified file range to the host */
+#define MTP_SEND_FILE _IOW('M', 0, struct mtp_file_range)
+/* Receives data from the host and writes it to a file.
+ * The file is created if it does not exist.
+ */
+#define MTP_RECEIVE_FILE _IOW('M', 1, struct mtp_file_range)
+/* Sends an event to the host via the interrupt endpoint */
+#define MTP_SEND_EVENT _IOW('M', 3, struct mtp_event)
+/* Sends the specified file range to the host,
+ * with a 12 byte MTP data packet header at the beginning.
+ */
+#define MTP_SEND_FILE_WITH_HEADER _IOW('M', 4, struct mtp_file_range)
+
+#endif /* _UAPI_LINUX_USB_F_MTP_H */
diff --git a/init/Kconfig b/init/Kconfig
index b331feeabda4..544d7910d89b 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -400,6 +400,15 @@ config IRQ_TIME_ACCOUNTING
If in doubt, say N here.
+config SCHED_WALT
+ bool "Support window based load tracking"
+ depends on SMP
+ help
+ This feature will allow the scheduler to maintain a tunable window
+ based set of metrics for tasks and runqueues. These metrics can be
+ used to guide task placement as well as task frequency requirements
+ for cpufreq governors.
+
config BSD_PROCESS_ACCT
bool "BSD Process Accounting"
depends on MULTIUSER
@@ -971,6 +980,82 @@ menuconfig CGROUPS
if CGROUPS
+config CGROUP_DEBUG
+ bool "Example debug cgroup subsystem"
+ default n
+ help
+ This option enables a simple cgroup subsystem that
+ exports useful debugging information about the cgroups
+ framework.
+
+ Say N if unsure.
+
+config CGROUP_FREEZER
+ bool "Freezer cgroup subsystem"
+ help
+ Provides a way to freeze and unfreeze all tasks in a
+ cgroup.
+
+config CGROUP_PIDS
+ bool "PIDs cgroup subsystem"
+ help
+ Provides enforcement of process number limits in the scope of a
+ cgroup. Any attempt to fork more processes than is allowed in the
+ cgroup will fail. PIDs are fundamentally a global resource because it
+ is fairly trivial to reach PID exhaustion before you reach even a
+ conservative kmemcg limit. As a result, it is possible to grind a
+ system to halt without being limited by other cgroup policies. The
+ PIDs cgroup subsystem is designed to stop this from happening.
+
+ It should be noted that organisational operations (such as attaching
+ to a cgroup hierarchy) will *not* be blocked by the PIDs subsystem,
+ since the PIDs limit only affects a process's ability to fork, not to
+ attach to a cgroup.
+
+config CGROUP_DEVICE
+ bool "Device controller for cgroups"
+ help
+ Provides a cgroup implementing whitelists for devices which
+ a process in the cgroup can mknod or open.
+
+config CPUSETS
+ bool "Cpuset support"
+ help
+ This option will let you create and manage CPUSETs which
+ allow dynamically partitioning a system into sets of CPUs and
+ Memory Nodes and assigning tasks to run only within those sets.
+ This is primarily useful on large SMP or NUMA systems.
+
+ Say N if unsure.
+
+config PROC_PID_CPUSET
+ bool "Include legacy /proc/<pid>/cpuset file"
+ depends on CPUSETS
+ default y
+
+config CGROUP_CPUACCT
+ bool "Simple CPU accounting cgroup subsystem"
+ help
+ Provides a simple Resource Controller for monitoring the
+ total CPU consumed by the tasks in a cgroup.
+
+config CGROUP_SCHEDTUNE
+ bool "CFS tasks boosting cgroup subsystem (EXPERIMENTAL)"
+ depends on SCHED_TUNE
+ help
+ This option provides the "schedtune" controller which improves the
+ flexibility of the task boosting mechanism by introducing the support
+ to define "per task" boost values.
+
+ This new controller:
+ 1. allows only a two-layer hierarchy, where the root defines the
+ system-wide boost value and each of its direct children defines a
+ different "class of tasks" to be boosted with a different value
+ 2. supports up to 16 different task classes, each of which can be
+ configured with a different boost value
+
+ Say N if unsure.
+
config PAGE_COUNTER
bool
@@ -1154,6 +1239,19 @@ config CGROUP_PERF
Say N if unsure.
+config CGROUP_BPF
+ bool "Support for eBPF programs attached to cgroups"
+ depends on BPF_SYSCALL
+ select SOCK_CGROUP_DATA
+ help
+ Allow attaching eBPF programs to a cgroup using the bpf(2)
+ syscall command BPF_PROG_ATTACH.
+
+ In which context these programs are accessed depends on the type
+ of attachment. For instance, programs that are attached using
+ BPF_CGROUP_INET_INGRESS will be executed on the ingress path of
+ inet sockets.
+
config CGROUP_DEBUG
bool "Example controller"
default n
@@ -1163,6 +1261,10 @@ config CGROUP_DEBUG
Say N.
+config SOCK_CGROUP_DATA
+ bool
+ default n
+
endif # CGROUPS
config CHECKPOINT_RESTORE
@@ -1248,6 +1350,43 @@ config SCHED_AUTOGROUP
desktop applications. Task group autogeneration is currently based
upon task session.
+config SCHED_TUNE
+ bool "Boosting for CFS tasks (EXPERIMENTAL)"
+ depends on SMP
+ help
+ This option enables the system-wide support for task boosting.
+ When this support is enabled a new sysctl interface is exposed to
+ userspace via:
+ /proc/sys/kernel/sched_cfs_boost
+ which allows setting a system-wide boost value in range [0..100].
+
+ The current boosting strategy is implemented in such a way that:
+ - a 0% boost value requires to operate in "standard" mode by
+ scheduling all tasks at the minimum capacities required by their
+ workload demand
+ - a 100% boost value requires to push at maximum the task
+ performances, "regardless" of the incurred energy consumption
+
+ A boost value in between these two boundaries is used to bias the
+ power/performance trade-off, the higher the boost value the more the
+ scheduler is biased toward performance boosting instead of energy
+ efficiency.
+
+ Since this support exposes a single system-wide knob, the specified
+ boost value is applied to all (CFS) tasks in the system.
+
+ If unsure, say N.
+
+config DEFAULT_USE_ENERGY_AWARE
+ bool "Default to enabling the Energy Aware Scheduler feature"
+ default n
+ help
+ This option defaults the ENERGY_AWARE scheduling feature to true,
+ as without SCHED_DEBUG set this feature can't be enabled or disabled
+ via sysctl.
+
+ Say N if unsure.
+
config SYSFS_DEPRECATED
bool "Enable deprecated sysfs features to support old userspace tools"
depends on SYSFS
diff --git a/init/Makefile b/init/Makefile
index c4fb45525d08..d210b235c5d7 100644
--- a/init/Makefile
+++ b/init/Makefile
@@ -5,11 +5,8 @@
ccflags-y := -fno-function-sections -fno-data-sections
obj-y := main.o version.o mounts.o
-ifneq ($(CONFIG_BLK_DEV_INITRD),y)
obj-y += noinitramfs.o
-else
obj-$(CONFIG_BLK_DEV_INITRD) += initramfs.o
-endif
obj-$(CONFIG_GENERIC_CALIBRATE_DELAY) += calibrate.o
ifneq ($(CONFIG_ARCH_INIT_TASK),y)
@@ -20,6 +17,7 @@ mounts-y := do_mounts.o
mounts-$(CONFIG_BLK_DEV_RAM) += do_mounts_rd.o
mounts-$(CONFIG_BLK_DEV_INITRD) += do_mounts_initrd.o
mounts-$(CONFIG_BLK_DEV_MD) += do_mounts_md.o
+mounts-$(CONFIG_BLK_DEV_DM) += do_mounts_dm.o
# dependencies on generated files need to be listed explicitly
$(obj)/version.o: include/generated/compile.h
diff --git a/init/do_mounts.c b/init/do_mounts.c
index dea5de95c2dd..1902a1c80831 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -566,6 +566,7 @@ void __init prepare_namespace(void)
wait_for_device_probe();
md_run_setup();
+ dm_run_setup();
if (saved_root_name[0]) {
root_device_name = saved_root_name;
diff --git a/init/do_mounts.h b/init/do_mounts.h
index 067af1d9e8b6..ecb275782c03 100644
--- a/init/do_mounts.h
+++ b/init/do_mounts.h
@@ -74,3 +74,13 @@ void md_run_setup(void);
static inline void md_run_setup(void) {}
#endif
+
+#ifdef CONFIG_BLK_DEV_DM
+
+void dm_run_setup(void);
+
+#else
+
+static inline void dm_run_setup(void) {}
+
+#endif
diff --git a/init/do_mounts_dm.c b/init/do_mounts_dm.c
new file mode 100644
index 000000000000..af84b01ccfbc
--- /dev/null
+++ b/init/do_mounts_dm.c
@@ -0,0 +1,470 @@
+/* do_mounts_dm.c
+ * Copyright (C) 2010 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ * All Rights Reserved.
+ * Based on do_mounts_md.c
+ *
+ * This file is released under the GPL.
+ */
+#include <linux/async.h>
+#include <linux/ctype.h>
+#include <linux/device-mapper.h>
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/delay.h>
+
+#include "do_mounts.h"
+
+#define DM_MAX_DEVICES 256
+#define DM_MAX_TARGETS 256
+#define DM_MAX_NAME 32
+#define DM_MAX_UUID 129
+#define DM_NO_UUID "none"
+
+#define DM_MSG_PREFIX "init"
+
+/* Separators used for parsing the dm= argument. */
+#define DM_FIELD_SEP " "
+#define DM_LINE_SEP ","
+#define DM_ANY_SEP DM_FIELD_SEP DM_LINE_SEP
+
+/*
+ * When the device-mapper and any targets are compiled into the kernel
+ * (not a module), one or more device-mappers may be created and used
+ * as the root device at boot time with the parameters given with the
+ * boot line dm=...
+ *
+ * Multiple device-mappers can be stacked by specifying the number of
+ * devices. A device can have multiple targets if the number of
+ * targets is specified.
+ *
+ * TODO(taysom:defect 32847)
+ * In the future, the <num> field will be mandatory.
+ *
+ * <device> ::= [<num>] <device-mapper>+
+ * <device-mapper> ::= <head> "," <target>+
+ * <head> ::= <name> <uuid> <mode> [<num>]
+ * <target> ::= <start> <length> <type> <options> ","
+ * <mode> ::= "ro" | "rw"
+ * <uuid> ::= xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx | "none"
+ * <type> ::= "verity" | "bootcache" | ...
+ *
+ * Example:
+ * 2 vboot none ro 1,
+ * 0 1768000 bootcache
+ * device=aa55b119-2a47-8c45-946a-5ac57765011f+1
+ * signature=76e9be054b15884a9fa85973e9cb274c93afadb6
+ * cache_start=1768000 max_blocks=100000 size_limit=23 max_trace=20000,
+ * vroot none ro 1,
+ * 0 1740800 verity payload=254:0 hashtree=254:0 hashstart=1740800 alg=sha1
+ * root_hexdigest=76e9be054b15884a9fa85973e9cb274c93afadb6
+ * salt=5b3549d54d6c7a3837b9b81ed72e49463a64c03680c47835bef94d768e5646fe
+ *
+ * Notes:
+ * 1. uuid is a label for the device and we set it to "none".
+ * 2. The <num> field will be optional initially and assumed to be 1.
+ * Once all the scripts that set these fields have been set, it will
+ * be made mandatory.
+ */
+
+struct dm_setup_target { /* one parsed <target> entry of the dm= argument */
+ sector_t begin; /* <start> field, parsed as a base-10 number */
+ sector_t length; /* <length> field, parsed as a base-10 number */
+ char *type; /* kstrndup() copy of the <type> field; kfree()d in dm_setup_cleanup() */
+ char *params; /* kstrndup() copy of the <options> field; kfree()d in dm_setup_cleanup() */
+ /* simple singly linked list */
+ struct dm_setup_target *next;
+};
+
+struct dm_device { /* one parsed <device-mapper> entry of the dm= argument */
+ int minor; /* position of the device in the argument string, starting at 0; passed to dm_create() */
+ int ro; /* 1 for mode "ro", 0 for mode "rw" */
+ char name[DM_MAX_NAME]; /* <name> field, truncated to fit the buffer */
+ char uuid[DM_MAX_UUID]; /* <uuid> field, truncated to fit the buffer */
+ unsigned long num_targets; /* target count announced in <head>; 1 when the field is omitted */
+ struct dm_setup_target *target; /* head of the list of parsed targets */
+ int target_count; /* number of targets actually allocated onto the list */
+ struct dm_device *next; /* next device in the singly linked list */
+};
+
+struct dm_option { /* scan cursor and result used by get_dm_option() */
+ char *start; /* first character of the most recently scanned field */
+ char *next; /* where the next scan begins; NULL makes get_dm_option() return 0 */
+ size_t len; /* length of the field at start, delimiter excluded */
+ char delim; /* character that ended the field; NUL at end of string */
+};
+
+static struct { /* values recorded by dm_setup() and read back by dm_parse_args() */
+ unsigned long num_devices; /* number of devices to configure; 1 when no count was given */
+ char *str; /* dm= argument with the optional leading device count skipped */
+} dm_setup_args __initdata;
+
+static __initdata int dm_early_setup; /* set to 1 once dm_setup() has accepted a dm= argument */
+
+/*
+ * Scan the next field starting at opt->next.
+ *
+ * Leading whitespace is skipped; the field then runs up to the first
+ * character contained in @accept, or to the end of the string. On return
+ * opt->start and opt->len describe the field, opt->delim holds the
+ * character that ended it (NUL at end of string) and opt->next points just
+ * past that delimiter, or at the NUL itself.
+ *
+ * Returns 1 when a non-empty field was found, 0 otherwise. When opt->next
+ * is NULL, 0 is returned and @opt is left untouched.
+ */
+static int __init get_dm_option(struct dm_option *opt, const char *accept)
+{
+	char *field;
+	char *stop;
+
+	if (!opt->next)
+		return 0;
+
+	field = skip_spaces(opt->next);
+	opt->start = field;
+	/* strcspn() stops at a separator or at the terminating NUL */
+	opt->len = strcspn(field, accept);
+	stop = field + opt->len;
+	opt->delim = *stop;
+	/* step over a real separator, but never past the NUL */
+	opt->next = *stop ? stop + 1 : stop;
+
+	return opt->len != 0;
+}
+
+/*
+ * Free a list of parsed devices together with every target hanging off
+ * each device. Always returns 0.
+ */
+static int __init dm_setup_cleanup(struct dm_device *devices)
+{
+	struct dm_device *dev, *next_dev;
+	struct dm_setup_target *tgt, *next_tgt;
+
+	for (dev = devices; dev; dev = next_dev) {
+		for (tgt = dev->target; tgt; tgt = next_tgt) {
+			kfree(tgt->type);
+			kfree(tgt->params);
+			/* fetch the successor before the node goes away */
+			next_tgt = tgt->next;
+			kfree(tgt);
+			dev->target_count--;
+		}
+		/* the counter must match the number of nodes on the list */
+		BUG_ON(dev->target_count);
+		next_dev = dev->next;
+		kfree(dev);
+	}
+	return 0;
+}
+
+/*
+ * Parse the <head> of one device description:
+ *	<name> <uuid> <mode> [<num>] ","
+ * and store the result in @dev.
+ *
+ * @dev: device entry to fill in (name, uuid, ro, num_targets)
+ * @str: position in the dm= argument where the head starts
+ *
+ * Names and UUIDs longer than their buffers are silently truncated.
+ * When <num> is omitted, a single target is assumed.
+ *
+ * Returns the position just past the head on success, NULL on a parse
+ * error.
+ */
+static char * __init dm_parse_device(struct dm_device *dev, char *str)
+{
+	struct dm_option opt;
+	size_t len;
+
+	/* Grab the logical name of the device to be exported to udev */
+	opt.next = str;
+	if (!get_dm_option(&opt, DM_FIELD_SEP)) {
+		DMERR("failed to parse device name");
+		goto parse_fail;
+	}
+	len = min(opt.len + 1, sizeof(dev->name));
+	strlcpy(dev->name, opt.start, len);	/* includes nul */
+
+	/* Grab the UUID value or "none" */
+	if (!get_dm_option(&opt, DM_FIELD_SEP)) {
+		DMERR("failed to parse device uuid");
+		goto parse_fail;
+	}
+	len = min(opt.len + 1, sizeof(dev->uuid));
+	strlcpy(dev->uuid, opt.start, len);
+
+	/*
+	 * Determine if the table/device will be read only or read-write.
+	 * The field has to be exactly "ro" or "rw": a strncmp() bounded by
+	 * opt.len alone would also match an empty field or a lone "r".
+	 */
+	if (!get_dm_option(&opt, DM_ANY_SEP) || opt.len != 2) {
+		DMERR("failed to parse table mode");
+		goto parse_fail;
+	}
+	if (!strncmp("ro", opt.start, opt.len)) {
+		dev->ro = 1;
+	} else if (!strncmp("rw", opt.start, opt.len)) {
+		dev->ro = 0;
+	} else {
+		DMERR("failed to parse table mode");
+		goto parse_fail;
+	}
+
+	/* Optional number field */
+	/* XXX: The <num> field will be mandatory in the next round */
+	if (opt.delim == DM_FIELD_SEP[0]) {
+		if (!get_dm_option(&opt, DM_LINE_SEP)) {
+			DMERR("failed to parse number of targets");
+			goto parse_fail;
+		}
+		dev->num_targets = simple_strtoul(opt.start, NULL, 10);
+	} else {
+		dev->num_targets = 1;
+	}
+	if (dev->num_targets > DM_MAX_TARGETS) {
+		/* enforce the limit instead of only reporting it */
+		DMERR("too many targets %lu > %d",
+		      dev->num_targets, DM_MAX_TARGETS);
+		goto parse_fail;
+	}
+	return opt.next;
+
+parse_fail:
+	return NULL;
+}
+
+/*
+ * Parse dev->num_targets target descriptions of the form
+ *	<start> <length> <type> <options> ","
+ * beginning at @str and append them to dev->target in order.
+ *
+ * Each target is linked into the list as soon as it is allocated, so a
+ * failure part-way through leaves everything reachable from @dev.
+ *
+ * Returns the position just past the last target on success, NULL on an
+ * allocation or parse failure.
+ */
+static char * __init dm_parse_targets(struct dm_device *dev, char *str)
+{
+	struct dm_setup_target **link = &dev->target;
+	struct dm_setup_target *tgt;
+	struct dm_option opt;
+	unsigned long idx;
+
+	/* Targets are defined as per the table format but with a
+	 * comma as a newline separator. */
+	opt.next = str;
+	for (idx = 0; idx < dev->num_targets; idx++) {
+		tgt = kzalloc(sizeof(struct dm_setup_target), GFP_KERNEL);
+		*link = tgt;
+		if (!tgt) {
+			DMERR("failed to allocate memory for target %s<%ld>",
+			      dev->name, idx);
+			return NULL;
+		}
+		dev->target_count++;
+
+		if (!get_dm_option(&opt, DM_FIELD_SEP)) {
+			DMERR("failed to parse starting sector"
+			      " for target %s<%ld>", dev->name, idx);
+			return NULL;
+		}
+		tgt->begin = simple_strtoull(opt.start, NULL, 10);
+
+		if (!get_dm_option(&opt, DM_FIELD_SEP)) {
+			DMERR("failed to parse length for target %s<%ld>",
+			      dev->name, idx);
+			return NULL;
+		}
+		tgt->length = simple_strtoull(opt.start, NULL, 10);
+
+		/* type stays NULL when the field is missing or the copy fails */
+		if (get_dm_option(&opt, DM_FIELD_SEP))
+			tgt->type = kstrndup(opt.start, opt.len, GFP_KERNEL);
+		if (!tgt->type) {
+			DMERR("failed to parse type for target %s<%ld>",
+			      dev->name, idx);
+			return NULL;
+		}
+
+		/* same pattern for the parameter string, up to the next comma */
+		if (get_dm_option(&opt, DM_LINE_SEP))
+			tgt->params = kstrndup(opt.start, opt.len, GFP_KERNEL);
+		if (!tgt->params) {
+			DMERR("failed to parse params for target %s<%ld>",
+			      dev->name, idx);
+			return NULL;
+		}
+		link = &tgt->next;
+	}
+	DMDEBUG("parsed %d targets", dev->target_count);
+
+	return opt.next;
+}
+
+/*
+ * Build the list of devices described by the string saved in
+ * dm_setup_args.
+ *
+ * Returns the head of the device list, or NULL when no string was saved
+ * or when allocation or parsing fails; on failure everything built so far
+ * is released through dm_setup_cleanup().
+ */
+static struct dm_device * __init dm_parse_args(void)
+{
+	struct dm_device *head = NULL;
+	struct dm_device **link = &head;
+	char *cursor = dm_setup_args.str;
+	unsigned long idx;
+
+	if (!cursor)
+		return NULL;
+
+	for (idx = 0; idx < dm_setup_args.num_devices; idx++) {
+		struct dm_device *dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+
+		if (!dev) {
+			DMERR("failed to allocated memory for dev");
+			goto error;
+		}
+		/* link first so the error path frees this device as well */
+		*link = dev;
+		link = &dev->next;
+		/*
+		 * devices are given minor numbers 0 - n-1
+		 * in the order they are found in the arg
+		 * string.
+		 */
+		dev->minor = idx;
+
+		/* a NULL cursor signals a parse error in either stage */
+		cursor = dm_parse_device(dev, cursor);
+		if (cursor)
+			cursor = dm_parse_targets(dev, cursor);
+		if (!cursor)
+			goto error;
+	}
+	return head;
+
+error:
+	dm_setup_cleanup(head);
+	return NULL;
+}
+
+/*
+ * Parse the command-line parameters given our kernel, but do not
+ * actually try to invoke the DM device now; that is handled by
+ * dm_setup_drives after the low-level disk drivers have initialised.
+ * dm format is described at the top of the file.
+ *
+ * Because dm minor numbers are assigned in ascending order starting with 0,
+ * you can assume the first device is /dev/dm-0, the next device is /dev/dm-1,
+ * and so forth.
+ *
+ * Only the optional leading device count is consumed here; the rest of
+ * the string is saved in dm_setup_args. A count above DM_MAX_DEVICES is
+ * rejected.
+ *
+ * Returns 1 when the argument was accepted, 0 when it was rejected.
+ */
+static int __init dm_setup(char *str)
+{
+	struct dm_option opt;
+	unsigned long num_devices;
+
+	if (!str) {
+		DMDEBUG("str is NULL");
+		goto parse_fail;
+	}
+	opt.next = str;
+	if (!get_dm_option(&opt, DM_FIELD_SEP))
+		goto parse_fail;
+	if (isdigit(opt.start[0])) {	/* XXX: Optional number field */
+		num_devices = simple_strtoul(opt.start, NULL, 10);
+		str = opt.next;
+	} else {
+		num_devices = 1;
+		/* Don't advance str */
+	}
+	if (num_devices > DM_MAX_DEVICES) {
+		/* enforce the limit instead of only logging it */
+		DMERR("too many devices %lu > %d",
+		      num_devices, DM_MAX_DEVICES);
+		goto parse_fail;
+	}
+	dm_setup_args.str = str;
+	dm_setup_args.num_devices = num_devices;
+	DMINFO("will configure %lu devices", num_devices);
+	dm_early_setup = 1;
+	return 1;
+
+parse_fail:
+	DMWARN("Invalid arguments supplied to dm=.");
+	return 0;
+}
+
+/*
+ * Create, load and resume every device-mapper device described on the
+ * kernel command line.
+ *
+ * The device list comes from dm_parse_args().  For each entry the function
+ * creates the mapped device, builds a table from the entry's targets, binds
+ * the table to the device, resumes the device and exports it through the
+ * ioctl interface.  The first failure aborts the whole setup: the device
+ * being configured is released with dm_put() and the parsed list is handed
+ * to dm_setup_cleanup().
+ *
+ * NOTE(review): no failure path in this function destroys "table"; confirm
+ * whether an explicit dm_table_destroy() is needed for failures that happen
+ * before dm_swap_table() succeeds.
+ */
+static void __init dm_setup_drives(void)
+{
+	struct mapped_device *md = NULL;
+	struct dm_table *table = NULL;
+	struct dm_setup_target *target;
+	struct dm_device *dev;
+	char *uuid;
+	fmode_t fmode;
+	struct dm_device *devices;
+
+	devices = dm_parse_args();
+
+	for (dev = devices; dev; dev = dev->next) {
+		if (dm_create(dev->minor, &md)) {
+			DMDEBUG("failed to create the device");
+			goto dm_create_fail;
+		}
+		DMDEBUG("created device '%s'", dm_device_name(md));
+
+		/*
+		 * In addition to flagging the table below, the disk must be
+		 * set explicitly ro/rw.
+		 */
+		set_disk_ro(dm_disk(md), dev->ro);
+
+		/*
+		 * Recompute the open mode for every device so that a
+		 * read-write device does not leak FMODE_WRITE into the
+		 * tables of read-only devices that follow it.
+		 */
+		fmode = FMODE_READ;
+		if (!dev->ro)
+			fmode |= FMODE_WRITE;
+		if (dm_table_create(&table, fmode, dev->target_count, md)) {
+			DMDEBUG("failed to create the table");
+			goto dm_table_create_fail;
+		}
+
+		dm_lock_md_type(md);
+
+		for (target = dev->target; target; target = target->next) {
+			DMINFO("adding target '%llu %llu %s %s'",
+			       (unsigned long long) target->begin,
+			       (unsigned long long) target->length,
+			       target->type, target->params);
+			if (dm_table_add_target(table, target->type,
+						target->begin,
+						target->length,
+						target->params)) {
+				DMDEBUG("failed to add the target"
+					" to the table");
+				goto add_target_fail;
+			}
+		}
+		if (dm_table_complete(table)) {
+			DMDEBUG("failed to complete the table");
+			goto table_complete_fail;
+		}
+
+		/* Suspend the device so that we can bind it to the table. */
+		if (dm_suspend(md, 0)) {
+			DMDEBUG("failed to suspend the device pre-bind");
+			goto suspend_fail;
+		}
+
+		/* Initial table load: acquire type of table. */
+		dm_set_md_type(md, dm_table_get_type(table));
+
+		/* Setup md->queue to reflect md's type. */
+		if (dm_setup_md_queue(md, table)) {
+			DMWARN("unable to set up device queue for new table.");
+			goto setup_md_queue_fail;
+		}
+
+		/*
+		 * Bind the table to the device. This is the only way
+		 * to associate md->map with the table and set the disk
+		 * capacity directly.
+		 */
+		if (dm_swap_table(md, table)) {  /* should return NULL. */
+			DMDEBUG("failed to bind the device to the table");
+			goto table_bind_fail;
+		}
+
+		/* Finally, resume and the device should be ready. */
+		if (dm_resume(md)) {
+			DMDEBUG("failed to resume the device");
+			goto resume_fail;
+		}
+
+		/*
+		 * Export the dm device via the ioctl interface.  The
+		 * DM_NO_UUID placeholder means "no uuid"; any other value is
+		 * passed through as given.  uuid must be assigned on both
+		 * branches, otherwise an indeterminate pointer is exported.
+		 */
+		uuid = strcmp(DM_NO_UUID, dev->uuid) ? dev->uuid : NULL;
+		if (dm_ioctl_export(md, dev->name, uuid)) {
+			DMDEBUG("failed to export device with given"
+				" name and uuid");
+			goto export_fail;
+		}
+
+		dm_unlock_md_type(md);
+
+		DMINFO("dm-%d is ready", dev->minor);
+	}
+	dm_setup_cleanup(devices);
+	return;
+
+	/* Every failure after dm_lock_md_type() must drop the type lock. */
+export_fail:
+resume_fail:
+table_bind_fail:
+setup_md_queue_fail:
+suspend_fail:
+table_complete_fail:
+add_target_fail:
+	dm_unlock_md_type(md);
+dm_table_create_fail:
+	dm_put(md);
+dm_create_fail:
+	DMWARN("starting dm-%d (%s) failed",
+	       dev->minor, dev->name);
+	dm_setup_cleanup(devices);
+}
+
+__setup("dm=", dm_setup);
+
+/*
+ * Run the early device-mapper configuration.  This is a no-op unless
+ * dm_setup() accepted a "dm=" argument and set dm_early_setup.
+ */
+void __init dm_run_setup(void)
+{
+	if (dm_early_setup) {
+		DMINFO("attempting early device configuration.");
+		dm_setup_drives();
+	}
+}
diff --git a/init/initramfs.c b/init/initramfs.c
index 981f286c1d16..d0b53f49c98a 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -18,6 +18,7 @@
#include <linux/dirent.h>
#include <linux/syscalls.h>
#include <linux/utime.h>
+#include <linux/initramfs.h>
#include <linux/file.h>
static ssize_t __init xwrite(int fd, const char *p, size_t count)
@@ -606,9 +607,25 @@ static void __init clean_rootfs(void)
}
#endif
+/* Set to 1 by skip_initramfs_param(); read by populate_rootfs(). */
+static int __initdata do_skip_initramfs;
+
+/*
+ * Handler for the "skip_initramfs" boot option.
+ *
+ * The option takes no value: when @str is the empty string the skip flag is
+ * set and 1 is returned.  Any non-empty @str leaves the flag untouched and
+ * returns 0.
+ */
+static int __init skip_initramfs_param(char *str)
+{
+	int bare_keyword = (*str == '\0');
+
+	if (bare_keyword)
+		do_skip_initramfs = 1;
+
+	return bare_keyword;
+}
+__setup("skip_initramfs", skip_initramfs_param);
+
static int __init populate_rootfs(void)
{
- char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size);
+ char *err;
+
+ if (do_skip_initramfs)
+ return default_rootfs();
+
+ err = unpack_to_rootfs(__initramfs_start, __initramfs_size);
if (err)
panic("%s", err); /* Failed to decompress INTERNAL initramfs */
if (initrd_start) {
diff --git a/init/noinitramfs.c b/init/noinitramfs.c
index 267739d85179..bcc8bcb053ee 100644
--- a/init/noinitramfs.c
+++ b/init/noinitramfs.c
@@ -21,11 +21,16 @@
#include <linux/stat.h>
#include <linux/kdev_t.h>
#include <linux/syscalls.h>
+#include <linux/kconfig.h>
+#include <linux/initramfs.h>
/*
* Create a simple rootfs that is similar to the default initramfs
*/
-static int __init default_rootfs(void)
+#if !IS_BUILTIN(CONFIG_BLK_DEV_INITRD)
+static
+#endif
+int __init default_rootfs(void)
{
int err;
@@ -49,4 +54,6 @@ out:
printk(KERN_WARNING "Failed to create a rootfs\n");
return err;
}
+#if !IS_BUILTIN(CONFIG_BLK_DEV_INITRD)
rootfs_initcall(default_rootfs);
+#endif
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 28a142f1be36..02fb438fd3af 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -745,7 +745,7 @@ static struct file *do_create(struct ipc_namespace *ipc_ns, struct inode *dir,
}
mode &= ~current_umask();
- ret = vfs_create(dir, path->dentry, mode, true);
+ ret = vfs_create2(path->mnt, dir, path->dentry, mode, true);
path->dentry->d_fsdata = NULL;
if (ret)
return ERR_PTR(ret);
@@ -761,7 +761,7 @@ static struct file *do_open(struct path *path, int oflag)
if ((oflag & O_ACCMODE) == (O_RDWR | O_WRONLY))
return ERR_PTR(-EINVAL);
acc = oflag2acc[oflag & O_ACCMODE];
- if (inode_permission(d_inode(path->dentry), acc))
+ if (inode_permission2(path->mnt, d_inode(path->dentry), acc))
return ERR_PTR(-EACCES);
return dentry_open(path, oflag, current_cred());
}
@@ -794,7 +794,7 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, umode_t, mode,
ro = mnt_want_write(mnt); /* we'll drop it in any case */
error = 0;
inode_lock(d_inode(root));
- path.dentry = lookup_one_len(name->name, root, strlen(name->name));
+ path.dentry = lookup_one_len2(name->name, mnt, root, strlen(name->name));
if (IS_ERR(path.dentry)) {
error = PTR_ERR(path.dentry);
goto out_putfd;
@@ -865,7 +865,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
if (err)
goto out_name;
inode_lock_nested(d_inode(mnt->mnt_root), I_MUTEX_PARENT);
- dentry = lookup_one_len(name->name, mnt->mnt_root,
+ dentry = lookup_one_len2(name->name, mnt, mnt->mnt_root,
strlen(name->name));
if (IS_ERR(dentry)) {
err = PTR_ERR(dentry);
@@ -877,7 +877,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
err = -ENOENT;
} else {
ihold(inode);
- err = vfs_unlink(d_inode(dentry->d_parent), dentry, NULL);
+ err = vfs_unlink2(mnt, d_inode(dentry->d_parent), dentry, NULL);
}
dput(dentry);
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index eed911d091da..b22256b3893d 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -5,3 +5,4 @@ obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o
ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
endif
+obj-$(CONFIG_CGROUP_BPF) += cgroup.o
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
new file mode 100644
index 000000000000..a44a7e4c23d2
--- /dev/null
+++ b/kernel/bpf/cgroup.c
@@ -0,0 +1,205 @@
+/*
+ * Functions to manage eBPF programs attached to cgroups
+ *
+ * Copyright (c) 2016 Daniel Mack
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/atomic.h>
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/bpf-cgroup.h>
+#include <net/sock.h>
+
+DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
+EXPORT_SYMBOL(cgroup_bpf_enabled_key);
+
+/**
+ * cgroup_bpf_put() - put references of all bpf programs
+ * @cgrp: the cgroup to modify
+ *
+ * For every attach type that has a program in @cgrp->bpf.prog, the program
+ * reference is dropped and cgroup_bpf_enabled_key is decremented once.
+ */
+void cgroup_bpf_put(struct cgroup *cgrp)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(cgrp->bpf.prog); i++) {
+		struct bpf_prog *attached = cgrp->bpf.prog[i];
+
+		/* Nothing attached for this type. */
+		if (!attached)
+			continue;
+
+		bpf_prog_put(attached);
+		static_branch_dec(&cgroup_bpf_enabled_key);
+	}
+}
+
+/**
+ * cgroup_bpf_inherit() - inherit effective programs from parent
+ * @cgrp: the cgroup to modify
+ * @parent: the parent to inherit from
+ *
+ * Copies, for every attach type, the parent's effective program pointer and
+ * its disallow_override flag into @cgrp.  The parent's pointers are read
+ * under the assertion that cgroup_mutex is held.
+ */
+void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(cgrp->bpf.effective); i++) {
+		struct bpf_prog *inherited =
+			rcu_dereference_protected(parent->bpf.effective[i],
+						  lockdep_is_held(&cgroup_mutex));
+
+		rcu_assign_pointer(cgrp->bpf.effective[i], inherited);
+		cgrp->bpf.disallow_override[i] =
+			parent->bpf.disallow_override[i];
+	}
+}
+
+/**
+ * __cgroup_bpf_update() - Update the pinned program of a cgroup, and
+ * propagate the change to descendants
+ * @cgrp: The cgroup which descendants to traverse
+ * @parent: The parent of @cgrp, or %NULL if @cgrp is the root
+ * @prog: A new program to pin, or %NULL to detach the current one
+ * @type: Type of pinning operation (ingress/egress)
+ * @new_overridable: Whether descendants of @cgrp may attach their own
+ * program on top of @prog; only consulted when @prog is not %NULL
+ *
+ * Each cgroup has two program pointers per attach type: one for the eBPF
+ * program it owns, and one for the program that is effective for execution.
+ *
+ * If @prog is not %NULL, this function attaches a new program to the cgroup
+ * and releases the one that is currently attached, if any. @prog is then made
+ * the effective program of type @type in that cgroup.
+ *
+ * If @prog is %NULL, the currently attached program of type @type is released,
+ * and the effective program of the parent cgroup (if any) is inherited to
+ * @cgrp.
+ *
+ * Then, the descendants of @cgrp are walked and the effective program for
+ * each of them is set to the effective program of @cgrp unless the
+ * descendant has its own program attached, in which case the subbranch is
+ * skipped. This ensures that delegated subcgroups with own programs are left
+ * untouched.
+ *
+ * Must be called with cgroup_mutex held.
+ *
+ * Return: 0 on success; %-EPERM if the parent's effective program or the
+ * program already attached to @cgrp does not permit the requested override
+ * mode; %-ENOENT if @prog is %NULL and nothing is attached to @cgrp.  The
+ * reference on @prog is not dropped on failure.
+ */
+int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent,
+ struct bpf_prog *prog, enum bpf_attach_type type,
+ bool new_overridable)
+{
+ struct bpf_prog *old_prog, *effective = NULL;
+ struct cgroup_subsys_state *pos;
+ bool overridable = true;
+
+ /* Start from what the parent currently enforces, if there is one. */
+ if (parent) {
+ overridable = !parent->bpf.disallow_override[type];
+ effective = rcu_dereference_protected(parent->bpf.effective[type],
+ lockdep_is_held(&cgroup_mutex));
+ }
+
+ if (prog && effective && !overridable)
+ /* if parent has non-overridable prog attached, disallow
+ * attaching new programs to descendent cgroup
+ */
+ return -EPERM;
+
+ if (prog && effective && overridable != new_overridable)
+ /* if parent has overridable prog attached, only
+ * allow overridable programs in descendent cgroup
+ */
+ return -EPERM;
+
+ old_prog = cgrp->bpf.prog[type];
+
+ if (prog) {
+ overridable = new_overridable;
+ effective = prog;
+ if (old_prog &&
+ cgrp->bpf.disallow_override[type] == new_overridable)
+ /* disallow attaching non-overridable on top
+ * of existing overridable in this cgroup
+ * and vice versa
+ */
+ return -EPERM;
+ }
+
+ if (!prog && !old_prog)
+ /* report error when trying to detach and nothing is attached */
+ return -ENOENT;
+
+ cgrp->bpf.prog[type] = prog;
+
+ /* The pre-order walk visits @cgrp itself first, then its descendants. */
+ css_for_each_descendant_pre(pos, &cgrp->self) {
+ struct cgroup *desc = container_of(pos, struct cgroup, self);
+
+ /* skip the subtree if the descendant has its own program */
+ if (desc->bpf.prog[type] && desc != cgrp) {
+ pos = css_rightmost_descendant(pos);
+ } else {
+ rcu_assign_pointer(desc->bpf.effective[type],
+ effective);
+ desc->bpf.disallow_override[type] = !overridable;
+ }
+ }
+
+ /* Keep the static key count equal to the number of attached programs. */
+ if (prog)
+ static_branch_inc(&cgroup_bpf_enabled_key);
+
+ if (old_prog) {
+ bpf_prog_put(old_prog);
+ static_branch_dec(&cgroup_bpf_enabled_key);
+ }
+ return 0;
+}
+
+/**
+ * __cgroup_bpf_run_filter() - Run a program for packet filtering
+ * @sk: The socket sending or receiving traffic
+ * @skb: The skb that is being sent or received
+ * @type: The type of program to be executed
+ *
+ * If no socket is passed, the socket is not a full socket, or the socket is
+ * not of family INET or INET6, this function does nothing and returns 0.
+ *
+ * The program type passed in via @type must be suitable for network
+ * filtering. No further check is performed to assert that.
+ *
+ * While the program runs, @skb->sk is temporarily pointed at @sk and
+ * @skb->data is moved back to the network header; both are restored before
+ * returning.
+ *
+ * Return: %-EPERM if an effective program was found and it returned a value
+ * other than 1; 0 in all other cases.
+ */
+int __cgroup_bpf_run_filter(struct sock *sk,
+			    struct sk_buff *skb,
+			    enum bpf_attach_type type)
+{
+	struct bpf_prog *prog;
+	struct cgroup *cgrp;
+	int verdict = 0;
+
+	if (!sk || !sk_fullsock(sk))
+		return 0;
+
+	switch (sk->sk_family) {
+	case AF_INET:
+	case AF_INET6:
+		break;
+	default:
+		return 0;
+	}
+
+	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+
+	rcu_read_lock();
+
+	prog = rcu_dereference(cgrp->bpf.effective[type]);
+	if (prog) {
+		struct sock *orig_sk = skb->sk;
+		unsigned int hdr_gap = skb->data - skb_network_header(skb);
+
+		skb->sk = sk;
+		__skb_push(skb, hdr_gap);
+		if (bpf_prog_run_save_cb(prog, skb) != 1)
+			verdict = -EPERM;
+		__skb_pull(skb, hdr_gap);
+		skb->sk = orig_sk;
+	}
+
+	rcu_read_unlock();
+
+	return verdict;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ca7e277e8b5f..4eb48f47b24e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -726,7 +726,9 @@ static int bpf_prog_load(union bpf_attr *attr)
attr->kern_version != LINUX_VERSION_CODE)
return -EINVAL;
- if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN))
+ if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
+ type != BPF_PROG_TYPE_CGROUP_SKB &&
+ !capable(CAP_SYS_ADMIN))
return -EPERM;
/* plain bpf_prog allocation */
@@ -800,6 +802,85 @@ static int bpf_obj_get(const union bpf_attr *attr)
return bpf_obj_get_user(u64_to_ptr(attr->pathname));
}
+#ifdef CONFIG_CGROUP_BPF
+
+#define BPF_PROG_ATTACH_LAST_FIELD attach_flags
+
+/*
+ * BPF_PROG_ATTACH command: attach the BPF_PROG_TYPE_CGROUP_SKB program
+ * referenced by attr->attach_bpf_fd to the cgroup referenced by
+ * attr->target_fd.
+ *
+ * Requires CAP_NET_ADMIN.  Only BPF_F_ALLOW_OVERRIDE is accepted in
+ * attr->attach_flags and only the INET ingress/egress attach types are
+ * supported.  The program reference is dropped again when the cgroup lookup
+ * or the update fails; the cgroup reference is always dropped.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int bpf_prog_attach(const union bpf_attr *attr)
+{
+	struct bpf_prog *prog;
+	struct cgroup *cgrp;
+	int err;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (CHECK_ATTR(BPF_PROG_ATTACH))
+		return -EINVAL;
+
+	if (attr->attach_flags & ~BPF_F_ALLOW_OVERRIDE)
+		return -EINVAL;
+
+	if (attr->attach_type != BPF_CGROUP_INET_INGRESS &&
+	    attr->attach_type != BPF_CGROUP_INET_EGRESS)
+		return -EINVAL;
+
+	prog = bpf_prog_get_type(attr->attach_bpf_fd,
+				 BPF_PROG_TYPE_CGROUP_SKB);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	cgrp = cgroup_get_from_fd(attr->target_fd);
+	if (IS_ERR(cgrp)) {
+		bpf_prog_put(prog);
+		return PTR_ERR(cgrp);
+	}
+
+	err = cgroup_bpf_update(cgrp, prog, attr->attach_type,
+				attr->attach_flags & BPF_F_ALLOW_OVERRIDE);
+	if (err)
+		bpf_prog_put(prog);
+	cgroup_put(cgrp);
+
+	return err;
+}
+
+#define BPF_PROG_DETACH_LAST_FIELD attach_type
+
+/*
+ * BPF_PROG_DETACH command: detach the program of attr->attach_type from the
+ * cgroup referenced by attr->target_fd.
+ *
+ * Requires CAP_NET_ADMIN and supports only the INET ingress/egress attach
+ * types.  The cgroup reference taken for the lookup is always dropped.
+ *
+ * Returns the result of cgroup_bpf_update() or a negative errno from the
+ * earlier checks.
+ */
+static int bpf_prog_detach(const union bpf_attr *attr)
+{
+	struct cgroup *cgrp;
+	int err;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (CHECK_ATTR(BPF_PROG_DETACH))
+		return -EINVAL;
+
+	if (attr->attach_type != BPF_CGROUP_INET_INGRESS &&
+	    attr->attach_type != BPF_CGROUP_INET_EGRESS)
+		return -EINVAL;
+
+	cgrp = cgroup_get_from_fd(attr->target_fd);
+	if (IS_ERR(cgrp))
+		return PTR_ERR(cgrp);
+
+	err = cgroup_bpf_update(cgrp, NULL, attr->attach_type, false);
+	cgroup_put(cgrp);
+
+	return err;
+}
+#endif /* CONFIG_CGROUP_BPF */
+
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr attr = {};
@@ -866,6 +947,16 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_OBJ_GET:
err = bpf_obj_get(&attr);
break;
+
+#ifdef CONFIG_CGROUP_BPF
+ case BPF_PROG_ATTACH:
+ err = bpf_prog_attach(&attr);
+ break;
+ case BPF_PROG_DETACH:
+ err = bpf_prog_detach(&attr);
+ break;
+#endif
+
default:
err = -EINVAL;
break;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index bb0cf1caf1cd..2ebd03d49bff 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2856,7 +2856,8 @@ static int cgroup_procs_write_permission(struct task_struct *task,
*/
if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
!uid_eq(cred->euid, tcred->uid) &&
- !uid_eq(cred->euid, tcred->suid))
+ !uid_eq(cred->euid, tcred->suid) &&
+ !ns_capable(tcred->user_ns, CAP_SYS_NICE))
ret = -EACCES;
if (!ret && cgroup_on_dfl(dst_cgrp)) {
@@ -5079,6 +5080,8 @@ static void css_release_work_fn(struct work_struct *work)
if (cgrp->kn)
RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
NULL);
+
+ cgroup_bpf_put(cgrp);
}
mutex_unlock(&cgroup_mutex);
@@ -5291,6 +5294,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
if (!cgroup_on_dfl(cgrp))
cgrp->subtree_control = cgroup_control(cgrp);
+ if (parent)
+ cgroup_bpf_inherit(cgrp, parent);
+
cgroup_propagate_control(cgrp);
return cgrp;
@@ -6506,6 +6512,20 @@ static __init int cgroup_namespaces_init(void)
}
subsys_initcall(cgroup_namespaces_init);
+#ifdef CONFIG_CGROUP_BPF
+/*
+ * Locked wrapper around __cgroup_bpf_update(): looks up the parent of @cgrp,
+ * takes cgroup_mutex for the duration of the update and returns whatever
+ * __cgroup_bpf_update() returned.
+ */
+int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type, bool overridable)
+{
+ struct cgroup *parent = cgroup_parent(cgrp);
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+ ret = __cgroup_bpf_update(cgrp, parent, prog, type, overridable);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+#endif /* CONFIG_CGROUP_BPF */
+
#ifdef CONFIG_CGROUP_DEBUG
static struct cgroup_subsys_state *
debug_css_alloc(struct cgroup_subsys_state *parent_css)
diff --git a/kernel/configs/README.android b/kernel/configs/README.android
new file mode 100644
index 000000000000..2e2d7c001275
--- /dev/null
+++ b/kernel/configs/README.android
@@ -0,0 +1,15 @@
+The android-*.config files in this directory are meant to be used as a base
+for an Android kernel config. All devices should have the options in
+android-base.config enabled. While not mandatory, the options in
+android-recommended.config enable advanced Android features.
+
+Assuming you already have a minimalist defconfig for your device, a possible
+way to enable these options would be:
+
+ ARCH=<arch> scripts/kconfig/merge_config.sh <path_to>/<device>_defconfig kernel/configs/android-base.config kernel/configs/android-recommended.config
+
+This will generate a .config that can then be used to save a new defconfig or
+compile a new kernel with Android features enabled.
+
+Because there is no tool to consistently generate these config fragments,
+let's keep them alphabetically sorted rather than in random order.
diff --git a/kernel/configs/android-base-arm64.cfg b/kernel/configs/android-base-arm64.cfg
new file mode 100644
index 000000000000..43f23d6b5391
--- /dev/null
+++ b/kernel/configs/android-base-arm64.cfg
@@ -0,0 +1,5 @@
+# KEEP ALPHABETICALLY SORTED
+CONFIG_ARMV8_DEPRECATED=y
+CONFIG_CP15_BARRIER_EMULATION=y
+CONFIG_SETEND_EMULATION=y
+CONFIG_SWP_EMULATION=y
diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config
index 1a8f34f63601..a393f83e9a6e 100644
--- a/kernel/configs/android-base.config
+++ b/kernel/configs/android-base.config
@@ -1,27 +1,32 @@
# KEEP ALPHABETICALLY SORTED
# CONFIG_DEVKMEM is not set
# CONFIG_DEVMEM is not set
+# CONFIG_FHANDLE is not set
# CONFIG_INET_LRO is not set
-# CONFIG_MODULES is not set
+# CONFIG_NFSD is not set
+# CONFIG_NFS_FS is not set
# CONFIG_OABI_COMPAT is not set
# CONFIG_SYSVIPC is not set
+# CONFIG_USELIB is not set
CONFIG_ANDROID=y
+CONFIG_ANDROID_BINDER_DEVICES=binder,hwbinder,vndbinder
CONFIG_ANDROID_BINDER_IPC=y
CONFIG_ANDROID_LOW_MEMORY_KILLER=y
-CONFIG_ARMV8_DEPRECATED=y
CONFIG_ASHMEM=y
CONFIG_AUDIT=y
CONFIG_BLK_DEV_INITRD=y
CONFIG_CGROUPS=y
CONFIG_CGROUP_CPUACCT=y
-CONFIG_CGROUP_DEBUG=y
CONFIG_CGROUP_FREEZER=y
CONFIG_CGROUP_SCHED=y
-CONFIG_CP15_BARRIER_EMULATION=y
+CONFIG_CGROUP_BPF=y
CONFIG_DEFAULT_SECURITY_SELINUX=y
CONFIG_EMBEDDED=y
CONFIG_FB=y
+CONFIG_HARDENED_USERCOPY=y
CONFIG_HIGH_RES_TIMERS=y
+CONFIG_IKCONFIG=y
+CONFIG_IKCONFIG_PROC=y
CONFIG_INET6_AH=y
CONFIG_INET6_ESP=y
CONFIG_INET6_IPCOMP=y
@@ -59,10 +64,12 @@ CONFIG_IP_NF_TARGET_MASQUERADE=y
CONFIG_IP_NF_TARGET_NETMAP=y
CONFIG_IP_NF_TARGET_REDIRECT=y
CONFIG_IP_NF_TARGET_REJECT=y
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+CONFIG_MODVERSIONS=y
CONFIG_NET=y
CONFIG_NETDEVICES=y
CONFIG_NETFILTER=y
-CONFIG_NETFILTER_TPROXY=y
CONFIG_NETFILTER_XT_MATCH_COMMENT=y
CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y
CONFIG_NETFILTER_XT_MATCH_CONNMARK=y
@@ -76,6 +83,8 @@ CONFIG_NETFILTER_XT_MATCH_MAC=y
CONFIG_NETFILTER_XT_MATCH_MARK=y
CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y
CONFIG_NETFILTER_XT_MATCH_POLICY=y
+CONFIG_NETFILTER_XT_MATCH_QTAGUID=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA2=y
CONFIG_NETFILTER_XT_MATCH_QUOTA=y
CONFIG_NETFILTER_XT_MATCH_SOCKET=y
CONFIG_NETFILTER_XT_MATCH_STATE=y
@@ -124,26 +133,33 @@ CONFIG_PACKET=y
CONFIG_PM_AUTOSLEEP=y
CONFIG_PM_WAKELOCKS=y
CONFIG_PPP=y
+CONFIG_PPPOLAC=y
+CONFIG_PPPOPNS=y
CONFIG_PPP_BSDCOMP=y
CONFIG_PPP_DEFLATE=y
CONFIG_PPP_MPPE=y
CONFIG_PREEMPT=y
-CONFIG_QUOTA=y
+CONFIG_PROFILING=y
+CONFIG_RANDOMIZE_BASE=y
CONFIG_RTC_CLASS=y
CONFIG_RT_GROUP_SCHED=y
CONFIG_SECCOMP=y
CONFIG_SECURITY=y
CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y
CONFIG_SECURITY_SELINUX=y
-CONFIG_SETEND_EMULATION=y
CONFIG_STAGING=y
-CONFIG_SWP_EMULATION=y
CONFIG_SYNC=y
CONFIG_TUN=y
+CONFIG_UID_SYS_STATS=y
CONFIG_UNIX=y
-CONFIG_USB_GADGET=y
CONFIG_USB_CONFIGFS=y
+CONFIG_USB_CONFIGFS_F_ACC=y
+CONFIG_USB_CONFIGFS_F_AUDIO_SRC=y
CONFIG_USB_CONFIGFS_F_FS=y
CONFIG_USB_CONFIGFS_F_MIDI=y
-CONFIG_USB_OTG_WAKELOCK=y
+CONFIG_USB_CONFIGFS_F_MTP=y
+CONFIG_USB_CONFIGFS_F_PTP=y
+CONFIG_USB_CONFIGFS_UEVENT=y
+CONFIG_USB_GADGET=y
CONFIG_XFRM_USER=y
diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config
index 297756be369c..f3f7c1c7dd78 100644
--- a/kernel/configs/android-recommended.config
+++ b/kernel/configs/android-recommended.config
@@ -1,16 +1,20 @@
# KEEP ALPHABETICALLY SORTED
+# CONFIG_AIO is not set
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
# CONFIG_INPUT_MOUSE is not set
# CONFIG_LEGACY_PTYS is not set
# CONFIG_NF_CONNTRACK_SIP is not set
# CONFIG_PM_WAKELOCKS_GC is not set
# CONFIG_VT is not set
+CONFIG_ARM64_SW_TTBR0_PAN=y
CONFIG_BACKLIGHT_LCD_SUPPORT=y
CONFIG_BLK_DEV_DM=y
CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_RAM=y
CONFIG_BLK_DEV_RAM_SIZE=8192
+CONFIG_CC_STACKPROTECTOR_STRONG=y
CONFIG_COMPACTION=y
+CONFIG_CPU_SW_DOMAIN_PAN=y
CONFIG_DEBUG_RODATA=y
CONFIG_DM_CRYPT=y
CONFIG_DM_UEVENT=y
@@ -75,6 +79,8 @@ CONFIG_HID_ZYDACRON=y
CONFIG_INPUT_EVDEV=y
CONFIG_INPUT_GPIO=y
CONFIG_INPUT_JOYSTICK=y
+CONFIG_INPUT_KEYCHORD=y
+CONFIG_INPUT_KEYRESET=y
CONFIG_INPUT_MISC=y
CONFIG_INPUT_TABLET=y
CONFIG_INPUT_UINPUT=y
@@ -89,6 +95,7 @@ CONFIG_LOGIRUMBLEPAD2_FF=y
CONFIG_LOGITECH_FF=y
CONFIG_MD=y
CONFIG_MEDIA_SUPPORT=y
+CONFIG_MEMORY_STATE_TIME=y
CONFIG_MSDOS_FS=y
CONFIG_PANIC_TIMEOUT=5
CONFIG_PANTHERLORD_FF=y
@@ -100,6 +107,11 @@ CONFIG_POWER_SUPPLY=y
CONFIG_PSTORE=y
CONFIG_PSTORE_CONSOLE=y
CONFIG_PSTORE_RAM=y
+CONFIG_QFMT_V2=y
+CONFIG_QUOTA=y
+CONFIG_QUOTACTL=y
+CONFIG_QUOTA_NETLINK_INTERFACE=y
+CONFIG_QUOTA_TREE=y
CONFIG_SCHEDSTATS=y
CONFIG_SMARTJOYPLUS_FF=y
CONFIG_SND=y
diff --git a/kernel/cpu.c b/kernel/cpu.c
index b5a0165b7300..67eb51252542 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1262,6 +1262,7 @@ void __weak arch_enable_nonboot_cpus_end(void)
void enable_nonboot_cpus(void)
{
int cpu, error;
+ struct device *cpu_device;
/* Allow everyone to use the CPU hotplug again */
cpu_maps_update_begin();
@@ -1279,6 +1280,12 @@ void enable_nonboot_cpus(void)
trace_suspend_resume(TPS("CPU_ON"), cpu, false);
if (!error) {
pr_info("CPU%d is up\n", cpu);
+ cpu_device = get_cpu_device(cpu);
+ if (!cpu_device)
+ pr_err("%s: failed to get cpu%d device\n",
+ __func__, cpu);
+ else
+ kobject_uevent(&cpu_device->kobj, KOBJ_ONLINE);
continue;
}
pr_warn("Error taking CPU%d up: %d\n", cpu, error);
@@ -2206,3 +2213,23 @@ void __init boot_cpu_hotplug_init(void)
#endif
this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
}
+
+/* Notifier chain invoked through idle_notifier_call_chain(). */
+static ATOMIC_NOTIFIER_HEAD(idle_notifier);
+
+/*
+ * Add @n to the idle notifier chain.  The return value of
+ * atomic_notifier_chain_register() is not propagated to the caller.
+ */
+void idle_notifier_register(struct notifier_block *n)
+{
+ atomic_notifier_chain_register(&idle_notifier, n);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_register);
+
+/*
+ * Remove @n from the idle notifier chain.  The return value of
+ * atomic_notifier_chain_unregister() is not propagated to the caller.
+ */
+void idle_notifier_unregister(struct notifier_block *n)
+{
+ atomic_notifier_chain_unregister(&idle_notifier, n);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_unregister);
+
+/*
+ * Invoke the idle notifier chain, forwarding @val and a NULL data pointer to
+ * atomic_notifier_call_chain().  The chain's result is discarded.
+ */
+void idle_notifier_call_chain(unsigned long val)
+{
+ atomic_notifier_call_chain(&idle_notifier, val, NULL);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_call_chain);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 511b1dd8ff09..194e2f24841b 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -100,6 +100,7 @@ struct cpuset {
/* user-configured CPUs and Memory Nodes allow to tasks */
cpumask_var_t cpus_allowed;
+ cpumask_var_t cpus_requested;
nodemask_t mems_allowed;
/* effective CPUs and Memory Nodes allow to tasks */
@@ -399,7 +400,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
- return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
+ return cpumask_subset(p->cpus_requested, q->cpus_requested) &&
nodes_subset(p->mems_allowed, q->mems_allowed) &&
is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
is_mem_exclusive(p) <= is_mem_exclusive(q);
@@ -499,7 +500,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
cpuset_for_each_child(c, css, par) {
if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
c != cur &&
- cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
+ cpumask_intersects(trial->cpus_requested, c->cpus_requested))
goto out;
if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
c != cur &&
@@ -958,17 +959,18 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (!*buf) {
cpumask_clear(trialcs->cpus_allowed);
} else {
- retval = cpulist_parse(buf, trialcs->cpus_allowed);
+ retval = cpulist_parse(buf, trialcs->cpus_requested);
if (retval < 0)
return retval;
- if (!cpumask_subset(trialcs->cpus_allowed,
- top_cpuset.cpus_allowed))
+ if (!cpumask_subset(trialcs->cpus_requested, cpu_present_mask))
return -EINVAL;
+
+ cpumask_and(trialcs->cpus_allowed, trialcs->cpus_requested, cpu_active_mask);
}
/* Nothing to do if the cpus didn't change */
- if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
+ if (cpumask_equal(cs->cpus_requested, trialcs->cpus_requested))
return 0;
retval = validate_change(cs, trialcs);
@@ -977,6 +979,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
spin_lock_irq(&callback_lock);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
+ cpumask_copy(cs->cpus_requested, trialcs->cpus_requested);
spin_unlock_irq(&callback_lock);
/* use trialcs->cpus_allowed as a temp variable */
@@ -1761,7 +1764,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
switch (type) {
case FILE_CPULIST:
- seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_requested));
break;
case FILE_MEMLIST:
seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
@@ -1951,11 +1954,14 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
return ERR_PTR(-ENOMEM);
if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
goto free_cs;
+ if (!alloc_cpumask_var(&cs->cpus_requested, GFP_KERNEL))
+ goto free_allowed;
if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
- goto free_cpus;
+ goto free_requested;
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpumask_clear(cs->cpus_allowed);
+ cpumask_clear(cs->cpus_requested);
nodes_clear(cs->mems_allowed);
cpumask_clear(cs->effective_cpus);
nodes_clear(cs->effective_mems);
@@ -1964,7 +1970,9 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
return &cs->css;
-free_cpus:
+free_requested:
+ free_cpumask_var(cs->cpus_requested);
+free_allowed:
free_cpumask_var(cs->cpus_allowed);
free_cs:
kfree(cs);
@@ -2027,6 +2035,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cs->mems_allowed = parent->mems_allowed;
cs->effective_mems = parent->mems_allowed;
cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
+ cpumask_copy(cs->cpus_requested, parent->cpus_requested);
cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
spin_unlock_irq(&callback_lock);
out_unlock:
@@ -2061,6 +2070,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
free_cpumask_var(cs->effective_cpus);
free_cpumask_var(cs->cpus_allowed);
+ free_cpumask_var(cs->cpus_requested);
kfree(cs);
}
@@ -2125,8 +2135,11 @@ int __init cpuset_init(void)
BUG();
if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
BUG();
+ if (!alloc_cpumask_var(&top_cpuset.cpus_requested, GFP_KERNEL))
+ BUG();
cpumask_setall(top_cpuset.cpus_allowed);
+ cpumask_setall(top_cpuset.cpus_requested);
nodes_setall(top_cpuset.mems_allowed);
cpumask_setall(top_cpuset.effective_cpus);
nodes_setall(top_cpuset.effective_mems);
@@ -2260,7 +2273,7 @@ retry:
goto retry;
}
- cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
+ cpumask_and(&new_cpus, cs->cpus_requested, parent_cs(cs)->effective_cpus);
nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index cc892a9e109d..d3c5b15c86c1 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -216,7 +216,7 @@ static char *kdb_read(char *buffer, size_t bufsize)
int i;
int diag, dtab_count;
int key, buf_size, ret;
-
+ static int last_crlf;
diag = kdbgetintenv("DTABCOUNT", &dtab_count);
if (diag)
@@ -237,6 +237,9 @@ poll_again:
return buffer;
if (key != 9)
tab = 0;
+ if (key != 10 && key != 13)
+ last_crlf = 0;
+
switch (key) {
case 8: /* backspace */
if (cp > buffer) {
@@ -254,7 +257,12 @@ poll_again:
*cp = tmp;
}
break;
- case 13: /* enter */
+ case 10: /* new line */
+ case 13: /* carriage return */
+ /* handle \n after \r */
+ if (last_crlf && last_crlf != key)
+ break;
+ last_crlf = key;
*lastchar++ = '\n';
*lastchar++ = '\0';
if (!KDB_STATE(KGDB_TRANS)) {
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1af0bbf20984..be42bfeb87ae 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -389,8 +389,13 @@ static struct srcu_struct pmus_srcu;
* 0 - disallow raw tracepoint access for unpriv
* 1 - disallow cpu events for unpriv
* 2 - disallow kernel profiling for unpriv
+ * 3 - disallow all unpriv perf event use
*/
+#ifdef CONFIG_SECURITY_PERF_EVENTS_RESTRICT
+int sysctl_perf_event_paranoid __read_mostly = 3;
+#else
int sysctl_perf_event_paranoid __read_mostly = 2;
+#endif
/* Minimum for 512 kiB + 1 user control page */
int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
@@ -9670,6 +9675,9 @@ SYSCALL_DEFINE5(perf_event_open,
if (flags & ~PERF_FLAG_ALL)
return -EINVAL;
+ if (perf_paranoid_any() && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
err = perf_copy_attr(attr_uptr, &attr);
if (err)
return err;
diff --git a/kernel/exit.c b/kernel/exit.c
index 6dd7ff4b337a..61921bc3fd5f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -55,6 +55,8 @@
#include <linux/shm.h>
#include <linux/kcov.h>
+#include "sched/tune.h"
+
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include <asm/pgtable.h>
@@ -775,6 +777,9 @@ void __noreturn do_exit(long code)
}
exit_signals(tsk); /* sets PF_EXITING */
+
+ schedtune_exit_task(tsk);
+
/*
* Ensure that all new tsk->pi_lock acquisitions must observe
* PF_EXITING. Serializes against futex.c:attach_to_pi_owner().
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index dd2b5a4d89a5..826b733eaeb9 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -1,6 +1,7 @@
config SUSPEND
bool "Suspend to RAM and standby"
depends on ARCH_SUSPEND_POSSIBLE
+ select RTC_LIB
default y
---help---
Allow the system to enter sleep states in which main memory is
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index eb4f717705ba..80578f272be4 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -14,3 +14,5 @@ obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o
obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o
obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
+
+obj-$(CONFIG_SUSPEND) += wakeup_reason.o
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 8ea24ded1dab..9a12c835b6c2 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -18,6 +18,7 @@
#include <linux/workqueue.h>
#include <linux/kmod.h>
#include <trace/events/power.h>
+#include <linux/wakeup_reason.h>
#include <linux/cpuset.h>
/*
@@ -35,6 +36,9 @@ static int try_to_freeze_tasks(bool user_only)
unsigned int elapsed_msecs;
bool wakeup = false;
int sleep_usecs = USEC_PER_MSEC;
+#ifdef CONFIG_PM_SLEEP
+ char suspend_abort[MAX_SUSPEND_ABORT_LEN];
+#endif
start = ktime_get_boottime();
@@ -64,6 +68,11 @@ static int try_to_freeze_tasks(bool user_only)
break;
if (pm_wakeup_pending()) {
+#ifdef CONFIG_PM_SLEEP
+ pm_get_active_wakeup_sources(suspend_abort,
+ MAX_SUSPEND_ABORT_LEN);
+ log_suspend_abort_reason("%s", suspend_abort);
+#endif
wakeup = true;
break;
}
@@ -82,26 +91,27 @@ static int try_to_freeze_tasks(bool user_only)
elapsed = ktime_sub(end, start);
elapsed_msecs = ktime_to_ms(elapsed);
- if (todo) {
+ if (wakeup) {
pr_cont("\n");
- pr_err("Freezing of tasks %s after %d.%03d seconds "
- "(%d tasks refusing to freeze, wq_busy=%d):\n",
- wakeup ? "aborted" : "failed",
+ pr_err("Freezing of tasks aborted after %d.%03d seconds\n",
+ elapsed_msecs / 1000, elapsed_msecs % 1000);
+ } else if (todo) {
+ pr_cont("\n");
+ pr_err("Freezing of tasks failed after %d.%03d seconds"
+ " (%d tasks refusing to freeze, wq_busy=%d):\n",
elapsed_msecs / 1000, elapsed_msecs % 1000,
todo - wq_busy, wq_busy);
if (wq_busy)
show_workqueue_state();
- if (!wakeup) {
- read_lock(&tasklist_lock);
- for_each_process_thread(g, p) {
- if (p != current && !freezer_should_skip(p)
- && freezing(p) && !frozen(p))
- sched_show_task(p);
- }
- read_unlock(&tasklist_lock);
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, p) {
+ if (p != current && !freezer_should_skip(p)
+ && freezing(p) && !frozen(p))
+ sched_show_task(p);
}
+ read_unlock(&tasklist_lock);
} else {
pr_cont("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000,
elapsed_msecs % 1000);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 6ccb08f57fcb..2d0c99b3f34c 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -26,9 +26,11 @@
#include <linux/suspend.h>
#include <linux/syscore_ops.h>
#include <linux/ftrace.h>
+#include <linux/rtc.h>
#include <trace/events/power.h>
#include <linux/compiler.h>
#include <linux/moduleparam.h>
+#include <linux/wakeup_reason.h>
#include "power.h"
@@ -322,7 +324,8 @@ void __weak arch_suspend_enable_irqs(void)
*/
static int suspend_enter(suspend_state_t state, bool *wakeup)
{
- int error;
+ char suspend_abort[MAX_SUSPEND_ABORT_LEN];
+ int error, last_dev;
error = platform_suspend_prepare(state);
if (error)
@@ -330,7 +333,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
error = dpm_suspend_late(PMSG_SUSPEND);
if (error) {
+ last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
+ last_dev %= REC_FAILED_NUM;
pr_err("PM: late suspend of devices failed\n");
+ log_suspend_abort_reason("%s device failed to power down",
+ suspend_stats.failed_devs[last_dev]);
goto Platform_finish;
}
error = platform_suspend_prepare_late(state);
@@ -339,7 +346,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
error = dpm_suspend_noirq(PMSG_SUSPEND);
if (error) {
+ last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
+ last_dev %= REC_FAILED_NUM;
pr_err("PM: noirq suspend of devices failed\n");
+ log_suspend_abort_reason("noirq suspend of %s device failed",
+ suspend_stats.failed_devs[last_dev]);
goto Platform_early_resume;
}
error = platform_suspend_prepare_noirq(state);
@@ -363,8 +374,10 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
}
error = disable_nonboot_cpus();
- if (error || suspend_test(TEST_CPUS))
+ if (error || suspend_test(TEST_CPUS)) {
+ log_suspend_abort_reason("Disabling non-boot cpus failed");
goto Enable_cpus;
+ }
arch_suspend_disable_irqs();
BUG_ON(!irqs_disabled());
@@ -380,6 +393,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
state, false);
events_check_enabled = false;
} else if (*wakeup) {
+ pm_get_active_wakeup_sources(suspend_abort,
+ MAX_SUSPEND_ABORT_LEN);
+ log_suspend_abort_reason("%s", suspend_abort);
error = -EBUSY;
}
syscore_resume();
@@ -427,6 +443,7 @@ int suspend_devices_and_enter(suspend_state_t state)
error = dpm_suspend_start(PMSG_SUSPEND);
if (error) {
pr_err("PM: Some devices failed to suspend, or early wake event detected\n");
+ log_suspend_abort_reason("Some devices failed to suspend, or early wake event detected");
goto Recover_platform;
}
suspend_test_finish("suspend devices");
@@ -527,6 +544,18 @@ static int enter_state(suspend_state_t state)
return error;
}
+static void pm_suspend_marker(char *annotation)
+{
+ struct timespec ts;
+ struct rtc_time tm;
+
+ getnstimeofday(&ts);
+ rtc_time_to_tm(ts.tv_sec, &tm);
+ pr_info("PM: suspend %s %d-%02d-%02d %02d:%02d:%02d.%09lu UTC\n",
+ annotation, tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
+ tm.tm_hour, tm.tm_min, tm.tm_sec, ts.tv_nsec);
+}
+
/**
* pm_suspend - Externally visible function for suspending the system.
* @state: System sleep state to enter.
@@ -541,6 +570,7 @@ int pm_suspend(suspend_state_t state)
if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
return -EINVAL;
+ pm_suspend_marker("entry");
error = enter_state(state);
if (error) {
suspend_stats.fail++;
@@ -548,6 +578,7 @@ int pm_suspend(suspend_state_t state)
} else {
suspend_stats.success++;
}
+ pm_suspend_marker("exit");
return error;
}
EXPORT_SYMBOL(pm_suspend);
diff --git a/kernel/power/wakeup_reason.c b/kernel/power/wakeup_reason.c
new file mode 100644
index 000000000000..252611fad2fe
--- /dev/null
+++ b/kernel/power/wakeup_reason.c
@@ -0,0 +1,225 @@
+/*
+ * kernel/power/wakeup_reason.c
+ *
+ * Logs the reasons which caused the kernel to resume from
+ * the suspend mode.
+ *
+ * Copyright (C) 2014 Google, Inc.
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/wakeup_reason.h>
+#include <linux/kernel.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/notifier.h>
+#include <linux/suspend.h>
+
+
+#define MAX_WAKEUP_REASON_IRQS 32
+static int irq_list[MAX_WAKEUP_REASON_IRQS];
+static int irqcount;
+static bool suspend_abort;
+static char abort_reason[MAX_SUSPEND_ABORT_LEN];
+static struct kobject *wakeup_reason;
+static DEFINE_SPINLOCK(resume_reason_lock);
+
+static ktime_t last_monotime; /* monotonic time before last suspend */
+static ktime_t curr_monotime; /* monotonic time after last suspend */
+static ktime_t last_stime; /* monotonic boottime offset before last suspend */
+static ktime_t curr_stime; /* monotonic boottime offset after last suspend */
+
+static ssize_t last_resume_reason_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ int irq_no, buf_offset = 0;
+ struct irq_desc *desc;
+ spin_lock(&resume_reason_lock);
+ if (suspend_abort) {
+ buf_offset = sprintf(buf, "Abort: %s", abort_reason);
+ } else {
+ for (irq_no = 0; irq_no < irqcount; irq_no++) {
+ desc = irq_to_desc(irq_list[irq_no]);
+ if (desc && desc->action && desc->action->name)
+ buf_offset += sprintf(buf + buf_offset, "%d %s\n",
+ irq_list[irq_no], desc->action->name);
+ else
+ buf_offset += sprintf(buf + buf_offset, "%d\n",
+ irq_list[irq_no]);
+ }
+ }
+ spin_unlock(&resume_reason_lock);
+ return buf_offset;
+}
+
+static ssize_t last_suspend_time_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct timespec sleep_time;
+ struct timespec total_time;
+ struct timespec suspend_resume_time;
+
+ /*
+ * total_time is calculated from monotonic bootoffsets because
+ * unlike CLOCK_MONOTONIC it includes the time spent in suspend state.
+ */
+ total_time = ktime_to_timespec(ktime_sub(curr_stime, last_stime));
+
+ /*
+ * suspend_resume_time is calculated as monotonic (CLOCK_MONOTONIC)
+ * time interval before entering suspend and post suspend.
+ */
+ suspend_resume_time = ktime_to_timespec(ktime_sub(curr_monotime, last_monotime));
+
+ /* sleep_time = total_time - suspend_resume_time */
+ sleep_time = timespec_sub(total_time, suspend_resume_time);
+
+ /* Export suspend_resume_time and sleep_time in pair here. */
+ return sprintf(buf, "%lu.%09lu %lu.%09lu\n",
+ suspend_resume_time.tv_sec, suspend_resume_time.tv_nsec,
+ sleep_time.tv_sec, sleep_time.tv_nsec);
+}
+
+static struct kobj_attribute resume_reason = __ATTR_RO(last_resume_reason);
+static struct kobj_attribute suspend_time = __ATTR_RO(last_suspend_time);
+
+static struct attribute *attrs[] = {
+ &resume_reason.attr,
+ &suspend_time.attr,
+ NULL,
+};
+static struct attribute_group attr_group = {
+ .attrs = attrs,
+};
+
+/*
+ * logs all the wake up reasons to the kernel
+ * stores the irqs to expose them to the userspace via sysfs
+ */
+void log_wakeup_reason(int irq)
+{
+ struct irq_desc *desc;
+ desc = irq_to_desc(irq);
+ if (desc && desc->action && desc->action->name)
+ printk(KERN_INFO "Resume caused by IRQ %d, %s\n", irq,
+ desc->action->name);
+ else
+ printk(KERN_INFO "Resume caused by IRQ %d\n", irq);
+
+ spin_lock(&resume_reason_lock);
+ if (irqcount == MAX_WAKEUP_REASON_IRQS) {
+ spin_unlock(&resume_reason_lock);
+ printk(KERN_WARNING "Resume caused by more than %d IRQs\n",
+ MAX_WAKEUP_REASON_IRQS);
+ return;
+ }
+
+ irq_list[irqcount++] = irq;
+ spin_unlock(&resume_reason_lock);
+}
+
+int check_wakeup_reason(int irq)
+{
+ int irq_no;
+ int ret = false;
+
+ spin_lock(&resume_reason_lock);
+ for (irq_no = 0; irq_no < irqcount; irq_no++)
+ if (irq_list[irq_no] == irq) {
+ ret = true;
+ break;
+ }
+ spin_unlock(&resume_reason_lock);
+ return ret;
+}
+
+void log_suspend_abort_reason(const char *fmt, ...)
+{
+ va_list args;
+
+ spin_lock(&resume_reason_lock);
+
+ /* Suspend abort reason has already been logged. */
+ if (suspend_abort) {
+ spin_unlock(&resume_reason_lock);
+ return;
+ }
+
+ suspend_abort = true;
+ va_start(args, fmt);
+ vsnprintf(abort_reason, MAX_SUSPEND_ABORT_LEN, fmt, args);
+ va_end(args);
+ spin_unlock(&resume_reason_lock);
+}
+
+/* Detects a suspend and clears all the previous wake up reasons */
+static int wakeup_reason_pm_event(struct notifier_block *notifier,
+ unsigned long pm_event, void *unused)
+{
+ switch (pm_event) {
+ case PM_SUSPEND_PREPARE:
+ spin_lock(&resume_reason_lock);
+ irqcount = 0;
+ suspend_abort = false;
+ spin_unlock(&resume_reason_lock);
+ /* monotonic time since boot */
+ last_monotime = ktime_get();
+ /* monotonic time since boot including the time spent in suspend */
+ last_stime = ktime_get_boottime();
+ break;
+ case PM_POST_SUSPEND:
+ /* monotonic time since boot */
+ curr_monotime = ktime_get();
+ /* monotonic time since boot including the time spent in suspend */
+ curr_stime = ktime_get_boottime();
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block wakeup_reason_pm_notifier_block = {
+ .notifier_call = wakeup_reason_pm_event,
+};
+
+/* Initializes the sysfs parameter
+ * registers the pm_event notifier
+ */
+int __init wakeup_reason_init(void)
+{
+ int retval;
+
+ retval = register_pm_notifier(&wakeup_reason_pm_notifier_block);
+ if (retval)
+ printk(KERN_WARNING "[%s] failed to register PM notifier %d\n",
+ __func__, retval);
+
+ wakeup_reason = kobject_create_and_add("wakeup_reasons", kernel_kobj);
+ if (!wakeup_reason) {
+ printk(KERN_WARNING "[%s] failed to create a sysfs kobject\n",
+ __func__);
+ return 1;
+ }
+ retval = sysfs_create_group(wakeup_reason, &attr_group);
+ if (retval) {
+ kobject_put(wakeup_reason);
+ printk(KERN_WARNING "[%s] failed to create a sysfs group %d\n",
+ __func__, retval);
+ }
+ return 0;
+}
+
+late_initcall(wakeup_reason_init);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 27adaaab96ba..b6e193da6ea4 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -56,6 +56,10 @@
#include "braille.h"
#include "internal.h"
+#ifdef CONFIG_EARLY_PRINTK_DIRECT
+extern void printascii(char *);
+#endif
+
int console_printk[4] = {
CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */
MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */
@@ -1875,6 +1879,10 @@ asmlinkage int vprintk_emit(int facility, int level,
}
}
+#ifdef CONFIG_EARLY_PRINTK_DIRECT
+ printascii(text);
+#endif
+
if (level == LOGLEVEL_DEFAULT)
level = default_message_loglevel;
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 5e59b832ae2b..b687747e6676 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -18,10 +18,13 @@ endif
obj-y += core.o loadavg.o clock.o cputime.o
obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
obj-y += wait.o swait.o completion.o idle.o
-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
+obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o energy.o
+obj-$(CONFIG_SCHED_WALT) += walt.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
+obj-$(CONFIG_SCHED_TUNE) += tune.o
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
obj-$(CONFIG_CPU_FREQ) += cpufreq.o
+obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o
obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6b3fff6a6437..0b6635dba3f4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -90,6 +90,7 @@
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
+#include "walt.h"
DEFINE_MUTEX(sched_domains_mutex);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -156,6 +157,18 @@ int sysctl_sched_rt_runtime = 950000;
/* cpus with isolated domains */
cpumask_var_t cpu_isolated_map;
+struct rq *
+lock_rq_of(struct task_struct *p, struct rq_flags *flags)
+{
+ return task_rq_lock(p, flags);
+}
+
+void
+unlock_rq_of(struct rq *rq, struct task_struct *p, struct rq_flags *flags)
+{
+ task_rq_unlock(rq, p, flags);
+}
+
/*
* this_rq_lock - lock this runqueue and disable interrupts.
*/
@@ -998,7 +1011,9 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
p->on_rq = TASK_ON_RQ_MIGRATING;
dequeue_task(rq, p, 0);
+ double_lock_balance(rq, cpu_rq(new_cpu));
set_task_cpu(p, new_cpu);
+ double_unlock_balance(rq, cpu_rq(new_cpu));
raw_spin_unlock(&rq->lock);
rq = cpu_rq(new_cpu);
@@ -1254,6 +1269,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
p->sched_class->migrate_task_rq(p);
p->se.nr_migrations++;
perf_event_task_migrate(p);
+
+ walt_fixup_busy_time(p, new_cpu);
}
__set_task_cpu(p, new_cpu);
@@ -2010,6 +2027,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
unsigned long flags;
int cpu, success = 0;
+#ifdef CONFIG_SMP
+ struct rq *rq;
+ u64 wallclock;
+#endif
/*
* If we are going to wake up a thread waiting for CONDITION we
@@ -2083,14 +2104,24 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
*/
smp_cond_load_acquire(&p->on_cpu, !VAL);
+ rq = cpu_rq(task_cpu(p));
+
+ raw_spin_lock(&rq->lock);
+ wallclock = walt_ktime_clock();
+ walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+ walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
+ raw_spin_unlock(&rq->lock);
+
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
+
if (task_cpu(p) != cpu) {
wake_flags |= WF_MIGRATED;
set_task_cpu(p, cpu);
}
+
#endif /* CONFIG_SMP */
ttwu_queue(p, cpu, wake_flags);
@@ -2140,8 +2171,13 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie
trace_sched_waking(p);
- if (!task_on_rq_queued(p))
+ if (!task_on_rq_queued(p)) {
+ u64 wallclock = walt_ktime_clock();
+
+ walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+ walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+ }
ttwu_do_wakeup(rq, p, 0, cookie);
ttwu_stat(p, smp_processor_id(), 0);
@@ -2207,6 +2243,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.nr_migrations = 0;
p->se.vruntime = 0;
INIT_LIST_HEAD(&p->se.group_node);
+ walt_init_new_task_load(p);
#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = NULL;
@@ -2566,6 +2603,9 @@ void wake_up_new_task(struct task_struct *p)
struct rq *rq;
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
+
+ walt_init_new_task_load(p);
+
p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
/*
@@ -2581,7 +2621,9 @@ void wake_up_new_task(struct task_struct *p)
rq = __task_rq_lock(p, &rf);
post_init_entity_util_avg(&p->se);
- activate_task(rq, p, 0);
+ walt_mark_task_starting(p);
+
+ activate_task(rq, p, ENQUEUE_WAKEUP_NEW);
p->on_rq = TASK_ON_RQ_QUEUED;
trace_sched_wakeup_new(p);
check_preempt_curr(rq, p, WF_FORK);
@@ -2966,6 +3008,36 @@ unsigned long nr_iowait_cpu(int cpu)
return atomic_read(&this->nr_iowait);
}
+#ifdef CONFIG_CPU_QUIET
+u64 nr_running_integral(unsigned int cpu)
+{
+ unsigned int seqcnt;
+ u64 integral;
+ struct rq *q;
+
+ if (cpu >= nr_cpu_ids)
+ return 0;
+
+ q = cpu_rq(cpu);
+
+ /*
+ * Update average to avoid reading a stale value if there were
+ * no run-queue changes for a long time. On the other hand if
+ * the changes are happening right now, just read current value
+ * directly.
+ */
+
+ seqcnt = read_seqcount_begin(&q->ave_seqcnt);
+ integral = do_nr_running_integral(q);
+ if (read_seqcount_retry(&q->ave_seqcnt, seqcnt)) {
+ read_seqcount_begin(&q->ave_seqcnt);
+ integral = q->nr_running_integral;
+ }
+
+ return integral;
+}
+#endif
+
void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
{
struct rq *rq = this_rq();
@@ -3070,6 +3142,94 @@ unsigned long long task_sched_runtime(struct task_struct *p)
return ns;
}
+#ifdef CONFIG_CPU_FREQ_GOV_SCHED
+
+static inline
+unsigned long add_capacity_margin(unsigned long cpu_capacity)
+{
+ cpu_capacity = cpu_capacity * capacity_margin;
+ cpu_capacity /= SCHED_CAPACITY_SCALE;
+ return cpu_capacity;
+}
+
+static inline
+unsigned long sum_capacity_reqs(unsigned long cfs_cap,
+ struct sched_capacity_reqs *scr)
+{
+ unsigned long total = add_capacity_margin(cfs_cap + scr->rt);
+ return total += scr->dl;
+}
+
+unsigned long boosted_cpu_util(int cpu);
+static void sched_freq_tick_pelt(int cpu)
+{
+ unsigned long cpu_utilization = boosted_cpu_util(cpu);
+ unsigned long capacity_curr = capacity_curr_of(cpu);
+ struct sched_capacity_reqs *scr;
+
+ scr = &per_cpu(cpu_sched_capacity_reqs, cpu);
+ if (sum_capacity_reqs(cpu_utilization, scr) < capacity_curr)
+ return;
+
+ /*
+ * To make free room for a task that is building up its "real"
+ * utilization and to harm its performance the least, request
+ * a jump to a higher OPP as soon as the margin of free capacity
+ * is impacted (specified by capacity_margin).
+ */
+ set_cfs_cpu_capacity(cpu, true, cpu_utilization);
+}
+
+#ifdef CONFIG_SCHED_WALT
+static void sched_freq_tick_walt(int cpu)
+{
+ unsigned long cpu_utilization = cpu_util(cpu);
+ unsigned long capacity_curr = capacity_curr_of(cpu);
+
+ if (walt_disabled || !sysctl_sched_use_walt_cpu_util)
+ return sched_freq_tick_pelt(cpu);
+
+ /*
+ * Add a margin to the WALT utilization.
+ * NOTE: WALT tracks a single CPU signal for all the scheduling
+ * classes, thus this margin is going to be added to the DL class as
+ * well, which is something we do not do in sched_freq_tick_pelt case.
+ */
+ cpu_utilization = add_capacity_margin(cpu_utilization);
+ if (cpu_utilization <= capacity_curr)
+ return;
+
+ /*
+ * It is likely that the load is growing so we
+ * keep the added margin in our request as an
+ * extra boost.
+ */
+ set_cfs_cpu_capacity(cpu, true, cpu_utilization);
+
+}
+#define _sched_freq_tick(cpu) sched_freq_tick_walt(cpu)
+#else
+#define _sched_freq_tick(cpu) sched_freq_tick_pelt(cpu)
+#endif /* CONFIG_SCHED_WALT */
+
+static void sched_freq_tick(int cpu)
+{
+ unsigned long capacity_orig, capacity_curr;
+
+ if (!sched_freq())
+ return;
+
+ capacity_orig = capacity_orig_of(cpu);
+ capacity_curr = capacity_curr_of(cpu);
+ if (capacity_curr == capacity_orig)
+ return;
+
+ _sched_freq_tick(cpu);
+}
+#else
+static inline void sched_freq_tick(int cpu) { }
+#endif /* CONFIG_CPU_FREQ_GOV_SCHED */
+
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
@@ -3083,10 +3243,14 @@ void scheduler_tick(void)
sched_clock_tick();
raw_spin_lock(&rq->lock);
+ walt_set_window_start(rq);
update_rq_clock(rq);
curr->sched_class->task_tick(rq, curr, 0);
cpu_load_update_active(rq);
+ walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
+ walt_ktime_clock(), 0);
calc_global_load_tick(rq);
+ sched_freq_tick(cpu);
raw_spin_unlock(&rq->lock);
perf_event_task_tick();
@@ -3339,6 +3503,7 @@ static void __sched notrace __schedule(bool preempt)
struct pin_cookie cookie;
struct rq *rq;
int cpu;
+ u64 wallclock;
cpu = smp_processor_id();
rq = cpu_rq(cpu);
@@ -3391,6 +3556,9 @@ static void __sched notrace __schedule(bool preempt)
update_rq_clock(rq);
next = pick_next_task(rq, prev, cookie);
+ wallclock = walt_ktime_clock();
+ walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0);
+ walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0);
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
rq->clock_skip_update = 0;
@@ -5666,9 +5834,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
if (!(sd->flags & SD_LOAD_BALANCE)) {
printk("does not load-balance\n");
- if (sd->parent)
- printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
- " has parent");
return -1;
}
@@ -5710,7 +5875,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
printk(KERN_CONT " %*pbl",
cpumask_pr_args(sched_group_cpus(group)));
if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
- printk(KERN_CONT " (cpu_capacity = %d)",
+ printk(KERN_CONT " (cpu_capacity = %lu)",
group->sgc->capacity);
}
@@ -5763,8 +5928,12 @@ static inline bool sched_debug(void)
static int sd_degenerate(struct sched_domain *sd)
{
- if (cpumask_weight(sched_domain_span(sd)) == 1)
- return 1;
+ if (cpumask_weight(sched_domain_span(sd)) == 1) {
+ if (sd->groups->sge)
+ sd->flags &= ~SD_LOAD_BALANCE;
+ else
+ return 1;
+ }
/* Following flags need at least 2 groups */
if (sd->flags & (SD_LOAD_BALANCE |
@@ -5774,7 +5943,8 @@ static int sd_degenerate(struct sched_domain *sd)
SD_SHARE_CPUCAPACITY |
SD_ASYM_CPUCAPACITY |
SD_SHARE_PKG_RESOURCES |
- SD_SHARE_POWERDOMAIN)) {
+ SD_SHARE_POWERDOMAIN |
+ SD_SHARE_CAP_STATES)) {
if (sd->groups != sd->groups->next)
return 0;
}
@@ -5807,7 +5977,12 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
SD_SHARE_CPUCAPACITY |
SD_SHARE_PKG_RESOURCES |
SD_PREFER_SIBLING |
- SD_SHARE_POWERDOMAIN);
+ SD_SHARE_POWERDOMAIN |
+ SD_SHARE_CAP_STATES);
+ if (parent->groups->sge) {
+ parent->flags &= ~SD_LOAD_BALANCE;
+ return 0;
+ }
if (nr_node_ids == 1)
pflags &= ~SD_SERIALIZE;
}
@@ -5905,6 +6080,11 @@ static int init_rootdomain(struct root_domain *rd)
if (cpupri_init(&rd->cpupri) != 0)
goto free_rto_mask;
+
+ init_max_cpu_capacity(&rd->max_cpu_capacity);
+
+ rd->max_cap_orig_cpu = rd->min_cap_orig_cpu = -1;
+
return 0;
free_rto_mask:
@@ -6016,11 +6196,14 @@ DEFINE_PER_CPU(int, sd_llc_id);
DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
DEFINE_PER_CPU(struct sched_domain *, sd_numa);
DEFINE_PER_CPU(struct sched_domain *, sd_asym);
+DEFINE_PER_CPU(struct sched_domain *, sd_ea);
+DEFINE_PER_CPU(struct sched_domain *, sd_scs);
static void update_top_cache_domain(int cpu)
{
struct sched_domain_shared *sds = NULL;
struct sched_domain *sd;
+ struct sched_domain *ea_sd = NULL;
int id = cpu;
int size = 1;
@@ -6041,6 +6224,17 @@ static void update_top_cache_domain(int cpu)
sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
+
+ for_each_domain(cpu, sd) {
+ if (sd->groups->sge)
+ ea_sd = sd;
+ else
+ break;
+ }
+ rcu_assign_pointer(per_cpu(sd_ea, cpu), ea_sd);
+
+ sd = highest_flag_domain(cpu, SD_SHARE_CAP_STATES);
+ rcu_assign_pointer(per_cpu(sd_scs, cpu), sd);
}
/*
@@ -6222,6 +6416,8 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
* die on a /0 trap.
*/
sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
+ sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
+ sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
/*
* Make sure the first group of this domain contains the
@@ -6350,6 +6546,66 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
}
/*
+ * Check that the per-cpu provided sd energy data is consistent for all cpus
+ * within the mask.
+ */
+static inline void check_sched_energy_data(int cpu, sched_domain_energy_f fn,
+ const struct cpumask *cpumask)
+{
+ const struct sched_group_energy * const sge = fn(cpu);
+ struct cpumask mask;
+ int i;
+
+ if (cpumask_weight(cpumask) <= 1)
+ return;
+
+ cpumask_xor(&mask, cpumask, get_cpu_mask(cpu));
+
+ for_each_cpu(i, &mask) {
+ const struct sched_group_energy * const e = fn(i);
+ int y;
+
+ BUG_ON(e->nr_idle_states != sge->nr_idle_states);
+
+ for (y = 0; y < (e->nr_idle_states); y++) {
+ BUG_ON(e->idle_states[y].power !=
+ sge->idle_states[y].power);
+ }
+
+ BUG_ON(e->nr_cap_states != sge->nr_cap_states);
+
+ for (y = 0; y < (e->nr_cap_states); y++) {
+ BUG_ON(e->cap_states[y].cap != sge->cap_states[y].cap);
+ BUG_ON(e->cap_states[y].power !=
+ sge->cap_states[y].power);
+ }
+ }
+}
+
+static void init_sched_energy(int cpu, struct sched_domain *sd,
+ sched_domain_energy_f fn)
+{
+ if (!(fn && fn(cpu)))
+ return;
+
+ if (cpu != group_balance_cpu(sd->groups))
+ return;
+
+ if (sd->child && !sd->child->groups->sge) {
+ pr_err("BUG: EAS setup broken for CPU%d\n", cpu);
+#ifdef CONFIG_SCHED_DEBUG
+ pr_err(" energy data on %s but not on %s domain\n",
+ sd->name, sd->child->name);
+#endif
+ return;
+ }
+
+ check_sched_energy_data(cpu, fn, sched_group_cpus(sd->groups));
+
+ sd->groups->sge = fn(cpu);
+}
+
+/*
* Initializers for schedule domains
* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
*/
@@ -6465,6 +6721,7 @@ static int sched_domains_curr_level;
* SD_NUMA - describes NUMA topologies
* SD_SHARE_POWERDOMAIN - describes shared power domain
* SD_ASYM_CPUCAPACITY - describes mixed capacity topologies
+ * SD_SHARE_CAP_STATES - describes shared capacity states
*
* Odd one out, which beside describing the topology has a quirk also
* prescribes the desired behaviour that goes along with it:
@@ -6477,7 +6734,8 @@ static int sched_domains_curr_level;
SD_NUMA | \
SD_ASYM_PACKING | \
SD_ASYM_CPUCAPACITY | \
- SD_SHARE_POWERDOMAIN)
+ SD_SHARE_POWERDOMAIN | \
+ SD_SHARE_CAP_STATES)
static struct sched_domain *
sd_init(struct sched_domain_topology_level *tl,
@@ -7035,7 +7293,6 @@ static int build_sched_domains(const struct cpumask *cpu_map,
enum s_alloc alloc_state;
struct sched_domain *sd;
struct s_data d;
- struct rq *rq = NULL;
int i, ret = -ENOMEM;
alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
@@ -7053,8 +7310,6 @@ static int build_sched_domains(const struct cpumask *cpu_map,
*per_cpu_ptr(d.sd, i) = sd;
if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
sd->flags |= SD_OVERLAP;
- if (cpumask_equal(cpu_map, sched_domain_span(sd)))
- break;
}
}
@@ -7074,10 +7329,13 @@ static int build_sched_domains(const struct cpumask *cpu_map,
/* Calculate CPU capacity for physical packages and nodes */
for (i = nr_cpumask_bits-1; i >= 0; i--) {
+ struct sched_domain_topology_level *tl = sched_domain_topology;
+
if (!cpumask_test_cpu(i, cpu_map))
continue;
- for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent, tl++) {
+ init_sched_energy(i, sd, tl->energy);
claim_allocations(i, sd);
init_sched_groups_capacity(i, sd);
}
@@ -7086,22 +7344,23 @@ static int build_sched_domains(const struct cpumask *cpu_map,
/* Attach the domains */
rcu_read_lock();
for_each_cpu(i, cpu_map) {
- rq = cpu_rq(i);
- sd = *per_cpu_ptr(d.sd, i);
+ int max_cpu = READ_ONCE(d.rd->max_cap_orig_cpu);
+ int min_cpu = READ_ONCE(d.rd->min_cap_orig_cpu);
+
+ if ((max_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig >
+ cpu_rq(max_cpu)->cpu_capacity_orig))
+ WRITE_ONCE(d.rd->max_cap_orig_cpu, i);
- /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
- if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
- WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
+ if ((min_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig <
+ cpu_rq(min_cpu)->cpu_capacity_orig))
+ WRITE_ONCE(d.rd->min_cap_orig_cpu, i);
+
+ sd = *per_cpu_ptr(d.sd, i);
cpu_attach_domain(sd, d.rd, i);
}
rcu_read_unlock();
- if (rq && sched_debug_enabled) {
- pr_info("span: %*pbl (max cpu_capacity = %lu)\n",
- cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
- }
-
ret = 0;
error:
__free_domain_allocs(&d, alloc_state, cpu_map);
@@ -7444,6 +7703,9 @@ int sched_cpu_dying(unsigned int cpu)
/* Handle pending wakeups and then migrate everything off */
sched_ttwu_pending();
raw_spin_lock_irqsave(&rq->lock, flags);
+
+ walt_migrate_sync_cpu(cpu);
+
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
@@ -7463,6 +7725,7 @@ void __init sched_init_smp(void)
{
cpumask_var_t non_isolated_cpus;
+ walt_init_cpu_efficiency();
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
@@ -7620,6 +7883,7 @@ void __init sched_init(void)
#ifdef CONFIG_FAIR_GROUP_SCHED
root_task_group.shares = ROOT_TASK_GROUP_LOAD;
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
/*
* How much cpu bandwidth does root_task_group get?
*
@@ -7664,6 +7928,11 @@ void __init sched_init(void)
rq->idle_stamp = 0;
rq->avg_idle = 2*sysctl_sched_migration_cost;
rq->max_idle_balance_cost = sysctl_sched_migration_cost;
+#ifdef CONFIG_SCHED_WALT
+ rq->cur_irqload = 0;
+ rq->avg_irqload = 0;
+ rq->irqload_ts = 0;
+#endif
INIT_LIST_HEAD(&rq->cfs_tasks);
@@ -7721,6 +7990,14 @@ static inline int preempt_count_equals(int preempt_offset)
return (nested == preempt_offset);
}
+static int __might_sleep_init_called;
+int __init __might_sleep_init(void)
+{
+ __might_sleep_init_called = 1;
+ return 0;
+}
+early_initcall(__might_sleep_init);
+
void __might_sleep(const char *file, int line, int preempt_offset)
{
/*
@@ -7746,8 +8023,10 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
- !is_idle_task(current)) ||
- system_state != SYSTEM_RUNNING || oops_in_progress)
+ !is_idle_task(current)) || oops_in_progress)
+ return;
+ if (system_state != SYSTEM_RUNNING &&
+ (!__might_sleep_init_called || system_state != SYSTEM_BOOTING))
return;
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
return;
diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c
new file mode 100644
index 000000000000..1b19f2643f48
--- /dev/null
+++ b/kernel/sched/cpufreq_sched.c
@@ -0,0 +1,499 @@
+/*
+ * Copyright (C) 2015 Michael Turquette <mturquette@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/cpufreq.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/percpu.h>
+#include <linux/irq_work.h>
+#include <linux/delay.h>
+#include <linux/string.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/cpufreq_sched.h>
+
+#include "sched.h"
+
+#define THROTTLE_DOWN_NSEC 50000000 /* 50ms default */
+#define THROTTLE_UP_NSEC 500000 /* 500us default */
+
+/* Static key incremented by set_sched_freq() and decremented by clear_sched_freq(). */
+struct static_key __read_mostly __sched_freq = STATIC_KEY_INIT_FALSE;
+/*
+ * Set to true in cpufreq_sched_policy_init() when cpufreq_driver_is_slow()
+ * reports a slow driver; nothing in this file clears it again.
+ */
+static bool __read_mostly cpufreq_driver_slow;
+
+#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED
+static struct cpufreq_governor cpufreq_gov_sched;
+#endif
+
+/* Per-CPU flag: set by cpufreq_sched_start(), cleared by cpufreq_sched_stop(). */
+static DEFINE_PER_CPU(unsigned long, enabled);
+/* Per-CPU capacity requests, zeroed in cpufreq_sched_policy_init(). */
+DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs);
+
+/*
+ * Sysfs-visible tunables of the governor. A single instance is shared by all
+ * policies through global_tunables unless have_governor_per_policy() is true.
+ */
+struct gov_tunables {
+ struct gov_attr_set attr_set;
+ unsigned int up_throttle_nsec; /* throttle period after a change, used for frequency increases */
+ unsigned int down_throttle_nsec; /* throttle period after a change, used for frequency decreases */
+};
+
+/**
+ * struct gov_data - per-policy data internal to the governor
+ * @up_throttle: next throttling period expiry if increasing OPP
+ * @down_throttle: next throttling period expiry if decreasing OPP
+ * @tunables: throttle period lengths (up_throttle_nsec/down_throttle_nsec);
+ *            may be shared with other policies
+ * @tunables_hook: list node linking this policy into @tunables' attr_set
+ * @task: worker thread for dvfs transition that may block/sleep
+ * @irq_work: callback used to wake up worker thread
+ * @requested_freq: last frequency requested by the sched governor
+ *
+ * struct gov_data is the per-policy cpufreq_sched-specific data structure. A
+ * per-policy instance of it is allocated by cpufreq_sched_policy_init() (the
+ * governor's .init callback) and freed by cpufreq_sched_policy_exit(); a
+ * pointer to it is stored in the governor_data member of struct
+ * cpufreq_policy.
+ *
+ * Readers of this data must call down_read(policy->rwsem). Writers must
+ * call down_write(policy->rwsem).
+ */
+struct gov_data {
+ ktime_t up_throttle;
+ ktime_t down_throttle;
+ struct gov_tunables *tunables;
+ struct list_head tunables_hook;
+ struct task_struct *task;
+ struct irq_work irq_work;
+ unsigned int requested_freq;
+};
+
+/*
+ * Ask the cpufreq driver to switch @policy to @freq (CPUFREQ_RELATION_L) and
+ * restart both throttle windows from the current time.
+ *
+ * The request is silently dropped if policy->rwsem cannot be write-locked
+ * immediately.
+ */
+static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy,
+ unsigned int freq)
+{
+ /* NOTE(review): loaded before the lock is taken and not re-checked afterwards - confirm this cannot go stale. */
+ struct gov_data *gd = policy->governor_data;
+
+ /* avoid race with cpufreq_sched_stop */
+ if (!down_write_trylock(&policy->rwsem))
+ return;
+
+ /* The driver's return value is ignored; the throttle stamps are updated either way. */
+ __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L);
+
+ gd->up_throttle = ktime_add_ns(ktime_get(),
+ gd->tunables->up_throttle_nsec);
+ gd->down_throttle = ktime_add_ns(ktime_get(),
+ gd->tunables->down_throttle_nsec);
+ up_write(&policy->rwsem);
+}
+
+/*
+ * finish_last_request - wait out the throttle window of the previous request
+ * @gd: governor data holding the throttle expiry times
+ * @cur_freq: frequency the policy is currently running at
+ *
+ * The down-throttle expiry applies when the pending request is below
+ * @cur_freq, the up-throttle expiry otherwise.
+ *
+ * Returns false if the window had already expired, so the caller may act on
+ * the request immediately. Returns true if this function had to sleep; the
+ * request may have changed in the meantime and the caller should re-read it.
+ */
+static bool finish_last_request(struct gov_data *gd, unsigned int cur_freq)
+{
+	ktime_t now = ktime_get();
+	ktime_t throttle = gd->requested_freq < cur_freq ?
+		gd->down_throttle : gd->up_throttle;
+
+	if (ktime_after(now, throttle))
+		return false;
+
+	while (1) {
+		/*
+		 * Convert to microseconds in 64 bits. The throttle periods
+		 * are unsigned int nanoseconds, so the remaining time can
+		 * exceed INT_MAX ns and must not be squeezed through an int.
+		 * now <= throttle holds here, so the value is never negative.
+		 */
+		s64 usec_left = ktime_to_us(ktime_sub(throttle, now));
+
+		trace_cpufreq_sched_throttled(usec_left);
+		usleep_range(usec_left, usec_left + 100);
+		now = ktime_get();
+		if (ktime_after(now, throttle))
+			return true;
+	}
+}
+
+/*
+ * we pass in struct cpufreq_policy. This is safe because changing out the
+ * policy requires a call to __cpufreq_governor(policy, CPUFREQ_GOV_STOP),
+ * which tears down all of the data structures and __cpufreq_governor(policy,
+ * CPUFREQ_GOV_START) will do a full rebuild, including this kthread with the
+ * new policy pointer
+ */
+/*
+ * cpufreq_sched_thread - worker that performs frequency changes which may sleep
+ * @data: the struct cpufreq_policy this thread serves
+ *
+ * Switches itself to SCHED_FIFO priority 50, then loops: whenever
+ * gd->requested_freq differs from the last value acted upon, wait for the
+ * throttle window to expire and hand the request to the driver; otherwise
+ * sleep until woken by cpufreq_sched_irq_work() or kthread_stop().
+ *
+ * Returns 0 when asked to stop; exits with -EINVAL if SCHED_FIFO cannot be set.
+ */
+static int cpufreq_sched_thread(void *data)
+{
+	struct cpufreq_policy *policy = data;
+	struct gov_data *gd = policy->governor_data;
+	struct sched_param param = { .sched_priority = 50 };
+	unsigned int new_request = 0;
+	unsigned int last_request = 0;
+	int ret;
+
+	ret = sched_setscheduler_nocheck(gd->task, SCHED_FIFO, &param);
+	if (ret) {
+		pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
+		do_exit(-EINVAL);
+	} else {
+		pr_debug("%s: kthread (%d) set to SCHED_FIFO\n",
+				__func__, gd->task->pid);
+	}
+
+	do {
+		/*
+		 * Mark ourselves as sleeping BEFORE sampling the request.
+		 * A request (and its wake-up) that arrives after the sample
+		 * then puts us back to TASK_RUNNING and schedule() returns
+		 * immediately, instead of the wake-up being lost.
+		 */
+		set_current_state(TASK_INTERRUPTIBLE);
+		new_request = gd->requested_freq;
+		if (new_request == last_request) {
+			if (kthread_should_stop())
+				break;
+			schedule();
+		} else {
+			__set_current_state(TASK_RUNNING);
+			/*
+			 * if the frequency thread sleeps while waiting to be
+			 * unthrottled, start over to check for a newer request
+			 */
+			if (finish_last_request(gd, policy->cur))
+				continue;
+			last_request = new_request;
+			cpufreq_sched_try_driver_target(policy, new_request);
+		}
+	} while (!kthread_should_stop());
+
+	/* The break above leaves us in TASK_INTERRUPTIBLE. */
+	__set_current_state(TASK_RUNNING);
+
+	return 0;
+}
+
+/*
+ * irq_work callback: wake the worker thread of the policy whose gov_data
+ * embeds @irq_work.
+ *
+ * container_of() is pointer arithmetic on a valid member pointer and cannot
+ * yield NULL, so no NULL check is needed on the result.
+ */
+static void cpufreq_sched_irq_work(struct irq_work *irq_work)
+{
+	struct gov_data *gd = container_of(irq_work, struct gov_data, irq_work);
+
+	wake_up_process(gd->task);
+}
+
+/*
+ * Recompute the frequency request for the policy that @cpu belongs to.
+ *
+ * Takes the largest scr->total among the CPUs in policy->cpus, scales it by
+ * policy->max, resolves it through the frequency table, clamps it to
+ * [policy->min, policy->max] and, if it differs from gd->requested_freq,
+ * records it and either queues the irq_work (slow drivers) or calls the
+ * driver directly.
+ */
+static void update_fdomain_capacity_request(int cpu)
+{
+ unsigned int freq_new, index_new, cpu_tmp;
+ struct cpufreq_policy *policy;
+ struct gov_data *gd;
+ unsigned long capacity = 0;
+
+ /*
+ * Avoid grabbing the policy if possible. A test is still
+ * required after locking the CPU's policy to avoid racing
+ * with the governor changing.
+ */
+ if (!per_cpu(enabled, cpu))
+ return;
+
+ policy = cpufreq_cpu_get(cpu);
+ if (IS_ERR_OR_NULL(policy))
+ return;
+
+ /* Bail out if another governor owns the policy or our data is not set up. */
+ if (policy->governor != &cpufreq_gov_sched ||
+ !policy->governor_data)
+ goto out;
+
+ gd = policy->governor_data;
+
+ /* find max capacity requested by cpus in this policy */
+ for_each_cpu(cpu_tmp, policy->cpus) {
+ struct sched_capacity_reqs *scr;
+
+ scr = &per_cpu(cpu_sched_capacity_reqs, cpu_tmp);
+ capacity = max(capacity, scr->total);
+ }
+
+ /* Convert the new maximum capacity request into a cpu frequency */
+ freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT;
+ index_new = cpufreq_frequency_table_target(policy, freq_new, CPUFREQ_RELATION_L);
+ freq_new = policy->freq_table[index_new].frequency;
+
+ if (freq_new > policy->max)
+ freq_new = policy->max;
+
+ if (freq_new < policy->min)
+ freq_new = policy->min;
+
+ trace_cpufreq_sched_request_opp(cpu, capacity, freq_new,
+ gd->requested_freq);
+ /* Nothing to do when the resolved frequency matches the last request. */
+ if (freq_new == gd->requested_freq)
+ goto out;
+
+ gd->requested_freq = freq_new;
+
+ /*
+ * Throttling is not yet supported on platforms with fast cpufreq
+ * drivers.
+ */
+ if (cpufreq_driver_slow)
+ irq_work_queue_on(&gd->irq_work, cpu);
+ else
+ cpufreq_sched_try_driver_target(policy, freq_new);
+
+out:
+ /* Drop the reference taken by cpufreq_cpu_get(). */
+ cpufreq_cpu_put(policy);
+}
+
+/*
+ * Recompute @cpu's total capacity request from its cfs, rt and dl parts.
+ *
+ * @cpu: CPU whose sched_capacity_reqs is updated; its rq lock must be held.
+ * @request: when true and the total changed, also re-evaluate the frequency
+ *           of the CPU's policy via update_fdomain_capacity_request().
+ */
+void update_cpu_capacity_request(int cpu, bool request)
+{
+ unsigned long new_capacity;
+ struct sched_capacity_reqs *scr;
+
+ /* The rq lock serializes access to the CPU's sched_capacity_reqs. */
+ lockdep_assert_held(&cpu_rq(cpu)->lock);
+
+ scr = &per_cpu(cpu_sched_capacity_reqs, cpu);
+
+ /* capacity_margin is applied to cfs + rt only; dl is added afterwards unscaled. */
+ new_capacity = scr->cfs + scr->rt;
+ new_capacity = new_capacity * capacity_margin
+ / SCHED_CAPACITY_SCALE;
+ new_capacity += scr->dl;
+
+ if (new_capacity == scr->total)
+ return;
+
+ trace_cpufreq_sched_update_capacity(cpu, request, scr, new_capacity);
+
+ scr->total = new_capacity;
+ if (request)
+ update_fdomain_capacity_request(cpu);
+}
+
+/* Take one reference on the __sched_freq static key (called once per initialised policy). */
+static inline void set_sched_freq(void)
+{
+ static_key_slow_inc(&__sched_freq);
+}
+
+/* Drop one reference on the __sched_freq static key (called from policy exit). */
+static inline void clear_sched_freq(void)
+{
+ static_key_slow_dec(&__sched_freq);
+}
+
+/* Tunables */
+static struct gov_tunables *global_tunables;
+
+/* Map an embedded gov_attr_set back to the gov_tunables that contains it. */
+static inline struct gov_tunables *to_tunables(struct gov_attr_set *attr_set)
+{
+ return container_of(attr_set, struct gov_tunables, attr_set);
+}
+
+/* sysfs show: print up_throttle_nsec as an unsigned decimal followed by a newline. */
+static ssize_t up_throttle_nsec_show(struct gov_attr_set *attr_set, char *buf)
+{
+ struct gov_tunables *tunables = to_tunables(attr_set);
+
+ return sprintf(buf, "%u\n", tunables->up_throttle_nsec);
+}
+
+/*
+ * sysfs store: set the up-throttle period in nanoseconds.
+ *
+ * The value is parsed directly as an unsigned int (base auto-detected), so
+ * input that does not fit the tunable is rejected with the parser's error
+ * instead of being silently truncated.
+ *
+ * Returns @count on success or a negative errno from kstrtouint().
+ */
+static ssize_t up_throttle_nsec_store(struct gov_attr_set *attr_set,
+				      const char *buf, size_t count)
+{
+	struct gov_tunables *tunables = to_tunables(attr_set);
+	unsigned int val;
+	int ret;
+
+	ret = kstrtouint(buf, 0, &val);
+	if (ret < 0)
+		return ret;
+
+	tunables->up_throttle_nsec = val;
+	return count;
+}
+
+/* sysfs show: print down_throttle_nsec as an unsigned decimal followed by a newline. */
+static ssize_t down_throttle_nsec_show(struct gov_attr_set *attr_set, char *buf)
+{
+ struct gov_tunables *tunables = to_tunables(attr_set);
+
+ return sprintf(buf, "%u\n", tunables->down_throttle_nsec);
+}
+
+/*
+ * sysfs store: set the down-throttle period in nanoseconds.
+ *
+ * The value is parsed directly as an unsigned int (base auto-detected), so
+ * input that does not fit the tunable is rejected with the parser's error
+ * instead of being silently truncated.
+ *
+ * Returns @count on success or a negative errno from kstrtouint().
+ */
+static ssize_t down_throttle_nsec_store(struct gov_attr_set *attr_set,
+					const char *buf, size_t count)
+{
+	struct gov_tunables *tunables = to_tunables(attr_set);
+	unsigned int val;
+	int ret;
+
+	ret = kstrtouint(buf, 0, &val);
+	if (ret < 0)
+		return ret;
+
+	tunables->down_throttle_nsec = val;
+	return count;
+}
+
+/* Read/write sysfs attributes backed by the *_show()/*_store() handlers above. */
+static struct governor_attr up_throttle_nsec = __ATTR_RW(up_throttle_nsec);
+static struct governor_attr down_throttle_nsec = __ATTR_RW(down_throttle_nsec);
+
+/* NULL-terminated list of default attributes for the tunables kobject. */
+static struct attribute *schedfreq_attributes[] = {
+ &up_throttle_nsec.attr,
+ &down_throttle_nsec.attr,
+ NULL
+};
+
+/*
+ * kobj_type used for gov_tunables.attr_set.kobj. No .release callback is
+ * set here; the tunables memory is freed explicitly by the governor.
+ */
+static struct kobj_type tunables_ktype = {
+ .default_attrs = schedfreq_attributes,
+ .sysfs_ops = &governor_sysfs_ops,
+};
+
+/*
+ * cpufreq_sched_policy_init - governor .init callback
+ * @policy: policy being handed to the sched governor
+ *
+ * Zeroes the capacity requests of the policy's CPUs, allocates the per-policy
+ * gov_data, creates the tunables (or takes a reference on the shared
+ * global_tunables) and, for slow drivers, creates and starts the worker
+ * thread. On success one reference on the __sched_freq static key is taken.
+ *
+ * Returns 0 on success or a negative errno. On failure everything acquired
+ * here is released again and policy->governor_data is reset to NULL.
+ */
+static int cpufreq_sched_policy_init(struct cpufreq_policy *policy)
+{
+	struct gov_data *gd;
+	int cpu;
+	int rc;
+
+	for_each_cpu(cpu, policy->cpus)
+		memset(&per_cpu(cpu_sched_capacity_reqs, cpu), 0,
+		       sizeof(struct sched_capacity_reqs));
+
+	gd = kzalloc(sizeof(*gd), GFP_KERNEL);
+	if (!gd)
+		return -ENOMEM;
+
+	policy->governor_data = gd;
+
+	if (!global_tunables) {
+		gd->tunables = kzalloc(sizeof(*gd->tunables), GFP_KERNEL);
+		if (!gd->tunables) {
+			rc = -ENOMEM;
+			goto free_gd;
+		}
+
+		/* Default the up-throttle to the driver's transition latency when it reports one. */
+		gd->tunables->up_throttle_nsec =
+			policy->cpuinfo.transition_latency ?
+			policy->cpuinfo.transition_latency :
+			THROTTLE_UP_NSEC;
+		gd->tunables->down_throttle_nsec =
+			THROTTLE_DOWN_NSEC;
+
+		rc = kobject_init_and_add(&gd->tunables->attr_set.kobj,
+					  &tunables_ktype,
+					  get_governor_parent_kobj(policy),
+					  "%s", cpufreq_gov_sched.name);
+		if (rc)
+			goto put_kobj;
+
+		gov_attr_set_init(&gd->tunables->attr_set,
+				  &gd->tunables_hook);
+
+		pr_debug("%s: throttle_threshold = %u [ns]\n",
+			 __func__, gd->tunables->up_throttle_nsec);
+
+		if (!have_governor_per_policy())
+			global_tunables = gd->tunables;
+	} else {
+		gd->tunables = global_tunables;
+		gov_attr_set_get(&global_tunables->attr_set,
+				 &gd->tunables_hook);
+	}
+
+	if (cpufreq_driver_is_slow()) {
+		cpufreq_driver_slow = true;
+		gd->task = kthread_create(cpufreq_sched_thread, policy,
+					  "kschedfreq:%d",
+					  cpumask_first(policy->related_cpus));
+		if (IS_ERR_OR_NULL(gd->task)) {
+			rc = gd->task ? PTR_ERR(gd->task) : -ENOMEM;
+			pr_err("%s: failed to create kschedfreq thread\n",
+			       __func__);
+			goto put_tunables;
+		}
+		get_task_struct(gd->task);
+		kthread_bind_mask(gd->task, policy->related_cpus);
+		init_irq_work(&gd->irq_work, cpufreq_sched_irq_work);
+		wake_up_process(gd->task);
+	}
+
+	set_sched_freq();
+
+	return 0;
+
+put_tunables:
+	/*
+	 * Mirror cpufreq_sched_policy_exit(): drop our reference and free the
+	 * tunables only if we were the last user. They may be the shared
+	 * global_tunables still in use by other policies.
+	 */
+	if (!gov_attr_set_put(&gd->tunables->attr_set, &gd->tunables_hook)) {
+		if (!have_governor_per_policy())
+			global_tunables = NULL;
+		kfree(gd->tunables);
+	}
+	goto free_gd;
+
+put_kobj:
+	/* A failed kobject_init_and_add() must still be paired with kobject_put(). */
+	kobject_put(&gd->tunables->attr_set.kobj);
+	kfree(gd->tunables);
+free_gd:
+	policy->governor_data = NULL;
+	kfree(gd);
+	return rc;
+}
+
+/*
+ * Governor .exit callback: undo cpufreq_sched_policy_init() for @policy.
+ * Drops the static-key reference, stops the worker thread on slow drivers,
+ * releases this policy's reference on the tunables and frees the gov_data.
+ */
+static void cpufreq_sched_policy_exit(struct cpufreq_policy *policy)
+{
+ unsigned int count;
+ struct gov_data *gd = policy->governor_data;
+
+ clear_sched_freq();
+ /* The worker thread only exists when the driver was detected as slow. */
+ if (cpufreq_driver_slow) {
+ kthread_stop(gd->task);
+ put_task_struct(gd->task);
+ }
+
+ /* Free the tunables only when this was their last user. */
+ count = gov_attr_set_put(&gd->tunables->attr_set, &gd->tunables_hook);
+ if (!count) {
+ if (!have_governor_per_policy())
+ global_tunables = NULL;
+ kfree(gd->tunables);
+ }
+
+ policy->governor_data = NULL;
+
+ kfree(gd);
+}
+
+/*
+ * Governor .start callback: set the per-CPU 'enabled' flag for every CPU in
+ * policy->cpus so update_fdomain_capacity_request() starts acting on them.
+ * Always returns 0.
+ */
+static int cpufreq_sched_start(struct cpufreq_policy *policy)
+{
+ int cpu;
+
+ for_each_cpu(cpu, policy->cpus)
+ per_cpu(enabled, cpu) = 1;
+
+ return 0;
+}
+
+/*
+ * Governor .limits callback: the policy's min/max changed.
+ *
+ * Clamps the last requested frequency into [policy->min, policy->max] and
+ * asks the driver to switch if that differs from the current frequency.
+ */
+static void cpufreq_sched_limits(struct cpufreq_policy *policy)
+{
+	unsigned int clamp_freq;
+	struct gov_data *gd = policy->governor_data;
+
+	pr_debug("limit event for cpu %u: %u - %u kHz, currently %u kHz\n",
+		policy->cpu, policy->min, policy->max,
+		policy->cur);
+
+	clamp_freq = clamp(gd->requested_freq, policy->min, policy->max);
+
+	if (policy->cur != clamp_freq)
+		__cpufreq_driver_target(policy, clamp_freq, CPUFREQ_RELATION_L);
+}
+
+/*
+ * Governor .stop callback: clear the per-CPU 'enabled' flag for every CPU in
+ * policy->cpus so update_fdomain_capacity_request() returns early for them.
+ */
+static void cpufreq_sched_stop(struct cpufreq_policy *policy)
+{
+ int cpu;
+
+ for_each_cpu(cpu, policy->cpus)
+ per_cpu(enabled, cpu) = 0;
+}
+
+
+/*
+ * The "sched" governor descriptor. It has external linkage only when
+ * CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED is set; otherwise it is file-local.
+ */
+#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED
+static
+#endif
+struct cpufreq_governor cpufreq_gov_sched = {
+ .name = "sched",
+ .init = cpufreq_sched_policy_init,
+ .exit = cpufreq_sched_policy_exit,
+ .start = cpufreq_sched_start,
+ .stop = cpufreq_sched_stop,
+ .limits = cpufreq_sched_limits,
+ .owner = THIS_MODULE,
+};
+
+/*
+ * Boot-time setup: mark every possible CPU as not enabled, then register the
+ * "sched" governor with the cpufreq core and pass back its result.
+ */
+static int __init cpufreq_sched_init(void)
+{
+	int i;
+
+	for_each_possible_cpu(i)
+		per_cpu(enabled, i) = 0;
+
+	return cpufreq_register_governor(&cpufreq_gov_sched);
+}
+
+#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED
+/* Return the "sched" governor as the default; built only when CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED is set. */
+struct cpufreq_governor *cpufreq_default_governor(void)
+{
+ return &cpufreq_gov_sched;
+}
+#endif
+
+/* Try to make this the default governor */
+fs_initcall(cpufreq_sched_init);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index cb771c76682e..19694c241081 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -12,14 +12,27 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/cpufreq.h>
+#include <linux/kthread.h>
#include <linux/slab.h>
#include <trace/events/power.h>
#include "sched.h"
+#include "tune.h"
+
+unsigned long boosted_cpu_util(int cpu);
+
+/* Stub out fast switch routines present on mainline to reduce the backport
+ * overhead. */
+#define cpufreq_driver_fast_switch(x, y) 0
+#define cpufreq_enable_fast_switch(x)
+#define cpufreq_disable_fast_switch(x)
+#define LATENCY_MULTIPLIER (1000)
+#define SUGOV_KTHREAD_PRIORITY 50
struct sugov_tunables {
struct gov_attr_set attr_set;
- unsigned int rate_limit_us;
+ unsigned int up_rate_limit_us;
+ unsigned int down_rate_limit_us;
};
struct sugov_policy {
@@ -30,14 +43,18 @@ struct sugov_policy {
raw_spinlock_t update_lock; /* For shared policies */
u64 last_freq_update_time;
- s64 freq_update_delay_ns;
+ s64 min_rate_limit_ns;
+ s64 up_rate_delay_ns;
+ s64 down_rate_delay_ns;
unsigned int next_freq;
unsigned int cached_raw_freq;
/* The next fields are only needed if fast switch cannot be used. */
struct irq_work irq_work;
- struct work_struct work;
+ struct kthread_work work;
struct mutex work_lock;
+ struct kthread_worker worker;
+ struct task_struct *thread;
bool work_in_progress;
bool need_freq_update;
@@ -55,6 +72,11 @@ struct sugov_cpu {
unsigned long util;
unsigned long max;
unsigned int flags;
+
+ /* The field below is for single-CPU policies only. */
+#ifdef CONFIG_NO_HZ_COMMON
+ unsigned long saved_idle_calls;
+#endif
};
static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);
@@ -79,7 +101,27 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
}
delta_ns = time - sg_policy->last_freq_update_time;
- return delta_ns >= sg_policy->freq_update_delay_ns;
+
+ /* No need to recalculate next freq for min_rate_limit_us at least */
+ return delta_ns >= sg_policy->min_rate_limit_ns;
+}
+
+static bool sugov_up_down_rate_limit(struct sugov_policy *sg_policy, u64 time,
+ unsigned int next_freq)
+{
+ s64 delta_ns;
+
+ delta_ns = time - sg_policy->last_freq_update_time;
+
+ if (next_freq > sg_policy->next_freq &&
+ delta_ns < sg_policy->up_rate_delay_ns)
+ return true;
+
+ if (next_freq < sg_policy->next_freq &&
+ delta_ns < sg_policy->down_rate_delay_ns)
+ return true;
+
+ return false;
}
static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
@@ -87,22 +129,23 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
{
struct cpufreq_policy *policy = sg_policy->policy;
+ if (sugov_up_down_rate_limit(sg_policy, time, next_freq))
+ return;
+
+ if (sg_policy->next_freq == next_freq)
+ return;
+
+ sg_policy->next_freq = next_freq;
sg_policy->last_freq_update_time = time;
if (policy->fast_switch_enabled) {
- if (sg_policy->next_freq == next_freq) {
- trace_cpu_frequency(policy->cur, smp_processor_id());
- return;
- }
- sg_policy->next_freq = next_freq;
next_freq = cpufreq_driver_fast_switch(policy, next_freq);
if (next_freq == CPUFREQ_ENTRY_INVALID)
return;
policy->cur = next_freq;
trace_cpu_frequency(next_freq, smp_processor_id());
- } else if (sg_policy->next_freq != next_freq) {
- sg_policy->next_freq = next_freq;
+ } else {
sg_policy->work_in_progress = true;
irq_work_queue(&sg_policy->irq_work);
}
@@ -110,7 +153,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
/**
* get_next_freq - Compute a new frequency for a given cpufreq policy.
- * @sg_cpu: schedutil cpu object to compute the new frequency for.
+ * @sg_policy: schedutil policy object to compute the new frequency for.
* @util: Current CPU utilization.
* @max: CPU capacity.
*
@@ -130,10 +173,9 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
* next_freq (as calculated above) is returned, subject to policy min/max and
* cpufreq driver limitations.
*/
-static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util,
- unsigned long max)
+static unsigned int get_next_freq(struct sugov_policy *sg_policy,
+ unsigned long util, unsigned long max)
{
- struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;
unsigned int freq = arch_scale_freq_invariant() ?
policy->cpuinfo.max_freq : policy->cur;
@@ -146,15 +188,36 @@ static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util,
return cpufreq_driver_resolve_freq(policy, freq);
}
-static void sugov_get_util(unsigned long *util, unsigned long *max)
+static inline bool use_pelt(void)
{
- struct rq *rq = this_rq();
- unsigned long cfs_max;
+#ifdef CONFIG_SCHED_WALT
+ return (!sysctl_sched_use_walt_cpu_util || walt_disabled);
+#else
+ return true;
+#endif
+}
+
+static void sugov_get_util(unsigned long *util, unsigned long *max, u64 time)
+{
+ int cpu = smp_processor_id();
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long max_cap, rt;
+ s64 delta;
+
+ max_cap = arch_scale_cpu_capacity(NULL, cpu);
- cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id());
+ sched_avg_update(rq);
+ delta = time - rq->age_stamp;
+ if (unlikely(delta < 0))
+ delta = 0;
+ rt = div64_u64(rq->rt_avg, sched_avg_period() + delta);
+ rt = (rt * max_cap) >> SCHED_CAPACITY_SHIFT;
- *util = min(rq->cfs.avg.util_avg, cfs_max);
- *max = cfs_max;
+ *util = boosted_cpu_util(cpu);
+ if (likely(use_pelt()))
+ *util = min((*util + rt), max_cap);
+
+ *max = max_cap;
}
static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
@@ -187,6 +250,19 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
sg_cpu->iowait_boost >>= 1;
}
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * Report whether tick_nohz_get_idle_calls() is unchanged since the previous
+ * call for this sg_cpu, and remember the new value for next time. The caller
+ * uses a true result to avoid lowering the frequency.
+ */
+static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
+{
+ unsigned long idle_calls = tick_nohz_get_idle_calls();
+ bool ret = idle_calls == sg_cpu->saved_idle_calls;
+
+ sg_cpu->saved_idle_calls = idle_calls;
+ return ret;
+}
+#else
+/* Without CONFIG_NO_HZ_COMMON the CPU is never reported as busy. */
+static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
+#endif /* CONFIG_NO_HZ_COMMON */
+
static void sugov_update_single(struct update_util_data *hook, u64 time,
unsigned int flags)
{
@@ -195,6 +271,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
struct cpufreq_policy *policy = sg_policy->policy;
unsigned long util, max;
unsigned int next_f;
+ bool busy;
sugov_set_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
@@ -202,40 +279,36 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
if (!sugov_should_update_freq(sg_policy, time))
return;
- if (flags & SCHED_CPUFREQ_RT_DL) {
+ busy = sugov_cpu_is_busy(sg_cpu);
+
+ if (flags & SCHED_CPUFREQ_DL) {
next_f = policy->cpuinfo.max_freq;
} else {
- sugov_get_util(&util, &max);
+ sugov_get_util(&util, &max, time);
sugov_iowait_boost(sg_cpu, &util, &max);
- next_f = get_next_freq(sg_cpu, util, max);
+ next_f = get_next_freq(sg_policy, util, max);
+ /*
+ * Do not reduce the frequency if the CPU has not been idle
+ * recently, as the reduction is likely to be premature then.
+ */
+ if (busy && next_f < sg_policy->next_freq)
+ next_f = sg_policy->next_freq;
}
sugov_update_commit(sg_policy, time, next_f);
}
-static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
- unsigned long util, unsigned long max,
- unsigned int flags)
+static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
{
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;
- unsigned int max_f = policy->cpuinfo.max_freq;
- u64 last_freq_update_time = sg_policy->last_freq_update_time;
+ unsigned long util = 0, max = 1;
unsigned int j;
- if (flags & SCHED_CPUFREQ_RT_DL)
- return max_f;
-
- sugov_iowait_boost(sg_cpu, &util, &max);
-
for_each_cpu(j, policy->cpus) {
- struct sugov_cpu *j_sg_cpu;
+ struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
unsigned long j_util, j_max;
s64 delta_ns;
- if (j == smp_processor_id())
- continue;
-
- j_sg_cpu = &per_cpu(sugov_cpu, j);
/*
* If the CPU utilization was last updated before the previous
* frequency update and the time elapsed between the last update
@@ -243,13 +316,13 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
* enough, don't take the CPU into account as it probably is
* idle now (and clear iowait_boost for it).
*/
- delta_ns = last_freq_update_time - j_sg_cpu->last_update;
+ delta_ns = time - j_sg_cpu->last_update;
if (delta_ns > TICK_NSEC) {
j_sg_cpu->iowait_boost = 0;
continue;
}
- if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
- return max_f;
+ if (j_sg_cpu->flags & SCHED_CPUFREQ_DL)
+ return policy->cpuinfo.max_freq;
j_util = j_sg_cpu->util;
j_max = j_sg_cpu->max;
@@ -261,7 +334,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
sugov_iowait_boost(j_sg_cpu, &util, &max);
}
- return get_next_freq(sg_cpu, util, max);
+ return get_next_freq(sg_policy, util, max);
}
static void sugov_update_shared(struct update_util_data *hook, u64 time,
@@ -272,7 +345,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
unsigned long util, max;
unsigned int next_f;
- sugov_get_util(&util, &max);
+ sugov_get_util(&util, &max, time);
raw_spin_lock(&sg_policy->update_lock);
@@ -284,14 +357,18 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
sg_cpu->last_update = time;
if (sugov_should_update_freq(sg_policy, time)) {
- next_f = sugov_next_freq_shared(sg_cpu, util, max, flags);
+ if (flags & SCHED_CPUFREQ_DL)
+ next_f = sg_policy->policy->cpuinfo.max_freq;
+ else
+ next_f = sugov_next_freq_shared(sg_cpu, time);
+
sugov_update_commit(sg_policy, time, next_f);
}
raw_spin_unlock(&sg_policy->update_lock);
}
-static void sugov_work(struct work_struct *work)
+static void sugov_work(struct kthread_work *work)
{
struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
@@ -308,7 +385,21 @@ static void sugov_irq_work(struct irq_work *irq_work)
struct sugov_policy *sg_policy;
sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
- schedule_work_on(smp_processor_id(), &sg_policy->work);
+
+ /*
+ * For RT and deadline tasks, the schedutil governor shoots the
+ * frequency to maximum. Special care must be taken to ensure that this
+ * kthread doesn't result in the same behavior.
+ *
+ * This is (mostly) guaranteed by the work_in_progress flag. The flag is
+ * updated only at the end of the sugov_work() function and before that
+ * the schedutil governor rejects all other frequency scaling requests.
+ *
+ * There is a very rare case though, where the RT thread yields right
+ * after the work_in_progress flag is cleared. The effects of that are
+ * neglected for now.
+ */
+ kthread_queue_work(&sg_policy->worker, &sg_policy->work);
}
/************************** sysfs interface ************************/
@@ -321,15 +412,32 @@ static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr
return container_of(attr_set, struct sugov_tunables, attr_set);
}
-static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
+static DEFINE_MUTEX(min_rate_lock);
+
+/*
+ * Recompute min_rate_limit_ns as the smaller of the up and down rate delays,
+ * under min_rate_lock. Despite the _us suffix in the name, the stored value
+ * is in nanoseconds.
+ */
+static void update_min_rate_limit_us(struct sugov_policy *sg_policy)
+{
+ mutex_lock(&min_rate_lock);
+ sg_policy->min_rate_limit_ns = min(sg_policy->up_rate_delay_ns,
+ sg_policy->down_rate_delay_ns);
+ mutex_unlock(&min_rate_lock);
+}
+
+static ssize_t up_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
+{
+ struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
+
+ return sprintf(buf, "%u\n", tunables->up_rate_limit_us);
+}
+
+static ssize_t down_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
{
struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
- return sprintf(buf, "%u\n", tunables->rate_limit_us);
+ return sprintf(buf, "%u\n", tunables->down_rate_limit_us);
}
-static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf,
- size_t count)
+static ssize_t up_rate_limit_us_store(struct gov_attr_set *attr_set,
+ const char *buf, size_t count)
{
struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
struct sugov_policy *sg_policy;
@@ -338,18 +446,42 @@ static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *bu
if (kstrtouint(buf, 10, &rate_limit_us))
return -EINVAL;
- tunables->rate_limit_us = rate_limit_us;
+ tunables->up_rate_limit_us = rate_limit_us;
- list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook)
- sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC;
+ list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) {
+ sg_policy->up_rate_delay_ns = rate_limit_us * NSEC_PER_USEC;
+ update_min_rate_limit_us(sg_policy);
+ }
return count;
}
-static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us);
+/*
+ * sysfs store: set the down rate limit (microseconds, base 10) and propagate
+ * it, converted to nanoseconds, to every policy attached to this attr_set.
+ * Returns @count on success or -EINVAL if the input does not parse.
+ */
+static ssize_t down_rate_limit_us_store(struct gov_attr_set *attr_set,
+ const char *buf, size_t count)
+{
+ struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
+ struct sugov_policy *sg_policy;
+ unsigned int rate_limit_us;
+
+ if (kstrtouint(buf, 10, &rate_limit_us))
+ return -EINVAL;
+
+ tunables->down_rate_limit_us = rate_limit_us;
+
+ list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) {
+ /* NOTE(review): the multiplication is done before widening to s64 - confirm it cannot overflow where long is 32 bits. */
+ sg_policy->down_rate_delay_ns = rate_limit_us * NSEC_PER_USEC;
+ update_min_rate_limit_us(sg_policy);
+ }
+
+ return count;
+}
+
+static struct governor_attr up_rate_limit_us = __ATTR_RW(up_rate_limit_us);
+static struct governor_attr down_rate_limit_us = __ATTR_RW(down_rate_limit_us);
static struct attribute *sugov_attributes[] = {
- &rate_limit_us.attr,
+ &up_rate_limit_us.attr,
+ &down_rate_limit_us.attr,
NULL
};
@@ -371,19 +503,64 @@ static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
return NULL;
sg_policy->policy = policy;
- init_irq_work(&sg_policy->irq_work, sugov_irq_work);
- INIT_WORK(&sg_policy->work, sugov_work);
- mutex_init(&sg_policy->work_lock);
raw_spin_lock_init(&sg_policy->update_lock);
return sg_policy;
}
static void sugov_policy_free(struct sugov_policy *sg_policy)
{
- mutex_destroy(&sg_policy->work_lock);
kfree(sg_policy);
}
+/*
+ * Create the per-policy kthread worker used for slow-path frequency changes.
+ *
+ * Does nothing (returns 0) when fast switching is enabled. Otherwise it
+ * initialises the kthread work and worker, creates the "sugov:%d" thread,
+ * makes it SCHED_FIFO, binds it to policy->related_cpus, initialises the
+ * irq_work and work_lock, and wakes the thread.
+ *
+ * Returns 0 on success or a negative errno; on failure no thread is left
+ * behind.
+ */
+static int sugov_kthread_create(struct sugov_policy *sg_policy)
+{
+ struct task_struct *thread;
+ /* NOTE(review): SUGOV_KTHREAD_PRIORITY is defined at the top of this file but is not used here. */
+ struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 };
+ struct cpufreq_policy *policy = sg_policy->policy;
+ int ret;
+
+ /* kthread only required for slow path */
+ if (policy->fast_switch_enabled)
+ return 0;
+
+ kthread_init_work(&sg_policy->work, sugov_work);
+ kthread_init_worker(&sg_policy->worker);
+ thread = kthread_create(kthread_worker_fn, &sg_policy->worker,
+ "sugov:%d",
+ cpumask_first(policy->related_cpus));
+ if (IS_ERR(thread)) {
+ pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread));
+ return PTR_ERR(thread);
+ }
+
+ ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, &param);
+ if (ret) {
+ /* Tear the not-yet-woken thread down again before reporting the error. */
+ kthread_stop(thread);
+ pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
+ return ret;
+ }
+
+ sg_policy->thread = thread;
+ kthread_bind_mask(thread, policy->related_cpus);
+ init_irq_work(&sg_policy->irq_work, sugov_irq_work);
+ mutex_init(&sg_policy->work_lock);
+
+ wake_up_process(thread);
+
+ return 0;
+}
+
+/*
+ * Counterpart of sugov_kthread_create(): flush outstanding work, stop the
+ * worker thread and destroy work_lock. A no-op when fast switching is
+ * enabled, since no thread was created in that case.
+ */
+static void sugov_kthread_stop(struct sugov_policy *sg_policy)
+{
+ /* kthread only required for slow path */
+ if (sg_policy->policy->fast_switch_enabled)
+ return;
+
+ kthread_flush_worker(&sg_policy->worker);
+ kthread_stop(sg_policy->thread);
+ mutex_destroy(&sg_policy->work_lock);
+}
+
static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
{
struct sugov_tunables *tunables;
@@ -416,16 +593,24 @@ static int sugov_init(struct cpufreq_policy *policy)
if (policy->governor_data)
return -EBUSY;
+ cpufreq_enable_fast_switch(policy);
+
sg_policy = sugov_policy_alloc(policy);
- if (!sg_policy)
- return -ENOMEM;
+ if (!sg_policy) {
+ ret = -ENOMEM;
+ goto disable_fast_switch;
+ }
+
+ ret = sugov_kthread_create(sg_policy);
+ if (ret)
+ goto free_sg_policy;
mutex_lock(&global_tunables_lock);
if (global_tunables) {
if (WARN_ON(have_governor_per_policy())) {
ret = -EINVAL;
- goto free_sg_policy;
+ goto stop_kthread;
}
policy->governor_data = sg_policy;
sg_policy->tunables = global_tunables;
@@ -437,13 +622,16 @@ static int sugov_init(struct cpufreq_policy *policy)
tunables = sugov_tunables_alloc(sg_policy);
if (!tunables) {
ret = -ENOMEM;
- goto free_sg_policy;
+ goto stop_kthread;
}
- tunables->rate_limit_us = LATENCY_MULTIPLIER;
+ tunables->up_rate_limit_us = LATENCY_MULTIPLIER;
+ tunables->down_rate_limit_us = LATENCY_MULTIPLIER;
lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
- if (lat)
- tunables->rate_limit_us *= lat;
+ if (lat) {
+ tunables->up_rate_limit_us *= lat;
+ tunables->down_rate_limit_us *= lat;
+ }
policy->governor_data = sg_policy;
sg_policy->tunables = tunables;
@@ -454,20 +642,25 @@ static int sugov_init(struct cpufreq_policy *policy)
if (ret)
goto fail;
- out:
+out:
mutex_unlock(&global_tunables_lock);
-
- cpufreq_enable_fast_switch(policy);
return 0;
- fail:
+fail:
policy->governor_data = NULL;
sugov_tunables_free(tunables);
+ stop_kthread:
+ sugov_kthread_stop(sg_policy);
+
free_sg_policy:
mutex_unlock(&global_tunables_lock);
sugov_policy_free(sg_policy);
+
+disable_fast_switch:
+ cpufreq_disable_fast_switch(policy);
+
pr_err("initialization failed (error %d)\n", ret);
return ret;
}
@@ -478,8 +671,6 @@ static void sugov_exit(struct cpufreq_policy *policy)
struct sugov_tunables *tunables = sg_policy->tunables;
unsigned int count;
- cpufreq_disable_fast_switch(policy);
-
mutex_lock(&global_tunables_lock);
count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
@@ -489,7 +680,10 @@ static void sugov_exit(struct cpufreq_policy *policy)
mutex_unlock(&global_tunables_lock);
+ sugov_kthread_stop(sg_policy);
sugov_policy_free(sg_policy);
+
+ cpufreq_disable_fast_switch(policy);
}
static int sugov_start(struct cpufreq_policy *policy)
@@ -497,7 +691,11 @@ static int sugov_start(struct cpufreq_policy *policy)
struct sugov_policy *sg_policy = policy->governor_data;
unsigned int cpu;
- sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
+ sg_policy->up_rate_delay_ns =
+ sg_policy->tunables->up_rate_limit_us * NSEC_PER_USEC;
+ sg_policy->down_rate_delay_ns =
+ sg_policy->tunables->down_rate_limit_us * NSEC_PER_USEC;
+ update_min_rate_limit_us(sg_policy);
sg_policy->last_freq_update_time = 0;
sg_policy->next_freq = UINT_MAX;
sg_policy->work_in_progress = false;
@@ -509,7 +707,7 @@ static int sugov_start(struct cpufreq_policy *policy)
memset(sg_cpu, 0, sizeof(*sg_cpu));
sg_cpu->sg_policy = sg_policy;
- sg_cpu->flags = SCHED_CPUFREQ_RT;
+ sg_cpu->flags = SCHED_CPUFREQ_DL;
sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
policy_is_shared(policy) ?
@@ -529,8 +727,10 @@ static void sugov_stop(struct cpufreq_policy *policy)
synchronize_sched();
- irq_work_sync(&sg_policy->irq_work);
- cancel_work_sync(&sg_policy->work);
+ if (!policy->fast_switch_enabled) {
+ irq_work_sync(&sg_policy->irq_work);
+ kthread_cancel_work_sync(&sg_policy->work);
+ }
}
static void sugov_limits(struct cpufreq_policy *policy)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 448d6426fa5f..84437fca4c1c 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -8,7 +8,7 @@
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif
-
+#include "walt.h"
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -58,11 +58,18 @@ void irqtime_account_irq(struct task_struct *curr)
struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
s64 delta;
int cpu;
+#ifdef CONFIG_SCHED_WALT
+ u64 wallclock;
+ bool account = true;
+#endif
if (!sched_clock_irqtime)
return;
cpu = smp_processor_id();
+#ifdef CONFIG_SCHED_WALT
+ wallclock = sched_clock_cpu(cpu);
+#endif
delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
irqtime->irq_start_time += delta;
@@ -76,6 +83,13 @@ void irqtime_account_irq(struct task_struct *curr)
irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
+#ifdef CONFIG_SCHED_WALT
+ else
+ account = false;
+
+ if (account)
+ walt_account_irqtime(cpu, curr, delta, wallclock);
+#endif
}
EXPORT_SYMBOL_GPL(irqtime_account_irq);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index fa178b62ea79..59e38cd81076 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -261,9 +261,60 @@ set_table_entry(struct ctl_table *entry,
}
+/*
+ * Build the sysctl table that exposes one sched_group_energy: the number of
+ * idle states and capacity states, and the arrays backing each of them.
+ * Five slots are allocated and four are filled (the unused last slot is
+ * presumably the table terminator, as in the sibling domain table).
+ * Returns NULL when the table cannot be allocated.
+ */
static struct ctl_table *
+sd_alloc_ctl_energy_table(struct sched_group_energy *sge)
+{
+	struct ctl_table *tbl;
+
+	tbl = sd_alloc_ctl_entry(5);
+	if (!tbl)
+		return NULL;
+
+	/* Idle states: count first, then the array starting at entry 0. */
+	set_table_entry(&tbl[0], "nr_idle_states", &sge->nr_idle_states,
+			sizeof(int), 0644, proc_dointvec_minmax, false);
+	set_table_entry(&tbl[1], "idle_states", &sge->idle_states[0].power,
+			sge->nr_idle_states * sizeof(struct idle_state), 0644,
+			proc_doulongvec_minmax, false);
+
+	/* Capacity states: same layout as the idle states above. */
+	set_table_entry(&tbl[2], "nr_cap_states", &sge->nr_cap_states,
+			sizeof(int), 0644, proc_dointvec_minmax, false);
+	set_table_entry(&tbl[3], "cap_states", &sge->cap_states[0].cap,
+			sge->nr_cap_states * sizeof(struct capacity_state), 0644,
+			proc_doulongvec_minmax, false);
+
+	return tbl;
+}
+
+/*
+ * Build the sysctl directory for one sched_group: a single "energy" entry
+ * whose child is the table describing the group's sched_group_energy.
+ * Returns NULL when the directory table cannot be allocated.
+ */
+static struct ctl_table *
+sd_alloc_ctl_group_table(struct sched_group *sg)
+{
+	struct ctl_table *dir = sd_alloc_ctl_entry(2);
+	struct sched_group_energy *sge;
+
+	if (!dir)
+		return NULL;
+
+	sge = (struct sched_group_energy *)sg->sge;
+
+	dir->procname = kstrdup("energy", GFP_KERNEL);
+	dir->mode = 0555;
+	dir->child = sd_alloc_ctl_energy_table(sge);
+
+	return dir;
+}
+
+static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd)
{
- struct ctl_table *table = sd_alloc_ctl_entry(14);
+ struct ctl_table *table;
+ unsigned int nr_entries = 14;
+
+ int i = 0;
+ struct sched_group *sg = sd->groups;
+
+ if (sg->sge) {
+ int nr_sgs = 0;
+
+ do {} while (nr_sgs++, sg = sg->next, sg != sd->groups);
+
+ nr_entries += nr_sgs;
+ }
+
+ table = sd_alloc_ctl_entry(nr_entries);
if (table == NULL)
return NULL;
@@ -296,7 +347,19 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[12], "name", sd->name,
CORENAME_MAX_SIZE, 0444, proc_dostring, false);
- /* &table[13] is terminator */
+ sg = sd->groups;
+ if (sg->sge) {
+ char buf[32];
+ struct ctl_table *entry = &table[13];
+
+ do {
+ snprintf(buf, 32, "group%d", i);
+ entry->procname = kstrdup(buf, GFP_KERNEL);
+ entry->mode = 0555;
+ entry->child = sd_alloc_ctl_group_table(sg);
+ } while (entry++, i++, sg = sg->next, sg != sd->groups);
+ }
+ /* &table[nr_entries-1] is terminator */
return table;
}
@@ -918,7 +981,33 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts);
P_SCHEDSTAT(se.statistics.nr_wakeups_passive);
P_SCHEDSTAT(se.statistics.nr_wakeups_idle);
-
+ /* eas */
+ /* select_idle_sibling() */
+ P_SCHEDSTAT(se.statistics.nr_wakeups_sis_attempts);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_sis_idle);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_sis_cache_affine);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_sis_suff_cap);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_sis_idle_cpu);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_sis_count);
+ /* select_energy_cpu_brute() */
+ P_SCHEDSTAT(se.statistics.nr_wakeups_secb_attempts);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_secb_sync);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_secb_idle_bt);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_secb_insuff_cap);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_secb_no_nrg_sav);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_secb_nrg_sav);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_secb_count);
+ /* find_best_target() */
+ P_SCHEDSTAT(se.statistics.nr_wakeups_fbt_attempts);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_fbt_no_cpu);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_fbt_no_sd);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_fbt_pref_idle);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_fbt_count);
+ /* cas */
+ /* select_task_rq_fair() */
+ P_SCHEDSTAT(se.statistics.nr_wakeups_cas_attempts);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_cas_count);
+
avg_atom = p->se.sum_exec_runtime;
if (nr_switches)
avg_atom = div64_ul(avg_atom, nr_switches);
diff --git a/kernel/sched/energy.c b/kernel/sched/energy.c
new file mode 100644
index 000000000000..b0656b7a93e3
--- /dev/null
+++ b/kernel/sched/energy.c
@@ -0,0 +1,124 @@
+/*
+ * Obtain energy cost data from DT and populate relevant scheduler data
+ * structures.
+ *
+ * Copyright (C) 2015 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#define pr_fmt(fmt) "sched-energy: " fmt
+
+#define DEBUG
+
+#include <linux/gfp.h>
+#include <linux/of.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/sched_energy.h>
+#include <linux/stddef.h>
+
+struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS];
+
+/*
+ * Release every sched_group_energy recorded in sge_array[][] together with
+ * its capacity-state and idle-state tables.
+ *
+ * Each slot is cleared before its memory is freed so that the global array
+ * never keeps a pointer to freed memory once this function has run.
+ */
+static void free_resources(void)
+{
+	int cpu, sd_level;
+	struct sched_group_energy *sge;
+
+	for_each_possible_cpu(cpu) {
+		for_each_possible_sd_level(sd_level) {
+			sge = sge_array[cpu][sd_level];
+			if (sge) {
+				/* Unpublish first: no dangling entry is left behind. */
+				sge_array[cpu][sd_level] = NULL;
+				kfree(sge->cap_states);
+				kfree(sge->idle_states);
+				kfree(sge);
+			}
+		}
+	}
+}
+
+/*
+ * Read the "sched-energy-costs" phandles of every possible CPU's device-tree
+ * node and fill sge_array[cpu][sd_level] with the tables found there:
+ * "busy-cost-data" is parsed as (capacity, power) pairs and "idle-cost-data"
+ * as one power value per idle state.
+ *
+ * Failure handling:
+ *  - a CPU with no node, or a node without "sched-energy-costs", ends the
+ *    scan with a warning; entries installed so far are left in place;
+ *  - missing cost data or a failed allocation releases the partially built
+ *    entry and then everything installed so far via free_resources().
+ *
+ * Device-node references taken by of_get_cpu_node() and of_parse_phandle()
+ * are dropped on every path once the data has been copied out.
+ */
+void init_sched_energy_costs(void)
+{
+	struct device_node *cn, *cp;
+	struct capacity_state *cap_states;
+	struct idle_state *idle_states;
+	struct sched_group_energy *sge;
+	const struct property *prop;
+	int sd_level, i, nstates, cpu;
+	const __be32 *val;
+
+	for_each_possible_cpu(cpu) {
+		cn = of_get_cpu_node(cpu, NULL);
+		if (!cn) {
+			pr_warn("CPU device node missing for CPU %d\n", cpu);
+			return;
+		}
+
+		if (!of_find_property(cn, "sched-energy-costs", NULL)) {
+			pr_warn("CPU device node has no sched-energy-costs\n");
+			of_node_put(cn);
+			return;
+		}
+
+		for_each_possible_sd_level(sd_level) {
+			cp = of_parse_phandle(cn, "sched-energy-costs", sd_level);
+			if (!cp)
+				break;
+
+			prop = of_find_property(cp, "busy-cost-data", NULL);
+			if (!prop || !prop->value) {
+				pr_warn("No busy-cost data, skipping sched_energy init\n");
+				goto out_put;
+			}
+
+			sge = kcalloc(1, sizeof(struct sched_group_energy),
+				      GFP_NOWAIT);
+			if (!sge)
+				goto out_put;
+
+			/* Two u32 cells (capacity, power) per capacity state. */
+			nstates = (prop->length / sizeof(u32)) / 2;
+			cap_states = kcalloc(nstates,
+					     sizeof(struct capacity_state),
+					     GFP_NOWAIT);
+			if (!cap_states)
+				goto out_free;
+
+			for (i = 0, val = prop->value; i < nstates; i++) {
+				cap_states[i].cap = be32_to_cpup(val++);
+				cap_states[i].power = be32_to_cpup(val++);
+			}
+
+			sge->nr_cap_states = nstates;
+			sge->cap_states = cap_states;
+
+			prop = of_find_property(cp, "idle-cost-data", NULL);
+			if (!prop || !prop->value) {
+				pr_warn("No idle-cost data, skipping sched_energy init\n");
+				goto out_free;
+			}
+
+			/* One u32 cell (power) per idle state. */
+			nstates = (prop->length / sizeof(u32));
+			idle_states = kcalloc(nstates,
+					      sizeof(struct idle_state),
+					      GFP_NOWAIT);
+			if (!idle_states)
+				goto out_free;
+
+			for (i = 0, val = prop->value; i < nstates; i++)
+				idle_states[i].power = be32_to_cpup(val++);
+
+			sge->nr_idle_states = nstates;
+			sge->idle_states = idle_states;
+
+			sge_array[cpu][sd_level] = sge;
+			of_node_put(cp);
+		}
+
+		of_node_put(cn);
+	}
+
+	pr_info("Sched-energy-costs installed from DT\n");
+	return;
+
+out_free:
+	/*
+	 * The entry under construction is not in sge_array[][] yet, so
+	 * free_resources() cannot see it; release it here.
+	 */
+	kfree(cap_states);
+	kfree(sge);
+out_put:
+	of_node_put(cp);
+	of_node_put(cn);
+	free_resources();
+}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0c91d72f3e8f..078a5f5c8377 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -30,10 +30,13 @@
#include <linux/mempolicy.h>
#include <linux/migrate.h>
#include <linux/task_work.h>
+#include <linux/module.h>
#include <trace/events/sched.h>
#include "sched.h"
+#include "tune.h"
+#include "walt.h"
/*
* Targeted preemption latency for CPU-bound tasks:
@@ -50,6 +53,16 @@
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int normalized_sysctl_sched_latency = 6000000ULL;
+unsigned int sysctl_sched_sync_hint_enable = 1;
+unsigned int sysctl_sched_initial_task_util = 0;
+unsigned int sysctl_sched_cstate_aware = 1;
+
+#ifdef CONFIG_SCHED_WALT
+unsigned int sysctl_sched_use_walt_cpu_util = 1;
+unsigned int sysctl_sched_use_walt_task_util = 1;
+__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload =
+ (10 * NSEC_PER_MSEC);
+#endif
/*
* The initial- and re-scaling of tunables is configurable
* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
@@ -116,7 +129,7 @@ unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
/*
* The margin used when comparing utilization with CPU capacity:
- * util * 1024 < capacity * margin
+ * util * margin < capacity * 1024
*/
unsigned int capacity_margin = 1280; /* ~20% */
@@ -290,19 +303,59 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
if (!cfs_rq->on_list) {
+ struct rq *rq = rq_of(cfs_rq);
+ int cpu = cpu_of(rq);
/*
* Ensure we either appear before our parent (if already
* enqueued) or force our parent to appear after us when it is
- * enqueued. The fact that we always enqueue bottom-up
- * reduces this to two cases.
+ * enqueued. The fact that we always enqueue bottom-up
+ * reduces this to two cases and a special case for the root
+ * cfs_rq. Furthermore, it also means that we will always reset
+ * tmp_alone_branch either when the branch is connected
+ * to a tree or when we reach the beginning of the tree
*/
if (cfs_rq->tg->parent &&
- cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
- list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
- &rq_of(cfs_rq)->leaf_cfs_rq_list);
- } else {
+ cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
+ /*
+ * If parent is already on the list, we add the child
+ * just before. Thanks to circular linked property of
+ * the list, this means to put the child at the tail
+ * of the list that starts by parent.
+ */
list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
- &rq_of(cfs_rq)->leaf_cfs_rq_list);
+ &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
+ /*
+ * The branch is now connected to its tree so we can
+ * reset tmp_alone_branch to the beginning of the
+ * list.
+ */
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+ } else if (!cfs_rq->tg->parent) {
+ /*
+ * cfs rq without parent should be put
+ * at the tail of the list.
+ */
+ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+ &rq->leaf_cfs_rq_list);
+ /*
+ * We have reached the beginning of a tree so we can reset
+ * tmp_alone_branch to the beginning of the list.
+ */
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+ } else {
+ /*
+ * The parent has not already been added so we want to
+ * make sure that it will be put after us.
+ * tmp_alone_branch points to the beginning of the branch
+ * where we will add parent.
+ */
+ list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
+ rq->tmp_alone_branch);
+ /*
+ * update tmp_alone_branch to point to the new beginning
+ * of the branch
+ */
+ rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
}
cfs_rq->on_list = 1;
@@ -699,18 +752,19 @@ void init_entity_runnable_average(struct sched_entity *se)
if (entity_is_task(se))
sa->load_avg = scale_load_down(se->load.weight);
sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
+
/*
* At this point, util_avg won't be used in select_task_rq_fair anyway
*/
- sa->util_avg = 0;
+ sa->util_avg = sched_freq() ?
+ sysctl_sched_initial_task_util :
+ 0;
sa->util_sum = 0;
/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
-static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
-static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force);
-static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
+static void attach_entity_cfs_rq(struct sched_entity *se);
/*
* With new tasks being created, their initial util_avgs are extrapolated
@@ -742,7 +796,6 @@ void post_init_entity_util_avg(struct sched_entity *se)
struct cfs_rq *cfs_rq = cfs_rq_of(se);
struct sched_avg *sa = &se->avg;
long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
- u64 now = cfs_rq_clock_task(cfs_rq);
if (cap > 0) {
if (cfs_rq->avg.util_avg != 0) {
@@ -770,14 +823,12 @@ void post_init_entity_util_avg(struct sched_entity *se)
* such that the next switched_to_fair() has the
* expected state.
*/
- se->avg.last_update_time = now;
+ se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
return;
}
}
- update_cfs_rq_load_avg(now, cfs_rq, false);
- attach_entity_load_avg(cfs_rq, se);
- update_tg_load_avg(cfs_rq, false);
+ attach_entity_cfs_rq(se);
}
#else /* !CONFIG_SMP */
@@ -937,6 +988,7 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
trace_sched_stat_blocked(tsk, delta);
+ trace_sched_blocked_reason(tsk);
/*
* Blocking time is in units of nanosecs, so shift by
@@ -2646,16 +2698,20 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
-static void update_cfs_shares(struct cfs_rq *cfs_rq)
+static void update_cfs_shares(struct sched_entity *se)
{
+ struct cfs_rq *cfs_rq = group_cfs_rq(se);
struct task_group *tg;
- struct sched_entity *se;
long shares;
- tg = cfs_rq->tg;
- se = tg->se[cpu_of(rq_of(cfs_rq))];
- if (!se || throttled_hierarchy(cfs_rq))
+ if (!cfs_rq)
return;
+
+ if (throttled_hierarchy(cfs_rq))
+ return;
+
+ tg = cfs_rq->tg;
+
#ifndef CONFIG_SMP
if (likely(se->load.weight == tg->shares))
return;
@@ -2664,8 +2720,9 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
reweight_entity(cfs_rq_of(se), se, shares);
}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
+static inline void update_cfs_shares(struct sched_entity *se)
{
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -2816,6 +2873,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
scale_freq = arch_scale_freq_capacity(NULL, cpu);
scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+ trace_sched_contrib_scale_f(cpu, scale_freq, scale_cpu);
/* delta_w is the amount already accumulated against our next period */
delta_w = sa->period_contrib;
@@ -2891,6 +2949,26 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
return decayed;
}
+/*
+ * Signed add and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ *
+ * @_ptr and @_val are each evaluated once into locals. *@_ptr is read with a
+ * single READ_ONCE() and written back with a single WRITE_ONCE(). When @_val
+ * is negative and the sum compares greater than the value that was read, the
+ * addition is treated as an underflow and 0 is stored instead.
+ * NOTE(review): that test presumes *@_ptr has an unsigned type - confirm
+ * against the callers.
+ */
+#define add_positive(_ptr, _val) do { \
+ typeof(_ptr) ptr = (_ptr); \
+ typeof(_val) val = (_val); \
+ typeof(*ptr) res, var = READ_ONCE(*ptr); \
+ \
+ res = var + val; \
+ \
+ if (val < 0 && res > var) \
+ res = 0; \
+ \
+ WRITE_ONCE(*ptr, res); \
+} while (0)
+
#ifdef CONFIG_FAIR_GROUP_SCHED
/**
* update_tg_load_avg - update the tg's load avg
@@ -2970,8 +3048,138 @@ void set_task_rq_fair(struct sched_entity *se,
se->avg.last_update_time = n_last_update_time;
}
}
+
+/*
+ * Mirror the utilization of a group entity's own cfs_rq into the entity and
+ * apply the resulting difference to @cfs_rq, the queue the entity sits on.
+ * Both util_sum fields are rebuilt as util_avg * LOAD_AVG_MAX.
+ */
+static inline void
+update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	struct cfs_rq *child_rq = group_cfs_rq(se);
+	long diff = child_rq->avg.util_avg - se->avg.util_avg;
+
+	if (diff) {
+		/* The entity takes over the child queue's utilization. */
+		se->avg.util_avg = child_rq->avg.util_avg;
+		se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
+
+		/* The parent queue absorbs the difference, clamped at zero. */
+		add_positive(&cfs_rq->avg.util_avg, diff);
+		cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
+	}
+}
+
+/*
+ * Take into account change of load of a child task group.
+ *
+ * Recomputes the load of group entity @se from its own group cfs_rq, scaling
+ * it into the task group's shares when the group's total load exceeds them,
+ * and applies the resulting delta to the parent @cfs_rq: always to
+ * avg.load_avg/avg.load_sum, and also to runnable_load_avg/runnable_load_sum
+ * while @se is enqueued. Returns early when the delta is zero.
+ */
+static inline void
+update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+ long delta, load = gcfs_rq->avg.load_avg;
+
+ /*
+ * If the load of the group cfs_rq is zero, the load of the
+ * sched_entity will also be zero, so we can skip the formula.
+ */
+ if (load) {
+ long tg_load;
+
+ /* Get tg's load and ensure tg_load > 0 */
+ tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
+
+ /* Ensure tg_load >= load and that it reflects the current load */
+ tg_load -= gcfs_rq->tg_load_avg_contrib;
+ tg_load += load;
+
+ /*
+ * We need to compute a correction term in the case that the
+ * task group is consuming more CPU than a task of equal
+ * weight. A task with a weight equals to tg->shares will have
+ * a load less or equal to scale_load_down(tg->shares).
+ * Similarly, the sched_entities that represent the task group
+ * at parent level, can't have a load higher than
+ * scale_load_down(tg->shares). And the Sum of sched_entities'
+ * load must be <= scale_load_down(tg->shares).
+ */
+ if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
+ /* scale gcfs_rq's load into tg's shares */
+ load *= scale_load_down(gcfs_rq->tg->shares);
+ load /= tg_load;
+ }
+ }
+
+ delta = load - se->avg.load_avg;
+
+ /* Nothing to update */
+ if (!delta)
+ return;
+
+ /* Set new sched_entity's load */
+ se->avg.load_avg = load;
+ se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
+
+ /* Update parent cfs_rq load */
+ add_positive(&cfs_rq->avg.load_avg, delta);
+ cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
+
+ /*
+ * If the sched_entity is already enqueued, we also have to update the
+ * runnable load avg.
+ */
+ if (se->on_rq) {
+ /* Update parent cfs_rq runnable_load_avg */
+ add_positive(&cfs_rq->runnable_load_avg, delta);
+ cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
+ }
+}
+
+static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
+{
+ cfs_rq->propagate_avg = 1; /* request read and cleared by test_and_clear_tg_cfs_propagate() */
+}
+
+/*
+ * Report whether the group cfs_rq owned by @se has a pending propagation
+ * request, clearing the request when there is one.
+ * Returns 1 if a request was pending, 0 otherwise.
+ */
+static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
+{
+	struct cfs_rq *grp_rq = group_cfs_rq(se);
+	int pending = 0;
+
+	if (grp_rq->propagate_avg) {
+		grp_rq->propagate_avg = 0;
+		pending = 1;
+	}
+
+	return pending;
+}
+
+/*
+ * Propagate a pending load/utilization change of a group entity one level up.
+ *
+ * Returns 0 for task entities and for group entities whose group cfs_rq has
+ * no pending request. Otherwise the request is consumed, the cfs_rq that @se
+ * is queued on is flagged for propagation in turn, @se and that cfs_rq are
+ * refreshed through update_tg_cfs_util() and update_tg_cfs_load(), and 1 is
+ * returned.
+ */
+static inline int propagate_entity_load_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq;
+
+ if (entity_is_task(se))
+ return 0;
+
+ if (!test_and_clear_tg_cfs_propagate(se))
+ return 0;
+
+ cfs_rq = cfs_rq_of(se);
+
+ set_tg_cfs_propagate(cfs_rq);
+
+ update_tg_cfs_util(cfs_rq, se);
+ update_tg_cfs_load(cfs_rq, se);
+
+ return 1;
+}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
+
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
+
+static inline int propagate_entity_load_avg(struct sched_entity *se)
+{
+ return 0;
+}
+
+static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
+
#endif /* CONFIG_FAIR_GROUP_SCHED */
static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
@@ -3042,6 +3250,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
sub_positive(&sa->load_avg, r);
sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
removed_load = 1;
+ set_tg_cfs_propagate(cfs_rq);
}
if (atomic_long_read(&cfs_rq->removed_util_avg)) {
@@ -3049,6 +3258,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
sub_positive(&sa->util_avg, r);
sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
removed_util = 1;
+ set_tg_cfs_propagate(cfs_rq);
}
decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
@@ -3062,27 +3272,51 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
if (update_freq && (decayed || removed_util))
cfs_rq_util_change(cfs_rq);
+ /* Trace CPU load, unless cfs_rq belongs to a non-root task_group */
+ if (cfs_rq == &rq_of(cfs_rq)->cfs)
+ trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
+
return decayed || removed_load;
}
+/*
+ * Optional action to be done while updating the load average
+ */
+#define UPDATE_TG 0x1
+#define SKIP_AGE_LOAD 0x2
+
/* Update task and its cfs_rq load average */
-static inline void update_load_avg(struct sched_entity *se, int update_tg)
+static inline void update_load_avg(struct sched_entity *se, int flags)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 now = cfs_rq_clock_task(cfs_rq);
struct rq *rq = rq_of(cfs_rq);
int cpu = cpu_of(rq);
+ int decayed;
+ void *ptr = NULL;
/*
* Track task load average for carrying it to new CPU after migrated, and
* track group sched_entity load average for task_h_load calc in migration
*/
- __update_load_avg(now, cpu, &se->avg,
+ if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
+ __update_load_avg(now, cpu, &se->avg,
se->on_rq * scale_load_down(se->load.weight),
cfs_rq->curr == se, NULL);
+ }
- if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg)
+ decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
+ decayed |= propagate_entity_load_avg(se);
+
+ if (decayed && (flags & UPDATE_TG))
update_tg_load_avg(cfs_rq, 0);
+
+ if (entity_is_task(se)) {
+#ifdef CONFIG_SCHED_WALT
+ ptr = (void *)&(task_of(se)->ravg);
+#endif
+ trace_sched_load_avg_task(task_of(se), &se->avg, ptr);
+ }
}
/**
@@ -3095,31 +3329,12 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
*/
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (!sched_feat(ATTACH_AGE_LOAD))
- goto skip_aging;
-
- /*
- * If we got migrated (either between CPUs or between cgroups) we'll
- * have aged the average right before clearing @last_update_time.
- *
- * Or we're fresh through post_init_entity_util_avg().
- */
- if (se->avg.last_update_time) {
- __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
- &se->avg, 0, 0, NULL);
-
- /*
- * XXX: we could have just aged the entire load away if we've been
- * absent from the fair class for too long.
- */
- }
-
-skip_aging:
se->avg.last_update_time = cfs_rq->avg.last_update_time;
cfs_rq->avg.load_avg += se->avg.load_avg;
cfs_rq->avg.load_sum += se->avg.load_sum;
cfs_rq->avg.util_avg += se->avg.util_avg;
cfs_rq->avg.util_sum += se->avg.util_sum;
+ set_tg_cfs_propagate(cfs_rq);
cfs_rq_util_change(cfs_rq);
}
@@ -3134,14 +3349,12 @@ skip_aging:
*/
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
- &se->avg, se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL);
sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+ set_tg_cfs_propagate(cfs_rq);
cfs_rq_util_change(cfs_rq);
}
@@ -3151,34 +3364,20 @@ static inline void
enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct sched_avg *sa = &se->avg;
- u64 now = cfs_rq_clock_task(cfs_rq);
- int migrated, decayed;
-
- migrated = !sa->last_update_time;
- if (!migrated) {
- __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
- se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL);
- }
-
- decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated);
cfs_rq->runnable_load_avg += sa->load_avg;
cfs_rq->runnable_load_sum += sa->load_sum;
- if (migrated)
+ if (!sa->last_update_time) {
attach_entity_load_avg(cfs_rq, se);
-
- if (decayed || migrated)
update_tg_load_avg(cfs_rq, 0);
+ }
}
/* Remove the runnable load generated by se from cfs_rq's runnable load average */
static inline void
dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- update_load_avg(se, 1);
-
cfs_rq->runnable_load_avg =
max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
cfs_rq->runnable_load_sum =
@@ -3207,13 +3406,25 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
#endif
/*
+ * Synchronize entity load avg of dequeued entity without locking
+ * the previous rq.
+ */
+void sync_entity_load_avg(struct sched_entity *se)
+{
+	struct cfs_rq *owner_rq = cfs_rq_of(se);
+	u64 stamp = cfs_rq_last_update_time(owner_rq);
+	int cpu = cpu_of(rq_of(owner_rq));
+
+	/* Age the entity up to its cfs_rq's last update, with zero weight and not running. */
+	__update_load_avg(stamp, cpu, &se->avg, 0, 0, NULL);
+}
+
+/*
* Task first catches up with cfs_rq, and then subtract
* itself from the cfs_rq (task must be off the queue now).
*/
void remove_entity_load_avg(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 last_update_time;
/*
* tasks cannot exit without having gone through wake_up_new_task() ->
@@ -3225,9 +3436,7 @@ void remove_entity_load_avg(struct sched_entity *se)
* calls this.
*/
- last_update_time = cfs_rq_last_update_time(cfs_rq);
-
- __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+ sync_entity_load_avg(se);
atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
}
@@ -3252,7 +3461,10 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
return 0;
}
-static inline void update_load_avg(struct sched_entity *se, int not_used)
+#define UPDATE_TG 0x0
+#define SKIP_AGE_LOAD 0x0
+
+static inline void update_load_avg(struct sched_entity *se, int not_used1)
{
cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);
}
@@ -3397,9 +3609,18 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (renorm && !curr)
se->vruntime += cfs_rq->min_vruntime;
+ /*
+ * When enqueuing a sched_entity, we must:
+ * - Update loads to have both entity and cfs_rq synced with now.
+ * - Add its load to cfs_rq->runnable_avg
+ * - For group_entity, update its weight to reflect the new share of
+ * its group cfs_rq
+ * - Add its new weight to cfs_rq->load.weight
+ */
+ update_load_avg(se, UPDATE_TG);
enqueue_entity_load_avg(cfs_rq, se);
+ update_cfs_shares(se);
account_entity_enqueue(cfs_rq, se);
- update_cfs_shares(cfs_rq);
if (flags & ENQUEUE_WAKEUP)
place_entity(cfs_rq, se, 0);
@@ -3471,6 +3692,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
+
+ /*
+ * When dequeuing a sched_entity, we must:
+ * - Update loads to have both entity and cfs_rq synced with now.
+ * - Subtract its load from the cfs_rq->runnable_avg.
+ * - Subtract its previous weight from cfs_rq->load.weight.
+ * - For group entity, update its weight to reflect the new share
+ * of its group cfs_rq.
+ */
+ update_load_avg(se, UPDATE_TG);
dequeue_entity_load_avg(cfs_rq, se);
update_stats_dequeue(cfs_rq, se, flags);
@@ -3494,7 +3725,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);
- update_cfs_shares(cfs_rq);
+ update_cfs_shares(se);
/*
* Now advance min_vruntime if @se was the entity holding it back,
@@ -3558,7 +3789,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
- update_load_avg(se, 1);
+ update_load_avg(se, UPDATE_TG);
}
update_stats_curr_start(cfs_rq, se);
@@ -3676,8 +3907,8 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
/*
* Ensure that runnable average is periodically updated.
*/
- update_load_avg(curr, 1);
- update_cfs_shares(cfs_rq);
+ update_load_avg(curr, UPDATE_TG);
+ update_cfs_shares(curr);
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -4544,6 +4775,28 @@ static inline void hrtick_update(struct rq *rq)
}
#endif
+#ifdef CONFIG_SMP
+static bool cpu_overutilized(int cpu);
+unsigned long boosted_cpu_util(int cpu);
+#else
+#define boosted_cpu_util(cpu) cpu_util(cpu)
+#endif
+
+#ifdef CONFIG_SMP
+/*
+ * Request a CFS capacity for @cpu derived from its boosted utilization.
+ * Does nothing unless sched_freq() reports true.
+ */
+static void update_capacity_of(int cpu)
+{
+	unsigned long util, request;
+
+	if (sched_freq()) {
+		util = boosted_cpu_util(cpu);
+
+		/* Convert scale-invariant capacity to cpu. */
+		request = util * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu);
+		set_cfs_cpu_capacity(cpu, true, request);
+	}
+}
+#endif
+
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
@@ -4554,6 +4807,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
+#ifdef CONFIG_SMP
+ int task_new = flags & ENQUEUE_WAKEUP_NEW;
+ int task_wakeup = flags & ENQUEUE_WAKEUP;
+#endif
/*
* If in_iowait is set, the code below may not trigger any cpufreq
@@ -4578,6 +4835,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running++;
+ walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
flags = ENQUEUE_WAKEUP;
}
@@ -4585,17 +4843,59 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running++;
+ walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
if (cfs_rq_throttled(cfs_rq))
break;
- update_load_avg(se, 1);
- update_cfs_shares(cfs_rq);
+ update_load_avg(se, UPDATE_TG);
+ update_cfs_shares(se);
}
if (!se)
add_nr_running(rq, 1);
+#ifdef CONFIG_SMP
+
+ /*
+ * Update SchedTune accounting.
+ *
+ * We do it before updating the CPU capacity to ensure the
+ * boost value of the current task is accounted for in the
+ * selection of the OPP.
+ *
+ * We do it also in the case where we enqueue a throttled task;
+ * we could argue that a throttled task should not boost a CPU,
+ * however:
+ * a) properly implementing CPU boosting considering throttled
+ * tasks will increase a lot the complexity of the solution
+ * b) it's not easy to quantify the benefits introduced by
+ * such a more complex solution.
+ * Thus, for the time being we go for the simple solution and boost
+ * also for throttled RQs.
+ */
+ schedtune_enqueue_task(p, cpu_of(rq));
+
+ if (!se) {
+ walt_inc_cumulative_runnable_avg(rq, p);
+ if (!task_new && !rq->rd->overutilized &&
+ cpu_overutilized(rq->cpu)) {
+ rq->rd->overutilized = true;
+ trace_sched_overutilized(true);
+ }
+
+ /*
+ * We want to potentially trigger a freq switch
+ * request only for tasks that are waking up; this is
+ * because we get here also during load balancing, but
+ * in these cases it seems wise to trigger a single
+ * request after load balancing is done.
+ */
+ if (task_new || task_wakeup)
+ update_capacity_of(cpu_of(rq));
+ }
+
+#endif /* CONFIG_SMP */
hrtick_update(rq);
}
@@ -4625,6 +4925,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running--;
+ walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
@@ -4644,17 +4945,50 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running--;
+ walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
if (cfs_rq_throttled(cfs_rq))
break;
- update_load_avg(se, 1);
- update_cfs_shares(cfs_rq);
+ update_load_avg(se, UPDATE_TG);
+ update_cfs_shares(se);
}
if (!se)
sub_nr_running(rq, 1);
+#ifdef CONFIG_SMP
+
+ /*
+ * Update SchedTune accounting
+ *
+ * We do it before updating the CPU capacity to ensure the
+ * boost value of the current task is accounted for in the
+ * selection of the OPP.
+ */
+ schedtune_dequeue_task(p, cpu_of(rq));
+
+ if (!se) {
+ walt_dec_cumulative_runnable_avg(rq, p);
+
+ /*
+ * We want to potentially trigger a freq switch
+ * request only for tasks that are going to sleep;
+ * this is because we get here also during load
+ * balancing, but in these cases it seems wise to
+ * trigger a single request after load balancing is
+ * done.
+ */
+ if (task_sleep) {
+ if (rq->cfs.nr_running)
+ update_capacity_of(cpu_of(rq));
+ else if (sched_freq())
+ set_cfs_cpu_capacity(cpu_of(rq), false, 0);
+ }
+ }
+
+#endif /* CONFIG_SMP */
+
hrtick_update(rq);
}
@@ -4961,15 +5295,6 @@ static unsigned long target_load(int cpu, int type)
return max(rq->cpu_load[type-1], total);
}
-static unsigned long capacity_of(int cpu)
-{
- return cpu_rq(cpu)->cpu_capacity;
-}
-
-static unsigned long capacity_orig_of(int cpu)
-{
- return cpu_rq(cpu)->cpu_capacity_orig;
-}
static unsigned long cpu_avg_load_per_task(int cpu)
{
@@ -5121,6 +5446,500 @@ static void record_wakee(struct task_struct *p)
}
/*
+ * Returns the current capacity of cpu after applying both
+ * cpu and freq scaling.
+ */
+unsigned long capacity_curr_of(int cpu)
+{
+	/* Original capacity of this cpu, before frequency scaling. */
+	unsigned long max_cap = cpu_rq(cpu)->cpu_capacity_orig;
+	/* Frequency scale factor in SCHED_CAPACITY_SHIFT fixed point. */
+	unsigned long freq_scale = arch_scale_freq_capacity(NULL, cpu);
+
+	return (max_cap * freq_scale) >> SCHED_CAPACITY_SHIFT;
+}
+
+/* Report whether the ENERGY_AWARE scheduler feature is currently set. */
+static inline bool energy_aware(void)
+{
+ return sched_feat(ENERGY_AWARE);
+}
+
+/*
+ * struct energy_env: input and output state of one energy estimation, i.e.
+ * the effect of moving util_delta worth of utilization from src_cpu to
+ * dst_cpu.
+ */
+struct energy_env {
+ /* Group whose energy is estimated; set by __energy_diff(). */
+ struct sched_group *sg_top;
+ /* Group scanned to pick the capacity state; set by sched_group_energy(). */
+ struct sched_group *sg_cap;
+ /* Index into sge->cap_states[]; set by find_new_capacity(). */
+ int cap_idx;
+ /* Utilization removed from src_cpu and added to dst_cpu. */
+ int util_delta;
+ int src_cpu;
+ int dst_cpu;
+ /* Result of the last sched_group_energy() run. */
+ int energy;
+ /* 0 from __energy_diff(); schedtune_accept_deltas() result in energy_diff(). */
+ int payoff;
+ /* Task the estimation is done for (SchedTune lookups and tracing). */
+ struct task_struct *task;
+ struct {
+ /* Estimated energy without util_delta applied. */
+ int before;
+ /* Estimated energy with util_delta applied. */
+ int after;
+ /* Normalized energy difference; set by energy_diff() with SchedTune. */
+ int delta;
+ /* after - before, forced to 0 inside the dead-zone margin. */
+ int diff;
+ } nrg;
+ struct {
+ /* Selected capacity of src_cpu's group, computed with no util moved. */
+ int before;
+ /* Selected capacity of dst_cpu's group, computed with util moved. */
+ int after;
+ /* Accumulated capacity change: "after" added, "before" subtracted. */
+ int delta;
+ } cap;
+};
+
+/*
+ * __cpu_norm_util(): express the utilization of @cpu, adjusted by @delta, as
+ * a busy ratio relative to @capacity, which is what the energy calculations
+ * need.
+ *
+ * The scale-invariant utilization reported by __cpu_util() approximates
+ *
+ *   util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time
+ *
+ * and a specific capacity is
+ *
+ *   capacity = capacity_orig * curr_freq/max_freq
+ *
+ * hence
+ *
+ *   norm_util = running_time/time ~ util/capacity
+ *
+ * The result lies in [0..SCHED_CAPACITY_SCALE]; utilization at or above
+ * @capacity saturates at SCHED_CAPACITY_SCALE.
+ */
+static unsigned long __cpu_norm_util(int cpu, unsigned long capacity, int delta)
+{
+	int util = __cpu_util(cpu, delta);
+	unsigned long ratio;
+
+	if (util < capacity)
+		ratio = (util << SCHED_CAPACITY_SHIFT)/capacity;
+	else
+		ratio = SCHED_CAPACITY_SCALE;
+
+	return ratio;
+}
+
+/*
+ * calc_util_delta(): utilization change that @eenv applies to @cpu.
+ * util_delta is taken away from the source cpu, added to the destination
+ * cpu, and every other cpu is left untouched.
+ */
+static int calc_util_delta(struct energy_env *eenv, int cpu)
+{
+	int change = 0;
+
+	if (cpu == eenv->src_cpu)
+		change = -eenv->util_delta;
+	else if (cpu == eenv->dst_cpu)
+		change = eenv->util_delta;
+
+	return change;
+}
+
+/*
+ * group_max_util(): highest per-cpu utilization among the cpus of
+ * eenv->sg_cap, each adjusted by the change @eenv describes for it.
+ */
+static
+unsigned long group_max_util(struct energy_env *eenv)
+{
+	unsigned long highest = 0;
+	int cpu;
+
+	for_each_cpu(cpu, sched_group_cpus(eenv->sg_cap))
+		highest = max(highest,
+			      __cpu_util(cpu, calc_util_delta(eenv, cpu)));
+
+	return highest;
+}
+
+/*
+ * group_norm_util(): approximate busy ratio of @sg relative to the capacity
+ * state selected in eenv->cap_idx, in the range [0..SCHED_CAPACITY_SCALE],
+ * for use in energy calculations.
+ *
+ * Task executions on the cpus of the group may or may not overlap in time,
+ * so the true normalized utilization lies between max(cpu_norm_util(i)) and
+ * sum(cpu_norm_util(i)) over all cpus i of the group. The sum is used since
+ * it yields the more pessimistic (more busy) energy estimate.
+ */
+static unsigned
+long group_norm_util(struct energy_env *eenv, struct sched_group *sg)
+{
+	unsigned long cap = sg->sge->cap_states[eenv->cap_idx].cap;
+	unsigned long total = 0;
+	int cpu;
+
+	for_each_cpu(cpu, sched_group_cpus(sg))
+		total += __cpu_norm_util(cpu, cap, calc_util_delta(eenv, cpu));
+
+	/* A group cannot be more than fully busy. */
+	return (total > SCHED_CAPACITY_SCALE) ? SCHED_CAPACITY_SCALE : total;
+}
+
+/*
+ * find_new_capacity(): select the first capacity state of @sge whose
+ * capacity covers the highest utilization found in eenv->sg_cap (see
+ * group_max_util()).
+ *
+ * The selected index is stored in eenv->cap_idx and also returned. If the
+ * utilization exceeds every entry of the table, the last state is used
+ * instead of running one past the end: callers index sge->cap_states[]
+ * with the result without any further bounds check. The table is expected
+ * to hold at least one state.
+ */
+static int find_new_capacity(struct energy_env *eenv,
+	const struct sched_group_energy * const sge)
+{
+	int idx, max_idx = sge->nr_cap_states - 1;
+	unsigned long util = group_max_util(eenv);
+
+	/* Default to the highest state in case nothing below it fits. */
+	eenv->cap_idx = max_idx;
+
+	for (idx = 0; idx < max_idx; idx++) {
+		if (sge->cap_states[idx].cap >= util) {
+			eenv->cap_idx = idx;
+			break;
+		}
+	}
+
+	return eenv->cap_idx;
+}
+
+/*
+ * group_idle_state(): estimate the idle state index to use for @sg when its
+ * idle energy is computed.
+ *
+ * If src_cpu and dst_cpu are both inside or both outside @sg, the result is
+ * the shallowest idle state index currently reported by the cpus of the
+ * group, plus one. Otherwise the group utilization after applying
+ * util_delta is mapped to an index: 0 when the group would be fully
+ * occupied, nr_idle_states - 1 when it would have no utilization left, and
+ * a linear mapping in between.
+ *
+ * NOTE(review): src_cpu/dst_cpu are passed to cpumask_test_cpu() without the
+ * -1 check that cpu_in_sg() performs -- confirm callers never pass -1 here.
+ */
+static int group_idle_state(struct energy_env *eenv, struct sched_group *sg)
+{
+ int i, state = INT_MAX;
+ int src_in_grp, dst_in_grp;
+ long grp_util = 0;
+
+ /* Find the shallowest idle state in the sched group. */
+ for_each_cpu(i, sched_group_cpus(sg))
+ state = min(state, idle_get_state_idx(cpu_rq(i)));
+
+ /* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */
+ state++;
+
+ /*
+ * Try to estimate if a deeper idle state is
+ * achievable when we move the task.
+ */
+ for_each_cpu(i, sched_group_cpus(sg))
+ grp_util += cpu_util(i);
+
+ src_in_grp = cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg));
+ dst_in_grp = cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg));
+ if (src_in_grp == dst_in_grp) {
+ /* both CPUs under consideration are in the same group or not in
+ * either group, migration should leave idle state the same.
+ */
+ goto end;
+ }
+ /* add or remove util as appropriate to indicate what group util
+ * will be (worst case - no concurrent execution) after moving the task
+ */
+ grp_util += src_in_grp ? -eenv->util_delta : eenv->util_delta;
+
+ if (grp_util <=
+ ((long)sg->sgc->max_capacity * (int)sg->group_weight)) {
+ /* after moving, this group is at most partly
+ * occupied, so it should have some idle time.
+ */
+ int max_idle_state_idx = sg->sge->nr_idle_states - 2;
+ int new_state = grp_util * max_idle_state_idx;
+ if (grp_util <= 0)
+ /* group will have no util, use lowest state */
+ new_state = max_idle_state_idx + 1;
+ else {
+ /* for partially idle, linearly map util to idle
+ * states, excluding the lowest one. This does not
+ * correspond to the state we expect to enter in
+ * reality, but an indication of what might happen.
+ */
+ new_state = min(max_idle_state_idx, (int)
+ (new_state / sg->sgc->max_capacity));
+ new_state = max_idle_state_idx - new_state;
+ }
+ state = new_state;
+ } else {
+ /* After moving, the group will be fully occupied
+ * so assume it will not be idle at all.
+ */
+ state = 0;
+ }
+end:
+ return state;
+}
+
+/*
+ * sched_group_energy(): Computes the absolute energy consumption of cpus
+ * belonging to the sched_group including shared resources shared only by
+ * members of the group. Iterates over all cpus in the hierarchy below the
+ * sched_group starting from the bottom working its way up before going to
+ * the next cpu until all cpus are covered at all levels. The current
+ * implementation is likely to gather the same util statistics multiple times.
+ * This can probably be done in a faster but more complex way.
+ * Note: sched_group_energy() may fail when racing with sched_domain updates.
+ *
+ * Returns 0 with the estimate stored in eenv->energy, or -EINVAL when the
+ * set of cpus changed underneath us (hotplug) and no estimate is available.
+ * As a side effect eenv->sg_cap, eenv->cap_idx and the eenv->cap fields are
+ * updated while walking the groups.
+ */
+static int sched_group_energy(struct energy_env *eenv)
+{
+ struct sched_domain *sd;
+ int cpu, total_energy = 0;
+ struct cpumask visit_cpus;
+ struct sched_group *sg;
+ int cpu_count;
+
+ /* Only warns: the walk below still dereferences sge of every group. */
+ WARN_ON(!eenv->sg_top->sge);
+
+ cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top));
+ /* If a cpu is hotplugged in while we are in this function,
+ * it does not appear in the existing visit_cpus mask
+ * which came from the sched_group pointer of the
+ * sched_domain pointed at by sd_ea for either the prev
+ * or next cpu and was dereferenced in __energy_diff.
+ * Since we will dereference sd_scs later as we iterate
+ * through the CPUs we expect to visit, new CPUs can
+ * be present which are not in the visit_cpus mask.
+ * Guard this with cpu_count.
+ */
+ cpu_count = cpumask_weight(&visit_cpus);
+
+ while (!cpumask_empty(&visit_cpus)) {
+ struct sched_group *sg_shared_cap = NULL;
+
+ cpu = cpumask_first(&visit_cpus);
+
+ /*
+ * Is the group utilization affected by cpus outside this
+ * sched_group?
+ * This sd may have groups with cpus which were not present
+ * when we took visit_cpus.
+ */
+ sd = rcu_dereference(per_cpu(sd_scs, cpu));
+
+ if (sd && sd->parent)
+ sg_shared_cap = sd->parent->groups;
+
+ for_each_domain(cpu, sd) {
+ sg = sd->groups;
+
+ /* Has this sched_domain already been visited? */
+ if (sd->child && group_first_cpu(sg) != cpu)
+ break;
+
+ do {
+ unsigned long group_util;
+ int sg_busy_energy, sg_idle_energy;
+ int cap_idx, idle_idx;
+
+ if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight)
+ eenv->sg_cap = sg_shared_cap;
+ else
+ eenv->sg_cap = sg;
+
+ cap_idx = find_new_capacity(eenv, sg->sge);
+
+ if (sg->group_weight == 1) {
+ /* Remove capacity of src CPU (before task move) */
+ if (eenv->util_delta == 0 &&
+ cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) {
+ eenv->cap.before = sg->sge->cap_states[cap_idx].cap;
+ eenv->cap.delta -= eenv->cap.before;
+ }
+ /* Add capacity of dst CPU (after task move) */
+ if (eenv->util_delta != 0 &&
+ cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) {
+ eenv->cap.after = sg->sge->cap_states[cap_idx].cap;
+ eenv->cap.delta += eenv->cap.after;
+ }
+ }
+
+ idle_idx = group_idle_state(eenv, sg);
+ group_util = group_norm_util(eenv, sg);
+
+ /* Weight busy and idle power by the busy ratio of the group. */
+ sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power)
+ >> SCHED_CAPACITY_SHIFT;
+ sg_idle_energy = ((SCHED_CAPACITY_SCALE-group_util)
+ * sg->sge->idle_states[idle_idx].power)
+ >> SCHED_CAPACITY_SHIFT;
+
+ total_energy += sg_busy_energy + sg_idle_energy;
+
+ if (!sd->child) {
+ /*
+ * cpu_count here is the number of
+ * cpus we expect to visit in this
+ * calculation. If we race against
+ * hotplug, we can have extra cpus
+ * added to the groups we are
+ * iterating which do not appear in
+ * the visit_cpus mask. In that case
+ * we are not able to calculate energy
+ * without restarting so we will bail
+ * out and use prev_cpu this time.
+ */
+ if (!cpu_count)
+ return -EINVAL;
+ cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg));
+ cpu_count--;
+ }
+
+ if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top)))
+ goto next_cpu;
+
+ } while (sg = sg->next, sg != sd->groups);
+ }
+
+ /*
+ * If we raced with hotplug and got an sd NULL-pointer;
+ * returning a wrong energy estimation is better than
+ * entering an infinite loop.
+ * Specifically: If a cpu is unplugged after we took
+ * the visit_cpus mask, it no longer has an sd_scs
+ * pointer, so when we dereference it, we get NULL.
+ */
+ if (cpumask_test_cpu(cpu, &visit_cpus))
+ return -EINVAL;
+next_cpu:
+ cpumask_clear_cpu(cpu, &visit_cpus);
+ continue;
+ }
+
+ eenv->energy = total_energy;
+ return 0;
+}
+
+/* True when @cpu is a real cpu id (not -1) and belongs to @sg. */
+static inline bool cpu_in_sg(struct sched_group *sg, int cpu)
+{
+	if (cpu == -1)
+		return false;
+
+	return cpumask_test_cpu(cpu, sched_group_cpus(sg));
+}
+
+/*
+ * __energy_diff(): Estimate the energy impact of changing the utilization
+ * distribution. eenv specifies the change: utilisation amount, source, and
+ * destination cpu. Source or destination cpu may be -1 in which case the
+ * utilization is removed from or added to the system (e.g. task wake-up). If
+ * both are specified, the utilization is migrated.
+ *
+ * Returns eenv->nrg.diff (energy after minus energy before, forced to 0 when
+ * it is smaller than the dead-zone margin). 0 is also returned, without
+ * updating eenv->nrg, when src and dst cpu are equal, when no sd_ea domain
+ * is found, or when sched_group_energy() fails.
+ */
+static inline int __energy_diff(struct energy_env *eenv)
+{
+ struct sched_domain *sd;
+ struct sched_group *sg;
+ int sd_cpu = -1, energy_before = 0, energy_after = 0;
+ int diff, margin;
+
+ /* Same cpus but no utilization moved: the "before" picture. */
+ struct energy_env eenv_before = {
+ .util_delta = 0,
+ .src_cpu = eenv->src_cpu,
+ .dst_cpu = eenv->dst_cpu,
+ .nrg = { 0, 0, 0, 0},
+ .cap = { 0, 0, 0 },
+ };
+
+ if (eenv->src_cpu == eenv->dst_cpu)
+ return 0;
+
+ sd_cpu = (eenv->src_cpu != -1) ? eenv->src_cpu : eenv->dst_cpu;
+ sd = rcu_dereference(per_cpu(sd_ea, sd_cpu));
+
+ if (!sd)
+ return 0; /* Error */
+
+ sg = sd->groups;
+
+ /* Sum the energy of every group that holds the src or the dst cpu. */
+ do {
+ if (cpu_in_sg(sg, eenv->src_cpu) || cpu_in_sg(sg, eenv->dst_cpu)) {
+ eenv_before.sg_top = eenv->sg_top = sg;
+
+ if (sched_group_energy(&eenv_before))
+ return 0; /* Invalid result abort */
+ energy_before += eenv_before.energy;
+
+ /* Keep track of SRC cpu (before) capacity */
+ eenv->cap.before = eenv_before.cap.before;
+ eenv->cap.delta = eenv_before.cap.delta;
+
+ if (sched_group_energy(eenv))
+ return 0; /* Invalid result abort */
+ energy_after += eenv->energy;
+ }
+ } while (sg = sg->next, sg != sd->groups);
+
+ eenv->nrg.before = energy_before;
+ eenv->nrg.after = energy_after;
+ eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before;
+ eenv->payoff = 0;
+#ifndef CONFIG_SCHED_TUNE
+ trace_sched_energy_diff(eenv->task,
+ eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
+ eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
+ eenv->cap.before, eenv->cap.after, eenv->cap.delta,
+ eenv->nrg.delta, eenv->payoff);
+#endif
+ /*
+ * Dead-zone margin preventing too many migrations.
+ */
+
+ margin = eenv->nrg.before >> 6; /* ~1.56% */
+
+ diff = eenv->nrg.after - eenv->nrg.before;
+
+ eenv->nrg.diff = (abs(diff) < margin) ? 0 : eenv->nrg.diff;
+
+ return eenv->nrg.diff;
+}
+
+#ifdef CONFIG_SCHED_TUNE
+
+struct target_nrg schedtune_target_nrg;
+extern bool schedtune_initialized;
+/*
+ * System energy normalization
+ * Returns the normalized value, in the range [0..SCHED_CAPACITY_SCALE],
+ * corresponding to the specified energy variation.
+ *
+ * The magnitude is normalized; the sign of @energy_diff is carried over to
+ * the result. Before SchedTune is initialized only the direction is
+ * reported: -1 for a negative @energy_diff, 1 otherwise (including 0).
+ */
+static inline int
+normalize_energy(int energy_diff)
+{
+ u32 normalized_nrg;
+
+ /* during early setup, we don't know the extents */
+ if (unlikely(!schedtune_initialized))
+ return energy_diff < 0 ? -1 : 1 ;
+
+#ifdef CONFIG_SCHED_DEBUG
+ {
+ int max_delta;
+
+ /* Check for boundaries */
+ max_delta = schedtune_target_nrg.max_power;
+ max_delta -= schedtune_target_nrg.min_power;
+ WARN_ON(abs(energy_diff) >= max_delta);
+ }
+#endif
+
+ /* Do scaling using positive numbers to increase the range */
+ normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff;
+
+ /* Scale by energy magnitude */
+ normalized_nrg <<= SCHED_CAPACITY_SHIFT;
+
+ /* Normalize on max energy for target platform */
+ normalized_nrg = reciprocal_divide(
+ normalized_nrg, schedtune_target_nrg.rdiv);
+
+ /* Negation is done in u32; the conversion to int restores the sign. */
+ return (energy_diff < 0) ? -normalized_nrg : normalized_nrg;
+}
+
+/*
+ * energy_diff(): SchedTune-aware wrapper around __energy_diff().
+ *
+ * For tasks with a boost of 0 the plain energy difference is returned.
+ * Otherwise the difference is normalized, evaluated together with the
+ * capacity delta by schedtune_accept_deltas(), and the negated payoff is
+ * returned. eenv->nrg, eenv->cap and eenv->payoff are updated on the way.
+ */
+static inline int
+energy_diff(struct energy_env *eenv)
+{
+ int boost = schedtune_task_boost(eenv->task);
+ int nrg_delta;
+
+ /* Compute "absolute" energy diff */
+ __energy_diff(eenv);
+
+ /* Return energy diff when boost margin is 0 */
+ if (boost == 0)
+ return eenv->nrg.diff;
+
+ /* Compute normalized energy diff */
+ nrg_delta = normalize_energy(eenv->nrg.diff);
+ eenv->nrg.delta = nrg_delta;
+
+ eenv->payoff = schedtune_accept_deltas(
+ eenv->nrg.delta,
+ eenv->cap.delta,
+ eenv->task);
+
+ trace_sched_energy_diff(eenv->task,
+ eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
+ eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
+ eenv->cap.before, eenv->cap.after, eenv->cap.delta,
+ eenv->nrg.delta, eenv->payoff);
+
+ /*
+ * When SchedTune is enabled, the energy_diff() function will return
+ * the computed energy payoff value. Since the energy_diff() return
+ * value is expected to be negative by its callers, this evaluation
+ * function returns a negative value each time the evaluation returns a
+ * positive payoff, which is the condition for the acceptance of
+ * a scheduling decision
+ */
+ return -eenv->payoff;
+}
+#else /* CONFIG_SCHED_TUNE */
+#define energy_diff(eenv) __energy_diff(eenv)
+#endif
+
+/*
* Detect M:N waker/wakee relationships via a switching-frequency heuristic.
*
* A waker of many should wake a different task than the one last awakened
@@ -5216,6 +6035,149 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
return 1;
}
+/*
+ * task_util(): utilization of @p. With WALT enabled and selected for task
+ * utilization, this is the WALT demand scaled to 1024 over the WALT window;
+ * otherwise it is the task's util_avg.
+ */
+static inline unsigned long task_util(struct task_struct *p)
+{
+#ifdef CONFIG_SCHED_WALT
+	if (!walt_disabled && sysctl_sched_use_walt_task_util) {
+		unsigned long walt_demand = p->ravg.demand;
+
+		walt_demand <<= 10;
+		return walt_demand / walt_ravg_window;
+	}
+#endif
+	return p->se.avg.util_avg;
+}
+
+static inline unsigned long boosted_task_util(struct task_struct *task);
+
+/*
+ * __task_fits(): check whether @util plus the boosted utilization of @p
+ * stays below the capacity of @cpu once scaled by capacity_margin/1024.
+ */
+static inline bool __task_fits(struct task_struct *p, int cpu, int util)
+{
+	unsigned long cpu_cap = capacity_of(cpu);
+
+	util += boosted_task_util(p);
+
+	return (util * capacity_margin) < (cpu_cap * 1024);
+}
+
+/*
+ * task_fits_max(): true when @cpu is (close to) the biggest capacity in its
+ * root domain, or when @p fits on @cpu according to __task_fits().
+ */
+static inline bool task_fits_max(struct task_struct *p, int cpu)
+{
+	unsigned long cpu_cap = capacity_of(cpu);
+	unsigned long top_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val;
+
+	/*
+	 * Nothing bigger to move to: either this cpu has the maximum
+	 * capacity, or it is within capacity_margin of it.
+	 */
+	if (cpu_cap == top_cap ||
+	    cpu_cap * capacity_margin > top_cap * 1024)
+		return true;
+
+	return __task_fits(p, cpu, 0);
+}
+
+/*
+ * cpu_overutilized(): true when the utilization of @cpu, scaled by
+ * capacity_margin/1024, exceeds its capacity.
+ */
+static bool cpu_overutilized(int cpu)
+{
+	unsigned long scaled_cap = capacity_of(cpu) * 1024;
+
+	return (cpu_util(cpu) * capacity_margin) > scaled_cap;
+}
+
+#ifdef CONFIG_SCHED_TUNE
+
+struct reciprocal_value schedtune_spc_rdiv;
+
+/*
+ * schedtune_margin(): margin to add to @signal for the given @boost.
+ *
+ * For boost >= 0 the margin is proportional to the headroom left above the
+ * signal (SCHED_CAPACITY_SCALE - signal); for boost < 0 it is proportional
+ * to the signal itself and the returned margin is negative. In both cases
+ * the product is divided through schedtune_spc_rdiv.
+ */
+static long
+schedtune_margin(unsigned long signal, long boost)
+{
+ long long margin = 0;
+
+ /*
+ * Signal proportional compensation (SPC)
+ *
+ * The Boost (B) value is used to compute a Margin (M) which is
+ * proportional to the complement of the original Signal (S):
+ * M = B * (SCHED_CAPACITY_SCALE - S)
+ * The obtained M could be used by the caller to "boost" S.
+ */
+ if (boost >= 0) {
+ margin = SCHED_CAPACITY_SCALE - signal;
+ margin *= boost;
+ } else
+ /*
+ * NOTE(review): -signal is an unsigned negation; the product
+ * presumably wraps back to signal * |boost| -- confirm.
+ */
+ margin = -signal * boost;
+
+ margin = reciprocal_divide(margin, schedtune_spc_rdiv);
+
+ if (boost < 0)
+ margin *= -1;
+ return margin;
+}
+
+/*
+ * schedtune_cpu_margin(): margin to apply to the utilization @util of @cpu
+ * according to the SchedTune boost of that cpu; 0 when it is not boosted.
+ */
+static inline int
+schedtune_cpu_margin(unsigned long util, int cpu)
+{
+	int boost = schedtune_cpu_boost(cpu);
+
+	return boost ? schedtune_margin(util, boost) : 0;
+}
+
+/*
+ * schedtune_task_margin(): margin to apply to the utilization of @task
+ * according to its SchedTune boost; 0 when the task is not boosted.
+ */
+static inline long
+schedtune_task_margin(struct task_struct *task)
+{
+	int boost = schedtune_task_boost(task);
+
+	if (!boost)
+		return 0;
+
+	return schedtune_margin(task_util(task), boost);
+}
+
+#else /* CONFIG_SCHED_TUNE */
+
+/* Without CONFIG_SCHED_TUNE no margin is ever applied to a cpu. */
+static inline int
+schedtune_cpu_margin(unsigned long util, int cpu)
+{
+ return 0;
+}
+
+/*
+ * Without CONFIG_SCHED_TUNE no margin is ever applied to a task.
+ * NOTE(review): returns int here but long in the CONFIG_SCHED_TUNE variant.
+ */
+static inline int
+schedtune_task_margin(struct task_struct *task)
+{
+ return 0;
+}
+
+#endif /* CONFIG_SCHED_TUNE */
+
+/*
+ * boosted_cpu_util(): utilization of @cpu plus its SchedTune margin (the
+ * margin is signed). The values are also reported through the
+ * sched_boost_cpu tracepoint.
+ */
+unsigned long
+boosted_cpu_util(int cpu)
+{
+	unsigned long raw_util = cpu_util(cpu);
+	long boost_margin = schedtune_cpu_margin(raw_util, cpu);
+
+	trace_sched_boost_cpu(cpu, raw_util, boost_margin);
+
+	return raw_util + boost_margin;
+}
+
+/*
+ * boosted_task_util(): utilization of @task plus its SchedTune margin (the
+ * margin is signed). The values are also reported through the
+ * sched_boost_task tracepoint.
+ */
+static inline unsigned long
+boosted_task_util(struct task_struct *task)
+{
+	unsigned long raw_util = task_util(task);
+	long boost_margin = schedtune_task_margin(task);
+
+	trace_sched_boost_task(task, raw_util, boost_margin);
+
+	return raw_util + boost_margin;
+}
+
+static int cpu_util_wake(int cpu, struct task_struct *p);
+
+/*
+ * capacity_spare_wake(): original capacity of @cpu minus its utilization as
+ * reported by cpu_util_wake() for the waking task @p.
+ */
+static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
+{
+	unsigned long full_cap = capacity_orig_of(cpu);
+	int used = cpu_util_wake(cpu, p);
+
+	return full_cap - used;
+}
+
/*
* find_idlest_group finds and returns the least busy CPU group within the
* domain.
@@ -5225,7 +6187,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int sd_flag)
{
struct sched_group *idlest = NULL, *group = sd->groups;
+ struct sched_group *most_spare_sg = NULL;
unsigned long min_load = ULONG_MAX, this_load = 0;
+ unsigned long most_spare = 0, this_spare = 0;
int load_idx = sd->forkexec_idx;
int imbalance = 100 + (sd->imbalance_pct-100)/2;
@@ -5233,7 +6197,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
load_idx = sd->wake_idx;
do {
- unsigned long load, avg_load;
+ unsigned long load, avg_load, spare_cap, max_spare_cap;
int local_group;
int i;
@@ -5245,8 +6209,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
local_group = cpumask_test_cpu(this_cpu,
sched_group_cpus(group));
- /* Tally up the load of all CPUs in the group */
+ /*
+ * Tally up the load of all CPUs in the group and find
+ * the group containing the CPU with most spare capacity.
+ */
avg_load = 0;
+ max_spare_cap = 0;
for_each_cpu(i, sched_group_cpus(group)) {
/* Bias balancing toward cpus of our domain */
@@ -5256,6 +6224,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
load = target_load(i, load_idx);
avg_load += load;
+
+ spare_cap = capacity_spare_wake(i, p);
+
+ if (spare_cap > max_spare_cap)
+ max_spare_cap = spare_cap;
}
/* Adjust by relative CPU capacity of the group */
@@ -5263,12 +6236,33 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
if (local_group) {
this_load = avg_load;
- } else if (avg_load < min_load) {
- min_load = avg_load;
- idlest = group;
+ this_spare = max_spare_cap;
+ } else {
+ if (avg_load < min_load) {
+ min_load = avg_load;
+ idlest = group;
+ }
+
+ if (most_spare < max_spare_cap) {
+ most_spare = max_spare_cap;
+ most_spare_sg = group;
+ }
}
} while (group = group->next, group != sd->groups);
+ /*
+ * The cross-over point between using spare capacity or least load
+ * is too conservative for high utilization tasks on partially
+ * utilized systems if we require spare_capacity > task_util(p),
+ * so we allow for some task stuffing by using
+ * spare_capacity > task_util(p)/2.
+ */
+ if (this_spare > task_util(p) / 2 &&
+ imbalance*this_spare > 100*most_spare)
+ return NULL;
+ else if (most_spare > task_util(p) / 2)
+ return most_spare_sg;
+
if (!idlest || 100*this_load < imbalance*min_load)
return NULL;
return idlest;
@@ -5494,96 +6488,525 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
static int select_idle_sibling(struct task_struct *p, int prev, int target)
{
struct sched_domain *sd;
- int i;
+ struct sched_group *sg;
+ int i = task_cpu(p);
+ int best_idle_cpu = -1;
+ int best_idle_cstate = INT_MAX;
+ unsigned long best_idle_capacity = ULONG_MAX;
+
+ schedstat_inc(p->se.statistics.nr_wakeups_sis_attempts);
+ schedstat_inc(this_rq()->eas_stats.sis_attempts);
+
+ if (!sysctl_sched_cstate_aware) {
+ if (idle_cpu(target)) {
+ schedstat_inc(p->se.statistics.nr_wakeups_sis_idle);
+ schedstat_inc(this_rq()->eas_stats.sis_idle);
+ return target;
+ }
+
+ /*
+ * If the previous cpu is cache affine and idle, don't be stupid.
+ */
+ if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) {
+ schedstat_inc(p->se.statistics.nr_wakeups_sis_cache_affine);
+ schedstat_inc(this_rq()->eas_stats.sis_cache_affine);
+ return i;
+ }
+
+ sd = rcu_dereference(per_cpu(sd_llc, target));
+ if (!sd)
+ return target;
- if (idle_cpu(target))
- return target;
+ i = select_idle_core(p, sd, target);
+ if ((unsigned)i < nr_cpumask_bits)
+ return i;
+
+ i = select_idle_cpu(p, sd, target);
+ if ((unsigned)i < nr_cpumask_bits)
+ return i;
+
+ i = select_idle_smt(p, sd, target);
+ if ((unsigned)i < nr_cpumask_bits)
+ return i;
+ }
/*
- * If the previous cpu is cache affine and idle, don't be stupid.
+ * Otherwise, iterate the domains and find an eligible idle cpu.
*/
- if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
- return prev;
-
sd = rcu_dereference(per_cpu(sd_llc, target));
- if (!sd)
- return target;
+ for_each_lower_domain(sd) {
+ sg = sd->groups;
+ do {
+ if (!cpumask_intersects(sched_group_cpus(sg),
+ tsk_cpus_allowed(p)))
+ goto next;
+
+
+ if (sysctl_sched_cstate_aware) {
+ for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
+ int idle_idx = idle_get_state_idx(cpu_rq(i));
+ unsigned long new_usage = boosted_task_util(p);
+ unsigned long capacity_orig = capacity_orig_of(i);
+
+ if (new_usage > capacity_orig || !idle_cpu(i))
+ goto next;
+
+ if (i == target && new_usage <= capacity_curr_of(target)) {
+ schedstat_inc(p->se.statistics.nr_wakeups_sis_suff_cap);
+ schedstat_inc(this_rq()->eas_stats.sis_suff_cap);
+ schedstat_inc(sd->eas_stats.sis_suff_cap);
+ return target;
+ }
+
+ if (idle_idx < best_idle_cstate &&
+ capacity_orig <= best_idle_capacity) {
+ best_idle_cpu = i;
+ best_idle_cstate = idle_idx;
+ best_idle_capacity = capacity_orig;
+ }
+ }
+ } else {
+ for_each_cpu(i, sched_group_cpus(sg)) {
+ if (i == target || !idle_cpu(i))
+ goto next;
+ }
- i = select_idle_core(p, sd, target);
- if ((unsigned)i < nr_cpumask_bits)
- return i;
+ target = cpumask_first_and(sched_group_cpus(sg),
+ tsk_cpus_allowed(p));
+ schedstat_inc(p->se.statistics.nr_wakeups_sis_idle_cpu);
+ schedstat_inc(this_rq()->eas_stats.sis_idle_cpu);
+ schedstat_inc(sd->eas_stats.sis_idle_cpu);
+ goto done;
+ }
+next:
+ sg = sg->next;
+ } while (sg != sd->groups);
+ }
- i = select_idle_cpu(p, sd, target);
- if ((unsigned)i < nr_cpumask_bits)
- return i;
+ if (best_idle_cpu >= 0)
+ target = best_idle_cpu;
- i = select_idle_smt(p, sd, target);
- if ((unsigned)i < nr_cpumask_bits)
- return i;
+done:
+ schedstat_inc(p->se.statistics.nr_wakeups_sis_count);
+ schedstat_inc(this_rq()->eas_stats.sis_count);
return target;
}
-
+
/*
- * cpu_util returns the amount of capacity of a CPU that is used by CFS
- * tasks. The unit of the return value must be the one of capacity so we can
- * compare the utilization with the capacity of the CPU that is available for
- * CFS task (ie cpu_capacity).
- *
- * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
- * recent utilization of currently non-runnable tasks on a CPU. It represents
- * the amount of utilization of a CPU in the range [0..capacity_orig] where
- * capacity_orig is the cpu_capacity available at the highest frequency
- * (arch_scale_freq_capacity()).
- * The utilization of a CPU converges towards a sum equal to or less than the
- * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
- * the running time on this CPU scaled by capacity_curr.
- *
- * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
- * higher than capacity_orig because of unfortunate rounding in
- * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
- * the average stabilizes with the new running time. We need to check that the
- * utilization stays within the range of [0..capacity_orig] and cap it if
- * necessary. Without utilization capping, a group could be seen as overloaded
- * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
- * available capacity. We allow utilization to overshoot capacity_curr (but not
- * capacity_orig) as it useful for predicting the capacity required after task
- * migrations (scheduler-driven DVFS).
+ * cpu_util_wake: Compute cpu utilization with any contributions from
+ * the waking task p removed.
*/
-static int cpu_util(int cpu)
+static int cpu_util_wake(int cpu, struct task_struct *p)
{
- unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
- unsigned long capacity = capacity_orig_of(cpu);
+ unsigned long util, capacity;
+
+#ifdef CONFIG_SCHED_WALT
+ /*
+ * WALT does not decay idle tasks in the same manner
+ * as PELT, so it makes little sense to subtract task
+ * utilization from cpu utilization. Instead just use
+ * cpu_util for this case.
+ */
+ if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
+ return cpu_util(cpu);
+#endif
+ /* Task has no contribution or is new */
+ if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
+ return cpu_util(cpu);
+
+ capacity = capacity_orig_of(cpu);
+ util = max_t(long, cpu_util(cpu) - task_util(p), 0);
return (util >= capacity) ? capacity : util;
}
-static inline int task_util(struct task_struct *p)
+/*
+ * start_cpu(): cpu from which find_best_target() starts its search.
+ *
+ * Boosted tasks start from the root domain's max_cap_orig_cpu, all others
+ * from its min_cap_orig_cpu. The root domain is the one of the cpu this
+ * code runs on. The caller must be inside a sched-RCU read-side section.
+ */
+static int start_cpu(bool boosted)
{
- return p->se.avg.util_avg;
+	struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+
+	/*
+	 * RCU_LOCKDEP_WARN() complains when its condition is true, so test
+	 * for the read-side section NOT being held.
+	 */
+	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+			 "sched RCU must be held");
+
+	return boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu;
+}
+
+static inline int find_best_target(struct task_struct *p, int *backup_cpu,
+ bool boosted, bool prefer_idle)
+{
+ unsigned long best_idle_min_cap_orig = ULONG_MAX;
+ unsigned long min_util = boosted_task_util(p);
+ unsigned long target_capacity = ULONG_MAX;
+ unsigned long min_wake_util = ULONG_MAX;
+ unsigned long target_max_spare_cap = 0;
+ unsigned long target_util = ULONG_MAX;
+ unsigned long best_active_util = ULONG_MAX;
+ int best_idle_cstate = INT_MAX;
+ struct sched_domain *sd;
+ struct sched_group *sg;
+ int best_active_cpu = -1;
+ int best_idle_cpu = -1;
+ int target_cpu = -1;
+ int cpu, i;
+
+ *backup_cpu = -1;
+
+ schedstat_inc(p->se.statistics.nr_wakeups_fbt_attempts);
+ schedstat_inc(this_rq()->eas_stats.fbt_attempts);
+
+ /* Find start CPU based on boost value */
+ cpu = start_cpu(boosted);
+ if (cpu < 0) {
+ schedstat_inc(p->se.statistics.nr_wakeups_fbt_no_cpu);
+ schedstat_inc(this_rq()->eas_stats.fbt_no_cpu);
+ return -1;
+ }
+
+ /* Find SD for the start CPU */
+ sd = rcu_dereference(per_cpu(sd_ea, cpu));
+ if (!sd) {
+ schedstat_inc(p->se.statistics.nr_wakeups_fbt_no_sd);
+ schedstat_inc(this_rq()->eas_stats.fbt_no_sd);
+ return -1;
+ }
+
+ /* Scan CPUs in all SDs */
+ sg = sd->groups;
+ do {
+ for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
+ unsigned long capacity_curr = capacity_curr_of(i);
+ unsigned long capacity_orig = capacity_orig_of(i);
+ unsigned long wake_util, new_util;
+
+ if (!cpu_online(i))
+ continue;
+
+ if (walt_cpu_high_irqload(i))
+ continue;
+
+ /*
+ * p's blocked utilization is still accounted for on prev_cpu
+ * so prev_cpu will receive a negative bias due to the double
+ * accounting. However, the blocked utilization may be zero.
+ */
+ wake_util = cpu_util_wake(i, p);
+ new_util = wake_util + task_util(p);
+
+ /*
+ * Ensure minimum capacity to grant the required boost.
+ * The target CPU can be already at a capacity level higher
+ * than the one required to boost the task.
+ */
+ new_util = max(min_util, new_util);
+ if (new_util > capacity_orig)
+ continue;
+
+ /*
+ * Case A) Latency sensitive tasks
+ *
+ * Unconditionally favoring tasks that prefer idle CPU to
+ * improve latency.
+ *
+ * Looking for:
+ * - an idle CPU, whatever its idle_state is, since
+ * the first CPUs we explore are more likely to be
+ * reserved for latency sensitive tasks.
+ * - a non idle CPU where the task fits in its current
+ * capacity and has the maximum spare capacity.
+ * - a non idle CPU with lower contention from other
+ * tasks and running at the lowest possible OPP.
+ *
+ * The last two goals try to favor a non idle CPU
+ * where the task can run as if it is "almost alone".
+ * A maximum spare capacity CPU is favoured since
+ * the task already fits into that CPU's capacity
+ * without waiting for an OPP change.
+ *
+ * The following code path is the only one in the CPUs
+ * exploration loop which is always used by
+ * prefer_idle tasks. It exits the loop with either a
+ * best_active_cpu or a target_cpu which should
+ * represent an optimal choice for latency sensitive
+ * tasks.
+ */
+ if (prefer_idle) {
+
+ /*
+ * Case A.1: IDLE CPU
+ * Return the first IDLE CPU we find.
+ */
+ if (idle_cpu(i)) {
+ schedstat_inc(p->se.statistics.nr_wakeups_fbt_pref_idle);
+ schedstat_inc(this_rq()->eas_stats.fbt_pref_idle);
+
+ trace_sched_find_best_target(p,
+ prefer_idle, min_util,
+ cpu, best_idle_cpu,
+ best_active_cpu, i);
+
+ return i;
+ }
+
+ /*
+ * Case A.2: Target ACTIVE CPU
+ * Favor CPUs with max spare capacity.
+ */
+ if ((capacity_curr > new_util) &&
+ (capacity_orig - new_util > target_max_spare_cap)) {
+ target_max_spare_cap = capacity_orig - new_util;
+ target_cpu = i;
+ continue;
+ }
+ if (target_cpu != -1)
+ continue;
+
+
+ /*
+ * Case A.3: Backup ACTIVE CPU
+ * Favor CPUs with:
+ * - lower utilization due to other tasks
+ * - lower utilization with the task in
+ */
+ if (wake_util > min_wake_util)
+ continue;
+ if (new_util > best_active_util)
+ continue;
+ min_wake_util = wake_util;
+ best_active_util = new_util;
+ best_active_cpu = i;
+ continue;
+ }
+
+ /*
+ * Case B) Non latency sensitive tasks on IDLE CPUs.
+ *
+ * Find an optimal backup IDLE CPU for non latency
+ * sensitive tasks.
+ *
+ * Looking for:
+ * - minimizing the capacity_orig,
+ * i.e. preferring LITTLE CPUs
+ * - favoring shallowest idle states
+ * i.e. avoid to wakeup deep-idle CPUs
+ *
+ * The following code path is used by non latency
+ * sensitive tasks if IDLE CPUs are available. If at
+ * least one of such CPUs are available it sets the
+ * best_idle_cpu to the most suitable idle CPU to be
+ * selected.
+ *
+ * If idle CPUs are available, favour these CPUs to
+ * improve performances by spreading tasks.
+ * Indeed, the energy_diff() computed by the caller
+ * will take care to ensure the minimization of energy
+ * consumptions without affecting performance.
+ */
+ if (idle_cpu(i)) {
+ int idle_idx = idle_get_state_idx(cpu_rq(i));
+
+ /* Select idle CPU with lower cap_orig */
+ if (capacity_orig > best_idle_min_cap_orig)
+ continue;
+
+ /*
+ * Skip CPUs in deeper idle state, but only
+ * if they are also less energy efficient.
+ * IOW, prefer a deep IDLE LITTLE CPU vs a
+ * shallow idle big CPU.
+ */
+ if (sysctl_sched_cstate_aware &&
+ best_idle_cstate <= idle_idx)
+ continue;
+
+ /* Keep track of best idle CPU */
+ best_idle_min_cap_orig = capacity_orig;
+ best_idle_cstate = idle_idx;
+ best_idle_cpu = i;
+ continue;
+ }
+
+ /*
+ * Case C) Non latency sensitive tasks on ACTIVE CPUs.
+ *
+ * Pack tasks in the most energy efficient capacities.
+ *
+ * This task packing strategy prefers more energy
+ * efficient CPUs (i.e. pack on smaller maximum
+ * capacity CPUs) while also trying to spread tasks to
+ * run them all at the lower OPP.
+ *
+ * This assumes for example that it's more energy
+ * efficient to run two tasks on two CPUs at a lower
+ * OPP than packing both on a single CPU but running
+ * that CPU at an higher OPP.
+ *
+ * Thus, this case keep track of the CPU with the
+ * smallest maximum capacity and highest spare maximum
+ * capacity.
+ */
+
+ /* Favor CPUs with smaller capacity */
+ if (capacity_orig > target_capacity)
+ continue;
+
+ /* Favor CPUs with maximum spare capacity */
+ if ((capacity_orig - new_util) < target_max_spare_cap)
+ continue;
+
+ target_max_spare_cap = capacity_orig - new_util;
+ target_capacity = capacity_orig;
+ target_util = new_util;
+ target_cpu = i;
+ }
+
+ } while (sg = sg->next, sg != sd->groups);
+
+ /*
+ * For non latency sensitive tasks, cases B and C in the previous loop,
+ * we pick the best IDLE CPU only if we were not able to find a target
+ * ACTIVE CPU.
+ *
+ * Policies priorities:
+ *
+ * - prefer_idle tasks:
+ *
+ * a) IDLE CPU available, we return immediately
+ * b) ACTIVE CPU where task fits and has the bigger maximum spare
+ * capacity (i.e. target_cpu)
+ * c) ACTIVE CPU with less contention due to other tasks
+ * (i.e. best_active_cpu)
+ *
+ * - NON prefer_idle tasks:
+ *
+ * a) ACTIVE CPU: target_cpu
+ * b) IDLE CPU: best_idle_cpu
+ */
+ if (target_cpu == -1)
+ target_cpu = prefer_idle
+ ? best_active_cpu
+ : best_idle_cpu;
+ else
+ *backup_cpu = prefer_idle
+ ? best_active_cpu
+ : best_idle_cpu;
+
+ trace_sched_find_best_target(p, prefer_idle, min_util, cpu,
+ best_idle_cpu, best_active_cpu,
+ target_cpu);
+
+ schedstat_inc(p->se.statistics.nr_wakeups_fbt_count);
+ schedstat_inc(this_rq()->eas_stats.fbt_count);
+
+ return target_cpu;
}
/*
* Disable WAKE_AFFINE in the case where task @p doesn't fit in the
* capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
- *
+ *
* In that case WAKE_AFFINE doesn't make sense and we'll let
* BALANCE_WAKE sort things out.
*/
static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
{
long min_cap, max_cap;
-
min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
- max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
-
+ max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val;
/* Minimum capacity is close to max, no need to abort wake_affine */
if (max_cap - min_cap < max_cap >> 3)
return 0;
+ /* Bring task utilization in sync with prev_cpu */
+ sync_entity_load_avg(&p->se);
+
return min_cap * 1024 < task_util(p) * capacity_margin;
}
+/*
+ * select_energy_cpu_brute - energy-aware CPU selection for a waking task.
+ * @p: task being woken up
+ * @prev_cpu: previous CPU of @p; returned whenever no better CPU is chosen
+ * @sync: non-zero when the wake-up carries the sync hint
+ *
+ * Return value, in order of evaluation:
+ *  - the current CPU, if the sync hint is enabled, set, and @p may run there;
+ *  - @prev_cpu, if no sd_ea domain is attached to @prev_cpu;
+ *  - the CPU proposed by find_best_target(), if the task is boosted or
+ *    prefer_idle and that CPU is idle, or if @prev_cpu is over-utilized;
+ *  - the proposed CPU, or else its backup, if energy_diff() is negative
+ *    for moving @p there from @prev_cpu;
+ *  - @prev_cpu otherwise.
+ */
+static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync)
+{
+	struct sched_domain *sd;
+	/*
+	 * Start the backup candidate at -1 so that it is well defined even
+	 * if find_best_target() returns without writing it.
+	 */
+	int target_cpu = prev_cpu, tmp_target, tmp_backup = -1;
+	bool boosted, prefer_idle;
+
+	schedstat_inc(p->se.statistics.nr_wakeups_secb_attempts);
+	schedstat_inc(this_rq()->eas_stats.secb_attempts);
+
+	/* Sync hint: place the task on the current CPU when it is allowed */
+	if (sysctl_sched_sync_hint_enable && sync) {
+		int cpu = smp_processor_id();
+
+		if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
+			schedstat_inc(p->se.statistics.nr_wakeups_secb_sync);
+			schedstat_inc(this_rq()->eas_stats.secb_sync);
+			return cpu;
+		}
+	}
+
+	rcu_read_lock();
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+	boosted = schedtune_task_boost(p) > 0;
+	prefer_idle = schedtune_prefer_idle(p) > 0;
+#else
+	boosted = get_sysctl_sched_cfs_boost() > 0;
+	prefer_idle = 0;
+#endif
+
+	sync_entity_load_avg(&p->se);
+
+	/*
+	 * Without an sd_ea domain for prev_cpu the result is always
+	 * prev_cpu, so bail out before paying for the target search.
+	 */
+	sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));
+	if (!sd)
+		goto unlock;
+
+	/* Find a cpu with sufficient capacity */
+	tmp_target = find_best_target(p, &tmp_backup, boosted, prefer_idle);
+	if (tmp_target >= 0) {
+		target_cpu = tmp_target;
+		/* Boosted/prefer_idle tasks take an idle target right away */
+		if ((boosted || prefer_idle) && idle_cpu(target_cpu)) {
+			schedstat_inc(p->se.statistics.nr_wakeups_secb_idle_bt);
+			schedstat_inc(this_rq()->eas_stats.secb_idle_bt);
+			goto unlock;
+		}
+	}
+
+	if (target_cpu != prev_cpu) {
+		struct energy_env eenv = {
+			.util_delta	= task_util(p),
+			.src_cpu	= prev_cpu,
+			.dst_cpu	= target_cpu,
+			.task		= p,
+		};
+
+		/* Not enough spare capacity on previous cpu */
+		if (cpu_overutilized(prev_cpu)) {
+			schedstat_inc(p->se.statistics.nr_wakeups_secb_insuff_cap);
+			schedstat_inc(this_rq()->eas_stats.secb_insuff_cap);
+			goto unlock;
+		}
+
+		if (energy_diff(&eenv) >= 0) {
+			/* No energy saving for target_cpu, try backup */
+			target_cpu = tmp_backup;
+			eenv.dst_cpu = target_cpu;
+			if (tmp_backup < 0 || energy_diff(&eenv) >= 0) {
+				schedstat_inc(p->se.statistics.nr_wakeups_secb_no_nrg_sav);
+				schedstat_inc(this_rq()->eas_stats.secb_no_nrg_sav);
+				target_cpu = prev_cpu;
+				goto unlock;
+			}
+		}
+
+		schedstat_inc(p->se.statistics.nr_wakeups_secb_nrg_sav);
+		schedstat_inc(this_rq()->eas_stats.secb_nrg_sav);
+		goto unlock;
+	}
+
+	schedstat_inc(p->se.statistics.nr_wakeups_secb_count);
+	schedstat_inc(this_rq()->eas_stats.secb_count);
+
+unlock:
+	rcu_read_unlock();
+	return target_cpu;
+}
+
/*
* select_task_rq_fair: Select target runqueue for the waking task in domains
* that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
@@ -5607,10 +7030,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (sd_flag & SD_BALANCE_WAKE) {
record_wakee(p);
- want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
- && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+ want_affine = (!wake_wide(p) && !wake_cap(p, cpu, prev_cpu) &&
+ cpumask_test_cpu(cpu, tsk_cpus_allowed(p)));
}
+ if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
+ return select_energy_cpu_brute(p, prev_cpu, sync);
+
rcu_read_lock();
for_each_domain(cpu, tmp) {
if (!(tmp->flags & SD_LOAD_BALANCE))
@@ -5642,39 +7068,58 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
- } else while (sd) {
- struct sched_group *group;
- int weight;
+ } else {
+ int wu = sd_flag & SD_BALANCE_WAKE;
+ int cas_cpu = -1;
- if (!(sd->flags & sd_flag)) {
- sd = sd->child;
- continue;
+ if (wu) {
+ schedstat_inc(p->se.statistics.nr_wakeups_cas_attempts);
+ schedstat_inc(this_rq()->eas_stats.cas_attempts);
}
- group = find_idlest_group(sd, p, cpu, sd_flag);
- if (!group) {
- sd = sd->child;
- continue;
- }
- new_cpu = find_idlest_cpu(group, p, cpu);
- if (new_cpu == -1 || new_cpu == cpu) {
- /* Now try balancing at a lower domain level of cpu */
- sd = sd->child;
- continue;
+ while (sd) {
+ struct sched_group *group;
+ int weight;
+
+ if (wu)
+ schedstat_inc(sd->eas_stats.cas_attempts);
+
+ if (!(sd->flags & sd_flag)) {
+ sd = sd->child;
+ continue;
+ }
+
+ group = find_idlest_group(sd, p, cpu, sd_flag);
+ if (!group) {
+ sd = sd->child;
+ continue;
+ }
+
+ new_cpu = find_idlest_cpu(group, p, cpu);
+ if (new_cpu == -1 || new_cpu == cpu) {
+ /* Now try balancing at a lower domain level of cpu */
+ sd = sd->child;
+ continue;
+ }
+
+ /* Now try balancing at a lower domain level of new_cpu */
+ cpu = cas_cpu = new_cpu;
+ weight = sd->span_weight;
+ sd = NULL;
+ for_each_domain(cpu, tmp) {
+ if (weight <= tmp->span_weight)
+ break;
+ if (tmp->flags & sd_flag)
+ sd = tmp;
+ }
+ /* while loop will break here if sd == NULL */
}
- /* Now try balancing at a lower domain level of new_cpu */
- cpu = new_cpu;
- weight = sd->span_weight;
- sd = NULL;
- for_each_domain(cpu, tmp) {
- if (weight <= tmp->span_weight)
- break;
- if (tmp->flags & sd_flag)
- sd = tmp;
+ if (wu && (cas_cpu >= 0)) {
+ schedstat_inc(p->se.statistics.nr_wakeups_cas_count);
+ schedstat_inc(this_rq()->eas_stats.cas_count);
}
- /* while loop will break here if sd == NULL */
}
rcu_read_unlock();
@@ -5734,6 +7179,8 @@ static void task_dead_fair(struct task_struct *p)
{
remove_entity_load_avg(&p->se);
}
+#else
+#define task_fits_max(p, cpu) true
#endif /* CONFIG_SMP */
static unsigned long
@@ -5980,6 +7427,8 @@ again:
if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);
+ rq->misfit_task = !task_fits_max(p, rq->cpu);
+
return p;
simple:
cfs_rq = &rq->cfs;
@@ -6001,9 +7450,12 @@ simple:
if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);
+ rq->misfit_task = !task_fits_max(p, rq->cpu);
+
return p;
idle:
+ rq->misfit_task = 0;
/*
* This is OK, because current is on_cpu, which avoids it being picked
* for load-balance and preemption/IRQs are still disabled avoiding
@@ -6216,6 +7668,13 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
enum fbq_type { regular, remote, all };
+/*
+ * Classification of a sched_group during load balancing.
+ *
+ * The numeric order is significant: group types are compared with
+ * relational operators (e.g. "sgs->group_type < busiest->group_type"
+ * when picking the busiest group, and
+ * "local->group_type <= group_misfit_task" when computing the
+ * imbalance), so new values must be inserted at the right rank.
+ */
+enum group_type {
+ group_other = 0,
+ group_misfit_task,
+ group_imbalanced,
+ group_overloaded,
+};
+
#define LBF_ALL_PINNED 0x01
#define LBF_NEED_BREAK 0x02
#define LBF_DST_PINNED 0x04
@@ -6234,6 +7693,7 @@ struct lb_env {
int new_dst_cpu;
enum cpu_idle_type idle;
long imbalance;
+ unsigned int src_grp_nr_running;
/* The set of CPUs under consideration for load-balancing */
struct cpumask *cpus;
@@ -6244,6 +7704,7 @@ struct lb_env {
unsigned int loop_max;
enum fbq_type fbq_type;
+ enum group_type busiest_group_type;
struct list_head tasks;
};
@@ -6425,7 +7886,9 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(env->src_rq, p, 0);
+ double_lock_balance(env->src_rq, env->dst_rq);
set_task_cpu(p, env->dst_cpu);
+ double_unlock_balance(env->src_rq, env->dst_rq);
}
/*
@@ -6570,6 +8033,10 @@ static void attach_one_task(struct rq *rq, struct task_struct *p)
{
raw_spin_lock(&rq->lock);
attach_task(rq, p);
+ /*
+ * We want to potentially raise target_cpu's OPP.
+ */
+ update_capacity_of(cpu_of(rq));
raw_spin_unlock(&rq->lock);
}
@@ -6591,6 +8058,11 @@ static void attach_tasks(struct lb_env *env)
attach_task(env->dst_rq, p);
}
+ /*
+ * We want to potentially raise env.dst_cpu's OPP.
+ */
+ update_capacity_of(env->dst_cpu);
+
raw_spin_unlock(&env->dst_rq->lock);
}
@@ -6615,6 +8087,10 @@ static void update_blocked_averages(int cpu)
if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
update_tg_load_avg(cfs_rq, 0);
+
+ /* Propagate pending load changes to the parent */
+ if (cfs_rq->tg->se[cpu])
+ update_load_avg(cfs_rq->tg->se[cpu], 0);
}
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -6686,12 +8162,6 @@ static unsigned long task_h_load(struct task_struct *p)
/********** Helpers for find_busiest_group ************************/
-enum group_type {
- group_other = 0,
- group_imbalanced,
- group_overloaded,
-};
-
/*
* sg_lb_stats - stats of a sched_group required for load_balancing
*/
@@ -6707,6 +8177,7 @@ struct sg_lb_stats {
unsigned int group_weight;
enum group_type group_type;
int group_no_capacity;
+ int group_misfit_task; /* A cpu has a task too big for its capacity */
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
@@ -6804,13 +8275,43 @@ static unsigned long scale_rt_capacity(int cpu)
return 1;
}
+/*
+ * init_max_cpu_capacity - reset @mcc to its empty state: no capacity
+ * recorded (val == 0), no CPU associated (cpu == -1), lock initialised.
+ */
+void init_max_cpu_capacity(struct max_cpu_capacity *mcc)
+{
+	mcc->cpu = -1;
+	mcc->val = 0;
+	raw_spin_lock_init(&mcc->lock);
+}
+
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
struct sched_group *sdg = sd->groups;
+ struct max_cpu_capacity *mcc;
+ unsigned long max_capacity;
+ int max_cap_cpu;
+ unsigned long flags;
cpu_rq(cpu)->cpu_capacity_orig = capacity;
+ mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
+
+ raw_spin_lock_irqsave(&mcc->lock, flags);
+ max_capacity = mcc->val;
+ max_cap_cpu = mcc->cpu;
+
+ if ((max_capacity > capacity && max_cap_cpu == cpu) ||
+ (max_capacity < capacity)) {
+ mcc->val = capacity;
+ mcc->cpu = cpu;
+#ifdef CONFIG_SCHED_DEBUG
+ raw_spin_unlock_irqrestore(&mcc->lock, flags);
+ pr_info("CPU%d: update max cpu_capacity %lu\n", cpu, capacity);
+ goto skip_unlock;
+#endif
+ }
+ raw_spin_unlock_irqrestore(&mcc->lock, flags);
+
+skip_unlock: __attribute__ ((unused));
capacity *= scale_rt_capacity(cpu);
capacity >>= SCHED_CAPACITY_SHIFT;
@@ -6819,13 +8320,15 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
cpu_rq(cpu)->cpu_capacity = capacity;
sdg->sgc->capacity = capacity;
+ sdg->sgc->max_capacity = capacity;
+ sdg->sgc->min_capacity = capacity;
}
void update_group_capacity(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
- unsigned long capacity;
+ unsigned long capacity, max_capacity, min_capacity;
unsigned long interval;
interval = msecs_to_jiffies(sd->balance_interval);
@@ -6838,6 +8341,8 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
}
capacity = 0;
+ max_capacity = 0;
+ min_capacity = ULONG_MAX;
if (child->flags & SD_OVERLAP) {
/*
@@ -6862,11 +8367,13 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
*/
if (unlikely(!rq->sd)) {
capacity += capacity_of(cpu);
- continue;
+ } else {
+ sgc = rq->sd->groups->sgc;
+ capacity += sgc->capacity;
}
- sgc = rq->sd->groups->sgc;
- capacity += sgc->capacity;
+ max_capacity = max(capacity, max_capacity);
+ min_capacity = min(capacity, min_capacity);
}
} else {
/*
@@ -6876,12 +8383,18 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
group = child->groups;
do {
- capacity += group->sgc->capacity;
+ struct sched_group_capacity *sgc = group->sgc;
+
+ capacity += sgc->capacity;
+ max_capacity = max(sgc->max_capacity, max_capacity);
+ min_capacity = min(sgc->min_capacity, min_capacity);
group = group->next;
} while (group != child->groups);
}
sdg->sgc->capacity = capacity;
+ sdg->sgc->max_capacity = max_capacity;
+ sdg->sgc->min_capacity = min_capacity;
}
/*
@@ -6976,6 +8489,17 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
return false;
}
+/*
+ * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
+ * per-cpu capacity than sched_group ref.
+ *
+ * Only the max_capacity of each group's sgc is compared. The comparison
+ * is biased: sg's max_capacity is offset by
+ * (capacity_margin - SCHED_CAPACITY_SCALE) before being compared, so
+ * (presumably with capacity_margin above SCHED_CAPACITY_SCALE - confirm)
+ * sg has to trail ref by more than that offset to count as smaller.
+ */
+static inline bool
+group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
+{
+ return sg->sgc->max_capacity + capacity_margin - SCHED_CAPACITY_SCALE <
+ ref->sgc->max_capacity;
+}
+
static inline enum
group_type group_classify(struct sched_group *group,
struct sg_lb_stats *sgs)
@@ -6986,9 +8510,44 @@ group_type group_classify(struct sched_group *group,
if (sg_imbalanced(group))
return group_imbalanced;
+ if (sgs->group_misfit_task)
+ return group_misfit_task;
+
return group_other;
}
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * idle load balancing data
+ * - used by the nohz balance, but we want it available here
+ * so that we can see which CPUs have no tick.
+ */
+static struct {
+ cpumask_var_t idle_cpus_mask;
+ atomic_t nr_cpus;
+ unsigned long next_balance; /* in jiffy units */
+} nohz ____cacheline_aligned;
+
+/*
+ * update_cpu_stats_if_tickless - refresh the load statistics of @rq when
+ * its CPU is set in nohz.idle_cpus_mask.
+ *
+ * Takes rq->lock, updates the rq clock, then calls cpu_load_update_idle()
+ * and update_cfs_rq_load_avg() for the root cfs_rq. Does nothing if the
+ * CPU is not in the mask or if it was already updated in this jiffy.
+ */
+static inline void update_cpu_stats_if_tickless(struct rq *rq)
+{
+	/* only called from update_sg_lb_stats when irqs are disabled */
+	if (!cpumask_test_cpu(rq->cpu, nohz.idle_cpus_mask))
+		return;
+
+	/*
+	 * Rate limit updates to once per jiffy at most. Use the wrap-safe
+	 * helper: a plain "<=" on jiffies gives the wrong answer once the
+	 * counter wraps around.
+	 */
+	if (time_before_eq(READ_ONCE(jiffies), rq->last_load_update_tick))
+		return;
+
+	raw_spin_lock(&rq->lock);
+	update_rq_clock(rq);
+	cpu_load_update_idle(rq);
+	update_cfs_rq_load_avg(rq->clock_task, &rq->cfs, false);
+	raw_spin_unlock(&rq->lock);
+}
+
+#else
+static inline void update_cpu_stats_if_tickless(struct rq *rq) { }
+#endif
+
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
@@ -6997,11 +8556,12 @@ group_type group_classify(struct sched_group *group,
* @local_group: Does group contain this_cpu.
* @sgs: variable to hold the statistics for this group.
* @overload: Indicate more than one runnable task for any CPU.
+ * @overutilized: Indicate overutilization for any CPU.
*/
static inline void update_sg_lb_stats(struct lb_env *env,
struct sched_group *group, int load_idx,
int local_group, struct sg_lb_stats *sgs,
- bool *overload)
+ bool *overload, bool *overutilized)
{
unsigned long load;
int i, nr_running;
@@ -7011,6 +8571,12 @@ static inline void update_sg_lb_stats(struct lb_env *env,
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
struct rq *rq = cpu_rq(i);
+ /* if we are entering idle and there are CPUs with
+ * their tick stopped, do an update for them
+ */
+ if (env->idle == CPU_NEWLY_IDLE)
+ update_cpu_stats_if_tickless(rq);
+
/* Bias balancing toward cpus of our domain */
if (local_group)
load = target_load(i, load_idx);
@@ -7035,6 +8601,12 @@ static inline void update_sg_lb_stats(struct lb_env *env,
*/
if (!nr_running && idle_cpu(i))
sgs->idle_cpus++;
+
+ if (cpu_overutilized(i)) {
+ *overutilized = true;
+ if (!sgs->group_misfit_task && rq->misfit_task)
+ sgs->group_misfit_task = capacity_of(i);
+ }
}
/* Adjust by relative CPU capacity of the group */
@@ -7076,9 +8648,31 @@ static bool update_sd_pick_busiest(struct lb_env *env,
if (sgs->group_type < busiest->group_type)
return false;
+ /*
+ * Candidate sg doesn't face any serious load-balance problems
+ * so don't pick it if the local sg is already filled up.
+ */
+ if (sgs->group_type == group_other &&
+ !group_has_capacity(env, &sds->local_stat))
+ return false;
+
if (sgs->avg_load <= busiest->avg_load)
return false;
+ if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
+ goto asym_packing;
+
+ /*
+ * Candidate sg has no more than one task per CPU and
+ * has higher per-CPU capacity. Migrating tasks to less
+ * capable CPUs may harm throughput. Maximize throughput,
+ * power/energy consequences are not considered.
+ */
+ if (sgs->sum_nr_running <= sgs->group_weight &&
+ group_smaller_cpu_capacity(sds->local, sg))
+ return false;
+
+asym_packing:
/* This is the busiest node in its class. */
if (!(env->sd->flags & SD_ASYM_PACKING))
return true;
@@ -7133,6 +8727,9 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
}
#endif /* CONFIG_NUMA_BALANCING */
+/*
+ * lb_sd_parent - true if @sd has a parent domain whose group list contains
+ * more than one group (i.e. the first group's ->next is not itself).
+ * The argument is parenthesized so any pointer expression can be passed.
+ */
+#define lb_sd_parent(sd) \
+	((sd)->parent && (sd)->parent->groups != (sd)->parent->groups->next)
+
/**
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @env: The load balancing environment.
@@ -7144,7 +8741,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats tmp_sgs;
int load_idx, prefer_sibling = 0;
- bool overload = false;
+ bool overload = false, overutilized = false;
if (child && child->flags & SD_PREFER_SIBLING)
prefer_sibling = 1;
@@ -7166,7 +8763,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
}
update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
- &overload);
+ &overload, &overutilized);
if (local_group)
goto next_group;
@@ -7188,6 +8785,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
sgs->group_type = group_classify(sg, sgs);
}
+ /*
+ * Ignore task groups with misfit tasks if local group has no
+ * capacity or if per-cpu capacity isn't higher.
+ */
+ if (sgs->group_type == group_misfit_task &&
+ (!group_has_capacity(env, &sds->local_stat) ||
+ !group_smaller_cpu_capacity(sg, sds->local)))
+ sgs->group_type = group_other;
+
if (update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
sds->busiest_stat = *sgs;
@@ -7204,10 +8810,23 @@ next_group:
if (env->sd->flags & SD_NUMA)
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
- if (!env->sd->parent) {
+ env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
+
+ if (!lb_sd_parent(env->sd)) {
/* update overload indicator if we are at root domain */
if (env->dst_rq->rd->overload != overload)
env->dst_rq->rd->overload = overload;
+
+ /* Update over-utilization (tipping point, U >= 0) indicator */
+ if (env->dst_rq->rd->overutilized != overutilized) {
+ env->dst_rq->rd->overutilized = overutilized;
+ trace_sched_overutilized(overutilized);
+ }
+ } else {
+ if (!env->dst_rq->rd->overutilized && overutilized) {
+ env->dst_rq->rd->overutilized = true;
+ trace_sched_overutilized(true);
+ }
}
}
@@ -7360,6 +8979,22 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
*/
if (busiest->avg_load <= sds->avg_load ||
local->avg_load >= sds->avg_load) {
+ /* Misfitting tasks should be migrated in any case */
+ if (busiest->group_type == group_misfit_task) {
+ env->imbalance = busiest->group_misfit_task;
+ return;
+ }
+
+ /*
+ * Busiest group is overloaded, local is not, use the spare
+ * cycles to maximize throughput
+ */
+ if (busiest->group_type == group_overloaded &&
+ local->group_type <= group_misfit_task) {
+ env->imbalance = busiest->load_per_task;
+ return;
+ }
+
env->imbalance = 0;
return fix_small_imbalance(env, sds);
}
@@ -7393,6 +9028,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
(sds->avg_load - local->avg_load) * local->group_capacity
) / SCHED_CAPACITY_SCALE;
+ /* Boost imbalance to allow misfit task to be balanced. */
+ if (busiest->group_type == group_misfit_task)
+ env->imbalance = max_t(long, env->imbalance,
+ busiest->group_misfit_task);
+
/*
* if *imbalance is less than the average load per runnable task
* there is no guarantee that any tasks will be moved so we'll have
@@ -7428,6 +9068,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
* this level.
*/
update_sd_lb_stats(env, &sds);
+
+ if (energy_aware() && !env->dst_rq->rd->overutilized)
+ goto out_balanced;
+
local = &sds.local_stat;
busiest = &sds.busiest_stat;
@@ -7455,6 +9099,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
busiest->group_no_capacity)
goto force_balance;
+ /* Misfitting tasks should be dealt with regardless of the avg load */
+ if (busiest->group_type == group_misfit_task) {
+ goto force_balance;
+ }
+
/*
* If the local group is busier than the selected busiest group
* don't try and pull any tasks.
@@ -7478,7 +9127,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
* might end up to just move the imbalance on another group
*/
if ((busiest->group_type != group_overloaded) &&
- (local->idle_cpus <= (busiest->idle_cpus + 1)))
+ (local->idle_cpus <= (busiest->idle_cpus + 1)) &&
+ !group_smaller_cpu_capacity(sds.busiest, sds.local))
goto out_balanced;
} else {
/*
@@ -7491,6 +9141,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
}
force_balance:
+ env->busiest_group_type = busiest->group_type;
/* Looks like there is an imbalance. Compute it */
calculate_imbalance(env, &sds);
return sds.busiest;
@@ -7549,7 +9200,8 @@ static struct rq *find_busiest_queue(struct lb_env *env,
*/
if (rq->nr_running == 1 && wl > env->imbalance &&
- !check_cpu_capacity(rq, env->sd))
+ !check_cpu_capacity(rq, env->sd) &&
+ env->busiest_group_type != group_misfit_task)
continue;
/*
@@ -7607,6 +9259,13 @@ static int need_active_balance(struct lb_env *env)
return 1;
}
+ if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
+ env->src_rq->cfs.h_nr_running == 1 &&
+ cpu_overutilized(env->src_cpu) &&
+ !cpu_overutilized(env->dst_cpu)) {
+ return 1;
+ }
+
return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
}
@@ -7655,7 +9314,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
int *continue_balancing)
{
int ld_moved, cur_ld_moved, active_balance = 0;
- struct sched_domain *sd_parent = sd->parent;
+ struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL;
struct sched_group *group;
struct rq *busiest;
unsigned long flags;
@@ -7728,6 +9387,11 @@ more_balance:
* ld_moved - cumulative load moved across iterations
*/
cur_ld_moved = detach_tasks(&env);
+ /*
+ * We want to potentially lower env.src_cpu's OPP.
+ */
+ if (cur_ld_moved)
+ update_capacity_of(env.src_cpu);
/*
* We've detached some tasks from busiest_rq. Every
@@ -7819,7 +9483,8 @@ more_balance:
* excessive cache_hot migrations and active balances.
*/
if (idle != CPU_NEWLY_IDLE)
- sd->nr_balance_failed++;
+ if (env.src_grp_nr_running > 1)
+ sd->nr_balance_failed++;
if (need_active_balance(&env)) {
raw_spin_lock_irqsave(&busiest->lock, flags);
@@ -7949,6 +9614,7 @@ static int idle_balance(struct rq *this_rq)
struct sched_domain *sd;
int pulled_task = 0;
u64 curr_cost = 0;
+ long removed_util=0;
/*
* We must set idle_stamp _before_ calling idle_balance(), such that we
@@ -7956,8 +9622,9 @@ static int idle_balance(struct rq *this_rq)
*/
this_rq->idle_stamp = rq_clock(this_rq);
- if (this_rq->avg_idle < sysctl_sched_migration_cost ||
- !this_rq->rd->overload) {
+ if (!energy_aware() &&
+ (this_rq->avg_idle < sysctl_sched_migration_cost ||
+ !this_rq->rd->overload)) {
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
if (sd)
@@ -7969,6 +9636,17 @@ static int idle_balance(struct rq *this_rq)
raw_spin_unlock(&this_rq->lock);
+ /*
+ * If removed_util_avg is !0 we most probably migrated some task away
+ * from this_cpu. In this case we might be willing to trigger an OPP
+ * update, but we want to do so if we don't find anybody else to pull
+ * here (we will trigger an OPP update with the pulled task's enqueue
+ * anyway).
+ *
+ * Record removed_util before calling update_blocked_averages, and use
+ * it below (before returning) to see if an OPP update is required.
+ */
+ removed_util = atomic_long_read(&(this_rq->cfs).removed_util_avg);
update_blocked_averages(this_cpu);
rcu_read_lock();
for_each_domain(this_cpu, sd) {
@@ -8032,6 +9710,13 @@ out:
if (pulled_task)
this_rq->idle_stamp = 0;
+ else if (removed_util) {
+ /*
+ * No task pulled and someone has been migrated away.
+ * Good case to trigger an OPP update.
+ */
+ update_capacity_of(this_cpu);
+ }
return pulled_task;
}
@@ -8092,6 +9777,10 @@ static int active_load_balance_cpu_stop(void *data)
p = detach_one_task(&env);
if (p) {
schedstat_inc(sd->alb_pushed);
+ /*
+ * We want to potentially lower env.src_cpu's OPP.
+ */
+ update_capacity_of(env.src_cpu);
/* Active balancing done, reset the failure counter. */
sd->nr_balance_failed = 0;
} else {
@@ -8123,12 +9812,6 @@ static inline int on_null_domain(struct rq *rq)
* needed, they will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
*/
-static struct {
- cpumask_var_t idle_cpus_mask;
- atomic_t nr_cpus;
- unsigned long next_balance; /* in jiffy units */
-} nohz ____cacheline_aligned;
-
static inline int find_new_ilb(void)
{
int ilb = cpumask_first(nohz.idle_cpus_mask);
@@ -8462,12 +10145,17 @@ static inline bool nohz_kick_needed(struct rq *rq)
if (time_before(now, nohz.next_balance))
return false;
- if (rq->nr_running >= 2)
+ if (rq->nr_running >= 2 &&
+ (!energy_aware() || cpu_overutilized(cpu)))
+ return true;
+
+ /* Do idle load balance if there is a misfit task */
+ if (energy_aware() && rq->misfit_task)
return true;
rcu_read_lock();
sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
- if (sds) {
+ if (sds && !energy_aware()) {
/*
* XXX: write a coherent comment on why we do this.
* See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
@@ -8575,6 +10263,16 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
if (static_branch_unlikely(&sched_numa_balancing))
task_tick_numa(rq, curr);
+
+#ifdef CONFIG_SMP
+ if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) {
+ rq->rd->overutilized = true;
+ trace_sched_overutilized(true);
+ }
+
+ rq->misfit_task = !task_fits_max(curr, rq->cpu);
+#endif
+
}
/*
@@ -8662,32 +10360,45 @@ static inline bool vruntime_normalized(struct task_struct *p)
return false;
}
-static void detach_task_cfs_rq(struct task_struct *p)
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * Propagate the changes of the sched_entity across the tg tree to make it
+ * visible to the root
+ */
+static void propagate_entity_cfs_rq(struct sched_entity *se)
{
- struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 now = cfs_rq_clock_task(cfs_rq);
+ struct cfs_rq *cfs_rq;
- if (!vruntime_normalized(p)) {
- /*
- * Fix up our vruntime so that the current sleep doesn't
- * cause 'unlimited' sleep bonus.
- */
- place_entity(cfs_rq, se, 0);
- se->vruntime -= cfs_rq->min_vruntime;
+ /* Start to propagate at parent */
+ se = se->parent;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+
+ update_load_avg(se, UPDATE_TG);
}
+}
+#else
+static void propagate_entity_cfs_rq(struct sched_entity *se) { }
+#endif
+
+static void detach_entity_cfs_rq(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
/* Catch up with the cfs_rq and remove our load when we leave */
- update_cfs_rq_load_avg(now, cfs_rq, false);
+ update_load_avg(se, 0);
detach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, false);
+ propagate_entity_cfs_rq(se);
}
-static void attach_task_cfs_rq(struct task_struct *p)
+static void attach_entity_cfs_rq(struct sched_entity *se)
{
- struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 now = cfs_rq_clock_task(cfs_rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
@@ -8697,10 +10408,36 @@ static void attach_task_cfs_rq(struct task_struct *p)
se->depth = se->parent ? se->parent->depth + 1 : 0;
#endif
- /* Synchronize task with its cfs_rq */
- update_cfs_rq_load_avg(now, cfs_rq, false);
+ /* Synchronize entity with its cfs_rq */
+ update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq, false);
+ propagate_entity_cfs_rq(se);
+}
+
+static void detach_task_cfs_rq(struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ if (!vruntime_normalized(p)) {
+ /*
+ * Fix up our vruntime so that the current sleep doesn't
+ * cause 'unlimited' sleep bonus.
+ */
+ place_entity(cfs_rq, se, 0);
+ se->vruntime -= cfs_rq->min_vruntime;
+ }
+
+ detach_entity_cfs_rq(se);
+}
+
+static void attach_task_cfs_rq(struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ attach_entity_cfs_rq(se);
if (!vruntime_normalized(p))
se->vruntime += cfs_rq->min_vruntime;
@@ -8754,6 +10491,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
#ifdef CONFIG_SMP
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ cfs_rq->propagate_avg = 0;
+#endif
atomic_long_set(&cfs_rq->removed_load_avg, 0);
atomic_long_set(&cfs_rq->removed_util_avg, 0);
#endif
@@ -8954,8 +10694,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
/* Possible calls to update_curr() need rq clock */
update_rq_clock(rq);
- for_each_sched_entity(se)
- update_cfs_shares(group_cfs_rq(se));
+ for_each_sched_entity(se) {
+ update_load_avg(se, UPDATE_TG);
+ update_cfs_shares(se);
+ }
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 1b3c8189b286..2a453dffd605 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -74,3 +74,12 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
SCHED_FEAT(LB_MIN, false)
SCHED_FEAT(ATTACH_AGE_LOAD, true)
+/*
+ * Energy aware scheduling. Use platform energy model to guide scheduling
+ * decisions optimizing for energy efficiency.
+ */
+#ifdef CONFIG_DEFAULT_USE_ENERGY_AWARE
+SCHED_FEAT(ENERGY_AWARE, true)
+#else
+SCHED_FEAT(ENERGY_AWARE, false)
+#endif
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 1d8718d5300d..cf75f00f7037 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -23,9 +23,10 @@ extern char __cpuidle_text_start[], __cpuidle_text_end[];
* sched_idle_set_state - Record idle state for the current CPU.
* @idle_state: State to record.
*/
-void sched_idle_set_state(struct cpuidle_state *idle_state)
+void sched_idle_set_state(struct cpuidle_state *idle_state, int index)
{
idle_set_state(this_rq(), idle_state);
+ idle_set_state_idx(this_rq(), index);
}
static int __read_mostly cpu_idle_force_poll;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 9ab4d73e9cc9..d7f46a0d4774 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -8,6 +8,8 @@
#include <linux/slab.h>
#include <linux/irq_work.h>
+#include "walt.h"
+
int sched_rr_timeslice = RR_TIMESLICE;
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
@@ -888,6 +890,51 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)
return rt_task_of(rt_se)->prio;
}
+/*
+ * dump_throttled_rt_tasks - report that RT throttling kicked in for @rt_rq
+ * and list the tasks currently queued on its active priority array.
+ *
+ * The message is assembled in a fixed on-stack buffer and silently
+ * truncated if it does not fit. With CONFIG_PANIC_ON_RT_THROTTLING the
+ * message is printed with pr_err() followed by BUG(); otherwise it is
+ * emitted through printk_deferred().
+ */
+static void dump_throttled_rt_tasks(struct rt_rq *rt_rq)
+{
+	struct rt_prio_array *array = &rt_rq->active;
+	struct sched_rt_entity *rt_se;
+	char buf[500];
+	char *pos = buf;
+	char *end = buf + sizeof(buf);
+	int idx;
+
+	/*
+	 * scnprintf() returns the number of characters actually stored
+	 * (unlike snprintf(), which returns the would-be length), so 'pos'
+	 * never moves past 'end' and 'end - pos' never goes negative.
+	 */
+	pos += scnprintf(pos, end - pos,
+		"sched: RT throttling activated for rt_rq %p (cpu %d)\n",
+		rt_rq, cpu_of(rq_of_rt_rq(rt_rq)));
+
+	if (bitmap_empty(array->bitmap, MAX_RT_PRIO))
+		goto out;
+
+	pos += scnprintf(pos, end - pos, "potential CPU hogs:\n");
+	idx = sched_find_first_bit(array->bitmap);
+	while (idx < MAX_RT_PRIO) {
+		list_for_each_entry(rt_se, array->queue + idx, run_list) {
+			struct task_struct *p;
+
+			/* only task entities have a comm/pid to report */
+			if (!rt_entity_is_task(rt_se))
+				continue;
+
+			p = rt_task_of(rt_se);
+			pos += scnprintf(pos, end - pos, "\t%s (%d)\n",
+					 p->comm, p->pid);
+		}
+		idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx + 1);
+	}
+out:
+#ifdef CONFIG_PANIC_ON_RT_THROTTLING
+	/*
+	 * Use pr_err() in the BUG() case since printk_deferred() will
+	 * not get flushed and deadlock is not a concern.
+	 */
+	pr_err("%s", buf);
+	BUG();
+#else
+	printk_deferred("%s", buf);
+#endif
+}
+
static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
{
u64 runtime = sched_rt_runtime(rt_rq);
@@ -911,8 +958,14 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
* but accrue some time due to boosting.
*/
if (likely(rt_b->rt_runtime)) {
+ static bool once = false;
+
rt_rq->rt_throttled = 1;
- printk_deferred_once("sched: RT throttling activated\n");
+
+ if (!once) {
+ once = true;
+ dump_throttled_rt_tasks(rt_rq);
+ }
} else {
/*
* In case we did anyway, make it go away,
@@ -1313,6 +1366,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
rt_se->timeout = 0;
enqueue_rt_entity(rt_se, flags);
+ walt_inc_cumulative_runnable_avg(rq, p);
if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1)
enqueue_pushable_task(rq, p);
@@ -1324,6 +1378,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
update_curr_rt(rq);
dequeue_rt_entity(rt_se, flags);
+ walt_dec_cumulative_runnable_avg(rq, p);
dequeue_pushable_task(rq, p);
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ec6e838e991a..99293bb49f15 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -413,6 +413,7 @@ struct cfs_rq {
unsigned long runnable_load_avg;
#ifdef CONFIG_FAIR_GROUP_SCHED
unsigned long tg_load_avg_contrib;
+ unsigned long propagate_avg;
#endif
atomic_long_t removed_load_avg, removed_util_avg;
#ifndef CONFIG_64BIT
@@ -447,6 +448,10 @@ struct cfs_rq {
struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */
+#ifdef CONFIG_SCHED_WALT
+ u64 cumulative_runnable_avg;
+#endif
+
#ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
u64 runtime_expires;
@@ -542,6 +547,12 @@ struct dl_rq {
#ifdef CONFIG_SMP
+struct max_cpu_capacity {
+ raw_spinlock_t lock; /* NOTE(review): presumably serializes updates of val/cpu - confirm */
+ unsigned long val; /* capacity value; root_domain embeds this struct as its "maximum cpu capacity" */
+ int cpu; /* NOTE(review): presumably the CPU that provides val - confirm */
+};
+
/*
* We add the notion of a root-domain which will be used to define per-domain
* variables. Each exclusive cpuset essentially defines an island domain by
@@ -560,6 +571,9 @@ struct root_domain {
/* Indicate more than one runnable task for any CPU */
bool overload;
+ /* Indicate one or more cpus over-utilized (tipping point) */
+ bool overutilized;
+
/*
* The bit corresponding to a CPU gets set here if such CPU has more
* than one runnable -deadline task (as it is below for RT tasks).
@@ -589,7 +603,11 @@ struct root_domain {
cpumask_var_t rto_mask;
struct cpupri cpupri;
- unsigned long max_cpu_capacity;
+ /* Maximum cpu capacity in the system. */
+ struct max_cpu_capacity max_cpu_capacity;
+
+ /* First cpu with maximum and minimum original capacity */
+ int max_cap_orig_cpu, min_cap_orig_cpu;
};
extern struct root_domain def_root_domain;
@@ -623,6 +641,7 @@ struct rq {
#endif
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
+ unsigned int misfit_task;
#ifdef CONFIG_NO_HZ_COMMON
#ifdef CONFIG_SMP
unsigned long last_load_update_tick;
@@ -632,6 +651,14 @@ struct rq {
#ifdef CONFIG_NO_HZ_FULL
unsigned long last_sched_tick;
#endif
+
+#ifdef CONFIG_CPU_QUIET
+ /* time-based average load */
+ u64 nr_last_stamp;
+ u64 nr_running_integral;
+ seqcount_t ave_seqcnt;
+#endif
+
/* capture load from *all* tasks on this cpu: */
struct load_weight load;
unsigned long nr_load_updates;
@@ -644,6 +671,7 @@ struct rq {
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
struct list_head leaf_cfs_rq_list;
+ struct list_head *tmp_alone_branch;
#endif /* CONFIG_FAIR_GROUP_SCHED */
/*
@@ -693,6 +721,30 @@ struct rq {
u64 max_idle_balance_cost;
#endif
+#ifdef CONFIG_SCHED_WALT
+ /*
+ * max_freq = user or thermal defined maximum
+ * max_possible_freq = maximum supported by hardware
+ */
+ unsigned int cur_freq, max_freq, min_freq, max_possible_freq;
+ struct cpumask freq_domain_cpumask;
+
+ u64 cumulative_runnable_avg;
+ int efficiency; /* Differentiate cpus with different IPC capability */
+ int load_scale_factor;
+ int capacity;
+ int max_possible_capacity;
+ u64 window_start;
+ u64 curr_runnable_sum;
+ u64 prev_runnable_sum;
+ u64 nt_curr_runnable_sum;
+ u64 nt_prev_runnable_sum;
+ u64 cur_irqload;
+ u64 avg_irqload;
+ u64 irqload_ts;
+#endif /* CONFIG_SCHED_WALT */
+
+
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
#endif
@@ -731,6 +783,9 @@ struct rq {
/* try_to_wake_up() stats */
unsigned int ttwu_count;
unsigned int ttwu_local;
+#ifdef CONFIG_SMP
+ struct eas_stats eas_stats;
+#endif
#endif
#ifdef CONFIG_SMP
@@ -740,6 +795,7 @@ struct rq {
#ifdef CONFIG_CPU_IDLE
/* Must be inspected within a rcu lock section */
struct cpuidle_state *idle_state;
+ int idle_state_idx;
#endif
};
@@ -889,6 +945,8 @@ DECLARE_PER_CPU(int, sd_llc_id);
DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
DECLARE_PER_CPU(struct sched_domain *, sd_numa);
DECLARE_PER_CPU(struct sched_domain *, sd_asym);
+DECLARE_PER_CPU(struct sched_domain *, sd_ea);
+DECLARE_PER_CPU(struct sched_domain *, sd_scs);
struct sched_group_capacity {
atomic_t ref;
@@ -896,7 +954,9 @@ struct sched_group_capacity {
* CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity
* for a single CPU.
*/
- unsigned int capacity;
+ unsigned long capacity;
+ unsigned long max_capacity; /* Max per-cpu capacity in group */
+ unsigned long min_capacity; /* Min per-CPU capacity in group */
unsigned long next_update;
int imbalance; /* XXX unrelated to capacity but shared group state */
@@ -909,6 +969,7 @@ struct sched_group {
unsigned int group_weight;
struct sched_group_capacity *sgc;
+ const struct sched_group_energy *sge;
/*
* The CPUs this group covers.
@@ -1217,6 +1278,7 @@ extern const u32 sched_prio_to_wmult[40];
#else
#define ENQUEUE_MIGRATED 0x00
#endif
+#define ENQUEUE_WAKEUP_NEW 0x40
#define RETRY_TASK ((void *)-1UL)
@@ -1307,6 +1369,7 @@ extern const struct sched_class idle_sched_class;
#ifdef CONFIG_SMP
+extern void init_max_cpu_capacity(struct max_cpu_capacity *mcc);
extern void update_group_capacity(struct sched_domain *sd, int cpu);
extern void trigger_load_balance(struct rq *rq);
@@ -1327,6 +1390,17 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq)
SCHED_WARN_ON(!rcu_read_lock_held());
return rq->idle_state;
}
+
+/* Store @idle_state_idx as the idle-state index of @rq. */
+static inline void idle_set_state_idx(struct rq *rq, int idle_state_idx)
+{
+	rq->idle_state_idx = idle_state_idx;
+}
+
+/*
+ * Return the idle-state index stored in @rq.
+ * Warns when the caller is not inside an RCU read-side critical section.
+ */
+static inline int idle_get_state_idx(struct rq *rq)
+{
+	const int in_rcu_section = rcu_read_lock_held();
+
+	WARN_ON(!in_rcu_section);
+	return rq->idle_state_idx;
+}
#else
static inline void idle_set_state(struct rq *rq,
struct cpuidle_state *idle_state)
@@ -1337,6 +1411,15 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq)
{
return NULL;
}
+
+/* Stub for the #else configuration: the index is not recorded anywhere. */
+static inline void idle_set_state_idx(struct rq *rq, int idle_state_idx) { }
+
+/* Stub for the #else configuration: always reports -1 as the index. */
+static inline int idle_get_state_idx(struct rq *rq)
+{
+	const int no_idle_state_idx = -1;
+
+	return no_idle_state_idx;
+}
#endif
extern void sysrq_sched_debug_show(void);
@@ -1391,7 +1474,7 @@ static inline void sched_update_tick_dependency(struct rq *rq)
static inline void sched_update_tick_dependency(struct rq *rq) { }
#endif
-static inline void add_nr_running(struct rq *rq, unsigned count)
+static inline void __add_nr_running(struct rq *rq, unsigned count)
{
unsigned prev_nr = rq->nr_running;
@@ -1407,13 +1490,50 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
sched_update_tick_dependency(rq);
}
-static inline void sub_nr_running(struct rq *rq, unsigned count)
+static inline void __sub_nr_running(struct rq *rq, unsigned count)
{
rq->nr_running -= count;
/* Check if we still need preemption */
sched_update_tick_dependency(rq);
}
+#ifdef CONFIG_CPU_QUIET
+#define NR_AVE_SCALE(x) ((x) << FSHIFT)
+/*
+ * Return rq->nr_running_integral advanced to rq->clock_task: the time
+ * elapsed since rq->nr_last_stamp multiplied by the fixed-point scaled
+ * (NR_AVE_SCALE) number of running tasks is added to the stored integral.
+ * The runqueue itself is not modified.
+ */
+static inline u64 do_nr_running_integral(struct rq *rq)
+{
+	s64 elapsed = rq->clock_task - rq->nr_last_stamp;
+	s64 scaled_nr = NR_AVE_SCALE(rq->nr_running);
+
+	return rq->nr_running_integral + scaled_nr * elapsed;
+}
+
+/*
+ * Add @count to the number of running tasks of @rq. The nr_running
+ * integral and its timestamp are brought up to date first, and the whole
+ * update is wrapped in the rq->ave_seqcnt write section.
+ */
+static inline void add_nr_running(struct rq *rq, unsigned count)
+{
+	u64 integral;
+
+	write_seqcount_begin(&rq->ave_seqcnt);
+
+	/* Must be computed before nr_last_stamp is refreshed below */
+	integral = do_nr_running_integral(rq);
+	rq->nr_running_integral = integral;
+	rq->nr_last_stamp = rq->clock_task;
+
+	__add_nr_running(rq, count);
+
+	write_seqcount_end(&rq->ave_seqcnt);
+}
+
+/*
+ * Subtract @count from the number of running tasks of @rq. The nr_running
+ * integral and its timestamp are brought up to date first, and the whole
+ * update is wrapped in the rq->ave_seqcnt write section.
+ */
+static inline void sub_nr_running(struct rq *rq, unsigned count)
+{
+	u64 integral;
+
+	write_seqcount_begin(&rq->ave_seqcnt);
+
+	/* Must be computed before nr_last_stamp is refreshed below */
+	integral = do_nr_running_integral(rq);
+	rq->nr_running_integral = integral;
+	rq->nr_last_stamp = rq->clock_task;
+
+	__sub_nr_running(rq, count);
+
+	write_seqcount_end(&rq->ave_seqcnt);
+}
+#else
+#define add_nr_running __add_nr_running
+#define sub_nr_running __sub_nr_running
+#endif
+
static inline void rq_last_tick_reset(struct rq *rq)
{
#ifdef CONFIG_NO_HZ_FULL
@@ -1486,10 +1606,146 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
}
#endif
+#ifdef CONFIG_SMP
+/* Return the cpu_capacity value stored in the runqueue of @cpu. */
+static inline unsigned long capacity_of(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+
+	return rq->cpu_capacity;
+}
+
+/* Return the cpu_capacity_orig value stored in the runqueue of @cpu. */
+static inline unsigned long capacity_orig_of(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+
+	return rq->cpu_capacity_orig;
+}
+
+extern unsigned int sysctl_sched_use_walt_cpu_util;
+extern unsigned int walt_ravg_window;
+extern unsigned int walt_disabled;
+
+/*
+ * cpu_util returns the amount of capacity of a CPU that is used by CFS
+ * tasks. The unit of the return value must be the one of capacity so we can
+ * compare the utilization with the capacity of the CPU that is available for
+ * CFS task (ie cpu_capacity).
+ *
+ * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
+ * recent utilization of currently non-runnable tasks on a CPU. It represents
+ * the amount of utilization of a CPU in the range [0..capacity_orig] where
+ * capacity_orig is the cpu_capacity available at the highest frequency
+ * (arch_scale_freq_capacity()).
+ * The utilization of a CPU converges towards a sum equal to or less than the
+ * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
+ * the running time on this CPU scaled by capacity_curr.
+ *
+ * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
+ * higher than capacity_orig because of unfortunate rounding in
+ * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
+ * the average stabilizes with the new running time. We need to check that the
+ * utilization stays within the range of [0..capacity_orig] and cap it if
+ * necessary. Without utilization capping, a group could be seen as overloaded
+ * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
+ * available capacity. We allow utilization to overshoot capacity_curr (but not
+ * capacity_orig) as it is useful for predicting the capacity required after task
+ * migrations (scheduler-driven DVFS).
+ */
+/*
+ * __cpu_util() - utilization of @cpu adjusted by @delta.
+ * @cpu: CPU whose utilization is read.
+ * @delta: signed amount of utilization to add (may be negative).
+ *
+ * The utilization is cfs.avg.util_avg or, when WALT is built in, enabled
+ * and selected via sysctl_sched_use_walt_cpu_util, prev_runnable_sum
+ * scaled to capacity units over walt_ravg_window.
+ *
+ * Returns the adjusted value clamped to [0..capacity_orig_of(@cpu)].
+ */
+static inline unsigned long __cpu_util(int cpu, int delta)
+{
+	u64 util = cpu_rq(cpu)->cfs.avg.util_avg;
+	unsigned long capacity = capacity_orig_of(cpu);
+	s64 total;
+
+#ifdef CONFIG_SCHED_WALT
+	if (!walt_disabled && sysctl_sched_use_walt_cpu_util) {
+		/*
+		 * Keep the scaled value in 64 bits: prev_runnable_sum is
+		 * a u64 and the shifted value may not fit in an unsigned
+		 * long on 32-bit builds.
+		 */
+		util = cpu_rq(cpu)->prev_runnable_sum << SCHED_CAPACITY_SHIFT;
+		util = div_u64(util, walt_ravg_window);
+	}
+#endif
+	/* Sum in a signed 64-bit type so that neither operand is truncated */
+	total = (s64)util + delta;
+	if (total < 0)
+		return 0;
+
+	return ((u64)total >= capacity) ? capacity : (unsigned long)total;
+}
+
+/* Utilization of @cpu with no delta applied, see __cpu_util(). */
+static inline unsigned long cpu_util(int cpu)
+{
+	const int no_delta = 0;
+
+	return __cpu_util(cpu, no_delta);
+}
+
+#endif
+
+#ifdef CONFIG_CPU_FREQ_GOV_SCHED
+#define capacity_max SCHED_CAPACITY_SCALE
+extern unsigned int capacity_margin;
+extern struct static_key __sched_freq;
+
+/* Report the state of the __sched_freq static key (default: false). */
+static inline bool sched_freq(void)
+{
+	struct static_key *key = &__sched_freq;
+
+	return static_key_false(key);
+}
+
+DECLARE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs);
+void update_cpu_capacity_request(int cpu, bool request);
+
+/*
+ * Record @capacity as the CFS capacity request of @cpu and, only when the
+ * stored value changes, forward @request to update_cpu_capacity_request().
+ */
+static inline void set_cfs_cpu_capacity(int cpu, bool request,
+					unsigned long capacity)
+{
+	struct sched_capacity_reqs *reqs = &per_cpu(cpu_sched_capacity_reqs, cpu);
+
+#ifdef CONFIG_SCHED_WALT
+	if (!walt_disabled && sysctl_sched_use_walt_cpu_util) {
+		int rtdl = reqs->rt + reqs->dl;
+
+		/*
+		 * The WALT signal covers the load of every scheduling
+		 * class, while update_cpu_capacity already adds the RT and
+		 * DL requests on its own. Strip those two contributions
+		 * from the WALT value (never going below zero) so that
+		 * they are not counted twice.
+		 */
+		capacity = (capacity > rtdl) ? capacity - rtdl : 0;
+	}
+#endif
+	/* Nothing to propagate when the request is unchanged */
+	if (reqs->cfs == capacity)
+		return;
+
+	reqs->cfs = capacity;
+	update_cpu_capacity_request(cpu, request);
+}
+
+/*
+ * Record @capacity as the RT capacity request of @cpu and, only when the
+ * stored value changes, forward @request to update_cpu_capacity_request().
+ */
+static inline void set_rt_cpu_capacity(int cpu, bool request,
+				       unsigned long capacity)
+{
+	struct sched_capacity_reqs *reqs = &per_cpu(cpu_sched_capacity_reqs, cpu);
+
+	if (reqs->rt == capacity)
+		return;
+
+	reqs->rt = capacity;
+	update_cpu_capacity_request(cpu, request);
+}
+
+/*
+ * Record @capacity as the DL capacity request of @cpu and, only when the
+ * stored value changes, forward @request to update_cpu_capacity_request().
+ */
+static inline void set_dl_cpu_capacity(int cpu, bool request,
+				       unsigned long capacity)
+{
+	struct sched_capacity_reqs *reqs = &per_cpu(cpu_sched_capacity_reqs, cpu);
+
+	if (reqs->dl == capacity)
+		return;
+
+	reqs->dl = capacity;
+	update_cpu_capacity_request(cpu, request);
+}
+#else
+/* !CONFIG_CPU_FREQ_GOV_SCHED stub: always false. */
+static inline bool sched_freq(void)
+{
+	return false;
+}
+/* !CONFIG_CPU_FREQ_GOV_SCHED stub: the CFS capacity request is ignored. */
+static inline void set_cfs_cpu_capacity(int cpu, bool request,
+					unsigned long capacity)
+{
+}
+/* !CONFIG_CPU_FREQ_GOV_SCHED stub: the RT capacity request is ignored. */
+static inline void set_rt_cpu_capacity(int cpu, bool request,
+				       unsigned long capacity)
+{
+}
+/* !CONFIG_CPU_FREQ_GOV_SCHED stub: the DL capacity request is ignored. */
+static inline void set_dl_cpu_capacity(int cpu, bool request,
+				       unsigned long capacity)
+{
+}
+#endif
+
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
- sched_avg_update(rq);
}
#else
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
@@ -1524,6 +1780,9 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
}
+extern struct rq *lock_rq_of(struct task_struct *p, struct rq_flags *flags);
+extern void unlock_rq_of(struct rq *rq, struct task_struct *p, struct rq_flags *flags);
+
#ifdef CONFIG_SMP
#ifdef CONFIG_PREEMPT
@@ -1596,7 +1855,8 @@ static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
__releases(busiest->lock)
{
- raw_spin_unlock(&busiest->lock);
+ if (this_rq != busiest)
+ raw_spin_unlock(&busiest->lock);
lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
}
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 87e2c9f0c33e..6d74a7c77c8c 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -12,6 +12,28 @@
*/
#define SCHEDSTAT_VERSION 15
+#ifdef CONFIG_SMP
+/*
+ * Emit one "eas ..." line to @seq with the counters held in @stats, in
+ * this order: the six sis_* counters, the seven secb_* counters, the five
+ * fbt_* counters and the two cas_* counters.
+ */
+static inline void show_easstat(struct seq_file *seq, struct eas_stats *stats)
+{
+	/* sis_* counters, preceded by the line tag */
+	seq_printf(seq, "eas %llu %llu %llu %llu %llu %llu ",
+		   stats->sis_attempts,
+		   stats->sis_idle,
+		   stats->sis_cache_affine,
+		   stats->sis_suff_cap,
+		   stats->sis_idle_cpu,
+		   stats->sis_count);
+
+	/* secb_* counters */
+	seq_printf(seq, "%llu %llu %llu %llu %llu %llu %llu ",
+		   stats->secb_attempts,
+		   stats->secb_sync,
+		   stats->secb_idle_bt,
+		   stats->secb_insuff_cap,
+		   stats->secb_no_nrg_sav,
+		   stats->secb_nrg_sav,
+		   stats->secb_count);
+
+	/* fbt_* counters */
+	seq_printf(seq, "%llu %llu %llu %llu %llu ",
+		   stats->fbt_attempts,
+		   stats->fbt_no_cpu,
+		   stats->fbt_no_sd,
+		   stats->fbt_pref_idle,
+		   stats->fbt_count);
+
+	/* cas_* counters close the line */
+	seq_printf(seq, "%llu %llu\n",
+		   stats->cas_attempts,
+		   stats->cas_count);
+}
+#endif
+
static int show_schedstat(struct seq_file *seq, void *v)
{
int cpu;
@@ -40,6 +62,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
seq_printf(seq, "\n");
#ifdef CONFIG_SMP
+ show_easstat(seq, &rq->eas_stats);
+
/* domain-specific stats */
rcu_read_lock();
for_each_domain(cpu, sd) {
@@ -66,6 +90,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
sd->ttwu_wake_remote, sd->ttwu_move_affine,
sd->ttwu_move_balance);
+
+ show_easstat(seq, &sd->eas_stats);
}
rcu_read_unlock();
#endif
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 604297a08b3a..836a3894cf57 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -1,4 +1,5 @@
#include "sched.h"
+#include "walt.h"
/*
* stop-task scheduling class.
@@ -42,12 +43,14 @@ static void
enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
add_nr_running(rq, 1);
+ walt_inc_cumulative_runnable_avg(rq, p);
}
static void
dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
sub_nr_running(rq, 1);
+ walt_dec_cumulative_runnable_avg(rq, p);
}
static void yield_task_stop(struct rq *rq)
diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
new file mode 100644
index 000000000000..654934fc5e47
--- /dev/null
+++ b/kernel/sched/tune.c
@@ -0,0 +1,956 @@
+#include <linux/cgroup.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/printk.h>
+#include <linux/rcupdate.h>
+#include <linux/slab.h>
+
+#include <trace/events/sched.h>
+
+#include "sched.h"
+#include "tune.h"
+
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+bool schedtune_initialized = false;
+#endif
+
+unsigned int sysctl_sched_cfs_boost __read_mostly;
+
+extern struct reciprocal_value schedtune_spc_rdiv;
+extern struct target_nrg schedtune_target_nrg;
+
+/* Performance Boost region (B) threshold params */
+static int perf_boost_idx;
+
+/* Performance Constraint region (C) threshold params */
+static int perf_constrain_idx;
+
+/**
+ * Performance-Energy (P-E) Space thresholds constants
+ */
+struct threshold_params {
+ int nrg_gain; /* multiplies cap_delta in the payoff of __schedtune_accept_deltas() */
+ int cap_gain; /* multiplies nrg_delta in the payoff of __schedtune_accept_deltas() */
+};
+
+/*
+ * System specific P-E space thresholds constants
+ */
+static struct threshold_params
+threshold_gains[] = { /* rows are { nrg_gain, cap_gain }; boost_write() selects row clamp(boost, 0, 99) / 10 */
+ { 0, 5 }, /* boost < 10% */
+ { 1, 5 }, /* boost < 20% */
+ { 2, 5 }, /* boost < 30% */
+ { 3, 5 }, /* boost < 40% */
+ { 4, 5 }, /* boost < 50% */
+ { 5, 4 }, /* boost < 60% */
+ { 5, 3 }, /* boost < 70% */
+ { 5, 2 }, /* boost < 80% */
+ { 5, 1 }, /* boost < 90% */
+ { 5, 0 } /* boost <= 100% */
+};
+
+/*
+ * Compute the payoff of a scheduling candidate in the P-E space.
+ *
+ * The candidate is classified by the sign of its deltas:
+ *
+ *  - Performance Boost (B) region: nrg_delta >= 0 && cap_delta > 0,
+ *    evaluated with the gains at index @perf_boost_idx;
+ *  - Performance Constraint (C) region: nrg_delta < 0 && cap_delta <= 0,
+ *    evaluated with the gains at index @perf_constrain_idx.
+ *
+ * Any other combination, or a selected index equal to -1, rejects the
+ * candidate by returning -INT_MAX.
+ *
+ * In the B region the candidate pays off when
+ *    cap_gain / nrg_gain < cap_delta / nrg_delta
+ * which, both energy terms being positive, is the same as
+ *    cap_gain * nrg_delta < cap_delta * nrg_gain
+ *
+ * In the C region the candidate pays off when
+ *    cap_gain / nrg_gain > cap_delta / nrg_delta
+ * and, nrg_delta being negative there, multiplying through flips the
+ * inequality into the very same form.
+ *
+ * Hence, with positive {cap,nrg}_gain in both regions, a single formula
+ * is used and a positive result means "accept":
+ *
+ *    payoff = (cap_delta * nrg_gain) - (cap_gain * nrg_delta)
+ */
+static int
+__schedtune_accept_deltas(int nrg_delta, int cap_delta,
+			  int perf_boost_idx, int perf_constrain_idx)
+{
+	const struct threshold_params *gains;
+	int gain_idx;
+
+	if (nrg_delta >= 0 && cap_delta > 0)
+		gain_idx = perf_boost_idx;	/* B region */
+	else if (nrg_delta < 0 && cap_delta <= 0)
+		gain_idx = perf_constrain_idx;	/* C region */
+	else
+		gain_idx = -1;
+
+	/* Default: reject schedule candidate */
+	if (gain_idx == -1)
+		return -INT_MAX;
+
+	gains = &threshold_gains[gain_idx];
+
+	return cap_delta * gains->nrg_gain - nrg_delta * gains->cap_gain;
+}
+
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+
+/*
+ * EAS scheduler tunables for task groups.
+ */
+
+/* SchedTune tunables for a group of tasks */
+struct schedtune {
+ /* SchedTune CGroup subsystem state; css_st() maps it back to this struct */
+ struct cgroup_subsys_state css;
+
+ /* Boost group allocated ID: slot in allocated_group[] and in each CPU's boost_groups.group[] */
+ int idx;
+
+ /* Boost value for tasks on that SchedTune CGroup; boost_write() accepts [-100..100] */
+ int boost;
+
+ /* Performance Boost (B) region threshold params: index into threshold_gains[] */
+ int perf_boost_idx;
+
+ /* Performance Constraint (C) region threshold params: index into threshold_gains[] */
+ int perf_constrain_idx;
+
+ /* Hint to bias scheduling of tasks on that SchedTune CGroup
+ * towards idle CPUs */
+ int prefer_idle;
+};
+
+/* Map @css to the schedtune embedding it; a NULL @css yields NULL. */
+static inline struct schedtune *css_st(struct cgroup_subsys_state *css)
+{
+	if (!css)
+		return NULL;
+
+	return container_of(css, struct schedtune, css);
+}
+
+/* Return the schedtune group @tsk is attached to for the schedtune subsystem. */
+static inline struct schedtune *task_schedtune(struct task_struct *tsk)
+{
+	struct cgroup_subsys_state *css = task_css(tsk, schedtune_cgrp_id);
+
+	return css_st(css);
+}
+
+/* Return the schedtune group owning the parent css of @st (NULL if none). */
+static inline struct schedtune *parent_st(struct schedtune *st)
+{
+	struct cgroup_subsys_state *parent_css = st->css.parent;
+
+	return css_st(parent_css);
+}
+
+/*
+ * SchedTune root control group
+ * The root control group is used to define a system-wide boosting tuning,
+ * which is applied to all tasks in the system.
+ * Task specific boost tuning can be specified by creating and
+ * configuring a child control group under the root one.
+ * By default, system-wide boosting is disabled, i.e. no boosting is applied
+ * to tasks which are not in a child control group.
+ */
+static struct schedtune
+root_schedtune = {
+ .boost = 0,
+ .perf_boost_idx = 0,
+ .perf_constrain_idx = 0,
+ .prefer_idle = 0,
+};
+
+/*
+ * Evaluate a scheduling candidate for @task given its energy and capacity
+ * deltas.
+ *
+ * Returns INT_MAX when the candidate both saves energy and gains capacity,
+ * -INT_MAX when it both costs energy and loses capacity, and otherwise the
+ * payoff computed by __schedtune_accept_deltas() with the threshold
+ * indexes of the task's SchedTune group.
+ */
+int
+schedtune_accept_deltas(int nrg_delta, int cap_delta,
+			struct task_struct *task)
+{
+	struct schedtune *st;
+	int boost_idx;
+	int constrain_idx;
+
+	/* Optimal (O) region: less energy, more capacity */
+	if (cap_delta > 0 && nrg_delta < 0) {
+		trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0);
+		return INT_MAX;
+	}
+
+	/* Suboptimal (S) region: more energy, less capacity */
+	if (cap_delta < 0 && nrg_delta > 0) {
+		trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5);
+		return -INT_MAX;
+	}
+
+	/* Snapshot the task's group indexes inside the RCU section */
+	rcu_read_lock();
+	st = task_schedtune(task);
+	boost_idx = st->perf_boost_idx;
+	constrain_idx = st->perf_constrain_idx;
+	rcu_read_unlock();
+
+	return __schedtune_accept_deltas(nrg_delta, cap_delta,
+					 boost_idx, constrain_idx);
+}
+
+/*
+ * Maximum number of boost groups to support
+ * When per-task boosting is used we still allow only limited number of
+ * boost groups for two main reasons:
+ * 1. on a real system we usually have only few classes of workloads which
+ * make sense to boost with different values (e.g. background vs foreground
+ * tasks, interactive vs low-priority tasks)
+ * 2. a limited number allows for a simpler and more memory/time efficient
+ * implementation especially for the computation of the per-CPU boost
+ * value
+ */
+#define BOOSTGROUPS_COUNT 5
+
+/* Array of configured boostgroups, indexed by schedtune.idx; a NULL slot is free */
+static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = {
+ &root_schedtune, /* slot 0: schedtune_css_alloc() only hands out slots >= 1 */
+ NULL,
+};
+
+/* SchedTune boost groups
+ * Keep track of all the boost groups which impact on CPU, for example when a
+ * CPU has two RUNNABLE tasks belonging to two different boost groups and thus
+ * likely with different boost values.
+ * Since on each system we expect only a limited number of boost groups, here
+ * we use a simple array to keep track of the metrics required to compute the
+ * maximum per-CPU boosting value.
+ */
+struct boost_groups {
+ /* NOTE(review): 'idle' is not referenced in the visible code - confirm its purpose */
+ bool idle;
+ int boost_max; /* Maximum boost value for all RUNNABLE tasks on a CPU; clamped to >= 0 by schedtune_cpu_update() */
+ struct {
+ /* The boost for tasks on that boost group */
+ int boost;
+ /* Count of RUNNABLE tasks on that boost group */
+ unsigned tasks;
+ } group[BOOSTGROUPS_COUNT];
+ /* CPU's boost group locking */
+ raw_spinlock_t lock;
+};
+
+/* Boost groups affecting each CPU in the system */
+DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups);
+
+/*
+ * Recompute the maximum boost of @cpu: the largest boost among the root
+ * group (always considered) and every other group which currently has
+ * RUNNABLE tasks on that CPU, never going below zero.
+ */
+static void
+schedtune_cpu_update(int cpu)
+{
+	struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+	int highest = bg->group[0].boost;	/* root group is always active */
+	int idx;
+
+	for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) {
+		/* Only groups with RUNNABLE tasks affect this CPU */
+		if (bg->group[idx].tasks != 0)
+			highest = max(highest, bg->group[idx].boost);
+	}
+
+	/*
+	 * Keep the result non-negative even when every contributing boost
+	 * is negative: this avoids under-accounting of cpu capacity, which
+	 * may cause task stacking and frequency spikes.
+	 */
+	bg->boost_max = max(highest, 0);
+}
+
+/*
+ * Set the boost of boost group @idx to @boost on every possible CPU and
+ * refresh the per-CPU maximum boost where the change affects it.
+ *
+ * For each CPU a trace event reports the outcome: 1 when the maximum was
+ * raised, -1 when it was recomputed after a decrease, 0 otherwise.
+ *
+ * Always returns 0.
+ */
+static int
+schedtune_boostgroup_update(int idx, int boost)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+		/* Values before the update, needed to detect what changed */
+		int prev_max = bg->boost_max;
+		int prev_boost = bg->group[idx].boost;
+		int variation = 0;
+
+		bg->group[idx].boost = boost;
+
+		if (boost > prev_max && bg->group[idx].tasks) {
+			/* An active group now exceeds the current maximum */
+			bg->boost_max = boost;
+			variation = 1;
+		} else if (prev_max == prev_boost && prev_boost > boost) {
+			/* The group defining the maximum has been lowered */
+			schedtune_cpu_update(cpu);
+			variation = -1;
+		}
+
+		trace_sched_tune_boostgroup_update(cpu, variation,
+						   bg->boost_max);
+	}
+
+	return 0;
+}
+
+/* Task count variations passed to schedtune_tasks_update() */
+#define ENQUEUE_TASK	1
+#define DEQUEUE_TASK	(-1)	/* parenthesized: safe inside any expression */
+
+/*
+ * Apply @task_count (ENQUEUE_TASK or DEQUEUE_TASK) to the RUNNABLE tasks
+ * counter of boost group @idx on @cpu, on behalf of task @p.
+ *
+ * The stored counter never goes below zero. The per-CPU maximum boost is
+ * recomputed when the new count is 1 or 0, i.e. when the group may have
+ * just become active or inactive on that CPU.
+ */
+static inline void
+schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count)
+{
+	struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+	int new_count = bg->group[idx].tasks + task_count;
+
+	/* Clamp the stored value; the raw value is still traced below */
+	bg->group[idx].tasks = max(0, new_count);
+
+	trace_sched_tune_tasks_update(p, cpu, new_count, idx,
+			bg->group[idx].boost, bg->boost_max);
+
+	/* Boost group activation or deactivation on that RQ */
+	if (new_count == 0 || new_count == 1)
+		schedtune_cpu_update(cpu);
+}
+
+/*
+ * schedtune_enqueue_task() - account @p as RUNNABLE in its boost group.
+ * @p: task being enqueued.
+ * @cpu: CPU whose boost groups are updated.
+ *
+ * Does nothing until SchedTune is initialized and for PF_EXITING tasks.
+ *
+ * NOTE: This function must be called while holding the lock on the CPU RQ
+ */
+void schedtune_enqueue_task(struct task_struct *p, int cpu)
+{
+	struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+	unsigned long irq_flags;
+	struct schedtune *st;
+	int idx;
+
+	/* The hint covers the whole condition: bailing out is the rare case */
+	if (unlikely(!schedtune_initialized))
+		return;
+
+	/*
+	 * When a task is marked PF_EXITING by do_exit() it's going to be
+	 * dequeued and enqueued multiple times in the exit path.
+	 * Thus we avoid any further update, since we do not want to change
+	 * CPU boosting while the task is exiting.
+	 */
+	if (p->flags & PF_EXITING)
+		return;
+
+	/*
+	 * Boost group accounting is protected by a per-cpu lock and requires
+	 * interrupts to be disabled to avoid race conditions, for example on
+	 * do_exit()::cgroup_exit() and task migration.
+	 */
+	raw_spin_lock_irqsave(&bg->lock, irq_flags);
+	rcu_read_lock();
+
+	st = task_schedtune(p);
+	idx = st->idx;
+
+	schedtune_tasks_update(p, cpu, idx, ENQUEUE_TASK);
+
+	rcu_read_unlock();
+	raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
+}
+
+/*
+ * schedtune_can_attach() - move the RUNNABLE accounting of migrating tasks.
+ * @tset: set of tasks being attached to a new SchedTune group.
+ *
+ * For every task in @tset which is currently on a runqueue and is really
+ * changing boost group, one RUNNABLE task is moved from the source to the
+ * destination boost group of the CPU the task is enqueued on, and that
+ * CPU's maximum boost is recomputed when the source group became empty or
+ * the destination group just became active.
+ *
+ * Always returns 0, i.e. the attach is never refused.
+ */
+int schedtune_can_attach(struct cgroup_taskset *tset)
+{
+	struct task_struct *task;
+	struct cgroup_subsys_state *css;
+	struct boost_groups *bg;
+	struct rq_flags irq_flags;
+	unsigned int cpu;
+	struct rq *rq;
+	int src_bg; /* Source boost group index */
+	int dst_bg; /* Destination boost group index */
+	int tasks;
+
+	/* The hint covers the whole condition: bailing out is the rare case */
+	if (unlikely(!schedtune_initialized))
+		return 0;
+
+	cgroup_taskset_for_each(task, css, tset) {
+
+		/*
+		 * Lock the CPU's RQ the task is enqueued to avoid race
+		 * conditions with migration code while the task is being
+		 * accounted
+		 */
+		rq = lock_rq_of(task, &irq_flags);
+
+		/* Only tasks on a runqueue are accounted in a boost group */
+		if (!task->on_rq) {
+			unlock_rq_of(rq, task, &irq_flags);
+			continue;
+		}
+
+		/*
+		 * Boost group accounting is protected by a per-cpu lock,
+		 * taken here on top of the RQ lock.
+		 */
+		cpu = cpu_of(rq);
+		bg = &per_cpu(cpu_boost_groups, cpu);
+		raw_spin_lock(&bg->lock);
+
+		dst_bg = css_st(css)->idx;
+		src_bg = task_schedtune(task)->idx;
+
+		/*
+		 * Current task is not changing boostgroup, which can
+		 * happen when the new hierarchy is in use.
+		 */
+		if (unlikely(dst_bg == src_bg)) {
+			raw_spin_unlock(&bg->lock);
+			unlock_rq_of(rq, task, &irq_flags);
+			continue;
+		}
+
+		/*
+		 * This is the case of a RUNNABLE task which is switching its
+		 * current boost group.
+		 */
+
+		/* Move task from src to dst boost group */
+		tasks = bg->group[src_bg].tasks - 1;
+		bg->group[src_bg].tasks = max(0, tasks);
+		bg->group[dst_bg].tasks += 1;
+
+		/*
+		 * Update CPU boost group while both locks are still held and
+		 * for the CPU whose counters were just changed: once the RQ
+		 * lock is dropped the task may move to another CPU and the
+		 * counters may be modified concurrently.
+		 */
+		if (bg->group[src_bg].tasks == 0 || bg->group[dst_bg].tasks == 1)
+			schedtune_cpu_update(cpu);
+
+		raw_spin_unlock(&bg->lock);
+		unlock_rq_of(rq, task, &irq_flags);
+	}
+
+	return 0;
+}
+
+/*
+ * Attach rollback hook: not implemented, it only emits a warning.
+ *
+ * It can be reached only when the SchedTune controller is mounted together
+ * with other hierarchies and one of them fails the attach. Since SchedTune
+ * is usually mounted on its own hierarchy, no proper rollback mechanism is
+ * provided for the time being.
+ */
+void schedtune_cancel_attach(struct cgroup_taskset *tset)
+{
+	WARN(1, "SchedTune cancel attach not implemented");
+}
+
+/*
+ * schedtune_dequeue_task() - remove @p from its boost group's RUNNABLE count.
+ * @p: task being dequeued.
+ * @cpu: CPU whose boost groups are updated.
+ *
+ * Does nothing until SchedTune is initialized and for PF_EXITING tasks.
+ *
+ * NOTE: This function must be called while holding the lock on the CPU RQ
+ */
+void schedtune_dequeue_task(struct task_struct *p, int cpu)
+{
+	struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+	unsigned long irq_flags;
+	struct schedtune *st;
+	int idx;
+
+	/* The hint covers the whole condition: bailing out is the rare case */
+	if (unlikely(!schedtune_initialized))
+		return;
+
+	/*
+	 * When a task is marked PF_EXITING by do_exit() it's going to be
+	 * dequeued and enqueued multiple times in the exit path.
+	 * Thus we avoid any further update, since we do not want to change
+	 * CPU boosting while the task is exiting.
+	 * The last dequeue is already enforced by the do_exit() code path
+	 * via schedtune_exit_task().
+	 */
+	if (p->flags & PF_EXITING)
+		return;
+
+	/*
+	 * Boost group accounting is protected by a per-cpu lock, taken
+	 * here with interrupts disabled.
+	 */
+	raw_spin_lock_irqsave(&bg->lock, irq_flags);
+	rcu_read_lock();
+
+	st = task_schedtune(p);
+	idx = st->idx;
+
+	schedtune_tasks_update(p, cpu, idx, DEQUEUE_TASK);
+
+	rcu_read_unlock();
+	raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
+}
+
+/*
+ * schedtune_exit_task() - drop @tsk from its boost group's RUNNABLE count.
+ * @tsk: task to remove from the accounting.
+ *
+ * Locks the runqueue of @tsk and applies a DEQUEUE_TASK update to the
+ * task's boost group on that runqueue's CPU. Does nothing until SchedTune
+ * is initialized.
+ */
+void schedtune_exit_task(struct task_struct *tsk)
+{
+	struct schedtune *st;
+	struct rq_flags irq_flags;
+	unsigned int cpu;
+	struct rq *rq;
+	int idx;
+
+	/* The hint covers the whole condition: bailing out is the rare case */
+	if (unlikely(!schedtune_initialized))
+		return;
+
+	rq = lock_rq_of(tsk, &irq_flags);
+	rcu_read_lock();
+
+	cpu = cpu_of(rq);
+	st = task_schedtune(tsk);
+	idx = st->idx;
+	schedtune_tasks_update(tsk, cpu, idx, DEQUEUE_TASK);
+
+	rcu_read_unlock();
+	unlock_rq_of(rq, tsk, &irq_flags);
+}
+
+/* Return the maximum boost currently recorded for @cpu. */
+int schedtune_cpu_boost(int cpu)
+{
+	return per_cpu(cpu_boost_groups, cpu).boost_max;
+}
+
+/*
+ * schedtune_task_boost() - boost value of the SchedTune group of @p.
+ *
+ * Returns 0 until SchedTune is initialized, otherwise the boost value read
+ * from the task's group inside an RCU read-side section.
+ */
+int schedtune_task_boost(struct task_struct *p)
+{
+	struct schedtune *st;
+	int task_boost;
+
+	/* The hint covers the whole condition: bailing out is the rare case */
+	if (unlikely(!schedtune_initialized))
+		return 0;
+
+	/* Get task boost value */
+	rcu_read_lock();
+	st = task_schedtune(p);
+	task_boost = st->boost;
+	rcu_read_unlock();
+
+	return task_boost;
+}
+
+/*
+ * schedtune_prefer_idle() - prefer_idle hint of the SchedTune group of @p.
+ *
+ * Returns 0 until SchedTune is initialized, otherwise the prefer_idle
+ * value read from the task's group inside an RCU read-side section.
+ */
+int schedtune_prefer_idle(struct task_struct *p)
+{
+	struct schedtune *st;
+	int prefer_idle;
+
+	/* The hint covers the whole condition: bailing out is the rare case */
+	if (unlikely(!schedtune_initialized))
+		return 0;
+
+	/* Get prefer_idle value */
+	rcu_read_lock();
+	st = task_schedtune(p);
+	prefer_idle = st->prefer_idle;
+	rcu_read_unlock();
+
+	return prefer_idle;
+}
+
+/* cftype read handler: report the prefer_idle value of the group of @css. */
+static u64
+prefer_idle_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+	return css_st(css)->prefer_idle;
+}
+
+/*
+ * cftype write handler: set the prefer_idle hint of the group of @css.
+ *
+ * The u64 written by user space is normalized to 0 or 1 before it is
+ * stored in the int field: a plain assignment would silently truncate it,
+ * turning some non-zero values into 0 or into a negative number.
+ *
+ * Always returns 0.
+ */
+static int
+prefer_idle_write(struct cgroup_subsys_state *css, struct cftype *cft,
+		  u64 prefer_idle)
+{
+	struct schedtune *st = css_st(css);
+
+	st->prefer_idle = !!prefer_idle;
+
+	return 0;
+}
+
+/* cftype read handler: report the boost value of the group of @css. */
+static s64
+boost_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+	return css_st(css)->boost;
+}
+
+/*
+ * cftype write handler: set the boost value of the group of @css.
+ *
+ * @boost must be within [-100..100], otherwise -EINVAL is returned and
+ * nothing is changed. On success the group's threshold indexes and boost
+ * are updated (mirrored into the system-wide variables for the root
+ * group), the per-CPU boost groups are refreshed and 0 is returned.
+ */
+static int
+boost_write(struct cgroup_subsys_state *css, struct cftype *cft,
+	    s64 boost)
+{
+	struct schedtune *st = css_st(css);
+	const bool is_root = (css == &root_schedtune.css);
+	unsigned threshold_idx;
+	int boost_pct;
+
+	if (boost > 100 || boost < -100)
+		return -EINVAL;
+	boost_pct = boost;
+
+	/*
+	 * The Performance Boost (B) and Performance Constraint (C) regions
+	 * currently share the same cuts: one threshold row per 10% of
+	 * boost, negative values using the first row.
+	 */
+	threshold_idx = clamp(boost_pct, 0, 99) / 10;
+
+	st->boost = boost;
+	st->perf_boost_idx = threshold_idx;
+	st->perf_constrain_idx = threshold_idx;
+
+	if (is_root) {
+		/* Keep the system-wide settings in sync with the root group */
+		sysctl_sched_cfs_boost = boost;
+		perf_boost_idx = threshold_idx;
+		perf_constrain_idx = threshold_idx;
+	}
+
+	/* Update CPU boost */
+	schedtune_boostgroup_update(st->idx, st->boost);
+
+	trace_sched_tune_config(st->boost);
+
+	return 0;
+}
+
+static struct cftype files[] = {
+ {
+ .name = "boost", /* signed value, handled by boost_read()/boost_write() */
+ .read_s64 = boost_read,
+ .write_s64 = boost_write,
+ },
+ {
+ .name = "prefer_idle", /* unsigned value, handled by prefer_idle_read()/prefer_idle_write() */
+ .read_u64 = prefer_idle_read,
+ .write_u64 = prefer_idle_write,
+ },
+ { } /* terminate */
+};
+
+/*
+ * Register @st in allocated_group[] under its index and reset the boost
+ * and RUNNABLE tasks counter of that slot on every possible CPU.
+ *
+ * Always returns 0.
+ */
+static int
+schedtune_boostgroup_init(struct schedtune *st)
+{
+	const int slot = st->idx;
+	int cpu;
+
+	/* Keep track of allocated boost groups */
+	allocated_group[slot] = st;
+
+	/* Initialize the per CPU boost groups */
+	for_each_possible_cpu(cpu) {
+		struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+
+		bg->group[slot].tasks = 0;
+		bg->group[slot].boost = 0;
+		/*
+		 * NOTE(review): this re-initializes the per-CPU lock each
+		 * time a group is created, while enqueue/dequeue paths may
+		 * be using it - confirm this is intended.
+		 */
+		raw_spin_lock_init(&bg->lock);
+	}
+
+	return 0;
+}
+
+/*
+ * Allocate the SchedTune state for a new cgroup.
+ *
+ * With no parent the statically allocated root group is returned. Otherwise
+ * the new group must be a direct child of the root, a free index in
+ * allocated_group[] (slot 0 is never handed out here) must exist, and a
+ * zeroed struct schedtune is allocated and registered for that index.
+ *
+ * Returns the embedded css on success, or an ERR_PTR(): -ENOMEM for nested
+ * groups and allocation/init failures, -ENOSPC when all slots are taken.
+ */
+static struct cgroup_subsys_state *
+schedtune_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct schedtune *st;
+ int idx;
+
+ if (!parent_css)
+ return &root_schedtune.css;
+
+ /* Allow only single level hierarchies */
+ if (parent_css != &root_schedtune.css) {
+ pr_err("Nested SchedTune boosting groups not allowed\n");
+ /* NOTE(review): reported as -ENOMEM although nothing was allocated */
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /* Allow only a limited number of boosting groups */
+ for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx)
+ if (!allocated_group[idx])
+ break;
+ if (idx == BOOSTGROUPS_COUNT) {
+ pr_err("Trying to create more than %d SchedTune boosting groups\n",
+ BOOSTGROUPS_COUNT);
+ return ERR_PTR(-ENOSPC);
+ }
+
+ st = kzalloc(sizeof(*st), GFP_KERNEL);
+ if (!st)
+ goto out;
+
+ /* Initialize per CPUs boost group support */
+ st->idx = idx;
+ if (schedtune_boostgroup_init(st))
+ goto release;
+
+ return &st->css;
+
+release:
+ kfree(st);
+out:
+ return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Undo schedtune_boostgroup_init(): zero the group's boost on the CPUs and
+ * then give its index back to the allocator.
+ */
+static void
+schedtune_boostgroup_release(struct schedtune *st)
+{
+	int idx = st->idx;
+
+	/* Reset this boost group */
+	schedtune_boostgroup_update(idx, 0);
+
+	/* The slot can now be reused by a new group */
+	allocated_group[idx] = NULL;
+}
+
+/* cgroup css_free callback: release the boost group, then its memory. */
+static void
+schedtune_css_free(struct cgroup_subsys_state *css)
+{
+	struct schedtune *group = css_st(css);
+
+	schedtune_boostgroup_release(group);
+	kfree(group);
+}
+
+/*
+ * SchedTune cgroup controller: wires the css alloc/free and task attach
+ * callbacks and exposes the "boost"/"prefer_idle" files as legacy cftypes.
+ * The controller asks for early initialisation (.early_init).
+ */
+struct cgroup_subsys schedtune_cgrp_subsys = {
+ .css_alloc = schedtune_css_alloc,
+ .css_free = schedtune_css_free,
+ .can_attach = schedtune_can_attach,
+ .cancel_attach = schedtune_cancel_attach,
+ .legacy_cftypes = files,
+ .early_init = 1,
+};
+
+/*
+ * One-time setup of the per-CPU boost groups: every possible CPU starts
+ * from zeroed bookkeeping and a freshly initialised lock. Once done,
+ * schedtune_initialized is raised.
+ */
+static inline void
+schedtune_init_cgroups(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+
+		memset(bg, 0, sizeof(*bg));
+		raw_spin_lock_init(&bg->lock);
+	}
+
+	pr_info("schedtune: configured to support %d boost groups\n",
+		BOOSTGROUPS_COUNT);
+
+	schedtune_initialized = true;
+}
+
+#else /* CONFIG_CGROUP_SCHEDTUNE */
+
+/*
+ * Non-cgroup variant: decide on an (energy, capacity) delta pair using the
+ * global boost thresholds. @task is unused in this variant.
+ *
+ * Returns INT_MAX when energy drops and capacity grows, -INT_MAX when
+ * energy grows and capacity drops, and otherwise whatever
+ * __schedtune_accept_deltas() computes for the global perf_boost_idx /
+ * perf_constrain_idx.
+ */
+int
+schedtune_accept_deltas(int nrg_delta, int cap_delta,
+ struct task_struct *task)
+{
+ /* Optimal (O) region */
+ if (nrg_delta < 0 && cap_delta > 0) {
+ trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0);
+ return INT_MAX;
+ }
+
+ /* Suboptimal (S) region */
+ if (nrg_delta > 0 && cap_delta < 0) {
+ trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5);
+ return -INT_MAX;
+ }
+
+ return __schedtune_accept_deltas(nrg_delta, cap_delta,
+ perf_boost_idx, perf_constrain_idx);
+}
+
+#endif /* CONFIG_CGROUP_SCHEDTUNE */
+
+/*
+ * sysctl handler for the global CFS boost value.
+ *
+ * Reads are passed straight to proc_dointvec_minmax(). On a write the new
+ * value must lie in [-100, 100]; it is then used to refresh the global
+ * Performance Boost / Performance Constraint threshold indexes.
+ *
+ * proc_dointvec_minmax() has already stored the value by the time it can be
+ * range-checked here, so the previous value is saved up front and restored
+ * when the write is rejected. Otherwise an -EINVAL write would still leave
+ * the out-of-range boost in place.
+ *
+ * Returns 0 on success, -EINVAL for an out-of-range write, or the error
+ * reported by proc_dointvec_minmax().
+ */
+int
+sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write,
+			       void __user *buffer, size_t *lenp,
+			       loff_t *ppos)
+{
+	int old_boost = sysctl_sched_cfs_boost;
+	unsigned threshold_idx;
+	int boost_pct;
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (ret || !write)
+		return ret;
+
+	if (sysctl_sched_cfs_boost < -100 || sysctl_sched_cfs_boost > 100) {
+		/* Do not leave the rejected value behind */
+		sysctl_sched_cfs_boost = old_boost;
+		return -EINVAL;
+	}
+	boost_pct = sysctl_sched_cfs_boost;
+
+	/*
+	 * Update threshold params for Performance Boost (B)
+	 * and Performance Constraint (C) regions.
+	 * The current implementation uses the same cuts for both
+	 * B and C regions.
+	 */
+	threshold_idx = clamp(boost_pct, 0, 99) / 10;
+	perf_boost_idx = threshold_idx;
+	perf_constrain_idx = threshold_idx;
+
+	return 0;
+}
+
+#ifdef CONFIG_SCHED_DEBUG
+/*
+ * Debug aid: log the normalized value obtained for @delta_pwr and for its
+ * first five halvings, using the reciprocal stored in schedtune_target_nrg.
+ * Compiles to nothing without CONFIG_SCHED_DEBUG.
+ */
+static void
+schedtune_test_nrg(unsigned long delta_pwr)
+{
+ unsigned long test_delta_pwr;
+ unsigned long test_norm_pwr;
+ int idx;
+
+ /*
+ * Check normalization constants using some constant system
+ * energy values
+ */
+ pr_info("schedtune: verify normalization constants...\n");
+ for (idx = 0; idx < 6; ++idx) {
+ /* delta_pwr / 2^idx */
+ test_delta_pwr = delta_pwr >> idx;
+
+ /* Normalize on max energy for target platform */
+ test_norm_pwr = reciprocal_divide(
+ test_delta_pwr << SCHED_CAPACITY_SHIFT,
+ schedtune_target_nrg.rdiv);
+
+ pr_info("schedtune: max_pwr/2^%d: %4lu => norm_pwr: %5lu\n",
+ idx, test_delta_pwr, test_norm_pwr);
+ }
+}
+#else
+#define schedtune_test_nrg(delta_pwr)
+#endif
+
+/*
+ * Compute the min/max power consumption of a cluster and all its CPUs
+ *
+ * The cluster's own figures come from @sg's energy data: the power of its
+ * last idle state as minimum and of its last capacity state as maximum.
+ * The same two figures are then read, for every CPU of the cluster, from
+ * the group of that CPU's first sched domain. Everything is accumulated
+ * into @ste and logged.
+ *
+ * NOTE(review): @sd is not used by this function.
+ */
+static void
+schedtune_add_cluster_nrg(
+ struct sched_domain *sd,
+ struct sched_group *sg,
+ struct target_nrg *ste)
+{
+ struct sched_domain *sd2;
+ struct sched_group *sg2;
+
+ struct cpumask *cluster_cpus;
+ char str[32];
+
+ unsigned long min_pwr;
+ unsigned long max_pwr;
+ int cpu;
+
+ /* Get Cluster energy using EM data for the first CPU */
+ cluster_cpus = sched_group_cpus(sg);
+ /* The literal 32 below is the size of str[] */
+ snprintf(str, 32, "CLUSTER[%*pbl]",
+ cpumask_pr_args(cluster_cpus));
+
+ min_pwr = sg->sge->idle_states[sg->sge->nr_idle_states - 1].power;
+ max_pwr = sg->sge->cap_states[sg->sge->nr_cap_states - 1].power;
+ pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
+ str, min_pwr, max_pwr);
+
+ /*
+ * Keep track of this cluster's energy in the computation of the
+ * overall system energy
+ */
+ ste->min_power += min_pwr;
+ ste->max_power += max_pwr;
+
+ /* Get CPU energy using EM data for each CPU in the group */
+ for_each_cpu(cpu, cluster_cpus) {
+ /* Get a SD view for the specific CPU */
+ /* Only the first domain is visited: the body always breaks */
+ for_each_domain(cpu, sd2) {
+ /* Get the CPU group */
+ sg2 = sd2->groups;
+ min_pwr = sg2->sge->idle_states[sg2->sge->nr_idle_states - 1].power;
+ max_pwr = sg2->sge->cap_states[sg2->sge->nr_cap_states - 1].power;
+
+ ste->min_power += min_pwr;
+ ste->max_power += max_pwr;
+
+ snprintf(str, 32, "CPU[%d]", cpu);
+ pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
+ str, min_pwr, max_pwr);
+
+ /*
+ * Assume we have EM data only at the CPU and
+ * the upper CLUSTER level
+ *
+ * NOTE(review): sd2->parent is dereferenced without a
+ * NULL check; confirm a parent domain always exists here.
+ */
+ BUG_ON(!cpumask_equal(
+ sched_group_cpus(sg),
+ sched_group_cpus(sd2->parent->groups)
+ ));
+ break;
+ }
+ }
+}
+
+/*
+ * Initialize the constants required to compute normalized energy.
+ * The values of these constants depend on the EM data for the specific
+ * target system and topology.
+ * Thus, this function is expected to be called by the code
+ * that binds the EM to the topology information.
+ *
+ * Runs as a postcore initcall. Returns 0 on success, or -EINVAL when the
+ * first online CPU has no energy-aware sched domain. In that case neither
+ * schedtune_init_cgroups() nor the schedtune_spc_rdiv setup is reached.
+ */
+static int
+schedtune_init(void)
+{
+ struct target_nrg *ste = &schedtune_target_nrg;
+ unsigned long delta_pwr = 0;
+ struct sched_domain *sd;
+ struct sched_group *sg;
+
+ pr_info("schedtune: init normalization constants...\n");
+ ste->max_power = 0;
+ ste->min_power = 0;
+
+ rcu_read_lock();
+
+ /*
+ * When EAS is in use, we always have a pointer to the highest SD
+ * which provides EM data.
+ */
+ sd = rcu_dereference(per_cpu(sd_ea, cpumask_first(cpu_online_mask)));
+ if (!sd) {
+ pr_info("schedtune: no energy model data\n");
+ goto nodata;
+ }
+
+ /* Walk the circular list of groups (clusters) of that domain */
+ sg = sd->groups;
+ do {
+ schedtune_add_cluster_nrg(sd, sg, ste);
+ } while (sg = sg->next, sg != sd->groups);
+
+ rcu_read_unlock();
+
+ pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
+ "SYSTEM", ste->min_power, ste->max_power);
+
+ /* Compute normalization constants */
+ /*
+ * NOTE(review): delta_pwr is 0 when max_power == min_power; confirm
+ * reciprocal_value() copes with a zero divisor.
+ */
+ delta_pwr = ste->max_power - ste->min_power;
+ ste->rdiv = reciprocal_value(delta_pwr);
+ pr_info("schedtune: using normalization constants mul: %u sh1: %u sh2: %u\n",
+ ste->rdiv.m, ste->rdiv.sh1, ste->rdiv.sh2);
+
+ schedtune_test_nrg(delta_pwr);
+
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+ schedtune_init_cgroups();
+#else
+ pr_info("schedtune: configured to support global boosting only\n");
+#endif
+
+ schedtune_spc_rdiv = reciprocal_value(100);
+
+ return 0;
+
+nodata:
+ pr_warning("schedtune: disabled!\n");
+ rcu_read_unlock();
+ return -EINVAL;
+}
+postcore_initcall(schedtune_init);
diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h
new file mode 100644
index 000000000000..4f6441771e4c
--- /dev/null
+++ b/kernel/sched/tune.h
@@ -0,0 +1,61 @@
+
+#ifdef CONFIG_SCHED_TUNE
+
+#include <linux/reciprocal_div.h>
+
+/*
+ * System energy normalization constants
+ */
+struct target_nrg {
+	unsigned long min_power;
+	unsigned long max_power;
+	struct reciprocal_value rdiv;
+};
+
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+
+int schedtune_cpu_boost(int cpu);
+int schedtune_task_boost(struct task_struct *tsk);
+
+int schedtune_prefer_idle(struct task_struct *tsk);
+
+void schedtune_exit_task(struct task_struct *tsk);
+
+void schedtune_enqueue_task(struct task_struct *p, int cpu);
+void schedtune_dequeue_task(struct task_struct *p, int cpu);
+
+#else /* CONFIG_CGROUP_SCHEDTUNE */
+
+#define schedtune_cpu_boost(cpu) get_sysctl_sched_cfs_boost()
+#define schedtune_task_boost(tsk) get_sysctl_sched_cfs_boost()
+
+/* prefer_idle is a per-cgroup knob: report "off" without cgroup support */
+#define schedtune_prefer_idle(tsk) 0
+
+#define schedtune_exit_task(task) do { } while (0)
+
+#define schedtune_enqueue_task(task, cpu) do { } while (0)
+#define schedtune_dequeue_task(task, cpu) do { } while (0)
+
+#endif /* CONFIG_CGROUP_SCHEDTUNE */
+
+int schedtune_normalize_energy(int energy);
+int schedtune_accept_deltas(int nrg_delta, int cap_delta,
+			    struct task_struct *task);
+
+#else /* CONFIG_SCHED_TUNE */
+
+#define schedtune_cpu_boost(cpu) 0
+#define schedtune_task_boost(tsk) 0
+
+#define schedtune_prefer_idle(tsk) 0
+
+#define schedtune_exit_task(task) do { } while (0)
+
+#define schedtune_enqueue_task(task, cpu) do { } while (0)
+#define schedtune_dequeue_task(task, cpu) do { } while (0)
+
+/* With SchedTune compiled out the energy delta is passed through untouched */
+#define schedtune_accept_deltas(nrg_delta, cap_delta, task) (nrg_delta)
+
+#endif /* CONFIG_SCHED_TUNE */
diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c
new file mode 100644
index 000000000000..522f723af576
--- /dev/null
+++ b/kernel/sched/walt.c
@@ -0,0 +1,1133 @@
+/*
+ * Copyright (c) 2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ *
+ * Window Assisted Load Tracking (WALT) implementation credits:
+ * Srivatsa Vaddagiri, Steve Muckle, Syed Rameez Mustafa, Joonwoo Park,
+ * Pavan Kumar Kondeti, Olav Haugan
+ *
+ * 2016-03-06: Integration with EAS/refactoring by Vikram Mulukutla
+ * and Todd Kjos
+ */
+
+#include <linux/syscore_ops.h>
+#include <linux/cpufreq.h>
+#include <trace/events/sched.h>
+#include "sched.h"
+#include "walt.h"
+
+/*
+ * Policies for turning a task's window history into its demand:
+ * RECENT         - the most recent sample
+ * MAX            - the largest sample in the history
+ * MAX_RECENT_AVG - the larger of the history average and the recent sample
+ * AVG            - the history average
+ */
+#define WINDOW_STATS_RECENT 0
+#define WINDOW_STATS_MAX 1
+#define WINDOW_STATS_MAX_RECENT_AVG 2
+#define WINDOW_STATS_AVG 3
+#define WINDOW_STATS_INVALID_POLICY 4
+
+/* Written to sum_history[0] of a task once it is seen with PF_EXITING */
+#define EXITING_TASK_MARKER 0xdeaddead
+
+/* Number of history samples used when computing a task's demand */
+static __read_mostly unsigned int walt_ravg_hist_size = 5;
+static __read_mostly unsigned int walt_window_stats_policy =
+ WINDOW_STATS_MAX_RECENT_AVG;
+/* Non-zero: time spent waiting to run counts towards task demand */
+static __read_mostly unsigned int walt_account_wait_time = 1;
+/* Non-zero: wait time also counts towards the CPU busy time counters */
+static __read_mostly unsigned int walt_freq_account_wait_time = 0;
+/* Non-zero: an idle CPU with tasks in iowait is treated as busy */
+static __read_mostly unsigned int walt_io_is_busy = 0;
+
+unsigned int sysctl_sched_walt_init_task_load_pct = 15;
+
+/* 1 -> use PELT based load stats, 0 -> use window-based load stats */
+unsigned int __read_mostly walt_disabled = 0;
+
+/* Highest/lowest CPU efficiency, refreshed by walt_init_cpu_efficiency() */
+static unsigned int max_possible_efficiency = 1024;
+static unsigned int min_possible_efficiency = 1024;
+
+/*
+ * Maximum possible frequency across all cpus. Task demand and cpu
+ * capacity (cpu_power) metrics are scaled in reference to it.
+ */
+static unsigned int max_possible_freq = 1;
+
+/*
+ * Minimum possible max_freq across all cpus. This is the same as
+ * max_possible_freq on homogeneous systems and could be different from
+ * max_possible_freq on heterogeneous systems. min_max_freq is used to derive
+ * capacity (cpu_power) of cpus.
+ */
+static unsigned int min_max_freq = 1;
+
+static unsigned int max_load_scale_factor = 1024;
+static unsigned int max_possible_capacity = 1024;
+
+/* Mask of all CPUs that have max_possible_capacity */
+static cpumask_t mpc_mask = CPU_MASK_ALL;
+
+/* Window size (in ns) */
+__read_mostly unsigned int walt_ravg_window = 20000000;
+
+/* Min window size (in ns) = 10ms (six ticks, ~20ms, when HZ=300) */
+#ifdef CONFIG_HZ_300
+/*
+ * With HZ=300 the tick interval rounds to 3333333 ns, so the
+ * minimum window is expressed as six whole ticks.
+ */
+#define MIN_SCHED_RAVG_WINDOW (3333333 * 6)
+#else
+#define MIN_SCHED_RAVG_WINDOW 10000000
+#endif
+
+/* Max window size (in ns) = 1s */
+#define MAX_SCHED_RAVG_WINDOW 1000000000
+
+static unsigned int sync_cpu;
+static ktime_t ktime_last;
+static __read_mostly bool walt_ktime_suspended;
+
+/* Return the WALT demand currently recorded for @p. */
+static unsigned int task_load(struct task_struct *p)
+{
+	unsigned int demand = p->ravg.demand;
+
+	return demand;
+}
+
+/* Add @p's demand to the cumulative runnable average of @rq. */
+void
+walt_inc_cumulative_runnable_avg(struct rq *rq,
+				 struct task_struct *p)
+{
+	rq->cumulative_runnable_avg += task_load(p);
+}
+
+/*
+ * Remove @p's demand from the cumulative runnable average of @rq.
+ * The total going negative is treated as a fatal accounting error.
+ */
+void
+walt_dec_cumulative_runnable_avg(struct rq *rq,
+				 struct task_struct *p)
+{
+	rq->cumulative_runnable_avg -= task_load(p);
+	BUG_ON((s64)rq->cumulative_runnable_avg < 0);
+}
+
+/*
+ * Adjust the cumulative runnable average of @rq by @task_load_delta, the
+ * signed change in the load of a task accounted on it. Panics if the total
+ * would become negative; @p is only used for that message.
+ */
+static void
+fixup_cumulative_runnable_avg(struct rq *rq,
+ struct task_struct *p, s64 task_load_delta)
+{
+ rq->cumulative_runnable_avg += task_load_delta;
+ if ((s64)rq->cumulative_runnable_avg < 0)
+ panic("cra less than zero: tld: %lld, task_load(p) = %u\n",
+ task_load_delta, task_load(p));
+}
+
+/*
+ * WALT time source in nanoseconds. While the suspend flag is set the
+ * timestamp latched by walt_suspend() is reported instead of the live
+ * clock.
+ */
+u64 walt_ktime_clock(void)
+{
+	if (likely(!walt_ktime_suspended))
+		return ktime_get_ns();
+
+	return ktime_to_ns(ktime_last);
+}
+
+/* syscore resume hook: let walt_ktime_clock() read the live clock again. */
+static void walt_resume(void)
+{
+	walt_ktime_suspended = false;
+}
+
+/*
+ * syscore suspend hook: latch the current time, then make
+ * walt_ktime_clock() report that latched value. Always returns 0.
+ */
+static int walt_suspend(void)
+{
+	/* Latch first, so the flag never exposes a stale timestamp */
+	ktime_last = ktime_get();
+	walt_ktime_suspended = true;
+
+	return 0;
+}
+
+/* Suspend/resume hooks that freeze and unfreeze the WALT clock */
+static struct syscore_ops walt_syscore_ops = {
+ .resume = walt_resume,
+ .suspend = walt_suspend
+};
+
+/* Late initcall: register the WALT suspend/resume hooks. Returns 0. */
+static int __init walt_init_ops(void)
+{
+	register_syscore_ops(&walt_syscore_ops);
+
+	return 0;
+}
+late_initcall(walt_init_ops);
+
+/* Add @p's demand to the cumulative runnable average of @cfs_rq. */
+void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq,
+					  struct task_struct *p)
+{
+	cfs_rq->cumulative_runnable_avg += task_load(p);
+}
+
+/* Remove @p's demand from the cumulative runnable average of @cfs_rq. */
+void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq,
+					  struct task_struct *p)
+{
+	cfs_rq->cumulative_runnable_avg -= task_load(p);
+}
+
+/*
+ * Return 1 if @p is exiting (PF_EXITING), 0 otherwise. As a side effect
+ * an exiting task has EXITING_TASK_MARKER stored in sum_history[0].
+ */
+static int exiting_task(struct task_struct *p)
+{
+	if (!(p->flags & PF_EXITING))
+		return 0;
+
+	/* Only write the marker when it is not there yet */
+	if (p->ravg.sum_history[0] != EXITING_TASK_MARKER)
+		p->ravg.sum_history[0] = EXITING_TASK_MARKER;
+
+	return 1;
+}
+
+/*
+ * "walt_ravg_window=" early parameter: set the window size (in ns).
+ * WALT is disabled when the resulting size falls outside
+ * [MIN_SCHED_RAVG_WINDOW, MAX_SCHED_RAVG_WINDOW].
+ *
+ * get_option() writes through an int pointer, so the value is parsed into
+ * an int local rather than straight into the unsigned walt_ravg_window.
+ * The local starts from the current setting, so an option that
+ * get_option() does not touch leaves the window unchanged.
+ *
+ * Always returns 0.
+ */
+static int __init set_walt_ravg_window(char *str)
+{
+	int window = walt_ravg_window;
+
+	get_option(&str, &window);
+	walt_ravg_window = window;
+
+	walt_disabled = (walt_ravg_window < MIN_SCHED_RAVG_WINDOW ||
+				walt_ravg_window > MAX_SCHED_RAVG_WINDOW);
+	return 0;
+}
+
+early_param("walt_ravg_window", set_walt_ravg_window);
+
+/*
+ * Advance rq->window_start by the number of whole windows that fit
+ * between it and @wallclock. Nothing changes while less than one window
+ * has elapsed.
+ */
+static void
+update_window_start(struct rq *rq, u64 wallclock)
+{
+	s64 elapsed = wallclock - rq->window_start;
+	int nr_windows;
+
+	/*
+	 * If the MPM global timer is cleared the clock can appear to run
+	 * backwards: treat that as "no time elapsed" to avoid a kernel BUG.
+	 */
+	if (elapsed < 0) {
+		elapsed = 0;
+		WARN_ONCE(1, "WALT wallclock appears to have gone backwards or reset\n");
+	}
+
+	if (elapsed >= walt_ravg_window) {
+		nr_windows = div64_u64(elapsed, walt_ravg_window);
+		rq->window_start += (u64)nr_windows * (u64)walt_ravg_window;
+	}
+}
+
+/*
+ * Scale the raw execution time @delta by the CPU's current frequency
+ * (relative to max_possible_freq) and then by its efficiency (relative to
+ * max_possible_efficiency, as a 1024-based factor). Returns the scaled
+ * time.
+ */
+static u64 scale_exec_time(u64 delta, struct rq *rq)
+{
+ unsigned int cur_freq = rq->cur_freq;
+ int sf;
+
+ /*
+ * NOTE(review): the check is against the global max_possible_freq but
+ * the clamp uses the per-rq rq->max_possible_freq - confirm intended.
+ */
+ if (unlikely(cur_freq > max_possible_freq))
+ cur_freq = rq->max_possible_freq;
+
+ /* round up div64 */
+ delta = div64_u64(delta * cur_freq + max_possible_freq - 1,
+ max_possible_freq);
+
+ sf = DIV_ROUND_UP(rq->efficiency * 1024, max_possible_efficiency);
+
+ /* Apply the 1024-based efficiency factor */
+ delta *= sf;
+ delta >>= 10;
+
+ return delta;
+}
+
+/*
+ * Return the number of tasks in iowait on @rq when walt_io_is_busy is
+ * set, and 0 when that tunable is off.
+ */
+static int cpu_is_waiting_on_io(struct rq *rq)
+{
+	return walt_io_is_busy ? atomic_read(&rq->nr_iowait) : 0;
+}
+
+/*
+ * Account @delta ns of IRQ time on @cpu.
+ *
+ * Under the rq lock: @delta is extended by the sched_clock() time elapsed
+ * since @wallclock, the busy-time statistics are updated if @curr is the
+ * idle task, and the rq's irqload (cur_irqload/avg_irqload, stamped with
+ * the current jiffies) is maintained.
+ */
+void walt_account_irqtime(int cpu, struct task_struct *curr,
+ u64 delta, u64 wallclock)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags, nr_windows;
+ u64 cur_jiffies_ts;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+
+ /*
+ * cputime (wallclock) uses sched_clock so use the same here for
+ * consistency.
+ */
+ delta += sched_clock() - wallclock;
+ cur_jiffies_ts = get_jiffies_64();
+
+ if (is_idle_task(curr))
+ walt_update_task_ravg(curr, rq, IRQ_UPDATE, walt_ktime_clock(),
+ delta);
+
+ /* Jiffies elapsed since the irqload was last updated */
+ nr_windows = cur_jiffies_ts - rq->irqload_ts;
+
+ if (nr_windows) {
+ if (nr_windows < 10) {
+ /*
+ * Decay CPU's irqload by 3/4.
+ * NOTE(review): nr_windows cancels out below, so the decay
+ * factor is 3/4 whatever the elapsed count is, not 3/4 per
+ * window - confirm intended.
+ */
+ rq->avg_irqload *= (3 * nr_windows);
+ rq->avg_irqload = div64_u64(rq->avg_irqload,
+ 4 * nr_windows);
+ } else {
+ rq->avg_irqload = 0;
+ }
+ rq->avg_irqload += rq->cur_irqload;
+ rq->cur_irqload = 0;
+ }
+
+ rq->cur_irqload += delta;
+ rq->irqload_ts = cur_jiffies_ts;
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+
+/* Jiffies after which an irqload that was not refreshed is ignored */
+#define WALT_HIGH_IRQ_TIMEOUT 3
+
+/*
+ * Return the averaged irqload of @cpu, or 0 if it was last updated
+ * WALT_HIGH_IRQ_TIMEOUT or more jiffies ago.
+ */
+u64 walt_irqload(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	s64 age = get_jiffies_64() - rq->irqload_ts;
+
+	/*
+	 * Current context can be preempted by irq and rq->irqload_ts can be
+	 * updated by irq context so that the age can be negative.
+	 * But this is okay and we can safely return as this means there
+	 * was recent irq occurrence.
+	 */
+	return (age < WALT_HIGH_IRQ_TIMEOUT) ? rq->avg_irqload : 0;
+}
+
+/*
+ * Return non-zero when the irqload of @cpu has reached the
+ * sysctl_sched_walt_cpu_high_irqload threshold.
+ */
+int walt_cpu_high_irqload(int cpu)
+{
+	u64 load = walt_irqload(cpu);
+
+	return load >= sysctl_sched_walt_cpu_high_irqload;
+}
+
+/*
+ * Decide whether the time being closed out for @p by @event counts as
+ * busy time in the CPU's window counters. Returns non-zero if it does.
+ */
+static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p,
+				     u64 irqtime, int event)
+{
+	if (is_idle_task(p)) {
+		/* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */
+		if (event == PICK_NEXT_TASK)
+			return 0;
+
+		/* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */
+		return irqtime || cpu_is_waiting_on_io(rq);
+	}
+
+	switch (event) {
+	case TASK_WAKE:
+		return 0;
+	case PUT_PREV_TASK:
+	case IRQ_UPDATE:
+	case TASK_UPDATE:
+		return 1;
+	default:
+		/* Only TASK_MIGRATE && PICK_NEXT_TASK left */
+		return walt_freq_account_wait_time;
+	}
+}
+
+/*
+ * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum)
+ *
+ * The time between p->ravg.mark_start and @wallclock is split at
+ * rq->window_start and added to the current and previous window sums of
+ * @rq and, except for the idle task and exiting tasks, to the task's own
+ * curr_window/prev_window. When a new window has started and @p is the
+ * task currently running on @rq, the rq counters are rolled over (the
+ * current sum becomes the previous one). @irqtime is only accounted for
+ * the idle task.
+ */
+static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock, u64 irqtime)
+{
+ int new_window, nr_full_windows = 0;
+ int p_is_curr_task = (p == rq->curr);
+ u64 mark_start = p->ravg.mark_start;
+ u64 window_start = rq->window_start;
+ u32 window_size = walt_ravg_window;
+ u64 delta;
+
+ /* A new window started since this task was last accounted */
+ new_window = mark_start < window_start;
+ if (new_window) {
+ nr_full_windows = div64_u64((window_start - mark_start),
+ window_size);
+ if (p->ravg.active_windows < USHRT_MAX)
+ p->ravg.active_windows++;
+ }
+
+ /* Handle per-task window rollover. We don't care about the idle
+ * task or exiting tasks. */
+ if (new_window && !is_idle_task(p) && !exiting_task(p)) {
+ u32 curr_window = 0;
+
+ if (!nr_full_windows)
+ curr_window = p->ravg.curr_window;
+
+ p->ravg.prev_window = curr_window;
+ p->ravg.curr_window = 0;
+ }
+
+ if (!account_busy_for_cpu_time(rq, p, irqtime, event)) {
+ /* account_busy_for_cpu_time() = 0, so no update to the
+ * task's current window needs to be made. This could be
+ * for example
+ *
+ * - a wakeup event on a task within the current
+ * window (!new_window below, no action required),
+ * - switching to a new task from idle (PICK_NEXT_TASK)
+ * in a new window where irqtime is 0 and we aren't
+ * waiting on IO */
+
+ if (!new_window)
+ return;
+
+ /* A new window has started. The RQ demand must be rolled
+ * over if p is the current task. */
+ if (p_is_curr_task) {
+ u64 prev_sum = 0;
+
+ /* p is either idle task or an exiting task */
+ if (!nr_full_windows) {
+ prev_sum = rq->curr_runnable_sum;
+ }
+
+ rq->prev_runnable_sum = prev_sum;
+ rq->curr_runnable_sum = 0;
+ }
+
+ return;
+ }
+
+ if (!new_window) {
+ /* account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. No rollover
+ * since we didn't start a new window. An example of this is
+ * when a task starts execution and then sleeps within the
+ * same window. */
+
+ if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq))
+ delta = wallclock - mark_start;
+ else
+ delta = irqtime;
+ delta = scale_exec_time(delta, rq);
+ rq->curr_runnable_sum += delta;
+ if (!is_idle_task(p) && !exiting_task(p))
+ p->ravg.curr_window += delta;
+
+ return;
+ }
+
+ if (!p_is_curr_task) {
+ /* account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. A new window
+ * has also started, but p is not the current task, so the
+ * window is not rolled over - just split up and account
+ * as necessary into curr and prev. The window is only
+ * rolled over when a new window is processed for the current
+ * task.
+ *
+ * Irqtime can't be accounted by a task that isn't the
+ * currently running task. */
+
+ if (!nr_full_windows) {
+ /* A full window hasn't elapsed, account partial
+ * contribution to previous completed window. */
+ delta = scale_exec_time(window_start - mark_start, rq);
+ if (!exiting_task(p))
+ p->ravg.prev_window += delta;
+ } else {
+ /* Since at least one full window has elapsed,
+ * the contribution to the previous window is the
+ * full window (window_size). */
+ delta = scale_exec_time(window_size, rq);
+ if (!exiting_task(p))
+ p->ravg.prev_window = delta;
+ }
+ rq->prev_runnable_sum += delta;
+
+ /* Account piece of busy time in the current window. */
+ delta = scale_exec_time(wallclock - window_start, rq);
+ rq->curr_runnable_sum += delta;
+ if (!exiting_task(p))
+ p->ravg.curr_window = delta;
+
+ return;
+ }
+
+ if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) {
+ /* account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. A new window
+ * has started and p is the current task so rollover is
+ * needed. If any of these three above conditions are true
+ * then this busy time can't be accounted as irqtime.
+ *
+ * Busy time for the idle task or exiting tasks need not
+ * be accounted.
+ *
+ * An example of this would be a task that starts execution
+ * and then sleeps once a new window has begun. */
+
+ if (!nr_full_windows) {
+ /* A full window hasn't elapsed, account partial
+ * contribution to previous completed window. */
+ delta = scale_exec_time(window_start - mark_start, rq);
+ if (!is_idle_task(p) && !exiting_task(p))
+ p->ravg.prev_window += delta;
+
+ delta += rq->curr_runnable_sum;
+ } else {
+ /* Since at least one full window has elapsed,
+ * the contribution to the previous window is the
+ * full window (window_size). */
+ delta = scale_exec_time(window_size, rq);
+ if (!is_idle_task(p) && !exiting_task(p))
+ p->ravg.prev_window = delta;
+
+ }
+ /*
+ * Rollover for normal runnable sum is done here by overwriting
+ * the values in prev_runnable_sum and curr_runnable_sum.
+ * Rollover for new task runnable sum has completed by previous
+ * if-else statement.
+ */
+ rq->prev_runnable_sum = delta;
+
+ /* Account piece of busy time in the current window. */
+ delta = scale_exec_time(wallclock - window_start, rq);
+ rq->curr_runnable_sum = delta;
+ if (!is_idle_task(p) && !exiting_task(p))
+ p->ravg.curr_window = delta;
+
+ return;
+ }
+
+ if (irqtime) {
+ /* account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. A new window
+ * has started and p is the current task so rollover is
+ * needed. The current task must be the idle task because
+ * irqtime is not accounted for any other task.
+ *
+ * Irqtime will be accounted each time we process IRQ activity
+ * after a period of idleness, so we know the IRQ busy time
+ * started at wallclock - irqtime. */
+
+ BUG_ON(!is_idle_task(p));
+ mark_start = wallclock - irqtime;
+
+ /* Roll window over. If IRQ busy time was just in the current
+ * window then that is all that need be accounted. */
+ rq->prev_runnable_sum = rq->curr_runnable_sum;
+ if (mark_start > window_start) {
+ rq->curr_runnable_sum = scale_exec_time(irqtime, rq);
+ return;
+ }
+
+ /* The IRQ busy time spanned multiple windows. Process the
+ * busy time preceding the current window start first. */
+ delta = window_start - mark_start;
+ if (delta > window_size)
+ delta = window_size;
+ delta = scale_exec_time(delta, rq);
+ rq->prev_runnable_sum += delta;
+
+ /* Process the remaining IRQ busy time in the current window. */
+ delta = wallclock - window_start;
+ rq->curr_runnable_sum = scale_exec_time(delta, rq);
+
+ return;
+ }
+
+ /* Every combination is handled above: reaching this point is a bug */
+ BUG();
+}
+
+/*
+ * Decide whether the time being closed out for @p by @event counts
+ * towards the task's demand. Returns 1 if it does, 0 otherwise.
+ */
+static int account_busy_for_task_demand(struct task_struct *p, int event)
+{
+	/* No need to bother updating task demand for exiting tasks
+	 * or the idle task. */
+	if (exiting_task(p) || is_idle_task(p))
+		return 0;
+
+	/* When a task is waking up it is completing a segment of non-busy
+	 * time. */
+	if (event == TASK_WAKE)
+		return 0;
+
+	/* Likewise, if wait time is not treated as busy time, then
+	 * when a task begins to run or is migrated, it is not running and
+	 * is completing a segment of non-busy time. */
+	if (!walt_account_wait_time &&
+	    (event == PICK_NEXT_TASK || event == TASK_MIGRATE))
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Called when new window is starting for a task, to record cpu usage over
+ * recently concluded window(s). Normally 'samples' should be 1. It can be > 1
+ * when, say, a real-time task runs without preemption for several windows at a
+ * stretch.
+ *
+ * @runtime is pushed @samples times onto the front of the task's history,
+ * the task's demand is recomputed according to walt_window_stats_policy,
+ * and, if the task is queued, the runqueue's cumulative runnable average
+ * is adjusted by the change in demand.
+ */
+static void update_history(struct rq *rq, struct task_struct *p,
+			 u32 runtime, int samples, int event)
+{
+	u32 *hist = &p->ravg.sum_history[0];
+	int ridx, widx;
+	u32 max_hist = 0, avg, demand;
+	u64 sum = 0;
+
+	/* Ignore windows where task had no activity */
+	if (!runtime || is_idle_task(p) || exiting_task(p) || !samples)
+		goto done;
+
+	/* Push new 'runtime' value onto stack */
+	widx = walt_ravg_hist_size - 1;
+	ridx = widx - samples;
+	for (; ridx >= 0; --widx, --ridx) {
+		hist[widx] = hist[ridx];
+		sum += hist[widx];
+		if (hist[widx] > max_hist)
+			max_hist = hist[widx];
+	}
+
+	for (widx = 0; widx < samples && widx < walt_ravg_hist_size; widx++) {
+		hist[widx] = runtime;
+		sum += hist[widx];
+		if (hist[widx] > max_hist)
+			max_hist = hist[widx];
+	}
+
+	p->ravg.sum = 0;
+
+	if (walt_window_stats_policy == WINDOW_STATS_RECENT) {
+		demand = runtime;
+	} else if (walt_window_stats_policy == WINDOW_STATS_MAX) {
+		demand = max_hist;
+	} else {
+		avg = div64_u64(sum, walt_ravg_hist_size);
+		if (walt_window_stats_policy == WINDOW_STATS_AVG)
+			demand = avg;
+		else
+			demand = max(avg, runtime);
+	}
+
+	/*
+	 * A throttled deadline sched class task gets dequeued without
+	 * changing p->on_rq. Since the dequeue decrements hmp stats
+	 * avoid decrementing it here again.
+	 *
+	 * The fixup helper takes the *change* in the task's load, so pass
+	 * the difference between the new demand and the demand currently
+	 * accounted on the runqueue. Passing the new demand itself would
+	 * leave the old demand in the total for good.
+	 */
+	if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||
+						!p->dl.dl_throttled))
+		fixup_cumulative_runnable_avg(rq, p,
+					      (s64)demand - task_load(p));
+
+	p->ravg.demand = demand;
+
+done:
+	trace_walt_update_history(rq, p, runtime, samples, event);
+	return;
+}
+
+/*
+ * Add @delta, scaled by scale_exec_time(), to the task's running window
+ * sum. The sum is capped at one full window.
+ */
+static void add_to_task_demand(struct rq *rq, struct task_struct *p,
+			       u64 delta)
+{
+	p->ravg.sum += scale_exec_time(delta, rq);
+
+	if (unlikely(p->ravg.sum > walt_ravg_window))
+		p->ravg.sum = walt_ravg_window;
+}
+
+/*
+ * Account cpu demand of task and/or update task's cpu demand history
+ *
+ * ms = p->ravg.mark_start;
+ * wc = wallclock
+ * ws = rq->window_start
+ *
+ * Three possibilities:
+ *
+ * a) Task event is contained within one window.
+ * window_start < mark_start < wallclock
+ *
+ * ws ms wc
+ * | | |
+ * V V V
+ * |---------------|
+ *
+ * In this case, p->ravg.sum is updated *iff* event is appropriate
+ * (ex: event == PUT_PREV_TASK)
+ *
+ * b) Task event spans two windows.
+ * mark_start < window_start < wallclock
+ *
+ * ms ws wc
+ * | | |
+ * V V V
+ * -----|-------------------
+ *
+ * In this case, p->ravg.sum is updated with (ws - ms) *iff* event
+ * is appropriate, then a new window sample is recorded followed
+ * by p->ravg.sum being set to (wc - ws) *iff* event is appropriate.
+ *
+ * c) Task event spans more than two windows.
+ *
+ * ms ws_tmp ws wc
+ * | | | |
+ * V V V V
+ * ---|-------|-------|-------|-------|------
+ * | |
+ * |<------ nr_full_windows ------>|
+ *
+ * In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff*
+ * event is appropriate, window sample of p->ravg.sum is recorded,
+ * 'nr_full_window' samples of window_size is also recorded *iff*
+ * event is appropriate and finally p->ravg.sum is set to (wc - ws)
+ * *iff* event is appropriate.
+ *
+ * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time()
+ * depends on it!
+ */
+static void update_task_demand(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock)
+{
+ u64 mark_start = p->ravg.mark_start;
+ u64 delta, window_start = rq->window_start;
+ int new_window, nr_full_windows;
+ u32 window_size = walt_ravg_window;
+
+ /* True when at least one window boundary lies after mark_start */
+ new_window = mark_start < window_start;
+ if (!account_busy_for_task_demand(p, event)) {
+ if (new_window)
+ /* If the time accounted isn't being accounted as
+ * busy time, and a new window started, only the
+ * previous window need be closed out with the
+ * pre-existing demand. Multiple windows may have
+ * elapsed, but since empty windows are dropped,
+ * it is not necessary to account those. */
+ update_history(rq, p, p->ravg.sum, 1, event);
+ return;
+ }
+
+ if (!new_window) {
+ /* The simple case - busy time contained within the existing
+ * window. */
+ add_to_task_demand(rq, p, wallclock - mark_start);
+ return;
+ }
+
+ /* Busy time spans at least two windows. Temporarily rewind
+ * window_start to first window boundary after mark_start. */
+ delta = window_start - mark_start;
+ nr_full_windows = div64_u64(delta, window_size);
+ window_start -= (u64)nr_full_windows * (u64)window_size;
+
+ /* Process (window_start - mark_start) first */
+ add_to_task_demand(rq, p, window_start - mark_start);
+
+ /* Push new sample(s) into task's demand history */
+ update_history(rq, p, p->ravg.sum, 1, event);
+ /* Each fully elapsed window is recorded as a whole busy window */
+ if (nr_full_windows)
+ update_history(rq, p, scale_exec_time(window_size, rq),
+ nr_full_windows, event);
+
+ /* Roll window_start back to current to process any remainder
+ * in current window. */
+ window_start += (u64)nr_full_windows * (u64)window_size;
+
+ /* Process (wallclock - window_start) next */
+ /* Only the local copy of mark_start is moved; p->ravg is untouched */
+ mark_start = window_start;
+ add_to_task_demand(rq, p, wallclock - mark_start);
+}
+
+/*
+ * Reflect task activity on its demand and cpu's busy time statistics.
+ *
+ * Does nothing while WALT is disabled or before @rq has a window start.
+ * Must be called with rq->lock held. On return p->ravg.mark_start is set
+ * to @wallclock.
+ */
+void walt_update_task_ravg(struct task_struct *p, struct rq *rq,
+			   int event, u64 wallclock, u64 irqtime)
+{
+	if (walt_disabled || !rq->window_start)
+		return;
+
+	lockdep_assert_held(&rq->lock);
+
+	update_window_start(rq, wallclock);
+
+	/* A task without a mark_start has nothing to account yet */
+	if (p->ravg.mark_start) {
+		update_task_demand(p, rq, event, wallclock);
+		update_cpu_busy_time(p, rq, event, wallclock, irqtime);
+	}
+
+	trace_walt_update_task_ravg(p, rq, event, wallclock, irqtime);
+
+	p->ravg.mark_start = wallclock;
+}
+
+/*
+ * Default (weak) CPU efficiency: SCHED_CAPACITY_SCALE for every CPU.
+ * Being __weak, it can be replaced by an architecture-provided definition.
+ */
+unsigned long __weak arch_get_cpu_efficiency(int cpu)
+{
+ return SCHED_CAPACITY_SCALE;
+}
+
+/*
+ * Record the efficiency of every possible CPU in its runqueue and refresh
+ * max_possible_efficiency / min_possible_efficiency from the extremes
+ * found. A zero extreme is ignored, leaving the previous value in place.
+ */
+void walt_init_cpu_efficiency(void)
+{
+	unsigned int highest = 0, lowest = UINT_MAX;
+	int cpu, efficiency;
+
+	for_each_possible_cpu(cpu) {
+		efficiency = arch_get_cpu_efficiency(cpu);
+		cpu_rq(cpu)->efficiency = efficiency;
+
+		if (efficiency > highest)
+			highest = efficiency;
+		if (efficiency < lowest)
+			lowest = efficiency;
+	}
+
+	if (highest)
+		max_possible_efficiency = highest;
+
+	if (lowest)
+		min_possible_efficiency = lowest;
+}
+
+/*
+ * Zero all WALT statistics of @p. If the task is exiting, the
+ * EXITING_TASK marker is kept in sum_history[0].
+ */
+static void reset_task_stats(struct task_struct *p)
+{
+	u32 first_sample = exiting_task(p) ? EXITING_TASK_MARKER : 0;
+
+	memset(&p->ravg, 0, sizeof(p->ravg));
+
+	/* Retain EXITING_TASK marker */
+	p->ravg.sum_history[0] = first_sample;
+}
+
+/*
+ * Start WALT accounting for @p: stamp its mark_start with the current
+ * WALT time. If the task's runqueue has no window start yet, the task's
+ * statistics are reset instead.
+ */
+void walt_mark_task_starting(struct task_struct *p)
+{
+	struct rq *rq = task_rq(p);
+
+	if (rq->window_start) {
+		p->ravg.mark_start = walt_ktime_clock();
+		return;
+	}
+
+	reset_task_stats(p);
+}
+
+/*
+ * Give @rq its initial window start, if it does not have one yet.
+ *
+ * The sync CPU takes the current WALT time; every other CPU copies the
+ * window start of the sync CPU's runqueue and clears its own busy-time
+ * sums. The task currently running on @rq gets that window start as its
+ * mark_start.
+ *
+ * NOTE(review): presumably called with rq->lock held, since the non-sync
+ * path drops that lock before taking both runqueue locks - confirm
+ * against the callers.
+ */
+void walt_set_window_start(struct rq *rq)
+{
+ int cpu = cpu_of(rq);
+ struct rq *sync_rq = cpu_rq(sync_cpu);
+
+ if (rq->window_start)
+ return;
+
+ if (cpu == sync_cpu) {
+ rq->window_start = walt_ktime_clock();
+ } else {
+ /*
+ * Drop rq->lock and take both locks via double_rq_lock();
+ * only sync_rq's lock is released again afterwards.
+ */
+ raw_spin_unlock(&rq->lock);
+ double_rq_lock(rq, sync_rq);
+ rq->window_start = cpu_rq(sync_cpu)->window_start;
+ rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
+ raw_spin_unlock(&sync_rq->lock);
+ }
+
+ rq->curr->ravg.mark_start = rq->window_start;
+}
+
+/* If @cpu is the current sync cpu, hand that role to the executing cpu. */
+void walt_migrate_sync_cpu(int cpu)
+{
+	if (cpu != sync_cpu)
+		return;
+
+	sync_cpu = smp_processor_id();
+}
+
+/*
+ * Move @p's contribution to the current and previous window busy time from
+ * its present runqueue to the runqueue of @new_cpu.
+ *
+ * Nothing is done for a task that is neither queued nor in TASK_WAKING, or
+ * for an exiting task. Both runqueue locks are taken here only for the
+ * TASK_WAKING case.
+ * NOTE(review): in the other case the caller presumably already holds the
+ * locks that walt_update_task_ravg() asserts - confirm.
+ */
+void walt_fixup_busy_time(struct task_struct *p, int new_cpu)
+{
+ struct rq *src_rq = task_rq(p);
+ struct rq *dest_rq = cpu_rq(new_cpu);
+ u64 wallclock;
+
+ if (!p->on_rq && p->state != TASK_WAKING)
+ return;
+
+ if (exiting_task(p)) {
+ return;
+ }
+
+ if (p->state == TASK_WAKING)
+ double_rq_lock(src_rq, dest_rq);
+
+ wallclock = walt_ktime_clock();
+
+ /* Bring both cpus' current tasks, then @p itself, up to wallclock */
+ walt_update_task_ravg(task_rq(p)->curr, task_rq(p),
+ TASK_UPDATE, wallclock, 0);
+ walt_update_task_ravg(dest_rq->curr, dest_rq,
+ TASK_UPDATE, wallclock, 0);
+
+ walt_update_task_ravg(p, task_rq(p), TASK_MIGRATE, wallclock, 0);
+
+ /* Transfer the task's window contributions from source to destination */
+ if (p->ravg.curr_window) {
+ src_rq->curr_runnable_sum -= p->ravg.curr_window;
+ dest_rq->curr_runnable_sum += p->ravg.curr_window;
+ }
+
+ if (p->ravg.prev_window) {
+ src_rq->prev_runnable_sum -= p->ravg.prev_window;
+ dest_rq->prev_runnable_sum += p->ravg.prev_window;
+ }
+
+ /* A sum that went negative (as s64) is clamped to zero with a warning */
+ if ((s64)src_rq->prev_runnable_sum < 0) {
+ src_rq->prev_runnable_sum = 0;
+ WARN_ON(1);
+ }
+ if ((s64)src_rq->curr_runnable_sum < 0) {
+ src_rq->curr_runnable_sum = 0;
+ WARN_ON(1);
+ }
+
+ trace_walt_migration_update_sum(src_rq, p);
+ trace_walt_migration_update_sum(dest_rq, p);
+
+ if (p->state == TASK_WAKING)
+ double_rq_unlock(src_rq, dest_rq);
+}
+
+/*
+ * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that
+ * least efficient cpu gets capacity of 1024.
+ *
+ * NOTE(review): divides by min_possible_efficiency; assumes it is non-zero.
+ */
+static unsigned long capacity_scale_cpu_efficiency(int cpu)
+{
+ return (1024 * cpu_rq(cpu)->efficiency) / min_possible_efficiency;
+}
+
+/*
+ * Return 'capacity' of a cpu in reference to cpu with lowest max_freq
+ * (min_max_freq), such that one with lowest max_freq gets capacity of 1024.
+ *
+ * NOTE(review): divides by min_max_freq; assumes it is non-zero.
+ */
+static unsigned long capacity_scale_cpu_freq(int cpu)
+{
+ return (1024 * cpu_rq(cpu)->max_freq) / min_max_freq;
+}
+
+/*
+ * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so
+ * that "most" efficient cpu gets a load_scale_factor of 1 (returned as 1024,
+ * since the result is scaled by 1024). The division rounds up.
+ */
+static unsigned long load_scale_cpu_efficiency(int cpu)
+{
+ return DIV_ROUND_UP(1024 * max_possible_efficiency,
+ cpu_rq(cpu)->efficiency);
+}
+
+/*
+ * Return load_scale_factor of a cpu in reference to cpu with best max_freq
+ * (max_possible_freq), so that one with best max_freq gets a load_scale_factor
+ * of 1 (returned as 1024, since the result is scaled by 1024). The division
+ * rounds up.
+ */
+static unsigned long load_scale_cpu_freq(int cpu)
+{
+ return DIV_ROUND_UP(1024 * max_possible_freq, cpu_rq(cpu)->max_freq);
+}
+
+/*
+ * Combine the efficiency and frequency capacity scales of @cpu.
+ *
+ * Starts from 1024 and applies each 1024-based scale in turn, shifting
+ * right by 10 after every multiplication to stay on the same scale.
+ */
+static int compute_capacity(int cpu)
+{
+ int capacity = 1024;
+
+ capacity *= capacity_scale_cpu_efficiency(cpu);
+ capacity >>= 10;
+
+ capacity *= capacity_scale_cpu_freq(cpu);
+ capacity >>= 10;
+
+ return capacity;
+}
+
+/*
+ * Combine the efficiency and frequency load scale factors of @cpu.
+ *
+ * Starts from 1024 and applies each 1024-based factor in turn, shifting
+ * right by 10 after every multiplication to stay on the same scale.
+ */
+static int compute_load_scale_factor(int cpu)
+{
+ int load_scale = 1024;
+
+ /*
+ * load_scale_factor accounts for the fact that task load
+ * is in reference to "best" performing cpu. Task's load will need to be
+ * scaled (up) by a factor to determine suitability to be placed on a
+ * (little) cpu.
+ */
+ load_scale *= load_scale_cpu_efficiency(cpu);
+ load_scale >>= 10;
+
+ load_scale *= load_scale_cpu_freq(cpu);
+ load_scale >>= 10;
+
+ return load_scale;
+}
+
+/*
+ * cpufreq policy notifier callback.
+ *
+ * Only CPUFREQ_NOTIFY events are handled. For these the per-rq frequency
+ * fields of all related cpus are refreshed from the policy, the global
+ * max_possible_freq / min_max_freq are updated, and - if the policy's max
+ * frequency changed - capacity and load_scale_factor are recomputed.
+ * Always returns 0.
+ */
+static int cpufreq_notifier_policy(struct notifier_block *nb,
+ unsigned long val, void *data)
+{
+ struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
+ int i, update_max = 0;
+ u64 highest_mpc = 0, highest_mplsf = 0;
+ const struct cpumask *cpus = policy->related_cpus;
+ unsigned int orig_min_max_freq = min_max_freq;
+ unsigned int orig_max_possible_freq = max_possible_freq;
+ /* Initialized to policy->max in case policy->related_cpus is empty! */
+ unsigned int orig_max_freq = policy->max;
+
+ if (val != CPUFREQ_NOTIFY)
+ return 0;
+
+ /*
+ * Refresh per-rq frequency data. orig_max_freq ends up holding the
+ * previous max_freq of the last cpu visited.
+ */
+ for_each_cpu(i, policy->related_cpus) {
+ cpumask_copy(&cpu_rq(i)->freq_domain_cpumask,
+ policy->related_cpus);
+ orig_max_freq = cpu_rq(i)->max_freq;
+ cpu_rq(i)->min_freq = policy->min;
+ cpu_rq(i)->max_freq = policy->max;
+ cpu_rq(i)->cur_freq = policy->cur;
+ cpu_rq(i)->max_possible_freq = policy->cpuinfo.max_freq;
+ }
+
+ max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq);
+ /* NOTE(review): 1 looks like the "not yet set" value of min_max_freq - confirm */
+ if (min_max_freq == 1)
+ min_max_freq = UINT_MAX;
+ min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq);
+ BUG_ON(!min_max_freq);
+ BUG_ON(!policy->max);
+
+ /* Changes to policy other than max_freq don't require any updates */
+ if (orig_max_freq == policy->max)
+ return 0;
+
+ /*
+ * A changed min_max_freq or max_possible_freq (possible during bootup)
+ * needs to trigger re-computation of load_scale_factor and capacity for
+ * all possible cpus (even those offline). It also needs to trigger
+ * re-computation of nr_big_task count on all online cpus.
+ *
+ * A changed rq->max_freq otoh needs to trigger re-computation of
+ * load_scale_factor and capacity for just the cluster of cpus involved.
+ * Since small task definition depends on max_load_scale_factor, a
+ * changed load_scale_factor of one cluster could influence
+ * classification of tasks in another cluster. Hence a changed
+ * rq->max_freq will need to trigger re-computation of nr_big_task
+ * count on all online cpus.
+ *
+ * While it should be sufficient for nr_big_tasks to be
+ * re-computed for only online cpus, we have inadequate context
+ * information here (in policy notifier) with regard to hotplug-safety
+ * context in which notification is issued. As a result, we can't use
+ * get_online_cpus() here, as it can lead to deadlock. Until cpufreq is
+ * fixed up to issue notification always in hotplug-safe context,
+ * re-compute nr_big_task for all possible cpus.
+ */
+
+ if (orig_min_max_freq != min_max_freq ||
+ orig_max_possible_freq != max_possible_freq) {
+ cpus = cpu_possible_mask;
+ update_max = 1;
+ }
+
+ /*
+ * Changed load_scale_factor can trigger reclassification of tasks as
+ * big or small. Make this change "atomic" so that tasks are accounted
+ * properly due to changed load_scale_factor
+ */
+ for_each_cpu(i, cpus) {
+ struct rq *rq = cpu_rq(i);
+
+ rq->capacity = compute_capacity(i);
+ rq->load_scale_factor = compute_load_scale_factor(i);
+
+ if (update_max) {
+ u64 mpc, mplsf;
+
+ /* Rescale from the cpu's max_freq to its max_possible_freq */
+ mpc = div_u64(((u64) rq->capacity) *
+ rq->max_possible_freq, rq->max_freq);
+ rq->max_possible_capacity = (int) mpc;
+
+ mplsf = div_u64(((u64) rq->load_scale_factor) *
+ rq->max_possible_freq, rq->max_freq);
+
+ /* mpc_mask collects every cpu that shares the highest mpc */
+ if (mpc > highest_mpc) {
+ highest_mpc = mpc;
+ cpumask_clear(&mpc_mask);
+ cpumask_set_cpu(i, &mpc_mask);
+ } else if (mpc == highest_mpc) {
+ cpumask_set_cpu(i, &mpc_mask);
+ }
+
+ if (mplsf > highest_mplsf)
+ highest_mplsf = mplsf;
+ }
+ }
+
+ if (update_max) {
+ max_possible_capacity = highest_mpc;
+ max_load_scale_factor = highest_mplsf;
+ }
+
+ return 0;
+}
+
+/*
+ * cpufreq transition notifier callback.
+ *
+ * On CPUFREQ_POSTCHANGE, if the reported frequency differs from what the
+ * notifying cpu's runqueue has recorded, every cpu in that runqueue's
+ * frequency domain gets its current task's WALT stats updated and its
+ * cur_freq set to the new frequency, under the runqueue lock.
+ * Always returns 0.
+ */
+static int cpufreq_notifier_trans(struct notifier_block *nb,
+		unsigned long val, void *data)
+{
+	struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data;
+	unsigned int new_freq = freq->new;
+	unsigned int lead_cpu = freq->cpu;
+	unsigned long irq_flags;
+	struct rq *rq;
+	int cpu;
+
+	if (val != CPUFREQ_POSTCHANGE)
+		return 0;
+
+	BUG_ON(!new_freq);
+
+	if (cpu_rq(lead_cpu)->cur_freq != new_freq) {
+		for_each_cpu(cpu, &cpu_rq(lead_cpu)->freq_domain_cpumask) {
+			rq = cpu_rq(cpu);
+
+			raw_spin_lock_irqsave(&rq->lock, irq_flags);
+			/* Close out accounting before the frequency changes */
+			walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
+					      walt_ktime_clock(), 0);
+			rq->cur_freq = new_freq;
+			raw_spin_unlock_irqrestore(&rq->lock, irq_flags);
+		}
+	}
+
+	return 0;
+}
+
+/* Notifier block routing cpufreq policy events to cpufreq_notifier_policy() */
+static struct notifier_block notifier_policy_block = {
+ .notifier_call = cpufreq_notifier_policy
+};
+
+/* Notifier block routing cpufreq transition events to cpufreq_notifier_trans() */
+static struct notifier_block notifier_trans_block = {
+ .notifier_call = cpufreq_notifier_trans
+};
+
+/*
+ * Register the WALT cpufreq policy and transition notifiers.
+ *
+ * The transition notifier is only registered if the policy notifier was
+ * registered successfully. Returns 0 on success, or the error code of the
+ * registration that failed, so that a failure is reported to the initcall
+ * machinery instead of being silently dropped.
+ */
+static int register_sched_callback(void)
+{
+	int ret;
+
+	ret = cpufreq_register_notifier(&notifier_policy_block,
+					CPUFREQ_POLICY_NOTIFIER);
+	if (ret)
+		return ret;
+
+	return cpufreq_register_notifier(&notifier_trans_block,
+					 CPUFREQ_TRANSITION_NOTIFIER);
+}
+
+/*
+ * cpufreq callbacks can be registered at core_initcall or later time.
+ * Any registration done prior to that is "forgotten" by cpufreq. See
+ * initialization of variable init_cpufreq_transition_notifier_list_called
+ * for further information.
+ */
+core_initcall(register_sched_callback);
+
+/*
+ * Seed the WALT statistics of a new task @p.
+ *
+ * The initial load is a percentage of walt_ravg_window: the calling task's
+ * (current's) init_load_pct when that is non-zero, otherwise
+ * sysctl_sched_walt_init_task_load_pct. @p's own init_load_pct is cleared,
+ * its ravg is zeroed, and both the demand and every history slot are set
+ * to that initial load.
+ */
+void walt_init_new_task_load(struct task_struct *p)
+{
+	/* Sample current's override before any of @p's fields are written */
+	u32 creator_pct = current->init_load_pct;
+	u64 pct = creator_pct ? (u64)creator_pct :
+			(u64)sysctl_sched_walt_init_task_load_pct;
+	u32 seed = div64_u64(pct * (u64)walt_ravg_window, 100);
+	int slot;
+
+	p->init_load_pct = 0;
+	memset(&p->ravg, 0, sizeof(struct ravg));
+
+	p->ravg.demand = seed;
+	for (slot = 0; slot < RAVG_HIST_SIZE_MAX; slot++)
+		p->ravg.sum_history[slot] = seed;
+}
diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h
new file mode 100644
index 000000000000..f56c4da16d0b
--- /dev/null
+++ b/kernel/sched/walt.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __WALT_H
+#define __WALT_H
+
+#ifdef CONFIG_SCHED_WALT
+
+/* WALT entry points, available when CONFIG_SCHED_WALT is set */
+void walt_update_task_ravg(struct task_struct *p, struct rq *rq, int event,
+ u64 wallclock, u64 irqtime);
+void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p);
+void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p);
+void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+ struct task_struct *p);
+void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+ struct task_struct *p);
+void walt_fixup_busy_time(struct task_struct *p, int new_cpu);
+void walt_init_new_task_load(struct task_struct *p);
+void walt_mark_task_starting(struct task_struct *p);
+void walt_set_window_start(struct rq *rq);
+void walt_migrate_sync_cpu(int cpu);
+void walt_init_cpu_efficiency(void);
+u64 walt_ktime_clock(void);
+void walt_account_irqtime(int cpu, struct task_struct *curr, u64 delta,
+ u64 wallclock);
+
+u64 walt_irqload(int cpu);
+int walt_cpu_high_irqload(int cpu);
+
+#else /* CONFIG_SCHED_WALT */
+
+/*
+ * No-op stand-ins used when WALT is compiled out.
+ * NOTE(review): there are no stand-ins here for walt_account_irqtime() or
+ * walt_irqload(); their callers are presumably guarded by CONFIG_SCHED_WALT
+ * - confirm.
+ */
+static inline void walt_update_task_ravg(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock, u64 irqtime) { }
+static inline void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { }
+static inline void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { }
+static inline void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+ struct task_struct *p) { }
+static inline void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+ struct task_struct *p) { }
+static inline void walt_fixup_busy_time(struct task_struct *p, int new_cpu) { }
+static inline void walt_init_new_task_load(struct task_struct *p) { }
+static inline void walt_mark_task_starting(struct task_struct *p) { }
+static inline void walt_set_window_start(struct rq *rq) { }
+static inline void walt_migrate_sync_cpu(int cpu) { }
+static inline void walt_init_cpu_efficiency(void) { }
+static inline u64 walt_ktime_clock(void) { return 0; }
+
+/* Without WALT no cpu is ever reported as having high irq load */
+#define walt_cpu_high_irqload(cpu) false
+
+#endif /* CONFIG_SCHED_WALT */
+
+/* Declared regardless of CONFIG_SCHED_WALT */
+extern unsigned int walt_disabled;
+
+#endif
diff --git a/kernel/sys.c b/kernel/sys.c
index 6c4e9b533258..23559571068f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -41,6 +41,8 @@
#include <linux/syscore_ops.h>
#include <linux/version.h>
#include <linux/ctype.h>
+#include <linux/mm.h>
+#include <linux/mempolicy.h>
#include <linux/compat.h>
#include <linux/syscalls.h>
@@ -2070,6 +2072,153 @@ static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
}
#endif
+#ifdef CONFIG_MMU
+/*
+ * Apply the anonymous name @name_addr to the part of @vma covered by
+ * [@start, @end).
+ *
+ * First tries to merge the range with its neighbours via vma_merge(); if
+ * that is not possible the vma is split at @start and/or @end as needed.
+ * *@prev is updated to the vma that now covers the range. The name is only
+ * stored on vmas without a backing file.
+ *
+ * Returns 0 on success or the error from split_vma(), with -ENOMEM
+ * reported as -EAGAIN.
+ */
+static int prctl_update_vma_anon_name(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end,
+ const char __user *name_addr)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ int error = 0;
+ pgoff_t pgoff;
+
+ /* Same name pointer already set: nothing to change */
+ if (name_addr == vma_get_anon_name(vma)) {
+ *prev = vma;
+ goto out;
+ }
+
+ /* Page offset corresponding to @start within this mapping */
+ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+ *prev = vma_merge(mm, *prev, start, end, vma->vm_flags, vma->anon_vma,
+ vma->vm_file, pgoff, vma_policy(vma),
+ vma->vm_userfaultfd_ctx, name_addr);
+ if (*prev) {
+ vma = *prev;
+ goto success;
+ }
+
+ *prev = vma;
+
+ if (start != vma->vm_start) {
+ error = split_vma(mm, vma, start, 1);
+ if (error)
+ goto out;
+ }
+
+ if (end != vma->vm_end) {
+ error = split_vma(mm, vma, end, 0);
+ if (error)
+ goto out;
+ }
+
+success:
+ /* File-backed mappings keep no anonymous name */
+ if (!vma->vm_file)
+ vma->anon_name = name_addr;
+
+out:
+ if (error == -ENOMEM)
+ error = -EAGAIN;
+ return error;
+}
+
+/*
+ * Walk every vma overlapping [@start, @end) and apply the anonymous name
+ * whose user pointer is passed in @arg.
+ *
+ * Returns 0 when the whole range was mapped and updated, -ENOMEM when part
+ * of the range was unmapped, or the error from
+ * prctl_update_vma_anon_name().
+ */
+static int prctl_set_vma_anon_name(unsigned long start, unsigned long end,
+ unsigned long arg)
+{
+ unsigned long tmp;
+ struct vm_area_struct *vma, *prev;
+ int unmapped_error = 0;
+ int error = -EINVAL;
+
+ /*
+ * If the interval [start,end) covers some unmapped address
+ * ranges, just ignore them, but return -ENOMEM at the end.
+ * - this matches the handling in madvise.
+ */
+ vma = find_vma_prev(current->mm, start, &prev);
+ if (vma && start > vma->vm_start)
+ prev = vma;
+
+ for (;;) {
+ /* Still start < end. */
+ error = -ENOMEM;
+ if (!vma)
+ return error;
+
+ /* Here start < (end|vma->vm_end). */
+ if (start < vma->vm_start) {
+ unmapped_error = -ENOMEM;
+ start = vma->vm_start;
+ if (start >= end)
+ return error;
+ }
+
+ /* Here vma->vm_start <= start < (end|vma->vm_end) */
+ tmp = vma->vm_end;
+ if (end < tmp)
+ tmp = end;
+
+ /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
+ error = prctl_update_vma_anon_name(vma, &prev, start, tmp,
+ (const char __user *)arg);
+ if (error)
+ return error;
+ start = tmp;
+ if (prev && start < prev->vm_end)
+ start = prev->vm_end;
+ error = unmapped_error;
+ if (start >= end)
+ return error;
+ if (prev)
+ vma = prev->vm_next;
+ else /* NOTE(review): fallback kept from madvise; prev looks non-NULL after a successful update here - confirm */
+ vma = find_vma(current->mm, start);
+ }
+}
+
+/*
+ * Back end of prctl(PR_SET_VMA, ...).
+ *
+ * @start must be page aligned; @len_in is rounded up to a whole number of
+ * pages. Returns -EINVAL for a misaligned start, a length that wraps to
+ * zero when rounded, a range that wraps the address space, or an unknown
+ * @opt. An empty range succeeds without taking mmap_sem. Otherwise the
+ * request is carried out with mmap_sem held for writing.
+ */
+static int prctl_set_vma(unsigned long opt, unsigned long start,
+		unsigned long len_in, unsigned long arg)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long len = (len_in + ~PAGE_MASK) & PAGE_MASK;
+	unsigned long end = start + len;
+	int error = -EINVAL;
+
+	/*
+	 * Reject a misaligned start, a len that was rounded up from small
+	 * -ve to zero, and a range that wraps around.
+	 */
+	if ((start & ~PAGE_MASK) || (len_in && !len) || end < start)
+		return -EINVAL;
+
+	if (end == start)
+		return 0;
+
+	down_write(&mm->mmap_sem);
+
+	if (opt == PR_SET_VMA_ANON_NAME)
+		error = prctl_set_vma_anon_name(start, end, arg);
+
+	up_write(&mm->mmap_sem);
+
+	return error;
+}
+#else /* CONFIG_MMU */
+/* Without CONFIG_MMU every PR_SET_VMA request is rejected with -EINVAL */
+static int prctl_set_vma(unsigned long opt, unsigned long start,
+ unsigned long len_in, unsigned long arg)
+{
+ return -EINVAL;
+}
+#endif
+
int __weak arch_prctl_spec_ctrl_get(struct task_struct *t, unsigned long which)
{
return -EINVAL;
@@ -2289,6 +2438,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
return -EINVAL;
error = arch_prctl_spec_ctrl_set(me, arg2, arg3);
break;
+ case PR_SET_VMA:
+ error = prctl_set_vma(arg2, arg3, arg4, arg5);
+ break;
default:
error = -EINVAL;
break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 23f658d311c0..a053c41d0afb 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -308,6 +308,57 @@ static struct ctl_table kern_table[] = {
.extra2 = &max_sched_granularity_ns,
},
{
+ .procname = "sched_sync_hint_enable",
+ .data = &sysctl_sched_sync_hint_enable,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#ifdef CONFIG_SCHED_WALT
+ {
+ .procname = "sched_use_walt_cpu_util",
+ .data = &sysctl_sched_use_walt_cpu_util,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sched_use_walt_task_util",
+ .data = &sysctl_sched_use_walt_task_util,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sched_walt_init_task_load_pct",
+ .data = &sysctl_sched_walt_init_task_load_pct,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sched_walt_cpu_high_irqload",
+ .data = &sysctl_sched_walt_cpu_high_irqload,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+ {
+ .procname = "sched_initial_task_util",
+ .data = &sysctl_sched_initial_task_util,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sched_cstate_aware",
+ .data = &sysctl_sched_cstate_aware,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "sched_wakeup_granularity_ns",
.data = &sysctl_sched_wakeup_granularity,
.maxlen = sizeof(unsigned int),
@@ -450,6 +501,21 @@ static struct ctl_table kern_table[] = {
.extra1 = &one,
},
#endif
+#ifdef CONFIG_SCHED_TUNE
+ {
+ .procname = "sched_cfs_boost",
+ .data = &sysctl_sched_cfs_boost,
+ .maxlen = sizeof(sysctl_sched_cfs_boost),
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+ .mode = 0444,
+#else
+ .mode = 0644,
+#endif
+ .proc_handler = &sysctl_sched_cfs_boost_handler,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
+ },
+#endif
#ifdef CONFIG_PROVE_LOCKING
{
.procname = "prove_locking",
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index b6bebe28a3e0..65aafadad8e2 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1009,6 +1009,18 @@ ktime_t tick_nohz_get_sleep_length(void)
return ts->sleep_length;
}
+/**
+ * tick_nohz_get_idle_calls - read this cpu's idle calls counter
+ *
+ * Returns the idle_calls field of the executing cpu's tick_sched.
+ * Called from the schedutil frequency scaling governor in scheduler context.
+ */
+unsigned long tick_nohz_get_idle_calls(void)
+{
+	return this_cpu_ptr(&tick_cpu_sched)->idle_calls;
+}
+
static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
{
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index d831827d7ab0..234d3e4da597 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -442,6 +442,35 @@ u64 ktime_get_raw_fast_ns(void)
}
EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns);
+/**
+ * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock.
+ *
+ * To keep it NMI safe since we're accessing from tracing, we're not using a
+ * separate timekeeper with updates to monotonic clock and boot offset
+ * protected with seqlocks. This has the following minor side effects:
+ *
+ * (1) It's possible that a timestamp be taken after the boot offset is updated
+ * but before the timekeeper is updated. If this happens, the new boot offset
+ * is added to the old timekeeping making the clock appear to update slightly
+ * earlier:
+ * CPU 0 CPU 1
+ * timekeeping_inject_sleeptime64()
+ * __timekeeping_inject_sleeptime(tk, delta);
+ * timestamp();
+ * timekeeping_update(tk, TK_CLEAR_NTP...);
+ *
+ * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be
+ * partially updated. Since the tk->offs_boot update is a rare event, this
+ * should be a rare occurrence which postprocessing should be able to handle.
+ *
+ * Returns the fast monotonic clock plus the current boot offset, in
+ * nanoseconds.
+ */
+u64 notrace ktime_get_boot_fast_ns(void)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+
+ return (ktime_get_mono_fast_ns() + ktime_to_ns(tk->offs_boot));
+}
+EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns);
+
/* Suspend-time cycles value for halted fast timekeeper. */
static cycle_t cycles_at_suspend;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 2a96b063d659..da5768901a0d 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -72,6 +72,9 @@ config EVENT_TRACING
select CONTEXT_SWITCH_TRACER
bool
+config GPU_TRACEPOINTS
+ bool
+
config CONTEXT_SWITCH_TRACER
bool
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index e57980845549..b4eaf9c9c610 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -67,6 +67,7 @@ obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
endif
obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o
+obj-$(CONFIG_GPU_TRACEPOINTS) += gpu-traces.o
obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
diff --git a/kernel/trace/gpu-traces.c b/kernel/trace/gpu-traces.c
new file mode 100644
index 000000000000..a4b3f00faee3
--- /dev/null
+++ b/kernel/trace/gpu-traces.c
@@ -0,0 +1,23 @@
+/*
+ * GPU tracepoints
+ *
+ * Copyright (C) 2013 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/module.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/gpu.h>
+
+EXPORT_TRACEPOINT_SYMBOL(gpu_sched_switch);
+EXPORT_TRACEPOINT_SYMBOL(gpu_job_enqueue);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a47339b156ce..69f9ebbb05c1 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1125,6 +1125,7 @@ static struct {
{ trace_clock, "perf", 1 },
{ ktime_get_mono_fast_ns, "mono", 1 },
{ ktime_get_raw_fast_ns, "mono_raw", 1 },
+ { ktime_get_boot_fast_ns, "boot", 1 },
ARCH_TRACE_CLOCKS
};
@@ -1603,6 +1604,7 @@ void tracing_reset_all_online_cpus(void)
#define SAVED_CMDLINES_DEFAULT 128
#define NO_CMDLINE_MAP UINT_MAX
+static unsigned saved_tgids[SAVED_CMDLINES_DEFAULT];
static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
struct saved_cmdlines_buffer {
unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
@@ -1841,7 +1843,7 @@ static int trace_save_cmdline(struct task_struct *tsk)
}
set_cmdline(idx, tsk->comm);
-
+ saved_tgids[idx] = tsk->tgid;
arch_spin_unlock(&trace_cmdline_lock);
return 1;
@@ -1884,6 +1886,25 @@ void trace_find_cmdline(int pid, char comm[])
preempt_enable();
}
+/*
+ * trace_find_tgid - look up the thread group id saved for @pid
+ *
+ * Returns the tgid recorded next to the saved cmdline of @pid, or -1 when
+ * @pid is outside the range tracked by map_pid_to_cmdline[], has no saved
+ * cmdline slot, or maps to a slot that saved_tgids[] cannot hold.
+ */
+int trace_find_tgid(int pid)
+{
+	unsigned map;
+	int tgid = -1;
+
+	/* map_pid_to_cmdline[] only has PID_MAX_DEFAULT + 1 entries */
+	if (pid < 0 || pid > PID_MAX_DEFAULT)
+		return -1;
+
+	preempt_disable();
+	arch_spin_lock(&trace_cmdline_lock);
+
+	map = savedcmd->map_pid_to_cmdline[pid];
+	/*
+	 * saved_tgids[] has a fixed size, so never index past its end even
+	 * if the slot number is larger than expected.
+	 */
+	if (map != NO_CMDLINE_MAP && map < ARRAY_SIZE(saved_tgids))
+		tgid = saved_tgids[map];
+
+	arch_spin_unlock(&trace_cmdline_lock);
+	preempt_enable();
+
+	return tgid;
+}
+
void tracing_record_cmdline(struct task_struct *tsk)
{
if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on())
@@ -2937,6 +2958,13 @@ static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m)
"# | | | | |\n");
}
+/*
+ * Emit the event info followed by the two column-header lines that
+ * include a TGID column.
+ */
+static void print_func_help_header_tgid(struct trace_buffer *buf, struct seq_file *m)
+{
+ print_event_info(buf, m);
+ seq_puts(m, "# TASK-PID TGID CPU# TIMESTAMP FUNCTION\n");
+ seq_puts(m, "# | | | | | |\n");
+}
+
static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m)
{
print_event_info(buf, m);
@@ -2949,6 +2977,18 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file
"# | | | |||| | |\n");
}
+/*
+ * Emit the event info followed by the irq-info legend and the column
+ * headers that include a TGID column.
+ */
+static void print_func_help_header_irq_tgid(struct trace_buffer *buf, struct seq_file *m)
+{
+ print_event_info(buf, m);
+ seq_puts(m, "# _-----=> irqs-off\n");
+ seq_puts(m, "# / _----=> need-resched\n");
+ seq_puts(m, "# | / _---=> hardirq/softirq\n");
+ seq_puts(m, "# || / _--=> preempt-depth\n");
+ seq_puts(m, "# ||| / delay\n");
+ seq_puts(m, "# TASK-PID TGID CPU# |||| TIMESTAMP FUNCTION\n");
+ seq_puts(m, "# | | | | |||| | |\n");
+}
+
void
print_trace_header(struct seq_file *m, struct trace_iterator *iter)
{
@@ -3261,9 +3301,15 @@ void trace_default_header(struct seq_file *m)
} else {
if (!(trace_flags & TRACE_ITER_VERBOSE)) {
if (trace_flags & TRACE_ITER_IRQ_INFO)
- print_func_help_header_irq(iter->trace_buffer, m);
+ if (trace_flags & TRACE_ITER_TGID)
+ print_func_help_header_irq_tgid(iter->trace_buffer, m);
+ else
+ print_func_help_header_irq(iter->trace_buffer, m);
else
- print_func_help_header(iter->trace_buffer, m);
+ if (trace_flags & TRACE_ITER_TGID)
+ print_func_help_header_tgid(iter->trace_buffer, m);
+ else
+ print_func_help_header(iter->trace_buffer, m);
}
}
}
@@ -4594,6 +4640,50 @@ static void trace_insert_enum_map(struct module *mod,
}
static ssize_t
+/*
+ * Read handler for the "saved_tgids" file: one "<pid> <tgid>\n" line for
+ * every saved cmdline slot that holds a pid.
+ *
+ * The text is built in a temporary buffer and copied to user space with
+ * simple_read_from_buffer(). Returns the number of bytes copied, or a
+ * negative error code (-ENOMEM if the temporary buffer cannot be
+ * allocated).
+ */
+tracing_saved_tgids_read(struct file *file, char __user *ubuf,
+			 size_t cnt, loff_t *ppos)
+{
+	/* Room per entry for two decimal ints, a separator and a newline */
+	const size_t buf_size = SAVED_CMDLINES_DEFAULT * (16 + 1 + 16);
+	unsigned int nr_slots;
+	unsigned int i;
+	char *file_buf;
+	ssize_t ret;
+	int len = 0;
+
+	file_buf = kmalloc(buf_size, GFP_KERNEL);
+	if (!file_buf)
+		return -ENOMEM;
+
+	/*
+	 * Never walk past the end of map_cmdline_to_pid[] (it holds
+	 * cmdline_num entries) nor past what saved_tgids[] and the output
+	 * buffer are sized for.
+	 */
+	nr_slots = min_t(unsigned int, savedcmd->cmdline_num,
+			 SAVED_CMDLINES_DEFAULT);
+
+	for (i = 0; i < nr_slots; i++) {
+		int pid = savedcmd->map_cmdline_to_pid[i];
+
+		if (pid == -1 || pid == NO_CMDLINE_MAP)
+			continue;
+
+		/* scnprintf() cannot write beyond the space that is left */
+		len += scnprintf(file_buf + len, buf_size - len, "%d %d\n",
+				 pid, trace_find_tgid(pid));
+	}
+
+	ret = simple_read_from_buffer(ubuf, cnt, ppos, file_buf, len);
+
+	kfree(file_buf);
+
+	return ret;
+}
+
+/* File operations for the read-only "saved_tgids" tracefs file */
+static const struct file_operations tracing_saved_tgids_fops = {
+ .open = tracing_open_generic,
+ .read = tracing_saved_tgids_read,
+ .llseek = generic_file_llseek,
+};
+
+static ssize_t
tracing_set_trace_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
@@ -7227,6 +7317,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
trace_create_file("trace_marker", 0220, d_tracer,
tr, &tracing_mark_fops);
+ trace_create_file("saved_tgids", 0444, d_tracer,
+ tr, &tracing_saved_tgids_fops);
+
trace_create_file("trace_clock", 0644, d_tracer, tr,
&trace_clock_fops);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b0d8576c27ae..d4c66331d5f5 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -690,6 +690,7 @@ extern cycle_t ftrace_now(int cpu);
extern void trace_find_cmdline(int pid, char comm[]);
extern void trace_event_follow_fork(struct trace_array *tr, bool enable);
+extern int trace_find_tgid(int pid);
#ifdef CONFIG_DYNAMIC_FTRACE
extern unsigned long ftrace_update_tot_cnt;
@@ -1009,7 +1010,8 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
FUNCTION_FLAGS \
FGRAPH_FLAGS \
STACK_FLAGS \
- BRANCH_FLAGS
+ BRANCH_FLAGS \
+ C(TGID, "print-tgid"),
/*
* By defining C, we can make TRACE_FLAGS a list of bit names
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 01e71812e174..7461d51342d7 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -65,6 +65,9 @@ struct fgraph_data {
#define TRACE_GRAPH_INDENT 2
+/* Flag options */
+#define TRACE_GRAPH_PRINT_FLAT 0x80
+
static unsigned int max_depth;
static struct tracer_opt trace_opts[] = {
@@ -88,6 +91,8 @@ static struct tracer_opt trace_opts[] = {
{ TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) },
/* Include time within nested functions */
{ TRACER_OPT(graph-time, TRACE_GRAPH_GRAPH_TIME) },
+ /* Use standard trace formatting rather than hierarchical */
+ { TRACER_OPT(funcgraph-flat, TRACE_GRAPH_PRINT_FLAT) },
{ } /* Empty entry */
};
@@ -1246,6 +1251,9 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
int cpu = iter->cpu;
int ret;
+ if (flags & TRACE_GRAPH_PRINT_FLAT)
+ return TRACE_TYPE_UNHANDLED;
+
if (data && per_cpu_ptr(data->cpu_data, cpu)->ignore) {
per_cpu_ptr(data->cpu_data, cpu)->ignore = 0;
return TRACE_TYPE_HANDLED;
@@ -1303,13 +1311,6 @@ print_graph_function(struct trace_iterator *iter)
return print_graph_function_flags(iter, tracer_flags.val);
}
-static enum print_line_t
-print_graph_function_event(struct trace_iterator *iter, int flags,
- struct trace_event *event)
-{
- return print_graph_function(iter);
-}
-
static void print_lat_header(struct seq_file *s, u32 flags)
{
static const char spaces[] = " " /* 16 spaces */
@@ -1378,6 +1379,11 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags)
struct trace_iterator *iter = s->private;
struct trace_array *tr = iter->tr;
+ if (flags & TRACE_GRAPH_PRINT_FLAT) {
+ trace_default_header(s);
+ return;
+ }
+
if (!(tr->trace_flags & TRACE_ITER_CONTEXT_INFO))
return;
@@ -1459,19 +1465,6 @@ func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
return 0;
}
-static struct trace_event_functions graph_functions = {
- .trace = print_graph_function_event,
-};
-
-static struct trace_event graph_trace_entry_event = {
- .type = TRACE_GRAPH_ENT,
- .funcs = &graph_functions,
-};
-
-static struct trace_event graph_trace_ret_event = {
- .type = TRACE_GRAPH_RET,
- .funcs = &graph_functions
-};
static struct tracer graph_trace __tracer_data = {
.name = "function_graph",
@@ -1548,16 +1541,6 @@ static __init int init_graph_trace(void)
{
max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
- if (!register_trace_event(&graph_trace_entry_event)) {
- pr_warn("Warning: could not register graph trace events\n");
- return 1;
- }
-
- if (!register_trace_event(&graph_trace_ret_event)) {
- pr_warn("Warning: could not register graph trace events\n");
- return 1;
- }
-
return register_tracer(&graph_trace);
}
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 3fc20422c166..034675950649 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -530,11 +530,21 @@ int trace_print_context(struct trace_iterator *iter)
unsigned long long t;
unsigned long secs, usec_rem;
char comm[TASK_COMM_LEN];
+ int tgid;
trace_find_cmdline(entry->pid, comm);
- trace_seq_printf(s, "%16s-%-5d [%03d] ",
- comm, entry->pid, iter->cpu);
+ trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);
+
+ if (tr->trace_flags & TRACE_ITER_TGID) {
+ tgid = trace_find_tgid(entry->pid);
+ if (tgid < 0)
+ trace_seq_puts(s, "(-----) ");
+ else
+ trace_seq_printf(s, "(%5d) ", tgid);
+ }
+
+ trace_seq_printf(s, "[%03d] ", iter->cpu);
if (tr->trace_flags & TRACE_ITER_IRQ_INFO)
trace_print_lat_fmt(s, entry);
@@ -849,6 +859,174 @@ static struct trace_event trace_fn_event = {
.funcs = &trace_fn_funcs,
};
+/* TRACE_GRAPH_ENT: output handlers for function-graph entry records */
+static enum print_line_t trace_graph_ent_trace(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{ /* Default text form: writes "graph_ent: func=", then whatever seq_print_ip_sym() prints for the address, then a newline. */
+ struct trace_seq *s = &iter->seq;
+ struct ftrace_graph_ent_entry *field;
+
+ trace_assign_type(field, iter->ent);
+
+ trace_seq_puts(s, "graph_ent: func=");
+ if (trace_seq_has_overflowed(s)) /* stop as soon as the seq buffer reports overflow */
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ if (!seq_print_ip_sym(s, field->graph_ent.func, flags)) /* a zero return is treated as a truncated line */
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ trace_seq_puts(s, "\n");
+ if (trace_seq_has_overflowed(s))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_graph_ent_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{ /* Raw form: one line holding the function address in hex and the call depth; flags and event are not used. */
+ struct ftrace_graph_ent_entry *field;
+
+ trace_assign_type(field, iter->ent);
+
+ trace_seq_printf(&iter->seq, "%lx %d\n",
+ field->graph_ent.func,
+ field->graph_ent.depth);
+ if (trace_seq_has_overflowed(&iter->seq)) /* report a partial line when the record did not fit */
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_graph_ent_hex(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{ /* Hex form: emits func, then depth, through SEQ_PUT_HEX_FIELD. */
+ struct ftrace_graph_ent_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ SEQ_PUT_HEX_FIELD(s, field->graph_ent.func);
+ SEQ_PUT_HEX_FIELD(s, field->graph_ent.depth);
+
+ return TRACE_TYPE_HANDLED; /* NOTE(review): this body has no overflow check, unlike the text and raw handlers - confirm that is intended */
+}
+
+static enum print_line_t trace_graph_ent_bin(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{ /* Binary form: emits func, then depth, through SEQ_PUT_FIELD. */
+ struct ftrace_graph_ent_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ SEQ_PUT_FIELD(s, field->graph_ent.func);
+ SEQ_PUT_FIELD(s, field->graph_ent.depth);
+
+ return TRACE_TYPE_HANDLED; /* always reports the record as handled */
+}
+
+static struct trace_event_functions trace_graph_ent_funcs = { /* one handler per output mode for TRACE_GRAPH_ENT records */
+ .trace = trace_graph_ent_trace,
+ .raw = trace_graph_ent_raw,
+ .hex = trace_graph_ent_hex,
+ .binary = trace_graph_ent_bin,
+};
+
+static struct trace_event trace_graph_ent_event = { /* ties the TRACE_GRAPH_ENT type to its handlers; referenced from the events[] table in this file */
+ .type = TRACE_GRAPH_ENT,
+ .funcs = &trace_graph_ent_funcs,
+};
+
+/* TRACE_GRAPH_RET: output handlers for function-graph return records */
+static enum print_line_t trace_graph_ret_trace(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{ /* Default text form: writes "graph_ret: func=", then whatever seq_print_ip_sym() prints for the address, then a newline. */
+ struct trace_seq *s = &iter->seq;
+ struct trace_entry *entry = iter->ent;
+ struct ftrace_graph_ret_entry *field;
+
+ trace_assign_type(field, entry);
+
+ trace_seq_puts(s, "graph_ret: func=");
+ if (trace_seq_has_overflowed(s)) /* stop as soon as the seq buffer reports overflow */
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ if (!seq_print_ip_sym(s, field->ret.func, flags)) /* a zero return is treated as a truncated line */
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ trace_seq_puts(s, "\n");
+ if (trace_seq_has_overflowed(s))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_graph_ret_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{ /* Raw form: one line with func (hex), calltime, rettime, overrun and depth, in that order. */
+ struct ftrace_graph_ret_entry *field;
+
+ trace_assign_type(field, iter->ent);
+
+ trace_seq_printf(&iter->seq, "%lx %lld %lld %ld %d\n", /* NOTE(review): signed conversions are used; check them against the field types in struct ftrace_graph_ret */
+ field->ret.func,
+ field->ret.calltime,
+ field->ret.rettime,
+ field->ret.overrun,
+ field->ret.depth);
+ if (trace_seq_has_overflowed(&iter->seq)) /* report a partial line when the record did not fit */
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_graph_ret_hex(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{ /* Hex form: emits func, calltime, rettime, overrun and depth through SEQ_PUT_HEX_FIELD. */
+ struct ftrace_graph_ret_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ SEQ_PUT_HEX_FIELD(s, field->ret.func);
+ SEQ_PUT_HEX_FIELD(s, field->ret.calltime);
+ SEQ_PUT_HEX_FIELD(s, field->ret.rettime);
+ SEQ_PUT_HEX_FIELD(s, field->ret.overrun);
+ SEQ_PUT_HEX_FIELD(s, field->ret.depth);
+
+ return TRACE_TYPE_HANDLED; /* NOTE(review): this body has no overflow check, unlike the text and raw handlers - confirm that is intended */
+}
+
+static enum print_line_t trace_graph_ret_bin(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{ /* Binary form: emits func, calltime, rettime, overrun and depth through SEQ_PUT_FIELD. */
+ struct ftrace_graph_ret_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ SEQ_PUT_FIELD(s, field->ret.func);
+ SEQ_PUT_FIELD(s, field->ret.calltime);
+ SEQ_PUT_FIELD(s, field->ret.rettime);
+ SEQ_PUT_FIELD(s, field->ret.overrun);
+ SEQ_PUT_FIELD(s, field->ret.depth);
+
+ return TRACE_TYPE_HANDLED; /* always reports the record as handled */
+}
+
+static struct trace_event_functions trace_graph_ret_funcs = { /* one handler per output mode for TRACE_GRAPH_RET records */
+ .trace = trace_graph_ret_trace,
+ .raw = trace_graph_ret_raw,
+ .hex = trace_graph_ret_hex,
+ .binary = trace_graph_ret_bin,
+};
+
+static struct trace_event trace_graph_ret_event = { /* ties the TRACE_GRAPH_RET type to its handlers; referenced from the events[] table in this file */
+ .type = TRACE_GRAPH_RET,
+ .funcs = &trace_graph_ret_funcs,
+};
+
/* TRACE_CTX an TRACE_WAKE */
static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
char *delim)
@@ -1291,6 +1469,8 @@ static struct trace_event trace_print_event = {
static struct trace_event *events[] __initdata = {
&trace_fn_event,
+ &trace_graph_ent_event,
+ &trace_graph_ret_event,
&trace_ctx_event,
&trace_wake_event,
&trace_stack_event,
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 58a22ca10f33..ba7670cb7402 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -922,6 +922,15 @@ config SCHED_INFO
bool
default n
+config PANIC_ON_RT_THROTTLING
+ bool "Panic on RT throttling"
+ help
+ Say Y here to enable the kernel to panic when a realtime
+ runqueue is throttled. This may be useful for detecting
+ and debugging RT throttling issues.
+
+ Say N if unsure.
+
config SCHEDSTATS
bool "Collect scheduler statistics"
depends on DEBUG_KERNEL && PROC_FS
diff --git a/mm/madvise.c b/mm/madvise.c
index 4a01c4bd786c..6849e4e2be22 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -109,7 +109,7 @@ static long madvise_behavior(struct vm_area_struct *vma,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx);
+ vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
if (*prev) {
vma = *prev;
goto success;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e21d9b44247b..d80898272227 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -752,7 +752,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
((vmstart - vma->vm_start) >> PAGE_SHIFT);
prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff,
- new_pol, vma->vm_userfaultfd_ctx);
+ new_pol, vma->vm_userfaultfd_ctx,
+ vma_get_anon_name(vma));
if (prev) {
vma = prev;
next = vma->vm_next;
diff --git a/mm/mlock.c b/mm/mlock.c
index f0505692a5f4..9cdd063b7d32 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -529,7 +529,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx);
+ vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
if (*prev) {
vma = *prev;
goto success;
diff --git a/mm/mmap.c b/mm/mmap.c
index 283755645d17..131641484303 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -970,7 +970,8 @@ again:
*/
static inline int is_mergeable_vma(struct vm_area_struct *vma,
struct file *file, unsigned long vm_flags,
- struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+ const char __user *anon_name)
{
/*
* VM_SOFTDIRTY should not prevent from VMA merging, if we
@@ -988,6 +989,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
return 0;
if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
return 0;
+ if (vma_get_anon_name(vma) != anon_name)
+ return 0;
return 1;
}
@@ -1020,9 +1023,10 @@ static int
can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t vm_pgoff,
- struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+ const char __user *anon_name)
{
- if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
+ if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
if (vma->vm_pgoff == vm_pgoff)
return 1;
@@ -1041,9 +1045,10 @@ static int
can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t vm_pgoff,
- struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+ const char __user *anon_name)
{
- if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
+ if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
pgoff_t vm_pglen;
vm_pglen = vma_pages(vma);
@@ -1054,9 +1059,9 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
}
/*
- * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
- * whether that can be merged with its predecessor or its successor.
- * Or both (it neatly fills a hole).
+ * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
+ * figure out whether that can be merged with its predecessor or its
+ * successor. Or both (it neatly fills a hole).
*
* In most cases - when called for mmap, brk or mremap - [addr,end) is
* certain not to be mapped by the time vma_merge is called; but when
@@ -1098,7 +1103,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
unsigned long end, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t pgoff, struct mempolicy *policy,
- struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+ const char __user *anon_name)
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
struct vm_area_struct *area, *next;
@@ -1131,7 +1137,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
mpol_equal(vma_policy(prev), policy) &&
can_vma_merge_after(prev, vm_flags,
anon_vma, file, pgoff,
- vm_userfaultfd_ctx)) {
+ vm_userfaultfd_ctx,
+ anon_name)) {
/*
* OK, it can. Can we now merge in the successor as well?
*/
@@ -1140,7 +1147,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
can_vma_merge_before(next, vm_flags,
anon_vma, file,
pgoff+pglen,
- vm_userfaultfd_ctx) &&
+ vm_userfaultfd_ctx,
+ anon_name) &&
is_mergeable_anon_vma(prev->anon_vma,
next->anon_vma, NULL)) {
/* cases 1, 6 */
@@ -1163,7 +1171,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
mpol_equal(policy, vma_policy(next)) &&
can_vma_merge_before(next, vm_flags,
anon_vma, file, pgoff+pglen,
- vm_userfaultfd_ctx)) {
+ vm_userfaultfd_ctx,
+ anon_name)) {
if (prev && addr < prev->vm_end) /* case 4 */
err = __vma_adjust(prev, prev->vm_start,
addr, prev->vm_pgoff, NULL, next);
@@ -1673,7 +1682,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
* Can we just expand an old mapping?
*/
vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
- NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
+ NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
if (vma)
goto out;
@@ -2733,6 +2742,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
return 0;
}
+EXPORT_SYMBOL(do_munmap);
int vm_munmap(unsigned long start, size_t len)
{
@@ -2922,7 +2932,7 @@ static int do_brk(unsigned long addr, unsigned long len)
/* Can we just expand an old private anonymous mapping? */
vma = vma_merge(mm, prev, addr, addr + len, flags,
- NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
+ NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
if (vma)
goto out;
@@ -3090,7 +3100,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
return NULL; /* should never get here */
new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx);
+ vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
if (new_vma) {
/*
* Source vma may have been merged into new_vma
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 6896f77be166..5471f35be825 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -352,7 +352,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*pprev = vma_merge(mm, *pprev, start, end, newflags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx);
+ vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
if (*pprev) {
vma = *pprev;
VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY);
diff --git a/mm/shmem.c b/mm/shmem.c
index 9b17bd4cbc5e..d15851d9c53c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -4083,6 +4083,14 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
}
EXPORT_SYMBOL_GPL(shmem_file_setup);
+void shmem_set_file(struct vm_area_struct *vma, struct file *file)
+{ /* Replace vma's backing file with @file and switch the vma to shmem_vm_ops. */
+ if (vma->vm_file)
+ fput(vma->vm_file); /* drop the reference held on the file being replaced */
+ vma->vm_file = file; /* no get_file() here; presumably the caller's reference is handed over - confirm */
+ vma->vm_ops = &shmem_vm_ops;
+}
+
/**
* shmem_zero_setup - setup a shared anonymous mapping
* @vma: the vma to be mmapped is prepared by do_mmap_pgoff
@@ -4102,10 +4110,7 @@ int shmem_zero_setup(struct vm_area_struct *vma)
if (IS_ERR(file))
return PTR_ERR(file);
- if (vma->vm_file)
- fput(vma->vm_file);
- vma->vm_file = file;
- vma->vm_ops = &shmem_vm_ops;
+ shmem_set_file(vma, file);
if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
diff --git a/net/Kconfig b/net/Kconfig
index 7b6cd340b72b..28071fc1594c 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -89,6 +89,12 @@ source "net/netlabel/Kconfig"
endif # if INET
+config ANDROID_PARANOID_NETWORK
+ bool "Only allow certain groups to create sockets"
+ default y
+ help
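+ Restrict socket creation to privileged callers; e.g. AF_INET sockets then require membership of the AID_INET group or CAP_NET_RAW.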
+
config NETWORK_SECMARK
bool "Security Marking"
help
@@ -258,10 +264,6 @@ config XPS
config HWBM
bool
-config SOCK_CGROUP_DATA
- bool
- default n
-
config CGROUP_NET_PRIO
bool "Network priority cgroup"
depends on CGROUPS
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 5d3698170004..ec313c9dee6d 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -106,11 +106,40 @@ void bt_sock_unregister(int proto)
}
EXPORT_SYMBOL(bt_sock_unregister);
+#ifdef CONFIG_ANDROID_PARANOID_NETWORK /* must match the Kconfig symbol ANDROID_PARANOID_NETWORK, otherwise this branch is never built */
+static inline int current_has_bt_admin(void)
+{ /* Nonzero when the caller may create any Bluetooth socket: its effective uid is root. */
+ return uid_eq(current_euid(), GLOBAL_ROOT_UID); /* kuid_t values are compared with uid_eq(), not with ! */
+}
+
+static inline int current_has_bt(void)
+{ /* Nonzero when the caller may create RFCOMM/SCO/L2CAP sockets; currently the same test as the admin check. */
+ return current_has_bt_admin();
+}
+#else
+static inline int current_has_bt_admin(void)
+{ /* Paranoid networking compiled out: no restriction. */
+ return 1;
+}
+
+static inline int current_has_bt(void)
+{ /* Paranoid networking compiled out: no restriction. */
+ return 1;
+}
+#endif
+
static int bt_sock_create(struct net *net, struct socket *sock, int proto,
int kern)
{
int err;
+ if (proto == BTPROTO_RFCOMM || proto == BTPROTO_SCO ||
+ proto == BTPROTO_L2CAP) {
+ if (!current_has_bt())
+ return -EPERM;
+ } else if (!current_has_bt_admin())
+ return -EPERM;
+
if (net != &init_net)
return -EAFNOSUPPORT;
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 5f5e28f210e0..04eea2f4b80f 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -48,11 +48,6 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
return NETDEV_TX_OK;
}
- u64_stats_update_begin(&brstats->syncp);
- brstats->tx_packets++;
- brstats->tx_bytes += skb->len;
- u64_stats_update_end(&brstats->syncp);
-
#ifdef CONFIG_NET_SWITCHDEV
skb->offload_fwd_mark = 0;
#endif
@@ -61,6 +56,12 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
skb_reset_mac_header(skb);
skb_pull(skb, ETH_HLEN);
+ u64_stats_update_begin(&brstats->syncp);
+ brstats->tx_packets++;
+ /* Exclude ETH_HLEN from byte stats for consistency with Rx chain */
+ brstats->tx_bytes += skb->len;
+ u64_stats_update_end(&brstats->syncp);
+
if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid))
goto out;
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index be4629c344a6..b6791d94841d 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -18,6 +18,11 @@
#include <net/fib_rules.h>
#include <net/ip_tunnels.h>
+static const struct fib_kuid_range fib_kuid_range_unset = { /* range assigned to rules created without FRA_UID_RANGE */
+ KUIDT_INIT(0),
+ KUIDT_INIT(~0), /* NOTE(review): ~0 looks like the invalid-uid value, which would make uid_range_set() false for this range - confirm */
+};
+
int fib_default_rule_add(struct fib_rules_ops *ops,
u32 pref, u32 table, u32 flags)
{
@@ -33,6 +38,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
r->table = table;
r->flags = flags;
r->fr_net = ops->fro_net;
+ r->uid_range = fib_kuid_range_unset;
r->suppress_prefixlen = -1;
r->suppress_ifgroup = -1;
@@ -172,6 +178,34 @@ void fib_rules_unregister(struct fib_rules_ops *ops)
}
EXPORT_SYMBOL_GPL(fib_rules_unregister);
+static int uid_range_set(struct fib_kuid_range *range)
+{ /* Nonzero only when both ends of the range are valid kuids. */
+ return uid_valid(range->start) && uid_valid(range->end);
+}
+
+static struct fib_kuid_range nla_get_kuid_range(struct nlattr **tb)
+{ /* Convert the FRA_UID_RANGE attribute into a kuid range; tb[FRA_UID_RANGE] is read unconditionally, so callers must check it first. */
+ struct fib_rule_uid_range *in;
+ struct fib_kuid_range out;
+
+ in = (struct fib_rule_uid_range *)nla_data(tb[FRA_UID_RANGE]);
+
+ out.start = make_kuid(current_user_ns(), in->start); /* uids are interpreted in the calling task's user namespace */
+ out.end = make_kuid(current_user_ns(), in->end);
+
+ return out; /* callers in this file validate the result with uid_range_set() */
+}
+
+static int nla_put_uid_range(struct sk_buff *skb, struct fib_kuid_range *range)
+{ /* Append @range to @skb as a FRA_UID_RANGE attribute; returns nla_put()'s result. */
+ struct fib_rule_uid_range out = { /* kuids are translated back through the calling task's user namespace */
+ from_kuid_munged(current_user_ns(), range->start),
+ from_kuid_munged(current_user_ns(), range->end)
+ };
+
+ return nla_put(skb, FRA_UID_RANGE, sizeof(out), &out);
+}
+
static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
struct flowi *fl, int flags,
struct fib_lookup_arg *arg)
@@ -193,6 +227,10 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
if (rule->l3mdev && !l3mdev_fib_rule_match(rule->fr_net, fl, arg))
goto out;
+ if (uid_lt(fl->flowi_uid, rule->uid_range.start) ||
+ uid_gt(fl->flowi_uid, rule->uid_range.end))
+ goto out;
+
ret = ops->match(rule, fl, flags);
out:
return (rule->flags & FIB_RULE_INVERT) ? !ret : ret;
@@ -305,6 +343,10 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
if (r->l3mdev != rule->l3mdev)
continue;
+ if (!uid_eq(r->uid_range.start, rule->uid_range.start) ||
+ !uid_eq(r->uid_range.end, rule->uid_range.end))
+ continue;
+
if (!ops->compare(r, frh, tb))
continue;
return 1;
@@ -429,6 +471,21 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh)
if (rule->l3mdev && rule->table)
goto errout_free;
+ if (tb[FRA_UID_RANGE]) {
+ if (current_user_ns() != net->user_ns) {
+ err = -EPERM;
+ goto errout_free;
+ }
+
+ rule->uid_range = nla_get_kuid_range(tb);
+
+ if (!uid_range_set(&rule->uid_range) ||
+ !uid_lte(rule->uid_range.start, rule->uid_range.end))
+ goto errout_free;
+ } else {
+ rule->uid_range = fib_kuid_range_unset;
+ }
+
if ((nlh->nlmsg_flags & NLM_F_EXCL) &&
rule_exists(ops, frh, tb, rule)) {
err = -EEXIST;
@@ -497,6 +554,7 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh)
struct fib_rules_ops *ops = NULL;
struct fib_rule *rule, *tmp;
struct nlattr *tb[FRA_MAX+1];
+ struct fib_kuid_range range;
int err = -EINVAL;
if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
@@ -516,6 +574,14 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh)
if (err < 0)
goto errout;
+ range = fib_kuid_range_unset; /* default: the request carries no UID range */
+ if (tb[FRA_UID_RANGE])
+ range = nla_get_kuid_range(tb);
+ if (tb[FRA_UID_RANGE] && !uid_range_set(&range)) { /* only a supplied range is validated; the unset default is left alone */
+ err = -EINVAL; /* err is non-negative after the check above, so set a failure code before bailing out */
+ goto errout;
+ }
+
list_for_each_entry(rule, &ops->rules_list, list) {
if (frh->action && (frh->action != rule->action))
continue;
@@ -552,6 +618,11 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh)
(rule->l3mdev != nla_get_u8(tb[FRA_L3MDEV])))
continue;
+ if (uid_range_set(&range) &&
+ (!uid_eq(rule->uid_range.start, range.start) ||
+ !uid_eq(rule->uid_range.end, range.end)))
+ continue;
+
if (!ops->compare(rule, frh, tb))
continue;
@@ -619,7 +690,8 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
+ nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */
+ nla_total_size(4) /* FRA_FWMARK */
+ nla_total_size(4) /* FRA_FWMASK */
- + nla_total_size_64bit(8); /* FRA_TUN_ID */
+ + nla_total_size_64bit(8) /* FRA_TUN_ID */
+ + nla_total_size(sizeof(struct fib_kuid_range));
if (ops->nlmsg_payload)
payload += ops->nlmsg_payload(rule);
@@ -679,7 +751,9 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
(rule->tun_id &&
nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD)) ||
(rule->l3mdev &&
- nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)))
+ nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)) ||
+ (uid_range_set(&rule->uid_range) &&
+ nla_put_uid_range(skb, &rule->uid_range)))
goto nla_put_failure;
if (rule->suppress_ifgroup != -1) {
diff --git a/net/core/filter.c b/net/core/filter.c
index e8c89d2d2bc0..c385c55af653 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -26,6 +26,7 @@
#include <linux/mm.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
+#include <linux/sock_diag.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
@@ -78,6 +79,10 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
return -ENOMEM;
+ err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
+ if (err)
+ return err;
+
err = security_sock_rcv_skb(sk, skb);
if (err)
return err;
@@ -85,7 +90,12 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
rcu_read_lock();
filter = rcu_dereference(sk->sk_filter);
if (filter) {
- unsigned int pkt_len = bpf_prog_run_save_cb(filter->prog, skb);
+ struct sock *save_sk = skb->sk;
+ unsigned int pkt_len;
+
+ skb->sk = sk;
+ pkt_len = bpf_prog_run_save_cb(filter->prog, skb);
+ skb->sk = save_sk;
err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM;
}
rcu_read_unlock();
@@ -2533,6 +2543,36 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = {
.arg5_type = ARG_CONST_STACK_SIZE,
};
+BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb)
+{ /* Returns sock_gen_cookie() for the socket attached to skb, or 0 when skb->sk is NULL. */
+ return skb->sk ? sock_gen_cookie(skb->sk) : 0;
+}
+
+static const struct bpf_func_proto bpf_get_socket_cookie_proto = { /* helper descriptor: one context-pointer argument, integer result, not GPL-only */
+ .func = bpf_get_socket_cookie,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb)
+{ /* Returns the uid that sock_net_uid() reports for skb's socket, mapped through that socket's netns user namespace. */
+ struct sock *sk = sk_to_full_sk(skb->sk);
+ kuid_t kuid;
+
+ if (!sk || !sk_fullsock(sk)) /* no usable full socket: fall back to overflowuid */
+ return overflowuid;
+ kuid = sock_net_uid(sock_net(sk), sk);
+ return from_kuid_munged(sock_net(sk)->user_ns, kuid);
+}
+
+static const struct bpf_func_proto bpf_get_socket_uid_proto = { /* helper descriptor: one context-pointer argument, integer result, not GPL-only */
+ .func = bpf_get_socket_uid,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
static const struct bpf_func_proto *
sk_filter_func_proto(enum bpf_func_id func_id)
{
@@ -2554,6 +2594,10 @@ sk_filter_func_proto(enum bpf_func_id func_id)
+ case BPF_FUNC_get_socket_cookie: /* kept above trace_printk: that case falls through to default when !capable(), so nothing may sit between them */
+ return &bpf_get_socket_cookie_proto;
+ case BPF_FUNC_get_socket_uid:
+ return &bpf_get_socket_uid_proto;
case BPF_FUNC_trace_printk:
if (capable(CAP_SYS_ADMIN))
return bpf_get_trace_printk_proto();
default:
return NULL;
}
@@ -2631,6 +2675,17 @@ xdp_func_proto(enum bpf_func_id func_id)
}
}
+static const struct bpf_func_proto *
+cg_skb_func_proto(enum bpf_func_id func_id)
+{ /* Helper lookup for cgroup skb programs: skb_load_bytes plus whatever sk_filter_func_proto() returns. */
+ switch (func_id) {
+ case BPF_FUNC_skb_load_bytes:
+ return &bpf_skb_load_bytes_proto;
+ default: /* every other id is delegated to the socket-filter lookup */
+ return sk_filter_func_proto(func_id);
+ }
+}
+
static bool __is_valid_access(int off, int size, enum bpf_access_type type)
{
if (off < 0 || off >= sizeof(struct __sk_buff))
@@ -2993,6 +3048,12 @@ static const struct bpf_verifier_ops xdp_ops = {
.convert_ctx_access = xdp_convert_ctx_access,
};
+static const struct bpf_verifier_ops cg_skb_ops = { /* verifier callbacks for cgroup skb programs; context access reuses the socket-filter callbacks */
+ .get_func_proto = cg_skb_func_proto,
+ .is_valid_access = sk_filter_is_valid_access,
+ .convert_ctx_access = sk_filter_convert_ctx_access,
+};
+
static struct bpf_prog_type_list sk_filter_type __read_mostly = {
.ops = &sk_filter_ops,
.type = BPF_PROG_TYPE_SOCKET_FILTER,
@@ -3013,12 +3074,18 @@ static struct bpf_prog_type_list xdp_type __read_mostly = {
.type = BPF_PROG_TYPE_XDP,
};
+static struct bpf_prog_type_list cg_skb_type __read_mostly = { /* binds BPF_PROG_TYPE_CGROUP_SKB to cg_skb_ops; registered in register_sk_filter_ops() */
+ .ops = &cg_skb_ops,
+ .type = BPF_PROG_TYPE_CGROUP_SKB,
+};
+
static int __init register_sk_filter_ops(void)
{
bpf_register_prog_type(&sk_filter_type);
bpf_register_prog_type(&sched_cls_type);
bpf_register_prog_type(&sched_act_type);
bpf_register_prog_type(&xdp_type);
+ bpf_register_prog_type(&cg_skb_type);
return 0;
}
diff --git a/net/core/sock.c b/net/core/sock.c
index 1c4c43483b54..0e82197ea701 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1031,6 +1031,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
union {
int val;
+ u64 val64;
struct linger ling;
struct timeval tm;
} v;
@@ -1261,6 +1262,13 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
v.val = sk->sk_incoming_cpu;
break;
+
+ case SO_COOKIE:
+ lv = sizeof(u64);
+ if (len < lv)
+ return -EINVAL;
+ v.val64 = sock_gen_cookie(sk);
+ break;
default:
/* We implement the SO_SNDLOWAT etc to not be settable
* (1003.1g 7).
@@ -2441,8 +2449,11 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_type = sock->type;
sk->sk_wq = sock->wq;
sock->sk = sk;
- } else
+ sk->sk_uid = SOCK_INODE(sock)->i_uid;
+ } else {
sk->sk_wq = NULL;
+ sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0);
+ }
rwlock_init(&sk->sk_callback_lock);
lockdep_set_class_and_name(&sk->sk_callback_lock,
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index d1d9faf3046b..fb467db2344a 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -19,7 +19,7 @@ static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh);
static DEFINE_MUTEX(sock_diag_table_mutex);
static struct workqueue_struct *broadcast_wq;
-static u64 sock_gen_cookie(struct sock *sk)
+u64 sock_gen_cookie(struct sock *sk)
{
while (1) {
u64 res = atomic64_read(&sk->sk_cookie);
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index bc6a6c8b9bcd..a8b934aa9d84 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -16,6 +16,7 @@ obj-y := route.o inetpeer.o protocol.o \
obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
+obj-$(CONFIG_SYSFS) += sysfs_net_ipv4.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
obj-$(CONFIG_IP_MROUTE) += ipmr.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 689246d079ad..0865b560e11b 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -89,6 +89,7 @@
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/slab.h>
+#include <linux/netfilter/xt_qtaguid.h>
#include <asm/uaccess.h>
@@ -121,6 +122,19 @@
#endif
#include <net/l3mdev.h>
+#ifdef CONFIG_ANDROID_PARANOID_NETWORK
+#include <linux/android_aid.h>
+
+static inline int current_has_network(void)
+{ /* Nonzero when the caller is in group AID_INET or holds CAP_NET_RAW. */
+ return in_egroup_p(AID_INET) || capable(CAP_NET_RAW);
+}
+#else
+static inline int current_has_network(void)
+{ /* Paranoid networking compiled out: every caller is allowed. */
+ return 1;
+}
+#endif
/* The inetsw table contains everything that inet_create needs to
* build a new socket.
@@ -255,6 +269,9 @@ static int inet_create(struct net *net, struct socket *sock, int protocol,
if (protocol < 0 || protocol >= IPPROTO_MAX)
return -EINVAL;
+ if (!current_has_network())
+ return -EACCES;
+
sock->state = SS_UNCONNECTED;
/* Look for the requested type/protocol pair. */
@@ -303,8 +320,7 @@ lookup_protocol:
}
err = -EPERM;
- if (sock->type == SOCK_RAW && !kern &&
- !ns_capable(net->user_ns, CAP_NET_RAW))
+ if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
goto out_rcu_unlock;
sock->ops = answer->ops;
@@ -397,6 +413,9 @@ int inet_release(struct socket *sock)
if (sk) {
long timeout;
+#ifdef CONFIG_NETFILTER_XT_MATCH_QTAGUID
+ qtaguid_untag(sock, true);
+#endif
/* Applications forget to leave groups before exiting */
ip_mc_drop_socket(sk);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 9364c39d0555..60982807feb9 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -622,6 +622,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
[RTA_FLOW] = { .type = NLA_U32 },
[RTA_ENCAP_TYPE] = { .type = NLA_U16 },
[RTA_ENCAP] = { .type = NLA_NESTED },
+ [RTA_UID] = { .type = NLA_U32 },
};
static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 31f17f0bbd1c..42a19fb4ae5f 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -425,6 +425,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
fl4.daddr = daddr;
fl4.saddr = saddr;
fl4.flowi4_mark = mark;
+ fl4.flowi4_uid = sock_net_uid(net, NULL);
fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
fl4.flowi4_proto = IPPROTO_ICMP;
fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev);
@@ -473,6 +474,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
param->replyopts.opt.opt.faddr : iph->saddr);
fl4->saddr = saddr;
fl4->flowi4_mark = mark;
+ fl4->flowi4_uid = sock_net_uid(net, NULL);
fl4->flowi4_tos = RT_TOS(tos);
fl4->flowi4_proto = IPPROTO_ICMP;
fl4->fl4_icmp_type = type;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 528a6777cda0..b857ecccaedb 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -418,7 +418,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk,
sk->sk_protocol, inet_sk_flowi_flags(sk),
(opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
ireq->ir_loc_addr, ireq->ir_rmt_port,
- htons(ireq->ir_num));
+ htons(ireq->ir_num), sk->sk_uid);
security_req_classify_flow(req, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
@@ -456,7 +456,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
sk->sk_protocol, inet_sk_flowi_flags(sk),
(opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
ireq->ir_loc_addr, ireq->ir_rmt_port,
- htons(ireq->ir_num));
+ htons(ireq->ir_num), sk->sk_uid);
security_req_classify_flow(req, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 100c86f1f547..5fcafc839dad 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -74,6 +74,7 @@
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <net/lwtunnel.h>
+#include <linux/bpf-cgroup.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
@@ -287,6 +288,13 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
unsigned int mtu;
+ int ret;
+
+ ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+ if (ret) {
+ kfree_skb(skb);
+ return ret;
+ }
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
/* Policy lookup after SNAT yielded a new policy */
@@ -305,6 +313,20 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk
return ip_finish_output2(net, sk, skb);
}
+static int ip_mc_finish_output(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
+{ /* Run the cgroup egress BPF hook on skb, then hand it to dev_loopback_xmit(). */
+ int ret;
+
+ ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+ if (ret) { /* nonzero from the hook: free the skb and return that code */
+ kfree_skb(skb);
+ return ret;
+ }
+
+ return dev_loopback_xmit(net, sk, skb);
+}
+
int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct rtable *rt = skb_rtable(skb);
@@ -342,7 +364,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
if (newskb)
NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
net, sk, newskb, NULL, newskb->dev,
- dev_loopback_xmit);
+ ip_mc_finish_output);
}
/* Multicasts with ttl 0 must not go beyond the host */
@@ -358,7 +380,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
if (newskb)
NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
net, sk, newskb, NULL, newskb->dev,
- dev_loopback_xmit);
+ ip_mc_finish_output);
}
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
@@ -1600,7 +1622,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
ip_reply_arg_flowi_flags(arg),
daddr, saddr,
- tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
+ tcp_hdr(skb)->source, tcp_hdr(skb)->dest,
+ arg->uid);
security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
rt = ip_route_output_key(net, &fl4);
if (IS_ERR(rt))
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 9adcd4b1b3fd..8fa153c65e76 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -798,7 +798,8 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
RT_SCOPE_UNIVERSE, sk->sk_protocol,
- inet_sk_flowi_flags(sk), faddr, saddr, 0, 0);
+ inet_sk_flowi_flags(sk), faddr, saddr, 0, 0,
+ sk->sk_uid);
security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
rt = ip_route_output_flow(net, &fl4, sk);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 59d8770055ed..a860df28300d 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -612,7 +612,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
hdrincl ? IPPROTO_RAW : sk->sk_protocol,
inet_sk_flowi_flags(sk) |
(hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
- daddr, saddr, 0, 0);
+ daddr, saddr, 0, 0, sk->sk_uid);
if (!hdrincl) {
rfv.msg = msg;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 890141d32ab9..d655751dfdf5 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -510,7 +510,8 @@ void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
}
EXPORT_SYMBOL(__ip_select_ident);
-static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
+static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
+ const struct sock *sk,
const struct iphdr *iph,
int oif, u8 tos,
u8 prot, u32 mark, int flow_flags)
@@ -526,19 +527,21 @@ static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
flowi4_init_output(fl4, oif, mark, tos,
RT_SCOPE_UNIVERSE, prot,
flow_flags,
- iph->daddr, iph->saddr, 0, 0);
+ iph->daddr, iph->saddr, 0, 0,
+ sock_net_uid(net, sk));
}
static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
const struct sock *sk)
{
+ const struct net *net = dev_net(skb->dev);
const struct iphdr *iph = ip_hdr(skb);
int oif = skb->dev->ifindex;
u8 tos = RT_TOS(iph->tos);
u8 prot = iph->protocol;
u32 mark = skb->mark;
- __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
+ __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}
static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
@@ -555,7 +558,7 @@ static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
inet_sk_flowi_flags(sk),
- daddr, inet->inet_saddr, 0, 0);
+ daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
rcu_read_unlock();
}
@@ -807,6 +810,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
struct rtable *rt;
struct flowi4 fl4;
const struct iphdr *iph = (const struct iphdr *) skb->data;
+ struct net *net = dev_net(skb->dev);
int oif = skb->dev->ifindex;
u8 tos = RT_TOS(iph->tos);
u8 prot = iph->protocol;
@@ -814,7 +818,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
rt = (struct rtable *) dst;
- __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
+ __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
__ip_do_redirect(rt, skb, &fl4, true);
}
@@ -1035,7 +1039,7 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
if (!mark)
mark = IP4_REPLY_MARK(net, skb->mark);
- __build_flow_key(&fl4, NULL, iph, oif,
+ __build_flow_key(net, &fl4, NULL, iph, oif,
RT_TOS(iph->tos), protocol, mark, flow_flags);
rt = __ip_route_output_key(net, &fl4);
if (!IS_ERR(rt)) {
@@ -1051,7 +1055,7 @@ static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
struct flowi4 fl4;
struct rtable *rt;
- __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
+ __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
if (!fl4.flowi4_mark)
fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
@@ -1070,6 +1074,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
struct rtable *rt;
struct dst_entry *odst = NULL;
bool new = false;
+ struct net *net = sock_net(sk);
bh_lock_sock(sk);
@@ -1083,7 +1088,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
goto out;
}
- __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
+ __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
rt = (struct rtable *)odst;
if (odst->obsolete && !odst->ops->check(odst, 0)) {
@@ -1123,7 +1128,7 @@ void ipv4_redirect(struct sk_buff *skb, struct net *net,
struct flowi4 fl4;
struct rtable *rt;
- __build_flow_key(&fl4, NULL, iph, oif,
+ __build_flow_key(net, &fl4, NULL, iph, oif,
RT_TOS(iph->tos), protocol, mark, flow_flags);
rt = __ip_route_output_key(net, &fl4);
if (!IS_ERR(rt)) {
@@ -1138,9 +1143,10 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
const struct iphdr *iph = (const struct iphdr *) skb->data;
struct flowi4 fl4;
struct rtable *rt;
+ struct net *net = sock_net(sk);
- __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
- rt = __ip_route_output_key(sock_net(sk), &fl4);
+ __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
+ rt = __ip_route_output_key(net, &fl4);
if (!IS_ERR(rt)) {
__ip_do_redirect(rt, skb, &fl4, false);
ip_rt_put(rt);
@@ -2524,6 +2530,11 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
goto nla_put_failure;
+ if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
+ nla_put_u32(skb, RTA_UID,
+ from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
+ goto nla_put_failure;
+
error = rt->dst.error;
if (rt_is_input_route(rt)) {
@@ -2576,6 +2587,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
int mark;
struct sk_buff *skb;
u32 table_id = RT_TABLE_MAIN;
+ kuid_t uid;
err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
if (err < 0)
@@ -2603,6 +2615,10 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
+ if (tb[RTA_UID])
+ uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
+ else
+ uid = (iif ? INVALID_UID : current_uid());
memset(&fl4, 0, sizeof(fl4));
fl4.daddr = dst;
@@ -2610,6 +2626,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
fl4.flowi4_tos = rtm->rtm_tos;
fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
fl4.flowi4_mark = mark;
+ fl4.flowi4_uid = uid;
if (iif) {
struct net_device *dev;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 0597ad73a1fa..4487c71873db 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -373,7 +373,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP,
inet_sk_flowi_flags(sk),
opt->srr ? opt->faddr : ireq->ir_rmt_addr,
- ireq->ir_loc_addr, th->source, th->dest);
+ ireq->ir_loc_addr, th->source, th->dest, sk->sk_uid);
security_req_classify_flow(req, flowi4_to_flowi(&fl4));
rt = ip_route_output_key(sock_net(sk), &fl4);
if (IS_ERR(rt)) {
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 024ab833557d..ce43dc7afc16 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -152,6 +152,21 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write,
return ret;
}
+/*
+ * sysctl handler for tcp_default_init_rwnd: validate changes from the
+ * /proc interface.
+ *
+ * Reads report the current value. Writes are parsed into a local copy and
+ * only stored in ctl->data when the new value lies in [3, 100]; values
+ * outside that range are silently discarded (the write still returns
+ * success, as before). Parsing into a local means the live variable never
+ * holds an unvalidated value, even transiently.
+ *
+ * Returns whatever proc_dointvec() returns.
+ */
+static int proc_tcp_default_init_rwnd(struct ctl_table *ctl, int write,
+				      void __user *buffer,
+				      size_t *lenp, loff_t *ppos)
+{
+	int *valp = ctl->data;
+	int val = *valp;
+	/* Stack copy of the table entry so proc_dointvec() targets 'val'. */
+	struct ctl_table tmp = *ctl;
+	int ret;
+
+	tmp.data = &val;
+	ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
+
+	/* Publish only a successfully parsed, in-range value. */
+	if (write && ret == 0 && val >= 3 && val <= 100)
+		*valp = val;
+
+	return ret;
+}
+
static int proc_tcp_congestion_control(struct ctl_table *ctl, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -633,6 +648,13 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec_ms_jiffies,
},
{
+ .procname = "tcp_default_init_rwnd",
+ .data = &sysctl_tcp_default_init_rwnd,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_tcp_default_init_rwnd
+ },
+ {
.procname = "icmp_msgs_per_sec",
.data = &sysctl_icmp_msgs_per_sec,
.maxlen = sizeof(int),
diff --git a/net/ipv4/sysfs_net_ipv4.c b/net/ipv4/sysfs_net_ipv4.c
new file mode 100644
index 000000000000..0cbbf10026a6
--- /dev/null
+++ b/net/ipv4/sysfs_net_ipv4.c
@@ -0,0 +1,88 @@
+/*
+ * net/ipv4/sysfs_net_ipv4.c
+ *
+ * sysfs-based networking knobs (so we can, unlike with sysctl, control perms)
+ *
+ * Copyright (C) 2008 Google, Inc.
+ *
+ * Robert Love <rlove@google.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kobject.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
+#include <linux/init.h>
+#include <net/tcp.h>
+
+/*
+ * CREATE_IPV4_FILE(_name, _var) - define a read/write kobject attribute
+ * backed by the integer lvalue _var.
+ *
+ * Expands to three definitions:
+ *  - _name##_show():  formats _var as "%d\n" into buf.
+ *  - _name##_store(): parses one int from buf with sscanf("%d"); returns
+ *    -EINVAL if nothing parses or the value is negative, otherwise assigns
+ *    it to _var and returns count.
+ *  - _name##_attr:    a struct kobj_attribute named _name with mode 0644,
+ *    wired to the two handlers above.
+ */
+#define CREATE_IPV4_FILE(_name, _var) \
+static ssize_t _name##_show(struct kobject *kobj, \
+ struct kobj_attribute *attr, char *buf) \
+{ \
+ return sprintf(buf, "%d\n", _var); \
+} \
+static ssize_t _name##_store(struct kobject *kobj, \
+ struct kobj_attribute *attr, \
+ const char *buf, size_t count) \
+{ \
+ int val, ret; \
+ ret = sscanf(buf, "%d", &val); \
+ if (ret != 1) \
+ return -EINVAL; \
+ if (val < 0) \
+ return -EINVAL; \
+ _var = val; \
+ return count; \
+} \
+static struct kobj_attribute _name##_attr = \
+ __ATTR(_name, 0644, _name##_show, _name##_store)
+
+/*
+ * One attribute per element of sysctl_tcp_wmem[] and sysctl_tcp_rmem[]:
+ * index 0, 1 and 2 are exposed as *_min, *_def and *_max respectively.
+ */
+CREATE_IPV4_FILE(tcp_wmem_min, sysctl_tcp_wmem[0]);
+CREATE_IPV4_FILE(tcp_wmem_def, sysctl_tcp_wmem[1]);
+CREATE_IPV4_FILE(tcp_wmem_max, sysctl_tcp_wmem[2]);
+
+CREATE_IPV4_FILE(tcp_rmem_min, sysctl_tcp_rmem[0]);
+CREATE_IPV4_FILE(tcp_rmem_def, sysctl_tcp_rmem[1]);
+CREATE_IPV4_FILE(tcp_rmem_max, sysctl_tcp_rmem[2]);
+
+/* NULL-terminated list of the six tcp_wmem / tcp_rmem attributes. */
+static struct attribute *ipv4_attrs[] = {
+ &tcp_wmem_min_attr.attr,
+ &tcp_wmem_def_attr.attr,
+ &tcp_wmem_max_attr.attr,
+ &tcp_rmem_min_attr.attr,
+ &tcp_rmem_def_attr.attr,
+ &tcp_rmem_max_attr.attr,
+ NULL
+};
+
+/* Attribute group handed to sysfs_create_group() by sysfs_ipv4_init(). */
+static struct attribute_group ipv4_attr_group = {
+ .attrs = ipv4_attrs,
+};
+
+/*
+ * Create the "ipv4" kobject under kernel_kobj (presumably visible as
+ * /sys/kernel/ipv4 - confirm against the sysfs layout) and populate it
+ * with the attributes in ipv4_attr_group.
+ *
+ * Returns 0 on success, -ENOMEM if the kobject cannot be created, or the
+ * error from sysfs_create_group(), in which case the kobject reference
+ * is dropped again.
+ */
+static __init int sysfs_ipv4_init(void)
+{
+	struct kobject *kobj = kobject_create_and_add("ipv4", kernel_kobj);
+	int err;
+
+	if (!kobj)
+		return -ENOMEM;
+
+	err = sysfs_create_group(kobj, &ipv4_attr_group);
+	if (err)
+		kobject_put(kobj);
+
+	return err;
+}
+
+subsys_initcall(sysfs_ipv4_init);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index dbb153c6b21a..80ac4ec65f61 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -100,6 +100,7 @@ int sysctl_tcp_thin_dupack __read_mostly;
int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
int sysctl_tcp_early_retrans __read_mostly = 3;
int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
+int sysctl_tcp_default_init_rwnd __read_mostly = TCP_INIT_CWND * 2;
#define FLAG_DATA 0x01 /* Incoming frame contained data. */
#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1ea0c91ba994..9fcbde7a28c9 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -695,6 +695,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
offsetof(struct inet_timewait_sock, tw_bound_dev_if));
arg.tos = ip_hdr(skb)->tos;
+ arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
skb, &TCP_SKB_CB(skb)->header.h4.opt,
@@ -715,7 +716,7 @@ out:
outside socket context is ugly, certainly. What can I do?
*/
-static void tcp_v4_send_ack(struct net *net,
+static void tcp_v4_send_ack(const struct sock *sk,
struct sk_buff *skb, u32 seq, u32 ack,
u32 win, u32 tsval, u32 tsecr, int oif,
struct tcp_md5sig_key *key,
@@ -730,6 +731,7 @@ static void tcp_v4_send_ack(struct net *net,
#endif
];
} rep;
+ struct net *net = sock_net(sk);
struct ip_reply_arg arg;
memset(&rep.th, 0, sizeof(struct tcphdr));
@@ -779,6 +781,7 @@ static void tcp_v4_send_ack(struct net *net,
if (oif)
arg.bound_dev_if = oif;
arg.tos = tos;
+ arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
skb, &TCP_SKB_CB(skb)->header.h4.opt,
@@ -794,7 +797,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
struct inet_timewait_sock *tw = inet_twsk(sk);
struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
- tcp_v4_send_ack(sock_net(sk), skb,
+ tcp_v4_send_ack(sk, skb,
tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
tcp_time_stamp + tcptw->tw_ts_offset,
@@ -822,7 +825,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
* exception of <SYN> segments, MUST be right-shifted by
* Rcv.Wind.Shift bits:
*/
- tcp_v4_send_ack(sock_net(sk), skb, seq,
+ tcp_v4_send_ack(sk, skb, seq,
tcp_rsk(req)->rcv_nxt,
req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
tcp_time_stamp,
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index bd68f073570b..070692099ae1 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -193,7 +193,7 @@ u32 tcp_default_init_rwnd(u32 mss)
* (RFC 3517, Section 4, NextSeg() rule (2)). Further place a
* limit when mss is larger than 1460.
*/
- u32 init_rwnd = TCP_INIT_CWND * 2;
+ u32 init_rwnd = sysctl_tcp_default_init_rwnd;
if (mss > 1460)
init_rwnd = max((1460 * init_rwnd) / mss, 2U);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 5d4b5e0f6b5e..bff17458e49b 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1018,7 +1018,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
RT_SCOPE_UNIVERSE, sk->sk_protocol,
flow_flags,
- faddr, saddr, dport, inet->inet_sport);
+ faddr, saddr, dport, inet->inet_sport,
+ sk->sk_uid);
security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 8f79f0414bc3..40c3f6a157ed 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -223,9 +223,11 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
.accept_ra_rtr_pref = 1,
.rtr_probe_interval = 60 * HZ,
#ifdef CONFIG_IPV6_ROUTE_INFO
+ .accept_ra_rt_info_min_plen = 0,
.accept_ra_rt_info_max_plen = 0,
#endif
#endif
+ .accept_ra_rt_table = 0,
.proxy_ndp = 0,
.accept_source_route = 0, /* we do not accept RH0 by default. */
.disable_ipv6 = 0,
@@ -269,9 +271,11 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
.accept_ra_rtr_pref = 1,
.rtr_probe_interval = 60 * HZ,
#ifdef CONFIG_IPV6_ROUTE_INFO
+ .accept_ra_rt_info_min_plen = 0,
.accept_ra_rt_info_max_plen = 0,
#endif
#endif
+ .accept_ra_rt_table = 0,
.proxy_ndp = 0,
.accept_source_route = 0, /* we do not accept RH0 by default. */
.disable_ipv6 = 0,
@@ -2203,6 +2207,31 @@ static void ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpad
ipv6_regen_rndid(idev);
}
+/*
+ * Determines into what table to put autoconf PIO/RIO/default routes
+ * learned on this device, based on the per-device accept_ra_rt_table
+ * setting:
+ *
+ * - If 0, use the same table for every device. This puts routes into
+ *   one of RT_TABLE_{PREFIX,INFO,DFLT} depending on the type of route
+ *   (but note that these three are currently all equal to
+ *   RT6_TABLE_MAIN).
+ * - If > 0, use the specified table.
+ * - If < 0, put routes into table dev->ifindex + (-rt_table).
+ *
+ * @dev:           device the route was learned on
+ * @default_table: table to use when the setting is 0
+ *
+ * Returns the table id. If the device has no inet6_dev attached,
+ * default_table is returned.
+ */
+u32 addrconf_rt_table(const struct net_device *dev, u32 default_table)
+{
+	struct inet6_dev *idev = in6_dev_get(dev);
+	u32 table;
+	int sysctl;
+
+	/* No IPv6 state on this device: nothing to consult, no ref to drop. */
+	if (!idev)
+		return default_table;
+
+	sysctl = idev->cnf.accept_ra_rt_table;
+	if (sysctl == 0)
+		table = default_table;
+	else if (sysctl > 0)
+		table = (u32)sysctl;
+	else
+		/*
+		 * Same value as ifindex + (-sysctl), but computed in unsigned
+		 * arithmetic so that sysctl == INT_MIN cannot overflow.
+		 */
+		table = (u32)dev->ifindex - (u32)sysctl;
+
+	in6_dev_put(idev);
+	return table;
+}
+
/*
* Add prefix route.
*/
@@ -2212,7 +2241,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev,
unsigned long expires, u32 flags)
{
struct fib6_config cfg = {
- .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX,
+ .fc_table = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_PREFIX),
.fc_metric = IP6_RT_PRIO_ADDRCONF,
.fc_ifindex = dev->ifindex,
.fc_expires = expires,
@@ -2245,7 +2274,7 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
struct fib6_node *fn;
struct rt6_info *rt = NULL;
struct fib6_table *table;
- u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX;
+ u32 tb_id = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_PREFIX);
table = fib6_get_table(dev_net(dev), tb_id);
if (!table)
@@ -4957,9 +4986,11 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
array[DEVCONF_RTR_PROBE_INTERVAL] =
jiffies_to_msecs(cnf->rtr_probe_interval);
#ifdef CONFIG_IPV6_ROUTE_INFO
+ array[DEVCONF_ACCEPT_RA_RT_INFO_MIN_PLEN] = cnf->accept_ra_rt_info_min_plen;
array[DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN] = cnf->accept_ra_rt_info_max_plen;
#endif
#endif
+ array[DEVCONF_ACCEPT_RA_RT_TABLE] = cnf->accept_ra_rt_table;
array[DEVCONF_PROXY_NDP] = cnf->proxy_ndp;
array[DEVCONF_ACCEPT_SOURCE_ROUTE] = cnf->accept_source_route;
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
@@ -5932,6 +5963,13 @@ static const struct ctl_table addrconf_sysctl[] = {
},
#ifdef CONFIG_IPV6_ROUTE_INFO
{
+ .procname = "accept_ra_rt_info_min_plen",
+ .data = &ipv6_devconf.accept_ra_rt_info_min_plen,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "accept_ra_rt_info_max_plen",
.data = &ipv6_devconf.accept_ra_rt_info_max_plen,
.maxlen = sizeof(int),
@@ -5941,6 +5979,13 @@ static const struct ctl_table addrconf_sysctl[] = {
#endif
#endif
{
+ .procname = "accept_ra_rt_table",
+ .data = &ipv6_devconf.accept_ra_rt_table,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "proxy_ndp",
.data = &ipv6_devconf.proxy_ndp,
.maxlen = sizeof(int),
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index f7b425615c12..c789a3eef0e8 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -65,6 +65,20 @@
#include <asm/uaccess.h>
#include <linux/mroute6.h>
+#ifdef CONFIG_ANDROID_PARANOID_NETWORK
+#include <linux/android_aid.h>
+
+/*
+ * Returns non-zero if the current task is a member of group AID_INET or
+ * has CAP_NET_RAW. inet6_create() fails with -EACCES when this returns 0.
+ */
+static inline int current_has_network(void)
+{
+ return in_egroup_p(AID_INET) || capable(CAP_NET_RAW);
+}
+#else
+/* Option disabled: no restriction, always returns 1. */
+static inline int current_has_network(void)
+{
+ return 1;
+}
+#endif
+
#include "ip6_offload.h"
MODULE_AUTHOR("Cast of dozens");
@@ -121,6 +135,9 @@ static int inet6_create(struct net *net, struct socket *sock, int protocol,
if (protocol < 0 || protocol >= IPPROTO_MAX)
return -EINVAL;
+ if (!current_has_network())
+ return -EACCES;
+
/* Look for the requested type/protocol pair. */
lookup_protocol:
err = -ESOCKTNOSUPPORT;
@@ -167,8 +184,7 @@ lookup_protocol:
}
err = -EPERM;
- if (sock->type == SOCK_RAW && !kern &&
- !ns_capable(net->user_ns, CAP_NET_RAW))
+ if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
goto out_rcu_unlock;
sock->ops = answer->ops;
@@ -680,6 +696,7 @@ int inet6_sk_rebuild_header(struct sock *sk)
fl6.flowi6_mark = sk->sk_mark;
fl6.fl6_dport = inet->inet_dport;
fl6.fl6_sport = inet->inet_sport;
+ fl6.flowi6_uid = sk->sk_uid;
security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
rcu_read_lock();
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index 0edc44cb254e..e742c4deb13d 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -666,9 +666,10 @@ static int ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
return 0;
if (type == NDISC_REDIRECT)
- ip6_redirect(skb, net, skb->dev->ifindex, 0);
+ ip6_redirect(skb, net, skb->dev->ifindex, 0,
+ sock_net_uid(net, NULL));
else
- ip6_update_pmtu(skb, net, info, 0, 0);
+ ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
xfrm_state_put(x);
return 0;
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 2d3c8fe27583..988b1c8dea5b 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -54,6 +54,7 @@ static void ip6_datagram_flow_key_init(struct flowi6 *fl6, struct sock *sk)
fl6->fl6_dport = inet->inet_dport;
fl6->fl6_sport = inet->inet_sport;
fl6->flowlabel = np->flow_label;
+ fl6->flowi6_uid = sk->sk_uid;
if (!fl6->flowi6_oif)
fl6->flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 6a924be66e37..44a2010e2076 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -478,9 +478,10 @@ static int esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
return 0;
if (type == NDISC_REDIRECT)
- ip6_redirect(skb, net, skb->dev->ifindex, 0);
+ ip6_redirect(skb, net, skb->dev->ifindex, 0,
+ sock_net_uid(net, NULL));
else
- ip6_update_pmtu(skb, net, info, 0, 0);
+ ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
xfrm_state_put(x);
return 0;
diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c
index 305e2ed730bf..477692f80f0d 100644
--- a/net/ipv6/exthdrs_core.c
+++ b/net/ipv6/exthdrs_core.c
@@ -166,15 +166,15 @@ EXPORT_SYMBOL_GPL(ipv6_find_tlv);
* to explore inner IPv6 header, eg. ICMPv6 error messages.
*
* If target header is found, its offset is set in *offset and return protocol
- * number. Otherwise, return -1.
+ * number. Otherwise, return -ENOENT or -EBADMSG.
*
* If the first fragment doesn't contain the final protocol header or
* NEXTHDR_NONE it is considered invalid.
*
* Note that non-1st fragment is special case that "the protocol number
* of last header" is "next header" field in Fragment header. In this case,
- * *offset is meaningless and fragment offset is stored in *fragoff if fragoff
- * isn't NULL.
+ * *offset is meaningless. If fragoff is not NULL, the fragment offset is
+ * stored in *fragoff; if it is NULL, return -EINVAL.
*
* if flags is not NULL and it's a fragment, then the frag flag
* IP6_FH_F_FRAG will be set. If it's an AH header, the
@@ -253,9 +253,12 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
if (target < 0 &&
((!ipv6_ext_hdr(hp->nexthdr)) ||
hp->nexthdr == NEXTHDR_NONE)) {
- if (fragoff)
+ if (fragoff) {
*fragoff = _frag_off;
- return hp->nexthdr;
+ return hp->nexthdr;
+ } else {
+ return -EINVAL;
+ }
}
if (!found)
return -ENOENT;
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 2772004ba5a1..17fa28f7a0ff 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -92,9 +92,10 @@ static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
struct net *net = dev_net(skb->dev);
if (type == ICMPV6_PKT_TOOBIG)
- ip6_update_pmtu(skb, net, info, 0, 0);
+ ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
else if (type == NDISC_REDIRECT)
- ip6_redirect(skb, net, skb->dev->ifindex, 0);
+ ip6_redirect(skb, net, skb->dev->ifindex, 0,
+ sock_net_uid(net, NULL));
if (!(type & ICMPV6_INFOMSG_MASK))
if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST)
@@ -486,6 +487,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
fl6.flowi6_oif = iif;
fl6.fl6_icmp_type = type;
fl6.fl6_icmp_code = code;
+ fl6.flowi6_uid = sock_net_uid(net, NULL);
security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
sk = icmpv6_xmit_lock(net);
@@ -660,6 +662,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
fl6.flowi6_oif = skb->dev->ifindex;
fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY;
fl6.flowi6_mark = mark;
+ fl6.flowi6_uid = sock_net_uid(net, NULL);
security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
sk = icmpv6_xmit_lock(net);
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 798a0950e9a6..10d1deb30547 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -88,6 +88,7 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk,
fl6->flowi6_mark = ireq->ir_mark;
fl6->fl6_dport = ireq->ir_rmt_port;
fl6->fl6_sport = htons(ireq->ir_num);
+ fl6->flowi6_uid = sk->sk_uid;
security_req_classify_flow(req, flowi6_to_flowi(fl6));
dst = ip6_dst_lookup_flow(sk, fl6, final_p);
@@ -136,6 +137,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk,
fl6->flowi6_mark = sk->sk_mark;
fl6->fl6_sport = inet->inet_sport;
fl6->fl6_dport = inet->inet_dport;
+ fl6->flowi6_uid = sk->sk_uid;
security_sk_classify_flow(sk, flowi6_to_flowi(fl6));
rcu_read_lock();
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index caee5530ae2c..a88aff02b579 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -562,6 +562,8 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev)
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
fl6.flowi6_mark = skb->mark;
+ fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+
err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM));
if (err)
return -1;
@@ -621,6 +623,8 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev)
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
fl6.flowi6_mark = skb->mark;
+ fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+
if (gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)))
return -1;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 8e77cecd2165..b489e865c956 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -39,6 +39,7 @@
#include <linux/module.h>
#include <linux/slab.h>
+#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>
@@ -66,9 +67,6 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
struct in6_addr *nexthop;
int ret;
- skb->protocol = htons(ETH_P_IPV6);
- skb->dev = dev;
-
if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
@@ -131,6 +129,14 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
+ int ret;
+
+ ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+ if (ret) {
+ kfree_skb(skb);
+ return ret;
+ }
+
if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
dst_allfrag(skb_dst(skb)) ||
(IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
@@ -144,6 +150,9 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
struct net_device *dev = skb_dst(skb)->dev;
struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
+ skb->protocol = htons(ETH_P_IPV6);
+ skb->dev = dev;
+
if (unlikely(idev->cnf.disable_ipv6)) {
IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
kfree_skb(skb);
@@ -866,7 +875,6 @@ fail_toobig:
if (skb->sk && dst_allfrag(skb_dst(skb)))
sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
- skb->dev = skb_dst(skb)->dev;
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
err = -EMSGSIZE;
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 9c5afa5153ce..778b20b1c24a 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1274,6 +1274,8 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
fl6.flowi6_mark = skb->mark;
}
+ fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+
if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
return -1;
@@ -1361,6 +1363,8 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
fl6.flowi6_mark = skb->mark;
}
+ fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+
if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
return -1;
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index 3213921cdfee..406ff507c18b 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -617,9 +617,10 @@ static int vti6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
return 0;
if (type == NDISC_REDIRECT)
- ip6_redirect(skb, net, skb->dev->ifindex, 0);
+ ip6_redirect(skb, net, skb->dev->ifindex, 0,
+ sock_net_uid(net, NULL));
else
- ip6_update_pmtu(skb, net, info, 0, 0);
+ ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
xfrm_state_put(x);
return 0;
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 1b9316e1386a..54d165b9845a 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -74,9 +74,10 @@ static int ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
return 0;
if (type == NDISC_REDIRECT)
- ip6_redirect(skb, net, skb->dev->ifindex, 0);
+ ip6_redirect(skb, net, skb->dev->ifindex, 0,
+ sock_net_uid(net, NULL));
else
- ip6_update_pmtu(skb, net, info, 0, 0);
+ ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
xfrm_state_put(x);
return 0;
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 505d048ffff5..14750ba527cf 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1395,6 +1395,8 @@ skip_linkparms:
if (ri->prefix_len == 0 &&
!in6_dev->cnf.accept_ra_defrtr)
continue;
+ if (ri->prefix_len < in6_dev->cnf.accept_ra_rt_info_min_plen)
+ continue;
if (ri->prefix_len > in6_dev->cnf.accept_ra_rt_info_max_plen)
continue;
rt6_route_rcv(skb->dev, (u8 *)p, (p->nd_opt_len) << 3,
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index d11c46833d61..39970e212ad5 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -26,6 +26,7 @@ int ip6_route_me_harder(struct net *net, struct sk_buff *skb)
struct flowi6 fl6 = {
.flowi6_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
.flowi6_mark = skb->mark,
+ .flowi6_uid = sock_net_uid(net, skb->sk),
.daddr = iph->daddr,
.saddr = iph->saddr,
};
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 982868193dbb..2a965d4c2421 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -113,6 +113,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl6.daddr = *daddr;
fl6.flowi6_oif = oif;
fl6.flowi6_mark = sk->sk_mark;
+ fl6.flowi6_uid = sk->sk_uid;
fl6.fl6_icmp_type = user_icmph.icmp6_type;
fl6.fl6_icmp_code = user_icmph.icmp6_code;
security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index a4f979ff31b9..55d284a2e266 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -791,6 +791,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
memset(&fl6, 0, sizeof(fl6));
fl6.flowi6_mark = sk->sk_mark;
+ fl6.flowi6_uid = sk->sk_uid;
ipc6.hlimit = -1;
ipc6.tclass = -1;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index b0a72677b7e5..52892b871bdc 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1417,7 +1417,7 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
}
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
- int oif, u32 mark)
+ int oif, u32 mark, kuid_t uid)
{
const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
struct dst_entry *dst;
@@ -1429,6 +1429,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
fl6.daddr = iph->daddr;
fl6.saddr = iph->saddr;
fl6.flowlabel = ip6_flowinfo(iph);
+ fl6.flowi6_uid = uid;
dst = ip6_route_output(net, NULL, &fl6);
if (!dst->error)
@@ -1445,7 +1446,7 @@ void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
if (!oif && skb->dev)
oif = l3mdev_master_ifindex(skb->dev);
- ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark);
+ ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
dst = __sk_dst_get(sk);
if (!dst || !dst->obsolete ||
@@ -1537,7 +1538,8 @@ static struct dst_entry *ip6_route_redirect(struct net *net,
flags, __ip6_route_redirect);
}
-void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
+void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
+ kuid_t uid)
{
const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
struct dst_entry *dst;
@@ -1550,6 +1552,7 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
fl6.daddr = iph->daddr;
fl6.saddr = iph->saddr;
fl6.flowlabel = ip6_flowinfo(iph);
+ fl6.flowi6_uid = uid;
dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
rt6_do_redirect(dst, NULL, skb);
@@ -1571,6 +1574,7 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
fl6.flowi6_mark = mark;
fl6.daddr = msg->dest;
fl6.saddr = iph->daddr;
+ fl6.flowi6_uid = sock_net_uid(net, NULL);
dst = ip6_route_redirect(net, &fl6, &iph->saddr);
rt6_do_redirect(dst, NULL, skb);
@@ -1579,7 +1583,8 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
- ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
+ ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
+ sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);
@@ -2354,8 +2359,7 @@ static struct rt6_info *rt6_get_route_info(struct net *net,
const struct in6_addr *gwaddr,
struct net_device *dev)
{
- u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
- int ifindex = dev->ifindex;
+ u32 tb_id = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_INFO);
struct fib6_node *fn;
struct rt6_info *rt = NULL;
struct fib6_table *table;
@@ -2370,7 +2374,7 @@ static struct rt6_info *rt6_get_route_info(struct net *net,
goto out;
for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
- if (rt->dst.dev->ifindex != ifindex)
+ if (rt->dst.dev->ifindex != dev->ifindex)
continue;
if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
continue;
@@ -2401,7 +2405,7 @@ static struct rt6_info *rt6_add_route_info(struct net *net,
.fc_nlinfo.nl_net = net,
};
- cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
+ cfg.fc_table = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_INFO),
cfg.fc_dst = *prefix;
cfg.fc_gateway = *gwaddr;
@@ -2417,7 +2421,7 @@ static struct rt6_info *rt6_add_route_info(struct net *net,
struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
{
- u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
+ u32 tb_id = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_MAIN);
struct rt6_info *rt;
struct fib6_table *table;
@@ -2443,7 +2447,7 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
unsigned int pref)
{
struct fib6_config cfg = {
- .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
+ .fc_table = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_DFLT),
.fc_metric = IP6_RT_PRIO_USER,
.fc_ifindex = dev->ifindex,
.fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
@@ -2466,43 +2470,16 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
return rt6_get_dflt_router(gwaddr, dev);
}
-static void __rt6_purge_dflt_routers(struct fib6_table *table)
-{
- struct rt6_info *rt;
-
-restart:
- read_lock_bh(&table->tb6_lock);
- for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
- if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
- (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
- dst_hold(&rt->dst);
- read_unlock_bh(&table->tb6_lock);
- ip6_del_rt(rt);
- goto restart;
- }
- }
- read_unlock_bh(&table->tb6_lock);
-
- table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
+/*
+ * Per-route predicate handed to fib6_clean_all() by rt6_purge_dflt_routers().
+ *
+ * Returns -1 for a route that carries RTF_DEFAULT or RTF_ADDRCONF and whose
+ * inet6 device is either absent or does not have accept_ra set to 2;
+ * returns 0 for every other route.
+ * NOTE(review): -1 presumably asks the walker to delete the route - confirm
+ * against fib6_clean_all().
+ */
+int rt6_addrconf_purge(struct rt6_info *rt, void *arg)
+{
+	bool from_ra = rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF);
+	bool always_accept = rt->rt6i_idev &&
+			     rt->rt6i_idev->cnf.accept_ra == 2;
+
+	return (from_ra && !always_accept) ? -1 : 0;
}
void rt6_purge_dflt_routers(struct net *net)
{
- struct fib6_table *table;
- struct hlist_head *head;
- unsigned int h;
-
- rcu_read_lock();
-
- for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
- head = &net->ipv6.fib_table_hash[h];
- hlist_for_each_entry_rcu(table, head, tb6_hlist) {
- if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
- __rt6_purge_dflt_routers(table);
- }
- }
-
- rcu_read_unlock();
+ fib6_clean_all(net, rt6_addrconf_purge, NULL);
}
static void rtmsg_to_fib6_config(struct net *net,
@@ -2821,6 +2798,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
[RTA_ENCAP_TYPE] = { .type = NLA_U16 },
[RTA_ENCAP] = { .type = NLA_NESTED },
[RTA_EXPIRES] = { .type = NLA_U32 },
+ [RTA_UID] = { .type = NLA_U32 },
[RTA_TABLE] = { .type = NLA_U32 },
};
@@ -3401,6 +3379,12 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
if (tb[RTA_MARK])
fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
+ if (tb[RTA_UID])
+ fl6.flowi6_uid = make_kuid(current_user_ns(),
+ nla_get_u32(tb[RTA_UID]));
+ else
+ fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
+
if (iif) {
struct net_device *dev;
int flags = 0;
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 7a86433d8896..a67174e0b094 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -228,6 +228,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
fl6.flowi6_mark = ireq->ir_mark;
fl6.fl6_dport = ireq->ir_rmt_port;
fl6.fl6_sport = inet_sk(sk)->inet_sport;
+ fl6.flowi6_uid = sk->sk_uid;
security_req_classify_flow(req, flowi6_to_flowi(&fl6));
dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 0a69d39880f2..f2e48bddf71a 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -238,6 +238,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
fl6.flowi6_mark = sk->sk_mark;
fl6.fl6_dport = usin->sin6_port;
fl6.fl6_sport = inet->inet_sport;
+ fl6.flowi6_uid = sk->sk_uid;
opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk));
final_p = fl6_update_dst(&fl6, opt, &final);
@@ -835,6 +836,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark);
fl6.fl6_dport = t1->dest;
fl6.fl6_sport = t1->source;
+ fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
/* Pass a socket to ip6_dst_lookup either it is for RST
@@ -1240,9 +1242,6 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
if (skb->protocol == htons(ETH_P_IP))
return tcp_v4_do_rcv(sk, skb);
- if (tcp_filter(sk, skb))
- goto discard;
-
/*
* socket locking is here for SMP purposes as backlog rcv
* is currently called with bh processing disabled.
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 4db5f541bca6..c43ef0cdf8cb 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1162,6 +1162,7 @@ do_udp_sendmsg:
fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
fl6.flowi6_mark = sk->sk_mark;
+ fl6.flowi6_uid = sk->sk_uid;
if (msg->msg_controllen) {
opt = &opt_space;
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 247097289fd0..86ad51a039c4 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -526,6 +526,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
memset(&fl6, 0, sizeof(fl6));
fl6.flowi6_mark = sk->sk_mark;
+ fl6.flowi6_uid = sk->sk_uid;
ipc6.hlimit = -1;
ipc6.tclass = -1;
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index e8d56d9a4df2..177d3ae5fda8 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -1316,6 +1316,8 @@ config NETFILTER_XT_MATCH_OWNER
based on who created the socket: the user or group. It is also
possible to check whether a socket actually exists.
+ Conflicts with '"quota, tag, owner" match'
+
config NETFILTER_XT_MATCH_POLICY
tristate 'IPsec "policy" match support'
depends on XFRM
@@ -1349,6 +1351,22 @@ config NETFILTER_XT_MATCH_PKTTYPE
To compile it as a module, choose M here. If unsure, say N.
+config NETFILTER_XT_MATCH_QTAGUID
+ bool '"quota, tag, owner" match and stats support'
+ depends on NETFILTER_XT_MATCH_SOCKET
+ depends on NETFILTER_XT_MATCH_OWNER=n
+ help
+ This option replaces the `owner' match. In addition to matching
+ on uid, it keeps stats based on a tag assigned to a socket.
+ The full tag is comprised of a UID and an accounting tag.
+ The tags are assignable to sockets from user space (e.g. a download
+ manager can assign the socket to another UID for accounting).
+ Stats and control are done via /proc/net/xt_qtaguid/.
+ It replaces owner as it takes the same arguments, but should
+ really be recognized by the iptables tool.
+
+ If unsure, say `N'.
+
config NETFILTER_XT_MATCH_QUOTA
tristate '"quota" match support'
depends on NETFILTER_ADVANCED
@@ -1359,6 +1377,29 @@ config NETFILTER_XT_MATCH_QUOTA
If you want to compile it as a module, say M here and read
<file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+config NETFILTER_XT_MATCH_QUOTA2
+ tristate '"quota2" match support'
+ depends on NETFILTER_ADVANCED
+ help
+ This option adds a `quota2' match, which allows to match on a
+ byte counter correctly and not per CPU.
+ It allows naming the quotas.
+ This is based on http://xtables-addons.git.sourceforge.net
+
+ If you want to compile it as a module, say M here and read
+ <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+
+config NETFILTER_XT_MATCH_QUOTA2_LOG
+ bool '"quota2" Netfilter LOG support'
+ depends on NETFILTER_XT_MATCH_QUOTA2
+ default n
+ help
+ This option allows `quota2' to log ONCE when a quota limit
+ is passed. It logs via NETLINK using the NETLINK_NFLOG family.
+ It logs similarly to how ipt_ULOG would without data.
+
+ If unsure, say `N'.
+
config NETFILTER_XT_MATCH_RATEEST
tristate '"rateest" match support'
depends on NETFILTER_ADVANCED
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index c23c3c84416f..54ba5aa1f9bf 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -167,7 +167,9 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_CGROUP) += xt_cgroup.o
obj-$(CONFIG_NETFILTER_XT_MATCH_PHYSDEV) += xt_physdev.o
obj-$(CONFIG_NETFILTER_XT_MATCH_PKTTYPE) += xt_pkttype.o
obj-$(CONFIG_NETFILTER_XT_MATCH_POLICY) += xt_policy.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_QTAGUID) += xt_qtaguid_print.o xt_qtaguid.o
obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA) += xt_quota.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA2) += xt_quota2.o
obj-$(CONFIG_NETFILTER_XT_MATCH_RATEEST) += xt_rateest.o
obj-$(CONFIG_NETFILTER_XT_MATCH_REALM) += xt_realm.o
obj-$(CONFIG_NETFILTER_XT_MATCH_RECENT) += xt_recent.o
diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c
index 921c9bd7e1e7..44c42a7fa7bc 100644
--- a/net/netfilter/xt_IDLETIMER.c
+++ b/net/netfilter/xt_IDLETIMER.c
@@ -5,6 +5,7 @@
* After timer expires a kevent will be sent.
*
* Copyright (C) 2004, 2010 Nokia Corporation
+ *
* Written by Timo Teras <ext-timo.teras@nokia.com>
*
* Converted to x_tables and reworked for upstream inclusion
@@ -38,8 +39,17 @@
#include <linux/netfilter/xt_IDLETIMER.h>
#include <linux/kdev_t.h>
#include <linux/kobject.h>
+#include <linux/skbuff.h>
#include <linux/workqueue.h>
#include <linux/sysfs.h>
+#include <linux/rtc.h>
+#include <linux/time.h>
+#include <linux/math64.h>
+#include <linux/suspend.h>
+#include <linux/notifier.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/inet_sock.h>
struct idletimer_tg_attr {
struct attribute attr;
@@ -55,14 +65,110 @@ struct idletimer_tg {
struct kobject *kobj;
struct idletimer_tg_attr attr;
+ struct timespec delayed_timer_trigger;
+ struct timespec last_modified_timer;
+ struct timespec last_suspend_time;
+ struct notifier_block pm_nb;
+
+ int timeout;
unsigned int refcnt;
+ bool work_pending;
+ bool send_nl_msg;
+ bool active;
+ uid_t uid;
};
static LIST_HEAD(idletimer_tg_list);
static DEFINE_MUTEX(list_mutex);
+static DEFINE_SPINLOCK(timestamp_lock);
static struct kobject *idletimer_tg_kobj;
+/*
+ * Work out which state and timestamp should be reported for @timer.
+ *
+ * Runs under timestamp_lock and always clears work_pending first.
+ *  - If a delayed trigger is recorded, *ts becomes that trigger time plus
+ *    the timeout, the trigger is cleared, the work item is queued again
+ *    (work_pending set) and "inactive" is reported.
+ *  - Otherwise, if more than the timeout has elapsed between
+ *    last_modified_timer and *ts, *ts becomes last_modified_timer plus the
+ *    timeout and "inactive" is reported.
+ *  - Otherwise *ts is left alone and timer->active is reported.
+ *
+ * Returns true when the timer should be reported as active.
+ */
+static bool check_for_delayed_trigger(struct idletimer_tg *timer,
+		struct timespec *ts)
+{
+	struct timespec interval;
+	bool is_active = false;
+
+	spin_lock_bh(&timestamp_lock);
+	timer->work_pending = false;
+
+	interval.tv_sec = timer->timeout;
+	interval.tv_nsec = 0;
+
+	if (timer->delayed_timer_trigger.tv_sec != 0) {
+		*ts = timespec_add(timer->delayed_timer_trigger, interval);
+		timer->delayed_timer_trigger.tv_sec = 0;
+		timer->work_pending = true;
+		schedule_work(&timer->work);
+	} else if ((ts->tv_sec - timer->last_modified_timer.tv_sec) >
+		   timer->timeout) {
+		*ts = timespec_add(timer->last_modified_timer, interval);
+	} else {
+		is_active = timer->active;
+	}
+
+	spin_unlock_bh(&timestamp_lock);
+	return is_active;
+}
+
+/*
+ * Send a KOBJ_CHANGE uevent on idletimer_tg_kobj describing @timer.
+ *
+ * The environment carries four strings: INTERFACE=<iface>,
+ * STATE=active|inactive, TIME_NS=<timestamp> and UID=<uid> (the UID value is
+ * left empty when the state is inactive).  The state and the timestamp come
+ * from check_for_delayed_trigger(), seeded with the current boot time.
+ *
+ * Nothing is sent if the INTERFACE or STATE string does not fit; an
+ * over-long TIME_NS string is replaced by an empty one.
+ */
+static void notify_netlink_uevent(const char *iface, struct idletimer_tg *timer)
+{
+	char iface_msg[NLMSG_MAX_SIZE];
+	char state_msg[NLMSG_MAX_SIZE];
+	char timestamp_msg[NLMSG_MAX_SIZE];
+	char uid_msg[NLMSG_MAX_SIZE];
+	char *envp[] = { iface_msg, state_msg, timestamp_msg, uid_msg, NULL };
+	struct timespec ts;
+	uint64_t time_ns;
+	bool state;
+	int len;
+
+	len = snprintf(iface_msg, NLMSG_MAX_SIZE, "INTERFACE=%s", iface);
+	if (len >= NLMSG_MAX_SIZE) {
+		pr_err("message too long (%d)", len);
+		return;
+	}
+
+	get_monotonic_boottime(&ts);
+	state = check_for_delayed_trigger(timer, &ts);
+
+	len = snprintf(state_msg, NLMSG_MAX_SIZE, "STATE=%s",
+		       state ? "active" : "inactive");
+	if (len >= NLMSG_MAX_SIZE) {
+		pr_err("message too long (%d)", len);
+		return;
+	}
+
+	/* Only an active timer has a meaningful waking uid to report. */
+	if (state)
+		len = snprintf(uid_msg, NLMSG_MAX_SIZE, "UID=%u", timer->uid);
+	else
+		len = snprintf(uid_msg, NLMSG_MAX_SIZE, "UID=");
+	if (len >= NLMSG_MAX_SIZE)
+		pr_err("message too long (%d)", len);
+
+	time_ns = timespec_to_ns(&ts);
+	len = snprintf(timestamp_msg, NLMSG_MAX_SIZE, "TIME_NS=%llu", time_ns);
+	if (len >= NLMSG_MAX_SIZE) {
+		timestamp_msg[0] = '\0';
+		pr_err("message too long (%d)", len);
+	}
+
+	pr_debug("putting nlmsg: <%s> <%s> <%s> <%s>\n", iface_msg, state_msg,
+		 timestamp_msg, uid_msg);
+	kobject_uevent_env(idletimer_tg_kobj, KOBJ_CHANGE, envp);
+}
+
static
struct idletimer_tg *__idletimer_tg_find_by_label(const char *label)
{
@@ -83,6 +189,7 @@ static ssize_t idletimer_tg_show(struct kobject *kobj, struct attribute *attr,
{
struct idletimer_tg *timer;
unsigned long expires = 0;
+ unsigned long now = jiffies;
mutex_lock(&list_mutex);
@@ -92,11 +199,15 @@ static ssize_t idletimer_tg_show(struct kobject *kobj, struct attribute *attr,
mutex_unlock(&list_mutex);
- if (time_after(expires, jiffies))
+ if (time_after(expires, now))
return sprintf(buf, "%u\n",
- jiffies_to_msecs(expires - jiffies) / 1000);
+ jiffies_to_msecs(expires - now) / 1000);
- return sprintf(buf, "0\n");
+ if (timer->send_nl_msg)
+ return sprintf(buf, "0 %d\n",
+ jiffies_to_msecs(now - expires) / 1000);
+ else
+ return sprintf(buf, "0\n");
}
static void idletimer_tg_work(struct work_struct *work)
@@ -105,6 +216,9 @@ static void idletimer_tg_work(struct work_struct *work)
work);
sysfs_notify(idletimer_tg_kobj, NULL, timer->attr.attr.name);
+
+ if (timer->send_nl_msg)
+ notify_netlink_uevent(timer->attr.attr.name, timer);
}
static void idletimer_tg_expired(unsigned long data)
@@ -112,8 +226,55 @@ static void idletimer_tg_expired(unsigned long data)
struct idletimer_tg *timer = (struct idletimer_tg *) data;
pr_debug("timer %s expired\n", timer->attr.attr.name);
-
+ spin_lock_bh(&timestamp_lock);
+ timer->active = false;
+ timer->work_pending = true;
schedule_work(&timer->work);
+ spin_unlock_bh(&timestamp_lock);
+}
+
+/*
+ * PM notifier callback attached to each idletimer through its pm_nb member.
+ *
+ * PM_SUSPEND_PREPARE: remember the boot-time clock in last_suspend_time.
+ * PM_POST_SUSPEND:    for an active timer whose expiry is still ahead of
+ *                     jiffies, subtract the time spent suspended from the
+ *                     expiry; if the timer would already have fired, delete
+ *                     it, mark it inactive and queue the notification work.
+ * Every other event is ignored.
+ *
+ * Always returns NOTIFY_DONE.
+ */
+static int idletimer_resume(struct notifier_block *notifier,
+			    unsigned long pm_event, void *unused)
+{
+	struct timespec ts;
+	unsigned long time_diff, now = jiffies;
+	/*
+	 * container_of() on an embedded member never yields NULL, so no
+	 * NULL check is needed on the result.
+	 */
+	struct idletimer_tg *timer = container_of(notifier,
+						  struct idletimer_tg, pm_nb);
+
+	switch (pm_event) {
+	case PM_SUSPEND_PREPARE:
+		get_monotonic_boottime(&timer->last_suspend_time);
+		break;
+	case PM_POST_SUSPEND:
+		spin_lock_bh(&timestamp_lock);
+		if (!timer->active) {
+			spin_unlock_bh(&timestamp_lock);
+			break;
+		}
+		/*
+		 * jiffies are not updated while suspended, so "now" still
+		 * represents the moment the system went to sleep.
+		 */
+		if (time_after(timer->timer.expires, now)) {
+			get_monotonic_boottime(&ts);
+			ts = timespec_sub(ts, timer->last_suspend_time);
+			time_diff = timespec_to_jiffies(&ts);
+			/*
+			 * Compare the remaining time with the suspend
+			 * duration instead of comparing absolute jiffies
+			 * values, which gives the wrong answer when
+			 * "time_diff + now" wraps around.
+			 */
+			if (timer->timer.expires - now > time_diff) {
+				mod_timer_pending(&timer->timer,
+					(timer->timer.expires - time_diff));
+			} else {
+				del_timer(&timer->timer);
+				timer->timer.expires = 0;
+				timer->active = false;
+				timer->work_pending = true;
+				schedule_work(&timer->work);
+			}
+		}
+		spin_unlock_bh(&timestamp_lock);
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_DONE;
}
static int idletimer_check_sysfs_name(const char *name, unsigned int size)
@@ -166,6 +327,21 @@ static int idletimer_tg_create(struct idletimer_tg_info *info)
setup_timer(&info->timer->timer, idletimer_tg_expired,
(unsigned long) info->timer);
info->timer->refcnt = 1;
+ info->timer->send_nl_msg = (info->send_nl_msg == 0) ? false : true;
+ info->timer->active = true;
+ info->timer->timeout = info->timeout;
+
+ info->timer->delayed_timer_trigger.tv_sec = 0;
+ info->timer->delayed_timer_trigger.tv_nsec = 0;
+ info->timer->work_pending = false;
+ info->timer->uid = 0;
+ get_monotonic_boottime(&info->timer->last_modified_timer);
+
+ info->timer->pm_nb.notifier_call = idletimer_resume;
+ ret = register_pm_notifier(&info->timer->pm_nb);
+ if (ret)
+ printk(KERN_WARNING "[%s] Failed to register pm notifier %d\n",
+ __func__, ret);
INIT_WORK(&info->timer->work, idletimer_tg_work);
@@ -182,6 +358,42 @@ out:
return ret;
}
+/*
+ * Re-arm the idle timer referenced by @info.
+ *
+ * Everything happens under timestamp_lock.  The timer is marked active; if
+ * it was previously inactive, or its expiry lies before the current
+ * jiffies, the uid owning @skb's socket is recorded (when @skb and its
+ * socket are present) and a notification is arranged: either the work item
+ * is queued, or - when one is already pending - the previous
+ * last_modified_timer is saved in delayed_timer_trigger.  Finally
+ * last_modified_timer is refreshed and the kernel timer is moved to
+ * now + info->timeout seconds.
+ *
+ * @skb may be NULL, in which case the stored uid is left unchanged.
+ */
+static void reset_timer(const struct idletimer_tg_info *info,
+ struct sk_buff *skb)
+{
+ unsigned long now = jiffies;
+ struct idletimer_tg *timer = info->timer;
+ bool timer_prev;
+
+ spin_lock_bh(&timestamp_lock);
+ timer_prev = timer->active;
+ timer->active = true;
+ /* timer_prev guards against the overflow problem in time_before() */
+ if (!timer_prev || time_before(timer->timer.expires, now)) {
+ pr_debug("Starting Checkentry timer (Expired, Jiffies): %lu, %lu\n",
+ timer->timer.expires, now);
+
+ /* Store the uid responsible for waking up the radio */
+ if (skb && (skb->sk)) {
+ timer->uid = from_kuid_munged(current_user_ns(),
+ sock_i_uid(skb_to_full_sk(skb)));
+ }
+
+ /* Check whether an inactive notification is still pending */
+ if (timer->work_pending)
+ timer->delayed_timer_trigger = timer->last_modified_timer;
+ else {
+ timer->work_pending = true;
+ schedule_work(&timer->work);
+ }
+ }
+
+ get_monotonic_boottime(&timer->last_modified_timer);
+ mod_timer(&timer->timer,
+ msecs_to_jiffies(info->timeout * 1000) + now);
+ spin_unlock_bh(&timestamp_lock);
+}
+
/*
* The actual xt_tables plugin.
*/
@@ -189,15 +401,23 @@ static unsigned int idletimer_tg_target(struct sk_buff *skb,
const struct xt_action_param *par)
{
const struct idletimer_tg_info *info = par->targinfo;
+ unsigned long now = jiffies;
pr_debug("resetting timer %s, timeout period %u\n",
info->label, info->timeout);
BUG_ON(!info->timer);
- mod_timer(&info->timer->timer,
- msecs_to_jiffies(info->timeout * 1000) + jiffies);
+ info->timer->active = true;
+
+ if (time_before(info->timer->timer.expires, now)) {
+ schedule_work(&info->timer->work);
+ pr_debug("Starting timer %s (Expired, Jiffies): %lu, %lu\n",
+ info->label, info->timer->timer.expires, now);
+ }
+ /* TODO: Avoid modifying timers on each packet */
+ reset_timer(info, skb);
return XT_CONTINUE;
}
@@ -206,7 +426,7 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
struct idletimer_tg_info *info = par->targinfo;
int ret;
- pr_debug("checkentry targinfo%s\n", info->label);
+ pr_debug("checkentry targinfo %s\n", info->label);
if (info->timeout == 0) {
pr_debug("timeout value is zero\n");
@@ -228,9 +448,7 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
info->timer = __idletimer_tg_find_by_label(info->label);
if (info->timer) {
info->timer->refcnt++;
- mod_timer(&info->timer->timer,
- msecs_to_jiffies(info->timeout * 1000) + jiffies);
-
+ reset_timer(info, NULL);
pr_debug("increased refcnt of timer %s to %u\n",
info->label, info->timer->refcnt);
} else {
@@ -243,6 +461,7 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
}
mutex_unlock(&list_mutex);
+
return 0;
}
@@ -259,13 +478,14 @@ static void idletimer_tg_destroy(const struct xt_tgdtor_param *par)
list_del(&info->timer->entry);
del_timer_sync(&info->timer->timer);
- cancel_work_sync(&info->timer->work);
sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr);
+ unregister_pm_notifier(&info->timer->pm_nb);
+ cancel_work_sync(&info->timer->work);
kfree(info->timer->attr.attr.name);
kfree(info->timer);
} else {
pr_debug("decreased refcnt of timer %s to %u\n",
- info->label, info->timer->refcnt);
+ info->label, info->timer->refcnt);
}
mutex_unlock(&list_mutex);
@@ -273,6 +493,7 @@ static void idletimer_tg_destroy(const struct xt_tgdtor_param *par)
static struct xt_target idletimer_tg __read_mostly = {
.name = "IDLETIMER",
+ .revision = 1,
.family = NFPROTO_UNSPEC,
.target = idletimer_tg_target,
.targetsize = sizeof(struct idletimer_tg_info),
@@ -338,3 +559,4 @@ MODULE_DESCRIPTION("Xtables: idle time monitor");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS("ipt_IDLETIMER");
MODULE_ALIAS("ip6t_IDLETIMER");
+MODULE_ALIAS("arpt_IDLETIMER");
diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c
new file mode 100644
index 000000000000..0e581491c187
--- /dev/null
+++ b/net/netfilter/xt_qtaguid.c
@@ -0,0 +1,3031 @@
+/*
+ * Kernel iptables module to track stats for packets based on user tags.
+ *
+ * (C) 2011 Google, Inc
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * There are run-time debug flags enabled via the debug_mask module param, or
+ * via the DEFAULT_DEBUG_MASK. See xt_qtaguid_internal.h.
+ */
+#define DEBUG
+
+#include <linux/file.h>
+#include <linux/inetdevice.h>
+#include <linux/module.h>
+#include <linux/miscdevice.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_qtaguid.h>
+#include <linux/ratelimit.h>
+#include <linux/seq_file.h>
+#include <linux/skbuff.h>
+#include <linux/workqueue.h>
+#include <net/addrconf.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+
+#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#endif
+
+#include <linux/netfilter/xt_socket.h>
+#include "xt_qtaguid_internal.h"
+#include "xt_qtaguid_print.h"
+#include "../../fs/proc/internal.h"
+
+/*
+ * We only use the xt_socket funcs within a similar context to avoid unexpected
+ * return values.
+ */
+#define XT_SOCKET_SUPPORTED_HOOKS \
+ ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN))
+
+
+static const char *module_procdirname = "xt_qtaguid";
+static struct proc_dir_entry *xt_qtaguid_procdir;
+
+static unsigned int proc_iface_perms = S_IRUGO;
+module_param_named(iface_perms, proc_iface_perms, uint, S_IRUGO | S_IWUSR);
+
+static struct proc_dir_entry *xt_qtaguid_stats_file;
+static unsigned int proc_stats_perms = S_IRUGO;
+module_param_named(stats_perms, proc_stats_perms, uint, S_IRUGO | S_IWUSR);
+
+static struct proc_dir_entry *xt_qtaguid_ctrl_file;
+
+/* Everybody can write. But proc_ctrl_write_limited is true by default which
+ * limits what can be controlled. See the can_*() functions.
+ */
+static unsigned int proc_ctrl_perms = S_IRUGO | S_IWUGO;
+module_param_named(ctrl_perms, proc_ctrl_perms, uint, S_IRUGO | S_IWUSR);
+
+/* Limited by default, so the gid of the ctrl and stats proc entries
+ * will limit what can be done. See the can_*() functions.
+ */
+static bool proc_stats_readall_limited = true;
+static bool proc_ctrl_write_limited = true;
+
+module_param_named(stats_readall_limited, proc_stats_readall_limited, bool,
+ S_IRUGO | S_IWUSR);
+module_param_named(ctrl_write_limited, proc_ctrl_write_limited, bool,
+ S_IRUGO | S_IWUSR);
+
+/*
+ * Limit the number of active tags (via socket tags) for a given UID.
+ * Multiple processes could share the UID.
+ */
+static int max_sock_tags = DEFAULT_MAX_SOCK_TAGS;
+module_param(max_sock_tags, int, S_IRUGO | S_IWUSR);
+
+/*
+ * After the kernel has initialized this module, it is still possible
+ * to make it passive.
+ * Setting passive to Y:
+ * - the iface stats handling will not act on notifications.
+ * - iptables matches will never match.
+ * - ctrl commands silently succeed.
+ * - stats are always empty.
+ * This is mostly useful when a bug is suspected.
+ */
+static bool module_passive;
+module_param_named(passive, module_passive, bool, S_IRUGO | S_IWUSR);
+
+/*
+ * Control how qtaguid data is tracked per proc/uid.
+ * Setting tag_tracking_passive to Y:
+ * - don't create proc specific structs to track tags
+ * - don't check that active tag stats exceed some limits.
+ * - don't clean up socket tags on process exits.
+ * This is mostly useful when a bug is suspected.
+ */
+static bool qtu_proc_handling_passive;
+module_param_named(tag_tracking_passive, qtu_proc_handling_passive, bool,
+ S_IRUGO | S_IWUSR);
+
+#define QTU_DEV_NAME "xt_qtaguid"
+
+uint qtaguid_debug_mask = DEFAULT_DEBUG_MASK;
+module_param_named(debug_mask, qtaguid_debug_mask, uint, S_IRUGO | S_IWUSR);
+
+/*---------------------------------------------------------------------------*/
+static const char *iface_stat_procdirname = "iface_stat";
+static struct proc_dir_entry *iface_stat_procdir;
+/*
+ * The iface_stat_all* will go away once userspace gets used to the new
+ * fields that have a format line.
+ */
+static const char *iface_stat_all_procfilename = "iface_stat_all";
+static struct proc_dir_entry *iface_stat_all_procfile;
+static const char *iface_stat_fmt_procfilename = "iface_stat_fmt";
+static struct proc_dir_entry *iface_stat_fmt_procfile;
+
+
+static LIST_HEAD(iface_stat_list);
+static DEFINE_SPINLOCK(iface_stat_list_lock);
+
+static struct rb_root sock_tag_tree = RB_ROOT;
+static DEFINE_SPINLOCK(sock_tag_list_lock);
+
+static struct rb_root tag_counter_set_tree = RB_ROOT;
+static DEFINE_SPINLOCK(tag_counter_set_list_lock);
+
+static struct rb_root uid_tag_data_tree = RB_ROOT;
+static DEFINE_SPINLOCK(uid_tag_data_tree_lock);
+
+static struct rb_root proc_qtu_data_tree = RB_ROOT;
+/* No proc_qtu_data_tree_lock; use uid_tag_data_tree_lock */
+
+static struct qtaguid_event_counts qtu_events;
+/*----------------------------------------------*/
+/*
+ * May the current task act on behalf of arbitrary UIDs?
+ *
+ * True when the task is in the ctrl proc entry's group, its fsuid maps to 0
+ * in init_user_ns, ctrl writes are not limited, or its fsuid owns the ctrl
+ * proc entry.
+ */
+static bool can_manipulate_uids(void)
+{
+	/* root pwnd */
+	if (in_egroup_p(xt_qtaguid_ctrl_file->gid))
+		return true;
+	if (unlikely(!from_kuid(&init_user_ns, current_fsuid())))
+		return true;
+	if (unlikely(!proc_ctrl_write_limited))
+		return true;
+	return unlikely(uid_eq(current_fsuid(), xt_qtaguid_ctrl_file->uid));
+}
+
+/*
+ * A task may always use its own fsuid; any other @uid requires
+ * can_manipulate_uids().
+ */
+static bool can_impersonate_uid(kuid_t uid)
+{
+	if (uid_eq(uid, current_fsuid()))
+		return true;
+	return can_manipulate_uids();
+}
+
+/*
+ * May the current task read the stats recorded for @uid?
+ *
+ * True when the task is in the stats proc entry's group, its fsuid maps to 0
+ * in init_user_ns, @uid is its own fsuid, stats reading is not limited, or
+ * its fsuid owns the ctrl proc entry.
+ * NOTE(review): the last test looks at the ctrl file's owner rather than the
+ * stats file's - confirm this is intended.
+ */
+static bool can_read_other_uid_stats(kuid_t uid)
+{
+	/* root pwnd */
+	if (in_egroup_p(xt_qtaguid_stats_file->gid))
+		return true;
+	if (unlikely(!from_kuid(&init_user_ns, current_fsuid())))
+		return true;
+	if (uid_eq(uid, current_fsuid()))
+		return true;
+	if (unlikely(!proc_stats_readall_limited))
+		return true;
+	return unlikely(uid_eq(current_fsuid(), xt_qtaguid_ctrl_file->uid));
+}
+
+/*
+ * Add @bytes and @packets to the counter cell selected by
+ * [@set][@direction][@ifs_proto] inside @counters.
+ */
+static inline void dc_add_byte_packets(struct data_counters *counters, int set,
+				       enum ifs_tx_rx direction,
+				       enum ifs_proto ifs_proto,
+				       int bytes,
+				       int packets)
+{
+	typeof(counters->bpc[0][0][0]) *cell =
+		&counters->bpc[set][direction][ifs_proto];
+
+	cell->bytes += bytes;
+	cell->packets += packets;
+}
+
+/*
+ * Look up @tag in the rb-tree of tag_node entries rooted at @root, ordering
+ * by tag_compare().  Returns the matching node, or NULL if there is none.
+ */
+static struct tag_node *tag_node_tree_search(struct rb_root *root, tag_t tag)
+{
+	struct rb_node *cur = root->rb_node;
+
+	while (cur) {
+		struct tag_node *entry = rb_entry(cur, struct tag_node, node);
+		int cmp;
+
+		RB_DEBUG("qtaguid: tag_node_tree_search(0x%llx): "
+			 " node=%p data=%p\n", tag, cur, entry);
+		cmp = tag_compare(tag, entry->tag);
+		RB_DEBUG("qtaguid: tag_node_tree_search(0x%llx): "
+			 " data.tag=0x%llx (uid=%u) res=%d\n",
+			 tag, entry->tag, get_uid_from_tag(entry->tag), cmp);
+		if (cmp == 0)
+			return entry;
+		cur = (cmp < 0) ? cur->rb_left : cur->rb_right;
+	}
+	return NULL;
+}
+
+/*
+ * Insert @data into the tag_node rb-tree rooted at @root, ordering by
+ * tag_compare().  Hitting a node with an equal tag is a BUG().
+ */
+static void tag_node_tree_insert(struct tag_node *data, struct rb_root *root)
+{
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *parent = NULL;
+
+	/* Walk down to the empty slot where the new node belongs. */
+	while (*link) {
+		struct tag_node *cur = rb_entry(*link, struct tag_node, node);
+		int cmp = tag_compare(data->tag, cur->tag);
+
+		RB_DEBUG("qtaguid: %s(): tag=0x%llx"
+			 " (uid=%u)\n", __func__,
+			 cur->tag,
+			 get_uid_from_tag(cur->tag));
+		parent = *link;
+		if (cmp == 0)
+			BUG();
+		link = (cmp < 0) ? &parent->rb_left : &parent->rb_right;
+	}
+
+	/* Hook the node in and let the rb-tree rebalance itself. */
+	rb_link_node(&data->node, parent, link);
+	rb_insert_color(&data->node, root);
+}
+
+/* Insert a tag_stat into @root through its embedded tag_node. */
+static void tag_stat_tree_insert(struct tag_stat *data, struct rb_root *root)
+{
+	struct tag_node *tn = &data->tn;
+
+	tag_node_tree_insert(tn, root);
+}
+
+/*
+ * Find the tag_stat whose embedded tag_node matches @tag in @root.
+ * Returns NULL when no such node exists.
+ */
+static struct tag_stat *tag_stat_tree_search(struct rb_root *root, tag_t tag)
+{
+	struct tag_node *tn = tag_node_tree_search(root, tag);
+
+	return tn ? container_of(tn, struct tag_stat, tn) : NULL;
+}
+
+/* Insert a tag_counter_set into @root through its embedded tag_node. */
+static void tag_counter_set_tree_insert(struct tag_counter_set *data,
+					struct rb_root *root)
+{
+	struct tag_node *tn = &data->tn;
+
+	tag_node_tree_insert(tn, root);
+}
+
+/*
+ * Find the tag_counter_set whose embedded tag_node matches @tag in @root.
+ * Returns NULL when no such node exists.
+ */
+static struct tag_counter_set *tag_counter_set_tree_search(struct rb_root *root,
+							    tag_t tag)
+{
+	struct tag_node *tn = tag_node_tree_search(root, tag);
+
+	return tn ? container_of(tn, struct tag_counter_set, tn) : NULL;
+}
+
+/* Insert a tag_ref into @root through its embedded tag_node. */
+static void tag_ref_tree_insert(struct tag_ref *data, struct rb_root *root)
+{
+	struct tag_node *tn = &data->tn;
+
+	tag_node_tree_insert(tn, root);
+}
+
+/*
+ * Find the tag_ref whose embedded tag_node matches @tag in @root.
+ * Returns NULL when no such node exists.
+ */
+static struct tag_ref *tag_ref_tree_search(struct rb_root *root, tag_t tag)
+{
+	struct tag_node *tn = tag_node_tree_search(root, tag);
+
+	return tn ? container_of(tn, struct tag_ref, tn) : NULL;
+}
+
+/*
+ * Look up the sock_tag recorded for @sk in the tree rooted at @root.
+ * Entries are ordered by the socket pointer value.  Returns NULL when @sk
+ * has no entry.
+ */
+static struct sock_tag *sock_tag_tree_search(struct rb_root *root,
+					     const struct sock *sk)
+{
+	struct rb_node *cur = root->rb_node;
+
+	while (cur) {
+		struct sock_tag *entry = rb_entry(cur, struct sock_tag,
+						  sock_node);
+
+		if (sk == entry->sk)
+			return entry;
+		cur = (sk < entry->sk) ? cur->rb_left : cur->rb_right;
+	}
+	return NULL;
+}
+
+/*
+ * Insert @data into the sock_tag tree rooted at @root, ordered by socket
+ * pointer.  Inserting a socket that is already present is a BUG().
+ */
+static void sock_tag_tree_insert(struct sock_tag *data, struct rb_root *root)
+{
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *parent = NULL;
+
+	/* Walk down to the empty slot where the new node belongs. */
+	while (*link) {
+		struct sock_tag *cur = rb_entry(*link, struct sock_tag,
+						sock_node);
+
+		parent = *link;
+		if (data->sk == cur->sk)
+			BUG();
+		link = (data->sk < cur->sk) ? &parent->rb_left
+					    : &parent->rb_right;
+	}
+
+	/* Hook the node in and let the rb-tree rebalance itself. */
+	rb_link_node(&data->sock_node, parent, link);
+	rb_insert_color(&data->sock_node, root);
+}
+
+/*
+ * Empty @st_to_free_tree: every sock_tag is unlinked from the tree, the
+ * reference on its socket is dropped with sock_put() and the entry is
+ * freed.
+ */
+static void sock_tag_tree_erase(struct rb_root *st_to_free_tree)
+{
+	struct rb_node *cur, *next;
+
+	for (cur = rb_first(st_to_free_tree); cur; cur = next) {
+		struct sock_tag *st = rb_entry(cur, struct sock_tag,
+					       sock_node);
+
+		/* Fetch the successor before the current node goes away. */
+		next = rb_next(cur);
+		CT_DEBUG("qtaguid: %s(): "
+			 "erase st: sk=%p tag=0x%llx (uid=%u)\n", __func__,
+			 st->sk,
+			 st->tag,
+			 get_uid_from_tag(st->tag));
+		rb_erase(&st->sock_node, st_to_free_tree);
+		sock_put(st->sk);
+		kfree(st);
+	}
+}
+
+/*
+ * Look up the proc_qtu_data entry for @pid in the tree rooted at @root.
+ * Returns NULL when @pid has no entry.
+ */
+static struct proc_qtu_data *proc_qtu_data_tree_search(struct rb_root *root,
+							const pid_t pid)
+{
+	struct rb_node *cur = root->rb_node;
+
+	while (cur) {
+		struct proc_qtu_data *entry = rb_entry(cur,
+						       struct proc_qtu_data,
+						       node);
+
+		if (pid == entry->pid)
+			return entry;
+		cur = (pid < entry->pid) ? cur->rb_left : cur->rb_right;
+	}
+	return NULL;
+}
+
+/*
+ * Insert @data into the proc_qtu_data tree rooted at @root, ordered by pid.
+ * Inserting a pid that is already present is a BUG().
+ */
+static void proc_qtu_data_tree_insert(struct proc_qtu_data *data,
+				      struct rb_root *root)
+{
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *parent = NULL;
+
+	/* Walk down to the empty slot where the new node belongs. */
+	while (*link) {
+		struct proc_qtu_data *cur = rb_entry(*link,
+						     struct proc_qtu_data,
+						     node);
+
+		parent = *link;
+		if (data->pid == cur->pid)
+			BUG();
+		link = (data->pid < cur->pid) ? &parent->rb_left
+					      : &parent->rb_right;
+	}
+
+	/* Hook the node in and let the rb-tree rebalance itself. */
+	rb_link_node(&data->node, parent, link);
+	rb_insert_color(&data->node, root);
+}
+
+/*
+ * Insert @data into the uid_tag_data tree rooted at @root, ordered by uid.
+ * Inserting a uid that is already present is a BUG().
+ */
+static void uid_tag_data_tree_insert(struct uid_tag_data *data,
+				     struct rb_root *root)
+{
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *parent = NULL;
+
+	/* Walk down to the empty slot where the new node belongs. */
+	while (*link) {
+		struct uid_tag_data *cur = rb_entry(*link,
+						    struct uid_tag_data,
+						    node);
+
+		parent = *link;
+		if (data->uid == cur->uid)
+			BUG();
+		link = (data->uid < cur->uid) ? &parent->rb_left
+					      : &parent->rb_right;
+	}
+
+	/* Hook the node in and let the rb-tree rebalance itself. */
+	rb_link_node(&data->node, parent, link);
+	rb_insert_color(&data->node, root);
+}
+
+/*
+ * Looks up the uid_tag_data keyed by @uid in @root.
+ * Returns the matching entry, or NULL when the tree holds none.
+ */
+static struct uid_tag_data *uid_tag_data_tree_search(struct rb_root *root,
+						     uid_t uid)
+{
+	struct rb_node *cur = root->rb_node;
+
+	while (cur) {
+		struct uid_tag_data *candidate =
+			rb_entry(cur, struct uid_tag_data, node);
+
+		if (uid == candidate->uid)
+			return candidate;
+		cur = (uid < candidate->uid) ? cur->rb_left : cur->rb_right;
+	}
+	return NULL;
+}
+
+/*
+ * Finds the uid_tag_data for @uid in uid_tag_data_tree, allocating
+ * (GFP_ATOMIC) and inserting a zeroed entry when the UID has none yet.
+ * Returns the found or newly created entry, or ERR_PTR(-ENOMEM) when
+ * the allocation fails. This function takes no lock itself.
+ * When @found_res is not NULL:
+ *   *found_res is set to true if an existing entry was found,
+ *   *found_res is set to false if a new entry had to be allocated.
+ */
+struct uid_tag_data *get_uid_data(uid_t uid, bool *found_res)
+{
+	struct uid_tag_data *utd;
+
+	/* Look for the top level uid_tag_data of this UID */
+	utd = uid_tag_data_tree_search(&uid_tag_data_tree, uid);
+	DR_DEBUG("qtaguid: get_uid_data(%u) utd=%p\n", uid, utd);
+
+	if (found_res)
+		*found_res = (utd != NULL);
+
+	if (!utd) {
+		utd = kzalloc(sizeof(*utd), GFP_ATOMIC);
+		if (!utd) {
+			pr_err("qtaguid: get_uid_data(%u): "
+			       "tag data alloc failed\n", uid);
+			return ERR_PTR(-ENOMEM);
+		}
+
+		utd->uid = uid;
+		utd->tag_ref_tree = RB_ROOT;
+		uid_tag_data_tree_insert(utd, &uid_tag_data_tree);
+		DR_DEBUG("qtaguid: get_uid_data(%u) new utd=%p\n", uid, utd);
+	}
+	return utd;
+}
+
+/*
+ * Allocates a tag_ref for @new_tag, inserts it into
+ * @utd_entry->tag_ref_tree and increments @utd_entry->num_active_tags.
+ * Never returns NULL: the result is a valid pointer,
+ * ERR_PTR(-EMFILE) when one more tag would exceed max_sock_tags, or
+ * ERR_PTR(-ENOMEM) when the allocation fails.
+ */
+static struct tag_ref *new_tag_ref(tag_t new_tag,
+				   struct uid_tag_data *utd_entry)
+{
+	struct tag_ref *fresh;
+
+	if (utd_entry->num_active_tags + 1 > max_sock_tags) {
+		pr_info("qtaguid: new_tag_ref(0x%llx): "
+			"tag ref alloc quota exceeded. max=%d\n",
+			new_tag, max_sock_tags);
+		return ERR_PTR(-EMFILE);
+	}
+
+	fresh = kzalloc(sizeof(*fresh), GFP_ATOMIC);
+	if (!fresh) {
+		pr_err("qtaguid: new_tag_ref(0x%llx): "
+		       "tag ref alloc failed\n",
+		       new_tag);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	fresh->tn.tag = new_tag;
+	/* fresh->num_sock_tags is handled by the caller */
+	utd_entry->num_active_tags++;
+	tag_ref_tree_insert(fresh, &utd_entry->tag_ref_tree);
+	DR_DEBUG("qtaguid: new_tag_ref(0x%llx): "
+		 " inserted new tag ref %p\n",
+		 new_tag, fresh);
+	return fresh;
+}
+
+/*
+ * Looks up the tag_ref for @full_tag in the tag_ref_tree of the
+ * uid_tag_data belonging to the tag's UID. The uid_tag_data is obtained
+ * through get_uid_data(), which creates it when it does not exist yet.
+ * Returns the tag_ref, or NULL when there is none or when the
+ * uid_tag_data could not be obtained.
+ * When @utd_res is not NULL, *utd_res receives whatever get_uid_data()
+ * returned, which may be an ERR_PTR.
+ */
+static struct tag_ref *lookup_tag_ref(tag_t full_tag,
+				      struct uid_tag_data **utd_res)
+{
+	struct uid_tag_data *utd_entry;
+	struct tag_ref *tr_entry;
+	uid_t uid = get_uid_from_tag(full_tag);
+
+	DR_DEBUG("qtaguid: lookup_tag_ref(tag=0x%llx (uid=%u))\n",
+		 full_tag, uid);
+
+	/* Whether the entry pre-existed is of no interest here. */
+	utd_entry = get_uid_data(uid, NULL);
+	if (IS_ERR_OR_NULL(utd_entry)) {
+		if (utd_res)
+			*utd_res = utd_entry;
+		return NULL;
+	}
+
+	tr_entry = tag_ref_tree_search(&utd_entry->tag_ref_tree, full_tag);
+	if (utd_res)
+		*utd_res = utd_entry;
+	DR_DEBUG("qtaguid: lookup_tag_ref(0x%llx) utd_entry=%p tr_entry=%p\n",
+		 full_tag, utd_entry, tr_entry);
+	return tr_entry;
+}
+
+/*
+ * Returns the tag_ref for @full_tag, creating one through new_tag_ref()
+ * when lookup_tag_ref() finds none.
+ * Never returns NULL. Either PTR_ERR or a valid ptr.
+ * A failure to obtain the uid_tag_data (e.g. -ENOMEM from the atomic
+ * allocation in get_uid_data()) is reported as an ERR_PTR rather than
+ * taking the kernel down.
+ * When @utd_res is not NULL, *utd_res receives the uid_tag_data pointer
+ * reported by lookup_tag_ref(); it is only meaningful on success.
+ */
+static struct tag_ref *get_tag_ref(tag_t full_tag,
+				   struct uid_tag_data **utd_res)
+{
+	struct uid_tag_data *utd_entry;
+	struct tag_ref *tr_entry;
+
+	DR_DEBUG("qtaguid: get_tag_ref(0x%llx)\n",
+		 full_tag);
+	tr_entry = lookup_tag_ref(full_tag, &utd_entry);
+	if (IS_ERR_OR_NULL(utd_entry)) {
+		if (utd_res)
+			*utd_res = utd_entry;
+		/* Keep the "never NULL" contract even for a NULL utd. */
+		return utd_entry ? ERR_CAST(utd_entry) : ERR_PTR(-ENOMEM);
+	}
+	if (!tr_entry)
+		tr_entry = new_tag_ref(full_tag, utd_entry);
+
+	if (utd_res)
+		*utd_res = utd_entry;
+	DR_DEBUG("qtaguid: get_tag_ref(0x%llx) utd=%p tr=%p\n",
+		 full_tag, utd_entry, tr_entry);
+	return tr_entry;
+}
+
+/*
+ * Releases @utd_entry once nothing uses it any more: when its
+ * tag_ref_tree is empty and num_pqd is zero the entry is removed from
+ * uid_tag_data_tree and freed. Otherwise it is left untouched.
+ */
+static void put_utd_entry(struct uid_tag_data *utd_entry)
+{
+	bool still_used = !RB_EMPTY_ROOT(&utd_entry->tag_ref_tree) ||
+			  utd_entry->num_pqd;
+
+	if (still_used) {
+		DR_DEBUG("qtaguid: %s(): "
+			 "utd_entry=%p still has %d tags %d proc_qtu_data\n",
+			 __func__, utd_entry, utd_entry->num_active_tags,
+			 utd_entry->num_pqd);
+		BUG_ON(!(utd_entry->num_active_tags ||
+			 utd_entry->num_pqd));
+		return;
+	}
+
+	/* Nobody is left: drop the UID tag data entry. */
+	DR_DEBUG("qtaguid: %s(): "
+		 "erase utd_entry=%p uid=%u "
+		 "by pid=%u tgid=%u uid=%u\n", __func__,
+		 utd_entry, utd_entry->uid,
+		 current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()));
+	BUG_ON(utd_entry->num_active_tags);
+	rb_erase(&utd_entry->node, &uid_tag_data_tree);
+	kfree(utd_entry);
+}
+
+/*
+ * Frees @tr_entry unless sock_tags still use it (num_sock_tags != 0).
+ * On free, @utd_entry->num_active_tags is decremented and the tag_ref
+ * is removed from @utd_entry->tag_ref_tree.
+ */
+static void free_tag_ref_from_utd_entry(struct tag_ref *tr_entry,
+					struct uid_tag_data *utd_entry)
+{
+	DR_DEBUG("qtaguid: %s(): %p tag=0x%llx (uid=%u)\n", __func__,
+		 tr_entry, tr_entry->tn.tag,
+		 get_uid_from_tag(tr_entry->tn.tag));
+
+	/* Still referenced by at least one sock_tag: keep it. */
+	if (tr_entry->num_sock_tags)
+		return;
+
+	BUG_ON(!utd_entry->num_active_tags);
+	utd_entry->num_active_tags--;
+	rb_erase(&tr_entry->tn.node, &utd_entry->tag_ref_tree);
+	DR_DEBUG("qtaguid: %s(): erased %p\n", __func__, tr_entry);
+	kfree(tr_entry);
+}
+
+/*
+ * Walks @utd_entry->tag_ref_tree and hands candidates to
+ * free_tag_ref_from_utd_entry(). When @full_tag carries no acct_tag
+ * every tag_ref is a candidate; otherwise only the one whose tag equals
+ * @full_tag.
+ */
+static void put_tag_ref_tree(tag_t full_tag, struct uid_tag_data *utd_entry)
+{
+	struct rb_node *cur, *following;
+	tag_t acct_tag;
+
+	DR_DEBUG("qtaguid: %s(tag=0x%llx (uid=%u))\n", __func__,
+		 full_tag, get_uid_from_tag(full_tag));
+	acct_tag = get_atag_from_tag(full_tag);
+
+	for (cur = rb_first(&utd_entry->tag_ref_tree); cur; cur = following) {
+		struct tag_ref *tr_entry = rb_entry(cur, struct tag_ref,
+						    tn.node);
+
+		/* The tag_ref may be freed below, so step first. */
+		following = rb_next(cur);
+		if (acct_tag && tr_entry->tn.tag != full_tag)
+			continue;
+		free_tag_ref_from_utd_entry(tr_entry, utd_entry);
+	}
+}
+
+/*
+ * read() handler for a proc file whose PDE data points at a 64-bit
+ * counter: the current value is rendered as decimal text plus newline.
+ */
+static ssize_t read_proc_u64(struct file *file, char __user *buf,
+			     size_t size, loff_t *ppos)
+{
+	char text[24];
+	uint64_t *valuep = PDE_DATA(file_inode(file));
+	size_t text_len = scnprintf(text, sizeof(text), "%llu\n", *valuep);
+
+	return simple_read_from_buffer(buf, size, ppos, text, text_len);
+}
+
+/*
+ * read() handler for a proc file whose PDE data points at a bool:
+ * the current value is rendered with "%u" followed by a newline.
+ */
+static ssize_t read_proc_bool(struct file *file, char __user *buf,
+			      size_t size, loff_t *ppos)
+{
+	char text[24];
+	bool *valuep = PDE_DATA(file_inode(file));
+	size_t text_len = scnprintf(text, sizeof(text), "%u\n", *valuep);
+
+	return simple_read_from_buffer(buf, size, ppos, text, text_len);
+}
+
+/*
+ * Returns the active counter set recorded for the UID part of @tag,
+ * or 0 when tag_counter_set_tree has no entry for it.
+ * Takes tag_counter_set_list_lock around the lookup.
+ */
+static int get_active_counter_set(tag_t tag)
+{
+	struct tag_counter_set *tcs;
+	int active_set;
+
+	MT_DEBUG("qtaguid: get_active_counter_set(tag=0x%llx)"
+		 " (uid=%u)\n",
+		 tag, get_uid_from_tag(tag));
+	/* For now we only handle UID tags for active sets */
+	tag = get_utag_from_tag(tag);
+
+	spin_lock_bh(&tag_counter_set_list_lock);
+	tcs = tag_counter_set_tree_search(&tag_counter_set_tree, tag);
+	active_set = tcs ? tcs->active_set : 0;
+	spin_unlock_bh(&tag_counter_set_list_lock);
+
+	return active_set;
+}
+
+/*
+ * Returns the iface_stat on iface_stat_list whose ifname equals
+ * @ifname, or NULL when @ifname is NULL or no entry matches.
+ * Caller must hold iface_stat_list_lock
+ */
+static struct iface_stat *get_iface_entry(const char *ifname)
+{
+	struct iface_stat *candidate;
+
+	if (ifname == NULL) {
+		pr_info("qtaguid: iface_stat: get() NULL device name\n");
+		return NULL;
+	}
+
+	/* Linear scan over the tracked interfaces */
+	list_for_each_entry(candidate, &iface_stat_list, list) {
+		if (strcmp(ifname, candidate->ifname) == 0)
+			return candidate;
+	}
+	return NULL;
+}
+
+/* Emits the column-name line of the fmt2 listing (fmt2 only). */
+static void pp_iface_stat_header(struct seq_file *m)
+{
+	static const char fmt2_columns[] =
+		"ifname "
+		"total_skb_rx_bytes total_skb_rx_packets "
+		"total_skb_tx_bytes total_skb_tx_packets "
+		"rx_tcp_bytes rx_tcp_packets "
+		"rx_udp_bytes rx_udp_packets "
+		"rx_other_bytes rx_other_packets "
+		"tx_tcp_bytes tx_tcp_packets "
+		"tx_udp_bytes tx_udp_packets "
+		"tx_other_bytes tx_other_packets\n";
+
+	seq_puts(m, fmt2_columns);
+}
+
+/*
+ * Prints one fmt2 line for @iface_entry from its totals_via_skb
+ * counters: interface name, rx/tx byte and packet sums, then the
+ * per-protocol (tcp, udp, other) rx and tx byte/packet pairs.
+ */
+static void pp_iface_stat_line(struct seq_file *m,
+			       struct iface_stat *iface_entry)
+{
+	/* We only use one set for the device */
+	int set = 0;
+	struct data_counters *skb_totals = &iface_entry->totals_via_skb;
+
+	seq_printf(m, "%s %llu %llu %llu %llu %llu %llu %llu %llu "
+		   "%llu %llu %llu %llu %llu %llu %llu %llu\n",
+		   iface_entry->ifname,
+		   dc_sum_bytes(skb_totals, set, IFS_RX),
+		   dc_sum_packets(skb_totals, set, IFS_RX),
+		   dc_sum_bytes(skb_totals, set, IFS_TX),
+		   dc_sum_packets(skb_totals, set, IFS_TX),
+		   skb_totals->bpc[set][IFS_RX][IFS_TCP].bytes,
+		   skb_totals->bpc[set][IFS_RX][IFS_TCP].packets,
+		   skb_totals->bpc[set][IFS_RX][IFS_UDP].bytes,
+		   skb_totals->bpc[set][IFS_RX][IFS_UDP].packets,
+		   skb_totals->bpc[set][IFS_RX][IFS_PROTO_OTHER].bytes,
+		   skb_totals->bpc[set][IFS_RX][IFS_PROTO_OTHER].packets,
+		   skb_totals->bpc[set][IFS_TX][IFS_TCP].bytes,
+		   skb_totals->bpc[set][IFS_TX][IFS_TCP].packets,
+		   skb_totals->bpc[set][IFS_TX][IFS_UDP].bytes,
+		   skb_totals->bpc[set][IFS_TX][IFS_UDP].packets,
+		   skb_totals->bpc[set][IFS_TX][IFS_PROTO_OTHER].bytes,
+		   skb_totals->bpc[set][IFS_TX][IFS_PROTO_OTHER].packets);
+}
+
+struct proc_iface_stat_fmt_info { /* per-open private state of the iface_stat seq_file */
+ int fmt; /* listing format: 1 prints the dev-based totals line, 2 prints a header plus the skb-based line */
+};
+
+/*
+ * seq_file start: takes iface_stat_list_lock (dropped again in
+ * iface_stat_fmt_proc_stop()) and positions the iterator at *pos.
+ * Produces no records while the module is passive, and prints the
+ * fmt2 header when a fmt2 listing starts at position 0.
+ */
+static void *iface_stat_fmt_proc_start(struct seq_file *m, loff_t *pos)
+{
+	struct proc_iface_stat_fmt_info *info = m->private;
+	loff_t offset = *pos;
+
+	/*
+	 * Holding this lock keeps iface_stat_update() from changing
+	 * active, and in turn keeps an interface from unregistering
+	 * itself.
+	 */
+	spin_lock_bh(&iface_stat_list_lock);
+
+	if (unlikely(module_passive))
+		return NULL;
+
+	if (offset == 0 && info->fmt == 2)
+		pp_iface_stat_header(m);
+
+	return seq_list_start(&iface_stat_list, offset);
+}
+
+/* seq_file next: advances to the following entry of iface_stat_list. */
+static void *iface_stat_fmt_proc_next(struct seq_file *m, void *p, loff_t *pos)
+{
+	void *following = seq_list_next(p, &iface_stat_list, pos);
+
+	return following;
+}
+
+/* seq_file stop: drops the lock taken by iface_stat_fmt_proc_start(). */
+static void iface_stat_fmt_proc_stop(struct seq_file *m, void *p)
+{
+	spin_unlock_bh(&iface_stat_list_lock);
+}
+
+/*
+ * seq_file show: prints one line for the iface_stat behind @v.
+ * fmt 1 prints name, active flag, the accumulated totals_via_dev and
+ * the device's current stats (all zero when the interface is inactive);
+ * any other fmt prints the skb-based line via pp_iface_stat_line().
+ */
+static int iface_stat_fmt_proc_show(struct seq_file *m, void *v)
+{
+	struct proc_iface_stat_fmt_info *info = m->private;
+	struct rtnl_link_stats64 no_dev_stats = {0};
+	struct rtnl_link_stats64 dev_stats, *stats;
+	struct iface_stat *iface_entry;
+
+	CT_DEBUG("qtaguid:proc iface_stat_fmt pid=%u tgid=%u uid=%u\n",
+		 current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()));
+
+	iface_entry = list_entry(v, struct iface_stat, list);
+
+	/* An inactive interface has no net_dev to query. */
+	stats = iface_entry->active
+		? dev_get_stats(iface_entry->net_dev, &dev_stats)
+		: &no_dev_stats;
+
+	/*
+	 * If the meaning of the data changes, then update the fmtX
+	 * string.
+	 */
+	if (info->fmt != 1) {
+		pp_iface_stat_line(m, iface_entry);
+		return 0;
+	}
+
+	seq_printf(m, "%s %d %llu %llu %llu %llu %llu %llu %llu %llu\n",
+		   iface_entry->ifname,
+		   iface_entry->active,
+		   iface_entry->totals_via_dev[IFS_RX].bytes,
+		   iface_entry->totals_via_dev[IFS_RX].packets,
+		   iface_entry->totals_via_dev[IFS_TX].bytes,
+		   iface_entry->totals_via_dev[IFS_TX].packets,
+		   stats->rx_bytes, stats->rx_packets,
+		   stats->tx_bytes, stats->tx_packets
+		   );
+	return 0;
+}
+
+static const struct file_operations read_u64_fops = { /* proc file exposing a u64; only read and llseek are provided */
+ .read = read_proc_u64,
+ .llseek = default_llseek,
+};
+
+static const struct file_operations read_bool_fops = { /* proc file exposing a bool; only read and llseek are provided */
+ .read = read_proc_bool,
+ .llseek = default_llseek,
+};
+
+/*
+ * Work handler queued by iface_alloc(): creates the per-interface proc
+ * directory under iface_stat_procdir with the tx_bytes, rx_bytes,
+ * tx_packets, rx_packets and active files, then frees the work item.
+ */
+static void iface_create_proc_worker(struct work_struct *work)
+{
+	struct iface_stat_work *isw = container_of(work, struct iface_stat_work,
+						   iface_work);
+	struct iface_stat *new_iface = isw->iface_entry;
+	struct proc_dir_entry *dir;
+
+	/* iface_entries are not deleted, so safe to manipulate. */
+	dir = proc_mkdir(new_iface->ifname, iface_stat_procdir);
+	if (IS_ERR_OR_NULL(dir)) {
+		pr_err("qtaguid: iface_stat: create_proc(): alloc failed.\n");
+		goto out_free_work;
+	}
+
+	new_iface->proc_ptr = dir;
+
+	proc_create_data("tx_bytes", proc_iface_perms, dir,
+			 &read_u64_fops,
+			 &new_iface->totals_via_dev[IFS_TX].bytes);
+	proc_create_data("rx_bytes", proc_iface_perms, dir,
+			 &read_u64_fops,
+			 &new_iface->totals_via_dev[IFS_RX].bytes);
+	proc_create_data("tx_packets", proc_iface_perms, dir,
+			 &read_u64_fops,
+			 &new_iface->totals_via_dev[IFS_TX].packets);
+	proc_create_data("rx_packets", proc_iface_perms, dir,
+			 &read_u64_fops,
+			 &new_iface->totals_via_dev[IFS_RX].packets);
+	proc_create_data("active", proc_iface_perms, dir,
+			 &read_bool_fops, &new_iface->active);
+
+	IF_DEBUG("qtaguid: iface_stat: create_proc(): done "
+		 "entry=%p dev=%s\n", new_iface, new_iface->ifname);
+
+out_free_work:
+	kfree(isw);
+}
+
+/*
+ * Switches tracking of @entry on or off.
+ * Activation stores @net_dev in the entry and marks it active;
+ * deactivation marks it inactive and clears the stored net_dev.
+ */
+static void _iface_stat_set_active(struct iface_stat *entry,
+				   struct net_device *net_dev,
+				   bool activate)
+{
+	if (!activate) {
+		entry->active = false;
+		entry->net_dev = NULL;
+		IF_DEBUG("qtaguid: %s(%s): "
+			 "disable tracking. rfcnt=%d\n", __func__,
+			 entry->ifname,
+			 __this_cpu_read(*net_dev->pcpu_refcnt));
+		return;
+	}
+
+	entry->net_dev = net_dev;
+	entry->active = true;
+	IF_DEBUG("qtaguid: %s(%s): "
+		 "enable tracking. rfcnt=%d\n", __func__,
+		 entry->ifname,
+		 __this_cpu_read(*net_dev->pcpu_refcnt));
+}
+
+/*
+ * Allocates an active iface_stat for @net_dev, queues the work item
+ * that creates its proc entries and adds it to iface_stat_list.
+ * Returns the new entry, or NULL when any allocation fails.
+ * Caller must hold iface_stat_list_lock
+ */
+static struct iface_stat *iface_alloc(struct net_device *net_dev)
+{
+	struct iface_stat_work *isw;
+	struct iface_stat *new_iface;
+
+	new_iface = kzalloc(sizeof(*new_iface), GFP_ATOMIC);
+	if (new_iface == NULL) {
+		pr_err("qtaguid: iface_stat: create(%s): "
+		       "iface_stat alloc failed\n", net_dev->name);
+		return NULL;
+	}
+
+	new_iface->ifname = kstrdup(net_dev->name, GFP_ATOMIC);
+	if (new_iface->ifname == NULL) {
+		pr_err("qtaguid: iface_stat: create(%s): "
+		       "ifname alloc failed\n", net_dev->name);
+		goto err_free_iface;
+	}
+
+	spin_lock_init(&new_iface->tag_stat_list_lock);
+	new_iface->tag_stat_tree = RB_ROOT;
+	_iface_stat_set_active(new_iface, net_dev, true);
+
+	/*
+	 * ipv6 notifier chains are atomic :(
+	 * No create_proc_read_entry() for you!
+	 * The proc entries are created later from a work item instead.
+	 */
+	isw = kmalloc(sizeof(*isw), GFP_ATOMIC);
+	if (!isw) {
+		pr_err("qtaguid: iface_stat: create(%s): "
+		       "work alloc failed\n", new_iface->ifname);
+		_iface_stat_set_active(new_iface, net_dev, false);
+		goto err_free_name;
+	}
+
+	isw->iface_entry = new_iface;
+	INIT_WORK(&isw->iface_work, iface_create_proc_worker);
+	schedule_work(&isw->iface_work);
+	list_add(&new_iface->list, &iface_stat_list);
+	return new_iface;
+
+err_free_name:
+	kfree(new_iface->ifname);
+err_free_iface:
+	kfree(new_iface);
+	return NULL;
+}
+
+/*
+ * Detects a device whose byte counters went backwards relative to the
+ * values stashed in @iface->last_known. When that happens on an active
+ * interface with valid stashed values, the stashed values are added to
+ * totals_via_dev and the stash is invalidated.
+ */
+static void iface_check_stats_reset_and_adjust(struct net_device *net_dev,
+					       struct iface_stat *iface)
+{
+	struct rtnl_link_stats64 dev_stats, *stats;
+	bool stats_rewound;
+
+	stats = dev_get_stats(net_dev, &dev_stats);
+	/* A byte count below the stashed one means the stats were reset. */
+	stats_rewound =
+		(stats->rx_bytes < iface->last_known[IFS_RX].bytes)
+		|| (stats->tx_bytes < iface->last_known[IFS_TX].bytes);
+
+	IF_DEBUG("qtaguid: %s(%s): iface=%p netdev=%p "
+		 "bytes rx/tx=%llu/%llu "
+		 "active=%d last_known=%d "
+		 "stats_rewound=%d\n", __func__,
+		 net_dev ? net_dev->name : "?",
+		 iface, net_dev,
+		 stats->rx_bytes, stats->tx_bytes,
+		 iface->active, iface->last_known_valid, stats_rewound);
+
+	if (!(iface->active && iface->last_known_valid && stats_rewound))
+		return;
+
+	pr_warn_once("qtaguid: iface_stat: %s(%s): "
+		     "iface reset its stats unexpectedly\n", __func__,
+		     net_dev->name);
+
+	iface->totals_via_dev[IFS_TX].bytes +=
+		iface->last_known[IFS_TX].bytes;
+	iface->totals_via_dev[IFS_TX].packets +=
+		iface->last_known[IFS_TX].packets;
+	iface->totals_via_dev[IFS_RX].bytes +=
+		iface->last_known[IFS_RX].bytes;
+	iface->totals_via_dev[IFS_RX].packets +=
+		iface->last_known[IFS_RX].packets;
+	iface->last_known_valid = false;
+	IF_DEBUG("qtaguid: %s(%s): iface=%p "
+		 "used last known bytes rx/tx=%llu/%llu\n", __func__,
+		 iface->ifname, iface, iface->last_known[IFS_RX].bytes,
+		 iface->last_known[IFS_TX].bytes);
+}
+
+/*
+ * Starts tracking @net_dev: re-activates the existing iface_stat of
+ * that name, or allocates a new one when there is none.
+ * When @ifa is NULL the device's IPv4 address list is searched for an
+ * address whose label equals the device name; without such an address
+ * nothing is done.
+ */
+static void iface_stat_create(struct net_device *net_dev,
+			      struct in_ifaddr *ifa)
+{
+	struct in_device *in_dev = NULL;
+	struct iface_stat *existing;
+	struct iface_stat *new_iface;
+	const char *ifname;
+	__be32 ipaddr = 0;
+
+	IF_DEBUG("qtaguid: iface_stat: create(%s): ifa=%p netdev=%p\n",
+		 net_dev ? net_dev->name : "?",
+		 ifa, net_dev);
+	if (!net_dev) {
+		pr_err("qtaguid: iface_stat: create(): no net dev\n");
+		return;
+	}
+	ifname = net_dev->name;
+
+	if (!ifa) {
+		in_dev = in_dev_get(net_dev);
+		if (!in_dev) {
+			pr_err("qtaguid: iface_stat: create(%s): no inet dev\n",
+			       ifname);
+			return;
+		}
+		IF_DEBUG("qtaguid: iface_stat: create(%s): in_dev=%p\n",
+			 ifname, in_dev);
+
+		/* Pick the address labelled with the device's own name. */
+		ifa = in_dev->ifa_list;
+		while (ifa) {
+			IF_DEBUG("qtaguid: iface_stat: create(%s): "
+				 "ifa=%p ifa_label=%s\n",
+				 ifname, ifa, ifa->ifa_label);
+			if (!strcmp(ifname, ifa->ifa_label))
+				break;
+			ifa = ifa->ifa_next;
+		}
+	}
+
+	if (!ifa) {
+		IF_DEBUG("qtaguid: iface_stat: create(%s): no matching IP\n",
+			 ifname);
+		goto done_put;
+	}
+	ipaddr = ifa->ifa_local;
+
+	spin_lock_bh(&iface_stat_list_lock);
+	existing = get_iface_entry(ifname);
+	if (existing == NULL) {
+		new_iface = iface_alloc(net_dev);
+		IF_DEBUG("qtaguid: iface_stat: create(%s): done "
+			 "entry=%p ip=%pI4\n", ifname, new_iface, &ipaddr);
+	} else {
+		IF_DEBUG("qtaguid: iface_stat: create(%s): entry=%p\n",
+			 ifname, existing);
+		iface_check_stats_reset_and_adjust(net_dev, existing);
+		_iface_stat_set_active(existing, net_dev, true);
+		IF_DEBUG("qtaguid: %s(%s): "
+			 "tracking now %d on ip=%pI4\n", __func__,
+			 existing->ifname, true, &ipaddr);
+	}
+	spin_unlock_bh(&iface_stat_list_lock);
+
+done_put:
+	/* Only set when the reference was taken above. */
+	if (in_dev)
+		in_dev_put(in_dev);
+}
+
+/*
+ * IPv6 counterpart of iface_stat_create(): re-activates the existing
+ * iface_stat named after @net_dev, or allocates a new one.
+ * Does nothing when @net_dev is NULL, when the device has no in_device,
+ * or when @ifa is NULL.
+ */
+static void iface_stat_create_ipv6(struct net_device *net_dev,
+				   struct inet6_ifaddr *ifa)
+{
+	struct in_device *in_dev;
+	const char *ifname;
+	struct iface_stat *entry;
+	struct iface_stat *new_iface;
+
+	IF_DEBUG("qtaguid: iface_stat: create6(): ifa=%p netdev=%p->name=%s\n",
+		 ifa, net_dev, net_dev ? net_dev->name : "");
+	if (!net_dev) {
+		pr_err("qtaguid: iface_stat: create6(): no net dev!\n");
+		return;
+	}
+	ifname = net_dev->name;
+
+	in_dev = in_dev_get(net_dev);
+	if (!in_dev) {
+		pr_err("qtaguid: iface_stat: create6(%s): no inet dev\n",
+		       ifname);
+		return;
+	}
+
+	IF_DEBUG("qtaguid: iface_stat: create6(%s): in_dev=%p\n",
+		 ifname, in_dev);
+
+	if (!ifa) {
+		IF_DEBUG("qtaguid: iface_stat: create6(%s): no matching IP\n",
+			 ifname);
+		goto done_put;
+	}
+
+	spin_lock_bh(&iface_stat_list_lock);
+	entry = get_iface_entry(ifname);
+	if (entry != NULL) {
+		IF_DEBUG("qtaguid: %s(%s): entry=%p\n", __func__,
+			 ifname, entry);
+		iface_check_stats_reset_and_adjust(net_dev, entry);
+		_iface_stat_set_active(entry, net_dev, true);
+		IF_DEBUG("qtaguid: %s(%s): "
+			 "tracking now %d on ip=%pI6c\n", __func__,
+			 entry->ifname, true, &ifa->addr);
+		goto done_unlock_put;
+	}
+
+	new_iface = iface_alloc(net_dev);
+	IF_DEBUG("qtaguid: iface_stat: create6(%s): done "
+		 "entry=%p ip=%pI6c\n", ifname, new_iface, &ifa->addr);
+
+done_unlock_put:
+	spin_unlock_bh(&iface_stat_list_lock);
+done_put:
+	in_dev_put(in_dev);
+}
+
+/*
+ * Lock-free lookup of the sock_tag for @sk in sock_tag_tree; the only
+ * visible caller, get_sock_stat(), holds sock_tag_list_lock around it.
+ */
+static struct sock_tag *get_sock_stat_nl(const struct sock *sk)
+{
+	struct sock_tag *found;
+
+	MT_DEBUG("qtaguid: get_sock_stat_nl(sk=%p)\n", sk);
+	found = sock_tag_tree_search(&sock_tag_tree, sk);
+	return found;
+}
+
+/*
+ * Looks up the sock_tag for @sk while holding sock_tag_list_lock.
+ * Returns NULL for a NULL @sk or an untagged socket. The lock is
+ * dropped before returning.
+ */
+static struct sock_tag *get_sock_stat(const struct sock *sk)
+{
+	struct sock_tag *found = NULL;
+
+	MT_DEBUG("qtaguid: get_sock_stat(sk=%p)\n", sk);
+	if (sk) {
+		spin_lock_bh(&sock_tag_list_lock);
+		found = get_sock_stat_nl(sk);
+		spin_unlock_bh(&sock_tag_list_lock);
+	}
+	return found;
+}
+
+/*
+ * Returns the transport protocol of @skb according to @par->family:
+ * the result of ipv6_find_hdr() for IPv6 (negative when no transport
+ * header is found), the IP header's protocol field for IPv4, and
+ * IPPROTO_RAW for any other family.
+ */
+static int ipx_proto(const struct sk_buff *skb,
+		     struct xt_action_param *par)
+{
+	int thoff = 0;
+	int tproto;
+
+	if (par->family == NFPROTO_IPV6) {
+		tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL);
+		if (tproto < 0) {
+			MT_DEBUG("%s(): transport header not found in ipv6"
+				 " skb=%p\n", __func__, skb);
+		}
+		return tproto;
+	}
+
+	if (par->family == NFPROTO_IPV4)
+		return ip_hdr(skb)->protocol;
+
+	return IPPROTO_RAW;
+}
+
+/*
+ * Accounts one packet of @bytes bytes in @dc for the given counter
+ * @set and @direction, under the TCP, UDP or "other" protocol slot
+ * chosen from @proto.
+ */
+static void
+data_counters_update(struct data_counters *dc, int set,
+		     enum ifs_tx_rx direction, int proto, int bytes)
+{
+	int slot;
+
+	if (proto == IPPROTO_TCP)
+		slot = IFS_TCP;
+	else if (proto == IPPROTO_UDP)
+		slot = IFS_UDP;
+	else
+		slot = IFS_PROTO_OTHER;	/* IPPROTO_IP and everything else */
+
+	dc_add_byte_packets(dc, set, direction, slot, bytes, 1);
+}
+
+/*
+ * Folds the device's current stats into the tracked entry of
+ * @net_dev. Does nothing when the interface is not tracked or is
+ * already inactive.
+ * With @stash_only the stats are only remembered in last_known[];
+ * otherwise they are added to totals_via_dev and tracking is disabled.
+ * The visible callers pass stash_only=true for NETDEV_DOWN and false
+ * for NETDEV_UNREGISTER.
+ */
+static void iface_stat_update(struct net_device *net_dev, bool stash_only)
+{
+	struct rtnl_link_stats64 dev_stats, *stats;
+	struct iface_stat *entry;
+
+	stats = dev_get_stats(net_dev, &dev_stats);
+	spin_lock_bh(&iface_stat_list_lock);
+	entry = get_iface_entry(net_dev->name);
+	if (entry == NULL) {
+		IF_DEBUG("qtaguid: iface_stat: update(%s): not tracked\n",
+			 net_dev->name);
+		goto out_unlock;
+	}
+
+	IF_DEBUG("qtaguid: %s(%s): entry=%p\n", __func__,
+		 net_dev->name, entry);
+	if (!entry->active) {
+		IF_DEBUG("qtaguid: %s(%s): already disabled\n", __func__,
+			 net_dev->name);
+		goto out_unlock;
+	}
+
+	if (stash_only) {
+		entry->last_known[IFS_TX].bytes = stats->tx_bytes;
+		entry->last_known[IFS_TX].packets = stats->tx_packets;
+		entry->last_known[IFS_RX].bytes = stats->rx_bytes;
+		entry->last_known[IFS_RX].packets = stats->rx_packets;
+		entry->last_known_valid = true;
+		IF_DEBUG("qtaguid: %s(%s): "
+			 "dev stats stashed rx/tx=%llu/%llu\n", __func__,
+			 net_dev->name, stats->rx_bytes, stats->tx_bytes);
+		goto out_unlock;
+	}
+
+	entry->totals_via_dev[IFS_TX].bytes += stats->tx_bytes;
+	entry->totals_via_dev[IFS_TX].packets += stats->tx_packets;
+	entry->totals_via_dev[IFS_RX].bytes += stats->rx_bytes;
+	entry->totals_via_dev[IFS_RX].packets += stats->rx_packets;
+	/* We don't need the last_known[] anymore */
+	entry->last_known_valid = false;
+	_iface_stat_set_active(entry, net_dev, false);
+	IF_DEBUG("qtaguid: %s(%s): "
+		 "disable tracking. rx/tx=%llu/%llu\n", __func__,
+		 net_dev->name, stats->rx_bytes, stats->tx_bytes);
+
+out_unlock:
+	spin_unlock_bh(&iface_stat_list_lock);
+}
+
+/*
+ * Accounts @skb in the totals_via_skb counters of the interface it
+ * belongs to. The interface is skb->dev when set, otherwise par->in
+ * or par->out. Does nothing when that interface is not tracked.
+ * Having no device at all is treated as a bug.
+ */
+static void iface_stat_update_from_skb(const struct sk_buff *skb,
+				       struct xt_action_param *par)
+{
+	enum ifs_tx_rx direction = par->in ? IFS_RX : IFS_TX;
+	const struct net_device *hook_dev = par->in ? : par->out;
+	const struct net_device *el_dev = skb->dev;
+	struct iface_stat *entry;
+	int bytes = skb->len;
+	int proto;
+
+	if (!el_dev) {
+		MT_DEBUG("qtaguid[%d]: no skb->dev\n", par->hooknum);
+		el_dev = hook_dev;
+	} else if (el_dev != hook_dev) {
+		MT_DEBUG("qtaguid[%d]: skb->dev=%p %s vs "
+			 "par->(in/out)=%p %s\n",
+			 par->hooknum, el_dev, el_dev->name, hook_dev,
+			 hook_dev->name);
+	}
+
+	if (unlikely(!el_dev)) {
+		pr_err_ratelimited("qtaguid[%d]: %s(): no par->in/out?!!\n",
+				   par->hooknum, __func__);
+		BUG();
+	}
+
+	proto = ipx_proto(skb, par);
+	MT_DEBUG("qtaguid[%d]: dev name=%s type=%d fam=%d proto=%d\n",
+		 par->hooknum, el_dev->name, el_dev->type,
+		 par->family, proto);
+
+	spin_lock_bh(&iface_stat_list_lock);
+	entry = get_iface_entry(el_dev->name);
+	if (entry != NULL) {
+		IF_DEBUG("qtaguid: %s(%s): entry=%p\n", __func__,
+			 el_dev->name, entry);
+
+		data_counters_update(&entry->totals_via_skb, 0, direction,
+				     proto, bytes);
+	} else {
+		IF_DEBUG("qtaguid: iface_stat: %s(%s): not tracked\n",
+			 __func__, el_dev->name);
+	}
+	spin_unlock_bh(&iface_stat_list_lock);
+}
+
+/*
+ * Accounts one packet in @tag_entry's counters, using the counter set
+ * currently active for the tag, and mirrors the update into the
+ * parent counters when the entry has any.
+ */
+static void tag_stat_update(struct tag_stat *tag_entry,
+			    enum ifs_tx_rx direction, int proto, int bytes)
+{
+	int active_set = get_active_counter_set(tag_entry->tn.tag);
+
+	MT_DEBUG("qtaguid: tag_stat_update(tag=0x%llx (uid=%u) set=%d "
+		 "dir=%d proto=%d bytes=%d)\n",
+		 tag_entry->tn.tag, get_uid_from_tag(tag_entry->tn.tag),
+		 active_set, direction, proto, bytes);
+
+	data_counters_update(&tag_entry->counters, active_set, direction,
+			     proto, bytes);
+	if (!tag_entry->parent_counters)
+		return;
+	data_counters_update(tag_entry->parent_counters, active_set,
+			     direction, proto, bytes);
+}
+
+/*
+ * Allocates a zeroed tag_stat for @tag and inserts it into
+ * @iface_entry->tag_stat_tree.
+ * Returns the new entry, or NULL when the allocation fails.
+ * iface_entry->tag_stat_list_lock should be held.
+ */
+static struct tag_stat *create_if_tag_stat(struct iface_stat *iface_entry,
+					   tag_t tag)
+{
+	struct tag_stat *fresh;
+
+	IF_DEBUG("qtaguid: iface_stat: %s(): ife=%p tag=0x%llx"
+		 " (uid=%u)\n", __func__,
+		 iface_entry, tag, get_uid_from_tag(tag));
+
+	fresh = kzalloc(sizeof(*fresh), GFP_ATOMIC);
+	if (!fresh) {
+		pr_err("qtaguid: iface_stat: tag stat alloc failed\n");
+		return NULL;
+	}
+
+	fresh->tn.tag = tag;
+	tag_stat_tree_insert(fresh, &iface_entry->tag_stat_tree);
+	return fresh;
+}
+
+/*
+ * Accounts one packet of @bytes bytes against the per-tag statistics
+ * of interface @ifname.
+ * The tag comes from the sock_tag of @sk when the socket is tagged,
+ * otherwise it is built from @uid with a zero acct_tag.
+ * Missing tag_stat entries ({0, uid_tag} and {acct_tag, uid_tag}) are
+ * created on demand; a child entry is hooked up to the counters of
+ * its {0, uid_tag} parent so both get updated.
+ * Does nothing when @ifname is not tracked or an allocation fails.
+ */
+static void if_tag_stat_update(const char *ifname, uid_t uid,
+			       const struct sock *sk, enum ifs_tx_rx direction,
+			       int proto, int bytes)
+{
+	struct tag_stat *tag_stat_entry;
+	tag_t tag, acct_tag;
+	tag_t uid_tag;
+	struct data_counters *uid_tag_counters;
+	struct sock_tag *sock_tag_entry;
+	struct iface_stat *iface_entry;
+	struct tag_stat *new_tag_stat = NULL;
+
+	MT_DEBUG("qtaguid: if_tag_stat_update(ifname=%s "
+		 "uid=%u sk=%p dir=%d proto=%d bytes=%d)\n",
+		 ifname, uid, sk, direction, proto, bytes);
+
+	spin_lock_bh(&iface_stat_list_lock);
+	iface_entry = get_iface_entry(ifname);
+	if (!iface_entry) {
+		pr_err_ratelimited("qtaguid: iface_stat: stat_update() "
+				   "%s not found\n", ifname);
+		spin_unlock_bh(&iface_stat_list_lock);
+		return;
+	}
+	/* It is ok to process data when an iface_entry is inactive */
+
+	MT_DEBUG("qtaguid: iface_stat: stat_update() dev=%s entry=%p\n",
+		 ifname, iface_entry);
+
+	/*
+	 * Look for a tagged sock.
+	 * It will have an acct_uid.
+	 * The tag is copied out while sock_tag_list_lock is still held:
+	 * once the lock is dropped the sock_tag may be erased and freed,
+	 * so the entry must not be dereferenced afterwards.
+	 */
+	spin_lock_bh(&sock_tag_list_lock);
+	sock_tag_entry = sk ? get_sock_stat_nl(sk) : NULL;
+	if (sock_tag_entry) {
+		tag = sock_tag_entry->tag;
+		acct_tag = get_atag_from_tag(tag);
+		uid_tag = get_utag_from_tag(tag);
+	} else {
+		acct_tag = make_atag_from_value(0);
+		tag = combine_atag_with_uid(acct_tag, uid);
+		uid_tag = make_tag_from_uid(uid);
+	}
+	spin_unlock_bh(&sock_tag_list_lock);
+
+	MT_DEBUG("qtaguid: iface_stat: stat_update(): "
+		 " looking for tag=0x%llx (uid=%u) in ife=%p\n",
+		 tag, get_uid_from_tag(tag), iface_entry);
+	/* Loop over tag list under this interface for {acct_tag,uid_tag} */
+	spin_lock_bh(&iface_entry->tag_stat_list_lock);
+
+	tag_stat_entry = tag_stat_tree_search(&iface_entry->tag_stat_tree,
+					      tag);
+	if (tag_stat_entry) {
+		/*
+		 * Updating the {acct_tag, uid_tag} entry handles both stats:
+		 * {0, uid_tag} will also get updated.
+		 */
+		tag_stat_update(tag_stat_entry, direction, proto, bytes);
+		goto unlock;
+	}
+
+	/* Loop over tag list under this interface for {0,uid_tag} */
+	tag_stat_entry = tag_stat_tree_search(&iface_entry->tag_stat_tree,
+					      uid_tag);
+	if (!tag_stat_entry) {
+		/* Here: the base uid_tag did not exist */
+		/*
+		 * No parent counters. So
+		 *  - No {0, uid_tag} stats and no {acc_tag, uid_tag} stats.
+		 */
+		new_tag_stat = create_if_tag_stat(iface_entry, uid_tag);
+		if (!new_tag_stat)
+			goto unlock;
+		uid_tag_counters = &new_tag_stat->counters;
+	} else {
+		uid_tag_counters = &tag_stat_entry->counters;
+	}
+
+	if (acct_tag) {
+		/* Create the child {acct_tag, uid_tag} and hook up parent. */
+		new_tag_stat = create_if_tag_stat(iface_entry, tag);
+		if (!new_tag_stat)
+			goto unlock;
+		new_tag_stat->parent_counters = uid_tag_counters;
+	} else {
+		/*
+		 * For new_tag_stat to be still NULL here would require:
+		 *  {0, uid_tag} exists
+		 *  and {acct_tag, uid_tag} doesn't exist
+		 *  AND acct_tag == 0.
+		 * Impossible. This reassures us that new_tag_stat
+		 * below will always be assigned.
+		 */
+		BUG_ON(!new_tag_stat);
+	}
+	tag_stat_update(new_tag_stat, direction, proto, bytes);
+unlock:
+	spin_unlock_bh(&iface_entry->tag_stat_list_lock);
+	spin_unlock_bh(&iface_stat_list_lock);
+}
+
+/*
+ * netdevice notifier: starts tracking on NETDEV_UP, stashes stats on
+ * NETDEV_DOWN and finalizes them on NETDEV_UNREGISTER. Each handled
+ * event bumps qtu_events.iface_events. Ignored while module_passive.
+ */
+static int iface_netdev_event_handler(struct notifier_block *nb,
+				      unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+	if (unlikely(module_passive))
+		return NOTIFY_DONE;
+
+	IF_DEBUG("qtaguid: iface_stat: netdev_event(): "
+		 "ev=0x%lx/%s netdev=%p->name=%s\n",
+		 event, netdev_evt_str(event), dev, dev ? dev->name : "");
+
+	if (event == NETDEV_UP)
+		iface_stat_create(dev, NULL);
+	else if (event == NETDEV_DOWN || event == NETDEV_UNREGISTER)
+		iface_stat_update(dev, event == NETDEV_DOWN);
+	else
+		return NOTIFY_DONE;
+
+	atomic64_inc(&qtu_events.iface_events);
+	return NOTIFY_DONE;
+}
+
+/*
+ * inet6addr notifier: starts tracking the address's device on
+ * NETDEV_UP, stashes its stats on NETDEV_DOWN and finalizes them on
+ * NETDEV_UNREGISTER. Each handled event bumps qtu_events.iface_events.
+ * Ignored while module_passive.
+ */
+static int iface_inet6addr_event_handler(struct notifier_block *nb,
+					 unsigned long event, void *ptr)
+{
+	struct inet6_ifaddr *ifa = ptr;
+	struct net_device *dev;
+
+	if (unlikely(module_passive))
+		return NOTIFY_DONE;
+
+	IF_DEBUG("qtaguid: iface_stat: inet6addr_event(): "
+		 "ev=0x%lx/%s ifa=%p\n",
+		 event, netdev_evt_str(event), ifa);
+
+	if (event != NETDEV_UP && event != NETDEV_DOWN &&
+	    event != NETDEV_UNREGISTER)
+		return NOTIFY_DONE;
+
+	BUG_ON(!ifa || !ifa->idev);
+	dev = (struct net_device *)ifa->idev->dev;
+	if (event == NETDEV_UP)
+		iface_stat_create_ipv6(dev, ifa);
+	else
+		iface_stat_update(dev, event == NETDEV_DOWN);
+	atomic64_inc(&qtu_events.iface_events);
+	return NOTIFY_DONE;
+}
+
+/*
+ * inetaddr notifier: starts tracking the address's device on
+ * NETDEV_UP, stashes its stats on NETDEV_DOWN and finalizes them on
+ * NETDEV_UNREGISTER. Each handled event bumps qtu_events.iface_events.
+ * Ignored while module_passive.
+ */
+static int iface_inetaddr_event_handler(struct notifier_block *nb,
+					unsigned long event, void *ptr)
+{
+	struct in_ifaddr *ifa = ptr;
+	struct net_device *dev;
+
+	if (unlikely(module_passive))
+		return NOTIFY_DONE;
+
+	IF_DEBUG("qtaguid: iface_stat: inetaddr_event(): "
+		 "ev=0x%lx/%s ifa=%p\n",
+		 event, netdev_evt_str(event), ifa);
+
+	if (event != NETDEV_UP && event != NETDEV_DOWN &&
+	    event != NETDEV_UNREGISTER)
+		return NOTIFY_DONE;
+
+	BUG_ON(!ifa || !ifa->ifa_dev);
+	dev = ifa->ifa_dev->dev;
+	if (event == NETDEV_UP)
+		iface_stat_create(dev, ifa);
+	else
+		iface_stat_update(dev, event == NETDEV_DOWN);
+	atomic64_inc(&qtu_events.iface_events);
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block iface_netdev_notifier_blk = { /* registered via register_netdevice_notifier() in iface_stat_init() */
+ .notifier_call = iface_netdev_event_handler,
+};
+
+static struct notifier_block iface_inetaddr_notifier_blk = { /* registered via register_inetaddr_notifier() in iface_stat_init() */
+ .notifier_call = iface_inetaddr_event_handler,
+};
+
+static struct notifier_block iface_inet6addr_notifier_blk = { /* registered via register_inet6addr_notifier() in iface_stat_init() */
+ .notifier_call = iface_inet6addr_event_handler,
+};
+
+static const struct seq_operations iface_stat_fmt_proc_seq_ops = { /* seq_file iterator over iface_stat_list */
+ .start = iface_stat_fmt_proc_start, /* takes iface_stat_list_lock */
+ .next = iface_stat_fmt_proc_next,
+ .stop = iface_stat_fmt_proc_stop, /* drops iface_stat_list_lock */
+ .show = iface_stat_fmt_proc_show, /* prints one iface_stat per call */
+};
+
+/*
+ * open() for the iface_stat listing files: sets up the seq_file with
+ * a private proc_iface_stat_fmt_info and stores the format number
+ * carried in the proc entry's data pointer.
+ * Returns 0, or -ENOMEM when the seq_file cannot be set up.
+ */
+static int proc_iface_stat_fmt_open(struct inode *inode, struct file *file)
+{
+	struct proc_iface_stat_fmt_info *info =
+		__seq_open_private(file, &iface_stat_fmt_proc_seq_ops,
+				   sizeof(struct proc_iface_stat_fmt_info));
+
+	if (info == NULL)
+		return -ENOMEM;
+
+	/* The proc data pointer holds the format number, not an address. */
+	info->fmt = (uintptr_t)PDE_DATA(inode);
+	return 0;
+}
+
+static const struct file_operations proc_iface_stat_fmt_fops = { /* shared by the fmt1 and fmt2 listing files created in iface_stat_init() */
+ .open = proc_iface_stat_fmt_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private, /* pairs with __seq_open_private() used in open */
+};
+
+/*
+ * Sets up interface statistics tracking: creates the per-interface
+ * proc directory and the two listing files (fmt1 and fmt2) under
+ * @parent_procdir, then registers the netdevice, inetaddr and
+ * inet6addr notifiers.
+ * Returns 0 on success. On failure everything set up so far is undone
+ * and a negative errno is returned: -ENOMEM when a proc entry cannot
+ * be created, otherwise the error of the failing notifier
+ * registration.
+ */
+static int __init iface_stat_init(struct proc_dir_entry *parent_procdir)
+{
+	int err;
+
+	iface_stat_procdir = proc_mkdir(iface_stat_procdirname, parent_procdir);
+	if (!iface_stat_procdir) {
+		pr_err("qtaguid: iface_stat: init failed to create proc entry\n");
+		err = -ENOMEM;
+		goto err;
+	}
+
+	iface_stat_all_procfile = proc_create_data(iface_stat_all_procfilename,
+						   proc_iface_perms,
+						   parent_procdir,
+						   &proc_iface_stat_fmt_fops,
+						   (void *)1 /* fmt1 */);
+	if (!iface_stat_all_procfile) {
+		pr_err("qtaguid: iface_stat: init "
+		       " failed to create stat_old proc entry\n");
+		err = -ENOMEM;
+		goto err_zap_entry;
+	}
+
+	iface_stat_fmt_procfile = proc_create_data(iface_stat_fmt_procfilename,
+						   proc_iface_perms,
+						   parent_procdir,
+						   &proc_iface_stat_fmt_fops,
+						   (void *)2 /* fmt2 */);
+	if (!iface_stat_fmt_procfile) {
+		pr_err("qtaguid: iface_stat: init "
+		       " failed to create stat_all proc entry\n");
+		err = -ENOMEM;
+		goto err_zap_all_stats_entry;
+	}
+
+
+	err = register_netdevice_notifier(&iface_netdev_notifier_blk);
+	if (err) {
+		pr_err("qtaguid: iface_stat: init "
+		       "failed to register dev event handler\n");
+		goto err_zap_all_stats_entries;
+	}
+	err = register_inetaddr_notifier(&iface_inetaddr_notifier_blk);
+	if (err) {
+		pr_err("qtaguid: iface_stat: init "
+		       "failed to register ipv4 dev event handler\n");
+		goto err_unreg_nd;
+	}
+
+	err = register_inet6addr_notifier(&iface_inet6addr_notifier_blk);
+	if (err) {
+		pr_err("qtaguid: iface_stat: init "
+		       "failed to register ipv6 dev event handler\n");
+		goto err_unreg_ip4_addr;
+	}
+	return 0;
+
+	/* Unwind in reverse order of setup. */
+err_unreg_ip4_addr:
+	unregister_inetaddr_notifier(&iface_inetaddr_notifier_blk);
+err_unreg_nd:
+	unregister_netdevice_notifier(&iface_netdev_notifier_blk);
+err_zap_all_stats_entries:
+	remove_proc_entry(iface_stat_fmt_procfilename, parent_procdir);
+err_zap_all_stats_entry:
+	remove_proc_entry(iface_stat_all_procfilename, parent_procdir);
+err_zap_entry:
+	remove_proc_entry(iface_stat_procdirname, parent_procdir);
+err:
+	return err;
+}
+
+/*
+ * Look up the socket owning @skb through the xt_socket slow-path helpers.
+ *
+ * Only attempted on hooks listed in XT_SOCKET_SUPPORTED_HOOKS and for the
+ * IPv4/IPv6 families; returns NULL otherwise or when no socket is found.
+ */
+static struct sock *qtaguid_find_sk(const struct sk_buff *skb,
+				    struct xt_action_param *par)
+{
+	unsigned int this_hook = (1 << par->hooknum);
+	struct sock *found;
+
+	MT_DEBUG("qtaguid: find_sk(skb=%p) hooknum=%d family=%d\n", skb,
+		 par->hooknum, par->family);
+
+	/*
+	 * Let's not abuse the xt_socket_get*_sk(), or else it will
+	 * return garbage SKs.
+	 */
+	if ((this_hook & XT_SOCKET_SUPPORTED_HOOKS) == 0)
+		return NULL;
+
+	if (par->family == NFPROTO_IPV6)
+		found = xt_socket_lookup_slow_v6(dev_net(skb->dev), skb,
+						 par->in);
+	else if (par->family == NFPROTO_IPV4)
+		found = xt_socket_lookup_slow_v4(dev_net(skb->dev), skb,
+						 par->in);
+	else
+		return NULL;
+
+	if (found != NULL) {
+		MT_DEBUG("qtaguid: %p->sk_proto=%u "
+			 "->sk_state=%d\n", found, found->sk_protocol,
+			 found->sk_state);
+	}
+	return found;
+}
+
+/*
+ * Charge @skb's length to @uid on the device the packet is travelling on.
+ *
+ * The device is skb->dev when set, otherwise par->in or par->out. The
+ * direction is RX when par->in is set and TX otherwise. The socket passed to
+ * if_tag_stat_update() is skb->sk when present, else @alternate_sk.
+ * Nothing is accounted (only a message is logged) when no device is known.
+ */
+static void account_for_uid(const struct sk_buff *skb,
+			    const struct sock *alternate_sk, uid_t uid,
+			    struct xt_action_param *par)
+{
+	const struct net_device *el_dev;
+
+	if (!skb->dev) {
+		MT_DEBUG("qtaguid[%d]: no skb->dev\n", par->hooknum);
+		el_dev = par->in ? : par->out;
+	} else {
+		const struct net_device *other_dev;
+		el_dev = skb->dev;
+		other_dev = par->in ? : par->out;
+		if (el_dev != other_dev) {
+			/*
+			 * other_dev is NULL when neither par->in nor par->out
+			 * is set; do not dereference it for the debug print.
+			 */
+			MT_DEBUG("qtaguid[%d]: skb->dev=%p %s vs "
+				 "par->(in/out)=%p %s\n",
+				 par->hooknum, el_dev, el_dev->name, other_dev,
+				 other_dev ? other_dev->name : "(null)");
+		}
+	}
+
+	if (unlikely(!el_dev)) {
+		pr_info("qtaguid[%d]: no par->in/out?!!\n", par->hooknum);
+	} else {
+		int proto = ipx_proto(skb, par);
+		MT_DEBUG("qtaguid[%d]: dev name=%s type=%d fam=%d proto=%d\n",
+			 par->hooknum, el_dev->name, el_dev->type,
+			 par->family, proto);
+
+		if_tag_stat_update(el_dev->name, uid,
+				   skb->sk ? skb->sk : alternate_sk,
+				   par->in ? IFS_RX : IFS_TX,
+				   proto, skb->len);
+	}
+}
+
+/*
+ * xt match callback for qtaguid.
+ *
+ * In PRE/POST_ROUTING only per-interface stats are updated. On other hooks
+ * the owning socket is resolved (from the skb, or via qtaguid_find_sk()),
+ * traffic is accounted to the socket's uid unless a uid match was requested,
+ * and the optional uid/gid range tests from the match info are applied.
+ *
+ * Returns true when the packet matches the rule, false otherwise. When the
+ * module is passive no accounting is done at all.
+ */
+static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_qtaguid_match_info *info = par->matchinfo;
+	const struct file *filp;
+	bool got_sock = false;
+	struct sock *sk;
+	kuid_t sock_uid;
+	bool res;
+	bool set_sk_callback_lock = false;
+
+	if (unlikely(module_passive))
+		return (info->match ^ info->invert) == 0;
+
+	MT_DEBUG("qtaguid[%d]: entered skb=%p par->in=%p/out=%p fam=%d\n",
+		 par->hooknum, skb, par->in, par->out, par->family);
+
+	atomic64_inc(&qtu_events.match_calls);
+	if (skb == NULL) {
+		res = (info->match ^ info->invert) == 0;
+		goto ret_res;
+	}
+
+	switch (par->hooknum) {
+	case NF_INET_PRE_ROUTING:
+	case NF_INET_POST_ROUTING:
+		atomic64_inc(&qtu_events.match_calls_prepost);
+		iface_stat_update_from_skb(skb, par);
+		/*
+		 * We are done in pre/post. The skb will get processed
+		 * further later.
+		 *
+		 * NOTE(review): unlike the other early exits this one does
+		 * not compare against 0; left as is, confirm it is intended.
+		 */
+		res = (info->match ^ info->invert);
+		goto ret_res;
+		break;
+	/* default: Fall through and do UID related work */
+	}
+
+	sk = skb_to_full_sk(skb);
+	/*
+	 * When in TCP_TIME_WAIT the sk is not a "struct sock" but
+	 * "struct inet_timewait_sock" which is missing fields.
+	 * So we ignore it.
+	 */
+	if (sk && sk->sk_state == TCP_TIME_WAIT)
+		sk = NULL;
+	if (sk == NULL) {
+		/*
+		 * A missing sk->sk_socket happens when packets are in-flight
+		 * and the matching socket is already closed and gone.
+		 */
+		sk = qtaguid_find_sk(skb, par);
+		/*
+		 * TCP_NEW_SYN_RECV are not "struct sock" but "struct request_sock"
+		 * where we can get a pointer to a full socket to retrieve uid/gid.
+		 * When in TCP_TIME_WAIT, sk is a struct inet_timewait_sock
+		 * which is missing fields and does not contain any reference
+		 * to a full socket, so just ignore the socket.
+		 */
+		if (sk && sk->sk_state == TCP_NEW_SYN_RECV) {
+			struct sock *req_sk = sk;
+
+			/*
+			 * Resolve the full socket while the reference from
+			 * qtaguid_find_sk() is still held; req_sk must not be
+			 * read once that reference has been dropped.
+			 */
+			sk = sk_to_full_sk(req_sk);
+			sock_gen_put(req_sk);
+		} else if (sk && (!sk_fullsock(sk) || sk->sk_state == TCP_TIME_WAIT)) {
+			sock_gen_put(sk);
+			sk = NULL;
+		} else {
+			/*
+			 * If we got the socket from the find_sk(), we will need to put
+			 * it back, as nf_tproxy_get_sock_v4() got it.
+			 */
+			got_sock = sk;
+		}
+		if (sk)
+			atomic64_inc(&qtu_events.match_found_sk_in_ct);
+		else
+			atomic64_inc(&qtu_events.match_found_no_sk_in_ct);
+	} else {
+		atomic64_inc(&qtu_events.match_found_sk);
+	}
+	MT_DEBUG("qtaguid[%d]: sk=%p got_sock=%d fam=%d proto=%d\n",
+		 par->hooknum, sk, got_sock, par->family, ipx_proto(skb, par));
+
+
+	if (sk == NULL) {
+		/*
+		 * Here, the qtaguid_find_sk() using connection tracking
+		 * couldn't find the owner, so for now we just count them
+		 * against the system.
+		 */
+		/*
+		 * TODO: unhack how to force just accounting.
+		 * For now we only do iface stats when the uid-owner is not
+		 * requested.
+		 */
+		if (!(info->match & XT_QTAGUID_UID))
+			account_for_uid(skb, sk, 0, par);
+		MT_DEBUG("qtaguid[%d]: leaving (sk=NULL)\n", par->hooknum);
+		res = (info->match ^ info->invert) == 0;
+		atomic64_inc(&qtu_events.match_no_sk);
+		goto put_sock_ret_res;
+	} else if (info->match & info->invert & XT_QTAGUID_SOCKET) {
+		res = false;
+		goto put_sock_ret_res;
+	}
+	sock_uid = sk->sk_uid;
+	/*
+	 * TODO: unhack how to force just accounting.
+	 * For now we only do iface stats when the uid-owner is not requested
+	 */
+	if (!(info->match & XT_QTAGUID_UID))
+		account_for_uid(skb, sk, from_kuid(&init_user_ns, sock_uid), par);
+
+	/*
+	 * The following two tests fail the match when:
+	 *    id not in range AND no inverted condition requested
+	 * or id     in range AND    inverted condition requested
+	 * Thus (!a && b) || (a && !b) == a ^ b
+	 */
+	if (info->match & XT_QTAGUID_UID) {
+		kuid_t uid_min = make_kuid(&init_user_ns, info->uid_min);
+		kuid_t uid_max = make_kuid(&init_user_ns, info->uid_max);
+
+		if ((uid_gte(sk->sk_uid, uid_min) &&
+		     uid_lte(sk->sk_uid, uid_max)) ^
+		    !(info->invert & XT_QTAGUID_UID)) {
+			MT_DEBUG("qtaguid[%d]: leaving uid not matching\n",
+				 par->hooknum);
+			res = false;
+			goto put_sock_ret_res;
+		}
+	}
+	if (info->match & XT_QTAGUID_GID) {
+		kgid_t gid_min = make_kgid(&init_user_ns, info->gid_min);
+		kgid_t gid_max = make_kgid(&init_user_ns, info->gid_max);
+		/* sk->sk_socket->file is read under sk_callback_lock. */
+		set_sk_callback_lock = true;
+		read_lock_bh(&sk->sk_callback_lock);
+		MT_DEBUG("qtaguid[%d]: sk=%p->sk_socket=%p->file=%p\n",
+			 par->hooknum, sk, sk->sk_socket,
+			 sk->sk_socket ? sk->sk_socket->file : (void *)-1LL);
+		filp = sk->sk_socket ? sk->sk_socket->file : NULL;
+		if (!filp) {
+			res = ((info->match ^ info->invert) & XT_QTAGUID_GID) == 0;
+			atomic64_inc(&qtu_events.match_no_sk_gid);
+			goto put_sock_ret_res;
+		}
+		MT_DEBUG("qtaguid[%d]: filp...uid=%u\n",
+			 par->hooknum, filp ? from_kuid(&init_user_ns, filp->f_cred->fsuid) : -1);
+		if ((gid_gte(filp->f_cred->fsgid, gid_min) &&
+		     gid_lte(filp->f_cred->fsgid, gid_max)) ^
+		    !(info->invert & XT_QTAGUID_GID)) {
+			MT_DEBUG("qtaguid[%d]: leaving gid not matching\n",
+				 par->hooknum);
+			res = false;
+			goto put_sock_ret_res;
+		}
+	}
+	MT_DEBUG("qtaguid[%d]: leaving matched\n", par->hooknum);
+	res = true;
+
+put_sock_ret_res:
+	/*
+	 * Release sk_callback_lock before dropping the reference:
+	 * sock_gen_put() may give up the last reference obtained through
+	 * qtaguid_find_sk(), after which sk must not be touched.
+	 */
+	if (set_sk_callback_lock)
+		read_unlock_bh(&sk->sk_callback_lock);
+	if (got_sock)
+		sock_gen_put(sk);
+ret_res:
+	MT_DEBUG("qtaguid[%d]: left %d\n", par->hooknum, res);
+	return res;
+}
+
+#ifdef DDEBUG
+/*
+ * Dump the module's full state (socket tags, uid tag data, proc qtu data and
+ * the iface stat list) through pr_debug(), preceded by a header built from
+ * the printf-style @fmt and its arguments. Does nothing unless DDEBUG_MASK
+ * is set in qtaguid_debug_mask.
+ *
+ * This function is not in xt_qtaguid_print.c because of locks visibility.
+ * The lock of sock_tag_list must be acquired before calling this function;
+ * uid_tag_data_tree_lock and iface_stat_list_lock are taken here.
+ */
+static void prdebug_full_state_locked(int indent_level, const char *fmt, ...)
+{
+	va_list args;
+	char *fmt_buff;
+	char *buff = NULL;
+
+	if (likely(!(qtaguid_debug_mask & DDEBUG_MASK)))
+		return;
+
+	/*
+	 * An atomic allocation failure must not bring the kernel down just
+	 * for a debug dump: fall back to a header without the caller's text.
+	 */
+	fmt_buff = kasprintf(GFP_ATOMIC,
+			     "qtaguid: %s(): %s {\n", __func__, fmt);
+	if (fmt_buff) {
+		va_start(args, fmt);
+		buff = kvasprintf(GFP_ATOMIC, fmt_buff, args);
+		va_end(args);
+	}
+	if (buff)
+		pr_debug("%s", buff);
+	else
+		pr_debug("qtaguid: %s(): {\n", __func__);
+	kfree(fmt_buff);
+	kfree(buff);
+
+	prdebug_sock_tag_tree(indent_level, &sock_tag_tree);
+
+	spin_lock_bh(&uid_tag_data_tree_lock);
+	prdebug_uid_tag_data_tree(indent_level, &uid_tag_data_tree);
+	prdebug_proc_qtu_data_tree(indent_level, &proc_qtu_data_tree);
+	spin_unlock_bh(&uid_tag_data_tree_lock);
+
+	spin_lock_bh(&iface_stat_list_lock);
+	prdebug_iface_stat_list(indent_level, &iface_stat_list);
+	spin_unlock_bh(&iface_stat_list_lock);
+
+	pr_debug("qtaguid: %s(): }\n", __func__);
+}
+#else
+static void prdebug_full_state_locked(int indent_level, const char *fmt, ...) {}
+#endif
+
+/*
+ * Iteration state kept in the ctrl proc file's seq_file private data so that
+ * qtaguid_ctrl_proc_start() can resume where the previous pass stopped.
+ */
+struct proc_ctrl_print_info {
+ struct sock *sk; /* socket found by reading to sk_pos */
+ loff_t sk_pos; /* seq position at which sk was recorded */
+};
+
+/*
+ * seq_file ->next for the ctrl proc file.
+ *
+ * Advances *pos and steps to the following sock_tag in the rb-tree. Once the
+ * tree is exhausted SEQ_START_TOKEN is returned as a final pseudo-entry; a
+ * NULL or SEQ_START_TOKEN @v ends the iteration. The socket and position of
+ * the returned entry are remembered in the private state.
+ */
+static void *qtaguid_ctrl_proc_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct proc_ctrl_print_info *cursor = m->private;
+	struct sock_tag *current_tag = v;
+	struct sock_tag *following;
+	struct rb_node *following_node;
+
+	++*pos;
+
+	if (v == NULL || v == SEQ_START_TOKEN)
+		return NULL;
+
+	following_node = rb_next(&current_tag->sock_node);
+	if (following_node != NULL) {
+		following = rb_entry(following_node, struct sock_tag,
+				     sock_node);
+		cursor->sk = following->sk;
+	} else {
+		following = SEQ_START_TOKEN;
+		cursor->sk = NULL;
+	}
+	cursor->sk_pos = *pos;
+	return following;
+}
+
+/*
+ * seq_file ->start for the ctrl proc file.
+ *
+ * Takes sock_tag_list_lock, which qtaguid_ctrl_proc_stop() releases. At
+ * position 0 iteration begins at the first sock_tag, or at SEQ_START_TOKEN
+ * when the tree is empty. At any other position the entry for the remembered
+ * socket is looked up again, falling back to SEQ_START_TOKEN, and one extra
+ * step is taken if seq_read skipped a ->next call.
+ */
+static void *qtaguid_ctrl_proc_start(struct seq_file *m, loff_t *pos)
+{
+	struct proc_ctrl_print_info *cursor = m->private;
+	struct sock_tag *entry;
+	struct rb_node *first;
+
+	spin_lock_bh(&sock_tag_list_lock);
+
+	if (unlikely(module_passive))
+		return NULL;
+
+	if (*pos != 0) {
+		entry = NULL;
+		if (cursor->sk)
+			entry = get_sock_stat_nl(cursor->sk);
+		if (!entry)
+			entry = SEQ_START_TOKEN;
+
+		if (*pos == cursor->sk_pos)
+			return entry;
+
+		/* seq_read skipped a next call */
+		*pos = cursor->sk_pos;
+		return qtaguid_ctrl_proc_next(m, entry, pos);
+	}
+
+	cursor->sk_pos = 0;
+	first = rb_first(&sock_tag_tree);
+	if (first == NULL) {
+		cursor->sk = NULL;
+		return SEQ_START_TOKEN;
+	}
+
+	entry = rb_entry(first, struct sock_tag, sock_node);
+	cursor->sk = entry->sk;
+	return entry;
+}
+
+/*
+ * seq_file ->stop for the ctrl proc file: releases sock_tag_list_lock, which
+ * qtaguid_ctrl_proc_start() takes on every path, including when it returns NULL.
+ */
+static void qtaguid_ctrl_proc_stop(struct seq_file *m, void *v)
+{
+ spin_unlock_bh(&sock_tag_list_lock);
+}
+
+/*
+ * Procfs reader to get all active socket tags using style "1)" as described in
+ * fs/proc/generic.c
+ *
+ * A regular entry prints one line describing the tagged socket; the
+ * SEQ_START_TOKEN pseudo-entry prints the event counters and, in debug
+ * builds, dumps the full module state. Always returns 0.
+ */
+static int qtaguid_ctrl_proc_show(struct seq_file *m, void *v)
+{
+	struct sock_tag *sock_tag_entry = v;
+	uid_t uid;
+	int sk_ref_count;
+
+	CT_DEBUG("qtaguid: proc ctrl pid=%u tgid=%u uid=%u\n",
+		 current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()));
+
+	if (sock_tag_entry == SEQ_START_TOKEN) {
+		seq_printf(m, "events: sockets_tagged=%llu "
+			   "sockets_untagged=%llu "
+			   "counter_set_changes=%llu "
+			   "delete_cmds=%llu "
+			   "iface_events=%llu "
+			   "match_calls=%llu "
+			   "match_calls_prepost=%llu "
+			   "match_found_sk=%llu "
+			   "match_found_sk_in_ct=%llu "
+			   "match_found_no_sk_in_ct=%llu "
+			   "match_no_sk=%llu "
+			   "match_no_sk_gid=%llu\n",
+			   (u64)atomic64_read(&qtu_events.sockets_tagged),
+			   (u64)atomic64_read(&qtu_events.sockets_untagged),
+			   (u64)atomic64_read(&qtu_events.counter_set_changes),
+			   (u64)atomic64_read(&qtu_events.delete_cmds),
+			   (u64)atomic64_read(&qtu_events.iface_events),
+			   (u64)atomic64_read(&qtu_events.match_calls),
+			   (u64)atomic64_read(&qtu_events.match_calls_prepost),
+			   (u64)atomic64_read(&qtu_events.match_found_sk),
+			   (u64)atomic64_read(&qtu_events.match_found_sk_in_ct),
+			   (u64)atomic64_read(&qtu_events.match_found_no_sk_in_ct),
+			   (u64)atomic64_read(&qtu_events.match_no_sk),
+			   (u64)atomic64_read(&qtu_events.match_no_sk_gid));
+
+		/* Count the following as part of the last item_index. No need
+		 * to lock the sock_tag_list here since it is already locked when
+		 * starting the seq_file operation
+		 */
+		prdebug_full_state_locked(0, "proc ctrl");
+		return 0;
+	}
+
+	uid = get_uid_from_tag(sock_tag_entry->tag);
+	CT_DEBUG("qtaguid: proc_read(): sk=%p tag=0x%llx (uid=%u) "
+		 "pid=%u\n",
+		 sock_tag_entry->sk,
+		 sock_tag_entry->tag,
+		 uid,
+		 sock_tag_entry->pid
+		);
+	/* The value printed as "f_count" is the socket's sk_refcnt. */
+	sk_ref_count = atomic_read(&sock_tag_entry->sk->sk_refcnt);
+	seq_printf(m, "sock=%pK tag=0x%llx (uid=%u) pid=%u "
+		   "f_count=%d\n",
+		   sock_tag_entry->sk,
+		   sock_tag_entry->tag, uid,
+		   sock_tag_entry->pid, sk_ref_count);
+
+	return 0;
+}
+
+/*
+ * Delete socket tags, and stat tags associated with a given
+ * accounting tag and uid.
+ *
+ * @input is "d <acct_tag> [<uid>]". Without a uid the caller's fsuid is
+ * used; naming a uid requires can_impersonate_uid(). An acct_tag of 0
+ * removes every entry belonging to the uid.
+ *
+ * Returns 0 on success, -EINVAL for missing arguments or an invalid tag,
+ * -EPERM when the caller may not act on the requested uid.
+ */
+static int ctrl_cmd_delete(const char *input)
+{
+	/*
+	 * Everything sscanf() may leave unassigned is initialised: the values
+	 * are read (make_kuid(), debug print) before argc is checked.
+	 */
+	char cmd = '\0';
+	unsigned int uid_int = 0;
+	kuid_t uid;
+	uid_t entry_uid;
+	tag_t acct_tag = 0;
+	tag_t tag;
+	int res, argc;
+	struct iface_stat *iface_entry;
+	struct rb_node *node;
+	struct sock_tag *st_entry;
+	struct rb_root st_to_free_tree = RB_ROOT;
+	struct tag_stat *ts_entry;
+	struct tag_counter_set *tcs_entry;
+	struct tag_ref *tr_entry;
+	struct uid_tag_data *utd_entry;
+
+	argc = sscanf(input, "%c %llu %u", &cmd, &acct_tag, &uid_int);
+	uid = make_kuid(&init_user_ns, uid_int);
+	CT_DEBUG("qtaguid: ctrl_delete(%s): argc=%d cmd=%c "
+		 "user_tag=0x%llx uid=%u\n", input, argc, cmd,
+		 acct_tag, uid_int);
+	if (argc < 2) {
+		res = -EINVAL;
+		goto err;
+	}
+	if (!valid_atag(acct_tag)) {
+		pr_info("qtaguid: ctrl_delete(%s): invalid tag\n", input);
+		res = -EINVAL;
+		goto err;
+	}
+	if (argc < 3) {
+		uid = current_fsuid();
+		uid_int = from_kuid(&init_user_ns, uid);
+	} else if (!can_impersonate_uid(uid)) {
+		pr_info("qtaguid: ctrl_delete(%s): "
+			"insufficient priv from pid=%u tgid=%u uid=%u\n",
+			input, current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()));
+		res = -EPERM;
+		goto err;
+	}
+
+	tag = combine_atag_with_uid(acct_tag, uid_int);
+	CT_DEBUG("qtaguid: ctrl_delete(%s): "
+		 "looking for tag=0x%llx (uid=%u)\n",
+		 input, tag, uid_int);
+
+	/* Delete socket tags */
+	spin_lock_bh(&sock_tag_list_lock);
+	spin_lock_bh(&uid_tag_data_tree_lock);
+	node = rb_first(&sock_tag_tree);
+	while (node) {
+		st_entry = rb_entry(node, struct sock_tag, sock_node);
+		entry_uid = get_uid_from_tag(st_entry->tag);
+		/* Advance first: st_entry may be erased from the tree below. */
+		node = rb_next(node);
+		if (entry_uid != uid_int)
+			continue;
+
+		CT_DEBUG("qtaguid: ctrl_delete(%s): st tag=0x%llx (uid=%u)\n",
+			 input, st_entry->tag, entry_uid);
+
+		if (!acct_tag || st_entry->tag == tag) {
+			rb_erase(&st_entry->sock_node, &sock_tag_tree);
+			/* Can't sockfd_put() within spinlock, do it later. */
+			sock_tag_tree_insert(st_entry, &st_to_free_tree);
+			tr_entry = lookup_tag_ref(st_entry->tag, NULL);
+			/* Same sanity check as the retag path in ctrl_cmd_tag(). */
+			BUG_ON(IS_ERR_OR_NULL(tr_entry));
+			BUG_ON(tr_entry->num_sock_tags <= 0);
+			tr_entry->num_sock_tags--;
+			/*
+			 * TODO: remove if, and start failing.
+			 * This is a hack to work around the fact that in some
+			 * places we have "if (IS_ERR_OR_NULL(pqd_entry))"
+			 * and are trying to work around apps
+			 * that didn't open the /dev/xt_qtaguid.
+			 */
+			if (st_entry->list.next && st_entry->list.prev)
+				list_del(&st_entry->list);
+		}
+	}
+	spin_unlock_bh(&uid_tag_data_tree_lock);
+	spin_unlock_bh(&sock_tag_list_lock);
+
+	sock_tag_tree_erase(&st_to_free_tree);
+
+	/* Delete tag counter-sets */
+	spin_lock_bh(&tag_counter_set_list_lock);
+	/* Counter sets are only on the uid tag, not full tag */
+	tcs_entry = tag_counter_set_tree_search(&tag_counter_set_tree, tag);
+	if (tcs_entry) {
+		CT_DEBUG("qtaguid: ctrl_delete(%s): "
+			 "erase tcs: tag=0x%llx (uid=%u) set=%d\n",
+			 input,
+			 tcs_entry->tn.tag,
+			 get_uid_from_tag(tcs_entry->tn.tag),
+			 tcs_entry->active_set);
+		rb_erase(&tcs_entry->tn.node, &tag_counter_set_tree);
+		kfree(tcs_entry);
+	}
+	spin_unlock_bh(&tag_counter_set_list_lock);
+
+	/*
+	 * If acct_tag is 0, then all entries belonging to uid are
+	 * erased.
+	 */
+	spin_lock_bh(&iface_stat_list_lock);
+	list_for_each_entry(iface_entry, &iface_stat_list, list) {
+		spin_lock_bh(&iface_entry->tag_stat_list_lock);
+		node = rb_first(&iface_entry->tag_stat_tree);
+		while (node) {
+			ts_entry = rb_entry(node, struct tag_stat, tn.node);
+			entry_uid = get_uid_from_tag(ts_entry->tn.tag);
+			node = rb_next(node);
+
+			CT_DEBUG("qtaguid: ctrl_delete(%s): "
+				 "ts tag=0x%llx (uid=%u)\n",
+				 input, ts_entry->tn.tag, entry_uid);
+
+			if (entry_uid != uid_int)
+				continue;
+			if (!acct_tag || ts_entry->tn.tag == tag) {
+				CT_DEBUG("qtaguid: ctrl_delete(%s): "
+					 "erase ts: %s 0x%llx %u\n",
+					 input, iface_entry->ifname,
+					 get_atag_from_tag(ts_entry->tn.tag),
+					 entry_uid);
+				rb_erase(&ts_entry->tn.node,
+					 &iface_entry->tag_stat_tree);
+				kfree(ts_entry);
+			}
+		}
+		spin_unlock_bh(&iface_entry->tag_stat_list_lock);
+	}
+	spin_unlock_bh(&iface_stat_list_lock);
+
+	/* Cleanup the uid_tag_data */
+	spin_lock_bh(&uid_tag_data_tree_lock);
+	node = rb_first(&uid_tag_data_tree);
+	while (node) {
+		utd_entry = rb_entry(node, struct uid_tag_data, node);
+		entry_uid = utd_entry->uid;
+		node = rb_next(node);
+
+		CT_DEBUG("qtaguid: ctrl_delete(%s): "
+			 "utd uid=%u\n",
+			 input, entry_uid);
+
+		if (entry_uid != uid_int)
+			continue;
+		/*
+		 * Go over the tag_refs, and those that don't have
+		 * sock_tags using them are freed.
+		 */
+		put_tag_ref_tree(tag, utd_entry);
+		put_utd_entry(utd_entry);
+	}
+	spin_unlock_bh(&uid_tag_data_tree_lock);
+
+	atomic64_inc(&qtu_events.delete_cmds);
+	res = 0;
+
+err:
+	return res;
+}
+
+/*
+ * Handle the 's' control command: "s <counter_set> <uid>" selects the active
+ * counter set for a uid, creating the uid's tag_counter_set on first use.
+ *
+ * Returns 0 on success, -EINVAL when an argument is missing or counter_set
+ * is out of range, -EPERM when the caller fails can_manipulate_uids(), and
+ * -ENOMEM when a new counter set cannot be allocated.
+ */
+static int ctrl_cmd_counter_set(const char *input)
+{
+	/*
+	 * Initialised because the debug print below reads them before argc
+	 * is validated, i.e. possibly before sscanf() assigned them.
+	 */
+	char cmd = '\0';
+	uid_t uid = 0;
+	tag_t tag;
+	int res, argc;
+	struct tag_counter_set *tcs;
+	int counter_set = 0;
+
+	argc = sscanf(input, "%c %d %u", &cmd, &counter_set, &uid);
+	CT_DEBUG("qtaguid: ctrl_counterset(%s): argc=%d cmd=%c "
+		 "set=%d uid=%u\n", input, argc, cmd,
+		 counter_set, uid);
+	if (argc != 3) {
+		res = -EINVAL;
+		goto err;
+	}
+	if (counter_set < 0 || counter_set >= IFS_MAX_COUNTER_SETS) {
+		pr_info("qtaguid: ctrl_counterset(%s): invalid counter_set range\n",
+			input);
+		res = -EINVAL;
+		goto err;
+	}
+	if (!can_manipulate_uids()) {
+		pr_info("qtaguid: ctrl_counterset(%s): "
+			"insufficient priv from pid=%u tgid=%u uid=%u\n",
+			input, current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()));
+		res = -EPERM;
+		goto err;
+	}
+
+	tag = make_tag_from_uid(uid);
+	spin_lock_bh(&tag_counter_set_list_lock);
+	tcs = tag_counter_set_tree_search(&tag_counter_set_tree, tag);
+	if (!tcs) {
+		/* Allocating under a spinlock, hence GFP_ATOMIC. */
+		tcs = kzalloc(sizeof(*tcs), GFP_ATOMIC);
+		if (!tcs) {
+			spin_unlock_bh(&tag_counter_set_list_lock);
+			pr_err("qtaguid: ctrl_counterset(%s): "
+			       "failed to alloc counter set\n",
+			       input);
+			res = -ENOMEM;
+			goto err;
+		}
+		tcs->tn.tag = tag;
+		tag_counter_set_tree_insert(tcs, &tag_counter_set_tree);
+		CT_DEBUG("qtaguid: ctrl_counterset(%s): added tcs tag=0x%llx "
+			 "(uid=%u) set=%d\n",
+			 input, tag, get_uid_from_tag(tag), counter_set);
+	}
+	tcs->active_set = counter_set;
+	spin_unlock_bh(&tag_counter_set_list_lock);
+	atomic64_inc(&qtu_events.counter_set_changes);
+	res = 0;
+
+err:
+	return res;
+}
+
+/*
+ * Handle the 't' control command: "t <sock_fd> [<acct_tag> [<uid>]]".
+ *
+ * Tags (or retags) the socket behind sock_fd with the accounting tag combined
+ * with the uid. A missing acct_tag defaults to make_atag_from_value(0), a
+ * missing uid to the caller's fsuid; naming a uid requires
+ * can_impersonate_uid().
+ *
+ * Returns 0 on success, -EINVAL for missing arguments or an invalid tag,
+ * -EPERM, -ENOMEM, or the error reported by sockfd_lookup() / get_tag_ref().
+ */
+static int ctrl_cmd_tag(const char *input)
+{
+ char cmd;
+ int sock_fd = 0;
+ kuid_t uid;
+ unsigned int uid_int = 0;
+ tag_t acct_tag = make_atag_from_value(0);
+ tag_t full_tag;
+ struct socket *el_socket;
+ int res, argc;
+ struct sock_tag *sock_tag_entry;
+ struct tag_ref *tag_ref_entry;
+ struct uid_tag_data *uid_tag_data_entry;
+ struct proc_qtu_data *pqd_entry;
+
+ /* Unassigned args will get defaulted later. */
+ argc = sscanf(input, "%c %d %llu %u", &cmd, &sock_fd, &acct_tag, &uid_int);
+ uid = make_kuid(&init_user_ns, uid_int);
+ CT_DEBUG("qtaguid: ctrl_tag(%s): argc=%d cmd=%c sock_fd=%d "
+ "acct_tag=0x%llx uid=%u\n", input, argc, cmd, sock_fd,
+ acct_tag, uid_int);
+ if (argc < 2) {
+ res = -EINVAL;
+ goto err;
+ }
+ el_socket = sockfd_lookup(sock_fd, &res); /* This locks the file */
+ if (!el_socket) {
+ pr_info("qtaguid: ctrl_tag(%s): failed to lookup"
+ " sock_fd=%d err=%d pid=%u tgid=%u uid=%u\n",
+ input, sock_fd, res, current->pid, current->tgid,
+ from_kuid(&init_user_ns, current_fsuid()));
+ goto err;
+ }
+ CT_DEBUG("qtaguid: ctrl_tag(%s): socket->...->sk_refcnt=%d ->sk=%p\n",
+ input, atomic_read(&el_socket->sk->sk_refcnt),
+ el_socket->sk);
+ if (argc < 3) {
+ acct_tag = make_atag_from_value(0);
+ } else if (!valid_atag(acct_tag)) {
+ pr_info("qtaguid: ctrl_tag(%s): invalid tag\n", input);
+ res = -EINVAL;
+ goto err_put;
+ }
+ CT_DEBUG("qtaguid: ctrl_tag(%s): "
+ "pid=%u tgid=%u uid=%u euid=%u fsuid=%u "
+ "ctrl.gid=%u in_group()=%d in_egroup()=%d\n",
+ input, current->pid, current->tgid,
+ from_kuid(&init_user_ns, current_uid()),
+ from_kuid(&init_user_ns, current_euid()),
+ from_kuid(&init_user_ns, current_fsuid()),
+ from_kgid(&init_user_ns, xt_qtaguid_ctrl_file->gid),
+ in_group_p(xt_qtaguid_ctrl_file->gid),
+ in_egroup_p(xt_qtaguid_ctrl_file->gid));
+ if (argc < 4) {
+ uid = current_fsuid();
+ uid_int = from_kuid(&init_user_ns, uid);
+ } else if (!can_impersonate_uid(uid)) {
+ pr_info("qtaguid: ctrl_tag(%s): "
+ "insufficient priv from pid=%u tgid=%u uid=%u\n",
+ input, current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()));
+ res = -EPERM;
+ goto err_put;
+ }
+ full_tag = combine_atag_with_uid(acct_tag, uid_int);
+
+ /* Lock order: sock_tag_list_lock first, then uid_tag_data_tree_lock. */
+ spin_lock_bh(&sock_tag_list_lock);
+ spin_lock_bh(&uid_tag_data_tree_lock);
+ sock_tag_entry = get_sock_stat_nl(el_socket->sk);
+ tag_ref_entry = get_tag_ref(full_tag, &uid_tag_data_entry);
+ if (IS_ERR(tag_ref_entry)) {
+ res = PTR_ERR(tag_ref_entry);
+ spin_unlock_bh(&uid_tag_data_tree_lock);
+ spin_unlock_bh(&sock_tag_list_lock);
+ goto err_put;
+ }
+ /* Count the new tag first; the retag branch then uncounts the old one. */
+ tag_ref_entry->num_sock_tags++;
+ if (sock_tag_entry) {
+ struct tag_ref *prev_tag_ref_entry;
+
+ CT_DEBUG("qtaguid: ctrl_tag(%s): retag for sk=%p "
+ "st@%p ...->sk_refcnt=%d\n",
+ input, el_socket->sk, sock_tag_entry,
+ atomic_read(&el_socket->sk->sk_refcnt));
+ prev_tag_ref_entry = lookup_tag_ref(sock_tag_entry->tag,
+ &uid_tag_data_entry);
+ BUG_ON(IS_ERR_OR_NULL(prev_tag_ref_entry));
+ BUG_ON(prev_tag_ref_entry->num_sock_tags <= 0);
+ prev_tag_ref_entry->num_sock_tags--;
+ sock_tag_entry->tag = full_tag;
+ } else {
+ CT_DEBUG("qtaguid: ctrl_tag(%s): newtag for sk=%p\n",
+ input, el_socket->sk);
+ sock_tag_entry = kzalloc(sizeof(*sock_tag_entry),
+ GFP_ATOMIC);
+ if (!sock_tag_entry) {
+ pr_err("qtaguid: ctrl_tag(%s): "
+ "socket tag alloc failed\n",
+ input);
+ /* Undo the num_sock_tags increment made above. */
+ BUG_ON(tag_ref_entry->num_sock_tags <= 0);
+ tag_ref_entry->num_sock_tags--;
+ free_tag_ref_from_utd_entry(tag_ref_entry,
+ uid_tag_data_entry);
+ spin_unlock_bh(&uid_tag_data_tree_lock);
+ spin_unlock_bh(&sock_tag_list_lock);
+ res = -ENOMEM;
+ goto err_put;
+ }
+ /*
+ * Hold the sk refcount here to make sure the sk pointer cannot
+ * be freed and reused
+ */
+ sock_hold(el_socket->sk);
+ sock_tag_entry->sk = el_socket->sk;
+ sock_tag_entry->pid = current->tgid;
+ sock_tag_entry->tag = combine_atag_with_uid(acct_tag, uid_int);
+ pqd_entry = proc_qtu_data_tree_search(
+ &proc_qtu_data_tree, current->tgid);
+ /*
+ * TODO: remove if, and start failing.
+ * At first, we want to catch user-space code that is not
+ * opening the /dev/xt_qtaguid.
+ */
+ if (IS_ERR_OR_NULL(pqd_entry))
+ pr_warn_once(
+ "qtaguid: %s(): "
+ "User space forgot to open /dev/xt_qtaguid? "
+ "pid=%u tgid=%u uid=%u\n", __func__,
+ current->pid, current->tgid,
+ from_kuid(&init_user_ns, current_fsuid()));
+ else
+ list_add(&sock_tag_entry->list,
+ &pqd_entry->sock_tag_list);
+
+ sock_tag_tree_insert(sock_tag_entry, &sock_tag_tree);
+ atomic64_inc(&qtu_events.sockets_tagged);
+ }
+ spin_unlock_bh(&uid_tag_data_tree_lock);
+ spin_unlock_bh(&sock_tag_list_lock);
+ /* We keep the ref to the sk until it is untagged */
+ CT_DEBUG("qtaguid: ctrl_tag(%s): done st@%p ...->sk_refcnt=%d\n",
+ input, sock_tag_entry,
+ atomic_read(&el_socket->sk->sk_refcnt));
+ /* Only the file reference from sockfd_lookup() is dropped here. */
+ sockfd_put(el_socket);
+ return 0;
+
+err_put:
+ CT_DEBUG("qtaguid: ctrl_tag(%s): done. ...->sk_refcnt=%d\n",
+ input, atomic_read(&el_socket->sk->sk_refcnt) - 1);
+ /* Release the sock_fd that was grabbed by sockfd_lookup(). */
+ sockfd_put(el_socket);
+ return res;
+
+err:
+ CT_DEBUG("qtaguid: ctrl_tag(%s): done.\n", input);
+ return res;
+}
+
+/*
+ * Handle the 'u' control command: "u <sock_fd>" removes the tag from the
+ * socket behind sock_fd.
+ *
+ * Returns the result of qtaguid_untag(), -EINVAL when sock_fd is missing,
+ * or the error reported by sockfd_lookup().
+ */
+static int ctrl_cmd_untag(const char *input)
+{
+	struct socket *el_socket;
+	int sock_fd = 0;
+	int parsed;
+	int status;
+	char cmd;
+
+	parsed = sscanf(input, "%c %d", &cmd, &sock_fd);
+	CT_DEBUG("qtaguid: ctrl_untag(%s): argc=%d cmd=%c sock_fd=%d\n",
+		 input, parsed, cmd, sock_fd);
+	if (parsed < 2)
+		return -EINVAL;
+
+	el_socket = sockfd_lookup(sock_fd, &status); /* This locks the file */
+	if (el_socket == NULL) {
+		pr_info("qtaguid: ctrl_untag(%s): failed to lookup"
+			" sock_fd=%d err=%d pid=%u tgid=%u uid=%u\n",
+			input, sock_fd, status, current->pid, current->tgid,
+			from_kuid(&init_user_ns, current_fsuid()));
+		return status;
+	}
+
+	CT_DEBUG("qtaguid: ctrl_untag(%s): socket->...->f_count=%ld ->sk=%p\n",
+		 input, atomic_long_read(&el_socket->file->f_count),
+		 el_socket->sk);
+
+	status = qtaguid_untag(el_socket, false);
+	sockfd_put(el_socket);
+	return status;
+}
+
+/*
+ * Remove the tag attached to @el_socket's sk.
+ *
+ * @kernel selects which process' qtu data is searched for the per-process
+ * tag list: the pid recorded at tag time when true, the calling task's tgid
+ * otherwise.
+ *
+ * Returns 0 on success or -EINVAL when the socket carries no tag.
+ */
+int qtaguid_untag(struct socket *el_socket, bool kernel)
+{
+	pid_t pid;
+	struct sock_tag *sock_tag_entry;
+	struct tag_ref *tag_ref_entry;
+	struct uid_tag_data *utd_entry;
+	struct proc_qtu_data *pqd_entry;
+
+	spin_lock_bh(&sock_tag_list_lock);
+	sock_tag_entry = get_sock_stat_nl(el_socket->sk);
+	if (!sock_tag_entry) {
+		spin_unlock_bh(&sock_tag_list_lock);
+		return -EINVAL;
+	}
+	/*
+	 * The socket already belongs to the current process
+	 * so it can do whatever it wants to it.
+	 */
+	rb_erase(&sock_tag_entry->sock_node, &sock_tag_tree);
+
+	/*
+	 * Look up and update the tag_ref under uid_tag_data_tree_lock, as
+	 * ctrl_cmd_tag() and ctrl_cmd_delete() do; otherwise a concurrent
+	 * delete could free the tag_ref between the lookup and its use.
+	 */
+	spin_lock_bh(&uid_tag_data_tree_lock);
+	tag_ref_entry = lookup_tag_ref(sock_tag_entry->tag, &utd_entry);
+	BUG_ON(!tag_ref_entry);
+	BUG_ON(tag_ref_entry->num_sock_tags <= 0);
+	if (kernel)
+		pid = sock_tag_entry->pid;
+	else
+		pid = current->tgid;
+	pqd_entry = proc_qtu_data_tree_search(
+		&proc_qtu_data_tree, pid);
+	/*
+	 * TODO: remove if, and start failing.
+	 * At first, we want to catch user-space code that is not
+	 * opening the /dev/xt_qtaguid.
+	 */
+	if (IS_ERR_OR_NULL(pqd_entry) || !sock_tag_entry->list.next) {
+		pr_warn_once("qtaguid: %s(): "
+			     "User space forgot to open /dev/xt_qtaguid? "
+			     "pid=%u tgid=%u sk_pid=%u, uid=%u\n", __func__,
+			     current->pid, current->tgid, sock_tag_entry->pid,
+			     from_kuid(&init_user_ns, current_fsuid()));
+	} else {
+		list_del(&sock_tag_entry->list);
+	}
+	/*
+	 * We don't free tag_ref from the utd_entry here,
+	 * only during a cmd_delete().
+	 */
+	tag_ref_entry->num_sock_tags--;
+	spin_unlock_bh(&uid_tag_data_tree_lock);
+	spin_unlock_bh(&sock_tag_list_lock);
+	/*
+	 * Drop the sk reference that was taken with sock_hold() at tag time.
+	 */
+	sock_put(sock_tag_entry->sk);
+	CT_DEBUG("qtaguid: done. st@%p ...->sk_refcnt=%d\n",
+		 sock_tag_entry,
+		 atomic_read(&el_socket->sk->sk_refcnt));
+
+	kfree(sock_tag_entry);
+	atomic64_inc(&qtu_events.sockets_untagged);
+
+	return 0;
+}
+
+/*
+ * Dispatch one control command line on its first character:
+ * 'd' delete, 's' counter set, 't' tag, 'u' untag.
+ *
+ * Returns @count when the command handler reports success, the handler's
+ * negative error otherwise, and -EINVAL for an unknown command.
+ */
+static ssize_t qtaguid_ctrl_parse(const char *input, size_t count)
+{
+	ssize_t res;
+
+	CT_DEBUG("qtaguid: ctrl(%s): pid=%u tgid=%u uid=%u\n",
+		 input, current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()));
+
+	/* Each handler re-parses the whole line for its own parameters. */
+	switch (input[0]) {
+	case 'd':
+		res = ctrl_cmd_delete(input);
+		break;
+	case 's':
+		res = ctrl_cmd_counter_set(input);
+		break;
+	case 't':
+		res = ctrl_cmd_tag(input);
+		break;
+	case 'u':
+		res = ctrl_cmd_untag(input);
+		break;
+	default:
+		res = -EINVAL;
+		break;
+	}
+
+	if (res == 0)
+		res = count;
+
+	CT_DEBUG("qtaguid: ctrl(%s): res=%zd\n", input, res);
+	return res;
+}
+
+/* Size of the on-stack command buffer, including the terminating NUL. */
+#define MAX_QTAGUID_CTRL_INPUT_LEN 255
+/*
+ * write() handler of the ctrl proc file.
+ *
+ * Copies the user's command into a NUL-terminated stack buffer and hands it
+ * to qtaguid_ctrl_parse(). When the module is passive the write is accepted
+ * and ignored. Returns -EINVAL when the command does not fit the buffer and
+ * -EFAULT when the user buffer cannot be read.
+ */
+static ssize_t qtaguid_ctrl_proc_write(struct file *file, const char __user *buffer,
+				       size_t count, loff_t *offp)
+{
+	char cmd_line[MAX_QTAGUID_CTRL_INPUT_LEN];
+
+	if (unlikely(module_passive))
+		return count;
+
+	/* Leave room for the terminator written below. */
+	if (count > MAX_QTAGUID_CTRL_INPUT_LEN - 1)
+		return -EINVAL;
+
+	if (copy_from_user(cmd_line, buffer, count) != 0)
+		return -EFAULT;
+
+	cmd_line[count] = '\0';
+	return qtaguid_ctrl_parse(cmd_line, count);
+}
+
+/*
+ * Iteration state kept in the stats proc file's seq_file private data,
+ * maintained by qtaguid_stats_proc_start()/_next() and pp_stats_line().
+ */
+struct proc_print_info {
+ struct iface_stat *iface_entry; /* interface currently being walked, or NULL */
+ int item_index; /* running index printed in the first column */
+ tag_t tag; /* tag found by reading to tag_pos */
+ off_t tag_pos; /* seq position at which tag was recorded */
+ int tag_item_index; /* item_index value when tag was recorded */
+};
+
+/* Emit the column header line of the stats proc file. */
+static void pp_stats_header(struct seq_file *m)
+{
+	static const char header_line[] =
+		"idx iface acct_tag_hex uid_tag_int cnt_set "
+		"rx_bytes rx_packets "
+		"tx_bytes tx_packets "
+		"rx_tcp_bytes rx_tcp_packets "
+		"rx_udp_bytes rx_udp_packets "
+		"rx_other_bytes rx_other_packets "
+		"tx_tcp_bytes tx_tcp_packets "
+		"tx_udp_bytes tx_udp_packets "
+		"tx_other_bytes tx_other_packets\n";
+
+	seq_puts(m, header_line);
+}
+
+/*
+ * Print one stats line for counter set @cnt_set of @ts_entry.
+ *
+ * Returns 0 without printing when the reader may not see this uid's stats,
+ * 1 when a line was emitted, and -ENOSPC when the seq_file buffer
+ * overflowed. The item index advances for every line that is attempted.
+ */
+static int pp_stats_line(struct seq_file *m, struct tag_stat *ts_entry,
+			 int cnt_set)
+{
+	struct proc_print_info *ppi = m->private;
+	struct data_counters *cnts = &ts_entry->counters;
+	tag_t tag = ts_entry->tn.tag;
+	uid_t stat_uid = get_uid_from_tag(tag);
+	bool may_read;
+
+	/* Detailed tags are not available to everybody */
+	may_read = can_read_other_uid_stats(make_kuid(&init_user_ns,stat_uid));
+	if (!may_read) {
+		CT_DEBUG("qtaguid: stats line: "
+			 "%s 0x%llx %u: insufficient priv "
+			 "from pid=%u tgid=%u uid=%u stats.gid=%u\n",
+			 ppi->iface_entry->ifname,
+			 get_atag_from_tag(tag), stat_uid,
+			 current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()),
+			 from_kgid(&init_user_ns,xt_qtaguid_stats_file->gid));
+		return 0;
+	}
+
+	ppi->item_index += 1;
+	seq_printf(m, "%d %s 0x%llx %u %u "
+		   "%llu %llu "
+		   "%llu %llu "
+		   "%llu %llu "
+		   "%llu %llu "
+		   "%llu %llu "
+		   "%llu %llu "
+		   "%llu %llu "
+		   "%llu %llu\n",
+		   ppi->item_index,
+		   ppi->iface_entry->ifname,
+		   get_atag_from_tag(tag),
+		   stat_uid,
+		   cnt_set,
+		   dc_sum_bytes(cnts, cnt_set, IFS_RX),
+		   dc_sum_packets(cnts, cnt_set, IFS_RX),
+		   dc_sum_bytes(cnts, cnt_set, IFS_TX),
+		   dc_sum_packets(cnts, cnt_set, IFS_TX),
+		   cnts->bpc[cnt_set][IFS_RX][IFS_TCP].bytes,
+		   cnts->bpc[cnt_set][IFS_RX][IFS_TCP].packets,
+		   cnts->bpc[cnt_set][IFS_RX][IFS_UDP].bytes,
+		   cnts->bpc[cnt_set][IFS_RX][IFS_UDP].packets,
+		   cnts->bpc[cnt_set][IFS_RX][IFS_PROTO_OTHER].bytes,
+		   cnts->bpc[cnt_set][IFS_RX][IFS_PROTO_OTHER].packets,
+		   cnts->bpc[cnt_set][IFS_TX][IFS_TCP].bytes,
+		   cnts->bpc[cnt_set][IFS_TX][IFS_TCP].packets,
+		   cnts->bpc[cnt_set][IFS_TX][IFS_UDP].bytes,
+		   cnts->bpc[cnt_set][IFS_TX][IFS_UDP].packets,
+		   cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].bytes,
+		   cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].packets);
+
+	if (seq_has_overflowed(m))
+		return -ENOSPC;
+	return 1;
+}
+
+/*
+ * Print the stats lines of every counter set of @ts_entry.
+ * Returns false as soon as pp_stats_line() reports an error, true otherwise.
+ */
+static bool pp_sets(struct seq_file *m, struct tag_stat *ts_entry)
+{
+	int set_idx = 0;
+
+	while (set_idx < IFS_MAX_COUNTER_SETS) {
+		if (pp_stats_line(m, ts_entry, set_idx) < 0)
+			return false;
+		set_idx++;
+	}
+	return true;
+}
+
+/*
+ * Tell whether @ptr is still a member of iface_stat_list.
+ * Returns non-zero when it is, 0 when @ptr is NULL or no longer listed.
+ */
+static int qtaguid_stats_proc_iface_stat_ptr_valid(struct iface_stat *ptr)
+{
+	struct iface_stat *candidate;
+	int found = false;
+
+	if (ptr == NULL)
+		return false;
+
+	list_for_each_entry(candidate, &iface_stat_list, list) {
+		if (candidate == ptr) {
+			found = true;
+			break;
+		}
+	}
+	return found;
+}
+
+/*
+ * Move the iterator to the next interface in iface_stat_list.
+ *
+ * Releases the current interface's tag_stat_list_lock and takes the lock of
+ * the following one. At the end of the list ppi->iface_entry becomes NULL
+ * and no tag_stat_list_lock is held.
+ */
+static void qtaguid_stats_proc_next_iface_entry(struct proc_print_info *ppi)
+{
+ spin_unlock_bh(&ppi->iface_entry->tag_stat_list_lock);
+ /* The loop body runs at most once: it returns on the first next entry. */
+ list_for_each_entry_continue(ppi->iface_entry, &iface_stat_list, list) {
+ spin_lock_bh(&ppi->iface_entry->tag_stat_list_lock);
+ return;
+ }
+ ppi->iface_entry = NULL;
+}
+
+/*
+ * seq_file ->next(): advance to the tag_stat that follows @v.
+ *
+ * @v is either SEQ_START_TOKEN (start with the first tag_stat of the
+ * current interface) or the tag_stat returned previously. When the tree
+ * of the current interface is exhausted, the following interfaces are
+ * tried in turn. *pos is always incremented for a non-NULL @v.
+ *
+ * The position (tag, *pos, item_index) of the returned entry is saved in
+ * the proc_print_info so that a later ->start() can resume from it.
+ * Returns NULL when there is nothing left or the module is passive.
+ */
+static void *qtaguid_stats_proc_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct proc_print_info *ppi = m->private;
+	struct tag_stat *found;
+	struct rb_node *cursor;
+
+	if (!v) {
+		pr_err("qtaguid: %s(): unexpected v: NULL\n", __func__);
+		return NULL;
+	}
+
+	(*pos)++;
+
+	if (!ppi->iface_entry || unlikely(module_passive))
+		return NULL;
+
+	cursor = (v == SEQ_START_TOKEN)
+		? rb_first(&ppi->iface_entry->tag_stat_tree)
+		: rb_next(&((struct tag_stat *)v)->tn.node);
+
+	/* Skip over interfaces whose tag_stat tree has nothing (more). */
+	for (; !cursor;
+	     cursor = rb_first(&ppi->iface_entry->tag_stat_tree)) {
+		qtaguid_stats_proc_next_iface_entry(ppi);
+		if (!ppi->iface_entry)
+			return NULL;
+	}
+
+	found = rb_entry(cursor, struct tag_stat, tn.node);
+	ppi->tag = found->tn.tag;
+	ppi->tag_pos = *pos;
+	ppi->tag_item_index = ppi->item_index;
+	return found;
+}
+
+/*
+ * seq_file ->start(): begin or resume a walk over all tag_stat entries.
+ *
+ * iface_stat_list_lock is taken here and is still held on every return
+ * path; whenever ppi->iface_entry is non-NULL on return, that entry's
+ * tag_stat_list_lock is held as well. qtaguid_stats_proc_stop() releases
+ * both.
+ *
+ * Returns SEQ_START_TOKEN for *pos == 0, the tag_stat to resume from
+ * otherwise, or NULL when the saved position can no longer be found.
+ */
+static void *qtaguid_stats_proc_start(struct seq_file *m, loff_t *pos)
+{
+ struct proc_print_info *ppi = m->private;
+ struct tag_stat *ts_entry = NULL;
+
+ spin_lock_bh(&iface_stat_list_lock);
+
+ /* Fresh read: reset the saved position and start at the first iface. */
+ if (*pos == 0) {
+ ppi->item_index = 1;
+ ppi->tag_pos = 0;
+ if (list_empty(&iface_stat_list)) {
+ ppi->iface_entry = NULL;
+ } else {
+ ppi->iface_entry = list_first_entry(&iface_stat_list,
+ struct iface_stat,
+ list);
+ spin_lock_bh(&ppi->iface_entry->tag_stat_list_lock);
+ }
+ return SEQ_START_TOKEN;
+ }
+ /*
+ * Resume: the saved iface_entry pointer is only trusted if it is still
+ * linked in iface_stat_list.
+ */
+ if (!qtaguid_stats_proc_iface_stat_ptr_valid(ppi->iface_entry)) {
+ if (ppi->iface_entry) {
+ pr_err("qtaguid: %s(): iface_entry %p not found\n",
+ __func__, ppi->iface_entry);
+ ppi->iface_entry = NULL;
+ }
+ return NULL;
+ }
+
+ spin_lock_bh(&ppi->iface_entry->tag_stat_list_lock);
+
+ if (!ppi->tag_pos) {
+ /* seq_read skipped first next call */
+ ts_entry = SEQ_START_TOKEN;
+ } else {
+ /* Find the tag_stat that was current when the walk stopped. */
+ ts_entry = tag_stat_tree_search(
+ &ppi->iface_entry->tag_stat_tree, ppi->tag);
+ if (!ts_entry) {
+ pr_info("qtaguid: %s(): tag_stat.tag 0x%llx not found. Abort.\n",
+ __func__, ppi->tag);
+ return NULL;
+ }
+ }
+
+ if (*pos == ppi->tag_pos) { /* normal resume */
+ ppi->item_index = ppi->tag_item_index;
+ } else {
+ /* seq_read skipped a next call */
+ *pos = ppi->tag_pos;
+ ts_entry = qtaguid_stats_proc_next(m, ts_entry, pos);
+ }
+
+ return ts_entry;
+}
+
+/*
+ * seq_file ->stop(): drop the tag_stat_list_lock of the interface the
+ * walk is currently on (if any), then iface_stat_list_lock.
+ */
+static void qtaguid_stats_proc_stop(struct seq_file *m, void *v)
+{
+	struct proc_print_info *ppi = m->private;
+	struct iface_stat *held = ppi->iface_entry;
+
+	if (held != NULL)
+		spin_unlock_bh(&held->tag_stat_list_lock);
+	spin_unlock_bh(&iface_stat_list_lock);
+}
+
+/*
+ * Procfs reader to get all tag stats using style "1)" as described in
+ * fs/proc/generic.c
+ * Groups all protocols tx/rx bytes.
+ *
+ * seq_file ->show(): prints the header for SEQ_START_TOKEN, and the
+ * lines of all counter sets for any other (tag_stat) item.
+ * Always returns 0.
+ */
+static int qtaguid_stats_proc_show(struct seq_file *m, void *v)
+{
+	if (v != SEQ_START_TOKEN)
+		pp_sets(m, (struct tag_stat *)v);
+	else
+		pp_stats_header(m);
+
+	return 0;
+}
+
+/*------------------------------------------*/
+/*
+ * open() handler of the qtaguid misc device.
+ *
+ * Registers the opening process (keyed by current->tgid) in
+ * proc_qtu_data_tree and links it to the uid_tag_data of its fsuid,
+ * creating that uid_tag_data if needed. The new proc_qtu_data is stored
+ * in file->private_data.
+ *
+ * Returns 0 on success (and, without doing anything, in passive mode),
+ * -EBUSY if the process already has the device open, -ENOMEM on
+ * allocation failure, or the error reported by get_uid_data().
+ */
+static int qtudev_open(struct inode *inode, struct file *file)
+{
+	struct uid_tag_data *utd_entry;
+	struct proc_qtu_data *pqd_entry;
+	struct proc_qtu_data *new_pqd_entry;
+	int res;
+	bool utd_entry_found;
+
+	if (unlikely(qtu_proc_handling_passive))
+		return 0;
+
+	DR_DEBUG("qtaguid: qtudev_open(): pid=%u tgid=%u uid=%u\n",
+		current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()));
+
+	spin_lock_bh(&uid_tag_data_tree_lock);
+
+	/* Look for existing uid data, or alloc one. */
+	utd_entry = get_uid_data(from_kuid(&init_user_ns, current_fsuid()), &utd_entry_found);
+	if (IS_ERR_OR_NULL(utd_entry)) {
+		/*
+		 * PTR_ERR(NULL) is 0: make sure a NULL result can never be
+		 * reported as a successful open without tracking data.
+		 */
+		res = utd_entry ? PTR_ERR(utd_entry) : -ENOMEM;
+		goto err_unlock;
+	}
+
+	/* Look for existing PID based proc_data */
+	pqd_entry = proc_qtu_data_tree_search(&proc_qtu_data_tree,
+		current->tgid);
+	if (pqd_entry) {
+		pr_err("qtaguid: qtudev_open(): %u/%u %u "
+			"%s already opened\n",
+			current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()),
+			QTU_DEV_NAME);
+		res = -EBUSY;
+		goto err_unlock_free_utd;
+	}
+
+	/* GFP_ATOMIC: uid_tag_data_tree_lock (a spinlock) is held. */
+	new_pqd_entry = kzalloc(sizeof(*new_pqd_entry), GFP_ATOMIC);
+	if (!new_pqd_entry) {
+		pr_err("qtaguid: qtudev_open(): %u/%u %u: "
+			"proc data alloc failed\n",
+			current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()));
+		res = -ENOMEM;
+		goto err_unlock_free_utd;
+	}
+	new_pqd_entry->pid = current->tgid;
+	INIT_LIST_HEAD(&new_pqd_entry->sock_tag_list);
+	new_pqd_entry->parent_tag_data = utd_entry;
+	utd_entry->num_pqd++;
+
+	proc_qtu_data_tree_insert(new_pqd_entry,
+		&proc_qtu_data_tree);
+
+	spin_unlock_bh(&uid_tag_data_tree_lock);
+	DR_DEBUG("qtaguid: tracking data for uid=%u in pqd=%p\n",
+		from_kuid(&init_user_ns, current_fsuid()), new_pqd_entry);
+	file->private_data = new_pqd_entry;
+	return 0;
+
+err_unlock_free_utd:
+	/* Only discard the uid data if this call is the one that created it. */
+	if (!utd_entry_found) {
+		rb_erase(&utd_entry->node, &uid_tag_data_tree);
+		kfree(utd_entry);
+	}
+err_unlock:
+	spin_unlock_bh(&uid_tag_data_tree_lock);
+	return res;
+}
+
+/*
+ * release() handler of the qtaguid misc device.
+ *
+ * Undoes qtudev_open(): every sock_tag still on the process's
+ * sock_tag_list is unlinked and its tag_ref count dropped, then the
+ * proc_qtu_data itself is removed from proc_qtu_data_tree and freed.
+ * The unlinked sock_tags are collected in a private tree and only
+ * released after the spinlocks have been dropped.
+ *
+ * Always returns 0.
+ */
+static int qtudev_release(struct inode *inode, struct file *file)
+{
+	struct proc_qtu_data *pqd_entry;
+	struct uid_tag_data *utd_entry;
+	struct sock_tag *st_entry;
+	struct rb_root st_to_free_tree = RB_ROOT;
+	struct list_head *entry, *next;
+	struct tag_ref *tr;
+
+	/*
+	 * qtudev_open() returns early in passive mode without attaching a
+	 * proc_qtu_data to the file, so private_data must not be interpreted
+	 * before this check.
+	 */
+	if (unlikely(qtu_proc_handling_passive))
+		return 0;
+
+	pqd_entry = file->private_data;
+	utd_entry = pqd_entry->parent_tag_data;
+
+	/*
+	 * Do not trust the current->pid, it might just be a kworker cleaning
+	 * up after a dead proc.
+	 */
+	DR_DEBUG("qtaguid: qtudev_release(): "
+		"pid=%u tgid=%u uid=%u "
+		"pqd_entry=%p->pid=%u utd_entry=%p->active_tags=%d\n",
+		current->pid, current->tgid, pqd_entry->parent_tag_data->uid,
+		pqd_entry, pqd_entry->pid, utd_entry,
+		utd_entry->num_active_tags);
+
+	spin_lock_bh(&sock_tag_list_lock);
+	spin_lock_bh(&uid_tag_data_tree_lock);
+
+	list_for_each_safe(entry, next, &pqd_entry->sock_tag_list) {
+		st_entry = list_entry(entry, struct sock_tag, list);
+		DR_DEBUG("qtaguid: %s(): "
+			"erase sock_tag=%p->sk=%p pid=%u tgid=%u uid=%u\n",
+			__func__,
+			st_entry, st_entry->sk,
+			current->pid, current->tgid,
+			pqd_entry->parent_tag_data->uid);
+
+		/*
+		 * The tag's uid selects the uid_tag_data to update; it is
+		 * looked up per sock_tag rather than taken from pqd_entry.
+		 */
+		utd_entry = uid_tag_data_tree_search(
+			&uid_tag_data_tree,
+			get_uid_from_tag(st_entry->tag));
+		BUG_ON(IS_ERR_OR_NULL(utd_entry));
+		DR_DEBUG("qtaguid: %s(): "
+			"looking for tag=0x%llx in utd_entry=%p\n", __func__,
+			st_entry->tag, utd_entry);
+		tr = tag_ref_tree_search(&utd_entry->tag_ref_tree,
+			st_entry->tag);
+		BUG_ON(!tr);
+		BUG_ON(tr->num_sock_tags <= 0);
+		tr->num_sock_tags--;
+		free_tag_ref_from_utd_entry(tr, utd_entry);
+
+		rb_erase(&st_entry->sock_node, &sock_tag_tree);
+		list_del(&st_entry->list);
+		/* Can't sockfd_put() within spinlock, do it later. */
+		sock_tag_tree_insert(st_entry, &st_to_free_tree);
+
+		/*
+		 * Try to free the utd_entry if no other proc_qtu_data is
+		 * using it (num_pqd is 0) and it doesn't have active tags
+		 * (num_active_tags is 0).
+		 */
+		put_utd_entry(utd_entry);
+	}
+
+	rb_erase(&pqd_entry->node, &proc_qtu_data_tree);
+	BUG_ON(pqd_entry->parent_tag_data->num_pqd < 1);
+	pqd_entry->parent_tag_data->num_pqd--;
+	put_utd_entry(pqd_entry->parent_tag_data);
+	kfree(pqd_entry);
+	file->private_data = NULL;
+
+	spin_unlock_bh(&uid_tag_data_tree_lock);
+	spin_unlock_bh(&sock_tag_list_lock);
+
+
+	/* Locks are dropped: the collected sock_tags can be released now. */
+	sock_tag_tree_erase(&st_to_free_tree);
+
+	spin_lock_bh(&sock_tag_list_lock);
+	prdebug_full_state_locked(0, "%s(): pid=%u tgid=%u", __func__,
+		current->pid, current->tgid);
+	spin_unlock_bh(&sock_tag_list_lock);
+	return 0;
+}
+
+/*------------------------------------------*/
+/* File operations of the qtaguid misc device: only open and release. */
+static const struct file_operations qtudev_fops = {
+ .owner = THIS_MODULE,
+ .open = qtudev_open,
+ .release = qtudev_release,
+};
+
+/* Misc device named QTU_DEV_NAME, registered with a dynamic minor. */
+static struct miscdevice qtu_device = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = QTU_DEV_NAME,
+ .fops = &qtudev_fops,
+ /* How sad it doesn't allow for defaults: .mode = S_IRUGO | S_IWUSR */
+};
+
+/* seq_file iterator callbacks used when reading the "ctrl" proc file. */
+static const struct seq_operations proc_qtaguid_ctrl_seqops = {
+ .start = qtaguid_ctrl_proc_start,
+ .next = qtaguid_ctrl_proc_next,
+ .stop = qtaguid_ctrl_proc_stop,
+ .show = qtaguid_ctrl_proc_show,
+};
+
+/*
+ * open() of the "ctrl" proc file: set up a seq_file that iterates with
+ * proc_qtaguid_ctrl_seqops and owns a private proc_ctrl_print_info.
+ */
+static int proc_qtaguid_ctrl_open(struct inode *inode, struct file *file)
+{
+	const int priv_size = sizeof(struct proc_ctrl_print_info);
+
+	return seq_open_private(file, &proc_qtaguid_ctrl_seqops, priv_size);
+}
+
+/*
+ * File operations of the "ctrl" proc file: reads go through seq_file,
+ * writes are handled by qtaguid_ctrl_proc_write().
+ */
+static const struct file_operations proc_qtaguid_ctrl_fops = {
+ .open = proc_qtaguid_ctrl_open,
+ .read = seq_read,
+ .write = qtaguid_ctrl_proc_write,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+/* seq_file iterator callbacks used when reading the "stats" proc file. */
+static const struct seq_operations proc_qtaguid_stats_seqops = {
+ .start = qtaguid_stats_proc_start,
+ .next = qtaguid_stats_proc_next,
+ .stop = qtaguid_stats_proc_stop,
+ .show = qtaguid_stats_proc_show,
+};
+
+/*
+ * open() of the "stats" proc file: set up a seq_file that iterates with
+ * proc_qtaguid_stats_seqops and owns a private proc_print_info.
+ */
+static int proc_qtaguid_stats_open(struct inode *inode, struct file *file)
+{
+	const int priv_size = sizeof(struct proc_print_info);
+
+	return seq_open_private(file, &proc_qtaguid_stats_seqops, priv_size);
+}
+
+/* File operations of the "stats" proc file: read-only, via seq_file. */
+static const struct file_operations proc_qtaguid_stats_fops = {
+ .open = proc_qtaguid_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+/*------------------------------------------*/
+/*
+ * Create the module's proc directory (module_procdirname, under
+ * init_net.proc_net) together with its "ctrl" and "stats" files.
+ *
+ * @res_procdir: receives the proc_dir_entry of the new directory.
+ *
+ * Returns 0 on success or -ENOMEM if any entry could not be created; on
+ * failure, everything created so far is removed again.
+ */
+static int __init qtaguid_proc_register(struct proc_dir_entry **res_procdir)
+{
+	int ret;
+
+	*res_procdir = proc_mkdir(module_procdirname, init_net.proc_net);
+	if (!*res_procdir) {
+		pr_err("qtaguid: failed to create proc/.../xt_qtaguid\n");
+		ret = -ENOMEM;
+		goto no_dir;
+	}
+
+	xt_qtaguid_ctrl_file = proc_create_data("ctrl", proc_ctrl_perms,
+		*res_procdir,
+		&proc_qtaguid_ctrl_fops,
+		NULL);
+	if (!xt_qtaguid_ctrl_file) {
+		pr_err("qtaguid: failed to create xt_qtaguid/ctrl "
+			" file\n");
+		ret = -ENOMEM;
+		goto no_ctrl_entry;
+	}
+
+	xt_qtaguid_stats_file = proc_create_data("stats", proc_stats_perms,
+		*res_procdir,
+		&proc_qtaguid_stats_fops,
+		NULL);
+	if (!xt_qtaguid_stats_file) {
+		pr_err("qtaguid: failed to create xt_qtaguid/stats "
+			"file\n");
+		ret = -ENOMEM;
+		goto no_stats_entry;
+	}
+	/*
+	 * TODO: add support counter hacking
+	 * xt_qtaguid_stats_file->write_proc = qtaguid_stats_proc_write;
+	 */
+	return 0;
+
+no_stats_entry:
+	remove_proc_entry("ctrl", *res_procdir);
+no_ctrl_entry:
+	/*
+	 * Undo proc_mkdir() with the same name and parent it was created
+	 * with; a NULL parent would look in the /proc root instead and
+	 * leave the directory behind.
+	 */
+	remove_proc_entry(module_procdirname, init_net.proc_net);
+no_dir:
+	return ret;
+}
+
+/*
+ * xtables match registration: revision 1 of "owner" for all protocol
+ * families (NFPROTO_UNSPEC), implemented by qtaguid_mt().
+ */
+static struct xt_match qtaguid_mt_reg __read_mostly = {
+ /*
+ * This module masquerades as the "owner" module so that iptables
+ * tools can deal with it.
+ */
+ .name = "owner",
+ .revision = 1,
+ .family = NFPROTO_UNSPEC,
+ .match = qtaguid_mt,
+ .matchsize = sizeof(struct xt_qtaguid_match_info),
+ .me = THIS_MODULE,
+};
+
+/*
+ * Module init: create the proc entries, set up interface stats, register
+ * the xtables match and finally the misc device, in that order.
+ *
+ * Stops at the first step that fails and returns that step's own result
+ * instead of a generic -1, so the cause is visible to the caller.
+ * NOTE(review): assumes iface_stat_init() returns 0 on success and an
+ * errno-style value otherwise, like the other steps; confirm.
+ * Steps that already succeeded are not undone, as before.
+ */
+static int __init qtaguid_mt_init(void)
+{
+	int ret;
+
+	ret = qtaguid_proc_register(&xt_qtaguid_procdir);
+	if (ret)
+		return ret;
+
+	ret = iface_stat_init(xt_qtaguid_procdir);
+	if (ret)
+		return ret;
+
+	ret = xt_register_match(&qtaguid_mt_reg);
+	if (ret)
+		return ret;
+
+	return misc_register(&qtu_device);
+}
+
+/*
+ * TODO: allow unloading of the module.
+ * For now stats are permanent.
+ * Kconfig forces 'y/n' and never an 'm'.
+ */
+
+module_init(qtaguid_mt_init);
+MODULE_AUTHOR("jpa <jpa@google.com>");
+MODULE_DESCRIPTION("Xtables: socket owner+tag matching and associated stats");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_owner");
+MODULE_ALIAS("ip6t_owner");
+MODULE_ALIAS("ipt_qtaguid");
+MODULE_ALIAS("ip6t_qtaguid");
diff --git a/net/netfilter/xt_qtaguid_internal.h b/net/netfilter/xt_qtaguid_internal.h
new file mode 100644
index 000000000000..c7052707a6a4
--- /dev/null
+++ b/net/netfilter/xt_qtaguid_internal.h
@@ -0,0 +1,350 @@
+/*
+ * Kernel iptables module to track stats for packets based on user tags.
+ *
+ * (C) 2011 Google, Inc
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __XT_QTAGUID_INTERNAL_H__
+#define __XT_QTAGUID_INTERNAL_H__
+
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock_types.h>
+#include <linux/workqueue.h>
+
+/* Iface handling */
+#define IDEBUG_MASK (1<<0)
+/* Iptable Matching. Per packet. */
+#define MDEBUG_MASK (1<<1)
+/* Red-black tree handling. Per packet. */
+#define RDEBUG_MASK (1<<2)
+/* procfs ctrl/stats handling */
+#define CDEBUG_MASK (1<<3)
+/* dev and resource tracking */
+#define DDEBUG_MASK (1<<4)
+
+/* E.g (IDEBUG_MASK | CDEBUG_MASK | DDEBUG_MASK) */
+#define DEFAULT_DEBUG_MASK 0
+
+/*
+ * (Un)Define these *DEBUG to compile out/in the pr_debug calls.
+ * All undef: text size ~ 0x3030; all def: ~ 0x4404.
+ */
+#define IDEBUG
+#define MDEBUG
+#define RDEBUG
+#define CDEBUG
+#define DDEBUG
+
+/*
+ * Emit a pr_debug() only when at least one bit of @mask is set in
+ * qtaguid_debug_mask.
+ */
+#define MSK_DEBUG(mask, ...) do { \
+ if (unlikely(qtaguid_debug_mask & (mask))) \
+ pr_debug(__VA_ARGS__); \
+ } while (0)
+/*
+ * Per-area wrappers. Each expands to MSK_DEBUG() with its area's mask
+ * when the matching *DEBUG symbol is defined, and to no_printk()
+ * otherwise.
+ */
+#ifdef IDEBUG
+#define IF_DEBUG(...) MSK_DEBUG(IDEBUG_MASK, __VA_ARGS__)
+#else
+#define IF_DEBUG(...) no_printk(__VA_ARGS__)
+#endif
+#ifdef MDEBUG
+#define MT_DEBUG(...) MSK_DEBUG(MDEBUG_MASK, __VA_ARGS__)
+#else
+#define MT_DEBUG(...) no_printk(__VA_ARGS__)
+#endif
+#ifdef RDEBUG
+#define RB_DEBUG(...) MSK_DEBUG(RDEBUG_MASK, __VA_ARGS__)
+#else
+#define RB_DEBUG(...) no_printk(__VA_ARGS__)
+#endif
+#ifdef CDEBUG
+#define CT_DEBUG(...) MSK_DEBUG(CDEBUG_MASK, __VA_ARGS__)
+#else
+#define CT_DEBUG(...) no_printk(__VA_ARGS__)
+#endif
+#ifdef DDEBUG
+#define DR_DEBUG(...) MSK_DEBUG(DDEBUG_MASK, __VA_ARGS__)
+#else
+#define DR_DEBUG(...) no_printk(__VA_ARGS__)
+#endif
+
+extern uint qtaguid_debug_mask;
+
+/*---------------------------------------------------------------------------*/
+/*
+ * Tags:
+ *
+ * They represent what the data usage counters will be tracked against.
+ * By default a tag is just based on the UID.
+ * The UID is used as the base for policing, and can not be ignored.
+ * So a tag will always at least represent a UID (uid_tag).
+ *
+ * A tag can be augmented with an "accounting tag" which is associated
+ * with a UID.
+ * User space can set the acct_tag portion of the tag which is then used
+ * with sockets: all data belonging to that socket will be counted against the
+ * tag. The policing is then based on the tag's uid_tag portion,
+ * and stats are collected for the acct_tag portion separately.
+ *
+ * There could be
+ * a: {acct_tag=1, uid_tag=10003}
+ * b: {acct_tag=2, uid_tag=10003}
+ * c: {acct_tag=3, uid_tag=10003}
+ * d: {acct_tag=0, uid_tag=10003}
+ * a, b, and c represent tags associated with specific sockets.
+ * d is for the totals for that uid, including all untagged traffic.
+ * Typically d is used with policing/quota rules.
+ *
+ * We want tag_t big enough to distinguish uid_t and acct_tag.
+ * It might become a struct if needed.
+ * Nothing should be using it as an int.
+ */
+typedef uint64_t tag_t; /* Only used via accessors */
+
+#define TAG_UID_MASK 0xFFFFFFFFULL
+#define TAG_ACCT_MASK (~0xFFFFFFFFULL)
+
+/* Three-way compare of two tags: -1 if t1 < t2, 0 if equal, 1 otherwise. */
+static inline int tag_compare(tag_t t1, tag_t t2)
+{
+	if (t1 < t2)
+		return -1;
+	if (t1 == t2)
+		return 0;
+	return 1;
+}
+
+/* Build a full tag by OR-ing @uid into @acct_tag. */
+static inline tag_t combine_atag_with_uid(tag_t acct_tag, uid_t uid)
+{
+	tag_t uid_part = uid;
+
+	return acct_tag | uid_part;
+}
+/* A tag carrying only @uid (no acct_tag portion). */
+static inline tag_t make_tag_from_uid(uid_t uid)
+{
+	return (tag_t)uid;
+}
+/* The uid stored in the TAG_UID_MASK bits of @tag, as a uid_t. */
+static inline uid_t get_uid_from_tag(tag_t tag)
+{
+	return (uid_t)(tag & TAG_UID_MASK);
+}
+/* @tag reduced to its TAG_UID_MASK bits, still as a tag_t. */
+static inline tag_t get_utag_from_tag(tag_t tag)
+{
+	tag_t utag = tag & TAG_UID_MASK;
+
+	return utag;
+}
+/* @tag reduced to its TAG_ACCT_MASK bits. */
+static inline tag_t get_atag_from_tag(tag_t tag)
+{
+	tag_t atag = tag & TAG_ACCT_MASK;
+
+	return atag;
+}
+
+/* True when @tag has none of the TAG_UID_MASK bits set. */
+static inline bool valid_atag(tag_t tag)
+{
+	return (tag & TAG_UID_MASK) == 0;
+}
+/* Place the 32-bit @value in the upper 32 bits of a tag. */
+static inline tag_t make_atag_from_value(uint32_t value)
+{
+	uint64_t wide = value;
+
+	return wide << 32;
+}
+/*---------------------------------------------------------------------------*/
+
+/*
+ * Maximum number of socket tags that a UID is allowed to have active.
+ * Multiple processes belonging to the same UID contribute towards this limit.
+ * Special UIDs that can impersonate a UID also contribute (e.g. download
+ * manager, ...)
+ */
+#define DEFAULT_MAX_SOCK_TAGS 1024
+
+/*
+ * For now we only track 2 sets of counters.
+ * The default set is 0.
+ * Userspace can activate another set for a given uid being tracked.
+ */
+#define IFS_MAX_COUNTER_SETS 2
+
+/* Traffic direction; used as the direction index of the counter arrays. */
+enum ifs_tx_rx {
+ IFS_TX,
+ IFS_RX,
+ IFS_MAX_DIRECTIONS
+};
+
+/*
+ * For now, TCP, UDP, the rest.
+ * Used as the protocol index of data_counters.bpc.
+ */
+enum ifs_proto {
+ IFS_TCP,
+ IFS_UDP,
+ IFS_PROTO_OTHER,
+ IFS_MAX_PROTOS
+};
+
+/* A pair of running totals: bytes and packets. */
+struct byte_packet_counters {
+ uint64_t bytes;
+ uint64_t packets;
+};
+
+/* Byte/packet totals indexed by [counter set][direction][protocol]. */
+struct data_counters {
+ struct byte_packet_counters bpc[IFS_MAX_COUNTER_SETS][IFS_MAX_DIRECTIONS][IFS_MAX_PROTOS];
+};
+
+/*
+ * Total bytes over all protocols (TCP, UDP, other) for counter set @set
+ * in the given @direction.
+ */
+static inline uint64_t dc_sum_bytes(struct data_counters *counters,
+				    int set,
+				    enum ifs_tx_rx direction)
+{
+	uint64_t total = 0;
+	int proto;
+
+	for (proto = 0; proto < IFS_MAX_PROTOS; proto++)
+		total += counters->bpc[set][direction][proto].bytes;
+	return total;
+}
+
+/*
+ * Total packets over all protocols (TCP, UDP, other) for counter set
+ * @set in the given @direction.
+ */
+static inline uint64_t dc_sum_packets(struct data_counters *counters,
+				      int set,
+				      enum ifs_tx_rx direction)
+{
+	uint64_t total = 0;
+	int proto;
+
+	for (proto = 0; proto < IFS_MAX_PROTOS; proto++)
+		total += counters->bpc[set][direction][proto].packets;
+	return total;
+}
+
+
+/* Generic X based nodes used as a base for rb_tree ops */
+struct tag_node {
+ struct rb_node node;
+ tag_t tag;
+};
+
+/*
+ * Counters kept for one tag. Instances live in an iface_stat's
+ * tag_stat_tree, linked through tn.node.
+ */
+struct tag_stat {
+ struct tag_node tn;
+ struct data_counters counters;
+ /*
+ * If this tag is acct_tag based, we need to count against the
+ * matching parent uid_tag.
+ */
+ struct data_counters *parent_counters;
+};
+
+/*
+ * Statistics kept for one network interface, including the tree of
+ * per-tag counters (tag_stat_tree) collected on it.
+ */
+struct iface_stat {
+ struct list_head list; /* in iface_stat_list */
+ char *ifname;
+ bool active;
+ /* net_dev is only valid for active iface_stat */
+ struct net_device *net_dev;
+
+ struct byte_packet_counters totals_via_dev[IFS_MAX_DIRECTIONS];
+ struct data_counters totals_via_skb;
+ /*
+ * We keep the last_known, because some devices reset their counters
+ * just before NETDEV_UP, while some will reset just before
+ * NETDEV_REGISTER (which is more normal).
+ * So now, if the device didn't do a NETDEV_UNREGISTER and we see
+ * its current dev stats smaller than what was previously known, we
+ * assume an UNREGISTER and just use the last_known.
+ */
+ struct byte_packet_counters last_known[IFS_MAX_DIRECTIONS];
+ /* last_known is usable when last_known_valid is true */
+ bool last_known_valid;
+
+ struct proc_dir_entry *proc_ptr;
+
+ struct rb_root tag_stat_tree;
+ spinlock_t tag_stat_list_lock;
+};
+
+/* This is needed to create proc_dir_entries from atomic context. */
+struct iface_stat_work {
+ struct work_struct iface_work;
+ struct iface_stat *iface_entry;
+};
+
+/*
+ * Track tag that this socket is transferring data for, and not necessarily
+ * the uid that owns the socket.
+ * This is the tag against which tag_stat.counters will be billed.
+ * These structs need to be looked up by sock and pid.
+ */
+struct sock_tag {
+ struct rb_node sock_node;
+ struct sock *sk; /* Only used as a number, never dereferenced */
+ /* Used to associate with a given pid */
+ struct list_head list; /* in proc_qtu_data.sock_tag_list */
+ pid_t pid;
+
+ tag_t tag;
+};
+
+/* Module-wide event counters; every field is a 64-bit atomic. */
+struct qtaguid_event_counts {
+ /* Various successful events */
+ atomic64_t sockets_tagged;
+ atomic64_t sockets_untagged;
+ atomic64_t counter_set_changes;
+ atomic64_t delete_cmds;
+ atomic64_t iface_events; /* Number of NETDEV_* events handled */
+
+ atomic64_t match_calls; /* Number of times iptables called mt */
+ /* Number of times iptables called mt from pre or post routing hooks */
+ atomic64_t match_calls_prepost;
+ /*
+ * match_found_sk_*: numbers related to the netfilter matching
+ * function finding a sock for the sk_buff.
+ * Total skbs processed is sum(match_found*).
+ */
+ atomic64_t match_found_sk; /* An sk was already in the sk_buff. */
+ /* The connection tracker had or didn't have the sk. */
+ atomic64_t match_found_sk_in_ct;
+ atomic64_t match_found_no_sk_in_ct;
+ /*
+ * No sk could be found. No apparent owner. Could happen with
+ * unsolicited traffic.
+ */
+ atomic64_t match_no_sk;
+ /*
+ * The file ptr in the sk_socket wasn't there and we couldn't get GID.
+ * This might happen for traffic while the socket is being closed.
+ */
+ atomic64_t match_no_sk_gid;
+};
+
+/* Track the set active_set for the given tag. */
+struct tag_counter_set {
+ struct tag_node tn;
+ int active_set;
+};
+
+/*----------------------------------------------*/
+/*
+ * The qtu uid data is used to track resources that are created directly or
+ * indirectly by processes (uid tracked).
+ * It is shared by the processes with the same uid.
+ * Some of the resource will be counted to prevent further rogue allocations,
+ * some will need freeing once the owner process (uid) exits.
+ */
+struct uid_tag_data {
+ struct rb_node node;
+ uid_t uid;
+
+ /*
+ * For the uid, how many accounting tags have been set.
+ */
+ int num_active_tags;
+ /* Track the number of proc_qtu_data that reference it */
+ int num_pqd;
+ /* The tag_ref entries of this uid */
+ struct rb_root tag_ref_tree;
+ /* tag_ref_tree has no lock of its own; use uid_tag_data_tree_lock */
+};
+
+/*
+ * Reference record for one tag of a uid; kept in that uid's
+ * uid_tag_data.tag_ref_tree, linked through tn.node.
+ */
+struct tag_ref {
+ struct tag_node tn;
+
+ /*
+ * This tracks the number of active sockets that have a tag on them
+ * which matches this tag_ref.tn.tag.
+ * A tag ref can live on after the sockets are untagged.
+ * A tag ref can only be removed during a tag delete command.
+ */
+ int num_sock_tags;
+};
+
+/*
+ * Tracking record for one process that has the qtaguid device open.
+ * Created by qtudev_open() and kept in proc_qtu_data_tree.
+ */
+struct proc_qtu_data {
+ struct rb_node node;
+ pid_t pid; /* set to the opener's tgid */
+
+ /* The uid_tag_data shared by all processes of the opener's uid */
+ struct uid_tag_data *parent_tag_data;
+
+ /* Tracks the sock_tags that need freeing upon this proc's death */
+ struct list_head sock_tag_list;
+ /* No spinlock_t sock_tag_list_lock; use the global one. */
+};
+
+/*----------------------------------------------*/
+#endif /* ifndef __XT_QTAGUID_INTERNAL_H__ */
diff --git a/net/netfilter/xt_qtaguid_print.c b/net/netfilter/xt_qtaguid_print.c
new file mode 100644
index 000000000000..2a7190d285e6
--- /dev/null
+++ b/net/netfilter/xt_qtaguid_print.c
@@ -0,0 +1,566 @@
+/*
+ * Pretty printing Support for iptables xt_qtaguid module.
+ *
+ * (C) 2011 Google, Inc
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * Most of the functions in this file just waste time if DEBUG is not defined.
+ * The matching xt_qtaguid_print.h will static inline empty funcs if the needed
+ * debug flags are not defined.
+ * Those funcs that fail to allocate memory will panic as there is no need to
+ * hobble along just pretending to do the requested work.
+ */
+
+#define DEBUG
+
+#include <linux/fs.h>
+#include <linux/gfp.h>
+#include <linux/net.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/spinlock_types.h>
+#include <net/sock.h>
+
+#include "xt_qtaguid_internal.h"
+#include "xt_qtaguid_print.h"
+
+#ifdef DDEBUG
+
+/*
+ * Log an error and BUG() if @ptr is NULL or an error pointer; used on
+ * the results of the kasprintf() calls in this file.
+ */
+static void _bug_on_err_or_null(void *ptr)
+{
+	if (!IS_ERR_OR_NULL(ptr))
+		return;
+
+	pr_err("qtaguid: kmalloc failed\n");
+	BUG();
+}
+
+/*
+ * Format @tag (which may be NULL) as a newly allocated string.
+ * The caller must kfree() the result; allocation failure is fatal.
+ */
+char *pp_tag_t(tag_t *tag)
+{
+	char *out = tag
+		? kasprintf(GFP_ATOMIC,
+			    "tag_t@%p{tag=0x%llx, uid=%u}",
+			    tag, *tag, get_uid_from_tag(*tag))
+		: kasprintf(GFP_ATOMIC, "tag_t@null{}");
+
+	_bug_on_err_or_null(out);
+	return out;
+}
+
+/*
+ * Format @dc (which may be NULL) as a newly allocated string.
+ * With @showValues the rx/tx byte and packet counts of counter sets 0
+ * and 1 are listed per protocol; without it only the address is shown.
+ * The caller must kfree() the result; allocation failure is fatal.
+ */
+char *pp_data_counters(struct data_counters *dc, bool showValues)
+{
+	struct byte_packet_counters (*s0)[IFS_MAX_PROTOS];
+	struct byte_packet_counters (*s1)[IFS_MAX_PROTOS];
+	char *out;
+
+	if (!dc) {
+		out = kasprintf(GFP_ATOMIC, "data_counters@null{}");
+	} else if (!showValues) {
+		out = kasprintf(GFP_ATOMIC, "data_counters@%p{...}", dc);
+	} else {
+		/* Per-set views, indexed [direction][protocol]. */
+		s0 = dc->bpc[0];
+		s1 = dc->bpc[1];
+		out = kasprintf(
+			GFP_ATOMIC, "data_counters@%p{"
+			"set0{"
+			"rx{"
+			"tcp{b=%llu, p=%llu}, "
+			"udp{b=%llu, p=%llu},"
+			"other{b=%llu, p=%llu}}, "
+			"tx{"
+			"tcp{b=%llu, p=%llu}, "
+			"udp{b=%llu, p=%llu},"
+			"other{b=%llu, p=%llu}}}, "
+			"set1{"
+			"rx{"
+			"tcp{b=%llu, p=%llu}, "
+			"udp{b=%llu, p=%llu},"
+			"other{b=%llu, p=%llu}}, "
+			"tx{"
+			"tcp{b=%llu, p=%llu}, "
+			"udp{b=%llu, p=%llu},"
+			"other{b=%llu, p=%llu}}}}",
+			dc,
+			s0[IFS_RX][IFS_TCP].bytes,
+			s0[IFS_RX][IFS_TCP].packets,
+			s0[IFS_RX][IFS_UDP].bytes,
+			s0[IFS_RX][IFS_UDP].packets,
+			s0[IFS_RX][IFS_PROTO_OTHER].bytes,
+			s0[IFS_RX][IFS_PROTO_OTHER].packets,
+			s0[IFS_TX][IFS_TCP].bytes,
+			s0[IFS_TX][IFS_TCP].packets,
+			s0[IFS_TX][IFS_UDP].bytes,
+			s0[IFS_TX][IFS_UDP].packets,
+			s0[IFS_TX][IFS_PROTO_OTHER].bytes,
+			s0[IFS_TX][IFS_PROTO_OTHER].packets,
+			s1[IFS_RX][IFS_TCP].bytes,
+			s1[IFS_RX][IFS_TCP].packets,
+			s1[IFS_RX][IFS_UDP].bytes,
+			s1[IFS_RX][IFS_UDP].packets,
+			s1[IFS_RX][IFS_PROTO_OTHER].bytes,
+			s1[IFS_RX][IFS_PROTO_OTHER].packets,
+			s1[IFS_TX][IFS_TCP].bytes,
+			s1[IFS_TX][IFS_TCP].packets,
+			s1[IFS_TX][IFS_UDP].bytes,
+			s1[IFS_TX][IFS_UDP].packets,
+			s1[IFS_TX][IFS_PROTO_OTHER].bytes,
+			s1[IFS_TX][IFS_PROTO_OTHER].packets);
+	}
+	_bug_on_err_or_null(out);
+	return out;
+}
+
+/*
+ * Format @tn (which may be NULL) as a newly allocated string.
+ * The caller must kfree() the result; allocation failure is fatal.
+ */
+char *pp_tag_node(struct tag_node *tn)
+{
+	char *tag_text;
+	char *out;
+
+	if (tn) {
+		tag_text = pp_tag_t(&tn->tag);
+		out = kasprintf(GFP_ATOMIC,
+				"tag_node@%p{tag=%s}",
+				tn, tag_text);
+		_bug_on_err_or_null(out);
+		kfree(tag_text);
+	} else {
+		out = kasprintf(GFP_ATOMIC, "tag_node@null{}");
+		_bug_on_err_or_null(out);
+	}
+	return out;
+}
+
+/*
+ * Format @tr (which may be NULL) as a newly allocated string.
+ * The caller must kfree() the result; allocation failure is fatal.
+ */
+char *pp_tag_ref(struct tag_ref *tr)
+{
+	char *node_text;
+	char *out;
+
+	if (tr) {
+		node_text = pp_tag_node(&tr->tn);
+		out = kasprintf(GFP_ATOMIC,
+				"tag_ref@%p{%s, num_sock_tags=%d}",
+				tr, node_text, tr->num_sock_tags);
+		_bug_on_err_or_null(out);
+		kfree(node_text);
+	} else {
+		out = kasprintf(GFP_ATOMIC, "tag_ref@null{}");
+		_bug_on_err_or_null(out);
+	}
+	return out;
+}
+
+/*
+ * Format @ts (which may be NULL) as a newly allocated string: its tag
+ * node, its own counters with values, and the parent counters without.
+ * The caller must kfree() the result; allocation failure is fatal.
+ */
+char *pp_tag_stat(struct tag_stat *ts)
+{
+	char *node_text;
+	char *own_text;
+	char *parent_text;
+	char *out;
+
+	if (ts == NULL) {
+		out = kasprintf(GFP_ATOMIC, "tag_stat@null{}");
+		_bug_on_err_or_null(out);
+		return out;
+	}
+
+	node_text = pp_tag_node(&ts->tn);
+	own_text = pp_data_counters(&ts->counters, true);
+	parent_text = pp_data_counters(ts->parent_counters, false);
+
+	out = kasprintf(GFP_ATOMIC,
+			"tag_stat@%p{%s, counters=%s, parent_counters=%s}",
+			ts, node_text, own_text, parent_text);
+	_bug_on_err_or_null(out);
+
+	kfree(node_text);
+	kfree(own_text);
+	kfree(parent_text);
+	return out;
+}
+
+/*
+ * Format @is (which may be NULL) as a newly allocated string.
+ * The skb totals shown are those of counter set 0, summed over all
+ * protocols.
+ * The caller must kfree() the result; allocation failure is fatal.
+ */
+char *pp_iface_stat(struct iface_stat *is)
+{
+	struct data_counters *skb_totals;
+	char *out;
+
+	if (!is) {
+		out = kasprintf(GFP_ATOMIC, "iface_stat@null{}");
+		_bug_on_err_or_null(out);
+		return out;
+	}
+
+	skb_totals = &is->totals_via_skb;
+	out = kasprintf(GFP_ATOMIC, "iface_stat@%p{"
+			"list=list_head{...}, "
+			"ifname=%s, "
+			"total_dev={rx={bytes=%llu, "
+			"packets=%llu}, "
+			"tx={bytes=%llu, "
+			"packets=%llu}}, "
+			"total_skb={rx={bytes=%llu, "
+			"packets=%llu}, "
+			"tx={bytes=%llu, "
+			"packets=%llu}}, "
+			"last_known_valid=%d, "
+			"last_known={rx={bytes=%llu, "
+			"packets=%llu}, "
+			"tx={bytes=%llu, "
+			"packets=%llu}}, "
+			"active=%d, "
+			"net_dev=%p, "
+			"proc_ptr=%p, "
+			"tag_stat_tree=rb_root{...}}",
+			is,
+			is->ifname,
+			is->totals_via_dev[IFS_RX].bytes,
+			is->totals_via_dev[IFS_RX].packets,
+			is->totals_via_dev[IFS_TX].bytes,
+			is->totals_via_dev[IFS_TX].packets,
+			dc_sum_bytes(skb_totals, 0, IFS_RX),
+			dc_sum_packets(skb_totals, 0, IFS_RX),
+			dc_sum_bytes(skb_totals, 0, IFS_TX),
+			dc_sum_packets(skb_totals, 0, IFS_TX),
+			is->last_known_valid,
+			is->last_known[IFS_RX].bytes,
+			is->last_known[IFS_RX].packets,
+			is->last_known[IFS_TX].bytes,
+			is->last_known[IFS_TX].packets,
+			is->active,
+			is->net_dev,
+			is->proc_ptr);
+	_bug_on_err_or_null(out);
+	return out;
+}
+
+/*
+ * Format @st (which may be NULL) as a newly allocated string.
+ * The value printed as "f_count" is read from st->sk->sk_refcnt.
+ * The caller must kfree() the result; allocation failure is fatal.
+ */
+char *pp_sock_tag(struct sock_tag *st)
+{
+	char *tag_text;
+	char *out;
+
+	if (st) {
+		tag_text = pp_tag_t(&st->tag);
+		out = kasprintf(GFP_ATOMIC, "sock_tag@%p{"
+				"sock_node=rb_node{...}, "
+				"sk=%p (f_count=%d), list=list_head{...}, "
+				"pid=%u, tag=%s}",
+				st, st->sk,
+				atomic_read(&st->sk->sk_refcnt),
+				st->pid, tag_text);
+		_bug_on_err_or_null(out);
+		kfree(tag_text);
+	} else {
+		out = kasprintf(GFP_ATOMIC, "sock_tag@null{}");
+		_bug_on_err_or_null(out);
+	}
+	return out;
+}
+
+/*
+ * Format @utd (which may be NULL) as a newly allocated string.
+ * The caller must kfree() the result; allocation failure is fatal.
+ */
+char *pp_uid_tag_data(struct uid_tag_data *utd)
+{
+	char *out;
+
+	if (utd) {
+		out = kasprintf(GFP_ATOMIC, "uid_tag_data@%p{"
+				"uid=%u, num_active_acct_tags=%d, "
+				"num_pqd=%d, "
+				"tag_node_tree=rb_root{...}, "
+				"proc_qtu_data_tree=rb_root{...}}",
+				utd, utd->uid,
+				utd->num_active_tags, utd->num_pqd);
+	} else {
+		out = kasprintf(GFP_ATOMIC, "uid_tag_data@null{}");
+	}
+	_bug_on_err_or_null(out);
+	return out;
+}
+
+/*
+ * Format @pqd (which may be NULL) as a newly allocated string,
+ * including its parent uid_tag_data.
+ * The caller must kfree() the result; allocation failure is fatal.
+ */
+char *pp_proc_qtu_data(struct proc_qtu_data *pqd)
+{
+	char *parent_text;
+	char *out;
+
+	if (pqd) {
+		parent_text = pp_uid_tag_data(pqd->parent_tag_data);
+		out = kasprintf(GFP_ATOMIC, "proc_qtu_data@%p{"
+				"node=rb_node{...}, pid=%u, "
+				"parent_tag_data=%s, "
+				"sock_tag_list=list_head{...}}",
+				pqd, pqd->pid, parent_text);
+		_bug_on_err_or_null(out);
+		kfree(parent_text);
+	} else {
+		out = kasprintf(GFP_ATOMIC, "proc_qtu_data@null{}");
+		_bug_on_err_or_null(out);
+	}
+	return out;
+}
+
+/*------------------------------------------*/
+/*
+ * Dump every sock_tag of @sock_tag_tree with pr_debug(), one per line,
+ * nested one level below @indent_level.
+ * Does nothing unless DDEBUG_MASK is set in qtaguid_debug_mask.
+ */
+void prdebug_sock_tag_tree(int indent_level,
+			   struct rb_root *sock_tag_tree)
+{
+	const int inner = indent_level + 1;
+	struct rb_node *cur;
+	char *line;
+
+	if (likely(!(qtaguid_debug_mask & DDEBUG_MASK)))
+		return;
+
+	if (RB_EMPTY_ROOT(sock_tag_tree)) {
+		pr_debug("%*d: %s\n", indent_level*2, indent_level,
+			 "sock_tag_tree=rb_root{}");
+		return;
+	}
+
+	pr_debug("%*d: %s\n", indent_level*2, indent_level,
+		 "sock_tag_tree=rb_root{");
+	cur = rb_first(sock_tag_tree);
+	while (cur) {
+		line = pp_sock_tag(rb_entry(cur, struct sock_tag, sock_node));
+		pr_debug("%*d: %s,\n", inner*2, inner, line);
+		kfree(line);
+		cur = rb_next(cur);
+	}
+	pr_debug("%*d: %s\n", indent_level*2, indent_level, "}");
+}
+
+/*
+ * Dump every sock_tag of @sock_tag_list with pr_debug(), one per line,
+ * nested one level below @indent_level.
+ * Does nothing unless DDEBUG_MASK is set in qtaguid_debug_mask.
+ */
+void prdebug_sock_tag_list(int indent_level,
+			   struct list_head *sock_tag_list)
+{
+	const int inner = indent_level + 1;
+	struct sock_tag *cur;
+	char *line;
+
+	if (likely(!(qtaguid_debug_mask & DDEBUG_MASK)))
+		return;
+
+	if (list_empty(sock_tag_list)) {
+		pr_debug("%*d: %s\n", indent_level*2, indent_level,
+			 "sock_tag_list=list_head{}");
+		return;
+	}
+
+	pr_debug("%*d: %s\n", indent_level*2, indent_level,
+		 "sock_tag_list=list_head{");
+	list_for_each_entry(cur, sock_tag_list, list) {
+		line = pp_sock_tag(cur);
+		pr_debug("%*d: %s,\n", inner*2, inner, line);
+		kfree(line);
+	}
+	pr_debug("%*d: %s\n", indent_level*2, indent_level, "}");
+}
+
+/*
+ * Dump every proc_qtu_data of @proc_qtu_data_tree with pr_debug(), each
+ * followed by its sock_tag_list nested one level deeper.
+ * Does nothing unless DDEBUG_MASK is set in qtaguid_debug_mask.
+ */
+void prdebug_proc_qtu_data_tree(int indent_level,
+				struct rb_root *proc_qtu_data_tree)
+{
+	const int inner = indent_level + 1;
+	struct proc_qtu_data *pqd;
+	struct rb_node *cur;
+	char *line;
+
+	if (likely(!(qtaguid_debug_mask & DDEBUG_MASK)))
+		return;
+
+	if (RB_EMPTY_ROOT(proc_qtu_data_tree)) {
+		pr_debug("%*d: %s\n", indent_level*2, indent_level,
+			 "proc_qtu_data_tree=rb_root{}");
+		return;
+	}
+
+	pr_debug("%*d: %s\n", indent_level*2, indent_level,
+		 "proc_qtu_data_tree=rb_root{");
+	cur = rb_first(proc_qtu_data_tree);
+	while (cur) {
+		pqd = rb_entry(cur, struct proc_qtu_data, node);
+		line = pp_proc_qtu_data(pqd);
+		pr_debug("%*d: %s,\n", inner*2, inner, line);
+		kfree(line);
+		prdebug_sock_tag_list(inner + 1, &pqd->sock_tag_list);
+		cur = rb_next(cur);
+	}
+	pr_debug("%*d: %s\n", indent_level*2, indent_level, "}");
+}
+
+/*
+ * Dump every tag_ref of @tag_ref_tree with pr_debug(), one per line,
+ * nested one level below @indent_level.
+ * Does nothing unless DDEBUG_MASK is set in qtaguid_debug_mask.
+ */
+void prdebug_tag_ref_tree(int indent_level, struct rb_root *tag_ref_tree)
+{
+	const int inner = indent_level + 1;
+	struct rb_node *cur;
+	char *line;
+
+	if (likely(!(qtaguid_debug_mask & DDEBUG_MASK)))
+		return;
+
+	if (RB_EMPTY_ROOT(tag_ref_tree)) {
+		pr_debug("%*d: %s\n", indent_level*2, indent_level,
+			 "tag_ref_tree{}");
+		return;
+	}
+
+	pr_debug("%*d: %s\n", indent_level*2, indent_level,
+		 "tag_ref_tree{");
+	cur = rb_first(tag_ref_tree);
+	while (cur) {
+		line = pp_tag_ref(rb_entry(cur, struct tag_ref, tn.node));
+		pr_debug("%*d: %s,\n", inner*2, inner, line);
+		kfree(line);
+		cur = rb_next(cur);
+	}
+	pr_debug("%*d: %s\n", indent_level*2, indent_level, "}");
+}
+
+/*
+ * Dump every uid_tag_data of @uid_tag_data_tree with pr_debug(); an
+ * entry with a non-empty tag_ref_tree has that tree dumped one level
+ * deeper.
+ * Does nothing unless DDEBUG_MASK is set in qtaguid_debug_mask.
+ */
+void prdebug_uid_tag_data_tree(int indent_level,
+			       struct rb_root *uid_tag_data_tree)
+{
+	const int inner = indent_level + 1;
+	struct uid_tag_data *utd;
+	struct rb_node *cur;
+	char *line;
+
+	if (likely(!(qtaguid_debug_mask & DDEBUG_MASK)))
+		return;
+
+	if (RB_EMPTY_ROOT(uid_tag_data_tree)) {
+		pr_debug("%*d: %s\n", indent_level*2, indent_level,
+			 "uid_tag_data_tree=rb_root{}");
+		return;
+	}
+
+	pr_debug("%*d: %s\n", indent_level*2, indent_level,
+		 "uid_tag_data_tree=rb_root{");
+	cur = rb_first(uid_tag_data_tree);
+	while (cur) {
+		utd = rb_entry(cur, struct uid_tag_data, node);
+		line = pp_uid_tag_data(utd);
+		pr_debug("%*d: %s,\n", inner*2, inner, line);
+		kfree(line);
+		if (!RB_EMPTY_ROOT(&utd->tag_ref_tree))
+			prdebug_tag_ref_tree(inner + 1, &utd->tag_ref_tree);
+		cur = rb_next(cur);
+	}
+	pr_debug("%*d: %s\n", indent_level*2, indent_level, "}");
+}
+
+/*
+ * Dump every tag_stat of @tag_stat_tree with pr_debug(), one per line,
+ * nested one level below @indent_level.
+ * Does nothing unless DDEBUG_MASK is set in qtaguid_debug_mask.
+ */
+void prdebug_tag_stat_tree(int indent_level,
+			   struct rb_root *tag_stat_tree)
+{
+	const int inner = indent_level + 1;
+	struct rb_node *cur;
+	char *line;
+
+	if (likely(!(qtaguid_debug_mask & DDEBUG_MASK)))
+		return;
+
+	if (RB_EMPTY_ROOT(tag_stat_tree)) {
+		pr_debug("%*d: %s\n", indent_level*2, indent_level,
+			 "tag_stat_tree{}");
+		return;
+	}
+
+	pr_debug("%*d: %s\n", indent_level*2, indent_level,
+		 "tag_stat_tree{");
+	cur = rb_first(tag_stat_tree);
+	while (cur) {
+		line = pp_tag_stat(rb_entry(cur, struct tag_stat, tn.node));
+		pr_debug("%*d: %s\n", inner*2, inner, line);
+		kfree(line);
+		cur = rb_next(cur);
+	}
+	pr_debug("%*d: %s\n", indent_level*2, indent_level, "}");
+}
+
+/*
+ * Dump every iface_stat of @iface_stat_list with pr_debug(); an entry
+ * with a non-empty tag_stat_tree has that tree dumped one level deeper
+ * while the entry's tag_stat_list_lock is held.
+ * Does nothing unless DDEBUG_MASK is set in qtaguid_debug_mask.
+ */
+void prdebug_iface_stat_list(int indent_level,
+			     struct list_head *iface_stat_list)
+{
+	const int inner = indent_level + 1;
+	struct iface_stat *cur;
+	char *line;
+
+	if (likely(!(qtaguid_debug_mask & DDEBUG_MASK)))
+		return;
+
+	if (list_empty(iface_stat_list)) {
+		pr_debug("%*d: %s\n", indent_level*2, indent_level,
+			 "iface_stat_list=list_head{}");
+		return;
+	}
+
+	pr_debug("%*d: %s\n", indent_level*2, indent_level,
+		 "iface_stat_list=list_head{");
+	list_for_each_entry(cur, iface_stat_list, list) {
+		line = pp_iface_stat(cur);
+		pr_debug("%*d: %s\n", inner*2, inner, line);
+		kfree(line);
+
+		spin_lock_bh(&cur->tag_stat_list_lock);
+		if (!RB_EMPTY_ROOT(&cur->tag_stat_tree))
+			prdebug_tag_stat_tree(inner + 1, &cur->tag_stat_tree);
+		spin_unlock_bh(&cur->tag_stat_list_lock);
+	}
+	pr_debug("%*d: %s\n", indent_level*2, indent_level, "}");
+}
+
+#endif /* ifdef DDEBUG */
+/*------------------------------------------*/
+/*
+ * Printable names looked up by netdev_evt_str(), indexed by the event
+ * number; index 0 is the placeholder "netdev_unknown".
+ */
+static const char * const netdev_event_strings[] = {
+ "netdev_unknown",
+ "NETDEV_UP",
+ "NETDEV_DOWN",
+ "NETDEV_REBOOT",
+ "NETDEV_CHANGE",
+ "NETDEV_REGISTER",
+ "NETDEV_UNREGISTER",
+ "NETDEV_CHANGEMTU",
+ "NETDEV_CHANGEADDR",
+ "NETDEV_GOING_DOWN",
+ "NETDEV_CHANGENAME",
+ "NETDEV_FEAT_CHANGE",
+ "NETDEV_BONDING_FAILOVER",
+ "NETDEV_PRE_UP",
+ "NETDEV_PRE_TYPE_CHANGE",
+ "NETDEV_POST_TYPE_CHANGE",
+ "NETDEV_POST_INIT",
+ "NETDEV_UNREGISTER_BATCH",
+ "NETDEV_RELEASE",
+ "NETDEV_NOTIFY_PEERS",
+ "NETDEV_JOIN",
+};
+
+/*
+ * Return the printable name of @netdev_event from netdev_event_strings,
+ * or "bad event num" when the number is outside the table.
+ */
+const char *netdev_evt_str(int netdev_event)
+{
+	if (netdev_event >= 0
+	    && netdev_event < ARRAY_SIZE(netdev_event_strings))
+		return netdev_event_strings[netdev_event];
+	return "bad event num";
+}
diff --git a/net/netfilter/xt_qtaguid_print.h b/net/netfilter/xt_qtaguid_print.h
new file mode 100644
index 000000000000..b63871a0be5a
--- /dev/null
+++ b/net/netfilter/xt_qtaguid_print.h
@@ -0,0 +1,120 @@
+/*
+ * Pretty printing Support for iptables xt_qtaguid module.
+ *
+ * (C) 2011 Google, Inc
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __XT_QTAGUID_PRINT_H__
+#define __XT_QTAGUID_PRINT_H__
+
+#include "xt_qtaguid_internal.h"
+
+#ifdef DDEBUG
+
+char *pp_tag_t(tag_t *tag);
+char *pp_data_counters(struct data_counters *dc, bool showValues);
+char *pp_tag_node(struct tag_node *tn);
+char *pp_tag_ref(struct tag_ref *tr);
+char *pp_tag_stat(struct tag_stat *ts);
+char *pp_iface_stat(struct iface_stat *is);
+char *pp_sock_tag(struct sock_tag *st);
+char *pp_uid_tag_data(struct uid_tag_data *qtd);
+char *pp_proc_qtu_data(struct proc_qtu_data *pqd);
+
+/*------------------------------------------*/
+void prdebug_sock_tag_list(int indent_level,
+ struct list_head *sock_tag_list);
+void prdebug_sock_tag_tree(int indent_level,
+ struct rb_root *sock_tag_tree);
+void prdebug_proc_qtu_data_tree(int indent_level,
+ struct rb_root *proc_qtu_data_tree);
+void prdebug_tag_ref_tree(int indent_level, struct rb_root *tag_ref_tree);
+void prdebug_uid_tag_data_tree(int indent_level,
+ struct rb_root *uid_tag_data_tree);
+void prdebug_tag_stat_tree(int indent_level,
+ struct rb_root *tag_stat_tree);
+void prdebug_iface_stat_list(int indent_level,
+ struct list_head *iface_stat_list);
+
+#else
+
+/*------------------------------------------*/
+/*
+ * Stubs used when DDEBUG is not defined: every pp_*() pretty-printer
+ * ignores its argument and returns NULL, so callers compile unchanged
+ * without the debug code.
+ */
+static inline char *pp_tag_t(tag_t *tag)
+{
+ return NULL;
+}
+static inline char *pp_data_counters(struct data_counters *dc, bool showValues)
+{
+ return NULL;
+}
+static inline char *pp_tag_node(struct tag_node *tn)
+{
+ return NULL;
+}
+static inline char *pp_tag_ref(struct tag_ref *tr)
+{
+ return NULL;
+}
+static inline char *pp_tag_stat(struct tag_stat *ts)
+{
+ return NULL;
+}
+static inline char *pp_iface_stat(struct iface_stat *is)
+{
+ return NULL;
+}
+static inline char *pp_sock_tag(struct sock_tag *st)
+{
+ return NULL;
+}
+static inline char *pp_uid_tag_data(struct uid_tag_data *qtd)
+{
+ return NULL;
+}
+static inline char *pp_proc_qtu_data(struct proc_qtu_data *pqd)
+{
+ return NULL;
+}
+
+/*------------------------------------------*/
+/*
+ * Stubs used when DDEBUG is not defined: the prdebug_*() dump helpers
+ * have empty bodies and do nothing.
+ */
+static inline
+void prdebug_sock_tag_list(int indent_level,
+ struct list_head *sock_tag_list)
+{
+}
+static inline
+void prdebug_sock_tag_tree(int indent_level,
+ struct rb_root *sock_tag_tree)
+{
+}
+static inline
+void prdebug_proc_qtu_data_tree(int indent_level,
+ struct rb_root *proc_qtu_data_tree)
+{
+}
+static inline
+void prdebug_tag_ref_tree(int indent_level, struct rb_root *tag_ref_tree)
+{
+}
+static inline
+void prdebug_uid_tag_data_tree(int indent_level,
+ struct rb_root *uid_tag_data_tree)
+{
+}
+static inline
+void prdebug_tag_stat_tree(int indent_level,
+ struct rb_root *tag_stat_tree)
+{
+}
+static inline
+void prdebug_iface_stat_list(int indent_level,
+ struct list_head *iface_stat_list)
+{
+}
+#endif
+/*------------------------------------------*/
+const char *netdev_evt_str(int netdev_event);
+#endif /* ifndef __XT_QTAGUID_PRINT_H__ */
diff --git a/net/netfilter/xt_quota2.c b/net/netfilter/xt_quota2.c
new file mode 100644
index 000000000000..834594aa0085
--- /dev/null
+++ b/net/netfilter/xt_quota2.c
@@ -0,0 +1,401 @@
+/*
+ * xt_quota2 - enhanced xt_quota that can count upwards and in packets
+ * as a minimal accounting match.
+ * by Jan Engelhardt <jengelh@medozas.de>, 2008
+ *
+ * Originally based on xt_quota.c:
+ * netfilter module to enforce network quotas
+ * Sam Johnston <samj@samj.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License; either
+ * version 2 of the License, as published by the Free Software Foundation.
+ */
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <asm/atomic.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_quota2.h>
+
+#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG
+/* For compatibility, these definitions are copied from the
+ * deprecated header file <linux/netfilter_ipv4/ipt_ULOG.h> */
+#define ULOG_MAC_LEN 80
+#define ULOG_PREFIX_LEN 32
+
+/* Format of the ULOG packets passed through netlink */
+/*
+ * Layout of the netlink payload built by quota2_log().  The only
+ * fields quota2_log() fills with packet information are data_len
+ * (always 0), hook, prefix, indev_name and outdev_name.
+ */
+typedef struct ulog_packet_msg {
+ unsigned long mark;
+ long timestamp_sec;
+ long timestamp_usec;
+ unsigned int hook;
+ char indev_name[IFNAMSIZ];
+ char outdev_name[IFNAMSIZ];
+ size_t data_len;
+ char prefix[ULOG_PREFIX_LEN];
+ unsigned char mac_len;
+ unsigned char mac[ULOG_MAC_LEN];
+ unsigned char payload[0];
+} ulog_packet_msg_t;
+#endif
+
+/**
+ * struct xt_quota_counter - one quota value, optionally named
+ * @quota: current counter value
+ * @lock: lock to protect quota writers from each other
+ * @list: link in counter_list (named counters only)
+ * @ref: number of rules sharing this counter (named counters only)
+ * @name: counter name, also used as the procfs file name
+ * @procfs_entry: procfs file created for the counter in q2_get_counter()
+ *
+ * Anonymous counters are allocated only up to, and not including, @list
+ * (see q2_new_counter()), so for them nothing from @list onward may be
+ * accessed.  Keep @quota and @lock in front of @list.
+ */
+struct xt_quota_counter {
+ u_int64_t quota;
+ spinlock_t lock;
+ struct list_head list;
+ atomic_t ref;
+ char name[sizeof(((struct xt_quota_mtinfo2 *)NULL)->name)];
+ struct proc_dir_entry *procfs_entry;
+};
+
+#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG
+/* Harald's favorite number +1 :D From ipt_ULOG.C */
+/*
+ * Netlink message type used by quota2_log(); 0 turns logging off.
+ * Declared unsigned so the variable matches the "uint" type it is
+ * registered with in module_param_named().
+ */
+static unsigned int qlog_nl_event = 112;
+module_param_named(event_num, qlog_nl_event, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(event_num,
+		 "Event number for NETLINK_NFLOG message. 0 disables log. "
+		 "111 is what ipt_ULOG uses.");
+/* Kernel-side netlink socket the log messages are broadcast on. */
+static struct sock *nflognl;
+#endif
+
+/* All named counters, linked through xt_quota_counter.list. */
+static LIST_HEAD(counter_list);
+/* Taken around every walk or modification of counter_list in this file. */
+static DEFINE_SPINLOCK(counter_list_lock);
+
+/* Directory created in quota_mt2_init(); holds one file per named counter. */
+static struct proc_dir_entry *proc_xt_quota;
+/* Mode and owner given to each per-counter procfs file on creation. */
+static unsigned int quota_list_perms = S_IRUGO | S_IWUSR;
+static kuid_t quota_list_uid = KUIDT_INIT(0);
+static kgid_t quota_list_gid = KGIDT_INIT(0);
+module_param_named(perms, quota_list_perms, uint, S_IRUGO | S_IWUSR);
+
+#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG
+/*
+ * quota2_log - broadcast a notice on the NFLOG netlink socket
+ * @hooknum: netfilter hook number, stored in the message
+ * @skb: packet being matched; stamped with the current time if it has
+ *       no timestamp yet
+ * @in: input device, or NULL (reported as an empty name)
+ * @out: output device, or NULL (reported as an empty name)
+ * @prefix: label copied into the message (quota_mt2() passes the quota
+ *          name), or NULL for an empty label
+ *
+ * Does nothing when qlog_nl_event is 0.  No packet payload is attached:
+ * data_len is always 0.  Allocation failures are logged and otherwise
+ * ignored.
+ */
+static void quota2_log(unsigned int hooknum,
+		       const struct sk_buff *skb,
+		       const struct net_device *in,
+		       const struct net_device *out,
+		       const char *prefix)
+{
+	ulog_packet_msg_t *pm;
+	struct sk_buff *log_skb;
+	size_t size;
+	struct nlmsghdr *nlh;
+
+	if (!qlog_nl_event)
+		return;
+
+	size = NLMSG_SPACE(sizeof(*pm));
+	size = max(size, (size_t)NLMSG_GOODSIZE);
+	log_skb = alloc_skb(size, GFP_ATOMIC);
+	if (!log_skb) {
+		pr_err("xt_quota2: cannot alloc skb for logging\n");
+		return;
+	}
+
+	nlh = nlmsg_put(log_skb, /*pid*/0, /*seq*/0, qlog_nl_event,
+			sizeof(*pm), 0);
+	if (!nlh) {
+		pr_err("xt_quota2: nlmsg_put failed\n");
+		kfree_skb(log_skb);
+		return;
+	}
+	pm = nlmsg_data(nlh);
+	/*
+	 * The skb data area is not zeroed on allocation.  Clear the whole
+	 * message so that the fields not set below (mark, timestamps, mac)
+	 * and the tails of the string buffers do not carry stale kernel
+	 * memory to user space.
+	 */
+	memset(pm, 0, sizeof(*pm));
+	if (skb->tstamp.tv64 == 0)
+		__net_timestamp((struct sk_buff *)skb);
+	pm->data_len = 0;
+	pm->hook = hooknum;
+	if (prefix != NULL)
+		strlcpy(pm->prefix, prefix, sizeof(pm->prefix));
+	else
+		*(pm->prefix) = '\0';
+	if (in)
+		strlcpy(pm->indev_name, in->name, sizeof(pm->indev_name));
+	else
+		pm->indev_name[0] = '\0';
+
+	if (out)
+		strlcpy(pm->outdev_name, out->name, sizeof(pm->outdev_name));
+	else
+		pm->outdev_name[0] = '\0';
+
+	NETLINK_CB(log_skb).dst_group = 1;
+	pr_debug("throwing 1 packets to netlink group 1\n");
+	netlink_broadcast(nflognl, log_skb, 0, 1, GFP_ATOMIC);
+}
+#else
+/* Logging compiled out: nothing to do. */
+static void quota2_log(unsigned int hooknum,
+		       const struct sk_buff *skb,
+		       const struct net_device *in,
+		       const struct net_device *out,
+		       const char *prefix)
+{
+}
+#endif /* if+else CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG */
+
+/*
+ * procfs read handler: reports the counter's current value as a decimal
+ * number followed by a newline.  The value is sampled under the
+ * counter's lock; formatting happens afterwards on the local copy.
+ */
+static ssize_t quota_proc_read(struct file *file, char __user *buf,
+			       size_t size, loff_t *ppos)
+{
+	struct xt_quota_counter *counter = PDE_DATA(file_inode(file));
+	u_int64_t snapshot;
+	char text[24];
+	size_t text_len;
+
+	spin_lock_bh(&counter->lock);
+	snapshot = counter->quota;
+	spin_unlock_bh(&counter->lock);
+
+	text_len = scnprintf(text, sizeof(text), "%llu\n", snapshot);
+	return simple_read_from_buffer(buf, size, ppos, text, text_len);
+}
+
+/*
+ * procfs write handler: parses the written text as an unsigned number
+ * (base auto-detected by simple_strtoull()) and stores it as the new
+ * counter value.
+ *
+ * At most sizeof(buf) bytes are taken from user space; the return value
+ * is the number of bytes taken, or -EFAULT if the copy failed.
+ */
+static ssize_t quota_proc_write(struct file *file, const char __user *input,
+				size_t size, loff_t *ppos)
+{
+	struct xt_quota_counter *e = PDE_DATA(file_inode(file));
+	char buf[sizeof("18446744073709551616")];
+
+	if (size > sizeof(buf))
+		size = sizeof(buf);
+	if (copy_from_user(buf, input, size) != 0)
+		return -EFAULT;
+	/*
+	 * Terminate directly behind the copied bytes.  Terminating only the
+	 * last byte of buf would let the parser run into uninitialised
+	 * stack contents whenever the write is shorter than the buffer.
+	 */
+	buf[min_t(size_t, size, sizeof(buf) - 1)] = '\0';
+
+	spin_lock_bh(&e->lock);
+	e->quota = simple_strtoull(buf, NULL, 0);
+	spin_unlock_bh(&e->lock);
+	return size;
+}
+
+/* File operations of the per-counter procfs file created in q2_get_counter(). */
+static const struct file_operations q2_counter_fops = {
+ .read = quota_proc_read,
+ .write = quota_proc_write,
+ .llseek = default_llseek,
+};
+
+/*
+ * Allocate a counter and seed it with the quota from @q.
+ *
+ * With @anon set, only the leading part of the structure (everything in
+ * front of the list member) is allocated and initialised; the list,
+ * refcount and name members do not exist for such a counter.
+ * Otherwise the full structure is allocated, the refcount starts at 1
+ * and the name is copied from @q.
+ *
+ * Returns the new counter, or NULL if the allocation failed.
+ */
+static struct xt_quota_counter *
+q2_new_counter(const struct xt_quota_mtinfo2 *q, bool anon)
+{
+	struct xt_quota_counter *counter;
+	size_t alloc_len = sizeof(*counter);
+
+	/* Do not need all the procfs things for anonymous counters. */
+	if (anon)
+		alloc_len = offsetof(struct xt_quota_counter, list);
+
+	counter = kmalloc(alloc_len, GFP_KERNEL);
+	if (!counter)
+		return NULL;
+
+	spin_lock_init(&counter->lock);
+	counter->quota = q->quota;
+	if (anon)
+		return counter;
+
+	INIT_LIST_HEAD(&counter->list);
+	atomic_set(&counter->ref, 1);
+	strlcpy(counter->name, q->name, sizeof(counter->name));
+	return counter;
+}
+
+/**
+ * q2_get_counter - get ref to counter or create new
+ * @q: match info carrying the counter name and the initial quota
+ *
+ * An empty name yields a fresh anonymous counter.  Otherwise the
+ * counter with that name on counter_list gets its refcount raised and
+ * is returned; if none exists, a new one is added to the list and a
+ * procfs file is created for it.
+ *
+ * Returns the counter, or NULL if allocation or procfs creation failed.
+ */
+static struct xt_quota_counter *
+q2_get_counter(const struct xt_quota_mtinfo2 *q)
+{
+ struct proc_dir_entry *p;
+ struct xt_quota_counter *e = NULL;
+ struct xt_quota_counter *new_e;
+
+ if (*q->name == '\0')
+ return q2_new_counter(q, true);
+
+ /* No need to hold a lock while getting a new counter */
+ new_e = q2_new_counter(q, false);
+ /* e is still NULL here, so the kfree() at out: is a no-op. */
+ if (new_e == NULL)
+ goto out;
+
+ spin_lock_bh(&counter_list_lock);
+ list_for_each_entry(e, &counter_list, list)
+ if (strcmp(e->name, q->name) == 0) {
+ atomic_inc(&e->ref);
+ spin_unlock_bh(&counter_list_lock);
+ /* The speculatively allocated counter is not needed. */
+ kfree(new_e);
+ pr_debug("xt_quota2: old counter name=%s", e->name);
+ return e;
+ }
+ e = new_e;
+ pr_debug("xt_quota2: new_counter name=%s", e->name);
+ list_add_tail(&e->list, &counter_list);
+ /* The entry, having a refcount of 1, is not directly destructible.
+ * This function has not yet returned the new entry, thus iptables
+ * has no reference through which to destroy it.
+ * For another rule to try to destroy it, this function would first
+ * have to be re-invoked and acquire a new ref for the same named quota.
+ * Nobody will access the e->procfs_entry either.
+ * So release the lock. */
+ spin_unlock_bh(&counter_list_lock);
+
+ /* proc_create_data() is not spin_lock happy, so it runs unlocked */
+ p = e->procfs_entry = proc_create_data(e->name, quota_list_perms,
+ proc_xt_quota, &q2_counter_fops, e);
+
+ if (IS_ERR_OR_NULL(p)) {
+ /* NOTE(review): the entry was visible on counter_list while the
+ * lock was dropped; if another caller took a reference in that
+ * window, the kfree() below would free it underneath them --
+ * confirm whether callers are serialised elsewhere. */
+ spin_lock_bh(&counter_list_lock);
+ list_del(&e->list);
+ spin_unlock_bh(&counter_list_lock);
+ goto out;
+ }
+ proc_set_user(p, quota_list_uid, quota_list_gid);
+ return e;
+
+ out:
+ kfree(e);
+ return NULL;
+}
+
+/*
+ * checkentry hook: validates the rule's flags and counter name, then
+ * binds the rule to its counter through q2_get_counter().
+ *
+ * Returns 0 on success, -EINVAL for unknown flag bits or a name that
+ * starts with '.' or contains '/', and -ENOMEM when no counter could
+ * be obtained.
+ */
+static int quota_mt2_check(const struct xt_mtchk_param *par)
+{
+	struct xt_quota_mtinfo2 *info = par->matchinfo;
+	struct xt_quota_counter *counter;
+
+	pr_debug("xt_quota2: check() flags=0x%04x", info->flags);
+
+	if ((info->flags & ~XT_QUOTA_MASK) != 0)
+		return -EINVAL;
+
+	/* The name comes from user space; force termination before use. */
+	info->name[sizeof(info->name) - 1] = '\0';
+	if (info->name[0] == '.' || strchr(info->name, '/')) {
+		printk(KERN_ERR "xt_quota.3: illegal name\n");
+		return -EINVAL;
+	}
+
+	counter = q2_get_counter(info);
+	info->master = counter;
+	if (!counter) {
+		printk(KERN_ERR "xt_quota.3: memory alloc failure\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * destroy hook: drops the rule's reference on its counter.
+ *
+ * Anonymous counters (empty name) belong to exactly one rule and are
+ * freed directly.  A named counter is unlinked, its procfs file removed
+ * and its memory freed once the last reference is gone.
+ */
+static void quota_mt2_destroy(const struct xt_mtdtor_param *par)
+{
+	struct xt_quota_mtinfo2 *q = par->matchinfo;
+	struct xt_quota_counter *e = q->master;
+
+	if (*q->name == '\0') {
+		kfree(e);
+		return;
+	}
+
+	spin_lock_bh(&counter_list_lock);
+	if (!atomic_dec_and_test(&e->ref)) {
+		spin_unlock_bh(&counter_list_lock);
+		return;
+	}
+
+	list_del(&e->list);
+	spin_unlock_bh(&counter_list_lock);
+
+	/*
+	 * remove_proc_entry() may sleep, so it must not run under the
+	 * spinlock.  The counter is already off the list with no references
+	 * left, so nobody else can reach it any more.
+	 */
+	remove_proc_entry(e->name, proc_xt_quota);
+	kfree(e);
+}
+
+/*
+ * match hook: charges the packet against the rule's counter.
+ *
+ * In "grow" mode the counter is increased and the rule always matches.
+ * Otherwise the rule matches (subject to XT_QUOTA_INVERT) while enough
+ * quota remains for this packet; once it does not, the counter is
+ * forced to 0 and, on that transition, quota2_log() is called.
+ * XT_QUOTA_NO_CHANGE suppresses the increase/decrease of the counter.
+ */
+static bool
+quota_mt2(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	struct xt_quota_mtinfo2 *q = (void *)par->matchinfo;
+	struct xt_quota_counter *e = q->master;
+	bool ret = q->flags & XT_QUOTA_INVERT;
+	/*
+	 * What this packet costs in the counter's own unit: one packet in
+	 * packet mode, its length in byte mode.  The same value is used for
+	 * the "enough quota left?" test, so a packet-mode quota is no
+	 * longer compared against a byte count.
+	 */
+	const u_int64_t cost = (q->flags & XT_QUOTA_PACKET) ? 1 : skb->len;
+
+	spin_lock_bh(&e->lock);
+	if (q->flags & XT_QUOTA_GROW) {
+		/*
+		 * While no_change is pointless in "grow" mode, we will
+		 * implement it here simply to have a consistent behavior.
+		 */
+		if (!(q->flags & XT_QUOTA_NO_CHANGE))
+			e->quota += cost;
+		ret = true;
+	} else if (e->quota >= cost) {
+		if (!(q->flags & XT_QUOTA_NO_CHANGE))
+			e->quota -= cost;
+		ret = !ret;
+	} else {
+		/* We are transitioning, log that fact. */
+		if (e->quota) {
+			quota2_log(par->hooknum,
+				   skb,
+				   par->in,
+				   par->out,
+				   q->name);
+		}
+		/* we do not allow even small packets from now on */
+		e->quota = 0;
+	}
+	spin_unlock_bh(&e->lock);
+	return ret;
+}
+
+/*
+ * Match registrations for "quota2" revision 3.  The IPv4 and IPv6
+ * entries are identical except for .family.
+ */
+static struct xt_match quota_mt2_reg[] __read_mostly = {
+ {
+ .name = "quota2",
+ .revision = 3,
+ .family = NFPROTO_IPV4,
+ .checkentry = quota_mt2_check,
+ .match = quota_mt2,
+ .destroy = quota_mt2_destroy,
+ .matchsize = sizeof(struct xt_quota_mtinfo2),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "quota2",
+ .revision = 3,
+ .family = NFPROTO_IPV6,
+ .checkentry = quota_mt2_check,
+ .match = quota_mt2,
+ .destroy = quota_mt2_destroy,
+ .matchsize = sizeof(struct xt_quota_mtinfo2),
+ .me = THIS_MODULE,
+ },
+};
+
+/*
+ * Module init: creates the logging netlink socket (when logging is
+ * configured), the "xt_quota" procfs directory and registers the
+ * matches.  Everything set up so far is torn down again on failure.
+ *
+ * Returns 0 on success, -ENOMEM if the netlink socket could not be
+ * created, -EACCES if the procfs directory could not be created, or
+ * the error from xt_register_matches().
+ */
+static int __init quota_mt2_init(void)
+{
+	int ret;
+	pr_debug("xt_quota2: init()");
+
+#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG
+	nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, NULL);
+	if (!nflognl)
+		return -ENOMEM;
+#endif
+
+	proc_xt_quota = proc_mkdir("xt_quota", init_net.proc_net);
+	if (proc_xt_quota == NULL) {
+		ret = -EACCES;
+		goto err_release_nl;
+	}
+
+	ret = xt_register_matches(quota_mt2_reg, ARRAY_SIZE(quota_mt2_reg));
+	pr_debug("xt_quota2: init() %d", ret);
+	if (ret >= 0)
+		return ret;
+
+	remove_proc_entry("xt_quota", init_net.proc_net);
+err_release_nl:
+#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG
+	/* Do not leave the netlink socket behind when init fails. */
+	netlink_kernel_release(nflognl);
+#endif
+	return ret;
+}
+
+static void __exit quota_mt2_exit(void)
+{
+ xt_unregister_matches(quota_mt2_reg, ARRAY_SIZE(quota_mt2_reg));
+ remove_proc_entry("xt_quota", init_net.proc_net);
+}
+
+module_init(quota_mt2_init);
+module_exit(quota_mt2_exit);
+MODULE_DESCRIPTION("Xtables: countdown quota match; up counter");
+MODULE_AUTHOR("Sam Johnston <samj@samj.net>");
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_quota2");
+MODULE_ALIAS("ip6t_quota2");
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index b10ade272b50..a52fbaf52691 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -144,13 +144,14 @@ static bool xt_socket_sk_is_transparent(struct sock *sk)
}
}
-static struct sock *xt_socket_lookup_slow_v4(struct net *net,
+struct sock *xt_socket_lookup_slow_v4(struct net *net,
const struct sk_buff *skb,
const struct net_device *indev)
{
const struct iphdr *iph = ip_hdr(skb);
struct sk_buff *data_skb = NULL;
int doff = 0;
+ struct sock *sk = skb->sk;
__be32 uninitialized_var(daddr), uninitialized_var(saddr);
__be16 uninitialized_var(dport), uninitialized_var(sport);
u8 uninitialized_var(protocol);
@@ -205,9 +206,16 @@ static struct sock *xt_socket_lookup_slow_v4(struct net *net,
}
#endif
- return xt_socket_get_sock_v4(net, data_skb, doff, protocol, saddr,
- daddr, sport, dport, indev);
+ if (sk)
+ atomic_inc(&sk->sk_refcnt);
+ else
+ sk = xt_socket_get_sock_v4(dev_net(skb->dev), data_skb, doff,
+ protocol, saddr, daddr, sport,
+ dport, indev);
+
+ return sk;
}
+EXPORT_SYMBOL(xt_socket_lookup_slow_v4);
static bool
socket_match(const struct sk_buff *skb, struct xt_action_param *par,
@@ -239,8 +247,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
transparent)
pskb->mark = sk->sk_mark;
- if (sk != skb->sk)
- sock_gen_put(sk);
+ sock_gen_put(sk);
if (wildcard || !transparent)
sk = NULL;
@@ -344,10 +351,11 @@ xt_socket_get_sock_v6(struct net *net, struct sk_buff *skb, int doff,
return NULL;
}
-static struct sock *xt_socket_lookup_slow_v6(struct net *net,
+struct sock *xt_socket_lookup_slow_v6(struct net *net,
const struct sk_buff *skb,
const struct net_device *indev)
{
+ struct sock *sk = skb->sk;
__be16 uninitialized_var(dport), uninitialized_var(sport);
const struct in6_addr *daddr = NULL, *saddr = NULL;
struct ipv6hdr *iph = ipv6_hdr(skb);
@@ -387,9 +395,16 @@ static struct sock *xt_socket_lookup_slow_v6(struct net *net,
return NULL;
}
- return xt_socket_get_sock_v6(net, data_skb, doff, tproto, saddr, daddr,
- sport, dport, indev);
+ if (sk)
+ atomic_inc(&sk->sk_refcnt);
+ else
+ sk = xt_socket_get_sock_v6(dev_net(skb->dev), data_skb, doff,
+ tproto, saddr, daddr, sport, dport,
+ indev);
+
+ return sk;
}
+EXPORT_SYMBOL(xt_socket_lookup_slow_v6);
static bool
socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
diff --git a/net/rfkill/Kconfig b/net/rfkill/Kconfig
index 868f1ad0415a..8463a6d4d508 100644
--- a/net/rfkill/Kconfig
+++ b/net/rfkill/Kconfig
@@ -10,6 +10,11 @@ menuconfig RFKILL
To compile this driver as a module, choose M here: the
module will be called rfkill.
+config RFKILL_PM
+ bool "Power off on suspend"
+ depends on RFKILL && PM
+ default y
+
# LED trigger support
config RFKILL_LEDS
bool
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index 884027f62783..a9a7128f039e 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -796,8 +796,7 @@ void rfkill_resume_polling(struct rfkill *rfkill)
}
EXPORT_SYMBOL(rfkill_resume_polling);
-#ifdef CONFIG_PM_SLEEP
-static int rfkill_suspend(struct device *dev)
+static __maybe_unused int rfkill_suspend(struct device *dev)
{
struct rfkill *rfkill = to_rfkill(dev);
@@ -807,7 +806,7 @@ static int rfkill_suspend(struct device *dev)
return 0;
}
-static int rfkill_resume(struct device *dev)
+static __maybe_unused int rfkill_resume(struct device *dev)
{
struct rfkill *rfkill = to_rfkill(dev);
bool cur;
@@ -827,17 +826,13 @@ static int rfkill_resume(struct device *dev)
}
static SIMPLE_DEV_PM_OPS(rfkill_pm_ops, rfkill_suspend, rfkill_resume);
-#define RFKILL_PM_OPS (&rfkill_pm_ops)
-#else
-#define RFKILL_PM_OPS NULL
-#endif
static struct class rfkill_class = {
.name = "rfkill",
.dev_release = rfkill_release,
.dev_groups = rfkill_dev_groups,
.dev_uevent = rfkill_dev_uevent,
- .pm = RFKILL_PM_OPS,
+ .pm = IS_ENABLED(CONFIG_RFKILL_PM) ? &rfkill_pm_ops : NULL,
};
bool rfkill_blocked(struct rfkill *rfkill)
diff --git a/net/socket.c b/net/socket.c
index d9e2989c10c4..7d47cb73725b 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -534,8 +534,22 @@ static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer,
return used;
}
+/*
+ * setattr for sockfs inodes: applies the change with simple_setattr()
+ * and, when the owner was changed, mirrors the new uid into the
+ * socket's sk_uid.
+ *
+ * Returns 0 on success, the error from simple_setattr(), or -ENOENT
+ * when the owner was changed but the socket no longer has a struct
+ * sock attached.
+ */
+static int sockfs_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+	int err = simple_setattr(dentry, iattr);
+
+	if (!err && (iattr->ia_valid & ATTR_UID)) {
+		struct socket *sock = SOCKET_I(d_inode(dentry));
+
+		/* sock->sk is NULL once the socket has been released. */
+		if (sock->sk)
+			sock->sk->sk_uid = iattr->ia_uid;
+		else
+			err = -ENOENT;
+	}
+
+	return err;
+}
+
static const struct inode_operations sockfs_inode_ops = {
.listxattr = sockfs_listxattr,
+ .setattr = sockfs_setattr,
};
/**
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 435f904c1be5..38caa6ab1fa7 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -70,7 +70,7 @@ module_param(bss_entries_limit, int, 0644);
MODULE_PARM_DESC(bss_entries_limit,
"limit to number of scan BSS entries (per wiphy, default 1000)");
-#define IEEE80211_SCAN_RESULT_EXPIRE (30 * HZ)
+#define IEEE80211_SCAN_RESULT_EXPIRE (7 * HZ)
static void bss_free(struct cfg80211_internal_bss *bss)
{
diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c
index 44ac85fe2bc9..d0ca0dbf494e 100644
--- a/net/xfrm/xfrm_algo.c
+++ b/net/xfrm/xfrm_algo.c
@@ -241,7 +241,7 @@ static struct xfrm_algo_desc aalg_list[] = {
.uinfo = {
.auth = {
- .icv_truncbits = 96,
+ .icv_truncbits = 128,
.icv_fullbits = 256,
}
},
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 026770884d46..e0b1e8d30f76 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1735,6 +1735,10 @@ static struct sk_buff *xfrm_policy_netlink(struct sk_buff *in_skb,
struct sk_buff *skb;
int err;
+ err = verify_policy_dir(dir);
+ if (err)
+ return ERR_PTR(err);
+
skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!skb)
return ERR_PTR(-ENOMEM);
@@ -2256,6 +2260,10 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh,
int n = 0;
struct net *net = sock_net(skb->sk);
+ err = verify_policy_dir(pi->dir);
+ if (err)
+ return err;
+
if (attrs[XFRMA_MIGRATE] == NULL)
return -EINVAL;
@@ -2371,6 +2379,11 @@ static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
{
struct net *net = &init_net;
struct sk_buff *skb;
+ int err;
+
+ err = verify_policy_dir(dir);
+ if (err)
+ return err;
skb = nlmsg_new(xfrm_migrate_msgsize(num_migrate, !!k), GFP_ATOMIC);
if (skb == NULL)
@@ -3028,6 +3041,11 @@ out_free_skb:
static int xfrm_send_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c)
{
+ int err;
+
+ err = verify_policy_dir(dir);
+ if (err)
+ return err;
switch (c->event) {
case XFRM_MSG_NEWPOLICY:
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 72c58675973e..b2cdced61090 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -22,6 +22,7 @@ hostprogs-y += spintest
hostprogs-y += map_perf_test
hostprogs-y += test_overhead
hostprogs-y += test_cgrp2_array_pin
+hostprogs-y += test_cgrp2_attach
hostprogs-y += xdp1
hostprogs-y += xdp2
hostprogs-y += test_current_task_under_cgroup
@@ -50,6 +51,7 @@ spintest-objs := bpf_load.o libbpf.o spintest_user.o
map_perf_test-objs := bpf_load.o libbpf.o map_perf_test_user.o
test_overhead-objs := bpf_load.o libbpf.o test_overhead_user.o
test_cgrp2_array_pin-objs := libbpf.o test_cgrp2_array_pin.o
+test_cgrp2_attach-objs := libbpf.o test_cgrp2_attach.o
xdp1-objs := bpf_load.o libbpf.o xdp1_user.o
# reuse xdp1 source intentionally
xdp2-objs := bpf_load.o libbpf.o xdp1_user.o
diff --git a/samples/bpf/libbpf.c b/samples/bpf/libbpf.c
index 9969e35550c3..9cbc786e48e5 100644
--- a/samples/bpf/libbpf.c
+++ b/samples/bpf/libbpf.c
@@ -104,6 +104,29 @@ int bpf_prog_load(enum bpf_prog_type prog_type,
return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}
+/*
+ * Attach the eBPF program @prog_fd to @target_fd for the given attach
+ * @type, passing @flags through as attach_flags.
+ *
+ * Returns the result of the bpf(BPF_PROG_ATTACH) system call.
+ */
+int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type,
+		    unsigned int flags)
+{
+	union bpf_attr attr = {
+		.target_fd = target_fd,
+		.attach_bpf_fd = prog_fd,
+		.attach_type = type,
+		.attach_flags = flags,
+	};
+
+	return syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
+}
+
+/*
+ * Issue bpf(BPF_PROG_DETACH) for @target_fd and attach @type.
+ * Returns the result of the system call.
+ */
+int bpf_prog_detach(int target_fd, enum bpf_attach_type type)
+{
+ union bpf_attr attr = {
+ .target_fd = target_fd,
+ .attach_type = type,
+ };
+
+ return syscall(__NR_bpf, BPF_PROG_DETACH, &attr, sizeof(attr));
+}
+
int bpf_obj_pin(int fd, const char *pathname)
{
union bpf_attr attr = {
diff --git a/samples/bpf/libbpf.h b/samples/bpf/libbpf.h
index ac6edb61b64a..b06cf5aea097 100644
--- a/samples/bpf/libbpf.h
+++ b/samples/bpf/libbpf.h
@@ -15,6 +15,10 @@ int bpf_prog_load(enum bpf_prog_type prog_type,
const struct bpf_insn *insns, int insn_len,
const char *license, int kern_version);
+int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type,
+ unsigned int flags);
+int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type);
+
int bpf_obj_pin(int fd, const char *pathname);
int bpf_obj_get(const char *pathname);
diff --git a/samples/bpf/test_cgrp2_attach.c b/samples/bpf/test_cgrp2_attach.c
new file mode 100644
index 000000000000..9de4896edeef
--- /dev/null
+++ b/samples/bpf/test_cgrp2_attach.c
@@ -0,0 +1,147 @@
+/* eBPF example program:
+ *
+ * - Creates arraymap in kernel with 4 bytes keys and 8 byte values
+ *
+ * - Loads eBPF program
+ *
+ * The eBPF program accesses the map passed in to store two pieces of
+ * information. The number of invocations of the program, which maps
+ * to the number of packets received, is stored to key 0. Key 1 is
+ * incremented on each iteration by the number of bytes stored in
+ * the skb.
+ *
+ * - Detaches any eBPF program previously attached to the cgroup
+ *
+ * - Attaches the new program to a cgroup using BPF_PROG_ATTACH
+ *
+ * - Every second, reads map[0] and map[1] to see how many bytes and
+ * packets were seen on any socket of tasks in the given cgroup.
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <string.h>
+#include <unistd.h>
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+
+#include <linux/bpf.h>
+
+#include "libbpf.h"
+
+/* Keys of the array map: the program built by prog_load() adds 1 under
+ * the first and skb->len under the second. */
+enum {
+ MAP_KEY_PACKETS,
+ MAP_KEY_BYTES,
+};
+
+/*
+ * Build and load a BPF_PROG_TYPE_CGROUP_SKB program that adds 1 to
+ * map[MAP_KEY_PACKETS] and skb->len to map[MAP_KEY_BYTES] in the map
+ * behind @map_fd, then exits with @verdict in r0.  Each update is
+ * skipped when the map lookup returns 0.
+ *
+ * Returns whatever bpf_prog_load() returns.
+ */
+static int prog_load(int map_fd, int verdict)
+{
+ struct bpf_insn prog[] = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), /* save r1 in r6 so it's not clobbered by BPF_CALL */
+
+ /* Count packets */
+ BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */
+ BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
+ BPF_LD_MAP_FD(BPF_REG_1, map_fd), /* load map fd to r1 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
+ BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
+
+ /* Count bytes */
+ BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */
+ BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
+ BPF_LD_MAP_FD(BPF_REG_1, map_fd),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */
+ BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
+
+ BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */
+ BPF_EXIT_INSN(),
+ };
+
+ return bpf_prog_load(BPF_PROG_TYPE_CGROUP_SKB,
+ prog, sizeof(prog), "GPL", 0);
+}
+
+/* Print the command-line synopsis and return EXIT_FAILURE so that main()
+ * can use the result directly as its exit status. */
+static int usage(const char *argv0)
+{
+ printf("Usage: %s <cg-path> <egress|ingress> [drop]\n", argv0);
+ return EXIT_FAILURE;
+}
+
+/*
+ * Usage: <cg-path> <egress|ingress> [drop]
+ *
+ * Opens the cgroup directory, creates the counter map, loads the
+ * counting program (verdict 0 with "drop", 1 otherwise), detaches any
+ * program of the chosen type from the cgroup, attaches the new one and
+ * then prints both counters once per second until killed.
+ *
+ * Returns EXIT_FAILURE when the arguments are invalid or any set-up
+ * step or map read fails.
+ */
+int main(int argc, char **argv)
+{
+	int cg_fd, map_fd, prog_fd, key, ret, detach_errno;
+	long long pkt_cnt, byte_cnt;
+	enum bpf_attach_type type;
+	int verdict = 1;
+
+	if (argc < 3)
+		return usage(argv[0]);
+
+	if (strcmp(argv[2], "ingress") == 0)
+		type = BPF_CGROUP_INET_INGRESS;
+	else if (strcmp(argv[2], "egress") == 0)
+		type = BPF_CGROUP_INET_EGRESS;
+	else
+		return usage(argv[0]);
+
+	if (argc > 3 && strcmp(argv[3], "drop") == 0)
+		verdict = 0;
+
+	cg_fd = open(argv[1], O_DIRECTORY | O_RDONLY);
+	if (cg_fd < 0) {
+		printf("Failed to open cgroup path: '%s'\n", strerror(errno));
+		return EXIT_FAILURE;
+	}
+
+	map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY,
+				sizeof(key), sizeof(byte_cnt),
+				256, 0);
+	if (map_fd < 0) {
+		printf("Failed to create map: '%s'\n", strerror(errno));
+		return EXIT_FAILURE;
+	}
+
+	prog_fd = prog_load(map_fd, verdict);
+	printf("Output from kernel verifier:\n%s\n-------\n", bpf_log_buf);
+
+	if (prog_fd < 0) {
+		printf("Failed to load prog: '%s'\n", strerror(errno));
+		return EXIT_FAILURE;
+	}
+
+	/*
+	 * Detaching is best effort.  errno is only meaningful when the call
+	 * failed, so report 0 on success instead of a stale value left
+	 * behind by an earlier call.
+	 */
+	ret = bpf_prog_detach(cg_fd, type);
+	detach_errno = ret < 0 ? errno : 0;
+	printf("bpf_prog_detach() returned '%s' (%d)\n",
+	       strerror(detach_errno), detach_errno);
+
+	ret = bpf_prog_attach(prog_fd, cg_fd, type, 0);
+	if (ret < 0) {
+		printf("Failed to attach prog to cgroup: '%s'\n",
+		       strerror(errno));
+		return EXIT_FAILURE;
+	}
+
+	while (1) {
+		/*
+		 * The lookups are done outside of assert() so that they
+		 * still run when the sample is built with NDEBUG.
+		 */
+		key = MAP_KEY_PACKETS;
+		if (bpf_lookup_elem(map_fd, &key, &pkt_cnt) != 0) {
+			printf("Failed to read packet count: '%s'\n",
+			       strerror(errno));
+			return EXIT_FAILURE;
+		}
+
+		key = MAP_KEY_BYTES;
+		if (bpf_lookup_elem(map_fd, &key, &byte_cnt) != 0) {
+			printf("Failed to read byte count: '%s'\n",
+			       strerror(errno));
+			return EXIT_FAILURE;
+		}
+
+		printf("cgroup received %lld packets, %lld bytes\n",
+		       pkt_cnt, byte_cnt);
+		sleep(1);
+	}
+
+	return EXIT_SUCCESS;
+}
diff --git a/scripts/Makefile.clean b/scripts/Makefile.clean
index 50616ea25131..2e70c6f06354 100644
--- a/scripts/Makefile.clean
+++ b/scripts/Makefile.clean
@@ -11,7 +11,7 @@ include scripts/Kbuild.include
# The filename Kbuild has precedence over Makefile
kbuild-dir := $(if $(filter /%,$(src)),$(src),$(srctree)/$(src))
-include $(if $(wildcard $(kbuild-dir)/Kbuild), $(kbuild-dir)/Kbuild, $(kbuild-dir)/Makefile)
+-include $(if $(wildcard $(kbuild-dir)/Kbuild), $(kbuild-dir)/Kbuild, $(kbuild-dir)/Makefile)
# Figure out what we need to build from the various variables
# ==========================================================================
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index 4e02d51dfc62..45b5f5589a7b 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -314,6 +314,12 @@ $(obj)/%.dtb: $(src)/%.dts FORCE
dtc-tmp = $(subst $(comma),_,$(dot-target).dts.tmp)
+# cat
+# ---------------------------------------------------------------------------
+# Concatenate multiple files together
+quiet_cmd_cat = CAT $@
+cmd_cat = (cat $(filter-out FORCE,$^) > $@) || (rm -f $@; false)
+
# Bzip2
# ---------------------------------------------------------------------------
diff --git a/scripts/Makefile.modinst b/scripts/Makefile.modinst
index 07650eeaaf06..6f4c3f5a7ae3 100644
--- a/scripts/Makefile.modinst
+++ b/scripts/Makefile.modinst
@@ -29,7 +29,7 @@ quiet_cmd_modules_install = INSTALL $@
INSTALL_MOD_DIR ?= extra
ext-mod-dir = $(INSTALL_MOD_DIR)$(subst $(patsubst %/,%,$(KBUILD_EXTMOD)),,$(@D))
-modinst_dir = $(if $(KBUILD_EXTMOD),$(ext-mod-dir),kernel/$(@D))
+modinst_dir ?= $(if $(KBUILD_EXTMOD),$(ext-mod-dir),kernel/$(@D))
$(modules):
$(call cmd,modules_install,$(MODLIB)/$(modinst_dir))
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 55171647f516..77bc36817c2e 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -2529,6 +2529,7 @@ sub process {
# Check for git id commit length and improperly formed commit descriptions
if ($in_commit_log && !$commit_log_possible_stack_dump &&
$line !~ /^\s*(?:Link|Patchwork|http|https|BugLink):/i &&
+ $line !~ /^This reverts commit [0-9a-f]{7,40}/ &&
($line =~ /\bcommit\s+[0-9a-f]{5,}\b/i ||
($line =~ /(?:\s|^)[0-9a-f]{12,40}(?:[\s"'\(\[]|$)/i &&
$line !~ /[\<\[][0-9a-f]{12,40}[\>\]]/i &&
diff --git a/security/Kconfig b/security/Kconfig
index 32f36b40e9f0..80a2934d3110 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -18,6 +18,15 @@ config SECURITY_DMESG_RESTRICT
If you are unsure how to answer this question, answer N.
+config SECURITY_PERF_EVENTS_RESTRICT
+ bool "Restrict unprivileged use of performance events"
+ depends on PERF_EVENTS
+ help
+ If you say Y here, the kernel.perf_event_paranoid sysctl
+ will be set to 3 by default, and no unprivileged use of the
+ perf_event_open syscall will be permitted unless it is
+ changed.
+
config SECURITY
bool "Enable different security models"
depends on SYSFS
diff --git a/security/commoncap.c b/security/commoncap.c
index 8df676fbd393..a8e4aacf0b0c 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -31,6 +31,10 @@
#include <linux/binfmts.h>
#include <linux/personality.h>
+#ifdef CONFIG_ANDROID_PARANOID_NETWORK
+#include <linux/android_aid.h>
+#endif
+
/*
* If a non-root user executes a setuid-root binary in
* !secure(SECURE_NOROOT) mode, then we raise capabilities.
@@ -73,6 +77,13 @@ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
{
struct user_namespace *ns = targ_ns;
+#ifdef CONFIG_ANDROID_PARANOID_NETWORK
+ if (cap == CAP_NET_RAW && in_egroup_p(AID_NET_RAW))
+ return 0;
+ if (cap == CAP_NET_ADMIN && in_egroup_p(AID_NET_ADMIN))
+ return 0;
+#endif
+
/* See if cred has the capability in the target user namespace
* by examining the target user namespace and all of the target
* user namespace's parents.
diff --git a/security/inode.c b/security/inode.c
index c83db05c15ab..b4531f2be0f1 100644
--- a/security/inode.c
+++ b/security/inode.c
@@ -100,7 +100,7 @@ struct dentry *securityfs_create_file(const char *name, umode_t mode,
dir = d_inode(parent);
inode_lock(dir);
- dentry = lookup_one_len(name, parent, strlen(name));
+ dentry = lookup_one_len2(name, mount, parent, strlen(name));
if (IS_ERR(dentry))
goto out;
diff --git a/security/security.c b/security/security.c
index f825304f04a7..1ba5274bef5c 100644
--- a/security/security.c
+++ b/security/security.c
@@ -508,6 +508,7 @@ int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid)
return 0;
return call_int_hook(path_chown, 0, path, uid, gid);
}
+EXPORT_SYMBOL(security_path_chown);
int security_path_chroot(const struct path *path)
{
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 8ded80867b92..ade3a90ada6b 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -479,6 +479,7 @@ static int selinux_is_sblabel_mnt(struct super_block *sb)
!strcmp(sb->s_type->name, "sysfs") ||
!strcmp(sb->s_type->name, "pstore") ||
!strcmp(sb->s_type->name, "debugfs") ||
+ !strcmp(sb->s_type->name, "tracefs") ||
!strcmp(sb->s_type->name, "rootfs");
}
@@ -797,6 +798,7 @@ static int selinux_set_mnt_opts(struct super_block *sb,
sbsec->flags |= SE_SBPROC | SE_SBGENFS;
if (!strcmp(sb->s_type->name, "debugfs") ||
+ !strcmp(sb->s_type->name, "tracefs") ||
!strcmp(sb->s_type->name, "sysfs") ||
!strcmp(sb->s_type->name, "pstore"))
sbsec->flags |= SE_SBGENFS;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index f09c70b97eca..b2d5be9fc909 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -73,6 +73,8 @@ enum bpf_cmd {
BPF_PROG_LOAD,
BPF_OBJ_PIN,
BPF_OBJ_GET,
+ BPF_PROG_ATTACH,
+ BPF_PROG_DETACH,
};
enum bpf_map_type {
@@ -96,8 +98,23 @@ enum bpf_prog_type {
BPF_PROG_TYPE_TRACEPOINT,
BPF_PROG_TYPE_XDP,
BPF_PROG_TYPE_PERF_EVENT,
+ BPF_PROG_TYPE_CGROUP_SKB,
};
+enum bpf_attach_type {
+ BPF_CGROUP_INET_INGRESS,
+ BPF_CGROUP_INET_EGRESS,
+ __MAX_BPF_ATTACH_TYPE
+};
+
+#define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE
+
+/* If the BPF_F_ALLOW_OVERRIDE flag is used in a BPF_PROG_ATTACH command
+ * on the given target_fd cgroup, descendant cgroups will be able to
+ * override the effective bpf program that was inherited from this cgroup.
+ */
+#define BPF_F_ALLOW_OVERRIDE (1U << 0)
+
#define BPF_PSEUDO_MAP_FD 1
/* flags for BPF_MAP_UPDATE_ELEM command */
@@ -141,6 +158,13 @@ union bpf_attr {
__aligned_u64 pathname;
__u32 bpf_fd;
};
+
+ struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */
+ __u32 target_fd; /* container object to attach to */
+ __u32 attach_bpf_fd; /* eBPF program to attach */
+ __u32 attach_type;
+ __u32 attach_flags;
+ };
} __attribute__((aligned(8)));
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -426,6 +450,67 @@ enum bpf_func_id {
*/
BPF_FUNC_set_hash_invalid,
+ /**
+ * int bpf_get_numa_node_id()
+ * Return: Id of current NUMA node.
+ */
+ BPF_FUNC_get_numa_node_id,
+
+ /**
+ * int bpf_skb_change_head(skb, len, flags)
+ * Grows headroom of skb and adjusts MAC header offset accordingly.
+ * Will extend/reallocate as required automatically.
+ * May change skb data pointer and will thus invalidate any check
+ * performed for direct packet access.
+ * @skb: pointer to skb
+ * @len: length of header to be pushed in front
+ * @flags: Flags (unused for now)
+ * Return: 0 on success or negative error
+ */
+ BPF_FUNC_skb_change_head,
+
+ /**
+ * int bpf_xdp_adjust_head(xdp_md, delta)
+ * Adjust the xdp_md.data by delta
+ * @xdp_md: pointer to xdp_md
+ * @delta: A positive/negative integer to be added to xdp_md.data
+ * Return: 0 on success or negative on error
+ */
+ BPF_FUNC_xdp_adjust_head,
+
+ /**
+ * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr)
+ * Copy a NUL terminated string from unsafe address. In case the string
+ * length is smaller than size, the target is not padded with further NUL
+ * bytes. In case the string length is larger than size, just size-1
+ * bytes are copied and the last byte is set to NUL.
+ * @dst: destination address
+ * @size: maximum number of bytes to copy, including the trailing NUL
+ * @unsafe_ptr: unsafe address
+ * Return:
+ * > 0 length of the string including the trailing NUL on success
+ * < 0 error
+ */
+ BPF_FUNC_probe_read_str,
+
+ /**
+ * u64 bpf_get_socket_cookie(skb)
+ * Get the cookie for the socket stored inside sk_buff.
+ * @skb: pointer to skb
+ * Return: 8-byte non-decreasing number on success or 0 if the socket
+ * field is missing inside sk_buff
+ */
+ BPF_FUNC_get_socket_cookie,
+
+ /**
+ * u32 bpf_get_socket_uid(skb)
+ * Get the owner uid of the socket stored inside sk_buff.
+ * @skb: pointer to skb
+ * Return: uid of the socket owner on success or 0 if the socket pointer
+ * inside sk_buff is NULL
+ */
+ BPF_FUNC_get_socket_uid,
+
__BPF_FUNC_MAX_ID,
};
diff --git a/tools/include/uapi/linux/hw_breakpoint.h b/tools/include/uapi/linux/hw_breakpoint.h
index b04000a2296a..2b65efd19a46 100644
--- a/tools/include/uapi/linux/hw_breakpoint.h
+++ b/tools/include/uapi/linux/hw_breakpoint.h
@@ -4,7 +4,11 @@
enum {
HW_BREAKPOINT_LEN_1 = 1,
HW_BREAKPOINT_LEN_2 = 2,
+ HW_BREAKPOINT_LEN_3 = 3,
HW_BREAKPOINT_LEN_4 = 4,
+ HW_BREAKPOINT_LEN_5 = 5,
+ HW_BREAKPOINT_LEN_6 = 6,
+ HW_BREAKPOINT_LEN_7 = 7,
HW_BREAKPOINT_LEN_8 = 8,
};