aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/ABI/testing/sysfs-kernel-wakeup_reasons16
-rw-r--r--Documentation/android.txt121
-rw-r--r--Documentation/cgroups/cgroups.txt9
-rw-r--r--Documentation/cpu-freq/governors.txt85
-rw-r--r--Documentation/device-mapper/verity.txt12
-rw-r--r--Documentation/filesystems/porting3
-rw-r--r--Documentation/filesystems/proc.txt10
-rw-r--r--Documentation/filesystems/squashfs.txt8
-rw-r--r--Documentation/networking/ip-sysctl.txt41
-rw-r--r--Documentation/sync.txt75
-rw-r--r--Documentation/sysctl/vm.txt16
-rw-r--r--Documentation/trace/ftrace.txt29
-rw-r--r--android/configs/README15
-rw-r--r--android/configs/android-base.cfg152
-rw-r--r--android/configs/android-recommended.cfg121
-rw-r--r--arch/Kconfig1
-rw-r--r--arch/alpha/kernel/osf_sys.c15
-rw-r--r--arch/arm/Kconfig31
-rw-r--r--arch/arm/Kconfig.debug21
-rw-r--r--arch/arm/Makefile2
-rw-r--r--arch/arm/boot/Makefile1
-rw-r--r--arch/arm/boot/compressed/head.S2
-rw-r--r--arch/arm/common/Kconfig4
-rw-r--r--arch/arm/common/Makefile1
-rw-r--r--arch/arm/common/fiq_glue.S118
-rw-r--r--arch/arm/common/fiq_glue_setup.c147
-rw-r--r--arch/arm/crypto/.gitignore1
-rw-r--r--arch/arm/crypto/Makefile24
-rw-r--r--arch/arm/crypto/aes_glue.c22
-rw-r--r--arch/arm/crypto/aes_glue.h19
-rw-r--r--arch/arm/crypto/aesbs-core.S_shipped2544
-rw-r--r--arch/arm/crypto/aesbs-glue.c434
-rw-r--r--arch/arm/crypto/bsaes-armv7.pl2467
-rw-r--r--arch/arm/crypto/sha1-armv7-neon.S634
-rw-r--r--arch/arm/crypto/sha1_glue.c58
-rw-r--r--arch/arm/crypto/sha1_neon_glue.c197
-rw-r--r--arch/arm/crypto/sha256-armv4.pl716
-rw-r--r--arch/arm/crypto/sha256-core.S_shipped2808
-rw-r--r--arch/arm/crypto/sha256_glue.c246
-rw-r--r--arch/arm/crypto/sha256_glue.h23
-rw-r--r--arch/arm/crypto/sha256_neon_glue.c172
-rw-r--r--arch/arm/crypto/sha512-armv7-neon.S455
-rw-r--r--arch/arm/crypto/sha512_neon_glue.c305
-rw-r--r--arch/arm/include/asm/Kbuild1
-rw-r--r--arch/arm/include/asm/barrier.h15
-rw-r--r--arch/arm/include/asm/cacheflush.h1
-rw-r--r--arch/arm/include/asm/crypto/sha1.h10
-rw-r--r--arch/arm/include/asm/fiq_glue.h33
-rw-r--r--arch/arm/include/asm/hardirq.h2
-rw-r--r--arch/arm/include/asm/hardware/cache-l2x0.h3
-rw-r--r--arch/arm/include/asm/irq.h3
-rw-r--r--arch/arm/include/asm/mach/mmc.h28
-rw-r--r--arch/arm/include/asm/neon.h36
-rw-r--r--arch/arm/include/asm/rodata.h32
-rw-r--r--arch/arm/include/asm/smp.h1
-rw-r--r--arch/arm/include/asm/syscall.h3
-rw-r--r--arch/arm/include/asm/unistd.h2
-rw-r--r--arch/arm/include/uapi/asm/unistd.h6
-rw-r--r--arch/arm/kernel/calls.S5
-rw-r--r--arch/arm/kernel/ftrace.c15
-rw-r--r--arch/arm/kernel/kgdb.c4
-rw-r--r--arch/arm/kernel/process.c127
-rw-r--r--arch/arm/kernel/ptrace.c7
-rw-r--r--arch/arm/kernel/smp.c58
-rw-r--r--arch/arm/mm/Makefile1
-rw-r--r--arch/arm/mm/cache-l2x0.c76
-rw-r--r--arch/arm/mm/cache-v6.S17
-rw-r--r--arch/arm/mm/fault.c4
-rw-r--r--arch/arm/mm/mmu.c106
-rw-r--r--arch/arm/mm/rodata.c159
-rw-r--r--arch/arm/vfp/vfpmodule.c49
-rw-r--r--arch/arm64/Kconfig95
-rw-r--r--arch/arm64/Makefile9
-rw-r--r--arch/arm64/boot/.gitignore1
-rw-r--r--arch/arm64/boot/Makefile13
-rw-r--r--arch/arm64/boot/dts/Makefile13
-rw-r--r--arch/arm64/crypto/Makefile2
-rw-r--r--arch/arm64/crypto/aes-glue.c12
-rw-r--r--arch/arm64/crypto/ghash-ce-core.S84
-rw-r--r--arch/arm64/crypto/ghash-ce-glue.c5
-rw-r--r--arch/arm64/include/asm/cmpxchg.h8
-rw-r--r--arch/arm64/include/asm/compat.h7
-rw-r--r--arch/arm64/include/asm/fpsimd.h23
-rw-r--r--arch/arm64/include/asm/fpsimdmacros.h35
-rw-r--r--arch/arm64/include/asm/neon.h18
-rw-r--r--arch/arm64/include/asm/opcodes.h231
-rw-r--r--arch/arm64/include/asm/ptrace.h17
-rw-r--r--arch/arm64/include/asm/seccomp.h25
-rw-r--r--arch/arm64/include/asm/syscall.h14
-rw-r--r--arch/arm64/include/asm/thread_info.h8
-rw-r--r--arch/arm64/include/asm/traps.h13
-rw-r--r--arch/arm64/include/asm/unistd.h20
-rw-r--r--arch/arm64/include/asm/unistd32.h1164
-rw-r--r--arch/arm64/include/uapi/asm/ptrace.h1
-rw-r--r--arch/arm64/kernel/Makefile7
-rw-r--r--arch/arm64/kernel/entry-fpsimd.S24
-rw-r--r--arch/arm64/kernel/entry.S9
-rw-r--r--arch/arm64/kernel/fpsimd.c187
-rw-r--r--arch/arm64/kernel/kuser32.S1
-rw-r--r--arch/arm64/kernel/opcodes.c72
-rw-r--r--arch/arm64/kernel/process.c68
-rw-r--r--arch/arm64/kernel/ptrace.c24
-rw-r--r--arch/arm64/kernel/setup.c6
-rw-r--r--arch/arm64/kernel/signal.c13
-rw-r--r--arch/arm64/kernel/signal32.c19
-rw-r--r--arch/arm64/kernel/swp_emulate.c223
-rw-r--r--arch/arm64/kernel/sys_compat.c2
-rw-r--r--arch/arm64/kernel/traps.c44
-rw-r--r--arch/arm64/mm/init.c4
-rw-r--r--arch/arm64/mm/proc.S15
-rw-r--r--arch/ia64/include/asm/barrier.h23
-rw-r--r--arch/metag/include/asm/barrier.h15
-rw-r--r--arch/mips/include/asm/barrier.h15
-rw-r--r--arch/parisc/hpux/fs.c16
-rw-r--r--arch/powerpc/include/asm/barrier.h21
-rw-r--r--arch/s390/include/asm/barrier.h15
-rw-r--r--arch/s390/include/asm/syscall.h5
-rw-r--r--arch/sparc/include/asm/barrier_64.h15
-rw-r--r--arch/um/include/shared/frame_kern.h8
-rw-r--r--arch/um/kernel/signal.c4
-rw-r--r--arch/um/os-Linux/signal.c8
-rw-r--r--arch/um/os-Linux/skas/process.c10
-rw-r--r--arch/x86/include/asm/barrier.h43
-rw-r--r--arch/x86/include/asm/idle.h7
-rw-r--r--arch/x86/include/asm/syscall.h8
-rw-r--r--arch/x86/kernel/process.c17
-rw-r--r--arch/x86/syscalls/syscall_32.tbl4
-rw-r--r--arch/x86/syscalls/syscall_64.tbl4
-rw-r--r--arch/x86/um/signal.c1
-rw-r--r--block/genhd.c17
-rw-r--r--block/partition-generic.c11
-rw-r--r--crypto/Kconfig49
-rw-r--r--crypto/ablk_helper.c4
-rw-r--r--drivers/Kconfig4
-rw-r--r--drivers/Makefile2
-rw-r--r--drivers/android/Kconfig37
-rw-r--r--drivers/android/Makefile3
-rw-r--r--drivers/android/binder.c (renamed from drivers/staging/android/binder.c)596
-rw-r--r--drivers/android/binder_trace.h (renamed from drivers/staging/android/binder_trace.h)14
-rw-r--r--drivers/base/firmware_class.c46
-rw-r--r--drivers/base/power/main.c79
-rw-r--r--drivers/base/power/wakeup.c32
-rw-r--r--drivers/base/syscore.c3
-rw-r--r--drivers/char/Kconfig17
-rw-r--r--drivers/char/Makefile1
-rw-r--r--drivers/char/dcc_tty.c326
-rw-r--r--drivers/char/mem.c17
-rw-r--r--drivers/cpufreq/Kconfig28
-rw-r--r--drivers/cpufreq/Makefile1
-rw-r--r--drivers/cpufreq/cpufreq.c48
-rw-r--r--drivers/cpufreq/cpufreq_governor.c45
-rw-r--r--drivers/cpufreq/cpufreq_governor.h1
-rw-r--r--drivers/cpufreq/cpufreq_interactive.c1375
-rw-r--r--drivers/cpufreq/cpufreq_stats.c249
-rw-r--r--drivers/cpuidle/governors/menu.c9
-rw-r--r--drivers/gpio/gpiolib.c10
-rw-r--r--drivers/hid/hid-debug.c15
-rw-r--r--drivers/hid/hid-input.c27
-rw-r--r--drivers/hid/hid-multitouch.c24
-rw-r--r--drivers/iio/industrialio-event.c12
-rw-r--r--drivers/input/Kconfig19
-rw-r--r--drivers/input/Makefile3
-rw-r--r--drivers/input/evdev.c53
-rw-r--r--drivers/input/keycombo.c261
-rw-r--r--drivers/input/keyreset.c145
-rw-r--r--drivers/input/misc/Kconfig16
-rw-r--r--drivers/input/misc/Makefile2
-rw-r--r--drivers/input/misc/gpio_axis.c192
-rw-r--r--drivers/input/misc/gpio_event.c228
-rw-r--r--drivers/input/misc/gpio_input.c390
-rw-r--r--drivers/input/misc/gpio_matrix.c441
-rw-r--r--drivers/input/misc/gpio_output.c97
-rw-r--r--drivers/input/misc/keychord.c391
-rw-r--r--drivers/md/dm-verity.c98
-rw-r--r--drivers/misc/Kconfig10
-rw-r--r--drivers/misc/Makefile2
-rw-r--r--drivers/misc/uid_cputime.c237
-rw-r--r--drivers/misc/uid_stat.c152
-rw-r--r--drivers/mmc/card/Kconfig9
-rw-r--r--drivers/mmc/card/block.c36
-rw-r--r--drivers/mmc/core/Kconfig15
-rw-r--r--drivers/mmc/core/core.c110
-rw-r--r--drivers/mmc/core/host.c10
-rw-r--r--drivers/mmc/core/sd.c83
-rw-r--r--drivers/mmc/core/sdio.c111
-rw-r--r--drivers/mmc/core/sdio_bus.c13
-rwxr-xr-x[-rw-r--r--]drivers/mmc/core/sdio_io.c33
-rw-r--r--drivers/mtd/nand/Kconfig10
-rw-r--r--drivers/net/ppp/Kconfig17
-rw-r--r--drivers/net/ppp/Makefile2
-rw-r--r--drivers/net/ppp/pppolac.c449
-rw-r--r--drivers/net/ppp/pppopns.c428
-rw-r--r--drivers/net/tun.c6
-rw-r--r--drivers/net/wireless/Kconfig5
-rw-r--r--drivers/of/fdt.c64
-rw-r--r--drivers/power/power_supply_core.c31
-rw-r--r--drivers/power/power_supply_sysfs.c11
-rw-r--r--drivers/staging/android/Kconfig46
-rw-r--r--drivers/staging/android/Makefile6
-rw-r--r--drivers/staging/android/TODO10
-rw-r--r--drivers/staging/android/alarm-dev.c443
-rw-r--r--drivers/staging/android/android_alarm.h81
-rw-r--r--drivers/staging/android/ashmem.c69
-rw-r--r--drivers/staging/android/ashmem.h30
-rw-r--r--drivers/staging/android/fiq_debugger/Kconfig49
-rw-r--r--drivers/staging/android/fiq_debugger/Makefile4
-rw-r--r--drivers/staging/android/fiq_debugger/fiq_debugger.c1213
-rw-r--r--drivers/staging/android/fiq_debugger/fiq_debugger.h64
-rw-r--r--drivers/staging/android/fiq_debugger/fiq_debugger_arm.c240
-rw-r--r--drivers/staging/android/fiq_debugger/fiq_debugger_arm64.c202
-rw-r--r--drivers/staging/android/fiq_debugger/fiq_debugger_priv.h37
-rw-r--r--drivers/staging/android/fiq_debugger/fiq_debugger_ringbuf.h94
-rw-r--r--drivers/staging/android/fiq_debugger/fiq_watchdog.c56
-rw-r--r--drivers/staging/android/fiq_debugger/fiq_watchdog.h20
-rw-r--r--drivers/staging/android/ion/Kconfig35
-rw-r--r--drivers/staging/android/ion/Makefile10
-rw-r--r--drivers/staging/android/ion/compat_ion.c195
-rw-r--r--drivers/staging/android/ion/compat_ion.h30
-rw-r--r--drivers/staging/android/ion/ion.c1644
-rw-r--r--drivers/staging/android/ion/ion.h204
-rw-r--r--drivers/staging/android/ion/ion_carveout_heap.c194
-rw-r--r--drivers/staging/android/ion/ion_chunk_heap.c195
-rw-r--r--drivers/staging/android/ion/ion_cma_heap.c218
-rw-r--r--drivers/staging/android/ion/ion_dummy_driver.c158
-rw-r--r--drivers/staging/android/ion/ion_heap.c371
-rw-r--r--drivers/staging/android/ion/ion_page_pool.c190
-rw-r--r--drivers/staging/android/ion/ion_priv.h406
-rw-r--r--drivers/staging/android/ion/ion_system_heap.c451
-rw-r--r--drivers/staging/android/ion/ion_test.c282
-rw-r--r--drivers/staging/android/ion/tegra/Makefile1
-rw-r--r--drivers/staging/android/ion/tegra/tegra_ion.c84
-rw-r--r--drivers/staging/android/logger.c851
-rw-r--r--drivers/staging/android/logger.h89
-rw-r--r--drivers/staging/android/lowmemorykiller.c110
-rw-r--r--drivers/staging/android/sw_sync.c2
-rw-r--r--drivers/staging/android/sw_sync.h37
-rw-r--r--drivers/staging/android/sync.c22
-rw-r--r--drivers/staging/android/sync.h86
-rw-r--r--drivers/staging/android/timed_gpio.c1
-rw-r--r--drivers/staging/android/uapi/ashmem.h48
-rw-r--r--drivers/staging/android/uapi/ion.h196
-rw-r--r--drivers/staging/android/uapi/ion_test.h70
-rw-r--r--drivers/staging/android/uapi/sw_sync.h32
-rw-r--r--drivers/staging/android/uapi/sync.h97
-rw-r--r--drivers/switch/Kconfig15
-rw-r--r--drivers/switch/Makefile4
-rw-r--r--drivers/switch/switch_class.c174
-rw-r--r--drivers/switch/switch_gpio.c172
-rw-r--r--drivers/tty/serial/serial_core.c3
-rw-r--r--drivers/usb/gadget/Kconfig21
-rw-r--r--drivers/usb/gadget/Makefile2
-rw-r--r--drivers/usb/gadget/android.c1587
-rw-r--r--drivers/usb/gadget/composite.c10
-rw-r--r--drivers/usb/gadget/f_accessory.c1215
-rw-r--r--drivers/usb/gadget/f_audio_source.c834
-rw-r--r--drivers/usb/gadget/f_fs.c7
-rw-r--r--drivers/usb/gadget/f_midi.c56
-rw-r--r--drivers/usb/gadget/f_mtp.c1287
-rw-r--r--drivers/usb/gadget/f_rndis.c40
-rw-r--r--drivers/usb/gadget/rndis.c122
-rw-r--r--drivers/usb/gadget/rndis.h2
-rw-r--r--drivers/usb/gadget/u_ether.c324
-rw-r--r--drivers/usb/gadget/u_ether.h3
-rw-r--r--drivers/usb/gadget/u_serial.c1
-rw-r--r--drivers/usb/gadget/udc-core.c10
-rw-r--r--drivers/usb/phy/Kconfig8
-rw-r--r--drivers/usb/phy/Makefile1
-rw-r--r--drivers/usb/phy/otg-wakelock.c173
-rw-r--r--drivers/video/Kconfig1
-rw-r--r--drivers/video/Makefile1
-rw-r--r--drivers/video/adf/Kconfig14
-rw-r--r--drivers/video/adf/Makefile15
-rw-r--r--drivers/video/adf/adf.c1188
-rw-r--r--drivers/video/adf/adf.h71
-rw-r--r--drivers/video/adf/adf_client.c811
-rw-r--r--drivers/video/adf/adf_fbdev.c665
-rw-r--r--drivers/video/adf/adf_fops.c957
-rw-r--r--drivers/video/adf/adf_fops.h37
-rw-r--r--drivers/video/adf/adf_fops32.c217
-rw-r--r--drivers/video/adf/adf_fops32.h78
-rw-r--r--drivers/video/adf/adf_format.c280
-rw-r--r--drivers/video/adf/adf_memblock.c160
-rw-r--r--drivers/video/adf/adf_sysfs.c296
-rw-r--r--drivers/video/adf/adf_sysfs.h33
-rw-r--r--drivers/video/adf/adf_trace.h93
-rw-r--r--drivers/w1/masters/ds2482.c47
-rw-r--r--fs/adfs/super.c1
-rw-r--r--fs/affs/super.c1
-rw-r--r--fs/befs/linuxvfs.c1
-rw-r--r--fs/btrfs/super.c1
-rw-r--r--fs/cifs/cifsfs.c1
-rw-r--r--fs/cifs/transport.c2
-rw-r--r--fs/coda/dir.c19
-rw-r--r--fs/coda/inode.c1
-rw-r--r--fs/compat.c43
-rw-r--r--fs/cramfs/inode.c1
-rw-r--r--fs/debugfs/inode.c1
-rw-r--r--fs/devpts/inode.c1
-rw-r--r--fs/ecryptfs/file.c18
-rw-r--r--fs/efs/super.c1
-rw-r--r--fs/eventpoll.c7
-rw-r--r--fs/exec.c6
-rw-r--r--fs/exportfs/expfs.c14
-rw-r--r--fs/ext2/super.c1
-rw-r--r--fs/ext3/super.c2
-rw-r--r--fs/ext4/ext4.h3
-rw-r--r--fs/ext4/ioctl.c6
-rw-r--r--fs/ext4/mballoc.c28
-rw-r--r--fs/ext4/super.c2
-rw-r--r--fs/f2fs/super.c1
-rw-r--r--fs/fat/dir.c11
-rw-r--r--fs/fat/fat.h1
-rw-r--r--fs/fat/inode.c11
-rw-r--r--fs/freevxfs/vxfs_super.c1
-rw-r--r--fs/fs-writeback.c2
-rw-r--r--fs/fuse/dev.c6
-rw-r--r--fs/fuse/inode.c1
-rw-r--r--fs/gfs2/export.c6
-rw-r--r--fs/gfs2/super.c2
-rw-r--r--fs/hfs/super.c1
-rw-r--r--fs/hfsplus/super.c1
-rw-r--r--fs/hpfs/super.c2
-rw-r--r--fs/jffs2/super.c1
-rw-r--r--fs/jfs/super.c1
-rw-r--r--fs/minix/inode.c1
-rw-r--r--fs/ncpfs/inode.c1
-rw-r--r--fs/nfs/inode.c2
-rw-r--r--fs/nfs/nfs3proc.c2
-rw-r--r--fs/nfs/nfs4proc.c4
-rw-r--r--fs/nfs/super.c2
-rw-r--r--fs/nfsd/nfs4recover.c20
-rw-r--r--fs/nfsd/vfs.c9
-rw-r--r--fs/nilfs2/super.c1
-rw-r--r--fs/ntfs/super.c2
-rw-r--r--fs/ocfs2/super.c2
-rw-r--r--fs/openpromfs/inode.c1
-rw-r--r--fs/proc/base.c8
-rw-r--r--fs/proc/root.c2
-rw-r--r--fs/proc/task_mmu.c79
-rw-r--r--fs/pstore/Kconfig10
-rw-r--r--fs/pstore/Makefile2
-rw-r--r--fs/pstore/inode.c18
-rw-r--r--fs/pstore/internal.h6
-rw-r--r--fs/pstore/platform.c1
-rw-r--r--fs/pstore/pmsg.c114
-rw-r--r--fs/pstore/ram.c59
-rw-r--r--fs/qnx4/inode.c1
-rw-r--r--fs/qnx6/inode.c1
-rw-r--r--fs/readdir.c61
-rw-r--r--fs/reiserfs/super.c1
-rw-r--r--fs/romfs/super.c1
-rw-r--r--fs/select.c4
-rw-r--r--fs/seq_file.c30
-rw-r--r--fs/squashfs/Kconfig87
-rw-r--r--fs/squashfs/Makefile6
-rw-r--r--fs/squashfs/block.c41
-rw-r--r--fs/squashfs/cache.c28
-rw-r--r--fs/squashfs/decompressor.c66
-rw-r--r--fs/squashfs/decompressor.h28
-rw-r--r--fs/squashfs/decompressor_multi.c198
-rw-r--r--fs/squashfs/decompressor_multi_percpu.c97
-rw-r--r--fs/squashfs/decompressor_single.c85
-rw-r--r--fs/squashfs/dir.c55
-rw-r--r--fs/squashfs/file.c142
-rw-r--r--fs/squashfs/file_cache.c38
-rw-r--r--fs/squashfs/file_direct.c176
-rw-r--r--fs/squashfs/lz4_wrapper.c142
-rw-r--r--fs/squashfs/lzo_wrapper.c47
-rw-r--r--fs/squashfs/namei.c8
-rw-r--r--fs/squashfs/page_actor.c100
-rw-r--r--fs/squashfs/page_actor.h81
-rw-r--r--fs/squashfs/squashfs.h22
-rw-r--r--fs/squashfs/squashfs_fs.h6
-rw-r--r--fs/squashfs/squashfs_fs_sb.h4
-rw-r--r--fs/squashfs/super.c16
-rw-r--r--fs/squashfs/xz_wrapper.c105
-rw-r--r--fs/squashfs/zlib_wrapper.c64
-rw-r--r--fs/sysv/inode.c1
-rw-r--r--fs/timerfd.c132
-rw-r--r--fs/ubifs/super.c1
-rw-r--r--fs/udf/super.c1
-rw-r--r--fs/ufs/super.c1
-rw-r--r--fs/xfs/xfs_super.c1
-rw-r--r--include/asm-generic/barrier.h15
-rw-r--r--include/asm-generic/seccomp.h29
-rw-r--r--include/asm-generic/syscall.h4
-rw-r--r--include/linux/Kbuild2
-rw-r--r--include/linux/alarmtimer.h4
-rw-r--r--include/linux/amba/mmci.h12
-rw-r--r--include/linux/android_aid.h28
-rw-r--r--include/linux/cgroup.h12
-rw-r--r--include/linux/compiler.h9
-rw-r--r--include/linux/cpu.h7
-rw-r--r--include/linux/cpufreq.h5
-rw-r--r--include/linux/debug_locks.h4
-rw-r--r--include/linux/freezer.h171
-rw-r--r--include/linux/fs.h13
-rw-r--r--include/linux/gpio_event.h170
-rw-r--r--include/linux/hid.h4
-rw-r--r--include/linux/if_pppolac.h23
-rw-r--r--include/linux/if_pppopns.h23
-rw-r--r--include/linux/if_pppox.h21
-rw-r--r--include/linux/ipv6.h2
-rw-r--r--include/linux/kernel.h3
-rw-r--r--include/linux/keychord.h23
-rw-r--r--include/linux/keycombo.h36
-rw-r--r--include/linux/keyreset.h29
-rw-r--r--include/linux/lsm_audit.h7
-rw-r--r--include/linux/mm.h10
-rw-r--r--include/linux/mm_types.h13
-rw-r--r--include/linux/mmc/host.h35
-rw-r--r--include/linux/mmc/pm.h1
-rwxr-xr-x[-rw-r--r--]include/linux/mmc/sdio_func.h10
-rw-r--r--include/linux/netfilter/xt_qtaguid.h13
-rw-r--r--include/linux/netfilter/xt_quota2.h25
-rw-r--r--include/linux/nmi.h5
-rw-r--r--include/linux/of_fdt.h21
-rw-r--r--include/linux/platform_data/ds2482.h21
-rw-r--r--include/linux/power_supply.h10
-rw-r--r--include/linux/pstore.h1
-rw-r--r--include/linux/pstore_ram.h3
-rw-r--r--include/linux/sched.h21
-rw-r--r--include/linux/seccomp.h8
-rw-r--r--include/linux/security.h29
-rw-r--r--include/linux/serial_core.h1
-rw-r--r--include/linux/suspend.h2
-rw-r--r--include/linux/switch.h53
-rw-r--r--include/linux/syscalls.h2
-rw-r--r--include/linux/uid_stat.h29
-rw-r--r--include/linux/usb/f_accessory.h23
-rw-r--r--include/linux/usb/f_mtp.h23
-rw-r--r--include/linux/wakelock.h67
-rw-r--r--include/linux/wakeup_reason.h27
-rw-r--r--include/linux/wifi_tiwlan.h27
-rw-r--r--include/linux/wlan_plat.h30
-rw-r--r--include/net/activity_stats.h25
-rw-r--r--include/net/addrconf.h3
-rw-r--r--include/net/bluetooth/hci.h9
-rw-r--r--include/net/bluetooth/hci_core.h8
-rw-r--r--include/net/bluetooth/sco.h4
-rw-r--r--include/net/cfg80211.h220
-rw-r--r--include/net/fib_rules.h6
-rw-r--r--include/net/flow.h19
-rw-r--r--include/net/inet_sock.h9
-rw-r--r--include/net/ip.h4
-rw-r--r--include/net/ip6_route.h2
-rw-r--r--include/net/ipv6.h12
-rw-r--r--include/net/net_namespace.h9
-rw-r--r--include/net/netns/ipv4.h3
-rw-r--r--include/net/netns/ipv6.h1
-rw-r--r--include/net/ping.h49
-rw-r--r--include/net/route.h5
-rw-r--r--include/net/tcp.h3
-rw-r--r--include/net/transp_v6.h3
-rw-r--r--include/trace/events/cpufreq_interactive.h112
-rw-r--r--include/trace/events/gpu.h143
-rw-r--r--include/trace/events/mmc.h91
-rw-r--r--include/trace/events/power.h19
-rw-r--r--include/uapi/asm-generic/unistd.h12
-rw-r--r--include/uapi/linux/Kbuild1
-rw-r--r--include/uapi/linux/android/Kbuild2
-rw-r--r--include/uapi/linux/android/binder.h (renamed from drivers/staging/android/binder.h)115
-rw-r--r--include/uapi/linux/audit.h2
-rw-r--r--include/uapi/linux/eventpoll.h13
-rw-r--r--include/uapi/linux/fib_rules.h2
-rw-r--r--include/uapi/linux/fs.h2
-rw-r--r--include/uapi/linux/if_pppolac.h33
-rw-r--r--include/uapi/linux/if_pppopns.h32
-rw-r--r--include/uapi/linux/if_pppox.h6
-rw-r--r--include/uapi/linux/input.h43
-rw-r--r--include/uapi/linux/ipv6.h2
-rw-r--r--include/uapi/linux/keychord.h52
-rw-r--r--include/uapi/linux/msdos_fs.h12
-rw-r--r--include/uapi/linux/netfilter/xt_IDLETIMER.h8
-rw-r--r--include/uapi/linux/netfilter/xt_socket.h6
-rw-r--r--include/uapi/linux/nl80211.h130
-rw-r--r--include/uapi/linux/prctl.h9
-rw-r--r--include/uapi/linux/rtnetlink.h1
-rw-r--r--include/uapi/linux/seccomp.h7
-rw-r--r--include/uapi/linux/sockios.h1
-rw-r--r--include/uapi/linux/usb/f_accessory.h146
-rw-r--r--include/uapi/linux/usb/f_mtp.h61
-rw-r--r--include/uapi/sound/compress_params.h10
-rw-r--r--include/uapi/video/adf.h321
-rw-r--r--include/video/adf.h502
-rw-r--r--include/video/adf_client.h61
-rw-r--r--include/video/adf_fbdev.h124
-rw-r--r--include/video/adf_format.h26
-rw-r--r--include/video/adf_memblock.h20
-rw-r--r--init/Kconfig6
-rw-r--r--kernel/cgroup.c52
-rw-r--r--kernel/cpu.c20
-rw-r--r--kernel/debug/debug_core.c12
-rw-r--r--kernel/debug/kdb/kdb_io.c12
-rw-r--r--kernel/exit.c2
-rw-r--r--kernel/fork.c68
-rw-r--r--kernel/freezer.c12
-rw-r--r--kernel/futex.c3
-rw-r--r--kernel/hrtimer.c3
-rw-r--r--kernel/irq/pm.c18
-rw-r--r--kernel/lockdep.c17
-rw-r--r--kernel/panic.c13
-rw-r--r--kernel/power/Kconfig15
-rw-r--r--kernel/power/Makefile3
-rw-r--r--kernel/power/process.c61
-rw-r--r--kernel/power/suspend.c34
-rw-r--r--kernel/power/suspend_time.c111
-rw-r--r--kernel/power/wakelock.c7
-rw-r--r--kernel/power/wakeup_reason.c216
-rw-r--r--kernel/sched/core.c14
-rw-r--r--kernel/seccomp.c409
-rw-r--r--kernel/signal.c2
-rw-r--r--kernel/sys.c178
-rw-r--r--kernel/sys_ni.c3
-rw-r--r--kernel/sysctl.c17
-rw-r--r--kernel/time/alarmtimer.c39
-rw-r--r--kernel/trace/Kconfig3
-rw-r--r--kernel/trace/Makefile1
-rw-r--r--kernel/trace/gpu-traces.c23
-rw-r--r--kernel/trace/trace.c104
-rw-r--r--kernel/trace/trace.h2
-rw-r--r--kernel/trace/trace_functions_graph.c43
-rw-r--r--kernel/trace/trace_output.c182
-rw-r--r--kernel/watchdog.c123
-rw-r--r--lib/Kconfig.debug17
-rw-r--r--mm/madvise.c3
-rw-r--r--mm/memcontrol.c12
-rw-r--r--mm/mempolicy.c2
-rw-r--r--mm/mlock.c7
-rw-r--r--mm/mmap.c44
-rw-r--r--mm/mprotect.c3
-rw-r--r--mm/page_alloc.c35
-rw-r--r--mm/page_io.c50
-rw-r--r--mm/shmem.c13
-rw-r--r--mm/util.c10
-rw-r--r--mm/vmscan.c43
-rw-r--r--net/Kconfig16
-rw-r--r--net/Makefile1
-rw-r--r--net/activity_stats.c119
-rw-r--r--net/bluetooth/af_bluetooth.c34
-rw-r--r--net/bluetooth/amp.c2
-rw-r--r--net/bluetooth/hci_conn.c57
-rwxr-xr-x[-rw-r--r--]net/bluetooth/hci_event.c20
-rw-r--r--net/bluetooth/l2cap_core.c4
-rw-r--r--net/bluetooth/mgmt.c4
-rw-r--r--net/bluetooth/rfcomm/core.c1
-rw-r--r--net/bluetooth/sco.c53
-rw-r--r--net/bridge/br_device.c11
-rw-r--r--net/core/fib_rules.c53
-rw-r--r--net/ipv4/Makefile1
-rw-r--r--net/ipv4/af_inet.c20
-rw-r--r--net/ipv4/devinet.c8
-rw-r--r--net/ipv4/fib_frontend.c3
-rw-r--r--net/ipv4/fib_semantics.c1
-rw-r--r--net/ipv4/icmp.c16
-rw-r--r--net/ipv4/inet_connection_sock.c12
-rw-r--r--net/ipv4/ip_output.c6
-rw-r--r--net/ipv4/ipmr.c2
-rw-r--r--net/ipv4/netfilter/Kconfig12
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c8
-rw-r--r--net/ipv4/netfilter/ipt_rpfilter.c5
-rw-r--r--net/ipv4/ping.c584
-rw-r--r--net/ipv4/raw.c6
-rw-r--r--net/ipv4/route.c32
-rw-r--r--net/ipv4/syncookies.c6
-rw-r--r--net/ipv4/sysctl_net_ipv4.c38
-rw-r--r--net/ipv4/sysfs_net_ipv4.c88
-rw-r--r--net/ipv4/tcp.c118
-rw-r--r--net/ipv4/tcp_input.c5
-rw-r--r--net/ipv4/tcp_ipv4.c1
-rw-r--r--net/ipv4/tcp_output.c7
-rw-r--r--net/ipv4/udp.c3
-rw-r--r--net/ipv6/Makefile2
-rw-r--r--net/ipv6/addrconf.c98
-rw-r--r--net/ipv6/af_inet6.c50
-rw-r--r--net/ipv6/ah6.c2
-rw-r--r--net/ipv6/anycast.c21
-rw-r--r--net/ipv6/datagram.c4
-rw-r--r--net/ipv6/esp6.c2
-rw-r--r--net/ipv6/exthdrs_core.c13
-rw-r--r--net/ipv6/icmp.c27
-rw-r--r--net/ipv6/inet6_connection_sock.c4
-rw-r--r--net/ipv6/ip6mr.c2
-rw-r--r--net/ipv6/ipcomp6.c2
-rw-r--r--net/ipv6/netfilter/Kconfig12
-rw-r--r--net/ipv6/netfilter/ip6t_REJECT.c9
-rw-r--r--net/ipv6/ping.c224
-rw-r--r--net/ipv6/raw.c7
-rw-r--r--net/ipv6/route.c89
-rw-r--r--net/ipv6/syncookies.c5
-rw-r--r--net/ipv6/sysctl_net_ipv6.c7
-rw-r--r--net/ipv6/tcp_ipv6.c3
-rw-r--r--net/ipv6/udp.c7
-rw-r--r--net/netfilter/Kconfig42
-rw-r--r--net/netfilter/Makefile2
-rw-r--r--net/netfilter/xt_IDLETIMER.c247
-rw-r--r--net/netfilter/xt_qtaguid.c3017
-rw-r--r--net/netfilter/xt_qtaguid_internal.h352
-rw-r--r--net/netfilter/xt_qtaguid_print.c566
-rw-r--r--net/netfilter/xt_qtaguid_print.h120
-rw-r--r--net/netfilter/xt_quota2.c385
-rw-r--r--net/netfilter/xt_socket.c70
-rw-r--r--net/rfkill/Kconfig5
-rw-r--r--net/rfkill/core.c4
-rw-r--r--net/sunrpc/sched.c2
-rw-r--r--net/unix/af_unix.c3
-rw-r--r--net/wireless/Kconfig11
-rw-r--r--net/wireless/core.h4
-rw-r--r--net/wireless/nl80211.c337
-rw-r--r--net/wireless/scan.c2
-rw-r--r--net/wireless/sme.c6
-rw-r--r--scripts/Makefile.lib12
-rw-r--r--scripts/Makefile.modinst2
-rw-r--r--scripts/kconfig/confdata.c14
-rw-r--r--scripts/kconfig/expr.h3
-rw-r--r--scripts/kconfig/lkc.h1
-rw-r--r--scripts/kconfig/symbol.c11
-rw-r--r--security/Kconfig2
-rw-r--r--security/apparmor/domain.c4
-rw-r--r--security/apparmor/include/apparmor.h1
-rw-r--r--security/apparmor/lib.c32
-rw-r--r--security/capability.c24
-rw-r--r--security/commoncap.c11
-rw-r--r--security/lsm_audit.c15
-rw-r--r--security/security.c20
-rw-r--r--security/selinux/avc.c437
-rw-r--r--security/selinux/hooks.c183
-rw-r--r--security/selinux/include/avc.h9
-rw-r--r--security/selinux/include/classmap.h1
-rw-r--r--security/selinux/include/netif.h6
-rw-r--r--security/selinux/include/netnode.h2
-rw-r--r--security/selinux/include/netport.h2
-rw-r--r--security/selinux/include/objsec.h2
-rw-r--r--security/selinux/include/security.h35
-rw-r--r--security/selinux/netif.c58
-rw-r--r--security/selinux/netnode.c15
-rw-r--r--security/selinux/netport.c15
-rw-r--r--security/selinux/nlmsgtab.c9
-rw-r--r--security/selinux/ss/avtab.c94
-rw-r--r--security/selinux/ss/avtab.h25
-rw-r--r--security/selinux/ss/conditional.c32
-rw-r--r--security/selinux/ss/conditional.h6
-rw-r--r--security/selinux/ss/constraint.h1
-rw-r--r--security/selinux/ss/policydb.c101
-rw-r--r--security/selinux/ss/policydb.h11
-rw-r--r--security/selinux/ss/services.c202
-rw-r--r--security/selinux/ss/services.h6
-rw-r--r--sound/core/compress_offload.c3
648 files changed, 55451 insertions, 4315 deletions
diff --git a/Documentation/ABI/testing/sysfs-kernel-wakeup_reasons b/Documentation/ABI/testing/sysfs-kernel-wakeup_reasons
new file mode 100644
index 000000000000..acb19b91c192
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-wakeup_reasons
@@ -0,0 +1,16 @@
+What: /sys/kernel/wakeup_reasons/last_resume_reason
+Date: February 2014
+Contact: Ruchi Kandoi <kandoiruchi@google.com>
+Description:
+ The /sys/kernel/wakeup_reasons/last_resume_reason is
+ used to report wakeup reasons after system exited suspend.
+
+What: /sys/kernel/wakeup_reasons/last_suspend_time
+Date: March 2015
+Contact: jinqian <jinqian@google.com>
+Description:
+ The /sys/kernel/wakeup_reasons/last_suspend_time is
+ used to report time spent in last suspend cycle. It contains
+ two numbers (in seconds) separated by space. First number is
+ the time spent in suspend and resume processes. Second number
+ is the time spent in sleep state. \ No newline at end of file
diff --git a/Documentation/android.txt b/Documentation/android.txt
new file mode 100644
index 000000000000..0f40a78b045f
--- /dev/null
+++ b/Documentation/android.txt
@@ -0,0 +1,121 @@
+ =============
+ A N D R O I D
+ =============
+
+Copyright (C) 2009 Google, Inc.
+Written by Mike Chan <mike@android.com>
+
+CONTENTS:
+---------
+
+1. Android
+ 1.1 Required enabled config options
+ 1.2 Required disabled config options
+ 1.3 Recommended enabled config options
+2. Contact
+
+
+1. Android
+==========
+
+Android (www.android.com) is an open source operating system for mobile devices.
+This document describes configurations needed to run the Android framework on
+top of the Linux kernel.
+
+To see a working defconfig look at msm_defconfig or goldfish_defconfig
+which can be found at http://android.git.kernel.org in kernel/common.git
+and kernel/msm.git
+
+
+1.1 Required enabled config options
+-----------------------------------
+After building a standard defconfig, ensure that these options are enabled in
+your .config or defconfig if they are not already. Based off the msm_defconfig.
+You should keep the rest of the default options enabled in the defconfig
+unless you know what you are doing.
+
+ANDROID_PARANOID_NETWORK
+ASHMEM
+CONFIG_FB_MODE_HELPERS
+CONFIG_FONT_8x16
+CONFIG_FONT_8x8
+CONFIG_YAFFS_SHORT_NAMES_IN_RAM
+DAB
+EARLYSUSPEND
+FB
+FB_CFB_COPYAREA
+FB_CFB_FILLRECT
+FB_CFB_IMAGEBLIT
+FB_DEFERRED_IO
+FB_TILEBLITTING
+HIGH_RES_TIMERS
+INOTIFY
+INOTIFY_USER
+INPUT_EVDEV
+INPUT_GPIO
+INPUT_MISC
+LEDS_CLASS
+LEDS_GPIO
+LOCK_KERNEL
+LkOGGER
+LOW_MEMORY_KILLER
+MISC_DEVICES
+NEW_LEDS
+NO_HZ
+POWER_SUPPLY
+PREEMPT
+RAMFS
+RTC_CLASS
+RTC_LIB
+SWITCH
+SWITCH_GPIO
+TMPFS
+UID_STAT
+UID16
+USB_FUNCTION
+USB_FUNCTION_ADB
+USER_WAKELOCK
+VIDEO_OUTPUT_CONTROL
+WAKELOCK
+YAFFS_AUTO_YAFFS2
+YAFFS_FS
+YAFFS_YAFFS1
+YAFFS_YAFFS2
+
+
+1.2 Required disabled config options
+------------------------------------
+CONFIG_YAFFS_DISABLE_LAZY_LOAD
+DNOTIFY
+
+
+1.3 Recommended enabled config options
+------------------------------
+ANDROID_PMEM
+PSTORE_CONSOLE
+PSTORE_RAM
+SCHEDSTATS
+DEBUG_PREEMPT
+DEBUG_MUTEXES
+DEBUG_SPINLOCK_SLEEP
+DEBUG_INFO
+FRAME_POINTER
+CPU_FREQ
+CPU_FREQ_TABLE
+CPU_FREQ_DEFAULT_GOV_ONDEMAND
+CPU_FREQ_GOV_ONDEMAND
+CRC_CCITT
+EMBEDDED
+INPUT_TOUCHSCREEN
+I2C
+I2C_BOARDINFO
+LOG_BUF_SHIFT=17
+SERIAL_CORE
+SERIAL_CORE_CONSOLE
+
+
+2. Contact
+==========
+website: http://android.git.kernel.org
+
+mailing-lists: android-kernel@googlegroups.com
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 638bf17ff869..61dc0ec5c9a9 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -598,6 +598,15 @@ is completely unused; @cgrp->parent is still valid. (Note - can also
be called for a newly-created cgroup if an error occurs after this
subsystem's create() method has been called for the new cgroup).
+int allow_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
+(cgroup_mutex held by caller)
+
+Called prior to moving a task into a cgroup; if the subsystem
+returns an error, this will abort the attach operation. Used
+to extend the permission checks - if all subsystems in a cgroup
+return 0, the attach will be allowed to proceed, even if the
+default permission check (root or same user) fails.
+
int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
(cgroup_mutex held by caller)
diff --git a/Documentation/cpu-freq/governors.txt b/Documentation/cpu-freq/governors.txt
index 219970ba54b7..875eecbed8c8 100644
--- a/Documentation/cpu-freq/governors.txt
+++ b/Documentation/cpu-freq/governors.txt
@@ -28,6 +28,7 @@ Contents:
2.3 Userspace
2.4 Ondemand
2.5 Conservative
+2.6 Interactive
3. The Governor Interface in the CPUfreq Core
@@ -218,6 +219,90 @@ a decision on when to decrease the frequency while running in any
speed. Load for frequency increase is still evaluated every
sampling rate.
+2.6 Interactive
+---------------
+
+The CPUfreq governor "interactive" is designed for latency-sensitive,
+interactive workloads. This governor sets the CPU speed depending on
+usage, similar to "ondemand" and "conservative" governors, but with a
+different set of configurable behaviors.
+
+The tuneable values for this governor are:
+
+target_loads: CPU load values used to adjust speed to influence the
+current CPU load toward that value. In general, the lower the target
+load, the more often the governor will raise CPU speeds to bring load
+below the target. The format is a single target load, optionally
+followed by pairs of CPU speeds and CPU loads to target at or above
+those speeds. Colons can be used between the speeds and associated
+target loads for readability. For example:
+
+ 85 1000000:90 1700000:99
+
+targets CPU load 85% below speed 1GHz, 90% at or above 1GHz, until
+1.7GHz and above, at which load 99% is targeted. If speeds are
+specified these must appear in ascending order. Higher target load
+values are typically specified for higher speeds, that is, target load
+values also usually appear in an ascending order. The default is
+target load 90% for all speeds.
+
+min_sample_time: The minimum amount of time to spend at the current
+frequency before ramping down. Default is 80000 uS.
+
+hispeed_freq: An intermediate "hi speed" at which to initially ramp
+when CPU load hits the value specified in go_hispeed_load. If load
+stays high for the amount of time specified in above_hispeed_delay,
+then speed may be bumped higher. Default is the maximum speed
+allowed by the policy at governor initialization time.
+
+go_hispeed_load: The CPU load at which to ramp to hispeed_freq.
+Default is 99%.
+
+above_hispeed_delay: When speed is at or above hispeed_freq, wait for
+this long before raising speed in response to continued high load.
+The format is a single delay value, optionally followed by pairs of
+CPU speeds and the delay to use at or above those speeds. Colons can
+be used between the speeds and associated delays for readability. For
+example:
+
+ 80000 1300000:200000 1500000:40000
+
+uses delay 80000 uS until CPU speed 1.3 GHz, at which speed delay
+200000 uS is used until speed 1.5 GHz, at which speed (and above)
+delay 40000 uS is used. If speeds are specified these must appear in
+ascending order. Default is 20000 uS.
+
+timer_rate: Sample rate for reevaluating CPU load when the CPU is not
+idle. A deferrable timer is used, such that the CPU will not be woken
+from idle to service this timer until something else needs to run.
+(The maximum time to allow deferring this timer when not running at
+minimum speed is configurable via timer_slack.) Default is 20000 uS.
+
+timer_slack: Maximum additional time to defer handling the governor
+sampling timer beyond timer_rate when running at speeds above the
+minimum. For platforms that consume additional power at idle when
+CPUs are running at speeds greater than minimum, this places an upper
+bound on how long the timer will be deferred prior to re-evaluating
+load and dropping speed. For example, if timer_rate is 20000uS and
+timer_slack is 10000uS then timers will be deferred for up to 30msec
+when not at lowest speed. A value of -1 means defer timers
+indefinitely at all speeds. Default is 80000 uS.
+
+boost: If non-zero, immediately boost speed of all CPUs to at least
+hispeed_freq until zero is written to this attribute. If zero, allow
+CPU speeds to drop below hispeed_freq according to load as usual.
+Default is zero.
+
+boostpulse: On each write, immediately boost speed of all CPUs to
+hispeed_freq for at least the period of time specified by
+boostpulse_duration, after which speeds are allowed to drop below
+hispeed_freq according to load as usual.
+
+boostpulse_duration: Length of time to hold CPU speed at hispeed_freq
+on a write to boostpulse, before allowing speed to drop according to
+load as usual. Default is 80000 uS.
+
+
3. The Governor Interface in the CPUfreq Core
=============================================
diff --git a/Documentation/device-mapper/verity.txt b/Documentation/device-mapper/verity.txt
index 9884681535ee..2929f6b1ccf1 100644
--- a/Documentation/device-mapper/verity.txt
+++ b/Documentation/device-mapper/verity.txt
@@ -10,7 +10,7 @@ Construction Parameters
<version> <dev> <hash_dev>
<data_block_size> <hash_block_size>
<num_data_blocks> <hash_start_block>
- <algorithm> <digest> <salt>
+ <algorithm> <digest> <salt> <mode>
<version>
This is the type of the on-disk hash format.
@@ -62,6 +62,16 @@ Construction Parameters
<salt>
The hexadecimal encoding of the salt value.
+<mode>
+ Optional. The mode of operation.
+
+ 0 is the normal mode of operation where a corrupted block will result in an
+ I/O error.
+
+ 1 is logging mode where corrupted blocks are logged and a uevent is sent to
+ notify user space.
+
+
Theory of operation
===================
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 4db22f6491e0..85a4a033bae7 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -445,3 +445,6 @@ object doesn't exist. It's remote/distributed ones that might care...
[mandatory]
FS_REVAL_DOT is gone; if you used to have it, add ->d_weak_revalidate()
in your dentry operations instead.
+--
+[mandatory]
+ vfs_readdir() is gone; switch to iterate_dir() instead
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 954eab8c7fec..be35913d282b 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -369,6 +369,8 @@ is not associated with a file:
[stack:1001] = the stack of the thread with tid 1001
[vdso] = the "virtual dynamic shared object",
the kernel system call handler
+ [anon:<name>] = an anonymous mapping that has been
+ named by userspace
or if empty, the mapping is anonymous.
@@ -419,6 +421,7 @@ KernelPageSize: 4 kB
MMUPageSize: 4 kB
Locked: 374 kB
VmFlags: rd ex mr mw me de
+Name: name from userspace
the first of these lines shows the same information as is displayed for the
mapping in /proc/PID/maps. The remaining lines show the size of the mapping
@@ -469,6 +472,9 @@ Note that there is no guarantee that every flag and associated mnemonic will
be present in all further kernel releases. Things get changed, the flags may
be vanished or the reverse -- new added.
+The "Name" field will only be present on a mapping that has been named by
+userspace, and will show the name passed in by userspace.
+
This file is only present if the CONFIG_MMU kernel configuration option is
enabled.
@@ -484,6 +490,10 @@ To clear the bits for the file mapped pages associated with the process
> echo 3 > /proc/PID/clear_refs
Any other value written to /proc/PID/clear_refs will have no effect.
+To reset the peak resident set size ("high water mark") to the process's
+current value:
+ > echo 5 > /proc/PID/clear_refs
+
The /proc/pid/pagemap gives the PFN, which can be used to find the pageflags
using /proc/kpageflags and number of times a page is mapped using
/proc/kpagecount. For detailed explanation, see Documentation/vm/pagemap.txt.
diff --git a/Documentation/filesystems/squashfs.txt b/Documentation/filesystems/squashfs.txt
index 403c090aca39..e5274f84dc56 100644
--- a/Documentation/filesystems/squashfs.txt
+++ b/Documentation/filesystems/squashfs.txt
@@ -2,10 +2,10 @@ SQUASHFS 4.0 FILESYSTEM
=======================
Squashfs is a compressed read-only filesystem for Linux.
-It uses zlib/lzo/xz compression to compress files, inodes and directories.
-Inodes in the system are very small and all blocks are packed to minimise
-data overhead. Block sizes greater than 4K are supported up to a maximum
-of 1Mbytes (default block size 128K).
+It uses zlib, lz4, lzo, or xz compression to compress files, inodes and
+directories. Inodes in the system are very small and all blocks are packed to
+minimise data overhead. Block sizes greater than 4K are supported up to a
+maximum of 1Mbytes (default block size 128K).
Squashfs is intended for general read-only filesystem use, for archival
use (i.e. in cases where a .tar.gz file may be used), and in constrained
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index a59ee432a98f..fbfc9e0d75e0 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -22,6 +22,15 @@ ip_no_pmtu_disc - BOOLEAN
min_pmtu - INTEGER
default 552 - minimum discovered Path MTU
+fwmark_reflect - BOOLEAN
+ Controls the fwmark of kernel-generated IPv4 reply packets that are not
+ associated with a socket for example, TCP RSTs or ICMP echo replies).
+ If unset, these packets have a fwmark of zero. If set, they have the
+ fwmark of the packet they are replying to. Similarly affects the fwmark
+ used by internal routing lookups triggered by incoming packets, such as
+ the ones used for Path MTU Discovery.
+ Default: 0
+
route/max_size - INTEGER
Maximum number of routes allowed in the kernel. Increase
this when using large numbers of interfaces and/or routes.
@@ -468,6 +477,16 @@ tcp_fastopen - INTEGER
See include/net/tcp.h and the code for more details.
+tcp_fwmark_accept - BOOLEAN
+ If set, incoming connections to listening sockets that do not have a
+ socket mark will set the mark of the accepting socket to the fwmark of
+ the incoming SYN packet. This will cause all packets on that connection
+ (starting from the first SYNACK) to be sent with that fwmark. The
+ listening socket's mark is unchanged. Listening sockets that already
+ have a fwmark set via setsockopt(SOL_SOCKET, SO_MARK, ...) are
+ unaffected.
+ Default: 0
+
tcp_syn_retries - INTEGER
Number of times initial SYNs for an active TCP connection attempt
will be retransmitted. Should not be higher than 255. Default value
@@ -1093,6 +1112,15 @@ conf/all/forwarding - BOOLEAN
proxy_ndp - BOOLEAN
Do proxy ndp.
+fwmark_reflect - BOOLEAN
+ Controls the fwmark of kernel-generated IPv6 reply packets that are not
+ associated with a socket for example, TCP RSTs or ICMPv6 echo replies).
+ If unset, these packets have a fwmark of zero. If set, they have the
+ fwmark of the packet they are replying to. Similarly affects the fwmark
+ used by internal routing lookups triggered by incoming packets, such as
+ the ones used for Path MTU Discovery.
+ Default: 0
+
conf/interface/*:
Change special settings per interface.
@@ -1311,6 +1339,19 @@ ndisc_notify - BOOLEAN
1 - Generate unsolicited neighbour advertisements when device is brought
up or hardware address changes.
+optimistic_dad - BOOLEAN
+ Whether to perform Optimistic Duplicate Address Detection (RFC 4429).
+ 0: disabled (default)
+ 1: enabled
+
+use_optimistic - BOOLEAN
+ If enabled, do not classify optimistic addresses as deprecated during
+ source address selection. Preferred addresses will still be chosen
+ before optimistic addresses, subject to other ranking in the source
+ address selection algorithm.
+ 0: disabled (default)
+ 1: enabled
+
icmp/*:
ratelimit - INTEGER
Limit the maximal rates for sending ICMPv6 packets.
diff --git a/Documentation/sync.txt b/Documentation/sync.txt
new file mode 100644
index 000000000000..a2d05e7fa193
--- /dev/null
+++ b/Documentation/sync.txt
@@ -0,0 +1,75 @@
+Motivation:
+
+In complicated DMA pipelines such as graphics (multimedia, camera, gpu, display)
+a consumer of a buffer needs to know when the producer has finished producing
+it. Likewise the producer needs to know when the consumer is finished with the
+buffer so it can reuse it. A particular buffer may be consumed by multiple
+consumers which will retain the buffer for different amounts of time. In
+addition, a consumer may consume multiple buffers atomically.
+The sync framework adds an API which allows synchronization between the
+producers and consumers in a generic way while also allowing platforms which
+have shared hardware synchronization primitives to exploit them.
+
+Goals:
+ * provide a generic API for expressing synchronization dependencies
+ * allow drivers to exploit hardware synchronization between hardware
+ blocks
+ * provide a userspace API that allows a compositor to manage
+ dependencies.
+ * provide rich telemetry data to allow debugging slowdowns and stalls of
+ the graphics pipeline.
+
+Objects:
+ * sync_timeline
+ * sync_pt
+ * sync_fence
+
+sync_timeline:
+
+A sync_timeline is an abstract monotonically increasing counter. In general,
+each driver/hardware block context will have one of these. They can be backed
+by the appropriate hardware or rely on the generic sw_sync implementation.
+Timelines are only ever created through their specific implementations
+(i.e. sw_sync.)
+
+sync_pt:
+
+A sync_pt is an abstract value which marks a point on a sync_timeline. Sync_pts
+have a single timeline parent. They have 3 states: active, signaled, and error.
+They start in active state and transition, once, to either signaled (when the
+timeline counter advances beyond the sync_pt’s value) or error state.
+
+sync_fence:
+
+Sync_fences are the primary primitives used by drivers to coordinate
+synchronization of their buffers. They are a collection of sync_pts which may
+or may not have the same timeline parent. A sync_pt can only exist in one fence
+and the fence's list of sync_pts is immutable once created. Fences can be
+waited on synchronously or asynchronously. Two fences can also be merged to
+create a third fence containing a copy of the two fences’ sync_pts. Fences are
+backed by file descriptors to allow userspace to coordinate the display pipeline
+dependencies.
+
+Use:
+
+A driver implementing sync support should have a work submission function which:
+ * takes a fence argument specifying when to begin work
+ * asynchronously queues that work to kick off when the fence is signaled
+ * returns a fence to indicate when its work will be done.
+ * signals the returned fence once the work is completed.
+
+Consider an imaginary display driver that has the following API:
+/*
+ * assumes buf is ready to be displayed.
+ * blocks until the buffer is on screen.
+ */
+ void display_buffer(struct dma_buf *buf);
+
+The new API will become:
+/*
+ * will display buf when fence is signaled.
+ * returns immediately with a fence that will signal when buf
+ * is no longer displayed.
+ */
+struct sync_fence* display_buffer(struct dma_buf *buf,
+ struct sync_fence *fence);
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index dcc75a9ed919..b81fca90f7fe 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -29,6 +29,7 @@ Currently, these files are in /proc/sys/vm:
- dirty_writeback_centisecs
- drop_caches
- extfrag_threshold
+- extra_free_kbytes
- hugepages_treat_as_movable
- hugetlb_shm_group
- laptop_mode
@@ -198,6 +199,21 @@ fragmentation index is <= extfrag_threshold. The default value is 500.
==============================================================
+extra_free_kbytes
+
+This parameter tells the VM to keep extra free memory between the threshold
+where background reclaim (kswapd) kicks in, and the threshold where direct
+reclaim (by allocating processes) kicks in.
+
+This is useful for workloads that require low latency memory allocations
+and have a bounded burstiness in memory allocations, for example a
+realtime application that receives and transmits network traffic
+(causing in-kernel memory allocations) with a maximum total message burst
+size of 200MB may need 200MB of extra free memory to avoid direct reclaim
+related latencies.
+
+==============================================================
+
hugepages_treat_as_movable
This parameter is only useful when kernelcore= is specified at boot time to
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index bfe8c29b1f1d..29064c3bfcd1 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -2013,6 +2013,35 @@ will produce:
1) 1.449 us | }
+You can disable the hierarchical function call formatting and instead print a
+flat list of function entry and return events. This uses the format described
+in the Output Formatting section and respects all the trace options that
+control that formatting. Hierarchical formatting is the default.
+
+ hierachical: echo nofuncgraph-flat > trace_options
+ flat: echo funcgraph-flat > trace_options
+
+ ie:
+
+ # tracer: function_graph
+ #
+ # entries-in-buffer/entries-written: 68355/68355 #P:2
+ #
+ # _-----=> irqs-off
+ # / _----=> need-resched
+ # | / _---=> hardirq/softirq
+ # || / _--=> preempt-depth
+ # ||| / delay
+ # TASK-PID CPU# |||| TIMESTAMP FUNCTION
+ # | | | |||| | |
+ sh-1806 [001] d... 198.843443: graph_ent: func=_raw_spin_lock
+ sh-1806 [001] d... 198.843445: graph_ent: func=__raw_spin_lock
+ sh-1806 [001] d..1 198.843447: graph_ret: func=__raw_spin_lock
+ sh-1806 [001] d..1 198.843449: graph_ret: func=_raw_spin_lock
+ sh-1806 [001] d..1 198.843451: graph_ent: func=_raw_spin_unlock_irqrestore
+ sh-1806 [001] d... 198.843453: graph_ret: func=_raw_spin_unlock_irqrestore
+
+
You might find other useful features for this tracer in the
following "dynamic ftrace" section such as tracing only specific
functions or tasks.
diff --git a/android/configs/README b/android/configs/README
new file mode 100644
index 000000000000..8798731f8904
--- /dev/null
+++ b/android/configs/README
@@ -0,0 +1,15 @@
+The files in this directory are meant to be used as a base for an Android
+kernel config. All devices should have the options in android-base.cfg enabled.
+While not mandatory, the options in android-recommended.cfg enable advanced
+Android features.
+
+Assuming you already have a minimalist defconfig for your device, a possible
+way to enable these options would be:
+
+ ARCH=<arch> scripts/kconfig/merge_config.sh <path_to>/<device>_defconfig android/configs/android-base.cfg android/configs/android-recommended.cfg
+
+This will generate a .config that can then be used to save a new defconfig or
+compile a new kernel with Android features enabled.
+
+Because there is no tool to consistently generate these config fragments,
+lets keep them alphabetically sorted instead of random.
diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg
new file mode 100644
index 000000000000..1bee5d614d1b
--- /dev/null
+++ b/android/configs/android-base.cfg
@@ -0,0 +1,152 @@
+# KEEP ALPHABETICALLY SORTED
+# CONFIG_DEVKMEM is not set
+# CONFIG_DEVMEM is not set
+# CONFIG_INET_LRO is not set
+# CONFIG_MODULES is not set
+# CONFIG_OABI_COMPAT is not set
+CONFIG_ANDROID=y
+CONFIG_ANDROID_BINDER_IPC=y
+CONFIG_ANDROID_LOW_MEMORY_KILLER=y
+CONFIG_ARMV7_COMPAT=y
+CONFIG_ASHMEM=y
+CONFIG_AUDIT=y
+CONFIG_BLK_DEV_DM=y
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_DEBUG=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_DM_CRYPT=y
+CONFIG_DM_VERITY=y
+CONFIG_EMBEDDED=y
+CONFIG_FB=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_INET6_AH=y
+CONFIG_INET6_ESP=y
+CONFIG_INET6_IPCOMP=y
+CONFIG_INET=y
+CONFIG_INET_ESP=y
+CONFIG_INET_XFRM_MODE_TUNNEL=y
+CONFIG_IP6_NF_FILTER=y
+CONFIG_IP6_NF_IPTABLES=y
+CONFIG_IP6_NF_MANGLE=y
+CONFIG_IP6_NF_RAW=y
+CONFIG_IP6_NF_TARGET_REJECT=y
+CONFIG_IP6_NF_TARGET_REJECT_SKERR=y
+CONFIG_IPV6_MIP6=y
+CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_IPV6_OPTIMISTIC_DAD=y
+CONFIG_IPV6_PRIVACY=y
+CONFIG_IPV6_ROUTER_PREF=y
+CONFIG_IPV6_ROUTE_INFO=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_NF_ARPFILTER=y
+CONFIG_IP_NF_ARPTABLES=y
+CONFIG_IP_NF_ARP_MANGLE=y
+CONFIG_IP_NF_FILTER=y
+CONFIG_IP_NF_IPTABLES=y
+CONFIG_IP_NF_MANGLE=y
+CONFIG_IP_NF_MATCH_AH=y
+CONFIG_IP_NF_MATCH_ECN=y
+CONFIG_IP_NF_MATCH_TTL=y
+CONFIG_IP_NF_RAW=y
+CONFIG_IP_NF_SECURITY=y
+CONFIG_IP_NF_TARGET_MASQUERADE=y
+CONFIG_IP_NF_TARGET_NETMAP=y
+CONFIG_IP_NF_TARGET_REDIRECT=y
+CONFIG_IP_NF_TARGET_REJECT=y
+CONFIG_IP_NF_TARGET_REJECT_SKERR=y
+CONFIG_NET=y
+CONFIG_NETDEVICES=y
+CONFIG_NETFILTER=y
+CONFIG_NETFILTER_TPROXY=y
+CONFIG_NETFILTER_XT_MATCH_COMMENT=y
+CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_CONNMARK=y
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
+CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_HELPER=y
+CONFIG_NETFILTER_XT_MATCH_IPRANGE=y
+CONFIG_NETFILTER_XT_MATCH_LENGTH=y
+CONFIG_NETFILTER_XT_MATCH_LIMIT=y
+CONFIG_NETFILTER_XT_MATCH_MAC=y
+CONFIG_NETFILTER_XT_MATCH_MARK=y
+CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y
+CONFIG_NETFILTER_XT_MATCH_POLICY=y
+CONFIG_NETFILTER_XT_MATCH_QTAGUID=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA2=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA=y
+CONFIG_NETFILTER_XT_MATCH_SOCKET=y
+CONFIG_NETFILTER_XT_MATCH_STATE=y
+CONFIG_NETFILTER_XT_MATCH_STATISTIC=y
+CONFIG_NETFILTER_XT_MATCH_STRING=y
+CONFIG_NETFILTER_XT_MATCH_TIME=y
+CONFIG_NETFILTER_XT_MATCH_U32=y
+CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y
+CONFIG_NETFILTER_XT_TARGET_CONNMARK=y
+CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
+CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y
+CONFIG_NETFILTER_XT_TARGET_MARK=y
+CONFIG_NETFILTER_XT_TARGET_NFLOG=y
+CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y
+CONFIG_NETFILTER_XT_TARGET_SECMARK=y
+CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
+CONFIG_NETFILTER_XT_TARGET_TPROXY=y
+CONFIG_NETFILTER_XT_TARGET_TRACE=y
+CONFIG_NET_CLS_ACT=y
+CONFIG_NET_CLS_U32=y
+CONFIG_NET_EMATCH=y
+CONFIG_NET_EMATCH_U32=y
+CONFIG_NET_KEY=y
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_HTB=y
+CONFIG_NF_CONNTRACK=y
+CONFIG_NF_CONNTRACK_AMANDA=y
+CONFIG_NF_CONNTRACK_EVENTS=y
+CONFIG_NF_CONNTRACK_FTP=y
+CONFIG_NF_CONNTRACK_H323=y
+CONFIG_NF_CONNTRACK_IPV4=y
+CONFIG_NF_CONNTRACK_IPV6=y
+CONFIG_NF_CONNTRACK_IRC=y
+CONFIG_NF_CONNTRACK_NETBIOS_NS=y
+CONFIG_NF_CONNTRACK_PPTP=y
+CONFIG_NF_CONNTRACK_SANE=y
+CONFIG_NF_CONNTRACK_SECMARK=y
+CONFIG_NF_CONNTRACK_TFTP=y
+CONFIG_NF_CT_NETLINK=y
+CONFIG_NF_CT_PROTO_DCCP=y
+CONFIG_NF_CT_PROTO_SCTP=y
+CONFIG_NF_CT_PROTO_UDPLITE=y
+CONFIG_NF_NAT=y
+CONFIG_NO_HZ=y
+CONFIG_PACKET=y
+CONFIG_PM_AUTOSLEEP=y
+CONFIG_PM_WAKELOCKS=y
+CONFIG_PPP=y
+CONFIG_PPPOLAC=y
+CONFIG_PPPOPNS=y
+CONFIG_PPP_BSDCOMP=y
+CONFIG_PPP_DEFLATE=y
+CONFIG_PPP_MPPE=y
+CONFIG_PREEMPT=y
+CONFIG_RESOURCE_COUNTERS=y
+CONFIG_RTC_CLASS=y
+CONFIG_RT_GROUP_SCHED=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_SELINUX=y
+CONFIG_SND=y
+CONFIG_SOUND=y
+CONFIG_STAGING=y
+CONFIG_SWITCH=y
+CONFIG_SYNC=y
+CONFIG_SYSVIPC=y
+CONFIG_TUN=y
+CONFIG_UNIX=y
+CONFIG_USB_GADGET=y
+CONFIG_USB_G_ANDROID=y
+CONFIG_USB_OTG_WAKELOCK=y
+CONFIG_XFRM_USER=y
diff --git a/android/configs/android-recommended.cfg b/android/configs/android-recommended.cfg
new file mode 100644
index 000000000000..960b9de2860d
--- /dev/null
+++ b/android/configs/android-recommended.cfg
@@ -0,0 +1,121 @@
+# KEEP ALPHABETICALLY SORTED
+# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
+# CONFIG_INPUT_MOUSE is not set
+# CONFIG_LEGACY_PTYS is not set
+# CONFIG_NF_CONNTRACK_SIP is not set
+# CONFIG_PM_WAKELOCKS_GC is not set
+# CONFIG_VT is not set
+CONFIG_ANDROID_TIMED_GPIO=y
+CONFIG_BACKLIGHT_LCD_SUPPORT=y
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=8192
+CONFIG_COMPACTION=y
+CONFIG_DM_UEVENT=y
+CONFIG_DRAGONRISE_FF=y
+CONFIG_ENABLE_DEFAULT_TRACERS=y
+CONFIG_EXT4_FS=y
+CONFIG_EXT4_FS_SECURITY=y
+CONFIG_FUSE_FS=y
+CONFIG_GREENASIA_FF=y
+CONFIG_HIDRAW=y
+CONFIG_HID_A4TECH=y
+CONFIG_HID_ACRUX=y
+CONFIG_HID_ACRUX_FF=y
+CONFIG_HID_APPLE=y
+CONFIG_HID_BELKIN=y
+CONFIG_HID_CHERRY=y
+CONFIG_HID_CHICONY=y
+CONFIG_HID_CYPRESS=y
+CONFIG_HID_DRAGONRISE=y
+CONFIG_HID_ELECOM=y
+CONFIG_HID_EMS_FF=y
+CONFIG_HID_EZKEY=y
+CONFIG_HID_GREENASIA=y
+CONFIG_HID_GYRATION=y
+CONFIG_HID_HOLTEK=y
+CONFIG_HID_KENSINGTON=y
+CONFIG_HID_KEYTOUCH=y
+CONFIG_HID_KYE=y
+CONFIG_HID_LCPOWER=y
+CONFIG_HID_LOGITECH=y
+CONFIG_HID_LOGITECH_DJ=y
+CONFIG_HID_MAGICMOUSE=y
+CONFIG_HID_MICROSOFT=y
+CONFIG_HID_MONTEREY=y
+CONFIG_HID_MULTITOUCH=y
+CONFIG_HID_NTRIG=y
+CONFIG_HID_ORTEK=y
+CONFIG_HID_PANTHERLORD=y
+CONFIG_HID_PETALYNX=y
+CONFIG_HID_PICOLCD=y
+CONFIG_HID_PRIMAX=y
+CONFIG_HID_PRODIKEYS=y
+CONFIG_HID_ROCCAT=y
+CONFIG_HID_SAITEK=y
+CONFIG_HID_SAMSUNG=y
+CONFIG_HID_SMARTJOYPLUS=y
+CONFIG_HID_SONY=y
+CONFIG_HID_SPEEDLINK=y
+CONFIG_HID_SUNPLUS=y
+CONFIG_HID_THRUSTMASTER=y
+CONFIG_HID_TIVO=y
+CONFIG_HID_TOPSEED=y
+CONFIG_HID_TWINHAN=y
+CONFIG_HID_UCLOGIC=y
+CONFIG_HID_WACOM=y
+CONFIG_HID_WALTOP=y
+CONFIG_HID_WIIMOTE=y
+CONFIG_HID_ZEROPLUS=y
+CONFIG_HID_ZYDACRON=y
+CONFIG_INPUT_EVDEV=y
+CONFIG_INPUT_GPIO=y
+CONFIG_INPUT_JOYSTICK=y
+CONFIG_INPUT_KEYCHORD=y
+CONFIG_INPUT_KEYRESET=y
+CONFIG_INPUT_MISC=y
+CONFIG_INPUT_TABLET=y
+CONFIG_INPUT_UINPUT=y
+CONFIG_ION=y
+CONFIG_JOYSTICK_XPAD=y
+CONFIG_JOYSTICK_XPAD_FF=y
+CONFIG_JOYSTICK_XPAD_LEDS=y
+CONFIG_KALLSYMS_ALL=y
+CONFIG_KSM=y
+CONFIG_LOGIG940_FF=y
+CONFIG_LOGIRUMBLEPAD2_FF=y
+CONFIG_LOGITECH_FF=y
+CONFIG_MD=y
+CONFIG_MEDIA_SUPPORT=y
+CONFIG_MSDOS_FS=y
+CONFIG_PANIC_TIMEOUT=5
+CONFIG_PANTHERLORD_FF=y
+CONFIG_PERF_EVENTS=y
+CONFIG_PM_DEBUG=y
+CONFIG_PM_RUNTIME=y
+CONFIG_PM_WAKELOCKS_LIMIT=0
+CONFIG_POWER_SUPPLY=y
+CONFIG_PSTORE=y
+CONFIG_PSTORE_CONSOLE=y
+CONFIG_PSTORE_RAM=y
+CONFIG_SCHEDSTATS=y
+CONFIG_SMARTJOYPLUS_FF=y
+CONFIG_SND=y
+CONFIG_SOUND=y
+CONFIG_SUSPEND_TIME=y
+CONFIG_TABLET_USB_ACECAD=y
+CONFIG_TABLET_USB_AIPTEK=y
+CONFIG_TABLET_USB_GTCO=y
+CONFIG_TABLET_USB_HANWANG=y
+CONFIG_TABLET_USB_KBTAB=y
+CONFIG_TABLET_USB_WACOM=y
+CONFIG_TIMER_STATS=y
+CONFIG_TMPFS=y
+CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_UHID=y
+CONFIG_UID_STAT=y
+CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
+CONFIG_USB_EHCI_HCD=y
+CONFIG_USB_HIDDEV=y
+CONFIG_USB_USBNET=y
+CONFIG_VFAT_FS=y
diff --git a/arch/Kconfig b/arch/Kconfig
index 00e3702ec79b..4c0a1d03ae0d 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -331,6 +331,7 @@ config HAVE_ARCH_SECCOMP_FILTER
- secure_computing is called from a ptrace_event()-safe context
- secure_computing return value is checked and a return value of -1
results in the system call being skipped immediately.
+ - seccomp syscall wired up
config SECCOMP_FILTER
def_bool y
diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index b9e37ad6fa19..1402fcc11c2c 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -96,6 +96,7 @@ struct osf_dirent {
};
struct osf_dirent_callback {
+ struct dir_context ctx;
struct osf_dirent __user *dirent;
long __user *basep;
unsigned int count;
@@ -146,17 +147,17 @@ SYSCALL_DEFINE4(osf_getdirentries, unsigned int, fd,
{
int error;
struct fd arg = fdget(fd);
- struct osf_dirent_callback buf;
+ struct osf_dirent_callback buf = {
+ .ctx.actor = osf_filldir,
+ .dirent = dirent,
+ .basep = basep,
+ .count = count
+ };
if (!arg.file)
return -EBADF;
- buf.dirent = dirent;
- buf.basep = basep;
- buf.count = count;
- buf.error = 0;
-
- error = vfs_readdir(arg.file, osf_filldir, &buf);
+ error = iterate_dir(arg.file, &buf.ctx);
if (error >= 0)
error = buf.error;
if (count != buf.count)
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 9422f298f32f..1a86004f3f27 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1958,6 +1958,15 @@ config XEN
help
Say Y if you want to run Linux in a Virtual Machine on Xen on ARM.
+config ARM_FLUSH_CONSOLE_ON_RESTART
+ bool "Force flush the console on restart"
+ help
+ If the console is locked while the system is rebooted, the messages
+ in the temporary logbuffer would not have propogated to all the
+ console drivers. This option forces the console lock to be
+ released if it failed to be acquired, which will cause all the
+ pending messages to be flushed.
+
endmenu
menu "Boot options"
@@ -1987,6 +1996,21 @@ config DEPRECATED_PARAM_STRUCT
This was deprecated in 2001 and announced to live on for 5 years.
Some old boot loaders still use this way.
+config BUILD_ARM_APPENDED_DTB_IMAGE
+ bool "Build a concatenated zImage/dtb by default"
+ depends on OF
+ help
+ Enabling this option will cause a concatenated zImage and DTB to
+ be built by default (instead of a standalone zImage.) The image
+ will built in arch/arm/boot/zImage-dtb.<dtb name>
+
+config BUILD_ARM_APPENDED_DTB_IMAGE_NAME
+ string "Default dtb name"
+ depends on BUILD_ARM_APPENDED_DTB_IMAGE
+ help
+ name of the dtb to append when building a concatenated
+ zImage/dtb.
+
# Compressed boot loader in ROM. Yes, we really want to ask about
# TEXT and BSS so we preserve their values in the config files.
config ZBOOT_ROM_TEXT
@@ -2336,6 +2360,13 @@ config NEON
Say Y to include support code for NEON, the ARMv7 Advanced SIMD
Extension.
+config KERNEL_MODE_NEON
+ bool "Support for NEON in kernel mode"
+ default n
+ depends on NEON
+ help
+ Say Y to include support for NEON in kernel mode.
+
endmenu
menu "Userspace binary formats"
diff --git a/arch/arm/Kconfig.debug b/arch/arm/Kconfig.debug
index 5fdb6dbc5f89..4d1793474c9c 100644
--- a/arch/arm/Kconfig.debug
+++ b/arch/arm/Kconfig.debug
@@ -63,6 +63,27 @@ config DEBUG_USER
8 - SIGSEGV faults
16 - SIGBUS faults
+config DEBUG_RODATA
+ bool "Write protect kernel text section"
+ default n
+ depends on DEBUG_KERNEL && MMU
+ ---help---
+ Mark the kernel text section as write-protected in the pagetables,
+ in order to catch accidental (and incorrect) writes to such const
+ data. This will cause the size of the kernel, plus up to 4MB, to
+ be mapped as pages instead of sections, which will increase TLB
+ pressure.
+ If in doubt, say "N".
+
+config DEBUG_RODATA_TEST
+ bool "Testcase for the DEBUG_RODATA feature"
+ depends on DEBUG_RODATA
+ default n
+ ---help---
+ This option enables a testcase for the DEBUG_RODATA
+ feature.
+ If in doubt, say "N"
+
# These options are only for real kernel hackers who want to get their hands dirty.
config DEBUG_LL
bool "Kernel low-level debugging functions (read help!)"
diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index 70bc19e2274f..9d36200374f0 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -265,6 +265,8 @@ libs-y := arch/arm/lib/ $(libs-y)
# Default target when executing plain make
ifeq ($(CONFIG_XIP_KERNEL),y)
KBUILD_IMAGE := xipImage
+else ifeq ($(CONFIG_BUILD_ARM_APPENDED_DTB_IMAGE),y)
+KBUILD_IMAGE := zImage-dtb.$(CONFIG_BUILD_ARM_APPENDED_DTB_IMAGE_NAME)
else
KBUILD_IMAGE := zImage
endif
diff --git a/arch/arm/boot/Makefile b/arch/arm/boot/Makefile
index 84aa2caf07ed..085bb96493a3 100644
--- a/arch/arm/boot/Makefile
+++ b/arch/arm/boot/Makefile
@@ -14,6 +14,7 @@
ifneq ($(MACHINE),)
include $(srctree)/$(MACHINE)/Makefile.boot
endif
+include $(srctree)/arch/arm/boot/dts/Makefile
# Note: the following conditions must always be true:
# ZRELADDR == virt_to_phys(PAGE_OFFSET + TEXT_OFFSET)
diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S
index f6e34be012ff..a8264aa9b03a 100644
--- a/arch/arm/boot/compressed/head.S
+++ b/arch/arm/boot/compressed/head.S
@@ -714,6 +714,8 @@ __armv7_mmu_cache_on:
bic r6, r6, #1 << 31 @ 32-bit translation system
bic r6, r6, #3 << 0 @ use only ttbr0
mcrne p15, 0, r3, c2, c0, 0 @ load page table pointer
+ mcrne p15, 0, r0, c8, c7, 0 @ flush I,D TLBs
+ mcr p15, 0, r0, c7, c5, 4 @ ISB
mcrne p15, 0, r1, c3, c0, 0 @ load domain access control
mcrne p15, 0, r6, c2, c0, 2 @ load ttb control
#endif
diff --git a/arch/arm/common/Kconfig b/arch/arm/common/Kconfig
index 9353184d730d..ce01364a96e3 100644
--- a/arch/arm/common/Kconfig
+++ b/arch/arm/common/Kconfig
@@ -17,3 +17,7 @@ config SHARP_PARAM
config SHARP_SCOOP
bool
+
+config FIQ_GLUE
+ bool
+ select FIQ
diff --git a/arch/arm/common/Makefile b/arch/arm/common/Makefile
index 462cd580fc2d..505c479202b6 100644
--- a/arch/arm/common/Makefile
+++ b/arch/arm/common/Makefile
@@ -4,6 +4,7 @@
obj-y += firmware.o
+obj-$(CONFIG_FIQ_GLUE) += fiq_glue.o fiq_glue_setup.o
obj-$(CONFIG_ICST) += icst.o
obj-$(CONFIG_SA1111) += sa1111.o
obj-$(CONFIG_PCI_HOST_VIA82C505) += via82c505.o
diff --git a/arch/arm/common/fiq_glue.S b/arch/arm/common/fiq_glue.S
new file mode 100644
index 000000000000..24b42cec4813
--- /dev/null
+++ b/arch/arm/common/fiq_glue.S
@@ -0,0 +1,118 @@
+/*
+ * Copyright (C) 2008 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .text
+
+ .global fiq_glue_end
+
+ /* fiq stack: r0-r15,cpsr,spsr of interrupted mode */
+
+ENTRY(fiq_glue)
+ /* store pc, cpsr from previous mode, reserve space for spsr */
+ mrs r12, spsr
+ sub lr, lr, #4
+ subs r10, #1
+ bne nested_fiq
+
+ str r12, [sp, #-8]!
+ str lr, [sp, #-4]!
+
+ /* store r8-r14 from previous mode */
+ sub sp, sp, #(7 * 4)
+ stmia sp, {r8-r14}^
+ nop
+
+ /* store r0-r7 from previous mode */
+ stmfd sp!, {r0-r7}
+
+ /* setup func(data,regs) arguments */
+ mov r0, r9
+ mov r1, sp
+ mov r3, r8
+
+ mov r7, sp
+
+ /* Get sp and lr from non-user modes */
+ and r4, r12, #MODE_MASK
+ cmp r4, #USR_MODE
+ beq fiq_from_usr_mode
+
+ mov r7, sp
+ orr r4, r4, #(PSR_I_BIT | PSR_F_BIT)
+ msr cpsr_c, r4
+ str sp, [r7, #(4 * 13)]
+ str lr, [r7, #(4 * 14)]
+ mrs r5, spsr
+ str r5, [r7, #(4 * 17)]
+
+ cmp r4, #(SVC_MODE | PSR_I_BIT | PSR_F_BIT)
+ /* use fiq stack if we reenter this mode */
+ subne sp, r7, #(4 * 3)
+
+fiq_from_usr_mode:
+ msr cpsr_c, #(SVC_MODE | PSR_I_BIT | PSR_F_BIT)
+ mov r2, sp
+ sub sp, r7, #12
+ stmfd sp!, {r2, ip, lr}
+ /* call func(data,regs) */
+ blx r3
+ ldmfd sp, {r2, ip, lr}
+ mov sp, r2
+
+ /* restore/discard saved state */
+ cmp r4, #USR_MODE
+ beq fiq_from_usr_mode_exit
+
+ msr cpsr_c, r4
+ ldr sp, [r7, #(4 * 13)]
+ ldr lr, [r7, #(4 * 14)]
+ msr spsr_cxsf, r5
+
+fiq_from_usr_mode_exit:
+ msr cpsr_c, #(FIQ_MODE | PSR_I_BIT | PSR_F_BIT)
+
+ ldmfd sp!, {r0-r7}
+ ldr lr, [sp, #(4 * 7)]
+ ldr r12, [sp, #(4 * 8)]
+ add sp, sp, #(10 * 4)
+exit_fiq:
+ msr spsr_cxsf, r12
+ add r10, #1
+ cmp r11, #0
+ moveqs pc, lr
+ bx r11 /* jump to custom fiq return function */
+
+nested_fiq:
+ orr r12, r12, #(PSR_F_BIT)
+ b exit_fiq
+
+fiq_glue_end:
+
+ENTRY(fiq_glue_setup) /* func, data, sp, smc call number */
+ stmfd sp!, {r4}
+ mrs r4, cpsr
+ msr cpsr_c, #(FIQ_MODE | PSR_I_BIT | PSR_F_BIT)
+ movs r8, r0
+ mov r9, r1
+ mov sp, r2
+ mov r11, r3
+ moveq r10, #0
+ movne r10, #1
+ msr cpsr_c, r4
+ ldmfd sp!, {r4}
+ bx lr
+
diff --git a/arch/arm/common/fiq_glue_setup.c b/arch/arm/common/fiq_glue_setup.c
new file mode 100644
index 000000000000..8cb1b611c6d5
--- /dev/null
+++ b/arch/arm/common/fiq_glue_setup.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (C) 2010 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <asm/fiq.h>
+#include <asm/fiq_glue.h>
+
+extern unsigned char fiq_glue, fiq_glue_end;
+extern void fiq_glue_setup(void *func, void *data, void *sp,
+ fiq_return_handler_t fiq_return_handler);
+
+static struct fiq_handler fiq_debbuger_fiq_handler = {
+ .name = "fiq_glue",
+};
+DEFINE_PER_CPU(void *, fiq_stack);
+static struct fiq_glue_handler *current_handler;
+static fiq_return_handler_t fiq_return_handler;
+static DEFINE_MUTEX(fiq_glue_lock);
+
+static void fiq_glue_setup_helper(void *info)
+{
+ struct fiq_glue_handler *handler = info;
+ fiq_glue_setup(handler->fiq, handler,
+ __get_cpu_var(fiq_stack) + THREAD_START_SP,
+ fiq_return_handler);
+}
+
+int fiq_glue_register_handler(struct fiq_glue_handler *handler)
+{
+ int ret;
+ int cpu;
+
+ if (!handler || !handler->fiq)
+ return -EINVAL;
+
+ mutex_lock(&fiq_glue_lock);
+ if (fiq_stack) {
+ ret = -EBUSY;
+ goto err_busy;
+ }
+
+ for_each_possible_cpu(cpu) {
+ void *stack;
+ stack = (void *)__get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER);
+ if (WARN_ON(!stack)) {
+ ret = -ENOMEM;
+ goto err_alloc_fiq_stack;
+ }
+ per_cpu(fiq_stack, cpu) = stack;
+ }
+
+ ret = claim_fiq(&fiq_debbuger_fiq_handler);
+ if (WARN_ON(ret))
+ goto err_claim_fiq;
+
+ current_handler = handler;
+ on_each_cpu(fiq_glue_setup_helper, handler, true);
+ set_fiq_handler(&fiq_glue, &fiq_glue_end - &fiq_glue);
+
+ mutex_unlock(&fiq_glue_lock);
+ return 0;
+
+err_claim_fiq:
+err_alloc_fiq_stack:
+ for_each_possible_cpu(cpu) {
+ __free_pages(per_cpu(fiq_stack, cpu), THREAD_SIZE_ORDER);
+ per_cpu(fiq_stack, cpu) = NULL;
+ }
+err_busy:
+ mutex_unlock(&fiq_glue_lock);
+ return ret;
+}
+
+static void fiq_glue_update_return_handler(void (*fiq_return)(void))
+{
+ fiq_return_handler = fiq_return;
+ if (current_handler)
+ on_each_cpu(fiq_glue_setup_helper, current_handler, true);
+}
+
+int fiq_glue_set_return_handler(void (*fiq_return)(void))
+{
+ int ret;
+
+ mutex_lock(&fiq_glue_lock);
+ if (fiq_return_handler) {
+ ret = -EBUSY;
+ goto err_busy;
+ }
+ fiq_glue_update_return_handler(fiq_return);
+ ret = 0;
+err_busy:
+ mutex_unlock(&fiq_glue_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL(fiq_glue_set_return_handler);
+
+int fiq_glue_clear_return_handler(void (*fiq_return)(void))
+{
+ int ret;
+
+ mutex_lock(&fiq_glue_lock);
+ if (WARN_ON(fiq_return_handler != fiq_return)) {
+ ret = -EINVAL;
+ goto err_inval;
+ }
+ fiq_glue_update_return_handler(NULL);
+ ret = 0;
+err_inval:
+ mutex_unlock(&fiq_glue_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL(fiq_glue_clear_return_handler);
+
+/**
+ * fiq_glue_resume - Restore fiqs after suspend or low power idle states
+ *
+ * This must be called before calling local_fiq_enable after returning from a
+ * power state where the fiq mode registers were lost. If a driver provided
+ * a resume hook when it registered the handler it will be called.
+ */
+
+void fiq_glue_resume(void)
+{
+ if (!current_handler)
+ return;
+ fiq_glue_setup(current_handler->fiq, current_handler,
+ __get_cpu_var(fiq_stack) + THREAD_START_SP,
+ fiq_return_handler);
+ if (current_handler->resume)
+ current_handler->resume(current_handler);
+}
+
diff --git a/arch/arm/crypto/.gitignore b/arch/arm/crypto/.gitignore
new file mode 100644
index 000000000000..6231d36b3635
--- /dev/null
+++ b/arch/arm/crypto/.gitignore
@@ -0,0 +1 @@
+aesbs-core.S
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index a2c83851bc90..2cee53b27238 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -3,7 +3,27 @@
#
obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
+obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
+obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
+obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
+obj-$(CONFIG_CRYPTO_SHA512_ARM_NEON) += sha512-arm-neon.o
-aes-arm-y := aes-armv4.o aes_glue.o
-sha1-arm-y := sha1-armv4-large.o sha1_glue.o
+aes-arm-y := aes-armv4.o aes_glue.o
+aes-arm-bs-y := aesbs-core.o aesbs-glue.o
+sha1-arm-y := sha1-armv4-large.o sha1_glue.o
+sha1-arm-neon-y := sha1-armv7-neon.o sha1_neon_glue.o
+sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
+sha256-arm-y := sha256-core.o sha256_glue.o $(sha256-arm-neon-y)
+sha512-arm-neon-y := sha512-armv7-neon.o sha512_neon_glue.o
+
+quiet_cmd_perl = PERL $@
+ cmd_perl = $(PERL) $(<) > $(@)
+
+$(src)/aesbs-core.S_shipped: $(src)/bsaes-armv7.pl
+ $(call cmd,perl)
+
+$(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
+ $(call cmd,perl)
+
+.PRECIOUS: $(obj)/aesbs-core.S $(obj)/sha256-core.S
diff --git a/arch/arm/crypto/aes_glue.c b/arch/arm/crypto/aes_glue.c
index e73ec2ab1316..0409b8f89782 100644
--- a/arch/arm/crypto/aes_glue.c
+++ b/arch/arm/crypto/aes_glue.c
@@ -6,22 +6,12 @@
#include <linux/crypto.h>
#include <crypto/aes.h>
-#define AES_MAXNR 14
+#include "aes_glue.h"
-typedef struct {
- unsigned int rd_key[4 *(AES_MAXNR + 1)];
- int rounds;
-} AES_KEY;
-
-struct AES_CTX {
- AES_KEY enc_key;
- AES_KEY dec_key;
-};
-
-asmlinkage void AES_encrypt(const u8 *in, u8 *out, AES_KEY *ctx);
-asmlinkage void AES_decrypt(const u8 *in, u8 *out, AES_KEY *ctx);
-asmlinkage int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key);
-asmlinkage int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key);
+EXPORT_SYMBOL(AES_encrypt);
+EXPORT_SYMBOL(AES_decrypt);
+EXPORT_SYMBOL(private_AES_set_encrypt_key);
+EXPORT_SYMBOL(private_AES_set_decrypt_key);
static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
@@ -81,7 +71,7 @@ static struct crypto_alg aes_alg = {
.cipher = {
.cia_min_keysize = AES_MIN_KEY_SIZE,
.cia_max_keysize = AES_MAX_KEY_SIZE,
- .cia_setkey = aes_set_key,
+ .cia_setkey = aes_set_key,
.cia_encrypt = aes_encrypt,
.cia_decrypt = aes_decrypt
}
diff --git a/arch/arm/crypto/aes_glue.h b/arch/arm/crypto/aes_glue.h
new file mode 100644
index 000000000000..cca3e51eb606
--- /dev/null
+++ b/arch/arm/crypto/aes_glue.h
@@ -0,0 +1,19 @@
+
+#define AES_MAXNR 14
+
+struct AES_KEY {
+ unsigned int rd_key[4 * (AES_MAXNR + 1)];
+ int rounds;
+};
+
+struct AES_CTX {
+ struct AES_KEY enc_key;
+ struct AES_KEY dec_key;
+};
+
+asmlinkage void AES_encrypt(const u8 *in, u8 *out, struct AES_KEY *ctx);
+asmlinkage void AES_decrypt(const u8 *in, u8 *out, struct AES_KEY *ctx);
+asmlinkage int private_AES_set_decrypt_key(const unsigned char *userKey,
+ const int bits, struct AES_KEY *key);
+asmlinkage int private_AES_set_encrypt_key(const unsigned char *userKey,
+ const int bits, struct AES_KEY *key);
diff --git a/arch/arm/crypto/aesbs-core.S_shipped b/arch/arm/crypto/aesbs-core.S_shipped
new file mode 100644
index 000000000000..71e5fc7cfb18
--- /dev/null
+++ b/arch/arm/crypto/aesbs-core.S_shipped
@@ -0,0 +1,2544 @@
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Specific modes and adaptation for Linux kernel by Ard Biesheuvel
+@ <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
+@ granted.
+@ ====================================================================
+
+@ Bit-sliced AES for ARM NEON
+@
+@ February 2012.
+@
+@ This implementation is direct adaptation of bsaes-x86_64 module for
+@ ARM NEON. Except that this module is endian-neutral [in sense that
+@ it can be compiled for either endianness] by courtesy of vld1.8's
+@ neutrality. Initial version doesn't implement interface to OpenSSL,
+@ only low-level primitives and unsupported entry points, just enough
+@ to collect performance results, which for Cortex-A8 core are:
+@
+@ encrypt 19.5 cycles per byte processed with 128-bit key
+@ decrypt 22.1 cycles per byte processed with 128-bit key
+@ key conv. 440 cycles per 128-bit key/0.18 of 8x block
+@
+@ Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
+@ which is [much] worse than anticipated (for further details see
+@ http://www.openssl.org/~appro/Snapdragon-S4.html).
+@
+@ Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
+@ manages in 20.0 cycles].
+@
+@ When comparing to x86_64 results keep in mind that NEON unit is
+@ [mostly] single-issue and thus can't [fully] benefit from
+@ instruction-level parallelism. And when comparing to aes-armv4
+@ results keep in mind key schedule conversion overhead (see
+@ bsaes-x86_64.pl for further details)...
+@
+@ <appro@openssl.org>
+
+@ April-August 2013
+@
+@ Add CBC, CTR and XTS subroutines, adapt for kernel use.
+@
+@ <ard.biesheuvel@linaro.org>
+
+#ifndef __KERNEL__
+# include "arm_arch.h"
+
+# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
+# define VFP_ABI_POP vldmia sp!,{d8-d15}
+# define VFP_ABI_FRAME 0x40
+#else
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+# define VFP_ABI_FRAME 0
+# define BSAES_ASM_EXTENDED_KEY
+# define XTS_CHAIN_TWEAK
+# define __ARM_ARCH__ 7
+#endif
+
+#ifdef __thumb__
+# define adrl adr
+#endif
+
+#if __ARM_ARCH__>=7
+.text
+.syntax unified @ ARMv7-capable assembler is expected to handle this
+#ifdef __thumb2__
+.thumb
+#else
+.code 32
+#endif
+
+.fpu neon
+
+.type _bsaes_decrypt8,%function
+.align 4
+_bsaes_decrypt8:
+ adr r6,_bsaes_decrypt8
+ vldmia r4!, {q9} @ round 0 key
+ add r6,r6,#.LM0ISR-_bsaes_decrypt8
+
+ vldmia r6!, {q8} @ .LM0ISR
+ veor q10, q0, q9 @ xor with round0 key
+ veor q11, q1, q9
+ vtbl.8 d0, {q10}, d16
+ vtbl.8 d1, {q10}, d17
+ veor q12, q2, q9
+ vtbl.8 d2, {q11}, d16
+ vtbl.8 d3, {q11}, d17
+ veor q13, q3, q9
+ vtbl.8 d4, {q12}, d16
+ vtbl.8 d5, {q12}, d17
+ veor q14, q4, q9
+ vtbl.8 d6, {q13}, d16
+ vtbl.8 d7, {q13}, d17
+ veor q15, q5, q9
+ vtbl.8 d8, {q14}, d16
+ vtbl.8 d9, {q14}, d17
+ veor q10, q6, q9
+ vtbl.8 d10, {q15}, d16
+ vtbl.8 d11, {q15}, d17
+ veor q11, q7, q9
+ vtbl.8 d12, {q10}, d16
+ vtbl.8 d13, {q10}, d17
+ vtbl.8 d14, {q11}, d16
+ vtbl.8 d15, {q11}, d17
+ vmov.i8 q8,#0x55 @ compose .LBS0
+ vmov.i8 q9,#0x33 @ compose .LBS1
+ vshr.u64 q10, q6, #1
+ vshr.u64 q11, q4, #1
+ veor q10, q10, q7
+ veor q11, q11, q5
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #1
+ veor q5, q5, q11
+ vshl.u64 q11, q11, #1
+ veor q6, q6, q10
+ veor q4, q4, q11
+ vshr.u64 q10, q2, #1
+ vshr.u64 q11, q0, #1
+ veor q10, q10, q3
+ veor q11, q11, q1
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q3, q3, q10
+ vshl.u64 q10, q10, #1
+ veor q1, q1, q11
+ vshl.u64 q11, q11, #1
+ veor q2, q2, q10
+ veor q0, q0, q11
+ vmov.i8 q8,#0x0f @ compose .LBS2
+ vshr.u64 q10, q5, #2
+ vshr.u64 q11, q4, #2
+ veor q10, q10, q7
+ veor q11, q11, q6
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #2
+ veor q6, q6, q11
+ vshl.u64 q11, q11, #2
+ veor q5, q5, q10
+ veor q4, q4, q11
+ vshr.u64 q10, q1, #2
+ vshr.u64 q11, q0, #2
+ veor q10, q10, q3
+ veor q11, q11, q2
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q3, q3, q10
+ vshl.u64 q10, q10, #2
+ veor q2, q2, q11
+ vshl.u64 q11, q11, #2
+ veor q1, q1, q10
+ veor q0, q0, q11
+ vshr.u64 q10, q3, #4
+ vshr.u64 q11, q2, #4
+ veor q10, q10, q7
+ veor q11, q11, q6
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #4
+ veor q6, q6, q11
+ vshl.u64 q11, q11, #4
+ veor q3, q3, q10
+ veor q2, q2, q11
+ vshr.u64 q10, q1, #4
+ vshr.u64 q11, q0, #4
+ veor q10, q10, q5
+ veor q11, q11, q4
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #4
+ veor q4, q4, q11
+ vshl.u64 q11, q11, #4
+ veor q1, q1, q10
+ veor q0, q0, q11
+ sub r5,r5,#1
+ b .Ldec_sbox
+.align 4
+.Ldec_loop:
+ vldmia r4!, {q8-q11}
+ veor q8, q8, q0
+ veor q9, q9, q1
+ vtbl.8 d0, {q8}, d24
+ vtbl.8 d1, {q8}, d25
+ vldmia r4!, {q8}
+ veor q10, q10, q2
+ vtbl.8 d2, {q9}, d24
+ vtbl.8 d3, {q9}, d25
+ vldmia r4!, {q9}
+ veor q11, q11, q3
+ vtbl.8 d4, {q10}, d24
+ vtbl.8 d5, {q10}, d25
+ vldmia r4!, {q10}
+ vtbl.8 d6, {q11}, d24
+ vtbl.8 d7, {q11}, d25
+ vldmia r4!, {q11}
+ veor q8, q8, q4
+ veor q9, q9, q5
+ vtbl.8 d8, {q8}, d24
+ vtbl.8 d9, {q8}, d25
+ veor q10, q10, q6
+ vtbl.8 d10, {q9}, d24
+ vtbl.8 d11, {q9}, d25
+ veor q11, q11, q7
+ vtbl.8 d12, {q10}, d24
+ vtbl.8 d13, {q10}, d25
+ vtbl.8 d14, {q11}, d24
+ vtbl.8 d15, {q11}, d25
+.Ldec_sbox:
+ veor q1, q1, q4
+ veor q3, q3, q4
+
+ veor q4, q4, q7
+ veor q1, q1, q6
+ veor q2, q2, q7
+ veor q6, q6, q4
+
+ veor q0, q0, q1
+ veor q2, q2, q5
+ veor q7, q7, q6
+ veor q3, q3, q0
+ veor q5, q5, q0
+ veor q1, q1, q3
+ veor q11, q3, q0
+ veor q10, q7, q4
+ veor q9, q1, q6
+ veor q13, q4, q0
+ vmov q8, q10
+ veor q12, q5, q2
+
+ vorr q10, q10, q9
+ veor q15, q11, q8
+ vand q14, q11, q12
+ vorr q11, q11, q12
+ veor q12, q12, q9
+ vand q8, q8, q9
+ veor q9, q6, q2
+ vand q15, q15, q12
+ vand q13, q13, q9
+ veor q9, q3, q7
+ veor q12, q1, q5
+ veor q11, q11, q13
+ veor q10, q10, q13
+ vand q13, q9, q12
+ vorr q9, q9, q12
+ veor q11, q11, q15
+ veor q8, q8, q13
+ veor q10, q10, q14
+ veor q9, q9, q15
+ veor q8, q8, q14
+ vand q12, q4, q6
+ veor q9, q9, q14
+ vand q13, q0, q2
+ vand q14, q7, q1
+ vorr q15, q3, q5
+ veor q11, q11, q12
+ veor q9, q9, q14
+ veor q8, q8, q15
+ veor q10, q10, q13
+
+ @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3
+
+ @ new smaller inversion
+
+ vand q14, q11, q9
+ vmov q12, q8
+
+ veor q13, q10, q14
+ veor q15, q8, q14
+ veor q14, q8, q14 @ q14=q15
+
+ vbsl q13, q9, q8
+ vbsl q15, q11, q10
+ veor q11, q11, q10
+
+ vbsl q12, q13, q14
+ vbsl q8, q14, q13
+
+ vand q14, q12, q15
+ veor q9, q9, q8
+
+ veor q14, q14, q11
+ veor q12, q5, q2
+ veor q8, q1, q6
+ veor q10, q15, q14
+ vand q10, q10, q5
+ veor q5, q5, q1
+ vand q11, q1, q15
+ vand q5, q5, q14
+ veor q1, q11, q10
+ veor q5, q5, q11
+ veor q15, q15, q13
+ veor q14, q14, q9
+ veor q11, q15, q14
+ veor q10, q13, q9
+ vand q11, q11, q12
+ vand q10, q10, q2
+ veor q12, q12, q8
+ veor q2, q2, q6
+ vand q8, q8, q15
+ vand q6, q6, q13
+ vand q12, q12, q14
+ vand q2, q2, q9
+ veor q8, q8, q12
+ veor q2, q2, q6
+ veor q12, q12, q11
+ veor q6, q6, q10
+ veor q5, q5, q12
+ veor q2, q2, q12
+ veor q1, q1, q8
+ veor q6, q6, q8
+
+ veor q12, q3, q0
+ veor q8, q7, q4
+ veor q11, q15, q14
+ veor q10, q13, q9
+ vand q11, q11, q12
+ vand q10, q10, q0
+ veor q12, q12, q8
+ veor q0, q0, q4
+ vand q8, q8, q15
+ vand q4, q4, q13
+ vand q12, q12, q14
+ vand q0, q0, q9
+ veor q8, q8, q12
+ veor q0, q0, q4
+ veor q12, q12, q11
+ veor q4, q4, q10
+ veor q15, q15, q13
+ veor q14, q14, q9
+ veor q10, q15, q14
+ vand q10, q10, q3
+ veor q3, q3, q7
+ vand q11, q7, q15
+ vand q3, q3, q14
+ veor q7, q11, q10
+ veor q3, q3, q11
+ veor q3, q3, q12
+ veor q0, q0, q12
+ veor q7, q7, q8
+ veor q4, q4, q8
+ veor q1, q1, q7
+ veor q6, q6, q5
+
+ veor q4, q4, q1
+ veor q2, q2, q7
+ veor q5, q5, q7
+ veor q4, q4, q2
+ veor q7, q7, q0
+ veor q4, q4, q5
+ veor q3, q3, q6
+ veor q6, q6, q1
+ veor q3, q3, q4
+
+ veor q4, q4, q0
+ veor q7, q7, q3
+ subs r5,r5,#1
+ bcc .Ldec_done
+ @ multiplication by 0x05-0x00-0x04-0x00
+ vext.8 q8, q0, q0, #8
+ vext.8 q14, q3, q3, #8
+ vext.8 q15, q5, q5, #8
+ veor q8, q8, q0
+ vext.8 q9, q1, q1, #8
+ veor q14, q14, q3
+ vext.8 q10, q6, q6, #8
+ veor q15, q15, q5
+ vext.8 q11, q4, q4, #8
+ veor q9, q9, q1
+ vext.8 q12, q2, q2, #8
+ veor q10, q10, q6
+ vext.8 q13, q7, q7, #8
+ veor q11, q11, q4
+ veor q12, q12, q2
+ veor q13, q13, q7
+
+ veor q0, q0, q14
+ veor q1, q1, q14
+ veor q6, q6, q8
+ veor q2, q2, q10
+ veor q4, q4, q9
+ veor q1, q1, q15
+ veor q6, q6, q15
+ veor q2, q2, q14
+ veor q7, q7, q11
+ veor q4, q4, q14
+ veor q3, q3, q12
+ veor q2, q2, q15
+ veor q7, q7, q15
+ veor q5, q5, q13
+ vext.8 q8, q0, q0, #12 @ x0 <<< 32
+ vext.8 q9, q1, q1, #12
+ veor q0, q0, q8 @ x0 ^ (x0 <<< 32)
+ vext.8 q10, q6, q6, #12
+ veor q1, q1, q9
+ vext.8 q11, q4, q4, #12
+ veor q6, q6, q10
+ vext.8 q12, q2, q2, #12
+ veor q4, q4, q11
+ vext.8 q13, q7, q7, #12
+ veor q2, q2, q12
+ vext.8 q14, q3, q3, #12
+ veor q7, q7, q13
+ vext.8 q15, q5, q5, #12
+ veor q3, q3, q14
+
+ veor q9, q9, q0
+ veor q5, q5, q15
+ vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64)
+ veor q10, q10, q1
+ veor q8, q8, q5
+ veor q9, q9, q5
+ vext.8 q1, q1, q1, #8
+ veor q13, q13, q2
+ veor q0, q0, q8
+ veor q14, q14, q7
+ veor q1, q1, q9
+ vext.8 q8, q2, q2, #8
+ veor q12, q12, q4
+ vext.8 q9, q7, q7, #8
+ veor q15, q15, q3
+ vext.8 q2, q4, q4, #8
+ veor q11, q11, q6
+ vext.8 q7, q5, q5, #8
+ veor q12, q12, q5
+ vext.8 q4, q3, q3, #8
+ veor q11, q11, q5
+ vext.8 q3, q6, q6, #8
+ veor q5, q9, q13
+ veor q11, q11, q2
+ veor q7, q7, q15
+ veor q6, q4, q14
+ veor q4, q8, q12
+ veor q2, q3, q10
+ vmov q3, q11
+ @ vmov q5, q9
+ vldmia r6, {q12} @ .LISR
+ ite eq @ Thumb2 thing, sanity check in ARM
+ addeq r6,r6,#0x10
+ bne .Ldec_loop
+ vldmia r6, {q12} @ .LISRM0
+ b .Ldec_loop
+.align 4
+.Ldec_done:
+ vmov.i8 q8,#0x55 @ compose .LBS0
+ vmov.i8 q9,#0x33 @ compose .LBS1
+ vshr.u64 q10, q3, #1
+ vshr.u64 q11, q2, #1
+ veor q10, q10, q5
+ veor q11, q11, q7
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #1
+ veor q7, q7, q11
+ vshl.u64 q11, q11, #1
+ veor q3, q3, q10
+ veor q2, q2, q11
+ vshr.u64 q10, q6, #1
+ vshr.u64 q11, q0, #1
+ veor q10, q10, q4
+ veor q11, q11, q1
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q4, q4, q10
+ vshl.u64 q10, q10, #1
+ veor q1, q1, q11
+ vshl.u64 q11, q11, #1
+ veor q6, q6, q10
+ veor q0, q0, q11
+ vmov.i8 q8,#0x0f @ compose .LBS2
+ vshr.u64 q10, q7, #2
+ vshr.u64 q11, q2, #2
+ veor q10, q10, q5
+ veor q11, q11, q3
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #2
+ veor q3, q3, q11
+ vshl.u64 q11, q11, #2
+ veor q7, q7, q10
+ veor q2, q2, q11
+ vshr.u64 q10, q1, #2
+ vshr.u64 q11, q0, #2
+ veor q10, q10, q4
+ veor q11, q11, q6
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q4, q4, q10
+ vshl.u64 q10, q10, #2
+ veor q6, q6, q11
+ vshl.u64 q11, q11, #2
+ veor q1, q1, q10
+ veor q0, q0, q11
+ vshr.u64 q10, q4, #4
+ vshr.u64 q11, q6, #4
+ veor q10, q10, q5
+ veor q11, q11, q3
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #4
+ veor q3, q3, q11
+ vshl.u64 q11, q11, #4
+ veor q4, q4, q10
+ veor q6, q6, q11
+ vshr.u64 q10, q1, #4
+ vshr.u64 q11, q0, #4
+ veor q10, q10, q7
+ veor q11, q11, q2
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #4
+ veor q2, q2, q11
+ vshl.u64 q11, q11, #4
+ veor q1, q1, q10
+ veor q0, q0, q11
+ vldmia r4, {q8} @ last round key
+ veor q6, q6, q8
+ veor q4, q4, q8
+ veor q2, q2, q8
+ veor q7, q7, q8
+ veor q3, q3, q8
+ veor q5, q5, q8
+ veor q0, q0, q8
+ veor q1, q1, q8
+ bx lr
+.size _bsaes_decrypt8,.-_bsaes_decrypt8
+
+.type _bsaes_const,%object
+.align 6
+_bsaes_const:
+.LM0ISR: @ InvShiftRows constants
+ .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISR:
+ .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
+.LISRM0:
+ .quad 0x01040b0e0205080f, 0x0306090c00070a0d
+.LM0SR: @ ShiftRows constants
+ .quad 0x0a0e02060f03070b, 0x0004080c05090d01
+.LSR:
+ .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+ .quad 0x0304090e00050a0f, 0x01060b0c0207080d
+.LM0:
+ .quad 0x02060a0e03070b0f, 0x0004080c0105090d
+.LREVM0SR:
+ .quad 0x090d01050c000408, 0x03070b0f060a0e02
+.asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro@openssl.org>"
+.align 6
+.size _bsaes_const,.-_bsaes_const
+
+.type _bsaes_encrypt8,%function
+.align 4
+_bsaes_encrypt8:
+ adr r6,_bsaes_encrypt8
+ vldmia r4!, {q9} @ round 0 key
+ sub r6,r6,#_bsaes_encrypt8-.LM0SR
+
+ vldmia r6!, {q8} @ .LM0SR
+_bsaes_encrypt8_alt:
+ veor q10, q0, q9 @ xor with round0 key
+ veor q11, q1, q9
+ vtbl.8 d0, {q10}, d16
+ vtbl.8 d1, {q10}, d17
+ veor q12, q2, q9
+ vtbl.8 d2, {q11}, d16
+ vtbl.8 d3, {q11}, d17
+ veor q13, q3, q9
+ vtbl.8 d4, {q12}, d16
+ vtbl.8 d5, {q12}, d17
+ veor q14, q4, q9
+ vtbl.8 d6, {q13}, d16
+ vtbl.8 d7, {q13}, d17
+ veor q15, q5, q9
+ vtbl.8 d8, {q14}, d16
+ vtbl.8 d9, {q14}, d17
+ veor q10, q6, q9
+ vtbl.8 d10, {q15}, d16
+ vtbl.8 d11, {q15}, d17
+ veor q11, q7, q9
+ vtbl.8 d12, {q10}, d16
+ vtbl.8 d13, {q10}, d17
+ vtbl.8 d14, {q11}, d16
+ vtbl.8 d15, {q11}, d17
+_bsaes_encrypt8_bitslice:
+ vmov.i8 q8,#0x55 @ compose .LBS0
+ vmov.i8 q9,#0x33 @ compose .LBS1
+ vshr.u64 q10, q6, #1
+ vshr.u64 q11, q4, #1
+ veor q10, q10, q7
+ veor q11, q11, q5
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #1
+ veor q5, q5, q11
+ vshl.u64 q11, q11, #1
+ veor q6, q6, q10
+ veor q4, q4, q11
+ vshr.u64 q10, q2, #1
+ vshr.u64 q11, q0, #1
+ veor q10, q10, q3
+ veor q11, q11, q1
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q3, q3, q10
+ vshl.u64 q10, q10, #1
+ veor q1, q1, q11
+ vshl.u64 q11, q11, #1
+ veor q2, q2, q10
+ veor q0, q0, q11
+ vmov.i8 q8,#0x0f @ compose .LBS2
+ vshr.u64 q10, q5, #2
+ vshr.u64 q11, q4, #2
+ veor q10, q10, q7
+ veor q11, q11, q6
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #2
+ veor q6, q6, q11
+ vshl.u64 q11, q11, #2
+ veor q5, q5, q10
+ veor q4, q4, q11
+ vshr.u64 q10, q1, #2
+ vshr.u64 q11, q0, #2
+ veor q10, q10, q3
+ veor q11, q11, q2
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q3, q3, q10
+ vshl.u64 q10, q10, #2
+ veor q2, q2, q11
+ vshl.u64 q11, q11, #2
+ veor q1, q1, q10
+ veor q0, q0, q11
+ vshr.u64 q10, q3, #4
+ vshr.u64 q11, q2, #4
+ veor q10, q10, q7
+ veor q11, q11, q6
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #4
+ veor q6, q6, q11
+ vshl.u64 q11, q11, #4
+ veor q3, q3, q10
+ veor q2, q2, q11
+ vshr.u64 q10, q1, #4
+ vshr.u64 q11, q0, #4
+ veor q10, q10, q5
+ veor q11, q11, q4
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #4
+ veor q4, q4, q11
+ vshl.u64 q11, q11, #4
+ veor q1, q1, q10
+ veor q0, q0, q11
+ sub r5,r5,#1
+ b .Lenc_sbox
+.align 4
+.Lenc_loop:
+ vldmia r4!, {q8-q11}
+ veor q8, q8, q0
+ veor q9, q9, q1
+ vtbl.8 d0, {q8}, d24
+ vtbl.8 d1, {q8}, d25
+ vldmia r4!, {q8}
+ veor q10, q10, q2
+ vtbl.8 d2, {q9}, d24
+ vtbl.8 d3, {q9}, d25
+ vldmia r4!, {q9}
+ veor q11, q11, q3
+ vtbl.8 d4, {q10}, d24
+ vtbl.8 d5, {q10}, d25
+ vldmia r4!, {q10}
+ vtbl.8 d6, {q11}, d24
+ vtbl.8 d7, {q11}, d25
+ vldmia r4!, {q11}
+ veor q8, q8, q4
+ veor q9, q9, q5
+ vtbl.8 d8, {q8}, d24
+ vtbl.8 d9, {q8}, d25
+ veor q10, q10, q6
+ vtbl.8 d10, {q9}, d24
+ vtbl.8 d11, {q9}, d25
+ veor q11, q11, q7
+ vtbl.8 d12, {q10}, d24
+ vtbl.8 d13, {q10}, d25
+ vtbl.8 d14, {q11}, d24
+ vtbl.8 d15, {q11}, d25
+.Lenc_sbox:
+ veor q2, q2, q1
+ veor q5, q5, q6
+ veor q3, q3, q0
+ veor q6, q6, q2
+ veor q5, q5, q0
+
+ veor q6, q6, q3
+ veor q3, q3, q7
+ veor q7, q7, q5
+ veor q3, q3, q4
+ veor q4, q4, q5
+
+ veor q2, q2, q7
+ veor q3, q3, q1
+ veor q1, q1, q5
+ veor q11, q7, q4
+ veor q10, q1, q2
+ veor q9, q5, q3
+ veor q13, q2, q4
+ vmov q8, q10
+ veor q12, q6, q0
+
+ vorr q10, q10, q9
+ veor q15, q11, q8
+ vand q14, q11, q12
+ vorr q11, q11, q12
+ veor q12, q12, q9
+ vand q8, q8, q9
+ veor q9, q3, q0
+ vand q15, q15, q12
+ vand q13, q13, q9
+ veor q9, q7, q1
+ veor q12, q5, q6
+ veor q11, q11, q13
+ veor q10, q10, q13
+ vand q13, q9, q12
+ vorr q9, q9, q12
+ veor q11, q11, q15
+ veor q8, q8, q13
+ veor q10, q10, q14
+ veor q9, q9, q15
+ veor q8, q8, q14
+ vand q12, q2, q3
+ veor q9, q9, q14
+ vand q13, q4, q0
+ vand q14, q1, q5
+ vorr q15, q7, q6
+ veor q11, q11, q12
+ veor q9, q9, q14
+ veor q8, q8, q15
+ veor q10, q10, q13
+
+ @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3
+
+ @ new smaller inversion
+
+ vand q14, q11, q9
+ vmov q12, q8
+
+ veor q13, q10, q14
+ veor q15, q8, q14
+ veor q14, q8, q14 @ q14=q15
+
+ vbsl q13, q9, q8
+ vbsl q15, q11, q10
+ veor q11, q11, q10
+
+ vbsl q12, q13, q14
+ vbsl q8, q14, q13
+
+ vand q14, q12, q15
+ veor q9, q9, q8
+
+ veor q14, q14, q11
+ veor q12, q6, q0
+ veor q8, q5, q3
+ veor q10, q15, q14
+ vand q10, q10, q6
+ veor q6, q6, q5
+ vand q11, q5, q15
+ vand q6, q6, q14
+ veor q5, q11, q10
+ veor q6, q6, q11
+ veor q15, q15, q13
+ veor q14, q14, q9
+ veor q11, q15, q14
+ veor q10, q13, q9
+ vand q11, q11, q12
+ vand q10, q10, q0
+ veor q12, q12, q8
+ veor q0, q0, q3
+ vand q8, q8, q15
+ vand q3, q3, q13
+ vand q12, q12, q14
+ vand q0, q0, q9
+ veor q8, q8, q12
+ veor q0, q0, q3
+ veor q12, q12, q11
+ veor q3, q3, q10
+ veor q6, q6, q12
+ veor q0, q0, q12
+ veor q5, q5, q8
+ veor q3, q3, q8
+
+ veor q12, q7, q4
+ veor q8, q1, q2
+ veor q11, q15, q14
+ veor q10, q13, q9
+ vand q11, q11, q12
+ vand q10, q10, q4
+ veor q12, q12, q8
+ veor q4, q4, q2
+ vand q8, q8, q15
+ vand q2, q2, q13
+ vand q12, q12, q14
+ vand q4, q4, q9
+ veor q8, q8, q12
+ veor q4, q4, q2
+ veor q12, q12, q11
+ veor q2, q2, q10
+ veor q15, q15, q13
+ veor q14, q14, q9
+ veor q10, q15, q14
+ vand q10, q10, q7
+ veor q7, q7, q1
+ vand q11, q1, q15
+ vand q7, q7, q14
+ veor q1, q11, q10
+ veor q7, q7, q11
+ veor q7, q7, q12
+ veor q4, q4, q12
+ veor q1, q1, q8
+ veor q2, q2, q8
+ veor q7, q7, q0
+ veor q1, q1, q6
+ veor q6, q6, q0
+ veor q4, q4, q7
+ veor q0, q0, q1
+
+ veor q1, q1, q5
+ veor q5, q5, q2
+ veor q2, q2, q3
+ veor q3, q3, q5
+ veor q4, q4, q5
+
+ veor q6, q6, q3
+ subs r5,r5,#1
+ bcc .Lenc_done
+ vext.8 q8, q0, q0, #12 @ x0 <<< 32
+ vext.8 q9, q1, q1, #12
+ veor q0, q0, q8 @ x0 ^ (x0 <<< 32)
+ vext.8 q10, q4, q4, #12
+ veor q1, q1, q9
+ vext.8 q11, q6, q6, #12
+ veor q4, q4, q10
+ vext.8 q12, q3, q3, #12
+ veor q6, q6, q11
+ vext.8 q13, q7, q7, #12
+ veor q3, q3, q12
+ vext.8 q14, q2, q2, #12
+ veor q7, q7, q13
+ vext.8 q15, q5, q5, #12
+ veor q2, q2, q14
+
+ veor q9, q9, q0
+ veor q5, q5, q15
+ vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64)
+ veor q10, q10, q1
+ veor q8, q8, q5
+ veor q9, q9, q5
+ vext.8 q1, q1, q1, #8
+ veor q13, q13, q3
+ veor q0, q0, q8
+ veor q14, q14, q7
+ veor q1, q1, q9
+ vext.8 q8, q3, q3, #8
+ veor q12, q12, q6
+ vext.8 q9, q7, q7, #8
+ veor q15, q15, q2
+ vext.8 q3, q6, q6, #8
+ veor q11, q11, q4
+ vext.8 q7, q5, q5, #8
+ veor q12, q12, q5
+ vext.8 q6, q2, q2, #8
+ veor q11, q11, q5
+ vext.8 q2, q4, q4, #8
+ veor q5, q9, q13
+ veor q4, q8, q12
+ veor q3, q3, q11
+ veor q7, q7, q15
+ veor q6, q6, q14
+ @ vmov q4, q8
+ veor q2, q2, q10
+ @ vmov q5, q9
+ vldmia r6, {q12} @ .LSR
+ ite eq @ Thumb2 thing, samity check in ARM
+ addeq r6,r6,#0x10
+ bne .Lenc_loop
+ vldmia r6, {q12} @ .LSRM0
+ b .Lenc_loop
+.align 4
+.Lenc_done:
+ vmov.i8 q8,#0x55 @ compose .LBS0
+ vmov.i8 q9,#0x33 @ compose .LBS1
+ vshr.u64 q10, q2, #1
+ vshr.u64 q11, q3, #1
+ veor q10, q10, q5
+ veor q11, q11, q7
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #1
+ veor q7, q7, q11
+ vshl.u64 q11, q11, #1
+ veor q2, q2, q10
+ veor q3, q3, q11
+ vshr.u64 q10, q4, #1
+ vshr.u64 q11, q0, #1
+ veor q10, q10, q6
+ veor q11, q11, q1
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q6, q6, q10
+ vshl.u64 q10, q10, #1
+ veor q1, q1, q11
+ vshl.u64 q11, q11, #1
+ veor q4, q4, q10
+ veor q0, q0, q11
+ vmov.i8 q8,#0x0f @ compose .LBS2
+ vshr.u64 q10, q7, #2
+ vshr.u64 q11, q3, #2
+ veor q10, q10, q5
+ veor q11, q11, q2
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #2
+ veor q2, q2, q11
+ vshl.u64 q11, q11, #2
+ veor q7, q7, q10
+ veor q3, q3, q11
+ vshr.u64 q10, q1, #2
+ vshr.u64 q11, q0, #2
+ veor q10, q10, q6
+ veor q11, q11, q4
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q6, q6, q10
+ vshl.u64 q10, q10, #2
+ veor q4, q4, q11
+ vshl.u64 q11, q11, #2
+ veor q1, q1, q10
+ veor q0, q0, q11
+ vshr.u64 q10, q6, #4
+ vshr.u64 q11, q4, #4
+ veor q10, q10, q5
+ veor q11, q11, q2
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #4
+ veor q2, q2, q11
+ vshl.u64 q11, q11, #4
+ veor q6, q6, q10
+ veor q4, q4, q11
+ vshr.u64 q10, q1, #4
+ vshr.u64 q11, q0, #4
+ veor q10, q10, q7
+ veor q11, q11, q3
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #4
+ veor q3, q3, q11
+ vshl.u64 q11, q11, #4
+ veor q1, q1, q10
+ veor q0, q0, q11
+ vldmia r4, {q8} @ last round key
+ veor q4, q4, q8
+ veor q6, q6, q8
+ veor q3, q3, q8
+ veor q7, q7, q8
+ veor q2, q2, q8
+ veor q5, q5, q8
+ veor q0, q0, q8
+ veor q1, q1, q8
+ bx lr
+.size _bsaes_encrypt8,.-_bsaes_encrypt8
+.type _bsaes_key_convert,%function
+.align 4
+_bsaes_key_convert:
+ adr r6,_bsaes_key_convert
+ vld1.8 {q7}, [r4]! @ load round 0 key
+ sub r6,r6,#_bsaes_key_convert-.LM0
+ vld1.8 {q15}, [r4]! @ load round 1 key
+
+ vmov.i8 q8, #0x01 @ bit masks
+ vmov.i8 q9, #0x02
+ vmov.i8 q10, #0x04
+ vmov.i8 q11, #0x08
+ vmov.i8 q12, #0x10
+ vmov.i8 q13, #0x20
+ vldmia r6, {q14} @ .LM0
+
+#ifdef __ARMEL__
+ vrev32.8 q7, q7
+ vrev32.8 q15, q15
+#endif
+ sub r5,r5,#1
+ vstmia r12!, {q7} @ save round 0 key
+ b .Lkey_loop
+
+.align 4
+.Lkey_loop:
+ vtbl.8 d14,{q15},d28
+ vtbl.8 d15,{q15},d29
+ vmov.i8 q6, #0x40
+ vmov.i8 q15, #0x80
+
+ vtst.8 q0, q7, q8
+ vtst.8 q1, q7, q9
+ vtst.8 q2, q7, q10
+ vtst.8 q3, q7, q11
+ vtst.8 q4, q7, q12
+ vtst.8 q5, q7, q13
+ vtst.8 q6, q7, q6
+ vtst.8 q7, q7, q15
+ vld1.8 {q15}, [r4]! @ load next round key
+ vmvn q0, q0 @ "pnot"
+ vmvn q1, q1
+ vmvn q5, q5
+ vmvn q6, q6
+#ifdef __ARMEL__
+ vrev32.8 q15, q15
+#endif
+ subs r5,r5,#1
+ vstmia r12!,{q0-q7} @ write bit-sliced round key
+ bne .Lkey_loop
+
+ vmov.i8 q7,#0x63 @ compose .L63
+ @ don't save last round key
+ bx lr
+.size _bsaes_key_convert,.-_bsaes_key_convert
+.extern AES_cbc_encrypt
+.extern AES_decrypt
+
+.global bsaes_cbc_encrypt
+.type bsaes_cbc_encrypt,%function
+.align 5
+bsaes_cbc_encrypt:
+#ifndef __KERNEL__
+ cmp r2, #128
+#ifndef __thumb__
+ blo AES_cbc_encrypt
+#else
+ bhs 1f
+ b AES_cbc_encrypt
+1:
+#endif
+#endif
+
+ @ it is up to the caller to make sure we are called with enc == 0
+
+ mov ip, sp
+ stmdb sp!, {r4-r10, lr}
+ VFP_ABI_PUSH
+ ldr r8, [ip] @ IV is 1st arg on the stack
+ mov r2, r2, lsr#4 @ len in 16 byte blocks
+ sub sp, #0x10 @ scratch space to carry over the IV
+ mov r9, sp @ save sp
+
+ ldr r10, [r3, #240] @ get # of rounds
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key
+ add r12, #96 @ sifze of bit-slices key schedule
+
+ @ populate the key schedule
+ mov r4, r3 @ pass key
+ mov r5, r10 @ pass # of rounds
+ mov sp, r12 @ sp is sp
+ bl _bsaes_key_convert
+ vldmia sp, {q6}
+ vstmia r12, {q15} @ save last round key
+ veor q7, q7, q6 @ fix up round 0 key
+ vstmia sp, {q7}
+#else
+ ldr r12, [r3, #244]
+ eors r12, #1
+ beq 0f
+
+ @ populate the key schedule
+ str r12, [r3, #244]
+ mov r4, r3 @ pass key
+ mov r5, r10 @ pass # of rounds
+ add r12, r3, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ add r4, r3, #248
+ vldmia r4, {q6}
+ vstmia r12, {q15} @ save last round key
+ veor q7, q7, q6 @ fix up round 0 key
+ vstmia r4, {q7}
+
+.align 2
+0:
+#endif
+
+ vld1.8 {q15}, [r8] @ load IV
+ b .Lcbc_dec_loop
+
+.align 4
+.Lcbc_dec_loop:
+ subs r2, r2, #0x8
+ bmi .Lcbc_dec_loop_finish
+
+ vld1.8 {q0-q1}, [r0]! @ load input
+ vld1.8 {q2-q3}, [r0]!
+#ifndef BSAES_ASM_EXTENDED_KEY
+ mov r4, sp @ pass the key
+#else
+ add r4, r3, #248
+#endif
+ vld1.8 {q4-q5}, [r0]!
+ mov r5, r10
+ vld1.8 {q6-q7}, [r0]
+ sub r0, r0, #0x60
+ vstmia r9, {q15} @ put aside IV
+
+ bl _bsaes_decrypt8
+
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q8-q9}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q10-q11}, [r0]!
+ veor q1, q1, q8
+ veor q6, q6, q9
+ vld1.8 {q12-q13}, [r0]!
+ veor q4, q4, q10
+ veor q2, q2, q11
+ vld1.8 {q14-q15}, [r0]!
+ veor q7, q7, q12
+ vst1.8 {q0-q1}, [r1]! @ write output
+ veor q3, q3, q13
+ vst1.8 {q6}, [r1]!
+ veor q5, q5, q14
+ vst1.8 {q4}, [r1]!
+ vst1.8 {q2}, [r1]!
+ vst1.8 {q7}, [r1]!
+ vst1.8 {q3}, [r1]!
+ vst1.8 {q5}, [r1]!
+
+ b .Lcbc_dec_loop
+
+.Lcbc_dec_loop_finish:
+ adds r2, r2, #8
+ beq .Lcbc_dec_done
+
+ vld1.8 {q0}, [r0]! @ load input
+ cmp r2, #2
+ blo .Lcbc_dec_one
+ vld1.8 {q1}, [r0]!
+#ifndef BSAES_ASM_EXTENDED_KEY
+ mov r4, sp @ pass the key
+#else
+ add r4, r3, #248
+#endif
+ mov r5, r10
+ vstmia r9, {q15} @ put aside IV
+ beq .Lcbc_dec_two
+ vld1.8 {q2}, [r0]!
+ cmp r2, #4
+ blo .Lcbc_dec_three
+ vld1.8 {q3}, [r0]!
+ beq .Lcbc_dec_four
+ vld1.8 {q4}, [r0]!
+ cmp r2, #6
+ blo .Lcbc_dec_five
+ vld1.8 {q5}, [r0]!
+ beq .Lcbc_dec_six
+ vld1.8 {q6}, [r0]!
+ sub r0, r0, #0x70
+
+ bl _bsaes_decrypt8
+
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q8-q9}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q10-q11}, [r0]!
+ veor q1, q1, q8
+ veor q6, q6, q9
+ vld1.8 {q12-q13}, [r0]!
+ veor q4, q4, q10
+ veor q2, q2, q11
+ vld1.8 {q15}, [r0]!
+ veor q7, q7, q12
+ vst1.8 {q0-q1}, [r1]! @ write output
+ veor q3, q3, q13
+ vst1.8 {q6}, [r1]!
+ vst1.8 {q4}, [r1]!
+ vst1.8 {q2}, [r1]!
+ vst1.8 {q7}, [r1]!
+ vst1.8 {q3}, [r1]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_six:
+ sub r0, r0, #0x60
+ bl _bsaes_decrypt8
+ vldmia r9,{q14} @ reload IV
+ vld1.8 {q8-q9}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q10-q11}, [r0]!
+ veor q1, q1, q8
+ veor q6, q6, q9
+ vld1.8 {q12}, [r0]!
+ veor q4, q4, q10
+ veor q2, q2, q11
+ vld1.8 {q15}, [r0]!
+ veor q7, q7, q12
+ vst1.8 {q0-q1}, [r1]! @ write output
+ vst1.8 {q6}, [r1]!
+ vst1.8 {q4}, [r1]!
+ vst1.8 {q2}, [r1]!
+ vst1.8 {q7}, [r1]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_five:
+ sub r0, r0, #0x50
+ bl _bsaes_decrypt8
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q8-q9}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q10-q11}, [r0]!
+ veor q1, q1, q8
+ veor q6, q6, q9
+ vld1.8 {q15}, [r0]!
+ veor q4, q4, q10
+ vst1.8 {q0-q1}, [r1]! @ write output
+ veor q2, q2, q11
+ vst1.8 {q6}, [r1]!
+ vst1.8 {q4}, [r1]!
+ vst1.8 {q2}, [r1]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_four:
+ sub r0, r0, #0x40
+ bl _bsaes_decrypt8
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q8-q9}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q10}, [r0]!
+ veor q1, q1, q8
+ veor q6, q6, q9
+ vld1.8 {q15}, [r0]!
+ veor q4, q4, q10
+ vst1.8 {q0-q1}, [r1]! @ write output
+ vst1.8 {q6}, [r1]!
+ vst1.8 {q4}, [r1]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_three:
+ sub r0, r0, #0x30
+ bl _bsaes_decrypt8
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q8-q9}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q15}, [r0]!
+ veor q1, q1, q8
+ veor q6, q6, q9
+ vst1.8 {q0-q1}, [r1]! @ write output
+ vst1.8 {q6}, [r1]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_two:
+ sub r0, r0, #0x20
+ bl _bsaes_decrypt8
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q8}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q15}, [r0]! @ reload input
+ veor q1, q1, q8
+ vst1.8 {q0-q1}, [r1]! @ write output
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_one:
+ sub r0, r0, #0x10
+ mov r10, r1 @ save original out pointer
+ mov r1, r9 @ use the iv scratch space as out buffer
+ mov r2, r3
+ vmov q4,q15 @ just in case ensure that IV
+ vmov q5,q0 @ and input are preserved
+ bl AES_decrypt
+ vld1.8 {q0}, [r9,:64] @ load result
+ veor q0, q0, q4 @ ^= IV
+ vmov q15, q5 @ q5 holds input
+ vst1.8 {q0}, [r10] @ write output
+
+.Lcbc_dec_done:
+#ifndef BSAES_ASM_EXTENDED_KEY
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+.Lcbc_dec_bzero: @ wipe key schedule [if any]
+ vstmia sp!, {q0-q1}
+ cmp sp, r9
+ bne .Lcbc_dec_bzero
+#endif
+
+ mov sp, r9
+ add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb
+ vst1.8 {q15}, [r8] @ return IV
+ VFP_ABI_POP
+ ldmia sp!, {r4-r10, pc}
+.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
+.extern AES_encrypt
+.global bsaes_ctr32_encrypt_blocks
+.type bsaes_ctr32_encrypt_blocks,%function
+.align 5
+bsaes_ctr32_encrypt_blocks:
+ cmp r2, #8 @ use plain AES for
+ blo .Lctr_enc_short @ small sizes
+
+ mov ip, sp
+ stmdb sp!, {r4-r10, lr}
+ VFP_ABI_PUSH
+ ldr r8, [ip] @ ctr is 1st arg on the stack
+ sub sp, sp, #0x10 @ scratch space to carry over the ctr
+ mov r9, sp @ save sp
+
+ ldr r10, [r3, #240] @ get # of rounds
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key
+ add r12, #96 @ size of bit-sliced key schedule
+
+ @ populate the key schedule
+ mov r4, r3 @ pass key
+ mov r5, r10 @ pass # of rounds
+ mov sp, r12 @ sp is sp
+ bl _bsaes_key_convert
+ veor q7,q7,q15 @ fix up last round key
+ vstmia r12, {q7} @ save last round key
+
+ vld1.8 {q0}, [r8] @ load counter
+ add r8, r6, #.LREVM0SR-.LM0 @ borrow r8
+ vldmia sp, {q4} @ load round0 key
+#else
+ ldr r12, [r3, #244]
+ eors r12, #1
+ beq 0f
+
+ @ populate the key schedule
+ str r12, [r3, #244]
+ mov r4, r3 @ pass key
+ mov r5, r10 @ pass # of rounds
+ add r12, r3, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ veor q7,q7,q15 @ fix up last round key
+ vstmia r12, {q7} @ save last round key
+
+.align 2
+0: add r12, r3, #248
+ vld1.8 {q0}, [r8] @ load counter
+ adrl r8, .LREVM0SR @ borrow r8
+ vldmia r12, {q4} @ load round0 key
+ sub sp, #0x10 @ place for adjusted round0 key
+#endif
+
+ vmov.i32 q8,#1 @ compose 1<<96
+ veor q9,q9,q9
+ vrev32.8 q0,q0
+ vext.8 q8,q9,q8,#4
+ vrev32.8 q4,q4
+ vadd.u32 q9,q8,q8 @ compose 2<<96
+ vstmia sp, {q4} @ save adjusted round0 key
+ b .Lctr_enc_loop
+
+.align 4
+.Lctr_enc_loop:
+ vadd.u32 q10, q8, q9 @ compose 3<<96
+ vadd.u32 q1, q0, q8 @ +1
+ vadd.u32 q2, q0, q9 @ +2
+ vadd.u32 q3, q0, q10 @ +3
+ vadd.u32 q4, q1, q10
+ vadd.u32 q5, q2, q10
+ vadd.u32 q6, q3, q10
+ vadd.u32 q7, q4, q10
+ vadd.u32 q10, q5, q10 @ next counter
+
+ @ Borrow prologue from _bsaes_encrypt8 to use the opportunity
+ @ to flip byte order in 32-bit counter
+
+ vldmia sp, {q9} @ load round0 key
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x10 @ pass next round key
+#else
+ add r4, r3, #264
+#endif
+ vldmia r8, {q8} @ .LREVM0SR
+ mov r5, r10 @ pass rounds
+ vstmia r9, {q10} @ save next counter
+ sub r6, r8, #.LREVM0SR-.LSR @ pass constants
+
+ bl _bsaes_encrypt8_alt
+
+ subs r2, r2, #8
+ blo .Lctr_enc_loop_done
+
+ vld1.8 {q8-q9}, [r0]! @ load input
+ vld1.8 {q10-q11}, [r0]!
+ veor q0, q8
+ veor q1, q9
+ vld1.8 {q12-q13}, [r0]!
+ veor q4, q10
+ veor q6, q11
+ vld1.8 {q14-q15}, [r0]!
+ veor q3, q12
+ vst1.8 {q0-q1}, [r1]! @ write output
+ veor q7, q13
+ veor q2, q14
+ vst1.8 {q4}, [r1]!
+ veor q5, q15
+ vst1.8 {q6}, [r1]!
+ vmov.i32 q8, #1 @ compose 1<<96
+ vst1.8 {q3}, [r1]!
+ veor q9, q9, q9
+ vst1.8 {q7}, [r1]!
+ vext.8 q8, q9, q8, #4
+ vst1.8 {q2}, [r1]!
+ vadd.u32 q9,q8,q8 @ compose 2<<96
+ vst1.8 {q5}, [r1]!
+ vldmia r9, {q0} @ load counter
+
+ bne .Lctr_enc_loop
+ b .Lctr_enc_done
+
+.align 4
+.Lctr_enc_loop_done:
+ add r2, r2, #8
+ vld1.8 {q8}, [r0]! @ load input
+ veor q0, q8
+ vst1.8 {q0}, [r1]! @ write output
+ cmp r2, #2
+ blo .Lctr_enc_done
+ vld1.8 {q9}, [r0]!
+ veor q1, q9
+ vst1.8 {q1}, [r1]!
+ beq .Lctr_enc_done
+ vld1.8 {q10}, [r0]!
+ veor q4, q10
+ vst1.8 {q4}, [r1]!
+ cmp r2, #4
+ blo .Lctr_enc_done
+ vld1.8 {q11}, [r0]!
+ veor q6, q11
+ vst1.8 {q6}, [r1]!
+ beq .Lctr_enc_done
+ vld1.8 {q12}, [r0]!
+ veor q3, q12
+ vst1.8 {q3}, [r1]!
+ cmp r2, #6
+ blo .Lctr_enc_done
+ vld1.8 {q13}, [r0]!
+ veor q7, q13
+ vst1.8 {q7}, [r1]!
+ beq .Lctr_enc_done
+ vld1.8 {q14}, [r0]
+ veor q2, q14
+ vst1.8 {q2}, [r1]!
+
+.Lctr_enc_done:
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+#ifndef BSAES_ASM_EXTENDED_KEY
+.Lctr_enc_bzero: @ wipe key schedule [if any]
+ vstmia sp!, {q0-q1}
+ cmp sp, r9
+ bne .Lctr_enc_bzero
+#else
+ vstmia sp, {q0-q1}
+#endif
+
+ mov sp, r9
+ add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb
+ VFP_ABI_POP
+ ldmia sp!, {r4-r10, pc} @ return
+
+.align 4
+.Lctr_enc_short:
+ ldr ip, [sp] @ ctr pointer is passed on stack
+ stmdb sp!, {r4-r8, lr}
+
+ mov r4, r0 @ copy arguments
+ mov r5, r1
+ mov r6, r2
+ mov r7, r3
+ ldr r8, [ip, #12] @ load counter LSW
+ vld1.8 {q1}, [ip] @ load whole counter value
+#ifdef __ARMEL__
+ rev r8, r8
+#endif
+ sub sp, sp, #0x10
+ vst1.8 {q1}, [sp,:64] @ copy counter value
+ sub sp, sp, #0x10
+
+.Lctr_enc_short_loop:
+ add r0, sp, #0x10 @ input counter value
+ mov r1, sp @ output on the stack
+ mov r2, r7 @ key
+
+ bl AES_encrypt
+
+ vld1.8 {q0}, [r4]! @ load input
+ vld1.8 {q1}, [sp,:64] @ load encrypted counter
+ add r8, r8, #1
+#ifdef __ARMEL__
+ rev r0, r8
+ str r0, [sp, #0x1c] @ next counter value
+#else
+ str r8, [sp, #0x1c] @ next counter value
+#endif
+ veor q0,q0,q1
+ vst1.8 {q0}, [r5]! @ store output
+ subs r6, r6, #1
+ bne .Lctr_enc_short_loop
+
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+ vstmia sp!, {q0-q1}
+
+ ldmia sp!, {r4-r8, pc}
+.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
+.globl bsaes_xts_encrypt
+.type bsaes_xts_encrypt,%function
+.align 4
+bsaes_xts_encrypt:
+ mov ip, sp
+ stmdb sp!, {r4-r10, lr} @ 0x20
+ VFP_ABI_PUSH
+ mov r6, sp @ future r3
+
+ mov r7, r0
+ mov r8, r1
+ mov r9, r2
+ mov r10, r3
+
+ sub r0, sp, #0x10 @ 0x10
+ bic r0, #0xf @ align at 16 bytes
+ mov sp, r0
+
+#ifdef XTS_CHAIN_TWEAK
+ ldr r0, [ip] @ pointer to input tweak
+#else
+ @ generate initial tweak
+ ldr r0, [ip, #4] @ iv[]
+ mov r1, sp
+ ldr r2, [ip, #0] @ key2
+ bl AES_encrypt
+ mov r0,sp @ pointer to initial tweak
+#endif
+
+ ldr r1, [r10, #240] @ get # of rounds
+ mov r3, r6
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, r1, lsl#7 @ 128 bytes per inner round key
+ @ add r12, #96 @ size of bit-sliced key schedule
+ sub r12, #48 @ place for tweak[9]
+
+ @ populate the key schedule
+ mov r4, r10 @ pass key
+ mov r5, r1 @ pass # of rounds
+ mov sp, r12
+ add r12, #0x90 @ pass key schedule
+ bl _bsaes_key_convert
+ veor q7, q7, q15 @ fix up last round key
+ vstmia r12, {q7} @ save last round key
+#else
+ ldr r12, [r10, #244]
+ eors r12, #1
+ beq 0f
+
+ str r12, [r10, #244]
+ mov r4, r10 @ pass key
+ mov r5, r1 @ pass # of rounds
+ add r12, r10, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ veor q7, q7, q15 @ fix up last round key
+ vstmia r12, {q7}
+
+.align 2
+0: sub sp, #0x90 @ place for tweak[9]
+#endif
+
+ vld1.8 {q8}, [r0] @ initial tweak
+ adr r2, .Lxts_magic
+
+ subs r9, #0x80
+ blo .Lxts_enc_short
+ b .Lxts_enc_loop
+
+.align 4
+.Lxts_enc_loop:
+ vldmia r2, {q5} @ load XTS magic
+ vshr.s64 q6, q8, #63
+ mov r0, sp
+ vand q6, q6, q5
+ vadd.u64 q9, q8, q8
+ vst1.64 {q8}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q9, #63
+ veor q9, q9, q6
+ vand q7, q7, q5
+ vadd.u64 q10, q9, q9
+ vst1.64 {q9}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q10, #63
+ veor q10, q10, q7
+ vand q6, q6, q5
+ vld1.8 {q0}, [r7]!
+ vadd.u64 q11, q10, q10
+ vst1.64 {q10}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q11, #63
+ veor q11, q11, q6
+ vand q7, q7, q5
+ vld1.8 {q1}, [r7]!
+ veor q0, q0, q8
+ vadd.u64 q12, q11, q11
+ vst1.64 {q11}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q12, #63
+ veor q12, q12, q7
+ vand q6, q6, q5
+ vld1.8 {q2}, [r7]!
+ veor q1, q1, q9
+ vadd.u64 q13, q12, q12
+ vst1.64 {q12}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q13, #63
+ veor q13, q13, q6
+ vand q7, q7, q5
+ vld1.8 {q3}, [r7]!
+ veor q2, q2, q10
+ vadd.u64 q14, q13, q13
+ vst1.64 {q13}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q14, #63
+ veor q14, q14, q7
+ vand q6, q6, q5
+ vld1.8 {q4}, [r7]!
+ veor q3, q3, q11
+ vadd.u64 q15, q14, q14
+ vst1.64 {q14}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q15, #63
+ veor q15, q15, q6
+ vand q7, q7, q5
+ vld1.8 {q5}, [r7]!
+ veor q4, q4, q12
+ vadd.u64 q8, q15, q15
+ vst1.64 {q15}, [r0,:128]!
+ vswp d15,d14
+ veor q8, q8, q7
+ vst1.64 {q8}, [r0,:128] @ next round tweak
+
+ vld1.8 {q6-q7}, [r7]!
+ veor q5, q5, q13
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q6, q6, q14
+ mov r5, r1 @ pass rounds
+ veor q7, q7, q15
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ vld1.64 {q10-q11}, [r0,:128]!
+ veor q0, q0, q8
+ vld1.64 {q12-q13}, [r0,:128]!
+ veor q1, q1, q9
+ veor q8, q4, q10
+ vst1.8 {q0-q1}, [r8]!
+ veor q9, q6, q11
+ vld1.64 {q14-q15}, [r0,:128]!
+ veor q10, q3, q12
+ vst1.8 {q8-q9}, [r8]!
+ veor q11, q7, q13
+ veor q12, q2, q14
+ vst1.8 {q10-q11}, [r8]!
+ veor q13, q5, q15
+ vst1.8 {q12-q13}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+
+ subs r9, #0x80
+ bpl .Lxts_enc_loop
+
+.Lxts_enc_short:
+ adds r9, #0x70
+ bmi .Lxts_enc_done
+
+ vldmia r2, {q5} @ load XTS magic
+ vshr.s64 q7, q8, #63
+ mov r0, sp
+ vand q7, q7, q5
+ vadd.u64 q9, q8, q8
+ vst1.64 {q8}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q9, #63
+ veor q9, q9, q7
+ vand q6, q6, q5
+ vadd.u64 q10, q9, q9
+ vst1.64 {q9}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q10, #63
+ veor q10, q10, q6
+ vand q7, q7, q5
+ vld1.8 {q0}, [r7]!
+ subs r9, #0x10
+ bmi .Lxts_enc_1
+ vadd.u64 q11, q10, q10
+ vst1.64 {q10}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q11, #63
+ veor q11, q11, q7
+ vand q6, q6, q5
+ vld1.8 {q1}, [r7]!
+ subs r9, #0x10
+ bmi .Lxts_enc_2
+ veor q0, q0, q8
+ vadd.u64 q12, q11, q11
+ vst1.64 {q11}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q12, #63
+ veor q12, q12, q6
+ vand q7, q7, q5
+ vld1.8 {q2}, [r7]!
+ subs r9, #0x10
+ bmi .Lxts_enc_3
+ veor q1, q1, q9
+ vadd.u64 q13, q12, q12
+ vst1.64 {q12}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q13, #63
+ veor q13, q13, q7
+ vand q6, q6, q5
+ vld1.8 {q3}, [r7]!
+ subs r9, #0x10
+ bmi .Lxts_enc_4
+ veor q2, q2, q10
+ vadd.u64 q14, q13, q13
+ vst1.64 {q13}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q14, #63
+ veor q14, q14, q6
+ vand q7, q7, q5
+ vld1.8 {q4}, [r7]!
+ subs r9, #0x10
+ bmi .Lxts_enc_5
+ veor q3, q3, q11
+ vadd.u64 q15, q14, q14
+ vst1.64 {q14}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q15, #63
+ veor q15, q15, q7
+ vand q6, q6, q5
+ vld1.8 {q5}, [r7]!
+ subs r9, #0x10
+ bmi .Lxts_enc_6
+ veor q4, q4, q12
+ sub r9, #0x10
+ vst1.64 {q15}, [r0,:128] @ next round tweak
+
+ vld1.8 {q6}, [r7]!
+ veor q5, q5, q13
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q6, q6, q14
+ mov r5, r1 @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ vld1.64 {q10-q11}, [r0,:128]!
+ veor q0, q0, q8
+ vld1.64 {q12-q13}, [r0,:128]!
+ veor q1, q1, q9
+ veor q8, q4, q10
+ vst1.8 {q0-q1}, [r8]!
+ veor q9, q6, q11
+ vld1.64 {q14}, [r0,:128]!
+ veor q10, q3, q12
+ vst1.8 {q8-q9}, [r8]!
+ veor q11, q7, q13
+ veor q12, q2, q14
+ vst1.8 {q10-q11}, [r8]!
+ vst1.8 {q12}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_6:
+ vst1.64 {q14}, [r0,:128] @ next round tweak
+
+ veor q4, q4, q12
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q5, q5, q13
+ mov r5, r1 @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ vld1.64 {q10-q11}, [r0,:128]!
+ veor q0, q0, q8
+ vld1.64 {q12-q13}, [r0,:128]!
+ veor q1, q1, q9
+ veor q8, q4, q10
+ vst1.8 {q0-q1}, [r8]!
+ veor q9, q6, q11
+ veor q10, q3, q12
+ vst1.8 {q8-q9}, [r8]!
+ veor q11, q7, q13
+ vst1.8 {q10-q11}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+
+@ put this in range for both ARM and Thumb mode adr instructions
+.align 5
+.Lxts_magic:
+ .quad 1, 0x87
+
+.align 5
+.Lxts_enc_5:
+ vst1.64 {q13}, [r0,:128] @ next round tweak
+
+ veor q3, q3, q11
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q4, q4, q12
+ mov r5, r1 @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ vld1.64 {q10-q11}, [r0,:128]!
+ veor q0, q0, q8
+ vld1.64 {q12}, [r0,:128]!
+ veor q1, q1, q9
+ veor q8, q4, q10
+ vst1.8 {q0-q1}, [r8]!
+ veor q9, q6, q11
+ veor q10, q3, q12
+ vst1.8 {q8-q9}, [r8]!
+ vst1.8 {q10}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_4:
+ vst1.64 {q12}, [r0,:128] @ next round tweak
+
+ veor q2, q2, q10
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q3, q3, q11
+ mov r5, r1 @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ vld1.64 {q10-q11}, [r0,:128]!
+ veor q0, q0, q8
+ veor q1, q1, q9
+ veor q8, q4, q10
+ vst1.8 {q0-q1}, [r8]!
+ veor q9, q6, q11
+ vst1.8 {q8-q9}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_3:
+ vst1.64 {q11}, [r0,:128] @ next round tweak
+
+ veor q1, q1, q9
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q2, q2, q10
+ mov r5, r1 @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ vld1.64 {q10}, [r0,:128]!
+ veor q0, q0, q8
+ veor q1, q1, q9
+ veor q8, q4, q10
+ vst1.8 {q0-q1}, [r8]!
+ vst1.8 {q8}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_2:
+ vst1.64 {q10}, [r0,:128] @ next round tweak
+
+ veor q0, q0, q8
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q1, q1, q9
+ mov r5, r1 @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ veor q0, q0, q8
+ veor q1, q1, q9
+ vst1.8 {q0-q1}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_1:
+ mov r0, sp
+ veor q0, q8
+ mov r1, sp
+ vst1.8 {q0}, [sp,:128]
+ mov r2, r10
+ mov r4, r3 @ preserve fp
+
+ bl AES_encrypt
+
+ vld1.8 {q0}, [sp,:128]
+ veor q0, q0, q8
+ vst1.8 {q0}, [r8]!
+ mov r3, r4
+
+ vmov q8, q9 @ next round tweak
+
+.Lxts_enc_done:
+#ifndef XTS_CHAIN_TWEAK
+ adds r9, #0x10
+ beq .Lxts_enc_ret
+ sub r6, r8, #0x10
+
+.Lxts_enc_steal:
+ ldrb r0, [r7], #1
+ ldrb r1, [r8, #-0x10]
+ strb r0, [r8, #-0x10]
+ strb r1, [r8], #1
+
+ subs r9, #1
+ bhi .Lxts_enc_steal
+
+ vld1.8 {q0}, [r6]
+ mov r0, sp
+ veor q0, q0, q8
+ mov r1, sp
+ vst1.8 {q0}, [sp,:128]
+ mov r2, r10
+ mov r4, r3 @ preserve fp
+
+ bl AES_encrypt
+
+ vld1.8 {q0}, [sp,:128]
+ veor q0, q0, q8
+ vst1.8 {q0}, [r6]
+ mov r3, r4
+#endif
+
+.Lxts_enc_ret:
+ bic r0, r3, #0xf
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+#ifdef XTS_CHAIN_TWEAK
+ ldr r1, [r3, #0x20+VFP_ABI_FRAME] @ chain tweak
+#endif
+.Lxts_enc_bzero: @ wipe key schedule [if any]
+ vstmia sp!, {q0-q1}
+ cmp sp, r0
+ bne .Lxts_enc_bzero
+
+ mov sp, r3
+#ifdef XTS_CHAIN_TWEAK
+ vst1.8 {q8}, [r1]
+#endif
+ VFP_ABI_POP
+ ldmia sp!, {r4-r10, pc} @ return
+
+.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
+
+.globl bsaes_xts_decrypt
+.type bsaes_xts_decrypt,%function
+.align 4
+bsaes_xts_decrypt:
+ mov ip, sp
+ stmdb sp!, {r4-r10, lr} @ 0x20
+ VFP_ABI_PUSH
+ mov r6, sp @ future r3
+
+ mov r7, r0
+ mov r8, r1
+ mov r9, r2
+ mov r10, r3
+
+ sub r0, sp, #0x10 @ 0x10
+ bic r0, #0xf @ align at 16 bytes
+ mov sp, r0
+
+#ifdef XTS_CHAIN_TWEAK
+ ldr r0, [ip] @ pointer to input tweak
+#else
+ @ generate initial tweak
+ ldr r0, [ip, #4] @ iv[]
+ mov r1, sp
+ ldr r2, [ip, #0] @ key2
+ bl AES_encrypt
+ mov r0, sp @ pointer to initial tweak
+#endif
+
+ ldr r1, [r10, #240] @ get # of rounds
+ mov r3, r6
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, r1, lsl#7 @ 128 bytes per inner round key
+ @ add r12, #96 @ size of bit-sliced key schedule
+ sub r12, #48 @ place for tweak[9]
+
+ @ populate the key schedule
+ mov r4, r10 @ pass key
+ mov r5, r1 @ pass # of rounds
+ mov sp, r12
+ add r12, #0x90 @ pass key schedule
+ bl _bsaes_key_convert
+ add r4, sp, #0x90
+ vldmia r4, {q6}
+ vstmia r12, {q15} @ save last round key
+ veor q7, q7, q6 @ fix up round 0 key
+ vstmia r4, {q7}
+#else
+ ldr r12, [r10, #244]
+ eors r12, #1
+ beq 0f
+
+ str r12, [r10, #244]
+ mov r4, r10 @ pass key
+ mov r5, r1 @ pass # of rounds
+ add r12, r10, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ add r4, r10, #248
+ vldmia r4, {q6}
+ vstmia r12, {q15} @ save last round key
+ veor q7, q7, q6 @ fix up round 0 key
+ vstmia r4, {q7}
+
+.align 2
+0: sub sp, #0x90 @ place for tweak[9]
+#endif
+ vld1.8 {q8}, [r0] @ initial tweak
+ adr r2, .Lxts_magic
+
+ tst r9, #0xf @ if not multiple of 16
+ it ne @ Thumb2 thing, sanity check in ARM
+ subne r9, #0x10 @ subtract another 16 bytes
+ subs r9, #0x80
+
+ blo .Lxts_dec_short
+ b .Lxts_dec_loop
+
+.align 4
+.Lxts_dec_loop:
+ vldmia r2, {q5} @ load XTS magic
+ vshr.s64 q6, q8, #63
+ mov r0, sp
+ vand q6, q6, q5
+ vadd.u64 q9, q8, q8
+ vst1.64 {q8}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q9, #63
+ veor q9, q9, q6
+ vand q7, q7, q5
+ vadd.u64 q10, q9, q9
+ vst1.64 {q9}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q10, #63
+ veor q10, q10, q7
+ vand q6, q6, q5
+ vld1.8 {q0}, [r7]!
+ vadd.u64 q11, q10, q10
+ vst1.64 {q10}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q11, #63
+ veor q11, q11, q6
+ vand q7, q7, q5
+ vld1.8 {q1}, [r7]!
+ veor q0, q0, q8
+ vadd.u64 q12, q11, q11
+ vst1.64 {q11}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q12, #63
+ veor q12, q12, q7
+ vand q6, q6, q5
+ vld1.8 {q2}, [r7]!
+ veor q1, q1, q9
+ vadd.u64 q13, q12, q12
+ vst1.64 {q12}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q13, #63
+ veor q13, q13, q6
+ vand q7, q7, q5
+ vld1.8 {q3}, [r7]!
+ veor q2, q2, q10
+ vadd.u64 q14, q13, q13
+ vst1.64 {q13}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q14, #63
+ veor q14, q14, q7
+ vand q6, q6, q5
+ vld1.8 {q4}, [r7]!
+ veor q3, q3, q11
+ vadd.u64 q15, q14, q14
+ vst1.64 {q14}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q15, #63
+ veor q15, q15, q6
+ vand q7, q7, q5
+ vld1.8 {q5}, [r7]!
+ veor q4, q4, q12
+ vadd.u64 q8, q15, q15
+ vst1.64 {q15}, [r0,:128]!
+ vswp d15,d14
+ veor q8, q8, q7
+ vst1.64 {q8}, [r0,:128] @ next round tweak
+
+ vld1.8 {q6-q7}, [r7]!
+ veor q5, q5, q13
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q6, q6, q14
+ mov r5, r1 @ pass rounds
+ veor q7, q7, q15
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ vld1.64 {q10-q11}, [r0,:128]!
+ veor q0, q0, q8
+ vld1.64 {q12-q13}, [r0,:128]!
+ veor q1, q1, q9
+ veor q8, q6, q10
+ vst1.8 {q0-q1}, [r8]!
+ veor q9, q4, q11
+ vld1.64 {q14-q15}, [r0,:128]!
+ veor q10, q2, q12
+ vst1.8 {q8-q9}, [r8]!
+ veor q11, q7, q13
+ veor q12, q3, q14
+ vst1.8 {q10-q11}, [r8]!
+ veor q13, q5, q15
+ vst1.8 {q12-q13}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+
+ subs r9, #0x80
+ bpl .Lxts_dec_loop
+
+.Lxts_dec_short:
+ adds r9, #0x70
+ bmi .Lxts_dec_done
+
+ vldmia r2, {q5} @ load XTS magic
+ vshr.s64 q7, q8, #63
+ mov r0, sp
+ vand q7, q7, q5
+ vadd.u64 q9, q8, q8
+ vst1.64 {q8}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q9, #63
+ veor q9, q9, q7
+ vand q6, q6, q5
+ vadd.u64 q10, q9, q9
+ vst1.64 {q9}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q10, #63
+ veor q10, q10, q6
+ vand q7, q7, q5
+ vld1.8 {q0}, [r7]!
+ subs r9, #0x10
+ bmi .Lxts_dec_1
+ vadd.u64 q11, q10, q10
+ vst1.64 {q10}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q11, #63
+ veor q11, q11, q7
+ vand q6, q6, q5
+ vld1.8 {q1}, [r7]!
+ subs r9, #0x10
+ bmi .Lxts_dec_2
+ veor q0, q0, q8
+ vadd.u64 q12, q11, q11
+ vst1.64 {q11}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q12, #63
+ veor q12, q12, q6
+ vand q7, q7, q5
+ vld1.8 {q2}, [r7]!
+ subs r9, #0x10
+ bmi .Lxts_dec_3
+ veor q1, q1, q9
+ vadd.u64 q13, q12, q12
+ vst1.64 {q12}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q13, #63
+ veor q13, q13, q7
+ vand q6, q6, q5
+ vld1.8 {q3}, [r7]!
+ subs r9, #0x10
+ bmi .Lxts_dec_4
+ veor q2, q2, q10
+ vadd.u64 q14, q13, q13
+ vst1.64 {q13}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q14, #63
+ veor q14, q14, q6
+ vand q7, q7, q5
+ vld1.8 {q4}, [r7]!
+ subs r9, #0x10
+ bmi .Lxts_dec_5
+ veor q3, q3, q11
+ vadd.u64 q15, q14, q14
+ vst1.64 {q14}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q15, #63
+ veor q15, q15, q7
+ vand q6, q6, q5
+ vld1.8 {q5}, [r7]!
+ subs r9, #0x10
+ bmi .Lxts_dec_6
+ veor q4, q4, q12
+ sub r9, #0x10
+ vst1.64 {q15}, [r0,:128] @ next round tweak
+
+ vld1.8 {q6}, [r7]!
+ veor q5, q5, q13
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q6, q6, q14
+ mov r5, r1 @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ vld1.64 {q10-q11}, [r0,:128]!
+ veor q0, q0, q8
+ vld1.64 {q12-q13}, [r0,:128]!
+ veor q1, q1, q9
+ veor q8, q6, q10
+ vst1.8 {q0-q1}, [r8]!
+ veor q9, q4, q11
+ vld1.64 {q14}, [r0,:128]!
+ veor q10, q2, q12
+ vst1.8 {q8-q9}, [r8]!
+ veor q11, q7, q13
+ veor q12, q3, q14
+ vst1.8 {q10-q11}, [r8]!
+ vst1.8 {q12}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_6:
+ vst1.64 {q14}, [r0,:128] @ next round tweak
+
+ veor q4, q4, q12
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q5, q5, q13
+ mov r5, r1 @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ vld1.64 {q10-q11}, [r0,:128]!
+ veor q0, q0, q8
+ vld1.64 {q12-q13}, [r0,:128]!
+ veor q1, q1, q9
+ veor q8, q6, q10
+ vst1.8 {q0-q1}, [r8]!
+ veor q9, q4, q11
+ veor q10, q2, q12
+ vst1.8 {q8-q9}, [r8]!
+ veor q11, q7, q13
+ vst1.8 {q10-q11}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_5:
+ vst1.64 {q13}, [r0,:128] @ next round tweak
+
+ veor q3, q3, q11
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q4, q4, q12
+ mov r5, r1 @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ vld1.64 {q10-q11}, [r0,:128]!
+ veor q0, q0, q8
+ vld1.64 {q12}, [r0,:128]!
+ veor q1, q1, q9
+ veor q8, q6, q10
+ vst1.8 {q0-q1}, [r8]!
+ veor q9, q4, q11
+ veor q10, q2, q12
+ vst1.8 {q8-q9}, [r8]!
+ vst1.8 {q10}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_4:
+ vst1.64 {q12}, [r0,:128] @ next round tweak
+
+ veor q2, q2, q10
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q3, q3, q11
+ mov r5, r1 @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ vld1.64 {q10-q11}, [r0,:128]!
+ veor q0, q0, q8
+ veor q1, q1, q9
+ veor q8, q6, q10
+ vst1.8 {q0-q1}, [r8]!
+ veor q9, q4, q11
+ vst1.8 {q8-q9}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_3:
+ vst1.64 {q11}, [r0,:128] @ next round tweak
+
+ veor q1, q1, q9
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q2, q2, q10
+ mov r5, r1 @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ vld1.64 {q10}, [r0,:128]!
+ veor q0, q0, q8
+ veor q1, q1, q9
+ veor q8, q6, q10
+ vst1.8 {q0-q1}, [r8]!
+ vst1.8 {q8}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_2:
+ vst1.64 {q10}, [r0,:128] @ next round tweak
+
+ veor q0, q0, q8
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q1, q1, q9
+ mov r5, r1 @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ veor q0, q0, q8
+ veor q1, q1, q9
+ vst1.8 {q0-q1}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_1:
+ mov r0, sp
+ veor q0, q8
+ mov r1, sp
+ vst1.8 {q0}, [sp,:128]
+ mov r2, r10
+ mov r4, r3 @ preserve fp
+ mov r5, r2 @ preserve magic
+
+ bl AES_decrypt
+
+ vld1.8 {q0}, [sp,:128]
+ veor q0, q0, q8
+ vst1.8 {q0}, [r8]!
+ mov r3, r4
+ mov r2, r5
+
+ vmov q8, q9 @ next round tweak
+
+.Lxts_dec_done:
+#ifndef XTS_CHAIN_TWEAK
+ adds r9, #0x10
+ beq .Lxts_dec_ret
+
+ @ calculate one round of extra tweak for the stolen ciphertext
+ vldmia r2, {q5}
+ vshr.s64 q6, q8, #63
+ vand q6, q6, q5
+ vadd.u64 q9, q8, q8
+ vswp d13,d12
+ veor q9, q9, q6
+
+ @ perform the final decryption with the last tweak value
+ vld1.8 {q0}, [r7]!
+ mov r0, sp
+ veor q0, q0, q9
+ mov r1, sp
+ vst1.8 {q0}, [sp,:128]
+ mov r2, r10
+ mov r4, r3 @ preserve fp
+
+ bl AES_decrypt
+
+ vld1.8 {q0}, [sp,:128]
+ veor q0, q0, q9
+ vst1.8 {q0}, [r8]
+
+ mov r6, r8
+.Lxts_dec_steal:
+ ldrb r1, [r8]
+ ldrb r0, [r7], #1
+ strb r1, [r8, #0x10]
+ strb r0, [r8], #1
+
+ subs r9, #1
+ bhi .Lxts_dec_steal
+
+ vld1.8 {q0}, [r6]
+ mov r0, sp
+ veor q0, q8
+ mov r1, sp
+ vst1.8 {q0}, [sp,:128]
+ mov r2, r10
+
+ bl AES_decrypt
+
+ vld1.8 {q0}, [sp,:128]
+ veor q0, q0, q8
+ vst1.8 {q0}, [r6]
+ mov r3, r4
+#endif
+
+.Lxts_dec_ret:
+ bic r0, r3, #0xf
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+#ifdef XTS_CHAIN_TWEAK
+ ldr r1, [r3, #0x20+VFP_ABI_FRAME] @ chain tweak
+#endif
+.Lxts_dec_bzero: @ wipe key schedule [if any]
+ vstmia sp!, {q0-q1}
+ cmp sp, r0
+ bne .Lxts_dec_bzero
+
+ mov sp, r3
+#ifdef XTS_CHAIN_TWEAK
+ vst1.8 {q8}, [r1]
+#endif
+ VFP_ABI_POP
+ ldmia sp!, {r4-r10, pc} @ return
+
+.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
+#endif
diff --git a/arch/arm/crypto/aesbs-glue.c b/arch/arm/crypto/aesbs-glue.c
new file mode 100644
index 000000000000..15468fbbdea3
--- /dev/null
+++ b/arch/arm/crypto/aesbs-glue.c
@@ -0,0 +1,434 @@
+/*
+ * linux/arch/arm/crypto/aesbs-glue.c - glue code for NEON bit sliced AES
+ *
+ * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/neon.h>
+#include <crypto/aes.h>
+#include <crypto/ablk_helper.h>
+#include <crypto/algapi.h>
+#include <linux/module.h>
+
+#include "aes_glue.h"
+
+#define BIT_SLICED_KEY_MAXSIZE (128 * (AES_MAXNR - 1) + 2 * AES_BLOCK_SIZE)
+
+struct BS_KEY {
+ struct AES_KEY rk;
+ int converted;
+ u8 __aligned(8) bs[BIT_SLICED_KEY_MAXSIZE];
+} __aligned(8);
+
+asmlinkage void bsaes_enc_key_convert(u8 out[], struct AES_KEY const *in);
+asmlinkage void bsaes_dec_key_convert(u8 out[], struct AES_KEY const *in);
+
+asmlinkage void bsaes_cbc_encrypt(u8 const in[], u8 out[], u32 bytes,
+ struct BS_KEY *key, u8 iv[]);
+
+asmlinkage void bsaes_ctr32_encrypt_blocks(u8 const in[], u8 out[], u32 blocks,
+ struct BS_KEY *key, u8 const iv[]);
+
+asmlinkage void bsaes_xts_encrypt(u8 const in[], u8 out[], u32 bytes,
+ struct BS_KEY *key, u8 tweak[]);
+
+asmlinkage void bsaes_xts_decrypt(u8 const in[], u8 out[], u32 bytes,
+ struct BS_KEY *key, u8 tweak[]);
+
+struct aesbs_cbc_ctx {
+ struct AES_KEY enc;
+ struct BS_KEY dec;
+};
+
+struct aesbs_ctr_ctx {
+ struct BS_KEY enc;
+};
+
+struct aesbs_xts_ctx {
+ struct BS_KEY enc;
+ struct BS_KEY dec;
+ struct AES_KEY twkey;
+};
+
+static int aesbs_cbc_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+ unsigned int key_len)
+{
+ struct aesbs_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
+ int bits = key_len * 8;
+
+ if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc)) {
+ tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+ return -EINVAL;
+ }
+ ctx->dec.rk = ctx->enc;
+ private_AES_set_decrypt_key(in_key, bits, &ctx->dec.rk);
+ ctx->dec.converted = 0;
+ return 0;
+}
+
+static int aesbs_ctr_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+ unsigned int key_len)
+{
+ struct aesbs_ctr_ctx *ctx = crypto_tfm_ctx(tfm);
+ int bits = key_len * 8;
+
+ if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc.rk)) {
+ tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+ return -EINVAL;
+ }
+ ctx->enc.converted = 0;
+ return 0;
+}
+
+static int aesbs_xts_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+ unsigned int key_len)
+{
+ struct aesbs_xts_ctx *ctx = crypto_tfm_ctx(tfm);
+ int bits = key_len * 4;
+
+ if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc.rk)) {
+ tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+ return -EINVAL;
+ }
+ ctx->dec.rk = ctx->enc.rk;
+ private_AES_set_decrypt_key(in_key, bits, &ctx->dec.rk);
+ private_AES_set_encrypt_key(in_key + key_len / 2, bits, &ctx->twkey);
+ ctx->enc.converted = ctx->dec.converted = 0;
+ return 0;
+}
+
+static int aesbs_cbc_encrypt(struct blkcipher_desc *desc,
+ struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct aesbs_cbc_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ struct blkcipher_walk walk;
+ int err;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt(desc, &walk);
+
+ while (walk.nbytes) {
+ u32 blocks = walk.nbytes / AES_BLOCK_SIZE;
+ u8 *src = walk.src.virt.addr;
+
+ if (walk.dst.virt.addr == walk.src.virt.addr) {
+ u8 *iv = walk.iv;
+
+ do {
+ crypto_xor(src, iv, AES_BLOCK_SIZE);
+ AES_encrypt(src, src, &ctx->enc);
+ iv = src;
+ src += AES_BLOCK_SIZE;
+ } while (--blocks);
+ memcpy(walk.iv, iv, AES_BLOCK_SIZE);
+ } else {
+ u8 *dst = walk.dst.virt.addr;
+
+ do {
+ crypto_xor(walk.iv, src, AES_BLOCK_SIZE);
+ AES_encrypt(walk.iv, dst, &ctx->enc);
+ memcpy(walk.iv, dst, AES_BLOCK_SIZE);
+ src += AES_BLOCK_SIZE;
+ dst += AES_BLOCK_SIZE;
+ } while (--blocks);
+ }
+ err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE);
+ }
+ return err;
+}
+
+static int aesbs_cbc_decrypt(struct blkcipher_desc *desc,
+ struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct aesbs_cbc_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ struct blkcipher_walk walk;
+ int err;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
+
+ while ((walk.nbytes / AES_BLOCK_SIZE) >= 8) {
+ kernel_neon_begin();
+ bsaes_cbc_encrypt(walk.src.virt.addr, walk.dst.virt.addr,
+ walk.nbytes, &ctx->dec, walk.iv);
+ kernel_neon_end();
+ err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE);
+ }
+ while (walk.nbytes) {
+ u32 blocks = walk.nbytes / AES_BLOCK_SIZE;
+ u8 *dst = walk.dst.virt.addr;
+ u8 *src = walk.src.virt.addr;
+ u8 bk[2][AES_BLOCK_SIZE];
+ u8 *iv = walk.iv;
+
+ do {
+ if (walk.dst.virt.addr == walk.src.virt.addr)
+ memcpy(bk[blocks & 1], src, AES_BLOCK_SIZE);
+
+ AES_decrypt(src, dst, &ctx->dec.rk);
+ crypto_xor(dst, iv, AES_BLOCK_SIZE);
+
+ if (walk.dst.virt.addr == walk.src.virt.addr)
+ iv = bk[blocks & 1];
+ else
+ iv = src;
+
+ dst += AES_BLOCK_SIZE;
+ src += AES_BLOCK_SIZE;
+ } while (--blocks);
+ err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE);
+ }
+ return err;
+}
+
+static void inc_be128_ctr(__be32 ctr[], u32 addend)
+{
+ int i;
+
+ for (i = 3; i >= 0; i--, addend = 1) {
+ u32 n = be32_to_cpu(ctr[i]) + addend;
+
+ ctr[i] = cpu_to_be32(n);
+ if (n >= addend)
+ break;
+ }
+}
+
+static int aesbs_ctr_encrypt(struct blkcipher_desc *desc,
+ struct scatterlist *dst, struct scatterlist *src,
+ unsigned int nbytes)
+{
+ struct aesbs_ctr_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ struct blkcipher_walk walk;
+ u32 blocks;
+ int err;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
+
+ while ((blocks = walk.nbytes / AES_BLOCK_SIZE)) {
+ u32 tail = walk.nbytes % AES_BLOCK_SIZE;
+ __be32 *ctr = (__be32 *)walk.iv;
+ u32 headroom = UINT_MAX - be32_to_cpu(ctr[3]);
+
+ /* avoid 32 bit counter overflow in the NEON code */
+ if (unlikely(headroom < blocks)) {
+ blocks = headroom + 1;
+ tail = walk.nbytes - blocks * AES_BLOCK_SIZE;
+ }
+ kernel_neon_begin();
+ bsaes_ctr32_encrypt_blocks(walk.src.virt.addr,
+ walk.dst.virt.addr, blocks,
+ &ctx->enc, walk.iv);
+ kernel_neon_end();
+ inc_be128_ctr(ctr, blocks);
+
+ nbytes -= blocks * AES_BLOCK_SIZE;
+ if (nbytes && nbytes == tail && nbytes <= AES_BLOCK_SIZE)
+ break;
+
+ err = blkcipher_walk_done(desc, &walk, tail);
+ }
+ if (walk.nbytes) {
+ u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
+ u8 *tsrc = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;
+ u8 ks[AES_BLOCK_SIZE];
+
+ AES_encrypt(walk.iv, ks, &ctx->enc.rk);
+ if (tdst != tsrc)
+ memcpy(tdst, tsrc, nbytes);
+ crypto_xor(tdst, ks, nbytes);
+ err = blkcipher_walk_done(desc, &walk, 0);
+ }
+ return err;
+}
+
+static int aesbs_xts_encrypt(struct blkcipher_desc *desc,
+ struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct aesbs_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ struct blkcipher_walk walk;
+ int err;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
+
+ /* generate the initial tweak */
+ AES_encrypt(walk.iv, walk.iv, &ctx->twkey);
+
+ while (walk.nbytes) {
+ kernel_neon_begin();
+ bsaes_xts_encrypt(walk.src.virt.addr, walk.dst.virt.addr,
+ walk.nbytes, &ctx->enc, walk.iv);
+ kernel_neon_end();
+ err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE);
+ }
+ return err;
+}
+
+static int aesbs_xts_decrypt(struct blkcipher_desc *desc,
+ struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct aesbs_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ struct blkcipher_walk walk;
+ int err;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
+
+ /* generate the initial tweak */
+ AES_encrypt(walk.iv, walk.iv, &ctx->twkey);
+
+ while (walk.nbytes) {
+ kernel_neon_begin();
+ bsaes_xts_decrypt(walk.src.virt.addr, walk.dst.virt.addr,
+ walk.nbytes, &ctx->dec, walk.iv);
+ kernel_neon_end();
+ err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE);
+ }
+ return err;
+}
+
+static struct crypto_alg aesbs_algs[] = { {
+ .cra_name = "__cbc-aes-neonbs",
+ .cra_driver_name = "__driver-cbc-aes-neonbs",
+ .cra_priority = 0,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct aesbs_cbc_ctx),
+ .cra_alignmask = 7,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_blkcipher = {
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = aesbs_cbc_set_key,
+ .encrypt = aesbs_cbc_encrypt,
+ .decrypt = aesbs_cbc_decrypt,
+ },
+}, {
+ .cra_name = "__ctr-aes-neonbs",
+ .cra_driver_name = "__driver-ctr-aes-neonbs",
+ .cra_priority = 0,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct aesbs_ctr_ctx),
+ .cra_alignmask = 7,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_blkcipher = {
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = aesbs_ctr_set_key,
+ .encrypt = aesbs_ctr_encrypt,
+ .decrypt = aesbs_ctr_encrypt,
+ },
+}, {
+ .cra_name = "__xts-aes-neonbs",
+ .cra_driver_name = "__driver-xts-aes-neonbs",
+ .cra_priority = 0,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct aesbs_xts_ctx),
+ .cra_alignmask = 7,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_blkcipher = {
+ .min_keysize = 2 * AES_MIN_KEY_SIZE,
+ .max_keysize = 2 * AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = aesbs_xts_set_key,
+ .encrypt = aesbs_xts_encrypt,
+ .decrypt = aesbs_xts_decrypt,
+ },
+}, {
+ .cra_name = "cbc(aes)",
+ .cra_driver_name = "cbc-aes-neonbs",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct async_helper_ctx),
+ .cra_alignmask = 7,
+ .cra_type = &crypto_ablkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_init = ablk_init,
+ .cra_exit = ablk_exit,
+ .cra_ablkcipher = {
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = ablk_set_key,
+ .encrypt = __ablk_encrypt,
+ .decrypt = ablk_decrypt,
+ }
+}, {
+ .cra_name = "ctr(aes)",
+ .cra_driver_name = "ctr-aes-neonbs",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct async_helper_ctx),
+ .cra_alignmask = 7,
+ .cra_type = &crypto_ablkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_init = ablk_init,
+ .cra_exit = ablk_exit,
+ .cra_ablkcipher = {
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = ablk_set_key,
+ .encrypt = ablk_encrypt,
+ .decrypt = ablk_decrypt,
+ }
+}, {
+ .cra_name = "xts(aes)",
+ .cra_driver_name = "xts-aes-neonbs",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct async_helper_ctx),
+ .cra_alignmask = 7,
+ .cra_type = &crypto_ablkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_init = ablk_init,
+ .cra_exit = ablk_exit,
+ .cra_ablkcipher = {
+ .min_keysize = 2 * AES_MIN_KEY_SIZE,
+ .max_keysize = 2 * AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = ablk_set_key,
+ .encrypt = ablk_encrypt,
+ .decrypt = ablk_decrypt,
+ }
+} };
+
+static int __init aesbs_mod_init(void)
+{
+ if (!cpu_has_neon())
+ return -ENODEV;
+
+ return crypto_register_algs(aesbs_algs, ARRAY_SIZE(aesbs_algs));
+}
+
+static void __exit aesbs_mod_exit(void)
+{
+ crypto_unregister_algs(aesbs_algs, ARRAY_SIZE(aesbs_algs));
+}
+
+module_init(aesbs_mod_init);
+module_exit(aesbs_mod_exit);
+
+MODULE_DESCRIPTION("Bit sliced AES in CBC/CTR/XTS modes using NEON");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL");
diff --git a/arch/arm/crypto/bsaes-armv7.pl b/arch/arm/crypto/bsaes-armv7.pl
new file mode 100644
index 000000000000..be068db960ee
--- /dev/null
+++ b/arch/arm/crypto/bsaes-armv7.pl
@@ -0,0 +1,2467 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Specific modes and adaptation for Linux kernel by Ard Biesheuvel
+# <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
+# granted.
+# ====================================================================
+
+# Bit-sliced AES for ARM NEON
+#
+# February 2012.
+#
+# This implementation is direct adaptation of bsaes-x86_64 module for
+# ARM NEON. Except that this module is endian-neutral [in sense that
+# it can be compiled for either endianness] by courtesy of vld1.8's
+# neutrality. Initial version doesn't implement interface to OpenSSL,
+# only low-level primitives and unsupported entry points, just enough
+# to collect performance results, which for Cortex-A8 core are:
+#
+# encrypt 19.5 cycles per byte processed with 128-bit key
+# decrypt 22.1 cycles per byte processed with 128-bit key
+# key conv. 440 cycles per 128-bit key/0.18 of 8x block
+#
+# Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
+# which is [much] worse than anticipated (for further details see
+# http://www.openssl.org/~appro/Snapdragon-S4.html).
+#
+# Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
+# manages in 20.0 cycles].
+#
+# When comparing to x86_64 results keep in mind that NEON unit is
+# [mostly] single-issue and thus can't [fully] benefit from
+# instruction-level parallelism. And when comparing to aes-armv4
+# results keep in mind key schedule conversion overhead (see
+# bsaes-x86_64.pl for further details)...
+#
+# <appro@openssl.org>
+
+# April-August 2013
+#
+# Add CBC, CTR and XTS subroutines, adapt for kernel use.
+#
+# <ard.biesheuvel@linaro.org>
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+my ($inp,$out,$len,$key)=("r0","r1","r2","r3");
+my @XMM=map("q$_",(0..15));
+
+{
+my ($key,$rounds,$const)=("r4","r5","r6");
+
+sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
+sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
+
+sub Sbox {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
+my @b=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+ &InBasisChange (@b);
+ &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
+ &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
+}
+
+sub InBasisChange {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
+my @b=@_[0..7];
+$code.=<<___;
+ veor @b[2], @b[2], @b[1]
+ veor @b[5], @b[5], @b[6]
+ veor @b[3], @b[3], @b[0]
+ veor @b[6], @b[6], @b[2]
+ veor @b[5], @b[5], @b[0]
+
+ veor @b[6], @b[6], @b[3]
+ veor @b[3], @b[3], @b[7]
+ veor @b[7], @b[7], @b[5]
+ veor @b[3], @b[3], @b[4]
+ veor @b[4], @b[4], @b[5]
+
+ veor @b[2], @b[2], @b[7]
+ veor @b[3], @b[3], @b[1]
+ veor @b[1], @b[1], @b[5]
+___
+}
+
+sub OutBasisChange {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
+my @b=@_[0..7];
+$code.=<<___;
+ veor @b[0], @b[0], @b[6]
+ veor @b[1], @b[1], @b[4]
+ veor @b[4], @b[4], @b[6]
+ veor @b[2], @b[2], @b[0]
+ veor @b[6], @b[6], @b[1]
+
+ veor @b[1], @b[1], @b[5]
+ veor @b[5], @b[5], @b[3]
+ veor @b[3], @b[3], @b[7]
+ veor @b[7], @b[7], @b[5]
+ veor @b[2], @b[2], @b[5]
+
+ veor @b[4], @b[4], @b[7]
+___
+}
+
+sub InvSbox {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
+my @b=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+ &InvInBasisChange (@b);
+ &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
+ &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
+}
+
+sub InvInBasisChange { # OutBasisChange in reverse (with twist)
+my @b=@_[5,1,2,6,3,7,0,4];
+$code.=<<___
+ veor @b[1], @b[1], @b[7]
+ veor @b[4], @b[4], @b[7]
+
+ veor @b[7], @b[7], @b[5]
+ veor @b[1], @b[1], @b[3]
+ veor @b[2], @b[2], @b[5]
+ veor @b[3], @b[3], @b[7]
+
+ veor @b[6], @b[6], @b[1]
+ veor @b[2], @b[2], @b[0]
+ veor @b[5], @b[5], @b[3]
+ veor @b[4], @b[4], @b[6]
+ veor @b[0], @b[0], @b[6]
+ veor @b[1], @b[1], @b[4]
+___
+}
+
+sub InvOutBasisChange { # InBasisChange in reverse
+my @b=@_[2,5,7,3,6,1,0,4];
+$code.=<<___;
+ veor @b[1], @b[1], @b[5]
+ veor @b[2], @b[2], @b[7]
+
+ veor @b[3], @b[3], @b[1]
+ veor @b[4], @b[4], @b[5]
+ veor @b[7], @b[7], @b[5]
+ veor @b[3], @b[3], @b[4]
+ veor @b[5], @b[5], @b[0]
+ veor @b[3], @b[3], @b[7]
+ veor @b[6], @b[6], @b[2]
+ veor @b[2], @b[2], @b[1]
+ veor @b[6], @b[6], @b[3]
+
+ veor @b[3], @b[3], @b[0]
+ veor @b[5], @b[5], @b[6]
+___
+}
+
+sub Mul_GF4 {
+#;*************************************************************
+#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
+#;*************************************************************
+my ($x0,$x1,$y0,$y1,$t0,$t1)=@_;
+$code.=<<___;
+ veor $t0, $y0, $y1
+ vand $t0, $t0, $x0
+ veor $x0, $x0, $x1
+ vand $t1, $x1, $y0
+ vand $x0, $x0, $y1
+ veor $x1, $t1, $t0
+ veor $x0, $x0, $t1
+___
+}
+
+sub Mul_GF4_N { # not used, see next subroutine
+# multiply and scale by N
+my ($x0,$x1,$y0,$y1,$t0)=@_;
+$code.=<<___;
+ veor $t0, $y0, $y1
+ vand $t0, $t0, $x0
+ veor $x0, $x0, $x1
+ vand $x1, $x1, $y0
+ vand $x0, $x0, $y1
+ veor $x1, $x1, $x0
+ veor $x0, $x0, $t0
+___
+}
+
+sub Mul_GF4_N_GF4 {
+# interleaved Mul_GF4_N and Mul_GF4
+my ($x0,$x1,$y0,$y1,$t0,
+ $x2,$x3,$y2,$y3,$t1)=@_;
+$code.=<<___;
+ veor $t0, $y0, $y1
+ veor $t1, $y2, $y3
+ vand $t0, $t0, $x0
+ vand $t1, $t1, $x2
+ veor $x0, $x0, $x1
+ veor $x2, $x2, $x3
+ vand $x1, $x1, $y0
+ vand $x3, $x3, $y2
+ vand $x0, $x0, $y1
+ vand $x2, $x2, $y3
+ veor $x1, $x1, $x0
+ veor $x2, $x2, $x3
+ veor $x0, $x0, $t0
+ veor $x3, $x3, $t1
+___
+}
+sub Mul_GF16_2 {
+my @x=@_[0..7];
+my @y=@_[8..11];
+my @t=@_[12..15];
+$code.=<<___;
+ veor @t[0], @x[0], @x[2]
+ veor @t[1], @x[1], @x[3]
+___
+ &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2..3]);
+$code.=<<___;
+ veor @y[0], @y[0], @y[2]
+ veor @y[1], @y[1], @y[3]
+___
+ Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
+ @x[2], @x[3], @y[2], @y[3], @t[2]);
+$code.=<<___;
+ veor @x[0], @x[0], @t[0]
+ veor @x[2], @x[2], @t[0]
+ veor @x[1], @x[1], @t[1]
+ veor @x[3], @x[3], @t[1]
+
+ veor @t[0], @x[4], @x[6]
+ veor @t[1], @x[5], @x[7]
+___
+ &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
+ @x[6], @x[7], @y[2], @y[3], @t[2]);
+$code.=<<___;
+ veor @y[0], @y[0], @y[2]
+ veor @y[1], @y[1], @y[3]
+___
+ &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[2..3]);
+$code.=<<___;
+ veor @x[4], @x[4], @t[0]
+ veor @x[6], @x[6], @t[0]
+ veor @x[5], @x[5], @t[1]
+ veor @x[7], @x[7], @t[1]
+___
+}
+sub Inv_GF256 {
+#;********************************************************************
+#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
+#;********************************************************************
+my @x=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+# direct optimizations from hardware
+$code.=<<___;
+ veor @t[3], @x[4], @x[6]
+ veor @t[2], @x[5], @x[7]
+ veor @t[1], @x[1], @x[3]
+ veor @s[1], @x[7], @x[6]
+ vmov @t[0], @t[2]
+ veor @s[0], @x[0], @x[2]
+
+ vorr @t[2], @t[2], @t[1]
+ veor @s[3], @t[3], @t[0]
+ vand @s[2], @t[3], @s[0]
+ vorr @t[3], @t[3], @s[0]
+ veor @s[0], @s[0], @t[1]
+ vand @t[0], @t[0], @t[1]
+ veor @t[1], @x[3], @x[2]
+ vand @s[3], @s[3], @s[0]
+ vand @s[1], @s[1], @t[1]
+ veor @t[1], @x[4], @x[5]
+ veor @s[0], @x[1], @x[0]
+ veor @t[3], @t[3], @s[1]
+ veor @t[2], @t[2], @s[1]
+ vand @s[1], @t[1], @s[0]
+ vorr @t[1], @t[1], @s[0]
+ veor @t[3], @t[3], @s[3]
+ veor @t[0], @t[0], @s[1]
+ veor @t[2], @t[2], @s[2]
+ veor @t[1], @t[1], @s[3]
+ veor @t[0], @t[0], @s[2]
+ vand @s[0], @x[7], @x[3]
+ veor @t[1], @t[1], @s[2]
+ vand @s[1], @x[6], @x[2]
+ vand @s[2], @x[5], @x[1]
+ vorr @s[3], @x[4], @x[0]
+ veor @t[3], @t[3], @s[0]
+ veor @t[1], @t[1], @s[2]
+ veor @t[0], @t[0], @s[3]
+ veor @t[2], @t[2], @s[1]
+
+ @ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
+
+ @ new smaller inversion
+
+ vand @s[2], @t[3], @t[1]
+ vmov @s[0], @t[0]
+
+ veor @s[1], @t[2], @s[2]
+ veor @s[3], @t[0], @s[2]
+ veor @s[2], @t[0], @s[2] @ @s[2]=@s[3]
+
+ vbsl @s[1], @t[1], @t[0]
+ vbsl @s[3], @t[3], @t[2]
+ veor @t[3], @t[3], @t[2]
+
+ vbsl @s[0], @s[1], @s[2]
+ vbsl @t[0], @s[2], @s[1]
+
+ vand @s[2], @s[0], @s[3]
+ veor @t[1], @t[1], @t[0]
+
+ veor @s[2], @s[2], @t[3]
+___
+# output in s3, s2, s1, t1
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
+ &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
+
+### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
+}
+
+# AES linear components
+
+sub ShiftRows {
+my @x=@_[0..7];
+my @t=@_[8..11];
+my $mask=pop;
+$code.=<<___;
+ vldmia $key!, {@t[0]-@t[3]}
+ veor @t[0], @t[0], @x[0]
+ veor @t[1], @t[1], @x[1]
+ vtbl.8 `&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)`
+ vldmia $key!, {@t[0]}
+ veor @t[2], @t[2], @x[2]
+ vtbl.8 `&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)`
+ vldmia $key!, {@t[1]}
+ veor @t[3], @t[3], @x[3]
+ vtbl.8 `&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)`
+ vldmia $key!, {@t[2]}
+ vtbl.8 `&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)`
+ vldmia $key!, {@t[3]}
+ veor @t[0], @t[0], @x[4]
+ veor @t[1], @t[1], @x[5]
+ vtbl.8 `&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)`
+ veor @t[2], @t[2], @x[6]
+ vtbl.8 `&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)`
+ veor @t[3], @t[3], @x[7]
+ vtbl.8 `&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)`
+ vtbl.8 `&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)`
+___
+}
+
+sub MixColumns {
+# modified to emit output in order suitable for feeding back to aesenc[last]
+my @x=@_[0..7];
+my @t=@_[8..15];
+my $inv=@_[16]; # optional
+$code.=<<___;
+ vext.8 @t[0], @x[0], @x[0], #12 @ x0 <<< 32
+ vext.8 @t[1], @x[1], @x[1], #12
+ veor @x[0], @x[0], @t[0] @ x0 ^ (x0 <<< 32)
+ vext.8 @t[2], @x[2], @x[2], #12
+ veor @x[1], @x[1], @t[1]
+ vext.8 @t[3], @x[3], @x[3], #12
+ veor @x[2], @x[2], @t[2]
+ vext.8 @t[4], @x[4], @x[4], #12
+ veor @x[3], @x[3], @t[3]
+ vext.8 @t[5], @x[5], @x[5], #12
+ veor @x[4], @x[4], @t[4]
+ vext.8 @t[6], @x[6], @x[6], #12
+ veor @x[5], @x[5], @t[5]
+ vext.8 @t[7], @x[7], @x[7], #12
+ veor @x[6], @x[6], @t[6]
+
+ veor @t[1], @t[1], @x[0]
+ veor @x[7], @x[7], @t[7]
+ vext.8 @x[0], @x[0], @x[0], #8 @ (x0 ^ (x0 <<< 32)) <<< 64)
+ veor @t[2], @t[2], @x[1]
+ veor @t[0], @t[0], @x[7]
+ veor @t[1], @t[1], @x[7]
+ vext.8 @x[1], @x[1], @x[1], #8
+ veor @t[5], @t[5], @x[4]
+ veor @x[0], @x[0], @t[0]
+ veor @t[6], @t[6], @x[5]
+ veor @x[1], @x[1], @t[1]
+ vext.8 @t[0], @x[4], @x[4], #8
+ veor @t[4], @t[4], @x[3]
+ vext.8 @t[1], @x[5], @x[5], #8
+ veor @t[7], @t[7], @x[6]
+ vext.8 @x[4], @x[3], @x[3], #8
+ veor @t[3], @t[3], @x[2]
+ vext.8 @x[5], @x[7], @x[7], #8
+ veor @t[4], @t[4], @x[7]
+ vext.8 @x[3], @x[6], @x[6], #8
+ veor @t[3], @t[3], @x[7]
+ vext.8 @x[6], @x[2], @x[2], #8
+ veor @x[7], @t[1], @t[5]
+___
+$code.=<<___ if (!$inv);
+ veor @x[2], @t[0], @t[4]
+ veor @x[4], @x[4], @t[3]
+ veor @x[5], @x[5], @t[7]
+ veor @x[3], @x[3], @t[6]
+ @ vmov @x[2], @t[0]
+ veor @x[6], @x[6], @t[2]
+ @ vmov @x[7], @t[1]
+___
+$code.=<<___ if ($inv);
+ veor @t[3], @t[3], @x[4]
+ veor @x[5], @x[5], @t[7]
+ veor @x[2], @x[3], @t[6]
+ veor @x[3], @t[0], @t[4]
+ veor @x[4], @x[6], @t[2]
+ vmov @x[6], @t[3]
+ @ vmov @x[7], @t[1]
+___
+}
+
+sub InvMixColumns_orig {
+my @x=@_[0..7];
+my @t=@_[8..15];
+
+$code.=<<___;
+ @ multiplication by 0x0e
+ vext.8 @t[7], @x[7], @x[7], #12
+ vmov @t[2], @x[2]
+ veor @x[2], @x[2], @x[5] @ 2 5
+ veor @x[7], @x[7], @x[5] @ 7 5
+ vext.8 @t[0], @x[0], @x[0], #12
+ vmov @t[5], @x[5]
+ veor @x[5], @x[5], @x[0] @ 5 0 [1]
+ veor @x[0], @x[0], @x[1] @ 0 1
+ vext.8 @t[1], @x[1], @x[1], #12
+ veor @x[1], @x[1], @x[2] @ 1 25
+ veor @x[0], @x[0], @x[6] @ 01 6 [2]
+ vext.8 @t[3], @x[3], @x[3], #12
+ veor @x[1], @x[1], @x[3] @ 125 3 [4]
+ veor @x[2], @x[2], @x[0] @ 25 016 [3]
+ veor @x[3], @x[3], @x[7] @ 3 75
+ veor @x[7], @x[7], @x[6] @ 75 6 [0]
+ vext.8 @t[6], @x[6], @x[6], #12
+ vmov @t[4], @x[4]
+ veor @x[6], @x[6], @x[4] @ 6 4
+ veor @x[4], @x[4], @x[3] @ 4 375 [6]
+ veor @x[3], @x[3], @x[7] @ 375 756=36
+ veor @x[6], @x[6], @t[5] @ 64 5 [7]
+ veor @x[3], @x[3], @t[2] @ 36 2
+ vext.8 @t[5], @t[5], @t[5], #12
+ veor @x[3], @x[3], @t[4] @ 362 4 [5]
+___
+ my @y = @x[7,5,0,2,1,3,4,6];
+$code.=<<___;
+ @ multiplication by 0x0b
+ veor @y[1], @y[1], @y[0]
+ veor @y[0], @y[0], @t[0]
+ vext.8 @t[2], @t[2], @t[2], #12
+ veor @y[1], @y[1], @t[1]
+ veor @y[0], @y[0], @t[5]
+ vext.8 @t[4], @t[4], @t[4], #12
+ veor @y[1], @y[1], @t[6]
+ veor @y[0], @y[0], @t[7]
+ veor @t[7], @t[7], @t[6] @ clobber t[7]
+
+ veor @y[3], @y[3], @t[0]
+ veor @y[1], @y[1], @y[0]
+ vext.8 @t[0], @t[0], @t[0], #12
+ veor @y[2], @y[2], @t[1]
+ veor @y[4], @y[4], @t[1]
+ vext.8 @t[1], @t[1], @t[1], #12
+ veor @y[2], @y[2], @t[2]
+ veor @y[3], @y[3], @t[2]
+ veor @y[5], @y[5], @t[2]
+ veor @y[2], @y[2], @t[7]
+ vext.8 @t[2], @t[2], @t[2], #12
+ veor @y[3], @y[3], @t[3]
+ veor @y[6], @y[6], @t[3]
+ veor @y[4], @y[4], @t[3]
+ veor @y[7], @y[7], @t[4]
+ vext.8 @t[3], @t[3], @t[3], #12
+ veor @y[5], @y[5], @t[4]
+ veor @y[7], @y[7], @t[7]
+ veor @t[7], @t[7], @t[5] @ clobber t[7] even more
+ veor @y[3], @y[3], @t[5]
+ veor @y[4], @y[4], @t[4]
+
+ veor @y[5], @y[5], @t[7]
+ vext.8 @t[4], @t[4], @t[4], #12
+ veor @y[6], @y[6], @t[7]
+ veor @y[4], @y[4], @t[7]
+
+ veor @t[7], @t[7], @t[5]
+ vext.8 @t[5], @t[5], @t[5], #12
+
+ @ multiplication by 0x0d
+ veor @y[4], @y[4], @y[7]
+ veor @t[7], @t[7], @t[6] @ restore t[7]
+ veor @y[7], @y[7], @t[4]
+ vext.8 @t[6], @t[6], @t[6], #12
+ veor @y[2], @y[2], @t[0]
+ veor @y[7], @y[7], @t[5]
+ vext.8 @t[7], @t[7], @t[7], #12
+ veor @y[2], @y[2], @t[2]
+
+ veor @y[3], @y[3], @y[1]
+ veor @y[1], @y[1], @t[1]
+ veor @y[0], @y[0], @t[0]
+ veor @y[3], @y[3], @t[0]
+ veor @y[1], @y[1], @t[5]
+ veor @y[0], @y[0], @t[5]
+ vext.8 @t[0], @t[0], @t[0], #12
+ veor @y[1], @y[1], @t[7]
+ veor @y[0], @y[0], @t[6]
+ veor @y[3], @y[3], @y[1]
+ veor @y[4], @y[4], @t[1]
+ vext.8 @t[1], @t[1], @t[1], #12
+
+ veor @y[7], @y[7], @t[7]
+ veor @y[4], @y[4], @t[2]
+ veor @y[5], @y[5], @t[2]
+ veor @y[2], @y[2], @t[6]
+ veor @t[6], @t[6], @t[3] @ clobber t[6]
+ vext.8 @t[2], @t[2], @t[2], #12
+ veor @y[4], @y[4], @y[7]
+ veor @y[3], @y[3], @t[6]
+
+ veor @y[6], @y[6], @t[6]
+ veor @y[5], @y[5], @t[5]
+ vext.8 @t[5], @t[5], @t[5], #12
+ veor @y[6], @y[6], @t[4]
+ vext.8 @t[4], @t[4], @t[4], #12
+ veor @y[5], @y[5], @t[6]
+ veor @y[6], @y[6], @t[7]
+ vext.8 @t[7], @t[7], @t[7], #12
+ veor @t[6], @t[6], @t[3] @ restore t[6]
+ vext.8 @t[3], @t[3], @t[3], #12
+
+ @ multiplication by 0x09
+ veor @y[4], @y[4], @y[1]
+ veor @t[1], @t[1], @y[1] @ t[1]=y[1]
+ veor @t[0], @t[0], @t[5] @ clobber t[0]
+ vext.8 @t[6], @t[6], @t[6], #12
+ veor @t[1], @t[1], @t[5]
+ veor @y[3], @y[3], @t[0]
+ veor @t[0], @t[0], @y[0] @ t[0]=y[0]
+ veor @t[1], @t[1], @t[6]
+ veor @t[6], @t[6], @t[7] @ clobber t[6]
+ veor @y[4], @y[4], @t[1]
+ veor @y[7], @y[7], @t[4]
+ veor @y[6], @y[6], @t[3]
+ veor @y[5], @y[5], @t[2]
+ veor @t[4], @t[4], @y[4] @ t[4]=y[4]
+ veor @t[3], @t[3], @y[3] @ t[3]=y[3]
+ veor @t[5], @t[5], @y[5] @ t[5]=y[5]
+ veor @t[2], @t[2], @y[2] @ t[2]=y[2]
+ veor @t[3], @t[3], @t[7]
+ veor @XMM[5], @t[5], @t[6]
+ veor @XMM[6], @t[6], @y[6] @ t[6]=y[6]
+ veor @XMM[2], @t[2], @t[6]
+ veor @XMM[7], @t[7], @y[7] @ t[7]=y[7]
+
+ vmov @XMM[0], @t[0]
+ vmov @XMM[1], @t[1]
+ @ vmov @XMM[2], @t[2]
+ vmov @XMM[3], @t[3]
+ vmov @XMM[4], @t[4]
+ @ vmov @XMM[5], @t[5]
+ @ vmov @XMM[6], @t[6]
+ @ vmov @XMM[7], @t[7]
+___
+}
+
+sub InvMixColumns {
+my @x=@_[0..7];
+my @t=@_[8..15];
+
+# Thanks to Jussi Kivilinna for providing pointer to
+#
+# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
+# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
+# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
+# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
+
+$code.=<<___;
+ @ multiplication by 0x05-0x00-0x04-0x00
+ vext.8 @t[0], @x[0], @x[0], #8
+ vext.8 @t[6], @x[6], @x[6], #8
+ vext.8 @t[7], @x[7], @x[7], #8
+ veor @t[0], @t[0], @x[0]
+ vext.8 @t[1], @x[1], @x[1], #8
+ veor @t[6], @t[6], @x[6]
+ vext.8 @t[2], @x[2], @x[2], #8
+ veor @t[7], @t[7], @x[7]
+ vext.8 @t[3], @x[3], @x[3], #8
+ veor @t[1], @t[1], @x[1]
+ vext.8 @t[4], @x[4], @x[4], #8
+ veor @t[2], @t[2], @x[2]
+ vext.8 @t[5], @x[5], @x[5], #8
+ veor @t[3], @t[3], @x[3]
+ veor @t[4], @t[4], @x[4]
+ veor @t[5], @t[5], @x[5]
+
+ veor @x[0], @x[0], @t[6]
+ veor @x[1], @x[1], @t[6]
+ veor @x[2], @x[2], @t[0]
+ veor @x[4], @x[4], @t[2]
+ veor @x[3], @x[3], @t[1]
+ veor @x[1], @x[1], @t[7]
+ veor @x[2], @x[2], @t[7]
+ veor @x[4], @x[4], @t[6]
+ veor @x[5], @x[5], @t[3]
+ veor @x[3], @x[3], @t[6]
+ veor @x[6], @x[6], @t[4]
+ veor @x[4], @x[4], @t[7]
+ veor @x[5], @x[5], @t[7]
+ veor @x[7], @x[7], @t[5]
+___
+ &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
+}
+
+sub swapmove {
+my ($a,$b,$n,$mask,$t)=@_;
+$code.=<<___;
+ vshr.u64 $t, $b, #$n
+ veor $t, $t, $a
+ vand $t, $t, $mask
+ veor $a, $a, $t
+ vshl.u64 $t, $t, #$n
+ veor $b, $b, $t
+___
+}
+sub swapmove2x {
+my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
+$code.=<<___;
+ vshr.u64 $t0, $b0, #$n
+ vshr.u64 $t1, $b1, #$n
+ veor $t0, $t0, $a0
+ veor $t1, $t1, $a1
+ vand $t0, $t0, $mask
+ vand $t1, $t1, $mask
+ veor $a0, $a0, $t0
+ vshl.u64 $t0, $t0, #$n
+ veor $a1, $a1, $t1
+ vshl.u64 $t1, $t1, #$n
+ veor $b0, $b0, $t0
+ veor $b1, $b1, $t1
+___
+}
+
+sub bitslice {
+my @x=reverse(@_[0..7]);
+my ($t0,$t1,$t2,$t3)=@_[8..11];
+$code.=<<___;
+ vmov.i8 $t0,#0x55 @ compose .LBS0
+ vmov.i8 $t1,#0x33 @ compose .LBS1
+___
+ &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
+ &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+$code.=<<___;
+ vmov.i8 $t0,#0x0f @ compose .LBS2
+___
+ &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
+ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+
+ &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
+ &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
+}
+
+$code.=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+
+# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
+# define VFP_ABI_POP vldmia sp!,{d8-d15}
+# define VFP_ABI_FRAME 0x40
+#else
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+# define VFP_ABI_FRAME 0
+# define BSAES_ASM_EXTENDED_KEY
+# define XTS_CHAIN_TWEAK
+# define __ARM_ARCH__ 7
+#endif
+
+#ifdef __thumb__
+# define adrl adr
+#endif
+
+#if __ARM_ARCH__>=7
+.text
+.syntax unified @ ARMv7-capable assembler is expected to handle this
+#ifdef __thumb2__
+.thumb
+#else
+.code 32
+#endif
+
+.fpu neon
+
+.type _bsaes_decrypt8,%function
+.align 4
+_bsaes_decrypt8:
+ adr $const,_bsaes_decrypt8
+ vldmia $key!, {@XMM[9]} @ round 0 key
+ add $const,$const,#.LM0ISR-_bsaes_decrypt8
+
+ vldmia $const!, {@XMM[8]} @ .LM0ISR
+ veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
+ veor @XMM[11], @XMM[1], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+ veor @XMM[12], @XMM[2], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+ veor @XMM[13], @XMM[3], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
+ veor @XMM[14], @XMM[4], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
+ veor @XMM[15], @XMM[5], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
+ veor @XMM[10], @XMM[6], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
+ veor @XMM[11], @XMM[7], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+ vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+___
+ &bitslice (@XMM[0..7, 8..11]);
+$code.=<<___;
+ sub $rounds,$rounds,#1
+ b .Ldec_sbox
+.align 4
+.Ldec_loop:
+___
+ &ShiftRows (@XMM[0..7, 8..12]);
+$code.=".Ldec_sbox:\n";
+ &InvSbox (@XMM[0..7, 8..15]);
+$code.=<<___;
+ subs $rounds,$rounds,#1
+ bcc .Ldec_done
+___
+ &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
+$code.=<<___;
+ vldmia $const, {@XMM[12]} @ .LISR
+ ite eq @ Thumb2 thing, sanity check in ARM
+ addeq $const,$const,#0x10
+ bne .Ldec_loop
+ vldmia $const, {@XMM[12]} @ .LISRM0
+ b .Ldec_loop
+.align 4
+.Ldec_done:
+___
+ &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
+$code.=<<___;
+ vldmia $key, {@XMM[8]} @ last round key
+ veor @XMM[6], @XMM[6], @XMM[8]
+ veor @XMM[4], @XMM[4], @XMM[8]
+ veor @XMM[2], @XMM[2], @XMM[8]
+ veor @XMM[7], @XMM[7], @XMM[8]
+ veor @XMM[3], @XMM[3], @XMM[8]
+ veor @XMM[5], @XMM[5], @XMM[8]
+ veor @XMM[0], @XMM[0], @XMM[8]
+ veor @XMM[1], @XMM[1], @XMM[8]
+ bx lr
+.size _bsaes_decrypt8,.-_bsaes_decrypt8
+
+.type _bsaes_const,%object
+.align 6
+_bsaes_const:
+.LM0ISR: @ InvShiftRows constants
+ .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISR:
+ .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
+.LISRM0:
+ .quad 0x01040b0e0205080f, 0x0306090c00070a0d
+.LM0SR: @ ShiftRows constants
+ .quad 0x0a0e02060f03070b, 0x0004080c05090d01
+.LSR:
+ .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+ .quad 0x0304090e00050a0f, 0x01060b0c0207080d
+.LM0:
+ .quad 0x02060a0e03070b0f, 0x0004080c0105090d
+.LREVM0SR:
+ .quad 0x090d01050c000408, 0x03070b0f060a0e02
+.asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align 6
+.size _bsaes_const,.-_bsaes_const
+
+.type _bsaes_encrypt8,%function
+.align 4
+_bsaes_encrypt8:
+ adr $const,_bsaes_encrypt8
+ vldmia $key!, {@XMM[9]} @ round 0 key
+ sub $const,$const,#_bsaes_encrypt8-.LM0SR
+
+ vldmia $const!, {@XMM[8]} @ .LM0SR
+_bsaes_encrypt8_alt:
+ veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
+ veor @XMM[11], @XMM[1], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+ veor @XMM[12], @XMM[2], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+ veor @XMM[13], @XMM[3], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
+ veor @XMM[14], @XMM[4], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
+ veor @XMM[15], @XMM[5], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
+ veor @XMM[10], @XMM[6], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
+ veor @XMM[11], @XMM[7], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+ vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+_bsaes_encrypt8_bitslice:
+___
+ &bitslice (@XMM[0..7, 8..11]);
+$code.=<<___;
+ sub $rounds,$rounds,#1
+ b .Lenc_sbox
+.align 4
+.Lenc_loop:
+___
+ &ShiftRows (@XMM[0..7, 8..12]);
+$code.=".Lenc_sbox:\n";
+ &Sbox (@XMM[0..7, 8..15]);
+$code.=<<___;
+ subs $rounds,$rounds,#1
+ bcc .Lenc_done
+___
+ &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
+$code.=<<___;
+ vldmia $const, {@XMM[12]} @ .LSR
+ ite eq @ Thumb2 thing, samity check in ARM
+ addeq $const,$const,#0x10
+ bne .Lenc_loop
+ vldmia $const, {@XMM[12]} @ .LSRM0
+ b .Lenc_loop
+.align 4
+.Lenc_done:
+___
+ # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
+ &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
+$code.=<<___;
+ vldmia $key, {@XMM[8]} @ last round key
+ veor @XMM[4], @XMM[4], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[8]
+ veor @XMM[3], @XMM[3], @XMM[8]
+ veor @XMM[7], @XMM[7], @XMM[8]
+ veor @XMM[2], @XMM[2], @XMM[8]
+ veor @XMM[5], @XMM[5], @XMM[8]
+ veor @XMM[0], @XMM[0], @XMM[8]
+ veor @XMM[1], @XMM[1], @XMM[8]
+ bx lr
+.size _bsaes_encrypt8,.-_bsaes_encrypt8
+___
+}
+{
+my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6");
+
+sub bitslice_key {
+my @x=reverse(@_[0..7]);
+my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
+
+ &swapmove (@x[0,1],1,$bs0,$t2,$t3);
+$code.=<<___;
+ @ &swapmove(@x[2,3],1,$t0,$t2,$t3);
+ vmov @x[2], @x[0]
+ vmov @x[3], @x[1]
+___
+ #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+
+ &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
+$code.=<<___;
+ @ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+ vmov @x[4], @x[0]
+ vmov @x[6], @x[2]
+ vmov @x[5], @x[1]
+ vmov @x[7], @x[3]
+___
+ &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
+ &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
+}
+
+$code.=<<___;
+.type _bsaes_key_convert,%function
+.align 4
+_bsaes_key_convert:
+ adr $const,_bsaes_key_convert
+ vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key
+ sub $const,$const,#_bsaes_key_convert-.LM0
+ vld1.8 {@XMM[15]}, [$inp]! @ load round 1 key
+
+ vmov.i8 @XMM[8], #0x01 @ bit masks
+ vmov.i8 @XMM[9], #0x02
+ vmov.i8 @XMM[10], #0x04
+ vmov.i8 @XMM[11], #0x08
+ vmov.i8 @XMM[12], #0x10
+ vmov.i8 @XMM[13], #0x20
+ vldmia $const, {@XMM[14]} @ .LM0
+
+#ifdef __ARMEL__
+ vrev32.8 @XMM[7], @XMM[7]
+ vrev32.8 @XMM[15], @XMM[15]
+#endif
+ sub $rounds,$rounds,#1
+ vstmia $out!, {@XMM[7]} @ save round 0 key
+ b .Lkey_loop
+
+.align 4
+.Lkey_loop:
+ vtbl.8 `&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])`
+ vtbl.8 `&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])`
+ vmov.i8 @XMM[6], #0x40
+ vmov.i8 @XMM[15], #0x80
+
+ vtst.8 @XMM[0], @XMM[7], @XMM[8]
+ vtst.8 @XMM[1], @XMM[7], @XMM[9]
+ vtst.8 @XMM[2], @XMM[7], @XMM[10]
+ vtst.8 @XMM[3], @XMM[7], @XMM[11]
+ vtst.8 @XMM[4], @XMM[7], @XMM[12]
+ vtst.8 @XMM[5], @XMM[7], @XMM[13]
+ vtst.8 @XMM[6], @XMM[7], @XMM[6]
+ vtst.8 @XMM[7], @XMM[7], @XMM[15]
+ vld1.8 {@XMM[15]}, [$inp]! @ load next round key
+ vmvn @XMM[0], @XMM[0] @ "pnot"
+ vmvn @XMM[1], @XMM[1]
+ vmvn @XMM[5], @XMM[5]
+ vmvn @XMM[6], @XMM[6]
+#ifdef __ARMEL__
+ vrev32.8 @XMM[15], @XMM[15]
+#endif
+ subs $rounds,$rounds,#1
+ vstmia $out!,{@XMM[0]-@XMM[7]} @ write bit-sliced round key
+ bne .Lkey_loop
+
+ vmov.i8 @XMM[7],#0x63 @ compose .L63
+ @ don't save last round key
+ bx lr
+.size _bsaes_key_convert,.-_bsaes_key_convert
+___
+}
+
+if (0) { # following four functions are unsupported interface
+ # used for benchmarking...
+$code.=<<___;
+.globl bsaes_enc_key_convert
+.type bsaes_enc_key_convert,%function
+.align 4
+bsaes_enc_key_convert:
+ stmdb sp!,{r4-r6,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+
+ ldr r5,[$inp,#240] @ pass rounds
+ mov r4,$inp @ pass key
+ mov r12,$out @ pass key schedule
+ bl _bsaes_key_convert
+ veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
+ vstmia r12, {@XMM[7]} @ save last round key
+
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r6,pc}
+.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
+
+.globl bsaes_encrypt_128
+.type bsaes_encrypt_128,%function
+.align 4
+bsaes_encrypt_128:
+ stmdb sp!,{r4-r6,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+.Lenc128_loop:
+ vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
+ vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
+ mov r4,$key @ pass the key
+ vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
+ mov r5,#10 @ pass rounds
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
+
+ bl _bsaes_encrypt8
+
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ vst1.8 {@XMM[4]}, [$out]!
+ vst1.8 {@XMM[6]}, [$out]!
+ vst1.8 {@XMM[3]}, [$out]!
+ vst1.8 {@XMM[7]}, [$out]!
+ vst1.8 {@XMM[2]}, [$out]!
+ subs $len,$len,#0x80
+ vst1.8 {@XMM[5]}, [$out]!
+ bhi .Lenc128_loop
+
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r6,pc}
+.size bsaes_encrypt_128,.-bsaes_encrypt_128
+
+.globl bsaes_dec_key_convert
+.type bsaes_dec_key_convert,%function
+.align 4
+bsaes_dec_key_convert:
+ stmdb sp!,{r4-r6,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+
+ ldr r5,[$inp,#240] @ pass rounds
+ mov r4,$inp @ pass key
+ mov r12,$out @ pass key schedule
+ bl _bsaes_key_convert
+ vldmia $out, {@XMM[6]}
+ vstmia r12, {@XMM[15]} @ save last round key
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
+ vstmia $out, {@XMM[7]}
+
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r6,pc}
+.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
+
+.globl bsaes_decrypt_128
+.type bsaes_decrypt_128,%function
+.align 4
+bsaes_decrypt_128:
+ stmdb sp!,{r4-r6,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+.Ldec128_loop:
+ vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
+ vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
+ mov r4,$key @ pass the key
+ vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
+ mov r5,#10 @ pass rounds
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
+
+ bl _bsaes_decrypt8
+
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ vst1.8 {@XMM[6]}, [$out]!
+ vst1.8 {@XMM[4]}, [$out]!
+ vst1.8 {@XMM[2]}, [$out]!
+ vst1.8 {@XMM[7]}, [$out]!
+ vst1.8 {@XMM[3]}, [$out]!
+ subs $len,$len,#0x80
+ vst1.8 {@XMM[5]}, [$out]!
+ bhi .Ldec128_loop
+
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r6,pc}
+.size bsaes_decrypt_128,.-bsaes_decrypt_128
+___
+}
+{
+my ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10));
+my ($keysched)=("sp");
+
+$code.=<<___;
+.extern AES_cbc_encrypt
+.extern AES_decrypt
+
+.global bsaes_cbc_encrypt
+.type bsaes_cbc_encrypt,%function
+.align 5
+bsaes_cbc_encrypt:
+#ifndef __KERNEL__
+ cmp $len, #128
+#ifndef __thumb__
+ blo AES_cbc_encrypt
+#else
+ bhs 1f
+ b AES_cbc_encrypt
+1:
+#endif
+#endif
+
+ @ it is up to the caller to make sure we are called with enc == 0
+
+ mov ip, sp
+ stmdb sp!, {r4-r10, lr}
+ VFP_ABI_PUSH
+ ldr $ivp, [ip] @ IV is 1st arg on the stack
+ mov $len, $len, lsr#4 @ len in 16 byte blocks
+ sub sp, #0x10 @ scratch space to carry over the IV
+ mov $fp, sp @ save sp
+
+ ldr $rounds, [$key, #240] @ get # of rounds
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
+ add r12, #`128-32` @ sifze of bit-slices key schedule
+
+ @ populate the key schedule
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ mov sp, r12 @ sp is $keysched
+ bl _bsaes_key_convert
+ vldmia $keysched, {@XMM[6]}
+ vstmia r12, {@XMM[15]} @ save last round key
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
+ vstmia $keysched, {@XMM[7]}
+#else
+ ldr r12, [$key, #244]
+ eors r12, #1
+ beq 0f
+
+ @ populate the key schedule
+ str r12, [$key, #244]
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ add r12, $key, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ add r4, $key, #248
+ vldmia r4, {@XMM[6]}
+ vstmia r12, {@XMM[15]} @ save last round key
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
+ vstmia r4, {@XMM[7]}
+
+.align 2
+0:
+#endif
+
+ vld1.8 {@XMM[15]}, [$ivp] @ load IV
+ b .Lcbc_dec_loop
+
+.align 4
+.Lcbc_dec_loop:
+ subs $len, $len, #0x8
+ bmi .Lcbc_dec_loop_finish
+
+ vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
+ vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
+#ifndef BSAES_ASM_EXTENDED_KEY
+ mov r4, $keysched @ pass the key
+#else
+ add r4, $key, #248
+#endif
+ vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
+ mov r5, $rounds
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]
+ sub $inp, $inp, #0x60
+ vstmia $fp, {@XMM[15]} @ put aside IV
+
+ bl _bsaes_decrypt8
+
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
+ veor @XMM[1], @XMM[1], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[9]
+ vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
+ veor @XMM[4], @XMM[4], @XMM[10]
+ veor @XMM[2], @XMM[2], @XMM[11]
+ vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
+ veor @XMM[7], @XMM[7], @XMM[12]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ veor @XMM[3], @XMM[3], @XMM[13]
+ vst1.8 {@XMM[6]}, [$out]!
+ veor @XMM[5], @XMM[5], @XMM[14]
+ vst1.8 {@XMM[4]}, [$out]!
+ vst1.8 {@XMM[2]}, [$out]!
+ vst1.8 {@XMM[7]}, [$out]!
+ vst1.8 {@XMM[3]}, [$out]!
+ vst1.8 {@XMM[5]}, [$out]!
+
+ b .Lcbc_dec_loop
+
+.Lcbc_dec_loop_finish:
+ adds $len, $len, #8
+ beq .Lcbc_dec_done
+
+ vld1.8 {@XMM[0]}, [$inp]! @ load input
+ cmp $len, #2
+ blo .Lcbc_dec_one
+ vld1.8 {@XMM[1]}, [$inp]!
+#ifndef BSAES_ASM_EXTENDED_KEY
+ mov r4, $keysched @ pass the key
+#else
+ add r4, $key, #248
+#endif
+ mov r5, $rounds
+ vstmia $fp, {@XMM[15]} @ put aside IV
+ beq .Lcbc_dec_two
+ vld1.8 {@XMM[2]}, [$inp]!
+ cmp $len, #4
+ blo .Lcbc_dec_three
+ vld1.8 {@XMM[3]}, [$inp]!
+ beq .Lcbc_dec_four
+ vld1.8 {@XMM[4]}, [$inp]!
+ cmp $len, #6
+ blo .Lcbc_dec_five
+ vld1.8 {@XMM[5]}, [$inp]!
+ beq .Lcbc_dec_six
+ vld1.8 {@XMM[6]}, [$inp]!
+ sub $inp, $inp, #0x70
+
+ bl _bsaes_decrypt8
+
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
+ veor @XMM[1], @XMM[1], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[9]
+ vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
+ veor @XMM[4], @XMM[4], @XMM[10]
+ veor @XMM[2], @XMM[2], @XMM[11]
+ vld1.8 {@XMM[15]}, [$inp]!
+ veor @XMM[7], @XMM[7], @XMM[12]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ veor @XMM[3], @XMM[3], @XMM[13]
+ vst1.8 {@XMM[6]}, [$out]!
+ vst1.8 {@XMM[4]}, [$out]!
+ vst1.8 {@XMM[2]}, [$out]!
+ vst1.8 {@XMM[7]}, [$out]!
+ vst1.8 {@XMM[3]}, [$out]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_six:
+ sub $inp, $inp, #0x60
+ bl _bsaes_decrypt8
+ vldmia $fp,{@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
+ veor @XMM[1], @XMM[1], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[9]
+ vld1.8 {@XMM[12]}, [$inp]!
+ veor @XMM[4], @XMM[4], @XMM[10]
+ veor @XMM[2], @XMM[2], @XMM[11]
+ vld1.8 {@XMM[15]}, [$inp]!
+ veor @XMM[7], @XMM[7], @XMM[12]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ vst1.8 {@XMM[6]}, [$out]!
+ vst1.8 {@XMM[4]}, [$out]!
+ vst1.8 {@XMM[2]}, [$out]!
+ vst1.8 {@XMM[7]}, [$out]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_five:
+ sub $inp, $inp, #0x50
+ bl _bsaes_decrypt8
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
+ veor @XMM[1], @XMM[1], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[9]
+ vld1.8 {@XMM[15]}, [$inp]!
+ veor @XMM[4], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ veor @XMM[2], @XMM[2], @XMM[11]
+ vst1.8 {@XMM[6]}, [$out]!
+ vst1.8 {@XMM[4]}, [$out]!
+ vst1.8 {@XMM[2]}, [$out]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_four:
+ sub $inp, $inp, #0x40
+ bl _bsaes_decrypt8
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[10]}, [$inp]!
+ veor @XMM[1], @XMM[1], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[9]
+ vld1.8 {@XMM[15]}, [$inp]!
+ veor @XMM[4], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ vst1.8 {@XMM[6]}, [$out]!
+ vst1.8 {@XMM[4]}, [$out]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_three:
+ sub $inp, $inp, #0x30
+ bl _bsaes_decrypt8
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[15]}, [$inp]!
+ veor @XMM[1], @XMM[1], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[9]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ vst1.8 {@XMM[6]}, [$out]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_two:
+ sub $inp, $inp, #0x20
+ bl _bsaes_decrypt8
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[15]}, [$inp]! @ reload input
+ veor @XMM[1], @XMM[1], @XMM[8]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_one:
+ sub $inp, $inp, #0x10
+ mov $rounds, $out @ save original out pointer
+ mov $out, $fp @ use the iv scratch space as out buffer
+ mov r2, $key
+ vmov @XMM[4],@XMM[15] @ just in case ensure that IV
+ vmov @XMM[5],@XMM[0] @ and input are preserved
+ bl AES_decrypt
+ vld1.8 {@XMM[0]}, [$fp,:64] @ load result
+ veor @XMM[0], @XMM[0], @XMM[4] @ ^= IV
+ vmov @XMM[15], @XMM[5] @ @XMM[5] holds input
+ vst1.8 {@XMM[0]}, [$rounds] @ write output
+
+.Lcbc_dec_done:
+#ifndef BSAES_ASM_EXTENDED_KEY
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+.Lcbc_dec_bzero: @ wipe key schedule [if any]
+ vstmia $keysched!, {q0-q1}
+ cmp $keysched, $fp
+ bne .Lcbc_dec_bzero
+#endif
+
+ mov sp, $fp
+ add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
+ vst1.8 {@XMM[15]}, [$ivp] @ return IV
+ VFP_ABI_POP
+ ldmia sp!, {r4-r10, pc}
+.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
+___
+}
+{
+my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10)));
+my $const = "r6"; # shared with _bsaes_encrypt8_alt
+my $keysched = "sp";
+
+$code.=<<___;
+.extern AES_encrypt
+.global bsaes_ctr32_encrypt_blocks
+.type bsaes_ctr32_encrypt_blocks,%function
+.align 5
+bsaes_ctr32_encrypt_blocks:
+ cmp $len, #8 @ use plain AES for
+ blo .Lctr_enc_short @ small sizes
+
+ mov ip, sp
+ stmdb sp!, {r4-r10, lr}
+ VFP_ABI_PUSH
+ ldr $ctr, [ip] @ ctr is 1st arg on the stack
+ sub sp, sp, #0x10 @ scratch space to carry over the ctr
+ mov $fp, sp @ save sp
+
+ ldr $rounds, [$key, #240] @ get # of rounds
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
+ add r12, #`128-32` @ size of bit-sliced key schedule
+
+ @ populate the key schedule
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ mov sp, r12 @ sp is $keysched
+ bl _bsaes_key_convert
+ veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
+ vstmia r12, {@XMM[7]} @ save last round key
+
+ vld1.8 {@XMM[0]}, [$ctr] @ load counter
+ add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr
+ vldmia $keysched, {@XMM[4]} @ load round0 key
+#else
+ ldr r12, [$key, #244]
+ eors r12, #1
+ beq 0f
+
+ @ populate the key schedule
+ str r12, [$key, #244]
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ add r12, $key, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
+ vstmia r12, {@XMM[7]} @ save last round key
+
+.align 2
+0: add r12, $key, #248
+ vld1.8 {@XMM[0]}, [$ctr] @ load counter
+ adrl $ctr, .LREVM0SR @ borrow $ctr
+ vldmia r12, {@XMM[4]} @ load round0 key
+ sub sp, #0x10 @ place for adjusted round0 key
+#endif
+
+ vmov.i32 @XMM[8],#1 @ compose 1<<96
+ veor @XMM[9],@XMM[9],@XMM[9]
+ vrev32.8 @XMM[0],@XMM[0]
+ vext.8 @XMM[8],@XMM[9],@XMM[8],#4
+ vrev32.8 @XMM[4],@XMM[4]
+ vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
+ vstmia $keysched, {@XMM[4]} @ save adjusted round0 key
+ b .Lctr_enc_loop
+
+.align 4
+.Lctr_enc_loop:
+ vadd.u32 @XMM[10], @XMM[8], @XMM[9] @ compose 3<<96
+ vadd.u32 @XMM[1], @XMM[0], @XMM[8] @ +1
+ vadd.u32 @XMM[2], @XMM[0], @XMM[9] @ +2
+ vadd.u32 @XMM[3], @XMM[0], @XMM[10] @ +3
+ vadd.u32 @XMM[4], @XMM[1], @XMM[10]
+ vadd.u32 @XMM[5], @XMM[2], @XMM[10]
+ vadd.u32 @XMM[6], @XMM[3], @XMM[10]
+ vadd.u32 @XMM[7], @XMM[4], @XMM[10]
+ vadd.u32 @XMM[10], @XMM[5], @XMM[10] @ next counter
+
+ @ Borrow prologue from _bsaes_encrypt8 to use the opportunity
+ @ to flip byte order in 32-bit counter
+
+ vldmia $keysched, {@XMM[9]} @ load round0 key
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, $keysched, #0x10 @ pass next round key
+#else
+ add r4, $key, #`248+16`
+#endif
+ vldmia $ctr, {@XMM[8]} @ .LREVM0SR
+ mov r5, $rounds @ pass rounds
+ vstmia $fp, {@XMM[10]} @ save next counter
+ sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants
+
+ bl _bsaes_encrypt8_alt
+
+ subs $len, $len, #8
+ blo .Lctr_enc_loop_done
+
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ load input
+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
+ veor @XMM[0], @XMM[8]
+ veor @XMM[1], @XMM[9]
+ vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
+ veor @XMM[4], @XMM[10]
+ veor @XMM[6], @XMM[11]
+ vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
+ veor @XMM[3], @XMM[12]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ veor @XMM[7], @XMM[13]
+ veor @XMM[2], @XMM[14]
+ vst1.8 {@XMM[4]}, [$out]!
+ veor @XMM[5], @XMM[15]
+ vst1.8 {@XMM[6]}, [$out]!
+ vmov.i32 @XMM[8], #1 @ compose 1<<96
+ vst1.8 {@XMM[3]}, [$out]!
+ veor @XMM[9], @XMM[9], @XMM[9]
+ vst1.8 {@XMM[7]}, [$out]!
+ vext.8 @XMM[8], @XMM[9], @XMM[8], #4
+ vst1.8 {@XMM[2]}, [$out]!
+ vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
+ vst1.8 {@XMM[5]}, [$out]!
+ vldmia $fp, {@XMM[0]} @ load counter
+
+ bne .Lctr_enc_loop
+ b .Lctr_enc_done
+
+.align 4
+.Lctr_enc_loop_done:
+ add $len, $len, #8
+ vld1.8 {@XMM[8]}, [$inp]! @ load input
+ veor @XMM[0], @XMM[8]
+ vst1.8 {@XMM[0]}, [$out]! @ write output
+ cmp $len, #2
+ blo .Lctr_enc_done
+ vld1.8 {@XMM[9]}, [$inp]!
+ veor @XMM[1], @XMM[9]
+ vst1.8 {@XMM[1]}, [$out]!
+ beq .Lctr_enc_done
+ vld1.8 {@XMM[10]}, [$inp]!
+ veor @XMM[4], @XMM[10]
+ vst1.8 {@XMM[4]}, [$out]!
+ cmp $len, #4
+ blo .Lctr_enc_done
+ vld1.8 {@XMM[11]}, [$inp]!
+ veor @XMM[6], @XMM[11]
+ vst1.8 {@XMM[6]}, [$out]!
+ beq .Lctr_enc_done
+ vld1.8 {@XMM[12]}, [$inp]!
+ veor @XMM[3], @XMM[12]
+ vst1.8 {@XMM[3]}, [$out]!
+ cmp $len, #6
+ blo .Lctr_enc_done
+ vld1.8 {@XMM[13]}, [$inp]!
+ veor @XMM[7], @XMM[13]
+ vst1.8 {@XMM[7]}, [$out]!
+ beq .Lctr_enc_done
+ vld1.8 {@XMM[14]}, [$inp]
+ veor @XMM[2], @XMM[14]
+ vst1.8 {@XMM[2]}, [$out]!
+
+.Lctr_enc_done:
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+#ifndef BSAES_ASM_EXTENDED_KEY
+.Lctr_enc_bzero: @ wipe key schedule [if any]
+ vstmia $keysched!, {q0-q1}
+ cmp $keysched, $fp
+ bne .Lctr_enc_bzero
+#else
+ vstmia $keysched, {q0-q1}
+#endif
+
+ mov sp, $fp
+ add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
+ VFP_ABI_POP
+ ldmia sp!, {r4-r10, pc} @ return
+
+.align 4
+.Lctr_enc_short:
+ ldr ip, [sp] @ ctr pointer is passed on stack
+ stmdb sp!, {r4-r8, lr}
+
+ mov r4, $inp @ copy arguments
+ mov r5, $out
+ mov r6, $len
+ mov r7, $key
+ ldr r8, [ip, #12] @ load counter LSW
+ vld1.8 {@XMM[1]}, [ip] @ load whole counter value
+#ifdef __ARMEL__
+ rev r8, r8
+#endif
+ sub sp, sp, #0x10
+ vst1.8 {@XMM[1]}, [sp,:64] @ copy counter value
+ sub sp, sp, #0x10
+
+.Lctr_enc_short_loop:
+ add r0, sp, #0x10 @ input counter value
+ mov r1, sp @ output on the stack
+ mov r2, r7 @ key
+
+ bl AES_encrypt
+
+ vld1.8 {@XMM[0]}, [r4]! @ load input
+ vld1.8 {@XMM[1]}, [sp,:64] @ load encrypted counter
+ add r8, r8, #1
+#ifdef __ARMEL__
+ rev r0, r8
+ str r0, [sp, #0x1c] @ next counter value
+#else
+ str r8, [sp, #0x1c] @ next counter value
+#endif
+ veor @XMM[0],@XMM[0],@XMM[1]
+ vst1.8 {@XMM[0]}, [r5]! @ store output
+ subs r6, r6, #1
+ bne .Lctr_enc_short_loop
+
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+ vstmia sp!, {q0-q1}
+
+ ldmia sp!, {r4-r8, pc}
+.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
+___
+}
+{
+######################################################################
+# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+# const AES_KEY *key1, const AES_KEY *key2,
+# const unsigned char iv[16]);
+#
+my ($inp,$out,$len,$key,$rounds,$magic,$fp)=(map("r$_",(7..10,1..3)));
+my $const="r6"; # returned by _bsaes_key_convert
+my $twmask=@XMM[5];
+my @T=@XMM[6..7];
+
+$code.=<<___;
+.globl bsaes_xts_encrypt
+.type bsaes_xts_encrypt,%function
+.align 4
+bsaes_xts_encrypt:
+ mov ip, sp
+ stmdb sp!, {r4-r10, lr} @ 0x20
+ VFP_ABI_PUSH
+ mov r6, sp @ future $fp
+
+ mov $inp, r0
+ mov $out, r1
+ mov $len, r2
+ mov $key, r3
+
+ sub r0, sp, #0x10 @ 0x10
+ bic r0, #0xf @ align at 16 bytes
+ mov sp, r0
+
+#ifdef XTS_CHAIN_TWEAK
+ ldr r0, [ip] @ pointer to input tweak
+#else
+ @ generate initial tweak
+ ldr r0, [ip, #4] @ iv[]
+ mov r1, sp
+ ldr r2, [ip, #0] @ key2
+ bl AES_encrypt
+ mov r0,sp @ pointer to initial tweak
+#endif
+
+ ldr $rounds, [$key, #240] @ get # of rounds
+ mov $fp, r6
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
+ @ add r12, #`128-32` @ size of bit-sliced key schedule
+ sub r12, #`32+16` @ place for tweak[9]
+
+ @ populate the key schedule
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ mov sp, r12
+ add r12, #0x90 @ pass key schedule
+ bl _bsaes_key_convert
+ veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
+ vstmia r12, {@XMM[7]} @ save last round key
+#else
+ ldr r12, [$key, #244]
+ eors r12, #1
+ beq 0f
+
+ str r12, [$key, #244]
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ add r12, $key, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
+ vstmia r12, {@XMM[7]}
+
+.align 2
+0: sub sp, #0x90 @ place for tweak[9]
+#endif
+
+ vld1.8 {@XMM[8]}, [r0] @ initial tweak
+ adr $magic, .Lxts_magic
+
+ subs $len, #0x80
+ blo .Lxts_enc_short
+ b .Lxts_enc_loop
+
+.align 4
+.Lxts_enc_loop:
+ vldmia $magic, {$twmask} @ load XTS magic
+ vshr.s64 @T[0], @XMM[8], #63
+ mov r0, sp
+ vand @T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+ vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
+ vst1.64 {@XMM[$i-1]}, [r0,:128]!
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+ vshr.s64 @T[1], @XMM[$i], #63
+ veor @XMM[$i], @XMM[$i], @T[0]
+ vand @T[1], @T[1], $twmask
+___
+ @T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+ vld1.8 {@XMM[$i-10]}, [$inp]!
+___
+$code.=<<___ if ($i>=11);
+ veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+ vadd.u64 @XMM[8], @XMM[15], @XMM[15]
+ vst1.64 {@XMM[15]}, [r0,:128]!
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+ veor @XMM[8], @XMM[8], @T[0]
+ vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
+ veor @XMM[5], @XMM[5], @XMM[13]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[6], @XMM[6], @XMM[14]
+ mov r5, $rounds @ pass rounds
+ veor @XMM[7], @XMM[7], @XMM[15]
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[6], @XMM[11]
+ vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]!
+ veor @XMM[10], @XMM[3], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ veor @XMM[11], @XMM[7], @XMM[13]
+ veor @XMM[12], @XMM[2], @XMM[14]
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
+ veor @XMM[13], @XMM[5], @XMM[15]
+ vst1.8 {@XMM[12]-@XMM[13]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+
+ subs $len, #0x80
+ bpl .Lxts_enc_loop
+
+.Lxts_enc_short:
+ adds $len, #0x70
+ bmi .Lxts_enc_done
+
+ vldmia $magic, {$twmask} @ load XTS magic
+ vshr.s64 @T[0], @XMM[8], #63
+ mov r0, sp
+ vand @T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+ vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
+ vst1.64 {@XMM[$i-1]}, [r0,:128]!
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+ vshr.s64 @T[1], @XMM[$i], #63
+ veor @XMM[$i], @XMM[$i], @T[0]
+ vand @T[1], @T[1], $twmask
+___
+ @T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+ vld1.8 {@XMM[$i-10]}, [$inp]!
+ subs $len, #0x10
+ bmi .Lxts_enc_`$i-9`
+___
+$code.=<<___ if ($i>=11);
+ veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+ sub $len, #0x10
+ vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak
+
+ vld1.8 {@XMM[6]}, [$inp]!
+ veor @XMM[5], @XMM[5], @XMM[13]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[6], @XMM[6], @XMM[14]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[6], @XMM[11]
+ vld1.64 {@XMM[14]}, [r0,:128]!
+ veor @XMM[10], @XMM[3], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ veor @XMM[11], @XMM[7], @XMM[13]
+ veor @XMM[12], @XMM[2], @XMM[14]
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
+ vst1.8 {@XMM[12]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_6:
+ vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak
+
+ veor @XMM[4], @XMM[4], @XMM[12]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[5], @XMM[5], @XMM[13]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[6], @XMM[11]
+ veor @XMM[10], @XMM[3], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ veor @XMM[11], @XMM[7], @XMM[13]
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+
+@ put this in range for both ARM and Thumb mode adr instructions
+.align 5
+.Lxts_magic:
+ .quad 1, 0x87
+
+.align 5
+.Lxts_enc_5:
+ vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak
+
+ veor @XMM[3], @XMM[3], @XMM[11]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[4], @XMM[4], @XMM[12]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[6], @XMM[11]
+ veor @XMM[10], @XMM[3], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ vst1.8 {@XMM[10]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_4:
+ vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak
+
+ veor @XMM[2], @XMM[2], @XMM[10]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[3], @XMM[3], @XMM[11]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[6], @XMM[11]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_3:
+ vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak
+
+ veor @XMM[1], @XMM[1], @XMM[9]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[2], @XMM[2], @XMM[10]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
+ vld1.64 {@XMM[10]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ vst1.8 {@XMM[8]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_2:
+ vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak
+
+ veor @XMM[0], @XMM[0], @XMM[8]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[1], @XMM[1], @XMM[9]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_1:
+ mov r0, sp
+ veor @XMM[0], @XMM[8]
+ mov r1, sp
+ vst1.8 {@XMM[0]}, [sp,:128]
+ mov r2, $key
+ mov r4, $fp @ preserve fp
+
+ bl AES_encrypt
+
+ vld1.8 {@XMM[0]}, [sp,:128]
+ veor @XMM[0], @XMM[0], @XMM[8]
+ vst1.8 {@XMM[0]}, [$out]!
+ mov $fp, r4
+
+ vmov @XMM[8], @XMM[9] @ next round tweak
+
+.Lxts_enc_done:
+#ifndef XTS_CHAIN_TWEAK
+ adds $len, #0x10
+ beq .Lxts_enc_ret
+ sub r6, $out, #0x10
+
+.Lxts_enc_steal:
+ ldrb r0, [$inp], #1
+ ldrb r1, [$out, #-0x10]
+ strb r0, [$out, #-0x10]
+ strb r1, [$out], #1
+
+ subs $len, #1
+ bhi .Lxts_enc_steal
+
+ vld1.8 {@XMM[0]}, [r6]
+ mov r0, sp
+ veor @XMM[0], @XMM[0], @XMM[8]
+ mov r1, sp
+ vst1.8 {@XMM[0]}, [sp,:128]
+ mov r2, $key
+ mov r4, $fp @ preserve fp
+
+ bl AES_encrypt
+
+ vld1.8 {@XMM[0]}, [sp,:128]
+ veor @XMM[0], @XMM[0], @XMM[8]
+ vst1.8 {@XMM[0]}, [r6]
+ mov $fp, r4
+#endif
+
+.Lxts_enc_ret:
+ bic r0, $fp, #0xf
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+#ifdef XTS_CHAIN_TWEAK
+ ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
+#endif
+.Lxts_enc_bzero: @ wipe key schedule [if any]
+ vstmia sp!, {q0-q1}
+ cmp sp, r0
+ bne .Lxts_enc_bzero
+
+ mov sp, $fp
+#ifdef XTS_CHAIN_TWEAK
+ vst1.8 {@XMM[8]}, [r1]
+#endif
+ VFP_ABI_POP
+ ldmia sp!, {r4-r10, pc} @ return
+
+.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
+
+.globl bsaes_xts_decrypt
+.type bsaes_xts_decrypt,%function
+.align 4
+bsaes_xts_decrypt:
+ mov ip, sp
+ stmdb sp!, {r4-r10, lr} @ 0x20
+ VFP_ABI_PUSH
+ mov r6, sp @ future $fp
+
+ mov $inp, r0
+ mov $out, r1
+ mov $len, r2
+ mov $key, r3
+
+ sub r0, sp, #0x10 @ 0x10
+ bic r0, #0xf @ align at 16 bytes
+ mov sp, r0
+
+#ifdef XTS_CHAIN_TWEAK
+ ldr r0, [ip] @ pointer to input tweak
+#else
+ @ generate initial tweak
+ ldr r0, [ip, #4] @ iv[]
+ mov r1, sp
+ ldr r2, [ip, #0] @ key2
+ bl AES_encrypt
+ mov r0, sp @ pointer to initial tweak
+#endif
+
+ ldr $rounds, [$key, #240] @ get # of rounds
+ mov $fp, r6
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
+ @ add r12, #`128-32` @ size of bit-sliced key schedule
+ sub r12, #`32+16` @ place for tweak[9]
+
+ @ populate the key schedule
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ mov sp, r12
+ add r12, #0x90 @ pass key schedule
+ bl _bsaes_key_convert
+ add r4, sp, #0x90
+ vldmia r4, {@XMM[6]}
+ vstmia r12, {@XMM[15]} @ save last round key
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
+ vstmia r4, {@XMM[7]}
+#else
+ ldr r12, [$key, #244]
+ eors r12, #1
+ beq 0f
+
+ str r12, [$key, #244]
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ add r12, $key, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ add r4, $key, #248
+ vldmia r4, {@XMM[6]}
+ vstmia r12, {@XMM[15]} @ save last round key
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
+ vstmia r4, {@XMM[7]}
+
+.align 2
+0: sub sp, #0x90 @ place for tweak[9]
+#endif
+ vld1.8 {@XMM[8]}, [r0] @ initial tweak
+ adr $magic, .Lxts_magic
+
+ tst $len, #0xf @ if not multiple of 16
+ it ne @ Thumb2 thing, sanity check in ARM
+ subne $len, #0x10 @ subtract another 16 bytes
+ subs $len, #0x80
+
+ blo .Lxts_dec_short
+ b .Lxts_dec_loop
+
+.align 4
+.Lxts_dec_loop:
+ vldmia $magic, {$twmask} @ load XTS magic
+ vshr.s64 @T[0], @XMM[8], #63
+ mov r0, sp
+ vand @T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+ vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
+ vst1.64 {@XMM[$i-1]}, [r0,:128]!
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+ vshr.s64 @T[1], @XMM[$i], #63
+ veor @XMM[$i], @XMM[$i], @T[0]
+ vand @T[1], @T[1], $twmask
+___
+ @T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+ vld1.8 {@XMM[$i-10]}, [$inp]!
+___
+$code.=<<___ if ($i>=11);
+ veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+ vadd.u64 @XMM[8], @XMM[15], @XMM[15]
+ vst1.64 {@XMM[15]}, [r0,:128]!
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+ veor @XMM[8], @XMM[8], @T[0]
+ vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
+ veor @XMM[5], @XMM[5], @XMM[13]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[6], @XMM[6], @XMM[14]
+ mov r5, $rounds @ pass rounds
+ veor @XMM[7], @XMM[7], @XMM[15]
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[6], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[4], @XMM[11]
+ vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]!
+ veor @XMM[10], @XMM[2], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ veor @XMM[11], @XMM[7], @XMM[13]
+ veor @XMM[12], @XMM[3], @XMM[14]
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
+ veor @XMM[13], @XMM[5], @XMM[15]
+ vst1.8 {@XMM[12]-@XMM[13]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+
+ subs $len, #0x80
+ bpl .Lxts_dec_loop
+
+.Lxts_dec_short:
+ adds $len, #0x70
+ bmi .Lxts_dec_done
+
+ vldmia $magic, {$twmask} @ load XTS magic
+ vshr.s64 @T[0], @XMM[8], #63
+ mov r0, sp
+ vand @T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+ vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
+ vst1.64 {@XMM[$i-1]}, [r0,:128]!
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+ vshr.s64 @T[1], @XMM[$i], #63
+ veor @XMM[$i], @XMM[$i], @T[0]
+ vand @T[1], @T[1], $twmask
+___
+ @T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+ vld1.8 {@XMM[$i-10]}, [$inp]!
+ subs $len, #0x10
+ bmi .Lxts_dec_`$i-9`
+___
+$code.=<<___ if ($i>=11);
+ veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+ sub $len, #0x10
+ vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak
+
+ vld1.8 {@XMM[6]}, [$inp]!
+ veor @XMM[5], @XMM[5], @XMM[13]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[6], @XMM[6], @XMM[14]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[6], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[4], @XMM[11]
+ vld1.64 {@XMM[14]}, [r0,:128]!
+ veor @XMM[10], @XMM[2], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ veor @XMM[11], @XMM[7], @XMM[13]
+ veor @XMM[12], @XMM[3], @XMM[14]
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
+ vst1.8 {@XMM[12]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_6:
+ vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak
+
+ veor @XMM[4], @XMM[4], @XMM[12]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[5], @XMM[5], @XMM[13]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[6], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[4], @XMM[11]
+ veor @XMM[10], @XMM[2], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ veor @XMM[11], @XMM[7], @XMM[13]
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_5:
+ vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak
+
+ veor @XMM[3], @XMM[3], @XMM[11]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[4], @XMM[4], @XMM[12]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[6], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[4], @XMM[11]
+ veor @XMM[10], @XMM[2], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ vst1.8 {@XMM[10]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_4:
+ vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak
+
+ veor @XMM[2], @XMM[2], @XMM[10]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[3], @XMM[3], @XMM[11]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[6], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[4], @XMM[11]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_3:
+ vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak
+
+ veor @XMM[1], @XMM[1], @XMM[9]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[2], @XMM[2], @XMM[10]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
+ vld1.64 {@XMM[10]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[6], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ vst1.8 {@XMM[8]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_2:
+ vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak
+
+ veor @XMM[0], @XMM[0], @XMM[8]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[1], @XMM[1], @XMM[9]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_1:
+ mov r0, sp
+ veor @XMM[0], @XMM[8]
+ mov r1, sp
+ vst1.8 {@XMM[0]}, [sp,:128]
+ mov r2, $key
+ mov r4, $fp @ preserve fp
+ mov r5, $magic @ preserve magic
+
+ bl AES_decrypt
+
+ vld1.8 {@XMM[0]}, [sp,:128]
+ veor @XMM[0], @XMM[0], @XMM[8]
+ vst1.8 {@XMM[0]}, [$out]!
+ mov $fp, r4
+ mov $magic, r5
+
+ vmov @XMM[8], @XMM[9] @ next round tweak
+
+.Lxts_dec_done:
+#ifndef XTS_CHAIN_TWEAK
+ adds $len, #0x10
+ beq .Lxts_dec_ret
+
+ @ calculate one round of extra tweak for the stolen ciphertext
+ vldmia $magic, {$twmask}
+ vshr.s64 @XMM[6], @XMM[8], #63
+ vand @XMM[6], @XMM[6], $twmask
+ vadd.u64 @XMM[9], @XMM[8], @XMM[8]
+ vswp `&Dhi("@XMM[6]")`,`&Dlo("@XMM[6]")`
+ veor @XMM[9], @XMM[9], @XMM[6]
+
+ @ perform the final decryption with the last tweak value
+ vld1.8 {@XMM[0]}, [$inp]!
+ mov r0, sp
+ veor @XMM[0], @XMM[0], @XMM[9]
+ mov r1, sp
+ vst1.8 {@XMM[0]}, [sp,:128]
+ mov r2, $key
+ mov r4, $fp @ preserve fp
+
+ bl AES_decrypt
+
+ vld1.8 {@XMM[0]}, [sp,:128]
+ veor @XMM[0], @XMM[0], @XMM[9]
+ vst1.8 {@XMM[0]}, [$out]
+
+ mov r6, $out
+.Lxts_dec_steal:
+ ldrb r1, [$out]
+ ldrb r0, [$inp], #1
+ strb r1, [$out, #0x10]
+ strb r0, [$out], #1
+
+ subs $len, #1
+ bhi .Lxts_dec_steal
+
+ vld1.8 {@XMM[0]}, [r6]
+ mov r0, sp
+ veor @XMM[0], @XMM[8]
+ mov r1, sp
+ vst1.8 {@XMM[0]}, [sp,:128]
+ mov r2, $key
+
+ bl AES_decrypt
+
+ vld1.8 {@XMM[0]}, [sp,:128]
+ veor @XMM[0], @XMM[0], @XMM[8]
+ vst1.8 {@XMM[0]}, [r6]
+ mov $fp, r4
+#endif
+
+.Lxts_dec_ret:
+ bic r0, $fp, #0xf
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+#ifdef XTS_CHAIN_TWEAK
+ ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
+#endif
+.Lxts_dec_bzero: @ wipe key schedule [if any]
+ vstmia sp!, {q0-q1}
+ cmp sp, r0
+ bne .Lxts_dec_bzero
+
+ mov sp, $fp
+#ifdef XTS_CHAIN_TWEAK
+ vst1.8 {@XMM[8]}, [r1]
+#endif
+ VFP_ABI_POP
+ ldmia sp!, {r4-r10, pc} @ return
+
+.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
+___
+}
+$code.=<<___;
+#endif
+___
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/@/ and !/^$/);
+ print;
+}
+close SELF;
+
+print $code;
+
+close STDOUT;
diff --git a/arch/arm/crypto/sha1-armv7-neon.S b/arch/arm/crypto/sha1-armv7-neon.S
new file mode 100644
index 000000000000..50013c0e2864
--- /dev/null
+++ b/arch/arm/crypto/sha1-armv7-neon.S
@@ -0,0 +1,634 @@
+/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function
+ *
+ * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <linux/linkage.h>
+
+
+.syntax unified
+.code 32
+.fpu neon
+
+.text
+
+
+/* Context structure */
+
+#define state_h0 0
+#define state_h1 4
+#define state_h2 8
+#define state_h3 12
+#define state_h4 16
+
+
+/* Constants */
+
+#define K1 0x5A827999
+#define K2 0x6ED9EBA1
+#define K3 0x8F1BBCDC
+#define K4 0xCA62C1D6
+.align 4
+.LK_VEC:
+.LK1: .long K1, K1, K1, K1
+.LK2: .long K2, K2, K2, K2
+.LK3: .long K3, K3, K3, K3
+.LK4: .long K4, K4, K4, K4
+
+
+/* Register macros */
+
+#define RSTATE r0
+#define RDATA r1
+#define RNBLKS r2
+#define ROLDSTACK r3
+#define RWK lr
+
+#define _a r4
+#define _b r5
+#define _c r6
+#define _d r7
+#define _e r8
+
+#define RT0 r9
+#define RT1 r10
+#define RT2 r11
+#define RT3 r12
+
+#define W0 q0
+#define W1 q1
+#define W2 q2
+#define W3 q3
+#define W4 q4
+#define W5 q5
+#define W6 q6
+#define W7 q7
+
+#define tmp0 q8
+#define tmp1 q9
+#define tmp2 q10
+#define tmp3 q11
+
+#define qK1 q12
+#define qK2 q13
+#define qK3 q14
+#define qK4 q15
+
+
+/* Round function macros. */
+
+#define WK_offs(i) (((i) & 15) * 4)
+
+#define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+ W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ ldr RT3, [sp, WK_offs(i)]; \
+ pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ bic RT0, d, b; \
+ add e, e, a, ror #(32 - 5); \
+ and RT1, c, b; \
+ pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ add RT0, RT0, RT3; \
+ add e, e, RT1; \
+ ror b, #(32 - 30); \
+ pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ add e, e, RT0;
+
+#define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+ W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ ldr RT3, [sp, WK_offs(i)]; \
+ pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ eor RT0, d, b; \
+ add e, e, a, ror #(32 - 5); \
+ eor RT0, RT0, c; \
+ pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ add e, e, RT3; \
+ ror b, #(32 - 30); \
+ pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ add e, e, RT0; \
+
+#define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+ W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ ldr RT3, [sp, WK_offs(i)]; \
+ pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ eor RT0, b, c; \
+ and RT1, b, c; \
+ add e, e, a, ror #(32 - 5); \
+ pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ and RT0, RT0, d; \
+ add RT1, RT1, RT3; \
+ add e, e, RT0; \
+ ror b, #(32 - 30); \
+ pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ add e, e, RT1;
+
+#define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+ W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+ W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
+
+#define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,\
+ W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ _R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+ W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
+
+#define R(a,b,c,d,e,f,i) \
+ _R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,\
+ W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
+
+#define dummy(...)
+
+
+/* Input expansion macros. */
+
+/********* Precalc macros for rounds 0-15 *************************************/
+
+#define W_PRECALC_00_15() \
+ add RWK, sp, #(WK_offs(0)); \
+ \
+ vld1.32 {tmp0, tmp1}, [RDATA]!; \
+ vrev32.8 W0, tmp0; /* big => little */ \
+ vld1.32 {tmp2, tmp3}, [RDATA]!; \
+ vadd.u32 tmp0, W0, curK; \
+ vrev32.8 W7, tmp1; /* big => little */ \
+ vrev32.8 W6, tmp2; /* big => little */ \
+ vadd.u32 tmp1, W7, curK; \
+ vrev32.8 W5, tmp3; /* big => little */ \
+ vadd.u32 tmp2, W6, curK; \
+ vst1.32 {tmp0, tmp1}, [RWK]!; \
+ vadd.u32 tmp3, W5, curK; \
+ vst1.32 {tmp2, tmp3}, [RWK]; \
+
+#define WPRECALC_00_15_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vld1.32 {tmp0, tmp1}, [RDATA]!; \
+
+#define WPRECALC_00_15_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ add RWK, sp, #(WK_offs(0)); \
+
+#define WPRECALC_00_15_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vrev32.8 W0, tmp0; /* big => little */ \
+
+#define WPRECALC_00_15_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vld1.32 {tmp2, tmp3}, [RDATA]!; \
+
+#define WPRECALC_00_15_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vadd.u32 tmp0, W0, curK; \
+
+#define WPRECALC_00_15_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vrev32.8 W7, tmp1; /* big => little */ \
+
+#define WPRECALC_00_15_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vrev32.8 W6, tmp2; /* big => little */ \
+
+#define WPRECALC_00_15_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vadd.u32 tmp1, W7, curK; \
+
+#define WPRECALC_00_15_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vrev32.8 W5, tmp3; /* big => little */ \
+
+#define WPRECALC_00_15_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vadd.u32 tmp2, W6, curK; \
+
+#define WPRECALC_00_15_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vst1.32 {tmp0, tmp1}, [RWK]!; \
+
+#define WPRECALC_00_15_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vadd.u32 tmp3, W5, curK; \
+
+#define WPRECALC_00_15_12(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vst1.32 {tmp2, tmp3}, [RWK]; \
+
+
+/********* Precalc macros for rounds 16-31 ************************************/
+
+#define WPRECALC_16_31_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ veor tmp0, tmp0; \
+ vext.8 W, W_m16, W_m12, #8; \
+
+#define WPRECALC_16_31_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ add RWK, sp, #(WK_offs(i)); \
+ vext.8 tmp0, W_m04, tmp0, #4; \
+
+#define WPRECALC_16_31_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ veor tmp0, tmp0, W_m16; \
+ veor.32 W, W, W_m08; \
+
+#define WPRECALC_16_31_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ veor tmp1, tmp1; \
+ veor W, W, tmp0; \
+
+#define WPRECALC_16_31_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vshl.u32 tmp0, W, #1; \
+
+#define WPRECALC_16_31_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vext.8 tmp1, tmp1, W, #(16-12); \
+ vshr.u32 W, W, #31; \
+
+#define WPRECALC_16_31_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vorr tmp0, tmp0, W; \
+ vshr.u32 W, tmp1, #30; \
+
+#define WPRECALC_16_31_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vshl.u32 tmp1, tmp1, #2; \
+
+#define WPRECALC_16_31_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ veor tmp0, tmp0, W; \
+
+#define WPRECALC_16_31_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ veor W, tmp0, tmp1; \
+
+#define WPRECALC_16_31_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vadd.u32 tmp0, W, curK; \
+
+#define WPRECALC_16_31_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vst1.32 {tmp0}, [RWK];
+
+
+/********* Precalc macros for rounds 32-79 ************************************/
+
+#define WPRECALC_32_79_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ veor W, W_m28; \
+
+#define WPRECALC_32_79_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vext.8 tmp0, W_m08, W_m04, #8; \
+
+#define WPRECALC_32_79_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ veor W, W_m16; \
+
+#define WPRECALC_32_79_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ veor W, tmp0; \
+
+#define WPRECALC_32_79_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ add RWK, sp, #(WK_offs(i&~3)); \
+
+#define WPRECALC_32_79_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vshl.u32 tmp1, W, #2; \
+
+#define WPRECALC_32_79_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vshr.u32 tmp0, W, #30; \
+
+#define WPRECALC_32_79_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vorr W, tmp0, tmp1; \
+
+#define WPRECALC_32_79_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vadd.u32 tmp0, W, curK; \
+
+#define WPRECALC_32_79_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vst1.32 {tmp0}, [RWK];
+
+
+/*
+ * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
+ *
+ * unsigned int
+ * sha1_transform_neon (void *ctx, const unsigned char *data,
+ * unsigned int nblks)
+ */
+.align 3
+ENTRY(sha1_transform_neon)
+ /* input:
+ * r0: ctx, CTX
+ * r1: data (64*nblks bytes)
+ * r2: nblks
+ */
+
+ cmp RNBLKS, #0;
+ beq .Ldo_nothing;
+
+ push {r4-r12, lr};
+ /*vpush {q4-q7};*/
+
+ adr RT3, .LK_VEC;
+
+ mov ROLDSTACK, sp;
+
+ /* Align stack. */
+ sub RT0, sp, #(16*4);
+ and RT0, #(~(16-1));
+ mov sp, RT0;
+
+ vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */
+
+ /* Get the values of the chaining variables. */
+ ldm RSTATE, {_a-_e};
+
+ vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */
+
+#undef curK
+#define curK qK1
+ /* Precalc 0-15. */
+ W_PRECALC_00_15();
+
+.Loop:
+ /* Transform 0-15 + Precalc 16-31. */
+ _R( _a, _b, _c, _d, _e, F1, 0,
+ WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16,
+ W4, W5, W6, W7, W0, _, _, _ );
+ _R( _e, _a, _b, _c, _d, F1, 1,
+ WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16,
+ W4, W5, W6, W7, W0, _, _, _ );
+ _R( _d, _e, _a, _b, _c, F1, 2,
+ WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16,
+ W4, W5, W6, W7, W0, _, _, _ );
+ _R( _c, _d, _e, _a, _b, F1, 3,
+ WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,16,
+ W4, W5, W6, W7, W0, _, _, _ );
+
+#undef curK
+#define curK qK2
+ _R( _b, _c, _d, _e, _a, F1, 4,
+ WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20,
+ W3, W4, W5, W6, W7, _, _, _ );
+ _R( _a, _b, _c, _d, _e, F1, 5,
+ WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20,
+ W3, W4, W5, W6, W7, _, _, _ );
+ _R( _e, _a, _b, _c, _d, F1, 6,
+ WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20,
+ W3, W4, W5, W6, W7, _, _, _ );
+ _R( _d, _e, _a, _b, _c, F1, 7,
+ WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,20,
+ W3, W4, W5, W6, W7, _, _, _ );
+
+ _R( _c, _d, _e, _a, _b, F1, 8,
+ WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24,
+ W2, W3, W4, W5, W6, _, _, _ );
+ _R( _b, _c, _d, _e, _a, F1, 9,
+ WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24,
+ W2, W3, W4, W5, W6, _, _, _ );
+ _R( _a, _b, _c, _d, _e, F1, 10,
+ WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24,
+ W2, W3, W4, W5, W6, _, _, _ );
+ _R( _e, _a, _b, _c, _d, F1, 11,
+ WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,24,
+ W2, W3, W4, W5, W6, _, _, _ );
+
+ _R( _d, _e, _a, _b, _c, F1, 12,
+ WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28,
+ W1, W2, W3, W4, W5, _, _, _ );
+ _R( _c, _d, _e, _a, _b, F1, 13,
+ WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28,
+ W1, W2, W3, W4, W5, _, _, _ );
+ _R( _b, _c, _d, _e, _a, F1, 14,
+ WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28,
+ W1, W2, W3, W4, W5, _, _, _ );
+ _R( _a, _b, _c, _d, _e, F1, 15,
+ WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,28,
+ W1, W2, W3, W4, W5, _, _, _ );
+
+ /* Transform 16-63 + Precalc 32-79. */
+ _R( _e, _a, _b, _c, _d, F1, 16,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32,
+ W0, W1, W2, W3, W4, W5, W6, W7);
+ _R( _d, _e, _a, _b, _c, F1, 17,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32,
+ W0, W1, W2, W3, W4, W5, W6, W7);
+ _R( _c, _d, _e, _a, _b, F1, 18,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 32,
+ W0, W1, W2, W3, W4, W5, W6, W7);
+ _R( _b, _c, _d, _e, _a, F1, 19,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 32,
+ W0, W1, W2, W3, W4, W5, W6, W7);
+
+ _R( _a, _b, _c, _d, _e, F2, 20,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36,
+ W7, W0, W1, W2, W3, W4, W5, W6);
+ _R( _e, _a, _b, _c, _d, F2, 21,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36,
+ W7, W0, W1, W2, W3, W4, W5, W6);
+ _R( _d, _e, _a, _b, _c, F2, 22,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 36,
+ W7, W0, W1, W2, W3, W4, W5, W6);
+ _R( _c, _d, _e, _a, _b, F2, 23,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 36,
+ W7, W0, W1, W2, W3, W4, W5, W6);
+
+#undef curK
+#define curK qK3
+ _R( _b, _c, _d, _e, _a, F2, 24,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40,
+ W6, W7, W0, W1, W2, W3, W4, W5);
+ _R( _a, _b, _c, _d, _e, F2, 25,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40,
+ W6, W7, W0, W1, W2, W3, W4, W5);
+ _R( _e, _a, _b, _c, _d, F2, 26,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 40,
+ W6, W7, W0, W1, W2, W3, W4, W5);
+ _R( _d, _e, _a, _b, _c, F2, 27,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 40,
+ W6, W7, W0, W1, W2, W3, W4, W5);
+
+ _R( _c, _d, _e, _a, _b, F2, 28,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44,
+ W5, W6, W7, W0, W1, W2, W3, W4);
+ _R( _b, _c, _d, _e, _a, F2, 29,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44,
+ W5, W6, W7, W0, W1, W2, W3, W4);
+ _R( _a, _b, _c, _d, _e, F2, 30,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 44,
+ W5, W6, W7, W0, W1, W2, W3, W4);
+ _R( _e, _a, _b, _c, _d, F2, 31,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 44,
+ W5, W6, W7, W0, W1, W2, W3, W4);
+
+ _R( _d, _e, _a, _b, _c, F2, 32,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48,
+ W4, W5, W6, W7, W0, W1, W2, W3);
+ _R( _c, _d, _e, _a, _b, F2, 33,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48,
+ W4, W5, W6, W7, W0, W1, W2, W3);
+ _R( _b, _c, _d, _e, _a, F2, 34,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 48,
+ W4, W5, W6, W7, W0, W1, W2, W3);
+ _R( _a, _b, _c, _d, _e, F2, 35,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 48,
+ W4, W5, W6, W7, W0, W1, W2, W3);
+
+ _R( _e, _a, _b, _c, _d, F2, 36,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52,
+ W3, W4, W5, W6, W7, W0, W1, W2);
+ _R( _d, _e, _a, _b, _c, F2, 37,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52,
+ W3, W4, W5, W6, W7, W0, W1, W2);
+ _R( _c, _d, _e, _a, _b, F2, 38,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 52,
+ W3, W4, W5, W6, W7, W0, W1, W2);
+ _R( _b, _c, _d, _e, _a, F2, 39,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 52,
+ W3, W4, W5, W6, W7, W0, W1, W2);
+
+ _R( _a, _b, _c, _d, _e, F3, 40,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56,
+ W2, W3, W4, W5, W6, W7, W0, W1);
+ _R( _e, _a, _b, _c, _d, F3, 41,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56,
+ W2, W3, W4, W5, W6, W7, W0, W1);
+ _R( _d, _e, _a, _b, _c, F3, 42,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 56,
+ W2, W3, W4, W5, W6, W7, W0, W1);
+ _R( _c, _d, _e, _a, _b, F3, 43,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 56,
+ W2, W3, W4, W5, W6, W7, W0, W1);
+
+#undef curK
+#define curK qK4
+ _R( _b, _c, _d, _e, _a, F3, 44,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60,
+ W1, W2, W3, W4, W5, W6, W7, W0);
+ _R( _a, _b, _c, _d, _e, F3, 45,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60,
+ W1, W2, W3, W4, W5, W6, W7, W0);
+ _R( _e, _a, _b, _c, _d, F3, 46,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 60,
+ W1, W2, W3, W4, W5, W6, W7, W0);
+ _R( _d, _e, _a, _b, _c, F3, 47,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 60,
+ W1, W2, W3, W4, W5, W6, W7, W0);
+
+ _R( _c, _d, _e, _a, _b, F3, 48,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64,
+ W0, W1, W2, W3, W4, W5, W6, W7);
+ _R( _b, _c, _d, _e, _a, F3, 49,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64,
+ W0, W1, W2, W3, W4, W5, W6, W7);
+ _R( _a, _b, _c, _d, _e, F3, 50,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 64,
+ W0, W1, W2, W3, W4, W5, W6, W7);
+ _R( _e, _a, _b, _c, _d, F3, 51,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 64,
+ W0, W1, W2, W3, W4, W5, W6, W7);
+
+ _R( _d, _e, _a, _b, _c, F3, 52,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68,
+ W7, W0, W1, W2, W3, W4, W5, W6);
+ _R( _c, _d, _e, _a, _b, F3, 53,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68,
+ W7, W0, W1, W2, W3, W4, W5, W6);
+ _R( _b, _c, _d, _e, _a, F3, 54,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 68,
+ W7, W0, W1, W2, W3, W4, W5, W6);
+ _R( _a, _b, _c, _d, _e, F3, 55,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 68,
+ W7, W0, W1, W2, W3, W4, W5, W6);
+
+ _R( _e, _a, _b, _c, _d, F3, 56,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72,
+ W6, W7, W0, W1, W2, W3, W4, W5);
+ _R( _d, _e, _a, _b, _c, F3, 57,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72,
+ W6, W7, W0, W1, W2, W3, W4, W5);
+ _R( _c, _d, _e, _a, _b, F3, 58,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 72,
+ W6, W7, W0, W1, W2, W3, W4, W5);
+ _R( _b, _c, _d, _e, _a, F3, 59,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 72,
+ W6, W7, W0, W1, W2, W3, W4, W5);
+
+ subs RNBLKS, #1;
+
+ _R( _a, _b, _c, _d, _e, F4, 60,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76,
+ W5, W6, W7, W0, W1, W2, W3, W4);
+ _R( _e, _a, _b, _c, _d, F4, 61,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76,
+ W5, W6, W7, W0, W1, W2, W3, W4);
+ _R( _d, _e, _a, _b, _c, F4, 62,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 76,
+ W5, W6, W7, W0, W1, W2, W3, W4);
+ _R( _c, _d, _e, _a, _b, F4, 63,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 76,
+ W5, W6, W7, W0, W1, W2, W3, W4);
+
+ beq .Lend;
+
+ /* Transform 64-79 + Precalc 0-15 of next block. */
+#undef curK
+#define curK qK1
+ _R( _b, _c, _d, _e, _a, F4, 64,
+ WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _a, _b, _c, _d, _e, F4, 65,
+ WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _e, _a, _b, _c, _d, F4, 66,
+ WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _d, _e, _a, _b, _c, F4, 67,
+ WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+
+ _R( _c, _d, _e, _a, _b, F4, 68,
+ dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _b, _c, _d, _e, _a, F4, 69,
+ dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _a, _b, _c, _d, _e, F4, 70,
+ WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _e, _a, _b, _c, _d, F4, 71,
+ WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+
+ _R( _d, _e, _a, _b, _c, F4, 72,
+ dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _c, _d, _e, _a, _b, F4, 73,
+ dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _b, _c, _d, _e, _a, F4, 74,
+ WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _a, _b, _c, _d, _e, F4, 75,
+ WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+
+ _R( _e, _a, _b, _c, _d, F4, 76,
+ WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _d, _e, _a, _b, _c, F4, 77,
+ WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _c, _d, _e, _a, _b, F4, 78,
+ WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _b, _c, _d, _e, _a, F4, 79,
+ WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ );
+
+ /* Update the chaining variables. */
+ ldm RSTATE, {RT0-RT3};
+ add _a, RT0;
+ ldr RT0, [RSTATE, #state_h4];
+ add _b, RT1;
+ add _c, RT2;
+ add _d, RT3;
+ add _e, RT0;
+ stm RSTATE, {_a-_e};
+
+ b .Loop;
+
+.Lend:
+ /* Transform 64-79 */
+ R( _b, _c, _d, _e, _a, F4, 64 );
+ R( _a, _b, _c, _d, _e, F4, 65 );
+ R( _e, _a, _b, _c, _d, F4, 66 );
+ R( _d, _e, _a, _b, _c, F4, 67 );
+ R( _c, _d, _e, _a, _b, F4, 68 );
+ R( _b, _c, _d, _e, _a, F4, 69 );
+ R( _a, _b, _c, _d, _e, F4, 70 );
+ R( _e, _a, _b, _c, _d, F4, 71 );
+ R( _d, _e, _a, _b, _c, F4, 72 );
+ R( _c, _d, _e, _a, _b, F4, 73 );
+ R( _b, _c, _d, _e, _a, F4, 74 );
+ R( _a, _b, _c, _d, _e, F4, 75 );
+ R( _e, _a, _b, _c, _d, F4, 76 );
+ R( _d, _e, _a, _b, _c, F4, 77 );
+ R( _c, _d, _e, _a, _b, F4, 78 );
+ R( _b, _c, _d, _e, _a, F4, 79 );
+
+ mov sp, ROLDSTACK;
+
+ /* Update the chaining variables. */
+ ldm RSTATE, {RT0-RT3};
+ add _a, RT0;
+ ldr RT0, [RSTATE, #state_h4];
+ add _b, RT1;
+ add _c, RT2;
+ add _d, RT3;
+ /*vpop {q4-q7};*/
+ add _e, RT0;
+ stm RSTATE, {_a-_e};
+
+ pop {r4-r12, pc};
+
+.Ldo_nothing:
+ bx lr
+ENDPROC(sha1_transform_neon)
diff --git a/arch/arm/crypto/sha1_glue.c b/arch/arm/crypto/sha1_glue.c
index ace4cd67464c..e31b0440c613 100644
--- a/arch/arm/crypto/sha1_glue.c
+++ b/arch/arm/crypto/sha1_glue.c
@@ -23,32 +23,27 @@
#include <linux/types.h>
#include <crypto/sha.h>
#include <asm/byteorder.h>
+#include <asm/crypto/sha1.h>
-struct SHA1_CTX {
- uint32_t h0,h1,h2,h3,h4;
- u64 count;
- u8 data[SHA1_BLOCK_SIZE];
-};
-asmlinkage void sha1_block_data_order(struct SHA1_CTX *digest,
+asmlinkage void sha1_block_data_order(u32 *digest,
const unsigned char *data, unsigned int rounds);
static int sha1_init(struct shash_desc *desc)
{
- struct SHA1_CTX *sctx = shash_desc_ctx(desc);
- memset(sctx, 0, sizeof(*sctx));
- sctx->h0 = SHA1_H0;
- sctx->h1 = SHA1_H1;
- sctx->h2 = SHA1_H2;
- sctx->h3 = SHA1_H3;
- sctx->h4 = SHA1_H4;
+ struct sha1_state *sctx = shash_desc_ctx(desc);
+
+ *sctx = (struct sha1_state){
+ .state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
+ };
+
return 0;
}
-static int __sha1_update(struct SHA1_CTX *sctx, const u8 *data,
- unsigned int len, unsigned int partial)
+static int __sha1_update(struct sha1_state *sctx, const u8 *data,
+ unsigned int len, unsigned int partial)
{
unsigned int done = 0;
@@ -56,43 +51,44 @@ static int __sha1_update(struct SHA1_CTX *sctx, const u8 *data,
if (partial) {
done = SHA1_BLOCK_SIZE - partial;
- memcpy(sctx->data + partial, data, done);
- sha1_block_data_order(sctx, sctx->data, 1);
+ memcpy(sctx->buffer + partial, data, done);
+ sha1_block_data_order(sctx->state, sctx->buffer, 1);
}
if (len - done >= SHA1_BLOCK_SIZE) {
const unsigned int rounds = (len - done) / SHA1_BLOCK_SIZE;
- sha1_block_data_order(sctx, data + done, rounds);
+ sha1_block_data_order(sctx->state, data + done, rounds);
done += rounds * SHA1_BLOCK_SIZE;
}
- memcpy(sctx->data, data + done, len - done);
+ memcpy(sctx->buffer, data + done, len - done);
return 0;
}
-static int sha1_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
+int sha1_update_arm(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
{
- struct SHA1_CTX *sctx = shash_desc_ctx(desc);
+ struct sha1_state *sctx = shash_desc_ctx(desc);
unsigned int partial = sctx->count % SHA1_BLOCK_SIZE;
int res;
/* Handle the fast case right here */
if (partial + len < SHA1_BLOCK_SIZE) {
sctx->count += len;
- memcpy(sctx->data + partial, data, len);
+ memcpy(sctx->buffer + partial, data, len);
return 0;
}
res = __sha1_update(sctx, data, len, partial);
return res;
}
+EXPORT_SYMBOL_GPL(sha1_update_arm);
/* Add padding and return the message digest. */
static int sha1_final(struct shash_desc *desc, u8 *out)
{
- struct SHA1_CTX *sctx = shash_desc_ctx(desc);
+ struct sha1_state *sctx = shash_desc_ctx(desc);
unsigned int i, index, padlen;
__be32 *dst = (__be32 *)out;
__be64 bits;
@@ -106,7 +102,7 @@ static int sha1_final(struct shash_desc *desc, u8 *out)
/* We need to fill a whole block for __sha1_update() */
if (padlen <= 56) {
sctx->count += padlen;
- memcpy(sctx->data + index, padding, padlen);
+ memcpy(sctx->buffer + index, padding, padlen);
} else {
__sha1_update(sctx, padding, padlen, index);
}
@@ -114,7 +110,7 @@ static int sha1_final(struct shash_desc *desc, u8 *out)
/* Store state in digest */
for (i = 0; i < 5; i++)
- dst[i] = cpu_to_be32(((u32 *)sctx)[i]);
+ dst[i] = cpu_to_be32(sctx->state[i]);
/* Wipe context */
memset(sctx, 0, sizeof(*sctx));
@@ -124,7 +120,7 @@ static int sha1_final(struct shash_desc *desc, u8 *out)
static int sha1_export(struct shash_desc *desc, void *out)
{
- struct SHA1_CTX *sctx = shash_desc_ctx(desc);
+ struct sha1_state *sctx = shash_desc_ctx(desc);
memcpy(out, sctx, sizeof(*sctx));
return 0;
}
@@ -132,7 +128,7 @@ static int sha1_export(struct shash_desc *desc, void *out)
static int sha1_import(struct shash_desc *desc, const void *in)
{
- struct SHA1_CTX *sctx = shash_desc_ctx(desc);
+ struct sha1_state *sctx = shash_desc_ctx(desc);
memcpy(sctx, in, sizeof(*sctx));
return 0;
}
@@ -141,12 +137,12 @@ static int sha1_import(struct shash_desc *desc, const void *in)
static struct shash_alg alg = {
.digestsize = SHA1_DIGEST_SIZE,
.init = sha1_init,
- .update = sha1_update,
+ .update = sha1_update_arm,
.final = sha1_final,
.export = sha1_export,
.import = sha1_import,
- .descsize = sizeof(struct SHA1_CTX),
- .statesize = sizeof(struct SHA1_CTX),
+ .descsize = sizeof(struct sha1_state),
+ .statesize = sizeof(struct sha1_state),
.base = {
.cra_name = "sha1",
.cra_driver_name= "sha1-asm",
diff --git a/arch/arm/crypto/sha1_neon_glue.c b/arch/arm/crypto/sha1_neon_glue.c
new file mode 100644
index 000000000000..6f1b411b1d55
--- /dev/null
+++ b/arch/arm/crypto/sha1_neon_glue.c
@@ -0,0 +1,197 @@
+/*
+ * Glue code for the SHA1 Secure Hash Algorithm assembler implementation using
+ * ARM NEON instructions.
+ *
+ * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is based on sha1_generic.c and sha1_ssse3_glue.c:
+ * Copyright (c) Alan Smithee.
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
+ * Copyright (c) Mathias Krause <minipli@googlemail.com>
+ * Copyright (c) Chandramouli Narayanan <mouli@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <asm/crypto/sha1.h>
+
+
+asmlinkage void sha1_transform_neon(void *state_h, const char *data,
+ unsigned int rounds);
+
+
+static int sha1_neon_init(struct shash_desc *desc)
+{
+ struct sha1_state *sctx = shash_desc_ctx(desc);
+
+ *sctx = (struct sha1_state){
+ .state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
+ };
+
+ return 0;
+}
+
+static int __sha1_neon_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len, unsigned int partial)
+{
+ struct sha1_state *sctx = shash_desc_ctx(desc);
+ unsigned int done = 0;
+
+ sctx->count += len;
+
+ if (partial) {
+ done = SHA1_BLOCK_SIZE - partial;
+ memcpy(sctx->buffer + partial, data, done);
+ sha1_transform_neon(sctx->state, sctx->buffer, 1);
+ }
+
+ if (len - done >= SHA1_BLOCK_SIZE) {
+ const unsigned int rounds = (len - done) / SHA1_BLOCK_SIZE;
+
+ sha1_transform_neon(sctx->state, data + done, rounds);
+ done += rounds * SHA1_BLOCK_SIZE;
+ }
+
+ memcpy(sctx->buffer, data + done, len - done);
+
+ return 0;
+}
+
+static int sha1_neon_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ struct sha1_state *sctx = shash_desc_ctx(desc);
+ unsigned int partial = sctx->count % SHA1_BLOCK_SIZE;
+ int res;
+
+ /* Handle the fast case right here */
+ if (partial + len < SHA1_BLOCK_SIZE) {
+ sctx->count += len;
+ memcpy(sctx->buffer + partial, data, len);
+
+ return 0;
+ }
+
+ if (!may_use_simd()) {
+ res = sha1_update_arm(desc, data, len);
+ } else {
+ kernel_neon_begin();
+ res = __sha1_neon_update(desc, data, len, partial);
+ kernel_neon_end();
+ }
+
+ return res;
+}
+
+
+/* Add padding and return the message digest. */
+static int sha1_neon_final(struct shash_desc *desc, u8 *out)
+{
+ struct sha1_state *sctx = shash_desc_ctx(desc);
+ unsigned int i, index, padlen;
+ __be32 *dst = (__be32 *)out;
+ __be64 bits;
+ static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, };
+
+ bits = cpu_to_be64(sctx->count << 3);
+
+ /* Pad out to 56 mod 64 and append length */
+ index = sctx->count % SHA1_BLOCK_SIZE;
+ padlen = (index < 56) ? (56 - index) : ((SHA1_BLOCK_SIZE+56) - index);
+ if (!may_use_simd()) {
+ sha1_update_arm(desc, padding, padlen);
+ sha1_update_arm(desc, (const u8 *)&bits, sizeof(bits));
+ } else {
+ kernel_neon_begin();
+ /* We need to fill a whole block for __sha1_neon_update() */
+ if (padlen <= 56) {
+ sctx->count += padlen;
+ memcpy(sctx->buffer + index, padding, padlen);
+ } else {
+ __sha1_neon_update(desc, padding, padlen, index);
+ }
+ __sha1_neon_update(desc, (const u8 *)&bits, sizeof(bits), 56);
+ kernel_neon_end();
+ }
+
+ /* Store state in digest */
+ for (i = 0; i < 5; i++)
+ dst[i] = cpu_to_be32(sctx->state[i]);
+
+ /* Wipe context */
+ memset(sctx, 0, sizeof(*sctx));
+
+ return 0;
+}
+
+static int sha1_neon_export(struct shash_desc *desc, void *out)
+{
+ struct sha1_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(out, sctx, sizeof(*sctx));
+
+ return 0;
+}
+
+static int sha1_neon_import(struct shash_desc *desc, const void *in)
+{
+ struct sha1_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(sctx, in, sizeof(*sctx));
+
+ return 0;
+}
+
+static struct shash_alg alg = {
+ .digestsize = SHA1_DIGEST_SIZE,
+ .init = sha1_neon_init,
+ .update = sha1_neon_update,
+ .final = sha1_neon_final,
+ .export = sha1_neon_export,
+ .import = sha1_neon_import,
+ .descsize = sizeof(struct sha1_state),
+ .statesize = sizeof(struct sha1_state),
+ .base = {
+ .cra_name = "sha1",
+ .cra_driver_name = "sha1-neon",
+ .cra_priority = 250,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA1_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+};
+
+static int __init sha1_neon_mod_init(void)
+{
+ if (!cpu_has_neon())
+ return -ENODEV;
+
+ return crypto_register_shash(&alg);
+}
+
+static void __exit sha1_neon_mod_fini(void)
+{
+ crypto_unregister_shash(&alg);
+}
+
+module_init(sha1_neon_mod_init);
+module_exit(sha1_neon_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, NEON accelerated");
+MODULE_ALIAS("sha1");
diff --git a/arch/arm/crypto/sha256-armv4.pl b/arch/arm/crypto/sha256-armv4.pl
new file mode 100644
index 000000000000..fac0533ea633
--- /dev/null
+++ b/arch/arm/crypto/sha256-armv4.pl
@@ -0,0 +1,716 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Permission to use under GPL terms is granted.
+# ====================================================================
+
+# SHA256 block procedure for ARMv4. May 2007.
+
+# Performance is ~2x better than gcc 3.4 generated code and in "abso-
+# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
+# byte [on single-issue Xscale PXA250 core].
+
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 22% improvement on
+# Cortex A8 core and ~20 cycles per processed byte.
+
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 16%
+# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+# September 2013.
+#
+# Add NEON implementation. On Cortex A8 it was measured to process one
+# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+# code (meaning that latter performs sub-optimally, nothing was done
+# about it).
+
+# May 2014.
+#
+# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$ctx="r0"; $t0="r0";
+$inp="r1"; $t4="r1";
+$len="r2"; $t1="r2";
+$T1="r3"; $t3="r3";
+$A="r4";
+$B="r5";
+$C="r6";
+$D="r7";
+$E="r8";
+$F="r9";
+$G="r10";
+$H="r11";
+@V=($A,$B,$C,$D,$E,$F,$G,$H);
+$t2="r12";
+$Ktbl="r14";
+
+@Sigma0=( 2,13,22);
+@Sigma1=( 6,11,25);
+@sigma0=( 7,18, 3);
+@sigma1=(17,19,10);
+
+sub BODY_00_15 {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+
+$code.=<<___ if ($i<16);
+#if __ARM_ARCH__>=7
+ @ ldr $t1,[$inp],#4 @ $i
+# if $i==15
+ str $inp,[sp,#17*4] @ make room for $t4
+# endif
+ eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+ add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
+ eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
+# ifndef __ARMEB__
+ rev $t1,$t1
+# endif
+#else
+ @ ldrb $t1,[$inp,#3] @ $i
+ add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
+ ldrb $t2,[$inp,#2]
+ ldrb $t0,[$inp,#1]
+ orr $t1,$t1,$t2,lsl#8
+ ldrb $t2,[$inp],#4
+ orr $t1,$t1,$t0,lsl#16
+# if $i==15
+ str $inp,[sp,#17*4] @ make room for $t4
+# endif
+ eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+ orr $t1,$t1,$t2,lsl#24
+ eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
+#endif
+___
+$code.=<<___;
+ ldr $t2,[$Ktbl],#4 @ *K256++
+ add $h,$h,$t1 @ h+=X[i]
+ str $t1,[sp,#`$i%16`*4]
+ eor $t1,$f,$g
+ add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
+ and $t1,$t1,$e
+ add $h,$h,$t2 @ h+=K256[i]
+ eor $t1,$t1,$g @ Ch(e,f,g)
+ eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
+ add $h,$h,$t1 @ h+=Ch(e,f,g)
+#if $i==31
+ and $t2,$t2,#0xff
+ cmp $t2,#0xf2 @ done?
+#endif
+#if $i<15
+# if __ARM_ARCH__>=7
+ ldr $t1,[$inp],#4 @ prefetch
+# else
+ ldrb $t1,[$inp,#3]
+# endif
+ eor $t2,$a,$b @ a^b, b^c in next round
+#else
+ ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
+ eor $t2,$a,$b @ a^b, b^c in next round
+ ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
+#endif
+ eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
+ and $t3,$t3,$t2 @ (b^c)&=(a^b)
+ add $d,$d,$h @ d+=h
+ eor $t3,$t3,$b @ Maj(a,b,c)
+ add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
+ @ add $h,$h,$t3 @ h+=Maj(a,b,c)
+___
+ ($t2,$t3)=($t3,$t2);
+}
+
+sub BODY_16_XX {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+
+$code.=<<___;
+ @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
+ @ ldr $t4,[sp,#`($i+14)%16`*4]
+ mov $t0,$t1,ror#$sigma0[0]
+ add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
+ mov $t2,$t4,ror#$sigma1[0]
+ eor $t0,$t0,$t1,ror#$sigma0[1]
+ eor $t2,$t2,$t4,ror#$sigma1[1]
+ eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
+ ldr $t1,[sp,#`($i+0)%16`*4]
+ eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
+ ldr $t4,[sp,#`($i+9)%16`*4]
+
+ add $t2,$t2,$t0
+ eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
+ add $t1,$t1,$t2
+ eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
+ add $t1,$t1,$t4 @ X[i]
+___
+ &BODY_00_15(@_);
+}
+
+$code=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+.text
+#if __ARM_ARCH__<7
+.code 32
+#else
+.syntax unified
+# ifdef __thumb2__
+# define adrl adr
+.thumb
+# else
+.code 32
+# endif
+#endif
+
+.type K256,%object
+.align 5
+K256:
+.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size K256,.-K256
+.word 0 @ terminator
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-sha256_block_data_order
+#endif
+.align 5
+
+.global sha256_block_data_order
+.type sha256_block_data_order,%function
+sha256_block_data_order:
+#if __ARM_ARCH__<7
+ sub r3,pc,#8 @ sha256_block_data_order
+#else
+ adr r3,sha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+ ldr r12,.LOPENSSL_armcap
+ ldr r12,[r3,r12] @ OPENSSL_armcap_P
+ tst r12,#ARMV8_SHA256
+ bne .LARMv8
+ tst r12,#ARMV7_NEON
+ bne .LNEON
+#endif
+ add $len,$inp,$len,lsl#6 @ len to point at the end of inp
+ stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
+ ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
+ sub $Ktbl,r3,#256+32 @ K256
+ sub sp,sp,#16*4 @ alloca(X[16])
+.Loop:
+# if __ARM_ARCH__>=7
+ ldr $t1,[$inp],#4
+# else
+ ldrb $t1,[$inp,#3]
+# endif
+ eor $t3,$B,$C @ magic
+ eor $t2,$t2,$t2
+___
+for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
+$code.=".Lrounds_16_xx:\n";
+for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+#if __ARM_ARCH__>=7
+ ite eq @ Thumb2 thing, sanity check in ARM
+#endif
+ ldreq $t3,[sp,#16*4] @ pull ctx
+ bne .Lrounds_16_xx
+
+ add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
+ ldr $t0,[$t3,#0]
+ ldr $t1,[$t3,#4]
+ ldr $t2,[$t3,#8]
+ add $A,$A,$t0
+ ldr $t0,[$t3,#12]
+ add $B,$B,$t1
+ ldr $t1,[$t3,#16]
+ add $C,$C,$t2
+ ldr $t2,[$t3,#20]
+ add $D,$D,$t0
+ ldr $t0,[$t3,#24]
+ add $E,$E,$t1
+ ldr $t1,[$t3,#28]
+ add $F,$F,$t2
+ ldr $inp,[sp,#17*4] @ pull inp
+ ldr $t2,[sp,#18*4] @ pull inp+len
+ add $G,$G,$t0
+ add $H,$H,$t1
+ stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
+ cmp $inp,$t2
+ sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
+ bne .Loop
+
+ add sp,sp,#`16+3`*4 @ destroy frame
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r11,pc}
+#else
+ ldmia sp!,{r4-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size sha256_block_data_order,.-sha256_block_data_order
+___
+######################################################################
+# NEON stuff
+#
+{{{
+my @X=map("q$_",(0..3));
+my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
+my $Xfer=$t4;
+my $j=0;
+
+sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
+sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
+
+sub AUTOLOAD() # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+ my $arg = pop;
+ $arg = "#$arg" if ($arg*1 eq $arg);
+ $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
+}
+
+sub Xupdate()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body);
+ my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+ &vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T2,$T0,$sigma0[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T1,$T0,$sigma0[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T2,$T0,32-$sigma0[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T3,$T0,$sigma0[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T1,$T1,$T2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T3,$T0,32-$sigma0[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T1,$T1,$T3); # sigma0(X[1..4])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4); # sigma1(X[14..15])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vld1_32 ("{$T0}","[$Ktbl,:128]!");
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4); # sigma1(X[16..17])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 ($T0,$T0,@X[0]);
+ while($#insns>=2) { eval(shift(@insns)); }
+ &vst1_32 ("{$T0}","[$Xfer,:128]!");
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xpreload()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body);
+ my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vld1_32 ("{$T0}","[$Ktbl,:128]!");
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vrev32_8 (@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 ($T0,$T0,@X[0]);
+ foreach (@insns) { eval; } # remaining instructions
+ &vst1_32 ("{$T0}","[$Xfer,:128]!");
+
+ push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub body_00_15 () {
+ (
+ '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
+ '&add ($h,$h,$t1)', # h+=X[i]+K[i]
+ '&eor ($t1,$f,$g)',
+ '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
+ '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
+ '&and ($t1,$t1,$e)',
+ '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
+ '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
+ '&eor ($t1,$t1,$g)', # Ch(e,f,g)
+ '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
+ '&eor ($t2,$a,$b)', # a^b, b^c in next round
+ '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
+ '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
+ '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
+ '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
+ '&ldr ($t1,"[sp,#64]") if ($j==31)',
+ '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
+ '&add ($d,$d,$h)', # d+=h
+ '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
+ '&eor ($t3,$t3,$b)', # Maj(a,b,c)
+ '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
+ )
+}
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.global sha256_block_data_order_neon
+.type sha256_block_data_order_neon,%function
+.align 4
+sha256_block_data_order_neon:
+.LNEON:
+ stmdb sp!,{r4-r12,lr}
+
+ sub $H,sp,#16*4+16
+ adrl $Ktbl,K256
+ bic $H,$H,#15 @ align for 128-bit stores
+ mov $t2,sp
+ mov sp,$H @ alloca
+ add $len,$inp,$len,lsl#6 @ len to point at the end of inp
+
+ vld1.8 {@X[0]},[$inp]!
+ vld1.8 {@X[1]},[$inp]!
+ vld1.8 {@X[2]},[$inp]!
+ vld1.8 {@X[3]},[$inp]!
+ vld1.32 {$T0},[$Ktbl,:128]!
+ vld1.32 {$T1},[$Ktbl,:128]!
+ vld1.32 {$T2},[$Ktbl,:128]!
+ vld1.32 {$T3},[$Ktbl,:128]!
+ vrev32.8 @X[0],@X[0] @ yes, even on
+ str $ctx,[sp,#64]
+ vrev32.8 @X[1],@X[1] @ big-endian
+ str $inp,[sp,#68]
+ mov $Xfer,sp
+ vrev32.8 @X[2],@X[2]
+ str $len,[sp,#72]
+ vrev32.8 @X[3],@X[3]
+ str $t2,[sp,#76] @ save original sp
+ vadd.i32 $T0,$T0,@X[0]
+ vadd.i32 $T1,$T1,@X[1]
+ vst1.32 {$T0},[$Xfer,:128]!
+ vadd.i32 $T2,$T2,@X[2]
+ vst1.32 {$T1},[$Xfer,:128]!
+ vadd.i32 $T3,$T3,@X[3]
+ vst1.32 {$T2},[$Xfer,:128]!
+ vst1.32 {$T3},[$Xfer,:128]!
+
+ ldmia $ctx,{$A-$H}
+ sub $Xfer,$Xfer,#64
+ ldr $t1,[sp,#0]
+ eor $t2,$t2,$t2
+ eor $t3,$B,$C
+ b .L_00_48
+
+.align 4
+.L_00_48:
+___
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+$code.=<<___;
+ teq $t1,#0 @ check for K256 terminator
+ ldr $t1,[sp,#0]
+ sub $Xfer,$Xfer,#64
+ bne .L_00_48
+
+ ldr $inp,[sp,#68]
+ ldr $t0,[sp,#72]
+ sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
+ teq $inp,$t0
+ it eq
+ subeq $inp,$inp,#64 @ avoid SEGV
+ vld1.8 {@X[0]},[$inp]! @ load next input block
+ vld1.8 {@X[1]},[$inp]!
+ vld1.8 {@X[2]},[$inp]!
+ vld1.8 {@X[3]},[$inp]!
+ it ne
+ strne $inp,[sp,#68]
+ mov $Xfer,sp
+___
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+$code.=<<___;
+ ldr $t0,[$t1,#0]
+ add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
+ ldr $t2,[$t1,#4]
+ ldr $t3,[$t1,#8]
+ ldr $t4,[$t1,#12]
+ add $A,$A,$t0 @ accumulate
+ ldr $t0,[$t1,#16]
+ add $B,$B,$t2
+ ldr $t2,[$t1,#20]
+ add $C,$C,$t3
+ ldr $t3,[$t1,#24]
+ add $D,$D,$t4
+ ldr $t4,[$t1,#28]
+ add $E,$E,$t0
+ str $A,[$t1],#4
+ add $F,$F,$t2
+ str $B,[$t1],#4
+ add $G,$G,$t3
+ str $C,[$t1],#4
+ add $H,$H,$t4
+ str $D,[$t1],#4
+ stmia $t1,{$E-$H}
+
+ ittte ne
+ movne $Xfer,sp
+ ldrne $t1,[sp,#0]
+ eorne $t2,$t2,$t2
+ ldreq sp,[sp,#76] @ restore original sp
+ itt ne
+ eorne $t3,$B,$C
+ bne .L_00_48
+
+ ldmia sp!,{r4-r12,pc}
+.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
+#endif
+___
+}}}
+######################################################################
+# ARMv8 stuff
+#
+{{{
+my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
+my @MSG=map("q$_",(8..11));
+my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
+my $Ktbl="r3";
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# ifdef __thumb2__
+# define INST(a,b,c,d) .byte c,d|0xc,a,b
+# else
+# define INST(a,b,c,d) .byte a,b,c,d
+# endif
+
+.type sha256_block_data_order_armv8,%function
+.align 5
+sha256_block_data_order_armv8:
+.LARMv8:
+ vld1.32 {$ABCD,$EFGH},[$ctx]
+# ifdef __thumb2__
+ adr $Ktbl,.LARMv8
+ sub $Ktbl,$Ktbl,#.LARMv8-K256
+# else
+ adrl $Ktbl,K256
+# endif
+ add $len,$inp,$len,lsl#6 @ len to point at the end of inp
+
+.Loop_v8:
+ vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
+ vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
+ vld1.32 {$W0},[$Ktbl]!
+ vrev32.8 @MSG[0],@MSG[0]
+ vrev32.8 @MSG[1],@MSG[1]
+ vrev32.8 @MSG[2],@MSG[2]
+ vrev32.8 @MSG[3],@MSG[3]
+ vmov $ABCD_SAVE,$ABCD @ offload
+ vmov $EFGH_SAVE,$EFGH
+ teq $inp,$len
+___
+for($i=0;$i<12;$i++) {
+$code.=<<___;
+ vld1.32 {$W1},[$Ktbl]!
+ vadd.i32 $W0,$W0,@MSG[0]
+ sha256su0 @MSG[0],@MSG[1]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+ sha256su1 @MSG[0],@MSG[2],@MSG[3]
+___
+ ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+ vld1.32 {$W1},[$Ktbl]!
+ vadd.i32 $W0,$W0,@MSG[0]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+
+ vld1.32 {$W0},[$Ktbl]!
+ vadd.i32 $W1,$W1,@MSG[1]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W1
+ sha256h2 $EFGH,$abcd,$W1
+
+ vld1.32 {$W1},[$Ktbl]
+ vadd.i32 $W0,$W0,@MSG[2]
+ sub $Ktbl,$Ktbl,#256-16 @ rewind
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+
+ vadd.i32 $W1,$W1,@MSG[3]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W1
+ sha256h2 $EFGH,$abcd,$W1
+
+ vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
+ vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
+ it ne
+ bne .Loop_v8
+
+ vst1.32 {$ABCD,$EFGH},[$ctx]
+
+ ret @ bx lr
+.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
+#endif
+___
+}}}
+$code.=<<___;
+.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm OPENSSL_armcap_P,4,4
+#endif
+___
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/@/ and !/^$/);
+ print;
+}
+close SELF;
+
+{ my %opcode = (
+ "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
+ "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );
+
+ sub unsha256 {
+ my ($mnemonic,$arg)=@_;
+
+ if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
+ my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+ |(($2&7)<<17)|(($2&8)<<4)
+ |(($3&7)<<1) |(($3&8)<<2);
+ # since ARMv7 instructions are always encoded little-endian.
+ # correct solution is to use .inst directive, but older
+ # assemblers don't implement it:-(
+ sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
+ $word&0xff,($word>>8)&0xff,
+ ($word>>16)&0xff,($word>>24)&0xff,
+ $mnemonic,$arg;
+ }
+ }
+}
+
+foreach (split($/,$code)) {
+
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
+
+ s/\bret\b/bx lr/go or
+ s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
+
+ print $_,"\n";
+}
+
+close STDOUT; # enforce flush
diff --git a/arch/arm/crypto/sha256-core.S_shipped b/arch/arm/crypto/sha256-core.S_shipped
new file mode 100644
index 000000000000..555a1a8eec90
--- /dev/null
+++ b/arch/arm/crypto/sha256-core.S_shipped
@@ -0,0 +1,2808 @@
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Permission to use under GPL terms is granted.
+@ ====================================================================
+
+@ SHA256 block procedure for ARMv4. May 2007.
+
+@ Performance is ~2x better than gcc 3.4 generated code and in "abso-
+@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
+@ byte [on single-issue Xscale PXA250 core].
+
+@ July 2010.
+@
+@ Rescheduling for dual-issue pipeline resulted in 22% improvement on
+@ Cortex A8 core and ~20 cycles per processed byte.
+
+@ February 2011.
+@
+@ Profiler-assisted and platform-specific optimization resulted in 16%
+@ improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+@ September 2013.
+@
+@ Add NEON implementation. On Cortex A8 it was measured to process one
+@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+@ code (meaning that latter performs sub-optimally, nothing was done
+@ about it).
+
+@ May 2014.
+@
+@ Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+.text
+#if __ARM_ARCH__<7
+.code 32
+#else
+.syntax unified
+# ifdef __thumb2__
+# define adrl adr
+.thumb
+# else
+.code 32
+# endif
+#endif
+
+.type K256,%object
+.align 5
+K256:
+.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size K256,.-K256
+.word 0 @ terminator
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-sha256_block_data_order
+#endif
+.align 5
+
+.global sha256_block_data_order
+.type sha256_block_data_order,%function
+sha256_block_data_order:
+#if __ARM_ARCH__<7
+ sub r3,pc,#8 @ sha256_block_data_order
+#else
+ adr r3,sha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+ ldr r12,.LOPENSSL_armcap
+ ldr r12,[r3,r12] @ OPENSSL_armcap_P
+ tst r12,#ARMV8_SHA256
+ bne .LARMv8
+ tst r12,#ARMV7_NEON
+ bne .LNEON
+#endif
+ add r2,r1,r2,lsl#6 @ len to point at the end of inp
+ stmdb sp!,{r0,r1,r2,r4-r11,lr}
+ ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11}
+ sub r14,r3,#256+32 @ K256
+ sub sp,sp,#16*4 @ alloca(X[16])
+.Loop:
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ magic
+ eor r12,r12,r12
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 0
+# if 0==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 0
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 0==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r8,r8,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r11,r11,r2 @ h+=X[i]
+ str r2,[sp,#0*4]
+ eor r2,r9,r10
+ add r11,r11,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r8
+ add r11,r11,r12 @ h+=K256[i]
+ eor r2,r2,r10 @ Ch(e,f,g)
+ eor r0,r4,r4,ror#11
+ add r11,r11,r2 @ h+=Ch(e,f,g)
+#if 0==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 0<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r4,r5 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#2*4] @ from future BODY_16_xx
+ eor r12,r4,r5 @ a^b, b^c in next round
+ ldr r1,[sp,#15*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r4,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r7,r7,r11 @ d+=h
+ eor r3,r3,r5 @ Maj(a,b,c)
+ add r11,r11,r0,ror#2 @ h+=Sigma0(a)
+ @ add r11,r11,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 1
+# if 1==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 1
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 1==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r7,r7,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r10,r10,r2 @ h+=X[i]
+ str r2,[sp,#1*4]
+ eor r2,r8,r9
+ add r10,r10,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r7
+ add r10,r10,r3 @ h+=K256[i]
+ eor r2,r2,r9 @ Ch(e,f,g)
+ eor r0,r11,r11,ror#11
+ add r10,r10,r2 @ h+=Ch(e,f,g)
+#if 1==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 1<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r11,r4 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#3*4] @ from future BODY_16_xx
+ eor r3,r11,r4 @ a^b, b^c in next round
+ ldr r1,[sp,#0*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r11,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r6,r6,r10 @ d+=h
+ eor r12,r12,r4 @ Maj(a,b,c)
+ add r10,r10,r0,ror#2 @ h+=Sigma0(a)
+ @ add r10,r10,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 2
+# if 2==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 2
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 2==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r6,r6,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r9,r9,r2 @ h+=X[i]
+ str r2,[sp,#2*4]
+ eor r2,r7,r8
+ add r9,r9,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r6
+ add r9,r9,r12 @ h+=K256[i]
+ eor r2,r2,r8 @ Ch(e,f,g)
+ eor r0,r10,r10,ror#11
+ add r9,r9,r2 @ h+=Ch(e,f,g)
+#if 2==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 2<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r10,r11 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#4*4] @ from future BODY_16_xx
+ eor r12,r10,r11 @ a^b, b^c in next round
+ ldr r1,[sp,#1*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r10,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r5,r5,r9 @ d+=h
+ eor r3,r3,r11 @ Maj(a,b,c)
+ add r9,r9,r0,ror#2 @ h+=Sigma0(a)
+ @ add r9,r9,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 3
+# if 3==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 3
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 3==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r5,r5,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r8,r8,r2 @ h+=X[i]
+ str r2,[sp,#3*4]
+ eor r2,r6,r7
+ add r8,r8,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r5
+ add r8,r8,r3 @ h+=K256[i]
+ eor r2,r2,r7 @ Ch(e,f,g)
+ eor r0,r9,r9,ror#11
+ add r8,r8,r2 @ h+=Ch(e,f,g)
+#if 3==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 3<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r9,r10 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#5*4] @ from future BODY_16_xx
+ eor r3,r9,r10 @ a^b, b^c in next round
+ ldr r1,[sp,#2*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r9,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r4,r4,r8 @ d+=h
+ eor r12,r12,r10 @ Maj(a,b,c)
+ add r8,r8,r0,ror#2 @ h+=Sigma0(a)
+ @ add r8,r8,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 4
+# if 4==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 4
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 4==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r4,r4,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r7,r7,r2 @ h+=X[i]
+ str r2,[sp,#4*4]
+ eor r2,r5,r6
+ add r7,r7,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r4
+ add r7,r7,r12 @ h+=K256[i]
+ eor r2,r2,r6 @ Ch(e,f,g)
+ eor r0,r8,r8,ror#11
+ add r7,r7,r2 @ h+=Ch(e,f,g)
+#if 4==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 4<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r8,r9 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#6*4] @ from future BODY_16_xx
+ eor r12,r8,r9 @ a^b, b^c in next round
+ ldr r1,[sp,#3*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r8,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r11,r11,r7 @ d+=h
+ eor r3,r3,r9 @ Maj(a,b,c)
+ add r7,r7,r0,ror#2 @ h+=Sigma0(a)
+ @ add r7,r7,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 5
+# if 5==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 5
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 5==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r11,r11,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r6,r6,r2 @ h+=X[i]
+ str r2,[sp,#5*4]
+ eor r2,r4,r5
+ add r6,r6,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r11
+ add r6,r6,r3 @ h+=K256[i]
+ eor r2,r2,r5 @ Ch(e,f,g)
+ eor r0,r7,r7,ror#11
+ add r6,r6,r2 @ h+=Ch(e,f,g)
+#if 5==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 5<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r7,r8 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#7*4] @ from future BODY_16_xx
+ eor r3,r7,r8 @ a^b, b^c in next round
+ ldr r1,[sp,#4*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r7,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r10,r10,r6 @ d+=h
+ eor r12,r12,r8 @ Maj(a,b,c)
+ add r6,r6,r0,ror#2 @ h+=Sigma0(a)
+ @ add r6,r6,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 6
+# if 6==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 6
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 6==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r10,r10,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r5,r5,r2 @ h+=X[i]
+ str r2,[sp,#6*4]
+ eor r2,r11,r4
+ add r5,r5,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r10
+ add r5,r5,r12 @ h+=K256[i]
+ eor r2,r2,r4 @ Ch(e,f,g)
+ eor r0,r6,r6,ror#11
+ add r5,r5,r2 @ h+=Ch(e,f,g)
+#if 6==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 6<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r6,r7 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#8*4] @ from future BODY_16_xx
+ eor r12,r6,r7 @ a^b, b^c in next round
+ ldr r1,[sp,#5*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r6,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r9,r9,r5 @ d+=h
+ eor r3,r3,r7 @ Maj(a,b,c)
+ add r5,r5,r0,ror#2 @ h+=Sigma0(a)
+ @ add r5,r5,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 7
+# if 7==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 7
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 7==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r9,r9,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r4,r4,r2 @ h+=X[i]
+ str r2,[sp,#7*4]
+ eor r2,r10,r11
+ add r4,r4,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r9
+ add r4,r4,r3 @ h+=K256[i]
+ eor r2,r2,r11 @ Ch(e,f,g)
+ eor r0,r5,r5,ror#11
+ add r4,r4,r2 @ h+=Ch(e,f,g)
+#if 7==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 7<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#9*4] @ from future BODY_16_xx
+ eor r3,r5,r6 @ a^b, b^c in next round
+ ldr r1,[sp,#6*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r5,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r8,r8,r4 @ d+=h
+ eor r12,r12,r6 @ Maj(a,b,c)
+ add r4,r4,r0,ror#2 @ h+=Sigma0(a)
+ @ add r4,r4,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 8
+# if 8==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 8
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 8==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r8,r8,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r11,r11,r2 @ h+=X[i]
+ str r2,[sp,#8*4]
+ eor r2,r9,r10
+ add r11,r11,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r8
+ add r11,r11,r12 @ h+=K256[i]
+ eor r2,r2,r10 @ Ch(e,f,g)
+ eor r0,r4,r4,ror#11
+ add r11,r11,r2 @ h+=Ch(e,f,g)
+#if 8==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 8<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r4,r5 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#10*4] @ from future BODY_16_xx
+ eor r12,r4,r5 @ a^b, b^c in next round
+ ldr r1,[sp,#7*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r4,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r7,r7,r11 @ d+=h
+ eor r3,r3,r5 @ Maj(a,b,c)
+ add r11,r11,r0,ror#2 @ h+=Sigma0(a)
+ @ add r11,r11,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 9
+# if 9==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 9
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 9==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r7,r7,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r10,r10,r2 @ h+=X[i]
+ str r2,[sp,#9*4]
+ eor r2,r8,r9
+ add r10,r10,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r7
+ add r10,r10,r3 @ h+=K256[i]
+ eor r2,r2,r9 @ Ch(e,f,g)
+ eor r0,r11,r11,ror#11
+ add r10,r10,r2 @ h+=Ch(e,f,g)
+#if 9==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 9<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r11,r4 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#11*4] @ from future BODY_16_xx
+ eor r3,r11,r4 @ a^b, b^c in next round
+ ldr r1,[sp,#8*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r11,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r6,r6,r10 @ d+=h
+ eor r12,r12,r4 @ Maj(a,b,c)
+ add r10,r10,r0,ror#2 @ h+=Sigma0(a)
+ @ add r10,r10,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 10
+# if 10==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 10
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 10==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r6,r6,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r9,r9,r2 @ h+=X[i]
+ str r2,[sp,#10*4]
+ eor r2,r7,r8
+ add r9,r9,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r6
+ add r9,r9,r12 @ h+=K256[i]
+ eor r2,r2,r8 @ Ch(e,f,g)
+ eor r0,r10,r10,ror#11
+ add r9,r9,r2 @ h+=Ch(e,f,g)
+#if 10==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 10<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r10,r11 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#12*4] @ from future BODY_16_xx
+ eor r12,r10,r11 @ a^b, b^c in next round
+ ldr r1,[sp,#9*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r10,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r5,r5,r9 @ d+=h
+ eor r3,r3,r11 @ Maj(a,b,c)
+ add r9,r9,r0,ror#2 @ h+=Sigma0(a)
+ @ add r9,r9,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 11
+# if 11==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 11
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 11==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r5,r5,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r8,r8,r2 @ h+=X[i]
+ str r2,[sp,#11*4]
+ eor r2,r6,r7
+ add r8,r8,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r5
+ add r8,r8,r3 @ h+=K256[i]
+ eor r2,r2,r7 @ Ch(e,f,g)
+ eor r0,r9,r9,ror#11
+ add r8,r8,r2 @ h+=Ch(e,f,g)
+#if 11==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 11<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r9,r10 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#13*4] @ from future BODY_16_xx
+ eor r3,r9,r10 @ a^b, b^c in next round
+ ldr r1,[sp,#10*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r9,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r4,r4,r8 @ d+=h
+ eor r12,r12,r10 @ Maj(a,b,c)
+ add r8,r8,r0,ror#2 @ h+=Sigma0(a)
+ @ add r8,r8,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 12
+# if 12==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 12
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 12==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r4,r4,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r7,r7,r2 @ h+=X[i]
+ str r2,[sp,#12*4]
+ eor r2,r5,r6
+ add r7,r7,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r4
+ add r7,r7,r12 @ h+=K256[i]
+ eor r2,r2,r6 @ Ch(e,f,g)
+ eor r0,r8,r8,ror#11
+ add r7,r7,r2 @ h+=Ch(e,f,g)
+#if 12==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 12<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r8,r9 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#14*4] @ from future BODY_16_xx
+ eor r12,r8,r9 @ a^b, b^c in next round
+ ldr r1,[sp,#11*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r8,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r11,r11,r7 @ d+=h
+ eor r3,r3,r9 @ Maj(a,b,c)
+ add r7,r7,r0,ror#2 @ h+=Sigma0(a)
+ @ add r7,r7,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 13
+# if 13==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 13
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 13==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r11,r11,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r6,r6,r2 @ h+=X[i]
+ str r2,[sp,#13*4]
+ eor r2,r4,r5
+ add r6,r6,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r11
+ add r6,r6,r3 @ h+=K256[i]
+ eor r2,r2,r5 @ Ch(e,f,g)
+ eor r0,r7,r7,ror#11
+ add r6,r6,r2 @ h+=Ch(e,f,g)
+#if 13==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 13<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r7,r8 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#15*4] @ from future BODY_16_xx
+ eor r3,r7,r8 @ a^b, b^c in next round
+ ldr r1,[sp,#12*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r7,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r10,r10,r6 @ d+=h
+ eor r12,r12,r8 @ Maj(a,b,c)
+ add r6,r6,r0,ror#2 @ h+=Sigma0(a)
+ @ add r6,r6,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 14
+# if 14==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 14
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 14==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r10,r10,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r5,r5,r2 @ h+=X[i]
+ str r2,[sp,#14*4]
+ eor r2,r11,r4
+ add r5,r5,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r10
+ add r5,r5,r12 @ h+=K256[i]
+ eor r2,r2,r4 @ Ch(e,f,g)
+ eor r0,r6,r6,ror#11
+ add r5,r5,r2 @ h+=Ch(e,f,g)
+#if 14==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 14<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r6,r7 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#0*4] @ from future BODY_16_xx
+ eor r12,r6,r7 @ a^b, b^c in next round
+ ldr r1,[sp,#13*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r6,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r9,r9,r5 @ d+=h
+ eor r3,r3,r7 @ Maj(a,b,c)
+ add r5,r5,r0,ror#2 @ h+=Sigma0(a)
+ @ add r5,r5,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 15
+# if 15==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 15
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 15==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r9,r9,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r4,r4,r2 @ h+=X[i]
+ str r2,[sp,#15*4]
+ eor r2,r10,r11
+ add r4,r4,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r9
+ add r4,r4,r3 @ h+=K256[i]
+ eor r2,r2,r11 @ Ch(e,f,g)
+ eor r0,r5,r5,ror#11
+ add r4,r4,r2 @ h+=Ch(e,f,g)
+#if 15==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 15<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#1*4] @ from future BODY_16_xx
+ eor r3,r5,r6 @ a^b, b^c in next round
+ ldr r1,[sp,#14*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r5,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r8,r8,r4 @ d+=h
+ eor r12,r12,r6 @ Maj(a,b,c)
+ add r4,r4,r0,ror#2 @ h+=Sigma0(a)
+ @ add r4,r4,r12 @ h+=Maj(a,b,c)
+.Lrounds_16_xx:
+ @ ldr r2,[sp,#1*4] @ 16
+ @ ldr r1,[sp,#14*4]
+ mov r0,r2,ror#7
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#0*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#9*4]
+
+ add r12,r12,r0
+ eor r0,r8,r8,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r11,r11,r2 @ h+=X[i]
+ str r2,[sp,#0*4]
+ eor r2,r9,r10
+ add r11,r11,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r8
+ add r11,r11,r12 @ h+=K256[i]
+ eor r2,r2,r10 @ Ch(e,f,g)
+ eor r0,r4,r4,ror#11
+ add r11,r11,r2 @ h+=Ch(e,f,g)
+#if 16==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 16<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r4,r5 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#2*4] @ from future BODY_16_xx
+ eor r12,r4,r5 @ a^b, b^c in next round
+ ldr r1,[sp,#15*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r4,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r7,r7,r11 @ d+=h
+ eor r3,r3,r5 @ Maj(a,b,c)
+ add r11,r11,r0,ror#2 @ h+=Sigma0(a)
+ @ add r11,r11,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#2*4] @ 17
+ @ ldr r1,[sp,#15*4]
+ mov r0,r2,ror#7
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#1*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#10*4]
+
+ add r3,r3,r0
+ eor r0,r7,r7,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r10,r10,r2 @ h+=X[i]
+ str r2,[sp,#1*4]
+ eor r2,r8,r9
+ add r10,r10,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r7
+ add r10,r10,r3 @ h+=K256[i]
+ eor r2,r2,r9 @ Ch(e,f,g)
+ eor r0,r11,r11,ror#11
+ add r10,r10,r2 @ h+=Ch(e,f,g)
+#if 17==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 17<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r11,r4 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#3*4] @ from future BODY_16_xx
+ eor r3,r11,r4 @ a^b, b^c in next round
+ ldr r1,[sp,#0*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r11,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r6,r6,r10 @ d+=h
+ eor r12,r12,r4 @ Maj(a,b,c)
+ add r10,r10,r0,ror#2 @ h+=Sigma0(a)
+ @ add r10,r10,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#3*4] @ 18
+ @ ldr r1,[sp,#0*4]
+ mov r0,r2,ror#7
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#2*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#11*4]
+
+ add r12,r12,r0
+ eor r0,r6,r6,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r9,r9,r2 @ h+=X[i]
+ str r2,[sp,#2*4]
+ eor r2,r7,r8
+ add r9,r9,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r6
+ add r9,r9,r12 @ h+=K256[i]
+ eor r2,r2,r8 @ Ch(e,f,g)
+ eor r0,r10,r10,ror#11
+ add r9,r9,r2 @ h+=Ch(e,f,g)
+#if 18==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 18<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r10,r11 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#4*4] @ from future BODY_16_xx
+ eor r12,r10,r11 @ a^b, b^c in next round
+ ldr r1,[sp,#1*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r10,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r5,r5,r9 @ d+=h
+ eor r3,r3,r11 @ Maj(a,b,c)
+ add r9,r9,r0,ror#2 @ h+=Sigma0(a)
+ @ add r9,r9,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#4*4] @ 19
+ @ ldr r1,[sp,#1*4]
+ mov r0,r2,ror#7
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#3*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#12*4]
+
+ add r3,r3,r0
+ eor r0,r5,r5,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r8,r8,r2 @ h+=X[i]
+ str r2,[sp,#3*4]
+ eor r2,r6,r7
+ add r8,r8,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r5
+ add r8,r8,r3 @ h+=K256[i]
+ eor r2,r2,r7 @ Ch(e,f,g)
+ eor r0,r9,r9,ror#11
+ add r8,r8,r2 @ h+=Ch(e,f,g)
+#if 19==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 19<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r9,r10 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#5*4] @ from future BODY_16_xx
+ eor r3,r9,r10 @ a^b, b^c in next round
+ ldr r1,[sp,#2*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r9,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r4,r4,r8 @ d+=h
+ eor r12,r12,r10 @ Maj(a,b,c)
+ add r8,r8,r0,ror#2 @ h+=Sigma0(a)
+ @ add r8,r8,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#5*4] @ 20
+ @ ldr r1,[sp,#2*4]
+ mov r0,r2,ror#7
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#4*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#13*4]
+
+ add r12,r12,r0
+ eor r0,r4,r4,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r7,r7,r2 @ h+=X[i]
+ str r2,[sp,#4*4]
+ eor r2,r5,r6
+ add r7,r7,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r4
+ add r7,r7,r12 @ h+=K256[i]
+ eor r2,r2,r6 @ Ch(e,f,g)
+ eor r0,r8,r8,ror#11
+ add r7,r7,r2 @ h+=Ch(e,f,g)
+#if 20==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 20<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r8,r9 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#6*4] @ from future BODY_16_xx
+ eor r12,r8,r9 @ a^b, b^c in next round
+ ldr r1,[sp,#3*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r8,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r11,r11,r7 @ d+=h
+ eor r3,r3,r9 @ Maj(a,b,c)
+ add r7,r7,r0,ror#2 @ h+=Sigma0(a)
+ @ add r7,r7,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#6*4] @ 21
+ @ ldr r1,[sp,#3*4]
+ mov r0,r2,ror#7
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#5*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#14*4]
+
+ add r3,r3,r0
+ eor r0,r11,r11,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r6,r6,r2 @ h+=X[i]
+ str r2,[sp,#5*4]
+ eor r2,r4,r5
+ add r6,r6,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r11
+ add r6,r6,r3 @ h+=K256[i]
+ eor r2,r2,r5 @ Ch(e,f,g)
+ eor r0,r7,r7,ror#11
+ add r6,r6,r2 @ h+=Ch(e,f,g)
+#if 21==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 21<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r7,r8 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#7*4] @ from future BODY_16_xx
+ eor r3,r7,r8 @ a^b, b^c in next round
+ ldr r1,[sp,#4*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r7,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r10,r10,r6 @ d+=h
+ eor r12,r12,r8 @ Maj(a,b,c)
+ add r6,r6,r0,ror#2 @ h+=Sigma0(a)
+ @ add r6,r6,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#7*4] @ 22
+ @ ldr r1,[sp,#4*4]
+ mov r0,r2,ror#7
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#6*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#15*4]
+
+ add r12,r12,r0
+ eor r0,r10,r10,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r5,r5,r2 @ h+=X[i]
+ str r2,[sp,#6*4]
+ eor r2,r11,r4
+ add r5,r5,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r10
+ add r5,r5,r12 @ h+=K256[i]
+ eor r2,r2,r4 @ Ch(e,f,g)
+ eor r0,r6,r6,ror#11
+ add r5,r5,r2 @ h+=Ch(e,f,g)
+#if 22==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 22<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r6,r7 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#8*4] @ from future BODY_16_xx
+ eor r12,r6,r7 @ a^b, b^c in next round
+ ldr r1,[sp,#5*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r6,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r9,r9,r5 @ d+=h
+ eor r3,r3,r7 @ Maj(a,b,c)
+ add r5,r5,r0,ror#2 @ h+=Sigma0(a)
+ @ add r5,r5,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#8*4] @ 23
+ @ ldr r1,[sp,#5*4]
+ mov r0,r2,ror#7
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#7*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#0*4]
+
+ add r3,r3,r0
+ eor r0,r9,r9,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r4,r4,r2 @ h+=X[i]
+ str r2,[sp,#7*4]
+ eor r2,r10,r11
+ add r4,r4,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r9
+ add r4,r4,r3 @ h+=K256[i]
+ eor r2,r2,r11 @ Ch(e,f,g)
+ eor r0,r5,r5,ror#11
+ add r4,r4,r2 @ h+=Ch(e,f,g)
+#if 23==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 23<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#9*4] @ from future BODY_16_xx
+ eor r3,r5,r6 @ a^b, b^c in next round
+ ldr r1,[sp,#6*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r5,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r8,r8,r4 @ d+=h
+ eor r12,r12,r6 @ Maj(a,b,c)
+ add r4,r4,r0,ror#2 @ h+=Sigma0(a)
+ @ add r4,r4,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#9*4] @ 24
+ @ ldr r1,[sp,#6*4]
+ mov r0,r2,ror#7
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#8*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#1*4]
+
+ add r12,r12,r0
+ eor r0,r8,r8,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r11,r11,r2 @ h+=X[i]
+ str r2,[sp,#8*4]
+ eor r2,r9,r10
+ add r11,r11,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r8
+ add r11,r11,r12 @ h+=K256[i]
+ eor r2,r2,r10 @ Ch(e,f,g)
+ eor r0,r4,r4,ror#11
+ add r11,r11,r2 @ h+=Ch(e,f,g)
+#if 24==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 24<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r4,r5 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#10*4] @ from future BODY_16_xx
+ eor r12,r4,r5 @ a^b, b^c in next round
+ ldr r1,[sp,#7*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r4,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r7,r7,r11 @ d+=h
+ eor r3,r3,r5 @ Maj(a,b,c)
+ add r11,r11,r0,ror#2 @ h+=Sigma0(a)
+ @ add r11,r11,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#10*4] @ 25
+ @ ldr r1,[sp,#7*4]
+ mov r0,r2,ror#7
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#9*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#2*4]
+
+ add r3,r3,r0
+ eor r0,r7,r7,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r10,r10,r2 @ h+=X[i]
+ str r2,[sp,#9*4]
+ eor r2,r8,r9
+ add r10,r10,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r7
+ add r10,r10,r3 @ h+=K256[i]
+ eor r2,r2,r9 @ Ch(e,f,g)
+ eor r0,r11,r11,ror#11
+ add r10,r10,r2 @ h+=Ch(e,f,g)
+#if 25==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 25<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r11,r4 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#11*4] @ from future BODY_16_xx
+ eor r3,r11,r4 @ a^b, b^c in next round
+ ldr r1,[sp,#8*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r11,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r6,r6,r10 @ d+=h
+ eor r12,r12,r4 @ Maj(a,b,c)
+ add r10,r10,r0,ror#2 @ h+=Sigma0(a)
+ @ add r10,r10,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#11*4] @ 26
+ @ ldr r1,[sp,#8*4]
+ mov r0,r2,ror#7
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#10*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#3*4]
+
+ add r12,r12,r0
+ eor r0,r6,r6,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r9,r9,r2 @ h+=X[i]
+ str r2,[sp,#10*4]
+ eor r2,r7,r8
+ add r9,r9,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r6
+ add r9,r9,r12 @ h+=K256[i]
+ eor r2,r2,r8 @ Ch(e,f,g)
+ eor r0,r10,r10,ror#11
+ add r9,r9,r2 @ h+=Ch(e,f,g)
+#if 26==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 26<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r10,r11 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#12*4] @ from future BODY_16_xx
+ eor r12,r10,r11 @ a^b, b^c in next round
+ ldr r1,[sp,#9*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r10,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r5,r5,r9 @ d+=h
+ eor r3,r3,r11 @ Maj(a,b,c)
+ add r9,r9,r0,ror#2 @ h+=Sigma0(a)
+ @ add r9,r9,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#12*4] @ 27
+ @ ldr r1,[sp,#9*4]
+ mov r0,r2,ror#7
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#11*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#4*4]
+
+ add r3,r3,r0
+ eor r0,r5,r5,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r8,r8,r2 @ h+=X[i]
+ str r2,[sp,#11*4]
+ eor r2,r6,r7
+ add r8,r8,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r5
+ add r8,r8,r3 @ h+=K256[i]
+ eor r2,r2,r7 @ Ch(e,f,g)
+ eor r0,r9,r9,ror#11
+ add r8,r8,r2 @ h+=Ch(e,f,g)
+#if 27==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 27<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r9,r10 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#13*4] @ from future BODY_16_xx
+ eor r3,r9,r10 @ a^b, b^c in next round
+ ldr r1,[sp,#10*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r9,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r4,r4,r8 @ d+=h
+ eor r12,r12,r10 @ Maj(a,b,c)
+ add r8,r8,r0,ror#2 @ h+=Sigma0(a)
+ @ add r8,r8,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#13*4] @ 28
+ @ ldr r1,[sp,#10*4]
+ mov r0,r2,ror#7
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#12*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#5*4]
+
+ add r12,r12,r0
+ eor r0,r4,r4,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r7,r7,r2 @ h+=X[i]
+ str r2,[sp,#12*4]
+ eor r2,r5,r6
+ add r7,r7,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r4
+ add r7,r7,r12 @ h+=K256[i]
+ eor r2,r2,r6 @ Ch(e,f,g)
+ eor r0,r8,r8,ror#11
+ add r7,r7,r2 @ h+=Ch(e,f,g)
+#if 28==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 28<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r8,r9 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#14*4] @ from future BODY_16_xx
+ eor r12,r8,r9 @ a^b, b^c in next round
+ ldr r1,[sp,#11*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r8,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r11,r11,r7 @ d+=h
+ eor r3,r3,r9 @ Maj(a,b,c)
+ add r7,r7,r0,ror#2 @ h+=Sigma0(a)
+ @ add r7,r7,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#14*4] @ 29
+ @ ldr r1,[sp,#11*4]
+ mov r0,r2,ror#7
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#13*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#6*4]
+
+ add r3,r3,r0
+ eor r0,r11,r11,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r6,r6,r2 @ h+=X[i]
+ str r2,[sp,#13*4]
+ eor r2,r4,r5
+ add r6,r6,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r11
+ add r6,r6,r3 @ h+=K256[i]
+ eor r2,r2,r5 @ Ch(e,f,g)
+ eor r0,r7,r7,ror#11
+ add r6,r6,r2 @ h+=Ch(e,f,g)
+#if 29==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 29<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r7,r8 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#15*4] @ from future BODY_16_xx
+ eor r3,r7,r8 @ a^b, b^c in next round
+ ldr r1,[sp,#12*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r7,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r10,r10,r6 @ d+=h
+ eor r12,r12,r8 @ Maj(a,b,c)
+ add r6,r6,r0,ror#2 @ h+=Sigma0(a)
+ @ add r6,r6,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#15*4] @ 30
+ @ ldr r1,[sp,#12*4]
+ mov r0,r2,ror#7
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#14*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#7*4]
+
+ add r12,r12,r0
+ eor r0,r10,r10,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r5,r5,r2 @ h+=X[i]
+ str r2,[sp,#14*4]
+ eor r2,r11,r4
+ add r5,r5,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r10
+ add r5,r5,r12 @ h+=K256[i]
+ eor r2,r2,r4 @ Ch(e,f,g)
+ eor r0,r6,r6,ror#11
+ add r5,r5,r2 @ h+=Ch(e,f,g)
+#if 30==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 30<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r6,r7 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#0*4] @ from future BODY_16_xx
+ eor r12,r6,r7 @ a^b, b^c in next round
+ ldr r1,[sp,#13*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r6,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r9,r9,r5 @ d+=h
+ eor r3,r3,r7 @ Maj(a,b,c)
+ add r5,r5,r0,ror#2 @ h+=Sigma0(a)
+ @ add r5,r5,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#0*4] @ 31
+ @ ldr r1,[sp,#13*4]
+ mov r0,r2,ror#7
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#15*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#8*4]
+
+ add r3,r3,r0
+ eor r0,r9,r9,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r4,r4,r2 @ h+=X[i]
+ str r2,[sp,#15*4]
+ eor r2,r10,r11
+ add r4,r4,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r9
+ add r4,r4,r3 @ h+=K256[i]
+ eor r2,r2,r11 @ Ch(e,f,g)
+ eor r0,r5,r5,ror#11
+ add r4,r4,r2 @ h+=Ch(e,f,g)
+#if 31==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 31<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#1*4] @ from future BODY_16_xx
+ eor r3,r5,r6 @ a^b, b^c in next round
+ ldr r1,[sp,#14*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r5,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r8,r8,r4 @ d+=h
+ eor r12,r12,r6 @ Maj(a,b,c)
+ add r4,r4,r0,ror#2 @ h+=Sigma0(a)
+ @ add r4,r4,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ ite eq @ Thumb2 thing, sanity check in ARM
+#endif
+ ldreq r3,[sp,#16*4] @ pull ctx
+ bne .Lrounds_16_xx
+
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldr r0,[r3,#0]
+ ldr r2,[r3,#4]
+ ldr r12,[r3,#8]
+ add r4,r4,r0
+ ldr r0,[r3,#12]
+ add r5,r5,r2
+ ldr r2,[r3,#16]
+ add r6,r6,r12
+ ldr r12,[r3,#20]
+ add r7,r7,r0
+ ldr r0,[r3,#24]
+ add r8,r8,r2
+ ldr r2,[r3,#28]
+ add r9,r9,r12
+ ldr r1,[sp,#17*4] @ pull inp
+ ldr r12,[sp,#18*4] @ pull inp+len
+ add r10,r10,r0
+ add r11,r11,r2
+ stmia r3,{r4,r5,r6,r7,r8,r9,r10,r11}
+ cmp r1,r12
+ sub r14,r14,#256 @ rewind Ktbl
+ bne .Loop
+
+ add sp,sp,#19*4 @ destroy frame
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r11,pc}
+#else
+ ldmia sp!,{r4-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ .word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+.size sha256_block_data_order,.-sha256_block_data_order
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.global sha256_block_data_order_neon
+.type sha256_block_data_order_neon,%function
+.align 4
+sha256_block_data_order_neon:
+.LNEON:
+ stmdb sp!,{r4-r12,lr}
+
+ sub r11,sp,#16*4+16
+ adrl r14,K256
+ bic r11,r11,#15 @ align for 128-bit stores
+ mov r12,sp
+ mov sp,r11 @ alloca
+ add r2,r1,r2,lsl#6 @ len to point at the end of inp
+
+ vld1.8 {q0},[r1]!
+ vld1.8 {q1},[r1]!
+ vld1.8 {q2},[r1]!
+ vld1.8 {q3},[r1]!
+ vld1.32 {q8},[r14,:128]!
+ vld1.32 {q9},[r14,:128]!
+ vld1.32 {q10},[r14,:128]!
+ vld1.32 {q11},[r14,:128]!
+ vrev32.8 q0,q0 @ yes, even on
+ str r0,[sp,#64]
+ vrev32.8 q1,q1 @ big-endian
+ str r1,[sp,#68]
+ mov r1,sp
+ vrev32.8 q2,q2
+ str r2,[sp,#72]
+ vrev32.8 q3,q3
+ str r12,[sp,#76] @ save original sp
+ vadd.i32 q8,q8,q0
+ vadd.i32 q9,q9,q1
+ vst1.32 {q8},[r1,:128]!
+ vadd.i32 q10,q10,q2
+ vst1.32 {q9},[r1,:128]!
+ vadd.i32 q11,q11,q3
+ vst1.32 {q10},[r1,:128]!
+ vst1.32 {q11},[r1,:128]!
+
+ ldmia r0,{r4-r11}
+ sub r1,r1,#64
+ ldr r2,[sp,#0]
+ eor r12,r12,r12
+ eor r3,r5,r6
+ b .L_00_48
+
+.align 4
+.L_00_48:
+ vext.8 q8,q0,q1,#4
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ vext.8 q9,q2,q3,#4
+ add r4,r4,r12
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vadd.i32 q0,q0,q9
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#4]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ veor q9,q9,q10
+ add r10,r10,r2
+ vsli.32 q11,q8,#14
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ vshr.u32 d24,d7,#17
+ add r11,r11,r3
+ and r2,r2,r7
+ veor q9,q9,q11
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ vsli.32 d24,d7,#15
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ vshr.u32 d25,d7,#10
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ vadd.i32 q0,q0,q9
+ add r10,r10,r2
+ ldr r2,[sp,#8]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r6,r6,r10
+ vshr.u32 d24,d7,#19
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ vsli.32 d24,d7,#13
+ add r9,r9,r2
+ eor r2,r7,r8
+ veor d25,d25,d24
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ vadd.i32 d0,d0,d25
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ vshr.u32 d24,d0,#17
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ vsli.32 d24,d0,#15
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ vshr.u32 d25,d0,#10
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#12]
+ and r3,r3,r12
+ vshr.u32 d24,d0,#19
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ vld1.32 {q8},[r14,:128]!
+ add r8,r8,r2
+ vsli.32 d24,d0,#13
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ veor d25,d25,d24
+ add r9,r9,r3
+ and r2,r2,r5
+ vadd.i32 d1,d1,d25
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ vadd.i32 q8,q8,q0
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#16]
+ and r12,r12,r3
+ add r4,r4,r8
+ vst1.32 {q8},[r1,:128]!
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vext.8 q8,q1,q2,#4
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ vext.8 q9,q3,q0,#4
+ add r8,r8,r12
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vadd.i32 q1,q1,q9
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#20]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ veor q9,q9,q10
+ add r6,r6,r2
+ vsli.32 q11,q8,#14
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ vshr.u32 d24,d1,#17
+ add r7,r7,r3
+ and r2,r2,r11
+ veor q9,q9,q11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ vsli.32 d24,d1,#15
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ vshr.u32 d25,d1,#10
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ vadd.i32 q1,q1,q9
+ add r6,r6,r2
+ ldr r2,[sp,#24]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r10,r10,r6
+ vshr.u32 d24,d1,#19
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ vsli.32 d24,d1,#13
+ add r5,r5,r2
+ eor r2,r11,r4
+ veor d25,d25,d24
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ vadd.i32 d2,d2,d25
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ vshr.u32 d24,d2,#17
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ vsli.32 d24,d2,#15
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ vshr.u32 d25,d2,#10
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#28]
+ and r3,r3,r12
+ vshr.u32 d24,d2,#19
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ vld1.32 {q8},[r14,:128]!
+ add r4,r4,r2
+ vsli.32 d24,d2,#13
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ veor d25,d25,d24
+ add r5,r5,r3
+ and r2,r2,r9
+ vadd.i32 d3,d3,d25
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ vadd.i32 q8,q8,q1
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[sp,#32]
+ and r12,r12,r3
+ add r8,r8,r4
+ vst1.32 {q8},[r1,:128]!
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ vext.8 q8,q2,q3,#4
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ vext.8 q9,q0,q1,#4
+ add r4,r4,r12
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vadd.i32 q2,q2,q9
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#36]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ veor q9,q9,q10
+ add r10,r10,r2
+ vsli.32 q11,q8,#14
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ vshr.u32 d24,d3,#17
+ add r11,r11,r3
+ and r2,r2,r7
+ veor q9,q9,q11
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ vsli.32 d24,d3,#15
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ vshr.u32 d25,d3,#10
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ vadd.i32 q2,q2,q9
+ add r10,r10,r2
+ ldr r2,[sp,#40]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r6,r6,r10
+ vshr.u32 d24,d3,#19
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ vsli.32 d24,d3,#13
+ add r9,r9,r2
+ eor r2,r7,r8
+ veor d25,d25,d24
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ vadd.i32 d4,d4,d25
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ vshr.u32 d24,d4,#17
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ vsli.32 d24,d4,#15
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ vshr.u32 d25,d4,#10
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#44]
+ and r3,r3,r12
+ vshr.u32 d24,d4,#19
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ vld1.32 {q8},[r14,:128]!
+ add r8,r8,r2
+ vsli.32 d24,d4,#13
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ veor d25,d25,d24
+ add r9,r9,r3
+ and r2,r2,r5
+ vadd.i32 d5,d5,d25
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ vadd.i32 q8,q8,q2
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#48]
+ and r12,r12,r3
+ add r4,r4,r8
+ vst1.32 {q8},[r1,:128]!
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vext.8 q8,q3,q0,#4
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ vext.8 q9,q1,q2,#4
+ add r8,r8,r12
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vadd.i32 q3,q3,q9
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#52]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ veor q9,q9,q10
+ add r6,r6,r2
+ vsli.32 q11,q8,#14
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ vshr.u32 d24,d5,#17
+ add r7,r7,r3
+ and r2,r2,r11
+ veor q9,q9,q11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ vsli.32 d24,d5,#15
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ vshr.u32 d25,d5,#10
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ vadd.i32 q3,q3,q9
+ add r6,r6,r2
+ ldr r2,[sp,#56]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r10,r10,r6
+ vshr.u32 d24,d5,#19
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ vsli.32 d24,d5,#13
+ add r5,r5,r2
+ eor r2,r11,r4
+ veor d25,d25,d24
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ vadd.i32 d6,d6,d25
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ vshr.u32 d24,d6,#17
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ vsli.32 d24,d6,#15
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ vshr.u32 d25,d6,#10
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#60]
+ and r3,r3,r12
+ vshr.u32 d24,d6,#19
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ vld1.32 {q8},[r14,:128]!
+ add r4,r4,r2
+ vsli.32 d24,d6,#13
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ veor d25,d25,d24
+ add r5,r5,r3
+ and r2,r2,r9
+ vadd.i32 d7,d7,d25
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ vadd.i32 q8,q8,q3
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[r14]
+ and r12,r12,r3
+ add r8,r8,r4
+ vst1.32 {q8},[r1,:128]!
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ teq r2,#0 @ check for K256 terminator
+ ldr r2,[sp,#0]
+ sub r1,r1,#64
+ bne .L_00_48
+
+ ldr r1,[sp,#68]
+ ldr r0,[sp,#72]
+ sub r14,r14,#256 @ rewind r14
+ teq r1,r0
+ it eq
+ subeq r1,r1,#64 @ avoid SEGV
+ vld1.8 {q0},[r1]! @ load next input block
+ vld1.8 {q1},[r1]!
+ vld1.8 {q2},[r1]!
+ vld1.8 {q3},[r1]!
+ it ne
+ strne r1,[sp,#68]
+ mov r1,sp
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vrev32.8 q0,q0
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vadd.i32 q8,q8,q0
+ ldr r2,[sp,#4]
+ and r3,r3,r12
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ add r10,r10,r2
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3
+ and r2,r2,r7
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ add r10,r10,r2
+ ldr r2,[sp,#8]
+ and r12,r12,r3
+ add r6,r6,r10
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ add r9,r9,r2
+ eor r2,r7,r8
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ ldr r2,[sp,#12]
+ and r3,r3,r12
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ add r8,r8,r2
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3
+ and r2,r2,r5
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#16]
+ and r12,r12,r3
+ add r4,r4,r8
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vst1.32 {q8},[r1,:128]!
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vrev32.8 q1,q1
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vadd.i32 q8,q8,q1
+ ldr r2,[sp,#20]
+ and r3,r3,r12
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ add r6,r6,r2
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3
+ and r2,r2,r11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ add r6,r6,r2
+ ldr r2,[sp,#24]
+ and r12,r12,r3
+ add r10,r10,r6
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ add r5,r5,r2
+ eor r2,r11,r4
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ ldr r2,[sp,#28]
+ and r3,r3,r12
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ add r4,r4,r2
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3
+ and r2,r2,r9
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[sp,#32]
+ and r12,r12,r3
+ add r8,r8,r4
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ vst1.32 {q8},[r1,:128]!
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vrev32.8 q2,q2
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vadd.i32 q8,q8,q2
+ ldr r2,[sp,#36]
+ and r3,r3,r12
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ add r10,r10,r2
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3
+ and r2,r2,r7
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ add r10,r10,r2
+ ldr r2,[sp,#40]
+ and r12,r12,r3
+ add r6,r6,r10
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ add r9,r9,r2
+ eor r2,r7,r8
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ ldr r2,[sp,#44]
+ and r3,r3,r12
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ add r8,r8,r2
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3
+ and r2,r2,r5
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#48]
+ and r12,r12,r3
+ add r4,r4,r8
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vst1.32 {q8},[r1,:128]!
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vrev32.8 q3,q3
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vadd.i32 q8,q8,q3
+ ldr r2,[sp,#52]
+ and r3,r3,r12
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ add r6,r6,r2
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3
+ and r2,r2,r11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ add r6,r6,r2
+ ldr r2,[sp,#56]
+ and r12,r12,r3
+ add r10,r10,r6
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ add r5,r5,r2
+ eor r2,r11,r4
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ ldr r2,[sp,#60]
+ and r3,r3,r12
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ add r4,r4,r2
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3
+ and r2,r2,r9
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[sp,#64]
+ and r12,r12,r3
+ add r8,r8,r4
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ vst1.32 {q8},[r1,:128]!
+ ldr r0,[r2,#0]
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldr r12,[r2,#4]
+ ldr r3,[r2,#8]
+ ldr r1,[r2,#12]
+ add r4,r4,r0 @ accumulate
+ ldr r0,[r2,#16]
+ add r5,r5,r12
+ ldr r12,[r2,#20]
+ add r6,r6,r3
+ ldr r3,[r2,#24]
+ add r7,r7,r1
+ ldr r1,[r2,#28]
+ add r8,r8,r0
+ str r4,[r2],#4
+ add r9,r9,r12
+ str r5,[r2],#4
+ add r10,r10,r3
+ str r6,[r2],#4
+ add r11,r11,r1
+ str r7,[r2],#4
+ stmia r2,{r8-r11}
+
+ ittte ne
+ movne r1,sp
+ ldrne r2,[sp,#0]
+ eorne r12,r12,r12
+ ldreq sp,[sp,#76] @ restore original sp
+ itt ne
+ eorne r3,r5,r6
+ bne .L_00_48
+
+ ldmia sp!,{r4-r12,pc}
+.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# ifdef __thumb2__
+# define INST(a,b,c,d) .byte c,d|0xc,a,b
+# else
+# define INST(a,b,c,d) .byte a,b,c,d
+# endif
+
+.type sha256_block_data_order_armv8,%function
+.align 5
+sha256_block_data_order_armv8:
+.LARMv8:
+ vld1.32 {q0,q1},[r0]
+# ifdef __thumb2__
+ adr r3,.LARMv8
+ sub r3,r3,#.LARMv8-K256
+# else
+ adrl r3,K256
+# endif
+ add r2,r1,r2,lsl#6 @ len to point at the end of inp
+
+.Loop_v8:
+ vld1.8 {q8-q9},[r1]!
+ vld1.8 {q10-q11},[r1]!
+ vld1.32 {q12},[r3]!
+ vrev32.8 q8,q8
+ vrev32.8 q9,q9
+ vrev32.8 q10,q10
+ vrev32.8 q11,q11
+ vmov q14,q0 @ offload
+ vmov q15,q1
+ teq r1,r2
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q8
+ INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q9
+ INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q10
+ INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q11
+ INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q8
+ INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q9
+ INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q10
+ INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q11
+ INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q8
+ INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q9
+ INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q10
+ INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q11
+ INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q8
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q9
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+
+ vld1.32 {q13},[r3]
+ vadd.i32 q12,q12,q10
+ sub r3,r3,#256-16 @ rewind
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+
+ vadd.i32 q13,q13,q11
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+
+ vadd.i32 q0,q0,q14
+ vadd.i32 q1,q1,q15
+ it ne
+ bne .Loop_v8
+
+ vst1.32 {q0,q1},[r0]
+
+ bx lr @ bx lr
+.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
+#endif
+.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro@openssl.org>"
+.align 2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm OPENSSL_armcap_P,4,4
+#endif
diff --git a/arch/arm/crypto/sha256_glue.c b/arch/arm/crypto/sha256_glue.c
new file mode 100644
index 000000000000..bb03482e53e2
--- /dev/null
+++ b/arch/arm/crypto/sha256_glue.c
@@ -0,0 +1,246 @@
+/*
+ * Glue code for the SHA256 Secure Hash Algorithm assembly implementation
+ * using optimized ARM assembler and NEON instructions.
+ *
+ * Copyright © 2015 Google Inc.
+ *
+ * This file is based on sha256_ssse3_glue.c:
+ * Copyright (C) 2013 Intel Corporation
+ * Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/simd.h>
+#include <asm/neon.h>
+#include "sha256_glue.h"
+
+asmlinkage void sha256_block_data_order(u32 *digest, const void *data,
+ unsigned int num_blks);
+
+
+int sha256_init(struct shash_desc *desc)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+
+ sctx->state[0] = SHA256_H0;
+ sctx->state[1] = SHA256_H1;
+ sctx->state[2] = SHA256_H2;
+ sctx->state[3] = SHA256_H3;
+ sctx->state[4] = SHA256_H4;
+ sctx->state[5] = SHA256_H5;
+ sctx->state[6] = SHA256_H6;
+ sctx->state[7] = SHA256_H7;
+ sctx->count = 0;
+
+ return 0;
+}
+
+int sha224_init(struct shash_desc *desc)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+
+ sctx->state[0] = SHA224_H0;
+ sctx->state[1] = SHA224_H1;
+ sctx->state[2] = SHA224_H2;
+ sctx->state[3] = SHA224_H3;
+ sctx->state[4] = SHA224_H4;
+ sctx->state[5] = SHA224_H5;
+ sctx->state[6] = SHA224_H6;
+ sctx->state[7] = SHA224_H7;
+ sctx->count = 0;
+
+ return 0;
+}
+
+int __sha256_update(struct shash_desc *desc, const u8 *data, unsigned int len,
+ unsigned int partial)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+ unsigned int done = 0;
+
+ sctx->count += len;
+
+ if (partial) {
+ done = SHA256_BLOCK_SIZE - partial;
+ memcpy(sctx->buf + partial, data, done);
+ sha256_block_data_order(sctx->state, sctx->buf, 1);
+ }
+
+ if (len - done >= SHA256_BLOCK_SIZE) {
+ const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
+
+ sha256_block_data_order(sctx->state, data + done, rounds);
+ done += rounds * SHA256_BLOCK_SIZE;
+ }
+
+ memcpy(sctx->buf, data + done, len - done);
+
+ return 0;
+}
+
+int sha256_update(struct shash_desc *desc, const u8 *data, unsigned int len)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+ unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
+
+ /* Handle the fast case right here */
+ if (partial + len < SHA256_BLOCK_SIZE) {
+ sctx->count += len;
+ memcpy(sctx->buf + partial, data, len);
+
+ return 0;
+ }
+
+ return __sha256_update(desc, data, len, partial);
+}
+
+/* Add padding and return the message digest. */
+static int sha256_final(struct shash_desc *desc, u8 *out)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+ unsigned int i, index, padlen;
+ __be32 *dst = (__be32 *)out;
+ __be64 bits;
+ static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
+
+ /* save number of bits */
+ bits = cpu_to_be64(sctx->count << 3);
+
+ /* Pad out to 56 mod 64 and append length */
+ index = sctx->count % SHA256_BLOCK_SIZE;
+ padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index);
+
+ /* We need to fill a whole block for __sha256_update */
+ if (padlen <= 56) {
+ sctx->count += padlen;
+ memcpy(sctx->buf + index, padding, padlen);
+ } else {
+ __sha256_update(desc, padding, padlen, index);
+ }
+ __sha256_update(desc, (const u8 *)&bits, sizeof(bits), 56);
+
+ /* Store state in digest */
+ for (i = 0; i < 8; i++)
+ dst[i] = cpu_to_be32(sctx->state[i]);
+
+ /* Wipe context */
+ memset(sctx, 0, sizeof(*sctx));
+
+ return 0;
+}
+
+static int sha224_final(struct shash_desc *desc, u8 *out)
+{
+ u8 D[SHA256_DIGEST_SIZE];
+
+ sha256_final(desc, D);
+
+ memcpy(out, D, SHA224_DIGEST_SIZE);
+ memset(D, 0, SHA256_DIGEST_SIZE);
+
+ return 0;
+}
+
+int sha256_export(struct shash_desc *desc, void *out)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(out, sctx, sizeof(*sctx));
+
+ return 0;
+}
+
+int sha256_import(struct shash_desc *desc, const void *in)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(sctx, in, sizeof(*sctx));
+
+ return 0;
+}
+
+static struct shash_alg algs[] = { {
+ .digestsize = SHA256_DIGEST_SIZE,
+ .init = sha256_init,
+ .update = sha256_update,
+ .final = sha256_final,
+ .export = sha256_export,
+ .import = sha256_import,
+ .descsize = sizeof(struct sha256_state),
+ .statesize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha256",
+ .cra_driver_name = "sha256-asm",
+ .cra_priority = 150,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA256_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+}, {
+ .digestsize = SHA224_DIGEST_SIZE,
+ .init = sha224_init,
+ .update = sha256_update,
+ .final = sha224_final,
+ .export = sha256_export,
+ .import = sha256_import,
+ .descsize = sizeof(struct sha256_state),
+ .statesize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha224",
+ .cra_driver_name = "sha224-asm",
+ .cra_priority = 150,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA224_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
+
+static int __init sha256_mod_init(void)
+{
+ int res = crypto_register_shashes(algs, ARRAY_SIZE(algs));
+
+ if (res < 0)
+ return res;
+
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon()) {
+ res = crypto_register_shashes(sha256_neon_algs,
+ ARRAY_SIZE(sha256_neon_algs));
+
+ if (res < 0)
+ crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+ }
+
+ return res;
+}
+
+static void __exit sha256_mod_fini(void)
+{
+ crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon())
+ crypto_unregister_shashes(sha256_neon_algs,
+ ARRAY_SIZE(sha256_neon_algs));
+}
+
+module_init(sha256_mod_init);
+module_exit(sha256_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm (ARM), including NEON");
+
+MODULE_ALIAS_CRYPTO("sha256");
diff --git a/arch/arm/crypto/sha256_glue.h b/arch/arm/crypto/sha256_glue.h
new file mode 100644
index 000000000000..0312f4ffe8cc
--- /dev/null
+++ b/arch/arm/crypto/sha256_glue.h
@@ -0,0 +1,23 @@
+#ifndef _CRYPTO_SHA256_GLUE_H
+#define _CRYPTO_SHA256_GLUE_H
+
+#include <linux/crypto.h>
+#include <crypto/sha.h>
+
+extern struct shash_alg sha256_neon_algs[2];
+
+extern int sha256_init(struct shash_desc *desc);
+
+extern int sha224_init(struct shash_desc *desc);
+
+extern int __sha256_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len, unsigned int partial);
+
+extern int sha256_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len);
+
+extern int sha256_export(struct shash_desc *desc, void *out);
+
+extern int sha256_import(struct shash_desc *desc, const void *in);
+
+#endif /* _CRYPTO_SHA256_GLUE_H */
diff --git a/arch/arm/crypto/sha256_neon_glue.c b/arch/arm/crypto/sha256_neon_glue.c
new file mode 100644
index 000000000000..3ff0a7f1d092
--- /dev/null
+++ b/arch/arm/crypto/sha256_neon_glue.c
@@ -0,0 +1,172 @@
+/*
+ * Glue code for the SHA256 Secure Hash Algorithm assembly implementation
+ * using NEON instructions.
+ *
+ * Copyright © 2015 Google Inc.
+ *
+ * This file is based on sha512_neon_glue.c:
+ * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/simd.h>
+#include <asm/neon.h>
+#include "sha256_glue.h"
+
+asmlinkage void sha256_block_data_order_neon(u32 *digest, const void *data,
+ unsigned int num_blks);
+
+
+static int __sha256_neon_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len, unsigned int partial)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+ unsigned int done = 0;
+
+ sctx->count += len;
+
+ if (partial) {
+ done = SHA256_BLOCK_SIZE - partial;
+ memcpy(sctx->buf + partial, data, done);
+ sha256_block_data_order_neon(sctx->state, sctx->buf, 1);
+ }
+
+ if (len - done >= SHA256_BLOCK_SIZE) {
+ const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
+
+ sha256_block_data_order_neon(sctx->state, data + done, rounds);
+ done += rounds * SHA256_BLOCK_SIZE;
+ }
+
+ memcpy(sctx->buf, data + done, len - done);
+
+ return 0;
+}
+
+static int sha256_neon_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+ unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
+ int res;
+
+ /* Handle the fast case right here */
+ if (partial + len < SHA256_BLOCK_SIZE) {
+ sctx->count += len;
+ memcpy(sctx->buf + partial, data, len);
+
+ return 0;
+ }
+
+ if (!may_use_simd()) {
+ res = __sha256_update(desc, data, len, partial);
+ } else {
+ kernel_neon_begin();
+ res = __sha256_neon_update(desc, data, len, partial);
+ kernel_neon_end();
+ }
+
+ return res;
+}
+
+/* Add padding and return the message digest. */
+static int sha256_neon_final(struct shash_desc *desc, u8 *out)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+ unsigned int i, index, padlen;
+ __be32 *dst = (__be32 *)out;
+ __be64 bits;
+ static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
+
+ /* save number of bits */
+ bits = cpu_to_be64(sctx->count << 3);
+
+ /* Pad out to 56 mod 64 and append length */
+ index = sctx->count % SHA256_BLOCK_SIZE;
+ padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index);
+
+ if (!may_use_simd()) {
+ sha256_update(desc, padding, padlen);
+ sha256_update(desc, (const u8 *)&bits, sizeof(bits));
+ } else {
+ kernel_neon_begin();
+ /* We need to fill a whole block for __sha256_neon_update() */
+ if (padlen <= 56) {
+ sctx->count += padlen;
+ memcpy(sctx->buf + index, padding, padlen);
+ } else {
+ __sha256_neon_update(desc, padding, padlen, index);
+ }
+ __sha256_neon_update(desc, (const u8 *)&bits,
+ sizeof(bits), 56);
+ kernel_neon_end();
+ }
+
+ /* Store state in digest */
+ for (i = 0; i < 8; i++)
+ dst[i] = cpu_to_be32(sctx->state[i]);
+
+ /* Wipe context */
+ memset(sctx, 0, sizeof(*sctx));
+
+ return 0;
+}
+
+static int sha224_neon_final(struct shash_desc *desc, u8 *out)
+{
+ u8 D[SHA256_DIGEST_SIZE];
+
+ sha256_neon_final(desc, D);
+
+ memcpy(out, D, SHA224_DIGEST_SIZE);
+ memset(D, 0, SHA256_DIGEST_SIZE);
+
+ return 0;
+}
+
+struct shash_alg sha256_neon_algs[] = { {
+ .digestsize = SHA256_DIGEST_SIZE,
+ .init = sha256_init,
+ .update = sha256_neon_update,
+ .final = sha256_neon_final,
+ .export = sha256_export,
+ .import = sha256_import,
+ .descsize = sizeof(struct sha256_state),
+ .statesize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha256",
+ .cra_driver_name = "sha256-neon",
+ .cra_priority = 250,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA256_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+}, {
+ .digestsize = SHA224_DIGEST_SIZE,
+ .init = sha224_init,
+ .update = sha256_neon_update,
+ .final = sha224_neon_final,
+ .export = sha256_export,
+ .import = sha256_import,
+ .descsize = sizeof(struct sha256_state),
+ .statesize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha224",
+ .cra_driver_name = "sha224-neon",
+ .cra_priority = 250,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA224_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
diff --git a/arch/arm/crypto/sha512-armv7-neon.S b/arch/arm/crypto/sha512-armv7-neon.S
new file mode 100644
index 000000000000..fe99472e507c
--- /dev/null
+++ b/arch/arm/crypto/sha512-armv7-neon.S
@@ -0,0 +1,455 @@
+/* sha512-armv7-neon.S - ARM/NEON assembly implementation of SHA-512 transform
+ *
+ * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <linux/linkage.h>
+
+
+.syntax unified
+.code 32
+.fpu neon
+
+.text
+
+/* structure of SHA512_CONTEXT */
+#define hd_a 0
+#define hd_b ((hd_a) + 8)
+#define hd_c ((hd_b) + 8)
+#define hd_d ((hd_c) + 8)
+#define hd_e ((hd_d) + 8)
+#define hd_f ((hd_e) + 8)
+#define hd_g ((hd_f) + 8)
+
+/* register macros */
+#define RK %r2
+
+#define RA d0
+#define RB d1
+#define RC d2
+#define RD d3
+#define RE d4
+#define RF d5
+#define RG d6
+#define RH d7
+
+#define RT0 d8
+#define RT1 d9
+#define RT2 d10
+#define RT3 d11
+#define RT4 d12
+#define RT5 d13
+#define RT6 d14
+#define RT7 d15
+
+#define RT01q q4
+#define RT23q q5
+#define RT45q q6
+#define RT67q q7
+
+#define RW0 d16
+#define RW1 d17
+#define RW2 d18
+#define RW3 d19
+#define RW4 d20
+#define RW5 d21
+#define RW6 d22
+#define RW7 d23
+#define RW8 d24
+#define RW9 d25
+#define RW10 d26
+#define RW11 d27
+#define RW12 d28
+#define RW13 d29
+#define RW14 d30
+#define RW15 d31
+
+#define RW01q q8
+#define RW23q q9
+#define RW45q q10
+#define RW67q q11
+#define RW89q q12
+#define RW1011q q13
+#define RW1213q q14
+#define RW1415q q15
+
+/***********************************************************************
+ * ARM assembly implementation of sha512 transform
+ ***********************************************************************/
+#define rounds2_0_63(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw1, rw01q, rw2, \
+ rw23q, rw1415q, rw9, rw10, interleave_op, arg1) \
+ /* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \
+ vshr.u64 RT2, re, #14; \
+ vshl.u64 RT3, re, #64 - 14; \
+ interleave_op(arg1); \
+ vshr.u64 RT4, re, #18; \
+ vshl.u64 RT5, re, #64 - 18; \
+ vld1.64 {RT0}, [RK]!; \
+ veor.64 RT23q, RT23q, RT45q; \
+ vshr.u64 RT4, re, #41; \
+ vshl.u64 RT5, re, #64 - 41; \
+ vadd.u64 RT0, RT0, rw0; \
+ veor.64 RT23q, RT23q, RT45q; \
+ vmov.64 RT7, re; \
+ veor.64 RT1, RT2, RT3; \
+ vbsl.64 RT7, rf, rg; \
+ \
+ vadd.u64 RT1, RT1, rh; \
+ vshr.u64 RT2, ra, #28; \
+ vshl.u64 RT3, ra, #64 - 28; \
+ vadd.u64 RT1, RT1, RT0; \
+ vshr.u64 RT4, ra, #34; \
+ vshl.u64 RT5, ra, #64 - 34; \
+ vadd.u64 RT1, RT1, RT7; \
+ \
+ /* h = Sum0 (a) + Maj (a, b, c); */ \
+ veor.64 RT23q, RT23q, RT45q; \
+ vshr.u64 RT4, ra, #39; \
+ vshl.u64 RT5, ra, #64 - 39; \
+ veor.64 RT0, ra, rb; \
+ veor.64 RT23q, RT23q, RT45q; \
+ vbsl.64 RT0, rc, rb; \
+ vadd.u64 rd, rd, RT1; /* d+=t1; */ \
+ veor.64 rh, RT2, RT3; \
+ \
+ /* t1 = g + Sum1 (d) + Ch (d, e, f) + k[t] + w[t]; */ \
+ vshr.u64 RT2, rd, #14; \
+ vshl.u64 RT3, rd, #64 - 14; \
+ vadd.u64 rh, rh, RT0; \
+ vshr.u64 RT4, rd, #18; \
+ vshl.u64 RT5, rd, #64 - 18; \
+ vadd.u64 rh, rh, RT1; /* h+=t1; */ \
+ vld1.64 {RT0}, [RK]!; \
+ veor.64 RT23q, RT23q, RT45q; \
+ vshr.u64 RT4, rd, #41; \
+ vshl.u64 RT5, rd, #64 - 41; \
+ vadd.u64 RT0, RT0, rw1; \
+ veor.64 RT23q, RT23q, RT45q; \
+ vmov.64 RT7, rd; \
+ veor.64 RT1, RT2, RT3; \
+ vbsl.64 RT7, re, rf; \
+ \
+ vadd.u64 RT1, RT1, rg; \
+ vshr.u64 RT2, rh, #28; \
+ vshl.u64 RT3, rh, #64 - 28; \
+ vadd.u64 RT1, RT1, RT0; \
+ vshr.u64 RT4, rh, #34; \
+ vshl.u64 RT5, rh, #64 - 34; \
+ vadd.u64 RT1, RT1, RT7; \
+ \
+ /* g = Sum0 (h) + Maj (h, a, b); */ \
+ veor.64 RT23q, RT23q, RT45q; \
+ vshr.u64 RT4, rh, #39; \
+ vshl.u64 RT5, rh, #64 - 39; \
+ veor.64 RT0, rh, ra; \
+ veor.64 RT23q, RT23q, RT45q; \
+ vbsl.64 RT0, rb, ra; \
+ vadd.u64 rc, rc, RT1; /* c+=t1; */ \
+ veor.64 rg, RT2, RT3; \
+ \
+ /* w[0] += S1 (w[14]) + w[9] + S0 (w[1]); */ \
+ /* w[1] += S1 (w[15]) + w[10] + S0 (w[2]); */ \
+ \
+ /**** S0(w[1:2]) */ \
+ \
+ /* w[0:1] += w[9:10] */ \
+ /* RT23q = rw1:rw2 */ \
+ vext.u64 RT23q, rw01q, rw23q, #1; \
+ vadd.u64 rw0, rw9; \
+ vadd.u64 rg, rg, RT0; \
+ vadd.u64 rw1, rw10;\
+ vadd.u64 rg, rg, RT1; /* g+=t1; */ \
+ \
+ vshr.u64 RT45q, RT23q, #1; \
+ vshl.u64 RT67q, RT23q, #64 - 1; \
+ vshr.u64 RT01q, RT23q, #8; \
+ veor.u64 RT45q, RT45q, RT67q; \
+ vshl.u64 RT67q, RT23q, #64 - 8; \
+ veor.u64 RT45q, RT45q, RT01q; \
+ vshr.u64 RT01q, RT23q, #7; \
+ veor.u64 RT45q, RT45q, RT67q; \
+ \
+ /**** S1(w[14:15]) */ \
+ vshr.u64 RT23q, rw1415q, #6; \
+ veor.u64 RT01q, RT01q, RT45q; \
+ vshr.u64 RT45q, rw1415q, #19; \
+ vshl.u64 RT67q, rw1415q, #64 - 19; \
+ veor.u64 RT23q, RT23q, RT45q; \
+ vshr.u64 RT45q, rw1415q, #61; \
+ veor.u64 RT23q, RT23q, RT67q; \
+ vshl.u64 RT67q, rw1415q, #64 - 61; \
+ veor.u64 RT23q, RT23q, RT45q; \
+ vadd.u64 rw01q, RT01q; /* w[0:1] += S(w[1:2]) */ \
+ veor.u64 RT01q, RT23q, RT67q;
+#define vadd_RT01q(rw01q) \
+ /* w[0:1] += S(w[14:15]) */ \
+ vadd.u64 rw01q, RT01q;
+
+#define dummy(_) /*_*/
+
+#define rounds2_64_79(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw1, \
+ interleave_op1, arg1, interleave_op2, arg2) \
+ /* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \
+ vshr.u64 RT2, re, #14; \
+ vshl.u64 RT3, re, #64 - 14; \
+ interleave_op1(arg1); \
+ vshr.u64 RT4, re, #18; \
+ vshl.u64 RT5, re, #64 - 18; \
+ interleave_op2(arg2); \
+ vld1.64 {RT0}, [RK]!; \
+ veor.64 RT23q, RT23q, RT45q; \
+ vshr.u64 RT4, re, #41; \
+ vshl.u64 RT5, re, #64 - 41; \
+ vadd.u64 RT0, RT0, rw0; \
+ veor.64 RT23q, RT23q, RT45q; \
+ vmov.64 RT7, re; \
+ veor.64 RT1, RT2, RT3; \
+ vbsl.64 RT7, rf, rg; \
+ \
+ vadd.u64 RT1, RT1, rh; \
+ vshr.u64 RT2, ra, #28; \
+ vshl.u64 RT3, ra, #64 - 28; \
+ vadd.u64 RT1, RT1, RT0; \
+ vshr.u64 RT4, ra, #34; \
+ vshl.u64 RT5, ra, #64 - 34; \
+ vadd.u64 RT1, RT1, RT7; \
+ \
+ /* h = Sum0 (a) + Maj (a, b, c); */ \
+ veor.64 RT23q, RT23q, RT45q; \
+ vshr.u64 RT4, ra, #39; \
+ vshl.u64 RT5, ra, #64 - 39; \
+ veor.64 RT0, ra, rb; \
+ veor.64 RT23q, RT23q, RT45q; \
+ vbsl.64 RT0, rc, rb; \
+ vadd.u64 rd, rd, RT1; /* d+=t1; */ \
+ veor.64 rh, RT2, RT3; \
+ \
+ /* t1 = g + Sum1 (d) + Ch (d, e, f) + k[t] + w[t]; */ \
+ vshr.u64 RT2, rd, #14; \
+ vshl.u64 RT3, rd, #64 - 14; \
+ vadd.u64 rh, rh, RT0; \
+ vshr.u64 RT4, rd, #18; \
+ vshl.u64 RT5, rd, #64 - 18; \
+ vadd.u64 rh, rh, RT1; /* h+=t1; */ \
+ vld1.64 {RT0}, [RK]!; \
+ veor.64 RT23q, RT23q, RT45q; \
+ vshr.u64 RT4, rd, #41; \
+ vshl.u64 RT5, rd, #64 - 41; \
+ vadd.u64 RT0, RT0, rw1; \
+ veor.64 RT23q, RT23q, RT45q; \
+ vmov.64 RT7, rd; \
+ veor.64 RT1, RT2, RT3; \
+ vbsl.64 RT7, re, rf; \
+ \
+ vadd.u64 RT1, RT1, rg; \
+ vshr.u64 RT2, rh, #28; \
+ vshl.u64 RT3, rh, #64 - 28; \
+ vadd.u64 RT1, RT1, RT0; \
+ vshr.u64 RT4, rh, #34; \
+ vshl.u64 RT5, rh, #64 - 34; \
+ vadd.u64 RT1, RT1, RT7; \
+ \
+ /* g = Sum0 (h) + Maj (h, a, b); */ \
+ veor.64 RT23q, RT23q, RT45q; \
+ vshr.u64 RT4, rh, #39; \
+ vshl.u64 RT5, rh, #64 - 39; \
+ veor.64 RT0, rh, ra; \
+ veor.64 RT23q, RT23q, RT45q; \
+ vbsl.64 RT0, rb, ra; \
+ vadd.u64 rc, rc, RT1; /* c+=t1; */ \
+ veor.64 rg, RT2, RT3;
+#define vadd_rg_RT0(rg) \
+ vadd.u64 rg, rg, RT0;
+#define vadd_rg_RT1(rg) \
+ vadd.u64 rg, rg, RT1; /* g+=t1; */
+
+.align 3
+ENTRY(sha512_transform_neon)
+ /* Input:
+ * %r0: SHA512_CONTEXT
+ * %r1: data
+ * %r2: u64 k[] constants
+ * %r3: nblks
+ */
+ push {%lr};
+
+ mov %lr, #0;
+
+ /* Load context to d0-d7 */
+ vld1.64 {RA-RD}, [%r0]!;
+ vld1.64 {RE-RH}, [%r0];
+ sub %r0, #(4*8);
+
+ /* Load input to w[16], d16-d31 */
+ /* NOTE: Assumes that on ARMv7 unaligned accesses are always allowed. */
+ vld1.64 {RW0-RW3}, [%r1]!;
+ vld1.64 {RW4-RW7}, [%r1]!;
+ vld1.64 {RW8-RW11}, [%r1]!;
+ vld1.64 {RW12-RW15}, [%r1]!;
+#ifdef __ARMEL__
+ /* byteswap */
+ vrev64.8 RW01q, RW01q;
+ vrev64.8 RW23q, RW23q;
+ vrev64.8 RW45q, RW45q;
+ vrev64.8 RW67q, RW67q;
+ vrev64.8 RW89q, RW89q;
+ vrev64.8 RW1011q, RW1011q;
+ vrev64.8 RW1213q, RW1213q;
+ vrev64.8 RW1415q, RW1415q;
+#endif
+
+ /* EABI says that d8-d15 must be preserved by callee. */
+ /*vpush {RT0-RT7};*/
+
+.Loop:
+ rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, RW01q, RW2,
+ RW23q, RW1415q, RW9, RW10, dummy, _);
+ b .Lenter_rounds;
+
+.Loop_rounds:
+ rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, RW01q, RW2,
+ RW23q, RW1415q, RW9, RW10, vadd_RT01q, RW1415q);
+.Lenter_rounds:
+ rounds2_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW3, RW23q, RW4,
+ RW45q, RW01q, RW11, RW12, vadd_RT01q, RW01q);
+ rounds2_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5, RW45q, RW6,
+ RW67q, RW23q, RW13, RW14, vadd_RT01q, RW23q);
+ rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7, RW67q, RW8,
+ RW89q, RW45q, RW15, RW0, vadd_RT01q, RW45q);
+ rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9, RW89q, RW10,
+ RW1011q, RW67q, RW1, RW2, vadd_RT01q, RW67q);
+ rounds2_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11, RW1011q, RW12,
+ RW1213q, RW89q, RW3, RW4, vadd_RT01q, RW89q);
+ add %lr, #16;
+ rounds2_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, RW1213q, RW14,
+ RW1415q, RW1011q, RW5, RW6, vadd_RT01q, RW1011q);
+ cmp %lr, #64;
+ rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, RW1415q, RW0,
+ RW01q, RW1213q, RW7, RW8, vadd_RT01q, RW1213q);
+ bne .Loop_rounds;
+
+ subs %r3, #1;
+
+ rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1,
+ vadd_RT01q, RW1415q, dummy, _);
+ rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW3,
+ vadd_rg_RT0, RG, vadd_rg_RT1, RG);
+ beq .Lhandle_tail;
+ vld1.64 {RW0-RW3}, [%r1]!;
+ rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5,
+ vadd_rg_RT0, RE, vadd_rg_RT1, RE);
+ rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7,
+ vadd_rg_RT0, RC, vadd_rg_RT1, RC);
+#ifdef __ARMEL__
+ vrev64.8 RW01q, RW01q;
+ vrev64.8 RW23q, RW23q;
+#endif
+ vld1.64 {RW4-RW7}, [%r1]!;
+ rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9,
+ vadd_rg_RT0, RA, vadd_rg_RT1, RA);
+ rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11,
+ vadd_rg_RT0, RG, vadd_rg_RT1, RG);
+#ifdef __ARMEL__
+ vrev64.8 RW45q, RW45q;
+ vrev64.8 RW67q, RW67q;
+#endif
+ vld1.64 {RW8-RW11}, [%r1]!;
+ rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13,
+ vadd_rg_RT0, RE, vadd_rg_RT1, RE);
+ rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15,
+ vadd_rg_RT0, RC, vadd_rg_RT1, RC);
+#ifdef __ARMEL__
+ vrev64.8 RW89q, RW89q;
+ vrev64.8 RW1011q, RW1011q;
+#endif
+ vld1.64 {RW12-RW15}, [%r1]!;
+ vadd_rg_RT0(RA);
+ vadd_rg_RT1(RA);
+
+ /* Load context */
+ vld1.64 {RT0-RT3}, [%r0]!;
+ vld1.64 {RT4-RT7}, [%r0];
+ sub %r0, #(4*8);
+
+#ifdef __ARMEL__
+ vrev64.8 RW1213q, RW1213q;
+ vrev64.8 RW1415q, RW1415q;
+#endif
+
+ vadd.u64 RA, RT0;
+ vadd.u64 RB, RT1;
+ vadd.u64 RC, RT2;
+ vadd.u64 RD, RT3;
+ vadd.u64 RE, RT4;
+ vadd.u64 RF, RT5;
+ vadd.u64 RG, RT6;
+ vadd.u64 RH, RT7;
+
+ /* Store the first half of context */
+ vst1.64 {RA-RD}, [%r0]!;
+ sub RK, $(8*80);
+ vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */
+ mov %lr, #0;
+ sub %r0, #(4*8);
+
+ b .Loop;
+
+.Lhandle_tail:
+ rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5,
+ vadd_rg_RT0, RE, vadd_rg_RT1, RE);
+ rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7,
+ vadd_rg_RT0, RC, vadd_rg_RT1, RC);
+ rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9,
+ vadd_rg_RT0, RA, vadd_rg_RT1, RA);
+ rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11,
+ vadd_rg_RT0, RG, vadd_rg_RT1, RG);
+ rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13,
+ vadd_rg_RT0, RE, vadd_rg_RT1, RE);
+ rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15,
+ vadd_rg_RT0, RC, vadd_rg_RT1, RC);
+
+ /* Load context to d16-d23 */
+ vld1.64 {RW0-RW3}, [%r0]!;
+ vadd_rg_RT0(RA);
+ vld1.64 {RW4-RW7}, [%r0];
+ vadd_rg_RT1(RA);
+ sub %r0, #(4*8);
+
+ vadd.u64 RA, RW0;
+ vadd.u64 RB, RW1;
+ vadd.u64 RC, RW2;
+ vadd.u64 RD, RW3;
+ vadd.u64 RE, RW4;
+ vadd.u64 RF, RW5;
+ vadd.u64 RG, RW6;
+ vadd.u64 RH, RW7;
+
+ /* Store the first half of context */
+ vst1.64 {RA-RD}, [%r0]!;
+
+ /* Clear used registers */
+ /* d16-d31 */
+ veor.u64 RW01q, RW01q;
+ veor.u64 RW23q, RW23q;
+ veor.u64 RW45q, RW45q;
+ veor.u64 RW67q, RW67q;
+ vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */
+ veor.u64 RW89q, RW89q;
+ veor.u64 RW1011q, RW1011q;
+ veor.u64 RW1213q, RW1213q;
+ veor.u64 RW1415q, RW1415q;
+ /* d8-d15 */
+ /*vpop {RT0-RT7};*/
+ /* d0-d7 (q0-q3) */
+ veor.u64 %q0, %q0;
+ veor.u64 %q1, %q1;
+ veor.u64 %q2, %q2;
+ veor.u64 %q3, %q3;
+
+ pop {%pc};
+ENDPROC(sha512_transform_neon)
diff --git a/arch/arm/crypto/sha512_neon_glue.c b/arch/arm/crypto/sha512_neon_glue.c
new file mode 100644
index 000000000000..0d2758ff5e12
--- /dev/null
+++ b/arch/arm/crypto/sha512_neon_glue.c
@@ -0,0 +1,305 @@
+/*
+ * Glue code for the SHA512 Secure Hash Algorithm assembly implementation
+ * using NEON instructions.
+ *
+ * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is based on sha512_ssse3_glue.c:
+ * Copyright (C) 2013 Intel Corporation
+ * Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/simd.h>
+#include <asm/neon.h>
+
+
+static const u64 sha512_k[] = {
+ 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL,
+ 0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL,
+ 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
+ 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL,
+ 0xd807aa98a3030242ULL, 0x12835b0145706fbeULL,
+ 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
+ 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL,
+ 0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL,
+ 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
+ 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL,
+ 0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL,
+ 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
+ 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL,
+ 0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL,
+ 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
+ 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL,
+ 0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL,
+ 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
+ 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL,
+ 0x81c2c92e47edaee6ULL, 0x92722c851482353bULL,
+ 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
+ 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL,
+ 0xd192e819d6ef5218ULL, 0xd69906245565a910ULL,
+ 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
+ 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL,
+ 0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL,
+ 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
+ 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL,
+ 0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL,
+ 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
+ 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL,
+ 0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL,
+ 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
+ 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL,
+ 0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL,
+ 0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
+ 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL,
+ 0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL,
+ 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
+ 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL
+};
+
+
+asmlinkage void sha512_transform_neon(u64 *digest, const void *data,
+ const u64 k[], unsigned int num_blks);
+
+
+static int sha512_neon_init(struct shash_desc *desc)
+{
+ struct sha512_state *sctx = shash_desc_ctx(desc);
+
+ sctx->state[0] = SHA512_H0;
+ sctx->state[1] = SHA512_H1;
+ sctx->state[2] = SHA512_H2;
+ sctx->state[3] = SHA512_H3;
+ sctx->state[4] = SHA512_H4;
+ sctx->state[5] = SHA512_H5;
+ sctx->state[6] = SHA512_H6;
+ sctx->state[7] = SHA512_H7;
+ sctx->count[0] = sctx->count[1] = 0;
+
+ return 0;
+}
+
+static int __sha512_neon_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len, unsigned int partial)
+{
+ struct sha512_state *sctx = shash_desc_ctx(desc);
+ unsigned int done = 0;
+
+ sctx->count[0] += len;
+ if (sctx->count[0] < len)
+ sctx->count[1]++;
+
+ if (partial) {
+ done = SHA512_BLOCK_SIZE - partial;
+ memcpy(sctx->buf + partial, data, done);
+ sha512_transform_neon(sctx->state, sctx->buf, sha512_k, 1);
+ }
+
+ if (len - done >= SHA512_BLOCK_SIZE) {
+ const unsigned int rounds = (len - done) / SHA512_BLOCK_SIZE;
+
+ sha512_transform_neon(sctx->state, data + done, sha512_k,
+ rounds);
+
+ done += rounds * SHA512_BLOCK_SIZE;
+ }
+
+ memcpy(sctx->buf, data + done, len - done);
+
+ return 0;
+}
+
+static int sha512_neon_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ struct sha512_state *sctx = shash_desc_ctx(desc);
+ unsigned int partial = sctx->count[0] % SHA512_BLOCK_SIZE;
+ int res;
+
+ /* Handle the fast case right here */
+ if (partial + len < SHA512_BLOCK_SIZE) {
+ sctx->count[0] += len;
+ if (sctx->count[0] < len)
+ sctx->count[1]++;
+ memcpy(sctx->buf + partial, data, len);
+
+ return 0;
+ }
+
+ if (!may_use_simd()) {
+ res = crypto_sha512_update(desc, data, len);
+ } else {
+ kernel_neon_begin();
+ res = __sha512_neon_update(desc, data, len, partial);
+ kernel_neon_end();
+ }
+
+ return res;
+}
+
+
+/* Add padding and return the message digest. */
+static int sha512_neon_final(struct shash_desc *desc, u8 *out)
+{
+ struct sha512_state *sctx = shash_desc_ctx(desc);
+ unsigned int i, index, padlen;
+ __be64 *dst = (__be64 *)out;
+ __be64 bits[2];
+ static const u8 padding[SHA512_BLOCK_SIZE] = { 0x80, };
+
+ /* save number of bits */
+ bits[1] = cpu_to_be64(sctx->count[0] << 3);
+ bits[0] = cpu_to_be64(sctx->count[1] << 3 | sctx->count[0] >> 61);
+
+ /* Pad out to 112 mod 128 and append length */
+ index = sctx->count[0] & 0x7f;
+ padlen = (index < 112) ? (112 - index) : ((128+112) - index);
+
+ if (!may_use_simd()) {
+ crypto_sha512_update(desc, padding, padlen);
+ crypto_sha512_update(desc, (const u8 *)&bits, sizeof(bits));
+ } else {
+ kernel_neon_begin();
+ /* We need to fill a whole block for __sha512_neon_update() */
+ if (padlen <= 112) {
+ sctx->count[0] += padlen;
+ if (sctx->count[0] < padlen)
+ sctx->count[1]++;
+ memcpy(sctx->buf + index, padding, padlen);
+ } else {
+ __sha512_neon_update(desc, padding, padlen, index);
+ }
+ __sha512_neon_update(desc, (const u8 *)&bits,
+ sizeof(bits), 112);
+ kernel_neon_end();
+ }
+
+ /* Store state in digest */
+ for (i = 0; i < 8; i++)
+ dst[i] = cpu_to_be64(sctx->state[i]);
+
+ /* Wipe context */
+ memset(sctx, 0, sizeof(*sctx));
+
+ return 0;
+}
+
+static int sha512_neon_export(struct shash_desc *desc, void *out)
+{
+ struct sha512_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(out, sctx, sizeof(*sctx));
+
+ return 0;
+}
+
+static int sha512_neon_import(struct shash_desc *desc, const void *in)
+{
+ struct sha512_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(sctx, in, sizeof(*sctx));
+
+ return 0;
+}
+
+static int sha384_neon_init(struct shash_desc *desc)
+{
+ struct sha512_state *sctx = shash_desc_ctx(desc);
+
+ sctx->state[0] = SHA384_H0;
+ sctx->state[1] = SHA384_H1;
+ sctx->state[2] = SHA384_H2;
+ sctx->state[3] = SHA384_H3;
+ sctx->state[4] = SHA384_H4;
+ sctx->state[5] = SHA384_H5;
+ sctx->state[6] = SHA384_H6;
+ sctx->state[7] = SHA384_H7;
+
+ sctx->count[0] = sctx->count[1] = 0;
+
+ return 0;
+}
+
+static int sha384_neon_final(struct shash_desc *desc, u8 *hash)
+{
+ u8 D[SHA512_DIGEST_SIZE];
+
+ sha512_neon_final(desc, D);
+
+ memcpy(hash, D, SHA384_DIGEST_SIZE);
+ memset(D, 0, SHA512_DIGEST_SIZE);
+
+ return 0;
+}
+
+static struct shash_alg algs[] = { {
+ .digestsize = SHA512_DIGEST_SIZE,
+ .init = sha512_neon_init,
+ .update = sha512_neon_update,
+ .final = sha512_neon_final,
+ .export = sha512_neon_export,
+ .import = sha512_neon_import,
+ .descsize = sizeof(struct sha512_state),
+ .statesize = sizeof(struct sha512_state),
+ .base = {
+ .cra_name = "sha512",
+ .cra_driver_name = "sha512-neon",
+ .cra_priority = 250,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA512_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+}, {
+ .digestsize = SHA384_DIGEST_SIZE,
+ .init = sha384_neon_init,
+ .update = sha512_neon_update,
+ .final = sha384_neon_final,
+ .export = sha512_neon_export,
+ .import = sha512_neon_import,
+ .descsize = sizeof(struct sha512_state),
+ .statesize = sizeof(struct sha512_state),
+ .base = {
+ .cra_name = "sha384",
+ .cra_driver_name = "sha384-neon",
+ .cra_priority = 250,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA384_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
+
+static int __init sha512_neon_mod_init(void)
+{
+ if (!cpu_has_neon())
+ return -ENODEV;
+
+ return crypto_register_shashes(algs, ARRAY_SIZE(algs));
+}
+
+static void __exit sha512_neon_mod_fini(void)
+{
+ crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+}
+
+module_init(sha512_neon_mod_init);
+module_exit(sha512_neon_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, NEON accelerated");
+
+MODULE_ALIAS("sha512");
+MODULE_ALIAS("sha384");
diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild
index d3db39860b9c..6577b8aeb711 100644
--- a/arch/arm/include/asm/Kbuild
+++ b/arch/arm/include/asm/Kbuild
@@ -24,6 +24,7 @@ generic-y += sembuf.h
generic-y += serial.h
generic-y += shmbuf.h
generic-y += siginfo.h
+generic-y += simd.h
generic-y += sizes.h
generic-y += socket.h
generic-y += sockios.h
diff --git a/arch/arm/include/asm/barrier.h b/arch/arm/include/asm/barrier.h
index 60f15e274e6d..2f59f7443396 100644
--- a/arch/arm/include/asm/barrier.h
+++ b/arch/arm/include/asm/barrier.h
@@ -59,6 +59,21 @@
#define smp_wmb() dmb(ishst)
#endif
+#define smp_store_release(p, v) \
+do { \
+ compiletime_assert_atomic_type(*p); \
+ smp_mb(); \
+ ACCESS_ONCE(*p) = (v); \
+} while (0)
+
+#define smp_load_acquire(p) \
+({ \
+ typeof(*p) ___p1 = ACCESS_ONCE(*p); \
+ compiletime_assert_atomic_type(*p); \
+ smp_mb(); \
+ ___p1; \
+})
+
#define read_barrier_depends() do { } while(0)
#define smp_read_barrier_depends() do { } while(0)
diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h
index 2059f019bef4..455e6637c881 100644
--- a/arch/arm/include/asm/cacheflush.h
+++ b/arch/arm/include/asm/cacheflush.h
@@ -16,6 +16,7 @@
#include <asm/shmparam.h>
#include <asm/cachetype.h>
#include <asm/outercache.h>
+#include <asm/rodata.h>
#define CACHE_COLOUR(vaddr) ((vaddr & (SHMLBA - 1)) >> PAGE_SHIFT)
diff --git a/arch/arm/include/asm/crypto/sha1.h b/arch/arm/include/asm/crypto/sha1.h
new file mode 100644
index 000000000000..75e6a417416b
--- /dev/null
+++ b/arch/arm/include/asm/crypto/sha1.h
@@ -0,0 +1,10 @@
+#ifndef ASM_ARM_CRYPTO_SHA1_H
+#define ASM_ARM_CRYPTO_SHA1_H
+
+#include <linux/crypto.h>
+#include <crypto/sha.h>
+
+extern int sha1_update_arm(struct shash_desc *desc, const u8 *data,
+ unsigned int len);
+
+#endif
diff --git a/arch/arm/include/asm/fiq_glue.h b/arch/arm/include/asm/fiq_glue.h
new file mode 100644
index 000000000000..a9e244f9f197
--- /dev/null
+++ b/arch/arm/include/asm/fiq_glue.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2010 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __ASM_FIQ_GLUE_H
+#define __ASM_FIQ_GLUE_H
+
+struct fiq_glue_handler {
+ void (*fiq)(struct fiq_glue_handler *h, void *regs, void *svc_sp);
+ void (*resume)(struct fiq_glue_handler *h);
+};
+typedef void (*fiq_return_handler_t)(void);
+
+int fiq_glue_register_handler(struct fiq_glue_handler *handler);
+int fiq_glue_set_return_handler(fiq_return_handler_t fiq_return);
+int fiq_glue_clear_return_handler(fiq_return_handler_t fiq_return);
+
+#ifdef CONFIG_FIQ_GLUE
+void fiq_glue_resume(void);
+#else
+static inline void fiq_glue_resume(void) {}
+#endif
+
+#endif
diff --git a/arch/arm/include/asm/hardirq.h b/arch/arm/include/asm/hardirq.h
index 3d7351c844aa..fe3ea776dc34 100644
--- a/arch/arm/include/asm/hardirq.h
+++ b/arch/arm/include/asm/hardirq.h
@@ -5,7 +5,7 @@
#include <linux/threads.h>
#include <asm/irq.h>
-#define NR_IPI 7
+#define NR_IPI 8
typedef struct {
unsigned int __softirq_pending;
diff --git a/arch/arm/include/asm/hardware/cache-l2x0.h b/arch/arm/include/asm/hardware/cache-l2x0.h
index 3b2c40b5bfa2..0ca0f5a7c84b 100644
--- a/arch/arm/include/asm/hardware/cache-l2x0.h
+++ b/arch/arm/include/asm/hardware/cache-l2x0.h
@@ -66,6 +66,7 @@
#define L2X0_STNDBY_MODE_EN (1 << 0)
/* Registers shifts and masks */
+#define L2X0_CACHE_ID_REV_MASK (0x3f)
#define L2X0_CACHE_ID_PART_MASK (0xf << 6)
#define L2X0_CACHE_ID_PART_L210 (1 << 6)
#define L2X0_CACHE_ID_PART_L310 (3 << 6)
@@ -106,6 +107,8 @@
#define L2X0_WAY_SIZE_SHIFT 3
+#define REV_PL310_R2P0 4
+
#ifndef __ASSEMBLY__
extern void __init l2x0_init(void __iomem *base, u32 aux_val, u32 aux_mask);
#if defined(CONFIG_CACHE_L2X0) && defined(CONFIG_OF)
diff --git a/arch/arm/include/asm/irq.h b/arch/arm/include/asm/irq.h
index 53c15dec7af6..809203a4b71b 100644
--- a/arch/arm/include/asm/irq.h
+++ b/arch/arm/include/asm/irq.h
@@ -35,6 +35,9 @@ extern void (*handle_arch_irq)(struct pt_regs *);
extern void set_handle_irq(void (*handle_irq)(struct pt_regs *));
#endif
+void arch_trigger_all_cpu_backtrace(void);
+#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
+
#endif
#endif
diff --git a/arch/arm/include/asm/mach/mmc.h b/arch/arm/include/asm/mach/mmc.h
new file mode 100644
index 000000000000..bca864ac945f
--- /dev/null
+++ b/arch/arm/include/asm/mach/mmc.h
@@ -0,0 +1,28 @@
+/*
+ * arch/arm/include/asm/mach/mmc.h
+ */
+#ifndef ASMARM_MACH_MMC_H
+#define ASMARM_MACH_MMC_H
+
+#include <linux/mmc/host.h>
+#include <linux/mmc/card.h>
+#include <linux/mmc/sdio_func.h>
+
+struct embedded_sdio_data {
+ struct sdio_cis cis;
+ struct sdio_cccr cccr;
+ struct sdio_embedded_func *funcs;
+ int num_funcs;
+};
+
+struct mmc_platform_data {
+ unsigned int ocr_mask; /* available voltages */
+ int built_in; /* built-in device flag */
+ int card_present; /* card detect state */
+ u32 (*translate_vdd)(struct device *, unsigned int);
+ unsigned int (*status)(struct device *);
+ struct embedded_sdio_data *embedded_sdio;
+ int (*register_status_notify)(void (*callback)(int card_present, void *dev_id), void *dev_id);
+};
+
+#endif
diff --git a/arch/arm/include/asm/neon.h b/arch/arm/include/asm/neon.h
new file mode 100644
index 000000000000..8f730fe70093
--- /dev/null
+++ b/arch/arm/include/asm/neon.h
@@ -0,0 +1,36 @@
+/*
+ * linux/arch/arm/include/asm/neon.h
+ *
+ * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/hwcap.h>
+
+#define cpu_has_neon() (!!(elf_hwcap & HWCAP_NEON))
+
+#ifdef __ARM_NEON__
+
+/*
+ * If you are affected by the BUILD_BUG below, it probably means that you are
+ * using NEON code /and/ calling the kernel_neon_begin() function from the same
+ * compilation unit. To prevent issues that may arise from GCC reordering or
+ * generating(1) NEON instructions outside of these begin/end functions, the
+ * only supported way of using NEON code in the kernel is by isolating it in a
+ * separate compilation unit, and calling it from another unit from inside a
+ * kernel_neon_begin/kernel_neon_end pair.
+ *
+ * (1) Current GCC (4.7) might generate NEON instructions at O3 level if
+ * -mpfu=neon is set.
+ */
+
+#define kernel_neon_begin() \
+ BUILD_BUG_ON_MSG(1, "kernel_neon_begin() called from NEON code")
+
+#else
+void kernel_neon_begin(void);
+#endif
+void kernel_neon_end(void);
diff --git a/arch/arm/include/asm/rodata.h b/arch/arm/include/asm/rodata.h
new file mode 100644
index 000000000000..8c8add87bbc5
--- /dev/null
+++ b/arch/arm/include/asm/rodata.h
@@ -0,0 +1,32 @@
+/*
+ * arch/arm/include/asm/rodata.h
+ *
+ * Copyright (C) 2011 Google, Inc.
+ *
+ * Author: Colin Cross <ccross@android.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef _ASMARM_RODATA_H
+#define _ASMARM_RODATA_H
+
+#ifndef __ASSEMBLY__
+
+#ifdef CONFIG_DEBUG_RODATA
+
+int set_memory_rw(unsigned long virt, int numpages);
+int set_memory_ro(unsigned long virt, int numpages);
+
+void mark_rodata_ro(void);
+void set_kernel_text_rw(void);
+void set_kernel_text_ro(void);
+#else
+static inline void set_kernel_text_rw(void) { }
+static inline void set_kernel_text_ro(void) { }
+#endif
+
+#endif
+
+#endif
diff --git a/arch/arm/include/asm/smp.h b/arch/arm/include/asm/smp.h
index 610ccf33f5e7..67a18a5ed9fa 100644
--- a/arch/arm/include/asm/smp.h
+++ b/arch/arm/include/asm/smp.h
@@ -82,6 +82,7 @@ extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
extern void arch_send_wakeup_ipi_mask(const struct cpumask *mask);
extern int register_ipi_completion(struct completion *completion, int cpu);
+extern void smp_send_all_cpu_backtrace(void);
struct smp_operations {
#ifdef CONFIG_SMP
diff --git a/arch/arm/include/asm/syscall.h b/arch/arm/include/asm/syscall.h
index 73ddd7239b33..ed805f1d3785 100644
--- a/arch/arm/include/asm/syscall.h
+++ b/arch/arm/include/asm/syscall.h
@@ -103,8 +103,7 @@ static inline void syscall_set_arguments(struct task_struct *task,
memcpy(&regs->ARM_r0 + i, args, n * sizeof(args[0]));
}
-static inline int syscall_get_arch(struct task_struct *task,
- struct pt_regs *regs)
+static inline int syscall_get_arch(void)
{
/* ARM tasks don't change audit architectures on the fly. */
return AUDIT_ARCH_ARM;
diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h
index cbd61977c996..43876245fc57 100644
--- a/arch/arm/include/asm/unistd.h
+++ b/arch/arm/include/asm/unistd.h
@@ -15,7 +15,7 @@
#include <uapi/asm/unistd.h>
-#define __NR_syscalls (380)
+#define __NR_syscalls (384)
#define __ARM_NR_cmpxchg (__ARM_NR_BASE+0x00fff0)
#define __ARCH_WANT_STAT64
diff --git a/arch/arm/include/uapi/asm/unistd.h b/arch/arm/include/uapi/asm/unistd.h
index af33b44990ed..17407c92c0da 100644
--- a/arch/arm/include/uapi/asm/unistd.h
+++ b/arch/arm/include/uapi/asm/unistd.h
@@ -406,6 +406,12 @@
#define __NR_process_vm_writev (__NR_SYSCALL_BASE+377)
#define __NR_kcmp (__NR_SYSCALL_BASE+378)
#define __NR_finit_module (__NR_SYSCALL_BASE+379)
+/* Reserve for later
+#define __NR_sched_setattr (__NR_SYSCALL_BASE+380)
+#define __NR_sched_getattr (__NR_SYSCALL_BASE+381)
+#define __NR_renameat2 (__NR_SYSCALL_BASE+382)
+*/
+#define __NR_seccomp (__NR_SYSCALL_BASE+383)
/*
* This may need to be greater than __NR_last_syscall+1 in order to
diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S
index c6ca7e376773..725f844926ea 100644
--- a/arch/arm/kernel/calls.S
+++ b/arch/arm/kernel/calls.S
@@ -389,6 +389,11 @@
CALL(sys_process_vm_writev)
CALL(sys_kcmp)
CALL(sys_finit_module)
+/* 380 */ CALL(sys_ni_syscall) /* reserved sys_sched_setattr */
+ CALL(sys_ni_syscall) /* reserved sys_sched_getattr */
+ CALL(sys_ni_syscall) /* reserved sys_renameat2 */
+ CALL(sys_seccomp)
+
#ifndef syscalls_counted
.equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls
#define syscalls_counted
diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c
index 34e56647dcee..6a740a93f4bb 100644
--- a/arch/arm/kernel/ftrace.c
+++ b/arch/arm/kernel/ftrace.c
@@ -13,6 +13,7 @@
*/
#include <linux/ftrace.h>
+#include <linux/module.h>
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
@@ -63,6 +64,20 @@ static unsigned long adjust_address(struct dyn_ftrace *rec, unsigned long addr)
}
#endif
+int ftrace_arch_code_modify_prepare(void)
+{
+ set_kernel_text_rw();
+ set_all_modules_text_rw();
+ return 0;
+}
+
+int ftrace_arch_code_modify_post_process(void)
+{
+ set_all_modules_text_ro();
+ set_kernel_text_ro();
+ return 0;
+}
+
static unsigned long ftrace_call_replace(unsigned long pc, unsigned long addr)
{
return arm_gen_branch_link(pc, addr);
diff --git a/arch/arm/kernel/kgdb.c b/arch/arm/kernel/kgdb.c
index 778c2f7024ff..b321c8fbb87d 100644
--- a/arch/arm/kernel/kgdb.c
+++ b/arch/arm/kernel/kgdb.c
@@ -144,6 +144,8 @@ int kgdb_arch_handle_exception(int exception_vector, int signo,
static int kgdb_brk_fn(struct pt_regs *regs, unsigned int instr)
{
+ if (user_mode(regs))
+ return -1;
kgdb_handle_exception(1, SIGTRAP, 0, regs);
return 0;
@@ -151,6 +153,8 @@ static int kgdb_brk_fn(struct pt_regs *regs, unsigned int instr)
static int kgdb_compiled_brk_fn(struct pt_regs *regs, unsigned int instr)
{
+ if (user_mode(regs))
+ return -1;
compiled_break = 1;
kgdb_handle_exception(1, SIGTRAP, 0, regs);
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index ac4c2e5e17e4..978002e5b406 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -32,6 +32,7 @@
#include <linux/hw_breakpoint.h>
#include <linux/cpuidle.h>
#include <linux/leds.h>
+#include <linux/console.h>
#include <asm/cacheflush.h>
#include <asm/idmap.h>
@@ -57,9 +58,46 @@ static const char *isa_modes[] = {
"ARM" , "Thumb" , "Jazelle", "ThumbEE"
};
+#ifdef CONFIG_SMP
+void arch_trigger_all_cpu_backtrace(void)
+{
+ smp_send_all_cpu_backtrace();
+}
+#else
+void arch_trigger_all_cpu_backtrace(void)
+{
+ dump_stack();
+}
+#endif
+
extern void call_with_stack(void (*fn)(void *), void *arg, void *sp);
typedef void (*phys_reset_t)(unsigned long);
+#ifdef CONFIG_ARM_FLUSH_CONSOLE_ON_RESTART
+void arm_machine_flush_console(void)
+{
+ printk("\n");
+ pr_emerg("Restarting %s\n", linux_banner);
+ if (console_trylock()) {
+ console_unlock();
+ return;
+ }
+
+ mdelay(50);
+
+ local_irq_disable();
+ if (!console_trylock())
+ pr_emerg("arm_restart: Console was locked! Busting\n");
+ else
+ pr_emerg("arm_restart: Console was locked!\n");
+ console_unlock();
+}
+#else
+void arm_machine_flush_console(void)
+{
+}
+#endif
+
/*
* A temporary stack to use for CPU reset. This is static so that we
* don't clobber it with the identity mapping. When running with this
@@ -147,6 +185,7 @@ void arch_cpu_idle_prepare(void)
void arch_cpu_idle_enter(void)
{
+ idle_notifier_call_chain(IDLE_START);
ledtrig_cpu(CPU_LED_IDLE_START);
#ifdef CONFIG_PL310_ERRATA_769419
wmb();
@@ -156,6 +195,7 @@ void arch_cpu_idle_enter(void)
void arch_cpu_idle_exit(void)
{
ledtrig_cpu(CPU_LED_IDLE_END);
+ idle_notifier_call_chain(IDLE_END);
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -195,6 +235,16 @@ __setup("reboot=", reboot_setup);
*/
void machine_shutdown(void)
{
+#ifdef CONFIG_SMP
+ /*
+ * Disable preemption so we're guaranteed to
+ * run to power off or reboot and prevent
+ * the possibility of switching to another
+ * thread that might wind up blocking on
+ * one of the stopped CPUs.
+ */
+ preempt_disable();
+#endif
disable_nonboot_cpus();
}
@@ -240,6 +290,10 @@ void machine_restart(char *cmd)
{
smp_send_stop();
+ /* Flush the console to make sure all the relevant messages make it
+ * out to the console drivers */
+ arm_machine_flush_console();
+
arm_pm_restart(reboot_mode, cmd);
/* Give a grace period for failure to restart of 1s */
@@ -251,6 +305,77 @@ void machine_restart(char *cmd)
while (1);
}
+/*
+ * dump a block of kernel memory from around the given address
+ */
+static void show_data(unsigned long addr, int nbytes, const char *name)
+{
+ int i, j;
+ int nlines;
+ u32 *p;
+
+ /*
+ * don't attempt to dump non-kernel addresses or
+ * values that are probably just small negative numbers
+ */
+ if (addr < PAGE_OFFSET || addr > -256UL)
+ return;
+
+ printk("\n%s: %#lx:\n", name, addr);
+
+ /*
+ * round address down to a 32 bit boundary
+ * and always dump a multiple of 32 bytes
+ */
+ p = (u32 *)(addr & ~(sizeof(u32) - 1));
+ nbytes += (addr & (sizeof(u32) - 1));
+ nlines = (nbytes + 31) / 32;
+
+
+ for (i = 0; i < nlines; i++) {
+ /*
+ * just display low 16 bits of address to keep
+ * each line of the dump < 80 characters
+ */
+ printk("%04lx ", (unsigned long)p & 0xffff);
+ for (j = 0; j < 8; j++) {
+ u32 data;
+ if (probe_kernel_address(p, data)) {
+ printk(" ********");
+ } else {
+ printk(" %08x", data);
+ }
+ ++p;
+ }
+ printk("\n");
+ }
+}
+
+static void show_extra_register_data(struct pt_regs *regs, int nbytes)
+{
+ mm_segment_t fs;
+
+ fs = get_fs();
+ set_fs(KERNEL_DS);
+ show_data(regs->ARM_pc - nbytes, nbytes * 2, "PC");
+ show_data(regs->ARM_lr - nbytes, nbytes * 2, "LR");
+ show_data(regs->ARM_sp - nbytes, nbytes * 2, "SP");
+ show_data(regs->ARM_ip - nbytes, nbytes * 2, "IP");
+ show_data(regs->ARM_fp - nbytes, nbytes * 2, "FP");
+ show_data(regs->ARM_r0 - nbytes, nbytes * 2, "R0");
+ show_data(regs->ARM_r1 - nbytes, nbytes * 2, "R1");
+ show_data(regs->ARM_r2 - nbytes, nbytes * 2, "R2");
+ show_data(regs->ARM_r3 - nbytes, nbytes * 2, "R3");
+ show_data(regs->ARM_r4 - nbytes, nbytes * 2, "R4");
+ show_data(regs->ARM_r5 - nbytes, nbytes * 2, "R5");
+ show_data(regs->ARM_r6 - nbytes, nbytes * 2, "R6");
+ show_data(regs->ARM_r7 - nbytes, nbytes * 2, "R7");
+ show_data(regs->ARM_r8 - nbytes, nbytes * 2, "R8");
+ show_data(regs->ARM_r9 - nbytes, nbytes * 2, "R9");
+ show_data(regs->ARM_r10 - nbytes, nbytes * 2, "R10");
+ set_fs(fs);
+}
+
void __show_regs(struct pt_regs *regs)
{
unsigned long flags;
@@ -307,6 +432,8 @@ void __show_regs(struct pt_regs *regs)
printk("Control: %08x%s\n", ctrl, buf);
}
#endif
+
+ show_extra_register_data(regs, 128);
}
void show_regs(struct pt_regs * regs)
diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c
index 03deeffd9f6d..394424b25254 100644
--- a/arch/arm/kernel/ptrace.c
+++ b/arch/arm/kernel/ptrace.c
@@ -916,7 +916,7 @@ enum ptrace_syscall_dir {
PTRACE_SYSCALL_EXIT,
};
-static int tracehook_report_syscall(struct pt_regs *regs,
+static void tracehook_report_syscall(struct pt_regs *regs,
enum ptrace_syscall_dir dir)
{
unsigned long ip;
@@ -934,7 +934,6 @@ static int tracehook_report_syscall(struct pt_regs *regs,
current_thread_info()->syscall = -1;
regs->ARM_ip = ip;
- return current_thread_info()->syscall;
}
asmlinkage int syscall_trace_enter(struct pt_regs *regs, int scno)
@@ -946,7 +945,9 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs, int scno)
return -1;
if (test_thread_flag(TIF_SYSCALL_TRACE))
- scno = tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER);
+ tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER);
+
+ scno = current_thread_info()->syscall;
if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
trace_sys_enter(regs, scno);
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index d6e3d1ca4ddf..f2724e475b96 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -70,6 +70,7 @@ enum ipi_msg_type {
IPI_CALL_FUNC_SINGLE,
IPI_CPU_STOP,
IPI_COMPLETION,
+ IPI_CPU_BACKTRACE,
};
static DECLARE_COMPLETION(cpu_running);
@@ -468,6 +469,7 @@ static const char *ipi_types[NR_IPI] = {
S(IPI_CALL_FUNC_SINGLE, "Single function call interrupts"),
S(IPI_CPU_STOP, "CPU stop interrupts"),
S(IPI_COMPLETION, "completion interrupts"),
+ S(IPI_CPU_BACKTRACE, "CPU backtrace"),
};
void show_ipi_list(struct seq_file *p, int prec)
@@ -606,6 +608,58 @@ static void ipi_complete(unsigned int cpu)
complete(per_cpu(cpu_completion, cpu));
}
+static cpumask_t backtrace_mask;
+static DEFINE_RAW_SPINLOCK(backtrace_lock);
+
+/* "in progress" flag of arch_trigger_all_cpu_backtrace */
+static unsigned long backtrace_flag;
+
+void smp_send_all_cpu_backtrace(void)
+{
+ unsigned int this_cpu = smp_processor_id();
+ int i;
+
+ if (test_and_set_bit(0, &backtrace_flag))
+ /*
+ * If there is already a trigger_all_cpu_backtrace() in progress
+ * (backtrace_flag == 1), don't output double cpu dump infos.
+ */
+ return;
+
+ cpumask_copy(&backtrace_mask, cpu_online_mask);
+ cpu_clear(this_cpu, backtrace_mask);
+
+ pr_info("Backtrace for cpu %d (current):\n", this_cpu);
+ dump_stack();
+
+ pr_info("\nsending IPI to all other CPUs:\n");
+ smp_cross_call(&backtrace_mask, IPI_CPU_BACKTRACE);
+
+ /* Wait for up to 10 seconds for all other CPUs to do the backtrace */
+ for (i = 0; i < 10 * 1000; i++) {
+ if (cpumask_empty(&backtrace_mask))
+ break;
+ mdelay(1);
+ }
+
+ clear_bit(0, &backtrace_flag);
+ smp_mb__after_clear_bit();
+}
+
+/*
+ * ipi_cpu_backtrace - handle IPI from smp_send_all_cpu_backtrace()
+ */
+static void ipi_cpu_backtrace(unsigned int cpu, struct pt_regs *regs)
+{
+ if (cpu_isset(cpu, backtrace_mask)) {
+ raw_spin_lock(&backtrace_lock);
+ pr_warning("IPI backtrace for cpu %d\n", cpu);
+ show_regs(regs);
+ raw_spin_unlock(&backtrace_lock);
+ cpu_clear(cpu, backtrace_mask);
+ }
+}
+
/*
* Main handler for inter-processor interrupts
*/
@@ -663,6 +717,10 @@ void handle_IPI(int ipinr, struct pt_regs *regs)
irq_exit();
break;
+ case IPI_CPU_BACKTRACE:
+ ipi_cpu_backtrace(cpu, regs);
+ break;
+
default:
printk(KERN_CRIT "CPU%u: Unknown IPI message 0x%x\n",
cpu, ipinr);
diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile
index 224a9cc09877..2d861a210ac5 100644
--- a/arch/arm/mm/Makefile
+++ b/arch/arm/mm/Makefile
@@ -7,6 +7,7 @@ obj-y := dma-mapping.o extable.o fault.o init.o \
obj-$(CONFIG_MMU) += fault-armv.o flush.o idmap.o ioremap.o \
mmap.o pgd.o mmu.o
+obj-$(CONFIG_DEBUG_RODATA) += rodata.o
ifneq ($(CONFIG_MMU),y)
obj-y += nommu.o
diff --git a/arch/arm/mm/cache-l2x0.c b/arch/arm/mm/cache-l2x0.c
index c465faca51b0..90a130f98acf 100644
--- a/arch/arm/mm/cache-l2x0.c
+++ b/arch/arm/mm/cache-l2x0.c
@@ -33,6 +33,9 @@ static void __iomem *l2x0_base;
static DEFINE_RAW_SPINLOCK(l2x0_lock);
static u32 l2x0_way_mask; /* Bitmask of active ways */
static u32 l2x0_size;
+static u32 l2x0_cache_id;
+static unsigned int l2x0_sets;
+static unsigned int l2x0_ways;
static unsigned long sync_reg_offset = L2X0_CACHE_SYNC;
/* Aurora don't have the cache ID register available, so we have to
@@ -49,6 +52,13 @@ struct l2x0_of_data {
static bool of_init = false;
+static inline bool is_pl310_rev(int rev)
+{
+ return (l2x0_cache_id &
+ (L2X0_CACHE_ID_PART_MASK | L2X0_CACHE_ID_REV_MASK)) ==
+ (L2X0_CACHE_ID_PART_L310 | rev);
+}
+
static inline void cache_wait_way(void __iomem *reg, unsigned long mask)
{
/* wait for cache operation by line or way to complete */
@@ -137,6 +147,23 @@ static void l2x0_cache_sync(void)
raw_spin_unlock_irqrestore(&l2x0_lock, flags);
}
+#ifdef CONFIG_PL310_ERRATA_727915
+static void l2x0_for_each_set_way(void __iomem *reg)
+{
+ int set;
+ int way;
+ unsigned long flags;
+
+ for (way = 0; way < l2x0_ways; way++) {
+ raw_spin_lock_irqsave(&l2x0_lock, flags);
+ for (set = 0; set < l2x0_sets; set++)
+ writel_relaxed((way << 28) | (set << 5), reg);
+ cache_sync();
+ raw_spin_unlock_irqrestore(&l2x0_lock, flags);
+ }
+}
+#endif
+
static void __l2x0_flush_all(void)
{
debug_writel(0x03);
@@ -150,6 +177,13 @@ static void l2x0_flush_all(void)
{
unsigned long flags;
+#ifdef CONFIG_PL310_ERRATA_727915
+ if (is_pl310_rev(REV_PL310_R2P0)) {
+ l2x0_for_each_set_way(l2x0_base + L2X0_CLEAN_INV_LINE_IDX);
+ return;
+ }
+#endif
+
/* clean all ways */
raw_spin_lock_irqsave(&l2x0_lock, flags);
__l2x0_flush_all();
@@ -160,11 +194,20 @@ static void l2x0_clean_all(void)
{
unsigned long flags;
+#ifdef CONFIG_PL310_ERRATA_727915
+ if (is_pl310_rev(REV_PL310_R2P0)) {
+ l2x0_for_each_set_way(l2x0_base + L2X0_CLEAN_LINE_IDX);
+ return;
+ }
+#endif
+
/* clean all ways */
raw_spin_lock_irqsave(&l2x0_lock, flags);
+ debug_writel(0x03);
writel_relaxed(l2x0_way_mask, l2x0_base + L2X0_CLEAN_WAY);
cache_wait_way(l2x0_base + L2X0_CLEAN_WAY, l2x0_way_mask);
cache_sync();
+ debug_writel(0x00);
raw_spin_unlock_irqrestore(&l2x0_lock, flags);
}
@@ -323,65 +366,64 @@ static void l2x0_unlock(u32 cache_id)
void __init l2x0_init(void __iomem *base, u32 aux_val, u32 aux_mask)
{
u32 aux;
- u32 cache_id;
u32 way_size = 0;
- int ways;
int way_size_shift = L2X0_WAY_SIZE_SHIFT;
const char *type;
l2x0_base = base;
if (cache_id_part_number_from_dt)
- cache_id = cache_id_part_number_from_dt;
+ l2x0_cache_id = cache_id_part_number_from_dt;
else
- cache_id = readl_relaxed(l2x0_base + L2X0_CACHE_ID);
+ l2x0_cache_id = readl_relaxed(l2x0_base + L2X0_CACHE_ID);
aux = readl_relaxed(l2x0_base + L2X0_AUX_CTRL);
aux &= aux_mask;
aux |= aux_val;
/* Determine the number of ways */
- switch (cache_id & L2X0_CACHE_ID_PART_MASK) {
+ switch (l2x0_cache_id & L2X0_CACHE_ID_PART_MASK) {
case L2X0_CACHE_ID_PART_L310:
if (aux & (1 << 16))
- ways = 16;
+ l2x0_ways = 16;
else
- ways = 8;
+ l2x0_ways = 8;
type = "L310";
#ifdef CONFIG_PL310_ERRATA_753970
/* Unmapped register. */
sync_reg_offset = L2X0_DUMMY_REG;
#endif
- if ((cache_id & L2X0_CACHE_ID_RTL_MASK) <= L2X0_CACHE_ID_RTL_R3P0)
+ if ((l2x0_cache_id & L2X0_CACHE_ID_RTL_MASK) <= L2X0_CACHE_ID_RTL_R3P0)
outer_cache.set_debug = pl310_set_debug;
break;
case L2X0_CACHE_ID_PART_L210:
- ways = (aux >> 13) & 0xf;
+ l2x0_ways = (aux >> 13) & 0xf;
type = "L210";
break;
case AURORA_CACHE_ID:
sync_reg_offset = AURORA_SYNC_REG;
- ways = (aux >> 13) & 0xf;
- ways = 2 << ((ways + 1) >> 2);
+ l2x0_ways = (aux >> 13) & 0xf;
+ l2x0_ways = 2 << ((l2x0_ways + 1) >> 2);
way_size_shift = AURORA_WAY_SIZE_SHIFT;
type = "Aurora";
break;
default:
/* Assume unknown chips have 8 ways */
- ways = 8;
+ l2x0_ways = 8;
type = "L2x0 series";
break;
}
- l2x0_way_mask = (1 << ways) - 1;
+ l2x0_way_mask = (1 << l2x0_ways) - 1;
/*
* L2 cache Size = Way size * Number of ways
*/
way_size = (aux & L2X0_AUX_CTRL_WAY_SIZE_MASK) >> 17;
- way_size = 1 << (way_size + way_size_shift);
+ way_size = SZ_1K << (way_size + way_size_shift);
- l2x0_size = ways * way_size * SZ_1K;
+ l2x0_size = l2x0_ways * way_size;
+ l2x0_sets = way_size / CACHE_LINE_SIZE;
/*
* Check if l2x0 controller is already enabled.
@@ -390,7 +432,7 @@ void __init l2x0_init(void __iomem *base, u32 aux_val, u32 aux_mask)
*/
if (!(readl_relaxed(l2x0_base + L2X0_CTRL) & L2X0_CTRL_EN)) {
/* Make sure that I&D is not locked down when starting */
- l2x0_unlock(cache_id);
+ l2x0_unlock(l2x0_cache_id);
/* l2x0 controller is disabled */
writel_relaxed(aux, l2x0_base + L2X0_AUX_CTRL);
@@ -419,7 +461,7 @@ void __init l2x0_init(void __iomem *base, u32 aux_val, u32 aux_mask)
printk(KERN_INFO "%s cache controller enabled\n", type);
printk(KERN_INFO "l2x0: %d ways, CACHE_ID 0x%08x, AUX_CTRL 0x%08x, Cache size: %d B\n",
- ways, cache_id, aux, l2x0_size);
+ l2x0_ways, l2x0_cache_id, aux, l2x0_size);
}
#ifdef CONFIG_OF
diff --git a/arch/arm/mm/cache-v6.S b/arch/arm/mm/cache-v6.S
index d8fd4d4bd3d4..7a3d3d8d98d7 100644
--- a/arch/arm/mm/cache-v6.S
+++ b/arch/arm/mm/cache-v6.S
@@ -270,6 +270,11 @@ v6_dma_clean_range:
* - end - virtual end address of region
*/
ENTRY(v6_dma_flush_range)
+#ifdef CONFIG_CACHE_FLUSH_RANGE_LIMIT
+ sub r2, r1, r0
+ cmp r2, #CONFIG_CACHE_FLUSH_RANGE_LIMIT
+ bhi v6_dma_flush_dcache_all
+#endif
#ifdef CONFIG_DMA_CACHE_RWFO
ldrb r2, [r0] @ read for ownership
strb r2, [r0] @ write for ownership
@@ -292,6 +297,18 @@ ENTRY(v6_dma_flush_range)
mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
mov pc, lr
+#ifdef CONFIG_CACHE_FLUSH_RANGE_LIMIT
+v6_dma_flush_dcache_all:
+ mov r0, #0
+#ifdef HARVARD_CACHE
+ mcr p15, 0, r0, c7, c14, 0 @ D cache clean+invalidate
+#else
+ mcr p15, 0, r0, c7, c15, 0 @ Cache clean+invalidate
+#endif
+ mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
+ mov pc, lr
+#endif
+
/*
* dma_map_area(start, size, dir)
* - start - kernel virtual start address
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index 54fcddafec15..9820ad4b80c0 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -274,10 +274,10 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
local_irq_enable();
/*
- * If we're in an interrupt or have no user
+ * If we're in an interrupt, or have no irqs, or have no user
* context, we must not take the fault..
*/
- if (in_atomic() || !mm)
+ if (in_atomic() || irqs_disabled() || !mm)
goto no_context;
if (user_mode(regs))
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index 4c7d5cddef35..a4e5cd5c3067 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -605,11 +605,25 @@ static void __init *early_alloc(unsigned long sz)
return early_alloc_aligned(sz, sz);
}
-static pte_t * __init early_pte_alloc(pmd_t *pmd, unsigned long addr, unsigned long prot)
+static pte_t * __init early_pte_alloc(pmd_t *pmd)
+{
+ if (pmd_none(*pmd) || pmd_bad(*pmd))
+ return early_alloc(PTE_HWTABLE_OFF + PTE_HWTABLE_SIZE);
+ return pmd_page_vaddr(*pmd);
+}
+
+static void __init early_pte_install(pmd_t *pmd, pte_t *pte, unsigned long prot)
+{
+ __pmd_populate(pmd, __pa(pte), prot);
+ BUG_ON(pmd_bad(*pmd));
+}
+
+static pte_t * __init early_pte_alloc_and_install(pmd_t *pmd,
+ unsigned long addr, unsigned long prot)
{
if (pmd_none(*pmd)) {
- pte_t *pte = early_alloc(PTE_HWTABLE_OFF + PTE_HWTABLE_SIZE);
- __pmd_populate(pmd, __pa(pte), prot);
+ pte_t *pte = early_pte_alloc(pmd);
+ early_pte_install(pmd, pte, prot);
}
BUG_ON(pmd_bad(*pmd));
return pte_offset_kernel(pmd, addr);
@@ -619,11 +633,17 @@ static void __init alloc_init_pte(pmd_t *pmd, unsigned long addr,
unsigned long end, unsigned long pfn,
const struct mem_type *type)
{
- pte_t *pte = early_pte_alloc(pmd, addr, type->prot_l1);
+ pte_t *start_pte = early_pte_alloc(pmd);
+ pte_t *pte = start_pte + pte_index(addr);
+
+ /* If replacing a section mapping, the whole section must be replaced */
+ BUG_ON(!pmd_none(*pmd) && pmd_bad(*pmd) && ((addr | end) & ~PMD_MASK));
+
do {
set_pte_ext(pte, pfn_pte(pfn, __pgprot(type->prot_pte)), 0);
pfn++;
} while (pte++, addr += PAGE_SIZE, addr != end);
+ early_pte_install(pmd, start_pte, type->prot_l1);
}
static void __init __map_init_section(pmd_t *pmd, unsigned long addr,
@@ -655,7 +675,8 @@ static void __init __map_init_section(pmd_t *pmd, unsigned long addr,
static void __init alloc_init_pmd(pud_t *pud, unsigned long addr,
unsigned long end, phys_addr_t phys,
- const struct mem_type *type)
+ const struct mem_type *type,
+ bool force_pages)
{
pmd_t *pmd = pmd_offset(pud, addr);
unsigned long next;
@@ -672,7 +693,8 @@ static void __init alloc_init_pmd(pud_t *pud, unsigned long addr,
* aligned to a section boundary.
*/
if (type->prot_sect &&
- ((addr | next | phys) & ~SECTION_MASK) == 0) {
+ ((addr | next | phys) & ~SECTION_MASK) == 0 &&
+ !force_pages) {
__map_init_section(pmd, addr, next, phys, type);
} else {
alloc_init_pte(pmd, addr, next,
@@ -686,14 +708,15 @@ static void __init alloc_init_pmd(pud_t *pud, unsigned long addr,
static void __init alloc_init_pud(pgd_t *pgd, unsigned long addr,
unsigned long end, phys_addr_t phys,
- const struct mem_type *type)
+ const struct mem_type *type,
+ bool force_pages)
{
pud_t *pud = pud_offset(pgd, addr);
unsigned long next;
do {
next = pud_addr_end(addr, end);
- alloc_init_pmd(pud, addr, next, phys, type);
+ alloc_init_pmd(pud, addr, next, phys, type, force_pages);
phys += next - addr;
} while (pud++, addr = next, addr != end);
}
@@ -767,7 +790,7 @@ static void __init create_36bit_mapping(struct map_desc *md,
* offsets, and we take full advantage of sections and
* supersections.
*/
-static void __init create_mapping(struct map_desc *md)
+static void __init create_mapping(struct map_desc *md, bool force_pages)
{
unsigned long addr, length, end;
phys_addr_t phys;
@@ -817,7 +840,7 @@ static void __init create_mapping(struct map_desc *md)
do {
unsigned long next = pgd_addr_end(addr, end);
- alloc_init_pud(pgd, addr, next, phys, type);
+ alloc_init_pud(pgd, addr, next, phys, type, force_pages);
phys += next - addr;
addr = next;
@@ -839,7 +862,7 @@ void __init iotable_init(struct map_desc *io_desc, int nr)
svm = early_alloc_aligned(sizeof(*svm) * nr, __alignof__(*svm));
for (md = io_desc; nr; md++, nr--) {
- create_mapping(md);
+ create_mapping(md, false);
vm = &svm->vm;
vm->addr = (void *)(md->virtual & PAGE_MASK);
@@ -960,7 +983,7 @@ void __init debug_ll_io_init(void)
map.virtual &= PAGE_MASK;
map.length = PAGE_SIZE;
map.type = MT_DEVICE;
- create_mapping(&map);
+ create_mapping(&map, false);
}
#endif
@@ -1005,6 +1028,28 @@ void __init sanity_check_meminfo(void)
struct membank *bank = &meminfo.bank[j];
*bank = meminfo.bank[i];
+#ifdef CONFIG_SPARSEMEM
+ if (pfn_to_section_nr(bank_pfn_start(bank)) !=
+ pfn_to_section_nr(bank_pfn_end(bank) - 1)) {
+ phys_addr_t sz;
+ unsigned long start_pfn = bank_pfn_start(bank);
+ unsigned long end_pfn = SECTION_ALIGN_UP(start_pfn + 1);
+ sz = ((phys_addr_t)(end_pfn - start_pfn) << PAGE_SHIFT);
+
+ if (meminfo.nr_banks >= NR_BANKS) {
+ pr_crit("NR_BANKS too low, ignoring %lld bytes of memory\n",
+ (unsigned long long)(bank->size - sz));
+ } else {
+ memmove(bank + 1, bank,
+ (meminfo.nr_banks - i) * sizeof(*bank));
+ meminfo.nr_banks++;
+ bank[1].size -= sz;
+ bank[1].start = __pfn_to_phys(end_pfn);
+ }
+ bank->size = sz;
+ }
+#endif
+
if (bank->start > ULONG_MAX)
highmem = 1;
@@ -1202,7 +1247,7 @@ static void __init devicemaps_init(struct machine_desc *mdesc)
map.virtual = MODULES_VADDR;
map.length = ((unsigned long)_etext - map.virtual + ~SECTION_MASK) & SECTION_MASK;
map.type = MT_ROM;
- create_mapping(&map);
+ create_mapping(&map, false);
#endif
/*
@@ -1213,14 +1258,14 @@ static void __init devicemaps_init(struct machine_desc *mdesc)
map.virtual = FLUSH_BASE;
map.length = SZ_1M;
map.type = MT_CACHECLEAN;
- create_mapping(&map);
+ create_mapping(&map, false);
#endif
#ifdef FLUSH_BASE_MINICACHE
map.pfn = __phys_to_pfn(FLUSH_BASE_PHYS + SZ_1M);
map.virtual = FLUSH_BASE_MINICACHE;
map.length = SZ_1M;
map.type = MT_MINICLEAN;
- create_mapping(&map);
+ create_mapping(&map, false);
#endif
/*
@@ -1236,13 +1281,13 @@ static void __init devicemaps_init(struct machine_desc *mdesc)
#else
map.type = MT_LOW_VECTORS;
#endif
- create_mapping(&map);
+ create_mapping(&map, false);
if (!vectors_high()) {
map.virtual = 0;
map.length = PAGE_SIZE * 2;
map.type = MT_LOW_VECTORS;
- create_mapping(&map);
+ create_mapping(&map, false);
}
/* Now create a kernel read-only mapping */
@@ -1250,7 +1295,7 @@ static void __init devicemaps_init(struct machine_desc *mdesc)
map.virtual = 0xffff0000 + PAGE_SIZE;
map.length = PAGE_SIZE;
map.type = MT_LOW_VECTORS;
- create_mapping(&map);
+ create_mapping(&map, false);
/*
* Ask the machine support to map in the statically mapped devices.
@@ -1275,20 +1320,23 @@ static void __init devicemaps_init(struct machine_desc *mdesc)
static void __init kmap_init(void)
{
#ifdef CONFIG_HIGHMEM
- pkmap_page_table = early_pte_alloc(pmd_off_k(PKMAP_BASE),
+ pkmap_page_table = early_pte_alloc_and_install(pmd_off_k(PKMAP_BASE),
PKMAP_BASE, _PAGE_KERNEL_TABLE);
#endif
}
+
static void __init map_lowmem(void)
{
struct memblock_region *reg;
+ phys_addr_t start;
+ phys_addr_t end;
+ struct map_desc map;
/* Map all the lowmem memory banks. */
for_each_memblock(memory, reg) {
- phys_addr_t start = reg->base;
- phys_addr_t end = start + reg->size;
- struct map_desc map;
+ start = reg->base;
+ end = start + reg->size;
if (end > arm_lowmem_limit)
end = arm_lowmem_limit;
@@ -1300,8 +1348,20 @@ static void __init map_lowmem(void)
map.length = end - start;
map.type = MT_MEMORY;
- create_mapping(&map);
+ create_mapping(&map, false);
}
+
+#ifdef CONFIG_DEBUG_RODATA
+ start = __pa(_stext) & PMD_MASK;
+ end = ALIGN(__pa(__end_rodata), PMD_SIZE);
+
+ map.pfn = __phys_to_pfn(start);
+ map.virtual = __phys_to_virt(start);
+ map.length = end - start;
+ map.type = MT_MEMORY;
+
+ create_mapping(&map, true);
+#endif
}
/*
diff --git a/arch/arm/mm/rodata.c b/arch/arm/mm/rodata.c
new file mode 100644
index 000000000000..9a8eb841c428
--- /dev/null
+++ b/arch/arm/mm/rodata.c
@@ -0,0 +1,159 @@
+/*
+ * linux/arch/arm/mm/rodata.c
+ *
+ * Copyright (C) 2011 Google, Inc.
+ *
+ * Author: Colin Cross <ccross@android.com>
+ *
+ * Based on x86 implementation in arch/x86/mm/init_32.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+
+#include <asm/cache.h>
+#include <asm/pgtable.h>
+#include <asm/rodata.h>
+#include <asm/sections.h>
+#include <asm/tlbflush.h>
+
+#include "mm.h"
+
+static int kernel_set_to_readonly __read_mostly;
+
+#ifdef CONFIG_DEBUG_RODATA_TEST
+static const int rodata_test_data = 0xC3;
+
+static noinline void rodata_test(void)
+{
+ int result;
+
+ pr_info("%s: attempting to write to read-only section:\n", __func__);
+
+ if (*(volatile int *)&rodata_test_data != 0xC3) {
+ pr_err("read only data changed before test\n");
+ return;
+ }
+
+ /*
+ * Attempt to to write to rodata_test_data, trapping the expected
+ * data abort. If the trap executed, result will be 1. If it didn't,
+ * result will be 0xFF.
+ */
+ asm volatile(
+ "0: str %[zero], [%[rodata_test_data]]\n"
+ " mov %[result], #0xFF\n"
+ " b 2f\n"
+ "1: mov %[result], #1\n"
+ "2:\n"
+
+ /* Exception fixup - if store at label 0 faults, jumps to 1 */
+ ".pushsection __ex_table, \"a\"\n"
+ " .long 0b, 1b\n"
+ ".popsection\n"
+
+ : [result] "=r" (result)
+ : [rodata_test_data] "r" (&rodata_test_data), [zero] "r" (0)
+ : "memory"
+ );
+
+ if (result == 1)
+ pr_info("write to read-only section trapped, success\n");
+ else
+ pr_err("write to read-only section NOT trapped, test failed\n");
+
+ if (*(volatile int *)&rodata_test_data != 0xC3)
+ pr_err("read only data changed during write\n");
+}
+#else
+static inline void rodata_test(void) { }
+#endif
+
+static int set_page_attributes(unsigned long virt, int numpages,
+ pte_t (*f)(pte_t))
+{
+ pmd_t *pmd;
+ pte_t *pte;
+ unsigned long start = virt;
+ unsigned long end = virt + (numpages << PAGE_SHIFT);
+ unsigned long pmd_end;
+
+ while (virt < end) {
+ pmd = pmd_off_k(virt);
+ pmd_end = min(ALIGN(virt + 1, PMD_SIZE), end);
+
+ if ((pmd_val(*pmd) & PMD_TYPE_MASK) != PMD_TYPE_TABLE) {
+ pr_err("%s: pmd %p=%08lx for %08lx not page table\n",
+ __func__, pmd, pmd_val(*pmd), virt);
+ virt = pmd_end;
+ continue;
+ }
+
+ while (virt < pmd_end) {
+ pte = pte_offset_kernel(pmd, virt);
+ set_pte_ext(pte, f(*pte), 0);
+ virt += PAGE_SIZE;
+ }
+ }
+
+ flush_tlb_kernel_range(start, end);
+
+ return 0;
+}
+
+int set_memory_ro(unsigned long virt, int numpages)
+{
+ return set_page_attributes(virt, numpages, pte_wrprotect);
+}
+EXPORT_SYMBOL(set_memory_ro);
+
+int set_memory_rw(unsigned long virt, int numpages)
+{
+ return set_page_attributes(virt, numpages, pte_mkwrite);
+}
+EXPORT_SYMBOL(set_memory_rw);
+
+void set_kernel_text_rw(void)
+{
+ unsigned long start = PAGE_ALIGN((unsigned long)_text);
+ unsigned long size = PAGE_ALIGN((unsigned long)__end_rodata) - start;
+
+ if (!kernel_set_to_readonly)
+ return;
+
+ pr_debug("Set kernel text: %lx - %lx to read-write\n",
+ start, start + size);
+
+ set_memory_rw(start, size >> PAGE_SHIFT);
+}
+
+void set_kernel_text_ro(void)
+{
+ unsigned long start = PAGE_ALIGN((unsigned long)_text);
+ unsigned long size = PAGE_ALIGN((unsigned long)__end_rodata) - start;
+
+ if (!kernel_set_to_readonly)
+ return;
+
+ pr_info_once("Write protecting the kernel text section %lx - %lx\n",
+ start, start + size);
+
+ pr_debug("Set kernel text: %lx - %lx to read only\n",
+ start, start + size);
+
+ set_memory_ro(start, size >> PAGE_SHIFT);
+}
+
+void mark_rodata_ro(void)
+{
+ kernel_set_to_readonly = 1;
+
+ set_kernel_text_ro();
+
+ rodata_test();
+}
diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c
index 5dfbb0b8e7f4..452fb3ad68aa 100644
--- a/arch/arm/vfp/vfpmodule.c
+++ b/arch/arm/vfp/vfpmodule.c
@@ -20,6 +20,7 @@
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/user.h>
+#include <linux/export.h>
#include <asm/cp15.h>
#include <asm/cputype.h>
@@ -648,6 +649,52 @@ static int vfp_hotplug(struct notifier_block *b, unsigned long action,
return NOTIFY_OK;
}
+#ifdef CONFIG_KERNEL_MODE_NEON
+
+/*
+ * Kernel-side NEON support functions
+ */
+void kernel_neon_begin(void)
+{
+ struct thread_info *thread = current_thread_info();
+ unsigned int cpu;
+ u32 fpexc;
+
+ /*
+ * Kernel mode NEON is only allowed outside of interrupt context
+ * with preemption disabled. This will make sure that the kernel
+ * mode NEON register contents never need to be preserved.
+ */
+ BUG_ON(in_interrupt());
+ cpu = get_cpu();
+
+ fpexc = fmrx(FPEXC) | FPEXC_EN;
+ fmxr(FPEXC, fpexc);
+
+ /*
+ * Save the userland NEON/VFP state. Under UP,
+ * the owner could be a task other than 'current'
+ */
+ if (vfp_state_in_hw(cpu, thread))
+ vfp_save_state(&thread->vfpstate, fpexc);
+#ifndef CONFIG_SMP
+ else if (vfp_current_hw_state[cpu] != NULL)
+ vfp_save_state(vfp_current_hw_state[cpu], fpexc);
+#endif
+ vfp_current_hw_state[cpu] = NULL;
+}
+EXPORT_SYMBOL(kernel_neon_begin);
+
+void kernel_neon_end(void)
+{
+ /* Disable the NEON/VFP unit. */
+ fmxr(FPEXC, fmrx(FPEXC) & ~FPEXC_EN);
+ put_cpu();
+}
+EXPORT_SYMBOL(kernel_neon_end);
+
+#endif /* CONFIG_KERNEL_MODE_NEON */
+
/*
* VFP support code initialisation.
*/
@@ -731,4 +778,4 @@ static int __init vfp_init(void)
return 0;
}
-late_initcall(vfp_init);
+core_initcall(vfp_init);
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 9742106cb51c..58ed097828b6 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -15,6 +15,7 @@ config ARM64
select ARM_GIC
select ARM_GIC_V3
select BUILDTIME_EXTABLE_SORT
+ select AUDIT_ARCH_COMPAT_GENERIC
select CLONE_BACKWARDS
select COMMON_CLK
select CPU_PM if (SUSPEND || CPU_IDLE)
@@ -34,6 +35,8 @@ config ARM64
select HARDIRQS_SW_RESEND
select HAVE_ARCH_JUMP_LABEL
select HAVE_ARCH_KGDB
+ select HAVE_ARCH_AUDITSYSCALL
+ select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_TRACEHOOK
select HAVE_C_RECORDMCOUNT
select HAVE_CC_STACKPROTECTOR
@@ -336,6 +339,31 @@ config HOTPLUG_CPU
Say Y here to experiment with turning CPUs off and on. CPUs
can be controlled through /sys/devices/system/cpu.
+config SWP_EMULATE
+ bool "Emulate SWP/SWPB instructions"
+ help
+ ARMv6 architecture deprecates use of the SWP/SWPB instructions. ARMv8
+ oblosetes the use of SWP/SWPB instructions. ARMv7 multiprocessing
+ extensions introduce the ability to disable these instructions,
+ triggering an undefined instruction exception when executed. Say Y
+ here to enable software emulation of these instructions for userspace
+ (not kernel) using LDREX/STREX. Also creates /proc/cpu/swp_emulation
+ for statistics.
+
+ In some older versions of glibc [<=2.8] SWP is used during futex
+ trylock() operations with the assumption that the code will not
+ be preempted. This invalid assumption may be more likely to fail
+ with SWP emulation enabled, leading to deadlock of the user
+ application.
+
+ NOTE: when accessing uncached shared regions, LDREX/STREX rely
+ on an external transaction monitoring block called a global
+ monitor to maintain update atomicity. If your system does not
+ implement a global monitor, this option can cause programs that
+ perform SWP operations to uncached memory to deadlock.
+
+ If unsure, say Y.
+
source kernel/Kconfig.preempt
config HZ
@@ -378,6 +406,27 @@ config ARCH_WANT_HUGE_PMD_SHARE
config HAVE_ARCH_TRANSPARENT_HUGEPAGE
def_bool y
+config ARMV7_COMPAT
+ bool "Kernel support for ARMv7 applications"
+ depends on COMPAT
+ select SWP_EMULATE
+ help
+ This option enables features that allow that ran on an ARMv7 or older
+ processor to continue functioning.
+
+ If you want to execute ARMv7 applications, say Y
+
+config ARMV7_COMPAT_CPUINFO
+ bool "Report backwards compatible cpu features in /proc/cpuinfo"
+ depends on ARMV7_COMPAT
+ default y
+ help
+ This option makes /proc/cpuinfo list CPU features that an ARMv7 or
+ earlier kernel would report, but are not optional on an ARMv8 or later
+ processor.
+
+ If you want to execute ARMv7 applications, say Y
+
source "mm/Kconfig"
config FORCE_MAX_ZONEORDER
@@ -385,6 +434,19 @@ config FORCE_MAX_ZONEORDER
default "14" if (ARM64_64K_PAGES && TRANSPARENT_HUGEPAGE)
default "11"
+config SECCOMP
+ bool "Enable seccomp to safely compute untrusted bytecode"
+ ---help---
+ This kernel feature is useful for number crunching applications
+ that may need to compute untrusted bytecode during their
+ execution. By using pipes or other transports made available to
+ the process as file descriptors supporting the read/write
+ syscalls, it's possible to isolate those applications in
+ their own address space using seccomp. Once seccomp is
+ enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
+ and the task is only allowed to execute a few safe syscalls
+ defined by each seccomp mode.
+
endmenu
menu "Boot options"
@@ -397,6 +459,23 @@ config CMDLINE
entering them here. As a minimum, you should specify the the
root device (e.g. root=/dev/nfs).
+choice
+ prompt "Kernel command line type" if CMDLINE != ""
+ default CMDLINE_FROM_BOOTLOADER
+
+config CMDLINE_FROM_BOOTLOADER
+ bool "Use bootloader kernel arguments if available"
+ help
+ Uses the command-line options passed by the boot loader. If
+ the boot loader doesn't provide any, the default kernel command
+ string provided in CMDLINE will be used.
+
+config CMDLINE_EXTEND
+ bool "Extend bootloader kernel arguments"
+ help
+ The command-line arguments provided by the boot loader will be
+ appended to the default kernel command string.
+
config CMDLINE_FORCE
bool "Always use the default kernel command string"
help
@@ -404,6 +483,22 @@ config CMDLINE_FORCE
loader passes other arguments to the kernel.
This is useful if you cannot or don't want to change the
command-line options your boot loader passes to the kernel.
+endchoice
+
+config BUILD_ARM64_APPENDED_DTB_IMAGE
+ bool "Build a concatenated Image.gz/dtb by default"
+ depends on OF
+ help
+ Enabling this option will cause a concatenated Image.gz and list of
+ DTBs to be built by default (instead of a standalone Image.gz.)
+ The image will built in arch/arm64/boot/Image.gz-dtb
+
+config BUILD_ARM64_APPENDED_DTB_IMAGE_NAMES
+ string "Default dtb names"
+ depends on BUILD_ARM64_APPENDED_DTB_IMAGE
+ help
+ Space separated list of names of dtbs to append when
+ building a concatenated Image.gz-dtb.
config EFI
bool "UEFI runtime support"
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index 3e7882c4e034..9e64836cb256 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -20,6 +20,7 @@ LIBGCC := $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name)
KBUILD_DEFCONFIG := defconfig
KBUILD_CFLAGS += -mgeneral-regs-only
+KBUILD_CFLAGS += -fno-pic
ifeq ($(CONFIG_CPU_BIG_ENDIAN), y)
KBUILD_CPPFLAGS += -mbig-endian
AS += -EB
@@ -53,7 +54,12 @@ libs-y := arch/arm64/lib/ $(libs-y)
libs-y += $(LIBGCC)
# Default target when executing plain make
+ifeq ($(CONFIG_BUILD_ARM64_APPENDED_DTB_IMAGE),y)
+KBUILD_IMAGE := Image.gz-dtb
+else
KBUILD_IMAGE := Image.gz
+endif
+
KBUILD_DTBS := dtbs
all: $(KBUILD_IMAGE) $(KBUILD_DTBS)
@@ -72,6 +78,9 @@ zinstall install: vmlinux
dtbs: scripts
$(Q)$(MAKE) $(build)=$(boot)/dts dtbs
+Image.gz-dtb: vmlinux scripts dtbs
+ $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@
+
PHONY += vdso_install
vdso_install:
$(Q)$(MAKE) $(build)=arch/arm64/kernel/vdso $@
diff --git a/arch/arm64/boot/.gitignore b/arch/arm64/boot/.gitignore
index 8dab0bb6ae66..eb3551131b1e 100644
--- a/arch/arm64/boot/.gitignore
+++ b/arch/arm64/boot/.gitignore
@@ -1,2 +1,3 @@
Image
Image.gz
+Image.gz-dtb
diff --git a/arch/arm64/boot/Makefile b/arch/arm64/boot/Makefile
index 5a0e3ab854a5..df519849fa00 100644
--- a/arch/arm64/boot/Makefile
+++ b/arch/arm64/boot/Makefile
@@ -14,14 +14,27 @@
# Based on the ia64 boot/Makefile.
#
+include $(srctree)/arch/arm64/boot/dts/Makefile
+
targets := Image Image.gz
+DTB_NAMES := $(subst $\",,$(CONFIG_BUILD_ARM64_APPENDED_DTB_IMAGE_NAMES))
+ifneq ($(DTB_NAMES),)
+DTB_LIST := $(addsuffix .dtb,$(DTB_NAMES))
+else
+DTB_LIST := $(dtb-y)
+endif
+DTB_OBJS := $(addprefix $(obj)/dts/,$(DTB_LIST))
+
$(obj)/Image: vmlinux FORCE
$(call if_changed,objcopy)
$(obj)/Image.gz: $(obj)/Image FORCE
$(call if_changed,gzip)
+$(obj)/Image.gz-dtb: $(obj)/Image.gz $(DTB_OBJS) FORCE
+ $(call if_changed,cat)
+
install: $(obj)/Image
$(CONFIG_SHELL) $(srctree)/$(src)/install.sh $(KERNELRELEASE) \
$(obj)/Image System.map "$(INSTALL_PATH)"
diff --git a/arch/arm64/boot/dts/Makefile b/arch/arm64/boot/dts/Makefile
index ef388176116d..661015fd7748 100644
--- a/arch/arm64/boot/dts/Makefile
+++ b/arch/arm64/boot/dts/Makefile
@@ -4,8 +4,15 @@ dtb-$(CONFIG_ARCH_VEXPRESS) += juno.dtb
dtb-$(CONFIG_ARCH_XGENE) += apm-mustang.dtb
targets += dtbs
-targets += $(dtb-y)
-dtbs: $(addprefix $(obj)/, $(dtb-y))
+DTB_NAMES := $(subst $\",,$(CONFIG_BUILD_ARM64_APPENDED_DTB_IMAGE_NAMES))
+ifneq ($(DTB_NAMES),)
+DTB_LIST := $(addsuffix .dtb,$(DTB_NAMES))
+else
+DTB_LIST := $(dtb-y)
+endif
+targets += $(DTB_LIST)
-clean-files := *.dtb
+dtbs: $(addprefix $(obj)/, $(DTB_LIST))
+
+clean-files := dts/*.dtb *.dtb
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index 2070a56ecc46..a3f935fde975 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -35,4 +35,4 @@ AFLAGS_aes-neon.o := -DINTERLEAVE=4
CFLAGS_aes-glue-ce.o := -DUSE_V8_CRYPTO_EXTENSIONS
$(obj)/aes-glue-%.o: $(src)/aes-glue.c FORCE
- $(call if_changed_dep,cc_o_c)
+ $(call if_changed_rule,cc_o_c)
diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c
index 60f2f4c12256..79cd911ef88c 100644
--- a/arch/arm64/crypto/aes-glue.c
+++ b/arch/arm64/crypto/aes-glue.c
@@ -106,7 +106,7 @@ static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
aes_ecb_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
(u8 *)ctx->key_enc, rounds, blocks, first);
- err = blkcipher_walk_done(desc, &walk, 0);
+ err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE);
}
kernel_neon_end();
return err;
@@ -128,7 +128,7 @@ static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
aes_ecb_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
(u8 *)ctx->key_dec, rounds, blocks, first);
- err = blkcipher_walk_done(desc, &walk, 0);
+ err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE);
}
kernel_neon_end();
return err;
@@ -151,7 +151,7 @@ static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
(u8 *)ctx->key_enc, rounds, blocks, walk.iv,
first);
- err = blkcipher_walk_done(desc, &walk, 0);
+ err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE);
}
kernel_neon_end();
return err;
@@ -174,7 +174,7 @@ static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
aes_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
(u8 *)ctx->key_dec, rounds, blocks, walk.iv,
first);
- err = blkcipher_walk_done(desc, &walk, 0);
+ err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE);
}
kernel_neon_end();
return err;
@@ -243,7 +243,7 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
(u8 *)ctx->key1.key_enc, rounds, blocks,
(u8 *)ctx->key2.key_enc, walk.iv, first);
- err = blkcipher_walk_done(desc, &walk, 0);
+ err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE);
}
kernel_neon_end();