aboutsummaryrefslogtreecommitdiff
path: root/arch/arm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/arm')
-rw-r--r--arch/arm/Kconfig31
-rw-r--r--arch/arm/Kconfig.debug21
-rw-r--r--arch/arm/Makefile2
-rw-r--r--arch/arm/boot/Makefile1
-rw-r--r--arch/arm/boot/compressed/head.S2
-rw-r--r--arch/arm/common/Kconfig4
-rw-r--r--arch/arm/common/Makefile1
-rw-r--r--arch/arm/common/fiq_glue.S118
-rw-r--r--arch/arm/common/fiq_glue_setup.c147
-rw-r--r--arch/arm/crypto/.gitignore1
-rw-r--r--arch/arm/crypto/Makefile24
-rw-r--r--arch/arm/crypto/aes_glue.c22
-rw-r--r--arch/arm/crypto/aes_glue.h19
-rw-r--r--arch/arm/crypto/aesbs-core.S_shipped2544
-rw-r--r--arch/arm/crypto/aesbs-glue.c434
-rw-r--r--arch/arm/crypto/bsaes-armv7.pl2467
-rw-r--r--arch/arm/crypto/sha1-armv7-neon.S634
-rw-r--r--arch/arm/crypto/sha1_glue.c58
-rw-r--r--arch/arm/crypto/sha1_neon_glue.c197
-rw-r--r--arch/arm/crypto/sha256-armv4.pl716
-rw-r--r--arch/arm/crypto/sha256-core.S_shipped2808
-rw-r--r--arch/arm/crypto/sha256_glue.c246
-rw-r--r--arch/arm/crypto/sha256_glue.h23
-rw-r--r--arch/arm/crypto/sha256_neon_glue.c172
-rw-r--r--arch/arm/crypto/sha512-armv7-neon.S455
-rw-r--r--arch/arm/crypto/sha512_neon_glue.c305
-rw-r--r--arch/arm/include/asm/Kbuild1
-rw-r--r--arch/arm/include/asm/barrier.h15
-rw-r--r--arch/arm/include/asm/cacheflush.h1
-rw-r--r--arch/arm/include/asm/crypto/sha1.h10
-rw-r--r--arch/arm/include/asm/fiq_glue.h33
-rw-r--r--arch/arm/include/asm/hardirq.h2
-rw-r--r--arch/arm/include/asm/hardware/cache-l2x0.h3
-rw-r--r--arch/arm/include/asm/irq.h3
-rw-r--r--arch/arm/include/asm/mach/mmc.h28
-rw-r--r--arch/arm/include/asm/neon.h36
-rw-r--r--arch/arm/include/asm/rodata.h32
-rw-r--r--arch/arm/include/asm/smp.h1
-rw-r--r--arch/arm/include/asm/syscall.h3
-rw-r--r--arch/arm/include/asm/unistd.h2
-rw-r--r--arch/arm/include/uapi/asm/unistd.h6
-rw-r--r--arch/arm/kernel/calls.S5
-rw-r--r--arch/arm/kernel/ftrace.c15
-rw-r--r--arch/arm/kernel/kgdb.c4
-rw-r--r--arch/arm/kernel/process.c127
-rw-r--r--arch/arm/kernel/ptrace.c7
-rw-r--r--arch/arm/kernel/smp.c58
-rw-r--r--arch/arm/mm/Makefile1
-rw-r--r--arch/arm/mm/cache-l2x0.c76
-rw-r--r--arch/arm/mm/cache-v6.S17
-rw-r--r--arch/arm/mm/fault.c4
-rw-r--r--arch/arm/mm/mmu.c106
-rw-r--r--arch/arm/mm/rodata.c159
-rw-r--r--arch/arm/vfp/vfpmodule.c49
54 files changed, 12157 insertions, 99 deletions
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 9422f298f32f..1a86004f3f27 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1958,6 +1958,15 @@ config XEN
help
Say Y if you want to run Linux in a Virtual Machine on Xen on ARM.
+config ARM_FLUSH_CONSOLE_ON_RESTART
+ bool "Force flush the console on restart"
+ help
+ If the console is locked while the system is rebooted, the messages
+ in the temporary logbuffer would not have propogated to all the
+ console drivers. This option forces the console lock to be
+ released if it failed to be acquired, which will cause all the
+ pending messages to be flushed.
+
endmenu
menu "Boot options"
@@ -1987,6 +1996,21 @@ config DEPRECATED_PARAM_STRUCT
This was deprecated in 2001 and announced to live on for 5 years.
Some old boot loaders still use this way.
+config BUILD_ARM_APPENDED_DTB_IMAGE
+ bool "Build a concatenated zImage/dtb by default"
+ depends on OF
+ help
+ Enabling this option will cause a concatenated zImage and DTB to
+ be built by default (instead of a standalone zImage.) The image
+ will built in arch/arm/boot/zImage-dtb.<dtb name>
+
+config BUILD_ARM_APPENDED_DTB_IMAGE_NAME
+ string "Default dtb name"
+ depends on BUILD_ARM_APPENDED_DTB_IMAGE
+ help
+ name of the dtb to append when building a concatenated
+ zImage/dtb.
+
# Compressed boot loader in ROM. Yes, we really want to ask about
# TEXT and BSS so we preserve their values in the config files.
config ZBOOT_ROM_TEXT
@@ -2336,6 +2360,13 @@ config NEON
Say Y to include support code for NEON, the ARMv7 Advanced SIMD
Extension.
+config KERNEL_MODE_NEON
+ bool "Support for NEON in kernel mode"
+ default n
+ depends on NEON
+ help
+ Say Y to include support for NEON in kernel mode.
+
endmenu
menu "Userspace binary formats"
diff --git a/arch/arm/Kconfig.debug b/arch/arm/Kconfig.debug
index 5fdb6dbc5f89..4d1793474c9c 100644
--- a/arch/arm/Kconfig.debug
+++ b/arch/arm/Kconfig.debug
@@ -63,6 +63,27 @@ config DEBUG_USER
8 - SIGSEGV faults
16 - SIGBUS faults
+config DEBUG_RODATA
+ bool "Write protect kernel text section"
+ default n
+ depends on DEBUG_KERNEL && MMU
+ ---help---
+ Mark the kernel text section as write-protected in the pagetables,
+ in order to catch accidental (and incorrect) writes to such const
+ data. This will cause the size of the kernel, plus up to 4MB, to
+ be mapped as pages instead of sections, which will increase TLB
+ pressure.
+ If in doubt, say "N".
+
+config DEBUG_RODATA_TEST
+ bool "Testcase for the DEBUG_RODATA feature"
+ depends on DEBUG_RODATA
+ default n
+ ---help---
+ This option enables a testcase for the DEBUG_RODATA
+ feature.
+ If in doubt, say "N"
+
# These options are only for real kernel hackers who want to get their hands dirty.
config DEBUG_LL
bool "Kernel low-level debugging functions (read help!)"
diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index 70bc19e2274f..9d36200374f0 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -265,6 +265,8 @@ libs-y := arch/arm/lib/ $(libs-y)
# Default target when executing plain make
ifeq ($(CONFIG_XIP_KERNEL),y)
KBUILD_IMAGE := xipImage
+else ifeq ($(CONFIG_BUILD_ARM_APPENDED_DTB_IMAGE),y)
+KBUILD_IMAGE := zImage-dtb.$(CONFIG_BUILD_ARM_APPENDED_DTB_IMAGE_NAME)
else
KBUILD_IMAGE := zImage
endif
diff --git a/arch/arm/boot/Makefile b/arch/arm/boot/Makefile
index 84aa2caf07ed..085bb96493a3 100644
--- a/arch/arm/boot/Makefile
+++ b/arch/arm/boot/Makefile
@@ -14,6 +14,7 @@
ifneq ($(MACHINE),)
include $(srctree)/$(MACHINE)/Makefile.boot
endif
+include $(srctree)/arch/arm/boot/dts/Makefile
# Note: the following conditions must always be true:
# ZRELADDR == virt_to_phys(PAGE_OFFSET + TEXT_OFFSET)
diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S
index f6e34be012ff..a8264aa9b03a 100644
--- a/arch/arm/boot/compressed/head.S
+++ b/arch/arm/boot/compressed/head.S
@@ -714,6 +714,8 @@ __armv7_mmu_cache_on:
bic r6, r6, #1 << 31 @ 32-bit translation system
bic r6, r6, #3 << 0 @ use only ttbr0
mcrne p15, 0, r3, c2, c0, 0 @ load page table pointer
+ mcrne p15, 0, r0, c8, c7, 0 @ flush I,D TLBs
+ mcr p15, 0, r0, c7, c5, 4 @ ISB
mcrne p15, 0, r1, c3, c0, 0 @ load domain access control
mcrne p15, 0, r6, c2, c0, 2 @ load ttb control
#endif
diff --git a/arch/arm/common/Kconfig b/arch/arm/common/Kconfig
index 9353184d730d..ce01364a96e3 100644
--- a/arch/arm/common/Kconfig
+++ b/arch/arm/common/Kconfig
@@ -17,3 +17,7 @@ config SHARP_PARAM
config SHARP_SCOOP
bool
+
+config FIQ_GLUE
+ bool
+ select FIQ
diff --git a/arch/arm/common/Makefile b/arch/arm/common/Makefile
index 462cd580fc2d..505c479202b6 100644
--- a/arch/arm/common/Makefile
+++ b/arch/arm/common/Makefile
@@ -4,6 +4,7 @@
obj-y += firmware.o
+obj-$(CONFIG_FIQ_GLUE) += fiq_glue.o fiq_glue_setup.o
obj-$(CONFIG_ICST) += icst.o
obj-$(CONFIG_SA1111) += sa1111.o
obj-$(CONFIG_PCI_HOST_VIA82C505) += via82c505.o
diff --git a/arch/arm/common/fiq_glue.S b/arch/arm/common/fiq_glue.S
new file mode 100644
index 000000000000..24b42cec4813
--- /dev/null
+++ b/arch/arm/common/fiq_glue.S
@@ -0,0 +1,118 @@
+/*
+ * Copyright (C) 2008 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .text
+
+ .global fiq_glue_end
+
+ /* fiq stack: r0-r15,cpsr,spsr of interrupted mode */
+
+ENTRY(fiq_glue)
+ /* store pc, cpsr from previous mode, reserve space for spsr */
+ mrs r12, spsr
+ sub lr, lr, #4
+ subs r10, #1
+ bne nested_fiq
+
+ str r12, [sp, #-8]!
+ str lr, [sp, #-4]!
+
+ /* store r8-r14 from previous mode */
+ sub sp, sp, #(7 * 4)
+ stmia sp, {r8-r14}^
+ nop
+
+ /* store r0-r7 from previous mode */
+ stmfd sp!, {r0-r7}
+
+ /* setup func(data,regs) arguments */
+ mov r0, r9
+ mov r1, sp
+ mov r3, r8
+
+ mov r7, sp
+
+ /* Get sp and lr from non-user modes */
+ and r4, r12, #MODE_MASK
+ cmp r4, #USR_MODE
+ beq fiq_from_usr_mode
+
+ mov r7, sp
+ orr r4, r4, #(PSR_I_BIT | PSR_F_BIT)
+ msr cpsr_c, r4
+ str sp, [r7, #(4 * 13)]
+ str lr, [r7, #(4 * 14)]
+ mrs r5, spsr
+ str r5, [r7, #(4 * 17)]
+
+ cmp r4, #(SVC_MODE | PSR_I_BIT | PSR_F_BIT)
+ /* use fiq stack if we reenter this mode */
+ subne sp, r7, #(4 * 3)
+
+fiq_from_usr_mode:
+ msr cpsr_c, #(SVC_MODE | PSR_I_BIT | PSR_F_BIT)
+ mov r2, sp
+ sub sp, r7, #12
+ stmfd sp!, {r2, ip, lr}
+ /* call func(data,regs) */
+ blx r3
+ ldmfd sp, {r2, ip, lr}
+ mov sp, r2
+
+ /* restore/discard saved state */
+ cmp r4, #USR_MODE
+ beq fiq_from_usr_mode_exit
+
+ msr cpsr_c, r4
+ ldr sp, [r7, #(4 * 13)]
+ ldr lr, [r7, #(4 * 14)]
+ msr spsr_cxsf, r5
+
+fiq_from_usr_mode_exit:
+ msr cpsr_c, #(FIQ_MODE | PSR_I_BIT | PSR_F_BIT)
+
+ ldmfd sp!, {r0-r7}
+ ldr lr, [sp, #(4 * 7)]
+ ldr r12, [sp, #(4 * 8)]
+ add sp, sp, #(10 * 4)
+exit_fiq:
+ msr spsr_cxsf, r12
+ add r10, #1
+ cmp r11, #0
+ moveqs pc, lr
+ bx r11 /* jump to custom fiq return function */
+
+nested_fiq:
+ orr r12, r12, #(PSR_F_BIT)
+ b exit_fiq
+
+fiq_glue_end:
+
+ENTRY(fiq_glue_setup) /* func, data, sp, smc call number */
+ stmfd sp!, {r4}
+ mrs r4, cpsr
+ msr cpsr_c, #(FIQ_MODE | PSR_I_BIT | PSR_F_BIT)
+ movs r8, r0
+ mov r9, r1
+ mov sp, r2
+ mov r11, r3
+ moveq r10, #0
+ movne r10, #1
+ msr cpsr_c, r4
+ ldmfd sp!, {r4}
+ bx lr
+
diff --git a/arch/arm/common/fiq_glue_setup.c b/arch/arm/common/fiq_glue_setup.c
new file mode 100644
index 000000000000..8cb1b611c6d5
--- /dev/null
+++ b/arch/arm/common/fiq_glue_setup.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (C) 2010 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <asm/fiq.h>
+#include <asm/fiq_glue.h>
+
+extern unsigned char fiq_glue, fiq_glue_end;
+extern void fiq_glue_setup(void *func, void *data, void *sp,
+ fiq_return_handler_t fiq_return_handler);
+
+static struct fiq_handler fiq_debbuger_fiq_handler = {
+ .name = "fiq_glue",
+};
+DEFINE_PER_CPU(void *, fiq_stack);
+static struct fiq_glue_handler *current_handler;
+static fiq_return_handler_t fiq_return_handler;
+static DEFINE_MUTEX(fiq_glue_lock);
+
+static void fiq_glue_setup_helper(void *info)
+{
+ struct fiq_glue_handler *handler = info;
+ fiq_glue_setup(handler->fiq, handler,
+ __get_cpu_var(fiq_stack) + THREAD_START_SP,
+ fiq_return_handler);
+}
+
+int fiq_glue_register_handler(struct fiq_glue_handler *handler)
+{
+ int ret;
+ int cpu;
+
+ if (!handler || !handler->fiq)
+ return -EINVAL;
+
+ mutex_lock(&fiq_glue_lock);
+ if (fiq_stack) {
+ ret = -EBUSY;
+ goto err_busy;
+ }
+
+ for_each_possible_cpu(cpu) {
+ void *stack;
+ stack = (void *)__get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER);
+ if (WARN_ON(!stack)) {
+ ret = -ENOMEM;
+ goto err_alloc_fiq_stack;
+ }
+ per_cpu(fiq_stack, cpu) = stack;
+ }
+
+ ret = claim_fiq(&fiq_debbuger_fiq_handler);
+ if (WARN_ON(ret))
+ goto err_claim_fiq;
+
+ current_handler = handler;
+ on_each_cpu(fiq_glue_setup_helper, handler, true);
+ set_fiq_handler(&fiq_glue, &fiq_glue_end - &fiq_glue);
+
+ mutex_unlock(&fiq_glue_lock);
+ return 0;
+
+err_claim_fiq:
+err_alloc_fiq_stack:
+ for_each_possible_cpu(cpu) {
+ __free_pages(per_cpu(fiq_stack, cpu), THREAD_SIZE_ORDER);
+ per_cpu(fiq_stack, cpu) = NULL;
+ }
+err_busy:
+ mutex_unlock(&fiq_glue_lock);
+ return ret;
+}
+
+static void fiq_glue_update_return_handler(void (*fiq_return)(void))
+{
+ fiq_return_handler = fiq_return;
+ if (current_handler)
+ on_each_cpu(fiq_glue_setup_helper, current_handler, true);
+}
+
+int fiq_glue_set_return_handler(void (*fiq_return)(void))
+{
+ int ret;
+
+ mutex_lock(&fiq_glue_lock);
+ if (fiq_return_handler) {
+ ret = -EBUSY;
+ goto err_busy;
+ }
+ fiq_glue_update_return_handler(fiq_return);
+ ret = 0;
+err_busy:
+ mutex_unlock(&fiq_glue_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL(fiq_glue_set_return_handler);
+
+int fiq_glue_clear_return_handler(void (*fiq_return)(void))
+{
+ int ret;
+
+ mutex_lock(&fiq_glue_lock);
+ if (WARN_ON(fiq_return_handler != fiq_return)) {
+ ret = -EINVAL;
+ goto err_inval;
+ }
+ fiq_glue_update_return_handler(NULL);
+ ret = 0;
+err_inval:
+ mutex_unlock(&fiq_glue_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL(fiq_glue_clear_return_handler);
+
+/**
+ * fiq_glue_resume - Restore fiqs after suspend or low power idle states
+ *
+ * This must be called before calling local_fiq_enable after returning from a
+ * power state where the fiq mode registers were lost. If a driver provided
+ * a resume hook when it registered the handler it will be called.
+ */
+
+void fiq_glue_resume(void)
+{
+ if (!current_handler)
+ return;
+ fiq_glue_setup(current_handler->fiq, current_handler,
+ __get_cpu_var(fiq_stack) + THREAD_START_SP,
+ fiq_return_handler);
+ if (current_handler->resume)
+ current_handler->resume(current_handler);
+}
+
diff --git a/arch/arm/crypto/.gitignore b/arch/arm/crypto/.gitignore
new file mode 100644
index 000000000000..6231d36b3635
--- /dev/null
+++ b/arch/arm/crypto/.gitignore
@@ -0,0 +1 @@
+aesbs-core.S
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index a2c83851bc90..2cee53b27238 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -3,7 +3,27 @@
#
obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
+obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
+obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
+obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
+obj-$(CONFIG_CRYPTO_SHA512_ARM_NEON) += sha512-arm-neon.o
-aes-arm-y := aes-armv4.o aes_glue.o
-sha1-arm-y := sha1-armv4-large.o sha1_glue.o
+aes-arm-y := aes-armv4.o aes_glue.o
+aes-arm-bs-y := aesbs-core.o aesbs-glue.o
+sha1-arm-y := sha1-armv4-large.o sha1_glue.o
+sha1-arm-neon-y := sha1-armv7-neon.o sha1_neon_glue.o
+sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
+sha256-arm-y := sha256-core.o sha256_glue.o $(sha256-arm-neon-y)
+sha512-arm-neon-y := sha512-armv7-neon.o sha512_neon_glue.o
+
+quiet_cmd_perl = PERL $@
+ cmd_perl = $(PERL) $(<) > $(@)
+
+$(src)/aesbs-core.S_shipped: $(src)/bsaes-armv7.pl
+ $(call cmd,perl)
+
+$(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
+ $(call cmd,perl)
+
+.PRECIOUS: $(obj)/aesbs-core.S $(obj)/sha256-core.S
diff --git a/arch/arm/crypto/aes_glue.c b/arch/arm/crypto/aes_glue.c
index e73ec2ab1316..0409b8f89782 100644
--- a/arch/arm/crypto/aes_glue.c
+++ b/arch/arm/crypto/aes_glue.c
@@ -6,22 +6,12 @@
#include <linux/crypto.h>
#include <crypto/aes.h>
-#define AES_MAXNR 14
+#include "aes_glue.h"
-typedef struct {
- unsigned int rd_key[4 *(AES_MAXNR + 1)];
- int rounds;
-} AES_KEY;
-
-struct AES_CTX {
- AES_KEY enc_key;
- AES_KEY dec_key;
-};
-
-asmlinkage void AES_encrypt(const u8 *in, u8 *out, AES_KEY *ctx);
-asmlinkage void AES_decrypt(const u8 *in, u8 *out, AES_KEY *ctx);
-asmlinkage int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key);
-asmlinkage int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key);
+EXPORT_SYMBOL(AES_encrypt);
+EXPORT_SYMBOL(AES_decrypt);
+EXPORT_SYMBOL(private_AES_set_encrypt_key);
+EXPORT_SYMBOL(private_AES_set_decrypt_key);
static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
@@ -81,7 +71,7 @@ static struct crypto_alg aes_alg = {
.cipher = {
.cia_min_keysize = AES_MIN_KEY_SIZE,
.cia_max_keysize = AES_MAX_KEY_SIZE,
- .cia_setkey = aes_set_key,
+ .cia_setkey = aes_set_key,
.cia_encrypt = aes_encrypt,
.cia_decrypt = aes_decrypt
}
diff --git a/arch/arm/crypto/aes_glue.h b/arch/arm/crypto/aes_glue.h
new file mode 100644
index 000000000000..cca3e51eb606
--- /dev/null
+++ b/arch/arm/crypto/aes_glue.h
@@ -0,0 +1,19 @@
+
+#define AES_MAXNR 14
+
+struct AES_KEY {
+ unsigned int rd_key[4 * (AES_MAXNR + 1)];
+ int rounds;
+};
+
+struct AES_CTX {
+ struct AES_KEY enc_key;
+ struct AES_KEY dec_key;
+};
+
+asmlinkage void AES_encrypt(const u8 *in, u8 *out, struct AES_KEY *ctx);
+asmlinkage void AES_decrypt(const u8 *in, u8 *out, struct AES_KEY *ctx);
+asmlinkage int private_AES_set_decrypt_key(const unsigned char *userKey,
+ const int bits, struct AES_KEY *key);
+asmlinkage int private_AES_set_encrypt_key(const unsigned char *userKey,
+ const int bits, struct AES_KEY *key);
diff --git a/arch/arm/crypto/aesbs-core.S_shipped b/arch/arm/crypto/aesbs-core.S_shipped
new file mode 100644
index 000000000000..71e5fc7cfb18
--- /dev/null
+++ b/arch/arm/crypto/aesbs-core.S_shipped
@@ -0,0 +1,2544 @@
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Specific modes and adaptation for Linux kernel by Ard Biesheuvel
+@ <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
+@ granted.
+@ ====================================================================
+
+@ Bit-sliced AES for ARM NEON
+@
+@ February 2012.
+@
+@ This implementation is direct adaptation of bsaes-x86_64 module for
+@ ARM NEON. Except that this module is endian-neutral [in sense that
+@ it can be compiled for either endianness] by courtesy of vld1.8's
+@ neutrality. Initial version doesn't implement interface to OpenSSL,
+@ only low-level primitives and unsupported entry points, just enough
+@ to collect performance results, which for Cortex-A8 core are:
+@
+@ encrypt 19.5 cycles per byte processed with 128-bit key
+@ decrypt 22.1 cycles per byte processed with 128-bit key
+@ key conv. 440 cycles per 128-bit key/0.18 of 8x block
+@
+@ Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
+@ which is [much] worse than anticipated (for further details see
+@ http://www.openssl.org/~appro/Snapdragon-S4.html).
+@
+@ Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
+@ manages in 20.0 cycles].
+@
+@ When comparing to x86_64 results keep in mind that NEON unit is
+@ [mostly] single-issue and thus can't [fully] benefit from
+@ instruction-level parallelism. And when comparing to aes-armv4
+@ results keep in mind key schedule conversion overhead (see
+@ bsaes-x86_64.pl for further details)...
+@
+@ <appro@openssl.org>
+
+@ April-August 2013
+@
+@ Add CBC, CTR and XTS subroutines, adapt for kernel use.
+@
+@ <ard.biesheuvel@linaro.org>
+
+#ifndef __KERNEL__
+# include "arm_arch.h"
+
+# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
+# define VFP_ABI_POP vldmia sp!,{d8-d15}
+# define VFP_ABI_FRAME 0x40
+#else
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+# define VFP_ABI_FRAME 0
+# define BSAES_ASM_EXTENDED_KEY
+# define XTS_CHAIN_TWEAK
+# define __ARM_ARCH__ 7
+#endif
+
+#ifdef __thumb__
+# define adrl adr
+#endif
+
+#if __ARM_ARCH__>=7
+.text
+.syntax unified @ ARMv7-capable assembler is expected to handle this
+#ifdef __thumb2__
+.thumb
+#else
+.code 32
+#endif
+
+.fpu neon
+
+.type _bsaes_decrypt8,%function
+.align 4
+_bsaes_decrypt8:
+ adr r6,_bsaes_decrypt8
+ vldmia r4!, {q9} @ round 0 key
+ add r6,r6,#.LM0ISR-_bsaes_decrypt8
+
+ vldmia r6!, {q8} @ .LM0ISR
+ veor q10, q0, q9 @ xor with round0 key
+ veor q11, q1, q9
+ vtbl.8 d0, {q10}, d16
+ vtbl.8 d1, {q10}, d17
+ veor q12, q2, q9
+ vtbl.8 d2, {q11}, d16
+ vtbl.8 d3, {q11}, d17
+ veor q13, q3, q9
+ vtbl.8 d4, {q12}, d16
+ vtbl.8 d5, {q12}, d17
+ veor q14, q4, q9
+ vtbl.8 d6, {q13}, d16
+ vtbl.8 d7, {q13}, d17
+ veor q15, q5, q9
+ vtbl.8 d8, {q14}, d16
+ vtbl.8 d9, {q14}, d17
+ veor q10, q6, q9
+ vtbl.8 d10, {q15}, d16
+ vtbl.8 d11, {q15}, d17
+ veor q11, q7, q9
+ vtbl.8 d12, {q10}, d16
+ vtbl.8 d13, {q10}, d17
+ vtbl.8 d14, {q11}, d16
+ vtbl.8 d15, {q11}, d17
+ vmov.i8 q8,#0x55 @ compose .LBS0
+ vmov.i8 q9,#0x33 @ compose .LBS1
+ vshr.u64 q10, q6, #1
+ vshr.u64 q11, q4, #1
+ veor q10, q10, q7
+ veor q11, q11, q5
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #1
+ veor q5, q5, q11
+ vshl.u64 q11, q11, #1
+ veor q6, q6, q10
+ veor q4, q4, q11
+ vshr.u64 q10, q2, #1
+ vshr.u64 q11, q0, #1
+ veor q10, q10, q3
+ veor q11, q11, q1
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q3, q3, q10
+ vshl.u64 q10, q10, #1
+ veor q1, q1, q11
+ vshl.u64 q11, q11, #1
+ veor q2, q2, q10
+ veor q0, q0, q11
+ vmov.i8 q8,#0x0f @ compose .LBS2
+ vshr.u64 q10, q5, #2
+ vshr.u64 q11, q4, #2
+ veor q10, q10, q7
+ veor q11, q11, q6
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #2
+ veor q6, q6, q11
+ vshl.u64 q11, q11, #2
+ veor q5, q5, q10
+ veor q4, q4, q11
+ vshr.u64 q10, q1, #2
+ vshr.u64 q11, q0, #2
+ veor q10, q10, q3
+ veor q11, q11, q2
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q3, q3, q10
+ vshl.u64 q10, q10, #2
+ veor q2, q2, q11
+ vshl.u64 q11, q11, #2
+ veor q1, q1, q10
+ veor q0, q0, q11
+ vshr.u64 q10, q3, #4
+ vshr.u64 q11, q2, #4
+ veor q10, q10, q7
+ veor q11, q11, q6
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #4
+ veor q6, q6, q11
+ vshl.u64 q11, q11, #4
+ veor q3, q3, q10
+ veor q2, q2, q11
+ vshr.u64 q10, q1, #4
+ vshr.u64 q11, q0, #4
+ veor q10, q10, q5
+ veor q11, q11, q4
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #4
+ veor q4, q4, q11
+ vshl.u64 q11, q11, #4
+ veor q1, q1, q10
+ veor q0, q0, q11
+ sub r5,r5,#1
+ b .Ldec_sbox
+.align 4
+.Ldec_loop:
+ vldmia r4!, {q8-q11}
+ veor q8, q8, q0
+ veor q9, q9, q1
+ vtbl.8 d0, {q8}, d24
+ vtbl.8 d1, {q8}, d25
+ vldmia r4!, {q8}
+ veor q10, q10, q2
+ vtbl.8 d2, {q9}, d24
+ vtbl.8 d3, {q9}, d25
+ vldmia r4!, {q9}
+ veor q11, q11, q3
+ vtbl.8 d4, {q10}, d24
+ vtbl.8 d5, {q10}, d25
+ vldmia r4!, {q10}
+ vtbl.8 d6, {q11}, d24
+ vtbl.8 d7, {q11}, d25
+ vldmia r4!, {q11}
+ veor q8, q8, q4
+ veor q9, q9, q5
+ vtbl.8 d8, {q8}, d24
+ vtbl.8 d9, {q8}, d25
+ veor q10, q10, q6
+ vtbl.8 d10, {q9}, d24
+ vtbl.8 d11, {q9}, d25
+ veor q11, q11, q7
+ vtbl.8 d12, {q10}, d24
+ vtbl.8 d13, {q10}, d25
+ vtbl.8 d14, {q11}, d24
+ vtbl.8 d15, {q11}, d25
+.Ldec_sbox:
+ veor q1, q1, q4
+ veor q3, q3, q4
+
+ veor q4, q4, q7
+ veor q1, q1, q6
+ veor q2, q2, q7
+ veor q6, q6, q4
+
+ veor q0, q0, q1
+ veor q2, q2, q5
+ veor q7, q7, q6
+ veor q3, q3, q0
+ veor q5, q5, q0
+ veor q1, q1, q3
+ veor q11, q3, q0
+ veor q10, q7, q4
+ veor q9, q1, q6
+ veor q13, q4, q0
+ vmov q8, q10
+ veor q12, q5, q2
+
+ vorr q10, q10, q9
+ veor q15, q11, q8
+ vand q14, q11, q12
+ vorr q11, q11, q12
+ veor q12, q12, q9
+ vand q8, q8, q9
+ veor q9, q6, q2
+ vand q15, q15, q12
+ vand q13, q13, q9
+ veor q9, q3, q7
+ veor q12, q1, q5
+ veor q11, q11, q13
+ veor q10, q10, q13
+ vand q13, q9, q12
+ vorr q9, q9, q12
+ veor q11, q11, q15
+ veor q8, q8, q13
+ veor q10, q10, q14
+ veor q9, q9, q15
+ veor q8, q8, q14
+ vand q12, q4, q6
+ veor q9, q9, q14
+ vand q13, q0, q2
+ vand q14, q7, q1
+ vorr q15, q3, q5
+ veor q11, q11, q12
+ veor q9, q9, q14
+ veor q8, q8, q15
+ veor q10, q10, q13
+
+ @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3
+
+ @ new smaller inversion
+
+ vand q14, q11, q9
+ vmov q12, q8
+
+ veor q13, q10, q14
+ veor q15, q8, q14
+ veor q14, q8, q14 @ q14=q15
+
+ vbsl q13, q9, q8
+ vbsl q15, q11, q10
+ veor q11, q11, q10
+
+ vbsl q12, q13, q14
+ vbsl q8, q14, q13
+
+ vand q14, q12, q15
+ veor q9, q9, q8
+
+ veor q14, q14, q11
+ veor q12, q5, q2
+ veor q8, q1, q6
+ veor q10, q15, q14
+ vand q10, q10, q5
+ veor q5, q5, q1
+ vand q11, q1, q15
+ vand q5, q5, q14
+ veor q1, q11, q10
+ veor q5, q5, q11
+ veor q15, q15, q13
+ veor q14, q14, q9
+ veor q11, q15, q14
+ veor q10, q13, q9
+ vand q11, q11, q12
+ vand q10, q10, q2
+ veor q12, q12, q8
+ veor q2, q2, q6
+ vand q8, q8, q15
+ vand q6, q6, q13
+ vand q12, q12, q14
+ vand q2, q2, q9
+ veor q8, q8, q12
+ veor q2, q2, q6
+ veor q12, q12, q11
+ veor q6, q6, q10
+ veor q5, q5, q12
+ veor q2, q2, q12
+ veor q1, q1, q8
+ veor q6, q6, q8
+
+ veor q12, q3, q0
+ veor q8, q7, q4
+ veor q11, q15, q14
+ veor q10, q13, q9
+ vand q11, q11, q12
+ vand q10, q10, q0
+ veor q12, q12, q8
+ veor q0, q0, q4
+ vand q8, q8, q15
+ vand q4, q4, q13
+ vand q12, q12, q14
+ vand q0, q0, q9
+ veor q8, q8, q12
+ veor q0, q0, q4
+ veor q12, q12, q11
+ veor q4, q4, q10
+ veor q15, q15, q13
+ veor q14, q14, q9
+ veor q10, q15, q14
+ vand q10, q10, q3
+ veor q3, q3, q7
+ vand q11, q7, q15
+ vand q3, q3, q14
+ veor q7, q11, q10
+ veor q3, q3, q11
+ veor q3, q3, q12
+ veor q0, q0, q12
+ veor q7, q7, q8
+ veor q4, q4, q8
+ veor q1, q1, q7
+ veor q6, q6, q5
+
+ veor q4, q4, q1
+ veor q2, q2, q7
+ veor q5, q5, q7
+ veor q4, q4, q2
+ veor q7, q7, q0
+ veor q4, q4, q5
+ veor q3, q3, q6
+ veor q6, q6, q1
+ veor q3, q3, q4
+
+ veor q4, q4, q0
+ veor q7, q7, q3
+ subs r5,r5,#1
+ bcc .Ldec_done
+ @ multiplication by 0x05-0x00-0x04-0x00
+ vext.8 q8, q0, q0, #8
+ vext.8 q14, q3, q3, #8
+ vext.8 q15, q5, q5, #8
+ veor q8, q8, q0
+ vext.8 q9, q1, q1, #8
+ veor q14, q14, q3
+ vext.8 q10, q6, q6, #8
+ veor q15, q15, q5
+ vext.8 q11, q4, q4, #8
+ veor q9, q9, q1
+ vext.8 q12, q2, q2, #8
+ veor q10, q10, q6
+ vext.8 q13, q7, q7, #8
+ veor q11, q11, q4
+ veor q12, q12, q2
+ veor q13, q13, q7
+
+ veor q0, q0, q14
+ veor q1, q1, q14
+ veor q6, q6, q8
+ veor q2, q2, q10
+ veor q4, q4, q9
+ veor q1, q1, q15
+ veor q6, q6, q15
+ veor q2, q2, q14
+ veor q7, q7, q11
+ veor q4, q4, q14
+ veor q3, q3, q12
+ veor q2, q2, q15
+ veor q7, q7, q15
+ veor q5, q5, q13
+ vext.8 q8, q0, q0, #12 @ x0 <<< 32
+ vext.8 q9, q1, q1, #12
+ veor q0, q0, q8 @ x0 ^ (x0 <<< 32)
+ vext.8 q10, q6, q6, #12
+ veor q1, q1, q9
+ vext.8 q11, q4, q4, #12
+ veor q6, q6, q10
+ vext.8 q12, q2, q2, #12
+ veor q4, q4, q11
+ vext.8 q13, q7, q7, #12
+ veor q2, q2, q12
+ vext.8 q14, q3, q3, #12
+ veor q7, q7, q13
+ vext.8 q15, q5, q5, #12
+ veor q3, q3, q14
+
+ veor q9, q9, q0
+ veor q5, q5, q15
+ vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64)
+ veor q10, q10, q1
+ veor q8, q8, q5
+ veor q9, q9, q5
+ vext.8 q1, q1, q1, #8
+ veor q13, q13, q2
+ veor q0, q0, q8
+ veor q14, q14, q7
+ veor q1, q1, q9
+ vext.8 q8, q2, q2, #8
+ veor q12, q12, q4
+ vext.8 q9, q7, q7, #8
+ veor q15, q15, q3
+ vext.8 q2, q4, q4, #8
+ veor q11, q11, q6
+ vext.8 q7, q5, q5, #8
+ veor q12, q12, q5
+ vext.8 q4, q3, q3, #8
+ veor q11, q11, q5
+ vext.8 q3, q6, q6, #8
+ veor q5, q9, q13
+ veor q11, q11, q2
+ veor q7, q7, q15
+ veor q6, q4, q14
+ veor q4, q8, q12
+ veor q2, q3, q10
+ vmov q3, q11
+ @ vmov q5, q9
+ vldmia r6, {q12} @ .LISR
+ ite eq @ Thumb2 thing, sanity check in ARM
+ addeq r6,r6,#0x10
+ bne .Ldec_loop
+ vldmia r6, {q12} @ .LISRM0
+ b .Ldec_loop
+.align 4
+.Ldec_done:
+ vmov.i8 q8,#0x55 @ compose .LBS0
+ vmov.i8 q9,#0x33 @ compose .LBS1
+ vshr.u64 q10, q3, #1
+ vshr.u64 q11, q2, #1
+ veor q10, q10, q5
+ veor q11, q11, q7
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #1
+ veor q7, q7, q11
+ vshl.u64 q11, q11, #1
+ veor q3, q3, q10
+ veor q2, q2, q11
+ vshr.u64 q10, q6, #1
+ vshr.u64 q11, q0, #1
+ veor q10, q10, q4
+ veor q11, q11, q1
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q4, q4, q10
+ vshl.u64 q10, q10, #1
+ veor q1, q1, q11
+ vshl.u64 q11, q11, #1
+ veor q6, q6, q10
+ veor q0, q0, q11
+ vmov.i8 q8,#0x0f @ compose .LBS2
+ vshr.u64 q10, q7, #2
+ vshr.u64 q11, q2, #2
+ veor q10, q10, q5
+ veor q11, q11, q3
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #2
+ veor q3, q3, q11
+ vshl.u64 q11, q11, #2
+ veor q7, q7, q10
+ veor q2, q2, q11
+ vshr.u64 q10, q1, #2
+ vshr.u64 q11, q0, #2
+ veor q10, q10, q4
+ veor q11, q11, q6
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q4, q4, q10
+ vshl.u64 q10, q10, #2
+ veor q6, q6, q11
+ vshl.u64 q11, q11, #2
+ veor q1, q1, q10
+ veor q0, q0, q11
+ vshr.u64 q10, q4, #4
+ vshr.u64 q11, q6, #4
+ veor q10, q10, q5
+ veor q11, q11, q3
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #4
+ veor q3, q3, q11
+ vshl.u64 q11, q11, #4
+ veor q4, q4, q10
+ veor q6, q6, q11
+ vshr.u64 q10, q1, #4
+ vshr.u64 q11, q0, #4
+ veor q10, q10, q7
+ veor q11, q11, q2
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #4
+ veor q2, q2, q11
+ vshl.u64 q11, q11, #4
+ veor q1, q1, q10
+ veor q0, q0, q11
+ vldmia r4, {q8} @ last round key
+ veor q6, q6, q8
+ veor q4, q4, q8
+ veor q2, q2, q8
+ veor q7, q7, q8
+ veor q3, q3, q8
+ veor q5, q5, q8
+ veor q0, q0, q8
+ veor q1, q1, q8
+ bx lr
+.size _bsaes_decrypt8,.-_bsaes_decrypt8
+
+.type _bsaes_const,%object
+.align 6
+_bsaes_const:
+.LM0ISR: @ InvShiftRows constants
+ .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISR:
+ .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
+.LISRM0:
+ .quad 0x01040b0e0205080f, 0x0306090c00070a0d
+.LM0SR: @ ShiftRows constants
+ .quad 0x0a0e02060f03070b, 0x0004080c05090d01
+.LSR:
+ .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+ .quad 0x0304090e00050a0f, 0x01060b0c0207080d
+.LM0:
+ .quad 0x02060a0e03070b0f, 0x0004080c0105090d
+.LREVM0SR:
+ .quad 0x090d01050c000408, 0x03070b0f060a0e02
+.asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro@openssl.org>"
+.align 6
+.size _bsaes_const,.-_bsaes_const
+
+.type _bsaes_encrypt8,%function
+.align 4
+_bsaes_encrypt8:
+ adr r6,_bsaes_encrypt8
+ vldmia r4!, {q9} @ round 0 key
+ sub r6,r6,#_bsaes_encrypt8-.LM0SR
+
+ vldmia r6!, {q8} @ .LM0SR
+_bsaes_encrypt8_alt:
+ veor q10, q0, q9 @ xor with round0 key
+ veor q11, q1, q9
+ vtbl.8 d0, {q10}, d16
+ vtbl.8 d1, {q10}, d17
+ veor q12, q2, q9
+ vtbl.8 d2, {q11}, d16
+ vtbl.8 d3, {q11}, d17
+ veor q13, q3, q9
+ vtbl.8 d4, {q12}, d16
+ vtbl.8 d5, {q12}, d17
+ veor q14, q4, q9
+ vtbl.8 d6, {q13}, d16
+ vtbl.8 d7, {q13}, d17
+ veor q15, q5, q9
+ vtbl.8 d8, {q14}, d16
+ vtbl.8 d9, {q14}, d17
+ veor q10, q6, q9
+ vtbl.8 d10, {q15}, d16
+ vtbl.8 d11, {q15}, d17
+ veor q11, q7, q9
+ vtbl.8 d12, {q10}, d16
+ vtbl.8 d13, {q10}, d17
+ vtbl.8 d14, {q11}, d16
+ vtbl.8 d15, {q11}, d17
+_bsaes_encrypt8_bitslice:
+ vmov.i8 q8,#0x55 @ compose .LBS0
+ vmov.i8 q9,#0x33 @ compose .LBS1
+ vshr.u64 q10, q6, #1
+ vshr.u64 q11, q4, #1
+ veor q10, q10, q7
+ veor q11, q11, q5
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #1
+ veor q5, q5, q11
+ vshl.u64 q11, q11, #1
+ veor q6, q6, q10
+ veor q4, q4, q11
+ vshr.u64 q10, q2, #1
+ vshr.u64 q11, q0, #1
+ veor q10, q10, q3
+ veor q11, q11, q1
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q3, q3, q10
+ vshl.u64 q10, q10, #1
+ veor q1, q1, q11
+ vshl.u64 q11, q11, #1
+ veor q2, q2, q10
+ veor q0, q0, q11
+ vmov.i8 q8,#0x0f @ compose .LBS2
+ vshr.u64 q10, q5, #2
+ vshr.u64 q11, q4, #2
+ veor q10, q10, q7
+ veor q11, q11, q6
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #2
+ veor q6, q6, q11
+ vshl.u64 q11, q11, #2
+ veor q5, q5, q10
+ veor q4, q4, q11
+ vshr.u64 q10, q1, #2
+ vshr.u64 q11, q0, #2
+ veor q10, q10, q3
+ veor q11, q11, q2
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q3, q3, q10
+ vshl.u64 q10, q10, #2
+ veor q2, q2, q11
+ vshl.u64 q11, q11, #2
+ veor q1, q1, q10
+ veor q0, q0, q11
+ vshr.u64 q10, q3, #4
+ vshr.u64 q11, q2, #4
+ veor q10, q10, q7
+ veor q11, q11, q6
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #4
+ veor q6, q6, q11
+ vshl.u64 q11, q11, #4
+ veor q3, q3, q10
+ veor q2, q2, q11
+ vshr.u64 q10, q1, #4
+ vshr.u64 q11, q0, #4
+ veor q10, q10, q5
+ veor q11, q11, q4
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #4
+ veor q4, q4, q11
+ vshl.u64 q11, q11, #4
+ veor q1, q1, q10
+ veor q0, q0, q11
+ sub r5,r5,#1
+ b .Lenc_sbox
+.align 4
+.Lenc_loop:
+ vldmia r4!, {q8-q11}
+ veor q8, q8, q0
+ veor q9, q9, q1
+ vtbl.8 d0, {q8}, d24
+ vtbl.8 d1, {q8}, d25
+ vldmia r4!, {q8}
+ veor q10, q10, q2
+ vtbl.8 d2, {q9}, d24
+ vtbl.8 d3, {q9}, d25
+ vldmia r4!, {q9}
+ veor q11, q11, q3
+ vtbl.8 d4, {q10}, d24
+ vtbl.8 d5, {q10}, d25
+ vldmia r4!, {q10}
+ vtbl.8 d6, {q11}, d24
+ vtbl.8 d7, {q11}, d25
+ vldmia r4!, {q11}
+ veor q8, q8, q4
+ veor q9, q9, q5
+ vtbl.8 d8, {q8}, d24
+ vtbl.8 d9, {q8}, d25
+ veor q10, q10, q6
+ vtbl.8 d10, {q9}, d24
+ vtbl.8 d11, {q9}, d25
+ veor q11, q11, q7
+ vtbl.8 d12, {q10}, d24
+ vtbl.8 d13, {q10}, d25
+ vtbl.8 d14, {q11}, d24
+ vtbl.8 d15, {q11}, d25
+.Lenc_sbox:
+ veor q2, q2, q1
+ veor q5, q5, q6
+ veor q3, q3, q0
+ veor q6, q6, q2
+ veor q5, q5, q0
+
+ veor q6, q6, q3
+ veor q3, q3, q7
+ veor q7, q7, q5
+ veor q3, q3, q4
+ veor q4, q4, q5
+
+ veor q2, q2, q7
+ veor q3, q3, q1
+ veor q1, q1, q5
+ veor q11, q7, q4
+ veor q10, q1, q2
+ veor q9, q5, q3
+ veor q13, q2, q4
+ vmov q8, q10
+ veor q12, q6, q0
+
+ vorr q10, q10, q9
+ veor q15, q11, q8
+ vand q14, q11, q12
+ vorr q11, q11, q12
+ veor q12, q12, q9
+ vand q8, q8, q9
+ veor q9, q3, q0
+ vand q15, q15, q12
+ vand q13, q13, q9
+ veor q9, q7, q1
+ veor q12, q5, q6
+ veor q11, q11, q13
+ veor q10, q10, q13
+ vand q13, q9, q12
+ vorr q9, q9, q12
+ veor q11, q11, q15
+ veor q8, q8, q13
+ veor q10, q10, q14
+ veor q9, q9, q15
+ veor q8, q8, q14
+ vand q12, q2, q3
+ veor q9, q9, q14
+ vand q13, q4, q0
+ vand q14, q1, q5
+ vorr q15, q7, q6
+ veor q11, q11, q12
+ veor q9, q9, q14
+ veor q8, q8, q15
+ veor q10, q10, q13
+
+ @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3
+
+ @ new smaller inversion
+
+ vand q14, q11, q9
+ vmov q12, q8
+
+ veor q13, q10, q14
+ veor q15, q8, q14
+ veor q14, q8, q14 @ q14=q15
+
+ vbsl q13, q9, q8
+ vbsl q15, q11, q10
+ veor q11, q11, q10
+
+ vbsl q12, q13, q14
+ vbsl q8, q14, q13
+
+ vand q14, q12, q15
+ veor q9, q9, q8
+
+ veor q14, q14, q11
+ veor q12, q6, q0
+ veor q8, q5, q3
+ veor q10, q15, q14
+ vand q10, q10, q6
+ veor q6, q6, q5
+ vand q11, q5, q15
+ vand q6, q6, q14
+ veor q5, q11, q10
+ veor q6, q6, q11
+ veor q15, q15, q13
+ veor q14, q14, q9
+ veor q11, q15, q14
+ veor q10, q13, q9
+ vand q11, q11, q12
+ vand q10, q10, q0
+ veor q12, q12, q8
+ veor q0, q0, q3
+ vand q8, q8, q15
+ vand q3, q3, q13
+ vand q12, q12, q14
+ vand q0, q0, q9
+ veor q8, q8, q12
+ veor q0, q0, q3
+ veor q12, q12, q11
+ veor q3, q3, q10
+ veor q6, q6, q12
+ veor q0, q0, q12
+ veor q5, q5, q8
+ veor q3, q3, q8
+
+ veor q12, q7, q4
+ veor q8, q1, q2
+ veor q11, q15, q14
+ veor q10, q13, q9
+ vand q11, q11, q12
+ vand q10, q10, q4
+ veor q12, q12, q8
+ veor q4, q4, q2
+ vand q8, q8, q15
+ vand q2, q2, q13
+ vand q12, q12, q14
+ vand q4, q4, q9
+ veor q8, q8, q12
+ veor q4, q4, q2
+ veor q12, q12, q11
+ veor q2, q2, q10
+ veor q15, q15, q13
+ veor q14, q14, q9
+ veor q10, q15, q14
+ vand q10, q10, q7
+ veor q7, q7, q1
+ vand q11, q1, q15
+ vand q7, q7, q14
+ veor q1, q11, q10
+ veor q7, q7, q11
+ veor q7, q7, q12
+ veor q4, q4, q12
+ veor q1, q1, q8
+ veor q2, q2, q8
+ veor q7, q7, q0
+ veor q1, q1, q6
+ veor q6, q6, q0
+ veor q4, q4, q7
+ veor q0, q0, q1
+
+ veor q1, q1, q5
+ veor q5, q5, q2
+ veor q2, q2, q3
+ veor q3, q3, q5
+ veor q4, q4, q5
+
+ veor q6, q6, q3
+ subs r5,r5,#1
+ bcc .Lenc_done
+ vext.8 q8, q0, q0, #12 @ x0 <<< 32
+ vext.8 q9, q1, q1, #12
+ veor q0, q0, q8 @ x0 ^ (x0 <<< 32)
+ vext.8 q10, q4, q4, #12
+ veor q1, q1, q9
+ vext.8 q11, q6, q6, #12
+ veor q4, q4, q10
+ vext.8 q12, q3, q3, #12
+ veor q6, q6, q11
+ vext.8 q13, q7, q7, #12
+ veor q3, q3, q12
+ vext.8 q14, q2, q2, #12
+ veor q7, q7, q13
+ vext.8 q15, q5, q5, #12
+ veor q2, q2, q14
+
+ veor q9, q9, q0
+ veor q5, q5, q15
+ vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64)
+ veor q10, q10, q1
+ veor q8, q8, q5
+ veor q9, q9, q5
+ vext.8 q1, q1, q1, #8
+ veor q13, q13, q3
+ veor q0, q0, q8
+ veor q14, q14, q7
+ veor q1, q1, q9
+ vext.8 q8, q3, q3, #8
+ veor q12, q12, q6
+ vext.8 q9, q7, q7, #8
+ veor q15, q15, q2
+ vext.8 q3, q6, q6, #8
+ veor q11, q11, q4
+ vext.8 q7, q5, q5, #8
+ veor q12, q12, q5
+ vext.8 q6, q2, q2, #8
+ veor q11, q11, q5
+ vext.8 q2, q4, q4, #8
+ veor q5, q9, q13
+ veor q4, q8, q12
+ veor q3, q3, q11
+ veor q7, q7, q15
+ veor q6, q6, q14
+ @ vmov q4, q8
+ veor q2, q2, q10
+ @ vmov q5, q9
+ vldmia r6, {q12} @ .LSR
+ ite eq @ Thumb2 thing, samity check in ARM
+ addeq r6,r6,#0x10
+ bne .Lenc_loop
+ vldmia r6, {q12} @ .LSRM0
+ b .Lenc_loop
+.align 4
+.Lenc_done:
+ vmov.i8 q8,#0x55 @ compose .LBS0
+ vmov.i8 q9,#0x33 @ compose .LBS1
+ vshr.u64 q10, q2, #1
+ vshr.u64 q11, q3, #1
+ veor q10, q10, q5
+ veor q11, q11, q7
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #1
+ veor q7, q7, q11
+ vshl.u64 q11, q11, #1
+ veor q2, q2, q10
+ veor q3, q3, q11
+ vshr.u64 q10, q4, #1
+ vshr.u64 q11, q0, #1
+ veor q10, q10, q6
+ veor q11, q11, q1
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q6, q6, q10
+ vshl.u64 q10, q10, #1
+ veor q1, q1, q11
+ vshl.u64 q11, q11, #1
+ veor q4, q4, q10
+ veor q0, q0, q11
+ vmov.i8 q8,#0x0f @ compose .LBS2
+ vshr.u64 q10, q7, #2
+ vshr.u64 q11, q3, #2
+ veor q10, q10, q5
+ veor q11, q11, q2
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #2
+ veor q2, q2, q11
+ vshl.u64 q11, q11, #2
+ veor q7, q7, q10
+ veor q3, q3, q11
+ vshr.u64 q10, q1, #2
+ vshr.u64 q11, q0, #2
+ veor q10, q10, q6
+ veor q11, q11, q4
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q6, q6, q10
+ vshl.u64 q10, q10, #2
+ veor q4, q4, q11
+ vshl.u64 q11, q11, #2
+ veor q1, q1, q10
+ veor q0, q0, q11
+ vshr.u64 q10, q6, #4
+ vshr.u64 q11, q4, #4
+ veor q10, q10, q5
+ veor q11, q11, q2
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #4
+ veor q2, q2, q11
+ vshl.u64 q11, q11, #4
+ veor q6, q6, q10
+ veor q4, q4, q11
+ vshr.u64 q10, q1, #4
+ vshr.u64 q11, q0, #4
+ veor q10, q10, q7
+ veor q11, q11, q3
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #4
+ veor q3, q3, q11
+ vshl.u64 q11, q11, #4
+ veor q1, q1, q10
+ veor q0, q0, q11
+ vldmia r4, {q8} @ last round key
+ veor q4, q4, q8
+ veor q6, q6, q8
+ veor q3, q3, q8
+ veor q7, q7, q8
+ veor q2, q2, q8
+ veor q5, q5, q8
+ veor q0, q0, q8
+ veor q1, q1, q8
+ bx lr
+.size _bsaes_encrypt8,.-_bsaes_encrypt8
+.type _bsaes_key_convert,%function
+.align 4
+_bsaes_key_convert:
+ adr r6,_bsaes_key_convert
+ vld1.8 {q7}, [r4]! @ load round 0 key
+ sub r6,r6,#_bsaes_key_convert-.LM0
+ vld1.8 {q15}, [r4]! @ load round 1 key
+
+ vmov.i8 q8, #0x01 @ bit masks
+ vmov.i8 q9, #0x02
+ vmov.i8 q10, #0x04
+ vmov.i8 q11, #0x08
+ vmov.i8 q12, #0x10
+ vmov.i8 q13, #0x20
+ vldmia r6, {q14} @ .LM0
+
+#ifdef __ARMEL__
+ vrev32.8 q7, q7
+ vrev32.8 q15, q15
+#endif
+ sub r5,r5,#1
+ vstmia r12!, {q7} @ save round 0 key
+ b .Lkey_loop
+
+.align 4
+.Lkey_loop:
+ vtbl.8 d14,{q15},d28
+ vtbl.8 d15,{q15},d29
+ vmov.i8 q6, #0x40
+ vmov.i8 q15, #0x80
+
+ vtst.8 q0, q7, q8
+ vtst.8 q1, q7, q9
+ vtst.8 q2, q7, q10
+ vtst.8 q3, q7, q11
+ vtst.8 q4, q7, q12
+ vtst.8 q5, q7, q13
+ vtst.8 q6, q7, q6
+ vtst.8 q7, q7, q15
+ vld1.8 {q15}, [r4]! @ load next round key
+ vmvn q0, q0 @ "pnot"
+ vmvn q1, q1
+ vmvn q5, q5
+ vmvn q6, q6
+#ifdef __ARMEL__
+ vrev32.8 q15, q15
+#endif
+ subs r5,r5,#1
+ vstmia r12!,{q0-q7} @ write bit-sliced round key
+ bne .Lkey_loop
+
+ vmov.i8 q7,#0x63 @ compose .L63
+ @ don't save last round key
+ bx lr
+.size _bsaes_key_convert,.-_bsaes_key_convert
+.extern AES_cbc_encrypt
+.extern AES_decrypt
+
+.global bsaes_cbc_encrypt
+.type bsaes_cbc_encrypt,%function
+.align 5
+bsaes_cbc_encrypt:
+#ifndef __KERNEL__
+ cmp r2, #128
+#ifndef __thumb__
+ blo AES_cbc_encrypt
+#else
+ bhs 1f
+ b AES_cbc_encrypt
+1:
+#endif
+#endif
+
+ @ it is up to the caller to make sure we are called with enc == 0
+
+ mov ip, sp
+ stmdb sp!, {r4-r10, lr}
+ VFP_ABI_PUSH
+ ldr r8, [ip] @ IV is 1st arg on the stack
+ mov r2, r2, lsr#4 @ len in 16 byte blocks
+ sub sp, #0x10 @ scratch space to carry over the IV
+ mov r9, sp @ save sp
+
+ ldr r10, [r3, #240] @ get # of rounds
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key
+ add r12, #96 @ sifze of bit-slices key schedule
+
+ @ populate the key schedule
+ mov r4, r3 @ pass key
+ mov r5, r10 @ pass # of rounds
+ mov sp, r12 @ sp is sp
+ bl _bsaes_key_convert
+ vldmia sp, {q6}
+ vstmia r12, {q15} @ save last round key
+ veor q7, q7, q6 @ fix up round 0 key
+ vstmia sp, {q7}
+#else
+ ldr r12, [r3, #244]
+ eors r12, #1
+ beq 0f
+
+ @ populate the key schedule
+ str r12, [r3, #244]
+ mov r4, r3 @ pass key
+ mov r5, r10 @ pass # of rounds
+ add r12, r3, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ add r4, r3, #248
+ vldmia r4, {q6}
+ vstmia r12, {q15} @ save last round key
+ veor q7, q7, q6 @ fix up round 0 key
+ vstmia r4, {q7}
+
+.align 2
+0:
+#endif
+
+ vld1.8 {q15}, [r8] @ load IV
+ b .Lcbc_dec_loop
+
+.align 4
+.Lcbc_dec_loop:
+ subs r2, r2, #0x8
+ bmi .Lcbc_dec_loop_finish
+
+ vld1.8 {q0-q1}, [r0]! @ load input
+ vld1.8 {q2-q3}, [r0]!
+#ifndef BSAES_ASM_EXTENDED_KEY
+ mov r4, sp @ pass the key
+#else
+ add r4, r3, #248
+#endif
+ vld1.8 {q4-q5}, [r0]!
+ mov r5, r10
+ vld1.8 {q6-q7}, [r0]
+ sub r0, r0, #0x60
+ vstmia r9, {q15} @ put aside IV
+
+ bl _bsaes_decrypt8
+
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q8-q9}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q10-q11}, [r0]!
+ veor q1, q1, q8
+ veor q6, q6, q9
+ vld1.8 {q12-q13}, [r0]!
+ veor q4, q4, q10
+ veor q2, q2, q11
+ vld1.8 {q14-q15}, [r0]!
+ veor q7, q7, q12
+ vst1.8 {q0-q1}, [r1]! @ write output
+ veor q3, q3, q13
+ vst1.8 {q6}, [r1]!
+ veor q5, q5, q14
+ vst1.8 {q4}, [r1]!
+ vst1.8 {q2}, [r1]!
+ vst1.8 {q7}, [r1]!
+ vst1.8 {q3}, [r1]!
+ vst1.8 {q5}, [r1]!
+
+ b .Lcbc_dec_loop
+
+.Lcbc_dec_loop_finish:
+ adds r2, r2, #8
+ beq .Lcbc_dec_done
+
+ vld1.8 {q0}, [r0]! @ load input
+ cmp r2, #2
+ blo .Lcbc_dec_one
+ vld1.8 {q1}, [r0]!
+#ifndef BSAES_ASM_EXTENDED_KEY
+ mov r4, sp @ pass the key
+#else
+ add r4, r3, #248
+#endif
+ mov r5, r10
+ vstmia r9, {q15} @ put aside IV
+ beq .Lcbc_dec_two
+ vld1.8 {q2}, [r0]!
+ cmp r2, #4
+ blo .Lcbc_dec_three
+ vld1.8 {q3}, [r0]!
+ beq .Lcbc_dec_four
+ vld1.8 {q4}, [r0]!
+ cmp r2, #6
+ blo .Lcbc_dec_five
+ vld1.8 {q5}, [r0]!
+ beq .Lcbc_dec_six
+ vld1.8 {q6}, [r0]!
+ sub r0, r0, #0x70
+
+ bl _bsaes_decrypt8
+
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q8-q9}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q10-q11}, [r0]!
+ veor q1, q1, q8
+ veor q6, q6, q9
+ vld1.8 {q12-q13}, [r0]!
+ veor q4, q4, q10
+ veor q2, q2, q11
+ vld1.8 {q15}, [r0]!
+ veor q7, q7, q12
+ vst1.8 {q0-q1}, [r1]! @ write output
+ veor q3, q3, q13
+ vst1.8 {q6}, [r1]!
+ vst1.8 {q4}, [r1]!
+ vst1.8 {q2}, [r1]!
+ vst1.8 {q7}, [r1]!
+ vst1.8 {q3}, [r1]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_six:
+ sub r0, r0, #0x60
+ bl _bsaes_decrypt8
+ vldmia r9,{q14} @ reload IV
+ vld1.8 {q8-q9}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q10-q11}, [r0]!
+ veor q1, q1, q8
+ veor q6, q6, q9
+ vld1.8 {q12}, [r0]!
+ veor q4, q4, q10
+ veor q2, q2, q11
+ vld1.8 {q15}, [r0]!
+ veor q7, q7, q12
+ vst1.8 {q0-q1}, [r1]! @ write output
+ vst1.8 {q6}, [r1]!
+ vst1.8 {q4}, [r1]!
+ vst1.8 {q2}, [r1]!
+ vst1.8 {q7}, [r1]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_five:
+ sub r0, r0, #0x50
+ bl _bsaes_decrypt8
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q8-q9}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q10-q11}, [r0]!
+ veor q1, q1, q8
+ veor q6, q6, q9
+ vld1.8 {q15}, [r0]!
+ veor q4, q4, q10
+ vst1.8 {q0-q1}, [r1]! @ write output
+ veor q2, q2, q11
+ vst1.8 {q6}, [r1]!
+ vst1.8 {q4}, [r1]!
+ vst1.8 {q2}, [r1]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_four:
+ sub r0, r0, #0x40
+ bl _bsaes_decrypt8
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q8-q9}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q10}, [r0]!
+ veor q1, q1, q8
+ veor q6, q6, q9
+ vld1.8 {q15}, [r0]!
+ veor q4, q4, q10
+ vst1.8 {q0-q1}, [r1]! @ write output
+ vst1.8 {q6}, [r1]!
+ vst1.8 {q4}, [r1]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_three:
+ sub r0, r0, #0x30
+ bl _bsaes_decrypt8
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q8-q9}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q15}, [r0]!
+ veor q1, q1, q8
+ veor q6, q6, q9
+ vst1.8 {q0-q1}, [r1]! @ write output
+ vst1.8 {q6}, [r1]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_two:
+ sub r0, r0, #0x20
+ bl _bsaes_decrypt8
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q8}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q15}, [r0]! @ reload input
+ veor q1, q1, q8
+ vst1.8 {q0-q1}, [r1]! @ write output
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_one:
+ sub r0, r0, #0x10
+ mov r10, r1 @ save original out pointer
+ mov r1, r9 @ use the iv scratch space as out buffer
+ mov r2, r3
+ vmov q4,q15 @ just in case ensure that IV
+ vmov q5,q0 @ and input are preserved
+ bl AES_decrypt
+ vld1.8 {q0}, [r9,:64] @ load result
+ veor q0, q0, q4 @ ^= IV
+ vmov q15, q5 @ q5 holds input
+ vst1.8 {q0}, [r10] @ write output
+
+.Lcbc_dec_done:
+#ifndef BSAES_ASM_EXTENDED_KEY
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+.Lcbc_dec_bzero: @ wipe key schedule [if any]
+ vstmia sp!, {q0-q1}
+ cmp sp, r9
+ bne .Lcbc_dec_bzero
+#endif
+
+ mov sp, r9
+ add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb
+ vst1.8 {q15}, [r8] @ return IV
+ VFP_ABI_POP
+ ldmia sp!, {r4-r10, pc}
+.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
+.extern AES_encrypt
+.global bsaes_ctr32_encrypt_blocks
+.type bsaes_ctr32_encrypt_blocks,%function
+.align 5
+bsaes_ctr32_encrypt_blocks:
+ cmp r2, #8 @ use plain AES for
+ blo .Lctr_enc_short @ small sizes
+
+ mov ip, sp
+ stmdb sp!, {r4-r10, lr}
+ VFP_ABI_PUSH
+ ldr r8, [ip] @ ctr is 1st arg on the stack
+ sub sp, sp, #0x10 @ scratch space to carry over the ctr
+ mov r9, sp @ save sp
+
+ ldr r10, [r3, #240] @ get # of rounds
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key
+ add r12, #96 @ size of bit-sliced key schedule
+
+ @ populate the key schedule
+ mov r4, r3 @ pass key
+ mov r5, r10 @ pass # of rounds
+ mov sp, r12 @ sp is sp
+ bl _bsaes_key_convert
+ veor q7,q7,q15 @ fix up last round key
+ vstmia r12, {q7} @ save last round key
+
+ vld1.8 {q0}, [r8] @ load counter
+ add r8, r6, #.LREVM0SR-.LM0 @ borrow r8
+ vldmia sp, {q4} @ load round0 key
+#else
+ ldr r12, [r3, #244]
+ eors r12, #1
+ beq 0f
+
+ @ populate the key schedule
+ str r12, [r3, #244]
+ mov r4, r3 @ pass key
+ mov r5, r10 @ pass # of rounds
+ add r12, r3, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ veor q7,q7,q15 @ fix up last round key
+ vstmia r12, {q7} @ save last round key
+
+.align 2
+0: add r12, r3, #248
+ vld1.8 {q0}, [r8] @ load counter
+ adrl r8, .LREVM0SR @ borrow r8
+ vldmia r12, {q4} @ load round0 key
+ sub sp, #0x10 @ place for adjusted round0 key
+#endif
+
+ vmov.i32 q8,#1 @ compose 1<<96
+ veor q9,q9,q9
+ vrev32.8 q0,q0
+ vext.8 q8,q9,q8,#4
+ vrev32.8 q4,q4
+ vadd.u32 q9,q8,q8 @ compose 2<<96
+ vstmia sp, {q4} @ save adjusted round0 key
+ b .Lctr_enc_loop
+
+.align 4
+.Lctr_enc_loop:
+ vadd.u32 q10, q8, q9 @ compose 3<<96
+ vadd.u32 q1, q0, q8 @ +1
+ vadd.u32 q2, q0, q9 @ +2
+ vadd.u32 q3, q0, q10 @ +3
+ vadd.u32 q4, q1, q10
+ vadd.u32 q5, q2, q10
+ vadd.u32 q6, q3, q10
+ vadd.u32 q7, q4, q10
+ vadd.u32 q10, q5, q10 @ next counter
+
+ @ Borrow prologue from _bsaes_encrypt8 to use the opportunity
+ @ to flip byte order in 32-bit counter
+
+ vldmia sp, {q9} @ load round0 key
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x10 @ pass next round key
+#else
+ add r4, r3, #264
+#endif
+ vldmia r8, {q8} @ .LREVM0SR
+ mov r5, r10 @ pass rounds
+ vstmia r9, {q10} @ save next counter
+ sub r6, r8, #.LREVM0SR-.LSR @ pass constants
+
+ bl _bsaes_encrypt8_alt
+
+ subs r2, r2, #8
+ blo .Lctr_enc_loop_done
+
+ vld1.8 {q8-q9}, [r0]! @ load input
+ vld1.8 {q10-q11}, [r0]!
+ veor q0, q8
+ veor q1, q9
+ vld1.8 {q12-q13}, [r0]!
+ veor q4, q10
+ veor q6, q11
+ vld1.8 {q14-q15}, [r0]!
+ veor q3, q12
+ vst1.8 {q0-q1}, [r1]! @ write output
+ veor q7, q13
+ veor q2, q14
+ vst1.8 {q4}, [r1]!
+ veor q5, q15
+ vst1.8 {q6}, [r1]!
+ vmov.i32 q8, #1 @ compose 1<<96
+ vst1.8 {q3}, [r1]!
+ veor q9, q9, q9
+ vst1.8 {q7}, [r1]!
+ vext.8 q8, q9, q8, #4
+ vst1.8 {q2}, [r1]!
+ vadd.u32 q9,q8,q8 @ compose 2<<96
+ vst1.8 {q5}, [r1]!
+ vldmia r9, {q0} @ load counter
+
+ bne .Lctr_enc_loop
+ b .Lctr_enc_done
+
+.align 4
+.Lctr_enc_loop_done:
+ add r2, r2, #8
+ vld1.8 {q8}, [r0]! @ load input
+ veor q0, q8
+ vst1.8 {q0}, [r1]! @ write output
+ cmp r2, #2
+ blo .Lctr_enc_done
+ vld1.8 {q9}, [r0]!
+ veor q1, q9
+ vst1.8 {q1}, [r1]!
+ beq .Lctr_enc_done
+ vld1.8 {q10}, [r0]!
+ veor q4, q10
+ vst1.8 {q4}, [r1]!
+ cmp r2, #4
+ blo .Lctr_enc_done
+ vld1.8 {q11}, [r0]!
+ veor q6, q11
+ vst1.8 {q6}, [r1]!
+ beq .Lctr_enc_done
+ vld1.8 {q12}, [r0]!
+ veor q3, q12
+ vst1.8 {q3}, [r1]!
+ cmp r2, #6
+ blo .Lctr_enc_done
+ vld1.8 {q13}, [r0]!
+ veor q7, q13
+ vst1.8 {q7}, [r1]!
+ beq .Lctr_enc_done
+ vld1.8 {q14}, [r0]
+ veor q2, q14
+ vst1.8 {q2}, [r1]!
+
+.Lctr_enc_done:
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+#ifndef BSAES_ASM_EXTENDED_KEY
+.Lctr_enc_bzero: @ wipe key schedule [if any]
+ vstmia sp!, {q0-q1}
+ cmp sp, r9
+ bne .Lctr_enc_bzero
+#else
+ vstmia sp, {q0-q1}
+#endif
+
+ mov sp, r9
+ add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb
+ VFP_ABI_POP
+ ldmia sp!, {r4-r10, pc} @ return
+
+.align 4
+.Lctr_enc_short:
+ ldr ip, [sp] @ ctr pointer is passed on stack
+ stmdb sp!, {r4-r8, lr}
+
+ mov r4, r0 @ copy arguments
+ mov r5, r1
+ mov r6, r2
+ mov r7, r3
+ ldr r8, [ip, #12] @ load counter LSW
+ vld1.8 {q1}, [ip] @ load whole counter value
+#ifdef __ARMEL__
+ rev r8, r8
+#endif
+ sub sp, sp, #0x10
+ vst1.8 {q1}, [sp,:64] @ copy counter value
+ sub sp, sp, #0x10
+
+.Lctr_enc_short_loop:
+ add r0, sp, #0x10 @ input counter value
+ mov r1, sp @ output on the stack
+ mov r2, r7 @ key
+
+ bl AES_encrypt
+
+ vld1.8 {q0}, [r4]! @ load input
+ vld1.8 {q1}, [sp,:64] @ load encrypted counter
+ add r8, r8, #1
+#ifdef __ARMEL__
+ rev r0, r8
+ str r0, [sp, #0x1c] @ next counter value
+#else
+ str r8, [sp, #0x1c] @ next counter value
+#endif
+ veor q0,q0,q1
+ vst1.8 {q0}, [r5]! @ store output
+ subs r6, r6, #1
+ bne .Lctr_enc_short_loop
+
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+ vstmia sp!, {q0-q1}
+
+ ldmia sp!, {r4-r8, pc}
+.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
+.globl bsaes_xts_encrypt
+.type bsaes_xts_encrypt,%function
+.align 4
+bsaes_xts_encrypt:
+ mov ip, sp
+ stmdb sp!, {r4-r10, lr} @ 0x20
+ VFP_ABI_PUSH
+ mov r6, sp @ future r3
+
+ mov r7, r0
+ mov r8, r1
+ mov r9, r2
+ mov r10, r3
+
+ sub r0, sp, #0x10 @ 0x10
+ bic r0, #0xf @ align at 16 bytes
+ mov sp, r0
+
+#ifdef XTS_CHAIN_TWEAK
+ ldr r0, [ip] @ pointer to input tweak
+#else
+ @ generate initial tweak
+ ldr r0, [ip, #4] @ iv[]
+ mov r1, sp
+ ldr r2, [ip, #0] @ key2
+ bl AES_encrypt
+ mov r0,sp @ pointer to initial tweak
+#endif
+
+ ldr r1, [r10, #240] @ get # of rounds
+ mov r3, r6
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, r1, lsl#7 @ 128 bytes per inner round key
+ @ add r12, #96 @ size of bit-sliced key schedule
+ sub r12, #48 @ place for tweak[9]
+
+ @ populate the key schedule
+ mov r4, r10 @ pass key
+ mov r5, r1 @ pass # of rounds
+ mov sp, r12
+ add r12, #0x90 @ pass key schedule
+ bl _bsaes_key_convert
+ veor q7, q7, q15 @ fix up last round key
+ vstmia r12, {q7} @ save last round key
+#else
+ ldr r12, [r10, #244]
+ eors r12, #1
+ beq 0f
+
+ str r12, [r10, #244]
+ mov r4, r10 @ pass key
+ mov r5, r1 @ pass # of rounds
+ add r12, r10, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ veor q7, q7, q15 @ fix up last round key
+ vstmia r12, {q7}
+
+.align 2
+0: sub sp, #0x90 @ place for tweak[9]
+#endif
+
+ vld1.8 {q8}, [r0] @ initial tweak
+ adr r2, .Lxts_magic
+
+ subs r9, #0x80
+ blo .Lxts_enc_short
+ b .Lxts_enc_loop
+
+.align 4
+.Lxts_enc_loop:
+ vldmia r2, {q5} @ load XTS magic
+ vshr.s64 q6, q8, #63
+ mov r0, sp
+ vand q6, q6, q5
+ vadd.u64 q9, q8, q8
+ vst1.64 {q8}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q9, #63
+ veor q9, q9, q6
+ vand q7, q7, q5
+ vadd.u64 q10, q9, q9
+ vst1.64 {q9}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q10, #63
+ veor q10, q10, q7
+ vand q6, q6, q5
+ vld1.8 {q0}, [r7]!
+ vadd.u64 q11, q10, q10
+ vst1.64 {q10}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q11, #63
+ veor q11, q11, q6
+ vand q7, q7, q5
+ vld1.8 {q1}, [r7]!
+ veor q0, q0, q8
+ vadd.u64 q12, q11, q11
+ vst1.64 {q11}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q12, #63
+ veor q12, q12, q7
+ vand q6, q6, q5
+ vld1.8 {q2}, [r7]!
+ veor q1, q1, q9
+ vadd.u64 q13, q12, q12
+ vst1.64 {q12}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q13, #63
+ veor q13, q13, q6
+ vand q7, q7, q5
+ vld1.8 {q3}, [r7]!
+ veor q2, q2, q10
+ vadd.u64 q14, q13, q13
+ vst1.64 {q13}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q14, #63
+ veor q14, q14, q7
+ vand q6, q6, q5
+ vld1.8 {q4}, [r7]!
+ veor q3, q3, q11
+ vadd.u64 q15, q14, q14
+ vst1.64 {q14}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q15, #63
+ veor q15, q15, q6
+ vand q7, q7, q5
+ vld1.8 {q5}, [r7]!
+ veor q4, q4, q12
+ vadd.u64 q8, q15, q15
+ vst1.64 {q15}, [r0,:128]!
+ vswp d15,d14
+ veor q8, q8, q7
+ vst1.64 {q8}, [r0,:128] @ next round tweak
+
+ vld1.8 {q6-q7}, [r7]!
+ veor q5, q5, q13
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q6, q6, q14
+ mov r5, r1 @ pass rounds
+ veor q7, q7, q15
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ vld1.64 {q10-q11}, [r0,:128]!
+ veor q0, q0, q8
+ vld1.64 {q12-q13}, [r0,:128]!
+ veor q1, q1, q9
+ veor q8, q4, q10
+ vst1.8 {q0-q1}, [r8]!
+ veor q9, q6, q11
+ vld1.64 {q14-q15}, [r0,:128]!
+ veor q10, q3, q12
+ vst1.8 {q8-q9}, [r8]!
+ veor q11, q7, q13
+ veor q12, q2, q14
+ vst1.8 {q10-q11}, [r8]!
+ veor q13, q5, q15
+ vst1.8 {q12-q13}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+
+ subs r9, #0x80
+ bpl .Lxts_enc_loop
+
+.Lxts_enc_short:
+ adds r9, #0x70
+ bmi .Lxts_enc_done
+
+ vldmia r2, {q5} @ load XTS magic
+ vshr.s64 q7, q8, #63
+ mov r0, sp
+ vand q7, q7, q5
+ vadd.u64 q9, q8, q8
+ vst1.64 {q8}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q9, #63
+ veor q9, q9, q7
+ vand q6, q6, q5
+ vadd.u64 q10, q9, q9
+ vst1.64 {q9}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q10, #63
+ veor q10, q10, q6
+ vand q7, q7, q5
+ vld1.8 {q0}, [r7]!
+ subs r9, #0x10
+ bmi .Lxts_enc_1
+ vadd.u64 q11, q10, q10
+ vst1.64 {q10}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q11, #63
+ veor q11, q11, q7
+ vand q6, q6, q5
+ vld1.8 {q1}, [r7]!
+ subs r9, #0x10
+ bmi .Lxts_enc_2
+ veor q0, q0, q8
+ vadd.u64 q12, q11, q11
+ vst1.64 {q11}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q12, #63
+ veor q12, q12, q6
+ vand q7, q7, q5
+ vld1.8 {q2}, [r7]!
+ subs r9, #0x10
+ bmi .Lxts_enc_3
+ veor q1, q1, q9
+ vadd.u64 q13, q12, q12
+ vst1.64 {q12}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q13, #63
+ veor q13, q13, q7
+ vand q6, q6, q5
+ vld1.8 {q3}, [r7]!
+ subs r9, #0x10
+ bmi .Lxts_enc_4
+ veor q2, q2, q10
+ vadd.u64 q14, q13, q13
+ vst1.64 {q13}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q14, #63
+ veor q14, q14, q6
+ vand q7, q7, q5
+ vld1.8 {q4}, [r7]!
+ subs r9, #0x10
+ bmi .Lxts_enc_5
+ veor q3, q3, q11
+ vadd.u64 q15, q14, q14
+ vst1.64 {q14}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q15, #63
+ veor q15, q15, q7
+ vand q6, q6, q5
+ vld1.8 {q5}, [r7]!
+ subs r9, #0x10
+ bmi .Lxts_enc_6
+ veor q4, q4, q12
+ sub r9, #0x10
+ vst1.64 {q15}, [r0,:128] @ next round tweak
+
+ vld1.8 {q6}, [r7]!
+ veor q5, q5, q13
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q6, q6, q14
+ mov r5, r1 @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ vld1.64 {q10-q11}, [r0,:128]!
+ veor q0, q0, q8
+ vld1.64 {q12-q13}, [r0,:128]!
+ veor q1, q1, q9
+ veor q8, q4, q10
+ vst1.8 {q0-q1}, [r8]!
+ veor q9, q6, q11
+ vld1.64 {q14}, [r0,:128]!
+ veor q10, q3, q12
+ vst1.8 {q8-q9}, [r8]!
+ veor q11, q7, q13
+ veor q12, q2, q14
+ vst1.8 {q10-q11}, [r8]!
+ vst1.8 {q12}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_6:
+ vst1.64 {q14}, [r0,:128] @ next round tweak
+
+ veor q4, q4, q12
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q5, q5, q13
+ mov r5, r1 @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ vld1.64 {q10-q11}, [r0,:128]!
+ veor q0, q0, q8
+ vld1.64 {q12-q13}, [r0,:128]!
+ veor q1, q1, q9
+ veor q8, q4, q10
+ vst1.8 {q0-q1}, [r8]!
+ veor q9, q6, q11
+ veor q10, q3, q12
+ vst1.8 {q8-q9}, [r8]!
+ veor q11, q7, q13
+ vst1.8 {q10-q11}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+
+@ put this in range for both ARM and Thumb mode adr instructions
+.align 5
+.Lxts_magic:
+ .quad 1, 0x87
+
+.align 5
+.Lxts_enc_5:
+ vst1.64 {q13}, [r0,:128] @ next round tweak
+
+ veor q3, q3, q11
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q4, q4, q12
+ mov r5, r1 @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ vld1.64 {q10-q11}, [r0,:128]!
+ veor q0, q0, q8
+ vld1.64 {q12}, [r0,:128]!
+ veor q1, q1, q9
+ veor q8, q4, q10
+ vst1.8 {q0-q1}, [r8]!
+ veor q9, q6, q11
+ veor q10, q3, q12
+ vst1.8 {q8-q9}, [r8]!
+ vst1.8 {q10}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_4:
+ vst1.64 {q12}, [r0,:128] @ next round tweak
+
+ veor q2, q2, q10
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q3, q3, q11
+ mov r5, r1 @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ vld1.64 {q10-q11}, [r0,:128]!
+ veor q0, q0, q8
+ veor q1, q1, q9
+ veor q8, q4, q10
+ vst1.8 {q0-q1}, [r8]!
+ veor q9, q6, q11
+ vst1.8 {q8-q9}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_3:
+ vst1.64 {q11}, [r0,:128] @ next round tweak
+
+ veor q1, q1, q9
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q2, q2, q10
+ mov r5, r1 @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ vld1.64 {q10}, [r0,:128]!
+ veor q0, q0, q8
+ veor q1, q1, q9
+ veor q8, q4, q10
+ vst1.8 {q0-q1}, [r8]!
+ vst1.8 {q8}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_2:
+ vst1.64 {q10}, [r0,:128] @ next round tweak
+
+ veor q0, q0, q8
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q1, q1, q9
+ mov r5, r1 @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ veor q0, q0, q8
+ veor q1, q1, q9
+ vst1.8 {q0-q1}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_1:
+ mov r0, sp
+ veor q0, q8
+ mov r1, sp
+ vst1.8 {q0}, [sp,:128]
+ mov r2, r10
+ mov r4, r3 @ preserve fp
+
+ bl AES_encrypt
+
+ vld1.8 {q0}, [sp,:128]
+ veor q0, q0, q8
+ vst1.8 {q0}, [r8]!
+ mov r3, r4
+
+ vmov q8, q9 @ next round tweak
+
+.Lxts_enc_done:
+#ifndef XTS_CHAIN_TWEAK
+ adds r9, #0x10
+ beq .Lxts_enc_ret
+ sub r6, r8, #0x10
+
+.Lxts_enc_steal:
+ ldrb r0, [r7], #1
+ ldrb r1, [r8, #-0x10]
+ strb r0, [r8, #-0x10]
+ strb r1, [r8], #1
+
+ subs r9, #1
+ bhi .Lxts_enc_steal
+
+ vld1.8 {q0}, [r6]
+ mov r0, sp
+ veor q0, q0, q8
+ mov r1, sp
+ vst1.8 {q0}, [sp,:128]
+ mov r2, r10
+ mov r4, r3 @ preserve fp
+
+ bl AES_encrypt
+
+ vld1.8 {q0}, [sp,:128]
+ veor q0, q0, q8
+ vst1.8 {q0}, [r6]
+ mov r3, r4
+#endif
+
+.Lxts_enc_ret:
+ bic r0, r3, #0xf
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+#ifdef XTS_CHAIN_TWEAK
+ ldr r1, [r3, #0x20+VFP_ABI_FRAME] @ chain tweak
+#endif
+.Lxts_enc_bzero: @ wipe key schedule [if any]
+ vstmia sp!, {q0-q1}
+ cmp sp, r0
+ bne .Lxts_enc_bzero
+
+ mov sp, r3
+#ifdef XTS_CHAIN_TWEAK
+ vst1.8 {q8}, [r1]
+#endif
+ VFP_ABI_POP
+ ldmia sp!, {r4-r10, pc} @ return
+
+.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
+
+.globl bsaes_xts_decrypt
+.type bsaes_xts_decrypt,%function
+.align 4
+bsaes_xts_decrypt:
+ mov ip, sp
+ stmdb sp!, {r4-r10, lr} @ 0x20
+ VFP_ABI_PUSH
+ mov r6, sp @ future r3
+
+ mov r7, r0
+ mov r8, r1
+ mov r9, r2
+ mov r10, r3
+
+ sub r0, sp, #0x10 @ 0x10
+ bic r0, #0xf @ align at 16 bytes
+ mov sp, r0
+
+#ifdef XTS_CHAIN_TWEAK
+ ldr r0, [ip] @ pointer to input tweak
+#else
+ @ generate initial tweak
+ ldr r0, [ip, #4] @ iv[]
+ mov r1, sp
+ ldr r2, [ip, #0] @ key2
+ bl AES_encrypt
+ mov r0, sp @ pointer to initial tweak
+#endif
+
+ ldr r1, [r10, #240] @ get # of rounds
+ mov r3, r6
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, r1, lsl#7 @ 128 bytes per inner round key
+ @ add r12, #96 @ size of bit-sliced key schedule
+ sub r12, #48 @ place for tweak[9]
+
+ @ populate the key schedule
+ mov r4, r10 @ pass key
+ mov r5, r1 @ pass # of rounds
+ mov sp, r12
+ add r12, #0x90 @ pass key schedule
+ bl _bsaes_key_convert
+ add r4, sp, #0x90
+ vldmia r4, {q6}
+ vstmia r12, {q15} @ save last round key
+ veor q7, q7, q6 @ fix up round 0 key
+ vstmia r4, {q7}
+#else
+ ldr r12, [r10, #244]
+ eors r12, #1
+ beq 0f
+
+ str r12, [r10, #244]
+ mov r4, r10 @ pass key
+ mov r5, r1 @ pass # of rounds
+ add r12, r10, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ add r4, r10, #248
+ vldmia r4, {q6}
+ vstmia r12, {q15} @ save last round key
+ veor q7, q7, q6 @ fix up round 0 key
+ vstmia r4, {q7}
+
+.align 2
+0: sub sp, #0x90 @ place for tweak[9]
+#endif
+ vld1.8 {q8}, [r0] @ initial tweak
+ adr r2, .Lxts_magic
+
+ tst r9, #0xf @ if not multiple of 16
+ it ne @ Thumb2 thing, sanity check in ARM
+ subne r9, #0x10 @ subtract another 16 bytes
+ subs r9, #0x80
+
+ blo .Lxts_dec_short
+ b .Lxts_dec_loop
+
+.align 4
+.Lxts_dec_loop:
+ vldmia r2, {q5} @ load XTS magic
+ vshr.s64 q6, q8, #63
+ mov r0, sp
+ vand q6, q6, q5
+ vadd.u64 q9, q8, q8
+ vst1.64 {q8}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q9, #63
+ veor q9, q9, q6
+ vand q7, q7, q5
+ vadd.u64 q10, q9, q9
+ vst1.64 {q9}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q10, #63
+ veor q10, q10, q7
+ vand q6, q6, q5
+ vld1.8 {q0}, [r7]!
+ vadd.u64 q11, q10, q10
+ vst1.64 {q10}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q11, #63
+ veor q11, q11, q6
+ vand q7, q7, q5
+ vld1.8 {q1}, [r7]!
+ veor q0, q0, q8
+ vadd.u64 q12, q11, q11
+ vst1.64 {q11}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q12, #63
+ veor q12, q12, q7
+ vand q6, q6, q5
+ vld1.8 {q2}, [r7]!
+ veor q1, q1, q9
+ vadd.u64 q13, q12, q12
+ vst1.64 {q12}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q13, #63
+ veor q13, q13, q6
+ vand q7, q7, q5
+ vld1.8 {q3}, [r7]!
+ veor q2, q2, q10
+ vadd.u64 q14, q13, q13
+ vst1.64 {q13}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q14, #63
+ veor q14, q14, q7
+ vand q6, q6, q5
+ vld1.8 {q4}, [r7]!
+ veor q3, q3, q11
+ vadd.u64 q15, q14, q14
+ vst1.64 {q14}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q15, #63
+ veor q15, q15, q6
+ vand q7, q7, q5
+ vld1.8 {q5}, [r7]!
+ veor q4, q4, q12
+ vadd.u64 q8, q15, q15
+ vst1.64 {q15}, [r0,:128]!
+ vswp d15,d14
+ veor q8, q8, q7
+ vst1.64 {q8}, [r0,:128] @ next round tweak
+
+ vld1.8 {q6-q7}, [r7]!
+ veor q5, q5, q13
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q6, q6, q14
+ mov r5, r1 @ pass rounds
+ veor q7, q7, q15
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ vld1.64 {q10-q11}, [r0,:128]!
+ veor q0, q0, q8
+ vld1.64 {q12-q13}, [r0,:128]!
+ veor q1, q1, q9
+ veor q8, q6, q10
+ vst1.8 {q0-q1}, [r8]!
+ veor q9, q4, q11
+ vld1.64 {q14-q15}, [r0,:128]!
+ veor q10, q2, q12
+ vst1.8 {q8-q9}, [r8]!
+ veor q11, q7, q13
+ veor q12, q3, q14
+ vst1.8 {q10-q11}, [r8]!
+ veor q13, q5, q15
+ vst1.8 {q12-q13}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+
+ subs r9, #0x80
+ bpl .Lxts_dec_loop
+
+.Lxts_dec_short:
+ adds r9, #0x70
+ bmi .Lxts_dec_done
+
+ vldmia r2, {q5} @ load XTS magic
+ vshr.s64 q7, q8, #63
+ mov r0, sp
+ vand q7, q7, q5
+ vadd.u64 q9, q8, q8
+ vst1.64 {q8}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q9, #63
+ veor q9, q9, q7
+ vand q6, q6, q5
+ vadd.u64 q10, q9, q9
+ vst1.64 {q9}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q10, #63
+ veor q10, q10, q6
+ vand q7, q7, q5
+ vld1.8 {q0}, [r7]!
+ subs r9, #0x10
+ bmi .Lxts_dec_1
+ vadd.u64 q11, q10, q10
+ vst1.64 {q10}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q11, #63
+ veor q11, q11, q7
+ vand q6, q6, q5
+ vld1.8 {q1}, [r7]!
+ subs r9, #0x10
+ bmi .Lxts_dec_2
+ veor q0, q0, q8
+ vadd.u64 q12, q11, q11
+ vst1.64 {q11}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q12, #63
+ veor q12, q12, q6
+ vand q7, q7, q5
+ vld1.8 {q2}, [r7]!
+ subs r9, #0x10
+ bmi .Lxts_dec_3
+ veor q1, q1, q9
+ vadd.u64 q13, q12, q12
+ vst1.64 {q12}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q13, #63
+ veor q13, q13, q7
+ vand q6, q6, q5
+ vld1.8 {q3}, [r7]!
+ subs r9, #0x10
+ bmi .Lxts_dec_4
+ veor q2, q2, q10
+ vadd.u64 q14, q13, q13
+ vst1.64 {q13}, [r0,:128]!
+ vswp d13,d12
+ vshr.s64 q7, q14, #63
+ veor q14, q14, q6
+ vand q7, q7, q5
+ vld1.8 {q4}, [r7]!
+ subs r9, #0x10
+ bmi .Lxts_dec_5
+ veor q3, q3, q11
+ vadd.u64 q15, q14, q14
+ vst1.64 {q14}, [r0,:128]!
+ vswp d15,d14
+ vshr.s64 q6, q15, #63
+ veor q15, q15, q7
+ vand q6, q6, q5
+ vld1.8 {q5}, [r7]!
+ subs r9, #0x10
+ bmi .Lxts_dec_6
+ veor q4, q4, q12
+ sub r9, #0x10
+ vst1.64 {q15}, [r0,:128] @ next round tweak
+
+ vld1.8 {q6}, [r7]!
+ veor q5, q5, q13
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q6, q6, q14
+ mov r5, r1 @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ vld1.64 {q10-q11}, [r0,:128]!
+ veor q0, q0, q8
+ vld1.64 {q12-q13}, [r0,:128]!
+ veor q1, q1, q9
+ veor q8, q6, q10
+ vst1.8 {q0-q1}, [r8]!
+ veor q9, q4, q11
+ vld1.64 {q14}, [r0,:128]!
+ veor q10, q2, q12
+ vst1.8 {q8-q9}, [r8]!
+ veor q11, q7, q13
+ veor q12, q3, q14
+ vst1.8 {q10-q11}, [r8]!
+ vst1.8 {q12}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_6:
+ vst1.64 {q14}, [r0,:128] @ next round tweak
+
+ veor q4, q4, q12
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q5, q5, q13
+ mov r5, r1 @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ vld1.64 {q10-q11}, [r0,:128]!
+ veor q0, q0, q8
+ vld1.64 {q12-q13}, [r0,:128]!
+ veor q1, q1, q9
+ veor q8, q6, q10
+ vst1.8 {q0-q1}, [r8]!
+ veor q9, q4, q11
+ veor q10, q2, q12
+ vst1.8 {q8-q9}, [r8]!
+ veor q11, q7, q13
+ vst1.8 {q10-q11}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_5:
+ vst1.64 {q13}, [r0,:128] @ next round tweak
+
+ veor q3, q3, q11
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q4, q4, q12
+ mov r5, r1 @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ vld1.64 {q10-q11}, [r0,:128]!
+ veor q0, q0, q8
+ vld1.64 {q12}, [r0,:128]!
+ veor q1, q1, q9
+ veor q8, q6, q10
+ vst1.8 {q0-q1}, [r8]!
+ veor q9, q4, q11
+ veor q10, q2, q12
+ vst1.8 {q8-q9}, [r8]!
+ vst1.8 {q10}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_4:
+ vst1.64 {q12}, [r0,:128] @ next round tweak
+
+ veor q2, q2, q10
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q3, q3, q11
+ mov r5, r1 @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ vld1.64 {q10-q11}, [r0,:128]!
+ veor q0, q0, q8
+ veor q1, q1, q9
+ veor q8, q6, q10
+ vst1.8 {q0-q1}, [r8]!
+ veor q9, q4, q11
+ vst1.8 {q8-q9}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_3:
+ vst1.64 {q11}, [r0,:128] @ next round tweak
+
+ veor q1, q1, q9
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q2, q2, q10
+ mov r5, r1 @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ vld1.64 {q10}, [r0,:128]!
+ veor q0, q0, q8
+ veor q1, q1, q9
+ veor q8, q6, q10
+ vst1.8 {q0-q1}, [r8]!
+ vst1.8 {q8}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_2:
+ vst1.64 {q10}, [r0,:128] @ next round tweak
+
+ veor q0, q0, q8
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, r10, #248 @ pass key schedule
+#endif
+ veor q1, q1, q9
+ mov r5, r1 @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {q8-q9}, [r0,:128]!
+ veor q0, q0, q8
+ veor q1, q1, q9
+ vst1.8 {q0-q1}, [r8]!
+
+ vld1.64 {q8}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_1:
+ mov r0, sp
+ veor q0, q8
+ mov r1, sp
+ vst1.8 {q0}, [sp,:128]
+ mov r2, r10
+ mov r4, r3 @ preserve fp
+ mov r5, r2 @ preserve magic
+
+ bl AES_decrypt
+
+ vld1.8 {q0}, [sp,:128]
+ veor q0, q0, q8
+ vst1.8 {q0}, [r8]!
+ mov r3, r4
+ mov r2, r5
+
+ vmov q8, q9 @ next round tweak
+
+.Lxts_dec_done:
+#ifndef XTS_CHAIN_TWEAK
+ adds r9, #0x10
+ beq .Lxts_dec_ret
+
+ @ calculate one round of extra tweak for the stolen ciphertext
+ vldmia r2, {q5}
+ vshr.s64 q6, q8, #63
+ vand q6, q6, q5
+ vadd.u64 q9, q8, q8
+ vswp d13,d12
+ veor q9, q9, q6
+
+ @ perform the final decryption with the last tweak value
+ vld1.8 {q0}, [r7]!
+ mov r0, sp
+ veor q0, q0, q9
+ mov r1, sp
+ vst1.8 {q0}, [sp,:128]
+ mov r2, r10
+ mov r4, r3 @ preserve fp
+
+ bl AES_decrypt
+
+ vld1.8 {q0}, [sp,:128]
+ veor q0, q0, q9
+ vst1.8 {q0}, [r8]
+
+ mov r6, r8
+.Lxts_dec_steal:
+ ldrb r1, [r8]
+ ldrb r0, [r7], #1
+ strb r1, [r8, #0x10]
+ strb r0, [r8], #1
+
+ subs r9, #1
+ bhi .Lxts_dec_steal
+
+ vld1.8 {q0}, [r6]
+ mov r0, sp
+ veor q0, q8
+ mov r1, sp
+ vst1.8 {q0}, [sp,:128]
+ mov r2, r10
+
+ bl AES_decrypt
+
+ vld1.8 {q0}, [sp,:128]
+ veor q0, q0, q8
+ vst1.8 {q0}, [r6]
+ mov r3, r4
+#endif
+
+.Lxts_dec_ret:
+ bic r0, r3, #0xf
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+#ifdef XTS_CHAIN_TWEAK
+ ldr r1, [r3, #0x20+VFP_ABI_FRAME] @ chain tweak
+#endif
+.Lxts_dec_bzero: @ wipe key schedule [if any]
+ vstmia sp!, {q0-q1}
+ cmp sp, r0
+ bne .Lxts_dec_bzero
+
+ mov sp, r3
+#ifdef XTS_CHAIN_TWEAK
+ vst1.8 {q8}, [r1]
+#endif
+ VFP_ABI_POP
+ ldmia sp!, {r4-r10, pc} @ return
+
+.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
+#endif
diff --git a/arch/arm/crypto/aesbs-glue.c b/arch/arm/crypto/aesbs-glue.c
new file mode 100644
index 000000000000..15468fbbdea3
--- /dev/null
+++ b/arch/arm/crypto/aesbs-glue.c
@@ -0,0 +1,434 @@
+/*
+ * linux/arch/arm/crypto/aesbs-glue.c - glue code for NEON bit sliced AES
+ *
+ * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/neon.h>
+#include <crypto/aes.h>
+#include <crypto/ablk_helper.h>
+#include <crypto/algapi.h>
+#include <linux/module.h>
+
+#include "aes_glue.h"
+
+#define BIT_SLICED_KEY_MAXSIZE (128 * (AES_MAXNR - 1) + 2 * AES_BLOCK_SIZE)
+
+struct BS_KEY {
+ struct AES_KEY rk;
+ int converted;
+ u8 __aligned(8) bs[BIT_SLICED_KEY_MAXSIZE];
+} __aligned(8);
+
+asmlinkage void bsaes_enc_key_convert(u8 out[], struct AES_KEY const *in);
+asmlinkage void bsaes_dec_key_convert(u8 out[], struct AES_KEY const *in);
+
+asmlinkage void bsaes_cbc_encrypt(u8 const in[], u8 out[], u32 bytes,
+ struct BS_KEY *key, u8 iv[]);
+
+asmlinkage void bsaes_ctr32_encrypt_blocks(u8 const in[], u8 out[], u32 blocks,
+ struct BS_KEY *key, u8 const iv[]);
+
+asmlinkage void bsaes_xts_encrypt(u8 const in[], u8 out[], u32 bytes,
+ struct BS_KEY *key, u8 tweak[]);
+
+asmlinkage void bsaes_xts_decrypt(u8 const in[], u8 out[], u32 bytes,
+ struct BS_KEY *key, u8 tweak[]);
+
+struct aesbs_cbc_ctx {
+ struct AES_KEY enc;
+ struct BS_KEY dec;
+};
+
+struct aesbs_ctr_ctx {
+ struct BS_KEY enc;
+};
+
+struct aesbs_xts_ctx {
+ struct BS_KEY enc;
+ struct BS_KEY dec;
+ struct AES_KEY twkey;
+};
+
+static int aesbs_cbc_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+ unsigned int key_len)
+{
+ struct aesbs_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
+ int bits = key_len * 8;
+
+ if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc)) {
+ tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+ return -EINVAL;
+ }
+ ctx->dec.rk = ctx->enc;
+ private_AES_set_decrypt_key(in_key, bits, &ctx->dec.rk);
+ ctx->dec.converted = 0;
+ return 0;
+}
+
+static int aesbs_ctr_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+ unsigned int key_len)
+{
+ struct aesbs_ctr_ctx *ctx = crypto_tfm_ctx(tfm);
+ int bits = key_len * 8;
+
+ if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc.rk)) {
+ tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+ return -EINVAL;
+ }
+ ctx->enc.converted = 0;
+ return 0;
+}
+
+static int aesbs_xts_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+ unsigned int key_len)
+{
+ struct aesbs_xts_ctx *ctx = crypto_tfm_ctx(tfm);
+ int bits = key_len * 4;
+
+ if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc.rk)) {
+ tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+ return -EINVAL;
+ }
+ ctx->dec.rk = ctx->enc.rk;
+ private_AES_set_decrypt_key(in_key, bits, &ctx->dec.rk);
+ private_AES_set_encrypt_key(in_key + key_len / 2, bits, &ctx->twkey);
+ ctx->enc.converted = ctx->dec.converted = 0;
+ return 0;
+}
+
+static int aesbs_cbc_encrypt(struct blkcipher_desc *desc,
+ struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct aesbs_cbc_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ struct blkcipher_walk walk;
+ int err;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt(desc, &walk);
+
+ while (walk.nbytes) {
+ u32 blocks = walk.nbytes / AES_BLOCK_SIZE;
+ u8 *src = walk.src.virt.addr;
+
+ if (walk.dst.virt.addr == walk.src.virt.addr) {
+ u8 *iv = walk.iv;
+
+ do {
+ crypto_xor(src, iv, AES_BLOCK_SIZE);
+ AES_encrypt(src, src, &ctx->enc);
+ iv = src;
+ src += AES_BLOCK_SIZE;
+ } while (--blocks);
+ memcpy(walk.iv, iv, AES_BLOCK_SIZE);
+ } else {
+ u8 *dst = walk.dst.virt.addr;
+
+ do {
+ crypto_xor(walk.iv, src, AES_BLOCK_SIZE);
+ AES_encrypt(walk.iv, dst, &ctx->enc);
+ memcpy(walk.iv, dst, AES_BLOCK_SIZE);
+ src += AES_BLOCK_SIZE;
+ dst += AES_BLOCK_SIZE;
+ } while (--blocks);
+ }
+ err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE);
+ }
+ return err;
+}
+
+static int aesbs_cbc_decrypt(struct blkcipher_desc *desc,
+ struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct aesbs_cbc_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ struct blkcipher_walk walk;
+ int err;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
+
+ while ((walk.nbytes / AES_BLOCK_SIZE) >= 8) {
+ kernel_neon_begin();
+ bsaes_cbc_encrypt(walk.src.virt.addr, walk.dst.virt.addr,
+ walk.nbytes, &ctx->dec, walk.iv);
+ kernel_neon_end();
+ err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE);
+ }
+ while (walk.nbytes) {
+ u32 blocks = walk.nbytes / AES_BLOCK_SIZE;
+ u8 *dst = walk.dst.virt.addr;
+ u8 *src = walk.src.virt.addr;
+ u8 bk[2][AES_BLOCK_SIZE];
+ u8 *iv = walk.iv;
+
+ do {
+ if (walk.dst.virt.addr == walk.src.virt.addr)
+ memcpy(bk[blocks & 1], src, AES_BLOCK_SIZE);
+
+ AES_decrypt(src, dst, &ctx->dec.rk);
+ crypto_xor(dst, iv, AES_BLOCK_SIZE);
+
+ if (walk.dst.virt.addr == walk.src.virt.addr)
+ iv = bk[blocks & 1];
+ else
+ iv = src;
+
+ dst += AES_BLOCK_SIZE;
+ src += AES_BLOCK_SIZE;
+ } while (--blocks);
+ err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE);
+ }
+ return err;
+}
+
+static void inc_be128_ctr(__be32 ctr[], u32 addend)
+{
+ int i;
+
+ for (i = 3; i >= 0; i--, addend = 1) {
+ u32 n = be32_to_cpu(ctr[i]) + addend;
+
+ ctr[i] = cpu_to_be32(n);
+ if (n >= addend)
+ break;
+ }
+}
+
+static int aesbs_ctr_encrypt(struct blkcipher_desc *desc,
+ struct scatterlist *dst, struct scatterlist *src,
+ unsigned int nbytes)
+{
+ struct aesbs_ctr_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ struct blkcipher_walk walk;
+ u32 blocks;
+ int err;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
+
+ while ((blocks = walk.nbytes / AES_BLOCK_SIZE)) {
+ u32 tail = walk.nbytes % AES_BLOCK_SIZE;
+ __be32 *ctr = (__be32 *)walk.iv;
+ u32 headroom = UINT_MAX - be32_to_cpu(ctr[3]);
+
+ /* avoid 32 bit counter overflow in the NEON code */
+ if (unlikely(headroom < blocks)) {
+ blocks = headroom + 1;
+ tail = walk.nbytes - blocks * AES_BLOCK_SIZE;
+ }
+ kernel_neon_begin();
+ bsaes_ctr32_encrypt_blocks(walk.src.virt.addr,
+ walk.dst.virt.addr, blocks,
+ &ctx->enc, walk.iv);
+ kernel_neon_end();
+ inc_be128_ctr(ctr, blocks);
+
+ nbytes -= blocks * AES_BLOCK_SIZE;
+ if (nbytes && nbytes == tail && nbytes <= AES_BLOCK_SIZE)
+ break;
+
+ err = blkcipher_walk_done(desc, &walk, tail);
+ }
+ if (walk.nbytes) {
+ u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
+ u8 *tsrc = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;
+ u8 ks[AES_BLOCK_SIZE];
+
+ AES_encrypt(walk.iv, ks, &ctx->enc.rk);
+ if (tdst != tsrc)
+ memcpy(tdst, tsrc, nbytes);
+ crypto_xor(tdst, ks, nbytes);
+ err = blkcipher_walk_done(desc, &walk, 0);
+ }
+ return err;
+}
+
+static int aesbs_xts_encrypt(struct blkcipher_desc *desc,
+ struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct aesbs_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ struct blkcipher_walk walk;
+ int err;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
+
+ /* generate the initial tweak */
+ AES_encrypt(walk.iv, walk.iv, &ctx->twkey);
+
+ while (walk.nbytes) {
+ kernel_neon_begin();
+ bsaes_xts_encrypt(walk.src.virt.addr, walk.dst.virt.addr,
+ walk.nbytes, &ctx->enc, walk.iv);
+ kernel_neon_end();
+ err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE);
+ }
+ return err;
+}
+
+static int aesbs_xts_decrypt(struct blkcipher_desc *desc,
+ struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct aesbs_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ struct blkcipher_walk walk;
+ int err;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
+
+ /* generate the initial tweak */
+ AES_encrypt(walk.iv, walk.iv, &ctx->twkey);
+
+ while (walk.nbytes) {
+ kernel_neon_begin();
+ bsaes_xts_decrypt(walk.src.virt.addr, walk.dst.virt.addr,
+ walk.nbytes, &ctx->dec, walk.iv);
+ kernel_neon_end();
+ err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE);
+ }
+ return err;
+}
+
+static struct crypto_alg aesbs_algs[] = { {
+ .cra_name = "__cbc-aes-neonbs",
+ .cra_driver_name = "__driver-cbc-aes-neonbs",
+ .cra_priority = 0,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct aesbs_cbc_ctx),
+ .cra_alignmask = 7,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_blkcipher = {
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = aesbs_cbc_set_key,
+ .encrypt = aesbs_cbc_encrypt,
+ .decrypt = aesbs_cbc_decrypt,
+ },
+}, {
+ .cra_name = "__ctr-aes-neonbs",
+ .cra_driver_name = "__driver-ctr-aes-neonbs",
+ .cra_priority = 0,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct aesbs_ctr_ctx),
+ .cra_alignmask = 7,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_blkcipher = {
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = aesbs_ctr_set_key,
+ .encrypt = aesbs_ctr_encrypt,
+ .decrypt = aesbs_ctr_encrypt,
+ },
+}, {
+ .cra_name = "__xts-aes-neonbs",
+ .cra_driver_name = "__driver-xts-aes-neonbs",
+ .cra_priority = 0,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct aesbs_xts_ctx),
+ .cra_alignmask = 7,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_blkcipher = {
+ .min_keysize = 2 * AES_MIN_KEY_SIZE,
+ .max_keysize = 2 * AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = aesbs_xts_set_key,
+ .encrypt = aesbs_xts_encrypt,
+ .decrypt = aesbs_xts_decrypt,
+ },
+}, {
+ .cra_name = "cbc(aes)",
+ .cra_driver_name = "cbc-aes-neonbs",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct async_helper_ctx),
+ .cra_alignmask = 7,
+ .cra_type = &crypto_ablkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_init = ablk_init,
+ .cra_exit = ablk_exit,
+ .cra_ablkcipher = {
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = ablk_set_key,
+ .encrypt = __ablk_encrypt,
+ .decrypt = ablk_decrypt,
+ }
+}, {
+ .cra_name = "ctr(aes)",
+ .cra_driver_name = "ctr-aes-neonbs",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct async_helper_ctx),
+ .cra_alignmask = 7,
+ .cra_type = &crypto_ablkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_init = ablk_init,
+ .cra_exit = ablk_exit,
+ .cra_ablkcipher = {
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = ablk_set_key,
+ .encrypt = ablk_encrypt,
+ .decrypt = ablk_decrypt,
+ }
+}, {
+ .cra_name = "xts(aes)",
+ .cra_driver_name = "xts-aes-neonbs",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct async_helper_ctx),
+ .cra_alignmask = 7,
+ .cra_type = &crypto_ablkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_init = ablk_init,
+ .cra_exit = ablk_exit,
+ .cra_ablkcipher = {
+ .min_keysize = 2 * AES_MIN_KEY_SIZE,
+ .max_keysize = 2 * AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = ablk_set_key,
+ .encrypt = ablk_encrypt,
+ .decrypt = ablk_decrypt,
+ }
+} };
+
+static int __init aesbs_mod_init(void)
+{
+ if (!cpu_has_neon())
+ return -ENODEV;
+
+ return crypto_register_algs(aesbs_algs, ARRAY_SIZE(aesbs_algs));
+}
+
+static void __exit aesbs_mod_exit(void)
+{
+ crypto_unregister_algs(aesbs_algs, ARRAY_SIZE(aesbs_algs));
+}
+
+module_init(aesbs_mod_init);
+module_exit(aesbs_mod_exit);
+
+MODULE_DESCRIPTION("Bit sliced AES in CBC/CTR/XTS modes using NEON");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL");
diff --git a/arch/arm/crypto/bsaes-armv7.pl b/arch/arm/crypto/bsaes-armv7.pl
new file mode 100644
index 000000000000..be068db960ee
--- /dev/null
+++ b/arch/arm/crypto/bsaes-armv7.pl
@@ -0,0 +1,2467 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Specific modes and adaptation for Linux kernel by Ard Biesheuvel
+# <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
+# granted.
+# ====================================================================
+
+# Bit-sliced AES for ARM NEON
+#
+# February 2012.
+#
+# This implementation is direct adaptation of bsaes-x86_64 module for
+# ARM NEON. Except that this module is endian-neutral [in sense that
+# it can be compiled for either endianness] by courtesy of vld1.8's
+# neutrality. Initial version doesn't implement interface to OpenSSL,
+# only low-level primitives and unsupported entry points, just enough
+# to collect performance results, which for Cortex-A8 core are:
+#
+# encrypt 19.5 cycles per byte processed with 128-bit key
+# decrypt 22.1 cycles per byte processed with 128-bit key
+# key conv. 440 cycles per 128-bit key/0.18 of 8x block
+#
+# Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
+# which is [much] worse than anticipated (for further details see
+# http://www.openssl.org/~appro/Snapdragon-S4.html).
+#
+# Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
+# manages in 20.0 cycles].
+#
+# When comparing to x86_64 results keep in mind that NEON unit is
+# [mostly] single-issue and thus can't [fully] benefit from
+# instruction-level parallelism. And when comparing to aes-armv4
+# results keep in mind key schedule conversion overhead (see
+# bsaes-x86_64.pl for further details)...
+#
+# <appro@openssl.org>
+
+# April-August 2013
+#
+# Add CBC, CTR and XTS subroutines, adapt for kernel use.
+#
+# <ard.biesheuvel@linaro.org>
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+my ($inp,$out,$len,$key)=("r0","r1","r2","r3");
+my @XMM=map("q$_",(0..15));
+
+{
+my ($key,$rounds,$const)=("r4","r5","r6");
+
+sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
+sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
+
+sub Sbox {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
+my @b=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+ &InBasisChange (@b);
+ &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
+ &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
+}
+
+sub InBasisChange {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
+my @b=@_[0..7];
+$code.=<<___;
+ veor @b[2], @b[2], @b[1]
+ veor @b[5], @b[5], @b[6]
+ veor @b[3], @b[3], @b[0]
+ veor @b[6], @b[6], @b[2]
+ veor @b[5], @b[5], @b[0]
+
+ veor @b[6], @b[6], @b[3]
+ veor @b[3], @b[3], @b[7]
+ veor @b[7], @b[7], @b[5]
+ veor @b[3], @b[3], @b[4]
+ veor @b[4], @b[4], @b[5]
+
+ veor @b[2], @b[2], @b[7]
+ veor @b[3], @b[3], @b[1]
+ veor @b[1], @b[1], @b[5]
+___
+}
+
+sub OutBasisChange {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
+my @b=@_[0..7];
+$code.=<<___;
+ veor @b[0], @b[0], @b[6]
+ veor @b[1], @b[1], @b[4]
+ veor @b[4], @b[4], @b[6]
+ veor @b[2], @b[2], @b[0]
+ veor @b[6], @b[6], @b[1]
+
+ veor @b[1], @b[1], @b[5]
+ veor @b[5], @b[5], @b[3]
+ veor @b[3], @b[3], @b[7]
+ veor @b[7], @b[7], @b[5]
+ veor @b[2], @b[2], @b[5]
+
+ veor @b[4], @b[4], @b[7]
+___
+}
+
+sub InvSbox {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
+my @b=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+ &InvInBasisChange (@b);
+ &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
+ &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
+}
+
+sub InvInBasisChange { # OutBasisChange in reverse (with twist)
+my @b=@_[5,1,2,6,3,7,0,4];
+$code.=<<___
+ veor @b[1], @b[1], @b[7]
+ veor @b[4], @b[4], @b[7]
+
+ veor @b[7], @b[7], @b[5]
+ veor @b[1], @b[1], @b[3]
+ veor @b[2], @b[2], @b[5]
+ veor @b[3], @b[3], @b[7]
+
+ veor @b[6], @b[6], @b[1]
+ veor @b[2], @b[2], @b[0]
+ veor @b[5], @b[5], @b[3]
+ veor @b[4], @b[4], @b[6]
+ veor @b[0], @b[0], @b[6]
+ veor @b[1], @b[1], @b[4]
+___
+}
+
+sub InvOutBasisChange { # InBasisChange in reverse
+my @b=@_[2,5,7,3,6,1,0,4];
+$code.=<<___;
+ veor @b[1], @b[1], @b[5]
+ veor @b[2], @b[2], @b[7]
+
+ veor @b[3], @b[3], @b[1]
+ veor @b[4], @b[4], @b[5]
+ veor @b[7], @b[7], @b[5]
+ veor @b[3], @b[3], @b[4]
+ veor @b[5], @b[5], @b[0]
+ veor @b[3], @b[3], @b[7]
+ veor @b[6], @b[6], @b[2]
+ veor @b[2], @b[2], @b[1]
+ veor @b[6], @b[6], @b[3]
+
+ veor @b[3], @b[3], @b[0]
+ veor @b[5], @b[5], @b[6]
+___
+}
+
+sub Mul_GF4 {
+#;*************************************************************
+#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
+#;*************************************************************
+my ($x0,$x1,$y0,$y1,$t0,$t1)=@_;
+$code.=<<___;
+ veor $t0, $y0, $y1
+ vand $t0, $t0, $x0
+ veor $x0, $x0, $x1
+ vand $t1, $x1, $y0
+ vand $x0, $x0, $y1
+ veor $x1, $t1, $t0
+ veor $x0, $x0, $t1
+___
+}
+
+sub Mul_GF4_N { # not used, see next subroutine
+# multiply and scale by N
+my ($x0,$x1,$y0,$y1,$t0)=@_;
+$code.=<<___;
+ veor $t0, $y0, $y1
+ vand $t0, $t0, $x0
+ veor $x0, $x0, $x1
+ vand $x1, $x1, $y0
+ vand $x0, $x0, $y1
+ veor $x1, $x1, $x0
+ veor $x0, $x0, $t0
+___
+}
+
+sub Mul_GF4_N_GF4 {
+# interleaved Mul_GF4_N and Mul_GF4
+my ($x0,$x1,$y0,$y1,$t0,
+ $x2,$x3,$y2,$y3,$t1)=@_;
+$code.=<<___;
+ veor $t0, $y0, $y1
+ veor $t1, $y2, $y3
+ vand $t0, $t0, $x0
+ vand $t1, $t1, $x2
+ veor $x0, $x0, $x1
+ veor $x2, $x2, $x3
+ vand $x1, $x1, $y0
+ vand $x3, $x3, $y2
+ vand $x0, $x0, $y1
+ vand $x2, $x2, $y3
+ veor $x1, $x1, $x0
+ veor $x2, $x2, $x3
+ veor $x0, $x0, $t0
+ veor $x3, $x3, $t1
+___
+}
+sub Mul_GF16_2 {
+my @x=@_[0..7];
+my @y=@_[8..11];
+my @t=@_[12..15];
+$code.=<<___;
+ veor @t[0], @x[0], @x[2]
+ veor @t[1], @x[1], @x[3]
+___
+ &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2..3]);
+$code.=<<___;
+ veor @y[0], @y[0], @y[2]
+ veor @y[1], @y[1], @y[3]
+___
+ Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
+ @x[2], @x[3], @y[2], @y[3], @t[2]);
+$code.=<<___;
+ veor @x[0], @x[0], @t[0]
+ veor @x[2], @x[2], @t[0]
+ veor @x[1], @x[1], @t[1]
+ veor @x[3], @x[3], @t[1]
+
+ veor @t[0], @x[4], @x[6]
+ veor @t[1], @x[5], @x[7]
+___
+ &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
+ @x[6], @x[7], @y[2], @y[3], @t[2]);
+$code.=<<___;
+ veor @y[0], @y[0], @y[2]
+ veor @y[1], @y[1], @y[3]
+___
+ &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[2..3]);
+$code.=<<___;
+ veor @x[4], @x[4], @t[0]
+ veor @x[6], @x[6], @t[0]
+ veor @x[5], @x[5], @t[1]
+ veor @x[7], @x[7], @t[1]
+___
+}
+sub Inv_GF256 {
+#;********************************************************************
+#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
+#;********************************************************************
+my @x=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+# direct optimizations from hardware
+$code.=<<___;
+ veor @t[3], @x[4], @x[6]
+ veor @t[2], @x[5], @x[7]
+ veor @t[1], @x[1], @x[3]
+ veor @s[1], @x[7], @x[6]
+ vmov @t[0], @t[2]
+ veor @s[0], @x[0], @x[2]
+
+ vorr @t[2], @t[2], @t[1]
+ veor @s[3], @t[3], @t[0]
+ vand @s[2], @t[3], @s[0]
+ vorr @t[3], @t[3], @s[0]
+ veor @s[0], @s[0], @t[1]
+ vand @t[0], @t[0], @t[1]
+ veor @t[1], @x[3], @x[2]
+ vand @s[3], @s[3], @s[0]
+ vand @s[1], @s[1], @t[1]
+ veor @t[1], @x[4], @x[5]
+ veor @s[0], @x[1], @x[0]
+ veor @t[3], @t[3], @s[1]
+ veor @t[2], @t[2], @s[1]
+ vand @s[1], @t[1], @s[0]
+ vorr @t[1], @t[1], @s[0]
+ veor @t[3], @t[3], @s[3]
+ veor @t[0], @t[0], @s[1]
+ veor @t[2], @t[2], @s[2]
+ veor @t[1], @t[1], @s[3]
+ veor @t[0], @t[0], @s[2]
+ vand @s[0], @x[7], @x[3]
+ veor @t[1], @t[1], @s[2]
+ vand @s[1], @x[6], @x[2]
+ vand @s[2], @x[5], @x[1]
+ vorr @s[3], @x[4], @x[0]
+ veor @t[3], @t[3], @s[0]
+ veor @t[1], @t[1], @s[2]
+ veor @t[0], @t[0], @s[3]
+ veor @t[2], @t[2], @s[1]
+
+ @ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
+
+ @ new smaller inversion
+
+ vand @s[2], @t[3], @t[1]
+ vmov @s[0], @t[0]
+
+ veor @s[1], @t[2], @s[2]
+ veor @s[3], @t[0], @s[2]
+ veor @s[2], @t[0], @s[2] @ @s[2]=@s[3]
+
+ vbsl @s[1], @t[1], @t[0]
+ vbsl @s[3], @t[3], @t[2]
+ veor @t[3], @t[3], @t[2]
+
+ vbsl @s[0], @s[1], @s[2]
+ vbsl @t[0], @s[2], @s[1]
+
+ vand @s[2], @s[0], @s[3]
+ veor @t[1], @t[1], @t[0]
+
+ veor @s[2], @s[2], @t[3]
+___
+# output in s3, s2, s1, t1
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
+ &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
+
+### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
+}
+
+# AES linear components
+
+sub ShiftRows {
+my @x=@_[0..7];
+my @t=@_[8..11];
+my $mask=pop;
+$code.=<<___;
+ vldmia $key!, {@t[0]-@t[3]}
+ veor @t[0], @t[0], @x[0]
+ veor @t[1], @t[1], @x[1]
+ vtbl.8 `&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)`
+ vldmia $key!, {@t[0]}
+ veor @t[2], @t[2], @x[2]
+ vtbl.8 `&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)`
+ vldmia $key!, {@t[1]}
+ veor @t[3], @t[3], @x[3]
+ vtbl.8 `&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)`
+ vldmia $key!, {@t[2]}
+ vtbl.8 `&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)`
+ vldmia $key!, {@t[3]}
+ veor @t[0], @t[0], @x[4]
+ veor @t[1], @t[1], @x[5]
+ vtbl.8 `&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)`
+ veor @t[2], @t[2], @x[6]
+ vtbl.8 `&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)`
+ veor @t[3], @t[3], @x[7]
+ vtbl.8 `&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)`
+ vtbl.8 `&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)`
+ vtbl.8 `&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)`
+___
+}
+
+sub MixColumns {
+# modified to emit output in order suitable for feeding back to aesenc[last]
+my @x=@_[0..7];
+my @t=@_[8..15];
+my $inv=@_[16]; # optional
+$code.=<<___;
+ vext.8 @t[0], @x[0], @x[0], #12 @ x0 <<< 32
+ vext.8 @t[1], @x[1], @x[1], #12
+ veor @x[0], @x[0], @t[0] @ x0 ^ (x0 <<< 32)
+ vext.8 @t[2], @x[2], @x[2], #12
+ veor @x[1], @x[1], @t[1]
+ vext.8 @t[3], @x[3], @x[3], #12
+ veor @x[2], @x[2], @t[2]
+ vext.8 @t[4], @x[4], @x[4], #12
+ veor @x[3], @x[3], @t[3]
+ vext.8 @t[5], @x[5], @x[5], #12
+ veor @x[4], @x[4], @t[4]
+ vext.8 @t[6], @x[6], @x[6], #12
+ veor @x[5], @x[5], @t[5]
+ vext.8 @t[7], @x[7], @x[7], #12
+ veor @x[6], @x[6], @t[6]
+
+ veor @t[1], @t[1], @x[0]
+ veor @x[7], @x[7], @t[7]
+ vext.8 @x[0], @x[0], @x[0], #8 @ (x0 ^ (x0 <<< 32)) <<< 64)
+ veor @t[2], @t[2], @x[1]
+ veor @t[0], @t[0], @x[7]
+ veor @t[1], @t[1], @x[7]
+ vext.8 @x[1], @x[1], @x[1], #8
+ veor @t[5], @t[5], @x[4]
+ veor @x[0], @x[0], @t[0]
+ veor @t[6], @t[6], @x[5]
+ veor @x[1], @x[1], @t[1]
+ vext.8 @t[0], @x[4], @x[4], #8
+ veor @t[4], @t[4], @x[3]
+ vext.8 @t[1], @x[5], @x[5], #8
+ veor @t[7], @t[7], @x[6]
+ vext.8 @x[4], @x[3], @x[3], #8
+ veor @t[3], @t[3], @x[2]
+ vext.8 @x[5], @x[7], @x[7], #8
+ veor @t[4], @t[4], @x[7]
+ vext.8 @x[3], @x[6], @x[6], #8
+ veor @t[3], @t[3], @x[7]
+ vext.8 @x[6], @x[2], @x[2], #8
+ veor @x[7], @t[1], @t[5]
+___
+$code.=<<___ if (!$inv);
+ veor @x[2], @t[0], @t[4]
+ veor @x[4], @x[4], @t[3]
+ veor @x[5], @x[5], @t[7]
+ veor @x[3], @x[3], @t[6]
+ @ vmov @x[2], @t[0]
+ veor @x[6], @x[6], @t[2]
+ @ vmov @x[7], @t[1]
+___
+$code.=<<___ if ($inv);
+ veor @t[3], @t[3], @x[4]
+ veor @x[5], @x[5], @t[7]
+ veor @x[2], @x[3], @t[6]
+ veor @x[3], @t[0], @t[4]
+ veor @x[4], @x[6], @t[2]
+ vmov @x[6], @t[3]
+ @ vmov @x[7], @t[1]
+___
+}
+
+sub InvMixColumns_orig {
+my @x=@_[0..7];
+my @t=@_[8..15];
+
+$code.=<<___;
+ @ multiplication by 0x0e
+ vext.8 @t[7], @x[7], @x[7], #12
+ vmov @t[2], @x[2]
+ veor @x[2], @x[2], @x[5] @ 2 5
+ veor @x[7], @x[7], @x[5] @ 7 5
+ vext.8 @t[0], @x[0], @x[0], #12
+ vmov @t[5], @x[5]
+ veor @x[5], @x[5], @x[0] @ 5 0 [1]
+ veor @x[0], @x[0], @x[1] @ 0 1
+ vext.8 @t[1], @x[1], @x[1], #12
+ veor @x[1], @x[1], @x[2] @ 1 25
+ veor @x[0], @x[0], @x[6] @ 01 6 [2]
+ vext.8 @t[3], @x[3], @x[3], #12
+ veor @x[1], @x[1], @x[3] @ 125 3 [4]
+ veor @x[2], @x[2], @x[0] @ 25 016 [3]
+ veor @x[3], @x[3], @x[7] @ 3 75
+ veor @x[7], @x[7], @x[6] @ 75 6 [0]
+ vext.8 @t[6], @x[6], @x[6], #12
+ vmov @t[4], @x[4]
+ veor @x[6], @x[6], @x[4] @ 6 4
+ veor @x[4], @x[4], @x[3] @ 4 375 [6]
+ veor @x[3], @x[3], @x[7] @ 375 756=36
+ veor @x[6], @x[6], @t[5] @ 64 5 [7]
+ veor @x[3], @x[3], @t[2] @ 36 2
+ vext.8 @t[5], @t[5], @t[5], #12
+ veor @x[3], @x[3], @t[4] @ 362 4 [5]
+___
+ my @y = @x[7,5,0,2,1,3,4,6];
+$code.=<<___;
+ @ multiplication by 0x0b
+ veor @y[1], @y[1], @y[0]
+ veor @y[0], @y[0], @t[0]
+ vext.8 @t[2], @t[2], @t[2], #12
+ veor @y[1], @y[1], @t[1]
+ veor @y[0], @y[0], @t[5]
+ vext.8 @t[4], @t[4], @t[4], #12
+ veor @y[1], @y[1], @t[6]
+ veor @y[0], @y[0], @t[7]
+ veor @t[7], @t[7], @t[6] @ clobber t[7]
+
+ veor @y[3], @y[3], @t[0]
+ veor @y[1], @y[1], @y[0]
+ vext.8 @t[0], @t[0], @t[0], #12
+ veor @y[2], @y[2], @t[1]
+ veor @y[4], @y[4], @t[1]
+ vext.8 @t[1], @t[1], @t[1], #12
+ veor @y[2], @y[2], @t[2]
+ veor @y[3], @y[3], @t[2]
+ veor @y[5], @y[5], @t[2]
+ veor @y[2], @y[2], @t[7]
+ vext.8 @t[2], @t[2], @t[2], #12
+ veor @y[3], @y[3], @t[3]
+ veor @y[6], @y[6], @t[3]
+ veor @y[4], @y[4], @t[3]
+ veor @y[7], @y[7], @t[4]
+ vext.8 @t[3], @t[3], @t[3], #12
+ veor @y[5], @y[5], @t[4]
+ veor @y[7], @y[7], @t[7]
+ veor @t[7], @t[7], @t[5] @ clobber t[7] even more
+ veor @y[3], @y[3], @t[5]
+ veor @y[4], @y[4], @t[4]
+
+ veor @y[5], @y[5], @t[7]
+ vext.8 @t[4], @t[4], @t[4], #12
+ veor @y[6], @y[6], @t[7]
+ veor @y[4], @y[4], @t[7]
+
+ veor @t[7], @t[7], @t[5]
+ vext.8 @t[5], @t[5], @t[5], #12
+
+ @ multiplication by 0x0d
+ veor @y[4], @y[4], @y[7]
+ veor @t[7], @t[7], @t[6] @ restore t[7]
+ veor @y[7], @y[7], @t[4]
+ vext.8 @t[6], @t[6], @t[6], #12
+ veor @y[2], @y[2], @t[0]
+ veor @y[7], @y[7], @t[5]
+ vext.8 @t[7], @t[7], @t[7], #12
+ veor @y[2], @y[2], @t[2]
+
+ veor @y[3], @y[3], @y[1]
+ veor @y[1], @y[1], @t[1]
+ veor @y[0], @y[0], @t[0]
+ veor @y[3], @y[3], @t[0]
+ veor @y[1], @y[1], @t[5]
+ veor @y[0], @y[0], @t[5]
+ vext.8 @t[0], @t[0], @t[0], #12
+ veor @y[1], @y[1], @t[7]
+ veor @y[0], @y[0], @t[6]
+ veor @y[3], @y[3], @y[1]
+ veor @y[4], @y[4], @t[1]
+ vext.8 @t[1], @t[1], @t[1], #12
+
+ veor @y[7], @y[7], @t[7]
+ veor @y[4], @y[4], @t[2]
+ veor @y[5], @y[5], @t[2]
+ veor @y[2], @y[2], @t[6]
+ veor @t[6], @t[6], @t[3] @ clobber t[6]
+ vext.8 @t[2], @t[2], @t[2], #12
+ veor @y[4], @y[4], @y[7]
+ veor @y[3], @y[3], @t[6]
+
+ veor @y[6], @y[6], @t[6]
+ veor @y[5], @y[5], @t[5]
+ vext.8 @t[5], @t[5], @t[5], #12
+ veor @y[6], @y[6], @t[4]
+ vext.8 @t[4], @t[4], @t[4], #12
+ veor @y[5], @y[5], @t[6]
+ veor @y[6], @y[6], @t[7]
+ vext.8 @t[7], @t[7], @t[7], #12
+ veor @t[6], @t[6], @t[3] @ restore t[6]
+ vext.8 @t[3], @t[3], @t[3], #12
+
+ @ multiplication by 0x09
+ veor @y[4], @y[4], @y[1]
+ veor @t[1], @t[1], @y[1] @ t[1]=y[1]
+ veor @t[0], @t[0], @t[5] @ clobber t[0]
+ vext.8 @t[6], @t[6], @t[6], #12
+ veor @t[1], @t[1], @t[5]
+ veor @y[3], @y[3], @t[0]
+ veor @t[0], @t[0], @y[0] @ t[0]=y[0]
+ veor @t[1], @t[1], @t[6]
+ veor @t[6], @t[6], @t[7] @ clobber t[6]
+ veor @y[4], @y[4], @t[1]
+ veor @y[7], @y[7], @t[4]
+ veor @y[6], @y[6], @t[3]
+ veor @y[5], @y[5], @t[2]
+ veor @t[4], @t[4], @y[4] @ t[4]=y[4]
+ veor @t[3], @t[3], @y[3] @ t[3]=y[3]
+ veor @t[5], @t[5], @y[5] @ t[5]=y[5]
+ veor @t[2], @t[2], @y[2] @ t[2]=y[2]
+ veor @t[3], @t[3], @t[7]
+ veor @XMM[5], @t[5], @t[6]
+ veor @XMM[6], @t[6], @y[6] @ t[6]=y[6]
+ veor @XMM[2], @t[2], @t[6]
+ veor @XMM[7], @t[7], @y[7] @ t[7]=y[7]
+
+ vmov @XMM[0], @t[0]
+ vmov @XMM[1], @t[1]
+ @ vmov @XMM[2], @t[2]
+ vmov @XMM[3], @t[3]
+ vmov @XMM[4], @t[4]
+ @ vmov @XMM[5], @t[5]
+ @ vmov @XMM[6], @t[6]
+ @ vmov @XMM[7], @t[7]
+___
+}
+
+sub InvMixColumns {
+my @x=@_[0..7];
+my @t=@_[8..15];
+
+# Thanks to Jussi Kivilinna for providing pointer to
+#
+# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
+# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
+# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
+# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
+
+$code.=<<___;
+ @ multiplication by 0x05-0x00-0x04-0x00
+ vext.8 @t[0], @x[0], @x[0], #8
+ vext.8 @t[6], @x[6], @x[6], #8
+ vext.8 @t[7], @x[7], @x[7], #8
+ veor @t[0], @t[0], @x[0]
+ vext.8 @t[1], @x[1], @x[1], #8
+ veor @t[6], @t[6], @x[6]
+ vext.8 @t[2], @x[2], @x[2], #8
+ veor @t[7], @t[7], @x[7]
+ vext.8 @t[3], @x[3], @x[3], #8
+ veor @t[1], @t[1], @x[1]
+ vext.8 @t[4], @x[4], @x[4], #8
+ veor @t[2], @t[2], @x[2]
+ vext.8 @t[5], @x[5], @x[5], #8
+ veor @t[3], @t[3], @x[3]
+ veor @t[4], @t[4], @x[4]
+ veor @t[5], @t[5], @x[5]
+
+ veor @x[0], @x[0], @t[6]
+ veor @x[1], @x[1], @t[6]
+ veor @x[2], @x[2], @t[0]
+ veor @x[4], @x[4], @t[2]
+ veor @x[3], @x[3], @t[1]
+ veor @x[1], @x[1], @t[7]
+ veor @x[2], @x[2], @t[7]
+ veor @x[4], @x[4], @t[6]
+ veor @x[5], @x[5], @t[3]
+ veor @x[3], @x[3], @t[6]
+ veor @x[6], @x[6], @t[4]
+ veor @x[4], @x[4], @t[7]
+ veor @x[5], @x[5], @t[7]
+ veor @x[7], @x[7], @t[5]
+___
+ &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
+}
+
+sub swapmove {
+my ($a,$b,$n,$mask,$t)=@_;
+$code.=<<___;
+ vshr.u64 $t, $b, #$n
+ veor $t, $t, $a
+ vand $t, $t, $mask
+ veor $a, $a, $t
+ vshl.u64 $t, $t, #$n
+ veor $b, $b, $t
+___
+}
+sub swapmove2x {
+my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
+$code.=<<___;
+ vshr.u64 $t0, $b0, #$n
+ vshr.u64 $t1, $b1, #$n
+ veor $t0, $t0, $a0
+ veor $t1, $t1, $a1
+ vand $t0, $t0, $mask
+ vand $t1, $t1, $mask
+ veor $a0, $a0, $t0
+ vshl.u64 $t0, $t0, #$n
+ veor $a1, $a1, $t1
+ vshl.u64 $t1, $t1, #$n
+ veor $b0, $b0, $t0
+ veor $b1, $b1, $t1
+___
+}
+
+sub bitslice {
+my @x=reverse(@_[0..7]);
+my ($t0,$t1,$t2,$t3)=@_[8..11];
+$code.=<<___;
+ vmov.i8 $t0,#0x55 @ compose .LBS0
+ vmov.i8 $t1,#0x33 @ compose .LBS1
+___
+ &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
+ &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+$code.=<<___;
+ vmov.i8 $t0,#0x0f @ compose .LBS2
+___
+ &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
+ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+
+ &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
+ &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
+}
+
+$code.=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+
+# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
+# define VFP_ABI_POP vldmia sp!,{d8-d15}
+# define VFP_ABI_FRAME 0x40
+#else
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+# define VFP_ABI_FRAME 0
+# define BSAES_ASM_EXTENDED_KEY
+# define XTS_CHAIN_TWEAK
+# define __ARM_ARCH__ 7
+#endif
+
+#ifdef __thumb__
+# define adrl adr
+#endif
+
+#if __ARM_ARCH__>=7
+.text
+.syntax unified @ ARMv7-capable assembler is expected to handle this
+#ifdef __thumb2__
+.thumb
+#else
+.code 32
+#endif
+
+.fpu neon
+
+.type _bsaes_decrypt8,%function
+.align 4
+_bsaes_decrypt8:
+ adr $const,_bsaes_decrypt8
+ vldmia $key!, {@XMM[9]} @ round 0 key
+ add $const,$const,#.LM0ISR-_bsaes_decrypt8
+
+ vldmia $const!, {@XMM[8]} @ .LM0ISR
+ veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
+ veor @XMM[11], @XMM[1], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+ veor @XMM[12], @XMM[2], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+ veor @XMM[13], @XMM[3], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
+ veor @XMM[14], @XMM[4], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
+ veor @XMM[15], @XMM[5], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
+ veor @XMM[10], @XMM[6], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
+ veor @XMM[11], @XMM[7], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+ vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+___
+ &bitslice (@XMM[0..7, 8..11]);
+$code.=<<___;
+ sub $rounds,$rounds,#1
+ b .Ldec_sbox
+.align 4
+.Ldec_loop:
+___
+ &ShiftRows (@XMM[0..7, 8..12]);
+$code.=".Ldec_sbox:\n";
+ &InvSbox (@XMM[0..7, 8..15]);
+$code.=<<___;
+ subs $rounds,$rounds,#1
+ bcc .Ldec_done
+___
+ &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
+$code.=<<___;
+ vldmia $const, {@XMM[12]} @ .LISR
+ ite eq @ Thumb2 thing, sanity check in ARM
+ addeq $const,$const,#0x10
+ bne .Ldec_loop
+ vldmia $const, {@XMM[12]} @ .LISRM0
+ b .Ldec_loop
+.align 4
+.Ldec_done:
+___
+ &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
+$code.=<<___;
+ vldmia $key, {@XMM[8]} @ last round key
+ veor @XMM[6], @XMM[6], @XMM[8]
+ veor @XMM[4], @XMM[4], @XMM[8]
+ veor @XMM[2], @XMM[2], @XMM[8]
+ veor @XMM[7], @XMM[7], @XMM[8]
+ veor @XMM[3], @XMM[3], @XMM[8]
+ veor @XMM[5], @XMM[5], @XMM[8]
+ veor @XMM[0], @XMM[0], @XMM[8]
+ veor @XMM[1], @XMM[1], @XMM[8]
+ bx lr
+.size _bsaes_decrypt8,.-_bsaes_decrypt8
+
+.type _bsaes_const,%object
+.align 6
+_bsaes_const:
+.LM0ISR: @ InvShiftRows constants
+ .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISR:
+ .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
+.LISRM0:
+ .quad 0x01040b0e0205080f, 0x0306090c00070a0d
+.LM0SR: @ ShiftRows constants
+ .quad 0x0a0e02060f03070b, 0x0004080c05090d01
+.LSR:
+ .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+ .quad 0x0304090e00050a0f, 0x01060b0c0207080d
+.LM0:
+ .quad 0x02060a0e03070b0f, 0x0004080c0105090d
+.LREVM0SR:
+ .quad 0x090d01050c000408, 0x03070b0f060a0e02
+.asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align 6
+.size _bsaes_const,.-_bsaes_const
+
+.type _bsaes_encrypt8,%function
+.align 4
+_bsaes_encrypt8:
+ adr $const,_bsaes_encrypt8
+ vldmia $key!, {@XMM[9]} @ round 0 key
+ sub $const,$const,#_bsaes_encrypt8-.LM0SR
+
+ vldmia $const!, {@XMM[8]} @ .LM0SR
+_bsaes_encrypt8_alt:
+ veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
+ veor @XMM[11], @XMM[1], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+ veor @XMM[12], @XMM[2], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+ veor @XMM[13], @XMM[3], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
+ veor @XMM[14], @XMM[4], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
+ veor @XMM[15], @XMM[5], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
+ veor @XMM[10], @XMM[6], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
+ veor @XMM[11], @XMM[7], @XMM[9]
+ vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+ vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+ vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+_bsaes_encrypt8_bitslice:
+___
+ &bitslice (@XMM[0..7, 8..11]);
+$code.=<<___;
+ sub $rounds,$rounds,#1
+ b .Lenc_sbox
+.align 4
+.Lenc_loop:
+___
+ &ShiftRows (@XMM[0..7, 8..12]);
+$code.=".Lenc_sbox:\n";
+ &Sbox (@XMM[0..7, 8..15]);
+$code.=<<___;
+ subs $rounds,$rounds,#1
+ bcc .Lenc_done
+___
+ &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
+$code.=<<___;
+ vldmia $const, {@XMM[12]} @ .LSR
+ ite eq @ Thumb2 thing, samity check in ARM
+ addeq $const,$const,#0x10
+ bne .Lenc_loop
+ vldmia $const, {@XMM[12]} @ .LSRM0
+ b .Lenc_loop
+.align 4
+.Lenc_done:
+___
+ # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
+ &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
+$code.=<<___;
+ vldmia $key, {@XMM[8]} @ last round key
+ veor @XMM[4], @XMM[4], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[8]
+ veor @XMM[3], @XMM[3], @XMM[8]
+ veor @XMM[7], @XMM[7], @XMM[8]
+ veor @XMM[2], @XMM[2], @XMM[8]
+ veor @XMM[5], @XMM[5], @XMM[8]
+ veor @XMM[0], @XMM[0], @XMM[8]
+ veor @XMM[1], @XMM[1], @XMM[8]
+ bx lr
+.size _bsaes_encrypt8,.-_bsaes_encrypt8
+___
+}
+{
+my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6");
+
+sub bitslice_key {
+my @x=reverse(@_[0..7]);
+my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
+
+ &swapmove (@x[0,1],1,$bs0,$t2,$t3);
+$code.=<<___;
+ @ &swapmove(@x[2,3],1,$t0,$t2,$t3);
+ vmov @x[2], @x[0]
+ vmov @x[3], @x[1]
+___
+ #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+
+ &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
+$code.=<<___;
+ @ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+ vmov @x[4], @x[0]
+ vmov @x[6], @x[2]
+ vmov @x[5], @x[1]
+ vmov @x[7], @x[3]
+___
+ &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
+ &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
+}
+
+$code.=<<___;
+.type _bsaes_key_convert,%function
+.align 4
+_bsaes_key_convert:
+ adr $const,_bsaes_key_convert
+ vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key
+ sub $const,$const,#_bsaes_key_convert-.LM0
+ vld1.8 {@XMM[15]}, [$inp]! @ load round 1 key
+
+ vmov.i8 @XMM[8], #0x01 @ bit masks
+ vmov.i8 @XMM[9], #0x02
+ vmov.i8 @XMM[10], #0x04
+ vmov.i8 @XMM[11], #0x08
+ vmov.i8 @XMM[12], #0x10
+ vmov.i8 @XMM[13], #0x20
+ vldmia $const, {@XMM[14]} @ .LM0
+
+#ifdef __ARMEL__
+ vrev32.8 @XMM[7], @XMM[7]
+ vrev32.8 @XMM[15], @XMM[15]
+#endif
+ sub $rounds,$rounds,#1
+ vstmia $out!, {@XMM[7]} @ save round 0 key
+ b .Lkey_loop
+
+.align 4
+.Lkey_loop:
+ vtbl.8 `&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])`
+ vtbl.8 `&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])`
+ vmov.i8 @XMM[6], #0x40
+ vmov.i8 @XMM[15], #0x80
+
+ vtst.8 @XMM[0], @XMM[7], @XMM[8]
+ vtst.8 @XMM[1], @XMM[7], @XMM[9]
+ vtst.8 @XMM[2], @XMM[7], @XMM[10]
+ vtst.8 @XMM[3], @XMM[7], @XMM[11]
+ vtst.8 @XMM[4], @XMM[7], @XMM[12]
+ vtst.8 @XMM[5], @XMM[7], @XMM[13]
+ vtst.8 @XMM[6], @XMM[7], @XMM[6]
+ vtst.8 @XMM[7], @XMM[7], @XMM[15]
+ vld1.8 {@XMM[15]}, [$inp]! @ load next round key
+ vmvn @XMM[0], @XMM[0] @ "pnot"
+ vmvn @XMM[1], @XMM[1]
+ vmvn @XMM[5], @XMM[5]
+ vmvn @XMM[6], @XMM[6]
+#ifdef __ARMEL__
+ vrev32.8 @XMM[15], @XMM[15]
+#endif
+ subs $rounds,$rounds,#1
+ vstmia $out!,{@XMM[0]-@XMM[7]} @ write bit-sliced round key
+ bne .Lkey_loop
+
+ vmov.i8 @XMM[7],#0x63 @ compose .L63
+ @ don't save last round key
+ bx lr
+.size _bsaes_key_convert,.-_bsaes_key_convert
+___
+}
+
+if (0) { # following four functions are unsupported interface
+ # used for benchmarking...
+$code.=<<___;
+.globl bsaes_enc_key_convert
+.type bsaes_enc_key_convert,%function
+.align 4
+bsaes_enc_key_convert:
+ stmdb sp!,{r4-r6,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+
+ ldr r5,[$inp,#240] @ pass rounds
+ mov r4,$inp @ pass key
+ mov r12,$out @ pass key schedule
+ bl _bsaes_key_convert
+ veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
+ vstmia r12, {@XMM[7]} @ save last round key
+
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r6,pc}
+.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
+
+.globl bsaes_encrypt_128
+.type bsaes_encrypt_128,%function
+.align 4
+bsaes_encrypt_128:
+ stmdb sp!,{r4-r6,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+.Lenc128_loop:
+ vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
+ vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
+ mov r4,$key @ pass the key
+ vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
+ mov r5,#10 @ pass rounds
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
+
+ bl _bsaes_encrypt8
+
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ vst1.8 {@XMM[4]}, [$out]!
+ vst1.8 {@XMM[6]}, [$out]!
+ vst1.8 {@XMM[3]}, [$out]!
+ vst1.8 {@XMM[7]}, [$out]!
+ vst1.8 {@XMM[2]}, [$out]!
+ subs $len,$len,#0x80
+ vst1.8 {@XMM[5]}, [$out]!
+ bhi .Lenc128_loop
+
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r6,pc}
+.size bsaes_encrypt_128,.-bsaes_encrypt_128
+
+.globl bsaes_dec_key_convert
+.type bsaes_dec_key_convert,%function
+.align 4
+bsaes_dec_key_convert:
+ stmdb sp!,{r4-r6,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+
+ ldr r5,[$inp,#240] @ pass rounds
+ mov r4,$inp @ pass key
+ mov r12,$out @ pass key schedule
+ bl _bsaes_key_convert
+ vldmia $out, {@XMM[6]}
+ vstmia r12, {@XMM[15]} @ save last round key
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
+ vstmia $out, {@XMM[7]}
+
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r6,pc}
+.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
+
+.globl bsaes_decrypt_128
+.type bsaes_decrypt_128,%function
+.align 4
+bsaes_decrypt_128:
+ stmdb sp!,{r4-r6,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+.Ldec128_loop:
+ vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
+ vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
+ mov r4,$key @ pass the key
+ vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
+ mov r5,#10 @ pass rounds
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
+
+ bl _bsaes_decrypt8
+
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ vst1.8 {@XMM[6]}, [$out]!
+ vst1.8 {@XMM[4]}, [$out]!
+ vst1.8 {@XMM[2]}, [$out]!
+ vst1.8 {@XMM[7]}, [$out]!
+ vst1.8 {@XMM[3]}, [$out]!
+ subs $len,$len,#0x80
+ vst1.8 {@XMM[5]}, [$out]!
+ bhi .Ldec128_loop
+
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r6,pc}
+.size bsaes_decrypt_128,.-bsaes_decrypt_128
+___
+}
+{
+my ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10));
+my ($keysched)=("sp");
+
+$code.=<<___;
+.extern AES_cbc_encrypt
+.extern AES_decrypt
+
+.global bsaes_cbc_encrypt
+.type bsaes_cbc_encrypt,%function
+.align 5
+bsaes_cbc_encrypt:
+#ifndef __KERNEL__
+ cmp $len, #128
+#ifndef __thumb__
+ blo AES_cbc_encrypt
+#else
+ bhs 1f
+ b AES_cbc_encrypt
+1:
+#endif
+#endif
+
+ @ it is up to the caller to make sure we are called with enc == 0
+
+ mov ip, sp
+ stmdb sp!, {r4-r10, lr}
+ VFP_ABI_PUSH
+ ldr $ivp, [ip] @ IV is 1st arg on the stack
+ mov $len, $len, lsr#4 @ len in 16 byte blocks
+ sub sp, #0x10 @ scratch space to carry over the IV
+ mov $fp, sp @ save sp
+
+ ldr $rounds, [$key, #240] @ get # of rounds
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
+ add r12, #`128-32` @ sifze of bit-slices key schedule
+
+ @ populate the key schedule
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ mov sp, r12 @ sp is $keysched
+ bl _bsaes_key_convert
+ vldmia $keysched, {@XMM[6]}
+ vstmia r12, {@XMM[15]} @ save last round key
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
+ vstmia $keysched, {@XMM[7]}
+#else
+ ldr r12, [$key, #244]
+ eors r12, #1
+ beq 0f
+
+ @ populate the key schedule
+ str r12, [$key, #244]
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ add r12, $key, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ add r4, $key, #248
+ vldmia r4, {@XMM[6]}
+ vstmia r12, {@XMM[15]} @ save last round key
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
+ vstmia r4, {@XMM[7]}
+
+.align 2
+0:
+#endif
+
+ vld1.8 {@XMM[15]}, [$ivp] @ load IV
+ b .Lcbc_dec_loop
+
+.align 4
+.Lcbc_dec_loop:
+ subs $len, $len, #0x8
+ bmi .Lcbc_dec_loop_finish
+
+ vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
+ vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
+#ifndef BSAES_ASM_EXTENDED_KEY
+ mov r4, $keysched @ pass the key
+#else
+ add r4, $key, #248
+#endif
+ vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
+ mov r5, $rounds
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]
+ sub $inp, $inp, #0x60
+ vstmia $fp, {@XMM[15]} @ put aside IV
+
+ bl _bsaes_decrypt8
+
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
+ veor @XMM[1], @XMM[1], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[9]
+ vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
+ veor @XMM[4], @XMM[4], @XMM[10]
+ veor @XMM[2], @XMM[2], @XMM[11]
+ vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
+ veor @XMM[7], @XMM[7], @XMM[12]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ veor @XMM[3], @XMM[3], @XMM[13]
+ vst1.8 {@XMM[6]}, [$out]!
+ veor @XMM[5], @XMM[5], @XMM[14]
+ vst1.8 {@XMM[4]}, [$out]!
+ vst1.8 {@XMM[2]}, [$out]!
+ vst1.8 {@XMM[7]}, [$out]!
+ vst1.8 {@XMM[3]}, [$out]!
+ vst1.8 {@XMM[5]}, [$out]!
+
+ b .Lcbc_dec_loop
+
+.Lcbc_dec_loop_finish:
+ adds $len, $len, #8
+ beq .Lcbc_dec_done
+
+ vld1.8 {@XMM[0]}, [$inp]! @ load input
+ cmp $len, #2
+ blo .Lcbc_dec_one
+ vld1.8 {@XMM[1]}, [$inp]!
+#ifndef BSAES_ASM_EXTENDED_KEY
+ mov r4, $keysched @ pass the key
+#else
+ add r4, $key, #248
+#endif
+ mov r5, $rounds
+ vstmia $fp, {@XMM[15]} @ put aside IV
+ beq .Lcbc_dec_two
+ vld1.8 {@XMM[2]}, [$inp]!
+ cmp $len, #4
+ blo .Lcbc_dec_three
+ vld1.8 {@XMM[3]}, [$inp]!
+ beq .Lcbc_dec_four
+ vld1.8 {@XMM[4]}, [$inp]!
+ cmp $len, #6
+ blo .Lcbc_dec_five
+ vld1.8 {@XMM[5]}, [$inp]!
+ beq .Lcbc_dec_six
+ vld1.8 {@XMM[6]}, [$inp]!
+ sub $inp, $inp, #0x70
+
+ bl _bsaes_decrypt8
+
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
+ veor @XMM[1], @XMM[1], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[9]
+ vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
+ veor @XMM[4], @XMM[4], @XMM[10]
+ veor @XMM[2], @XMM[2], @XMM[11]
+ vld1.8 {@XMM[15]}, [$inp]!
+ veor @XMM[7], @XMM[7], @XMM[12]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ veor @XMM[3], @XMM[3], @XMM[13]
+ vst1.8 {@XMM[6]}, [$out]!
+ vst1.8 {@XMM[4]}, [$out]!
+ vst1.8 {@XMM[2]}, [$out]!
+ vst1.8 {@XMM[7]}, [$out]!
+ vst1.8 {@XMM[3]}, [$out]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_six:
+ sub $inp, $inp, #0x60
+ bl _bsaes_decrypt8
+ vldmia $fp,{@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
+ veor @XMM[1], @XMM[1], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[9]
+ vld1.8 {@XMM[12]}, [$inp]!
+ veor @XMM[4], @XMM[4], @XMM[10]
+ veor @XMM[2], @XMM[2], @XMM[11]
+ vld1.8 {@XMM[15]}, [$inp]!
+ veor @XMM[7], @XMM[7], @XMM[12]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ vst1.8 {@XMM[6]}, [$out]!
+ vst1.8 {@XMM[4]}, [$out]!
+ vst1.8 {@XMM[2]}, [$out]!
+ vst1.8 {@XMM[7]}, [$out]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_five:
+ sub $inp, $inp, #0x50
+ bl _bsaes_decrypt8
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
+ veor @XMM[1], @XMM[1], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[9]
+ vld1.8 {@XMM[15]}, [$inp]!
+ veor @XMM[4], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ veor @XMM[2], @XMM[2], @XMM[11]
+ vst1.8 {@XMM[6]}, [$out]!
+ vst1.8 {@XMM[4]}, [$out]!
+ vst1.8 {@XMM[2]}, [$out]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_four:
+ sub $inp, $inp, #0x40
+ bl _bsaes_decrypt8
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[10]}, [$inp]!
+ veor @XMM[1], @XMM[1], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[9]
+ vld1.8 {@XMM[15]}, [$inp]!
+ veor @XMM[4], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ vst1.8 {@XMM[6]}, [$out]!
+ vst1.8 {@XMM[4]}, [$out]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_three:
+ sub $inp, $inp, #0x30
+ bl _bsaes_decrypt8
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[15]}, [$inp]!
+ veor @XMM[1], @XMM[1], @XMM[8]
+ veor @XMM[6], @XMM[6], @XMM[9]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ vst1.8 {@XMM[6]}, [$out]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_two:
+ sub $inp, $inp, #0x20
+ bl _bsaes_decrypt8
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[8]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vld1.8 {@XMM[15]}, [$inp]! @ reload input
+ veor @XMM[1], @XMM[1], @XMM[8]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_one:
+ sub $inp, $inp, #0x10
+ mov $rounds, $out @ save original out pointer
+ mov $out, $fp @ use the iv scratch space as out buffer
+ mov r2, $key
+ vmov @XMM[4],@XMM[15] @ just in case ensure that IV
+ vmov @XMM[5],@XMM[0] @ and input are preserved
+ bl AES_decrypt
+ vld1.8 {@XMM[0]}, [$fp,:64] @ load result
+ veor @XMM[0], @XMM[0], @XMM[4] @ ^= IV
+ vmov @XMM[15], @XMM[5] @ @XMM[5] holds input
+ vst1.8 {@XMM[0]}, [$rounds] @ write output
+
+.Lcbc_dec_done:
+#ifndef BSAES_ASM_EXTENDED_KEY
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+.Lcbc_dec_bzero: @ wipe key schedule [if any]
+ vstmia $keysched!, {q0-q1}
+ cmp $keysched, $fp
+ bne .Lcbc_dec_bzero
+#endif
+
+ mov sp, $fp
+ add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
+ vst1.8 {@XMM[15]}, [$ivp] @ return IV
+ VFP_ABI_POP
+ ldmia sp!, {r4-r10, pc}
+.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
+___
+}
+{
+my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10)));
+my $const = "r6"; # shared with _bsaes_encrypt8_alt
+my $keysched = "sp";
+
+$code.=<<___;
+.extern AES_encrypt
+.global bsaes_ctr32_encrypt_blocks
+.type bsaes_ctr32_encrypt_blocks,%function
+.align 5
+bsaes_ctr32_encrypt_blocks:
+ cmp $len, #8 @ use plain AES for
+ blo .Lctr_enc_short @ small sizes
+
+ mov ip, sp
+ stmdb sp!, {r4-r10, lr}
+ VFP_ABI_PUSH
+ ldr $ctr, [ip] @ ctr is 1st arg on the stack
+ sub sp, sp, #0x10 @ scratch space to carry over the ctr
+ mov $fp, sp @ save sp
+
+ ldr $rounds, [$key, #240] @ get # of rounds
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
+ add r12, #`128-32` @ size of bit-sliced key schedule
+
+ @ populate the key schedule
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ mov sp, r12 @ sp is $keysched
+ bl _bsaes_key_convert
+ veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
+ vstmia r12, {@XMM[7]} @ save last round key
+
+ vld1.8 {@XMM[0]}, [$ctr] @ load counter
+ add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr
+ vldmia $keysched, {@XMM[4]} @ load round0 key
+#else
+ ldr r12, [$key, #244]
+ eors r12, #1
+ beq 0f
+
+ @ populate the key schedule
+ str r12, [$key, #244]
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ add r12, $key, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
+ vstmia r12, {@XMM[7]} @ save last round key
+
+.align 2
+0: add r12, $key, #248
+ vld1.8 {@XMM[0]}, [$ctr] @ load counter
+ adrl $ctr, .LREVM0SR @ borrow $ctr
+ vldmia r12, {@XMM[4]} @ load round0 key
+ sub sp, #0x10 @ place for adjusted round0 key
+#endif
+
+ vmov.i32 @XMM[8],#1 @ compose 1<<96
+ veor @XMM[9],@XMM[9],@XMM[9]
+ vrev32.8 @XMM[0],@XMM[0]
+ vext.8 @XMM[8],@XMM[9],@XMM[8],#4
+ vrev32.8 @XMM[4],@XMM[4]
+ vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
+ vstmia $keysched, {@XMM[4]} @ save adjusted round0 key
+ b .Lctr_enc_loop
+
+.align 4
+.Lctr_enc_loop:
+ vadd.u32 @XMM[10], @XMM[8], @XMM[9] @ compose 3<<96
+ vadd.u32 @XMM[1], @XMM[0], @XMM[8] @ +1
+ vadd.u32 @XMM[2], @XMM[0], @XMM[9] @ +2
+ vadd.u32 @XMM[3], @XMM[0], @XMM[10] @ +3
+ vadd.u32 @XMM[4], @XMM[1], @XMM[10]
+ vadd.u32 @XMM[5], @XMM[2], @XMM[10]
+ vadd.u32 @XMM[6], @XMM[3], @XMM[10]
+ vadd.u32 @XMM[7], @XMM[4], @XMM[10]
+ vadd.u32 @XMM[10], @XMM[5], @XMM[10] @ next counter
+
+ @ Borrow prologue from _bsaes_encrypt8 to use the opportunity
+ @ to flip byte order in 32-bit counter
+
+ vldmia $keysched, {@XMM[9]} @ load round0 key
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, $keysched, #0x10 @ pass next round key
+#else
+ add r4, $key, #`248+16`
+#endif
+ vldmia $ctr, {@XMM[8]} @ .LREVM0SR
+ mov r5, $rounds @ pass rounds
+ vstmia $fp, {@XMM[10]} @ save next counter
+ sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants
+
+ bl _bsaes_encrypt8_alt
+
+ subs $len, $len, #8
+ blo .Lctr_enc_loop_done
+
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ load input
+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
+ veor @XMM[0], @XMM[8]
+ veor @XMM[1], @XMM[9]
+ vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
+ veor @XMM[4], @XMM[10]
+ veor @XMM[6], @XMM[11]
+ vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
+ veor @XMM[3], @XMM[12]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
+ veor @XMM[7], @XMM[13]
+ veor @XMM[2], @XMM[14]
+ vst1.8 {@XMM[4]}, [$out]!
+ veor @XMM[5], @XMM[15]
+ vst1.8 {@XMM[6]}, [$out]!
+ vmov.i32 @XMM[8], #1 @ compose 1<<96
+ vst1.8 {@XMM[3]}, [$out]!
+ veor @XMM[9], @XMM[9], @XMM[9]
+ vst1.8 {@XMM[7]}, [$out]!
+ vext.8 @XMM[8], @XMM[9], @XMM[8], #4
+ vst1.8 {@XMM[2]}, [$out]!
+ vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
+ vst1.8 {@XMM[5]}, [$out]!
+ vldmia $fp, {@XMM[0]} @ load counter
+
+ bne .Lctr_enc_loop
+ b .Lctr_enc_done
+
+.align 4
+.Lctr_enc_loop_done:
+ add $len, $len, #8
+ vld1.8 {@XMM[8]}, [$inp]! @ load input
+ veor @XMM[0], @XMM[8]
+ vst1.8 {@XMM[0]}, [$out]! @ write output
+ cmp $len, #2
+ blo .Lctr_enc_done
+ vld1.8 {@XMM[9]}, [$inp]!
+ veor @XMM[1], @XMM[9]
+ vst1.8 {@XMM[1]}, [$out]!
+ beq .Lctr_enc_done
+ vld1.8 {@XMM[10]}, [$inp]!
+ veor @XMM[4], @XMM[10]
+ vst1.8 {@XMM[4]}, [$out]!
+ cmp $len, #4
+ blo .Lctr_enc_done
+ vld1.8 {@XMM[11]}, [$inp]!
+ veor @XMM[6], @XMM[11]
+ vst1.8 {@XMM[6]}, [$out]!
+ beq .Lctr_enc_done
+ vld1.8 {@XMM[12]}, [$inp]!
+ veor @XMM[3], @XMM[12]
+ vst1.8 {@XMM[3]}, [$out]!
+ cmp $len, #6
+ blo .Lctr_enc_done
+ vld1.8 {@XMM[13]}, [$inp]!
+ veor @XMM[7], @XMM[13]
+ vst1.8 {@XMM[7]}, [$out]!
+ beq .Lctr_enc_done
+ vld1.8 {@XMM[14]}, [$inp]
+ veor @XMM[2], @XMM[14]
+ vst1.8 {@XMM[2]}, [$out]!
+
+.Lctr_enc_done:
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+#ifndef BSAES_ASM_EXTENDED_KEY
+.Lctr_enc_bzero: @ wipe key schedule [if any]
+ vstmia $keysched!, {q0-q1}
+ cmp $keysched, $fp
+ bne .Lctr_enc_bzero
+#else
+ vstmia $keysched, {q0-q1}
+#endif
+
+ mov sp, $fp
+ add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
+ VFP_ABI_POP
+ ldmia sp!, {r4-r10, pc} @ return
+
+.align 4
+.Lctr_enc_short:
+ ldr ip, [sp] @ ctr pointer is passed on stack
+ stmdb sp!, {r4-r8, lr}
+
+ mov r4, $inp @ copy arguments
+ mov r5, $out
+ mov r6, $len
+ mov r7, $key
+ ldr r8, [ip, #12] @ load counter LSW
+ vld1.8 {@XMM[1]}, [ip] @ load whole counter value
+#ifdef __ARMEL__
+ rev r8, r8
+#endif
+ sub sp, sp, #0x10
+ vst1.8 {@XMM[1]}, [sp,:64] @ copy counter value
+ sub sp, sp, #0x10
+
+.Lctr_enc_short_loop:
+ add r0, sp, #0x10 @ input counter value
+ mov r1, sp @ output on the stack
+ mov r2, r7 @ key
+
+ bl AES_encrypt
+
+ vld1.8 {@XMM[0]}, [r4]! @ load input
+ vld1.8 {@XMM[1]}, [sp,:64] @ load encrypted counter
+ add r8, r8, #1
+#ifdef __ARMEL__
+ rev r0, r8
+ str r0, [sp, #0x1c] @ next counter value
+#else
+ str r8, [sp, #0x1c] @ next counter value
+#endif
+ veor @XMM[0],@XMM[0],@XMM[1]
+ vst1.8 {@XMM[0]}, [r5]! @ store output
+ subs r6, r6, #1
+ bne .Lctr_enc_short_loop
+
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+ vstmia sp!, {q0-q1}
+
+ ldmia sp!, {r4-r8, pc}
+.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
+___
+}
+{
+######################################################################
+# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+# const AES_KEY *key1, const AES_KEY *key2,
+# const unsigned char iv[16]);
+#
+my ($inp,$out,$len,$key,$rounds,$magic,$fp)=(map("r$_",(7..10,1..3)));
+my $const="r6"; # returned by _bsaes_key_convert
+my $twmask=@XMM[5];
+my @T=@XMM[6..7];
+
+$code.=<<___;
+.globl bsaes_xts_encrypt
+.type bsaes_xts_encrypt,%function
+.align 4
+bsaes_xts_encrypt:
+ mov ip, sp
+ stmdb sp!, {r4-r10, lr} @ 0x20
+ VFP_ABI_PUSH
+ mov r6, sp @ future $fp
+
+ mov $inp, r0
+ mov $out, r1
+ mov $len, r2
+ mov $key, r3
+
+ sub r0, sp, #0x10 @ 0x10
+ bic r0, #0xf @ align at 16 bytes
+ mov sp, r0
+
+#ifdef XTS_CHAIN_TWEAK
+ ldr r0, [ip] @ pointer to input tweak
+#else
+ @ generate initial tweak
+ ldr r0, [ip, #4] @ iv[]
+ mov r1, sp
+ ldr r2, [ip, #0] @ key2
+ bl AES_encrypt
+ mov r0,sp @ pointer to initial tweak
+#endif
+
+ ldr $rounds, [$key, #240] @ get # of rounds
+ mov $fp, r6
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
+ @ add r12, #`128-32` @ size of bit-sliced key schedule
+ sub r12, #`32+16` @ place for tweak[9]
+
+ @ populate the key schedule
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ mov sp, r12
+ add r12, #0x90 @ pass key schedule
+ bl _bsaes_key_convert
+ veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
+ vstmia r12, {@XMM[7]} @ save last round key
+#else
+ ldr r12, [$key, #244]
+ eors r12, #1
+ beq 0f
+
+ str r12, [$key, #244]
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ add r12, $key, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key
+ vstmia r12, {@XMM[7]}
+
+.align 2
+0: sub sp, #0x90 @ place for tweak[9]
+#endif
+
+ vld1.8 {@XMM[8]}, [r0] @ initial tweak
+ adr $magic, .Lxts_magic
+
+ subs $len, #0x80
+ blo .Lxts_enc_short
+ b .Lxts_enc_loop
+
+.align 4
+.Lxts_enc_loop:
+ vldmia $magic, {$twmask} @ load XTS magic
+ vshr.s64 @T[0], @XMM[8], #63
+ mov r0, sp
+ vand @T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+ vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
+ vst1.64 {@XMM[$i-1]}, [r0,:128]!
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+ vshr.s64 @T[1], @XMM[$i], #63
+ veor @XMM[$i], @XMM[$i], @T[0]
+ vand @T[1], @T[1], $twmask
+___
+ @T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+ vld1.8 {@XMM[$i-10]}, [$inp]!
+___
+$code.=<<___ if ($i>=11);
+ veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+ vadd.u64 @XMM[8], @XMM[15], @XMM[15]
+ vst1.64 {@XMM[15]}, [r0,:128]!
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+ veor @XMM[8], @XMM[8], @T[0]
+ vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
+ veor @XMM[5], @XMM[5], @XMM[13]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[6], @XMM[6], @XMM[14]
+ mov r5, $rounds @ pass rounds
+ veor @XMM[7], @XMM[7], @XMM[15]
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[6], @XMM[11]
+ vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]!
+ veor @XMM[10], @XMM[3], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ veor @XMM[11], @XMM[7], @XMM[13]
+ veor @XMM[12], @XMM[2], @XMM[14]
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
+ veor @XMM[13], @XMM[5], @XMM[15]
+ vst1.8 {@XMM[12]-@XMM[13]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+
+ subs $len, #0x80
+ bpl .Lxts_enc_loop
+
+.Lxts_enc_short:
+ adds $len, #0x70
+ bmi .Lxts_enc_done
+
+ vldmia $magic, {$twmask} @ load XTS magic
+ vshr.s64 @T[0], @XMM[8], #63
+ mov r0, sp
+ vand @T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+ vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
+ vst1.64 {@XMM[$i-1]}, [r0,:128]!
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+ vshr.s64 @T[1], @XMM[$i], #63
+ veor @XMM[$i], @XMM[$i], @T[0]
+ vand @T[1], @T[1], $twmask
+___
+ @T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+ vld1.8 {@XMM[$i-10]}, [$inp]!
+ subs $len, #0x10
+ bmi .Lxts_enc_`$i-9`
+___
+$code.=<<___ if ($i>=11);
+ veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+ sub $len, #0x10
+ vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak
+
+ vld1.8 {@XMM[6]}, [$inp]!
+ veor @XMM[5], @XMM[5], @XMM[13]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[6], @XMM[6], @XMM[14]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[6], @XMM[11]
+ vld1.64 {@XMM[14]}, [r0,:128]!
+ veor @XMM[10], @XMM[3], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ veor @XMM[11], @XMM[7], @XMM[13]
+ veor @XMM[12], @XMM[2], @XMM[14]
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
+ vst1.8 {@XMM[12]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_6:
+ vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak
+
+ veor @XMM[4], @XMM[4], @XMM[12]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[5], @XMM[5], @XMM[13]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[6], @XMM[11]
+ veor @XMM[10], @XMM[3], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ veor @XMM[11], @XMM[7], @XMM[13]
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+
+@ put this in range for both ARM and Thumb mode adr instructions
+.align 5
+.Lxts_magic:
+ .quad 1, 0x87
+
+.align 5
+.Lxts_enc_5:
+ vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak
+
+ veor @XMM[3], @XMM[3], @XMM[11]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[4], @XMM[4], @XMM[12]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[6], @XMM[11]
+ veor @XMM[10], @XMM[3], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ vst1.8 {@XMM[10]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_4:
+ vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak
+
+ veor @XMM[2], @XMM[2], @XMM[10]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[3], @XMM[3], @XMM[11]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[6], @XMM[11]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_3:
+ vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak
+
+ veor @XMM[1], @XMM[1], @XMM[9]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[2], @XMM[2], @XMM[10]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
+ vld1.64 {@XMM[10]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[4], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ vst1.8 {@XMM[8]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_2:
+ vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak
+
+ veor @XMM[0], @XMM[0], @XMM[8]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[1], @XMM[1], @XMM[9]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_encrypt8
+
+ vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_enc_done
+.align 4
+.Lxts_enc_1:
+ mov r0, sp
+ veor @XMM[0], @XMM[8]
+ mov r1, sp
+ vst1.8 {@XMM[0]}, [sp,:128]
+ mov r2, $key
+ mov r4, $fp @ preserve fp
+
+ bl AES_encrypt
+
+ vld1.8 {@XMM[0]}, [sp,:128]
+ veor @XMM[0], @XMM[0], @XMM[8]
+ vst1.8 {@XMM[0]}, [$out]!
+ mov $fp, r4
+
+ vmov @XMM[8], @XMM[9] @ next round tweak
+
+.Lxts_enc_done:
+#ifndef XTS_CHAIN_TWEAK
+ adds $len, #0x10
+ beq .Lxts_enc_ret
+ sub r6, $out, #0x10
+
+.Lxts_enc_steal:
+ ldrb r0, [$inp], #1
+ ldrb r1, [$out, #-0x10]
+ strb r0, [$out, #-0x10]
+ strb r1, [$out], #1
+
+ subs $len, #1
+ bhi .Lxts_enc_steal
+
+ vld1.8 {@XMM[0]}, [r6]
+ mov r0, sp
+ veor @XMM[0], @XMM[0], @XMM[8]
+ mov r1, sp
+ vst1.8 {@XMM[0]}, [sp,:128]
+ mov r2, $key
+ mov r4, $fp @ preserve fp
+
+ bl AES_encrypt
+
+ vld1.8 {@XMM[0]}, [sp,:128]
+ veor @XMM[0], @XMM[0], @XMM[8]
+ vst1.8 {@XMM[0]}, [r6]
+ mov $fp, r4
+#endif
+
+.Lxts_enc_ret:
+ bic r0, $fp, #0xf
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+#ifdef XTS_CHAIN_TWEAK
+ ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
+#endif
+.Lxts_enc_bzero: @ wipe key schedule [if any]
+ vstmia sp!, {q0-q1}
+ cmp sp, r0
+ bne .Lxts_enc_bzero
+
+ mov sp, $fp
+#ifdef XTS_CHAIN_TWEAK
+ vst1.8 {@XMM[8]}, [r1]
+#endif
+ VFP_ABI_POP
+ ldmia sp!, {r4-r10, pc} @ return
+
+.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
+
+.globl bsaes_xts_decrypt
+.type bsaes_xts_decrypt,%function
+.align 4
+bsaes_xts_decrypt:
+ mov ip, sp
+ stmdb sp!, {r4-r10, lr} @ 0x20
+ VFP_ABI_PUSH
+ mov r6, sp @ future $fp
+
+ mov $inp, r0
+ mov $out, r1
+ mov $len, r2
+ mov $key, r3
+
+ sub r0, sp, #0x10 @ 0x10
+ bic r0, #0xf @ align at 16 bytes
+ mov sp, r0
+
+#ifdef XTS_CHAIN_TWEAK
+ ldr r0, [ip] @ pointer to input tweak
+#else
+ @ generate initial tweak
+ ldr r0, [ip, #4] @ iv[]
+ mov r1, sp
+ ldr r2, [ip, #0] @ key2
+ bl AES_encrypt
+ mov r0, sp @ pointer to initial tweak
+#endif
+
+ ldr $rounds, [$key, #240] @ get # of rounds
+ mov $fp, r6
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
+ @ add r12, #`128-32` @ size of bit-sliced key schedule
+ sub r12, #`32+16` @ place for tweak[9]
+
+ @ populate the key schedule
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ mov sp, r12
+ add r12, #0x90 @ pass key schedule
+ bl _bsaes_key_convert
+ add r4, sp, #0x90
+ vldmia r4, {@XMM[6]}
+ vstmia r12, {@XMM[15]} @ save last round key
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
+ vstmia r4, {@XMM[7]}
+#else
+ ldr r12, [$key, #244]
+ eors r12, #1
+ beq 0f
+
+ str r12, [$key, #244]
+ mov r4, $key @ pass key
+ mov r5, $rounds @ pass # of rounds
+ add r12, $key, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ add r4, $key, #248
+ vldmia r4, {@XMM[6]}
+ vstmia r12, {@XMM[15]} @ save last round key
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
+ vstmia r4, {@XMM[7]}
+
+.align 2
+0: sub sp, #0x90 @ place for tweak[9]
+#endif
+ vld1.8 {@XMM[8]}, [r0] @ initial tweak
+ adr $magic, .Lxts_magic
+
+ tst $len, #0xf @ if not multiple of 16
+ it ne @ Thumb2 thing, sanity check in ARM
+ subne $len, #0x10 @ subtract another 16 bytes
+ subs $len, #0x80
+
+ blo .Lxts_dec_short
+ b .Lxts_dec_loop
+
+.align 4
+.Lxts_dec_loop:
+ vldmia $magic, {$twmask} @ load XTS magic
+ vshr.s64 @T[0], @XMM[8], #63
+ mov r0, sp
+ vand @T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+ vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
+ vst1.64 {@XMM[$i-1]}, [r0,:128]!
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+ vshr.s64 @T[1], @XMM[$i], #63
+ veor @XMM[$i], @XMM[$i], @T[0]
+ vand @T[1], @T[1], $twmask
+___
+ @T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+ vld1.8 {@XMM[$i-10]}, [$inp]!
+___
+$code.=<<___ if ($i>=11);
+ veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+ vadd.u64 @XMM[8], @XMM[15], @XMM[15]
+ vst1.64 {@XMM[15]}, [r0,:128]!
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+ veor @XMM[8], @XMM[8], @T[0]
+ vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
+ veor @XMM[5], @XMM[5], @XMM[13]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[6], @XMM[6], @XMM[14]
+ mov r5, $rounds @ pass rounds
+ veor @XMM[7], @XMM[7], @XMM[15]
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[6], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[4], @XMM[11]
+ vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]!
+ veor @XMM[10], @XMM[2], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ veor @XMM[11], @XMM[7], @XMM[13]
+ veor @XMM[12], @XMM[3], @XMM[14]
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
+ veor @XMM[13], @XMM[5], @XMM[15]
+ vst1.8 {@XMM[12]-@XMM[13]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+
+ subs $len, #0x80
+ bpl .Lxts_dec_loop
+
+.Lxts_dec_short:
+ adds $len, #0x70
+ bmi .Lxts_dec_done
+
+ vldmia $magic, {$twmask} @ load XTS magic
+ vshr.s64 @T[0], @XMM[8], #63
+ mov r0, sp
+ vand @T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+ vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1]
+ vst1.64 {@XMM[$i-1]}, [r0,:128]!
+ vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+ vshr.s64 @T[1], @XMM[$i], #63
+ veor @XMM[$i], @XMM[$i], @T[0]
+ vand @T[1], @T[1], $twmask
+___
+ @T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+ vld1.8 {@XMM[$i-10]}, [$inp]!
+ subs $len, #0x10
+ bmi .Lxts_dec_`$i-9`
+___
+$code.=<<___ if ($i>=11);
+ veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+ sub $len, #0x10
+ vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak
+
+ vld1.8 {@XMM[6]}, [$inp]!
+ veor @XMM[5], @XMM[5], @XMM[13]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[6], @XMM[6], @XMM[14]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[6], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[4], @XMM[11]
+ vld1.64 {@XMM[14]}, [r0,:128]!
+ veor @XMM[10], @XMM[2], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ veor @XMM[11], @XMM[7], @XMM[13]
+ veor @XMM[12], @XMM[3], @XMM[14]
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
+ vst1.8 {@XMM[12]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_6:
+ vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak
+
+ veor @XMM[4], @XMM[4], @XMM[12]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[5], @XMM[5], @XMM[13]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[6], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[4], @XMM[11]
+ veor @XMM[10], @XMM[2], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ veor @XMM[11], @XMM[7], @XMM[13]
+ vst1.8 {@XMM[10]-@XMM[11]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_5:
+ vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak
+
+ veor @XMM[3], @XMM[3], @XMM[11]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[4], @XMM[4], @XMM[12]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ vld1.64 {@XMM[12]}, [r0,:128]!
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[6], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[4], @XMM[11]
+ veor @XMM[10], @XMM[2], @XMM[12]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+ vst1.8 {@XMM[10]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_4:
+ vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak
+
+ veor @XMM[2], @XMM[2], @XMM[10]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[3], @XMM[3], @XMM[11]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+ vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[6], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ veor @XMM[9], @XMM[4], @XMM[11]
+ vst1.8 {@XMM[8]-@XMM[9]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_3:
+ vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak
+
+ veor @XMM[1], @XMM[1], @XMM[9]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[2], @XMM[2], @XMM[10]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
+ vld1.64 {@XMM[10]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ veor @XMM[8], @XMM[6], @XMM[10]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+ vst1.8 {@XMM[8]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_2:
+ vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak
+
+ veor @XMM[0], @XMM[0], @XMM[8]
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x90 @ pass key schedule
+#else
+ add r4, $key, #248 @ pass key schedule
+#endif
+ veor @XMM[1], @XMM[1], @XMM[9]
+ mov r5, $rounds @ pass rounds
+ mov r0, sp
+
+ bl _bsaes_decrypt8
+
+ vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]!
+ veor @XMM[0], @XMM[0], @XMM[ 8]
+ veor @XMM[1], @XMM[1], @XMM[ 9]
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]!
+
+ vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak
+ b .Lxts_dec_done
+.align 4
+.Lxts_dec_1:
+ mov r0, sp
+ veor @XMM[0], @XMM[8]
+ mov r1, sp
+ vst1.8 {@XMM[0]}, [sp,:128]
+ mov r2, $key
+ mov r4, $fp @ preserve fp
+ mov r5, $magic @ preserve magic
+
+ bl AES_decrypt
+
+ vld1.8 {@XMM[0]}, [sp,:128]
+ veor @XMM[0], @XMM[0], @XMM[8]
+ vst1.8 {@XMM[0]}, [$out]!
+ mov $fp, r4
+ mov $magic, r5
+
+ vmov @XMM[8], @XMM[9] @ next round tweak
+
+.Lxts_dec_done:
+#ifndef XTS_CHAIN_TWEAK
+ adds $len, #0x10
+ beq .Lxts_dec_ret
+
+ @ calculate one round of extra tweak for the stolen ciphertext
+ vldmia $magic, {$twmask}
+ vshr.s64 @XMM[6], @XMM[8], #63
+ vand @XMM[6], @XMM[6], $twmask
+ vadd.u64 @XMM[9], @XMM[8], @XMM[8]
+ vswp `&Dhi("@XMM[6]")`,`&Dlo("@XMM[6]")`
+ veor @XMM[9], @XMM[9], @XMM[6]
+
+ @ perform the final decryption with the last tweak value
+ vld1.8 {@XMM[0]}, [$inp]!
+ mov r0, sp
+ veor @XMM[0], @XMM[0], @XMM[9]
+ mov r1, sp
+ vst1.8 {@XMM[0]}, [sp,:128]
+ mov r2, $key
+ mov r4, $fp @ preserve fp
+
+ bl AES_decrypt
+
+ vld1.8 {@XMM[0]}, [sp,:128]
+ veor @XMM[0], @XMM[0], @XMM[9]
+ vst1.8 {@XMM[0]}, [$out]
+
+ mov r6, $out
+.Lxts_dec_steal:
+ ldrb r1, [$out]
+ ldrb r0, [$inp], #1
+ strb r1, [$out, #0x10]
+ strb r0, [$out], #1
+
+ subs $len, #1
+ bhi .Lxts_dec_steal
+
+ vld1.8 {@XMM[0]}, [r6]
+ mov r0, sp
+ veor @XMM[0], @XMM[8]
+ mov r1, sp
+ vst1.8 {@XMM[0]}, [sp,:128]
+ mov r2, $key
+
+ bl AES_decrypt
+
+ vld1.8 {@XMM[0]}, [sp,:128]
+ veor @XMM[0], @XMM[0], @XMM[8]
+ vst1.8 {@XMM[0]}, [r6]
+ mov $fp, r4
+#endif
+
+.Lxts_dec_ret:
+ bic r0, $fp, #0xf
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+#ifdef XTS_CHAIN_TWEAK
+ ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak
+#endif
+.Lxts_dec_bzero: @ wipe key schedule [if any]
+ vstmia sp!, {q0-q1}
+ cmp sp, r0
+ bne .Lxts_dec_bzero
+
+ mov sp, $fp
+#ifdef XTS_CHAIN_TWEAK
+ vst1.8 {@XMM[8]}, [r1]
+#endif
+ VFP_ABI_POP
+ ldmia sp!, {r4-r10, pc} @ return
+
+.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
+___
+}
+$code.=<<___;
+#endif
+___
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/@/ and !/^$/);
+ print;
+}
+close SELF;
+
+print $code;
+
+close STDOUT;
diff --git a/arch/arm/crypto/sha1-armv7-neon.S b/arch/arm/crypto/sha1-armv7-neon.S
new file mode 100644
index 000000000000..50013c0e2864
--- /dev/null
+++ b/arch/arm/crypto/sha1-armv7-neon.S
@@ -0,0 +1,634 @@
+/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function
+ *
+ * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <linux/linkage.h>
+
+
+.syntax unified
+.code 32
+.fpu neon
+
+.text
+
+
+/* Context structure */
+
+#define state_h0 0
+#define state_h1 4
+#define state_h2 8
+#define state_h3 12
+#define state_h4 16
+
+
+/* Constants */
+
+#define K1 0x5A827999
+#define K2 0x6ED9EBA1
+#define K3 0x8F1BBCDC
+#define K4 0xCA62C1D6
+.align 4
+.LK_VEC:
+.LK1: .long K1, K1, K1, K1
+.LK2: .long K2, K2, K2, K2
+.LK3: .long K3, K3, K3, K3
+.LK4: .long K4, K4, K4, K4
+
+
+/* Register macros */
+
+#define RSTATE r0
+#define RDATA r1
+#define RNBLKS r2
+#define ROLDSTACK r3
+#define RWK lr
+
+#define _a r4
+#define _b r5
+#define _c r6
+#define _d r7
+#define _e r8
+
+#define RT0 r9
+#define RT1 r10
+#define RT2 r11
+#define RT3 r12
+
+#define W0 q0
+#define W1 q1
+#define W2 q2
+#define W3 q3
+#define W4 q4
+#define W5 q5
+#define W6 q6
+#define W7 q7
+
+#define tmp0 q8
+#define tmp1 q9
+#define tmp2 q10
+#define tmp3 q11
+
+#define qK1 q12
+#define qK2 q13
+#define qK3 q14
+#define qK4 q15
+
+
+/* Round function macros. */
+
+#define WK_offs(i) (((i) & 15) * 4)
+
+#define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+ W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ ldr RT3, [sp, WK_offs(i)]; \
+ pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ bic RT0, d, b; \
+ add e, e, a, ror #(32 - 5); \
+ and RT1, c, b; \
+ pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ add RT0, RT0, RT3; \
+ add e, e, RT1; \
+ ror b, #(32 - 30); \
+ pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ add e, e, RT0;
+
+#define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+ W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ ldr RT3, [sp, WK_offs(i)]; \
+ pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ eor RT0, d, b; \
+ add e, e, a, ror #(32 - 5); \
+ eor RT0, RT0, c; \
+ pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ add e, e, RT3; \
+ ror b, #(32 - 30); \
+ pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ add e, e, RT0; \
+
+#define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+ W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ ldr RT3, [sp, WK_offs(i)]; \
+ pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ eor RT0, b, c; \
+ and RT1, b, c; \
+ add e, e, a, ror #(32 - 5); \
+ pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ and RT0, RT0, d; \
+ add RT1, RT1, RT3; \
+ add e, e, RT0; \
+ ror b, #(32 - 30); \
+ pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ add e, e, RT1;
+
+#define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+ W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+ W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
+
+#define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,\
+ W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ _R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+ W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
+
+#define R(a,b,c,d,e,f,i) \
+ _R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,\
+ W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
+
+#define dummy(...)
+
+
+/* Input expansion macros. */
+
+/********* Precalc macros for rounds 0-15 *************************************/
+
+#define W_PRECALC_00_15() \
+ add RWK, sp, #(WK_offs(0)); \
+ \
+ vld1.32 {tmp0, tmp1}, [RDATA]!; \
+ vrev32.8 W0, tmp0; /* big => little */ \
+ vld1.32 {tmp2, tmp3}, [RDATA]!; \
+ vadd.u32 tmp0, W0, curK; \
+ vrev32.8 W7, tmp1; /* big => little */ \
+ vrev32.8 W6, tmp2; /* big => little */ \
+ vadd.u32 tmp1, W7, curK; \
+ vrev32.8 W5, tmp3; /* big => little */ \
+ vadd.u32 tmp2, W6, curK; \
+ vst1.32 {tmp0, tmp1}, [RWK]!; \
+ vadd.u32 tmp3, W5, curK; \
+ vst1.32 {tmp2, tmp3}, [RWK]; \
+
+#define WPRECALC_00_15_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vld1.32 {tmp0, tmp1}, [RDATA]!; \
+
+#define WPRECALC_00_15_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ add RWK, sp, #(WK_offs(0)); \
+
+#define WPRECALC_00_15_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vrev32.8 W0, tmp0; /* big => little */ \
+
+#define WPRECALC_00_15_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vld1.32 {tmp2, tmp3}, [RDATA]!; \
+
+#define WPRECALC_00_15_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vadd.u32 tmp0, W0, curK; \
+
+#define WPRECALC_00_15_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vrev32.8 W7, tmp1; /* big => little */ \
+
+#define WPRECALC_00_15_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vrev32.8 W6, tmp2; /* big => little */ \
+
+#define WPRECALC_00_15_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vadd.u32 tmp1, W7, curK; \
+
+#define WPRECALC_00_15_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vrev32.8 W5, tmp3; /* big => little */ \
+
+#define WPRECALC_00_15_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vadd.u32 tmp2, W6, curK; \
+
+#define WPRECALC_00_15_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vst1.32 {tmp0, tmp1}, [RWK]!; \
+
+#define WPRECALC_00_15_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vadd.u32 tmp3, W5, curK; \
+
+#define WPRECALC_00_15_12(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vst1.32 {tmp2, tmp3}, [RWK]; \
+
+
+/********* Precalc macros for rounds 16-31 ************************************/
+
+#define WPRECALC_16_31_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ veor tmp0, tmp0; \
+ vext.8 W, W_m16, W_m12, #8; \
+
+#define WPRECALC_16_31_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ add RWK, sp, #(WK_offs(i)); \
+ vext.8 tmp0, W_m04, tmp0, #4; \
+
+#define WPRECALC_16_31_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ veor tmp0, tmp0, W_m16; \
+ veor.32 W, W, W_m08; \
+
+#define WPRECALC_16_31_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ veor tmp1, tmp1; \
+ veor W, W, tmp0; \
+
+#define WPRECALC_16_31_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vshl.u32 tmp0, W, #1; \
+
+#define WPRECALC_16_31_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vext.8 tmp1, tmp1, W, #(16-12); \
+ vshr.u32 W, W, #31; \
+
+#define WPRECALC_16_31_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vorr tmp0, tmp0, W; \
+ vshr.u32 W, tmp1, #30; \
+
+#define WPRECALC_16_31_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vshl.u32 tmp1, tmp1, #2; \
+
+#define WPRECALC_16_31_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ veor tmp0, tmp0, W; \
+
+#define WPRECALC_16_31_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ veor W, tmp0, tmp1; \
+
+#define WPRECALC_16_31_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vadd.u32 tmp0, W, curK; \
+
+#define WPRECALC_16_31_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vst1.32 {tmp0}, [RWK];
+
+
+/********* Precalc macros for rounds 32-79 ************************************/
+
+#define WPRECALC_32_79_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ veor W, W_m28; \
+
+#define WPRECALC_32_79_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vext.8 tmp0, W_m08, W_m04, #8; \
+
+#define WPRECALC_32_79_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ veor W, W_m16; \
+
+#define WPRECALC_32_79_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ veor W, tmp0; \
+
+#define WPRECALC_32_79_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ add RWK, sp, #(WK_offs(i&~3)); \
+
+#define WPRECALC_32_79_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vshl.u32 tmp1, W, #2; \
+
+#define WPRECALC_32_79_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vshr.u32 tmp0, W, #30; \
+
+#define WPRECALC_32_79_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vorr W, tmp0, tmp1; \
+
+#define WPRECALC_32_79_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vadd.u32 tmp0, W, curK; \
+
+#define WPRECALC_32_79_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vst1.32 {tmp0}, [RWK];
+
+
+/*
+ * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
+ *
+ * unsigned int
+ * sha1_transform_neon (void *ctx, const unsigned char *data,
+ * unsigned int nblks)
+ */
+.align 3
+ENTRY(sha1_transform_neon)
+ /* input:
+ * r0: ctx, CTX
+ * r1: data (64*nblks bytes)
+ * r2: nblks
+ */
+
+ cmp RNBLKS, #0;
+ beq .Ldo_nothing;
+
+ push {r4-r12, lr};
+ /*vpush {q4-q7};*/
+
+ adr RT3, .LK_VEC;
+
+ mov ROLDSTACK, sp;
+
+ /* Align stack. */
+ sub RT0, sp, #(16*4);
+ and RT0, #(~(16-1));
+ mov sp, RT0;
+
+ vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */
+
+ /* Get the values of the chaining variables. */
+ ldm RSTATE, {_a-_e};
+
+ vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */
+
+#undef curK
+#define curK qK1
+ /* Precalc 0-15. */
+ W_PRECALC_00_15();
+
+.Loop:
+ /* Transform 0-15 + Precalc 16-31. */
+ _R( _a, _b, _c, _d, _e, F1, 0,
+ WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16,
+ W4, W5, W6, W7, W0, _, _, _ );
+ _R( _e, _a, _b, _c, _d, F1, 1,
+ WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16,
+ W4, W5, W6, W7, W0, _, _, _ );
+ _R( _d, _e, _a, _b, _c, F1, 2,
+ WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16,
+ W4, W5, W6, W7, W0, _, _, _ );
+ _R( _c, _d, _e, _a, _b, F1, 3,
+ WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,16,
+ W4, W5, W6, W7, W0, _, _, _ );
+
+#undef curK
+#define curK qK2
+ _R( _b, _c, _d, _e, _a, F1, 4,
+ WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20,
+ W3, W4, W5, W6, W7, _, _, _ );
+ _R( _a, _b, _c, _d, _e, F1, 5,
+ WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20,
+ W3, W4, W5, W6, W7, _, _, _ );
+ _R( _e, _a, _b, _c, _d, F1, 6,
+ WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20,
+ W3, W4, W5, W6, W7, _, _, _ );
+ _R( _d, _e, _a, _b, _c, F1, 7,
+ WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,20,
+ W3, W4, W5, W6, W7, _, _, _ );
+
+ _R( _c, _d, _e, _a, _b, F1, 8,
+ WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24,
+ W2, W3, W4, W5, W6, _, _, _ );
+ _R( _b, _c, _d, _e, _a, F1, 9,
+ WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24,
+ W2, W3, W4, W5, W6, _, _, _ );
+ _R( _a, _b, _c, _d, _e, F1, 10,
+ WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24,
+ W2, W3, W4, W5, W6, _, _, _ );
+ _R( _e, _a, _b, _c, _d, F1, 11,
+ WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,24,
+ W2, W3, W4, W5, W6, _, _, _ );
+
+ _R( _d, _e, _a, _b, _c, F1, 12,
+ WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28,
+ W1, W2, W3, W4, W5, _, _, _ );
+ _R( _c, _d, _e, _a, _b, F1, 13,
+ WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28,
+ W1, W2, W3, W4, W5, _, _, _ );
+ _R( _b, _c, _d, _e, _a, F1, 14,
+ WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28,
+ W1, W2, W3, W4, W5, _, _, _ );
+ _R( _a, _b, _c, _d, _e, F1, 15,
+ WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,28,
+ W1, W2, W3, W4, W5, _, _, _ );
+
+ /* Transform 16-63 + Precalc 32-79. */
+ _R( _e, _a, _b, _c, _d, F1, 16,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32,
+ W0, W1, W2, W3, W4, W5, W6, W7);
+ _R( _d, _e, _a, _b, _c, F1, 17,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32,
+ W0, W1, W2, W3, W4, W5, W6, W7);
+ _R( _c, _d, _e, _a, _b, F1, 18,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 32,
+ W0, W1, W2, W3, W4, W5, W6, W7);
+ _R( _b, _c, _d, _e, _a, F1, 19,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 32,
+ W0, W1, W2, W3, W4, W5, W6, W7);
+
+ _R( _a, _b, _c, _d, _e, F2, 20,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36,
+ W7, W0, W1, W2, W3, W4, W5, W6);
+ _R( _e, _a, _b, _c, _d, F2, 21,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36,
+ W7, W0, W1, W2, W3, W4, W5, W6);
+ _R( _d, _e, _a, _b, _c, F2, 22,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 36,
+ W7, W0, W1, W2, W3, W4, W5, W6);
+ _R( _c, _d, _e, _a, _b, F2, 23,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 36,
+ W7, W0, W1, W2, W3, W4, W5, W6);
+
+#undef curK
+#define curK qK3
+ _R( _b, _c, _d, _e, _a, F2, 24,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40,
+ W6, W7, W0, W1, W2, W3, W4, W5);
+ _R( _a, _b, _c, _d, _e, F2, 25,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40,
+ W6, W7, W0, W1, W2, W3, W4, W5);
+ _R( _e, _a, _b, _c, _d, F2, 26,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 40,
+ W6, W7, W0, W1, W2, W3, W4, W5);
+ _R( _d, _e, _a, _b, _c, F2, 27,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 40,
+ W6, W7, W0, W1, W2, W3, W4, W5);
+
+ _R( _c, _d, _e, _a, _b, F2, 28,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44,
+ W5, W6, W7, W0, W1, W2, W3, W4);
+ _R( _b, _c, _d, _e, _a, F2, 29,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44,
+ W5, W6, W7, W0, W1, W2, W3, W4);
+ _R( _a, _b, _c, _d, _e, F2, 30,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 44,
+ W5, W6, W7, W0, W1, W2, W3, W4);
+ _R( _e, _a, _b, _c, _d, F2, 31,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 44,
+ W5, W6, W7, W0, W1, W2, W3, W4);
+
+ _R( _d, _e, _a, _b, _c, F2, 32,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48,
+ W4, W5, W6, W7, W0, W1, W2, W3);
+ _R( _c, _d, _e, _a, _b, F2, 33,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48,
+ W4, W5, W6, W7, W0, W1, W2, W3);
+ _R( _b, _c, _d, _e, _a, F2, 34,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 48,
+ W4, W5, W6, W7, W0, W1, W2, W3);
+ _R( _a, _b, _c, _d, _e, F2, 35,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 48,
+ W4, W5, W6, W7, W0, W1, W2, W3);
+
+ _R( _e, _a, _b, _c, _d, F2, 36,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52,
+ W3, W4, W5, W6, W7, W0, W1, W2);
+ _R( _d, _e, _a, _b, _c, F2, 37,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52,
+ W3, W4, W5, W6, W7, W0, W1, W2);
+ _R( _c, _d, _e, _a, _b, F2, 38,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 52,
+ W3, W4, W5, W6, W7, W0, W1, W2);
+ _R( _b, _c, _d, _e, _a, F2, 39,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 52,
+ W3, W4, W5, W6, W7, W0, W1, W2);
+
+ _R( _a, _b, _c, _d, _e, F3, 40,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56,
+ W2, W3, W4, W5, W6, W7, W0, W1);
+ _R( _e, _a, _b, _c, _d, F3, 41,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56,
+ W2, W3, W4, W5, W6, W7, W0, W1);
+ _R( _d, _e, _a, _b, _c, F3, 42,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 56,
+ W2, W3, W4, W5, W6, W7, W0, W1);
+ _R( _c, _d, _e, _a, _b, F3, 43,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 56,
+ W2, W3, W4, W5, W6, W7, W0, W1);
+
+#undef curK
+#define curK qK4
+ _R( _b, _c, _d, _e, _a, F3, 44,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60,
+ W1, W2, W3, W4, W5, W6, W7, W0);
+ _R( _a, _b, _c, _d, _e, F3, 45,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60,
+ W1, W2, W3, W4, W5, W6, W7, W0);
+ _R( _e, _a, _b, _c, _d, F3, 46,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 60,
+ W1, W2, W3, W4, W5, W6, W7, W0);
+ _R( _d, _e, _a, _b, _c, F3, 47,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 60,
+ W1, W2, W3, W4, W5, W6, W7, W0);
+
+ _R( _c, _d, _e, _a, _b, F3, 48,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64,
+ W0, W1, W2, W3, W4, W5, W6, W7);
+ _R( _b, _c, _d, _e, _a, F3, 49,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64,
+ W0, W1, W2, W3, W4, W5, W6, W7);
+ _R( _a, _b, _c, _d, _e, F3, 50,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 64,
+ W0, W1, W2, W3, W4, W5, W6, W7);
+ _R( _e, _a, _b, _c, _d, F3, 51,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 64,
+ W0, W1, W2, W3, W4, W5, W6, W7);
+
+ _R( _d, _e, _a, _b, _c, F3, 52,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68,
+ W7, W0, W1, W2, W3, W4, W5, W6);
+ _R( _c, _d, _e, _a, _b, F3, 53,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68,
+ W7, W0, W1, W2, W3, W4, W5, W6);
+ _R( _b, _c, _d, _e, _a, F3, 54,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 68,
+ W7, W0, W1, W2, W3, W4, W5, W6);
+ _R( _a, _b, _c, _d, _e, F3, 55,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 68,
+ W7, W0, W1, W2, W3, W4, W5, W6);
+
+ _R( _e, _a, _b, _c, _d, F3, 56,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72,
+ W6, W7, W0, W1, W2, W3, W4, W5);
+ _R( _d, _e, _a, _b, _c, F3, 57,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72,
+ W6, W7, W0, W1, W2, W3, W4, W5);
+ _R( _c, _d, _e, _a, _b, F3, 58,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 72,
+ W6, W7, W0, W1, W2, W3, W4, W5);
+ _R( _b, _c, _d, _e, _a, F3, 59,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 72,
+ W6, W7, W0, W1, W2, W3, W4, W5);
+
+ subs RNBLKS, #1;
+
+ _R( _a, _b, _c, _d, _e, F4, 60,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76,
+ W5, W6, W7, W0, W1, W2, W3, W4);
+ _R( _e, _a, _b, _c, _d, F4, 61,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76,
+ W5, W6, W7, W0, W1, W2, W3, W4);
+ _R( _d, _e, _a, _b, _c, F4, 62,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 76,
+ W5, W6, W7, W0, W1, W2, W3, W4);
+ _R( _c, _d, _e, _a, _b, F4, 63,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 76,
+ W5, W6, W7, W0, W1, W2, W3, W4);
+
+ beq .Lend;
+
+ /* Transform 64-79 + Precalc 0-15 of next block. */
+#undef curK
+#define curK qK1
+ _R( _b, _c, _d, _e, _a, F4, 64,
+ WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _a, _b, _c, _d, _e, F4, 65,
+ WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _e, _a, _b, _c, _d, F4, 66,
+ WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _d, _e, _a, _b, _c, F4, 67,
+ WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+
+ _R( _c, _d, _e, _a, _b, F4, 68,
+ dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _b, _c, _d, _e, _a, F4, 69,
+ dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _a, _b, _c, _d, _e, F4, 70,
+ WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _e, _a, _b, _c, _d, F4, 71,
+ WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+
+ _R( _d, _e, _a, _b, _c, F4, 72,
+ dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _c, _d, _e, _a, _b, F4, 73,
+ dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _b, _c, _d, _e, _a, F4, 74,
+ WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _a, _b, _c, _d, _e, F4, 75,
+ WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+
+ _R( _e, _a, _b, _c, _d, F4, 76,
+ WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _d, _e, _a, _b, _c, F4, 77,
+ WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _c, _d, _e, _a, _b, F4, 78,
+ WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _b, _c, _d, _e, _a, F4, 79,
+ WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ );
+
+ /* Update the chaining variables. */
+ ldm RSTATE, {RT0-RT3};
+ add _a, RT0;
+ ldr RT0, [RSTATE, #state_h4];
+ add _b, RT1;
+ add _c, RT2;
+ add _d, RT3;
+ add _e, RT0;
+ stm RSTATE, {_a-_e};
+
+ b .Loop;
+
+.Lend:
+ /* Transform 64-79 */
+ R( _b, _c, _d, _e, _a, F4, 64 );
+ R( _a, _b, _c, _d, _e, F4, 65 );
+ R( _e, _a, _b, _c, _d, F4, 66 );
+ R( _d, _e, _a, _b, _c, F4, 67 );
+ R( _c, _d, _e, _a, _b, F4, 68 );
+ R( _b, _c, _d, _e, _a, F4, 69 );
+ R( _a, _b, _c, _d, _e, F4, 70 );
+ R( _e, _a, _b, _c, _d, F4, 71 );
+ R( _d, _e, _a, _b, _c, F4, 72 );
+ R( _c, _d, _e, _a, _b, F4, 73 );
+ R( _b, _c, _d, _e, _a, F4, 74 );
+ R( _a, _b, _c, _d, _e, F4, 75 );
+ R( _e, _a, _b, _c, _d, F4, 76 );
+ R( _d, _e, _a, _b, _c, F4, 77 );
+ R( _c, _d, _e, _a, _b, F4, 78 );
+ R( _b, _c, _d, _e, _a, F4, 79 );
+
+ mov sp, ROLDSTACK;
+
+ /* Update the chaining variables. */
+ ldm RSTATE, {RT0-RT3};
+ add _a, RT0;
+ ldr RT0, [RSTATE, #state_h4];
+ add _b, RT1;
+ add _c, RT2;
+ add _d, RT3;
+ /*vpop {q4-q7};*/
+ add _e, RT0;
+ stm RSTATE, {_a-_e};
+
+ pop {r4-r12, pc};
+
+.Ldo_nothing:
+ bx lr
+ENDPROC(sha1_transform_neon)
diff --git a/arch/arm/crypto/sha1_glue.c b/arch/arm/crypto/sha1_glue.c
index ace4cd67464c..e31b0440c613 100644
--- a/arch/arm/crypto/sha1_glue.c
+++ b/arch/arm/crypto/sha1_glue.c
@@ -23,32 +23,27 @@
#include <linux/types.h>
#include <crypto/sha.h>
#include <asm/byteorder.h>
+#include <asm/crypto/sha1.h>
-struct SHA1_CTX {
- uint32_t h0,h1,h2,h3,h4;
- u64 count;
- u8 data[SHA1_BLOCK_SIZE];
-};
-asmlinkage void sha1_block_data_order(struct SHA1_CTX *digest,
+asmlinkage void sha1_block_data_order(u32 *digest,
const unsigned char *data, unsigned int rounds);
static int sha1_init(struct shash_desc *desc)
{
- struct SHA1_CTX *sctx = shash_desc_ctx(desc);
- memset(sctx, 0, sizeof(*sctx));
- sctx->h0 = SHA1_H0;
- sctx->h1 = SHA1_H1;
- sctx->h2 = SHA1_H2;
- sctx->h3 = SHA1_H3;
- sctx->h4 = SHA1_H4;
+ struct sha1_state *sctx = shash_desc_ctx(desc);
+
+ *sctx = (struct sha1_state){
+ .state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
+ };
+
return 0;
}
-static int __sha1_update(struct SHA1_CTX *sctx, const u8 *data,
- unsigned int len, unsigned int partial)
+static int __sha1_update(struct sha1_state *sctx, const u8 *data,
+ unsigned int len, unsigned int partial)
{
unsigned int done = 0;
@@ -56,43 +51,44 @@ static int __sha1_update(struct SHA1_CTX *sctx, const u8 *data,
if (partial) {
done = SHA1_BLOCK_SIZE - partial;
- memcpy(sctx->data + partial, data, done);
- sha1_block_data_order(sctx, sctx->data, 1);
+ memcpy(sctx->buffer + partial, data, done);
+ sha1_block_data_order(sctx->state, sctx->buffer, 1);
}
if (len - done >= SHA1_BLOCK_SIZE) {
const unsigned int rounds = (len - done) / SHA1_BLOCK_SIZE;
- sha1_block_data_order(sctx, data + done, rounds);
+ sha1_block_data_order(sctx->state, data + done, rounds);
done += rounds * SHA1_BLOCK_SIZE;
}
- memcpy(sctx->data, data + done, len - done);
+ memcpy(sctx->buffer, data + done, len - done);
return 0;
}
-static int sha1_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
+int sha1_update_arm(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
{
- struct SHA1_CTX *sctx = shash_desc_ctx(desc);
+ struct sha1_state *sctx = shash_desc_ctx(desc);
unsigned int partial = sctx->count % SHA1_BLOCK_SIZE;
int res;
/* Handle the fast case right here */
if (partial + len < SHA1_BLOCK_SIZE) {
sctx->count += len;
- memcpy(sctx->data + partial, data, len);
+ memcpy(sctx->buffer + partial, data, len);
return 0;
}
res = __sha1_update(sctx, data, len, partial);
return res;
}
+EXPORT_SYMBOL_GPL(sha1_update_arm);
/* Add padding and return the message digest. */
static int sha1_final(struct shash_desc *desc, u8 *out)
{
- struct SHA1_CTX *sctx = shash_desc_ctx(desc);
+ struct sha1_state *sctx = shash_desc_ctx(desc);
unsigned int i, index, padlen;
__be32 *dst = (__be32 *)out;
__be64 bits;
@@ -106,7 +102,7 @@ static int sha1_final(struct shash_desc *desc, u8 *out)
/* We need to fill a whole block for __sha1_update() */
if (padlen <= 56) {
sctx->count += padlen;
- memcpy(sctx->data + index, padding, padlen);
+ memcpy(sctx->buffer + index, padding, padlen);
} else {
__sha1_update(sctx, padding, padlen, index);
}
@@ -114,7 +110,7 @@ static int sha1_final(struct shash_desc *desc, u8 *out)
/* Store state in digest */
for (i = 0; i < 5; i++)
- dst[i] = cpu_to_be32(((u32 *)sctx)[i]);
+ dst[i] = cpu_to_be32(sctx->state[i]);
/* Wipe context */
memset(sctx, 0, sizeof(*sctx));
@@ -124,7 +120,7 @@ static int sha1_final(struct shash_desc *desc, u8 *out)
static int sha1_export(struct shash_desc *desc, void *out)
{
- struct SHA1_CTX *sctx = shash_desc_ctx(desc);
+ struct sha1_state *sctx = shash_desc_ctx(desc);
memcpy(out, sctx, sizeof(*sctx));
return 0;
}
@@ -132,7 +128,7 @@ static int sha1_export(struct shash_desc *desc, void *out)
static int sha1_import(struct shash_desc *desc, const void *in)
{
- struct SHA1_CTX *sctx = shash_desc_ctx(desc);
+ struct sha1_state *sctx = shash_desc_ctx(desc);
memcpy(sctx, in, sizeof(*sctx));
return 0;
}
@@ -141,12 +137,12 @@ static int sha1_import(struct shash_desc *desc, const void *in)
static struct shash_alg alg = {
.digestsize = SHA1_DIGEST_SIZE,
.init = sha1_init,
- .update = sha1_update,
+ .update = sha1_update_arm,
.final = sha1_final,
.export = sha1_export,
.import = sha1_import,
- .descsize = sizeof(struct SHA1_CTX),
- .statesize = sizeof(struct SHA1_CTX),
+ .descsize = sizeof(struct sha1_state),
+ .statesize = sizeof(struct sha1_state),
.base = {
.cra_name = "sha1",
.cra_driver_name= "sha1-asm",
diff --git a/arch/arm/crypto/sha1_neon_glue.c b/arch/arm/crypto/sha1_neon_glue.c
new file mode 100644
index 000000000000..6f1b411b1d55
--- /dev/null
+++ b/arch/arm/crypto/sha1_neon_glue.c
@@ -0,0 +1,197 @@
+/*
+ * Glue code for the SHA1 Secure Hash Algorithm assembler implementation using
+ * ARM NEON instructions.
+ *
+ * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is based on sha1_generic.c and sha1_ssse3_glue.c:
+ * Copyright (c) Alan Smithee.
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
+ * Copyright (c) Mathias Krause <minipli@googlemail.com>
+ * Copyright (c) Chandramouli Narayanan <mouli@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <asm/crypto/sha1.h>
+
+
+asmlinkage void sha1_transform_neon(void *state_h, const char *data,
+ unsigned int rounds);
+
+
+static int sha1_neon_init(struct shash_desc *desc)
+{
+ struct sha1_state *sctx = shash_desc_ctx(desc);
+
+ *sctx = (struct sha1_state){
+ .state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
+ };
+
+ return 0;
+}
+
+static int __sha1_neon_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len, unsigned int partial)
+{
+ struct sha1_state *sctx = shash_desc_ctx(desc);
+ unsigned int done = 0;
+
+ sctx->count += len;
+
+ if (partial) {
+ done = SHA1_BLOCK_SIZE - partial;
+ memcpy(sctx->buffer + partial, data, done);
+ sha1_transform_neon(sctx->state, sctx->buffer, 1);
+ }
+
+ if (len - done >= SHA1_BLOCK_SIZE) {
+ const unsigned int rounds = (len - done) / SHA1_BLOCK_SIZE;
+
+ sha1_transform_neon(sctx->state, data + done, rounds);
+ done += rounds * SHA1_BLOCK_SIZE;
+ }
+
+ memcpy(sctx->buffer, data + done, len - done);
+
+ return 0;
+}
+
+static int sha1_neon_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ struct sha1_state *sctx = shash_desc_ctx(desc);
+ unsigned int partial = sctx->count % SHA1_BLOCK_SIZE;
+ int res;
+
+ /* Handle the fast case right here */
+ if (partial + len < SHA1_BLOCK_SIZE) {
+ sctx->count += len;
+ memcpy(sctx->buffer + partial, data, len);
+
+ return 0;
+ }
+
+ if (!may_use_simd()) {
+ res = sha1_update_arm(desc, data, len);
+ } else {
+ kernel_neon_begin();
+ res = __sha1_neon_update(desc, data, len, partial);
+ kernel_neon_end();
+ }
+
+ return res;
+}
+
+
+/* Add padding and return the message digest. */
+static int sha1_neon_final(struct shash_desc *desc, u8 *out)
+{
+ struct sha1_state *sctx = shash_desc_ctx(desc);
+ unsigned int i, index, padlen;
+ __be32 *dst = (__be32 *)out;
+ __be64 bits;
+ static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, };
+
+ bits = cpu_to_be64(sctx->count << 3);
+
+ /* Pad out to 56 mod 64 and append length */
+ index = sctx->count % SHA1_BLOCK_SIZE;
+ padlen = (index < 56) ? (56 - index) : ((SHA1_BLOCK_SIZE+56) - index);
+ if (!may_use_simd()) {
+ sha1_update_arm(desc, padding, padlen);
+ sha1_update_arm(desc, (const u8 *)&bits, sizeof(bits));
+ } else {
+ kernel_neon_begin();
+ /* We need to fill a whole block for __sha1_neon_update() */
+ if (padlen <= 56) {
+ sctx->count += padlen;
+ memcpy(sctx->buffer + index, padding, padlen);
+ } else {
+ __sha1_neon_update(desc, padding, padlen, index);
+ }
+ __sha1_neon_update(desc, (const u8 *)&bits, sizeof(bits), 56);
+ kernel_neon_end();
+ }
+
+ /* Store state in digest */
+ for (i = 0; i < 5; i++)
+ dst[i] = cpu_to_be32(sctx->state[i]);
+
+ /* Wipe context */
+ memset(sctx, 0, sizeof(*sctx));
+
+ return 0;
+}
+
+static int sha1_neon_export(struct shash_desc *desc, void *out)
+{
+ struct sha1_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(out, sctx, sizeof(*sctx));
+
+ return 0;
+}
+
+static int sha1_neon_import(struct shash_desc *desc, const void *in)
+{
+ struct sha1_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(sctx, in, sizeof(*sctx));
+
+ return 0;
+}
+
+static struct shash_alg alg = {
+ .digestsize = SHA1_DIGEST_SIZE,
+ .init = sha1_neon_init,
+ .update = sha1_neon_update,
+ .final = sha1_neon_final,
+ .export = sha1_neon_export,
+ .import = sha1_neon_import,
+ .descsize = sizeof(struct sha1_state),
+ .statesize = sizeof(struct sha1_state),
+ .base = {
+ .cra_name = "sha1",
+ .cra_driver_name = "sha1-neon",
+ .cra_priority = 250,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA1_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+};
+
+static int __init sha1_neon_mod_init(void)
+{
+ if (!cpu_has_neon())
+ return -ENODEV;
+
+ return crypto_register_shash(&alg);
+}
+
+static void __exit sha1_neon_mod_fini(void)
+{
+ crypto_unregister_shash(&alg);
+}
+
+module_init(sha1_neon_mod_init);
+module_exit(sha1_neon_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, NEON accelerated");
+MODULE_ALIAS("sha1");
diff --git a/arch/arm/crypto/sha256-armv4.pl b/arch/arm/crypto/sha256-armv4.pl
new file mode 100644
index 000000000000..fac0533ea633
--- /dev/null
+++ b/arch/arm/crypto/sha256-armv4.pl
@@ -0,0 +1,716 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Permission to use under GPL terms is granted.
+# ====================================================================
+
+# SHA256 block procedure for ARMv4. May 2007.
+
+# Performance is ~2x better than gcc 3.4 generated code and in "abso-
+# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
+# byte [on single-issue Xscale PXA250 core].
+
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 22% improvement on
+# Cortex A8 core and ~20 cycles per processed byte.
+
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 16%
+# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+# September 2013.
+#
+# Add NEON implementation. On Cortex A8 it was measured to process one
+# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+# code (meaning that latter performs sub-optimally, nothing was done
+# about it).
+
+# May 2014.
+#
+# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$ctx="r0"; $t0="r0";
+$inp="r1"; $t4="r1";
+$len="r2"; $t1="r2";
+$T1="r3"; $t3="r3";
+$A="r4";
+$B="r5";
+$C="r6";
+$D="r7";
+$E="r8";
+$F="r9";
+$G="r10";
+$H="r11";
+@V=($A,$B,$C,$D,$E,$F,$G,$H);
+$t2="r12";
+$Ktbl="r14";
+
+@Sigma0=( 2,13,22);
+@Sigma1=( 6,11,25);
+@sigma0=( 7,18, 3);
+@sigma1=(17,19,10);
+
+sub BODY_00_15 {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+
+$code.=<<___ if ($i<16);
+#if __ARM_ARCH__>=7
+ @ ldr $t1,[$inp],#4 @ $i
+# if $i==15
+ str $inp,[sp,#17*4] @ make room for $t4
+# endif
+ eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+ add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
+ eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
+# ifndef __ARMEB__
+ rev $t1,$t1
+# endif
+#else
+ @ ldrb $t1,[$inp,#3] @ $i
+ add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
+ ldrb $t2,[$inp,#2]
+ ldrb $t0,[$inp,#1]
+ orr $t1,$t1,$t2,lsl#8
+ ldrb $t2,[$inp],#4
+ orr $t1,$t1,$t0,lsl#16
+# if $i==15
+ str $inp,[sp,#17*4] @ make room for $t4
+# endif
+ eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+ orr $t1,$t1,$t2,lsl#24
+ eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
+#endif
+___
+$code.=<<___;
+ ldr $t2,[$Ktbl],#4 @ *K256++
+ add $h,$h,$t1 @ h+=X[i]
+ str $t1,[sp,#`$i%16`*4]
+ eor $t1,$f,$g
+ add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
+ and $t1,$t1,$e
+ add $h,$h,$t2 @ h+=K256[i]
+ eor $t1,$t1,$g @ Ch(e,f,g)
+ eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
+ add $h,$h,$t1 @ h+=Ch(e,f,g)
+#if $i==31
+ and $t2,$t2,#0xff
+ cmp $t2,#0xf2 @ done?
+#endif
+#if $i<15
+# if __ARM_ARCH__>=7
+ ldr $t1,[$inp],#4 @ prefetch
+# else
+ ldrb $t1,[$inp,#3]
+# endif
+ eor $t2,$a,$b @ a^b, b^c in next round
+#else
+ ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
+ eor $t2,$a,$b @ a^b, b^c in next round
+ ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
+#endif
+ eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
+ and $t3,$t3,$t2 @ (b^c)&=(a^b)
+ add $d,$d,$h @ d+=h
+ eor $t3,$t3,$b @ Maj(a,b,c)
+ add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
+ @ add $h,$h,$t3 @ h+=Maj(a,b,c)
+___
+ ($t2,$t3)=($t3,$t2);
+}
+
+sub BODY_16_XX {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+
+$code.=<<___;
+ @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
+ @ ldr $t4,[sp,#`($i+14)%16`*4]
+ mov $t0,$t1,ror#$sigma0[0]
+ add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
+ mov $t2,$t4,ror#$sigma1[0]
+ eor $t0,$t0,$t1,ror#$sigma0[1]
+ eor $t2,$t2,$t4,ror#$sigma1[1]
+ eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
+ ldr $t1,[sp,#`($i+0)%16`*4]
+ eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
+ ldr $t4,[sp,#`($i+9)%16`*4]
+
+ add $t2,$t2,$t0
+ eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
+ add $t1,$t1,$t2
+ eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
+ add $t1,$t1,$t4 @ X[i]
+___
+ &BODY_00_15(@_);
+}
+
+$code=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+.text
+#if __ARM_ARCH__<7
+.code 32
+#else
+.syntax unified
+# ifdef __thumb2__
+# define adrl adr
+.thumb
+# else
+.code 32
+# endif
+#endif
+
+.type K256,%object
+.align 5
+K256:
+.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size K256,.-K256
+.word 0 @ terminator
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-sha256_block_data_order
+#endif
+.align 5
+
+.global sha256_block_data_order
+.type sha256_block_data_order,%function
+sha256_block_data_order:
+#if __ARM_ARCH__<7
+ sub r3,pc,#8 @ sha256_block_data_order
+#else
+ adr r3,sha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+ ldr r12,.LOPENSSL_armcap
+ ldr r12,[r3,r12] @ OPENSSL_armcap_P
+ tst r12,#ARMV8_SHA256
+ bne .LARMv8
+ tst r12,#ARMV7_NEON
+ bne .LNEON
+#endif
+ add $len,$inp,$len,lsl#6 @ len to point at the end of inp
+ stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
+ ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
+ sub $Ktbl,r3,#256+32 @ K256
+ sub sp,sp,#16*4 @ alloca(X[16])
+.Loop:
+# if __ARM_ARCH__>=7
+ ldr $t1,[$inp],#4
+# else
+ ldrb $t1,[$inp,#3]
+# endif
+ eor $t3,$B,$C @ magic
+ eor $t2,$t2,$t2
+___
+for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
+$code.=".Lrounds_16_xx:\n";
+for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+#if __ARM_ARCH__>=7
+ ite eq @ Thumb2 thing, sanity check in ARM
+#endif
+ ldreq $t3,[sp,#16*4] @ pull ctx
+ bne .Lrounds_16_xx
+
+ add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
+ ldr $t0,[$t3,#0]
+ ldr $t1,[$t3,#4]
+ ldr $t2,[$t3,#8]
+ add $A,$A,$t0
+ ldr $t0,[$t3,#12]
+ add $B,$B,$t1
+ ldr $t1,[$t3,#16]
+ add $C,$C,$t2
+ ldr $t2,[$t3,#20]
+ add $D,$D,$t0
+ ldr $t0,[$t3,#24]
+ add $E,$E,$t1
+ ldr $t1,[$t3,#28]
+ add $F,$F,$t2
+ ldr $inp,[sp,#17*4] @ pull inp
+ ldr $t2,[sp,#18*4] @ pull inp+len
+ add $G,$G,$t0
+ add $H,$H,$t1
+ stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
+ cmp $inp,$t2
+ sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
+ bne .Loop
+
+ add sp,sp,#`16+3`*4 @ destroy frame
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r11,pc}
+#else
+ ldmia sp!,{r4-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size sha256_block_data_order,.-sha256_block_data_order
+___
+######################################################################
+# NEON stuff
+#
+{{{
+my @X=map("q$_",(0..3));
+my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
+my $Xfer=$t4;
+my $j=0;
+
+sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
+sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
+
+sub AUTOLOAD() # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+ my $arg = pop;
+ $arg = "#$arg" if ($arg*1 eq $arg);
+ $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
+}
+
+sub Xupdate()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body);
+ my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+ &vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T2,$T0,$sigma0[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T1,$T0,$sigma0[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T2,$T0,32-$sigma0[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T3,$T0,$sigma0[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T1,$T1,$T2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T3,$T0,32-$sigma0[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T1,$T1,$T3); # sigma0(X[1..4])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4); # sigma1(X[14..15])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vld1_32 ("{$T0}","[$Ktbl,:128]!");
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4); # sigma1(X[16..17])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 ($T0,$T0,@X[0]);
+ while($#insns>=2) { eval(shift(@insns)); }
+ &vst1_32 ("{$T0}","[$Xfer,:128]!");
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xpreload()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body);
+ my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vld1_32 ("{$T0}","[$Ktbl,:128]!");
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vrev32_8 (@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 ($T0,$T0,@X[0]);
+ foreach (@insns) { eval; } # remaining instructions
+ &vst1_32 ("{$T0}","[$Xfer,:128]!");
+
+ push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub body_00_15 () {
+ (
+ '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
+ '&add ($h,$h,$t1)', # h+=X[i]+K[i]
+ '&eor ($t1,$f,$g)',
+ '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
+ '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
+ '&and ($t1,$t1,$e)',
+ '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
+ '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
+ '&eor ($t1,$t1,$g)', # Ch(e,f,g)
+ '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
+ '&eor ($t2,$a,$b)', # a^b, b^c in next round
+ '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
+ '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
+ '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
+ '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
+ '&ldr ($t1,"[sp,#64]") if ($j==31)',
+ '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
+ '&add ($d,$d,$h)', # d+=h
+ '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
+ '&eor ($t3,$t3,$b)', # Maj(a,b,c)
+ '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
+ )
+}
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.global sha256_block_data_order_neon
+.type sha256_block_data_order_neon,%function
+.align 4
+sha256_block_data_order_neon:
+.LNEON:
+ stmdb sp!,{r4-r12,lr}
+
+ sub $H,sp,#16*4+16
+ adrl $Ktbl,K256
+ bic $H,$H,#15 @ align for 128-bit stores
+ mov $t2,sp
+ mov sp,$H @ alloca
+ add $len,$inp,$len,lsl#6 @ len to point at the end of inp
+
+ vld1.8 {@X[0]},[$inp]!
+ vld1.8 {@X[1]},[$inp]!
+ vld1.8 {@X[2]},[$inp]!
+ vld1.8 {@X[3]},[$inp]!
+ vld1.32 {$T0},[$Ktbl,:128]!
+ vld1.32 {$T1},[$Ktbl,:128]!
+ vld1.32 {$T2},[$Ktbl,:128]!
+ vld1.32 {$T3},[$Ktbl,:128]!
+ vrev32.8 @X[0],@X[0] @ yes, even on
+ str $ctx,[sp,#64]
+ vrev32.8 @X[1],@X[1] @ big-endian
+ str $inp,[sp,#68]
+ mov $Xfer,sp
+ vrev32.8 @X[2],@X[2]
+ str $len,[sp,#72]
+ vrev32.8 @X[3],@X[3]
+ str $t2,[sp,#76] @ save original sp
+ vadd.i32 $T0,$T0,@X[0]
+ vadd.i32 $T1,$T1,@X[1]
+ vst1.32 {$T0},[$Xfer,:128]!
+ vadd.i32 $T2,$T2,@X[2]
+ vst1.32 {$T1},[$Xfer,:128]!
+ vadd.i32 $T3,$T3,@X[3]
+ vst1.32 {$T2},[$Xfer,:128]!
+ vst1.32 {$T3},[$Xfer,:128]!
+
+ ldmia $ctx,{$A-$H}
+ sub $Xfer,$Xfer,#64
+ ldr $t1,[sp,#0]
+ eor $t2,$t2,$t2
+ eor $t3,$B,$C
+ b .L_00_48
+
+.align 4
+.L_00_48:
+___
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+$code.=<<___;
+ teq $t1,#0 @ check for K256 terminator
+ ldr $t1,[sp,#0]
+ sub $Xfer,$Xfer,#64
+ bne .L_00_48
+
+ ldr $inp,[sp,#68]
+ ldr $t0,[sp,#72]
+ sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
+ teq $inp,$t0
+ it eq
+ subeq $inp,$inp,#64 @ avoid SEGV
+ vld1.8 {@X[0]},[$inp]! @ load next input block
+ vld1.8 {@X[1]},[$inp]!
+ vld1.8 {@X[2]},[$inp]!
+ vld1.8 {@X[3]},[$inp]!
+ it ne
+ strne $inp,[sp,#68]
+ mov $Xfer,sp
+___
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+$code.=<<___;
+ ldr $t0,[$t1,#0]
+ add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
+ ldr $t2,[$t1,#4]
+ ldr $t3,[$t1,#8]
+ ldr $t4,[$t1,#12]
+ add $A,$A,$t0 @ accumulate
+ ldr $t0,[$t1,#16]
+ add $B,$B,$t2
+ ldr $t2,[$t1,#20]
+ add $C,$C,$t3
+ ldr $t3,[$t1,#24]
+ add $D,$D,$t4
+ ldr $t4,[$t1,#28]
+ add $E,$E,$t0
+ str $A,[$t1],#4
+ add $F,$F,$t2
+ str $B,[$t1],#4
+ add $G,$G,$t3
+ str $C,[$t1],#4
+ add $H,$H,$t4
+ str $D,[$t1],#4
+ stmia $t1,{$E-$H}
+
+ ittte ne
+ movne $Xfer,sp
+ ldrne $t1,[sp,#0]
+ eorne $t2,$t2,$t2
+ ldreq sp,[sp,#76] @ restore original sp
+ itt ne
+ eorne $t3,$B,$C
+ bne .L_00_48
+
+ ldmia sp!,{r4-r12,pc}
+.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
+#endif
+___
+}}}
+######################################################################
+# ARMv8 stuff
+#
+{{{
+my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
+my @MSG=map("q$_",(8..11));
+my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
+my $Ktbl="r3";
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# ifdef __thumb2__
+# define INST(a,b,c,d) .byte c,d|0xc,a,b
+# else
+# define INST(a,b,c,d) .byte a,b,c,d
+# endif
+
+.type sha256_block_data_order_armv8,%function
+.align 5
+sha256_block_data_order_armv8:
+.LARMv8:
+ vld1.32 {$ABCD,$EFGH},[$ctx]
+# ifdef __thumb2__
+ adr $Ktbl,.LARMv8
+ sub $Ktbl,$Ktbl,#.LARMv8-K256
+# else
+ adrl $Ktbl,K256
+# endif
+ add $len,$inp,$len,lsl#6 @ len to point at the end of inp
+
+.Loop_v8:
+ vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
+ vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
+ vld1.32 {$W0},[$Ktbl]!
+ vrev32.8 @MSG[0],@MSG[0]
+ vrev32.8 @MSG[1],@MSG[1]
+ vrev32.8 @MSG[2],@MSG[2]
+ vrev32.8 @MSG[3],@MSG[3]
+ vmov $ABCD_SAVE,$ABCD @ offload
+ vmov $EFGH_SAVE,$EFGH
+ teq $inp,$len
+___
+for($i=0;$i<12;$i++) {
+$code.=<<___;
+ vld1.32 {$W1},[$Ktbl]!
+ vadd.i32 $W0,$W0,@MSG[0]
+ sha256su0 @MSG[0],@MSG[1]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+ sha256su1 @MSG[0],@MSG[2],@MSG[3]
+___
+ ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+ vld1.32 {$W1},[$Ktbl]!
+ vadd.i32 $W0,$W0,@MSG[0]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+
+ vld1.32 {$W0},[$Ktbl]!
+ vadd.i32 $W1,$W1,@MSG[1]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W1
+ sha256h2 $EFGH,$abcd,$W1
+
+ vld1.32 {$W1},[$Ktbl]
+ vadd.i32 $W0,$W0,@MSG[2]
+ sub $Ktbl,$Ktbl,#256-16 @ rewind
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+
+ vadd.i32 $W1,$W1,@MSG[3]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W1
+ sha256h2 $EFGH,$abcd,$W1
+
+ vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
+ vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
+ it ne
+ bne .Loop_v8
+
+ vst1.32 {$ABCD,$EFGH},[$ctx]
+
+ ret @ bx lr
+.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
+#endif
+___
+}}}
+$code.=<<___;
+.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm OPENSSL_armcap_P,4,4
+#endif
+___
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/@/ and !/^$/);
+ print;
+}
+close SELF;
+
+{ my %opcode = (
+ "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
+ "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );
+
+ sub unsha256 {
+ my ($mnemonic,$arg)=@_;
+
+ if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
+ my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+ |(($2&7)<<17)|(($2&8)<<4)
+ |(($3&7)<<1) |(($3&8)<<2);
+ # since ARMv7 instructions are always encoded little-endian.
+ # correct solution is to use .inst directive, but older
+ # assemblers don't implement it:-(
+ sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
+ $word&0xff,($word>>8)&0xff,
+ ($word>>16)&0xff,($word>>24)&0xff,
+ $mnemonic,$arg;
+ }
+ }
+}
+
+foreach (split($/,$code)) {
+
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
+
+ s/\bret\b/bx lr/go or
+ s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
+
+ print $_,"\n";
+}
+
+close STDOUT; # enforce flush
diff --git a/arch/arm/crypto/sha256-core.S_shipped b/arch/arm/crypto/sha256-core.S_shipped
new file mode 100644
index 000000000000..555a1a8eec90
--- /dev/null
+++ b/arch/arm/crypto/sha256-core.S_shipped
@@ -0,0 +1,2808 @@
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Permission to use under GPL terms is granted.
+@ ====================================================================
+
+@ SHA256 block procedure for ARMv4. May 2007.
+
+@ Performance is ~2x better than gcc 3.4 generated code and in "abso-
+@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
+@ byte [on single-issue Xscale PXA250 core].
+
+@ July 2010.
+@
+@ Rescheduling for dual-issue pipeline resulted in 22% improvement on
+@ Cortex A8 core and ~20 cycles per processed byte.
+
+@ February 2011.
+@
+@ Profiler-assisted and platform-specific optimization resulted in 16%
+@ improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+@ September 2013.
+@
+@ Add NEON implementation. On Cortex A8 it was measured to process one
+@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+@ code (meaning that latter performs sub-optimally, nothing was done
+@ about it).
+
+@ May 2014.
+@
+@ Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+.text
+#if __ARM_ARCH__<7
+.code 32
+#else
+.syntax unified
+# ifdef __thumb2__
+# define adrl adr
+.thumb
+# else
+.code 32
+# endif
+#endif
+
+.type K256,%object
+.align 5
+K256:
+.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size K256,.-K256
+.word 0 @ terminator
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-sha256_block_data_order
+#endif
+.align 5
+
+.global sha256_block_data_order
+.type sha256_block_data_order,%function
+sha256_block_data_order:
+#if __ARM_ARCH__<7
+ sub r3,pc,#8 @ sha256_block_data_order
+#else
+ adr r3,sha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+ ldr r12,.LOPENSSL_armcap
+ ldr r12,[r3,r12] @ OPENSSL_armcap_P
+ tst r12,#ARMV8_SHA256
+ bne .LARMv8
+ tst r12,#ARMV7_NEON
+ bne .LNEON
+#endif
+ add r2,r1,r2,lsl#6 @ len to point at the end of inp
+ stmdb sp!,{r0,r1,r2,r4-r11,lr}
+ ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11}
+ sub r14,r3,#256+32 @ K256
+ sub sp,sp,#16*4 @ alloca(X[16])
+.Loop:
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ magic
+ eor r12,r12,r12
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 0
+# if 0==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 0
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 0==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r8,r8,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r11,r11,r2 @ h+=X[i]
+ str r2,[sp,#0*4]
+ eor r2,r9,r10
+ add r11,r11,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r8
+ add r11,r11,r12 @ h+=K256[i]
+ eor r2,r2,r10 @ Ch(e,f,g)
+ eor r0,r4,r4,ror#11
+ add r11,r11,r2 @ h+=Ch(e,f,g)
+#if 0==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 0<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r4,r5 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#2*4] @ from future BODY_16_xx
+ eor r12,r4,r5 @ a^b, b^c in next round
+ ldr r1,[sp,#15*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r4,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r7,r7,r11 @ d+=h
+ eor r3,r3,r5 @ Maj(a,b,c)
+ add r11,r11,r0,ror#2 @ h+=Sigma0(a)
+ @ add r11,r11,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 1
+# if 1==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 1
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 1==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r7,r7,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r10,r10,r2 @ h+=X[i]
+ str r2,[sp,#1*4]
+ eor r2,r8,r9
+ add r10,r10,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r7
+ add r10,r10,r3 @ h+=K256[i]
+ eor r2,r2,r9 @ Ch(e,f,g)
+ eor r0,r11,r11,ror#11
+ add r10,r10,r2 @ h+=Ch(e,f,g)
+#if 1==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 1<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r11,r4 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#3*4] @ from future BODY_16_xx
+ eor r3,r11,r4 @ a^b, b^c in next round
+ ldr r1,[sp,#0*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r11,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r6,r6,r10 @ d+=h
+ eor r12,r12,r4 @ Maj(a,b,c)
+ add r10,r10,r0,ror#2 @ h+=Sigma0(a)
+ @ add r10,r10,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 2
+# if 2==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 2
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 2==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r6,r6,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r9,r9,r2 @ h+=X[i]
+ str r2,[sp,#2*4]
+ eor r2,r7,r8
+ add r9,r9,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r6
+ add r9,r9,r12 @ h+=K256[i]
+ eor r2,r2,r8 @ Ch(e,f,g)
+ eor r0,r10,r10,ror#11
+ add r9,r9,r2 @ h+=Ch(e,f,g)
+#if 2==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 2<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r10,r11 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#4*4] @ from future BODY_16_xx
+ eor r12,r10,r11 @ a^b, b^c in next round
+ ldr r1,[sp,#1*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r10,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r5,r5,r9 @ d+=h
+ eor r3,r3,r11 @ Maj(a,b,c)
+ add r9,r9,r0,ror#2 @ h+=Sigma0(a)
+ @ add r9,r9,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 3
+# if 3==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 3
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 3==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r5,r5,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r8,r8,r2 @ h+=X[i]
+ str r2,[sp,#3*4]
+ eor r2,r6,r7
+ add r8,r8,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r5
+ add r8,r8,r3 @ h+=K256[i]
+ eor r2,r2,r7 @ Ch(e,f,g)
+ eor r0,r9,r9,ror#11
+ add r8,r8,r2 @ h+=Ch(e,f,g)
+#if 3==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 3<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r9,r10 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#5*4] @ from future BODY_16_xx
+ eor r3,r9,r10 @ a^b, b^c in next round
+ ldr r1,[sp,#2*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r9,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r4,r4,r8 @ d+=h
+ eor r12,r12,r10 @ Maj(a,b,c)
+ add r8,r8,r0,ror#2 @ h+=Sigma0(a)
+ @ add r8,r8,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 4
+# if 4==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 4
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 4==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r4,r4,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r7,r7,r2 @ h+=X[i]
+ str r2,[sp,#4*4]
+ eor r2,r5,r6
+ add r7,r7,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r4
+ add r7,r7,r12 @ h+=K256[i]
+ eor r2,r2,r6 @ Ch(e,f,g)
+ eor r0,r8,r8,ror#11
+ add r7,r7,r2 @ h+=Ch(e,f,g)
+#if 4==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 4<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r8,r9 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#6*4] @ from future BODY_16_xx
+ eor r12,r8,r9 @ a^b, b^c in next round
+ ldr r1,[sp,#3*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r8,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r11,r11,r7 @ d+=h
+ eor r3,r3,r9 @ Maj(a,b,c)
+ add r7,r7,r0,ror#2 @ h+=Sigma0(a)
+ @ add r7,r7,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 5
+# if 5==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 5
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 5==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r11,r11,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r6,r6,r2 @ h+=X[i]
+ str r2,[sp,#5*4]
+ eor r2,r4,r5
+ add r6,r6,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r11
+ add r6,r6,r3 @ h+=K256[i]
+ eor r2,r2,r5 @ Ch(e,f,g)
+ eor r0,r7,r7,ror#11
+ add r6,r6,r2 @ h+=Ch(e,f,g)
+#if 5==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 5<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r7,r8 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#7*4] @ from future BODY_16_xx
+ eor r3,r7,r8 @ a^b, b^c in next round
+ ldr r1,[sp,#4*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r7,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r10,r10,r6 @ d+=h
+ eor r12,r12,r8 @ Maj(a,b,c)
+ add r6,r6,r0,ror#2 @ h+=Sigma0(a)
+ @ add r6,r6,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 6
+# if 6==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 6
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 6==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r10,r10,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r5,r5,r2 @ h+=X[i]
+ str r2,[sp,#6*4]
+ eor r2,r11,r4
+ add r5,r5,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r10
+ add r5,r5,r12 @ h+=K256[i]
+ eor r2,r2,r4 @ Ch(e,f,g)
+ eor r0,r6,r6,ror#11
+ add r5,r5,r2 @ h+=Ch(e,f,g)
+#if 6==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 6<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r6,r7 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#8*4] @ from future BODY_16_xx
+ eor r12,r6,r7 @ a^b, b^c in next round
+ ldr r1,[sp,#5*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r6,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r9,r9,r5 @ d+=h
+ eor r3,r3,r7 @ Maj(a,b,c)
+ add r5,r5,r0,ror#2 @ h+=Sigma0(a)
+ @ add r5,r5,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 7
+# if 7==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 7
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 7==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r9,r9,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r4,r4,r2 @ h+=X[i]
+ str r2,[sp,#7*4]
+ eor r2,r10,r11
+ add r4,r4,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r9
+ add r4,r4,r3 @ h+=K256[i]
+ eor r2,r2,r11 @ Ch(e,f,g)
+ eor r0,r5,r5,ror#11
+ add r4,r4,r2 @ h+=Ch(e,f,g)
+#if 7==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 7<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#9*4] @ from future BODY_16_xx
+ eor r3,r5,r6 @ a^b, b^c in next round
+ ldr r1,[sp,#6*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r5,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r8,r8,r4 @ d+=h
+ eor r12,r12,r6 @ Maj(a,b,c)
+ add r4,r4,r0,ror#2 @ h+=Sigma0(a)
+ @ add r4,r4,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 8
+# if 8==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 8
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 8==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r8,r8,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r11,r11,r2 @ h+=X[i]
+ str r2,[sp,#8*4]
+ eor r2,r9,r10
+ add r11,r11,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r8
+ add r11,r11,r12 @ h+=K256[i]
+ eor r2,r2,r10 @ Ch(e,f,g)
+ eor r0,r4,r4,ror#11
+ add r11,r11,r2 @ h+=Ch(e,f,g)
+#if 8==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 8<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r4,r5 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#10*4] @ from future BODY_16_xx
+ eor r12,r4,r5 @ a^b, b^c in next round
+ ldr r1,[sp,#7*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r4,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r7,r7,r11 @ d+=h
+ eor r3,r3,r5 @ Maj(a,b,c)
+ add r11,r11,r0,ror#2 @ h+=Sigma0(a)
+ @ add r11,r11,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 9
+# if 9==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 9
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 9==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r7,r7,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r10,r10,r2 @ h+=X[i]
+ str r2,[sp,#9*4]
+ eor r2,r8,r9
+ add r10,r10,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r7
+ add r10,r10,r3 @ h+=K256[i]
+ eor r2,r2,r9 @ Ch(e,f,g)
+ eor r0,r11,r11,ror#11
+ add r10,r10,r2 @ h+=Ch(e,f,g)
+#if 9==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 9<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r11,r4 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#11*4] @ from future BODY_16_xx
+ eor r3,r11,r4 @ a^b, b^c in next round
+ ldr r1,[sp,#8*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r11,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r6,r6,r10 @ d+=h
+ eor r12,r12,r4 @ Maj(a,b,c)
+ add r10,r10,r0,ror#2 @ h+=Sigma0(a)
+ @ add r10,r10,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 10
+# if 10==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 10
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 10==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r6,r6,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r9,r9,r2 @ h+=X[i]
+ str r2,[sp,#10*4]
+ eor r2,r7,r8
+ add r9,r9,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r6
+ add r9,r9,r12 @ h+=K256[i]
+ eor r2,r2,r8 @ Ch(e,f,g)
+ eor r0,r10,r10,ror#11
+ add r9,r9,r2 @ h+=Ch(e,f,g)
+#if 10==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 10<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r10,r11 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#12*4] @ from future BODY_16_xx
+ eor r12,r10,r11 @ a^b, b^c in next round
+ ldr r1,[sp,#9*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r10,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r5,r5,r9 @ d+=h
+ eor r3,r3,r11 @ Maj(a,b,c)
+ add r9,r9,r0,ror#2 @ h+=Sigma0(a)
+ @ add r9,r9,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 11
+# if 11==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 11
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 11==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r5,r5,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r8,r8,r2 @ h+=X[i]
+ str r2,[sp,#11*4]
+ eor r2,r6,r7
+ add r8,r8,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r5
+ add r8,r8,r3 @ h+=K256[i]
+ eor r2,r2,r7 @ Ch(e,f,g)
+ eor r0,r9,r9,ror#11
+ add r8,r8,r2 @ h+=Ch(e,f,g)
+#if 11==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 11<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r9,r10 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#13*4] @ from future BODY_16_xx
+ eor r3,r9,r10 @ a^b, b^c in next round
+ ldr r1,[sp,#10*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r9,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r4,r4,r8 @ d+=h
+ eor r12,r12,r10 @ Maj(a,b,c)
+ add r8,r8,r0,ror#2 @ h+=Sigma0(a)
+ @ add r8,r8,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 12
+# if 12==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 12
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 12==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r4,r4,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r7,r7,r2 @ h+=X[i]
+ str r2,[sp,#12*4]
+ eor r2,r5,r6
+ add r7,r7,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r4
+ add r7,r7,r12 @ h+=K256[i]
+ eor r2,r2,r6 @ Ch(e,f,g)
+ eor r0,r8,r8,ror#11
+ add r7,r7,r2 @ h+=Ch(e,f,g)
+#if 12==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 12<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r8,r9 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#14*4] @ from future BODY_16_xx
+ eor r12,r8,r9 @ a^b, b^c in next round
+ ldr r1,[sp,#11*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r8,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r11,r11,r7 @ d+=h
+ eor r3,r3,r9 @ Maj(a,b,c)
+ add r7,r7,r0,ror#2 @ h+=Sigma0(a)
+ @ add r7,r7,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 13
+# if 13==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 13
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 13==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r11,r11,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r6,r6,r2 @ h+=X[i]
+ str r2,[sp,#13*4]
+ eor r2,r4,r5
+ add r6,r6,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r11
+ add r6,r6,r3 @ h+=K256[i]
+ eor r2,r2,r5 @ Ch(e,f,g)
+ eor r0,r7,r7,ror#11
+ add r6,r6,r2 @ h+=Ch(e,f,g)
+#if 13==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 13<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r7,r8 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#15*4] @ from future BODY_16_xx
+ eor r3,r7,r8 @ a^b, b^c in next round
+ ldr r1,[sp,#12*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r7,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r10,r10,r6 @ d+=h
+ eor r12,r12,r8 @ Maj(a,b,c)
+ add r6,r6,r0,ror#2 @ h+=Sigma0(a)
+ @ add r6,r6,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 14
+# if 14==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 14
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 14==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r10,r10,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r5,r5,r2 @ h+=X[i]
+ str r2,[sp,#14*4]
+ eor r2,r11,r4
+ add r5,r5,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r10
+ add r5,r5,r12 @ h+=K256[i]
+ eor r2,r2,r4 @ Ch(e,f,g)
+ eor r0,r6,r6,ror#11
+ add r5,r5,r2 @ h+=Ch(e,f,g)
+#if 14==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 14<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r6,r7 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#0*4] @ from future BODY_16_xx
+ eor r12,r6,r7 @ a^b, b^c in next round
+ ldr r1,[sp,#13*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r6,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r9,r9,r5 @ d+=h
+ eor r3,r3,r7 @ Maj(a,b,c)
+ add r5,r5,r0,ror#2 @ h+=Sigma0(a)
+ @ add r5,r5,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 15
+# if 15==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 15
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 15==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r9,r9,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r4,r4,r2 @ h+=X[i]
+ str r2,[sp,#15*4]
+ eor r2,r10,r11
+ add r4,r4,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r9
+ add r4,r4,r3 @ h+=K256[i]
+ eor r2,r2,r11 @ Ch(e,f,g)
+ eor r0,r5,r5,ror#11
+ add r4,r4,r2 @ h+=Ch(e,f,g)
+#if 15==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 15<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#1*4] @ from future BODY_16_xx
+ eor r3,r5,r6 @ a^b, b^c in next round
+ ldr r1,[sp,#14*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r5,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r8,r8,r4 @ d+=h
+ eor r12,r12,r6 @ Maj(a,b,c)
+ add r4,r4,r0,ror#2 @ h+=Sigma0(a)
+ @ add r4,r4,r12 @ h+=Maj(a,b,c)
+.Lrounds_16_xx:
+ @ ldr r2,[sp,#1*4] @ 16
+ @ ldr r1,[sp,#14*4]
+ mov r0,r2,ror#7
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#0*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#9*4]
+
+ add r12,r12,r0
+ eor r0,r8,r8,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r11,r11,r2 @ h+=X[i]
+ str r2,[sp,#0*4]
+ eor r2,r9,r10
+ add r11,r11,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r8
+ add r11,r11,r12 @ h+=K256[i]
+ eor r2,r2,r10 @ Ch(e,f,g)
+ eor r0,r4,r4,ror#11
+ add r11,r11,r2 @ h+=Ch(e,f,g)
+#if 16==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 16<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r4,r5 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#2*4] @ from future BODY_16_xx
+ eor r12,r4,r5 @ a^b, b^c in next round
+ ldr r1,[sp,#15*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r4,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r7,r7,r11 @ d+=h
+ eor r3,r3,r5 @ Maj(a,b,c)
+ add r11,r11,r0,ror#2 @ h+=Sigma0(a)
+ @ add r11,r11,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#2*4] @ 17
+ @ ldr r1,[sp,#15*4]
+ mov r0,r2,ror#7
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#1*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#10*4]
+
+ add r3,r3,r0
+ eor r0,r7,r7,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r10,r10,r2 @ h+=X[i]
+ str r2,[sp,#1*4]
+ eor r2,r8,r9
+ add r10,r10,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r7
+ add r10,r10,r3 @ h+=K256[i]
+ eor r2,r2,r9 @ Ch(e,f,g)
+ eor r0,r11,r11,ror#11
+ add r10,r10,r2 @ h+=Ch(e,f,g)
+#if 17==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 17<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r11,r4 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#3*4] @ from future BODY_16_xx
+ eor r3,r11,r4 @ a^b, b^c in next round
+ ldr r1,[sp,#0*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r11,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r6,r6,r10 @ d+=h
+ eor r12,r12,r4 @ Maj(a,b,c)
+ add r10,r10,r0,ror#2 @ h+=Sigma0(a)
+ @ add r10,r10,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#3*4] @ 18
+ @ ldr r1,[sp,#0*4]
+ mov r0,r2,ror#7
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#2*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#11*4]
+
+ add r12,r12,r0
+ eor r0,r6,r6,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r9,r9,r2 @ h+=X[i]
+ str r2,[sp,#2*4]
+ eor r2,r7,r8
+ add r9,r9,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r6
+ add r9,r9,r12 @ h+=K256[i]
+ eor r2,r2,r8 @ Ch(e,f,g)
+ eor r0,r10,r10,ror#11
+ add r9,r9,r2 @ h+=Ch(e,f,g)
+#if 18==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 18<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r10,r11 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#4*4] @ from future BODY_16_xx
+ eor r12,r10,r11 @ a^b, b^c in next round
+ ldr r1,[sp,#1*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r10,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r5,r5,r9 @ d+=h
+ eor r3,r3,r11 @ Maj(a,b,c)
+ add r9,r9,r0,ror#2 @ h+=Sigma0(a)
+ @ add r9,r9,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#4*4] @ 19
+ @ ldr r1,[sp,#1*4]
+ mov r0,r2,ror#7
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#3*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#12*4]
+
+ add r3,r3,r0
+ eor r0,r5,r5,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r8,r8,r2 @ h+=X[i]
+ str r2,[sp,#3*4]
+ eor r2,r6,r7
+ add r8,r8,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r5
+ add r8,r8,r3 @ h+=K256[i]
+ eor r2,r2,r7 @ Ch(e,f,g)
+ eor r0,r9,r9,ror#11
+ add r8,r8,r2 @ h+=Ch(e,f,g)
+#if 19==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 19<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r9,r10 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#5*4] @ from future BODY_16_xx
+ eor r3,r9,r10 @ a^b, b^c in next round
+ ldr r1,[sp,#2*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r9,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r4,r4,r8 @ d+=h
+ eor r12,r12,r10 @ Maj(a,b,c)
+ add r8,r8,r0,ror#2 @ h+=Sigma0(a)
+ @ add r8,r8,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#5*4] @ 20
+ @ ldr r1,[sp,#2*4]
+ mov r0,r2,ror#7
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#4*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#13*4]
+
+ add r12,r12,r0
+ eor r0,r4,r4,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r7,r7,r2 @ h+=X[i]
+ str r2,[sp,#4*4]
+ eor r2,r5,r6
+ add r7,r7,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r4
+ add r7,r7,r12 @ h+=K256[i]
+ eor r2,r2,r6 @ Ch(e,f,g)
+ eor r0,r8,r8,ror#11
+ add r7,r7,r2 @ h+=Ch(e,f,g)
+#if 20==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 20<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r8,r9 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#6*4] @ from future BODY_16_xx
+ eor r12,r8,r9 @ a^b, b^c in next round
+ ldr r1,[sp,#3*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r8,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r11,r11,r7 @ d+=h
+ eor r3,r3,r9 @ Maj(a,b,c)
+ add r7,r7,r0,ror#2 @ h+=Sigma0(a)
+ @ add r7,r7,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#6*4] @ 21
+ @ ldr r1,[sp,#3*4]
+ mov r0,r2,ror#7
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#5*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#14*4]
+
+ add r3,r3,r0
+ eor r0,r11,r11,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r6,r6,r2 @ h+=X[i]
+ str r2,[sp,#5*4]
+ eor r2,r4,r5
+ add r6,r6,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r11
+ add r6,r6,r3 @ h+=K256[i]
+ eor r2,r2,r5 @ Ch(e,f,g)
+ eor r0,r7,r7,ror#11
+ add r6,r6,r2 @ h+=Ch(e,f,g)
+#if 21==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 21<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r7,r8 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#7*4] @ from future BODY_16_xx
+ eor r3,r7,r8 @ a^b, b^c in next round
+ ldr r1,[sp,#4*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r7,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r10,r10,r6 @ d+=h
+ eor r12,r12,r8 @ Maj(a,b,c)
+ add r6,r6,r0,ror#2 @ h+=Sigma0(a)
+ @ add r6,r6,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#7*4] @ 22
+ @ ldr r1,[sp,#4*4]
+ mov r0,r2,ror#7
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#6*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#15*4]
+
+ add r12,r12,r0
+ eor r0,r10,r10,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r5,r5,r2 @ h+=X[i]
+ str r2,[sp,#6*4]
+ eor r2,r11,r4
+ add r5,r5,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r10
+ add r5,r5,r12 @ h+=K256[i]
+ eor r2,r2,r4 @ Ch(e,f,g)
+ eor r0,r6,r6,ror#11
+ add r5,r5,r2 @ h+=Ch(e,f,g)
+#if 22==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 22<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r6,r7 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#8*4] @ from future BODY_16_xx
+ eor r12,r6,r7 @ a^b, b^c in next round
+ ldr r1,[sp,#5*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r6,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r9,r9,r5 @ d+=h
+ eor r3,r3,r7 @ Maj(a,b,c)
+ add r5,r5,r0,ror#2 @ h+=Sigma0(a)
+ @ add r5,r5,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#8*4] @ 23
+ @ ldr r1,[sp,#5*4]
+ mov r0,r2,ror#7
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#7*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#0*4]
+
+ add r3,r3,r0
+ eor r0,r9,r9,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r4,r4,r2 @ h+=X[i]
+ str r2,[sp,#7*4]
+ eor r2,r10,r11
+ add r4,r4,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r9
+ add r4,r4,r3 @ h+=K256[i]
+ eor r2,r2,r11 @ Ch(e,f,g)
+ eor r0,r5,r5,ror#11
+ add r4,r4,r2 @ h+=Ch(e,f,g)
+#if 23==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 23<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#9*4] @ from future BODY_16_xx
+ eor r3,r5,r6 @ a^b, b^c in next round
+ ldr r1,[sp,#6*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r5,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r8,r8,r4 @ d+=h
+ eor r12,r12,r6 @ Maj(a,b,c)
+ add r4,r4,r0,ror#2 @ h+=Sigma0(a)
+ @ add r4,r4,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#9*4] @ 24
+ @ ldr r1,[sp,#6*4]
+ mov r0,r2,ror#7
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#8*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#1*4]
+
+ add r12,r12,r0
+ eor r0,r8,r8,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r11,r11,r2 @ h+=X[i]
+ str r2,[sp,#8*4]
+ eor r2,r9,r10
+ add r11,r11,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r8
+ add r11,r11,r12 @ h+=K256[i]
+ eor r2,r2,r10 @ Ch(e,f,g)
+ eor r0,r4,r4,ror#11
+ add r11,r11,r2 @ h+=Ch(e,f,g)
+#if 24==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 24<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r4,r5 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#10*4] @ from future BODY_16_xx
+ eor r12,r4,r5 @ a^b, b^c in next round
+ ldr r1,[sp,#7*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r4,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r7,r7,r11 @ d+=h
+ eor r3,r3,r5 @ Maj(a,b,c)
+ add r11,r11,r0,ror#2 @ h+=Sigma0(a)
+ @ add r11,r11,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#10*4] @ 25
+ @ ldr r1,[sp,#7*4]
+ mov r0,r2,ror#7
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#9*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#2*4]
+
+ add r3,r3,r0
+ eor r0,r7,r7,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r10,r10,r2 @ h+=X[i]
+ str r2,[sp,#9*4]
+ eor r2,r8,r9
+ add r10,r10,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r7
+ add r10,r10,r3 @ h+=K256[i]
+ eor r2,r2,r9 @ Ch(e,f,g)
+ eor r0,r11,r11,ror#11
+ add r10,r10,r2 @ h+=Ch(e,f,g)
+#if 25==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 25<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r11,r4 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#11*4] @ from future BODY_16_xx
+ eor r3,r11,r4 @ a^b, b^c in next round
+ ldr r1,[sp,#8*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r11,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r6,r6,r10 @ d+=h
+ eor r12,r12,r4 @ Maj(a,b,c)
+ add r10,r10,r0,ror#2 @ h+=Sigma0(a)
+ @ add r10,r10,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#11*4] @ 26
+ @ ldr r1,[sp,#8*4]
+ mov r0,r2,ror#7
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#10*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#3*4]
+
+ add r12,r12,r0
+ eor r0,r6,r6,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r9,r9,r2 @ h+=X[i]
+ str r2,[sp,#10*4]
+ eor r2,r7,r8
+ add r9,r9,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r6
+ add r9,r9,r12 @ h+=K256[i]
+ eor r2,r2,r8 @ Ch(e,f,g)
+ eor r0,r10,r10,ror#11
+ add r9,r9,r2 @ h+=Ch(e,f,g)
+#if 26==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 26<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r10,r11 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#12*4] @ from future BODY_16_xx
+ eor r12,r10,r11 @ a^b, b^c in next round
+ ldr r1,[sp,#9*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r10,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r5,r5,r9 @ d+=h
+ eor r3,r3,r11 @ Maj(a,b,c)
+ add r9,r9,r0,ror#2 @ h+=Sigma0(a)
+ @ add r9,r9,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#12*4] @ 27
+ @ ldr r1,[sp,#9*4]
+ mov r0,r2,ror#7
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#11*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#4*4]
+
+ add r3,r3,r0
+ eor r0,r5,r5,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r8,r8,r2 @ h+=X[i]
+ str r2,[sp,#11*4]
+ eor r2,r6,r7
+ add r8,r8,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r5
+ add r8,r8,r3 @ h+=K256[i]
+ eor r2,r2,r7 @ Ch(e,f,g)
+ eor r0,r9,r9,ror#11
+ add r8,r8,r2 @ h+=Ch(e,f,g)
+#if 27==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 27<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r9,r10 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#13*4] @ from future BODY_16_xx
+ eor r3,r9,r10 @ a^b, b^c in next round
+ ldr r1,[sp,#10*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r9,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r4,r4,r8 @ d+=h
+ eor r12,r12,r10 @ Maj(a,b,c)
+ add r8,r8,r0,ror#2 @ h+=Sigma0(a)
+ @ add r8,r8,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#13*4] @ 28
+ @ ldr r1,[sp,#10*4]
+ mov r0,r2,ror#7
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#12*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#5*4]
+
+ add r12,r12,r0
+ eor r0,r4,r4,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r7,r7,r2 @ h+=X[i]
+ str r2,[sp,#12*4]
+ eor r2,r5,r6
+ add r7,r7,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r4
+ add r7,r7,r12 @ h+=K256[i]
+ eor r2,r2,r6 @ Ch(e,f,g)
+ eor r0,r8,r8,ror#11
+ add r7,r7,r2 @ h+=Ch(e,f,g)
+#if 28==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 28<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r8,r9 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#14*4] @ from future BODY_16_xx
+ eor r12,r8,r9 @ a^b, b^c in next round
+ ldr r1,[sp,#11*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r8,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r11,r11,r7 @ d+=h
+ eor r3,r3,r9 @ Maj(a,b,c)
+ add r7,r7,r0,ror#2 @ h+=Sigma0(a)
+ @ add r7,r7,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#14*4] @ 29
+ @ ldr r1,[sp,#11*4]
+ mov r0,r2,ror#7
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#13*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#6*4]
+
+ add r3,r3,r0
+ eor r0,r11,r11,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r6,r6,r2 @ h+=X[i]
+ str r2,[sp,#13*4]
+ eor r2,r4,r5
+ add r6,r6,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r11
+ add r6,r6,r3 @ h+=K256[i]
+ eor r2,r2,r5 @ Ch(e,f,g)
+ eor r0,r7,r7,ror#11
+ add r6,r6,r2 @ h+=Ch(e,f,g)
+#if 29==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 29<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r7,r8 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#15*4] @ from future BODY_16_xx
+ eor r3,r7,r8 @ a^b, b^c in next round
+ ldr r1,[sp,#12*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r7,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r10,r10,r6 @ d+=h
+ eor r12,r12,r8 @ Maj(a,b,c)
+ add r6,r6,r0,ror#2 @ h+=Sigma0(a)
+ @ add r6,r6,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#15*4] @ 30
+ @ ldr r1,[sp,#12*4]
+ mov r0,r2,ror#7
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#14*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#7*4]
+
+ add r12,r12,r0
+ eor r0,r10,r10,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r5,r5,r2 @ h+=X[i]
+ str r2,[sp,#14*4]
+ eor r2,r11,r4
+ add r5,r5,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r10
+ add r5,r5,r12 @ h+=K256[i]
+ eor r2,r2,r4 @ Ch(e,f,g)
+ eor r0,r6,r6,ror#11
+ add r5,r5,r2 @ h+=Ch(e,f,g)
+#if 30==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 30<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r6,r7 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#0*4] @ from future BODY_16_xx
+ eor r12,r6,r7 @ a^b, b^c in next round
+ ldr r1,[sp,#13*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r6,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r9,r9,r5 @ d+=h
+ eor r3,r3,r7 @ Maj(a,b,c)
+ add r5,r5,r0,ror#2 @ h+=Sigma0(a)
+ @ add r5,r5,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#0*4] @ 31
+ @ ldr r1,[sp,#13*4]
+ mov r0,r2,ror#7
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#15*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#8*4]
+
+ add r3,r3,r0
+ eor r0,r9,r9,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r4,r4,r2 @ h+=X[i]
+ str r2,[sp,#15*4]
+ eor r2,r10,r11
+ add r4,r4,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r9
+ add r4,r4,r3 @ h+=K256[i]
+ eor r2,r2,r11 @ Ch(e,f,g)
+ eor r0,r5,r5,ror#11
+ add r4,r4,r2 @ h+=Ch(e,f,g)
+#if 31==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 31<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#1*4] @ from future BODY_16_xx
+ eor r3,r5,r6 @ a^b, b^c in next round
+ ldr r1,[sp,#14*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r5,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r8,r8,r4 @ d+=h
+ eor r12,r12,r6 @ Maj(a,b,c)
+ add r4,r4,r0,ror#2 @ h+=Sigma0(a)
+ @ add r4,r4,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ ite eq @ Thumb2 thing, sanity check in ARM
+#endif
+ ldreq r3,[sp,#16*4] @ pull ctx
+ bne .Lrounds_16_xx
+
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldr r0,[r3,#0]
+ ldr r2,[r3,#4]
+ ldr r12,[r3,#8]
+ add r4,r4,r0
+ ldr r0,[r3,#12]
+ add r5,r5,r2
+ ldr r2,[r3,#16]
+ add r6,r6,r12
+ ldr r12,[r3,#20]
+ add r7,r7,r0
+ ldr r0,[r3,#24]
+ add r8,r8,r2
+ ldr r2,[r3,#28]
+ add r9,r9,r12
+ ldr r1,[sp,#17*4] @ pull inp
+ ldr r12,[sp,#18*4] @ pull inp+len
+ add r10,r10,r0
+ add r11,r11,r2
+ stmia r3,{r4,r5,r6,r7,r8,r9,r10,r11}
+ cmp r1,r12
+ sub r14,r14,#256 @ rewind Ktbl
+ bne .Loop
+
+ add sp,sp,#19*4 @ destroy frame
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r11,pc}
+#else
+ ldmia sp!,{r4-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ .word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+.size sha256_block_data_order,.-sha256_block_data_order
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.global sha256_block_data_order_neon
+.type sha256_block_data_order_neon,%function
+.align 4
+sha256_block_data_order_neon:
+.LNEON:
+ stmdb sp!,{r4-r12,lr}
+
+ sub r11,sp,#16*4+16
+ adrl r14,K256
+ bic r11,r11,#15 @ align for 128-bit stores
+ mov r12,sp
+ mov sp,r11 @ alloca
+ add r2,r1,r2,lsl#6 @ len to point at the end of inp
+
+ vld1.8 {q0},[r1]!
+ vld1.8 {q1},[r1]!
+ vld1.8 {q2},[r1]!
+ vld1.8 {q3},[r1]!
+ vld1.32 {q8},[r14,:128]!
+ vld1.32 {q9},[r14,:128]!
+ vld1.32 {q10},[r14,:128]!
+ vld1.32 {q11},[r14,:128]!
+ vrev32.8 q0,q0 @ yes, even on
+ str r0,[sp,#64]
+ vrev32.8 q1,q1 @ big-endian
+ str r1,[sp,#68]
+ mov r1,sp
+ vrev32.8 q2,q2
+ str r2,[sp,#72]
+ vrev32.8 q3,q3
+ str r12,[sp,#76] @ save original sp
+ vadd.i32 q8,q8,q0
+ vadd.i32 q9,q9,q1
+ vst1.32 {q8},[r1,:128]!
+ vadd.i32 q10,q10,q2
+ vst1.32 {q9},[r1,:128]!
+ vadd.i32 q11,q11,q3
+ vst1.32 {q10},[r1,:128]!
+ vst1.32 {q11},[r1,:128]!
+
+ ldmia r0,{r4-r11}
+ sub r1,r1,#64
+ ldr r2,[sp,#0]
+ eor r12,r12,r12
+ eor r3,r5,r6
+ b .L_00_48
+
+.align 4
+.L_00_48:
+ vext.8 q8,q0,q1,#4
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ vext.8 q9,q2,q3,#4
+ add r4,r4,r12
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vadd.i32 q0,q0,q9
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#4]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ veor q9,q9,q10
+ add r10,r10,r2
+ vsli.32 q11,q8,#14
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ vshr.u32 d24,d7,#17
+ add r11,r11,r3
+ and r2,r2,r7
+ veor q9,q9,q11
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ vsli.32 d24,d7,#15
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ vshr.u32 d25,d7,#10
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ vadd.i32 q0,q0,q9
+ add r10,r10,r2
+ ldr r2,[sp,#8]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r6,r6,r10
+ vshr.u32 d24,d7,#19
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ vsli.32 d24,d7,#13
+ add r9,r9,r2
+ eor r2,r7,r8
+ veor d25,d25,d24
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ vadd.i32 d0,d0,d25
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ vshr.u32 d24,d0,#17
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ vsli.32 d24,d0,#15
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ vshr.u32 d25,d0,#10
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#12]
+ and r3,r3,r12
+ vshr.u32 d24,d0,#19
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ vld1.32 {q8},[r14,:128]!
+ add r8,r8,r2
+ vsli.32 d24,d0,#13
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ veor d25,d25,d24
+ add r9,r9,r3
+ and r2,r2,r5
+ vadd.i32 d1,d1,d25
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ vadd.i32 q8,q8,q0
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#16]
+ and r12,r12,r3
+ add r4,r4,r8
+ vst1.32 {q8},[r1,:128]!
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vext.8 q8,q1,q2,#4
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ vext.8 q9,q3,q0,#4
+ add r8,r8,r12
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vadd.i32 q1,q1,q9
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#20]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ veor q9,q9,q10
+ add r6,r6,r2
+ vsli.32 q11,q8,#14
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ vshr.u32 d24,d1,#17
+ add r7,r7,r3
+ and r2,r2,r11
+ veor q9,q9,q11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ vsli.32 d24,d1,#15
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ vshr.u32 d25,d1,#10
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ vadd.i32 q1,q1,q9
+ add r6,r6,r2
+ ldr r2,[sp,#24]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r10,r10,r6
+ vshr.u32 d24,d1,#19
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ vsli.32 d24,d1,#13
+ add r5,r5,r2
+ eor r2,r11,r4
+ veor d25,d25,d24
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ vadd.i32 d2,d2,d25
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ vshr.u32 d24,d2,#17
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ vsli.32 d24,d2,#15
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ vshr.u32 d25,d2,#10
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#28]
+ and r3,r3,r12
+ vshr.u32 d24,d2,#19
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ vld1.32 {q8},[r14,:128]!
+ add r4,r4,r2
+ vsli.32 d24,d2,#13
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ veor d25,d25,d24
+ add r5,r5,r3
+ and r2,r2,r9
+ vadd.i32 d3,d3,d25
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ vadd.i32 q8,q8,q1
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[sp,#32]
+ and r12,r12,r3
+ add r8,r8,r4
+ vst1.32 {q8},[r1,:128]!
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ vext.8 q8,q2,q3,#4
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ vext.8 q9,q0,q1,#4
+ add r4,r4,r12
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vadd.i32 q2,q2,q9
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#36]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ veor q9,q9,q10
+ add r10,r10,r2
+ vsli.32 q11,q8,#14
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ vshr.u32 d24,d3,#17
+ add r11,r11,r3
+ and r2,r2,r7
+ veor q9,q9,q11
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ vsli.32 d24,d3,#15
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ vshr.u32 d25,d3,#10
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ vadd.i32 q2,q2,q9
+ add r10,r10,r2
+ ldr r2,[sp,#40]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r6,r6,r10
+ vshr.u32 d24,d3,#19
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ vsli.32 d24,d3,#13
+ add r9,r9,r2
+ eor r2,r7,r8
+ veor d25,d25,d24
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ vadd.i32 d4,d4,d25
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ vshr.u32 d24,d4,#17
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ vsli.32 d24,d4,#15
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ vshr.u32 d25,d4,#10
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#44]
+ and r3,r3,r12
+ vshr.u32 d24,d4,#19
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ vld1.32 {q8},[r14,:128]!
+ add r8,r8,r2
+ vsli.32 d24,d4,#13
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ veor d25,d25,d24
+ add r9,r9,r3
+ and r2,r2,r5
+ vadd.i32 d5,d5,d25
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ vadd.i32 q8,q8,q2
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#48]
+ and r12,r12,r3
+ add r4,r4,r8
+ vst1.32 {q8},[r1,:128]!
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vext.8 q8,q3,q0,#4
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ vext.8 q9,q1,q2,#4
+ add r8,r8,r12
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vadd.i32 q3,q3,q9
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#52]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ veor q9,q9,q10
+ add r6,r6,r2
+ vsli.32 q11,q8,#14
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ vshr.u32 d24,d5,#17
+ add r7,r7,r3
+ and r2,r2,r11
+ veor q9,q9,q11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ vsli.32 d24,d5,#15
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ vshr.u32 d25,d5,#10
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ vadd.i32 q3,q3,q9
+ add r6,r6,r2
+ ldr r2,[sp,#56]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r10,r10,r6
+ vshr.u32 d24,d5,#19
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ vsli.32 d24,d5,#13
+ add r5,r5,r2
+ eor r2,r11,r4
+ veor d25,d25,d24
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ vadd.i32 d6,d6,d25
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ vshr.u32 d24,d6,#17
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ vsli.32 d24,d6,#15
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ vshr.u32 d25,d6,#10
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#60]
+ and r3,r3,r12
+ vshr.u32 d24,d6,#19
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ vld1.32 {q8},[r14,:128]!
+ add r4,r4,r2
+ vsli.32 d24,d6,#13
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ veor d25,d25,d24
+ add r5,r5,r3
+ and r2,r2,r9
+ vadd.i32 d7,d7,d25
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ vadd.i32 q8,q8,q3
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[r14]
+ and r12,r12,r3
+ add r8,r8,r4
+ vst1.32 {q8},[r1,:128]!
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ teq r2,#0 @ check for K256 terminator
+ ldr r2,[sp,#0]
+ sub r1,r1,#64
+ bne .L_00_48
+
+ ldr r1,[sp,#68]
+ ldr r0,[sp,#72]
+ sub r14,r14,#256 @ rewind r14
+ teq r1,r0
+ it eq
+ subeq r1,r1,#64 @ avoid SEGV
+ vld1.8 {q0},[r1]! @ load next input block
+ vld1.8 {q1},[r1]!
+ vld1.8 {q2},[r1]!
+ vld1.8 {q3},[r1]!
+ it ne
+ strne r1,[sp,#68]
+ mov r1,sp
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vrev32.8 q0,q0
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vadd.i32 q8,q8,q0
+ ldr r2,[sp,#4]
+ and r3,r3,r12
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ add r10,r10,r2
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3
+ and r2,r2,r7
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ add r10,r10,r2
+ ldr r2,[sp,#8]
+ and r12,r12,r3
+ add r6,r6,r10
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ add r9,r9,r2
+ eor r2,r7,r8
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ ldr r2,[sp,#12]
+ and r3,r3,r12
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ add r8,r8,r2
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3
+ and r2,r2,r5
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#16]
+ and r12,r12,r3
+ add r4,r4,r8
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vst1.32 {q8},[r1,:128]!
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vrev32.8 q1,q1
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vadd.i32 q8,q8,q1
+ ldr r2,[sp,#20]
+ and r3,r3,r12
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ add r6,r6,r2
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3
+ and r2,r2,r11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ add r6,r6,r2
+ ldr r2,[sp,#24]
+ and r12,r12,r3
+ add r10,r10,r6
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ add r5,r5,r2
+ eor r2,r11,r4
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ ldr r2,[sp,#28]
+ and r3,r3,r12
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ add r4,r4,r2
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3
+ and r2,r2,r9
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[sp,#32]
+ and r12,r12,r3
+ add r8,r8,r4
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ vst1.32 {q8},[r1,:128]!
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vrev32.8 q2,q2
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vadd.i32 q8,q8,q2
+ ldr r2,[sp,#36]
+ and r3,r3,r12
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ add r10,r10,r2
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3
+ and r2,r2,r7
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ add r10,r10,r2
+ ldr r2,[sp,#40]
+ and r12,r12,r3
+ add r6,r6,r10
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ add r9,r9,r2
+ eor r2,r7,r8
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ ldr r2,[sp,#44]
+ and r3,r3,r12
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ add r8,r8,r2
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3
+ and r2,r2,r5
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#48]
+ and r12,r12,r3
+ add r4,r4,r8
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vst1.32 {q8},[r1,:128]!
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vrev32.8 q3,q3
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vadd.i32 q8,q8,q3
+ ldr r2,[sp,#52]
+ and r3,r3,r12
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ add r6,r6,r2
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3
+ and r2,r2,r11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ add r6,r6,r2
+ ldr r2,[sp,#56]
+ and r12,r12,r3
+ add r10,r10,r6
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ add r5,r5,r2
+ eor r2,r11,r4
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ ldr r2,[sp,#60]
+ and r3,r3,r12
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ add r4,r4,r2
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3
+ and r2,r2,r9
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[sp,#64]
+ and r12,r12,r3
+ add r8,r8,r4
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ vst1.32 {q8},[r1,:128]!
+ ldr r0,[r2,#0]
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldr r12,[r2,#4]
+ ldr r3,[r2,#8]
+ ldr r1,[r2,#12]
+ add r4,r4,r0 @ accumulate
+ ldr r0,[r2,#16]
+ add r5,r5,r12
+ ldr r12,[r2,#20]
+ add r6,r6,r3
+ ldr r3,[r2,#24]
+ add r7,r7,r1
+ ldr r1,[r2,#28]
+ add r8,r8,r0
+ str r4,[r2],#4
+ add r9,r9,r12
+ str r5,[r2],#4
+ add r10,r10,r3
+ str r6,[r2],#4
+ add r11,r11,r1
+ str r7,[r2],#4
+ stmia r2,{r8-r11}
+
+ ittte ne
+ movne r1,sp
+ ldrne r2,[sp,#0]
+ eorne r12,r12,r12
+ ldreq sp,[sp,#76] @ restore original sp
+ itt ne
+ eorne r3,r5,r6
+ bne .L_00_48
+
+ ldmia sp!,{r4-r12,pc}
+.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# ifdef __thumb2__
+# define INST(a,b,c,d) .byte c,d|0xc,a,b
+# else
+# define INST(a,b,c,d) .byte a,b,c,d
+# endif
+
+.type sha256_block_data_order_armv8,%function
+.align 5
+sha256_block_data_order_armv8:
+.LARMv8:
+ vld1.32 {q0,q1},[r0]
+# ifdef __thumb2__
+ adr r3,.LARMv8
+ sub r3,r3,#.LARMv8-K256
+# else
+ adrl r3,K256
+# endif
+ add r2,r1,r2,lsl#6 @ len to point at the end of inp
+
+.Loop_v8:
+ vld1.8 {q8-q9},[r1]!
+ vld1.8 {q10-q11},[r1]!
+ vld1.32 {q12},[r3]!
+ vrev32.8 q8,q8
+ vrev32.8 q9,q9
+ vrev32.8 q10,q10
+ vrev32.8 q11,q11
+ vmov q14,q0 @ offload
+ vmov q15,q1
+ teq r1,r2
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q8
+ INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q9
+ INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q10
+ INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q11
+ INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q8
+ INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q9
+ INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q10
+ INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q11
+ INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q8
+ INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q9
+ INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q10
+ INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q11
+ INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q8
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q9
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+
+ vld1.32 {q13},[r3]
+ vadd.i32 q12,q12,q10
+ sub r3,r3,#256-16 @ rewind
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+
+ vadd.i32 q13,q13,q11
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+
+ vadd.i32 q0,q0,q14
+ vadd.i32 q1,q1,q15
+ it ne
+ bne .Loop_v8
+
+ vst1.32 {q0,q1},[r0]
+
+ bx lr @ bx lr
+.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
+#endif
+.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro@openssl.org>"
+.align 2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm OPENSSL_armcap_P,4,4
+#endif
diff --git a/arch/arm/crypto/sha256_glue.c b/arch/arm/crypto/sha256_glue.c
new file mode 100644
index 000000000000..bb03482e53e2
--- /dev/null
+++ b/arch/arm/crypto/sha256_glue.c
@@ -0,0 +1,246 @@
+/*
+ * Glue code for the SHA256 Secure Hash Algorithm assembly implementation
+ * using optimized ARM assembler and NEON instructions.
+ *
+ * Copyright © 2015 Google Inc.
+ *
+ * This file is based on sha256_ssse3_glue.c:
+ * Copyright (C) 2013 Intel Corporation
+ * Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/simd.h>
+#include <asm/neon.h>
+#include "sha256_glue.h"
+
+asmlinkage void sha256_block_data_order(u32 *digest, const void *data,
+ unsigned int num_blks);
+
+
+int sha256_init(struct shash_desc *desc)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+
+ sctx->state[0] = SHA256_H0;
+ sctx->state[1] = SHA256_H1;
+ sctx->state[2] = SHA256_H2;
+ sctx->state[3] = SHA256_H3;
+ sctx->state[4] = SHA256_H4;
+ sctx->state[5] = SHA256_H5;
+ sctx->state[6] = SHA256_H6;
+ sctx->state[7] = SHA256_H7;
+ sctx->count = 0;
+
+ return 0;
+}
+
+int sha224_init(struct shash_desc *desc)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+
+ sctx->state[0] = SHA224_H0;
+ sctx->state[1] = SHA224_H1;
+ sctx->state[2] = SHA224_H2;
+ sctx->state[3] = SHA224_H3;
+ sctx->state[4] = SHA224_H4;
+ sctx->state[5] = SHA224_H5;
+ sctx->state[6] = SHA224_H6;
+ sctx->state[7] = SHA224_H7;
+ sctx->count = 0;
+
+ return 0;
+}
+
+int __sha256_update(struct shash_desc *desc, const u8 *data, unsigned int len,
+ unsigned int partial)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+ unsigned int done = 0;
+
+ sctx->count += len;
+
+ if (partial) {
+ done = SHA256_BLOCK_SIZE - partial;
+ memcpy(sctx->buf + partial, data, done);
+ sha256_block_data_order(sctx->state, sctx->buf, 1);
+ }
+
+ if (len - done >= SHA256_BLOCK_SIZE) {
+ const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
+
+ sha256_block_data_order(sctx->state, data + done, rounds);
+ done += rounds * SHA256_BLOCK_SIZE;
+ }
+
+ memcpy(sctx->buf, data + done, len - done);
+
+ return 0;
+}
+
+int sha256_update(struct shash_desc *desc, const u8 *data, unsigned int len)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+ unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
+
+ /* Handle the fast case right here */
+ if (partial + len < SHA256_BLOCK_SIZE) {
+ sctx->count += len;
+ memcpy(sctx->buf + partial, data, len);
+
+ return 0;
+ }
+
+ return __sha256_update(desc, data, len, partial);
+}
+
+/* Add padding and return the message digest. */
+static int sha256_final(struct shash_desc *desc, u8 *out)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+ unsigned int i, index, padlen;
+ __be32 *dst = (__be32 *)out;
+ __be64 bits;
+ static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
+
+ /* save number of bits */
+ bits = cpu_to_be64(sctx->count << 3);
+
+ /* Pad out to 56 mod 64 and append length */
+ index = sctx->count % SHA256_BLOCK_SIZE;
+ padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index);
+
+ /* We need to fill a whole block for __sha256_update */
+ if (padlen <= 56) {
+ sctx->count += padlen;
+ memcpy(sctx->buf + index, padding, padlen);
+ } else {
+ __sha256_update(desc, padding, padlen, index);
+ }
+ __sha256_update(desc, (const u8 *)&bits, sizeof(bits), 56);
+
+ /* Store state in digest */
+ for (i = 0; i < 8; i++)
+ dst[i] = cpu_to_be32(sctx->state[i]);
+
+ /* Wipe context */
+ memset(sctx, 0, sizeof(*sctx));
+
+ return 0;
+}
+
+static int sha224_final(struct shash_desc *desc, u8 *out)
+{
+ u8 D[SHA256_DIGEST_SIZE];
+
+ sha256_final(desc, D);
+
+ memcpy(out, D, SHA224_DIGEST_SIZE);
+ memset(D, 0, SHA256_DIGEST_SIZE);
+
+ return 0;
+}
+
+int sha256_export(struct shash_desc *desc, void *out)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(out, sctx, sizeof(*sctx));
+
+ return 0;
+}
+
+int sha256_import(struct shash_desc *desc, const void *in)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(sctx, in, sizeof(*sctx));
+
+ return 0;
+}
+
+static struct shash_alg algs[] = { {
+ .digestsize = SHA256_DIGEST_SIZE,
+ .init = sha256_init,
+ .update = sha256_update,
+ .final = sha256_final,
+ .export = sha256_export,
+ .import = sha256_import,
+ .descsize = sizeof(struct sha256_state),
+ .statesize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha256",
+ .cra_driver_name = "sha256-asm",
+ .cra_priority = 150,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA256_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+}, {
+ .digestsize = SHA224_DIGEST_SIZE,
+ .init = sha224_init,
+ .update = sha256_update,
+ .final = sha224_final,
+ .export = sha256_export,
+ .import = sha256_import,
+ .descsize = sizeof(struct sha256_state),
+ .statesize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha224",
+ .cra_driver_name = "sha224-asm",
+ .cra_priority = 150,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA224_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
+
+static int __init sha256_mod_init(void)
+{
+ int res = crypto_register_shashes(algs, ARRAY_SIZE(algs));
+
+ if (res < 0)
+ return res;
+
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon()) {
+ res = crypto_register_shashes(sha256_neon_algs,
+ ARRAY_SIZE(sha256_neon_algs));
+
+ if (res < 0)
+ crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+ }
+
+ return res;
+}
+
+static void __exit sha256_mod_fini(void)
+{
+ crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon())
+ crypto_unregister_shashes(sha256_neon_algs,
+ ARRAY_SIZE(sha256_neon_algs));
+}
+
+module_init(sha256_mod_init);
+module_exit(sha256_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm (ARM), including NEON");
+
+MODULE_ALIAS_CRYPTO("sha256");
diff --git a/arch/arm/crypto/sha256_glue.h b/arch/arm/crypto/sha256_glue.h
new file mode 100644
index 000000000000..0312f4ffe8cc
--- /dev/null
+++ b/arch/arm/crypto/sha256_glue.h
@@ -0,0 +1,23 @@
+#ifndef _CRYPTO_SHA256_GLUE_H
+#define _CRYPTO_SHA256_GLUE_H
+
+#include <linux/crypto.h>
+#include <crypto/sha.h>
+
+extern struct shash_alg sha256_neon_algs[2];
+
+extern int sha256_init(struct shash_desc *desc);
+
+extern int sha224_init(struct shash_desc *desc);
+
+extern int __sha256_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len, unsigned int partial);
+
+extern int sha256_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len);
+
+extern int sha256_export(struct shash_desc *desc, void *out);
+
+extern int sha256_import(struct shash_desc *desc, const void *in);
+
+#endif /* _CRYPTO_SHA256_GLUE_H */
diff --git a/arch/arm/crypto/sha256_neon_glue.c b/arch/arm/crypto/sha256_neon_glue.c
new file mode 100644
index 000000000000..3ff0a7f1d092
--- /dev/null
+++ b/arch/arm/crypto/sha256_neon_glue.c
@@ -0,0 +1,172 @@
+/*
+ * Glue code for the SHA256 Secure Hash Algorithm assembly implementation
+ * using NEON instructions.
+ *
+ * Copyright © 2015 Google Inc.
+ *
+ * This file is based on sha512_neon_glue.c:
+ * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/simd.h>
+#include <asm/neon.h>
+#include "sha256_glue.h"
+
+asmlinkage void sha256_block_data_order_neon(u32 *digest, const void *data,
+ unsigned int num_blks);
+
+
+static int __sha256_neon_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len, unsigned int partial)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+ unsigned int done = 0;
+
+ sctx->count += len;
+
+ if (partial) {
+ done = SHA256_BLOCK_SIZE - partial;
+ memcpy(sctx->buf + partial, data, done);
+ sha256_block_data_order_neon(sctx->state, sctx->buf, 1);
+ }
+
+ if (len - done >= SHA256_BLOCK_SIZE) {
+ const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
+
+ sha256_block_data_order_neon(sctx->state, data + done, rounds);
+ done += rounds * SHA256_BLOCK_SIZE;
+ }
+
+ memcpy(sctx->buf, data + done, len - done);
+
+ return 0;
+}
+
+static int sha256_neon_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+ unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
+ int res;
+
+ /* Handle the fast case right here */
+ if (partial + len < SHA256_BLOCK_SIZE) {
+ sctx->count += len;
+ memcpy(sctx->buf + partial, data, len);
+
+ return 0;
+ }
+
+ if (!may_use_simd()) {
+ res = __sha256_update(desc, data, len, partial);
+ } else {
+ kernel_neon_begin();
+ res = __sha256_neon_update(desc, data, len, partial);
+ kernel_neon_end();
+ }
+
+ return res;
+}
+
+/* Add padding and return the message digest. */
+static int sha256_neon_final(struct shash_desc *desc, u8 *out)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+ unsigned int i, index, padlen;
+ __be32 *dst = (__be32 *)out;
+ __be64 bits;
+ static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
+
+ /* save number of bits */
+ bits = cpu_to_be64(sctx->count << 3);
+
+ /* Pad out to 56 mod 64 and append length */
+ index = sctx->count % SHA256_BLOCK_SIZE;
+ padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index);
+
+ if (!may_use_simd()) {
+ sha256_update(desc, padding, padlen);
+ sha256_update(desc, (const u8 *)&bits, sizeof(bits));
+ } else {
+ kernel_neon_begin();
+ /* We need to fill a whole block for __sha256_neon_update() */
+ if (padlen <= 56) {
+ sctx->count += padlen;
+ memcpy(sctx->buf + index, padding, padlen);
+ } else {
+ __sha256_neon_update(desc, padding, padlen, index);
+ }
+ __sha256_neon_update(desc, (const u8 *)&bits,
+ sizeof(bits), 56);
+ kernel_neon_end();
+ }
+
+ /* Store state in digest */
+ for (i = 0; i < 8; i++)
+ dst[i] = cpu_to_be32(sctx->state[i]);
+
+ /* Wipe context */
+ memset(sctx, 0, sizeof(*sctx));
+
+ return 0;
+}
+
+static int sha224_neon_final(struct shash_desc *desc, u8 *out)
+{
+ u8 D[SHA256_DIGEST_SIZE];
+
+ sha256_neon_final(desc, D);
+
+ memcpy(out, D, SHA224_DIGEST_SIZE);
+ memset(D, 0, SHA256_DIGEST_SIZE);
+
+ return 0;
+}
+
+struct shash_alg sha256_neon_algs[] = { {
+ .digestsize = SHA256_DIGEST_SIZE,
+ .init = sha256_init,
+ .update = sha256_neon_update,
+ .final = sha256_neon_final,
+ .export = sha256_export,
+ .import = sha256_import,
+ .descsize = sizeof(struct sha256_state),
+ .statesize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha256",
+ .cra_driver_name = "sha256-neon",
+ .cra_priority = 250,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA256_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+}, {
+ .digestsize = SHA224_DIGEST_SIZE,
+ .init = sha224_init,
+ .update = sha256_neon_update,
+ .final = sha224_neon_final,
+ .export = sha256_export,
+ .import = sha256_import,
+ .descsize = sizeof(struct sha256_state),
+ .statesize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha224",
+ .cra_driver_name = "sha224-neon",
+ .cra_priority = 250,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA224_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
diff --git a/arch/arm/crypto/sha512-armv7-neon.S b/arch/arm/crypto/sha512-armv7-neon.S
new file mode 100644
index 000000000000..fe99472e507c
--- /dev/null
+++ b/arch/arm/crypto/sha512-armv7-neon.S
@@ -0,0 +1,455 @@
+/* sha512-armv7-neon.S - ARM/NEON assembly implementation of SHA-512 transform
+ *
+ * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <linux/linkage.h>
+
+
+.syntax unified
+.code 32
+.fpu neon
+
+.text
+
+/* structure of SHA512_CONTEXT */
+#define hd_a 0
+#define hd_b ((hd_a) + 8)
+#define hd_c ((hd_b) + 8)
+#define hd_d ((hd_c) + 8)
+#define hd_e ((hd_d) + 8)
+#define hd_f ((hd_e) + 8)
+#define hd_g ((hd_f) + 8)
+
+/* register macros */
+#define RK %r2
+
+#define RA d0
+#define RB d1
+#define RC d2
+#define RD d3
+#define RE d4
+#define RF d5
+#define RG d6
+#define RH d7
+
+#define RT0 d8
+#define RT1 d9
+#define RT2 d10
+#define RT3 d11
+#define RT4 d12
+#define RT5 d13
+#define RT6 d14
+#define RT7 d15
+
+#define RT01q q4
+#define RT23q q5
+#define RT45q q6
+#define RT67q q7
+
+#define RW0 d16
+#define RW1 d17
+#define RW2 d18
+#define RW3 d19
+#define RW4 d20
+#define RW5 d21
+#define RW6 d22
+#define RW7 d23
+#define RW8 d24
+#define RW9 d25
+#define RW10 d26
+#define RW11 d27
+#define RW12 d28
+#define RW13 d29
+#define RW14 d30
+#define RW15 d31
+
+#define RW01q q8
+#define RW23q q9
+#define RW45q q10
+#define RW67q q11
+#define RW89q q12
+#define RW1011q q13
+#define RW1213q q14
+#define RW1415q q15
+
+/***********************************************************************
+ * ARM assembly implementation of sha512 transform
+ ***********************************************************************/
+#define rounds2_0_63(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw1, rw01q, rw2, \
+ rw23q, rw1415q, rw9, rw10, interleave_op, arg1) \
+ /* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \
+ vshr.u64 RT2, re, #14; \
+ vshl.u64 RT3, re, #64 - 14; \
+ interleave_op(arg1); \
+ vshr.u64 RT4, re, #18; \
+ vshl.u64 RT5, re, #64 - 18; \
+ vld1.64 {RT0}, [RK]!; \
+ veor.64 RT23q, RT23q, RT45q; \
+ vshr.u64 RT4, re, #41; \
+ vshl.u64 RT5, re, #64 - 41; \
+ vadd.u64 RT0, RT0, rw0; \
+ veor.64 RT23q, RT23q, RT45q; \
+ vmov.64 RT7, re; \
+ veor.64 RT1, RT2, RT3; \
+ vbsl.64 RT7, rf, rg; \
+ \
+ vadd.u64 RT1, RT1, rh; \
+ vshr.u64 RT2, ra, #28; \
+ vshl.u64 RT3, ra, #64 - 28; \
+ vadd.u64 RT1, RT1, RT0; \
+ vshr.u64 RT4, ra, #34; \
+ vshl.u64 RT5, ra, #64 - 34; \
+ vadd.u64 RT1, RT1, RT7; \
+ \
+ /* h = Sum0 (a) + Maj (a, b, c); */ \
+ veor.64 RT23q, RT23q, RT45q; \
+ vshr.u64 RT4, ra, #39; \
+ vshl.u64 RT5, ra, #64 - 39; \
+ veor.64 RT0, ra, rb; \
+ veor.64 RT23q, RT23q, RT45q; \
+ vbsl.64 RT0, rc, rb; \
+ vadd.u64 rd, rd, RT1; /* d+=t1; */ \
+ veor.64 rh, RT2, RT3; \
+ \
+ /* t1 = g + Sum1 (d) + Ch (d, e, f) + k[t] + w[t]; */ \
+ vshr.u64 RT2, rd, #14; \
+ vshl.u64 RT3, rd, #64 - 14; \
+ vadd.u64 rh, rh, RT0; \
+ vshr.u64 RT4, rd, #18; \
+ vshl.u64 RT5, rd, #64 - 18; \
+ vadd.u64 rh, rh, RT1; /* h+=t1; */ \
+ vld1.64 {RT0}, [RK]!; \
+ veor.64 RT23q, RT23q, RT45q; \
+ vshr.u64 RT4, rd, #41; \
+ vshl.u64 RT5, rd, #64 - 41; \
+ vadd.u64 RT0, RT0, rw1; \
+ veor.64 RT23q, RT23q, RT45q; \
+ vmov.64 RT7, rd; \
+ veor.64 RT1, RT2, RT3; \
+ vbsl.64 RT7, re, rf; \
+ \
+ vadd.u64 RT1, RT1, rg; \
+ vshr.u64 RT2, rh, #28; \
+ vshl.u64 RT3, rh, #64 - 28; \
+ vadd.u64 RT1, RT1, RT0; \
+ vshr.u64 RT4, rh, #34; \
+ vshl.u64 RT5, rh, #64 - 34; \
+ vadd.u64 RT1, RT1, RT7; \
+ \
+ /* g = Sum0 (h) + Maj (h, a, b); */ \
+ veor.64 RT23q, RT23q, RT45q; \
+ vshr.u64 RT4, rh, #39; \
+ vshl.u64 RT5, rh, #64 - 39; \
+ veor.64 RT0, rh, ra; \
+ veor.64 RT23q, RT23q, RT45q; \
+ vbsl.64 RT0, rb, ra; \
+ vadd.u64 rc, rc, RT1; /* c+=t1; */ \
+ veor.64 rg, RT2, RT3; \
+ \
+ /* w[0] += S1 (w[14]) + w[9] + S0 (w[1]); */ \
+ /* w[1] += S1 (w[15]) + w[10] + S0 (w[2]); */ \
+ \
+ /**** S0(w[1:2]) */ \
+ \
+ /* w[0:1] += w[9:10] */ \
+ /* RT23q = rw1:rw2 */ \
+ vext.u64 RT23q, rw01q, rw23q, #1; \
+ vadd.u64 rw0, rw9; \
+ vadd.u64 rg, rg, RT0; \
+ vadd.u64 rw1, rw10;\
+ vadd.u64 rg, rg, RT1; /* g+=t1; */ \
+ \
+ vshr.u64 RT45q, RT23q, #1; \
+ vshl.u64 RT67q, RT23q, #64 - 1; \
+ vshr.u64 RT01q, RT23q, #8; \
+ veor.u64 RT45q, RT45q, RT67q; \
+ vshl.u64 RT67q, RT23q, #64 - 8; \
+ veor.u64 RT45q, RT45q, RT01q; \
+ vshr.u64 RT01q, RT23q, #7; \
+ veor.u64 RT45q, RT45q, RT67q; \
+ \
+ /**** S1(w[14:15]) */ \
+ vshr.u64 RT23q, rw1415q, #6; \
+ veor.u64 RT01q, RT01q, RT45q; \
+ vshr.u64 RT45q, rw1415q, #19; \
+ vshl.u64 RT67q, rw1415q, #64 - 19; \
+ veor.u64 RT23q, RT23q, RT45q; \
+ vshr.u64 RT45q, rw1415q, #61; \
+ veor.u64 RT23q, RT23q, RT67q; \
+ vshl.u64 RT67q, rw1415q, #64 - 61; \
+ veor.u64 RT23q, RT23q, RT45q; \
+ vadd.u64 rw01q, RT01q; /* w[0:1] += S(w[1:2]) */ \
+ veor.u64 RT01q, RT23q, RT67q;
+#define vadd_RT01q(rw01q) \
+ /* w[0:1] += S(w[14:15]) */ \
+ vadd.u64 rw01q, RT01q;
+
+#define dummy(_) /*_*/
+
+#define rounds2_64_79(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw1, \
+ interleave_op1, arg1, interleave_op2, arg2) \
+ /* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \
+ vshr.u64 RT2, re, #14; \
+ vshl.u64 RT3, re, #64 - 14; \
+ interleave_op1(arg1); \
+ vshr.u64 RT4, re, #18; \
+ vshl.u64 RT5, re, #64 - 18; \
+ interleave_op2(arg2); \
+ vld1.64 {RT0}, [RK]!; \
+ veor.64 RT23q, RT23q, RT45q; \
+ vshr.u64 RT4, re, #41; \
+ vshl.u64 RT5, re, #64 - 41; \
+ vadd.u64 RT0, RT0, rw0; \
+ veor.64 RT23q, RT23q, RT45q; \
+ vmov.64 RT7, re; \
+ veor.64 RT1, RT2, RT3; \
+ vbsl.64 RT7, rf, rg; \
+ \
+ vadd.u64 RT1, RT1, rh; \
+ vshr.u64 RT2, ra, #28; \
+ vshl.u64 RT3, ra, #64 - 28; \
+ vadd.u64 RT1, RT1, RT0; \
+ vshr.u64 RT4, ra, #34; \
+ vshl.u64 RT5, ra, #64 - 34; \
+ vadd.u64 RT1, RT1, RT7; \
+ \
+ /* h = Sum0 (a) + Maj (a, b, c); */ \
+ veor.64 RT23q, RT23q, RT45q; \
+ vshr.u64 RT4, ra, #39; \
+ vshl.u64 RT5, ra, #64 - 39; \
+ veor.64 RT0, ra, rb; \
+ veor.64 RT23q, RT23q, RT45q; \
+ vbsl.64 RT0, rc, rb; \
+ vadd.u64 rd, rd, RT1; /* d+=t1; */ \
+ veor.64 rh, RT2, RT3; \
+ \
+ /* t1 = g + Sum1 (d) + Ch (d, e, f) + k[t] + w[t]; */ \
+ vshr.u64 RT2, rd, #14; \
+ vshl.u64 RT3, rd, #64 - 14; \
+ vadd.u64 rh, rh, RT0; \
+ vshr.u64 RT4, rd, #18; \
+ vshl.u64 RT5, rd, #64 - 18; \
+ vadd.u64 rh, rh, RT1; /* h+=t1; */ \
+ vld1.64 {RT0}, [RK]!; \
+ veor.64 RT23q, RT23q, RT45q; \
+ vshr.u64 RT4, rd, #41; \
+ vshl.u64 RT5, rd, #64 - 41; \
+ vadd.u64 RT0, RT0, rw1; \
+ veor.64 RT23q, RT23q, RT45q; \
+ vmov.64 RT7, rd; \
+ veor.64 RT1, RT2, RT3; \
+ vbsl.64 RT7, re, rf; \
+ \
+ vadd.u64 RT1, RT1, rg; \
+ vshr.u64 RT2, rh, #28; \
+ vshl.u64 RT3, rh, #64 - 28; \
+ vadd.u64 RT1, RT1, RT0; \
+ vshr.u64 RT4, rh, #34; \
+ vshl.u64 RT5, rh, #64 - 34; \
+ vadd.u64 RT1, RT1, RT7; \
+ \
+ /* g = Sum0 (h) + Maj (h, a, b); */ \
+ veor.64 RT23q, RT23q, RT45q; \
+ vshr.u64 RT4, rh, #39; \
+ vshl.u64 RT5, rh, #64 - 39; \
+ veor.64 RT0, rh, ra; \
+ veor.64 RT23q, RT23q, RT45q; \
+ vbsl.64 RT0, rb, ra; \
+ vadd.u64 rc, rc, RT1; /* c+=t1; */ \
+ veor.64 rg, RT2, RT3;
+#define vadd_rg_RT0(rg) \
+ vadd.u64 rg, rg, RT0;
+#define vadd_rg_RT1(rg) \
+ vadd.u64 rg, rg, RT1; /* g+=t1; */
+
+.align 3
+ENTRY(sha512_transform_neon)
+ /* Input:
+ * %r0: SHA512_CONTEXT
+ * %r1: data
+ * %r2: u64 k[] constants
+ * %r3: nblks
+ */
+ push {%lr};
+
+ mov %lr, #0;
+
+ /* Load context to d0-d7 */
+ vld1.64 {RA-RD}, [%r0]!;
+ vld1.64 {RE-RH}, [%r0];
+ sub %r0, #(4*8);
+
+ /* Load input to w[16], d16-d31 */
+ /* NOTE: Assumes that on ARMv7 unaligned accesses are always allowed. */
+ vld1.64 {RW0-RW3}, [%r1]!;
+ vld1.64 {RW4-RW7}, [%r1]!;
+ vld1.64 {RW8-RW11}, [%r1]!;
+ vld1.64 {RW12-RW15}, [%r1]!;
+#ifdef __ARMEL__
+ /* byteswap */
+ vrev64.8 RW01q, RW01q;
+ vrev64.8 RW23q, RW23q;
+ vrev64.8 RW45q, RW45q;
+ vrev64.8 RW67q, RW67q;
+ vrev64.8 RW89q, RW89q;
+ vrev64.8 RW1011q, RW1011q;
+ vrev64.8 RW1213q, RW1213q;
+ vrev64.8 RW1415q, RW1415q;
+#endif
+
+ /* EABI says that d8-d15 must be preserved by callee. */
+ /*vpush {RT0-RT7};*/
+
+.Loop:
+ rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, RW01q, RW2,
+ RW23q, RW1415q, RW9, RW10, dummy, _);
+ b .Lenter_rounds;
+
+.Loop_rounds:
+ rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, RW01q, RW2,
+ RW23q, RW1415q, RW9, RW10, vadd_RT01q, RW1415q);
+.Lenter_rounds:
+ rounds2_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW3, RW23q, RW4,
+ RW45q, RW01q, RW11, RW12, vadd_RT01q, RW01q);
+ rounds2_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5, RW45q, RW6,
+ RW67q, RW23q, RW13, RW14, vadd_RT01q, RW23q);
+ rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7, RW67q, RW8,
+ RW89q, RW45q, RW15, RW0, vadd_RT01q, RW45q);
+ rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9, RW89q, RW10,
+ RW1011q, RW67q, RW1, RW2, vadd_RT01q, RW67q);
+ rounds2_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11, RW1011q, RW12,
+ RW1213q, RW89q, RW3, RW4, vadd_RT01q, RW89q);
+ add %lr, #16;
+ rounds2_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, RW1213q, RW14,
+ RW1415q, RW1011q, RW5, RW6, vadd_RT01q, RW1011q);
+ cmp %lr, #64;
+ rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, RW1415q, RW0,
+ RW01q, RW1213q, RW7, RW8, vadd_RT01q, RW1213q);
+ bne .Loop_rounds;
+
+ subs %r3, #1;
+
+ rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1,
+ vadd_RT01q, RW1415q, dummy, _);
+ rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW3,
+ vadd_rg_RT0, RG, vadd_rg_RT1, RG);
+ beq .Lhandle_tail;
+ vld1.64 {RW0-RW3}, [%r1]!;
+ rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5,
+ vadd_rg_RT0, RE, vadd_rg_RT1, RE);
+ rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7,
+ vadd_rg_RT0, RC, vadd_rg_RT1, RC);
+#ifdef __ARMEL__
+ vrev64.8 RW01q, RW01q;
+ vrev64.8 RW23q, RW23q;
+#endif
+ vld1.64 {RW4-RW7}, [%r1]!;
+ rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9,
+ vadd_rg_RT0, RA, vadd_rg_RT1, RA);
+ rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11,
+ vadd_rg_RT0, RG, vadd_rg_RT1, RG);
+#ifdef __ARMEL__
+ vrev64.8 RW45q, RW45q;
+ vrev64.8 RW67q, RW67q;
+#endif
+ vld1.64 {RW8-RW11}, [%r1]!;
+ rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13,
+ vadd_rg_RT0, RE, vadd_rg_RT1, RE);
+ rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15,
+ vadd_rg_RT0, RC, vadd_rg_RT1, RC);
+#ifdef __ARMEL__
+ vrev64.8 RW89q, RW89q;
+ vrev64.8 RW1011q, RW1011q;
+#endif
+ vld1.64 {RW12-RW15}, [%r1]!;
+ vadd_rg_RT0(RA);
+ vadd_rg_RT1(RA);
+
+ /* Load context */
+ vld1.64 {RT0-RT3}, [%r0]!;
+ vld1.64 {RT4-RT7}, [%r0];
+ sub %r0, #(4*8);
+
+#ifdef __ARMEL__
+ vrev64.8 RW1213q, RW1213q;
+ vrev64.8 RW1415q, RW1415q;
+#endif
+
+ vadd.u64 RA, RT0;
+ vadd.u64 RB, RT1;
+ vadd.u64 RC, RT2;
+ vadd.u64 RD, RT3;
+ vadd.u64 RE, RT4;
+ vadd.u64 RF, RT5;
+ vadd.u64 RG, RT6;
+ vadd.u64 RH, RT7;
+
+ /* Store the first half of context */
+ vst1.64 {RA-RD}, [%r0]!;
+ sub RK, $(8*80);
+ vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */
+ mov %lr, #0;
+ sub %r0, #(4*8);
+
+ b .Loop;
+
+.Lhandle_tail:
+ rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5,
+ vadd_rg_RT0, RE, vadd_rg_RT1, RE);
+ rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7,
+ vadd_rg_RT0, RC, vadd_rg_RT1, RC);
+ rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9,
+ vadd_rg_RT0, RA, vadd_rg_RT1, RA);
+ rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11,
+ vadd_rg_RT0, RG, vadd_rg_RT1, RG);
+ rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13,
+ vadd_rg_RT0, RE, vadd_rg_RT1, RE);
+ rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15,
+ vadd_rg_RT0, RC, vadd_rg_RT1, RC);
+
+ /* Load context to d16-d23 */
+ vld1.64 {RW0-RW3}, [%r0]!;
+ vadd_rg_RT0(RA);
+ vld1.64 {RW4-RW7}, [%r0];
+ vadd_rg_RT1(RA);
+ sub %r0, #(4*8);
+
+ vadd.u64 RA, RW0;
+ vadd.u64 RB, RW1;
+ vadd.u64 RC, RW2;
+ vadd.u64 RD, RW3;
+ vadd.u64 RE, RW4;
+ vadd.u64 RF, RW5;
+ vadd.u64 RG, RW6;
+ vadd.u64 RH, RW7;
+
+ /* Store the first half of context */
+ vst1.64 {RA-RD}, [%r0]!;
+
+ /* Clear used registers */
+ /* d16-d31 */
+ veor.u64 RW01q, RW01q;
+ veor.u64 RW23q, RW23q;
+ veor.u64 RW45q, RW45q;
+ veor.u64 RW67q, RW67q;
+ vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */
+ veor.u64 RW89q, RW89q;
+ veor.u64 RW1011q, RW1011q;
+ veor.u64 RW1213q, RW1213q;
+ veor.u64 RW1415q, RW1415q;
+ /* d8-d15 */
+ /*vpop {RT0-RT7};*/
+ /* d0-d7 (q0-q3) */
+ veor.u64 %q0, %q0;
+ veor.u64 %q1, %q1;
+ veor.u64 %q2, %q2;
+ veor.u64 %q3, %q3;
+
+ pop {%pc};
+ENDPROC(sha512_transform_neon)
diff --git a/arch/arm/crypto/sha512_neon_glue.c b/arch/arm/crypto/sha512_neon_glue.c
new file mode 100644
index 000000000000..0d2758ff5e12
--- /dev/null
+++ b/arch/arm/crypto/sha512_neon_glue.c
@@ -0,0 +1,305 @@
+/*
+ * Glue code for the SHA512 Secure Hash Algorithm assembly implementation
+ * using NEON instructions.
+ *
+ * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is based on sha512_ssse3_glue.c:
+ * Copyright (C) 2013 Intel Corporation
+ * Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/simd.h>
+#include <asm/neon.h>
+
+
+static const u64 sha512_k[] = {
+ 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL,
+ 0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL,
+ 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
+ 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL,
+ 0xd807aa98a3030242ULL, 0x12835b0145706fbeULL,
+ 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
+ 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL,
+ 0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL,
+ 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
+ 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL,
+ 0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL,
+ 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
+ 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL,
+ 0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL,
+ 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
+ 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL,
+ 0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL,
+ 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
+ 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL,
+ 0x81c2c92e47edaee6ULL, 0x92722c851482353bULL,
+ 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
+ 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL,
+ 0xd192e819d6ef5218ULL, 0xd69906245565a910ULL,
+ 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
+ 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL,
+ 0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL,
+ 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
+ 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL,
+ 0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL,
+ 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
+ 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL,
+ 0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL,
+ 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
+ 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL,
+ 0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL,
+ 0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
+ 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL,
+ 0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL,
+ 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
+ 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL
+};
+
+
+asmlinkage void sha512_transform_neon(u64 *digest, const void *data,
+ const u64 k[], unsigned int num_blks);
+
+
+static int sha512_neon_init(struct shash_desc *desc)
+{
+ struct sha512_state *sctx = shash_desc_ctx(desc);
+
+ sctx->state[0] = SHA512_H0;
+ sctx->state[1] = SHA512_H1;
+ sctx->state[2] = SHA512_H2;
+ sctx->state[3] = SHA512_H3;
+ sctx->state[4] = SHA512_H4;
+ sctx->state[5] = SHA512_H5;
+ sctx->state[6] = SHA512_H6;
+ sctx->state[7] = SHA512_H7;
+ sctx->count[0] = sctx->count[1] = 0;
+
+ return 0;
+}
+
+static int __sha512_neon_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len, unsigned int partial)
+{
+ struct sha512_state *sctx = shash_desc_ctx(desc);
+ unsigned int done = 0;
+
+ sctx->count[0] += len;
+ if (sctx->count[0] < len)
+ sctx->count[1]++;
+
+ if (partial) {
+ done = SHA512_BLOCK_SIZE - partial;
+ memcpy(sctx->buf + partial, data, done);
+ sha512_transform_neon(sctx->state, sctx->buf, sha512_k, 1);
+ }
+
+ if (len - done >= SHA512_BLOCK_SIZE) {
+ const unsigned int rounds = (len - done) / SHA512_BLOCK_SIZE;
+
+ sha512_transform_neon(sctx->state, data + done, sha512_k,
+ rounds);
+
+ done += rounds * SHA512_BLOCK_SIZE;
+ }
+
+ memcpy(sctx->buf, data + done, len - done);
+
+ return 0;
+}
+
+static int sha512_neon_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ struct sha512_state *sctx = shash_desc_ctx(desc);
+ unsigned int partial = sctx->count[0] % SHA512_BLOCK_SIZE;
+ int res;
+
+ /* Handle the fast case right here */
+ if (partial + len < SHA512_BLOCK_SIZE) {
+ sctx->count[0] += len;
+ if (sctx->count[0] < len)
+ sctx->count[1]++;
+ memcpy(sctx->buf + partial, data, len);
+
+ return 0;
+ }
+
+ if (!may_use_simd()) {
+ res = crypto_sha512_update(desc, data, len);
+ } else {
+ kernel_neon_begin();
+ res = __sha512_neon_update(desc, data, len, partial);
+ kernel_neon_end();
+ }
+
+ return res;
+}
+
+
+/* Add padding and return the message digest. */
+static int sha512_neon_final(struct shash_desc *desc, u8 *out)
+{
+ struct sha512_state *sctx = shash_desc_ctx(desc);
+ unsigned int i, index, padlen;
+ __be64 *dst = (__be64 *)out;
+ __be64 bits[2];
+ static const u8 padding[SHA512_BLOCK_SIZE] = { 0x80, };
+
+ /* save number of bits */
+ bits[1] = cpu_to_be64(sctx->count[0] << 3);
+ bits[0] = cpu_to_be64(sctx->count[1] << 3 | sctx->count[0] >> 61);
+
+ /* Pad out to 112 mod 128 and append length */
+ index = sctx->count[0] & 0x7f;
+ padlen = (index < 112) ? (112 - index) : ((128+112) - index);
+
+ if (!may_use_simd()) {
+ crypto_sha512_update(desc, padding, padlen);
+ crypto_sha512_update(desc, (const u8 *)&bits, sizeof(bits));
+ } else {
+ kernel_neon_begin();
+ /* We need to fill a whole block for __sha512_neon_update() */
+ if (padlen <= 112) {
+ sctx->count[0] += padlen;
+ if (sctx->count[0] < padlen)
+ sctx->count[1]++;
+ memcpy(sctx->buf + index, padding, padlen);
+ } else {
+ __sha512_neon_update(desc, padding, padlen, index);
+ }
+ __sha512_neon_update(desc, (const u8 *)&bits,
+ sizeof(bits), 112);
+ kernel_neon_end();
+ }
+
+ /* Store state in digest */
+ for (i = 0; i < 8; i++)
+ dst[i] = cpu_to_be64(sctx->state[i]);
+
+ /* Wipe context */
+ memset(sctx, 0, sizeof(*sctx));
+
+ return 0;
+}
+
+static int sha512_neon_export(struct shash_desc *desc, void *out)
+{
+ struct sha512_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(out, sctx, sizeof(*sctx));
+
+ return 0;
+}
+
+static int sha512_neon_import(struct shash_desc *desc, const void *in)
+{
+ struct sha512_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(sctx, in, sizeof(*sctx));
+
+ return 0;
+}
+
+static int sha384_neon_init(struct shash_desc *desc)
+{
+ struct sha512_state *sctx = shash_desc_ctx(desc);
+
+ sctx->state[0] = SHA384_H0;
+ sctx->state[1] = SHA384_H1;
+ sctx->state[2] = SHA384_H2;
+ sctx->state[3] = SHA384_H3;
+ sctx->state[4] = SHA384_H4;
+ sctx->state[5] = SHA384_H5;
+ sctx->state[6] = SHA384_H6;
+ sctx->state[7] = SHA384_H7;
+
+ sctx->count[0] = sctx->count[1] = 0;
+
+ return 0;
+}
+
+static int sha384_neon_final(struct shash_desc *desc, u8 *hash)
+{
+ u8 D[SHA512_DIGEST_SIZE];
+
+ sha512_neon_final(desc, D);
+
+ memcpy(hash, D, SHA384_DIGEST_SIZE);
+ memset(D, 0, SHA512_DIGEST_SIZE);
+
+ return 0;
+}
+
+static struct shash_alg algs[] = { {
+ .digestsize = SHA512_DIGEST_SIZE,
+ .init = sha512_neon_init,
+ .update = sha512_neon_update,
+ .final = sha512_neon_final,
+ .export = sha512_neon_export,
+ .import = sha512_neon_import,
+ .descsize = sizeof(struct sha512_state),
+ .statesize = sizeof(struct sha512_state),
+ .base = {
+ .cra_name = "sha512",
+ .cra_driver_name = "sha512-neon",
+ .cra_priority = 250,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA512_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+}, {
+ .digestsize = SHA384_DIGEST_SIZE,
+ .init = sha384_neon_init,
+ .update = sha512_neon_update,
+ .final = sha384_neon_final,
+ .export = sha512_neon_export,
+ .import = sha512_neon_import,
+ .descsize = sizeof(struct sha512_state),
+ .statesize = sizeof(struct sha512_state),
+ .base = {
+ .cra_name = "sha384",
+ .cra_driver_name = "sha384-neon",
+ .cra_priority = 250,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA384_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
+
+static int __init sha512_neon_mod_init(void)
+{
+ if (!cpu_has_neon())
+ return -ENODEV;
+
+ return crypto_register_shashes(algs, ARRAY_SIZE(algs));
+}
+
+static void __exit sha512_neon_mod_fini(void)
+{
+ crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+}
+
+module_init(sha512_neon_mod_init);
+module_exit(sha512_neon_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, NEON accelerated");
+
+MODULE_ALIAS("sha512");
+MODULE_ALIAS("sha384");
diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild
index d3db39860b9c..6577b8aeb711 100644
--- a/arch/arm/include/asm/Kbuild
+++ b/arch/arm/include/asm/Kbuild
@@ -24,6 +24,7 @@ generic-y += sembuf.h
generic-y += serial.h
generic-y += shmbuf.h
generic-y += siginfo.h
+generic-y += simd.h
generic-y += sizes.h
generic-y += socket.h
generic-y += sockios.h
diff --git a/arch/arm/include/asm/barrier.h b/arch/arm/include/asm/barrier.h
index 60f15e274e6d..2f59f7443396 100644
--- a/arch/arm/include/asm/barrier.h
+++ b/arch/arm/include/asm/barrier.h
@@ -59,6 +59,21 @@
#define smp_wmb() dmb(ishst)
#endif
+#define smp_store_release(p, v) \
+do { \
+ compiletime_assert_atomic_type(*p); \
+ smp_mb(); \
+ ACCESS_ONCE(*p) = (v); \
+} while (0)
+
+#define smp_load_acquire(p) \
+({ \
+ typeof(*p) ___p1 = ACCESS_ONCE(*p); \
+ compiletime_assert_atomic_type(*p); \
+ smp_mb(); \
+ ___p1; \
+})
+
#define read_barrier_depends() do { } while(0)
#define smp_read_barrier_depends() do { } while(0)
diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h
index 2059f019bef4..455e6637c881 100644
--- a/arch/arm/include/asm/cacheflush.h
+++ b/arch/arm/include/asm/cacheflush.h
@@ -16,6 +16,7 @@
#include <asm/shmparam.h>
#include <asm/cachetype.h>
#include <asm/outercache.h>
+#include <asm/rodata.h>
#define CACHE_COLOUR(vaddr) ((vaddr & (SHMLBA - 1)) >> PAGE_SHIFT)
diff --git a/arch/arm/include/asm/crypto/sha1.h b/arch/arm/include/asm/crypto/sha1.h
new file mode 100644
index 000000000000..75e6a417416b
--- /dev/null
+++ b/arch/arm/include/asm/crypto/sha1.h
@@ -0,0 +1,10 @@
+#ifndef ASM_ARM_CRYPTO_SHA1_H
+#define ASM_ARM_CRYPTO_SHA1_H
+
+#include <linux/crypto.h>
+#include <crypto/sha.h>
+
+extern int sha1_update_arm(struct shash_desc *desc, const u8 *data,
+ unsigned int len);
+
+#endif
diff --git a/arch/arm/include/asm/fiq_glue.h b/arch/arm/include/asm/fiq_glue.h
new file mode 100644
index 000000000000..a9e244f9f197
--- /dev/null
+++ b/arch/arm/include/asm/fiq_glue.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2010 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __ASM_FIQ_GLUE_H
+#define __ASM_FIQ_GLUE_H
+
+struct fiq_glue_handler {
+ void (*fiq)(struct fiq_glue_handler *h, void *regs, void *svc_sp);
+ void (*resume)(struct fiq_glue_handler *h);
+};
+typedef void (*fiq_return_handler_t)(void);
+
+int fiq_glue_register_handler(struct fiq_glue_handler *handler);
+int fiq_glue_set_return_handler(fiq_return_handler_t fiq_return);
+int fiq_glue_clear_return_handler(fiq_return_handler_t fiq_return);
+
+#ifdef CONFIG_FIQ_GLUE
+void fiq_glue_resume(void);
+#else
+static inline void fiq_glue_resume(void) {}
+#endif
+
+#endif
diff --git a/arch/arm/include/asm/hardirq.h b/arch/arm/include/asm/hardirq.h
index 3d7351c844aa..fe3ea776dc34 100644
--- a/arch/arm/include/asm/hardirq.h
+++ b/arch/arm/include/asm/hardirq.h
@@ -5,7 +5,7 @@
#include <linux/threads.h>
#include <asm/irq.h>
-#define NR_IPI 7
+#define NR_IPI 8
typedef struct {
unsigned int __softirq_pending;
diff --git a/arch/arm/include/asm/hardware/cache-l2x0.h b/arch/arm/include/asm/hardware/cache-l2x0.h
index 3b2c40b5bfa2..0ca0f5a7c84b 100644
--- a/arch/arm/include/asm/hardware/cache-l2x0.h
+++ b/arch/arm/include/asm/hardware/cache-l2x0.h
@@ -66,6 +66,7 @@
#define L2X0_STNDBY_MODE_EN (1 << 0)
/* Registers shifts and masks */
+#define L2X0_CACHE_ID_REV_MASK (0x3f)
#define L2X0_CACHE_ID_PART_MASK (0xf << 6)
#define L2X0_CACHE_ID_PART_L210 (1 << 6)
#define L2X0_CACHE_ID_PART_L310 (3 << 6)
@@ -106,6 +107,8 @@
#define L2X0_WAY_SIZE_SHIFT 3
+#define REV_PL310_R2P0 4
+
#ifndef __ASSEMBLY__
extern void __init l2x0_init(void __iomem *base, u32 aux_val, u32 aux_mask);
#if defined(CONFIG_CACHE_L2X0) && defined(CONFIG_OF)
diff --git a/arch/arm/include/asm/irq.h b/arch/arm/include/asm/irq.h
index 53c15dec7af6..809203a4b71b 100644
--- a/arch/arm/include/asm/irq.h
+++ b/arch/arm/include/asm/irq.h
@@ -35,6 +35,9 @@ extern void (*handle_arch_irq)(struct pt_regs *);
extern void set_handle_irq(void (*handle_irq)(struct pt_regs *));
#endif
+void arch_trigger_all_cpu_backtrace(void);
+#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
+
#endif
#endif
diff --git a/arch/arm/include/asm/mach/mmc.h b/arch/arm/include/asm/mach/mmc.h
new file mode 100644
index 000000000000..bca864ac945f
--- /dev/null
+++ b/arch/arm/include/asm/mach/mmc.h
@@ -0,0 +1,28 @@
+/*
+ * arch/arm/include/asm/mach/mmc.h
+ */
+#ifndef ASMARM_MACH_MMC_H
+#define ASMARM_MACH_MMC_H
+
+#include <linux/mmc/host.h>
+#include <linux/mmc/card.h>
+#include <linux/mmc/sdio_func.h>
+
+struct embedded_sdio_data {
+ struct sdio_cis cis;
+ struct sdio_cccr cccr;
+ struct sdio_embedded_func *funcs;
+ int num_funcs;
+};
+
+struct mmc_platform_data {
+ unsigned int ocr_mask; /* available voltages */
+ int built_in; /* built-in device flag */
+ int card_present; /* card detect state */
+ u32 (*translate_vdd)(struct device *, unsigned int);
+ unsigned int (*status)(struct device *);
+ struct embedded_sdio_data *embedded_sdio;
+ int (*register_status_notify)(void (*callback)(int card_present, void *dev_id), void *dev_id);
+};
+
+#endif
diff --git a/arch/arm/include/asm/neon.h b/arch/arm/include/asm/neon.h
new file mode 100644
index 000000000000..8f730fe70093
--- /dev/null
+++ b/arch/arm/include/asm/neon.h
@@ -0,0 +1,36 @@
+/*
+ * linux/arch/arm/include/asm/neon.h
+ *
+ * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/hwcap.h>
+
+#define cpu_has_neon() (!!(elf_hwcap & HWCAP_NEON))
+
+#ifdef __ARM_NEON__
+
+/*
+ * If you are affected by the BUILD_BUG below, it probably means that you are
+ * using NEON code /and/ calling the kernel_neon_begin() function from the same
+ * compilation unit. To prevent issues that may arise from GCC reordering or
+ * generating(1) NEON instructions outside of these begin/end functions, the
+ * only supported way of using NEON code in the kernel is by isolating it in a
+ * separate compilation unit, and calling it from another unit from inside a
+ * kernel_neon_begin/kernel_neon_end pair.
+ *
+ * (1) Current GCC (4.7) might generate NEON instructions at O3 level if
+ * -mpfu=neon is set.
+ */
+
+#define kernel_neon_begin() \
+ BUILD_BUG_ON_MSG(1, "kernel_neon_begin() called from NEON code")
+
+#else
+void kernel_neon_begin(void);
+#endif
+void kernel_neon_end(void);
diff --git a/arch/arm/include/asm/rodata.h b/arch/arm/include/asm/rodata.h
new file mode 100644
index 000000000000..8c8add87bbc5
--- /dev/null
+++ b/arch/arm/include/asm/rodata.h
@@ -0,0 +1,32 @@
+/*
+ * arch/arm/include/asm/rodata.h
+ *
+ * Copyright (C) 2011 Google, Inc.
+ *
+ * Author: Colin Cross <ccross@android.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef _ASMARM_RODATA_H
+#define _ASMARM_RODATA_H
+
+#ifndef __ASSEMBLY__
+
+#ifdef CONFIG_DEBUG_RODATA
+
+int set_memory_rw(unsigned long virt, int numpages);
+int set_memory_ro(unsigned long virt, int numpages);
+
+void mark_rodata_ro(void);
+void set_kernel_text_rw(void);
+void set_kernel_text_ro(void);
+#else
+static inline void set_kernel_text_rw(void) { }
+static inline void set_kernel_text_ro(void) { }
+#endif
+
+#endif
+
+#endif
diff --git a/arch/arm/include/asm/smp.h b/arch/arm/include/asm/smp.h
index 610ccf33f5e7..67a18a5ed9fa 100644
--- a/arch/arm/include/asm/smp.h
+++ b/arch/arm/include/asm/smp.h
@@ -82,6 +82,7 @@ extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
extern void arch_send_wakeup_ipi_mask(const struct cpumask *mask);
extern int register_ipi_completion(struct completion *completion, int cpu);
+extern void smp_send_all_cpu_backtrace(void);
struct smp_operations {
#ifdef CONFIG_SMP
diff --git a/arch/arm/include/asm/syscall.h b/arch/arm/include/asm/syscall.h
index 73ddd7239b33..ed805f1d3785 100644
--- a/arch/arm/include/asm/syscall.h
+++ b/arch/arm/include/asm/syscall.h
@@ -103,8 +103,7 @@ static inline void syscall_set_arguments(struct task_struct *task,
memcpy(&regs->ARM_r0 + i, args, n * sizeof(args[0]));
}
-static inline int syscall_get_arch(struct task_struct *task,
- struct pt_regs *regs)
+static inline int syscall_get_arch(void)
{
/* ARM tasks don't change audit architectures on the fly. */
return AUDIT_ARCH_ARM;
diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h
index cbd61977c996..43876245fc57 100644
--- a/arch/arm/include/asm/unistd.h
+++ b/arch/arm/include/asm/unistd.h
@@ -15,7 +15,7 @@
#include <uapi/asm/unistd.h>
-#define __NR_syscalls (380)
+#define __NR_syscalls (384)
#define __ARM_NR_cmpxchg (__ARM_NR_BASE+0x00fff0)
#define __ARCH_WANT_STAT64
diff --git a/arch/arm/include/uapi/asm/unistd.h b/arch/arm/include/uapi/asm/unistd.h
index af33b44990ed..17407c92c0da 100644
--- a/arch/arm/include/uapi/asm/unistd.h
+++ b/arch/arm/include/uapi/asm/unistd.h
@@ -406,6 +406,12 @@
#define __NR_process_vm_writev (__NR_SYSCALL_BASE+377)
#define __NR_kcmp (__NR_SYSCALL_BASE+378)
#define __NR_finit_module (__NR_SYSCALL_BASE+379)
+/* Reserve for later
+#define __NR_sched_setattr (__NR_SYSCALL_BASE+380)
+#define __NR_sched_getattr (__NR_SYSCALL_BASE+381)
+#define __NR_renameat2 (__NR_SYSCALL_BASE+382)
+*/
+#define __NR_seccomp (__NR_SYSCALL_BASE+383)
/*
* This may need to be greater than __NR_last_syscall+1 in order to
diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S
index c6ca7e376773..725f844926ea 100644
--- a/arch/arm/kernel/calls.S
+++ b/arch/arm/kernel/calls.S
@@ -389,6 +389,11 @@
CALL(sys_process_vm_writev)
CALL(sys_kcmp)
CALL(sys_finit_module)
+/* 380 */ CALL(sys_ni_syscall) /* reserved sys_sched_setattr */
+ CALL(sys_ni_syscall) /* reserved sys_sched_getattr */
+ CALL(sys_ni_syscall) /* reserved sys_renameat2 */
+ CALL(sys_seccomp)
+
#ifndef syscalls_counted
.equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls
#define syscalls_counted
diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c
index 34e56647dcee..6a740a93f4bb 100644
--- a/arch/arm/kernel/ftrace.c
+++ b/arch/arm/kernel/ftrace.c
@@ -13,6 +13,7 @@
*/
#include <linux/ftrace.h>
+#include <linux/module.h>
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
@@ -63,6 +64,20 @@ static unsigned long adjust_address(struct dyn_ftrace *rec, unsigned long addr)
}
#endif
+int ftrace_arch_code_modify_prepare(void)
+{
+ set_kernel_text_rw();
+ set_all_modules_text_rw();
+ return 0;
+}
+
+int ftrace_arch_code_modify_post_process(void)
+{
+ set_all_modules_text_ro();
+ set_kernel_text_ro();
+ return 0;
+}
+
static unsigned long ftrace_call_replace(unsigned long pc, unsigned long addr)
{
return arm_gen_branch_link(pc, addr);
diff --git a/arch/arm/kernel/kgdb.c b/arch/arm/kernel/kgdb.c
index 778c2f7024ff..b321c8fbb87d 100644
--- a/arch/arm/kernel/kgdb.c
+++ b/arch/arm/kernel/kgdb.c
@@ -144,6 +144,8 @@ int kgdb_arch_handle_exception(int exception_vector, int signo,
static int kgdb_brk_fn(struct pt_regs *regs, unsigned int instr)
{
+ if (user_mode(regs))
+ return -1;
kgdb_handle_exception(1, SIGTRAP, 0, regs);
return 0;
@@ -151,6 +153,8 @@ static int kgdb_brk_fn(struct pt_regs *regs, unsigned int instr)
static int kgdb_compiled_brk_fn(struct pt_regs *regs, unsigned int instr)
{
+ if (user_mode(regs))
+ return -1;
compiled_break = 1;
kgdb_handle_exception(1, SIGTRAP, 0, regs);
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index ac4c2e5e17e4..978002e5b406 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -32,6 +32,7 @@
#include <linux/hw_breakpoint.h>
#include <linux/cpuidle.h>
#include <linux/leds.h>
+#include <linux/console.h>
#include <asm/cacheflush.h>
#include <asm/idmap.h>
@@ -57,9 +58,46 @@ static const char *isa_modes[] = {
"ARM" , "Thumb" , "Jazelle", "ThumbEE"
};
+#ifdef CONFIG_SMP
+void arch_trigger_all_cpu_backtrace(void)
+{
+ smp_send_all_cpu_backtrace();
+}
+#else
+void arch_trigger_all_cpu_backtrace(void)
+{
+ dump_stack();
+}
+#endif
+
extern void call_with_stack(void (*fn)(void *), void *arg, void *sp);
typedef void (*phys_reset_t)(unsigned long);
+#ifdef CONFIG_ARM_FLUSH_CONSOLE_ON_RESTART
+void arm_machine_flush_console(void)
+{
+ printk("\n");
+ pr_emerg("Restarting %s\n", linux_banner);
+ if (console_trylock()) {
+ console_unlock();
+ return;
+ }
+
+ mdelay(50);
+
+ local_irq_disable();
+ if (!console_trylock())
+ pr_emerg("arm_restart: Console was locked! Busting\n");
+ else
+ pr_emerg("arm_restart: Console was locked!\n");
+ console_unlock();
+}
+#else
+void arm_machine_flush_console(void)
+{
+}
+#endif
+
/*
* A temporary stack to use for CPU reset. This is static so that we
* don't clobber it with the identity mapping. When running with this
@@ -147,6 +185,7 @@ void arch_cpu_idle_prepare(void)
void arch_cpu_idle_enter(void)
{
+ idle_notifier_call_chain(IDLE_START);
ledtrig_cpu(CPU_LED_IDLE_START);
#ifdef CONFIG_PL310_ERRATA_769419
wmb();
@@ -156,6 +195,7 @@ void arch_cpu_idle_enter(void)
void arch_cpu_idle_exit(void)
{
ledtrig_cpu(CPU_LED_IDLE_END);
+ idle_notifier_call_chain(IDLE_END);
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -195,6 +235,16 @@ __setup("reboot=", reboot_setup);
*/
void machine_shutdown(void)
{
+#ifdef CONFIG_SMP
+ /*
+ * Disable preemption so we're guaranteed to
+ * run to power off or reboot and prevent
+ * the possibility of switching to another
+ * thread that might wind up blocking on
+ * one of the stopped CPUs.
+ */
+ preempt_disable();
+#endif
disable_nonboot_cpus();
}
@@ -240,6 +290,10 @@ void machine_restart(char *cmd)
{
smp_send_stop();
+ /* Flush the console to make sure all the relevant messages make it
+ * out to the console drivers */
+ arm_machine_flush_console();
+
arm_pm_restart(reboot_mode, cmd);
/* Give a grace period for failure to restart of 1s */
@@ -251,6 +305,77 @@ void machine_restart(char *cmd)
while (1);
}
+/*
+ * dump a block of kernel memory from around the given address
+ */
+static void show_data(unsigned long addr, int nbytes, const char *name)
+{
+ int i, j;
+ int nlines;
+ u32 *p;
+
+ /*
+ * don't attempt to dump non-kernel addresses or
+ * values that are probably just small negative numbers
+ */
+ if (addr < PAGE_OFFSET || addr > -256UL)
+ return;
+
+ printk("\n%s: %#lx:\n", name, addr);
+
+ /*
+ * round address down to a 32 bit boundary
+ * and always dump a multiple of 32 bytes
+ */
+ p = (u32 *)(addr & ~(sizeof(u32) - 1));
+ nbytes += (addr & (sizeof(u32) - 1));
+ nlines = (nbytes + 31) / 32;
+
+
+ for (i = 0; i < nlines; i++) {
+ /*
+ * just display low 16 bits of address to keep
+ * each line of the dump < 80 characters
+ */
+ printk("%04lx ", (unsigned long)p & 0xffff);
+ for (j = 0; j < 8; j++) {
+ u32 data;
+ if (probe_kernel_address(p, data)) {
+ printk(" ********");
+ } else {
+ printk(" %08x", data);
+ }
+ ++p;
+ }
+ printk("\n");
+ }
+}
+
+static void show_extra_register_data(struct pt_regs *regs, int nbytes)
+{
+ mm_segment_t fs;
+
+ fs = get_fs();
+ set_fs(KERNEL_DS);
+ show_data(regs->ARM_pc - nbytes, nbytes * 2, "PC");
+ show_data(regs->ARM_lr - nbytes, nbytes * 2, "LR");
+ show_data(regs->ARM_sp - nbytes, nbytes * 2, "SP");
+ show_data(regs->ARM_ip - nbytes, nbytes * 2, "IP");
+ show_data(regs->ARM_fp - nbytes, nbytes * 2, "FP");
+ show_data(regs->ARM_r0 - nbytes, nbytes * 2, "R0");
+ show_data(regs->ARM_r1 - nbytes, nbytes * 2, "R1");
+ show_data(regs->ARM_r2 - nbytes, nbytes * 2, "R2");
+ show_data(regs->ARM_r3 - nbytes, nbytes * 2, "R3");
+ show_data(regs->ARM_r4 - nbytes, nbytes * 2, "R4");
+ show_data(regs->ARM_r5 - nbytes, nbytes * 2, "R5");
+ show_data(regs->ARM_r6 - nbytes, nbytes * 2, "R6");
+ show_data(regs->ARM_r7 - nbytes, nbytes * 2, "R7");
+ show_data(regs->ARM_r8 - nbytes, nbytes * 2, "R8");
+ show_data(regs->ARM_r9 - nbytes, nbytes * 2, "R9");
+ show_data(regs->ARM_r10 - nbytes, nbytes * 2, "R10");
+ set_fs(fs);
+}
+
void __show_regs(struct pt_regs *regs)
{
unsigned long flags;
@@ -307,6 +432,8 @@ void __show_regs(struct pt_regs *regs)
printk("Control: %08x%s\n", ctrl, buf);
}
#endif
+
+ show_extra_register_data(regs, 128);
}
void show_regs(struct pt_regs * regs)
diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c
index 03deeffd9f6d..394424b25254 100644
--- a/arch/arm/kernel/ptrace.c
+++ b/arch/arm/kernel/ptrace.c
@@ -916,7 +916,7 @@ enum ptrace_syscall_dir {
PTRACE_SYSCALL_EXIT,
};
-static int tracehook_report_syscall(struct pt_regs *regs,
+static void tracehook_report_syscall(struct pt_regs *regs,
enum ptrace_syscall_dir dir)
{
unsigned long ip;
@@ -934,7 +934,6 @@ static int tracehook_report_syscall(struct pt_regs *regs,
current_thread_info()->syscall = -1;
regs->ARM_ip = ip;
- return current_thread_info()->syscall;
}
asmlinkage int syscall_trace_enter(struct pt_regs *regs, int scno)
@@ -946,7 +945,9 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs, int scno)
return -1;
if (test_thread_flag(TIF_SYSCALL_TRACE))
- scno = tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER);
+ tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER);
+
+ scno = current_thread_info()->syscall;
if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
trace_sys_enter(regs, scno);
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index d6e3d1ca4ddf..f2724e475b96 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -70,6 +70,7 @@ enum ipi_msg_type {
IPI_CALL_FUNC_SINGLE,
IPI_CPU_STOP,
IPI_COMPLETION,
+ IPI_CPU_BACKTRACE,
};
static DECLARE_COMPLETION(cpu_running);
@@ -468,6 +469,7 @@ static const char *ipi_types[NR_IPI] = {
S(IPI_CALL_FUNC_SINGLE, "Single function call interrupts"),
S(IPI_CPU_STOP, "CPU stop interrupts"),
S(IPI_COMPLETION, "completion interrupts"),
+ S(IPI_CPU_BACKTRACE, "CPU backtrace"),
};
void show_ipi_list(struct seq_file *p, int prec)
@@ -606,6 +608,58 @@ static void ipi_complete(unsigned int cpu)
complete(per_cpu(cpu_completion, cpu));
}
+static cpumask_t backtrace_mask;
+static DEFINE_RAW_SPINLOCK(backtrace_lock);
+
+/* "in progress" flag of arch_trigger_all_cpu_backtrace */
+static unsigned long backtrace_flag;
+
+void smp_send_all_cpu_backtrace(void)
+{
+ unsigned int this_cpu = smp_processor_id();
+ int i;
+
+ if (test_and_set_bit(0, &backtrace_flag))
+ /*
+ * If there is already a trigger_all_cpu_backtrace() in progress
+ * (backtrace_flag == 1), don't output double cpu dump infos.
+ */
+ return;
+
+ cpumask_copy(&backtrace_mask, cpu_online_mask);
+ cpu_clear(this_cpu, backtrace_mask);
+
+ pr_info("Backtrace for cpu %d (current):\n", this_cpu);
+ dump_stack();
+
+ pr_info("\nsending IPI to all other CPUs:\n");
+ smp_cross_call(&backtrace_mask, IPI_CPU_BACKTRACE);
+
+ /* Wait for up to 10 seconds for all other CPUs to do the backtrace */
+ for (i = 0; i < 10 * 1000; i++) {
+ if (cpumask_empty(&backtrace_mask))
+ break;
+ mdelay(1);
+ }
+
+ clear_bit(0, &backtrace_flag);
+ smp_mb__after_clear_bit();
+}
+
+/*
+ * ipi_cpu_backtrace - handle IPI from smp_send_all_cpu_backtrace()
+ */
+static void ipi_cpu_backtrace(unsigned int cpu, struct pt_regs *regs)
+{
+ if (cpu_isset(cpu, backtrace_mask)) {
+ raw_spin_lock(&backtrace_lock);
+ pr_warning("IPI backtrace for cpu %d\n", cpu);
+ show_regs(regs);
+ raw_spin_unlock(&backtrace_lock);
+ cpu_clear(cpu, backtrace_mask);
+ }
+}
+
/*
* Main handler for inter-processor interrupts
*/
@@ -663,6 +717,10 @@ void handle_IPI(int ipinr, struct pt_regs *regs)
irq_exit();
break;
+ case IPI_CPU_BACKTRACE:
+ ipi_cpu_backtrace(cpu, regs);
+ break;
+
default:
printk(KERN_CRIT "CPU%u: Unknown IPI message 0x%x\n",
cpu, ipinr);
diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile
index 224a9cc09877..2d861a210ac5 100644
--- a/arch/arm/mm/Makefile
+++ b/arch/arm/mm/Makefile
@@ -7,6 +7,7 @@ obj-y := dma-mapping.o extable.o fault.o init.o \
obj-$(CONFIG_MMU) += fault-armv.o flush.o idmap.o ioremap.o \
mmap.o pgd.o mmu.o
+obj-$(CONFIG_DEBUG_RODATA) += rodata.o
ifneq ($(CONFIG_MMU),y)
obj-y += nommu.o
diff --git a/arch/arm/mm/cache-l2x0.c b/arch/arm/mm/cache-l2x0.c
index c465faca51b0..90a130f98acf 100644
--- a/arch/arm/mm/cache-l2x0.c
+++ b/arch/arm/mm/cache-l2x0.c
@@ -33,6 +33,9 @@ static void __iomem *l2x0_base;
static DEFINE_RAW_SPINLOCK(l2x0_lock);
static u32 l2x0_way_mask; /* Bitmask of active ways */
static u32 l2x0_size;
+static u32 l2x0_cache_id;
+static unsigned int l2x0_sets;
+static unsigned int l2x0_ways;
static unsigned long sync_reg_offset = L2X0_CACHE_SYNC;
/* Aurora don't have the cache ID register available, so we have to
@@ -49,6 +52,13 @@ struct l2x0_of_data {
static bool of_init = false;
+static inline bool is_pl310_rev(int rev)
+{
+ return (l2x0_cache_id &
+ (L2X0_CACHE_ID_PART_MASK | L2X0_CACHE_ID_REV_MASK)) ==
+ (L2X0_CACHE_ID_PART_L310 | rev);
+}
+
static inline void cache_wait_way(void __iomem *reg, unsigned long mask)
{
/* wait for cache operation by line or way to complete */
@@ -137,6 +147,23 @@ static void l2x0_cache_sync(void)
raw_spin_unlock_irqrestore(&l2x0_lock, flags);
}
+#ifdef CONFIG_PL310_ERRATA_727915
+static void l2x0_for_each_set_way(void __iomem *reg)
+{
+ int set;
+ int way;
+ unsigned long flags;
+
+ for (way = 0; way < l2x0_ways; way++) {
+ raw_spin_lock_irqsave(&l2x0_lock, flags);
+ for (set = 0; set < l2x0_sets; set++)
+ writel_relaxed((way << 28) | (set << 5), reg);
+ cache_sync();
+ raw_spin_unlock_irqrestore(&l2x0_lock, flags);
+ }
+}
+#endif
+
static void __l2x0_flush_all(void)
{
debug_writel(0x03);
@@ -150,6 +177,13 @@ static void l2x0_flush_all(void)
{
unsigned long flags;
+#ifdef CONFIG_PL310_ERRATA_727915
+ if (is_pl310_rev(REV_PL310_R2P0)) {
+ l2x0_for_each_set_way(l2x0_base + L2X0_CLEAN_INV_LINE_IDX);
+ return;
+ }
+#endif
+
/* clean all ways */
raw_spin_lock_irqsave(&l2x0_lock, flags);
__l2x0_flush_all();
@@ -160,11 +194,20 @@ static void l2x0_clean_all(void)
{
unsigned long flags;
+#ifdef CONFIG_PL310_ERRATA_727915
+ if (is_pl310_rev(REV_PL310_R2P0)) {
+ l2x0_for_each_set_way(l2x0_base + L2X0_CLEAN_LINE_IDX);
+ return;
+ }
+#endif
+
/* clean all ways */
raw_spin_lock_irqsave(&l2x0_lock, flags);
+ debug_writel(0x03);
writel_relaxed(l2x0_way_mask, l2x0_base + L2X0_CLEAN_WAY);
cache_wait_way(l2x0_base + L2X0_CLEAN_WAY, l2x0_way_mask);
cache_sync();
+ debug_writel(0x00);
raw_spin_unlock_irqrestore(&l2x0_lock, flags);
}
@@ -323,65 +366,64 @@ static void l2x0_unlock(u32 cache_id)
void __init l2x0_init(void __iomem *base, u32 aux_val, u32 aux_mask)
{
u32 aux;
- u32 cache_id;
u32 way_size = 0;
- int ways;
int way_size_shift = L2X0_WAY_SIZE_SHIFT;
const char *type;
l2x0_base = base;
if (cache_id_part_number_from_dt)
- cache_id = cache_id_part_number_from_dt;
+ l2x0_cache_id = cache_id_part_number_from_dt;
else
- cache_id = readl_relaxed(l2x0_base + L2X0_CACHE_ID);
+ l2x0_cache_id = readl_relaxed(l2x0_base + L2X0_CACHE_ID);
aux = readl_relaxed(l2x0_base + L2X0_AUX_CTRL);
aux &= aux_mask;
aux |= aux_val;
/* Determine the number of ways */
- switch (cache_id & L2X0_CACHE_ID_PART_MASK) {
+ switch (l2x0_cache_id & L2X0_CACHE_ID_PART_MASK) {
case L2X0_CACHE_ID_PART_L310:
if (aux & (1 << 16))
- ways = 16;
+ l2x0_ways = 16;
else
- ways = 8;
+ l2x0_ways = 8;
type = "L310";
#ifdef CONFIG_PL310_ERRATA_753970
/* Unmapped register. */
sync_reg_offset = L2X0_DUMMY_REG;
#endif
- if ((cache_id & L2X0_CACHE_ID_RTL_MASK) <= L2X0_CACHE_ID_RTL_R3P0)
+ if ((l2x0_cache_id & L2X0_CACHE_ID_RTL_MASK) <= L2X0_CACHE_ID_RTL_R3P0)
outer_cache.set_debug = pl310_set_debug;
break;
case L2X0_CACHE_ID_PART_L210:
- ways = (aux >> 13) & 0xf;
+ l2x0_ways = (aux >> 13) & 0xf;
type = "L210";
break;
case AURORA_CACHE_ID:
sync_reg_offset = AURORA_SYNC_REG;
- ways = (aux >> 13) & 0xf;
- ways = 2 << ((ways + 1) >> 2);
+ l2x0_ways = (aux >> 13) & 0xf;
+ l2x0_ways = 2 << ((l2x0_ways + 1) >> 2);
way_size_shift = AURORA_WAY_SIZE_SHIFT;
type = "Aurora";
break;
default:
/* Assume unknown chips have 8 ways */
- ways = 8;
+ l2x0_ways = 8;
type = "L2x0 series";
break;
}
- l2x0_way_mask = (1 << ways) - 1;
+ l2x0_way_mask = (1 << l2x0_ways) - 1;
/*
* L2 cache Size = Way size * Number of ways
*/
way_size = (aux & L2X0_AUX_CTRL_WAY_SIZE_MASK) >> 17;
- way_size = 1 << (way_size + way_size_shift);
+ way_size = SZ_1K << (way_size + way_size_shift);
- l2x0_size = ways * way_size * SZ_1K;
+ l2x0_size = l2x0_ways * way_size;
+ l2x0_sets = way_size / CACHE_LINE_SIZE;
/*
* Check if l2x0 controller is already enabled.
@@ -390,7 +432,7 @@ void __init l2x0_init(void __iomem *base, u32 aux_val, u32 aux_mask)
*/
if (!(readl_relaxed(l2x0_base + L2X0_CTRL) & L2X0_CTRL_EN)) {
/* Make sure that I&D is not locked down when starting */
- l2x0_unlock(cache_id);
+ l2x0_unlock(l2x0_cache_id);
/* l2x0 controller is disabled */
writel_relaxed(aux, l2x0_base + L2X0_AUX_CTRL);
@@ -419,7 +461,7 @@ void __init l2x0_init(void __iomem *base, u32 aux_val, u32 aux_mask)
printk(KERN_INFO "%s cache controller enabled\n", type);
printk(KERN_INFO "l2x0: %d ways, CACHE_ID 0x%08x, AUX_CTRL 0x%08x, Cache size: %d B\n",
- ways, cache_id, aux, l2x0_size);
+ l2x0_ways, l2x0_cache_id, aux, l2x0_size);
}
#ifdef CONFIG_OF
diff --git a/arch/arm/mm/cache-v6.S b/arch/arm/mm/cache-v6.S
index d8fd4d4bd3d4..7a3d3d8d98d7 100644
--- a/arch/arm/mm/cache-v6.S
+++ b/arch/arm/mm/cache-v6.S
@@ -270,6 +270,11 @@ v6_dma_clean_range:
* - end - virtual end address of region
*/
ENTRY(v6_dma_flush_range)
+#ifdef CONFIG_CACHE_FLUSH_RANGE_LIMIT
+ sub r2, r1, r0
+ cmp r2, #CONFIG_CACHE_FLUSH_RANGE_LIMIT
+ bhi v6_dma_flush_dcache_all
+#endif
#ifdef CONFIG_DMA_CACHE_RWFO
ldrb r2, [r0] @ read for ownership
strb r2, [r0] @ write for ownership
@@ -292,6 +297,18 @@ ENTRY(v6_dma_flush_range)
mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
mov pc, lr
+#ifdef CONFIG_CACHE_FLUSH_RANGE_LIMIT
+v6_dma_flush_dcache_all:
+ mov r0, #0
+#ifdef HARVARD_CACHE
+ mcr p15, 0, r0, c7, c14, 0 @ D cache clean+invalidate
+#else
+ mcr p15, 0, r0, c7, c15, 0 @ Cache clean+invalidate
+#endif
+ mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
+ mov pc, lr
+#endif
+
/*
* dma_map_area(start, size, dir)
* - start - kernel virtual start address
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index 54fcddafec15..9820ad4b80c0 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -274,10 +274,10 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
local_irq_enable();
/*
- * If we're in an interrupt or have no user
+ * If we're in an interrupt, or have no irqs, or have no user
* context, we must not take the fault..
*/
- if (in_atomic() || !mm)
+ if (in_atomic() || irqs_disabled() || !mm)
goto no_context;
if (user_mode(regs))
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index 4c7d5cddef35..a4e5cd5c3067 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -605,11 +605,25 @@ static void __init *early_alloc(unsigned long sz)
return early_alloc_aligned(sz, sz);
}
-static pte_t * __init early_pte_alloc(pmd_t *pmd, unsigned long addr, unsigned long prot)
+static pte_t * __init early_pte_alloc(pmd_t *pmd)
+{
+ if (pmd_none(*pmd) || pmd_bad(*pmd))
+ return early_alloc(PTE_HWTABLE_OFF + PTE_HWTABLE_SIZE);
+ return pmd_page_vaddr(*pmd);
+}
+
+static void __init early_pte_install(pmd_t *pmd, pte_t *pte, unsigned long prot)
+{
+ __pmd_populate(pmd, __pa(pte), prot);
+ BUG_ON(pmd_bad(*pmd));
+}
+
+static pte_t * __init early_pte_alloc_and_install(pmd_t *pmd,
+ unsigned long addr, unsigned long prot)
{
if (pmd_none(*pmd)) {
- pte_t *pte = early_alloc(PTE_HWTABLE_OFF + PTE_HWTABLE_SIZE);
- __pmd_populate(pmd, __pa(pte), prot);
+ pte_t *pte = early_pte_alloc(pmd);
+ early_pte_install(pmd, pte, prot);
}
BUG_ON(pmd_bad(*pmd));
return pte_offset_kernel(pmd, addr);
@@ -619,11 +633,17 @@ static void __init alloc_init_pte(pmd_t *pmd, unsigned long addr,
unsigned long end, unsigned long pfn,
const struct mem_type *type)
{
- pte_t *pte = early_pte_alloc(pmd, addr, type->prot_l1);
+ pte_t *start_pte = early_pte_alloc(pmd);
+ pte_t *pte = start_pte + pte_index(addr);
+
+ /* If replacing a section mapping, the whole section must be replaced */
+ BUG_ON(!pmd_none(*pmd) && pmd_bad(*pmd) && ((addr | end) & ~PMD_MASK));
+
do {
set_pte_ext(pte, pfn_pte(pfn, __pgprot(type->prot_pte)), 0);
pfn++;
} while (pte++, addr += PAGE_SIZE, addr != end);
+ early_pte_install(pmd, start_pte, type->prot_l1);
}
static void __init __map_init_section(pmd_t *pmd, unsigned long addr,
@@ -655,7 +675,8 @@ static void __init __map_init_section(pmd_t *pmd, unsigned long addr,
static void __init alloc_init_pmd(pud_t *pud, unsigned long addr,
unsigned long end, phys_addr_t phys,
- const struct mem_type *type)
+ const struct mem_type *type,
+ bool force_pages)
{
pmd_t *pmd = pmd_offset(pud, addr);
unsigned long next;
@@ -672,7 +693,8 @@ static void __init alloc_init_pmd(pud_t *pud, unsigned long addr,
* aligned to a section boundary.
*/
if (type->prot_sect &&
- ((addr | next | phys) & ~SECTION_MASK) == 0) {
+ ((addr | next | phys) & ~SECTION_MASK) == 0 &&
+ !force_pages) {
__map_init_section(pmd, addr, next, phys, type);
} else {
alloc_init_pte(pmd, addr, next,
@@ -686,14 +708,15 @@ static void __init alloc_init_pmd(pud_t *pud, unsigned long addr,
static void __init alloc_init_pud(pgd_t *pgd, unsigned long addr,
unsigned long end, phys_addr_t phys,
- const struct mem_type *type)
+ const struct mem_type *type,
+ bool force_pages)
{
pud_t *pud = pud_offset(pgd, addr);
unsigned long next;
do {
next = pud_addr_end(addr, end);
- alloc_init_pmd(pud, addr, next, phys, type);
+ alloc_init_pmd(pud, addr, next, phys, type, force_pages);
phys += next - addr;
} while (pud++, addr = next, addr != end);
}
@@ -767,7 +790,7 @@ static void __init create_36bit_mapping(struct map_desc *md,
* offsets, and we take full advantage of sections and
* supersections.
*/
-static void __init create_mapping(struct map_desc *md)
+static void __init create_mapping(struct map_desc *md, bool force_pages)
{
unsigned long addr, length, end;
phys_addr_t phys;
@@ -817,7 +840,7 @@ static void __init create_mapping(struct map_desc *md)
do {
unsigned long next = pgd_addr_end(addr, end);
- alloc_init_pud(pgd, addr, next, phys, type);
+ alloc_init_pud(pgd, addr, next, phys, type, force_pages);
phys += next - addr;
addr = next;
@@ -839,7 +862,7 @@ void __init iotable_init(struct map_desc *io_desc, int nr)
svm = early_alloc_aligned(sizeof(*svm) * nr, __alignof__(*svm));
for (md = io_desc; nr; md++, nr--) {
- create_mapping(md);
+ create_mapping(md, false);
vm = &svm->vm;
vm->addr = (void *)(md->virtual & PAGE_MASK);
@@ -960,7 +983,7 @@ void __init debug_ll_io_init(void)
map.virtual &= PAGE_MASK;
map.length = PAGE_SIZE;
map.type = MT_DEVICE;
- create_mapping(&map);
+ create_mapping(&map, false);
}
#endif
@@ -1005,6 +1028,28 @@ void __init sanity_check_meminfo(void)
struct membank *bank = &meminfo.bank[j];
*bank = meminfo.bank[i];
+#ifdef CONFIG_SPARSEMEM
+ if (pfn_to_section_nr(bank_pfn_start(bank)) !=
+ pfn_to_section_nr(bank_pfn_end(bank) - 1)) {
+ phys_addr_t sz;
+ unsigned long start_pfn = bank_pfn_start(bank);
+ unsigned long end_pfn = SECTION_ALIGN_UP(start_pfn + 1);
+ sz = ((phys_addr_t)(end_pfn - start_pfn) << PAGE_SHIFT);
+
+ if (meminfo.nr_banks >= NR_BANKS) {
+ pr_crit("NR_BANKS too low, ignoring %lld bytes of memory\n",
+ (unsigned long long)(bank->size - sz));
+ } else {
+ memmove(bank + 1, bank,
+ (meminfo.nr_banks - i) * sizeof(*bank));
+ meminfo.nr_banks++;
+ bank[1].size -= sz;
+ bank[1].start = __pfn_to_phys(end_pfn);
+ }
+ bank->size = sz;
+ }
+#endif
+
if (bank->start > ULONG_MAX)
highmem = 1;
@@ -1202,7 +1247,7 @@ static void __init devicemaps_init(struct machine_desc *mdesc)
map.virtual = MODULES_VADDR;
map.length = ((unsigned long)_etext - map.virtual + ~SECTION_MASK) & SECTION_MASK;
map.type = MT_ROM;
- create_mapping(&map);
+ create_mapping(&map, false);
#endif
/*
@@ -1213,14 +1258,14 @@ static void __init devicemaps_init(struct machine_desc *mdesc)
map.virtual = FLUSH_BASE;
map.length = SZ_1M;
map.type = MT_CACHECLEAN;
- create_mapping(&map);
+ create_mapping(&map, false);
#endif
#ifdef FLUSH_BASE_MINICACHE
map.pfn = __phys_to_pfn(FLUSH_BASE_PHYS + SZ_1M);
map.virtual = FLUSH_BASE_MINICACHE;
map.length = SZ_1M;
map.type = MT_MINICLEAN;
- create_mapping(&map);
+ create_mapping(&map, false);
#endif
/*
@@ -1236,13 +1281,13 @@ static void __init devicemaps_init(struct machine_desc *mdesc)
#else
map.type = MT_LOW_VECTORS;
#endif
- create_mapping(&map);
+ create_mapping(&map, false);
if (!vectors_high()) {
map.virtual = 0;
map.length = PAGE_SIZE * 2;
map.type = MT_LOW_VECTORS;
- create_mapping(&map);
+ create_mapping(&map, false);
}
/* Now create a kernel read-only mapping */
@@ -1250,7 +1295,7 @@ static void __init devicemaps_init(struct machine_desc *mdesc)
map.virtual = 0xffff0000 + PAGE_SIZE;
map.length = PAGE_SIZE;
map.type = MT_LOW_VECTORS;
- create_mapping(&map);
+ create_mapping(&map, false);
/*
* Ask the machine support to map in the statically mapped devices.
@@ -1275,20 +1320,23 @@ static void __init devicemaps_init(struct machine_desc *mdesc)
static void __init kmap_init(void)
{
#ifdef CONFIG_HIGHMEM
- pkmap_page_table = early_pte_alloc(pmd_off_k(PKMAP_BASE),
+ pkmap_page_table = early_pte_alloc_and_install(pmd_off_k(PKMAP_BASE),
PKMAP_BASE, _PAGE_KERNEL_TABLE);
#endif
}
+
static void __init map_lowmem(void)
{
struct memblock_region *reg;
+ phys_addr_t start;
+ phys_addr_t end;
+ struct map_desc map;
/* Map all the lowmem memory banks. */
for_each_memblock(memory, reg) {
- phys_addr_t start = reg->base;
- phys_addr_t end = start + reg->size;
- struct map_desc map;
+ start = reg->base;
+ end = start + reg->size;
if (end > arm_lowmem_limit)
end = arm_lowmem_limit;
@@ -1300,8 +1348,20 @@ static void __init map_lowmem(void)
map.length = end - start;
map.type = MT_MEMORY;
- create_mapping(&map);
+ create_mapping(&map, false);
}
+
+#ifdef CONFIG_DEBUG_RODATA
+ start = __pa(_stext) & PMD_MASK;
+ end = ALIGN(__pa(__end_rodata), PMD_SIZE);
+
+ map.pfn = __phys_to_pfn(start);
+ map.virtual = __phys_to_virt(start);
+ map.length = end - start;
+ map.type = MT_MEMORY;
+
+ create_mapping(&map, true);
+#endif
}
/*
diff --git a/arch/arm/mm/rodata.c b/arch/arm/mm/rodata.c
new file mode 100644
index 000000000000..9a8eb841c428
--- /dev/null
+++ b/arch/arm/mm/rodata.c
@@ -0,0 +1,159 @@
+/*
+ * linux/arch/arm/mm/rodata.c
+ *
+ * Copyright (C) 2011 Google, Inc.
+ *
+ * Author: Colin Cross <ccross@android.com>
+ *
+ * Based on x86 implementation in arch/x86/mm/init_32.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+
+#include <asm/cache.h>
+#include <asm/pgtable.h>
+#include <asm/rodata.h>
+#include <asm/sections.h>
+#include <asm/tlbflush.h>
+
+#include "mm.h"
+
+static int kernel_set_to_readonly __read_mostly;
+
+#ifdef CONFIG_DEBUG_RODATA_TEST
+static const int rodata_test_data = 0xC3;
+
+static noinline void rodata_test(void)
+{
+ int result;
+
+ pr_info("%s: attempting to write to read-only section:\n", __func__);
+
+ if (*(volatile int *)&rodata_test_data != 0xC3) {
+ pr_err("read only data changed before test\n");
+ return;
+ }
+
+ /*
+ * Attempt to to write to rodata_test_data, trapping the expected
+ * data abort. If the trap executed, result will be 1. If it didn't,
+ * result will be 0xFF.
+ */
+ asm volatile(
+ "0: str %[zero], [%[rodata_test_data]]\n"
+ " mov %[result], #0xFF\n"
+ " b 2f\n"
+ "1: mov %[result], #1\n"
+ "2:\n"
+
+ /* Exception fixup - if store at label 0 faults, jumps to 1 */
+ ".pushsection __ex_table, \"a\"\n"
+ " .long 0b, 1b\n"
+ ".popsection\n"
+
+ : [result] "=r" (result)
+ : [rodata_test_data] "r" (&rodata_test_data), [zero] "r" (0)
+ : "memory"
+ );
+
+ if (result == 1)
+ pr_info("write to read-only section trapped, success\n");
+ else
+ pr_err("write to read-only section NOT trapped, test failed\n");
+
+ if (*(volatile int *)&rodata_test_data != 0xC3)
+ pr_err("read only data changed during write\n");
+}
+#else
+static inline void rodata_test(void) { }
+#endif
+
+static int set_page_attributes(unsigned long virt, int numpages,
+ pte_t (*f)(pte_t))
+{
+ pmd_t *pmd;
+ pte_t *pte;
+ unsigned long start = virt;
+ unsigned long end = virt + (numpages << PAGE_SHIFT);
+ unsigned long pmd_end;
+
+ while (virt < end) {
+ pmd = pmd_off_k(virt);
+ pmd_end = min(ALIGN(virt + 1, PMD_SIZE), end);
+
+ if ((pmd_val(*pmd) & PMD_TYPE_MASK) != PMD_TYPE_TABLE) {
+ pr_err("%s: pmd %p=%08lx for %08lx not page table\n",
+ __func__, pmd, pmd_val(*pmd), virt);
+ virt = pmd_end;
+ continue;
+ }
+
+ while (virt < pmd_end) {
+ pte = pte_offset_kernel(pmd, virt);
+ set_pte_ext(pte, f(*pte), 0);
+ virt += PAGE_SIZE;
+ }
+ }
+
+ flush_tlb_kernel_range(start, end);
+
+ return 0;
+}
+
+int set_memory_ro(unsigned long virt, int numpages)
+{
+ return set_page_attributes(virt, numpages, pte_wrprotect);
+}
+EXPORT_SYMBOL(set_memory_ro);
+
+int set_memory_rw(unsigned long virt, int numpages)
+{
+ return set_page_attributes(virt, numpages, pte_mkwrite);
+}
+EXPORT_SYMBOL(set_memory_rw);
+
+void set_kernel_text_rw(void)
+{
+ unsigned long start = PAGE_ALIGN((unsigned long)_text);
+ unsigned long size = PAGE_ALIGN((unsigned long)__end_rodata) - start;
+
+ if (!kernel_set_to_readonly)
+ return;
+
+ pr_debug("Set kernel text: %lx - %lx to read-write\n",
+ start, start + size);
+
+ set_memory_rw(start, size >> PAGE_SHIFT);
+}
+
+void set_kernel_text_ro(void)
+{
+ unsigned long start = PAGE_ALIGN((unsigned long)_text);
+ unsigned long size = PAGE_ALIGN((unsigned long)__end_rodata) - start;
+
+ if (!kernel_set_to_readonly)
+ return;
+
+ pr_info_once("Write protecting the kernel text section %lx - %lx\n",
+ start, start + size);
+
+ pr_debug("Set kernel text: %lx - %lx to read only\n",
+ start, start + size);
+
+ set_memory_ro(start, size >> PAGE_SHIFT);
+}
+
+void mark_rodata_ro(void)
+{
+ kernel_set_to_readonly = 1;
+
+ set_kernel_text_ro();
+
+ rodata_test();
+}
diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c
index 5dfbb0b8e7f4..452fb3ad68aa 100644
--- a/arch/arm/vfp/vfpmodule.c
+++ b/arch/arm/vfp/vfpmodule.c
@@ -20,6 +20,7 @@
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/user.h>
+#include <linux/export.h>
#include <asm/cp15.h>
#include <asm/cputype.h>
@@ -648,6 +649,52 @@ static int vfp_hotplug(struct notifier_block *b, unsigned long action,
return NOTIFY_OK;
}
+#ifdef CONFIG_KERNEL_MODE_NEON
+
+/*
+ * Kernel-side NEON support functions
+ */
+void kernel_neon_begin(void)
+{
+ struct thread_info *thread = current_thread_info();
+ unsigned int cpu;
+ u32 fpexc;
+
+ /*
+ * Kernel mode NEON is only allowed outside of interrupt context
+ * with preemption disabled. This will make sure that the kernel
+ * mode NEON register contents never need to be preserved.
+ */
+ BUG_ON(in_interrupt());
+ cpu = get_cpu();
+
+ fpexc = fmrx(FPEXC) | FPEXC_EN;
+ fmxr(FPEXC, fpexc);
+
+ /*
+ * Save the userland NEON/VFP state. Under UP,
+ * the owner could be a task other than 'current'
+ */
+ if (vfp_state_in_hw(cpu, thread))
+ vfp_save_state(&thread->vfpstate, fpexc);
+#ifndef CONFIG_SMP
+ else if (vfp_current_hw_state[cpu] != NULL)
+ vfp_save_state(vfp_current_hw_state[cpu], fpexc);
+#endif
+ vfp_current_hw_state[cpu] = NULL;
+}
+EXPORT_SYMBOL(kernel_neon_begin);
+
+void kernel_neon_end(void)
+{
+ /* Disable the NEON/VFP unit. */
+ fmxr(FPEXC, fmrx(FPEXC) & ~FPEXC_EN);
+ put_cpu();
+}
+EXPORT_SYMBOL(kernel_neon_end);
+
+#endif /* CONFIG_KERNEL_MODE_NEON */
+
/*
* VFP support code initialisation.
*/
@@ -731,4 +778,4 @@ static int __init vfp_init(void)
return 0;
}
-late_initcall(vfp_init);
+core_initcall(vfp_init);