From 45fe93dff2fb58b22de04c729f8447ba0f773d93 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 24 Jul 2017 11:28:04 +0100 Subject: crypto: algapi - make crypto_xor() take separate dst and src arguments There are quite a number of occurrences in the kernel of the pattern if (dst != src) memcpy(dst, src, walk.total % AES_BLOCK_SIZE); crypto_xor(dst, final, walk.total % AES_BLOCK_SIZE); or crypto_xor(keystream, src, nbytes); memcpy(dst, keystream, nbytes); where crypto_xor() is preceded or followed by a memcpy() invocation that is only there because crypto_xor() uses its output parameter as one of the inputs. To avoid having to add new instances of this pattern in the arm64 code, which will be refactored to implement non-SIMD fallbacks, add an alternative implementation called crypto_xor_cpy(), taking separate input and output arguments. This removes the need for the separate memcpy(). Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm/crypto/aes-ce-glue.c | 4 +--- arch/arm/crypto/aes-neonbs-glue.c | 5 ++--- arch/arm64/crypto/aes-glue.c | 4 +--- arch/arm64/crypto/aes-neonbs-glue.c | 5 ++--- arch/sparc/crypto/aes_glue.c | 3 +-- arch/x86/crypto/aesni-intel_glue.c | 4 ++-- arch/x86/crypto/blowfish_glue.c | 3 +-- arch/x86/crypto/cast5_avx_glue.c | 3 +-- arch/x86/crypto/des3_ede_glue.c | 3 +-- 9 files changed, 12 insertions(+), 22 deletions(-) (limited to 'arch') diff --git a/arch/arm/crypto/aes-ce-glue.c b/arch/arm/crypto/aes-ce-glue.c index 0f966a8ca1ce..d0a9cec73707 100644 --- a/arch/arm/crypto/aes-ce-glue.c +++ b/arch/arm/crypto/aes-ce-glue.c @@ -285,9 +285,7 @@ static int ctr_encrypt(struct skcipher_request *req) ce_aes_ctr_encrypt(tail, NULL, (u8 *)ctx->key_enc, num_rounds(ctx), blocks, walk.iv); - if (tdst != tsrc) - memcpy(tdst, tsrc, nbytes); - crypto_xor(tdst, tail, nbytes); + crypto_xor_cpy(tdst, tsrc, tail, nbytes); err = skcipher_walk_done(&walk, 0); } kernel_neon_end(); diff --git a/arch/arm/crypto/aes-neonbs-glue.c b/arch/arm/crypto/aes-neonbs-glue.c index c76377961444..18768f330449 100644 --- a/arch/arm/crypto/aes-neonbs-glue.c +++ b/arch/arm/crypto/aes-neonbs-glue.c @@ -221,9 +221,8 @@ static int ctr_encrypt(struct skcipher_request *req) u8 *dst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE; u8 *src = walk.src.virt.addr + blocks * AES_BLOCK_SIZE; - if (dst != src) - memcpy(dst, src, walk.total % AES_BLOCK_SIZE); - crypto_xor(dst, final, walk.total % AES_BLOCK_SIZE); + crypto_xor_cpy(dst, src, final, + walk.total % AES_BLOCK_SIZE); err = skcipher_walk_done(&walk, 0); break; diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index bcf596b0197e..0da30e3b0e4b 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -241,9 +241,7 @@ static int ctr_encrypt(struct skcipher_request *req) aes_ctr_encrypt(tail, NULL, (u8 *)ctx->key_enc, rounds, blocks, walk.iv, first); - if (tdst != tsrc) - memcpy(tdst, tsrc, nbytes); - crypto_xor(tdst, tail, nbytes); + crypto_xor_cpy(tdst, tsrc, tail, nbytes); err = skcipher_walk_done(&walk, 0); } kernel_neon_end(); diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c index db2501d93550..9001aec16007 100644 --- a/arch/arm64/crypto/aes-neonbs-glue.c +++ b/arch/arm64/crypto/aes-neonbs-glue.c @@ -224,9 +224,8 @@ static int ctr_encrypt(struct skcipher_request *req) u8 *dst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE; u8 *src = walk.src.virt.addr + blocks * AES_BLOCK_SIZE; - if (dst != src) - memcpy(dst, src, walk.total % AES_BLOCK_SIZE); - crypto_xor(dst, final, walk.total % AES_BLOCK_SIZE); + crypto_xor_cpy(dst, src, final, + walk.total % AES_BLOCK_SIZE); err = skcipher_walk_done(&walk, 0); break; diff --git a/arch/sparc/crypto/aes_glue.c b/arch/sparc/crypto/aes_glue.c index c90930de76ba..3cd4f6b198b6 100644 --- a/arch/sparc/crypto/aes_glue.c +++ b/arch/sparc/crypto/aes_glue.c @@ -344,8 +344,7 @@ static void ctr_crypt_final(struct crypto_sparc64_aes_ctx *ctx, ctx->ops->ecb_encrypt(&ctx->key[0], (const u64 *)ctrblk, keystream, AES_BLOCK_SIZE); - crypto_xor((u8 *) keystream, src, nbytes); - memcpy(dst, keystream, nbytes); + crypto_xor_cpy(dst, (u8 *) keystream, src, nbytes); crypto_inc(ctrblk, AES_BLOCK_SIZE); } diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 4a55cdcdc008..5c15d6b57329 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -475,8 +475,8 @@ static void ctr_crypt_final(struct crypto_aes_ctx *ctx, unsigned int nbytes = walk->nbytes; aesni_enc(ctx, keystream, ctrblk); - crypto_xor(keystream, src, nbytes); - memcpy(dst, keystream, nbytes); + crypto_xor_cpy(dst, keystream, src, nbytes); + crypto_inc(ctrblk, AES_BLOCK_SIZE); } diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c index 17c05531dfd1..f9eca34301e2 100644 --- a/arch/x86/crypto/blowfish_glue.c +++ b/arch/x86/crypto/blowfish_glue.c @@ -271,8 +271,7 @@ static void ctr_crypt_final(struct bf_ctx *ctx, struct blkcipher_walk *walk) unsigned int nbytes = walk->nbytes; blowfish_enc_blk(ctx, keystream, ctrblk); - crypto_xor(keystream, src, nbytes); - memcpy(dst, keystream, nbytes); + crypto_xor_cpy(dst, keystream, src, nbytes); crypto_inc(ctrblk, BF_BLOCK_SIZE); } diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c index 8648158f3916..dbea6020ffe7 100644 --- a/arch/x86/crypto/cast5_avx_glue.c +++ b/arch/x86/crypto/cast5_avx_glue.c @@ -256,8 +256,7 @@ static void ctr_crypt_final(struct blkcipher_desc *desc, unsigned int nbytes = walk->nbytes; __cast5_encrypt(ctx, keystream, ctrblk); - crypto_xor(keystream, src, nbytes); - memcpy(dst, keystream, nbytes); + crypto_xor_cpy(dst, keystream, src, nbytes); crypto_inc(ctrblk, CAST5_BLOCK_SIZE); } diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c index d6fc59aaaadf..30c0a37f4882 100644 --- a/arch/x86/crypto/des3_ede_glue.c +++ b/arch/x86/crypto/des3_ede_glue.c @@ -277,8 +277,7 @@ static void ctr_crypt_final(struct des3_ede_x86_ctx *ctx, unsigned int nbytes = walk->nbytes; des3_ede_enc_blk(ctx, keystream, ctrblk); - crypto_xor(keystream, src, nbytes); - memcpy(dst, keystream, nbytes); + crypto_xor_cpy(dst, keystream, src, nbytes); crypto_inc(ctrblk, DES3_EDE_BLOCK_SIZE); } -- cgit v1.2.3 From 6d6254d728a2e696aa697b4b44cb7736851f62e3 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 24 Jul 2017 11:28:05 +0100 Subject: crypto: arm64/ghash-ce - add non-SIMD scalar fallback The arm64 kernel will shortly disallow nested kernel mode NEON, so add a fallback to scalar C code that can be invoked in that case. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/Kconfig | 3 ++- arch/arm64/crypto/ghash-ce-glue.c | 49 ++++++++++++++++++++++++++++++++------- 2 files changed, 43 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index d92293747d63..7d75a363e317 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -28,8 +28,9 @@ config CRYPTO_SHA2_ARM64_CE config CRYPTO_GHASH_ARM64_CE tristate "GHASH (for GCM chaining mode) using ARMv8 Crypto Extensions" - depends on ARM64 && KERNEL_MODE_NEON + depends on KERNEL_MODE_NEON select CRYPTO_HASH + select CRYPTO_GF128MUL config CRYPTO_CRCT10DIF_ARM64_CE tristate "CRCT10DIF digest algorithm using PMULL instructions" diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c index 833ec1e3f3e9..30221ef56e70 100644 --- a/arch/arm64/crypto/ghash-ce-glue.c +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -1,7 +1,7 @@ /* * Accelerated GHASH implementation with ARMv8 PMULL instructions. * - * Copyright (C) 2014 Linaro Ltd. + * Copyright (C) 2014 - 2017 Linaro Ltd. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 as published @@ -9,7 +9,9 @@ */ #include +#include #include +#include #include #include #include @@ -25,6 +27,7 @@ MODULE_LICENSE("GPL v2"); struct ghash_key { u64 a; u64 b; + be128 k; }; struct ghash_desc_ctx { @@ -44,6 +47,36 @@ static int ghash_init(struct shash_desc *desc) return 0; } +static void ghash_do_update(int blocks, u64 dg[], const char *src, + struct ghash_key *key, const char *head) +{ + if (likely(may_use_simd())) { + kernel_neon_begin(); + pmull_ghash_update(blocks, dg, src, key, head); + kernel_neon_end(); + } else { + be128 dst = { cpu_to_be64(dg[1]), cpu_to_be64(dg[0]) }; + + do { + const u8 *in = src; + + if (head) { + in = head; + blocks++; + head = NULL; + } else { + src += GHASH_BLOCK_SIZE; + } + + crypto_xor((u8 *)&dst, in, GHASH_BLOCK_SIZE); + gf128mul_lle(&dst, &key->k); + } while (--blocks); + + dg[0] = be64_to_cpu(dst.b); + dg[1] = be64_to_cpu(dst.a); + } +} + static int ghash_update(struct shash_desc *desc, const u8 *src, unsigned int len) { @@ -67,10 +100,9 @@ static int ghash_update(struct shash_desc *desc, const u8 *src, blocks = len / GHASH_BLOCK_SIZE; len %= GHASH_BLOCK_SIZE; - kernel_neon_begin_partial(8); - pmull_ghash_update(blocks, ctx->digest, src, key, - partial ? ctx->buf : NULL); - kernel_neon_end(); + ghash_do_update(blocks, ctx->digest, src, key, + partial ? ctx->buf : NULL); + src += blocks * GHASH_BLOCK_SIZE; partial = 0; } @@ -89,9 +121,7 @@ static int ghash_final(struct shash_desc *desc, u8 *dst) memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial); - kernel_neon_begin_partial(8); - pmull_ghash_update(1, ctx->digest, ctx->buf, key, NULL); - kernel_neon_end(); + ghash_do_update(1, ctx->digest, ctx->buf, key, NULL); } put_unaligned_be64(ctx->digest[1], dst); put_unaligned_be64(ctx->digest[0], dst + 8); @@ -111,6 +141,9 @@ static int ghash_setkey(struct crypto_shash *tfm, return -EINVAL; } + /* needed for the fallback */ + memcpy(&key->k, inkey, GHASH_BLOCK_SIZE); + /* perform multiplication by 'x' in GF(2^128) */ b = get_unaligned_be64(inkey); a = get_unaligned_be64(inkey + 8); -- cgit v1.2.3 From 2dde374e1ff0663ca46b343949c242959028f976 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 24 Jul 2017 11:28:06 +0100 Subject: crypto: arm64/crct10dif - add non-SIMD generic fallback The arm64 kernel will shortly disallow nested kernel mode NEON, so add a fallback to scalar C code that can be invoked in that case. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/crct10dif-ce-glue.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c index 60cb590c2590..96f0cae4a022 100644 --- a/arch/arm64/crypto/crct10dif-ce-glue.c +++ b/arch/arm64/crypto/crct10dif-ce-glue.c @@ -1,7 +1,7 @@ /* * Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions * - * Copyright (C) 2016 Linaro Ltd + * Copyright (C) 2016 - 2017 Linaro Ltd * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -18,6 +18,7 @@ #include #include +#include #define CRC_T10DIF_PMULL_CHUNK_SIZE 16U @@ -48,9 +49,13 @@ static int crct10dif_update(struct shash_desc *desc, const u8 *data, } if (length > 0) { - kernel_neon_begin_partial(14); - *crc = crc_t10dif_pmull(*crc, data, length); - kernel_neon_end(); + if (may_use_simd()) { + kernel_neon_begin(); + *crc = crc_t10dif_pmull(*crc, data, length); + kernel_neon_end(); + } else { + *crc = crc_t10dif_generic(*crc, data, length); + } } return 0; -- cgit v1.2.3 From 15c7d8f8a2c7c86cb36f4d0273a1b2b2c9a479d7 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 24 Jul 2017 11:28:07 +0100 Subject: crypto: arm64/crc32 - add non-SIMD scalar fallback The arm64 kernel will shortly disallow nested kernel mode NEON, so add a fallback to scalar C code that can be invoked in that case. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/crc32-ce-glue.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/arm64/crypto/crc32-ce-glue.c b/arch/arm64/crypto/crc32-ce-glue.c index eccb1ae90064..624f4137918c 100644 --- a/arch/arm64/crypto/crc32-ce-glue.c +++ b/arch/arm64/crypto/crc32-ce-glue.c @@ -1,7 +1,7 @@ /* * Accelerated CRC32(C) using arm64 NEON and Crypto Extensions instructions * - * Copyright (C) 2016 Linaro Ltd + * Copyright (C) 2016 - 2017 Linaro Ltd * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -19,6 +19,7 @@ #include #include +#include #include #define PMULL_MIN_LEN 64L /* minimum size of buffer @@ -105,10 +106,10 @@ static int crc32_pmull_update(struct shash_desc *desc, const u8 *data, length -= l; } - if (length >= PMULL_MIN_LEN) { + if (length >= PMULL_MIN_LEN && may_use_simd()) { l = round_down(length, SCALE_F); - kernel_neon_begin_partial(10); + kernel_neon_begin(); *crc = crc32_pmull_le(data, l, *crc); kernel_neon_end(); @@ -137,10 +138,10 @@ static int crc32c_pmull_update(struct shash_desc *desc, const u8 *data, length -= l; } - if (length >= PMULL_MIN_LEN) { + if (length >= PMULL_MIN_LEN && may_use_simd()) { l = round_down(length, SCALE_F); - kernel_neon_begin_partial(10); + kernel_neon_begin(); *crc = crc32c_pmull_le(data, l, *crc); kernel_neon_end(); -- cgit v1.2.3 From 0771f3234db67732a49777a5fceaed6f7b4b488d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 24 Jul 2017 11:28:08 +0100 Subject: crypto: arm64/sha1-ce - add non-SIMD generic fallback The arm64 kernel will shortly disallow nested kernel mode NEON, so add a fallback to scalar C code that can be invoked in that case. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/Kconfig | 3 ++- arch/arm64/crypto/sha1-ce-glue.c | 18 ++++++++++++++---- 2 files changed, 16 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index 7d75a363e317..5d5953545dad 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -18,8 +18,9 @@ config CRYPTO_SHA512_ARM64 config CRYPTO_SHA1_ARM64_CE tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)" - depends on ARM64 && KERNEL_MODE_NEON + depends on KERNEL_MODE_NEON select CRYPTO_HASH + select CRYPTO_SHA1 config CRYPTO_SHA2_ARM64_CE tristate "SHA-224/SHA-256 digest algorithm (ARMv8 Crypto Extensions)" diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c index ea319c055f5d..efbeb3e0dcfb 100644 --- a/arch/arm64/crypto/sha1-ce-glue.c +++ b/arch/arm64/crypto/sha1-ce-glue.c @@ -1,7 +1,7 @@ /* * sha1-ce-glue.c - SHA-1 secure hash using ARMv8 Crypto Extensions * - * Copyright (C) 2014 Linaro Ltd + * Copyright (C) 2014 - 2017 Linaro Ltd * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -9,6 +9,7 @@ */ #include +#include #include #include #include @@ -37,8 +38,11 @@ static int sha1_ce_update(struct shash_desc *desc, const u8 *data, { struct sha1_ce_state *sctx = shash_desc_ctx(desc); + if (!may_use_simd()) + return crypto_sha1_update(desc, data, len); + sctx->finalize = 0; - kernel_neon_begin_partial(16); + kernel_neon_begin(); sha1_base_do_update(desc, data, len, (sha1_block_fn *)sha1_ce_transform); kernel_neon_end(); @@ -52,13 +56,16 @@ static int sha1_ce_finup(struct shash_desc *desc, const u8 *data, struct sha1_ce_state *sctx = shash_desc_ctx(desc); bool finalize = !sctx->sst.count && !(len % SHA1_BLOCK_SIZE); + if (!may_use_simd()) + return crypto_sha1_finup(desc, data, len, out); + /* * Allow the asm code to perform the finalization if there is no * partial data and the input is a round multiple of the block size. */ sctx->finalize = finalize; - kernel_neon_begin_partial(16); + kernel_neon_begin(); sha1_base_do_update(desc, data, len, (sha1_block_fn *)sha1_ce_transform); if (!finalize) @@ -71,8 +78,11 @@ static int sha1_ce_final(struct shash_desc *desc, u8 *out) { struct sha1_ce_state *sctx = shash_desc_ctx(desc); + if (!may_use_simd()) + return crypto_sha1_finup(desc, NULL, 0, out); + sctx->finalize = 0; - kernel_neon_begin_partial(16); + kernel_neon_begin(); sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_ce_transform); kernel_neon_end(); return sha1_base_finish(desc, out); -- cgit v1.2.3 From da1793312f7693787e0ed22aa121261c3e0e15c0 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 24 Jul 2017 11:28:09 +0100 Subject: crypto: arm64/sha2-ce - add non-SIMD scalar fallback The arm64 kernel will shortly disallow nested kernel mode NEON, so add a fallback to scalar code that can be invoked in that case. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/Kconfig | 3 ++- arch/arm64/crypto/sha2-ce-glue.c | 30 ++++++++++++++++++++++++++---- arch/arm64/crypto/sha256-glue.c | 1 + 3 files changed, 29 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index 5d5953545dad..8cd145f9c1ff 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -24,8 +24,9 @@ config CRYPTO_SHA1_ARM64_CE config CRYPTO_SHA2_ARM64_CE tristate "SHA-224/SHA-256 digest algorithm (ARMv8 Crypto Extensions)" - depends on ARM64 && KERNEL_MODE_NEON + depends on KERNEL_MODE_NEON select CRYPTO_HASH + select CRYPTO_SHA256_ARM64 config CRYPTO_GHASH_ARM64_CE tristate "GHASH (for GCM chaining mode) using ARMv8 Crypto Extensions" diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c index 0ed9486f75dd..fd1ff2b13dfa 100644 --- a/arch/arm64/crypto/sha2-ce-glue.c +++ b/arch/arm64/crypto/sha2-ce-glue.c @@ -1,7 +1,7 @@ /* * sha2-ce-glue.c - SHA-224/SHA-256 using ARMv8 Crypto Extensions * - * Copyright (C) 2014 Linaro Ltd + * Copyright (C) 2014 - 2017 Linaro Ltd * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -9,6 +9,7 @@ */ #include +#include #include #include #include @@ -34,13 +35,19 @@ const u32 sha256_ce_offsetof_count = offsetof(struct sha256_ce_state, const u32 sha256_ce_offsetof_finalize = offsetof(struct sha256_ce_state, finalize); +asmlinkage void sha256_block_data_order(u32 *digest, u8 const *src, int blocks); + static int sha256_ce_update(struct shash_desc *desc, const u8 *data, unsigned int len) { struct sha256_ce_state *sctx = shash_desc_ctx(desc); + if (!may_use_simd()) + return sha256_base_do_update(desc, data, len, + (sha256_block_fn *)sha256_block_data_order); + sctx->finalize = 0; - kernel_neon_begin_partial(28); + kernel_neon_begin(); sha256_base_do_update(desc, data, len, (sha256_block_fn *)sha2_ce_transform); kernel_neon_end(); @@ -54,13 +61,22 @@ static int sha256_ce_finup(struct shash_desc *desc, const u8 *data, struct sha256_ce_state *sctx = shash_desc_ctx(desc); bool finalize = !sctx->sst.count && !(len % SHA256_BLOCK_SIZE); + if (!may_use_simd()) { + if (len) + sha256_base_do_update(desc, data, len, + (sha256_block_fn *)sha256_block_data_order); + sha256_base_do_finalize(desc, + (sha256_block_fn *)sha256_block_data_order); + return sha256_base_finish(desc, out); + } + /* * Allow the asm code to perform the finalization if there is no * partial data and the input is a round multiple of the block size. */ sctx->finalize = finalize; - kernel_neon_begin_partial(28); + kernel_neon_begin(); sha256_base_do_update(desc, data, len, (sha256_block_fn *)sha2_ce_transform); if (!finalize) @@ -74,8 +90,14 @@ static int sha256_ce_final(struct shash_desc *desc, u8 *out) { struct sha256_ce_state *sctx = shash_desc_ctx(desc); + if (!may_use_simd()) { + sha256_base_do_finalize(desc, + (sha256_block_fn *)sha256_block_data_order); + return sha256_base_finish(desc, out); + } + sctx->finalize = 0; - kernel_neon_begin_partial(28); + kernel_neon_begin(); sha256_base_do_finalize(desc, (sha256_block_fn *)sha2_ce_transform); kernel_neon_end(); return sha256_base_finish(desc, out); diff --git a/arch/arm64/crypto/sha256-glue.c b/arch/arm64/crypto/sha256-glue.c index a2226f841960..b064d925fe2a 100644 --- a/arch/arm64/crypto/sha256-glue.c +++ b/arch/arm64/crypto/sha256-glue.c @@ -29,6 +29,7 @@ MODULE_ALIAS_CRYPTO("sha256"); asmlinkage void sha256_block_data_order(u32 *digest, const void *data, unsigned int num_blks); +EXPORT_SYMBOL(sha256_block_data_order); asmlinkage void sha256_block_neon(u32 *digest, const void *data, unsigned int num_blks); -- cgit v1.2.3 From f402e3115e20b345bd6fbfcf463a506d958c7bf6 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 24 Jul 2017 11:28:10 +0100 Subject: crypto: arm64/aes-ce-cipher - match round key endianness with generic code In order to be able to reuse the generic AES code as a fallback for situations where the NEON may not be used, update the key handling to match the byte order of the generic code: it stores round keys as sequences of 32-bit quantities rather than streams of bytes, and so our code needs to be updated to reflect that. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/aes-ce-ccm-core.S | 30 +++++++++++++++--------------- arch/arm64/crypto/aes-ce-cipher.c | 35 ++++++++++++++++------------------- arch/arm64/crypto/aes-ce.S | 12 ++++++------ 3 files changed, 37 insertions(+), 40 deletions(-) (limited to 'arch') diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S index 3363560c79b7..e3a375c4cb83 100644 --- a/arch/arm64/crypto/aes-ce-ccm-core.S +++ b/arch/arm64/crypto/aes-ce-ccm-core.S @@ -1,7 +1,7 @@ /* * aesce-ccm-core.S - AES-CCM transform for ARMv8 with Crypto Extensions * - * Copyright (C) 2013 - 2014 Linaro Ltd + * Copyright (C) 2013 - 2017 Linaro Ltd * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -32,7 +32,7 @@ ENTRY(ce_aes_ccm_auth_data) beq 8f /* out of input? */ cbnz w8, 0b eor v0.16b, v0.16b, v1.16b -1: ld1 {v3.16b}, [x4] /* load first round key */ +1: ld1 {v3.4s}, [x4] /* load first round key */ prfm pldl1strm, [x1] cmp w5, #12 /* which key size? */ add x6, x4, #16 @@ -42,17 +42,17 @@ ENTRY(ce_aes_ccm_auth_data) mov v5.16b, v3.16b b 4f 2: mov v4.16b, v3.16b - ld1 {v5.16b}, [x6], #16 /* load 2nd round key */ + ld1 {v5.4s}, [x6], #16 /* load 2nd round key */ 3: aese v0.16b, v4.16b aesmc v0.16b, v0.16b -4: ld1 {v3.16b}, [x6], #16 /* load next round key */ +4: ld1 {v3.4s}, [x6], #16 /* load next round key */ aese v0.16b, v5.16b aesmc v0.16b, v0.16b -5: ld1 {v4.16b}, [x6], #16 /* load next round key */ +5: ld1 {v4.4s}, [x6], #16 /* load next round key */ subs w7, w7, #3 aese v0.16b, v3.16b aesmc v0.16b, v0.16b - ld1 {v5.16b}, [x6], #16 /* load next round key */ + ld1 {v5.4s}, [x6], #16 /* load next round key */ bpl 3b aese v0.16b, v4.16b subs w2, w2, #16 /* last data? */ @@ -90,7 +90,7 @@ ENDPROC(ce_aes_ccm_auth_data) * u32 rounds); */ ENTRY(ce_aes_ccm_final) - ld1 {v3.16b}, [x2], #16 /* load first round key */ + ld1 {v3.4s}, [x2], #16 /* load first round key */ ld1 {v0.16b}, [x0] /* load mac */ cmp w3, #12 /* which key size? */ sub w3, w3, #2 /* modified # of rounds */ @@ -100,17 +100,17 @@ ENTRY(ce_aes_ccm_final) mov v5.16b, v3.16b b 2f 0: mov v4.16b, v3.16b -1: ld1 {v5.16b}, [x2], #16 /* load next round key */ +1: ld1 {v5.4s}, [x2], #16 /* load next round key */ aese v0.16b, v4.16b aesmc v0.16b, v0.16b aese v1.16b, v4.16b aesmc v1.16b, v1.16b -2: ld1 {v3.16b}, [x2], #16 /* load next round key */ +2: ld1 {v3.4s}, [x2], #16 /* load next round key */ aese v0.16b, v5.16b aesmc v0.16b, v0.16b aese v1.16b, v5.16b aesmc v1.16b, v1.16b -3: ld1 {v4.16b}, [x2], #16 /* load next round key */ +3: ld1 {v4.4s}, [x2], #16 /* load next round key */ subs w3, w3, #3 aese v0.16b, v3.16b aesmc v0.16b, v0.16b @@ -137,31 +137,31 @@ CPU_LE( rev x8, x8 ) /* keep swabbed ctr in reg */ cmp w4, #12 /* which key size? */ sub w7, w4, #2 /* get modified # of rounds */ ins v1.d[1], x9 /* no carry in lower ctr */ - ld1 {v3.16b}, [x3] /* load first round key */ + ld1 {v3.4s}, [x3] /* load first round key */ add x10, x3, #16 bmi 1f bne 4f mov v5.16b, v3.16b b 3f 1: mov v4.16b, v3.16b - ld1 {v5.16b}, [x10], #16 /* load 2nd round key */ + ld1 {v5.4s}, [x10], #16 /* load 2nd round key */ 2: /* inner loop: 3 rounds, 2x interleaved */ aese v0.16b, v4.16b aesmc v0.16b, v0.16b aese v1.16b, v4.16b aesmc v1.16b, v1.16b -3: ld1 {v3.16b}, [x10], #16 /* load next round key */ +3: ld1 {v3.4s}, [x10], #16 /* load next round key */ aese v0.16b, v5.16b aesmc v0.16b, v0.16b aese v1.16b, v5.16b aesmc v1.16b, v1.16b -4: ld1 {v4.16b}, [x10], #16 /* load next round key */ +4: ld1 {v4.4s}, [x10], #16 /* load next round key */ subs w7, w7, #3 aese v0.16b, v3.16b aesmc v0.16b, v0.16b aese v1.16b, v3.16b aesmc v1.16b, v1.16b - ld1 {v5.16b}, [x10], #16 /* load next round key */ + ld1 {v5.4s}, [x10], #16 /* load next round key */ bpl 2b aese v0.16b, v4.16b aese v1.16b, v4.16b diff --git a/arch/arm64/crypto/aes-ce-cipher.c b/arch/arm64/crypto/aes-ce-cipher.c index 50d9fe11d0c8..a0a0e5e3a8b5 100644 --- a/arch/arm64/crypto/aes-ce-cipher.c +++ b/arch/arm64/crypto/aes-ce-cipher.c @@ -1,7 +1,7 @@ /* * aes-ce-cipher.c - core AES cipher using ARMv8 Crypto Extensions * - * Copyright (C) 2013 - 2014 Linaro Ltd + * Copyright (C) 2013 - 2017 Linaro Ltd * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -9,6 +9,7 @@ */ #include +#include #include #include #include @@ -47,24 +48,24 @@ static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) kernel_neon_begin_partial(4); __asm__(" ld1 {v0.16b}, %[in] ;" - " ld1 {v1.16b}, [%[key]], #16 ;" + " ld1 {v1.4s}, [%[key]], #16 ;" " cmp %w[rounds], #10 ;" " bmi 0f ;" " bne 3f ;" " mov v3.16b, v1.16b ;" " b 2f ;" "0: mov v2.16b, v1.16b ;" - " ld1 {v3.16b}, [%[key]], #16 ;" + " ld1 {v3.4s}, [%[key]], #16 ;" "1: aese v0.16b, v2.16b ;" " aesmc v0.16b, v0.16b ;" - "2: ld1 {v1.16b}, [%[key]], #16 ;" + "2: ld1 {v1.4s}, [%[key]], #16 ;" " aese v0.16b, v3.16b ;" " aesmc v0.16b, v0.16b ;" - "3: ld1 {v2.16b}, [%[key]], #16 ;" + "3: ld1 {v2.4s}, [%[key]], #16 ;" " subs %w[rounds], %w[rounds], #3 ;" " aese v0.16b, v1.16b ;" " aesmc v0.16b, v0.16b ;" - " ld1 {v3.16b}, [%[key]], #16 ;" + " ld1 {v3.4s}, [%[key]], #16 ;" " bpl 1b ;" " aese v0.16b, v2.16b ;" " eor v0.16b, v0.16b, v3.16b ;" @@ -92,24 +93,24 @@ static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) kernel_neon_begin_partial(4); __asm__(" ld1 {v0.16b}, %[in] ;" - " ld1 {v1.16b}, [%[key]], #16 ;" + " ld1 {v1.4s}, [%[key]], #16 ;" " cmp %w[rounds], #10 ;" " bmi 0f ;" " bne 3f ;" " mov v3.16b, v1.16b ;" " b 2f ;" "0: mov v2.16b, v1.16b ;" - " ld1 {v3.16b}, [%[key]], #16 ;" + " ld1 {v3.4s}, [%[key]], #16 ;" "1: aesd v0.16b, v2.16b ;" " aesimc v0.16b, v0.16b ;" - "2: ld1 {v1.16b}, [%[key]], #16 ;" + "2: ld1 {v1.4s}, [%[key]], #16 ;" " aesd v0.16b, v3.16b ;" " aesimc v0.16b, v0.16b ;" - "3: ld1 {v2.16b}, [%[key]], #16 ;" + "3: ld1 {v2.4s}, [%[key]], #16 ;" " subs %w[rounds], %w[rounds], #3 ;" " aesd v0.16b, v1.16b ;" " aesimc v0.16b, v0.16b ;" - " ld1 {v3.16b}, [%[key]], #16 ;" + " ld1 {v3.4s}, [%[key]], #16 ;" " bpl 1b ;" " aesd v0.16b, v2.16b ;" " eor v0.16b, v0.16b, v3.16b ;" @@ -165,20 +166,16 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key, key_len != AES_KEYSIZE_256) return -EINVAL; - memcpy(ctx->key_enc, in_key, key_len); ctx->key_length = key_len; + for (i = 0; i < kwords; i++) + ctx->key_enc[i] = get_unaligned_le32(in_key + i * sizeof(u32)); kernel_neon_begin_partial(2); for (i = 0; i < sizeof(rcon); i++) { u32 *rki = ctx->key_enc + (i * kwords); u32 *rko = rki + kwords; -#ifndef CONFIG_CPU_BIG_ENDIAN rko[0] = ror32(aes_sub(rki[kwords - 1]), 8) ^ rcon[i] ^ rki[0]; -#else - rko[0] = rol32(aes_sub(rki[kwords - 1]), 8) ^ (rcon[i] << 24) ^ - rki[0]; -#endif rko[1] = rko[0] ^ rki[1]; rko[2] = rko[1] ^ rki[2]; rko[3] = rko[2] ^ rki[3]; @@ -210,9 +207,9 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key, key_dec[0] = key_enc[j]; for (i = 1, j--; j > 0; i++, j--) - __asm__("ld1 {v0.16b}, %[in] ;" + __asm__("ld1 {v0.4s}, %[in] ;" "aesimc v1.16b, v0.16b ;" - "st1 {v1.16b}, %[out] ;" + "st1 {v1.4s}, %[out] ;" : [out] "=Q"(key_dec[i]) : [in] "Q"(key_enc[j]) diff --git a/arch/arm64/crypto/aes-ce.S b/arch/arm64/crypto/aes-ce.S index b46093d567e5..50330f5c3adc 100644 --- a/arch/arm64/crypto/aes-ce.S +++ b/arch/arm64/crypto/aes-ce.S @@ -2,7 +2,7 @@ * linux/arch/arm64/crypto/aes-ce.S - AES cipher for ARMv8 with * Crypto Extensions * - * Copyright (C) 2013 Linaro Ltd + * Copyright (C) 2013 - 2017 Linaro Ltd * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -22,11 +22,11 @@ cmp \rounds, #12 blo 2222f /* 128 bits */ beq 1111f /* 192 bits */ - ld1 {v17.16b-v18.16b}, [\rk], #32 -1111: ld1 {v19.16b-v20.16b}, [\rk], #32 -2222: ld1 {v21.16b-v24.16b}, [\rk], #64 - ld1 {v25.16b-v28.16b}, [\rk], #64 - ld1 {v29.16b-v31.16b}, [\rk] + ld1 {v17.4s-v18.4s}, [\rk], #32 +1111: ld1 {v19.4s-v20.4s}, [\rk], #32 +2222: ld1 {v21.4s-v24.4s}, [\rk], #64 + ld1 {v25.4s-v28.4s}, [\rk], #64 + ld1 {v29.4s-v31.4s}, [\rk] .endm /* prepare for encryption with key in rk[] */ -- cgit v1.2.3 From b8fb993a836cd432309410eadf083bbe9c0e9e9c Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 24 Jul 2017 11:28:11 +0100 Subject: crypto: arm64/aes-ce-cipher: add non-SIMD generic fallback The arm64 kernel will shortly disallow nested kernel mode NEON, so add a fallback to scalar code that can be invoked in that case. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/Kconfig | 1 + arch/arm64/crypto/aes-ce-cipher.c | 20 +++++++++++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index 8cd145f9c1ff..2fd4bb6d0b5a 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -52,6 +52,7 @@ config CRYPTO_AES_ARM64_CE tristate "AES core cipher using ARMv8 Crypto Extensions" depends on ARM64 && KERNEL_MODE_NEON select CRYPTO_ALGAPI + select CRYPTO_AES_ARM64 config CRYPTO_AES_ARM64_CE_CCM tristate "AES in CCM mode using ARMv8 Crypto Extensions" diff --git a/arch/arm64/crypto/aes-ce-cipher.c b/arch/arm64/crypto/aes-ce-cipher.c index a0a0e5e3a8b5..6a75cd75ed11 100644 --- a/arch/arm64/crypto/aes-ce-cipher.c +++ b/arch/arm64/crypto/aes-ce-cipher.c @@ -9,6 +9,7 @@ */ #include +#include #include #include #include @@ -21,6 +22,9 @@ MODULE_DESCRIPTION("Synchronous AES cipher using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel "); MODULE_LICENSE("GPL v2"); +asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds); +asmlinkage void __aes_arm64_decrypt(u32 *rk, u8 *out, const u8 *in, int rounds); + struct aes_block { u8 b[AES_BLOCK_SIZE]; }; @@ -45,7 +49,12 @@ static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) void *dummy0; int dummy1; - kernel_neon_begin_partial(4); + if (!may_use_simd()) { + __aes_arm64_encrypt(ctx->key_enc, dst, src, num_rounds(ctx)); + return; + } + + kernel_neon_begin(); __asm__(" ld1 {v0.16b}, %[in] ;" " ld1 {v1.4s}, [%[key]], #16 ;" @@ -90,7 +99,12 @@ static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) void *dummy0; int dummy1; - kernel_neon_begin_partial(4); + if (!may_use_simd()) { + __aes_arm64_decrypt(ctx->key_dec, dst, src, num_rounds(ctx)); + return; + } + + kernel_neon_begin(); __asm__(" ld1 {v0.16b}, %[in] ;" " ld1 {v1.4s}, [%[key]], #16 ;" @@ -170,7 +184,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key, for (i = 0; i < kwords; i++) ctx->key_enc[i] = get_unaligned_le32(in_key + i * sizeof(u32)); - kernel_neon_begin_partial(2); + kernel_neon_begin(); for (i = 0; i < sizeof(rcon); i++) { u32 *rki = ctx->key_enc + (i * kwords); u32 *rko = rki + kwords; -- cgit v1.2.3 From 5092fcf3490811a735ef44bd22d8b5ff1bd63926 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 24 Jul 2017 11:28:12 +0100 Subject: crypto: arm64/aes-ce-ccm: add non-SIMD generic fallback The arm64 kernel will shortly disallow nested kernel mode NEON. So honour this in the ARMv8 Crypto Extensions implementation of CCM-AES, and fall back to a scalar implementation using the generic crypto helpers for AES, XOR and incrementing the CTR counter. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/Kconfig | 1 + arch/arm64/crypto/aes-ce-ccm-glue.c | 174 ++++++++++++++++++++++++++++-------- 2 files changed, 140 insertions(+), 35 deletions(-) (limited to 'arch') diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index 2fd4bb6d0b5a..ba637765c19a 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -59,6 +59,7 @@ config CRYPTO_AES_ARM64_CE_CCM depends on ARM64 && KERNEL_MODE_NEON select CRYPTO_ALGAPI select CRYPTO_AES_ARM64_CE + select CRYPTO_AES_ARM64 select CRYPTO_AEAD config CRYPTO_AES_ARM64_CE_BLK diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c index 6a7dbc7c83a6..a1254036f2b1 100644 --- a/arch/arm64/crypto/aes-ce-ccm-glue.c +++ b/arch/arm64/crypto/aes-ce-ccm-glue.c @@ -1,7 +1,7 @@ /* * aes-ccm-glue.c - AES-CCM transform for ARMv8 with Crypto Extensions * - * Copyright (C) 2013 - 2014 Linaro Ltd + * Copyright (C) 2013 - 2017 Linaro Ltd * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -9,6 +9,7 @@ */ #include +#include #include #include #include @@ -44,6 +45,8 @@ asmlinkage void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes, asmlinkage void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u32 const rk[], u32 rounds); +asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds); + static int ccm_setkey(struct crypto_aead *tfm, const u8 *in_key, unsigned int key_len) { @@ -103,7 +106,45 @@ static int ccm_init_mac(struct aead_request *req, u8 maciv[], u32 msglen) return 0; } -static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[]) +static void ccm_update_mac(struct crypto_aes_ctx *key, u8 mac[], u8 const in[], + u32 abytes, u32 *macp, bool use_neon) +{ + if (likely(use_neon)) { + ce_aes_ccm_auth_data(mac, in, abytes, macp, key->key_enc, + num_rounds(key)); + } else { + if (*macp > 0 && *macp < AES_BLOCK_SIZE) { + int added = min(abytes, AES_BLOCK_SIZE - *macp); + + crypto_xor(&mac[*macp], in, added); + + *macp += added; + in += added; + abytes -= added; + } + + while (abytes > AES_BLOCK_SIZE) { + __aes_arm64_encrypt(key->key_enc, mac, mac, + num_rounds(key)); + crypto_xor(mac, in, AES_BLOCK_SIZE); + + in += AES_BLOCK_SIZE; + abytes -= AES_BLOCK_SIZE; + } + + if (abytes > 0) { + __aes_arm64_encrypt(key->key_enc, mac, mac, + num_rounds(key)); + crypto_xor(mac, in, abytes); + *macp = abytes; + } else { + *macp = 0; + } + } +} + +static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[], + bool use_neon) { struct crypto_aead *aead = crypto_aead_reqtfm(req); struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead); @@ -122,8 +163,7 @@ static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[]) ltag.len = 6; } - ce_aes_ccm_auth_data(mac, (u8 *)<ag, ltag.len, &macp, ctx->key_enc, - num_rounds(ctx)); + ccm_update_mac(ctx, mac, (u8 *)<ag, ltag.len, &macp, use_neon); scatterwalk_start(&walk, req->src); do { @@ -135,8 +175,7 @@ static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[]) n = scatterwalk_clamp(&walk, len); } p = scatterwalk_map(&walk); - ce_aes_ccm_auth_data(mac, p, n, &macp, ctx->key_enc, - num_rounds(ctx)); + ccm_update_mac(ctx, mac, p, n, &macp, use_neon); len -= n; scatterwalk_unmap(p); @@ -145,6 +184,56 @@ static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[]) } while (len); } +static int ccm_crypt_fallback(struct skcipher_walk *walk, u8 mac[], u8 iv0[], + struct crypto_aes_ctx *ctx, bool enc) +{ + u8 buf[AES_BLOCK_SIZE]; + int err = 0; + + while (walk->nbytes) { + int blocks = walk->nbytes / AES_BLOCK_SIZE; + u32 tail = walk->nbytes % AES_BLOCK_SIZE; + u8 *dst = walk->dst.virt.addr; + u8 *src = walk->src.virt.addr; + u32 nbytes = walk->nbytes; + + if (nbytes == walk->total && tail > 0) { + blocks++; + tail = 0; + } + + do { + u32 bsize = AES_BLOCK_SIZE; + + if (nbytes < AES_BLOCK_SIZE) + bsize = nbytes; + + crypto_inc(walk->iv, AES_BLOCK_SIZE); + __aes_arm64_encrypt(ctx->key_enc, buf, walk->iv, + num_rounds(ctx)); + __aes_arm64_encrypt(ctx->key_enc, mac, mac, + num_rounds(ctx)); + if (enc) + crypto_xor(mac, src, bsize); + crypto_xor_cpy(dst, src, buf, bsize); + if (!enc) + crypto_xor(mac, dst, bsize); + dst += bsize; + src += bsize; + nbytes -= bsize; + } while (--blocks); + + err = skcipher_walk_done(walk, tail); + } + + if (!err) { + __aes_arm64_encrypt(ctx->key_enc, buf, iv0, num_rounds(ctx)); + __aes_arm64_encrypt(ctx->key_enc, mac, mac, num_rounds(ctx)); + crypto_xor(mac, buf, AES_BLOCK_SIZE); + } + return err; +} + static int ccm_encrypt(struct aead_request *req) { struct crypto_aead *aead = crypto_aead_reqtfm(req); @@ -153,39 +242,46 @@ static int ccm_encrypt(struct aead_request *req) u8 __aligned(8) mac[AES_BLOCK_SIZE]; u8 buf[AES_BLOCK_SIZE]; u32 len = req->cryptlen; + bool use_neon = may_use_simd(); int err; err = ccm_init_mac(req, mac, len); if (err) return err; - kernel_neon_begin_partial(6); + if (likely(use_neon)) + kernel_neon_begin(); if (req->assoclen) - ccm_calculate_auth_mac(req, mac); + ccm_calculate_auth_mac(req, mac, use_neon); /* preserve the original iv for the final round */ memcpy(buf, req->iv, AES_BLOCK_SIZE); err = skcipher_walk_aead_encrypt(&walk, req, true); - while (walk.nbytes) { - u32 tail = walk.nbytes % AES_BLOCK_SIZE; - - if (walk.nbytes == walk.total) - tail = 0; + if (likely(use_neon)) { + while (walk.nbytes) { + u32 tail = walk.nbytes % AES_BLOCK_SIZE; - ce_aes_ccm_encrypt(walk.dst.virt.addr, walk.src.virt.addr, - walk.nbytes - tail, ctx->key_enc, - num_rounds(ctx), mac, walk.iv); + if (walk.nbytes == walk.total) + tail = 0; - err = skcipher_walk_done(&walk, tail); - } - if (!err) - ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx)); + ce_aes_ccm_encrypt(walk.dst.virt.addr, + walk.src.virt.addr, + walk.nbytes - tail, ctx->key_enc, + num_rounds(ctx), mac, walk.iv); - kernel_neon_end(); + err = skcipher_walk_done(&walk, tail); + } + if (!err) + ce_aes_ccm_final(mac, buf, ctx->key_enc, + num_rounds(ctx)); + kernel_neon_end(); + } else { + err = ccm_crypt_fallback(&walk, mac, buf, ctx, true); + } if (err) return err; @@ -205,38 +301,46 @@ static int ccm_decrypt(struct aead_request *req) u8 __aligned(8) mac[AES_BLOCK_SIZE]; u8 buf[AES_BLOCK_SIZE]; u32 len = req->cryptlen - authsize; + bool use_neon = may_use_simd(); int err; err = ccm_init_mac(req, mac, len); if (err) return err; - kernel_neon_begin_partial(6); + if (likely(use_neon)) + kernel_neon_begin(); if (req->assoclen) - ccm_calculate_auth_mac(req, mac); + ccm_calculate_auth_mac(req, mac, use_neon); /* preserve the original iv for the final round */ memcpy(buf, req->iv, AES_BLOCK_SIZE); err = skcipher_walk_aead_decrypt(&walk, req, true); - while (walk.nbytes) { - u32 tail = walk.nbytes % AES_BLOCK_SIZE; + if (likely(use_neon)) { + while (walk.nbytes) { + u32 tail = walk.nbytes % AES_BLOCK_SIZE; - if (walk.nbytes == walk.total) - tail = 0; + if (walk.nbytes == walk.total) + tail = 0; - ce_aes_ccm_decrypt(walk.dst.virt.addr, walk.src.virt.addr, - walk.nbytes - tail, ctx->key_enc, - num_rounds(ctx), mac, walk.iv); + ce_aes_ccm_decrypt(walk.dst.virt.addr, + walk.src.virt.addr, + walk.nbytes - tail, ctx->key_enc, + num_rounds(ctx), mac, walk.iv); - err = skcipher_walk_done(&walk, tail); - } - if (!err) - ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx)); + err = skcipher_walk_done(&walk, tail); + } + if (!err) + ce_aes_ccm_final(mac, buf, ctx->key_enc, + num_rounds(ctx)); - kernel_neon_end(); + kernel_neon_end(); + } else { + err = ccm_crypt_fallback(&walk, mac, buf, ctx, false); + } if (err) return err; -- cgit v1.2.3 From e211506979e205e5a00b0a9d321fb3cbb44ee9ea Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 24 Jul 2017 11:28:13 +0100 Subject: crypto: arm64/aes-blk - add a non-SIMD fallback for synchronous CTR To accommodate systems that may disallow use of the NEON in kernel mode in some circumstances, introduce a C fallback for synchronous AES in CTR mode, and use it if may_use_simd() returns false. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/Kconfig | 6 ++-- arch/arm64/crypto/aes-ctr-fallback.h | 53 ++++++++++++++++++++++++++++++++ arch/arm64/crypto/aes-glue.c | 59 +++++++++++++++++++++++++++--------- 3 files changed, 101 insertions(+), 17 deletions(-) create mode 100644 arch/arm64/crypto/aes-ctr-fallback.h (limited to 'arch') diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index ba637765c19a..a068dcbe2518 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -64,15 +64,17 @@ config CRYPTO_AES_ARM64_CE_CCM config CRYPTO_AES_ARM64_CE_BLK tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions" - depends on ARM64 && KERNEL_MODE_NEON + depends on KERNEL_MODE_NEON select CRYPTO_BLKCIPHER select CRYPTO_AES_ARM64_CE + select CRYPTO_AES_ARM64 select CRYPTO_SIMD config CRYPTO_AES_ARM64_NEON_BLK tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions" - depends on ARM64 && KERNEL_MODE_NEON + depends on KERNEL_MODE_NEON select CRYPTO_BLKCIPHER + select CRYPTO_AES_ARM64 select CRYPTO_AES select CRYPTO_SIMD diff --git a/arch/arm64/crypto/aes-ctr-fallback.h b/arch/arm64/crypto/aes-ctr-fallback.h new file mode 100644 index 000000000000..c9285717b6b5 --- /dev/null +++ b/arch/arm64/crypto/aes-ctr-fallback.h @@ -0,0 +1,53 @@ +/* + * Fallback for sync aes(ctr) in contexts where kernel mode NEON + * is not allowed + * + * Copyright (C) 2017 Linaro Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include + +asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds); + +static inline int aes_ctr_encrypt_fallback(struct crypto_aes_ctx *ctx, + struct skcipher_request *req) +{ + struct skcipher_walk walk; + u8 buf[AES_BLOCK_SIZE]; + int err; + + err = skcipher_walk_virt(&walk, req, true); + + while (walk.nbytes > 0) { + u8 *dst = walk.dst.virt.addr; + u8 *src = walk.src.virt.addr; + int nbytes = walk.nbytes; + int tail = 0; + + if (nbytes < walk.total) { + nbytes = round_down(nbytes, AES_BLOCK_SIZE); + tail = walk.nbytes % AES_BLOCK_SIZE; + } + + do { + int bsize = min(nbytes, AES_BLOCK_SIZE); + + __aes_arm64_encrypt(ctx->key_enc, buf, walk.iv, + 6 + ctx->key_length / 4); + crypto_xor_cpy(dst, src, buf, bsize); + crypto_inc(walk.iv, AES_BLOCK_SIZE); + + dst += AES_BLOCK_SIZE; + src += AES_BLOCK_SIZE; + nbytes -= AES_BLOCK_SIZE; + } while (nbytes > 0); + + err = skcipher_walk_done(&walk, tail); + } + return err; +} diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index 0da30e3b0e4b..998ba519a026 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -19,6 +20,7 @@ #include #include "aes-ce-setkey.h" +#include "aes-ctr-fallback.h" #ifdef USE_V8_CRYPTO_EXTENSIONS #define MODE "ce" @@ -249,6 +251,17 @@ static int ctr_encrypt(struct skcipher_request *req) return err; } +static int ctr_encrypt_sync(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm); + + if (!may_use_simd()) + return aes_ctr_encrypt_fallback(ctx, req); + + return ctr_encrypt(req); +} + static int xts_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); @@ -355,8 +368,8 @@ static struct skcipher_alg aes_algs[] = { { .ivsize = AES_BLOCK_SIZE, .chunksize = AES_BLOCK_SIZE, .setkey = skcipher_aes_setkey, - .encrypt = ctr_encrypt, - .decrypt = ctr_encrypt, + .encrypt = ctr_encrypt_sync, + .decrypt = ctr_encrypt_sync, }, { .base = { .cra_name = "__xts(aes)", @@ -458,11 +471,35 @@ static int mac_init(struct shash_desc *desc) return 0; } +static void mac_do_update(struct crypto_aes_ctx *ctx, u8 const in[], int blocks, + u8 dg[], int enc_before, int enc_after) +{ + int rounds = 6 + ctx->key_length / 4; + + if (may_use_simd()) { + kernel_neon_begin(); + aes_mac_update(in, ctx->key_enc, rounds, blocks, dg, enc_before, + enc_after); + kernel_neon_end(); + } else { + if (enc_before) + __aes_arm64_encrypt(ctx->key_enc, dg, dg, rounds); + + while (blocks--) { + crypto_xor(dg, in, AES_BLOCK_SIZE); + in += AES_BLOCK_SIZE; + + if (blocks || enc_after) + __aes_arm64_encrypt(ctx->key_enc, dg, dg, + rounds); + } + } +} + static int mac_update(struct shash_desc *desc, const u8 *p, unsigned int len) { struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); struct mac_desc_ctx *ctx = shash_desc_ctx(desc); - int rounds = 6 + tctx->key.key_length / 4; while (len > 0) { unsigned int l; @@ -474,10 +511,8 @@ static int mac_update(struct shash_desc *desc, const u8 *p, unsigned int len) len %= AES_BLOCK_SIZE; - kernel_neon_begin(); - aes_mac_update(p, tctx->key.key_enc, rounds, blocks, - ctx->dg, (ctx->len != 0), (len != 0)); - kernel_neon_end(); + mac_do_update(&tctx->key, p, blocks, ctx->dg, + (ctx->len != 0), (len != 0)); p += blocks * AES_BLOCK_SIZE; @@ -505,11 +540,8 @@ static int cbcmac_final(struct shash_desc *desc, u8 *out) { struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); struct mac_desc_ctx *ctx = shash_desc_ctx(desc); - int rounds = 6 + tctx->key.key_length / 4; - kernel_neon_begin(); - aes_mac_update(NULL, tctx->key.key_enc, rounds, 0, ctx->dg, 1, 0); - kernel_neon_end(); + mac_do_update(&tctx->key, NULL, 0, ctx->dg, 1, 0); memcpy(out, ctx->dg, AES_BLOCK_SIZE); @@ -520,7 +552,6 @@ static int cmac_final(struct shash_desc *desc, u8 *out) { struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); struct mac_desc_ctx *ctx = shash_desc_ctx(desc); - int rounds = 6 + tctx->key.key_length / 4; u8 *consts = tctx->consts; if (ctx->len != AES_BLOCK_SIZE) { @@ -528,9 +559,7 @@ static int cmac_final(struct shash_desc *desc, u8 *out) consts += AES_BLOCK_SIZE; } - kernel_neon_begin(); - aes_mac_update(consts, tctx->key.key_enc, rounds, 1, ctx->dg, 0, 1); - kernel_neon_end(); + mac_do_update(&tctx->key, consts, 1, ctx->dg, 0, 1); memcpy(out, ctx->dg, AES_BLOCK_SIZE); -- cgit v1.2.3 From 611d5324f488b82ef224ab493f1ed5fb2af6f003 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 24 Jul 2017 11:28:14 +0100 Subject: crypto: arm64/chacha20 - take may_use_simd() into account To accommodate systems that disallow the use of kernel mode NEON in some circumstances, take the return value of may_use_simd into account when deciding whether to invoke the C fallback routine. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/chacha20-neon-glue.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/crypto/chacha20-neon-glue.c b/arch/arm64/crypto/chacha20-neon-glue.c index a7cd575ea223..cbdb75d15cd0 100644 --- a/arch/arm64/crypto/chacha20-neon-glue.c +++ b/arch/arm64/crypto/chacha20-neon-glue.c @@ -1,7 +1,7 @@ /* * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions * - * Copyright (C) 2016 Linaro, Ltd. + * Copyright (C) 2016 - 2017 Linaro, Ltd. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -26,6 +26,7 @@ #include #include +#include asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src); asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src); @@ -64,7 +65,7 @@ static int chacha20_neon(struct skcipher_request *req) u32 state[16]; int err; - if (req->cryptlen <= CHACHA20_BLOCK_SIZE) + if (!may_use_simd() || req->cryptlen <= CHACHA20_BLOCK_SIZE) return crypto_chacha20_crypt(req); err = skcipher_walk_virt(&walk, req, true); -- cgit v1.2.3 From ec808bbef0b15ad103771222d419245617378f32 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 24 Jul 2017 11:28:15 +0100 Subject: crypto: arm64/aes-bs - implement non-SIMD fallback for AES-CTR Of the various chaining modes implemented by the bit sliced AES driver, only CTR is exposed as a synchronous cipher, and requires a fallback in order to remain usable once we update the kernel mode NEON handling logic to disallow nested use. So wire up the existing CTR fallback C code. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/Kconfig | 1 + arch/arm64/crypto/aes-neonbs-glue.c | 48 +++++++++++++++++++++++++++++++++---- 2 files changed, 44 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index a068dcbe2518..f9e264b83366 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -89,6 +89,7 @@ config CRYPTO_AES_ARM64_BS depends on KERNEL_MODE_NEON select CRYPTO_BLKCIPHER select CRYPTO_AES_ARM64_NEON_BLK + select CRYPTO_AES_ARM64 select CRYPTO_SIMD endif diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c index 9001aec16007..c55d68ccb89f 100644 --- a/arch/arm64/crypto/aes-neonbs-glue.c +++ b/arch/arm64/crypto/aes-neonbs-glue.c @@ -1,7 +1,7 @@ /* * Bit sliced AES using NEON instructions * - * Copyright (C) 2016 Linaro Ltd + * Copyright (C) 2016 - 2017 Linaro Ltd * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -9,12 +9,15 @@ */ #include +#include #include #include #include #include #include +#include "aes-ctr-fallback.h" + MODULE_AUTHOR("Ard Biesheuvel "); MODULE_LICENSE("GPL v2"); @@ -58,6 +61,11 @@ struct aesbs_cbc_ctx { u32 enc[AES_MAX_KEYLENGTH_U32]; }; +struct aesbs_ctr_ctx { + struct aesbs_ctx key; /* must be first member */ + struct crypto_aes_ctx fallback; +}; + struct aesbs_xts_ctx { struct aesbs_ctx key; u32 twkey[AES_MAX_KEYLENGTH_U32]; @@ -196,6 +204,25 @@ static int cbc_decrypt(struct skcipher_request *req) return err; } +static int aesbs_ctr_setkey_sync(struct crypto_skcipher *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm); + int err; + + err = crypto_aes_expand_key(&ctx->fallback, in_key, key_len); + if (err) + return err; + + ctx->key.rounds = 6 + key_len / 4; + + kernel_neon_begin(); + aesbs_convert_key(ctx->key.rk, ctx->fallback.key_enc, ctx->key.rounds); + kernel_neon_end(); + + return 0; +} + static int ctr_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); @@ -259,6 +286,17 @@ static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key, return aesbs_setkey(tfm, in_key, key_len); } +static int ctr_encrypt_sync(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm); + + if (!may_use_simd()) + return aes_ctr_encrypt_fallback(&ctx->fallback, req); + + return ctr_encrypt(req); +} + static int __xts_crypt(struct skcipher_request *req, void (*fn)(u8 out[], u8 const in[], u8 const rk[], int rounds, int blocks, u8 iv[])) @@ -355,7 +393,7 @@ static struct skcipher_alg aes_algs[] = { { .base.cra_driver_name = "ctr-aes-neonbs", .base.cra_priority = 250 - 1, .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct aesbs_ctx), + .base.cra_ctxsize = sizeof(struct aesbs_ctr_ctx), .base.cra_module = THIS_MODULE, .min_keysize = AES_MIN_KEY_SIZE, @@ -363,9 +401,9 @@ static struct skcipher_alg aes_algs[] = { { .chunksize = AES_BLOCK_SIZE, .walksize = 8 * AES_BLOCK_SIZE, .ivsize = AES_BLOCK_SIZE, - .setkey = aesbs_setkey, - .encrypt = ctr_encrypt, - .decrypt = ctr_encrypt, + .setkey = aesbs_ctr_setkey_sync, + .encrypt = ctr_encrypt_sync, + .decrypt = ctr_encrypt_sync, }, { .base.cra_name = "__xts(aes)", .base.cra_driver_name = "__xts-aes-neonbs", -- cgit v1.2.3 From 537c1445ab0b1e33ca338b669e347652c45f4e8c Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 24 Jul 2017 11:28:16 +0100 Subject: crypto: arm64/gcm - implement native driver using v8 Crypto Extensions Currently, the AES-GCM implementation for arm64 systems that support the ARMv8 Crypto Extensions is based on the generic GCM module, which combines the AES-CTR implementation using AES instructions with the PMULL based GHASH driver. This is suboptimal, given the fact that the input data needs to be loaded twice, once for the encryption and again for the MAC calculation. On Cortex-A57 (r1p2) and other recent cores that implement micro-op fusing for the AES instructions, AES executes at less than 1 cycle per byte, which means that any cycles wasted on loading the data twice hurt even more. So implement a new GCM driver that combines the AES and PMULL instructions at the block level. This improves performance on Cortex-A57 by ~37% (from 3.5 cpb to 2.6 cpb) Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/Kconfig | 4 +- arch/arm64/crypto/ghash-ce-core.S | 175 +++++++++++++++ arch/arm64/crypto/ghash-ce-glue.c | 438 +++++++++++++++++++++++++++++++++++--- 3 files changed, 591 insertions(+), 26 deletions(-) (limited to 'arch') diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index f9e264b83366..7ca54a76f6b9 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -29,10 +29,12 @@ config CRYPTO_SHA2_ARM64_CE select CRYPTO_SHA256_ARM64 config CRYPTO_GHASH_ARM64_CE - tristate "GHASH (for GCM chaining mode) using ARMv8 Crypto Extensions" + tristate "GHASH/AES-GCM using ARMv8 Crypto Extensions" depends on KERNEL_MODE_NEON select CRYPTO_HASH select CRYPTO_GF128MUL + select CRYPTO_AES + select CRYPTO_AES_ARM64 config CRYPTO_CRCT10DIF_ARM64_CE tristate "CRCT10DIF digest algorithm using PMULL instructions" diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S index f0bb9f0b524f..cb22459eba85 100644 --- a/arch/arm64/crypto/ghash-ce-core.S +++ b/arch/arm64/crypto/ghash-ce-core.S @@ -77,3 +77,178 @@ CPU_LE( rev64 T1.16b, T1.16b ) st1 {XL.2d}, [x1] ret ENDPROC(pmull_ghash_update) + + KS .req v8 + CTR .req v9 + INP .req v10 + + .macro load_round_keys, rounds, rk + cmp \rounds, #12 + blo 2222f /* 128 bits */ + beq 1111f /* 192 bits */ + ld1 {v17.4s-v18.4s}, [\rk], #32 +1111: ld1 {v19.4s-v20.4s}, [\rk], #32 +2222: ld1 {v21.4s-v24.4s}, [\rk], #64 + ld1 {v25.4s-v28.4s}, [\rk], #64 + ld1 {v29.4s-v31.4s}, [\rk] + .endm + + .macro enc_round, state, key + aese \state\().16b, \key\().16b + aesmc \state\().16b, \state\().16b + .endm + + .macro enc_block, state, rounds + cmp \rounds, #12 + b.lo 2222f /* 128 bits */ + b.eq 1111f /* 192 bits */ + enc_round \state, v17 + enc_round \state, v18 +1111: enc_round \state, v19 + enc_round \state, v20 +2222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29 + enc_round \state, \key + .endr + aese \state\().16b, v30.16b + eor \state\().16b, \state\().16b, v31.16b + .endm + + .macro pmull_gcm_do_crypt, enc + ld1 {SHASH.2d}, [x4] + ld1 {XL.2d}, [x1] + ldr x8, [x5, #8] // load lower counter + + movi MASK.16b, #0xe1 + ext SHASH2.16b, SHASH.16b, SHASH.16b, #8 +CPU_LE( rev x8, x8 ) + shl MASK.2d, MASK.2d, #57 + eor SHASH2.16b, SHASH2.16b, SHASH.16b + + .if \enc == 1 + ld1 {KS.16b}, [x7] + .endif + +0: ld1 {CTR.8b}, [x5] // load upper counter + ld1 {INP.16b}, [x3], #16 + rev x9, x8 + add x8, x8, #1 + sub w0, w0, #1 + ins CTR.d[1], x9 // set lower counter + + .if \enc == 1 + eor INP.16b, INP.16b, KS.16b // encrypt input + st1 {INP.16b}, [x2], #16 + .endif + + rev64 T1.16b, INP.16b + + cmp w6, #12 + b.ge 2f // AES-192/256? + +1: enc_round CTR, v21 + + ext T2.16b, XL.16b, XL.16b, #8 + ext IN1.16b, T1.16b, T1.16b, #8 + + enc_round CTR, v22 + + eor T1.16b, T1.16b, T2.16b + eor XL.16b, XL.16b, IN1.16b + + enc_round CTR, v23 + + pmull2 XH.1q, SHASH.2d, XL.2d // a1 * b1 + eor T1.16b, T1.16b, XL.16b + + enc_round CTR, v24 + + pmull XL.1q, SHASH.1d, XL.1d // a0 * b0 + pmull XM.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0) + + enc_round CTR, v25 + + ext T1.16b, XL.16b, XH.16b, #8 + eor T2.16b, XL.16b, XH.16b + eor XM.16b, XM.16b, T1.16b + + enc_round CTR, v26 + + eor XM.16b, XM.16b, T2.16b + pmull T2.1q, XL.1d, MASK.1d + + enc_round CTR, v27 + + mov XH.d[0], XM.d[1] + mov XM.d[1], XL.d[0] + + enc_round CTR, v28 + + eor XL.16b, XM.16b, T2.16b + + enc_round CTR, v29 + + ext T2.16b, XL.16b, XL.16b, #8 + + aese CTR.16b, v30.16b + + pmull XL.1q, XL.1d, MASK.1d + eor T2.16b, T2.16b, XH.16b + + eor KS.16b, CTR.16b, v31.16b + + eor XL.16b, XL.16b, T2.16b + + .if \enc == 0 + eor INP.16b, INP.16b, KS.16b + st1 {INP.16b}, [x2], #16 + .endif + + cbnz w0, 0b + +CPU_LE( rev x8, x8 ) + st1 {XL.2d}, [x1] + str x8, [x5, #8] // store lower counter + + .if \enc == 1 + st1 {KS.16b}, [x7] + .endif + + ret + +2: b.eq 3f // AES-192? + enc_round CTR, v17 + enc_round CTR, v18 +3: enc_round CTR, v19 + enc_round CTR, v20 + b 1b + .endm + + /* + * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[], + * struct ghash_key const *k, u8 ctr[], + * int rounds, u8 ks[]) + */ +ENTRY(pmull_gcm_encrypt) + pmull_gcm_do_crypt 1 +ENDPROC(pmull_gcm_encrypt) + + /* + * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[], + * struct ghash_key const *k, u8 ctr[], + * int rounds) + */ +ENTRY(pmull_gcm_decrypt) + pmull_gcm_do_crypt 0 +ENDPROC(pmull_gcm_decrypt) + + /* + * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds) + */ +ENTRY(pmull_gcm_encrypt_block) + cbz x2, 0f + load_round_keys w3, x2 +0: ld1 {v0.16b}, [x1] + enc_block v0, w3 + st1 {v0.16b}, [x0] + ret +ENDPROC(pmull_gcm_encrypt_block) diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c index 30221ef56e70..ee6aaac05905 100644 --- a/arch/arm64/crypto/ghash-ce-glue.c +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -11,18 +11,25 @@ #include #include #include +#include +#include +#include #include +#include #include +#include +#include #include #include #include -MODULE_DESCRIPTION("GHASH secure hash using ARMv8 Crypto Extensions"); +MODULE_DESCRIPTION("GHASH and AES-GCM using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel "); MODULE_LICENSE("GPL v2"); #define GHASH_BLOCK_SIZE 16 #define GHASH_DIGEST_SIZE 16 +#define GCM_IV_SIZE 12 struct ghash_key { u64 a; @@ -36,9 +43,27 @@ struct ghash_desc_ctx { u32 count; }; +struct gcm_aes_ctx { + struct crypto_aes_ctx aes_key; + struct ghash_key ghash_key; +}; + asmlinkage void pmull_ghash_update(int blocks, u64 dg[], const char *src, struct ghash_key const *k, const char *head); +asmlinkage void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], + const u8 src[], struct ghash_key const *k, + u8 ctr[], int rounds, u8 ks[]); + +asmlinkage void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], + const u8 src[], struct ghash_key const *k, + u8 ctr[], int rounds); + +asmlinkage void pmull_gcm_encrypt_block(u8 dst[], u8 const src[], + u32 const rk[], int rounds); + +asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds); + static int ghash_init(struct shash_desc *desc) { struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); @@ -130,17 +155,11 @@ static int ghash_final(struct shash_desc *desc, u8 *dst) return 0; } -static int ghash_setkey(struct crypto_shash *tfm, - const u8 *inkey, unsigned int keylen) +static int __ghash_setkey(struct ghash_key *key, + const u8 *inkey, unsigned int keylen) { - struct ghash_key *key = crypto_shash_ctx(tfm); u64 a, b; - if (keylen != GHASH_BLOCK_SIZE) { - crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } - /* needed for the fallback */ memcpy(&key->k, inkey, GHASH_BLOCK_SIZE); @@ -157,32 +176,401 @@ static int ghash_setkey(struct crypto_shash *tfm, return 0; } +static int ghash_setkey(struct crypto_shash *tfm, + const u8 *inkey, unsigned int keylen) +{ + struct ghash_key *key = crypto_shash_ctx(tfm); + + if (keylen != GHASH_BLOCK_SIZE) { + crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + return __ghash_setkey(key, inkey, keylen); +} + static struct shash_alg ghash_alg = { - .digestsize = GHASH_DIGEST_SIZE, - .init = ghash_init, - .update = ghash_update, - .final = ghash_final, - .setkey = ghash_setkey, - .descsize = sizeof(struct ghash_desc_ctx), - .base = { - .cra_name = "ghash", - .cra_driver_name = "ghash-ce", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, - .cra_blocksize = GHASH_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct ghash_key), - .cra_module = THIS_MODULE, - }, + .base.cra_name = "ghash", + .base.cra_driver_name = "ghash-ce", + .base.cra_priority = 200, + .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, + .base.cra_blocksize = GHASH_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct ghash_key), + .base.cra_module = THIS_MODULE, + + .digestsize = GHASH_DIGEST_SIZE, + .init = ghash_init, + .update = ghash_update, + .final = ghash_final, + .setkey = ghash_setkey, + .descsize = sizeof(struct ghash_desc_ctx), +}; + +static int num_rounds(struct crypto_aes_ctx *ctx) +{ + /* + * # of rounds specified by AES: + * 128 bit key 10 rounds + * 192 bit key 12 rounds + * 256 bit key 14 rounds + * => n byte key => 6 + (n/4) rounds + */ + return 6 + ctx->key_length / 4; +} + +static int gcm_setkey(struct crypto_aead *tfm, const u8 *inkey, + unsigned int keylen) +{ + struct gcm_aes_ctx *ctx = crypto_aead_ctx(tfm); + u8 key[GHASH_BLOCK_SIZE]; + int ret; + + ret = crypto_aes_expand_key(&ctx->aes_key, inkey, keylen); + if (ret) { + tfm->base.crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; + } + + __aes_arm64_encrypt(ctx->aes_key.key_enc, key, (u8[AES_BLOCK_SIZE]){}, + num_rounds(&ctx->aes_key)); + + return __ghash_setkey(&ctx->ghash_key, key, sizeof(key)); +} + +static int gcm_setauthsize(struct crypto_aead *tfm, unsigned int authsize) +{ + switch (authsize) { + case 4: + case 8: + case 12 ... 16: + break; + default: + return -EINVAL; + } + return 0; +} + +static void gcm_update_mac(u64 dg[], const u8 *src, int count, u8 buf[], + int *buf_count, struct gcm_aes_ctx *ctx) +{ + if (*buf_count > 0) { + int buf_added = min(count, GHASH_BLOCK_SIZE - *buf_count); + + memcpy(&buf[*buf_count], src, buf_added); + + *buf_count += buf_added; + src += buf_added; + count -= buf_added; + } + + if (count >= GHASH_BLOCK_SIZE || *buf_count == GHASH_BLOCK_SIZE) { + int blocks = count / GHASH_BLOCK_SIZE; + + ghash_do_update(blocks, dg, src, &ctx->ghash_key, + *buf_count ? buf : NULL); + + src += blocks * GHASH_BLOCK_SIZE; + count %= GHASH_BLOCK_SIZE; + *buf_count = 0; + } + + if (count > 0) { + memcpy(buf, src, count); + *buf_count = count; + } +} + +static void gcm_calculate_auth_mac(struct aead_request *req, u64 dg[]) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead); + u8 buf[GHASH_BLOCK_SIZE]; + struct scatter_walk walk; + u32 len = req->assoclen; + int buf_count = 0; + + scatterwalk_start(&walk, req->src); + + do { + u32 n = scatterwalk_clamp(&walk, len); + u8 *p; + + if (!n) { + scatterwalk_start(&walk, sg_next(walk.sg)); + n = scatterwalk_clamp(&walk, len); + } + p = scatterwalk_map(&walk); + + gcm_update_mac(dg, p, n, buf, &buf_count, ctx); + len -= n; + + scatterwalk_unmap(p); + scatterwalk_advance(&walk, n); + scatterwalk_done(&walk, 0, len); + } while (len); + + if (buf_count) { + memset(&buf[buf_count], 0, GHASH_BLOCK_SIZE - buf_count); + ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL); + } +} + +static void gcm_final(struct aead_request *req, struct gcm_aes_ctx *ctx, + u64 dg[], u8 tag[], int cryptlen) +{ + u8 mac[AES_BLOCK_SIZE]; + u128 lengths; + + lengths.a = cpu_to_be64(req->assoclen * 8); + lengths.b = cpu_to_be64(cryptlen * 8); + + ghash_do_update(1, dg, (void *)&lengths, &ctx->ghash_key, NULL); + + put_unaligned_be64(dg[1], mac); + put_unaligned_be64(dg[0], mac + 8); + + crypto_xor(tag, mac, AES_BLOCK_SIZE); +} + +static int gcm_encrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead); + struct skcipher_walk walk; + u8 iv[AES_BLOCK_SIZE]; + u8 ks[AES_BLOCK_SIZE]; + u8 tag[AES_BLOCK_SIZE]; + u64 dg[2] = {}; + int err; + + if (req->assoclen) + gcm_calculate_auth_mac(req, dg); + + memcpy(iv, req->iv, GCM_IV_SIZE); + put_unaligned_be32(1, iv + GCM_IV_SIZE); + + if (likely(may_use_simd())) { + kernel_neon_begin(); + + pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc, + num_rounds(&ctx->aes_key)); + put_unaligned_be32(2, iv + GCM_IV_SIZE); + pmull_gcm_encrypt_block(ks, iv, NULL, + num_rounds(&ctx->aes_key)); + put_unaligned_be32(3, iv + GCM_IV_SIZE); + + err = skcipher_walk_aead_encrypt(&walk, req, true); + + while (walk.nbytes >= AES_BLOCK_SIZE) { + int blocks = walk.nbytes / AES_BLOCK_SIZE; + + pmull_gcm_encrypt(blocks, dg, walk.dst.virt.addr, + walk.src.virt.addr, &ctx->ghash_key, + iv, num_rounds(&ctx->aes_key), ks); + + err = skcipher_walk_done(&walk, + walk.nbytes % AES_BLOCK_SIZE); + } + kernel_neon_end(); + } else { + __aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv, + num_rounds(&ctx->aes_key)); + put_unaligned_be32(2, iv + GCM_IV_SIZE); + + err = skcipher_walk_aead_encrypt(&walk, req, true); + + while (walk.nbytes >= AES_BLOCK_SIZE) { + int blocks = walk.nbytes / AES_BLOCK_SIZE; + u8 *dst = walk.dst.virt.addr; + u8 *src = walk.src.virt.addr; + + do { + __aes_arm64_encrypt(ctx->aes_key.key_enc, + ks, iv, + num_rounds(&ctx->aes_key)); + crypto_xor_cpy(dst, src, ks, AES_BLOCK_SIZE); + crypto_inc(iv, AES_BLOCK_SIZE); + + dst += AES_BLOCK_SIZE; + src += AES_BLOCK_SIZE; + } while (--blocks > 0); + + ghash_do_update(walk.nbytes / AES_BLOCK_SIZE, dg, + walk.dst.virt.addr, &ctx->ghash_key, + NULL); + + err = skcipher_walk_done(&walk, + walk.nbytes % AES_BLOCK_SIZE); + } + if (walk.nbytes) + __aes_arm64_encrypt(ctx->aes_key.key_enc, ks, iv, + num_rounds(&ctx->aes_key)); + } + + /* handle the tail */ + if (walk.nbytes) { + u8 buf[GHASH_BLOCK_SIZE]; + + crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr, ks, + walk.nbytes); + + memcpy(buf, walk.dst.virt.addr, walk.nbytes); + memset(buf + walk.nbytes, 0, GHASH_BLOCK_SIZE - walk.nbytes); + ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL); + + err = skcipher_walk_done(&walk, 0); + } + + if (err) + return err; + + gcm_final(req, ctx, dg, tag, req->cryptlen); + + /* copy authtag to end of dst */ + scatterwalk_map_and_copy(tag, req->dst, req->assoclen + req->cryptlen, + crypto_aead_authsize(aead), 1); + + return 0; +} + +static int gcm_decrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead); + unsigned int authsize = crypto_aead_authsize(aead); + struct skcipher_walk walk; + u8 iv[AES_BLOCK_SIZE]; + u8 tag[AES_BLOCK_SIZE]; + u8 buf[GHASH_BLOCK_SIZE]; + u64 dg[2] = {}; + int err; + + if (req->assoclen) + gcm_calculate_auth_mac(req, dg); + + memcpy(iv, req->iv, GCM_IV_SIZE); + put_unaligned_be32(1, iv + GCM_IV_SIZE); + + if (likely(may_use_simd())) { + kernel_neon_begin(); + + pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc, + num_rounds(&ctx->aes_key)); + put_unaligned_be32(2, iv + GCM_IV_SIZE); + + err = skcipher_walk_aead_decrypt(&walk, req, true); + + while (walk.nbytes >= AES_BLOCK_SIZE) { + int blocks = walk.nbytes / AES_BLOCK_SIZE; + + pmull_gcm_decrypt(blocks, dg, walk.dst.virt.addr, + walk.src.virt.addr, &ctx->ghash_key, + iv, num_rounds(&ctx->aes_key)); + + err = skcipher_walk_done(&walk, + walk.nbytes % AES_BLOCK_SIZE); + } + if (walk.nbytes) + pmull_gcm_encrypt_block(iv, iv, NULL, + num_rounds(&ctx->aes_key)); + + kernel_neon_end(); + } else { + __aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv, + num_rounds(&ctx->aes_key)); + put_unaligned_be32(2, iv + GCM_IV_SIZE); + + err = skcipher_walk_aead_decrypt(&walk, req, true); + + while (walk.nbytes >= AES_BLOCK_SIZE) { + int blocks = walk.nbytes / AES_BLOCK_SIZE; + u8 *dst = walk.dst.virt.addr; + u8 *src = walk.src.virt.addr; + + ghash_do_update(blocks, dg, walk.src.virt.addr, + &ctx->ghash_key, NULL); + + do { + __aes_arm64_encrypt(ctx->aes_key.key_enc, + buf, iv, + num_rounds(&ctx->aes_key)); + crypto_xor_cpy(dst, src, buf, AES_BLOCK_SIZE); + crypto_inc(iv, AES_BLOCK_SIZE); + + dst += AES_BLOCK_SIZE; + src += AES_BLOCK_SIZE; + } while (--blocks > 0); + + err = skcipher_walk_done(&walk, + walk.nbytes % AES_BLOCK_SIZE); + } + if (walk.nbytes) + __aes_arm64_encrypt(ctx->aes_key.key_enc, iv, iv, + num_rounds(&ctx->aes_key)); + } + + /* handle the tail */ + if (walk.nbytes) { + memcpy(buf, walk.src.virt.addr, walk.nbytes); + memset(buf + walk.nbytes, 0, GHASH_BLOCK_SIZE - walk.nbytes); + ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL); + + crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr, iv, + walk.nbytes); + + err = skcipher_walk_done(&walk, 0); + } + + if (err) + return err; + + gcm_final(req, ctx, dg, tag, req->cryptlen - authsize); + + /* compare calculated auth tag with the stored one */ + scatterwalk_map_and_copy(buf, req->src, + req->assoclen + req->cryptlen - authsize, + authsize, 0); + + if (crypto_memneq(tag, buf, authsize)) + return -EBADMSG; + return 0; +} + +static struct aead_alg gcm_aes_alg = { + .ivsize = GCM_IV_SIZE, + .chunksize = AES_BLOCK_SIZE, + .maxauthsize = AES_BLOCK_SIZE, + .setkey = gcm_setkey, + .setauthsize = gcm_setauthsize, + .encrypt = gcm_encrypt, + .decrypt = gcm_decrypt, + + .base.cra_name = "gcm(aes)", + .base.cra_driver_name = "gcm-aes-ce", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct gcm_aes_ctx), + .base.cra_module = THIS_MODULE, }; static int __init ghash_ce_mod_init(void) { - return crypto_register_shash(&ghash_alg); + int ret; + + ret = crypto_register_aead(&gcm_aes_alg); + if (ret) + return ret; + + ret = crypto_register_shash(&ghash_alg); + if (ret) + crypto_unregister_aead(&gcm_aes_alg); + return ret; } static void __exit ghash_ce_mod_exit(void) { crypto_unregister_shash(&ghash_alg); + crypto_unregister_aead(&gcm_aes_alg); } module_cpu_feature_match(PMULL, ghash_ce_mod_init); -- cgit v1.2.3 From 3759ee057261a45da0505e79084de8b6ac31c4a5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 24 Jul 2017 11:28:17 +0100 Subject: crypto: arm/ghash - add NEON accelerated fallback for vmull.p64 Implement a NEON fallback for systems that do support NEON but have no support for the optional 64x64->128 polynomial multiplication instruction that is part of the ARMv8 Crypto Extensions. It is based on the paper "Fast Software Polynomial Multiplication on ARM Processors Using the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab (https://hal.inria.fr/hal-01506572) On a 32-bit guest executing under KVM on a Cortex-A57, the new code is not only 4x faster than the generic table based GHASH driver, it is also time invariant. (Note that the existing vmull.p64 code is 16x faster on this core). Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm/crypto/Kconfig | 5 +- arch/arm/crypto/ghash-ce-core.S | 234 ++++++++++++++++++++++++++++++++-------- arch/arm/crypto/ghash-ce-glue.c | 24 ++++- 3 files changed, 215 insertions(+), 48 deletions(-) (limited to 'arch') diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig index b9adedcc5b2e..ec72752d5668 100644 --- a/arch/arm/crypto/Kconfig +++ b/arch/arm/crypto/Kconfig @@ -94,14 +94,15 @@ config CRYPTO_AES_ARM_CE ARMv8 Crypto Extensions config CRYPTO_GHASH_ARM_CE - tristate "PMULL-accelerated GHASH using ARMv8 Crypto Extensions" + tristate "PMULL-accelerated GHASH using NEON/ARMv8 Crypto Extensions" depends on KERNEL_MODE_NEON select CRYPTO_HASH select CRYPTO_CRYPTD help Use an implementation of GHASH (used by the GCM AEAD chaining mode) that uses the 64x64 to 128 bit polynomial multiplication (vmull.p64) - that is part of the ARMv8 Crypto Extensions + that is part of the ARMv8 Crypto Extensions, or a slower variant that + uses the vmull.p8 instruction that is part of the basic NEON ISA. config CRYPTO_CRCT10DIF_ARM_CE tristate "CRCT10DIF digest algorithm using PMULL instructions" diff --git a/arch/arm/crypto/ghash-ce-core.S b/arch/arm/crypto/ghash-ce-core.S index f6ab8bcc9efe..2f78c10b1881 100644 --- a/arch/arm/crypto/ghash-ce-core.S +++ b/arch/arm/crypto/ghash-ce-core.S @@ -1,7 +1,7 @@ /* - * Accelerated GHASH implementation with ARMv8 vmull.p64 instructions. + * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions. * - * Copyright (C) 2015 Linaro Ltd. + * Copyright (C) 2015 - 2017 Linaro Ltd. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 as published @@ -12,40 +12,162 @@ #include SHASH .req q0 - SHASH2 .req q1 - T1 .req q2 - T2 .req q3 - MASK .req q4 - XL .req q5 - XM .req q6 - XH .req q7 - IN1 .req q7 + T1 .req q1 + XL .req q2 + XM .req q3 + XH .req q4 + IN1 .req q4 SHASH_L .req d0 SHASH_H .req d1 - SHASH2_L .req d2 - T1_L .req d4 - MASK_L .req d8 - XL_L .req d10 - XL_H .req d11 - XM_L .req d12 - XM_H .req d13 - XH_L .req d14 + T1_L .req d2 + T1_H .req d3 + XL_L .req d4 + XL_H .req d5 + XM_L .req d6 + XM_H .req d7 + XH_L .req d8 + + t0l .req d10 + t0h .req d11 + t1l .req d12 + t1h .req d13 + t2l .req d14 + t2h .req d15 + t3l .req d16 + t3h .req d17 + t4l .req d18 + t4h .req d19 + + t0q .req q5 + t1q .req q6 + t2q .req q7 + t3q .req q8 + t4q .req q9 + T2 .req q9 + + s1l .req d20 + s1h .req d21 + s2l .req d22 + s2h .req d23 + s3l .req d24 + s3h .req d25 + s4l .req d26 + s4h .req d27 + + MASK .req d28 + SHASH2_p8 .req d28 + + k16 .req d29 + k32 .req d30 + k48 .req d31 + SHASH2_p64 .req d31 .text .fpu crypto-neon-fp-armv8 + .macro __pmull_p64, rd, rn, rm, b1, b2, b3, b4 + vmull.p64 \rd, \rn, \rm + .endm + /* - * void pmull_ghash_update(int blocks, u64 dg[], const char *src, - * struct ghash_key const *k, const char *head) + * This implementation of 64x64 -> 128 bit polynomial multiplication + * using vmull.p8 instructions (8x8 -> 16) is taken from the paper + * "Fast Software Polynomial Multiplication on ARM Processors Using + * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and + * Ricardo Dahab (https://hal.inria.fr/hal-01506572) + * + * It has been slightly tweaked for in-order performance, and to allow + * 'rq' to overlap with 'ad' or 'bd'. */ -ENTRY(pmull_ghash_update) - vld1.64 {SHASH}, [r3] + .macro __pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l + vext.8 t0l, \ad, \ad, #1 @ A1 + .ifc \b1, t4l + vext.8 t4l, \bd, \bd, #1 @ B1 + .endif + vmull.p8 t0q, t0l, \bd @ F = A1*B + vext.8 t1l, \ad, \ad, #2 @ A2 + vmull.p8 t4q, \ad, \b1 @ E = A*B1 + .ifc \b2, t3l + vext.8 t3l, \bd, \bd, #2 @ B2 + .endif + vmull.p8 t1q, t1l, \bd @ H = A2*B + vext.8 t2l, \ad, \ad, #3 @ A3 + vmull.p8 t3q, \ad, \b2 @ G = A*B2 + veor t0q, t0q, t4q @ L = E + F + .ifc \b3, t4l + vext.8 t4l, \bd, \bd, #3 @ B3 + .endif + vmull.p8 t2q, t2l, \bd @ J = A3*B + veor t0l, t0l, t0h @ t0 = (L) (P0 + P1) << 8 + veor t1q, t1q, t3q @ M = G + H + .ifc \b4, t3l + vext.8 t3l, \bd, \bd, #4 @ B4 + .endif + vmull.p8 t4q, \ad, \b3 @ I = A*B3 + veor t1l, t1l, t1h @ t1 = (M) (P2 + P3) << 16 + vmull.p8 t3q, \ad, \b4 @ K = A*B4 + vand t0h, t0h, k48 + vand t1h, t1h, k32 + veor t2q, t2q, t4q @ N = I + J + veor t0l, t0l, t0h + veor t1l, t1l, t1h + veor t2l, t2l, t2h @ t2 = (N) (P4 + P5) << 24 + vand t2h, t2h, k16 + veor t3l, t3l, t3h @ t3 = (K) (P6 + P7) << 32 + vmov.i64 t3h, #0 + vext.8 t0q, t0q, t0q, #15 + veor t2l, t2l, t2h + vext.8 t1q, t1q, t1q, #14 + vmull.p8 \rq, \ad, \bd @ D = A*B + vext.8 t2q, t2q, t2q, #13 + vext.8 t3q, t3q, t3q, #12 + veor t0q, t0q, t1q + veor t2q, t2q, t3q + veor \rq, \rq, t0q + veor \rq, \rq, t2q + .endm + + // + // PMULL (64x64->128) based reduction for CPUs that can do + // it in a single instruction. + // + .macro __pmull_reduce_p64 + vmull.p64 T1, XL_L, MASK + + veor XH_L, XH_L, XM_H + vext.8 T1, T1, T1, #8 + veor XL_H, XL_H, XM_L + veor T1, T1, XL + + vmull.p64 XL, T1_H, MASK + .endm + + // + // Alternative reduction for CPUs that lack support for the + // 64x64->128 PMULL instruction + // + .macro __pmull_reduce_p8 + veor XL_H, XL_H, XM_L + veor XH_L, XH_L, XM_H + + vshl.i64 T1, XL, #57 + vshl.i64 T2, XL, #62 + veor T1, T1, T2 + vshl.i64 T2, XL, #63 + veor T1, T1, T2 + veor XL_H, XL_H, T1_L + veor XH_L, XH_L, T1_H + + vshr.u64 T1, XL, #1 + veor XH, XH, XL + veor XL, XL, T1 + vshr.u64 T1, T1, #6 + vshr.u64 XL, XL, #1 + .endm + + .macro ghash_update, pn vld1.64 {XL}, [r1] - vmov.i8 MASK, #0xe1 - vext.8 SHASH2, SHASH, SHASH, #8 - vshl.u64 MASK, MASK, #57 - veor SHASH2, SHASH2, SHASH /* do the head block first, if supplied */ ldr ip, [sp] @@ -62,33 +184,59 @@ ENTRY(pmull_ghash_update) #ifndef CONFIG_CPU_BIG_ENDIAN vrev64.8 T1, T1 #endif - vext.8 T2, XL, XL, #8 vext.8 IN1, T1, T1, #8 - veor T1, T1, T2 + veor T1_L, T1_L, XL_H veor XL, XL, IN1 - vmull.p64 XH, SHASH_H, XL_H @ a1 * b1 + __pmull_\pn XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h @ a1 * b1 veor T1, T1, XL - vmull.p64 XL, SHASH_L, XL_L @ a0 * b0 - vmull.p64 XM, SHASH2_L, T1_L @ (a1 + a0)(b1 + b0) + __pmull_\pn XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l @ a0 * b0 + __pmull_\pn XM, T1_L, SHASH2_\pn @ (a1+a0)(b1+b0) - vext.8 T1, XL, XH, #8 - veor T2, XL, XH + veor T1, XL, XH veor XM, XM, T1 - veor XM, XM, T2 - vmull.p64 T2, XL_L, MASK_L - vmov XH_L, XM_H - vmov XM_H, XL_L + __pmull_reduce_\pn - veor XL, XM, T2 - vext.8 T2, XL, XL, #8 - vmull.p64 XL, XL_L, MASK_L - veor T2, T2, XH - veor XL, XL, T2 + veor T1, T1, XH + veor XL, XL, T1 bne 0b vst1.64 {XL}, [r1] bx lr -ENDPROC(pmull_ghash_update) + .endm + + /* + * void pmull_ghash_update(int blocks, u64 dg[], const char *src, + * struct ghash_key const *k, const char *head) + */ +ENTRY(pmull_ghash_update_p64) + vld1.64 {SHASH}, [r3] + veor SHASH2_p64, SHASH_L, SHASH_H + + vmov.i8 MASK, #0xe1 + vshl.u64 MASK, MASK, #57 + + ghash_update p64 +ENDPROC(pmull_ghash_update_p64) + +ENTRY(pmull_ghash_update_p8) + vld1.64 {SHASH}, [r3] + veor SHASH2_p8, SHASH_L, SHASH_H + + vext.8 s1l, SHASH_L, SHASH_L, #1 + vext.8 s2l, SHASH_L, SHASH_L, #2 + vext.8 s3l, SHASH_L, SHASH_L, #3 + vext.8 s4l, SHASH_L, SHASH_L, #4 + vext.8 s1h, SHASH_H, SHASH_H, #1 + vext.8 s2h, SHASH_H, SHASH_H, #2 + vext.8 s3h, SHASH_H, SHASH_H, #3 + vext.8 s4h, SHASH_H, SHASH_H, #4 + + vmov.i64 k16, #0xffff + vmov.i64 k32, #0xffffffff + vmov.i64 k48, #0xffffffffffff + + ghash_update p8 +ENDPROC(pmull_ghash_update_p8) diff --git a/arch/arm/crypto/ghash-ce-glue.c b/arch/arm/crypto/ghash-ce-glue.c index 6bac8bea9f1e..d9bb52cae2ac 100644 --- a/arch/arm/crypto/ghash-ce-glue.c +++ b/arch/arm/crypto/ghash-ce-glue.c @@ -22,6 +22,7 @@ MODULE_DESCRIPTION("GHASH secure hash using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel "); MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("ghash"); #define GHASH_BLOCK_SIZE 16 #define GHASH_DIGEST_SIZE 16 @@ -41,8 +42,17 @@ struct ghash_async_ctx { struct cryptd_ahash *cryptd_tfm; }; -asmlinkage void pmull_ghash_update(int blocks, u64 dg[], const char *src, - struct ghash_key const *k, const char *head); +asmlinkage void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src, + struct ghash_key const *k, + const char *head); + +asmlinkage void pmull_ghash_update_p8(int blocks, u64 dg[], const char *src, + struct ghash_key const *k, + const char *head); + +static void (*pmull_ghash_update)(int blocks, u64 dg[], const char *src, + struct ghash_key const *k, + const char *head); static int ghash_init(struct shash_desc *desc) { @@ -312,6 +322,14 @@ static int __init ghash_ce_mod_init(void) { int err; + if (!(elf_hwcap & HWCAP_NEON)) + return -ENODEV; + + if (elf_hwcap2 & HWCAP2_PMULL) + pmull_ghash_update = pmull_ghash_update_p64; + else + pmull_ghash_update = pmull_ghash_update_p8; + err = crypto_register_shash(&ghash_alg); if (err) return err; @@ -332,5 +350,5 @@ static void __exit ghash_ce_mod_exit(void) crypto_unregister_shash(&ghash_alg); } -module_cpu_feature_match(PMULL, ghash_ce_mod_init); +module_init(ghash_ce_mod_init); module_exit(ghash_ce_mod_exit); -- cgit v1.2.3 From 03c9a333fef1bb0a67615b686a7342d853f1a460 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 24 Jul 2017 11:28:18 +0100 Subject: crypto: arm64/ghash - add NEON accelerated fallback for 64-bit PMULL Implement a NEON fallback for systems that do support NEON but have no support for the optional 64x64->128 polynomial multiplication instruction that is part of the ARMv8 Crypto Extensions. It is based on the paper "Fast Software Polynomial Multiplication on ARM Processors Using the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab (https://hal.inria.fr/hal-01506572), but has been reworked extensively for the AArch64 ISA. On a low-end core such as the Cortex-A53 found in the Raspberry Pi3, the NEON based implementation is 4x faster than the table based one, and is time invariant as well, making it less vulnerable to timing attacks. When combined with the bit-sliced NEON implementation of AES-CTR, the AES-GCM performance increases by 2x (from 58 to 29 cycles per byte). Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/ghash-ce-core.S | 248 +++++++++++++++++++++++++++++++++----- arch/arm64/crypto/ghash-ce-glue.c | 40 ++++-- 2 files changed, 252 insertions(+), 36 deletions(-) (limited to 'arch') diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S index cb22459eba85..11ebf1ae248a 100644 --- a/arch/arm64/crypto/ghash-ce-core.S +++ b/arch/arm64/crypto/ghash-ce-core.S @@ -1,7 +1,7 @@ /* * Accelerated GHASH implementation with ARMv8 PMULL instructions. * - * Copyright (C) 2014 Linaro Ltd. + * Copyright (C) 2014 - 2017 Linaro Ltd. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 as published @@ -11,31 +11,215 @@ #include #include - SHASH .req v0 - SHASH2 .req v1 - T1 .req v2 - T2 .req v3 - MASK .req v4 - XL .req v5 - XM .req v6 - XH .req v7 - IN1 .req v7 + SHASH .req v0 + SHASH2 .req v1 + T1 .req v2 + T2 .req v3 + MASK .req v4 + XL .req v5 + XM .req v6 + XH .req v7 + IN1 .req v7 + + k00_16 .req v8 + k32_48 .req v9 + + t3 .req v10 + t4 .req v11 + t5 .req v12 + t6 .req v13 + t7 .req v14 + t8 .req v15 + t9 .req v16 + + perm1 .req v17 + perm2 .req v18 + perm3 .req v19 + + sh1 .req v20 + sh2 .req v21 + sh3 .req v22 + sh4 .req v23 + + ss1 .req v24 + ss2 .req v25 + ss3 .req v26 + ss4 .req v27 .text .arch armv8-a+crypto - /* - * void pmull_ghash_update(int blocks, u64 dg[], const char *src, - * struct ghash_key const *k, const char *head) - */ -ENTRY(pmull_ghash_update) + .macro __pmull_p64, rd, rn, rm + pmull \rd\().1q, \rn\().1d, \rm\().1d + .endm + + .macro __pmull2_p64, rd, rn, rm + pmull2 \rd\().1q, \rn\().2d, \rm\().2d + .endm + + .macro __pmull_p8, rq, ad, bd + ext t3.8b, \ad\().8b, \ad\().8b, #1 // A1 + ext t5.8b, \ad\().8b, \ad\().8b, #2 // A2 + ext t7.8b, \ad\().8b, \ad\().8b, #3 // A3 + + __pmull_p8_\bd \rq, \ad + .endm + + .macro __pmull2_p8, rq, ad, bd + tbl t3.16b, {\ad\().16b}, perm1.16b // A1 + tbl t5.16b, {\ad\().16b}, perm2.16b // A2 + tbl t7.16b, {\ad\().16b}, perm3.16b // A3 + + __pmull2_p8_\bd \rq, \ad + .endm + + .macro __pmull_p8_SHASH, rq, ad + __pmull_p8_tail \rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4 + .endm + + .macro __pmull_p8_SHASH2, rq, ad + __pmull_p8_tail \rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4 + .endm + + .macro __pmull2_p8_SHASH, rq, ad + __pmull_p8_tail \rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4 + .endm + + .macro __pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4 + pmull\t t3.8h, t3.\nb, \bd // F = A1*B + pmull\t t4.8h, \ad, \b1\().\nb // E = A*B1 + pmull\t t5.8h, t5.\nb, \bd // H = A2*B + pmull\t t6.8h, \ad, \b2\().\nb // G = A*B2 + pmull\t t7.8h, t7.\nb, \bd // J = A3*B + pmull\t t8.8h, \ad, \b3\().\nb // I = A*B3 + pmull\t t9.8h, \ad, \b4\().\nb // K = A*B4 + pmull\t \rq\().8h, \ad, \bd // D = A*B + + eor t3.16b, t3.16b, t4.16b // L = E + F + eor t5.16b, t5.16b, t6.16b // M = G + H + eor t7.16b, t7.16b, t8.16b // N = I + J + + uzp1 t4.2d, t3.2d, t5.2d + uzp2 t3.2d, t3.2d, t5.2d + uzp1 t6.2d, t7.2d, t9.2d + uzp2 t7.2d, t7.2d, t9.2d + + // t3 = (L) (P0 + P1) << 8 + // t5 = (M) (P2 + P3) << 16 + eor t4.16b, t4.16b, t3.16b + and t3.16b, t3.16b, k32_48.16b + + // t7 = (N) (P4 + P5) << 24 + // t9 = (K) (P6 + P7) << 32 + eor t6.16b, t6.16b, t7.16b + and t7.16b, t7.16b, k00_16.16b + + eor t4.16b, t4.16b, t3.16b + eor t6.16b, t6.16b, t7.16b + + zip2 t5.2d, t4.2d, t3.2d + zip1 t3.2d, t4.2d, t3.2d + zip2 t9.2d, t6.2d, t7.2d + zip1 t7.2d, t6.2d, t7.2d + + ext t3.16b, t3.16b, t3.16b, #15 + ext t5.16b, t5.16b, t5.16b, #14 + ext t7.16b, t7.16b, t7.16b, #13 + ext t9.16b, t9.16b, t9.16b, #12 + + eor t3.16b, t3.16b, t5.16b + eor t7.16b, t7.16b, t9.16b + eor \rq\().16b, \rq\().16b, t3.16b + eor \rq\().16b, \rq\().16b, t7.16b + .endm + + .macro __pmull_pre_p64 + movi MASK.16b, #0xe1 + shl MASK.2d, MASK.2d, #57 + .endm + + .macro __pmull_pre_p8 + // k00_16 := 0x0000000000000000_000000000000ffff + // k32_48 := 0x00000000ffffffff_0000ffffffffffff + movi k32_48.2d, #0xffffffff + mov k32_48.h[2], k32_48.h[0] + ushr k00_16.2d, k32_48.2d, #32 + + // prepare the permutation vectors + mov_q x5, 0x080f0e0d0c0b0a09 + movi T1.8b, #8 + dup perm1.2d, x5 + eor perm1.16b, perm1.16b, T1.16b + ushr perm2.2d, perm1.2d, #8 + ushr perm3.2d, perm1.2d, #16 + ushr T1.2d, perm1.2d, #24 + sli perm2.2d, perm1.2d, #56 + sli perm3.2d, perm1.2d, #48 + sli T1.2d, perm1.2d, #40 + + // precompute loop invariants + tbl sh1.16b, {SHASH.16b}, perm1.16b + tbl sh2.16b, {SHASH.16b}, perm2.16b + tbl sh3.16b, {SHASH.16b}, perm3.16b + tbl sh4.16b, {SHASH.16b}, T1.16b + ext ss1.8b, SHASH2.8b, SHASH2.8b, #1 + ext ss2.8b, SHASH2.8b, SHASH2.8b, #2 + ext ss3.8b, SHASH2.8b, SHASH2.8b, #3 + ext ss4.8b, SHASH2.8b, SHASH2.8b, #4 + .endm + + // + // PMULL (64x64->128) based reduction for CPUs that can do + // it in a single instruction. + // + .macro __pmull_reduce_p64 + pmull T2.1q, XL.1d, MASK.1d + eor XM.16b, XM.16b, T1.16b + + mov XH.d[0], XM.d[1] + mov XM.d[1], XL.d[0] + + eor XL.16b, XM.16b, T2.16b + ext T2.16b, XL.16b, XL.16b, #8 + pmull XL.1q, XL.1d, MASK.1d + .endm + + // + // Alternative reduction for CPUs that lack support for the + // 64x64->128 PMULL instruction + // + .macro __pmull_reduce_p8 + eor XM.16b, XM.16b, T1.16b + + mov XL.d[1], XM.d[0] + mov XH.d[0], XM.d[1] + + shl T1.2d, XL.2d, #57 + shl T2.2d, XL.2d, #62 + eor T2.16b, T2.16b, T1.16b + shl T1.2d, XL.2d, #63 + eor T2.16b, T2.16b, T1.16b + ext T1.16b, XL.16b, XH.16b, #8 + eor T2.16b, T2.16b, T1.16b + + mov XL.d[1], T2.d[0] + mov XH.d[0], T2.d[1] + + ushr T2.2d, XL.2d, #1 + eor XH.16b, XH.16b, XL.16b + eor XL.16b, XL.16b, T2.16b + ushr T2.2d, T2.2d, #6 + ushr XL.2d, XL.2d, #1 + .endm + + .macro __pmull_ghash, pn ld1 {SHASH.2d}, [x3] ld1 {XL.2d}, [x1] - movi MASK.16b, #0xe1 ext SHASH2.16b, SHASH.16b, SHASH.16b, #8 - shl MASK.2d, MASK.2d, #57 eor SHASH2.16b, SHASH2.16b, SHASH.16b + __pmull_pre_\pn + /* do the head block first, if supplied */ cbz x4, 0f ld1 {T1.2d}, [x4] @@ -52,23 +236,17 @@ CPU_LE( rev64 T1.16b, T1.16b ) eor T1.16b, T1.16b, T2.16b eor XL.16b, XL.16b, IN1.16b - pmull2 XH.1q, SHASH.2d, XL.2d // a1 * b1 + __pmull2_\pn XH, XL, SHASH // a1 * b1 eor T1.16b, T1.16b, XL.16b - pmull XL.1q, SHASH.1d, XL.1d // a0 * b0 - pmull XM.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0) + __pmull_\pn XL, XL, SHASH // a0 * b0 + __pmull_\pn XM, T1, SHASH2 // (a1 + a0)(b1 + b0) - ext T1.16b, XL.16b, XH.16b, #8 eor T2.16b, XL.16b, XH.16b - eor XM.16b, XM.16b, T1.16b + ext T1.16b, XL.16b, XH.16b, #8 eor XM.16b, XM.16b, T2.16b - pmull T2.1q, XL.1d, MASK.1d - mov XH.d[0], XM.d[1] - mov XM.d[1], XL.d[0] + __pmull_reduce_\pn - eor XL.16b, XM.16b, T2.16b - ext T2.16b, XL.16b, XL.16b, #8 - pmull XL.1q, XL.1d, MASK.1d eor T2.16b, T2.16b, XH.16b eor XL.16b, XL.16b, T2.16b @@ -76,7 +254,19 @@ CPU_LE( rev64 T1.16b, T1.16b ) st1 {XL.2d}, [x1] ret -ENDPROC(pmull_ghash_update) + .endm + + /* + * void pmull_ghash_update(int blocks, u64 dg[], const char *src, + * struct ghash_key const *k, const char *head) + */ +ENTRY(pmull_ghash_update_p64) + __pmull_ghash p64 +ENDPROC(pmull_ghash_update_p64) + +ENTRY(pmull_ghash_update_p8) + __pmull_ghash p8 +ENDPROC(pmull_ghash_update_p8) KS .req v8 CTR .req v9 diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c index ee6aaac05905..cfc9c92814fd 100644 --- a/arch/arm64/crypto/ghash-ce-glue.c +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -26,6 +26,7 @@ MODULE_DESCRIPTION("GHASH and AES-GCM using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel "); MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("ghash"); #define GHASH_BLOCK_SIZE 16 #define GHASH_DIGEST_SIZE 16 @@ -48,8 +49,17 @@ struct gcm_aes_ctx { struct ghash_key ghash_key; }; -asmlinkage void pmull_ghash_update(int blocks, u64 dg[], const char *src, - struct ghash_key const *k, const char *head); +asmlinkage void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src, + struct ghash_key const *k, + const char *head); + +asmlinkage void pmull_ghash_update_p8(int blocks, u64 dg[], const char *src, + struct ghash_key const *k, + const char *head); + +static void (*pmull_ghash_update)(int blocks, u64 dg[], const char *src, + struct ghash_key const *k, + const char *head); asmlinkage void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[], struct ghash_key const *k, @@ -557,13 +567,24 @@ static int __init ghash_ce_mod_init(void) { int ret; - ret = crypto_register_aead(&gcm_aes_alg); - if (ret) - return ret; + if (!(elf_hwcap & HWCAP_ASIMD)) + return -ENODEV; + + if (elf_hwcap & HWCAP_PMULL) + pmull_ghash_update = pmull_ghash_update_p64; + + else + pmull_ghash_update = pmull_ghash_update_p8; ret = crypto_register_shash(&ghash_alg); if (ret) - crypto_unregister_aead(&gcm_aes_alg); + return ret; + + if (elf_hwcap & HWCAP_PMULL) { + ret = crypto_register_aead(&gcm_aes_alg); + if (ret) + crypto_unregister_shash(&ghash_alg); + } return ret; } @@ -573,5 +594,10 @@ static void __exit ghash_ce_mod_exit(void) crypto_unregister_aead(&gcm_aes_alg); } -module_cpu_feature_match(PMULL, ghash_ce_mod_init); +static const struct cpu_feature ghash_cpu_feature[] = { + { cpu_feature(PMULL) }, { } +}; +MODULE_DEVICE_TABLE(cpu, ghash_cpu_feature); + +module_init(ghash_ce_mod_init); module_exit(ghash_ce_mod_exit); -- cgit v1.2.3 From 0d149ce67d4409d7ff13c2d08c3474f3319e1b7e Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 24 Jul 2017 11:28:19 +0100 Subject: crypto: arm/aes - avoid expanded lookup tables in the final round For the final round, avoid the expanded and padded lookup tables exported by the generic AES driver. Instead, for encryption, we can perform byte loads from the same table we used for the inner rounds, which will still be hot in the caches. For decryption, use the inverse AES Sbox directly, which is 4x smaller than the inverse lookup table exported by the generic driver. This should significantly reduce the Dcache footprint of our code, which makes the code more robust against timing attacks. It does not introduce any additional module dependencies, given that we already rely on the core AES module for the shared key expansion routines. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm/crypto/aes-cipher-core.S | 88 +++++++++++++++++++++++++++++---------- 1 file changed, 65 insertions(+), 23 deletions(-) (limited to 'arch') diff --git a/arch/arm/crypto/aes-cipher-core.S b/arch/arm/crypto/aes-cipher-core.S index c817a86c4ca8..54b384084637 100644 --- a/arch/arm/crypto/aes-cipher-core.S +++ b/arch/arm/crypto/aes-cipher-core.S @@ -10,6 +10,7 @@ */ #include +#include .text .align 5 @@ -32,19 +33,19 @@ .endif .endm - .macro __load, out, in, idx + .macro __load, out, in, idx, sz, op .if __LINUX_ARM_ARCH__ < 7 && \idx > 0 - ldr \out, [ttab, \in, lsr #(8 * \idx) - 2] + ldr\op \out, [ttab, \in, lsr #(8 * \idx) - \sz] .else - ldr \out, [ttab, \in, lsl #2] + ldr\op \out, [ttab, \in, lsl #\sz] .endif .endm - .macro __hround, out0, out1, in0, in1, in2, in3, t3, t4, enc + .macro __hround, out0, out1, in0, in1, in2, in3, t3, t4, enc, sz, op __select \out0, \in0, 0 __select t0, \in1, 1 - __load \out0, \out0, 0 - __load t0, t0, 1 + __load \out0, \out0, 0, \sz, \op + __load t0, t0, 1, \sz, \op .if \enc __select \out1, \in1, 0 @@ -53,10 +54,10 @@ __select \out1, \in3, 0 __select t1, \in0, 1 .endif - __load \out1, \out1, 0 + __load \out1, \out1, 0, \sz, \op __select t2, \in2, 2 - __load t1, t1, 1 - __load t2, t2, 2 + __load t1, t1, 1, \sz, \op + __load t2, t2, 2, \sz, \op eor \out0, \out0, t0, ror #24 @@ -68,9 +69,9 @@ __select \t3, \in1, 2 __select \t4, \in2, 3 .endif - __load \t3, \t3, 2 - __load t0, t0, 3 - __load \t4, \t4, 3 + __load \t3, \t3, 2, \sz, \op + __load t0, t0, 3, \sz, \op + __load \t4, \t4, 3, \sz, \op eor \out1, \out1, t1, ror #24 eor \out0, \out0, t2, ror #16 @@ -82,14 +83,14 @@ eor \out1, \out1, t2 .endm - .macro fround, out0, out1, out2, out3, in0, in1, in2, in3 - __hround \out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1 - __hround \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1 + .macro fround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op + __hround \out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1, \sz, \op + __hround \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1, \sz, \op .endm - .macro iround, out0, out1, out2, out3, in0, in1, in2, in3 - __hround \out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0 - __hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0 + .macro iround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op + __hround \out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0, \sz, \op + __hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op .endm .macro __rev, out, in @@ -114,7 +115,7 @@ .endif .endm - .macro do_crypt, round, ttab, ltab + .macro do_crypt, round, ttab, ltab, bsz push {r3-r11, lr} ldr r4, [in] @@ -146,9 +147,12 @@ 1: subs rounds, rounds, #4 \round r8, r9, r10, r11, r4, r5, r6, r7 - __adrl ttab, \ltab, ls + bls 2f \round r4, r5, r6, r7, r8, r9, r10, r11 - bhi 0b + b 0b + +2: __adrl ttab, \ltab + \round r4, r5, r6, r7, r8, r9, r10, r11, \bsz, b #ifdef CONFIG_CPU_BIG_ENDIAN __rev r4, r4 @@ -170,10 +174,48 @@ .ltorg .endm + .align L1_CACHE_SHIFT + .type __aes_arm_inverse_sbox, %object +__aes_arm_inverse_sbox: + .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 + .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb + .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 + .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb + .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d + .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e + .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 + .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 + .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 + .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 + .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda + .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 + .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a + .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 + .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 + .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b + .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea + .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 + .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 + .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e + .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 + .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b + .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 + .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 + .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 + .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f + .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d + .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef + .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 + .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 + .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 + .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d + .size __aes_arm_inverse_sbox, . - __aes_arm_inverse_sbox + ENTRY(__aes_arm_encrypt) - do_crypt fround, crypto_ft_tab, crypto_fl_tab + do_crypt fround, crypto_ft_tab, crypto_ft_tab + 1, 2 ENDPROC(__aes_arm_encrypt) + .align 5 ENTRY(__aes_arm_decrypt) - do_crypt iround, crypto_it_tab, crypto_il_tab + do_crypt iround, crypto_it_tab, __aes_arm_inverse_sbox, 0 ENDPROC(__aes_arm_decrypt) -- cgit v1.2.3 From 7c83d689c71fd6ca8829d4a34416685938368c13 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 24 Jul 2017 11:28:20 +0100 Subject: crypto: arm64/aes - avoid expanded lookup tables in the final round For the final round, avoid the expanded and padded lookup tables exported by the generic AES driver. Instead, for encryption, we can perform byte loads from the same table we used for the inner rounds, which will still be hot in the caches. For decryption, use the inverse AES Sbox directly, which is 4x smaller than the inverse lookup table exported by the generic driver. This should significantly reduce the Dcache footprint of our code, which makes the code more robust against timing attacks. It does not introduce any additional module dependencies, given that we already rely on the core AES module for the shared key expansion routines. It also frees up register x18, which is not available as a scratch register on all platforms, which and so avoiding it improves shareability of this code. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/aes-cipher-core.S | 152 +++++++++++++++++++++++++----------- 1 file changed, 107 insertions(+), 45 deletions(-) (limited to 'arch') diff --git a/arch/arm64/crypto/aes-cipher-core.S b/arch/arm64/crypto/aes-cipher-core.S index f2f9cc519309..6d2445d603cc 100644 --- a/arch/arm64/crypto/aes-cipher-core.S +++ b/arch/arm64/crypto/aes-cipher-core.S @@ -10,6 +10,7 @@ #include #include +#include .text @@ -17,94 +18,155 @@ out .req x1 in .req x2 rounds .req x3 - tt .req x4 - lt .req x2 + tt .req x2 - .macro __pair, enc, reg0, reg1, in0, in1e, in1d, shift + .macro __pair1, sz, op, reg0, reg1, in0, in1e, in1d, shift + .ifc \op\shift, b0 + ubfiz \reg0, \in0, #2, #8 + ubfiz \reg1, \in1e, #2, #8 + .else ubfx \reg0, \in0, #\shift, #8 - .if \enc ubfx \reg1, \in1e, #\shift, #8 - .else - ubfx \reg1, \in1d, #\shift, #8 .endif + + /* + * AArch64 cannot do byte size indexed loads from a table containing + * 32-bit quantities, i.e., 'ldrb w12, [tt, w12, uxtw #2]' is not a + * valid instruction. So perform the shift explicitly first for the + * high bytes (the low byte is shifted implicitly by using ubfiz rather + * than ubfx above) + */ + .ifnc \op, b ldr \reg0, [tt, \reg0, uxtw #2] ldr \reg1, [tt, \reg1, uxtw #2] + .else + .if \shift > 0 + lsl \reg0, \reg0, #2 + lsl \reg1, \reg1, #2 + .endif + ldrb \reg0, [tt, \reg0, uxtw] + ldrb \reg1, [tt, \reg1, uxtw] + .endif .endm - .macro __hround, out0, out1, in0, in1, in2, in3, t0, t1, enc + .macro __pair0, sz, op, reg0, reg1, in0, in1e, in1d, shift + ubfx \reg0, \in0, #\shift, #8 + ubfx \reg1, \in1d, #\shift, #8 + ldr\op \reg0, [tt, \reg0, uxtw #\sz] + ldr\op \reg1, [tt, \reg1, uxtw #\sz] + .endm + + .macro __hround, out0, out1, in0, in1, in2, in3, t0, t1, enc, sz, op ldp \out0, \out1, [rk], #8 - __pair \enc, w13, w14, \in0, \in1, \in3, 0 - __pair \enc, w15, w16, \in1, \in2, \in0, 8 - __pair \enc, w17, w18, \in2, \in3, \in1, 16 - __pair \enc, \t0, \t1, \in3, \in0, \in2, 24 - - eor \out0, \out0, w13 - eor \out1, \out1, w14 - eor \out0, \out0, w15, ror #24 - eor \out1, \out1, w16, ror #24 - eor \out0, \out0, w17, ror #16 - eor \out1, \out1, w18, ror #16 + __pair\enc \sz, \op, w12, w13, \in0, \in1, \in3, 0 + __pair\enc \sz, \op, w14, w15, \in1, \in2, \in0, 8 + __pair\enc \sz, \op, w16, w17, \in2, \in3, \in1, 16 + __pair\enc \sz, \op, \t0, \t1, \in3, \in0, \in2, 24 + + eor \out0, \out0, w12 + eor \out1, \out1, w13 + eor \out0, \out0, w14, ror #24 + eor \out1, \out1, w15, ror #24 + eor \out0, \out0, w16, ror #16 + eor \out1, \out1, w17, ror #16 eor \out0, \out0, \t0, ror #8 eor \out1, \out1, \t1, ror #8 .endm - .macro fround, out0, out1, out2, out3, in0, in1, in2, in3 - __hround \out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1 - __hround \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1 + .macro fround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op + __hround \out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1, \sz, \op + __hround \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1, \sz, \op .endm - .macro iround, out0, out1, out2, out3, in0, in1, in2, in3 - __hround \out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0 - __hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0 + .macro iround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op + __hround \out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0, \sz, \op + __hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op .endm - .macro do_crypt, round, ttab, ltab - ldp w5, w6, [in] - ldp w7, w8, [in, #8] - ldp w9, w10, [rk], #16 - ldp w11, w12, [rk, #-8] + .macro do_crypt, round, ttab, ltab, bsz + ldp w4, w5, [in] + ldp w6, w7, [in, #8] + ldp w8, w9, [rk], #16 + ldp w10, w11, [rk, #-8] +CPU_BE( rev w4, w4 ) CPU_BE( rev w5, w5 ) CPU_BE( rev w6, w6 ) CPU_BE( rev w7, w7 ) -CPU_BE( rev w8, w8 ) + eor w4, w4, w8 eor w5, w5, w9 eor w6, w6, w10 eor w7, w7, w11 - eor w8, w8, w12 adr_l tt, \ttab - adr_l lt, \ltab tbnz rounds, #1, 1f -0: \round w9, w10, w11, w12, w5, w6, w7, w8 - \round w5, w6, w7, w8, w9, w10, w11, w12 +0: \round w8, w9, w10, w11, w4, w5, w6, w7 + \round w4, w5, w6, w7, w8, w9, w10, w11 1: subs rounds, rounds, #4 - \round w9, w10, w11, w12, w5, w6, w7, w8 - csel tt, tt, lt, hi - \round w5, w6, w7, w8, w9, w10, w11, w12 - b.hi 0b - + \round w8, w9, w10, w11, w4, w5, w6, w7 + b.ls 3f +2: \round w4, w5, w6, w7, w8, w9, w10, w11 + b 0b +3: adr_l tt, \ltab + \round w4, w5, w6, w7, w8, w9, w10, w11, \bsz, b + +CPU_BE( rev w4, w4 ) CPU_BE( rev w5, w5 ) CPU_BE( rev w6, w6 ) CPU_BE( rev w7, w7 ) -CPU_BE( rev w8, w8 ) - stp w5, w6, [out] - stp w7, w8, [out, #8] + stp w4, w5, [out] + stp w6, w7, [out, #8] ret .endm - .align 5 + .align L1_CACHE_SHIFT + .type __aes_arm64_inverse_sbox, %object +__aes_arm64_inverse_sbox: + .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 + .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb + .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 + .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb + .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d + .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e + .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 + .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 + .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 + .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 + .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda + .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 + .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a + .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 + .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 + .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b + .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea + .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 + .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 + .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e + .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 + .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b + .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 + .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 + .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 + .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f + .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d + .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef + .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 + .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 + .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 + .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d + .size __aes_arm64_inverse_sbox, . - __aes_arm64_inverse_sbox + ENTRY(__aes_arm64_encrypt) - do_crypt fround, crypto_ft_tab, crypto_fl_tab + do_crypt fround, crypto_ft_tab, crypto_ft_tab + 1, 2 ENDPROC(__aes_arm64_encrypt) .align 5 ENTRY(__aes_arm64_decrypt) - do_crypt iround, crypto_it_tab, crypto_il_tab + do_crypt iround, crypto_it_tab, __aes_arm64_inverse_sbox, 0 ENDPROC(__aes_arm64_decrypt) -- cgit v1.2.3