diff options
Diffstat (limited to 'arch/arm64')
52 files changed, 4589 insertions, 263 deletions
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 86e3ab630aa6..7023e9bbc401 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -429,6 +429,20 @@ config ARM64_ERRATUM_843419 If unsure, say Y. +config ARM64_ERRATUM_1024718 + bool "Cortex-A55: 1024718: Update of DBM/AP bits without break before make might result in incorrect update" + default y + help + This option adds work around for Arm Cortex-A55 Erratum 1024718. + + Affected Cortex-A55 cores (r0p0, r0p1, r1p0) could cause incorrect + update of the hardware dirty bit when the DBM/AP bits are updated + without a break-before-make. The work around is to disable the usage + of hardware DBM locally on the affected cores. CPUs not affected by + erratum will continue to use the feature. + + If unsure, say Y. + config CAVIUM_ERRATUM_22375 bool "Cavium erratum 22375, 24313" default y @@ -746,6 +760,35 @@ config FORCE_MAX_ZONEORDER However for 4K, we choose a higher default value, 11 as opposed to 10, giving us 4M allocations matching the default size used by generic code. +config UNMAP_KERNEL_AT_EL0 + bool "Unmap kernel when running in userspace (aka \"KAISER\")" if EXPERT + default y + help + Speculation attacks against some high-performance processors can + be used to bypass MMU permission checks and leak kernel data to + userspace. This can be defended against by unmapping the kernel + when running in userspace, mapping it back in on exception entry + via a trampoline page in the vector table. + + If unsure, say Y. + +config HARDEN_BRANCH_PREDICTOR + bool "Harden the branch predictor against aliasing attacks" if EXPERT + default y + help + Speculation attacks against some high-performance processors rely on + being able to manipulate the branch predictor for a victim context by + executing aliasing branches in the attacker context. Such attacks + can be partially mitigated against by clearing internal branch + predictor state and limiting the prediction logic in some situations. + + This config option will take CPU-specific actions to harden the + branch predictor against aliasing attacks and may rely on specific + instruction sequences or control bits being set by the system + firmware. + + If unsure, say Y. + menuconfig ARMV8_DEPRECATED bool "Emulate deprecated/obsolete ARMv8 instructions" depends on COMPAT diff --git a/arch/arm64/boot/dts/renesas/r8a7796.dtsi b/arch/arm64/boot/dts/renesas/r8a7796.dtsi index 9217da983525..53d03cb144e4 100644 --- a/arch/arm64/boot/dts/renesas/r8a7796.dtsi +++ b/arch/arm64/boot/dts/renesas/r8a7796.dtsi @@ -36,9 +36,8 @@ enable-method = "psci"; }; - L2_CA57: cache-controller@0 { + L2_CA57: cache-controller-0 { compatible = "cache"; - reg = <0>; power-domains = <&sysc R8A7796_PD_CA57_SCU>; cache-unified; cache-level = <2>; diff --git a/arch/arm64/crypto/sha256-core.S b/arch/arm64/crypto/sha256-core.S new file mode 100644 index 000000000000..3ce82cc860bc --- /dev/null +++ b/arch/arm64/crypto/sha256-core.S @@ -0,0 +1,2061 @@ +// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the OpenSSL license (the "License"). You may not use +// this file except in compliance with the License. You can obtain a copy +// in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html + +// ==================================================================== +// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +// project. The module is, however, dual licensed under OpenSSL and +// CRYPTOGAMS licenses depending on where you obtain it. For further +// details see http://www.openssl.org/~appro/cryptogams/. +// +// Permission to use under GPLv2 terms is granted. +// ==================================================================== +// +// SHA256/512 for ARMv8. +// +// Performance in cycles per processed byte and improvement coefficient +// over code generated with "default" compiler: +// +// SHA256-hw SHA256(*) SHA512 +// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +// Denver 2.01 10.5 (+26%) 6.70 (+8%) +// X-Gene 20.0 (+100%) 12.8 (+300%(***)) +// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +// +// (*) Software SHA256 results are of lesser relevance, presented +// mostly for informational purposes. +// (**) The result is a trade-off: it's possible to improve it by +// 10% (or by 1 cycle per round), but at the cost of 20% loss +// on Cortex-A53 (or by 4 cycles per round). +// (***) Super-impressive coefficients over gcc-generated code are +// indication of some compiler "pathology", most notably code +// generated with -mgeneral-regs-only is significanty faster +// and the gap is only 40-90%. +// +// October 2016. +// +// Originally it was reckoned that it makes no sense to implement NEON +// version of SHA256 for 64-bit processors. This is because performance +// improvement on most wide-spread Cortex-A5x processors was observed +// to be marginal, same on Cortex-A53 and ~10% on A57. But then it was +// observed that 32-bit NEON SHA256 performs significantly better than +// 64-bit scalar version on *some* of the more recent processors. As +// result 64-bit NEON version of SHA256 was added to provide best +// all-round performance. For example it executes ~30% faster on X-Gene +// and Mongoose. [For reference, NEON version of SHA512 is bound to +// deliver much less improvement, likely *negative* on Cortex-A5x. +// Which is why NEON support is limited to SHA256.] + +#ifndef __KERNEL__ +# include "arm_arch.h" +#endif + +.text + +.extern OPENSSL_armcap_P +.globl sha256_block_data_order +.type sha256_block_data_order,%function +.align 6 +sha256_block_data_order: +#ifndef __KERNEL__ +# ifdef __ILP32__ + ldrsw x16,.LOPENSSL_armcap_P +# else + ldr x16,.LOPENSSL_armcap_P +# endif + adr x17,.LOPENSSL_armcap_P + add x16,x16,x17 + ldr w16,[x16] + tst w16,#ARMV8_SHA256 + b.ne .Lv8_entry + tst w16,#ARMV7_NEON + b.ne .Lneon_entry +#endif + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*4 + + ldp w20,w21,[x0] // load context + ldp w22,w23,[x0,#2*4] + ldp w24,w25,[x0,#4*4] + add x2,x1,x2,lsl#6 // end of input + ldp w26,w27,[x0,#6*4] + adr x30,.LK256 + stp x0,x2,[x29,#96] + +.Loop: + ldp w3,w4,[x1],#2*4 + ldr w19,[x30],#4 // *K++ + eor w28,w21,w22 // magic seed + str x1,[x29,#112] +#ifndef __AARCH64EB__ + rev w3,w3 // 0 +#endif + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + eor w6,w24,w24,ror#14 + and w17,w25,w24 + bic w19,w26,w24 + add w27,w27,w3 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w6,ror#11 // Sigma1(e) + ror w6,w20,#2 + add w27,w27,w17 // h+=Ch(e,f,g) + eor w17,w20,w20,ror#9 + add w27,w27,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w23,w23,w27 // d+=h + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w6,w17,ror#13 // Sigma0(a) + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w27,w27,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w4,w4 // 1 +#endif + ldp w5,w6,[x1],#2*4 + add w27,w27,w17 // h+=Sigma0(a) + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + eor w7,w23,w23,ror#14 + and w17,w24,w23 + bic w28,w25,w23 + add w26,w26,w4 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w7,ror#11 // Sigma1(e) + ror w7,w27,#2 + add w26,w26,w17 // h+=Ch(e,f,g) + eor w17,w27,w27,ror#9 + add w26,w26,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w22,w22,w26 // d+=h + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w7,w17,ror#13 // Sigma0(a) + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w26,w26,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w5,w5 // 2 +#endif + add w26,w26,w17 // h+=Sigma0(a) + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + eor w8,w22,w22,ror#14 + and w17,w23,w22 + bic w19,w24,w22 + add w25,w25,w5 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w8,ror#11 // Sigma1(e) + ror w8,w26,#2 + add w25,w25,w17 // h+=Ch(e,f,g) + eor w17,w26,w26,ror#9 + add w25,w25,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w21,w21,w25 // d+=h + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w8,w17,ror#13 // Sigma0(a) + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w25,w25,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w6,w6 // 3 +#endif + ldp w7,w8,[x1],#2*4 + add w25,w25,w17 // h+=Sigma0(a) + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + eor w9,w21,w21,ror#14 + and w17,w22,w21 + bic w28,w23,w21 + add w24,w24,w6 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w9,ror#11 // Sigma1(e) + ror w9,w25,#2 + add w24,w24,w17 // h+=Ch(e,f,g) + eor w17,w25,w25,ror#9 + add w24,w24,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w20,w20,w24 // d+=h + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w9,w17,ror#13 // Sigma0(a) + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w24,w24,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w7,w7 // 4 +#endif + add w24,w24,w17 // h+=Sigma0(a) + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + eor w10,w20,w20,ror#14 + and w17,w21,w20 + bic w19,w22,w20 + add w23,w23,w7 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w10,ror#11 // Sigma1(e) + ror w10,w24,#2 + add w23,w23,w17 // h+=Ch(e,f,g) + eor w17,w24,w24,ror#9 + add w23,w23,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w27,w27,w23 // d+=h + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w10,w17,ror#13 // Sigma0(a) + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w23,w23,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w8,w8 // 5 +#endif + ldp w9,w10,[x1],#2*4 + add w23,w23,w17 // h+=Sigma0(a) + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + eor w11,w27,w27,ror#14 + and w17,w20,w27 + bic w28,w21,w27 + add w22,w22,w8 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w11,ror#11 // Sigma1(e) + ror w11,w23,#2 + add w22,w22,w17 // h+=Ch(e,f,g) + eor w17,w23,w23,ror#9 + add w22,w22,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w26,w26,w22 // d+=h + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w11,w17,ror#13 // Sigma0(a) + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w22,w22,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w9,w9 // 6 +#endif + add w22,w22,w17 // h+=Sigma0(a) + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + eor w12,w26,w26,ror#14 + and w17,w27,w26 + bic w19,w20,w26 + add w21,w21,w9 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w12,ror#11 // Sigma1(e) + ror w12,w22,#2 + add w21,w21,w17 // h+=Ch(e,f,g) + eor w17,w22,w22,ror#9 + add w21,w21,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w25,w25,w21 // d+=h + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w12,w17,ror#13 // Sigma0(a) + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w21,w21,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w10,w10 // 7 +#endif + ldp w11,w12,[x1],#2*4 + add w21,w21,w17 // h+=Sigma0(a) + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + eor w13,w25,w25,ror#14 + and w17,w26,w25 + bic w28,w27,w25 + add w20,w20,w10 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w13,ror#11 // Sigma1(e) + ror w13,w21,#2 + add w20,w20,w17 // h+=Ch(e,f,g) + eor w17,w21,w21,ror#9 + add w20,w20,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w24,w24,w20 // d+=h + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w13,w17,ror#13 // Sigma0(a) + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w20,w20,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w11,w11 // 8 +#endif + add w20,w20,w17 // h+=Sigma0(a) + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + eor w14,w24,w24,ror#14 + and w17,w25,w24 + bic w19,w26,w24 + add w27,w27,w11 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w14,ror#11 // Sigma1(e) + ror w14,w20,#2 + add w27,w27,w17 // h+=Ch(e,f,g) + eor w17,w20,w20,ror#9 + add w27,w27,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w23,w23,w27 // d+=h + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w14,w17,ror#13 // Sigma0(a) + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w27,w27,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w12,w12 // 9 +#endif + ldp w13,w14,[x1],#2*4 + add w27,w27,w17 // h+=Sigma0(a) + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + eor w15,w23,w23,ror#14 + and w17,w24,w23 + bic w28,w25,w23 + add w26,w26,w12 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w15,ror#11 // Sigma1(e) + ror w15,w27,#2 + add w26,w26,w17 // h+=Ch(e,f,g) + eor w17,w27,w27,ror#9 + add w26,w26,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w22,w22,w26 // d+=h + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w15,w17,ror#13 // Sigma0(a) + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w26,w26,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w13,w13 // 10 +#endif + add w26,w26,w17 // h+=Sigma0(a) + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + eor w0,w22,w22,ror#14 + and w17,w23,w22 + bic w19,w24,w22 + add w25,w25,w13 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w0,ror#11 // Sigma1(e) + ror w0,w26,#2 + add w25,w25,w17 // h+=Ch(e,f,g) + eor w17,w26,w26,ror#9 + add w25,w25,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w21,w21,w25 // d+=h + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w0,w17,ror#13 // Sigma0(a) + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w25,w25,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w14,w14 // 11 +#endif + ldp w15,w0,[x1],#2*4 + add w25,w25,w17 // h+=Sigma0(a) + str w6,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + eor w6,w21,w21,ror#14 + and w17,w22,w21 + bic w28,w23,w21 + add w24,w24,w14 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w6,ror#11 // Sigma1(e) + ror w6,w25,#2 + add w24,w24,w17 // h+=Ch(e,f,g) + eor w17,w25,w25,ror#9 + add w24,w24,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w20,w20,w24 // d+=h + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w6,w17,ror#13 // Sigma0(a) + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w24,w24,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w15,w15 // 12 +#endif + add w24,w24,w17 // h+=Sigma0(a) + str w7,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + eor w7,w20,w20,ror#14 + and w17,w21,w20 + bic w19,w22,w20 + add w23,w23,w15 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w7,ror#11 // Sigma1(e) + ror w7,w24,#2 + add w23,w23,w17 // h+=Ch(e,f,g) + eor w17,w24,w24,ror#9 + add w23,w23,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w27,w27,w23 // d+=h + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w7,w17,ror#13 // Sigma0(a) + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w23,w23,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w0,w0 // 13 +#endif + ldp w1,w2,[x1] + add w23,w23,w17 // h+=Sigma0(a) + str w8,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + eor w8,w27,w27,ror#14 + and w17,w20,w27 + bic w28,w21,w27 + add w22,w22,w0 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w8,ror#11 // Sigma1(e) + ror w8,w23,#2 + add w22,w22,w17 // h+=Ch(e,f,g) + eor w17,w23,w23,ror#9 + add w22,w22,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w26,w26,w22 // d+=h + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w8,w17,ror#13 // Sigma0(a) + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w22,w22,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w1,w1 // 14 +#endif + ldr w6,[sp,#12] + add w22,w22,w17 // h+=Sigma0(a) + str w9,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + eor w9,w26,w26,ror#14 + and w17,w27,w26 + bic w19,w20,w26 + add w21,w21,w1 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w9,ror#11 // Sigma1(e) + ror w9,w22,#2 + add w21,w21,w17 // h+=Ch(e,f,g) + eor w17,w22,w22,ror#9 + add w21,w21,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w25,w25,w21 // d+=h + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w9,w17,ror#13 // Sigma0(a) + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w21,w21,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w2,w2 // 15 +#endif + ldr w7,[sp,#0] + add w21,w21,w17 // h+=Sigma0(a) + str w10,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w9,w4,#7 + and w17,w26,w25 + ror w8,w1,#17 + bic w28,w27,w25 + ror w10,w21,#2 + add w20,w20,w2 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w9,w9,w4,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w10,w10,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w8,w8,w1,ror#19 + eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w10,w21,ror#22 // Sigma0(a) + eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) + add w3,w3,w12 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w3,w3,w9 + add w20,w20,w17 // h+=Sigma0(a) + add w3,w3,w8 +.Loop_16_xx: + ldr w8,[sp,#4] + str w11,[sp,#0] + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + ror w10,w5,#7 + and w17,w25,w24 + ror w9,w2,#17 + bic w19,w26,w24 + ror w11,w20,#2 + add w27,w27,w3 // h+=X[i] + eor w16,w16,w24,ror#11 + eor w10,w10,w5,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w24,ror#25 // Sigma1(e) + eor w11,w11,w20,ror#13 + add w27,w27,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w9,w9,w2,ror#19 + eor w10,w10,w5,lsr#3 // sigma0(X[i+1]) + add w27,w27,w16 // h+=Sigma1(e) + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w11,w20,ror#22 // Sigma0(a) + eor w9,w9,w2,lsr#10 // sigma1(X[i+14]) + add w4,w4,w13 + add w23,w23,w27 // d+=h + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w4,w4,w10 + add w27,w27,w17 // h+=Sigma0(a) + add w4,w4,w9 + ldr w9,[sp,#8] + str w12,[sp,#4] + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + ror w11,w6,#7 + and w17,w24,w23 + ror w10,w3,#17 + bic w28,w25,w23 + ror w12,w27,#2 + add w26,w26,w4 // h+=X[i] + eor w16,w16,w23,ror#11 + eor w11,w11,w6,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w23,ror#25 // Sigma1(e) + eor w12,w12,w27,ror#13 + add w26,w26,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w10,w10,w3,ror#19 + eor w11,w11,w6,lsr#3 // sigma0(X[i+1]) + add w26,w26,w16 // h+=Sigma1(e) + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w12,w27,ror#22 // Sigma0(a) + eor w10,w10,w3,lsr#10 // sigma1(X[i+14]) + add w5,w5,w14 + add w22,w22,w26 // d+=h + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w5,w5,w11 + add w26,w26,w17 // h+=Sigma0(a) + add w5,w5,w10 + ldr w10,[sp,#12] + str w13,[sp,#8] + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + ror w12,w7,#7 + and w17,w23,w22 + ror w11,w4,#17 + bic w19,w24,w22 + ror w13,w26,#2 + add w25,w25,w5 // h+=X[i] + eor w16,w16,w22,ror#11 + eor w12,w12,w7,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w22,ror#25 // Sigma1(e) + eor w13,w13,w26,ror#13 + add w25,w25,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w11,w11,w4,ror#19 + eor w12,w12,w7,lsr#3 // sigma0(X[i+1]) + add w25,w25,w16 // h+=Sigma1(e) + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w13,w26,ror#22 // Sigma0(a) + eor w11,w11,w4,lsr#10 // sigma1(X[i+14]) + add w6,w6,w15 + add w21,w21,w25 // d+=h + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w6,w6,w12 + add w25,w25,w17 // h+=Sigma0(a) + add w6,w6,w11 + ldr w11,[sp,#0] + str w14,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + ror w13,w8,#7 + and w17,w22,w21 + ror w12,w5,#17 + bic w28,w23,w21 + ror w14,w25,#2 + add w24,w24,w6 // h+=X[i] + eor w16,w16,w21,ror#11 + eor w13,w13,w8,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w21,ror#25 // Sigma1(e) + eor w14,w14,w25,ror#13 + add w24,w24,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w12,w12,w5,ror#19 + eor w13,w13,w8,lsr#3 // sigma0(X[i+1]) + add w24,w24,w16 // h+=Sigma1(e) + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w14,w25,ror#22 // Sigma0(a) + eor w12,w12,w5,lsr#10 // sigma1(X[i+14]) + add w7,w7,w0 + add w20,w20,w24 // d+=h + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w7,w7,w13 + add w24,w24,w17 // h+=Sigma0(a) + add w7,w7,w12 + ldr w12,[sp,#4] + str w15,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + ror w14,w9,#7 + and w17,w21,w20 + ror w13,w6,#17 + bic w19,w22,w20 + ror w15,w24,#2 + add w23,w23,w7 // h+=X[i] + eor w16,w16,w20,ror#11 + eor w14,w14,w9,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w20,ror#25 // Sigma1(e) + eor w15,w15,w24,ror#13 + add w23,w23,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w13,w13,w6,ror#19 + eor w14,w14,w9,lsr#3 // sigma0(X[i+1]) + add w23,w23,w16 // h+=Sigma1(e) + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w15,w24,ror#22 // Sigma0(a) + eor w13,w13,w6,lsr#10 // sigma1(X[i+14]) + add w8,w8,w1 + add w27,w27,w23 // d+=h + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w8,w8,w14 + add w23,w23,w17 // h+=Sigma0(a) + add w8,w8,w13 + ldr w13,[sp,#8] + str w0,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + ror w15,w10,#7 + and w17,w20,w27 + ror w14,w7,#17 + bic w28,w21,w27 + ror w0,w23,#2 + add w22,w22,w8 // h+=X[i] + eor w16,w16,w27,ror#11 + eor w15,w15,w10,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w27,ror#25 // Sigma1(e) + eor w0,w0,w23,ror#13 + add w22,w22,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w14,w14,w7,ror#19 + eor w15,w15,w10,lsr#3 // sigma0(X[i+1]) + add w22,w22,w16 // h+=Sigma1(e) + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w0,w23,ror#22 // Sigma0(a) + eor w14,w14,w7,lsr#10 // sigma1(X[i+14]) + add w9,w9,w2 + add w26,w26,w22 // d+=h + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w9,w9,w15 + add w22,w22,w17 // h+=Sigma0(a) + add w9,w9,w14 + ldr w14,[sp,#12] + str w1,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + ror w0,w11,#7 + and w17,w27,w26 + ror w15,w8,#17 + bic w19,w20,w26 + ror w1,w22,#2 + add w21,w21,w9 // h+=X[i] + eor w16,w16,w26,ror#11 + eor w0,w0,w11,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w26,ror#25 // Sigma1(e) + eor w1,w1,w22,ror#13 + add w21,w21,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w15,w15,w8,ror#19 + eor w0,w0,w11,lsr#3 // sigma0(X[i+1]) + add w21,w21,w16 // h+=Sigma1(e) + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w1,w22,ror#22 // Sigma0(a) + eor w15,w15,w8,lsr#10 // sigma1(X[i+14]) + add w10,w10,w3 + add w25,w25,w21 // d+=h + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w10,w10,w0 + add w21,w21,w17 // h+=Sigma0(a) + add w10,w10,w15 + ldr w15,[sp,#0] + str w2,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w1,w12,#7 + and w17,w26,w25 + ror w0,w9,#17 + bic w28,w27,w25 + ror w2,w21,#2 + add w20,w20,w10 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w1,w1,w12,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w2,w2,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w0,w0,w9,ror#19 + eor w1,w1,w12,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w2,w21,ror#22 // Sigma0(a) + eor w0,w0,w9,lsr#10 // sigma1(X[i+14]) + add w11,w11,w4 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w11,w11,w1 + add w20,w20,w17 // h+=Sigma0(a) + add w11,w11,w0 + ldr w0,[sp,#4] + str w3,[sp,#0] + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + ror w2,w13,#7 + and w17,w25,w24 + ror w1,w10,#17 + bic w19,w26,w24 + ror w3,w20,#2 + add w27,w27,w11 // h+=X[i] + eor w16,w16,w24,ror#11 + eor w2,w2,w13,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w24,ror#25 // Sigma1(e) + eor w3,w3,w20,ror#13 + add w27,w27,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w1,w1,w10,ror#19 + eor w2,w2,w13,lsr#3 // sigma0(X[i+1]) + add w27,w27,w16 // h+=Sigma1(e) + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w3,w20,ror#22 // Sigma0(a) + eor w1,w1,w10,lsr#10 // sigma1(X[i+14]) + add w12,w12,w5 + add w23,w23,w27 // d+=h + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w12,w12,w2 + add w27,w27,w17 // h+=Sigma0(a) + add w12,w12,w1 + ldr w1,[sp,#8] + str w4,[sp,#4] + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + ror w3,w14,#7 + and w17,w24,w23 + ror w2,w11,#17 + bic w28,w25,w23 + ror w4,w27,#2 + add w26,w26,w12 // h+=X[i] + eor w16,w16,w23,ror#11 + eor w3,w3,w14,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w23,ror#25 // Sigma1(e) + eor w4,w4,w27,ror#13 + add w26,w26,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w2,w2,w11,ror#19 + eor w3,w3,w14,lsr#3 // sigma0(X[i+1]) + add w26,w26,w16 // h+=Sigma1(e) + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w4,w27,ror#22 // Sigma0(a) + eor w2,w2,w11,lsr#10 // sigma1(X[i+14]) + add w13,w13,w6 + add w22,w22,w26 // d+=h + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w13,w13,w3 + add w26,w26,w17 // h+=Sigma0(a) + add w13,w13,w2 + ldr w2,[sp,#12] + str w5,[sp,#8] + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + ror w4,w15,#7 + and w17,w23,w22 + ror w3,w12,#17 + bic w19,w24,w22 + ror w5,w26,#2 + add w25,w25,w13 // h+=X[i] + eor w16,w16,w22,ror#11 + eor w4,w4,w15,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w22,ror#25 // Sigma1(e) + eor w5,w5,w26,ror#13 + add w25,w25,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w3,w3,w12,ror#19 + eor w4,w4,w15,lsr#3 // sigma0(X[i+1]) + add w25,w25,w16 // h+=Sigma1(e) + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w5,w26,ror#22 // Sigma0(a) + eor w3,w3,w12,lsr#10 // sigma1(X[i+14]) + add w14,w14,w7 + add w21,w21,w25 // d+=h + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w14,w14,w4 + add w25,w25,w17 // h+=Sigma0(a) + add w14,w14,w3 + ldr w3,[sp,#0] + str w6,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + ror w5,w0,#7 + and w17,w22,w21 + ror w4,w13,#17 + bic w28,w23,w21 + ror w6,w25,#2 + add w24,w24,w14 // h+=X[i] + eor w16,w16,w21,ror#11 + eor w5,w5,w0,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w21,ror#25 // Sigma1(e) + eor w6,w6,w25,ror#13 + add w24,w24,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w4,w4,w13,ror#19 + eor w5,w5,w0,lsr#3 // sigma0(X[i+1]) + add w24,w24,w16 // h+=Sigma1(e) + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w6,w25,ror#22 // Sigma0(a) + eor w4,w4,w13,lsr#10 // sigma1(X[i+14]) + add w15,w15,w8 + add w20,w20,w24 // d+=h + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w15,w15,w5 + add w24,w24,w17 // h+=Sigma0(a) + add w15,w15,w4 + ldr w4,[sp,#4] + str w7,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + ror w6,w1,#7 + and w17,w21,w20 + ror w5,w14,#17 + bic w19,w22,w20 + ror w7,w24,#2 + add w23,w23,w15 // h+=X[i] + eor w16,w16,w20,ror#11 + eor w6,w6,w1,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w20,ror#25 // Sigma1(e) + eor w7,w7,w24,ror#13 + add w23,w23,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w5,w5,w14,ror#19 + eor w6,w6,w1,lsr#3 // sigma0(X[i+1]) + add w23,w23,w16 // h+=Sigma1(e) + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w7,w24,ror#22 // Sigma0(a) + eor w5,w5,w14,lsr#10 // sigma1(X[i+14]) + add w0,w0,w9 + add w27,w27,w23 // d+=h + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w0,w0,w6 + add w23,w23,w17 // h+=Sigma0(a) + add w0,w0,w5 + ldr w5,[sp,#8] + str w8,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + ror w7,w2,#7 + and w17,w20,w27 + ror w6,w15,#17 + bic w28,w21,w27 + ror w8,w23,#2 + add w22,w22,w0 // h+=X[i] + eor w16,w16,w27,ror#11 + eor w7,w7,w2,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w27,ror#25 // Sigma1(e) + eor w8,w8,w23,ror#13 + add w22,w22,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w6,w6,w15,ror#19 + eor w7,w7,w2,lsr#3 // sigma0(X[i+1]) + add w22,w22,w16 // h+=Sigma1(e) + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w8,w23,ror#22 // Sigma0(a) + eor w6,w6,w15,lsr#10 // sigma1(X[i+14]) + add w1,w1,w10 + add w26,w26,w22 // d+=h + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w1,w1,w7 + add w22,w22,w17 // h+=Sigma0(a) + add w1,w1,w6 + ldr w6,[sp,#12] + str w9,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + ror w8,w3,#7 + and w17,w27,w26 + ror w7,w0,#17 + bic w19,w20,w26 + ror w9,w22,#2 + add w21,w21,w1 // h+=X[i] + eor w16,w16,w26,ror#11 + eor w8,w8,w3,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w26,ror#25 // Sigma1(e) + eor w9,w9,w22,ror#13 + add w21,w21,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w7,w7,w0,ror#19 + eor w8,w8,w3,lsr#3 // sigma0(X[i+1]) + add w21,w21,w16 // h+=Sigma1(e) + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w9,w22,ror#22 // Sigma0(a) + eor w7,w7,w0,lsr#10 // sigma1(X[i+14]) + add w2,w2,w11 + add w25,w25,w21 // d+=h + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w2,w2,w8 + add w21,w21,w17 // h+=Sigma0(a) + add w2,w2,w7 + ldr w7,[sp,#0] + str w10,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w9,w4,#7 + and w17,w26,w25 + ror w8,w1,#17 + bic w28,w27,w25 + ror w10,w21,#2 + add w20,w20,w2 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w9,w9,w4,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w10,w10,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w8,w8,w1,ror#19 + eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w10,w21,ror#22 // Sigma0(a) + eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) + add w3,w3,w12 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w3,w3,w9 + add w20,w20,w17 // h+=Sigma0(a) + add w3,w3,w8 + cbnz w19,.Loop_16_xx + + ldp x0,x2,[x29,#96] + ldr x1,[x29,#112] + sub x30,x30,#260 // rewind + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#2*4] + add x1,x1,#14*4 // advance input pointer + ldp w7,w8,[x0,#4*4] + add w20,w20,w3 + ldp w9,w10,[x0,#6*4] + add w21,w21,w4 + add w22,w22,w5 + add w23,w23,w6 + stp w20,w21,[x0] + add w24,w24,w7 + add w25,w25,w8 + stp w22,w23,[x0,#2*4] + add w26,w26,w9 + add w27,w27,w10 + cmp x1,x2 + stp w24,w25,[x0,#4*4] + stp w26,w27,[x0,#6*4] + b.ne .Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*4 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + ret +.size sha256_block_data_order,.-sha256_block_data_order + +.align 6 +.type .LK256,%object +.LK256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + .long 0 //terminator +.size .LK256,.-.LK256 +#ifndef __KERNEL__ +.align 3 +.LOPENSSL_armcap_P: +# ifdef __ILP32__ + .long OPENSSL_armcap_P-. +# else + .quad OPENSSL_armcap_P-. +# endif +#endif +.asciz "SHA256 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>" +.align 2 +#ifndef __KERNEL__ +.type sha256_block_armv8,%function +.align 6 +sha256_block_armv8: +.Lv8_entry: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v0.4s,v1.4s},[x0] + adr x3,.LK256 + +.Loop_hw: + ld1 {v4.16b-v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s + .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s + .inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s + .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s + .inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s + .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s + .inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + ld1 {v17.4s},[x3] + add v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b + .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b + .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + add v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s + + cbnz x2,.Loop_hw + + st1 {v0.4s,v1.4s},[x0] + + ldr x29,[sp],#16 + ret +.size sha256_block_armv8,.-sha256_block_armv8 +#endif +#ifdef __KERNEL__ +.globl sha256_block_neon +#endif +.type sha256_block_neon,%function +.align 4 +sha256_block_neon: +.Lneon_entry: + stp x29, x30, [sp, #-16]! + mov x29, sp + sub sp,sp,#16*4 + + adr x16,.LK256 + add x2,x1,x2,lsl#6 // len to point at the end of inp + + ld1 {v0.16b},[x1], #16 + ld1 {v1.16b},[x1], #16 + ld1 {v2.16b},[x1], #16 + ld1 {v3.16b},[x1], #16 + ld1 {v4.4s},[x16], #16 + ld1 {v5.4s},[x16], #16 + ld1 {v6.4s},[x16], #16 + ld1 {v7.4s},[x16], #16 + rev32 v0.16b,v0.16b // yes, even on + rev32 v1.16b,v1.16b // big-endian + rev32 v2.16b,v2.16b + rev32 v3.16b,v3.16b + mov x17,sp + add v4.4s,v4.4s,v0.4s + add v5.4s,v5.4s,v1.4s + add v6.4s,v6.4s,v2.4s + st1 {v4.4s-v5.4s},[x17], #32 + add v7.4s,v7.4s,v3.4s + st1 {v6.4s-v7.4s},[x17] + sub x17,x17,#32 + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#8] + ldp w7,w8,[x0,#16] + ldp w9,w10,[x0,#24] + ldr w12,[sp,#0] + mov w13,wzr + eor w14,w4,w5 + mov w15,wzr + b .L_00_48 + +.align 4 +.L_00_48: + ext v4.16b,v0.16b,v1.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext v7.16b,v2.16b,v3.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v3.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v0.4s,v0.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v0.4s,v0.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v0.4s,v0.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v0.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v0.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v0.4s,#15 + add w8,w8,w12 + ushr v17.4s,v0.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v0.4s,#13 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + add v0.4s,v0.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v0.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext v4.16b,v1.16b,v2.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext v7.16b,v3.16b,v0.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v0.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v1.4s,v1.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v1.4s,v1.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v1.4s,v1.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v1.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v1.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v1.4s,#15 + add w4,w4,w12 + ushr v17.4s,v1.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v1.4s,#13 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v1.4s,v1.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v1.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + ext v4.16b,v2.16b,v3.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext v7.16b,v0.16b,v1.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v1.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v2.4s,v2.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v2.4s,v2.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v2.4s,v2.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v2.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v2.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v2.4s,#15 + add w8,w8,w12 + ushr v17.4s,v2.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v2.4s,#13 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + add v2.4s,v2.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v2.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext v4.16b,v3.16b,v0.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext v7.16b,v1.16b,v2.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v2.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v3.4s,v3.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v3.4s,v3.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v3.4s,v3.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v3.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v3.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v3.4s,#15 + add w4,w4,w12 + ushr v17.4s,v3.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v3.4s,#13 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v3.4s,v3.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v3.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[x16] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + cmp w12,#0 // check for K256 terminator + ldr w12,[sp,#0] + sub x17,x17,#64 + bne .L_00_48 + + sub x16,x16,#256 // rewind x16 + cmp x1,x2 + mov x17, #64 + csel x17, x17, xzr, eq + sub x1,x1,x17 // avoid SEGV + mov x17,sp + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v0.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v0.16b,v0.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v0.4s + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v1.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v1.16b,v1.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v1.4s + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v2.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v2.16b,v2.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v2.4s + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v3.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v3.16b,v3.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v3.4s + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w3,w3,w15 // h+=Sigma0(a) from the past + ldp w11,w12,[x0,#0] + add w3,w3,w13 // h+=Maj(a,b,c) from the past + ldp w13,w14,[x0,#8] + add w3,w3,w11 // accumulate + add w4,w4,w12 + ldp w11,w12,[x0,#16] + add w5,w5,w13 + add w6,w6,w14 + ldp w13,w14,[x0,#24] + add w7,w7,w11 + add w8,w8,w12 + ldr w12,[sp,#0] + stp w3,w4,[x0,#0] + add w9,w9,w13 + mov w13,wzr + stp w5,w6,[x0,#8] + add w10,w10,w14 + stp w7,w8,[x0,#16] + eor w14,w4,w5 + stp w9,w10,[x0,#24] + mov w15,wzr + mov x17,sp + b.ne .L_00_48 + + ldr x29,[x29] + add sp,sp,#16*4+16 + ret +.size sha256_block_neon,.-sha256_block_neon +#ifndef __KERNEL__ +.comm OPENSSL_armcap_P,4,4 +#endif diff --git a/arch/arm64/crypto/sha512-core.S b/arch/arm64/crypto/sha512-core.S new file mode 100644 index 000000000000..bd0f59f06c9d --- /dev/null +++ b/arch/arm64/crypto/sha512-core.S @@ -0,0 +1,1085 @@ +// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the OpenSSL license (the "License"). You may not use +// this file except in compliance with the License. You can obtain a copy +// in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html + +// ==================================================================== +// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +// project. The module is, however, dual licensed under OpenSSL and +// CRYPTOGAMS licenses depending on where you obtain it. For further +// details see http://www.openssl.org/~appro/cryptogams/. +// +// Permission to use under GPLv2 terms is granted. +// ==================================================================== +// +// SHA256/512 for ARMv8. +// +// Performance in cycles per processed byte and improvement coefficient +// over code generated with "default" compiler: +// +// SHA256-hw SHA256(*) SHA512 +// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +// Denver 2.01 10.5 (+26%) 6.70 (+8%) +// X-Gene 20.0 (+100%) 12.8 (+300%(***)) +// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +// +// (*) Software SHA256 results are of lesser relevance, presented +// mostly for informational purposes. +// (**) The result is a trade-off: it's possible to improve it by +// 10% (or by 1 cycle per round), but at the cost of 20% loss +// on Cortex-A53 (or by 4 cycles per round). +// (***) Super-impressive coefficients over gcc-generated code are +// indication of some compiler "pathology", most notably code +// generated with -mgeneral-regs-only is significanty faster +// and the gap is only 40-90%. +// +// October 2016. +// +// Originally it was reckoned that it makes no sense to implement NEON +// version of SHA256 for 64-bit processors. This is because performance +// improvement on most wide-spread Cortex-A5x processors was observed +// to be marginal, same on Cortex-A53 and ~10% on A57. But then it was +// observed that 32-bit NEON SHA256 performs significantly better than +// 64-bit scalar version on *some* of the more recent processors. As +// result 64-bit NEON version of SHA256 was added to provide best +// all-round performance. For example it executes ~30% faster on X-Gene +// and Mongoose. [For reference, NEON version of SHA512 is bound to +// deliver much less improvement, likely *negative* on Cortex-A5x. +// Which is why NEON support is limited to SHA256.] + +#ifndef __KERNEL__ +# include "arm_arch.h" +#endif + +.text + +.extern OPENSSL_armcap_P +.globl sha512_block_data_order +.type sha512_block_data_order,%function +.align 6 +sha512_block_data_order: + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*8 + + ldp x20,x21,[x0] // load context + ldp x22,x23,[x0,#2*8] + ldp x24,x25,[x0,#4*8] + add x2,x1,x2,lsl#7 // end of input + ldp x26,x27,[x0,#6*8] + adr x30,.LK512 + stp x0,x2,[x29,#96] + +.Loop: + ldp x3,x4,[x1],#2*8 + ldr x19,[x30],#8 // *K++ + eor x28,x21,x22 // magic seed + str x1,[x29,#112] +#ifndef __AARCH64EB__ + rev x3,x3 // 0 +#endif + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + eor x6,x24,x24,ror#23 + and x17,x25,x24 + bic x19,x26,x24 + add x27,x27,x3 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x6,ror#18 // Sigma1(e) + ror x6,x20,#28 + add x27,x27,x17 // h+=Ch(e,f,g) + eor x17,x20,x20,ror#5 + add x27,x27,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x23,x23,x27 // d+=h + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x6,x17,ror#34 // Sigma0(a) + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x27,x27,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x4,x4 // 1 +#endif + ldp x5,x6,[x1],#2*8 + add x27,x27,x17 // h+=Sigma0(a) + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + eor x7,x23,x23,ror#23 + and x17,x24,x23 + bic x28,x25,x23 + add x26,x26,x4 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x7,ror#18 // Sigma1(e) + ror x7,x27,#28 + add x26,x26,x17 // h+=Ch(e,f,g) + eor x17,x27,x27,ror#5 + add x26,x26,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x22,x22,x26 // d+=h + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x7,x17,ror#34 // Sigma0(a) + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x26,x26,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x5,x5 // 2 +#endif + add x26,x26,x17 // h+=Sigma0(a) + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + eor x8,x22,x22,ror#23 + and x17,x23,x22 + bic x19,x24,x22 + add x25,x25,x5 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x8,ror#18 // Sigma1(e) + ror x8,x26,#28 + add x25,x25,x17 // h+=Ch(e,f,g) + eor x17,x26,x26,ror#5 + add x25,x25,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x21,x21,x25 // d+=h + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x8,x17,ror#34 // Sigma0(a) + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x25,x25,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x6,x6 // 3 +#endif + ldp x7,x8,[x1],#2*8 + add x25,x25,x17 // h+=Sigma0(a) + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + eor x9,x21,x21,ror#23 + and x17,x22,x21 + bic x28,x23,x21 + add x24,x24,x6 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x9,ror#18 // Sigma1(e) + ror x9,x25,#28 + add x24,x24,x17 // h+=Ch(e,f,g) + eor x17,x25,x25,ror#5 + add x24,x24,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x20,x20,x24 // d+=h + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x9,x17,ror#34 // Sigma0(a) + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x24,x24,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x7,x7 // 4 +#endif + add x24,x24,x17 // h+=Sigma0(a) + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + eor x10,x20,x20,ror#23 + and x17,x21,x20 + bic x19,x22,x20 + add x23,x23,x7 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x10,ror#18 // Sigma1(e) + ror x10,x24,#28 + add x23,x23,x17 // h+=Ch(e,f,g) + eor x17,x24,x24,ror#5 + add x23,x23,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x27,x27,x23 // d+=h + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x10,x17,ror#34 // Sigma0(a) + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x23,x23,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x8,x8 // 5 +#endif + ldp x9,x10,[x1],#2*8 + add x23,x23,x17 // h+=Sigma0(a) + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + eor x11,x27,x27,ror#23 + and x17,x20,x27 + bic x28,x21,x27 + add x22,x22,x8 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x11,ror#18 // Sigma1(e) + ror x11,x23,#28 + add x22,x22,x17 // h+=Ch(e,f,g) + eor x17,x23,x23,ror#5 + add x22,x22,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x26,x26,x22 // d+=h + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x11,x17,ror#34 // Sigma0(a) + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x22,x22,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x9,x9 // 6 +#endif + add x22,x22,x17 // h+=Sigma0(a) + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + eor x12,x26,x26,ror#23 + and x17,x27,x26 + bic x19,x20,x26 + add x21,x21,x9 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x12,ror#18 // Sigma1(e) + ror x12,x22,#28 + add x21,x21,x17 // h+=Ch(e,f,g) + eor x17,x22,x22,ror#5 + add x21,x21,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x25,x25,x21 // d+=h + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x12,x17,ror#34 // Sigma0(a) + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x21,x21,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x10,x10 // 7 +#endif + ldp x11,x12,[x1],#2*8 + add x21,x21,x17 // h+=Sigma0(a) + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + eor x13,x25,x25,ror#23 + and x17,x26,x25 + bic x28,x27,x25 + add x20,x20,x10 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x13,ror#18 // Sigma1(e) + ror x13,x21,#28 + add x20,x20,x17 // h+=Ch(e,f,g) + eor x17,x21,x21,ror#5 + add x20,x20,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x24,x24,x20 // d+=h + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x13,x17,ror#34 // Sigma0(a) + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x20,x20,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x11,x11 // 8 +#endif + add x20,x20,x17 // h+=Sigma0(a) + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + eor x14,x24,x24,ror#23 + and x17,x25,x24 + bic x19,x26,x24 + add x27,x27,x11 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x14,ror#18 // Sigma1(e) + ror x14,x20,#28 + add x27,x27,x17 // h+=Ch(e,f,g) + eor x17,x20,x20,ror#5 + add x27,x27,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x23,x23,x27 // d+=h + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x14,x17,ror#34 // Sigma0(a) + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x27,x27,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x12,x12 // 9 +#endif + ldp x13,x14,[x1],#2*8 + add x27,x27,x17 // h+=Sigma0(a) + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + eor x15,x23,x23,ror#23 + and x17,x24,x23 + bic x28,x25,x23 + add x26,x26,x12 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x15,ror#18 // Sigma1(e) + ror x15,x27,#28 + add x26,x26,x17 // h+=Ch(e,f,g) + eor x17,x27,x27,ror#5 + add x26,x26,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x22,x22,x26 // d+=h + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x15,x17,ror#34 // Sigma0(a) + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x26,x26,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x13,x13 // 10 +#endif + add x26,x26,x17 // h+=Sigma0(a) + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + eor x0,x22,x22,ror#23 + and x17,x23,x22 + bic x19,x24,x22 + add x25,x25,x13 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x0,ror#18 // Sigma1(e) + ror x0,x26,#28 + add x25,x25,x17 // h+=Ch(e,f,g) + eor x17,x26,x26,ror#5 + add x25,x25,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x21,x21,x25 // d+=h + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x0,x17,ror#34 // Sigma0(a) + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x25,x25,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x14,x14 // 11 +#endif + ldp x15,x0,[x1],#2*8 + add x25,x25,x17 // h+=Sigma0(a) + str x6,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + eor x6,x21,x21,ror#23 + and x17,x22,x21 + bic x28,x23,x21 + add x24,x24,x14 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x6,ror#18 // Sigma1(e) + ror x6,x25,#28 + add x24,x24,x17 // h+=Ch(e,f,g) + eor x17,x25,x25,ror#5 + add x24,x24,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x20,x20,x24 // d+=h + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x6,x17,ror#34 // Sigma0(a) + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x24,x24,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x15,x15 // 12 +#endif + add x24,x24,x17 // h+=Sigma0(a) + str x7,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + eor x7,x20,x20,ror#23 + and x17,x21,x20 + bic x19,x22,x20 + add x23,x23,x15 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x7,ror#18 // Sigma1(e) + ror x7,x24,#28 + add x23,x23,x17 // h+=Ch(e,f,g) + eor x17,x24,x24,ror#5 + add x23,x23,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x27,x27,x23 // d+=h + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x7,x17,ror#34 // Sigma0(a) + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x23,x23,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x0,x0 // 13 +#endif + ldp x1,x2,[x1] + add x23,x23,x17 // h+=Sigma0(a) + str x8,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + eor x8,x27,x27,ror#23 + and x17,x20,x27 + bic x28,x21,x27 + add x22,x22,x0 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x8,ror#18 // Sigma1(e) + ror x8,x23,#28 + add x22,x22,x17 // h+=Ch(e,f,g) + eor x17,x23,x23,ror#5 + add x22,x22,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x26,x26,x22 // d+=h + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x8,x17,ror#34 // Sigma0(a) + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x22,x22,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x1,x1 // 14 +#endif + ldr x6,[sp,#24] + add x22,x22,x17 // h+=Sigma0(a) + str x9,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + eor x9,x26,x26,ror#23 + and x17,x27,x26 + bic x19,x20,x26 + add x21,x21,x1 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x9,ror#18 // Sigma1(e) + ror x9,x22,#28 + add x21,x21,x17 // h+=Ch(e,f,g) + eor x17,x22,x22,ror#5 + add x21,x21,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x25,x25,x21 // d+=h + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x9,x17,ror#34 // Sigma0(a) + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x21,x21,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x2,x2 // 15 +#endif + ldr x7,[sp,#0] + add x21,x21,x17 // h+=Sigma0(a) + str x10,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x9,x4,#1 + and x17,x26,x25 + ror x8,x1,#19 + bic x28,x27,x25 + ror x10,x21,#28 + add x20,x20,x2 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x9,x9,x4,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x10,x10,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x8,x8,x1,ror#61 + eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x10,x21,ror#39 // Sigma0(a) + eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) + add x3,x3,x12 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x3,x3,x9 + add x20,x20,x17 // h+=Sigma0(a) + add x3,x3,x8 +.Loop_16_xx: + ldr x8,[sp,#8] + str x11,[sp,#0] + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + ror x10,x5,#1 + and x17,x25,x24 + ror x9,x2,#19 + bic x19,x26,x24 + ror x11,x20,#28 + add x27,x27,x3 // h+=X[i] + eor x16,x16,x24,ror#18 + eor x10,x10,x5,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x24,ror#41 // Sigma1(e) + eor x11,x11,x20,ror#34 + add x27,x27,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x9,x9,x2,ror#61 + eor x10,x10,x5,lsr#7 // sigma0(X[i+1]) + add x27,x27,x16 // h+=Sigma1(e) + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x11,x20,ror#39 // Sigma0(a) + eor x9,x9,x2,lsr#6 // sigma1(X[i+14]) + add x4,x4,x13 + add x23,x23,x27 // d+=h + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x4,x4,x10 + add x27,x27,x17 // h+=Sigma0(a) + add x4,x4,x9 + ldr x9,[sp,#16] + str x12,[sp,#8] + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + ror x11,x6,#1 + and x17,x24,x23 + ror x10,x3,#19 + bic x28,x25,x23 + ror x12,x27,#28 + add x26,x26,x4 // h+=X[i] + eor x16,x16,x23,ror#18 + eor x11,x11,x6,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x23,ror#41 // Sigma1(e) + eor x12,x12,x27,ror#34 + add x26,x26,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x10,x10,x3,ror#61 + eor x11,x11,x6,lsr#7 // sigma0(X[i+1]) + add x26,x26,x16 // h+=Sigma1(e) + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x12,x27,ror#39 // Sigma0(a) + eor x10,x10,x3,lsr#6 // sigma1(X[i+14]) + add x5,x5,x14 + add x22,x22,x26 // d+=h + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x5,x5,x11 + add x26,x26,x17 // h+=Sigma0(a) + add x5,x5,x10 + ldr x10,[sp,#24] + str x13,[sp,#16] + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + ror x12,x7,#1 + and x17,x23,x22 + ror x11,x4,#19 + bic x19,x24,x22 + ror x13,x26,#28 + add x25,x25,x5 // h+=X[i] + eor x16,x16,x22,ror#18 + eor x12,x12,x7,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x22,ror#41 // Sigma1(e) + eor x13,x13,x26,ror#34 + add x25,x25,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x11,x11,x4,ror#61 + eor x12,x12,x7,lsr#7 // sigma0(X[i+1]) + add x25,x25,x16 // h+=Sigma1(e) + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x13,x26,ror#39 // Sigma0(a) + eor x11,x11,x4,lsr#6 // sigma1(X[i+14]) + add x6,x6,x15 + add x21,x21,x25 // d+=h + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x6,x6,x12 + add x25,x25,x17 // h+=Sigma0(a) + add x6,x6,x11 + ldr x11,[sp,#0] + str x14,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + ror x13,x8,#1 + and x17,x22,x21 + ror x12,x5,#19 + bic x28,x23,x21 + ror x14,x25,#28 + add x24,x24,x6 // h+=X[i] + eor x16,x16,x21,ror#18 + eor x13,x13,x8,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x21,ror#41 // Sigma1(e) + eor x14,x14,x25,ror#34 + add x24,x24,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x12,x12,x5,ror#61 + eor x13,x13,x8,lsr#7 // sigma0(X[i+1]) + add x24,x24,x16 // h+=Sigma1(e) + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x14,x25,ror#39 // Sigma0(a) + eor x12,x12,x5,lsr#6 // sigma1(X[i+14]) + add x7,x7,x0 + add x20,x20,x24 // d+=h + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x7,x7,x13 + add x24,x24,x17 // h+=Sigma0(a) + add x7,x7,x12 + ldr x12,[sp,#8] + str x15,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + ror x14,x9,#1 + and x17,x21,x20 + ror x13,x6,#19 + bic x19,x22,x20 + ror x15,x24,#28 + add x23,x23,x7 // h+=X[i] + eor x16,x16,x20,ror#18 + eor x14,x14,x9,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x20,ror#41 // Sigma1(e) + eor x15,x15,x24,ror#34 + add x23,x23,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x13,x13,x6,ror#61 + eor x14,x14,x9,lsr#7 // sigma0(X[i+1]) + add x23,x23,x16 // h+=Sigma1(e) + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x15,x24,ror#39 // Sigma0(a) + eor x13,x13,x6,lsr#6 // sigma1(X[i+14]) + add x8,x8,x1 + add x27,x27,x23 // d+=h + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x8,x8,x14 + add x23,x23,x17 // h+=Sigma0(a) + add x8,x8,x13 + ldr x13,[sp,#16] + str x0,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + ror x15,x10,#1 + and x17,x20,x27 + ror x14,x7,#19 + bic x28,x21,x27 + ror x0,x23,#28 + add x22,x22,x8 // h+=X[i] + eor x16,x16,x27,ror#18 + eor x15,x15,x10,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x27,ror#41 // Sigma1(e) + eor x0,x0,x23,ror#34 + add x22,x22,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x14,x14,x7,ror#61 + eor x15,x15,x10,lsr#7 // sigma0(X[i+1]) + add x22,x22,x16 // h+=Sigma1(e) + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x0,x23,ror#39 // Sigma0(a) + eor x14,x14,x7,lsr#6 // sigma1(X[i+14]) + add x9,x9,x2 + add x26,x26,x22 // d+=h + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x9,x9,x15 + add x22,x22,x17 // h+=Sigma0(a) + add x9,x9,x14 + ldr x14,[sp,#24] + str x1,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + ror x0,x11,#1 + and x17,x27,x26 + ror x15,x8,#19 + bic x19,x20,x26 + ror x1,x22,#28 + add x21,x21,x9 // h+=X[i] + eor x16,x16,x26,ror#18 + eor x0,x0,x11,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x26,ror#41 // Sigma1(e) + eor x1,x1,x22,ror#34 + add x21,x21,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x15,x15,x8,ror#61 + eor x0,x0,x11,lsr#7 // sigma0(X[i+1]) + add x21,x21,x16 // h+=Sigma1(e) + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x1,x22,ror#39 // Sigma0(a) + eor x15,x15,x8,lsr#6 // sigma1(X[i+14]) + add x10,x10,x3 + add x25,x25,x21 // d+=h + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x10,x10,x0 + add x21,x21,x17 // h+=Sigma0(a) + add x10,x10,x15 + ldr x15,[sp,#0] + str x2,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x1,x12,#1 + and x17,x26,x25 + ror x0,x9,#19 + bic x28,x27,x25 + ror x2,x21,#28 + add x20,x20,x10 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x1,x1,x12,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x2,x2,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x0,x0,x9,ror#61 + eor x1,x1,x12,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x2,x21,ror#39 // Sigma0(a) + eor x0,x0,x9,lsr#6 // sigma1(X[i+14]) + add x11,x11,x4 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x11,x11,x1 + add x20,x20,x17 // h+=Sigma0(a) + add x11,x11,x0 + ldr x0,[sp,#8] + str x3,[sp,#0] + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + ror x2,x13,#1 + and x17,x25,x24 + ror x1,x10,#19 + bic x19,x26,x24 + ror x3,x20,#28 + add x27,x27,x11 // h+=X[i] + eor x16,x16,x24,ror#18 + eor x2,x2,x13,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x24,ror#41 // Sigma1(e) + eor x3,x3,x20,ror#34 + add x27,x27,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x1,x1,x10,ror#61 + eor x2,x2,x13,lsr#7 // sigma0(X[i+1]) + add x27,x27,x16 // h+=Sigma1(e) + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x3,x20,ror#39 // Sigma0(a) + eor x1,x1,x10,lsr#6 // sigma1(X[i+14]) + add x12,x12,x5 + add x23,x23,x27 // d+=h + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x12,x12,x2 + add x27,x27,x17 // h+=Sigma0(a) + add x12,x12,x1 + ldr x1,[sp,#16] + str x4,[sp,#8] + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + ror x3,x14,#1 + and x17,x24,x23 + ror x2,x11,#19 + bic x28,x25,x23 + ror x4,x27,#28 + add x26,x26,x12 // h+=X[i] + eor x16,x16,x23,ror#18 + eor x3,x3,x14,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x23,ror#41 // Sigma1(e) + eor x4,x4,x27,ror#34 + add x26,x26,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x2,x2,x11,ror#61 + eor x3,x3,x14,lsr#7 // sigma0(X[i+1]) + add x26,x26,x16 // h+=Sigma1(e) + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x4,x27,ror#39 // Sigma0(a) + eor x2,x2,x11,lsr#6 // sigma1(X[i+14]) + add x13,x13,x6 + add x22,x22,x26 // d+=h + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x13,x13,x3 + add x26,x26,x17 // h+=Sigma0(a) + add x13,x13,x2 + ldr x2,[sp,#24] + str x5,[sp,#16] + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + ror x4,x15,#1 + and x17,x23,x22 + ror x3,x12,#19 + bic x19,x24,x22 + ror x5,x26,#28 + add x25,x25,x13 // h+=X[i] + eor x16,x16,x22,ror#18 + eor x4,x4,x15,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x22,ror#41 // Sigma1(e) + eor x5,x5,x26,ror#34 + add x25,x25,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x3,x3,x12,ror#61 + eor x4,x4,x15,lsr#7 // sigma0(X[i+1]) + add x25,x25,x16 // h+=Sigma1(e) + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x5,x26,ror#39 // Sigma0(a) + eor x3,x3,x12,lsr#6 // sigma1(X[i+14]) + add x14,x14,x7 + add x21,x21,x25 // d+=h + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x14,x14,x4 + add x25,x25,x17 // h+=Sigma0(a) + add x14,x14,x3 + ldr x3,[sp,#0] + str x6,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + ror x5,x0,#1 + and x17,x22,x21 + ror x4,x13,#19 + bic x28,x23,x21 + ror x6,x25,#28 + add x24,x24,x14 // h+=X[i] + eor x16,x16,x21,ror#18 + eor x5,x5,x0,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x21,ror#41 // Sigma1(e) + eor x6,x6,x25,ror#34 + add x24,x24,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x4,x4,x13,ror#61 + eor x5,x5,x0,lsr#7 // sigma0(X[i+1]) + add x24,x24,x16 // h+=Sigma1(e) + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x6,x25,ror#39 // Sigma0(a) + eor x4,x4,x13,lsr#6 // sigma1(X[i+14]) + add x15,x15,x8 + add x20,x20,x24 // d+=h + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x15,x15,x5 + add x24,x24,x17 // h+=Sigma0(a) + add x15,x15,x4 + ldr x4,[sp,#8] + str x7,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + ror x6,x1,#1 + and x17,x21,x20 + ror x5,x14,#19 + bic x19,x22,x20 + ror x7,x24,#28 + add x23,x23,x15 // h+=X[i] + eor x16,x16,x20,ror#18 + eor x6,x6,x1,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x20,ror#41 // Sigma1(e) + eor x7,x7,x24,ror#34 + add x23,x23,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x5,x5,x14,ror#61 + eor x6,x6,x1,lsr#7 // sigma0(X[i+1]) + add x23,x23,x16 // h+=Sigma1(e) + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x7,x24,ror#39 // Sigma0(a) + eor x5,x5,x14,lsr#6 // sigma1(X[i+14]) + add x0,x0,x9 + add x27,x27,x23 // d+=h + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x0,x0,x6 + add x23,x23,x17 // h+=Sigma0(a) + add x0,x0,x5 + ldr x5,[sp,#16] + str x8,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + ror x7,x2,#1 + and x17,x20,x27 + ror x6,x15,#19 + bic x28,x21,x27 + ror x8,x23,#28 + add x22,x22,x0 // h+=X[i] + eor x16,x16,x27,ror#18 + eor x7,x7,x2,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x27,ror#41 // Sigma1(e) + eor x8,x8,x23,ror#34 + add x22,x22,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x6,x6,x15,ror#61 + eor x7,x7,x2,lsr#7 // sigma0(X[i+1]) + add x22,x22,x16 // h+=Sigma1(e) + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x8,x23,ror#39 // Sigma0(a) + eor x6,x6,x15,lsr#6 // sigma1(X[i+14]) + add x1,x1,x10 + add x26,x26,x22 // d+=h + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x1,x1,x7 + add x22,x22,x17 // h+=Sigma0(a) + add x1,x1,x6 + ldr x6,[sp,#24] + str x9,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + ror x8,x3,#1 + and x17,x27,x26 + ror x7,x0,#19 + bic x19,x20,x26 + ror x9,x22,#28 + add x21,x21,x1 // h+=X[i] + eor x16,x16,x26,ror#18 + eor x8,x8,x3,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x26,ror#41 // Sigma1(e) + eor x9,x9,x22,ror#34 + add x21,x21,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x7,x7,x0,ror#61 + eor x8,x8,x3,lsr#7 // sigma0(X[i+1]) + add x21,x21,x16 // h+=Sigma1(e) + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x9,x22,ror#39 // Sigma0(a) + eor x7,x7,x0,lsr#6 // sigma1(X[i+14]) + add x2,x2,x11 + add x25,x25,x21 // d+=h + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x2,x2,x8 + add x21,x21,x17 // h+=Sigma0(a) + add x2,x2,x7 + ldr x7,[sp,#0] + str x10,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x9,x4,#1 + and x17,x26,x25 + ror x8,x1,#19 + bic x28,x27,x25 + ror x10,x21,#28 + add x20,x20,x2 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x9,x9,x4,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x10,x10,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x8,x8,x1,ror#61 + eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x10,x21,ror#39 // Sigma0(a) + eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) + add x3,x3,x12 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x3,x3,x9 + add x20,x20,x17 // h+=Sigma0(a) + add x3,x3,x8 + cbnz x19,.Loop_16_xx + + ldp x0,x2,[x29,#96] + ldr x1,[x29,#112] + sub x30,x30,#648 // rewind + + ldp x3,x4,[x0] + ldp x5,x6,[x0,#2*8] + add x1,x1,#14*8 // advance input pointer + ldp x7,x8,[x0,#4*8] + add x20,x20,x3 + ldp x9,x10,[x0,#6*8] + add x21,x21,x4 + add x22,x22,x5 + add x23,x23,x6 + stp x20,x21,[x0] + add x24,x24,x7 + add x25,x25,x8 + stp x22,x23,[x0,#2*8] + add x26,x26,x9 + add x27,x27,x10 + cmp x1,x2 + stp x24,x25,[x0,#4*8] + stp x26,x27,[x0,#6*8] + b.ne .Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*8 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + ret +.size sha512_block_data_order,.-sha512_block_data_order + +.align 6 +.type .LK512,%object +.LK512: + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + .quad 0 // terminator +.size .LK512,.-.LK512 +#ifndef __KERNEL__ +.align 3 +.LOPENSSL_armcap_P: +# ifdef __ILP32__ + .long OPENSSL_armcap_P-. +# else + .quad OPENSSL_armcap_P-. +# endif +#endif +.asciz "SHA512 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>" +.align 2 +#ifndef __KERNEL__ +.comm OPENSSL_armcap_P,4,4 +#endif diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 3a4301163e04..a2597e3f70ad 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -25,6 +25,7 @@ #include <asm/asm-offsets.h> #include <asm/cpufeature.h> +#include <asm/cputype.h> #include <asm/page.h> #include <asm/pgtable-hwdef.h> #include <asm/ptrace.h> @@ -96,6 +97,24 @@ .endm /* + * Value prediction barrier + */ + .macro csdb + hint #20 + .endm + +/* + * Sanitise a 64-bit bounded index wrt speculation, returning zero if out + * of bounds. + */ + .macro mask_nospec64, idx, limit, tmp + sub \tmp, \idx, \limit + bic \tmp, \tmp, \idx + and \idx, \idx, \tmp, asr #63 + csdb + .endm + +/* * NOP sequence */ .macro nops, num @@ -453,4 +472,47 @@ alternative_else_nop_endif #endif .endm + .macro pte_to_phys, phys, pte + and \phys, \pte, #(((1 << (48 - PAGE_SHIFT)) - 1) << PAGE_SHIFT) + .endm + +/* + * Check the MIDR_EL1 of the current CPU for a given model and a range of + * variant/revision. See asm/cputype.h for the macros used below. + * + * model: MIDR_CPU_MODEL of CPU + * rv_min: Minimum of MIDR_CPU_VAR_REV() + * rv_max: Maximum of MIDR_CPU_VAR_REV() + * res: Result register. + * tmp1, tmp2, tmp3: Temporary registers + * + * Corrupts: res, tmp1, tmp2, tmp3 + * Returns: 0, if the CPU id doesn't match. Non-zero otherwise + */ + .macro cpu_midr_match model, rv_min, rv_max, res, tmp1, tmp2, tmp3 + mrs \res, midr_el1 + mov_q \tmp1, (MIDR_REVISION_MASK | MIDR_VARIANT_MASK) + mov_q \tmp2, MIDR_CPU_MODEL_MASK + and \tmp3, \res, \tmp2 // Extract model + and \tmp1, \res, \tmp1 // rev & variant + mov_q \tmp2, \model + cmp \tmp3, \tmp2 + cset \res, eq + cbz \res, .Ldone\@ // Model matches ? + + .if (\rv_min != 0) // Skip min check if rv_min == 0 + mov_q \tmp3, \rv_min + cmp \tmp1, \tmp3 + cset \res, ge + .endif // \rv_min != 0 + /* Skip rv_max check if rv_min == rv_max && rv_min != 0 */ + .if ((\rv_min != \rv_max) || \rv_min == 0) + mov_q \tmp2, \rv_max + cmp \tmp1, \tmp2 + cset \tmp2, le + and \res, \res, \tmp2 + .endif +.Ldone\@: + .endm + #endif /* __ASM_ASSEMBLER_H */ diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h index 0fe7e43b7fbc..0b0755c961ac 100644 --- a/arch/arm64/include/asm/barrier.h +++ b/arch/arm64/include/asm/barrier.h @@ -31,6 +31,8 @@ #define dmb(opt) asm volatile("dmb " #opt : : : "memory") #define dsb(opt) asm volatile("dsb " #opt : : : "memory") +#define csdb() asm volatile("hint #20" : : : "memory") + #define mb() dsb(sy) #define rmb() dsb(ld) #define wmb() dsb(st) @@ -38,6 +40,27 @@ #define dma_rmb() dmb(oshld) #define dma_wmb() dmb(oshst) +/* + * Generate a mask for array_index__nospec() that is ~0UL when 0 <= idx < sz + * and 0 otherwise. + */ +#define array_index_mask_nospec array_index_mask_nospec +static inline unsigned long array_index_mask_nospec(unsigned long idx, + unsigned long sz) +{ + unsigned long mask; + + asm volatile( + " cmp %1, %2\n" + " sbc %0, xzr, xzr\n" + : "=r" (mask) + : "r" (idx), "Ir" (sz) + : "cc"); + + csdb(); + return mask; +} + #define __smp_mb() dmb(ish) #define __smp_rmb() dmb(ishld) #define __smp_wmb() dmb(ishst) diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index 87b446535185..ce67bf6a0886 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -34,7 +34,9 @@ #define ARM64_HAS_32BIT_EL0 13 #define ARM64_HYP_OFFSET_LOW 14 #define ARM64_MISMATCHED_CACHE_LINE_SIZE 15 +#define ARM64_UNMAP_KERNEL_AT_EL0 16 +#define ARM64_HARDEN_BRANCH_PREDICTOR 17 -#define ARM64_NCAPS 16 +#define ARM64_NCAPS 18 #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 26a68ddb11c1..39d1db68748d 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -56,6 +56,9 @@ (0xf << MIDR_ARCHITECTURE_SHIFT) | \ ((partnum) << MIDR_PARTNUM_SHIFT)) +#define MIDR_CPU_VAR_REV(var, rev) \ + (((var) << MIDR_VARIANT_SHIFT) | (rev)) + #define MIDR_CPU_MODEL_MASK (MIDR_IMPLEMENTOR_MASK | MIDR_PARTNUM_MASK | \ MIDR_ARCHITECTURE_MASK) @@ -74,20 +77,31 @@ #define ARM_CPU_PART_AEM_V8 0xD0F #define ARM_CPU_PART_FOUNDATION 0xD00 +#define ARM_CPU_PART_CORTEX_A55 0xD05 #define ARM_CPU_PART_CORTEX_A57 0xD07 +#define ARM_CPU_PART_CORTEX_A72 0xD08 #define ARM_CPU_PART_CORTEX_A53 0xD03 +#define ARM_CPU_PART_CORTEX_A73 0xD09 +#define ARM_CPU_PART_CORTEX_A75 0xD0A #define APM_CPU_PART_POTENZA 0x000 #define CAVIUM_CPU_PART_THUNDERX 0x0A1 #define CAVIUM_CPU_PART_THUNDERX_81XX 0x0A2 +#define CAVIUM_CPU_PART_THUNDERX2 0x0AF #define BRCM_CPU_PART_VULCAN 0x516 #define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53) +#define MIDR_CORTEX_A55 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A55) #define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57) +#define MIDR_CORTEX_A72 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A72) +#define MIDR_CORTEX_A73 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A73) +#define MIDR_CORTEX_A75 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A75) #define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX) +#define MIDR_CAVIUM_THUNDERX2 MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX2) +#define MIDR_BRCM_VULCAN MIDR_CPU_MODEL(ARM_CPU_IMP_BRCM, BRCM_CPU_PART_VULCAN) #ifndef __ASSEMBLY__ diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h index caf86be815ba..d8e58051f32d 100644 --- a/arch/arm64/include/asm/fixmap.h +++ b/arch/arm64/include/asm/fixmap.h @@ -51,6 +51,12 @@ enum fixed_addresses { FIX_EARLYCON_MEM_BASE, FIX_TEXT_POKE0, + +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + FIX_ENTRY_TRAMP_DATA, + FIX_ENTRY_TRAMP_TEXT, +#define TRAMP_VALIAS (__fix_to_virt(FIX_ENTRY_TRAMP_TEXT)) +#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ __end_of_permanent_fixed_addresses, /* diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index 85c4a8981d47..a891bb6bb3d4 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -48,20 +48,10 @@ do { \ } while (0) static inline int -futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret, tmp; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); switch (op) { @@ -91,30 +81,24 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } static inline int -futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, +futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *_uaddr, u32 oldval, u32 newval) { int ret = 0; u32 val, tmp; + u32 __user *uaddr; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) + if (!access_ok(VERIFY_WRITE, _uaddr, sizeof(u32))) return -EFAULT; + uaddr = __uaccess_mask_ptr(_uaddr); uaccess_enable(); asm volatile("// futex_atomic_cmpxchg_inatomic\n" " prfm pstl1strm, %2\n" diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index e5050388e062..0a33ea304e63 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -73,6 +73,9 @@ struct kvm_arch { /* Timer */ struct arch_timer_kvm timer; + + /* Mandated version of PSCI */ + u32 psci_version; }; #define KVM_NR_MEM_OBJS 40 @@ -393,4 +396,9 @@ static inline void __cpu_init_stage2(void) "PARange is %d bits, unsupported configuration!", parange); } +static inline bool kvm_arm_harden_branch_predictor(void) +{ + return cpus_have_cap(ARM64_HARDEN_BRANCH_PREDICTOR); +} + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 6d22017ebbad..80bf33715ecb 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -313,5 +313,43 @@ static inline unsigned int kvm_get_vmid_bits(void) return (cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR1_VMIDBITS_SHIFT) == 2) ? 16 : 8; } +#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR +#include <asm/mmu.h> + +static inline void *kvm_get_hyp_vector(void) +{ + struct bp_hardening_data *data = arm64_get_bp_hardening_data(); + void *vect = kvm_ksym_ref(__kvm_hyp_vector); + + if (data->fn) { + vect = __bp_harden_hyp_vecs_start + + data->hyp_vectors_slot * SZ_2K; + + if (!cpus_have_cap(ARM64_HAS_VIRT_HOST_EXTN)) + vect = lm_alias(vect); + } + + return vect; +} + +static inline int kvm_map_vectors(void) +{ + return create_hyp_mappings(kvm_ksym_ref(__bp_harden_hyp_vecs_start), + kvm_ksym_ref(__bp_harden_hyp_vecs_end), + PAGE_HYP_EXEC); +} + +#else +static inline void *kvm_get_hyp_vector(void) +{ + return kvm_ksym_ref(__kvm_hyp_vector); +} + +static inline int kvm_map_vectors(void) +{ + return 0; +} +#endif + #endif /* __ASSEMBLY__ */ #endif /* __ARM64_KVM_MMU_H__ */ diff --git a/arch/arm64/include/asm/kvm_psci.h b/arch/arm64/include/asm/kvm_psci.h deleted file mode 100644 index bc39e557c56c..000000000000 --- a/arch/arm64/include/asm/kvm_psci.h +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright (C) 2012,2013 - ARM Ltd - * Author: Marc Zyngier <marc.zyngier@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#ifndef __ARM64_KVM_PSCI_H__ -#define __ARM64_KVM_PSCI_H__ - -#define KVM_ARM_PSCI_0_1 1 -#define KVM_ARM_PSCI_0_2 2 - -int kvm_psci_version(struct kvm_vcpu *vcpu); -int kvm_psci_call(struct kvm_vcpu *vcpu); - -#endif /* __ARM64_KVM_PSCI_H__ */ diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 53211a0acf0f..ba917be5565a 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -60,12 +60,12 @@ * KIMAGE_VADDR - the virtual address of the start of the kernel image * VA_BITS - the maximum number of bits for virtual addresses. * VA_START - the first kernel virtual address. - * TASK_SIZE - the maximum size of a user space task. - * TASK_UNMAPPED_BASE - the lower boundary of the mmap VM area. */ #define VA_BITS (CONFIG_ARM64_VA_BITS) -#define VA_START (UL(0xffffffffffffffff) << VA_BITS) -#define PAGE_OFFSET (UL(0xffffffffffffffff) << (VA_BITS - 1)) +#define VA_START (UL(0xffffffffffffffff) - \ + (UL(1) << VA_BITS) + 1) +#define PAGE_OFFSET (UL(0xffffffffffffffff) - \ + (UL(1) << (VA_BITS - 1)) + 1) #define KIMAGE_VADDR (MODULES_END) #define MODULES_END (MODULES_VADDR + MODULES_VSIZE) #define MODULES_VADDR (VA_START + KASAN_SHADOW_SIZE) @@ -74,19 +74,6 @@ #define PCI_IO_END (VMEMMAP_START - SZ_2M) #define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE) #define FIXADDR_TOP (PCI_IO_START - SZ_2M) -#define TASK_SIZE_64 (UL(1) << VA_BITS) - -#ifdef CONFIG_COMPAT -#define TASK_SIZE_32 UL(0x100000000) -#define TASK_SIZE (test_thread_flag(TIF_32BIT) ? \ - TASK_SIZE_32 : TASK_SIZE_64) -#define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_32BIT) ? \ - TASK_SIZE_32 : TASK_SIZE_64) -#else -#define TASK_SIZE TASK_SIZE_64 -#endif /* CONFIG_COMPAT */ - -#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 4)) #define KERNEL_START _text #define KERNEL_END _end diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 8d9fce037b2f..d51158a61892 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -16,6 +16,12 @@ #ifndef __ASM_MMU_H #define __ASM_MMU_H +#define USER_ASID_FLAG (UL(1) << 48) + +#ifndef __ASSEMBLY__ + +#include <linux/percpu.h> + typedef struct { atomic64_t id; void *vdso; @@ -28,6 +34,49 @@ typedef struct { */ #define ASID(mm) ((mm)->context.id.counter & 0xffff) +static inline bool arm64_kernel_unmapped_at_el0(void) +{ + return IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0) && + cpus_have_cap(ARM64_UNMAP_KERNEL_AT_EL0); +} + +typedef void (*bp_hardening_cb_t)(void); + +struct bp_hardening_data { + int hyp_vectors_slot; + bp_hardening_cb_t fn; +}; + +#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR +extern char __bp_harden_hyp_vecs_start[], __bp_harden_hyp_vecs_end[]; + +DECLARE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data); + +static inline struct bp_hardening_data *arm64_get_bp_hardening_data(void) +{ + return this_cpu_ptr(&bp_hardening_data); +} + +static inline void arm64_apply_bp_hardening(void) +{ + struct bp_hardening_data *d; + + if (!cpus_have_cap(ARM64_HARDEN_BRANCH_PREDICTOR)) + return; + + d = arm64_get_bp_hardening_data(); + if (d->fn) + d->fn(); +} +#else +static inline struct bp_hardening_data *arm64_get_bp_hardening_data(void) +{ + return NULL; +} + +static inline void arm64_apply_bp_hardening(void) { } +#endif /* CONFIG_HARDEN_BRANCH_PREDICTOR */ + extern void paging_init(void); extern void bootmem_init(void); extern void __iomem *early_io_map(phys_addr_t phys, unsigned long virt); @@ -37,4 +86,5 @@ extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, pgprot_t prot, bool allow_block_mappings); extern void *fixmap_remap_fdt(phys_addr_t dt_phys); +#endif /* !__ASSEMBLY__ */ #endif diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 0363fe80455c..041014cee28b 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -51,6 +51,13 @@ static inline void cpu_set_reserved_ttbr0(void) isb(); } +static inline void cpu_switch_mm(pgd_t *pgd, struct mm_struct *mm) +{ + BUG_ON(pgd == swapper_pg_dir); + cpu_set_reserved_ttbr0(); + cpu_do_switch_mm(virt_to_phys(pgd),mm); +} + /* * TCR.T0SZ value to use when the ID map is active. Usually equals * TCR_T0SZ(VA_BITS), unless system RAM is positioned very high in diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index eb0c2bd90de9..8df4cb6ac6f7 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -272,6 +272,7 @@ #define TCR_TG1_4K (UL(2) << TCR_TG1_SHIFT) #define TCR_TG1_64K (UL(3) << TCR_TG1_SHIFT) +#define TCR_A1 (UL(1) << 22) #define TCR_ASID16 (UL(1) << 36) #define TCR_TBI0 (UL(1) << 37) #define TCR_HA (UL(1) << 39) diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 2142c7726e76..f705d96a76f2 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -34,8 +34,14 @@ #include <asm/pgtable-types.h> -#define PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED) -#define PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S) +#define _PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED) +#define _PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S) + +#define PTE_MAYBE_NG (arm64_kernel_unmapped_at_el0() ? PTE_NG : 0) +#define PMD_MAYBE_NG (arm64_kernel_unmapped_at_el0() ? PMD_SECT_NG : 0) + +#define PROT_DEFAULT (_PROT_DEFAULT | PTE_MAYBE_NG) +#define PROT_SECT_DEFAULT (_PROT_SECT_DEFAULT | PMD_MAYBE_NG) #define PROT_DEVICE_nGnRnE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRnE)) #define PROT_DEVICE_nGnRE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRE)) @@ -47,23 +53,24 @@ #define PROT_SECT_NORMAL (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL)) #define PROT_SECT_NORMAL_EXEC (PROT_SECT_DEFAULT | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL)) -#define _PAGE_DEFAULT (PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL)) +#define _PAGE_DEFAULT (_PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL)) +#define _HYP_PAGE_DEFAULT _PAGE_DEFAULT -#define PAGE_KERNEL __pgprot(_PAGE_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE) -#define PAGE_KERNEL_RO __pgprot(_PAGE_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_RDONLY) -#define PAGE_KERNEL_ROX __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_RDONLY) -#define PAGE_KERNEL_EXEC __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE) -#define PAGE_KERNEL_EXEC_CONT __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_CONT) +#define PAGE_KERNEL __pgprot(PROT_NORMAL) +#define PAGE_KERNEL_RO __pgprot((PROT_NORMAL & ~PTE_WRITE) | PTE_RDONLY) +#define PAGE_KERNEL_ROX __pgprot((PROT_NORMAL & ~(PTE_WRITE | PTE_PXN)) | PTE_RDONLY) +#define PAGE_KERNEL_EXEC __pgprot(PROT_NORMAL & ~PTE_PXN) +#define PAGE_KERNEL_EXEC_CONT __pgprot((PROT_NORMAL & ~PTE_PXN) | PTE_CONT) -#define PAGE_HYP __pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_HYP_XN) -#define PAGE_HYP_EXEC __pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY) -#define PAGE_HYP_RO __pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY | PTE_HYP_XN) +#define PAGE_HYP __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_HYP_XN) +#define PAGE_HYP_EXEC __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY) +#define PAGE_HYP_RO __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY | PTE_HYP_XN) #define PAGE_HYP_DEVICE __pgprot(PROT_DEVICE_nGnRE | PTE_HYP) -#define PAGE_S2 __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_NORMAL) | PTE_S2_RDONLY) -#define PAGE_S2_DEVICE __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_DEVICE_nGnRE) | PTE_S2_RDONLY | PTE_UXN) +#define PAGE_S2 __pgprot(_PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_NORMAL) | PTE_S2_RDONLY) +#define PAGE_S2_DEVICE __pgprot(_PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_DEVICE_nGnRE) | PTE_S2_RDONLY | PTE_UXN) -#define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_PXN | PTE_UXN) +#define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_NG | PTE_PXN | PTE_UXN) #define PAGE_SHARED __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE) #define PAGE_SHARED_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_WRITE) #define PAGE_COPY __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 7acd3c5c7643..3a30a3994e4a 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -692,6 +692,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; extern pgd_t idmap_pg_dir[PTRS_PER_PGD]; +extern pgd_t tramp_pg_dir[PTRS_PER_PGD]; /* * Encode and decode a swap entry: diff --git a/arch/arm64/include/asm/proc-fns.h b/arch/arm64/include/asm/proc-fns.h index 14ad6e4e87d1..16cef2e8449e 100644 --- a/arch/arm64/include/asm/proc-fns.h +++ b/arch/arm64/include/asm/proc-fns.h @@ -35,12 +35,6 @@ extern u64 cpu_do_resume(phys_addr_t ptr, u64 idmap_ttbr); #include <asm/memory.h> -#define cpu_switch_mm(pgd,mm) \ -do { \ - BUG_ON(pgd == swapper_pg_dir); \ - cpu_do_switch_mm(virt_to_phys(pgd),mm); \ -} while (0) - #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* __ASM_PROCFNS_H */ diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 60e34824e18c..5917147af0c4 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -19,6 +19,13 @@ #ifndef __ASM_PROCESSOR_H #define __ASM_PROCESSOR_H +#define TASK_SIZE_64 (UL(1) << VA_BITS) + +#define KERNEL_DS UL(-1) +#define USER_DS (TASK_SIZE_64 - 1) + +#ifndef __ASSEMBLY__ + /* * Default implementation of macro that returns current * instruction pointer ("program counter"). @@ -37,6 +44,22 @@ #include <asm/ptrace.h> #include <asm/types.h> +/* + * TASK_SIZE - the maximum size of a user space task. + * TASK_UNMAPPED_BASE - the lower boundary of the mmap VM area. + */ +#ifdef CONFIG_COMPAT +#define TASK_SIZE_32 UL(0x100000000) +#define TASK_SIZE (test_thread_flag(TIF_32BIT) ? \ + TASK_SIZE_32 : TASK_SIZE_64) +#define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_32BIT) ? \ + TASK_SIZE_32 : TASK_SIZE_64) +#else +#define TASK_SIZE TASK_SIZE_64 +#endif /* CONFIG_COMPAT */ + +#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 4)) + #define STACK_TOP_MAX TASK_SIZE_64 #ifdef CONFIG_COMPAT #define AARCH32_VECTORS_BASE 0xffff0000 @@ -192,4 +215,5 @@ int cpu_enable_pan(void *__unused); int cpu_enable_uao(void *__unused); int cpu_enable_cache_maint_trap(void *__unused); +#endif /* __ASSEMBLY__ */ #endif /* __ASM_PROCESSOR_H */ diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 7393cc767edb..88bbe364b6ae 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -117,6 +117,9 @@ #define ID_AA64ISAR0_AES_SHIFT 4 /* id_aa64pfr0 */ +#define ID_AA64PFR0_CSV3_SHIFT 60 +#define ID_AA64PFR0_CSV2_SHIFT 56 +#define ID_AA64PFR0_SVE_SHIFT 32 #define ID_AA64PFR0_GIC_SHIFT 24 #define ID_AA64PFR0_ASIMD_SHIFT 20 #define ID_AA64PFR0_FP_SHIFT 16 diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index deab52374119..ad6bd8b26ada 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -23,6 +23,7 @@ #include <linux/sched.h> #include <asm/cputype.h> +#include <asm/mmu.h> /* * Raw TLBI operations. @@ -42,6 +43,11 @@ #define __tlbi(op, ...) __TLBI_N(op, ##__VA_ARGS__, 1, 0) +#define __tlbi_user(op, arg) do { \ + if (arm64_kernel_unmapped_at_el0()) \ + __tlbi(op, (arg) | USER_ASID_FLAG); \ +} while (0) + /* * TLB Management * ============== @@ -103,6 +109,7 @@ static inline void flush_tlb_mm(struct mm_struct *mm) dsb(ishst); __tlbi(aside1is, asid); + __tlbi_user(aside1is, asid); dsb(ish); } @@ -113,6 +120,7 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, dsb(ishst); __tlbi(vale1is, addr); + __tlbi_user(vale1is, addr); dsb(ish); } @@ -139,10 +147,13 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, dsb(ishst); for (addr = start; addr < end; addr += 1 << (PAGE_SHIFT - 12)) { - if (last_level) + if (last_level) { __tlbi(vale1is, addr); - else + __tlbi_user(vale1is, addr); + } else { __tlbi(vae1is, addr); + __tlbi_user(vae1is, addr); + } } dsb(ish); } @@ -182,6 +193,7 @@ static inline void __flush_tlb_pgtable(struct mm_struct *mm, unsigned long addr = uaddr >> 12 | (ASID(mm) << 48); __tlbi(vae1is, addr); + __tlbi_user(vae1is, addr); dsb(ish); } diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 31b6940ec2f2..71a8ab47fabc 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -33,6 +33,7 @@ #include <linux/thread_info.h> #include <asm/cpufeature.h> +#include <asm/processor.h> #include <asm/ptrace.h> #include <asm/errno.h> #include <asm/memory.h> @@ -63,10 +64,7 @@ struct exception_table_entry extern int fixup_exception(struct pt_regs *regs); -#define KERNEL_DS (-1UL) #define get_ds() (KERNEL_DS) - -#define USER_DS TASK_SIZE_64 #define get_fs() (current_thread_info()->addr_limit) static inline void set_fs(mm_segment_t fs) @@ -74,6 +72,13 @@ static inline void set_fs(mm_segment_t fs) current_thread_info()->addr_limit = fs; /* + * Prevent a mispredicted conditional call to set_fs from forwarding + * the wrong address limit to access_ok under speculation. + */ + dsb(nsh); + isb(); + + /* * Enable/disable UAO so that copy_to_user() etc can access * kernel memory with the unprivileged instructions. */ @@ -91,22 +96,32 @@ static inline void set_fs(mm_segment_t fs) * Returns 1 if the range is valid, 0 otherwise. * * This is equivalent to the following test: - * (u65)addr + (u65)size <= current->addr_limit - * - * This needs 65-bit arithmetic. + * (u65)addr + (u65)size <= (u65)current->addr_limit + 1 */ -#define __range_ok(addr, size) \ -({ \ - unsigned long __addr = (unsigned long __force)(addr); \ - unsigned long flag, roksum; \ - __chk_user_ptr(addr); \ - asm("adds %1, %1, %3; ccmp %1, %4, #2, cc; cset %0, ls" \ - : "=&r" (flag), "=&r" (roksum) \ - : "1" (__addr), "Ir" (size), \ - "r" (current_thread_info()->addr_limit) \ - : "cc"); \ - flag; \ -}) +static inline unsigned long __range_ok(unsigned long addr, unsigned long size) +{ + unsigned long limit = current_thread_info()->addr_limit; + + __chk_user_ptr(addr); + asm volatile( + // A + B <= C + 1 for all A,B,C, in four easy steps: + // 1: X = A + B; X' = X % 2^64 + " adds %0, %0, %2\n" + // 2: Set C = 0 if X > 2^64, to guarantee X' > C in step 4 + " csel %1, xzr, %1, hi\n" + // 3: Set X' = ~0 if X >= 2^64. For X == 2^64, this decrements X' + // to compensate for the carry flag being set in step 4. For + // X > 2^64, X' merely has to remain nonzero, which it does. + " csinv %0, %0, xzr, cc\n" + // 4: For X < 2^64, this gives us X' - C - 1 <= 0, where the -1 + // comes from the carry in being clear. Otherwise, we are + // testing X' - C == 0, subject to the previous adjustments. + " sbcs xzr, %0, %1\n" + " cset %0, ls\n" + : "+r" (addr), "+r" (limit) : "Ir" (size) : "cc"); + + return addr; +} /* * When dealing with data aborts, watchpoints, or instruction traps we may end @@ -115,7 +130,7 @@ static inline void set_fs(mm_segment_t fs) */ #define untagged_addr(addr) sign_extend64(addr, 55) -#define access_ok(type, addr, size) __range_ok(addr, size) +#define access_ok(type, addr, size) __range_ok((unsigned long)(addr), size) #define user_addr_max get_fs #define _ASM_EXTABLE(from, to) \ @@ -218,6 +233,26 @@ static inline void uaccess_enable_not_uao(void) } /* + * Sanitise a uaccess pointer such that it becomes NULL if above the + * current addr_limit. + */ +#define uaccess_mask_ptr(ptr) (__typeof__(ptr))__uaccess_mask_ptr(ptr) +static inline void __user *__uaccess_mask_ptr(const void __user *ptr) +{ + void __user *safe_ptr; + + asm volatile( + " bics xzr, %1, %2\n" + " csel %0, %1, xzr, eq\n" + : "=&r" (safe_ptr) + : "r" (ptr), "r" (current_thread_info()->addr_limit) + : "cc"); + + csdb(); + return safe_ptr; +} + +/* * The "__xxx" versions of the user access functions do not verify the address * space - it must have been done previously with a separate "access_ok()" * call. @@ -269,30 +304,35 @@ do { \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ } while (0) -#define __get_user(x, ptr) \ +#define __get_user_check(x, ptr, err) \ ({ \ - int __gu_err = 0; \ - __get_user_err((x), (ptr), __gu_err); \ - __gu_err; \ + __typeof__(*(ptr)) __user *__p = (ptr); \ + might_fault(); \ + if (access_ok(VERIFY_READ, __p, sizeof(*__p))) { \ + __p = uaccess_mask_ptr(__p); \ + __get_user_err((x), __p, (err)); \ + } else { \ + (x) = 0; (err) = -EFAULT; \ + } \ }) #define __get_user_error(x, ptr, err) \ ({ \ - __get_user_err((x), (ptr), (err)); \ + __get_user_check((x), (ptr), (err)); \ (void)0; \ }) -#define __get_user_unaligned __get_user - -#define get_user(x, ptr) \ +#define __get_user(x, ptr) \ ({ \ - __typeof__(*(ptr)) __user *__p = (ptr); \ - might_fault(); \ - access_ok(VERIFY_READ, __p, sizeof(*__p)) ? \ - __get_user((x), __p) : \ - ((x) = 0, -EFAULT); \ + int __gu_err = 0; \ + __get_user_check((x), (ptr), __gu_err); \ + __gu_err; \ }) +#define __get_user_unaligned __get_user + +#define get_user __get_user + #define __put_user_asm(instr, alt_instr, reg, x, addr, err, feature) \ asm volatile( \ "1:"ALTERNATIVE(instr " " reg "1, [%2]\n", \ @@ -335,47 +375,51 @@ do { \ uaccess_disable_not_uao(); \ } while (0) -#define __put_user(x, ptr) \ +#define __put_user_check(x, ptr, err) \ ({ \ - int __pu_err = 0; \ - __put_user_err((x), (ptr), __pu_err); \ - __pu_err; \ + __typeof__(*(ptr)) __user *__p = (ptr); \ + might_fault(); \ + if (access_ok(VERIFY_WRITE, __p, sizeof(*__p))) { \ + __p = uaccess_mask_ptr(__p); \ + __put_user_err((x), __p, (err)); \ + } else { \ + (err) = -EFAULT; \ + } \ }) #define __put_user_error(x, ptr, err) \ ({ \ - __put_user_err((x), (ptr), (err)); \ + __put_user_check((x), (ptr), (err)); \ (void)0; \ }) -#define __put_user_unaligned __put_user - -#define put_user(x, ptr) \ +#define __put_user(x, ptr) \ ({ \ - __typeof__(*(ptr)) __user *__p = (ptr); \ - might_fault(); \ - access_ok(VERIFY_WRITE, __p, sizeof(*__p)) ? \ - __put_user((x), __p) : \ - -EFAULT; \ + int __pu_err = 0; \ + __put_user_check((x), (ptr), __pu_err); \ + __pu_err; \ }) +#define __put_user_unaligned __put_user + +#define put_user __put_user + extern unsigned long __must_check __arch_copy_from_user(void *to, const void __user *from, unsigned long n); extern unsigned long __must_check __arch_copy_to_user(void __user *to, const void *from, unsigned long n); -extern unsigned long __must_check __copy_in_user(void __user *to, const void __user *from, unsigned long n); -extern unsigned long __must_check __clear_user(void __user *addr, unsigned long n); +extern unsigned long __must_check __arch_copy_in_user(void __user *to, const void __user *from, unsigned long n); static inline unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n) { kasan_check_write(to, n); check_object_size(to, n, false); - return __arch_copy_from_user(to, from, n); + return __arch_copy_from_user(to, __uaccess_mask_ptr(from), n); } static inline unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n) { kasan_check_read(from, n); check_object_size(from, n, true); - return __arch_copy_to_user(to, from, n); + return __arch_copy_to_user(__uaccess_mask_ptr(to), from, n); } static inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) @@ -403,22 +447,25 @@ static inline unsigned long __must_check copy_to_user(void __user *to, const voi return n; } -static inline unsigned long __must_check copy_in_user(void __user *to, const void __user *from, unsigned long n) +static inline unsigned long __must_check __copy_in_user(void __user *to, const void __user *from, unsigned long n) { if (access_ok(VERIFY_READ, from, n) && access_ok(VERIFY_WRITE, to, n)) - n = __copy_in_user(to, from, n); + n = __arch_copy_in_user(__uaccess_mask_ptr(to), __uaccess_mask_ptr(from), n); return n; } +#define copy_in_user __copy_in_user #define __copy_to_user_inatomic __copy_to_user #define __copy_from_user_inatomic __copy_from_user -static inline unsigned long __must_check clear_user(void __user *to, unsigned long n) +extern unsigned long __must_check __arch_clear_user(void __user *to, unsigned long n); +static inline unsigned long __must_check __clear_user(void __user *to, unsigned long n) { if (access_ok(VERIFY_WRITE, to, n)) - n = __clear_user(to, n); + n = __arch_clear_user(__uaccess_mask_ptr(to), n); return n; } +#define clear_user __clear_user extern long strncpy_from_user(char *dest, const char __user *src, long count); diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index 3051f86a9b5f..702de7a2b024 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -195,6 +195,12 @@ struct kvm_arch_memory_slot { #define KVM_REG_ARM_TIMER_CNT ARM64_SYS_REG(3, 3, 14, 3, 2) #define KVM_REG_ARM_TIMER_CVAL ARM64_SYS_REG(3, 3, 14, 0, 2) +/* KVM-as-firmware specific pseudo-registers */ +#define KVM_REG_ARM_FW (0x0014 << KVM_REG_ARM_COPROC_SHIFT) +#define KVM_REG_ARM_FW_REG(r) (KVM_REG_ARM64 | KVM_REG_SIZE_U64 | \ + KVM_REG_ARM_FW | ((r) & 0xffff)) +#define KVM_REG_ARM_PSCI_VERSION KVM_REG_ARM_FW_REG(0) + /* Device Control API: ARM VGIC */ #define KVM_DEV_ARM_VGIC_GRP_ADDR 0 #define KVM_DEV_ARM_VGIC_GRP_DIST_REGS 1 diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 6a7384eee08d..5c26760225ea 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -52,6 +52,10 @@ arm64-obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o \ cpu-reset.o arm64-obj-$(CONFIG_CRASH_DUMP) += crash_dump.o +ifeq ($(CONFIG_KVM),y) +arm64-obj-$(CONFIG_HARDEN_BRANCH_PREDICTOR) += bpi.o +endif + obj-y += $(arm64-obj-y) vdso/ probes/ obj-m += $(arm64-obj-m) head-y := head.o diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c index e9c4dc9e0ada..66be504edb6c 100644 --- a/arch/arm64/kernel/arm64ksyms.c +++ b/arch/arm64/kernel/arm64ksyms.c @@ -37,8 +37,8 @@ EXPORT_SYMBOL(clear_page); /* user mem (segment) */ EXPORT_SYMBOL(__arch_copy_from_user); EXPORT_SYMBOL(__arch_copy_to_user); -EXPORT_SYMBOL(__clear_user); -EXPORT_SYMBOL(__copy_in_user); +EXPORT_SYMBOL(__arch_clear_user); +EXPORT_SYMBOL(__arch_copy_in_user); /* physical memory */ EXPORT_SYMBOL(memstart_addr); diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 14709b7f19df..8e44b06277f3 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -24,6 +24,7 @@ #include <linux/kvm_host.h> #include <linux/suspend.h> #include <asm/cpufeature.h> +#include <asm/fixmap.h> #include <asm/thread_info.h> #include <asm/memory.h> #include <asm/smp_plat.h> @@ -148,11 +149,14 @@ int main(void) DEFINE(ARM_SMCCC_RES_X2_OFFS, offsetof(struct arm_smccc_res, a2)); DEFINE(ARM_SMCCC_QUIRK_ID_OFFS, offsetof(struct arm_smccc_quirk, id)); DEFINE(ARM_SMCCC_QUIRK_STATE_OFFS, offsetof(struct arm_smccc_quirk, state)); - BLANK(); DEFINE(HIBERN_PBE_ORIG, offsetof(struct pbe, orig_address)); DEFINE(HIBERN_PBE_ADDR, offsetof(struct pbe, address)); DEFINE(HIBERN_PBE_NEXT, offsetof(struct pbe, next)); DEFINE(ARM64_FTR_SYSVAL, offsetof(struct arm64_ftr_reg, sys_val)); + BLANK(); +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + DEFINE(TRAMP_VALIAS, TRAMP_VALIAS); +#endif return 0; } diff --git a/arch/arm64/kernel/bpi.S b/arch/arm64/kernel/bpi.S new file mode 100644 index 000000000000..dc4eb154e33b --- /dev/null +++ b/arch/arm64/kernel/bpi.S @@ -0,0 +1,75 @@ +/* + * Contains CPU specific branch predictor invalidation sequences + * + * Copyright (C) 2018 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> +#include <linux/arm-smccc.h> + +.macro ventry target + .rept 31 + nop + .endr + b \target +.endm + +.macro vectors target + ventry \target + 0x000 + ventry \target + 0x080 + ventry \target + 0x100 + ventry \target + 0x180 + + ventry \target + 0x200 + ventry \target + 0x280 + ventry \target + 0x300 + ventry \target + 0x380 + + ventry \target + 0x400 + ventry \target + 0x480 + ventry \target + 0x500 + ventry \target + 0x580 + + ventry \target + 0x600 + ventry \target + 0x680 + ventry \target + 0x700 + ventry \target + 0x780 +.endm + + .align 11 +ENTRY(__bp_harden_hyp_vecs_start) + .rept 4 + vectors __kvm_hyp_vector + .endr +ENTRY(__bp_harden_hyp_vecs_end) + +.macro smccc_workaround_1 inst + sub sp, sp, #(8 * 4) + stp x2, x3, [sp, #(8 * 0)] + stp x0, x1, [sp, #(8 * 2)] + mov w0, #ARM_SMCCC_ARCH_WORKAROUND_1 + \inst #0 + ldp x2, x3, [sp, #(8 * 0)] + ldp x0, x1, [sp, #(8 * 2)] + add sp, sp, #(8 * 4) +.endm + +ENTRY(__smccc_workaround_1_smc_start) + smccc_workaround_1 smc +ENTRY(__smccc_workaround_1_smc_end) + +ENTRY(__smccc_workaround_1_hvc_start) + smccc_workaround_1 hvc +ENTRY(__smccc_workaround_1_hvc_end) diff --git a/arch/arm64/kernel/cpu-reset.S b/arch/arm64/kernel/cpu-reset.S index 65f42d257414..f736a6f81ecd 100644 --- a/arch/arm64/kernel/cpu-reset.S +++ b/arch/arm64/kernel/cpu-reset.S @@ -16,7 +16,7 @@ #include <asm/virt.h> .text -.pushsection .idmap.text, "ax" +.pushsection .idmap.text, "awx" /* * __cpu_soft_restart(el2_switch, entry, arg0, arg1, arg2) - Helper for diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index b75e917aac46..74107134cc30 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -46,6 +46,147 @@ static int cpu_enable_trap_ctr_access(void *__unused) return 0; } +#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR +#include <asm/mmu_context.h> +#include <asm/cacheflush.h> + +DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data); + +#ifdef CONFIG_KVM +extern char __smccc_workaround_1_smc_start[]; +extern char __smccc_workaround_1_smc_end[]; +extern char __smccc_workaround_1_hvc_start[]; +extern char __smccc_workaround_1_hvc_end[]; + +static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start, + const char *hyp_vecs_end) +{ + void *dst = __bp_harden_hyp_vecs_start + slot * SZ_2K; + int i; + + for (i = 0; i < SZ_2K; i += 0x80) + memcpy(dst + i, hyp_vecs_start, hyp_vecs_end - hyp_vecs_start); + + flush_icache_range((uintptr_t)dst, (uintptr_t)dst + SZ_2K); +} + +static void __install_bp_hardening_cb(bp_hardening_cb_t fn, + const char *hyp_vecs_start, + const char *hyp_vecs_end) +{ + static int last_slot = -1; + static DEFINE_SPINLOCK(bp_lock); + int cpu, slot = -1; + + spin_lock(&bp_lock); + for_each_possible_cpu(cpu) { + if (per_cpu(bp_hardening_data.fn, cpu) == fn) { + slot = per_cpu(bp_hardening_data.hyp_vectors_slot, cpu); + break; + } + } + + if (slot == -1) { + last_slot++; + BUG_ON(((__bp_harden_hyp_vecs_end - __bp_harden_hyp_vecs_start) + / SZ_2K) <= last_slot); + slot = last_slot; + __copy_hyp_vect_bpi(slot, hyp_vecs_start, hyp_vecs_end); + } + + __this_cpu_write(bp_hardening_data.hyp_vectors_slot, slot); + __this_cpu_write(bp_hardening_data.fn, fn); + spin_unlock(&bp_lock); +} +#else +#define __smccc_workaround_1_smc_start NULL +#define __smccc_workaround_1_smc_end NULL +#define __smccc_workaround_1_hvc_start NULL +#define __smccc_workaround_1_hvc_end NULL + +static void __install_bp_hardening_cb(bp_hardening_cb_t fn, + const char *hyp_vecs_start, + const char *hyp_vecs_end) +{ + __this_cpu_write(bp_hardening_data.fn, fn); +} +#endif /* CONFIG_KVM */ + +static void install_bp_hardening_cb(const struct arm64_cpu_capabilities *entry, + bp_hardening_cb_t fn, + const char *hyp_vecs_start, + const char *hyp_vecs_end) +{ + u64 pfr0; + + if (!entry->matches(entry, SCOPE_LOCAL_CPU)) + return; + + pfr0 = read_cpuid(ID_AA64PFR0_EL1); + if (cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_CSV2_SHIFT)) + return; + + __install_bp_hardening_cb(fn, hyp_vecs_start, hyp_vecs_end); +} + +#include <uapi/linux/psci.h> +#include <linux/arm-smccc.h> +#include <linux/psci.h> + +static void call_smc_arch_workaround_1(void) +{ + arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL); +} + +static void call_hvc_arch_workaround_1(void) +{ + arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL); +} + +static int enable_smccc_arch_workaround_1(void *data) +{ + const struct arm64_cpu_capabilities *entry = data; + bp_hardening_cb_t cb; + void *smccc_start, *smccc_end; + struct arm_smccc_res res; + + if (!entry->matches(entry, SCOPE_LOCAL_CPU)) + return 0; + + if (psci_ops.smccc_version == SMCCC_VERSION_1_0) + return 0; + + switch (psci_ops.conduit) { + case PSCI_CONDUIT_HVC: + arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, + ARM_SMCCC_ARCH_WORKAROUND_1, &res); + if (res.a0) + return 0; + cb = call_hvc_arch_workaround_1; + smccc_start = __smccc_workaround_1_hvc_start; + smccc_end = __smccc_workaround_1_hvc_end; + break; + + case PSCI_CONDUIT_SMC: + arm_smccc_1_1_smc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, + ARM_SMCCC_ARCH_WORKAROUND_1, &res); + if (res.a0) + return 0; + cb = call_smc_arch_workaround_1; + smccc_start = __smccc_workaround_1_smc_start; + smccc_end = __smccc_workaround_1_smc_end; + break; + + default: + return 0; + } + + install_bp_hardening_cb(entry, cb, smccc_start, smccc_end); + + return 0; +} +#endif /* CONFIG_HARDEN_BRANCH_PREDICTOR */ + #define MIDR_RANGE(model, min, max) \ .def_scope = SCOPE_LOCAL_CPU, \ .matches = is_affected_midr_range, \ @@ -53,6 +194,13 @@ static int cpu_enable_trap_ctr_access(void *__unused) .midr_range_min = min, \ .midr_range_max = max +#define MIDR_ALL_VERSIONS(model) \ + .def_scope = SCOPE_LOCAL_CPU, \ + .matches = is_affected_midr_range, \ + .midr_model = model, \ + .midr_range_min = 0, \ + .midr_range_max = (MIDR_VARIANT_MASK | MIDR_REVISION_MASK) + const struct arm64_cpu_capabilities arm64_errata[] = { #if defined(CONFIG_ARM64_ERRATUM_826319) || \ defined(CONFIG_ARM64_ERRATUM_827319) || \ @@ -130,6 +278,38 @@ const struct arm64_cpu_capabilities arm64_errata[] = { .def_scope = SCOPE_LOCAL_CPU, .enable = cpu_enable_trap_ctr_access, }, +#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR + { + .capability = ARM64_HARDEN_BRANCH_PREDICTOR, + MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), + .enable = enable_smccc_arch_workaround_1, + }, + { + .capability = ARM64_HARDEN_BRANCH_PREDICTOR, + MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), + .enable = enable_smccc_arch_workaround_1, + }, + { + .capability = ARM64_HARDEN_BRANCH_PREDICTOR, + MIDR_ALL_VERSIONS(MIDR_CORTEX_A73), + .enable = enable_smccc_arch_workaround_1, + }, + { + .capability = ARM64_HARDEN_BRANCH_PREDICTOR, + MIDR_ALL_VERSIONS(MIDR_CORTEX_A75), + .enable = enable_smccc_arch_workaround_1, + }, + { + .capability = ARM64_HARDEN_BRANCH_PREDICTOR, + MIDR_ALL_VERSIONS(MIDR_BRCM_VULCAN), + .enable = enable_smccc_arch_workaround_1, + }, + { + .capability = ARM64_HARDEN_BRANCH_PREDICTOR, + MIDR_ALL_VERSIONS(MIDR_CAVIUM_THUNDERX2), + .enable = enable_smccc_arch_workaround_1, + }, +#endif { } }; @@ -143,15 +323,18 @@ void verify_local_cpu_errata_workarounds(void) { const struct arm64_cpu_capabilities *caps = arm64_errata; - for (; caps->matches; caps++) - if (!cpus_have_cap(caps->capability) && - caps->matches(caps, SCOPE_LOCAL_CPU)) { + for (; caps->matches; caps++) { + if (cpus_have_cap(caps->capability)) { + if (caps->enable) + caps->enable((void *)caps); + } else if (caps->matches(caps, SCOPE_LOCAL_CPU)) { pr_crit("CPU%d: Requires work around for %s, not detected" " at boot time\n", smp_processor_id(), caps->desc ? : "an erratum"); cpu_die_early(); } + } } void update_cpu_errata_workarounds(void) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index af5a1e34fb07..e60f6de1db09 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -94,7 +94,9 @@ static const struct arm64_ftr_bits ftr_id_aa64isar0[] = { }; static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { - ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 32, 32, 0), + ARM64_FTR_BITS(FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV2_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 32, 24, 0), ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 28, 4, 0), ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64PFR0_GIC_SHIFT, 4, 0), S_ARM64_FTR_BITS(FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_ASIMD_SHIFT, 4, ID_AA64PFR0_ASIMD_NI), @@ -747,6 +749,86 @@ static bool hyp_offset_low(const struct arm64_cpu_capabilities *entry, return idmap_addr > GENMASK(VA_BITS - 2, 0) && !is_kernel_in_hyp_mode(); } +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +static int __kpti_forced; /* 0: not forced, >0: forced on, <0: forced off */ + +static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, + int __unused) +{ + char const *str = "command line option"; + u64 pfr0 = read_system_reg(SYS_ID_AA64PFR0_EL1); + + /* + * For reasons that aren't entirely clear, enabling KPTI on Cavium + * ThunderX leads to apparent I-cache corruption of kernel text, which + * ends as well as you might imagine. Don't even try. + */ + if (cpus_have_cap(ARM64_WORKAROUND_CAVIUM_27456)) { + str = "ARM64_WORKAROUND_CAVIUM_27456"; + __kpti_forced = -1; + } + + /* Forced? */ + if (__kpti_forced) { + pr_info_once("kernel page table isolation forced %s by %s\n", + __kpti_forced > 0 ? "ON" : "OFF", str); + return __kpti_forced > 0; + } + + /* Useful for KASLR robustness */ + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) + return true; + + /* Don't force KPTI for CPUs that are not vulnerable */ + switch (read_cpuid_id() & MIDR_CPU_MODEL_MASK) { + case MIDR_CAVIUM_THUNDERX2: + case MIDR_BRCM_VULCAN: + return false; + } + + /* Defer to CPU feature registers */ + return !cpuid_feature_extract_unsigned_field(pfr0, + ID_AA64PFR0_CSV3_SHIFT); +} + +static int kpti_install_ng_mappings(void *__unused) +{ + typedef void (kpti_remap_fn)(int, int, phys_addr_t); + extern kpti_remap_fn idmap_kpti_install_ng_mappings; + kpti_remap_fn *remap_fn; + + static bool kpti_applied = false; + int cpu = smp_processor_id(); + + if (kpti_applied) + return 0; + + remap_fn = (void *)__pa_symbol(idmap_kpti_install_ng_mappings); + + cpu_install_idmap(); + remap_fn(cpu, num_online_cpus(), __pa_symbol(swapper_pg_dir)); + cpu_uninstall_idmap(); + + if (!cpu) + kpti_applied = true; + + return 0; +} + +static int __init parse_kpti(char *str) +{ + bool enabled; + int ret = strtobool(str, &enabled); + + if (ret) + return ret; + + __kpti_forced = enabled ? 1 : -1; + return 0; +} +__setup("kpti=", parse_kpti); +#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ + static const struct arm64_cpu_capabilities arm64_features[] = { { .desc = "GIC system register CPU interface", @@ -830,6 +912,15 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .def_scope = SCOPE_SYSTEM, .matches = hyp_offset_low, }, +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + { + .desc = "Kernel page table isolation (KPTI)", + .capability = ARM64_UNMAP_KERNEL_AT_EL0, + .def_scope = SCOPE_SYSTEM, + .matches = unmap_kernel_at_el0, + .enable = kpti_install_ng_mappings, + }, +#endif {}, }; @@ -923,6 +1014,25 @@ static void __init setup_elf_hwcaps(const struct arm64_cpu_capabilities *hwcaps) cap_set_elf_hwcap(hwcaps); } +/* + * Check if the current CPU has a given feature capability. + * Should be called from non-preemptible context. + */ +static bool __this_cpu_has_cap(const struct arm64_cpu_capabilities *cap_array, + unsigned int cap) +{ + const struct arm64_cpu_capabilities *caps; + + if (WARN_ON(preemptible())) + return false; + + for (caps = cap_array; caps->matches; caps++) + if (caps->capability == cap && + caps->matches(caps, SCOPE_LOCAL_CPU)) + return true; + return false; +} + void update_cpu_capabilities(const struct arm64_cpu_capabilities *caps, const char *info) { @@ -950,7 +1060,7 @@ void __init enable_cpu_capabilities(const struct arm64_cpu_capabilities *caps) * uses an IPI, giving us a PSTATE that disappears when * we return. */ - stop_machine(caps->enable, NULL, cpu_online_mask); + stop_machine(caps->enable, (void *)caps, cpu_online_mask); } /* @@ -991,8 +1101,9 @@ verify_local_elf_hwcaps(const struct arm64_cpu_capabilities *caps) } static void -verify_local_cpu_features(const struct arm64_cpu_capabilities *caps) +verify_local_cpu_features(const struct arm64_cpu_capabilities *caps_list) { + const struct arm64_cpu_capabilities *caps = caps_list; for (; caps->matches; caps++) { if (!cpus_have_cap(caps->capability)) continue; @@ -1000,13 +1111,13 @@ verify_local_cpu_features(const struct arm64_cpu_capabilities *caps) * If the new CPU misses an advertised feature, we cannot proceed * further, park the cpu. */ - if (!caps->matches(caps, SCOPE_LOCAL_CPU)) { + if (!__this_cpu_has_cap(caps_list, caps->capability)) { pr_crit("CPU%d: missing feature: %s\n", smp_processor_id(), caps->desc); cpu_die_early(); } if (caps->enable) - caps->enable(NULL); + caps->enable((void *)caps); } } @@ -1053,22 +1164,12 @@ static void __init setup_feature_capabilities(void) enable_cpu_capabilities(arm64_features); } -/* - * Check if the current CPU has a given feature capability. - * Should be called from non-preemptible context. - */ +extern const struct arm64_cpu_capabilities arm64_errata[]; + bool this_cpu_has_cap(unsigned int cap) { - const struct arm64_cpu_capabilities *caps; - - if (WARN_ON(preemptible())) - return false; - - for (caps = arm64_features; caps->desc; caps++) - if (caps->capability == cap && caps->matches) - return caps->matches(caps, SCOPE_LOCAL_CPU); - - return false; + return (__this_cpu_has_cap(arm64_features, cap) || + __this_cpu_has_cap(arm64_errata, cap)); } void __init setup_cpu_features(void) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 5cf8aa8556f1..2bfe18d2e739 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -31,9 +31,12 @@ #include <asm/memory.h> #include <asm/ptrace.h> #include <asm/uaccess.h> +#include <asm/mmu.h> +#include <asm/processor.h> #include <asm/thread_info.h> #include <asm/asm-uaccess.h> #include <asm/unistd.h> +#include <asm/kernel-pgtable.h> /* * Context tracking subsystem. Used to instrument transitions @@ -70,8 +73,31 @@ #define BAD_FIQ 2 #define BAD_ERROR 3 - .macro kernel_entry, el, regsize = 64 + .macro kernel_ventry, el, label, regsize = 64 + .align 7 +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +alternative_if ARM64_UNMAP_KERNEL_AT_EL0 + .if \el == 0 + .if \regsize == 64 + mrs x30, tpidrro_el0 + msr tpidrro_el0, xzr + .else + mov x30, xzr + .endif + .endif +alternative_else_nop_endif +#endif + sub sp, sp, #S_FRAME_SIZE + b el\()\el\()_\label + .endm + + .macro tramp_alias, dst, sym + mov_q \dst, TRAMP_VALIAS + add \dst, \dst, #(\sym - .entry.tramp.text) + .endm + + .macro kernel_entry, el, regsize = 64 .if \regsize == 32 mov w0, w0 // zero upper 32 bits of x0 .endif @@ -104,7 +130,7 @@ /* Save the task's original addr_limit and set USER_DS (TASK_SIZE_64) */ ldr x20, [tsk, #TSK_TI_ADDR_LIMIT] str x20, [sp, #S_ORIG_ADDR_LIMIT] - mov x20, #TASK_SIZE_64 + mov x20, #USER_DS str x20, [tsk, #TSK_TI_ADDR_LIMIT] /* No need to reset PSTATE.UAO, hardware's already set it to 0 for us */ .endif /* \el == 0 */ @@ -211,18 +237,20 @@ alternative_else_nop_endif .if \el == 0 ldr x23, [sp, #S_SP] // load return stack pointer msr sp_el0, x23 + tst x22, #PSR_MODE32_BIT // native task? + b.eq 3f + #ifdef CONFIG_ARM64_ERRATUM_845719 alternative_if ARM64_WORKAROUND_845719 - tbz x22, #4, 1f #ifdef CONFIG_PID_IN_CONTEXTIDR mrs x29, contextidr_el1 msr contextidr_el1, x29 #else msr contextidr_el1, xzr #endif -1: alternative_else_nop_endif #endif +3: .endif msr elr_el1, x21 // set up the return data @@ -244,7 +272,21 @@ alternative_else_nop_endif ldp x28, x29, [sp, #16 * 14] ldr lr, [sp, #S_LR] add sp, sp, #S_FRAME_SIZE // restore sp - eret // return to kernel + + .if \el == 0 +alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0 +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + bne 4f + msr far_el1, x30 + tramp_alias x30, tramp_exit_native + br x30 +4: + tramp_alias x30, tramp_exit_compat + br x30 +#endif + .else + eret + .endif .endm .macro irq_stack_entry @@ -316,31 +358,31 @@ tsk .req x28 // current thread_info .align 11 ENTRY(vectors) - ventry el1_sync_invalid // Synchronous EL1t - ventry el1_irq_invalid // IRQ EL1t - ventry el1_fiq_invalid // FIQ EL1t - ventry el1_error_invalid // Error EL1t + kernel_ventry 1, sync_invalid // Synchronous EL1t + kernel_ventry 1, irq_invalid // IRQ EL1t + kernel_ventry 1, fiq_invalid // FIQ EL1t + kernel_ventry 1, error_invalid // Error EL1t - ventry el1_sync // Synchronous EL1h - ventry el1_irq // IRQ EL1h - ventry el1_fiq_invalid // FIQ EL1h - ventry el1_error_invalid // Error EL1h + kernel_ventry 1, sync // Synchronous EL1h + kernel_ventry 1, irq // IRQ EL1h + kernel_ventry 1, fiq_invalid // FIQ EL1h + kernel_ventry 1, error_invalid // Error EL1h - ventry el0_sync // Synchronous 64-bit EL0 - ventry el0_irq // IRQ 64-bit EL0 - ventry el0_fiq_invalid // FIQ 64-bit EL0 - ventry el0_error_invalid // Error 64-bit EL0 + kernel_ventry 0, sync // Synchronous 64-bit EL0 + kernel_ventry 0, irq // IRQ 64-bit EL0 + kernel_ventry 0, fiq_invalid // FIQ 64-bit EL0 + kernel_ventry 0, error_invalid // Error 64-bit EL0 #ifdef CONFIG_COMPAT - ventry el0_sync_compat // Synchronous 32-bit EL0 - ventry el0_irq_compat // IRQ 32-bit EL0 - ventry el0_fiq_invalid_compat // FIQ 32-bit EL0 - ventry el0_error_invalid_compat // Error 32-bit EL0 + kernel_ventry 0, sync_compat, 32 // Synchronous 32-bit EL0 + kernel_ventry 0, irq_compat, 32 // IRQ 32-bit EL0 + kernel_ventry 0, fiq_invalid_compat, 32 // FIQ 32-bit EL0 + kernel_ventry 0, error_invalid_compat, 32 // Error 32-bit EL0 #else - ventry el0_sync_invalid // Synchronous 32-bit EL0 - ventry el0_irq_invalid // IRQ 32-bit EL0 - ventry el0_fiq_invalid // FIQ 32-bit EL0 - ventry el0_error_invalid // Error 32-bit EL0 + kernel_ventry 0, sync_invalid, 32 // Synchronous 32-bit EL0 + kernel_ventry 0, irq_invalid, 32 // IRQ 32-bit EL0 + kernel_ventry 0, fiq_invalid, 32 // FIQ 32-bit EL0 + kernel_ventry 0, error_invalid, 32 // Error 32-bit EL0 #endif END(vectors) @@ -612,13 +654,15 @@ el0_ia: * Instruction abort handling */ mrs x26, far_el1 - // enable interrupts before calling the main handler - enable_dbg_and_irq + msr daifclr, #(8 | 4 | 1) +#ifdef CONFIG_TRACE_IRQFLAGS + bl trace_hardirqs_off +#endif ct_user_exit mov x0, x26 mov x1, x25 mov x2, sp - bl do_mem_abort + bl do_el0_ia_bp_hardening b ret_to_user el0_fpsimd_acc: /* @@ -645,8 +689,10 @@ el0_sp_pc: * Stack or PC alignment exception handling */ mrs x26, far_el1 - // enable interrupts before calling the main handler - enable_dbg_and_irq + enable_dbg +#ifdef CONFIG_TRACE_IRQFLAGS + bl trace_hardirqs_off +#endif ct_user_exit mov x0, x26 mov x1, x25 @@ -705,6 +751,11 @@ el0_irq_naked: #endif ct_user_exit +#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR + tbz x22, #55, 1f + bl do_el0_irq_bp_hardening +1: +#endif irq_handler #ifdef CONFIG_TRACE_IRQFLAGS @@ -817,6 +868,7 @@ el0_svc_naked: // compat entry point b.ne __sys_trace cmp scno, sc_nr // check upper syscall limit b.hs ni_sys + mask_nospec64 scno, sc_nr, x19 // enforce bounds for syscall number ldr x16, [stbl, scno, lsl #3] // address in the syscall table blr x16 // call sys_* routine b ret_fast_syscall @@ -865,6 +917,105 @@ __ni_sys_trace: .popsection // .entry.text +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +/* + * Exception vectors trampoline. + */ + .pushsection ".entry.tramp.text", "ax" + + .macro tramp_map_kernel, tmp + mrs \tmp, ttbr1_el1 + sub \tmp, \tmp, #SWAPPER_DIR_SIZE + bic \tmp, \tmp, #USER_ASID_FLAG + msr ttbr1_el1, \tmp + .endm + + .macro tramp_unmap_kernel, tmp + mrs \tmp, ttbr1_el1 + add \tmp, \tmp, #SWAPPER_DIR_SIZE + orr \tmp, \tmp, #USER_ASID_FLAG + msr ttbr1_el1, \tmp + /* + * We avoid running the post_ttbr_update_workaround here because + * it's only needed by Cavium ThunderX, which requires KPTI to be + * disabled. + */ + .endm + + .macro tramp_ventry, regsize = 64 + .align 7 +1: + .if \regsize == 64 + msr tpidrro_el0, x30 // Restored in kernel_ventry + .endif + /* + * Defend against branch aliasing attacks by pushing a dummy + * entry onto the return stack and using a RET instruction to + * enter the full-fat kernel vectors. + */ + bl 2f + b . +2: + tramp_map_kernel x30 +#ifdef CONFIG_RANDOMIZE_BASE + adr x30, tramp_vectors + PAGE_SIZE + isb + ldr x30, [x30] +#else + ldr x30, =vectors +#endif + prfm plil1strm, [x30, #(1b - tramp_vectors)] + msr vbar_el1, x30 + add x30, x30, #(1b - tramp_vectors) + isb + ret + .endm + + .macro tramp_exit, regsize = 64 + adr x30, tramp_vectors + msr vbar_el1, x30 + tramp_unmap_kernel x30 + .if \regsize == 64 + mrs x30, far_el1 + .endif + eret + .endm + + .align 11 +ENTRY(tramp_vectors) + .space 0x400 + + tramp_ventry + tramp_ventry + tramp_ventry + tramp_ventry + + tramp_ventry 32 + tramp_ventry 32 + tramp_ventry 32 + tramp_ventry 32 +END(tramp_vectors) + +ENTRY(tramp_exit_native) + tramp_exit +END(tramp_exit_native) + +ENTRY(tramp_exit_compat) + tramp_exit 32 +END(tramp_exit_compat) + + .ltorg + .popsection // .entry.tramp.text +#ifdef CONFIG_RANDOMIZE_BASE + .pushsection ".rodata", "a" + .align PAGE_SHIFT + .globl __entry_tramp_data_start +__entry_tramp_data_start: + .quad vectors + .popsection // .rodata +#endif /* CONFIG_RANDOMIZE_BASE */ +#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ + /* * Special system call wrappers. */ diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index c18658690993..d479185d29c9 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -474,7 +474,7 @@ ENDPROC(__primary_switched) * end early head section, begin head code that is also used for * hotplug and needs to have the same protections as the text region */ - .section ".idmap.text","ax" + .section ".idmap.text","awx" ENTRY(kimage_vaddr) .quad _text - TEXT_OFFSET diff --git a/arch/arm64/kernel/pci.c b/arch/arm64/kernel/pci.c index 409abc45bdb6..1b3eb67edefb 100644 --- a/arch/arm64/kernel/pci.c +++ b/arch/arm64/kernel/pci.c @@ -175,8 +175,10 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) return NULL; root_ops = kzalloc_node(sizeof(*root_ops), GFP_KERNEL, node); - if (!root_ops) + if (!root_ops) { + kfree(ri); return NULL; + } ri->cfg = pci_acpi_setup_ecam_mapping(root); if (!ri->cfg) { diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 57ae9d9ed9bb..199a23f058d5 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -871,15 +871,24 @@ static int armv8pmu_set_event_filter(struct hw_perf_event *event, if (attr->exclude_idle) return -EPERM; - if (is_kernel_in_hyp_mode() && - attr->exclude_kernel != attr->exclude_hv) - return -EINVAL; + + /* + * If we're running in hyp mode, then we *are* the hypervisor. + * Therefore we ignore exclude_hv in this configuration, since + * there's no hypervisor to sample anyway. This is consistent + * with other architectures (x86 and Power). + */ + if (is_kernel_in_hyp_mode()) { + if (!attr->exclude_kernel) + config_base |= ARMV8_PMU_INCLUDE_EL2; + } else { + if (attr->exclude_kernel) + config_base |= ARMV8_PMU_EXCLUDE_EL1; + if (!attr->exclude_hv) + config_base |= ARMV8_PMU_INCLUDE_EL2; + } if (attr->exclude_user) config_base |= ARMV8_PMU_EXCLUDE_EL0; - if (!is_kernel_in_hyp_mode() && attr->exclude_kernel) - config_base |= ARMV8_PMU_EXCLUDE_EL1; - if (!attr->exclude_hv) - config_base |= ARMV8_PMU_INCLUDE_EL2; /* * Install the filter into config_base as this is used to diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 37ea9cf81c00..ede56a74bf9f 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -307,17 +307,17 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, static void tls_thread_switch(struct task_struct *next) { - unsigned long tpidr, tpidrro; + unsigned long tpidr; tpidr = read_sysreg(tpidr_el0); *task_user_tls(current) = tpidr; - tpidr = *task_user_tls(next); - tpidrro = is_compat_thread(task_thread_info(next)) ? - next->thread.tp_value : 0; + if (is_compat_thread(task_thread_info(next))) + write_sysreg(next->thread.tp_value, tpidrro_el0); + else if (!arm64_kernel_unmapped_at_el0()) + write_sysreg(0, tpidrro_el0); - write_sysreg(tpidr, tpidr_el0); - write_sysreg(tpidrro, tpidrro_el0); + write_sysreg(*task_user_tls(next), tpidr_el0); } /* Restore the UAO state depending on next's addr_limit */ diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index df67652e46f0..5a4fbcc744d2 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -95,7 +95,7 @@ ENTRY(__cpu_suspend_enter) ret ENDPROC(__cpu_suspend_enter) - .pushsection ".idmap.text", "ax" + .pushsection ".idmap.text", "awx" ENTRY(cpu_resume) bl el2_setup // if in EL2 drop to EL1 cleanly bl __cpu_setup diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index f6c003e9edbf..2783ff837a24 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -51,7 +51,7 @@ static const char *handler[]= { "Error" }; -int show_unhandled_signals = 1; +int show_unhandled_signals = 0; /* * Dump out the contents of some kernel memory nicely... diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index b8deffa9e1bf..34d3ed64fe8e 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -56,6 +56,17 @@ jiffies = jiffies_64; #define HIBERNATE_TEXT #endif +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +#define TRAMP_TEXT \ + . = ALIGN(PAGE_SIZE); \ + VMLINUX_SYMBOL(__entry_tramp_text_start) = .; \ + *(.entry.tramp.text) \ + . = ALIGN(PAGE_SIZE); \ + VMLINUX_SYMBOL(__entry_tramp_text_end) = .; +#else +#define TRAMP_TEXT +#endif + /* * The size of the PE/COFF section that covers the kernel image, which * runs from stext to _edata, must be a round multiple of the PE/COFF @@ -128,6 +139,7 @@ SECTIONS HYPERVISOR_TEXT IDMAP_TEXT HIBERNATE_TEXT + TRAMP_TEXT *(.fixup) *(.gnu.warning) . = ALIGN(16); @@ -221,6 +233,11 @@ SECTIONS . += RESERVED_TTBR0_SIZE; #endif +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + tramp_pg_dir = .; + . += PAGE_SIZE; +#endif + _end = .; STABS_DEBUG @@ -240,7 +257,10 @@ ASSERT(__idmap_text_end - (__idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K, ASSERT(__hibernate_exit_text_end - (__hibernate_exit_text_start & ~(SZ_4K - 1)) <= SZ_4K, "Hibernate exit text too big or misaligned") #endif - +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +ASSERT((__entry_tramp_text_end - __entry_tramp_text_start) == PAGE_SIZE, + "Entry trampoline text too big") +#endif /* * If padding is applied before .head.text, virt<->phys conversions will fail. */ diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 3f9e15722473..d3e0a2ffb383 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -25,6 +25,7 @@ #include <linux/module.h> #include <linux/vmalloc.h> #include <linux/fs.h> +#include <kvm/arm_psci.h> #include <asm/cputype.h> #include <asm/uaccess.h> #include <asm/kvm.h> @@ -205,7 +206,7 @@ static int get_timer_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu) { return num_core_regs() + kvm_arm_num_sys_reg_descs(vcpu) - + NUM_TIMER_REGS; + + kvm_arm_get_fw_num_regs(vcpu) + NUM_TIMER_REGS; } /** @@ -225,6 +226,11 @@ int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) uindices++; } + ret = kvm_arm_copy_fw_reg_indices(vcpu, uindices); + if (ret) + return ret; + uindices += kvm_arm_get_fw_num_regs(vcpu); + ret = copy_timer_indices(vcpu, uindices); if (ret) return ret; @@ -243,6 +249,9 @@ int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_CORE) return get_core_reg(vcpu, reg); + if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_FW) + return kvm_arm_get_fw_reg(vcpu, reg); + if (is_timer_reg(reg->id)) return get_timer_reg(vcpu, reg); @@ -259,6 +268,9 @@ int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_CORE) return set_core_reg(vcpu, reg); + if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_FW) + return kvm_arm_set_fw_reg(vcpu, reg); + if (is_timer_reg(reg->id)) return set_timer_reg(vcpu, reg); diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 2e6e9e99977b..efe43c5f2dc1 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -22,12 +22,15 @@ #include <linux/kvm.h> #include <linux/kvm_host.h> +#include <kvm/arm_psci.h> + #include <asm/esr.h> #include <asm/kvm_asm.h> #include <asm/kvm_coproc.h> #include <asm/kvm_emulate.h> #include <asm/kvm_mmu.h> -#include <asm/kvm_psci.h> +#include <asm/debug-monitors.h> +#include <asm/traps.h> #define CREATE_TRACE_POINTS #include "trace.h" @@ -42,7 +45,7 @@ static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run) kvm_vcpu_hvc_get_imm(vcpu)); vcpu->stat.hvc_exit_stat++; - ret = kvm_psci_call(vcpu); + ret = kvm_hvc_call_handler(vcpu); if (ret < 0) { vcpu_set_reg(vcpu, 0, ~0UL); return 1; @@ -53,7 +56,16 @@ static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run) static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run) { + /* + * "If an SMC instruction executed at Non-secure EL1 is + * trapped to EL2 because HCR_EL2.TSC is 1, the exception is a + * Trap exception, not a Secure Monitor Call exception [...]" + * + * We need to advance the PC after the trap, as it would + * otherwise return to the same address... + */ vcpu_set_reg(vcpu, 0, ~0UL); + kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); return 1; } diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 4e92399f7105..4e9d50c3e658 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -15,6 +15,7 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>. */ +#include <linux/arm-smccc.h> #include <linux/linkage.h> #include <asm/alternative.h> @@ -79,10 +80,11 @@ alternative_endif lsr x0, x1, #ESR_ELx_EC_SHIFT cmp x0, #ESR_ELx_EC_HVC64 + ccmp x0, #ESR_ELx_EC_HVC32, #4, ne b.ne el1_trap - mrs x1, vttbr_el2 // If vttbr is valid, the 64bit guest - cbnz x1, el1_trap // called HVC + mrs x1, vttbr_el2 // If vttbr is valid, the guest + cbnz x1, el1_hvc_guest // called HVC /* Here, we're pretty sure the host called HVC. */ ldp x0, x1, [sp], #16 @@ -101,6 +103,20 @@ alternative_endif 2: eret +el1_hvc_guest: + /* + * Fastest possible path for ARM_SMCCC_ARCH_WORKAROUND_1. + * The workaround has already been applied on the host, + * so let's quickly get back to the guest. We don't bother + * restoring x1, as it can be clobbered anyway. + */ + ldr x1, [sp] // Guest's x0 + eor w1, w1, #ARM_SMCCC_ARCH_WORKAROUND_1 + cbnz w1, el1_trap + mov x0, x1 + add sp, sp, #16 + eret + el1_trap: /* * x0: ESR_EC diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index 0c848c18ca44..c49d09387192 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -17,6 +17,9 @@ #include <linux/types.h> #include <linux/jump_label.h> +#include <uapi/linux/psci.h> + +#include <kvm/arm_psci.h> #include <asm/kvm_asm.h> #include <asm/kvm_emulate.h> @@ -50,7 +53,7 @@ static void __hyp_text __activate_traps_vhe(void) val &= ~CPACR_EL1_FPEN; write_sysreg(val, cpacr_el1); - write_sysreg(__kvm_hyp_vector, vbar_el1); + write_sysreg(kvm_get_hyp_vector(), vbar_el1); } static void __hyp_text __activate_traps_nvhe(void) @@ -404,6 +407,7 @@ void __hyp_text __noreturn __hyp_panic(void) vcpu = (struct kvm_vcpu *)read_sysreg(tpidr_el2); host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context); + __timer_save_state(vcpu); __deactivate_traps(vcpu); __deactivate_vm(vcpu); __sysreg_restore_host_state(host_ctxt); diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S index d7150e30438a..72aa167cdaf2 100644 --- a/arch/arm64/lib/clear_user.S +++ b/arch/arm64/lib/clear_user.S @@ -21,7 +21,7 @@ .text -/* Prototype: int __clear_user(void *addr, size_t sz) +/* Prototype: int __arch_clear_user(void *addr, size_t sz) * Purpose : clear some user memory * Params : addr - user memory address to clear * : sz - number of bytes to clear @@ -29,7 +29,7 @@ * * Alignment fixed up by hardware. */ -ENTRY(__clear_user) +ENTRY(__arch_clear_user) uaccess_enable_not_uao x2, x3 mov x2, x1 // save the size for fixup return subs x1, x1, #8 @@ -52,7 +52,7 @@ uao_user_alternative 9f, strb, sttrb, wzr, x0, 0 5: mov x0, #0 uaccess_disable_not_uao x2 ret -ENDPROC(__clear_user) +ENDPROC(__arch_clear_user) .section .fixup,"ax" .align 2 diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S index 718b1c4e2f85..984570b0bd20 100644 --- a/arch/arm64/lib/copy_in_user.S +++ b/arch/arm64/lib/copy_in_user.S @@ -64,14 +64,14 @@ .endm end .req x5 -ENTRY(__copy_in_user) +ENTRY(__arch_copy_in_user) uaccess_enable_not_uao x3, x4 add end, x0, x2 #include "copy_template.S" uaccess_disable_not_uao x3 mov x0, #0 ret -ENDPROC(__copy_in_user) +ENDPROC(__arch_copy_in_user) .section .fixup,"ax" .align 2 diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index 4c63cb154859..c841ccecbc4d 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -39,7 +39,16 @@ static cpumask_t tlb_flush_pending; #define ASID_MASK (~GENMASK(asid_bits - 1, 0)) #define ASID_FIRST_VERSION (1UL << asid_bits) -#define NUM_USER_ASIDS ASID_FIRST_VERSION + +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +#define NUM_USER_ASIDS (ASID_FIRST_VERSION >> 1) +#define asid2idx(asid) (((asid) & ~ASID_MASK) >> 1) +#define idx2asid(idx) (((idx) << 1) & ~ASID_MASK) +#else +#define NUM_USER_ASIDS (ASID_FIRST_VERSION) +#define asid2idx(asid) ((asid) & ~ASID_MASK) +#define idx2asid(idx) asid2idx(idx) +#endif /* Get the ASIDBits supported by the current CPU */ static u32 get_cpu_asid_bits(void) @@ -104,7 +113,7 @@ static void flush_context(unsigned int cpu) */ if (asid == 0) asid = per_cpu(reserved_asids, i); - __set_bit(asid & ~ASID_MASK, asid_map); + __set_bit(asid2idx(asid), asid_map); per_cpu(reserved_asids, i) = asid; } @@ -159,16 +168,16 @@ static u64 new_context(struct mm_struct *mm, unsigned int cpu) * We had a valid ASID in a previous life, so try to re-use * it if possible. */ - asid &= ~ASID_MASK; - if (!__test_and_set_bit(asid, asid_map)) + if (!__test_and_set_bit(asid2idx(asid), asid_map)) return newasid; } /* * Allocate a free ASID. If we can't find one, take a note of the - * currently active ASIDs and mark the TLBs as requiring flushes. - * We always count from ASID #1, as we use ASID #0 when setting a - * reserved TTBR0 for the init_mm. + * currently active ASIDs and mark the TLBs as requiring flushes. We + * always count from ASID #2 (index 1), as we use ASID #0 when setting + * a reserved TTBR0 for the init_mm and we allocate ASIDs in even/odd + * pairs. */ asid = find_next_zero_bit(asid_map, NUM_USER_ASIDS, cur_idx); if (asid != NUM_USER_ASIDS) @@ -185,7 +194,7 @@ static u64 new_context(struct mm_struct *mm, unsigned int cpu) set_asid: __set_bit(asid, asid_map); cur_idx = asid; - return asid | generation; + return idx2asid(asid) | generation; } void check_and_switch_context(struct mm_struct *mm, unsigned int cpu) @@ -221,6 +230,9 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu) raw_spin_unlock_irqrestore(&cpu_asid_lock, flags); switch_mm_fastpath: + + arm64_apply_bp_hardening(); + /* * Defer TTBR0_EL1 setting for user threads to uaccess_enable() when * emulating PAN. @@ -229,6 +241,15 @@ switch_mm_fastpath: cpu_switch_mm(mm->pgd, mm); } +/* Errata workaround post TTBRx_EL1 update. */ +asmlinkage void post_ttbr_update_workaround(void) +{ + asm(ALTERNATIVE("nop; nop; nop", + "ic iallu; dsb nsh; isb", + ARM64_WORKAROUND_CAVIUM_27456, + CONFIG_CAVIUM_ERRATUM_27456)); +} + static int asids_init(void) { asid_bits = get_cpu_asid_bits(); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index cf4e3f53a1e3..e4e7b1991985 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -338,7 +338,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, mm_flags |= FAULT_FLAG_WRITE; } - if (addr < USER_DS && is_permission_fault(esr, regs)) { + if (addr < TASK_SIZE && is_permission_fault(esr, regs)) { /* regs->orig_addr_limit may be 0 if we entered from EL0 */ if (regs->orig_addr_limit == KERNEL_DS) die("Accessing user space memory with fs=KERNEL_DS", regs, esr); @@ -596,6 +596,29 @@ asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr, arm64_notify_die("", regs, &info, esr); } +asmlinkage void __exception do_el0_irq_bp_hardening(void) +{ + /* PC has already been checked in entry.S */ + arm64_apply_bp_hardening(); +} + +asmlinkage void __exception do_el0_ia_bp_hardening(unsigned long addr, + unsigned int esr, + struct pt_regs *regs) +{ + /* + * We've taken an instruction abort from userspace and not yet + * re-enabled IRQs. If the address is a kernel address, apply + * BP hardening prior to enabling IRQs and pre-emption. + */ + if (addr > TASK_SIZE) + arm64_apply_bp_hardening(); + + local_irq_enable(); + do_mem_abort(addr, esr, regs); +} + + /* * Handle stack alignment exceptions. */ @@ -606,6 +629,12 @@ asmlinkage void __exception do_sp_pc_abort(unsigned long addr, struct siginfo info; struct task_struct *tsk = current; + if (user_mode(regs)) { + if (instruction_pointer(regs) > TASK_SIZE) + arm64_apply_bp_hardening(); + local_irq_enable(); + } + if (show_unhandled_signals && unhandled_signal(tsk, SIGBUS)) pr_info_ratelimited("%s[%d]: %s exception: pc=%p sp=%p\n", tsk->comm, task_pid_nr(tsk), @@ -665,6 +694,9 @@ asmlinkage int __exception do_debug_exception(unsigned long addr, if (interrupts_enabled(regs)) trace_hardirqs_off(); + if (user_mode(regs) && instruction_pointer(regs) > TASK_SIZE) + arm64_apply_bp_hardening(); + if (!inf->fn(addr, esr, regs)) { rv = 1; } else { diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index 01c171723bb3..caf75abf6ae7 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -18,6 +18,7 @@ #include <linux/elf.h> #include <linux/fs.h> +#include <linux/memblock.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/export.h> @@ -102,12 +103,18 @@ void arch_pick_mmap_layout(struct mm_struct *mm) */ int valid_phys_addr_range(phys_addr_t addr, size_t size) { - if (addr < PHYS_OFFSET) - return 0; - if (addr + size > __pa(high_memory - 1) + 1) - return 0; - - return 1; + /* + * Check whether addr is covered by a memory region without the + * MEMBLOCK_NOMAP attribute, and whether that region covers the + * entire range. In theory, this could lead to false negatives + * if the range is covered by distinct but adjacent memory regions + * that only differ in other attributes. However, few of such + * attributes have been defined, and it is debatable whether it + * follows that /dev/mem read() calls should be able traverse + * such boundaries. + */ + return memblock_is_region_memory(addr, size) && + memblock_is_map_memory(addr); } /* diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index f1f1d64d0715..5b1184c369d5 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -422,6 +422,37 @@ static void __init map_kernel_segment(pgd_t *pgd, void *va_start, void *va_end, vm_area_add_early(vma); } +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +static int __init map_entry_trampoline(void) +{ + extern char __entry_tramp_text_start[]; + + pgprot_t prot = rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC; + phys_addr_t pa_start = __pa_symbol(__entry_tramp_text_start); + + /* The trampoline is always mapped and can therefore be global */ + pgprot_val(prot) &= ~PTE_NG; + + /* Map only the text into the trampoline page table */ + memset(tramp_pg_dir, 0, PGD_SIZE); + __create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS, PAGE_SIZE, + prot, pgd_pgtable_alloc, 0); + + /* Map both the text and data into the kernel page table */ + __set_fixmap(FIX_ENTRY_TRAMP_TEXT, pa_start, prot); + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { + extern char __entry_tramp_data_start[]; + + __set_fixmap(FIX_ENTRY_TRAMP_DATA, + __pa_symbol(__entry_tramp_data_start), + PAGE_KERNEL_RO); + } + + return 0; +} +core_initcall(map_entry_trampoline); +#endif + /* * Create fine-grained mappings for the kernel. */ @@ -775,3 +806,13 @@ int pmd_clear_huge(pmd_t *pmd) pmd_clear(pmd); return 1; } + +int pud_free_pmd_page(pud_t *pud) +{ + return pud_none(*pud); +} + +int pmd_free_pte_page(pmd_t *pmd) +{ + return pmd_none(*pmd); +} diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 32682be978e0..9d37f4e61166 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -86,7 +86,7 @@ ENDPROC(cpu_do_suspend) * * x0: Address of context pointer */ - .pushsection ".idmap.text", "ax" + .pushsection ".idmap.text", "awx" ENTRY(cpu_do_resume) ldp x2, x3, [x0] ldp x4, x5, [x0, #16] @@ -138,15 +138,27 @@ ENDPROC(cpu_do_resume) * - pgd_phys - physical address of new TTB */ ENTRY(cpu_do_switch_mm) + mrs x2, ttbr1_el1 mmid x1, x1 // get mm->context.id - bfi x0, x1, #48, #16 // set the ASID - msr ttbr0_el1, x0 // set TTBR0 + bfi x2, x1, #48, #16 // set the ASID + msr ttbr1_el1, x2 // in TTBR1 (since TCR.A1 is set) isb - post_ttbr0_update_workaround - ret + msr ttbr0_el1, x0 // now update TTBR0 + isb + b post_ttbr_update_workaround // Back to C code... ENDPROC(cpu_do_switch_mm) - .pushsection ".idmap.text", "ax" + .pushsection ".idmap.text", "awx" + +.macro __idmap_cpu_set_reserved_ttbr1, tmp1, tmp2 + adrp \tmp1, empty_zero_page + msr ttbr1_el1, \tmp1 + isb + tlbi vmalle1 + dsb nsh + isb +.endm + /* * void idmap_cpu_replace_ttbr1(phys_addr_t new_pgd) * @@ -157,13 +169,7 @@ ENTRY(idmap_cpu_replace_ttbr1) mrs x2, daif msr daifset, #0xf - adrp x1, empty_zero_page - msr ttbr1_el1, x1 - isb - - tlbi vmalle1 - dsb nsh - isb + __idmap_cpu_set_reserved_ttbr1 x1, x3 msr ttbr1_el1, x0 isb @@ -174,13 +180,196 @@ ENTRY(idmap_cpu_replace_ttbr1) ENDPROC(idmap_cpu_replace_ttbr1) .popsection +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + .pushsection ".idmap.text", "awx" + + .macro __idmap_kpti_get_pgtable_ent, type + dc cvac, cur_\()\type\()p // Ensure any existing dirty + dmb sy // lines are written back before + ldr \type, [cur_\()\type\()p] // loading the entry + tbz \type, #0, next_\()\type // Skip invalid entries + .endm + + .macro __idmap_kpti_put_pgtable_ent_ng, type + orr \type, \type, #PTE_NG // Same bit for blocks and pages + str \type, [cur_\()\type\()p] // Update the entry and ensure it + dc civac, cur_\()\type\()p // is visible to all CPUs. + .endm + +/* + * void __kpti_install_ng_mappings(int cpu, int num_cpus, phys_addr_t swapper) + * + * Called exactly once from stop_machine context by each CPU found during boot. + */ +__idmap_kpti_flag: + .long 1 +ENTRY(idmap_kpti_install_ng_mappings) + cpu .req w0 + num_cpus .req w1 + swapper_pa .req x2 + swapper_ttb .req x3 + flag_ptr .req x4 + cur_pgdp .req x5 + end_pgdp .req x6 + pgd .req x7 + cur_pudp .req x8 + end_pudp .req x9 + pud .req x10 + cur_pmdp .req x11 + end_pmdp .req x12 + pmd .req x13 + cur_ptep .req x14 + end_ptep .req x15 + pte .req x16 + + mrs swapper_ttb, ttbr1_el1 + adr flag_ptr, __idmap_kpti_flag + + cbnz cpu, __idmap_kpti_secondary + + /* We're the boot CPU. Wait for the others to catch up */ + sevl +1: wfe + ldaxr w18, [flag_ptr] + eor w18, w18, num_cpus + cbnz w18, 1b + + /* We need to walk swapper, so turn off the MMU. */ + mrs x18, sctlr_el1 + bic x18, x18, #SCTLR_ELx_M + msr sctlr_el1, x18 + isb + + /* Everybody is enjoying the idmap, so we can rewrite swapper. */ + /* PGD */ + mov cur_pgdp, swapper_pa + add end_pgdp, cur_pgdp, #(PTRS_PER_PGD * 8) +do_pgd: __idmap_kpti_get_pgtable_ent pgd + tbnz pgd, #1, walk_puds + __idmap_kpti_put_pgtable_ent_ng pgd +next_pgd: + add cur_pgdp, cur_pgdp, #8 + cmp cur_pgdp, end_pgdp + b.ne do_pgd + + /* Publish the updated tables and nuke all the TLBs */ + dsb sy + tlbi vmalle1is + dsb ish + isb + + /* We're done: fire up the MMU again */ + mrs x18, sctlr_el1 + orr x18, x18, #SCTLR_ELx_M + msr sctlr_el1, x18 + isb + + /* Set the flag to zero to indicate that we're all done */ + str wzr, [flag_ptr] + ret + + /* PUD */ +walk_puds: + .if CONFIG_PGTABLE_LEVELS > 3 + pte_to_phys cur_pudp, pgd + add end_pudp, cur_pudp, #(PTRS_PER_PUD * 8) +do_pud: __idmap_kpti_get_pgtable_ent pud + tbnz pud, #1, walk_pmds + __idmap_kpti_put_pgtable_ent_ng pud +next_pud: + add cur_pudp, cur_pudp, 8 + cmp cur_pudp, end_pudp + b.ne do_pud + b next_pgd + .else /* CONFIG_PGTABLE_LEVELS <= 3 */ + mov pud, pgd + b walk_pmds +next_pud: + b next_pgd + .endif + + /* PMD */ +walk_pmds: + .if CONFIG_PGTABLE_LEVELS > 2 + pte_to_phys cur_pmdp, pud + add end_pmdp, cur_pmdp, #(PTRS_PER_PMD * 8) +do_pmd: __idmap_kpti_get_pgtable_ent pmd + tbnz pmd, #1, walk_ptes + __idmap_kpti_put_pgtable_ent_ng pmd +next_pmd: + add cur_pmdp, cur_pmdp, #8 + cmp cur_pmdp, end_pmdp + b.ne do_pmd + b next_pud + .else /* CONFIG_PGTABLE_LEVELS <= 2 */ + mov pmd, pud + b walk_ptes +next_pmd: + b next_pud + .endif + + /* PTE */ +walk_ptes: + pte_to_phys cur_ptep, pmd + add end_ptep, cur_ptep, #(PTRS_PER_PTE * 8) +do_pte: __idmap_kpti_get_pgtable_ent pte + __idmap_kpti_put_pgtable_ent_ng pte +next_pte: + add cur_ptep, cur_ptep, #8 + cmp cur_ptep, end_ptep + b.ne do_pte + b next_pmd + + /* Secondary CPUs end up here */ +__idmap_kpti_secondary: + /* Uninstall swapper before surgery begins */ + __idmap_cpu_set_reserved_ttbr1 x18, x17 + + /* Increment the flag to let the boot CPU we're ready */ +1: ldxr w18, [flag_ptr] + add w18, w18, #1 + stxr w17, w18, [flag_ptr] + cbnz w17, 1b + + /* Wait for the boot CPU to finish messing around with swapper */ + sevl +1: wfe + ldxr w18, [flag_ptr] + cbnz w18, 1b + + /* All done, act like nothing happened */ + msr ttbr1_el1, swapper_ttb + isb + ret + + .unreq cpu + .unreq num_cpus + .unreq swapper_pa + .unreq swapper_ttb + .unreq flag_ptr + .unreq cur_pgdp + .unreq end_pgdp + .unreq pgd + .unreq cur_pudp + .unreq end_pudp + .unreq pud + .unreq cur_pmdp + .unreq end_pmdp + .unreq pmd + .unreq cur_ptep + .unreq end_ptep + .unreq pte +ENDPROC(idmap_kpti_install_ng_mappings) + .popsection +#endif + /* * __cpu_setup * * Initialise the processor for turning the MMU on. Return in x0 the * value of the SCTLR_EL1 register. */ - .pushsection ".idmap.text", "ax" + .pushsection ".idmap.text", "awx" ENTRY(__cpu_setup) tlbi vmalle1 // Invalidate local TLB dsb nsh @@ -224,7 +413,7 @@ ENTRY(__cpu_setup) * both user and kernel. */ ldr x10, =TCR_TxSZ(VA_BITS) | TCR_CACHE_FLAGS | TCR_SMP_FLAGS | \ - TCR_TG_FLAGS | TCR_ASID16 | TCR_TBI0 + TCR_TG_FLAGS | TCR_ASID16 | TCR_TBI0 | TCR_A1 tcr_set_idmap_t0sz x10, x9 /* @@ -242,6 +431,11 @@ ENTRY(__cpu_setup) cbz x9, 2f cmp x9, #2 b.lt 1f +#ifdef CONFIG_ARM64_ERRATUM_1024718 + /* Disable hardware DBM on Cortex-A55 r0p0, r0p1 & r1p0 */ + cpu_midr_match MIDR_CORTEX_A55, MIDR_CPU_VAR_REV(0, 0), MIDR_CPU_VAR_REV(1, 0), x1, x2, x3, x4 + cbnz x1, 1f +#endif orr x10, x10, #TCR_HD // hardware Dirty flag update 1: orr x10, x10, #TCR_HA // hardware Access flag update 2: diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index d8199e12fb6e..b47a26f4290c 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -234,8 +234,9 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx) off = offsetof(struct bpf_array, map.max_entries); emit_a64_mov_i64(tmp, off, ctx); emit(A64_LDR32(tmp, r2, tmp), ctx); + emit(A64_MOV(0, r3, r3), ctx); emit(A64_CMP(0, r3, tmp), ctx); - emit(A64_B_(A64_COND_GE, jmp_offset), ctx); + emit(A64_B_(A64_COND_CS, jmp_offset), ctx); /* if (tail_call_cnt > MAX_TAIL_CALL_CNT) * goto out; @@ -243,7 +244,7 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx) */ emit_a64_mov_i64(tmp, MAX_TAIL_CALL_CNT, ctx); emit(A64_CMP(1, tcc, tmp), ctx); - emit(A64_B_(A64_COND_GT, jmp_offset), ctx); + emit(A64_B_(A64_COND_HI, jmp_offset), ctx); emit(A64_ADD_I(1, tcc, tcc, 1), ctx); /* prog = array->ptrs[index]; |