Support SM3 acceleration instructions (#108)
Add support for the seven Neon SM3 acceleration instructions introduced by the Armv8.2 SM3 cryptographic extension: sm3ss1, sm3tt1a, sm3tt1b, sm3tt2a, sm3tt2b, sm3partw1 and sm3partw2.
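A minimal usage sketch (not part of this patch): one "a"-type compression round emitted through the new MacroAssembler wrappers. The register roles are assumptions for illustration only - v8 holding the A..D state words, v9 holding E..H, v2 holding the rotated round constant in lane 3, v3/v4 holding W[j..j+3] and W[j+4..j+7], and v5/v10 used as scratch:

  #include "aarch64/macro-assembler-aarch64.h"
  using namespace vixl::aarch64;

  void EmitSm3RoundA(MacroAssembler* masm) {
    masm->Eor(v10.V16B(), v3.V16B(), v4.V16B());           // W'j = Wj ^ W(j+4).
    masm->Sm3ss1(v5.V4S(), v8.V4S(), v2.V4S(), v9.V4S());  // SS1 in lane 3.
    masm->Sm3tt1a(v8.V4S(), v5.V4S(), v10.V4S(), 0);       // Update A..D.
    masm->Sm3tt2a(v9.V4S(), v5.V4S(), v3.V4S(), 0);        // Update E..H.
  }

The "b"-type forms (Sm3tt1b/Sm3tt2b) cover rounds 16 to 63, and Sm3partw1/Sm3partw2 compute the message-schedule extension between round groups.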
diff --git a/src/aarch64/assembler-aarch64.cc b/src/aarch64/assembler-aarch64.cc
index de2dbc9..534c1f7 100644
--- a/src/aarch64/assembler-aarch64.cc
+++ b/src/aarch64/assembler-aarch64.cc
@@ -6053,6 +6053,70 @@
Emit(0x4e286800 | Rd(vd) | Rn(vn));
}
+void Assembler::sm3partw1(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm) {
+ VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+ VIXL_ASSERT(CPUHas(CPUFeatures::kSM3));
+ VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S());
+
+ Emit(0xce60c000 | Rd(vd) | Rn(vn) | Rm(vm));
+}
+
+void Assembler::sm3partw2(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm) {
+ VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+ VIXL_ASSERT(CPUHas(CPUFeatures::kSM3));
+ VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S());
+
+ Emit(0xce60c400 | Rd(vd) | Rn(vn) | Rm(vm));
+}
+
+void Assembler::sm3ss1(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm,
+ const VRegister& va) {
+ VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+ VIXL_ASSERT(CPUHas(CPUFeatures::kSM3));
+ VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S() && va.Is4S());
+
+ Emit(0xce400000 | Rd(vd) | Rn(vn) | Rm(vm) | Ra(va));
+}
+
+void Assembler::sm3tt1a(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm,
+ int index) {
+ VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+ VIXL_ASSERT(CPUHas(CPUFeatures::kSM3));
+ VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S());
+ VIXL_ASSERT(IsUint2(index));
+
+ Instr i = static_cast<uint32_t>(index) << 12;
+ Emit(0xce408000 | Rd(vd) | Rn(vn) | Rm(vm) | i);
+}
+
+void Assembler::sm3tt1b(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm,
+ int index) {
+ VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+ VIXL_ASSERT(CPUHas(CPUFeatures::kSM3));
+ VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S());
+ VIXL_ASSERT(IsUint2(index));
+
+ Instr i = static_cast<uint32_t>(index) << 12;
+ Emit(0xce408400 | Rd(vd) | Rn(vn) | Rm(vm) | i);
+}
+
+void Assembler::sm3tt2a(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm,
+ int index) {
+ VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+ VIXL_ASSERT(CPUHas(CPUFeatures::kSM3));
+ VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S());
+ VIXL_ASSERT(IsUint2(index));
+
+ Instr i = static_cast<uint32_t>(index) << 12;
+ Emit(0xce408800 | Rd(vd) | Rn(vn) | Rm(vm) | i);
+}
+
+void Assembler::sm3tt2b(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm,
+ int index) {
+ VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+ VIXL_ASSERT(CPUHas(CPUFeatures::kSM3));
+ VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S());
+ VIXL_ASSERT(IsUint2(index));
+
+ Instr i = static_cast<uint32_t>(index) << 12;
+ Emit(0xce408c00 | Rd(vd) | Rn(vn) | Rm(vm) | i);
+}
+
// Note:
// For all ToImm instructions below, a difference in case
// for the same letter indicates a negated bit.
diff --git a/src/aarch64/assembler-aarch64.h b/src/aarch64/assembler-aarch64.h
index cc4b346..c1e4e6a 100644
--- a/src/aarch64/assembler-aarch64.h
+++ b/src/aarch64/assembler-aarch64.h
@@ -3696,6 +3696,42 @@
// AES mix columns.
void aesmc(const VRegister& vd, const VRegister& vn);
+ // SM3PARTW1.
+ void sm3partw1(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm);
+
+ // SM3PARTW2.
+ void sm3partw2(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm);
+
+ // SM3SS1.
+ void sm3ss1(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm,
+ const VRegister& va);
+
+ // SM3TT1A.
+ void sm3tt1a(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm,
+ int index);
+
+ // SM3TT1B.
+ void sm3tt1b(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm,
+ int index);
+
+ // SM3TT2A.
+ void sm3tt2a(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm,
+ int index);
+
+ // SM3TT2B.
+ void sm3tt2b(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm,
+ int index);
+
// Scalable Vector Extensions.
// Absolute value (predicated).
diff --git a/src/aarch64/cpu-features-auditor-aarch64.cc b/src/aarch64/cpu-features-auditor-aarch64.cc
index d815924..66d29f0 100644
--- a/src/aarch64/cpu-features-auditor-aarch64.cc
+++ b/src/aarch64/cpu-features-auditor-aarch64.cc
@@ -279,6 +279,12 @@
USE(instr);
}
+void CPUFeaturesAuditor::VisitCryptoSM3(const Instruction* instr) {
+ RecordInstructionFeaturesScope scope(this);
+ scope.Record(CPUFeatures::kNEON, CPUFeatures::kSM3);
+ USE(instr);
+}
+
void CPUFeaturesAuditor::VisitDataProcessing1Source(const Instruction* instr) {
RecordInstructionFeaturesScope scope(this);
switch (instr->Mask(DataProcessing1SourceMask)) {
diff --git a/src/aarch64/cpu-features-auditor-aarch64.h b/src/aarch64/cpu-features-auditor-aarch64.h
index 67de644..2496756 100644
--- a/src/aarch64/cpu-features-auditor-aarch64.h
+++ b/src/aarch64/cpu-features-auditor-aarch64.h
@@ -113,6 +113,7 @@
#define DECLARE(A) virtual void Visit##A(const Instruction* instr);
VISITOR_LIST(DECLARE)
#undef DECLARE
+ void VisitCryptoSM3(const Instruction* instr);
void LoadStoreHelper(const Instruction* instr);
void LoadStorePairHelper(const Instruction* instr);
diff --git a/src/aarch64/decoder-visitor-map-aarch64.h b/src/aarch64/decoder-visitor-map-aarch64.h
index db51cad..b4b39f5 100644
--- a/src/aarch64/decoder-visitor-map-aarch64.h
+++ b/src/aarch64/decoder-visitor-map-aarch64.h
@@ -2656,13 +2656,13 @@
{"ldtrsw_64_ldst_unpriv"_h, &VISITORCLASS::VisitUnimplemented}, \
{"ldtr_32_ldst_unpriv"_h, &VISITORCLASS::VisitUnimplemented}, \
{"ldtr_64_ldst_unpriv"_h, &VISITORCLASS::VisitUnimplemented}, \
- {"sm3partw1_vvv4_cryptosha512_3"_h, &VISITORCLASS::VisitUnimplemented}, \
- {"sm3partw2_vvv4_cryptosha512_3"_h, &VISITORCLASS::VisitUnimplemented}, \
- {"sm3ss1_vvv4_crypto4"_h, &VISITORCLASS::VisitUnimplemented}, \
- {"sm3tt1a_vvv4_crypto3_imm2"_h, &VISITORCLASS::VisitUnimplemented}, \
- {"sm3tt1b_vvv4_crypto3_imm2"_h, &VISITORCLASS::VisitUnimplemented}, \
- {"sm3tt2a_vvv4_crypto3_imm2"_h, &VISITORCLASS::VisitUnimplemented}, \
- {"sm3tt2b_vvv_crypto3_imm2"_h, &VISITORCLASS::VisitUnimplemented}, \
+ {"sm3partw1_vvv4_cryptosha512_3"_h, &VISITORCLASS::VisitCryptoSM3}, \
+ {"sm3partw2_vvv4_cryptosha512_3"_h, &VISITORCLASS::VisitCryptoSM3}, \
+ {"sm3ss1_vvv4_crypto4"_h, &VISITORCLASS::VisitCryptoSM3}, \
+ {"sm3tt1a_vvv4_crypto3_imm2"_h, &VISITORCLASS::VisitCryptoSM3}, \
+ {"sm3tt1b_vvv4_crypto3_imm2"_h, &VISITORCLASS::VisitCryptoSM3}, \
+ {"sm3tt2a_vvv4_crypto3_imm2"_h, &VISITORCLASS::VisitCryptoSM3}, \
+ {"sm3tt2b_vvv_crypto3_imm2"_h, &VISITORCLASS::VisitCryptoSM3}, \
{"sm4ekey_vvv4_cryptosha512_3"_h, &VISITORCLASS::VisitUnimplemented}, \
{"sm4e_vv4_cryptosha512_2"_h, &VISITORCLASS::VisitUnimplemented}, \
{"st64b_64l_memop"_h, &VISITORCLASS::VisitUnimplemented}, \
diff --git a/src/aarch64/disasm-aarch64.cc b/src/aarch64/disasm-aarch64.cc
index f03b257..ebfc2c7 100644
--- a/src/aarch64/disasm-aarch64.cc
+++ b/src/aarch64/disasm-aarch64.cc
@@ -2204,6 +2204,25 @@
FormatWithDecodedMnemonic(instr, "'Vd.16b, 'Vn.16b");
}
+void Disassembler::VisitCryptoSM3(const Instruction *instr) {
+ const char *form = "'Vd.4s, 'Vn.4s, 'Vm.";
+ const char *suffix = "4s";
+
+ switch (form_hash_) {
+ case "sm3ss1_vvv4_crypto4"_h:
+ suffix = "4s, 'Va.4s";
+ break;
+ case "sm3tt1a_vvv4_crypto3_imm2"_h:
+ case "sm3tt1b_vvv4_crypto3_imm2"_h:
+ case "sm3tt2a_vvv4_crypto3_imm2"_h:
+ case "sm3tt2b_vvv_crypto3_imm2"_h:
+ suffix = "s['u1312]";
+ break;
+ }
+
+ FormatWithDecodedMnemonic(instr, form, suffix);
+}
+
void Disassembler::DisassembleSHA512(const Instruction *instr) {
const char *form = "'Qd, 'Qn, 'Vm.2d";
const char *suffix = NULL;
diff --git a/src/aarch64/disasm-aarch64.h b/src/aarch64/disasm-aarch64.h
index 574d4f4..b139c4c 100644
--- a/src/aarch64/disasm-aarch64.h
+++ b/src/aarch64/disasm-aarch64.h
@@ -243,6 +243,8 @@
void Disassemble_Xd_XnSP_Xm(const Instruction* instr);
void Disassemble_Xd_XnSP_XmSP(const Instruction* instr);
+ void VisitCryptoSM3(const Instruction* instr);
+
void Format(const Instruction* instr,
const char* mnemonic,
const char* format0,
diff --git a/src/aarch64/logic-aarch64.cc b/src/aarch64/logic-aarch64.cc
index 9a81e49..4d50568 100644
--- a/src/aarch64/logic-aarch64.cc
+++ b/src/aarch64/logic-aarch64.cc
@@ -7895,17 +7895,17 @@
}
template <>
-uint64_t SHA1Operation<"choose"_h>(uint64_t x, uint64_t y, uint64_t z) {
+uint64_t CryptoOp<"choose"_h>(uint64_t x, uint64_t y, uint64_t z) {
return ((y ^ z) & x) ^ z;
}
template <>
-uint64_t SHA1Operation<"majority"_h>(uint64_t x, uint64_t y, uint64_t z) {
+uint64_t CryptoOp<"majority"_h>(uint64_t x, uint64_t y, uint64_t z) {
return (x & y) | ((x | y) & z);
}
template <>
-uint64_t SHA1Operation<"parity"_h>(uint64_t x, uint64_t y, uint64_t z) {
+uint64_t CryptoOp<"parity"_h>(uint64_t x, uint64_t y, uint64_t z) {
return x ^ y ^ z;
}
@@ -7932,8 +7932,8 @@
}
for (unsigned i = 0; i < ArrayLength(x); i++) {
- uint64_t chs = SHA1Operation<"choose"_h>(y[0], y[1], y[2]);
- uint64_t maj = SHA1Operation<"majority"_h>(x[0], x[1], x[2]);
+ uint64_t chs = CryptoOp<"choose"_h>(y[0], y[1], y[2]);
+ uint64_t maj = CryptoOp<"majority"_h>(x[0], x[1], x[2]);
uint64_t w = src2.Uint(kFormat4S, i);
uint64_t t = y[3] + SHASigma<uint32_t, 6, 11, 25>(y[0]) + chs + w;
@@ -8351,6 +8351,125 @@
return dst;
}
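+// Emulation helpers for the SM3 instructions. ROL is a 32-bit rotate left;
+// P0(X) = X ^ ROL(X, 9) ^ ROL(X, 17) and P1(X) = X ^ ROL(X, 15) ^
+// ROL(X, 23) are the permutation functions from the SM3 specification.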
+LogicVRegister Simulator::sm3partw1(LogicVRegister srcdst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2) {
+ using namespace std::placeholders;
+ auto ROL = std::bind(RotateLeft, _1, _2, kSRegSize);
+
+ SimVRegister temp;
+
+ ext(kFormat16B, temp, src2, temp, 4);
+ rol(kFormat4S, temp, temp, 15);
+ eor(kFormat4S, temp, temp, src1);
+ LogicVRegister r = eor(kFormat4S, temp, temp, srcdst);
+
+ uint64_t result[4] = {};
+ r.UintArray(kFormat4S, result);
+ for (int i = 0; i < 4; i++) {
+ if (i == 3) {
+ // result[3] already contains srcdst[3] ^ src1[3] from the operations
+ // above.
+ result[i] ^= ROL(result[0], 15);
+ }
+ result[i] ^= ROL(result[i], 15) ^ ROL(result[i], 23);
+ }
+ srcdst.SetUintArray(kFormat4S, result);
+ return srcdst;
+}
+
+LogicVRegister Simulator::sm3partw2(LogicVRegister srcdst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2) {
+ using namespace std::placeholders;
+ auto ROL = std::bind(RotateLeft, _1, _2, kSRegSize);
+
+ SimVRegister temp;
+ VectorFormat vf = kFormat4S;
+
+ rol(vf, temp, src2, 7);
+ LogicVRegister r = eor(vf, temp, temp, src1);
+ eor(vf, srcdst, temp, srcdst);
+
+ uint64_t tmp2 = ROL(r.Uint(vf, 0), 15);
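+ // Apply the P1 permutation: P1(X) = X ^ ROL(X, 15) ^ ROL(X, 23).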
+ tmp2 ^= ROL(tmp2, 15) ^ ROL(tmp2, 23);
+ srcdst.SetUint(vf, 3, srcdst.Uint(vf, 3) ^ tmp2);
+ return srcdst;
+}
+
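+// Compute SS1 = ROL(ROL(n[3], 12) + m[3] + a[3], 7) into lane 3 of dst;
+// lanes 0-2 are cleared.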
+LogicVRegister Simulator::sm3ss1(LogicVRegister dst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2,
+ const LogicVRegister& src3) {
+ using namespace std::placeholders;
+ auto ROL = std::bind(RotateLeft, _1, _2, kSRegSize);
+
+ VectorFormat vf = kFormat4S;
+ uint64_t result = ROL(src1.Uint(vf, 3), 12);
+ result += src2.Uint(vf, 3) + src3.Uint(vf, 3);
+ dst.Clear();
+ dst.SetUint(vf, 3, ROL(result, 7));
+ return dst;
+}
+
+LogicVRegister Simulator::sm3tt1(LogicVRegister srcdst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2,
+ int index,
+ bool is_a) {
+ VectorFormat vf = kFormat4S;
+ using namespace std::placeholders;
+ auto ROL = std::bind(RotateLeft, _1, _2, kSRegSize);
+ auto sd = std::bind(&LogicVRegister::Uint, srcdst, vf, _1);
+
+ VIXL_ASSERT(IsUint2(index));
+
+ uint64_t wjprime = src2.Uint(vf, index);
+ uint64_t ss2 = src1.Uint(vf, 3) ^ ROL(sd(3), 12);
+
+ uint64_t tt1;
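+ // The "A" form uses the parity function, as in SM3 rounds 0-15; the "B"
+ // form uses majority, as in rounds 16-63.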
+ if (is_a) {
+ tt1 = CryptoOp<"parity"_h>(sd(1), sd(2), sd(3));
+ } else {
+ tt1 = CryptoOp<"majority"_h>(sd(1), sd(2), sd(3));
+ }
+ tt1 += sd(0) + ss2 + wjprime;
+
+ ext(kFormat16B, srcdst, srcdst, srcdst, 4);
+ srcdst.SetUint(vf, 1, ROL(sd(1), 9));
+ srcdst.SetUint(vf, 3, tt1);
+ return srcdst;
+}
+
+LogicVRegister Simulator::sm3tt2(LogicVRegister srcdst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2,
+ int index,
+ bool is_a) {
+ VectorFormat vf = kFormat4S;
+ using namespace std::placeholders;
+ auto ROL = std::bind(RotateLeft, _1, _2, kSRegSize);
+ auto sd = std::bind(&LogicVRegister::Uint, srcdst, vf, _1);
+
+ VIXL_ASSERT(IsUint2(index));
+
+ uint64_t wj = src2.Uint(vf, index);
+
+ uint64_t tt2;
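+ // The "A" form uses the parity function, as in SM3 rounds 0-15; the "B"
+ // form uses choose, as in rounds 16-63.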
+ if (is_a) {
+ tt2 = CryptoOp<"parity"_h>(sd(1), sd(2), sd(3));
+ } else {
+ tt2 = CryptoOp<"choose"_h>(sd(3), sd(2), sd(1));
+ }
+ tt2 += sd(0) + src1.Uint(vf, 3) + wj;
+
+ ext(kFormat16B, srcdst, srcdst, srcdst, 4);
+ srcdst.SetUint(vf, 1, ROL(sd(1), 19));
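+ // Apply the P0 permutation: P0(X) = X ^ ROL(X, 9) ^ ROL(X, 17).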
+ tt2 ^= ROL(tt2, 9) ^ ROL(tt2, 17);
+ srcdst.SetUint(vf, 3, tt2);
+ return srcdst;
+}
+
} // namespace aarch64
} // namespace vixl
diff --git a/src/aarch64/macro-assembler-aarch64.h b/src/aarch64/macro-assembler-aarch64.h
index a989ddd..b74be35 100644
--- a/src/aarch64/macro-assembler-aarch64.h
+++ b/src/aarch64/macro-assembler-aarch64.h
@@ -2812,6 +2812,8 @@
V(sha512su1, Sha512su1) \
V(shadd, Shadd) \
V(shsub, Shsub) \
+ V(sm3partw1, Sm3partw1) \
+ V(sm3partw2, Sm3partw2) \
V(smax, Smax) \
V(smaxp, Smaxp) \
V(smin, Smin) \
@@ -3052,7 +3054,11 @@
V(umlsl, Umlsl) \
V(umlsl2, Umlsl2) \
V(sudot, Sudot) \
- V(usdot, Usdot)
+ V(usdot, Usdot) \
+ V(sm3tt1a, Sm3tt1a) \
+ V(sm3tt1b, Sm3tt1b) \
+ V(sm3tt2a, Sm3tt2a) \
+ V(sm3tt2b, Sm3tt2b)
#define DEFINE_MACRO_ASM_FUNC(ASM, MASM) \
@@ -3523,6 +3529,14 @@
SingleEmissionCheckScope guard(this);
st4(vt, vt2, vt3, vt4, lane, dst);
}
+ void Sm3ss1(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm,
+ const VRegister& va) {
+ VIXL_ASSERT(allow_macro_instructions_);
+ SingleEmissionCheckScope guard(this);
+ sm3ss1(vd, vn, vm, va);
+ }
void Smov(const Register& rd, const VRegister& vn, int vn_index) {
VIXL_ASSERT(allow_macro_instructions_);
SingleEmissionCheckScope guard(this);
diff --git a/src/aarch64/simulator-aarch64.cc b/src/aarch64/simulator-aarch64.cc
index e63715c..83d1649 100644
--- a/src/aarch64/simulator-aarch64.cc
+++ b/src/aarch64/simulator-aarch64.cc
@@ -7261,6 +7261,39 @@
}
}
+void Simulator::VisitCryptoSM3(const Instruction* instr) {
+ SimVRegister& rd = ReadVRegister(instr->GetRd());
+ SimVRegister& rn = ReadVRegister(instr->GetRn());
+ SimVRegister& rm = ReadVRegister(instr->GetRm());
+ SimVRegister& ra = ReadVRegister(instr->GetRa());
+ int index = instr->ExtractBits(13, 12);
+
+ bool is_a = false;
+ switch (form_hash_) {
+ case "sm3partw1_vvv4_cryptosha512_3"_h:
+ sm3partw1(rd, rn, rm);
+ break;
+ case "sm3partw2_vvv4_cryptosha512_3"_h:
+ sm3partw2(rd, rn, rm);
+ break;
+ case "sm3ss1_vvv4_crypto4"_h:
+ sm3ss1(rd, rn, rm, ra);
+ break;
+ case "sm3tt1a_vvv4_crypto3_imm2"_h:
+ is_a = true;
+ VIXL_FALLTHROUGH();
+ case "sm3tt1b_vvv4_crypto3_imm2"_h:
+ sm3tt1(rd, rn, rm, index, is_a);
+ break;
+ case "sm3tt2a_vvv4_crypto3_imm2"_h:
+ is_a = true;
+ VIXL_FALLTHROUGH();
+ case "sm3tt2b_vvv_crypto3_imm2"_h:
+ sm3tt2(rd, rn, rm, index, is_a);
+ break;
+ }
+}
+
void Simulator::SimulateSHA512(const Instruction* instr) {
SimVRegister& rd = ReadVRegister(instr->GetRd());
SimVRegister& rn = ReadVRegister(instr->GetRn());
diff --git a/src/aarch64/simulator-aarch64.h b/src/aarch64/simulator-aarch64.h
index c20ec3c..6e36246 100644
--- a/src/aarch64/simulator-aarch64.h
+++ b/src/aarch64/simulator-aarch64.h
@@ -1283,7 +1283,7 @@
class Debugger;
template <uint32_t mode>
-uint64_t SHA1Operation(uint64_t x, uint64_t y, uint64_t z);
+uint64_t CryptoOp(uint64_t x, uint64_t y, uint64_t z);
class Simulator : public DecoderVisitor {
public:
@@ -1532,6 +1532,7 @@
void SimulateUnsignedMinMax(const Instruction* instr);
void SimulateSHA512(const Instruction* instr);
+ void VisitCryptoSM3(const Instruction* instr);
// Integer register accessors.
@@ -4518,7 +4519,7 @@
srcdst.UintArray(kFormat4S, sd);
for (unsigned i = 0; i < ArrayLength(sd); i++) {
- uint64_t t = SHA1Operation<mode>(sd[1], sd[2], sd[3]);
+ uint64_t t = CryptoOp<mode>(sd[1], sd[2], sd[3]);
y += RotateLeft(sd[0], 5, kSRegSize) + t;
y += src2.Uint(kFormat4S, i);
@@ -4561,6 +4562,27 @@
const LogicVRegister& src1,
bool inverse);
+ LogicVRegister sm3partw1(LogicVRegister dst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2);
+ LogicVRegister sm3partw2(LogicVRegister dst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2);
+ LogicVRegister sm3ss1(LogicVRegister dst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2,
+ const LogicVRegister& src3);
+ LogicVRegister sm3tt1(LogicVRegister srcdst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2,
+ int index,
+ bool is_a);
+ LogicVRegister sm3tt2(LogicVRegister srcdst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2,
+ int index,
+ bool is_a);
+
#define NEON_3VREG_LOGIC_LIST(V) \
V(addhn) \
V(addhn2) \
diff --git a/test/aarch64/test-cpu-features-aarch64.cc b/test/aarch64/test-cpu-features-aarch64.cc
index a14c524..5cc2a58 100644
--- a/test/aarch64/test-cpu-features-aarch64.cc
+++ b/test/aarch64/test-cpu-features-aarch64.cc
@@ -3835,5 +3835,18 @@
TEST_FEAT(aesmc_0, aesmc(v0.V16B(), v29.V16B()))
#undef TEST_FEAT
+#define TEST_FEAT(NAME, ASM) \
+ TEST_TEMPLATE(CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSM3), \
+ NEON_SM3_##NAME, \
+ ASM)
+TEST_FEAT(sm3partw1_0, sm3partw1(v12.V4S(), v13.V4S(), v14.V4S()))
+TEST_FEAT(sm3partw2_0, sm3partw2(v12.V4S(), v13.V4S(), v14.V4S()))
+TEST_FEAT(sm3ss1_0, sm3ss1(v13.V4S(), v15.V4S(), v17.V4S(), v21.V4S()))
+TEST_FEAT(sm3tt1a_0, sm3tt1a(v30.V4S(), v29.V4S(), v9.V4S(), 1))
+TEST_FEAT(sm3tt1b_0, sm3tt1b(v30.V4S(), v29.V4S(), v9.V4S(), 3))
+TEST_FEAT(sm3tt2a_0, sm3tt2a(v30.V4S(), v29.V4S(), v9.V4S(), 2))
+TEST_FEAT(sm3tt2b_0, sm3tt2b(v30.V4S(), v29.V4S(), v9.V4S(), 0))
+#undef TEST_FEAT
+
} // namespace aarch64
} // namespace vixl
diff --git a/test/aarch64/test-disasm-aarch64.cc b/test/aarch64/test-disasm-aarch64.cc
index 25820f6..6a29ffe 100644
--- a/test/aarch64/test-disasm-aarch64.cc
+++ b/test/aarch64/test-disasm-aarch64.cc
@@ -3802,15 +3802,15 @@
COMPARE_PREFIX(dci(0xcec08000), "sha512su0"); // SHA512SU0_VV2_cryptosha512_2
// ARMv8.2 - SM3
- // COMPARE_PREFIX(dci(0xce400000), "sm3ss1"); // SM3SS1_VVV4_crypto4
- // COMPARE_PREFIX(dci(0xce408000), "sm3tt1a"); // SM3TT1A_VVV4_crypto3_imm2
- // COMPARE_PREFIX(dci(0xce408400), "sm3tt1b"); // SM3TT1B_VVV4_crypto3_imm2
- // COMPARE_PREFIX(dci(0xce408800), "sm3tt2a"); // SM3TT2A_VVV4_crypto3_imm2
- // COMPARE_PREFIX(dci(0xce408c00), "sm3tt2b"); // SM3TT2B_VVV_crypto3_imm2
- // COMPARE_PREFIX(dci(0xce60c000), "sm3partw1"); //
- // SM3PARTW1_VVV4_cryptosha512_3
- // COMPARE_PREFIX(dci(0xce60c400), "sm3partw2"); //
- // SM3PARTW2_VVV4_cryptosha512_3
+ COMPARE_PREFIX(dci(0xce400000), "sm3ss1"); // SM3SS1_VVV4_crypto4
+ COMPARE_PREFIX(dci(0xce408000), "sm3tt1a"); // SM3TT1A_VVV4_crypto3_imm2
+ COMPARE_PREFIX(dci(0xce408400), "sm3tt1b"); // SM3TT1B_VVV4_crypto3_imm2
+ COMPARE_PREFIX(dci(0xce408800), "sm3tt2a"); // SM3TT2A_VVV4_crypto3_imm2
+ COMPARE_PREFIX(dci(0xce408c00), "sm3tt2b"); // SM3TT2B_VVV_crypto3_imm2
+ COMPARE_PREFIX(dci(0xce60c000),
+ "sm3partw1"); // SM3PARTW1_VVV4_cryptosha512_3
+ COMPARE_PREFIX(dci(0xce60c400),
+ "sm3partw2"); // SM3PARTW2_VVV4_cryptosha512_3
// ARMv8.2 - SM4
// COMPARE_PREFIX(dci(0xce60c800), "sm4ekey"); //
diff --git a/test/aarch64/test-disasm-neon-aarch64.cc b/test/aarch64/test-disasm-neon-aarch64.cc
index 5bdded9..26ecf4c 100644
--- a/test/aarch64/test-disasm-neon-aarch64.cc
+++ b/test/aarch64/test-disasm-neon-aarch64.cc
@@ -4579,6 +4579,27 @@
CLEANUP();
}
+TEST(neon_sm3) {
+ SETUP();
+
+ COMPARE_MACRO(Sm3partw1(v12.V4S(), v13.V4S(), v14.V4S()),
+ "sm3partw1 v12.4s, v13.4s, v14.4s");
+ COMPARE_MACRO(Sm3partw2(v12.V4S(), v13.V4S(), v14.V4S()),
+ "sm3partw2 v12.4s, v13.4s, v14.4s");
+ COMPARE_MACRO(Sm3ss1(v13.V4S(), v15.V4S(), v17.V4S(), v21.V4S()),
+ "sm3ss1 v13.4s, v15.4s, v17.4s, v21.4s");
+ COMPARE_MACRO(Sm3tt1a(v30.V4S(), v29.V4S(), v9.V4S(), 1),
+ "sm3tt1a v30.4s, v29.4s, v9.s[1]");
+ COMPARE_MACRO(Sm3tt1b(v30.V4S(), v29.V4S(), v9.V4S(), 3),
+ "sm3tt1b v30.4s, v29.4s, v9.s[3]");
+ COMPARE_MACRO(Sm3tt2a(v30.V4S(), v29.V4S(), v9.V4S(), 2),
+ "sm3tt2a v30.4s, v29.4s, v9.s[2]");
+ COMPARE_MACRO(Sm3tt2b(v30.V4S(), v29.V4S(), v9.V4S(), 0),
+ "sm3tt2b v30.4s, v29.4s, v9.s[0]");
+
+ CLEANUP();
+}
+
TEST(neon_unallocated_regression_test) {
SETUP();
diff --git a/test/aarch64/test-simulator-sve-aarch64.cc b/test/aarch64/test-simulator-sve-aarch64.cc
index e62fea2..585c00c 100644
--- a/test/aarch64/test-simulator-sve-aarch64.cc
+++ b/test/aarch64/test-simulator-sve-aarch64.cc
@@ -1534,5 +1534,341 @@
}
}
+TEST_SVE(neon_sm3) {
+ SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
+ CPUFeatures::kNEON,
+ CPUFeatures::kCRC32,
+ CPUFeatures::kSM3);
+ START();
+
+ SetInitialMachineState(&masm);
+ // state = 0xe2bd2480
+
+ {
+ ExactAssemblyScope scope(&masm, 10 * kInstructionSize);
+ __ dci(0xce591017); // sm3ss1 v23.4s, v0.4s, v25.4s, v4.4s
+ // vl128 state = 0xad4bba0a
+ __ dci(0xce49121f); // sm3ss1 v31.4s, v16.4s, v9.4s, v4.4s
+ // vl128 state = 0x84adef21
+ __ dci(0xce49121e); // sm3ss1 v30.4s, v16.4s, v9.4s, v4.4s
+ // vl128 state = 0xccfd7e5a
+ __ dci(0xce49301a); // sm3ss1 v26.4s, v0.4s, v9.4s, v12.4s
+ // vl128 state = 0x60833cc7
+ __ dci(0xce49720a); // sm3ss1 v10.4s, v16.4s, v9.4s, v28.4s
+ // vl128 state = 0x03f03263
+ __ dci(0xce58721a); // sm3ss1 v26.4s, v16.4s, v24.4s, v28.4s
+ // vl128 state = 0x31845f40
+ __ dci(0xce58702a); // sm3ss1 v10.4s, v1.4s, v24.4s, v28.4s
+ // vl128 state = 0x54c64f70
+ __ dci(0xce58753a); // sm3ss1 v26.4s, v9.4s, v24.4s, v29.4s
+ // vl128 state = 0x3d5cb04f
+ __ dci(0xce507518); // sm3ss1 v24.4s, v8.4s, v16.4s, v29.4s
+ // vl128 state = 0xe02de221
+ __ dci(0xce406519); // sm3ss1 v25.4s, v8.4s, v0.4s, v25.4s
+ // vl128 state = 0x73d36ae8
+ }
+
+ uint32_t state;
+ ComputeMachineStateHash(&masm, &state);
+ __ Mov(x0, reinterpret_cast<uint64_t>(&state));
+ __ Ldr(w0, MemOperand(x0));
+
+ END();
+ if (CAN_RUN()) {
+ RUN();
+ uint32_t expected_hashes[] = {
+ 0x73d36ae8,
+ 0xcbcda2db,
+ 0x6ee9ad3d,
+ 0xa6857a16,
+ 0xa238ec05,
+ 0x1bc82d1d,
+ 0xe4530773,
+ 0xfb0d092e,
+ 0xe62aff0a,
+ 0xf56a593f,
+ 0x3967d590,
+ 0xebcd14a0,
+ 0xa7bedcb8,
+ 0x867fa43c,
+ 0x1679eab5,
+ 0x0a836861,
+ };
+ ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0);
+ }
+}
+
+TEST_SVE(neon_sm3partw12) {
+ SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
+ CPUFeatures::kNEON,
+ CPUFeatures::kCRC32,
+ CPUFeatures::kSM3);
+ START();
+
+ SetInitialMachineState(&masm);
+ // state = 0xe2bd2480
+
+ {
+ ExactAssemblyScope scope(&masm, 30 * kInstructionSize);
+ __ dci(0xce70c201); // sm3partw1 v1.4s, v16.4s, v16.4s
+ // vl128 state = 0x6f2069a6
+ __ dci(0xce72c303); // sm3partw1 v3.4s, v24.4s, v18.4s
+ // vl128 state = 0x986fa56c
+ __ dci(0xce76c381); // sm3partw1 v1.4s, v28.4s, v22.4s
+ // vl128 state = 0x5dbd953c
+ __ dci(0xce7ec3b1); // sm3partw1 v17.4s, v29.4s, v30.4s
+ // vl128 state = 0xc72ccca5
+ __ dci(0xce7ac1b5); // sm3partw1 v21.4s, v13.4s, v26.4s
+ // vl128 state = 0x33cdfd6a
+ __ dci(0xce7ac1b7); // sm3partw1 v23.4s, v13.4s, v26.4s
+ // vl128 state = 0x4303e945
+ __ dci(0xce7ac1bf); // sm3partw1 v31.4s, v13.4s, v26.4s
+ // vl128 state = 0x56acac84
+ __ dci(0xce78c1fd); // sm3partw1 v29.4s, v15.4s, v24.4s
+ // vl128 state = 0x5e2a2793
+ __ dci(0xce78c5df); // sm3partw2 v31.4s, v14.4s, v24.4s
+ // vl128 state = 0xf7c457f3
+ __ dci(0xce70c55d); // sm3partw2 v29.4s, v10.4s, v16.4s
+ // vl128 state = 0xfa3557ac
+ __ dci(0xce60c159); // sm3partw1 v25.4s, v10.4s, v0.4s
+ // vl128 state = 0xb3ae6830
+ __ dci(0xce62c55b); // sm3partw2 v27.4s, v10.4s, v2.4s
+ // vl128 state = 0xa7747c70
+ __ dci(0xce66c753); // sm3partw2 v19.4s, v26.4s, v6.4s
+ // vl128 state = 0xb55f5895
+ __ dci(0xce67c551); // sm3partw2 v17.4s, v10.4s, v7.4s
+ // vl128 state = 0x519b1342
+ __ dci(0xce65c750); // sm3partw2 v16.4s, v26.4s, v5.4s
+ // vl128 state = 0xc4e6e4b9
+ __ dci(0xce61c718); // sm3partw2 v24.4s, v24.4s, v1.4s
+ // vl128 state = 0x127c483c
+ __ dci(0xce61c71c); // sm3partw2 v28.4s, v24.4s, v1.4s
+ // vl128 state = 0x92783ecc
+ __ dci(0xce6dc714); // sm3partw2 v20.4s, v24.4s, v13.4s
+ // vl128 state = 0xe11e87d3
+ __ dci(0xce65c756); // sm3partw2 v22.4s, v26.4s, v5.4s
+ // vl128 state = 0x8b6878d0
+ __ dci(0xce65c5d2); // sm3partw2 v18.4s, v14.4s, v5.4s
+ // vl128 state = 0xf2fb1e86
+ __ dci(0xce64c550); // sm3partw2 v16.4s, v10.4s, v4.4s
+ // vl128 state = 0x73ad3b0f
+ __ dci(0xce66c578); // sm3partw2 v24.4s, v11.4s, v6.4s
+ // vl128 state = 0x7e03900d
+ __ dci(0xce76c55c); // sm3partw2 v28.4s, v10.4s, v22.4s
+ // vl128 state = 0x1d0b5df6
+ __ dci(0xce76c54c); // sm3partw2 v12.4s, v10.4s, v22.4s
+ // vl128 state = 0x1a3d7a77
+ __ dci(0xce7ec448); // sm3partw2 v8.4s, v2.4s, v30.4s
+ // vl128 state = 0x3ed2e4bd
+ __ dci(0xce6ec409); // sm3partw2 v9.4s, v0.4s, v14.4s
+ // vl128 state = 0x826dd348
+ __ dci(0xce6ec52b); // sm3partw2 v11.4s, v9.4s, v14.4s
+ // vl128 state = 0x3ff5e482
+ __ dci(0xce66c72f); // sm3partw2 v15.4s, v25.4s, v6.4s
+ // vl128 state = 0x6fd24cd4
+ __ dci(0xce65c73f); // sm3partw2 v31.4s, v25.4s, v5.4s
+ // vl128 state = 0xd51ac474
+ __ dci(0xce67c77b); // sm3partw2 v27.4s, v27.4s, v7.4s
+ // vl128 state = 0x720d7419
+ }
+
+ uint32_t state;
+ ComputeMachineStateHash(&masm, &state);
+ __ Mov(x0, reinterpret_cast<uint64_t>(&state));
+ __ Ldr(w0, MemOperand(x0));
+
+ END();
+ if (CAN_RUN()) {
+ RUN();
+ uint32_t expected_hashes[] = {
+ 0x720d7419,
+ 0x31445e06,
+ 0xd2aee240,
+ 0x45a27e4b,
+ 0xd6c46f08,
+ 0xcaed7f9e,
+ 0x734820c7,
+ 0x377e1f38,
+ 0x12e03585,
+ 0x1b9cbe63,
+ 0x1d58d49a,
+ 0xc160a9dc,
+ 0x22c2fe25,
+ 0x86b7af0f,
+ 0xfeae7bf5,
+ 0xf8dfcc40,
+ };
+ ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0);
+ }
+}
+
+TEST_SVE(neon_sm3tt1) {
+ SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
+ CPUFeatures::kNEON,
+ CPUFeatures::kCRC32,
+ CPUFeatures::kSM3);
+ START();
+
+ SetInitialMachineState(&masm);
+ // state = 0xe2bd2480
+
+ {
+ ExactAssemblyScope scope(&masm, 20 * kInstructionSize);
+ __ dci(0xce53a363); // sm3tt1a v3.4s, v27.4s, v19.s[2]
+ // vl128 state = 0xaaa8c715
+ __ dci(0xce58a7a7); // sm3tt1b v7.4s, v29.4s, v24.s[2]
+ // vl128 state = 0xb99a301d
+ __ dci(0xce5eb2b7); // sm3tt1a v23.4s, v21.4s, v30.s[3]
+ // vl128 state = 0xe8dabe99
+ __ dci(0xce43b6ce); // sm3tt1b v14.4s, v22.4s, v3.s[3]
+ // vl128 state = 0xaa498ae5
+ __ dci(0xce448027); // sm3tt1a v7.4s, v1.4s, v4.s[0]
+ // vl128 state = 0x32093547
+ __ dci(0xce4286d8); // sm3tt1b v24.4s, v22.4s, v2.s[0]
+ // vl128 state = 0xe03e3a81
+ __ dci(0xce44a0f3); // sm3tt1a v19.4s, v7.4s, v4.s[2]
+ // vl128 state = 0xcb555b4a
+ __ dci(0xce418233); // sm3tt1a v19.4s, v17.4s, v1.s[0]
+ // vl128 state = 0x751e4f7d
+ __ dci(0xce58a49f); // sm3tt1b v31.4s, v4.4s, v24.s[2]
+ // vl128 state = 0xcaff7580
+ __ dci(0xce548326); // sm3tt1a v6.4s, v25.4s, v20.s[0]
+ // vl128 state = 0xc4308a78
+ __ dci(0xce548124); // sm3tt1a v4.4s, v9.4s, v20.s[0]
+ // vl128 state = 0x1f1bfdfb
+ __ dci(0xce5fb282); // sm3tt1a v2.4s, v20.4s, v31.s[3]
+ // vl128 state = 0xa632c0b2
+ __ dci(0xce549573); // sm3tt1b v19.4s, v11.4s, v20.s[1]
+ // vl128 state = 0x7fb7c2d3
+ __ dci(0xce4387ae); // sm3tt1b v14.4s, v29.4s, v3.s[0]
+ // vl128 state = 0xe8d4c534
+ __ dci(0xce5094eb); // sm3tt1b v11.4s, v7.4s, v16.s[1]
+ // vl128 state = 0xf34a4fbc
+ __ dci(0xce51b59f); // sm3tt1b v31.4s, v12.4s, v17.s[3]
+ // vl128 state = 0x98e388e9
+ __ dci(0xce50a7bf); // sm3tt1b v31.4s, v29.4s, v16.s[2]
+ // vl128 state = 0x7cd7a6ac
+ __ dci(0xce5ca52e); // sm3tt1b v14.4s, v9.4s, v28.s[2]
+ // vl128 state = 0xce9410c5
+ __ dci(0xce5aa741); // sm3tt1b v1.4s, v26.4s, v26.s[2]
+ // vl128 state = 0xd83fbd58
+ __ dci(0xce5e94da); // sm3tt1b v26.4s, v6.4s, v30.s[1]
+ // vl128 state = 0xc6055fe3
+ }
+
+ uint32_t state;
+ ComputeMachineStateHash(&masm, &state);
+ __ Mov(x0, reinterpret_cast<uint64_t>(&state));
+ __ Ldr(w0, MemOperand(x0));
+
+ END();
+ if (CAN_RUN()) {
+ RUN();
+ uint32_t expected_hashes[] = {
+ 0xc6055fe3,
+ 0xa2c33f98,
+ 0x1cc9a227,
+ 0xf29eb254,
+ 0xd1739d6e,
+ 0x1c4fff34,
+ 0x0c182795,
+ 0x96e46836,
+ 0x43d010c9,
+ 0xd7c4f94c,
+ 0x78c387f2,
+ 0x4319fef3,
+ 0x72407eef,
+ 0xa77d3869,
+ 0x3c81c49a,
+ 0x68cc20ef,
+ };
+ ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0);
+ }
+}
+
+TEST_SVE(neon_sm3tt2) {
+ SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
+ CPUFeatures::kNEON,
+ CPUFeatures::kCRC32,
+ CPUFeatures::kSM3);
+ START();
+
+ SetInitialMachineState(&masm);
+ // state = 0xe2bd2480
+
+ {
+ ExactAssemblyScope scope(&masm, 20 * kInstructionSize);
+ __ dci(0xce439d42); // sm3tt2b v2.4s, v10.4s, v3.s[1]
+ // vl128 state = 0x388642cc
+ __ dci(0xce42b89d); // sm3tt2a v29.4s, v4.4s, v2.s[3]
+ // vl128 state = 0x66f4e60a
+ __ dci(0xce4da95d); // sm3tt2a v29.4s, v10.4s, v13.s[2]
+ // vl128 state = 0x95d4651d
+ __ dci(0xce49b926); // sm3tt2a v6.4s, v9.4s, v9.s[3]
+ // vl128 state = 0x826919fe
+ __ dci(0xce5cae33); // sm3tt2b v19.4s, v17.4s, v28.s[2]
+ // vl128 state = 0xb5cfefb0
+ __ dci(0xce478959); // sm3tt2a v25.4s, v10.4s, v7.s[0]
+ // vl128 state = 0xfe17b730
+ __ dci(0xce549cc2); // sm3tt2b v2.4s, v6.4s, v20.s[1]
+ // vl128 state = 0x769a0d76
+ __ dci(0xce4c9f90); // sm3tt2b v16.4s, v28.4s, v12.s[1]
+ // vl128 state = 0x8f633b95
+ __ dci(0xce508d49); // sm3tt2b v9.4s, v10.4s, v16.s[0]
+ // vl128 state = 0x5eab6daa
+ __ dci(0xce59ad79); // sm3tt2b v25.4s, v11.4s, v25.s[2]
+ // vl128 state = 0xfb197616
+ __ dci(0xce458fd6); // sm3tt2b v22.4s, v30.4s, v5.s[0]
+ // vl128 state = 0x875ff29d
+ __ dci(0xce4ab92c); // sm3tt2a v12.4s, v9.4s, v10.s[3]
+ // vl128 state = 0xad159c01
+ __ dci(0xce598a1c); // sm3tt2a v28.4s, v16.4s, v25.s[0]
+ // vl128 state = 0x3da313e4
+ __ dci(0xce43989f); // sm3tt2a v31.4s, v4.4s, v3.s[1]
+ // vl128 state = 0xc0a54179
+ __ dci(0xce459c8a); // sm3tt2b v10.4s, v4.4s, v5.s[1]
+ // vl128 state = 0x4739cdbf
+ __ dci(0xce539959); // sm3tt2a v25.4s, v10.4s, v19.s[1]
+ // vl128 state = 0xd85f84ab
+ __ dci(0xce429be1); // sm3tt2a v1.4s, v31.4s, v2.s[1]
+ // vl128 state = 0x85b5871c
+ __ dci(0xce5d9fe3); // sm3tt2b v3.4s, v31.4s, v29.s[1]
+ // vl128 state = 0x2be5bd95
+ __ dci(0xce4ebe16); // sm3tt2b v22.4s, v16.4s, v14.s[3]
+ // vl128 state = 0x2f8146e9
+ __ dci(0xce599a63); // sm3tt2a v3.4s, v19.4s, v25.s[1]
+ // vl128 state = 0xa6e513e2
+ }
+
+ uint32_t state;
+ ComputeMachineStateHash(&masm, &state);
+ __ Mov(x0, reinterpret_cast<uint64_t>(&state));
+ __ Ldr(w0, MemOperand(x0));
+
+ END();
+ if (CAN_RUN()) {
+ RUN();
+ uint32_t expected_hashes[] = {
+ 0xa6e513e2,
+ 0x6bf4ae47,
+ 0x74e074db,
+ 0xae1a57e0,
+ 0x0db67f09,
+ 0x85332e49,
+ 0xc40d6565,
+ 0x07ed81aa,
+ 0xfa0e10bb,
+ 0x9addadfa,
+ 0xa9cea561,
+ 0xa481e17b,
+ 0x7c2be34e,
+ 0xd4cf493f,
+ 0x8b30cc5e,
+ 0xe44416d3,
+ };
+ ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0);
+ }
+}
+
} // namespace aarch64
} // namespace vixl