Support SM3 acceleration instructions (#108)
Add support for the seven Neon SM3 acceleration instructions introduced by the Armv8.2 SM3 cryptographic extension: sm3ss1, sm3tt1a, sm3tt1b, sm3tt2a, sm3tt2b, sm3partw1 and sm3partw2.
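A minimal usage sketch (not part of this patch): one "a"-type compression round emitted through the new MacroAssembler wrappers. The register roles are assumptions for illustration only - v8 holding the A..D state words, v9 holding E..H, v2 holding the rotated round constant in lane 3, v3/v4 holding W[j..j+3] and W[j+4..j+7], and v5/v10 used as scratch:

  #include "aarch64/macro-assembler-aarch64.h"
  using namespace vixl::aarch64;

  void EmitSm3RoundA(MacroAssembler* masm) {
    masm->Eor(v10.V16B(), v3.V16B(), v4.V16B());           // W'j = Wj ^ W(j+4).
    masm->Sm3ss1(v5.V4S(), v8.V4S(), v2.V4S(), v9.V4S());  // SS1 in lane 3.
    masm->Sm3tt1a(v8.V4S(), v5.V4S(), v10.V4S(), 0);       // Update A..D.
    masm->Sm3tt2a(v9.V4S(), v5.V4S(), v3.V4S(), 0);        // Update E..H.
  }

The "b"-type forms (Sm3tt1b/Sm3tt2b) cover rounds 16 to 63, and Sm3partw1/Sm3partw2 compute the message-schedule extension between round groups.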
diff --git a/src/aarch64/assembler-aarch64.cc b/src/aarch64/assembler-aarch64.cc
index de2dbc9..534c1f7 100644
--- a/src/aarch64/assembler-aarch64.cc
+++ b/src/aarch64/assembler-aarch64.cc
@@ -6053,6 +6053,70 @@
Emit(0x4e286800 | Rd(vd) | Rn(vn));
}
+void Assembler::sm3partw1(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm) {
+ VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+ VIXL_ASSERT(CPUHas(CPUFeatures::kSM3));
+ VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S());
+
+ Emit(0xce60c000 | Rd(vd) | Rn(vn) | Rm(vm));
+}
+
+void Assembler::sm3partw2(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm) {
+ VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+ VIXL_ASSERT(CPUHas(CPUFeatures::kSM3));
+ VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S());
+
+ Emit(0xce60c400 | Rd(vd) | Rn(vn) | Rm(vm));
+}
+
+void Assembler::sm3ss1(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm,
+ const VRegister& va) {
+ VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+ VIXL_ASSERT(CPUHas(CPUFeatures::kSM3));
+ VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S() && va.Is4S());
+
+ Emit(0xce400000 | Rd(vd) | Rn(vn) | Rm(vm) | Ra(va));
+}
+
+void Assembler::sm3tt1a(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm,
+ int index) {
+ VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+ VIXL_ASSERT(CPUHas(CPUFeatures::kSM3));
+ VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S());
+ VIXL_ASSERT(IsUint2(index));
+
+ Instr i = static_cast<uint32_t>(index) << 12;
+ Emit(0xce408000 | Rd(vd) | Rn(vn) | Rm(vm) | i);
+}
+
+void Assembler::sm3tt1b(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm,
+ int index) {
+ VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+ VIXL_ASSERT(CPUHas(CPUFeatures::kSM3));
+ VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S());
+ VIXL_ASSERT(IsUint2(index));
+
+ Instr i = static_cast<uint32_t>(index) << 12;
+ Emit(0xce408400 | Rd(vd) | Rn(vn) | Rm(vm) | i);
+}
+
+void Assembler::sm3tt2a(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm,
+ int index) {
+ VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+ VIXL_ASSERT(CPUHas(CPUFeatures::kSM3));
+ VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S());
+ VIXL_ASSERT(IsUint2(index));
+
+ Instr i = static_cast<uint32_t>(index) << 12;
+ Emit(0xce408800 | Rd(vd) | Rn(vn) | Rm(vm) | i);
+}
+
+void Assembler::sm3tt2b(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm,
+ int index) {
+ VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+ VIXL_ASSERT(CPUHas(CPUFeatures::kSM3));
+ VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S());
+ VIXL_ASSERT(IsUint2(index));
+
+ Instr i = static_cast<uint32_t>(index) << 12;
+ Emit(0xce408c00 | Rd(vd) | Rn(vn) | Rm(vm) | i);
+}
+
// Note:
// For all ToImm instructions below, a difference in case
// for the same letter indicates a negated bit.
diff --git a/src/aarch64/assembler-aarch64.h b/src/aarch64/assembler-aarch64.h
index cc4b346..c1e4e6a 100644
--- a/src/aarch64/assembler-aarch64.h
+++ b/src/aarch64/assembler-aarch64.h
@@ -3696,6 +3696,42 @@
// AES mix columns.
void aesmc(const VRegister& vd, const VRegister& vn);
+ // SM3PARTW1.
+ void sm3partw1(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm);
+
+ // SM3PARTW2.
+ void sm3partw2(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm);
+
+ // SM3SS1.
+ void sm3ss1(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm,
+ const VRegister& va);
+
+ // SM3TT1A.
+ void sm3tt1a(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm,
+ int index);
+
+ // SM3TT1B.
+ void sm3tt1b(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm,
+ int index);
+
+ // SM3TT2A.
+ void sm3tt2a(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm,
+ int index);
+
+ // SM3TT2B.
+ void sm3tt2b(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm,
+ int index);
+
// Scalable Vector Extensions.
// Absolute value (predicated).
diff --git a/src/aarch64/cpu-features-auditor-aarch64.cc b/src/aarch64/cpu-features-auditor-aarch64.cc
index d815924..66d29f0 100644
--- a/src/aarch64/cpu-features-auditor-aarch64.cc
+++ b/src/aarch64/cpu-features-auditor-aarch64.cc
@@ -279,6 +279,12 @@
USE(instr);
}
+void CPUFeaturesAuditor::VisitCryptoSM3(const Instruction* instr) {
+ RecordInstructionFeaturesScope scope(this);
+ scope.Record(CPUFeatures::kNEON, CPUFeatures::kSM3);
+ USE(instr);
+}
+
void CPUFeaturesAuditor::VisitDataProcessing1Source(const Instruction* instr) {
RecordInstructionFeaturesScope scope(this);
switch (instr->Mask(DataProcessing1SourceMask)) {
diff --git a/src/aarch64/cpu-features-auditor-aarch64.h b/src/aarch64/cpu-features-auditor-aarch64.h
index 67de644..2496756 100644
--- a/src/aarch64/cpu-features-auditor-aarch64.h
+++ b/src/aarch64/cpu-features-auditor-aarch64.h
@@ -113,6 +113,7 @@
#define DECLARE(A) virtual void Visit##A(const Instruction* instr);
VISITOR_LIST(DECLARE)
#undef DECLARE
+ void VisitCryptoSM3(const Instruction* instr);
void LoadStoreHelper(const Instruction* instr);
void LoadStorePairHelper(const Instruction* instr);
diff --git a/src/aarch64/decoder-visitor-map-aarch64.h b/src/aarch64/decoder-visitor-map-aarch64.h
index db51cad..b4b39f5 100644
--- a/src/aarch64/decoder-visitor-map-aarch64.h
+++ b/src/aarch64/decoder-visitor-map-aarch64.h
@@ -2656,13 +2656,13 @@
{"ldtrsw_64_ldst_unpriv"_h, &VISITORCLASS::VisitUnimplemented}, \
{"ldtr_32_ldst_unpriv"_h, &VISITORCLASS::VisitUnimplemented}, \
{"ldtr_64_ldst_unpriv"_h, &VISITORCLASS::VisitUnimplemented}, \
- {"sm3partw1_vvv4_cryptosha512_3"_h, &VISITORCLASS::VisitUnimplemented}, \
- {"sm3partw2_vvv4_cryptosha512_3"_h, &VISITORCLASS::VisitUnimplemented}, \
- {"sm3ss1_vvv4_crypto4"_h, &VISITORCLASS::VisitUnimplemented}, \
- {"sm3tt1a_vvv4_crypto3_imm2"_h, &VISITORCLASS::VisitUnimplemented}, \
- {"sm3tt1b_vvv4_crypto3_imm2"_h, &VISITORCLASS::VisitUnimplemented}, \
- {"sm3tt2a_vvv4_crypto3_imm2"_h, &VISITORCLASS::VisitUnimplemented}, \
- {"sm3tt2b_vvv_crypto3_imm2"_h, &VISITORCLASS::VisitUnimplemented}, \
+ {"sm3partw1_vvv4_cryptosha512_3"_h, &VISITORCLASS::VisitCryptoSM3}, \
+ {"sm3partw2_vvv4_cryptosha512_3"_h, &VISITORCLASS::VisitCryptoSM3}, \
+ {"sm3ss1_vvv4_crypto4"_h, &VISITORCLASS::VisitCryptoSM3}, \
+ {"sm3tt1a_vvv4_crypto3_imm2"_h, &VISITORCLASS::VisitCryptoSM3}, \
+ {"sm3tt1b_vvv4_crypto3_imm2"_h, &VISITORCLASS::VisitCryptoSM3}, \
+ {"sm3tt2a_vvv4_crypto3_imm2"_h, &VISITORCLASS::VisitCryptoSM3}, \
+ {"sm3tt2b_vvv_crypto3_imm2"_h, &VISITORCLASS::VisitCryptoSM3}, \
{"sm4ekey_vvv4_cryptosha512_3"_h, &VISITORCLASS::VisitUnimplemented}, \
{"sm4e_vv4_cryptosha512_2"_h, &VISITORCLASS::VisitUnimplemented}, \
{"st64b_64l_memop"_h, &VISITORCLASS::VisitUnimplemented}, \
diff --git a/src/aarch64/disasm-aarch64.cc b/src/aarch64/disasm-aarch64.cc
index f03b257..ebfc2c7 100644
--- a/src/aarch64/disasm-aarch64.cc
+++ b/src/aarch64/disasm-aarch64.cc
@@ -2204,6 +2204,25 @@
FormatWithDecodedMnemonic(instr, "'Vd.16b, 'Vn.16b");
}
+void Disassembler::VisitCryptoSM3(const Instruction *instr) {
+ const char *form = "'Vd.4s, 'Vn.4s, 'Vm.";
+ const char *suffix = "4s";
+
+ switch (form_hash_) {
+ case "sm3ss1_vvv4_crypto4"_h:
+ suffix = "4s, 'Va.4s";
+ break;
+ case "sm3tt1a_vvv4_crypto3_imm2"_h:
+ case "sm3tt1b_vvv4_crypto3_imm2"_h:
+ case "sm3tt2a_vvv4_crypto3_imm2"_h:
+ case "sm3tt2b_vvv_crypto3_imm2"_h:
+ suffix = "s['u1312]";
+ break;
+ }
+
+ FormatWithDecodedMnemonic(instr, form, suffix);
+}
+
void Disassembler::DisassembleSHA512(const Instruction *instr) {
const char *form = "'Qd, 'Qn, 'Vm.2d";
const char *suffix = NULL;
diff --git a/src/aarch64/disasm-aarch64.h b/src/aarch64/disasm-aarch64.h
index 574d4f4..b139c4c 100644
--- a/src/aarch64/disasm-aarch64.h
+++ b/src/aarch64/disasm-aarch64.h
@@ -243,6 +243,8 @@
void Disassemble_Xd_XnSP_Xm(const Instruction* instr);
void Disassemble_Xd_XnSP_XmSP(const Instruction* instr);
+ void VisitCryptoSM3(const Instruction* instr);
+
void Format(const Instruction* instr,
const char* mnemonic,
const char* format0,
diff --git a/src/aarch64/logic-aarch64.cc b/src/aarch64/logic-aarch64.cc
index 9a81e49..4d50568 100644
--- a/src/aarch64/logic-aarch64.cc
+++ b/src/aarch64/logic-aarch64.cc
@@ -7895,17 +7895,17 @@
}
template <>
-uint64_t SHA1Operation<"choose"_h>(uint64_t x, uint64_t y, uint64_t z) {
+uint64_t CryptoOp<"choose"_h>(uint64_t x, uint64_t y, uint64_t z) {
return ((y ^ z) & x) ^ z;
}
template <>
-uint64_t SHA1Operation<"majority"_h>(uint64_t x, uint64_t y, uint64_t z) {
+uint64_t CryptoOp<"majority"_h>(uint64_t x, uint64_t y, uint64_t z) {
return (x & y) | ((x | y) & z);
}
template <>
-uint64_t SHA1Operation<"parity"_h>(uint64_t x, uint64_t y, uint64_t z) {
+uint64_t CryptoOp<"parity"_h>(uint64_t x, uint64_t y, uint64_t z) {
return x ^ y ^ z;
}
@@ -7932,8 +7932,8 @@
}
for (unsigned i = 0; i < ArrayLength(x); i++) {
- uint64_t chs = SHA1Operation<"choose"_h>(y[0], y[1], y[2]);
- uint64_t maj = SHA1Operation<"majority"_h>(x[0], x[1], x[2]);
+ uint64_t chs = CryptoOp<"choose"_h>(y[0], y[1], y[2]);
+ uint64_t maj = CryptoOp<"majority"_h>(x[0], x[1], x[2]);
uint64_t w = src2.Uint(kFormat4S, i);
uint64_t t = y[3] + SHASigma<uint32_t, 6, 11, 25>(y[0]) + chs + w;
@@ -8351,6 +8351,125 @@
return dst;
}
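+// Emulation helpers for the SM3 instructions. ROL is a 32-bit rotate left;
+// P0(X) = X ^ ROL(X, 9) ^ ROL(X, 17) and P1(X) = X ^ ROL(X, 15) ^
+// ROL(X, 23) are the permutation functions from the SM3 specification.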
+LogicVRegister Simulator::sm3partw1(LogicVRegister srcdst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2) {
+ using namespace std::placeholders;
+ auto ROL = std::bind(RotateLeft, _1, _2, kSRegSize);
+
+ SimVRegister temp;
+
+ ext(kFormat16B, temp, src2, temp, 4);
+ rol(kFormat4S, temp, temp, 15);
+ eor(kFormat4S, temp, temp, src1);
+ LogicVRegister r = eor(kFormat4S, temp, temp, srcdst);
+
+ uint64_t result[4] = {};
+ r.UintArray(kFormat4S, result);
+ for (int i = 0; i < 4; i++) {
+ if (i == 3) {
+ // result[3] already contains srcdst[3] ^ src1[3] from the operations
+ // above.
+ result[i] ^= ROL(result[0], 15);
+ }
+ result[i] ^= ROL(result[i], 15) ^ ROL(result[i], 23);
+ }
+ srcdst.SetUintArray(kFormat4S, result);
+ return srcdst;
+}
+
+LogicVRegister Simulator::sm3partw2(LogicVRegister srcdst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2) {
+ using namespace std::placeholders;
+ auto ROL = std::bind(RotateLeft, _1, _2, kSRegSize);
+
+ SimVRegister temp;
+ VectorFormat vf = kFormat4S;
+
+ rol(vf, temp, src2, 7);
+ LogicVRegister r = eor(vf, temp, temp, src1);
+ eor(vf, srcdst, temp, srcdst);
+
+ uint64_t tmp2 = ROL(r.Uint(vf, 0), 15);
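+ // Apply the P1 permutation: P1(X) = X ^ ROL(X, 15) ^ ROL(X, 23).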
+ tmp2 ^= ROL(tmp2, 15) ^ ROL(tmp2, 23);
+ srcdst.SetUint(vf, 3, srcdst.Uint(vf, 3) ^ tmp2);
+ return srcdst;
+}
+
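+// Compute SS1 = ROL(ROL(n[3], 12) + m[3] + a[3], 7) into lane 3 of dst;
+// lanes 0-2 are cleared.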
+LogicVRegister Simulator::sm3ss1(LogicVRegister dst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2,
+ const LogicVRegister& src3) {
+ using namespace std::placeholders;
+ auto ROL = std::bind(RotateLeft, _1, _2, kSRegSize);
+
+ VectorFormat vf = kFormat4S;
+ uint64_t result = ROL(src1.Uint(vf, 3), 12);
+ result += src2.Uint(vf, 3) + src3.Uint(vf, 3);
+ dst.Clear();
+ dst.SetUint(vf, 3, ROL(result, 7));
+ return dst;
+}
+
+LogicVRegister Simulator::sm3tt1(LogicVRegister srcdst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2,
+ int index,
+ bool is_a) {
+ VectorFormat vf = kFormat4S;
+ using namespace std::placeholders;
+ auto ROL = std::bind(RotateLeft, _1, _2, kSRegSize);
+ auto sd = std::bind(&LogicVRegister::Uint, srcdst, vf, _1);
+
+ VIXL_ASSERT(IsUint2(index));
+
+ uint64_t wjprime = src2.Uint(vf, index);
+ uint64_t ss2 = src1.Uint(vf, 3) ^ ROL(sd(3), 12);
+
+ uint64_t tt1;
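+ // The "A" form uses the parity function, as in SM3 rounds 0-15; the "B"
+ // form uses majority, as in rounds 16-63.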
+ if (is_a) {
+ tt1 = CryptoOp<"parity"_h>(sd(1), sd(2), sd(3));
+ } else {
+ tt1 = CryptoOp<"majority"_h>(sd(1), sd(2), sd(3));
+ }
+ tt1 += sd(0) + ss2 + wjprime;
+
+ ext(kFormat16B, srcdst, srcdst, srcdst, 4);
+ srcdst.SetUint(vf, 1, ROL(sd(1), 9));
+ srcdst.SetUint(vf, 3, tt1);
+ return srcdst;
+}
+
+LogicVRegister Simulator::sm3tt2(LogicVRegister srcdst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2,
+ int index,
+ bool is_a) {
+ VectorFormat vf = kFormat4S;
+ using namespace std::placeholders;
+ auto ROL = std::bind(RotateLeft, _1, _2, kSRegSize);
+ auto sd = std::bind(&LogicVRegister::Uint, srcdst, vf, _1);
+
+ VIXL_ASSERT(IsUint2(index));
+
+ uint64_t wj = src2.Uint(vf, index);
+
+ uint64_t tt2;
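+ // The "A" form uses the parity function, as in SM3 rounds 0-15; the "B"
+ // form uses choose, as in rounds 16-63.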
+ if (is_a) {
+ tt2 = CryptoOp<"parity"_h>(sd(1), sd(2), sd(3));
+ } else {
+ tt2 = CryptoOp<"choose"_h>(sd(3), sd(2), sd(1));
+ }
+ tt2 += sd(0) + src1.Uint(vf, 3) + wj;
+
+ ext(kFormat16B, srcdst, srcdst, srcdst, 4);
+ srcdst.SetUint(vf, 1, ROL(sd(1), 19));
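+ // Apply the P0 permutation: P0(X) = X ^ ROL(X, 9) ^ ROL(X, 17).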
+ tt2 ^= ROL(tt2, 9) ^ ROL(tt2, 17);
+ srcdst.SetUint(vf, 3, tt2);
+ return srcdst;
+}
+
} // namespace aarch64
} // namespace vixl
diff --git a/src/aarch64/macro-assembler-aarch64.h b/src/aarch64/macro-assembler-aarch64.h
index a989ddd..b74be35 100644
--- a/src/aarch64/macro-assembler-aarch64.h
+++ b/src/aarch64/macro-assembler-aarch64.h
@@ -2812,6 +2812,8 @@
V(sha512su1, Sha512su1) \
V(shadd, Shadd) \
V(shsub, Shsub) \
+ V(sm3partw1, Sm3partw1) \
+ V(sm3partw2, Sm3partw2) \
V(smax, Smax) \
V(smaxp, Smaxp) \
V(smin, Smin) \
@@ -3052,7 +3054,11 @@
V(umlsl, Umlsl) \
V(umlsl2, Umlsl2) \
V(sudot, Sudot) \
- V(usdot, Usdot)
+ V(usdot, Usdot) \
+ V(sm3tt1a, Sm3tt1a) \
+ V(sm3tt1b, Sm3tt1b) \
+ V(sm3tt2a, Sm3tt2a) \
+ V(sm3tt2b, Sm3tt2b)
#define DEFINE_MACRO_ASM_FUNC(ASM, MASM) \
@@ -3523,6 +3529,14 @@
SingleEmissionCheckScope guard(this);
st4(vt, vt2, vt3, vt4, lane, dst);
}
+ void Sm3ss1(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm,
+ const VRegister& va) {
+ VIXL_ASSERT(allow_macro_instructions_);
+ SingleEmissionCheckScope guard(this);
+ sm3ss1(vd, vn, vm, va);
+ }
void Smov(const Register& rd, const VRegister& vn, int vn_index) {
VIXL_ASSERT(allow_macro_instructions_);
SingleEmissionCheckScope guard(this);
diff --git a/src/aarch64/simulator-aarch64.cc b/src/aarch64/simulator-aarch64.cc
index e63715c..83d1649 100644
--- a/src/aarch64/simulator-aarch64.cc
+++ b/src/aarch64/simulator-aarch64.cc
@@ -7261,6 +7261,39 @@
}
}
+void Simulator::VisitCryptoSM3(const Instruction* instr) {
+ SimVRegister& rd = ReadVRegister(instr->GetRd());
+ SimVRegister& rn = ReadVRegister(instr->GetRn());
+ SimVRegister& rm = ReadVRegister(instr->GetRm());
+ SimVRegister& ra = ReadVRegister(instr->GetRa());
+ int index = instr->ExtractBits(13, 12);
+
+ bool is_a = false;
+ switch (form_hash_) {
+ case "sm3partw1_vvv4_cryptosha512_3"_h:
+ sm3partw1(rd, rn, rm);
+ break;
+ case "sm3partw2_vvv4_cryptosha512_3"_h:
+ sm3partw2(rd, rn, rm);
+ break;
+ case "sm3ss1_vvv4_crypto4"_h:
+ sm3ss1(rd, rn, rm, ra);
+ break;
+ case "sm3tt1a_vvv4_crypto3_imm2"_h:
+ is_a = true;
+ VIXL_FALLTHROUGH();
+ case "sm3tt1b_vvv4_crypto3_imm2"_h:
+ sm3tt1(rd, rn, rm, index, is_a);
+ break;
+ case "sm3tt2a_vvv4_crypto3_imm2"_h:
+ is_a = true;
+ VIXL_FALLTHROUGH();
+ case "sm3tt2b_vvv_crypto3_imm2"_h:
+ sm3tt2(rd, rn, rm, index, is_a);
+ break;
+ }
+}
+
void Simulator::SimulateSHA512(const Instruction* instr) {
SimVRegister& rd = ReadVRegister(instr->GetRd());
SimVRegister& rn = ReadVRegister(instr->GetRn());
diff --git a/src/aarch64/simulator-aarch64.h b/src/aarch64/simulator-aarch64.h
index c20ec3c..6e36246 100644
--- a/src/aarch64/simulator-aarch64.h
+++ b/src/aarch64/simulator-aarch64.h
@@ -1283,7 +1283,7 @@
class Debugger;
template <uint32_t mode>
-uint64_t SHA1Operation(uint64_t x, uint64_t y, uint64_t z);
+uint64_t CryptoOp(uint64_t x, uint64_t y, uint64_t z);
class Simulator : public DecoderVisitor {
public:
@@ -1532,6 +1532,7 @@
void SimulateUnsignedMinMax(const Instruction* instr);
void SimulateSHA512(const Instruction* instr);
+ void VisitCryptoSM3(const Instruction* instr);
// Integer register accessors.
@@ -4518,7 +4519,7 @@
srcdst.UintArray(kFormat4S, sd);
for (unsigned i = 0; i < ArrayLength(sd); i++) {
- uint64_t t = SHA1Operation<mode>(sd[1], sd[2], sd[3]);
+ uint64_t t = CryptoOp<mode>(sd[1], sd[2], sd[3]);
y += RotateLeft(sd[0], 5, kSRegSize) + t;
y += src2.Uint(kFormat4S, i);
@@ -4561,6 +4562,27 @@
const LogicVRegister& src1,
bool inverse);
+ LogicVRegister sm3partw1(LogicVRegister dst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2);
+ LogicVRegister sm3partw2(LogicVRegister dst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2);
+ LogicVRegister sm3ss1(LogicVRegister dst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2,
+ const LogicVRegister& src3);
+ LogicVRegister sm3tt1(LogicVRegister srcdst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2,
+ int index,
+ bool is_a);
+ LogicVRegister sm3tt2(LogicVRegister srcdst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2,
+ int index,
+ bool is_a);
+
#define NEON_3VREG_LOGIC_LIST(V) \
V(addhn) \
V(addhn2) \
diff --git a/test/aarch64/test-cpu-features-aarch64.cc b/test/aarch64/test-cpu-features-aarch64.cc
index a14c524..5cc2a58 100644
--- a/test/aarch64/test-cpu-features-aarch64.cc
+++ b/test/aarch64/test-cpu-features-aarch64.cc
@@ -3835,5 +3835,18 @@
TEST_FEAT(aesmc_0, aesmc(v0.V16B(), v29.V16B()))
#undef TEST_FEAT
+#define TEST_FEAT(NAME, ASM) \
+ TEST_TEMPLATE(CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSM3), \
+ NEON_SM3_##NAME, \
+ ASM)
+TEST_FEAT(sm3partw1_0, sm3partw1(v12.V4S(), v13.V4S(), v14.V4S()))
+TEST_FEAT(sm3partw2_0, sm3partw2(v12.V4S(), v13.V4S(), v14.V4S()))
+TEST_FEAT(sm3ss1_0, sm3ss1(v13.V4S(), v15.V4S(), v17.V4S(), v21.V4S()))
+TEST_FEAT(sm3tt1a_0, sm3tt1a(v30.V4S(), v29.V4S(), v9.V4S(), 1))
+TEST_FEAT(sm3tt1b_0, sm3tt1b(v30.V4S(), v29.V4S(), v9.V4S(), 3))
+TEST_FEAT(sm3tt2a_0, sm3tt2a(v30.V4S(), v29.V4S(), v9.V4S(), 2))
+TEST_FEAT(sm3tt2b_0, sm3tt2b(v30.V4S(), v29.V4S(), v9.V4S(), 0))
+#undef TEST_FEAT
+
} // namespace aarch64
} // namespace vixl
diff --git a/test/aarch64/test-disasm-aarch64.cc b/test/aarch64/test-disasm-aarch64.cc
index 25820f6..6a29ffe 100644
--- a/test/aarch64/test-disasm-aarch64.cc
+++ b/test/aarch64/test-disasm-aarch64.cc
@@ -3802,15 +3802,15 @@
COMPARE_PREFIX(dci(0xcec08000), "sha512su0"); // SHA512SU0_VV2_cryptosha512_2
// ARMv8.2 - SM3
- // COMPARE_PREFIX(dci(0xce400000), "sm3ss1"); // SM3SS1_VVV4_crypto4
- // COMPARE_PREFIX(dci(0xce408000), "sm3tt1a"); // SM3TT1A_VVV4_crypto3_imm2
- // COMPARE_PREFIX(dci(0xce408400), "sm3tt1b"); // SM3TT1B_VVV4_crypto3_imm2
- // COMPARE_PREFIX(dci(0xce408800), "sm3tt2a"); // SM3TT2A_VVV4_crypto3_imm2
- // COMPARE_PREFIX(dci(0xce408c00), "sm3tt2b"); // SM3TT2B_VVV_crypto3_imm2
- // COMPARE_PREFIX(dci(0xce60c000), "sm3partw1"); //
- // SM3PARTW1_VVV4_cryptosha512_3
- // COMPARE_PREFIX(dci(0xce60c400), "sm3partw2"); //
- // SM3PARTW2_VVV4_cryptosha512_3
+ COMPARE_PREFIX(dci(0xce400000), "sm3ss1"); // SM3SS1_VVV4_crypto4
+ COMPARE_PREFIX(dci(0xce408000), "sm3tt1a"); // SM3TT1A_VVV4_crypto3_imm2
+ COMPARE_PREFIX(dci(0xce408400), "sm3tt1b"); // SM3TT1B_VVV4_crypto3_imm2
+ COMPARE_PREFIX(dci(0xce408800), "sm3tt2a"); // SM3TT2A_VVV4_crypto3_imm2
+ COMPARE_PREFIX(dci(0xce408c00), "sm3tt2b"); // SM3TT2B_VVV_crypto3_imm2
+ COMPARE_PREFIX(dci(0xce60c000),
+ "sm3partw1"); // SM3PARTW1_VVV4_cryptosha512_3
+ COMPARE_PREFIX(dci(0xce60c400),
+ "sm3partw2"); // SM3PARTW2_VVV4_cryptosha512_3
// ARMv8.2 - SM4
// COMPARE_PREFIX(dci(0xce60c800), "sm4ekey"); //
diff --git a/test/aarch64/test-disasm-neon-aarch64.cc b/test/aarch64/test-disasm-neon-aarch64.cc
index 5bdded9..26ecf4c 100644
--- a/test/aarch64/test-disasm-neon-aarch64.cc
+++ b/test/aarch64/test-disasm-neon-aarch64.cc
@@ -4579,6 +4579,27 @@
CLEANUP();
}
+TEST(neon_sm3) {
+ SETUP();
+
+ COMPARE_MACRO(Sm3partw1(v12.V4S(), v13.V4S(), v14.V4S()),
+ "sm3partw1 v12.4s, v13.4s, v14.4s");
+ COMPARE_MACRO(Sm3partw2(v12.V4S(), v13.V4S(), v14.V4S()),
+ "sm3partw2 v12.4s, v13.4s, v14.4s");
+ COMPARE_MACRO(Sm3ss1(v13.V4S(), v15.V4S(), v17.V4S(), v21.V4S()),
+ "sm3ss1 v13.4s, v15.4s, v17.4s, v21.4s");
+ COMPARE_MACRO(Sm3tt1a(v30.V4S(), v29.V4S(), v9.V4S(), 1),
+ "sm3tt1a v30.4s, v29.4s, v9.s[1]");
+ COMPARE_MACRO(Sm3tt1b(v30.V4S(), v29.V4S(), v9.V4S(), 3),
+ "sm3tt1b v30.4s, v29.4s, v9.s[3]");
+ COMPARE_MACRO(Sm3tt2a(v30.V4S(), v29.V4S(), v9.V4S(), 2),
+ "sm3tt2a v30.4s, v29.4s, v9.s[2]");
+ COMPARE_MACRO(Sm3tt2b(v30.V4S(), v29.V4S(), v9.V4S(), 0),
+ "sm3tt2b v30.4s, v29.4s, v9.s[0]");
+
+ CLEANUP();
+}
+
TEST(neon_unallocated_regression_test) {
SETUP();
diff --git a/test/aarch64/test-simulator-sve-aarch64.cc b/test/aarch64/test-simulator-sve-aarch64.cc
index e62fea2..585c00c 100644
--- a/test/aarch64/test-simulator-sve-aarch64.cc
+++ b/test/aarch64/test-simulator-sve-aarch64.cc
@@ -1534,5 +1534,341 @@
}
}
+TEST_SVE(neon_sm3) {
+ SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
+ CPUFeatures::kNEON,
+ CPUFeatures::kCRC32,
+ CPUFeatures::kSM3);
+ START();
+
+ SetInitialMachineState(&masm);
+ // state = 0xe2bd2480
+
+ {
+ ExactAssemblyScope scope(&masm, 10 * kInstructionSize);
+ __ dci(0xce591017); // sm3ss1 v23.4s, v0.4s, v25.4s, v4.4s
+ // vl128 state = 0xad4bba0a
+ __ dci(0xce49121f); // sm3ss1 v31.4s, v16.4s, v9.4s, v4.4s
+ // vl128 state = 0x84adef21
+ __ dci(0xce49121e); // sm3ss1 v30.4s, v16.4s, v9.4s, v4.4s
+ // vl128 state = 0xccfd7e5a
+ __ dci(0xce49301a); // sm3ss1 v26.4s, v0.4s, v9.4s, v12.4s
+ // vl128 state = 0x60833cc7
+ __ dci(0xce49720a); // sm3ss1 v10.4s, v16.4s, v9.4s, v28.4s
+ // vl128 state = 0x03f03263
+ __ dci(0xce58721a); // sm3ss1 v26.4s, v16.4s, v24.4s, v28.4s
+ // vl128 state = 0x31845f40
+ __ dci(0xce58702a); // sm3ss1 v10.4s, v1.4s, v24.4s, v28.4s
+ // vl128 state = 0x54c64f70
+ __ dci(0xce58753a); // sm3ss1 v26.4s, v9.4s, v24.4s, v29.4s
+ // vl128 state = 0x3d5cb04f
+ __ dci(0xce507518); // sm3ss1 v24.4s, v8.4s, v16.4s, v29.4s
+ // vl128 state = 0xe02de221
+ __ dci(0xce406519); // sm3ss1 v25.4s, v8.4s, v0.4s, v25.4s
+ // vl128 state = 0x73d36ae8
+ }
+
+ uint32_t state;
+ ComputeMachineStateHash(&masm, &state);
+ __ Mov(x0, reinterpret_cast<uint64_t>(&state));
+ __ Ldr(w0, MemOperand(x0));
+
+ END();
+ if (CAN_RUN()) {
+ RUN();
+ uint32_t expected_hashes[] = {
+ 0x73d36ae8,
+ 0xcbcda2db,
+ 0x6ee9ad3d,
+ 0xa6857a16,
+ 0xa238ec05,
+ 0x1bc82d1d,
+ 0xe4530773,
+ 0xfb0d092e,
+ 0xe62aff0a,
+ 0xf56a593f,
+ 0x3967d590,
+ 0xebcd14a0,
+ 0xa7bedcb8,
+ 0x867fa43c,
+ 0x1679eab5,
+ 0x0a836861,
+ };
+ ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0);
+ }
+}
+
+TEST_SVE(neon_sm3partw12) {
+ SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
+ CPUFeatures::kNEON,
+ CPUFeatures::kCRC32,
+ CPUFeatures::kSM3);
+ START();
+
+ SetInitialMachineState(&masm);
+ // state = 0xe2bd2480
+
+ {
+ ExactAssemblyScope scope(&masm, 30 * kInstructionSize);
+ __ dci(0xce70c201); // sm3partw1 v1.4s, v16.4s, v16.4s
+ // vl128 state = 0x6f2069a6
+ __ dci(0xce72c303); // sm3partw1 v3.4s, v24.4s, v18.4s
+ // vl128 state = 0x986fa56c
+ __ dci(0xce76c381); // sm3partw1 v1.4s, v28.4s, v22.4s
+ // vl128 state = 0x5dbd953c
+ __ dci(0xce7ec3b1); // sm3partw1 v17.4s, v29.4s, v30.4s
+ // vl128 state = 0xc72ccca5
+ __ dci(0xce7ac1b5); // sm3partw1 v21.4s, v13.4s, v26.4s
+ // vl128 state = 0x33cdfd6a
+ __ dci(0xce7ac1b7); // sm3partw1 v23.4s, v13.4s, v26.4s
+ // vl128 state = 0x4303e945
+ __ dci(0xce7ac1bf); // sm3partw1 v31.4s, v13.4s, v26.4s
+ // vl128 state = 0x56acac84
+ __ dci(0xce78c1fd); // sm3partw1 v29.4s, v15.4s, v24.4s
+ // vl128 state = 0x5e2a2793
+ __ dci(0xce78c5df); // sm3partw2 v31.4s, v14.4s, v24.4s
+ // vl128 state = 0xf7c457f3
+ __ dci(0xce70c55d); // sm3partw2 v29.4s, v10.4s, v16.4s
+ // vl128 state = 0xfa3557ac
+ __ dci(0xce60c159); // sm3partw1 v25.4s, v10.4s, v0.4s
+ // vl128 state = 0xb3ae6830
+ __ dci(0xce62c55b); // sm3partw2 v27.4s, v10.4s, v2.4s
+ // vl128 state = 0xa7747c70
+ __ dci(0xce66c753); // sm3partw2 v19.4s, v26.4s, v6.4s
+ // vl128 state = 0xb55f5895
+ __ dci(0xce67c551); // sm3partw2 v17.4s, v10.4s, v7.4s
+ // vl128 state = 0x519b1342
+ __ dci(0xce65c750); // sm3partw2 v16.4s, v26.4s, v5.4s
+ // vl128 state = 0xc4e6e4b9
+ __ dci(0xce61c718); // sm3partw2 v24.4s, v24.4s, v1.4s
+ // vl128 state = 0x127c483c
+ __ dci(0xce61c71c); // sm3partw2 v28.4s, v24.4s, v1.4s
+ // vl128 state = 0x92783ecc
+ __ dci(0xce6dc714); // sm3partw2 v20.4s, v24.4s, v13.4s
+ // vl128 state = 0xe11e87d3
+ __ dci(0xce65c756); // sm3partw2 v22.4s, v26.4s, v5.4s
+ // vl128 state = 0x8b6878d0
+ __ dci(0xce65c5d2); // sm3partw2 v18.4s, v14.4s, v5.4s
+ // vl128 state = 0xf2fb1e86
+ __ dci(0xce64c550); // sm3partw2 v16.4s, v10.4s, v4.4s
+ // vl128 state = 0x73ad3b0f
+ __ dci(0xce66c578); // sm3partw2 v24.4s, v11.4s, v6.4s
+ // vl128 state = 0x7e03900d
+ __ dci(0xce76c55c); // sm3partw2 v28.4s, v10.4s, v22.4s
+ // vl128 state = 0x1d0b5df6
+ __ dci(0xce76c54c); // sm3partw2 v12.4s, v10.4s, v22.4s
+ // vl128 state = 0x1a3d7a77
+ __ dci(0xce7ec448); // sm3partw2 v8.4s, v2.4s, v30.4s
+ // vl128 state = 0x3ed2e4bd
+ __ dci(0xce6ec409); // sm3partw2 v9.4s, v0.4s, v14.4s
+ // vl128 state = 0x826dd348
+ __ dci(0xce6ec52b); // sm3partw2 v11.4s, v9.4s, v14.4s
+ // vl128 state = 0x3ff5e482
+ __ dci(0xce66c72f); // sm3partw2 v15.4s, v25.4s, v6.4s
+ // vl128 state = 0x6fd24cd4
+ __ dci(0xce65c73f); // sm3partw2 v31.4s, v25.4s, v5.4s
+ // vl128 state = 0xd51ac474
+ __ dci(0xce67c77b); // sm3partw2 v27.4s, v27.4s, v7.4s
+ // vl128 state = 0x720d7419
+ }
+
+ uint32_t state;
+ ComputeMachineStateHash(&masm, &state);
+ __ Mov(x0, reinterpret_cast<uint64_t>(&state));
+ __ Ldr(w0, MemOperand(x0));
+
+ END();
+ if (CAN_RUN()) {
+ RUN();
+ uint32_t expected_hashes[] = {
+ 0x720d7419,
+ 0x31445e06,
+ 0xd2aee240,
+ 0x45a27e4b,
+ 0xd6c46f08,
+ 0xcaed7f9e,
+ 0x734820c7,
+ 0x377e1f38,
+ 0x12e03585,
+ 0x1b9cbe63,
+ 0x1d58d49a,
+ 0xc160a9dc,
+ 0x22c2fe25,
+ 0x86b7af0f,
+ 0xfeae7bf5,
+ 0xf8dfcc40,
+ };
+ ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0);
+ }
+}
+
+TEST_SVE(neon_sm3tt1) {
+ SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
+ CPUFeatures::kNEON,
+ CPUFeatures::kCRC32,
+ CPUFeatures::kSM3);
+ START();
+
+ SetInitialMachineState(&masm);
+ // state = 0xe2bd2480
+
+ {
+ ExactAssemblyScope scope(&masm, 20 * kInstructionSize);
+ __ dci(0xce53a363); // sm3tt1a v3.4s, v27.4s, v19.s[2]
+ // vl128 state = 0xaaa8c715
+ __ dci(0xce58a7a7); // sm3tt1b v7.4s, v29.4s, v24.s[2]
+ // vl128 state = 0xb99a301d
+ __ dci(0xce5eb2b7); // sm3tt1a v23.4s, v21.4s, v30.s[3]
+ // vl128 state = 0xe8dabe99
+ __ dci(0xce43b6ce); // sm3tt1b v14.4s, v22.4s, v3.s[3]
+ // vl128 state = 0xaa498ae5
+ __ dci(0xce448027); // sm3tt1a v7.4s, v1.4s, v4.s[0]
+ // vl128 state = 0x32093547
+ __ dci(0xce4286d8); // sm3tt1b v24.4s, v22.4s, v2.s[0]
+ // vl128 state = 0xe03e3a81
+ __ dci(0xce44a0f3); // sm3tt1a v19.4s, v7.4s, v4.s[2]
+ // vl128 state = 0xcb555b4a
+ __ dci(0xce418233); // sm3tt1a v19.4s, v17.4s, v1.s[0]
+ // vl128 state = 0x751e4f7d
+ __ dci(0xce58a49f); // sm3tt1b v31.4s, v4.4s, v24.s[2]
+ // vl128 state = 0xcaff7580
+ __ dci(0xce548326); // sm3tt1a v6.4s, v25.4s, v20.s[0]
+ // vl128 state = 0xc4308a78
+ __ dci(0xce548124); // sm3tt1a v4.4s, v9.4s, v20.s[0]
+ // vl128 state = 0x1f1bfdfb
+ __ dci(0xce5fb282); // sm3tt1a v2.4s, v20.4s, v31.s[3]
+ // vl128 state = 0xa632c0b2
+ __ dci(0xce549573); // sm3tt1b v19.4s, v11.4s, v20.s[1]
+ // vl128 state = 0x7fb7c2d3
+ __ dci(0xce4387ae); // sm3tt1b v14.4s, v29.4s, v3.s[0]
+ // vl128 state = 0xe8d4c534
+ __ dci(0xce5094eb); // sm3tt1b v11.4s, v7.4s, v16.s[1]
+ // vl128 state = 0xf34a4fbc
+ __ dci(0xce51b59f); // sm3tt1b v31.4s, v12.4s, v17.s[3]
+ // vl128 state = 0x98e388e9
+ __ dci(0xce50a7bf); // sm3tt1b v31.4s, v29.4s, v16.s[2]
+ // vl128 state = 0x7cd7a6ac
+ __ dci(0xce5ca52e); // sm3tt1b v14.4s, v9.4s, v28.s[2]
+ // vl128 state = 0xce9410c5
+ __ dci(0xce5aa741); // sm3tt1b v1.4s, v26.4s, v26.s[2]
+ // vl128 state = 0xd83fbd58
+ __ dci(0xce5e94da); // sm3tt1b v26.4s, v6.4s, v30.s[1]
+ // vl128 state = 0xc6055fe3
+ }
+
+ uint32_t state;
+ ComputeMachineStateHash(&masm, &state);
+ __ Mov(x0, reinterpret_cast<uint64_t>(&state));
+ __ Ldr(w0, MemOperand(x0));
+
+ END();
+ if (CAN_RUN()) {
+ RUN();
+ uint32_t expected_hashes[] = {
+ 0xc6055fe3,
+ 0xa2c33f98,
+ 0x1cc9a227,
+ 0xf29eb254,
+ 0xd1739d6e,
+ 0x1c4fff34,
+ 0x0c182795,
+ 0x96e46836,
+ 0x43d010c9,
+ 0xd7c4f94c,
+ 0x78c387f2,
+ 0x4319fef3,
+ 0x72407eef,
+ 0xa77d3869,
+ 0x3c81c49a,
+ 0x68cc20ef,
+ };
+ ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0);
+ }
+}
+
+TEST_SVE(neon_sm3tt2) {
+ SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
+ CPUFeatures::kNEON,
+ CPUFeatures::kCRC32,
+ CPUFeatures::kSM3);
+ START();
+
+ SetInitialMachineState(&masm);
+ // state = 0xe2bd2480
+
+ {
+ ExactAssemblyScope scope(&masm, 20 * kInstructionSize);
+ __ dci(0xce439d42); // sm3tt2b v2.4s, v10.4s, v3.s[1]
+ // vl128 state = 0x388642cc
+ __ dci(0xce42b89d); // sm3tt2a v29.4s, v4.4s, v2.s[3]
+ // vl128 state = 0x66f4e60a
+ __ dci(0xce4da95d); // sm3tt2a v29.4s, v10.4s, v13.s[2]
+ // vl128 state = 0x95d4651d
+ __ dci(0xce49b926); // sm3tt2a v6.4s, v9.4s, v9.s[3]
+ // vl128 state = 0x826919fe
+ __ dci(0xce5cae33); // sm3tt2b v19.4s, v17.4s, v28.s[2]
+ // vl128 state = 0xb5cfefb0
+ __ dci(0xce478959); // sm3tt2a v25.4s, v10.4s, v7.s[0]
+ // vl128 state = 0xfe17b730
+ __ dci(0xce549cc2); // sm3tt2b v2.4s, v6.4s, v20.s[1]
+ // vl128 state = 0x769a0d76
+ __ dci(0xce4c9f90); // sm3tt2b v16.4s, v28.4s, v12.s[1]
+ // vl128 state = 0x8f633b95
+ __ dci(0xce508d49); // sm3tt2b v9.4s, v10.4s, v16.s[0]
+ // vl128 state = 0x5eab6daa
+ __ dci(0xce59ad79); // sm3tt2b v25.4s, v11.4s, v25.s[2]
+ // vl128 state = 0xfb197616
+ __ dci(0xce458fd6); // sm3tt2b v22.4s, v30.4s, v5.s[0]
+ // vl128 state = 0x875ff29d
+ __ dci(0xce4ab92c); // sm3tt2a v12.4s, v9.4s, v10.s[3]
+ // vl128 state = 0xad159c01
+ __ dci(0xce598a1c); // sm3tt2a v28.4s, v16.4s, v25.s[0]
+ // vl128 state = 0x3da313e4
+ __ dci(0xce43989f); // sm3tt2a v31.4s, v4.4s, v3.s[1]
+ // vl128 state = 0xc0a54179
+ __ dci(0xce459c8a); // sm3tt2b v10.4s, v4.4s, v5.s[1]
+ // vl128 state = 0x4739cdbf
+ __ dci(0xce539959); // sm3tt2a v25.4s, v10.4s, v19.s[1]
+ // vl128 state = 0xd85f84ab
+ __ dci(0xce429be1); // sm3tt2a v1.4s, v31.4s, v2.s[1]
+ // vl128 state = 0x85b5871c
+ __ dci(0xce5d9fe3); // sm3tt2b v3.4s, v31.4s, v29.s[1]
+ // vl128 state = 0x2be5bd95
+ __ dci(0xce4ebe16); // sm3tt2b v22.4s, v16.4s, v14.s[3]
+ // vl128 state = 0x2f8146e9
+ __ dci(0xce599a63); // sm3tt2a v3.4s, v19.4s, v25.s[1]
+ // vl128 state = 0xa6e513e2
+ }
+
+ uint32_t state;
+ ComputeMachineStateHash(&masm, &state);
+ __ Mov(x0, reinterpret_cast<uint64_t>(&state));
+ __ Ldr(w0, MemOperand(x0));
+
+ END();
+ if (CAN_RUN()) {
+ RUN();
+ uint32_t expected_hashes[] = {
+ 0xa6e513e2,
+ 0x6bf4ae47,
+ 0x74e074db,
+ 0xae1a57e0,
+ 0x0db67f09,
+ 0x85332e49,
+ 0xc40d6565,
+ 0x07ed81aa,
+ 0xfa0e10bb,
+ 0x9addadfa,
+ 0xa9cea561,
+ 0xa481e17b,
+ 0x7c2be34e,
+ 0xd4cf493f,
+ 0x8b30cc5e,
+ 0xe44416d3,
+ };
+ ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0);
+ }
+}
+
} // namespace aarch64
} // namespace vixl