Support SM4 accelerating instructions (#123)

Add support for the two Neon SM4 instructions, sm4e (SM4 encode) and sm4ekey
(SM4 key), covering the assembler, disassembler, simulator, and CPU features
auditor, together with the corresponding tests.
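
A minimal usage sketch of the new MacroAssembler aliases, assuming the usual
VIXL setup; the register choices and the feature-enabling call are
illustrative assumptions rather than code taken from this change:

    #include "aarch64/macro-assembler-aarch64.h"

    using namespace vixl::aarch64;

    void EmitSm4RoundsSketch() {
      MacroAssembler masm;
      // Both NEON and SM4 must be enabled before emitting these instructions.
      masm.GetCPUFeatures()->Combine(CPUFeatures::kNEON, CPUFeatures::kSM4);

      // Derive the next four round keys from the previous keys in v1 and the
      // constants in v2, then run four SM4 rounds on the data state in v0.
      masm.Sm4ekey(v1.V4S(), v1.V4S(), v2.V4S());
      masm.Sm4e(v0.V4S(), v1.V4S());

      masm.FinalizeCode();
    }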
diff --git a/src/aarch64/assembler-aarch64.cc b/src/aarch64/assembler-aarch64.cc
index 31d5875..3ab0faa 100644
--- a/src/aarch64/assembler-aarch64.cc
+++ b/src/aarch64/assembler-aarch64.cc
@@ -6117,6 +6117,24 @@
   Emit(0xce408c00 | Rd(vd) | Rn(vn) | Rm(vm) | i);
 }
 
+void Assembler::sm4e(const VRegister& vd, const VRegister& vn) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSM4));
+  VIXL_ASSERT(vd.Is4S() && vn.Is4S());
+
+  Emit(0xcec08400 | Rd(vd) | Rn(vn));
+}
+
+void Assembler::sm4ekey(const VRegister& vd,
+                        const VRegister& vn,
+                        const VRegister& vm) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSM4));
+  VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S());
+
+  Emit(0xce60c800 | Rd(vd) | Rn(vn) | Rm(vm));
+}
+
 // Note:
 // For all ToImm instructions below, a difference in case
 // for the same letter indicates a negated bit.
diff --git a/src/aarch64/assembler-aarch64.h b/src/aarch64/assembler-aarch64.h
index c1e4e6a..441a528 100644
--- a/src/aarch64/assembler-aarch64.h
+++ b/src/aarch64/assembler-aarch64.h
@@ -3732,6 +3732,12 @@
                const VRegister& vm,
                int index);
 
+  // SM4 Encode.
+  void sm4e(const VRegister& vd, const VRegister& vn);
+
+  // SM4 Key.
+  void sm4ekey(const VRegister& vd, const VRegister& vn, const VRegister& vm);
+
   // Scalable Vector Extensions.
 
   // Absolute value (predicated).
diff --git a/src/aarch64/cpu-features-auditor-aarch64.cc b/src/aarch64/cpu-features-auditor-aarch64.cc
index 66d29f0..407ff98 100644
--- a/src/aarch64/cpu-features-auditor-aarch64.cc
+++ b/src/aarch64/cpu-features-auditor-aarch64.cc
@@ -285,6 +285,12 @@
   USE(instr);
 }
 
+void CPUFeaturesAuditor::VisitCryptoSM4(const Instruction* instr) {
+  RecordInstructionFeaturesScope scope(this);
+  scope.Record(CPUFeatures::kNEON, CPUFeatures::kSM4);
+  USE(instr);
+}
+
 void CPUFeaturesAuditor::VisitDataProcessing1Source(const Instruction* instr) {
   RecordInstructionFeaturesScope scope(this);
   switch (instr->Mask(DataProcessing1SourceMask)) {
diff --git a/src/aarch64/cpu-features-auditor-aarch64.h b/src/aarch64/cpu-features-auditor-aarch64.h
index 7d5ca2f..489083a 100644
--- a/src/aarch64/cpu-features-auditor-aarch64.h
+++ b/src/aarch64/cpu-features-auditor-aarch64.h
@@ -114,6 +114,7 @@
   VISITOR_LIST(DECLARE)
 #undef DECLARE
   void VisitCryptoSM3(const Instruction* instr);
+  void VisitCryptoSM4(const Instruction* instr);
 
   void LoadStoreHelper(const Instruction* instr);
   void LoadStorePairHelper(const Instruction* instr);
diff --git a/src/aarch64/decoder-visitor-map-aarch64.h b/src/aarch64/decoder-visitor-map-aarch64.h
index b5358ce..bda71ce 100644
--- a/src/aarch64/decoder-visitor-map-aarch64.h
+++ b/src/aarch64/decoder-visitor-map-aarch64.h
@@ -2663,8 +2663,8 @@
       {"sm3tt1b_vvv4_crypto3_imm2"_h, &VISITORCLASS::VisitCryptoSM3},          \
       {"sm3tt2a_vvv4_crypto3_imm2"_h, &VISITORCLASS::VisitCryptoSM3},          \
       {"sm3tt2b_vvv_crypto3_imm2"_h, &VISITORCLASS::VisitCryptoSM3},           \
-      {"sm4ekey_vvv4_cryptosha512_3"_h, &VISITORCLASS::VisitUnimplemented},    \
-      {"sm4e_vv4_cryptosha512_2"_h, &VISITORCLASS::VisitUnimplemented},        \
+      {"sm4ekey_vvv4_cryptosha512_3"_h, &VISITORCLASS::VisitCryptoSM4},        \
+      {"sm4e_vv4_cryptosha512_2"_h, &VISITORCLASS::VisitCryptoSM4},            \
       {"st64b_64l_memop"_h, &VISITORCLASS::VisitUnimplemented},                \
       {"st64bv_64_memop"_h, &VISITORCLASS::VisitUnimplemented},                \
       {"st64bv0_64_memop"_h, &VISITORCLASS::VisitUnimplemented},               \
diff --git a/src/aarch64/disasm-aarch64.cc b/src/aarch64/disasm-aarch64.cc
index ec4dfc9..cc42709 100644
--- a/src/aarch64/disasm-aarch64.cc
+++ b/src/aarch64/disasm-aarch64.cc
@@ -2223,6 +2223,16 @@
   FormatWithDecodedMnemonic(instr, form, suffix);
 }
 
+void Disassembler::VisitCryptoSM4(const Instruction *instr) {
+  VIXL_ASSERT((form_hash_ == "sm4ekey_vvv4_cryptosha512_3"_h) ||
+              (form_hash_ == "sm4e_vv4_cryptosha512_2"_h));
+  const char *form = "'Vd.4s, 'Vn.4s";
+  const char *suffix =
+      (form_hash_ == "sm4e_vv4_cryptosha512_2"_h) ? NULL : ", 'Vm.4s";
+
+  FormatWithDecodedMnemonic(instr, form, suffix);
+}
+
 void Disassembler::DisassembleSHA512(const Instruction *instr) {
   const char *form = "'Qd, 'Qn, 'Vm.2d";
   const char *suffix = NULL;
diff --git a/src/aarch64/disasm-aarch64.h b/src/aarch64/disasm-aarch64.h
index b139c4c..8f028b5 100644
--- a/src/aarch64/disasm-aarch64.h
+++ b/src/aarch64/disasm-aarch64.h
@@ -244,6 +244,7 @@
   void Disassemble_Xd_XnSP_XmSP(const Instruction* instr);
 
   void VisitCryptoSM3(const Instruction* instr);
+  void VisitCryptoSM4(const Instruction* instr);
 
   void Format(const Instruction* instr,
               const char* mnemonic,
diff --git a/src/aarch64/logic-aarch64.cc b/src/aarch64/logic-aarch64.cc
index 246ffe9..2d923cd 100644
--- a/src/aarch64/logic-aarch64.cc
+++ b/src/aarch64/logic-aarch64.cc
@@ -8477,6 +8477,83 @@
   return srcdst;
 }
 
+// Apply the SM4 S-box to each byte of a 32-bit word. The table below stores
+// the standard SM4 S-box with its input order reversed, so each input byte
+// is complemented (255 - byte) before the lookup.
+static uint64_t SM4SBox(uint64_t x) {
+  static const uint8_t sbox[256] = {
+      0x48, 0x39, 0xcb, 0xd7, 0x3e, 0x5f, 0xee, 0x79, 0x20, 0x4d, 0xdc, 0x3a,
+      0xec, 0x7d, 0xf0, 0x18, 0x84, 0xc6, 0x6e, 0xc5, 0x09, 0xf1, 0xb9, 0x65,
+      0x7e, 0x77, 0x96, 0x0c, 0x4a, 0x97, 0x69, 0x89, 0xb0, 0xb4, 0xe5, 0xb8,
+      0x12, 0xd0, 0x74, 0x2d, 0xbd, 0x7b, 0xcd, 0xa5, 0x88, 0x31, 0xc1, 0x0a,
+      0xd8, 0x5a, 0x10, 0x1f, 0x41, 0x5c, 0xd9, 0x11, 0x7f, 0xbc, 0xdd, 0xbb,
+      0x92, 0xaf, 0x1b, 0x8d, 0x51, 0x5b, 0x6c, 0x6d, 0x72, 0x6a, 0xff, 0x03,
+      0x2f, 0x8e, 0xfd, 0xde, 0x45, 0x37, 0xdb, 0xd5, 0x6f, 0x4e, 0x53, 0x0d,
+      0xab, 0x23, 0x29, 0xc0, 0x60, 0xca, 0x66, 0x82, 0x2e, 0xe2, 0xf6, 0x1d,
+      0xe3, 0xb1, 0x8c, 0xf5, 0x30, 0x32, 0x93, 0xad, 0x55, 0x1a, 0x34, 0x9b,
+      0xa4, 0x5d, 0xae, 0xe0, 0xa1, 0x15, 0x61, 0xf9, 0xce, 0xf2, 0xf7, 0xa3,
+      0xb5, 0x38, 0xc7, 0x40, 0xd2, 0x8a, 0xbf, 0xea, 0x9e, 0xc8, 0xc4, 0xa0,
+      0xe7, 0x02, 0x36, 0x4c, 0x52, 0x27, 0xd3, 0x9f, 0x57, 0x46, 0x00, 0xd4,
+      0x87, 0x78, 0x21, 0x01, 0x3b, 0x7c, 0x22, 0x25, 0xa2, 0xd1, 0x58, 0x63,
+      0x5e, 0x0e, 0x24, 0x1e, 0x35, 0x9d, 0x56, 0x70, 0x4b, 0x0f, 0xeb, 0xf8,
+      0x8b, 0xda, 0x64, 0x71, 0xb2, 0x81, 0x6b, 0x68, 0xa8, 0x4f, 0x85, 0xe6,
+      0x19, 0x3c, 0x59, 0x83, 0xba, 0x17, 0x73, 0xf3, 0xfc, 0xa7, 0x07, 0x47,
+      0xa6, 0x3f, 0x8f, 0x75, 0xfa, 0x94, 0xdf, 0x80, 0x95, 0xe8, 0x08, 0xc9,
+      0xa9, 0x1c, 0xb3, 0xe4, 0x62, 0xac, 0xcf, 0xed, 0x43, 0x0b, 0x54, 0x33,
+      0x7a, 0x98, 0xef, 0x91, 0xf4, 0x50, 0x42, 0x9c, 0x99, 0x06, 0x86, 0x49,
+      0x26, 0x13, 0x44, 0xaa, 0xc3, 0x04, 0xbe, 0x2a, 0x76, 0x9a, 0x67, 0x2b,
+      0x05, 0x2c, 0xfb, 0x28, 0xc2, 0x14, 0xb6, 0x16, 0xb7, 0x3d, 0xe1, 0xcc,
+      0xfe, 0xe9, 0x90, 0xd6,
+  };
+  uint64_t result = 0;
+  for (int j = 24; j >= 0; j -= 8) {
+    uint8_t s = 255 - ((x >> j) & 0xff);
+    result = (result << 8) | sbox[s];
+  }
+  return result;
+}
+
+// Shared helper for SM4E and SM4EKEY. For SM4E (is_key is false), srcdst
+// holds the data state and src1 the four round keys; for SM4EKEY, src1
+// holds the previous four round keys and src2 the four constants (CK).
+LogicVRegister Simulator::sm4(LogicVRegister srcdst,
+                              const LogicVRegister& src1,
+                              const LogicVRegister& src2,
+                              bool is_key) {
+  using namespace std::placeholders;
+  auto ROL = std::bind(RotateLeft, _1, _2, kSRegSize);
+
+  VectorFormat vf = kFormat4S;
+  uint64_t result[4] = {};
+  if (is_key) {
+    src1.UintArray(vf, result);
+  } else {
+    srcdst.UintArray(vf, result);
+  }
+
+  for (int i = 0; i < 4; i++) {
+    uint64_t k = is_key ? src2.Uint(vf, i) : src1.Uint(vf, i);
+    uint64_t intval = result[3] ^ result[2] ^ result[1] ^ k;
+    intval = SM4SBox(intval);
+
+    if (is_key) {
+      intval ^= ROL(intval, 13) ^ ROL(intval, 23);
+    } else {
+      intval ^=
+          ROL(intval, 2) ^ ROL(intval, 10) ^ ROL(intval, 18) ^ ROL(intval, 24);
+    }
+
+    intval ^= result[0];
+
+    result[0] = result[1];
+    result[1] = result[2];
+    result[2] = result[3];
+    result[3] = intval;
+  }
+  srcdst.SetUintArray(vf, result);
+  return srcdst;
+}
+
 }  // namespace aarch64
 }  // namespace vixl
 
diff --git a/src/aarch64/macro-assembler-aarch64.h b/src/aarch64/macro-assembler-aarch64.h
index f2bd917..1763f49 100644
--- a/src/aarch64/macro-assembler-aarch64.h
+++ b/src/aarch64/macro-assembler-aarch64.h
@@ -2814,6 +2814,7 @@
   V(shsub, Shsub)                \
   V(sm3partw1, Sm3partw1)        \
   V(sm3partw2, Sm3partw2)        \
+  V(sm4ekey, Sm4ekey)            \
   V(smax, Smax)                  \
   V(smaxp, Smaxp)                \
   V(smin, Smin)                  \
@@ -2964,6 +2965,7 @@
   V(sha1su1, Sha1su1)            \
   V(sha256su0, Sha256su0)        \
   V(sha512su0, Sha512su0)        \
+  V(sm4e, Sm4e)                  \
   V(smaxv, Smaxv)                \
   V(sminv, Sminv)                \
   V(sqabs, Sqabs)                \
diff --git a/src/aarch64/simulator-aarch64.cc b/src/aarch64/simulator-aarch64.cc
index 070e630..1a2959b 100644
--- a/src/aarch64/simulator-aarch64.cc
+++ b/src/aarch64/simulator-aarch64.cc
@@ -7353,6 +7353,22 @@
   }
 }
 
+void Simulator::VisitCryptoSM4(const Instruction* instr) {
+  SimVRegister& rd = ReadVRegister(instr->GetRd());
+  SimVRegister& rn = ReadVRegister(instr->GetRn());
+  SimVRegister& rm = ReadVRegister(instr->GetRm());
+
+  bool is_key = false;
+  switch (form_hash_) {
+    case "sm4ekey_vvv4_cryptosha512_3"_h:
+      is_key = true;
+      VIXL_FALLTHROUGH();
+    case "sm4e_vv4_cryptosha512_2"_h:
+      sm4(rd, rn, rm, is_key);
+      break;
+  }
+}
+
 void Simulator::SimulateSHA512(const Instruction* instr) {
   SimVRegister& rd = ReadVRegister(instr->GetRd());
   SimVRegister& rn = ReadVRegister(instr->GetRn());
diff --git a/src/aarch64/simulator-aarch64.h b/src/aarch64/simulator-aarch64.h
index fa530bd..632b8ed 100644
--- a/src/aarch64/simulator-aarch64.h
+++ b/src/aarch64/simulator-aarch64.h
@@ -1534,6 +1534,7 @@
   void SimulateSHA512(const Instruction* instr);
 
   void VisitCryptoSM3(const Instruction* instr);
+  void VisitCryptoSM4(const Instruction* instr);
 
   // Integer register accessors.
 
@@ -4587,6 +4588,11 @@
                         int index,
                         bool is_a);
 
+  LogicVRegister sm4(LogicVRegister dst,
+                     const LogicVRegister& src1,
+                     const LogicVRegister& src2,
+                     bool is_key);
+
 #define NEON_3VREG_LOGIC_LIST(V) \
   V(addhn)                       \
   V(addhn2)                      \
diff --git a/test/aarch64/test-cpu-features-aarch64.cc b/test/aarch64/test-cpu-features-aarch64.cc
index 5cc2a58..5b5e603 100644
--- a/test/aarch64/test-cpu-features-aarch64.cc
+++ b/test/aarch64/test-cpu-features-aarch64.cc
@@ -3848,5 +3848,13 @@
 TEST_FEAT(sm3tt2b_0, sm3tt2b(v30.V4S(), v29.V4S(), v9.V4S(), 0))
 #undef TEST_FEAT
 
+#define TEST_FEAT(NAME, ASM)                                        \
+  TEST_TEMPLATE(CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSM4), \
+                NEON_SM4_##NAME,                                    \
+                ASM)
+TEST_FEAT(sm4e, sm4e(v12.V4S(), v13.V4S()))
+TEST_FEAT(sm4ekey, sm4ekey(v12.V4S(), v13.V4S(), v14.V4S()))
+#undef TEST_FEAT
+
 }  // namespace aarch64
 }  // namespace vixl
diff --git a/test/aarch64/test-disasm-aarch64.cc b/test/aarch64/test-disasm-aarch64.cc
index b4a674e..14a354b 100644
--- a/test/aarch64/test-disasm-aarch64.cc
+++ b/test/aarch64/test-disasm-aarch64.cc
@@ -3813,9 +3813,8 @@
                  "sm3partw2");  // SM3PARTW2_VVV4_cryptosha512_3
 
   // ARMv8.2 - SM4
-  // COMPARE_PREFIX(dci(0xce60c800), "sm4ekey");   //
-  // SM4EKEY_VVV4_cryptosha512_3
-  // COMPARE_PREFIX(dci(0xcec08400), "sm4e");   // SM4E_VV4_cryptosha512_2
+  COMPARE_PREFIX(dci(0xce60c800), "sm4ekey");  // SM4EKEY_VVV4_cryptosha512_3
+  COMPARE_PREFIX(dci(0xcec08400), "sm4e");     // SM4E_VV4_cryptosha512_2
 
   // ARMv8.2 - SPE
   // COMPARE_PREFIX(dci(0xd503223f), "psb");   // PSB_HC_hints
diff --git a/test/aarch64/test-disasm-neon-aarch64.cc b/test/aarch64/test-disasm-neon-aarch64.cc
index 912ee22..f50e5a6 100644
--- a/test/aarch64/test-disasm-neon-aarch64.cc
+++ b/test/aarch64/test-disasm-neon-aarch64.cc
@@ -4600,6 +4600,16 @@
   CLEANUP();
 }
 
+TEST(neon_sm4) {
+  SETUP();
+
+  COMPARE_MACRO(Sm4e(v12.V4S(), v13.V4S()), "sm4e v12.4s, v13.4s");
+  COMPARE_MACRO(Sm4ekey(v12.V4S(), v13.V4S(), v14.V4S()),
+                "sm4ekey v12.4s, v13.4s, v14.4s");
+
+  CLEANUP();
+}
+
 TEST(neon_unallocated_regression_test) {
   SETUP();
 
diff --git a/test/aarch64/test-simulator-sve-aarch64.cc b/test/aarch64/test-simulator-sve-aarch64.cc
index 585c00c..6b5b958 100644
--- a/test/aarch64/test-simulator-sve-aarch64.cc
+++ b/test/aarch64/test-simulator-sve-aarch64.cc
@@ -1870,5 +1870,173 @@
   }
 }
 
+TEST_SVE(neon_sm4e) {
+  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
+                          CPUFeatures::kNEON,
+                          CPUFeatures::kCRC32,
+                          CPUFeatures::kSM4);
+  START();
+
+  SetInitialMachineState(&masm);
+  // state = 0xe2bd2480
+
+  {
+    ExactAssemblyScope scope(&masm, 20 * kInstructionSize);
+    __ dci(0xcec08400);  // sm4e v0.4s, v0.4s
+    // vl128 state = 0xa687bacc
+    __ dci(0xcec08628);  // sm4e v8.4s, v17.4s
+    // vl128 state = 0xf174e346
+    __ dci(0xcec0862a);  // sm4e v10.4s, v17.4s
+    // vl128 state = 0xab88f8ca
+    __ dci(0xcec08628);  // sm4e v8.4s, v17.4s
+    // vl128 state = 0x000d3840
+    __ dci(0xcec08638);  // sm4e v24.4s, v17.4s
+    // vl128 state = 0xd980ddc2
+    __ dci(0xcec08688);  // sm4e v8.4s, v20.4s
+    // vl128 state = 0xd501f2c2
+    __ dci(0xcec0868c);  // sm4e v12.4s, v20.4s
+    // vl128 state = 0x699d6b6f
+    __ dci(0xcec0864d);  // sm4e v13.4s, v18.4s
+    // vl128 state = 0x67baf406
+    __ dci(0xcec08649);  // sm4e v9.4s, v18.4s
+    // vl128 state = 0x178b048e
+    __ dci(0xcec08659);  // sm4e v25.4s, v18.4s
+    // vl128 state = 0x552a70d9
+    __ dci(0xcec0865d);  // sm4e v29.4s, v18.4s
+    // vl128 state = 0x3be534d1
+    __ dci(0xcec0865f);  // sm4e v31.4s, v18.4s
+    // vl128 state = 0x396fdf70
+    __ dci(0xcec08657);  // sm4e v23.4s, v18.4s
+    // vl128 state = 0x836c474b
+    __ dci(0xcec086e7);  // sm4e v7.4s, v23.4s
+    // vl128 state = 0x71aebad7
+    __ dci(0xcec08683);  // sm4e v3.4s, v20.4s
+    // vl128 state = 0xadfd515c
+    __ dci(0xcec08681);  // sm4e v1.4s, v20.4s
+    // vl128 state = 0xf1465ab4
+    __ dci(0xcec087c0);  // sm4e v0.4s, v30.4s
+    // vl128 state = 0x8555b40f
+    __ dci(0xcec087c4);  // sm4e v4.4s, v30.4s
+    // vl128 state = 0x2cb3f99f
+    __ dci(0xcec087d4);  // sm4e v20.4s, v30.4s
+    // vl128 state = 0x733336fd
+    __ dci(0xcec085fc);  // sm4e v28.4s, v15.4s
+    // vl128 state = 0x11b138f9
+  }
+
+  uint32_t state;
+  ComputeMachineStateHash(&masm, &state);
+  __ Mov(x0, reinterpret_cast<uint64_t>(&state));
+  __ Ldr(w0, MemOperand(x0));
+
+  END();
+  if (CAN_RUN()) {
+    RUN();
+    uint32_t expected_hashes[] = {
+        0x11b138f9,
+        0x5993c196,
+        0xb9eef6b5,
+        0xf96d88cf,
+        0x8e92bd49,
+        0x04d27185,
+        0x8833f291,
+        0x77933d5b,
+        0x135500cc,
+        0xe5ca977f,
+        0x3e4536af,
+        0xb169aa9d,
+        0xe0b4425b,
+        0x35c1f76e,
+        0x54e3448a,
+        0x4dbf0c92,
+    };
+    ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0);
+  }
+}
+
+TEST_SVE(neon_sm4ekey) {
+  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
+                          CPUFeatures::kNEON,
+                          CPUFeatures::kCRC32,
+                          CPUFeatures::kSM4);
+  START();
+
+  SetInitialMachineState(&masm);
+  // state = 0xe2bd2480
+
+  {
+    ExactAssemblyScope scope(&masm, 20 * kInstructionSize);
+    __ dci(0xce6fc9d4);  // sm4ekey v20.4s, v14.4s, v15.4s
+    // vl128 state = 0x4bb7b396
+    __ dci(0xce6bc8d5);  // sm4ekey v21.4s, v6.4s, v11.4s
+    // vl128 state = 0xf4354b26
+    __ dci(0xce6bc8c5);  // sm4ekey v5.4s, v6.4s, v11.4s
+    // vl128 state = 0x0a331378
+    __ dci(0xce6bc8cd);  // sm4ekey v13.4s, v6.4s, v11.4s
+    // vl128 state = 0x7ed4c2a7
+    __ dci(0xce6fc8e5);  // sm4ekey v5.4s, v7.4s, v15.4s
+    // vl128 state = 0x38a433fd
+    __ dci(0xce6fc8e4);  // sm4ekey v4.4s, v7.4s, v15.4s
+    // vl128 state = 0xc1ad0d76
+    __ dci(0xce6bcaec);  // sm4ekey v12.4s, v23.4s, v11.4s
+    // vl128 state = 0x81660ce3
+    __ dci(0xce6bcae8);  // sm4ekey v8.4s, v23.4s, v11.4s
+    // vl128 state = 0x79f3e5c1
+    __ dci(0xce7bcaaa);  // sm4ekey v10.4s, v21.4s, v27.4s
+    // vl128 state = 0x231e0a79
+    __ dci(0xce72caa8);  // sm4ekey v8.4s, v21.4s, v18.4s
+    // vl128 state = 0xd931c858
+    __ dci(0xce7ac8aa);  // sm4ekey v10.4s, v5.4s, v26.4s
+    // vl128 state = 0x2476ef6a
+    __ dci(0xce7bc888);  // sm4ekey v8.4s, v4.4s, v27.4s
+    // vl128 state = 0xd4a9ac83
+    __ dci(0xce7bc889);  // sm4ekey v9.4s, v4.4s, v27.4s
+    // vl128 state = 0x149fd9b3
+    __ dci(0xce7bc9cd);  // sm4ekey v13.4s, v14.4s, v27.4s
+    // vl128 state = 0xece67fce
+    __ dci(0xce79cbc5);  // sm4ekey v5.4s, v30.4s, v25.4s
+    // vl128 state = 0xccb45863
+    __ dci(0xce71cac4);  // sm4ekey v4.4s, v22.4s, v17.4s
+    // vl128 state = 0xafb23c9d
+    __ dci(0xce71c8e0);  // sm4ekey v0.4s, v7.4s, v17.4s
+    // vl128 state = 0x5c808694
+    __ dci(0xce71c882);  // sm4ekey v2.4s, v4.4s, v17.4s
+    // vl128 state = 0x6cea5132
+    __ dci(0xce73c803);  // sm4ekey v3.4s, v0.4s, v19.4s
+    // vl128 state = 0x67e316db
+    __ dci(0xce71c847);  // sm4ekey v7.4s, v2.4s, v17.4s
+    // vl128 state = 0x317aafac
+  }
+
+  uint32_t state;
+  ComputeMachineStateHash(&masm, &state);
+  __ Mov(x0, reinterpret_cast<uint64_t>(&state));
+  __ Ldr(w0, MemOperand(x0));
+
+  END();
+  if (CAN_RUN()) {
+    RUN();
+    uint32_t expected_hashes[] = {
+        0x317aafac,
+        0xbacd34de,
+        0x3e92f0b2,
+        0x3043dbe3,
+        0x6dda4d17,
+        0x6e59ba0d,
+        0xa29887cf,
+        0x3bee1f56,
+        0xacd43191,
+        0x97ab7ada,
+        0x39ebcf53,
+        0xea7b411e,
+        0xd8e1efe9,
+        0x2b99fc57,
+        0xf5f62e02,
+        0xd50621d1,
+    };
+    ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0);
+  }
+}
+
 }  // namespace aarch64
 }  // namespace vixl