Support SHA-3 accelerating instructions (#101)

Add support for Neon BCAX, EOR3, RAX1 and XAR instructions, used to accelerate
SHA-3.
diff --git a/src/aarch64/assembler-aarch64.cc b/src/aarch64/assembler-aarch64.cc
index ad2e7c9..009b08c 100644
--- a/src/aarch64/assembler-aarch64.cc
+++ b/src/aarch64/assembler-aarch64.cc
@@ -5876,6 +5876,39 @@
   Emit(0x6e80a400 | Rd(vd) | Rn(vn) | Rm(vm));
 }
 
+void Assembler::bcax(const VRegister& vd, const VRegister& vn, const VRegister& vm, const VRegister& va) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSHA3));
+  VIXL_ASSERT(vd.Is16B() && vn.Is16B() && vm.Is16B() && va.Is16B());
+
+  Emit(0xce200000 | Rd(vd) | Rn(vn) | Rm(vm) | Ra(va));
+}
+
+void Assembler::eor3(const VRegister& vd, const VRegister& vn, const VRegister& vm, const VRegister& va) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSHA3));
+  VIXL_ASSERT(vd.Is16B() && vn.Is16B() && vm.Is16B() && va.Is16B());
+
+  Emit(0xce000000 | Rd(vd) | Rn(vn) | Rm(vm) | Ra(va));
+}
+
+void Assembler::xar(const VRegister& vd, const VRegister& vn, const VRegister& vm, int rotate) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSHA3));
+  VIXL_ASSERT(vd.Is2D() && vn.Is2D() && vm.Is2D());
+  VIXL_ASSERT(IsUint6(rotate));
+
+  Emit(0xce800000 | Rd(vd) | Rn(vn) | Rm(vm) | rotate << 10);
+}
+
+void Assembler::rax1(const VRegister& vd, const VRegister& vn, const VRegister& vm) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSHA3));
+  VIXL_ASSERT(vd.Is2D() && vn.Is2D() && vm.Is2D());
+
+  Emit(0xce608c00 | Rd(vd) | Rn(vn) | Rm(vm));
+}
+
 // Note:
 // For all ToImm instructions below, a difference in case
 // for the same letter indicates a negated bit.
diff --git a/src/aarch64/assembler-aarch64.h b/src/aarch64/assembler-aarch64.h
index 9bc7076..bbba5db 100644
--- a/src/aarch64/assembler-aarch64.h
+++ b/src/aarch64/assembler-aarch64.h
@@ -3621,6 +3621,27 @@
   // Unsigned 8-bit integer matrix multiply-accumulate (vector).
   void ummla(const VRegister& vd, const VRegister& vn, const VRegister& vm);
 
+  // Bit Clear and exclusive-OR.
+  void bcax(const VRegister& vd,
+            const VRegister& vn,
+            const VRegister& vm,
+            const VRegister& va);
+
+  // Three-way Exclusive-OR.
+  void eor3(const VRegister& vd,
+            const VRegister& vn,
+            const VRegister& vm,
+            const VRegister& va);
+
+  // Exclusive-OR and Rotate.
+  void xar(const VRegister& vd,
+           const VRegister& vn,
+           const VRegister& vm,
+           int rotate);
+
+  // Rotate and Exclusive-OR.
+  void rax1(const VRegister& vd, const VRegister& vn, const VRegister& vm);
+
   // Scalable Vector Extensions.
 
   // Absolute value (predicated).
diff --git a/src/aarch64/cpu-features-auditor-aarch64.cc b/src/aarch64/cpu-features-auditor-aarch64.cc
index 3925ced..a85587b 100644
--- a/src/aarch64/cpu-features-auditor-aarch64.cc
+++ b/src/aarch64/cpu-features-auditor-aarch64.cc
@@ -1835,6 +1835,14 @@
         {"umax_64u_minmax_imm"_h, CPUFeatures::kCSSC},
         {"umin_32u_minmax_imm"_h, CPUFeatures::kCSSC},
         {"umin_64u_minmax_imm"_h, CPUFeatures::kCSSC},
+        {"bcax_vvv16_crypto4"_h,
+         CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA3)},
+        {"eor3_vvv16_crypto4"_h,
+         CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA3)},
+        {"rax1_vvv2_cryptosha512_3"_h,
+         CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA3)},
+        {"xar_vvv2_crypto3_imm6"_h,
+         CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA3)},
     };
 
     if (features.count(form_hash_) > 0) {
diff --git a/src/aarch64/decoder-visitor-map-aarch64.h b/src/aarch64/decoder-visitor-map-aarch64.h
index a0a6ef2..92131da 100644
--- a/src/aarch64/decoder-visitor-map-aarch64.h
+++ b/src/aarch64/decoder-visitor-map-aarch64.h
@@ -2638,7 +2638,6 @@
        &VISITORCLASS::VisitUnconditionalBranchToRegister},                     \
       {"ret_64r_branch_reg"_h,                                                 \
        &VISITORCLASS::VisitUnconditionalBranchToRegister},                     \
-      {"bcax_vvv16_crypto4"_h, &VISITORCLASS::VisitUnimplemented},             \
       {"bfcvtn_asimdmisc_4s"_h, &VISITORCLASS::VisitUnimplemented},            \
       {"bfdot_asimdelem_e"_h, &VISITORCLASS::VisitUnimplemented},              \
       {"bfdot_asimdsame2_d"_h, &VISITORCLASS::VisitUnimplemented},             \
@@ -2646,7 +2645,6 @@
       {"bfmlal_asimdsame2_f"_h, &VISITORCLASS::VisitUnimplemented},            \
       {"bfmmla_asimdsame2_e"_h, &VISITORCLASS::VisitUnimplemented},            \
       {"dsb_bon_barriers"_h, &VISITORCLASS::VisitUnimplemented},               \
-      {"eor3_vvv16_crypto4"_h, &VISITORCLASS::VisitUnimplemented},             \
       {"ld64b_64l_memop"_h, &VISITORCLASS::VisitUnimplemented},                \
       {"ldgm_64bulk_ldsttags"_h, &VISITORCLASS::VisitUnimplemented},           \
       {"ldtrb_32_ldst_unpriv"_h, &VISITORCLASS::VisitUnimplemented},           \
@@ -2658,7 +2656,6 @@
       {"ldtrsw_64_ldst_unpriv"_h, &VISITORCLASS::VisitUnimplemented},          \
       {"ldtr_32_ldst_unpriv"_h, &VISITORCLASS::VisitUnimplemented},            \
       {"ldtr_64_ldst_unpriv"_h, &VISITORCLASS::VisitUnimplemented},            \
-      {"rax1_vvv2_cryptosha512_3"_h, &VISITORCLASS::VisitUnimplemented},       \
       {"sha512h2_qqv_cryptosha512_3"_h, &VISITORCLASS::VisitUnimplemented},    \
       {"sha512h_qqv_cryptosha512_3"_h, &VISITORCLASS::VisitUnimplemented},     \
       {"sha512su0_vv2_cryptosha512_2"_h, &VISITORCLASS::VisitUnimplemented},   \
@@ -2686,7 +2683,6 @@
       {"ttest_br_systemresult"_h, &VISITORCLASS::VisitUnimplemented},          \
       {"wfet_only_systeminstrswithreg"_h, &VISITORCLASS::VisitUnimplemented},  \
       {"wfit_only_systeminstrswithreg"_h, &VISITORCLASS::VisitUnimplemented},  \
-      {"xar_vvv2_crypto3_imm6"_h, &VISITORCLASS::VisitUnimplemented},          \
       {"bfcvt_z_p_z_s2bf"_h, &VISITORCLASS::VisitUnimplemented},               \
       {"bfcvtnt_z_p_z_s2bf"_h, &VISITORCLASS::VisitUnimplemented},             \
       {"bfdot_z_zzz"_h, &VISITORCLASS::VisitUnimplemented},                    \
diff --git a/src/aarch64/disasm-aarch64.cc b/src/aarch64/disasm-aarch64.cc
index 9f53e81..5a8241d 100644
--- a/src/aarch64/disasm-aarch64.cc
+++ b/src/aarch64/disasm-aarch64.cc
@@ -753,6 +753,10 @@
       {"umax_64u_minmax_imm"_h, &Disassembler::DisassembleMinMaxImm},
       {"umin_32u_minmax_imm"_h, &Disassembler::DisassembleMinMaxImm},
       {"umin_64u_minmax_imm"_h, &Disassembler::DisassembleMinMaxImm},
+      {"bcax_vvv16_crypto4"_h, &Disassembler::DisassembleNEON4Same},
+      {"eor3_vvv16_crypto4"_h, &Disassembler::DisassembleNEON4Same},
+      {"xar_vvv2_crypto3_imm6"_h, &Disassembler::DisassembleNEONXar},
+      {"rax1_vvv2_cryptosha512_3"_h, &Disassembler::DisassembleNEONRax1},
   };
   return &form_to_visitor;
 }  // NOLINT(readability/fn_size)
@@ -2430,6 +2434,17 @@
   Format(instr, mnemonic, nfd.Substitute(form), suffix);
 }
 
+void Disassembler::DisassembleNEON4Same(const Instruction *instr) {
+  FormatWithDecodedMnemonic(instr, "'Vd.16b, 'Vn.16b, 'Vm.16b, 'Va.16b");
+}
+
+void Disassembler::DisassembleNEONXar(const Instruction *instr) {
+  FormatWithDecodedMnemonic(instr, "'Vd.2d, 'Vn.2d, 'Vm.2d, #'u1510");
+}
+
+void Disassembler::DisassembleNEONRax1(const Instruction *instr) {
+  FormatWithDecodedMnemonic(instr, "'Vd.2d, 'Vn.2d, 'Vm.2d");
+}
 
 void Disassembler::VisitNEON3Different(const Instruction *instr) {
   const char *mnemonic = mnemonic_.c_str();
diff --git a/src/aarch64/disasm-aarch64.h b/src/aarch64/disasm-aarch64.h
index 7985383..0da49e4 100644
--- a/src/aarch64/disasm-aarch64.h
+++ b/src/aarch64/disasm-aarch64.h
@@ -229,6 +229,9 @@
   void DisassembleNEONScalar2RegMiscOnlyD(const Instruction* instr);
   void DisassembleNEONFPScalar2RegMisc(const Instruction* instr);
   void DisassembleNEONPolynomialMul(const Instruction* instr);
+  void DisassembleNEON4Same(const Instruction* instr);
+  void DisassembleNEONXar(const Instruction* instr);
+  void DisassembleNEONRax1(const Instruction* instr);
 
   void DisassembleMTELoadTag(const Instruction* instr);
   void DisassembleMTEStoreTag(const Instruction* instr);
diff --git a/src/aarch64/macro-assembler-aarch64.h b/src/aarch64/macro-assembler-aarch64.h
index f6fc4d7..8878ef7 100644
--- a/src/aarch64/macro-assembler-aarch64.h
+++ b/src/aarch64/macro-assembler-aarch64.h
@@ -2787,6 +2787,7 @@
   V(pmull2, Pmull2)              \
   V(raddhn, Raddhn)              \
   V(raddhn2, Raddhn2)            \
+  V(rax1, Rax1)                  \
   V(rsubhn, Rsubhn)              \
   V(rsubhn2, Rsubhn2)            \
   V(saba, Saba)                  \
@@ -3152,6 +3153,14 @@
   SVE_3VREG_COMMUTATIVE_MACRO_LIST(DEFINE_MACRO_ASM_FUNC)
 #undef DEFINE_MACRO_ASM_FUNC
 
+  void Bcax(const VRegister& vd,
+            const VRegister& vn,
+            const VRegister& vm,
+            const VRegister& va) {
+    VIXL_ASSERT(allow_macro_instructions_);
+    SingleEmissionCheckScope guard(this);
+    bcax(vd, vn, vm, va);
+  }
   void Bic(const VRegister& vd, const int imm8, const int left_shift = 0) {
     VIXL_ASSERT(allow_macro_instructions_);
     SingleEmissionCheckScope guard(this);
@@ -3192,6 +3201,14 @@
     SingleEmissionCheckScope guard(this);
     dup(vd, rn);
   }
+  void Eor3(const VRegister& vd,
+            const VRegister& vn,
+            const VRegister& vm,
+            const VRegister& va) {
+    VIXL_ASSERT(allow_macro_instructions_);
+    SingleEmissionCheckScope guard(this);
+    eor3(vd, vn, vm, va);
+  }
   void Ext(const VRegister& vd,
            const VRegister& vn,
            const VRegister& vm,
@@ -3498,6 +3515,14 @@
     SingleEmissionCheckScope guard(this);
     umov(rd, vn, vn_index);
   }
+  void Xar(const VRegister& vd,
+           const VRegister& vn,
+           const VRegister& vm,
+           int rotate) {
+    VIXL_ASSERT(allow_macro_instructions_);
+    SingleEmissionCheckScope guard(this);
+    xar(vd, vn, vm, rotate);
+  }
   void Crc32b(const Register& rd, const Register& rn, const Register& rm) {
     VIXL_ASSERT(allow_macro_instructions_);
     SingleEmissionCheckScope guard(this);
diff --git a/src/aarch64/simulator-aarch64.cc b/src/aarch64/simulator-aarch64.cc
index 81bab07..fdc7106 100644
--- a/src/aarch64/simulator-aarch64.cc
+++ b/src/aarch64/simulator-aarch64.cc
@@ -507,6 +507,10 @@
       {"umax_64u_minmax_imm"_h, &Simulator::SimulateUnsignedMinMax},
       {"umin_32u_minmax_imm"_h, &Simulator::SimulateUnsignedMinMax},
       {"umin_64u_minmax_imm"_h, &Simulator::SimulateUnsignedMinMax},
+      {"bcax_vvv16_crypto4"_h, &Simulator::SimulateNEONSHA3},
+      {"eor3_vvv16_crypto4"_h, &Simulator::SimulateNEONSHA3},
+      {"rax1_vvv2_cryptosha512_3"_h, &Simulator::SimulateNEONSHA3},
+      {"xar_vvv2_crypto3_imm6"_h, &Simulator::SimulateNEONSHA3},
   };
   return &form_to_visitor;
 }
@@ -9926,6 +9930,34 @@
   }
 }
 
+void Simulator::SimulateNEONSHA3(const Instruction* instr) {
+  SimVRegister& rd = ReadVRegister(instr->GetRd());
+  SimVRegister& rn = ReadVRegister(instr->GetRn());
+  SimVRegister& rm = ReadVRegister(instr->GetRm());
+  SimVRegister& ra = ReadVRegister(instr->GetRa());
+  SimVRegister temp;
+
+  switch (form_hash_) {
+    case "bcax_vvv16_crypto4"_h:
+      bic(kFormat16B, temp, rm, ra);
+      eor(kFormat16B, rd, rn, temp);
+      break;
+    case "eor3_vvv16_crypto4"_h:
+      eor(kFormat16B, temp, rm, ra);
+      eor(kFormat16B, rd, rn, temp);
+      break;
+    case "rax1_vvv2_cryptosha512_3"_h:
+      ror(kFormat2D, temp, rm, 63);  // rol(1) => ror(63)
+      eor(kFormat2D, rd, rn, temp);
+      break;
+    case "xar_vvv2_crypto3_imm6"_h:
+      int rot = instr->ExtractBits(15, 10);
+      eor(kFormat2D, temp, rn, rm);
+      ror(kFormat2D, rd, temp, rot);
+      break;
+  }
+}
+
 void Simulator::VisitSVEAddressGeneration(const Instruction* instr) {
   SimVRegister& zd = ReadVRegister(instr->GetRd());
   SimVRegister& zn = ReadVRegister(instr->GetRn());
diff --git a/src/aarch64/simulator-aarch64.h b/src/aarch64/simulator-aarch64.h
index 760fa6c..73277e4 100644
--- a/src/aarch64/simulator-aarch64.h
+++ b/src/aarch64/simulator-aarch64.h
@@ -1509,6 +1509,7 @@
   void SimulateNEONFPMulByElementLong(const Instruction* instr);
   void SimulateNEONComplexMulByElement(const Instruction* instr);
   void SimulateNEONDotProdByElement(const Instruction* instr);
+  void SimulateNEONSHA3(const Instruction* instr);
   void SimulateMTEAddSubTag(const Instruction* instr);
   void SimulateMTETagMaskInsert(const Instruction* instr);
   void SimulateMTESubPointer(const Instruction* instr);
diff --git a/test/aarch64/test-cpu-features-aarch64.cc b/test/aarch64/test-cpu-features-aarch64.cc
index 187bbd5..8430d7f 100644
--- a/test/aarch64/test-cpu-features-aarch64.cc
+++ b/test/aarch64/test-cpu-features-aarch64.cc
@@ -3785,5 +3785,14 @@
 TEST_FEAT(pmull1q_0, pmull(v5.V1Q(), v6.V1D(), v7.V1D()))
 #undef TEST_FEAT
 
+#define TEST_NEON_SHA3(NAME, ASM)                                    \
+  TEST_TEMPLATE(CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA3), \
+                NEON_SHA3_##NAME,                                    \
+                ASM)
+TEST_NEON_SHA3(bcax_0, bcax(v0.V16B(), v1.V16B(), v2.V16B(), v3.V16B()))
+TEST_NEON_SHA3(eor3_0, eor3(v0.V16B(), v1.V16B(), v2.V16B(), v3.V16B()))
+TEST_NEON_SHA3(xar_0, xar(v0.V2D(), v1.V2D(), v2.V2D(), 42))
+TEST_NEON_SHA3(rax1_0, rax1(v0.V2D(), v1.V2D(), v2.V2D()))
+
 }  // namespace aarch64
 }  // namespace vixl
diff --git a/test/aarch64/test-disasm-aarch64.cc b/test/aarch64/test-disasm-aarch64.cc
index 7c8f2cc..5d8579f 100644
--- a/test/aarch64/test-disasm-aarch64.cc
+++ b/test/aarch64/test-disasm-aarch64.cc
@@ -3789,10 +3789,10 @@
   COMPARE_PREFIX(dci(0xd503221f), "esb");  // ESB_HI_hints
 
   // ARMv8.2 - SHA3
-  // COMPARE_PREFIX(dci(0xce000000), "eor3");   // EOR3_VVV16_crypto4
-  // COMPARE_PREFIX(dci(0xce200000), "bcax");   // BCAX_VVV16_crypto4
-  // COMPARE_PREFIX(dci(0xce608c00), "rax1");   // RAX1_VVV2_cryptosha512_3
-  // COMPARE_PREFIX(dci(0xce800000), "xar");   // XAR_VVV2_crypto3_imm6
+  COMPARE_PREFIX(dci(0xce000000), "eor3");  // EOR3_VVV16_crypto4
+  COMPARE_PREFIX(dci(0xce200000), "bcax");  // BCAX_VVV16_crypto4
+  COMPARE_PREFIX(dci(0xce608c00), "rax1");  // RAX1_VVV2_cryptosha512_3
+  COMPARE_PREFIX(dci(0xce800000), "xar");   // XAR_VVV2_crypto3_imm6
 
   // ARMv8.2 - SHA512
   // COMPARE_PREFIX(dci(0xce608000), "sha512h");   // SHA512H_QQV_cryptosha512_3
diff --git a/test/aarch64/test-disasm-neon-aarch64.cc b/test/aarch64/test-disasm-neon-aarch64.cc
index c2824c9..774114d 100644
--- a/test/aarch64/test-disasm-neon-aarch64.cc
+++ b/test/aarch64/test-disasm-neon-aarch64.cc
@@ -4516,6 +4516,20 @@
   CLEANUP();
 }
 
+TEST(neon_sha3) {
+  SETUP();
+
+  COMPARE_MACRO(Bcax(v0.V16B(), v1.V16B(), v2.V16B(), v3.V16B()),
+                "bcax v0.16b, v1.16b, v2.16b, v3.16b");
+  COMPARE_MACRO(Eor3(v10.V16B(), v11.V16B(), v12.V16B(), v13.V16B()),
+                "eor3 v10.16b, v11.16b, v12.16b, v13.16b");
+  COMPARE_MACRO(Xar(v20.V2D(), v21.V2D(), v22.V2D(), 42),
+                "xar v20.2d, v21.2d, v22.2d, #42");
+  COMPARE_MACRO(Rax1(v0.V2D(), v1.V2D(), v2.V2D()), "rax1 v0.2d, v1.2d, v2.2d");
+
+  CLEANUP();
+}
+
 TEST(neon_unallocated_regression_test) {
   SETUP();
 
diff --git a/test/aarch64/test-simulator-sve-aarch64.cc b/test/aarch64/test-simulator-sve-aarch64.cc
index 1ba7783..bdd5c81 100644
--- a/test/aarch64/test-simulator-sve-aarch64.cc
+++ b/test/aarch64/test-simulator-sve-aarch64.cc
@@ -394,5 +394,169 @@
   }
 }
 
+TEST_SVE(neon_sha3) {
+  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
+                          CPUFeatures::kNEON,
+                          CPUFeatures::kCRC32,
+                          CPUFeatures::kSHA3);
+  START();
+
+  SetInitialMachineState(&masm);
+  // state = 0xe2bd2480
+
+  {
+    ExactAssemblyScope scope(&masm, 60 * kInstructionSize);
+    __ dci(0xce608c00);  // rax1 v0.2d, v0.2d, v0.2d
+    // vl128 state = 0x960c2b9f
+    __ dci(0xce608e28);  // rax1 v8.2d, v17.2d, v0.2d
+    // vl128 state = 0x89ea3f7b
+    __ dci(0xce618e6c);  // rax1 v12.2d, v19.2d, v1.2d
+    // vl128 state = 0xa7801384
+    __ dci(0xce718e48);  // rax1 v8.2d, v18.2d, v17.2d
+    // vl128 state = 0x4477d70d
+    __ dci(0xce738e60);  // rax1 v0.2d, v19.2d, v19.2d
+    // vl128 state = 0xdee66854
+    __ dci(0xce6b8e61);  // rax1 v1.2d, v19.2d, v11.2d
+    // vl128 state = 0x2e383dc2
+    __ dci(0xce6e8e60);  // rax1 v0.2d, v19.2d, v14.2d
+    // vl128 state = 0xa022bb6d
+    __ dci(0xce6e8e62);  // rax1 v2.2d, v19.2d, v14.2d
+    // vl128 state = 0x923f5d32
+    __ dci(0xce668e23);  // rax1 v3.2d, v17.2d, v6.2d
+    // vl128 state = 0xc2c6ca00
+    __ dci(0xce260e33);  // bcax v19.16b, v17.16b, v6.16b, v3.16b
+    // vl128 state = 0x517e85e9
+    __ dci(0xce260e23);  // bcax v3.16b, v17.16b, v6.16b, v3.16b
+    // vl128 state = 0xbcf4c332
+    __ dci(0xce260e93);  // bcax v19.16b, v20.16b, v6.16b, v3.16b
+    // vl128 state = 0x5d9d51ef
+    __ dci(0xce260a11);  // bcax v17.16b, v16.16b, v6.16b, v2.16b
+    // vl128 state = 0x69ce0099
+    __ dci(0xce260a15);  // bcax v21.16b, v16.16b, v6.16b, v2.16b
+    // vl128 state = 0x9a2cdc9f
+    __ dci(0xce244a11);  // bcax v17.16b, v16.16b, v4.16b, v18.16b
+    // vl128 state = 0x27eeff29
+    __ dci(0xce304a10);  // bcax v16.16b, v16.16b, v16.16b, v18.16b
+    // vl128 state = 0x6d586875
+    __ dci(0xce314b18);  // bcax v24.16b, v24.16b, v17.16b, v18.16b
+    // vl128 state = 0xe38b6054
+    __ dci(0xce214b28);  // bcax v8.16b, v25.16b, v1.16b, v18.16b
+    // vl128 state = 0x27a3f5f6
+    __ dci(0xce294f38);  // bcax v24.16b, v25.16b, v9.16b, v19.16b
+    // vl128 state = 0x7d7ffa9b
+    __ dci(0xce214e39);  // bcax v25.16b, v17.16b, v1.16b, v19.16b
+    // vl128 state = 0x936374f0
+    __ dci(0xce216a3d);  // bcax v29.16b, v17.16b, v1.16b, v26.16b
+    // vl128 state = 0x1c5136d5
+    __ dci(0xce296b39);  // bcax v25.16b, v25.16b, v9.16b, v26.16b
+    // vl128 state = 0x75cd7131
+    __ dci(0xce216338);  // bcax v24.16b, v25.16b, v1.16b, v24.16b
+    // vl128 state = 0xcc747626
+    __ dci(0xce2163f9);  // bcax v25.16b, v31.16b, v1.16b, v24.16b
+    // vl128 state = 0x9409c8bc
+    __ dci(0xce2043f1);  // bcax v17.16b, v31.16b, v0.16b, v16.16b
+    // vl128 state = 0x8db3a0c8
+    __ dci(0xce2043f5);  // bcax v21.16b, v31.16b, v0.16b, v16.16b
+    // vl128 state = 0xa55f8d7d
+    __ dci(0xce2043e5);  // bcax v5.16b, v31.16b, v0.16b, v16.16b
+    // vl128 state = 0xe1960c7a
+    __ dci(0xce224be7);  // bcax v7.16b, v31.16b, v2.16b, v18.16b
+    // vl128 state = 0xc9599bde
+    __ dci(0xce204bb7);  // bcax v23.16b, v29.16b, v0.16b, v18.16b
+    // vl128 state = 0x7176d08d
+    __ dci(0xce004b9f);  // eor3 v31.16b, v28.16b, v0.16b, v18.16b
+    // vl128 state = 0x10620821
+    __ dci(0xce000baf);  // eor3 v15.16b, v29.16b, v0.16b, v2.16b
+    // vl128 state = 0x0aba0288
+    __ dci(0xce0a0bab);  // eor3 v11.16b, v29.16b, v10.16b, v2.16b
+    // vl128 state = 0xe6517156
+    __ dci(0xce0e1baf);  // eor3 v15.16b, v29.16b, v14.16b, v6.16b
+    // vl128 state = 0x6b7021fb
+    __ dci(0xce0e3fa7);  // eor3 v7.16b, v29.16b, v14.16b, v15.16b
+    // vl128 state = 0x05761b1f
+    __ dci(0xce0e2fe5);  // eor3 v5.16b, v31.16b, v14.16b, v11.16b
+    // vl128 state = 0xe01822c6
+    __ dci(0xce2e2fc7);  // bcax v7.16b, v30.16b, v14.16b, v11.16b
+    // vl128 state = 0xdc6444d7
+    __ dci(0xce3e2dcf);  // bcax v15.16b, v14.16b, v30.16b, v11.16b
+    // vl128 state = 0xa5ecad2e
+    __ dci(0xce3e3fdf);  // bcax v31.16b, v30.16b, v30.16b, v15.16b
+    // vl128 state = 0x2124dc42
+    __ dci(0xce3a3ede);  // bcax v30.16b, v22.16b, v26.16b, v15.16b
+    // vl128 state = 0x57f77204
+    __ dci(0xce3a2e9c);  // bcax v28.16b, v20.16b, v26.16b, v11.16b
+    // vl128 state = 0x6e8d303d
+    __ dci(0xce3a2294);  // bcax v20.16b, v20.16b, v26.16b, v8.16b
+    // vl128 state = 0xdb53d42c
+    __ dci(0xce38029c);  // bcax v28.16b, v20.16b, v24.16b, v0.16b
+    // vl128 state = 0x258d49b8
+    __ dci(0xce38088c);  // bcax v12.16b, v4.16b, v24.16b, v2.16b
+    // vl128 state = 0xe751a348
+    __ dci(0xce28008e);  // bcax v14.16b, v4.16b, v8.16b, v0.16b
+    // vl128 state = 0x8ce0aa1a
+    __ dci(0xce28008a);  // bcax v10.16b, v4.16b, v8.16b, v0.16b
+    // vl128 state = 0x1fdf89a5
+    __ dci(0xce280088);  // bcax v8.16b, v4.16b, v8.16b, v0.16b
+    // vl128 state = 0xcc51f5e1
+    __ dci(0xce2a1089);  // bcax v9.16b, v4.16b, v10.16b, v4.16b
+    // vl128 state = 0xdaf766b0
+    __ dci(0xce0b1081);  // eor3 v1.16b, v4.16b, v11.16b, v4.16b
+    // vl128 state = 0x2da7deb5
+    __ dci(0xce0a1011);  // eor3 v17.16b, v0.16b, v10.16b, v4.16b
+    // vl128 state = 0xcc86f5d4
+    __ dci(0xce121010);  // eor3 v16.16b, v0.16b, v18.16b, v4.16b
+    // vl128 state = 0xfb722105
+    __ dci(0xce921118);  // xar v24.2d, v8.2d, v18.2d, #4
+    // vl128 state = 0x9a7752e3
+    __ dci(0xce9a1199);  // xar v25.2d, v12.2d, v26.2d, #4
+    // vl128 state = 0x83a251c2
+    __ dci(0xce9e11dd);  // xar v29.2d, v14.2d, v30.2d, #4
+    // vl128 state = 0x1e31c9d5
+    __ dci(0xce9e915c);  // xar v28.2d, v10.2d, v30.2d, #36
+    // vl128 state = 0x0e421d73
+    __ dci(0xce1e115d);  // eor3 v29.16b, v10.16b, v30.16b, v4.16b
+    // vl128 state = 0xb5a8c677
+    __ dci(0xce3e515c);  // bcax v28.16b, v10.16b, v30.16b, v20.16b
+    // vl128 state = 0x21587300
+    __ dci(0xce3e5154);  // bcax v20.16b, v10.16b, v30.16b, v20.16b
+    // vl128 state = 0x9459c629
+    __ dci(0xce3e1056);  // bcax v22.16b, v2.16b, v30.16b, v4.16b
+    // vl128 state = 0xdb02263a
+    __ dci(0xce2a105e);  // bcax v30.16b, v2.16b, v10.16b, v4.16b
+    // vl128 state = 0xc9d210aa
+    __ dci(0xce3a5056);  // bcax v22.16b, v2.16b, v26.16b, v20.16b
+    // vl128 state = 0x4cc56293
+  }
+
+  uint32_t state;
+  ComputeMachineStateHash(&masm, &state);
+  __ Mov(x0, reinterpret_cast<uint64_t>(&state));
+  __ Ldr(w0, MemOperand(x0));
+
+  END();
+  if (CAN_RUN()) {
+    RUN();
+    uint32_t expected_hashes[] = {
+        0x4cc56293,
+        0xee8bac03,
+        0xc1253ac9,
+        0x9fe5aa0f,
+        0x43df27f4,
+        0x19f03be6,
+        0xd26c928b,
+        0x7b9da4c4,
+        0xe13149a7,
+        0x9fa11ed9,
+        0xe02cc4dd,
+        0x7848dfe7,
+        0x5ed1726f,
+        0x983e0123,
+        0x34166240,
+        0xc4ee172f,
+    };
+    ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0);
+  }
+}
+
 }  // namespace aarch64
 }  // namespace vixl
diff --git a/tools/code_coverage.log b/tools/code_coverage.log
index f913151..d29b39b 100644
--- a/tools/code_coverage.log
+++ b/tools/code_coverage.log
@@ -23,6 +23,7 @@
 1693487542 82.91% 97.57% 94.87%
 1694008240 82.72% 97.50% 94.95%
 1697036303 82.87% 97.56% 94.76%
+1698330215 82.92% 97.57% 94.88%
 1702052331 82.89% 97.59% 94.77%
 1706691191 82.87% 97.59% 94.74%
 1707395574 82.89% 97.59% 94.77%