Support PMULL for 1Q destination vectors (#91)

Extend the Neon PMULL/PMULL2 instructions to accept 1Q destination registers
(with 1D/2D sources) when the kPmull1Q CPU feature is present.
diff --git a/src/aarch64/assembler-aarch64.cc b/src/aarch64/assembler-aarch64.cc
index c022717..c0f66ec 100644
--- a/src/aarch64/assembler-aarch64.cc
+++ b/src/aarch64/assembler-aarch64.cc
@@ -2913,6 +2913,25 @@
   LoadStoreStructSingle(vt, lane, dst, NEONLoadStoreSingleStructStore1);
 }
 
+void Assembler::pmull(const VRegister& vd,
+                      const VRegister& vn,
+                      const VRegister& vm) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(AreSameFormat(vn, vm));
+  VIXL_ASSERT((vn.Is8B() && vd.Is8H()) || (vn.Is1D() && vd.Is1Q()));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kPmull1Q) || vd.Is8H());
+  Emit(VFormat(vn) | NEON_PMULL | Rm(vm) | Rn(vn) | Rd(vd));
+}
+
+void Assembler::pmull2(const VRegister& vd,
+                       const VRegister& vn,
+                       const VRegister& vm) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(AreSameFormat(vn, vm));
+  VIXL_ASSERT((vn.Is16B() && vd.Is8H()) || (vn.Is2D() && vd.Is1Q()));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kPmull1Q) || vd.Is8H());
+  Emit(VFormat(vn) | NEON_PMULL2 | Rm(vm) | Rn(vn) | Rd(vd));
+}
 
 void Assembler::NEON3DifferentL(const VRegister& vd,
                                 const VRegister& vn,
@@ -2960,8 +2979,6 @@
 
 // clang-format off
 #define NEON_3DIFF_LONG_LIST(V) \
-  V(pmull,  NEON_PMULL,  vn.IsVector() && vn.Is8B())                           \
-  V(pmull2, NEON_PMULL2, vn.IsVector() && vn.Is16B())                          \
   V(saddl,  NEON_SADDL,  vn.IsVector() && vn.IsD())                            \
   V(saddl2, NEON_SADDL2, vn.IsVector() && vn.IsQ())                            \
   V(sabal,  NEON_SABAL,  vn.IsVector() && vn.IsD())                            \
diff --git a/src/aarch64/assembler-aarch64.h b/src/aarch64/assembler-aarch64.h
index 2746f2c..1028da2 100644
--- a/src/aarch64/assembler-aarch64.h
+++ b/src/aarch64/assembler-aarch64.h
@@ -7540,6 +7540,8 @@
   static Instr VFormat(VRegister vd) {
     if (vd.Is64Bits()) {
       switch (vd.GetLanes()) {
+        case 1:
+          return NEON_1D;
         case 2:
           return NEON_2S;
         case 4:
diff --git a/src/aarch64/cpu-features-auditor-aarch64.cc b/src/aarch64/cpu-features-auditor-aarch64.cc
index c198177..66ac97a 100644
--- a/src/aarch64/cpu-features-auditor-aarch64.cc
+++ b/src/aarch64/cpu-features-auditor-aarch64.cc
@@ -736,6 +736,12 @@
   RecordInstructionFeaturesScope scope(this);
   // All of these instructions require NEON.
   scope.Record(CPUFeatures::kNEON);
+  if (form_hash_ == "pmull_asimddiff_l"_h) {
+    if (instr->GetNEONSize() == 3) {
+      // Source is 1D or 2D, destination is 1Q.
+      scope.Record(CPUFeatures::kPmull1Q);
+    }
+  }
   USE(instr);
 }
 
@@ -1408,9 +1414,9 @@
 void CPUFeaturesAuditor::Visit(Metadata* metadata, const Instruction* instr) {
   VIXL_ASSERT(metadata->count("form") > 0);
   const std::string& form = (*metadata)["form"];
-  uint32_t form_hash = Hash(form.c_str());
+  form_hash_ = Hash(form.c_str());
   const FormToVisitorFnMap* fv = CPUFeaturesAuditor::GetFormToVisitorFnMap();
-  FormToVisitorFnMap::const_iterator it = fv->find(form_hash);
+  FormToVisitorFnMap::const_iterator it = fv->find(form_hash_);
   if (it == fv->end()) {
     RecordInstructionFeaturesScope scope(this);
     std::map<uint32_t, const CPUFeatures> features = {
@@ -1829,8 +1835,8 @@
         {"umin_64u_minmax_imm"_h, CPUFeatures::kCSSC},
     };
 
-    if (features.count(form_hash) > 0) {
-      scope.Record(features[form_hash]);
+    if (features.count(form_hash_) > 0) {
+      scope.Record(features[form_hash_]);
     }
   } else {
     (it->second)(this, instr);
diff --git a/src/aarch64/cpu-features-auditor-aarch64.h b/src/aarch64/cpu-features-auditor-aarch64.h
index 613c500..67de644 100644
--- a/src/aarch64/cpu-features-auditor-aarch64.h
+++ b/src/aarch64/cpu-features-auditor-aarch64.h
@@ -127,6 +127,7 @@
       uint32_t,
       std::function<void(CPUFeaturesAuditor*, const Instruction*)>>;
   static const FormToVisitorFnMap* GetFormToVisitorFnMap();
+  uint32_t form_hash_;
 };
 
 }  // namespace aarch64
diff --git a/src/aarch64/decoder-visitor-map-aarch64.h b/src/aarch64/decoder-visitor-map-aarch64.h
index 8ae438c..b40e0ae 100644
--- a/src/aarch64/decoder-visitor-map-aarch64.h
+++ b/src/aarch64/decoder-visitor-map-aarch64.h
@@ -2074,7 +2074,6 @@
       {"scvtf_asimdmiscfp16_r"_h, &VISITORCLASS::VisitNEON2RegMiscFP16},       \
       {"ucvtf_asimdmiscfp16_r"_h, &VISITORCLASS::VisitNEON2RegMiscFP16},       \
       {"addhn_asimddiff_n"_h, &VISITORCLASS::VisitNEON3Different},             \
-      {"pmull_asimddiff_l"_h, &VISITORCLASS::VisitNEON3Different},             \
       {"raddhn_asimddiff_n"_h, &VISITORCLASS::VisitNEON3Different},            \
       {"rsubhn_asimddiff_n"_h, &VISITORCLASS::VisitNEON3Different},            \
       {"sabal_asimddiff_l"_h, &VISITORCLASS::VisitNEON3Different},             \
@@ -2827,6 +2826,7 @@
       {"fmlal_asimdsame_f"_h, &VISITORCLASS::VisitNEON3Same},                  \
       {"fmlsl2_asimdsame_f"_h, &VISITORCLASS::VisitNEON3Same},                 \
       {"fmlsl_asimdsame_f"_h, &VISITORCLASS::VisitNEON3Same},                  \
+      {"pmull_asimddiff_l"_h, &VISITORCLASS::VisitNEON3Different},             \
       {"ushll_asimdshf_l"_h, &VISITORCLASS::VisitNEONShiftImmediate},          \
       {"sshll_asimdshf_l"_h, &VISITORCLASS::VisitNEONShiftImmediate},          \
       {"shrn_asimdshf_n"_h, &VISITORCLASS::VisitNEONShiftImmediate},           \
diff --git a/src/aarch64/disasm-aarch64.cc b/src/aarch64/disasm-aarch64.cc
index ae8fac8..b94ecfb 100644
--- a/src/aarch64/disasm-aarch64.cc
+++ b/src/aarch64/disasm-aarch64.cc
@@ -330,6 +330,7 @@
       {"frsqrte_asisdmisc_r"_h, &Disassembler::DisassembleNEONFPScalar2RegMisc},
       {"scvtf_asisdmisc_r"_h, &Disassembler::DisassembleNEONFPScalar2RegMisc},
       {"ucvtf_asisdmisc_r"_h, &Disassembler::DisassembleNEONFPScalar2RegMisc},
+      {"pmull_asimddiff_l"_h, &Disassembler::DisassembleNEONPolynomialMul},
       {"adclb_z_zzz"_h, &Disassembler::DisassembleSVEAddSubCarry},
       {"adclt_z_zzz"_h, &Disassembler::DisassembleSVEAddSubCarry},
       {"addhnb_z_zz"_h, &Disassembler::DisassembleSVEAddSubHigh},
@@ -2425,11 +2426,6 @@
       nfd.SetFormatMaps(nfd.LongIntegerFormatMap());
       nfd.SetFormatMap(0, nfd.IntegerFormatMap());
       break;
-    case "pmull_asimddiff_l"_h:
-      if (nfd.GetVectorFormat(0) != kFormat8H) {
-        mnemonic = NULL;
-      }
-      break;
     case "sqdmlal_asimddiff_l"_h:
     case "sqdmlsl_asimddiff_l"_h:
     case "sqdmull_asimddiff_l"_h:
@@ -2441,6 +2437,22 @@
   Format(instr, nfd.Mnemonic(mnemonic), nfd.Substitute(form));
 }
 
+void Disassembler::DisassembleNEONPolynomialMul(const Instruction *instr) {
+  const char *mnemonic = instr->ExtractBit(30) ? "pmull2" : "pmull";
+  const char *form = NULL;
+  int size = instr->ExtractBits(23, 22);
+  if (size == 0) {
+    // Bits 30:27 of the instruction are x001, where x is the Q bit. Map
+    // this to "8" and "16" by adding 7.
+    form = "'Vd.8h, 'Vn.'u3127+7b, 'Vm.'u3127+7b";
+  } else if (size == 3) {
+    form = "'Vd.1q, 'Vn.'?30:21d, 'Vm.'?30:21d";
+  } else {
+    mnemonic = NULL;
+  }
+  Format(instr, mnemonic, form);
+}
+
 void Disassembler::DisassembleNEONFPAcrossLanes(const Instruction *instr) {
   const char *mnemonic = mnemonic_.c_str();
   const char *form = "'Sd, 'Vn.4s";
diff --git a/src/aarch64/disasm-aarch64.h b/src/aarch64/disasm-aarch64.h
index cc941bb..7985383 100644
--- a/src/aarch64/disasm-aarch64.h
+++ b/src/aarch64/disasm-aarch64.h
@@ -228,6 +228,7 @@
   void DisassembleNEONScalarShiftRightNarrowImm(const Instruction* instr);
   void DisassembleNEONScalar2RegMiscOnlyD(const Instruction* instr);
   void DisassembleNEONFPScalar2RegMisc(const Instruction* instr);
+  void DisassembleNEONPolynomialMul(const Instruction* instr);
 
   void DisassembleMTELoadTag(const Instruction* instr);
   void DisassembleMTEStoreTag(const Instruction* instr);
diff --git a/src/aarch64/instructions-aarch64.cc b/src/aarch64/instructions-aarch64.cc
index 298a7be..2ac3bca 100644
--- a/src/aarch64/instructions-aarch64.cc
+++ b/src/aarch64/instructions-aarch64.cc
@@ -1011,6 +1011,8 @@
       return kFormat4H;
     case kFormat2D:
       return kFormat2S;
+    case kFormat1Q:
+      return kFormat1D;
     case kFormatH:
       return kFormatB;
     case kFormatS:
@@ -1095,6 +1097,8 @@
       return kFormat2S;
     case kFormat2D:
       return kFormat4S;
+    case kFormat1Q:
+      return kFormat2D;
     case kFormatVnH:
       return kFormatVnB;
     case kFormatVnS:
@@ -1246,6 +1250,7 @@
     case kFormat8H:
     case kFormat4S:
     case kFormat2D:
+    case kFormat1Q:
       return kQRegSize;
     default:
       VIXL_UNREACHABLE();
@@ -1283,6 +1288,7 @@
     case kFormat2D:
     case kFormatVnD:
       return 64;
+    case kFormat1Q:
     case kFormatVnQ:
       return 128;
     case kFormatVnO:
@@ -1348,6 +1354,7 @@
     case kFormat2D:
       return 2;
     case kFormat1D:
+    case kFormat1Q:
     case kFormatB:
     case kFormatH:
     case kFormatS:
diff --git a/src/aarch64/instructions-aarch64.h b/src/aarch64/instructions-aarch64.h
index 38a0d67..ce08ea3 100644
--- a/src/aarch64/instructions-aarch64.h
+++ b/src/aarch64/instructions-aarch64.h
@@ -217,9 +217,10 @@
   kFormatVnQ = kFormatSVEQ | kFormatSVE,
   kFormatVnO = kFormatSVEO | kFormatSVE,
 
-  // An artificial value, used by simulator trace tests and a few oddball
+  // Artificial values, used by simulator trace tests and a few oddball
   // instructions (such as FMLAL).
-  kFormat2H = 0xfffffffe
+  kFormat2H = 0xfffffffe,
+  kFormat1Q = 0xfffffffd
 };
 
 // Instructions. ---------------------------------------------------------------
diff --git a/src/aarch64/logic-aarch64.cc b/src/aarch64/logic-aarch64.cc
index 5e53d27..afd107c 100644
--- a/src/aarch64/logic-aarch64.cc
+++ b/src/aarch64/logic-aarch64.cc
@@ -913,23 +913,12 @@
   return sqrdmlsh(vform, dst, src1, dup_element(indexform, temp, src2, index));
 }
 
-
 uint64_t Simulator::PolynomialMult(uint64_t op1,
                                    uint64_t op2,
                                    int lane_size_in_bits) const {
-  VIXL_ASSERT(static_cast<unsigned>(lane_size_in_bits) <= kSRegSize);
-  VIXL_ASSERT(IsUintN(lane_size_in_bits, op1));
-  VIXL_ASSERT(IsUintN(lane_size_in_bits, op2));
-  uint64_t result = 0;
-  for (int i = 0; i < lane_size_in_bits; ++i) {
-    if ((op1 >> i) & 1) {
-      result = result ^ (op2 << i);
-    }
-  }
-  return result;
+  return PolynomialMult128(op1, op2, lane_size_in_bits).second;
 }
 
-
 LogicVRegister Simulator::pmul(VectorFormat vform,
                                LogicVRegister dst,
                                const LogicVRegister& src1,
@@ -951,14 +940,16 @@
                                 const LogicVRegister& src1,
                                 const LogicVRegister& src2) {
   dst.ClearForWrite(vform);
-
   VectorFormat vform_src = VectorFormatHalfWidth(vform);
-  for (int i = 0; i < LaneCountFromFormat(vform); i++) {
+
+  // Process the elements in reverse to avoid problems when the destination
+  // register is the same as a source.
+  for (int i = LaneCountFromFormat(vform) - 1; i >= 0; i--) {
     dst.SetUint(vform,
                 i,
-                PolynomialMult(src1.Uint(vform_src, i),
-                               src2.Uint(vform_src, i),
-                               LaneSizeInBitsFromFormat(vform_src)));
+                PolynomialMult128(src1.Uint(vform_src, i),
+                                  src2.Uint(vform_src, i),
+                                  LaneSizeInBitsFromFormat(vform_src)));
   }
 
   return dst;
@@ -969,16 +960,18 @@
                                  LogicVRegister dst,
                                  const LogicVRegister& src1,
                                  const LogicVRegister& src2) {
-  VectorFormat vform_src = VectorFormatHalfWidthDoubleLanes(vform);
   dst.ClearForWrite(vform);
+  VectorFormat vform_src = VectorFormatHalfWidthDoubleLanes(vform);
+
   int lane_count = LaneCountFromFormat(vform);
   for (int i = 0; i < lane_count; i++) {
     dst.SetUint(vform,
                 i,
-                PolynomialMult(src1.Uint(vform_src, lane_count + i),
-                               src2.Uint(vform_src, lane_count + i),
-                               LaneSizeInBitsFromFormat(vform_src)));
+                PolynomialMult128(src1.Uint(vform_src, lane_count + i),
+                                  src2.Uint(vform_src, lane_count + i),
+                                  LaneSizeInBitsFromFormat(vform_src)));
   }
+
   return dst;
 }
 
diff --git a/src/aarch64/registers-aarch64.cc b/src/aarch64/registers-aarch64.cc
index f7101a7..3df7831 100644
--- a/src/aarch64/registers-aarch64.cc
+++ b/src/aarch64/registers-aarch64.cc
@@ -153,7 +153,8 @@
   V(2, S)                                 \
   V(4, S)                                 \
   V(1, D)                                 \
-  V(2, D)
+  V(2, D)                                 \
+  V(1, Q)
 #define VIXL_DEFINE_CPUREG_NEON_COERCION(LANES, LANE_TYPE)             \
   VRegister VRegister::V##LANES##LANE_TYPE() const {                   \
     VIXL_ASSERT(IsVRegister());                                        \
diff --git a/src/aarch64/registers-aarch64.h b/src/aarch64/registers-aarch64.h
index 7175c65..53bbe13 100644
--- a/src/aarch64/registers-aarch64.h
+++ b/src/aarch64/registers-aarch64.h
@@ -575,6 +575,7 @@
   VRegister V4S() const;
   VRegister V1D() const;
   VRegister V2D() const;
+  VRegister V1Q() const;
   VRegister S4B() const;
 
   bool IsValid() const { return IsValidVRegister(); }
diff --git a/src/aarch64/simulator-aarch64.cc b/src/aarch64/simulator-aarch64.cc
index 32f04c3..17f0916 100644
--- a/src/aarch64/simulator-aarch64.cc
+++ b/src/aarch64/simulator-aarch64.cc
@@ -997,6 +997,19 @@
   return std::make_pair(sum_hi.first, sum_lo.first);
 }
 
+vixl_uint128_t Simulator::Lsl128(vixl_uint128_t x, unsigned shift) const {
+  VIXL_ASSERT(shift <= 64);
+  if (shift == 0) return x;
+  if (shift == 64) return std::make_pair(x.second, 0);
+  uint64_t lo = x.second << shift;
+  uint64_t hi = (x.first << shift) | (x.second >> (64 - shift));
+  return std::make_pair(hi, lo);
+}
+
+vixl_uint128_t Simulator::Eor128(vixl_uint128_t x, vixl_uint128_t y) const {
+  return std::make_pair(x.first ^ y.first, x.second ^ y.second);
+}
+
 vixl_uint128_t Simulator::Neg128(vixl_uint128_t x) {
   // Negate the integer value. Throw an assertion when the input is INT128_MIN.
   VIXL_ASSERT((x.first != GetSignMask(64)) || (x.second != 0));
@@ -1035,6 +1048,20 @@
                     : result;
 }
 
+vixl_uint128_t Simulator::PolynomialMult128(uint64_t op1,
+                                            uint64_t op2,
+                                            int lane_size_in_bits) const {
+  VIXL_ASSERT(static_cast<unsigned>(lane_size_in_bits) <= kDRegSize);
+  vixl_uint128_t result = std::make_pair(0, 0);
+  vixl_uint128_t op2q = std::make_pair(0, op2);
+  for (int i = 0; i < lane_size_in_bits; i++) {
+    if ((op1 >> i) & 1) {
+      result = Eor128(result, Lsl128(op2q, i));
+    }
+  }
+  return result;
+}
+
 int64_t Simulator::ShiftOperand(unsigned reg_size,
                                 uint64_t uvalue,
                                 Shift shift_type,
@@ -7800,13 +7827,24 @@
   SimVRegister& rd = ReadVRegister(instr->GetRd());
   SimVRegister& rn = ReadVRegister(instr->GetRn());
   SimVRegister& rm = ReadVRegister(instr->GetRm());
+  int size = instr->GetNEONSize();
 
   switch (instr->Mask(NEON3DifferentMask)) {
     case NEON_PMULL:
-      pmull(vf_l, rd, rn, rm);
+      if ((size == 1) || (size == 2)) {  // S/D reserved.
+        VisitUnallocated(instr);
+      } else {
+        if (size == 3) vf_l = kFormat1Q;
+        pmull(vf_l, rd, rn, rm);
+      }
       break;
     case NEON_PMULL2:
-      pmull2(vf_l, rd, rn, rm);
+      if ((size == 1) || (size == 2)) {  // S/D reserved.
+        VisitUnallocated(instr);
+      } else {
+        if (size == 3) vf_l = kFormat1Q;
+        pmull2(vf_l, rd, rn, rm);
+      }
       break;
     case NEON_UADDL:
       uaddl(vf_l, rd, rn, rm);
diff --git a/src/aarch64/simulator-aarch64.h b/src/aarch64/simulator-aarch64.h
index 5b71fc1..77b00a6 100644
--- a/src/aarch64/simulator-aarch64.h
+++ b/src/aarch64/simulator-aarch64.h
@@ -729,6 +729,8 @@
   SimPRegister& register_;
 };
 
+using vixl_uint128_t = std::pair<uint64_t, uint64_t>;
+
 // Representation of a vector register, with typed getters and setters for lanes
 // and additional information to represent lane state.
 class LogicVRegister {
@@ -857,6 +859,17 @@
     }
   }
 
+  void SetUint(VectorFormat vform, int index, vixl_uint128_t value) const {
+    if (LaneSizeInBitsFromFormat(vform) <= 64) {
+      SetUint(vform, index, value.second);
+      return;
+    }
+    // TODO: Extend this to SVE.
+    VIXL_ASSERT((vform == kFormat1Q) && (index == 0));
+    SetUint(kFormat2D, 0, value.second);
+    SetUint(kFormat2D, 1, value.first);
+  }
+
   void SetUintArray(VectorFormat vform, const uint64_t* src) const {
     ClearForWrite(vform);
     for (int i = 0; i < LaneCountFromFormat(vform); i++) {
@@ -3279,8 +3292,9 @@
                                             uint64_t left,
                                             uint64_t right,
                                             int carry_in);
-  using vixl_uint128_t = std::pair<uint64_t, uint64_t>;
   vixl_uint128_t Add128(vixl_uint128_t x, vixl_uint128_t y);
+  vixl_uint128_t Lsl128(vixl_uint128_t x, unsigned shift) const;
+  vixl_uint128_t Eor128(vixl_uint128_t x, vixl_uint128_t y) const;
   vixl_uint128_t Mul64(uint64_t x, uint64_t y);
   vixl_uint128_t Neg128(vixl_uint128_t x);
   void LogicalHelper(const Instruction* instr, int64_t op2);
@@ -3362,6 +3376,9 @@
   uint64_t PolynomialMult(uint64_t op1,
                           uint64_t op2,
                           int lane_size_in_bits) const;
+  vixl_uint128_t PolynomialMult128(uint64_t op1,
+                                   uint64_t op2,
+                                   int lane_size_in_bits) const;
 
   bool ld1(VectorFormat vform, LogicVRegister dst, uint64_t addr);
   bool ld1(VectorFormat vform, LogicVRegister dst, int index, uint64_t addr);
diff --git a/test/aarch64/test-assembler-neon-aarch64.cc b/test/aarch64/test-assembler-neon-aarch64.cc
index 1682d13..35f768a 100644
--- a/test/aarch64/test-assembler-neon-aarch64.cc
+++ b/test/aarch64/test-assembler-neon-aarch64.cc
@@ -10975,6 +10975,24 @@
   }
 }
 
+TEST(neon_pmull_regression_test) {
+  SETUP_WITH_FEATURES(CPUFeatures::kNEON);
+
+  START();
+  __ Movi(v0.V2D(), 0xdecafc0ffee);
+  __ Pmull(v0.V8H(), v0.V8B(), v0.V8B());
+
+  __ Movi(v1.V2D(), 0xaaaaaaaa55555555);
+  __ Pmull2(v1.V8H(), v1.V16B(), v1.V16B());
+  END();
+
+  if (CAN_RUN()) {
+    RUN();
+    ASSERT_EQUAL_128(0x0000000000515450, 0x4455500055555454, q0);
+    ASSERT_EQUAL_128(0x4444444444444444, 0x1111111111111111, q1);
+  }
+}
+
 TEST(zero_high_b) {
   SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON, CPUFeatures::kRDM);
   START();
diff --git a/test/aarch64/test-cpu-features-aarch64.cc b/test/aarch64/test-cpu-features-aarch64.cc
index 4a82127..187bbd5 100644
--- a/test/aarch64/test-cpu-features-aarch64.cc
+++ b/test/aarch64/test-cpu-features-aarch64.cc
@@ -3778,5 +3778,12 @@
 TEST_FP_FCMA_NEON_NEONHALF(fcmla_2, fcmla(v0.V4H(), v1.V4H(), v2.V4H(), 180))
 TEST_FP_FCMA_NEON_NEONHALF(fcmla_3, fcmla(v0.V8H(), v1.V8H(), v2.V8H(), 0))
 
+#define TEST_FEAT(NAME, ASM)                                            \
+  TEST_TEMPLATE(CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kPmull1Q), \
+                NEON_Pmull1Q_##NAME,                                    \
+                ASM)
+TEST_FEAT(pmull1q_0, pmull(v5.V1Q(), v6.V1D(), v7.V1D()))
+#undef TEST_FEAT
+
 }  // namespace aarch64
 }  // namespace vixl
diff --git a/test/aarch64/test-disasm-neon-aarch64.cc b/test/aarch64/test-disasm-neon-aarch64.cc
index 14dd18a..0b37794 100644
--- a/test/aarch64/test-disasm-neon-aarch64.cc
+++ b/test/aarch64/test-disasm-neon-aarch64.cc
@@ -2904,6 +2904,10 @@
                 "pmull v0.8h, v1.8b, v2.8b");
   COMPARE_MACRO(Pmull2(v2.V8H(), v3.V16B(), v4.V16B()),
                 "pmull2 v2.8h, v3.16b, v4.16b");
+  COMPARE_MACRO(Pmull(v5.V1Q(), v6.V1D(), v7.V1D()),
+                "pmull v5.1q, v6.1d, v7.1d");
+  COMPARE_MACRO(Pmull2(v8.V1Q(), v9.V2D(), v10.V2D()),
+                "pmull2 v8.1q, v9.2d, v10.2d");
 
   CLEANUP();
 }
@@ -4562,8 +4566,6 @@
   COMPARE_PREFIX(dci(0x2efb9dbd), "unallocated");  // pmul v.und, v.und, v.und
   COMPARE_PREFIX(dci(0x4eace101), "unallocated");  // pmull v.d, v.s, v.s
   COMPARE_PREFIX(dci(0x0e6de3ad), "unallocated");  // pmull v.s, v.h, v.h
-  COMPARE_PREFIX(dci(0x4ee3e2c0), "unallocated");  // pmull v.und, v.d, v.d
-  COMPARE_PREFIX(dci(0x0eede060), "unallocated");  // pmull v.und, v.und, v.und
   COMPARE_PREFIX(dci(0x6ee00afd), "unallocated");  // rev v.d, v.d
   COMPARE_PREFIX(dci(0x4e601975), "unallocated");  // rev v.h, v.h
   COMPARE_PREFIX(dci(0x4ea019f3), "unallocated");  // rev v.s, v.s
diff --git a/test/aarch64/test-simulator-aarch64.cc b/test/aarch64/test-simulator-aarch64.cc
index 1e2feaf..5318f65 100644
--- a/test/aarch64/test-simulator-aarch64.cc
+++ b/test/aarch64/test-simulator-aarch64.cc
@@ -5188,7 +5188,8 @@
   // next instruction, after this handler.
   uc->uc_mcontext.gregs[REG_RIP] = sim->GetSignalReturnAddress();
   // Return that the memory read failed.
-  uc->uc_mcontext.gregs[REG_RAX] = static_cast<greg_t>(MemoryReadResult::Failure);
+  uc->uc_mcontext.gregs[REG_RAX] =
+      static_cast<greg_t>(MemoryReadResult::Failure);
 }
 
 TEST(ImplicitCheck) {
diff --git a/test/aarch64/test-simulator-sve-aarch64.cc b/test/aarch64/test-simulator-sve-aarch64.cc
index 51f7d82..1ba7783 100644
--- a/test/aarch64/test-simulator-sve-aarch64.cc
+++ b/test/aarch64/test-simulator-sve-aarch64.cc
@@ -267,5 +267,132 @@
   }
 }
 
+// Below here, there are tests for Neon instructions. As these forms of test
+// check the entire register state, they also need SVE features.
+
+TEST_SVE(neon_pmull) {
+  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
+                          CPUFeatures::kNEON,
+                          CPUFeatures::kCRC32,
+                          CPUFeatures::kPmull1Q);
+  START();
+
+  SetInitialMachineState(&masm);
+  // state = 0xe2bd2480
+
+  {
+    ExactAssemblyScope scope(&masm, 40 * kInstructionSize);
+    __ dci(0x4e20e000);  // pmull2 v0.8h, v0.16b, v0.16b
+    // vl128 state = 0x5eba4d4f
+    __ dci(0x4e20e228);  // pmull2 v8.8h, v17.16b, v0.16b
+    // vl128 state = 0x86bceb87
+    __ dci(0x4ee0e22a);  // pmull2 v10.1q, v17.2d, v0.2d
+    // vl128 state = 0x1332fe02
+    __ dci(0x0ee8e222);  // pmull v2.1q, v17.1d, v8.1d
+    // vl128 state = 0xd357dc7b
+    __ dci(0x4eece226);  // pmull2 v6.1q, v17.2d, v12.2d
+    // vl128 state = 0xdff409ad
+    __ dci(0x0eece276);  // pmull v22.1q, v19.1d, v12.1d
+    // vl128 state = 0xd8af1dc6
+    __ dci(0x0eede232);  // pmull v18.1q, v17.1d, v13.1d
+    // vl128 state = 0x41e6ed0e
+    __ dci(0x0efde216);  // pmull v22.1q, v16.1d, v29.1d
+    // vl128 state = 0x1f10365f
+    __ dci(0x0effe23e);  // pmull v30.1q, v17.1d, v31.1d
+    // vl128 state = 0x9779ece5
+    __ dci(0x0ee7e23f);  // pmull v31.1q, v17.1d, v7.1d
+    // vl128 state = 0x11fc8ce9
+    __ dci(0x0ee2e23e);  // pmull v30.1q, v17.1d, v2.1d
+    // vl128 state = 0x101d5a6f
+    __ dci(0x0ee2e23c);  // pmull v28.1q, v17.1d, v2.1d
+    // vl128 state = 0xcc4fe26e
+    __ dci(0x0eeae27d);  // pmull v29.1q, v19.1d, v10.1d
+    // vl128 state = 0xc84be9f4
+    __ dci(0x4eeae24d);  // pmull2 v13.1q, v18.2d, v10.2d
+    // vl128 state = 0x2fc540b4
+    __ dci(0x4eeae25d);  // pmull2 v29.1q, v18.2d, v10.2d
+    // vl128 state = 0x1b2d99cd
+    __ dci(0x4eeae2ed);  // pmull2 v13.1q, v23.2d, v10.2d
+    // vl128 state = 0x8a278b95
+    __ dci(0x4eeae2e9);  // pmull2 v9.1q, v23.2d, v10.2d
+    // vl128 state = 0x3359b4c8
+    __ dci(0x4efee2e8);  // pmull2 v8.1q, v23.2d, v30.2d
+    // vl128 state = 0x5c25ed31
+    __ dci(0x4effe3e0);  // pmull2 v0.1q, v31.2d, v31.2d
+    // vl128 state = 0x28ff67d1
+    __ dci(0x4eefe3d0);  // pmull2 v16.1q, v30.2d, v15.2d
+    // vl128 state = 0x1543436d
+    __ dci(0x4ee7e2d1);  // pmull2 v17.1q, v22.2d, v7.2d
+    // vl128 state = 0x71b8bc90
+    __ dci(0x4eefe3d5);  // pmull2 v21.1q, v30.2d, v15.2d
+    // vl128 state = 0x3d35ca02
+    __ dci(0x4eefe314);  // pmull2 v20.1q, v24.2d, v15.2d
+    // vl128 state = 0x40e8fade
+    __ dci(0x4eefe310);  // pmull2 v16.1q, v24.2d, v15.2d
+    // vl128 state = 0xb8affb87
+    __ dci(0x4eefe300);  // pmull2 v0.1q, v24.2d, v15.2d
+    // vl128 state = 0x4824ee5c
+    __ dci(0x4eede350);  // pmull2 v16.1q, v26.2d, v13.2d
+    // vl128 state = 0x39202868
+    __ dci(0x4ee7e354);  // pmull2 v20.1q, v26.2d, v7.2d
+    // vl128 state = 0xc8fde340
+    __ dci(0x4e27e356);  // pmull2 v22.8h, v26.16b, v7.16b
+    // vl128 state = 0x0f02316b
+    __ dci(0x4e37e15e);  // pmull2 v30.8h, v10.16b, v23.16b
+    // vl128 state = 0xced4f8bd
+    __ dci(0x4e33e05f);  // pmull2 v31.8h, v2.16b, v19.16b
+    // vl128 state = 0x0c76bdb3
+    __ dci(0x0e23e05e);  // pmull v30.8h, v2.8b, v3.8b
+    // vl128 state = 0x0e36962b
+    __ dci(0x4e23e25f);  // pmull2 v31.8h, v18.16b, v3.16b
+    // vl128 state = 0x11a8dcc3
+    __ dci(0x4e23e25b);  // pmull2 v27.8h, v18.16b, v3.16b
+    // vl128 state = 0xf01bfe16
+    __ dci(0x4e23e259);  // pmull2 v25.8h, v18.16b, v3.16b
+    // vl128 state = 0xea351afe
+    __ dci(0x4e22e2c9);  // pmull2 v9.8h, v22.16b, v2.16b
+    // vl128 state = 0x16e933ef
+    __ dci(0x4e3ae2c8);  // pmull2 v8.8h, v22.16b, v26.16b
+    // vl128 state = 0x02528a2a
+    __ dci(0x4e32e249);  // pmull2 v9.8h, v18.16b, v18.16b
+    // vl128 state = 0xe7e20633
+    __ dci(0x4e36e20d);  // pmull2 v13.8h, v16.16b, v22.16b
+    // vl128 state = 0x6f231732
+    __ dci(0x4e36e205);  // pmull2 v5.8h, v16.16b, v22.16b
+    // vl128 state = 0x423eb7ea
+    __ dci(0x4e22e20d);  // pmull2 v13.8h, v16.16b, v2.16b
+    // vl128 state = 0xfc0d1c14
+  }
+
+  uint32_t state;
+  ComputeMachineStateHash(&masm, &state);
+  __ Mov(x0, reinterpret_cast<uint64_t>(&state));
+  __ Ldr(w0, MemOperand(x0));
+
+  END();
+  if (CAN_RUN()) {
+    RUN();
+    uint32_t expected_hashes[] = {
+        0xfc0d1c14,
+        0x4cb040a3,
+        0x4b913ebe,
+        0xfa35b836,
+        0x78745d20,
+        0x6666b09a,
+        0xee2868f4,
+        0x1936a795,
+        0x1025244a,
+        0xe8551950,
+        0xae73af02,
+        0x0fdd5fc7,
+        0x22e9827b,
+        0x384ce1ac,
+        0xc833cbeb,
+        0x255baab5,
+    };
+    ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0);
+  }
+}
+
 }  // namespace aarch64
 }  // namespace vixl