Support PMULL for 1Q destination vectors (#91)
Extend the Neon PMULL and PMULL2 instructions to support 1Q destination
registers when the kPmull1Q CPU feature is available.
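
For reference, a minimal usage sketch (not part of this patch) showing the new
1Q forms generated through the MacroAssembler, mirroring the disassembly tests
added below:

#include "aarch64/macro-assembler-aarch64.h"

using namespace vixl::aarch64;

// Usage sketch only. PMULL multiplies the low 64-bit source lanes and PMULL2
// the high ones; both produce one 128-bit carry-less product in a 1Q
// destination.
void GeneratePmull1Q(MacroAssembler* masm) {
  masm->Pmull(v5.V1Q(), v6.V1D(), v7.V1D());    // pmull  v5.1q, v6.1d, v7.1d
  masm->Pmull2(v8.V1Q(), v9.V2D(), v10.V2D());  // pmull2 v8.1q, v9.2d, v10.2d
}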
diff --git a/src/aarch64/assembler-aarch64.cc b/src/aarch64/assembler-aarch64.cc
index c022717..c0f66ec 100644
--- a/src/aarch64/assembler-aarch64.cc
+++ b/src/aarch64/assembler-aarch64.cc
@@ -2913,6 +2913,25 @@
LoadStoreStructSingle(vt, lane, dst, NEONLoadStoreSingleStructStore1);
}
+void Assembler::pmull(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm) {
+ VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+ VIXL_ASSERT(AreSameFormat(vn, vm));
+ VIXL_ASSERT((vn.Is8B() && vd.Is8H()) || (vn.Is1D() && vd.Is1Q()));
+ VIXL_ASSERT(CPUHas(CPUFeatures::kPmull1Q) || vd.Is8H());
+ Emit(VFormat(vn) | NEON_PMULL | Rm(vm) | Rn(vn) | Rd(vd));
+}
+
+void Assembler::pmull2(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm) {
+ VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+ VIXL_ASSERT(AreSameFormat(vn, vm));
+ VIXL_ASSERT((vn.Is16B() && vd.Is8H()) || (vn.Is2D() && vd.Is1Q()));
+ VIXL_ASSERT(CPUHas(CPUFeatures::kPmull1Q) || vd.Is8H());
+ Emit(VFormat(vn) | NEON_PMULL2 | Rm(vm) | Rn(vn) | Rd(vd));
+}
void Assembler::NEON3DifferentL(const VRegister& vd,
const VRegister& vn,
@@ -2960,8 +2979,6 @@
// clang-format off
#define NEON_3DIFF_LONG_LIST(V) \
- V(pmull, NEON_PMULL, vn.IsVector() && vn.Is8B()) \
- V(pmull2, NEON_PMULL2, vn.IsVector() && vn.Is16B()) \
V(saddl, NEON_SADDL, vn.IsVector() && vn.IsD()) \
V(saddl2, NEON_SADDL2, vn.IsVector() && vn.IsQ()) \
V(sabal, NEON_SABAL, vn.IsVector() && vn.IsD()) \
diff --git a/src/aarch64/assembler-aarch64.h b/src/aarch64/assembler-aarch64.h
index 2746f2c..1028da2 100644
--- a/src/aarch64/assembler-aarch64.h
+++ b/src/aarch64/assembler-aarch64.h
@@ -7540,6 +7540,8 @@
static Instr VFormat(VRegister vd) {
if (vd.Is64Bits()) {
switch (vd.GetLanes()) {
+ case 1:
+ return NEON_1D;
case 2:
return NEON_2S;
case 4:
diff --git a/src/aarch64/cpu-features-auditor-aarch64.cc b/src/aarch64/cpu-features-auditor-aarch64.cc
index c198177..66ac97a 100644
--- a/src/aarch64/cpu-features-auditor-aarch64.cc
+++ b/src/aarch64/cpu-features-auditor-aarch64.cc
@@ -736,6 +736,12 @@
RecordInstructionFeaturesScope scope(this);
// All of these instructions require NEON.
scope.Record(CPUFeatures::kNEON);
+ if (form_hash_ == "pmull_asimddiff_l"_h) {
+ if (instr->GetNEONSize() == 3) {
+ // Source is 1D or 2D, destination is 1Q.
+ scope.Record(CPUFeatures::kPmull1Q);
+ }
+ }
USE(instr);
}
@@ -1408,9 +1414,9 @@
void CPUFeaturesAuditor::Visit(Metadata* metadata, const Instruction* instr) {
VIXL_ASSERT(metadata->count("form") > 0);
const std::string& form = (*metadata)["form"];
- uint32_t form_hash = Hash(form.c_str());
+ form_hash_ = Hash(form.c_str());
const FormToVisitorFnMap* fv = CPUFeaturesAuditor::GetFormToVisitorFnMap();
- FormToVisitorFnMap::const_iterator it = fv->find(form_hash);
+ FormToVisitorFnMap::const_iterator it = fv->find(form_hash_);
if (it == fv->end()) {
RecordInstructionFeaturesScope scope(this);
std::map<uint32_t, const CPUFeatures> features = {
@@ -1829,8 +1835,8 @@
{"umin_64u_minmax_imm"_h, CPUFeatures::kCSSC},
};
- if (features.count(form_hash) > 0) {
- scope.Record(features[form_hash]);
+ if (features.count(form_hash_) > 0) {
+ scope.Record(features[form_hash_]);
}
} else {
(it->second)(this, instr);
diff --git a/src/aarch64/cpu-features-auditor-aarch64.h b/src/aarch64/cpu-features-auditor-aarch64.h
index 613c500..67de644 100644
--- a/src/aarch64/cpu-features-auditor-aarch64.h
+++ b/src/aarch64/cpu-features-auditor-aarch64.h
@@ -127,6 +127,7 @@
uint32_t,
std::function<void(CPUFeaturesAuditor*, const Instruction*)>>;
static const FormToVisitorFnMap* GetFormToVisitorFnMap();
+ uint32_t form_hash_;
};
} // namespace aarch64
diff --git a/src/aarch64/decoder-visitor-map-aarch64.h b/src/aarch64/decoder-visitor-map-aarch64.h
index 8ae438c..b40e0ae 100644
--- a/src/aarch64/decoder-visitor-map-aarch64.h
+++ b/src/aarch64/decoder-visitor-map-aarch64.h
@@ -2074,7 +2074,6 @@
{"scvtf_asimdmiscfp16_r"_h, &VISITORCLASS::VisitNEON2RegMiscFP16}, \
{"ucvtf_asimdmiscfp16_r"_h, &VISITORCLASS::VisitNEON2RegMiscFP16}, \
{"addhn_asimddiff_n"_h, &VISITORCLASS::VisitNEON3Different}, \
- {"pmull_asimddiff_l"_h, &VISITORCLASS::VisitNEON3Different}, \
{"raddhn_asimddiff_n"_h, &VISITORCLASS::VisitNEON3Different}, \
{"rsubhn_asimddiff_n"_h, &VISITORCLASS::VisitNEON3Different}, \
{"sabal_asimddiff_l"_h, &VISITORCLASS::VisitNEON3Different}, \
@@ -2827,6 +2826,7 @@
{"fmlal_asimdsame_f"_h, &VISITORCLASS::VisitNEON3Same}, \
{"fmlsl2_asimdsame_f"_h, &VISITORCLASS::VisitNEON3Same}, \
{"fmlsl_asimdsame_f"_h, &VISITORCLASS::VisitNEON3Same}, \
+ {"pmull_asimddiff_l"_h, &VISITORCLASS::VisitNEON3Different}, \
{"ushll_asimdshf_l"_h, &VISITORCLASS::VisitNEONShiftImmediate}, \
{"sshll_asimdshf_l"_h, &VISITORCLASS::VisitNEONShiftImmediate}, \
{"shrn_asimdshf_n"_h, &VISITORCLASS::VisitNEONShiftImmediate}, \
diff --git a/src/aarch64/disasm-aarch64.cc b/src/aarch64/disasm-aarch64.cc
index ae8fac8..b94ecfb 100644
--- a/src/aarch64/disasm-aarch64.cc
+++ b/src/aarch64/disasm-aarch64.cc
@@ -330,6 +330,7 @@
{"frsqrte_asisdmisc_r"_h, &Disassembler::DisassembleNEONFPScalar2RegMisc},
{"scvtf_asisdmisc_r"_h, &Disassembler::DisassembleNEONFPScalar2RegMisc},
{"ucvtf_asisdmisc_r"_h, &Disassembler::DisassembleNEONFPScalar2RegMisc},
+ {"pmull_asimddiff_l"_h, &Disassembler::DisassembleNEONPolynomialMul},
{"adclb_z_zzz"_h, &Disassembler::DisassembleSVEAddSubCarry},
{"adclt_z_zzz"_h, &Disassembler::DisassembleSVEAddSubCarry},
{"addhnb_z_zz"_h, &Disassembler::DisassembleSVEAddSubHigh},
@@ -2425,11 +2426,6 @@
nfd.SetFormatMaps(nfd.LongIntegerFormatMap());
nfd.SetFormatMap(0, nfd.IntegerFormatMap());
break;
- case "pmull_asimddiff_l"_h:
- if (nfd.GetVectorFormat(0) != kFormat8H) {
- mnemonic = NULL;
- }
- break;
case "sqdmlal_asimddiff_l"_h:
case "sqdmlsl_asimddiff_l"_h:
case "sqdmull_asimddiff_l"_h:
@@ -2441,6 +2437,22 @@
Format(instr, nfd.Mnemonic(mnemonic), nfd.Substitute(form));
}
+void Disassembler::DisassembleNEONPolynomialMul(const Instruction *instr) {
+ const char *mnemonic = instr->ExtractBit(30) ? "pmull2" : "pmull";
+ const char *form = NULL;
+ int size = instr->ExtractBits(23, 22);
+ if (size == 0) {
+ // Bits 30:27 of the instruction are x001, where x is the Q bit. Map
+ // this to "8" and "16" by adding 7.
+ form = "'Vd.8h, 'Vn.'u3127+7b, 'Vm.'u3127+7b";
+ } else if (size == 3) {
+ form = "'Vd.1q, 'Vn.'?30:21d, 'Vm.'?30:21d";
+ } else {
+ mnemonic = NULL;
+ }
+ Format(instr, mnemonic, form);
+}
+
void Disassembler::DisassembleNEONFPAcrossLanes(const Instruction *instr) {
const char *mnemonic = mnemonic_.c_str();
const char *form = "'Sd, 'Vn.4s";
diff --git a/src/aarch64/disasm-aarch64.h b/src/aarch64/disasm-aarch64.h
index cc941bb..7985383 100644
--- a/src/aarch64/disasm-aarch64.h
+++ b/src/aarch64/disasm-aarch64.h
@@ -228,6 +228,7 @@
void DisassembleNEONScalarShiftRightNarrowImm(const Instruction* instr);
void DisassembleNEONScalar2RegMiscOnlyD(const Instruction* instr);
void DisassembleNEONFPScalar2RegMisc(const Instruction* instr);
+ void DisassembleNEONPolynomialMul(const Instruction* instr);
void DisassembleMTELoadTag(const Instruction* instr);
void DisassembleMTEStoreTag(const Instruction* instr);
diff --git a/src/aarch64/instructions-aarch64.cc b/src/aarch64/instructions-aarch64.cc
index 298a7be..2ac3bca 100644
--- a/src/aarch64/instructions-aarch64.cc
+++ b/src/aarch64/instructions-aarch64.cc
@@ -1011,6 +1011,8 @@
return kFormat4H;
case kFormat2D:
return kFormat2S;
+ case kFormat1Q:
+ return kFormat1D;
case kFormatH:
return kFormatB;
case kFormatS:
@@ -1095,6 +1097,8 @@
return kFormat2S;
case kFormat2D:
return kFormat4S;
+ case kFormat1Q:
+ return kFormat2D;
case kFormatVnH:
return kFormatVnB;
case kFormatVnS:
@@ -1246,6 +1250,7 @@
case kFormat8H:
case kFormat4S:
case kFormat2D:
+ case kFormat1Q:
return kQRegSize;
default:
VIXL_UNREACHABLE();
@@ -1283,6 +1288,7 @@
case kFormat2D:
case kFormatVnD:
return 64;
+ case kFormat1Q:
case kFormatVnQ:
return 128;
case kFormatVnO:
@@ -1348,6 +1354,7 @@
case kFormat2D:
return 2;
case kFormat1D:
+ case kFormat1Q:
case kFormatB:
case kFormatH:
case kFormatS:
diff --git a/src/aarch64/instructions-aarch64.h b/src/aarch64/instructions-aarch64.h
index 38a0d67..ce08ea3 100644
--- a/src/aarch64/instructions-aarch64.h
+++ b/src/aarch64/instructions-aarch64.h
@@ -217,9 +217,10 @@
kFormatVnQ = kFormatSVEQ | kFormatSVE,
kFormatVnO = kFormatSVEO | kFormatSVE,
- // An artificial value, used by simulator trace tests and a few oddball
+ // Artificial values, used by simulator trace tests and a few oddball
// instructions (such as FMLAL).
- kFormat2H = 0xfffffffe
+ kFormat2H = 0xfffffffe,
+ kFormat1Q = 0xfffffffd
};
// Instructions. ---------------------------------------------------------------
diff --git a/src/aarch64/logic-aarch64.cc b/src/aarch64/logic-aarch64.cc
index 5e53d27..afd107c 100644
--- a/src/aarch64/logic-aarch64.cc
+++ b/src/aarch64/logic-aarch64.cc
@@ -913,23 +913,12 @@
return sqrdmlsh(vform, dst, src1, dup_element(indexform, temp, src2, index));
}
-
uint64_t Simulator::PolynomialMult(uint64_t op1,
uint64_t op2,
int lane_size_in_bits) const {
- VIXL_ASSERT(static_cast<unsigned>(lane_size_in_bits) <= kSRegSize);
- VIXL_ASSERT(IsUintN(lane_size_in_bits, op1));
- VIXL_ASSERT(IsUintN(lane_size_in_bits, op2));
- uint64_t result = 0;
- for (int i = 0; i < lane_size_in_bits; ++i) {
- if ((op1 >> i) & 1) {
- result = result ^ (op2 << i);
- }
- }
- return result;
+ return PolynomialMult128(op1, op2, lane_size_in_bits).second;
}
-
LogicVRegister Simulator::pmul(VectorFormat vform,
LogicVRegister dst,
const LogicVRegister& src1,
@@ -951,14 +940,16 @@
const LogicVRegister& src1,
const LogicVRegister& src2) {
dst.ClearForWrite(vform);
-
VectorFormat vform_src = VectorFormatHalfWidth(vform);
- for (int i = 0; i < LaneCountFromFormat(vform); i++) {
+
+ // Process the elements in reverse to avoid problems when the destination
+ // register is the same as a source.
+ for (int i = LaneCountFromFormat(vform) - 1; i >= 0; i--) {
dst.SetUint(vform,
i,
- PolynomialMult(src1.Uint(vform_src, i),
- src2.Uint(vform_src, i),
- LaneSizeInBitsFromFormat(vform_src)));
+ PolynomialMult128(src1.Uint(vform_src, i),
+ src2.Uint(vform_src, i),
+ LaneSizeInBitsFromFormat(vform_src)));
}
return dst;
@@ -969,16 +960,18 @@
LogicVRegister dst,
const LogicVRegister& src1,
const LogicVRegister& src2) {
- VectorFormat vform_src = VectorFormatHalfWidthDoubleLanes(vform);
dst.ClearForWrite(vform);
+ VectorFormat vform_src = VectorFormatHalfWidthDoubleLanes(vform);
+
int lane_count = LaneCountFromFormat(vform);
for (int i = 0; i < lane_count; i++) {
dst.SetUint(vform,
i,
- PolynomialMult(src1.Uint(vform_src, lane_count + i),
- src2.Uint(vform_src, lane_count + i),
- LaneSizeInBitsFromFormat(vform_src)));
+ PolynomialMult128(src1.Uint(vform_src, lane_count + i),
+ src2.Uint(vform_src, lane_count + i),
+ LaneSizeInBitsFromFormat(vform_src)));
}
+
return dst;
}
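
Aside (not part of the patch): the reverse loop in pmull above matters because
the destination register may alias a source; in a widening operation, output
lane i occupies the bytes of input lanes 2i and 2i+1, so a forward pass would
clobber input lane 1 when writing output lane 0. A standalone sketch of the
same in-place widening pattern:

#include <cassert>
#include <cstdint>

// Widen 8 x u8 lanes to 8 x u16 lanes inside the same 16-byte "register".
// Iterating from the top lane down, every narrow lane is read before any
// widened write can overlap it; a forward loop would read clobbered data.
void WidenInPlace(uint8_t reg[16]) {
  for (int i = 7; i >= 0; i--) {
    uint16_t wide = reg[i];  // read the narrow lane first
    reg[2 * i] = static_cast<uint8_t>(wide);
    reg[2 * i + 1] = static_cast<uint8_t>(wide >> 8);
  }
}

int main() {
  uint8_t reg[16] = {1, 2, 3, 4, 5, 6, 7, 8};
  WidenInPlace(reg);
  assert((reg[14] == 8) && (reg[15] == 0));  // top lane widened correctly
  assert((reg[0] == 1) && (reg[1] == 0));    // bottom lane read before write
}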
diff --git a/src/aarch64/registers-aarch64.cc b/src/aarch64/registers-aarch64.cc
index f7101a7..3df7831 100644
--- a/src/aarch64/registers-aarch64.cc
+++ b/src/aarch64/registers-aarch64.cc
@@ -153,7 +153,8 @@
V(2, S) \
V(4, S) \
V(1, D) \
- V(2, D)
+ V(2, D) \
+ V(1, Q)
#define VIXL_DEFINE_CPUREG_NEON_COERCION(LANES, LANE_TYPE) \
VRegister VRegister::V##LANES##LANE_TYPE() const { \
VIXL_ASSERT(IsVRegister()); \
diff --git a/src/aarch64/registers-aarch64.h b/src/aarch64/registers-aarch64.h
index 7175c65..53bbe13 100644
--- a/src/aarch64/registers-aarch64.h
+++ b/src/aarch64/registers-aarch64.h
@@ -575,6 +575,7 @@
VRegister V4S() const;
VRegister V1D() const;
VRegister V2D() const;
+ VRegister V1Q() const;
VRegister S4B() const;
bool IsValid() const { return IsValidVRegister(); }
diff --git a/src/aarch64/simulator-aarch64.cc b/src/aarch64/simulator-aarch64.cc
index 32f04c3..17f0916 100644
--- a/src/aarch64/simulator-aarch64.cc
+++ b/src/aarch64/simulator-aarch64.cc
@@ -997,6 +997,19 @@
return std::make_pair(sum_hi.first, sum_lo.first);
}
+vixl_uint128_t Simulator::Lsl128(vixl_uint128_t x, unsigned shift) const {
+ VIXL_ASSERT(shift <= 64);
+ if (shift == 0) return x;
+ if (shift == 64) return std::make_pair(x.second, 0);
+ uint64_t lo = x.second << shift;
+ uint64_t hi = (x.first << shift) | (x.second >> (64 - shift));
+ return std::make_pair(hi, lo);
+}
+
+vixl_uint128_t Simulator::Eor128(vixl_uint128_t x, vixl_uint128_t y) const {
+ return std::make_pair(x.first ^ y.first, x.second ^ y.second);
+}
+
vixl_uint128_t Simulator::Neg128(vixl_uint128_t x) {
// Negate the integer value. Throw an assertion when the input is INT128_MIN.
VIXL_ASSERT((x.first != GetSignMask(64)) || (x.second != 0));
@@ -1035,6 +1048,20 @@
: result;
}
+vixl_uint128_t Simulator::PolynomialMult128(uint64_t op1,
+ uint64_t op2,
+ int lane_size_in_bits) const {
+ VIXL_ASSERT(static_cast<unsigned>(lane_size_in_bits) <= kDRegSize);
+ vixl_uint128_t result = std::make_pair(0, 0);
+ vixl_uint128_t op2q = std::make_pair(0, op2);
+ for (int i = 0; i < lane_size_in_bits; i++) {
+ if ((op1 >> i) & 1) {
+ result = Eor128(result, Lsl128(op2q, i));
+ }
+ }
+ return result;
+}
+
int64_t Simulator::ShiftOperand(unsigned reg_size,
uint64_t uvalue,
Shift shift_type,
@@ -7800,13 +7827,24 @@
SimVRegister& rd = ReadVRegister(instr->GetRd());
SimVRegister& rn = ReadVRegister(instr->GetRn());
SimVRegister& rm = ReadVRegister(instr->GetRm());
+ int size = instr->GetNEONSize();
switch (instr->Mask(NEON3DifferentMask)) {
case NEON_PMULL:
- pmull(vf_l, rd, rn, rm);
+ if ((size == 1) || (size == 2)) { // S/D reserved.
+ VisitUnallocated(instr);
+ } else {
+ if (size == 3) vf_l = kFormat1Q;
+ pmull(vf_l, rd, rn, rm);
+ }
break;
case NEON_PMULL2:
- pmull2(vf_l, rd, rn, rm);
+ if ((size == 1) || (size == 2)) { // S/D reserved.
+ VisitUnallocated(instr);
+ } else {
+ if (size == 3) vf_l = kFormat1Q;
+ pmull2(vf_l, rd, rn, rm);
+ }
break;
case NEON_UADDL:
uaddl(vf_l, rd, rn, rm);
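
Aside (not part of the patch): PolynomialMult128 implements a carry-less
(GF(2)[x]) multiply, XOR-accumulating a shifted copy of one operand for every
set bit of the other, exactly as the old 64-bit helper did but with a 128-bit
accumulator. A standalone sketch with a small check:

#include <cassert>
#include <cstdint>
#include <utility>

using u128 = std::pair<uint64_t, uint64_t>;  // {high, low}, as in the simulator

// Carry-less multiply of two 64-bit operands into a 128-bit result.
u128 ClMul64(uint64_t op1, uint64_t op2) {
  u128 result = {0, 0};
  for (int i = 0; i < 64; i++) {
    if ((op1 >> i) & 1) {
      uint64_t lo = op2 << i;
      uint64_t hi = (i == 0) ? 0 : (op2 >> (64 - i));  // avoid shifting by 64
      result = {result.first ^ hi, result.second ^ lo};
    }
  }
  return result;
}

int main() {
  // In GF(2)[x], (x + 1) * (x + 1) = x^2 + 1: 0b11 times 0b11 is 0b101.
  assert(ClMul64(3, 3) == (u128{0, 5}));
}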
diff --git a/src/aarch64/simulator-aarch64.h b/src/aarch64/simulator-aarch64.h
index 5b71fc1..77b00a6 100644
--- a/src/aarch64/simulator-aarch64.h
+++ b/src/aarch64/simulator-aarch64.h
@@ -729,6 +729,8 @@
SimPRegister& register_;
};
+using vixl_uint128_t = std::pair<uint64_t, uint64_t>;
+
// Representation of a vector register, with typed getters and setters for lanes
// and additional information to represent lane state.
class LogicVRegister {
@@ -857,6 +859,17 @@
}
}
+ void SetUint(VectorFormat vform, int index, vixl_uint128_t value) const {
+ if (LaneSizeInBitsFromFormat(vform) <= 64) {
+ SetUint(vform, index, value.second);
+ return;
+ }
+ // TODO: Extend this to SVE.
+ VIXL_ASSERT((vform == kFormat1Q) && (index == 0));
+ SetUint(kFormat2D, 0, value.second);
+ SetUint(kFormat2D, 1, value.first);
+ }
+
void SetUintArray(VectorFormat vform, const uint64_t* src) const {
ClearForWrite(vform);
for (int i = 0; i < LaneCountFromFormat(vform); i++) {
@@ -3279,8 +3292,9 @@
uint64_t left,
uint64_t right,
int carry_in);
- using vixl_uint128_t = std::pair<uint64_t, uint64_t>;
vixl_uint128_t Add128(vixl_uint128_t x, vixl_uint128_t y);
+ vixl_uint128_t Lsl128(vixl_uint128_t x, unsigned shift) const;
+ vixl_uint128_t Eor128(vixl_uint128_t x, vixl_uint128_t y) const;
vixl_uint128_t Mul64(uint64_t x, uint64_t y);
vixl_uint128_t Neg128(vixl_uint128_t x);
void LogicalHelper(const Instruction* instr, int64_t op2);
@@ -3362,6 +3376,9 @@
uint64_t PolynomialMult(uint64_t op1,
uint64_t op2,
int lane_size_in_bits) const;
+ vixl_uint128_t PolynomialMult128(uint64_t op1,
+ uint64_t op2,
+ int lane_size_in_bits) const;
bool ld1(VectorFormat vform, LogicVRegister dst, uint64_t addr);
bool ld1(VectorFormat vform, LogicVRegister dst, int index, uint64_t addr);
diff --git a/test/aarch64/test-assembler-neon-aarch64.cc b/test/aarch64/test-assembler-neon-aarch64.cc
index 1682d13..35f768a 100644
--- a/test/aarch64/test-assembler-neon-aarch64.cc
+++ b/test/aarch64/test-assembler-neon-aarch64.cc
@@ -10975,6 +10975,24 @@
}
}
+TEST(neon_pmull_regression_test) {
+ SETUP_WITH_FEATURES(CPUFeatures::kNEON);
+
+ START();
+ __ Movi(v0.V2D(), 0xdecafc0ffee);
+ __ Pmull(v0.V8H(), v0.V8B(), v0.V8B());
+
+ __ Movi(v1.V2D(), 0xaaaaaaaa55555555);
+ __ Pmull2(v1.V8H(), v1.V16B(), v1.V16B());
+ END();
+
+ if (CAN_RUN()) {
+ RUN();
+ ASSERT_EQUAL_128(0x0000000000515450, 0x4455500055555454, q0);
+ ASSERT_EQUAL_128(0x4444444444444444, 0x1111111111111111, q1);
+ }
+}
+
TEST(zero_high_b) {
SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON, CPUFeatures::kRDM);
START();
diff --git a/test/aarch64/test-cpu-features-aarch64.cc b/test/aarch64/test-cpu-features-aarch64.cc
index 4a82127..187bbd5 100644
--- a/test/aarch64/test-cpu-features-aarch64.cc
+++ b/test/aarch64/test-cpu-features-aarch64.cc
@@ -3778,5 +3778,12 @@
TEST_FP_FCMA_NEON_NEONHALF(fcmla_2, fcmla(v0.V4H(), v1.V4H(), v2.V4H(), 180))
TEST_FP_FCMA_NEON_NEONHALF(fcmla_3, fcmla(v0.V8H(), v1.V8H(), v2.V8H(), 0))
+#define TEST_FEAT(NAME, ASM) \
+ TEST_TEMPLATE(CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kPmull1Q), \
+ NEON_Pmull1Q_##NAME, \
+ ASM)
+TEST_FEAT(pmull1q_0, pmull(v5.V1Q(), v6.V1D(), v7.V1D()))
+#undef TEST_FEAT
+
} // namespace aarch64
} // namespace vixl
diff --git a/test/aarch64/test-disasm-neon-aarch64.cc b/test/aarch64/test-disasm-neon-aarch64.cc
index 14dd18a..0b37794 100644
--- a/test/aarch64/test-disasm-neon-aarch64.cc
+++ b/test/aarch64/test-disasm-neon-aarch64.cc
@@ -2904,6 +2904,10 @@
"pmull v0.8h, v1.8b, v2.8b");
COMPARE_MACRO(Pmull2(v2.V8H(), v3.V16B(), v4.V16B()),
"pmull2 v2.8h, v3.16b, v4.16b");
+ COMPARE_MACRO(Pmull(v5.V1Q(), v6.V1D(), v7.V1D()),
+ "pmull v5.1q, v6.1d, v7.1d");
+ COMPARE_MACRO(Pmull2(v8.V1Q(), v9.V2D(), v10.V2D()),
+ "pmull2 v8.1q, v9.2d, v10.2d");
CLEANUP();
}
@@ -4562,8 +4566,6 @@
COMPARE_PREFIX(dci(0x2efb9dbd), "unallocated"); // pmul v.und, v.und, v.und
COMPARE_PREFIX(dci(0x4eace101), "unallocated"); // pmull v.d, v.s, v.s
COMPARE_PREFIX(dci(0x0e6de3ad), "unallocated"); // pmull v.s, v.h, v.h
- COMPARE_PREFIX(dci(0x4ee3e2c0), "unallocated"); // pmull v.und, v.d, v.d
- COMPARE_PREFIX(dci(0x0eede060), "unallocated"); // pmull v.und, v.und, v.und
COMPARE_PREFIX(dci(0x6ee00afd), "unallocated"); // rev v.d, v.d
COMPARE_PREFIX(dci(0x4e601975), "unallocated"); // rev v.h, v.h
COMPARE_PREFIX(dci(0x4ea019f3), "unallocated"); // rev v.s, v.s
diff --git a/test/aarch64/test-simulator-aarch64.cc b/test/aarch64/test-simulator-aarch64.cc
index 1e2feaf..5318f65 100644
--- a/test/aarch64/test-simulator-aarch64.cc
+++ b/test/aarch64/test-simulator-aarch64.cc
@@ -5188,7 +5188,8 @@
// next instruction, after this handler.
uc->uc_mcontext.gregs[REG_RIP] = sim->GetSignalReturnAddress();
// Return that the memory read failed.
- uc->uc_mcontext.gregs[REG_RAX] = static_cast<greg_t>(MemoryReadResult::Failure);
+ uc->uc_mcontext.gregs[REG_RAX] =
+ static_cast<greg_t>(MemoryReadResult::Failure);
}
TEST(ImplicitCheck) {
diff --git a/test/aarch64/test-simulator-sve-aarch64.cc b/test/aarch64/test-simulator-sve-aarch64.cc
index 51f7d82..1ba7783 100644
--- a/test/aarch64/test-simulator-sve-aarch64.cc
+++ b/test/aarch64/test-simulator-sve-aarch64.cc
@@ -267,5 +267,132 @@
}
}
+// Below here, there are tests for Neon instructions. As this style of test
+// checks the entire register state, it also requires SVE features.
+
+TEST_SVE(neon_pmull) {
+ SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
+ CPUFeatures::kNEON,
+ CPUFeatures::kCRC32,
+ CPUFeatures::kPmull1Q);
+ START();
+
+ SetInitialMachineState(&masm);
+ // state = 0xe2bd2480
+
+ {
+ ExactAssemblyScope scope(&masm, 40 * kInstructionSize);
+ __ dci(0x4e20e000); // pmull2 v0.8h, v0.16b, v0.16b
+ // vl128 state = 0x5eba4d4f
+ __ dci(0x4e20e228); // pmull2 v8.8h, v17.16b, v0.16b
+ // vl128 state = 0x86bceb87
+ __ dci(0x4ee0e22a); // pmull2 v10.1q, v17.2d, v0.2d
+ // vl128 state = 0x1332fe02
+ __ dci(0x0ee8e222); // pmull v2.1q, v17.1d, v8.1d
+ // vl128 state = 0xd357dc7b
+ __ dci(0x4eece226); // pmull2 v6.1q, v17.2d, v12.2d
+ // vl128 state = 0xdff409ad
+ __ dci(0x0eece276); // pmull v22.1q, v19.1d, v12.1d
+ // vl128 state = 0xd8af1dc6
+ __ dci(0x0eede232); // pmull v18.1q, v17.1d, v13.1d
+ // vl128 state = 0x41e6ed0e
+ __ dci(0x0efde216); // pmull v22.1q, v16.1d, v29.1d
+ // vl128 state = 0x1f10365f
+ __ dci(0x0effe23e); // pmull v30.1q, v17.1d, v31.1d
+ // vl128 state = 0x9779ece5
+ __ dci(0x0ee7e23f); // pmull v31.1q, v17.1d, v7.1d
+ // vl128 state = 0x11fc8ce9
+ __ dci(0x0ee2e23e); // pmull v30.1q, v17.1d, v2.1d
+ // vl128 state = 0x101d5a6f
+ __ dci(0x0ee2e23c); // pmull v28.1q, v17.1d, v2.1d
+ // vl128 state = 0xcc4fe26e
+ __ dci(0x0eeae27d); // pmull v29.1q, v19.1d, v10.1d
+ // vl128 state = 0xc84be9f4
+ __ dci(0x4eeae24d); // pmull2 v13.1q, v18.2d, v10.2d
+ // vl128 state = 0x2fc540b4
+ __ dci(0x4eeae25d); // pmull2 v29.1q, v18.2d, v10.2d
+ // vl128 state = 0x1b2d99cd
+ __ dci(0x4eeae2ed); // pmull2 v13.1q, v23.2d, v10.2d
+ // vl128 state = 0x8a278b95
+ __ dci(0x4eeae2e9); // pmull2 v9.1q, v23.2d, v10.2d
+ // vl128 state = 0x3359b4c8
+ __ dci(0x4efee2e8); // pmull2 v8.1q, v23.2d, v30.2d
+ // vl128 state = 0x5c25ed31
+ __ dci(0x4effe3e0); // pmull2 v0.1q, v31.2d, v31.2d
+ // vl128 state = 0x28ff67d1
+ __ dci(0x4eefe3d0); // pmull2 v16.1q, v30.2d, v15.2d
+ // vl128 state = 0x1543436d
+ __ dci(0x4ee7e2d1); // pmull2 v17.1q, v22.2d, v7.2d
+ // vl128 state = 0x71b8bc90
+ __ dci(0x4eefe3d5); // pmull2 v21.1q, v30.2d, v15.2d
+ // vl128 state = 0x3d35ca02
+ __ dci(0x4eefe314); // pmull2 v20.1q, v24.2d, v15.2d
+ // vl128 state = 0x40e8fade
+ __ dci(0x4eefe310); // pmull2 v16.1q, v24.2d, v15.2d
+ // vl128 state = 0xb8affb87
+ __ dci(0x4eefe300); // pmull2 v0.1q, v24.2d, v15.2d
+ // vl128 state = 0x4824ee5c
+ __ dci(0x4eede350); // pmull2 v16.1q, v26.2d, v13.2d
+ // vl128 state = 0x39202868
+ __ dci(0x4ee7e354); // pmull2 v20.1q, v26.2d, v7.2d
+ // vl128 state = 0xc8fde340
+ __ dci(0x4e27e356); // pmull2 v22.8h, v26.16b, v7.16b
+ // vl128 state = 0x0f02316b
+ __ dci(0x4e37e15e); // pmull2 v30.8h, v10.16b, v23.16b
+ // vl128 state = 0xced4f8bd
+ __ dci(0x4e33e05f); // pmull2 v31.8h, v2.16b, v19.16b
+ // vl128 state = 0x0c76bdb3
+ __ dci(0x0e23e05e); // pmull v30.8h, v2.8b, v3.8b
+ // vl128 state = 0x0e36962b
+ __ dci(0x4e23e25f); // pmull2 v31.8h, v18.16b, v3.16b
+ // vl128 state = 0x11a8dcc3
+ __ dci(0x4e23e25b); // pmull2 v27.8h, v18.16b, v3.16b
+ // vl128 state = 0xf01bfe16
+ __ dci(0x4e23e259); // pmull2 v25.8h, v18.16b, v3.16b
+ // vl128 state = 0xea351afe
+ __ dci(0x4e22e2c9); // pmull2 v9.8h, v22.16b, v2.16b
+ // vl128 state = 0x16e933ef
+ __ dci(0x4e3ae2c8); // pmull2 v8.8h, v22.16b, v26.16b
+ // vl128 state = 0x02528a2a
+ __ dci(0x4e32e249); // pmull2 v9.8h, v18.16b, v18.16b
+ // vl128 state = 0xe7e20633
+ __ dci(0x4e36e20d); // pmull2 v13.8h, v16.16b, v22.16b
+ // vl128 state = 0x6f231732
+ __ dci(0x4e36e205); // pmull2 v5.8h, v16.16b, v22.16b
+ // vl128 state = 0x423eb7ea
+ __ dci(0x4e22e20d); // pmull2 v13.8h, v16.16b, v2.16b
+ // vl128 state = 0xfc0d1c14
+ }
+
+ uint32_t state;
+ ComputeMachineStateHash(&masm, &state);
+ __ Mov(x0, reinterpret_cast<uint64_t>(&state));
+ __ Ldr(w0, MemOperand(x0));
+
+ END();
+ if (CAN_RUN()) {
+ RUN();
+ uint32_t expected_hashes[] = {
+ 0xfc0d1c14,
+ 0x4cb040a3,
+ 0x4b913ebe,
+ 0xfa35b836,
+ 0x78745d20,
+ 0x6666b09a,
+ 0xee2868f4,
+ 0x1936a795,
+ 0x1025244a,
+ 0xe8551950,
+ 0xae73af02,
+ 0x0fdd5fc7,
+ 0x22e9827b,
+ 0x384ce1ac,
+ 0xc833cbeb,
+ 0x255baab5,
+ };
+ ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0);
+ }
+}
+
} // namespace aarch64
} // namespace vixl