[sve] Support PMULLB/T for Q destination elements (#126)
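Extend the SVE2 PMULLB and PMULLT instructions to support Q-sized destination
elements (with D-sized source elements) when the SVEPmull128 CPU feature is
available. The related S-destination encodings are now disassembled as
unallocated.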
diff --git a/src/aarch64/assembler-sve-aarch64.cc b/src/aarch64/assembler-sve-aarch64.cc
index e99cfdc..0c3c7f8 100644
--- a/src/aarch64/assembler-sve-aarch64.cc
+++ b/src/aarch64/assembler-sve-aarch64.cc
@@ -7410,13 +7410,13 @@
// size<23:22> | Zm<20:16> | op<12> | U<11> | T<10> | Zn<9:5> | Zd<4:0>
VIXL_ASSERT(CPUHas(CPUFeatures::kSVE2));
+ VIXL_ASSERT(CPUHas(CPUFeatures::kSVEPmull128) || !zd.IsLaneSizeQ());
VIXL_ASSERT(AreSameLaneSize(zn, zm));
VIXL_ASSERT(!zd.IsLaneSizeB() && !zd.IsLaneSizeS());
VIXL_ASSERT(zd.GetLaneSizeInBytes() == zn.GetLaneSizeInBytes() * 2);
- // SVEPmull128 is not supported
- VIXL_ASSERT(!zd.IsLaneSizeQ());
+ Instr size = zd.IsLaneSizeQ() ? 0 : SVESize(zd);
- Emit(0x45006800 | SVESize(zd) | Rd(zd) | Rn(zn) | Rm(zm));
+ Emit(0x45006800 | size | Rd(zd) | Rn(zn) | Rm(zm));
}
void Assembler::pmullt(const ZRegister& zd,
@@ -7427,13 +7427,13 @@
// size<23:22> | Zm<20:16> | op<12> | U<11> | T<10> | Zn<9:5> | Zd<4:0>
VIXL_ASSERT(CPUHas(CPUFeatures::kSVE2));
+ VIXL_ASSERT(CPUHas(CPUFeatures::kSVEPmull128) || !zd.IsLaneSizeQ());
VIXL_ASSERT(AreSameLaneSize(zn, zm));
VIXL_ASSERT(!zd.IsLaneSizeB() && !zd.IsLaneSizeS());
VIXL_ASSERT(zd.GetLaneSizeInBytes() == zn.GetLaneSizeInBytes() * 2);
- // SVEPmull128 is not supported
- VIXL_ASSERT(!zd.IsLaneSizeQ());
+ Instr size = zd.IsLaneSizeQ() ? 0 : SVESize(zd);
- Emit(0x45006c00 | SVESize(zd) | Rd(zd) | Rn(zn) | Rm(zm));
+ Emit(0x45006c00 | size | Rd(zd) | Rn(zn) | Rm(zm));
}
void Assembler::raddhnb(const ZRegister& zd,
diff --git a/src/aarch64/cpu-features-auditor-aarch64.cc b/src/aarch64/cpu-features-auditor-aarch64.cc
index 407ff98..972bf03 100644
--- a/src/aarch64/cpu-features-auditor-aarch64.cc
+++ b/src/aarch64/cpu-features-auditor-aarch64.cc
@@ -1882,6 +1882,10 @@
CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA512)},
{"sha512su1_vvv2_cryptosha512_3"_h,
CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA512)},
+ {"pmullb_z_zz_q"_h,
+ CPUFeatures(CPUFeatures::kSVE2, CPUFeatures::kSVEPmull128)},
+ {"pmullt_z_zz_q"_h,
+ CPUFeatures(CPUFeatures::kSVE2, CPUFeatures::kSVEPmull128)},
};
if (features.count(form_hash_) > 0) {
diff --git a/src/aarch64/disasm-aarch64.cc b/src/aarch64/disasm-aarch64.cc
index cc42709..7e1ab46 100644
--- a/src/aarch64/disasm-aarch64.cc
+++ b/src/aarch64/disasm-aarch64.cc
@@ -418,8 +418,8 @@
{"nbsl_z_zzz"_h, &Disassembler::DisassembleSVEBitwiseTernary},
{"nmatch_p_p_zz"_h, &Disassembler::Disassemble_PdT_PgZ_ZnT_ZmT},
{"pmul_z_zz"_h, &Disassembler::Disassemble_ZdB_ZnB_ZmB},
- {"pmullb_z_zz"_h, &Disassembler::Disassemble_ZdT_ZnTb_ZmTb},
- {"pmullt_z_zz"_h, &Disassembler::Disassemble_ZdT_ZnTb_ZmTb},
+ {"pmullb_z_zz"_h, &Disassembler::DisassembleSVEPmull},
+ {"pmullt_z_zz"_h, &Disassembler::DisassembleSVEPmull},
{"raddhnb_z_zz"_h, &Disassembler::DisassembleSVEAddSubHigh},
{"raddhnt_z_zz"_h, &Disassembler::DisassembleSVEAddSubHigh},
{"rax1_z_zz"_h, &Disassembler::Disassemble_ZdD_ZnD_ZmD},
@@ -761,6 +761,8 @@
{"sha512h_qqv_cryptosha512_3"_h, &Disassembler::DisassembleSHA512},
{"sha512su0_vv2_cryptosha512_2"_h, &Disassembler::DisassembleSHA512},
{"sha512su1_vvv2_cryptosha512_3"_h, &Disassembler::DisassembleSHA512},
+ {"pmullb_z_zz_q"_h, &Disassembler::DisassembleSVEPmull128},
+ {"pmullt_z_zz_q"_h, &Disassembler::DisassembleSVEPmull128},
};
return &form_to_visitor;
} // NOLINT(readability/fn_size)
@@ -5852,15 +5854,26 @@
}
}
+void Disassembler::DisassembleSVEPmull(const Instruction *instr) {
+ if (instr->GetSVEVectorFormat() == kFormatVnS) {
+ VisitUnallocated(instr);
+ } else {
+ Disassemble_ZdT_ZnTb_ZmTb(instr);
+ }
+}
+
+void Disassembler::DisassembleSVEPmull128(const Instruction *instr) {
+ FormatWithDecodedMnemonic(instr, "'Zd.q, 'Zn.d, 'Zm.d");
+}
+
void Disassembler::Disassemble_ZdT_ZnTb_ZmTb(const Instruction *instr) {
- const char *form = "'Zd.'t, 'Zn.'th, 'Zm.'th";
if (instr->GetSVEVectorFormat() == kFormatVnB) {
// TODO: This is correct for saddlbt, ssublbt, subltb, which don't have
- // b-lane sized form, and for pmull[b|t] as feature `SVEPmull128` isn't
- // supported, but may need changes for other instructions reaching here.
+ // b-lane sized form, but may need changes for other instructions reaching
+ // here.
Format(instr, "unimplemented", "(ZdT_ZnTb_ZmTb)");
} else {
- Format(instr, mnemonic_.c_str(), form);
+ FormatWithDecodedMnemonic(instr, "'Zd.'t, 'Zn.'th, 'Zm.'th");
}
}
diff --git a/src/aarch64/disasm-aarch64.h b/src/aarch64/disasm-aarch64.h
index 8f028b5..9470565 100644
--- a/src/aarch64/disasm-aarch64.h
+++ b/src/aarch64/disasm-aarch64.h
@@ -196,6 +196,8 @@
void DisassembleSVEBitwiseTernary(const Instruction* instr);
void DisassembleSVEFlogb(const Instruction* instr);
void DisassembleSVEFPPair(const Instruction* instr);
+ void DisassembleSVEPmull(const Instruction* instr);
+ void DisassembleSVEPmull128(const Instruction* instr);
void DisassembleNoArgs(const Instruction* instr);
diff --git a/src/aarch64/instructions-aarch64.cc b/src/aarch64/instructions-aarch64.cc
index a37be34..adef87f 100644
--- a/src/aarch64/instructions-aarch64.cc
+++ b/src/aarch64/instructions-aarch64.cc
@@ -1047,6 +1047,8 @@
return kFormatVnH;
case kFormatVnD:
return kFormatVnS;
+ case kFormatVnQ:
+ return kFormatVnD;
default:
VIXL_UNREACHABLE();
return kFormatUndefined;
diff --git a/src/aarch64/simulator-aarch64.cc b/src/aarch64/simulator-aarch64.cc
index 1a2959b..2130c46 100644
--- a/src/aarch64/simulator-aarch64.cc
+++ b/src/aarch64/simulator-aarch64.cc
@@ -530,6 +530,8 @@
{"sha512h2_qqv_cryptosha512_3"_h, &Simulator::SimulateSHA512},
{"sha512su0_vv2_cryptosha512_2"_h, &Simulator::SimulateSHA512},
{"sha512su1_vvv2_cryptosha512_3"_h, &Simulator::SimulateSHA512},
+ {"pmullb_z_zz_q"_h, &Simulator::SimulateSVEPmull128},
+ {"pmullt_z_zz_q"_h, &Simulator::SimulateSVEPmull128},
};
return &form_to_visitor;
}
@@ -2909,6 +2911,23 @@
}
}
+void Simulator::SimulateSVEPmull128(const Instruction* instr) {
+ SimVRegister& zd = ReadVRegister(instr->GetRd());
+ SimVRegister& zm = ReadVRegister(instr->GetRm());
+ SimVRegister& zn = ReadVRegister(instr->GetRn());
+ SimVRegister zn_temp, zm_temp;
+
+ if (form_hash_ == "pmullb_z_zz_q"_h) {
+ pack_even_elements(kFormatVnD, zn_temp, zn);
+ pack_even_elements(kFormatVnD, zm_temp, zm);
+ } else {
+ VIXL_ASSERT(form_hash_ == "pmullt_z_zz_q"_h);
+ pack_odd_elements(kFormatVnD, zn_temp, zn);
+ pack_odd_elements(kFormatVnD, zm_temp, zm);
+ }
+ pmull(kFormatVnQ, zd, zn_temp, zm_temp);
+}
+
void Simulator::SimulateSVEIntMulLongVec(const Instruction* instr) {
VectorFormat vform = instr->GetSVEVectorFormat();
SimVRegister& zd = ReadVRegister(instr->GetRd());
@@ -2923,15 +2942,15 @@
switch (form_hash_) {
case "pmullb_z_zz"_h:
- // '00' is reserved for Q-sized lane.
- if (vform == kFormatVnB) {
+ // Size '10' is undefined.
+ if (vform == kFormatVnS) {
VIXL_UNIMPLEMENTED();
}
pmull(vform, zd, zn_b, zm_b);
break;
case "pmullt_z_zz"_h:
- // '00' is reserved for Q-sized lane.
- if (vform == kFormatVnB) {
+ // Size '10' is undefined.
+ if (vform == kFormatVnS) {
VIXL_UNIMPLEMENTED();
}
pmull(vform, zd, zn_t, zm_t);
diff --git a/src/aarch64/simulator-aarch64.h b/src/aarch64/simulator-aarch64.h
index 632b8ed..cbb1c4c 100644
--- a/src/aarch64/simulator-aarch64.h
+++ b/src/aarch64/simulator-aarch64.h
@@ -872,10 +872,9 @@
SetUint(vform, index, value.second);
return;
}
- // TODO: Extend this to SVE.
- VIXL_ASSERT((vform == kFormat1Q) && (index == 0));
- SetUint(kFormat2D, 0, value.second);
- SetUint(kFormat2D, 1, value.first);
+ VIXL_ASSERT((vform == kFormat1Q) || (vform == kFormatVnQ));
+ SetUint(kFormatVnD, 2 * index, value.second);
+ SetUint(kFormatVnD, 2 * index + 1, value.first);
}
void SetUintArray(VectorFormat vform, const uint64_t* src) const {
@@ -1504,6 +1503,7 @@
void SimulateSVESaturatingMulAddHigh(const Instruction* instr);
void SimulateSVESaturatingMulHighIndex(const Instruction* instr);
void SimulateSVEFPConvertLong(const Instruction* instr);
+ void SimulateSVEPmull128(const Instruction* instr);
void SimulateMatrixMul(const Instruction* instr);
void SimulateSVEFPMatrixMul(const Instruction* instr);
void SimulateNEONMulByElementLong(const Instruction* instr);
diff --git a/test/aarch64/test-cpu-features-aarch64.cc b/test/aarch64/test-cpu-features-aarch64.cc
index 5b5e603..c018f49 100644
--- a/test/aarch64/test-cpu-features-aarch64.cc
+++ b/test/aarch64/test-cpu-features-aarch64.cc
@@ -3856,5 +3856,13 @@
TEST_FEAT(sm4ekey, sm4ekey(v12.V4S(), v13.V4S(), v14.V4S()))
#undef TEST_FEAT
+#define TEST_FEAT(NAME, ASM) \
+ TEST_TEMPLATE(CPUFeatures(CPUFeatures::kSVE2, CPUFeatures::kSVEPmull128), \
+ SVE_PMULL128_##NAME, \
+ ASM)
+TEST_FEAT(pmullb, pmullb(z12.VnQ(), z21.VnD(), z12.VnD()))
+TEST_FEAT(pmullt, pmullt(z12.VnQ(), z21.VnD(), z12.VnD()))
+#undef TEST_FEAT
+
} // namespace aarch64
} // namespace vixl
diff --git a/test/aarch64/test-disasm-sve-aarch64.cc b/test/aarch64/test-disasm-sve-aarch64.cc
index 5e001e7..fbdff33 100644
--- a/test/aarch64/test-disasm-sve-aarch64.cc
+++ b/test/aarch64/test-disasm-sve-aarch64.cc
@@ -7673,13 +7673,14 @@
COMPARE(sqdmullt(z7.VnD(), z4.VnS(), z0.VnS(), 0),
"sqdmullt z7.d, z4.s, z0.s[0]");
- // Feature `SVEPmull128` is not supported.
- // COMPARE(pmullb(z12.VnQ(), z21.VnD(), z12.VnD()),
- // "pmullb z12.q, z21.d, z12.d");
COMPARE(pmullb(z12.VnH(), z21.VnB(), z12.VnB()),
"pmullb z12.h, z21.b, z12.b");
COMPARE(pmullt(z31.VnD(), z30.VnS(), z26.VnS()),
"pmullt z31.d, z30.s, z26.s");
+ COMPARE(pmullb(z12.VnQ(), z21.VnD(), z12.VnD()),
+ "pmullb z12.q, z21.d, z12.d");
+ COMPARE(pmullt(z12.VnQ(), z21.VnD(), z12.VnD()),
+ "pmullt z12.q, z21.d, z12.d");
COMPARE(smullb(z10.VnD(), z4.VnS(), z4.VnS()), "smullb z10.d, z4.s, z4.s");
COMPARE(smullb(z11.VnH(), z14.VnB(), z14.VnB()),
@@ -7701,6 +7702,10 @@
COMPARE(umullt(z24.VnH(), z7.VnB(), z16.VnB()), "umullt z24.h, z7.b, z16.b");
COMPARE(umullt(z24.VnS(), z8.VnH(), z26.VnH()), "umullt z24.s, z8.h, z26.h");
+ // Check related but undefined encodings.
+ COMPARE(dci(0x45806800), "unallocated (Unallocated)"); // pmullb s, h, h
+ COMPARE(dci(0x45806c00), "unallocated (Unallocated)"); // pmullt s, h, h
+
CLEANUP();
}
diff --git a/test/aarch64/test-simulator-sve2-aarch64.cc b/test/aarch64/test-simulator-sve2-aarch64.cc
index a7c0f40..621754d 100644
--- a/test/aarch64/test-simulator-sve2-aarch64.cc
+++ b/test/aarch64/test-simulator-sve2-aarch64.cc
@@ -9117,5 +9117,130 @@
}
}
+TEST_SVE(sve2_pmull128) {
+ SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
+ CPUFeatures::kSVE2,
+ CPUFeatures::kNEON,
+ CPUFeatures::kCRC32,
+ CPUFeatures::kSVEPmull128);
+ START();
+
+ SetInitialMachineState(&masm);
+ // state = 0xe2bd2480
+
+ {
+ ExactAssemblyScope scope(&masm, 40 * kInstructionSize);
+ __ dci(0x45006800); // pmullb z0.q, z0.d, z0.d
+ // vl128 state = 0x4107ca0c
+ __ dci(0x45006a28); // pmullb z8.q, z17.d, z0.d
+ // vl128 state = 0xa87d231a
+ __ dci(0x45016a6c); // pmullb z12.q, z19.d, z1.d
+ // vl128 state = 0xc547fcf6
+ __ dci(0x45116e68); // pmullt z8.q, z19.d, z17.d
+ // vl128 state = 0x6a01d521
+ __ dci(0x45106a69); // pmullb z9.q, z19.d, z16.d
+ // vl128 state = 0x64a7ba8a
+ __ dci(0x45006a4d); // pmullb z13.q, z18.d, z0.d
+ // vl128 state = 0xe59e3f8e
+ __ dci(0x45086e5d); // pmullt z29.q, z18.d, z8.d
+ // vl128 state = 0xbfbb9316
+ __ dci(0x450a6e75); // pmullt z21.q, z19.d, z10.d
+ // vl128 state = 0x29f6a4c7
+ __ dci(0x45126e74); // pmullt z20.q, z19.d, z18.d
+ // vl128 state = 0x4ced9406
+ __ dci(0x45176e75); // pmullt z21.q, z19.d, z23.d
+ // vl128 state = 0xd09e5676
+ __ dci(0x45176e77); // pmullt z23.q, z19.d, z23.d
+ // vl128 state = 0x568c0e25
+ __ dci(0x45176e75); // pmullt z21.q, z19.d, z23.d
+ // vl128 state = 0xb2f13c36
+ __ dci(0x45176b71); // pmullb z17.q, z27.d, z23.d
+ // vl128 state = 0x160bec4f
+ __ dci(0x451f6b30); // pmullb z16.q, z25.d, z31.d
+ // vl128 state = 0x2d7e7f49
+ __ dci(0x451f6b20); // pmullb z0.q, z25.d, z31.d
+ // vl128 state = 0x113d828b
+ __ dci(0x451f6b90); // pmullb z16.q, z28.d, z31.d
+ // vl128 state = 0xb8b3b3d9
+ __ dci(0x451f6f12); // pmullt z18.q, z24.d, z31.d
+ // vl128 state = 0x277aacb8
+ __ dci(0x451f6f16); // pmullt z22.q, z24.d, z31.d
+ // vl128 state = 0xef79c8da
+ __ dci(0x450b6f17); // pmullt z23.q, z24.d, z11.d
+ // vl128 state = 0x1dc19104
+ __ dci(0x450a6e1f); // pmullt z31.q, z16.d, z10.d
+ // vl128 state = 0x3ccb4ea8
+ __ dci(0x451a6e2f); // pmullt z15.q, z17.d, z26.d
+ // vl128 state = 0x14e13481
+ __ dci(0x45126a3f); // pmullb z31.q, z17.d, z18.d
+ // vl128 state = 0x4e6502f9
+ __ dci(0x451a6b3e); // pmullb z30.q, z25.d, z26.d
+ // vl128 state = 0xf6f18478
+ __ dci(0x45126a3a); // pmullb z26.q, z17.d, z18.d
+ // vl128 state = 0xdd4f14fb
+ __ dci(0x45126afb); // pmullb z27.q, z23.d, z18.d
+ // vl128 state = 0xcbf3bee2
+ __ dci(0x45126aff); // pmullb z31.q, z23.d, z18.d
+ // vl128 state = 0x627bec09
+ __ dci(0x45126aef); // pmullb z15.q, z23.d, z18.d
+ // vl128 state = 0xf5de1fa9
+ __ dci(0x45106abf); // pmullb z31.q, z21.d, z16.d
+ // vl128 state = 0x44bb6385
+ __ dci(0x451a6abb); // pmullb z27.q, z21.d, z26.d
+ // vl128 state = 0x5c5fa224
+ __ dci(0x450a68b3); // pmullb z19.q, z5.d, z10.d
+ // vl128 state = 0x28b6085c
+ __ dci(0x450e69b2); // pmullb z18.q, z13.d, z14.d
+ // vl128 state = 0x450898d6
+ __ dci(0x450e69b6); // pmullb z22.q, z13.d, z14.d
+ // vl128 state = 0x79d7911b
+ __ dci(0x450e69b4); // pmullb z20.q, z13.d, z14.d
+ // vl128 state = 0x98bf6939
+ __ dci(0x450f6924); // pmullb z4.q, z9.d, z15.d
+ // vl128 state = 0xb8a1bbc7
+ __ dci(0x45176925); // pmullb z5.q, z9.d, z23.d
+ // vl128 state = 0x631b41c8
+ __ dci(0x451f69a4); // pmullb z4.q, z13.d, z31.d
+ // vl128 state = 0x617fc272
+ __ dci(0x451b69e0); // pmullb z0.q, z15.d, z27.d
+ // vl128 state = 0x77780ac1
+ __ dci(0x451b69e8); // pmullb z8.q, z15.d, z27.d
+ // vl128 state = 0xce5ae18f
+ __ dci(0x450f69e0); // pmullb z0.q, z15.d, z15.d
+ // vl128 state = 0xa037371a
+ __ dci(0x450b6be8); // pmullb z8.q, z31.d, z11.d
+ // vl128 state = 0xb59be233
+ }
+
+ uint32_t state;
+ ComputeMachineStateHash(&masm, &state);
+ __ Mov(x0, reinterpret_cast<uint64_t>(&state));
+ __ Ldr(w0, MemOperand(x0));
+
+ END();
+ if (CAN_RUN()) {
+ RUN();
+ uint32_t expected_hashes[] = {
+ 0xb59be233,
+ 0x32430624,
+ 0x5cc3ec66,
+ 0xecfdffe7,
+ 0x6d77a270,
+ 0xa0d604f2,
+ 0x2178aa11,
+ 0xabdcbeaa,
+ 0xab3b974f,
+ 0x11a874f5,
+ 0xf2eb6131,
+ 0x6d311c6c,
+ 0xd4e99b72,
+ 0x5177ce8e,
+ 0x32aa02f0,
+ 0x681ef977,
+ };
+ ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0);
+ }
+}
+
} // namespace aarch64
} // namespace vixl