aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAlexander Gilday <alexander.gilday@arm.com>2018-04-05 13:25:17 +0100
committerAlexander Gilday <alexander.gilday@arm.com>2018-04-19 09:29:09 +0000
commit560332df277a0e143763e5f5038fbb539c57453b (patch)
tree6a5326476cd1c52e1abcaa78d322261e99dad2a2 /src
parent4e5bad9e4915ba673bfe016dbdced31fe3cb7687 (diff)
Add support for dot product instructions.
Add support for v8.2 signed and unsigned dot product instructions: Sdot and Udot. Change-Id: I19d0ad1ad962e6c2a72efe8f62a4512e430b5e33
Diffstat (limited to 'src')
-rw-r--r--src/aarch64/assembler-aarch64.cc46
-rw-r--r--src/aarch64/assembler-aarch64.h18
-rw-r--r--src/aarch64/constants-aarch64.h8
-rw-r--r--src/aarch64/decoder-aarch64.cc4
-rw-r--r--src/aarch64/disasm-aarch64.cc32
-rw-r--r--src/aarch64/logic-aarch64.cc69
-rw-r--r--src/aarch64/macro-assembler-aarch64.h4
-rw-r--r--src/aarch64/operands-aarch64.h8
-rw-r--r--src/aarch64/simulator-aarch64.cc14
-rw-r--r--src/aarch64/simulator-aarch64.h23
10 files changed, 214 insertions, 12 deletions
diff --git a/src/aarch64/assembler-aarch64.cc b/src/aarch64/assembler-aarch64.cc
index 147bb86d..8525aa72 100644
--- a/src/aarch64/assembler-aarch64.cc
+++ b/src/aarch64/assembler-aarch64.cc
@@ -2871,6 +2871,26 @@ void Assembler::sqrdmlsh(const VRegister& vd,
}
+void Assembler::sdot(const VRegister& vd,  // Signed dot product (SDOT), vector form.
+ const VRegister& vn,
+ const VRegister& vm) {
+ VIXL_ASSERT(AreSameFormat(vn, vm));  // Both sources must have the same format.
+ VIXL_ASSERT((vd.Is2S() && vn.Is8B()) || (vd.Is4S() && vn.Is16B()));  // Only 2S <- 8B and 4S <- 16B are accepted.
+ // The format bits are derived from vd; vm, vn and vd are passed to Rm, Rn and Rd.
+ Emit(VFormat(vd) | NEON_SDOT | Rm(vm) | Rn(vn) | Rd(vd));
+}
+
+
+void Assembler::udot(const VRegister& vd,  // Unsigned dot product (UDOT), vector form.
+ const VRegister& vn,
+ const VRegister& vm) {
+ VIXL_ASSERT(AreSameFormat(vn, vm));  // Both sources must have the same format.
+ VIXL_ASSERT((vd.Is2S() && vn.Is8B()) || (vd.Is4S() && vn.Is16B()));  // Only 2S <- 8B and 4S <- 16B are accepted.
+ // The format bits are derived from vd; vm, vn and vd are passed to Rm, Rn and Rd.
+ Emit(VFormat(vd) | NEON_UDOT | Rm(vm) | Rn(vn) | Rd(vd));
+}
+
+
void Assembler::faddp(const VRegister& vd, const VRegister& vn) {
VIXL_ASSERT((vd.Is1S() && vn.Is2S()) || (vd.Is1D() && vn.Is2D()));
Emit(FPFormat(vd) | NEON_FADDP_scalar | Rn(vn) | Rd(vd));
@@ -3093,6 +3113,32 @@ void Assembler::NEONByElementL(const VRegister& vd,
}
+void Assembler::sdot(const VRegister& vd,  // Signed dot product (SDOT), by-element form.
+ const VRegister& vn,
+ const VRegister& vm,
+ int vm_index) {
+ VIXL_ASSERT((vd.Is2S() && vn.Is8B() && vm.Is1S4B()) ||  // vm must satisfy Is1S4B() in both accepted cases.
+ (vd.Is4S() && vn.Is16B() && vm.Is1S4B()));
+ // NOTE(review): vm_index is not range-checked here; presumably ImmNEONHLM validates it - confirm.
+ int index_num_bits = 2;  // Index width handed to ImmNEONHLM.
+ Emit(VFormat(vd) | NEON_SDOT_byelement |
+ ImmNEONHLM(vm_index, index_num_bits) | Rm(vm) | Rn(vn) | Rd(vd));
+}
+
+
+void Assembler::udot(const VRegister& vd,  // Unsigned dot product (UDOT), by-element form.
+ const VRegister& vn,
+ const VRegister& vm,
+ int vm_index) {
+ VIXL_ASSERT((vd.Is2S() && vn.Is8B() && vm.Is1S4B()) ||  // vm must satisfy Is1S4B() in both accepted cases.
+ (vd.Is4S() && vn.Is16B() && vm.Is1S4B()));
+ // NOTE(review): vm_index is not range-checked here; presumably ImmNEONHLM validates it - confirm.
+ int index_num_bits = 2;  // Index width handed to ImmNEONHLM.
+ Emit(VFormat(vd) | NEON_UDOT_byelement |
+ ImmNEONHLM(vm_index, index_num_bits) | Rm(vm) | Rn(vn) | Rd(vd));
+}
+
+
// clang-format off
#define NEON_BYELEMENT_LIST(V) \
V(mul, NEON_MUL_byelement, vn.IsVector()) \
diff --git a/src/aarch64/assembler-aarch64.h b/src/aarch64/assembler-aarch64.h
index 0c7abcbc..09613bb6 100644
--- a/src/aarch64/assembler-aarch64.h
+++ b/src/aarch64/assembler-aarch64.h
@@ -2479,10 +2479,16 @@ class Assembler : public vixl::internal::AssemblerBase {
// Signed saturating rounding doubling multiply returning high half.
void sqrdmulh(const VRegister& vd, const VRegister& vn, const VRegister& vm);
+ // Signed dot product [Armv8.2].
+ void sdot(const VRegister& vd, const VRegister& vn, const VRegister& vm);
+
// Signed saturating rounding doubling multiply accumulate returning high
// half [Armv8.1].
void sqrdmlah(const VRegister& vd, const VRegister& vn, const VRegister& vm);
+ // Unsigned dot product [Armv8.2].
+ void udot(const VRegister& vd, const VRegister& vn, const VRegister& vm);
+
// Signed saturating rounding doubling multiply subtract returning high half
// [Armv8.1].
void sqrdmlsh(const VRegister& vd, const VRegister& vn, const VRegister& vm);
@@ -2499,6 +2505,12 @@ class Assembler : public vixl::internal::AssemblerBase {
const VRegister& vm,
int vm_index);
+ // Signed dot product by element [Armv8.2].
+ void sdot(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm,
+ int vm_index);
+
// Signed saturating rounding doubling multiply accumulate element returning
// high half [Armv8.1].
void sqrdmlah(const VRegister& vd,
@@ -2506,6 +2518,12 @@ class Assembler : public vixl::internal::AssemblerBase {
const VRegister& vm,
int vm_index);
+ // Unsigned dot product by element [Armv8.2].
+ void udot(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm,
+ int vm_index);
+
// Signed saturating rounding doubling multiply subtract element returning
// high half [Armv8.1].
void sqrdmlsh(const VRegister& vd,
diff --git a/src/aarch64/constants-aarch64.h b/src/aarch64/constants-aarch64.h
index 64a42469..24e1d88a 100644
--- a/src/aarch64/constants-aarch64.h
+++ b/src/aarch64/constants-aarch64.h
@@ -1587,6 +1587,8 @@ enum NEON3SameExtraOp {
NEON3SameExtraMask = 0xBE20FC00,
NEON_SQRDMLAH = NEON3SameExtraFixed | NEON3SameExtraUBit,
NEON_SQRDMLSH = NEON3SameExtraFixed | NEON3SameExtraUBit | 0x00000800,
+ NEON_SDOT = NEON3SameExtraFixed | 0x00001000,
+ NEON_UDOT = NEON3SameExtraFixed | NEON3SameExtraUBit | 0x00001000,
/* v8.3 Complex Numbers */
NEON3SameExtraFCFixed = 0x2E00C400,
@@ -1698,6 +1700,10 @@ enum NEONByIndexedElementOp {
NEON_SQDMLSL_byelement = NEONByIndexedElementFixed | 0x00007000,
NEON_SQDMULH_byelement = NEONByIndexedElementFixed | 0x0000C000,
NEON_SQRDMULH_byelement = NEONByIndexedElementFixed | 0x0000D000,
+ NEON_SDOT_byelement = NEONByIndexedElementFixed | 0x0000E000,
+ NEON_SQRDMLAH_byelement = NEONByIndexedElementFixed | 0x2000D000,
+ NEON_UDOT_byelement = NEONByIndexedElementFixed | 0x2000E000,
+ NEON_SQRDMLSH_byelement = NEONByIndexedElementFixed | 0x2000F000,
// Floating point instructions.
NEONByIndexedElementFPFixed = NEONByIndexedElementFixed | 0x00800000,
@@ -1707,8 +1713,6 @@ enum NEONByIndexedElementOp {
NEON_FMUL_byelement = NEONByIndexedElementFPFixed | 0x00009000,
NEON_FMULX_byelement = NEONByIndexedElementFPFixed | 0x20009000,
NEON_FCMLA_byelement = NEONByIndexedElementFixed | 0x20001000,
- NEON_SQRDMLAH_byelement = NEONByIndexedElementFixed | 0x2000D000,
- NEON_SQRDMLSH_byelement = NEONByIndexedElementFixed | 0x2000F000,
// Complex instruction(s) this is necessary because 'rot' encoding moves into the NEONByIndex..Mask space
NEONByIndexedElementFPComplexMask = 0xBF009400
diff --git a/src/aarch64/decoder-aarch64.cc b/src/aarch64/decoder-aarch64.cc
index fe1cb08c..054fd93e 100644
--- a/src/aarch64/decoder-aarch64.cc
+++ b/src/aarch64/decoder-aarch64.cc
@@ -758,10 +758,8 @@ void Decoder::DecodeNEONVectorDataProcessing(const Instruction* instr) {
VisitUnallocated(instr);
} else if ((instr->ExtractBits(14, 11) <= 0xE &&
instr->ExtractBits(14, 11) >= 0x8) ||
- instr->ExtractBits(14, 12) == 0x0) {
+ instr->ExtractBits(14, 11) <= 0x2) {
VisitNEON3SameExtra(instr);
- } else if (instr->ExtractBits(13, 11) < 0x4) {
- VisitUnimplemented(instr);
} else {
VisitUnallocated(instr);
}
diff --git a/src/aarch64/disasm-aarch64.cc b/src/aarch64/disasm-aarch64.cc
index bc52fb87..98c7fb45 100644
--- a/src/aarch64/disasm-aarch64.cc
+++ b/src/aarch64/disasm-aarch64.cc
@@ -2286,8 +2286,10 @@ void Disassembler::VisitNEON3Same(const Instruction *instr) {
}
void Disassembler::VisitNEON3SameExtra(const Instruction *instr) {
+ static const NEONFormatMap map_usdot = {{30}, {NF_8B, NF_16B}};
+
const char *mnemonic = "unimplemented";
- const char *form = "'Vd.%s, 'Vn.%s, 'Vm.%s, 'IVFCN";
+ const char *form = "(NEON3SameExtra)";
NEONFormatDecoder nfd(instr);
@@ -2301,21 +2303,26 @@ void Disassembler::VisitNEON3SameExtra(const Instruction *instr) {
mnemonic = "fcadd";
form = "'Vd.%s, 'Vn.%s, 'Vm.%s, 'IVFCNA";
break;
- default:
- form = "(NEON3SameExtra)";
}
} else {
+ form = "'Vd.%s, 'Vn.%s, 'Vm.%s";
switch (instr->Mask(NEON3SameExtraMask)) {
+ case NEON_SDOT:
+ mnemonic = "sdot";
+ nfd.SetFormatMap(1, &map_usdot);
+ nfd.SetFormatMap(2, &map_usdot);
+ break;
case NEON_SQRDMLAH:
mnemonic = "sqrdmlah";
- form = "'Vd.%s, 'Vn.%s, 'Vm.%s";
+ break;
+ case NEON_UDOT:
+ mnemonic = "udot";
+ nfd.SetFormatMap(1, &map_usdot);
+ nfd.SetFormatMap(2, &map_usdot);
break;
case NEON_SQRDMLSH:
mnemonic = "sqrdmlsh";
- form = "'Vd.%s, 'Vn.%s, 'Vm.%s";
break;
- default:
- form = "(NEON3SameExtra)";
}
}
@@ -2507,6 +2514,7 @@ void Disassembler::VisitNEONByIndexedElement(const Instruction *instr) {
static const NEONFormatMap map_cn =
{{23, 22, 30},
{NF_UNDEF, NF_UNDEF, NF_4H, NF_8H, NF_UNDEF, NF_4S, NF_UNDEF, NF_UNDEF}};
+ static const NEONFormatMap map_usdot = {{30}, {NF_8B, NF_16B}};
NEONFormatDecoder nfd(instr,
&map_ta,
@@ -2565,9 +2573,19 @@ void Disassembler::VisitNEONByIndexedElement(const Instruction *instr) {
case NEON_SQRDMULH_byelement:
mnemonic = "sqrdmulh";
break;
+ case NEON_SDOT_byelement:
+ mnemonic = "sdot";
+ form = "'Vd.%s, 'Vn.%s, 'Ve.4b['IVByElemIndex]";
+ nfd.SetFormatMap(1, &map_usdot);
+ break;
case NEON_SQRDMLAH_byelement:
mnemonic = "sqrdmlah";
break;
+ case NEON_UDOT_byelement:
+ mnemonic = "udot";
+ form = "'Vd.%s, 'Vn.%s, 'Ve.4b['IVByElemIndex]";
+ nfd.SetFormatMap(1, &map_usdot);
+ break;
case NEON_SQRDMLSH_byelement:
mnemonic = "sqrdmlsh";
break;
diff --git a/src/aarch64/logic-aarch64.cc b/src/aarch64/logic-aarch64.cc
index d8198fa4..20d4c002 100644
--- a/src/aarch64/logic-aarch64.cc
+++ b/src/aarch64/logic-aarch64.cc
@@ -823,6 +823,17 @@ LogicVRegister Simulator::sqrdmulh(VectorFormat vform,
}
+LogicVRegister Simulator::sdot(VectorFormat vform,  // Indexed SDOT: defers to the four-argument sdot overload.
+ LogicVRegister dst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2,
+ int index) {
+ SimVRegister temp;  // Scratch register handed to dup_element.
+ VectorFormat indexform = VectorFormatFillQ(vform);  // dup_element is called with this format rather than vform.
+ return sdot(vform, dst, src1, dup_element(indexform, temp, src2, index));  // Presumably dup_element replicates element `index` of src2 - confirm.
+}
+
+
LogicVRegister Simulator::sqrdmlah(VectorFormat vform,
LogicVRegister dst,
const LogicVRegister& src1,
@@ -834,6 +845,17 @@ LogicVRegister Simulator::sqrdmlah(VectorFormat vform,
}
+LogicVRegister Simulator::udot(VectorFormat vform,  // Indexed UDOT: defers to the four-argument udot overload.
+ LogicVRegister dst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2,
+ int index) {
+ SimVRegister temp;  // Scratch register handed to dup_element.
+ VectorFormat indexform = VectorFormatFillQ(vform);  // dup_element is called with this format rather than vform.
+ return udot(vform, dst, src1, dup_element(indexform, temp, src2, index));  // Presumably dup_element replicates element `index` of src2 - confirm.
+}
+
+
LogicVRegister Simulator::sqrdmlsh(VectorFormat vform,
LogicVRegister dst,
const LogicVRegister& src1,
@@ -3270,6 +3292,53 @@ LogicVRegister Simulator::sqrdmulh(VectorFormat vform,
}
+LogicVRegister Simulator::dot(VectorFormat vform,  // Shared implementation called by the vector sdot and udot.
+ LogicVRegister dst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2,
+ bool is_signed) {  // true: sources are read with Int(); false: with Uint().
+ VectorFormat quarter_vform =  // Format used to read the source elements.
+ VectorFormatHalfWidthDoubleLanes(VectorFormatHalfWidthDoubleLanes(vform));
+ // Each destination lane e consumes source elements 4*e .. 4*e+3.
+ dst.ClearForWrite(vform);
+ for (int e = 0; e < LaneCountFromFormat(vform); e++) {
+ int64_t result = 0;  // Running sum of the four products for lane e.
+ int64_t element1, element2;
+ for (int i = 0; i < 4; i++) {
+ int index = 4 * e + i;
+ if (is_signed) {
+ element1 = src1.Int(quarter_vform, index);
+ element2 = src2.Int(quarter_vform, index);
+ } else {
+ element1 = src1.Uint(quarter_vform, index);
+ element2 = src2.Uint(quarter_vform, index);
+ }
+ result += element1 * element2;
+ }
+ // Accumulate: the current value of destination lane e is added before writing back.
+ result += dst.Int(vform, e);
+ dst.SetInt(vform, e, result);
+ }
+ return dst;
+}
+
+
+LogicVRegister Simulator::sdot(VectorFormat vform,  // Vector SDOT: thin wrapper over dot().
+ LogicVRegister dst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2) {
+ return dot(vform, dst, src1, src2, true);  // is_signed = true.
+}
+
+
+LogicVRegister Simulator::udot(VectorFormat vform,  // Vector UDOT: thin wrapper over dot().
+ LogicVRegister dst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2) {
+ return dot(vform, dst, src1, src2, false);  // is_signed = false.
+}
+
+
LogicVRegister Simulator::sqrdmlash(VectorFormat vform,
LogicVRegister dst,
const LogicVRegister& src1,
diff --git a/src/aarch64/macro-assembler-aarch64.h b/src/aarch64/macro-assembler-aarch64.h
index e0789306..29a29556 100644
--- a/src/aarch64/macro-assembler-aarch64.h
+++ b/src/aarch64/macro-assembler-aarch64.h
@@ -2284,7 +2284,9 @@ class MacroAssembler : public Assembler, public MacroAssemblerInterface {
V(sqdmull, Sqdmull) \
V(sqdmull2, Sqdmull2) \
V(sqrdmulh, Sqrdmulh) \
+ V(sdot, Sdot) \
V(sqrdmlah, Sqrdmlah) \
+ V(udot, Udot) \
V(sqrdmlsh, Sqrdmlsh) \
V(sqrshl, Sqrshl) \
V(sqshl, Sqshl) \
@@ -2460,7 +2462,9 @@ class MacroAssembler : public Assembler, public MacroAssemblerInterface {
V(mls, Mls) \
V(sqdmulh, Sqdmulh) \
V(sqrdmulh, Sqrdmulh) \
+ V(sdot, Sdot) \
V(sqrdmlah, Sqrdmlah) \
+ V(udot, Udot) \
V(sqrdmlsh, Sqrdmlsh) \
V(sqdmull, Sqdmull) \
V(sqdmull2, Sqdmull2) \
diff --git a/src/aarch64/operands-aarch64.h b/src/aarch64/operands-aarch64.h
index 2fb1f1d3..5b154eaa 100644
--- a/src/aarch64/operands-aarch64.h
+++ b/src/aarch64/operands-aarch64.h
@@ -207,6 +207,10 @@ class CPURegister {
bool IsD() const { return IsV() && Is64Bits(); }
bool IsQ() const { return IsV() && Is128Bits(); }
+ // Semantic type for sdot and udot instructions.
+ bool IsS4B() const { return IsS(); }  // Returns exactly what IsS() returns.
+ const VRegister& S4B() const { return S(); }  // Returns exactly what S() returns.
+
const Register& W() const;
const Register& X() const;
const VRegister& V() const;
@@ -392,6 +396,10 @@ class VRegister : public CPURegister {
return Is32Bits();
}
+ // Semantic type for sdot and udot instructions.
+ bool Is1S4B() const { return Is1S(); }  // Returns exactly what Is1S() returns.
+
+
bool IsLaneSizeB() const { return GetLaneSizeInBits() == kBRegSize; }
bool IsLaneSizeH() const { return GetLaneSizeInBits() == kHRegSize; }
bool IsLaneSizeS() const { return GetLaneSizeInBits() == kSRegSize; }
diff --git a/src/aarch64/simulator-aarch64.cc b/src/aarch64/simulator-aarch64.cc
index 5432c1f3..3abc7800 100644
--- a/src/aarch64/simulator-aarch64.cc
+++ b/src/aarch64/simulator-aarch64.cc
@@ -3670,9 +3670,15 @@ void Simulator::VisitNEON3SameExtra(const Instruction* instr) {
}
} else {
switch (instr->Mask(NEON3SameExtraMask)) {
+ case NEON_SDOT:
+ sdot(vf, rd, rn, rm);
+ break;
case NEON_SQRDMLAH:
sqrdmlah(vf, rd, rn, rm);
break;
+ case NEON_UDOT:
+ udot(vf, rd, rn, rm);
+ break;
case NEON_SQRDMLSH:
sqrdmlsh(vf, rd, rn, rm);
break;
@@ -3952,10 +3958,18 @@ void Simulator::VisitNEONByIndexedElement(const Instruction* instr) {
Op = &Simulator::sqrdmulh;
vf = vf_r;
break;
+ case NEON_SDOT_byelement:
+ Op = &Simulator::sdot;
+ vf = vf_r;
+ break;
case NEON_SQRDMLAH_byelement:
Op = &Simulator::sqrdmlah;
vf = vf_r;
break;
+ case NEON_UDOT_byelement:
+ Op = &Simulator::udot;
+ vf = vf_r;
+ break;
case NEON_SQRDMLSH_byelement:
Op = &Simulator::sqrdmlsh;
vf = vf_r;
diff --git a/src/aarch64/simulator-aarch64.h b/src/aarch64/simulator-aarch64.h
index 7c84cb38..4de98294 100644
--- a/src/aarch64/simulator-aarch64.h
+++ b/src/aarch64/simulator-aarch64.h
@@ -1985,11 +1985,21 @@ class Simulator : public DecoderVisitor {
const LogicVRegister& src1,
const LogicVRegister& src2,
int index);
+ LogicVRegister sdot(VectorFormat vform,
+ LogicVRegister dst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2,
+ int index);
LogicVRegister sqrdmlah(VectorFormat vform,
LogicVRegister dst,
const LogicVRegister& src1,
const LogicVRegister& src2,
int index);
+ LogicVRegister udot(VectorFormat vform,
+ LogicVRegister dst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2,
+ int index);
LogicVRegister sqrdmlsh(VectorFormat vform,
LogicVRegister dst,
const LogicVRegister& src1,
@@ -2577,6 +2587,19 @@ class Simulator : public DecoderVisitor {
const LogicVRegister& src1,
const LogicVRegister& src2,
bool round = true);
+ LogicVRegister dot(VectorFormat vform,
+ LogicVRegister dst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2,
+ bool is_signed);
+ LogicVRegister sdot(VectorFormat vform,
+ LogicVRegister dst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2);
+ LogicVRegister udot(VectorFormat vform,
+ LogicVRegister dst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2);
LogicVRegister sqrdmlash(VectorFormat vform,
LogicVRegister dst,
const LogicVRegister& src1,