aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlexander Gilday <alexander.gilday@arm.com>2018-04-04 13:42:33 +0100
committerAlexander Gilday <alexander.gilday@arm.com>2018-04-10 14:05:14 +0100
commit4378564f0d26925b8aee59b28fa5fd9f249e3ef8 (patch)
treee0d66c243fc30708e154db13b0109e9f6c745444
parentd82faf6442f6b611101c2cebe35bb3deddbf1d6e (diff)
Add support for SQRDMLAH and SQRDMLSH.
Change-Id: I2b490d877e7e9db77608b84ab0b92aa972a54a6d
-rw-r--r--examples/aarch64/non-const-visitor.h1
-rw-r--r--src/aarch64/assembler-aarch64.cc40
-rw-r--r--src/aarch64/assembler-aarch64.h22
-rw-r--r--src/aarch64/constants-aarch64.h29
-rw-r--r--src/aarch64/decoder-aarch64.cc15
-rw-r--r--src/aarch64/decoder-aarch64.h1
-rw-r--r--src/aarch64/disasm-aarch64.cc69
-rw-r--r--src/aarch64/instrument-aarch64.cc8
-rw-r--r--src/aarch64/logic-aarch64.cc78
-rw-r--r--src/aarch64/macro-assembler-aarch64.h4
-rw-r--r--src/aarch64/simulator-aarch64.cc74
-rw-r--r--src/aarch64/simulator-aarch64.h26
-rw-r--r--test/aarch64/test-assembler-aarch64.cc129
-rw-r--r--test/aarch64/test-disasm-aarch64.cc38
14 files changed, 502 insertions, 32 deletions
diff --git a/examples/aarch64/non-const-visitor.h b/examples/aarch64/non-const-visitor.h
index d5c99fec..56b1e51b 100644
--- a/examples/aarch64/non-const-visitor.h
+++ b/examples/aarch64/non-const-visitor.h
@@ -118,6 +118,7 @@ class SwitchAddSubRegisterSources : public DecoderVisitor {
V(NEONScalar2RegMisc) \
V(NEONScalar3Diff) \
V(NEONScalar3Same) \
+ V(NEONScalar3SameExtra) \
V(NEON3SameExtra) \
V(NEONScalarByIndexedElement) \
V(NEONScalarCopy) \
diff --git a/src/aarch64/assembler-aarch64.cc b/src/aarch64/assembler-aarch64.cc
index 8851631f..41841e54 100644
--- a/src/aarch64/assembler-aarch64.cc
+++ b/src/aarch64/assembler-aarch64.cc
@@ -2767,6 +2767,42 @@ void Assembler::addp(const VRegister& vd, const VRegister& vn) {
}
+void Assembler::sqrdmlah(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm) {
+ VIXL_ASSERT(AreSameFormat(vd, vn, vm));
+ VIXL_ASSERT(vd.IsVector() || !vd.IsQ());
+
+ Instr format, op = NEON_SQRDMLAH;
+ if (vd.IsScalar()) {
+ op |= NEON_Q | NEONScalar;
+ format = SFormat(vd);
+ } else {
+ format = VFormat(vd);
+ }
+
+ Emit(format | op | Rm(vm) | Rn(vn) | Rd(vd));
+}
+
+
+void Assembler::sqrdmlsh(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm) {
+ VIXL_ASSERT(AreSameFormat(vd, vn, vm));
+ VIXL_ASSERT(vd.IsVector() || !vd.IsQ());
+
+ Instr format, op = NEON_SQRDMLSH;
+ if (vd.IsScalar()) {
+ op |= NEON_Q | NEONScalar;
+ format = SFormat(vd);
+ } else {
+ format = VFormat(vd);
+ }
+
+ Emit(format | op | Rm(vm) | Rn(vn) | Rd(vd));
+}
+
+
void Assembler::faddp(const VRegister& vd, const VRegister& vn) {
VIXL_ASSERT((vd.Is1S() && vn.Is2S()) || (vd.Is1D() && vn.Is2D()));
Emit(FPFormat(vd) | NEON_FADDP_scalar | Rn(vn) | Rd(vd));
@@ -2995,7 +3031,9 @@ void Assembler::NEONByElementL(const VRegister& vd,
V(mla, NEON_MLA_byelement, vn.IsVector()) \
V(mls, NEON_MLS_byelement, vn.IsVector()) \
V(sqdmulh, NEON_SQDMULH_byelement, true) \
- V(sqrdmulh, NEON_SQRDMULH_byelement, true)
+ V(sqrdmulh, NEON_SQRDMULH_byelement, true) \
+ V(sqrdmlah, NEON_SQRDMLAH_byelement, true) \
+ V(sqrdmlsh, NEON_SQRDMLSH_byelement, true)
// clang-format on
diff --git a/src/aarch64/assembler-aarch64.h b/src/aarch64/assembler-aarch64.h
index 412f648d..1c25886b 100644
--- a/src/aarch64/assembler-aarch64.h
+++ b/src/aarch64/assembler-aarch64.h
@@ -2415,6 +2415,14 @@ class Assembler : public vixl::internal::AssemblerBase {
// Signed saturating rounding doubling multiply returning high half.
void sqrdmulh(const VRegister& vd, const VRegister& vn, const VRegister& vm);
+ // Signed saturating rounding doubling multiply accumulate returning high
+ // half [Armv8.1].
+ void sqrdmlah(const VRegister& vd, const VRegister& vn, const VRegister& vm);
+
+ // Signed saturating rounding doubling multiply subtract returning high half
+ // [Armv8.1].
+ void sqrdmlsh(const VRegister& vd, const VRegister& vn, const VRegister& vm);
+
// Signed saturating doubling multiply element returning high half.
void sqdmulh(const VRegister& vd,
const VRegister& vn,
@@ -2427,6 +2435,20 @@ class Assembler : public vixl::internal::AssemblerBase {
const VRegister& vm,
int vm_index);
+ // Signed saturating rounding doubling multiply accumulate element returning
+ // high half [Armv8.1].
+ void sqrdmlah(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm,
+ int vm_index);
+
+ // Signed saturating rounding doubling multiply subtract element returning
+ // high half [Armv8.1].
+ void sqrdmlsh(const VRegister& vd,
+ const VRegister& vn,
+ const VRegister& vm,
+ int vm_index);
+
// Unsigned long multiply long.
void umull(const VRegister& vd, const VRegister& vn, const VRegister& vm);
diff --git a/src/aarch64/constants-aarch64.h b/src/aarch64/constants-aarch64.h
index 7f175a41..b57bdbfe 100644
--- a/src/aarch64/constants-aarch64.h
+++ b/src/aarch64/constants-aarch64.h
@@ -1550,13 +1550,17 @@ enum NEON3SameOp {
enum NEON3SameExtraOp {
NEON3SameExtraFixed = 0x0E008400,
NEON3SameExtraUBit = 0x20000000,
- NEON3SameExtraFMask = 0x0E008400,
- NEON3SameExtraMask = 0x2E00E400,
+ NEON3SameExtraFMask = 0x9E208400,
+ NEON3SameExtraMask = 0xBE20FC00,
+ NEON_SQRDMLAH = NEON3SameExtraFixed | NEON3SameExtraUBit,
+ NEON_SQRDMLSH = NEON3SameExtraFixed | NEON3SameExtraUBit | 0x00000800,
/* v8.3 Complex Numbers */
- NEON_FCMLA = NEON3SameExtraFixed | NEON3SameExtraUBit | 0x00004000,
- NEON_FCADD = NEON3SameExtraFixed | NEON3SameExtraUBit | 0x00006000
-
+ NEON3SameExtraFCFixed = 0x2E00C400,
+ NEON3SameExtraFCFMask = 0xBE20C400,
+ NEON3SameExtraFCMask = 0xBE20E400,
+ NEON_FCMLA = NEON3SameExtraFCFixed,
+ NEON_FCADD = NEON3SameExtraFCFixed | 0x00002000
};
// NEON instructions with three different-type operands.
@@ -1670,6 +1674,8 @@ enum NEONByIndexedElementOp {
NEON_FMUL_byelement = NEONByIndexedElementFPFixed | 0x00009000,
NEON_FMULX_byelement = NEONByIndexedElementFPFixed | 0x20009000,
NEON_FCMLA_byelement = NEONByIndexedElementFixed | 0x20001000,
+ NEON_SQRDMLAH_byelement = NEONByIndexedElementFixed | 0x2000D000,
+ NEON_SQRDMLSH_byelement = NEONByIndexedElementFixed | 0x2000F000,
// Complex instructions: this mask is necessary because the 'rot' encoding
// moves into the NEONByIndexedElementMask space.
NEONByIndexedElementFPComplexMask = 0xBF009400
@@ -2051,6 +2057,15 @@ enum NEONScalar3SameOp {
NEON_FABD_scalar = NEON_Q | NEONScalar | NEON_FABD
};
+// 'Extra' NEON scalar instructions with three same-type operands.
+enum NEONScalar3SameExtraOp {
+ NEONScalar3SameExtraFixed = 0x5E008400,
+ NEONScalar3SameExtraFMask = 0xDF208400,
+ NEONScalar3SameExtraMask = 0xFF20FC00,
+ NEON_SQRDMLAH_scalar = NEON_Q | NEONScalar | NEON_SQRDMLAH,
+ NEON_SQRDMLSH_scalar = NEON_Q | NEONScalar | NEON_SQRDMLSH
+};
+
// NEON scalar instructions with three different-type operands.
enum NEONScalar3DiffOp {
NEONScalar3DiffFixed = 0x5E200000,
@@ -2072,6 +2087,10 @@ enum NEONScalarByIndexedElementOp {
NEON_SQDMULH_byelement_scalar = NEON_Q | NEONScalar | NEON_SQDMULH_byelement,
NEON_SQRDMULH_byelement_scalar
= NEON_Q | NEONScalar | NEON_SQRDMULH_byelement,
+ NEON_SQRDMLAH_byelement_scalar
+ = NEON_Q | NEONScalar | NEON_SQRDMLAH_byelement,
+ NEON_SQRDMLSH_byelement_scalar
+ = NEON_Q | NEONScalar | NEON_SQRDMLSH_byelement,
// Floating point instructions.
NEONScalarByIndexedElementFPFixed
diff --git a/src/aarch64/decoder-aarch64.cc b/src/aarch64/decoder-aarch64.cc
index 4dc77212..fe1cb08c 100644
--- a/src/aarch64/decoder-aarch64.cc
+++ b/src/aarch64/decoder-aarch64.cc
@@ -756,8 +756,9 @@ void Decoder::DecodeNEONVectorDataProcessing(const Instruction* instr) {
}
} else if (instr->ExtractBit(10) == 0) {
VisitUnallocated(instr);
- } else if (instr->ExtractBits(14, 11) <= 0xE &&
- instr->ExtractBits(14, 11) >= 0x8) {
+ } else if ((instr->ExtractBits(14, 11) <= 0xE &&
+ instr->ExtractBits(14, 11) >= 0x8) ||
+ instr->ExtractBits(14, 12) == 0x0) {
VisitNEON3SameExtra(instr);
} else if (instr->ExtractBits(13, 11) < 0x4) {
VisitUnimplemented(instr);
@@ -839,7 +840,15 @@ void Decoder::DecodeNEONScalarDataProcessing(const Instruction* instr) {
}
}
} else {
- VisitUnallocated(instr);
+ if (instr->ExtractBit(29) == 0) {
+ VisitUnallocated(instr);
+ } else {
+ if (instr->ExtractBit(10) == 0) {
+ VisitUnallocated(instr);
+ } else {
+ VisitNEONScalar3SameExtra(instr);
+ }
+ }
}
} else {
if (instr->ExtractBit(10) == 0) {
diff --git a/src/aarch64/decoder-aarch64.h b/src/aarch64/decoder-aarch64.h
index 05c86a33..4a21b9f5 100644
--- a/src/aarch64/decoder-aarch64.h
+++ b/src/aarch64/decoder-aarch64.h
@@ -99,6 +99,7 @@
V(NEONScalar2RegMisc) \
V(NEONScalar3Diff) \
V(NEONScalar3Same) \
+ V(NEONScalar3SameExtra) \
V(NEONScalarByIndexedElement) \
V(NEONScalarCopy) \
V(NEONScalarPairwise) \
diff --git a/src/aarch64/disasm-aarch64.cc b/src/aarch64/disasm-aarch64.cc
index 126ab273..defa07f8 100644
--- a/src/aarch64/disasm-aarch64.cc
+++ b/src/aarch64/disasm-aarch64.cc
@@ -2335,17 +2335,32 @@ void Disassembler::VisitNEON3SameExtra(const Instruction *instr) {
NEONFormatDecoder nfd(instr);
- switch (instr->Mask(NEON3SameExtraMask)) {
- case NEON_FCMLA:
- mnemonic = "fcmla";
- form = "'Vd.%s, 'Vn.%s, 'Vm.%s, 'IVFCNM";
- break;
- case NEON_FCADD:
- mnemonic = "fcadd";
- form = "'Vd.%s, 'Vn.%s, 'Vm.%s, 'IVFCNA";
- break;
- default:
- form = "(NEON3SameExtra)";
+ if (instr->Mask(NEON3SameExtraFCFMask) == NEON3SameExtraFCFixed) {
+ switch (instr->Mask(NEON3SameExtraFCMask)) {
+ case NEON_FCMLA:
+ mnemonic = "fcmla";
+ form = "'Vd.%s, 'Vn.%s, 'Vm.%s, 'IVFCNM";
+ break;
+ case NEON_FCADD:
+ mnemonic = "fcadd";
+ form = "'Vd.%s, 'Vn.%s, 'Vm.%s, 'IVFCNA";
+ break;
+ default:
+ form = "(NEON3SameExtra)";
+ }
+ } else {
+ switch (instr->Mask(NEON3SameExtraMask)) {
+ case NEON_SQRDMLAH:
+ mnemonic = "sqrdmlah";
+ form = "'Vd.%s, 'Vn.%s, 'Vm.%s";
+ break;
+ case NEON_SQRDMLSH:
+ mnemonic = "sqrdmlsh";
+ form = "'Vd.%s, 'Vn.%s, 'Vm.%s";
+ break;
+ default:
+ form = "(NEON3SameExtra)";
+ }
}
Format(instr, mnemonic, nfd.Substitute(form));
@@ -2594,6 +2609,12 @@ void Disassembler::VisitNEONByIndexedElement(const Instruction *instr) {
case NEON_SQRDMULH_byelement:
mnemonic = "sqrdmulh";
break;
+ case NEON_SQRDMLAH_byelement:
+ mnemonic = "sqrdmlah";
+ break;
+ case NEON_SQRDMLSH_byelement:
+ mnemonic = "sqrdmlsh";
+ break;
default:
switch (instr->Mask(NEONByIndexedElementFPMask)) {
case NEON_FMUL_byelement:
@@ -2622,6 +2643,7 @@ void Disassembler::VisitNEONByIndexedElement(const Instruction *instr) {
}
}
}
+
if (l_instr) {
Format(instr, nfd.Mnemonic(mnemonic), nfd.Substitute(form));
} else if (fp_instr) {
@@ -3576,6 +3598,25 @@ void Disassembler::VisitNEONScalar3Same(const Instruction *instr) {
}
+void Disassembler::VisitNEONScalar3SameExtra(const Instruction *instr) {
+ const char *mnemonic = "unimplemented";
+ const char *form = "%sd, %sn, %sm";
+ NEONFormatDecoder nfd(instr, NEONFormatDecoder::ScalarFormatMap());
+
+ switch (instr->Mask(NEONScalar3SameExtraMask)) {
+ case NEON_SQRDMLAH_scalar:
+ mnemonic = "sqrdmlah";
+ break;
+ case NEON_SQRDMLSH_scalar:
+ mnemonic = "sqrdmlsh";
+ break;
+ default:
+ form = "(NEONScalar3SameExtra)";
+ }
+ Format(instr, mnemonic, nfd.SubstitutePlaceholders(form));
+}
+
+
void Disassembler::VisitNEONScalarByIndexedElement(const Instruction *instr) {
const char *mnemonic = "unimplemented";
const char *form = "%sd, %sn, 'Ve.%s['IVByElemIndex]";
@@ -3601,6 +3642,12 @@ void Disassembler::VisitNEONScalarByIndexedElement(const Instruction *instr) {
case NEON_SQRDMULH_byelement_scalar:
mnemonic = "sqrdmulh";
break;
+ case NEON_SQRDMLAH_byelement_scalar:
+ mnemonic = "sqrdmlah";
+ break;
+ case NEON_SQRDMLSH_byelement_scalar:
+ mnemonic = "sqrdmlsh";
+ break;
default:
nfd.SetFormatMap(0, nfd.FPScalarFormatMap());
switch (instr->Mask(NEONScalarByIndexedElementFPMask)) {
diff --git a/src/aarch64/instrument-aarch64.cc b/src/aarch64/instrument-aarch64.cc
index c664b50c..3882e8ee 100644
--- a/src/aarch64/instrument-aarch64.cc
+++ b/src/aarch64/instrument-aarch64.cc
@@ -789,6 +789,14 @@ void Instrument::VisitNEONScalar3Same(const Instruction* instr) {
}
+void Instrument::VisitNEONScalar3SameExtra(const Instruction* instr) {
+ USE(instr);
+ Update();
+ static Counter* counter = GetCounter("NEON");
+ counter->Increment();
+}
+
+
void Instrument::VisitNEONScalarByIndexedElement(const Instruction* instr) {
USE(instr);
Update();
diff --git a/src/aarch64/logic-aarch64.cc b/src/aarch64/logic-aarch64.cc
index fc7d234b..15fac2cb 100644
--- a/src/aarch64/logic-aarch64.cc
+++ b/src/aarch64/logic-aarch64.cc
@@ -1124,6 +1124,28 @@ LogicVRegister Simulator::sqrdmulh(VectorFormat vform,
}
+LogicVRegister Simulator::sqrdmlah(VectorFormat vform,
+ LogicVRegister dst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2,
+ int index) {
+ SimVRegister temp;
+ VectorFormat indexform = VectorFormatFillQ(vform);
+ return sqrdmlah(vform, dst, src1, dup_element(indexform, temp, src2, index));
+}
+
+
+LogicVRegister Simulator::sqrdmlsh(VectorFormat vform,
+ LogicVRegister dst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2,
+ int index) {
+ SimVRegister temp;
+ VectorFormat indexform = VectorFormatFillQ(vform);
+ return sqrdmlsh(vform, dst, src1, dup_element(indexform, temp, src2, index));
+}
+
+
uint16_t Simulator::PolynomialMult(uint8_t op1, uint8_t op2) const {
uint16_t result = 0;
uint16_t extended_op2 = op2;
@@ -3549,6 +3571,62 @@ LogicVRegister Simulator::sqrdmulh(VectorFormat vform,
}
+LogicVRegister Simulator::sqrdmlash(VectorFormat vform,
+ LogicVRegister dst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2,
+ bool round,
+ bool sub_op) {
+  // 2 * INT32_MIN * INT32_MIN causes int64_t to overflow.
+  // To avoid this, we use:
+  //     ((dst << (esize - 1)) + src1 * src2 + (1 << (esize - 2))) >> (esize - 1)
+  // which is the same as:
+  //     ((dst << esize) + 2 * src1 * src2 + (1 << (esize - 1))) >> esize.
+
+ int esize = LaneSizeInBitsFromFormat(vform);
+ int round_const = round ? (1 << (esize - 2)) : 0;
+ int64_t accum;
+
+ dst.ClearForWrite(vform);
+ for (int i = 0; i < LaneCountFromFormat(vform); i++) {
+ accum = dst.Int(vform, i) << (esize - 1);
+ if (sub_op) {
+ accum -= src1.Int(vform, i) * src2.Int(vform, i);
+ } else {
+ accum += src1.Int(vform, i) * src2.Int(vform, i);
+ }
+ accum += round_const;
+ accum = accum >> (esize - 1);
+
+ if (accum > MaxIntFromFormat(vform)) {
+ accum = MaxIntFromFormat(vform);
+ } else if (accum < MinIntFromFormat(vform)) {
+ accum = MinIntFromFormat(vform);
+ }
+ dst.SetInt(vform, i, accum);
+ }
+ return dst;
+}
+
+
+LogicVRegister Simulator::sqrdmlah(VectorFormat vform,
+ LogicVRegister dst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2,
+ bool round) {
+ return sqrdmlash(vform, dst, src1, src2, round, false);
+}
+
+
+LogicVRegister Simulator::sqrdmlsh(VectorFormat vform,
+ LogicVRegister dst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2,
+ bool round) {
+ return sqrdmlash(vform, dst, src1, src2, round, true);
+}
+
+
LogicVRegister Simulator::sqdmulh(VectorFormat vform,
LogicVRegister dst,
const LogicVRegister& src1,
diff --git a/src/aarch64/macro-assembler-aarch64.h b/src/aarch64/macro-assembler-aarch64.h
index 5255872a..acb65ed9 100644
--- a/src/aarch64/macro-assembler-aarch64.h
+++ b/src/aarch64/macro-assembler-aarch64.h
@@ -2236,6 +2236,8 @@ class MacroAssembler : public Assembler, public MacroAssemblerInterface {
V(sqdmull, Sqdmull) \
V(sqdmull2, Sqdmull2) \
V(sqrdmulh, Sqrdmulh) \
+ V(sqrdmlah, Sqrdmlah) \
+ V(sqrdmlsh, Sqrdmlsh) \
V(sqrshl, Sqrshl) \
V(sqshl, Sqshl) \
V(sqsub, Sqsub) \
@@ -2410,6 +2412,8 @@ class MacroAssembler : public Assembler, public MacroAssemblerInterface {
V(mls, Mls) \
V(sqdmulh, Sqdmulh) \
V(sqrdmulh, Sqrdmulh) \
+ V(sqrdmlah, Sqrdmlah) \
+ V(sqrdmlsh, Sqrdmlsh) \
V(sqdmull, Sqdmull) \
V(sqdmull2, Sqdmull2) \
V(sqdmlal, Sqdmlal) \
diff --git a/src/aarch64/simulator-aarch64.cc b/src/aarch64/simulator-aarch64.cc
index 1f7a5154..ab08d28f 100644
--- a/src/aarch64/simulator-aarch64.cc
+++ b/src/aarch64/simulator-aarch64.cc
@@ -3514,18 +3514,32 @@ void Simulator::VisitNEON3SameExtra(const Instruction* instr) {
SimVRegister& rm = ReadVRegister(instr->GetRm());
int rot = 0;
VectorFormat vf = nfd.GetVectorFormat();
- switch (instr->Mask(NEON3SameExtraMask)) {
- case NEON_FCADD:
- rot = instr->GetImmRotFcadd();
- fcadd(vf, rd, rn, rm, rot);
- break;
- case NEON_FCMLA:
- rot = instr->GetImmRotFcmlaVec();
- fcmla(vf, rd, rn, rm, rot);
- break;
- default:
- VIXL_UNIMPLEMENTED();
- break;
+ if (instr->Mask(NEON3SameExtraFCFMask) == NEON3SameExtraFCFixed) {
+ switch (instr->Mask(NEON3SameExtraFCMask)) {
+ case NEON_FCADD:
+ rot = instr->GetImmRotFcadd();
+ fcadd(vf, rd, rn, rm, rot);
+ break;
+ case NEON_FCMLA:
+ rot = instr->GetImmRotFcmlaVec();
+ fcmla(vf, rd, rn, rm, rot);
+ break;
+ default:
+ VIXL_UNIMPLEMENTED();
+ break;
+ }
+ } else {
+ switch (instr->Mask(NEON3SameExtraMask)) {
+ case NEON_SQRDMLAH:
+ sqrdmlah(vf, rd, rn, rm);
+ break;
+ case NEON_SQRDMLSH:
+ sqrdmlsh(vf, rd, rn, rm);
+ break;
+ default:
+ VIXL_UNIMPLEMENTED();
+ break;
+ }
}
}
@@ -3798,6 +3812,14 @@ void Simulator::VisitNEONByIndexedElement(const Instruction* instr) {
Op = &Simulator::sqrdmulh;
vf = vf_r;
break;
+ case NEON_SQRDMLAH_byelement:
+ Op = &Simulator::sqrdmlah;
+ vf = vf_r;
+ break;
+ case NEON_SQRDMLSH_byelement:
+ Op = &Simulator::sqrdmlsh;
+ vf = vf_r;
+ break;
case NEON_SMULL_byelement:
if (instr->Mask(NEON_Q)) {
Op = &Simulator::smull2;
@@ -4753,6 +4775,26 @@ void Simulator::VisitNEONScalar3Same(const Instruction* instr) {
}
+void Simulator::VisitNEONScalar3SameExtra(const Instruction* instr) {
+ NEONFormatDecoder nfd(instr, NEONFormatDecoder::ScalarFormatMap());
+ VectorFormat vf = nfd.GetVectorFormat();
+
+ SimVRegister& rd = ReadVRegister(instr->GetRd());
+ SimVRegister& rn = ReadVRegister(instr->GetRn());
+ SimVRegister& rm = ReadVRegister(instr->GetRm());
+
+ switch (instr->Mask(NEONScalar3SameExtraMask)) {
+ case NEON_SQRDMLAH_scalar:
+ sqrdmlah(vf, rd, rn, rm);
+ break;
+ case NEON_SQRDMLSH_scalar:
+ sqrdmlsh(vf, rd, rn, rm);
+ break;
+ default:
+ VIXL_UNIMPLEMENTED();
+ }
+}
+
void Simulator::VisitNEONScalarByIndexedElement(const Instruction* instr) {
NEONFormatDecoder nfd(instr, NEONFormatDecoder::LongScalarFormatMap());
VectorFormat vf = nfd.GetVectorFormat();
@@ -4787,6 +4829,14 @@ void Simulator::VisitNEONScalarByIndexedElement(const Instruction* instr) {
Op = &Simulator::sqrdmulh;
vf = vf_r;
break;
+ case NEON_SQRDMLAH_byelement_scalar:
+ Op = &Simulator::sqrdmlah;
+ vf = vf_r;
+ break;
+ case NEON_SQRDMLSH_byelement_scalar:
+ Op = &Simulator::sqrdmlsh;
+ vf = vf_r;
+ break;
default:
vf = nfd.GetVectorFormat(nfd.FPScalarFormatMap());
index = instr->GetNEONH();
diff --git a/src/aarch64/simulator-aarch64.h b/src/aarch64/simulator-aarch64.h
index 813cee1a..c8cbffb3 100644
--- a/src/aarch64/simulator-aarch64.h
+++ b/src/aarch64/simulator-aarch64.h
@@ -2173,6 +2173,16 @@ class Simulator : public DecoderVisitor {
const LogicVRegister& src1,
const LogicVRegister& src2,
int index);
+ LogicVRegister sqrdmlah(VectorFormat vform,
+ LogicVRegister dst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2,
+ int index);
+ LogicVRegister sqrdmlsh(VectorFormat vform,
+ LogicVRegister dst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2,
+ int index);
LogicVRegister sub(VectorFormat vform,
LogicVRegister dst,
const LogicVRegister& src1,
@@ -2755,6 +2765,22 @@ class Simulator : public DecoderVisitor {
const LogicVRegister& src1,
const LogicVRegister& src2,
bool round = true);
+ LogicVRegister sqrdmlash(VectorFormat vform,
+ LogicVRegister dst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2,
+ bool round = true,
+ bool sub_op = false);
+ LogicVRegister sqrdmlah(VectorFormat vform,
+ LogicVRegister dst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2,
+ bool round = true);
+ LogicVRegister sqrdmlsh(VectorFormat vform,
+ LogicVRegister dst,
+ const LogicVRegister& src1,
+ const LogicVRegister& src2,
+ bool round = true);
LogicVRegister sqdmulh(VectorFormat vform,
LogicVRegister dst,
const LogicVRegister& src1,
diff --git a/test/aarch64/test-assembler-aarch64.cc b/test/aarch64/test-assembler-aarch64.cc
index 0491a4e3..9cbe2f64 100644
--- a/test/aarch64/test-assembler-aarch64.cc
+++ b/test/aarch64/test-assembler-aarch64.cc
@@ -17256,6 +17256,135 @@ TEST(neon_byelement_sqdmulh_sqrdmulh) {
TEARDOWN();
}
+TEST(neon_3same_sqrdmlah) {
+ SETUP();
+
+ START();
+
+ __ Movi(v0.V2D(), 0x0000000000000000, 0x0000040004008000);
+ __ Movi(v1.V2D(), 0x0000000000000000, 0x0000002000108000);
+ __ Movi(v2.V2D(), 0x0400000080000000, 0x0400000080000000);
+ __ Movi(v3.V2D(), 0x0000002080000000, 0x0000001080000000);
+
+ __ Movi(v16.V2D(), 0x0000040004008000, 0x0000040004008000);
+ __ Movi(v17.V2D(), 0x0000000000000000, 0x0000002000108000);
+ __ Movi(v18.V2D(), 0x0400000080000000, 0x0400000080000000);
+ __ Movi(v19.V2D(), 0x0000002080000000, 0x0000001080000000);
+
+ __ Sqrdmlah(v16.V4H(), v0.V4H(), v1.V4H());
+ __ Sqrdmlah(v17.V4S(), v2.V4S(), v3.V4S());
+ __ Sqrdmlah(h18, h0, h1);
+ __ Sqrdmlah(s19, s2, s3);
+
+ END();
+
+// TODO: test on real hardware when available
+#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
+ RUN();
+ ASSERT_EQUAL_128(0, 0x0000040104010000, q16);
+ ASSERT_EQUAL_128(0x000000017fffffff, 0x000000217fffffff, q17);
+ ASSERT_EQUAL_128(0, 0x7fff, q18);
+ ASSERT_EQUAL_128(0, 0, q19);
+#endif
+ TEARDOWN();
+}
+
+TEST(neon_byelement_sqrdmlah) {
+ SETUP();
+
+ START();
+
+ __ Movi(v0.V2D(), 0x0000000000000000, 0x0000040004008000);
+ __ Movi(v1.V2D(), 0x0000000000000000, 0x0000002000108000);
+ __ Movi(v2.V2D(), 0x0400000080000000, 0x0400000080000000);
+ __ Movi(v3.V2D(), 0x0000002080000000, 0x0000001080000000);
+
+ __ Movi(v16.V2D(), 0x0000040004008000, 0x0000040004008000);
+ __ Movi(v17.V2D(), 0x0000000000000000, 0x0000002000108000);
+ __ Movi(v18.V2D(), 0x0400000080000000, 0x0400000080000000);
+ __ Movi(v19.V2D(), 0x0000002080000000, 0x0000001080000000);
+
+ __ Sqrdmlah(v16.V4H(), v0.V4H(), v1.H(), 1);
+ __ Sqrdmlah(v17.V4S(), v2.V4S(), v3.S(), 1);
+ __ Sqrdmlah(h18, h0, v1.H(), 0);
+ __ Sqrdmlah(s19, s2, v3.S(), 0);
+
+ END();
+
+#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
+ RUN();
+ ASSERT_EQUAL_128(0, 0x0000040104018000, q16);
+ ASSERT_EQUAL_128(0x00000001fffffff0, 0x0000002100107ff0, q17);
+ ASSERT_EQUAL_128(0, 0x7fff, q18);
+ ASSERT_EQUAL_128(0, 0, q19);
+#endif
+ TEARDOWN();
+}
+
+TEST(neon_3same_sqrdmlsh) {
+ SETUP();
+
+ START();
+
+ __ Movi(v0.V2D(), 0x0000000000000000, 0x0000040004000500);
+ __ Movi(v1.V2D(), 0x0000000000000000, 0x0000002000100080);
+ __ Movi(v2.V2D(), 0x0400000080000000, 0x0400000080000000);
+ __ Movi(v3.V2D(), 0x0000002080000000, 0x0000001080000000);
+
+ __ Movi(v16.V2D(), 0x4000400040004000, 0x4000400040004000);
+ __ Movi(v17.V2D(), 0x4000400040004000, 0x4000400040004000);
+ __ Movi(v18.V2D(), 0x4000400040004000, 0x4000400040004000);
+ __ Movi(v19.V2D(), 0x4000400040004000, 0x4000400040004000);
+
+ __ Sqrdmlsh(v16.V4H(), v0.V4H(), v1.V4H());
+ __ Sqrdmlsh(v17.V4S(), v2.V4S(), v3.V4S());
+ __ Sqrdmlsh(h18, h0, h1);
+ __ Sqrdmlsh(s19, s2, s3);
+
+ END();
+
+#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
+ RUN();
+ ASSERT_EQUAL_128(0, 0x40003fff40003ffb, q16);
+ ASSERT_EQUAL_128(0x40003fffc0004000, 0x40004000c0004000, q17);
+ ASSERT_EQUAL_128(0, 0x3ffb, q18);
+ ASSERT_EQUAL_128(0, 0xc0004000, q19);
+#endif
+ TEARDOWN();
+}
+
+TEST(neon_byelement_sqrdmlsh) {
+ SETUP();
+
+ START();
+
+ __ Movi(v0.V2D(), 0x0000000000000000, 0x0000040004008000);
+ __ Movi(v1.V2D(), 0x0000000000000000, 0x0000002000108000);
+ __ Movi(v2.V2D(), 0x0400000080000000, 0x0400000080000000);
+ __ Movi(v3.V2D(), 0x0000002080000000, 0x0000001080000000);
+
+ __ Movi(v16.V2D(), 0x4000400040004000, 0x4000400040004000);
+ __ Movi(v17.V2D(), 0x4000400040004000, 0x4000400040004000);
+ __ Movi(v18.V2D(), 0x4000400040004000, 0x4000400040004000);
+ __ Movi(v19.V2D(), 0x4000400040004000, 0x4000400040004000);
+
+ __ Sqrdmlsh(v16.V4H(), v0.V4H(), v1.H(), 1);
+ __ Sqrdmlsh(v17.V4S(), v2.V4S(), v3.S(), 1);
+ __ Sqrdmlsh(h18, h0, v1.H(), 0);
+ __ Sqrdmlsh(s19, s2, v3.S(), 0);
+
+ END();
+
+#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
+ RUN();
+ ASSERT_EQUAL_128(0, 0x4000400040004010, q16);
+ ASSERT_EQUAL_128(0x4000400040004010, 0x4000400040004010, q17);
+ ASSERT_EQUAL_128(0, 0xc000, q18);
+ ASSERT_EQUAL_128(0, 0xc0004000, q19);
+#endif
+ TEARDOWN();
+}
+
TEST(neon_2regmisc_saddlp) {
SETUP();
diff --git a/test/aarch64/test-disasm-aarch64.cc b/test/aarch64/test-disasm-aarch64.cc
index f18c4427..616e3555 100644
--- a/test/aarch64/test-disasm-aarch64.cc
+++ b/test/aarch64/test-disasm-aarch64.cc
@@ -4504,6 +4504,18 @@ TEST(neon_3same) {
NEON_FORMAT_LIST_HS(DISASM_INST)
#undef DISASM_INST
+#define DISASM_INST(M, S) \
+ COMPARE_MACRO(Sqrdmlah(v1.M, v2.M, v3.M), \
+ "sqrdmlah v1." S ", v2." S ", v3." S);
+ NEON_FORMAT_LIST_HS(DISASM_INST)
+#undef DISASM_INST
+
+#define DISASM_INST(M, S) \
+ COMPARE_MACRO(Sqrdmlsh(v1.M, v2.M, v3.M), \
+ "sqrdmlsh v1." S ", v2." S ", v3." S);
+ NEON_FORMAT_LIST_HS(DISASM_INST)
+#undef DISASM_INST
+
COMPARE_MACRO(And(v6.V8B(), v7.V8B(), v8.V8B()), "and v6.8b, v7.8b, v8.8b");
COMPARE_MACRO(And(v6.V16B(), v7.V16B(), v8.V16B()),
"and v6.16b, v7.16b, v8.16b");
@@ -4776,6 +4788,10 @@ TEST(neon_scalar_3same) {
COMPARE_MACRO(Sqdmulh(v15.H(), v16.H(), v17.H()), "sqdmulh h15, h16, h17");
COMPARE_MACRO(Sqrdmulh(v12.S(), v13.S(), v14.S()), "sqrdmulh s12, s13, s14");
COMPARE_MACRO(Sqrdmulh(v15.H(), v16.H(), v17.H()), "sqrdmulh h15, h16, h17");
+ COMPARE_MACRO(Sqrdmlah(v12.S(), v13.S(), v14.S()), "sqrdmlah s12, s13, s14");
+ COMPARE_MACRO(Sqrdmlah(v15.H(), v16.H(), v17.H()), "sqrdmlah h15, h16, h17");
+ COMPARE_MACRO(Sqrdmlsh(v12.S(), v13.S(), v14.S()), "sqrdmlsh s12, s13, s14");
+ COMPARE_MACRO(Sqrdmlsh(v15.H(), v16.H(), v17.H()), "sqrdmlsh h15, h16, h17");
#define DISASM_INST(M, R) \
COMPARE_MACRO(Uqadd(v6.M, v7.M, v8.M), "uqadd " R "6, " R "7, " R "8");
@@ -4885,6 +4901,28 @@ TEST(neon_byelement) {
COMPARE_MACRO(Sqrdmulh(h0, h1, v2.H(), 0), "sqrdmulh h0, h1, v2.h[0]");
COMPARE_MACRO(Sqrdmulh(s0, s1, v2.S(), 0), "sqrdmulh s0, s1, v2.s[0]");
+ COMPARE_MACRO(Sqrdmlah(v0.V4H(), v1.V4H(), v2.H(), 0),
+ "sqrdmlah v0.4h, v1.4h, v2.h[0]");
+ COMPARE_MACRO(Sqrdmlah(v2.V8H(), v3.V8H(), v15.H(), 7),
+ "sqrdmlah v2.8h, v3.8h, v15.h[7]");
+ COMPARE_MACRO(Sqrdmlah(v0.V2S(), v1.V2S(), v2.S(), 0),
+ "sqrdmlah v0.2s, v1.2s, v2.s[0]");
+ COMPARE_MACRO(Sqrdmlah(v2.V4S(), v3.V4S(), v15.S(), 3),
+ "sqrdmlah v2.4s, v3.4s, v15.s[3]");
+ COMPARE_MACRO(Sqrdmlah(h0, h1, v2.H(), 0), "sqrdmlah h0, h1, v2.h[0]");
+ COMPARE_MACRO(Sqrdmlah(s0, s1, v2.S(), 0), "sqrdmlah s0, s1, v2.s[0]");
+
+ COMPARE_MACRO(Sqrdmlsh(v0.V4H(), v1.V4H(), v2.H(), 0),
+ "sqrdmlsh v0.4h, v1.4h, v2.h[0]");
+ COMPARE_MACRO(Sqrdmlsh(v2.V8H(), v3.V8H(), v15.H(), 7),
+ "sqrdmlsh v2.8h, v3.8h, v15.h[7]");
+ COMPARE_MACRO(Sqrdmlsh(v0.V2S(), v1.V2S(), v2.S(), 0),
+ "sqrdmlsh v0.2s, v1.2s, v2.s[0]");
+ COMPARE_MACRO(Sqrdmlsh(v2.V4S(), v3.V4S(), v15.S(), 3),
+ "sqrdmlsh v2.4s, v3.4s, v15.s[3]");
+ COMPARE_MACRO(Sqrdmlsh(h0, h1, v2.H(), 0), "sqrdmlsh h0, h1, v2.h[0]");
+ COMPARE_MACRO(Sqrdmlsh(s0, s1, v2.S(), 0), "sqrdmlsh s0, s1, v2.s[0]");
+
COMPARE_MACRO(Smull(v0.V4S(), v1.V4H(), v2.H(), 0),
"smull v0.4s, v1.4h, v2.h[0]");
COMPARE_MACRO(Smull2(v2.V4S(), v3.V8H(), v4.H(), 7),