diff options
author | Alexander Gilday <alexander.gilday@arm.com> | 2018-04-04 13:42:33 +0100 |
---|---|---|
committer | Alexander Gilday <alexander.gilday@arm.com> | 2018-04-10 14:05:14 +0100 |
commit | 4378564f0d26925b8aee59b28fa5fd9f249e3ef8 (patch) | |
tree | e0d66c243fc30708e154db13b0109e9f6c745444 | |
parent | d82faf6442f6b611101c2cebe35bb3deddbf1d6e (diff) |
Add support for SQRDMLAH and SQRDMLSH.
Change-Id: I2b490d877e7e9db77608b84ab0b92aa972a54a6d
-rw-r--r-- | examples/aarch64/non-const-visitor.h | 1 | ||||
-rw-r--r-- | src/aarch64/assembler-aarch64.cc | 40 | ||||
-rw-r--r-- | src/aarch64/assembler-aarch64.h | 22 | ||||
-rw-r--r-- | src/aarch64/constants-aarch64.h | 29 | ||||
-rw-r--r-- | src/aarch64/decoder-aarch64.cc | 15 | ||||
-rw-r--r-- | src/aarch64/decoder-aarch64.h | 1 | ||||
-rw-r--r-- | src/aarch64/disasm-aarch64.cc | 69 | ||||
-rw-r--r-- | src/aarch64/instrument-aarch64.cc | 8 | ||||
-rw-r--r-- | src/aarch64/logic-aarch64.cc | 78 | ||||
-rw-r--r-- | src/aarch64/macro-assembler-aarch64.h | 4 | ||||
-rw-r--r-- | src/aarch64/simulator-aarch64.cc | 74 | ||||
-rw-r--r-- | src/aarch64/simulator-aarch64.h | 26 | ||||
-rw-r--r-- | test/aarch64/test-assembler-aarch64.cc | 129 | ||||
-rw-r--r-- | test/aarch64/test-disasm-aarch64.cc | 38 |
14 files changed, 502 insertions, 32 deletions
diff --git a/examples/aarch64/non-const-visitor.h b/examples/aarch64/non-const-visitor.h index d5c99fec..56b1e51b 100644 --- a/examples/aarch64/non-const-visitor.h +++ b/examples/aarch64/non-const-visitor.h @@ -118,6 +118,7 @@ class SwitchAddSubRegisterSources : public DecoderVisitor { V(NEONScalar2RegMisc) \ V(NEONScalar3Diff) \ V(NEONScalar3Same) \ + V(NEONScalar3SameExtra) \ V(NEON3SameExtra) \ V(NEONScalarByIndexedElement) \ V(NEONScalarCopy) \ diff --git a/src/aarch64/assembler-aarch64.cc b/src/aarch64/assembler-aarch64.cc index 8851631f..41841e54 100644 --- a/src/aarch64/assembler-aarch64.cc +++ b/src/aarch64/assembler-aarch64.cc @@ -2767,6 +2767,42 @@ void Assembler::addp(const VRegister& vd, const VRegister& vn) { } +void Assembler::sqrdmlah(const VRegister& vd, + const VRegister& vn, + const VRegister& vm) { + VIXL_ASSERT(AreSameFormat(vd, vn, vm)); + VIXL_ASSERT(vd.IsVector() || !vd.IsQ()); + + Instr format, op = NEON_SQRDMLAH; + if (vd.IsScalar()) { + op |= NEON_Q | NEONScalar; + format = SFormat(vd); + } else { + format = VFormat(vd); + } + + Emit(format | op | Rm(vm) | Rn(vn) | Rd(vd)); +} + + +void Assembler::sqrdmlsh(const VRegister& vd, + const VRegister& vn, + const VRegister& vm) { + VIXL_ASSERT(AreSameFormat(vd, vn, vm)); + VIXL_ASSERT(vd.IsVector() || !vd.IsQ()); + + Instr format, op = NEON_SQRDMLSH; + if (vd.IsScalar()) { + op |= NEON_Q | NEONScalar; + format = SFormat(vd); + } else { + format = VFormat(vd); + } + + Emit(format | op | Rm(vm) | Rn(vn) | Rd(vd)); +} + + void Assembler::faddp(const VRegister& vd, const VRegister& vn) { VIXL_ASSERT((vd.Is1S() && vn.Is2S()) || (vd.Is1D() && vn.Is2D())); Emit(FPFormat(vd) | NEON_FADDP_scalar | Rn(vn) | Rd(vd)); @@ -2995,7 +3031,9 @@ void Assembler::NEONByElementL(const VRegister& vd, V(mla, NEON_MLA_byelement, vn.IsVector()) \ V(mls, NEON_MLS_byelement, vn.IsVector()) \ V(sqdmulh, NEON_SQDMULH_byelement, true) \ - V(sqrdmulh, NEON_SQRDMULH_byelement, true) + V(sqrdmulh, NEON_SQRDMULH_byelement, true) \ + V(sqrdmlah, NEON_SQRDMLAH_byelement, true) \ + V(sqrdmlsh, NEON_SQRDMLSH_byelement, true) // clang-format on diff --git a/src/aarch64/assembler-aarch64.h b/src/aarch64/assembler-aarch64.h index 412f648d..1c25886b 100644 --- a/src/aarch64/assembler-aarch64.h +++ b/src/aarch64/assembler-aarch64.h @@ -2415,6 +2415,14 @@ class Assembler : public vixl::internal::AssemblerBase { // Signed saturating rounding doubling multiply returning high half. void sqrdmulh(const VRegister& vd, const VRegister& vn, const VRegister& vm); + // Signed saturating rounding doubling multiply accumulate returning high + // half [Armv8.1]. + void sqrdmlah(const VRegister& vd, const VRegister& vn, const VRegister& vm); + + // Signed saturating rounding doubling multiply subtract returning high half + // [Armv8.1]. + void sqrdmlsh(const VRegister& vd, const VRegister& vn, const VRegister& vm); + // Signed saturating doubling multiply element returning high half. void sqdmulh(const VRegister& vd, const VRegister& vn, @@ -2427,6 +2435,20 @@ class Assembler : public vixl::internal::AssemblerBase { const VRegister& vm, int vm_index); + // Signed saturating rounding doubling multiply accumulate element returning + // high half [Armv8.1]. + void sqrdmlah(const VRegister& vd, + const VRegister& vn, + const VRegister& vm, + int vm_index); + + // Signed saturating rounding doubling multiply subtract element returning + // high half [Armv8.1]. + void sqrdmlsh(const VRegister& vd, + const VRegister& vn, + const VRegister& vm, + int vm_index); + // Unsigned long multiply long. void umull(const VRegister& vd, const VRegister& vn, const VRegister& vm); diff --git a/src/aarch64/constants-aarch64.h b/src/aarch64/constants-aarch64.h index 7f175a41..b57bdbfe 100644 --- a/src/aarch64/constants-aarch64.h +++ b/src/aarch64/constants-aarch64.h @@ -1550,13 +1550,17 @@ enum NEON3SameOp { enum NEON3SameExtraOp { NEON3SameExtraFixed = 0x0E008400, NEON3SameExtraUBit = 0x20000000, - NEON3SameExtraFMask = 0x0E008400, - NEON3SameExtraMask = 0x2E00E400, + NEON3SameExtraFMask = 0x9E208400, + NEON3SameExtraMask = 0xBE20FC00, + NEON_SQRDMLAH = NEON3SameExtraFixed | NEON3SameExtraUBit, + NEON_SQRDMLSH = NEON3SameExtraFixed | NEON3SameExtraUBit | 0x00000800, /* v8.3 Complex Numbers */ - NEON_FCMLA = NEON3SameExtraFixed | NEON3SameExtraUBit | 0x00004000, - NEON_FCADD = NEON3SameExtraFixed | NEON3SameExtraUBit | 0x00006000 - + NEON3SameExtraFCFixed = 0x2E00C400, + NEON3SameExtraFCFMask = 0xBE20C400, + NEON3SameExtraFCMask = 0xBE20E400, + NEON_FCMLA = NEON3SameExtraFCFixed, + NEON_FCADD = NEON3SameExtraFCFixed | 0x00002000 }; // NEON instructions with three different-type operands. @@ -1670,6 +1674,8 @@ enum NEONByIndexedElementOp { NEON_FMUL_byelement = NEONByIndexedElementFPFixed | 0x00009000, NEON_FMULX_byelement = NEONByIndexedElementFPFixed | 0x20009000, NEON_FCMLA_byelement = NEONByIndexedElementFixed | 0x20001000, + NEON_SQRDMLAH_byelement = NEONByIndexedElementFixed | 0x2000D000, + NEON_SQRDMLSH_byelement = NEONByIndexedElementFixed | 0x2000F000, // Complex instruction(s) this is necessary because 'rot' encoding moves into the NEONByIndex..Mask space NEONByIndexedElementFPComplexMask = 0xBF009400 @@ -2051,6 +2057,15 @@ enum NEONScalar3SameOp { NEON_FABD_scalar = NEON_Q | NEONScalar | NEON_FABD }; +// 'Extra' NEON scalar instructions with three same-type operands. +enum NEONScalar3SameExtraOp { + NEONScalar3SameExtraFixed = 0x5E008400, + NEONScalar3SameExtraFMask = 0xDF208400, + NEONScalar3SameExtraMask = 0xFF20FC00, + NEON_SQRDMLAH_scalar = NEON_Q | NEONScalar | NEON_SQRDMLAH, + NEON_SQRDMLSH_scalar = NEON_Q | NEONScalar | NEON_SQRDMLSH +}; + // NEON scalar instructions with three different-type operands. enum NEONScalar3DiffOp { NEONScalar3DiffFixed = 0x5E200000, @@ -2072,6 +2087,10 @@ enum NEONScalarByIndexedElementOp { NEON_SQDMULH_byelement_scalar = NEON_Q | NEONScalar | NEON_SQDMULH_byelement, NEON_SQRDMULH_byelement_scalar = NEON_Q | NEONScalar | NEON_SQRDMULH_byelement, + NEON_SQRDMLAH_byelement_scalar + = NEON_Q | NEONScalar | NEON_SQRDMLAH_byelement, + NEON_SQRDMLSH_byelement_scalar + = NEON_Q | NEONScalar | NEON_SQRDMLSH_byelement, // Floating point instructions. NEONScalarByIndexedElementFPFixed diff --git a/src/aarch64/decoder-aarch64.cc b/src/aarch64/decoder-aarch64.cc index 4dc77212..fe1cb08c 100644 --- a/src/aarch64/decoder-aarch64.cc +++ b/src/aarch64/decoder-aarch64.cc @@ -756,8 +756,9 @@ void Decoder::DecodeNEONVectorDataProcessing(const Instruction* instr) { } } else if (instr->ExtractBit(10) == 0) { VisitUnallocated(instr); - } else if (instr->ExtractBits(14, 11) <= 0xE && - instr->ExtractBits(14, 11) >= 0x8) { + } else if ((instr->ExtractBits(14, 11) <= 0xE && + instr->ExtractBits(14, 11) >= 0x8) || + instr->ExtractBits(14, 12) == 0x0) { VisitNEON3SameExtra(instr); } else if (instr->ExtractBits(13, 11) < 0x4) { VisitUnimplemented(instr); @@ -839,7 +840,15 @@ void Decoder::DecodeNEONScalarDataProcessing(const Instruction* instr) { } } } else { - VisitUnallocated(instr); + if (instr->ExtractBit(29) == 0) { + VisitUnallocated(instr); + } else { + if (instr->ExtractBit(10) == 0) { + VisitUnallocated(instr); + } else { + VisitNEONScalar3SameExtra(instr); + } + } } } else { if (instr->ExtractBit(10) == 0) { diff --git a/src/aarch64/decoder-aarch64.h b/src/aarch64/decoder-aarch64.h index 05c86a33..4a21b9f5 100644 --- a/src/aarch64/decoder-aarch64.h +++ b/src/aarch64/decoder-aarch64.h @@ -99,6 +99,7 @@ V(NEONScalar2RegMisc) \ V(NEONScalar3Diff) \ V(NEONScalar3Same) \ + V(NEONScalar3SameExtra) \ V(NEONScalarByIndexedElement) \ V(NEONScalarCopy) \ V(NEONScalarPairwise) \ diff --git a/src/aarch64/disasm-aarch64.cc b/src/aarch64/disasm-aarch64.cc index 126ab273..defa07f8 100644 --- a/src/aarch64/disasm-aarch64.cc +++ b/src/aarch64/disasm-aarch64.cc @@ -2335,17 +2335,32 @@ void Disassembler::VisitNEON3SameExtra(const Instruction *instr) { NEONFormatDecoder nfd(instr); - switch (instr->Mask(NEON3SameExtraMask)) { - case NEON_FCMLA: - mnemonic = "fcmla"; - form = "'Vd.%s, 'Vn.%s, 'Vm.%s, 'IVFCNM"; - break; - case NEON_FCADD: - mnemonic = "fcadd"; - form = "'Vd.%s, 'Vn.%s, 'Vm.%s, 'IVFCNA"; - break; - default: - form = "(NEON3SameExtra)"; + if (instr->Mask(NEON3SameExtraFCFMask) == NEON3SameExtraFCFixed) { + switch (instr->Mask(NEON3SameExtraFCMask)) { + case NEON_FCMLA: + mnemonic = "fcmla"; + form = "'Vd.%s, 'Vn.%s, 'Vm.%s, 'IVFCNM"; + break; + case NEON_FCADD: + mnemonic = "fcadd"; + form = "'Vd.%s, 'Vn.%s, 'Vm.%s, 'IVFCNA"; + break; + default: + form = "(NEON3SameExtra)"; + } + } else { + switch (instr->Mask(NEON3SameExtraMask)) { + case NEON_SQRDMLAH: + mnemonic = "sqrdmlah"; + form = "'Vd.%s, 'Vn.%s, 'Vm.%s"; + break; + case NEON_SQRDMLSH: + mnemonic = "sqrdmlsh"; + form = "'Vd.%s, 'Vn.%s, 'Vm.%s"; + break; + default: + form = "(NEON3SameExtra)"; + } } Format(instr, mnemonic, nfd.Substitute(form)); @@ -2594,6 +2609,12 @@ void Disassembler::VisitNEONByIndexedElement(const Instruction *instr) { case NEON_SQRDMULH_byelement: mnemonic = "sqrdmulh"; break; + case NEON_SQRDMLAH_byelement: + mnemonic = "sqrdmlah"; + break; + case NEON_SQRDMLSH_byelement: + mnemonic = "sqrdmlsh"; + break; default: switch (instr->Mask(NEONByIndexedElementFPMask)) { case NEON_FMUL_byelement: @@ -2622,6 +2643,7 @@ void Disassembler::VisitNEONByIndexedElement(const Instruction *instr) { } } } + if (l_instr) { Format(instr, nfd.Mnemonic(mnemonic), nfd.Substitute(form)); } else if (fp_instr) { @@ -3576,6 +3598,25 @@ void Disassembler::VisitNEONScalar3Same(const Instruction *instr) { } +void Disassembler::VisitNEONScalar3SameExtra(const Instruction *instr) { + const char *mnemonic = "unimplemented"; + const char *form = "%sd, %sn, %sm"; + NEONFormatDecoder nfd(instr, NEONFormatDecoder::ScalarFormatMap()); + + switch (instr->Mask(NEONScalar3SameExtraMask)) { + case NEON_SQRDMLAH_scalar: + mnemonic = "sqrdmlah"; + break; + case NEON_SQRDMLSH_scalar: + mnemonic = "sqrdmlsh"; + break; + default: + form = "(NEONScalar3SameExtra)"; + } + Format(instr, mnemonic, nfd.SubstitutePlaceholders(form)); +} + + void Disassembler::VisitNEONScalarByIndexedElement(const Instruction *instr) { const char *mnemonic = "unimplemented"; const char *form = "%sd, %sn, 'Ve.%s['IVByElemIndex]"; @@ -3601,6 +3642,12 @@ void Disassembler::VisitNEONScalarByIndexedElement(const Instruction *instr) { case NEON_SQRDMULH_byelement_scalar: mnemonic = "sqrdmulh"; break; + case NEON_SQRDMLAH_byelement_scalar: + mnemonic = "sqrdmlah"; + break; + case NEON_SQRDMLSH_byelement_scalar: + mnemonic = "sqrdmlsh"; + break; default: nfd.SetFormatMap(0, nfd.FPScalarFormatMap()); switch (instr->Mask(NEONScalarByIndexedElementFPMask)) { diff --git a/src/aarch64/instrument-aarch64.cc b/src/aarch64/instrument-aarch64.cc index c664b50c..3882e8ee 100644 --- a/src/aarch64/instrument-aarch64.cc +++ b/src/aarch64/instrument-aarch64.cc @@ -789,6 +789,14 @@ void Instrument::VisitNEONScalar3Same(const Instruction* instr) { } +void Instrument::VisitNEONScalar3SameExtra(const Instruction* instr) { + USE(instr); + Update(); + static Counter* counter = GetCounter("NEON"); + counter->Increment(); +} + + void Instrument::VisitNEONScalarByIndexedElement(const Instruction* instr) { USE(instr); Update(); diff --git a/src/aarch64/logic-aarch64.cc b/src/aarch64/logic-aarch64.cc index fc7d234b..15fac2cb 100644 --- a/src/aarch64/logic-aarch64.cc +++ b/src/aarch64/logic-aarch64.cc @@ -1124,6 +1124,28 @@ LogicVRegister Simulator::sqrdmulh(VectorFormat vform, } +LogicVRegister Simulator::sqrdmlah(VectorFormat vform, + LogicVRegister dst, + const LogicVRegister& src1, + const LogicVRegister& src2, + int index) { + SimVRegister temp; + VectorFormat indexform = VectorFormatFillQ(vform); + return sqrdmlah(vform, dst, src1, dup_element(indexform, temp, src2, index)); +} + + +LogicVRegister Simulator::sqrdmlsh(VectorFormat vform, + LogicVRegister dst, + const LogicVRegister& src1, + const LogicVRegister& src2, + int index) { + SimVRegister temp; + VectorFormat indexform = VectorFormatFillQ(vform); + return sqrdmlsh(vform, dst, src1, dup_element(indexform, temp, src2, index)); +} + + uint16_t Simulator::PolynomialMult(uint8_t op1, uint8_t op2) const { uint16_t result = 0; uint16_t extended_op2 = op2; @@ -3549,6 +3571,62 @@ LogicVRegister Simulator::sqrdmulh(VectorFormat vform, } +LogicVRegister Simulator::sqrdmlash(VectorFormat vform, + LogicVRegister dst, + const LogicVRegister& src1, + const LogicVRegister& src2, + bool round, + bool sub_op) { + // 2 * INT_32_MIN * INT_32_MIN causes int64_t to overflow. + // To avoid this, we use: + // (dst << (esize - 1) + src1 * src2 + 1 << (esize - 2)) >> (esize - 1) + // which is same as: + // (dst << esize + 2 * src1 * src2 + 1 << (esize - 1)) >> esize. + + int esize = LaneSizeInBitsFromFormat(vform); + int round_const = round ? (1 << (esize - 2)) : 0; + int64_t accum; + + dst.ClearForWrite(vform); + for (int i = 0; i < LaneCountFromFormat(vform); i++) { + accum = dst.Int(vform, i) << (esize - 1); + if (sub_op) { + accum -= src1.Int(vform, i) * src2.Int(vform, i); + } else { + accum += src1.Int(vform, i) * src2.Int(vform, i); + } + accum += round_const; + accum = accum >> (esize - 1); + + if (accum > MaxIntFromFormat(vform)) { + accum = MaxIntFromFormat(vform); + } else if (accum < MinIntFromFormat(vform)) { + accum = MinIntFromFormat(vform); + } + dst.SetInt(vform, i, accum); + } + return dst; +} + + +LogicVRegister Simulator::sqrdmlah(VectorFormat vform, + LogicVRegister dst, + const LogicVRegister& src1, + const LogicVRegister& src2, + bool round) { + return sqrdmlash(vform, dst, src1, src2, round, false); +} + + +LogicVRegister Simulator::sqrdmlsh(VectorFormat vform, + LogicVRegister dst, + const LogicVRegister& src1, + const LogicVRegister& src2, + bool round) { + return sqrdmlash(vform, dst, src1, src2, round, true); +} + + LogicVRegister Simulator::sqdmulh(VectorFormat vform, LogicVRegister dst, const LogicVRegister& src1, diff --git a/src/aarch64/macro-assembler-aarch64.h b/src/aarch64/macro-assembler-aarch64.h index 5255872a..acb65ed9 100644 --- a/src/aarch64/macro-assembler-aarch64.h +++ b/src/aarch64/macro-assembler-aarch64.h @@ -2236,6 +2236,8 @@ class MacroAssembler : public Assembler, public MacroAssemblerInterface { V(sqdmull, Sqdmull) \ V(sqdmull2, Sqdmull2) \ V(sqrdmulh, Sqrdmulh) \ + V(sqrdmlah, Sqrdmlah) \ + V(sqrdmlsh, Sqrdmlsh) \ V(sqrshl, Sqrshl) \ V(sqshl, Sqshl) \ V(sqsub, Sqsub) \ @@ -2410,6 +2412,8 @@ class MacroAssembler : public Assembler, public MacroAssemblerInterface { V(mls, Mls) \ V(sqdmulh, Sqdmulh) \ V(sqrdmulh, Sqrdmulh) \ + V(sqrdmlah, Sqrdmlah) \ + V(sqrdmlsh, Sqrdmlsh) \ V(sqdmull, Sqdmull) \ V(sqdmull2, Sqdmull2) \ V(sqdmlal, Sqdmlal) \ diff --git a/src/aarch64/simulator-aarch64.cc b/src/aarch64/simulator-aarch64.cc index 1f7a5154..ab08d28f 100644 --- a/src/aarch64/simulator-aarch64.cc +++ b/src/aarch64/simulator-aarch64.cc @@ -3514,18 +3514,32 @@ void Simulator::VisitNEON3SameExtra(const Instruction* instr) { SimVRegister& rm = ReadVRegister(instr->GetRm()); int rot = 0; VectorFormat vf = nfd.GetVectorFormat(); - switch (instr->Mask(NEON3SameExtraMask)) { - case NEON_FCADD: - rot = instr->GetImmRotFcadd(); - fcadd(vf, rd, rn, rm, rot); - break; - case NEON_FCMLA: - rot = instr->GetImmRotFcmlaVec(); - fcmla(vf, rd, rn, rm, rot); - break; - default: - VIXL_UNIMPLEMENTED(); - break; + if (instr->Mask(NEON3SameExtraFCFMask) == NEON3SameExtraFCFixed) { + switch (instr->Mask(NEON3SameExtraFCMask)) { + case NEON_FCADD: + rot = instr->GetImmRotFcadd(); + fcadd(vf, rd, rn, rm, rot); + break; + case NEON_FCMLA: + rot = instr->GetImmRotFcmlaVec(); + fcmla(vf, rd, rn, rm, rot); + break; + default: + VIXL_UNIMPLEMENTED(); + break; + } + } else { + switch (instr->Mask(NEON3SameExtraMask)) { + case NEON_SQRDMLAH: + sqrdmlah(vf, rd, rn, rm); + break; + case NEON_SQRDMLSH: + sqrdmlsh(vf, rd, rn, rm); + break; + default: + VIXL_UNIMPLEMENTED(); + break; + } } } @@ -3798,6 +3812,14 @@ void Simulator::VisitNEONByIndexedElement(const Instruction* instr) { Op = &Simulator::sqrdmulh; vf = vf_r; break; + case NEON_SQRDMLAH_byelement: + Op = &Simulator::sqrdmlah; + vf = vf_r; + break; + case NEON_SQRDMLSH_byelement: + Op = &Simulator::sqrdmlsh; + vf = vf_r; + break; case NEON_SMULL_byelement: if (instr->Mask(NEON_Q)) { Op = &Simulator::smull2; @@ -4753,6 +4775,26 @@ void Simulator::VisitNEONScalar3Same(const Instruction* instr) { } +void Simulator::VisitNEONScalar3SameExtra(const Instruction* instr) { + NEONFormatDecoder nfd(instr, NEONFormatDecoder::ScalarFormatMap()); + VectorFormat vf = nfd.GetVectorFormat(); + + SimVRegister& rd = ReadVRegister(instr->GetRd()); + SimVRegister& rn = ReadVRegister(instr->GetRn()); + SimVRegister& rm = ReadVRegister(instr->GetRm()); + + switch (instr->Mask(NEONScalar3SameExtraMask)) { + case NEON_SQRDMLAH_scalar: + sqrdmlah(vf, rd, rn, rm); + break; + case NEON_SQRDMLSH_scalar: + sqrdmlsh(vf, rd, rn, rm); + break; + default: + VIXL_UNIMPLEMENTED(); + } +} + void Simulator::VisitNEONScalarByIndexedElement(const Instruction* instr) { NEONFormatDecoder nfd(instr, NEONFormatDecoder::LongScalarFormatMap()); VectorFormat vf = nfd.GetVectorFormat(); @@ -4787,6 +4829,14 @@ void Simulator::VisitNEONScalarByIndexedElement(const Instruction* instr) { Op = &Simulator::sqrdmulh; vf = vf_r; break; + case NEON_SQRDMLAH_byelement_scalar: + Op = &Simulator::sqrdmlah; + vf = vf_r; + break; + case NEON_SQRDMLSH_byelement_scalar: + Op = &Simulator::sqrdmlsh; + vf = vf_r; + break; default: vf = nfd.GetVectorFormat(nfd.FPScalarFormatMap()); index = instr->GetNEONH(); diff --git a/src/aarch64/simulator-aarch64.h b/src/aarch64/simulator-aarch64.h index 813cee1a..c8cbffb3 100644 --- a/src/aarch64/simulator-aarch64.h +++ b/src/aarch64/simulator-aarch64.h @@ -2173,6 +2173,16 @@ class Simulator : public DecoderVisitor { const LogicVRegister& src1, const LogicVRegister& src2, int index); + LogicVRegister sqrdmlah(VectorFormat vform, + LogicVRegister dst, + const LogicVRegister& src1, + const LogicVRegister& src2, + int index); + LogicVRegister sqrdmlsh(VectorFormat vform, + LogicVRegister dst, + const LogicVRegister& src1, + const LogicVRegister& src2, + int index); LogicVRegister sub(VectorFormat vform, LogicVRegister dst, const LogicVRegister& src1, @@ -2755,6 +2765,22 @@ class Simulator : public DecoderVisitor { const LogicVRegister& src1, const LogicVRegister& src2, bool round = true); + LogicVRegister sqrdmlash(VectorFormat vform, + LogicVRegister dst, + const LogicVRegister& src1, + const LogicVRegister& src2, + bool round = true, + bool sub_op = false); + LogicVRegister sqrdmlah(VectorFormat vform, + LogicVRegister dst, + const LogicVRegister& src1, + const LogicVRegister& src2, + bool round = true); + LogicVRegister sqrdmlsh(VectorFormat vform, + LogicVRegister dst, + const LogicVRegister& src1, + const LogicVRegister& src2, + bool round = true); LogicVRegister sqdmulh(VectorFormat vform, LogicVRegister dst, const LogicVRegister& src1, diff --git a/test/aarch64/test-assembler-aarch64.cc b/test/aarch64/test-assembler-aarch64.cc index 0491a4e3..9cbe2f64 100644 --- a/test/aarch64/test-assembler-aarch64.cc +++ b/test/aarch64/test-assembler-aarch64.cc @@ -17256,6 +17256,135 @@ TEST(neon_byelement_sqdmulh_sqrdmulh) { TEARDOWN(); } +TEST(neon_3same_sqrdmlah) { + SETUP(); + + START(); + + __ Movi(v0.V2D(), 0x0000000000000000, 0x0000040004008000); + __ Movi(v1.V2D(), 0x0000000000000000, 0x0000002000108000); + __ Movi(v2.V2D(), 0x0400000080000000, 0x0400000080000000); + __ Movi(v3.V2D(), 0x0000002080000000, 0x0000001080000000); + + __ Movi(v16.V2D(), 0x0000040004008000, 0x0000040004008000); + __ Movi(v17.V2D(), 0x0000000000000000, 0x0000002000108000); + __ Movi(v18.V2D(), 0x0400000080000000, 0x0400000080000000); + __ Movi(v19.V2D(), 0x0000002080000000, 0x0000001080000000); + + __ Sqrdmlah(v16.V4H(), v0.V4H(), v1.V4H()); + __ Sqrdmlah(v17.V4S(), v2.V4S(), v3.V4S()); + __ Sqrdmlah(h18, h0, h1); + __ Sqrdmlah(s19, s2, s3); + + END(); + +// TODO: test on real hardware when available +#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64 + RUN(); + ASSERT_EQUAL_128(0, 0x0000040104010000, q16); + ASSERT_EQUAL_128(0x000000017fffffff, 0x000000217fffffff, q17); + ASSERT_EQUAL_128(0, 0x7fff, q18); + ASSERT_EQUAL_128(0, 0, q19); +#endif + TEARDOWN(); +} + +TEST(neon_byelement_sqrdmlah) { + SETUP(); + + START(); + + __ Movi(v0.V2D(), 0x0000000000000000, 0x0000040004008000); + __ Movi(v1.V2D(), 0x0000000000000000, 0x0000002000108000); + __ Movi(v2.V2D(), 0x0400000080000000, 0x0400000080000000); + __ Movi(v3.V2D(), 0x0000002080000000, 0x0000001080000000); + + __ Movi(v16.V2D(), 0x0000040004008000, 0x0000040004008000); + __ Movi(v17.V2D(), 0x0000000000000000, 0x0000002000108000); + __ Movi(v18.V2D(), 0x0400000080000000, 0x0400000080000000); + __ Movi(v19.V2D(), 0x0000002080000000, 0x0000001080000000); + + __ Sqrdmlah(v16.V4H(), v0.V4H(), v1.H(), 1); + __ Sqrdmlah(v17.V4S(), v2.V4S(), v3.S(), 1); + __ Sqrdmlah(h18, h0, v1.H(), 0); + __ Sqrdmlah(s19, s2, v3.S(), 0); + + END(); + +#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64 + RUN(); + ASSERT_EQUAL_128(0, 0x0000040104018000, q16); + ASSERT_EQUAL_128(0x00000001fffffff0, 0x0000002100107ff0, q17); + ASSERT_EQUAL_128(0, 0x7fff, q18); + ASSERT_EQUAL_128(0, 0, q19); +#endif + TEARDOWN(); +} + +TEST(neon_3same_sqrdmlsh) { + SETUP(); + + START(); + + __ Movi(v0.V2D(), 0x0000000000000000, 0x0000040004000500); + __ Movi(v1.V2D(), 0x0000000000000000, 0x0000002000100080); + __ Movi(v2.V2D(), 0x0400000080000000, 0x0400000080000000); + __ Movi(v3.V2D(), 0x0000002080000000, 0x0000001080000000); + + __ Movi(v16.V2D(), 0x4000400040004000, 0x4000400040004000); + __ Movi(v17.V2D(), 0x4000400040004000, 0x4000400040004000); + __ Movi(v18.V2D(), 0x4000400040004000, 0x4000400040004000); + __ Movi(v19.V2D(), 0x4000400040004000, 0x4000400040004000); + + __ Sqrdmlsh(v16.V4H(), v0.V4H(), v1.V4H()); + __ Sqrdmlsh(v17.V4S(), v2.V4S(), v3.V4S()); + __ Sqrdmlsh(h18, h0, h1); + __ Sqrdmlsh(s19, s2, s3); + + END(); + +#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64 + RUN(); + ASSERT_EQUAL_128(0, 0x40003fff40003ffb, q16); + ASSERT_EQUAL_128(0x40003fffc0004000, 0x40004000c0004000, q17); + ASSERT_EQUAL_128(0, 0x3ffb, q18); + ASSERT_EQUAL_128(0, 0xc0004000, q19); +#endif + TEARDOWN(); +} + +TEST(neon_byelement_sqrdmlsh) { + SETUP(); + + START(); + + __ Movi(v0.V2D(), 0x0000000000000000, 0x0000040004008000); + __ Movi(v1.V2D(), 0x0000000000000000, 0x0000002000108000); + __ Movi(v2.V2D(), 0x0400000080000000, 0x0400000080000000); + __ Movi(v3.V2D(), 0x0000002080000000, 0x0000001080000000); + + __ Movi(v16.V2D(), 0x4000400040004000, 0x4000400040004000); + __ Movi(v17.V2D(), 0x4000400040004000, 0x4000400040004000); + __ Movi(v18.V2D(), 0x4000400040004000, 0x4000400040004000); + __ Movi(v19.V2D(), 0x4000400040004000, 0x4000400040004000); + + __ Sqrdmlsh(v16.V4H(), v0.V4H(), v1.H(), 1); + __ Sqrdmlsh(v17.V4S(), v2.V4S(), v3.S(), 1); + __ Sqrdmlsh(h18, h0, v1.H(), 0); + __ Sqrdmlsh(s19, s2, v3.S(), 0); + + END(); + +#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64 + RUN(); + ASSERT_EQUAL_128(0, 0x4000400040004010, q16); + ASSERT_EQUAL_128(0x4000400040004010, 0x4000400040004010, q17); + ASSERT_EQUAL_128(0, 0xc000, q18); + ASSERT_EQUAL_128(0, 0xc0004000, q19); +#endif + TEARDOWN(); +} + TEST(neon_2regmisc_saddlp) { SETUP(); diff --git a/test/aarch64/test-disasm-aarch64.cc b/test/aarch64/test-disasm-aarch64.cc index f18c4427..616e3555 100644 --- a/test/aarch64/test-disasm-aarch64.cc +++ b/test/aarch64/test-disasm-aarch64.cc @@ -4504,6 +4504,18 @@ TEST(neon_3same) { NEON_FORMAT_LIST_HS(DISASM_INST) #undef DISASM_INST +#define DISASM_INST(M, S) \ + COMPARE_MACRO(Sqrdmlah(v1.M, v2.M, v3.M), \ + "sqrdmlah v1." S ", v2." S ", v3." S); + NEON_FORMAT_LIST_HS(DISASM_INST) +#undef DISASM_INST + +#define DISASM_INST(M, S) \ + COMPARE_MACRO(Sqrdmlsh(v1.M, v2.M, v3.M), \ + "sqrdmlsh v1." S ", v2." S ", v3." S); + NEON_FORMAT_LIST_HS(DISASM_INST) +#undef DISASM_INST + COMPARE_MACRO(And(v6.V8B(), v7.V8B(), v8.V8B()), "and v6.8b, v7.8b, v8.8b"); COMPARE_MACRO(And(v6.V16B(), v7.V16B(), v8.V16B()), "and v6.16b, v7.16b, v8.16b"); @@ -4776,6 +4788,10 @@ TEST(neon_scalar_3same) { COMPARE_MACRO(Sqdmulh(v15.H(), v16.H(), v17.H()), "sqdmulh h15, h16, h17"); COMPARE_MACRO(Sqrdmulh(v12.S(), v13.S(), v14.S()), "sqrdmulh s12, s13, s14"); COMPARE_MACRO(Sqrdmulh(v15.H(), v16.H(), v17.H()), "sqrdmulh h15, h16, h17"); + COMPARE_MACRO(Sqrdmlah(v12.S(), v13.S(), v14.S()), "sqrdmlah s12, s13, s14"); + COMPARE_MACRO(Sqrdmlah(v15.H(), v16.H(), v17.H()), "sqrdmlah h15, h16, h17"); + COMPARE_MACRO(Sqrdmlsh(v12.S(), v13.S(), v14.S()), "sqrdmlsh s12, s13, s14"); + COMPARE_MACRO(Sqrdmlsh(v15.H(), v16.H(), v17.H()), "sqrdmlsh h15, h16, h17"); #define DISASM_INST(M, R) \ COMPARE_MACRO(Uqadd(v6.M, v7.M, v8.M), "uqadd " R "6, " R "7, " R "8"); @@ -4885,6 +4901,28 @@ TEST(neon_byelement) { COMPARE_MACRO(Sqrdmulh(h0, h1, v2.H(), 0), "sqrdmulh h0, h1, v2.h[0]"); COMPARE_MACRO(Sqrdmulh(s0, s1, v2.S(), 0), "sqrdmulh s0, s1, v2.s[0]"); + COMPARE_MACRO(Sqrdmlah(v0.V4H(), v1.V4H(), v2.H(), 0), + "sqrdmlah v0.4h, v1.4h, v2.h[0]"); + COMPARE_MACRO(Sqrdmlah(v2.V8H(), v3.V8H(), v15.H(), 7), + "sqrdmlah v2.8h, v3.8h, v15.h[7]"); + COMPARE_MACRO(Sqrdmlah(v0.V2S(), v1.V2S(), v2.S(), 0), + "sqrdmlah v0.2s, v1.2s, v2.s[0]"); + COMPARE_MACRO(Sqrdmlah(v2.V4S(), v3.V4S(), v15.S(), 3), + "sqrdmlah v2.4s, v3.4s, v15.s[3]"); + COMPARE_MACRO(Sqrdmlah(h0, h1, v2.H(), 0), "sqrdmlah h0, h1, v2.h[0]"); + COMPARE_MACRO(Sqrdmlah(s0, s1, v2.S(), 0), "sqrdmlah s0, s1, v2.s[0]"); + + COMPARE_MACRO(Sqrdmlsh(v0.V4H(), v1.V4H(), v2.H(), 0), + "sqrdmlsh v0.4h, v1.4h, v2.h[0]"); + COMPARE_MACRO(Sqrdmlsh(v2.V8H(), v3.V8H(), v15.H(), 7), + "sqrdmlsh v2.8h, v3.8h, v15.h[7]"); + COMPARE_MACRO(Sqrdmlsh(v0.V2S(), v1.V2S(), v2.S(), 0), + "sqrdmlsh v0.2s, v1.2s, v2.s[0]"); + COMPARE_MACRO(Sqrdmlsh(v2.V4S(), v3.V4S(), v15.S(), 3), + "sqrdmlsh v2.4s, v3.4s, v15.s[3]"); + COMPARE_MACRO(Sqrdmlsh(h0, h1, v2.H(), 0), "sqrdmlsh h0, h1, v2.h[0]"); + COMPARE_MACRO(Sqrdmlsh(s0, s1, v2.S(), 0), "sqrdmlsh s0, s1, v2.s[0]"); + COMPARE_MACRO(Smull(v0.V4S(), v1.V4H(), v2.H(), 0), "smull v0.4s, v1.4h, v2.h[0]"); COMPARE_MACRO(Smull2(v2.V4S(), v3.V8H(), v4.H(), 7), |