Improve SIMD & FP constant materialization (#74)
* Fix a code generation issue inside the MacroAssembler::Movi64bitHelper()
method that could set the upper 64 bits of a vector register to an
incorrect value instead of 0
* Reduce by 2 the number of instructions needed to materialize a vector
constant whose upper 64 bits are 0 while the lower 64 bits are not
* Restructure the code paths for the immediate forms of FMOV, so
that the common case, 0, is handled first
diff --git a/src/aarch64/assembler-aarch64.cc b/src/aarch64/assembler-aarch64.cc
index 604ec46..f7fbd5e 100644
--- a/src/aarch64/assembler-aarch64.cc
+++ b/src/aarch64/assembler-aarch64.cc
@@ -3318,7 +3318,8 @@
void Assembler::fmov(const VRegister& vd, const Register& rn) {
- VIXL_ASSERT(CPUHas(CPUFeatures::kFP));
+ VIXL_ASSERT(CPUHas(CPUFeatures::kFP) ||
+ (vd.Is1D() && CPUHas(CPUFeatures::kNEON)));
VIXL_ASSERT(vd.Is1H() || vd.Is1S() || vd.Is1D());
VIXL_ASSERT((vd.GetSizeInBits() == rn.GetSizeInBits()) || vd.Is1H());
FPIntegerConvertOp op;
@@ -5847,9 +5848,9 @@
uint32_t Assembler::FP32ToImm8(float imm) {
- VIXL_ASSERT(IsImmFP32(imm));
// bits: aBbb.bbbc.defg.h000.0000.0000.0000.0000
uint32_t bits = FloatToRawbits(imm);
+ VIXL_ASSERT(IsImmFP32(bits));
// bit7: a000.0000
uint32_t bit7 = ((bits >> 31) & 0x1) << 7;
// bit6: 0b00.0000
@@ -5865,10 +5866,10 @@
uint32_t Assembler::FP64ToImm8(double imm) {
- VIXL_ASSERT(IsImmFP64(imm));
// bits: aBbb.bbbb.bbcd.efgh.0000.0000.0000.0000
// 0000.0000.0000.0000.0000.0000.0000.0000
uint64_t bits = DoubleToRawbits(imm);
+ VIXL_ASSERT(IsImmFP64(bits));
// bit7: a000.0000
uint64_t bit7 = ((bits >> 63) & 0x1) << 7;
// bit6: 0b00.0000
@@ -6422,10 +6423,9 @@
}
-bool Assembler::IsImmFP32(float imm) {
+bool Assembler::IsImmFP32(uint32_t bits) {
// Valid values will have the form:
// aBbb.bbbc.defg.h000.0000.0000.0000.0000
- uint32_t bits = FloatToRawbits(imm);
// bits[19..0] are cleared.
if ((bits & 0x7ffff) != 0) {
return false;
@@ -6446,11 +6446,10 @@
}
-bool Assembler::IsImmFP64(double imm) {
+bool Assembler::IsImmFP64(uint64_t bits) {
// Valid values will have the form:
// aBbb.bbbb.bbcd.efgh.0000.0000.0000.0000
// 0000.0000.0000.0000.0000.0000.0000.0000
- uint64_t bits = DoubleToRawbits(imm);
// bits[47..0] are cleared.
if ((bits & 0x0000ffffffffffff) != 0) {
return false;
diff --git a/src/aarch64/assembler-aarch64.h b/src/aarch64/assembler-aarch64.h
index fbf5a84..1cc53d5 100644
--- a/src/aarch64/assembler-aarch64.h
+++ b/src/aarch64/assembler-aarch64.h
@@ -7516,8 +7516,14 @@
static bool IsImmAddSub(int64_t immediate);
static bool IsImmConditionalCompare(int64_t immediate);
static bool IsImmFP16(Float16 imm);
- static bool IsImmFP32(float imm);
- static bool IsImmFP64(double imm);
+
+ static bool IsImmFP32(float imm) { return IsImmFP32(FloatToRawbits(imm)); }
+
+ static bool IsImmFP32(uint32_t bits);
+
+ static bool IsImmFP64(double imm) { return IsImmFP64(DoubleToRawbits(imm)); }
+
+ static bool IsImmFP64(uint64_t bits);
static bool IsImmLogical(uint64_t value,
unsigned width,
unsigned* n = NULL,
diff --git a/src/aarch64/cpu-features-auditor-aarch64.cc b/src/aarch64/cpu-features-auditor-aarch64.cc
index e426932..563ee07 100644
--- a/src/aarch64/cpu-features-auditor-aarch64.cc
+++ b/src/aarch64/cpu-features-auditor-aarch64.cc
@@ -507,8 +507,6 @@
void CPUFeaturesAuditor::VisitFPIntegerConvert(const Instruction* instr) {
RecordInstructionFeaturesScope scope(this);
- // All of these instructions require FP.
- scope.Record(CPUFeatures::kFP);
switch (instr->Mask(FPIntegerConvertMask)) {
case FCVTAS_wh:
case FCVTAS_xh:
@@ -538,17 +536,23 @@
case SCVTF_hx:
case UCVTF_hw:
case UCVTF_hx:
+ scope.Record(CPUFeatures::kFP);
scope.Record(CPUFeatures::kFPHalf);
return;
+ case FMOV_dx:
+ scope.RecordOneOrBothOf(CPUFeatures::kFP, CPUFeatures::kNEON);
+ return;
case FMOV_d1_x:
case FMOV_x_d1:
+ scope.Record(CPUFeatures::kFP);
scope.Record(CPUFeatures::kNEON);
return;
case FJCVTZS:
+ scope.Record(CPUFeatures::kFP);
scope.Record(CPUFeatures::kJSCVT);
return;
default:
- // No special CPU features.
+ scope.Record(CPUFeatures::kFP);
return;
}
}
diff --git a/src/aarch64/macro-assembler-aarch64.cc b/src/aarch64/macro-assembler-aarch64.cc
index 94e153a..71ad139 100644
--- a/src/aarch64/macro-assembler-aarch64.cc
+++ b/src/aarch64/macro-assembler-aarch64.cc
@@ -1061,7 +1061,7 @@
Register temp = temps.AcquireX();
Mov(temp, imm);
if (vd.Is1D()) {
- mov(vd.D(), 0, temp);
+ fmov(vd.D(), temp);
} else {
dup(vd.V2D(), temp);
}
@@ -1097,8 +1097,14 @@
void MacroAssembler::Movi(const VRegister& vd, uint64_t hi, uint64_t lo) {
// TODO: Move 128-bit values in a more efficient way.
VIXL_ASSERT(vd.Is128Bits());
- Movi(vd.V2D(), lo);
- if (hi != lo) {
+ if (hi == lo) {
+ Movi(vd.V2D(), lo);
+ return;
+ }
+
+ Movi(vd.V1D(), lo);
+
+ if (hi != 0) {
UseScratchRegisterScope temps(this);
// TODO: Figure out if using a temporary V register to materialise the
// immediate is better.
@@ -1538,6 +1544,12 @@
VIXL_ASSERT(allow_macro_instructions_);
// Floating point immediates are loaded through the literal pool.
MacroEmissionCheckScope guard(this);
+ uint64_t rawbits = DoubleToRawbits(imm);
+
+ if (rawbits == 0) {
+ fmov(vd.D(), xzr);
+ return;
+ }
if (vd.Is1H() || vd.Is4H() || vd.Is8H()) {
Fmov(vd, Float16(imm));
@@ -1550,23 +1562,16 @@
}
VIXL_ASSERT(vd.Is1D() || vd.Is2D());
- if (IsImmFP64(imm)) {
+ if (IsImmFP64(rawbits)) {
fmov(vd, imm);
+ } else if (vd.IsScalar()) {
+ ldr(vd,
+ new Literal<double>(imm,
+ &literal_pool_,
+ RawLiteral::kDeletedOnPlacementByPool));
} else {
- uint64_t rawbits = DoubleToRawbits(imm);
- if (vd.IsScalar()) {
- if (rawbits == 0) {
- fmov(vd, xzr);
- } else {
- ldr(vd,
- new Literal<double>(imm,
- &literal_pool_,
- RawLiteral::kDeletedOnPlacementByPool));
- }
- } else {
- // TODO: consider NEON support for load literal.
- Movi(vd, rawbits);
- }
+ // TODO: consider NEON support for load literal.
+ Movi(vd, rawbits);
}
}
@@ -1575,6 +1580,12 @@
VIXL_ASSERT(allow_macro_instructions_);
// Floating point immediates are loaded through the literal pool.
MacroEmissionCheckScope guard(this);
+ uint32_t rawbits = FloatToRawbits(imm);
+
+ if (rawbits == 0) {
+ fmov(vd.S(), wzr);
+ return;
+ }
if (vd.Is1H() || vd.Is4H() || vd.Is8H()) {
Fmov(vd, Float16(imm));
@@ -1587,23 +1598,16 @@
}
VIXL_ASSERT(vd.Is1S() || vd.Is2S() || vd.Is4S());
- if (IsImmFP32(imm)) {
+ if (IsImmFP32(rawbits)) {
fmov(vd, imm);
+ } else if (vd.IsScalar()) {
+ ldr(vd,
+ new Literal<float>(imm,
+ &literal_pool_,
+ RawLiteral::kDeletedOnPlacementByPool));
} else {
- uint32_t rawbits = FloatToRawbits(imm);
- if (vd.IsScalar()) {
- if (rawbits == 0) {
- fmov(vd, wzr);
- } else {
- ldr(vd,
- new Literal<float>(imm,
- &literal_pool_,
- RawLiteral::kDeletedOnPlacementByPool));
- }
- } else {
- // TODO: consider NEON support for load literal.
- Movi(vd, rawbits);
- }
+ // TODO: consider NEON support for load literal.
+ Movi(vd, rawbits);
}
}
diff --git a/src/aarch64/macro-assembler-sve-aarch64.cc b/src/aarch64/macro-assembler-sve-aarch64.cc
index 208f67e..56a504c 100644
--- a/src/aarch64/macro-assembler-sve-aarch64.cc
+++ b/src/aarch64/macro-assembler-sve-aarch64.cc
@@ -832,11 +832,12 @@
Fdup(zd, static_cast<float>(imm));
break;
case kDRegSize:
- if (IsImmFP64(imm)) {
+ uint64_t bits = DoubleToRawbits(imm);
+ if (IsImmFP64(bits)) {
SingleEmissionCheckScope guard(this);
fdup(zd, imm);
} else {
- Dup(zd, DoubleToRawbits(imm));
+ Dup(zd, bits);
}
break;
}
diff --git a/test/aarch64/test-assembler-aarch64.cc b/test/aarch64/test-assembler-aarch64.cc
index 58f4fd1..934c109 100644
--- a/test/aarch64/test-assembler-aarch64.cc
+++ b/test/aarch64/test-assembler-aarch64.cc
@@ -14918,5 +14918,26 @@
#endif
#endif
+TEST(scalar_movi) {
+ SETUP_WITH_FEATURES(CPUFeatures::kFP, CPUFeatures::kNEON);
+ START();
+
+ // Make sure that V0 is initialized to a non-zero value.
+ __ Movi(v0.V16B(), 0xFF);
+ // This constant value can't be encoded in a MOVI instruction,
+ // so the program would use a fallback path that must set the
+ // upper 64 bits of the destination vector to 0.
+ __ Movi(v0.V1D(), 0xDECAFC0FFEE);
+ __ Mov(x0, v0.V2D(), 1);
+
+ END();
+
+ if (CAN_RUN()) {
+ RUN();
+
+ ASSERT_EQUAL_64(0, x0);
+ }
+}
+
} // namespace aarch64
} // namespace vixl
diff --git a/test/aarch64/test-assembler-sve-aarch64.cc b/test/aarch64/test-assembler-sve-aarch64.cc
index 806a7c5..f34c246 100644
--- a/test/aarch64/test-assembler-sve-aarch64.cc
+++ b/test/aarch64/test-assembler-sve-aarch64.cc
@@ -16274,9 +16274,7 @@
}
TEST_SVE(sve_fadda) {
- SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
- CPUFeatures::kFP,
- CPUFeatures::kFPHalf);
+ SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kFP);
START();
__ Ptrue(p0.VnB());
diff --git a/test/aarch64/test-disasm-neon-aarch64.cc b/test/aarch64/test-disasm-neon-aarch64.cc
index 45f9a8f..14dd18a 100644
--- a/test/aarch64/test-disasm-neon-aarch64.cc
+++ b/test/aarch64/test-disasm-neon-aarch64.cc
@@ -3199,7 +3199,7 @@
COMPARE_MACRO(Movi(v2.V2D(), 0xff00ff00ff00ff, 0xff00ff00ff00ff),
"movi v2.2d, #0xff00ff00ff00ff");
COMPARE_MACRO(Movi(v3.V2D(), 0xffff, 0xff00ff00ff00ff),
- "movi v3.2d, #0xff00ff00ff00ff\n"
+ "movi d3, #0xff00ff00ff00ff\n"
"mov x16, #0xffff\n"
"mov v3.d[1], x16");
diff --git a/test/test-trace-reference/log-cpufeatures b/test/test-trace-reference/log-cpufeatures
index 0b24590..fd727e6 100644
--- a/test/test-trace-reference/log-cpufeatures
+++ b/test/test-trace-reference/log-cpufeatures
@@ -459,7 +459,7 @@
0x~~~~~~~~~~~~~~~~ 1e7e79d3 fminnm d19, d14, d30 // Needs: FP
0x~~~~~~~~~~~~~~~~ 1e217820 fminnm s0, s1, s1 // Needs: FP
0x~~~~~~~~~~~~~~~~ 1e6040cd fmov d13, d6 // Needs: FP
-0x~~~~~~~~~~~~~~~~ 9e670222 fmov d2, x17 // Needs: FP
+0x~~~~~~~~~~~~~~~~ 9e670222 fmov d2, x17 // Needs: FP, NEON
0x~~~~~~~~~~~~~~~~ 1e709008 fmov d8, #0x84 (-2.5000) // Needs: FP
0x~~~~~~~~~~~~~~~~ 1e204065 fmov s5, s3 // Needs: FP
0x~~~~~~~~~~~~~~~~ 1e270299 fmov s25, w20 // Needs: FP
diff --git a/test/test-trace-reference/log-cpufeatures-colour b/test/test-trace-reference/log-cpufeatures-colour
index dcbe4d8..94c9c95 100644
--- a/test/test-trace-reference/log-cpufeatures-colour
+++ b/test/test-trace-reference/log-cpufeatures-colour
@@ -459,7 +459,7 @@
0x~~~~~~~~~~~~~~~~ 1e7e79d3 fminnm d19, d14, d30 [1;35mFP[0;m
0x~~~~~~~~~~~~~~~~ 1e217820 fminnm s0, s1, s1 [1;35mFP[0;m
0x~~~~~~~~~~~~~~~~ 1e6040cd fmov d13, d6 [1;35mFP[0;m
-0x~~~~~~~~~~~~~~~~ 9e670222 fmov d2, x17 [1;35mFP[0;m
+0x~~~~~~~~~~~~~~~~ 9e670222 fmov d2, x17 [1;35mFP, NEON[0;m
0x~~~~~~~~~~~~~~~~ 1e709008 fmov d8, #0x84 (-2.5000) [1;35mFP[0;m
0x~~~~~~~~~~~~~~~~ 1e204065 fmov s5, s3 [1;35mFP[0;m
0x~~~~~~~~~~~~~~~~ 1e270299 fmov s25, w20 [1;35mFP[0;m
diff --git a/test/test-trace-reference/log-cpufeatures-custom b/test/test-trace-reference/log-cpufeatures-custom
index e64e6f7..b46c23e 100644
--- a/test/test-trace-reference/log-cpufeatures-custom
+++ b/test/test-trace-reference/log-cpufeatures-custom
@@ -459,7 +459,7 @@
0x~~~~~~~~~~~~~~~~ 1e7e79d3 fminnm d19, d14, d30 ### {FP} ###
0x~~~~~~~~~~~~~~~~ 1e217820 fminnm s0, s1, s1 ### {FP} ###
0x~~~~~~~~~~~~~~~~ 1e6040cd fmov d13, d6 ### {FP} ###
-0x~~~~~~~~~~~~~~~~ 9e670222 fmov d2, x17 ### {FP} ###
+0x~~~~~~~~~~~~~~~~ 9e670222 fmov d2, x17 ### {FP, NEON} ###
0x~~~~~~~~~~~~~~~~ 1e709008 fmov d8, #0x84 (-2.5000) ### {FP} ###
0x~~~~~~~~~~~~~~~~ 1e204065 fmov s5, s3 ### {FP} ###
0x~~~~~~~~~~~~~~~~ 1e270299 fmov s25, w20 ### {FP} ###