Improve SIMD & FP constant materialization (#74) * Fix a code generation issue inside the MacroAssembler::Movi64bitHelper() method that could set the upper 64 bits of a vector register to an incorrect value instead of 0 * Reduce the instructions necessary to materialize a vector constant by 2 when the upper 64 bits are 0, while the lower ones aren't * Restructure the code paths for the immediate forms of FMOV, so that the common case, 0, is handled first

commit: 279f08b57b2e7ebb38f60486e65319e82d12197a [log] [tgz]
author: Anton Kirilov <anton.kirilov@arm.com> Tue Jun 20 10:55:10 2023 +0100
committer: GitHub <noreply@github.com> Tue Jun 20 10:55:10 2023 +0100
tree: b52de26f8cc8e40d113ea64706ab679ba07a1ffc
parent: 02c670693338bfe55992bdc857808f5122bd45f3 [diff]
diff --git a/src/aarch64/assembler-aarch64.cc b/src/aarch64/assembler-aarch64.cc
index 604ec46..f7fbd5e 100644
--- a/src/aarch64/assembler-aarch64.cc
+++ b/src/aarch64/assembler-aarch64.cc

@@ -3318,7 +3318,8 @@
 
 
 void Assembler::fmov(const VRegister& vd, const Register& rn) {
-  VIXL_ASSERT(CPUHas(CPUFeatures::kFP));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kFP) ||
+              (vd.Is1D() && CPUHas(CPUFeatures::kNEON)));
   VIXL_ASSERT(vd.Is1H() || vd.Is1S() || vd.Is1D());
   VIXL_ASSERT((vd.GetSizeInBits() == rn.GetSizeInBits()) || vd.Is1H());
   FPIntegerConvertOp op;
@@ -5847,9 +5848,9 @@
 
 
 uint32_t Assembler::FP32ToImm8(float imm) {
-  VIXL_ASSERT(IsImmFP32(imm));
   // bits: aBbb.bbbc.defg.h000.0000.0000.0000.0000
   uint32_t bits = FloatToRawbits(imm);
+  VIXL_ASSERT(IsImmFP32(bits));
   // bit7: a000.0000
   uint32_t bit7 = ((bits >> 31) & 0x1) << 7;
   // bit6: 0b00.0000
@@ -5865,10 +5866,10 @@
 
 
 uint32_t Assembler::FP64ToImm8(double imm) {
-  VIXL_ASSERT(IsImmFP64(imm));
   // bits: aBbb.bbbb.bbcd.efgh.0000.0000.0000.0000
   //       0000.0000.0000.0000.0000.0000.0000.0000
   uint64_t bits = DoubleToRawbits(imm);
+  VIXL_ASSERT(IsImmFP64(bits));
   // bit7: a000.0000
   uint64_t bit7 = ((bits >> 63) & 0x1) << 7;
   // bit6: 0b00.0000
@@ -6422,10 +6423,9 @@
 }
 
 
-bool Assembler::IsImmFP32(float imm) {
+bool Assembler::IsImmFP32(uint32_t bits) {
   // Valid values will have the form:
   // aBbb.bbbc.defg.h000.0000.0000.0000.0000
-  uint32_t bits = FloatToRawbits(imm);
   // bits[19..0] are cleared.
   if ((bits & 0x7ffff) != 0) {
     return false;
@@ -6446,11 +6446,10 @@
 }
 
 
-bool Assembler::IsImmFP64(double imm) {
+bool Assembler::IsImmFP64(uint64_t bits) {
   // Valid values will have the form:
   // aBbb.bbbb.bbcd.efgh.0000.0000.0000.0000
   // 0000.0000.0000.0000.0000.0000.0000.0000
-  uint64_t bits = DoubleToRawbits(imm);
   // bits[47..0] are cleared.
   if ((bits & 0x0000ffffffffffff) != 0) {
     return false;

diff --git a/src/aarch64/assembler-aarch64.h b/src/aarch64/assembler-aarch64.h
index fbf5a84..1cc53d5 100644
--- a/src/aarch64/assembler-aarch64.h
+++ b/src/aarch64/assembler-aarch64.h

@@ -7516,8 +7516,14 @@
   static bool IsImmAddSub(int64_t immediate);
   static bool IsImmConditionalCompare(int64_t immediate);
   static bool IsImmFP16(Float16 imm);
-  static bool IsImmFP32(float imm);
-  static bool IsImmFP64(double imm);
+
+  static bool IsImmFP32(float imm) { return IsImmFP32(FloatToRawbits(imm)); }
+
+  static bool IsImmFP32(uint32_t bits);
+
+  static bool IsImmFP64(double imm) { return IsImmFP64(DoubleToRawbits(imm)); }
+
+  static bool IsImmFP64(uint64_t bits);
   static bool IsImmLogical(uint64_t value,
                            unsigned width,
                            unsigned* n = NULL,

diff --git a/src/aarch64/cpu-features-auditor-aarch64.cc b/src/aarch64/cpu-features-auditor-aarch64.cc
index e426932..563ee07 100644
--- a/src/aarch64/cpu-features-auditor-aarch64.cc
+++ b/src/aarch64/cpu-features-auditor-aarch64.cc

@@ -507,8 +507,6 @@
 
 void CPUFeaturesAuditor::VisitFPIntegerConvert(const Instruction* instr) {
   RecordInstructionFeaturesScope scope(this);
-  // All of these instructions require FP.
-  scope.Record(CPUFeatures::kFP);
   switch (instr->Mask(FPIntegerConvertMask)) {
     case FCVTAS_wh:
     case FCVTAS_xh:
@@ -538,17 +536,23 @@
     case SCVTF_hx:
     case UCVTF_hw:
     case UCVTF_hx:
+      scope.Record(CPUFeatures::kFP);
       scope.Record(CPUFeatures::kFPHalf);
       return;
+    case FMOV_dx:
+      scope.RecordOneOrBothOf(CPUFeatures::kFP, CPUFeatures::kNEON);
+      return;
     case FMOV_d1_x:
     case FMOV_x_d1:
+      scope.Record(CPUFeatures::kFP);
       scope.Record(CPUFeatures::kNEON);
       return;
     case FJCVTZS:
+      scope.Record(CPUFeatures::kFP);
       scope.Record(CPUFeatures::kJSCVT);
       return;
     default:
-      // No special CPU features.
+      scope.Record(CPUFeatures::kFP);
       return;
   }
 }

diff --git a/src/aarch64/macro-assembler-aarch64.cc b/src/aarch64/macro-assembler-aarch64.cc
index 94e153a..71ad139 100644
--- a/src/aarch64/macro-assembler-aarch64.cc
+++ b/src/aarch64/macro-assembler-aarch64.cc

@@ -1061,7 +1061,7 @@
     Register temp = temps.AcquireX();
     Mov(temp, imm);
     if (vd.Is1D()) {
-      mov(vd.D(), 0, temp);
+      fmov(vd.D(), temp);
     } else {
       dup(vd.V2D(), temp);
     }
@@ -1097,8 +1097,14 @@
 void MacroAssembler::Movi(const VRegister& vd, uint64_t hi, uint64_t lo) {
   // TODO: Move 128-bit values in a more efficient way.
   VIXL_ASSERT(vd.Is128Bits());
-  Movi(vd.V2D(), lo);
-  if (hi != lo) {
+  if (hi == lo) {
+    Movi(vd.V2D(), lo);
+    return;
+  }
+
+  Movi(vd.V1D(), lo);
+
+  if (hi != 0) {
     UseScratchRegisterScope temps(this);
     // TODO: Figure out if using a temporary V register to materialise the
     // immediate is better.
@@ -1538,6 +1544,12 @@
   VIXL_ASSERT(allow_macro_instructions_);
   // Floating point immediates are loaded through the literal pool.
   MacroEmissionCheckScope guard(this);
+  uint64_t rawbits = DoubleToRawbits(imm);
+
+  if (rawbits == 0) {
+    fmov(vd.D(), xzr);
+    return;
+  }
 
   if (vd.Is1H() || vd.Is4H() || vd.Is8H()) {
     Fmov(vd, Float16(imm));
@@ -1550,23 +1562,16 @@
   }
 
   VIXL_ASSERT(vd.Is1D() || vd.Is2D());
-  if (IsImmFP64(imm)) {
+  if (IsImmFP64(rawbits)) {
     fmov(vd, imm);
+  } else if (vd.IsScalar()) {
+    ldr(vd,
+        new Literal<double>(imm,
+                            &literal_pool_,
+                            RawLiteral::kDeletedOnPlacementByPool));
   } else {
-    uint64_t rawbits = DoubleToRawbits(imm);
-    if (vd.IsScalar()) {
-      if (rawbits == 0) {
-        fmov(vd, xzr);
-      } else {
-        ldr(vd,
-            new Literal<double>(imm,
-                                &literal_pool_,
-                                RawLiteral::kDeletedOnPlacementByPool));
-      }
-    } else {
-      // TODO: consider NEON support for load literal.
-      Movi(vd, rawbits);
-    }
+    // TODO: consider NEON support for load literal.
+    Movi(vd, rawbits);
   }
 }
 
@@ -1575,6 +1580,12 @@
   VIXL_ASSERT(allow_macro_instructions_);
   // Floating point immediates are loaded through the literal pool.
   MacroEmissionCheckScope guard(this);
+  uint32_t rawbits = FloatToRawbits(imm);
+
+  if (rawbits == 0) {
+    fmov(vd.S(), wzr);
+    return;
+  }
 
   if (vd.Is1H() || vd.Is4H() || vd.Is8H()) {
     Fmov(vd, Float16(imm));
@@ -1587,23 +1598,16 @@
   }
 
   VIXL_ASSERT(vd.Is1S() || vd.Is2S() || vd.Is4S());
-  if (IsImmFP32(imm)) {
+  if (IsImmFP32(rawbits)) {
     fmov(vd, imm);
+  } else if (vd.IsScalar()) {
+    ldr(vd,
+        new Literal<float>(imm,
+                           &literal_pool_,
+                           RawLiteral::kDeletedOnPlacementByPool));
   } else {
-    uint32_t rawbits = FloatToRawbits(imm);
-    if (vd.IsScalar()) {
-      if (rawbits == 0) {
-        fmov(vd, wzr);
-      } else {
-        ldr(vd,
-            new Literal<float>(imm,
-                               &literal_pool_,
-                               RawLiteral::kDeletedOnPlacementByPool));
-      }
-    } else {
-      // TODO: consider NEON support for load literal.
-      Movi(vd, rawbits);
-    }
+    // TODO: consider NEON support for load literal.
+    Movi(vd, rawbits);
   }
 }
 

diff --git a/src/aarch64/macro-assembler-sve-aarch64.cc b/src/aarch64/macro-assembler-sve-aarch64.cc
index 208f67e..56a504c 100644
--- a/src/aarch64/macro-assembler-sve-aarch64.cc
+++ b/src/aarch64/macro-assembler-sve-aarch64.cc

@@ -832,11 +832,12 @@
       Fdup(zd, static_cast<float>(imm));
       break;
     case kDRegSize:
-      if (IsImmFP64(imm)) {
+      uint64_t bits = DoubleToRawbits(imm);
+      if (IsImmFP64(bits)) {
         SingleEmissionCheckScope guard(this);
         fdup(zd, imm);
       } else {
-        Dup(zd, DoubleToRawbits(imm));
+        Dup(zd, bits);
       }
       break;
   }

diff --git a/test/aarch64/test-assembler-aarch64.cc b/test/aarch64/test-assembler-aarch64.cc
index 58f4fd1..934c109 100644
--- a/test/aarch64/test-assembler-aarch64.cc
+++ b/test/aarch64/test-assembler-aarch64.cc

@@ -14918,5 +14918,26 @@
 #endif
 #endif
 
+TEST(scalar_movi) {
+  SETUP_WITH_FEATURES(CPUFeatures::kFP, CPUFeatures::kNEON);
+  START();
+
+  // Make sure that V0 is initialized to a non-zero value.
+  __ Movi(v0.V16B(), 0xFF);
+  // This constant value can't be encoded in a MOVI instruction,
+  // so the program would use a fallback path that must set the
+  // upper 64 bits of the destination vector to 0.
+  __ Movi(v0.V1D(), 0xDECAFC0FFEE);
+  __ Mov(x0, v0.V2D(), 1);
+
+  END();
+
+  if (CAN_RUN()) {
+    RUN();
+
+    ASSERT_EQUAL_64(0, x0);
+  }
+}
+
 }  // namespace aarch64
 }  // namespace vixl

diff --git a/test/aarch64/test-assembler-sve-aarch64.cc b/test/aarch64/test-assembler-sve-aarch64.cc
index 806a7c5..f34c246 100644
--- a/test/aarch64/test-assembler-sve-aarch64.cc
+++ b/test/aarch64/test-assembler-sve-aarch64.cc

@@ -16274,9 +16274,7 @@
 }
 
 TEST_SVE(sve_fadda) {
-  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
-                          CPUFeatures::kFP,
-                          CPUFeatures::kFPHalf);
+  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kFP);
   START();
 
   __ Ptrue(p0.VnB());

diff --git a/test/aarch64/test-disasm-neon-aarch64.cc b/test/aarch64/test-disasm-neon-aarch64.cc
index 45f9a8f..14dd18a 100644
--- a/test/aarch64/test-disasm-neon-aarch64.cc
+++ b/test/aarch64/test-disasm-neon-aarch64.cc

@@ -3199,7 +3199,7 @@
   COMPARE_MACRO(Movi(v2.V2D(), 0xff00ff00ff00ff, 0xff00ff00ff00ff),
                 "movi v2.2d, #0xff00ff00ff00ff");
   COMPARE_MACRO(Movi(v3.V2D(), 0xffff, 0xff00ff00ff00ff),
-                "movi v3.2d, #0xff00ff00ff00ff\n"
+                "movi d3, #0xff00ff00ff00ff\n"
                 "mov x16, #0xffff\n"
                 "mov v3.d[1], x16");
 

diff --git a/test/test-trace-reference/log-cpufeatures b/test/test-trace-reference/log-cpufeatures
index 0b24590..fd727e6 100644
--- a/test/test-trace-reference/log-cpufeatures
+++ b/test/test-trace-reference/log-cpufeatures

@@ -459,7 +459,7 @@
 0x~~~~~~~~~~~~~~~~  1e7e79d3		fminnm d19, d14, d30                    // Needs: FP
 0x~~~~~~~~~~~~~~~~  1e217820		fminnm s0, s1, s1                       // Needs: FP
 0x~~~~~~~~~~~~~~~~  1e6040cd		fmov d13, d6                            // Needs: FP
-0x~~~~~~~~~~~~~~~~  9e670222		fmov d2, x17                            // Needs: FP
+0x~~~~~~~~~~~~~~~~  9e670222		fmov d2, x17                            // Needs: FP, NEON
 0x~~~~~~~~~~~~~~~~  1e709008		fmov d8, #0x84 (-2.5000)                // Needs: FP
 0x~~~~~~~~~~~~~~~~  1e204065		fmov s5, s3                             // Needs: FP
 0x~~~~~~~~~~~~~~~~  1e270299		fmov s25, w20                           // Needs: FP

diff --git a/test/test-trace-reference/log-cpufeatures-colour b/test/test-trace-reference/log-cpufeatures-colour
index dcbe4d8..94c9c95 100644
--- a/test/test-trace-reference/log-cpufeatures-colour
+++ b/test/test-trace-reference/log-cpufeatures-colour

@@ -459,7 +459,7 @@
 0x~~~~~~~~~~~~~~~~  1e7e79d3		fminnm d19, d14, d30                    [1;35mFP[0;m
 0x~~~~~~~~~~~~~~~~  1e217820		fminnm s0, s1, s1                       [1;35mFP[0;m
 0x~~~~~~~~~~~~~~~~  1e6040cd		fmov d13, d6                            [1;35mFP[0;m
-0x~~~~~~~~~~~~~~~~  9e670222		fmov d2, x17                            [1;35mFP[0;m
+0x~~~~~~~~~~~~~~~~  9e670222		fmov d2, x17                            [1;35mFP, NEON[0;m
 0x~~~~~~~~~~~~~~~~  1e709008		fmov d8, #0x84 (-2.5000)                [1;35mFP[0;m
 0x~~~~~~~~~~~~~~~~  1e204065		fmov s5, s3                             [1;35mFP[0;m
 0x~~~~~~~~~~~~~~~~  1e270299		fmov s25, w20                           [1;35mFP[0;m

diff --git a/test/test-trace-reference/log-cpufeatures-custom b/test/test-trace-reference/log-cpufeatures-custom
index e64e6f7..b46c23e 100644
--- a/test/test-trace-reference/log-cpufeatures-custom
+++ b/test/test-trace-reference/log-cpufeatures-custom

@@ -459,7 +459,7 @@
 0x~~~~~~~~~~~~~~~~  1e7e79d3		fminnm d19, d14, d30                    ### {FP} ###
 0x~~~~~~~~~~~~~~~~  1e217820		fminnm s0, s1, s1                       ### {FP} ###
 0x~~~~~~~~~~~~~~~~  1e6040cd		fmov d13, d6                            ### {FP} ###
-0x~~~~~~~~~~~~~~~~  9e670222		fmov d2, x17                            ### {FP} ###
+0x~~~~~~~~~~~~~~~~  9e670222		fmov d2, x17                            ### {FP, NEON} ###
 0x~~~~~~~~~~~~~~~~  1e709008		fmov d8, #0x84 (-2.5000)                ### {FP} ###
 0x~~~~~~~~~~~~~~~~  1e204065		fmov s5, s3                             ### {FP} ###
 0x~~~~~~~~~~~~~~~~  1e270299		fmov s25, w20                           ### {FP} ###
commit	279f08b57b2e7ebb38f60486e65319e82d12197a	[log] [tgz]
author	Anton Kirilov <anton.kirilov@arm.com>	Tue Jun 20 10:55:10 2023 +0100
committer	GitHub <noreply@github.com>	Tue Jun 20 10:55:10 2023 +0100
tree	b52de26f8cc8e40d113ea64706ab679ba07a1ffc
parent	02c670693338bfe55992bdc857808f5122bd45f3 [diff]