aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMaxim Kuvyrkov <maxim.kuvyrkov@linaro.org>2018-01-16 09:35:54 +0000
committerMaxim Kuvyrkov <maxim.kuvyrkov@linaro.org>2018-02-14 08:11:46 +0000
commitf7cd0a49635b07f902e4ce8f5323b894f0baef20 (patch)
tree47e1d8c81c681207ab5aa4943f914a120a78bb40
parent2632342128a90974aed618be38050f4b380badbf (diff)
gcc/
Backport from trunk r253780. 2017-10-16 Tamar Christina <tamar.christina@arm.com> * config/arm/arm.h (TARGET_DOTPROD): New. * config/arm/arm.c (arm_arch_dotprod): New. (arm_option_reconfigure_globals): Add arm_arch_dotprod. * config/arm/arm-c.c (__ARM_FEATURE_DOTPROD): New. * config/arm/arm-cpus.in (armv8.2-a): Enabled +dotprod. (feature dotprod, group dotprod, ALL_SIMD_INTERNAL): New. (ALL_FPU_INTERNAL): Use ALL_SIMD_INTERNAL. * config/arm/t-multilib (v8_2_a_simd_variants): Add dotprod. * doc/invoke.texi (armv8.2-a): Document dotprod gcc/ Backport from trunk r253781. 2017-10-16 Tamar Christina <tamar.christina@arm.com> * config/arm/arm-builtins.c (arm_unsigned_uternop_qualifiers): New. (UTERNOP_QUALIFIERS, arm_umac_lane_qualifiers, UMAC_LANE_QUALIFIERS): New. * config/arm/arm_neon_builtins.def (sdot, udot, sdot_lane, udot_lane): new. * config/arm/iterators.md (DOTPROD, VSI2QI, vsi2qi): New. (UNSPEC_DOT_S, UNSPEC_DOT_U, opsuffix): New. * config/arm/neon.md (neon_<sup>dot<vsi2qi>): New. (neon_<sup>dot_lane<vsi2qi>, <sup>dot_prod<vsi2qi>): New. * config/arm/types.md (neon_dot, neon_dot_q): New. * config/arm/unspecs.md (sup): Add UNSPEC_DOT_S, UNSPEC_DOT_U. gcc/ Backport from trunk r253782. 2017-10-16 Tamar Christina <tamar.christina@arm.com> * config/aarch64/aarch64.h (AARCH64_FL_DOTPROD): New. (AARCH64_ISA_DOTPROD, TARGET_DOTPROD): New. * config/aarch64/aarch64-c.c (aarch64_update_cpp_builtins): Add TARGET_DOTPROD. * config/aarch64/aarch64-option-extensions.def (dotprod): New. * config/aarch64/aarch64-cores.def (cortex-a55, cortex-a75): Enable TARGET_DOTPROD. (cortex-a75.cortex-a55): Likewise. * doc/invoke.texi (aarch64-feature-modifiers): Document dotprod. gcc/ Backport from trunk r253783. 2017-10-16 Tamar Christina <tamar.christina@arm.com> * config/aarch64/aarch64-builtins.c (aarch64_types_quadopu_lane_qualifiers): New. (TYPES_QUADOPU_LANE): New. * config/aarch64/aarch64-simd.md (aarch64_<sur>dot<vsi2qi>): New. (<sur>dot_prod<vsi2qi>, aarch64_<sur>dot_lane<vsi2qi>): New. (aarch64_<sur>dot_laneq<vsi2qi>): New. * config/aarch64/aarch64-simd-builtins.def (sdot, udot): New. (sdot_lane, udot_lane, sdot_laneq, udot_laneq): New. * config/aarch64/iterators.md (sur): Add UNSPEC_SDOT, UNSPEC_UDOT. (Vdottype, DOTPROD): New. (sur): Add SDOT and UDOT. gcc/ Backport from trunk r253784. 2017-10-16 Tamar Christina <tamar.christina@arm.com> * config/aarch64/arm_neon.h (vdot_u32, vdotq_u32, vdot_s32, vdotq_s32): New. (vdot_lane_u32, vdot_laneq_u32, vdotq_lane_u32, vdotq_laneq_u32): New. (vdot_lane_s32, vdot_laneq_s32, vdotq_lane_s32, vdotq_laneq_s32): New. gcc/testsuite/ Backport from trunk r253784. 2017-10-16 Tamar Christina <tamar.christina@arm.com> * gcc.target/aarch64/advsimd-intrinsics/vect-dot-qi.h: New. * gcc.target/aarch64/advsimd-intrinsics/vdot-compile.c: New. * gcc.target/aarch64/advsimd-intrinsics/vect-dot-s8.c: New. * gcc.target/aarch64/advsimd-intrinsics/vect-dot-u8.c: New. gcc/testsuite/ Backport from trunk r254097. 2017-10-26 Tamar Christina <tamar.christina@arm.com> * gcc.target/aarch64/advsimd-intrinsics/vect-dot-qi.h: New. * gcc.target/aarch64/advsimd-intrinsics/vdot-compile.c: New. * gcc.target/aarch64/advsimd-intrinsics/vect-dot-s8.c: New. * gcc.target/aarch64/advsimd-intrinsics/vect-dot-u8.c: New. gcc/testsuite/ Backport from trunk r254100. 2017-10-26 Tamar Christina <tamar.christina@arm.com> * lib/target-supports.exp (check_effective_target_arm_v8_2a_dotprod_neon_ok_nocache): New. (check_effective_target_arm_v8_2a_dotprod_neon_ok): New. (add_options_for_arm_v8_2a_dotprod_neon): New. (check_effective_target_arm_v8_2a_dotprod_neon_hw): New. (check_effective_target_vect_sdot_qi): Add ARM && AArch64. (check_effective_target_vect_udot_qi): Likewise. * gcc.target/arm/simd/vdot-exec.c: New. * gcc.target/aarch64/advsimd-intrinsics/vdot-exec.c: New. * gcc/doc/sourcebuild.texi: Document arm_v8_2a_dotprod_neon. gcc/testsuite/ Backport from trunk r254101. 2017-10-26 Tamar Christina <tamar.christina@arm.com> * gcc.dg/vect/vect-reduc-dot-s8a.c (dg-additional-options, dg-require-effective-target): Add +dotprod. * gcc.dg/vect/vect-reduc-dot-u8a.c (dg-additional-options, dg-require-effective-target): Add +dotprod. gcc/ Backport from trunk r254775. 2017-11-15 Tamar Christina <tamar.christina@arm.com> * config/arm/arm.h (TARGET_DOTPROD): Add arm_arch8_2. gcc/ Backport from trunk r255064. 2017-11-22 Tamar Christina <tamar.christina@arm.com> * config/arm/arm_neon.h (vdot_u32, vdotq_u32) (vdot_s32, vdotq_s32): New. (vdot_lane_u32, vdotq_lane_u32): New. (vdot_lane_s32, vdotq_lane_s32): New. gcc/testsuite/ Backport from trunk r255064. 2017-11-22 Tamar Christina <tamar.christina@arm.com> * gcc.target/arm/simd/vdot-compile.c: New. * gcc.target/arm/simd/vect-dot-qi.h: New. * gcc.target/arm/simd/vect-dot-s8.c: New. * gcc.target/arm/simd/vect-dot-u8.c: New gcc/ Backport from trunk r255126. 2017-11-24 Christophe Lyon <christophe.lyon@linaro.org> * config/arm/arm_neon.h: Fix pragma GCC push_options before vdot_u32. Change-Id: I469f803e36949046f8e339cfd9d1556170ce4a90
-rw-r--r--gcc/config/aarch64/aarch64-builtins.c5
-rw-r--r--gcc/config/aarch64/aarch64-c.c1
-rw-r--r--gcc/config/aarch64/aarch64-option-extensions.def8
-rw-r--r--gcc/config/aarch64/aarch64-simd-builtins.def8
-rw-r--r--gcc/config/aarch64/aarch64-simd.md81
-rw-r--r--gcc/config/aarch64/aarch64.h15
-rw-r--r--gcc/config/aarch64/arm_neon.h93
-rw-r--r--gcc/config/aarch64/iterators.md8
-rw-r--r--gcc/config/arm/arm-builtins.c14
-rw-r--r--gcc/config/arm/arm-c.c6
-rw-r--r--gcc/config/arm/arm-cpu-cdata.h14
-rw-r--r--gcc/config/arm/arm-cpu-data.h20
-rw-r--r--gcc/config/arm/arm-cpus.in14
-rw-r--r--gcc/config/arm/arm-isa.h5
-rw-r--r--gcc/config/arm/arm-tables.opt16
-rw-r--r--gcc/config/arm/arm.h6
-rw-r--r--gcc/config/arm/arm_neon.h63
-rw-r--r--gcc/config/arm/arm_neon_builtins.def4
-rw-r--r--gcc/config/arm/iterators.md9
-rw-r--r--gcc/config/arm/neon.md70
-rw-r--r--gcc/config/arm/types.md8
-rw-r--r--gcc/config/arm/unspecs.md2
-rw-r--r--gcc/doc/invoke.texi11
-rw-r--r--gcc/doc/sourcebuild.texi16
-rw-r--r--gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8a.c3
-rw-r--r--gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u8a.c3
-rw-r--r--gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile.c73
-rw-r--r--gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-exec.c81
-rw-r--r--gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vect-dot-qi.h15
-rw-r--r--gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vect-dot-s8.c9
-rw-r--r--gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vect-dot-u8.c9
-rw-r--r--gcc/testsuite/gcc.target/arm/simd/vdot-compile.c56
-rw-r--r--gcc/testsuite/gcc.target/arm/simd/vdot-exec.c55
-rw-r--r--gcc/testsuite/gcc.target/arm/simd/vect-dot-qi.h16
-rw-r--r--gcc/testsuite/gcc.target/arm/simd/vect-dot-s8.c12
-rw-r--r--gcc/testsuite/gcc.target/arm/simd/vect-dot-u8.c12
-rw-r--r--gcc/testsuite/lib/target-supports.exp87
37 files changed, 909 insertions, 19 deletions
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index f09399f4c15..362bc7ddcd3 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -168,6 +168,11 @@ aarch64_types_quadop_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
= { qualifier_none, qualifier_none, qualifier_none,
qualifier_none, qualifier_lane_index };
#define TYPES_QUADOP_LANE (aarch64_types_quadop_lane_qualifiers)
+static enum aarch64_type_qualifiers
+aarch64_types_quadopu_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+ = { qualifier_unsigned, qualifier_unsigned, qualifier_unsigned,
+ qualifier_unsigned, qualifier_lane_index };
+#define TYPES_QUADOPU_LANE (aarch64_types_quadopu_lane_qualifiers)
static enum aarch64_type_qualifiers
aarch64_types_binop_imm_p_qualifiers[SIMD_MAX_BUILTIN_ARGS]
diff --git a/gcc/config/aarch64/aarch64-c.c b/gcc/config/aarch64/aarch64-c.c
index 177e638682f..c7d866f3b56 100644
--- a/gcc/config/aarch64/aarch64-c.c
+++ b/gcc/config/aarch64/aarch64-c.c
@@ -106,6 +106,7 @@ aarch64_update_cpp_builtins (cpp_reader *pfile)
aarch64_def_or_undef (TARGET_CRC32, "__ARM_FEATURE_CRC32", pfile);
+ aarch64_def_or_undef (TARGET_DOTPROD, "__ARM_FEATURE_DOTPROD", pfile);
cpp_undef (pfile, "__AARCH64_CMODEL_TINY__");
cpp_undef (pfile, "__AARCH64_CMODEL_SMALL__");
diff --git a/gcc/config/aarch64/aarch64-option-extensions.def b/gcc/config/aarch64/aarch64-option-extensions.def
index c4f059ab7c5..d7dcad6726c 100644
--- a/gcc/config/aarch64/aarch64-option-extensions.def
+++ b/gcc/config/aarch64/aarch64-option-extensions.def
@@ -43,8 +43,8 @@
AARCH64_OPT_EXTENSION("fp", AARCH64_FL_FP, 0, AARCH64_FL_SIMD | AARCH64_FL_CRYPTO | AARCH64_FL_F16, "fp")
/* Enabling "simd" also enables "fp".
- Disabling "simd" also disables "crypto". */
-AARCH64_OPT_EXTENSION("simd", AARCH64_FL_SIMD, AARCH64_FL_FP, AARCH64_FL_CRYPTO, "asimd")
+ Disabling "simd" also disables "crypto" and "dotprod". */
+AARCH64_OPT_EXTENSION("simd", AARCH64_FL_SIMD, AARCH64_FL_FP, AARCH64_FL_CRYPTO | AARCH64_FL_DOTPROD, "asimd")
/* Enabling "crypto" also enables "fp", "simd".
Disabling "crypto" just disables "crypto". */
@@ -67,4 +67,8 @@ AARCH64_OPT_EXTENSION("rcpc", AARCH64_FL_RCPC, 0, 0, "lrcpc")
Disabling "rdma" just disables "rdma". */
AARCH64_OPT_EXTENSION("rdma", AARCH64_FL_RDMA, AARCH64_FL_FP | AARCH64_FL_SIMD, 0, "rdma")
+/* Enabling "dotprod" also enables "simd".
+ Disabling "dotprod" only disables "dotprod". */
+AARCH64_OPT_EXTENSION("dotprod", AARCH64_FL_DOTPROD, AARCH64_FL_SIMD, 0, "asimddp")
+
#undef AARCH64_OPT_EXTENSION
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index d713d5d8b88..52d01342372 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -205,6 +205,14 @@
BUILTIN_VSDQ_I_DI (BINOP, srshl, 0)
BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0)
+ /* Implemented by aarch64_<sur><dotprod>{_lane}{q}<dot_mode>. */
+ BUILTIN_VB (TERNOP, sdot, 0)
+ BUILTIN_VB (TERNOPU, udot, 0)
+ BUILTIN_VB (QUADOP_LANE, sdot_lane, 0)
+ BUILTIN_VB (QUADOPU_LANE, udot_lane, 0)
+ BUILTIN_VB (QUADOP_LANE, sdot_laneq, 0)
+ BUILTIN_VB (QUADOPU_LANE, udot_laneq, 0)
+
BUILTIN_VDQ_I (SHIFTIMM, ashr, 3)
VAR1 (SHIFTIMM, ashr_simd, 0, di)
BUILTIN_VDQ_I (SHIFTIMM, lshr, 3)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index f4a4ef39f87..6e4acbc77f9 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -351,6 +351,87 @@
}
)
+;; These instructions map to the __builtins for the Dot Product operations.
+(define_insn "aarch64_<sur>dot<vsi2qi>"
+ [(set (match_operand:VS 0 "register_operand" "=w")
+ (plus:VS (match_operand:VS 1 "register_operand" "0")
+ (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
+ (match_operand:<VSI2QI> 3 "register_operand" "w")]
+ DOTPROD)))]
+ "TARGET_DOTPROD"
+ "<sur>dot\\t%0.<Vtype>, %2.<Vdottype>, %3.<Vdottype>"
+ [(set_attr "type" "neon_dot")]
+)
+
+;; These expands map to the Dot Product optab the vectorizer checks for.
+;; The auto-vectorizer expects a dot product builtin that also does an
+;; accumulation into the provided register.
+;; Given the following pattern
+;;
+;; for (i=0; i<len; i++) {
+;; c = a[i] * b[i];
+;; r += c;
+;; }
+;; return result;
+;;
+;; This can be auto-vectorized to
+;; r = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3];
+;;
+;; given enough iterations. However the vectorizer can keep unrolling the loop
+;; r += a[4]*b[4] + a[5]*b[5] + a[6]*b[6] + a[7]*b[7];
+;; r += a[8]*b[8] + a[9]*b[9] + a[10]*b[10] + a[11]*b[11];
+;; ...
+;;
+;; and so the vectorizer provides r, in which the result has to be accumulated.
+(define_expand "<sur>dot_prod<vsi2qi>"
+ [(set (match_operand:VS 0 "register_operand")
+ (plus:VS (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand")
+ (match_operand:<VSI2QI> 2 "register_operand")]
+ DOTPROD)
+ (match_operand:VS 3 "register_operand")))]
+ "TARGET_DOTPROD"
+{
+ emit_insn (
+ gen_aarch64_<sur>dot<vsi2qi> (operands[3], operands[3], operands[1],
+ operands[2]));
+ emit_insn (gen_rtx_SET (operands[0], operands[3]));
+ DONE;
+})
+
+;; These instructions map to the __builtins for the Dot Product
+;; indexed operations.
+(define_insn "aarch64_<sur>dot_lane<vsi2qi>"
+ [(set (match_operand:VS 0 "register_operand" "=w")
+ (plus:VS (match_operand:VS 1 "register_operand" "0")
+ (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
+ (match_operand:V8QI 3 "register_operand" "<h_con>")
+ (match_operand:SI 4 "immediate_operand" "i")]
+ DOTPROD)))]
+ "TARGET_DOTPROD"
+ {
+ operands[4]
+ = GEN_INT (ENDIAN_LANE_N (V8QImode, INTVAL (operands[4])));
+ return "<sur>dot\\t%0.<Vtype>, %2.<Vdottype>, %3.4b[%4]";
+ }
+ [(set_attr "type" "neon_dot")]
+)
+
+(define_insn "aarch64_<sur>dot_laneq<vsi2qi>"
+ [(set (match_operand:VS 0 "register_operand" "=w")
+ (plus:VS (match_operand:VS 1 "register_operand" "0")
+ (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
+ (match_operand:V16QI 3 "register_operand" "<h_con>")
+ (match_operand:SI 4 "immediate_operand" "i")]
+ DOTPROD)))]
+ "TARGET_DOTPROD"
+ {
+ operands[4]
+ = GEN_INT (ENDIAN_LANE_N (V16QImode, INTVAL (operands[4])));
+ return "<sur>dot\\t%0.<Vtype>, %2.<Vdottype>, %3.4b[%4]";
+ }
+ [(set_attr "type" "neon_dot")]
+)
+
(define_expand "copysign<mode>3"
[(match_operand:VHSDF 0 "register_operand")
(match_operand:VHSDF 1 "register_operand")
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 03fd93046bd..31741898392 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -144,14 +144,15 @@ extern unsigned aarch64_architecture_version;
#define AARCH64_FL_CRC (1 << 3) /* Has CRC. */
/* ARMv8.1-A architecture extensions. */
#define AARCH64_FL_LSE (1 << 4) /* Has Large System Extensions. */
-#define AARCH64_FL_RDMA (1 << 5) /* Has Round Double Multiply Add. */
-#define AARCH64_FL_V8_1 (1 << 6) /* Has ARMv8.1-A extensions. */
+#define AARCH64_FL_RDMA (1 << 5) /* Has Round Double Multiply Add. */
+#define AARCH64_FL_V8_1 (1 << 6) /* Has ARMv8.1-A extensions. */
/* ARMv8.2-A architecture extensions. */
-#define AARCH64_FL_V8_2 (1 << 8) /* Has ARMv8.2-A features. */
+#define AARCH64_FL_V8_2 (1 << 8) /* Has ARMv8.2-A features. */
#define AARCH64_FL_F16 (1 << 9) /* Has ARMv8.2-A FP16 extensions. */
/* ARMv8.3-A architecture extensions. */
-#define AARCH64_FL_V8_3 (1 << 10) /* Has ARMv8.3-A features. */
-#define AARCH64_FL_RCPC (1 << 11) /* Has support for RCpc model. */
+#define AARCH64_FL_V8_3 (1 << 10) /* Has ARMv8.3-A features. */
+#define AARCH64_FL_RCPC (1 << 11) /* Has support for RCpc model. */
+#define AARCH64_FL_DOTPROD (1 << 12) /* Has ARMv8.2-A Dot Product ins. */
/* Has FP and SIMD. */
#define AARCH64_FL_FPSIMD (AARCH64_FL_FP | AARCH64_FL_SIMD)
@@ -180,6 +181,7 @@ extern unsigned aarch64_architecture_version;
#define AARCH64_ISA_V8_2 (aarch64_isa_flags & AARCH64_FL_V8_2)
#define AARCH64_ISA_F16 (aarch64_isa_flags & AARCH64_FL_F16)
#define AARCH64_ISA_V8_3 (aarch64_isa_flags & AARCH64_FL_V8_3)
+#define AARCH64_ISA_DOTPROD (aarch64_isa_flags & AARCH64_FL_DOTPROD)
/* Crypto is an optional extension to AdvSIMD. */
#define TARGET_CRYPTO (TARGET_SIMD && AARCH64_ISA_CRYPTO)
@@ -194,6 +196,9 @@ extern unsigned aarch64_architecture_version;
#define TARGET_FP_F16INST (TARGET_FLOAT && AARCH64_ISA_F16)
#define TARGET_SIMD_F16INST (TARGET_SIMD && AARCH64_ISA_F16)
+/* Dot Product is an optional extension to AdvSIMD enabled through +dotprod. */
+#define TARGET_DOTPROD (TARGET_SIMD && AARCH64_ISA_DOTPROD)
+
/* ARMv8.3-A features. */
#define TARGET_ARMV8_3 (AARCH64_ISA_V8_3)
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index d7b30b0e5ee..96e740f91a7 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -31541,6 +31541,99 @@ vminnmvq_f16 (float16x8_t __a)
#pragma GCC pop_options
+/* AdvSIMD Dot Product intrinsics. */
+
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+dotprod")
+
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdot_u32 (uint32x2_t __r, uint8x8_t __a, uint8x8_t __b)
+{
+ return __builtin_aarch64_udotv8qi_uuuu (__r, __a, __b);
+}
+
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdotq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b)
+{
+ return __builtin_aarch64_udotv16qi_uuuu (__r, __a, __b);
+}
+
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdot_s32 (int32x2_t __r, int8x8_t __a, int8x8_t __b)
+{
+ return __builtin_aarch64_sdotv8qi (__r, __a, __b);
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdotq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b)
+{
+ return __builtin_aarch64_sdotv16qi (__r, __a, __b);
+}
+
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdot_lane_u32 (uint32x2_t __r, uint8x8_t __a, uint8x8_t __b, const int __index)
+{
+ return __builtin_aarch64_udot_lanev8qi_uuuus (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdot_laneq_u32 (uint32x2_t __r, uint8x8_t __a, uint8x16_t __b,
+ const int __index)
+{
+ return __builtin_aarch64_udot_laneqv8qi_uuuus (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdotq_lane_u32 (uint32x4_t __r, uint8x16_t __a, uint8x8_t __b,
+ const int __index)
+{
+ return __builtin_aarch64_udot_lanev16qi_uuuus (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdotq_laneq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b,
+ const int __index)
+{
+ return __builtin_aarch64_udot_laneqv16qi_uuuus (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdot_lane_s32 (int32x2_t __r, int8x8_t __a, int8x8_t __b, const int __index)
+{
+ return __builtin_aarch64_sdot_lanev8qi (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdot_laneq_s32 (int32x2_t __r, int8x8_t __a, int8x16_t __b, const int __index)
+{
+ return __builtin_aarch64_sdot_laneqv8qi (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdotq_lane_s32 (int32x4_t __r, int8x16_t __a, int8x8_t __b, const int __index)
+{
+ return __builtin_aarch64_sdot_lanev16qi (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdotq_laneq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b, const int __index)
+{
+ return __builtin_aarch64_sdot_laneqv16qi (__r, __a, __b, __index);
+}
+#pragma GCC pop_options
+
#undef __aarch64_vget_lane_any
#undef __aarch64_vdup_lane_any
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 067cef78533..ca9eeebd8e6 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -354,6 +354,8 @@
UNSPEC_SQRDMLSH ; Used in aarch64-simd.md.
UNSPEC_FMAXNM ; Used in aarch64-simd.md.
UNSPEC_FMINNM ; Used in aarch64-simd.md.
+ UNSPEC_SDOT ; Used in aarch64-simd.md.
+ UNSPEC_UDOT ; Used in aarch64-simd.md.
])
;; ------------------------------------------------------------------
@@ -799,6 +801,10 @@
(define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")])
(define_mode_attr VSI2QI [(V2SI "V8QI") (V4SI "V16QI")])
+
+;; Register suffix for DOTPROD input types from the return type.
+(define_mode_attr Vdottype [(V2SI "8b") (V4SI "16b")])
+
;; Sum of lengths of instructions needed to move vector registers of a mode.
(define_mode_attr insn_count [(OI "8") (CI "12") (XI "16")])
@@ -1028,6 +1034,7 @@
UNSPEC_SHSUB UNSPEC_UHSUB
UNSPEC_SRHSUB UNSPEC_URHSUB])
+(define_int_iterator DOTPROD [UNSPEC_SDOT UNSPEC_UDOT])
(define_int_iterator ADDSUBHN [UNSPEC_ADDHN UNSPEC_RADDHN
UNSPEC_SUBHN UNSPEC_RSUBHN])
@@ -1165,6 +1172,7 @@
(UNSPEC_USHLL "u") (UNSPEC_SSHLL "s")
(UNSPEC_URSHL "ur") (UNSPEC_SRSHL "sr")
(UNSPEC_UQRSHL "u") (UNSPEC_SQRSHL "s")
+ (UNSPEC_SDOT "s") (UNSPEC_UDOT "u")
])
(define_int_attr r [(UNSPEC_SQDMULH "") (UNSPEC_SQRDMULH "r")
diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c
index 8ecf58171c1..39ef48f0aef 100644
--- a/gcc/config/arm/arm-builtins.c
+++ b/gcc/config/arm/arm-builtins.c
@@ -104,6 +104,13 @@ arm_ternop_qualifiers[SIMD_MAX_BUILTIN_ARGS]
= { qualifier_none, qualifier_none, qualifier_none, qualifier_none };
#define TERNOP_QUALIFIERS (arm_ternop_qualifiers)
+/* unsigned T (unsigned T, unsigned T, unsigned T). */
+static enum arm_type_qualifiers
+arm_unsigned_uternop_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+ = { qualifier_unsigned, qualifier_unsigned, qualifier_unsigned,
+ qualifier_unsigned };
+#define UTERNOP_QUALIFIERS (arm_unsigned_uternop_qualifiers)
+
/* T (T, immediate). */
static enum arm_type_qualifiers
arm_binop_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS]
@@ -130,6 +137,13 @@ arm_mac_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
qualifier_none, qualifier_lane_index };
#define MAC_LANE_QUALIFIERS (arm_mac_lane_qualifiers)
+/* unsigned T (unsigned T, unsigned T, unsigend T, lane index). */
+static enum arm_type_qualifiers
+arm_umac_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+ = { qualifier_unsigned, qualifier_unsigned, qualifier_unsigned,
+ qualifier_unsigned, qualifier_lane_index };
+#define UMAC_LANE_QUALIFIERS (arm_umac_lane_qualifiers)
+
/* T (T, T, immediate). */
static enum arm_type_qualifiers
arm_ternop_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS]
diff --git a/gcc/config/arm/arm-c.c b/gcc/config/arm/arm-c.c
index 3abe7d1f1f5..36ebf14f407 100644
--- a/gcc/config/arm/arm-c.c
+++ b/gcc/config/arm/arm-c.c
@@ -72,11 +72,11 @@ arm_cpu_builtins (struct cpp_reader* pfile)
def_or_undef_macro (pfile, "__ARM_FEATURE_QRDMX", TARGET_NEON_RDMA);
- if (TARGET_CRC32)
- builtin_define ("__ARM_FEATURE_CRC32");
-
+ def_or_undef_macro (pfile, "__ARM_FEATURE_CRC32", TARGET_CRC32);
+ def_or_undef_macro (pfile, "__ARM_FEATURE_DOTPROD", TARGET_DOTPROD);
def_or_undef_macro (pfile, "__ARM_32BIT_STATE", TARGET_32BIT);
+ cpp_undef (pfile, "__ARM_FEATURE_CMSE");
if (arm_arch8 && !arm_arch_notm)
{
if (arm_arch_cmse && use_cmse)
diff --git a/gcc/config/arm/arm-cpu-cdata.h b/gcc/config/arm/arm-cpu-cdata.h
index b00d83302f6..8c996bd2b0a 100644
--- a/gcc/config/arm/arm-cpu-cdata.h
+++ b/gcc/config/arm/arm-cpu-cdata.h
@@ -1006,6 +1006,20 @@ static const struct arm_arch_core_flag arm_arch_core_flags[] =
},
},
{
+ "armv8.2-a+dotprod",
+ {
+ ISA_ARMv8_2a,isa_bit_dotprod,
+ isa_nobit
+ },
+ },
+ {
+ "armv8.2-a+fp16+dotprod",
+ {
+ ISA_ARMv8_2a,isa_bit_fp16,isa_bit_dotprod,
+ isa_nobit
+ },
+ },
+ {
"armv8-m.base",
{
ISA_ARMv8m_base,
diff --git a/gcc/config/arm/arm-cpu-data.h b/gcc/config/arm/arm-cpu-data.h
index 78421adb9e5..dda38b55c2d 100644
--- a/gcc/config/arm/arm-cpu-data.h
+++ b/gcc/config/arm/arm-cpu-data.h
@@ -1536,6 +1536,26 @@ static const struct processors all_architectures[] =
NULL
},
{
+ "armv8.2-a+dotprod", TARGET_CPU_cortexa53,
+ (TF_CO_PROC),
+ "8A", BASE_ARCH_8A,
+ {
+ ISA_ARMv8_2a,isa_bit_dotprod,
+ isa_nobit
+ },
+ NULL
+ },
+ {
+ "armv8.2-a+fp16+dotprod", TARGET_CPU_cortexa53,
+ (TF_CO_PROC),
+ "8A", BASE_ARCH_8A,
+ {
+ ISA_ARMv8_2a,isa_bit_fp16,isa_bit_dotprod,
+ isa_nobit
+ },
+ NULL
+ },
+ {
"armv8-m.base", TARGET_CPU_cortexm23,
0,
"8M_BASE", BASE_ARCH_8M_BASE,
diff --git a/gcc/config/arm/arm-cpus.in b/gcc/config/arm/arm-cpus.in
index 1100f3a5411..813cbe10814 100644
--- a/gcc/config/arm/arm-cpus.in
+++ b/gcc/config/arm/arm-cpus.in
@@ -267,6 +267,20 @@ begin arch armv8.2-a+fp16
isa ARMv8_2a bit_fp16
end arch armv8.2-a+fp16
+begin arch armv8.2-a+dotprod
+ tune for cortex-a53
+ tune flags CO_PROC
+ base 8A
+ isa ARMv8_2a bit_dotprod
+end arch armv8.2-a+dotprod
+
+begin arch armv8.2-a+fp16+dotprod
+ tune for cortex-a53
+ tune flags CO_PROC
+ base 8A
+ isa ARMv8_2a bit_fp16 bit_dotprod
+end arch armv8.2-a+fp16+dotprod
+
begin arch armv8-m.base
tune for cortex-m23
base 8M_BASE
diff --git a/gcc/config/arm/arm-isa.h b/gcc/config/arm/arm-isa.h
index 7d1e23be0a2..c1c8a765fed 100644
--- a/gcc/config/arm/arm-isa.h
+++ b/gcc/config/arm/arm-isa.h
@@ -66,6 +66,7 @@ enum isa_feature
isa_bit_fp_d32, /* 32 Double precision registers. */
isa_bit_crypto, /* Crypto extension to ARMv8. */
isa_bit_fp16, /* FP16 data processing (half-precision float). */
+ isa_bit_dotprod, /* Dot Product instructions. */
/* ISA Quirks (errata?). Don't forget to add this to the list of
all quirks below. */
@@ -128,7 +129,8 @@ enum isa_feature
#define ISA_ARMv8m_main ISA_ARMv7m, isa_bit_ARMv8, isa_bit_cmse
/* List of all FPU bits to strip out if -mfpu is used to override the
- default. isa_bit_fp16 is deliberately missing from this list. */
+ default. isa_bit_fp16 and isa_bit_dotprod are deliberately missing from
+ this list. */
#define ISA_ALL_FPU isa_bit_VFPv2, isa_bit_VFPv3, isa_bit_VFPv4, \
isa_bit_FPv5, isa_bit_FP_ARMv8, isa_bit_neon, isa_bit_fp16conv, \
isa_bit_fp_dbl, isa_bit_fp_d32, isa_bit_crypto
@@ -144,6 +146,7 @@ enum isa_feature
#define ISA_FP_D32 ISA_FP_DBL, isa_bit_fp_d32
#define ISA_NEON ISA_FP_D32, isa_bit_neon
#define ISA_CRYPTO ISA_NEON, isa_bit_crypto
+#define ISA_DOTPROD ISA_NEON, isa_bit_dotprod
/* List of all quirk bits to strip out when comparing CPU features with
architectures. */
diff --git a/gcc/config/arm/arm-tables.opt b/gcc/config/arm/arm-tables.opt
index cb45e097c90..4c7e57571dd 100644
--- a/gcc/config/arm/arm-tables.opt
+++ b/gcc/config/arm/arm-tables.opt
@@ -446,19 +446,25 @@ EnumValue
Enum(arm_arch) String(armv8.2-a+fp16) Value(30)
EnumValue
-Enum(arm_arch) String(armv8-m.base) Value(31)
+Enum(arm_arch) String(armv8.2-a+dotprod) Value(31)
EnumValue
-Enum(arm_arch) String(armv8-m.main) Value(32)
+Enum(arm_arch) String(armv8.2-a+fp16+dotprod) Value(32)
EnumValue
-Enum(arm_arch) String(armv8-m.main+dsp) Value(33)
+Enum(arm_arch) String(armv8-m.base) Value(33)
EnumValue
-Enum(arm_arch) String(iwmmxt) Value(34)
+Enum(arm_arch) String(armv8-m.main) Value(34)
EnumValue
-Enum(arm_arch) String(iwmmxt2) Value(35)
+Enum(arm_arch) String(armv8-m.main+dsp) Value(35)
+
+EnumValue
+Enum(arm_arch) String(iwmmxt) Value(36)
+
+EnumValue
+Enum(arm_arch) String(iwmmxt2) Value(37)
Enum
Name(arm_fpu) Type(enum fpu_type)
diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h
index ac1a49f8705..39d8b1aba5c 100644
--- a/gcc/config/arm/arm.h
+++ b/gcc/config/arm/arm.h
@@ -209,6 +209,12 @@ extern tree arm_fp16_type_node;
/* FPU supports ARMv8.1 Adv.SIMD extensions. */
#define TARGET_NEON_RDMA (TARGET_NEON && arm_arch8_1)
+/* Supports the Dot Product AdvSIMD extensions. */
+#define TARGET_DOTPROD (TARGET_NEON \
+ && bitmap_bit_p (arm_active_target.isa, \
+ isa_bit_dotprod) \
+ && arm_arch8_2)
+
/* FPU supports the floating point FP16 instructions for ARMv8.2 and later. */
#define TARGET_VFP_FP16INST \
(TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPU_ARMV8 && arm_fp16_inst)
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 65f36e2c91e..f4f5dacf639 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -18034,6 +18034,69 @@ vzipq_f16 (float16x8_t __a, float16x8_t __b)
#endif
+/* AdvSIMD Dot Product intrinsics. */
+
+#ifdef __ARM_FEATURE_DOTPROD
+
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdot_u32 (uint32x2_t __r, uint8x8_t __a, uint8x8_t __b)
+{
+ return __builtin_neon_udotv8qi_uuuu (__r, __a, __b);
+}
+
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdotq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b)
+{
+ return __builtin_neon_udotv16qi_uuuu (__r, __a, __b);
+}
+
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdot_s32 (int32x2_t __r, int8x8_t __a, int8x8_t __b)
+{
+ return __builtin_neon_sdotv8qi (__r, __a, __b);
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdotq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b)
+{
+ return __builtin_neon_sdotv16qi (__r, __a, __b);
+}
+
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdot_lane_u32 (uint32x2_t __r, uint8x8_t __a, uint8x8_t __b, const int __index)
+{
+ return __builtin_neon_udot_lanev8qi_uuuus (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdotq_lane_u32 (uint32x4_t __r, uint8x16_t __a, uint8x8_t __b,
+ const int __index)
+{
+ return __builtin_neon_udot_lanev16qi_uuuus (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdot_lane_s32 (int32x2_t __r, int8x8_t __a, int8x8_t __b, const int __index)
+{
+ return __builtin_neon_sdot_lanev8qi (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdotq_lane_s32 (int32x4_t __r, int8x16_t __a, int8x8_t __b, const int __index)
+{
+ return __builtin_neon_sdot_lanev16qi (__r, __a, __b, __index);
+}
+
+#endif /* __ARM_FEATURE_DOTPROD */
+
#ifdef __cplusplus
}
#endif
diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def
index 07f0368343a..982eec810da 100644
--- a/gcc/config/arm/arm_neon_builtins.def
+++ b/gcc/config/arm/arm_neon_builtins.def
@@ -331,3 +331,7 @@ VAR11 (STORE1, vst4,
v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf)
VAR9 (STORE1LANE, vst4_lane,
v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf)
+VAR2 (TERNOP, sdot, v8qi, v16qi)
+VAR2 (UTERNOP, udot, v8qi, v16qi)
+VAR2 (MAC_LANE, sdot_lane, v8qi, v16qi)
+VAR2 (UMAC_LANE, udot_lane, v8qi, v16qi)
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 48992879a8e..971e0450b22 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -410,6 +410,8 @@
(define_int_iterator VFM_LANE_AS [UNSPEC_VFMA_LANE UNSPEC_VFMS_LANE])
+(define_int_iterator DOTPROD [UNSPEC_DOT_S UNSPEC_DOT_U])
+
;;----------------------------------------------------------------------------
;; Mode attributes
;;----------------------------------------------------------------------------
@@ -712,6 +714,9 @@
(define_mode_attr pf [(V8QI "p") (V16QI "p") (V2SF "f") (V4SF "f")])
+(define_mode_attr VSI2QI [(V2SI "V8QI") (V4SI "V16QI")])
+(define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")])
+
;;----------------------------------------------------------------------------
;; Code attributes
;;----------------------------------------------------------------------------
@@ -808,6 +813,7 @@
(UNSPEC_VSRA_S_N "s") (UNSPEC_VSRA_U_N "u")
(UNSPEC_VRSRA_S_N "s") (UNSPEC_VRSRA_U_N "u")
(UNSPEC_VCVTH_S "s") (UNSPEC_VCVTH_U "u")
+ (UNSPEC_DOT_S "s") (UNSPEC_DOT_U "u")
])
(define_int_attr vcvth_op
@@ -995,3 +1001,6 @@
(define_int_attr mrrc [(VUNSPEC_MRRC "mrrc") (VUNSPEC_MRRC2 "mrrc2")])
(define_int_attr MRRC [(VUNSPEC_MRRC "MRRC") (VUNSPEC_MRRC2 "MRRC2")])
+
+(define_int_attr opsuffix [(UNSPEC_DOT_S "s8")
+ (UNSPEC_DOT_U "u8")])
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 33b25ff3c73..2626c7c66d7 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -3044,6 +3044,76 @@
DONE;
})
+;; These instructions map to the __builtins for the Dot Product operations.
+(define_insn "neon_<sup>dot<vsi2qi>"
+ [(set (match_operand:VCVTI 0 "register_operand" "=w")
+ (plus:VCVTI (match_operand:VCVTI 1 "register_operand" "0")
+ (unspec:VCVTI [(match_operand:<VSI2QI> 2
+ "register_operand" "w")
+ (match_operand:<VSI2QI> 3
+ "register_operand" "w")]
+ DOTPROD)))]
+ "TARGET_DOTPROD"
+ "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %<V_reg>3"
+ [(set_attr "type" "neon_dot")]
+)
+
+;; These instructions map to the __builtins for the Dot Product
+;; indexed operations.
+(define_insn "neon_<sup>dot_lane<vsi2qi>"
+ [(set (match_operand:VCVTI 0 "register_operand" "=w")
+ (plus:VCVTI (match_operand:VCVTI 1 "register_operand" "0")
+ (unspec:VCVTI [(match_operand:<VSI2QI> 2
+ "register_operand" "w")
+ (match_operand:V8QI 3 "register_operand" "t")
+ (match_operand:SI 4 "immediate_operand" "i")]
+ DOTPROD)))]
+ "TARGET_DOTPROD"
+ {
+ operands[4]
+ = GEN_INT (NEON_ENDIAN_LANE_N (V8QImode, INTVAL (operands[4])));
+ return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]";
+ }
+ [(set_attr "type" "neon_dot")]
+)
+
+;; These expands map to the Dot Product optab the vectorizer checks for.
+;; The auto-vectorizer expects a dot product builtin that also does an
+;; accumulation into the provided register.
+;; Given the following pattern
+;;
+;; for (i=0; i<len; i++) {
+;; c = a[i] * b[i];
+;; r += c;
+;; }
+;; return result;
+;;
+;; This can be auto-vectorized to
+;; r = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3];
+;;
+;; given enough iterations. However the vectorizer can keep unrolling the loop
+;; r += a[4]*b[4] + a[5]*b[5] + a[6]*b[6] + a[7]*b[7];
+;; r += a[8]*b[8] + a[9]*b[9] + a[10]*b[10] + a[11]*b[11];
+;; ...
+;;
+;; and so the vectorizer provides r, in which the result has to be accumulated.
+(define_expand "<sup>dot_prod<vsi2qi>"
+ [(set (match_operand:VCVTI 0 "register_operand")
+ (plus:VCVTI (unspec:VCVTI [(match_operand:<VSI2QI> 1
+ "register_operand")
+ (match_operand:<VSI2QI> 2
+ "register_operand")]
+ DOTPROD)
+ (match_operand:VCVTI 3 "register_operand")))]
+ "TARGET_DOTPROD"
+{
+ emit_insn (
+ gen_neon_<sup>dot<vsi2qi> (operands[3], operands[3], operands[1],
+ operands[2]));
+ emit_insn (gen_rtx_SET (operands[0], operands[3]));
+ DONE;
+})
+
(define_expand "neon_copysignf<mode>"
[(match_operand:VCVTF 0 "register_operand")
(match_operand:VCVTF 1 "register_operand")
diff --git a/gcc/config/arm/types.md b/gcc/config/arm/types.md
index b0b375c6ddf..351b3fc5a00 100644
--- a/gcc/config/arm/types.md
+++ b/gcc/config/arm/types.md
@@ -316,6 +316,8 @@
; neon_cls_q
; neon_cnt
; neon_cnt_q
+; neon_dot
+; neon_dot_q
; neon_ext
; neon_ext_q
; neon_rbit
@@ -763,6 +765,8 @@
\
neon_abs,\
neon_abs_q,\
+ neon_dot,\
+ neon_dot_q,\
neon_neg,\
neon_neg_q,\
neon_qneg,\
@@ -1108,8 +1112,8 @@
neon_sub, neon_sub_q, neon_sub_widen, neon_sub_long, neon_qsub,\
neon_qsub_q, neon_sub_halve, neon_sub_halve_q,\
neon_sub_halve_narrow_q,\
- neon_abs, neon_abs_q, neon_neg, neon_neg_q, neon_qneg,\
- neon_qneg_q, neon_qabs, neon_qabs_q, neon_abd, neon_abd_q,\
+ neon_abs, neon_abs_q, neon_dot, neon_dot_q, neon_neg, neon_neg_q,\
+ neon_qneg, neon_qneg_q, neon_qabs, neon_qabs_q, neon_abd, neon_abd_q,\
neon_abd_long, neon_minmax, neon_minmax_q, neon_compare,\
neon_compare_q, neon_compare_zero, neon_compare_zero_q,\
neon_arith_acc, neon_arith_acc_q, neon_reduc_add,\
diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
index 99cfa41b08d..c474f4bb5db 100644
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -410,4 +410,6 @@
UNSPEC_VRNDN
UNSPEC_VRNDP
UNSPEC_VRNDX
+ UNSPEC_DOT_S
+ UNSPEC_DOT_U
])
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 6c8eadc9574..e02b95f353a 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -14084,6 +14084,8 @@ Enable FP16 extension. This also enables floating-point instructions.
Enable the RcPc extension. This does not change code generation from GCC,
but is passed on to the assembler, enabling inline asm statements to use
instructions from the RcPc extension.
+@item dotprod
+Enable the Dot Product extension. This also enables Advanced SIMD instructions.
@end table
@@ -15097,6 +15099,15 @@ ARMv8.2-A architecture with the optional FP16 instructions extension.
This also enables the features provided by @option{-march=armv8.1-a}
and implies @option{-mfp16-format=ieee}.
+@option{-march=armv8.2-a+dotprod} enables compiler support for the
+ARMv8.2-A architecture with the optional Dot Product instructions extension.
+This also enables the features provided by @option{-march=armv8.1-a}.
+
+@option{-march=armv8.2-a+fp16+dotprod} enables compiler support for the
+ARMv8.2-A architecture with the optional FP16 and Dot Product instructions
+extension. This also enables the features provided by @option{-march=armv8.1-a}
+and implies @option{-mfp16-format=ieee}.
+
@option{-march=native} causes the compiler to auto-detect the architecture
of the build computer. At present, this feature is only supported on
GNU/Linux, and not all architectures are recognized. If the auto-detect
diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi
index 35b5b87ac8a..437f9c9bd70 100644
--- a/gcc/doc/sourcebuild.texi
+++ b/gcc/doc/sourcebuild.texi
@@ -1666,6 +1666,17 @@ ARM target supports executing instructions from ARMv8.2 with the FP16
extension. Some multilibs may be incompatible with these options.
Implies arm_v8_2a_fp16_neon_ok and arm_v8_2a_fp16_scalar_hw.
+@item arm_v8_2a_dotprod_neon_ok
+@anchor{arm_v8_2a_dotprod_neon_ok}
+ARM target supports options to generate instructions from ARMv8.2 with
+the Dot Product extension. Some multilibs may be incompatible with these
+options.
+
+@item arm_v8_2a_dotprod_neon_hw
+ARM target supports executing instructions from ARMv8.2 with the Dot
+Product extension. Some multilibs may be incompatible with these options.
+Implies arm_v8_2a_dotprod_neon_ok.
+
@item arm_prefer_ldrd_strd
ARM target prefers @code{LDRD} and @code{STRD} instructions over
@code{LDM} and @code{STM} instructions.
@@ -2241,6 +2252,11 @@ supported by the target; see the
@ref{arm_v8_2a_fp16_neon_ok,,arm_v8_2a_fp16_neon_ok} effective target
keyword.
+@item arm_v8_2a_dotprod_neon
+Add options for ARMv8.2 with Adv.SIMD Dot Product support, if this is
+supported by the target; see the
+@ref{arm_v8_2a_dotprod_neon_ok} effective target keyword.
+
@item bind_pic_locally
Add the target-specific flags needed to enable functions to bind
locally when using pic/PIC passes in the testsuite.
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8a.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8a.c
index dc4f52019d5..ac674749b6f 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8a.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8a.c
@@ -1,4 +1,7 @@
/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-additional-options "-march=armv8.2-a+dotprod" { target { aarch64*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon } */
#include <stdarg.h>
#include "tree-vect.h"
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u8a.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u8a.c
index f3cc6c78c25..d020f643bb8 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u8a.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u8a.c
@@ -1,4 +1,7 @@
/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-additional-options "-march=armv8.2-a+dotprod" { target { aarch64*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon } */
#include <stdarg.h>
#include "tree-vect.h"
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile.c
new file mode 100644
index 00000000000..b7378adf8ee
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile.c
@@ -0,0 +1,73 @@
+/* { dg-do compile { target { aarch64*-*-* } } } */
+/* { dg-additional-options "-O3 -march=armv8.2-a+dotprod" } */
+
+#include <arm_neon.h>
+
+/* Unsigned Dot Product instructions. */
+
+uint32x2_t ufoo (uint32x2_t r, uint8x8_t x, uint8x8_t y)
+{
+ return vdot_u32 (r, x, y);
+}
+
+uint32x4_t ufooq (uint32x4_t r, uint8x16_t x, uint8x16_t y)
+{
+ return vdotq_u32 (r, x, y);
+}
+
+uint32x2_t ufoo_lane (uint32x2_t r, uint8x8_t x, uint8x8_t y)
+{
+ return vdot_lane_u32 (r, x, y, 0);
+}
+
+uint32x2_t ufoo_laneq (uint32x2_t r, uint8x8_t x, uint8x16_t y)
+{
+ return vdot_laneq_u32 (r, x, y, 0);
+}
+
+uint32x4_t ufooq_lane (uint32x4_t r, uint8x16_t x, uint8x8_t y)
+{
+ return vdotq_lane_u32 (r, x, y, 0);
+}
+
+uint32x4_t ufooq_laneq (uint32x4_t r, uint8x16_t x, uint8x16_t y)
+{
+ return vdotq_laneq_u32 (r, x, y, 0);
+}
+
+/* Signed Dot Product instructions. */
+
+int32x2_t sfoo (int32x2_t r, int8x8_t x, int8x8_t y)
+{
+ return vdot_s32 (r, x, y);
+}
+
+int32x4_t sfooq (int32x4_t r, int8x16_t x, int8x16_t y)
+{
+ return vdotq_s32 (r, x, y);
+}
+
+int32x2_t sfoo_lane (int32x2_t r, int8x8_t x, int8x8_t y)
+{
+ return vdot_lane_s32 (r, x, y, 0);
+}
+
+int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, int8x16_t y)
+{
+ return vdot_laneq_s32 (r, x, y, 0);
+}
+
+int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, int8x8_t y)
+{
+ return vdotq_lane_s32 (r, x, y, 0);
+}
+
+int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, int8x16_t y)
+{
+ return vdotq_laneq_s32 (r, x, y, 0);
+}
+
+/* { dg-final { scan-assembler-times {[us]dot\tv[0-9]+\.2s, v[0-9]+\.8b, v[0-9]+\.8b} 2 } } */
+/* { dg-final { scan-assembler-times {[us]dot\tv[0-9]+\.2s, v[0-9]+\.8b, v[0-9]+\.4b\[[0-9]+\]} 4 } } */
+/* { dg-final { scan-assembler-times {[us]dot\tv[0-9]+\.4s, v[0-9]+\.16b, v[0-9]+\.16b} 2 } } */
+/* { dg-final { scan-assembler-times {[us]dot\tv[0-9]+\.4s, v[0-9]+\.16b, v[0-9]+\.4b\[[0-9]+\]} 4 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-exec.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-exec.c
new file mode 100644
index 00000000000..3e7cd6c2fc2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-exec.c
@@ -0,0 +1,81 @@
+/* { dg-skip-if "can't compile on arm." { arm*-*-* } } */
+/* { dg-do run { target { aarch64*-*-* } } } */
+/* { dg-additional-options "-O3 -march=armv8.2-a+dotprod" } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw } */
+
+#include <arm_neon.h>
+
+extern void abort();
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+# define ORDER(x, y) y
+#else
+# define ORDER(x, y) x - y
+#endif
+
+#define P(n1,n2) n1,n1,n1,n1,n2,n2,n2,n2
+#define ARR(nm, p, ty, ...) ty nm##_##p = { __VA_ARGS__ }
+#define TEST(t1, t2, t3, f, r1, r2, n1, n2) \
+ ARR(f, x, t1, r1); \
+ ARR(f, y, t2, r2); \
+ t3 f##_##r = {0}; \
+ f##_##r = f (f##_##r, f##_##x, f##_##y); \
+ if (f##_##r[0] != n1 || f##_##r[1] != n2) \
+ abort ();
+
+#define TEST_LANE(t1, t2, t3, f, r1, r2, n1, n2, n3, n4) \
+ ARR(f, x, t1, r1); \
+ ARR(f, y, t2, r2); \
+ t3 f##_##rx = {0}; \
+ f##_##rx = f (f##_##rx, f##_##x, f##_##y, ORDER (1, 0)); \
+ if (f##_##rx[0] != n1 || f##_##rx[1] != n2) \
+ abort (); \
+ t3 f##_##rx1 = {0}; \
+ f##_##rx1 = f (f##_##rx1, f##_##x, f##_##y, ORDER (1, 1)); \
+ if (f##_##rx1[0] != n3 || f##_##rx1[1] != n4) \
+ abort ();
+
+#define Px(n1,n2,n3,n4) P(n1,n2),P(n3,n4)
+#define TEST_LANEQ(t1, t2, t3, f, r1, r2, n1, n2, n3, n4, n5, n6, n7, n8) \
+ ARR(f, x, t1, r1); \
+ ARR(f, y, t2, r2); \
+ t3 f##_##rx = {0}; \
+ f##_##rx = f (f##_##rx, f##_##x, f##_##y, ORDER (3, 0)); \
+ if (f##_##rx[0] != n1 || f##_##rx[1] != n2) \
+ abort (); \
+ t3 f##_##rx1 = {0}; \
+ f##_##rx1 = f (f##_##rx1, f##_##x, f##_##y, ORDER (3, 1)); \
+ if (f##_##rx1[0] != n3 || f##_##rx1[1] != n4) \
+ abort (); \
+ t3 f##_##rx2 = {0}; \
+ f##_##rx2 = f (f##_##rx2, f##_##x, f##_##y, ORDER (3, 2)); \
+ if (f##_##rx2[0] != n5 || f##_##rx2[1] != n6) \
+ abort (); \
+ t3 f##_##rx3 = {0}; \
+ f##_##rx3 = f (f##_##rx3, f##_##x, f##_##y, ORDER (3, 3)); \
+ if (f##_##rx3[0] != n7 || f##_##rx3[1] != n8) \
+ abort ();
+
+int
+main()
+{
+ TEST (uint8x8_t, uint8x8_t, uint32x2_t, vdot_u32, P(1,2), P(2,3), 8, 24);
+ TEST (int8x8_t, int8x8_t, int32x2_t, vdot_s32, P(1,2), P(-2,-3), -8, -24);
+
+ TEST (uint8x16_t, uint8x16_t, uint32x4_t, vdotq_u32, P(1,2), P(2,3), 8, 24);
+ TEST (int8x16_t, int8x16_t, int32x4_t, vdotq_s32, P(1,2), P(-2,-3), -8, -24);
+
+ TEST_LANE (uint8x8_t, uint8x8_t, uint32x2_t, vdot_lane_u32, P(1,2), P(2,3), 8, 16, 12, 24);
+ TEST_LANE (int8x8_t, int8x8_t, int32x2_t, vdot_lane_s32, P(1,2), P(-2,-3), -8, -16, -12, -24);
+
+ TEST_LANE (uint8x16_t, uint8x8_t, uint32x4_t, vdotq_lane_u32, P(1,2), P(2,3), 8, 16, 12, 24);
+ TEST_LANE (int8x16_t, int8x8_t, int32x4_t, vdotq_lane_s32, P(1,2), P(-2,-3), -8, -16, -12, -24);
+
+ TEST_LANEQ (uint8x8_t, uint8x16_t, uint32x2_t, vdot_laneq_u32, P(1,2), Px(2,3,1,4), 8, 16, 12, 24, 4, 8, 16, 32);
+ TEST_LANEQ (int8x8_t, int8x16_t, int32x2_t, vdot_laneq_s32, P(1,2), Px(-2,-3,-1,-4), -8, -16, -12, -24, -4, -8, -16, -32);
+
+ TEST_LANEQ (uint8x16_t, uint8x16_t, uint32x4_t, vdotq_laneq_u32, Px(1,2,2,1), Px(2,3,1,4), 8, 16, 12, 24, 4, 8, 16, 32);
+ TEST_LANEQ (int8x16_t, int8x16_t, int32x4_t, vdotq_laneq_s32, Px(1,2,2,1), Px(-2,-3,-1,-4), -8, -16, -12, -24, -4, -8, -16, -32);
+
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vect-dot-qi.h b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vect-dot-qi.h
new file mode 100644
index 00000000000..90b00aff95c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vect-dot-qi.h
@@ -0,0 +1,15 @@
+TYPE char X[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+TYPE char Y[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+
+__attribute__ ((noinline)) int
+foo1(int len) {
+ int i;
+ TYPE int result = 0;
+ TYPE short prod;
+
+ for (i=0; i<len; i++) {
+ prod = X[i] * Y[i];
+ result += prod;
+ }
+ return result;
+} \ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vect-dot-s8.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vect-dot-s8.c
new file mode 100644
index 00000000000..57b5ef82f85
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vect-dot-s8.c
@@ -0,0 +1,9 @@
+/* { dg-do compile { target { aarch64*-*-* } } } */
+/* { dg-additional-options "-O3 -march=armv8.2-a+dotprod" } */
+
+#define N 64
+#define TYPE signed
+
+#include "vect-dot-qi.h"
+
+/* { dg-final { scan-assembler-times {sdot\tv[0-9]+\.4s, v[0-9]+\.16b, v[0-9]+\.16b} 4 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vect-dot-u8.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vect-dot-u8.c
new file mode 100644
index 00000000000..b2cef318500
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vect-dot-u8.c
@@ -0,0 +1,9 @@
+/* { dg-do compile { target { aarch64*-*-* } } } */
+/* { dg-additional-options "-O3 -march=armv8.2-a+dotprod" } */
+
+#define N 64
+#define TYPE unsigned
+
+#include "vect-dot-qi.h"
+
+/* { dg-final { scan-assembler-times {udot\tv[0-9]+\.4s, v[0-9]+\.16b, v[0-9]+\.16b} 4 } } */
diff --git a/gcc/testsuite/gcc.target/arm/simd/vdot-compile.c b/gcc/testsuite/gcc.target/arm/simd/vdot-compile.c
new file mode 100644
index 00000000000..b3bd3bf00e3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/vdot-compile.c
@@ -0,0 +1,56 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3" } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_ok } */
+/* { dg-add-options arm_v8_2a_dotprod_neon } */
+
+#include <arm_neon.h>
+
+/* Unsigned Dot Product instructions. */
+
+uint32x2_t ufoo (uint32x2_t r, uint8x8_t x, uint8x8_t y)
+{
+ return vdot_u32 (r, x, y);
+}
+
+uint32x4_t ufooq (uint32x4_t r, uint8x16_t x, uint8x16_t y)
+{
+ return vdotq_u32 (r, x, y);
+}
+
+uint32x2_t ufoo_lane (uint32x2_t r, uint8x8_t x, uint8x8_t y)
+{
+ return vdot_lane_u32 (r, x, y, 0);
+}
+
+uint32x4_t ufooq_lane (uint32x4_t r, uint8x16_t x, uint8x8_t y)
+{
+ return vdotq_lane_u32 (r, x, y, 0);
+}
+
+/* Signed Dot Product instructions. */
+
+int32x2_t sfoo (int32x2_t r, int8x8_t x, int8x8_t y)
+{
+ return vdot_s32 (r, x, y);
+}
+
+int32x4_t sfooq (int32x4_t r, int8x16_t x, int8x16_t y)
+{
+ return vdotq_s32 (r, x, y);
+}
+
+int32x2_t sfoo_lane (int32x2_t r, int8x8_t x, int8x8_t y)
+{
+ return vdot_lane_s32 (r, x, y, 0);
+}
+
+int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, int8x8_t y)
+{
+ return vdotq_lane_s32 (r, x, y, 0);
+}
+
+/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\td[0-9]+, d[0-9]+, d[0-9]+} 4 } } */
+/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\tq[0-9]+, q[0-9]+, q[0-9]+} 2 } } */
+/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\td[0-9]+, d[0-9]+, d[0-9]+\[#?[0-9]\]} 2 } } */
+/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\tq[0-9]+, q[0-9]+, d[0-9]+\[#?[0-9]\]} 2 } } */
+
diff --git a/gcc/testsuite/gcc.target/arm/simd/vdot-exec.c b/gcc/testsuite/gcc.target/arm/simd/vdot-exec.c
new file mode 100644
index 00000000000..054f4703394
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/vdot-exec.c
@@ -0,0 +1,55 @@
+/* { dg-do run } */
+/* { dg-additional-options "-O3" } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw } */
+/* { dg-add-options arm_v8_2a_dotprod_neon } */
+
+#include <arm_neon.h>
+
+extern void abort();
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+# define ORDER(x, y) y
+#else
+# define ORDER(x, y) x - y
+#endif
+
+#define P(n1,n2) n1,n1,n1,n1,n2,n2,n2,n2
+#define ARR(nm, p, ty, ...) ty nm##_##p = { __VA_ARGS__ }
+#define TEST(t1, t2, t3, f, r1, r2, n1, n2) \
+ ARR(f, x, t1, r1); \
+ ARR(f, y, t2, r2); \
+ t3 f##_##r = {0}; \
+ f##_##r = f (f##_##r, f##_##x, f##_##y); \
+ if (f##_##r[0] != n1 || f##_##r[1] != n2) \
+ abort ();
+
+#define TEST_LANE(t1, t2, t3, f, r1, r2, n1, n2, n3, n4) \
+ ARR(f, x, t1, r1); \
+ ARR(f, y, t2, r2); \
+ t3 f##_##rx = {0}; \
+ f##_##rx = f (f##_##rx, f##_##x, f##_##y, ORDER (1, 0)); \
+ if (f##_##rx[0] != n1 || f##_##rx[1] != n2) \
+ abort (); \
+ t3 f##_##rx1 = {0}; \
+ f##_##rx1 = f (f##_##rx1, f##_##x, f##_##y, ORDER (1, 1)); \
+ if (f##_##rx1[0] != n3 || f##_##rx1[1] != n4) \
+ abort (); \
+
+int
+main()
+{
+ TEST (uint8x8_t, uint8x8_t, uint32x2_t, vdot_u32, P(1,2), P(2,3), 8, 24);
+ TEST (int8x8_t, int8x8_t, int32x2_t, vdot_s32, P(1,2), P(-2,-3), -8, -24);
+
+ TEST (uint8x16_t, uint8x16_t, uint32x4_t, vdotq_u32, P(1,2), P(2,3), 8, 24);
+ TEST (int8x16_t, int8x16_t, int32x4_t, vdotq_s32, P(1,2), P(-2,-3), -8, -24);
+
+ TEST_LANE (uint8x8_t, uint8x8_t, uint32x2_t, vdot_lane_u32, P(1,2), P(2,3), 8, 16, 12, 24);
+
+ TEST_LANE (int8x8_t, int8x8_t, int32x2_t, vdot_lane_s32, P(1,2), P(-2,-3), -8, -16, -12, -24);
+
+ TEST_LANE (uint8x16_t, uint8x8_t, uint32x4_t, vdotq_lane_u32, P(1,2), P(2,3), 8, 16, 12, 24);
+ TEST_LANE (int8x16_t, int8x8_t, int32x4_t, vdotq_lane_s32, P(1,2), P(-2,-3), -8, -16, -12, -24);
+
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/arm/simd/vect-dot-qi.h b/gcc/testsuite/gcc.target/arm/simd/vect-dot-qi.h
new file mode 100644
index 00000000000..ea5c212419d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/vect-dot-qi.h
@@ -0,0 +1,16 @@
+TYPE char X[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+TYPE char Y[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+
+__attribute__ ((noinline)) int
+foo1(int len) {
+ int i;
+ TYPE int result = 0;
+ TYPE short prod;
+
+ for (i=0; i<len; i++) {
+ prod = X[i] * Y[i];
+ result += prod;
+ }
+ return result;
+}
+
diff --git a/gcc/testsuite/gcc.target/arm/simd/vect-dot-s8.c b/gcc/testsuite/gcc.target/arm/simd/vect-dot-s8.c
new file mode 100644
index 00000000000..d0b9d8ae71a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/vect-dot-s8.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3" } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_ok } */
+/* { dg-add-options arm_v8_2a_dotprod_neon } */
+
+#define N 64
+#define TYPE signed
+
+#include "vect-dot-qi.h"
+
+/* { dg-final { scan-assembler-times {vsdot\.s8\tq[0-9]+, q[0-9]+, q[0-9]+} 4 } } */
+
diff --git a/gcc/testsuite/gcc.target/arm/simd/vect-dot-u8.c b/gcc/testsuite/gcc.target/arm/simd/vect-dot-u8.c
new file mode 100644
index 00000000000..957be3c9886
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/vect-dot-u8.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3" } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_ok } */
+/* { dg-add-options arm_v8_2a_dotprod_neon } */
+
+#define N 64
+#define TYPE unsigned
+
+#include "vect-dot-qi.h"
+
+/* { dg-final { scan-assembler-times {vudot\.u8\tq[0-9]+, q[0-9]+, q[0-9]+} 4 } } */
+
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 677230c1406..290704dabea 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -4156,6 +4156,51 @@ proc check_effective_target_arm_v8_2a_fp16_neon_ok { } {
check_effective_target_arm_v8_2a_fp16_neon_ok_nocache]
}
+# Return 1 if the target supports ARMv8.2 Adv.SIMD Dot Product
+# instructions, 0 otherwise. The test is valid for ARM and for AArch64.
+# Record the command line options needed.
+
+proc check_effective_target_arm_v8_2a_dotprod_neon_ok_nocache { } {
+ global et_arm_v8_2a_dotprod_neon_flags
+ set et_arm_v8_2a_dotprod_neon_flags ""
+
+ if { ![istarget arm*-*-*] && ![istarget aarch64*-*-*] } {
+ return 0;
+ }
+
+ # Iterate through sets of options to find the compiler flags that
+ # need to be added to the -march option.
+ foreach flags {"" "-mfloat-abi=softfp -mfpu=neon-fp-armv8" "-mfloat-abi=hard -mfpu=neon-fp-armv8"} {
+ if { [check_no_compiler_messages_nocache \
+ arm_v8_2a_dotprod_neon_ok object {
+ #if !defined (__ARM_FEATURE_DOTPROD)
+ #error "__ARM_FEATURE_DOTPROD not defined"
+ #else
+ /* Make sure including arm_neon.h is OK. */
+ #include <arm_neon.h>
+ #endif
+ } "$flags -march=armv8.2-a+dotprod"] } {
+ set et_arm_v8_2a_dotprod_neon_flags "$flags -march=armv8.2-a+dotprod"
+ return 1
+ }
+ }
+
+ return 0;
+}
+
+proc check_effective_target_arm_v8_2a_dotprod_neon_ok { } {
+ return [check_cached_effective_target arm_v8_2a_dotprod_neon_ok \
+ check_effective_target_arm_v8_2a_dotprod_neon_ok_nocache]
+}
+
+proc add_options_for_arm_v8_2a_dotprod_neon { flags } {
+ if { ! [check_effective_target_arm_v8_2a_dotprod_neon_ok] } {
+ return "$flags"
+ }
+ global et_arm_v8_2a_dotprod_neon_flags
+ return "$flags $et_arm_v8_2a_dotprod_neon_flags"
+}
+
# Return 1 if the target supports executing ARMv8 NEON instructions, 0
# otherwise.
@@ -4293,6 +4338,42 @@ proc check_effective_target_arm_v8_2a_fp16_neon_hw { } {
} [add_options_for_arm_v8_2a_fp16_neon ""]]
}
+# Return 1 if the target supports executing AdvSIMD instructions from ARMv8.2
+# with the Dot Product extension, 0 otherwise. The test is valid for ARM and for
+# AArch64.
+
+proc check_effective_target_arm_v8_2a_dotprod_neon_hw { } {
+ if { ![check_effective_target_arm_v8_2a_dotprod_neon_ok] } {
+ return 0;
+ }
+ return [check_runtime arm_v8_2a_dotprod_neon_hw_available {
+ #include "arm_neon.h"
+ int
+ main (void)
+ {
+
+ uint32x2_t results = {0,0};
+ uint8x8_t a = {1,1,1,1,2,2,2,2};
+ uint8x8_t b = {2,2,2,2,3,3,3,3};
+
+ #ifdef __ARM_ARCH_ISA_A64
+ asm ("udot %0.2s, %1.8b, %2.8b"
+ : "=w"(results)
+ : "w"(a), "w"(b)
+ : /* No clobbers. */);
+
+ #else
+ asm ("vudot.u8 %P0, %P1, %P2"
+ : "=w"(results)
+ : "w"(a), "w"(b)
+ : /* No clobbers. */);
+ #endif
+
+ return (results[0] == 8 && results[1] == 24) ? 1 : 0;
+ }
+ } [add_options_for_arm_v8_2a_dotprod_neon ""]]
+}
+
# Return 1 if this is a ARM target with NEON enabled.
proc check_effective_target_arm_neon { } {
@@ -5563,6 +5644,8 @@ proc check_effective_target_vect_sdot_qi { } {
} else {
set et_vect_sdot_qi_saved($et_index) 0
if { [istarget ia64-*-*]
+ || [istarget aarch64*-*-*]
+ || [istarget arm*-*-*]
|| ([istarget mips*-*-*]
&& [et-is-effective-target mips_msa]) } {
set et_vect_udot_qi_saved 1
@@ -5587,6 +5670,8 @@ proc check_effective_target_vect_udot_qi { } {
} else {
set et_vect_udot_qi_saved($et_index) 0
if { [istarget powerpc*-*-*]
+ || [istarget aarch64*-*-*]
+ || [istarget arm*-*-*]
|| [istarget ia64-*-*]
|| ([istarget mips*-*-*]
&& [et-is-effective-target mips_msa]) } {
@@ -7885,7 +7970,7 @@ proc check_effective_target_aarch64_tiny { } {
# Create functions to check that the AArch64 assembler supports the
# various architecture extensions via the .arch_extension pseudo-op.
-foreach { aarch64_ext } { "fp" "simd" "crypto" "crc" "lse"} {
+foreach { aarch64_ext } { "fp" "simd" "crypto" "crc" "lse" "dotprod"} {
eval [string map [list FUNC $aarch64_ext] {
proc check_effective_target_aarch64_asm_FUNC_ok { } {
if { [istarget aarch64*-*-*] } {