author     Peter Maydell <peter.maydell@linaro.org>  2019-05-31 13:29:43 +0100
committer  Peter Maydell <peter.maydell@linaro.org>  2019-06-04 14:58:29 +0100
commit     f68f3b37645f1d994d9ec400ac1183911dabb207 (patch)
tree       74cd012b96bbdd6edf6c1e97bf7a437e692f43fe
parent     86030c0ea655a69aacc51fc620dc64e0faae0061 (diff)
target/arm: Convert VFP VMLA to decodetree
Convert the VFP VMLA instruction to decodetree. This is the first of
the VFP 3-operand data processing instructions, so we include in this
patch the code which loops over the elements for an old-style VFP
vector operation. The existing code to do this looping uses the
deprecated cpu_F0s/F0d/F1s/F1d TCG globals; since we are going to be
converting instructions one at a time anyway we can take the
opportunity to make the new loop use TCG temporaries, which means we
can do that conversion one operation at a time rather than needing to
do it all in one go.

We include an UNDEF check which was missing in the old code:
short-vector operations (with stride or length non-zero) were
deprecated in v7A and must UNDEF in v8A, so if the MVFR0 FPShVec field
does not indicate that support for short vectors is present we UNDEF
the operations that would use them. (This is a change of behaviour for
Cortex-A7, Cortex-A15 and the v8 CPUs, which previously were all
incorrectly allowing short-vector operations.)

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
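As background for the short-vector looping code added below: when FPSCR.LEN/STRIDE are non-zero, a VFP data-processing instruction repeats the scalar operation over successive registers, wrapping within a bank of eight single-precision registers. A minimal standalone sketch of that behaviour (illustrative only; the advance_sreg helper and the register choices are made up for this example and are not part of the patch):

/* Illustration only -- not part of this patch.
 * S0-S31 are grouped into four banks of eight; a short-vector operation
 * steps each operand's register number within its bank, wrapping at the
 * bank boundary.
 */
#include <stdio.h>

/* Hypothetical helper: advance a single-precision register number by
 * 'delta' places, wrapping within its bank of eight registers. */
static int advance_sreg(int reg, int delta)
{
    return ((reg + delta) & 0x7) | (reg & 0x18);
}

int main(void)
{
    /* e.g. VMLA.F32 s16, s24, s8 with vector length 3, stride 1 performs:
     *   s16 += s24 * s8;  s17 += s25 * s9;  s18 += s26 * s10
     */
    int vd = 16, vn = 24, vm = 8;
    for (int i = 0; i < 3; i++) {
        printf("s%d += s%d * s%d\n", vd, vn, vm);
        vd = advance_sreg(vd, 1);
        vn = advance_sreg(vn, 1);
        vm = advance_sreg(vm, 1);
    }
    return 0;
}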
-rw-r--r--   target/arm/cpu.h                   5
-rw-r--r--   target/arm/translate-vfp.inc.c   205
-rw-r--r--   target/arm/translate.c            14
-rw-r--r--   target/arm/vfp.decode              6
4 files changed, 224 insertions, 6 deletions
diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index 30fb107c60..219772d612 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -3414,6 +3414,11 @@ static inline bool isar_feature_aa32_fp_d32(const ARMISARegisters *id)
return FIELD_EX64(id->mvfr0, MVFR0, SIMDREG) >= 2;
}
+static inline bool isar_feature_aa32_fpshvec(const ARMISARegisters *id)
+{
+ return FIELD_EX64(id->mvfr0, MVFR0, FPSHVEC) > 0;
+}
+
/*
* We always set the FP and SIMD FP16 fields to indicate identical
* levels of support (assuming SIMD is implemented at all), so
diff --git a/target/arm/translate-vfp.inc.c b/target/arm/translate-vfp.inc.c
index 1ec98eae22..e56ae3bdc1 100644
--- a/target/arm/translate-vfp.inc.c
+++ b/target/arm/translate-vfp.inc.c
@@ -1116,3 +1116,208 @@ static bool trans_VLDM_VSTM_dp(DisasContext *s, arg_VLDM_VSTM_dp *a)
return true;
}
+
+/*
+ * Types for callbacks for do_vfp_3op_sp() and do_vfp_3op_dp().
+ * The callback should emit code to write a value to vd. If
+ * do_vfp_3op_{sp,dp}() was passed reads_vd then the TCGv vd
+ * will contain the old value of the relevant VFP register;
+ * otherwise it must be written to only.
+ */
+typedef void VFPGen3OpSPFn(TCGv_i32 vd,
+ TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst);
+typedef void VFPGen3OpDPFn(TCGv_i64 vd,
+ TCGv_i64 vn, TCGv_i64 vm, TCGv_ptr fpst);
+
+/*
+ * Perform a 3-operand VFP data processing instruction. fn is the
+ * callback to do the actual operation; this function deals with the
+ * code to handle looping around for VFP vector processing.
+ */
+static bool do_vfp_3op_sp(DisasContext *s, VFPGen3OpSPFn *fn,
+ int vd, int vn, int vm, bool reads_vd)
+{
+ uint32_t delta_m = 0;
+ uint32_t delta_d = 0;
+ uint32_t bank_mask = 0;
+ int veclen = s->vec_len;
+ TCGv_i32 f0, f1, fd;
+ TCGv_ptr fpst;
+
+ if (!dc_isar_feature(aa32_fpshvec, s) &&
+ (veclen != 0 || s->vec_stride != 0)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ if (veclen > 0) {
+ bank_mask = 0x18;
+
+ /* Figure out what type of vector operation this is. */
+ if ((vd & bank_mask) == 0) {
+ /* scalar */
+ veclen = 0;
+ } else {
+ delta_d = s->vec_stride + 1;
+
+ if ((vm & bank_mask) == 0) {
+ /* mixed scalar/vector */
+ delta_m = 0;
+ } else {
+ /* vector */
+ delta_m = delta_d;
+ }
+ }
+ }
+
+ f0 = tcg_temp_new_i32();
+ f1 = tcg_temp_new_i32();
+ fd = tcg_temp_new_i32();
+ fpst = get_fpstatus_ptr(0);
+
+ tcg_gen_ld_f32(f0, cpu_env, vfp_reg_offset(false, vn));
+ tcg_gen_ld_f32(f1, cpu_env, vfp_reg_offset(false, vm));
+
+ for (;;) {
+ if (reads_vd) {
+ tcg_gen_ld_f32(fd, cpu_env, vfp_reg_offset(false, vd));
+ }
+ fn(fd, f0, f1, fpst);
+ tcg_gen_st_f32(fd, cpu_env, vfp_reg_offset(false, vd));
+
+ if (veclen == 0) {
+ break;
+ }
+
+ /* Set up the operands for the next iteration */
+ veclen--;
+ vd = ((vd + delta_d) & (bank_mask - 1)) | (vd & bank_mask);
+ vn = ((vn + delta_d) & (bank_mask - 1)) | (vn & bank_mask);
+ tcg_gen_ld_f32(f0, cpu_env, vfp_reg_offset(false, vn));
+ if (delta_m) {
+ vm = ((vm + delta_m) & (bank_mask - 1)) | (vm & bank_mask);
+ tcg_gen_ld_f32(f1, cpu_env, vfp_reg_offset(false, vm));
+ }
+ }
+
+ tcg_temp_free_i32(f0);
+ tcg_temp_free_i32(f1);
+ tcg_temp_free_i32(fd);
+ tcg_temp_free_ptr(fpst);
+
+ return true;
+}
+
+static bool do_vfp_3op_dp(DisasContext *s, VFPGen3OpDPFn *fn,
+ int vd, int vn, int vm, bool reads_vd)
+{
+ uint32_t delta_m = 0;
+ uint32_t delta_d = 0;
+ uint32_t bank_mask = 0;
+ int veclen = s->vec_len;
+ TCGv_i64 f0, f1, fd;
+ TCGv_ptr fpst;
+
+ /* UNDEF accesses to D16-D31 if they don't exist */
+ if (!dc_isar_feature(aa32_fp_d32, s) && ((vd | vn | vm) & 0x10)) {
+ return false;
+ }
+
+ if (!dc_isar_feature(aa32_fpshvec, s) &&
+ (veclen != 0 || s->vec_stride != 0)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ if (veclen > 0) {
+ bank_mask = 0xc;
+
+ /* Figure out what type of vector operation this is. */
+ if ((vd & bank_mask) == 0) {
+ /* scalar */
+ veclen = 0;
+ } else {
+ delta_d = (s->vec_stride >> 1) + 1;
+
+ if ((vm & bank_mask) == 0) {
+ /* mixed scalar/vector */
+ delta_m = 0;
+ } else {
+ /* vector */
+ delta_m = delta_d;
+ }
+ }
+ }
+
+ f0 = tcg_temp_new_i64();
+ f1 = tcg_temp_new_i64();
+ fd = tcg_temp_new_i64();
+ fpst = get_fpstatus_ptr(0);
+
+ tcg_gen_ld_f64(f0, cpu_env, vfp_reg_offset(true, vn));
+ tcg_gen_ld_f64(f1, cpu_env, vfp_reg_offset(true, vm));
+
+ for (;;) {
+ if (reads_vd) {
+ tcg_gen_ld_f64(fd, cpu_env, vfp_reg_offset(true, vd));
+ }
+ fn(fd, f0, f1, fpst);
+ tcg_gen_st_f64(fd, cpu_env, vfp_reg_offset(true, vd));
+
+ if (veclen == 0) {
+ break;
+ }
+ /* Set up the operands for the next iteration */
+ veclen--;
+ vd = ((vd + delta_d) & (bank_mask - 1)) | (vd & bank_mask);
+ vn = ((vn + delta_d) & (bank_mask - 1)) | (vn & bank_mask);
+ tcg_gen_ld_f64(f0, cpu_env, vfp_reg_offset(true, vn));
+ if (delta_m) {
+ vm = ((vm + delta_m) & (bank_mask - 1)) | (vm & bank_mask);
+ tcg_gen_ld_f64(f1, cpu_env, vfp_reg_offset(true, vm));
+ }
+ }
+
+ tcg_temp_free_i64(f0);
+ tcg_temp_free_i64(f1);
+ tcg_temp_free_i64(fd);
+ tcg_temp_free_ptr(fpst);
+
+ return true;
+}
+
+static void gen_VMLA_sp(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst)
+{
+ /* Note that order of inputs to the add matters for NaNs */
+ TCGv_i32 tmp = tcg_temp_new_i32();
+
+ gen_helper_vfp_muls(tmp, vn, vm, fpst);
+ gen_helper_vfp_adds(vd, vd, tmp, fpst);
+ tcg_temp_free_i32(tmp);
+}
+
+static bool trans_VMLA_sp(DisasContext *s, arg_VMLA_sp *a)
+{
+ return do_vfp_3op_sp(s, gen_VMLA_sp, a->vd, a->vn, a->vm, true);
+}
+
+static void gen_VMLA_dp(TCGv_i64 vd, TCGv_i64 vn, TCGv_i64 vm, TCGv_ptr fpst)
+{
+ /* Note that order of inputs to the add matters for NaNs */
+ TCGv_i64 tmp = tcg_temp_new_i64();
+
+ gen_helper_vfp_muld(tmp, vn, vm, fpst);
+ gen_helper_vfp_addd(vd, vd, tmp, fpst);
+ tcg_temp_free_i64(tmp);
+}
+
+static bool trans_VMLA_dp(DisasContext *s, arg_VMLA_dp *a)
+{
+ return do_vfp_3op_dp(s, gen_VMLA_dp, a->vd, a->vn, a->vm, true);
+}
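
The do_vfp_3op_sp()/do_vfp_3op_dp() plumbing above is intended to be reused as the remaining 3-operand instructions are converted one by one. A hedged sketch of how a later conversion could plug in (VMUL is used as the example; the VMUL_sp pattern, arg_VMUL_sp type and trans_VMUL_sp hook belong to a hypothetical follow-up patch, not to this one):

/*
 * Sketch only: a subsequent 3-operand conversion (VMUL in this example)
 * would reuse do_vfp_3op_sp(). VMUL does not read the old value of vd,
 * so reads_vd is false and the existing gen_helper_vfp_muls can be
 * passed straight through as the VFPGen3OpSPFn callback.
 */
static bool trans_VMUL_sp(DisasContext *s, arg_VMUL_sp *a)
{
    return do_vfp_3op_sp(s, gen_helper_vfp_muls, a->vd, a->vn, a->vm, false);
}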
diff --git a/target/arm/translate.c b/target/arm/translate.c
index 3e66048a42..6f0daddbbc 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -3123,6 +3123,14 @@ static int disas_vfp_insn(DisasContext *s, uint32_t insn)
op = ((insn >> 20) & 8) | ((insn >> 19) & 6) | ((insn >> 6) & 1);
rn = VFP_SREG_N(insn);
+ switch (op) {
+ case 0:
+ /* Already handled by decodetree */
+ return 1;
+ default:
+ break;
+ }
+
if (op == 15) {
/* rn is opcode, encoded as per VFP_SREG_N. */
switch (rn) {
@@ -3302,12 +3310,6 @@ static int disas_vfp_insn(DisasContext *s, uint32_t insn)
for (;;) {
/* Perform the calculation. */
switch (op) {
- case 0: /* VMLA: fd + (fn * fm) */
- /* Note that order of inputs to the add matters for NaNs */
- gen_vfp_F1_mul(dp);
- gen_mov_F0_vreg(dp, rd);
- gen_vfp_add(dp);
- break;
case 1: /* VMLS: fd + -(fn * fm) */
gen_vfp_mul(dp);
gen_vfp_F1_neg(dp);
diff --git a/target/arm/vfp.decode b/target/arm/vfp.decode
index 66f2ccc49b..8766a8dc7a 100644
--- a/target/arm/vfp.decode
+++ b/target/arm/vfp.decode
@@ -83,3 +83,9 @@ VLDM_VSTM_sp ---- 1101 0.1 l:1 rn:4 .... 1010 imm:8 \
vd=%vd_sp p=1 u=0 w=1
VLDM_VSTM_dp ---- 1101 0.1 l:1 rn:4 .... 1011 imm:8 \
vd=%vd_dp p=1 u=0 w=1
+
+# 3-register VFP data-processing; bits [23,21:20,6] identify the operation.
+VMLA_sp ---- 1110 0.00 .... .... 1010 .0.0 .... \
+             vm=%vm_sp vn=%vn_sp vd=%vd_sp
+VMLA_dp ---- 1110 0.00 .... .... 1011 .0.0 .... \
+             vm=%vm_dp vn=%vn_dp vd=%vd_dp
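
For reference, the %vd_sp/%vn_sp/%vm_sp specifiers used by these patterns concatenate the four-bit register field with its single D/N/M extension bit (Sd = Vd:D and so on). A standalone sketch of the equivalent extraction (the function name is made up for illustration; decodetree generates this kind of code itself):

#include <stdint.h>

/*
 * Illustration only: how the single-precision register numbers matched by
 * the VMLA_sp pattern are assembled from the instruction word, mirroring
 * the %vd_sp, %vn_sp and %vm_sp field definitions earlier in vfp.decode.
 */
static void vmla_sp_regs(uint32_t insn, int *vd, int *vn, int *vm)
{
    *vd = (((insn >> 12) & 0xf) << 1) | ((insn >> 22) & 1); /* Vd:D */
    *vn = (((insn >> 16) & 0xf) << 1) | ((insn >> 7) & 1);  /* Vn:N */
    *vm = (((insn >> 0) & 0xf) << 1) | ((insn >> 5) & 1);   /* Vm:M */
}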