add ldp/stp vector forms

The ldp/stp decode wasn't checking the vector (V) bit, so an
instruction such as LDP d8,d9,[sp,#96] was treated as the integer
form and we wrote the int regs by mistake.

Add support for the vector forms of ldp/stp.

Note to Alex: we really should fold the load and store support
together better; most of the decode is the same at the bottom
level. Factoring out the 'if vector do_fp_st else do_gpr_st'
logic, as the SUSE patches do, would possibly also be a good plan.

I've left some XXX notes about updating comments in this one.

We should also check the semantics for what happens if the
second load of the pair faults -- currently we update the
first register before doing the second load, but I have a
feeling maybe we should not.
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index da6c742..6e1acff 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -1259,6 +1259,9 @@
    s: 0 -> unsigned, 1 -> signed
  idx: 001 -> post-index, 011 -> pre-index, 010 -> signed off
 
+XXX update: supports V=1
+XXX diagram above is wrong, bits 31:30 are opc
+
 */
 static void handle_gpr_ldp(DisasContext *s, uint32_t insn)
 {
@@ -1267,10 +1270,10 @@
     int rt2 = extract32(insn, 10, 5);
     int64_t offset = sextract32(insn, 15, 7);
     int idx = extract32(insn, 23, 3);
-    int is_signed = extract32(insn, 30, 1);
-    int sf = extract32(insn, 31, 1);
-
-    int size = sf?3:2;
+    bool is_signed = false;
+    bool is_vector = extract32(insn, 26, 1);
+    int opc = extract32(insn, 30, 2);
+    int size;
     bool postindex = true;
     bool wback = false;
 
@@ -1278,6 +1281,17 @@
     TCGv_i64 tcg_rt2 = cpu_reg(s, rt2);
     TCGv_i64 tcg_addr = tcg_temp_new_i64();
 
+    if (opc == 3) {
+        unallocated_encoding(s);
+        return;
+    }
+    if (is_vector) {
+        size = 2 + opc;
+    } else {
+        is_signed = opc & 1;
+        size = 2 + extract32(opc, 1, 1);
+    }
+
     switch (idx) {
     case 1: /* post-index */
         postindex = true;
@@ -1306,9 +1320,17 @@
         tcg_gen_addi_i64(tcg_addr, tcg_addr, offset);
     }
 
-    do_gpr_ld(s, tcg_rt, tcg_addr, size, is_signed);
+    if (is_vector) {
+        do_fp_ld(s, rt, tcg_addr, size);
+    } else {
+        do_gpr_ld(s, tcg_rt, tcg_addr, size, is_signed);
+    }
     tcg_gen_addi_i64(tcg_addr, tcg_addr, 1 << size);
-    do_gpr_ld(s, tcg_rt2, tcg_addr, size, is_signed);
+    if (is_vector) {
+        do_fp_ld(s, rt2, tcg_addr, size);
+    } else {
+        do_gpr_ld(s, tcg_rt2, tcg_addr, size, is_signed);
+    }
 
     // XXX - this could be more optimal?
     tcg_gen_subi_i64(tcg_addr, tcg_addr, 1 << size);
@@ -1337,6 +1359,8 @@
   Rt, Rt2 = general purpose registers to be stored
   Rn = general purpose register containing address
   imm7 = signed offset (multiple of 4 or 8 depending on size)
+
+XXX update comment, we accept V=1
  */
 static void handle_gpr_stp(DisasContext *s, uint32_t insn)
 {
@@ -1345,14 +1369,29 @@
     int rt2 = extract32(insn, 10, 5);
     int64_t offset = sextract32(insn, 15, 7);
     int type = extract32(insn, 23, 2);
-    int is_32bit = !extract32(insn, 30, 2);
+    bool is_vector = extract32(insn, 26, 1);
+    int opc = extract32(insn, 30, 2);
 
     TCGv_i64 tcg_rt = cpu_reg(s, rt);
     TCGv_i64 tcg_rt2 = cpu_reg(s, rt2);
     TCGv_i64 tcg_addr; /* calculated address */
     bool postindex = false;
     bool wback = false;
-    int size = is_32bit ? 2 : 3;
+    int size;
+
+    if (is_vector) {
+        if (opc == 3) {
+            unallocated_encoding(s);
+            return;
+        }
+        size = 2 + opc;
+    } else {
+        size = 2 + extract32(opc, 1, 1);
+        if (opc & 1) {
+            unallocated_encoding(s);
+            return;
+        }
+    }
 
     switch (type) {
     case 1: /* STP (post-index) */
@@ -1383,9 +1422,17 @@
         tcg_gen_addi_i64(tcg_addr, tcg_addr, offset);
     }
 
-    do_gpr_st(s, tcg_rt, tcg_addr, size);
+    if (is_vector) {
+        do_fp_st(s, rt, tcg_addr, size);
+    } else {
+        do_gpr_st(s, tcg_rt, tcg_addr, size);
+    }
     tcg_gen_addi_i64(tcg_addr, tcg_addr, 1 << size);
-    do_gpr_st(s, tcg_rt2, tcg_addr, size);
+    if (is_vector) {
+        do_fp_st(s, rt2, tcg_addr, size);
+    } else {
+        do_gpr_st(s, tcg_rt2, tcg_addr, size);
+    }
     // XXX - this could be more optimal?
     tcg_gen_subi_i64(tcg_addr, tcg_addr, 1 << size);
 
@@ -1428,6 +1475,8 @@
                10100111 -> pre-index
                10100101 -> signed offset
 
+XXX also handles vector forms, comment needs fixing
+
  */
 static void disas_ldst_pair(DisasContext *s, uint32_t insn)
 {