Add support for features up to Armv8.3.

This is a squashed merge of several separate patches that, combined, add
support for most of the architectural features introduced up to Armv8.3:
pointer authentication (PAuth and PACGA), the Armv8.1 atomic memory
operations, the RCpc load-acquire instructions, half-precision
floating-point and NEON support, FJCVTZS, the FCMLA/FCADD complex-number
instructions, and the ESB hint (RAS).
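
As an illustrative sketch (not part of the patch itself), the new
instructions can be emitted through the existing Assembler interface
roughly as follows. Assembler construction, buffer management, the
FinalizeCode() call and the CPUFeatures configuration are assumed to be
in place and are elided; the include path follows the in-tree layout.

    #include "aarch64/assembler-aarch64.h"

    using namespace vixl::aarch64;

    // Sketch only: emits a few of the instructions added by this patch.
    // `assm` is assumed to have kPAuth, kAtomics, kRCpc, kFP, kJSCVT,
    // kNEON and kNEONHalf enabled; otherwise the CPUHas() assertions
    // fire in debug builds.
    void EmitArmv83Samples(Assembler* assm) {
      // Pointer authentication (Armv8.3): sign LR against SP, then
      // authenticate it as part of the return.
      assm->paciasp();
      assm->retaa();

      // Atomic memory operations (Armv8.1): add x0 to [x2] with
      // acquire-release semantics; the old value is returned in x1.
      assm->ldaddal(x0, x1, MemOperand(x2));

      // RCpc (Armv8.3): load-acquire with the weaker RCpc ordering.
      assm->ldapr(x1, MemOperand(x2));

      // FJCVTZS (Armv8.3): JavaScript-style double-to-int32 conversion.
      assm->fjcvtzs(w0, d0);

      // Half-precision NEON arithmetic (Armv8.2).
      assm->fadd(v0.V8H(), v1.V8H(), v2.V8H());
    }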

Change-Id: Ia67f25fb9b82d5968120b0b144bd232e1898dc90
diff --git a/src/aarch64/assembler-aarch64.cc b/src/aarch64/assembler-aarch64.cc
index 4617fba..937809b 100644
--- a/src/aarch64/assembler-aarch64.cc
+++ b/src/aarch64/assembler-aarch64.cc
@@ -195,6 +195,66 @@
 }
 
 
+void Assembler::braaz(const Register& xn) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kPAuth));
+  VIXL_ASSERT(xn.Is64Bits());
+  Emit(BRAAZ | Rn(xn) | Rd_mask);
+}
+
+void Assembler::brabz(const Register& xn) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kPAuth));
+  VIXL_ASSERT(xn.Is64Bits());
+  Emit(BRABZ | Rn(xn) | Rd_mask);
+}
+
+void Assembler::blraaz(const Register& xn) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kPAuth));
+  VIXL_ASSERT(xn.Is64Bits());
+  Emit(BLRAAZ | Rn(xn) | Rd_mask);
+}
+
+void Assembler::blrabz(const Register& xn) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kPAuth));
+  VIXL_ASSERT(xn.Is64Bits());
+  Emit(BLRABZ | Rn(xn) | Rd_mask);
+}
+
+void Assembler::retaa() {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kPAuth));
+  Emit(RETAA | Rn_mask | Rd_mask);
+}
+
+void Assembler::retab() {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kPAuth));
+  Emit(RETAB | Rn_mask | Rd_mask);
+}
+
+// The Arm ARM names the register Xm but encodes it in the Xd bitfield.
+void Assembler::braa(const Register& xn, const Register& xm) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kPAuth));
+  VIXL_ASSERT(xn.Is64Bits() && xm.Is64Bits());
+  Emit(BRAA | Rn(xn) | RdSP(xm));
+}
+
+void Assembler::brab(const Register& xn, const Register& xm) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kPAuth));
+  VIXL_ASSERT(xn.Is64Bits() && xm.Is64Bits());
+  Emit(BRAB | Rn(xn) | RdSP(xm));
+}
+
+void Assembler::blraa(const Register& xn, const Register& xm) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kPAuth));
+  VIXL_ASSERT(xn.Is64Bits() && xm.Is64Bits());
+  Emit(BLRAA | Rn(xn) | RdSP(xm));
+}
+
+void Assembler::blrab(const Register& xn, const Register& xm) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kPAuth));
+  VIXL_ASSERT(xn.Is64Bits() && xm.Is64Bits());
+  Emit(BLRAB | Rn(xn) | RdSP(xm));
+}
+
+
 void Assembler::b(int64_t imm26) { Emit(B | ImmUncondBranch(imm26)); }
 
 
@@ -959,6 +1019,60 @@
   DataProcessing1Source(rd, rn, CLS);
 }
 
+#define PAUTH_VARIATIONS(V) \
+  V(paci, PACI)             \
+  V(pacd, PACD)             \
+  V(auti, AUTI)             \
+  V(autd, AUTD)
+
+#define DEFINE_ASM_FUNCS(PRE, OP)                                  \
+  void Assembler::PRE##a(const Register& xd, const Register& xn) { \
+    VIXL_ASSERT(CPUHas(CPUFeatures::kPAuth));                      \
+    VIXL_ASSERT(xd.Is64Bits() && xn.Is64Bits());                   \
+    Emit(SF(xd) | OP##A | Rd(xd) | RnSP(xn));                      \
+  }                                                                \
+                                                                   \
+  void Assembler::PRE##za(const Register& xd) {                    \
+    VIXL_ASSERT(CPUHas(CPUFeatures::kPAuth));                      \
+    VIXL_ASSERT(xd.Is64Bits());                                    \
+    Emit(SF(xd) | OP##ZA | Rd(xd));                                \
+  }                                                                \
+                                                                   \
+  void Assembler::PRE##b(const Register& xd, const Register& xn) { \
+    VIXL_ASSERT(CPUHas(CPUFeatures::kPAuth));                      \
+    VIXL_ASSERT(xd.Is64Bits() && xn.Is64Bits());                   \
+    Emit(SF(xd) | OP##B | Rd(xd) | RnSP(xn));                      \
+  }                                                                \
+                                                                   \
+  void Assembler::PRE##zb(const Register& xd) {                    \
+    VIXL_ASSERT(CPUHas(CPUFeatures::kPAuth));                      \
+    VIXL_ASSERT(xd.Is64Bits());                                    \
+    Emit(SF(xd) | OP##ZB | Rd(xd));                                \
+  }
+
+PAUTH_VARIATIONS(DEFINE_ASM_FUNCS)
+#undef DEFINE_ASM_FUNCS
+
+void Assembler::pacga(const Register& xd,
+                      const Register& xn,
+                      const Register& xm) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kPAuth, CPUFeatures::kPAuthGeneric));
+  VIXL_ASSERT(xd.Is64Bits() && xn.Is64Bits() && xm.Is64Bits());
+  Emit(SF(xd) | PACGA | Rd(xd) | Rn(xn) | RmSP(xm));
+}
+
+void Assembler::xpaci(const Register& xd) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kPAuth));
+  VIXL_ASSERT(xd.Is64Bits());
+  Emit(SF(xd) | XPACI | Rd(xd));
+}
+
+void Assembler::xpacd(const Register& xd) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kPAuth));
+  VIXL_ASSERT(xd.Is64Bits());
+  Emit(SF(xd) | XPACD | Rd(xd));
+}
+
 
 void Assembler::ldp(const CPURegister& rt,
                     const CPURegister& rt2,
@@ -1524,6 +1638,96 @@
 COMPARE_AND_SWAP_PAIR_LIST(DEFINE_ASM_FUNC)
 #undef DEFINE_ASM_FUNC
 
+// These macros generate all the variations of the atomic memory operations,
+// e.g. ldadd, ldadda, ldaddb, staddl, etc.
+// For a full list of the methods with comments, see the assembler header file.
+
+// clang-format off
+#define ATOMIC_MEMORY_SIMPLE_OPERATION_LIST(V, DEF) \
+  V(DEF, add,  LDADD)                               \
+  V(DEF, clr,  LDCLR)                               \
+  V(DEF, eor,  LDEOR)                               \
+  V(DEF, set,  LDSET)                               \
+  V(DEF, smax, LDSMAX)                              \
+  V(DEF, smin, LDSMIN)                              \
+  V(DEF, umax, LDUMAX)                              \
+  V(DEF, umin, LDUMIN)
+
+#define ATOMIC_MEMORY_STORE_MODES(V, NAME, OP) \
+  V(NAME,     OP##_x,   OP##_w)                \
+  V(NAME##l,  OP##L_x,  OP##L_w)               \
+  V(NAME##b,  OP##B,    OP##B)                 \
+  V(NAME##lb, OP##LB,   OP##LB)                \
+  V(NAME##h,  OP##H,    OP##H)                 \
+  V(NAME##lh, OP##LH,   OP##LH)
+
+#define ATOMIC_MEMORY_LOAD_MODES(V, NAME, OP) \
+  ATOMIC_MEMORY_STORE_MODES(V, NAME, OP)      \
+  V(NAME##a,   OP##A_x,  OP##A_w)             \
+  V(NAME##al,  OP##AL_x, OP##AL_w)            \
+  V(NAME##ab,  OP##AB,   OP##AB)              \
+  V(NAME##alb, OP##ALB,  OP##ALB)             \
+  V(NAME##ah,  OP##AH,   OP##AH)              \
+  V(NAME##alh, OP##ALH,  OP##ALH)
+// clang-format on
+
+#define DEFINE_ASM_LOAD_FUNC(FN, OP_X, OP_W)                        \
+  void Assembler::ld##FN(const Register& rs,                        \
+                         const Register& rt,                        \
+                         const MemOperand& src) {                   \
+    VIXL_ASSERT(CPUHas(CPUFeatures::kAtomics));                     \
+    VIXL_ASSERT(src.IsImmediateOffset() && (src.GetOffset() == 0)); \
+    AtomicMemoryOp op = rt.Is64Bits() ? OP_X : OP_W;                \
+    Emit(op | Rs(rs) | Rt(rt) | RnSP(src.GetBaseRegister()));       \
+  }
+#define DEFINE_ASM_STORE_FUNC(FN, OP_X, OP_W)                         \
+  void Assembler::st##FN(const Register& rs, const MemOperand& src) { \
+    VIXL_ASSERT(CPUHas(CPUFeatures::kAtomics));                       \
+    ld##FN(rs, AppropriateZeroRegFor(rs), src);                       \
+  }
+
+ATOMIC_MEMORY_SIMPLE_OPERATION_LIST(ATOMIC_MEMORY_LOAD_MODES,
+                                    DEFINE_ASM_LOAD_FUNC)
+ATOMIC_MEMORY_SIMPLE_OPERATION_LIST(ATOMIC_MEMORY_STORE_MODES,
+                                    DEFINE_ASM_STORE_FUNC)
+
+#define DEFINE_ASM_SWP_FUNC(FN, OP_X, OP_W)                         \
+  void Assembler::FN(const Register& rs,                            \
+                     const Register& rt,                            \
+                     const MemOperand& src) {                       \
+    VIXL_ASSERT(CPUHas(CPUFeatures::kAtomics));                     \
+    VIXL_ASSERT(src.IsImmediateOffset() && (src.GetOffset() == 0)); \
+    AtomicMemoryOp op = rt.Is64Bits() ? OP_X : OP_W;                \
+    Emit(op | Rs(rs) | Rt(rt) | RnSP(src.GetBaseRegister()));       \
+  }
+
+ATOMIC_MEMORY_LOAD_MODES(DEFINE_ASM_SWP_FUNC, swp, SWP)
+
+#undef DEFINE_ASM_LOAD_FUNC
+#undef DEFINE_ASM_STORE_FUNC
+#undef DEFINE_ASM_SWP_FUNC
+
+
+void Assembler::ldaprb(const Register& rt, const MemOperand& src) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kRCpc));
+  VIXL_ASSERT(src.IsImmediateOffset() && (src.GetOffset() == 0));
+  AtomicMemoryOp op = LDAPRB;
+  Emit(op | Rs(xzr) | Rt(rt) | RnSP(src.GetBaseRegister()));
+}
+
+void Assembler::ldaprh(const Register& rt, const MemOperand& src) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kRCpc));
+  VIXL_ASSERT(src.IsImmediateOffset() && (src.GetOffset() == 0));
+  AtomicMemoryOp op = LDAPRH;
+  Emit(op | Rs(xzr) | Rt(rt) | RnSP(src.GetBaseRegister()));
+}
+
+void Assembler::ldapr(const Register& rt, const MemOperand& src) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kRCpc));
+  VIXL_ASSERT(src.IsImmediateOffset() && (src.GetOffset() == 0));
+  AtomicMemoryOp op = rt.Is64Bits() ? LDAPR_x : LDAPR_w;
+  Emit(op | Rs(xzr) | Rt(rt) | RnSP(src.GetBaseRegister()));
+}
 
 void Assembler::prfm(PrefetchOperation op,
                      const MemOperand& address,
@@ -2219,6 +2423,71 @@
   }
 }
 
+void Assembler::xpaclri() {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kPAuth));
+  Emit(XPACLRI);
+}
+
+void Assembler::pacia1716() {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kPAuth));
+  Emit(PACIA1716);
+}
+
+void Assembler::pacib1716() {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kPAuth));
+  Emit(PACIB1716);
+}
+
+void Assembler::autia1716() {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kPAuth));
+  Emit(AUTIA1716);
+}
+
+void Assembler::autib1716() {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kPAuth));
+  Emit(AUTIB1716);
+}
+
+void Assembler::paciaz() {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kPAuth));
+  Emit(PACIAZ);
+}
+
+void Assembler::pacibz() {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kPAuth));
+  Emit(PACIBZ);
+}
+
+void Assembler::autiaz() {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kPAuth));
+  Emit(AUTIAZ);
+}
+
+void Assembler::autibz() {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kPAuth));
+  Emit(AUTIBZ);
+}
+
+void Assembler::paciasp() {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kPAuth));
+  Emit(PACIASP);
+}
+
+void Assembler::pacibsp() {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kPAuth));
+  Emit(PACIBSP);
+}
+
+void Assembler::autiasp() {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kPAuth));
+  Emit(AUTIASP);
+}
+
+void Assembler::autibsp() {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kPAuth));
+  Emit(AUTIBSP);
+}
+
 
 void Assembler::mvn(const Register& rd, const Operand& operand) {
   orn(rd, AppropriateZeroRegFor(rd), operand);
@@ -2254,6 +2523,11 @@
   Emit(ISB | ImmBarrierDomain(FullSystem) | ImmBarrierType(BarrierAll));
 }
 
+void Assembler::esb() {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kRAS));
+  hint(ESB);
+}
+
 void Assembler::csdb() { hint(CSDB); }
 
 void Assembler::fmov(const VRegister& vd, double imm) {
@@ -2288,18 +2562,17 @@
 }
 
 
-void Assembler::fmov(const VRegister& vd, F16 imm) {
+void Assembler::fmov(const VRegister& vd, Float16 imm) {
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP));
-  uint16_t rawbits = imm.ToRawbits();
   if (vd.IsScalar()) {
     VIXL_ASSERT(CPUHas(CPUFeatures::kFPHalf));
     VIXL_ASSERT(vd.Is1H());
-    Emit(FMOV_h_imm | Rd(vd) | ImmFP16(rawbits));
+    Emit(FMOV_h_imm | Rd(vd) | ImmFP16(imm));
   } else {
     VIXL_ASSERT(CPUHas(CPUFeatures::kNEON, CPUFeatures::kNEONHalf));
     VIXL_ASSERT(vd.Is4H() | vd.Is8H());
     Instr q = vd.Is8H() ? NEON_Q : 0;
-    uint32_t encoded_imm = FP16ToImm8(rawbits);
+    uint32_t encoded_imm = FP16ToImm8(imm);
     Emit(q | NEONModifiedImmediate_FMOV | ImmNEONabcdefgh(encoded_imm) |
          NEONCmode(0xf) | Rd(vd));
   }
@@ -2378,7 +2651,17 @@
                       const VRegister& vm,
                       const VRegister& va) {
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP));
-  FPDataProcessing3Source(vd, vn, vm, va, vd.Is1S() ? FMADD_s : FMADD_d);
+  FPDataProcessing3SourceOp op;
+  if (vd.Is1H()) {
+    VIXL_ASSERT(CPUHas(CPUFeatures::kFPHalf));
+    op = FMADD_h;
+  } else if (vd.Is1S()) {
+    op = FMADD_s;
+  } else {
+    VIXL_ASSERT(vd.Is1D());
+    op = FMADD_d;
+  }
+  FPDataProcessing3Source(vd, vn, vm, va, op);
 }
 
 
@@ -2387,7 +2670,17 @@
                       const VRegister& vm,
                       const VRegister& va) {
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP));
-  FPDataProcessing3Source(vd, vn, vm, va, vd.Is1S() ? FMSUB_s : FMSUB_d);
+  FPDataProcessing3SourceOp op;
+  if (vd.Is1H()) {
+    VIXL_ASSERT(CPUHas(CPUFeatures::kFPHalf));
+    op = FMSUB_h;
+  } else if (vd.Is1S()) {
+    op = FMSUB_s;
+  } else {
+    VIXL_ASSERT(vd.Is1D());
+    op = FMSUB_d;
+  }
+  FPDataProcessing3Source(vd, vn, vm, va, op);
 }
 
 
@@ -2396,7 +2689,17 @@
                        const VRegister& vm,
                        const VRegister& va) {
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP));
-  FPDataProcessing3Source(vd, vn, vm, va, vd.Is1S() ? FNMADD_s : FNMADD_d);
+  FPDataProcessing3SourceOp op;
+  if (vd.Is1H()) {
+    VIXL_ASSERT(CPUHas(CPUFeatures::kFPHalf));
+    op = FNMADD_h;
+  } else if (vd.Is1S()) {
+    op = FNMADD_s;
+  } else {
+    VIXL_ASSERT(vd.Is1D());
+    op = FNMADD_d;
+  }
+  FPDataProcessing3Source(vd, vn, vm, va, op);
 }
 
 
@@ -2405,7 +2708,17 @@
                        const VRegister& vm,
                        const VRegister& va) {
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP));
-  FPDataProcessing3Source(vd, vn, vm, va, vd.Is1S() ? FNMSUB_s : FNMSUB_d);
+  FPDataProcessing3SourceOp op;
+  if (vd.Is1H()) {
+    VIXL_ASSERT(CPUHas(CPUFeatures::kFPHalf));
+    op = FNMSUB_h;
+  } else if (vd.Is1S()) {
+    op = FNMSUB_s;
+  } else {
+    VIXL_ASSERT(vd.Is1D());
+    op = FNMSUB_d;
+  }
+  FPDataProcessing3Source(vd, vn, vm, va, op);
 }
 
 
@@ -2414,7 +2727,16 @@
                       const VRegister& vm) {
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP));
   VIXL_ASSERT(AreSameSizeAndType(vd, vn, vm));
-  Instr op = vd.Is1S() ? FNMUL_s : FNMUL_d;
+  Instr op;
+  if (vd.Is1H()) {
+    VIXL_ASSERT(CPUHas(CPUFeatures::kFPHalf));
+    op = FNMUL_h;
+  } else if (vd.Is1S()) {
+    op = FNMUL_s;
+  } else {
+    VIXL_ASSERT(vd.Is1D());
+    op = FNMUL_d;
+  }
   Emit(FPType(vd) | op | Rm(vm) | Rn(vn) | Rd(vd));
 }
 
@@ -2427,7 +2749,7 @@
   // value of +0.0, we don't need to check for -0.0 because the sign of 0.0
   // doesn't affect the result of the comparison.
   VIXL_ASSERT(value == 0.0);
-  VIXL_ASSERT(vn.Is1S() || vn.Is1D());
+  VIXL_ASSERT(vn.Is1H() || vn.Is1S() || vn.Is1D());
   Instr op = (trap == EnableTrap) ? FCMPE_zero : FCMP_zero;
   Emit(FPType(vn) | op | Rn(vn));
 }
@@ -2436,7 +2758,7 @@
 void Assembler::FPCompareMacro(const VRegister& vn,
                                const VRegister& vm,
                                FPTrapFlags trap) {
-  VIXL_ASSERT(vn.Is1S() || vn.Is1D());
+  VIXL_ASSERT(vn.Is1H() || vn.Is1S() || vn.Is1D());
   VIXL_ASSERT(vn.IsSameSizeAndType(vm));
   Instr op = (trap == EnableTrap) ? FCMPE : FCMP;
   Emit(FPType(vn) | op | Rm(vm) | Rn(vn));
@@ -2445,24 +2767,28 @@
 
 void Assembler::fcmp(const VRegister& vn, const VRegister& vm) {
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP));
+  if (vn.Is1H()) VIXL_ASSERT(CPUHas(CPUFeatures::kFPHalf));
   FPCompareMacro(vn, vm, DisableTrap);
 }
 
 
 void Assembler::fcmpe(const VRegister& vn, const VRegister& vm) {
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP));
+  if (vn.Is1H()) VIXL_ASSERT(CPUHas(CPUFeatures::kFPHalf));
   FPCompareMacro(vn, vm, EnableTrap);
 }
 
 
 void Assembler::fcmp(const VRegister& vn, double value) {
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP));
+  if (vn.Is1H()) VIXL_ASSERT(CPUHas(CPUFeatures::kFPHalf));
   FPCompareMacro(vn, value, DisableTrap);
 }
 
 
 void Assembler::fcmpe(const VRegister& vn, double value) {
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP));
+  if (vn.Is1H()) VIXL_ASSERT(CPUHas(CPUFeatures::kFPHalf));
   FPCompareMacro(vn, value, EnableTrap);
 }
 
@@ -2472,7 +2798,7 @@
                                 StatusFlags nzcv,
                                 Condition cond,
                                 FPTrapFlags trap) {
-  VIXL_ASSERT(vn.Is1S() || vn.Is1D());
+  VIXL_ASSERT(vn.Is1H() || vn.Is1S() || vn.Is1D());
   VIXL_ASSERT(vn.IsSameSizeAndType(vm));
   Instr op = (trap == EnableTrap) ? FCCMPE : FCCMP;
   Emit(FPType(vn) | op | Rm(vm) | Cond(cond) | Rn(vn) | Nzcv(nzcv));
@@ -2483,6 +2809,7 @@
                       StatusFlags nzcv,
                       Condition cond) {
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP));
+  if (vn.Is1H()) VIXL_ASSERT(CPUHas(CPUFeatures::kFPHalf));
   FPCCompareMacro(vn, vm, nzcv, cond, DisableTrap);
 }
 
@@ -2492,6 +2819,7 @@
                        StatusFlags nzcv,
                        Condition cond) {
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP));
+  if (vn.Is1H()) VIXL_ASSERT(CPUHas(CPUFeatures::kFPHalf));
   FPCCompareMacro(vn, vm, nzcv, cond, EnableTrap);
 }
 
@@ -2501,7 +2829,8 @@
                       const VRegister& vm,
                       Condition cond) {
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP));
-  VIXL_ASSERT(vd.Is1S() || vd.Is1D());
+  if (vd.Is1H()) VIXL_ASSERT(CPUHas(CPUFeatures::kFPHalf));
+  VIXL_ASSERT(vd.Is1H() || vd.Is1S() || vd.Is1D());
   VIXL_ASSERT(AreSameFormat(vd, vn, vm));
   Emit(FPType(vd) | FCSEL | Rm(vm) | Cond(cond) | Rn(vn) | Rd(vd));
 }
@@ -2510,6 +2839,7 @@
 void Assembler::fcvt(const VRegister& vd, const VRegister& vn) {
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP));
   FPDataProcessing1SourceOp op;
+  // The half-precision variants belong to base FP, and do not require kFPHalf.
   if (vd.Is1D()) {
     VIXL_ASSERT(vn.Is1S() || vn.Is1H());
     op = vn.Is1S() ? FCVT_ds : FCVT_dh;
@@ -2528,6 +2858,7 @@
 void Assembler::fcvtl(const VRegister& vd, const VRegister& vn) {
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP, CPUFeatures::kNEON));
   VIXL_ASSERT((vd.Is4S() && vn.Is4H()) || (vd.Is2D() && vn.Is2S()));
+  // The half-precision variants belong to base FP, and do not require kFPHalf.
   Instr format = vd.Is2D() ? (1 << NEONSize_offset) : 0;
   Emit(format | NEON_FCVTL | Rn(vn) | Rd(vd));
 }
@@ -2536,6 +2867,7 @@
 void Assembler::fcvtl2(const VRegister& vd, const VRegister& vn) {
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP, CPUFeatures::kNEON));
   VIXL_ASSERT((vd.Is4S() && vn.Is8H()) || (vd.Is2D() && vn.Is4S()));
+  // The half-precision variants belong to base FP, and do not require kFPHalf.
   Instr format = vd.Is2D() ? (1 << NEONSize_offset) : 0;
   Emit(NEON_Q | format | NEON_FCVTL | Rn(vn) | Rd(vd));
 }
@@ -2544,6 +2876,7 @@
 void Assembler::fcvtn(const VRegister& vd, const VRegister& vn) {
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP, CPUFeatures::kNEON));
   VIXL_ASSERT((vn.Is4S() && vd.Is4H()) || (vn.Is2D() && vd.Is2S()));
+  // The half-precision variants belong to base FP, and do not require kFPHalf.
   Instr format = vn.Is2D() ? (1 << NEONSize_offset) : 0;
   Emit(format | NEON_FCVTN | Rn(vn) | Rd(vd));
 }
@@ -2552,6 +2885,7 @@
 void Assembler::fcvtn2(const VRegister& vd, const VRegister& vn) {
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP, CPUFeatures::kNEON));
   VIXL_ASSERT((vn.Is4S() && vd.Is8H()) || (vn.Is2D() && vd.Is4S()));
+  // The half-precision variants belong to base FP, and do not require kFPHalf.
   Instr format = vn.Is2D() ? (1 << NEONSize_offset) : 0;
   Emit(NEON_Q | format | NEON_FCVTN | Rn(vn) | Rd(vd));
 }
@@ -2577,6 +2911,12 @@
   Emit(NEON_Q | format | NEON_FCVTXN | Rn(vn) | Rd(vd));
 }
 
+void Assembler::fjcvtzs(const Register& rd, const VRegister& vn) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kFP, CPUFeatures::kJSCVT));
+  VIXL_ASSERT(rd.IsW() && vn.Is1D());
+  Emit(FJCVTZS | Rn(vn) | Rd(rd));
+}
+
 
 void Assembler::NEONFPConvertToInt(const Register& rd,
                                    const VRegister& vn,
@@ -2596,6 +2936,20 @@
 }
 
 
+void Assembler::NEONFP16ConvertToInt(const VRegister& vd,
+                                     const VRegister& vn,
+                                     Instr op) {
+  VIXL_ASSERT(AreSameFormat(vd, vn));
+  VIXL_ASSERT(vn.IsLaneSizeH());
+  if (vn.IsScalar()) {
+    op |= NEON_Q | NEONScalar;
+  } else if (vn.Is8H()) {
+    op |= NEON_Q;
+  }
+  Emit(op | Rn(vn) | Rd(vd));
+}
+
+
 #define NEON_FP2REGMISC_FCVT_LIST(V) \
   V(fcvtnu, NEON_FCVTNU, FCVTNU)     \
   V(fcvtns, NEON_FCVTNS, FCVTNS)     \
@@ -2609,12 +2963,17 @@
 #define DEFINE_ASM_FUNCS(FN, VEC_OP, SCA_OP)                     \
   void Assembler::FN(const Register& rd, const VRegister& vn) {  \
     VIXL_ASSERT(CPUHas(CPUFeatures::kFP));                       \
+    if (vn.IsH()) VIXL_ASSERT(CPUHas(CPUFeatures::kFPHalf));     \
     NEONFPConvertToInt(rd, vn, SCA_OP);                          \
   }                                                              \
   void Assembler::FN(const VRegister& vd, const VRegister& vn) { \
-    /* This form is a NEON scalar FP instruction. */             \
     VIXL_ASSERT(CPUHas(CPUFeatures::kFP, CPUFeatures::kNEON));   \
-    NEONFPConvertToInt(vd, vn, VEC_OP);                          \
+    if (vd.IsLaneSizeH()) {                                      \
+      VIXL_ASSERT(CPUHas(CPUFeatures::kNEONHalf));               \
+      NEONFP16ConvertToInt(vd, vn, VEC_OP##_H);                  \
+    } else {                                                     \
+      NEONFPConvertToInt(vd, vn, VEC_OP);                        \
+    }                                                            \
   }
 NEON_FP2REGMISC_FCVT_LIST(DEFINE_ASM_FUNCS)
 #undef DEFINE_ASM_FUNCS
@@ -2622,7 +2981,8 @@
 
 void Assembler::fcvtzs(const Register& rd, const VRegister& vn, int fbits) {
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP));
-  VIXL_ASSERT(vn.Is1S() || vn.Is1D());
+  if (vn.Is1H()) VIXL_ASSERT(CPUHas(CPUFeatures::kFPHalf));
+  VIXL_ASSERT(vn.Is1H() || vn.Is1S() || vn.Is1D());
   VIXL_ASSERT((fbits >= 0) && (fbits <= rd.GetSizeInBits()));
   if (fbits == 0) {
     Emit(SF(rd) | FPType(vn) | FCVTZS | Rn(vn) | Rd(rd));
@@ -2636,11 +2996,17 @@
 void Assembler::fcvtzs(const VRegister& vd, const VRegister& vn, int fbits) {
   // This form is a NEON scalar FP instruction.
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP, CPUFeatures::kNEON));
+  if (vn.IsLaneSizeH()) VIXL_ASSERT(CPUHas(CPUFeatures::kNEONHalf));
   VIXL_ASSERT(fbits >= 0);
   if (fbits == 0) {
-    NEONFP2RegMisc(vd, vn, NEON_FCVTZS);
+    if (vd.IsLaneSizeH()) {
+      NEONFP2RegMiscFP16(vd, vn, NEON_FCVTZS_H);
+    } else {
+      NEONFP2RegMisc(vd, vn, NEON_FCVTZS);
+    }
   } else {
-    VIXL_ASSERT(vd.Is1D() || vd.Is1S() || vd.Is2D() || vd.Is2S() || vd.Is4S());
+    VIXL_ASSERT(vd.Is1D() || vd.Is1S() || vd.Is2D() || vd.Is2S() || vd.Is4S() ||
+                vd.Is1H() || vd.Is4H() || vd.Is8H());
     NEONShiftRightImmediate(vd, vn, fbits, NEON_FCVTZS_imm);
   }
 }
@@ -2648,7 +3014,8 @@
 
 void Assembler::fcvtzu(const Register& rd, const VRegister& vn, int fbits) {
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP));
-  VIXL_ASSERT(vn.Is1S() || vn.Is1D());
+  if (vn.Is1H()) VIXL_ASSERT(CPUHas(CPUFeatures::kFPHalf));
+  VIXL_ASSERT(vn.Is1H() || vn.Is1S() || vn.Is1D());
   VIXL_ASSERT((fbits >= 0) && (fbits <= rd.GetSizeInBits()));
   if (fbits == 0) {
     Emit(SF(rd) | FPType(vn) | FCVTZU | Rn(vn) | Rd(rd));
@@ -2662,11 +3029,17 @@
 void Assembler::fcvtzu(const VRegister& vd, const VRegister& vn, int fbits) {
   // This form is a NEON scalar FP instruction.
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP, CPUFeatures::kNEON));
+  if (vn.IsLaneSizeH()) VIXL_ASSERT(CPUHas(CPUFeatures::kNEONHalf));
   VIXL_ASSERT(fbits >= 0);
   if (fbits == 0) {
-    NEONFP2RegMisc(vd, vn, NEON_FCVTZU);
+    if (vd.IsLaneSizeH()) {
+      NEONFP2RegMiscFP16(vd, vn, NEON_FCVTZU_H);
+    } else {
+      NEONFP2RegMisc(vd, vn, NEON_FCVTZU);
+    }
   } else {
-    VIXL_ASSERT(vd.Is1D() || vd.Is1S() || vd.Is2D() || vd.Is2S() || vd.Is4S());
+    VIXL_ASSERT(vd.Is1D() || vd.Is1S() || vd.Is2D() || vd.Is2S() || vd.Is4S() ||
+                vd.Is1H() || vd.Is4H() || vd.Is8H());
     NEONShiftRightImmediate(vd, vn, fbits, NEON_FCVTZU_imm);
   }
 }
@@ -2674,11 +3047,17 @@
 void Assembler::ucvtf(const VRegister& vd, const VRegister& vn, int fbits) {
   // This form is a NEON scalar FP instruction.
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP, CPUFeatures::kNEON));
+  if (vn.IsLaneSizeH()) VIXL_ASSERT(CPUHas(CPUFeatures::kNEONHalf));
   VIXL_ASSERT(fbits >= 0);
   if (fbits == 0) {
-    NEONFP2RegMisc(vd, vn, NEON_UCVTF);
+    if (vd.IsLaneSizeH()) {
+      NEONFP2RegMiscFP16(vd, vn, NEON_UCVTF_H);
+    } else {
+      NEONFP2RegMisc(vd, vn, NEON_UCVTF);
+    }
   } else {
-    VIXL_ASSERT(vd.Is1D() || vd.Is1S() || vd.Is2D() || vd.Is2S() || vd.Is4S());
+    VIXL_ASSERT(vd.Is1D() || vd.Is1S() || vd.Is2D() || vd.Is2S() || vd.Is4S() ||
+                vd.Is1H() || vd.Is4H() || vd.Is8H());
     NEONShiftRightImmediate(vd, vn, fbits, NEON_UCVTF_imm);
   }
 }
@@ -2686,11 +3065,17 @@
 void Assembler::scvtf(const VRegister& vd, const VRegister& vn, int fbits) {
   // This form is a NEON scalar FP instruction.
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP, CPUFeatures::kNEON));
+  if (vn.IsLaneSizeH()) VIXL_ASSERT(CPUHas(CPUFeatures::kNEONHalf));
   VIXL_ASSERT(fbits >= 0);
   if (fbits == 0) {
-    NEONFP2RegMisc(vd, vn, NEON_SCVTF);
+    if (vd.IsLaneSizeH()) {
+      NEONFP2RegMiscFP16(vd, vn, NEON_SCVTF_H);
+    } else {
+      NEONFP2RegMisc(vd, vn, NEON_SCVTF);
+    }
   } else {
-    VIXL_ASSERT(vd.Is1D() || vd.Is1S() || vd.Is2D() || vd.Is2S() || vd.Is4S());
+    VIXL_ASSERT(vd.Is1D() || vd.Is1S() || vd.Is2D() || vd.Is2S() || vd.Is4S() ||
+                vd.Is1H() || vd.Is4H() || vd.Is8H());
     NEONShiftRightImmediate(vd, vn, fbits, NEON_SCVTF_imm);
   }
 }
@@ -2698,7 +3083,8 @@
 
 void Assembler::scvtf(const VRegister& vd, const Register& rn, int fbits) {
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP));
-  VIXL_ASSERT(vd.Is1S() || vd.Is1D());
+  if (vd.Is1H()) VIXL_ASSERT(CPUHas(CPUFeatures::kFPHalf));
+  VIXL_ASSERT(vd.Is1H() || vd.Is1S() || vd.Is1D());
   VIXL_ASSERT(fbits >= 0);
   if (fbits == 0) {
     Emit(SF(rn) | FPType(vd) | SCVTF | Rn(rn) | Rd(vd));
@@ -2711,7 +3097,8 @@
 
 void Assembler::ucvtf(const VRegister& vd, const Register& rn, int fbits) {
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP));
-  VIXL_ASSERT(vd.Is1S() || vd.Is1D());
+  if (vd.Is1H()) VIXL_ASSERT(CPUHas(CPUFeatures::kFPHalf));
+  VIXL_ASSERT(vd.Is1H() || vd.Is1S() || vd.Is1D());
   VIXL_ASSERT(fbits >= 0);
   if (fbits == 0) {
     Emit(SF(rn) | FPType(vd) | UCVTF | Rn(rn) | Rd(vd));
@@ -2750,44 +3137,85 @@
 }
 
 
+void Assembler::NEON3SameFP16(const VRegister& vd,
+                              const VRegister& vn,
+                              const VRegister& vm,
+                              Instr op) {
+  VIXL_ASSERT(AreSameFormat(vd, vn, vm));
+  VIXL_ASSERT(vd.GetLaneSizeInBytes() == kHRegSizeInBytes);
+  if (vd.Is8H()) op |= NEON_Q;
+  Emit(op | Rm(vm) | Rn(vn) | Rd(vd));
+}
+
+
 // clang-format off
-#define NEON_FP2REGMISC_LIST(V)                 \
-  V(fabs,    NEON_FABS,    FABS)                \
-  V(fneg,    NEON_FNEG,    FNEG)                \
-  V(fsqrt,   NEON_FSQRT,   FSQRT)               \
-  V(frintn,  NEON_FRINTN,  FRINTN)              \
-  V(frinta,  NEON_FRINTA,  FRINTA)              \
-  V(frintp,  NEON_FRINTP,  FRINTP)              \
-  V(frintm,  NEON_FRINTM,  FRINTM)              \
-  V(frintx,  NEON_FRINTX,  FRINTX)              \
-  V(frintz,  NEON_FRINTZ,  FRINTZ)              \
-  V(frinti,  NEON_FRINTI,  FRINTI)              \
-  V(frsqrte, NEON_FRSQRTE, NEON_FRSQRTE_scalar) \
-  V(frecpe,  NEON_FRECPE,  NEON_FRECPE_scalar )
+#define NEON_FP2REGMISC_LIST(V)                                        \
+  V(fabs,    NEON_FABS,    FABS,                FABS_h)                \
+  V(fneg,    NEON_FNEG,    FNEG,                FNEG_h)                \
+  V(fsqrt,   NEON_FSQRT,   FSQRT,               FSQRT_h)               \
+  V(frintn,  NEON_FRINTN,  FRINTN,              FRINTN_h)              \
+  V(frinta,  NEON_FRINTA,  FRINTA,              FRINTA_h)              \
+  V(frintp,  NEON_FRINTP,  FRINTP,              FRINTP_h)              \
+  V(frintm,  NEON_FRINTM,  FRINTM,              FRINTM_h)              \
+  V(frintx,  NEON_FRINTX,  FRINTX,              FRINTX_h)              \
+  V(frintz,  NEON_FRINTZ,  FRINTZ,              FRINTZ_h)              \
+  V(frinti,  NEON_FRINTI,  FRINTI,              FRINTI_h)              \
+  V(frsqrte, NEON_FRSQRTE, NEON_FRSQRTE_scalar, NEON_FRSQRTE_H_scalar) \
+  V(frecpe,  NEON_FRECPE,  NEON_FRECPE_scalar,  NEON_FRECPE_H_scalar)
 // clang-format on
 
-
-#define DEFINE_ASM_FUNC(FN, VEC_OP, SCA_OP)                                \
-  void Assembler::FN(const VRegister& vd, const VRegister& vn) {           \
-    VIXL_ASSERT(CPUHas(CPUFeatures::kFP));                                 \
-    Instr op;                                                              \
-    if (vd.IsScalar()) {                                                   \
-      if ((SCA_OP & NEONScalar2RegMiscFMask) == NEONScalar2RegMiscFixed) { \
-        VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));                           \
-      }                                                                    \
-      VIXL_ASSERT(vd.Is1S() || vd.Is1D());                                 \
-      op = SCA_OP;                                                         \
-    } else {                                                               \
-      VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));                             \
-      VIXL_ASSERT(vd.Is2S() || vd.Is2D() || vd.Is4S());                    \
-      op = VEC_OP;                                                         \
-    }                                                                      \
-    NEONFP2RegMisc(vd, vn, op);                                            \
+#define DEFINE_ASM_FUNC(FN, VEC_OP, SCA_OP, SCA_OP_H)                        \
+  void Assembler::FN(const VRegister& vd, const VRegister& vn) {             \
+    VIXL_ASSERT(CPUHas(CPUFeatures::kFP));                                   \
+    Instr op;                                                                \
+    if (vd.IsScalar()) {                                                     \
+      if (vd.Is1H()) {                                                       \
+        if ((SCA_OP_H & NEONScalar2RegMiscFP16FMask) ==                      \
+            NEONScalar2RegMiscFP16Fixed) {                                   \
+          VIXL_ASSERT(CPUHas(CPUFeatures::kNEON, CPUFeatures::kNEONHalf));   \
+        } else {                                                             \
+          VIXL_ASSERT(CPUHas(CPUFeatures::kFPHalf));                         \
+        }                                                                    \
+        op = SCA_OP_H;                                                       \
+      } else {                                                               \
+        if ((SCA_OP & NEONScalar2RegMiscFMask) == NEONScalar2RegMiscFixed) { \
+          VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));                           \
+        }                                                                    \
+        VIXL_ASSERT(vd.Is1S() || vd.Is1D());                                 \
+        op = SCA_OP;                                                         \
+      }                                                                      \
+    } else {                                                                 \
+      VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));                               \
+      VIXL_ASSERT(vd.Is4H() || vd.Is8H() || vd.Is2S() || vd.Is2D() ||        \
+                  vd.Is4S());                                                \
+      if (vd.IsLaneSizeH()) {                                                \
+        VIXL_ASSERT(CPUHas(CPUFeatures::kNEONHalf));                         \
+        op = VEC_OP##_H;                                                     \
+        if (vd.Is8H()) {                                                     \
+          op |= NEON_Q;                                                      \
+        }                                                                    \
+      } else {                                                               \
+        op = VEC_OP;                                                         \
+      }                                                                      \
+    }                                                                        \
+    if (vd.IsLaneSizeH()) {                                                  \
+      NEONFP2RegMiscFP16(vd, vn, op);                                        \
+    } else {                                                                 \
+      NEONFP2RegMisc(vd, vn, op);                                            \
+    }                                                                        \
   }
 NEON_FP2REGMISC_LIST(DEFINE_ASM_FUNC)
 #undef DEFINE_ASM_FUNC
 
 
+void Assembler::NEONFP2RegMiscFP16(const VRegister& vd,
+                                   const VRegister& vn,
+                                   Instr op) {
+  VIXL_ASSERT(AreSameFormat(vd, vn));
+  Emit(op | Rn(vn) | Rd(vd));
+}
+
+
 void Assembler::NEONFP2RegMisc(const VRegister& vd,
                                const VRegister& vn,
                                Instr op) {
@@ -2891,33 +3319,81 @@
 }
 
 
+void Assembler::NEONFP2RegMiscFP16(const VRegister& vd,
+                                   const VRegister& vn,
+                                   NEON2RegMiscFP16Op vop,
+                                   double value) {
+  VIXL_ASSERT(AreSameFormat(vd, vn));
+  VIXL_ASSERT(value == 0.0);
+  USE(value);
+
+  Instr op = vop;
+  if (vd.IsScalar()) {
+    VIXL_ASSERT(vd.Is1H());
+    op |= NEON_Q | NEONScalar;
+  } else {
+    VIXL_ASSERT(vd.Is4H() || vd.Is8H());
+    if (vd.Is8H()) {
+      op |= NEON_Q;
+    }
+  }
+
+  Emit(op | Rn(vn) | Rd(vd));
+}
+
+
 void Assembler::fcmeq(const VRegister& vd, const VRegister& vn, double value) {
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP, CPUFeatures::kNEON));
-  NEONFP2RegMisc(vd, vn, NEON_FCMEQ_zero, value);
+  if (vd.IsLaneSizeH()) {
+    VIXL_ASSERT(CPUHas(CPUFeatures::kNEONHalf));
+    NEONFP2RegMiscFP16(vd, vn, NEON_FCMEQ_H_zero, value);
+  } else {
+    NEONFP2RegMisc(vd, vn, NEON_FCMEQ_zero, value);
+  }
 }
 
 
 void Assembler::fcmge(const VRegister& vd, const VRegister& vn, double value) {
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP, CPUFeatures::kNEON));
-  NEONFP2RegMisc(vd, vn, NEON_FCMGE_zero, value);
+  if (vd.IsLaneSizeH()) {
+    VIXL_ASSERT(CPUHas(CPUFeatures::kNEONHalf));
+    NEONFP2RegMiscFP16(vd, vn, NEON_FCMGE_H_zero, value);
+  } else {
+    NEONFP2RegMisc(vd, vn, NEON_FCMGE_zero, value);
+  }
 }
 
 
 void Assembler::fcmgt(const VRegister& vd, const VRegister& vn, double value) {
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP, CPUFeatures::kNEON));
-  NEONFP2RegMisc(vd, vn, NEON_FCMGT_zero, value);
+  if (vd.IsLaneSizeH()) {
+    VIXL_ASSERT(CPUHas(CPUFeatures::kNEONHalf));
+    NEONFP2RegMiscFP16(vd, vn, NEON_FCMGT_H_zero, value);
+  } else {
+    NEONFP2RegMisc(vd, vn, NEON_FCMGT_zero, value);
+  }
 }
 
 
 void Assembler::fcmle(const VRegister& vd, const VRegister& vn, double value) {
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP, CPUFeatures::kNEON));
-  NEONFP2RegMisc(vd, vn, NEON_FCMLE_zero, value);
+  if (vd.IsLaneSizeH()) {
+    VIXL_ASSERT(CPUHas(CPUFeatures::kNEONHalf));
+    NEONFP2RegMiscFP16(vd, vn, NEON_FCMLE_H_zero, value);
+  } else {
+    NEONFP2RegMisc(vd, vn, NEON_FCMLE_zero, value);
+  }
 }
 
 
 void Assembler::fcmlt(const VRegister& vd, const VRegister& vn, double value) {
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP, CPUFeatures::kNEON));
-  NEONFP2RegMisc(vd, vn, NEON_FCMLT_zero, value);
+  if (vd.IsLaneSizeH()) {
+    VIXL_ASSERT(CPUHas(CPUFeatures::kNEONHalf));
+    NEONFP2RegMiscFP16(vd, vn, NEON_FCMLT_H_zero, value);
+  } else {
+    NEONFP2RegMisc(vd, vn, NEON_FCMLT_zero, value);
+  }
 }
 
 
@@ -2925,8 +3401,15 @@
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP, CPUFeatures::kNEON));
   VIXL_ASSERT(vd.IsScalar());
   VIXL_ASSERT(AreSameFormat(vd, vn));
-  VIXL_ASSERT(vd.Is1S() || vd.Is1D());
-  Emit(FPFormat(vd) | NEON_FRECPX_scalar | Rn(vn) | Rd(vd));
+  Instr op;
+  if (vd.Is1H()) {
+    VIXL_ASSERT(CPUHas(CPUFeatures::kNEONHalf));
+    op = NEON_FRECPX_H_scalar;
+  } else {
+    VIXL_ASSERT(vd.Is1S() || vd.Is1D());
+    op = NEON_FRECPX_scalar;
+  }
+  Emit(FPFormat(vd) | op | Rn(vn) | Rd(vd));
 }
 
 
@@ -2999,52 +3482,75 @@
 #undef DEFINE_ASM_FUNC
 
 // clang-format off
-#define NEON_FP3SAME_OP_LIST(V)                  \
-  V(fadd,    NEON_FADD,    FADD)                 \
-  V(fsub,    NEON_FSUB,    FSUB)                 \
-  V(fmul,    NEON_FMUL,    FMUL)                 \
-  V(fdiv,    NEON_FDIV,    FDIV)                 \
-  V(fmax,    NEON_FMAX,    FMAX)                 \
-  V(fmaxnm,  NEON_FMAXNM,  FMAXNM)               \
-  V(fmin,    NEON_FMIN,    FMIN)                 \
-  V(fminnm,  NEON_FMINNM,  FMINNM)               \
-  V(fmulx,   NEON_FMULX,   NEON_FMULX_scalar)    \
-  V(frecps,  NEON_FRECPS,  NEON_FRECPS_scalar)   \
-  V(frsqrts, NEON_FRSQRTS, NEON_FRSQRTS_scalar)  \
-  V(fabd,    NEON_FABD,    NEON_FABD_scalar)     \
-  V(fmla,    NEON_FMLA,    0)                    \
-  V(fmls,    NEON_FMLS,    0)                    \
-  V(facge,   NEON_FACGE,   NEON_FACGE_scalar)    \
-  V(facgt,   NEON_FACGT,   NEON_FACGT_scalar)    \
-  V(fcmeq,   NEON_FCMEQ,   NEON_FCMEQ_scalar)    \
-  V(fcmge,   NEON_FCMGE,   NEON_FCMGE_scalar)    \
-  V(fcmgt,   NEON_FCMGT,   NEON_FCMGT_scalar)    \
-  V(faddp,   NEON_FADDP,   0)                    \
-  V(fmaxp,   NEON_FMAXP,   0)                    \
-  V(fminp,   NEON_FMINP,   0)                    \
-  V(fmaxnmp, NEON_FMAXNMP, 0)                    \
-  V(fminnmp, NEON_FMINNMP, 0)
+#define NEON_FP3SAME_OP_LIST(V)                                        \
+  V(fmulx,   NEON_FMULX,   NEON_FMULX_scalar,   NEON_FMULX_H_scalar)   \
+  V(frecps,  NEON_FRECPS,  NEON_FRECPS_scalar,  NEON_FRECPS_H_scalar)  \
+  V(frsqrts, NEON_FRSQRTS, NEON_FRSQRTS_scalar, NEON_FRSQRTS_H_scalar) \
+  V(fabd,    NEON_FABD,    NEON_FABD_scalar,    NEON_FABD_H_scalar)    \
+  V(fmla,    NEON_FMLA,    0,                   0)                     \
+  V(fmls,    NEON_FMLS,    0,                   0)                     \
+  V(facge,   NEON_FACGE,   NEON_FACGE_scalar,   NEON_FACGE_H_scalar)   \
+  V(facgt,   NEON_FACGT,   NEON_FACGT_scalar,   NEON_FACGT_H_scalar)   \
+  V(fcmeq,   NEON_FCMEQ,   NEON_FCMEQ_scalar,   NEON_FCMEQ_H_scalar)   \
+  V(fcmge,   NEON_FCMGE,   NEON_FCMGE_scalar,   NEON_FCMGE_H_scalar)   \
+  V(fcmgt,   NEON_FCMGT,   NEON_FCMGT_scalar,   NEON_FCMGT_H_scalar)   \
+  V(faddp,   NEON_FADDP,   0,                   0)                     \
+  V(fmaxp,   NEON_FMAXP,   0,                   0)                     \
+  V(fminp,   NEON_FMINP,   0,                   0)                     \
+  V(fmaxnmp, NEON_FMAXNMP, 0,                   0)                     \
+  V(fadd,    NEON_FADD,    FADD,                0)                     \
+  V(fsub,    NEON_FSUB,    FSUB,                0)                     \
+  V(fmul,    NEON_FMUL,    FMUL,                0)                     \
+  V(fdiv,    NEON_FDIV,    FDIV,                0)                     \
+  V(fmax,    NEON_FMAX,    FMAX,                0)                     \
+  V(fmin,    NEON_FMIN,    FMIN,                0)                     \
+  V(fmaxnm,  NEON_FMAXNM,  FMAXNM,              0)                     \
+  V(fminnm,  NEON_FMINNM,  FMINNM,              0)                     \
+  V(fminnmp, NEON_FMINNMP, 0,                   0)
 // clang-format on
 
-#define DEFINE_ASM_FUNC(FN, VEC_OP, SCA_OP)                          \
-  void Assembler::FN(const VRegister& vd,                            \
-                     const VRegister& vn,                            \
-                     const VRegister& vm) {                          \
-    VIXL_ASSERT(CPUHas(CPUFeatures::kFP));                           \
-    Instr op;                                                        \
-    if ((SCA_OP != 0) && vd.IsScalar()) {                            \
-      if ((SCA_OP & NEONScalar3SameFMask) == NEONScalar3SameFixed) { \
-        VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));                     \
-      }                                                              \
-      VIXL_ASSERT(vd.Is1S() || vd.Is1D());                           \
-      op = SCA_OP;                                                   \
-    } else {                                                         \
-      VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));                       \
-      VIXL_ASSERT(vd.IsVector());                                    \
-      VIXL_ASSERT(vd.Is2S() || vd.Is2D() || vd.Is4S());              \
-      op = VEC_OP;                                                   \
-    }                                                                \
-    NEONFP3Same(vd, vn, vm, op);                                     \
+// TODO: This macro is complicated because it classifies the instructions in the
+// macro list above, and treats each case differently. It could be somewhat
+// simpler if we were to split the macro, at the cost of some duplication.
+#define DEFINE_ASM_FUNC(FN, VEC_OP, SCA_OP, SCA_OP_H)                    \
+  void Assembler::FN(const VRegister& vd,                                \
+                     const VRegister& vn,                                \
+                     const VRegister& vm) {                              \
+    VIXL_ASSERT(CPUHas(CPUFeatures::kFP));                               \
+    Instr op;                                                            \
+    bool is_fp16 = false;                                                \
+    if ((SCA_OP != 0) && vd.IsScalar()) {                                \
+      if ((SCA_OP_H != 0) && vd.Is1H()) {                                \
+        VIXL_ASSERT(CPUHas(CPUFeatures::kNEON, CPUFeatures::kNEONHalf)); \
+        is_fp16 = true;                                                  \
+        op = SCA_OP_H;                                                   \
+      } else {                                                           \
+        VIXL_ASSERT(vd.Is1H() || vd.Is1S() || vd.Is1D());                \
+        if ((SCA_OP & NEONScalar3SameFMask) == NEONScalar3SameFixed) {   \
+          VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));                       \
+          if (vd.Is1H()) VIXL_ASSERT(CPUHas(CPUFeatures::kNEONHalf));    \
+        } else if (vd.Is1H()) {                                          \
+          VIXL_ASSERT(CPUHas(CPUFeatures::kFPHalf));                     \
+        }                                                                \
+        op = SCA_OP;                                                     \
+      }                                                                  \
+    } else {                                                             \
+      VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));                           \
+      VIXL_ASSERT(vd.IsVector());                                        \
+      if (vd.Is4H() || vd.Is8H()) {                                      \
+        VIXL_ASSERT(CPUHas(CPUFeatures::kNEONHalf));                     \
+        is_fp16 = true;                                                  \
+        op = VEC_OP##_H;                                                 \
+      } else {                                                           \
+        VIXL_ASSERT(vd.Is2S() || vd.Is2D() || vd.Is4S());                \
+        op = VEC_OP;                                                     \
+      }                                                                  \
+    }                                                                    \
+    if (is_fp16) {                                                       \
+      NEON3SameFP16(vd, vn, vm, op);                                     \
+    } else {                                                             \
+      NEONFP3Same(vd, vn, vm, op);                                       \
+    }                                                                    \
   }
 NEON_FP3SAME_OP_LIST(DEFINE_ASM_FUNC)
 #undef DEFINE_ASM_FUNC
@@ -3119,36 +3625,66 @@
 
 void Assembler::faddp(const VRegister& vd, const VRegister& vn) {
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP, CPUFeatures::kNEON));
-  VIXL_ASSERT((vd.Is1S() && vn.Is2S()) || (vd.Is1D() && vn.Is2D()));
-  Emit(FPFormat(vd) | NEON_FADDP_scalar | Rn(vn) | Rd(vd));
+  VIXL_ASSERT((vd.Is1S() && vn.Is2S()) || (vd.Is1D() && vn.Is2D()) ||
+              (vd.Is1H() && vn.Is2H()));
+  if (vd.Is1H()) {
+    VIXL_ASSERT(CPUHas(CPUFeatures::kNEONHalf));
+    Emit(NEON_FADDP_h_scalar | Rn(vn) | Rd(vd));
+  } else {
+    Emit(FPFormat(vd) | NEON_FADDP_scalar | Rn(vn) | Rd(vd));
+  }
 }
 
 
 void Assembler::fmaxp(const VRegister& vd, const VRegister& vn) {
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP, CPUFeatures::kNEON));
-  VIXL_ASSERT((vd.Is1S() && vn.Is2S()) || (vd.Is1D() && vn.Is2D()));
-  Emit(FPFormat(vd) | NEON_FMAXP_scalar | Rn(vn) | Rd(vd));
+  VIXL_ASSERT((vd.Is1S() && vn.Is2S()) || (vd.Is1D() && vn.Is2D()) ||
+              (vd.Is1H() && vn.Is2H()));
+  if (vd.Is1H()) {
+    VIXL_ASSERT(CPUHas(CPUFeatures::kNEONHalf));
+    Emit(NEON_FMAXP_h_scalar | Rn(vn) | Rd(vd));
+  } else {
+    Emit(FPFormat(vd) | NEON_FMAXP_scalar | Rn(vn) | Rd(vd));
+  }
 }
 
 
 void Assembler::fminp(const VRegister& vd, const VRegister& vn) {
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP, CPUFeatures::kNEON));
-  VIXL_ASSERT((vd.Is1S() && vn.Is2S()) || (vd.Is1D() && vn.Is2D()));
-  Emit(FPFormat(vd) | NEON_FMINP_scalar | Rn(vn) | Rd(vd));
+  VIXL_ASSERT((vd.Is1S() && vn.Is2S()) || (vd.Is1D() && vn.Is2D()) ||
+              (vd.Is1H() && vn.Is2H()));
+  if (vd.Is1H()) {
+    VIXL_ASSERT(CPUHas(CPUFeatures::kNEONHalf));
+    Emit(NEON_FMINP_h_scalar | Rn(vn) | Rd(vd));
+  } else {
+    Emit(FPFormat(vd) | NEON_FMINP_scalar | Rn(vn) | Rd(vd));
+  }
 }
 
 
 void Assembler::fmaxnmp(const VRegister& vd, const VRegister& vn) {
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP, CPUFeatures::kNEON));
-  VIXL_ASSERT((vd.Is1S() && vn.Is2S()) || (vd.Is1D() && vn.Is2D()));
-  Emit(FPFormat(vd) | NEON_FMAXNMP_scalar | Rn(vn) | Rd(vd));
+  VIXL_ASSERT((vd.Is1S() && vn.Is2S()) || (vd.Is1D() && vn.Is2D()) ||
+              (vd.Is1H() && vn.Is2H()));
+  if (vd.Is1H()) {
+    VIXL_ASSERT(CPUHas(CPUFeatures::kNEONHalf));
+    Emit(NEON_FMAXNMP_h_scalar | Rn(vn) | Rd(vd));
+  } else {
+    Emit(FPFormat(vd) | NEON_FMAXNMP_scalar | Rn(vn) | Rd(vd));
+  }
 }
 
 
 void Assembler::fminnmp(const VRegister& vd, const VRegister& vn) {
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP, CPUFeatures::kNEON));
-  VIXL_ASSERT((vd.Is1S() && vn.Is2S()) || (vd.Is1D() && vn.Is2D()));
-  Emit(FPFormat(vd) | NEON_FMINNMP_scalar | Rn(vn) | Rd(vd));
+  VIXL_ASSERT((vd.Is1S() && vn.Is2S()) || (vd.Is1D() && vn.Is2D()) ||
+              (vd.Is1H() && vn.Is2H()));
+  if (vd.Is1H()) {
+    VIXL_ASSERT(CPUHas(CPUFeatures::kNEONHalf));
+    Emit(NEON_FMINNMP_h_scalar | Rn(vn) | Rd(vd));
+  } else {
+    Emit(FPFormat(vd) | NEON_FMINNMP_scalar | Rn(vn) | Rd(vd));
+  }
 }
 
 
@@ -3162,6 +3698,7 @@
   VIXL_ASSERT(vd.IsVector() && AreSameFormat(vd, vn));
   VIXL_ASSERT((vm.IsH() && (vd.Is8H() || vd.Is4H())) ||
               (vm.IsS() && vd.Is4S()));
+  if (vd.IsLaneSizeH()) VIXL_ASSERT(CPUHas(CPUFeatures::kNEONHalf));
   int index_num_bits = vd.Is4S() ? 1 : 2;
   Emit(VFormat(vd) | Rm(vm) | NEON_FCMLA_byelement |
        ImmNEONHLM(vm_index, index_num_bits) | ImmRotFcmlaSca(rot) | Rn(vn) |
@@ -3176,6 +3713,7 @@
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP, CPUFeatures::kNEON, CPUFeatures::kFcma));
   VIXL_ASSERT(AreSameFormat(vd, vn, vm));
   VIXL_ASSERT(vd.IsVector() && !vd.IsLaneSizeB());
+  if (vd.IsLaneSizeH()) VIXL_ASSERT(CPUHas(CPUFeatures::kNEONHalf));
   Emit(VFormat(vd) | Rm(vm) | NEON_FCMLA | ImmRotFcmlaVec(rot) | Rn(vn) |
        Rd(vd));
 }
@@ -3189,6 +3727,7 @@
   VIXL_ASSERT(CPUHas(CPUFeatures::kFP, CPUFeatures::kNEON, CPUFeatures::kFcma));
   VIXL_ASSERT(AreSameFormat(vd, vn, vm));
   VIXL_ASSERT(vd.IsVector() && !vd.IsLaneSizeB());
+  if (vd.IsLaneSizeH()) VIXL_ASSERT(CPUHas(CPUFeatures::kNEONHalf));
   Emit(VFormat(vd) | Rm(vm) | NEON_FCADD | ImmRotFcadd(rot) | Rn(vn) | Rd(vd));
 }
 
@@ -3282,21 +3821,38 @@
                                 const VRegister& vn,
                                 const VRegister& vm,
                                 int vm_index,
-                                NEONByIndexedElementOp vop) {
+                                NEONByIndexedElementOp vop,
+                                NEONByIndexedElementOp vop_half) {
   VIXL_ASSERT(AreSameFormat(vd, vn));
   VIXL_ASSERT((vd.Is2S() && vm.Is1S()) || (vd.Is4S() && vm.Is1S()) ||
               (vd.Is1S() && vm.Is1S()) || (vd.Is2D() && vm.Is1D()) ||
-              (vd.Is1D() && vm.Is1D()));
-  VIXL_ASSERT((vm.Is1S() && (vm_index < 4)) || (vm.Is1D() && (vm_index < 2)));
+              (vd.Is1D() && vm.Is1D()) || (vd.Is4H() && vm.Is1H()) ||
+              (vd.Is8H() && vm.Is1H()) || (vd.Is1H() && vm.Is1H()));
+  VIXL_ASSERT((vm.Is1S() && (vm_index < 4)) || (vm.Is1D() && (vm_index < 2)) ||
+              (vm.Is1H() && (vm.GetCode() < 16) && (vm_index < 8)));
 
   Instr op = vop;
-  int index_num_bits = vm.Is1S() ? 2 : 1;
+  int index_num_bits;
+  if (vm.Is1D()) {
+    index_num_bits = 1;
+  } else if (vm.Is1S()) {
+    index_num_bits = 2;
+  } else {
+    index_num_bits = 3;
+    op = vop_half;
+  }
+
   if (vd.IsScalar()) {
     op |= NEON_Q | NEONScalar;
   }
 
-  Emit(FPFormat(vd) | op | ImmNEONHLM(vm_index, index_num_bits) | Rm(vm) |
-       Rn(vn) | Rd(vd));
+  if (!vm.Is1H()) {
+    op |= FPFormat(vd);
+  } else if (vd.Is8H()) {
+    op |= NEON_Q;
+  }
+
+  Emit(op | ImmNEONHLM(vm_index, index_num_bits) | Rm(vm) | Rn(vn) | Rd(vd));
 }
 
 
@@ -3423,19 +3979,20 @@
 
 // clang-format off
 #define NEON_FPBYELEMENT_LIST(V) \
-  V(fmul,  NEON_FMUL_byelement)  \
-  V(fmla,  NEON_FMLA_byelement)  \
-  V(fmls,  NEON_FMLS_byelement)  \
-  V(fmulx, NEON_FMULX_byelement)
+  V(fmul,  NEON_FMUL_byelement,  NEON_FMUL_H_byelement)  \
+  V(fmla,  NEON_FMLA_byelement,  NEON_FMLA_H_byelement)  \
+  V(fmls,  NEON_FMLS_byelement,  NEON_FMLS_H_byelement)  \
+  V(fmulx, NEON_FMULX_byelement, NEON_FMULX_H_byelement)
 // clang-format on
 
-#define DEFINE_ASM_FUNC(FN, OP)                                \
-  void Assembler::FN(const VRegister& vd,                      \
-                     const VRegister& vn,                      \
-                     const VRegister& vm,                      \
-                     int vm_index) {                           \
-    VIXL_ASSERT(CPUHas(CPUFeatures::kFP, CPUFeatures::kNEON)); \
-    NEONFPByElement(vd, vn, vm, vm_index, OP);                 \
+#define DEFINE_ASM_FUNC(FN, OP, OP_H)                                  \
+  void Assembler::FN(const VRegister& vd,                              \
+                     const VRegister& vn,                              \
+                     const VRegister& vm,                              \
+                     int vm_index) {                                   \
+    VIXL_ASSERT(CPUHas(CPUFeatures::kFP, CPUFeatures::kNEON));         \
+    if (vd.IsLaneSizeH()) VIXL_ASSERT(CPUHas(CPUFeatures::kNEONHalf)); \
+    NEONFPByElement(vd, vn, vm, vm_index, OP, OP_H);                   \
   }
 NEON_FPBYELEMENT_LIST(DEFINE_ASM_FUNC)
 #undef DEFINE_ASM_FUNC
@@ -3955,46 +4512,59 @@
 
 void Assembler::NEONAcrossLanes(const VRegister& vd,
                                 const VRegister& vn,
-                                NEONAcrossLanesOp op) {
+                                NEONAcrossLanesOp op,
+                                Instr op_half) {
   VIXL_ASSERT((vn.Is8B() && vd.Is1B()) || (vn.Is16B() && vd.Is1B()) ||
               (vn.Is4H() && vd.Is1H()) || (vn.Is8H() && vd.Is1H()) ||
               (vn.Is4S() && vd.Is1S()));
   if ((op & NEONAcrossLanesFPFMask) == NEONAcrossLanesFPFixed) {
-    Emit(FPFormat(vn) | op | Rn(vn) | Rd(vd));
+    if (vd.Is1H()) {
+      VIXL_ASSERT(op_half != 0);
+      Instr vop = op_half;
+      if (vn.Is8H()) {
+        vop |= NEON_Q;
+      }
+      Emit(vop | Rn(vn) | Rd(vd));
+    } else {
+      Emit(FPFormat(vn) | op | Rn(vn) | Rd(vd));
+    }
   } else {
     Emit(VFormat(vn) | op | Rn(vn) | Rd(vd));
   }
 }
 
+// clang-format off
+#define NEON_ACROSSLANES_LIST(V)           \
+  V(addv,    NEON_ADDV)                    \
+  V(smaxv,   NEON_SMAXV)                   \
+  V(sminv,   NEON_SMINV)                   \
+  V(umaxv,   NEON_UMAXV)                   \
+  V(uminv,   NEON_UMINV)
+// clang-format on
 
-#define NEON_ACROSSLANES_LIST(V) \
-  V(addv, NEON_ADDV, true)       \
-  V(smaxv, NEON_SMAXV, true)     \
-  V(sminv, NEON_SMINV, true)     \
-  V(umaxv, NEON_UMAXV, true)     \
-  V(uminv, NEON_UMINV, true)
-
-#define DEFINE_ASM_FUNC(FN, OP, AS)                              \
+#define DEFINE_ASM_FUNC(FN, OP)                                  \
   void Assembler::FN(const VRegister& vd, const VRegister& vn) { \
     VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));                     \
-    VIXL_ASSERT(AS);                                             \
-    NEONAcrossLanes(vd, vn, OP);                                 \
+    NEONAcrossLanes(vd, vn, OP, 0);                              \
   }
 NEON_ACROSSLANES_LIST(DEFINE_ASM_FUNC)
 #undef DEFINE_ASM_FUNC
 
 
+// clang-format off
 #define NEON_ACROSSLANES_FP_LIST(V)   \
-  V(fmaxv, NEON_FMAXV, vd.Is1S())     \
-  V(fminv, NEON_FMINV, vd.Is1S())     \
-  V(fmaxnmv, NEON_FMAXNMV, vd.Is1S()) \
-  V(fminnmv, NEON_FMINNMV, vd.Is1S())
+  V(fmaxv,   NEON_FMAXV,   NEON_FMAXV_H) \
+  V(fminv,   NEON_FMINV,   NEON_FMINV_H) \
+  V(fmaxnmv, NEON_FMAXNMV, NEON_FMAXNMV_H) \
+  V(fminnmv, NEON_FMINNMV, NEON_FMINNMV_H)
+// clang-format on
 
-#define DEFINE_ASM_FUNC(FN, OP, AS)                              \
+#define DEFINE_ASM_FUNC(FN, OP, OP_H)                            \
   void Assembler::FN(const VRegister& vd, const VRegister& vn) { \
     VIXL_ASSERT(CPUHas(CPUFeatures::kFP, CPUFeatures::kNEON));   \
-    VIXL_ASSERT(AS);                                             \
-    NEONAcrossLanes(vd, vn, OP);                                 \
+    if (vd.Is1H()) VIXL_ASSERT(CPUHas(CPUFeatures::kNEONHalf));  \
+    VIXL_ASSERT(vd.Is1S() || vd.Is1H());                         \
+    NEONAcrossLanes(vd, vn, OP, OP_H);                           \
   }
 NEON_ACROSSLANES_FP_LIST(DEFINE_ASM_FUNC)
 #undef DEFINE_ASM_FUNC
@@ -4400,7 +4970,7 @@
 // For all ToImm instructions below, a difference in case
 // for the same letter indicates a negated bit.
 // If b is 1, then B is 0.
-uint32_t Assembler::FP16ToImm8(float16 imm) {
+uint32_t Assembler::FP16ToImm8(Float16 imm) {
   VIXL_ASSERT(IsImmFP16(imm));
   // Half: aBbb.cdef.gh00.0000 (16 bits)
   uint16_t bits = Float16ToRawbits(imm);
@@ -4415,7 +4985,7 @@
 }
 
 
-Instr Assembler::ImmFP16(float16 imm) {
+Instr Assembler::ImmFP16(Float16 imm) {
   return FP16ToImm8(imm) << ImmFP_offset;
 }
 
@@ -4663,7 +5233,7 @@
                                         const VRegister& vm,
                                         const VRegister& va,
                                         FPDataProcessing3SourceOp op) {
-  VIXL_ASSERT(vd.Is1S() || vd.Is1D());
+  VIXL_ASSERT(vd.Is1H() || vd.Is1S() || vd.Is1D());
   VIXL_ASSERT(AreSameSizeAndType(vd, vn, vm, va));
   Emit(FPType(vd) | op | Rm(vm) | Rn(vn) | Rd(vd) | Ra(va));
 }
@@ -4905,7 +5475,7 @@
 }
 
 
-bool Assembler::IsImmFP16(float16 imm) {
+bool Assembler::IsImmFP16(Float16 imm) {
   // Valid values will have the form:
   // aBbb.cdef.gh00.000
   uint16_t bits = Float16ToRawbits(imm);
@@ -5379,7 +5949,7 @@
 
   const CPURegister regs[] = {reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8};
 
-  for (unsigned i = 0; i < sizeof(regs) / sizeof(regs[0]); i++) {
+  for (size_t i = 0; i < ArrayLength(regs); i++) {
     if (regs[i].IsRegister()) {
       number_of_valid_regs++;
       unique_regs |= regs[i].GetBit();
diff --git a/src/aarch64/assembler-aarch64.h b/src/aarch64/assembler-aarch64.h
index 54256fb..7d95466 100644
--- a/src/aarch64/assembler-aarch64.h
+++ b/src/aarch64/assembler-aarch64.h
@@ -34,7 +34,6 @@
 #include "../invalset-vixl.h"
 #include "../utils-vixl.h"
 #include "operands-aarch64.h"
-#include "utils-aarch64.h"
 
 namespace vixl {
 namespace aarch64 {
@@ -496,6 +495,42 @@
   // Branch to register with return hint.
   void ret(const Register& xn = lr);
 
+  // Branch to register, with pointer authentication. Using key A and a modifier
+  // of zero [Armv8.3].
+  void braaz(const Register& xn);
+
+  // Branch to register, with pointer authentication. Using key B and a modifier
+  // of zero [Armv8.3].
+  void brabz(const Register& xn);
+
+  // Branch with link to register, with pointer authentication, using key A
+  // and a modifier of zero [Armv8.3].
+  void blraaz(const Register& xn);
+
+  // Branch with link to register, with pointer authentication, using key B
+  // and a modifier of zero [Armv8.3].
+  void blrabz(const Register& xn);
+
+  // Return from subroutine, with pointer authentication, using key A [Armv8.3].
+  void retaa();
+
+  // Return from subroutine, with pointer authentication, using key B [Armv8.3].
+  void retab();
+
+  // Branch to register, with pointer authentication, using key A [Armv8.3].
+  void braa(const Register& xn, const Register& xm);
+
+  // Branch to register, with pointer authentication, using key B [Armv8.3].
+  void brab(const Register& xn, const Register& xm);
+
+  // Branch with link to register, with pointer authentication, using key A
+  // [Armv8.3].
+  void blraa(const Register& xn, const Register& xm);
+
+  // Branch with link to register, with pointer authentication, using key B
+  // [Armv8.3].
+  void blrab(const Register& xn, const Register& xm);
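+
+  // A minimal usage sketch of the authenticated branches above (illustrative
+  // only; it assumes an Assembler `masm` with CPUFeatures::kPAuth enabled and
+  // a signed code pointer in x16):
+  //   masm.blraaz(x16);  // Authenticated call, key A, zero modifier.
+  //   masm.retaa();      // Authenticate LR with key A and return.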
+
   // Unconditional branch to label.
   void b(Label* label);
 
@@ -728,6 +763,11 @@
     bfm(rd, rn, lsb, lsb + width - 1);
   }
 
+  // Bitfield clear [Armv8.2].
+  void bfc(const Register& rd, unsigned lsb, unsigned width) {
+    bfi(rd, AppropriateZeroRegFor(rd), lsb, width);
+  }
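+  // (For example, `bfc(x0, 8, 16)` clears bits [23:8] of x0 and leaves the
+  // remaining bits unchanged, exactly as `bfi(x0, xzr, 8, 16)` would.)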
+
   // Sbfm aliases.
   // Arithmetic shift right.
   void asr(const Register& rd, const Register& rn, unsigned shift) {
@@ -971,6 +1011,13 @@
   // Reverse bytes in 32-bit words.
   void rev32(const Register& xd, const Register& xn);
 
+  // Reverse bytes in 64-bit general purpose register, an alias for rev
+  // [Armv8.2].
+  void rev64(const Register& xd, const Register& xn) {
+    VIXL_ASSERT(xd.Is64Bits() && xn.Is64Bits());
+    rev(xd, xn);
+  }
+
   // Reverse bytes.
   void rev(const Register& rd, const Register& rn);
 
@@ -980,6 +1027,168 @@
   // Count leading sign bits.
   void cls(const Register& rd, const Register& rn);
 
+  // Pointer Authentication Code for Instruction address, using key A [Armv8.3].
+  void pacia(const Register& xd, const Register& xn);
+
+  // Pointer Authentication Code for Instruction address, using key A and a
+  // modifier of zero [Armv8.3].
+  void paciza(const Register& xd);
+
+  // Pointer Authentication Code for Instruction address, using key A, with
+  // address in x17 and modifier in x16 [Armv8.3].
+  void pacia1716();
+
+  // Pointer Authentication Code for Instruction address, using key A, with
+  // address in LR and modifier in SP [Armv8.3].
+  void paciasp();
+
+  // Pointer Authentication Code for Instruction address, using key A, with
+  // address in LR and a modifier of zero [Armv8.3].
+  void paciaz();
+
+  // Pointer Authentication Code for Instruction address, using key B [Armv8.3].
+  void pacib(const Register& xd, const Register& xn);
+
+  // Pointer Authentication Code for Instruction address, using key B and a
+  // modifier of zero [Armv8.3].
+  void pacizb(const Register& xd);
+
+  // Pointer Authentication Code for Instruction address, using key B, with
+  // address in x17 and modifier in x16 [Armv8.3].
+  void pacib1716();
+
+  // Pointer Authentication Code for Instruction address, using key B, with
+  // address in LR and modifier in SP [Armv8.3].
+  void pacibsp();
+
+  // Pointer Authentication Code for Instruction address, using key B, with
+  // address in LR and a modifier of zero [Armv8.3].
+  void pacibz();
+
+  // Pointer Authentication Code for Data address, using key A [Armv8.3].
+  void pacda(const Register& xd, const Register& xn);
+
+  // Pointer Authentication Code for Data address, using key A and a modifier of
+  // zero [Armv8.3].
+  void pacdza(const Register& xd);
+
+  // Pointer Authentication Code for Data address, using key A, with address in
+  // x17 and modifier in x16 [Armv8.3].
+  void pacda1716();
+
+  // Pointer Authentication Code for Data address, using key A, with address in
+  // LR and modifier in SP [Armv8.3].
+  void pacdasp();
+
+  // Pointer Authentication Code for Data address, using key A, with address in
+  // LR and a modifier of zero [Armv8.3].
+  void pacdaz();
+
+  // Pointer Authentication Code for Data address, using key B [Armv8.3].
+  void pacdb(const Register& xd, const Register& xn);
+
+  // Pointer Authentication Code for Data address, using key B and a modifier of
+  // zero [Armv8.3].
+  void pacdzb(const Register& xd);
+
+  // Pointer Authentication Code for Data address, using key B, with address in
+  // x17 and modifier in x16 [Armv8.3].
+  void pacdb1716();
+
+  // Pointer Authentication Code for Data address, using key B, with address in
+  // LR and modifier in SP [Armv8.3].
+  void pacdbsp();
+
+  // Pointer Authentication Code for Data address, using key B, with address in
+  // LR and a modifier of zero [Armv8.3].
+  void pacdbz();
+
+  // Pointer Authentication Code, using Generic key [Armv8.3].
+  void pacga(const Register& xd, const Register& xn, const Register& xm);
+
+  // Authenticate Instruction address, using key A [Armv8.3].
+  void autia(const Register& xd, const Register& xn);
+
+  // Authenticate Instruction address, using key A and a modifier of zero
+  // [Armv8.3].
+  void autiza(const Register& xd);
+
+  // Authenticate Instruction address, using key A, with address in x17 and
+  // modifier in x16 [Armv8.3].
+  void autia1716();
+
+  // Authenticate Instruction address, using key A, with address in LR and
+  // modifier in SP [Armv8.3].
+  void autiasp();
+
+  // Authenticate Instruction address, using key A, with address in LR and a
+  // modifier of zero [Armv8.3].
+  void autiaz();
+
+  // Authenticate Instruction address, using key B [Armv8.3].
+  void autib(const Register& xd, const Register& xn);
+
+  // Authenticate Instruction address, using key B and a modifier of zero
+  // [Armv8.3].
+  void autizb(const Register& xd);
+
+  // Authenticate Instruction address, using key B, with address in x17 and
+  // modifier in x16 [Armv8.3].
+  void autib1716();
+
+  // Authenticate Instruction address, using key B, with address in LR and
+  // modifier in SP [Armv8.3].
+  void autibsp();
+
+  // Authenticate Instruction address, using key B, with address in LR and a
+  // modifier of zero [Armv8.3].
+  void autibz();
+
+  // Authenticate Data address, using key A [Armv8.3].
+  void autda(const Register& xd, const Register& xn);
+
+  // Authenticate Data address, using key A and a modifier of zero [Armv8.3].
+  void autdza(const Register& xd);
+
+  // Authenticate Data address, using key A, with address in x17 and modifier in
+  // x16 [Armv8.3].
+  void autda1716();
+
+  // Authenticate Data address, using key A, with address in LR and modifier in
+  // SP [Armv8.3].
+  void autdasp();
+
+  // Authenticate Data address, using key A, with address in LR and a modifier
+  // of zero [Armv8.3].
+  void autdaz();
+
+  // Authenticate Data address, using key B [Armv8.3].
+  void autdb(const Register& xd, const Register& xn);
+
+  // Authenticate Data address, using key B and a modifier of zero [Armv8.3].
+  void autdzb(const Register& xd);
+
+  // Authenticate Data address, using key B, with address in x17 and modifier in
+  // x16 [Armv8.3].
+  void autdb1716();
+
+  // Authenticate Data address, using key B, with address in LR and modifier in
+  // SP [Armv8.3].
+  void autdbsp();
+
+  // Authenticate Data address, using key B, with address in LR and a modifier
+  // of zero [Armv8.3].
+  void autdbz();
+
+  // Strip Pointer Authentication Code of Data address [Armv8.3].
+  void xpacd(const Register& xd);
+
+  // Strip Pointer Authentication Code of Instruction address [Armv8.3].
+  void xpaci(const Register& xd);
+
+  // Strip Pointer Authentication Code of Instruction address in LR [Armv8.3].
+  void xpaclri();
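+
+  // A minimal signing/authentication sketch (illustrative only; it assumes an
+  // Assembler `masm` with CPUFeatures::kPAuth enabled, a data pointer in x0
+  // and a context value in x1 used as the modifier):
+  //   masm.pacda(x0, x1);  // Insert a PAC into the top bits of x0.
+  //   masm.autda(x0, x1);  // Authenticate; a failed check corrupts x0 so a
+  //                        // later access faults.
+  //   masm.xpacd(x0);      // Alternatively, strip the PAC without a check.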
+
   // Memory instructions.
   // Load integer or FP register.
   void ldr(const CPURegister& rt,
@@ -1260,6 +1469,578 @@
               const Register& rt2,
               const MemOperand& src);
 
+  // Atomic add on byte in memory [Armv8.1]
+  void ldaddb(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic add on byte in memory, with Load-acquire semantics [Armv8.1]
+  void ldaddab(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic add on byte in memory, with Store-release semantics [Armv8.1]
+  void ldaddlb(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic add on byte in memory, with Load-acquire and Store-release semantics
+  // [Armv8.1]
+  void ldaddalb(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic add on halfword in memory [Armv8.1]
+  void ldaddh(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic add on halfword in memory, with Load-acquire semantics [Armv8.1]
+  void ldaddah(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic add on halfword in memory, with Store-release semantics [Armv8.1]
+  void ldaddlh(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic add on halfword in memory, with Load-acquire and Store-release
+  // semantics [Armv8.1]
+  void ldaddalh(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic add on word or doubleword in memory [Armv8.1]
+  void ldadd(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic add on word or doubleword in memory, with Load-acquire semantics
+  // [Armv8.1]
+  void ldadda(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic add on word or doubleword in memory, with Store-release semantics
+  // [Armv8.1]
+  void ldaddl(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic add on word or doubleword in memory, with Load-acquire and
+  // Store-release semantics [Armv8.1]
+  void ldaddal(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic bit clear on byte in memory [Armv8.1]
+  void ldclrb(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic bit clear on byte in memory, with Load-acquire semantics [Armv8.1]
+  void ldclrab(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic bit clear on byte in memory, with Store-release semantics [Armv8.1]
+  void ldclrlb(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic bit clear on byte in memory, with Load-acquire and Store-release
+  // semantics [Armv8.1]
+  void ldclralb(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic bit clear on halfword in memory [Armv8.1]
+  void ldclrh(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic bit clear on halfword in memory, with Load-acquire semantics
+  // [Armv8.1]
+  void ldclrah(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic bit clear on halfword in memory, with Store-release semantics
+  // [Armv8.1]
+  void ldclrlh(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic bit clear on halfword in memory, with Load-acquire and Store-release
+  // semantics [Armv8.1]
+  void ldclralh(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic bit clear on word or doubleword in memory [Armv8.1]
+  void ldclr(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic bit clear on word or doubleword in memory, with Load-acquire
+  // semantics [Armv8.1]
+  void ldclra(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic bit clear on word or doubleword in memory, with Store-release
+  // semantics [Armv8.1]
+  void ldclrl(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic bit clear on word or doubleword in memory, with Load-acquire and
+  // Store-release semantics [Armv8.1]
+  void ldclral(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic exclusive OR on byte in memory [Armv8.1]
+  void ldeorb(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic exclusive OR on byte in memory, with Load-acquire semantics
+  // [Armv8.1]
+  void ldeorab(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic exclusive OR on byte in memory, with Store-release semantics
+  // [Armv8.1]
+  void ldeorlb(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic exclusive OR on byte in memory, with Load-acquire and Store-release
+  // semantics [Armv8.1]
+  void ldeoralb(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic exclusive OR on halfword in memory [Armv8.1]
+  void ldeorh(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic exclusive OR on halfword in memory, with Load-acquire semantics
+  // [Armv8.1]
+  void ldeorah(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic exclusive OR on halfword in memory, with Store-release semantics
+  // [Armv8.1]
+  void ldeorlh(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic exclusive OR on halfword in memory, with Load-acquire and
+  // Store-release semantics [Armv8.1]
+  void ldeoralh(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic exclusive OR on word or doubleword in memory [Armv8.1]
+  void ldeor(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic exclusive OR on word or doubleword in memory, with Load-acquire
+  // semantics [Armv8.1]
+  void ldeora(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic exclusive OR on word or doubleword in memory, with Store-release
+  // semantics [Armv8.1]
+  void ldeorl(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic exclusive OR on word or doubleword in memory, with Load-acquire and
+  // Store-release semantics [Armv8.1]
+  void ldeoral(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic bit set on byte in memory [Armv8.1]
+  void ldsetb(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic bit set on byte in memory, with Load-acquire semantics [Armv8.1]
+  void ldsetab(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic bit set on byte in memory, with Store-release semantics [Armv8.1]
+  void ldsetlb(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic bit set on byte in memory, with Load-acquire and Store-release
+  // semantics [Armv8.1]
+  void ldsetalb(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic bit set on halfword in memory [Armv8.1]
+  void ldseth(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic bit set on halfword in memory, with Load-acquire semantics [Armv8.1]
+  void ldsetah(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic bit set on halfword in memory, with Store-release semantics
+  // [Armv8.1]
+  void ldsetlh(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic bit set on halfword in memory, with Load-acquire and Store-release
+  // semantics [Armv8.1]
+  void ldsetalh(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic bit set on word or doubleword in memory [Armv8.1]
+  void ldset(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic bit set on word or doubleword in memory, with Load-acquire semantics
+  // [Armv8.1]
+  void ldseta(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic bit set on word or doubleword in memory, with Store-release
+  // semantics [Armv8.1]
+  void ldsetl(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic bit set on word or doubleword in memory, with Load-acquire and
+  // Store-release semantics [Armv8.1]
+  void ldsetal(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic signed maximum on byte in memory [Armv8.1]
+  void ldsmaxb(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic signed maximum on byte in memory, with Load-acquire semantics
+  // [Armv8.1]
+  void ldsmaxab(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic signed maximum on byte in memory, with Store-release semantics
+  // [Armv8.1]
+  void ldsmaxlb(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic signed maximum on byte in memory, with Load-acquire and
+  // Store-release semantics [Armv8.1]
+  void ldsmaxalb(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic signed maximum on halfword in memory [Armv8.1]
+  void ldsmaxh(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic signed maximum on halfword in memory, with Load-acquire semantics
+  // [Armv8.1]
+  void ldsmaxah(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic signed maximum on halfword in memory, with Store-release semantics
+  // [Armv8.1]
+  void ldsmaxlh(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic signed maximum on halfword in memory, with Load-acquire and
+  // Store-release semantics [Armv8.1]
+  void ldsmaxalh(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic signed maximum on word or doubleword in memory [Armv8.1]
+  void ldsmax(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic signed maximum on word or doubleword in memory, with Load-acquire
+  // semantics [Armv8.1]
+  void ldsmaxa(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic signed maximum on word or doubleword in memory, with Store-release
+  // semantics [Armv8.1]
+  void ldsmaxl(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic signed maximum on word or doubleword in memory, with Load-acquire
+  // and Store-release semantics [Armv8.1]
+  void ldsmaxal(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic signed minimum on byte in memory [Armv8.1]
+  void ldsminb(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic signed minimum on byte in memory, with Load-acquire semantics
+  // [Armv8.1]
+  void ldsminab(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic signed minimum on byte in memory, with Store-release semantics
+  // [Armv8.1]
+  void ldsminlb(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic signed minimum on byte in memory, with Load-acquire and
+  // Store-release semantics [Armv8.1]
+  void ldsminalb(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic signed minimum on halfword in memory [Armv8.1]
+  void ldsminh(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic signed minimum on halfword in memory, with Load-acquire semantics
+  // [Armv8.1]
+  void ldsminah(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic signed minimum on halfword in memory, with Store-release semantics
+  // [Armv8.1]
+  void ldsminlh(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic signed minimum on halfword in memory, with Load-acquire and
+  // Store-release semantics [Armv8.1]
+  void ldsminalh(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic signed minimum on word or doubleword in memory [Armv8.1]
+  void ldsmin(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic signed minimum on word or doubleword in memory, with Load-acquire
+  // semantics [Armv8.1]
+  void ldsmina(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic signed minimum on word or doubleword in memory, with Store-release
+  // semantics [Armv8.1]
+  void ldsminl(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic signed minimum on word or doubleword in memory, with Load-acquire
+  // and Store-release semantics [Armv8.1]
+  void ldsminal(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic unsigned maximum on byte in memory [Armv8.1]
+  void ldumaxb(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic unsigned maximum on byte in memory, with Load-acquire semantics
+  // [Armv8.1]
+  void ldumaxab(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic unsigned maximum on byte in memory, with Store-release semantics
+  // [Armv8.1]
+  void ldumaxlb(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic unsigned maximum on byte in memory, with Load-acquire and
+  // Store-release semantics [Armv8.1]
+  void ldumaxalb(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic unsigned maximum on halfword in memory [Armv8.1]
+  void ldumaxh(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic unsigned maximum on halfword in memory, with Load-acquire semantics
+  // [Armv8.1]
+  void ldumaxah(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic unsigned maximum on halfword in memory, with Store-release semantics
+  // [Armv8.1]
+  void ldumaxlh(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic unsigned maximum on halfword in memory, with Load-acquire and
+  // Store-release semantics [Armv8.1]
+  void ldumaxalh(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic unsigned maximum on word or doubleword in memory [Armv8.1]
+  void ldumax(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic unsigned maximum on word or doubleword in memory, with Load-acquire
+  // semantics [Armv8.1]
+  void ldumaxa(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic unsigned maximum on word or doubleword in memory, with Store-release
+  // semantics [Armv8.1]
+  void ldumaxl(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic unsigned maximum on word or doubleword in memory, with Load-acquire
+  // and Store-release semantics [Armv8.1]
+  void ldumaxal(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic unsigned minimum on byte in memory [Armv8.1]
+  void lduminb(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic unsigned minimum on byte in memory, with Load-acquire semantics
+  // [Armv8.1]
+  void lduminab(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic unsigned minimum on byte in memory, with Store-release semantics
+  // [Armv8.1]
+  void lduminlb(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic unsigned minimum on byte in memory, with Load-acquire and
+  // Store-release semantics [Armv8.1]
+  void lduminalb(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic unsigned minimum on halfword in memory [Armv8.1]
+  void lduminh(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic unsigned minimum on halfword in memory, with Load-acquire semantics
+  // [Armv8.1]
+  void lduminah(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic unsigned minimum on halfword in memory, with Store-release semantics
+  // [Armv8.1]
+  void lduminlh(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic unsigned minimum on halfword in memory, with Load-acquire and
+  // Store-release semantics [Armv8.1]
+  void lduminalh(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic unsigned minimum on word or doubleword in memory [Armv8.1]
+  void ldumin(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic unsigned minimum on word or doubleword in memory, with Load-acquire
+  // semantics [Armv8.1]
+  void ldumina(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic unsigned minimum on word or doubleword in memory, with Store-release
+  // semantics [Armv8.1]
+  void lduminl(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic unsigned minimum on word or doubleword in memory, with Load-acquire
+  // and Store-release semantics [Armv8.1]
+  void lduminal(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Atomic add on byte in memory, without return. [Armv8.1]
+  void staddb(const Register& rs, const MemOperand& src);
+
+  // Atomic add on byte in memory, with Store-release semantics and without
+  // return. [Armv8.1]
+  void staddlb(const Register& rs, const MemOperand& src);
+
+  // Atomic add on halfword in memory, without return. [Armv8.1]
+  void staddh(const Register& rs, const MemOperand& src);
+
+  // Atomic add on halfword in memory, with Store-release semantics and without
+  // return. [Armv8.1]
+  void staddlh(const Register& rs, const MemOperand& src);
+
+  // Atomic add on word or doubleword in memory, without return. [Armv8.1]
+  void stadd(const Register& rs, const MemOperand& src);
+
+  // Atomic add on word or doubleword in memory, with Store-release semantics
+  // and without return. [Armv8.1]
+  void staddl(const Register& rs, const MemOperand& src);
+
+  // Atomic bit clear on byte in memory, without return. [Armv8.1]
+  void stclrb(const Register& rs, const MemOperand& src);
+
+  // Atomic bit clear on byte in memory, with Store-release semantics and
+  // without return. [Armv8.1]
+  void stclrlb(const Register& rs, const MemOperand& src);
+
+  // Atomic bit clear on halfword in memory, without return. [Armv8.1]
+  void stclrh(const Register& rs, const MemOperand& src);
+
+  // Atomic bit clear on halfword in memory, with Store-release semantics and
+  // without return. [Armv8.1]
+  void stclrlh(const Register& rs, const MemOperand& src);
+
+  // Atomic bit clear on word or doubleword in memory, without return. [Armv8.1]
+  void stclr(const Register& rs, const MemOperand& src);
+
+  // Atomic bit clear on word or doubleword in memory, with Store-release
+  // semantics and without return. [Armv8.1]
+  void stclrl(const Register& rs, const MemOperand& src);
+
+  // Atomic exclusive OR on byte in memory, without return. [Armv8.1]
+  void steorb(const Register& rs, const MemOperand& src);
+
+  // Atomic exclusive OR on byte in memory, with Store-release semantics and
+  // without return. [Armv8.1]
+  void steorlb(const Register& rs, const MemOperand& src);
+
+  // Atomic exclusive OR on halfword in memory, without return. [Armv8.1]
+  void steorh(const Register& rs, const MemOperand& src);
+
+  // Atomic exclusive OR on halfword in memory, with Store-release semantics
+  // and without return. [Armv8.1]
+  void steorlh(const Register& rs, const MemOperand& src);
+
+  // Atomic exclusive OR on word or doubleword in memory, without return.
+  // [Armv8.1]
+  void steor(const Register& rs, const MemOperand& src);
+
+  // Atomic exclusive OR on word or doubleword in memory, with Store-release
+  // semantics and without return. [Armv8.1]
+  void steorl(const Register& rs, const MemOperand& src);
+
+  // Atomic bit set on byte in memory, without return. [Armv8.1]
+  void stsetb(const Register& rs, const MemOperand& src);
+
+  // Atomic bit set on byte in memory, with Store-release semantics and without
+  // return. [Armv8.1]
+  void stsetlb(const Register& rs, const MemOperand& src);
+
+  // Atomic bit set on halfword in memory, without return. [Armv8.1]
+  void stseth(const Register& rs, const MemOperand& src);
+
+  // Atomic bit set on halfword in memory, with Store-release semantics and
+  // without return. [Armv8.1]
+  void stsetlh(const Register& rs, const MemOperand& src);
+
+  // Atomic bit set on word or doubleword in memory, without return. [Armv8.1]
+  void stset(const Register& rs, const MemOperand& src);
+
+  // Atomic bit set on word or doubleword in memory, with Store-release
+  // semantics and without return. [Armv8.1]
+  void stsetl(const Register& rs, const MemOperand& src);
+
+  // Atomic signed maximum on byte in memory, without return. [Armv8.1]
+  void stsmaxb(const Register& rs, const MemOperand& src);
+
+  // Atomic signed maximum on byte in memory, with Store-release semantics and
+  // without return. [Armv8.1]
+  void stsmaxlb(const Register& rs, const MemOperand& src);
+
+  // Atomic signed maximum on halfword in memory, without return. [Armv8.1]
+  void stsmaxh(const Register& rs, const MemOperand& src);
+
+  // Atomic signed maximum on halfword in memory, with Store-release semantics
+  // and without return. [Armv8.1]
+  void stsmaxlh(const Register& rs, const MemOperand& src);
+
+  // Atomic signed maximum on word or doubleword in memory, without return.
+  // [Armv8.1]
+  void stsmax(const Register& rs, const MemOperand& src);
+
+  // Atomic signed maximum on word or doubleword in memory, with Store-release
+  // semantics and without return. [Armv8.1]
+  void stsmaxl(const Register& rs, const MemOperand& src);
+
+  // Atomic signed minimum on byte in memory, without return. [Armv8.1]
+  void stsminb(const Register& rs, const MemOperand& src);
+
+  // Atomic signed minimum on byte in memory, with Store-release semantics and
+  // without return. [Armv8.1]
+  void stsminlb(const Register& rs, const MemOperand& src);
+
+  // Atomic signed minimum on halfword in memory, without return. [Armv8.1]
+  void stsminh(const Register& rs, const MemOperand& src);
+
+  // Atomic signed minimum on halfword in memory, with Store-release semantics
+  // and without return. [Armv8.1]
+  void stsminlh(const Register& rs, const MemOperand& src);
+
+  // Atomic signed minimum on word or doubleword in memory, without return.
+  // [Armv8.1]
+  void stsmin(const Register& rs, const MemOperand& src);
+
+  // Atomic signed minimum on word or doubleword in memory, with Store-release
+  // semantics and without return. [Armv8.1]
+  void stsminl(const Register& rs, const MemOperand& src);
+
+  // Atomic unsigned maximum on byte in memory, without return. [Armv8.1]
+  void stumaxb(const Register& rs, const MemOperand& src);
+
+  // Atomic unsigned maximum on byte in memory, with Store-release semantics and
+  // without return. [Armv8.1]
+  void stumaxlb(const Register& rs, const MemOperand& src);
+
+  // Atomic unsigned maximum on halfword in memory, without return. [Armv8.1]
+  void stumaxh(const Register& rs, const MemOperand& src);
+
+  // Atomic unsigned maximum on halfword in memory, with Store-release semantics
+  // and without return. [Armv8.1]
+  void stumaxlh(const Register& rs, const MemOperand& src);
+
+  // Atomic unsigned maximum on word or doubleword in memory, without return.
+  // [Armv8.1]
+  void stumax(const Register& rs, const MemOperand& src);
+
+  // Atomic unsigned maximum on word or doubleword in memory, with Store-release
+  // semantics and without return. [Armv8.1]
+  void stumaxl(const Register& rs, const MemOperand& src);
+
+  // Atomic unsigned minimum on byte in memory, without return. [Armv8.1]
+  void stuminb(const Register& rs, const MemOperand& src);
+
+  // Atomic unsigned minimum on byte in memory, with Store-release semantics and
+  // without return. [Armv8.1]
+  void stuminlb(const Register& rs, const MemOperand& src);
+
+  // Atomic unsigned minimum on halfword in memory, without return. [Armv8.1]
+  void stuminh(const Register& rs, const MemOperand& src);
+
+  // Atomic unsigned minimum on halfword in memory, with Store-release semantics
+  // and without return. [Armv8.1]
+  void stuminlh(const Register& rs, const MemOperand& src);
+
+  // Atomic unsigned minimum on word or doubleword in memory, without return.
+  // [Armv8.1]
+  void stumin(const Register& rs, const MemOperand& src);
+
+  // Atomic unsigned minimum on word or doubleword in memory, with Store-release
+  // semantics and without return. [Armv8.1]
+  void stuminl(const Register& rs, const MemOperand& src);
+
+  // Swap byte in memory [Armv8.1]
+  void swpb(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Swap byte in memory, with Load-acquire semantics [Armv8.1]
+  void swpab(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Swap byte in memory, with Store-release semantics [Armv8.1]
+  void swplb(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Swap byte in memory, with Load-acquire and Store-release semantics
+  // [Armv8.1]
+  void swpalb(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Swap halfword in memory [Armv8.1]
+  void swph(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Swap halfword in memory, with Load-acquire semantics [Armv8.1]
+  void swpah(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Swap halfword in memory, with Store-release semantics [Armv8.1]
+  void swplh(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Swap halfword in memory, with Load-acquire and Store-release semantics
+  // [Armv8.1]
+  void swpalh(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Swap word or doubleword in memory [Armv8.1]
+  void swp(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Swap word or doubleword in memory, with Load-acquire semantics [Armv8.1]
+  void swpa(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Swap word or doubleword in memory, with Store-release semantics [Armv8.1]
+  void swpl(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Swap word or doubleword in memory, with Load-acquire and Store-release
+  // semantics [Armv8.1]
+  void swpal(const Register& rs, const Register& rt, const MemOperand& src);
+
+  // Load-Acquire RCpc Register byte [Armv8.3]
+  void ldaprb(const Register& rt, const MemOperand& src);
+
+  // Load-Acquire RCpc Register halfword [Armv8.3]
+  void ldaprh(const Register& rt, const MemOperand& src);
+
+  // Load-Acquire RCpc Register word or doubleword [Armv8.3]
+  void ldapr(const Register& rt, const MemOperand& src);
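+
+  // A minimal usage sketch of the atomic memory operations above (illustrative
+  // only; it assumes an Assembler `masm` with CPUFeatures::kAtomics enabled
+  // and a suitably aligned address in x0):
+  //   masm.ldaddal(x1, x2, MemOperand(x0));  // *x0 += x1; old value -> x2.
+  //   masm.swpalb(w3, w4, MemOperand(x0));   // Store the low byte of w3; the
+  //                                          // old byte -> w4.
+  //   masm.stadd(w5, MemOperand(x0));        // *x0 += w5, no value returned.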
+
   // Prefetch memory.
   void prfm(PrefetchOperation op,
             const MemOperand& addr,
@@ -1354,6 +2135,9 @@
   // Instruction synchronization barrier.
   void isb();
 
+  // Error synchronization barrier.
+  void esb();
+
   // Conditional speculation dependency barrier.
   void csdb();
 
@@ -1369,7 +2153,7 @@
   void fmov(const VRegister& vd, float imm);
 
   // Move half precision immediate to FP register [Armv8.2].
-  void fmov(const VRegister& vd, F16 imm);
+  void fmov(const VRegister& vd, Float16 imm);
 
   // Move FP register to register.
   void fmov(const Register& rd, const VRegister& fn);
@@ -1515,6 +2299,7 @@
   // Common FP Convert functions.
   void NEONFPConvertToInt(const Register& rd, const VRegister& vn, Instr op);
   void NEONFPConvertToInt(const VRegister& vd, const VRegister& vn, Instr op);
+  void NEONFP16ConvertToInt(const VRegister& vd, const VRegister& vn, Instr op);
 
   // FP convert between precisions.
   void fcvt(const VRegister& vd, const VRegister& vn);
@@ -1564,6 +2349,9 @@
   // FP convert to signed integer, nearest with ties to even.
   void fcvtns(const Register& rd, const VRegister& vn);
 
+  // FP JavaScript convert to signed integer, rounding toward zero [Armv8.3].
+  void fjcvtzs(const Register& rd, const VRegister& vn);
+
   // FP convert to unsigned integer, nearest with ties to even.
   void fcvtnu(const Register& rd, const VRegister& vn);
 
@@ -2791,6 +3579,11 @@
     return (rn.GetCode() & kRegCodeMask) << Rn_offset;
   }
 
+  static Instr RmSP(Register rm) {
+    VIXL_ASSERT(!rm.IsZero());
+    return (rm.GetCode() & kRegCodeMask) << Rm_offset;
+  }
+
   // Flags encoding.
   static Instr Flags(FlagsUpdate S) {
     if (S == SetFlags) {
@@ -3018,7 +3811,7 @@
   }
 
   // FP Immediates.
-  static Instr ImmFP16(float16 imm);
+  static Instr ImmFP16(Float16 imm);
   static Instr ImmFP32(float imm);
   static Instr ImmFP64(double imm);
 
@@ -3045,7 +3838,7 @@
   // Immediate field checking helpers.
   static bool IsImmAddSub(int64_t immediate);
   static bool IsImmConditionalCompare(int64_t immediate);
-  static bool IsImmFP16(float16 imm);
+  static bool IsImmFP16(Float16 imm);
   static bool IsImmFP32(float imm);
   static bool IsImmFP64(double imm);
   static bool IsImmLogical(uint64_t value,
@@ -3422,7 +4215,7 @@
   bool CPUHas(const CPURegister& rt, const CPURegister& rt2) const;
 
  private:
-  static uint32_t FP16ToImm8(float16 imm);
+  static uint32_t FP16ToImm8(Float16 imm);
   static uint32_t FP32ToImm8(float imm);
   static uint32_t FP64ToImm8(double imm);
 
@@ -3472,7 +4265,8 @@
                         NEONAcrossLanesOp op);
   void NEONAcrossLanes(const VRegister& vd,
                        const VRegister& vn,
-                       NEONAcrossLanesOp op);
+                       NEONAcrossLanesOp op,
+                       Instr op_half);
   void NEONModifiedImmShiftLsl(const VRegister& vd,
                                const int imm8,
                                const int left_shift,
@@ -3486,6 +4280,10 @@
                  const VRegister& vn,
                  const VRegister& vm,
                  NEON3SameOp vop);
+  void NEON3SameFP16(const VRegister& vd,
+                     const VRegister& vn,
+                     const VRegister& vm,
+                     Instr op);
   void NEONFP3Same(const VRegister& vd,
                    const VRegister& vn,
                    const VRegister& vm,
@@ -3506,11 +4304,16 @@
                       const VRegister& vn,
                       NEON2RegMiscOp vop,
                       double value = 0.0);
+  void NEONFP2RegMiscFP16(const VRegister& vd,
+                          const VRegister& vn,
+                          NEON2RegMiscFP16Op vop,
+                          double value = 0.0);
   void NEON2RegMisc(const VRegister& vd,
                     const VRegister& vn,
                     NEON2RegMiscOp vop,
                     int value = 0);
   void NEONFP2RegMisc(const VRegister& vd, const VRegister& vn, Instr op);
+  void NEONFP2RegMiscFP16(const VRegister& vd, const VRegister& vn, Instr op);
   void NEONAddlp(const VRegister& vd, const VRegister& vn, NEON2RegMiscOp op);
   void NEONPerm(const VRegister& vd,
                 const VRegister& vn,
@@ -3520,7 +4323,8 @@
                        const VRegister& vn,
                        const VRegister& vm,
                        int vm_index,
-                       NEONByIndexedElementOp op);
+                       NEONByIndexedElementOp op,
+                       NEONByIndexedElementOp op_half);
   void NEONByElement(const VRegister& vd,
                      const VRegister& vn,
                      const VRegister& vm,
diff --git a/src/aarch64/constants-aarch64.h b/src/aarch64/constants-aarch64.h
index d3d403f..de659f0 100644
--- a/src/aarch64/constants-aarch64.h
+++ b/src/aarch64/constants-aarch64.h
@@ -305,6 +305,7 @@
   WFI   = 3,
   SEV   = 4,
   SEVL  = 5,
+  ESB   = 16,
   CSDB  = 20
 };
 
@@ -653,10 +654,21 @@
 enum UnconditionalBranchToRegisterOp {
   UnconditionalBranchToRegisterFixed = 0xD6000000,
   UnconditionalBranchToRegisterFMask = 0xFE000000,
-  UnconditionalBranchToRegisterMask  = 0xFFFFFC1F,
+  UnconditionalBranchToRegisterMask  = 0xFFFFFC00,
   BR      = UnconditionalBranchToRegisterFixed | 0x001F0000,
   BLR     = UnconditionalBranchToRegisterFixed | 0x003F0000,
-  RET     = UnconditionalBranchToRegisterFixed | 0x005F0000
+  RET     = UnconditionalBranchToRegisterFixed | 0x005F0000,
+
+  BRAAZ  = UnconditionalBranchToRegisterFixed | 0x001F0800,
+  BRABZ  = UnconditionalBranchToRegisterFixed | 0x001F0C00,
+  BLRAAZ = UnconditionalBranchToRegisterFixed | 0x003F0800,
+  BLRABZ = UnconditionalBranchToRegisterFixed | 0x003F0C00,
+  RETAA  = UnconditionalBranchToRegisterFixed | 0x005F0800,
+  RETAB  = UnconditionalBranchToRegisterFixed | 0x005F0C00,
+  BRAA   = UnconditionalBranchToRegisterFixed | 0x011F0800,
+  BRAB   = UnconditionalBranchToRegisterFixed | 0x011F0C00,
+  BLRAA  = UnconditionalBranchToRegisterFixed | 0x013F0800,
+  BLRAB  = UnconditionalBranchToRegisterFixed | 0x013F0C00
 };
 
 // Compare and branch.
@@ -752,6 +764,28 @@
   CLREX                       = SystemExclusiveMonitorFixed
 };
 
+enum SystemPAuthOp {
+  SystemPAuthFixed = 0xD503211F,
+  SystemPAuthFMask = 0xFFFFFD1F,
+  SystemPAuthMask  = 0xFFFFFFFF,
+  PACIA1716 = SystemPAuthFixed | 0x00000100,
+  PACIB1716 = SystemPAuthFixed | 0x00000140,
+  AUTIA1716 = SystemPAuthFixed | 0x00000180,
+  AUTIB1716 = SystemPAuthFixed | 0x000001C0,
+  PACIAZ    = SystemPAuthFixed | 0x00000300,
+  PACIASP   = SystemPAuthFixed | 0x00000320,
+  PACIBZ    = SystemPAuthFixed | 0x00000340,
+  PACIBSP   = SystemPAuthFixed | 0x00000360,
+  AUTIAZ    = SystemPAuthFixed | 0x00000380,
+  AUTIASP   = SystemPAuthFixed | 0x000003A0,
+  AUTIBZ    = SystemPAuthFixed | 0x000003C0,
+  AUTIBSP   = SystemPAuthFixed | 0x000003E0,
+
+  // XPACLRI has the same fixed mask as System Hints and needs to be handled
+  // differently.
+  XPACLRI   = 0xD50320FF
+};
+
 // Any load or store.
 enum LoadStoreAnyOp {
   LoadStoreAnyFMask = 0x0a000000,
@@ -1023,6 +1057,67 @@
   CASPAL_x = CASPFixed | LSEBit_l | LSEBit_o0 | LSEBit_sz
 };
 
+#define ATOMIC_MEMORY_SIMPLE_OPC_LIST(V) \
+  V(LDADD, 0x00000000),                  \
+  V(LDCLR, 0x00001000),                  \
+  V(LDEOR, 0x00002000),                  \
+  V(LDSET, 0x00003000),                  \
+  V(LDSMAX, 0x00004000),                 \
+  V(LDSMIN, 0x00005000),                 \
+  V(LDUMAX, 0x00006000),                 \
+  V(LDUMIN, 0x00007000)
+
+// Atomic memory.
+enum AtomicMemoryOp {
+  AtomicMemoryFixed = 0x38200000,
+  AtomicMemoryFMask = 0x3B200C00,
+  AtomicMemoryMask = 0xFFE0FC00,
+  SWPB = AtomicMemoryFixed | 0x00008000,
+  SWPAB = AtomicMemoryFixed | 0x00808000,
+  SWPLB = AtomicMemoryFixed | 0x00408000,
+  SWPALB = AtomicMemoryFixed | 0x00C08000,
+  SWPH = AtomicMemoryFixed | 0x40008000,
+  SWPAH = AtomicMemoryFixed | 0x40808000,
+  SWPLH = AtomicMemoryFixed | 0x40408000,
+  SWPALH = AtomicMemoryFixed | 0x40C08000,
+  SWP_w = AtomicMemoryFixed | 0x80008000,
+  SWPA_w = AtomicMemoryFixed | 0x80808000,
+  SWPL_w = AtomicMemoryFixed | 0x80408000,
+  SWPAL_w = AtomicMemoryFixed | 0x80C08000,
+  SWP_x = AtomicMemoryFixed | 0xC0008000,
+  SWPA_x = AtomicMemoryFixed | 0xC0808000,
+  SWPL_x = AtomicMemoryFixed | 0xC0408000,
+  SWPAL_x = AtomicMemoryFixed | 0xC0C08000,
+  LDAPRB = AtomicMemoryFixed | 0x0080C000,
+  LDAPRH = AtomicMemoryFixed | 0x4080C000,
+  LDAPR_w = AtomicMemoryFixed | 0x8080C000,
+  LDAPR_x = AtomicMemoryFixed | 0xC080C000,
+
+  AtomicMemorySimpleFMask = 0x3B208C00,
+  AtomicMemorySimpleOpMask = 0x00007000,
+#define ATOMIC_MEMORY_SIMPLE(N, OP)              \
+  N##Op = OP,                                    \
+  N##B = AtomicMemoryFixed | OP,                 \
+  N##AB = AtomicMemoryFixed | OP | 0x00800000,   \
+  N##LB = AtomicMemoryFixed | OP | 0x00400000,   \
+  N##ALB = AtomicMemoryFixed | OP | 0x00C00000,  \
+  N##H = AtomicMemoryFixed | OP | 0x40000000,    \
+  N##AH = AtomicMemoryFixed | OP | 0x40800000,   \
+  N##LH = AtomicMemoryFixed | OP | 0x40400000,   \
+  N##ALH = AtomicMemoryFixed | OP | 0x40C00000,  \
+  N##_w = AtomicMemoryFixed | OP | 0x80000000,   \
+  N##A_w = AtomicMemoryFixed | OP | 0x80800000,  \
+  N##L_w = AtomicMemoryFixed | OP | 0x80400000,  \
+  N##AL_w = AtomicMemoryFixed | OP | 0x80C00000, \
+  N##_x = AtomicMemoryFixed | OP | 0xC0000000,   \
+  N##A_x = AtomicMemoryFixed | OP | 0xC0800000,  \
+  N##L_x = AtomicMemoryFixed | OP | 0xC0400000,  \
+  N##AL_x = AtomicMemoryFixed | OP | 0xC0C00000
+
+  ATOMIC_MEMORY_SIMPLE_OPC_LIST(ATOMIC_MEMORY_SIMPLE)
+#undef ATOMIC_MEMORY_SIMPLE
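+
+  // For example, the LDADD row of the list above expands (among others) to
+  //   LDADDALB = AtomicMemoryFixed | 0x00000000 | 0x00C00000 = 0x38E00000,
+  // the byte-sized form with Load-acquire and Store-release semantics.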
+};
+
 // Conditional compare.
 enum ConditionalCompareOp {
   ConditionalCompareMask = 0x60000000,
@@ -1091,7 +1186,27 @@
   CLZ_x   = CLZ | SixtyFourBits,
   CLS     = DataProcessing1SourceFixed | 0x00001400,
   CLS_w   = CLS,
-  CLS_x   = CLS | SixtyFourBits
+  CLS_x   = CLS | SixtyFourBits,
+
+  // Pointer authentication instructions in Armv8.3.
+  PACIA  = DataProcessing1SourceFixed | 0x80010000,
+  PACIB  = DataProcessing1SourceFixed | 0x80010400,
+  PACDA  = DataProcessing1SourceFixed | 0x80010800,
+  PACDB  = DataProcessing1SourceFixed | 0x80010C00,
+  AUTIA  = DataProcessing1SourceFixed | 0x80011000,
+  AUTIB  = DataProcessing1SourceFixed | 0x80011400,
+  AUTDA  = DataProcessing1SourceFixed | 0x80011800,
+  AUTDB  = DataProcessing1SourceFixed | 0x80011C00,
+  PACIZA = DataProcessing1SourceFixed | 0x80012000,
+  PACIZB = DataProcessing1SourceFixed | 0x80012400,
+  PACDZA = DataProcessing1SourceFixed | 0x80012800,
+  PACDZB = DataProcessing1SourceFixed | 0x80012C00,
+  AUTIZA = DataProcessing1SourceFixed | 0x80013000,
+  AUTIZB = DataProcessing1SourceFixed | 0x80013400,
+  AUTDZA = DataProcessing1SourceFixed | 0x80013800,
+  AUTDZB = DataProcessing1SourceFixed | 0x80013C00,
+  XPACI  = DataProcessing1SourceFixed | 0x80014000,
+  XPACD  = DataProcessing1SourceFixed | 0x80014400
 };
 
 // Data processing 2 source.
@@ -1117,6 +1232,7 @@
   RORV_w  = DataProcessing2SourceFixed | 0x00002C00,
   RORV_x  = DataProcessing2SourceFixed | 0x80002C00,
   RORV    = RORV_w,
+  PACGA   = DataProcessing2SourceFixed | SixtyFourBits | 0x00003000,
   CRC32B  = DataProcessing2SourceFixed | 0x00004000,
   CRC32H  = DataProcessing2SourceFixed | 0x00004400,
   CRC32W  = DataProcessing2SourceFixed | 0x00004800,
@@ -1151,15 +1267,19 @@
   FPCompareFixed = 0x1E202000,
   FPCompareFMask = 0x5F203C00,
   FPCompareMask  = 0xFFE0FC1F,
+  FCMP_h         = FPCompareFixed | FP16 | 0x00000000,
   FCMP_s         = FPCompareFixed | 0x00000000,
   FCMP_d         = FPCompareFixed | FP64 | 0x00000000,
   FCMP           = FCMP_s,
+  FCMP_h_zero    = FPCompareFixed | FP16 | 0x00000008,
   FCMP_s_zero    = FPCompareFixed | 0x00000008,
   FCMP_d_zero    = FPCompareFixed | FP64 | 0x00000008,
   FCMP_zero      = FCMP_s_zero,
+  FCMPE_h        = FPCompareFixed | FP16 | 0x00000010,
   FCMPE_s        = FPCompareFixed | 0x00000010,
   FCMPE_d        = FPCompareFixed | FP64 | 0x00000010,
   FCMPE          = FCMPE_s,
+  FCMPE_h_zero   = FPCompareFixed | FP16 | 0x00000018,
   FCMPE_s_zero   = FPCompareFixed | 0x00000018,
   FCMPE_d_zero   = FPCompareFixed | FP64 | 0x00000018,
   FCMPE_zero     = FCMPE_s_zero
@@ -1170,9 +1290,11 @@
   FPConditionalCompareFixed = 0x1E200400,
   FPConditionalCompareFMask = 0x5F200C00,
   FPConditionalCompareMask  = 0xFFE00C10,
+  FCCMP_h                   = FPConditionalCompareFixed | FP16 | 0x00000000,
   FCCMP_s                   = FPConditionalCompareFixed | 0x00000000,
   FCCMP_d                   = FPConditionalCompareFixed | FP64 | 0x00000000,
   FCCMP                     = FCCMP_s,
+  FCCMPE_h                  = FPConditionalCompareFixed | FP16 | 0x00000010,
   FCCMPE_s                  = FPConditionalCompareFixed | 0x00000010,
   FCCMPE_d                  = FPConditionalCompareFixed | FP64 | 0x00000010,
   FCCMPE                    = FCCMPE_s
@@ -1183,6 +1305,7 @@
   FPConditionalSelectFixed = 0x1E200C00,
   FPConditionalSelectFMask = 0x5F200C00,
   FPConditionalSelectMask  = 0xFFE00C00,
+  FCSEL_h                  = FPConditionalSelectFixed | FP16 | 0x00000000,
   FCSEL_s                  = FPConditionalSelectFixed | 0x00000000,
   FCSEL_d                  = FPConditionalSelectFixed | FP64 | 0x00000000,
   FCSEL                    = FCSEL_s
@@ -1207,12 +1330,15 @@
   FMOV_s   = FPDataProcessing1SourceFixed | 0x00000000,
   FMOV_d   = FPDataProcessing1SourceFixed | FP64 | 0x00000000,
   FMOV     = FMOV_s,
+  FABS_h   = FPDataProcessing1SourceFixed | FP16 | 0x00008000,
   FABS_s   = FPDataProcessing1SourceFixed | 0x00008000,
   FABS_d   = FPDataProcessing1SourceFixed | FP64 | 0x00008000,
   FABS     = FABS_s,
+  FNEG_h   = FPDataProcessing1SourceFixed | FP16 | 0x00010000,
   FNEG_s   = FPDataProcessing1SourceFixed | 0x00010000,
   FNEG_d   = FPDataProcessing1SourceFixed | FP64 | 0x00010000,
   FNEG     = FNEG_s,
+  FSQRT_h  = FPDataProcessing1SourceFixed | FP16 | 0x00018000,
   FSQRT_s  = FPDataProcessing1SourceFixed | 0x00018000,
   FSQRT_d  = FPDataProcessing1SourceFixed | FP64 | 0x00018000,
   FSQRT    = FSQRT_s,
@@ -1222,24 +1348,31 @@
   FCVT_hd  = FPDataProcessing1SourceFixed | FP64 | 0x00038000,
   FCVT_sh  = FPDataProcessing1SourceFixed | 0x00C20000,
   FCVT_dh  = FPDataProcessing1SourceFixed | 0x00C28000,
+  FRINTN_h = FPDataProcessing1SourceFixed | FP16 | 0x00040000,
   FRINTN_s = FPDataProcessing1SourceFixed | 0x00040000,
   FRINTN_d = FPDataProcessing1SourceFixed | FP64 | 0x00040000,
   FRINTN   = FRINTN_s,
+  FRINTP_h = FPDataProcessing1SourceFixed | FP16 | 0x00048000,
   FRINTP_s = FPDataProcessing1SourceFixed | 0x00048000,
   FRINTP_d = FPDataProcessing1SourceFixed | FP64 | 0x00048000,
   FRINTP   = FRINTP_s,
+  FRINTM_h = FPDataProcessing1SourceFixed | FP16 | 0x00050000,
   FRINTM_s = FPDataProcessing1SourceFixed | 0x00050000,
   FRINTM_d = FPDataProcessing1SourceFixed | FP64 | 0x00050000,
   FRINTM   = FRINTM_s,
+  FRINTZ_h = FPDataProcessing1SourceFixed | FP16 | 0x00058000,
   FRINTZ_s = FPDataProcessing1SourceFixed | 0x00058000,
   FRINTZ_d = FPDataProcessing1SourceFixed | FP64 | 0x00058000,
   FRINTZ   = FRINTZ_s,
+  FRINTA_h = FPDataProcessing1SourceFixed | FP16 | 0x00060000,
   FRINTA_s = FPDataProcessing1SourceFixed | 0x00060000,
   FRINTA_d = FPDataProcessing1SourceFixed | FP64 | 0x00060000,
   FRINTA   = FRINTA_s,
+  FRINTX_h = FPDataProcessing1SourceFixed | FP16 | 0x00070000,
   FRINTX_s = FPDataProcessing1SourceFixed | 0x00070000,
   FRINTX_d = FPDataProcessing1SourceFixed | FP64 | 0x00070000,
   FRINTX   = FRINTX_s,
+  FRINTI_h = FPDataProcessing1SourceFixed | FP16 | 0x00078000,
   FRINTI_s = FPDataProcessing1SourceFixed | 0x00078000,
   FRINTI_d = FPDataProcessing1SourceFixed | FP64 | 0x00078000,
   FRINTI   = FRINTI_s
@@ -1251,30 +1384,39 @@
   FPDataProcessing2SourceFMask = 0x5F200C00,
   FPDataProcessing2SourceMask  = 0xFFE0FC00,
   FMUL     = FPDataProcessing2SourceFixed | 0x00000000,
+  FMUL_h   = FMUL | FP16,
   FMUL_s   = FMUL,
   FMUL_d   = FMUL | FP64,
   FDIV     = FPDataProcessing2SourceFixed | 0x00001000,
+  FDIV_h   = FDIV | FP16,
   FDIV_s   = FDIV,
   FDIV_d   = FDIV | FP64,
   FADD     = FPDataProcessing2SourceFixed | 0x00002000,
+  FADD_h   = FADD | FP16,
   FADD_s   = FADD,
   FADD_d   = FADD | FP64,
   FSUB     = FPDataProcessing2SourceFixed | 0x00003000,
+  FSUB_h   = FSUB | FP16,
   FSUB_s   = FSUB,
   FSUB_d   = FSUB | FP64,
   FMAX     = FPDataProcessing2SourceFixed | 0x00004000,
+  FMAX_h   = FMAX | FP16,
   FMAX_s   = FMAX,
   FMAX_d   = FMAX | FP64,
   FMIN     = FPDataProcessing2SourceFixed | 0x00005000,
+  FMIN_h   = FMIN | FP16,
   FMIN_s   = FMIN,
   FMIN_d   = FMIN | FP64,
   FMAXNM   = FPDataProcessing2SourceFixed | 0x00006000,
+  FMAXNM_h = FMAXNM | FP16,
   FMAXNM_s = FMAXNM,
   FMAXNM_d = FMAXNM | FP64,
   FMINNM   = FPDataProcessing2SourceFixed | 0x00007000,
+  FMINNM_h = FMINNM | FP16,
   FMINNM_s = FMINNM,
   FMINNM_d = FMINNM | FP64,
   FNMUL    = FPDataProcessing2SourceFixed | 0x00008000,
+  FNMUL_h  = FNMUL | FP16,
   FNMUL_s  = FNMUL,
   FNMUL_d  = FNMUL | FP64
 };
@@ -1284,6 +1426,10 @@
   FPDataProcessing3SourceFixed = 0x1F000000,
   FPDataProcessing3SourceFMask = 0x5F000000,
   FPDataProcessing3SourceMask  = 0xFFE08000,
+  FMADD_h                      = FPDataProcessing3SourceFixed | 0x00C00000,
+  FMSUB_h                      = FPDataProcessing3SourceFixed | 0x00C08000,
+  FNMADD_h                     = FPDataProcessing3SourceFixed | 0x00E00000,
+  FNMSUB_h                     = FPDataProcessing3SourceFixed | 0x00E08000,
   FMADD_s                      = FPDataProcessing3SourceFixed | 0x00000000,
   FMSUB_s                      = FPDataProcessing3SourceFixed | 0x00008000,
   FNMADD_s                     = FPDataProcessing3SourceFixed | 0x00200000,
@@ -1300,61 +1446,85 @@
   FPIntegerConvertFMask = 0x5F20FC00,
   FPIntegerConvertMask  = 0xFFFFFC00,
   FCVTNS    = FPIntegerConvertFixed | 0x00000000,
+  FCVTNS_wh = FCVTNS | FP16,
+  FCVTNS_xh = FCVTNS | SixtyFourBits | FP16,
   FCVTNS_ws = FCVTNS,
   FCVTNS_xs = FCVTNS | SixtyFourBits,
   FCVTNS_wd = FCVTNS | FP64,
   FCVTNS_xd = FCVTNS | SixtyFourBits | FP64,
   FCVTNU    = FPIntegerConvertFixed | 0x00010000,
+  FCVTNU_wh = FCVTNU | FP16,
+  FCVTNU_xh = FCVTNU | SixtyFourBits | FP16,
   FCVTNU_ws = FCVTNU,
   FCVTNU_xs = FCVTNU | SixtyFourBits,
   FCVTNU_wd = FCVTNU | FP64,
   FCVTNU_xd = FCVTNU | SixtyFourBits | FP64,
   FCVTPS    = FPIntegerConvertFixed | 0x00080000,
+  FCVTPS_wh = FCVTPS | FP16,
+  FCVTPS_xh = FCVTPS | SixtyFourBits | FP16,
   FCVTPS_ws = FCVTPS,
   FCVTPS_xs = FCVTPS | SixtyFourBits,
   FCVTPS_wd = FCVTPS | FP64,
   FCVTPS_xd = FCVTPS | SixtyFourBits | FP64,
   FCVTPU    = FPIntegerConvertFixed | 0x00090000,
+  FCVTPU_wh = FCVTPU | FP16,
+  FCVTPU_xh = FCVTPU | SixtyFourBits | FP16,
   FCVTPU_ws = FCVTPU,
   FCVTPU_xs = FCVTPU | SixtyFourBits,
   FCVTPU_wd = FCVTPU | FP64,
   FCVTPU_xd = FCVTPU | SixtyFourBits | FP64,
   FCVTMS    = FPIntegerConvertFixed | 0x00100000,
+  FCVTMS_wh = FCVTMS | FP16,
+  FCVTMS_xh = FCVTMS | SixtyFourBits | FP16,
   FCVTMS_ws = FCVTMS,
   FCVTMS_xs = FCVTMS | SixtyFourBits,
   FCVTMS_wd = FCVTMS | FP64,
   FCVTMS_xd = FCVTMS | SixtyFourBits | FP64,
   FCVTMU    = FPIntegerConvertFixed | 0x00110000,
+  FCVTMU_wh = FCVTMU | FP16,
+  FCVTMU_xh = FCVTMU | SixtyFourBits | FP16,
   FCVTMU_ws = FCVTMU,
   FCVTMU_xs = FCVTMU | SixtyFourBits,
   FCVTMU_wd = FCVTMU | FP64,
   FCVTMU_xd = FCVTMU | SixtyFourBits | FP64,
   FCVTZS    = FPIntegerConvertFixed | 0x00180000,
+  FCVTZS_wh = FCVTZS | FP16,
+  FCVTZS_xh = FCVTZS | SixtyFourBits | FP16,
   FCVTZS_ws = FCVTZS,
   FCVTZS_xs = FCVTZS | SixtyFourBits,
   FCVTZS_wd = FCVTZS | FP64,
   FCVTZS_xd = FCVTZS | SixtyFourBits | FP64,
   FCVTZU    = FPIntegerConvertFixed | 0x00190000,
+  FCVTZU_wh = FCVTZU | FP16,
+  FCVTZU_xh = FCVTZU | SixtyFourBits | FP16,
   FCVTZU_ws = FCVTZU,
   FCVTZU_xs = FCVTZU | SixtyFourBits,
   FCVTZU_wd = FCVTZU | FP64,
   FCVTZU_xd = FCVTZU | SixtyFourBits | FP64,
   SCVTF     = FPIntegerConvertFixed | 0x00020000,
+  SCVTF_hw  = SCVTF | FP16,
+  SCVTF_hx  = SCVTF | SixtyFourBits | FP16,
   SCVTF_sw  = SCVTF,
   SCVTF_sx  = SCVTF | SixtyFourBits,
   SCVTF_dw  = SCVTF | FP64,
   SCVTF_dx  = SCVTF | SixtyFourBits | FP64,
   UCVTF     = FPIntegerConvertFixed | 0x00030000,
+  UCVTF_hw  = UCVTF | FP16,
+  UCVTF_hx  = UCVTF | SixtyFourBits | FP16,
   UCVTF_sw  = UCVTF,
   UCVTF_sx  = UCVTF | SixtyFourBits,
   UCVTF_dw  = UCVTF | FP64,
   UCVTF_dx  = UCVTF | SixtyFourBits | FP64,
   FCVTAS    = FPIntegerConvertFixed | 0x00040000,
+  FCVTAS_wh = FCVTAS | FP16,
+  FCVTAS_xh = FCVTAS | SixtyFourBits | FP16,
   FCVTAS_ws = FCVTAS,
   FCVTAS_xs = FCVTAS | SixtyFourBits,
   FCVTAS_wd = FCVTAS | FP64,
   FCVTAS_xd = FCVTAS | SixtyFourBits | FP64,
   FCVTAU    = FPIntegerConvertFixed | 0x00050000,
+  FCVTAU_wh = FCVTAU | FP16,
+  FCVTAU_xh = FCVTAU | SixtyFourBits | FP16,
   FCVTAU_ws = FCVTAU,
   FCVTAU_xs = FCVTAU | SixtyFourBits,
   FCVTAU_wd = FCVTAU | FP64,
@@ -1368,7 +1538,8 @@
   FMOV_xd   = FMOV_ws | SixtyFourBits | FP64,
   FMOV_dx   = FMOV_sw | SixtyFourBits | FP64,
   FMOV_d1_x = FPIntegerConvertFixed | SixtyFourBits | 0x008F0000,
-  FMOV_x_d1 = FPIntegerConvertFixed | SixtyFourBits | 0x008E0000
+  FMOV_x_d1 = FPIntegerConvertFixed | SixtyFourBits | 0x008E0000,
+  FJCVTZS   = FPIntegerConvertFixed | FP64 | 0x001E0000
 };
 
 // Conversion between fixed point and floating point.
@@ -1377,21 +1548,29 @@
   FPFixedPointConvertFMask = 0x5F200000,
   FPFixedPointConvertMask  = 0xFFFF0000,
   FCVTZS_fixed    = FPFixedPointConvertFixed | 0x00180000,
+  FCVTZS_wh_fixed = FCVTZS_fixed | FP16,
+  FCVTZS_xh_fixed = FCVTZS_fixed | SixtyFourBits | FP16,
   FCVTZS_ws_fixed = FCVTZS_fixed,
   FCVTZS_xs_fixed = FCVTZS_fixed | SixtyFourBits,
   FCVTZS_wd_fixed = FCVTZS_fixed | FP64,
   FCVTZS_xd_fixed = FCVTZS_fixed | SixtyFourBits | FP64,
   FCVTZU_fixed    = FPFixedPointConvertFixed | 0x00190000,
+  FCVTZU_wh_fixed = FCVTZU_fixed | FP16,
+  FCVTZU_xh_fixed = FCVTZU_fixed | SixtyFourBits | FP16,
   FCVTZU_ws_fixed = FCVTZU_fixed,
   FCVTZU_xs_fixed = FCVTZU_fixed | SixtyFourBits,
   FCVTZU_wd_fixed = FCVTZU_fixed | FP64,
   FCVTZU_xd_fixed = FCVTZU_fixed | SixtyFourBits | FP64,
   SCVTF_fixed     = FPFixedPointConvertFixed | 0x00020000,
+  SCVTF_hw_fixed  = SCVTF_fixed | FP16,
+  SCVTF_hx_fixed  = SCVTF_fixed | SixtyFourBits | FP16,
   SCVTF_sw_fixed  = SCVTF_fixed,
   SCVTF_sx_fixed  = SCVTF_fixed | SixtyFourBits,
   SCVTF_dw_fixed  = SCVTF_fixed | FP64,
   SCVTF_dx_fixed  = SCVTF_fixed | SixtyFourBits | FP64,
   UCVTF_fixed     = FPFixedPointConvertFixed | 0x00030000,
+  UCVTF_hw_fixed  = UCVTF_fixed | FP16,
+  UCVTF_hx_fixed  = UCVTF_fixed | SixtyFourBits | FP16,
   UCVTF_sw_fixed  = UCVTF_fixed,
   UCVTF_sx_fixed  = UCVTF_fixed | SixtyFourBits,
   UCVTF_dw_fixed  = UCVTF_fixed | FP64,
@@ -1498,6 +1677,42 @@
   NEON_FCVTN_opcode = NEON_FCVTN & NEON2RegMiscOpcode
 };
 
+// NEON instructions with two register operands (FP16).
+enum NEON2RegMiscFP16Op {
+  NEON2RegMiscFP16Fixed = 0x0E780800,
+  NEON2RegMiscFP16FMask = 0x9F7E0C00,
+  NEON2RegMiscFP16Mask  = 0xBFFFFC00,
+  NEON_FRINTN_H     = NEON2RegMiscFP16Fixed | 0x00018000,
+  NEON_FRINTM_H     = NEON2RegMiscFP16Fixed | 0x00019000,
+  NEON_FCVTNS_H     = NEON2RegMiscFP16Fixed | 0x0001A000,
+  NEON_FCVTMS_H     = NEON2RegMiscFP16Fixed | 0x0001B000,
+  NEON_FCVTAS_H     = NEON2RegMiscFP16Fixed | 0x0001C000,
+  NEON_SCVTF_H      = NEON2RegMiscFP16Fixed | 0x0001D000,
+  NEON_FCMGT_H_zero = NEON2RegMiscFP16Fixed | 0x0080C000,
+  NEON_FCMEQ_H_zero = NEON2RegMiscFP16Fixed | 0x0080D000,
+  NEON_FCMLT_H_zero = NEON2RegMiscFP16Fixed | 0x0080E000,
+  NEON_FABS_H       = NEON2RegMiscFP16Fixed | 0x0080F000,
+  NEON_FRINTP_H     = NEON2RegMiscFP16Fixed | 0x00818000,
+  NEON_FRINTZ_H     = NEON2RegMiscFP16Fixed | 0x00819000,
+  NEON_FCVTPS_H     = NEON2RegMiscFP16Fixed | 0x0081A000,
+  NEON_FCVTZS_H     = NEON2RegMiscFP16Fixed | 0x0081B000,
+  NEON_FRECPE_H     = NEON2RegMiscFP16Fixed | 0x0081D000,
+  NEON_FRINTA_H     = NEON2RegMiscFP16Fixed | 0x20018000,
+  NEON_FRINTX_H     = NEON2RegMiscFP16Fixed | 0x20019000,
+  NEON_FCVTNU_H     = NEON2RegMiscFP16Fixed | 0x2001A000,
+  NEON_FCVTMU_H     = NEON2RegMiscFP16Fixed | 0x2001B000,
+  NEON_FCVTAU_H     = NEON2RegMiscFP16Fixed | 0x2001C000,
+  NEON_UCVTF_H      = NEON2RegMiscFP16Fixed | 0x2001D000,
+  NEON_FCMGE_H_zero = NEON2RegMiscFP16Fixed | 0x2080C000,
+  NEON_FCMLE_H_zero = NEON2RegMiscFP16Fixed | 0x2080D000,
+  NEON_FNEG_H       = NEON2RegMiscFP16Fixed | 0x2080F000,
+  NEON_FRINTI_H     = NEON2RegMiscFP16Fixed | 0x20819000,
+  NEON_FCVTPU_H     = NEON2RegMiscFP16Fixed | 0x2081A000,
+  NEON_FCVTZU_H     = NEON2RegMiscFP16Fixed | 0x2081B000,
+  NEON_FRSQRTE_H    = NEON2RegMiscFP16Fixed | 0x2081D000,
+  NEON_FSQRT_H      = NEON2RegMiscFP16Fixed | 0x2081F000
+};
+
 // NEON instructions with three same-type operands.
 enum NEON3SameOp {
   NEON3SameFixed = 0x0E200400,
@@ -1595,6 +1810,37 @@
 };
 
 
+enum NEON3SameFP16 {
+  NEON3SameFP16Fixed = 0x0E400400,
+  NEON3SameFP16FMask = 0x9F60C400,
+  NEON3SameFP16Mask  = 0xBFE0FC00,
+  NEON_FMAXNM_H  = NEON3SameFP16Fixed | 0x00000000,
+  NEON_FMLA_H    = NEON3SameFP16Fixed | 0x00000800,
+  NEON_FADD_H    = NEON3SameFP16Fixed | 0x00001000,
+  NEON_FMULX_H   = NEON3SameFP16Fixed | 0x00001800,
+  NEON_FCMEQ_H   = NEON3SameFP16Fixed | 0x00002000,
+  NEON_FMAX_H    = NEON3SameFP16Fixed | 0x00003000,
+  NEON_FRECPS_H  = NEON3SameFP16Fixed | 0x00003800,
+  NEON_FMINNM_H  = NEON3SameFP16Fixed | 0x00800000,
+  NEON_FMLS_H    = NEON3SameFP16Fixed | 0x00800800,
+  NEON_FSUB_H    = NEON3SameFP16Fixed | 0x00801000,
+  NEON_FMIN_H    = NEON3SameFP16Fixed | 0x00803000,
+  NEON_FRSQRTS_H = NEON3SameFP16Fixed | 0x00803800,
+  NEON_FMAXNMP_H = NEON3SameFP16Fixed | 0x20000000,
+  NEON_FADDP_H   = NEON3SameFP16Fixed | 0x20001000,
+  NEON_FMUL_H    = NEON3SameFP16Fixed | 0x20001800,
+  NEON_FCMGE_H   = NEON3SameFP16Fixed | 0x20002000,
+  NEON_FACGE_H   = NEON3SameFP16Fixed | 0x20002800,
+  NEON_FMAXP_H   = NEON3SameFP16Fixed | 0x20003000,
+  NEON_FDIV_H    = NEON3SameFP16Fixed | 0x20003800,
+  NEON_FMINNMP_H = NEON3SameFP16Fixed | 0x20800000,
+  NEON_FABD_H    = NEON3SameFP16Fixed | 0x20801000,
+  NEON_FCMGT_H   = NEON3SameFP16Fixed | 0x20802000,
+  NEON_FACGT_H   = NEON3SameFP16Fixed | 0x20802800,
+  NEON_FMINP_H   = NEON3SameFP16Fixed | 0x20803000
+};
+
+
 // 'Extra' NEON instructions with three same-type operands.
 enum NEON3SameExtraOp {
   NEON3SameExtraFixed = 0x0E008400,
@@ -1690,10 +1936,18 @@
   NEON_UMAXV  = NEONAcrossLanesFixed | 0x2000A000,
   NEON_UMINV  = NEONAcrossLanesFixed | 0x2001A000,
 
+  NEONAcrossLanesFP16Fixed = NEONAcrossLanesFixed | 0x0000C000,
+  NEONAcrossLanesFP16FMask = NEONAcrossLanesFMask | 0x2000C000,
+  NEONAcrossLanesFP16Mask  = NEONAcrossLanesMask  | 0x20800000,
+  NEON_FMAXNMV_H = NEONAcrossLanesFP16Fixed | 0x00000000,
+  NEON_FMAXV_H   = NEONAcrossLanesFP16Fixed | 0x00003000,
+  NEON_FMINNMV_H = NEONAcrossLanesFP16Fixed | 0x00800000,
+  NEON_FMINV_H   = NEONAcrossLanesFP16Fixed | 0x00803000,
+
   // NEON floating point across instructions.
-  NEONAcrossLanesFPFixed = NEONAcrossLanesFixed | 0x0000C000,
-  NEONAcrossLanesFPFMask = NEONAcrossLanesFMask | 0x0000C000,
-  NEONAcrossLanesFPMask  = NEONAcrossLanesMask  | 0x00800000,
+  NEONAcrossLanesFPFixed = NEONAcrossLanesFixed | 0x2000C000,
+  NEONAcrossLanesFPFMask = NEONAcrossLanesFMask | 0x2000C000,
+  NEONAcrossLanesFPMask  = NEONAcrossLanesMask  | 0x20800000,
 
   NEON_FMAXV   = NEONAcrossLanesFPFixed | 0x2000F000,
   NEON_FMINV   = NEONAcrossLanesFPFixed | 0x2080F000,
@@ -1724,6 +1978,10 @@
   NEON_SQRDMLAH_byelement = NEONByIndexedElementFixed | 0x2000D000,
   NEON_UDOT_byelement = NEONByIndexedElementFixed | 0x2000E000,
   NEON_SQRDMLSH_byelement = NEONByIndexedElementFixed | 0x2000F000,
+  NEON_FMLA_H_byelement   = NEONByIndexedElementFixed | 0x00001000,
+  NEON_FMLS_H_byelement   = NEONByIndexedElementFixed | 0x00005000,
+  NEON_FMUL_H_byelement   = NEONByIndexedElementFixed | 0x00009000,
+  NEON_FMULX_H_byelement  = NEONByIndexedElementFixed | 0x20009000,
 
   // Floating point instructions.
   NEONByIndexedElementFPFixed = NEONByIndexedElementFixed | 0x00800000,
@@ -2072,6 +2330,33 @@
   NEON_FCVTXN_scalar     = NEON_Q | NEONScalar | NEON_FCVTXN
 };
 
+// NEON scalar instructions with two register operands (FP16).
+enum NEONScalar2RegMiscFP16Op {
+  NEONScalar2RegMiscFP16Fixed = 0x5E780800,
+  NEONScalar2RegMiscFP16FMask = 0xDF7E0C00,
+  NEONScalar2RegMiscFP16Mask  = 0xFFFFFC00,
+  NEON_FCVTNS_H_scalar     = NEON_Q | NEONScalar | NEON_FCVTNS_H,
+  NEON_FCVTMS_H_scalar     = NEON_Q | NEONScalar | NEON_FCVTMS_H,
+  NEON_FCVTAS_H_scalar     = NEON_Q | NEONScalar | NEON_FCVTAS_H,
+  NEON_SCVTF_H_scalar      = NEON_Q | NEONScalar | NEON_SCVTF_H,
+  NEON_FCMGT_H_zero_scalar = NEON_Q | NEONScalar | NEON_FCMGT_H_zero,
+  NEON_FCMEQ_H_zero_scalar = NEON_Q | NEONScalar | NEON_FCMEQ_H_zero,
+  NEON_FCMLT_H_zero_scalar = NEON_Q | NEONScalar | NEON_FCMLT_H_zero,
+  NEON_FCVTPS_H_scalar     = NEON_Q | NEONScalar | NEON_FCVTPS_H,
+  NEON_FCVTZS_H_scalar     = NEON_Q | NEONScalar | NEON_FCVTZS_H,
+  NEON_FRECPE_H_scalar     = NEON_Q | NEONScalar | NEON_FRECPE_H,
+  NEON_FRECPX_H_scalar     = NEONScalar2RegMiscFP16Fixed | 0x0081F000,
+  NEON_FCVTNU_H_scalar     = NEON_Q | NEONScalar | NEON_FCVTNU_H,
+  NEON_FCVTMU_H_scalar     = NEON_Q | NEONScalar | NEON_FCVTMU_H,
+  NEON_FCVTAU_H_scalar     = NEON_Q | NEONScalar | NEON_FCVTAU_H,
+  NEON_UCVTF_H_scalar      = NEON_Q | NEONScalar | NEON_UCVTF_H,
+  NEON_FCMGE_H_zero_scalar = NEON_Q | NEONScalar | NEON_FCMGE_H_zero,
+  NEON_FCMLE_H_zero_scalar = NEON_Q | NEONScalar | NEON_FCMLE_H_zero,
+  NEON_FCVTPU_H_scalar     = NEON_Q | NEONScalar | NEON_FCVTPU_H,
+  NEON_FCVTZU_H_scalar     = NEON_Q | NEONScalar | NEON_FCVTZU_H,
+  NEON_FRSQRTE_H_scalar    = NEON_Q | NEONScalar | NEON_FRSQRTE_H
+};
+
 // NEON scalar instructions with three same-type operands.
 enum NEONScalar3SameOp {
   NEONScalar3SameFixed = 0x5E200400,
@@ -2115,6 +2400,22 @@
   NEON_FABD_scalar    = NEON_Q | NEONScalar | NEON_FABD
 };
 
+// NEON scalar FP16 instructions with three same-type operands.
+enum NEONScalar3SameFP16Op {
+  NEONScalar3SameFP16Fixed = 0x5E400400,
+  NEONScalar3SameFP16FMask = 0xDF60C400,
+  NEONScalar3SameFP16Mask  = 0xFFE0FC00,
+  NEON_FABD_H_scalar    = NEON_Q | NEONScalar | NEON_FABD_H,
+  NEON_FMULX_H_scalar   = NEON_Q | NEONScalar | NEON_FMULX_H,
+  NEON_FCMEQ_H_scalar   = NEON_Q | NEONScalar | NEON_FCMEQ_H,
+  NEON_FCMGE_H_scalar   = NEON_Q | NEONScalar | NEON_FCMGE_H,
+  NEON_FCMGT_H_scalar   = NEON_Q | NEONScalar | NEON_FCMGT_H,
+  NEON_FACGE_H_scalar   = NEON_Q | NEONScalar | NEON_FACGE_H,
+  NEON_FACGT_H_scalar   = NEON_Q | NEONScalar | NEON_FACGT_H,
+  NEON_FRECPS_H_scalar  = NEON_Q | NEONScalar | NEON_FRECPS_H,
+  NEON_FRSQRTS_H_scalar = NEON_Q | NEONScalar | NEON_FRSQRTS_H
+};
+
 // 'Extra' NEON scalar instructions with three same-type operands.
 enum NEONScalar3SameExtraOp {
   NEONScalar3SameExtraFixed = 0x5E008400,
@@ -2149,6 +2450,10 @@
     = NEON_Q | NEONScalar | NEON_SQRDMLAH_byelement,
   NEON_SQRDMLSH_byelement_scalar
     = NEON_Q | NEONScalar | NEON_SQRDMLSH_byelement,
+  NEON_FMLA_H_byelement_scalar  = NEON_Q | NEONScalar | NEON_FMLA_H_byelement,
+  NEON_FMLS_H_byelement_scalar  = NEON_Q | NEONScalar | NEON_FMLS_H_byelement,
+  NEON_FMUL_H_byelement_scalar  = NEON_Q | NEONScalar | NEON_FMUL_H_byelement,
+  NEON_FMULX_H_byelement_scalar = NEON_Q | NEONScalar | NEON_FMULX_H_byelement,
 
   // Floating point instructions.
   NEONScalarByIndexedElementFPFixed
@@ -2174,12 +2479,17 @@
   NEONScalarPairwiseFixed = 0x5E300800,
   NEONScalarPairwiseFMask = 0xDF3E0C00,
   NEONScalarPairwiseMask  = 0xFFB1F800,
-  NEON_ADDP_scalar    = NEONScalarPairwiseFixed | 0x0081B000,
-  NEON_FMAXNMP_scalar = NEONScalarPairwiseFixed | 0x2000C000,
-  NEON_FMINNMP_scalar = NEONScalarPairwiseFixed | 0x2080C000,
-  NEON_FADDP_scalar   = NEONScalarPairwiseFixed | 0x2000D000,
-  NEON_FMAXP_scalar   = NEONScalarPairwiseFixed | 0x2000F000,
-  NEON_FMINP_scalar   = NEONScalarPairwiseFixed | 0x2080F000
+  NEON_ADDP_scalar      = NEONScalarPairwiseFixed | 0x0081B000,
+  NEON_FMAXNMP_h_scalar = NEONScalarPairwiseFixed | 0x0000C000,
+  NEON_FADDP_h_scalar   = NEONScalarPairwiseFixed | 0x0000D000,
+  NEON_FMAXP_h_scalar   = NEONScalarPairwiseFixed | 0x0000F000,
+  NEON_FMINNMP_h_scalar = NEONScalarPairwiseFixed | 0x0080C000,
+  NEON_FMINP_h_scalar   = NEONScalarPairwiseFixed | 0x0080F000,
+  NEON_FMAXNMP_scalar   = NEONScalarPairwiseFixed | 0x2000C000,
+  NEON_FMINNMP_scalar   = NEONScalarPairwiseFixed | 0x2080C000,
+  NEON_FADDP_scalar     = NEONScalarPairwiseFixed | 0x2000D000,
+  NEON_FMAXP_scalar     = NEONScalarPairwiseFixed | 0x2000F000,
+  NEON_FMINP_scalar     = NEONScalarPairwiseFixed | 0x2080F000
 };
 
 // NEON scalar shift immediate.
diff --git a/src/aarch64/cpu-features-auditor-aarch64.cc b/src/aarch64/cpu-features-auditor-aarch64.cc
index 68fae51..66f0d80 100644
--- a/src/aarch64/cpu-features-auditor-aarch64.cc
+++ b/src/aarch64/cpu-features-auditor-aarch64.cc
@@ -47,6 +47,10 @@
     auditor_->seen_.Combine(auditor_->last_instruction_);
   }
 
+  void Record(const CPUFeatures& features) {
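+    // Record a pre-combined set of features for this instruction.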
+    auditor_->last_instruction_.Combine(features);
+  }
+
   void Record(CPUFeatures::Feature feature0,
               CPUFeatures::Feature feature1 = CPUFeatures::kNone,
               CPUFeatures::Feature feature2 = CPUFeatures::kNone,
@@ -136,6 +140,22 @@
   USE(instr);
 }
 
+void CPUFeaturesAuditor::VisitAtomicMemory(const Instruction* instr) {
+  RecordInstructionFeaturesScope scope(this);
+  switch (instr->Mask(AtomicMemoryMask)) {
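+    // LDAPRB, LDAPRH and LDAPR are the load-acquire RCpc instructions
+    // introduced in Armv8.3.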
+    case LDAPRB:
+    case LDAPRH:
+    case LDAPR_w:
+    case LDAPR_x:
+      scope.Record(CPUFeatures::kRCpc);
+      return;
+    default:
+      // Everything else belongs to the Atomics extension.
+      scope.Record(CPUFeatures::kAtomics);
+      return;
+  }
+}
+
 void CPUFeaturesAuditor::VisitBitfield(const Instruction* instr) {
   RecordInstructionFeaturesScope scope(this);
   USE(instr);
@@ -184,8 +204,32 @@
 }
 
 void CPUFeaturesAuditor::VisitDataProcessing1Source(const Instruction* instr) {
-  USE(instr);
   RecordInstructionFeaturesScope scope(this);
+  switch (instr->Mask(DataProcessing1SourceMask)) {
+    case PACIA:
+    case PACIB:
+    case PACDA:
+    case PACDB:
+    case AUTIA:
+    case AUTIB:
+    case AUTDA:
+    case AUTDB:
+    case PACIZA:
+    case PACIZB:
+    case PACDZA:
+    case PACDZB:
+    case AUTIZA:
+    case AUTIZB:
+    case AUTDZA:
+    case AUTDZB:
+    case XPACI:
+    case XPACD:
+      scope.Record(CPUFeatures::kPAuth);
+      return;
+    default:
+      // No special CPU features.
+      return;
+  }
 }
 
 void CPUFeaturesAuditor::VisitDataProcessing2Source(const Instruction* instr) {
@@ -201,6 +245,9 @@
     case CRC32CX:
       scope.Record(CPUFeatures::kCRC32);
       return;
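+    // PACGA computes a PAC using the generic authentication key.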
+    case PACGA:
+      scope.Record(CPUFeatures::kPAuth, CPUFeatures::kPAuthGeneric);
+      return;
     default:
       // No special CPU features.
       return;
@@ -226,21 +273,41 @@
   RecordInstructionFeaturesScope scope(this);
   // All of these instructions require FP.
   scope.Record(CPUFeatures::kFP);
-  USE(instr);
+  switch (instr->Mask(FPCompareMask)) {
+    case FCMP_h:
+    case FCMP_h_zero:
+    case FCMPE_h:
+    case FCMPE_h_zero:
+      scope.Record(CPUFeatures::kFPHalf);
+      return;
+    default:
+      // No special CPU features.
+      return;
+  }
 }
 
 void CPUFeaturesAuditor::VisitFPConditionalCompare(const Instruction* instr) {
   RecordInstructionFeaturesScope scope(this);
   // All of these instructions require FP.
   scope.Record(CPUFeatures::kFP);
-  USE(instr);
+  switch (instr->Mask(FPConditionalCompareMask)) {
+    case FCCMP_h:
+    case FCCMPE_h:
+      scope.Record(CPUFeatures::kFPHalf);
+      return;
+    default:
+      // No special CPU features.
+      return;
+  }
 }
 
 void CPUFeaturesAuditor::VisitFPConditionalSelect(const Instruction* instr) {
   RecordInstructionFeaturesScope scope(this);
   // All of these instructions require FP.
   scope.Record(CPUFeatures::kFP);
-  USE(instr);
+  if (instr->Mask(FPConditionalSelectMask) == FCSEL_h) {
+    scope.Record(CPUFeatures::kFPHalf);
+  }
 }
 
 void CPUFeaturesAuditor::VisitFPDataProcessing1Source(
@@ -248,8 +315,25 @@
   RecordInstructionFeaturesScope scope(this);
   // All of these instructions require FP.
   scope.Record(CPUFeatures::kFP);
-  if (instr->Mask(FPDataProcessing1SourceMask) == FMOV_h) {
-    scope.Record(CPUFeatures::kFPHalf);
+  switch (instr->Mask(FPDataProcessing1SourceMask)) {
+    case FMOV_h:
+    case FABS_h:
+    case FNEG_h:
+    case FSQRT_h:
+    case FRINTN_h:
+    case FRINTP_h:
+    case FRINTM_h:
+    case FRINTZ_h:
+    case FRINTA_h:
+    case FRINTX_h:
+    case FRINTI_h:
+      scope.Record(CPUFeatures::kFPHalf);
+      return;
+    default:
+      // No special CPU features.
+      // This category includes some half-precision FCVT instructions that do
+      // not require FPHalf.
+      return;
   }
 }
 
@@ -258,7 +342,22 @@
   RecordInstructionFeaturesScope scope(this);
   // All of these instructions require FP.
   scope.Record(CPUFeatures::kFP);
-  USE(instr);
+  switch (instr->Mask(FPDataProcessing2SourceMask)) {
+    case FMUL_h:
+    case FDIV_h:
+    case FADD_h:
+    case FSUB_h:
+    case FMAX_h:
+    case FMIN_h:
+    case FMAXNM_h:
+    case FMINNM_h:
+    case FNMUL_h:
+      scope.Record(CPUFeatures::kFPHalf);
+      return;
+    default:
+      // No special CPU features.
+      return;
+  }
 }
 
 void CPUFeaturesAuditor::VisitFPDataProcessing3Source(
@@ -266,14 +365,38 @@
   RecordInstructionFeaturesScope scope(this);
   // All of these instructions require FP.
   scope.Record(CPUFeatures::kFP);
-  USE(instr);
+  switch (instr->Mask(FPDataProcessing3SourceMask)) {
+    case FMADD_h:
+    case FMSUB_h:
+    case FNMADD_h:
+    case FNMSUB_h:
+      scope.Record(CPUFeatures::kFPHalf);
+      return;
+    default:
+      // No special CPU features.
+      return;
+  }
 }
 
 void CPUFeaturesAuditor::VisitFPFixedPointConvert(const Instruction* instr) {
   RecordInstructionFeaturesScope scope(this);
   // All of these instructions require FP.
   scope.Record(CPUFeatures::kFP);
-  USE(instr);
+  switch (instr->Mask(FPFixedPointConvertMask)) {
+    case FCVTZS_wh_fixed:
+    case FCVTZS_xh_fixed:
+    case FCVTZU_wh_fixed:
+    case FCVTZU_xh_fixed:
+    case SCVTF_hw_fixed:
+    case SCVTF_hx_fixed:
+    case UCVTF_hw_fixed:
+    case UCVTF_hx_fixed:
+      scope.Record(CPUFeatures::kFPHalf);
+      return;
+    default:
+      // No special CPU features.
+      return;
+  }
 }
 
 void CPUFeaturesAuditor::VisitFPImmediate(const Instruction* instr) {
@@ -290,16 +413,43 @@
   // All of these instructions require FP.
   scope.Record(CPUFeatures::kFP);
   switch (instr->Mask(FPIntegerConvertMask)) {
+    case FCVTAS_wh:
+    case FCVTAS_xh:
+    case FCVTAU_wh:
+    case FCVTAU_xh:
+    case FCVTMS_wh:
+    case FCVTMS_xh:
+    case FCVTMU_wh:
+    case FCVTMU_xh:
+    case FCVTNS_wh:
+    case FCVTNS_xh:
+    case FCVTNU_wh:
+    case FCVTNU_xh:
+    case FCVTPS_wh:
+    case FCVTPS_xh:
+    case FCVTPU_wh:
+    case FCVTPU_xh:
+    case FCVTZS_wh:
+    case FCVTZS_xh:
+    case FCVTZU_wh:
+    case FCVTZU_xh:
     case FMOV_hw:
+    case FMOV_hx:
     case FMOV_wh:
     case FMOV_xh:
-    case FMOV_hx:
+    case SCVTF_hw:
+    case SCVTF_hx:
+    case UCVTF_hw:
+    case UCVTF_hx:
       scope.Record(CPUFeatures::kFPHalf);
       return;
     case FMOV_d1_x:
     case FMOV_x_d1:
       scope.Record(CPUFeatures::kNEON);
       return;
+    case FJCVTZS:
+      scope.Record(CPUFeatures::kJSCVT);
+      return;
     default:
       // No special CPU features.
       return;
@@ -467,6 +617,13 @@
   }
 }
 
+void CPUFeaturesAuditor::VisitNEON2RegMiscFP16(const Instruction* instr) {
+  RecordInstructionFeaturesScope scope(this);
+  // All of these instructions require NEONHalf.
+  scope.Record(CPUFeatures::kFP, CPUFeatures::kNEON, CPUFeatures::kNEONHalf);
+  USE(instr);
+}
+
 void CPUFeaturesAuditor::VisitNEON3Different(const Instruction* instr) {
   RecordInstructionFeaturesScope scope(this);
   // All of these instructions require NEON.
@@ -490,6 +647,7 @@
   if ((instr->Mask(NEON3SameExtraFCMLAMask) == NEON_FCMLA) ||
       (instr->Mask(NEON3SameExtraFCADDMask) == NEON_FCADD)) {
     scope.Record(CPUFeatures::kFP, CPUFeatures::kFcma);
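+    // A size field of 1 selects the half-precision forms of FCMLA and FCADD.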
+    if (instr->GetNEONSize() == 1) scope.Record(CPUFeatures::kNEONHalf);
   } else {
     switch (instr->Mask(NEON3SameExtraMask)) {
       case NEON_SDOT:
@@ -507,11 +665,22 @@
   }
 }
 
+void CPUFeaturesAuditor::VisitNEON3SameFP16(const Instruction* instr) {
+  RecordInstructionFeaturesScope scope(this);
+  // All of these instructions require NEON FP16 support.
+  scope.Record(CPUFeatures::kFP, CPUFeatures::kNEON, CPUFeatures::kNEONHalf);
+  USE(instr);
+}
+
 void CPUFeaturesAuditor::VisitNEONAcrossLanes(const Instruction* instr) {
   RecordInstructionFeaturesScope scope(this);
   // All of these instructions require NEON.
   scope.Record(CPUFeatures::kNEON);
-  if (instr->Mask(NEONAcrossLanesFPFMask) == NEONAcrossLanesFPFixed) {
+  if (instr->Mask(NEONAcrossLanesFP16FMask) == NEONAcrossLanesFP16Fixed) {
+    // FMAXV_H, FMINV_H, FMAXNMV_H, FMINNMV_H
+    scope.Record(CPUFeatures::kFP, CPUFeatures::kNEONHalf);
+  } else if (instr->Mask(NEONAcrossLanesFPFMask) == NEONAcrossLanesFPFixed) {
+    // FMAXV, FMINV, FMAXNMV, FMINNMV
     scope.Record(CPUFeatures::kFP);
   }
 }
@@ -534,6 +703,12 @@
       break;
   }
   switch (instr->Mask(NEONByIndexedElementFPMask)) {
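+    // The FP16 by-element forms fall through to the generic FP cases so that
+    // both kNEONHalf and kFP get recorded.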
+    case NEON_FMLA_H_byelement:
+    case NEON_FMLS_H_byelement:
+    case NEON_FMUL_H_byelement:
+    case NEON_FMULX_H_byelement:
+      scope.Record(CPUFeatures::kNEONHalf);
+      VIXL_FALLTHROUGH();
     case NEON_FMLA_byelement:
     case NEON_FMLS_byelement:
     case NEON_FMUL_byelement:
@@ -544,6 +719,7 @@
       switch (instr->Mask(NEONByIndexedElementFPComplexMask)) {
         case NEON_FCMLA_byelement:
           scope.Record(CPUFeatures::kFP, CPUFeatures::kFcma);
+          if (instr->GetNEONSize() == 1) scope.Record(CPUFeatures::kNEONHalf);
           return;
       }
       // No additional features.
@@ -650,6 +826,13 @@
   }
 }
 
+void CPUFeaturesAuditor::VisitNEONScalar2RegMiscFP16(const Instruction* instr) {
+  RecordInstructionFeaturesScope scope(this);
+  // All of these instructions require NEONHalf.
+  scope.Record(CPUFeatures::kFP, CPUFeatures::kNEON, CPUFeatures::kNEONHalf);
+  USE(instr);
+}
+
 void CPUFeaturesAuditor::VisitNEONScalar3Diff(const Instruction* instr) {
   RecordInstructionFeaturesScope scope(this);
   // All of these instructions require NEON.
@@ -673,26 +856,39 @@
   USE(instr);
 }
 
+void CPUFeaturesAuditor::VisitNEONScalar3SameFP16(const Instruction* instr) {
+  RecordInstructionFeaturesScope scope(this);
+  // All of these instructions require NEONHalf.
+  scope.Record(CPUFeatures::kFP, CPUFeatures::kNEON, CPUFeatures::kNEONHalf);
+  USE(instr);
+}
+
 void CPUFeaturesAuditor::VisitNEONScalarByIndexedElement(
     const Instruction* instr) {
   RecordInstructionFeaturesScope scope(this);
   // All of these instructions require NEON.
   scope.Record(CPUFeatures::kNEON);
   switch (instr->Mask(NEONScalarByIndexedElementMask)) {
-    case NEON_SQDMULL_byelement_scalar:
-    case NEON_SQDMLAL_byelement_scalar:
-    case NEON_SQDMLSL_byelement_scalar:
-    case NEON_SQDMULH_byelement_scalar:
-    case NEON_SQRDMULH_byelement_scalar:
-      // No additional features.
-      return;
     case NEON_SQRDMLAH_byelement_scalar:
     case NEON_SQRDMLSH_byelement_scalar:
       scope.Record(CPUFeatures::kRDM);
       return;
     default:
-      // FMUL, FMLA, FMLS, FMULX
-      scope.Record(CPUFeatures::kFP);
+      switch (instr->Mask(NEONScalarByIndexedElementFPMask)) {
+        case NEON_FMLA_H_byelement_scalar:
+        case NEON_FMLS_H_byelement_scalar:
+        case NEON_FMUL_H_byelement_scalar:
+        case NEON_FMULX_H_byelement_scalar:
+          scope.Record(CPUFeatures::kNEONHalf);
+          VIXL_FALLTHROUGH();
+        case NEON_FMLA_byelement_scalar:
+        case NEON_FMLS_byelement_scalar:
+        case NEON_FMUL_byelement_scalar:
+        case NEON_FMULX_byelement_scalar:
+          scope.Record(CPUFeatures::kFP);
+          return;
+      }
+      // No additional features.
       return;
   }
 }
@@ -709,6 +905,13 @@
   // All of these instructions require NEON.
   scope.Record(CPUFeatures::kNEON);
   switch (instr->Mask(NEONScalarPairwiseMask)) {
+    case NEON_FMAXNMP_h_scalar:
+    case NEON_FADDP_h_scalar:
+    case NEON_FMAXP_h_scalar:
+    case NEON_FMINNMP_h_scalar:
+    case NEON_FMINP_h_scalar:
+      scope.Record(CPUFeatures::kNEONHalf);
+      VIXL_FALLTHROUGH();
     case NEON_FADDP_scalar:
     case NEON_FMAXP_scalar:
     case NEON_FMAXNMP_scalar:
@@ -778,8 +981,35 @@
 }
 
 void CPUFeaturesAuditor::VisitSystem(const Instruction* instr) {
-  USE(instr);
   RecordInstructionFeaturesScope scope(this);
+  if (instr->Mask(SystemHintFMask) == SystemHintFixed) {
+    CPUFeatures required;
+    switch (instr->GetInstructionBits()) {
+      case PACIA1716:
+      case PACIB1716:
+      case AUTIA1716:
+      case AUTIB1716:
+      case PACIAZ:
+      case PACIASP:
+      case PACIBZ:
+      case PACIBSP:
+      case AUTIAZ:
+      case AUTIASP:
+      case AUTIBZ:
+      case AUTIBSP:
+      case XPACLRI:
+        required.Combine(CPUFeatures::kPAuth);
+        break;
+      default:
+        if (instr->GetImmHint() == ESB) required.Combine(CPUFeatures::kRAS);
+        break;
+    }
+
+    // These are all HINT instructions, and they behave as NOPs if the
+    // corresponding features are not implemented, so only record the
+    // features if they are actually available.
+    if (available_.Has(required)) scope.Record(required);
+  }
 }
 
 void CPUFeaturesAuditor::VisitTestBranch(const Instruction* instr) {
@@ -799,8 +1029,24 @@
 
 void CPUFeaturesAuditor::VisitUnconditionalBranchToRegister(
     const Instruction* instr) {
-  USE(instr);
   RecordInstructionFeaturesScope scope(this);
+  switch (instr->Mask(UnconditionalBranchToRegisterMask)) {
+    case BRAAZ:
+    case BRABZ:
+    case BLRAAZ:
+    case BLRABZ:
+    case RETAA:
+    case RETAB:
+    case BRAA:
+    case BRAB:
+    case BLRAA:
+    case BLRAB:
+      scope.Record(CPUFeatures::kPAuth);
+      return;
+    default:
+      // No additional features.
+      return;
+  }
 }
 
 void CPUFeaturesAuditor::VisitUnimplemented(const Instruction* instr) {
diff --git a/src/aarch64/decoder-aarch64.cc b/src/aarch64/decoder-aarch64.cc
index 9db8b51..4cac45c 100644
--- a/src/aarch64/decoder-aarch64.cc
+++ b/src/aarch64/decoder-aarch64.cc
@@ -257,10 +257,11 @@
           }
         }
       } else {
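+        // Allow the Armv8.3 pointer-authenticated branch and return
+        // encodings (BRAA, BLRAAZ, RETAA, etc.) through to
+        // VisitUnconditionalBranchToRegister.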
-        if ((instr->ExtractBit(24) == 0x1) ||
+        if (((instr->ExtractBit(24) == 0x1) &&
+             (instr->ExtractBits(23, 21) > 0x1)) ||
             (instr->ExtractBits(20, 16) != 0x1F) ||
-            (instr->ExtractBits(15, 10) != 0) ||
-            (instr->ExtractBits(4, 0) != 0) ||
+            (instr->ExtractBits(15, 10) == 0x1) ||
+            (instr->ExtractBits(15, 10) > 0x3) ||
             (instr->ExtractBits(24, 21) == 0x3) ||
             (instr->ExtractBits(24, 22) == 0x3)) {
           VisitUnallocated(instr);
@@ -323,8 +324,7 @@
           VisitLoadLiteral(instr);
         }
       } else {
-        if ((instr->Mask(0x84C00000) == 0x80C00000) ||
-            (instr->Mask(0x44800000) == 0x44800000) ||
+        if ((instr->Mask(0x44800000) == 0x44800000) ||
             (instr->Mask(0x84800000) == 0x84800000)) {
           VisitUnallocated(instr);
         } else {
@@ -364,7 +364,29 @@
                 VisitLoadStoreRegisterOffset(instr);
               }
             } else {
-              VisitUnallocated(instr);
+              if (instr->ExtractBits(11, 10) == 0x0) {
+                if (instr->ExtractBit(25) == 0) {
+                  if (instr->ExtractBit(26) == 0) {
+                    if ((instr->ExtractBit(15) == 1) &&
+                        ((instr->ExtractBits(14, 12) == 0x1) ||
+                         (instr->ExtractBit(13) == 1) ||
+                         (instr->ExtractBits(14, 12) == 0x5) ||
+                         ((instr->ExtractBits(14, 12) == 0x4) &&
+                          ((instr->ExtractBit(23) == 0) ||
+                           (instr->ExtractBits(23, 22) == 0x3))))) {
+                      VisitUnallocated(instr);
+                    } else {
+                      VisitAtomicMemory(instr);
+                    }
+                  } else {
+                    VisitUnallocated(instr);
+                  }
+                } else {
+                  VisitUnallocated(instr);
+                }
+              } else {
+                VisitUnallocated(instr);
+              }
             }
           }
         }
@@ -507,7 +529,8 @@
               if ((instr->ExtractBit(15) == 0x1) ||
                   (instr->ExtractBits(15, 11) == 0) ||
                   (instr->ExtractBits(15, 12) == 0x1) ||
-                  (instr->ExtractBits(15, 12) == 0x3) ||
+                  ((instr->ExtractBits(15, 12) == 0x3) &&
+                   (instr->ExtractBit(31) == 0)) ||
                   (instr->ExtractBits(15, 13) == 0x3) ||
                   (instr->Mask(0x8000EC00) == 0x00004C00) ||
                   (instr->Mask(0x8000E800) == 0x80004000) ||
@@ -517,11 +540,15 @@
                 VisitDataProcessing2Source(instr);
               }
             } else {
-              if ((instr->ExtractBit(13) == 1) ||
-                  (instr->ExtractBits(20, 16) != 0) ||
-                  (instr->ExtractBits(15, 14) != 0) ||
-                  (instr->Mask(0xA01FFC00) == 0x00000C00) ||
-                  (instr->Mask(0x201FF800) == 0x00001800)) {
+              if ((instr->ExtractBits(20, 17) != 0) ||
+                  (instr->ExtractBit(15) == 1) ||
+                  ((instr->ExtractBit(16) == 1) &&
+                   ((instr->ExtractBits(14, 10) > 17) ||
+                    (instr->ExtractBit(31) == 0))) ||
+                  ((instr->ExtractBit(16) == 0) &&
+                   ((instr->ExtractBits(14, 13) != 0) ||
+                    (instr->Mask(0xA01FFC00) == 0x00000C00) ||
+                    (instr->Mask(0x201FF800) == 0x00001800)))) {
                 VisitUnallocated(instr);
               } else {
                 VisitDataProcessing1Source(instr);
@@ -588,7 +615,8 @@
       if (instr->ExtractBit(29) == 0) {
         if (instr->ExtractBit(24) == 0) {
           if (instr->ExtractBit(21) == 0) {
-            if ((instr->ExtractBit(23) == 1) || (instr->ExtractBit(18) == 1) ||
+            if ((instr->ExtractBits(23, 22) == 0x2) ||
+                (instr->ExtractBit(18) == 1) ||
                 (instr->Mask(0x80008000) == 0x00000000) ||
                 (instr->Mask(0x000E0000) == 0x00000000) ||
                 (instr->Mask(0x000E0000) == 0x000A0000) ||
@@ -610,7 +638,6 @@
                   (instr->Mask(0x20C60000) == 0x00840000) ||
                   (instr->Mask(0xA0C60000) == 0x80060000) ||
                   (instr->Mask(0xA0C60000) == 0x00860000) ||
-                  (instr->Mask(0xA0C60000) == 0x00460000) ||
                   (instr->Mask(0xA0CE0000) == 0x80860000) ||
                   (instr->Mask(0xA0CE0000) == 0x804E0000) ||
                   (instr->Mask(0xA0CE0000) == 0x000E0000) ||
@@ -632,7 +659,7 @@
                   (masked_A0DF8000 == 0x00468000) ||
                   (instr->Mask(0xA0D80000) == 0x00800000) ||
                   (instr->Mask(0xA0DF0000) == 0x00C30000) ||
-                  (instr->Mask(0xA0DC0000) == 0x00C40000)) {
+                  (instr->Mask(0xA0DF8000) == 0x00C68000)) {
                 VisitUnallocated(instr);
               } else {
                 VisitFPDataProcessing1Source(instr);
@@ -640,7 +667,8 @@
             } else if (instr->ExtractBits(13, 10) == 8) {
               if ((instr->ExtractBits(15, 14) != 0) ||
                   (instr->ExtractBits(2, 0) != 0) ||
-                  (instr->Mask(0x80800000) != 0x00000000)) {
+                  (instr->ExtractBit(31) == 1) ||
+                  (instr->ExtractBits(23, 22) == 0x2)) {
                 VisitUnallocated(instr);
               } else {
                 VisitFPCompare(instr);
@@ -655,7 +683,8 @@
                 VisitFPImmediate(instr);
               }
             } else {
-              if (instr->Mask(0x80800000) != 0x00000000) {
+              if ((instr->ExtractBits(23, 22) == 0x2) ||
+                  (instr->ExtractBit(31) == 1)) {
                 VisitUnallocated(instr);
               } else {
                 switch (instr->ExtractBits(11, 10)) {
@@ -664,9 +693,7 @@
                     break;
                   }
                   case 2: {
-                    if ((instr->ExtractBits(15, 14) == 0x3) ||
-                        (instr->Mask(0x00009000) == 0x00009000) ||
-                        (instr->Mask(0x0000A000) == 0x0000A000)) {
+                    if (instr->ExtractBits(15, 12) > 0x8) {
                       VisitUnallocated(instr);
                     } else {
                       VisitFPDataProcessing2Source(instr);
@@ -686,7 +713,8 @@
         } else {
           // Bit 30 == 1 has been handled earlier.
           VIXL_ASSERT(instr->ExtractBit(30) == 0);
-          if (instr->Mask(0xA0800000) != 0) {
+          if ((instr->Mask(0xA0000000) != 0) ||
+              (instr->ExtractBits(23, 22) == 0x2)) {
             VisitUnallocated(instr);
           } else {
             VisitFPDataProcessing3Source(instr);
@@ -750,6 +778,27 @@
           } else {
             if (instr->ExtractBits(23, 22) == 0) {
               VisitNEONCopy(instr);
+            } else if (instr->ExtractBit(14) == 0x0 &&
+                       instr->ExtractBit(22) == 0x1) {
+              // Decode on {U (bit 29), a (bit 23), opcode (bits 13:11)}.
+              uint8_t decode_field =
+                  (instr->ExtractBit(29) << 1) | instr->ExtractBit(23);
+              decode_field = (decode_field << 3) | instr->ExtractBits(13, 11);
+              switch (decode_field) {
+                case 0x5:
+                case 0xB:
+                case 0xC:
+                case 0xD:
+                case 0x11:
+                case 0x19:
+                case 0x1B:
+                case 0x1F:
+                  VisitUnallocated(instr);
+                  break;
+                default:
+                  VisitNEON3SameFP16(instr);
+                  break;
+              }
             } else {
               VisitUnallocated(instr);
             }
@@ -799,7 +848,23 @@
                 if (instr->ExtractBit(19) == 0) {
                   VisitNEONAcrossLanes(instr);
                 } else {
-                  VisitUnallocated(instr);
+                  if (instr->ExtractBit(22) == 0) {
+                    VisitUnallocated(instr);
+                  } else {
+                    if ((instr->ExtractBits(16, 15) == 0x0) ||
+                        (instr->ExtractBits(16, 14) == 0x2) ||
+                        (instr->ExtractBits(16, 15) == 0x2) ||
+                        (instr->ExtractBits(16, 12) == 0x1e) ||
+                        ((instr->ExtractBit(23) == 0) &&
+                         ((instr->ExtractBits(16, 14) == 0x3) ||
+                          (instr->ExtractBits(16, 12) == 0x1f))) ||
+                        ((instr->ExtractBit(23) == 1) &&
+                         (instr->ExtractBits(16, 12) == 0x1c))) {
+                      VisitUnallocated(instr);
+                    } else {
+                      VisitNEON2RegMiscFP16(instr);
+                    }
+                  }
                 }
               }
             } else {
@@ -850,7 +915,26 @@
           if (instr->ExtractBits(23, 22) == 0) {
             VisitNEONScalarCopy(instr);
           } else {
-            VisitUnallocated(instr);
+            if (instr->Mask(0x00404000) == 0x00400000) {
+              if ((instr->ExtractBits(13, 11) == 0x6) ||
+                  (instr->ExtractBits(13, 11) < 2) ||
+                  ((instr->Mask(0x20800000) == 0x00000000) &&
+                   ((instr->ExtractBits(13, 11) < 0x3) ||
+                    (instr->ExtractBits(13, 11) == 0x5))) ||
+                  ((instr->Mask(0x20800000) == 0x00800000) &&
+                   (instr->ExtractBits(13, 11) < 0x7)) ||
+                  ((instr->Mask(0x20800000) == 0x20000000) &&
+                   ((instr->ExtractBits(13, 11) < 0x4) ||
+                    (instr->ExtractBits(13, 11) == 0x7))) ||
+                  ((instr->Mask(0x20800000) == 0x20800000) &&
+                   (instr->ExtractBits(12, 11) == 0x3))) {
+                VisitUnallocated(instr);
+              } else {
+                VisitNEONScalar3SameFP16(instr);
+              }
+            } else {
+              VisitUnallocated(instr);
+            }
           }
         }
       } else {
@@ -884,7 +968,28 @@
               if (instr->ExtractBit(19) == 0) {
                 VisitNEONScalarPairwise(instr);
               } else {
-                VisitUnallocated(instr);
+                if (instr->ExtractBit(22) == 0) {
+                  VisitUnallocated(instr);
+                } else {
+                  if ((instr->ExtractBits(16, 15) == 0x0) ||
+                      (instr->ExtractBits(16, 14) == 0x2) ||
+                      (instr->ExtractBits(16, 15) == 0x2) ||
+                      (instr->ExtractBits(16, 13) == 0xc) ||
+                      (instr->ExtractBits(16, 12) == 0x1e) ||
+                      ((instr->ExtractBit(23) == 0) &&
+                       ((instr->ExtractBits(16, 14) == 0x3) ||
+                        (instr->ExtractBits(16, 12) == 0x1f))) ||
+                      ((instr->ExtractBit(23) == 1) &&
+                       ((instr->ExtractBits(16, 12) == 0xf) ||
+                        (instr->ExtractBits(16, 12) == 0x1c) ||
+                        ((instr->ExtractBit(29) == 1) &&
+                         ((instr->ExtractBits(16, 12) == 0xe) ||
+                          (instr->ExtractBits(16, 12) == 0x1f)))))) {
+                    VisitUnallocated(instr);
+                  } else {
+                    VisitNEONScalar2RegMiscFP16(instr);
+                  }
+                }
               }
             }
           } else {
diff --git a/src/aarch64/decoder-aarch64.h b/src/aarch64/decoder-aarch64.h
index 4ebc7ce..100fbb3 100644
--- a/src/aarch64/decoder-aarch64.h
+++ b/src/aarch64/decoder-aarch64.h
@@ -37,56 +37,54 @@
 // List macro containing all visitors needed by the decoder class.
 
 #define VISITOR_LIST_THAT_RETURN(V)     \
-  V(PCRelAddressing)                    \
-  V(AddSubImmediate)                    \
-  V(LogicalImmediate)                   \
-  V(MoveWideImmediate)                  \
-  V(Bitfield)                           \
-  V(Extract)                            \
-  V(UnconditionalBranch)                \
-  V(UnconditionalBranchToRegister)      \
-  V(CompareBranch)                      \
-  V(TestBranch)                         \
-  V(ConditionalBranch)                  \
-  V(System)                             \
-  V(Exception)                          \
-  V(LoadStorePairPostIndex)             \
-  V(LoadStorePairOffset)                \
-  V(LoadStorePairPreIndex)              \
-  V(LoadStorePairNonTemporal)           \
-  V(LoadLiteral)                        \
-  V(LoadStoreUnscaledOffset)            \
-  V(LoadStorePostIndex)                 \
-  V(LoadStorePreIndex)                  \
-  V(LoadStoreRegisterOffset)            \
-  V(LoadStoreUnsignedOffset)            \
-  V(LoadStoreExclusive)                 \
-  V(LogicalShifted)                     \
-  V(AddSubShifted)                      \
   V(AddSubExtended)                     \
+  V(AddSubImmediate)                    \
+  V(AddSubShifted)                      \
   V(AddSubWithCarry)                    \
-  V(ConditionalCompareRegister)         \
+  V(AtomicMemory)                       \
+  V(Bitfield)                           \
+  V(CompareBranch)                      \
+  V(ConditionalBranch)                  \
   V(ConditionalCompareImmediate)        \
+  V(ConditionalCompareRegister)         \
   V(ConditionalSelect)                  \
-  V(DataProcessing1Source)              \
-  V(DataProcessing2Source)              \
-  V(DataProcessing3Source)              \
-  V(FPCompare)                          \
-  V(FPConditionalCompare)               \
-  V(FPConditionalSelect)                \
-  V(FPImmediate)                        \
-  V(FPDataProcessing1Source)            \
-  V(FPDataProcessing2Source)            \
-  V(FPDataProcessing3Source)            \
-  V(FPIntegerConvert)                   \
-  V(FPFixedPointConvert)                \
   V(Crypto2RegSHA)                      \
   V(Crypto3RegSHA)                      \
   V(CryptoAES)                          \
+  V(DataProcessing1Source)              \
+  V(DataProcessing2Source)              \
+  V(DataProcessing3Source)              \
+  V(Exception)                          \
+  V(Extract)                            \
+  V(FPCompare)                          \
+  V(FPConditionalCompare)               \
+  V(FPConditionalSelect)                \
+  V(FPDataProcessing1Source)            \
+  V(FPDataProcessing2Source)            \
+  V(FPDataProcessing3Source)            \
+  V(FPFixedPointConvert)                \
+  V(FPImmediate)                        \
+  V(FPIntegerConvert)                   \
+  V(LoadLiteral)                        \
+  V(LoadStoreExclusive)                 \
+  V(LoadStorePairNonTemporal)           \
+  V(LoadStorePairOffset)                \
+  V(LoadStorePairPostIndex)             \
+  V(LoadStorePairPreIndex)              \
+  V(LoadStorePostIndex)                 \
+  V(LoadStorePreIndex)                  \
+  V(LoadStoreRegisterOffset)            \
+  V(LoadStoreUnscaledOffset)            \
+  V(LoadStoreUnsignedOffset)            \
+  V(LogicalImmediate)                   \
+  V(LogicalShifted)                     \
+  V(MoveWideImmediate)                  \
   V(NEON2RegMisc)                       \
+  V(NEON2RegMiscFP16)                   \
   V(NEON3Different)                     \
   V(NEON3Same)                          \
   V(NEON3SameExtra)                     \
+  V(NEON3SameFP16)                      \
   V(NEONAcrossLanes)                    \
   V(NEONByIndexedElement)               \
   V(NEONCopy)                           \
@@ -96,17 +94,24 @@
   V(NEONLoadStoreSingleStruct)          \
   V(NEONLoadStoreSingleStructPostIndex) \
   V(NEONModifiedImmediate)              \
+  V(NEONPerm)                           \
   V(NEONScalar2RegMisc)                 \
+  V(NEONScalar2RegMiscFP16)             \
   V(NEONScalar3Diff)                    \
   V(NEONScalar3Same)                    \
   V(NEONScalar3SameExtra)               \
+  V(NEONScalar3SameFP16)                \
   V(NEONScalarByIndexedElement)         \
   V(NEONScalarCopy)                     \
   V(NEONScalarPairwise)                 \
   V(NEONScalarShiftImmediate)           \
   V(NEONShiftImmediate)                 \
   V(NEONTable)                          \
-  V(NEONPerm)
+  V(PCRelAddressing)                    \
+  V(System)                             \
+  V(TestBranch)                         \
+  V(UnconditionalBranch)                \
+  V(UnconditionalBranchToRegister)
 
 #define VISITOR_LIST_THAT_DONT_RETURN(V) \
   V(Unallocated)                         \
diff --git a/src/aarch64/disasm-aarch64.cc b/src/aarch64/disasm-aarch64.cc
index 0b76ae6..1c00443 100644
--- a/src/aarch64/disasm-aarch64.cc
+++ b/src/aarch64/disasm-aarch64.cc
@@ -28,7 +28,6 @@
 #include <sstream>
 
 #include "disasm-aarch64.h"
-#include "utils-aarch64.h"
 
 namespace vixl {
 namespace aarch64 {
@@ -489,6 +488,7 @@
   const char *form_shift_right = "'Rd, 'Rn, 'IBr";
   const char *form_extend = "'Rd, 'Wn";
   const char *form_bfiz = "'Rd, 'Rn, 'IBZ-r, 'IBs+1";
+  const char *form_bfc = "'Rd, 'IBZ-r, 'IBs+1";
   const char *form_bfx = "'Rd, 'Rn, 'IBr, 'IBs-r+1";
   const char *form_lsl = "'Rd, 'Rn, 'IBZ-r";
 
@@ -548,8 +548,13 @@
       mnemonic = "bfxil";
       form = form_bfx;
       if (s < r) {
-        mnemonic = "bfi";
-        form = form_bfiz;
+        if (instr->GetRn() == kZeroRegCode) {
+          mnemonic = "bfc";
+          form = form_bfc;
+        } else {
+          mnemonic = "bfi";
+          form = form_bfiz;
+        }
       }
     }
   }
@@ -607,22 +612,66 @@
 void Disassembler::VisitUnconditionalBranchToRegister(
     const Instruction *instr) {
   const char *mnemonic = "unimplemented";
-  const char *form = "'Xn";
+  const char *form;
 
   switch (instr->Mask(UnconditionalBranchToRegisterMask)) {
     case BR:
       mnemonic = "br";
+      form = "'Xn";
       break;
     case BLR:
       mnemonic = "blr";
+      form = "'Xn";
       break;
     case RET: {
       mnemonic = "ret";
       if (instr->GetRn() == kLinkRegCode) {
         form = NULL;
+      } else {
+        form = "'Xn";
       }
       break;
     }
+    case BRAAZ:
+      mnemonic = "braaz";
+      form = "'Xn";
+      break;
+    case BRABZ:
+      mnemonic = "brabz";
+      form = "'Xn";
+      break;
+    case BLRAAZ:
+      mnemonic = "blraaz";
+      form = "'Xn";
+      break;
+    case BLRABZ:
+      mnemonic = "blrabz";
+      form = "'Xn";
+      break;
+    case RETAA:
+      mnemonic = "retaa";
+      form = NULL;
+      break;
+    case RETAB:
+      mnemonic = "retab";
+      form = NULL;
+      break;
+    case BRAA:
+      mnemonic = "braa";
+      form = "'Xn, 'Xds";
+      break;
+    case BRAB:
+      mnemonic = "brab";
+      form = "'Xn, 'Xds";
+      break;
+    case BLRAA:
+      mnemonic = "blraa";
+      form = "'Xn, 'Xds";
+      break;
+    case BLRAB:
+      mnemonic = "blrab";
+      form = "'Xn, 'Xds";
+      break;
     default:
       form = "(UnconditionalBranchToRegister)";
   }
@@ -664,6 +713,41 @@
     FORMAT(CLZ, "clz");
     FORMAT(CLS, "cls");
 #undef FORMAT
+
+#define PAUTH_VARIATIONS(V) \
+  V(PACI, "paci")           \
+  V(PACD, "pacd")           \
+  V(AUTI, "auti")           \
+  V(AUTD, "autd")
+#define PAUTH_CASE(NAME, MN) \
+  case NAME##A:              \
+    mnemonic = MN "a";       \
+    form = "'Xd, 'Xns";      \
+    break;                   \
+  case NAME##ZA:             \
+    mnemonic = MN "za";      \
+    form = "'Xd";            \
+    break;                   \
+  case NAME##B:              \
+    mnemonic = MN "b";       \
+    form = "'Xd, 'Xns";      \
+    break;                   \
+  case NAME##ZB:             \
+    mnemonic = MN "zb";      \
+    form = "'Xd";            \
+    break;
+
+    PAUTH_VARIATIONS(PAUTH_CASE)
+#undef PAUTH_CASE
+
+    case XPACI:
+      mnemonic = "xpaci";
+      form = "'Xd";
+      break;
+    case XPACD:
+      mnemonic = "xpacd";
+      form = "'Xd";
+      break;
     case REV32_x:
       mnemonic = "rev32";
       break;
@@ -692,6 +776,10 @@
     FORMAT(ASRV, "asr");
     FORMAT(RORV, "ror");
 #undef FORMAT
+    case PACGA:
+      mnemonic = "pacga";
+      form = "'Xd, 'Xn, 'Xms";
+      break;
     case CRC32B:
       mnemonic = "crc32b";
       break;
@@ -1347,6 +1435,152 @@
   Format(instr, mnemonic, form);
 }
 
+#define ATOMIC_MEMORY_SIMPLE_LIST(V) \
+  V(LDADD, "add")                    \
+  V(LDCLR, "clr")                    \
+  V(LDEOR, "eor")                    \
+  V(LDSET, "set")                    \
+  V(LDSMAX, "smax")                  \
+  V(LDSMIN, "smin")                  \
+  V(LDUMAX, "umax")                  \
+  V(LDUMIN, "umin")
+
+void Disassembler::VisitAtomicMemory(const Instruction *instr) {
+  const int kMaxAtomicOpMnemonicLength = 16;
+  const char *mnemonic;
+  const char *form = "'Ws, 'Wt, ['Xns]";
+
+  switch (instr->Mask(AtomicMemoryMask)) {
+#define AMS(A, MN)             \
+  case A##B:                   \
+    mnemonic = MN "b";         \
+    break;                     \
+  case A##AB:                  \
+    mnemonic = MN "ab";        \
+    break;                     \
+  case A##LB:                  \
+    mnemonic = MN "lb";        \
+    break;                     \
+  case A##ALB:                 \
+    mnemonic = MN "alb";       \
+    break;                     \
+  case A##H:                   \
+    mnemonic = MN "h";         \
+    break;                     \
+  case A##AH:                  \
+    mnemonic = MN "ah";        \
+    break;                     \
+  case A##LH:                  \
+    mnemonic = MN "lh";        \
+    break;                     \
+  case A##ALH:                 \
+    mnemonic = MN "alh";       \
+    break;                     \
+  case A##_w:                  \
+    mnemonic = MN;             \
+    break;                     \
+  case A##A_w:                 \
+    mnemonic = MN "a";         \
+    break;                     \
+  case A##L_w:                 \
+    mnemonic = MN "l";         \
+    break;                     \
+  case A##AL_w:                \
+    mnemonic = MN "al";        \
+    break;                     \
+  case A##_x:                  \
+    mnemonic = MN;             \
+    form = "'Xs, 'Xt, ['Xns]"; \
+    break;                     \
+  case A##A_x:                 \
+    mnemonic = MN "a";         \
+    form = "'Xs, 'Xt, ['Xns]"; \
+    break;                     \
+  case A##L_x:                 \
+    mnemonic = MN "l";         \
+    form = "'Xs, 'Xt, ['Xns]"; \
+    break;                     \
+  case A##AL_x:                \
+    mnemonic = MN "al";        \
+    form = "'Xs, 'Xt, ['Xns]"; \
+    break;
+    ATOMIC_MEMORY_SIMPLE_LIST(AMS)
+
+    // SWP uses the same forms as LDADD etc., but has no store alias.
+    AMS(SWP, "swp")
+#undef AMS
+
+    case LDAPRB:
+      mnemonic = "ldaprb";
+      form = "'Wt, ['Xns]";
+      break;
+    case LDAPRH:
+      mnemonic = "ldaprh";
+      form = "'Wt, ['Xns]";
+      break;
+    case LDAPR_w:
+      mnemonic = "ldapr";
+      form = "'Wt, ['Xns]";
+      break;
+    case LDAPR_x:
+      mnemonic = "ldapr";
+      form = "'Xt, ['Xns]";
+      break;
+    default:
+      mnemonic = "unimplemented";
+      form = "(AtomicMemory)";
+  }
+
+  const char *prefix = "";
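+  // LD<op> forms with Rt == ZR and no acquire semantics are printed using
+  // their ST<op> store aliases.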
+  switch (instr->Mask(AtomicMemoryMask)) {
+#define AMS(A, MN)                   \
+  case A##AB:                        \
+  case A##ALB:                       \
+  case A##AH:                        \
+  case A##ALH:                       \
+  case A##A_w:                       \
+  case A##AL_w:                      \
+  case A##A_x:                       \
+  case A##AL_x:                      \
+    prefix = "ld";                   \
+    break;                           \
+  case A##B:                         \
+  case A##LB:                        \
+  case A##H:                         \
+  case A##LH:                        \
+  case A##_w:                        \
+  case A##L_w: {                     \
+    prefix = "ld";                   \
+    unsigned rt = instr->GetRt();    \
+    if (Register(rt, 32).IsZero()) { \
+      prefix = "st";                 \
+      form = "'Ws, ['Xns]";          \
+    }                                \
+    break;                           \
+  }                                  \
+  case A##_x:                        \
+  case A##L_x: {                     \
+    prefix = "ld";                   \
+    unsigned rt = instr->GetRt();    \
+    if (Register(rt, 64).IsZero()) { \
+      prefix = "st";                 \
+      form = "'Xs, ['Xns]";          \
+    }                                \
+    break;                           \
+  }
+    ATOMIC_MEMORY_SIMPLE_LIST(AMS)
+#undef AMS
+  }
+
+  char buffer[kMaxAtomicOpMnemonicLength];
+  if (strlen(prefix) > 0) {
+    snprintf(buffer, kMaxAtomicOpMnemonicLength, "%s%s", prefix, mnemonic);
+    mnemonic = buffer;
+  }
+
+  Format(instr, mnemonic, form);
+}
+
 
 void Disassembler::VisitFPCompare(const Instruction *instr) {
   const char *mnemonic = "unimplemented";
@@ -1354,18 +1588,22 @@
   const char *form_zero = "'Fn, #0.0";
 
   switch (instr->Mask(FPCompareMask)) {
+    case FCMP_h_zero:
     case FCMP_s_zero:
     case FCMP_d_zero:
       form = form_zero;
       VIXL_FALLTHROUGH();
+    case FCMP_h:
     case FCMP_s:
     case FCMP_d:
       mnemonic = "fcmp";
       break;
+    case FCMPE_h_zero:
     case FCMPE_s_zero:
     case FCMPE_d_zero:
       form = form_zero;
       VIXL_FALLTHROUGH();
+    case FCMPE_h:
     case FCMPE_s:
     case FCMPE_d:
       mnemonic = "fcmpe";
@@ -1382,10 +1620,12 @@
   const char *form = "'Fn, 'Fm, 'INzcv, 'Cond";
 
   switch (instr->Mask(FPConditionalCompareMask)) {
+    case FCCMP_h:
     case FCCMP_s:
     case FCCMP_d:
       mnemonic = "fccmp";
       break;
+    case FCCMPE_h:
     case FCCMPE_s:
     case FCCMPE_d:
       mnemonic = "fccmpe";
@@ -1402,6 +1642,7 @@
   const char *form = "'Fd, 'Fn, 'Fm, 'Cond";
 
   switch (instr->Mask(FPConditionalSelectMask)) {
+    case FCSEL_h:
     case FCSEL_s:
     case FCSEL_d:
       mnemonic = "fcsel";
@@ -1418,7 +1659,6 @@
   const char *form = "'Fd, 'Fn";
 
   switch (instr->Mask(FPDataProcessing1SourceMask)) {
-// Duplicated until half precision support for all fp instructions.
 #define FORMAT(A, B) \
   case A##_h:        \
   case A##_s:        \
@@ -1426,12 +1666,6 @@
     mnemonic = B;    \
     break;
     FORMAT(FMOV, "fmov");
-#undef FORMAT
-#define FORMAT(A, B) \
-  case A##_s:        \
-  case A##_d:        \
-    mnemonic = B;    \
-    break;
     FORMAT(FABS, "fabs");
     FORMAT(FNEG, "fneg");
     FORMAT(FSQRT, "fsqrt");
@@ -1480,14 +1714,15 @@
 
   switch (instr->Mask(FPDataProcessing2SourceMask)) {
 #define FORMAT(A, B) \
+  case A##_h:        \
   case A##_s:        \
   case A##_d:        \
     mnemonic = B;    \
     break;
-    FORMAT(FMUL, "fmul");
-    FORMAT(FDIV, "fdiv");
     FORMAT(FADD, "fadd");
     FORMAT(FSUB, "fsub");
+    FORMAT(FMUL, "fmul");
+    FORMAT(FDIV, "fdiv");
     FORMAT(FMAX, "fmax");
     FORMAT(FMIN, "fmin");
     FORMAT(FMAXNM, "fmaxnm");
@@ -1507,6 +1742,7 @@
 
   switch (instr->Mask(FPDataProcessing3SourceMask)) {
 #define FORMAT(A, B) \
+  case A##_h:        \
   case A##_s:        \
   case A##_d:        \
     mnemonic = B;    \
@@ -1575,6 +1811,8 @@
       mnemonic = "fmov";
       form = "'Rd, 'Vn.D[1]";
       break;
+    case FCVTAS_wh:
+    case FCVTAS_xh:
     case FCVTAS_ws:
     case FCVTAS_xs:
     case FCVTAS_wd:
@@ -1582,6 +1820,8 @@
       mnemonic = "fcvtas";
       form = form_rf;
       break;
+    case FCVTAU_wh:
+    case FCVTAU_xh:
     case FCVTAU_ws:
     case FCVTAU_xs:
     case FCVTAU_wd:
@@ -1589,6 +1829,8 @@
       mnemonic = "fcvtau";
       form = form_rf;
       break;
+    case FCVTMS_wh:
+    case FCVTMS_xh:
     case FCVTMS_ws:
     case FCVTMS_xs:
     case FCVTMS_wd:
@@ -1596,6 +1838,8 @@
       mnemonic = "fcvtms";
       form = form_rf;
       break;
+    case FCVTMU_wh:
+    case FCVTMU_xh:
     case FCVTMU_ws:
     case FCVTMU_xs:
     case FCVTMU_wd:
@@ -1603,6 +1847,8 @@
       mnemonic = "fcvtmu";
       form = form_rf;
       break;
+    case FCVTNS_wh:
+    case FCVTNS_xh:
     case FCVTNS_ws:
     case FCVTNS_xs:
     case FCVTNS_wd:
@@ -1610,6 +1856,8 @@
       mnemonic = "fcvtns";
       form = form_rf;
       break;
+    case FCVTNU_wh:
+    case FCVTNU_xh:
     case FCVTNU_ws:
     case FCVTNU_xs:
     case FCVTNU_wd:
@@ -1617,34 +1865,44 @@
       mnemonic = "fcvtnu";
       form = form_rf;
       break;
-    case FCVTZU_xd:
+    case FCVTZU_wh:
+    case FCVTZU_xh:
     case FCVTZU_ws:
-    case FCVTZU_wd:
     case FCVTZU_xs:
+    case FCVTZU_wd:
+    case FCVTZU_xd:
       mnemonic = "fcvtzu";
       form = form_rf;
       break;
-    case FCVTZS_xd:
-    case FCVTZS_wd:
-    case FCVTZS_xs:
+    case FCVTZS_wh:
+    case FCVTZS_xh:
     case FCVTZS_ws:
+    case FCVTZS_xs:
+    case FCVTZS_wd:
+    case FCVTZS_xd:
       mnemonic = "fcvtzs";
       form = form_rf;
       break;
-    case FCVTPU_xd:
-    case FCVTPU_ws:
-    case FCVTPU_wd:
+    case FCVTPU_wh:
+    case FCVTPU_xh:
     case FCVTPU_xs:
+    case FCVTPU_wd:
+    case FCVTPU_ws:
+    case FCVTPU_xd:
       mnemonic = "fcvtpu";
       form = form_rf;
       break;
-    case FCVTPS_xd:
-    case FCVTPS_wd:
-    case FCVTPS_xs:
+    case FCVTPS_wh:
+    case FCVTPS_xh:
     case FCVTPS_ws:
+    case FCVTPS_xs:
+    case FCVTPS_wd:
+    case FCVTPS_xd:
       mnemonic = "fcvtps";
       form = form_rf;
       break;
+    case SCVTF_hw:
+    case SCVTF_hx:
     case SCVTF_sw:
     case SCVTF_sx:
     case SCVTF_dw:
@@ -1652,6 +1910,8 @@
       mnemonic = "scvtf";
       form = form_fr;
       break;
+    case UCVTF_hw:
+    case UCVTF_hx:
     case UCVTF_sw:
     case UCVTF_sx:
     case UCVTF_dw:
@@ -1659,6 +1919,10 @@
       mnemonic = "ucvtf";
       form = form_fr;
       break;
+    case FJCVTZS:
+      mnemonic = "fjcvtzs";
+      form = form_rf;
+      break;
   }
   Format(instr, mnemonic, form);
 }
@@ -1670,18 +1934,24 @@
   const char *form_fr = "'Fd, 'Rn, 'IFPFBits";
 
   switch (instr->Mask(FPFixedPointConvertMask)) {
+    case FCVTZS_wh_fixed:
+    case FCVTZS_xh_fixed:
     case FCVTZS_ws_fixed:
     case FCVTZS_xs_fixed:
     case FCVTZS_wd_fixed:
     case FCVTZS_xd_fixed:
       mnemonic = "fcvtzs";
       break;
+    case FCVTZU_wh_fixed:
+    case FCVTZU_xh_fixed:
     case FCVTZU_ws_fixed:
     case FCVTZU_xs_fixed:
     case FCVTZU_wd_fixed:
     case FCVTZU_xd_fixed:
       mnemonic = "fcvtzu";
       break;
+    case SCVTF_hw_fixed:
+    case SCVTF_hx_fixed:
     case SCVTF_sw_fixed:
     case SCVTF_sx_fixed:
     case SCVTF_dw_fixed:
@@ -1689,6 +1959,8 @@
       mnemonic = "scvtf";
       form = form_fr;
       break;
+    case UCVTF_hw_fixed:
+    case UCVTF_hx_fixed:
     case UCVTF_sw_fixed:
     case UCVTF_sx_fixed:
     case UCVTF_dw_fixed:
@@ -1702,6 +1974,21 @@
   Format(instr, mnemonic, form);
 }
 
+// clang-format off
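+// PAuth instructions in the HINT space; they execute as NOPs on cores
+// without pointer authentication.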
+#define PAUTH_SYSTEM_MNEMONICS(V) \
+  V(PACIA1716, "pacia1716")       \
+  V(PACIB1716, "pacib1716")       \
+  V(AUTIA1716, "autia1716")       \
+  V(AUTIB1716, "autib1716")       \
+  V(PACIAZ,    "paciaz")          \
+  V(PACIASP,   "paciasp")         \
+  V(PACIBZ,    "pacibz")          \
+  V(PACIBSP,   "pacibsp")         \
+  V(AUTIAZ,    "autiaz")          \
+  V(AUTIASP,   "autiasp")         \
+  V(AUTIBZ,    "autibz")          \
+  V(AUTIBSP,   "autibsp")
+// clang-format on
 
 void Disassembler::VisitSystem(const Instruction *instr) {
   // Some system instructions hijack their Op and Cp fields to represent a
@@ -1709,8 +1996,22 @@
   // makes the decoding tricky.
   const char *mnemonic = "unimplemented";
   const char *form = "(System)";
+  if (instr->GetInstructionBits() == XPACLRI) {
+    mnemonic = "xpaclri";
+    form = NULL;
+  } else if (instr->Mask(SystemPAuthFMask) == SystemPAuthFixed) {
+    switch (instr->Mask(SystemPAuthMask)) {
+#define PAUTH_CASE(NAME, MN) \
+  case NAME:                 \
+    mnemonic = MN;           \
+    form = NULL;             \
+    break;
 
-  if (instr->Mask(SystemExclusiveMonitorFMask) == SystemExclusiveMonitorFixed) {
+      PAUTH_SYSTEM_MNEMONICS(PAUTH_CASE)
+#undef PAUTH_CASE
+    }
+  } else if (instr->Mask(SystemExclusiveMonitorFMask) ==
+             SystemExclusiveMonitorFixed) {
     switch (instr->Mask(SystemExclusiveMonitorMask)) {
       case CLREX: {
         mnemonic = "clrex";
@@ -1763,6 +2064,11 @@
         mnemonic = "sevl";
         break;
       }
+      case ESB: {
+        form = NULL;
+        mnemonic = "esb";
+        break;
+      }
       case CSDB: {
         form = NULL;
         mnemonic = "csdb";
@@ -2154,6 +2460,73 @@
   Format(instr, mnemonic, nfd.Substitute(form));
 }
 
+void Disassembler::VisitNEON2RegMiscFP16(const Instruction *instr) {
+  const char *mnemonic = "unimplemented";
+  const char *form = "'Vd.%s, 'Vn.%s";
+  const char *form_cmp = "'Vd.%s, 'Vn.%s, #0.0";
+
+  static const NEONFormatMap map_half = {{30}, {NF_4H, NF_8H}};
+  NEONFormatDecoder nfd(instr, &map_half);
+
+  switch (instr->Mask(NEON2RegMiscFP16Mask)) {
+// clang-format off
+#define FORMAT(A, B) \
+  case NEON_##A##_H: \
+    mnemonic = B;    \
+    break;
+    FORMAT(FABS,    "fabs")
+    FORMAT(FCVTAS,  "fcvtas")
+    FORMAT(FCVTAU,  "fcvtau")
+    FORMAT(FCVTMS,  "fcvtms")
+    FORMAT(FCVTMU,  "fcvtmu")
+    FORMAT(FCVTNS,  "fcvtns")
+    FORMAT(FCVTNU,  "fcvtnu")
+    FORMAT(FCVTPS,  "fcvtps")
+    FORMAT(FCVTPU,  "fcvtpu")
+    FORMAT(FCVTZS,  "fcvtzs")
+    FORMAT(FCVTZU,  "fcvtzu")
+    FORMAT(FNEG,    "fneg")
+    FORMAT(FRECPE,  "frecpe")
+    FORMAT(FRINTA,  "frinta")
+    FORMAT(FRINTI,  "frinti")
+    FORMAT(FRINTM,  "frintm")
+    FORMAT(FRINTN,  "frintn")
+    FORMAT(FRINTP,  "frintp")
+    FORMAT(FRINTX,  "frintx")
+    FORMAT(FRINTZ,  "frintz")
+    FORMAT(FRSQRTE, "frsqrte")
+    FORMAT(FSQRT,   "fsqrt")
+    FORMAT(SCVTF,   "scvtf")
+    FORMAT(UCVTF,   "ucvtf")
+// clang-format on
+#undef FORMAT
+
+    case NEON_FCMEQ_H_zero:
+      mnemonic = "fcmeq";
+      form = form_cmp;
+      break;
+    case NEON_FCMGT_H_zero:
+      mnemonic = "fcmgt";
+      form = form_cmp;
+      break;
+    case NEON_FCMGE_H_zero:
+      mnemonic = "fcmge";
+      form = form_cmp;
+      break;
+    case NEON_FCMLT_H_zero:
+      mnemonic = "fcmlt";
+      form = form_cmp;
+      break;
+    case NEON_FCMLE_H_zero:
+      mnemonic = "fcmle";
+      form = form_cmp;
+      break;
+    default:
+      form = "(NEON2RegMiscFP16)";
+  }
+  Format(instr, mnemonic, nfd.Substitute(form));
+}
+
 
 void Disassembler::VisitNEON3Same(const Instruction *instr) {
   const char *mnemonic = "unimplemented";
@@ -2329,7 +2702,7 @@
     // size (23) and the U bit (29).
     unsigned index = (instr->ExtractBits(15, 11) << 2) |
                      (instr->ExtractBit(23) << 1) | instr->ExtractBit(29);
-    VIXL_ASSERT(index < (sizeof(mnemonics) / sizeof(mnemonics[0])));
+    VIXL_ASSERT(index < ArrayLength(mnemonics));
     mnemonic = mnemonics[index];
     // Assert that index is not one of the previously handled logical
     // instructions.
@@ -2342,6 +2715,50 @@
   Format(instr, mnemonic, nfd.Substitute(form));
 }
 
+void Disassembler::VisitNEON3SameFP16(const Instruction *instr) {
+  const char *mnemonic = "unimplemented";
+  const char *form = "'Vd.%s, 'Vn.%s, 'Vm.%s";
+
+  NEONFormatDecoder nfd(instr);
+  nfd.SetFormatMaps(nfd.FP16FormatMap());
+
+  switch (instr->Mask(NEON3SameFP16Mask)) {
+#define FORMAT(A, B) \
+  case NEON_##A##_H: \
+    mnemonic = B;    \
+    break;
+    FORMAT(FMAXNM, "fmaxnm");
+    FORMAT(FMLA, "fmla");
+    FORMAT(FADD, "fadd");
+    FORMAT(FMULX, "fmulx");
+    FORMAT(FCMEQ, "fcmeq");
+    FORMAT(FMAX, "fmax");
+    FORMAT(FRECPS, "frecps");
+    FORMAT(FMINNM, "fminnm");
+    FORMAT(FMLS, "fmls");
+    FORMAT(FSUB, "fsub");
+    FORMAT(FMIN, "fmin");
+    FORMAT(FRSQRTS, "frsqrts");
+    FORMAT(FMAXNMP, "fmaxnmp");
+    FORMAT(FADDP, "faddp");
+    FORMAT(FMUL, "fmul");
+    FORMAT(FCMGE, "fcmge");
+    FORMAT(FACGE, "facge");
+    FORMAT(FMAXP, "fmaxp");
+    FORMAT(FDIV, "fdiv");
+    FORMAT(FMINNMP, "fminnmp");
+    FORMAT(FABD, "fabd");
+    FORMAT(FCMGT, "fcmgt");
+    FORMAT(FACGT, "facgt");
+    FORMAT(FMINP, "fminp");
+#undef FORMAT
+    default:
+      form = "(NEON3SameFP16)";
+  }
+
+  Format(instr, mnemonic, nfd.Substitute(form));
+}
+
 void Disassembler::VisitNEON3SameExtra(const Instruction *instr) {
   static const NEONFormatMap map_usdot = {{30}, {NF_8B, NF_16B}};
 
@@ -2491,12 +2908,33 @@
 void Disassembler::VisitNEONAcrossLanes(const Instruction *instr) {
   const char *mnemonic = "unimplemented";
   const char *form = "%sd, 'Vn.%s";
+  const char *form_half = "'Hd, 'Vn.%s";
+  bool half_op = false;
+  static const NEONFormatMap map_half = {{30}, {NF_4H, NF_8H}};
 
   NEONFormatDecoder nfd(instr,
                         NEONFormatDecoder::ScalarFormatMap(),
                         NEONFormatDecoder::IntegerFormatMap());
 
-  if (instr->Mask(NEONAcrossLanesFPFMask) == NEONAcrossLanesFPFixed) {
+  if (instr->Mask(NEONAcrossLanesFP16FMask) == NEONAcrossLanesFP16Fixed) {
+    half_op = true;
+    form = form_half;
+    nfd.SetFormatMaps(&map_half);
+    switch (instr->Mask(NEONAcrossLanesFP16Mask)) {
+      case NEON_FMAXV_H:
+        mnemonic = "fmaxv";
+        break;
+      case NEON_FMINV_H:
+        mnemonic = "fminv";
+        break;
+      case NEON_FMAXNMV_H:
+        mnemonic = "fmaxnmv";
+        break;
+      case NEON_FMINNMV_H:
+        mnemonic = "fminnmv";
+        break;
+    }
+  } else if (instr->Mask(NEONAcrossLanesFPFMask) == NEONAcrossLanesFPFixed) {
     nfd.SetFormatMap(0, nfd.FPScalarFormatMap());
     nfd.SetFormatMap(1, nfd.FPFormatMap());
     switch (instr->Mask(NEONAcrossLanesFPMask)) {
@@ -2546,11 +2984,16 @@
         break;
     }
   }
-  Format(instr,
-         mnemonic,
-         nfd.Substitute(form,
-                        NEONFormatDecoder::kPlaceholder,
-                        NEONFormatDecoder::kFormat));
+
+  if (half_op) {
+    Format(instr, mnemonic, nfd.Substitute(form));
+  } else {
+    Format(instr,
+           mnemonic,
+           nfd.Substitute(form,
+                          NEONFormatDecoder::kPlaceholder,
+                          NEONFormatDecoder::kFormat));
+  }
 }
 
 
@@ -2559,6 +3002,7 @@
   bool l_instr = false;
   bool fp_instr = false;
   bool cn_instr = false;
+  bool half_instr = false;
 
   const char *form = "'Vd.%s, 'Vn.%s, 'Ve.%s['IVByElemIndex]";
 
@@ -2567,6 +3011,7 @@
       {{23, 22, 30},
        {NF_UNDEF, NF_UNDEF, NF_4H, NF_8H, NF_UNDEF, NF_4S, NF_UNDEF, NF_UNDEF}};
   static const NEONFormatMap map_usdot = {{30}, {NF_8B, NF_16B}};
+  static const NEONFormatMap map_half = {{30}, {NF_4H, NF_8H}};
 
   NEONFormatDecoder nfd(instr,
                         &map_ta,
@@ -2659,6 +3104,22 @@
           mnemonic = "fmulx";
           fp_instr = true;
           break;
+        case NEON_FMLA_H_byelement:
+          mnemonic = "fmla";
+          half_instr = true;
+          break;
+        case NEON_FMLS_H_byelement:
+          mnemonic = "fmls";
+          half_instr = true;
+          break;
+        case NEON_FMUL_H_byelement:
+          mnemonic = "fmul";
+          half_instr = true;
+          break;
+        case NEON_FMULX_H_byelement:
+          mnemonic = "fmulx";
+          half_instr = true;
+          break;
         default:
           switch (instr->Mask(NEONByIndexedElementFPComplexMask)) {
             case NEON_FCMLA_byelement:
@@ -2670,7 +3131,11 @@
       }
   }
 
-  if (l_instr) {
+  if (half_instr) {
+    form = "'Vd.%s, 'Vn.%s, 'Ve.h['IVByElemIndex]";
+    nfd.SetFormatMaps(&map_half, &map_half);
+    Format(instr, mnemonic, nfd.Substitute(form));
+  } else if (l_instr) {
     Format(instr, nfd.Mnemonic(mnemonic), nfd.Substitute(form));
   } else if (fp_instr) {
     nfd.SetFormatMap(0, nfd.FPFormatMap());
@@ -3488,6 +3953,52 @@
   Format(instr, mnemonic, nfd.SubstitutePlaceholders(form));
 }
 
+void Disassembler::VisitNEONScalar2RegMiscFP16(const Instruction *instr) {
+  const char *mnemonic = "unimplemented";
+  const char *form = "'Hd, 'Hn";
+  const char *form_fp0 = "'Hd, 'Hn, #0.0";
+
+  switch (instr->Mask(NEONScalar2RegMiscFP16Mask)) {
+#define FORMAT(A, B)        \
+  case NEON_##A##_H_scalar: \
+    mnemonic = B;           \
+    break;
+    // clang-format off
+    FORMAT(FCVTNS,  "fcvtns")
+    FORMAT(FCVTMS,  "fcvtms")
+    FORMAT(FCVTAS,  "fcvtas")
+    FORMAT(SCVTF,   "scvtf")
+    FORMAT(FCVTPS,  "fcvtps")
+    FORMAT(FCVTZS,  "fcvtzs")
+    FORMAT(FRECPE,  "frecpe")
+    FORMAT(FRECPX,  "frecpx")
+    FORMAT(FCVTNU,  "fcvtnu")
+    FORMAT(FCVTMU,  "fcvtmu")
+    FORMAT(FCVTAU,  "fcvtau")
+    FORMAT(UCVTF,   "ucvtf")
+    FORMAT(FCVTPU,  "fcvtpu")
+    FORMAT(FCVTZU,  "fcvtzu")
+    FORMAT(FRSQRTE, "frsqrte")
+// clang-format on
+#undef FORMAT
+#define FORMAT(A, B)             \
+  case NEON_##A##_H_zero_scalar: \
+    mnemonic = B;                \
+    form = form_fp0;             \
+    break;
+    FORMAT(FCMGT, "fcmgt")
+    FORMAT(FCMEQ, "fcmeq")
+    FORMAT(FCMLT, "fcmlt")
+    FORMAT(FCMGE, "fcmge")
+    FORMAT(FCMLE, "fcmle")
+#undef FORMAT
+
+    default:
+      VIXL_UNREACHABLE();
+  }
+  Format(instr, mnemonic, form);
+}
+
 
 void Disassembler::VisitNEONScalar3Diff(const Instruction *instr) {
   const char *mnemonic = "unimplemented";
@@ -3626,6 +4137,43 @@
   Format(instr, mnemonic, nfd.SubstitutePlaceholders(form));
 }
 
+void Disassembler::VisitNEONScalar3SameFP16(const Instruction *instr) {
+  const char *mnemonic = NULL;
+  const char *form = "'Hd, 'Hn, 'Hm";
+
+  switch (instr->Mask(NEONScalar3SameFP16Mask)) {
+    case NEON_FABD_H_scalar:
+      mnemonic = "fabd";
+      break;
+    case NEON_FMULX_H_scalar:
+      mnemonic = "fmulx";
+      break;
+    case NEON_FCMEQ_H_scalar:
+      mnemonic = "fcmeq";
+      break;
+    case NEON_FCMGE_H_scalar:
+      mnemonic = "fcmge";
+      break;
+    case NEON_FCMGT_H_scalar:
+      mnemonic = "fcmgt";
+      break;
+    case NEON_FACGE_H_scalar:
+      mnemonic = "facge";
+      break;
+    case NEON_FACGT_H_scalar:
+      mnemonic = "facgt";
+      break;
+    case NEON_FRECPS_H_scalar:
+      mnemonic = "frecps";
+      break;
+    case NEON_FRSQRTS_H_scalar:
+      mnemonic = "frsqrts";
+      break;
+    default:
+      VIXL_UNREACHABLE();
+  }
+  Format(instr, mnemonic, form);
+}
 
 void Disassembler::VisitNEONScalar3SameExtra(const Instruction *instr) {
   const char *mnemonic = "unimplemented";
@@ -3649,6 +4197,7 @@
 void Disassembler::VisitNEONScalarByIndexedElement(const Instruction *instr) {
   const char *mnemonic = "unimplemented";
   const char *form = "%sd, %sn, 'Ve.%s['IVByElemIndex]";
+  const char *form_half = "'Hd, 'Hn, 'Ve.h['IVByElemIndex]";
   NEONFormatDecoder nfd(instr, NEONFormatDecoder::ScalarFormatMap());
   bool long_instr = false;
 
@@ -3692,6 +4241,22 @@
         case NEON_FMULX_byelement_scalar:
           mnemonic = "fmulx";
           break;
+        case NEON_FMLA_H_byelement_scalar:
+          mnemonic = "fmla";
+          form = form_half;
+          break;
+        case NEON_FMLS_H_byelement_scalar:
+          mnemonic = "fmls";
+          form = form_half;
+          break;
+        case NEON_FMUL_H_byelement_scalar:
+          mnemonic = "fmul";
+          form = form_half;
+          break;
+        case NEON_FMULX_H_byelement_scalar:
+          mnemonic = "fmulx";
+          form = form_half;
+          break;
         default:
           form = "(NEONScalarByIndexedElement)";
       }
@@ -3726,24 +4291,44 @@
   const char *mnemonic = "unimplemented";
   const char *form = "%sd, 'Vn.%s";
   NEONFormatMap map = {{22}, {NF_2S, NF_2D}};
-  NEONFormatDecoder nfd(instr, NEONFormatDecoder::FPScalarFormatMap(), &map);
+  NEONFormatDecoder nfd(instr,
+                        NEONFormatDecoder::FPScalarPairwiseFormatMap(),
+                        &map);
 
   switch (instr->Mask(NEONScalarPairwiseMask)) {
     case NEON_ADDP_scalar:
+      // All pairwise operations except ADDP use bit U to differentiate FP16
+      // from FP32/FP64 variations.
+      nfd.SetFormatMap(0, NEONFormatDecoder::FPScalarFormatMap());
       mnemonic = "addp";
       break;
+    case NEON_FADDP_h_scalar:
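+      // The FP16 scalar pairwise form disassembles as, for example,
+      // "faddp h0, v1.2h".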
+      form = "%sd, 'Vn.2h";
+      VIXL_FALLTHROUGH();
     case NEON_FADDP_scalar:
       mnemonic = "faddp";
       break;
+    case NEON_FMAXP_h_scalar:
+      form = "%sd, 'Vn.2h";
+      VIXL_FALLTHROUGH();
     case NEON_FMAXP_scalar:
       mnemonic = "fmaxp";
       break;
+    case NEON_FMAXNMP_h_scalar:
+      form = "%sd, 'Vn.2h";
+      VIXL_FALLTHROUGH();
     case NEON_FMAXNMP_scalar:
       mnemonic = "fmaxnmp";
       break;
+    case NEON_FMINP_h_scalar:
+      form = "%sd, 'Vn.2h";
+      VIXL_FALLTHROUGH();
     case NEON_FMINP_scalar:
       mnemonic = "fminp";
       break;
+    case NEON_FMINNMP_h_scalar:
+      form = "%sd, 'Vn.2h";
+      VIXL_FALLTHROUGH();
     case NEON_FMINNMP_scalar:
       mnemonic = "fminnmp";
       break;
@@ -4683,6 +5268,10 @@
             ret += 3;
           } else if (instr->GetNEONSize() == 1) {
             vm_index = (vm_index << 1) | instr->GetNEONM();
+          } else if (instr->GetNEONSize() == 0) {
+            // Half-precision FP ops use the H:L:M bits as the lane index.
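+            // For example, H:L:M = 1:0:1 gives index (1 << 2) | (0 << 1) | 1,
+            // i.e. lane 5.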
+            vm_index = (instr->GetNEONH() << 2) | (instr->GetNEONL() << 1) |
+                       instr->GetNEONM();
           }
           AppendToOutput("%d", vm_index);
           return ret;
@@ -5079,7 +5668,7 @@
   unsigned target = instr->GetPrefetchTarget() + 1;
   unsigned stream = instr->GetPrefetchStream();
 
-  if ((hint >= (sizeof(hints) / sizeof(hints[0]))) || (target > 3)) {
+  if ((hint >= ArrayLength(hints)) || (target > 3)) {
     // Unallocated prefetch operations.
     int prefetch_mode = instr->GetImmPrefetchOperation();
     AppendToOutput("#0b%c%c%c%c%c",
@@ -5089,7 +5678,7 @@
                    (prefetch_mode & (1 << 1)) ? '1' : '0',
                    (prefetch_mode & (1 << 0)) ? '1' : '0');
   } else {
-    VIXL_ASSERT(stream < (sizeof(stream_options) / sizeof(stream_options[0])));
+    VIXL_ASSERT(stream < ArrayLength(stream_options));
     AppendToOutput("p%sl%d%s", hints[hint], target, stream_options[stream]);
   }
   return 6;
diff --git a/src/aarch64/instructions-aarch64.cc b/src/aarch64/instructions-aarch64.cc
index 2ebb085..a99a045 100644
--- a/src/aarch64/instructions-aarch64.cc
+++ b/src/aarch64/instructions-aarch64.cc
@@ -30,24 +30,6 @@
 namespace vixl {
 namespace aarch64 {
 
-
-// Floating-point infinity values.
-const float16 kFP16PositiveInfinity = 0x7c00;
-const float16 kFP16NegativeInfinity = 0xfc00;
-const float kFP32PositiveInfinity = RawbitsToFloat(0x7f800000);
-const float kFP32NegativeInfinity = RawbitsToFloat(0xff800000);
-const double kFP64PositiveInfinity =
-    RawbitsToDouble(UINT64_C(0x7ff0000000000000));
-const double kFP64NegativeInfinity =
-    RawbitsToDouble(UINT64_C(0xfff0000000000000));
-
-
-// The default NaN values (for FPCR.DN=1).
-const double kFP64DefaultNaN = RawbitsToDouble(UINT64_C(0x7ff8000000000000));
-const float kFP32DefaultNaN = RawbitsToFloat(0x7fc00000);
-const float16 kFP16DefaultNaN = 0x7e00;
-
-
 static uint64_t RepeatBitsAcrossReg(unsigned reg_size,
                                     uint64_t value,
                                     unsigned width) {
@@ -180,7 +162,7 @@
 }
 
 
-float16 Instruction::Imm8ToFP16(uint32_t imm8) {
+Float16 Instruction::Imm8ToFloat16(uint32_t imm8) {
   // Imm8: abcdefgh (8 bits)
   // Half: aBbb.cdef.gh00.0000 (16 bits)
   // where B is b ^ 1
@@ -207,7 +189,7 @@
 }
 
 
-float16 Instruction::GetImmFP16() const { return Imm8ToFP16(GetImmFP()); }
+Float16 Instruction::GetImmFP16() const { return Imm8ToFloat16(GetImmFP()); }
 
 
 float Instruction::GetImmFP32() const { return Imm8ToFP32(GetImmFP()); }
@@ -231,8 +213,8 @@
 double Instruction::GetImmFP64() const { return Imm8ToFP64(GetImmFP()); }
 
 
-float16 Instruction::GetImmNEONFP16() const {
-  return Imm8ToFP16(GetImmNEONabcdefgh());
+Float16 Instruction::GetImmNEONFP16() const {
+  return Imm8ToFloat16(GetImmNEONabcdefgh());
 }
 
 
@@ -566,6 +548,7 @@
     case kFormatH:
       return kHRegSize;
     case kFormatS:
+    case kFormat2H:
       return kSRegSize;
     case kFormatD:
       return kDRegSize;
@@ -593,6 +576,7 @@
     case kFormat16B:
       return 8;
     case kFormatH:
+    case kFormat2H:
     case kFormat4H:
     case kFormat8H:
       return 16;
@@ -624,6 +608,7 @@
     case kFormat16B:
       return 0;
     case kFormatH:
+    case kFormat2H:
     case kFormat4H:
     case kFormat8H:
       return 1;
@@ -653,6 +638,7 @@
     case kFormat4H:
     case kFormat4S:
       return 4;
+    case kFormat2H:
     case kFormat2S:
     case kFormat2D:
       return 2;
diff --git a/src/aarch64/instructions-aarch64.h b/src/aarch64/instructions-aarch64.h
index 1f134c1..4e6bce7 100644
--- a/src/aarch64/instructions-aarch64.h
+++ b/src/aarch64/instructions-aarch64.h
@@ -90,12 +90,15 @@
 const uint64_t kWordMask = UINT64_C(0xffffffff);
 const uint64_t kXMaxUInt = UINT64_C(0xffffffffffffffff);
 const uint64_t kWMaxUInt = UINT64_C(0xffffffff);
+const uint64_t kHMaxUInt = UINT64_C(0xffff);
 // Define k*MinInt with "-k*MaxInt - 1", because the hexadecimal representation
 // (e.g. "INT32_C(0x80000000)") has implementation-defined behaviour.
 const int64_t kXMaxInt = INT64_C(0x7fffffffffffffff);
 const int64_t kXMinInt = -kXMaxInt - 1;
 const int32_t kWMaxInt = INT32_C(0x7fffffff);
 const int32_t kWMinInt = -kWMaxInt - 1;
+const int16_t kHMaxInt = INT16_C(0x7fff);
+const int16_t kHMinInt = -kHMaxInt - 1;
 const unsigned kFpRegCode = 29;
 const unsigned kLinkRegCode = 30;
 const unsigned kSpRegCode = 31;
@@ -109,26 +112,27 @@
                                  << kAddressTagOffset;
 VIXL_STATIC_ASSERT(kAddressTagMask == UINT64_C(0xff00000000000000));
 
-// AArch64 floating-point specifics. These match IEEE-754.
-const unsigned kDoubleMantissaBits = 52;
-const unsigned kDoubleExponentBits = 11;
-const unsigned kFloatMantissaBits = 23;
-const unsigned kFloatExponentBits = 8;
-const unsigned kFloat16MantissaBits = 10;
-const unsigned kFloat16ExponentBits = 5;
+const uint64_t kTTBRMask = UINT64_C(1) << 55;
 
-// Floating-point infinity values.
-extern const float16 kFP16PositiveInfinity;
-extern const float16 kFP16NegativeInfinity;
-extern const float kFP32PositiveInfinity;
-extern const float kFP32NegativeInfinity;
-extern const double kFP64PositiveInfinity;
-extern const double kFP64NegativeInfinity;
+// These floating-point constants have moved to the vixl namespace. Import
+// them here so that explicit vixl::aarch64:: references stay backwards
+// compatible.
+using vixl::kDoubleMantissaBits;
+using vixl::kDoubleExponentBits;
+using vixl::kFloatMantissaBits;
+using vixl::kFloatExponentBits;
+using vixl::kFloat16MantissaBits;
+using vixl::kFloat16ExponentBits;
 
-// The default NaN values (for FPCR.DN=1).
-extern const float16 kFP16DefaultNaN;
-extern const float kFP32DefaultNaN;
-extern const double kFP64DefaultNaN;
+using vixl::kFP16PositiveInfinity;
+using vixl::kFP16NegativeInfinity;
+using vixl::kFP32PositiveInfinity;
+using vixl::kFP32NegativeInfinity;
+using vixl::kFP64PositiveInfinity;
+using vixl::kFP64NegativeInfinity;
+
+using vixl::kFP16DefaultNaN;
+using vixl::kFP32DefaultNaN;
+using vixl::kFP64DefaultNaN;
 
 unsigned CalcLSDataSize(LoadStoreOp op);
 unsigned CalcLSPairDataSize(LoadStorePairOp op);
@@ -143,19 +147,6 @@
 
 enum AddrMode { Offset, PreIndex, PostIndex };
 
-enum FPRounding {
-  // The first four values are encodable directly by FPCR<RMode>.
-  FPTieEven = 0x0,
-  FPPositiveInfinity = 0x1,
-  FPNegativeInfinity = 0x2,
-  FPZero = 0x3,
-
-  // The final rounding modes are only available when explicitly specified by
-  // the instruction (such as with fcvta). It cannot be set in FPCR.
-  FPTieAway,
-  FPRoundOdd
-};
-
 enum Reg31Mode { Reg31IsStackPointer, Reg31IsZeroRegister };
 
 // Instructions. ---------------------------------------------------------------
@@ -226,7 +217,7 @@
     return GetImmNEONabcdefgh();
   }
 
-  float16 GetImmFP16() const;
+  Float16 GetImmFP16() const;
 
   float GetImmFP32() const;
   VIXL_DEPRECATED("GetImmFP32", float ImmFP32() const) { return GetImmFP32(); }
@@ -234,7 +225,7 @@
   double GetImmFP64() const;
   VIXL_DEPRECATED("GetImmFP64", double ImmFP64() const) { return GetImmFP64(); }
 
-  float16 GetImmNEONFP16() const;
+  Float16 GetImmNEONFP16() const;
 
   float GetImmNEONFP32() const;
   VIXL_DEPRECATED("GetImmNEONFP32", float ImmNEONFP32() const) {
@@ -505,7 +496,7 @@
  private:
   int GetImmBranch() const;
 
-  static float16 Imm8ToFP16(uint32_t imm8);
+  static Float16 Imm8ToFloat16(uint32_t imm8);
   static float Imm8ToFP32(uint32_t imm8);
   static double Imm8ToFP64(uint32_t imm8);
 
@@ -534,7 +525,10 @@
   kFormatB = NEON_B | NEONScalar,
   kFormatH = NEON_H | NEONScalar,
   kFormatS = NEON_S | NEONScalar,
-  kFormatD = NEON_D | NEONScalar
+  kFormatD = NEON_D | NEONScalar,
+
+  // A value invented solely for FP16 scalar pairwise simulator trace tests.
+  kFormat2H = 0xfffffffe
 };
 
 const int kMaxLanesPerVector = 16;
@@ -628,7 +622,7 @@
     formats_[2] = (format2 == NULL) ? formats_[1] : format2;
   }
   void SetFormatMap(unsigned index, const NEONFormatMap* format) {
-    VIXL_ASSERT(index <= (sizeof(formats_) / sizeof(formats_[0])));
+    VIXL_ASSERT(index <= ArrayLength(formats_));
     VIXL_ASSERT(format != NULL);
     formats_[index] = format;
   }
@@ -681,7 +675,7 @@
                                          kFormatH,
                                          kFormatS,
                                          kFormatD};
-    VIXL_ASSERT(GetNEONFormat(format_map) < (sizeof(vform) / sizeof(vform[0])));
+    VIXL_ASSERT(GetNEONFormat(format_map) < ArrayLength(vform));
     return vform[GetNEONFormat(format_map)];
   }
 
@@ -714,6 +708,13 @@
     return &map;
   }
 
+  // The FP16 format map uses one bit (Q) to encode the NEON vector format:
+  // NF_4H, NF_8H.
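+  // For example, Q = 0 (bit 30 clear) selects the "4h" arrangement and Q = 1
+  // selects "8h".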
+  static const NEONFormatMap* FP16FormatMap() {
+    static const NEONFormatMap map = {{30}, {NF_4H, NF_8H}};
+    return &map;
+  }
+
   // The load/store format map uses three bits (Q, 11, 10) to encode the
   // set of NEON vector formats.
   static const NEONFormatMap* LoadStoreFormatMap() {
@@ -765,6 +766,13 @@
     return &map;
   }
 
+  // The FP scalar pairwise format map uses two bits (U, size<0>) to encode the
+  // NEON FP scalar formats: NF_H, NF_S, NF_D.
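+  // For example, for faddp U = 0 selects the half-precision scalar form
+  // (NF_H), while U = 1 selects NF_S or NF_D depending on size<0>.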
+  static const NEONFormatMap* FPScalarPairwiseFormatMap() {
+    static const NEONFormatMap map = {{29, 22}, {NF_H, NF_UNDEF, NF_S, NF_D}};
+    return &map;
+  }
+
   // The triangular scalar format map uses between one and four bits to encode
   // the NEON FP scalar formats:
   // xxx1->B, xx10->H, x100->S, 1000->D, all others undefined.
@@ -815,7 +823,7 @@
       "b", "h", "s", "d"
     };
     // clang-format on
-    VIXL_ASSERT(format < (sizeof(formats) / sizeof(formats[0])));
+    VIXL_ASSERT(format < ArrayLength(formats));
     return formats[format];
   }
 
diff --git a/src/aarch64/instrument-aarch64.cc b/src/aarch64/instrument-aarch64.cc
index a2e6ca8..c3097ef 100644
--- a/src/aarch64/instrument-aarch64.cc
+++ b/src/aarch64/instrument-aarch64.cc
@@ -407,6 +407,14 @@
 }
 
 
+void Instrument::VisitAtomicMemory(const Instruction* instr) {
+  USE(instr);
+  Update();
+  static Counter* counter = GetCounter("Other");
+  counter->Increment();
+}
+
+
 void Instrument::VisitLoadLiteral(const Instruction* instr) {
   USE(instr);
   Update();
@@ -670,6 +678,14 @@
 }
 
 
+void Instrument::VisitNEON2RegMiscFP16(const Instruction* instr) {
+  USE(instr);
+  Update();
+  static Counter* counter = GetCounter("NEON");
+  counter->Increment();
+}
+
+
 void Instrument::VisitNEON3Same(const Instruction* instr) {
   USE(instr);
   Update();
@@ -678,6 +694,14 @@
 }
 
 
+void Instrument::VisitNEON3SameFP16(const Instruction* instr) {
+  USE(instr);
+  Update();
+  static Counter* counter = GetCounter("NEON");
+  counter->Increment();
+}
+
+
 void Instrument::VisitNEON3SameExtra(const Instruction* instr) {
   USE(instr);
   Update();
@@ -776,6 +800,14 @@
 }
 
 
+void Instrument::VisitNEONScalar2RegMiscFP16(const Instruction* instr) {
+  USE(instr);
+  Update();
+  static Counter* counter = GetCounter("NEON");
+  counter->Increment();
+}
+
+
 void Instrument::VisitNEONScalar3Diff(const Instruction* instr) {
   USE(instr);
   Update();
@@ -792,6 +824,14 @@
 }
 
 
+void Instrument::VisitNEONScalar3SameFP16(const Instruction* instr) {
+  USE(instr);
+  Update();
+  static Counter* counter = GetCounter("NEON");
+  counter->Increment();
+}
+
+
 void Instrument::VisitNEONScalar3SameExtra(const Instruction* instr) {
   USE(instr);
   Update();
diff --git a/src/aarch64/logic-aarch64.cc b/src/aarch64/logic-aarch64.cc
index 20d4c00..aebd227 100644
--- a/src/aarch64/logic-aarch64.cc
+++ b/src/aarch64/logic-aarch64.cc
@@ -33,6 +33,39 @@
 namespace vixl {
 namespace aarch64 {
 
+using vixl::internal::SimFloat16;
+
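+// Small type traits used by the templated FP helpers below to distinguish the
+// lane type (double, float, or one of the half-precision representations).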
+template <typename T>
+bool IsFloat64() {
+  return false;
+}
+template <>
+bool IsFloat64<double>() {
+  return true;
+}
+
+template <typename T>
+bool IsFloat32() {
+  return false;
+}
+template <>
+bool IsFloat32<float>() {
+  return true;
+}
+
+template <typename T>
+bool IsFloat16() {
+  return false;
+}
+template <>
+bool IsFloat16<Float16>() {
+  return true;
+}
+template <>
+bool IsFloat16<SimFloat16>() {
+  return true;
+}
+
 template <>
 double Simulator::FPDefaultNaN<double>() {
   return kFP64DefaultNaN;
@@ -44,6 +77,13 @@
   return kFP32DefaultNaN;
 }
 
+
+template <>
+SimFloat16 Simulator::FPDefaultNaN<SimFloat16>() {
+  return SimFloat16(kFP16DefaultNaN);
+}
+
+
 double Simulator::FixedToDouble(int64_t src, int fbits, FPRounding round) {
   if (src >= 0) {
     return UFixedToDouble(src, fbits, round);
@@ -98,6 +138,35 @@
 }
 
 
+SimFloat16 Simulator::FixedToFloat16(int64_t src, int fbits, FPRounding round) {
+  if (src >= 0) {
+    return UFixedToFloat16(src, fbits, round);
+  } else if (src == INT64_MIN) {
+    return -UFixedToFloat16(src, fbits, round);
+  } else {
+    return -UFixedToFloat16(-src, fbits, round);
+  }
+}
+
+
+SimFloat16 Simulator::UFixedToFloat16(uint64_t src,
+                                      int fbits,
+                                      FPRounding round) {
+  // An input of 0 is a special case because the result is effectively
+  // subnormal: The exponent is encoded as 0 and there is no implicit 1 bit.
+  if (src == 0) {
+    return 0.0f;
+  }
+
+  // Calculate the exponent. The highest significant bit will have the value
+  // 2^exponent.
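+  // For example, src = 6 with fbits = 1 represents 3.0: the highest set bit
+  // is bit 2, so the exponent is 2 - 1 = 1.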
+  const int highest_significant_bit = 63 - CountLeadingZeros(src);
+  const int16_t exponent = highest_significant_bit - fbits;
+
+  return FPRoundToFloat16(0, exponent, src, round);
+}
+
+
 void Simulator::ld1(VectorFormat vform, LogicVRegister dst, uint64_t addr) {
   dst.ClearForWrite(vform);
   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
@@ -3614,13 +3683,14 @@
   return -op;
 }
 
-
 template <typename T>
 T Simulator::FPAdd(T op1, T op2) {
   T result = FPProcessNaNs(op1, op2);
-  if (std::isnan(result)) return result;
+  if (IsNaN(result)) {
+    return result;
+  }
 
-  if (std::isinf(op1) && std::isinf(op2) && (op1 != op2)) {
+  if (IsInf(op1) && IsInf(op2) && (op1 != op2)) {
     // inf + -inf returns the default NaN.
     FPProcessException();
     return FPDefaultNaN<T>();
@@ -3634,9 +3704,9 @@
 template <typename T>
 T Simulator::FPSub(T op1, T op2) {
   // NaNs should be handled elsewhere.
-  VIXL_ASSERT(!std::isnan(op1) && !std::isnan(op2));
+  VIXL_ASSERT(!IsNaN(op1) && !IsNaN(op2));
 
-  if (std::isinf(op1) && std::isinf(op2) && (op1 == op2)) {
+  if (IsInf(op1) && IsInf(op2) && (op1 == op2)) {
     // inf - inf returns the default NaN.
     FPProcessException();
     return FPDefaultNaN<T>();
@@ -3650,9 +3720,9 @@
 template <typename T>
 T Simulator::FPMul(T op1, T op2) {
   // NaNs should be handled elsewhere.
-  VIXL_ASSERT(!std::isnan(op1) && !std::isnan(op2));
+  VIXL_ASSERT(!IsNaN(op1) && !IsNaN(op2));
 
-  if ((std::isinf(op1) && (op2 == 0.0)) || (std::isinf(op2) && (op1 == 0.0))) {
+  if ((IsInf(op1) && (op2 == 0.0)) || (IsInf(op2) && (op1 == 0.0))) {
     // inf * 0.0 returns the default NaN.
     FPProcessException();
     return FPDefaultNaN<T>();
@@ -3665,7 +3735,7 @@
 
 template <typename T>
 T Simulator::FPMulx(T op1, T op2) {
-  if ((std::isinf(op1) && (op2 == 0.0)) || (std::isinf(op2) && (op1 == 0.0))) {
+  if ((IsInf(op1) && (op2 == 0.0)) || (IsInf(op2) && (op1 == 0.0))) {
     // inf * 0.0 returns +/-2.0.
     T two = 2.0;
     return copysign(1.0, op1) * copysign(1.0, op2) * two;
@@ -3680,13 +3750,13 @@
 
   T sign_a = copysign(1.0, a);
   T sign_prod = copysign(1.0, op1) * copysign(1.0, op2);
-  bool isinf_prod = std::isinf(op1) || std::isinf(op2);
+  bool isinf_prod = IsInf(op1) || IsInf(op2);
   bool operation_generates_nan =
-      (std::isinf(op1) && (op2 == 0.0)) ||                     // inf * 0.0
-      (std::isinf(op2) && (op1 == 0.0)) ||                     // 0.0 * inf
-      (std::isinf(a) && isinf_prod && (sign_a != sign_prod));  // inf - inf
+      (IsInf(op1) && (op2 == 0.0)) ||                     // inf * 0.0
+      (IsInf(op2) && (op1 == 0.0)) ||                     // 0.0 * inf
+      (IsInf(a) && isinf_prod && (sign_a != sign_prod));  // inf - inf
 
-  if (std::isnan(result)) {
+  if (IsNaN(result)) {
     // Generated NaNs override quiet NaNs propagated from a.
     if (operation_generates_nan && IsQuietNaN(a)) {
       FPProcessException();
@@ -3705,11 +3775,11 @@
   // Work around broken fma implementations for exact zero results: The sign of
   // exact 0.0 results is positive unless both a and op1 * op2 are negative.
   if (((op1 == 0.0) || (op2 == 0.0)) && (a == 0.0)) {
-    return ((sign_a < 0) && (sign_prod < 0)) ? -0.0 : 0.0;
+    return ((sign_a < T(0.0)) && (sign_prod < T(0.0))) ? -0.0 : 0.0;
   }
 
   result = FusedMultiplyAdd(op1, op2, a);
-  VIXL_ASSERT(!std::isnan(result));
+  VIXL_ASSERT(!IsNaN(result));
 
   // Work around broken fma implementations for rounded zero results: If a is
   // 0.0, the sign of the result is the sign of op1 * op2 before rounding.
@@ -3724,16 +3794,16 @@
 template <typename T>
 T Simulator::FPDiv(T op1, T op2) {
   // NaNs should be handled elsewhere.
-  VIXL_ASSERT(!std::isnan(op1) && !std::isnan(op2));
+  VIXL_ASSERT(!IsNaN(op1) && !IsNaN(op2));
 
-  if ((std::isinf(op1) && std::isinf(op2)) || ((op1 == 0.0) && (op2 == 0.0))) {
+  if ((IsInf(op1) && IsInf(op2)) || ((op1 == 0.0) && (op2 == 0.0))) {
     // inf / inf and 0.0 / 0.0 return the default NaN.
     FPProcessException();
     return FPDefaultNaN<T>();
   } else {
     if (op2 == 0.0) {
       FPProcessException();
-      if (!std::isnan(op1)) {
+      if (!IsNaN(op1)) {
         double op1_sign = copysign(1.0, op1);
         double op2_sign = copysign(1.0, op2);
         return static_cast<T>(op1_sign * op2_sign * kFP64PositiveInfinity);
@@ -3748,9 +3818,9 @@
 
 template <typename T>
 T Simulator::FPSqrt(T op) {
-  if (std::isnan(op)) {
+  if (IsNaN(op)) {
     return FPProcessNaN(op);
-  } else if (op < 0.0) {
+  } else if (op < T(0.0)) {
     FPProcessException();
     return FPDefaultNaN<T>();
   } else {
@@ -3762,7 +3832,7 @@
 template <typename T>
 T Simulator::FPMax(T a, T b) {
   T result = FPProcessNaNs(a, b);
-  if (std::isnan(result)) return result;
+  if (IsNaN(result)) return result;
 
   if ((a == 0.0) && (b == 0.0) && (copysign(1.0, a) != copysign(1.0, b))) {
     // a and b are zero, and the sign differs: return +0.0.
@@ -3782,14 +3852,14 @@
   }
 
   T result = FPProcessNaNs(a, b);
-  return std::isnan(result) ? result : FPMax(a, b);
+  return IsNaN(result) ? result : FPMax(a, b);
 }
 
 
 template <typename T>
 T Simulator::FPMin(T a, T b) {
   T result = FPProcessNaNs(a, b);
-  if (std::isnan(result)) return result;
+  if (IsNaN(result)) return result;
 
   if ((a == 0.0) && (b == 0.0) && (copysign(1.0, a) != copysign(1.0, b))) {
     // a and b are zero, and the sign differs: return -0.0.
@@ -3809,17 +3879,16 @@
   }
 
   T result = FPProcessNaNs(a, b);
-  return std::isnan(result) ? result : FPMin(a, b);
+  return IsNaN(result) ? result : FPMin(a, b);
 }
 
 
 template <typename T>
 T Simulator::FPRecipStepFused(T op1, T op2) {
   const T two = 2.0;
-  if ((std::isinf(op1) && (op2 == 0.0)) ||
-      ((op1 == 0.0) && (std::isinf(op2)))) {
+  if ((IsInf(op1) && (op2 == 0.0)) || ((op1 == 0.0) && (IsInf(op2)))) {
     return two;
-  } else if (std::isinf(op1) || std::isinf(op2)) {
+  } else if (IsInf(op1) || IsInf(op2)) {
     // Return +inf if signs match, otherwise -inf.
     return ((op1 >= 0.0) == (op2 >= 0.0)) ? kFP64PositiveInfinity
                                           : kFP64NegativeInfinity;
@@ -3828,16 +3897,28 @@
   }
 }
 
+template <typename T>
+bool IsNormal(T value) {
+  return std::isnormal(value);
+}
+
+template <>
+bool IsNormal(SimFloat16 value) {
+  uint16_t rawbits = Float16ToRawbits(value);
+  uint16_t exp_mask = 0x7c00;
+  // Check that the exponent is neither all zeroes nor all ones.
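+  // For example, 1.0 (0x3c00) has exponent 0x0f and is normal, whereas the
+  // subnormal 0x0001 has exponent 0 and is not.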
+  return ((rawbits & exp_mask) != 0) && ((~rawbits & exp_mask) != 0);
+}
+
 
 template <typename T>
 T Simulator::FPRSqrtStepFused(T op1, T op2) {
   const T one_point_five = 1.5;
   const T two = 2.0;
 
-  if ((std::isinf(op1) && (op2 == 0.0)) ||
-      ((op1 == 0.0) && (std::isinf(op2)))) {
+  if ((IsInf(op1) && (op2 == 0.0)) || ((op1 == 0.0) && (IsInf(op2)))) {
     return one_point_five;
-  } else if (std::isinf(op1) || std::isinf(op2)) {
+  } else if (IsInf(op1) || IsInf(op2)) {
     // Return +inf if signs match, otherwise -inf.
     return ((op1 >= 0.0) == (op2 >= 0.0)) ? kFP64PositiveInfinity
                                           : kFP64NegativeInfinity;
@@ -3845,9 +3926,9 @@
     // The multiply-add-halve operation must be fully fused, so avoid interim
     // rounding by checking which operand can be losslessly divided by two
     // before doing the multiply-add.
-    if (std::isnormal(op1 / two)) {
+    if (IsNormal(op1 / two)) {
       return FusedMultiplyAdd(op1 / two, op2, one_point_five);
-    } else if (std::isnormal(op2 / two)) {
+    } else if (IsNormal(op2 / two)) {
       return FusedMultiplyAdd(op1, op2 / two, one_point_five);
     } else {
       // Neither operand is normal after halving: the result is dominated by
@@ -3857,12 +3938,80 @@
   }
 }
 
+int32_t Simulator::FPToFixedJS(double value) {
+  // The Z-flag is set when the conversion from double precision floating-point
+  // to 32-bit integer is exact. If the source value is +/-Infinity, -0.0, NaN,
+  // outside the bounds of a 32-bit integer, or isn't an exact integer, then the
+  // Z-flag is unset.
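+  // For example (illustrative values): 2.0 converts to 2 with Z = 1, whereas
+  // -1.5 truncates to -1 and 2^32 + 3 wraps to 3, both with Z = 0.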
+  int Z = 1;
+  int32_t result;
+
+  if ((value == 0.0) || (value == kFP64PositiveInfinity) ||
+      (value == kFP64NegativeInfinity)) {
+    // +/- zero and infinity all return zero; however, -0 and +/- Infinity also
+    // unset the Z-flag.
+    result = 0;
+    if ((value != 0.0) || std::signbit(value)) {
+      Z = 0;
+    }
+  } else if (std::isnan(value)) {
+    // NaN values unset the Z-flag and set the result to 0.
+    FPProcessNaN(value);
+    result = 0;
+    Z = 0;
+  } else {
+    // All other values are converted to an integer representation, rounded
+    // toward zero.
+    double int_result = std::floor(value);
+    double error = value - int_result;
+
+    if ((error != 0.0) && (int_result < 0.0)) {
+      int_result++;
+    }
+
+    // Constrain the value into the range [INT32_MIN, INT32_MAX]. We can almost
+    // write a one-liner with std::round, but the behaviour on ties is incorrect
+    // for our purposes.
+    double mod_const = static_cast<double>(UINT64_C(1) << 32);
+    double mod_error =
+        (int_result / mod_const) - std::floor(int_result / mod_const);
+    double constrained;
+    if (mod_error == 0.5) {
+      constrained = INT32_MIN;
+    } else {
+      constrained = int_result - mod_const * round(int_result / mod_const);
+    }
+
+    VIXL_ASSERT(std::floor(constrained) == constrained);
+    VIXL_ASSERT(constrained >= INT32_MIN);
+    VIXL_ASSERT(constrained <= INT32_MAX);
+
+    // Take the bottom 32 bits of the result as a 32-bit integer.
+    result = static_cast<int32_t>(constrained);
+
+    if ((int_result < INT32_MIN) || (int_result > INT32_MAX) ||
+        (error != 0.0)) {
+      // If the integer result is out of range or the conversion isn't exact,
+      // take exception and unset the Z-flag.
+      FPProcessException();
+      Z = 0;
+    }
+  }
+
+  ReadNzcv().SetN(0);
+  ReadNzcv().SetZ(Z);
+  ReadNzcv().SetC(0);
+  ReadNzcv().SetV(0);
+
+  return result;
+}
+
 
 double Simulator::FPRoundInt(double value, FPRounding round_mode) {
   if ((value == 0.0) || (value == kFP64PositiveInfinity) ||
       (value == kFP64NegativeInfinity)) {
     return value;
-  } else if (std::isnan(value)) {
+  } else if (IsNaN(value)) {
     return FPProcessNaN(value);
   }
 
@@ -3927,6 +4076,17 @@
 }
 
 
+int16_t Simulator::FPToInt16(double value, FPRounding rmode) {
+  value = FPRoundInt(value, rmode);
+  if (value >= kHMaxInt) {
+    return kHMaxInt;
+  } else if (value < kHMinInt) {
+    return kHMinInt;
+  }
+  return IsNaN(value) ? 0 : static_cast<int16_t>(value);
+}
+
+
 int32_t Simulator::FPToInt32(double value, FPRounding rmode) {
   value = FPRoundInt(value, rmode);
   if (value >= kWMaxInt) {
@@ -3934,7 +4094,7 @@
   } else if (value < kWMinInt) {
     return kWMinInt;
   }
-  return std::isnan(value) ? 0 : static_cast<int32_t>(value);
+  return IsNaN(value) ? 0 : static_cast<int32_t>(value);
 }
 
 
@@ -3945,7 +4105,18 @@
   } else if (value < kXMinInt) {
     return kXMinInt;
   }
-  return std::isnan(value) ? 0 : static_cast<int64_t>(value);
+  return IsNaN(value) ? 0 : static_cast<int64_t>(value);
+}
+
+
+uint16_t Simulator::FPToUInt16(double value, FPRounding rmode) {
+  value = FPRoundInt(value, rmode);
+  if (value >= kHMaxUInt) {
+    return kHMaxUInt;
+  } else if (value < 0.0) {
+    return 0;
+  }
+  return IsNaN(value) ? 0 : static_cast<uint16_t>(value);
 }
 
 
@@ -3956,7 +4127,7 @@
   } else if (value < 0.0) {
     return 0;
   }
-  return std::isnan(value) ? 0 : static_cast<uint32_t>(value);
+  return IsNaN(value) ? 0 : static_cast<uint32_t>(value);
 }
 
 
@@ -3967,7 +4138,7 @@
   } else if (value < 0.0) {
     return 0;
   }
-  return std::isnan(value) ? 0 : static_cast<uint64_t>(value);
+  return IsNaN(value) ? 0 : static_cast<uint64_t>(value);
 }
 
 
@@ -3984,7 +4155,7 @@
       T result;                                                  \
       if (PROCNAN) {                                             \
         result = FPProcessNaNs(op1, op2);                        \
-        if (!std::isnan(result)) {                               \
+        if (!IsNaN(result)) {                                    \
           result = OP(op1, op2);                                 \
         }                                                        \
       } else {                                                   \
@@ -3999,7 +4170,9 @@
                                LogicVRegister dst,               \
                                const LogicVRegister& src1,       \
                                const LogicVRegister& src2) {     \
-    if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {          \
+    if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {          \
+      FN<SimFloat16>(vform, dst, src1, src2);                    \
+    } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {   \
       FN<float>(vform, dst, src1, src2);                         \
     } else {                                                     \
       VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize); \
@@ -4031,7 +4204,7 @@
     T op1 = -src1.Float<T>(i);
     T op2 = src2.Float<T>(i);
     T result = FPProcessNaNs(op1, op2);
-    dst.SetFloat(i, std::isnan(result) ? result : FPRecipStepFused(op1, op2));
+    dst.SetFloat(i, IsNaN(result) ? result : FPRecipStepFused(op1, op2));
   }
   return dst;
 }
@@ -4041,7 +4214,9 @@
                                  LogicVRegister dst,
                                  const LogicVRegister& src1,
                                  const LogicVRegister& src2) {
-  if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
+  if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
+    frecps<SimFloat16>(vform, dst, src1, src2);
+  } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
     frecps<float>(vform, dst, src1, src2);
   } else {
     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
@@ -4061,7 +4236,7 @@
     T op1 = -src1.Float<T>(i);
     T op2 = src2.Float<T>(i);
     T result = FPProcessNaNs(op1, op2);
-    dst.SetFloat(i, std::isnan(result) ? result : FPRSqrtStepFused(op1, op2));
+    dst.SetFloat(i, IsNaN(result) ? result : FPRSqrtStepFused(op1, op2));
   }
   return dst;
 }
@@ -4071,7 +4246,9 @@
                                   LogicVRegister dst,
                                   const LogicVRegister& src1,
                                   const LogicVRegister& src2) {
-  if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
+  if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
+    frsqrts<SimFloat16>(vform, dst, src1, src2);
+  } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
     frsqrts<float>(vform, dst, src1, src2);
   } else {
     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
@@ -4093,7 +4270,7 @@
     T op1 = src1.Float<T>(i);
     T op2 = src2.Float<T>(i);
     T nan_result = FPProcessNaNs(op1, op2);
-    if (!std::isnan(nan_result)) {
+    if (!IsNaN(nan_result)) {
       switch (cond) {
         case eq:
           result = (op1 == op2);
@@ -4126,7 +4303,9 @@
                                const LogicVRegister& src1,
                                const LogicVRegister& src2,
                                Condition cond) {
-  if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
+  if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
+    fcmp<SimFloat16>(vform, dst, src1, src2, cond);
+  } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
     fcmp<float>(vform, dst, src1, src2, cond);
   } else {
     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
@@ -4141,7 +4320,11 @@
                                     const LogicVRegister& src,
                                     Condition cond) {
   SimVRegister temp;
-  if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
+  if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
+    LogicVRegister zero_reg =
+        dup_immediate(vform, temp, Float16ToRawbits(SimFloat16(0.0)));
+    fcmp<SimFloat16>(vform, dst, src, zero_reg, cond);
+  } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
     LogicVRegister zero_reg = dup_immediate(vform, temp, FloatToRawbits(0.0));
     fcmp<float>(vform, dst, src, zero_reg, cond);
   } else {
@@ -4159,7 +4342,11 @@
                                   const LogicVRegister& src2,
                                   Condition cond) {
   SimVRegister temp1, temp2;
-  if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
+  if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
+    LogicVRegister abs_src1 = fabs_<SimFloat16>(vform, temp1, src1);
+    LogicVRegister abs_src2 = fabs_<SimFloat16>(vform, temp2, src2);
+    fcmp<SimFloat16>(vform, dst, abs_src1, abs_src2, cond);
+  } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
     LogicVRegister abs_src1 = fabs_<float>(vform, temp1, src1);
     LogicVRegister abs_src2 = fabs_<float>(vform, temp2, src2);
     fcmp<float>(vform, dst, abs_src1, abs_src2, cond);
@@ -4194,7 +4381,9 @@
                                LogicVRegister dst,
                                const LogicVRegister& src1,
                                const LogicVRegister& src2) {
-  if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
+  if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
+    fmla<SimFloat16>(vform, dst, src1, src2);
+  } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
     fmla<float>(vform, dst, src1, src2);
   } else {
     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
@@ -4225,7 +4414,9 @@
                                LogicVRegister dst,
                                const LogicVRegister& src1,
                                const LogicVRegister& src2) {
-  if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
+  if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
+    fmls<SimFloat16>(vform, dst, src1, src2);
+  } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
     fmls<float>(vform, dst, src1, src2);
   } else {
     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
@@ -4252,7 +4443,9 @@
 LogicVRegister Simulator::fneg(VectorFormat vform,
                                LogicVRegister dst,
                                const LogicVRegister& src) {
-  if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
+  if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
+    fneg<SimFloat16>(vform, dst, src);
+  } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
     fneg<float>(vform, dst, src);
   } else {
     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
@@ -4281,7 +4474,9 @@
 LogicVRegister Simulator::fabs_(VectorFormat vform,
                                 LogicVRegister dst,
                                 const LogicVRegister& src) {
-  if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
+  if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
+    fabs_<SimFloat16>(vform, dst, src);
+  } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
     fabs_<float>(vform, dst, src);
   } else {
     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
@@ -4306,7 +4501,12 @@
                                 LogicVRegister dst,
                                 const LogicVRegister& src) {
   dst.ClearForWrite(vform);
-  if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
+  if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
+    for (int i = 0; i < LaneCountFromFormat(vform); i++) {
+      SimFloat16 result = FPSqrt(src.Float<SimFloat16>(i));
+      dst.SetFloat(i, result);
+    }
+  } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
     for (int i = 0; i < LaneCountFromFormat(vform); i++) {
       float result = FPSqrt(src.Float<float>(i));
       dst.SetFloat(i, result);
@@ -4322,47 +4522,58 @@
 }
 
 
-#define DEFINE_NEON_FP_PAIR_OP(FNP, FN, OP)                           \
-  LogicVRegister Simulator::FNP(VectorFormat vform,                   \
-                                LogicVRegister dst,                   \
-                                const LogicVRegister& src1,           \
-                                const LogicVRegister& src2) {         \
-    SimVRegister temp1, temp2;                                        \
-    uzp1(vform, temp1, src1, src2);                                   \
-    uzp2(vform, temp2, src1, src2);                                   \
-    FN(vform, dst, temp1, temp2);                                     \
-    return dst;                                                       \
-  }                                                                   \
-                                                                      \
-  LogicVRegister Simulator::FNP(VectorFormat vform,                   \
-                                LogicVRegister dst,                   \
-                                const LogicVRegister& src) {          \
-    if (vform == kFormatS) {                                          \
-      float result = OP(src.Float<float>(0), src.Float<float>(1));    \
-      dst.SetFloat(0, result);                                        \
-    } else {                                                          \
-      VIXL_ASSERT(vform == kFormatD);                                 \
-      double result = OP(src.Float<double>(0), src.Float<double>(1)); \
-      dst.SetFloat(0, result);                                        \
-    }                                                                 \
-    dst.ClearForWrite(vform);                                         \
-    return dst;                                                       \
+#define DEFINE_NEON_FP_PAIR_OP(FNP, FN, OP)                                    \
+  LogicVRegister Simulator::FNP(VectorFormat vform,                            \
+                                LogicVRegister dst,                            \
+                                const LogicVRegister& src1,                    \
+                                const LogicVRegister& src2) {                  \
+    SimVRegister temp1, temp2;                                                 \
+    uzp1(vform, temp1, src1, src2);                                            \
+    uzp2(vform, temp2, src1, src2);                                            \
+    FN(vform, dst, temp1, temp2);                                              \
+    return dst;                                                                \
+  }                                                                            \
+                                                                               \
+  LogicVRegister Simulator::FNP(VectorFormat vform,                            \
+                                LogicVRegister dst,                            \
+                                const LogicVRegister& src) {                   \
+    if (vform == kFormatH) {                                                   \
+      SimFloat16 result(OP(SimFloat16(RawbitsToFloat16(src.Uint(vform, 0))),   \
+                           SimFloat16(RawbitsToFloat16(src.Uint(vform, 1))))); \
+      dst.SetUint(vform, 0, Float16ToRawbits(result));                         \
+    } else if (vform == kFormatS) {                                            \
+      float result = OP(src.Float<float>(0), src.Float<float>(1));             \
+      dst.SetFloat(0, result);                                                 \
+    } else {                                                                   \
+      VIXL_ASSERT(vform == kFormatD);                                          \
+      double result = OP(src.Float<double>(0), src.Float<double>(1));          \
+      dst.SetFloat(0, result);                                                 \
+    }                                                                          \
+    dst.ClearForWrite(vform);                                                  \
+    return dst;                                                                \
   }
 NEON_FPPAIRWISE_LIST(DEFINE_NEON_FP_PAIR_OP)
 #undef DEFINE_NEON_FP_PAIR_OP
 
-
+template <typename T>
 LogicVRegister Simulator::fminmaxv(VectorFormat vform,
                                    LogicVRegister dst,
                                    const LogicVRegister& src,
-                                   FPMinMaxOp Op) {
-  VIXL_ASSERT(vform == kFormat4S);
+                                   typename TFPMinMaxOp<T>::type Op) {
+  VIXL_ASSERT((vform == kFormat4H) || (vform == kFormat8H) ||
+              (vform == kFormat4S));
   USE(vform);
-  float result1 = (this->*Op)(src.Float<float>(0), src.Float<float>(1));
-  float result2 = (this->*Op)(src.Float<float>(2), src.Float<float>(3));
-  float result = (this->*Op)(result1, result2);
-  dst.ClearForWrite(kFormatS);
-  dst.SetFloat<float>(0, result);
+  T result1 = (this->*Op)(src.Float<T>(0), src.Float<T>(1));
+  T result2 = (this->*Op)(src.Float<T>(2), src.Float<T>(3));
+  if (vform == kFormat8H) {
+    T result3 = (this->*Op)(src.Float<T>(4), src.Float<T>(5));
+    T result4 = (this->*Op)(src.Float<T>(6), src.Float<T>(7));
+    result1 = (this->*Op)(result1, result3);
+    result2 = (this->*Op)(result2, result4);
+  }
+  T result = (this->*Op)(result1, result2);
+  dst.ClearForWrite(ScalarFormatFromFormat(vform));
+  dst.SetFloat<T>(0, result);
   return dst;
 }
 
@@ -4370,28 +4581,50 @@
 LogicVRegister Simulator::fmaxv(VectorFormat vform,
                                 LogicVRegister dst,
                                 const LogicVRegister& src) {
-  return fminmaxv(vform, dst, src, &Simulator::FPMax);
+  if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
+    return fminmaxv<SimFloat16>(vform, dst, src, &Simulator::FPMax<SimFloat16>);
+  } else {
+    return fminmaxv<float>(vform, dst, src, &Simulator::FPMax<float>);
+  }
 }
 
 
 LogicVRegister Simulator::fminv(VectorFormat vform,
                                 LogicVRegister dst,
                                 const LogicVRegister& src) {
-  return fminmaxv(vform, dst, src, &Simulator::FPMin);
+  if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
+    return fminmaxv<SimFloat16>(vform, dst, src, &Simulator::FPMin<SimFloat16>);
+  } else {
+    return fminmaxv<float>(vform, dst, src, &Simulator::FPMin<float>);
+  }
 }
 
 
 LogicVRegister Simulator::fmaxnmv(VectorFormat vform,
                                   LogicVRegister dst,
                                   const LogicVRegister& src) {
-  return fminmaxv(vform, dst, src, &Simulator::FPMaxNM);
+  if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
+    return fminmaxv<SimFloat16>(vform,
+                                dst,
+                                src,
+                                &Simulator::FPMaxNM<SimFloat16>);
+  } else {
+    return fminmaxv<float>(vform, dst, src, &Simulator::FPMaxNM<float>);
+  }
 }
 
 
 LogicVRegister Simulator::fminnmv(VectorFormat vform,
                                   LogicVRegister dst,
                                   const LogicVRegister& src) {
-  return fminmaxv(vform, dst, src, &Simulator::FPMinNM);
+  if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
+    return fminmaxv<SimFloat16>(vform,
+                                dst,
+                                src,
+                                &Simulator::FPMinNM<SimFloat16>);
+  } else {
+    return fminmaxv<float>(vform, dst, src, &Simulator::FPMinNM<float>);
+  }
 }
 
 
@@ -4402,10 +4635,12 @@
                                int index) {
   dst.ClearForWrite(vform);
   SimVRegister temp;
-  if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
+  if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
+    LogicVRegister index_reg = dup_element(kFormat8H, temp, src2, index);
+    fmul<SimFloat16>(vform, dst, src1, index_reg);
+  } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
     LogicVRegister index_reg = dup_element(kFormat4S, temp, src2, index);
     fmul<float>(vform, dst, src1, index_reg);
-
   } else {
     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
     LogicVRegister index_reg = dup_element(kFormat2D, temp, src2, index);
@@ -4422,10 +4657,12 @@
                                int index) {
   dst.ClearForWrite(vform);
   SimVRegister temp;
-  if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
+  if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
+    LogicVRegister index_reg = dup_element(kFormat8H, temp, src2, index);
+    fmla<SimFloat16>(vform, dst, src1, index_reg);
+  } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
     LogicVRegister index_reg = dup_element(kFormat4S, temp, src2, index);
     fmla<float>(vform, dst, src1, index_reg);
-
   } else {
     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
     LogicVRegister index_reg = dup_element(kFormat2D, temp, src2, index);
@@ -4442,10 +4679,12 @@
                                int index) {
   dst.ClearForWrite(vform);
   SimVRegister temp;
-  if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
+  if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
+    LogicVRegister index_reg = dup_element(kFormat8H, temp, src2, index);
+    fmls<SimFloat16>(vform, dst, src1, index_reg);
+  } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
     LogicVRegister index_reg = dup_element(kFormat4S, temp, src2, index);
     fmls<float>(vform, dst, src1, index_reg);
-
   } else {
     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
     LogicVRegister index_reg = dup_element(kFormat2D, temp, src2, index);
@@ -4462,10 +4701,12 @@
                                 int index) {
   dst.ClearForWrite(vform);
   SimVRegister temp;
-  if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
+  if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
+    LogicVRegister index_reg = dup_element(kFormat8H, temp, src2, index);
+    fmulx<SimFloat16>(vform, dst, src1, index_reg);
+  } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
     LogicVRegister index_reg = dup_element(kFormat4S, temp, src2, index);
     fmulx<float>(vform, dst, src1, index_reg);
-
   } else {
     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
     LogicVRegister index_reg = dup_element(kFormat2D, temp, src2, index);
@@ -4481,11 +4722,20 @@
                                 FPRounding rounding_mode,
                                 bool inexact_exception) {
   dst.ClearForWrite(vform);
-  if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
+  if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
+    for (int i = 0; i < LaneCountFromFormat(vform); i++) {
+      SimFloat16 input = src.Float<SimFloat16>(i);
+      SimFloat16 rounded = FPRoundInt(input, rounding_mode);
+      if (inexact_exception && !IsNaN(input) && (input != rounded)) {
+        FPProcessException();
+      }
+      dst.SetFloat<SimFloat16>(i, rounded);
+    }
+  } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
     for (int i = 0; i < LaneCountFromFormat(vform); i++) {
       float input = src.Float<float>(i);
       float rounded = FPRoundInt(input, rounding_mode);
-      if (inexact_exception && !std::isnan(input) && (input != rounded)) {
+      if (inexact_exception && !IsNaN(input) && (input != rounded)) {
         FPProcessException();
       }
       dst.SetFloat<float>(i, rounded);
@@ -4495,7 +4745,7 @@
     for (int i = 0; i < LaneCountFromFormat(vform); i++) {
       double input = src.Float<double>(i);
       double rounded = FPRoundInt(input, rounding_mode);
-      if (inexact_exception && !std::isnan(input) && (input != rounded)) {
+      if (inexact_exception && !IsNaN(input) && (input != rounded)) {
         FPProcessException();
       }
       dst.SetFloat<double>(i, rounded);
@@ -4511,7 +4761,13 @@
                                 FPRounding rounding_mode,
                                 int fbits) {
   dst.ClearForWrite(vform);
-  if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
+  if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
+    for (int i = 0; i < LaneCountFromFormat(vform); i++) {
+      SimFloat16 op =
+          static_cast<double>(src.Float<SimFloat16>(i)) * std::pow(2.0, fbits);
+      dst.SetInt(vform, i, FPToInt16(op, rounding_mode));
+    }
+  } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
     for (int i = 0; i < LaneCountFromFormat(vform); i++) {
       float op = src.Float<float>(i) * std::pow(2.0f, fbits);
       dst.SetInt(vform, i, FPToInt32(op, rounding_mode));
@@ -4533,7 +4789,13 @@
                                 FPRounding rounding_mode,
                                 int fbits) {
   dst.ClearForWrite(vform);
-  if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
+  if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
+    for (int i = 0; i < LaneCountFromFormat(vform); i++) {
+      SimFloat16 op =
+          static_cast<double>(src.Float<SimFloat16>(i)) * std::pow(2.0, fbits);
+      dst.SetUint(vform, i, FPToUInt16(op, rounding_mode));
+    }
+  } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
     for (int i = 0; i < LaneCountFromFormat(vform); i++) {
       float op = src.Float<float>(i) * std::pow(2.0f, fbits);
       dst.SetUint(vform, i, FPToUInt32(op, rounding_mode));
@@ -4554,7 +4816,10 @@
                                 const LogicVRegister& src) {
   if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
     for (int i = LaneCountFromFormat(vform) - 1; i >= 0; i--) {
-      dst.SetFloat(i, FPToFloat(src.Float<float16>(i), ReadDN()));
+      // TODO: Full support for SimFloat16 in SimRegister(s).
+      dst.SetFloat(i,
+                   FPToFloat(RawbitsToFloat16(src.Float<uint16_t>(i)),
+                             ReadDN()));
     }
   } else {
     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
@@ -4572,7 +4837,11 @@
   int lane_count = LaneCountFromFormat(vform);
   if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
     for (int i = 0; i < lane_count; i++) {
-      dst.SetFloat(i, FPToFloat(src.Float<float16>(i + lane_count), ReadDN()));
+      // TODO: Full support for SimFloat16 in SimRegister(s).
+      dst.SetFloat(i,
+                   FPToFloat(RawbitsToFloat16(
+                                 src.Float<uint16_t>(i + lane_count)),
+                             ReadDN()));
     }
   } else {
     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
@@ -4589,7 +4858,9 @@
                                 const LogicVRegister& src) {
   if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
     for (int i = 0; i < LaneCountFromFormat(vform); i++) {
-      dst.SetFloat(i, FPToFloat16(src.Float<float>(i), FPTieEven, ReadDN()));
+      dst.SetFloat(i,
+                   Float16ToRawbits(
+                       FPToFloat16(src.Float<float>(i), FPTieEven, ReadDN())));
     }
   } else {
     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kSRegSize);
@@ -4608,7 +4879,8 @@
   if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
     for (int i = lane_count - 1; i >= 0; i--) {
       dst.SetFloat(i + lane_count,
-                   FPToFloat16(src.Float<float>(i), FPTieEven, ReadDN()));
+                   Float16ToRawbits(
+                       FPToFloat16(src.Float<float>(i), FPTieEven, ReadDN())));
     }
   } else {
     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kSRegSize);
@@ -4669,7 +4941,7 @@
 
 template <typename T>
 T Simulator::FPRecipSqrtEstimate(T op) {
-  if (std::isnan(op)) {
+  if (IsNaN(op)) {
     return FPProcessNaN(op);
   } else if (op == 0.0) {
     if (copysign(1.0, op) < 0.0) {
@@ -4680,17 +4952,22 @@
   } else if (copysign(1.0, op) < 0.0) {
     FPProcessException();
     return FPDefaultNaN<T>();
-  } else if (std::isinf(op)) {
+  } else if (IsInf(op)) {
     return 0.0;
   } else {
     uint64_t fraction;
     int exp, result_exp;
 
-    if (sizeof(T) == sizeof(float)) {  // NOLINT(runtime/sizeof)
+    if (IsFloat16<T>()) {
+      exp = Float16Exp(op);
+      fraction = Float16Mantissa(op);
+      fraction <<= 42;
+    } else if (IsFloat32<T>()) {
       exp = FloatExp(op);
       fraction = FloatMantissa(op);
       fraction <<= 29;
     } else {
+      VIXL_ASSERT(IsFloat64<T>());
       exp = DoubleExp(op);
       fraction = DoubleMantissa(op);
     }
@@ -4710,19 +4987,27 @@
       scaled = DoublePack(0, 1021, Bits(fraction, 51, 44) << 44);
     }
 
-    if (sizeof(T) == sizeof(float)) {  // NOLINT(runtime/sizeof)
+    if (IsFloat16<T>()) {
+      result_exp = (44 - exp) / 2;
+    } else if (IsFloat32<T>()) {
       result_exp = (380 - exp) / 2;
     } else {
+      VIXL_ASSERT(IsFloat64<T>());
       result_exp = (3068 - exp) / 2;
     }
 
     uint64_t estimate = DoubleToRawbits(recip_sqrt_estimate(scaled));
 
-    if (sizeof(T) == sizeof(float)) {  // NOLINT(runtime/sizeof)
+    if (IsFloat16<T>()) {
+      uint16_t exp_bits = static_cast<uint16_t>(Bits(result_exp, 4, 0));
+      uint16_t est_bits = static_cast<uint16_t>(Bits(estimate, 51, 42));
+      return Float16Pack(0, exp_bits, est_bits);
+    } else if (IsFloat32<T>()) {
       uint32_t exp_bits = static_cast<uint32_t>(Bits(result_exp, 7, 0));
       uint32_t est_bits = static_cast<uint32_t>(Bits(estimate, 51, 29));
       return FloatPack(0, exp_bits, est_bits);
     } else {
+      VIXL_ASSERT(IsFloat64<T>());
       return DoublePack(0, Bits(result_exp, 10, 0), Bits(estimate, 51, 0));
     }
   }
@@ -4733,7 +5018,12 @@
                                   LogicVRegister dst,
                                   const LogicVRegister& src) {
   dst.ClearForWrite(vform);
-  if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
+  if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
+    for (int i = 0; i < LaneCountFromFormat(vform); i++) {
+      SimFloat16 input = src.Float<SimFloat16>(i);
+      dst.SetFloat(i, FPRecipSqrtEstimate<SimFloat16>(input));
+    }
+  } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
     for (int i = 0; i < LaneCountFromFormat(vform); i++) {
       float input = src.Float<float>(i);
       dst.SetFloat(i, FPRecipSqrtEstimate<float>(input));
@@ -4752,23 +5042,25 @@
 T Simulator::FPRecipEstimate(T op, FPRounding rounding) {
   uint32_t sign;
 
-  if (sizeof(T) == sizeof(float)) {  // NOLINT(runtime/sizeof)
+  if (IsFloat16<T>()) {
+    sign = Float16Sign(op);
+  } else if (IsFloat32<T>()) {
     sign = FloatSign(op);
   } else {
+    VIXL_ASSERT(IsFloat64<T>());
     sign = DoubleSign(op);
   }
 
-  if (std::isnan(op)) {
+  if (IsNaN(op)) {
     return FPProcessNaN(op);
-  } else if (std::isinf(op)) {
+  } else if (IsInf(op)) {
     return (sign == 1) ? -0.0 : 0.0;
   } else if (op == 0.0) {
     FPProcessException();  // FPExc_DivideByZero exception.
     return (sign == 1) ? kFP64NegativeInfinity : kFP64PositiveInfinity;
-  } else if (((sizeof(T) == sizeof(float)) &&  // NOLINT(runtime/sizeof)
-              (std::fabs(op) < std::pow(2.0, -128.0))) ||
-             ((sizeof(T) == sizeof(double)) &&  // NOLINT(runtime/sizeof)
-              (std::fabs(op) < std::pow(2.0, -1024.0)))) {
+  } else if ((IsFloat16<T>() && (std::fabs(op) < std::pow(2.0, -16.0))) ||
+             (IsFloat32<T>() && (std::fabs(op) < std::pow(2.0, -128.0))) ||
+             (IsFloat64<T>() && (std::fabs(op) < std::pow(2.0, -1024.0)))) {
     bool overflow_to_inf = false;
     switch (rounding) {
       case FPTieEven:
@@ -4791,9 +5083,12 @@
       return (sign == 1) ? kFP64NegativeInfinity : kFP64PositiveInfinity;
     } else {
       // Return FPMaxNormal(sign).
-      if (sizeof(T) == sizeof(float)) {  // NOLINT(runtime/sizeof)
+      if (IsFloat16<T>()) {
+        return Float16Pack(sign, 0x1f, 0x3ff);
+      } else if (IsFloat32<T>()) {
         return FloatPack(sign, 0xfe, 0x07fffff);
       } else {
+        VIXL_ASSERT(IsFloat64<T>());
         return DoublePack(sign, 0x7fe, 0x0fffffffffffffl);
       }
     }
@@ -4802,12 +5097,18 @@
     int exp, result_exp;
     uint32_t sign;
 
-    if (sizeof(T) == sizeof(float)) {  // NOLINT(runtime/sizeof)
+    if (IsFloat16<T>()) {
+      sign = Float16Sign(op);
+      exp = Float16Exp(op);
+      fraction = Float16Mantissa(op);
+      fraction <<= 42;
+    } else if (IsFloat32<T>()) {
       sign = FloatSign(op);
       exp = FloatExp(op);
       fraction = FloatMantissa(op);
       fraction <<= 29;
     } else {
+      VIXL_ASSERT(IsFloat64<T>());
       sign = DoubleSign(op);
       exp = DoubleExp(op);
       fraction = DoubleMantissa(op);
@@ -4824,9 +5125,12 @@
 
     double scaled = DoublePack(0, 1022, Bits(fraction, 51, 44) << 44);
 
-    if (sizeof(T) == sizeof(float)) {  // NOLINT(runtime/sizeof)
-      result_exp = (253 - exp);        // In range 253-254 = -1 to 253+1 = 254.
+    if (IsFloat16<T>()) {
+      result_exp = (29 - exp);  // In range 29-30 = -1 to 29+1 = 30.
+    } else if (IsFloat32<T>()) {
+      result_exp = (253 - exp);  // In range 253-254 = -1 to 253+1 = 254.
     } else {
+      VIXL_ASSERT(IsFloat64<T>());
       result_exp = (2045 - exp);  // In range 2045-2046 = -1 to 2045+1 = 2046.
     }
 
@@ -4839,11 +5143,16 @@
       fraction = (UINT64_C(1) << 50) | Bits(fraction, 51, 2);
       result_exp = 0;
     }
-    if (sizeof(T) == sizeof(float)) {  // NOLINT(runtime/sizeof)
+    if (IsFloat16<T>()) {
+      uint16_t exp_bits = static_cast<uint16_t>(Bits(result_exp, 4, 0));
+      uint16_t frac_bits = static_cast<uint16_t>(Bits(fraction, 51, 42));
+      return Float16Pack(sign, exp_bits, frac_bits);
+    } else if (IsFloat32<T>()) {
       uint32_t exp_bits = static_cast<uint32_t>(Bits(result_exp, 7, 0));
       uint32_t frac_bits = static_cast<uint32_t>(Bits(fraction, 51, 29));
       return FloatPack(sign, exp_bits, frac_bits);
     } else {
+      VIXL_ASSERT(IsFloat64<T>());
       return DoublePack(sign, Bits(result_exp, 10, 0), Bits(fraction, 51, 0));
     }
   }
@@ -4855,7 +5164,12 @@
                                  const LogicVRegister& src,
                                  FPRounding round) {
   dst.ClearForWrite(vform);
-  if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
+  if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
+    for (int i = 0; i < LaneCountFromFormat(vform); i++) {
+      SimFloat16 input = src.Float<SimFloat16>(i);
+      dst.SetFloat(i, FPRecipEstimate<SimFloat16>(input, round));
+    }
+  } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
     for (int i = 0; i < LaneCountFromFormat(vform); i++) {
       float input = src.Float<float>(i);
       dst.SetFloat(i, FPRecipEstimate<float>(input, round));
@@ -4933,17 +5247,23 @@
   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
     T op = src.Float<T>(i);
     T result;
-    if (std::isnan(op)) {
+    if (IsNaN(op)) {
       result = FPProcessNaN(op);
     } else {
       int exp;
       uint32_t sign;
-      if (sizeof(T) == sizeof(float)) {  // NOLINT(runtime/sizeof)
+      if (IsFloat16<T>()) {
+        sign = Float16Sign(op);
+        exp = Float16Exp(op);
+        exp = (exp == 0) ? (0x1F - 1) : static_cast<int>(Bits(~exp, 4, 0));
+        result = Float16Pack(sign, exp, 0);
+      } else if (IsFloat32<T>()) {
         sign = FloatSign(op);
         exp = FloatExp(op);
         exp = (exp == 0) ? (0xFF - 1) : static_cast<int>(Bits(~exp, 7, 0));
         result = FloatPack(sign, exp, 0);
       } else {
+        VIXL_ASSERT(IsFloat64<T>());
         sign = DoubleSign(op);
         exp = DoubleExp(op);
         exp = (exp == 0) ? (0x7FF - 1) : static_cast<int>(Bits(~exp, 10, 0));
@@ -4959,7 +5279,9 @@
 LogicVRegister Simulator::frecpx(VectorFormat vform,
                                  LogicVRegister dst,
                                  const LogicVRegister& src) {
-  if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
+  if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
+    frecpx<SimFloat16>(vform, dst, src);
+  } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
     frecpx<float>(vform, dst, src);
   } else {
     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
@@ -4974,7 +5296,10 @@
                                 int fbits,
                                 FPRounding round) {
   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
-    if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
+    if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
+      SimFloat16 result = FixedToFloat16(src.Int(kFormatH, i), fbits, round);
+      dst.SetFloat<SimFloat16>(i, result);
+    } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
       float result = FixedToFloat(src.Int(kFormatS, i), fbits, round);
       dst.SetFloat<float>(i, result);
     } else {
@@ -4993,7 +5318,10 @@
                                 int fbits,
                                 FPRounding round) {
   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
-    if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
+    if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
+      SimFloat16 result = UFixedToFloat16(src.Uint(kFormatH, i), fbits, round);
+      dst.SetFloat<SimFloat16>(i, result);
+    } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
       float result = UFixedToFloat(src.Uint(kFormatS, i), fbits, round);
       dst.SetFloat<float>(i, result);
     } else {
diff --git a/src/aarch64/macro-assembler-aarch64.cc b/src/aarch64/macro-assembler-aarch64.cc
index f13d15a..e881a81 100644
--- a/src/aarch64/macro-assembler-aarch64.cc
+++ b/src/aarch64/macro-assembler-aarch64.cc
@@ -1485,7 +1485,7 @@
   MacroEmissionCheckScope guard(this);
 
   if (vd.Is1H() || vd.Is4H() || vd.Is8H()) {
-    Fmov(vd, F16(imm));
+    Fmov(vd, Float16(imm));
     return;
   }
 
@@ -1522,7 +1522,7 @@
   MacroEmissionCheckScope guard(this);
 
   if (vd.Is1H() || vd.Is4H() || vd.Is8H()) {
-    Fmov(vd, F16(imm));
+    Fmov(vd, Float16(imm));
     return;
   }
 
@@ -1553,23 +1553,23 @@
 }
 
 
-void MacroAssembler::Fmov(VRegister vd, F16 imm) {
+void MacroAssembler::Fmov(VRegister vd, Float16 imm) {
   VIXL_ASSERT(allow_macro_instructions_);
   MacroEmissionCheckScope guard(this);
 
   if (vd.Is1S() || vd.Is2S() || vd.Is4S()) {
-    Fmov(vd, static_cast<float>(imm));
+    Fmov(vd, FPToFloat(imm, kIgnoreDefaultNaN));
     return;
   }
 
   if (vd.Is1D() || vd.Is2D()) {
-    Fmov(vd, static_cast<double>(imm));
+    Fmov(vd, FPToDouble(imm, kIgnoreDefaultNaN));
     return;
   }
 
   VIXL_ASSERT(vd.Is1H() || vd.Is4H() || vd.Is8H());
-  uint16_t rawbits = imm.ToRawbits();
-  if (IsImmFP16(rawbits)) {
+  uint16_t rawbits = Float16ToRawbits(imm);
+  if (IsImmFP16(imm)) {
     fmov(vd, imm);
   } else {
     if (vd.IsScalar()) {
@@ -2966,7 +2966,7 @@
 
   const CPURegister regs[] = {reg1, reg2, reg3, reg4};
 
-  for (unsigned i = 0; i < (sizeof(regs) / sizeof(regs[0])); i++) {
+  for (size_t i = 0; i < ArrayLength(regs); i++) {
     if (regs[i].IsRegister()) {
       exclude |= regs[i].GetBit();
     } else if (regs[i].IsFPRegister()) {
diff --git a/src/aarch64/macro-assembler-aarch64.h b/src/aarch64/macro-assembler-aarch64.h
index 1dc0c0e..88ed557 100644
--- a/src/aarch64/macro-assembler-aarch64.h
+++ b/src/aarch64/macro-assembler-aarch64.h
@@ -44,7 +44,6 @@
 // is needed regardless of whether the simulator is included or not, since
 // generating simulator specific instructions is controlled at runtime.
 #include "simulator-constants-aarch64.h"
-#include "utils-aarch64.h"
 
 
 #define LS_MACRO_LIST(V)                                     \
@@ -1036,6 +1035,12 @@
     SingleEmissionCheckScope guard(this);
     bfi(rd, rn, lsb, width);
   }
+  void Bfc(const Register& rd, unsigned lsb, unsigned width) {
+    VIXL_ASSERT(allow_macro_instructions_);
+    VIXL_ASSERT(!rd.IsZero());
+    SingleEmissionCheckScope guard(this);
+    bfc(rd, lsb, width);
+  }
   void Bfxil(const Register& rd,
              const Register& rn,
              unsigned lsb,
@@ -1066,6 +1071,56 @@
     SingleEmissionCheckScope guard(this);
     br(xn);
   }
+  void Braaz(const Register& xn) {
+    VIXL_ASSERT(allow_macro_instructions_);
+    SingleEmissionCheckScope guard(this);
+    braaz(xn);
+  }
+  void Brabz(const Register& xn) {
+    VIXL_ASSERT(allow_macro_instructions_);
+    SingleEmissionCheckScope guard(this);
+    brabz(xn);
+  }
+  void Blraaz(const Register& xn) {
+    VIXL_ASSERT(allow_macro_instructions_);
+    SingleEmissionCheckScope guard(this);
+    blraaz(xn);
+  }
+  void Blrabz(const Register& xn) {
+    VIXL_ASSERT(allow_macro_instructions_);
+    SingleEmissionCheckScope guard(this);
+    blrabz(xn);
+  }
+  void Retaa() {
+    VIXL_ASSERT(allow_macro_instructions_);
+    SingleEmissionCheckScope guard(this);
+    retaa();
+  }
+  void Retab() {
+    VIXL_ASSERT(allow_macro_instructions_);
+    SingleEmissionCheckScope guard(this);
+    retab();
+  }
+  void Braa(const Register& xn, const Register& xm) {
+    VIXL_ASSERT(allow_macro_instructions_);
+    SingleEmissionCheckScope guard(this);
+    braa(xn, xm);
+  }
+  void Brab(const Register& xn, const Register& xm) {
+    VIXL_ASSERT(allow_macro_instructions_);
+    SingleEmissionCheckScope guard(this);
+    brab(xn, xm);
+  }
+  void Blraa(const Register& xn, const Register& xm) {
+    VIXL_ASSERT(allow_macro_instructions_);
+    SingleEmissionCheckScope guard(this);
+    blraa(xn, xm);
+  }
+  void Blrab(const Register& xn, const Register& xm) {
+    VIXL_ASSERT(allow_macro_instructions_);
+    SingleEmissionCheckScope guard(this);
+    blrab(xn, xm);
+  }
   void Brk(int code = 0) {
     VIXL_ASSERT(allow_macro_instructions_);
     SingleEmissionCheckScope guard(this);
@@ -1087,6 +1142,79 @@
     SingleEmissionCheckScope guard(this);
     cinv(rd, rn, cond);
   }
+
+#define PAUTH_SYSTEM_MODES(V) \
+  V(az)                       \
+  V(bz)                       \
+  V(asp)                      \
+  V(bsp)
+
+#define DEFINE_MACRO_ASM_FUNCS(SUFFIX)      \
+  void Paci##SUFFIX() {                     \
+    VIXL_ASSERT(allow_macro_instructions_); \
+    SingleEmissionCheckScope guard(this);   \
+    paci##SUFFIX();                         \
+  }                                         \
+  void Auti##SUFFIX() {                     \
+    VIXL_ASSERT(allow_macro_instructions_); \
+    SingleEmissionCheckScope guard(this);   \
+    auti##SUFFIX();                         \
+  }
+
+  PAUTH_SYSTEM_MODES(DEFINE_MACRO_ASM_FUNCS)
+#undef DEFINE_MACRO_ASM_FUNCS
+
+  // The 1716 pac and aut instructions encourage people to use x16 and x17
+  // directly, perhaps without realising that these are reserved as scratch
+  // registers by the MacroAssembler. For example:
+  //
+  //     UseScratchRegisterScope temps(&masm);
+  //     Register temp = temps.AcquireX();  // temp will be x16
+  //     __ Mov(x17, ptr);
+  //     __ Mov(x16, modifier);  // Will override temp!
+  //     __ Pacia1716();
+  //
+  // To work around this issue, you must exclude x16 and x17 from the scratch
+  // register list. You may need to replace them with other registers:
+  //
+  //     UseScratchRegisterScope temps(&masm);
+  //     temps.Exclude(x16, x17);
+  //     temps.Include(x10, x11);
+  //     __ Mov(x17, ptr);
+  //     __ Mov(x16, modifier);
+  //     __ Pacia1716();
+  void Pacia1716() {
+    VIXL_ASSERT(allow_macro_instructions_);
+    VIXL_ASSERT(!GetScratchRegisterList()->IncludesAliasOf(x16));
+    VIXL_ASSERT(!GetScratchRegisterList()->IncludesAliasOf(x17));
+    SingleEmissionCheckScope guard(this);
+    pacia1716();
+  }
+  void Pacib1716() {
+    VIXL_ASSERT(allow_macro_instructions_);
+    VIXL_ASSERT(!GetScratchRegisterList()->IncludesAliasOf(x16));
+    VIXL_ASSERT(!GetScratchRegisterList()->IncludesAliasOf(x17));
+    SingleEmissionCheckScope guard(this);
+    pacib1716();
+  }
+  void Autia1716() {
+    VIXL_ASSERT(allow_macro_instructions_);
+    VIXL_ASSERT(!GetScratchRegisterList()->IncludesAliasOf(x16));
+    VIXL_ASSERT(!GetScratchRegisterList()->IncludesAliasOf(x17));
+    SingleEmissionCheckScope guard(this);
+    autia1716();
+  }
+  void Autib1716() {
+    VIXL_ASSERT(allow_macro_instructions_);
+    VIXL_ASSERT(!GetScratchRegisterList()->IncludesAliasOf(x16));
+    VIXL_ASSERT(!GetScratchRegisterList()->IncludesAliasOf(x17));
+    SingleEmissionCheckScope guard(this);
+    autib1716();
+  }
+  void Xpaclri() {
+    VIXL_ASSERT(allow_macro_instructions_);
+    SingleEmissionCheckScope guard(this);
+    xpaclri();
+  }
   void Clrex() {
     VIXL_ASSERT(allow_macro_instructions_);
     SingleEmissionCheckScope guard(this);
@@ -1113,6 +1241,11 @@
     SingleEmissionCheckScope guard(this);
     cneg(rd, rn, cond);
   }
+  void Esb() {
+    VIXL_ASSERT(allow_macro_instructions_);
+    SingleEmissionCheckScope guard(this);
+    esb();
+  }
   void Csdb() {
     VIXL_ASSERT(allow_macro_instructions_);
     SingleEmissionCheckScope guard(this);
@@ -1318,6 +1451,12 @@
     SingleEmissionCheckScope guard(this);
     fcvtzs(rd, vn, fbits);
   }
+  void Fjcvtzs(const Register& rd, const VRegister& vn) {
+    VIXL_ASSERT(allow_macro_instructions_);
+    VIXL_ASSERT(!rd.IsZero());
+    SingleEmissionCheckScope guard(this);
+    fjcvtzs(rd, vn);
+  }
   void Fcvtzu(const Register& rd, const VRegister& vn, int fbits = 0) {
     VIXL_ASSERT(allow_macro_instructions_);
     VIXL_ASSERT(!rd.IsZero());
@@ -1389,7 +1528,7 @@
   // signalling NaNs to quiet NaNs when converting between float and double.
   void Fmov(VRegister vd, double imm);
   void Fmov(VRegister vd, float imm);
-  void Fmov(VRegister vd, const F16 imm);
+  void Fmov(VRegister vd, const Float16 imm);
   // Provide a template to allow other types to be converted automatically.
   template <typename T>
   void Fmov(VRegister vd, T imm) {
@@ -1568,6 +1707,91 @@
   COMPARE_AND_SWAP_PAIR_MACRO_LIST(DEFINE_MACRO_ASM_FUNC)
 #undef DEFINE_MACRO_ASM_FUNC
 
+// These macros generate all the variations of the atomic memory operations,
+// e.g. ldadd, ldadda, ldaddb, staddl, etc.
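+// For example (illustrative only; the exact set of names follows from the
+// mode lists below), the expansion defines MacroAssembler methods such as:
+//
+//     __ Ldadd(w0, w1, MemOperand(x2));    // Atomic add, no ordering.
+//     __ Ldaddal(x0, x1, MemOperand(x2));  // Atomic add, acquire-release.
+//     __ Stsetlb(w0, MemOperand(x1));      // Atomic bit set, release, byte.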
+
+// clang-format off
+#define ATOMIC_MEMORY_SIMPLE_MACRO_LIST(V, DEF, MASM_PRE, ASM_PRE) \
+  V(DEF, MASM_PRE##add,  ASM_PRE##add)                             \
+  V(DEF, MASM_PRE##clr,  ASM_PRE##clr)                             \
+  V(DEF, MASM_PRE##eor,  ASM_PRE##eor)                             \
+  V(DEF, MASM_PRE##set,  ASM_PRE##set)                             \
+  V(DEF, MASM_PRE##smax, ASM_PRE##smax)                            \
+  V(DEF, MASM_PRE##smin, ASM_PRE##smin)                            \
+  V(DEF, MASM_PRE##umax, ASM_PRE##umax)                            \
+  V(DEF, MASM_PRE##umin, ASM_PRE##umin)
+
+#define ATOMIC_MEMORY_STORE_MACRO_MODES(V, MASM, ASM) \
+  V(MASM,     ASM)                                    \
+  V(MASM##l,  ASM##l)                                 \
+  V(MASM##b,  ASM##b)                                 \
+  V(MASM##lb, ASM##lb)                                \
+  V(MASM##h,  ASM##h)                                 \
+  V(MASM##lh, ASM##lh)
+
+#define ATOMIC_MEMORY_LOAD_MACRO_MODES(V, MASM, ASM) \
+  ATOMIC_MEMORY_STORE_MACRO_MODES(V, MASM, ASM)      \
+  V(MASM##a,   ASM##a)                               \
+  V(MASM##al,  ASM##al)                              \
+  V(MASM##ab,  ASM##ab)                              \
+  V(MASM##alb, ASM##alb)                             \
+  V(MASM##ah,  ASM##ah)                              \
+  V(MASM##alh, ASM##alh)
+// clang-format on
+
+#define DEFINE_MACRO_LOAD_ASM_FUNC(MASM, ASM)                                \
+  void MASM(const Register& rs, const Register& rt, const MemOperand& src) { \
+    VIXL_ASSERT(allow_macro_instructions_);                                  \
+    SingleEmissionCheckScope guard(this);                                    \
+    ASM(rs, rt, src);                                                        \
+  }
+#define DEFINE_MACRO_STORE_ASM_FUNC(MASM, ASM)           \
+  void MASM(const Register& rs, const MemOperand& src) { \
+    VIXL_ASSERT(allow_macro_instructions_);              \
+    SingleEmissionCheckScope guard(this);                \
+    ASM(rs, src);                                        \
+  }
+
+  ATOMIC_MEMORY_SIMPLE_MACRO_LIST(ATOMIC_MEMORY_LOAD_MACRO_MODES,
+                                  DEFINE_MACRO_LOAD_ASM_FUNC,
+                                  Ld,
+                                  ld)
+  ATOMIC_MEMORY_SIMPLE_MACRO_LIST(ATOMIC_MEMORY_STORE_MACRO_MODES,
+                                  DEFINE_MACRO_STORE_ASM_FUNC,
+                                  St,
+                                  st)
+
+#define DEFINE_MACRO_SWP_ASM_FUNC(MASM, ASM)                                 \
+  void MASM(const Register& rs, const Register& rt, const MemOperand& src) { \
+    VIXL_ASSERT(allow_macro_instructions_);                                  \
+    SingleEmissionCheckScope guard(this);                                    \
+    ASM(rs, rt, src);                                                        \
+  }
+
+  ATOMIC_MEMORY_LOAD_MACRO_MODES(DEFINE_MACRO_SWP_ASM_FUNC, Swp, swp)
+
+#undef DEFINE_MACRO_LOAD_ASM_FUNC
+#undef DEFINE_MACRO_STORE_ASM_FUNC
+#undef DEFINE_MACRO_SWP_ASM_FUNC
+
+  void Ldaprb(const Register& rt, const MemOperand& src) {
+    VIXL_ASSERT(allow_macro_instructions_);
+    SingleEmissionCheckScope guard(this);
+    ldaprb(rt, src);
+  }
+
+  void Ldaprh(const Register& rt, const MemOperand& src) {
+    VIXL_ASSERT(allow_macro_instructions_);
+    SingleEmissionCheckScope guard(this);
+    ldaprh(rt, src);
+  }
+
+  void Ldapr(const Register& rt, const MemOperand& src) {
+    VIXL_ASSERT(allow_macro_instructions_);
+    SingleEmissionCheckScope guard(this);
+    ldapr(rt, src);
+  }
+
   void Ldnp(const CPURegister& rt,
             const CPURegister& rt2,
             const MemOperand& src) {
@@ -1838,6 +2062,62 @@
     SingleEmissionCheckScope guard(this);
     rev32(rd, rn);
   }
+  void Rev64(const Register& rd, const Register& rn) {
+    VIXL_ASSERT(allow_macro_instructions_);
+    VIXL_ASSERT(!rd.IsZero());
+    VIXL_ASSERT(!rn.IsZero());
+    SingleEmissionCheckScope guard(this);
+    rev64(rd, rn);
+  }
+
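+// Define Pacia, Paciza, Pacib and Pacizb (and the equivalent Pacd*, Auti* and
+// Autd* forms) as thin wrappers around the corresponding instructions.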
+#define PAUTH_MASM_VARIATIONS(V) \
+  V(Paci, paci)                  \
+  V(Pacd, pacd)                  \
+  V(Auti, auti)                  \
+  V(Autd, autd)
+
+#define DEFINE_MACRO_ASM_FUNCS(MASM_PRE, ASM_PRE)            \
+  void MASM_PRE##a(const Register& xd, const Register& xn) { \
+    VIXL_ASSERT(allow_macro_instructions_);                  \
+    SingleEmissionCheckScope guard(this);                    \
+    ASM_PRE##a(xd, xn);                                      \
+  }                                                          \
+  void MASM_PRE##za(const Register& xd) {                    \
+    VIXL_ASSERT(allow_macro_instructions_);                  \
+    SingleEmissionCheckScope guard(this);                    \
+    ASM_PRE##za(xd);                                         \
+  }                                                          \
+  void MASM_PRE##b(const Register& xd, const Register& xn) { \
+    VIXL_ASSERT(allow_macro_instructions_);                  \
+    SingleEmissionCheckScope guard(this);                    \
+    ASM_PRE##b(xd, xn);                                      \
+  }                                                          \
+  void MASM_PRE##zb(const Register& xd) {                    \
+    VIXL_ASSERT(allow_macro_instructions_);                  \
+    SingleEmissionCheckScope guard(this);                    \
+    ASM_PRE##zb(xd);                                         \
+  }
+
+  PAUTH_MASM_VARIATIONS(DEFINE_MACRO_ASM_FUNCS)
+#undef DEFINE_MACRO_ASM_FUNCS
+
+  void Pacga(const Register& xd, const Register& xn, const Register& xm) {
+    VIXL_ASSERT(allow_macro_instructions_);
+    SingleEmissionCheckScope guard(this);
+    pacga(xd, xn, xm);
+  }
+
+  void Xpaci(const Register& xd) {
+    VIXL_ASSERT(allow_macro_instructions_);
+    SingleEmissionCheckScope guard(this);
+    xpaci(xd);
+  }
+
+  void Xpacd(const Register& xd) {
+    VIXL_ASSERT(allow_macro_instructions_);
+    SingleEmissionCheckScope guard(this);
+    xpacd(xd);
+  }
   void Ror(const Register& rd, const Register& rs, unsigned shift) {
     VIXL_ASSERT(allow_macro_instructions_);
     VIXL_ASSERT(!rd.IsZero());
diff --git a/src/aarch64/operands-aarch64.h b/src/aarch64/operands-aarch64.h
index 5b154ea..e3dbfa3 100644
--- a/src/aarch64/operands-aarch64.h
+++ b/src/aarch64/operands-aarch64.h
@@ -364,6 +364,7 @@
 
   VRegister V8B() const { return VRegister(code_, kDRegSize, 8); }
   VRegister V16B() const { return VRegister(code_, kQRegSize, 16); }
+  VRegister V2H() const { return VRegister(code_, kSRegSize, 2); }
   VRegister V4H() const { return VRegister(code_, kDRegSize, 4); }
   VRegister V8H() const { return VRegister(code_, kQRegSize, 8); }
   VRegister V2S() const { return VRegister(code_, kDRegSize, 2); }
@@ -373,6 +374,7 @@
 
   bool Is8B() const { return (Is64Bits() && (lanes_ == 8)); }
   bool Is16B() const { return (Is128Bits() && (lanes_ == 16)); }
+  bool Is2H() const { return (Is32Bits() && (lanes_ == 2)); }
   bool Is4H() const { return (Is64Bits() && (lanes_ == 4)); }
   bool Is8H() const { return (Is128Bits() && (lanes_ == 8)); }
   bool Is2S() const { return (Is64Bits() && (lanes_ == 2)); }
diff --git a/src/aarch64/pointer-auth-aarch64.cc b/src/aarch64/pointer-auth-aarch64.cc
new file mode 100644
index 0000000..55cf4ca
--- /dev/null
+++ b/src/aarch64/pointer-auth-aarch64.cc
@@ -0,0 +1,197 @@
+// Copyright 2018, VIXL authors
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+//   * Redistributions of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//   * Redistributions in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//   * Neither the name of ARM Limited nor the names of its contributors may be
+//     used to endorse or promote products derived from this software without
+//     specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
+
+#include "simulator-aarch64.h"
+
+#include "utils-vixl.h"
+
+namespace vixl {
+namespace aarch64 {
+
+// Randomly generated example keys for simulating only.
+const Simulator::PACKey Simulator::kPACKeyIA = {0xc31718727de20f71,
+                                                0xab9fd4e14b2fec51,
+                                                0};
+const Simulator::PACKey Simulator::kPACKeyIB = {0xeebb163b474e04c8,
+                                                0x5267ac6fc280fb7c,
+                                                1};
+const Simulator::PACKey Simulator::kPACKeyDA = {0x5caef808deb8b1e2,
+                                                0xd347cbc06b7b0f77,
+                                                0};
+const Simulator::PACKey Simulator::kPACKeyDB = {0xe06aa1a949ba8cc7,
+                                                0xcfde69e3db6d0432,
+                                                1};
+
+// The general PAC key isn't intended to be used with AuthPAC, so we give it an
+// invalid key number; this triggers an assertion if the key is used incorrectly.
+const Simulator::PACKey Simulator::kPACKeyGA = {0xfcd98a44d564b3d5,
+                                                0x6c56df1904bf0ddc,
+                                                -1};
+
+static uint64_t GetNibble(uint64_t in_data, int position) {
+  return (in_data >> position) & 0xf;
+}
+
+static uint64_t ShuffleNibbles(uint64_t in_data) {
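+  // in_positions[i] is the bit position of the source nibble that is moved to
+  // output nibble i.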
+  static int in_positions[16] =
+      {4, 36, 52, 40, 44, 0, 24, 12, 56, 60, 8, 32, 16, 28, 20, 48};
+  uint64_t out_data = 0;
+  for (int i = 0; i < 16; i++) {
+    out_data |= GetNibble(in_data, in_positions[i]) << (4 * i);
+  }
+  return out_data;
+}
+
+static uint64_t SubstituteNibbles(uint64_t in_data) {
+  // Randomly chosen substitutes.
+  static uint64_t subs[16] =
+      {4, 7, 3, 9, 10, 14, 0, 1, 15, 2, 8, 6, 12, 5, 11, 13};
+  uint64_t out_data = 0;
+  for (int i = 0; i < 16; i++) {
+    int index = (in_data >> (4 * i)) & 0xf;
+    out_data |= subs[index] << (4 * i);
+  }
+  return out_data;
+}
+
+// Rotate nibble to the left by the amount specified.
+static uint64_t RotNibble(uint64_t in_cell, int amount) {
+  VIXL_ASSERT((amount >= 0) && (amount <= 3));
+
+  in_cell &= 0xf;
+  uint64_t temp = (in_cell << 4) | in_cell;
+  return (temp >> (4 - amount)) & 0xf;
+}
+
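+// Mix each column of four nibbles (bit positions 4*i, 4*(i+4), 4*(i+8) and
+// 4*(i+12)) by XOR-ing rotated copies of the other three nibbles in that
+// column.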
+static uint64_t BigShuffle(uint64_t in_data) {
+  uint64_t out_data = 0;
+  for (int i = 0; i < 4; i++) {
+    uint64_t n12 = GetNibble(in_data, 4 * (i + 12));
+    uint64_t n8 = GetNibble(in_data, 4 * (i + 8));
+    uint64_t n4 = GetNibble(in_data, 4 * (i + 4));
+    uint64_t n0 = GetNibble(in_data, 4 * (i + 0));
+
+    uint64_t t0 = RotNibble(n8, 2) ^ RotNibble(n4, 1) ^ RotNibble(n0, 1);
+    uint64_t t1 = RotNibble(n12, 1) ^ RotNibble(n4, 2) ^ RotNibble(n0, 1);
+    uint64_t t2 = RotNibble(n12, 2) ^ RotNibble(n8, 1) ^ RotNibble(n0, 1);
+    uint64_t t3 = RotNibble(n12, 1) ^ RotNibble(n8, 1) ^ RotNibble(n4, 2);
+
+    out_data |= t3 << (4 * (i + 0));
+    out_data |= t2 << (4 * (i + 4));
+    out_data |= t1 << (4 * (i + 8));
+    out_data |= t0 << (4 * (i + 12));
+  }
+  return out_data;
+}
+
+// A simple, non-standard hash function invented for simulation only. It mixes
+// reasonably well; however, it is unlikely to be cryptographically secure and
+// may have a higher collision rate than other hashing algorithms.
+uint64_t Simulator::ComputePAC(uint64_t data, uint64_t context, PACKey key) {
+  uint64_t working_value = data ^ key.high;
+  working_value = BigShuffle(working_value);
+  working_value = ShuffleNibbles(working_value);
+  working_value ^= key.low;
+  working_value = ShuffleNibbles(working_value);
+  working_value = BigShuffle(working_value);
+  working_value ^= context;
+  working_value = SubstituteNibbles(working_value);
+  working_value = BigShuffle(working_value);
+  working_value = SubstituteNibbles(working_value);
+
+  return working_value;
+}
+
+// For a pointer without a PAC code, the TTBR is selected by bit 55 or bit 63,
+// depending on TBI; once a PAC code has been added it is always selected by
+// bit 55. For this reason, the TTBR bit must be determined at the call site.
+uint64_t Simulator::CalculatePACMask(uint64_t ptr, PointerType type, int ttbr) {
+  int bottom_pac_bit = GetBottomPACBit(ptr, ttbr);
+  int top_pac_bit = GetTopPACBit(ptr, type);
+  return ExtractUnsignedBitfield64(top_pac_bit,
+                                   bottom_pac_bit,
+                                   0xffffffffffffffff & ~kTTBRMask)
+         << bottom_pac_bit;
+}
+
+uint64_t Simulator::AuthPAC(uint64_t ptr,
+                            uint64_t context,
+                            PACKey key,
+                            PointerType type) {
+  VIXL_ASSERT((key.number == 0) || (key.number == 1));
+
+  uint64_t pac_mask = CalculatePACMask(ptr, type, (ptr >> 55) & 1);
+  uint64_t original_ptr =
+      ((ptr & kTTBRMask) == 0) ? (ptr & ~pac_mask) : (ptr | pac_mask);
+
+  uint64_t pac = ComputePAC(original_ptr, context, key);
+
+  uint64_t error_code = 1 << key.number;
+  if ((pac & pac_mask) == (ptr & pac_mask)) {
+    return original_ptr;
+  } else {
+    int error_lsb = GetTopPACBit(ptr, type) - 2;
+    uint64_t error_mask = UINT64_C(0x3) << error_lsb;
+    return (original_ptr & ~error_mask) | (error_code << error_lsb);
+  }
+}
+
+uint64_t Simulator::AddPAC(uint64_t ptr,
+                           uint64_t context,
+                           PACKey key,
+                           PointerType type) {
+  int top_pac_bit = GetTopPACBit(ptr, type);
+
+  // TODO: Properly handle the case where extension bits are bad and TBI is
+  // turned off, and also test me.
+  VIXL_ASSERT(HasTBI(ptr, type));
+  int ttbr = (ptr >> 55) & 1;
+  uint64_t pac_mask = CalculatePACMask(ptr, type, ttbr);
+  uint64_t ext_ptr = (ttbr == 0) ? (ptr & ~pac_mask) : (ptr | pac_mask);
+
+  uint64_t pac = ComputePAC(ext_ptr, context, key);
+
+  // If the pointer isn't all zeroes or all ones in the PAC bitfield, corrupt
+  // the resulting code.
+  if (((ptr & (pac_mask | kTTBRMask)) != 0x0) &&
+      ((~ptr & (pac_mask | kTTBRMask)) != 0x0)) {
+    pac ^= UINT64_C(1) << (top_pac_bit - 1);
+  }
+
+  uint64_t ttbr_shifted = static_cast<uint64_t>(ttbr) << 55;
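+  // Insert the PAC code into the PAC bitfield, preserving the rest of the
+  // pointer (including the TTBR select bit).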
+  return (pac & pac_mask) | ttbr_shifted | (ptr & ~pac_mask);
+}
+
+uint64_t Simulator::StripPAC(uint64_t ptr, PointerType type) {
+  uint64_t pac_mask = CalculatePACMask(ptr, type, (ptr >> 55) & 1);
+  return ((ptr & kTTBRMask) == 0) ? (ptr & ~pac_mask) : (ptr | pac_mask);
+}
+}  // namespace aarch64
+}  // namespace vixl
+
+#endif  // VIXL_INCLUDE_SIMULATOR_AARCH64
diff --git a/src/aarch64/simulator-aarch64.cc b/src/aarch64/simulator-aarch64.cc
index a23d57e..c09650d 100644
--- a/src/aarch64/simulator-aarch64.cc
+++ b/src/aarch64/simulator-aarch64.cc
@@ -35,6 +35,8 @@
 namespace vixl {
 namespace aarch64 {
 
+using vixl::internal::SimFloat16;
+
 const Instruction* Simulator::kEndOfSimAddress = NULL;
 
 void SimSystemRegister::SetBits(int msb, int lsb, uint32_t bits) {
@@ -460,7 +462,7 @@
   // TODO: This assumes that the C++ implementation handles comparisons in the
   // way that we expect (as per AssertSupportedFPCR()).
   bool process_exception = false;
-  if ((std::isnan(val0) != 0) || (std::isnan(val1) != 0)) {
+  if ((IsNaN(val0) != 0) || (IsNaN(val1) != 0)) {
     ReadNzcv().SetRawValue(FPUnorderedFlag);
     if (IsSignallingNaN(val0) || IsSignallingNaN(val1) ||
         (trap == EnableTrap)) {
@@ -839,7 +841,7 @@
     double value;
     switch (lane_size_in_bytes) {
       case kHRegSizeInBytes:
-        value = ReadVRegister(code).GetLane<float16>(lane);
+        value = ReadVRegister(code).GetLane<uint16_t>(lane);
         break;
       case kSRegSizeInBytes:
         value = ReadVRegister(code).GetLane<float>(lane);
@@ -851,7 +853,7 @@
         value = 0.0;
         VIXL_UNREACHABLE();
     }
-    if (std::isnan(value)) {
+    if (IsNaN(value)) {
       // The output for NaNs is implementation defined. Always print `nan`, so
       // that traces are coherent across different implementations.
       fprintf(stream_, "%s%snan%s", separator, clr_vreg_value, clr_normal);
@@ -925,7 +927,7 @@
                                     "0b01 (Round towards Plus Infinity)",
                                     "0b10 (Round towards Minus Infinity)",
                                     "0b11 (Round towards Zero)"};
-      VIXL_ASSERT(ReadFpcr().GetRMode() < (sizeof(rmode) / sizeof(rmode[0])));
+      VIXL_ASSERT(ReadFpcr().GetRMode() < ArrayLength(rmode));
       fprintf(stream_,
               "# %sFPCR: %sAHP:%d DN:%d FZ:%d RMode:%s%s\n",
               clr_flag_name,
@@ -1085,19 +1087,68 @@
 
 
 void Simulator::VisitUnconditionalBranchToRegister(const Instruction* instr) {
-  const Instruction* target = Instruction::Cast(ReadXRegister(instr->GetRn()));
+  bool authenticate = false;
+  bool link = false;
+  uint64_t addr = 0;
+  uint64_t context = 0;
+  Instruction* target;
 
   switch (instr->Mask(UnconditionalBranchToRegisterMask)) {
     case BLR:
-      WriteLr(instr->GetNextInstruction());
+      link = true;
       VIXL_FALLTHROUGH();
     case BR:
     case RET:
-      WritePc(target);
+      addr = ReadXRegister(instr->GetRn());
+      break;
+
+    case BLRAAZ:
+    case BLRABZ:
+      link = true;
+      VIXL_FALLTHROUGH();
+    case BRAAZ:
+    case BRABZ:
+      authenticate = true;
+      addr = ReadXRegister(instr->GetRn());
+      break;
+
+    case BLRAA:
+    case BLRAB:
+      link = true;
+      VIXL_FALLTHROUGH();
+    case BRAA:
+    case BRAB:
+      authenticate = true;
+      addr = ReadXRegister(instr->GetRn());
+      context = ReadXRegister(instr->GetRd());
+      break;
+
+    case RETAA:
+    case RETAB:
+      authenticate = true;
+      addr = ReadXRegister(kLinkRegCode);
+      context = ReadXRegister(31, Reg31IsStackPointer);
       break;
     default:
       VIXL_UNREACHABLE();
   }
+
+  if (link) {
+    WriteLr(instr->GetNextInstruction());
+  }
+
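+  // Authenticate the target address if required. On failure, AuthPAC sets a
+  // non-zero error code in the two bits just below the top PAC bit; detect
+  // that here and abort rather than branch to a corrupted address.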
+  if (authenticate) {
+    PACKey key = (instr->ExtractBit(10) == 0) ? kPACKeyIA : kPACKeyIB;
+    addr = AuthPAC(addr, context, key, kInstructionPointer);
+
+    int error_lsb = GetTopPACBit(addr, kInstructionPointer) - 2;
+    if (((addr >> error_lsb) & 0x3) != 0x0) {
+      VIXL_ABORT_WITH_MSG("Failed to authenticate pointer.");
+    }
+  }
+
+  target = Instruction::Cast(addr);
+  WritePc(target);
 }
 
 
@@ -1427,7 +1478,7 @@
       Memory::Write<uint8_t>(address, ReadBRegister(srcdst));
       break;
     case STR_h:
-      Memory::Write<uint16_t>(address, ReadHRegister(srcdst));
+      Memory::Write<uint16_t>(address, ReadHRegisterBits(srcdst));
       break;
     case STR_s:
       Memory::Write<float>(address, ReadSRegister(srcdst));
@@ -1918,6 +1969,223 @@
   }
 }
 
+template <typename T>
+void Simulator::AtomicMemorySimpleHelper(const Instruction* instr) {
+  unsigned rs = instr->GetRs();
+  unsigned rt = instr->GetRt();
+  unsigned rn = instr->GetRn();
+
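+  // Note that acquire semantics do not apply when the destination register is
+  // the zero register (the ST<op> alias), even if the A bit is set.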
+  bool is_acquire = (instr->ExtractBit(23) == 1) && (rt != kZeroRegCode);
+  bool is_release = instr->ExtractBit(22) == 1;
+
+  unsigned element_size = sizeof(T);
+  uint64_t address = ReadRegister<uint64_t>(rn, Reg31IsStackPointer);
+
+  // Verify that the address is available to the host.
+  VIXL_ASSERT(address == static_cast<uintptr_t>(address));
+
+  T value = ReadRegister<T>(rs);
+
+  T data = Memory::Read<T>(address);
+
+  if (is_acquire) {
+    // Approximate load-acquire by issuing a full barrier after the load.
+    __sync_synchronize();
+  }
+
+  T result = 0;
+  switch (instr->Mask(AtomicMemorySimpleOpMask)) {
+    case LDADDOp:
+      result = data + value;
+      break;
+    case LDCLROp:
+      VIXL_ASSERT(!std::numeric_limits<T>::is_signed);
+      result = data & ~value;
+      break;
+    case LDEOROp:
+      VIXL_ASSERT(!std::numeric_limits<T>::is_signed);
+      result = data ^ value;
+      break;
+    case LDSETOp:
+      VIXL_ASSERT(!std::numeric_limits<T>::is_signed);
+      result = data | value;
+      break;
+
+    // The signed/unsigned distinction is handled by the templated type T.
+    case LDSMAXOp:
+    case LDUMAXOp:
+      result = (data > value) ? data : value;
+      break;
+    case LDSMINOp:
+    case LDUMINOp:
+      result = (data > value) ? value : data;
+      break;
+  }
+
+  if (is_release) {
+    // Approximate store-release by issuing a full barrier before the store.
+    __sync_synchronize();
+  }
+
+  Memory::Write<T>(address, result);
+  WriteRegister<T>(rt, data, NoRegLog);
+
+  LogRead(address, rt, GetPrintRegisterFormatForSize(element_size));
+  LogWrite(address, rs, GetPrintRegisterFormatForSize(element_size));
+}
+
+template <typename T>
+void Simulator::AtomicMemorySwapHelper(const Instruction* instr) {
+  unsigned rs = instr->GetRs();
+  unsigned rt = instr->GetRt();
+  unsigned rn = instr->GetRn();
+
+  bool is_acquire = (instr->ExtractBit(23) == 1) && (rt != kZeroRegCode);
+  bool is_release = instr->ExtractBit(22) == 1;
+
+  unsigned element_size = sizeof(T);
+  uint64_t address = ReadRegister<uint64_t>(rn, Reg31IsStackPointer);
+
+  // Verify that the address is available to the host.
+  VIXL_ASSERT(address == static_cast<uintptr_t>(address));
+
+  T data = Memory::Read<T>(address);
+  if (is_acquire) {
+    // Approximate load-acquire by issuing a full barrier after the load.
+    __sync_synchronize();
+  }
+
+  if (is_release) {
+    // Approximate store-release by issuing a full barrier before the store.
+    __sync_synchronize();
+  }
+  Memory::Write<T>(address, ReadRegister<T>(rs));
+
+  WriteRegister<T>(rt, data);
+
+  LogRead(address, rt, GetPrintRegisterFormat(element_size));
+  LogWrite(address, rs, GetPrintRegisterFormat(element_size));
+}
+
+template <typename T>
+void Simulator::LoadAcquireRCpcHelper(const Instruction* instr) {
+  unsigned rt = instr->GetRt();
+  unsigned rn = instr->GetRn();
+
+  unsigned element_size = sizeof(T);
+  uint64_t address = ReadRegister<uint64_t>(rn, Reg31IsStackPointer);
+
+  // Verify that the address is available to the host.
+  VIXL_ASSERT(address == static_cast<uintptr_t>(address));
+  WriteRegister<T>(rt, Memory::Read<T>(address));
+
+  // Approximate load-acquire by issuing a full barrier after the load.
+  __sync_synchronize();
+
+  LogRead(address, rt, GetPrintRegisterFormat(element_size));
+}
+
+#define ATOMIC_MEMORY_SIMPLE_UINT_LIST(V) \
+  V(LDADD)                                \
+  V(LDCLR)                                \
+  V(LDEOR)                                \
+  V(LDSET)                                \
+  V(LDUMAX)                               \
+  V(LDUMIN)
+
+#define ATOMIC_MEMORY_SIMPLE_INT_LIST(V) \
+  V(LDSMAX)                              \
+  V(LDSMIN)
+
+void Simulator::VisitAtomicMemory(const Instruction* instr) {
+  switch (instr->Mask(AtomicMemoryMask)) {
+// clang-format off
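+// Each SIM_FUNC_* macro expands to the four case labels (no ordering, acquire,
+// release, acquire-release) for one access size.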
+#define SIM_FUNC_B(A) \
+    case A##B:        \
+    case A##AB:       \
+    case A##LB:       \
+    case A##ALB:
+#define SIM_FUNC_H(A) \
+    case A##H:        \
+    case A##AH:       \
+    case A##LH:       \
+    case A##ALH:
+#define SIM_FUNC_w(A) \
+    case A##_w:       \
+    case A##A_w:      \
+    case A##L_w:      \
+    case A##AL_w:
+#define SIM_FUNC_x(A) \
+    case A##_x:       \
+    case A##A_x:      \
+    case A##L_x:      \
+    case A##AL_x:
+
+    ATOMIC_MEMORY_SIMPLE_UINT_LIST(SIM_FUNC_B)
+      AtomicMemorySimpleHelper<uint8_t>(instr);
+      break;
+    ATOMIC_MEMORY_SIMPLE_INT_LIST(SIM_FUNC_B)
+      AtomicMemorySimpleHelper<int8_t>(instr);
+      break;
+    ATOMIC_MEMORY_SIMPLE_UINT_LIST(SIM_FUNC_H)
+      AtomicMemorySimpleHelper<uint16_t>(instr);
+      break;
+    ATOMIC_MEMORY_SIMPLE_INT_LIST(SIM_FUNC_H)
+      AtomicMemorySimpleHelper<int16_t>(instr);
+      break;
+    ATOMIC_MEMORY_SIMPLE_UINT_LIST(SIM_FUNC_w)
+      AtomicMemorySimpleHelper<uint32_t>(instr);
+      break;
+    ATOMIC_MEMORY_SIMPLE_INT_LIST(SIM_FUNC_w)
+      AtomicMemorySimpleHelper<int32_t>(instr);
+      break;
+    ATOMIC_MEMORY_SIMPLE_UINT_LIST(SIM_FUNC_x)
+      AtomicMemorySimpleHelper<uint64_t>(instr);
+      break;
+    ATOMIC_MEMORY_SIMPLE_INT_LIST(SIM_FUNC_x)
+      AtomicMemorySimpleHelper<int64_t>(instr);
+      break;
+    // clang-format on
+
+    case SWPB:
+    case SWPAB:
+    case SWPLB:
+    case SWPALB:
+      AtomicMemorySwapHelper<uint8_t>(instr);
+      break;
+    case SWPH:
+    case SWPAH:
+    case SWPLH:
+    case SWPALH:
+      AtomicMemorySwapHelper<uint16_t>(instr);
+      break;
+    case SWP_w:
+    case SWPA_w:
+    case SWPL_w:
+    case SWPAL_w:
+      AtomicMemorySwapHelper<uint32_t>(instr);
+      break;
+    case SWP_x:
+    case SWPA_x:
+    case SWPL_x:
+    case SWPAL_x:
+      AtomicMemorySwapHelper<uint64_t>(instr);
+      break;
+    case LDAPRB:
+      LoadAcquireRCpcHelper<uint8_t>(instr);
+      break;
+    case LDAPRH:
+      LoadAcquireRCpcHelper<uint16_t>(instr);
+      break;
+    case LDAPR_w:
+      LoadAcquireRCpcHelper<uint32_t>(instr);
+      break;
+    case LDAPR_x:
+      LoadAcquireRCpcHelper<uint64_t>(instr);
+      break;
+  }
+}
+
 
 void Simulator::VisitLoadLiteral(const Instruction* instr) {
   unsigned rt = instr->GetRt();
@@ -2072,11 +2340,44 @@
 }
 
 
+// clang-format off
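+// For each PAuth mode: the modifier (context) operand, the simulated key and
+// the pointer type used to expand the PAC* and AUT* cases below.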
+#define PAUTH_MODES(V)                                       \
+  V(IA,  ReadXRegister(src), kPACKeyIA, kInstructionPointer) \
+  V(IB,  ReadXRegister(src), kPACKeyIB, kInstructionPointer) \
+  V(IZA, 0x00000000,         kPACKeyIA, kInstructionPointer) \
+  V(IZB, 0x00000000,         kPACKeyIB, kInstructionPointer) \
+  V(DA,  ReadXRegister(src), kPACKeyDA, kDataPointer)        \
+  V(DB,  ReadXRegister(src), kPACKeyDB, kDataPointer)        \
+  V(DZA, 0x00000000,         kPACKeyDA, kDataPointer)        \
+  V(DZB, 0x00000000,         kPACKeyDB, kDataPointer)
+// clang-format on
+
 void Simulator::VisitDataProcessing1Source(const Instruction* instr) {
   unsigned dst = instr->GetRd();
   unsigned src = instr->GetRn();
 
   switch (instr->Mask(DataProcessing1SourceMask)) {
+#define DEFINE_PAUTH_FUNCS(SUFFIX, MOD, KEY, D)     \
+  case PAC##SUFFIX: {                               \
+    uint64_t ptr = ReadXRegister(dst);              \
+    WriteXRegister(dst, AddPAC(ptr, MOD, KEY, D));  \
+    break;                                          \
+  }                                                 \
+  case AUT##SUFFIX: {                               \
+    uint64_t ptr = ReadXRegister(dst);              \
+    WriteXRegister(dst, AuthPAC(ptr, MOD, KEY, D)); \
+    break;                                          \
+  }
+
+    PAUTH_MODES(DEFINE_PAUTH_FUNCS)
+#undef DEFINE_PAUTH_FUNCS
+
+    case XPACI:
+      WriteXRegister(dst, StripPAC(ReadXRegister(dst), kInstructionPointer));
+      break;
+    case XPACD:
+      WriteXRegister(dst, StripPAC(ReadXRegister(dst), kDataPointer));
+      break;
     case RBIT_w:
       WriteWRegister(dst, ReverseBits(ReadWRegister(src)));
       break;
@@ -2217,6 +2518,14 @@
     case RORV_x:
       shift_op = ROR;
       break;
+    case PACGA: {
+      uint64_t dst = static_cast<uint64_t>(ReadXRegister(instr->GetRn()));
+      uint64_t src = static_cast<uint64_t>(
+          ReadXRegister(instr->GetRm(), Reg31IsStackPointer));
+      uint64_t code = ComputePAC(dst, src, kPACKeyGA);
+      result = code & 0xffffffff00000000;
+      break;
+    }
     case CRC32B: {
       uint32_t acc = ReadRegister<uint32_t>(instr->GetRn());
       uint8_t val = ReadRegister<uint8_t>(instr->GetRm());
@@ -2438,7 +2747,7 @@
   unsigned dest = instr->GetRd();
   switch (instr->Mask(FPImmediateMask)) {
     case FMOV_h_imm:
-      WriteHRegister(dest, instr->GetImmFP16());
+      WriteHRegister(dest, Float16ToRawbits(instr->GetImmFP16()));
       break;
     case FMOV_s_imm:
       WriteSRegister(dest, instr->GetImmFP32());
@@ -2461,6 +2770,12 @@
   FPRounding round = ReadRMode();
 
   switch (instr->Mask(FPIntegerConvertMask)) {
+    case FCVTAS_wh:
+      WriteWRegister(dst, FPToInt32(ReadHRegister(src), FPTieAway));
+      break;
+    case FCVTAS_xh:
+      WriteXRegister(dst, FPToInt64(ReadHRegister(src), FPTieAway));
+      break;
     case FCVTAS_ws:
       WriteWRegister(dst, FPToInt32(ReadSRegister(src), FPTieAway));
       break;
@@ -2473,6 +2788,12 @@
     case FCVTAS_xd:
       WriteXRegister(dst, FPToInt64(ReadDRegister(src), FPTieAway));
       break;
+    case FCVTAU_wh:
+      WriteWRegister(dst, FPToUInt32(ReadHRegister(src), FPTieAway));
+      break;
+    case FCVTAU_xh:
+      WriteXRegister(dst, FPToUInt64(ReadHRegister(src), FPTieAway));
+      break;
     case FCVTAU_ws:
       WriteWRegister(dst, FPToUInt32(ReadSRegister(src), FPTieAway));
       break;
@@ -2485,6 +2806,12 @@
     case FCVTAU_xd:
       WriteXRegister(dst, FPToUInt64(ReadDRegister(src), FPTieAway));
       break;
+    case FCVTMS_wh:
+      WriteWRegister(dst, FPToInt32(ReadHRegister(src), FPNegativeInfinity));
+      break;
+    case FCVTMS_xh:
+      WriteXRegister(dst, FPToInt64(ReadHRegister(src), FPNegativeInfinity));
+      break;
     case FCVTMS_ws:
       WriteWRegister(dst, FPToInt32(ReadSRegister(src), FPNegativeInfinity));
       break;
@@ -2497,6 +2824,12 @@
     case FCVTMS_xd:
       WriteXRegister(dst, FPToInt64(ReadDRegister(src), FPNegativeInfinity));
       break;
+    case FCVTMU_wh:
+      WriteWRegister(dst, FPToUInt32(ReadHRegister(src), FPNegativeInfinity));
+      break;
+    case FCVTMU_xh:
+      WriteXRegister(dst, FPToUInt64(ReadHRegister(src), FPNegativeInfinity));
+      break;
     case FCVTMU_ws:
       WriteWRegister(dst, FPToUInt32(ReadSRegister(src), FPNegativeInfinity));
       break;
@@ -2509,6 +2842,12 @@
     case FCVTMU_xd:
       WriteXRegister(dst, FPToUInt64(ReadDRegister(src), FPNegativeInfinity));
       break;
+    case FCVTPS_wh:
+      WriteWRegister(dst, FPToInt32(ReadHRegister(src), FPPositiveInfinity));
+      break;
+    case FCVTPS_xh:
+      WriteXRegister(dst, FPToInt64(ReadHRegister(src), FPPositiveInfinity));
+      break;
     case FCVTPS_ws:
       WriteWRegister(dst, FPToInt32(ReadSRegister(src), FPPositiveInfinity));
       break;
@@ -2521,6 +2860,12 @@
     case FCVTPS_xd:
       WriteXRegister(dst, FPToInt64(ReadDRegister(src), FPPositiveInfinity));
       break;
+    case FCVTPU_wh:
+      WriteWRegister(dst, FPToUInt32(ReadHRegister(src), FPPositiveInfinity));
+      break;
+    case FCVTPU_xh:
+      WriteXRegister(dst, FPToUInt64(ReadHRegister(src), FPPositiveInfinity));
+      break;
     case FCVTPU_ws:
       WriteWRegister(dst, FPToUInt32(ReadSRegister(src), FPPositiveInfinity));
       break;
@@ -2533,6 +2878,12 @@
     case FCVTPU_xd:
       WriteXRegister(dst, FPToUInt64(ReadDRegister(src), FPPositiveInfinity));
       break;
+    case FCVTNS_wh:
+      WriteWRegister(dst, FPToInt32(ReadHRegister(src), FPTieEven));
+      break;
+    case FCVTNS_xh:
+      WriteXRegister(dst, FPToInt64(ReadHRegister(src), FPTieEven));
+      break;
     case FCVTNS_ws:
       WriteWRegister(dst, FPToInt32(ReadSRegister(src), FPTieEven));
       break;
@@ -2545,6 +2896,12 @@
     case FCVTNS_xd:
       WriteXRegister(dst, FPToInt64(ReadDRegister(src), FPTieEven));
       break;
+    case FCVTNU_wh:
+      WriteWRegister(dst, FPToUInt32(ReadHRegister(src), FPTieEven));
+      break;
+    case FCVTNU_xh:
+      WriteXRegister(dst, FPToUInt64(ReadHRegister(src), FPTieEven));
+      break;
     case FCVTNU_ws:
       WriteWRegister(dst, FPToUInt32(ReadSRegister(src), FPTieEven));
       break;
@@ -2557,6 +2914,12 @@
     case FCVTNU_xd:
       WriteXRegister(dst, FPToUInt64(ReadDRegister(src), FPTieEven));
       break;
+    case FCVTZS_wh:
+      WriteWRegister(dst, FPToInt32(ReadHRegister(src), FPZero));
+      break;
+    case FCVTZS_xh:
+      WriteXRegister(dst, FPToInt64(ReadHRegister(src), FPZero));
+      break;
     case FCVTZS_ws:
       WriteWRegister(dst, FPToInt32(ReadSRegister(src), FPZero));
       break;
@@ -2569,6 +2932,12 @@
     case FCVTZS_xd:
       WriteXRegister(dst, FPToInt64(ReadDRegister(src), FPZero));
       break;
+    case FCVTZU_wh:
+      WriteWRegister(dst, FPToUInt32(ReadHRegister(src), FPZero));
+      break;
+    case FCVTZU_xh:
+      WriteXRegister(dst, FPToUInt64(ReadHRegister(src), FPZero));
+      break;
     case FCVTZU_ws:
       WriteWRegister(dst, FPToUInt32(ReadSRegister(src), FPZero));
       break;
@@ -2581,6 +2950,9 @@
     case FCVTZU_xd:
       WriteXRegister(dst, FPToUInt64(ReadDRegister(src), FPZero));
       break;
+    case FJCVTZS:
+      WriteWRegister(dst, FPToFixedJS(ReadDRegister(src)));
+      break;
     case FMOV_hw:
       WriteHRegister(dst, ReadWRegister(src) & kHRegMask);
       break;
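Editor's note on the FJCVTZS case above: it relies on FPToFixedJS(), which this patch only declares further down (in simulator-aarch64.h). As a hedged sketch of the intended behaviour, not the VIXL implementation: the Armv8.3 FJCVTZS instruction converts a double to a 32-bit signed integer with JavaScript-style (ToInt32-like) semantics, truncating towards zero and wrapping modulo 2^32. The real instruction also sets the Z flag when the conversion was exact, which the sketch below ignores.

#include <cmath>
#include <cstdint>

// Illustrative sketch only, not the VIXL definition of FPToFixedJS().
int32_t FPToFixedJSSketch(double value) {
  // NaN, infinities and zeros all convert to 0 under ToInt32-style rules.
  if (std::isnan(value) || std::isinf(value)) return 0;
  double truncated = std::trunc(value);                  // Round towards zero.
  double wrapped = std::fmod(truncated, 4294967296.0);   // Reduce modulo 2^32.
  if (wrapped < 0.0) wrapped += 4294967296.0;
  // The low 32 bits, reinterpreted as a two's-complement signed value.
  return static_cast<int32_t>(static_cast<uint32_t>(wrapped));
}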
@@ -2626,9 +2998,7 @@
       break;
     case UCVTF_dw: {
       WriteDRegister(dst,
-                     UFixedToDouble(static_cast<uint32_t>(ReadWRegister(src)),
-                                    0,
-                                    round));
+                     UFixedToDouble(ReadRegister<uint32_t>(src), 0, round));
       break;
     }
     case SCVTF_sx:
@@ -2641,10 +3011,21 @@
       WriteSRegister(dst, UFixedToFloat(ReadXRegister(src), 0, round));
       break;
     case UCVTF_sw: {
-      WriteSRegister(dst,
-                     UFixedToFloat(static_cast<uint32_t>(ReadWRegister(src)),
-                                   0,
-                                   round));
+      WriteSRegister(dst, UFixedToFloat(ReadRegister<uint32_t>(src), 0, round));
+      break;
+    }
+    case SCVTF_hx:
+      WriteHRegister(dst, FixedToFloat16(ReadXRegister(src), 0, round));
+      break;
+    case SCVTF_hw:
+      WriteHRegister(dst, FixedToFloat16(ReadWRegister(src), 0, round));
+      break;
+    case UCVTF_hx:
+      WriteHRegister(dst, UFixedToFloat16(ReadXRegister(src), 0, round));
+      break;
+    case UCVTF_hw: {
+      WriteHRegister(dst,
+                     UFixedToFloat16(ReadRegister<uint32_t>(src), 0, round));
       break;
     }
 
@@ -2677,9 +3058,7 @@
       break;
     case UCVTF_dw_fixed: {
       WriteDRegister(dst,
-                     UFixedToDouble(static_cast<uint32_t>(ReadWRegister(src)),
-                                    fbits,
-                                    round));
+                     UFixedToDouble(ReadRegister<uint32_t>(src), fbits, round));
       break;
     }
     case SCVTF_sx_fixed:
@@ -2693,9 +3072,23 @@
       break;
     case UCVTF_sw_fixed: {
       WriteSRegister(dst,
-                     UFixedToFloat(static_cast<uint32_t>(ReadWRegister(src)),
-                                   fbits,
-                                   round));
+                     UFixedToFloat(ReadRegister<uint32_t>(src), fbits, round));
+      break;
+    }
+    case SCVTF_hx_fixed:
+      WriteHRegister(dst, FixedToFloat16(ReadXRegister(src), fbits, round));
+      break;
+    case SCVTF_hw_fixed:
+      WriteHRegister(dst, FixedToFloat16(ReadWRegister(src), fbits, round));
+      break;
+    case UCVTF_hx_fixed:
+      WriteHRegister(dst, UFixedToFloat16(ReadXRegister(src), fbits, round));
+      break;
+    case UCVTF_hw_fixed: {
+      WriteHRegister(dst,
+                     UFixedToFloat16(ReadRegister<uint32_t>(src),
+                                     fbits,
+                                     round));
       break;
     }
     case FCVTZS_xd_fixed:
@@ -2738,6 +3131,30 @@
                      FPToUInt32(ReadSRegister(src) * std::pow(2.0f, fbits),
                                 FPZero));
       break;
+    case FCVTZS_xh_fixed: {
+      double output =
+          static_cast<double>(ReadHRegister(src)) * std::pow(2.0, fbits);
+      WriteXRegister(dst, FPToInt64(output, FPZero));
+      break;
+    }
+    case FCVTZS_wh_fixed: {
+      double output =
+          static_cast<double>(ReadHRegister(src)) * std::pow(2.0, fbits);
+      WriteWRegister(dst, FPToInt32(output, FPZero));
+      break;
+    }
+    case FCVTZU_xh_fixed: {
+      double output =
+          static_cast<double>(ReadHRegister(src)) * std::pow(2.0, fbits);
+      WriteXRegister(dst, FPToUInt64(output, FPZero));
+      break;
+    }
+    case FCVTZU_wh_fixed: {
+      double output =
+          static_cast<double>(ReadHRegister(src)) * std::pow(2.0, fbits);
+      WriteWRegister(dst, FPToUInt32(output, FPZero));
+      break;
+    }
     default:
       VIXL_UNREACHABLE();
   }
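Editor's note on the FCVTZ*_?h_fixed cases above: the half-precision source is first widened to double and scaled by 2^fbits, so the scaling step itself never loses precision; only the final integer conversion rounds, towards zero. A small worked example, for illustration only:

#include <cmath>
#include <cstdint>

// Worked example (illustrative): FCVTZS Wd, Hn, #4 with Hn holding 1.75.
int32_t FixedPointHalfExample() {
  double widened = 1.75;                       // static_cast<double>(ReadHRegister(src)).
  double scaled = widened * std::pow(2.0, 4);  // 28.0, exact in double.
  return static_cast<int32_t>(scaled);         // 28: 1.75 encoded with 4 fractional bits.
}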
@@ -2749,6 +3166,14 @@
 
   FPTrapFlags trap = DisableTrap;
   switch (instr->Mask(FPCompareMask)) {
+    case FCMPE_h:
+      trap = EnableTrap;
+      VIXL_FALLTHROUGH();
+    case FCMP_h:
+      FPCompare(ReadHRegister(instr->GetRn()),
+                ReadHRegister(instr->GetRm()),
+                trap);
+      break;
     case FCMPE_s:
       trap = EnableTrap;
       VIXL_FALLTHROUGH();
@@ -2765,6 +3190,12 @@
                 ReadDRegister(instr->GetRm()),
                 trap);
       break;
+    case FCMPE_h_zero:
+      trap = EnableTrap;
+      VIXL_FALLTHROUGH();
+    case FCMP_h_zero:
+      FPCompare(ReadHRegister(instr->GetRn()), SimFloat16(0.0), trap);
+      break;
     case FCMPE_s_zero:
       trap = EnableTrap;
       VIXL_FALLTHROUGH();
@@ -2788,6 +3219,19 @@
 
   FPTrapFlags trap = DisableTrap;
   switch (instr->Mask(FPConditionalCompareMask)) {
+    case FCCMPE_h:
+      trap = EnableTrap;
+      VIXL_FALLTHROUGH();
+    case FCCMP_h:
+      if (ConditionPassed(instr->GetCondition())) {
+        FPCompare(ReadHRegister(instr->GetRn()),
+                  ReadHRegister(instr->GetRm()),
+                  trap);
+      } else {
+        ReadNzcv().SetFlags(instr->GetNzcv());
+        LogSystemRegister(NZCV);
+      }
+      break;
     case FCCMPE_s:
       trap = EnableTrap;
       VIXL_FALLTHROUGH();
@@ -2831,6 +3275,9 @@
   }
 
   switch (instr->Mask(FPConditionalSelectMask)) {
+    case FCSEL_h:
+      WriteHRegister(instr->GetRd(), ReadHRegister(selected));
+      break;
     case FCSEL_s:
       WriteSRegister(instr->GetRd(), ReadSRegister(selected));
       break;
@@ -2861,6 +3308,7 @@
       vform = kFormatH;
       break;
   }
+
   SimVRegister& rd = ReadVRegister(instr->GetRd());
   SimVRegister& rn = ReadVRegister(instr->GetRn());
   bool inexact_exception = false;
@@ -2878,12 +3326,14 @@
     case FMOV_d:
       WriteDRegister(fd, ReadDRegister(fn));
       return;
+    case FABS_h:
     case FABS_s:
     case FABS_d:
       fabs_(vform, ReadVRegister(fd), ReadVRegister(fn));
       // Explicitly log the register update whilst we have type information.
       LogVRegister(fd, GetPrintRegisterFormatFP(vform));
       return;
+    case FNEG_h:
     case FNEG_s:
     case FNEG_d:
       fneg(vform, ReadVRegister(fd), ReadVRegister(fn));
@@ -2897,48 +3347,58 @@
       WriteSRegister(fd, FPToFloat(ReadDRegister(fn), FPTieEven, ReadDN()));
       return;
     case FCVT_hs:
-      WriteHRegister(fd, FPToFloat16(ReadSRegister(fn), FPTieEven, ReadDN()));
+      WriteHRegister(fd,
+                     Float16ToRawbits(
+                         FPToFloat16(ReadSRegister(fn), FPTieEven, ReadDN())));
       return;
     case FCVT_sh:
       WriteSRegister(fd, FPToFloat(ReadHRegister(fn), ReadDN()));
       return;
     case FCVT_dh:
-      WriteDRegister(fd,
-                     FPToDouble(FPToFloat(ReadHRegister(fn), ReadDN()),
-                                ReadDN()));
+      WriteDRegister(fd, FPToDouble(ReadHRegister(fn), ReadDN()));
       return;
     case FCVT_hd:
-      WriteHRegister(fd, FPToFloat16(ReadDRegister(fn), FPTieEven, ReadDN()));
+      WriteHRegister(fd,
+                     Float16ToRawbits(
+                         FPToFloat16(ReadDRegister(fn), FPTieEven, ReadDN())));
       return;
+    case FSQRT_h:
     case FSQRT_s:
     case FSQRT_d:
       fsqrt(vform, rd, rn);
       // Explicitly log the register update whilst we have type information.
       LogVRegister(fd, GetPrintRegisterFormatFP(vform));
       return;
+    case FRINTI_h:
     case FRINTI_s:
     case FRINTI_d:
       break;  // Use FPCR rounding mode.
+    case FRINTX_h:
     case FRINTX_s:
     case FRINTX_d:
       inexact_exception = true;
       break;
+    case FRINTA_h:
     case FRINTA_s:
     case FRINTA_d:
       fpcr_rounding = FPTieAway;
       break;
+    case FRINTM_h:
     case FRINTM_s:
     case FRINTM_d:
       fpcr_rounding = FPNegativeInfinity;
       break;
+    case FRINTN_h:
     case FRINTN_s:
     case FRINTN_d:
       fpcr_rounding = FPTieEven;
       break;
+    case FRINTP_h:
     case FRINTP_s:
     case FRINTP_d:
       fpcr_rounding = FPPositiveInfinity;
       break;
+    case FRINTZ_h:
     case FRINTZ_s:
     case FRINTZ_d:
       fpcr_rounding = FPZero;
@@ -2967,44 +3427,56 @@
     case FP32:
       vform = kFormatS;
       break;
+    case FP16:
+      vform = kFormatH;
+      break;
   }
   SimVRegister& rd = ReadVRegister(instr->GetRd());
   SimVRegister& rn = ReadVRegister(instr->GetRn());
   SimVRegister& rm = ReadVRegister(instr->GetRm());
 
   switch (instr->Mask(FPDataProcessing2SourceMask)) {
+    case FADD_h:
     case FADD_s:
     case FADD_d:
       fadd(vform, rd, rn, rm);
       break;
+    case FSUB_h:
     case FSUB_s:
     case FSUB_d:
       fsub(vform, rd, rn, rm);
       break;
+    case FMUL_h:
     case FMUL_s:
     case FMUL_d:
       fmul(vform, rd, rn, rm);
       break;
+    case FNMUL_h:
     case FNMUL_s:
     case FNMUL_d:
       fnmul(vform, rd, rn, rm);
       break;
+    case FDIV_h:
     case FDIV_s:
     case FDIV_d:
       fdiv(vform, rd, rn, rm);
       break;
+    case FMAX_h:
     case FMAX_s:
     case FMAX_d:
       fmax(vform, rd, rn, rm);
       break;
+    case FMIN_h:
     case FMIN_s:
     case FMIN_d:
       fmin(vform, rd, rn, rm);
       break;
+    case FMAXNM_h:
     case FMAXNM_s:
     case FMAXNM_d:
       fmaxnm(vform, rd, rn, rm);
       break;
+    case FMINNM_h:
     case FMINNM_s:
     case FMINNM_d:
       fminnm(vform, rd, rn, rm);
@@ -3027,6 +3499,18 @@
 
   switch (instr->Mask(FPDataProcessing3SourceMask)) {
     // fd = fa +/- (fn * fm)
+    case FMADD_h:
+      WriteHRegister(fd,
+                     FPMulAdd(ReadHRegister(fa),
+                              ReadHRegister(fn),
+                              ReadHRegister(fm)));
+      break;
+    case FMSUB_h:
+      WriteHRegister(fd,
+                     FPMulAdd(ReadHRegister(fa),
+                              -ReadHRegister(fn),
+                              ReadHRegister(fm)));
+      break;
     case FMADD_s:
       WriteSRegister(fd,
                      FPMulAdd(ReadSRegister(fa),
@@ -3052,6 +3536,18 @@
                               ReadDRegister(fm)));
       break;
     // Negated variants of the above.
+    case FNMADD_h:
+      WriteHRegister(fd,
+                     FPMulAdd(-ReadHRegister(fa),
+                              -ReadHRegister(fn),
+                              ReadHRegister(fm)));
+      break;
+    case FNMSUB_h:
+      WriteHRegister(fd,
+                     FPMulAdd(-ReadHRegister(fa),
+                              ReadHRegister(fn),
+                              ReadHRegister(fm)));
+      break;
     case FNMADD_s:
       WriteSRegister(fd,
                      FPMulAdd(-ReadSRegister(fa),
@@ -3090,16 +3586,19 @@
 
   if (instr->Mask(FP64) == FP64) {
     double result = FPProcessNaNs(ReadDRegister(fn), ReadDRegister(fm));
-    if (std::isnan(result)) {
+    if (IsNaN(result)) {
       WriteDRegister(fd, result);
       done = true;
     }
-  } else {
+  } else if (instr->Mask(FP32) == FP32) {
     float result = FPProcessNaNs(ReadSRegister(fn), ReadSRegister(fm));
-    if (std::isnan(result)) {
+    if (IsNaN(result)) {
       WriteSRegister(fd, result);
       done = true;
     }
+  } else {
+    VIXL_ASSERT(instr->Mask(FP16) == FP16);
+    VIXL_UNIMPLEMENTED();
   }
 
   return done;
@@ -3125,11 +3624,43 @@
 }
 
 
+// clang-format off
+#define PAUTH_SYSTEM_MODES(V)                                     \
+  V(A1716, 17, ReadXRegister(16),                      kPACKeyIA) \
+  V(B1716, 17, ReadXRegister(16),                      kPACKeyIB) \
+  V(AZ,    30, 0x00000000,                             kPACKeyIA) \
+  V(BZ,    30, 0x00000000,                             kPACKeyIB) \
+  V(ASP,   30, ReadXRegister(31, Reg31IsStackPointer), kPACKeyIA) \
+  V(BSP,   30, ReadXRegister(31, Reg31IsStackPointer), kPACKeyIB)
+// clang-format on
+
+
 void Simulator::VisitSystem(const Instruction* instr) {
   // Some system instructions hijack their Op and Cp fields to represent a
   // range of immediates instead of indicating a different instruction. This
   // makes the decoding tricky.
-  if (instr->Mask(SystemExclusiveMonitorFMask) == SystemExclusiveMonitorFixed) {
+  if (instr->GetInstructionBits() == XPACLRI) {
+    WriteXRegister(30, StripPAC(ReadXRegister(30), kInstructionPointer));
+  } else if (instr->Mask(SystemPAuthFMask) == SystemPAuthFixed) {
+    switch (instr->Mask(SystemPAuthMask)) {
+#define DEFINE_PAUTH_FUNCS(SUFFIX, DST, MOD, KEY)                              \
+  case PACI##SUFFIX:                                                           \
+    WriteXRegister(DST,                                                        \
+                   AddPAC(ReadXRegister(DST), MOD, KEY, kInstructionPointer)); \
+    break;                                                                     \
+  case AUTI##SUFFIX:                                                           \
+    WriteXRegister(DST,                                                        \
+                   AuthPAC(ReadXRegister(DST),                                 \
+                           MOD,                                                \
+                           KEY,                                                \
+                           kInstructionPointer));                              \
+    break;
+
+      PAUTH_SYSTEM_MODES(DEFINE_PAUTH_FUNCS)
+#undef DEFINE_PAUTH_FUNCS
+    }
+  } else if (instr->Mask(SystemExclusiveMonitorFMask) ==
+             SystemExclusiveMonitorFixed) {
     VIXL_ASSERT(instr->Mask(SystemExclusiveMonitorMask) == CLREX);
     switch (instr->Mask(SystemExclusiveMonitorMask)) {
       case CLREX: {
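Editor's note on the X-macro used in the PAuth handling above: each PAUTH_SYSTEM_MODES entry expands through DEFINE_PAUTH_FUNCS into a PACI*/AUTI* case pair. Expanding the A1716 entry by hand gives roughly the following (formatting differs from the preprocessor output):

// Hand expansion of DEFINE_PAUTH_FUNCS(A1716, 17, ReadXRegister(16), kPACKeyIA).
case PACIA1716:
  WriteXRegister(17,
                 AddPAC(ReadXRegister(17), ReadXRegister(16), kPACKeyIA, kInstructionPointer));
  break;
case AUTIA1716:
  WriteXRegister(17,
                 AuthPAC(ReadXRegister(17), ReadXRegister(16), kPACKeyIA, kInstructionPointer));
  break;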
@@ -3173,6 +3704,7 @@
     VIXL_ASSERT(instr->Mask(SystemHintMask) == HINT);
     switch (instr->GetImmHint()) {
       case NOP:
+      case ESB:
       case CSDB:
         break;
       default:
@@ -3514,6 +4046,111 @@
 }
 
 
+void Simulator::VisitNEON2RegMiscFP16(const Instruction* instr) {
+  static const NEONFormatMap map_half = {{30}, {NF_4H, NF_8H}};
+  NEONFormatDecoder nfd(instr);
+  VectorFormat fpf = nfd.GetVectorFormat(&map_half);
+
+  FPRounding fpcr_rounding = static_cast<FPRounding>(ReadFpcr().GetRMode());
+
+  SimVRegister& rd = ReadVRegister(instr->GetRd());
+  SimVRegister& rn = ReadVRegister(instr->GetRn());
+
+  switch (instr->Mask(NEON2RegMiscFP16Mask)) {
+    case NEON_SCVTF_H:
+      scvtf(fpf, rd, rn, 0, fpcr_rounding);
+      return;
+    case NEON_UCVTF_H:
+      ucvtf(fpf, rd, rn, 0, fpcr_rounding);
+      return;
+    case NEON_FCVTNS_H:
+      fcvts(fpf, rd, rn, FPTieEven);
+      return;
+    case NEON_FCVTNU_H:
+      fcvtu(fpf, rd, rn, FPTieEven);
+      return;
+    case NEON_FCVTPS_H:
+      fcvts(fpf, rd, rn, FPPositiveInfinity);
+      return;
+    case NEON_FCVTPU_H:
+      fcvtu(fpf, rd, rn, FPPositiveInfinity);
+      return;
+    case NEON_FCVTMS_H:
+      fcvts(fpf, rd, rn, FPNegativeInfinity);
+      return;
+    case NEON_FCVTMU_H:
+      fcvtu(fpf, rd, rn, FPNegativeInfinity);
+      return;
+    case NEON_FCVTZS_H:
+      fcvts(fpf, rd, rn, FPZero);
+      return;
+    case NEON_FCVTZU_H:
+      fcvtu(fpf, rd, rn, FPZero);
+      return;
+    case NEON_FCVTAS_H:
+      fcvts(fpf, rd, rn, FPTieAway);
+      return;
+    case NEON_FCVTAU_H:
+      fcvtu(fpf, rd, rn, FPTieAway);
+      return;
+    case NEON_FRINTI_H:
+      frint(fpf, rd, rn, fpcr_rounding, false);
+      return;
+    case NEON_FRINTX_H:
+      frint(fpf, rd, rn, fpcr_rounding, true);
+      return;
+    case NEON_FRINTA_H:
+      frint(fpf, rd, rn, FPTieAway, false);
+      return;
+    case NEON_FRINTM_H:
+      frint(fpf, rd, rn, FPNegativeInfinity, false);
+      return;
+    case NEON_FRINTN_H:
+      frint(fpf, rd, rn, FPTieEven, false);
+      return;
+    case NEON_FRINTP_H:
+      frint(fpf, rd, rn, FPPositiveInfinity, false);
+      return;
+    case NEON_FRINTZ_H:
+      frint(fpf, rd, rn, FPZero, false);
+      return;
+    case NEON_FABS_H:
+      fabs_(fpf, rd, rn);
+      return;
+    case NEON_FNEG_H:
+      fneg(fpf, rd, rn);
+      return;
+    case NEON_FSQRT_H:
+      fsqrt(fpf, rd, rn);
+      return;
+    case NEON_FRSQRTE_H:
+      frsqrte(fpf, rd, rn);
+      return;
+    case NEON_FRECPE_H:
+      frecpe(fpf, rd, rn, fpcr_rounding);
+      return;
+    case NEON_FCMGT_H_zero:
+      fcmp_zero(fpf, rd, rn, gt);
+      return;
+    case NEON_FCMGE_H_zero:
+      fcmp_zero(fpf, rd, rn, ge);
+      return;
+    case NEON_FCMEQ_H_zero:
+      fcmp_zero(fpf, rd, rn, eq);
+      return;
+    case NEON_FCMLE_H_zero:
+      fcmp_zero(fpf, rd, rn, le);
+      return;
+    case NEON_FCMLT_H_zero:
+      fcmp_zero(fpf, rd, rn, lt);
+      return;
+    default:
+      VIXL_UNIMPLEMENTED();
+      return;
+  }
+}
+
+
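Editor's note: the map_half table used in VisitNEON2RegMiscFP16 above (and again in VisitNEONAcrossLanes and VisitNEONByIndexedElement below) keys the half-precision vector format off a single instruction bit.

// Decoding of map_half = {{30}, {NF_4H, NF_8H}}, shown for illustration:
//   bit 30 (Q) == 0  ->  4H: four 16-bit lanes in a 64-bit vector.
//   bit 30 (Q) == 1  ->  8H: eight 16-bit lanes in a 128-bit vector.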
 void Simulator::VisitNEON3Same(const Instruction* instr) {
   NEONFormatDecoder nfd(instr);
   SimVRegister& rd = ReadVRegister(instr->GetRd());
@@ -3773,6 +4410,59 @@
 }
 
 
+void Simulator::VisitNEON3SameFP16(const Instruction* instr) {
+  NEONFormatDecoder nfd(instr);
+  SimVRegister& rd = ReadVRegister(instr->GetRd());
+  SimVRegister& rn = ReadVRegister(instr->GetRn());
+  SimVRegister& rm = ReadVRegister(instr->GetRm());
+
+  VectorFormat vf = nfd.GetVectorFormat(nfd.FP16FormatMap());
+  switch (instr->Mask(NEON3SameFP16Mask)) {
+#define SIM_FUNC(A, B) \
+  case NEON_##A##_H:   \
+    B(vf, rd, rn, rm); \
+    break;
+    SIM_FUNC(FMAXNM, fmaxnm);
+    SIM_FUNC(FMLA, fmla);
+    SIM_FUNC(FADD, fadd);
+    SIM_FUNC(FMULX, fmulx);
+    SIM_FUNC(FMAX, fmax);
+    SIM_FUNC(FRECPS, frecps);
+    SIM_FUNC(FMINNM, fminnm);
+    SIM_FUNC(FMLS, fmls);
+    SIM_FUNC(FSUB, fsub);
+    SIM_FUNC(FMIN, fmin);
+    SIM_FUNC(FRSQRTS, frsqrts);
+    SIM_FUNC(FMAXNMP, fmaxnmp);
+    SIM_FUNC(FADDP, faddp);
+    SIM_FUNC(FMUL, fmul);
+    SIM_FUNC(FMAXP, fmaxp);
+    SIM_FUNC(FDIV, fdiv);
+    SIM_FUNC(FMINNMP, fminnmp);
+    SIM_FUNC(FABD, fabd);
+    SIM_FUNC(FMINP, fminp);
+#undef SIM_FUNC
+    case NEON_FCMEQ_H:
+      fcmp(vf, rd, rn, rm, eq);
+      break;
+    case NEON_FCMGE_H:
+      fcmp(vf, rd, rn, rm, ge);
+      break;
+    case NEON_FACGE_H:
+      fabscmp(vf, rd, rn, rm, ge);
+      break;
+    case NEON_FCMGT_H:
+      fcmp(vf, rd, rn, rm, gt);
+      break;
+    case NEON_FACGT_H:
+      fabscmp(vf, rd, rn, rm, gt);
+      break;
+    default:
+      VIXL_UNIMPLEMENTED();
+      break;
+  }
+}
+
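Editor's note: SIM_FUNC in VisitNEON3SameFP16 above is another local X-macro; each invocation becomes a single half-precision case. For example, SIM_FUNC(FADD, fadd) expands to:

// Hand expansion of SIM_FUNC(FADD, fadd), for illustration only.
case NEON_FADD_H:
  fadd(vf, rd, rn, rm);
  break;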
 void Simulator::VisitNEON3SameExtra(const Instruction* instr) {
   NEONFormatDecoder nfd(instr);
   SimVRegister& rd = ReadVRegister(instr->GetRd());
@@ -3983,11 +4673,31 @@
 void Simulator::VisitNEONAcrossLanes(const Instruction* instr) {
   NEONFormatDecoder nfd(instr);
 
+  static const NEONFormatMap map_half = {{30}, {NF_4H, NF_8H}};
+
   SimVRegister& rd = ReadVRegister(instr->GetRd());
   SimVRegister& rn = ReadVRegister(instr->GetRn());
 
-  // The input operand's VectorFormat is passed for these instructions.
-  if (instr->Mask(NEONAcrossLanesFPFMask) == NEONAcrossLanesFPFixed) {
+  if (instr->Mask(NEONAcrossLanesFP16FMask) == NEONAcrossLanesFP16Fixed) {
+    VectorFormat vf = nfd.GetVectorFormat(&map_half);
+    switch (instr->Mask(NEONAcrossLanesFP16Mask)) {
+      case NEON_FMAXV_H:
+        fmaxv(vf, rd, rn);
+        break;
+      case NEON_FMINV_H:
+        fminv(vf, rd, rn);
+        break;
+      case NEON_FMAXNMV_H:
+        fmaxnmv(vf, rd, rn);
+        break;
+      case NEON_FMINNMV_H:
+        fminnmv(vf, rd, rn);
+        break;
+      default:
+        VIXL_UNIMPLEMENTED();
+    }
+  } else if (instr->Mask(NEONAcrossLanesFPFMask) == NEONAcrossLanesFPFixed) {
+    // The input operand's VectorFormat is passed for these instructions.
     VectorFormat vf = nfd.GetVectorFormat(nfd.FPFormatMap());
 
     switch (instr->Mask(NEONAcrossLanesFPMask)) {
@@ -4040,7 +4750,9 @@
 
 void Simulator::VisitNEONByIndexedElement(const Instruction* instr) {
   NEONFormatDecoder nfd(instr);
+  static const NEONFormatMap map_half = {{30}, {NF_4H, NF_8H}};
   VectorFormat vf_r = nfd.GetVectorFormat();
+  VectorFormat vf_half = nfd.GetVectorFormat(&map_half);
   VectorFormat vf = nfd.GetVectorFormat(nfd.LongIntegerFormatMap());
 
   SimVRegister& rd = ReadVRegister(instr->GetRd());
@@ -4157,22 +4869,37 @@
       break;
     default:
       index = instr->GetNEONH();
-      if ((instr->GetFPType() & 1) == 0) {
+      if (instr->GetFPType() == 0) {
+        rm_reg &= 0xf;
+        index = (index << 2) | (instr->GetNEONL() << 1) | instr->GetNEONM();
+      } else if ((instr->GetFPType() & 1) == 0) {
         index = (index << 1) | instr->GetNEONL();
       }
 
       vf = nfd.GetVectorFormat(nfd.FPFormatMap());
 
       switch (instr->Mask(NEONByIndexedElementFPMask)) {
+        case NEON_FMUL_H_byelement:
+          vf = vf_half;
+          VIXL_FALLTHROUGH();
         case NEON_FMUL_byelement:
           Op = &Simulator::fmul;
           break;
+        case NEON_FMLA_H_byelement:
+          vf = vf_half;
+          VIXL_FALLTHROUGH();
         case NEON_FMLA_byelement:
           Op = &Simulator::fmla;
           break;
+        case NEON_FMLS_H_byelement:
+          vf = vf_half;
+          VIXL_FALLTHROUGH();
         case NEON_FMLS_byelement:
           Op = &Simulator::fmls;
           break;
+        case NEON_FMULX_H_byelement:
+          vf = vf_half;
+          VIXL_FALLTHROUGH();
         case NEON_FMULX_byelement:
           Op = &Simulator::fmulx;
           break;
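Editor's note on the half-precision by-element decode above: the lane index is assembled from the H, L and M bits, and the element register field is masked to v0-v15. A quick worked example, illustrative only:

// Example: an FMLA (by element) encoding with H = 1, L = 0, M = 1 gives
//   index = (H << 2) | (L << 1) | M = 0b101 = 5,
// and rm_reg &= 0xf keeps the element register within v0-v15 for the
// half-precision by-element forms.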
@@ -4710,7 +5437,7 @@
       } else {  // cmode_0 == 1, cmode == 0xf.
         if (half_enc == 1) {
           vform = q ? kFormat8H : kFormat4H;
-          imm = instr->GetImmNEONFP16();
+          imm = Float16ToRawbits(instr->GetImmNEONFP16());
         } else if (op_bit == 0) {
           vform = q ? kFormat4S : kFormat2S;
           imm = FloatToRawbits(instr->GetImmNEONFP32());
@@ -4911,6 +5638,78 @@
 }
 
 
+void Simulator::VisitNEONScalar2RegMiscFP16(const Instruction* instr) {
+  VectorFormat fpf = kFormatH;
+  FPRounding fpcr_rounding = static_cast<FPRounding>(ReadFpcr().GetRMode());
+
+  SimVRegister& rd = ReadVRegister(instr->GetRd());
+  SimVRegister& rn = ReadVRegister(instr->GetRn());
+
+  switch (instr->Mask(NEONScalar2RegMiscFP16Mask)) {
+    case NEON_FRECPE_H_scalar:
+      frecpe(fpf, rd, rn, fpcr_rounding);
+      break;
+    case NEON_FRECPX_H_scalar:
+      frecpx(fpf, rd, rn);
+      break;
+    case NEON_FRSQRTE_H_scalar:
+      frsqrte(fpf, rd, rn);
+      break;
+    case NEON_FCMGT_H_zero_scalar:
+      fcmp_zero(fpf, rd, rn, gt);
+      break;
+    case NEON_FCMGE_H_zero_scalar:
+      fcmp_zero(fpf, rd, rn, ge);
+      break;
+    case NEON_FCMEQ_H_zero_scalar:
+      fcmp_zero(fpf, rd, rn, eq);
+      break;
+    case NEON_FCMLE_H_zero_scalar:
+      fcmp_zero(fpf, rd, rn, le);
+      break;
+    case NEON_FCMLT_H_zero_scalar:
+      fcmp_zero(fpf, rd, rn, lt);
+      break;
+    case NEON_SCVTF_H_scalar:
+      scvtf(fpf, rd, rn, 0, fpcr_rounding);
+      break;
+    case NEON_UCVTF_H_scalar:
+      ucvtf(fpf, rd, rn, 0, fpcr_rounding);
+      break;
+    case NEON_FCVTNS_H_scalar:
+      fcvts(fpf, rd, rn, FPTieEven);
+      break;
+    case NEON_FCVTNU_H_scalar:
+      fcvtu(fpf, rd, rn, FPTieEven);
+      break;
+    case NEON_FCVTPS_H_scalar:
+      fcvts(fpf, rd, rn, FPPositiveInfinity);
+      break;
+    case NEON_FCVTPU_H_scalar:
+      fcvtu(fpf, rd, rn, FPPositiveInfinity);
+      break;
+    case NEON_FCVTMS_H_scalar:
+      fcvts(fpf, rd, rn, FPNegativeInfinity);
+      break;
+    case NEON_FCVTMU_H_scalar:
+      fcvtu(fpf, rd, rn, FPNegativeInfinity);
+      break;
+    case NEON_FCVTZS_H_scalar:
+      fcvts(fpf, rd, rn, FPZero);
+      break;
+    case NEON_FCVTZU_H_scalar:
+      fcvtu(fpf, rd, rn, FPZero);
+      break;
+    case NEON_FCVTAS_H_scalar:
+      fcvts(fpf, rd, rn, FPTieAway);
+      break;
+    case NEON_FCVTAU_H_scalar:
+      fcvtu(fpf, rd, rn, FPTieAway);
+      break;
+  }
+}
+
+
 void Simulator::VisitNEONScalar3Diff(const Instruction* instr) {
   NEONFormatDecoder nfd(instr, NEONFormatDecoder::LongScalarFormatMap());
   VectorFormat vf = nfd.GetVectorFormat();
@@ -5049,6 +5848,44 @@
   }
 }
 
+void Simulator::VisitNEONScalar3SameFP16(const Instruction* instr) {
+  SimVRegister& rd = ReadVRegister(instr->GetRd());
+  SimVRegister& rn = ReadVRegister(instr->GetRn());
+  SimVRegister& rm = ReadVRegister(instr->GetRm());
+
+  switch (instr->Mask(NEONScalar3SameFP16Mask)) {
+    case NEON_FABD_H_scalar:
+      fabd(kFormatH, rd, rn, rm);
+      break;
+    case NEON_FMULX_H_scalar:
+      fmulx(kFormatH, rd, rn, rm);
+      break;
+    case NEON_FCMEQ_H_scalar:
+      fcmp(kFormatH, rd, rn, rm, eq);
+      break;
+    case NEON_FCMGE_H_scalar:
+      fcmp(kFormatH, rd, rn, rm, ge);
+      break;
+    case NEON_FCMGT_H_scalar:
+      fcmp(kFormatH, rd, rn, rm, gt);
+      break;
+    case NEON_FACGE_H_scalar:
+      fabscmp(kFormatH, rd, rn, rm, ge);
+      break;
+    case NEON_FACGT_H_scalar:
+      fabscmp(kFormatH, rd, rn, rm, gt);
+      break;
+    case NEON_FRECPS_H_scalar:
+      frecps(kFormatH, rd, rn, rm);
+      break;
+    case NEON_FRSQRTS_H_scalar:
+      frsqrts(kFormatH, rd, rn, rm);
+      break;
+    default:
+      VIXL_UNREACHABLE();
+  }
+}
+
 
 void Simulator::VisitNEONScalar3SameExtra(const Instruction* instr) {
   NEONFormatDecoder nfd(instr, NEONFormatDecoder::ScalarFormatMap());
@@ -5115,19 +5952,27 @@
     default:
       vf = nfd.GetVectorFormat(nfd.FPScalarFormatMap());
       index = instr->GetNEONH();
-      if ((instr->GetFPType() & 1) == 0) {
+      if (instr->GetFPType() == 0) {
+        index = (index << 2) | (instr->GetNEONL() << 1) | instr->GetNEONM();
+        rm_reg &= 0xf;
+        vf = kFormatH;
+      } else if ((instr->GetFPType() & 1) == 0) {
         index = (index << 1) | instr->GetNEONL();
       }
       switch (instr->Mask(NEONScalarByIndexedElementFPMask)) {
+        case NEON_FMUL_H_byelement_scalar:
         case NEON_FMUL_byelement_scalar:
           Op = &Simulator::fmul;
           break;
+        case NEON_FMLA_H_byelement_scalar:
         case NEON_FMLA_byelement_scalar:
           Op = &Simulator::fmla;
           break;
+        case NEON_FMLS_H_byelement_scalar:
         case NEON_FMLS_byelement_scalar:
           Op = &Simulator::fmls;
           break;
+        case NEON_FMULX_H_byelement_scalar:
         case NEON_FMULX_byelement_scalar:
           Op = &Simulator::fmulx;
           break;
@@ -5159,27 +6004,36 @@
 
 
 void Simulator::VisitNEONScalarPairwise(const Instruction* instr) {
-  NEONFormatDecoder nfd(instr, NEONFormatDecoder::FPScalarFormatMap());
+  NEONFormatDecoder nfd(instr, NEONFormatDecoder::FPScalarPairwiseFormatMap());
   VectorFormat vf = nfd.GetVectorFormat();
 
   SimVRegister& rd = ReadVRegister(instr->GetRd());
   SimVRegister& rn = ReadVRegister(instr->GetRn());
   switch (instr->Mask(NEONScalarPairwiseMask)) {
-    case NEON_ADDP_scalar:
-      addp(vf, rd, rn);
+    case NEON_ADDP_scalar: {
+      // All pairwise operations except ADDP use bit U to differentiate FP16
+      // from FP32/FP64 variations.
+      NEONFormatDecoder nfd_addp(instr, NEONFormatDecoder::FPScalarFormatMap());
+      addp(nfd_addp.GetVectorFormat(), rd, rn);
       break;
+    }
+    case NEON_FADDP_h_scalar:
     case NEON_FADDP_scalar:
       faddp(vf, rd, rn);
       break;
+    case NEON_FMAXP_h_scalar:
     case NEON_FMAXP_scalar:
       fmaxp(vf, rd, rn);
       break;
+    case NEON_FMAXNMP_h_scalar:
     case NEON_FMAXNMP_scalar:
       fmaxnmp(vf, rd, rn);
       break;
+    case NEON_FMINP_h_scalar:
     case NEON_FMINP_scalar:
       fminp(vf, rd, rn);
       break;
+    case NEON_FMINNMP_h_scalar:
     case NEON_FMINNMP_scalar:
       fminnmp(vf, rd, rn);
       break;
diff --git a/src/aarch64/simulator-aarch64.h b/src/aarch64/simulator-aarch64.h
index f63f0c2..a411787 100644
--- a/src/aarch64/simulator-aarch64.h
+++ b/src/aarch64/simulator-aarch64.h
@@ -39,7 +39,6 @@
 #include "instructions-aarch64.h"
 #include "instrument-aarch64.h"
 #include "simulator-constants-aarch64.h"
-#include "utils-aarch64.h"
 
 #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
 
@@ -97,12 +96,11 @@
   // Write the specified value. The value is zero-extended if necessary.
   template <typename T>
   void Write(T new_value) {
-    VIXL_STATIC_ASSERT(sizeof(new_value) <= kSizeInBytes);
     if (sizeof(new_value) < kSizeInBytes) {
       // All AArch64 registers are zero-extending.
       memset(value_ + sizeof(new_value), 0, kSizeInBytes - sizeof(new_value));
     }
-    memcpy(value_, &new_value, sizeof(new_value));
+    WriteLane(new_value, 0);
     NotifyRegisterWrite();
   }
   template <typename T>
@@ -116,10 +114,7 @@
   // 0 represents the least significant bits.
   template <typename T>
   void Insert(int lane, T new_value) {
-    VIXL_ASSERT(lane >= 0);
-    VIXL_ASSERT((sizeof(new_value) + (lane * sizeof(new_value))) <=
-                kSizeInBytes);
-    memcpy(&value_[lane * sizeof(new_value)], &new_value, sizeof(new_value));
+    WriteLane(new_value, lane);
     NotifyRegisterWrite();
   }
 
@@ -134,9 +129,7 @@
   template <typename T>
   T GetLane(int lane) const {
     T result;
-    VIXL_ASSERT(lane >= 0);
-    VIXL_ASSERT((sizeof(result) + (lane * sizeof(result))) <= kSizeInBytes);
-    memcpy(&result, &value_[lane * sizeof(result)], sizeof(result));
+    ReadLane(&result, lane);
     return result;
   }
   template <typename T>
@@ -158,10 +151,44 @@
   bool written_since_last_log_;
 
   void NotifyRegisterWrite() { written_since_last_log_ = true; }
+
+ private:
+  template <typename T>
+  void ReadLane(T* dst, int lane) const {
+    VIXL_ASSERT(lane >= 0);
+    VIXL_ASSERT((sizeof(*dst) + (lane * sizeof(*dst))) <= kSizeInBytes);
+    memcpy(dst, &value_[lane * sizeof(*dst)], sizeof(*dst));
+  }
+
+  template <typename T>
+  void WriteLane(T src, int lane) {
+    VIXL_ASSERT(lane >= 0);
+    VIXL_ASSERT((sizeof(src) + (lane * sizeof(src))) <= kSizeInBytes);
+    memcpy(&value_[lane * sizeof(src)], &src, sizeof(src));
+  }
 };
 typedef SimRegisterBase<kXRegSizeInBytes> SimRegister;   // r0-r31
 typedef SimRegisterBase<kQRegSizeInBytes> SimVRegister;  // v0-v31
 
+// The default ReadLane and WriteLane methods use memcpy, and so assume that
+// the value being copied is trivially copyable. We have to provide alternative
+// implementations for SimFloat16, which cannot be copied this way.
+
+template <>
+template <>
+inline void SimVRegister::ReadLane(vixl::internal::SimFloat16* dst,
+                                   int lane) const {
+  uint16_t rawbits;
+  ReadLane(&rawbits, lane);
+  *dst = RawbitsToFloat16(rawbits);
+}
+
+template <>
+template <>
+inline void SimVRegister::WriteLane(vixl::internal::SimFloat16 src, int lane) {
+  WriteLane(Float16ToRawbits(src), lane);
+}
+
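Editor's note: a minimal round-trip sketch of the specialisations above, assuming the Insert/GetLane interface shown earlier and writing names unqualified as they would appear inside the vixl::aarch64 namespace; the helper name is made up for illustration.

// Sketch only: half-precision lanes round-trip through the specialisations above.
void Float16LaneRoundTripSketch(SimVRegister& vreg) {
  vixl::internal::SimFloat16 in(1.5);  // Constructed from a literal, as in FCMP_h_zero.
  vreg.Insert(2, in);                  // Dispatches to the SimFloat16 WriteLane specialisation.
  vixl::internal::SimFloat16 out =
      vreg.GetLane<vixl::internal::SimFloat16>(2);  // Dispatches to ReadLane.
  VIXL_ASSERT(Float16ToRawbits(out) == Float16ToRawbits(in));
}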
 // Representation of a vector register, with typed getters and setters for lanes
 // and additional information to represent lane state.
 class LogicVRegister {
@@ -169,10 +196,10 @@
   inline LogicVRegister(
       SimVRegister& other)  // NOLINT(runtime/references)(runtime/explicit)
       : register_(other) {
-    for (unsigned i = 0; i < sizeof(saturated_) / sizeof(saturated_[0]); i++) {
+    for (size_t i = 0; i < ArrayLength(saturated_); i++) {
       saturated_[i] = kNotSaturated;
     }
-    for (unsigned i = 0; i < sizeof(round_) / sizeof(round_[0]); i++) {
+    for (size_t i = 0; i < ArrayLength(round_); i++) {
       round_[i] = 0;
     }
   }
@@ -977,11 +1004,11 @@
     return ReadBRegister(code);
   }
 
-  int16_t ReadHRegister(unsigned code) const {
-    return ReadVRegister<int16_t>(code);
+  vixl::internal::SimFloat16 ReadHRegister(unsigned code) const {
+    return RawbitsToFloat16(ReadHRegisterBits(code));
   }
   VIXL_DEPRECATED("ReadHRegister", int16_t hreg(unsigned code) const) {
-    return ReadHRegister(code);
+    return Float16ToRawbits(ReadHRegister(code));
   }
 
   uint16_t ReadHRegisterBits(unsigned code) const {
@@ -1098,6 +1125,12 @@
   }
 
   void WriteHRegister(unsigned code,
+                      vixl::internal::SimFloat16 value,
+                      RegLogMode log_mode = LogRegWrites) {
+    WriteVRegister(code, Float16ToRawbits(value), log_mode);
+  }
+
+  void WriteHRegister(unsigned code,
                       int16_t value,
                       RegLogMode log_mode = LogRegWrites) {
     WriteVRegister(code, value, log_mode);
@@ -1397,9 +1430,9 @@
     return GetPrintRegisterFormatForSizeFP(sizeof(value));
   }
 
-  PrintRegisterFormat GetPrintRegisterFormat(float16 value) {
-    VIXL_STATIC_ASSERT(sizeof(value) == kHRegSizeInBytes);
-    return GetPrintRegisterFormatForSizeFP(sizeof(value));
+  PrintRegisterFormat GetPrintRegisterFormat(Float16 value) {
+    VIXL_STATIC_ASSERT(sizeof(Float16ToRawbits(value)) == kHRegSizeInBytes);
+    return GetPrintRegisterFormatForSizeFP(sizeof(Float16ToRawbits(value)));
   }
 
   PrintRegisterFormat GetPrintRegisterFormat(VectorFormat vform);
@@ -1552,6 +1585,44 @@
     print_exclusive_access_warning_ = false;
   }
 
+  enum PointerType { kDataPointer, kInstructionPointer };
+
+  struct PACKey {
+    uint64_t high;
+    uint64_t low;
+    int number;
+  };
+
+  // The current implementation treats all pointers as tagged.
+  bool HasTBI(uint64_t ptr, PointerType type) {
+    USE(ptr, type);
+    return true;
+  }
+
+  // The current implementation uses 48-bit virtual addresses.
+  int GetBottomPACBit(uint64_t ptr, int ttbr) {
+    USE(ptr, ttbr);
+    VIXL_ASSERT((ttbr == 0) || (ttbr == 1));
+    return 48;
+  }
+
+  // The top PAC bit is 55 for the purposes of relative bit fields with TBI.
+  // However, bit 55 selects the TTBR regardless of TBI, so it is not part of
+  // the PAC code stored in the pointer.
+  int GetTopPACBit(uint64_t ptr, PointerType type) {
+    return HasTBI(ptr, type) ? 55 : 63;
+  }
+
+  // Armv8.3 Pointer authentication helpers.
+  uint64_t CalculatePACMask(uint64_t ptr, PointerType type, int ext_bit);
+  uint64_t ComputePAC(uint64_t data, uint64_t context, PACKey key);
+  uint64_t AuthPAC(uint64_t ptr,
+                   uint64_t context,
+                   PACKey key,
+                   PointerType type);
+  uint64_t AddPAC(uint64_t ptr, uint64_t context, PACKey key, PointerType type);
+  uint64_t StripPAC(uint64_t ptr, PointerType type);
+
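Editor's note: CalculatePACMask is only declared above. As a hedged sketch of what the surrounding helpers imply (not the VIXL definition), the PAC code can occupy the pointer bits between GetBottomPACBit() and GetTopPACBit(), except bit 55, which selects the TTBR.

#include <cstdint>

// Illustrative sketch only, not the VIXL implementation of CalculatePACMask().
uint64_t PACMaskSketch(int bottom_pac_bit, int top_pac_bit) {
  uint64_t mask = 0;
  for (int bit = bottom_pac_bit; bit <= top_pac_bit; bit++) {
    if (bit == 55) continue;  // TTBR select bit; never part of the PAC code.
    mask |= UINT64_C(1) << bit;
  }
  return mask;
}
// With the 48-bit, TBI-enabled model above, PACMaskSketch(48, 55) covers bits
// 48-54; without TBI, PACMaskSketch(48, 63) additionally covers bits 56-63.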
   // The common CPUFeatures interface with the set of available features.
 
   CPUFeatures* GetCPUFeatures() {
@@ -1740,6 +1811,12 @@
   void CompareAndSwapHelper(const Instruction* instr);
   template <typename T>
   void CompareAndSwapPairHelper(const Instruction* instr);
+  template <typename T>
+  void AtomicMemorySimpleHelper(const Instruction* instr);
+  template <typename T>
+  void AtomicMemorySwapHelper(const Instruction* instr);
+  template <typename T>
+  void LoadAcquireRCpcHelper(const Instruction* instr);
   uintptr_t AddressModeHelper(unsigned addr_reg,
                               int64_t offset,
                               AddrMode addrmode);
@@ -2891,12 +2968,16 @@
                         LogicVRegister dst,
                         const LogicVRegister& src);
 
-  typedef float (Simulator::*FPMinMaxOp)(float a, float b);
+  template <typename T>
+  struct TFPMinMaxOp {
+    typedef T (Simulator::*type)(T a, T b);
+  };
 
+  template <typename T>
   LogicVRegister fminmaxv(VectorFormat vform,
                           LogicVRegister dst,
                           const LogicVRegister& src,
-                          FPMinMaxOp Op);
+                          typename TFPMinMaxOp<T>::type Op);
 
   LogicVRegister fminv(VectorFormat vform,
                        LogicVRegister dst,
@@ -2937,10 +3018,19 @@
   double UFixedToDouble(uint64_t src, int fbits, FPRounding round_mode);
   float FixedToFloat(int64_t src, int fbits, FPRounding round_mode);
   float UFixedToFloat(uint64_t src, int fbits, FPRounding round_mode);
+  ::vixl::internal::SimFloat16 FixedToFloat16(int64_t src,
+                                              int fbits,
+                                              FPRounding round_mode);
+  ::vixl::internal::SimFloat16 UFixedToFloat16(uint64_t src,
+                                               int fbits,
+                                               FPRounding round_mode);
+  int16_t FPToInt16(double value, FPRounding rmode);
   int32_t FPToInt32(double value, FPRounding rmode);
   int64_t FPToInt64(double value, FPRounding rmode);
+  uint16_t FPToUInt16(double value, FPRounding rmode);
   uint32_t FPToUInt32(double value, FPRounding rmode);
   uint64_t FPToUInt64(double value, FPRounding rmode);
+  int32_t FPToFixedJS(double value);
 
   template <typename T>
   T FPAdd(T op1, T op2);
@@ -3078,13 +3168,19 @@
   static const char* vreg_names[];
 
  private:
+  static const PACKey kPACKeyIA;
+  static const PACKey kPACKeyIB;
+  static const PACKey kPACKeyDA;
+  static const PACKey kPACKeyDB;
+  static const PACKey kPACKeyGA;
+
   template <typename T>
   static T FPDefaultNaN();
 
   // Standard NaN processing.
   template <typename T>
   T FPProcessNaN(T op) {
-    VIXL_ASSERT(std::isnan(op));
+    VIXL_ASSERT(IsNaN(op));
     if (IsSignallingNaN(op)) {
       FPProcessException();
     }
@@ -3097,10 +3193,10 @@
       return FPProcessNaN(op1);
     } else if (IsSignallingNaN(op2)) {
       return FPProcessNaN(op2);
-    } else if (std::isnan(op1)) {
+    } else if (IsNaN(op1)) {
       VIXL_ASSERT(IsQuietNaN(op1));
       return FPProcessNaN(op1);
-    } else if (std::isnan(op2)) {
+    } else if (IsNaN(op2)) {
       VIXL_ASSERT(IsQuietNaN(op2));
       return FPProcessNaN(op2);
     } else {
@@ -3116,13 +3212,13 @@
       return FPProcessNaN(op2);
     } else if (IsSignallingNaN(op3)) {
       return FPProcessNaN(op3);
-    } else if (std::isnan(op1)) {
+    } else if (IsNaN(op1)) {
       VIXL_ASSERT(IsQuietNaN(op1));
       return FPProcessNaN(op1);
-    } else if (std::isnan(op2)) {
+    } else if (IsNaN(op2)) {
       VIXL_ASSERT(IsQuietNaN(op2));
       return FPProcessNaN(op2);
-    } else if (std::isnan(op3)) {
+    } else if (IsNaN(op3)) {
       VIXL_ASSERT(IsQuietNaN(op3));
       return FPProcessNaN(op3);
     } else {
diff --git a/src/aarch64/utils-aarch64.cc b/src/aarch64/utils-aarch64.cc
deleted file mode 100644
index 4873add..0000000
--- a/src/aarch64/utils-aarch64.cc
+++ /dev/null
@@ -1,311 +0,0 @@
-// Copyright 2018, VIXL authors
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-//   * Redistributions of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//   * Redistributions in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//   * Neither the name of ARM Limited nor the names of its contributors may be
-//     used to endorse or promote products derived from this software without
-//     specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND
-// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
-// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-#include "utils-aarch64.h"
-
-namespace vixl {
-namespace aarch64 {
-
-float FPToFloat(float16 value, UseDefaultNaN DN, bool* exception) {
-  uint32_t sign = value >> 15;
-  uint32_t exponent =
-      ExtractUnsignedBitfield32(kFloat16MantissaBits + kFloat16ExponentBits - 1,
-                                kFloat16MantissaBits,
-                                value);
-  uint32_t mantissa =
-      ExtractUnsignedBitfield32(kFloat16MantissaBits - 1, 0, value);
-
-  switch (Float16Classify(value)) {
-    case FP_ZERO:
-      return (sign == 0) ? 0.0f : -0.0f;
-
-    case FP_INFINITE:
-      return (sign == 0) ? kFP32PositiveInfinity : kFP32NegativeInfinity;
-
-    case FP_SUBNORMAL: {
-      // Calculate shift required to put mantissa into the most-significant bits
-      // of the destination mantissa.
-      int shift = CountLeadingZeros(mantissa << (32 - 10));
-
-      // Shift mantissa and discard implicit '1'.
-      mantissa <<= (kFloatMantissaBits - kFloat16MantissaBits) + shift + 1;
-      mantissa &= (1 << kFloatMantissaBits) - 1;
-
-      // Adjust the exponent for the shift applied, and rebias.
-      exponent = exponent - shift + (-15 + 127);
-      break;
-    }
-
-    case FP_NAN:
-      if (IsSignallingNaN(value)) {
-        if (exception != NULL) {
-          *exception = true;
-        }
-      }
-      if (DN == kUseDefaultNaN) return kFP32DefaultNaN;
-
-      // Convert NaNs as the processor would:
-      //  - The sign is propagated.
-      //  - The payload (mantissa) is transferred entirely, except that the top
-      //    bit is forced to '1', making the result a quiet NaN. The unused
-      //    (low-order) payload bits are set to 0.
-      exponent = (1 << kFloatExponentBits) - 1;
-
-      // Increase bits in mantissa, making low-order bits 0.
-      mantissa <<= (kFloatMantissaBits - kFloat16MantissaBits);
-      mantissa |= 1 << 22;  // Force a quiet NaN.
-      break;
-
-    case FP_NORMAL:
-      // Increase bits in mantissa, making low-order bits 0.
-      mantissa <<= (kFloatMantissaBits - kFloat16MantissaBits);
-
-      // Change exponent bias.
-      exponent += (-15 + 127);
-      break;
-
-    default:
-      VIXL_UNREACHABLE();
-  }
-  return RawbitsToFloat((sign << 31) | (exponent << kFloatMantissaBits) |
-                        mantissa);
-}
-
-
-float FPToFloat(double value,
-                FPRounding round_mode,
-                UseDefaultNaN DN,
-                bool* exception) {
-  // Only the FPTieEven rounding mode is implemented.
-  VIXL_ASSERT((round_mode == FPTieEven) || (round_mode == FPRoundOdd));
-  USE(round_mode);
-
-  switch (std::fpclassify(value)) {
-    case FP_NAN: {
-      if (IsSignallingNaN(value)) {
-        if (exception != NULL) {
-          *exception = true;
-        }
-      }
-      if (DN == kUseDefaultNaN) return kFP32DefaultNaN;
-
-      // Convert NaNs as the processor would:
-      //  - The sign is propagated.
-      //  - The payload (mantissa) is transferred as much as possible, except
-      //    that the top bit is forced to '1', making the result a quiet NaN.
-      uint64_t raw = DoubleToRawbits(value);
-
-      uint32_t sign = raw >> 63;
-      uint32_t exponent = (1 << 8) - 1;
-      uint32_t payload =
-          static_cast<uint32_t>(ExtractUnsignedBitfield64(50, 52 - 23, raw));
-      payload |= (1 << 22);  // Force a quiet NaN.
-
-      return RawbitsToFloat((sign << 31) | (exponent << 23) | payload);
-    }
-
-    case FP_ZERO:
-    case FP_INFINITE: {
-      // In a C++ cast, any value representable in the target type will be
-      // unchanged. This is always the case for +/-0.0 and infinities.
-      return static_cast<float>(value);
-    }
-
-    case FP_NORMAL:
-    case FP_SUBNORMAL: {
-      // Convert double-to-float as the processor would, assuming that FPCR.FZ
-      // (flush-to-zero) is not set.
-      uint64_t raw = DoubleToRawbits(value);
-      // Extract the IEEE-754 double components.
-      uint32_t sign = raw >> 63;
-      // Extract the exponent and remove the IEEE-754 encoding bias.
-      int32_t exponent =
-          static_cast<int32_t>(ExtractUnsignedBitfield64(62, 52, raw)) - 1023;
-      // Extract the mantissa and add the implicit '1' bit.
-      uint64_t mantissa = ExtractUnsignedBitfield64(51, 0, raw);
-      if (std::fpclassify(value) == FP_NORMAL) {
-        mantissa |= (UINT64_C(1) << 52);
-      }
-      return FPRoundToFloat(sign, exponent, mantissa, round_mode);
-    }
-  }
-
-  VIXL_UNREACHABLE();
-  return value;
-}
-
-
-double FPToDouble(float value, UseDefaultNaN DN, bool* exception) {
-  switch (std::fpclassify(value)) {
-    case FP_NAN: {
-      if (IsSignallingNaN(value)) {
-        if (exception != NULL) {
-          *exception = true;
-        }
-      }
-      if (DN == kUseDefaultNaN) return kFP64DefaultNaN;
-
-      // Convert NaNs as the processor would:
-      //  - The sign is propagated.
-      //  - The payload (mantissa) is transferred entirely, except that the top
-      //    bit is forced to '1', making the result a quiet NaN. The unused
-      //    (low-order) payload bits are set to 0.
-      uint32_t raw = FloatToRawbits(value);
-
-      uint64_t sign = raw >> 31;
-      uint64_t exponent = (1 << 11) - 1;
-      uint64_t payload = ExtractUnsignedBitfield64(21, 0, raw);
-      payload <<= (52 - 23);           // The unused low-order bits should be 0.
-      payload |= (UINT64_C(1) << 51);  // Force a quiet NaN.
-
-      return RawbitsToDouble((sign << 63) | (exponent << 52) | payload);
-    }
-
-    case FP_ZERO:
-    case FP_NORMAL:
-    case FP_SUBNORMAL:
-    case FP_INFINITE: {
-      // All other inputs are preserved in a standard cast, because every value
-      // representable using an IEEE-754 float is also representable using an
-      // IEEE-754 double.
-      return static_cast<double>(value);
-    }
-  }
-
-  VIXL_UNREACHABLE();
-  return static_cast<double>(value);
-}
-
-
-float16 FPToFloat16(float value,
-                    FPRounding round_mode,
-                    UseDefaultNaN DN,
-                    bool* exception) {
-  // Only the FPTieEven rounding mode is implemented.
-  VIXL_ASSERT(round_mode == FPTieEven);
-  USE(round_mode);
-
-  uint32_t raw = FloatToRawbits(value);
-  int32_t sign = raw >> 31;
-  int32_t exponent = ExtractUnsignedBitfield32(30, 23, raw) - 127;
-  uint32_t mantissa = ExtractUnsignedBitfield32(22, 0, raw);
-
-  switch (std::fpclassify(value)) {
-    case FP_NAN: {
-      if (IsSignallingNaN(value)) {
-        if (exception != NULL) {
-          *exception = true;
-        }
-      }
-      if (DN == kUseDefaultNaN) return kFP16DefaultNaN;
-
-      // Convert NaNs as the processor would:
-      //  - The sign is propagated.
-      //  - The payload (mantissa) is transferred as much as possible, except
-      //    that the top bit is forced to '1', making the result a quiet NaN.
-      float16 result =
-          (sign == 0) ? kFP16PositiveInfinity : kFP16NegativeInfinity;
-      result |= mantissa >> (kFloatMantissaBits - kFloat16MantissaBits);
-      result |= (1 << 9);  // Force a quiet NaN;
-      return result;
-    }
-
-    case FP_ZERO:
-      return (sign == 0) ? 0 : 0x8000;
-
-    case FP_INFINITE:
-      return (sign == 0) ? kFP16PositiveInfinity : kFP16NegativeInfinity;
-
-    case FP_NORMAL:
-    case FP_SUBNORMAL: {
-      // Convert float-to-half as the processor would, assuming that FPCR.FZ
-      // (flush-to-zero) is not set.
-
-      // Add the implicit '1' bit to the mantissa.
-      mantissa += (1 << 23);
-      return FPRoundToFloat16(sign, exponent, mantissa, round_mode);
-    }
-  }
-
-  VIXL_UNREACHABLE();
-  return 0;
-}
-
-
-float16 FPToFloat16(double value,
-                    FPRounding round_mode,
-                    UseDefaultNaN DN,
-                    bool* exception) {
-  // Only the FPTieEven rounding mode is implemented.
-  VIXL_ASSERT(round_mode == FPTieEven);
-  USE(round_mode);
-
-  uint64_t raw = DoubleToRawbits(value);
-  int32_t sign = raw >> 63;
-  int64_t exponent = ExtractUnsignedBitfield64(62, 52, raw) - 1023;
-  uint64_t mantissa = ExtractUnsignedBitfield64(51, 0, raw);
-
-  switch (std::fpclassify(value)) {
-    case FP_NAN: {
-      if (IsSignallingNaN(value)) {
-        if (exception != NULL) {
-          *exception = true;
-        }
-      }
-      if (DN == kUseDefaultNaN) return kFP16DefaultNaN;
-
-      // Convert NaNs as the processor would:
-      //  - The sign is propagated.
-      //  - The payload (mantissa) is transferred as much as possible, except
-      //    that the top bit is forced to '1', making the result a quiet NaN.
-      float16 result =
-          (sign == 0) ? kFP16PositiveInfinity : kFP16NegativeInfinity;
-      result |= mantissa >> (kDoubleMantissaBits - kFloat16MantissaBits);
-      result |= (1 << 9);  // Force a quiet NaN;
-      return result;
-    }
-
-    case FP_ZERO:
-      return (sign == 0) ? 0 : 0x8000;
-
-    case FP_INFINITE:
-      return (sign == 0) ? kFP16PositiveInfinity : kFP16NegativeInfinity;
-    case FP_NORMAL:
-    case FP_SUBNORMAL: {
-      // Convert double-to-half as the processor would, assuming that FPCR.FZ
-      // (flush-to-zero) is not set.
-
-      // Add the implicit '1' bit to the mantissa.
-      mantissa += (UINT64_C(1) << 52);
-      return FPRoundToFloat16(sign, exponent, mantissa, round_mode);
-    }
-  }
-
-  VIXL_UNREACHABLE();
-  return 0;
-}
-}  // namespace aarch64
-}  // namespace vixl
diff --git a/src/aarch64/utils-aarch64.h b/src/aarch64/utils-aarch64.h
deleted file mode 100644
index d714516..0000000
--- a/src/aarch64/utils-aarch64.h
+++ /dev/null
@@ -1,328 +0,0 @@
-// Copyright 2018, VIXL authors
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-//   * Redistributions of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//   * Redistributions in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//   * Neither the name of ARM Limited nor the names of its contributors may be
-//     used to endorse or promote products derived from this software without
-//     specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND
-// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
-// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-#ifndef VIXL_AARCH64_UTILS_AARCH64_H_
-#define VIXL_AARCH64_UTILS_AARCH64_H_
-
-#include <limits>
-
-#include "instructions-aarch64.h"
-
-namespace vixl {
-namespace aarch64 {
-
-enum UseDefaultNaN { kUseDefaultNaN, kIgnoreDefaultNaN };
-
-// Assemble the specified IEEE-754 components into the target type and apply
-// appropriate rounding.
-//  sign:     0 = positive, 1 = negative
-//  exponent: Unbiased IEEE-754 exponent.
-//  mantissa: The mantissa of the input. The top bit (which is not encoded for
-//            normal IEEE-754 values) must not be omitted. This bit has the
-//            value 'pow(2, exponent)'.
-//
-// The input value is assumed to be a normalized value. That is, the input may
-// not be infinity or NaN. If the source value is subnormal, it must be
-// normalized before calling this function such that the highest set bit in the
-// mantissa has the value 'pow(2, exponent)'.
-//
-// Callers should use FPRoundToFloat or FPRoundToDouble directly, rather than
-// calling a templated FPRound.
-template <class T, int ebits, int mbits>
-T FPRound(int64_t sign,
-          int64_t exponent,
-          uint64_t mantissa,
-          FPRounding round_mode) {
-  VIXL_ASSERT((sign == 0) || (sign == 1));
-
-  // Only FPTieEven and FPRoundOdd rounding modes are implemented.
-  VIXL_ASSERT((round_mode == FPTieEven) || (round_mode == FPRoundOdd));
-
-  // Rounding can promote subnormals to normals, and normals to infinities. For
-  // example, a double with exponent 127 (FLT_MAX_EXP) would appear to be
-  // encodable as a float, but rounding based on the low-order mantissa bits
-  // could make it overflow. With ties-to-even rounding, this value would become
-  // an infinity.
-
-  // ---- Rounding Method ----
-  //
-  // The exponent is irrelevant in the rounding operation, so we treat the
-  // lowest-order bit that will fit into the result ('onebit') as having
-  // the value '1'. Similarly, the highest-order bit that won't fit into
-  // the result ('halfbit') has the value '0.5'. The 'point' sits between
-  // 'onebit' and 'halfbit':
-  //
-  //            These bits fit into the result.
-  //               |---------------------|
-  //  mantissa = 0bxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
-  //                                     ||
-  //                                    / |
-  //                                   /  halfbit
-  //                               onebit
-  //
-  // For subnormal outputs, the range of representable bits is smaller and
-  // the position of onebit and halfbit depends on the exponent of the
-  // input, but the method is otherwise similar.
-  //
-  //   onebit(frac)
-  //     |
-  //     | halfbit(frac)          halfbit(adjusted)
-  //     | /                      /
-  //     | |                      |
-  //  0b00.0 (exact)      -> 0b00.0 (exact)                    -> 0b00
-  //  0b00.0...           -> 0b00.0...                         -> 0b00
-  //  0b00.1 (exact)      -> 0b00.0111..111                    -> 0b00
-  //  0b00.1...           -> 0b00.1...                         -> 0b01
-  //  0b01.0 (exact)      -> 0b01.0 (exact)                    -> 0b01
-  //  0b01.0...           -> 0b01.0...                         -> 0b01
-  //  0b01.1 (exact)      -> 0b01.1 (exact)                    -> 0b10
-  //  0b01.1...           -> 0b01.1...                         -> 0b10
-  //  0b10.0 (exact)      -> 0b10.0 (exact)                    -> 0b10
-  //  0b10.0...           -> 0b10.0...                         -> 0b10
-  //  0b10.1 (exact)      -> 0b10.0111..111                    -> 0b10
-  //  0b10.1...           -> 0b10.1...                         -> 0b11
-  //  0b11.0 (exact)      -> 0b11.0 (exact)                    -> 0b11
-  //  ...                   /             |                      /   |
-  //                       /              |                     /    |
-  //                                                           /     |
-  // adjusted = frac - (halfbit(mantissa) & ~onebit(frac));   /      |
-  //
-  //                   mantissa = (mantissa >> shift) + halfbit(adjusted);
-
-  static const int mantissa_offset = 0;
-  static const int exponent_offset = mantissa_offset + mbits;
-  static const int sign_offset = exponent_offset + ebits;
-  VIXL_ASSERT(sign_offset == (sizeof(T) * 8 - 1));
-
-  // Bail out early for zero inputs.
-  if (mantissa == 0) {
-    return static_cast<T>(sign << sign_offset);
-  }
-
-  // If all bits in the exponent are set, the value is infinite or NaN.
-  // This is true for all binary IEEE-754 formats.
-  static const int infinite_exponent = (1 << ebits) - 1;
-  static const int max_normal_exponent = infinite_exponent - 1;
-
-  // Apply the exponent bias to encode it for the result. Doing this early makes
-  // it easy to detect values that will be infinite or subnormal.
-  exponent += max_normal_exponent >> 1;
-
-  if (exponent > max_normal_exponent) {
-    // Overflow: the input is too large for the result type to represent.
-    if (round_mode == FPTieEven) {
-      // FPTieEven rounding mode handles overflows using infinities.
-      exponent = infinite_exponent;
-      mantissa = 0;
-    } else {
-      VIXL_ASSERT(round_mode == FPRoundOdd);
-      // FPRoundOdd rounding mode handles overflows using the largest magnitude
-      // normal number.
-      exponent = max_normal_exponent;
-      mantissa = (UINT64_C(1) << exponent_offset) - 1;
-    }
-    return static_cast<T>((sign << sign_offset) |
-                          (exponent << exponent_offset) |
-                          (mantissa << mantissa_offset));
-  }
-
-  // Calculate the shift required to move the top mantissa bit to the proper
-  // place in the destination type.
-  const int highest_significant_bit = 63 - CountLeadingZeros(mantissa);
-  int shift = highest_significant_bit - mbits;
-
-  if (exponent <= 0) {
-    // The output will be subnormal (before rounding).
-    // For subnormal outputs, the shift must be adjusted by the exponent. The +1
-    // is necessary because the exponent of a subnormal value (encoded as 0) is
-    // the same as the exponent of the smallest normal value (encoded as 1).
-    shift += -exponent + 1;
-
-    // Handle inputs that would produce a zero output.
-    //
-    // Shifts higher than highest_significant_bit+1 will always produce a zero
-    // result. A shift of exactly highest_significant_bit+1 might produce a
-    // non-zero result after rounding.
-    if (shift > (highest_significant_bit + 1)) {
-      if (round_mode == FPTieEven) {
-        // The result will always be +/-0.0.
-        return static_cast<T>(sign << sign_offset);
-      } else {
-        VIXL_ASSERT(round_mode == FPRoundOdd);
-        VIXL_ASSERT(mantissa != 0);
-        // For FPRoundOdd, if the mantissa is too small to represent and
-        // non-zero return the next "odd" value.
-        return static_cast<T>((sign << sign_offset) | 1);
-      }
-    }
-
-    // Properly encode the exponent for a subnormal output.
-    exponent = 0;
-  } else {
-    // Clear the topmost mantissa bit, since this is not encoded in IEEE-754
-    // normal values.
-    mantissa &= ~(UINT64_C(1) << highest_significant_bit);
-  }
-
-  // The casts below are only well-defined for unsigned integers.
-  VIXL_STATIC_ASSERT(std::numeric_limits<T>::is_integer);
-  VIXL_STATIC_ASSERT(!std::numeric_limits<T>::is_signed);
-
-  if (shift > 0) {
-    if (round_mode == FPTieEven) {
-      // We have to shift the mantissa to the right. Some precision is lost, so
-      // we need to apply rounding.
-      uint64_t onebit_mantissa = (mantissa >> (shift)) & 1;
-      uint64_t halfbit_mantissa = (mantissa >> (shift - 1)) & 1;
-      uint64_t adjustment = (halfbit_mantissa & ~onebit_mantissa);
-      uint64_t adjusted = mantissa - adjustment;
-      T halfbit_adjusted = (adjusted >> (shift - 1)) & 1;
-
-      T result =
-          static_cast<T>((sign << sign_offset) | (exponent << exponent_offset) |
-                         ((mantissa >> shift) << mantissa_offset));
-
-      // A very large mantissa can overflow during rounding. If this happens,
-      // the exponent should be incremented and the mantissa set to 1.0
-      // (encoded as 0). Applying halfbit_adjusted after assembling the float
-      // has the nice side-effect that this case is handled for free.
-      //
-      // This also handles cases where a very large finite value overflows to
-      // infinity, or where a very large subnormal value overflows to become
-      // normal.
-      return result + halfbit_adjusted;
-    } else {
-      VIXL_ASSERT(round_mode == FPRoundOdd);
-      // If any bits at position halfbit or below are set, onebit (ie. the
-      // bottom bit of the resulting mantissa) must be set.
-      uint64_t fractional_bits = mantissa & ((UINT64_C(1) << shift) - 1);
-      if (fractional_bits != 0) {
-        mantissa |= UINT64_C(1) << shift;
-      }
-
-      return static_cast<T>((sign << sign_offset) |
-                            (exponent << exponent_offset) |
-                            ((mantissa >> shift) << mantissa_offset));
-    }
-  } else {
-    // We have to shift the mantissa to the left (or not at all). The input
-    // mantissa is exactly representable in the output mantissa, so apply no
-    // rounding correction.
-    return static_cast<T>((sign << sign_offset) |
-                          (exponent << exponent_offset) |
-                          ((mantissa << -shift) << mantissa_offset));
-  }
-}
-
-
-// See FPRound for a description of this function.
-inline double FPRoundToDouble(int64_t sign,
-                              int64_t exponent,
-                              uint64_t mantissa,
-                              FPRounding round_mode) {
-  uint64_t bits =
-      FPRound<uint64_t, kDoubleExponentBits, kDoubleMantissaBits>(sign,
-                                                                  exponent,
-                                                                  mantissa,
-                                                                  round_mode);
-  return RawbitsToDouble(bits);
-}
-
-
-// See FPRound for a description of this function.
-inline float16 FPRoundToFloat16(int64_t sign,
-                                int64_t exponent,
-                                uint64_t mantissa,
-                                FPRounding round_mode) {
-  return FPRound<float16,
-                 kFloat16ExponentBits,
-                 kFloat16MantissaBits>(sign, exponent, mantissa, round_mode);
-}
-
-
-// See FPRound for a description of this function.
-static inline float FPRoundToFloat(int64_t sign,
-                                   int64_t exponent,
-                                   uint64_t mantissa,
-                                   FPRounding round_mode) {
-  uint32_t bits =
-      FPRound<uint32_t, kFloatExponentBits, kFloatMantissaBits>(sign,
-                                                                exponent,
-                                                                mantissa,
-                                                                round_mode);
-  return RawbitsToFloat(bits);
-}
-
-
-float FPToFloat(float16 value, UseDefaultNaN DN, bool* exception = NULL);
-float FPToFloat(double value,
-                FPRounding round_mode,
-                UseDefaultNaN DN,
-                bool* exception = NULL);
-
-double FPToDouble(float value, UseDefaultNaN DN, bool* exception = NULL);
-
-float16 FPToFloat16(float value,
-                    FPRounding round_mode,
-                    UseDefaultNaN DN,
-                    bool* exception = NULL);
-
-float16 FPToFloat16(double value,
-                    FPRounding round_mode,
-                    UseDefaultNaN DN,
-                    bool* exception = NULL);
-
-
-// Wrapper class for passing FP16 values through the assembler.
-// This is purely to aid with type checking/casting.
-class F16 {
- public:
-  static F16 FromRawbits(uint16_t bits) {
-    F16 f(0.0);
-    f.rawbits_ = bits;
-    return f;
-  }
-  // This is class used to aid in the context of h registers
-  // in the assembler(s). It is only used by half-precision
-  // instructions and utilities, so shouldn't suffer from
-  // any ambiguity. Providing this constructor as implicit
-  // allows for a more transparent solution to the end user.
-  F16(double dvalue) {  // NOLINT(runtime/explicit).
-    rawbits_ = FPToFloat16(dvalue, FPTieEven, kIgnoreDefaultNaN);
-  }
-  uint16_t ToRawbits() { return rawbits_; }
-  operator double() const {
-    return FPToDouble(FPToFloat(rawbits_, kUseDefaultNaN), kUseDefaultNaN);
-  }
-
- private:
-  uint16_t rawbits_;
-};
-
-}  // namespace aarch64
-}  // namespace vixl
-
-#endif  // VIXL_AARCH64_UTILS_AARCH64_H_
diff --git a/src/cpu-features.h b/src/cpu-features.h
index e5c68ab..853421b 100644
--- a/src/cpu-features.h
+++ b/src/cpu-features.h
@@ -61,6 +61,8 @@
   /* Half-precision (FP16) support for FP and NEON, respectively.           */ \
   V(kFPHalf,              "FPHalf",                 "fphp")                    \
   V(kNEONHalf,            "NEONHalf",               "asimdhp")                 \
+  /* The RAS extension, including the ESB instruction.                      */ \
+  V(kRAS,                 "RAS",                    NULL)                      \
   /* Data cache clean to the point of persistence: DC CVAP.                 */ \
   V(kDCPoP,               "DCPoP",                  "dcpop")                   \
   /* Cryptographic support instructions.                                    */ \
diff --git a/src/globals-vixl.h b/src/globals-vixl.h
index 1a71c24..727d494 100644
--- a/src/globals-vixl.h
+++ b/src/globals-vixl.h
@@ -63,9 +63,6 @@
 
 typedef uint8_t byte;
 
-// Type for half-precision (16 bit) floating point numbers.
-typedef uint16_t float16;
-
 const int KBytes = 1024;
 const int MBytes = 1024 * KBytes;
 
diff --git a/src/utils-vixl.cc b/src/utils-vixl.cc
index bfe8bf1..41b5586 100644
--- a/src/utils-vixl.cc
+++ b/src/utils-vixl.cc
@@ -30,12 +30,33 @@
 
 namespace vixl {
 
-uint16_t Float16ToRawbits(float16 value) {
-  uint16_t bits = 0;
-  memcpy(&bits, &value, 2);
-  return value;
+// The default NaN values (for FPCR.DN=1).
+const double kFP64DefaultNaN = RawbitsToDouble(UINT64_C(0x7ff8000000000000));
+const float kFP32DefaultNaN = RawbitsToFloat(0x7fc00000);
+const Float16 kFP16DefaultNaN = RawbitsToFloat16(0x7e00);
+
+// Floating-point zero values.
+const Float16 kFP16PositiveZero = RawbitsToFloat16(0x0);
+const Float16 kFP16NegativeZero = RawbitsToFloat16(0x8000);
+
+// Floating-point infinity values.
+const Float16 kFP16PositiveInfinity = RawbitsToFloat16(0x7c00);
+const Float16 kFP16NegativeInfinity = RawbitsToFloat16(0xfc00);
+const float kFP32PositiveInfinity = RawbitsToFloat(0x7f800000);
+const float kFP32NegativeInfinity = RawbitsToFloat(0xff800000);
+const double kFP64PositiveInfinity =
+    RawbitsToDouble(UINT64_C(0x7ff0000000000000));
+const double kFP64NegativeInfinity =
+    RawbitsToDouble(UINT64_C(0xfff0000000000000));
+
+bool IsZero(Float16 value) {
+  uint16_t bits = Float16ToRawbits(value);
+  return (bits == Float16ToRawbits(kFP16PositiveZero) ||
+          bits == Float16ToRawbits(kFP16NegativeZero));
 }
 
+uint16_t Float16ToRawbits(Float16 value) { return value.rawbits_; }
+
 uint32_t FloatToRawbits(float value) {
   uint32_t bits = 0;
   memcpy(&bits, &value, 4);
@@ -50,10 +71,10 @@
 }
 
 
-float16 RawbitsToFloat16(uint16_t bits) {
-  float16 value = 0x0;
-  memcpy(&value, &bits, 2);
-  return value;
+Float16 RawbitsToFloat16(uint16_t bits) {
+  Float16 f;
+  f.rawbits_ = bits;
+  return f;
 }
 
 
@@ -71,6 +92,23 @@
 }
 
 
+uint32_t Float16Sign(internal::SimFloat16 val) {
+  uint16_t rawbits = Float16ToRawbits(val);
+  return ExtractUnsignedBitfield32(15, 15, rawbits);
+}
+
+
+uint32_t Float16Exp(internal::SimFloat16 val) {
+  uint16_t rawbits = Float16ToRawbits(val);
+  return ExtractUnsignedBitfield32(14, 10, rawbits);
+}
+
+uint32_t Float16Mantissa(internal::SimFloat16 val) {
+  uint16_t rawbits = Float16ToRawbits(val);
+  return ExtractUnsignedBitfield32(9, 0, rawbits);
+}
+
+
 uint32_t FloatSign(float val) {
   uint32_t rawbits = FloatToRawbits(val);
   return ExtractUnsignedBitfield32(31, 31, rawbits);
@@ -107,6 +145,14 @@
 }
 
 
+internal::SimFloat16 Float16Pack(uint16_t sign,
+                                 uint16_t exp,
+                                 uint16_t mantissa) {
+  uint16_t bits = (sign << 15) | (exp << 10) | mantissa;
+  return RawbitsToFloat16(bits);
+}
+
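+// For example, Float16Pack(0, 15, 0) packs the raw encoding 0x3c00, which is
+// the half-precision representation of 1.0.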
+
 float FloatPack(uint32_t sign, uint32_t exp, uint32_t mantissa) {
   uint32_t bits = (sign << 31) | (exp << 23) | mantissa;
   return RawbitsToFloat(bits);
@@ -119,13 +165,14 @@
 }
 
 
-int Float16Classify(float16 value) {
+int Float16Classify(Float16 value) {
+  uint16_t bits = Float16ToRawbits(value);
   uint16_t exponent_max = (1 << 5) - 1;
   uint16_t exponent_mask = exponent_max << 10;
   uint16_t mantissa_mask = (1 << 10) - 1;
 
-  uint16_t exponent = (value & exponent_mask) >> 10;
-  uint16_t mantissa = value & mantissa_mask;
+  uint16_t exponent = (bits & exponent_mask) >> 10;
+  uint16_t mantissa = bits & mantissa_mask;
   if (exponent == 0) {
     if (mantissa == 0) {
       return FP_ZERO;
@@ -156,10 +203,353 @@
 
 int BitCount(uint64_t value) { return CountSetBits(value); }
 
+// Float16 definitions.
+
+Float16::Float16(double dvalue) {
+  rawbits_ =
+      Float16ToRawbits(FPToFloat16(dvalue, FPTieEven, kIgnoreDefaultNaN));
+}
+
 namespace internal {
 
+// SimFloat16 definitions.
+SimFloat16 SimFloat16::operator-() const {
+  return RawbitsToFloat16(rawbits_ ^ 0x8000);
+}
+
+SimFloat16 SimFloat16::operator+(SimFloat16 rhs) const {
+  return static_cast<double>(*this) + static_cast<double>(rhs);
+}
+
+SimFloat16 SimFloat16::operator-(SimFloat16 rhs) const {
+  return static_cast<double>(*this) - static_cast<double>(rhs);
+}
+
+SimFloat16 SimFloat16::operator*(SimFloat16 rhs) const {
+  return static_cast<double>(*this) * static_cast<double>(rhs);
+}
+
+SimFloat16 SimFloat16::operator/(SimFloat16 rhs) const {
+  return static_cast<double>(*this) / static_cast<double>(rhs);
+}
+
+bool SimFloat16::operator<(SimFloat16 rhs) const {
+  return static_cast<double>(*this) < static_cast<double>(rhs);
+}
+
+bool SimFloat16::operator>(SimFloat16 rhs) const {
+  return static_cast<double>(*this) > static_cast<double>(rhs);
+}
+
+bool SimFloat16::operator==(SimFloat16 rhs) const {
+  if (IsNaN(*this) || IsNaN(rhs)) {
+    return false;
+  } else if (IsZero(rhs) && IsZero(*this)) {
+    // +0 and -0 should be treated as equal.
+    return true;
+  }
+  return this->rawbits_ == rhs.rawbits_;
+}
+
+bool SimFloat16::operator!=(SimFloat16 rhs) const { return !(*this == rhs); }
+
+bool SimFloat16::operator==(double rhs) const {
+  return static_cast<double>(*this) == static_cast<double>(rhs);
+}
+
+SimFloat16::operator double() const {
+  return FPToDouble(*this, kIgnoreDefaultNaN);
+}
+
 Int64 BitCount(Uint32 value) { return CountSetBits(value.Get()); }
 
 }  // namespace internal
 
+float FPToFloat(Float16 value, UseDefaultNaN DN, bool* exception) {
+  uint16_t bits = Float16ToRawbits(value);
+  uint32_t sign = bits >> 15;
+  uint32_t exponent =
+      ExtractUnsignedBitfield32(kFloat16MantissaBits + kFloat16ExponentBits - 1,
+                                kFloat16MantissaBits,
+                                bits);
+  uint32_t mantissa =
+      ExtractUnsignedBitfield32(kFloat16MantissaBits - 1, 0, bits);
+
+  switch (Float16Classify(value)) {
+    case FP_ZERO:
+      return (sign == 0) ? 0.0f : -0.0f;
+
+    case FP_INFINITE:
+      return (sign == 0) ? kFP32PositiveInfinity : kFP32NegativeInfinity;
+
+    case FP_SUBNORMAL: {
+      // Calculate shift required to put mantissa into the most-significant bits
+      // of the destination mantissa.
+      int shift = CountLeadingZeros(mantissa << (32 - 10));
+
+      // Shift mantissa and discard implicit '1'.
+      mantissa <<= (kFloatMantissaBits - kFloat16MantissaBits) + shift + 1;
+      mantissa &= (1 << kFloatMantissaBits) - 1;
+
+      // Adjust the exponent for the shift applied, and rebias.
+      exponent = exponent - shift + (-15 + 127);
+      break;
+    }
+
+    case FP_NAN:
+      if (IsSignallingNaN(value)) {
+        if (exception != NULL) {
+          *exception = true;
+        }
+      }
+      if (DN == kUseDefaultNaN) return kFP32DefaultNaN;
+
+      // Convert NaNs as the processor would:
+      //  - The sign is propagated.
+      //  - The payload (mantissa) is transferred entirely, except that the top
+      //    bit is forced to '1', making the result a quiet NaN. The unused
+      //    (low-order) payload bits are set to 0.
+      exponent = (1 << kFloatExponentBits) - 1;
+
+      // Increase bits in mantissa, making low-order bits 0.
+      mantissa <<= (kFloatMantissaBits - kFloat16MantissaBits);
+      mantissa |= 1 << 22;  // Force a quiet NaN.
+      break;
+
+    case FP_NORMAL:
+      // Increase bits in mantissa, making low-order bits 0.
+      mantissa <<= (kFloatMantissaBits - kFloat16MantissaBits);
+
+      // Change exponent bias.
+      exponent += (-15 + 127);
+      break;
+
+    default:
+      VIXL_UNREACHABLE();
+  }
+  return RawbitsToFloat((sign << 31) | (exponent << kFloatMantissaBits) |
+                        mantissa);
+}
+
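+// For example, the smallest subnormal half-precision input (raw encoding
+// 0x0001, i.e. 2^-24) is renormalized by the FP_SUBNORMAL case above and maps
+// to the normal single-precision value 2^-24 (raw encoding 0x33800000).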
+
+float FPToFloat(double value,
+                FPRounding round_mode,
+                UseDefaultNaN DN,
+                bool* exception) {
+  // Only the FPTieEven and FPRoundOdd rounding modes are implemented.
+  VIXL_ASSERT((round_mode == FPTieEven) || (round_mode == FPRoundOdd));
+  USE(round_mode);
+
+  switch (std::fpclassify(value)) {
+    case FP_NAN: {
+      if (IsSignallingNaN(value)) {
+        if (exception != NULL) {
+          *exception = true;
+        }
+      }
+      if (DN == kUseDefaultNaN) return kFP32DefaultNaN;
+
+      // Convert NaNs as the processor would:
+      //  - The sign is propagated.
+      //  - The payload (mantissa) is transferred as much as possible, except
+      //    that the top bit is forced to '1', making the result a quiet NaN.
+      uint64_t raw = DoubleToRawbits(value);
+
+      uint32_t sign = raw >> 63;
+      uint32_t exponent = (1 << 8) - 1;
+      uint32_t payload =
+          static_cast<uint32_t>(ExtractUnsignedBitfield64(50, 52 - 23, raw));
+      payload |= (1 << 22);  // Force a quiet NaN.
+
+      return RawbitsToFloat((sign << 31) | (exponent << 23) | payload);
+    }
+
+    case FP_ZERO:
+    case FP_INFINITE: {
+      // In a C++ cast, any value representable in the target type will be
+      // unchanged. This is always the case for +/-0.0 and infinities.
+      return static_cast<float>(value);
+    }
+
+    case FP_NORMAL:
+    case FP_SUBNORMAL: {
+      // Convert double-to-float as the processor would, assuming that FPCR.FZ
+      // (flush-to-zero) is not set.
+      uint64_t raw = DoubleToRawbits(value);
+      // Extract the IEEE-754 double components.
+      uint32_t sign = raw >> 63;
+      // Extract the exponent and remove the IEEE-754 encoding bias.
+      int32_t exponent =
+          static_cast<int32_t>(ExtractUnsignedBitfield64(62, 52, raw)) - 1023;
+      // Extract the mantissa and add the implicit '1' bit.
+      uint64_t mantissa = ExtractUnsignedBitfield64(51, 0, raw);
+      if (std::fpclassify(value) == FP_NORMAL) {
+        mantissa |= (UINT64_C(1) << 52);
+      }
+      return FPRoundToFloat(sign, exponent, mantissa, round_mode);
+    }
+  }
+
+  VIXL_UNREACHABLE();
+  return value;
+}
+
+// TODO: We should consider implementing a full FPToDouble(Float16)
+// conversion function (for performance reasons).
+double FPToDouble(Float16 value, UseDefaultNaN DN, bool* exception) {
+  // We can rely on implicit float to double conversion here.
+  return FPToFloat(value, DN, exception);
+}
+
+
+double FPToDouble(float value, UseDefaultNaN DN, bool* exception) {
+  switch (std::fpclassify(value)) {
+    case FP_NAN: {
+      if (IsSignallingNaN(value)) {
+        if (exception != NULL) {
+          *exception = true;
+        }
+      }
+      if (DN == kUseDefaultNaN) return kFP64DefaultNaN;
+
+      // Convert NaNs as the processor would:
+      //  - The sign is propagated.
+      //  - The payload (mantissa) is transferred entirely, except that the top
+      //    bit is forced to '1', making the result a quiet NaN. The unused
+      //    (low-order) payload bits are set to 0.
+      uint32_t raw = FloatToRawbits(value);
+
+      uint64_t sign = raw >> 31;
+      uint64_t exponent = (1 << 11) - 1;
+      uint64_t payload = ExtractUnsignedBitfield64(21, 0, raw);
+      payload <<= (52 - 23);           // The unused low-order bits should be 0.
+      payload |= (UINT64_C(1) << 51);  // Force a quiet NaN.
+
+      return RawbitsToDouble((sign << 63) | (exponent << 52) | payload);
+    }
+
+    case FP_ZERO:
+    case FP_NORMAL:
+    case FP_SUBNORMAL:
+    case FP_INFINITE: {
+      // All other inputs are preserved in a standard cast, because every value
+      // representable using an IEEE-754 float is also representable using an
+      // IEEE-754 double.
+      return static_cast<double>(value);
+    }
+  }
+
+  VIXL_UNREACHABLE();
+  return static_cast<double>(value);
+}
+
+
+Float16 FPToFloat16(float value,
+                    FPRounding round_mode,
+                    UseDefaultNaN DN,
+                    bool* exception) {
+  // Only the FPTieEven rounding mode is implemented.
+  VIXL_ASSERT(round_mode == FPTieEven);
+  USE(round_mode);
+
+  uint32_t raw = FloatToRawbits(value);
+  int32_t sign = raw >> 31;
+  int32_t exponent = ExtractUnsignedBitfield32(30, 23, raw) - 127;
+  uint32_t mantissa = ExtractUnsignedBitfield32(22, 0, raw);
+
+  switch (std::fpclassify(value)) {
+    case FP_NAN: {
+      if (IsSignallingNaN(value)) {
+        if (exception != NULL) {
+          *exception = true;
+        }
+      }
+      if (DN == kUseDefaultNaN) return kFP16DefaultNaN;
+
+      // Convert NaNs as the processor would:
+      //  - The sign is propagated.
+      //  - The payload (mantissa) is transferred as much as possible, except
+      //    that the top bit is forced to '1', making the result a quiet NaN.
+      uint16_t result = (sign == 0) ? Float16ToRawbits(kFP16PositiveInfinity)
+                                    : Float16ToRawbits(kFP16NegativeInfinity);
+      result |= mantissa >> (kFloatMantissaBits - kFloat16MantissaBits);
+      result |= (1 << 9);  // Force a quiet NaN.
+      return RawbitsToFloat16(result);
+    }
+
+    case FP_ZERO:
+      return (sign == 0) ? kFP16PositiveZero : kFP16NegativeZero;
+
+    case FP_INFINITE:
+      return (sign == 0) ? kFP16PositiveInfinity : kFP16NegativeInfinity;
+
+    case FP_NORMAL:
+    case FP_SUBNORMAL: {
+      // Convert float-to-half as the processor would, assuming that FPCR.FZ
+      // (flush-to-zero) is not set.
+
+      // Add the implicit '1' bit to the mantissa.
+      mantissa += (1 << 23);
+      return FPRoundToFloat16(sign, exponent, mantissa, round_mode);
+    }
+  }
+
+  VIXL_UNREACHABLE();
+  return kFP16PositiveZero;
+}
+
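+// For example, FPToFloat16(65536.0f, FPTieEven, kIgnoreDefaultNaN) is out of
+// range for half precision (whose largest finite value is 65504), so it
+// returns kFP16PositiveInfinity.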
+
+Float16 FPToFloat16(double value,
+                    FPRounding round_mode,
+                    UseDefaultNaN DN,
+                    bool* exception) {
+  // Only the FPTieEven rounding mode is implemented.
+  VIXL_ASSERT(round_mode == FPTieEven);
+  USE(round_mode);
+
+  uint64_t raw = DoubleToRawbits(value);
+  int32_t sign = raw >> 63;
+  int64_t exponent = ExtractUnsignedBitfield64(62, 52, raw) - 1023;
+  uint64_t mantissa = ExtractUnsignedBitfield64(51, 0, raw);
+
+  switch (std::fpclassify(value)) {
+    case FP_NAN: {
+      if (IsSignallingNaN(value)) {
+        if (exception != NULL) {
+          *exception = true;
+        }
+      }
+      if (DN == kUseDefaultNaN) return kFP16DefaultNaN;
+
+      // Convert NaNs as the processor would:
+      //  - The sign is propagated.
+      //  - The payload (mantissa) is transferred as much as possible, except
+      //    that the top bit is forced to '1', making the result a quiet NaN.
+      uint16_t result = (sign == 0) ? Float16ToRawbits(kFP16PositiveInfinity)
+                                    : Float16ToRawbits(kFP16NegativeInfinity);
+      result |= mantissa >> (kDoubleMantissaBits - kFloat16MantissaBits);
+      result |= (1 << 9);  // Force a quiet NaN.
+      return RawbitsToFloat16(result);
+    }
+
+    case FP_ZERO:
+      return (sign == 0) ? kFP16PositiveZero : kFP16NegativeZero;
+
+    case FP_INFINITE:
+      return (sign == 0) ? kFP16PositiveInfinity : kFP16NegativeInfinity;
+
+    case FP_NORMAL:
+    case FP_SUBNORMAL: {
+      // Convert double-to-half as the processor would, assuming that FPCR.FZ
+      // (flush-to-zero) is not set.
+
+      // Add the implicit '1' bit to the mantissa.
+      mantissa += (UINT64_C(1) << 52);
+      return FPRoundToFloat16(sign, exponent, mantissa, round_mode);
+    }
+  }
+
+  VIXL_UNREACHABLE();
+  return kFP16PositiveZero;
+}
+
 }  // namespace vixl
diff --git a/src/utils-vixl.h b/src/utils-vixl.h
index c4ba800..1c76fcb 100644
--- a/src/utils-vixl.h
+++ b/src/utils-vixl.h
@@ -29,6 +29,7 @@
 
 #include <cmath>
 #include <cstring>
+#include <limits>
 #include <vector>
 
 #include "compiler-intrinsics-vixl.h"
@@ -65,6 +66,11 @@
 #define VIXL_UNREACHABLE_OR_FALLTHROUGH() VIXL_FALLTHROUGH()
 #endif
 
+template <typename T, size_t n>
+size_t ArrayLength(const T (&)[n]) {
+  return n;
+}
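+// For example, given "int buf[8]", ArrayLength(buf) returns 8.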
+
 // Check number width.
 // TODO: Refactor these using templates.
 inline bool IsIntN(unsigned n, uint32_t x) {
@@ -222,8 +228,21 @@
 }
 
 
+// Wrapper class for passing FP16 values through the assembler.
+// This is purely to aid with type checking/casting.
+class Float16 {
+ public:
+  explicit Float16(double dvalue);
+  Float16() : rawbits_(0x0) {}
+  friend uint16_t Float16ToRawbits(Float16 value);
+  friend Float16 RawbitsToFloat16(uint16_t bits);
+
+ protected:
+  uint16_t rawbits_;
+};
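+// For example, Float16(1.5) stores the raw half-precision encoding 0x3e00
+// (sign 0, biased exponent 15, mantissa 0x200), which Float16ToRawbits() and
+// RawbitsToFloat16() expose for bit-level manipulation.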
+
 // Floating point representation.
-uint16_t Float16ToRawbits(float16 value);
+uint16_t Float16ToRawbits(Float16 value);
 
 
 uint32_t FloatToRawbits(float value);
@@ -238,7 +257,7 @@
   return DoubleToRawbits(value);
 }
 
-float16 RawbitsToFloat16(uint16_t bits);
+Float16 RawbitsToFloat16(uint16_t bits);
 
 float RawbitsToFloat(uint32_t bits);
 VIXL_DEPRECATED("RawbitsToFloat",
@@ -252,6 +271,41 @@
   return RawbitsToDouble(bits);
 }
 
+namespace internal {
+
+// Internal simulation class used solely by the simulator to
+// provide an abstraction layer for any half-precision arithmetic.
+class SimFloat16 : public Float16 {
+ public:
+  // TODO: We should investigate making this constructor explicit.
+  // This is currently difficult to do due to a number of templated
+  // functions in the simulator which rely on returning double values.
+  SimFloat16(double dvalue) : Float16(dvalue) {}  // NOLINT(runtime/explicit)
+  SimFloat16(Float16 f) {                         // NOLINT(runtime/explicit)
+    this->rawbits_ = Float16ToRawbits(f);
+  }
+  SimFloat16() : Float16() {}
+  SimFloat16 operator-() const;
+  SimFloat16 operator+(SimFloat16 rhs) const;
+  SimFloat16 operator-(SimFloat16 rhs) const;
+  SimFloat16 operator*(SimFloat16 rhs) const;
+  SimFloat16 operator/(SimFloat16 rhs) const;
+  bool operator<(SimFloat16 rhs) const;
+  bool operator>(SimFloat16 rhs) const;
+  bool operator==(SimFloat16 rhs) const;
+  bool operator!=(SimFloat16 rhs) const;
+  // This is necessary for conversions performed in (macro asm) Fmov.
+  bool operator==(double rhs) const;
+  operator double() const;
+};
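+// For example, with the operators above:
+//   SimFloat16 a(1.0), b(2.0);
+//   double sum = a + b;  // Widened to double for the addition; the result is
+//                        // rounded back to half precision, so sum is 3.0.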
+}  // namespace internal
+
+uint32_t Float16Sign(internal::SimFloat16 value);
+
+uint32_t Float16Exp(internal::SimFloat16 value);
+
+uint32_t Float16Mantissa(internal::SimFloat16 value);
+
 uint32_t FloatSign(float value);
 VIXL_DEPRECATED("FloatSign", inline uint32_t float_sign(float value)) {
   return FloatSign(value);
@@ -283,6 +337,10 @@
   return DoubleMantissa(value);
 }
 
+internal::SimFloat16 Float16Pack(uint16_t sign,
+                                 uint16_t exp,
+                                 uint16_t mantissa);
+
 float FloatPack(uint32_t sign, uint32_t exp, uint32_t mantissa);
 VIXL_DEPRECATED("FloatPack",
                 inline float float_pack(uint32_t sign,
@@ -300,21 +358,33 @@
 }
 
 // An fpclassify() function for 16-bit half-precision floats.
-int Float16Classify(float16 value);
-VIXL_DEPRECATED("Float16Classify", inline int float16classify(float16 value)) {
-  return Float16Classify(value);
+int Float16Classify(Float16 value);
+VIXL_DEPRECATED("Float16Classify", inline int float16classify(uint16_t value)) {
+  return Float16Classify(RawbitsToFloat16(value));
 }
 
+bool IsZero(Float16 value);
 
-// Check for float16 (uint16_t) NaNs.
-inline bool IsNaN(float16 value) { return Float16Classify(value) == FP_NAN; }
+inline bool IsNaN(float value) { return std::isnan(value); }
+
+inline bool IsNaN(double value) { return std::isnan(value); }
+
+inline bool IsNaN(Float16 value) { return Float16Classify(value) == FP_NAN; }
+
+inline bool IsInf(float value) { return std::isinf(value); }
+
+inline bool IsInf(double value) { return std::isinf(value); }
+
+inline bool IsInf(Float16 value) {
+  return Float16Classify(value) == FP_INFINITE;
+}
 
 
 // NaN tests.
 inline bool IsSignallingNaN(double num) {
   const uint64_t kFP64QuietNaNMask = UINT64_C(0x0008000000000000);
   uint64_t raw = DoubleToRawbits(num);
-  if (std::isnan(num) && ((raw & kFP64QuietNaNMask) == 0)) {
+  if (IsNaN(num) && ((raw & kFP64QuietNaNMask) == 0)) {
     return true;
   }
   return false;
@@ -324,40 +394,48 @@
 inline bool IsSignallingNaN(float num) {
   const uint32_t kFP32QuietNaNMask = 0x00400000;
   uint32_t raw = FloatToRawbits(num);
-  if (std::isnan(num) && ((raw & kFP32QuietNaNMask) == 0)) {
+  if (IsNaN(num) && ((raw & kFP32QuietNaNMask) == 0)) {
     return true;
   }
   return false;
 }
 
 
-inline bool IsSignallingNaN(float16 num) {
+inline bool IsSignallingNaN(Float16 num) {
   const uint16_t kFP16QuietNaNMask = 0x0200;
-  return IsNaN(num) && ((num & kFP16QuietNaNMask) == 0);
+  return IsNaN(num) && ((Float16ToRawbits(num) & kFP16QuietNaNMask) == 0);
 }
 
 
 template <typename T>
 inline bool IsQuietNaN(T num) {
-  return std::isnan(num) && !IsSignallingNaN(num);
+  return IsNaN(num) && !IsSignallingNaN(num);
 }
 
 
 // Convert the NaN in 'num' to a quiet NaN.
 inline double ToQuietNaN(double num) {
   const uint64_t kFP64QuietNaNMask = UINT64_C(0x0008000000000000);
-  VIXL_ASSERT(std::isnan(num));
+  VIXL_ASSERT(IsNaN(num));
   return RawbitsToDouble(DoubleToRawbits(num) | kFP64QuietNaNMask);
 }
 
 
 inline float ToQuietNaN(float num) {
   const uint32_t kFP32QuietNaNMask = 0x00400000;
-  VIXL_ASSERT(std::isnan(num));
+  VIXL_ASSERT(IsNaN(num));
   return RawbitsToFloat(FloatToRawbits(num) | kFP32QuietNaNMask);
 }
 
 
+inline internal::SimFloat16 ToQuietNaN(internal::SimFloat16 num) {
+  const uint16_t kFP16QuietNaNMask = 0x0200;
+  VIXL_ASSERT(IsNaN(num));
+  return internal::SimFloat16(
+      RawbitsToFloat16(Float16ToRawbits(num) | kFP16QuietNaNMask));
+}
+
+
 // Fused multiply-add.
 inline double FusedMultiplyAdd(double op1, double op2, double a) {
   return fma(op1, op2, a);
@@ -898,6 +976,306 @@
 
 }  // namespace internal
 
+// The default NaN values (for FPCR.DN=1).
+extern const double kFP64DefaultNaN;
+extern const float kFP32DefaultNaN;
+extern const Float16 kFP16DefaultNaN;
+
+// Floating-point infinity values.
+extern const Float16 kFP16PositiveInfinity;
+extern const Float16 kFP16NegativeInfinity;
+extern const float kFP32PositiveInfinity;
+extern const float kFP32NegativeInfinity;
+extern const double kFP64PositiveInfinity;
+extern const double kFP64NegativeInfinity;
+
+// Floating-point zero values.
+extern const Float16 kFP16PositiveZero;
+extern const Float16 kFP16NegativeZero;
+
+// AArch64 floating-point specifics. These match IEEE-754.
+const unsigned kDoubleMantissaBits = 52;
+const unsigned kDoubleExponentBits = 11;
+const unsigned kFloatMantissaBits = 23;
+const unsigned kFloatExponentBits = 8;
+const unsigned kFloat16MantissaBits = 10;
+const unsigned kFloat16ExponentBits = 5;
+
+enum FPRounding {
+  // The first four values are encodable directly by FPCR<RMode>.
+  FPTieEven = 0x0,
+  FPPositiveInfinity = 0x1,
+  FPNegativeInfinity = 0x2,
+  FPZero = 0x3,
+
+  // The final rounding modes are only available when explicitly specified by
+  // the instruction (such as with fcvta). They cannot be set in FPCR.
+  FPTieAway,
+  FPRoundOdd
+};
+
+enum UseDefaultNaN { kUseDefaultNaN, kIgnoreDefaultNaN };
+
+// Assemble the specified IEEE-754 components into the target type and apply
+// appropriate rounding.
+//  sign:     0 = positive, 1 = negative
+//  exponent: Unbiased IEEE-754 exponent.
+//  mantissa: The mantissa of the input. The top bit (which is not encoded for
+//            normal IEEE-754 values) must not be omitted. This bit has the
+//            value 'pow(2, exponent)'.
+//
+// The input value is assumed to be a normalized value. That is, the input may
+// not be infinity or NaN. If the source value is subnormal, it must be
+// normalized before calling this function such that the highest set bit in the
+// mantissa has the value 'pow(2, exponent)'.
+//
+// Callers should use FPRoundToFloat16, FPRoundToFloat or FPRoundToDouble
+// directly, rather than calling a templated FPRound.
+template <class T, int ebits, int mbits>
+T FPRound(int64_t sign,
+          int64_t exponent,
+          uint64_t mantissa,
+          FPRounding round_mode) {
+  VIXL_ASSERT((sign == 0) || (sign == 1));
+
+  // Only FPTieEven and FPRoundOdd rounding modes are implemented.
+  VIXL_ASSERT((round_mode == FPTieEven) || (round_mode == FPRoundOdd));
+
+  // Rounding can promote subnormals to normals, and normals to infinities. For
+  // example, a double with an unbiased exponent of 127 (the largest normal
+  // float exponent) would appear to be encodable as a float, but rounding
+  // based on the low-order mantissa bits could make it overflow. With
+  // ties-to-even rounding, this value would become an infinity.
+
+  // ---- Rounding Method ----
+  //
+  // The exponent is irrelevant in the rounding operation, so we treat the
+  // lowest-order bit that will fit into the result ('onebit') as having
+  // the value '1'. Similarly, the highest-order bit that won't fit into
+  // the result ('halfbit') has the value '0.5'. The 'point' sits between
+  // 'onebit' and 'halfbit':
+  //
+  //            These bits fit into the result.
+  //               |---------------------|
+  //  mantissa = 0bxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
+  //                                     ||
+  //                                    / |
+  //                                   /  halfbit
+  //                               onebit
+  //
+  // For subnormal outputs, the range of representable bits is smaller and
+  // the position of onebit and halfbit depends on the exponent of the
+  // input, but the method is otherwise similar.
+  //
+  //   onebit(frac)
+  //     |
+  //     | halfbit(frac)          halfbit(adjusted)
+  //     | /                      /
+  //     | |                      |
+  //  0b00.0 (exact)      -> 0b00.0 (exact)                    -> 0b00
+  //  0b00.0...           -> 0b00.0...                         -> 0b00
+  //  0b00.1 (exact)      -> 0b00.0111..111                    -> 0b00
+  //  0b00.1...           -> 0b00.1...                         -> 0b01
+  //  0b01.0 (exact)      -> 0b01.0 (exact)                    -> 0b01
+  //  0b01.0...           -> 0b01.0...                         -> 0b01
+  //  0b01.1 (exact)      -> 0b01.1 (exact)                    -> 0b10
+  //  0b01.1...           -> 0b01.1...                         -> 0b10
+  //  0b10.0 (exact)      -> 0b10.0 (exact)                    -> 0b10
+  //  0b10.0...           -> 0b10.0...                         -> 0b10
+  //  0b10.1 (exact)      -> 0b10.0111..111                    -> 0b10
+  //  0b10.1...           -> 0b10.1...                         -> 0b11
+  //  0b11.0 (exact)      -> 0b11.0 (exact)                    -> 0b11
+  //  ...                   /             |                      /   |
+  //                       /              |                     /    |
+  //                                                           /     |
+  // adjusted = frac - (halfbit(mantissa) & ~onebit(frac));   /      |
+  //
+  //                   mantissa = (mantissa >> shift) + halfbit(adjusted);
+
+  static const int mantissa_offset = 0;
+  static const int exponent_offset = mantissa_offset + mbits;
+  static const int sign_offset = exponent_offset + ebits;
+  VIXL_ASSERT(sign_offset == (sizeof(T) * 8 - 1));
+
+  // Bail out early for zero inputs.
+  if (mantissa == 0) {
+    return static_cast<T>(sign << sign_offset);
+  }
+
+  // If all bits in the exponent are set, the value is infinite or NaN.
+  // This is true for all binary IEEE-754 formats.
+  static const int infinite_exponent = (1 << ebits) - 1;
+  static const int max_normal_exponent = infinite_exponent - 1;
+
+  // Apply the exponent bias to encode it for the result. Doing this early makes
+  // it easy to detect values that will be infinite or subnormal.
+  exponent += max_normal_exponent >> 1;
+
+  if (exponent > max_normal_exponent) {
+    // Overflow: the input is too large for the result type to represent.
+    if (round_mode == FPTieEven) {
+      // FPTieEven rounding mode handles overflows using infinities.
+      exponent = infinite_exponent;
+      mantissa = 0;
+    } else {
+      VIXL_ASSERT(round_mode == FPRoundOdd);
+      // FPRoundOdd rounding mode handles overflows using the largest magnitude
+      // normal number.
+      exponent = max_normal_exponent;
+      mantissa = (UINT64_C(1) << exponent_offset) - 1;
+    }
+    return static_cast<T>((sign << sign_offset) |
+                          (exponent << exponent_offset) |
+                          (mantissa << mantissa_offset));
+  }
+
+  // Calculate the shift required to move the top mantissa bit to the proper
+  // place in the destination type.
+  const int highest_significant_bit = 63 - CountLeadingZeros(mantissa);
+  int shift = highest_significant_bit - mbits;
+
+  if (exponent <= 0) {
+    // The output will be subnormal (before rounding).
+    // For subnormal outputs, the shift must be adjusted by the exponent. The +1
+    // is necessary because the exponent of a subnormal value (encoded as 0) is
+    // the same as the exponent of the smallest normal value (encoded as 1).
+    shift += -exponent + 1;
+
+    // Handle inputs that would produce a zero output.
+    //
+    // Shifts higher than highest_significant_bit+1 will always produce a zero
+    // result. A shift of exactly highest_significant_bit+1 might produce a
+    // non-zero result after rounding.
+    if (shift > (highest_significant_bit + 1)) {
+      if (round_mode == FPTieEven) {
+        // The result will always be +/-0.0.
+        return static_cast<T>(sign << sign_offset);
+      } else {
+        VIXL_ASSERT(round_mode == FPRoundOdd);
+        VIXL_ASSERT(mantissa != 0);
+        // For FPRoundOdd, if the mantissa is too small to represent and
+        // non-zero return the next "odd" value.
+        return static_cast<T>((sign << sign_offset) | 1);
+      }
+    }
+
+    // Properly encode the exponent for a subnormal output.
+    exponent = 0;
+  } else {
+    // Clear the topmost mantissa bit, since this is not encoded in IEEE-754
+    // normal values.
+    mantissa &= ~(UINT64_C(1) << highest_significant_bit);
+  }
+
+  // The casts below are only well-defined for unsigned integers.
+  VIXL_STATIC_ASSERT(std::numeric_limits<T>::is_integer);
+  VIXL_STATIC_ASSERT(!std::numeric_limits<T>::is_signed);
+
+  if (shift > 0) {
+    if (round_mode == FPTieEven) {
+      // We have to shift the mantissa to the right. Some precision is lost, so
+      // we need to apply rounding.
+      uint64_t onebit_mantissa = (mantissa >> (shift)) & 1;
+      uint64_t halfbit_mantissa = (mantissa >> (shift - 1)) & 1;
+      uint64_t adjustment = (halfbit_mantissa & ~onebit_mantissa);
+      uint64_t adjusted = mantissa - adjustment;
+      T halfbit_adjusted = (adjusted >> (shift - 1)) & 1;
+
+      T result =
+          static_cast<T>((sign << sign_offset) | (exponent << exponent_offset) |
+                         ((mantissa >> shift) << mantissa_offset));
+
+      // A very large mantissa can overflow during rounding. If this happens,
+      // the exponent should be incremented and the mantissa set to 1.0
+      // (encoded as 0). Applying halfbit_adjusted after assembling the float
+      // has the nice side-effect that this case is handled for free.
+      //
+      // This also handles cases where a very large finite value overflows to
+      // infinity, or where a very large subnormal value overflows to become
+      // normal.
+      return result + halfbit_adjusted;
+    } else {
+      VIXL_ASSERT(round_mode == FPRoundOdd);
+      // If any bits at position halfbit or below are set, onebit (ie. the
+      // bottom bit of the resulting mantissa) must be set.
+      uint64_t fractional_bits = mantissa & ((UINT64_C(1) << shift) - 1);
+      if (fractional_bits != 0) {
+        mantissa |= UINT64_C(1) << shift;
+      }
+
+      return static_cast<T>((sign << sign_offset) |
+                            (exponent << exponent_offset) |
+                            ((mantissa >> shift) << mantissa_offset));
+    }
+  } else {
+    // We have to shift the mantissa to the left (or not at all). The input
+    // mantissa is exactly representable in the output mantissa, so apply no
+    // rounding correction.
+    return static_cast<T>((sign << sign_offset) |
+                          (exponent << exponent_offset) |
+                          ((mantissa << -shift) << mantissa_offset));
+  }
+}
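+
+// For example, FPRoundToFloat(0, 0, UINT64_C(1) << 40, FPTieEven) packs a
+// positive value with unbiased exponent 0 whose top mantissa bit (bit 40)
+// represents 2^0, producing the raw encoding 0x3f800000, i.e. 1.0f.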
+
+
+// See FPRound for a description of this function.
+inline double FPRoundToDouble(int64_t sign,
+                              int64_t exponent,
+                              uint64_t mantissa,
+                              FPRounding round_mode) {
+  uint64_t bits =
+      FPRound<uint64_t, kDoubleExponentBits, kDoubleMantissaBits>(sign,
+                                                                  exponent,
+                                                                  mantissa,
+                                                                  round_mode);
+  return RawbitsToDouble(bits);
+}
+
+
+// See FPRound for a description of this function.
+inline Float16 FPRoundToFloat16(int64_t sign,
+                                int64_t exponent,
+                                uint64_t mantissa,
+                                FPRounding round_mode) {
+  return RawbitsToFloat16(
+      FPRound<uint16_t,
+              kFloat16ExponentBits,
+              kFloat16MantissaBits>(sign, exponent, mantissa, round_mode));
+}
+
+
+// See FPRound for a description of this function.
+static inline float FPRoundToFloat(int64_t sign,
+                                   int64_t exponent,
+                                   uint64_t mantissa,
+                                   FPRounding round_mode) {
+  uint32_t bits =
+      FPRound<uint32_t, kFloatExponentBits, kFloatMantissaBits>(sign,
+                                                                exponent,
+                                                                mantissa,
+                                                                round_mode);
+  return RawbitsToFloat(bits);
+}
+
+
+float FPToFloat(Float16 value, UseDefaultNaN DN, bool* exception = NULL);
+float FPToFloat(double value,
+                FPRounding round_mode,
+                UseDefaultNaN DN,
+                bool* exception = NULL);
+
+double FPToDouble(Float16 value, UseDefaultNaN DN, bool* exception = NULL);
+double FPToDouble(float value, UseDefaultNaN DN, bool* exception = NULL);
+
+Float16 FPToFloat16(float value,
+                    FPRounding round_mode,
+                    UseDefaultNaN DN,
+                    bool* exception = NULL);
+
+Float16 FPToFloat16(double value,
+                    FPRounding round_mode,
+                    UseDefaultNaN DN,
+                    bool* exception = NULL);
 }  // namespace vixl
 
 #endif  // VIXL_UTILS_H