VIXL Release 1.9

Refer to the README.md and LICENCE files for details.
diff --git a/README.md b/README.md
index a2f6e4f..6a57632 100644
--- a/README.md
+++ b/README.md
@@ -1,44 +1,24 @@
-VIXL: AArch64 Runtime Code Generation Library Version 1.8
+VIXL: AArch64 Runtime Code Generation Library Version 1.9
 =========================================================
 
 Contents:
 
- * Requirements
  * Overview
+ * Requirements
  * Known limitations
  * Usage
 
 
-Requirements
-============
-
-To build VIXL the following software is required:
-
- 1. Python 2.7
- 2. SCons 2.0
- 3. GCC 4.6+
-
-A 64-bit host machine is required, implementing an LP64 data model. VIXL has
-only been tested using GCC on AArch64 Debian and amd64 Ubuntu systems.
-
-To run the linter stage of the tests, the following software is also required:
-
- 1. Git
- 2. [Google's `cpplint.py`][cpplint]
-
-Refer to the 'Usage' section for details.
-
-
 Overview
 ========
 
-VIXL is made of three components.
+VIXL contains three components.
 
- 1. A programmatic assembler to generate A64 code at runtime. The assembler
+ 1. A programmatic **assembler** to generate A64 code at runtime. The assembler
     abstracts some of the constraints of the A64 ISA; for example, most
     instructions support any immediate.
- 2. A disassembler which can print any instruction emitted by the assembler.
- 3. A simulator which can simulate any instruction emitted by the assembler.
+ 2. A **disassembler** that can print any instruction emitted by the assembler.
+ 3. A **simulator** that can simulate any instruction emitted by the assembler.
     The simulator allows generated code to be run on another architecture
     without the need for a full ISA model.
 
@@ -48,11 +28,32 @@
 [Changelog](doc/changelog.md).
 
 
+Requirements
+============
+
+To build VIXL the following software is required:
+
+ 1. Python 2.7
+ 2. SCons 2.0
+ 3. GCC 4.8+ or Clang 3.4+
+
+A 64-bit host machine is required, implementing an LP64 data model. VIXL has
+been tested using GCC on AArch64 Debian, and using GCC and Clang on amd64
+Ubuntu systems.
+
+To run the linter stage of the tests, the following software is also required:
+
+ 1. Git
+ 2. [Google's `cpplint.py`][cpplint]
+
+Refer to the 'Usage' section for details.
+
+
 Known Limitations
 =================
 
-VIXL was developed to target JavaScript engines so a number of features from A64
-were deemed unnecessary:
+VIXL was developed for JavaScript engines, so a number of A64 features were
+deemed unnecessary:
 
  * Limited rounding mode support for floating point.
  * Limited support for synchronisation instructions.
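
The overview above is easiest to see in code. The following is a minimal sketch in the
spirit of `examples/getting-started.cc`: it assembles a tiny function with the
MacroAssembler and runs it in the simulator. It assumes a simulator build
(`USE_SIMULATOR`); the buffer size and register choices are illustrative only, not a
prescribed pattern.

```cpp
#include <cinttypes>
#include <cstdio>

#include "vixl/a64/macro-assembler-a64.h"
#include "vixl/a64/simulator-a64.h"

using namespace vixl;

int main() {
  // Assemble into a plain byte buffer; 4 KB is an arbitrary size for this demo.
  byte buffer[4096];
  MacroAssembler masm(buffer, sizeof(buffer));

  // Generated function: x0 = x0 + 42; return.
  masm.Add(x0, x0, 42);
  masm.Ret();
  masm.FinalizeCode();

  // Run the generated code in the simulator (simulator builds only).
  Decoder decoder;
  Simulator simulator(&decoder);
  simulator.set_xreg(0, 100);
  simulator.RunFrom(reinterpret_cast<Instruction*>(buffer));
  printf("x0 = %" PRId64 "\n", simulator.xreg(0));  // Expect 142.
  return 0;
}
```
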
diff --git a/SConstruct b/SConstruct
index 47e3e86..e29e9a7 100644
--- a/SConstruct
+++ b/SConstruct
@@ -49,18 +49,19 @@
 # Global configuration.
 PROJ_SRC_DIR   = 'src'
 PROJ_SRC_FILES = '''
-src/a64/assembler-a64.cc
-src/a64/cpu-a64.cc
-src/a64/debugger-a64.cc
-src/a64/decoder-a64.cc
-src/a64/disasm-a64.cc
-src/a64/instructions-a64.cc
-src/a64/instrument-a64.cc
-src/a64/logic-a64.cc
-src/a64/macro-assembler-a64.cc
-src/a64/simulator-a64.cc
-src/code-buffer.cc
-src/utils.cc
+src/vixl/a64/assembler-a64.cc
+src/vixl/a64/cpu-a64.cc
+src/vixl/a64/debugger-a64.cc
+src/vixl/a64/decoder-a64.cc
+src/vixl/a64/disasm-a64.cc
+src/vixl/a64/instructions-a64.cc
+src/vixl/a64/instrument-a64.cc
+src/vixl/a64/logic-a64.cc
+src/vixl/a64/macro-assembler-a64.cc
+src/vixl/a64/simulator-a64.cc
+src/vixl/code-buffer.cc
+src/vixl/compiler-intrinsics.cc
+src/vixl/utils.cc
 '''.split()
 PROJ_EXAMPLES_DIR = 'examples'
 PROJ_EXAMPLES_SRC_FILES = '''
@@ -119,9 +120,7 @@
     benchmarks/bench-branch-link-masm.cc
     '''.split()
 }
-RELEASE_OBJ_DIR  = 'obj/release'
-DEBUG_OBJ_DIR    = 'obj/debug'
-
+OBJ_DIR  = 'obj'
 
 # Helper functions.
 def abort(message):
@@ -133,6 +132,10 @@
   return map(lambda x: os.path.join(obj_dir, x), src_files)
 
 
+def is_compiler(compiler):
+  return env['CXX'].find(compiler) == 0
+
+
 def create_variant(obj_dir, targets_dir):
   VariantDir(os.path.join(obj_dir, PROJ_SRC_DIR), PROJ_SRC_DIR)
   for directory in targets_dir.itervalues():
@@ -146,10 +149,9 @@
 sim_default = 'off' if platform.machine() == 'aarch64' else 'on'
 args.Add(EnumVariable('simulator', 'build for the simulator', sim_default,
                       allowed_values = ['on', 'off']))
+args.Add('std', 'c++ standard')
 
 # Configure the environment.
-create_variant(RELEASE_OBJ_DIR, TARGET_SRC_DIR)
-create_variant(DEBUG_OBJ_DIR, TARGET_SRC_DIR)
 env = Environment(variables=args)
 
 # Commandline help.
@@ -175,18 +177,32 @@
   env.Append(LINKFLAGS = os.environ.get('LINKFLAGS').split())
 
 # Always look in 'src' for include files.
+# TODO: Restore the '-Wunreachable-code' flag. It breaks builds with clang 3.4
+# and std=c++98, so re-enable it conditionally once clang 3.5 or later can be
+# required.
 env.Append(CPPPATH = [PROJ_SRC_DIR])
 env.Append(CPPFLAGS = ['-Wall',
                        '-Werror',
                        '-fdiagnostics-show-option',
                        '-Wextra',
+                       '-Wredundant-decls',
                        '-pedantic',
                        # Explicitly enable the write-strings warning. VIXL uses
                        # const correctly when handling string constants.
                        '-Wwrite-strings'])
 
 build_suffix = ''
+std_path = 'default-std'
 
+if 'std' in env:
+  env.Append(CPPFLAGS = ['-std=' + env['std']])
+  std_path = env['std']
+
+if is_compiler('clang++'):
+  # This warning only works for Clang, when compiling the code base as C++11
+  # or newer. The compiler does not complain if the option is passed when
+  # compiling earlier C++ standards.
+  env.Append(CPPFLAGS = ['-Wimplicit-fallthrough'])
 
 if env['simulator'] == 'on':
   env.Append(CPPFLAGS = ['-DUSE_SIMULATOR'])
@@ -196,11 +212,9 @@
   env.Append(CPPFLAGS = ['-g', '-DVIXL_DEBUG'])
   # Append the debug mode suffix to the executable name.
   build_suffix += '_g'
-  build_dir = DEBUG_OBJ_DIR
 else:
   # Release mode.
   env.Append(CPPFLAGS = ['-O3'])
-  build_dir = RELEASE_OBJ_DIR
   process = subprocess.Popen(env['CXX'] + ' --version | grep "gnu.*4\.8"',
                              shell = True,
                              stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
@@ -214,6 +228,9 @@
     # GCC 4.8.
     env.Append(CPPFLAGS = ['-Wno-maybe-uninitialized'])
 
+# Configure build directory
+build_dir = os.path.join(OBJ_DIR, env['mode'], env['CXX'], std_path, '')
+create_variant(build_dir, TARGET_SRC_DIR)
 
 # The lists of available targets and target names.
 targets = []
@@ -226,7 +243,7 @@
 
 
 # The vixl library.
-libvixl = env.Library('vixl' + build_suffix,
+libvixl = env.Library(build_dir + 'vixl' + build_suffix,
                       list_target(build_dir, PROJ_SRC_FILES))
 create_alias('libvixl', libvixl)
 
@@ -238,7 +255,7 @@
 VariantDir(test_ex_vdir, '.')
 test_ex_obj = env.Object(list_target(test_ex_vdir, PROJ_EXAMPLES_SRC_FILES),
                          CPPFLAGS = env['CPPFLAGS'] + ['-DTEST_EXAMPLES'])
-test = env.Program('test-runner' + build_suffix,
+test = env.Program(build_dir + 'test-runner' + build_suffix,
                    list_target(build_dir, TARGET_SRC_FILES['test']) +
                    test_ex_obj + libvixl,
                    CPPPATH = env['CPPPATH'] + [PROJ_EXAMPLES_DIR])
@@ -248,7 +265,7 @@
 benchmarks = ['bench-dataop', 'bench-branch', 'bench-branch-link',
               'bench-branch-masm', 'bench-branch-link-masm']
 for bench in benchmarks:
-  prog = env.Program(bench + build_suffix,
+  prog = env.Program(build_dir + bench + build_suffix,
                      list_target(build_dir, TARGET_SRC_FILES[bench]) + libvixl)
   create_alias(bench, prog)
 # Alias to build all benchmarks.
@@ -258,7 +275,7 @@
 examples = []
 for example in PROJ_EXAMPLES_SRC_FILES:
   example_name = "example-" + os.path.splitext(os.path.basename(example))[0]
-  prog = env.Program(example_name,
+  prog = env.Program(build_dir + example_name,
                      [os.path.join(build_dir, example)] + libvixl,
                      CPPPATH = env['CPPPATH'] + [PROJ_EXAMPLES_DIR])
   create_alias(example_name, prog)
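
With the SConstruct changes above, build output is no longer split between the fixed
`obj/release` and `obj/debug` directories; objects and executables now land in a
per-configuration directory of the form `obj/<mode>/<CXX>/<std>/`, for example
`obj/release/g++/c++11/` when invoking `scons mode=release std=c++11` (the compiler and
standard values here are illustrative, taken from the command line at build time).
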
diff --git a/benchmarks/bench-branch-link-masm.cc b/benchmarks/bench-branch-link-masm.cc
index 2f6c65e..115f402 100644
--- a/benchmarks/bench-branch-link-masm.cc
+++ b/benchmarks/bench-branch-link-masm.cc
@@ -24,9 +24,9 @@
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-#include "a64/macro-assembler-a64.h"
-#include "a64/instructions-a64.h"
-#include "globals.h"
+#include "vixl/a64/macro-assembler-a64.h"
+#include "vixl/a64/instructions-a64.h"
+#include "vixl/globals.h"
 
 using namespace vixl;
 
diff --git a/benchmarks/bench-branch-link.cc b/benchmarks/bench-branch-link.cc
index 6448566..b18cd8b 100644
--- a/benchmarks/bench-branch-link.cc
+++ b/benchmarks/bench-branch-link.cc
@@ -24,9 +24,9 @@
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-#include "a64/macro-assembler-a64.h"
-#include "a64/instructions-a64.h"
-#include "globals.h"
+#include "vixl/a64/macro-assembler-a64.h"
+#include "vixl/a64/instructions-a64.h"
+#include "vixl/globals.h"
 
 using namespace vixl;
 
diff --git a/benchmarks/bench-branch-masm.cc b/benchmarks/bench-branch-masm.cc
index 910403c..23cbd1e 100644
--- a/benchmarks/bench-branch-masm.cc
+++ b/benchmarks/bench-branch-masm.cc
@@ -24,10 +24,10 @@
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-#include "globals.h"
+#include "vixl/globals.h"
 
-#include "a64/macro-assembler-a64.h"
-#include "a64/instructions-a64.h"
+#include "vixl/a64/macro-assembler-a64.h"
+#include "vixl/a64/instructions-a64.h"
 
 using namespace vixl;
 
diff --git a/benchmarks/bench-branch.cc b/benchmarks/bench-branch.cc
index 227ba95..706ecfd 100644
--- a/benchmarks/bench-branch.cc
+++ b/benchmarks/bench-branch.cc
@@ -24,9 +24,9 @@
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-#include "a64/macro-assembler-a64.h"
-#include "a64/instructions-a64.h"
-#include "globals.h"
+#include "vixl/a64/macro-assembler-a64.h"
+#include "vixl/a64/instructions-a64.h"
+#include "vixl/globals.h"
 
 using namespace vixl;
 
diff --git a/benchmarks/bench-dataop.cc b/benchmarks/bench-dataop.cc
index 431e991..cf9faed 100644
--- a/benchmarks/bench-dataop.cc
+++ b/benchmarks/bench-dataop.cc
@@ -24,9 +24,9 @@
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-#include "a64/macro-assembler-a64.h"
-#include "a64/instructions-a64.h"
-#include "globals.h"
+#include "vixl/a64/macro-assembler-a64.h"
+#include "vixl/a64/instructions-a64.h"
+#include "vixl/globals.h"
 
 using namespace vixl;
 
diff --git a/doc/changelog.md b/doc/changelog.md
index 4881ef4..55943e5 100644
--- a/doc/changelog.md
+++ b/doc/changelog.md
@@ -1,6 +1,13 @@
 VIXL Change Log
 ===============
 
+* 1.9
+    + Improved compatibility with Android build system.
+    + Improved compatibility with Clang toolchain.
+    + Added support for `umulh` instruction.
+    + Added support for `fcmpe` and `fccmpe` instructions.
+    + Other small bug fixes and improvements.
+
 * 1.8
     + Complete NEON instruction set support.
     + Support long branches using veneers.
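
The 1.9 changelog entry above lists the new `umulh`, `fcmpe` and `fccmpe` instructions;
their assembler entry points appear in the `assembler-a64.h` changes later in this diff.
A minimal sketch of emitting them, with register and condition choices picked purely for
illustration:

```cpp
#include "vixl/a64/assembler-a64.h"

using namespace vixl;

// Emit the instructions added in 1.9 into an existing assembler.
void EmitNewVixl19Instructions(Assembler* assm) {
  // Unsigned multiply high: x0 = bits <127:64> of (x1 * x2).
  assm->umulh(x0, x1, x2);

  // Signalling FP compares: like fcmp/fccmp, but they also raise an
  // Invalid Operation exception for quiet NaN operands.
  assm->fcmpe(d0, d1);
  assm->fcmpe(s0, 0.0);             // The immediate form only accepts +0.0.
  assm->fccmpe(d0, d1, NoFlag, eq);
}
```
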
diff --git a/examples/custom-disassembler.h b/examples/custom-disassembler.h
index 382a55d..4fcc693 100644
--- a/examples/custom-disassembler.h
+++ b/examples/custom-disassembler.h
@@ -27,7 +27,7 @@
 #ifndef VIXL_EXAMPLES_CUSTOM_DISASSEMBLER_H_
 #define VIXL_EXAMPLES_CUSTOM_DISASSEMBLER_H_
 
-#include "a64/disasm-a64.h"
+#include "vixl/a64/disasm-a64.h"
 
 using namespace vixl;
 
diff --git a/examples/examples.h b/examples/examples.h
index 5d74e29..8c51589 100644
--- a/examples/examples.h
+++ b/examples/examples.h
@@ -27,9 +27,9 @@
 #ifndef VIXL_EXAMPLE_EXAMPLES_H_
 # define VIXL_EXAMPLE_EXAMPLES_H_
 
-#include "a64/simulator-a64.h"
-#include "a64/debugger-a64.h"
-#include "a64/macro-assembler-a64.h"
+#include "vixl/a64/simulator-a64.h"
+#include "vixl/a64/debugger-a64.h"
+#include "vixl/a64/macro-assembler-a64.h"
 
 using namespace vixl;
 
diff --git a/examples/getting-started.cc b/examples/getting-started.cc
index 27e9dd7..0987429 100644
--- a/examples/getting-started.cc
+++ b/examples/getting-started.cc
@@ -24,8 +24,8 @@
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-#include "a64/simulator-a64.h"
-#include "a64/macro-assembler-a64.h"
+#include "vixl/a64/simulator-a64.h"
+#include "vixl/a64/macro-assembler-a64.h"
 
 #define BUF_SIZE (4096)
 #define __ masm->
diff --git a/examples/neon-matrix-multiply.cc b/examples/neon-matrix-multiply.cc
index 88123fb..6a27150 100644
--- a/examples/neon-matrix-multiply.cc
+++ b/examples/neon-matrix-multiply.cc
@@ -117,7 +117,7 @@
   float mat1[kLength], mat2[kLength], output[kLength];
 
   // Initialise the output matrix to the zero matrix.
-  memset(output, 0, sizeof(float)*kLength);
+  memset(output, 0, sizeof(output[0]) * kLength);
 
   // Fill the two input matrices with some 32 bit floating point values.
   // Array initialisation using curly brackets is also possible like so:
diff --git a/src/a64/assembler-a64.cc b/src/vixl/a64/assembler-a64.cc
similarity index 98%
rename from src/a64/assembler-a64.cc
rename to src/vixl/a64/assembler-a64.cc
index 6af2291..9f85e8f 100644
--- a/src/a64/assembler-a64.cc
+++ b/src/vixl/a64/assembler-a64.cc
@@ -26,7 +26,7 @@
 
 
 #include <cmath>
-#include "a64/assembler-a64.h"
+#include "vixl/a64/assembler-a64.h"
 
 namespace vixl {
 
@@ -35,7 +35,7 @@
   if (IsEmpty()) {
     return NoCPUReg;
   }
-  int index = CountTrailingZeros(list_, kRegListSizeInBits);
+  int index = CountTrailingZeros(list_);
   VIXL_ASSERT((1 << index) & list_);
   Remove(index);
   return CPURegister(index, size_, type_);
@@ -47,7 +47,7 @@
   if (IsEmpty()) {
     return NoCPUReg;
   }
-  int index = CountLeadingZeros(list_, kRegListSizeInBits);
+  int index = CountLeadingZeros(list_);
   index = kRegListSizeInBits - 1 - index;
   VIXL_ASSERT((1 << index) & list_);
   Remove(index);
@@ -463,6 +463,12 @@
 }
 
 
+void MemOperand::AddOffset(int64_t offset) {
+  VIXL_ASSERT(IsImmediateOffset());
+  offset_ += offset;
+}
+
+
 // Assembler
 Assembler::Assembler(byte* buffer, size_t capacity,
                      PositionIndependentCodeOption pic)
@@ -1349,6 +1355,14 @@
 }
 
 
+void Assembler::umulh(const Register& xd,
+                      const Register& xn,
+                      const Register& xm) {
+  VIXL_ASSERT(xd.Is64Bits() && xn.Is64Bits() && xm.Is64Bits());
+  DataProcessing3Source(xd, xn, xm, xzr, UMULH_x);
+}
+
+
 void Assembler::udiv(const Register& rd,
                      const Register& rn,
                      const Register& rm) {
@@ -2628,33 +2642,78 @@
 }
 
 
-void Assembler::fcmp(const VRegister& vn,
-                     const VRegister& vm) {
+void Assembler::FPCompareMacro(const VRegister& vn,
+                               double value,
+                               FPTrapFlags trap) {
+  USE(value);
+  // Although the fcmp{e} instructions can strictly only take an immediate
+  // value of +0.0, we don't need to check for -0.0 because the sign of 0.0
+  // doesn't affect the result of the comparison.
+  VIXL_ASSERT(value == 0.0);
+  VIXL_ASSERT(vn.Is1S() || vn.Is1D());
+  Instr op = (trap == EnableTrap) ? FCMPE_zero : FCMP_zero;
+  Emit(FPType(vn) | op | Rn(vn));
+}
+
+
+void Assembler::FPCompareMacro(const VRegister& vn,
+                               const VRegister& vm,
+                               FPTrapFlags trap) {
   VIXL_ASSERT(vn.Is1S() || vn.Is1D());
   VIXL_ASSERT(vn.IsSameSizeAndType(vm));
-  Emit(FPType(vn) | FCMP | Rm(vm) | Rn(vn));
+  Instr op = (trap == EnableTrap) ? FCMPE : FCMP;
+  Emit(FPType(vn) | op | Rm(vm) | Rn(vn));
+}
+
+
+void Assembler::fcmp(const VRegister& vn,
+                     const VRegister& vm) {
+  FPCompareMacro(vn, vm, DisableTrap);
+}
+
+
+void Assembler::fcmpe(const VRegister& vn,
+                      const VRegister& vm) {
+  FPCompareMacro(vn, vm, EnableTrap);
 }
 
 
 void Assembler::fcmp(const VRegister& vn,
                      double value) {
-  USE(value);
-  // Although the fcmp instruction can strictly only take an immediate value of
-  // +0.0, we don't need to check for -0.0 because the sign of 0.0 doesn't
-  // affect the result of the comparison.
-  VIXL_ASSERT(value == 0.0);
-  VIXL_ASSERT(vn.Is1S() || vn.Is1D());
-  Emit(FPType(vn) | FCMP_zero | Rn(vn));
+  FPCompareMacro(vn, value, DisableTrap);
 }
 
 
+void Assembler::fcmpe(const VRegister& vn,
+                      double value) {
+  FPCompareMacro(vn, value, EnableTrap);
+}
+
+
+void Assembler::FPCCompareMacro(const VRegister& vn,
+                                const VRegister& vm,
+                                StatusFlags nzcv,
+                                Condition cond,
+                                FPTrapFlags trap) {
+  VIXL_ASSERT(vn.Is1S() || vn.Is1D());
+  VIXL_ASSERT(vn.IsSameSizeAndType(vm));
+  Instr op = (trap == EnableTrap) ? FCCMPE : FCCMP;
+  Emit(FPType(vn) | op | Rm(vm) | Cond(cond) | Rn(vn) | Nzcv(nzcv));
+}
+
 void Assembler::fccmp(const VRegister& vn,
                       const VRegister& vm,
                       StatusFlags nzcv,
                       Condition cond) {
-  VIXL_ASSERT(vn.Is1S() || vn.Is1D());
-  VIXL_ASSERT(vn.IsSameSizeAndType(vm));
-  Emit(FPType(vn) | FCCMP | Rm(vm) | Cond(cond) | Rn(vn) | Nzcv(nzcv));
+  FPCCompareMacro(vn, vm, nzcv, cond, DisableTrap);
+}
+
+
+void Assembler::fccmpe(const VRegister& vn,
+                       const VRegister& vm,
+                       StatusFlags nzcv,
+                       Condition cond) {
+  FPCCompareMacro(vn, vm, nzcv, cond, EnableTrap);
 }
 
 
@@ -4948,6 +5007,7 @@
 
 
 bool Assembler::IsImmLSPair(int64_t offset, unsigned access_size) {
+  VIXL_ASSERT(access_size <= kQRegSizeInBytesLog2);
   bool offset_is_size_multiple =
       (((offset >> access_size) << access_size) == offset);
   return offset_is_size_multiple && is_int7(offset >> access_size);
@@ -4955,6 +5015,7 @@
 
 
 bool Assembler::IsImmLSScaled(int64_t offset, unsigned access_size) {
+  VIXL_ASSERT(access_size <= kQRegSizeInBytesLog2);
   bool offset_is_size_multiple =
       (((offset >> access_size) << access_size) == offset);
   return offset_is_size_multiple && is_uint12(offset >> access_size);
@@ -5319,10 +5380,8 @@
     }
   }
 
-  int number_of_unique_regs =
-    CountSetBits(unique_regs, sizeof(unique_regs) * 8);
-  int number_of_unique_fpregs =
-    CountSetBits(unique_fpregs, sizeof(unique_fpregs) * 8);
+  int number_of_unique_regs = CountSetBits(unique_regs);
+  int number_of_unique_fpregs = CountSetBits(unique_fpregs);
 
   VIXL_ASSERT(number_of_valid_regs >= number_of_unique_regs);
   VIXL_ASSERT(number_of_valid_fpregs >= number_of_unique_fpregs);
diff --git a/src/a64/assembler-a64.h b/src/vixl/a64/assembler-a64.h
similarity index 98%
rename from src/a64/assembler-a64.h
rename to src/vixl/a64/assembler-a64.h
index 5f24105..39763b3 100644
--- a/src/a64/assembler-a64.h
+++ b/src/vixl/a64/assembler-a64.h
@@ -28,11 +28,11 @@
 #define VIXL_A64_ASSEMBLER_A64_H_
 
 
-#include "globals.h"
-#include "invalset.h"
-#include "utils.h"
-#include "code-buffer.h"
-#include "a64/instructions-a64.h"
+#include "vixl/globals.h"
+#include "vixl/invalset.h"
+#include "vixl/utils.h"
+#include "vixl/code-buffer.h"
+#include "vixl/a64/instructions-a64.h"
 
 namespace vixl {
 
@@ -55,6 +55,7 @@
     kInvalid = 0,
     kRegister,
     kVRegister,
+    kFPRegister = kVRegister,
     kNoRegister
   };
 
@@ -556,6 +557,10 @@
                                  const CPURegList& list_3,
                                  const CPURegList& list_4);
 
+  bool Overlaps(const CPURegList& other) const {
+    return (type_ == other.type_) && ((list_ & other.list_) != 0);
+  }
+
   RegList list() const {
     VIXL_ASSERT(IsValid());
     return list_;
@@ -600,7 +605,7 @@
 
   int Count() const {
     VIXL_ASSERT(IsValid());
-    return CountSetBits(list_, kRegListSizeInBits);
+    return CountSetBits(list_);
   }
 
   unsigned RegisterSizeInBits() const {
@@ -630,7 +635,7 @@
 
 // AAPCS64 callee-saved registers.
 extern const CPURegList kCalleeSaved;
-extern const CPURegList kCalleeSavedFP;
+extern const CPURegList kCalleeSavedV;
 
 
 // AAPCS64 caller-saved registers. Note that this includes lr.
@@ -710,17 +715,17 @@
   explicit MemOperand(Register base,
                       int64_t offset = 0,
                       AddrMode addrmode = Offset);
-  explicit MemOperand(Register base,
-                      Register regoffset,
-                      Shift shift = LSL,
-                      unsigned shift_amount = 0);
-  explicit MemOperand(Register base,
-                      Register regoffset,
-                      Extend extend,
-                      unsigned shift_amount = 0);
-  explicit MemOperand(Register base,
-                      const Operand& offset,
-                      AddrMode addrmode = Offset);
+  MemOperand(Register base,
+             Register regoffset,
+             Shift shift = LSL,
+             unsigned shift_amount = 0);
+  MemOperand(Register base,
+             Register regoffset,
+             Extend extend,
+             unsigned shift_amount = 0);
+  MemOperand(Register base,
+             const Operand& offset,
+             AddrMode addrmode = Offset);
 
   const Register& base() const { return base_; }
   const Register& regoffset() const { return regoffset_; }
@@ -734,6 +739,8 @@
   bool IsPreIndex() const;
   bool IsPostIndex() const;
 
+  void AddOffset(int64_t offset);
+
  private:
   Register base_;
   Register regoffset_;
@@ -1606,6 +1613,11 @@
     umaddl(rd, rn, rm, xzr);
   }
 
+  // Unsigned multiply high: 64 x 64 -> 64-bit <127:64>.
+  void umulh(const Register& xd,
+             const Register& xn,
+             const Register& xm);
+
   // Signed long multiply and subtract: 64 - (32 x 32) -> 64-bit.
   void smsubl(const Register& rd,
               const Register& rn,
@@ -2022,18 +2034,44 @@
   // FP round to integer, towards zero.
   void frintz(const VRegister& vd, const VRegister& vn);
 
+  void FPCompareMacro(const VRegister& vn,
+                      double value,
+                      FPTrapFlags trap);
+
+  void FPCompareMacro(const VRegister& vn,
+                      const VRegister& vm,
+                      FPTrapFlags trap);
+
   // FP compare registers.
   void fcmp(const VRegister& vn, const VRegister& vm);
 
   // FP compare immediate.
   void fcmp(const VRegister& vn, double value);
 
+  void FPCCompareMacro(const VRegister& vn,
+                       const VRegister& vm,
+                       StatusFlags nzcv,
+                       Condition cond,
+                       FPTrapFlags trap);
+
   // FP conditional compare.
   void fccmp(const VRegister& vn,
              const VRegister& vm,
              StatusFlags nzcv,
              Condition cond);
 
+  // FP signaling compare registers.
+  void fcmpe(const VRegister& vn, const VRegister& vm);
+
+  // FP signaling compare immediate.
+  void fcmpe(const VRegister& vn, double value);
+
+  // FP conditional signaling compare.
+  void fccmpe(const VRegister& vn,
+              const VRegister& vm,
+              StatusFlags nzcv,
+              Condition cond);
+
   // FP conditional select.
   void fcsel(const VRegister& vd,
              const VRegister& vn,
@@ -3949,8 +3987,8 @@
                            unsigned* n = NULL,
                            unsigned* imm_s = NULL,
                            unsigned* imm_r = NULL);
-  static bool IsImmLSPair(int64_t offset, unsigned size);
-  static bool IsImmLSScaled(int64_t offset, unsigned size);
+  static bool IsImmLSPair(int64_t offset, unsigned access_size);
+  static bool IsImmLSScaled(int64_t offset, unsigned access_size);
   static bool IsImmLSUnscaled(int64_t offset);
   static bool IsImmMovn(uint64_t imm, unsigned reg_size);
   static bool IsImmMovz(uint64_t imm, unsigned reg_size);
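
Besides the new instruction declarations, the header changes above add two small
utilities: `CPURegList::Overlaps` and `MemOperand::AddOffset`. A brief sketch of how
they behave (register and offset choices are illustrative):

```cpp
#include "vixl/a64/assembler-a64.h"

using namespace vixl;

void RegListAndMemOperandSketch() {
  // Overlaps() is true when two lists of the same register type share a register.
  CPURegList list_a(x0, x1, x2);
  CPURegList list_b(x2, x3);
  bool shared = list_a.Overlaps(list_b);  // true: both contain x2.
  (void)shared;

  // AddOffset() adjusts an immediate-offset MemOperand in place.
  MemOperand addr(x10, 16);  // [x10, #16]
  addr.AddOffset(8);         // Now [x10, #24]; asserts IsImmediateOffset().
}
```
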
diff --git a/src/a64/constants-a64.h b/src/vixl/a64/constants-a64.h
similarity index 99%
rename from src/a64/constants-a64.h
rename to src/vixl/a64/constants-a64.h
index 0791575..2caa73a 100644
--- a/src/a64/constants-a64.h
+++ b/src/vixl/a64/constants-a64.h
@@ -225,6 +225,11 @@
   return static_cast<Condition>(cond ^ 1);
 }
 
+enum FPTrapFlags {
+  EnableTrap   = 1,
+  DisableTrap = 0
+};
+
 enum FlagsUpdate {
   SetFlags   = 1,
   LeaveFlags = 0
@@ -1092,8 +1097,10 @@
   FCMP_zero      = FCMP_s_zero,
   FCMPE_s        = FPCompareFixed | 0x00000010,
   FCMPE_d        = FPCompareFixed | FP64 | 0x00000010,
+  FCMPE          = FCMPE_s,
   FCMPE_s_zero   = FPCompareFixed | 0x00000018,
-  FCMPE_d_zero   = FPCompareFixed | FP64 | 0x00000018
+  FCMPE_d_zero   = FPCompareFixed | FP64 | 0x00000018,
+  FCMPE_zero     = FCMPE_s_zero
 };
 
 // Floating point conditional compare.
diff --git a/src/a64/cpu-a64.cc b/src/vixl/a64/cpu-a64.cc
similarity index 98%
rename from src/a64/cpu-a64.cc
rename to src/vixl/a64/cpu-a64.cc
index f71a065..7a33551 100644
--- a/src/a64/cpu-a64.cc
+++ b/src/vixl/a64/cpu-a64.cc
@@ -24,8 +24,8 @@
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-#include "utils.h"
-#include "a64/cpu-a64.h"
+#include "vixl/utils.h"
+#include "vixl/a64/cpu-a64.h"
 
 namespace vixl {
 
diff --git a/src/a64/cpu-a64.h b/src/vixl/a64/cpu-a64.h
similarity index 97%
rename from src/a64/cpu-a64.h
rename to src/vixl/a64/cpu-a64.h
index 71e7fd4..cdf09a6 100644
--- a/src/a64/cpu-a64.h
+++ b/src/vixl/a64/cpu-a64.h
@@ -27,8 +27,8 @@
 #ifndef VIXL_CPU_A64_H
 #define VIXL_CPU_A64_H
 
-#include "globals.h"
-#include "instructions-a64.h"
+#include "vixl/globals.h"
+#include "vixl/a64/instructions-a64.h"
 
 namespace vixl {
 
diff --git a/src/a64/debugger-a64.cc b/src/vixl/a64/debugger-a64.cc
similarity index 99%
rename from src/a64/debugger-a64.cc
rename to src/vixl/a64/debugger-a64.cc
index e412e9c..1a65bd3 100644
--- a/src/a64/debugger-a64.cc
+++ b/src/vixl/a64/debugger-a64.cc
@@ -26,7 +26,7 @@
 
 #ifdef USE_SIMULATOR
 
-#include "a64/debugger-a64.h"
+#include "vixl/a64/debugger-a64.h"
 
 namespace vixl {
 
@@ -645,7 +645,8 @@
     case BRK:
       DoBreakpoint(instr);
       return;
-    case HLT:   // Fall through.
+    case HLT:
+      VIXL_FALLTHROUGH();
     default: Simulator::VisitException(instr);
   }
 }
@@ -994,6 +995,7 @@
       break;
     case 'i':
       if (length == 1) return new Format<uint32_t>("%08" PRIx32, 'i');
+      VIXL_FALLTHROUGH();
     default: return NULL;
   }
 
diff --git a/src/a64/debugger-a64.h b/src/vixl/a64/debugger-a64.h
similarity index 96%
rename from src/a64/debugger-a64.h
rename to src/vixl/a64/debugger-a64.h
index fbc5b59..aecd620 100644
--- a/src/a64/debugger-a64.h
+++ b/src/vixl/a64/debugger-a64.h
@@ -32,10 +32,10 @@
 #include <errno.h>
 #include <vector>
 
-#include "globals.h"
-#include "utils.h"
-#include "a64/constants-a64.h"
-#include "a64/simulator-a64.h"
+#include "vixl/globals.h"
+#include "vixl/utils.h"
+#include "vixl/a64/constants-a64.h"
+#include "vixl/a64/simulator-a64.h"
 
 namespace vixl {
 
diff --git a/src/a64/decoder-a64.cc b/src/vixl/a64/decoder-a64.cc
similarity index 99%
rename from src/a64/decoder-a64.cc
rename to src/vixl/a64/decoder-a64.cc
index 58834be..5ba2d3c 100644
--- a/src/a64/decoder-a64.cc
+++ b/src/vixl/a64/decoder-a64.cc
@@ -24,9 +24,9 @@
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-#include "globals.h"
-#include "utils.h"
-#include "a64/decoder-a64.h"
+#include "vixl/globals.h"
+#include "vixl/utils.h"
+#include "vixl/a64/decoder-a64.h"
 
 namespace vixl {
 
@@ -488,6 +488,7 @@
         case 6: {
           if (instr->Bit(29) == 0x1) {
             VisitUnallocated(instr);
+            VIXL_FALLTHROUGH();
           } else {
             if (instr->Bit(30) == 0) {
               if ((instr->Bit(15) == 0x1) ||
diff --git a/src/a64/decoder-a64.h b/src/vixl/a64/decoder-a64.h
similarity index 99%
rename from src/a64/decoder-a64.h
rename to src/vixl/a64/decoder-a64.h
index 81cd0c2..4f4f19c 100644
--- a/src/a64/decoder-a64.h
+++ b/src/vixl/a64/decoder-a64.h
@@ -29,8 +29,8 @@
 
 #include <list>
 
-#include "globals.h"
-#include "a64/instructions-a64.h"
+#include "vixl/globals.h"
+#include "vixl/a64/instructions-a64.h"
 
 
 // List macro containing all visitors needed by the decoder class.
diff --git a/src/a64/disasm-a64.cc b/src/vixl/a64/disasm-a64.cc
similarity index 99%
rename from src/a64/disasm-a64.cc
rename to src/vixl/a64/disasm-a64.cc
index 37b4b5c..a12d028 100644
--- a/src/a64/disasm-a64.cc
+++ b/src/vixl/a64/disasm-a64.cc
@@ -25,7 +25,7 @@
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <cstdlib>
-#include "a64/disasm-a64.h"
+#include "vixl/a64/disasm-a64.h"
 
 namespace vixl {
 
@@ -890,9 +890,9 @@
     case LDUR_s:   mnemonic = "ldur"; form = form_s; break;
     case LDUR_d:   mnemonic = "ldur"; form = form_d; break;
     case LDUR_q:   mnemonic = "ldur"; form = form_q; break;
-    case LDURSB_x: form = form_x;  // Fall through.
+    case LDURSB_x: form = form_x; VIXL_FALLTHROUGH();
     case LDURSB_w: mnemonic = "ldursb"; break;
-    case LDURSH_x: form = form_x;  // Fall through.
+    case LDURSH_x: form = form_x; VIXL_FALLTHROUGH();
     case LDURSH_w: mnemonic = "ldursh"; break;
     case LDURSW_x: mnemonic = "ldursw"; form = form_x; break;
     case PRFUM:    mnemonic = "prfum"; form = form_prefetch; break;
@@ -1054,9 +1054,13 @@
 
   switch (instr->Mask(FPCompareMask)) {
     case FCMP_s_zero:
-    case FCMP_d_zero: form = form_zero;  // Fall through.
+    case FCMP_d_zero: form = form_zero; VIXL_FALLTHROUGH();
     case FCMP_s:
     case FCMP_d: mnemonic = "fcmp"; break;
+    case FCMPE_s_zero:
+    case FCMPE_d_zero: form = form_zero; VIXL_FALLTHROUGH();
+    case FCMPE_s:
+    case FCMPE_d: mnemonic = "fcmpe"; break;
     default: form = "(FPCompare)";
   }
   Format(instr, mnemonic, form);
@@ -2884,8 +2888,8 @@
     field_len = 3;
   }
 
-  CPURegister::RegisterType reg_type;
-  unsigned reg_size;
+  CPURegister::RegisterType reg_type = CPURegister::kRegister;
+  unsigned reg_size = kXRegSize;
 
   if (reg_prefix == 'R') {
     reg_prefix = instr->SixtyFourBits() ? 'X' : 'W';
@@ -2913,8 +2917,6 @@
       return field_len;
     default:
       VIXL_UNREACHABLE();
-      reg_type = CPURegister::kRegister;
-      reg_size = kXRegSize;
   }
 
   if ((reg_type == CPURegister::kRegister) &&
@@ -3087,6 +3089,7 @@
               return 0;
             }
           }
+          VIXL_FALLTHROUGH();
         }
         case 'L': {  // IVLSLane[0123] - suffix indicates access size shift.
           AppendToOutput("%d", instr->NEONLSIndex(format[8] - '0'));
@@ -3236,7 +3239,8 @@
   switch (format[1]) {
     case 'D': {  // HDP.
       VIXL_ASSERT(instr->ShiftDP() != ROR);
-    }  // Fall through.
+      VIXL_FALLTHROUGH();
+    }
     case 'L': {  // HLo.
       if (instr->ImmDPShift() != 0) {
         const char* shift_type[] = {"lsl", "lsr", "asr", "ror"};
diff --git a/src/a64/disasm-a64.h b/src/vixl/a64/disasm-a64.h
similarity index 97%
rename from src/a64/disasm-a64.h
rename to src/vixl/a64/disasm-a64.h
index dcb6f08..e203156 100644
--- a/src/a64/disasm-a64.h
+++ b/src/vixl/a64/disasm-a64.h
@@ -27,11 +27,11 @@
 #ifndef VIXL_A64_DISASM_A64_H
 #define VIXL_A64_DISASM_A64_H
 
-#include "globals.h"
-#include "utils.h"
-#include "instructions-a64.h"
-#include "decoder-a64.h"
-#include "assembler-a64.h"
+#include "vixl/globals.h"
+#include "vixl/utils.h"
+#include "vixl/a64/instructions-a64.h"
+#include "vixl/a64/decoder-a64.h"
+#include "vixl/a64/assembler-a64.h"
 
 namespace vixl {
 
diff --git a/src/a64/instructions-a64.cc b/src/vixl/a64/instructions-a64.cc
similarity index 99%
rename from src/a64/instructions-a64.cc
rename to src/vixl/a64/instructions-a64.cc
index f9f4a42..6f6b5d2 100644
--- a/src/a64/instructions-a64.cc
+++ b/src/vixl/a64/instructions-a64.cc
@@ -24,8 +24,8 @@
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-#include "a64/instructions-a64.h"
-#include "a64/assembler-a64.h"
+#include "vixl/a64/instructions-a64.h"
+#include "vixl/a64/assembler-a64.h"
 
 namespace vixl {
 
diff --git a/src/a64/instructions-a64.h b/src/vixl/a64/instructions-a64.h
similarity index 99%
rename from src/a64/instructions-a64.h
rename to src/vixl/a64/instructions-a64.h
index d3bbd9c..64da966 100644
--- a/src/a64/instructions-a64.h
+++ b/src/vixl/a64/instructions-a64.h
@@ -27,9 +27,9 @@
 #ifndef VIXL_A64_INSTRUCTIONS_A64_H_
 #define VIXL_A64_INSTRUCTIONS_A64_H_
 
-#include "globals.h"
-#include "utils.h"
-#include "a64/constants-a64.h"
+#include "vixl/globals.h"
+#include "vixl/utils.h"
+#include "vixl/a64/constants-a64.h"
 
 namespace vixl {
 // ISA constants. --------------------------------------------------------------
diff --git a/src/a64/instrument-a64.cc b/src/vixl/a64/instrument-a64.cc
similarity index 97%
rename from src/a64/instrument-a64.cc
rename to src/vixl/a64/instrument-a64.cc
index 36923e7..21ec604 100644
--- a/src/a64/instrument-a64.cc
+++ b/src/vixl/a64/instrument-a64.cc
@@ -24,7 +24,7 @@
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-#include "a64/instrument-a64.h"
+#include "vixl/a64/instrument-a64.h"
 
 namespace vixl {
 
@@ -421,22 +421,26 @@
   static Counter* store_fp_counter = GetCounter("Store FP");
 
   switch (instr->Mask(LoadStoreMask)) {
-    case STRB_w:    // Fall through.
-    case STRH_w:    // Fall through.
-    case STR_w:     // Fall through.
+    case STRB_w:
+    case STRH_w:
+    case STR_w:
+      VIXL_FALLTHROUGH();
     case STR_x:     store_int_counter->Increment(); break;
-    case STR_s:     // Fall through.
+    case STR_s:
+      VIXL_FALLTHROUGH();
     case STR_d:     store_fp_counter->Increment(); break;
-    case LDRB_w:    // Fall through.
-    case LDRH_w:    // Fall through.
-    case LDR_w:     // Fall through.
-    case LDR_x:     // Fall through.
-    case LDRSB_x:   // Fall through.
-    case LDRSH_x:   // Fall through.
-    case LDRSW_x:   // Fall through.
-    case LDRSB_w:   // Fall through.
+    case LDRB_w:
+    case LDRH_w:
+    case LDR_w:
+    case LDR_x:
+    case LDRSB_x:
+    case LDRSH_x:
+    case LDRSW_x:
+    case LDRSB_w:
+      VIXL_FALLTHROUGH();
     case LDRSH_w:   load_int_counter->Increment(); break;
-    case LDR_s:     // Fall through.
+    case LDR_s:
+      VIXL_FALLTHROUGH();
     case LDR_d:     load_fp_counter->Increment(); break;
   }
 }
diff --git a/src/a64/instrument-a64.h b/src/vixl/a64/instrument-a64.h
similarity index 95%
rename from src/a64/instrument-a64.h
rename to src/vixl/a64/instrument-a64.h
index a55369a..8468ceb 100644
--- a/src/a64/instrument-a64.h
+++ b/src/vixl/a64/instrument-a64.h
@@ -27,11 +27,11 @@
 #ifndef VIXL_A64_INSTRUMENT_A64_H_
 #define VIXL_A64_INSTRUMENT_A64_H_
 
-#include "globals.h"
-#include "utils.h"
-#include "a64/decoder-a64.h"
-#include "a64/constants-a64.h"
-#include "a64/instrument-a64.h"
+#include "vixl/globals.h"
+#include "vixl/utils.h"
+#include "vixl/a64/decoder-a64.h"
+#include "vixl/a64/constants-a64.h"
+#include "vixl/a64/instrument-a64.h"
 
 namespace vixl {
 
diff --git a/src/a64/logic-a64.cc b/src/vixl/a64/logic-a64.cc
similarity index 90%
rename from src/a64/logic-a64.cc
rename to src/vixl/a64/logic-a64.cc
index c367b35..2b62443 100644
--- a/src/a64/logic-a64.cc
+++ b/src/vixl/a64/logic-a64.cc
@@ -24,9 +24,365 @@
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-#include "a64/simulator-a64.h"
+#include <cmath>
+#include "vixl/a64/simulator-a64.h"
 
 namespace vixl {
+
+template<> double Simulator::FPDefaultNaN<double>() {
+  return kFP64DefaultNaN;
+}
+
+
+template<> float Simulator::FPDefaultNaN<float>() {
+  return kFP32DefaultNaN;
+}
+
+// See FPRound for a description of this function.
+static inline double FPRoundToDouble(int64_t sign, int64_t exponent,
+                                     uint64_t mantissa, FPRounding round_mode) {
+  int64_t bits =
+      FPRound<int64_t, kDoubleExponentBits, kDoubleMantissaBits>(sign,
+                                                                 exponent,
+                                                                 mantissa,
+                                                                 round_mode);
+  return rawbits_to_double(bits);
+}
+
+
+// See FPRound for a description of this function.
+static inline float FPRoundToFloat(int64_t sign, int64_t exponent,
+                                   uint64_t mantissa, FPRounding round_mode) {
+  int32_t bits =
+      FPRound<int32_t, kFloatExponentBits, kFloatMantissaBits>(sign,
+                                                               exponent,
+                                                               mantissa,
+                                                               round_mode);
+  return rawbits_to_float(bits);
+}
+
+
+// See FPRound for a description of this function.
+static inline float16 FPRoundToFloat16(int64_t sign,
+                                       int64_t exponent,
+                                       uint64_t mantissa,
+                                       FPRounding round_mode) {
+  return FPRound<float16, kFloat16ExponentBits, kFloat16MantissaBits>(
+      sign, exponent, mantissa, round_mode);
+}
+
+
+double Simulator::FixedToDouble(int64_t src, int fbits, FPRounding round) {
+  if (src >= 0) {
+    return UFixedToDouble(src, fbits, round);
+  } else {
+    // This works for all negative values, including INT64_MIN.
+    return -UFixedToDouble(-src, fbits, round);
+  }
+}
+
+
+double Simulator::UFixedToDouble(uint64_t src, int fbits, FPRounding round) {
+  // An input of 0 is a special case because the result is effectively
+  // subnormal: The exponent is encoded as 0 and there is no implicit 1 bit.
+  if (src == 0) {
+    return 0.0;
+  }
+
+  // Calculate the exponent. The highest significant bit will have the value
+  // 2^exponent.
+  const int highest_significant_bit = 63 - CountLeadingZeros(src);
+  const int64_t exponent = highest_significant_bit - fbits;
+
+  return FPRoundToDouble(0, exponent, src, round);
+}
+
+
+float Simulator::FixedToFloat(int64_t src, int fbits, FPRounding round) {
+  if (src >= 0) {
+    return UFixedToFloat(src, fbits, round);
+  } else {
+    // This works for all negative values, including INT64_MIN.
+    return -UFixedToFloat(-src, fbits, round);
+  }
+}
+
+
+float Simulator::UFixedToFloat(uint64_t src, int fbits, FPRounding round) {
+  // An input of 0 is a special case because the result is effectively
+  // subnormal: The exponent is encoded as 0 and there is no implicit 1 bit.
+  if (src == 0) {
+    return 0.0f;
+  }
+
+  // Calculate the exponent. The highest significant bit will have the value
+  // 2^exponent.
+  const int highest_significant_bit = 63 - CountLeadingZeros(src);
+  const int32_t exponent = highest_significant_bit - fbits;
+
+  return FPRoundToFloat(0, exponent, src, round);
+}
+
+
+double Simulator::FPToDouble(float value) {
+  switch (std::fpclassify(value)) {
+    case FP_NAN: {
+      if (IsSignallingNaN(value)) {
+        FPProcessException();
+      }
+      if (DN()) return kFP64DefaultNaN;
+
+      // Convert NaNs as the processor would:
+      //  - The sign is propagated.
+      //  - The payload (mantissa) is transferred entirely, except that the top
+      //    bit is forced to '1', making the result a quiet NaN. The unused
+      //    (low-order) payload bits are set to 0.
+      uint32_t raw = float_to_rawbits(value);
+
+      uint64_t sign = raw >> 31;
+      uint64_t exponent = (1 << 11) - 1;
+      uint64_t payload = unsigned_bitextract_64(21, 0, raw);
+      payload <<= (52 - 23);  // The unused low-order bits should be 0.
+      payload |= (UINT64_C(1) << 51);  // Force a quiet NaN.
+
+      return rawbits_to_double((sign << 63) | (exponent << 52) | payload);
+    }
+
+    case FP_ZERO:
+    case FP_NORMAL:
+    case FP_SUBNORMAL:
+    case FP_INFINITE: {
+      // All other inputs are preserved in a standard cast, because every value
+      // representable using an IEEE-754 float is also representable using an
+      // IEEE-754 double.
+      return static_cast<double>(value);
+    }
+  }
+
+  VIXL_UNREACHABLE();
+  return static_cast<double>(value);
+}
+
+
+float Simulator::FPToFloat(float16 value) {
+  uint32_t sign = value >> 15;
+  uint32_t exponent = unsigned_bitextract_32(
+      kFloat16MantissaBits + kFloat16ExponentBits - 1, kFloat16MantissaBits,
+      value);
+  uint32_t mantissa = unsigned_bitextract_32(
+      kFloat16MantissaBits - 1, 0, value);
+
+  switch (float16classify(value)) {
+    case FP_ZERO:
+      return (sign == 0) ? 0.0f : -0.0f;
+
+    case FP_INFINITE:
+      return (sign == 0) ? kFP32PositiveInfinity : kFP32NegativeInfinity;
+
+    case FP_SUBNORMAL: {
+      // Calculate shift required to put mantissa into the most-significant bits
+      // of the destination mantissa.
+      int shift = CountLeadingZeros(mantissa << (32 - 10));
+
+      // Shift mantissa and discard implicit '1'.
+      mantissa <<= (kFloatMantissaBits - kFloat16MantissaBits) + shift + 1;
+      mantissa &= (1 << kFloatMantissaBits) - 1;
+
+      // Adjust the exponent for the shift applied, and rebias.
+      exponent = exponent - shift + (-15 + 127);
+      break;
+    }
+
+    case FP_NAN:
+      if (IsSignallingNaN(value)) {
+        FPProcessException();
+      }
+      if (DN()) return kFP32DefaultNaN;
+
+      // Convert NaNs as the processor would:
+      //  - The sign is propagated.
+      //  - The payload (mantissa) is transferred entirely, except that the top
+      //    bit is forced to '1', making the result a quiet NaN. The unused
+      //    (low-order) payload bits are set to 0.
+      exponent = (1 << kFloatExponentBits) - 1;
+
+      // Increase bits in mantissa, making low-order bits 0.
+      mantissa <<= (kFloatMantissaBits - kFloat16MantissaBits);
+      mantissa |= 1 << 22;  // Force a quiet NaN.
+      break;
+
+    case FP_NORMAL:
+      // Increase bits in mantissa, making low-order bits 0.
+      mantissa <<= (kFloatMantissaBits - kFloat16MantissaBits);
+
+      // Change exponent bias.
+      exponent += (-15 + 127);
+      break;
+
+    default: VIXL_UNREACHABLE();
+  }
+  return rawbits_to_float((sign << 31) |
+                          (exponent << kFloatMantissaBits) |
+                          mantissa);
+}
+
+
+float16 Simulator::FPToFloat16(float value, FPRounding round_mode) {
+  // Only the FPTieEven rounding mode is implemented.
+  VIXL_ASSERT(round_mode == FPTieEven);
+  USE(round_mode);
+
+  uint32_t raw = float_to_rawbits(value);
+  int32_t sign = raw >> 31;
+  int32_t exponent = unsigned_bitextract_32(30, 23, raw) - 127;
+  uint32_t mantissa = unsigned_bitextract_32(22, 0, raw);
+
+  switch (std::fpclassify(value)) {
+    case FP_NAN: {
+      if (IsSignallingNaN(value)) {
+        FPProcessException();
+      }
+      if (DN()) return kFP16DefaultNaN;
+
+      // Convert NaNs as the processor would:
+      //  - The sign is propagated.
+      //  - The payload (mantissa) is transferred as much as possible, except
+      //    that the top bit is forced to '1', making the result a quiet NaN.
+      float16 result = (sign == 0) ? kFP16PositiveInfinity
+                                   : kFP16NegativeInfinity;
+      result |= mantissa >> (kFloatMantissaBits - kFloat16MantissaBits);
+      result |= (1 << 9);  // Force a quiet NaN;
+      return result;
+    }
+
+    case FP_ZERO:
+      return (sign == 0) ? 0 : 0x8000;
+
+    case FP_INFINITE:
+      return (sign == 0) ? kFP16PositiveInfinity : kFP16NegativeInfinity;
+
+    case FP_NORMAL:
+    case FP_SUBNORMAL: {
+      // Convert float-to-half as the processor would, assuming that FPCR.FZ
+      // (flush-to-zero) is not set.
+
+      // Add the implicit '1' bit to the mantissa.
+      mantissa += (1 << 23);
+      return FPRoundToFloat16(sign, exponent, mantissa, round_mode);
+    }
+  }
+
+  VIXL_UNREACHABLE();
+  return 0;
+}
+
+
+float16 Simulator::FPToFloat16(double value, FPRounding round_mode) {
+  // Only the FPTieEven rounding mode is implemented.
+  VIXL_ASSERT(round_mode == FPTieEven);
+  USE(round_mode);
+
+  uint64_t raw = double_to_rawbits(value);
+  int32_t sign = raw >> 63;
+  int64_t exponent = unsigned_bitextract_64(62, 52, raw) - 1023;
+  uint64_t mantissa = unsigned_bitextract_64(51, 0, raw);
+
+  switch (std::fpclassify(value)) {
+    case FP_NAN: {
+      if (IsSignallingNaN(value)) {
+        FPProcessException();
+      }
+      if (DN()) return kFP16DefaultNaN;
+
+      // Convert NaNs as the processor would:
+      //  - The sign is propagated.
+      //  - The payload (mantissa) is transferred as much as possible, except
+      //    that the top bit is forced to '1', making the result a quiet NaN.
+      float16 result = (sign == 0) ? kFP16PositiveInfinity
+                                   : kFP16NegativeInfinity;
+      result |= mantissa >> (kDoubleMantissaBits - kFloat16MantissaBits);
+      result |= (1 << 9);  // Force a quiet NaN;
+      return result;
+    }
+
+    case FP_ZERO:
+      return (sign == 0) ? 0 : 0x8000;
+
+    case FP_INFINITE:
+      return (sign == 0) ? kFP16PositiveInfinity : kFP16NegativeInfinity;
+
+    case FP_NORMAL:
+    case FP_SUBNORMAL: {
+      // Convert double-to-half as the processor would, assuming that FPCR.FZ
+      // (flush-to-zero) is not set.
+
+      // Add the implicit '1' bit to the mantissa.
+      mantissa += (UINT64_C(1) << 52);
+      return FPRoundToFloat16(sign, exponent, mantissa, round_mode);
+    }
+  }
+
+  VIXL_UNREACHABLE();
+  return 0;
+}
+
+
+float Simulator::FPToFloat(double value, FPRounding round_mode) {
+  // Only the FPTieEven rounding mode is implemented.
+  VIXL_ASSERT((round_mode == FPTieEven) || (round_mode == FPRoundOdd));
+  USE(round_mode);
+
+  switch (std::fpclassify(value)) {
+    case FP_NAN: {
+      if (IsSignallingNaN(value)) {
+        FPProcessException();
+      }
+      if (DN()) return kFP32DefaultNaN;
+
+      // Convert NaNs as the processor would:
+      //  - The sign is propagated.
+      //  - The payload (mantissa) is transferred as much as possible, except
+      //    that the top bit is forced to '1', making the result a quiet NaN.
+      uint64_t raw = double_to_rawbits(value);
+
+      uint32_t sign = raw >> 63;
+      uint32_t exponent = (1 << 8) - 1;
+      uint32_t payload = unsigned_bitextract_64(50, 52 - 23, raw);
+      payload |= (1 << 22);   // Force a quiet NaN.
+
+      return rawbits_to_float((sign << 31) | (exponent << 23) | payload);
+    }
+
+    case FP_ZERO:
+    case FP_INFINITE: {
+      // In a C++ cast, any value representable in the target type will be
+      // unchanged. This is always the case for +/-0.0 and infinities.
+      return static_cast<float>(value);
+    }
+
+    case FP_NORMAL:
+    case FP_SUBNORMAL: {
+      // Convert double-to-float as the processor would, assuming that FPCR.FZ
+      // (flush-to-zero) is not set.
+      uint64_t raw = double_to_rawbits(value);
+      // Extract the IEEE-754 double components.
+      uint32_t sign = raw >> 63;
+      // Extract the exponent and remove the IEEE-754 encoding bias.
+      int32_t exponent = unsigned_bitextract_64(62, 52, raw) - 1023;
+      // Extract the mantissa and add the implicit '1' bit.
+      uint64_t mantissa = unsigned_bitextract_64(51, 0, raw);
+      if (std::fpclassify(value) == FP_NORMAL) {
+        mantissa |= (UINT64_C(1) << 52);
+      }
+      return FPRoundToFloat(sign, exponent, mantissa, round_mode);
+    }
+  }
+
+  VIXL_UNREACHABLE();
+  return value;
+}
+
+
 void Simulator::ld1(VectorFormat vform,
                     LogicVRegister dst,
                     uint64_t addr) {
@@ -1524,7 +1880,7 @@
     int64_t lj_src_val = src1.IntLeftJustified(vform, i);
 
     // Set signed saturation state.
-    if ((shift_val > CountLeadingSignBits(lj_src_val, 64)) &&
+    if ((shift_val > CountLeadingSignBits(lj_src_val)) &&
         (lj_src_val != 0)) {
       dst.SetSignedSat(i, lj_src_val >= 0);
     }
@@ -1532,7 +1888,7 @@
     // Set unsigned saturation state.
     if (lj_src_val < 0) {
       dst.SetUnsignedSat(i, false);
-    } else if ((shift_val > CountLeadingZeros(lj_src_val, 64)) &&
+    } else if ((shift_val > CountLeadingZeros(lj_src_val)) &&
                (lj_src_val != 0)) {
       dst.SetUnsignedSat(i, true);
     }
@@ -1570,7 +1926,7 @@
     uint64_t lj_src_val = src1.UintLeftJustified(vform, i);
 
     // Set saturation state.
-    if ((shift_val > CountLeadingZeros(lj_src_val, 64)) && (lj_src_val != 0)) {
+    if ((shift_val > CountLeadingZeros(lj_src_val)) && (lj_src_val != 0)) {
       dst.SetUnsignedSat(i, true);
     }
 
@@ -3153,9 +3509,9 @@
 template <typename T>
 T Simulator::FPAdd(T op1, T op2) {
   T result = FPProcessNaNs(op1, op2);
-  if (isnan(result)) return result;
+  if (std::isnan(result)) return result;
 
-  if (isinf(op1) && isinf(op2) && (op1 != op2)) {
+  if (std::isinf(op1) && std::isinf(op2) && (op1 != op2)) {
     // inf + -inf returns the default NaN.
     FPProcessException();
     return FPDefaultNaN<T>();
@@ -3169,9 +3525,9 @@
 template <typename T>
 T Simulator::FPSub(T op1, T op2) {
   // NaNs should be handled elsewhere.
-  VIXL_ASSERT(!isnan(op1) && !isnan(op2));
+  VIXL_ASSERT(!std::isnan(op1) && !std::isnan(op2));
 
-  if (isinf(op1) && isinf(op2) && (op1 == op2)) {
+  if (std::isinf(op1) && std::isinf(op2) && (op1 == op2)) {
     // inf - inf returns the default NaN.
     FPProcessException();
     return FPDefaultNaN<T>();
@@ -3185,9 +3541,9 @@
 template <typename T>
 T Simulator::FPMul(T op1, T op2) {
   // NaNs should be handled elsewhere.
-  VIXL_ASSERT(!isnan(op1) && !isnan(op2));
+  VIXL_ASSERT(!std::isnan(op1) && !std::isnan(op2));
 
-  if ((isinf(op1) && (op2 == 0.0)) || (isinf(op2) && (op1 == 0.0))) {
+  if ((std::isinf(op1) && (op2 == 0.0)) || (std::isinf(op2) && (op1 == 0.0))) {
     // inf * 0.0 returns the default NaN.
     FPProcessException();
     return FPDefaultNaN<T>();
@@ -3200,7 +3556,7 @@
 
 template<typename T>
 T Simulator::FPMulx(T op1, T op2) {
-  if ((isinf(op1) && (op2 == 0.0)) || (isinf(op2) && (op1 == 0.0))) {
+  if ((std::isinf(op1) && (op2 == 0.0)) || (std::isinf(op2) && (op1 == 0.0))) {
     // inf * 0.0 returns +/-2.0.
     T two = 2.0;
     return copysign(1.0, op1) * copysign(1.0, op2) * two;
@@ -3215,13 +3571,13 @@
 
   T sign_a = copysign(1.0, a);
   T sign_prod = copysign(1.0, op1) * copysign(1.0, op2);
-  bool isinf_prod = isinf(op1) || isinf(op2);
+  bool isinf_prod = std::isinf(op1) || std::isinf(op2);
   bool operation_generates_nan =
-      (isinf(op1) && (op2 == 0.0)) ||                     // inf * 0.0
-      (isinf(op2) && (op1 == 0.0)) ||                     // 0.0 * inf
-      (isinf(a) && isinf_prod && (sign_a != sign_prod));  // inf - inf
+      (std::isinf(op1) && (op2 == 0.0)) ||                     // inf * 0.0
+      (std::isinf(op2) && (op1 == 0.0)) ||                     // 0.0 * inf
+      (std::isinf(a) && isinf_prod && (sign_a != sign_prod));  // inf - inf
 
-  if (isnan(result)) {
+  if (std::isnan(result)) {
     // Generated NaNs override quiet NaNs propagated from a.
     if (operation_generates_nan && IsQuietNaN(a)) {
       FPProcessException();
@@ -3244,7 +3600,7 @@
   }
 
   result = FusedMultiplyAdd(op1, op2, a);
-  VIXL_ASSERT(!isnan(result));
+  VIXL_ASSERT(!std::isnan(result));
 
   // Work around broken fma implementations for rounded zero results: If a is
   // 0.0, the sign of the result is the sign of op1 * op2 before rounding.
@@ -3259,9 +3615,9 @@
 template <typename T>
 T Simulator::FPDiv(T op1, T op2) {
   // NaNs should be handled elsewhere.
-  VIXL_ASSERT(!isnan(op1) && !isnan(op2));
+  VIXL_ASSERT(!std::isnan(op1) && !std::isnan(op2));
 
-  if ((isinf(op1) && isinf(op2)) || ((op1 == 0.0) && (op2 == 0.0))) {
+  if ((std::isinf(op1) && std::isinf(op2)) || ((op1 == 0.0) && (op2 == 0.0))) {
     // inf / inf and 0.0 / 0.0 return the default NaN.
     FPProcessException();
     return FPDefaultNaN<T>();
@@ -3276,7 +3632,7 @@
 
 template <typename T>
 T Simulator::FPSqrt(T op) {
-  if (isnan(op)) {
+  if (std::isnan(op)) {
     return FPProcessNaN(op);
   } else if (op < 0.0) {
     FPProcessException();
@@ -3290,7 +3646,7 @@
 template <typename T>
 T Simulator::FPMax(T a, T b) {
   T result = FPProcessNaNs(a, b);
-  if (isnan(result)) return result;
+  if (std::isnan(result)) return result;
 
   if ((a == 0.0) && (b == 0.0) &&
       (copysign(1.0, a) != copysign(1.0, b))) {
@@ -3311,14 +3667,14 @@
   }
 
   T result = FPProcessNaNs(a, b);
-  return isnan(result) ? result : FPMax(a, b);
+  return std::isnan(result) ? result : FPMax(a, b);
 }
 
 
 template <typename T>
 T Simulator::FPMin(T a, T b) {
   T result = FPProcessNaNs(a, b);
-  if (isnan(result)) return result;
+  if (std::isnan(result)) return result;
 
   if ((a == 0.0) && (b == 0.0) &&
       (copysign(1.0, a) != copysign(1.0, b))) {
@@ -3339,16 +3695,17 @@
   }
 
   T result = FPProcessNaNs(a, b);
-  return isnan(result) ? result : FPMin(a, b);
+  return std::isnan(result) ? result : FPMin(a, b);
 }
 
 
 template <typename T>
 T Simulator::FPRecipStepFused(T op1, T op2) {
   const T two = 2.0;
-  if ((isinf(op1) && (op2 == 0.0)) || ((op1 == 0.0) && (isinf(op2)))) {
+  if ((std::isinf(op1) && (op2 == 0.0))
+      || ((op1 == 0.0) && (std::isinf(op2)))) {
     return two;
-  } else if (isinf(op1) || isinf(op2)) {
+  } else if (std::isinf(op1) || std::isinf(op2)) {
     // Return +inf if signs match, otherwise -inf.
     return ((op1 >= 0.0) == (op2 >= 0.0)) ? kFP64PositiveInfinity
                                           : kFP64NegativeInfinity;
@@ -3363,9 +3720,10 @@
   const T one_point_five = 1.5;
   const T two = 2.0;
 
-  if ((isinf(op1) && (op2 == 0.0)) || ((op1 == 0.0) && (isinf(op2)))) {
+  if ((std::isinf(op1) && (op2 == 0.0))
+      || ((op1 == 0.0) && (std::isinf(op2)))) {
     return one_point_five;
-  } else if (isinf(op1) || isinf(op2)) {
+  } else if (std::isinf(op1) || std::isinf(op2)) {
     // Return +inf if signs match, otherwise -inf.
     return ((op1 >= 0.0) == (op2 >= 0.0)) ? kFP64PositiveInfinity
                                           : kFP64NegativeInfinity;
@@ -3373,9 +3731,9 @@
     // The multiply-add-halve operation must be fully fused, so avoid interim
     // rounding by checking which operand can be losslessly divided by two
     // before doing the multiply-add.
-    if (isnormal(op1 / two)) {
+    if (std::isnormal(op1 / two)) {
       return FusedMultiplyAdd(op1 / two, op2, one_point_five);
-    } else if (isnormal(op2 / two)) {
+    } else if (std::isnormal(op2 / two)) {
       return FusedMultiplyAdd(op1, op2 / two, one_point_five);
     } else {
       // Neither operand is normal after halving: the result is dominated by
@@ -3390,11 +3748,11 @@
   if ((value == 0.0) || (value == kFP64PositiveInfinity) ||
       (value == kFP64NegativeInfinity)) {
     return value;
-  } else if (isnan(value)) {
+  } else if (std::isnan(value)) {
     return FPProcessNaN(value);
   }
 
-  double int_result = floor(value);
+  double int_result = std::floor(value);
   double error = value - int_result;
   switch (round_mode) {
     case FPTieAway: {
@@ -3419,7 +3777,7 @@
       // If the error is greater than 0.5, or is equal to 0.5 and the integer
       // result is odd, round up.
       } else if ((error > 0.5) ||
-          ((error == 0.5) && (fmod(int_result, 2) != 0))) {
+          ((error == 0.5) && (std::fmod(int_result, 2) != 0))) {
         int_result++;
       }
       break;
@@ -3461,7 +3819,7 @@
   } else if (value < kWMinInt) {
     return kWMinInt;
   }
-  return isnan(value) ? 0 : static_cast<int32_t>(value);
+  return std::isnan(value) ? 0 : static_cast<int32_t>(value);
 }
 
 
@@ -3472,7 +3830,7 @@
   } else if (value < kXMinInt) {
     return kXMinInt;
   }
-  return isnan(value) ? 0 : static_cast<int64_t>(value);
+  return std::isnan(value) ? 0 : static_cast<int64_t>(value);
 }
 
 
@@ -3483,7 +3841,7 @@
   } else if (value < 0.0) {
     return 0;
   }
-  return isnan(value) ? 0 : static_cast<uint32_t>(value);
+  return std::isnan(value) ? 0 : static_cast<uint32_t>(value);
 }
 
 
@@ -3494,7 +3852,7 @@
   } else if (value < 0.0) {
     return 0;
   }
-  return isnan(value) ? 0 : static_cast<uint64_t>(value);
+  return std::isnan(value) ? 0 : static_cast<uint64_t>(value);
 }
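
The four conversions above saturate instead of relying on a raw cast: values beyond the destination range clamp to its limits and NaN maps to 0. A sketch of the 32-bit signed case (the thresholds are written out here; the simulator rounds first and uses its own kWMaxInt/kWMinInt constants):

    #include <cassert>
    #include <cmath>
    #include <cstdint>

    // A plain C++ cast has undefined behaviour for out-of-range or NaN
    // inputs; the A64 conversions clamp and map NaN to zero instead.
    int32_t ToInt32Saturating(double value) {
      if (value >= 2147483648.0) return INT32_MAX;   // >= 2^31 clamps high.
      if (value < -2147483648.0) return INT32_MIN;   // < -2^31 clamps low.
      return std::isnan(value) ? 0 : static_cast<int32_t>(value);
    }

    int main() {
      assert(ToInt32Saturating(1e12) == INT32_MAX);
      assert(ToInt32Saturating(-1e12) == INT32_MIN);
      assert(ToInt32Saturating(NAN) == 0);
      return 0;
    }
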
 
 
@@ -3511,7 +3869,7 @@
     T result;                                                    \
     if (PROCNAN) {                                               \
       result = FPProcessNaNs(op1, op2);                          \
-      if (!isnan(result)) {                                      \
+      if (!std::isnan(result)) {                                 \
         result = OP(op1, op2);                                   \
       }                                                          \
     } else {                                                     \
@@ -3558,7 +3916,7 @@
     T op1 = -src1.Float<T>(i);
     T op2 = src2.Float<T>(i);
     T result = FPProcessNaNs(op1, op2);
-    dst.SetFloat(i, isnan(result) ? result : FPRecipStepFused(op1, op2));
+    dst.SetFloat(i, std::isnan(result) ? result : FPRecipStepFused(op1, op2));
   }
   return dst;
 }
@@ -3588,7 +3946,7 @@
     T op1 = -src1.Float<T>(i);
     T op2 = src2.Float<T>(i);
     T result = FPProcessNaNs(op1, op2);
-    dst.SetFloat(i, isnan(result) ? result : FPRSqrtStepFused(op1, op2));
+    dst.SetFloat(i, std::isnan(result) ? result : FPRSqrtStepFused(op1, op2));
   }
   return dst;
 }
@@ -3620,7 +3978,7 @@
     T op1 = src1.Float<T>(i);
     T op2 = src2.Float<T>(i);
     T nan_result = FPProcessNaNs(op1, op2);
-    if (!isnan(nan_result)) {
+    if (!std::isnan(nan_result)) {
       switch (cond) {
         case eq: result = (op1 == op2); break;
         case ge: result = (op1 >= op2); break;
@@ -4001,7 +4359,7 @@
     for (int i = 0; i < LaneCountFromFormat(vform); i++) {
       float input = src.Float<float>(i);
       float rounded = FPRoundInt(input, rounding_mode);
-      if (inexact_exception && !isnan(input) && (input != rounded)) {
+      if (inexact_exception && !std::isnan(input) && (input != rounded)) {
         FPProcessException();
       }
       dst.SetFloat<float>(i, rounded);
@@ -4011,7 +4369,7 @@
     for (int i = 0; i < LaneCountFromFormat(vform); i++) {
       double input = src.Float<double>(i);
       double rounded = FPRoundInt(input, rounding_mode);
-      if (inexact_exception && !isnan(input) && (input != rounded)) {
+      if (inexact_exception && !std::isnan(input) && (input != rounded)) {
         FPProcessException();
       }
       dst.SetFloat<double>(i, rounded);
@@ -4029,13 +4387,13 @@
   dst.ClearForWrite(vform);
   if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
     for (int i = 0; i < LaneCountFromFormat(vform); i++) {
-      float op = src.Float<float>(i) * powf(2.0f, fbits);
+      float op = src.Float<float>(i) * std::pow(2.0f, fbits);
       dst.SetInt(vform, i, FPToInt32(op, rounding_mode));
     }
   } else {
     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
     for (int i = 0; i < LaneCountFromFormat(vform); i++) {
-      double op = src.Float<double>(i) * pow(2.0, fbits);
+      double op = src.Float<double>(i) * std::pow(2.0, fbits);
       dst.SetInt(vform, i, FPToInt64(op, rounding_mode));
     }
   }
@@ -4051,13 +4409,13 @@
   dst.ClearForWrite(vform);
   if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
     for (int i = 0; i < LaneCountFromFormat(vform); i++) {
-      float op = src.Float<float>(i) * powf(2.0f, fbits);
+      float op = src.Float<float>(i) * std::pow(2.0f, fbits);
       dst.SetUint(vform, i, FPToUInt32(op, rounding_mode));
     }
   } else {
     VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kDRegSize);
     for (int i = 0; i < LaneCountFromFormat(vform); i++) {
-      double op = src.Float<double>(i) * pow(2.0, fbits);
+      double op = src.Float<double>(i) * std::pow(2.0, fbits);
       dst.SetUint(vform, i, FPToUInt64(op, rounding_mode));
     }
   }
@@ -4182,7 +4540,7 @@
 
 template <typename T>
 T Simulator::FPRecipSqrtEstimate(T op) {
-  if (isnan(op)) {
+  if (std::isnan(op)) {
     return FPProcessNaN(op);
   } else if (op == 0.0) {
     if (copysign(1.0, op) < 0.0) {
@@ -4193,7 +4551,7 @@
   } else if (copysign(1.0, op) < 0.0) {
     FPProcessException();
     return FPDefaultNaN<T>();
-  } else if (isinf(op)) {
+  } else if (std::isinf(op)) {
     return 0.0;
   } else {
     uint64_t fraction;
@@ -4271,17 +4629,17 @@
     sign = double_sign(op);
   }
 
-  if (isnan(op)) {
+  if (std::isnan(op)) {
     return FPProcessNaN(op);
-  } else if (isinf(op)) {
+  } else if (std::isinf(op)) {
     return (sign == 1) ? -0.0 : 0.0;
   } else if (op == 0.0) {
     FPProcessException();  // FPExc_DivideByZero exception.
     return (sign == 1) ? kFP64NegativeInfinity : kFP64PositiveInfinity;
   } else if (((sizeof(T) == sizeof(float)) &&  // NOLINT(runtime/sizeof)
-              (fabsf(op) < pow(2.0, -128))) ||
+              (std::fabs(op) < std::pow(2.0, -128.0))) ||
              ((sizeof(T) == sizeof(double)) &&  // NOLINT(runtime/sizeof)
-              (fabs(op) < pow(2.0, -1024)))) {
+              (std::fabs(op) < std::pow(2.0, -1024.0)))) {
     bool overflow_to_inf = false;
     switch (rounding) {
       case FPTieEven: overflow_to_inf = true; break;
@@ -4338,9 +4696,9 @@
 
     fraction = double_mantissa(estimate);
     if (result_exp == 0) {
-      fraction = (1L << 51) | Bits(fraction, 51, 1);
+      fraction = (UINT64_C(1) << 51) | Bits(fraction, 51, 1);
     } else if (result_exp == -1) {
-      fraction = (1L << 50) | Bits(fraction, 51, 2);
+      fraction = (UINT64_C(1) << 50) | Bits(fraction, 51, 2);
       result_exp = 0;
     }
     if (sizeof(T) == sizeof(float)) {  // NOLINT(runtime/sizeof)
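
The shifts above now use UINT64_C(1) instead of 1L, so the constant is unsigned and 64 bits wide by definition rather than by the host's data model. For example (the constant name is illustrative only):

    #include <cassert>
    #include <cstdint>

    // UINT64_C(1) is an unsigned 64-bit constant everywhere, so shifting it
    // by 51 is always well defined; (1L << 51) is only safe where long is
    // 64 bits wide, and yields a signed value besides.
    const uint64_t kBit51 = UINT64_C(1) << 51;

    int main() {
      assert(kBit51 == 0x0008000000000000);
      return 0;
    }
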
@@ -4384,8 +4742,8 @@
     if (operand <= 0x3FFFFFFF) {
       result = 0xFFFFFFFF;
     } else {
-      dp_operand = operand * pow(2.0, -32);
-      dp_result = recip_sqrt_estimate(dp_operand) * pow(2.0, 31);
+      dp_operand = operand * std::pow(2.0, -32);
+      dp_result = recip_sqrt_estimate(dp_operand) * std::pow(2.0, 31);
       result = static_cast<uint32_t>(dp_result);
     }
     dst.SetUint(vform, i, result);
@@ -4416,8 +4774,8 @@
     if (operand <= 0x7FFFFFFF) {
       result = 0xFFFFFFFF;
     } else {
-      dp_operand = operand * pow(2.0, -32);
-      dp_result = recip_estimate(dp_operand) * pow(2.0, 31);
+      dp_operand = operand * std::pow(2.0, -32);
+      dp_result = recip_estimate(dp_operand) * std::pow(2.0, 31);
       result = static_cast<uint32_t>(dp_result);
     }
     dst.SetUint(vform, i, result);
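
The unsigned reciprocal estimates above work by re-framing the 32-bit input as a fixed-point fraction, running the estimate in double precision, and re-encoding the result. A sketch of that framing for the reciprocal case, with an exact 1/x standing in for the hardware's low-precision estimate (which stays strictly below 2.0, keeping the re-encoded value in range):

    #include <cassert>
    #include <cmath>
    #include <cstdint>

    double Reciprocal(double x) { return 1.0 / x; }  // Stand-in estimate.

    // Scale the input down by 2^-32 so it reads as a fraction in [0.5, 1.0),
    // estimate its reciprocal, then re-encode with a scale of 2^31.
    uint32_t UnsignedRecipEstimate(uint32_t operand) {
      assert(operand > 0x7FFFFFFF);  // Smaller inputs saturate to 0xFFFFFFFF.
      double dp_operand = operand * std::pow(2.0, -32);
      double dp_result = Reciprocal(dp_operand) * std::pow(2.0, 31);
      return static_cast<uint32_t>(dp_result);
    }

    int main() {
      // 0xC0000000 encodes 0.75; 1 / 0.75 encodes back as 0xAAAAAAAA.
      assert(UnsignedRecipEstimate(0xC0000000) == 0xAAAAAAAA);
      return 0;
    }
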
@@ -4433,7 +4791,7 @@
   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
     T op = src.Float<T>(i);
     T result;
-    if (isnan(op)) {
+    if (std::isnan(op)) {
        result = FPProcessNaN(op);
     } else {
       int exp;
diff --git a/src/a64/macro-assembler-a64.cc b/src/vixl/a64/macro-assembler-a64.cc
similarity index 93%
rename from src/a64/macro-assembler-a64.cc
rename to src/vixl/a64/macro-assembler-a64.cc
index 41b571a..49218b4 100644
--- a/src/a64/macro-assembler-a64.cc
+++ b/src/vixl/a64/macro-assembler-a64.cc
@@ -24,7 +24,7 @@
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-#include "a64/macro-assembler-a64.h"
+#include "vixl/a64/macro-assembler-a64.h"
 
 namespace vixl {
 
@@ -43,8 +43,8 @@
 }
 
 
-LiteralPool::LiteralPool(MacroAssembler* masm) :
-    Pool(masm), size_(0), first_use_(-1) {
+LiteralPool::LiteralPool(MacroAssembler* masm)
+  : Pool(masm), size_(0), first_use_(-1) {
 }
 
 
@@ -718,11 +718,13 @@
         case AND:
           Mov(rd, 0);
           return;
-        case ORR:  // Fall through.
+        case ORR:
+          VIXL_FALLTHROUGH();
         case EOR:
           Mov(rd, rn);
           return;
-        case ANDS:  // Fall through.
+        case ANDS:
+          VIXL_FALLTHROUGH();
         case BICS:
           break;
         default:
@@ -740,7 +742,8 @@
         case EOR:
           Mvn(rd, rn);
           return;
-        case ANDS:  // Fall through.
+        case ANDS:
+          VIXL_FALLTHROUGH();
         case BICS:
           break;
         default:
@@ -1131,13 +1134,14 @@
 
 void MacroAssembler::Add(const Register& rd,
                          const Register& rn,
-                         const Operand& operand) {
+                         const Operand& operand,
+                         FlagsUpdate S) {
   VIXL_ASSERT(allow_macro_instructions_);
   if (operand.IsImmediate() && (operand.immediate() < 0) &&
       IsImmAddSub(-operand.immediate())) {
-    AddSubMacro(rd, rn, -operand.immediate(), LeaveFlags, SUB);
+    AddSubMacro(rd, rn, -operand.immediate(), S, SUB);
   } else {
-    AddSubMacro(rd, rn, operand, LeaveFlags, ADD);
+    AddSubMacro(rd, rn, operand, S, ADD);
   }
 }
 
@@ -1145,25 +1149,20 @@
 void MacroAssembler::Adds(const Register& rd,
                           const Register& rn,
                           const Operand& operand) {
-  VIXL_ASSERT(allow_macro_instructions_);
-  if (operand.IsImmediate() && (operand.immediate() < 0) &&
-      IsImmAddSub(-operand.immediate())) {
-    AddSubMacro(rd, rn, -operand.immediate(), SetFlags, SUB);
-  } else {
-    AddSubMacro(rd, rn, operand, SetFlags, ADD);
-  }
+  Add(rd, rn, operand, SetFlags);
 }
 
 
 void MacroAssembler::Sub(const Register& rd,
                          const Register& rn,
-                         const Operand& operand) {
+                         const Operand& operand,
+                         FlagsUpdate S) {
   VIXL_ASSERT(allow_macro_instructions_);
   if (operand.IsImmediate() && (operand.immediate() < 0) &&
       IsImmAddSub(-operand.immediate())) {
-    AddSubMacro(rd, rn, -operand.immediate(), LeaveFlags, ADD);
+    AddSubMacro(rd, rn, -operand.immediate(), S, ADD);
   } else {
-    AddSubMacro(rd, rn, operand, LeaveFlags, SUB);
+    AddSubMacro(rd, rn, operand, S, SUB);
   }
 }
 
@@ -1171,13 +1170,7 @@
 void MacroAssembler::Subs(const Register& rd,
                           const Register& rn,
                           const Operand& operand) {
-  VIXL_ASSERT(allow_macro_instructions_);
-  if (operand.IsImmediate() && (operand.immediate() < 0) &&
-      IsImmAddSub(-operand.immediate())) {
-    AddSubMacro(rd, rn, -operand.immediate(), SetFlags, ADD);
-  } else {
-    AddSubMacro(rd, rn, operand, SetFlags, SUB);
-  }
+  Sub(rd, rn, operand, SetFlags);
 }
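
Adds and Subs are now thin wrappers over Add and Sub with an explicit FlagsUpdate argument, so the negative-immediate rewrite lives in one place: an add of -imm is emitted as a sub of imm, and vice versa, because A64 add/sub immediates are unsigned 12-bit values (optionally shifted). A standalone model of that folding (the real macro only folds after checking the negated value with IsImmAddSub):

    #include <cassert>
    #include <cstdint>

    enum Op { kAdd, kSub };

    // Fold a negative add immediate into a subtract of its magnitude.
    Op FoldAddImmediate(int64_t* imm) {
      if (*imm < 0) {
        *imm = -*imm;
        return kSub;
      }
      return kAdd;
    }

    int main() {
      int64_t imm = -1;
      assert(FoldAddImmediate(&imm) == kSub);  // add rd, rn, #-1 -> sub #1.
      assert(imm == 1);
      return 0;
    }
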
 
 
@@ -1193,23 +1186,29 @@
 }
 
 
-void MacroAssembler::Fcmp(const FPRegister& fn, double value) {
+void MacroAssembler::Fcmp(const FPRegister& fn, double value,
+                          FPTrapFlags trap) {
   VIXL_ASSERT(allow_macro_instructions_);
   // The worst case for size is:
   //  * 1 to materialise the constant, using literal pool if necessary
-  //  * 1 instruction for fcmp
+  //  * 1 instruction for fcmp{e}
   MacroEmissionCheckScope guard(this);
   if (value != 0.0) {
     UseScratchRegisterScope temps(this);
     FPRegister tmp = temps.AcquireSameSizeAs(fn);
     Fmov(tmp, value);
-    fcmp(fn, tmp);
+    FPCompareMacro(fn, tmp, trap);
   } else {
-    fcmp(fn, value);
+    FPCompareMacro(fn, value, trap);
   }
 }
 
 
+void MacroAssembler::Fcmpe(const FPRegister& fn, double value) {
+  Fcmp(fn, value, EnableTrap);
+}
+
+
 void MacroAssembler::Fmov(VRegister vd, double imm) {
   VIXL_ASSERT(allow_macro_instructions_);
   // Floating point immediates are loaded through the literal pool.
@@ -1637,41 +1636,67 @@
 
 
 void MacroAssembler::PushCPURegList(CPURegList registers) {
-  int size = registers.RegisterSizeInBytes();
-
-  PrepareForPush(registers.Count(), size);
-  // Push up to four registers at a time because if the current stack pointer is
-  // sp and reg_size is 32, registers must be pushed in blocks of four in order
-  // to maintain the 16-byte alignment for sp.
+  VIXL_ASSERT(!registers.Overlaps(*TmpList()));
+  VIXL_ASSERT(!registers.Overlaps(*FPTmpList()));
   VIXL_ASSERT(allow_macro_instructions_);
+
+  int reg_size = registers.RegisterSizeInBytes();
+  PrepareForPush(registers.Count(), reg_size);
+
+  // Bump the stack pointer and store two registers at the bottom.
+  int size = registers.TotalSizeInBytes();
+  const CPURegister& bottom_0 = registers.PopLowestIndex();
+  const CPURegister& bottom_1 = registers.PopLowestIndex();
+  if (bottom_0.IsValid() && bottom_1.IsValid()) {
+    Stp(bottom_0, bottom_1, MemOperand(StackPointer(), -size, PreIndex));
+  } else if (bottom_0.IsValid()) {
+    Str(bottom_0, MemOperand(StackPointer(), -size, PreIndex));
+  }
+
+  int offset = 2 * reg_size;
   while (!registers.IsEmpty()) {
-    int count_before = registers.Count();
-    const CPURegister& src0 = registers.PopHighestIndex();
-    const CPURegister& src1 = registers.PopHighestIndex();
-    const CPURegister& src2 = registers.PopHighestIndex();
-    const CPURegister& src3 = registers.PopHighestIndex();
-    int count = count_before - registers.Count();
-    PushHelper(count, size, src0, src1, src2, src3);
+    const CPURegister& src0 = registers.PopLowestIndex();
+    const CPURegister& src1 = registers.PopLowestIndex();
+    if (src1.IsValid()) {
+      Stp(src0, src1, MemOperand(StackPointer(), offset));
+    } else {
+      Str(src0, MemOperand(StackPointer(), offset));
+    }
+    offset += 2 * reg_size;
   }
 }
 
 
 void MacroAssembler::PopCPURegList(CPURegList registers) {
-  int size = registers.RegisterSizeInBytes();
-
-  PrepareForPop(registers.Count(), size);
-  // Pop up to four registers at a time because if the current stack pointer is
-  // sp and reg_size is 32, registers must be pushed in blocks of four in order
-  // to maintain the 16-byte alignment for sp.
+  VIXL_ASSERT(!registers.Overlaps(*TmpList()));
+  VIXL_ASSERT(!registers.Overlaps(*FPTmpList()));
   VIXL_ASSERT(allow_macro_instructions_);
+
+  int reg_size = registers.RegisterSizeInBytes();
+  PrepareForPop(registers.Count(), reg_size);
+
+
+  int size = registers.TotalSizeInBytes();
+  const CPURegister& bottom_0 = registers.PopLowestIndex();
+  const CPURegister& bottom_1 = registers.PopLowestIndex();
+
+  int offset = 2 * reg_size;
   while (!registers.IsEmpty()) {
-    int count_before = registers.Count();
     const CPURegister& dst0 = registers.PopLowestIndex();
     const CPURegister& dst1 = registers.PopLowestIndex();
-    const CPURegister& dst2 = registers.PopLowestIndex();
-    const CPURegister& dst3 = registers.PopLowestIndex();
-    int count = count_before - registers.Count();
-    PopHelper(count, size, dst0, dst1, dst2, dst3);
+    if (dst1.IsValid()) {
+      Ldp(dst0, dst1, MemOperand(StackPointer(), offset));
+    } else {
+      Ldr(dst0, MemOperand(StackPointer(), offset));
+    }
+    offset += 2 * reg_size;
+  }
+
+  // Load the two registers at the bottom and drop the stack pointer.
+  if (bottom_0.IsValid() && bottom_1.IsValid()) {
+    Ldp(bottom_0, bottom_1, MemOperand(StackPointer(), size, PostIndex));
+  } else if (bottom_0.IsValid()) {
+    Ldr(bottom_0, MemOperand(StackPointer(), size, PostIndex));
   }
 }
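
PushCPURegList now claims the whole area with a single pre-indexed store pair and fills the rest with stp/str at positive offsets, and PopCPURegList mirrors it, so the stack pointer is written exactly once in each direction instead of once per block of registers. A hypothetical driver, assuming a MacroAssembler set up elsewhere (register choices and the expected expansion are illustrative):

    #include "vixl/a64/macro-assembler-a64.h"

    using namespace vixl;

    // Pushing four X registers is expected to expand to roughly:
    //   stp x0, x1, [sp, #-32]!   // Claim the whole area in one step.
    //   stp x2, x3, [sp, #16]     // Fill the remaining slots.
    // and the matching pop to the reverse:
    //   ldp x2, x3, [sp, #16]
    //   ldp x0, x1, [sp], #32
    void GeneratePushPop(MacroAssembler* masm) {
      CPURegList regs(x0, x1, x2, x3);
      masm->PushCPURegList(regs);
      // ... code that uses the saved values on the stack ...
      masm->PopCPURegList(regs);
    }
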
 
@@ -1831,42 +1856,6 @@
 }
 
 
-void MacroAssembler::PeekCPURegList(CPURegList registers, int offset) {
-  VIXL_ASSERT(!registers.IncludesAliasOf(StackPointer()));
-  VIXL_ASSERT(offset >= 0);
-  int size = registers.RegisterSizeInBytes();
-
-  while (registers.Count() >= 2) {
-    const CPURegister& dst0 = registers.PopLowestIndex();
-    const CPURegister& dst1 = registers.PopLowestIndex();
-    Ldp(dst0, dst1, MemOperand(StackPointer(), offset));
-    offset += 2 * size;
-  }
-  if (!registers.IsEmpty()) {
-    Ldr(registers.PopLowestIndex(),
-        MemOperand(StackPointer(), offset));
-  }
-}
-
-
-void MacroAssembler::PokeCPURegList(CPURegList registers, int offset) {
-  VIXL_ASSERT(!registers.IncludesAliasOf(StackPointer()));
-  VIXL_ASSERT(offset >= 0);
-  int size = registers.RegisterSizeInBytes();
-
-  while (registers.Count() >= 2) {
-    const CPURegister& dst0 = registers.PopLowestIndex();
-    const CPURegister& dst1 = registers.PopLowestIndex();
-    Stp(dst0, dst1, MemOperand(StackPointer(), offset));
-    offset += 2 * size;
-  }
-  if (!registers.IsEmpty()) {
-    Str(registers.PopLowestIndex(),
-        MemOperand(StackPointer(), offset));
-  }
-}
-
-
 void MacroAssembler::Claim(const Operand& size) {
   VIXL_ASSERT(allow_macro_instructions_);
 
@@ -1956,6 +1945,80 @@
   ldp(x29, x30, tos);
 }
 
+void MacroAssembler::LoadCPURegList(CPURegList registers,
+                                    const MemOperand& src) {
+  LoadStoreCPURegListHelper(kLoad, registers, src);
+}
+
+void MacroAssembler::StoreCPURegList(CPURegList registers,
+                                     const MemOperand& dst) {
+  LoadStoreCPURegListHelper(kStore, registers, dst);
+}
+
+
+void MacroAssembler::LoadStoreCPURegListHelper(LoadStoreCPURegListAction op,
+                                               CPURegList registers,
+                                               const MemOperand& mem) {
+  // We do not handle pre-indexing or post-indexing.
+  VIXL_ASSERT(!(mem.IsPreIndex() || mem.IsPostIndex()));
+  VIXL_ASSERT(!registers.Overlaps(tmp_list_));
+  VIXL_ASSERT(!registers.Overlaps(fptmp_list_));
+  VIXL_ASSERT(!registers.IncludesAliasOf(sp));
+
+  UseScratchRegisterScope temps(this);
+
+  MemOperand loc = BaseMemOperandForLoadStoreCPURegList(registers,
+                                                        mem,
+                                                        &temps);
+
+  while (registers.Count() >= 2) {
+    const CPURegister& dst0 = registers.PopLowestIndex();
+    const CPURegister& dst1 = registers.PopLowestIndex();
+    if (op == kStore) {
+      Stp(dst0, dst1, loc);
+    } else {
+      VIXL_ASSERT(op == kLoad);
+      Ldp(dst0, dst1, loc);
+    }
+    loc.AddOffset(2 * registers.RegisterSizeInBytes());
+  }
+  if (!registers.IsEmpty()) {
+    if (op == kStore) {
+      Str(registers.PopLowestIndex(), loc);
+    } else {
+      VIXL_ASSERT(op == kLoad);
+      Ldr(registers.PopLowestIndex(), loc);
+    }
+  }
+}
+
+MemOperand MacroAssembler::BaseMemOperandForLoadStoreCPURegList(
+    const CPURegList& registers,
+    const MemOperand& mem,
+    UseScratchRegisterScope* scratch_scope) {
+  // If necessary, pre-compute the base address for the accesses.
+  if (mem.IsRegisterOffset()) {
+    Register reg_base = scratch_scope->AcquireX();
+    ComputeAddress(reg_base, mem);
+    return MemOperand(reg_base);
+
+  } else if (mem.IsImmediateOffset()) {
+    int reg_size = registers.RegisterSizeInBytes();
+    int total_size = registers.TotalSizeInBytes();
+    int64_t min_offset = mem.offset();
+    int64_t max_offset = mem.offset() + std::max(0, total_size - 2 * reg_size);
+    if ((registers.Count() >= 2) &&
+        (!Assembler::IsImmLSPair(min_offset, WhichPowerOf2(reg_size)) ||
+         !Assembler::IsImmLSPair(max_offset, WhichPowerOf2(reg_size)))) {
+      Register reg_base = scratch_scope->AcquireX();
+      ComputeAddress(reg_base, mem);
+      return MemOperand(reg_base);
+    }
+  }
+
+  return mem;
+}
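
BaseMemOperandForLoadStoreCPURegList only materialises a base register when the first or last pair offset cannot be encoded directly. A standalone stand-in for that encodability test, based on the architectural LDP/STP immediate form (the real check is Assembler::IsImmLSPair):

    #include <cassert>
    #include <cstdint>

    // LDP/STP immediates are signed 7-bit multiples of the access size, i.e.
    // offset == imm7 * (1 << access_size_log2) with imm7 in [-64, 63].
    bool IsEncodableLSPairOffset(int64_t offset, unsigned access_size_log2) {
      int64_t scale = INT64_C(1) << access_size_log2;
      if ((offset % scale) != 0) return false;
      int64_t imm7 = offset / scale;
      return (imm7 >= -64) && (imm7 <= 63);
    }

    int main() {
      assert(IsEncodableLSPairOffset(504, 3));   // 63 * 8: largest X-pair offset.
      assert(!IsEncodableLSPairOffset(512, 3));  // 64 * 8: out of range.
      assert(!IsEncodableLSPairOffset(4, 3));    // Not a multiple of 8.
      return 0;
    }
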
+
 void MacroAssembler::BumpSystemStackPointer(const Operand& space) {
   VIXL_ASSERT(!sp.Is(StackPointer()));
   // TODO: Several callers rely on this not using scratch registers, so we use
diff --git a/src/a64/macro-assembler-a64.h b/src/vixl/a64/macro-assembler-a64.h
similarity index 97%
rename from src/a64/macro-assembler-a64.h
rename to src/vixl/a64/macro-assembler-a64.h
index ecc4c27..e94933c 100644
--- a/src/a64/macro-assembler-a64.h
+++ b/src/vixl/a64/macro-assembler-a64.h
@@ -30,9 +30,9 @@
 #include <algorithm>
 #include <limits>
 
-#include "globals.h"
-#include "a64/assembler-a64.h"
-#include "a64/debugger-a64.h"
+#include "vixl/globals.h"
+#include "vixl/a64/assembler-a64.h"
+#include "vixl/a64/debugger-a64.h"
 
 
 #define LS_MACRO_LIST(V)                                      \
@@ -56,6 +56,7 @@
 
 // Forward declaration
 class MacroAssembler;
+class UseScratchRegisterScope;
 
 class Pool {
  public:
@@ -631,13 +632,15 @@
   // Add and sub macros.
   void Add(const Register& rd,
            const Register& rn,
-           const Operand& operand);
+           const Operand& operand,
+           FlagsUpdate S = LeaveFlags);
   void Adds(const Register& rd,
             const Register& rn,
             const Operand& operand);
   void Sub(const Register& rd,
            const Register& rn,
-           const Operand& operand);
+           const Operand& operand,
+           FlagsUpdate S = LeaveFlags);
   void Subs(const Register& rd,
             const Register& rn,
             const Operand& operand);
@@ -844,39 +847,43 @@
   // supported.
   //
   // Otherwise, (Peek|Poke)(CPU|X|W|D|S)RegList is preferred.
-  void PeekCPURegList(CPURegList registers, int offset);
-  void PokeCPURegList(CPURegList registers, int offset);
+  void PeekCPURegList(CPURegList registers, int64_t offset) {
+    LoadCPURegList(registers, MemOperand(StackPointer(), offset));
+  }
+  void PokeCPURegList(CPURegList registers, int64_t offset) {
+    StoreCPURegList(registers, MemOperand(StackPointer(), offset));
+  }
 
-  void PeekSizeRegList(RegList registers, int offset, unsigned reg_size,
+  void PeekSizeRegList(RegList registers, int64_t offset, unsigned reg_size,
       CPURegister::RegisterType type = CPURegister::kRegister) {
     PeekCPURegList(CPURegList(type, reg_size, registers), offset);
   }
-  void PokeSizeRegList(RegList registers, int offset, unsigned reg_size,
+  void PokeSizeRegList(RegList registers, int64_t offset, unsigned reg_size,
       CPURegister::RegisterType type = CPURegister::kRegister) {
     PokeCPURegList(CPURegList(type, reg_size, registers), offset);
   }
-  void PeekXRegList(RegList regs, int offset) {
+  void PeekXRegList(RegList regs, int64_t offset) {
     PeekSizeRegList(regs, offset, kXRegSize);
   }
-  void PokeXRegList(RegList regs, int offset) {
+  void PokeXRegList(RegList regs, int64_t offset) {
     PokeSizeRegList(regs, offset, kXRegSize);
   }
-  void PeekWRegList(RegList regs, int offset) {
+  void PeekWRegList(RegList regs, int64_t offset) {
     PeekSizeRegList(regs, offset, kWRegSize);
   }
-  void PokeWRegList(RegList regs, int offset) {
+  void PokeWRegList(RegList regs, int64_t offset) {
     PokeSizeRegList(regs, offset, kWRegSize);
   }
-  void PeekDRegList(RegList regs, int offset) {
+  void PeekDRegList(RegList regs, int64_t offset) {
     PeekSizeRegList(regs, offset, kDRegSize, CPURegister::kVRegister);
   }
-  void PokeDRegList(RegList regs, int offset) {
+  void PokeDRegList(RegList regs, int64_t offset) {
     PokeSizeRegList(regs, offset, kDRegSize, CPURegister::kVRegister);
   }
-  void PeekSRegList(RegList regs, int offset) {
+  void PeekSRegList(RegList regs, int64_t offset) {
     PeekSizeRegList(regs, offset, kSRegSize, CPURegister::kVRegister);
   }
-  void PokeSRegList(RegList regs, int offset) {
+  void PokeSRegList(RegList regs, int64_t offset) {
     PokeSizeRegList(regs, offset, kSRegSize, CPURegister::kVRegister);
   }
 
@@ -911,6 +918,9 @@
   // aligned to 16 bytes.
   void PopCalleeSavedRegisters();
 
+  void LoadCPURegList(CPURegList registers, const MemOperand& src);
+  void StoreCPURegList(CPURegList registers, const MemOperand& dst);
+
   // Remaining instructions are simple pass-through calls to the assembler.
   void Adr(const Register& rd, Label* label) {
     VIXL_ASSERT(allow_macro_instructions_);
@@ -1135,18 +1145,31 @@
   void Fccmp(const VRegister& vn,
              const VRegister& vm,
              StatusFlags nzcv,
-             Condition cond) {
+             Condition cond,
+             FPTrapFlags trap = DisableTrap) {
     VIXL_ASSERT(allow_macro_instructions_);
     VIXL_ASSERT((cond != al) && (cond != nv));
     SingleEmissionCheckScope guard(this);
-    fccmp(vn, vm, nzcv, cond);
+    FPCCompareMacro(vn, vm, nzcv, cond, trap);
   }
-  void Fcmp(const VRegister& vn, const VRegister& vm) {
+  void Fccmpe(const VRegister& vn,
+              const VRegister& vm,
+              StatusFlags nzcv,
+              Condition cond) {
+    Fccmp(vn, vm, nzcv, cond, EnableTrap);
+  }
+  void Fcmp(const VRegister& vn, const VRegister& vm,
+            FPTrapFlags trap = DisableTrap) {
     VIXL_ASSERT(allow_macro_instructions_);
     SingleEmissionCheckScope guard(this);
-    fcmp(vn, vm);
+    FPCompareMacro(vn, vm, trap);
   }
-  void Fcmp(const VRegister& vn, double value);
+  void Fcmp(const VRegister& vn, double value,
+            FPTrapFlags trap = DisableTrap);
+  void Fcmpe(const VRegister& vn, double value);
+  void Fcmpe(const VRegister& vn, const VRegister& vm) {
+    Fcmp(vn, vm, EnableTrap);
+  }
   void Fcsel(const VRegister& vd,
              const VRegister& vn,
              const VRegister& vm,
@@ -2000,6 +2023,14 @@
     SingleEmissionCheckScope guard(this);
     umull(rd, rn, rm);
   }
+  void Umulh(const Register& xd, const Register& xn, const Register& xm) {
+    VIXL_ASSERT(allow_macro_instructions_);
+    VIXL_ASSERT(!xd.IsZero());
+    VIXL_ASSERT(!xn.IsZero());
+    VIXL_ASSERT(!xm.IsZero());
+    SingleEmissionCheckScope guard(this);
+    umulh(xd, xn, xm);
+  }
   void Umsubl(const Register& rd,
               const Register& rn,
               const Register& rm,
@@ -2989,6 +3020,23 @@
   void PrepareForPush(int count, int size);
   void PrepareForPop(int count, int size);
 
+  // The actual implementation of load and store operations for CPURegList.
+  enum LoadStoreCPURegListAction {
+    kLoad,
+    kStore
+  };
+  void LoadStoreCPURegListHelper(LoadStoreCPURegListAction operation,
+                                 CPURegList registers,
+                                 const MemOperand& mem);
+  // Returns a MemOperand suitable for loading or storing a CPURegList at `mem`.
+  // This helper may allocate registers from `scratch_scope` and generate code
+  // to compute an intermediate address. The resulting MemOperand is only valid
+  // as long as `scratch_scope` remains valid.
+  MemOperand BaseMemOperandForLoadStoreCPURegList(
+      const CPURegList& registers,
+      const MemOperand& mem,
+      UseScratchRegisterScope* scratch_scope);
+
   bool LabelIsOutOfRange(Label* label, ImmBranchType branch_type) {
     return !Instruction::IsValidImmPCOffset(branch_type,
                                             label->location() - CursorOffset());
diff --git a/src/a64/simulator-a64.cc b/src/vixl/a64/simulator-a64.cc
similarity index 85%
rename from src/a64/simulator-a64.cc
rename to src/vixl/a64/simulator-a64.cc
index 3f3f3e2..79256bb 100644
--- a/src/a64/simulator-a64.cc
+++ b/src/vixl/a64/simulator-a64.cc
@@ -27,8 +27,8 @@
 #ifdef USE_SIMULATOR
 
 #include <string.h>
-#include <math.h>
-#include "a64/simulator-a64.h"
+#include <cmath>
+#include "vixl/a64/simulator-a64.h"
 
 namespace vixl {
 
@@ -396,23 +396,18 @@
 }
 
 
-template<> double Simulator::FPDefaultNaN<double>() const {
-  return kFP64DefaultNaN;
-}
-
-
-template<> float Simulator::FPDefaultNaN<float>() const {
-  return kFP32DefaultNaN;
-}
-
-
-void Simulator::FPCompare(double val0, double val1) {
+void Simulator::FPCompare(double val0, double val1, FPTrapFlags trap) {
   AssertSupportedFPCR();
 
   // TODO: This assumes that the C++ implementation handles comparisons in the
   // way that we expect (as per AssertSupportedFPCR()).
-  if ((isnan(val0) != 0) || (isnan(val1) != 0)) {
+  bool process_exception = false;
+  if ((std::isnan(val0) != 0) || (std::isnan(val1) != 0)) {
     nzcv().SetRawValue(FPUnorderedFlag);
+    if (IsSignallingNaN(val0) || IsSignallingNaN(val1) ||
+        (trap == EnableTrap)) {
+      process_exception = true;
+    }
   } else if (val0 < val1) {
     nzcv().SetRawValue(FPLessThanFlag);
   } else if (val0 > val1) {
@@ -423,6 +418,7 @@
     VIXL_UNREACHABLE();
   }
   LogSystemRegister(NZCV);
+  if (process_exception) FPProcessException();
 }
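
FPCompare now takes an FPTrapFlags argument: fcmp raises Invalid Operation only for signalling NaNs, while fcmpe (EnableTrap) raises it for any NaN operand; the NZCV outcome is the same either way. A small model of those outcomes, assuming the usual AArch64 flag encoding:

    #include <cassert>
    #include <cmath>
    #include <cstdint>

    enum FlagBits { kN = 8, kZ = 4, kC = 2, kV = 1 };

    // NZCV after an FP compare: 0110 equal, 1000 less than, 0010 greater
    // than, 0011 unordered (any NaN operand).
    uint32_t FcmpNzcv(double a, double b) {
      if (std::isnan(a) || std::isnan(b)) return kC | kV;
      if (a < b) return kN;
      if (a > b) return kC;
      return kZ | kC;
    }

    int main() {
      assert(FcmpNzcv(1.0, 1.0) == (kZ | kC));
      assert(FcmpNzcv(0.0, 1.0) == kN);
      assert(FcmpNzcv(NAN, 1.0) == (kC | kV));
      return 0;
    }
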
 
 
@@ -440,7 +436,7 @@
   }
 
   switch (lane_size) {
-    default: VIXL_UNREACHABLE();
+    default: VIXL_UNREACHABLE(); break;
     case kQRegSizeInBytes: format |= kPrintReg1Q; break;
     case kDRegSizeInBytes: format |= kPrintReg1D; break;
     case kSRegSizeInBytes: format |= kPrintReg1S; break;
@@ -460,7 +456,7 @@
 Simulator::PrintRegisterFormat Simulator::GetPrintRegisterFormat(
     VectorFormat vform) {
   switch (vform) {
-    default: VIXL_UNREACHABLE();
+    default: VIXL_UNREACHABLE(); return kPrintReg16B;
     case kFormat16B: return kPrintReg16B;
     case kFormat8B: return kPrintReg8B;
     case kFormat8H: return kPrintReg8H;
@@ -841,7 +837,7 @@
   switch (instr->Mask(UnconditionalBranchMask)) {
     case BL:
       set_lr(instr->NextInstruction());
-      // Fall through.
+      VIXL_FALLTHROUGH();
     case B:
       set_pc(instr->ImmPCOffsetTarget());
       break;
@@ -864,7 +860,7 @@
   switch (instr->Mask(UnconditionalBranchToRegisterMask)) {
     case BLR:
       set_lr(instr->NextInstruction());
-      // Fall through.
+      VIXL_FALLTHROUGH();
     case BR:
     case RET: set_pc(target); break;
     default: VIXL_UNREACHABLE();
@@ -1007,7 +1003,7 @@
   // Switch on the logical operation, stripping out the NOT bit, as it has a
   // different meaning for logical immediate instructions.
   switch (instr->Mask(LogicalOpMask & ~NOT)) {
-    case ANDS: update_flags = true;  // Fall through.
+    case ANDS: update_flags = true; VIXL_FALLTHROUGH();
     case AND: result = op1 & op2; break;
     case ORR: result = op1 | op2; break;
     case EOR: result = op1 ^ op2; break;
@@ -1616,14 +1612,14 @@
     case REV_w: set_wreg(dst, ReverseBytes(wreg(src), Reverse32)); break;
     case REV32_x: set_xreg(dst, ReverseBytes(xreg(src), Reverse32)); break;
     case REV_x: set_xreg(dst, ReverseBytes(xreg(src), Reverse64)); break;
-    case CLZ_w: set_wreg(dst, CountLeadingZeros(wreg(src), kWRegSize)); break;
-    case CLZ_x: set_xreg(dst, CountLeadingZeros(xreg(src), kXRegSize)); break;
+    case CLZ_w: set_wreg(dst, CountLeadingZeros(wreg(src))); break;
+    case CLZ_x: set_xreg(dst, CountLeadingZeros(xreg(src))); break;
     case CLS_w: {
-      set_wreg(dst, CountLeadingSignBits(wreg(src), kWRegSize));
+      set_wreg(dst, CountLeadingSignBits(wreg(src)));
       break;
     }
     case CLS_x: {
-      set_xreg(dst, CountLeadingSignBits(xreg(src), kXRegSize));
+      set_xreg(dst, CountLeadingSignBits(xreg(src)));
       break;
     }
     default: VIXL_UNIMPLEMENTED();
@@ -1831,9 +1827,13 @@
 // The algorithm used is adapted from the one described in section 8.2 of
 //   Hacker's Delight, by Henry S. Warren, Jr.
 // It assumes that a right shift on a signed integer is an arithmetic shift.
-static int64_t MultiplyHighSigned(int64_t u, int64_t v) {
+// Type T must be either uint64_t or int64_t.
+template <typename T>
+static T MultiplyHigh(T u, T v) {
   uint64_t u0, v0, w0;
-  int64_t u1, v1, w1, w2, t;
+  T u1, v1, w1, w2, t;
+
+  VIXL_ASSERT(sizeof(u) == sizeof(u0));
 
   u0 = u & 0xffffffff;
   u1 = u >> 32;
@@ -1872,8 +1872,12 @@
     case SMSUBL_x: result = xreg(instr->Ra()) - (rn_s32 * rm_s32); break;
     case UMADDL_x: result = xreg(instr->Ra()) + (rn_u32 * rm_u32); break;
     case UMSUBL_x: result = xreg(instr->Ra()) - (rn_u32 * rm_u32); break;
+    case UMULH_x:
+      result = MultiplyHigh(reg<uint64_t>(instr->Rn()),
+                            reg<uint64_t>(instr->Rm()));
+      break;
     case SMULH_x:
-      result = MultiplyHighSigned(xreg(instr->Rn()), xreg(instr->Rm()));
+      result = MultiplyHigh(xreg(instr->Rn()), xreg(instr->Rm()));
       break;
     default: VIXL_UNIMPLEMENTED();
   }
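
The MultiplyHigh template above generalises the old MultiplyHighSigned so that UMULH and SMULH share one implementation; the hunk only shows its declarations. A self-contained version of the same Hacker's Delight section 8-2 algorithm, with a couple of hand-checked cases:

    #include <cassert>
    #include <cstdint>

    // High 64 bits of a 64x64 multiply, for T = uint64_t or int64_t.
    // Relies on arithmetic right shifts and two's complement conversions,
    // as the Hacker's Delight original does.
    template <typename T>
    T MulHigh(T u, T v) {
      uint64_t u0 = u & 0xffffffff;
      uint64_t v0 = v & 0xffffffff;
      T u1 = u >> 32;
      T v1 = v >> 32;
      uint64_t w0 = u0 * v0;
      T t = u1 * v0 + (w0 >> 32);
      T w1 = (t & 0xffffffff) + u0 * v1;
      T w2 = t >> 32;
      return u1 * v1 + w2 + (w1 >> 32);
    }

    int main() {
      assert(MulHigh<uint64_t>(UINT64_C(0xffffffffffffffff), 2) == 1);
      assert(MulHigh<int64_t>(INT64_C(-1), 2) == -1);
      assert(MulHigh<int64_t>(INT64_C(1) << 40, INT64_C(1) << 40) ==
             (INT64_C(1) << 16));
      return 0;
    }
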
@@ -2112,28 +2116,28 @@
       break;
     }
     case FCVTZS_xd_fixed:
-      set_xreg(dst, FPToInt64(dreg(src) * pow(2.0, fbits), FPZero));
+      set_xreg(dst, FPToInt64(dreg(src) * std::pow(2.0, fbits), FPZero));
       break;
     case FCVTZS_wd_fixed:
-      set_wreg(dst, FPToInt32(dreg(src) * pow(2.0, fbits), FPZero));
+      set_wreg(dst, FPToInt32(dreg(src) * std::pow(2.0, fbits), FPZero));
       break;
     case FCVTZU_xd_fixed:
-      set_xreg(dst, FPToUInt64(dreg(src) * pow(2.0, fbits), FPZero));
+      set_xreg(dst, FPToUInt64(dreg(src) * std::pow(2.0, fbits), FPZero));
       break;
     case FCVTZU_wd_fixed:
-      set_wreg(dst, FPToUInt32(dreg(src) * pow(2.0, fbits), FPZero));
+      set_wreg(dst, FPToUInt32(dreg(src) * std::pow(2.0, fbits), FPZero));
       break;
     case FCVTZS_xs_fixed:
-      set_xreg(dst, FPToInt64(sreg(src) * powf(2.0f, fbits), FPZero));
+      set_xreg(dst, FPToInt64(sreg(src) * std::pow(2.0f, fbits), FPZero));
       break;
     case FCVTZS_ws_fixed:
-      set_wreg(dst, FPToInt32(sreg(src) * powf(2.0f, fbits), FPZero));
+      set_wreg(dst, FPToInt32(sreg(src) * std::pow(2.0f, fbits), FPZero));
       break;
     case FCVTZU_xs_fixed:
-      set_xreg(dst, FPToUInt64(sreg(src) * powf(2.0f, fbits), FPZero));
+      set_xreg(dst, FPToUInt64(sreg(src) * std::pow(2.0f, fbits), FPZero));
       break;
     case FCVTZU_ws_fixed:
-      set_wreg(dst, FPToUInt32(sreg(src) * powf(2.0f, fbits), FPZero));
+      set_wreg(dst, FPToUInt32(sreg(src) * std::pow(2.0f, fbits), FPZero));
       break;
     default: VIXL_UNREACHABLE();
   }
@@ -2143,11 +2147,16 @@
 void Simulator::VisitFPCompare(const Instruction* instr) {
   AssertSupportedFPCR();
 
+  FPTrapFlags trap = DisableTrap;
   switch (instr->Mask(FPCompareMask)) {
-    case FCMP_s: FPCompare(sreg(instr->Rn()), sreg(instr->Rm())); break;
-    case FCMP_d: FPCompare(dreg(instr->Rn()), dreg(instr->Rm())); break;
-    case FCMP_s_zero: FPCompare(sreg(instr->Rn()), 0.0f); break;
-    case FCMP_d_zero: FPCompare(dreg(instr->Rn()), 0.0); break;
+    case FCMPE_s: trap = EnableTrap; VIXL_FALLTHROUGH();
+    case FCMP_s: FPCompare(sreg(instr->Rn()), sreg(instr->Rm()), trap); break;
+    case FCMPE_d: trap = EnableTrap; VIXL_FALLTHROUGH();
+    case FCMP_d: FPCompare(dreg(instr->Rn()), dreg(instr->Rm()), trap); break;
+    case FCMPE_s_zero: trap = EnableTrap; VIXL_FALLTHROUGH();
+    case FCMP_s_zero: FPCompare(sreg(instr->Rn()), 0.0f, trap); break;
+    case FCMPE_d_zero: trap = EnableTrap; VIXL_FALLTHROUGH();
+    case FCMP_d_zero: FPCompare(dreg(instr->Rn()), 0.0, trap); break;
     default: VIXL_UNIMPLEMENTED();
   }
 }
@@ -2156,18 +2165,23 @@
 void Simulator::VisitFPConditionalCompare(const Instruction* instr) {
   AssertSupportedFPCR();
 
+  FPTrapFlags trap = DisableTrap;
   switch (instr->Mask(FPConditionalCompareMask)) {
+    case FCCMPE_s: trap = EnableTrap;
+      VIXL_FALLTHROUGH();
     case FCCMP_s:
       if (ConditionPassed(instr->Condition())) {
-        FPCompare(sreg(instr->Rn()), sreg(instr->Rm()));
+        FPCompare(sreg(instr->Rn()), sreg(instr->Rm()), trap);
       } else {
         nzcv().SetFlags(instr->Nzcv());
         LogSystemRegister(NZCV);
       }
       break;
+    case FCCMPE_d: trap = EnableTrap;
+      VIXL_FALLTHROUGH();
     case FCCMP_d:
       if (ConditionPassed(instr->Condition())) {
-        FPCompare(dreg(instr->Rn()), dreg(instr->Rm()));
+        FPCompare(dreg(instr->Rn()), dreg(instr->Rm()), trap);
       } else {
         nzcv().SetFlags(instr->Nzcv());
         LogSystemRegister(NZCV);
@@ -2245,547 +2259,6 @@
 }
 
 
-// Assemble the specified IEEE-754 components into the target type and apply
-// appropriate rounding.
-//  sign:     0 = positive, 1 = negative
-//  exponent: Unbiased IEEE-754 exponent.
-//  mantissa: The mantissa of the input. The top bit (which is not encoded for
-//            normal IEEE-754 values) must not be omitted. This bit has the
-//            value 'pow(2, exponent)'.
-//
-// The input value is assumed to be a normalized value. That is, the input may
-// not be infinity or NaN. If the source value is subnormal, it must be
-// normalized before calling this function such that the highest set bit in the
-// mantissa has the value 'pow(2, exponent)'.
-//
-// Callers should use FPRoundToFloat or FPRoundToDouble directly, rather than
-// calling a templated FPRound.
-template <class T, int ebits, int mbits>
-static T FPRound(int64_t sign, int64_t exponent, uint64_t mantissa,
-                 FPRounding round_mode) {
-  VIXL_ASSERT((sign == 0) || (sign == 1));
-
-  // Only FPTieEven and FPRoundOdd rounding modes are implemented.
-  VIXL_ASSERT((round_mode == FPTieEven) || (round_mode == FPRoundOdd));
-
-  // Rounding can promote subnormals to normals, and normals to infinities. For
-  // example, a double with exponent 127 (FLT_MAX_EXP) would appear to be
-  // encodable as a float, but rounding based on the low-order mantissa bits
-  // could make it overflow. With ties-to-even rounding, this value would become
-  // an infinity.
-
-  // ---- Rounding Method ----
-  //
-  // The exponent is irrelevant in the rounding operation, so we treat the
-  // lowest-order bit that will fit into the result ('onebit') as having
-  // the value '1'. Similarly, the highest-order bit that won't fit into
-  // the result ('halfbit') has the value '0.5'. The 'point' sits between
-  // 'onebit' and 'halfbit':
-  //
-  //            These bits fit into the result.
-  //               |---------------------|
-  //  mantissa = 0bxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
-  //                                     ||
-  //                                    / |
-  //                                   /  halfbit
-  //                               onebit
-  //
-  // For subnormal outputs, the range of representable bits is smaller and
-  // the position of onebit and halfbit depends on the exponent of the
-  // input, but the method is otherwise similar.
-  //
-  //   onebit(frac)
-  //     |
-  //     | halfbit(frac)          halfbit(adjusted)
-  //     | /                      /
-  //     | |                      |
-  //  0b00.0 (exact)      -> 0b00.0 (exact)                    -> 0b00
-  //  0b00.0...           -> 0b00.0...                         -> 0b00
-  //  0b00.1 (exact)      -> 0b00.0111..111                    -> 0b00
-  //  0b00.1...           -> 0b00.1...                         -> 0b01
-  //  0b01.0 (exact)      -> 0b01.0 (exact)                    -> 0b01
-  //  0b01.0...           -> 0b01.0...                         -> 0b01
-  //  0b01.1 (exact)      -> 0b01.1 (exact)                    -> 0b10
-  //  0b01.1...           -> 0b01.1...                         -> 0b10
-  //  0b10.0 (exact)      -> 0b10.0 (exact)                    -> 0b10
-  //  0b10.0...           -> 0b10.0...                         -> 0b10
-  //  0b10.1 (exact)      -> 0b10.0111..111                    -> 0b10
-  //  0b10.1...           -> 0b10.1...                         -> 0b11
-  //  0b11.0 (exact)      -> 0b11.0 (exact)                    -> 0b11
-  //  ...                   /             |                      /   |
-  //                       /              |                     /    |
-  //                                                           /     |
-  // adjusted = frac - (halfbit(mantissa) & ~onebit(frac));   /      |
-  //
-  //                   mantissa = (mantissa >> shift) + halfbit(adjusted);
-
-  static const int mantissa_offset = 0;
-  static const int exponent_offset = mantissa_offset + mbits;
-  static const int sign_offset = exponent_offset + ebits;
-  VIXL_ASSERT(sign_offset == (sizeof(T) * 8 - 1));
-
-  // Bail out early for zero inputs.
-  if (mantissa == 0) {
-    return sign << sign_offset;
-  }
-
-  // If all bits in the exponent are set, the value is infinite or NaN.
-  // This is true for all binary IEEE-754 formats.
-  static const int infinite_exponent = (1 << ebits) - 1;
-  static const int max_normal_exponent = infinite_exponent - 1;
-
-  // Apply the exponent bias to encode it for the result. Doing this early makes
-  // it easy to detect values that will be infinite or subnormal.
-  exponent += max_normal_exponent >> 1;
-
-  if (exponent > max_normal_exponent) {
-    // Overflow: the input is too large for the result type to represent.
-    if (round_mode == FPTieEven) {
-      // FPTieEven rounding mode handles overflows using infinities.
-      exponent = infinite_exponent;
-      mantissa = 0;
-    } else {
-      VIXL_ASSERT(round_mode == FPRoundOdd);
-      // FPRoundOdd rounding mode handles overflows using the largest magnitude
-      // normal number.
-      exponent = max_normal_exponent;
-      mantissa = (UINT64_C(1) << exponent_offset) - 1;
-    }
-    return (sign << sign_offset) |
-           (exponent << exponent_offset) |
-           (mantissa << mantissa_offset);
-  }
-
-  // Calculate the shift required to move the top mantissa bit to the proper
-  // place in the destination type.
-  const int highest_significant_bit = 63 - CountLeadingZeros(mantissa, 64);
-  int shift = highest_significant_bit - mbits;
-
-  if (exponent <= 0) {
-    // The output will be subnormal (before rounding).
-    // For subnormal outputs, the shift must be adjusted by the exponent. The +1
-    // is necessary because the exponent of a subnormal value (encoded as 0) is
-    // the same as the exponent of the smallest normal value (encoded as 1).
-    shift += -exponent + 1;
-
-    // Handle inputs that would produce a zero output.
-    //
-    // Shifts higher than highest_significant_bit+1 will always produce a zero
-    // result. A shift of exactly highest_significant_bit+1 might produce a
-    // non-zero result after rounding.
-    if (shift > (highest_significant_bit + 1)) {
-      if (round_mode == FPTieEven) {
-        // The result will always be +/-0.0.
-        return sign << sign_offset;
-      } else {
-        VIXL_ASSERT(round_mode == FPRoundOdd);
-        VIXL_ASSERT(mantissa != 0);
-        // For FPRoundOdd, if the mantissa is too small to represent and
-        // non-zero return the next "odd" value.
-        return (sign << sign_offset) | 1;
-      }
-    }
-
-    // Properly encode the exponent for a subnormal output.
-    exponent = 0;
-  } else {
-    // Clear the topmost mantissa bit, since this is not encoded in IEEE-754
-    // normal values.
-    mantissa &= ~(UINT64_C(1) << highest_significant_bit);
-  }
-
-  if (shift > 0) {
-    if (round_mode == FPTieEven) {
-      // We have to shift the mantissa to the right. Some precision is lost, so
-      // we need to apply rounding.
-      uint64_t onebit_mantissa = (mantissa >> (shift)) & 1;
-      uint64_t halfbit_mantissa = (mantissa >> (shift-1)) & 1;
-      uint64_t adjustment = (halfbit_mantissa & ~onebit_mantissa);
-      uint64_t adjusted = mantissa - adjustment;
-      T halfbit_adjusted = (adjusted >> (shift-1)) & 1;
-
-      T result = (sign << sign_offset) |
-                 (exponent << exponent_offset) |
-                 ((mantissa >> shift) << mantissa_offset);
-
-      // A very large mantissa can overflow during rounding. If this happens,
-      // the exponent should be incremented and the mantissa set to 1.0
-      // (encoded as 0). Applying halfbit_adjusted after assembling the float
-      // has the nice side-effect that this case is handled for free.
-      //
-      // This also handles cases where a very large finite value overflows to
-      // infinity, or where a very large subnormal value overflows to become
-      // normal.
-      return result + halfbit_adjusted;
-    } else {
-      VIXL_ASSERT(round_mode == FPRoundOdd);
-      // If any bits at position halfbit or below are set, onebit (ie. the
-      // bottom bit of the resulting mantissa) must be set.
-      uint64_t fractional_bits = mantissa & ((UINT64_C(1) << shift) - 1);
-      if (fractional_bits != 0) {
-        mantissa |= UINT64_C(1) << shift;
-      }
-
-      return (sign << sign_offset) |
-             (exponent << exponent_offset) |
-             ((mantissa >> shift) << mantissa_offset);
-    }
-  } else {
-    // We have to shift the mantissa to the left (or not at all). The input
-    // mantissa is exactly representable in the output mantissa, so apply no
-    // rounding correction.
-    return (sign << sign_offset) |
-           (exponent << exponent_offset) |
-           ((mantissa << -shift) << mantissa_offset);
-  }
-}
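
The removed FPRound carries a long comment on the onebit/halfbit ties-to-even adjustment; the table there is easier to follow with the arithmetic written out. A small demonstration of that adjustment on its own (the helper name is illustrative):

    #include <cassert>
    #include <cstdint>

    // Drop `shift` low-order bits from `mantissa`, rounding to nearest with
    // ties to even, using the onebit/halfbit adjustment. Requires shift > 0.
    uint64_t RoundMantissaTiesToEven(uint64_t mantissa, int shift) {
      uint64_t onebit = (mantissa >> shift) & 1;
      uint64_t halfbit = (mantissa >> (shift - 1)) & 1;
      uint64_t adjusted = mantissa - (halfbit & ~onebit);
      uint64_t halfbit_adjusted = (adjusted >> (shift - 1)) & 1;
      return (mantissa >> shift) + halfbit_adjusted;
    }

    int main() {
      // The table in the comment, with two fractional bits (shift == 2):
      assert(RoundMantissaTiesToEven(2, 2) == 0);   // 00.10 -> 00 (tie, even).
      assert(RoundMantissaTiesToEven(3, 2) == 1);   // 00.11 -> 01.
      assert(RoundMantissaTiesToEven(6, 2) == 2);   // 01.10 -> 10 (tie, even).
      assert(RoundMantissaTiesToEven(10, 2) == 2);  // 10.10 -> 10 (tie, even).
      return 0;
    }
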
-
-
-// See FPRound for a description of this function.
-static inline double FPRoundToDouble(int64_t sign, int64_t exponent,
-                                     uint64_t mantissa, FPRounding round_mode) {
-  int64_t bits =
-      FPRound<int64_t, kDoubleExponentBits, kDoubleMantissaBits>(sign,
-                                                                 exponent,
-                                                                 mantissa,
-                                                                 round_mode);
-  return rawbits_to_double(bits);
-}
-
-
-// See FPRound for a description of this function.
-static inline float FPRoundToFloat(int64_t sign, int64_t exponent,
-                                   uint64_t mantissa, FPRounding round_mode) {
-  int32_t bits =
-      FPRound<int32_t, kFloatExponentBits, kFloatMantissaBits>(sign,
-                                                               exponent,
-                                                               mantissa,
-                                                               round_mode);
-  return rawbits_to_float(bits);
-}
-
-
-// See FPRound for a description of this function.
-static inline float16 FPRoundToFloat16(int64_t sign,
-                                       int64_t exponent,
-                                       uint64_t mantissa,
-                                       FPRounding round_mode) {
-  return FPRound<float16, kFloat16ExponentBits, kFloat16MantissaBits>(
-      sign, exponent, mantissa, round_mode);
-}
-
-
-double Simulator::FixedToDouble(int64_t src, int fbits, FPRounding round) {
-  if (src >= 0) {
-    return UFixedToDouble(src, fbits, round);
-  } else {
-    // This works for all negative values, including INT64_MIN.
-    return -UFixedToDouble(-src, fbits, round);
-  }
-}
-
-
-double Simulator::UFixedToDouble(uint64_t src, int fbits, FPRounding round) {
-  // An input of 0 is a special case because the result is effectively
-  // subnormal: The exponent is encoded as 0 and there is no implicit 1 bit.
-  if (src == 0) {
-    return 0.0;
-  }
-
-  // Calculate the exponent. The highest significant bit will have the value
-  // 2^exponent.
-  const int highest_significant_bit = 63 - CountLeadingZeros(src, 64);
-  const int64_t exponent = highest_significant_bit - fbits;
-
-  return FPRoundToDouble(0, exponent, src, round);
-}
-
-
-float Simulator::FixedToFloat(int64_t src, int fbits, FPRounding round) {
-  if (src >= 0) {
-    return UFixedToFloat(src, fbits, round);
-  } else {
-    // This works for all negative values, including INT64_MIN.
-    return -UFixedToFloat(-src, fbits, round);
-  }
-}
-
-
-float Simulator::UFixedToFloat(uint64_t src, int fbits, FPRounding round) {
-  // An input of 0 is a special case because the result is effectively
-  // subnormal: The exponent is encoded as 0 and there is no implicit 1 bit.
-  if (src == 0) {
-    return 0.0f;
-  }
-
-  // Calculate the exponent. The highest significant bit will have the value
-  // 2^exponent.
-  const int highest_significant_bit = 63 - CountLeadingZeros(src, 64);
-  const int32_t exponent = highest_significant_bit - fbits;
-
-  return FPRoundToFloat(0, exponent, src, round);
-}
-
-
-double Simulator::FPToDouble(float value) {
-  switch (fpclassify(value)) {
-    case FP_NAN: {
-      if (IsSignallingNaN(value)) {
-        FPProcessException();
-      }
-      if (DN()) return kFP64DefaultNaN;
-
-      // Convert NaNs as the processor would:
-      //  - The sign is propagated.
-      //  - The payload (mantissa) is transferred entirely, except that the top
-      //    bit is forced to '1', making the result a quiet NaN. The unused
-      //    (low-order) payload bits are set to 0.
-      uint32_t raw = float_to_rawbits(value);
-
-      uint64_t sign = raw >> 31;
-      uint64_t exponent = (1 << 11) - 1;
-      uint64_t payload = unsigned_bitextract_64(21, 0, raw);
-      payload <<= (52 - 23);  // The unused low-order bits should be 0.
-      payload |= (UINT64_C(1) << 51);  // Force a quiet NaN.
-
-      return rawbits_to_double((sign << 63) | (exponent << 52) | payload);
-    }
-
-    case FP_ZERO:
-    case FP_NORMAL:
-    case FP_SUBNORMAL:
-    case FP_INFINITE: {
-      // All other inputs are preserved in a standard cast, because every value
-      // representable using an IEEE-754 float is also representable using an
-      // IEEE-754 double.
-      return static_cast<double>(value);
-    }
-  }
-
-  VIXL_UNREACHABLE();
-  return static_cast<double>(value);
-}
-
-
-float Simulator::FPToFloat(float16 value) {
-  uint32_t sign = value >> 15;
-  uint32_t exponent = unsigned_bitextract_32(
-      kFloat16MantissaBits + kFloat16ExponentBits - 1, kFloat16MantissaBits,
-      value);
-  uint32_t mantissa = unsigned_bitextract_32(
-      kFloat16MantissaBits - 1, 0, value);
-
-  switch (float16classify(value)) {
-    case FP_ZERO:
-      return (sign == 0) ? 0.0f : -0.0f;
-
-    case FP_INFINITE:
-      return (sign == 0) ? kFP32PositiveInfinity : kFP32NegativeInfinity;
-
-    case FP_SUBNORMAL: {
-      // Calculate shift required to put mantissa into the most-significant bits
-      // of the destination mantissa.
-      int shift = CountLeadingZeros(mantissa << (32 - 10), 32);
-
-      // Shift mantissa and discard implicit '1'.
-      mantissa <<= (kFloatMantissaBits - kFloat16MantissaBits) + shift + 1;
-      mantissa &= (1 << kFloatMantissaBits) - 1;
-
-      // Adjust the exponent for the shift applied, and rebias.
-      exponent = exponent - shift + (-15 + 127);
-      break;
-    }
-
-    case FP_NAN:
-      if (IsSignallingNaN(value)) {
-        FPProcessException();
-      }
-      if (DN()) return kFP32DefaultNaN;
-
-      // Convert NaNs as the processor would:
-      //  - The sign is propagated.
-      //  - The payload (mantissa) is transferred entirely, except that the top
-      //    bit is forced to '1', making the result a quiet NaN. The unused
-      //    (low-order) payload bits are set to 0.
-      exponent = (1 << kFloatExponentBits) - 1;
-
-      // Increase bits in mantissa, making low-order bits 0.
-      mantissa <<= (kFloatMantissaBits - kFloat16MantissaBits);
-      mantissa |= 1 << 22;  // Force a quiet NaN.
-      break;
-
-    case FP_NORMAL:
-      // Increase bits in mantissa, making low-order bits 0.
-      mantissa <<= (kFloatMantissaBits - kFloat16MantissaBits);
-
-      // Change exponent bias.
-      exponent += (-15 + 127);
-      break;
-
-    default: VIXL_UNREACHABLE();
-  }
-  return rawbits_to_float((sign << 31) |
-                          (exponent << kFloatMantissaBits) |
-                          mantissa);
-}
-
-
-float16 Simulator::FPToFloat16(float value, FPRounding round_mode) {
-  // Only the FPTieEven rounding mode is implemented.
-  VIXL_ASSERT(round_mode == FPTieEven);
-  USE(round_mode);
-
-  uint32_t raw = float_to_rawbits(value);
-  int32_t sign = raw >> 31;
-  int32_t exponent = unsigned_bitextract_32(30, 23, raw) - 127;
-  uint32_t mantissa = unsigned_bitextract_32(22, 0, raw);
-
-  switch (fpclassify(value)) {
-    case FP_NAN: {
-      if (IsSignallingNaN(value)) {
-        FPProcessException();
-      }
-      if (DN()) return kFP16DefaultNaN;
-
-      // Convert NaNs as the processor would:
-      //  - The sign is propagated.
-      //  - The payload (mantissa) is transferred as much as possible, except
-      //    that the top bit is forced to '1', making the result a quiet NaN.
-      float16 result = (sign == 0) ? kFP16PositiveInfinity
-                                   : kFP16NegativeInfinity;
-      result |= mantissa >> (kFloatMantissaBits - kFloat16MantissaBits);
-      result |= (1 << 9);  // Force a quiet NaN;
-      return result;
-    }
-
-    case FP_ZERO:
-      return (sign == 0) ? 0 : 0x8000;
-
-    case FP_INFINITE:
-      return (sign == 0) ? kFP16PositiveInfinity : kFP16NegativeInfinity;
-
-    case FP_NORMAL:
-    case FP_SUBNORMAL: {
-      // Convert float-to-half as the processor would, assuming that FPCR.FZ
-      // (flush-to-zero) is not set.
-
-      // Add the implicit '1' bit to the mantissa.
-      mantissa += (1 << 23);
-      return FPRoundToFloat16(sign, exponent, mantissa, round_mode);
-    }
-  }
-
-  VIXL_UNREACHABLE();
-  return 0;
-}
-
-
-float16 Simulator::FPToFloat16(double value, FPRounding round_mode) {
-  // Only the FPTieEven rounding mode is implemented.
-  VIXL_ASSERT(round_mode == FPTieEven);
-  USE(round_mode);
-
-  uint64_t raw = double_to_rawbits(value);
-  int32_t sign = raw >> 63;
-  int64_t exponent = unsigned_bitextract_64(62, 52, raw) - 1023;
-  uint64_t mantissa = unsigned_bitextract_64(51, 0, raw);
-
-  switch (fpclassify(value)) {
-    case FP_NAN: {
-      if (IsSignallingNaN(value)) {
-        FPProcessException();
-      }
-      if (DN()) return kFP16DefaultNaN;
-
-      // Convert NaNs as the processor would:
-      //  - The sign is propagated.
-      //  - The payload (mantissa) is transferred as much as possible, except
-      //    that the top bit is forced to '1', making the result a quiet NaN.
-      float16 result = (sign == 0) ? kFP16PositiveInfinity
-                                   : kFP16NegativeInfinity;
-      result |= mantissa >> (kDoubleMantissaBits - kFloat16MantissaBits);
-      result |= (1 << 9);  // Force a quiet NaN;
-      return result;
-    }
-
-    case FP_ZERO:
-      return (sign == 0) ? 0 : 0x8000;
-
-    case FP_INFINITE:
-      return (sign == 0) ? kFP16PositiveInfinity : kFP16NegativeInfinity;
-
-    case FP_NORMAL:
-    case FP_SUBNORMAL: {
-      // Convert double-to-half as the processor would, assuming that FPCR.FZ
-      // (flush-to-zero) is not set.
-
-      // Add the implicit '1' bit to the mantissa.
-      mantissa += (UINT64_C(1) << 52);
-      return FPRoundToFloat16(sign, exponent, mantissa, round_mode);
-    }
-  }
-
-  VIXL_UNREACHABLE();
-  return 0;
-}
-
-
-float Simulator::FPToFloat(double value, FPRounding round_mode) {
-  // Only the FPTieEven rounding mode is implemented.
-  VIXL_ASSERT((round_mode == FPTieEven) || (round_mode == FPRoundOdd));
-  USE(round_mode);
-
-  switch (fpclassify(value)) {
-    case FP_NAN: {
-      if (IsSignallingNaN(value)) {
-        FPProcessException();
-      }
-      if (DN()) return kFP32DefaultNaN;
-
-      // Convert NaNs as the processor would:
-      //  - The sign is propagated.
-      //  - The payload (mantissa) is transferred as much as possible, except
-      //    that the top bit is forced to '1', making the result a quiet NaN.
-      uint64_t raw = double_to_rawbits(value);
-
-      uint32_t sign = raw >> 63;
-      uint32_t exponent = (1 << 8) - 1;
-      uint32_t payload = unsigned_bitextract_64(50, 52 - 23, raw);
-      payload |= (1 << 22);   // Force a quiet NaN.
-
-      return rawbits_to_float((sign << 31) | (exponent << 23) | payload);
-    }
-
-    case FP_ZERO:
-    case FP_INFINITE: {
-      // In a C++ cast, any value representable in the target type will be
-      // unchanged. This is always the case for +/-0.0 and infinities.
-      return static_cast<float>(value);
-    }
-
-    case FP_NORMAL:
-    case FP_SUBNORMAL: {
-      // Convert double-to-float as the processor would, assuming that FPCR.FZ
-      // (flush-to-zero) is not set.
-      uint64_t raw = double_to_rawbits(value);
-      // Extract the IEEE-754 double components.
-      uint32_t sign = raw >> 63;
-      // Extract the exponent and remove the IEEE-754 encoding bias.
-      int32_t exponent = unsigned_bitextract_64(62, 52, raw) - 1023;
-      // Extract the mantissa and add the implicit '1' bit.
-      uint64_t mantissa = unsigned_bitextract_64(51, 0, raw);
-      if (fpclassify(value) == FP_NORMAL) {
-        mantissa |= (UINT64_C(1) << 52);
-      }
-      return FPRoundToFloat(sign, exponent, mantissa, round_mode);
-    }
-  }
-
-  VIXL_UNREACHABLE();
-  return value;
-}
-
-
 void Simulator::VisitFPDataProcessing2Source(const Instruction* instr) {
   AssertSupportedFPCR();
 
@@ -2851,63 +2324,6 @@
 }
 
 
-template <typename T>
-T Simulator::FPProcessNaN(T op) {
-  VIXL_ASSERT(isnan(op));
-  if (IsSignallingNaN(op)) {
-    FPProcessException();
-  }
-  return DN() ? FPDefaultNaN<T>() : ToQuietNaN(op);
-}
-
-template float Simulator::FPProcessNaN(float op);
-template double Simulator::FPProcessNaN(double op);
-
-template <typename T>
-T Simulator::FPProcessNaNs(T op1, T op2) {
-  if (IsSignallingNaN(op1)) {
-    return FPProcessNaN(op1);
-  } else if (IsSignallingNaN(op2)) {
-    return FPProcessNaN(op2);
-  } else if (isnan(op1)) {
-    VIXL_ASSERT(IsQuietNaN(op1));
-    return FPProcessNaN(op1);
-  } else if (isnan(op2)) {
-    VIXL_ASSERT(IsQuietNaN(op2));
-    return FPProcessNaN(op2);
-  } else {
-    return 0.0;
-  }
-}
-
-template float Simulator::FPProcessNaNs(float op1, float op2);
-template double Simulator::FPProcessNaNs(double op1, double op2);
-
-template <typename T>
-T Simulator::FPProcessNaNs3(T op1, T op2, T op3) {
-  if (IsSignallingNaN(op1)) {
-    return FPProcessNaN(op1);
-  } else if (IsSignallingNaN(op2)) {
-    return FPProcessNaN(op2);
-  } else if (IsSignallingNaN(op3)) {
-    return FPProcessNaN(op3);
-  } else if (isnan(op1)) {
-    VIXL_ASSERT(IsQuietNaN(op1));
-    return FPProcessNaN(op1);
-  } else if (isnan(op2)) {
-    VIXL_ASSERT(IsQuietNaN(op2));
-    return FPProcessNaN(op2);
-  } else if (isnan(op3)) {
-    VIXL_ASSERT(IsQuietNaN(op3));
-    return FPProcessNaN(op3);
-  } else {
-    return 0.0;
-  }
-}
-
-template float Simulator::FPProcessNaNs3(float op1, float op2, float op3);
-template double Simulator::FPProcessNaNs3(double op1, double op2, double op3);
-
 bool Simulator::FPProcessNaNs(const Instruction* instr) {
   unsigned fd = instr->Rd();
   unsigned fn = instr->Rn();
@@ -2916,13 +2332,13 @@
 
   if (instr->Mask(FP64) == FP64) {
     double result = FPProcessNaNs(dreg(fn), dreg(fm));
-    if (isnan(result)) {
+    if (std::isnan(result)) {
       set_dreg(fd, result);
       done = true;
     }
   } else {
     float result = FPProcessNaNs(sreg(fn), sreg(fm));
-    if (isnan(result)) {
+    if (std::isnan(result)) {
       set_sreg(fd, result);
       done = true;
     }
@@ -3618,13 +3034,13 @@
   switch (instr->Mask(NEONLoadStoreMultiStructPostIndexMask)) {
     case NEON_LD1_4v:
     case NEON_LD1_4v_post: ld1(vf, vreg(reg[3]), addr[3]); count++;
-      // Fall through.
+      VIXL_FALLTHROUGH();
     case NEON_LD1_3v:
     case NEON_LD1_3v_post: ld1(vf, vreg(reg[2]), addr[2]); count++;
-      // Fall through.
+      VIXL_FALLTHROUGH();
     case NEON_LD1_2v:
     case NEON_LD1_2v_post: ld1(vf, vreg(reg[1]), addr[1]); count++;
-      // Fall through.
+      VIXL_FALLTHROUGH();
     case NEON_LD1_1v:
     case NEON_LD1_1v_post:
       ld1(vf, vreg(reg[0]), addr[0]);
@@ -3632,13 +3048,13 @@
       break;
     case NEON_ST1_4v:
     case NEON_ST1_4v_post: st1(vf, vreg(reg[3]), addr[3]); count++;
-      // Fall through.
+      VIXL_FALLTHROUGH();
     case NEON_ST1_3v:
     case NEON_ST1_3v_post: st1(vf, vreg(reg[2]), addr[2]); count++;
-      // Fall through.
+      VIXL_FALLTHROUGH();
     case NEON_ST1_2v:
     case NEON_ST1_2v_post: st1(vf, vreg(reg[1]), addr[1]); count++;
-      // Fall through.
+      VIXL_FALLTHROUGH();
     case NEON_ST1_1v:
     case NEON_ST1_1v_post:
       st1(vf, vreg(reg[0]), addr[0]);
@@ -3745,6 +3161,7 @@
     case NEON_LD3_b_post:
     case NEON_LD4_b:
     case NEON_LD4_b_post: do_load = true;
+      VIXL_FALLTHROUGH();
     case NEON_ST1_b:
     case NEON_ST1_b_post:
     case NEON_ST2_b:
@@ -3762,6 +3179,7 @@
     case NEON_LD3_h_post:
     case NEON_LD4_h:
     case NEON_LD4_h_post: do_load = true;
+      VIXL_FALLTHROUGH();
     case NEON_ST1_h:
     case NEON_ST1_h_post:
     case NEON_ST2_h:
@@ -3778,6 +3196,7 @@
     case NEON_LD3_s_post:
     case NEON_LD4_s:
     case NEON_LD4_s_post: do_load = true;
+      VIXL_FALLTHROUGH();
     case NEON_ST1_s:
     case NEON_ST1_s_post:
     case NEON_ST2_s:
diff --git a/src/a64/simulator-a64.h b/src/vixl/a64/simulator-a64.h
similarity index 89%
rename from src/a64/simulator-a64.h
rename to src/vixl/a64/simulator-a64.h
index c314f2a..34dd5a1 100644
--- a/src/a64/simulator-a64.h
+++ b/src/vixl/a64/simulator-a64.h
@@ -27,12 +27,12 @@
 #ifndef VIXL_A64_SIMULATOR_A64_H_
 #define VIXL_A64_SIMULATOR_A64_H_
 
-#include "globals.h"
-#include "utils.h"
-#include "a64/instructions-a64.h"
-#include "a64/assembler-a64.h"
-#include "a64/disasm-a64.h"
-#include "a64/instrument-a64.h"
+#include "vixl/globals.h"
+#include "vixl/utils.h"
+#include "vixl/a64/instructions-a64.h"
+#include "vixl/a64/assembler-a64.h"
+#include "vixl/a64/disasm-a64.h"
+#include "vixl/a64/instrument-a64.h"
 
 namespace vixl {
 
@@ -150,6 +150,201 @@
 const unsigned kLogLength = 2 * kInstructionSize;
 
 
+// Assemble the specified IEEE-754 components into the target type and apply
+// appropriate rounding.
+//  sign:     0 = positive, 1 = negative
+//  exponent: Unbiased IEEE-754 exponent.
+//  mantissa: The mantissa of the input. The top bit (which is not encoded for
+//            normal IEEE-754 values) must not be omitted. This bit has the
+//            value 'pow(2, exponent)'.
+//
+// The input value is assumed to be a normalized value. That is, the input may
+// not be infinity or NaN. If the source value is subnormal, it must be
+// normalized before calling this function such that the highest set bit in the
+// mantissa has the value 'pow(2, exponent)'.
+//
+// Callers should use FPRoundToFloat or FPRoundToDouble directly, rather than
+// calling a templated FPRound.
+template <class T, int ebits, int mbits>
+T FPRound(int64_t sign, int64_t exponent, uint64_t mantissa,
+          FPRounding round_mode) {
+  VIXL_ASSERT((sign == 0) || (sign == 1));
+
+  // Only FPTieEven and FPRoundOdd rounding modes are implemented.
+  VIXL_ASSERT((round_mode == FPTieEven) || (round_mode == FPRoundOdd));
+
+  // Rounding can promote subnormals to normals, and normals to infinities. For
+  // example, a double with exponent 127 (FLT_MAX_EXP) would appear to be
+  // encodable as a float, but rounding based on the low-order mantissa bits
+  // could make it overflow. With ties-to-even rounding, this value would become
+  // an infinity.
+
+  // ---- Rounding Method ----
+  //
+  // The exponent is irrelevant in the rounding operation, so we treat the
+  // lowest-order bit that will fit into the result ('onebit') as having
+  // the value '1'. Similarly, the highest-order bit that won't fit into
+  // the result ('halfbit') has the value '0.5'. The 'point' sits between
+  // 'onebit' and 'halfbit':
+  //
+  //            These bits fit into the result.
+  //               |---------------------|
+  //  mantissa = 0bxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
+  //                                     ||
+  //                                    / |
+  //                                   /  halfbit
+  //                               onebit
+  //
+  // For subnormal outputs, the range of representable bits is smaller and
+  // the position of onebit and halfbit depends on the exponent of the
+  // input, but the method is otherwise similar.
+  //
+  //   onebit(frac)
+  //     |
+  //     | halfbit(frac)          halfbit(adjusted)
+  //     | /                      /
+  //     | |                      |
+  //  0b00.0 (exact)      -> 0b00.0 (exact)                    -> 0b00
+  //  0b00.0...           -> 0b00.0...                         -> 0b00
+  //  0b00.1 (exact)      -> 0b00.0111..111                    -> 0b00
+  //  0b00.1...           -> 0b00.1...                         -> 0b01
+  //  0b01.0 (exact)      -> 0b01.0 (exact)                    -> 0b01
+  //  0b01.0...           -> 0b01.0...                         -> 0b01
+  //  0b01.1 (exact)      -> 0b01.1 (exact)                    -> 0b10
+  //  0b01.1...           -> 0b01.1...                         -> 0b10
+  //  0b10.0 (exact)      -> 0b10.0 (exact)                    -> 0b10
+  //  0b10.0...           -> 0b10.0...                         -> 0b10
+  //  0b10.1 (exact)      -> 0b10.0111..111                    -> 0b10
+  //  0b10.1...           -> 0b10.1...                         -> 0b11
+  //  0b11.0 (exact)      -> 0b11.0 (exact)                    -> 0b11
+  //  ...                   /             |                      /   |
+  //                       /              |                     /    |
+  //                                                           /     |
+  // adjusted = frac - (halfbit(mantissa) & ~onebit(frac));   /      |
+  //
+  //                   mantissa = (mantissa >> shift) + halfbit(adjusted);
+
+  static const int mantissa_offset = 0;
+  static const int exponent_offset = mantissa_offset + mbits;
+  static const int sign_offset = exponent_offset + ebits;
+  VIXL_ASSERT(sign_offset == (sizeof(T) * 8 - 1));
+
+  // Bail out early for zero inputs.
+  if (mantissa == 0) {
+    return sign << sign_offset;
+  }
+
+  // If all bits in the exponent are set, the value is infinite or NaN.
+  // This is true for all binary IEEE-754 formats.
+  static const int infinite_exponent = (1 << ebits) - 1;
+  static const int max_normal_exponent = infinite_exponent - 1;
+
+  // Apply the exponent bias to encode it for the result. Doing this early makes
+  // it easy to detect values that will be infinite or subnormal.
+  exponent += max_normal_exponent >> 1;
+
+  if (exponent > max_normal_exponent) {
+    // Overflow: the input is too large for the result type to represent.
+    if (round_mode == FPTieEven) {
+      // FPTieEven rounding mode handles overflows using infinities.
+      exponent = infinite_exponent;
+      mantissa = 0;
+    } else {
+      VIXL_ASSERT(round_mode == FPRoundOdd);
+      // FPRoundOdd rounding mode handles overflows using the largest magnitude
+      // normal number.
+      exponent = max_normal_exponent;
+      mantissa = (UINT64_C(1) << exponent_offset) - 1;
+    }
+    return (sign << sign_offset) |
+           (exponent << exponent_offset) |
+           (mantissa << mantissa_offset);
+  }
+
+  // Calculate the shift required to move the top mantissa bit to the proper
+  // place in the destination type.
+  const int highest_significant_bit = 63 - CountLeadingZeros(mantissa);
+  int shift = highest_significant_bit - mbits;
+
+  if (exponent <= 0) {
+    // The output will be subnormal (before rounding).
+    // For subnormal outputs, the shift must be adjusted by the exponent. The +1
+    // is necessary because the exponent of a subnormal value (encoded as 0) is
+    // the same as the exponent of the smallest normal value (encoded as 1).
+    shift += -exponent + 1;
+
+    // Handle inputs that would produce a zero output.
+    //
+    // Shifts higher than highest_significant_bit+1 will always produce a zero
+    // result. A shift of exactly highest_significant_bit+1 might produce a
+    // non-zero result after rounding.
+    if (shift > (highest_significant_bit + 1)) {
+      if (round_mode == FPTieEven) {
+        // The result will always be +/-0.0.
+        return sign << sign_offset;
+      } else {
+        VIXL_ASSERT(round_mode == FPRoundOdd);
+        VIXL_ASSERT(mantissa != 0);
+        // For FPRoundOdd, if the mantissa is too small to represent and
+        // non-zero, return the next "odd" value.
+        return (sign << sign_offset) | 1;
+      }
+    }
+
+    // Properly encode the exponent for a subnormal output.
+    exponent = 0;
+  } else {
+    // Clear the topmost mantissa bit, since this is not encoded in IEEE-754
+    // normal values.
+    mantissa &= ~(UINT64_C(1) << highest_significant_bit);
+  }
+
+  if (shift > 0) {
+    if (round_mode == FPTieEven) {
+      // We have to shift the mantissa to the right. Some precision is lost, so
+      // we need to apply rounding.
+      uint64_t onebit_mantissa = (mantissa >> (shift)) & 1;
+      uint64_t halfbit_mantissa = (mantissa >> (shift-1)) & 1;
+      uint64_t adjustment = (halfbit_mantissa & ~onebit_mantissa);
+      uint64_t adjusted = mantissa - adjustment;
+      T halfbit_adjusted = (adjusted >> (shift-1)) & 1;
+
+      T result = (sign << sign_offset) |
+                 (exponent << exponent_offset) |
+                 ((mantissa >> shift) << mantissa_offset);
+
+      // A very large mantissa can overflow during rounding. If this happens,
+      // the exponent should be incremented and the mantissa set to 1.0
+      // (encoded as 0). Applying halfbit_adjusted after assembling the float
+      // has the nice side-effect that this case is handled for free.
+      //
+      // This also handles cases where a very large finite value overflows to
+      // infinity, or where a very large subnormal value overflows to become
+      // normal.
+      return result + halfbit_adjusted;
+    } else {
+      VIXL_ASSERT(round_mode == FPRoundOdd);
+      // If any bits at position halfbit or below are set, onebit (i.e. the
+      // bottom bit of the resulting mantissa) must be set.
+      uint64_t fractional_bits = mantissa & ((UINT64_C(1) << shift) - 1);
+      if (fractional_bits != 0) {
+        mantissa |= UINT64_C(1) << shift;
+      }
+
+      return (sign << sign_offset) |
+             (exponent << exponent_offset) |
+             ((mantissa >> shift) << mantissa_offset);
+    }
+  } else {
+    // We have to shift the mantissa to the left (or not at all). The input
+    // mantissa is exactly representable in the output mantissa, so apply no
+    // rounding correction.
+    return (sign << sign_offset) |
+           (exponent << exponent_offset) |
+           ((mantissa << -shift) << mantissa_offset);
+  }
+}
+
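The FPRound template above replaces the per-format rounding code that previously lived in the simulator source. As a rough illustration, and not part of the patch itself, a single-precision wrapper might be instantiated as follows (the wrapper name is hypothetical; 8 and 23 are the IEEE-754 single-precision exponent and mantissa widths, and rawbits_to_float is the existing VIXL helper):

// Hypothetical sketch: instantiate FPRound for a single-precision result.
inline float FPRoundToFloatSketch(int64_t sign, int64_t exponent,
                                  uint64_t mantissa, FPRounding round_mode) {
  // uint32_t carries the assembled bit pattern: 8 exponent bits and 23
  // mantissa bits, so the sign lands in bit 31 as the template asserts.
  uint32_t bits =
      FPRound<uint32_t, 8, 23>(sign, exponent, mantissa, round_mode);
  return rawbits_to_float(bits);
}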
 
 // Representation of memory, with typed getters and setters for access.
 class Memory {
@@ -988,7 +1183,7 @@
 
   PrintRegisterFormat GetPrintRegisterFormatForSizeFP(unsigned size) {
     switch (size) {
-      default: VIXL_UNREACHABLE();
+      default: VIXL_UNREACHABLE(); return kPrintDReg;
       case kDRegSizeInBytes: return kPrintDReg;
       case kSRegSizeInBytes: return kPrintSReg;
     }
@@ -1170,7 +1365,8 @@
         return !Z() && (N() == V());
       case le:
         return !(!Z() && (N() == V()));
-      case nv:  // Fall through.
+      case nv:
+        VIXL_FALLTHROUGH();
       case al:
         return true;
       default:
@@ -2318,15 +2514,13 @@
   void SysOp_W(int op, int64_t val);
 
   template <typename T>
-  T FPDefaultNaN() const;
-  template <typename T>
   T FPRecipSqrtEstimate(T op);
   template <typename T>
   T FPRecipEstimate(T op, FPRounding rounding);
   template <typename T, typename R>
   R FPToFixed(T op, int fbits, bool is_signed, FPRounding rounding);
 
-  void FPCompare(double val0, double val1);
+  void FPCompare(double val0, double val1, FPTrapFlags trap);
   double FPRoundInt(double value, FPRounding round_mode);
   double FPToDouble(float value);
   float FPToFloat(double value, FPRounding round_mode);
@@ -2389,18 +2583,8 @@
   // for cumulative exception bits or floating-point exceptions.
   void FPProcessException() { }
 
-  // Standard NaN processing.
-  template <typename T>
-  T FPProcessNaN(T op);
-
   bool FPProcessNaNs(const Instruction* instr);
 
-  template <typename T>
-  T FPProcessNaNs(T op1, T op2);
-
-  template <typename T>
-  T FPProcessNaNs3(T op1, T op2, T op3);
-
   // Pseudo Printf instruction
   void DoPrintf(const Instruction* instr);
 
@@ -2478,6 +2662,58 @@
   static const Instruction* kEndOfSimAddress;
 
  private:
+  template <typename T>
+  static T FPDefaultNaN();
+
+  // Standard NaN processing.
+  template <typename T>
+  T FPProcessNaN(T op) {
+    VIXL_ASSERT(std::isnan(op));
+    if (IsSignallingNaN(op)) {
+      FPProcessException();
+    }
+    return DN() ? FPDefaultNaN<T>() : ToQuietNaN(op);
+  }
+
+  template <typename T>
+  T FPProcessNaNs(T op1, T op2) {
+    if (IsSignallingNaN(op1)) {
+      return FPProcessNaN(op1);
+    } else if (IsSignallingNaN(op2)) {
+      return FPProcessNaN(op2);
+    } else if (std::isnan(op1)) {
+      VIXL_ASSERT(IsQuietNaN(op1));
+      return FPProcessNaN(op1);
+    } else if (std::isnan(op2)) {
+      VIXL_ASSERT(IsQuietNaN(op2));
+      return FPProcessNaN(op2);
+    } else {
+      return 0.0;
+    }
+  }
+
+  template <typename T>
+  T FPProcessNaNs3(T op1, T op2, T op3) {
+    if (IsSignallingNaN(op1)) {
+      return FPProcessNaN(op1);
+    } else if (IsSignallingNaN(op2)) {
+      return FPProcessNaN(op2);
+    } else if (IsSignallingNaN(op3)) {
+      return FPProcessNaN(op3);
+    } else if (std::isnan(op1)) {
+      VIXL_ASSERT(IsQuietNaN(op1));
+      return FPProcessNaN(op1);
+    } else if (std::isnan(op2)) {
+      VIXL_ASSERT(IsQuietNaN(op2));
+      return FPProcessNaN(op2);
+    } else if (std::isnan(op3)) {
+      VIXL_ASSERT(IsQuietNaN(op3));
+      return FPProcessNaN(op3);
+    } else {
+      return 0.0;
+    }
+  }
+
   bool coloured_trace_;
 
   // A set of TraceParameters flags.
diff --git a/src/code-buffer.cc b/src/vixl/code-buffer.cc
similarity index 98%
rename from src/code-buffer.cc
rename to src/vixl/code-buffer.cc
index bc86e75..bb83975 100644
--- a/src/code-buffer.cc
+++ b/src/vixl/code-buffer.cc
@@ -24,8 +24,8 @@
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-#include "code-buffer.h"
-#include "utils.h"
+#include "vixl/code-buffer.h"
+#include "vixl/utils.h"
 
 namespace vixl {
 
diff --git a/src/code-buffer.h b/src/vixl/code-buffer.h
similarity index 98%
rename from src/code-buffer.h
rename to src/vixl/code-buffer.h
index da6233d..f93ebb6 100644
--- a/src/code-buffer.h
+++ b/src/vixl/code-buffer.h
@@ -28,7 +28,7 @@
 #define VIXL_CODE_BUFFER_H
 
 #include <string.h>
-#include "globals.h"
+#include "vixl/globals.h"
 
 namespace vixl {
 
diff --git a/src/vixl/compiler-intrinsics.cc b/src/vixl/compiler-intrinsics.cc
new file mode 100644
index 0000000..005bd55
--- /dev/null
+++ b/src/vixl/compiler-intrinsics.cc
@@ -0,0 +1,144 @@
+// Copyright 2015, ARM Limited
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+//   * Redistributions of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//   * Redistributions in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//   * Neither the name of ARM Limited nor the names of its contributors may be
+//     used to endorse or promote products derived from this software without
+//     specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND
+// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "compiler-intrinsics.h"
+
+namespace vixl {
+
+
+int CountLeadingSignBitsFallBack(int64_t value, int width) {
+  VIXL_ASSERT(IsPowerOf2(width) && (width <= 64));
+  if (value >= 0) {
+    return CountLeadingZeros(value, width) - 1;
+  } else {
+    return CountLeadingZeros(~value, width) - 1;
+  }
+}
+
+
+int CountLeadingZerosFallBack(uint64_t value, int width) {
+  VIXL_ASSERT(IsPowerOf2(width) && (width <= 64));
+  if (value == 0) {
+    return width;
+  }
+  int count = 0;
+  value = value << (64 - width);
+  if ((value & UINT64_C(0xffffffff00000000)) == 0) {
+    count += 32;
+    value = value << 32;
+  }
+  if ((value & UINT64_C(0xffff000000000000)) == 0) {
+    count += 16;
+    value = value << 16;
+  }
+  if ((value & UINT64_C(0xff00000000000000)) == 0) {
+    count += 8;
+    value = value << 8;
+  }
+  if ((value & UINT64_C(0xf000000000000000)) == 0) {
+    count += 4;
+    value = value << 4;
+  }
+  if ((value & UINT64_C(0xc000000000000000)) == 0) {
+    count += 2;
+    value = value << 2;
+  }
+  if ((value & UINT64_C(0x8000000000000000)) == 0) {
+    count += 1;
+  }
+  count += (value == 0);
+  return count;
+}
+
+
+int CountSetBitsFallBack(uint64_t value, int width) {
+  VIXL_ASSERT(IsPowerOf2(width) && (width <= 64));
+
+  // Mask out unused bits to ensure that they are not counted.
+  value &= (UINT64_C(0xffffffffffffffff) >> (64 - width));
+
+  // Add up the set bits.
+  // The algorithm works by adding pairs of bit fields together iteratively,
+  // where the size of each bit field doubles each time.
+  // An example for an 8-bit value:
+  // Bits:  h  g  f  e  d  c  b  a
+  //         \ |   \ |   \ |   \ |
+  // value = h+g   f+e   d+c   b+a
+  //            \    |      \    |
+  // value =   h+g+f+e     d+c+b+a
+  //                  \          |
+  // value =       h+g+f+e+d+c+b+a
+  const uint64_t kMasks[] = {
+    UINT64_C(0x5555555555555555),
+    UINT64_C(0x3333333333333333),
+    UINT64_C(0x0f0f0f0f0f0f0f0f),
+    UINT64_C(0x00ff00ff00ff00ff),
+    UINT64_C(0x0000ffff0000ffff),
+    UINT64_C(0x00000000ffffffff),
+  };
+
+  for (unsigned i = 0; i < (sizeof(kMasks) / sizeof(kMasks[0])); i++) {
+    int shift = 1 << i;
+    value = ((value >> shift) & kMasks[i]) + (value & kMasks[i]);
+  }
+
+  return value;
+}
+
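As a quick sanity check on the pairwise reduction described above, the expected results for a few inputs are sketched below (illustrative only, not part of the patch; the wrapper function name is invented):

// Hypothetical sketch: the reduction should agree with a naive bit count.
void CheckCountSetBitsFallBackSketch() {
  VIXL_ASSERT(CountSetBitsFallBack(UINT64_C(0xb5), 8) == 5);    // 0b10110101
  VIXL_ASSERT(CountSetBitsFallBack(UINT64_C(0xffffffff), 32) == 32);
  VIXL_ASSERT(CountSetBitsFallBack(UINT64_C(0x8000000000000001), 64) == 2);
}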
+
+int CountTrailingZerosFallBack(uint64_t value, int width) {
+  VIXL_ASSERT(IsPowerOf2(width) && (width <= 64));
+  int count = 0;
+  value = value << (64 - width);
+  if ((value & UINT64_C(0xffffffff)) == 0) {
+    count += 32;
+    value = value >> 32;
+  }
+  if ((value & 0xffff) == 0) {
+    count += 16;
+    value = value >> 16;
+  }
+  if ((value & 0xff) == 0) {
+    count += 8;
+    value = value >> 8;
+  }
+  if ((value & 0xf) == 0) {
+    count += 4;
+    value = value >> 4;
+  }
+  if ((value & 0x3) == 0) {
+    count += 2;
+    value = value >> 2;
+  }
+  if ((value & 0x1) == 0) {
+    count += 1;
+  }
+  count += (value == 0);
+  return count - (64 - width);
+}
+
+
+}  // namespace vixl
diff --git a/src/vixl/compiler-intrinsics.h b/src/vixl/compiler-intrinsics.h
new file mode 100644
index 0000000..534f7e6
--- /dev/null
+++ b/src/vixl/compiler-intrinsics.h
@@ -0,0 +1,155 @@
+// Copyright 2015, ARM Limited
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+//   * Redistributions of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//   * Redistributions in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//   * Neither the name of ARM Limited nor the names of its contributors may be
+//     used to endorse or promote products derived from this software without
+//     specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND
+// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+#ifndef VIXL_COMPILER_INTRINSICS_H
+#define VIXL_COMPILER_INTRINSICS_H
+
+#include "globals.h"
+
+namespace vixl {
+
+// Helper to check whether the version of GCC used is greater than or equal to
+// the specified requirement.
+#define MAJOR 1000000
+#define MINOR 1000
+#if defined(__GNUC__) && defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__)
+#define GCC_VERSION_OR_NEWER(major, minor, patchlevel)                         \
+    ((__GNUC__ * MAJOR + __GNUC_MINOR__ * MINOR + __GNUC_PATCHLEVEL__) >=      \
+     ((major) * MAJOR + (minor) * MINOR + (patchlevel)))
+#elif defined(__GNUC__) && defined(__GNUC_MINOR__)
+#define GCC_VERSION_OR_NEWER(major, minor, patchlevel)                         \
+    ((__GNUC__ * MAJOR + __GNUC_MINOR__ * MINOR) >=                            \
+     ((major) * MAJOR + (minor) * MINOR + (patchlevel)))
+#else
+#define GCC_VERSION_OR_NEWER(major, minor, patchlevel) 0
+#endif
+
+
+#if defined(__clang__) && !defined(VIXL_NO_COMPILER_BUILTINS)
+
+#define COMPILER_HAS_BUILTIN_CLRSB    (__has_builtin(__builtin_clrsb))
+#define COMPILER_HAS_BUILTIN_CLZ      (__has_builtin(__builtin_clz))
+#define COMPILER_HAS_BUILTIN_CTZ      (__has_builtin(__builtin_ctz))
+#define COMPILER_HAS_BUILTIN_FFS      (__has_builtin(__builtin_ffs))
+#define COMPILER_HAS_BUILTIN_POPCOUNT (__has_builtin(__builtin_popcount))
+
+#elif defined(__GNUC__) && !defined(VIXL_NO_COMPILER_BUILTINS)
+// The documentation for these builtins is available at:
+// https://gcc.gnu.org/onlinedocs/gcc-$MAJOR.$MINOR.$PATCHLEVEL/gcc//Other-Builtins.html
+
+# define COMPILER_HAS_BUILTIN_CLRSB    (GCC_VERSION_OR_NEWER(4, 7, 0))
+# define COMPILER_HAS_BUILTIN_CLZ      (GCC_VERSION_OR_NEWER(3, 4, 0))
+# define COMPILER_HAS_BUILTIN_CTZ      (GCC_VERSION_OR_NEWER(3, 4, 0))
+# define COMPILER_HAS_BUILTIN_FFS      (GCC_VERSION_OR_NEWER(3, 4, 0))
+# define COMPILER_HAS_BUILTIN_POPCOUNT (GCC_VERSION_OR_NEWER(3, 4, 0))
+
+#else
+// One can define VIXL_NO_COMPILER_BUILTINS to force using the manually
+// implemented C++ methods.
+
+#define COMPILER_HAS_BUILTIN_BSWAP    false
+#define COMPILER_HAS_BUILTIN_CLRSB    false
+#define COMPILER_HAS_BUILTIN_CLZ      false
+#define COMPILER_HAS_BUILTIN_CTZ      false
+#define COMPILER_HAS_BUILTIN_FFS      false
+#define COMPILER_HAS_BUILTIN_POPCOUNT false
+
+#endif
+
+
+template<typename V>
+inline bool IsPowerOf2(V value) {
+  return (value != 0) && ((value & (value - 1)) == 0);
+}
+
+
+// Declaration of fallback functions.
+int CountLeadingSignBitsFallBack(int64_t value, int width);
+int CountLeadingZerosFallBack(uint64_t value, int width);
+int CountSetBitsFallBack(uint64_t value, int width);
+int CountTrailingZerosFallBack(uint64_t value, int width);
+
+
+// Implementation of intrinsics functions.
+// TODO: The implementations could be improved for sizes different from 32bit
+// and 64bit: we could mask the values and call the appropriate builtin.
+
+template<typename V>
+inline int CountLeadingSignBits(V value, int width = (sizeof(V) * 8)) {
+#if COMPILER_HAS_BUILTIN_CLRSB
+  if (width == 32) {
+    return __builtin_clrsb(value);
+  } else if (width == 64) {
+    return __builtin_clrsbll(value);
+  }
+#endif
+  return CountLeadingSignBitsFallBack(value, width);
+}
+
+
+template<typename V>
+inline int CountLeadingZeros(V value, int width = (sizeof(V) * 8)) {
+#if COMPILER_HAS_BUILTIN_CLZ
+  if (width == 32) {
+    return (value == 0) ? 32 : __builtin_clz(value);
+  } else if (width == 64) {
+    return (value == 0) ? 64 : __builtin_clzll(value);
+  }
+#endif
+  return CountLeadingZerosFallBack(value, width);
+}
+
+
+template<typename V>
+inline int CountSetBits(V value, int width = (sizeof(V) * 8)) {
+#if COMPILER_HAS_BUILTIN_POPCOUNT
+  if (width == 32) {
+    return __builtin_popcount(value);
+  } else if (width == 64) {
+    return __builtin_popcountll(value);
+  }
+#endif
+  return CountSetBitsFallBack(value, width);
+}
+
+
+template<typename V>
+inline int CountTrailingZeros(V value, int width = (sizeof(V) * 8)) {
+#if COMPILER_HAS_BUILTIN_CTZ
+  if (width == 32) {
+    return (value == 0) ? 32 : __builtin_ctz(value);
+  } else if (width == 64) {
+    return (value == 0) ? 64 : __builtin_ctzll(value);
+  }
+#endif
+  return CountTrailingZerosFallBack(value, width);
+}
+
+}  // namespace vixl
+
+#endif  // VIXL_COMPILER_INTRINSICS_H
+
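The templates above prefer the compiler builtins and otherwise fall back to the portable implementations in compiler-intrinsics.cc. A brief usage sketch (illustrative only; the function name is invented) is shown below:

// Hypothetical sketch: both the builtin and fallback paths return the same
// counts for a 32-bit value.
void CountBitsSketch() {
  uint32_t value = 0x00f0;
  VIXL_ASSERT(CountLeadingZeros(value) == 24);     // Width defaults to 32.
  VIXL_ASSERT(CountTrailingZeros(value) == 4);
  VIXL_ASSERT(CountSetBits(value) == 4);
  VIXL_ASSERT(CountLeadingZeros(value, 16) == 8);  // Explicit 16-bit width.
}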
diff --git a/src/globals.h b/src/vixl/globals.h
similarity index 84%
rename from src/globals.h
rename to src/vixl/globals.h
index 35d9b05..6dfd000 100644
--- a/src/globals.h
+++ b/src/vixl/globals.h
@@ -49,7 +49,7 @@
 #include <stdint.h>
 #include <stdlib.h>
 #include <stddef.h>
-#include "platform.h"
+#include "vixl/platform.h"
 
 
 typedef uint8_t byte;
@@ -88,4 +88,20 @@
 
 #define VIXL_ALIGNMENT_EXCEPTION() printf("ALIGNMENT EXCEPTION\t"); VIXL_ABORT()
 
+// The clang::fallthrough attribute is used along with the Wimplicit-fallthrough
+// argument to annotate intentional fall-through between switch labels.
+// For more information please refer to:
+// http://clang.llvm.org/docs/AttributeReference.html#fallthrough-clang-fallthrough
+#ifndef __has_warning
+  #define __has_warning(x)  0
+#endif
+
+// Note: This option is only available for Clang, and is only enabled when
+// compiling as C++11 (201103L) or later.
+#if __has_warning("-Wimplicit-fallthrough") && __cplusplus >= 201103L
+  #define VIXL_FALLTHROUGH() [[clang::fallthrough]] //NOLINT
+#else
+  #define VIXL_FALLTHROUGH() do {} while (0)
+#endif
+
 #endif  // VIXL_GLOBALS_H
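The switch statements updated earlier in this patch use the new macro in place of "// Fall through." comments. A minimal sketch of the intended pattern (illustrative only; the function and values are invented):

// Hypothetical sketch: annotate an intentional fall-through so that Clang's
// -Wimplicit-fallthrough warning stays quiet.
int FallthroughSketch(int op, int value) {
  switch (op) {
    case 2:
      value *= 2;
      VIXL_FALLTHROUGH();
    case 1:
      value += 1;
      break;
    default:
      break;
  }
  return value;
}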
diff --git a/src/invalset.h b/src/vixl/invalset.h
similarity index 98%
rename from src/invalset.h
rename to src/vixl/invalset.h
index c18353a..ffdc023 100644
--- a/src/invalset.h
+++ b/src/vixl/invalset.h
@@ -32,7 +32,7 @@
 #include <algorithm>
 #include <vector>
 
-#include "globals.h"
+#include "vixl/globals.h"
 
 namespace vixl {
 
@@ -250,7 +250,7 @@
 
   // Indicates if the iterator is looking at the vector or at the preallocated
   // elements.
-  bool using_vector_;
+  const bool using_vector_;
   // Used when looking at the preallocated elements, or in debug mode when using
   // the vector to track how many times the iterator has advanced.
   size_t index_;
@@ -657,13 +657,14 @@
 
 template<class S>
 InvalSetIterator<S>::InvalSetIterator(S* inval_set)
-    : using_vector_(false), index_(0), inval_set_(inval_set) {
+    : using_vector_((inval_set != NULL) && inval_set->IsUsingVector()),
+      index_(0),
+      inval_set_(inval_set) {
   if (inval_set != NULL) {
     inval_set->Sort(S::kSoftSort);
 #ifdef VIXL_DEBUG
     inval_set->Acquire();
 #endif
-    using_vector_ = inval_set->IsUsingVector();
     if (using_vector_) {
       iterator_ = typename std::vector<ElementType>::iterator(
           inval_set_->vector_->begin());
diff --git a/src/platform.h b/src/vixl/platform.h
similarity index 100%
rename from src/platform.h
rename to src/vixl/platform.h
diff --git a/src/utils.cc b/src/vixl/utils.cc
similarity index 63%
rename from src/utils.cc
rename to src/vixl/utils.cc
index e026c2d..867001d 100644
--- a/src/utils.cc
+++ b/src/vixl/utils.cc
@@ -24,7 +24,7 @@
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-#include "utils.h"
+#include "vixl/utils.h"
 #include <stdio.h>
 
 namespace vixl {
@@ -127,91 +127,6 @@
 }
 
 
-int CountLeadingZeros(uint64_t value, int width) {
-  VIXL_ASSERT((width == 8) || (width == 16) || (width == 32) || (width == 64));
-  int count = 0;
-  uint64_t bit_test = UINT64_C(1) << (width - 1);
-  while ((count < width) && ((bit_test & value) == 0)) {
-    count++;
-    bit_test >>= 1;
-  }
-  return count;
-}
-
-
-int CountLeadingSignBits(int64_t value, int width) {
-  VIXL_ASSERT((width == 8) || (width == 16) || (width == 32) || (width == 64));
-  if (value >= 0) {
-    return CountLeadingZeros(value, width) - 1;
-  } else {
-    return CountLeadingZeros(~value, width) - 1;
-  }
-}
-
-
-int CountTrailingZeros(uint64_t value, int width) {
-  VIXL_ASSERT((width == 32) || (width == 64));
-  int count = 0;
-  while ((count < width) && (((value >> count) & 1) == 0)) {
-    count++;
-  }
-  return count;
-}
-
-
-int CountSetBits(uint64_t value, int width) {
-  // TODO: Other widths could be added here, as the implementation already
-  // supports them.
-  VIXL_ASSERT((width == 32) || (width == 64));
-
-  // Mask out unused bits to ensure that they are not counted.
-  value &= (UINT64_C(0xffffffffffffffff) >> (64-width));
-
-  // Add up the set bits.
-  // The algorithm works by adding pairs of bit fields together iteratively,
-  // where the size of each bit field doubles each time.
-  // An example for an 8-bit value:
-  // Bits:  h  g  f  e  d  c  b  a
-  //         \ |   \ |   \ |   \ |
-  // value = h+g   f+e   d+c   b+a
-  //            \    |      \    |
-  // value =   h+g+f+e     d+c+b+a
-  //                  \          |
-  // value =       h+g+f+e+d+c+b+a
-  const uint64_t kMasks[] = {
-    UINT64_C(0x5555555555555555),
-    UINT64_C(0x3333333333333333),
-    UINT64_C(0x0f0f0f0f0f0f0f0f),
-    UINT64_C(0x00ff00ff00ff00ff),
-    UINT64_C(0x0000ffff0000ffff),
-    UINT64_C(0x00000000ffffffff),
-  };
-
-  for (unsigned i = 0; i < (sizeof(kMasks) / sizeof(kMasks[0])); i++) {
-    int shift = 1 << i;
-    value = ((value >> shift) & kMasks[i]) + (value & kMasks[i]);
-  }
-
-  return value;
-}
-
-
-uint64_t LowestSetBit(uint64_t value) {
-  return value & -value;
-}
-
-
-int HighestSetBitPosition(uint64_t number) {
-  VIXL_ASSERT(number != 0);
-  return 63 - CountLeadingZeros(number, 64);
-}
-
-
-bool IsPowerOf2(int64_t value) {
-  return (value != 0) && ((value & (value - 1)) == 0);
-}
-
-
 unsigned CountClearHalfWords(uint64_t imm, unsigned reg_size) {
   VIXL_ASSERT((reg_size % 8) == 0);
   int count = 0;
diff --git a/src/utils.h b/src/vixl/utils.h
similarity index 91%
rename from src/utils.h
rename to src/vixl/utils.h
index 6b9b72f..d7488b7 100644
--- a/src/utils.h
+++ b/src/vixl/utils.h
@@ -27,9 +27,10 @@
 #ifndef VIXL_UTILS_H
 #define VIXL_UTILS_H
 
-#include <math.h>
 #include <string.h>
-#include "globals.h"
+#include <cmath>
+#include "vixl/globals.h"
+#include "vixl/compiler-intrinsics.h"
 
 namespace vixl {
 
@@ -121,7 +122,7 @@
 inline bool IsSignallingNaN(double num) {
   const uint64_t kFP64QuietNaNMask = UINT64_C(0x0008000000000000);
   uint64_t raw = double_to_rawbits(num);
-  if (isnan(num) && ((raw & kFP64QuietNaNMask) == 0)) {
+  if (std::isnan(num) && ((raw & kFP64QuietNaNMask) == 0)) {
     return true;
   }
   return false;
@@ -131,7 +132,7 @@
 inline bool IsSignallingNaN(float num) {
   const uint32_t kFP32QuietNaNMask = 0x00400000;
   uint32_t raw = float_to_rawbits(num);
-  if (isnan(num) && ((raw & kFP32QuietNaNMask) == 0)) {
+  if (std::isnan(num) && ((raw & kFP32QuietNaNMask) == 0)) {
     return true;
   }
   return false;
@@ -147,21 +148,21 @@
 
 template <typename T>
 inline bool IsQuietNaN(T num) {
-  return isnan(num) && !IsSignallingNaN(num);
+  return std::isnan(num) && !IsSignallingNaN(num);
 }
 
 
 // Convert the NaN in 'num' to a quiet NaN.
 inline double ToQuietNaN(double num) {
   const uint64_t kFP64QuietNaNMask = UINT64_C(0x0008000000000000);
-  VIXL_ASSERT(isnan(num));
+  VIXL_ASSERT(std::isnan(num));
   return rawbits_to_double(double_to_rawbits(num) | kFP64QuietNaNMask);
 }
 
 
 inline float ToQuietNaN(float num) {
   const uint32_t kFP32QuietNaNMask = 0x00400000;
-  VIXL_ASSERT(isnan(num));
+  VIXL_ASSERT(std::isnan(num));
   return rawbits_to_float(float_to_rawbits(num) | kFP32QuietNaNMask);
 }
 
@@ -177,14 +178,23 @@
 }
 
 
-// Bit counting.
-int CountLeadingZeros(uint64_t value, int width);
-int CountLeadingSignBits(int64_t value, int width);
-int CountTrailingZeros(uint64_t value, int width);
-int CountSetBits(uint64_t value, int width);
-uint64_t LowestSetBit(uint64_t value);
-int HighestSetBitPosition(uint64_t value);
-bool IsPowerOf2(int64_t value);
+inline uint64_t LowestSetBit(uint64_t value) {
+  return value & -value;
+}
+
+
+template<typename T>
+inline int HighestSetBitPosition(T value) {
+  VIXL_ASSERT(value != 0);
+  return (sizeof(value) * 8 - 1) - CountLeadingZeros(value);
+}
+
+
+template<typename V>
+inline int WhichPowerOf2(V value) {
+  VIXL_ASSERT(IsPowerOf2(value));
+  return CountTrailingZeros(value);
+}
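These inline helpers replace the out-of-line versions removed from utils.cc. Their expected behaviour is sketched below for illustration only (the function name is invented):

// Hypothetical sketch: expected results for the new inline bit helpers.
void BitHelperSketch() {
  VIXL_ASSERT(LowestSetBit(UINT64_C(20)) == 4);           // 0b10100 -> 0b00100
  VIXL_ASSERT(HighestSetBitPosition(UINT64_C(20)) == 4);  // Bit 4 is the top.
  VIXL_ASSERT(WhichPowerOf2(4) == 2);                     // 4 == pow(2, 2)
}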
 
 unsigned CountClearHalfWords(uint64_t imm, unsigned reg_size);
 
diff --git a/test/examples/test-examples.cc b/test/examples/test-examples.cc
index 61a31fb..7777c20 100644
--- a/test/examples/test-examples.cc
+++ b/test/examples/test-examples.cc
@@ -24,9 +24,9 @@
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-#include "a64/macro-assembler-a64.h"
-#include "a64/debugger-a64.h"
-#include "a64/simulator-a64.h"
+#include "vixl/a64/macro-assembler-a64.h"
+#include "vixl/a64/debugger-a64.h"
+#include "vixl/a64/simulator-a64.h"
 #include "examples.h"
 #include "non-const-visitor.h"
 #include "custom-disassembler.h"
diff --git a/test/test-assembler-a64.cc b/test/test-assembler-a64.cc
index ed55ae9..55e42ab 100644
--- a/test/test-assembler-a64.cc
+++ b/test/test-assembler-a64.cc
@@ -27,16 +27,16 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
 #include <float.h>
+#include <cmath>
 
 #include "test-runner.h"
 #include "test-utils-a64.h"
-#include "a64/macro-assembler-a64.h"
-#include "a64/simulator-a64.h"
-#include "a64/debugger-a64.h"
-#include "a64/disasm-a64.h"
-#include "a64/cpu-a64.h"
+#include "vixl/a64/macro-assembler-a64.h"
+#include "vixl/a64/simulator-a64.h"
+#include "vixl/a64/debugger-a64.h"
+#include "vixl/a64/disasm-a64.h"
+#include "vixl/a64/cpu-a64.h"
 
 namespace vixl {
 
@@ -1072,28 +1072,28 @@
   SETUP();
 
   START();
-  __ Mov(x16, 0);
-  __ Mov(x17, 1);
+  __ Mov(x25, 0);
+  __ Mov(x26, 1);
   __ Mov(x18, 0xffffffff);
   __ Mov(x19, 0xffffffffffffffff);
 
-  __ Mul(w0, w16, w16);
-  __ Mul(w1, w16, w17);
-  __ Mul(w2, w17, w18);
+  __ Mul(w0, w25, w25);
+  __ Mul(w1, w25, w26);
+  __ Mul(w2, w26, w18);
   __ Mul(w3, w18, w19);
-  __ Mul(x4, x16, x16);
-  __ Mul(x5, x17, x18);
+  __ Mul(x4, x25, x25);
+  __ Mul(x5, x26, x18);
   __ Mul(x6, x18, x19);
   __ Mul(x7, x19, x19);
-  __ Smull(x8, w17, w18);
+  __ Smull(x8, w26, w18);
   __ Smull(x9, w18, w18);
   __ Smull(x10, w19, w19);
-  __ Mneg(w11, w16, w16);
-  __ Mneg(w12, w16, w17);
-  __ Mneg(w13, w17, w18);
+  __ Mneg(w11, w25, w25);
+  __ Mneg(w12, w25, w26);
+  __ Mneg(w13, w26, w18);
   __ Mneg(w14, w18, w19);
-  __ Mneg(x20, x16, x16);
-  __ Mneg(x21, x17, x18);
+  __ Mneg(x20, x25, x25);
+  __ Mneg(x21, x26, x18);
   __ Mneg(x22, x18, x19);
   __ Mneg(x23, x19, x19);
   END();
@@ -1333,6 +1333,54 @@
 }
 
 
+TEST(umulh) {
+  SETUP();
+
+  START();
+  __ Mov(x20, 0);
+  __ Mov(x21, 1);
+  __ Mov(x22, 0x0000000100000000);
+  __ Mov(x23, 0x0000000012345678);
+  __ Mov(x24, 0x0123456789abcdef);
+  __ Mov(x25, 0x0000000200000000);
+  __ Mov(x26, 0x8000000000000000);
+  __ Mov(x27, 0xffffffffffffffff);
+  __ Mov(x28, 0x5555555555555555);
+  __ Mov(x29, 0xaaaaaaaaaaaaaaaa);
+
+  __ Umulh(x0, x20, x24);
+  __ Umulh(x1, x21, x24);
+  __ Umulh(x2, x22, x23);
+  __ Umulh(x3, x22, x24);
+  __ Umulh(x4, x24, x25);
+  __ Umulh(x5, x23, x27);
+  __ Umulh(x6, x26, x26);
+  __ Umulh(x7, x26, x27);
+  __ Umulh(x8, x27, x27);
+  __ Umulh(x9, x28, x28);
+  __ Umulh(x10, x28, x29);
+  __ Umulh(x11, x29, x29);
+  END();
+
+  RUN();
+
+  ASSERT_EQUAL_64(0, x0);
+  ASSERT_EQUAL_64(0, x1);
+  ASSERT_EQUAL_64(0, x2);
+  ASSERT_EQUAL_64(0x0000000001234567, x3);
+  ASSERT_EQUAL_64(0x0000000002468acf, x4);
+  ASSERT_EQUAL_64(0x0000000012345677, x5);
+  ASSERT_EQUAL_64(0x4000000000000000, x6);
+  ASSERT_EQUAL_64(0x7fffffffffffffff, x7);
+  ASSERT_EQUAL_64(0xfffffffffffffffe, x8);
+  ASSERT_EQUAL_64(0x1c71c71c71c71c71, x9);
+  ASSERT_EQUAL_64(0x38e38e38e38e38e3, x10);
+  ASSERT_EQUAL_64(0x71c71c71c71c71c6, x11);
+
+  TEARDOWN();
+}
+
+
 TEST(smaddl_umaddl_umull) {
   SETUP();
 
@@ -9446,26 +9494,26 @@
   uint32_t raw_n = float_to_rawbits(n);
   uint32_t raw_m = float_to_rawbits(m);
 
-  if (isnan(n) && ((raw_n & kFP32QuietNaNMask) == 0)) {
+  if (std::isnan(n) && ((raw_n & kFP32QuietNaNMask) == 0)) {
     // n is signalling NaN.
     return rawbits_to_float(raw_n | kFP32QuietNaNMask);
-  } else if (isnan(m) && ((raw_m & kFP32QuietNaNMask) == 0)) {
+  } else if (std::isnan(m) && ((raw_m & kFP32QuietNaNMask) == 0)) {
     // m is signalling NaN.
     return rawbits_to_float(raw_m | kFP32QuietNaNMask);
   } else if (quiet_nan_substitute == 0.0) {
-    if (isnan(n)) {
+    if (std::isnan(n)) {
       // n is quiet NaN.
       return n;
-    } else if (isnan(m)) {
+    } else if (std::isnan(m)) {
       // m is quiet NaN.
       return m;
     }
   } else {
     // Substitute n or m if one is quiet, but not both.
-    if (isnan(n) && !isnan(m)) {
+    if (std::isnan(n) && !std::isnan(m)) {
       // n is quiet NaN: replace with substitute.
       n = quiet_nan_substitute;
-    } else if (!isnan(n) && isnan(m)) {
+    } else if (!std::isnan(n) && std::isnan(m)) {
       // m is quiet NaN: replace with substitute.
       m = quiet_nan_substitute;
     }
@@ -9488,26 +9536,26 @@
   uint64_t raw_n = double_to_rawbits(n);
   uint64_t raw_m = double_to_rawbits(m);
 
-  if (isnan(n) && ((raw_n & kFP64QuietNaNMask) == 0)) {
+  if (std::isnan(n) && ((raw_n & kFP64QuietNaNMask) == 0)) {
     // n is signalling NaN.
     return rawbits_to_double(raw_n | kFP64QuietNaNMask);
-  } else if (isnan(m) && ((raw_m & kFP64QuietNaNMask) == 0)) {
+  } else if (std::isnan(m) && ((raw_m & kFP64QuietNaNMask) == 0)) {
     // m is signalling NaN.
     return rawbits_to_double(raw_m | kFP64QuietNaNMask);
   } else if (quiet_nan_substitute == 0.0) {
-    if (isnan(n)) {
+    if (std::isnan(n)) {
       // n is quiet NaN.
       return n;
-    } else if (isnan(m)) {
+    } else if (std::isnan(m)) {
       // m is quiet NaN.
       return m;
     }
   } else {
     // Substitute n or m if one is quiet, but not both.
-    if (isnan(n) && !isnan(m)) {
+    if (std::isnan(n) && !std::isnan(m)) {
       // n is quiet NaN: replace with substitute.
       n = quiet_nan_substitute;
-    } else if (!isnan(n) && isnan(m)) {
+    } else if (!std::isnan(n) && std::isnan(m)) {
       // m is quiet NaN: replace with substitute.
       m = quiet_nan_substitute;
     }
@@ -9700,6 +9748,10 @@
   __ Fmov(d18, -0.5);
   __ Fmov(d19, -1.0);
   __ Mov(x20, 0);
+  __ Mov(x21, 0x7ff0000000000001);  // Double precision NaN.
+  __ Fmov(d21, x21);
+  __ Mov(w22, 0x7f800001);  // Single precision NaN.
+  __ Fmov(s22, w22);
 
   __ Cmp(x20, 0);
   __ Fccmp(s16, s16, NoFlag, eq);
@@ -9739,6 +9791,22 @@
 
   __ fccmp(d18, d18, NFlag, nv);
   __ Mrs(x9, NZCV);
+
+  __ Cmp(x20, 0);
+  __ Fccmpe(s16, s16, NoFlag, eq);
+  __ Mrs(x10, NZCV);
+
+  __ Cmp(x20, 0);
+  __ Fccmpe(d18, d19, ZCVFlag, ls);
+  __ Mrs(x11, NZCV);
+
+  __ Cmp(x20, 0);
+  __ Fccmpe(d21, d21, NoFlag, eq);
+  __ Mrs(x12, NZCV);
+
+  __ Cmp(x20, 0);
+  __ Fccmpe(s22, s22, NoFlag, eq);
+  __ Mrs(x13, NZCV);
   END();
 
   RUN();
@@ -9753,6 +9821,10 @@
   ASSERT_EQUAL_32(NFlag, w7);
   ASSERT_EQUAL_32(ZCFlag, w8);
   ASSERT_EQUAL_32(ZCFlag, w9);
+  ASSERT_EQUAL_32(ZCFlag, w10);
+  ASSERT_EQUAL_32(CFlag, w11);
+  ASSERT_EQUAL_32(CVFlag, w12);
+  ASSERT_EQUAL_32(CVFlag, w13);
 
   TEARDOWN();
 }
@@ -9813,6 +9885,19 @@
     __ Fcmp(d19, 12.3456);
     temps.Exclude(d0);
     __ Mrs(x16, NZCV);
+
+    __ Fcmpe(s8, s8);
+    __ Mrs(x22, NZCV);
+    __ Fcmpe(s8, 0.0);
+    __ Mrs(x23, NZCV);
+    __ Fcmpe(d19, d19);
+    __ Mrs(x24, NZCV);
+    __ Fcmpe(d19, 0.0);
+    __ Mrs(x25, NZCV);
+    __ Fcmpe(s18, s18);
+    __ Mrs(x26, NZCV);
+    __ Fcmpe(d21, d21);
+    __ Mrs(x27, NZCV);
   }
 
   END();
@@ -9833,6 +9918,12 @@
   ASSERT_EQUAL_32(CVFlag, w14);
   ASSERT_EQUAL_32(ZCFlag, w15);
   ASSERT_EQUAL_32(NFlag, w16);
+  ASSERT_EQUAL_32(ZCFlag, w22);
+  ASSERT_EQUAL_32(ZCFlag, w23);
+  ASSERT_EQUAL_32(ZCFlag, w24);
+  ASSERT_EQUAL_32(ZCFlag, w25);
+  ASSERT_EQUAL_32(CVFlag, w26);
+  ASSERT_EQUAL_32(CVFlag, w27);
 
   TEARDOWN();
 }
@@ -11869,16 +11960,16 @@
   double expected_ucvtf_base = rawbits_to_double(expected_ucvtf_bits);
 
   for (int fbits = 0; fbits <= 32; fbits++) {
-    double expected_scvtf = expected_scvtf_base / pow(2, fbits);
-    double expected_ucvtf = expected_ucvtf_base / pow(2, fbits);
+    double expected_scvtf = expected_scvtf_base / std::pow(2, fbits);
+    double expected_ucvtf = expected_ucvtf_base / std::pow(2, fbits);
     ASSERT_EQUAL_FP64(expected_scvtf, results_scvtf_x[fbits]);
     ASSERT_EQUAL_FP64(expected_ucvtf, results_ucvtf_x[fbits]);
     if (cvtf_s32) ASSERT_EQUAL_FP64(expected_scvtf, results_scvtf_w[fbits]);
     if (cvtf_u32) ASSERT_EQUAL_FP64(expected_ucvtf, results_ucvtf_w[fbits]);
   }
   for (int fbits = 33; fbits <= 64; fbits++) {
-    double expected_scvtf = expected_scvtf_base / pow(2, fbits);
-    double expected_ucvtf = expected_ucvtf_base / pow(2, fbits);
+    double expected_scvtf = expected_scvtf_base / std::pow(2, fbits);
+    double expected_ucvtf = expected_ucvtf_base / std::pow(2, fbits);
     ASSERT_EQUAL_FP64(expected_scvtf, results_scvtf_x[fbits]);
     ASSERT_EQUAL_FP64(expected_ucvtf, results_ucvtf_x[fbits]);
   }
@@ -12023,18 +12114,16 @@
   float expected_ucvtf_base = rawbits_to_float(expected_ucvtf_bits);
 
   for (int fbits = 0; fbits <= 32; fbits++) {
-    float expected_scvtf = expected_scvtf_base / powf(2, fbits);
-    float expected_ucvtf = expected_ucvtf_base / powf(2, fbits);
+    float expected_scvtf = expected_scvtf_base / std::pow(2.0f, fbits);
+    float expected_ucvtf = expected_ucvtf_base / std::pow(2.0f, fbits);
     ASSERT_EQUAL_FP32(expected_scvtf, results_scvtf_x[fbits]);
     ASSERT_EQUAL_FP32(expected_ucvtf, results_ucvtf_x[fbits]);
     if (cvtf_s32) ASSERT_EQUAL_FP32(expected_scvtf, results_scvtf_w[fbits]);
     if (cvtf_u32) ASSERT_EQUAL_FP32(expected_ucvtf, results_ucvtf_w[fbits]);
-    break;
   }
   for (int fbits = 33; fbits <= 64; fbits++) {
-    break;
-    float expected_scvtf = expected_scvtf_base / powf(2, fbits);
-    float expected_ucvtf = expected_ucvtf_base / powf(2, fbits);
+    float expected_scvtf = expected_scvtf_base / std::pow(2.0f, fbits);
+    float expected_ucvtf = expected_ucvtf_base / std::pow(2.0f, fbits);
     ASSERT_EQUAL_FP32(expected_scvtf, results_scvtf_x[fbits]);
     ASSERT_EQUAL_FP32(expected_ucvtf, results_ucvtf_x[fbits]);
   }
@@ -12617,6 +12706,10 @@
   SETUP();
   START();
 
+  // Acquire all temps from the MacroAssembler. They are used arbitrarily below.
+  UseScratchRegisterScope temps(&masm);
+  temps.ExcludeAll();
+
   // The literal base is chosen to have two useful properties:
   //  * When multiplied by small values (such as a register index), this value
   //    is clearly readable in the result.
@@ -12687,6 +12780,10 @@
   SETUP();
   START();
 
+  // Acquire all temps from the MacroAssembler. They are used arbitrarily below.
+  UseScratchRegisterScope temps(&masm);
+  temps.ExcludeAll();
+
   // The literal base is chosen to have two useful properties:
   //  * When multiplied by small values (such as a register index), this value
   //    is clearly readable in the result.
@@ -12769,6 +12866,121 @@
 }
 
 
+TEST(load_store_reglist) {
+  SETUP();
+  START();
+
+  // The literal base is chosen to have two useful properties:
+  //  * When multiplied by small values (such as a register index), this value
+  //    is clearly readable in the result.
+  //  * The value is not formed from repeating fixed-size smaller values, so it
+  //    can be used to detect endianness-related errors.
+  uint64_t high_base = UINT32_C(0x01000010);
+  uint64_t low_base =  UINT32_C(0x00100101);
+  uint64_t base = (high_base << 32) | low_base;
+  uint64_t array[21];
+  memset(array, 0, sizeof(array));
+
+  // Initialize the registers.
+  __ Mov(x1, base);
+  __ Add(x2, x1, x1);
+  __ Add(x3, x2, x1);
+  __ Add(x4, x3, x1);
+  __ Fmov(d1, x1);
+  __ Fmov(d2, x2);
+  __ Fmov(d3, x3);
+  __ Fmov(d4, x4);
+  __ Fmov(d5, x1);
+  __ Fmov(d6, x2);
+  __ Fmov(d7, x3);
+  __ Fmov(d8, x4);
+
+  Register reg_base = x20;
+  Register reg_index = x21;
+  int size_stored = 0;
+
+  __ Mov(reg_base, reinterpret_cast<uintptr_t>(&array));
+
+  // Test aligned accesses.
+  CPURegList list_src(w1, w2, w3, w4);
+  CPURegList list_dst(w11, w12, w13, w14);
+  CPURegList list_fp_src_1(d1, d2, d3, d4);
+  CPURegList list_fp_dst_1(d11, d12, d13, d14);
+
+  __ StoreCPURegList(list_src, MemOperand(reg_base, 0 * sizeof(uint64_t)));
+  __ LoadCPURegList(list_dst, MemOperand(reg_base, 0 * sizeof(uint64_t)));
+  size_stored += 4 * kWRegSizeInBytes;
+
+  __ Mov(reg_index, size_stored);
+  __ StoreCPURegList(list_src, MemOperand(reg_base, reg_index));
+  __ LoadCPURegList(list_dst, MemOperand(reg_base, reg_index));
+  size_stored += 4 * kWRegSizeInBytes;
+
+  __ StoreCPURegList(list_fp_src_1, MemOperand(reg_base, size_stored));
+  __ LoadCPURegList(list_fp_dst_1, MemOperand(reg_base, size_stored));
+  size_stored += 4 * kDRegSizeInBytes;
+
+  __ Mov(reg_index, size_stored);
+  __ StoreCPURegList(list_fp_src_1, MemOperand(reg_base, reg_index));
+  __ LoadCPURegList(list_fp_dst_1, MemOperand(reg_base, reg_index));
+  size_stored += 4 * kDRegSizeInBytes;
+
+  // Test unaligned accesses.
+  CPURegList list_fp_src_2(d5, d6, d7, d8);
+  CPURegList list_fp_dst_2(d15, d16, d17, d18);
+
+  __ Str(wzr, MemOperand(reg_base, size_stored));
+  size_stored += 1 * kWRegSizeInBytes;
+  __ StoreCPURegList(list_fp_src_2, MemOperand(reg_base, size_stored));
+  __ LoadCPURegList(list_fp_dst_2, MemOperand(reg_base, size_stored));
+  size_stored += 4 * kDRegSizeInBytes;
+
+  __ Mov(reg_index, size_stored);
+  __ StoreCPURegList(list_fp_src_2, MemOperand(reg_base, reg_index));
+  __ LoadCPURegList(list_fp_dst_2, MemOperand(reg_base, reg_index));
+
+  END();
+  RUN();
+
+  VIXL_CHECK(array[0] == (1 * low_base) + (2 * low_base << kWRegSize));
+  VIXL_CHECK(array[1] == (3 * low_base) + (4 * low_base << kWRegSize));
+  VIXL_CHECK(array[2] == (1 * low_base) + (2 * low_base << kWRegSize));
+  VIXL_CHECK(array[3] == (3 * low_base) + (4 * low_base << kWRegSize));
+  VIXL_CHECK(array[4] == 1 * base);
+  VIXL_CHECK(array[5] == 2 * base);
+  VIXL_CHECK(array[6] == 3 * base);
+  VIXL_CHECK(array[7] == 4 * base);
+  VIXL_CHECK(array[8] == 1 * base);
+  VIXL_CHECK(array[9] == 2 * base);
+  VIXL_CHECK(array[10] == 3 * base);
+  VIXL_CHECK(array[11] == 4 * base);
+  VIXL_CHECK(array[12] == ((1 * low_base) << kSRegSize));
+  VIXL_CHECK(array[13] == (((2 * low_base) << kSRegSize) | (1 * high_base)));
+  VIXL_CHECK(array[14] == (((3 * low_base) << kSRegSize) | (2 * high_base)));
+  VIXL_CHECK(array[15] == (((4 * low_base) << kSRegSize) | (3 * high_base)));
+  VIXL_CHECK(array[16] == (((1 * low_base) << kSRegSize) | (4 * high_base)));
+  VIXL_CHECK(array[17] == (((2 * low_base) << kSRegSize) | (1 * high_base)));
+  VIXL_CHECK(array[18] == (((3 * low_base) << kSRegSize) | (2 * high_base)));
+  VIXL_CHECK(array[19] == (((4 * low_base) << kSRegSize) | (3 * high_base)));
+  VIXL_CHECK(array[20] == (4 * high_base));
+
+  ASSERT_EQUAL_64(1 * low_base, x11);
+  ASSERT_EQUAL_64(2 * low_base, x12);
+  ASSERT_EQUAL_64(3 * low_base, x13);
+  ASSERT_EQUAL_64(4 * low_base, x14);
+  ASSERT_EQUAL_FP64(rawbits_to_double(1 * base), d11);
+  ASSERT_EQUAL_FP64(rawbits_to_double(2 * base), d12);
+  ASSERT_EQUAL_FP64(rawbits_to_double(3 * base), d13);
+  ASSERT_EQUAL_FP64(rawbits_to_double(4 * base), d14);
+  ASSERT_EQUAL_FP64(rawbits_to_double(1 * base), d15);
+  ASSERT_EQUAL_FP64(rawbits_to_double(2 * base), d16);
+  ASSERT_EQUAL_FP64(rawbits_to_double(3 * base), d17);
+  ASSERT_EQUAL_FP64(rawbits_to_double(4 * base), d18);
+
+  TEARDOWN();
+}
+
+
 // This enum is used only as an argument to the push-pop test helpers.
 enum PushPopMethod {
   // Push or Pop using the Push and Pop methods, with blocks of up to four
@@ -12814,6 +13026,10 @@
   RegList list = PopulateRegisterArray(NULL, x, r, reg_size, reg_count,
                                        allowed);
 
+  // Acquire all temps from the MacroAssembler. They are used arbitrarily below.
+  UseScratchRegisterScope temps(&masm);
+  temps.ExcludeAll();
+
   // The literal base is chosen to have two useful properties:
   //  * When multiplied by small values (such as a register index), this value
   //    is clearly readable in the result.
@@ -12993,6 +13209,10 @@
   // Arbitrarily pick a register to use as a stack pointer.
   const Register& stack_pointer = x10;
 
+  // Acquire all temps from the MacroAssembler. They are used arbitrarily below.
+  UseScratchRegisterScope temps(&masm);
+  temps.ExcludeAll();
+
   // The literal base is chosen to have two useful properties:
   //  * When multiplied (using an integer) by small values (such as a register
   //    index), this value is clearly readable in the result.
@@ -13167,6 +13387,10 @@
     r6_to_r9 |= x[i].Bit();
   }
 
+  // Acquire all temps from the MacroAssembler. They are used arbitrarily below.
+  UseScratchRegisterScope temps(&masm);
+  temps.ExcludeAll();
+
   // The literal base is chosen to have two useful properties:
   //  * When multiplied by small values (such as a register index), this value
   //    is clearly readable in the result.
@@ -13267,6 +13491,10 @@
     stack[i] = 0xdeadbeef;
   }
 
+  // Acquire all temps from the MacroAssembler. They are used arbitrarily below.
+  UseScratchRegisterScope temps(&masm);
+  temps.ExcludeAll();
+
   // The literal base is chosen to have two useful properties:
   //  * When multiplied by small values (such as a register index), this value
   //    is clearly readable in the result.
@@ -13446,6 +13674,10 @@
 
   VIXL_ASSERT(sp.Is(__ StackPointer()));
 
+  // Acquire all temps from the MacroAssembler. They are used arbitrarily below.
+  UseScratchRegisterScope temps(&masm);
+  temps.ExcludeAll();
+
   __ Mov(x3, 0x3333333333333333);
   __ Mov(x2, 0x2222222222222222);
   __ Mov(x1, 0x1111111111111111);
@@ -14154,8 +14386,8 @@
 
 
 static void ProcessNaNsHelper(double n, double m, double expected) {
-  VIXL_ASSERT(isnan(n) || isnan(m));
-  VIXL_ASSERT(isnan(expected));
+  VIXL_ASSERT(std::isnan(n) || std::isnan(m));
+  VIXL_ASSERT(std::isnan(expected));
 
   SETUP();
   START();
@@ -14225,8 +14457,8 @@
 
 
 static void ProcessNaNsHelper(float n, float m, float expected) {
-  VIXL_ASSERT(isnan(n) || isnan(m));
-  VIXL_ASSERT(isnan(expected));
+  VIXL_ASSERT(std::isnan(n) || std::isnan(m));
+  VIXL_ASSERT(std::isnan(expected));
 
   SETUP();
   START();
@@ -14296,10 +14528,10 @@
 
 
 static void DefaultNaNHelper(float n, float m, float a) {
-  VIXL_ASSERT(isnan(n) || isnan(m) || isnan(a));
+  VIXL_ASSERT(std::isnan(n) || std::isnan(m) || std::isnan(a));
 
-  bool test_1op = isnan(n);
-  bool test_2op = isnan(n) || isnan(m);
+  bool test_1op = std::isnan(n);
+  bool test_2op = std::isnan(n) || std::isnan(m);
 
   SETUP();
   START();
@@ -14423,10 +14655,10 @@
 
 
 static void DefaultNaNHelper(double n, double m, double a) {
-  VIXL_ASSERT(isnan(n) || isnan(m) || isnan(a));
+  VIXL_ASSERT(std::isnan(n) || std::isnan(m) || std::isnan(a));
 
-  bool test_1op = isnan(n);
-  bool test_2op = isnan(n) || isnan(m);
+  bool test_1op = std::isnan(n);
+  bool test_2op = std::isnan(n) || std::isnan(m);
 
   SETUP();
   START();
diff --git a/test/test-disasm-a64.cc b/test/test-disasm-a64.cc
index 63f1b41..036d755 100644
--- a/test/test-disasm-a64.cc
+++ b/test/test-disasm-a64.cc
@@ -28,8 +28,8 @@
 #include <cstring>
 #include "test-runner.h"
 
-#include "a64/macro-assembler-a64.h"
-#include "a64/disasm-a64.h"
+#include "vixl/a64/macro-assembler-a64.h"
+#include "vixl/a64/disasm-a64.h"
 
 #define TEST(name)  TEST_(DISASM_##name)
 
@@ -457,6 +457,7 @@
   COMPARE(smull(x0, w0, w1), "smull x0, w0, w1");
   COMPARE(smull(x30, w30, w0), "smull x30, w30, w0");
   COMPARE(smulh(x0, x1, x2), "smulh x0, x1, x2");
+  COMPARE(umulh(x0, x2, x1), "umulh x0, x2, x1");
 
   COMPARE(sdiv(w0, w1, w2), "sdiv w0, w1, w2");
   COMPARE(sdiv(x3, x4, x5), "sdiv x3, x4, x5");
@@ -2361,6 +2362,13 @@
   COMPARE(fcmp(s12, 0), "fcmp s12, #0.0");
   COMPARE(fcmp(d12, 0), "fcmp d12, #0.0");
 
+  COMPARE(fcmpe(s0, s1), "fcmpe s0, s1");
+  COMPARE(fcmpe(s31, s30), "fcmpe s31, s30");
+  COMPARE(fcmpe(d0, d1), "fcmpe d0, d1");
+  COMPARE(fcmpe(d31, d30), "fcmpe d31, d30");
+  COMPARE(fcmpe(s12, 0), "fcmpe s12, #0.0");
+  COMPARE(fcmpe(d12, 0), "fcmpe d12, #0.0");
+
   CLEANUP();
 }
 
@@ -2379,6 +2387,17 @@
   COMPARE(fccmp(s14, s15, CVFlag, al), "fccmp s14, s15, #nzCV, al");
   COMPARE(fccmp(d16, d17, CFlag, nv), "fccmp d16, d17, #nzCv, nv");
 
+  COMPARE(fccmpe(s0, s1, NoFlag, eq), "fccmpe s0, s1, #nzcv, eq");
+  COMPARE(fccmpe(s2, s3, ZVFlag, ne), "fccmpe s2, s3, #nZcV, ne");
+  COMPARE(fccmpe(s30, s16, NCFlag, pl), "fccmpe s30, s16, #NzCv, pl");
+  COMPARE(fccmpe(s31, s31, NZCVFlag, le), "fccmpe s31, s31, #NZCV, le");
+  COMPARE(fccmpe(d4, d5, VFlag, gt), "fccmpe d4, d5, #nzcV, gt");
+  COMPARE(fccmpe(d6, d7, NFlag, vs), "fccmpe d6, d7, #Nzcv, vs");
+  COMPARE(fccmpe(d30, d0, NZFlag, vc), "fccmpe d30, d0, #NZcv, vc");
+  COMPARE(fccmpe(d31, d31, ZFlag, hs), "fccmpe d31, d31, #nZcv, hs");
+  COMPARE(fccmpe(s14, s15, CVFlag, al), "fccmpe s14, s15, #nzCV, al");
+  COMPARE(fccmpe(d16, d17, CFlag, nv), "fccmpe d16, d17, #nzCv, nv");
+
   CLEANUP();
 }
 
@@ -2655,6 +2674,12 @@
   COMPARE(Add(w19, w3, -0x344), "sub w19, w3, #0x344 (836)");
   COMPARE(Add(w20, w4, -2000), "sub w20, w4, #0x7d0 (2000)");
 
+  COMPARE(Add(w0, w1, 5, LeaveFlags), "add w0, w1, #0x5 (5)");
+  COMPARE(Add(w1, w2, 15, SetFlags), "adds w1, w2, #0xf (15)");
+
+  COMPARE(Sub(w0, w1, 5, LeaveFlags), "sub w0, w1, #0x5 (5)");
+  COMPARE(Sub(w1, w2, 15, SetFlags), "subs w1, w2, #0xf (15)");
+
   COMPARE(Sub(w21, w3, -0xbc), "add w21, w3, #0xbc (188)");
   COMPARE(Sub(w22, w4, -2000), "add w22, w4, #0x7d0 (2000)");
 
diff --git a/test/test-fuzz-a64.cc b/test/test-fuzz-a64.cc
index f488201..c73d71b 100644
--- a/test/test-fuzz-a64.cc
+++ b/test/test-fuzz-a64.cc
@@ -27,8 +27,8 @@
 #include <stdlib.h>
 #include "test-runner.h"
 
-#include "a64/decoder-a64.h"
-#include "a64/disasm-a64.h"
+#include "vixl/a64/decoder-a64.h"
+#include "vixl/a64/disasm-a64.h"
 
 #define TEST(name)  TEST_(FUZZ_##name)
 
diff --git a/test/test-invalset.cc b/test/test-invalset.cc
index a17755e..2fb2b34 100644
--- a/test/test-invalset.cc
+++ b/test/test-invalset.cc
@@ -26,7 +26,7 @@
 
 #include "test-runner.h"
 
-#include "invalset.h"
+#include "vixl/invalset.h"
 
 namespace vixl {
 
diff --git a/test/test-runner.h b/test/test-runner.h
index 40709c1..3acf053 100644
--- a/test/test-runner.h
+++ b/test/test-runner.h
@@ -27,7 +27,7 @@
 #ifndef TEST_TEST_H_
 #define TEST_TEST_H_
 
-#include "utils.h"
+#include "vixl/utils.h"
 
 namespace vixl {
 
diff --git a/test/test-simulator-a64.cc b/test/test-simulator-a64.cc
index f389ece..b83642c 100644
--- a/test/test-simulator-a64.cc
+++ b/test/test-simulator-a64.cc
@@ -31,8 +31,8 @@
 #include "test-utils-a64.h"
 #include "test-simulator-inputs-a64.h"
 #include "test-simulator-traces-a64.h"
-#include "a64/macro-assembler-a64.h"
-#include "a64/simulator-a64.h"
+#include "vixl/a64/macro-assembler-a64.h"
+#include "vixl/a64/simulator-a64.h"
 
 namespace vixl {
 
diff --git a/test/test-utils-a64.cc b/test/test-utils-a64.cc
index 10923a1..9de5b44 100644
--- a/test/test-utils-a64.cc
+++ b/test/test-utils-a64.cc
@@ -26,13 +26,13 @@
 
 #include "test-utils-a64.h"
 
-#include <math.h>   // Needed for isnan().
+#include <cmath>
 
 #include "test-runner.h"
-#include "a64/macro-assembler-a64.h"
-#include "a64/simulator-a64.h"
-#include "a64/disasm-a64.h"
-#include "a64/cpu-a64.h"
+#include "vixl/a64/macro-assembler-a64.h"
+#include "vixl/a64/simulator-a64.h"
+#include "vixl/a64/disasm-a64.h"
+#include "vixl/a64/cpu-a64.h"
 
 #define __ masm->
 
@@ -85,7 +85,7 @@
   if (float_to_rawbits(expected) == float_to_rawbits(result)) {
     return true;
   } else {
-    if (isnan(expected) || (expected == 0.0)) {
+    if (std::isnan(expected) || (expected == 0.0)) {
       printf("Expected 0x%08" PRIx32 "\t Found 0x%08" PRIx32 "\n",
              float_to_rawbits(expected), float_to_rawbits(result));
     } else {
@@ -104,7 +104,7 @@
     return true;
   }
 
-  if (isnan(expected) || (expected == 0.0)) {
+  if (std::isnan(expected) || (expected == 0.0)) {
     printf("Expected 0x%016" PRIx64 "\t Found 0x%016" PRIx64 "\n",
            double_to_rawbits(expected), double_to_rawbits(result));
   } else {
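As the hunks above show, expected and computed floating-point values are
compared by their raw bit patterns, so distinct NaN encodings and the sign of
zero are not conflated; isnan is only consulted to pick a more useful failure
message. A condensed sketch of the same idea for doubles, using the
double_to_rawbits helper from vixl/utils.h (the EqualBits name is
illustrative):

  #include <cmath>
  #include <cstdio>

  #include "vixl/utils.h"

  // Exact comparison via raw bits: 0.0 and -0.0 differ, and only
  // identically-encoded NaNs compare equal.
  bool EqualBits(double expected, double result) {
    if (vixl::double_to_rawbits(expected) == vixl::double_to_rawbits(result)) {
      return true;
    }
    if (std::isnan(expected) || (expected == 0.0)) {
      // These values can print identically in decimal; show the bits instead.
      std::printf("Expected 0x%016llx\t Found 0x%016llx\n",
                  static_cast<unsigned long long>(
                      vixl::double_to_rawbits(expected)),
                  static_cast<unsigned long long>(
                      vixl::double_to_rawbits(result)));
    } else {
      std::printf("Expected %g\t Found %g\n", expected, result);
    }
    return false;
  }
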
diff --git a/test/test-utils-a64.h b/test/test-utils-a64.h
index 3612809..94d7745 100644
--- a/test/test-utils-a64.h
+++ b/test/test-utils-a64.h
@@ -28,10 +28,10 @@
 #define VIXL_A64_TEST_UTILS_A64_H_
 
 #include "test-runner.h"
-#include "a64/macro-assembler-a64.h"
-#include "a64/simulator-a64.h"
-#include "a64/disasm-a64.h"
-#include "a64/cpu-a64.h"
+#include "vixl/a64/macro-assembler-a64.h"
+#include "vixl/a64/simulator-a64.h"
+#include "vixl/a64/disasm-a64.h"
+#include "vixl/a64/cpu-a64.h"
 
 namespace vixl {
 
diff --git a/tools/presubmit.py b/tools/presubmit.py
index 4342aa6..bd9f2b5 100755
--- a/tools/presubmit.py
+++ b/tools/presubmit.py
@@ -40,6 +40,10 @@
 import util
 
 
+SUPPORTED_COMPILERS = ['g++', 'clang++']
+OBJ_DIR = './obj'
+
+
 def BuildOptions():
   result = argparse.ArgumentParser(
           description='Run the linter and unit tests.',
@@ -53,9 +57,11 @@
                       help='Do not run the linter. Run the tests only.')
   result.add_argument('--noclean', action='store_true',
                       help='Do not clean before build.')
+  result.add_argument('--fast', action='store_true',
+                      help='Only test with one toolchain.')
   result.add_argument('--jobs', '-j', metavar='N', type=int, nargs='?',
                       default=1, const=multiprocessing.cpu_count(),
-                      help='''Runs the tests using N jobs. If the option is set
+                      help='''Run the tests using N jobs. If the option is set
                       but no value is provided, the script will use as many jobs
                       as it thinks useful.''')
   sim_default = 'off' if platform.machine() == 'aarch64' else 'on'
@@ -65,30 +71,72 @@
   return result.parse_args()
 
 
-def CleanBuildSystem():
-  def clean(mode):
-    if args.verbose: print('Cleaning ' + mode + ' mode test...')
-    command = 'scons mode=%s simulator=%s all --clean' % \
-              (mode, args.simulator)
+def check_supported(compiler, mode, std):
+  if compiler not in SUPPORTED_COMPILERS:
+    print 'Invalid compiler.'
+    sys.exit(1)
+  if mode not in ['release', 'debug']:
+    print 'Invalid mode.'
+    sys.exit(1)
+  if std not in ['c++98', 'c++11']:
+    print 'Invalid c++ standard.'
+    sys.exit(1)
+
+
+def initialize_compiler_list():
+  compiler_list = []
+  for compiler in SUPPORTED_COMPILERS:
+    if util.has_compiler(compiler) and (len(compiler_list) == 0 or not args.fast):
+      compiler_list.append(compiler)
+    else:
+      # This warning suffices for args.fast too.
+      print 'WARNING: Skipping ' + compiler + ' tests.'
+  if len(compiler_list) == 0:
+    util.abort('Found no supported compilers')
+  return compiler_list
+
+
+def CleanBuildSystem(compiler):
+  def clean(compiler, mode, std):
+    check_supported(compiler, mode, std)
+    os.environ['CXX'] = compiler
+    if args.verbose:
+      print 'Cleaning ' + compiler + ' ' + std + ' ' \
+            + mode + ' mode test...'
+    command = 'scons mode=%s std=%s simulator=%s all --clean' % \
+              (mode, std, args.simulator)
     status, output = util.getstatusoutput(command)
     if status != 0:
       print(output)
       util.abort('Failed cleaning test: ' + command)
-  clean('debug')
-  clean('release')
+
+  clean(compiler, 'debug',    'c++98')
+  clean(compiler, 'debug',    'c++11')
+  clean(compiler, 'release',  'c++98')
+  clean(compiler, 'release',  'c++11')
 
 
-def BuildEverything():
-  def build(mode):
-    if args.verbose: print('Building ' + mode + ' mode test...')
-    command = 'scons mode=%s simulator=%s all -j%u' % \
-              (mode, args.simulator, args.jobs)
+def BuildEverything(compiler):
+  def build(compiler, mode, std):
+    check_supported(compiler, mode, std)
+    os.environ['CXX'] = compiler
+    if args.verbose:
+      print 'Building ' + compiler + ' ' +  std + ' ' \
+            + mode + ' mode test...'
+    if args.jobs == 1:
+      print '- This may take a while. Pass `-j` to use multiple threads.'
+    command = 'scons mode=%s std=%s simulator=%s all -j%u' % \
+              (mode, std, args.simulator, args.jobs)
     status, output = util.getstatusoutput(command)
     if status != 0:
       print(output)
       util.abort('Failed building test: ' + command)
-  build('debug')
-  build('release')
+
+  print 'Building ' + compiler + ' tests...'
+  build(compiler, 'debug',    'c++98')
+  build(compiler, 'debug',    'c++11')
+  build(compiler, 'release',  'c++98')
+  build(compiler, 'release',  'c++11')
 
 
 NOT_RUN = 'NOT RUN'
@@ -101,7 +149,7 @@
     self.status = NOT_RUN
 
   def name_prefix(self):
-    return '%-26s : ' % self.name
+    return '%-40s : ' % self.name
 
 
 class Tester:
@@ -121,33 +169,36 @@
 
 
 class VIXLTest(Test):
-  def __init__(self, mode, simulator, debugger = False, verbose = False):
-    if not mode in ['release', 'debug']:
-      print 'Invalid mode.'
-      sys.exit(1)
-
-    self.debugger = debugger
+  def __init__(self, compiler, mode, std, simulator, debugger = False, verbose = False):
+    check_supported(compiler, mode, std)
     self.verbose = verbose
+    self.debugger = debugger
+    self.compiler = compiler
+    self.mode = mode
+    self.std = std
 
-    name = 'test ' + mode
+    name = 'test ' + compiler + ' ' + std + ' ' + mode
     if simulator:
       name += ' (%s)' % ('debugger' if debugger else 'simulator')
     Test.__init__(self, name)
 
-    self.exe = './test-runner'
+    self.exe = 'test-runner'
     if simulator:
         self.exe += '_sim'
     if mode == 'debug':
       self.exe += '_g'
 
   def Run(self):
-    manifest = test.ReadManifest(self.exe, [], self.debugger,
-                                 False, self.verbose)
+    self.status = PASSED
+    command = os.path.join(OBJ_DIR, self.mode, self.compiler,
+                           self.std, self.exe)
+    manifest = test.ReadManifest(command, [], self.debugger, False, self.verbose)
     retcode = test.RunTests(manifest, jobs = args.jobs,
                             verbose = self.verbose, debugger = self.debugger,
                             progress_prefix = self.name_prefix())
     printer.EnsureNewLine()
-    self.status = PASSED if retcode == 0 else FAILED
+    if retcode != 0:
+      self.status = FAILED
 
 
 class LintTest(Test):
@@ -167,13 +218,17 @@
     n_errors = lint.LintFiles(lint.default_tracked_files,
                               jobs = args.jobs, verbose = args.verbose,
                               progress_prefix = self.name_prefix())
-
     self.status = PASSED if n_errors == 0 else FAILED
 
 
 class BenchTest(Test):
-  def __init__(self, mode, simulator):
-    name = 'benchmarks ' + mode
+  def __init__(self, compiler, mode, std, simulator):
+    check_supported(compiler, mode, std)
+    self.compiler = compiler
+    self.mode = mode
+    self.std = std
+
+    name = 'benchmarks ' + compiler + ' ' + std + ' ' + mode
     Test.__init__(self, name)
     self.exe_suffix = ''
     if simulator:
@@ -186,7 +241,8 @@
                   'bench-branch-masm', 'bench-branch-link-masm']
     self.status = PASSED
     for bench in benchmarks:
-      command = './' + bench + self.exe_suffix
+      command = os.path.join(OBJ_DIR, self.mode, self.compiler, self.std,
+                             bench + self.exe_suffix)
       (rc, out) = util.getstatusoutput(command)
       if rc != 0:
         self.status = FAILED
@@ -206,31 +262,44 @@
     print 'WARNING: This is not a Git repository. The linter will not run.'
     args.nolint = True
 
-  tester = Tester()
   if not args.nolint:
     import lint
-    tester.AddTest(LintTest())
+    LintTest().Run()
 
   if not args.notest:
-    if not args.noclean:
-      CleanBuildSystem()
-    BuildEverything()
+    tester = Tester()
+    compiler_list = initialize_compiler_list()
 
-    if args.simulator == 'on':
-      #                        mode,      sim,   debugger, verbose
-      tester.AddTest(VIXLTest('release',  True,  True,     args.verbose))
-      tester.AddTest(VIXLTest('debug',    True,  True,     args.verbose))
-      tester.AddTest(VIXLTest('release',  True,  False,    args.verbose))
-      tester.AddTest(VIXLTest('debug',    True,  False,    args.verbose))
-      tester.AddTest(BenchTest('release', True))
-      tester.AddTest(BenchTest('debug',   True))
-    else:
-      tester.AddTest(VIXLTest('release',  False, False,    args.verbose))
-      tester.AddTest(VIXLTest('debug',    False, False,    args.verbose))
-      tester.AddTest(BenchTest('release', False))
-      tester.AddTest(BenchTest('debug',   False))
+    for compiler in compiler_list:
+      if not args.noclean:
+        CleanBuildSystem(compiler)
+      BuildEverything(compiler)
 
-  tester.RunAll()
+      if args.simulator == 'on':
+        #                                 mode,       std,      sim,   debugger, verbose
+        tester.AddTest(VIXLTest(compiler, 'release',  'c++98',  True,  True,     args.verbose))
+        tester.AddTest(VIXLTest(compiler, 'debug',    'c++98',  True,  True,     args.verbose))
+        tester.AddTest(VIXLTest(compiler, 'release',  'c++98',  True,  False,    args.verbose))
+        tester.AddTest(VIXLTest(compiler, 'debug',    'c++98',  True,  False,    args.verbose))
+        tester.AddTest(VIXLTest(compiler, 'release',  'c++11',  True,  True,     args.verbose))
+        tester.AddTest(VIXLTest(compiler, 'debug',    'c++11',  True,  True,     args.verbose))
+        tester.AddTest(VIXLTest(compiler, 'release',  'c++11',  True,  False,    args.verbose))
+        tester.AddTest(VIXLTest(compiler, 'debug',    'c++11',  True,  False,    args.verbose))
+        tester.AddTest(BenchTest(compiler,'release',  'c++98',  True))
+        tester.AddTest(BenchTest(compiler,'debug',    'c++98',  True))
+        tester.AddTest(BenchTest(compiler,'release',  'c++11',  True))
+        tester.AddTest(BenchTest(compiler,'debug',    'c++11',  True))
+      else:
+        tester.AddTest(VIXLTest(compiler, 'release',  'c++98',  False, False,    args.verbose))
+        tester.AddTest(VIXLTest(compiler, 'debug',    'c++98',  False, False,    args.verbose))
+        tester.AddTest(VIXLTest(compiler, 'release',  'c++11',  False, False,    args.verbose))
+        tester.AddTest(VIXLTest(compiler, 'debug',    'c++11',  False, False,    args.verbose))
+        tester.AddTest(BenchTest(compiler,'release',  'c++98',  False))
+        tester.AddTest(BenchTest(compiler,'debug',    'c++98',  False))
+        tester.AddTest(BenchTest(compiler,'release',  'c++11',  False))
+        tester.AddTest(BenchTest(compiler,'debug',    'c++11',  False))
+
+    tester.RunAll()
 
   if git.is_git_repository_root():
     untracked_files = git.get_untracked_files()
diff --git a/tools/util.py b/tools/util.py
index db4a07d..1c127de 100644
--- a/tools/util.py
+++ b/tools/util.py
@@ -24,6 +24,7 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import os
 import sys
 import subprocess
 import shlex
@@ -49,3 +50,8 @@
   lines = text.split('\n')
   last = lines[-1].split('\r')
   return last[-1]
+
+
+def has_compiler(compiler):
+  status, output = getstatusoutput('which ' + compiler)
+  return status == 0