// Copyright 2019, VIXL authors
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of ARM Limited nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <cfloat>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <functional>
#include <sys/mman.h>
#include <unistd.h>
#include "test-runner.h"
#include "test-utils.h"
#include "aarch64/cpu-aarch64.h"
#include "aarch64/disasm-aarch64.h"
#include "aarch64/macro-assembler-aarch64.h"
#include "aarch64/simulator-aarch64.h"
#include "aarch64/test-utils-aarch64.h"
#include "test-assembler-aarch64.h"
#define TEST_SVE(name) TEST_SVE_INNER("ASM", name)
namespace vixl {
namespace aarch64 {
// Conveniently initialise P registers with scalar bit patterns. The destination
// lane size is ignored. This is optimised for call-site clarity, not generated
// code quality.
//
// Usage:
//
// Initialise(&masm, p0, 0x1234); // Sets p0 = 0b'0001'0010'0011'0100
void Initialise(MacroAssembler* masm,
const PRegister& pd,
uint64_t value3,
uint64_t value2,
uint64_t value1,
uint64_t value0) {
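// Four 64-bit chunks cover kPRegMaxSizeInBytes (32) bytes: the size of a P
// register at the maximum 2048-bit vector length.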
// Generate an in-line literal pool holding the predicate value, and load `pd`
// from it.
UseScratchRegisterScope temps(masm);
Register temp = temps.AcquireX();
Label data;
Label done;
masm->Adr(temp, &data);
masm->Ldr(pd, SVEMemOperand(temp));
masm->B(&done);
{
ExactAssemblyScope total(masm, kPRegMaxSizeInBytes);
masm->bind(&data);
masm->dc64(value0);
masm->dc64(value1);
masm->dc64(value2);
masm->dc64(value3);
}
masm->Bind(&done);
}
void Initialise(MacroAssembler* masm,
const PRegister& pd,
uint64_t value2,
uint64_t value1,
uint64_t value0) {
Initialise(masm, pd, 0, value2, value1, value0);
}
void Initialise(MacroAssembler* masm,
const PRegister& pd,
uint64_t value1,
uint64_t value0) {
Initialise(masm, pd, 0, 0, value1, value0);
}
void Initialise(MacroAssembler* masm, const PRegister& pd, uint64_t value0) {
Initialise(masm, pd, 0, 0, 0, value0);
}
// Conveniently initialise P registers by lane. This is optimised for call-site
// clarity, not generated code quality.
//
// Usage:
//
// int values[] = { 0x0, 0x1, 0x2 };
// Initialise(&masm, p0.VnS(), values); // Sets p0 = 0b'0000'0001'0010
//
// The rightmost (highest-indexed) array element maps to the lowest-numbered
// lane. Unspecified lanes are set to 0 (inactive).
//
// Each element of the `values` array is mapped onto a lane in `pd`. The
// architecture only respects the lowest bit, and writes zero to the upper
// bits, but other (encodable) values can be specified if required by the test.
template <typename T, size_t N>
void Initialise(MacroAssembler* masm,
const PRegisterWithLaneSize& pd,
const T (&values)[N]) {
// Turn the array into 64-bit chunks.
uint64_t chunks[4] = {0, 0, 0, 0};
VIXL_STATIC_ASSERT(sizeof(chunks) == kPRegMaxSizeInBytes);
int p_bits_per_lane = pd.GetLaneSizeInBits() / kZRegBitsPerPRegBit;
VIXL_ASSERT((64 % p_bits_per_lane) == 0);
VIXL_ASSERT((N * p_bits_per_lane) <= kPRegMaxSize);
uint64_t p_lane_mask = GetUintMask(p_bits_per_lane);
VIXL_STATIC_ASSERT(N <= kPRegMaxSize);
size_t bit = 0;
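// Pack the elements starting from the highest array index, so that
// values[N - 1] ends up in lane 0.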
for (int n = static_cast<int>(N - 1); n >= 0; n--) {
VIXL_ASSERT(bit < (sizeof(chunks) * kBitsPerByte));
uint64_t value = values[n] & p_lane_mask;
chunks[bit / 64] |= value << (bit % 64);
bit += p_bits_per_lane;
}
Initialise(masm, pd, chunks[3], chunks[2], chunks[1], chunks[0]);
}
// Ensure that basic test infrastructure works.
TEST_SVE(sve_test_infrastructure_z) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
__ Mov(x0, 0x0123456789abcdef);
// Test basic `Insr` behaviour.
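// `Insr` shifts the vector up by one lane, then writes the new value to
// lane 0.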
__ Insr(z0.VnB(), 1);
__ Insr(z0.VnB(), 2);
__ Insr(z0.VnB(), x0);
__ Insr(z0.VnB(), -42);
__ Insr(z0.VnB(), 0);
// Test array inputs.
int z1_inputs[] = {3, 4, 5, -42, 0};
InsrHelper(&masm, z1.VnH(), z1_inputs);
// Test that sign-extension works as intended for various lane sizes.
__ Dup(z2.VnD(), 0); // Clear the register first.
__ Insr(z2.VnB(), -42); // 0xd6
__ Insr(z2.VnB(), 0xfe); // 0xfe
__ Insr(z2.VnH(), -42); // 0xffd6
__ Insr(z2.VnH(), 0xfedc); // 0xfedc
__ Insr(z2.VnS(), -42); // 0xffffffd6
__ Insr(z2.VnS(), 0xfedcba98); // 0xfedcba98
// Use another register for VnD(), so we can support 128-bit Z registers.
__ Insr(z3.VnD(), -42); // 0xffffffffffffffd6
__ Insr(z3.VnD(), 0xfedcba9876543210); // 0xfedcba9876543210
END();
if (CAN_RUN()) {
RUN();
// Test that array checks work properly on a register initialised
// lane-by-lane.
int z0_inputs_b[] = {0x01, 0x02, 0xef, 0xd6, 0x00};
ASSERT_EQUAL_SVE(z0_inputs_b, z0.VnB());
// Test that lane-by-lane checks work properly on a register initialised
// by array.
for (size_t i = 0; i < ArrayLength(z1_inputs); i++) {
// The rightmost (highest-indexed) array element maps to the
// lowest-numbered lane.
int lane = static_cast<int>(ArrayLength(z1_inputs) - i - 1);
ASSERT_EQUAL_SVE_LANE(z1_inputs[i], z1.VnH(), lane);
}
uint64_t z2_inputs_d[] = {0x0000d6feffd6fedc, 0xffffffd6fedcba98};
ASSERT_EQUAL_SVE(z2_inputs_d, z2.VnD());
uint64_t z3_inputs_d[] = {0xffffffffffffffd6, 0xfedcba9876543210};
ASSERT_EQUAL_SVE(z3_inputs_d, z3.VnD());
}
}
// Ensure that basic test infrastructure works.
TEST_SVE(sve_test_infrastructure_p) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// Simple cases: move boolean (0 or 1) values.
int p0_inputs[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0};
Initialise(&masm, p0.VnB(), p0_inputs);
int p1_inputs[] = {1, 0, 1, 1, 0, 1, 1, 1};
Initialise(&masm, p1.VnH(), p1_inputs);
int p2_inputs[] = {1, 1, 0, 1};
Initialise(&masm, p2.VnS(), p2_inputs);
int p3_inputs[] = {0, 1};
Initialise(&masm, p3.VnD(), p3_inputs);
// Advanced cases: move numeric value into architecturally-ignored bits.
// B-sized lanes get one bit in a P register, so there are no ignored bits.
// H-sized lanes get two bits in a P register.
int p4_inputs[] = {0x3, 0x2, 0x1, 0x0, 0x1, 0x2, 0x3};
Initialise(&masm, p4.VnH(), p4_inputs);
// S-sized lanes get four bits in a P register.
int p5_inputs[] = {0xc, 0x7, 0x9, 0x6, 0xf};
Initialise(&masm, p5.VnS(), p5_inputs);
// D-sized lanes get eight bits in a P register.
int p6_inputs[] = {0x81, 0xcc, 0x55};
Initialise(&masm, p6.VnD(), p6_inputs);
// The largest possible P register has 32 bytes.
int p7_inputs[] = {0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f};
Initialise(&masm, p7.VnD(), p7_inputs);
END();
if (CAN_RUN()) {
RUN();
// Test that lane-by-lane checks work properly. The rightmost
// (highest-indexed) array element maps to the lowest-numbered lane.
for (size_t i = 0; i < ArrayLength(p0_inputs); i++) {
int lane = static_cast<int>(ArrayLength(p0_inputs) - i - 1);
ASSERT_EQUAL_SVE_LANE(p0_inputs[i], p0.VnB(), lane);
}
for (size_t i = 0; i < ArrayLength(p1_inputs); i++) {
int lane = static_cast<int>(ArrayLength(p1_inputs) - i - 1);
ASSERT_EQUAL_SVE_LANE(p1_inputs[i], p1.VnH(), lane);
}
for (size_t i = 0; i < ArrayLength(p2_inputs); i++) {
int lane = static_cast<int>(ArrayLength(p2_inputs) - i - 1);
ASSERT_EQUAL_SVE_LANE(p2_inputs[i], p2.VnS(), lane);
}
for (size_t i = 0; i < ArrayLength(p3_inputs); i++) {
int lane = static_cast<int>(ArrayLength(p3_inputs) - i - 1);
ASSERT_EQUAL_SVE_LANE(p3_inputs[i], p3.VnD(), lane);
}
// Test that array checks work properly on predicates initialised with a
// possibly-different lane size.
// 0b...11'10'01'00'01'10'11
int p4_expected[] = {0x39, 0x1b};
ASSERT_EQUAL_SVE(p4_expected, p4.VnD());
ASSERT_EQUAL_SVE(p5_inputs, p5.VnS());
// 0b...10000001'11001100'01010101
int p6_expected[] = {2, 0, 0, 1, 3, 0, 3, 0, 1, 1, 1, 1};
ASSERT_EQUAL_SVE(p6_expected, p6.VnH());
// 0b...10011100'10011101'10011110'10011111
int p7_expected[] = {1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1,
1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1};
ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
}
}
// Test that writes to V registers clear the high bits of the corresponding Z
// register.
TEST_SVE(sve_v_write_clear) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kNEON,
CPUFeatures::kFP,
CPUFeatures::kSVE);
START();
// The Simulator has two mechanisms for writing V registers:
// - Write*Register, calling through to SimRegisterBase::Write.
// - LogicVRegister::ClearForWrite followed by one or more lane updates.
// Try to cover both variants.
// Prepare some known inputs.
uint8_t data[kQRegSizeInBytes];
for (size_t i = 0; i < kQRegSizeInBytes; i++) {
data[i] = 42 + i;
}
__ Mov(x10, reinterpret_cast<uintptr_t>(data));
__ Fmov(d30, 42.0);
// Use Index to label the lane indices, so failures are easy to detect and
// diagnose.
__ Index(z0.VnB(), 0, 1);
__ Index(z1.VnB(), 0, 1);
__ Index(z2.VnB(), 0, 1);
__ Index(z3.VnB(), 0, 1);
__ Index(z4.VnB(), 0, 1);
__ Index(z10.VnB(), 0, -1);
__ Index(z11.VnB(), 0, -1);
__ Index(z12.VnB(), 0, -1);
__ Index(z13.VnB(), 0, -1);
__ Index(z14.VnB(), 0, -1);
// Instructions using Write*Register (and SimRegisterBase::Write).
__ Ldr(b0, MemOperand(x10));
__ Fcvt(h1, d30);
__ Fmov(s2, 1.5f);
__ Fmov(d3, d30);
__ Ldr(q4, MemOperand(x10));
// Instructions using LogicVRegister::ClearForWrite.
// These also (incidentally) test that across-lane instructions correctly
// ignore the high-order Z register lanes.
__ Sminv(b10, v10.V16B());
__ Addv(h11, v11.V4H());
__ Saddlv(s12, v12.V8H());
__ Dup(v13.V8B(), b13, kDRegSizeInBytes);
__ Uaddl(v14.V8H(), v14.V8B(), v14.V8B());
END();
if (CAN_RUN()) {
RUN();
// Check the Q part first.
ASSERT_EQUAL_128(0x0000000000000000, 0x000000000000002a, v0);
ASSERT_EQUAL_128(0x0000000000000000, 0x0000000000005140, v1); // 42.0 (f16)
ASSERT_EQUAL_128(0x0000000000000000, 0x000000003fc00000, v2); // 1.5 (f32)
ASSERT_EQUAL_128(0x0000000000000000, 0x4045000000000000, v3); // 42.0 (f64)
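// q4 holds the bytes loaded from `data`: 42 + i for byte i, i.e. 0x2a to 0x39.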
ASSERT_EQUAL_128(0x3938373635343332, 0x31302f2e2d2c2b2a, v4);
ASSERT_EQUAL_128(0x0000000000000000, 0x00000000000000f1, v10); // -15
// 0xf9fa + 0xfbfc + 0xfdfe + 0xff00 -> 0xf2f4
ASSERT_EQUAL_128(0x0000000000000000, 0x000000000000f2f4, v11);
// 0xfffff1f2 + 0xfffff3f4 + ... + 0xfffffdfe + 0xffffff00 -> 0xffffc6c8
ASSERT_EQUAL_128(0x0000000000000000, 0x00000000ffffc6c8, v12);
ASSERT_EQUAL_128(0x0000000000000000, 0xf8f8f8f8f8f8f8f8, v13); // [-8] x 8
// [0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, 0x0000]
// + [0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, 0x0000]
// -> [0x01f2, 0x01f4, 0x01f6, 0x01f8, 0x01fa, 0x01fc, 0x01fe, 0x0000]
ASSERT_EQUAL_128(0x01f201f401f601f8, 0x01fa01fc01fe0000, v14);
// Check that the upper lanes are all clear.
for (int i = kQRegSizeInBytes; i < core.GetSVELaneCount(kBRegSize); i++) {
ASSERT_EQUAL_SVE_LANE(0x00, z0.VnB(), i);
ASSERT_EQUAL_SVE_LANE(0x00, z1.VnB(), i);
ASSERT_EQUAL_SVE_LANE(0x00, z2.VnB(), i);
ASSERT_EQUAL_SVE_LANE(0x00, z3.VnB(), i);
ASSERT_EQUAL_SVE_LANE(0x00, z4.VnB(), i);
ASSERT_EQUAL_SVE_LANE(0x00, z10.VnB(), i);
ASSERT_EQUAL_SVE_LANE(0x00, z11.VnB(), i);
ASSERT_EQUAL_SVE_LANE(0x00, z12.VnB(), i);
ASSERT_EQUAL_SVE_LANE(0x00, z13.VnB(), i);
ASSERT_EQUAL_SVE_LANE(0x00, z14.VnB(), i);
}
}
}
static void MlaMlsHelper(Test* config, unsigned lane_size_in_bits) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int zd_inputs[] = {0xbb, 0xcc, 0xdd, 0xee};
int za_inputs[] = {-39, 1, -3, 2};
int zn_inputs[] = {-5, -20, 9, 8};
int zm_inputs[] = {9, -5, 4, 5};
ZRegister zd = z0.WithLaneSize(lane_size_in_bits);
ZRegister za = z1.WithLaneSize(lane_size_in_bits);
ZRegister zn = z2.WithLaneSize(lane_size_in_bits);
ZRegister zm = z3.WithLaneSize(lane_size_in_bits);
// TODO: Use a simple `Dup` once it accepts arbitrary immediates.
InsrHelper(&masm, zd, zd_inputs);
InsrHelper(&masm, za, za_inputs);
InsrHelper(&masm, zn, zn_inputs);
InsrHelper(&masm, zm, zm_inputs);
int p0_inputs[] = {1, 1, 0, 1};
int p1_inputs[] = {1, 0, 1, 1};
int p2_inputs[] = {0, 1, 1, 1};
int p3_inputs[] = {1, 1, 1, 0};
Initialise(&masm, p0.WithLaneSize(lane_size_in_bits), p0_inputs);
Initialise(&masm, p1.WithLaneSize(lane_size_in_bits), p1_inputs);
Initialise(&masm, p2.WithLaneSize(lane_size_in_bits), p2_inputs);
Initialise(&masm, p3.WithLaneSize(lane_size_in_bits), p3_inputs);
// The Mla macro automatically selects between mla, mad and movprfx + mla
// based on what registers are aliased.
ZRegister mla_da_result = z10.WithLaneSize(lane_size_in_bits);
ZRegister mla_dn_result = z11.WithLaneSize(lane_size_in_bits);
ZRegister mla_dm_result = z12.WithLaneSize(lane_size_in_bits);
ZRegister mla_d_result = z13.WithLaneSize(lane_size_in_bits);
__ Mov(mla_da_result, za);
__ Mla(mla_da_result, p0.Merging(), mla_da_result, zn, zm);
__ Mov(mla_dn_result, zn);
__ Mla(mla_dn_result, p1.Merging(), za, mla_dn_result, zm);
__ Mov(mla_dm_result, zm);
__ Mla(mla_dm_result, p2.Merging(), za, zn, mla_dm_result);
__ Mov(mla_d_result, zd);
__ Mla(mla_d_result, p3.Merging(), za, zn, zm);
// The Mls macro automatically selects between mls, msb and movprfx + mls
// based on what registers are aliased.
ZRegister mls_da_result = z20.WithLaneSize(lane_size_in_bits);
ZRegister mls_dn_result = z21.WithLaneSize(lane_size_in_bits);
ZRegister mls_dm_result = z22.WithLaneSize(lane_size_in_bits);
ZRegister mls_d_result = z23.WithLaneSize(lane_size_in_bits);
__ Mov(mls_da_result, za);
__ Mls(mls_da_result, p0.Merging(), mls_da_result, zn, zm);
__ Mov(mls_dn_result, zn);
__ Mls(mls_dn_result, p1.Merging(), za, mls_dn_result, zm);
__ Mov(mls_dm_result, zm);
__ Mls(mls_dm_result, p2.Merging(), za, zn, mls_dm_result);
__ Mov(mls_d_result, zd);
__ Mls(mls_d_result, p3.Merging(), za, zn, zm);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(za_inputs, z1.WithLaneSize(lane_size_in_bits));
ASSERT_EQUAL_SVE(zn_inputs, z2.WithLaneSize(lane_size_in_bits));
ASSERT_EQUAL_SVE(zm_inputs, z3.WithLaneSize(lane_size_in_bits));
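// Per lane: mla[i] = za[i] + (zn[i] * zm[i]); mls[i] = za[i] - (zn[i] * zm[i]).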
int mla[] = {-84, 101, 33, 42};
int mls[] = {6, -99, -39, -38};
int mla_da_expected[] = {mla[0], mla[1], za_inputs[2], mla[3]};
ASSERT_EQUAL_SVE(mla_da_expected, mla_da_result);
int mla_dn_expected[] = {mla[0], zn_inputs[1], mla[2], mla[3]};
ASSERT_EQUAL_SVE(mla_dn_expected, mla_dn_result);
int mla_dm_expected[] = {zm_inputs[0], mla[1], mla[2], mla[3]};
ASSERT_EQUAL_SVE(mla_dm_expected, mla_dm_result);
int mla_d_expected[] = {mla[0], mla[1], mla[2], zd_inputs[3]};
ASSERT_EQUAL_SVE(mla_d_expected, mla_d_result);
int mls_da_expected[] = {mls[0], mls[1], za_inputs[2], mls[3]};
ASSERT_EQUAL_SVE(mls_da_expected, mls_da_result);
int mls_dn_expected[] = {mls[0], zn_inputs[1], mls[2], mls[3]};
ASSERT_EQUAL_SVE(mls_dn_expected, mls_dn_result);
int mls_dm_expected[] = {zm_inputs[0], mls[1], mls[2], mls[3]};
ASSERT_EQUAL_SVE(mls_dm_expected, mls_dm_result);
int mls_d_expected[] = {mls[0], mls[1], mls[2], zd_inputs[3]};
ASSERT_EQUAL_SVE(mls_d_expected, mls_d_result);
}
}
TEST_SVE(sve_mla_mls_b) { MlaMlsHelper(config, kBRegSize); }
TEST_SVE(sve_mla_mls_h) { MlaMlsHelper(config, kHRegSize); }
TEST_SVE(sve_mla_mls_s) { MlaMlsHelper(config, kSRegSize); }
TEST_SVE(sve_mla_mls_d) { MlaMlsHelper(config, kDRegSize); }
TEST_SVE(sve_bitwise_unpredicate_logical) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t z8_inputs[] = {0xfedcba9876543210, 0x0123456789abcdef};
InsrHelper(&masm, z8.VnD(), z8_inputs);
uint64_t z15_inputs[] = {0xffffeeeeddddcccc, 0xccccddddeeeeffff};
InsrHelper(&masm, z15.VnD(), z15_inputs);
__ And(z1.VnD(), z8.VnD(), z15.VnD());
__ Bic(z2.VnD(), z8.VnD(), z15.VnD());
__ Eor(z3.VnD(), z8.VnD(), z15.VnD());
__ Orr(z4.VnD(), z8.VnD(), z15.VnD());
END();
if (CAN_RUN()) {
RUN();
uint64_t z1_expected[] = {0xfedcaa8854540000, 0x0000454588aacdef};
uint64_t z2_expected[] = {0x0000101022003210, 0x0123002201010000};
uint64_t z3_expected[] = {0x01235476ab89fedc, 0xcdef98ba67453210};
uint64_t z4_expected[] = {0xfffffefeffddfedc, 0xcdefddffefefffff};
ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
}
}
TEST_SVE(sve_last_r) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
START();
__ Pfalse(p1.VnB());
int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
Initialise(&masm, p2.VnB(), p2_inputs);
Initialise(&masm, p3.VnB(), p3_inputs);
__ Ptrue(p4.VnB());
__ Index(z0.VnB(), 0x10, 1);
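// `Lastb` extracts the last active lane; `Lasta` extracts the lane after it.
// With no active lanes, `Lastb` wraps around to the last lane of the vector
// and `Lasta` to lane 0.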
__ Lasta(x1, p1, z0.VnB());
__ Lastb(x2, p1, z0.VnB());
__ Lasta(x3, p2, z0.VnB());
__ Lastb(x4, p2, z0.VnB());
__ Lasta(x5, p3, z0.VnB());
__ Lastb(x6, p3, z0.VnB());
__ Lasta(x7, p4, z0.VnB());
__ Punpklo(p3.VnH(), p3.VnB());
__ Index(z0.VnH(), 0x1110, 1);
__ Lasta(x9, p1, z0.VnH());
__ Lastb(x10, p3, z0.VnH());
__ Lasta(x12, p4, z0.VnH());
__ Index(z0.VnS(), 0x11111110, 1);
__ Lastb(x13, p1, z0.VnS());
__ Lasta(x14, p2, z0.VnS());
__ Lastb(x18, p4, z0.VnS());
__ Index(z0.VnD(), 0x1111111111111110, 1);
__ Lasta(x19, p1, z0.VnD());
__ Lastb(x20, p3, z0.VnD());
__ Lasta(x21, p3, z0.VnD());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_64(0x0000000000000010, x1);
ASSERT_EQUAL_64(0x0000000000000011, x3);
ASSERT_EQUAL_64(0x0000000000000010, x4);
ASSERT_EQUAL_64(0x0000000000000019, x5);
ASSERT_EQUAL_64(0x0000000000000018, x6);
ASSERT_EQUAL_64(0x0000000000000010, x7);
ASSERT_EQUAL_64(0x0000000000001110, x9);
ASSERT_EQUAL_64(0x0000000000001110, x12);
ASSERT_EQUAL_64(0x0000000011111111, x14);
ASSERT_EQUAL_64(0x1111111111111110, x19);
int vl = core.GetSVELaneCount(kBRegSize) * 8;
switch (vl) {
case 128:
ASSERT_EQUAL_64(0x000000000000001f, x2);
ASSERT_EQUAL_64(0x0000000000001116, x10);
ASSERT_EQUAL_64(0x0000000011111113, x13);
ASSERT_EQUAL_64(0x0000000011111113, x18);
ASSERT_EQUAL_64(0x1111111111111111, x20);
ASSERT_EQUAL_64(0x1111111111111110, x21);
break;
case 512:
ASSERT_EQUAL_64(0x000000000000004f, x2);
ASSERT_EQUAL_64(0x0000000000001118, x10);
ASSERT_EQUAL_64(0x000000001111111f, x13);
ASSERT_EQUAL_64(0x000000001111111f, x18);
ASSERT_EQUAL_64(0x1111111111111112, x20);
ASSERT_EQUAL_64(0x1111111111111113, x21);
break;
case 2048:
ASSERT_EQUAL_64(0x000000000000000f, x2);
ASSERT_EQUAL_64(0x0000000000001118, x10);
ASSERT_EQUAL_64(0x000000001111114f, x13);
ASSERT_EQUAL_64(0x000000001111114f, x18);
ASSERT_EQUAL_64(0x1111111111111112, x20);
ASSERT_EQUAL_64(0x1111111111111113, x21);
break;
default:
printf("WARNING: Some tests skipped due to unexpected VL.\n");
break;
}
}
}
TEST_SVE(sve_last_v) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
START();
__ Pfalse(p1.VnB());
int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
Initialise(&masm, p2.VnB(), p2_inputs);
Initialise(&masm, p3.VnB(), p3_inputs);
__ Ptrue(p4.VnB());
__ Index(z0.VnB(), 0x10, 1);
__ Lasta(b1, p1, z0.VnB());
__ Lastb(b2, p1, z0.VnB());
__ Lasta(b3, p2, z0.VnB());
__ Lastb(b4, p2, z0.VnB());
__ Lasta(b5, p3, z0.VnB());
__ Lastb(b6, p3, z0.VnB());
__ Lasta(b7, p4, z0.VnB());
__ Punpklo(p3.VnH(), p3.VnB());
__ Index(z0.VnH(), 0x1110, 1);
__ Lasta(h9, p1, z0.VnH());
__ Lastb(h10, p3, z0.VnH());
__ Lasta(h12, p4, z0.VnH());
__ Index(z0.VnS(), 0x11111110, 1);
__ Lastb(s13, p1, z0.VnS());
__ Lasta(s14, p2, z0.VnS());
__ Lastb(s18, p4, z0.VnS());
__ Index(z0.VnD(), 0x1111111111111110, 1);
__ Lasta(d19, p1, z0.VnD());
__ Lastb(d20, p3, z0.VnD());
__ Lasta(d21, p3, z0.VnD());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_128(0, 0x0000000000000010, q1);
ASSERT_EQUAL_128(0, 0x0000000000000011, q3);
ASSERT_EQUAL_128(0, 0x0000000000000010, q4);
ASSERT_EQUAL_128(0, 0x0000000000000019, q5);
ASSERT_EQUAL_128(0, 0x0000000000000018, q6);
ASSERT_EQUAL_128(0, 0x0000000000000010, q7);
ASSERT_EQUAL_128(0, 0x0000000000001110, q9);
ASSERT_EQUAL_128(0, 0x0000000000001110, q12);
ASSERT_EQUAL_128(0, 0x0000000011111111, q14);
ASSERT_EQUAL_128(0, 0x1111111111111110, q19);
int vl = core.GetSVELaneCount(kBRegSize) * 8;
switch (vl) {
case 128:
ASSERT_EQUAL_128(0, 0x000000000000001f, q2);
ASSERT_EQUAL_128(0, 0x0000000000001116, q10);
ASSERT_EQUAL_128(0, 0x0000000011111113, q13);
ASSERT_EQUAL_128(0, 0x0000000011111113, q18);
ASSERT_EQUAL_128(0, 0x1111111111111111, q20);
ASSERT_EQUAL_128(0, 0x1111111111111110, q21);
break;
case 512:
ASSERT_EQUAL_128(0, 0x000000000000004f, q2);
ASSERT_EQUAL_128(0, 0x0000000000001118, q10);
ASSERT_EQUAL_128(0, 0x000000001111111f, q13);
ASSERT_EQUAL_128(0, 0x000000001111111f, q18);
ASSERT_EQUAL_128(0, 0x1111111111111112, q20);
ASSERT_EQUAL_128(0, 0x1111111111111113, q21);
break;
case 2048:
ASSERT_EQUAL_128(0, 0x000000000000000f, q2);
ASSERT_EQUAL_128(0, 0x0000000000001118, q10);
ASSERT_EQUAL_128(0, 0x000000001111114f, q13);
ASSERT_EQUAL_128(0, 0x000000001111114f, q18);
ASSERT_EQUAL_128(0, 0x1111111111111112, q20);
ASSERT_EQUAL_128(0, 0x1111111111111113, q21);
break;
default:
printf("WARNING: Some tests skipped due to unexpected VL.\n");
break;
}
}
}
TEST_SVE(sve_clast_r) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
START();
__ Pfalse(p1.VnB());
int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
Initialise(&masm, p2.VnB(), p2_inputs);
Initialise(&masm, p3.VnB(), p3_inputs);
__ Ptrue(p4.VnB());
__ Index(z0.VnB(), 0x10, 1);
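// `Clasta` and `Clastb` behave like `Lasta` and `Lastb`, except that with no
// active lanes they return the scalar operand, truncated to the lane size and
// zero-extended.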
__ Mov(x1, -1);
__ Mov(x2, -1);
__ Clasta(x1, p1, x1, z0.VnB());
__ Clastb(x2, p1, x2, z0.VnB());
__ Clasta(x3, p2, x3, z0.VnB());
__ Clastb(x4, p2, x4, z0.VnB());
__ Clasta(x5, p3, x5, z0.VnB());
__ Clastb(x6, p3, x6, z0.VnB());
__ Clasta(x7, p4, x7, z0.VnB());
__ Punpklo(p3.VnH(), p3.VnB());
__ Index(z0.VnH(), 0x1110, 1);
__ Mov(x9, -1);
__ Clasta(x9, p1, x9, z0.VnH());
__ Clastb(x10, p3, x10, z0.VnH());
__ Clasta(x12, p4, x12, z0.VnH());
__ Index(z0.VnS(), 0x11111110, 1);
__ Mov(x13, -1);
__ Clasta(x13, p1, x13, z0.VnS());
__ Clastb(x14, p2, x14, z0.VnS());
__ Clasta(x18, p4, x18, z0.VnS());
__ Index(z0.VnD(), 0x1111111111111110, 1);
__ Mov(x19, -1);
__ Clasta(x19, p1, x19, z0.VnD());
__ Clastb(x20, p2, x20, z0.VnD());
__ Clasta(x21, p4, x21, z0.VnD());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_64(0x00000000000000ff, x1);
ASSERT_EQUAL_64(0x00000000000000ff, x2);
ASSERT_EQUAL_64(0x0000000000000011, x3);
ASSERT_EQUAL_64(0x0000000000000010, x4);
ASSERT_EQUAL_64(0x0000000000000019, x5);
ASSERT_EQUAL_64(0x0000000000000018, x6);
ASSERT_EQUAL_64(0x0000000000000010, x7);
ASSERT_EQUAL_64(0x000000000000ffff, x9);
ASSERT_EQUAL_64(0x0000000000001110, x12);
ASSERT_EQUAL_64(0x00000000ffffffff, x13);
ASSERT_EQUAL_64(0x0000000011111110, x14);
ASSERT_EQUAL_64(0x0000000011111110, x18);
ASSERT_EQUAL_64(0xffffffffffffffff, x19);
ASSERT_EQUAL_64(0x1111111111111110, x20);
ASSERT_EQUAL_64(0x1111111111111110, x21);
int vl = core.GetSVELaneCount(kBRegSize) * 8;
switch (vl) {
case 128:
ASSERT_EQUAL_64(0x0000000000001116, x10);
break;
case 512:
case 2048:
ASSERT_EQUAL_64(0x0000000000001118, x10);
break;
default:
printf("WARNING: Some tests skipped due to unexpected VL.\n");
break;
}
}
}
TEST_SVE(sve_clast_v) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
START();
__ Pfalse(p1.VnB());
int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
Initialise(&masm, p2.VnB(), p2_inputs);
Initialise(&masm, p3.VnB(), p3_inputs);
__ Ptrue(p4.VnB());
__ Index(z0.VnB(), 0x10, 1);
__ Dup(z1.VnB(), -1);
__ Dup(z2.VnB(), -1);
__ Clasta(b1, p1, b1, z0.VnB());
__ Clastb(b2, p1, b2, z0.VnB());
__ Clasta(b3, p2, b3, z0.VnB());
__ Clastb(b4, p2, b4, z0.VnB());
__ Clasta(b5, p3, b5, z0.VnB());
__ Clastb(b6, p3, b6, z0.VnB());
__ Clasta(b7, p4, b7, z0.VnB());
__ Punpklo(p3.VnH(), p3.VnB());
__ Index(z0.VnH(), 0x1110, 1);
__ Dup(z9.VnB(), -1);
__ Clasta(h9, p1, h9, z0.VnH());
__ Clastb(h10, p3, h10, z0.VnH());
__ Clasta(h12, p4, h12, z0.VnH());
__ Index(z0.VnS(), 0x11111110, 1);
__ Dup(z13.VnB(), -1);
__ Clasta(s13, p1, s13, z0.VnS());
__ Clastb(s14, p2, s14, z0.VnS());
__ Clasta(s18, p4, s18, z0.VnS());
__ Index(z0.VnD(), 0x1111111111111110, 1);
__ Dup(z19.VnB(), -1);
__ Clasta(d19, p1, d19, z0.VnD());
__ Clastb(d20, p2, d20, z0.VnD());
__ Clasta(d21, p4, d21, z0.VnD());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_128(0, 0x00000000000000ff, q1);
ASSERT_EQUAL_128(0, 0x00000000000000ff, q2);
ASSERT_EQUAL_128(0, 0x0000000000000011, q3);
ASSERT_EQUAL_128(0, 0x0000000000000010, q4);
ASSERT_EQUAL_128(0, 0x0000000000000019, q5);
ASSERT_EQUAL_128(0, 0x0000000000000018, q6);
ASSERT_EQUAL_128(0, 0x0000000000000010, q7);
ASSERT_EQUAL_128(0, 0x000000000000ffff, q9);
ASSERT_EQUAL_128(0, 0x0000000000001110, q12);
ASSERT_EQUAL_128(0, 0x00000000ffffffff, q13);
ASSERT_EQUAL_128(0, 0x0000000011111110, q14);
ASSERT_EQUAL_128(0, 0x0000000011111110, q18);
ASSERT_EQUAL_128(0, 0xffffffffffffffff, q19);
ASSERT_EQUAL_128(0, 0x1111111111111110, q20);
ASSERT_EQUAL_128(0, 0x1111111111111110, q21);
int vl = core.GetSVELaneCount(kBRegSize) * 8;
switch (vl) {
case 128:
ASSERT_EQUAL_128(0, 0x0000000000001116, q10);
break;
case 512:
case 2048:
ASSERT_EQUAL_128(0, 0x0000000000001118, q10);
break;
default:
printf("WARNING: Some tests skipped due to unexpected VL.\n");
break;
}
}
}
TEST_SVE(sve_clast_z) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
START();
__ Pfalse(p1.VnB());
int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
Initialise(&masm, p2.VnB(), p2_inputs);
Initialise(&masm, p3.VnB(), p3_inputs);
__ Ptrue(p4.VnB());
__ Index(z0.VnB(), 0x10, 1);
__ Dup(z1.VnB(), 0xff);
__ Dup(z2.VnB(), 0xff);
__ Clasta(z1.VnB(), p1, z1.VnB(), z0.VnB());
__ Clastb(z2.VnB(), p1, z2.VnB(), z0.VnB());
__ Clasta(z3.VnB(), p2, z3.VnB(), z0.VnB());
__ Clastb(z4.VnB(), p2, z4.VnB(), z0.VnB());
__ Clasta(z5.VnB(), p3, z5.VnB(), z0.VnB());
__ Clastb(z6.VnB(), p3, z6.VnB(), z0.VnB());
__ Clasta(z7.VnB(), p4, z7.VnB(), z0.VnB());
__ Punpklo(p3.VnH(), p3.VnB());
__ Index(z0.VnH(), 0x1110, 1);
__ Dup(z9.VnB(), 0xff);
__ Clasta(z9.VnH(), p1, z9.VnH(), z0.VnH());
__ Clastb(z10.VnH(), p3, z10.VnH(), z0.VnH());
__ Clasta(z12.VnH(), p4, z12.VnH(), z0.VnH());
__ Index(z0.VnS(), 0x11111110, 1);
__ Dup(z13.VnB(), 0xff);
__ Clasta(z13.VnS(), p1, z13.VnS(), z0.VnS());
__ Clastb(z14.VnS(), p2, z14.VnS(), z0.VnS());
__ Clasta(z16.VnS(), p4, z16.VnS(), z0.VnS());
__ Index(z0.VnD(), 0x1111111111111110, 1);
__ Dup(z17.VnB(), 0xff);
__ Clasta(z17.VnD(), p1, z17.VnD(), z0.VnD());
__ Clastb(z18.VnD(), p2, z18.VnD(), z0.VnD());
__ Clasta(z20.VnD(), p4, z20.VnD(), z0.VnD());
END();
if (CAN_RUN()) {
RUN();
uint64_t z1_expected[] = {0xffffffffffffffff, 0xffffffffffffffff};
uint64_t z2_expected[] = {0xffffffffffffffff, 0xffffffffffffffff};
uint64_t z3_expected[] = {0x1111111111111111, 0x1111111111111111};
uint64_t z4_expected[] = {0x1010101010101010, 0x1010101010101010};
uint64_t z5_expected[] = {0x1919191919191919, 0x1919191919191919};
uint64_t z6_expected[] = {0x1818181818181818, 0x1818181818181818};
uint64_t z7_expected[] = {0x1010101010101010, 0x1010101010101010};
uint64_t z9_expected[] = {0xffffffffffffffff, 0xffffffffffffffff};
uint64_t z12_expected[] = {0x1110111011101110, 0x1110111011101110};
uint64_t z13_expected[] = {0xffffffffffffffff, 0xffffffffffffffff};
uint64_t z14_expected[] = {0x1111111011111110, 0x1111111011111110};
uint64_t z16_expected[] = {0x1111111011111110, 0x1111111011111110};
uint64_t z17_expected[] = {0xffffffffffffffff, 0xffffffffffffffff};
uint64_t z18_expected[] = {0x1111111111111110, 0x1111111111111110};
uint64_t z20_expected[] = {0x1111111111111110, 0x1111111111111110};
uint64_t z10_expected_vl128[] = {0x1116111611161116, 0x1116111611161116};
uint64_t z10_expected_vl_long[] = {0x1118111811181118, 0x1118111811181118};
ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
ASSERT_EQUAL_SVE(z7_expected, z7.VnD());
ASSERT_EQUAL_SVE(z9_expected, z9.VnD());
ASSERT_EQUAL_SVE(z12_expected, z12.VnD());
ASSERT_EQUAL_SVE(z13_expected, z13.VnD());
ASSERT_EQUAL_SVE(z14_expected, z14.VnD());
ASSERT_EQUAL_SVE(z16_expected, z16.VnD());
ASSERT_EQUAL_SVE(z17_expected, z17.VnD());
ASSERT_EQUAL_SVE(z18_expected, z18.VnD());
ASSERT_EQUAL_SVE(z20_expected, z20.VnD());
int vl = core.GetSVELaneCount(kBRegSize) * 8;
switch (vl) {
case 128:
ASSERT_EQUAL_SVE(z10_expected_vl128, z10.VnD());
break;
case 512:
case 2048:
ASSERT_EQUAL_SVE(z10_expected_vl_long, z10.VnD());
break;
default:
printf("WARNING: Some tests skipped due to unexpected VL.\n");
break;
}
}
}
TEST_SVE(sve_compact) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
START();
__ Ptrue(p0.VnB());
__ Pfalse(p1.VnB());
__ Zip1(p2.VnS(), p0.VnS(), p1.VnS());
__ Zip1(p3.VnS(), p1.VnS(), p0.VnS());
__ Zip1(p4.VnD(), p0.VnD(), p1.VnD());
__ Index(z0.VnS(), 0x11111111, 0x11111111);
__ Mov(q0, q0);
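// Mov(q0, q0) clears the bits above the low 128 bits of z0, so the expected
// results below hold for any vector length.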
__ Compact(z1.VnS(), p0, z0.VnS());
__ Compact(z2.VnS(), p2, z0.VnS());
__ Compact(z0.VnS(), p3, z0.VnS());
__ Index(z3.VnD(), 0x1111111111111111, 0x1111111111111111);
__ Mov(q3, q3);
__ Compact(z4.VnD(), p0, z3.VnD());
__ Compact(z5.VnD(), p1, z3.VnD());
__ Compact(z6.VnD(), p4, z3.VnD());
END();
if (CAN_RUN()) {
RUN();
uint64_t z1_expected[] = {0x4444444433333333, 0x2222222211111111};
uint64_t z2_expected[] = {0x0000000000000000, 0x3333333311111111};
uint64_t z0_expected[] = {0x0000000000000000, 0x4444444422222222};
uint64_t z4_expected[] = {0x2222222222222222, 0x1111111111111111};
uint64_t z5_expected[] = {0x0000000000000000, 0x0000000000000000};
uint64_t z6_expected[] = {0x0000000000000000, 0x1111111111111111};
ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
}
}
TEST_SVE(sve_splice) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
__ Ptrue(p0.VnB());
__ Pfalse(p1.VnB());
int p2b_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
int p3b_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0};
int p4b_inputs[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int p5b_inputs[] = {0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0};
int p6b_inputs[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0};
Initialise(&masm, p2.VnB(), p2b_inputs);
Initialise(&masm, p3.VnB(), p3b_inputs);
Initialise(&masm, p4.VnB(), p4b_inputs);
Initialise(&masm, p5.VnB(), p5b_inputs);
Initialise(&masm, p6.VnB(), p6b_inputs);
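// `Splice` copies the first operand from its first active lane up to its last
// active lane, then fills the remaining lanes of the destination from the low
// lanes of the second operand.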
__ Index(z30.VnB(), 1, 1);
__ Index(z0.VnB(), -1, -1);
__ Splice(z0.VnB(), p0, z0.VnB(), z30.VnB());
__ Index(z1.VnB(), -1, -1);
__ Splice(z1.VnB(), p1, z1.VnB(), z30.VnB());
__ Index(z2.VnB(), -1, -1);
__ Splice(z2.VnB(), p2, z2.VnB(), z30.VnB());
__ Index(z3.VnB(), -1, -1);
__ Splice(z3.VnB(), p3, z3.VnB(), z30.VnB());
__ Index(z4.VnB(), -1, -1);
__ Splice(z4.VnB(), p4, z4.VnB(), z30.VnB());
__ Index(z5.VnB(), -1, -1);
__ Splice(z5.VnB(), p5, z5.VnB(), z30.VnB());
__ Index(z6.VnB(), -1, -1);
__ Splice(z6.VnB(), p6, z6.VnB(), z30.VnB());
int p2h_inputs[] = {0, 0, 0, 0, 0, 0, 1, 0};
int p3h_inputs[] = {0, 0, 1, 0, 0, 0, 1, 0};
Initialise(&masm, p2.VnH(), p2h_inputs);
Initialise(&masm, p3.VnH(), p3h_inputs);
__ Index(z30.VnH(), 1, 1);
__ Index(z29.VnH(), -1, -1);
__ Splice(z7.VnH(), p2, z29.VnH(), z30.VnH());
__ Splice(z8.VnH(), p3, z29.VnH(), z30.VnH());
int p2s_inputs[] = {0, 0, 1, 0};
int p3s_inputs[] = {1, 0, 1, 0};
Initialise(&masm, p2.VnS(), p2s_inputs);
Initialise(&masm, p3.VnS(), p3s_inputs);
__ Index(z30.VnS(), 1, 1);
__ Index(z29.VnS(), -1, -1);
__ Splice(z9.VnS(), p2, z29.VnS(), z30.VnS());
__ Splice(z10.VnS(), p3, z29.VnS(), z30.VnS());
int p2d_inputs[] = {0, 1};
int p3d_inputs[] = {1, 0};
Initialise(&masm, p2.VnD(), p2d_inputs);
Initialise(&masm, p3.VnD(), p3d_inputs);
__ Index(z30.VnD(), 1, 1);
__ Index(z29.VnD(), -1, -1);
__ Splice(z11.VnD(), p2, z29.VnD(), z30.VnD());
__ Splice(z30.VnD(), p3, z29.VnD(), z30.VnD());
END();
if (CAN_RUN()) {
RUN();
uint64_t z0_expected[] = {0xf0f1f2f3f4f5f6f7, 0xf8f9fafbfcfdfeff};
uint64_t z1_expected[] = {0x100f0e0d0c0b0a09, 0x0807060504030201};
uint64_t z2_expected[] = {0x0f0e0d0c0b0a0908, 0x07060504030201ff};
uint64_t z3_expected[] = {0x0f0e0d0c0b0a0908, 0x07060504030201fe};
uint64_t z4_expected[] = {0x0f0e0d0c0b0a0908, 0x07060504030201f0};
uint64_t z5_expected[] = {0x0c0b0a0908070605, 0x04030201f6f7f8f9};
uint64_t z6_expected[] = {0x01f0f1f2f3f4f5f6, 0xf7f8f9fafbfcfdfe};
uint64_t z7_expected[] = {0x0007000600050004, 0x000300020001fffe};
uint64_t z8_expected[] = {0x000300020001fffa, 0xfffbfffcfffdfffe};
uint64_t z9_expected[] = {0x0000000300000002, 0x00000001fffffffe};
uint64_t z10_expected[] = {0x00000001fffffffc, 0xfffffffdfffffffe};
uint64_t z11_expected[] = {0x0000000000000001, 0xffffffffffffffff};
uint64_t z30_expected[] = {0x0000000000000001, 0xfffffffffffffffe};
ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
ASSERT_EQUAL_SVE(z7_expected, z7.VnD());
ASSERT_EQUAL_SVE(z8_expected, z8.VnD());
ASSERT_EQUAL_SVE(z9_expected, z9.VnD());
ASSERT_EQUAL_SVE(z10_expected, z10.VnD());
ASSERT_EQUAL_SVE(z11_expected, z11.VnD());
ASSERT_EQUAL_SVE(z30_expected, z30.VnD());
}
}
TEST_SVE(sve_predicate_logical) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// 0b...01011010'10110111
int p10_inputs[] = {0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1}; // Pm
// 0b...11011001'01010010
int p11_inputs[] = {1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0}; // Pn
// 0b...01010101'10110010
int p12_inputs[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0}; // pg
Initialise(&masm, p10.VnB(), p10_inputs);
Initialise(&masm, p11.VnB(), p11_inputs);
Initialise(&masm, p12.VnB(), p12_inputs);
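// The flag-setting forms (`Ands`, `Bics`) set N (FIRST) if the first active
// lane of the result is true, Z (NONE) if no active lane is true, and C
// (!LAST) if the last active lane is false; V is cleared.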
__ Ands(p0.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
__ Mrs(x0, NZCV);
__ Bics(p1.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
__ Mrs(x1, NZCV);
__ Eor(p2.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
__ Nand(p3.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
__ Nor(p4.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
__ Orn(p5.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
__ Orr(p6.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
__ Sel(p7.VnB(), p12, p11.VnB(), p10.VnB());
END();
if (CAN_RUN()) {
RUN();
// 0b...01010000'00010010
int p0_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0};
// 0b...00000001'00000000
int p1_expected[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0};
// 0b...00000001'10100000
int p2_expected[] = {0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0};
// 0b...00000101'10100000
int p3_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0};
// 0b...00000100'00000000
int p4_expected[] = {0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
// 0b...01010101'00010010
int p5_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0};
// 0b...01010001'10110010
int p6_expected[] = {0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0};
// 0b...01011011'00010111
int p7_expected[] = {0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1};
ASSERT_EQUAL_SVE(p0_expected, p0.VnB());
ASSERT_EQUAL_SVE(p1_expected, p1.VnB());
ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
ASSERT_EQUAL_SVE(p5_expected, p5.VnB());
ASSERT_EQUAL_SVE(p6_expected, p6.VnB());
ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
ASSERT_EQUAL_32(SVEFirstFlag, w0);
ASSERT_EQUAL_32(SVENotLastFlag, w1);
}
}
TEST_SVE(sve_int_compare_vectors) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int z10_inputs[] = {0x00, 0x80, 0xff, 0x7f, 0x00, 0x00, 0x00, 0xff};
int z11_inputs[] = {0x00, 0x00, 0x00, 0x00, 0x80, 0xff, 0x7f, 0xfe};
int p0_inputs[] = {1, 0, 1, 1, 1, 1, 1, 1};
InsrHelper(&masm, z10.VnB(), z10_inputs);
InsrHelper(&masm, z11.VnB(), z11_inputs);
Initialise(&masm, p0.VnB(), p0_inputs);
__ Cmphs(p6.VnB(), p0.Zeroing(), z10.VnB(), z11.VnB());
__ Mrs(x6, NZCV);
uint64_t z12_inputs[] = {0xffffffffffffffff, 0x8000000000000000};
uint64_t z13_inputs[] = {0x0000000000000000, 0x8000000000000000};
int p1_inputs[] = {1, 1};
InsrHelper(&masm, z12.VnD(), z12_inputs);
InsrHelper(&masm, z13.VnD(), z13_inputs);
Initialise(&masm, p1.VnD(), p1_inputs);
__ Cmphi(p7.VnD(), p1.Zeroing(), z12.VnD(), z13.VnD());
__ Mrs(x7, NZCV);
int z14_inputs[] = {0, 32767, -1, -32767, 0, 0, 0, 32766};
int z15_inputs[] = {0, 0, 0, 0, 32767, -1, -32767, 32767};
int p2_inputs[] = {1, 0, 1, 1, 1, 1, 1, 1};
InsrHelper(&masm, z14.VnH(), z14_inputs);
InsrHelper(&masm, z15.VnH(), z15_inputs);
Initialise(&masm, p2.VnH(), p2_inputs);
__ Cmpge(p8.VnH(), p2.Zeroing(), z14.VnH(), z15.VnH());
__ Mrs(x8, NZCV);
__ Cmpeq(p9.VnH(), p2.Zeroing(), z14.VnH(), z15.VnH());
__ Mrs(x9, NZCV);
int z16_inputs[] = {0, -1, 0, 0};
int z17_inputs[] = {0, 0, 2147483647, -2147483648};
int p3_inputs[] = {1, 1, 1, 1};
InsrHelper(&masm, z16.VnS(), z16_inputs);
InsrHelper(&masm, z17.VnS(), z17_inputs);
Initialise(&masm, p3.VnS(), p3_inputs);
__ Cmpgt(p10.VnS(), p3.Zeroing(), z16.VnS(), z17.VnS());
__ Mrs(x10, NZCV);
__ Cmpne(p11.VnS(), p3.Zeroing(), z16.VnS(), z17.VnS());
__ Mrs(x11, NZCV);
// Test the architectural aliases: Cmp{ls, lo, le, lt} reverse their operands
// and encode as Cmp{hs, hi, ge, gt}.
__ Cmpls(p12.VnB(), p0.Zeroing(), z11.VnB(), z10.VnB()); // HS
__ Cmplo(p13.VnD(), p1.Zeroing(), z13.VnD(), z12.VnD()); // HI
__ Cmple(p14.VnH(), p2.Zeroing(), z15.VnH(), z14.VnH()); // GE
__ Cmplt(p15.VnS(), p3.Zeroing(), z17.VnS(), z16.VnS()); // GT
END();
if (CAN_RUN()) {
RUN();
int p6_expected[] = {1, 0, 1, 1, 0, 0, 0, 1};
for (size_t i = 0; i < ArrayLength(p6_expected); i++) {
int lane = static_cast<int>(ArrayLength(p6_expected) - i - 1);
ASSERT_EQUAL_SVE_LANE(p6_expected[i], p6.VnB(), lane);
}
int p7_expected[] = {1, 0};
ASSERT_EQUAL_SVE(p7_expected, p7.VnD());
int p8_expected[] = {1, 0, 0, 0, 0, 1, 1, 0};
ASSERT_EQUAL_SVE(p8_expected, p8.VnH());
int p9_expected[] = {1, 0, 0, 0, 0, 0, 0, 0};
ASSERT_EQUAL_SVE(p9_expected, p9.VnH());
int p10_expected[] = {0, 0, 0, 1};
ASSERT_EQUAL_SVE(p10_expected, p10.VnS());
int p11_expected[] = {0, 1, 1, 1};
ASSERT_EQUAL_SVE(p11_expected, p11.VnS());
// Reuse the expected results to verify the architectural aliases.
ASSERT_EQUAL_SVE(p6_expected, p12.VnB());
ASSERT_EQUAL_SVE(p7_expected, p13.VnD());
ASSERT_EQUAL_SVE(p8_expected, p14.VnH());
ASSERT_EQUAL_SVE(p10_expected, p15.VnS());
ASSERT_EQUAL_32(SVEFirstFlag, w6);
ASSERT_EQUAL_32(NoFlag, w7);
ASSERT_EQUAL_32(NoFlag, w8);
ASSERT_EQUAL_32(NoFlag, w9);
ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w10);
}
}
TEST_SVE(sve_int_compare_vectors_wide_elements) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int src1_inputs_1[] = {0, 1, -1, -128, 127, 100, -66};
int src2_inputs_1[] = {0, -1};
int mask_inputs_1[] = {1, 1, 1, 1, 1, 0, 1};
InsrHelper(&masm, z13.VnB(), src1_inputs_1);
InsrHelper(&masm, z19.VnD(), src2_inputs_1);
Initialise(&masm, p0.VnB(), mask_inputs_1);
__ Cmpge(p2.VnB(), p0.Zeroing(), z13.VnB(), z19.VnD());
__ Mrs(x2, NZCV);
__ Cmpgt(p3.VnB(), p0.Zeroing(), z13.VnB(), z19.VnD());
__ Mrs(x3, NZCV);
int src1_inputs_2[] = {0, 32767, -1, -32767, 1, 1234, 0, 32766};
int src2_inputs_2[] = {0, -32767};
int mask_inputs_2[] = {1, 0, 1, 1, 1, 1, 1, 1};
InsrHelper(&masm, z13.VnH(), src1_inputs_2);
InsrHelper(&masm, z19.VnD(), src2_inputs_2);
Initialise(&masm, p0.VnH(), mask_inputs_2);
__ Cmple(p4.VnH(), p0.Zeroing(), z13.VnH(), z19.VnD());
__ Mrs(x4, NZCV);
__ Cmplt(p5.VnH(), p0.Zeroing(), z13.VnH(), z19.VnD());
__ Mrs(x5, NZCV);
int src1_inputs_3[] = {0, -1, 2147483647, -2147483648};
int src2_inputs_3[] = {0, -2147483648};
int mask_inputs_3[] = {1, 1, 1, 1};
InsrHelper(&masm, z13.VnS(), src1_inputs_3);
InsrHelper(&masm, z19.VnD(), src2_inputs_3);
Initialise(&masm, p0.VnS(), mask_inputs_3);
__ Cmpeq(p6.VnS(), p0.Zeroing(), z13.VnS(), z19.VnD());
__ Mrs(x6, NZCV);
__ Cmpne(p7.VnS(), p0.Zeroing(), z13.VnS(), z19.VnD());
__ Mrs(x7, NZCV);
int src1_inputs_4[] = {0x00, 0x80, 0x7f, 0xff, 0x7f, 0xf0, 0x0f, 0x55};
int src2_inputs_4[] = {0x00, 0x7f};
int mask_inputs_4[] = {1, 1, 1, 1, 0, 1, 1, 1};
InsrHelper(&masm, z13.VnB(), src1_inputs_4);
InsrHelper(&masm, z19.VnD(), src2_inputs_4);
Initialise(&masm, p0.VnB(), mask_inputs_4);
__ Cmplo(p8.VnB(), p0.Zeroing(), z13.VnB(), z19.VnD());
__ Mrs(x8, NZCV);
__ Cmpls(p9.VnB(), p0.Zeroing(), z13.VnB(), z19.VnD());
__ Mrs(x9, NZCV);
int src1_inputs_5[] = {0x0000, 0x8000, 0x7fff, 0xffff};
int src2_inputs_5[] = {0x8000, 0xffff};
int mask_inputs_5[] = {1, 1, 1, 1};
InsrHelper(&masm, z13.VnS(), src1_inputs_5);
InsrHelper(&masm, z19.VnD(), src2_inputs_5);
Initialise(&masm, p0.VnS(), mask_inputs_5);
__ Cmphi(p10.VnS(), p0.Zeroing(), z13.VnS(), z19.VnD());
__ Mrs(x10, NZCV);
__ Cmphs(p11.VnS(), p0.Zeroing(), z13.VnS(), z19.VnD());
__ Mrs(x11, NZCV);
END();
if (CAN_RUN()) {
RUN();
int p2_expected[] = {1, 1, 1, 0, 1, 0, 0};
ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
int p3_expected[] = {1, 1, 0, 0, 1, 0, 0};
ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
int p4_expected[] = {0x1, 0x0, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0};
ASSERT_EQUAL_SVE(p4_expected, p4.VnH());
int p5_expected[] = {0x0, 0x0, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0};
ASSERT_EQUAL_SVE(p5_expected, p5.VnH());
int p6_expected[] = {0x1, 0x0, 0x0, 0x1};
ASSERT_EQUAL_SVE(p6_expected, p6.VnS());
int p7_expected[] = {0x0, 0x1, 0x1, 0x0};
ASSERT_EQUAL_SVE(p7_expected, p7.VnS());
int p8_expected[] = {1, 0, 0, 0, 0, 0, 1, 1};
ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
int p9_expected[] = {1, 0, 1, 0, 0, 0, 1, 1};
ASSERT_EQUAL_SVE(p9_expected, p9.VnB());
int p10_expected[] = {0x0, 0x0, 0x0, 0x0};
ASSERT_EQUAL_SVE(p10_expected, p10.VnS());
int p11_expected[] = {0x0, 0x1, 0x0, 0x1};
ASSERT_EQUAL_SVE(p11_expected, p11.VnS());
ASSERT_EQUAL_32(NoFlag, w2);
ASSERT_EQUAL_32(NoFlag, w3);
ASSERT_EQUAL_32(NoFlag, w4);
ASSERT_EQUAL_32(SVENotLastFlag, w5);
ASSERT_EQUAL_32(SVEFirstFlag, w6);
ASSERT_EQUAL_32(SVENotLastFlag, w7);
ASSERT_EQUAL_32(SVEFirstFlag, w8);
ASSERT_EQUAL_32(SVEFirstFlag, w9);
ASSERT_EQUAL_32(SVENotLastFlag | SVENoneFlag, w10);
ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w11);
}
}
TEST_SVE(sve_bitwise_imm) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// clang-format off
uint64_t z21_inputs[] = {0xfedcba9876543210, 0x0123456789abcdef};
uint32_t z22_inputs[] = {0xfedcba98, 0x76543210, 0x01234567, 0x89abcdef};
uint16_t z23_inputs[] = {0xfedc, 0xba98, 0x7654, 0x3210,
0x0123, 0x4567, 0x89ab, 0xcdef};
uint8_t z24_inputs[] = {0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10,
0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef};
// clang-format on
InsrHelper(&masm, z1.VnD(), z21_inputs);
InsrHelper(&masm, z2.VnS(), z22_inputs);
InsrHelper(&masm, z3.VnH(), z23_inputs);
InsrHelper(&masm, z4.VnB(), z24_inputs);
__ And(z1.VnD(), z1.VnD(), 0x0000ffff0000ffff);
__ And(z2.VnS(), z2.VnS(), 0xff0000ff);
__ And(z3.VnH(), z3.VnH(), 0x0ff0);
__ And(z4.VnB(), z4.VnB(), 0x3f);
InsrHelper(&masm, z5.VnD(), z21_inputs);
InsrHelper(&masm, z6.VnS(), z22_inputs);
InsrHelper(&masm, z7.VnH(), z23_inputs);
InsrHelper(&masm, z8.VnB(), z24_inputs);
__ Eor(z5.VnD(), z5.VnD(), 0x0000ffff0000ffff);
__ Eor(z6.VnS(), z6.VnS(), 0xff0000ff);
__ Eor(z7.VnH(), z7.VnH(), 0x0ff0);
__ Eor(z8.VnB(), z8.VnB(), 0x3f);
InsrHelper(&masm, z9.VnD(), z21_inputs);
InsrHelper(&masm, z10.VnS(), z22_inputs);
InsrHelper(&masm, z11.VnH(), z23_inputs);
InsrHelper(&masm, z12.VnB(), z24_inputs);
__ Orr(z9.VnD(), z9.VnD(), 0x0000ffff0000ffff);
__ Orr(z10.VnS(), z10.VnS(), 0xff0000ff);
__ Orr(z11.VnH(), z11.VnH(), 0x0ff0);
__ Orr(z12.VnB(), z12.VnB(), 0x3f);
{
// The `Dup` macro maps onto either `dup` or `dupm`, but has its own test,
// so here we test `dupm` directly.
ExactAssemblyScope guard(&masm, 4 * kInstructionSize);
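// `dupm` takes a logical bitmask immediate, as used by `and` and `orr`; each
// of the values below is encodable for its lane size.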
__ dupm(z13.VnD(), 0x7ffffff800000000);
__ dupm(z14.VnS(), 0x7ffc7ffc);
__ dupm(z15.VnH(), 0x3ffc);
__ dupm(z16.VnB(), 0xc3);
}
END();
if (CAN_RUN()) {
RUN();
// clang-format off
uint64_t z1_expected[] = {0x0000ba9800003210, 0x000045670000cdef};
uint32_t z2_expected[] = {0xfe000098, 0x76000010, 0x01000067, 0x890000ef};
uint16_t z3_expected[] = {0x0ed0, 0x0a90, 0x0650, 0x0210,
0x0120, 0x0560, 0x09a0, 0x0de0};
uint8_t z4_expected[] = {0x3e, 0x1c, 0x3a, 0x18, 0x36, 0x14, 0x32, 0x10,
0x01, 0x23, 0x05, 0x27, 0x09, 0x2b, 0x0d, 0x2f};
ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
ASSERT_EQUAL_SVE(z2_expected, z2.VnS());
ASSERT_EQUAL_SVE(z3_expected, z3.VnH());
ASSERT_EQUAL_SVE(z4_expected, z4.VnB());
uint64_t z5_expected[] = {0xfedc45677654cdef, 0x0123ba9889ab3210};
uint32_t z6_expected[] = {0x01dcba67, 0x895432ef, 0xfe234598, 0x76abcd10};
uint16_t z7_expected[] = {0xf12c, 0xb568, 0x79a4, 0x3de0,
0x0ed3, 0x4a97, 0x865b, 0xc21f};
uint8_t z8_expected[] = {0xc1, 0xe3, 0x85, 0xa7, 0x49, 0x6b, 0x0d, 0x2f,
0x3e, 0x1c, 0x7a, 0x58, 0xb6, 0x94, 0xf2, 0xd0};
ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
ASSERT_EQUAL_SVE(z6_expected, z6.VnS());
ASSERT_EQUAL_SVE(z7_expected, z7.VnH());
ASSERT_EQUAL_SVE(z8_expected, z8.VnB());
uint64_t z9_expected[] = {0xfedcffff7654ffff, 0x0123ffff89abffff};
uint32_t z10_expected[] = {0xffdcbaff, 0xff5432ff, 0xff2345ff, 0xffabcdff};
uint16_t z11_expected[] = {0xfffc, 0xbff8, 0x7ff4, 0x3ff0,
0x0ff3, 0x4ff7, 0x8ffb, 0xcfff};
uint8_t z12_expected[] = {0xff, 0xff, 0xbf, 0xbf, 0x7f, 0x7f, 0x3f, 0x3f,
0x3f, 0x3f, 0x7f, 0x7f, 0xbf, 0xbf, 0xff, 0xff};
ASSERT_EQUAL_SVE(z9_expected, z9.VnD());
ASSERT_EQUAL_SVE(z10_expected, z10.VnS());
ASSERT_EQUAL_SVE(z11_expected, z11.VnH());
ASSERT_EQUAL_SVE(z12_expected, z12.VnB());
uint64_t z13_expected[] = {0x7ffffff800000000, 0x7ffffff800000000};
uint32_t z14_expected[] = {0x7ffc7ffc, 0x7ffc7ffc, 0x7ffc7ffc, 0x7ffc7ffc};
uint16_t z15_expected[] = {0x3ffc, 0x3ffc, 0x3ffc, 0x3ffc,
0x3ffc, 0x3ffc, 0x3ffc, 0x3ffc};
ASSERT_EQUAL_SVE(z13_expected, z13.VnD());
ASSERT_EQUAL_SVE(z14_expected, z14.VnS());
ASSERT_EQUAL_SVE(z15_expected, z15.VnH());
// clang-format on
}
}
TEST_SVE(sve_dup_imm) {
// The `Dup` macro can generate `dup` or `dupm`, and it can synthesise
// immediates that neither instruction encodes directly.
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// Encodable with `dup` (shift 0).
__ Dup(z0.VnD(), -1);
__ Dup(z1.VnS(), 0x7f);
__ Dup(z2.VnH(), -0x80);
__ Dup(z3.VnB(), 42);
// Encodable with `dup` (shift 8).
__ Dup(z4.VnD(), -42 * 256);
__ Dup(z5.VnS(), -0x8000);
__ Dup(z6.VnH(), 0x7f00);
// B-sized lanes cannot take a shift of 8.
// Encodable with `dupm` (but not `dup`).
__ Dup(z10.VnD(), 0x3fc);
__ Dup(z11.VnS(), -516097); // 0xfff81fff, as a signed int.
__ Dup(z12.VnH(), 0x0001);
// All values that fit B-sized lanes are encodable with `dup`.
// Cases that require immediate synthesis.
__ Dup(z20.VnD(), 0x1234);
__ Dup(z21.VnD(), -4242);
__ Dup(z22.VnD(), 0xfedcba9876543210);
__ Dup(z23.VnS(), 0x01020304);
__ Dup(z24.VnS(), -0x01020304);
__ Dup(z25.VnH(), 0x3c38);
// All values that fit B-sized lanes are directly encodable.
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(0xffffffffffffffff, z0.VnD());
ASSERT_EQUAL_SVE(0x0000007f, z1.VnS());
ASSERT_EQUAL_SVE(0xff80, z2.VnH());
ASSERT_EQUAL_SVE(0x2a, z3.VnB());
ASSERT_EQUAL_SVE(0xffffffffffffd600, z4.VnD());
ASSERT_EQUAL_SVE(0xffff8000, z5.VnS());
ASSERT_EQUAL_SVE(0x7f00, z6.VnH());
ASSERT_EQUAL_SVE(0x00000000000003fc, z10.VnD());
ASSERT_EQUAL_SVE(0xfff81fff, z11.VnS());
ASSERT_EQUAL_SVE(0x0001, z12.VnH());
ASSERT_EQUAL_SVE(0x1234, z20.VnD());
ASSERT_EQUAL_SVE(0xffffffffffffef6e, z21.VnD());
ASSERT_EQUAL_SVE(0xfedcba9876543210, z22.VnD());
ASSERT_EQUAL_SVE(0x01020304, z23.VnS());
ASSERT_EQUAL_SVE(0xfefdfcfc, z24.VnS());
ASSERT_EQUAL_SVE(0x3c38, z25.VnH());
}
}
TEST_SVE(sve_inc_dec_p_scalar) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
Initialise(&masm, p0.VnB(), p0_inputs);
int p0_b_count = 9;
int p0_h_count = 5;
int p0_s_count = 3;
int p0_d_count = 2;
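// `Incp` and `Decp` count the active lanes of the given size; for lanes wider
// than a byte, only the lowest P bit of each lane is significant.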
// 64-bit operations preserve their high bits.
__ Mov(x0, 0x123456780000002a);
__ Decp(x0, p0.VnB());
__ Mov(x1, 0x123456780000002a);
__ Incp(x1, p0.VnH());
// Check that saturation does not occur.
__ Mov(x10, 1);
__ Decp(x10, p0.VnS());
__ Mov(x11, UINT64_MAX);
__ Incp(x11, p0.VnD());
__ Mov(x12, INT64_MAX);
__ Incp(x12, p0.VnB());
// With an all-true predicate, these instructions increment or decrement by
// the vector length.
__ Ptrue(p15.VnB());
__ Mov(x20, 0x4000000000000000);
__ Decp(x20, p15.VnB());
__ Mov(x21, 0x4000000000000000);
__ Incp(x21, p15.VnH());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_64(0x123456780000002a - p0_b_count, x0);
ASSERT_EQUAL_64(0x123456780000002a + p0_h_count, x1);
ASSERT_EQUAL_64(UINT64_C(1) - p0_s_count, x10);
ASSERT_EQUAL_64(UINT64_MAX + p0_d_count, x11);
ASSERT_EQUAL_64(static_cast<uint64_t>(INT64_MAX) + p0_b_count, x12);
ASSERT_EQUAL_64(0x4000000000000000 - core.GetSVELaneCount(kBRegSize), x20);
ASSERT_EQUAL_64(0x4000000000000000 + core.GetSVELaneCount(kHRegSize), x21);
}
}
TEST_SVE(sve_sqinc_sqdec_p_scalar) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
Initialise(&masm, p0.VnB(), p0_inputs);
int p0_b_count = 9;
int p0_h_count = 5;
int p0_s_count = 3;
int p0_d_count = 2;
uint64_t placeholder_high = 0x1234567800000000;
// 64-bit operations preserve their high bits.
__ Mov(x0, placeholder_high + 42);
__ Sqdecp(x0, p0.VnB());
__ Mov(x1, placeholder_high + 42);
__ Sqincp(x1, p0.VnH());
// 32-bit operations sign-extend into their high bits.
__ Mov(x2, placeholder_high + 42);
__ Sqdecp(x2, p0.VnS(), w2);
__ Mov(x3, placeholder_high + 42);
__ Sqincp(x3, p0.VnD(), w3);
__ Mov(x4, placeholder_high + 1);
__ Sqdecp(x4, p0.VnS(), w4);
__ Mov(x5, placeholder_high - 1);
__ Sqincp(x5, p0.VnD(), w5);
// Check that saturation behaves correctly.
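// `Sqdecp` saturates at INT64_MIN (or INT32_MIN for the W form); `Sqincp`
// saturates at INT64_MAX (or INT32_MAX).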
__ Mov(x10, 0x8000000000000001); // INT64_MIN + 1
__ Sqdecp(x10, p0.VnB());
__ Mov(x11, placeholder_high + 0x80000001); // INT32_MIN + 1
__ Sqdecp(x11, p0.VnH(), w11);
__ Mov(x12, 1);
__ Sqdecp(x12, p0.VnS());
__ Mov(x13, placeholder_high + 1);
__ Sqdecp(x13, p0.VnD(), w13);
__ Mov(x14, 0x7ffffffffffffffe); // INT64_MAX - 1
__ Sqincp(x14, p0.VnB());
__ Mov(x15, placeholder_high + 0x7ffffffe); // INT32_MAX - 1
__ Sqincp(x15, p0.VnH(), w15);
// Don't use x16 and x17 since they are scratch registers by default.
__ Mov(x18, 0xffffffffffffffff);
__ Sqincp(x18, p0.VnS());
__ Mov(x19, placeholder_high + 0xffffffff);
__ Sqincp(x19, p0.VnD(), w19);
__ Mov(x20, placeholder_high + 0xffffffff);
__ Sqdecp(x20, p0.VnB(), w20);
// With an all-true predicate, these instructions increment or decrement by
// the vector length.
__ Ptrue(p15.VnB());
__ Mov(x21, 0);
__ Sqdecp(x21, p15.VnB());
__ Mov(x22, 0);
__ Sqincp(x22, p15.VnH());
__ Mov(x23, placeholder_high);
__ Sqdecp(x23, p15.VnS(), w23);
__ Mov(x24, placeholder_high);
__ Sqincp(x24, p15.VnD(), w24);
END();
if (CAN_RUN()) {
RUN();
// 64-bit operations preserve their high bits.
ASSERT_EQUAL_64(placeholder_high + 42 - p0_b_count, x0);
ASSERT_EQUAL_64(placeholder_high + 42 + p0_h_count, x1);
// 32-bit operations sign-extend into their high bits.
ASSERT_EQUAL_64(42 - p0_s_count, x2);
ASSERT_EQUAL_64(42 + p0_d_count, x3);
ASSERT_EQUAL_64(0xffffffff00000000 | (1 - p0_s_count), x4);
ASSERT_EQUAL_64(p0_d_count - 1, x5);
// Check that saturation behaves correctly.
ASSERT_EQUAL_64(INT64_MIN, x10);
ASSERT_EQUAL_64(INT32_MIN, x11);
ASSERT_EQUAL_64(1 - p0_s_count, x12);
ASSERT_EQUAL_64(1 - p0_d_count, x13);
ASSERT_EQUAL_64(INT64_MAX, x14);
ASSERT_EQUAL_64(INT32_MAX, x15);
ASSERT_EQUAL_64(p0_s_count - 1, x18);
ASSERT_EQUAL_64(p0_d_count - 1, x19);
ASSERT_EQUAL_64(-1 - p0_b_count, x20);
// Check all-true predicates.
ASSERT_EQUAL_64(-core.GetSVELaneCount(kBRegSize), x21);
ASSERT_EQUAL_64(core.GetSVELaneCount(kHRegSize), x22);
ASSERT_EQUAL_64(-core.GetSVELaneCount(kSRegSize), x23);
ASSERT_EQUAL_64(core.GetSVELaneCount(kDRegSize), x24);
}
}
TEST_SVE(sve_uqinc_uqdec_p_scalar) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
Initialise(&masm, p0.VnB(), p0_inputs);
int p0_b_count = 9;
int p0_h_count = 5;
int p0_s_count = 3;
int p0_d_count = 2;
uint64_t placeholder_high = 0x1234567800000000;
// 64-bit operations preserve their high bits.
__ Mov(x0, placeholder_high + 42);
__ Uqdecp(x0, p0.VnB());
__ Mov(x1, placeholder_high + 42);
__ Uqincp(x1, p0.VnH());
// 32-bit operations zero-extend into their high bits.
__ Mov(x2, placeholder_high + 42);
__ Uqdecp(x2, p0.VnS(), w2);
__ Mov(x3, placeholder_high + 42);
__ Uqincp(x3, p0.VnD(), w3);
__ Mov(x4, placeholder_high + 0x80000001);
__ Uqdecp(x4, p0.VnS(), w4);
__ Mov(x5, placeholder_high + 0x7fffffff);
__ Uqincp(x5, p0.VnD(), w5);
// Check that saturation behaves correctly.
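// `Uqdecp` saturates at zero; `Uqincp` saturates at UINT64_MAX (or UINT32_MAX
// for the W form).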
__ Mov(x10, 1);
__ Uqdecp(x10, p0.VnB(), x10);
__ Mov(x11, placeholder_high + 1);
__ Uqdecp(x11, p0.VnH(), w11);
__ Mov(x12, 0x8000000000000000); // INT64_MAX + 1
__ Uqdecp(x12, p0.VnS(), x12);
__ Mov(x13, placeholder_high + 0x80000000); // INT32_MAX + 1
__ Uqdecp(x13, p0.VnD(), w13);
__ Mov(x14, 0xfffffffffffffffe); // UINT64_MAX - 1
__ Uqincp(x14, p0.VnB(), x14);
__ Mov(x15, placeholder_high + 0xfffffffe); // UINT32_MAX - 1
__ Uqincp(x15, p0.VnH(), w15);
// Don't use x16 and x17 since they are scratch registers by default.
__ Mov(x18, 0x7ffffffffffffffe); // INT64_MAX - 1
__ Uqincp(x18, p0.VnS(), x18);
__ Mov(x19, placeholder_high + 0x7ffffffe); // INT32_MAX - 1
__ Uqincp(x19, p0.VnD(), w19);
// With an all-true predicate, these instructions increment or decrement by
// the vector length.
__ Ptrue(p15.VnB());
__ Mov(x20, 0x4000000000000000);
__ Uqdecp(x20, p15.VnB(), x20);
__ Mov(x21, 0x4000000000000000);
__ Uqincp(x21, p15.VnH(), x21);
__ Mov(x22, placeholder_high + 0x40000000);
__ Uqdecp(x22, p15.VnS(), w22);
__ Mov(x23, placeholder_high + 0x40000000);
__ Uqincp(x23, p15.VnD(), w23);
END();
if (CAN_RUN()) {
RUN();
// 64-bit operations preserve their high bits.
ASSERT_EQUAL_64(placeholder_high + 42 - p0_b_count, x0);
ASSERT_EQUAL_64(placeholder_high + 42 + p0_h_count, x1);
// 32-bit operations zero-extend into their high bits.
ASSERT_EQUAL_64(42 - p0_s_count, x2);
ASSERT_EQUAL_64(42 + p0_d_count, x3);
ASSERT_EQUAL_64(UINT64_C(0x80000001) - p0_s_count, x4);
ASSERT_EQUAL_64(UINT64_C(0x7fffffff) + p0_d_count, x5);
// Check that saturation behaves correctly.
ASSERT_EQUAL_64(0, x10);
ASSERT_EQUAL_64(0, x11);
ASSERT_EQUAL_64(0x8000000000000000 - p0_s_count, x12);
ASSERT_EQUAL_64(UINT64_C(0x80000000) - p0_d_count, x13);
ASSERT_EQUAL_64(UINT64_MAX, x14);
ASSERT_EQUAL_64(UINT32_MAX, x15);
ASSERT_EQUAL_64(0x7ffffffffffffffe + p0_s_count, x18);
ASSERT_EQUAL_64(UINT64_C(0x7ffffffe) + p0_d_count, x19);
// Check all-true predicates.
ASSERT_EQUAL_64(0x4000000000000000 - core.GetSVELaneCount(kBRegSize), x20);
ASSERT_EQUAL_64(0x4000000000000000 + core.GetSVELaneCount(kHRegSize), x21);
ASSERT_EQUAL_64(0x40000000 - core.GetSVELaneCount(kSRegSize), x22);
ASSERT_EQUAL_64(0x40000000 + core.GetSVELaneCount(kDRegSize), x23);
}
}
TEST_SVE(sve_inc_dec_p_vector) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// There are {5, 3, 2} active {H, S, D} lanes. B-sized lanes are ignored.
int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
Initialise(&masm, p0.VnB(), p0_inputs);
// Check that saturation does not occur.
int64_t z0_inputs[] = {0x1234567800000042, 0, 1, INT64_MIN};
InsrHelper(&masm, z0.VnD(), z0_inputs);
int64_t z1_inputs[] = {0x12345678ffffff2a, 0, -1, INT64_MAX};
InsrHelper(&masm, z1.VnD(), z1_inputs);
int32_t z2_inputs[] = {0x12340042, 0, -1, 1, INT32_MAX, INT32_MIN};
InsrHelper(&masm, z2.VnS(), z2_inputs);
int16_t z3_inputs[] = {0x122a, 0, 1, -1, INT16_MIN, INT16_MAX};
InsrHelper(&masm, z3.VnH(), z3_inputs);
// The MacroAssembler implements non-destructive operations using movprfx.
__ Decp(z10.VnD(), p0, z0.VnD());
__ Decp(z11.VnD(), p0, z1.VnD());
__ Decp(z12.VnS(), p0, z2.VnS());
__ Decp(z13.VnH(), p0, z3.VnH());
__ Incp(z14.VnD(), p0, z0.VnD());
__ Incp(z15.VnD(), p0, z1.VnD());
__ Incp(z16.VnS(), p0, z2.VnS());
__ Incp(z17.VnH(), p0, z3.VnH());
// Also test destructive forms.
__ Mov(z4, z0);
__ Mov(z5, z1);
__ Mov(z6, z2);
__ Mov(z7, z3);
__ Decp(z0.VnD(), p0);
__ Decp(z1.VnD(), p0);
__ Decp(z2.VnS(), p0);
__ Decp(z3.VnH(), p0);
__ Incp(z4.VnD(), p0);
__ Incp(z5.VnD(), p0);
__ Incp(z6.VnS(), p0);
__ Incp(z7.VnH(), p0);
END();
if (CAN_RUN()) {
RUN();
// z0_inputs[...] - number of active D lanes (2)
int64_t z0_expected[] = {0x1234567800000040, -2, -1, 0x7ffffffffffffffe};
ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
// z1_inputs[...] - number of active D lanes (2)
int64_t z1_expected[] = {0x12345678ffffff28, -2, -3, 0x7ffffffffffffffd};
ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
// z2_inputs[...] - number of active S lanes (3)
int32_t z2_expected[] = {0x1234003f, -3, -4, -2, 0x7ffffffc, 0x7ffffffd};
ASSERT_EQUAL_SVE(z2_expected, z2.VnS());
// z3_inputs[...] - number of active H lanes (5)
int16_t z3_expected[] = {0x1225, -5, -4, -6, 0x7ffb, 0x7ffa};
ASSERT_EQUAL_SVE(z3_expected, z3.VnH());
// z0_inputs[...] + number of active D lanes (2)
uint64_t z4_expected[] = {0x1234567800000044, 2, 3, 0x8000000000000002};
ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
// z1_inputs[...] + number of active D lanes (2)
uint64_t z5_expected[] = {0x12345678ffffff2c, 2, 1, 0x8000000000000001};
ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
// z2_inputs[...] + number of active S lanes (3)
uint32_t z6_expected[] = {0x12340045, 3, 2, 4, 0x80000002, 0x80000003};
ASSERT_EQUAL_SVE(z6_expected, z6.VnS());
// z3_inputs[...] + number of active H lanes (5)
uint16_t z7_expected[] = {0x122f, 5, 6, 4, 0x8005, 0x8004};
ASSERT_EQUAL_SVE(z7_expected, z7.VnH());
// Check that the non-destructive macros produced the same results.
ASSERT_EQUAL_SVE(z0_expected, z10.VnD());
ASSERT_EQUAL_SVE(z1_expected, z11.VnD());
ASSERT_EQUAL_SVE(z2_expected, z12.VnS());
ASSERT_EQUAL_SVE(z3_expected, z13.VnH());
ASSERT_EQUAL_SVE(z4_expected, z14.VnD());
ASSERT_EQUAL_SVE(z5_expected, z15.VnD());
ASSERT_EQUAL_SVE(z6_expected, z16.VnS());
ASSERT_EQUAL_SVE(z7_expected, z17.VnH());
}
}
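// The non-saturating forms wrap modulo the lane width. A minimal sketch of
// the per-lane arithmetic checked above (illustrative only; a negative
// `count` models the decrementing forms):
static uint64_t IncpLaneModel(uint64_t lane, uint64_t lane_mask, int count) {
  // E.g. incrementing INT64_MIN by 2 wraps to 0x8000000000000002, as in
  // z4_expected above.
  return (lane + count) & lane_mask;
}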
TEST_SVE(sve_inc_dec_ptrue_vector) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// With an all-true predicate, these instructions increment or decrement by
// the vector length.
__ Ptrue(p15.VnB());
__ Dup(z0.VnD(), 0);
__ Decp(z0.VnD(), p15);
__ Dup(z1.VnS(), 0);
__ Decp(z1.VnS(), p15);
__ Dup(z2.VnH(), 0);
__ Decp(z2.VnH(), p15);
__ Dup(z3.VnD(), 0);
__ Incp(z3.VnD(), p15);
__ Dup(z4.VnS(), 0);
__ Incp(z4.VnS(), p15);
__ Dup(z5.VnH(), 0);
__ Incp(z5.VnH(), p15);
END();
if (CAN_RUN()) {
RUN();
int d_lane_count = core.GetSVELaneCount(kDRegSize);
int s_lane_count = core.GetSVELaneCount(kSRegSize);
int h_lane_count = core.GetSVELaneCount(kHRegSize);
for (int i = 0; i < d_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE(-d_lane_count, z0.VnD(), i);
ASSERT_EQUAL_SVE_LANE(d_lane_count, z3.VnD(), i);
}
for (int i = 0; i < s_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE(-s_lane_count, z1.VnS(), i);
ASSERT_EQUAL_SVE_LANE(s_lane_count, z4.VnS(), i);
}
for (int i = 0; i < h_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE(-h_lane_count, z2.VnH(), i);
ASSERT_EQUAL_SVE_LANE(h_lane_count, z5.VnH(), i);
}
}
}
TEST_SVE(sve_sqinc_sqdec_p_vector) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// There are {5, 3, 2} active {H, S, D} lanes. B-sized lanes are ignored.
int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
Initialise(&masm, p0.VnB(), p0_inputs);
// Check that saturation behaves correctly.
int64_t z0_inputs[] = {0x1234567800000042, 0, 1, INT64_MIN};
InsrHelper(&masm, z0.VnD(), z0_inputs);
int64_t z1_inputs[] = {0x12345678ffffff2a, 0, -1, INT64_MAX};
InsrHelper(&masm, z1.VnD(), z1_inputs);
int32_t z2_inputs[] = {0x12340042, 0, -1, 1, INT32_MAX, INT32_MIN};
InsrHelper(&masm, z2.VnS(), z2_inputs);
int16_t z3_inputs[] = {0x122a, 0, 1, -1, INT16_MIN, INT16_MAX};
InsrHelper(&masm, z3.VnH(), z3_inputs);
// The MacroAssembler implements non-destructive operations using movprfx.
__ Sqdecp(z10.VnD(), p0, z0.VnD());
__ Sqdecp(z11.VnD(), p0, z1.VnD());
__ Sqdecp(z12.VnS(), p0, z2.VnS());
__ Sqdecp(z13.VnH(), p0, z3.VnH());
__ Sqincp(z14.VnD(), p0, z0.VnD());
__ Sqincp(z15.VnD(), p0, z1.VnD());
__ Sqincp(z16.VnS(), p0, z2.VnS());
__ Sqincp(z17.VnH(), p0, z3.VnH());
// Also test destructive forms.
__ Mov(z4, z0);
__ Mov(z5, z1);
__ Mov(z6, z2);
__ Mov(z7, z3);
__ Sqdecp(z0.VnD(), p0);
__ Sqdecp(z1.VnD(), p0);
__ Sqdecp(z2.VnS(), p0);
__ Sqdecp(z3.VnH(), p0);
__ Sqincp(z4.VnD(), p0);
__ Sqincp(z5.VnD(), p0);
__ Sqincp(z6.VnS(), p0);
__ Sqincp(z7.VnH(), p0);
END();
if (CAN_RUN()) {
RUN();
// z0_inputs[...] - number of active D lanes (2)
int64_t z0_expected[] = {0x1234567800000040, -2, -1, INT64_MIN};
ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
// z1_inputs[...] - number of active D lanes (2)
int64_t z1_expected[] = {0x12345678ffffff28, -2, -3, 0x7ffffffffffffffd};
ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
// z2_inputs[...] - number of active S lanes (3)
int32_t z2_expected[] = {0x1234003f, -3, -4, -2, 0x7ffffffc, INT32_MIN};
ASSERT_EQUAL_SVE(z2_expected, z2.VnS());
// z3_inputs[...] - number of active H lanes (5)
int16_t z3_expected[] = {0x1225, -5, -4, -6, INT16_MIN, 0x7ffa};
ASSERT_EQUAL_SVE(z3_expected, z3.VnH());
// z0_inputs[...] + number of active D lanes (2)
uint64_t z4_expected[] = {0x1234567800000044, 2, 3, 0x8000000000000002};
ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
// z1_inputs[...] + number of active D lanes (2)
uint64_t z5_expected[] = {0x12345678ffffff2c, 2, 1, INT64_MAX};
ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
// z2_inputs[...] + number of active S lanes (3)
uint32_t z6_expected[] = {0x12340045, 3, 2, 4, INT32_MAX, 0x80000003};
ASSERT_EQUAL_SVE(z6_expected, z6.VnS());
// z3_inputs[...] + number of active H lanes (5)
uint16_t z7_expected[] = {0x122f, 5, 6, 4, 0x8005, INT16_MAX};
ASSERT_EQUAL_SVE(z7_expected, z7.VnH());
// Check that the non-destructive macros produced the same results.
ASSERT_EQUAL_SVE(z0_expected, z10.VnD());
ASSERT_EQUAL_SVE(z1_expected, z11.VnD());
ASSERT_EQUAL_SVE(z2_expected, z12.VnS());
ASSERT_EQUAL_SVE(z3_expected, z13.VnH());
ASSERT_EQUAL_SVE(z4_expected, z14.VnD());
ASSERT_EQUAL_SVE(z5_expected, z15.VnD());
ASSERT_EQUAL_SVE(z6_expected, z16.VnS());
ASSERT_EQUAL_SVE(z7_expected, z17.VnH());
}
}
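// A sketch of the signed saturating arithmetic used by `sqincp`/`sqdecp`
// (illustrative only), shown for the D-sized form; narrower lanes clamp at
// their own signed bounds.
static int64_t SqAddSatModel(int64_t acc, int64_t delta) {
  if ((delta > 0) && (acc > (INT64_MAX - delta))) return INT64_MAX;
  if ((delta < 0) && (acc < (INT64_MIN - delta))) return INT64_MIN;
  return acc + delta;
}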
TEST_SVE(sve_sqinc_sqdec_ptrue_vector) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// With an all-true predicate, these instructions increment or decrement by
// the vector length.
__ Ptrue(p15.VnB());
__ Dup(z0.VnD(), 0);
__ Sqdecp(z0.VnD(), p15);
__ Dup(z1.VnS(), 0);
__ Sqdecp(z1.VnS(), p15);
__ Dup(z2.VnH(), 0);
__ Sqdecp(z2.VnH(), p15);
__ Dup(z3.VnD(), 0);
__ Sqincp(z3.VnD(), p15);
__ Dup(z4.VnS(), 0);
__ Sqincp(z4.VnS(), p15);
__ Dup(z5.VnH(), 0);
__ Sqincp(z5.VnH(), p15);
END();
if (CAN_RUN()) {
RUN();
int d_lane_count = core.GetSVELaneCount(kDRegSize);
int s_lane_count = core.GetSVELaneCount(kSRegSize);
int h_lane_count = core.GetSVELaneCount(kHRegSize);
for (int i = 0; i < d_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE(-d_lane_count, z0.VnD(), i);
ASSERT_EQUAL_SVE_LANE(d_lane_count, z3.VnD(), i);
}
for (int i = 0; i < s_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE(-s_lane_count, z1.VnS(), i);
ASSERT_EQUAL_SVE_LANE(s_lane_count, z4.VnS(), i);
}
for (int i = 0; i < h_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE(-h_lane_count, z2.VnH(), i);
ASSERT_EQUAL_SVE_LANE(h_lane_count, z5.VnH(), i);
}
}
}
TEST_SVE(sve_uqinc_uqdec_p_vector) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// There are {5, 3, 2} active {H, S, D} lanes. B-sized lanes are ignored.
int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
Initialise(&masm, p0.VnB(), p0_inputs);
// Check that saturation behaves correctly.
uint64_t z0_inputs[] = {0x1234567800000042, 0, 1, 0x8000000000000000};
InsrHelper(&masm, z0.VnD(), z0_inputs);
uint64_t z1_inputs[] = {0x12345678ffffff2a, 0, UINT64_MAX, INT64_MAX};
InsrHelper(&masm, z1.VnD(), z1_inputs);
uint32_t z2_inputs[] = {0x12340042, 0, UINT32_MAX, 1, INT32_MAX, 0x80000000};
InsrHelper(&masm, z2.VnS(), z2_inputs);
uint16_t z3_inputs[] = {0x122a, 0, 1, UINT16_MAX, 0x8000, INT16_MAX};
InsrHelper(&masm, z3.VnH(), z3_inputs);
// The MacroAssembler implements non-destructive operations using movprfx.
__ Uqdecp(z10.VnD(), p0, z0.VnD());
__ Uqdecp(z11.VnD(), p0, z1.VnD());
__ Uqdecp(z12.VnS(), p0, z2.VnS());
__ Uqdecp(z13.VnH(), p0, z3.VnH());
__ Uqincp(z14.VnD(), p0, z0.VnD());
__ Uqincp(z15.VnD(), p0, z1.VnD());
__ Uqincp(z16.VnS(), p0, z2.VnS());
__ Uqincp(z17.VnH(), p0, z3.VnH());
// Also test destructive forms.
__ Mov(z4, z0);
__ Mov(z5, z1);
__ Mov(z6, z2);
__ Mov(z7, z3);
__ Uqdecp(z0.VnD(), p0);
__ Uqdecp(z1.VnD(), p0);
__ Uqdecp(z2.VnS(), p0);
__ Uqdecp(z3.VnH(), p0);
__ Uqincp(z4.VnD(), p0);
__ Uqincp(z5.VnD(), p0);
__ Uqincp(z6.VnS(), p0);
__ Uqincp(z7.VnH(), p0);
END();
if (CAN_RUN()) {
RUN();
// z0_inputs[...] - number of active D lanes (2)
uint64_t z0_expected[] = {0x1234567800000040, 0, 0, 0x7ffffffffffffffe};
ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
// z1_inputs[...] - number of active D lanes (2)
uint64_t z1_expected[] = {0x12345678ffffff28,
0,
0xfffffffffffffffd,
0x7ffffffffffffffd};
ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
// z2_inputs[...] - number of active S lanes (3)
uint32_t z2_expected[] =
{0x1234003f, 0, 0xfffffffc, 0, 0x7ffffffc, 0x7ffffffd};
ASSERT_EQUAL_SVE(z2_expected, z2.VnS());
// z3_inputs[...] - number of active H lanes (5)
uint16_t z3_expected[] = {0x1225, 0, 0, 0xfffa, 0x7ffb, 0x7ffa};
ASSERT_EQUAL_SVE(z3_expected, z3.VnH());
// z0_inputs[...] + number of active D lanes (2)
uint64_t z4_expected[] = {0x1234567800000044, 2, 3, 0x8000000000000002};
ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
// z1_inputs[...] + number of active D lanes (2)
uint64_t z5_expected[] = {0x12345678ffffff2c,
2,
UINT64_MAX,
0x8000000000000001};
ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
// z2_inputs[...] + number of active S lanes (3)
uint32_t z6_expected[] =
{0x12340045, 3, UINT32_MAX, 4, 0x80000002, 0x80000003};
ASSERT_EQUAL_SVE(z6_expected, z6.VnS());
// z3_inputs[...] + number of active H lanes (5)
uint16_t z7_expected[] = {0x122f, 5, 6, UINT16_MAX, 0x8005, 0x8004};
ASSERT_EQUAL_SVE(z7_expected, z7.VnH());
// Check that the non-destructive macros produced the same results.
ASSERT_EQUAL_SVE(z0_expected, z10.VnD());
ASSERT_EQUAL_SVE(z1_expected, z11.VnD());
ASSERT_EQUAL_SVE(z2_expected, z12.VnS());
ASSERT_EQUAL_SVE(z3_expected, z13.VnH());
ASSERT_EQUAL_SVE(z4_expected, z14.VnD());
ASSERT_EQUAL_SVE(z5_expected, z15.VnD());
ASSERT_EQUAL_SVE(z6_expected, z16.VnS());
ASSERT_EQUAL_SVE(z7_expected, z17.VnH());
}
}
TEST_SVE(sve_uqinc_uqdec_ptrue_vector) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// With an all-true predicate, these instructions increment or decrement by
// the vector length.
__ Ptrue(p15.VnB());
__ Mov(x0, 0x1234567800000000);
__ Mov(x1, 0x12340000);
__ Mov(x2, 0x1200);
__ Dup(z0.VnD(), x0);
__ Uqdecp(z0.VnD(), p15);
__ Dup(z1.VnS(), x1);
__ Uqdecp(z1.VnS(), p15);
__ Dup(z2.VnH(), x2);
__ Uqdecp(z2.VnH(), p15);
__ Dup(z3.VnD(), x0);
__ Uqincp(z3.VnD(), p15);
__ Dup(z4.VnS(), x1);
__ Uqincp(z4.VnS(), p15);
__ Dup(z5.VnH(), x2);
__ Uqincp(z5.VnH(), p15);
END();
if (CAN_RUN()) {
RUN();
int d_lane_count = core.GetSVELaneCount(kDRegSize);
int s_lane_count = core.GetSVELaneCount(kSRegSize);
int h_lane_count = core.GetSVELaneCount(kHRegSize);
for (int i = 0; i < d_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE(0x1234567800000000 - d_lane_count, z0.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0x1234567800000000 + d_lane_count, z3.VnD(), i);
}
for (int i = 0; i < s_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE(0x12340000 - s_lane_count, z1.VnS(), i);
ASSERT_EQUAL_SVE_LANE(0x12340000 + s_lane_count, z4.VnS(), i);
}
for (int i = 0; i < h_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE(0x1200 - h_lane_count, z2.VnH(), i);
ASSERT_EQUAL_SVE_LANE(0x1200 + h_lane_count, z5.VnH(), i);
}
}
}
TEST_SVE(sve_index) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// Simple cases.
__ Index(z0.VnB(), 0, 1);
__ Index(z1.VnH(), 1, 1);
__ Index(z2.VnS(), 2, 1);
__ Index(z3.VnD(), 3, 1);
// Synthesised immediates.
__ Index(z4.VnB(), 42, -1);
__ Index(z5.VnH(), -1, 42);
__ Index(z6.VnS(), 42, 42);
// Register arguments.
__ Mov(x0, 42);
__ Mov(x1, -3);
__ Index(z10.VnD(), x0, x1);
__ Index(z11.VnB(), w0, w1);
// The register size should correspond to the lane size, but VIXL allows any
// register at least as big as the lane size.
__ Index(z12.VnB(), x0, x1);
__ Index(z13.VnH(), w0, x1);
__ Index(z14.VnS(), x0, w1);
// Integer overflow.
__ Index(z20.VnB(), UINT8_MAX - 2, 2);
__ Index(z21.VnH(), 7, -3);
__ Index(z22.VnS(), INT32_MAX - 2, 1);
__ Index(z23.VnD(), INT64_MIN + 6, -7);
END();
if (CAN_RUN()) {
RUN();
int b_lane_count = core.GetSVELaneCount(kBRegSize);
int h_lane_count = core.GetSVELaneCount(kHRegSize);
int s_lane_count = core.GetSVELaneCount(kSRegSize);
int d_lane_count = core.GetSVELaneCount(kDRegSize);
uint64_t b_mask = GetUintMask(kBRegSize);
uint64_t h_mask = GetUintMask(kHRegSize);
uint64_t s_mask = GetUintMask(kSRegSize);
uint64_t d_mask = GetUintMask(kDRegSize);
// Simple cases.
for (int i = 0; i < b_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE((0 + i) & b_mask, z0.VnB(), i);
}
for (int i = 0; i < h_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE((1 + i) & h_mask, z1.VnH(), i);
}
for (int i = 0; i < s_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE((2 + i) & s_mask, z2.VnS(), i);
}
for (int i = 0; i < d_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE((3 + i) & d_mask, z3.VnD(), i);
}
// Synthesised immediates.
for (int i = 0; i < b_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE((42 - i) & b_mask, z4.VnB(), i);
}
for (int i = 0; i < h_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE((-1 + (42 * i)) & h_mask, z5.VnH(), i);
}
for (int i = 0; i < s_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE((42 + (42 * i)) & s_mask, z6.VnS(), i);
}
// Register arguments.
for (int i = 0; i < d_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & d_mask, z10.VnD(), i);
}
for (int i = 0; i < b_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & b_mask, z11.VnB(), i);
}
for (int i = 0; i < b_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & b_mask, z12.VnB(), i);
}
for (int i = 0; i < h_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & h_mask, z13.VnH(), i);
}
for (int i = 0; i < s_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & s_mask, z14.VnS(), i);
}
// Integer overflow.
uint8_t expected_z20[] = {0x05, 0x03, 0x01, 0xff, 0xfd};
ASSERT_EQUAL_SVE(expected_z20, z20.VnB());
uint16_t expected_z21[] = {0xfffb, 0xfffe, 0x0001, 0x0004, 0x0007};
ASSERT_EQUAL_SVE(expected_z21, z21.VnH());
uint32_t expected_z22[] = {0x80000000, 0x7fffffff, 0x7ffffffe, 0x7ffffffd};
ASSERT_EQUAL_SVE(expected_z22, z22.VnS());
uint64_t expected_z23[] = {0x7fffffffffffffff, 0x8000000000000006};
ASSERT_EQUAL_SVE(expected_z23, z23.VnD());
}
}
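// A sketch of the `index` computation checked above (illustrative only):
// lane i holds start + (step * i), truncated to the lane width, so overflow
// simply wraps, as the last four expected arrays show.
static uint64_t IndexLaneModel(int64_t start, int64_t step, int i,
                               uint64_t lane_mask) {
  // Compute in unsigned arithmetic so that wrap-around is well defined.
  return (static_cast<uint64_t>(start) + (static_cast<uint64_t>(step) * i)) &
         lane_mask;
}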
TEST(sve_int_compare_count_and_limit_scalars) {
SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
__ Mov(w20, 0xfffffffd);
__ Mov(w21, 0xffffffff);
__ Whilele(p0.VnB(), w20, w21);
__ Mrs(x0, NZCV);
__ Whilele(p1.VnH(), w20, w21);
__ Mrs(x1, NZCV);
__ Mov(w20, 0xffffffff);
__ Mov(w21, 0x00000000);
__ Whilelt(p2.VnS(), w20, w21);
__ Mrs(x2, NZCV);
__ Whilelt(p3.VnD(), w20, w21);
__ Mrs(x3, NZCV);
__ Mov(w20, 0xfffffffd);
__ Mov(w21, 0xffffffff);
__ Whilels(p4.VnB(), w20, w21);
__ Mrs(x4, NZCV);
__ Whilels(p5.VnH(), w20, w21);
__ Mrs(x5, NZCV);
__ Mov(w20, 0xffffffff);
__ Mov(w21, 0x00000000);
__ Whilelo(p6.VnS(), w20, w21);
__ Mrs(x6, NZCV);
__ Whilelo(p7.VnD(), w20, w21);
__ Mrs(x7, NZCV);
__ Mov(x20, 0xfffffffffffffffd);
__ Mov(x21, 0xffffffffffffffff);
__ Whilele(p8.VnB(), x20, x21);
__ Mrs(x8, NZCV);
__ Whilele(p9.VnH(), x20, x21);
__ Mrs(x9, NZCV);
__ Mov(x20, 0xffffffffffffffff);
__ Mov(x21, 0x0000000000000000);
__ Whilelt(p10.VnS(), x20, x21);
__ Mrs(x10, NZCV);
__ Whilelt(p11.VnD(), x20, x21);
__ Mrs(x11, NZCV);
__ Mov(x20, 0xfffffffffffffffd);
__ Mov(x21, 0xffffffffffffffff);
__ Whilels(p12.VnB(), x20, x21);
__ Mrs(x12, NZCV);
__ Whilels(p13.VnH(), x20, x21);
__ Mrs(x13, NZCV);
__ Mov(x20, 0xffffffffffffffff);
__ Mov(x21, 0x0000000000000000);
__ Whilelo(p14.VnS(), x20, x21);
__ Mrs(x14, NZCV);
__ Whilelo(p15.VnD(), x20, x21);
__ Mrs(x15, NZCV);
END();
if (CAN_RUN()) {
RUN();
// 0b...00000000'00000111
int p0_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
ASSERT_EQUAL_SVE(p0_expected, p0.VnB());
// 0b...00000000'00010101
int p1_expected[] = {0, 0, 0, 0, 0, 1, 1, 1};
ASSERT_EQUAL_SVE(p1_expected, p1.VnH());
int p2_expected[] = {0x0, 0x0, 0x0, 0x1};
ASSERT_EQUAL_SVE(p2_expected, p2.VnS());
int p3_expected[] = {0x00, 0x01};
ASSERT_EQUAL_SVE(p3_expected, p3.VnD());
// 0b...11111111'11111111
int p4_expected[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
// 0b...01010101'01010101
int p5_expected[] = {1, 1, 1, 1, 1, 1, 1, 1};
ASSERT_EQUAL_SVE(p5_expected, p5.VnH());
int p6_expected[] = {0x0, 0x0, 0x0, 0x0};
ASSERT_EQUAL_SVE(p6_expected, p6.VnS());
int p7_expected[] = {0x00, 0x00};
ASSERT_EQUAL_SVE(p7_expected, p7.VnD());
// 0b...00000000'00000111
int p8_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
// 0b...00000000'00010101
int p9_expected[] = {0, 0, 0, 0, 0, 1, 1, 1};
ASSERT_EQUAL_SVE(p9_expected, p9.VnH());
int p10_expected[] = {0x0, 0x0, 0x0, 0x1};
ASSERT_EQUAL_SVE(p10_expected, p10.VnS());
int p11_expected[] = {0x00, 0x01};
ASSERT_EQUAL_SVE(p11_expected, p11.VnD());
// 0b...11111111'11111111
int p12_expected[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
ASSERT_EQUAL_SVE(p12_expected, p12.VnB());
// 0b...01010101'01010101
int p13_expected[] = {1, 1, 1, 1, 1, 1, 1, 1};
ASSERT_EQUAL_SVE(p13_expected, p13.VnH());
int p14_expected[] = {0x0, 0x0, 0x0, 0x0};
ASSERT_EQUAL_SVE(p14_expected, p14.VnS());
int p15_expected[] = {0x00, 0x00};
ASSERT_EQUAL_SVE(p15_expected, p15.VnD());
ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w0);
ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w1);
ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w2);
ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w3);
ASSERT_EQUAL_32(SVEFirstFlag, w4);
ASSERT_EQUAL_32(SVEFirstFlag, w5);
ASSERT_EQUAL_32(SVENoneFlag | SVENotLastFlag, w6);
ASSERT_EQUAL_32(SVENoneFlag | SVENotLastFlag, w7);
ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w8);
ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w9);
ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w10);
ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w11);
ASSERT_EQUAL_32(SVEFirstFlag, w12);
ASSERT_EQUAL_32(SVEFirstFlag, w13);
ASSERT_EQUAL_32(SVENoneFlag | SVENotLastFlag, w14);
ASSERT_EQUAL_32(SVENoneFlag | SVENotLastFlag, w15);
}
}
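// A summary of the while-family semantics checked above (illustrative): lane
// i of the result is active iff the start value, incremented i times, still
// compares true against the limit; for `whilele`, lane i is active iff
// (start + i) <= limit, with the increment wrapping at the register width.
// The regression test below relies on that wrapping behaviour.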
TEST(sve_int_compare_count_and_limit_scalars_regression_test) {
SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
__ Mov(w0, 0x7ffffffd);
__ Mov(w1, 0x7fffffff);
__ Whilele(p0.VnB(), w0, w1);
END();
if (CAN_RUN()) {
RUN();
int p0_expected[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
ASSERT_EQUAL_SVE(p0_expected, p0.VnB());
}
}
TEST(sve_int_compare_vectors_signed_imm) {
SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int z13_inputs[] = {0, 1, -1, -15, 126, -127, -126, -15};
int mask_inputs1[] = {1, 1, 1, 0, 1, 1, 1, 1};
InsrHelper(&masm, z13.VnB(), z13_inputs);
Initialise(&masm, p0.VnB(), mask_inputs1);
__ Cmpeq(p2.VnB(), p0.Zeroing(), z13.VnB(), -15);
__ Mrs(x2, NZCV);
__ Cmpeq(p3.VnB(), p0.Zeroing(), z13.VnB(), -127);
int z14_inputs[] = {0, 1, -1, -32767, -32766, 32767, 32766, 0};
int mask_inputs2[] = {1, 1, 1, 0, 1, 1, 1, 1};
InsrHelper(&masm, z14.VnH(), z14_inputs);
Initialise(&masm, p0.VnH(), mask_inputs2);
__ Cmpge(p4.VnH(), p0.Zeroing(), z14.VnH(), -1);
__ Mrs(x4, NZCV);
__ Cmpge(p5.VnH(), p0.Zeroing(), z14.VnH(), -32767);
int z15_inputs[] = {0, 1, -1, INT_MIN};
int mask_inputs3[] = {0, 1, 1, 1};
InsrHelper(&masm, z15.VnS(), z15_inputs);
Initialise(&masm, p0.VnS(), mask_inputs3);
__ Cmpgt(p6.VnS(), p0.Zeroing(), z15.VnS(), 0);
__ Mrs(x6, NZCV);
__ Cmpgt(p7.VnS(), p0.Zeroing(), z15.VnS(), INT_MIN + 1);
__ Cmplt(p8.VnS(), p0.Zeroing(), z15.VnS(), 0);
__ Mrs(x8, NZCV);
__ Cmplt(p9.VnS(), p0.Zeroing(), z15.VnS(), INT_MIN + 1);
int64_t z16_inputs[] = {0, -1};
int mask_inputs4[] = {1, 1};
InsrHelper(&masm, z16.VnD(), z16_inputs);
Initialise(&masm, p0.VnD(), mask_inputs4);
__ Cmple(p10.VnD(), p0.Zeroing(), z16.VnD(), -1);
__ Mrs(x10, NZCV);
__ Cmple(p11.VnD(), p0.Zeroing(), z16.VnD(), LLONG_MIN);
__ Cmpne(p12.VnD(), p0.Zeroing(), z16.VnD(), -1);
__ Mrs(x12, NZCV);
__ Cmpne(p13.VnD(), p0.Zeroing(), z16.VnD(), LLONG_MAX);
END();
if (CAN_RUN()) {
RUN();
int p2_expected[] = {0, 0, 0, 0, 0, 0, 0, 1};
ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
int p3_expected[] = {0, 0, 0, 0, 0, 1, 0, 0};
ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
int p4_expected[] = {0x1, 0x1, 0x1, 0x0, 0x0, 0x1, 0x1, 0x1};
ASSERT_EQUAL_SVE(p4_expected, p4.VnH());
int p5_expected[] = {0x1, 0x1, 0x1, 0x0, 0x1, 0x1, 0x1, 0x1};
ASSERT_EQUAL_SVE(p5_expected, p5.VnH());
int p6_expected[] = {0x0, 0x1, 0x0, 0x0};
ASSERT_EQUAL_SVE(p6_expected, p6.VnS());
int p7_expected[] = {0x0, 0x1, 0x1, 0x0};
ASSERT_EQUAL_SVE(p7_expected, p7.VnS());
int p8_expected[] = {0x0, 0x0, 0x1, 0x1};
ASSERT_EQUAL_SVE(p8_expected, p8.VnS());
int p9_expected[] = {0x0, 0x0, 0x0, 0x1};
ASSERT_EQUAL_SVE(p9_expected, p9.VnS());
int p10_expected[] = {0x00, 0x01};
ASSERT_EQUAL_SVE(p10_expected, p10.VnD());
int p11_expected[] = {0x00, 0x00};
ASSERT_EQUAL_SVE(p11_expected, p11.VnD());
int p12_expected[] = {0x01, 0x00};
ASSERT_EQUAL_SVE(p12_expected, p12.VnD());
int p13_expected[] = {0x01, 0x01};
ASSERT_EQUAL_SVE(p13_expected, p13.VnD());
ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w2);
ASSERT_EQUAL_32(SVEFirstFlag, w4);
ASSERT_EQUAL_32(NoFlag, w6);
ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w8);
ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w10);
ASSERT_EQUAL_32(NoFlag, w12);
}
}
TEST(sve_int_compare_vectors_unsigned_imm) {
SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint32_t src1_inputs[] = {0xf7, 0x0f, 0x8f, 0x1f, 0x83, 0x12, 0x00, 0xf1};
int mask_inputs1[] = {1, 1, 1, 0, 1, 1, 0, 1};
InsrHelper(&masm, z13.VnB(), src1_inputs);
Initialise(&masm, p0.VnB(), mask_inputs1);
__ Cmphi(p2.VnB(), p0.Zeroing(), z13.VnB(), 0x0f);
__ Mrs(x2, NZCV);
__ Cmphi(p3.VnB(), p0.Zeroing(), z13.VnB(), 0xf0);
uint32_t src2_inputs[] = {0xffff, 0x8000, 0x1fff, 0x0000, 0x1234};
int mask_inputs2[] = {1, 1, 1, 1, 0};
InsrHelper(&masm, z13.VnH(), src2_inputs);
Initialise(&masm, p0.VnH(), mask_inputs2);
__ Cmphs(p4.VnH(), p0.Zeroing(), z13.VnH(), 0x1f);
__ Mrs(x4, NZCV);
__ Cmphs(p5.VnH(), p0.Zeroing(), z13.VnH(), 0x1fff);
uint32_t src3_inputs[] = {0xffffffff, 0xfedcba98, 0x0000ffff, 0x00000000};
int mask_inputs3[] = {1, 1, 1, 1};
InsrHelper(&masm, z13.VnS(), src3_inputs);
Initialise(&masm, p0.VnS(), mask_inputs3);
__ Cmplo(p6.VnS(), p0.Zeroing(), z13.VnS(), 0x3f);
__ Mrs(x6, NZCV);
__ Cmplo(p7.VnS(), p0.Zeroing(), z13.VnS(), 0x3f3f3f3f);
uint64_t src4_inputs[] = {0xffffffffffffffff, 0x0000000000000000};
int mask_inputs4[] = {1, 1};
InsrHelper(&masm, z13.VnD(), src4_inputs);
Initialise(&masm, p0.VnD(), mask_inputs4);
__ Cmpls(p8.VnD(), p0.Zeroing(), z13.VnD(), 0x2f);
__ Mrs(x8, NZCV);
__ Cmpls(p9.VnD(), p0.Zeroing(), z13.VnD(), 0x800000000000000);
END();
if (CAN_RUN()) {
RUN();
int p2_expected[] = {1, 0, 1, 0, 1, 1, 0, 1};
ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
int p3_expected[] = {1, 0, 0, 0, 0, 0, 0, 1};
ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
int p4_expected[] = {0x1, 0x1, 0x1, 0x0, 0x0};
ASSERT_EQUAL_SVE(p4_expected, p4.VnH());
int p5_expected[] = {0x1, 0x1, 0x1, 0x0, 0x0};
ASSERT_EQUAL_SVE(p5_expected, p5.VnH());
int p6_expected[] = {0x0, 0x0, 0x0, 0x1};
ASSERT_EQUAL_SVE(p6_expected, p6.VnS());
int p7_expected[] = {0x0, 0x0, 0x1, 0x1};
ASSERT_EQUAL_SVE(p7_expected, p7.VnS());
int p8_expected[] = {0x00, 0x01};
ASSERT_EQUAL_SVE(p8_expected, p8.VnD());
int p9_expected[] = {0x00, 0x01};
ASSERT_EQUAL_SVE(p9_expected, p9.VnD());
ASSERT_EQUAL_32(SVEFirstFlag, w2);
ASSERT_EQUAL_32(NoFlag, w4);
ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w6);
ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w8);
}
}
TEST(sve_int_compare_conditionally_terminate_scalars) {
SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
__ Mov(x0, 0xfedcba9887654321);
__ Mov(x1, 0x1000100010001000);
// Initialise Z and C. These are preserved by cterm*, and the V flag is set to
// !C if the condition does not hold.
__ Mov(x10, NoFlag);
__ Msr(NZCV, x10);
__ Ctermeq(w0, w0);
__ Mrs(x2, NZCV);
__ Ctermeq(x0, x1);
__ Mrs(x3, NZCV);
__ Ctermne(x0, x0);
__ Mrs(x4, NZCV);
__ Ctermne(w0, w1);
__ Mrs(x5, NZCV);
// As above, but with all flags initially set.
__ Mov(x10, NZCVFlag);
__ Msr(NZCV, x10);
__ Ctermeq(w0, w0);
__ Mrs(x6, NZCV);
__ Ctermeq(x0, x1);
__ Mrs(x7, NZCV);
__ Ctermne(x0, x0);
__ Mrs(x8, NZCV);
__ Ctermne(w0, w1);
__ Mrs(x9, NZCV);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_32(SVEFirstFlag, w2);
ASSERT_EQUAL_32(VFlag, w3);
ASSERT_EQUAL_32(VFlag, w4);
ASSERT_EQUAL_32(SVEFirstFlag, w5);
ASSERT_EQUAL_32(SVEFirstFlag | ZCFlag, w6);
ASSERT_EQUAL_32(ZCFlag, w7);
ASSERT_EQUAL_32(ZCFlag, w8);
ASSERT_EQUAL_32(SVEFirstFlag | ZCFlag, w9);
}
}
// Work out what the architectural `PredTest` pseudocode should produce for the
// given result and governing predicate.
template <typename Tg, typename Td, int N>
static StatusFlags GetPredTestFlags(const Td (&pd)[N],
const Tg (&pg)[N],
int vl) {
int first = -1;
int last = -1;
bool any_active = false;
// Only consider potentially-active lanes.
int start = (N > vl) ? (N - vl) : 0;
for (int i = start; i < N; i++) {
if ((pg[i] & 1) == 1) {
// Look for the first and last active lanes.
// Note that these arrays list the highest-numbered lane first, so the
// architecturally-'first' lane is the one with the highest array index.
if (last < 0) last = i;
first = i;
// Look for any active lanes that are also active in pd.
if ((pd[i] & 1) == 1) any_active = true;
}
}
uint32_t flags = 0;
if ((first >= 0) && ((pd[first] & 1) == 1)) flags |= SVEFirstFlag;
if (!any_active) flags |= SVENoneFlag;
if ((last < 0) || ((pd[last] & 1) == 0)) flags |= SVENotLastFlag;
return static_cast<StatusFlags>(flags);
}
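// For example (writing lane 0 last, as the test arrays below do), with
// pg = {1, 0, 1} and pd = {0, 0, 1}: the first active lane of pg (lane 0)
// is active in pd, so SVEFirstFlag is set; at least one active lane of pg
// is active in pd, so SVENoneFlag is clear; the last active lane of pg
// (lane 2) is inactive in pd, so SVENotLastFlag is set.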
typedef void (MacroAssembler::*PfirstPnextFn)(const PRegisterWithLaneSize& pd,
const PRegister& pg,
const PRegisterWithLaneSize& pn);
template <typename Tg, typename Tn, typename Td>
static void PfirstPnextHelper(Test* config,
PfirstPnextFn macro,
unsigned lane_size_in_bits,
const Tg& pg_inputs,
const Tn& pn_inputs,
const Td& pd_expected) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
PRegister pg = p15;
PRegister pn = p14;
Initialise(&masm, pg.WithLaneSize(lane_size_in_bits), pg_inputs);
Initialise(&masm, pn.WithLaneSize(lane_size_in_bits), pn_inputs);
// Initialise NZCV to an impossible value, to check that we actually write it.
__ Mov(x10, NZCVFlag);
// If pd.Is(pn), the MacroAssembler simply passes the arguments directly to
// the Assembler.
__ Msr(NZCV, x10);
__ Mov(p0, pn);
(masm.*macro)(p0.WithLaneSize(lane_size_in_bits),
pg,
p0.WithLaneSize(lane_size_in_bits));
__ Mrs(x0, NZCV);
// The MacroAssembler supports non-destructive use.
__ Msr(NZCV, x10);
(masm.*macro)(p1.WithLaneSize(lane_size_in_bits),
pg,
pn.WithLaneSize(lane_size_in_bits));
__ Mrs(x1, NZCV);
// If pd.Aliases(pg) the macro requires a scratch register.
{
UseScratchRegisterScope temps(&masm);
temps.Include(p13);
__ Msr(NZCV, x10);
__ Mov(p2, p15);
(masm.*macro)(p2.WithLaneSize(lane_size_in_bits),
p2,
pn.WithLaneSize(lane_size_in_bits));
__ Mrs(x2, NZCV);
}
END();
if (CAN_RUN()) {
RUN();
// Check that the inputs weren't modified.
ASSERT_EQUAL_SVE(pn_inputs, pn.WithLaneSize(lane_size_in_bits));
ASSERT_EQUAL_SVE(pg_inputs, pg.WithLaneSize(lane_size_in_bits));
// Check the primary operation.
ASSERT_EQUAL_SVE(pd_expected, p0.WithLaneSize(lane_size_in_bits));
ASSERT_EQUAL_SVE(pd_expected, p1.WithLaneSize(lane_size_in_bits));
ASSERT_EQUAL_SVE(pd_expected, p2.WithLaneSize(lane_size_in_bits));
// Check that the flags were properly set.
StatusFlags nzcv_expected =
GetPredTestFlags(pd_expected,
pg_inputs,
core.GetSVELaneCount(kBRegSize));
ASSERT_EQUAL_64(nzcv_expected, x0);
ASSERT_EQUAL_64(nzcv_expected, x1);
ASSERT_EQUAL_64(nzcv_expected, x2);
}
}
template <typename Tg, typename Tn, typename Td>
static void PfirstHelper(Test* config,
const Tg& pg_inputs,
const Tn& pn_inputs,
const Td& pd_expected) {
PfirstPnextHelper(config,
&MacroAssembler::Pfirst,
kBRegSize, // pfirst only accepts B-sized lanes.
pg_inputs,
pn_inputs,
pd_expected);
}
template <typename Tg, typename Tn, typename Td>
static void PnextHelper(Test* config,
unsigned lane_size_in_bits,
const Tg& pg_inputs,
const Tn& pn_inputs,
const Td& pd_expected) {
PfirstPnextHelper(config,
&MacroAssembler::Pnext,
lane_size_in_bits,
pg_inputs,
pn_inputs,
pd_expected);
}
TEST_SVE(sve_pfirst) {
// Provide more lanes than kPRegMinSize (to check propagation if we have a
// large VL), but few enough to make the test easy to read.
int in0[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int in1[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
int in2[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
int in3[] = {0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1};
int in4[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
VIXL_ASSERT(ArrayLength(in0) > kPRegMinSize);
// Pfirst finds the first active lane in pg, and activates the corresponding
// lane in pn (if it isn't already active).
// The first active lane in in1 is here. |
// v
int exp10[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
int exp12[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0};
int exp13[] = {0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1};
int exp14[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
PfirstHelper(config, in1, in0, exp10);
PfirstHelper(config, in1, in2, exp12);
PfirstHelper(config, in1, in3, exp13);
PfirstHelper(config, in1, in4, exp14);
// The first active lane in in2 is here. |
// v
int exp20[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0};
int exp21[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0};
int exp23[] = {0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1};
int exp24[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0};
PfirstHelper(config, in2, in0, exp20);
PfirstHelper(config, in2, in1, exp21);
PfirstHelper(config, in2, in3, exp23);
PfirstHelper(config, in2, in4, exp24);
// The first active lane in in3 is here. |
// v
int exp30[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
int exp31[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1};
int exp32[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1};
int exp34[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
PfirstHelper(config, in3, in0, exp30);
PfirstHelper(config, in3, in1, exp31);
PfirstHelper(config, in3, in2, exp32);
PfirstHelper(config, in3, in4, exp34);
// | The first active lane in in4 is here.
// v
int exp40[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp41[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
int exp42[] = {1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
int exp43[] = {1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1};
PfirstHelper(config, in4, in0, exp40);
PfirstHelper(config, in4, in1, exp41);
PfirstHelper(config, in4, in2, exp42);
PfirstHelper(config, in4, in3, exp43);
// If pg is all inactive, the input is passed through unchanged.
PfirstHelper(config, in0, in0, in0);
PfirstHelper(config, in0, in1, in1);
PfirstHelper(config, in0, in2, in2);
PfirstHelper(config, in0, in3, in3);
// If the values of pg and pn match, the value is passed through unchanged.
PfirstHelper(config, in0, in0, in0);
PfirstHelper(config, in1, in1, in1);
PfirstHelper(config, in2, in2, in2);
PfirstHelper(config, in3, in3, in3);
}
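// A minimal model of the `pfirst` dataflow (illustrative only). It operates
// on abstract lane indices, with lane 0 first; note that the test arrays
// above list the highest-numbered lane first.
static void PfirstModel(const int* pg, const int* pn, int* pd, int lanes) {
  for (int i = 0; i < lanes; i++) pd[i] = pn[i] & 1;
  for (int i = 0; i < lanes; i++) {
    if ((pg[i] & 1) == 1) {
      pd[i] = 1;  // Activate the first active lane of pg...
      break;      // ...and pass every other lane through from pn.
    }
  }
}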
TEST_SVE(sve_pfirst_alias) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// Check that the Simulator behaves correctly when all arguments are aliased.
int in_b[] = {0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0};
int in_h[] = {0, 0, 0, 0, 1, 1, 0, 0};
int in_s[] = {0, 1, 1, 0};
int in_d[] = {1, 1};
Initialise(&masm, p0.VnB(), in_b);
Initialise(&masm, p1.VnH(), in_h);
Initialise(&masm, p2.VnS(), in_s);
Initialise(&masm, p3.VnD(), in_d);
// Initialise NZCV to an impossible value, to check that we actually write it.
__ Mov(x10, NZCVFlag);
__ Msr(NZCV, x10);
__ Pfirst(p0.VnB(), p0, p0.VnB());
__ Mrs(x0, NZCV);
__ Msr(NZCV, x10);
__ Pfirst(p1.VnB(), p1, p1.VnB());
__ Mrs(x1, NZCV);
__ Msr(NZCV, x10);
__ Pfirst(p2.VnB(), p2, p2.VnB());
__ Mrs(x2, NZCV);
__ Msr(NZCV, x10);
__ Pfirst(p3.VnB(), p3, p3.VnB());
__ Mrs(x3, NZCV);
END();
if (CAN_RUN()) {
RUN();
// The first lane from pg is already active in pdn, so the P register should
// be unchanged.
ASSERT_EQUAL_SVE(in_b, p0.VnB());
ASSERT_EQUAL_SVE(in_h, p1.VnH());
ASSERT_EQUAL_SVE(in_s, p2.VnS());
ASSERT_EQUAL_SVE(in_d, p3.VnD());
ASSERT_EQUAL_64(SVEFirstFlag, x0);
ASSERT_EQUAL_64(SVEFirstFlag, x1);
ASSERT_EQUAL_64(SVEFirstFlag, x2);
ASSERT_EQUAL_64(SVEFirstFlag, x3);
}
}
TEST_SVE(sve_pnext_b) {
// TODO: Once we have the infrastructure, provide more lanes than kPRegMinSize
// (to check propagation if we have a large VL), but few enough to make the
// test easy to read.
// For now, we just use kPRegMinSize so that the test works anywhere.
int in0[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int in1[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
int in2[] = {0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
int in3[] = {0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1};
int in4[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
// Pnext activates the next element that is true in pg, after the last-active
// element in pn. If all pn elements are false (as in in0), it starts looking
// at element 0.
// There are no active lanes in in0, so the result is simply the first active
// lane from pg.
int exp00[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp10[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
int exp20[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0};
int exp30[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
int exp40[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
// The last active lane in in1 is here. |
// v
int exp01[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp11[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp21[] = {0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp31[] = {0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp41[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
// | The last active lane in in2 is here.
// v
int exp02[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp12[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp22[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp32[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp42[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
// | The last active lane in in3 is here.
// v
int exp03[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp13[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp23[] = {0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp33[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp43[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
// | The last active lane in in4 is here.
// v
int exp04[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp14[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp24[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp34[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp44[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
PnextHelper(config, kBRegSize, in0, in0, exp00);
PnextHelper(config, kBRegSize, in1, in0, exp10);
PnextHelper(config, kBRegSize, in2, in0, exp20);
PnextHelper(config, kBRegSize, in3, in0, exp30);
PnextHelper(config, kBRegSize, in4, in0, exp40);
PnextHelper(config, kBRegSize, in0, in1, exp01);
PnextHelper(config, kBRegSize, in1, in1, exp11);
PnextHelper(config, kBRegSize, in2, in1, exp21);
PnextHelper(config, kBRegSize, in3, in1, exp31);
PnextHelper(config, kBRegSize, in4, in1, exp41);
PnextHelper(config, kBRegSize, in0, in2, exp02);
PnextHelper(config, kBRegSize, in1, in2, exp12);
PnextHelper(config, kBRegSize, in2, in2, exp22);
PnextHelper(config, kBRegSize, in3, in2, exp32);
PnextHelper(config, kBRegSize, in4, in2, exp42);
PnextHelper(config, kBRegSize, in0, in3, exp03);
PnextHelper(config, kBRegSize, in1, in3, exp13);
PnextHelper(config, kBRegSize, in2, in3, exp23);
PnextHelper(config, kBRegSize, in3, in3, exp33);
PnextHelper(config, kBRegSize, in4, in3, exp43);
PnextHelper(config, kBRegSize, in0, in4, exp04);
PnextHelper(config, kBRegSize, in1, in4, exp14);
PnextHelper(config, kBRegSize, in2, in4, exp24);
PnextHelper(config, kBRegSize, in3, in4, exp34);
PnextHelper(config, kBRegSize, in4, in4, exp44);
}
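// A minimal model of the `pnext` dataflow (illustrative only), again over
// abstract lane indices with lane 0 first (the test arrays list the highest
// lane first).
static void PnextModel(const int* pg, const int* pn, int* pd, int lanes) {
  int last_active_in_pn = -1;
  for (int i = 0; i < lanes; i++) {
    if ((pn[i] & 1) == 1) last_active_in_pn = i;
  }
  for (int i = 0; i < lanes; i++) pd[i] = 0;
  for (int i = last_active_in_pn + 1; i < lanes; i++) {
    if ((pg[i] & 1) == 1) {
      pd[i] = 1;  // Only the next active lane of pg becomes active.
      break;
    }
  }
}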
TEST_SVE(sve_pnext_h) {
// TODO: Once we have the infrastructure, provide more lanes than kPRegMinSize
// (to check propagation if we have a large VL), but few enough to make the
// test easy to read.
// For now, we just use kPRegMinSize so that the test works anywhere.
int in0[] = {0, 0, 0, 0, 0, 0, 0, 0};
int in1[] = {0, 0, 0, 1, 0, 2, 1, 0};
int in2[] = {0, 1, 2, 0, 2, 0, 2, 0};
int in3[] = {0, 0, 0, 3, 0, 0, 0, 3};
int in4[] = {3, 0, 0, 0, 0, 0, 0, 0};
// Pnext activates the next element that is true in pg, after the last-active
// element in pn. If all pn elements are false (as in in0), it starts looking
// at element 0.
//
// As for other SVE instructions, elements are only considered to be active if
// the _first_ bit in each field is one. Other bits are ignored.
// There are no active lanes in in0, so the result is simply the first active
// lane from pg.
int exp00[] = {0, 0, 0, 0, 0, 0, 0, 0};
int exp10[] = {0, 0, 0, 0, 0, 0, 1, 0};
int exp20[] = {0, 1, 0, 0, 0, 0, 0, 0};
int exp30[] = {0, 0, 0, 0, 0, 0, 0, 1};
int exp40[] = {1, 0, 0, 0, 0, 0, 0, 0};
// | The last active lane in in1 is here.
// v
int exp01[] = {0, 0, 0, 0, 0, 0, 0, 0};
int exp11[] = {0, 0, 0, 0, 0, 0, 0, 0};
int exp21[] = {0, 1, 0, 0, 0, 0, 0, 0};
int exp31[] = {0, 0, 0, 0, 0, 0, 0, 0};
int exp41[] = {1, 0, 0, 0, 0, 0, 0, 0};
// | The last active lane in in2 is here.
// v
int exp02[] = {0, 0, 0, 0, 0, 0, 0, 0};
int exp12[] = {0, 0, 0, 0, 0, 0, 0, 0};
int exp22[] = {0, 0, 0, 0, 0, 0, 0, 0};
int exp32[] = {0, 0, 0, 0, 0, 0, 0, 0};
int exp42[] = {1, 0, 0, 0, 0, 0, 0, 0};
// | The last active lane in in3 is here.
// v
int exp03[] = {0, 0, 0, 0, 0, 0, 0, 0};
int exp13[] = {0, 0, 0, 0, 0, 0, 0, 0};
int exp23[] = {0, 1, 0, 0, 0, 0, 0, 0};
int exp33[] = {0, 0, 0, 0, 0, 0, 0, 0};
int exp43[] = {1, 0, 0, 0, 0, 0, 0, 0};
// | The last active lane in in4 is here.
// v
int exp04[] = {0, 0, 0, 0, 0, 0, 0, 0};
int exp14[] = {0, 0, 0, 0, 0, 0, 0, 0};
int exp24[] = {0, 0, 0, 0, 0, 0, 0, 0};
int exp34[] = {0, 0, 0, 0, 0, 0, 0, 0};
int exp44[] = {0, 0, 0, 0, 0, 0, 0, 0};
PnextHelper(config, kHRegSize, in0, in0, exp00);
PnextHelper(config, kHRegSize, in1, in0, exp10);
PnextHelper(config, kHRegSize, in2, in0, exp20);
PnextHelper(config, kHRegSize, in3, in0, exp30);
PnextHelper(config, kHRegSize, in4, in0, exp40);
PnextHelper(config, kHRegSize, in0, in1, exp01);
PnextHelper(config, kHRegSize, in1, in1, exp11);
PnextHelper(config, kHRegSize, in2, in1, exp21);
PnextHelper(config, kHRegSize, in3, in1, exp31);
PnextHelper(config, kHRegSize, in4, in1, exp41);
PnextHelper(config, kHRegSize, in0, in2, exp02);
PnextHelper(config, kHRegSize, in1, in2, exp12);
PnextHelper(config, kHRegSize, in2, in2, exp22);
PnextHelper(config, kHRegSize, in3, in2, exp32);
PnextHelper(config, kHRegSize, in4, in2, exp42);
PnextHelper(config, kHRegSize, in0, in3, exp03);
PnextHelper(config, kHRegSize, in1, in3, exp13);
PnextHelper(config, kHRegSize, in2, in3, exp23);
PnextHelper(config, kHRegSize, in3, in3, exp33);
PnextHelper(config, kHRegSize, in4, in3, exp43);
PnextHelper(config, kHRegSize, in0, in4, exp04);
PnextHelper(config, kHRegSize, in1, in4, exp14);
PnextHelper(config, kHRegSize, in2, in4, exp24);
PnextHelper(config, kHRegSize, in3, in4, exp34);
PnextHelper(config, kHRegSize, in4, in4, exp44);
}
TEST_SVE(sve_pnext_s) {
// TODO: Once we have the infrastructure, provide more lanes than kPRegMinSize
// (to check propagation if we have a large VL), but few enough to make the
// test easy to read.
// For now, we just use kPRegMinSize so that the test works anywhere.
int in0[] = {0xe, 0xc, 0x8, 0x0};
int in1[] = {0x0, 0x2, 0x0, 0x1};
int in2[] = {0x0, 0x1, 0xf, 0x0};
int in3[] = {0xf, 0x0, 0x0, 0x0};
// Pnext activates the next element that is true in pg, after the last-active
// element in pn. If all pn elements are false (as in in0), it starts looking
// at element 0.
//
// As for other SVE instructions, elements are only considered to be active if
// the _first_ bit in each field is one. Other bits are ignored.
// There are no active lanes in in0, so the result is simply the first active
// lane from pg.
int exp00[] = {0, 0, 0, 0};
int exp10[] = {0, 0, 0, 1};
int exp20[] = {0, 0, 1, 0};
int exp30[] = {1, 0, 0, 0};
// | The last active lane in in1 is here.
// v
int exp01[] = {0, 0, 0, 0};
int exp11[] = {0, 0, 0, 0};
int exp21[] = {0, 0, 1, 0};
int exp31[] = {1, 0, 0, 0};
// | The last active lane in in2 is here.
// v
int exp02[] = {0, 0, 0, 0};
int exp12[] = {0, 0, 0, 0};
int exp22[] = {0, 0, 0, 0};
int exp32[] = {1, 0, 0, 0};
// | The last active lane in in3 is here.
// v
int exp03[] = {0, 0, 0, 0};
int exp13[] = {0, 0, 0, 0};
int exp23[] = {0, 0, 0, 0};
int exp33[] = {0, 0, 0, 0};
PnextHelper(config, kSRegSize, in0, in0, exp00);
PnextHelper(config, kSRegSize, in1, in0, exp10);
PnextHelper(config, kSRegSize, in2, in0, exp20);
PnextHelper(config, kSRegSize, in3, in0, exp30);
PnextHelper(config, kSRegSize, in0, in1, exp01);
PnextHelper(config, kSRegSize, in1, in1, exp11);
PnextHelper(config, kSRegSize, in2, in1, exp21);
PnextHelper(config, kSRegSize, in3, in1, exp31);
PnextHelper(config, kSRegSize, in0, in2, exp02);
PnextHelper(config, kSRegSize, in1, in2, exp12);
PnextHelper(config, kSRegSize, in2, in2, exp22);
PnextHelper(config, kSRegSize, in3, in2, exp32);
PnextHelper(config, kSRegSize, in0, in3, exp03);
PnextHelper(config, kSRegSize, in1, in3, exp13);
PnextHelper(config, kSRegSize, in2, in3, exp23);
PnextHelper(config, kSRegSize, in3, in3, exp33);
}
TEST_SVE(sve_pnext_d) {
// TODO: Once we have the infrastructure, provide more lanes than kPRegMinSize
// (to check propagation if we have a large VL), but few enough to make the
// test easy to read.
// For now, we just use kPRegMinSize so that the test works anywhere.
int in0[] = {0xfe, 0xf0};
int in1[] = {0x00, 0x55};
int in2[] = {0x33, 0xff};
// Pnext activates the next element that is true in pg, after the last-active
// element in pn. If all pn elements are false (as in in0), it starts looking
// at element 0.
//
// As for other SVE instructions, elements are only considered to be active if
// the _first_ bit in each field is one. Other bits are ignored.
// There are no active lanes in in0, so the result is simply the first active
// lane from pg.
int exp00[] = {0, 0};
int exp10[] = {0, 1};
int exp20[] = {0, 1};
// | The last active lane in in1 is here.
// v
int exp01[] = {0, 0};
int exp11[] = {0, 0};
int exp21[] = {1, 0};
// | The last active lane in in2 is here.
// v
int exp02[] = {0, 0};
int exp12[] = {0, 0};
int exp22[] = {0, 0};
PnextHelper(config, kDRegSize, in0, in0, exp00);
PnextHelper(config, kDRegSize, in1, in0, exp10);
PnextHelper(config, kDRegSize, in2, in0, exp20);
PnextHelper(config, kDRegSize, in0, in1, exp01);
PnextHelper(config, kDRegSize, in1, in1, exp11);
PnextHelper(config, kDRegSize, in2, in1, exp21);
PnextHelper(config, kDRegSize, in0, in2, exp02);
PnextHelper(config, kDRegSize, in1, in2, exp12);
PnextHelper(config, kDRegSize, in2, in2, exp22);
}
TEST_SVE(sve_pnext_alias) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// Check that the Simulator behaves correctly when all arguments are aliased.
int in_b[] = {0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0};
int in_h[] = {0, 0, 0, 0, 1, 1, 0, 0};
int in_s[] = {0, 1, 1, 0};
int in_d[] = {1, 1};
Initialise(&masm, p0.VnB(), in_b);
Initialise(&masm, p1.VnH(), in_h);
Initialise(&masm, p2.VnS(), in_s);
Initialise(&masm, p3.VnD(), in_d);
// Initialise NZCV to an impossible value, to check that we actually write it.
__ Mov(x10, NZCVFlag);
__ Msr(NZCV, x10);
__ Pnext(p0.VnB(), p0, p0.VnB());
__ Mrs(x0, NZCV);
__ Msr(NZCV, x10);
__ Pnext(p1.VnB(), p1, p1.VnB());
__ Mrs(x1, NZCV);
__ Msr(NZCV, x10);
__ Pnext(p2.VnB(), p2, p2.VnB());
__ Mrs(x2, NZCV);
__ Msr(NZCV, x10);
__ Pnext(p3.VnB(), p3, p3.VnB());
__ Mrs(x3, NZCV);
END();
if (CAN_RUN()) {
RUN();
// Since pg.Is(pdn), there can be no active lanes in pg above the last
// active lane in pdn, so the result should always be zero.
ASSERT_EQUAL_SVE(0, p0.VnB());
ASSERT_EQUAL_SVE(0, p1.VnH());
ASSERT_EQUAL_SVE(0, p2.VnS());
ASSERT_EQUAL_SVE(0, p3.VnD());
ASSERT_EQUAL_64(SVENoneFlag | SVENotLastFlag, x0);
ASSERT_EQUAL_64(SVENoneFlag | SVENotLastFlag, x1);
ASSERT_EQUAL_64(SVENoneFlag | SVENotLastFlag, x2);
ASSERT_EQUAL_64(SVENoneFlag | SVENotLastFlag, x3);
}
}
static void PtrueHelper(Test* config,
unsigned lane_size_in_bits,
FlagsUpdate s = LeaveFlags) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
PRegisterWithLaneSize p[kNumberOfPRegisters];
for (unsigned i = 0; i < kNumberOfPRegisters; i++) {
p[i] = PRegister(i).WithLaneSize(lane_size_in_bits);
}
// Initialise NZCV to an impossible value, to check that we actually write it.
StatusFlags nzcv_unmodified = NZCVFlag;
__ Mov(x20, nzcv_unmodified);
// We don't have enough registers to conveniently test every pattern, so take
// samples from each group.
__ Msr(NZCV, x20);
__ Ptrue(p[0], SVE_POW2, s);
__ Mrs(x0, NZCV);
__ Msr(NZCV, x20);
__ Ptrue(p[1], SVE_VL1, s);
__ Mrs(x1, NZCV);
__ Msr(NZCV, x20);
__ Ptrue(p[2], SVE_VL2, s);
__ Mrs(x2, NZCV);
__ Msr(NZCV, x20);
__ Ptrue(p[3], SVE_VL5, s);
__ Mrs(x3, NZCV);
__ Msr(NZCV, x20);
__ Ptrue(p[4], SVE_VL6, s);
__ Mrs(x4, NZCV);
__ Msr(NZCV, x20);
__ Ptrue(p[5], SVE_VL8, s);
__ Mrs(x5, NZCV);
__ Msr(NZCV, x20);
__ Ptrue(p[6], SVE_VL16, s);
__ Mrs(x6, NZCV);
__ Msr(NZCV, x20);
__ Ptrue(p[7], SVE_VL64, s);
__ Mrs(x7, NZCV);
__ Msr(NZCV, x20);
__ Ptrue(p[8], SVE_VL256, s);
__ Mrs(x8, NZCV);
{
// We have to use the Assembler to use values not defined by
// SVEPredicateConstraint, so call `ptrue` (or `ptrues`, when setting flags)
// directly.
typedef void (
MacroAssembler::*AssemblePtrueFn)(const PRegisterWithLaneSize& pd,
int pattern);
AssemblePtrueFn assemble = &MacroAssembler::ptrue;
if (s == SetFlags) {
assemble = &MacroAssembler::ptrues;
}
ExactAssemblyScope guard(&masm, 12 * kInstructionSize);
__ msr(NZCV, x20);
(masm.*assemble)(p[9], 0xe);
__ mrs(x9, NZCV);
__ msr(NZCV, x20);
(masm.*assemble)(p[10], 0x16);
__ mrs(x10, NZCV);
__ msr(NZCV, x20);
(masm.*assemble)(p[11], 0x1a);
__ mrs(x11, NZCV);
__ msr(NZCV, x20);
(masm.*assemble)(p[12], 0x1c);
__ mrs(x12, NZCV);
}
__ Msr(NZCV, x20);
__ Ptrue(p[13], SVE_MUL4, s);
__ Mrs(x13, NZCV);
__ Msr(NZCV, x20);
__ Ptrue(p[14], SVE_MUL3, s);
__ Mrs(x14, NZCV);
__ Msr(NZCV, x20);
__ Ptrue(p[15], SVE_ALL, s);
__ Mrs(x15, NZCV);
END();
if (CAN_RUN()) {
RUN();
int all = core.GetSVELaneCount(lane_size_in_bits);
int pow2 = 1 << HighestSetBitPosition(all);
int mul4 = all - (all % 4);
int mul3 = all - (all % 3);
// Check P register results.
for (int i = 0; i < all; i++) {
ASSERT_EQUAL_SVE_LANE(i < pow2, p[0], i);
ASSERT_EQUAL_SVE_LANE((all >= 1) && (i < 1), p[1], i);
ASSERT_EQUAL_SVE_LANE((all >= 2) && (i < 2), p[2], i);
ASSERT_EQUAL_SVE_LANE((all >= 5) && (i < 5), p[3], i);
ASSERT_EQUAL_SVE_LANE((all >= 6) && (i < 6), p[4], i);
ASSERT_EQUAL_SVE_LANE((all >= 8) && (i < 8), p[5], i);
ASSERT_EQUAL_SVE_LANE((all >= 16) && (i < 16), p[6], i);
ASSERT_EQUAL_SVE_LANE((all >= 64) && (i < 64), p[7], i);
ASSERT_EQUAL_SVE_LANE((all >= 256) && (i < 256), p[8], i);
ASSERT_EQUAL_SVE_LANE(false, p[9], i);
ASSERT_EQUAL_SVE_LANE(false, p[10], i);
ASSERT_EQUAL_SVE_LANE(false, p[11], i);
ASSERT_EQUAL_SVE_LANE(false, p[12], i);
ASSERT_EQUAL_SVE_LANE(i < mul4, p[13], i);
ASSERT_EQUAL_SVE_LANE(i < mul3, p[14], i);
ASSERT_EQUAL_SVE_LANE(true, p[15], i);
}
// Check NZCV results.
if (s == LeaveFlags) {
// No flags should have been updated.
for (int i = 0; i <= 15; i++) {
ASSERT_EQUAL_64(nzcv_unmodified, XRegister(i));
}
} else {
StatusFlags zero = static_cast<StatusFlags>(SVENoneFlag | SVENotLastFlag);
StatusFlags nonzero = SVEFirstFlag;
// POW2
ASSERT_EQUAL_64(nonzero, x0);
// VL*
ASSERT_EQUAL_64((all >= 1) ? nonzero : zero, x1);
ASSERT_EQUAL_64((all >= 2) ? nonzero : zero, x2);
ASSERT_EQUAL_64((all >= 5) ? nonzero : zero, x3);
ASSERT_EQUAL_64((all >= 6) ? nonzero : zero, x4);
ASSERT_EQUAL_64((all >= 8) ? nonzero : zero, x5);
ASSERT_EQUAL_64((all >= 16) ? nonzero : zero, x6);
ASSERT_EQUAL_64((all >= 64) ? nonzero : zero, x7);
ASSERT_EQUAL_64((all >= 256) ? nonzero : zero, x8);
// #uimm5
ASSERT_EQUAL_64(zero, x9);
ASSERT_EQUAL_64(zero, x10);
ASSERT_EQUAL_64(zero, x11);
ASSERT_EQUAL_64(zero, x12);
// MUL*
ASSERT_EQUAL_64((all >= 4) ? nonzero : zero, x13);
ASSERT_EQUAL_64((all >= 3) ? nonzero : zero, x14);
// ALL
ASSERT_EQUAL_64(nonzero, x15);
}
}
}
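// For example, with 128-bit vectors and H-sized lanes, `all` is 8: SVE_POW2
// gives 8 active lanes, SVE_VL5 gives 5, SVE_VL16 gives 0 (the vector is too
// short), SVE_MUL3 gives 6 and SVE_MUL4 gives 8.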
TEST_SVE(sve_ptrue_b) { PtrueHelper(config, kBRegSize, LeaveFlags); }
TEST_SVE(sve_ptrue_h) { PtrueHelper(config, kHRegSize, LeaveFlags); }
TEST_SVE(sve_ptrue_s) { PtrueHelper(config, kSRegSize, LeaveFlags); }
TEST_SVE(sve_ptrue_d) { PtrueHelper(config, kDRegSize, LeaveFlags); }
TEST_SVE(sve_ptrues_b) { PtrueHelper(config, kBRegSize, SetFlags); }
TEST_SVE(sve_ptrues_h) { PtrueHelper(config, kHRegSize, SetFlags); }
TEST_SVE(sve_ptrues_s) { PtrueHelper(config, kSRegSize, SetFlags); }
TEST_SVE(sve_ptrues_d) { PtrueHelper(config, kDRegSize, SetFlags); }
TEST_SVE(sve_pfalse) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// Initialise non-zero inputs.
__ Ptrue(p0.VnB());
__ Ptrue(p1.VnH());
__ Ptrue(p2.VnS());
__ Ptrue(p3.VnD());
// The instruction only supports B-sized lanes, but the lane size has no
// logical effect, so the MacroAssembler accepts anything.
__ Pfalse(p0.VnB());
__ Pfalse(p1.VnH());
__ Pfalse(p2.VnS());
__ Pfalse(p3.VnD());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(0, p0.VnB());
ASSERT_EQUAL_SVE(0, p1.VnB());
ASSERT_EQUAL_SVE(0, p2.VnB());
ASSERT_EQUAL_SVE(0, p3.VnB());
}
}
TEST_SVE(sve_ptest) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// Initialise NZCV to a known (impossible) value.
StatusFlags nzcv_unmodified = NZCVFlag;
__ Mov(x0, nzcv_unmodified);
__ Msr(NZCV, x0);
// Construct some test inputs.
int in2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0};
int in3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0};
int in4[] = {0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0};
__ Pfalse(p0.VnB());
__ Ptrue(p1.VnB());
Initialise(&masm, p2.VnB(), in2);
Initialise(&masm, p3.VnB(), in3);
Initialise(&masm, p4.VnB(), in4);
// All-inactive pg.
__ Ptest(p0, p0.VnB());
__ Mrs(x0, NZCV);
__ Ptest(p0, p1.VnB());
__ Mrs(x1, NZCV);
__ Ptest(p0, p2.VnB());
__ Mrs(x2, NZCV);
__ Ptest(p0, p3.VnB());
__ Mrs(x3, NZCV);
__ Ptest(p0, p4.VnB());
__ Mrs(x4, NZCV);
// All-active pg.
__ Ptest(p1, p0.VnB());
__ Mrs(x5, NZCV);
__ Ptest(p1, p1.VnB());
__ Mrs(x6, NZCV);
__ Ptest(p1, p2.VnB());
__ Mrs(x7, NZCV);
__ Ptest(p1, p3.VnB());
__ Mrs(x8, NZCV);
__ Ptest(p1, p4.VnB());
__ Mrs(x9, NZCV);
// Combinations of other inputs.
__ Ptest(p2, p2.VnB());
__ Mrs(x20, NZCV);
__ Ptest(p2, p3.VnB());
__ Mrs(x21, NZCV);
__ Ptest(p2, p4.VnB());
__ Mrs(x22, NZCV);
__ Ptest(p3, p2.VnB());
__ Mrs(x23, NZCV);
__ Ptest(p3, p3.VnB());
__ Mrs(x24, NZCV);
__ Ptest(p3, p4.VnB());
__ Mrs(x25, NZCV);
__ Ptest(p4, p2.VnB());
__ Mrs(x26, NZCV);
__ Ptest(p4, p3.VnB());
__ Mrs(x27, NZCV);
__ Ptest(p4, p4.VnB());
__ Mrs(x28, NZCV);
END();
if (CAN_RUN()) {
RUN();
StatusFlags zero = static_cast<StatusFlags>(SVENoneFlag | SVENotLastFlag);
// If pg is all inactive, the value of pn is irrelevant.
ASSERT_EQUAL_64(zero, x0);
ASSERT_EQUAL_64(zero, x1);
ASSERT_EQUAL_64(zero, x2);
ASSERT_EQUAL_64(zero, x3);
ASSERT_EQUAL_64(zero, x4);
// All-active pg.
ASSERT_EQUAL_64(zero, x5); // All-inactive pn.
ASSERT_EQUAL_64(SVEFirstFlag, x6); // All-active pn.
// Other pn inputs are non-zero, but the first and last lanes are inactive.
ASSERT_EQUAL_64(SVENotLastFlag, x7);
ASSERT_EQUAL_64(SVENotLastFlag, x8);
ASSERT_EQUAL_64(SVENotLastFlag, x9);
// Other inputs.
ASSERT_EQUAL_64(SVEFirstFlag, x20); // pg: in2, pn: in2
ASSERT_EQUAL_64(NoFlag, x21); // pg: in2, pn: in3
ASSERT_EQUAL_64(zero, x22); // pg: in2, pn: in4
ASSERT_EQUAL_64(static_cast<StatusFlags>(SVEFirstFlag | SVENotLastFlag),
x23); // pg: in3, pn: in2
ASSERT_EQUAL_64(SVEFirstFlag, x24); // pg: in3, pn: in3
ASSERT_EQUAL_64(zero, x25); // pg: in3, pn: in4
ASSERT_EQUAL_64(zero, x26); // pg: in4, pn: in2
ASSERT_EQUAL_64(zero, x27); // pg: in4, pn: in3
ASSERT_EQUAL_64(SVEFirstFlag, x28); // pg: in4, pn: in4
}
}
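// For reference, the SVE flag names used in these tests alias the regular
// NZCV bits (SVEFirstFlag is N, SVENoneFlag is Z and SVENotLastFlag is C),
// which is why values read from NZCV with `Mrs` can be compared against them
// directly.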
TEST_SVE(sve_cntp) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// There are {7, 5, 2, 1} active {B, H, S, D} lanes.
int p0_inputs[] = {0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0};
Initialise(&masm, p0.VnB(), p0_inputs);
// With an all-true predicate, these instructions measure the vector length.
__ Ptrue(p10.VnB());
__ Ptrue(p11.VnH());
__ Ptrue(p12.VnS());
__ Ptrue(p13.VnD());
// `ptrue p10.b` provides an all-active pg.
__ Cntp(x10, p10, p10.VnB());
__ Cntp(x11, p10, p11.VnH());
__ Cntp(x12, p10, p12.VnS());
__ Cntp(x13, p10, p13.VnD());
// Check that the predicate mask is applied properly.
__ Cntp(x14, p10, p10.VnB());
__ Cntp(x15, p11, p10.VnB());
__ Cntp(x16, p12, p10.VnB());
__ Cntp(x17, p13, p10.VnB());
// Check other patterns (including some ignored bits).
__ Cntp(x0, p10, p0.VnB());
__ Cntp(x1, p10, p0.VnH());
__ Cntp(x2, p10, p0.VnS());
__ Cntp(x3, p10, p0.VnD());
__ Cntp(x4, p0, p10.VnB());
__ Cntp(x5, p0, p10.VnH());
__ Cntp(x6, p0, p10.VnS());
__ Cntp(x7, p0, p10.VnD());
END();
if (CAN_RUN()) {
RUN();
int vl_b = core.GetSVELaneCount(kBRegSize);
int vl_h = core.GetSVELaneCount(kHRegSize);
int vl_s = core.GetSVELaneCount(kSRegSize);
int vl_d = core.GetSVELaneCount(kDRegSize);
// Check all-active predicates in various combinations.
ASSERT_EQUAL_64(vl_b, x10);
ASSERT_EQUAL_64(vl_h, x11);
ASSERT_EQUAL_64(vl_s, x12);
ASSERT_EQUAL_64(vl_d, x13);
ASSERT_EQUAL_64(vl_b, x14);
ASSERT_EQUAL_64(vl_h, x15);
ASSERT_EQUAL_64(vl_s, x16);
ASSERT_EQUAL_64(vl_d, x17);
// Check that irrelevant bits are properly ignored.
ASSERT_EQUAL_64(7, x0);
ASSERT_EQUAL_64(5, x1);
ASSERT_EQUAL_64(2, x2);
ASSERT_EQUAL_64(1, x3);
ASSERT_EQUAL_64(7, x4);
ASSERT_EQUAL_64(5, x5);
ASSERT_EQUAL_64(2, x6);
ASSERT_EQUAL_64(1, x7);
}
}
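// A sketch of the `cntp` computation verified above (illustrative only):
// count the lanes that are active (first bit set) in both the governing
// predicate and the operand, at the operand's lane granularity.
static int CntpModel(const int* pg, const int* pn, int lanes) {
  int count = 0;
  for (int i = 0; i < lanes; i++) {
    if (((pg[i] & 1) == 1) && ((pn[i] & 1) == 1)) count++;
  }
  return count;
}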
typedef void (MacroAssembler::*CntFn)(const Register& dst,
int pattern,
int multiplier);
template <typename T>
void GenerateCntSequence(MacroAssembler* masm,
CntFn cnt,
T acc_value,
int multiplier) {
// Initialise accumulators.
masm->Mov(x0, acc_value);
masm->Mov(x1, acc_value);
masm->Mov(x2, acc_value);
masm->Mov(x3, acc_value);
masm->Mov(x4, acc_value);
masm->Mov(x5, acc_value);
masm->Mov(x6, acc_value);
masm->Mov(x7, acc_value);
masm->Mov(x8, acc_value);
masm->Mov(x9, acc_value);
masm->Mov(x10, acc_value);
masm->Mov(x11, acc_value);
masm->Mov(x12, acc_value);
masm->Mov(x13, acc_value);
masm->Mov(x14, acc_value);
masm->Mov(x15, acc_value);
masm->Mov(x18, acc_value);
masm->Mov(x19, acc_value);
masm->Mov(x20, acc_value);
masm->Mov(x21, acc_value);
(masm->*cnt)(Register(0, sizeof(T) * kBitsPerByte), SVE_POW2, multiplier);
(masm->*cnt)(Register(1, sizeof(T) * kBitsPerByte), SVE_VL1, multiplier);
(masm->*cnt)(Register(2, sizeof(T) * kBitsPerByte), SVE_VL2, multiplier);
(masm->*cnt)(Register(3, sizeof(T) * kBitsPerByte), SVE_VL3, multiplier);
(masm->*cnt)(Register(4, sizeof(T) * kBitsPerByte), SVE_VL4, multiplier);
(masm->*cnt)(Register(5, sizeof(T) * kBitsPerByte), SVE_VL5, multiplier);
(masm->*cnt)(Register(6, sizeof(T) * kBitsPerByte), SVE_VL6, multiplier);
(masm->*cnt)(Register(7, sizeof(T) * kBitsPerByte), SVE_VL7, multiplier);
(masm->*cnt)(Register(8, sizeof(T) * kBitsPerByte), SVE_VL8, multiplier);
(masm->*cnt)(Register(9, sizeof(T) * kBitsPerByte), SVE_VL16, multiplier);
(masm->*cnt)(Register(10, sizeof(T) * kBitsPerByte), SVE_VL32, multiplier);
(masm->*cnt)(Register(11, sizeof(T) * kBitsPerByte), SVE_VL64, multiplier);
(masm->*cnt)(Register(12, sizeof(T) * kBitsPerByte), SVE_VL128, multiplier);
(masm->*cnt)(Register(13, sizeof(T) * kBitsPerByte), SVE_VL256, multiplier);
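// Patterns 16, 23 and 28 are unallocated encodings, which select zero
// elements, so the corresponding results should match the initial
// accumulator values.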
(masm->*cnt)(Register(14, sizeof(T) * kBitsPerByte), 16, multiplier);
(masm->*cnt)(Register(15, sizeof(T) * kBitsPerByte), 23, multiplier);
(masm->*cnt)(Register(18, sizeof(T) * kBitsPerByte), 28, multiplier);
(masm->*cnt)(Register(19, sizeof(T) * kBitsPerByte), SVE_MUL4, multiplier);
(masm->*cnt)(Register(20, sizeof(T) * kBitsPerByte), SVE_MUL3, multiplier);
(masm->*cnt)(Register(21, sizeof(T) * kBitsPerByte), SVE_ALL, multiplier);
}
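// Return `fixed` if the vector holds at least `fixed` lanes, and zero
// otherwise: a fixed-size pattern (VL1-VL256) counts no lanes when the
// current vector length is too short for it.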
static int FixedVL(int fixed, int length) {
VIXL_ASSERT(((fixed >= 1) && (fixed <= 8)) || (fixed == 16) ||
(fixed == 32) || (fixed == 64) || (fixed == 128) ||
(fixed == 256));
return (length >= fixed) ? fixed : 0;
}
static void CntHelper(Test* config,
CntFn cnt,
int multiplier,
int lane_size_in_bits,
int64_t acc_value = 0,
bool is_increment = true) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
GenerateCntSequence(&masm, cnt, acc_value, multiplier);
END();
if (CAN_RUN()) {
RUN();
int all = core.GetSVELaneCount(lane_size_in_bits);
int pow2 = 1 << HighestSetBitPosition(all);
int mul4 = all - (all % 4);
int mul3 = all - (all % 3);
multiplier = is_increment ? multiplier : -multiplier;
ASSERT_EQUAL_64(acc_value + (multiplier * pow2), x0);
ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(1, all)), x1);
ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(2, all)), x2);
ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(3, all)), x3);
ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(4, all)), x4);
ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(5, all)), x5);
ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(6, all)), x6);
ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(7, all)), x7);
ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(8, all)), x8);
ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(16, all)), x9);
ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(32, all)), x10);
ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(64, all)), x11);
ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(128, all)), x12);
ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(256, all)), x13);
ASSERT_EQUAL_64(acc_value, x14);
ASSERT_EQUAL_64(acc_value, x15);
ASSERT_EQUAL_64(acc_value, x18);
ASSERT_EQUAL_64(acc_value + (multiplier * mul4), x19);
ASSERT_EQUAL_64(acc_value + (multiplier * mul3), x20);
ASSERT_EQUAL_64(acc_value + (multiplier * all), x21);
}
}
static void IncHelper(Test* config,
CntFn cnt,
int multiplier,
int lane_size_in_bits,
int64_t acc_value) {
CntHelper(config, cnt, multiplier, lane_size_in_bits, acc_value, true);
}
static void DecHelper(Test* config,
CntFn cnt,
int multiplier,
int lane_size_in_bits,
int64_t acc_value) {
CntHelper(config, cnt, multiplier, lane_size_in_bits, acc_value, false);
}
TEST_SVE(sve_cntb) {
CntHelper(config, &MacroAssembler::Cntb, 1, kBRegSize);
CntHelper(config, &MacroAssembler::Cntb, 2, kBRegSize);
CntHelper(config, &MacroAssembler::Cntb, 15, kBRegSize);
CntHelper(config, &MacroAssembler::Cntb, 16, kBRegSize);
}
TEST_SVE(sve_cnth) {
CntHelper(config, &MacroAssembler::Cnth, 1, kHRegSize);
CntHelper(config, &MacroAssembler::Cnth, 2, kHRegSize);
CntHelper(config, &MacroAssembler::Cnth, 15, kHRegSize);
CntHelper(config, &MacroAssembler::Cnth, 16, kHRegSize);
}
TEST_SVE(sve_cntw) {
CntHelper(config, &MacroAssembler::Cntw, 1, kWRegSize);
CntHelper(config, &MacroAssembler::Cntw, 2, kWRegSize);
CntHelper(config, &MacroAssembler::Cntw, 15, kWRegSize);
CntHelper(config, &MacroAssembler::Cntw, 16, kWRegSize);
}
TEST_SVE(sve_cntd) {
CntHelper(config, &MacroAssembler::Cntd, 1, kDRegSize);
CntHelper(config, &MacroAssembler::Cntd, 2, kDRegSize);
CntHelper(config, &MacroAssembler::Cntd, 15, kDRegSize);
CntHelper(config, &MacroAssembler::Cntd, 16, kDRegSize);
}
TEST_SVE(sve_decb) {
DecHelper(config, &MacroAssembler::Decb, 1, kBRegSize, 42);
DecHelper(config, &MacroAssembler::Decb, 2, kBRegSize, -1);
DecHelper(config, &MacroAssembler::Decb, 15, kBRegSize, INT64_MIN);
DecHelper(config, &MacroAssembler::Decb, 16, kBRegSize, -42);
}
TEST_SVE(sve_dech) {
DecHelper(config, &MacroAssembler::Dech, 1, kHRegSize, 42);
DecHelper(config, &MacroAssembler::Dech, 2, kHRegSize, -1);
DecHelper(config, &MacroAssembler::Dech, 15, kHRegSize, INT64_MIN);
DecHelper(config, &MacroAssembler::Dech, 16, kHRegSize, -42);
}
TEST_SVE(sve_decw) {
DecHelper(config, &MacroAssembler::Decw, 1, kWRegSize, 42);
DecHelper(config, &MacroAssembler::Decw, 2, kWRegSize, -1);
DecHelper(config, &MacroAssembler::Decw, 15, kWRegSize, INT64_MIN);
DecHelper(config, &MacroAssembler::Decw, 16, kWRegSize, -42);
}
TEST_SVE(sve_decd) {
DecHelper(config, &MacroAssembler::Decd, 1, kDRegSize, 42);
DecHelper(config, &MacroAssembler::Decd, 2, kDRegSize, -1);
DecHelper(config, &MacroAssembler::Decd, 15, kDRegSize, INT64_MIN);
DecHelper(config, &MacroAssembler::Decd, 16, kDRegSize, -42);
}
TEST_SVE(sve_incb) {
IncHelper(config, &MacroAssembler::Incb, 1, kBRegSize, 42);
IncHelper(config, &MacroAssembler::Incb, 2, kBRegSize, -1);
IncHelper(config, &MacroAssembler::Incb, 15, kBRegSize, INT64_MAX);
IncHelper(config, &MacroAssembler::Incb, 16, kBRegSize, -42);
}
TEST_SVE(sve_inch) {
IncHelper(config, &MacroAssembler::Inch, 1, kHRegSize, 42);
IncHelper(config, &MacroAssembler::Inch, 2, kHRegSize, -1);
IncHelper(config, &MacroAssembler::Inch, 15, kHRegSize, INT64_MAX);
IncHelper(config, &MacroAssembler::Inch, 16, kHRegSize, -42);
}
TEST_SVE(sve_incw) {
IncHelper(config, &MacroAssembler::Incw, 1, kWRegSize, 42);
IncHelper(config, &MacroAssembler::Incw, 2, kWRegSize, -1);
IncHelper(config, &MacroAssembler::Incw, 15, kWRegSize, INT64_MAX);
IncHelper(config, &MacroAssembler::Incw, 16, kWRegSize, -42);
}
TEST_SVE(sve_incd) {
IncHelper(config, &MacroAssembler::Incd, 1, kDRegSize, 42);
IncHelper(config, &MacroAssembler::Incd, 2, kDRegSize, -1);
IncHelper(config, &MacroAssembler::Incd, 15, kDRegSize, INT64_MAX);
IncHelper(config, &MacroAssembler::Incd, 16, kDRegSize, -42);
}
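// Reference saturating add for the sq/uq inc/dec tests: clamp x + y to the
// representable range of T without relying on overflow behaviour. For
// example, QAdd<uint8_t>(250, 10) returns 255, and QAdd<int8_t>(-120, -10)
// returns -128.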
template <typename T>
static T QAdd(T x, int y) {
VIXL_ASSERT(y > INT_MIN);
T result;
T min = std::numeric_limits<T>::min();
T max = std::numeric_limits<T>::max();
if ((x >= 0) && (y >= 0)) {
// For positive x and y, saturate at max.
result = (max - x) < static_cast<T>(y) ? max : x + y;
} else if ((y < 0) && ((x < 0) || (min == 0))) {
// For negative y, saturate at min when either x is negative or T is
// unsigned.
result = (x - min) < static_cast<T>(-y) ? min : x + y;
} else {
result = x + y;
}
return result;
}
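// Run GenerateCntSequence for a saturating inc/dec instruction and check
// every result against the QAdd reference above.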
template <typename T>
static void QIncDecHelper(Test* config,
CntFn cnt,
int multiplier,
int lane_size_in_bits,
T acc_value,
bool is_increment) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
GenerateCntSequence(&masm, cnt, acc_value, multiplier);
END();
if (CAN_RUN()) {
RUN();
int all = core.GetSVELaneCount(lane_size_in_bits);
int pow2 = 1 << HighestSetBitPosition(all);
int mul4 = all - (all % 4);
int mul3 = all - (all % 3);
multiplier = is_increment ? multiplier : -multiplier;
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * pow2), x0);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(1, all)), x1);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(2, all)), x2);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(3, all)), x3);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(4, all)), x4);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(5, all)), x5);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(6, all)), x6);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(7, all)), x7);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(8, all)), x8);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(16, all)), x9);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(32, all)), x10);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(64, all)), x11);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(128, all)), x12);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(256, all)), x13);
ASSERT_EQUAL_64(acc_value, x14);
ASSERT_EQUAL_64(acc_value, x15);
ASSERT_EQUAL_64(acc_value, x18);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * mul4), x19);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * mul3), x20);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * all), x21);
}
}
template <typename T>
static void QIncHelper(Test* config,
CntFn cnt,
int multiplier,
int lane_size_in_bits,
T acc_value) {
QIncDecHelper<T>(config, cnt, multiplier, lane_size_in_bits, acc_value, true);
}
template <typename T>
static void QDecHelper(Test* config,
CntFn cnt,
int multiplier,
int lane_size_in_bits,
T acc_value) {
QIncDecHelper<T>(config,
cnt,
multiplier,
lane_size_in_bits,
acc_value,
false);
}
TEST_SVE(sve_sqdecb) {
int64_t bigneg = INT64_MIN + 42;
int64_t bigpos = INT64_MAX - 42;
QDecHelper<int64_t>(config, &MacroAssembler::Sqdecb, 1, kBRegSize, 1);
QDecHelper<int64_t>(config, &MacroAssembler::Sqdecb, 2, kBRegSize, bigneg);
QDecHelper<int64_t>(config, &MacroAssembler::Sqdecb, 15, kBRegSize, 999);
QDecHelper<int64_t>(config, &MacroAssembler::Sqdecb, 16, kBRegSize, bigpos);
}
TEST_SVE(sve_sqdech) {
int64_t bigneg = INT64_MIN + 42;
int64_t bigpos = INT64_MAX - 42;
QDecHelper<int64_t>(config, &MacroAssembler::Sqdech, 1, kHRegSize, 1);
QDecHelper<int64_t>(config, &MacroAssembler::Sqdech, 2, kHRegSize, bigneg);
QDecHelper<int64_t>(config, &MacroAssembler::Sqdech, 15, kHRegSize, 999);
QDecHelper<int64_t>(config, &MacroAssembler::Sqdech, 16, kHRegSize, bigpos);
}
TEST_SVE(sve_sqdecw) {
int64_t bigneg = INT64_MIN + 42;
int64_t bigpos = INT64_MAX - 42;
QDecHelper<int64_t>(config, &MacroAssembler::Sqdecw, 1, kWRegSize, 1);
QDecHelper<int64_t>(config, &MacroAssembler::Sqdecw, 2, kWRegSize, bigneg);
QDecHelper<int64_t>(config, &MacroAssembler::Sqdecw, 15, kWRegSize, 999);
QDecHelper<int64_t>(config, &MacroAssembler::Sqdecw, 16, kWRegSize, bigpos);
}
TEST_SVE(sve_sqdecd) {
int64_t bigneg = INT64_MIN + 42;
int64_t bigpos = INT64_MAX - 42;
QDecHelper<int64_t>(config, &MacroAssembler::Sqdecd, 1, kDRegSize, 1);
QDecHelper<int64_t>(config, &MacroAssembler::Sqdecd, 2, kDRegSize, bigneg);
QDecHelper<int64_t>(config, &MacroAssembler::Sqdecd, 15, kDRegSize, 999);
QDecHelper<int64_t>(config, &MacroAssembler::Sqdecd, 16, kDRegSize, bigpos);
}
TEST_SVE(sve_sqincb) {
int64_t bigneg = INT64_MIN + 42;
int64_t bigpos = INT64_MAX - 42;
QIncHelper<int64_t>(config, &MacroAssembler::Sqincb, 1, kBRegSize, 1);
QIncHelper<int64_t>(config, &MacroAssembler::Sqincb, 2, kBRegSize, bigneg);
QIncHelper<int64_t>(config, &MacroAssembler::Sqincb, 15, kBRegSize, 999);
QIncHelper<int64_t>(config, &MacroAssembler::Sqincb, 16, kBRegSize, bigpos);
}
TEST_SVE(sve_sqinch) {
int64_t bigneg = INT64_MIN + 42;
int64_t bigpos = INT64_MAX - 42;
QIncHelper<int64_t>(config, &MacroAssembler::Sqinch, 1, kHRegSize, 1);
QIncHelper<int64_t>(config, &MacroAssembler::Sqinch, 2, kHRegSize, bigneg);
QIncHelper<int64_t>(config, &MacroAssembler::Sqinch, 15, kHRegSize, 999);
QIncHelper<int64_t>(config, &MacroAssembler::Sqinch, 16, kHRegSize, bigpos);
}
TEST_SVE(sve_sqincw) {
int64_t bigneg = INT64_MIN + 42;
int64_t bigpos = INT64_MAX - 42;
QIncHelper<int64_t>(config, &MacroAssembler::Sqincw, 1, kWRegSize, 1);
QIncHelper<int64_t>(config, &MacroAssembler::Sqincw, 2, kWRegSize, bigneg);
QIncHelper<int64_t>(config, &MacroAssembler::Sqincw, 15, kWRegSize, 999);
QIncHelper<int64_t>(config, &MacroAssembler::Sqincw, 16, kWRegSize, bigpos);
}
TEST_SVE(sve_sqincd) {
int64_t bigneg = INT64_MIN + 42;
int64_t bigpos = INT64_MAX - 42;
QIncHelper<int64_t>(config, &MacroAssembler::Sqincd, 1, kDRegSize, 1);
QIncHelper<int64_t>(config, &MacroAssembler::Sqincd, 2, kDRegSize, bigneg);
QIncHelper<int64_t>(config, &MacroAssembler::Sqincd, 15, kDRegSize, 999);
QIncHelper<int64_t>(config, &MacroAssembler::Sqincd, 16, kDRegSize, bigpos);
}
TEST_SVE(sve_uqdecb) {
uint32_t big32 = UINT32_MAX - 42;
uint64_t big64 = UINT64_MAX - 42;
QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecb, 1, kBRegSize, 1);
QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecb, 2, kBRegSize, 42);
QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecb, 15, kBRegSize, 999);
QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecb, 16, kBRegSize, big32);
QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecb, 1, kBRegSize, 1);
QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecb, 2, kBRegSize, 42);
QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecb, 15, kBRegSize, 999);
QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecb, 16, kBRegSize, big64);
}
TEST_SVE(sve_uqdech) {
uint32_t big32 = UINT32_MAX - 42;
uint64_t big64 = UINT64_MAX - 42;
QDecHelper<uint32_t>(config, &MacroAssembler::Uqdech, 1, kHRegSize, 1);
QDecHelper<uint32_t>(config, &MacroAssembler::Uqdech, 2, kHRegSize, 42);
QDecHelper<uint32_t>(config, &MacroAssembler::Uqdech, 15, kHRegSize, 999);
QDecHelper<uint32_t>(config, &MacroAssembler::Uqdech, 16, kHRegSize, big32);
QDecHelper<uint64_t>(config, &MacroAssembler::Uqdech, 1, kHRegSize, 1);
QDecHelper<uint64_t>(config, &MacroAssembler::Uqdech, 2, kHRegSize, 42);
QDecHelper<uint64_t>(config, &MacroAssembler::Uqdech, 15, kHRegSize, 999);
QDecHelper<uint64_t>(config, &MacroAssembler::Uqdech, 16, kHRegSize, big64);
}
TEST_SVE(sve_uqdecw) {
uint32_t big32 = UINT32_MAX - 42;
uint64_t big64 = UINT64_MAX - 42;
QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecw, 1, kWRegSize, 1);
QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecw, 2, kWRegSize, 42);
QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecw, 15, kWRegSize, 999);
QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecw, 16, kWRegSize, big32);
QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecw, 1, kWRegSize, 1);
QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecw, 2, kWRegSize, 42);
QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecw, 15, kWRegSize, 999);
QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecw, 16, kWRegSize, big64);
}
TEST_SVE(sve_uqdecd) {
uint32_t big32 = UINT32_MAX - 42;
uint64_t big64 = UINT64_MAX - 42;
QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecd, 1, kDRegSize, 1);
QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecd, 2, kDRegSize, 42);
QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecd, 15, kDRegSize, 999);
QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecd, 16, kDRegSize, big32);
QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecd, 1, kDRegSize, 1);
QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecd, 2, kDRegSize, 42);
QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecd, 15, kDRegSize, 999);
QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecd, 16, kDRegSize, big64);
}
TEST_SVE(sve_uqincb) {
uint32_t big32 = UINT32_MAX - 42;
uint64_t big64 = UINT64_MAX - 42;
QIncHelper<uint32_t>(config, &MacroAssembler::Uqincb, 1, kBRegSize, 1);
QIncHelper<uint32_t>(config, &MacroAssembler::Uqincb, 2, kBRegSize, 42);
QIncHelper<uint32_t>(config, &MacroAssembler::Uqincb, 15, kBRegSize, 999);
QIncHelper<uint32_t>(config, &MacroAssembler::Uqincb, 16, kBRegSize, big32);
QIncHelper<uint64_t>(config, &MacroAssembler::Uqincb, 1, kBRegSize, 1);
QIncHelper<uint64_t>(config, &MacroAssembler::Uqincb, 2, kBRegSize, 42);
QIncHelper<uint64_t>(config, &MacroAssembler::Uqincb, 15, kBRegSize, 999);
QIncHelper<uint64_t>(config, &MacroAssembler::Uqincb, 16, kBRegSize, big64);
}
TEST_SVE(sve_uqinch) {
uint32_t big32 = UINT32_MAX - 42;
uint64_t big64 = UINT64_MAX - 42;
QIncHelper<uint32_t>(config, &MacroAssembler::Uqinch, 1, kHRegSize, 1);
QIncHelper<uint32_t>(config, &MacroAssembler::Uqinch, 2, kHRegSize, 42);
QIncHelper<uint32_t>(config, &MacroAssembler::Uqinch, 15, kHRegSize, 999);
QIncHelper<uint32_t>(config, &MacroAssembler::Uqinch, 16, kHRegSize, big32);
QIncHelper<uint64_t>(config, &MacroAssembler::Uqinch, 1, kHRegSize, 1);
QIncHelper<uint64_t>(config, &MacroAssembler::Uqinch, 2, kHRegSize, 42);
QIncHelper<uint64_t>(config, &MacroAssembler::Uqinch, 15, kHRegSize, 999);
QIncHelper<uint64_t>(config, &MacroAssembler::Uqinch, 16, kHRegSize, big64);
}
TEST_SVE(sve_uqincw) {
uint32_t big32 = UINT32_MAX - 42;
uint64_t big64 = UINT64_MAX - 42;
QIncHelper<uint32_t>(config, &MacroAssembler::Uqincw, 1, kWRegSize, 1);
QIncHelper<uint32_t>(config, &MacroAssembler::Uqincw, 2, kWRegSize, 42);
QIncHelper<uint32_t>(config, &MacroAssembler::Uqincw, 15, kWRegSize, 999);
QIncHelper<uint32_t>(config, &MacroAssembler::Uqincw, 16, kWRegSize, big32);
QIncHelper<uint64_t>(config, &MacroAssembler::Uqincw, 1, kWRegSize, 1);
QIncHelper<uint64_t>(config, &MacroAssembler::Uqincw, 2, kWRegSize, 42);
QIncHelper<uint64_t>(config, &MacroAssembler::Uqincw, 15, kWRegSize, 999);
QIncHelper<uint64_t>(config, &MacroAssembler::Uqincw, 16, kWRegSize, big64);
}
TEST_SVE(sve_uqincd) {
uint32_t big32 = UINT32_MAX - 42;
uint64_t big64 = UINT64_MAX - 42;
QIncHelper<uint32_t>(config, &MacroAssembler::Uqincd, 1, kDRegSize, 1);
QIncHelper<uint32_t>(config, &MacroAssembler::Uqincd, 2, kDRegSize, 42);
QIncHelper<uint32_t>(config, &MacroAssembler::Uqincd, 15, kDRegSize, 999);
QIncHelper<uint32_t>(config, &MacroAssembler::Uqincd, 16, kDRegSize, big32);
QIncHelper<uint64_t>(config, &MacroAssembler::Uqincd, 1, kDRegSize, 1);
QIncHelper<uint64_t>(config, &MacroAssembler::Uqincd, 2, kDRegSize, 42);
QIncHelper<uint64_t>(config, &MacroAssembler::Uqincd, 15, kDRegSize, 999);
QIncHelper<uint64_t>(config, &MacroAssembler::Uqincd, 16, kDRegSize, big64);
}
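// The signed saturating inc/dec instructions also have an <Xd>, <Wn> form,
// which saturates the low 32 bits of the accumulator and sign-extends the
// result into the 64-bit destination.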
typedef void (MacroAssembler::*QIncDecXWFn)(const Register& dst,
const Register& src,
int pattern,
int multiplier);
static void QIncDecXWHelper(Test* config,
QIncDecXWFn cnt,
int multiplier,
int lane_size_in_bits,
int32_t acc_value,
bool is_increment) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// Initialise accumulators.
__ Mov(x0, acc_value);
__ Mov(x1, acc_value);
__ Mov(x2, acc_value);
__ Mov(x3, acc_value);
__ Mov(x4, acc_value);
__ Mov(x5, acc_value);
__ Mov(x6, acc_value);
__ Mov(x7, acc_value);
__ Mov(x8, acc_value);
__ Mov(x9, acc_value);
__ Mov(x10, acc_value);
__ Mov(x11, acc_value);
__ Mov(x12, acc_value);
__ Mov(x13, acc_value);
__ Mov(x14, acc_value);
__ Mov(x15, acc_value);
__ Mov(x18, acc_value);
__ Mov(x19, acc_value);
__ Mov(x20, acc_value);
__ Mov(x21, acc_value);
(masm.*cnt)(x0, w0, SVE_POW2, multiplier);
(masm.*cnt)(x1, w1, SVE_VL1, multiplier);
(masm.*cnt)(x2, w2, SVE_VL2, multiplier);
(masm.*cnt)(x3, w3, SVE_VL3, multiplier);
(masm.*cnt)(x4, w4, SVE_VL4, multiplier);
(masm.*cnt)(x5, w5, SVE_VL5, multiplier);
(masm.*cnt)(x6, w6, SVE_VL6, multiplier);
(masm.*cnt)(x7, w7, SVE_VL7, multiplier);
(masm.*cnt)(x8, w8, SVE_VL8, multiplier);
(masm.*cnt)(x9, w9, SVE_VL16, multiplier);
(masm.*cnt)(x10, w10, SVE_VL32, multiplier);
(masm.*cnt)(x11, w11, SVE_VL64, multiplier);
(masm.*cnt)(x12, w12, SVE_VL128, multiplier);
(masm.*cnt)(x13, w13, SVE_VL256, multiplier);
(masm.*cnt)(x14, w14, 16, multiplier);
(masm.*cnt)(x15, w15, 23, multiplier);
(masm.*cnt)(x18, w18, 28, multiplier);
(masm.*cnt)(x19, w19, SVE_MUL4, multiplier);
(masm.*cnt)(x20, w20, SVE_MUL3, multiplier);
(masm.*cnt)(x21, w21, SVE_ALL, multiplier);
END();
if (CAN_RUN()) {
RUN();
int all = core.GetSVELaneCount(lane_size_in_bits);
int pow2 = 1 << HighestSetBitPosition(all);
int mul4 = all - (all % 4);
int mul3 = all - (all % 3);
multiplier = is_increment ? multiplier : -multiplier;
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * pow2), x0);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(1, all)), x1);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(2, all)), x2);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(3, all)), x3);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(4, all)), x4);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(5, all)), x5);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(6, all)), x6);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(7, all)), x7);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(8, all)), x8);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(16, all)), x9);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(32, all)), x10);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(64, all)), x11);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(128, all)), x12);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(256, all)), x13);
ASSERT_EQUAL_64(acc_value, x14);
ASSERT_EQUAL_64(acc_value, x15);
ASSERT_EQUAL_64(acc_value, x18);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * mul4), x19);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * mul3), x20);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * all), x21);
}
}
static void QIncXWHelper(Test* config,
QIncDecXWFn cnt,
int multiplier,
int lane_size_in_bits,
int32_t acc_value) {
QIncDecXWHelper(config, cnt, multiplier, lane_size_in_bits, acc_value, true);
}
static void QDecXWHelper(Test* config,
QIncDecXWFn cnt,
int multiplier,
int lane_size_in_bits,
int32_t acc_value) {
QIncDecXWHelper(config, cnt, multiplier, lane_size_in_bits, acc_value, false);
}
TEST_SVE(sve_sqdecb_xw) {
QDecXWHelper(config, &MacroAssembler::Sqdecb, 1, kBRegSize, 1);
QDecXWHelper(config, &MacroAssembler::Sqdecb, 2, kBRegSize, INT32_MIN + 42);
QDecXWHelper(config, &MacroAssembler::Sqdecb, 15, kBRegSize, 999);
QDecXWHelper(config, &MacroAssembler::Sqdecb, 16, kBRegSize, INT32_MAX - 42);
}
TEST_SVE(sve_sqdech_xw) {
QDecXWHelper(config, &MacroAssembler::Sqdech, 1, kHRegSize, 1);
QDecXWHelper(config, &MacroAssembler::Sqdech, 2, kHRegSize, INT32_MIN + 42);
QDecXWHelper(config, &MacroAssembler::Sqdech, 15, kHRegSize, 999);
QDecXWHelper(config, &MacroAssembler::Sqdech, 16, kHRegSize, INT32_MAX - 42);
}
TEST_SVE(sve_sqdecw_xw) {
QDecXWHelper(config, &MacroAssembler::Sqdecw, 1, kWRegSize, 1);
QDecXWHelper(config, &MacroAssembler::Sqdecw, 2, kWRegSize, INT32_MIN + 42);
QDecXWHelper(config, &MacroAssembler::Sqdecw, 15, kWRegSize, 999);
QDecXWHelper(config, &MacroAssembler::Sqdecw, 16, kWRegSize, INT32_MAX - 42);
}
TEST_SVE(sve_sqdecd_xw) {
QDecXWHelper(config, &MacroAssembler::Sqdecd, 1, kDRegSize, 1);
QDecXWHelper(config, &MacroAssembler::Sqdecd, 2, kDRegSize, INT32_MIN + 42);
QDecXWHelper(config, &MacroAssembler::Sqdecd, 15, kDRegSize, 999);
QDecXWHelper(config, &MacroAssembler::Sqdecd, 16, kDRegSize, INT32_MAX - 42);
}
TEST_SVE(sve_sqincb_xw) {
QIncXWHelper(config, &MacroAssembler::Sqincb, 1, kBRegSize, 1);
QIncXWHelper(config, &MacroAssembler::Sqincb, 2, kBRegSize, INT32_MIN + 42);
QIncXWHelper(config, &MacroAssembler::Sqincb, 15, kBRegSize, 999);
QIncXWHelper(config, &MacroAssembler::Sqincb, 16, kBRegSize, INT32_MAX - 42);
}
TEST_SVE(sve_sqinch_xw) {
QIncXWHelper(config, &MacroAssembler::Sqinch, 1, kHRegSize, 1);
QIncXWHelper(config, &MacroAssembler::Sqinch, 2, kHRegSize, INT32_MIN + 42);
QIncXWHelper(config, &MacroAssembler::Sqinch, 15, kHRegSize, 999);
QIncXWHelper(config, &MacroAssembler::Sqinch, 16, kHRegSize, INT32_MAX - 42);
}
TEST_SVE(sve_sqincw_xw) {
QIncXWHelper(config, &MacroAssembler::Sqincw, 1, kWRegSize, 1);
QIncXWHelper(config, &MacroAssembler::Sqincw, 2, kWRegSize, INT32_MIN + 42);
QIncXWHelper(config, &MacroAssembler::Sqincw, 15, kWRegSize, 999);
QIncXWHelper(config, &MacroAssembler::Sqincw, 16, kWRegSize, INT32_MAX - 42);
}
TEST_SVE(sve_sqincd_xw) {
QIncXWHelper(config, &MacroAssembler::Sqincd, 1, kDRegSize, 1);
QIncXWHelper(config, &MacroAssembler::Sqincd, 2, kDRegSize, INT32_MIN + 42);
QIncXWHelper(config, &MacroAssembler::Sqincd, 15, kDRegSize, 999);
QIncXWHelper(config, &MacroAssembler::Sqincd, 16, kDRegSize, INT32_MAX - 42);
}
typedef void (MacroAssembler::*IncDecZFn)(const ZRegister& dst,
int pattern,
int multiplier);
typedef void (MacroAssembler::*AddSubFn)(const ZRegister& dst,
const ZRegister& src1,
const ZRegister& src2);
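// Cross-check the vector form of an inc/dec instruction against its scalar
// counterpart: z16-z30 are updated directly, while z0-z14 receive the same
// update via a scalar count that is broadcast and applied with a vector
// add/sub.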
static void IncDecZHelper(Test* config,
IncDecZFn fn,
CntFn cnt,
AddSubFn addsub,
int multiplier,
int lane_size_in_bits) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t acc_inputs[] = {0x7766554433221100,
0xffffffffffffffff,
0x0000000000000000,
0xffffffff0000ffff,
0x7fffffffffffffff,
0x8000000000000000,
0x7fffffff7fff7fff,
0x8000000080008000};
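// Fill every Z register with the accumulator pattern. Four passes of eight
// 64-bit values cover the 2048-bit architectural maximum vector length.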
for (unsigned i = 0; i < kNumberOfZRegisters; i++) {
for (int j = 0; j < 4; j++) {
InsrHelper(&masm, ZRegister(i, kDRegSize), acc_inputs);
}
}
for (unsigned i = 0; i < 15; i++) {
__ Mov(XRegister(i), 0);
}
(masm.*fn)(z16.WithLaneSize(lane_size_in_bits), SVE_POW2, multiplier);
(masm.*fn)(z17.WithLaneSize(lane_size_in_bits), SVE_VL1, multiplier);
(masm.*fn)(z18.WithLaneSize(lane_size_in_bits), SVE_VL2, multiplier);
(masm.*fn)(z19.WithLaneSize(lane_size_in_bits), SVE_VL3, multiplier);
(masm.*fn)(z20.WithLaneSize(lane_size_in_bits), SVE_VL4, multiplier);
(masm.*fn)(z21.WithLaneSize(lane_size_in_bits), SVE_VL7, multiplier);
(masm.*fn)(z22.WithLaneSize(lane_size_in_bits), SVE_VL8, multiplier);
(masm.*fn)(z23.WithLaneSize(lane_size_in_bits), SVE_VL16, multiplier);
(masm.*fn)(z24.WithLaneSize(lane_size_in_bits), SVE_VL64, multiplier);
(masm.*fn)(z25.WithLaneSize(lane_size_in_bits), SVE_VL256, multiplier);
(masm.*fn)(z26.WithLaneSize(lane_size_in_bits), 16, multiplier);
(masm.*fn)(z27.WithLaneSize(lane_size_in_bits), 28, multiplier);
(masm.*fn)(z28.WithLaneSize(lane_size_in_bits), SVE_MUL3, multiplier);
(masm.*fn)(z29.WithLaneSize(lane_size_in_bits), SVE_MUL4, multiplier);
(masm.*fn)(z30.WithLaneSize(lane_size_in_bits), SVE_ALL, multiplier);
// Perform computation using alternative instructions.
(masm.*cnt)(x0, SVE_POW2, multiplier);
(masm.*cnt)(x1, SVE_VL1, multiplier);
(masm.*cnt)(x2, SVE_VL2, multiplier);
(masm.*cnt)(x3, SVE_VL3, multiplier);
(masm.*cnt)(x4, SVE_VL4, multiplier);
(masm.*cnt)(x5, SVE_VL7, multiplier);
(masm.*cnt)(x6, SVE_VL8, multiplier);
(masm.*cnt)(x7, SVE_VL16, multiplier);
(masm.*cnt)(x8, SVE_VL64, multiplier);
(masm.*cnt)(x9, SVE_VL256, multiplier);
(masm.*cnt)(x10, 16, multiplier);
(masm.*cnt)(x11, 28, multiplier);
(masm.*cnt)(x12, SVE_MUL3, multiplier);
(masm.*cnt)(x13, SVE_MUL4, multiplier);
(masm.*cnt)(x14, SVE_ALL, multiplier);
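// Broadcast each scalar count and apply it to the corresponding accumulator
// using the equivalent (possibly saturating) vector add or sub.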
ZRegister zscratch = z15.WithLaneSize(lane_size_in_bits);
for (unsigned i = 0; i < 15; i++) {
ZRegister zsrcdst = ZRegister(i, lane_size_in_bits);
Register x = Register(i, kXRegSize);
__ Dup(zscratch, x);
(masm.*addsub)(zsrcdst, zsrcdst, zscratch);
}
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z0, z16);
ASSERT_EQUAL_SVE(z1, z17);
ASSERT_EQUAL_SVE(z2, z18);
ASSERT_EQUAL_SVE(z3, z19);
ASSERT_EQUAL_SVE(z4, z20);
ASSERT_EQUAL_SVE(z5, z21);
ASSERT_EQUAL_SVE(z6, z22);
ASSERT_EQUAL_SVE(z7, z23);
ASSERT_EQUAL_SVE(z8, z24);
ASSERT_EQUAL_SVE(z9, z25);
ASSERT_EQUAL_SVE(z10, z26);
ASSERT_EQUAL_SVE(z11, z27);
ASSERT_EQUAL_SVE(z12, z28);
ASSERT_EQUAL_SVE(z13, z29);
ASSERT_EQUAL_SVE(z14, z30);
}
}
TEST_SVE(sve_inc_dec_vec) {
CntFn cnth = &MacroAssembler::Cnth;
CntFn cntw = &MacroAssembler::Cntw;
CntFn cntd = &MacroAssembler::Cntd;
AddSubFn sub = &MacroAssembler::Sub;
AddSubFn add = &MacroAssembler::Add;
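// Multipliers 1, 6, 11 and 16 sample the encodable range [1, 16].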
for (int mult = 1; mult <= 16; mult += 5) {
IncDecZHelper(config, &MacroAssembler::Dech, cnth, sub, mult, kHRegSize);
IncDecZHelper(config, &MacroAssembler::Decw, cntw, sub, mult, kSRegSize);
IncDecZHelper(config, &MacroAssembler::Decd, cntd, sub, mult, kDRegSize);
IncDecZHelper(config, &MacroAssembler::Inch, cnth, add, mult, kHRegSize);
IncDecZHelper(config, &MacroAssembler::Incw, cntw, add, mult, kSRegSize);
IncDecZHelper(config, &MacroAssembler::Incd, cntd, add, mult, kDRegSize);
}
}
TEST_SVE(sve_unsigned_sat_inc_dec_vec) {
CntFn cnth = &MacroAssembler::Cnth;
CntFn cntw = &MacroAssembler::Cntw;
CntFn cntd = &MacroAssembler::Cntd;
AddSubFn sub = &MacroAssembler::Uqsub;
AddSubFn add = &MacroAssembler::Uqadd;
for (int mult = 1; mult <= 16; mult += 5) {
IncDecZHelper(config, &MacroAssembler::Uqdech, cnth, sub, mult, kHRegSize);
IncDecZHelper(config, &MacroAssembler::Uqdecw, cntw, sub, mult, kSRegSize);
IncDecZHelper(config, &MacroAssembler::Uqdecd, cntd, sub, mult, kDRegSize);
IncDecZHelper(config, &MacroAssembler::Uqinch, cnth, add, mult, kHRegSize);
IncDecZHelper(config, &MacroAssembler::Uqincw, cntw, add, mult, kSRegSize);
IncDecZHelper(config, &MacroAssembler::Uqincd, cntd, add, mult, kDRegSize);
}
}
TEST_SVE(sve_signed_sat_inc_dec_vec) {
CntFn cnth = &MacroAssembler::Cnth;
CntFn cntw = &MacroAssembler::Cntw;
CntFn cntd = &MacroAssembler::Cntd;
AddSubFn sub = &MacroAssembler::Sqsub;
AddSubFn add = &MacroAssembler::Sqadd;
for (int mult = 1; mult <= 16; mult += 5) {
IncDecZHelper(config, &MacroAssembler::Sqdech, cnth, sub, mult, kHRegSize);
IncDecZHelper(config, &MacroAssembler::Sqdecw, cntw, sub, mult, kSRegSize);
IncDecZHelper(config, &MacroAssembler::Sqdecd, cntd, sub, mult, kDRegSize);
IncDecZHelper(config, &MacroAssembler::Sqinch, cnth, add, mult, kHRegSize);
IncDecZHelper(config, &MacroAssembler::Sqincw, cntw, add, mult, kSRegSize);
IncDecZHelper(config, &MacroAssembler::Sqincd, cntd, add, mult, kDRegSize);
}
}
typedef void (MacroAssembler::*ArithPredicatedFn)(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& zn,
const ZRegister& zm);
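// Test a predicated binary arithmetic macro in its three operand-aliasing
// forms: zd aliasing zn, zd aliasing zm, and zd distinct from both sources.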
template <typename Td, typename Tg, typename Tn>
static void IntBinArithHelper(Test* config,
ArithPredicatedFn macro,
unsigned lane_size_in_bits,
const Tg& pg_inputs,
const Tn& zn_inputs,
const Tn& zm_inputs,
const Td& zd_expected) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
ZRegister src_a = z30.WithLaneSize(lane_size_in_bits);
ZRegister src_b = z27.WithLaneSize(lane_size_in_bits);
InsrHelper(&masm, src_a, zn_inputs);
InsrHelper(&masm, src_b, zm_inputs);
Initialise(&masm, p0.WithLaneSize(lane_size_in_bits), pg_inputs);
ZRegister zd_1 = z0.WithLaneSize(lane_size_in_bits);
ZRegister zd_2 = z1.WithLaneSize(lane_size_in_bits);
ZRegister zd_3 = z2.WithLaneSize(lane_size_in_bits);
// `instr` zd(dst), zd(src_a), zm(src_b)
__ Mov(zd_1, src_a);
(masm.*macro)(zd_1, p0.Merging(), zd_1, src_b);
// `instr` zd(dst), zn(src_a), zd(src_b)
// If zd aliases zm, the instruction macro (`Instr`) swaps the operand order
// when the operation is commutative; otherwise it emits the reversed form of
// the instruction, such as subr or divr.
__ Mov(zd_2, src_b);
(masm.*macro)(zd_2, p0.Merging(), src_a, zd_2);
// `instr` zd(dst), zn(src_a), zm(src_b)
// The instruction macro (`Instr`) automatically selects between `instr` and
// movprfx + `instr`, based on whether the zd and zn registers are aliased.
// Any generated movprfx is predicated, using the same governing predicate
// register, so initialise the destination register first to keep the result
// deterministic.
__ Mov(zd_3, src_a);
(masm.*macro)(zd_3, p0.Merging(), src_a, src_b);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(zd_expected, zd_1);
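// zd_2 was initialised from src_b, so its inactive lanes should still hold
// the corresponding zm_inputs values.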
for (size_t i = 0; i < ArrayLength(zd_expected); i++) {
int lane = static_cast<int>(ArrayLength(zd_expected) - i - 1);
if (!core.HasSVELane(zd_2, lane)) break;
if ((pg_inputs[i] & 1) != 0) {
ASSERT_EQUAL_SVE_LANE(zd_expected[i], zd_2, lane);
} else {
ASSERT_EQUAL_SVE_LANE(zm_inputs[i], zd_2, lane);
}
}
ASSERT_EQUAL_SVE(zd_expected, zd_3);
}
}
TEST_SVE(sve_binary_arithmetic_predicated_add) {
// clang-format off
unsigned zn_b[] = {0x00, 0x01, 0x10, 0x81, 0xff, 0x0f, 0x01, 0x7f};
unsigned zm_b[] = {0x00, 0x01, 0x10, 0x00, 0x81, 0x80, 0xff, 0xff};
unsigned zn_h[] = {0x0000, 0x0123, 0x1010, 0x8181, 0xffff, 0x0f0f, 0x0101, 0x7f7f};
unsigned zm_h[] = {0x0000, 0x0123, 0x1010, 0x0000, 0x8181, 0x8080, 0xffff, 0xffff};
unsigned zn_s[] = {0x00000000, 0x01234567, 0x10101010, 0x81818181,
0xffffffff, 0x0f0f0f0f, 0x01010101, 0x7f7f7f7f};
unsigned zm_s[] = {0x00000000, 0x01234567, 0x10101010, 0x00000000,
0x81818181, 0x80808080, 0xffffffff, 0xffffffff};
uint64_t zn_d[] = {0x0000000000000000, 0x0123456789abcdef,
0x1010101010101010, 0x8181818181818181,
0xffffffffffffffff, 0x0f0f0f0f0f0f0f0f,
0x0101010101010101, 0x7f7f7f7fffffffff};
uint64_t zm_d[] = {0x0000000000000000, 0x0123456789abcdef,
0x1010101010101010, 0x0000000000000000,
0x8181818181818181, 0x8080808080808080,
0xffffffffffffffff, 0xffffffffffffffff};
int pg_b[] = {1, 1, 1, 0, 1, 1, 1, 0};
int pg_h[] = {1, 1, 0, 1, 1, 1, 0, 1};
int pg_s[] = {1, 0, 1, 1, 1, 0, 1, 1};
int pg_d[] = {0, 1, 1, 1, 0, 1, 1, 1};
unsigned add_exp_b[] = {0x00, 0x02, 0x20, 0x81, 0x80, 0x8f, 0x00, 0x7f};
unsigned add_exp_h[] = {0x0000, 0x0246, 0x1010, 0x8181,
0x8180, 0x8f8f, 0x0101, 0x7f7e};
unsigned add_exp_s[] = {0x00000000, 0x01234567, 0x20202020, 0x81818181,
0x81818180, 0x0f0f0f0f, 0x01010100, 0x7f7f7f7e};
uint64_t add_exp_d[] = {0x0000000000000000, 0x02468acf13579bde,
0x2020202020202020, 0x8181818181818181,
0xffffffffffffffff, 0x8f8f8f8f8f8f8f8f,
0x0101010101010100, 0x7f7f7f7ffffffffe};
ArithPredicatedFn fn = &MacroAssembler::Add;
IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, add_exp_b);
IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, add_exp_h);
IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, add_exp_s);
IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, add_exp_d);
unsigned sub_exp_b[] = {0x00, 0x00, 0x00, 0x81, 0x7e, 0x8f, 0x02, 0x7f};
unsigned sub_exp_h[] = {0x0000, 0x0000, 0x1010, 0x8181,
0x7e7e, 0x8e8f, 0x0101, 0x7f80};
unsigned sub_exp_s[] = {0x00000000, 0x01234567, 0x00000000, 0x81818181,
0x7e7e7e7e, 0x0f0f0f0f, 0x01010102, 0x7f7f7f80};
uint64_t sub_exp_d[] = {0x0000000000000000, 0x0000000000000000,
0x0000000000000000, 0x8181818181818181,
0xffffffffffffffff, 0x8e8e8e8e8e8e8e8f,
0x0101010101010102, 0x7f7f7f8000000000};
fn = &MacroAssembler::Sub;
IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, sub_exp_b);
IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, sub_exp_h);
IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, sub_exp_s);
IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, sub_exp_d);
// clang-format on
}
TEST_SVE(sve_binary_arithmetic_predicated_umin_umax_uabd) {
// clang-format off
unsigned zn_b[] = {0x00, 0xff, 0x0f, 0xff, 0xf0, 0x98, 0x55, 0x67};
unsigned zm_b[] = {0x01, 0x00, 0x0e, 0xfe, 0xfe, 0xab, 0xcd, 0x78};
unsigned zn_h[] = {0x0000, 0xffff, 0x00ff, 0xffff,
0xff00, 0xba98, 0x5555, 0x4567};
unsigned zm_h[] = {0x0001, 0x0000, 0x00ee, 0xfffe,
0xfe00, 0xabab, 0xcdcd, 0x5678};
unsigned zn_s[] = {0x00000000, 0xffffffff, 0x0000ffff, 0xffffffff,
0xffff0000, 0xfedcba98, 0x55555555, 0x01234567};
unsigned zm_s[] = {0x00000001, 0x00000000, 0x0000eeee, 0xfffffffe,
0xfffe0000, 0xabababab, 0xcdcdcdcd, 0x12345678};
uint64_t zn_d[] = {0x0000000000000000, 0xffffffffffffffff,
0x5555555555555555, 0x0000000001234567};
uint64_t zm_d[] = {0x0000000000000001, 0x0000000000000000,
0xcdcdcdcdcdcdcdcd, 0x0000000012345678};
int pg_b[] = {1, 1, 1, 0, 1, 1, 1, 0};
int pg_h[] = {1, 1, 0, 1, 1, 1, 0, 1};
int pg_s[] = {1, 0, 1, 1, 1, 0, 1, 1};
int pg_d[] = {1, 0, 1, 1};
unsigned umax_exp_b[] = {0x01, 0xff, 0x0f, 0xff, 0xfe, 0xab, 0xcd, 0x67};
unsigned umax_exp_h[] = {0x0001, 0xffff, 0x00ff, 0xffff,
0xff00, 0xba98, 0x5555, 0x5678};
unsigned umax_exp_s[] = {0x00000001, 0xffffffff, 0x0000ffff, 0xffffffff,
0xffff0000, 0xfedcba98, 0xcdcdcdcd, 0x12345678};
uint64_t umax_exp_d[] = {0x0000000000000001, 0xffffffffffffffff,
0xcdcdcdcdcdcdcdcd, 0x0000000012345678};
ArithPredicatedFn fn = &MacroAssembler::Umax;
IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, umax_exp_b);
IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, umax_exp_h);
IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, umax_exp_s);
IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, umax_exp_d);
unsigned umin_exp_b[] = {0x00, 0x00, 0x0e, 0xff, 0xf0, 0x98, 0x55, 0x67};
unsigned umin_exp_h[] = {0x0000, 0x0000, 0x00ff, 0xfffe,
0xfe00, 0xabab, 0x5555, 0x4567};
unsigned umin_exp_s[] = {0x00000000, 0xffffffff, 0x0000eeee, 0xfffffffe,
0xfffe0000, 0xfedcba98, 0x55555555, 0x01234567};
uint64_t umin_exp_d[] = {0x0000000000000000, 0xffffffffffffffff,
0x5555555555555555, 0x0000000001234567};
fn = &MacroAssembler::Umin;
IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, umin_exp_b);
IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, umin_exp_h);
IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, umin_exp_s);
IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, umin_exp_d);
unsigned uabd_exp_b[] = {0x01, 0xff, 0x01, 0xff, 0x0e, 0x13, 0x78, 0x67};
unsigned uabd_exp_h[] = {0x0001, 0xffff, 0x00ff, 0x0001,
0x0100, 0x0eed, 0x5555, 0x1111};
unsigned uabd_exp_s[] = {0x00000001, 0xffffffff, 0x00001111, 0x00000001,
0x00010000, 0xfedcba98, 0x78787878, 0x11111111};
uint64_t uabd_exp_d[] = {0x0000000000000001, 0xffffffffffffffff,
0x7878787878787878, 0x0000000011111111};
fn = &MacroAssembler::Uabd;
IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, uabd_exp_b);
IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, uabd_exp_h);
IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, uabd_exp_s);
IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, uabd_exp_d);
// clang-format on
}
TEST_SVE(sve_binary_arithmetic_predicated_smin_smax_sabd) {
// clang-format off
int zn_b[] = {0, -128, -128, -128, -128, 127, 127, 1};
int zm_b[] = {-1, 0, -1, -127, 127, 126, -1, 0};
int zn_h[] = {0, INT16_MIN, INT16_MIN, INT16_MIN,
INT16_MIN, INT16_MAX, INT16_MAX, 1};
int zm_h[] = {-1, 0, -1, INT16_MIN + 1,
INT16_MAX, INT16_MAX - 1, -1, 0};
int zn_s[] = {0, INT32_MIN, INT32_MIN, INT32_MIN,
INT32_MIN, INT32_MAX, INT32_MAX, 1};
int zm_s[] = {-1, 0, -1, -INT32_MAX,
INT32_MAX, INT32_MAX - 1, -1, 0};
int64_t zn_d[] = {0, INT64_MIN, INT64_MIN, INT64_MIN,
INT64_MIN, INT64_MAX, INT64_MAX, 1};
int64_t zm_d[] = {-1, 0, -1, INT64_MIN + 1,
INT64_MAX, INT64_MAX - 1, -1, 0};
int pg_b[] = {1, 1, 1, 0, 1, 1, 1, 0};
int pg_h[] = {1, 1, 0, 1, 1, 1, 0, 1};
int pg_s[] = {1, 0, 1, 1, 1, 0, 1, 1};
int pg_d[] = {0, 1, 1, 1, 0, 1, 1, 1};
int smax_exp_b[] = {0, 0, -1, -128, 127, 127, 127, 1};
int smax_exp_h[] = {0, 0, INT16_MIN, INT16_MIN + 1,
INT16_MAX, INT16_MAX, INT16_MAX, 1};
int smax_exp_s[] = {0, INT32_MIN, -1, INT32_MIN + 1,
INT32_MAX, INT32_MAX, INT32_MAX, 1};
int64_t smax_exp_d[] = {0, 0, -1, INT64_MIN + 1,
INT64_MIN, INT64_MAX, INT64_MAX, 1};
ArithPredicatedFn fn = &MacroAssembler::Smax;
IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, smax_exp_b);
IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, smax_exp_h);
IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, smax_exp_s);
IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, smax_exp_d);
int smin_exp_b[] = {-1, -128, -128, -128, -128, 126, -1, 1};
int smin_exp_h[] = {-1, INT16_MIN, INT16_MIN, INT16_MIN,
INT16_MIN, INT16_MAX - 1, INT16_MAX, 0};
int smin_exp_s[] = {-1, INT32_MIN, INT32_MIN, INT32_MIN,
INT32_MIN, INT32_MAX, -1, 0};
int64_t smin_exp_d[] = {0, INT64_MIN, INT64_MIN, INT64_MIN,
INT64_MIN, INT64_MAX - 1, -1, 0};
fn = &MacroAssembler::Smin;
IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, smin_exp_b);
IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, smin_exp_h);
IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, smin_exp_s);
IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, smin_exp_d);
unsigned sabd_exp_b[] = {1, 128, 127, 128, 255, 1, 128, 1};
unsigned sabd_exp_h[] = {1, 0x8000, 0x8000, 1, 0xffff, 1, 0x7fff, 1};
unsigned sabd_exp_s[] = {1, 0x80000000, 0x7fffffff, 1,
0xffffffff, 0x7fffffff, 0x80000000, 1};
uint64_t sabd_exp_d[] = {0, 0x8000000000000000, 0x7fffffffffffffff, 1,
0x8000000000000000, 1, 0x8000000000000000, 1};
fn = &MacroAssembler::Sabd;
IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, sabd_exp_b);
IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, sabd_exp_h);
IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, sabd_exp_s);
IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, sabd_exp_d);
// clang-format on
}
TEST_SVE(sve_binary_arithmetic_predicated_mul_umulh) {
// clang-format off
unsigned zn_b[] = {0x00, 0x01, 0x20, 0x08, 0x80, 0xff, 0x55, 0xaa};
unsigned zm_b[] = {0x7f, 0xcd, 0x80, 0xff, 0x55, 0xaa, 0x00, 0x08};
unsigned zn_h[] = {0x0000, 0x0001, 0x0020, 0x0800,
0x8000, 0xff00, 0x5555, 0xaaaa};
unsigned zm_h[] = {0x007f, 0x00cd, 0x0800, 0xffff,
0x5555, 0xaaaa, 0x0001, 0x1234};
unsigned zn_s[] = {0x00000000, 0x00000001, 0x00200020, 0x08000800,
0x12345678, 0xffffffff, 0x55555555, 0xaaaaaaaa};
unsigned zm_s[] = {0x00000000, 0x00000001, 0x00200020, 0x08000800,
0x12345678, 0x22223333, 0x55556666, 0x77778888};
uint64_t zn_d[] = {0x0000000000000000, 0x5555555555555555,
0xffffffffffffffff, 0xaaaaaaaaaaaaaaaa};
uint64_t zm_d[] = {0x0000000000000000, 0x1111111133333333,
0xddddddddeeeeeeee, 0xaaaaaaaaaaaaaaaa};
int pg_b[] = {0, 1, 1, 1, 0, 1, 1, 1};
int pg_h[] = {1, 0, 1, 1, 1, 0, 1, 1};
int pg_s[] = {1, 1, 0, 1, 1, 1, 0, 1};
int pg_d[] = {1, 1, 0, 1};
unsigned mul_exp_b[] = {0x00, 0xcd, 0x00, 0xf8, 0x80, 0x56, 0x00, 0x50};
unsigned mul_exp_h[] = {0x0000, 0x0001, 0x0000, 0xf800,
0x8000, 0xff00, 0x5555, 0x9e88};
unsigned mul_exp_s[] = {0x00000000, 0x00000001, 0x00200020, 0x00400000,
0x1df4d840, 0xddddcccd, 0x55555555, 0xb05afa50};
uint64_t mul_exp_d[] = {0x0000000000000000, 0xa4fa4fa4eeeeeeef,
0xffffffffffffffff, 0x38e38e38e38e38e4};
ArithPredicatedFn fn = &MacroAssembler::Mul;
IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, mul_exp_b);
IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, mul_exp_h);
IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, mul_exp_s);
IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, mul_exp_d);
unsigned umulh_exp_b[] = {0x00, 0x00, 0x10, 0x07, 0x80, 0xa9, 0x00, 0x05};
unsigned umulh_exp_h[] = {0x0000, 0x0001, 0x0001, 0x07ff,
0x2aaa, 0xff00, 0x0000, 0x0c22};
unsigned umulh_exp_s[] = {0x00000000, 0x00000000, 0x00200020, 0x00400080,
0x014b66dc, 0x22223332, 0x55555555, 0x4fa505af};
uint64_t umulh_exp_d[] = {0x0000000000000000, 0x05b05b05bbbbbbbb,
0xffffffffffffffff, 0x71c71c71c71c71c6};
fn = &MacroAssembler::Umulh;
IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, umulh_exp_b);
IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, umulh_exp_h);
IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, umulh_exp_s);
IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, umulh_exp_d);
// clang-format on
}
TEST_SVE(sve_binary_arithmetic_predicated_smulh) {
// clang-format off
int zn_b[] = {0, 1, -1, INT8_MIN, INT8_MAX, -1, 100, -3};
int zm_b[] = {0, INT8_MIN, INT8_MIN, INT8_MAX, INT8_MAX, -1, 2, 66};
int zn_h[] = {0, 1, -1, INT16_MIN, INT16_MAX, -1, 10000, -3};
int zm_h[] = {0, INT16_MIN, INT16_MIN, INT16_MAX, INT16_MAX, -1, 2, 6666};
int zn_s[] = {0, 1, -1, INT32_MIN, INT32_MAX, -1, 100000000, -3};
int zm_s[] = {0, INT32_MIN, INT32_MIN, INT32_MAX, INT32_MAX, -1, 2, 66666666};
int64_t zn_d[] = {0, -1, INT64_MIN, INT64_MAX};
int64_t zm_d[] = {INT64_MIN, INT64_MAX, INT64_MIN, INT64_MAX};
int pg_b[] = {0, 1, 1, 1, 0, 1, 1, 1};
int pg_h[] = {1, 0, 1, 1, 1, 0, 1, 1};
int pg_s[] = {1, 1, 0, 1, 1, 1, 0, 1};
int pg_d[] = {1, 1, 0, 1};
int exp_b[] = {0, -1, 0, -64, INT8_MAX, 0, 0, -1};
int exp_h[] = {0, 1, 0, -16384, 16383, -1, 0, -1};
int exp_s[] = {0, -1, -1, -1073741824, 1073741823, 0, 100000000, -1};
int64_t exp_d[] = {0, -1, INT64_MIN, 4611686018427387903};
ArithPredicatedFn fn = &MacroAssembler::Smulh;
IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, exp_b);
IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, exp_h);
IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, exp_s);
IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, exp_d);
// clang-format on
}
TEST_SVE(sve_binary_arithmetic_predicated_logical) {
// clang-format off
unsigned zn_b[] = {0x00, 0x01, 0x20, 0x08, 0x80, 0xff, 0x55, 0xaa};
unsigned zm_b[] = {0x7f, 0xcd, 0x80, 0xff, 0x55, 0xaa, 0x00, 0x08};
unsigned zn_h[] = {0x0000, 0x0001, 0x2020, 0x0008,
0x8000, 0xffff, 0x5555, 0xaaaa};
unsigned zm_h[] = {0x7fff, 0xabcd, 0x8000, 0xffff,
0x5555, 0xaaaa, 0x0000, 0x0800};
unsigned zn_s[] = {0x00000001, 0x20200008, 0x8000ffff, 0x5555aaaa};
unsigned zm_s[] = {0x7fffabcd, 0x8000ffff, 0x5555aaaa, 0x00000800};
uint64_t zn_d[] = {0xfedcba9876543210, 0x0123456789abcdef,
0x0001200880ff55aa, 0x0022446688aaccee};
uint64_t zm_d[] = {0xffffeeeeddddcccc, 0xccccddddeeeeffff,
0x7fcd80ff55aa0008, 0x1133557799bbddff};
int pg_b[] = {0, 1, 1, 1, 0, 1, 1, 1};
int pg_h[] = {1, 0, 1, 1, 1, 0, 1, 1};
int pg_s[] = {1, 1, 1, 0};
int pg_d[] = {1, 1, 0, 1};
unsigned and_exp_b[] = {0x00, 0x01, 0x00, 0x08, 0x80, 0xaa, 0x00, 0x08};
unsigned and_exp_h[] = {0x0000, 0x0001, 0x0000, 0x0008,
0x0000, 0xffff, 0x0000, 0x0800};
unsigned and_exp_s[] = {0x00000001, 0x00000008, 0x0000aaaa, 0x5555aaaa};
uint64_t and_exp_d[] = {0xfedcaa8854540000, 0x0000454588aacdef,
0x0001200880ff55aa, 0x0022446688aaccee};
ArithPredicatedFn fn = &MacroAssembler::And;
IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, and_exp_b);
IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, and_exp_h);
IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, and_exp_s);
IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, and_exp_d);
unsigned bic_exp_b[] = {0x00, 0x00, 0x20, 0x00, 0x80, 0x55, 0x55, 0xa2};
unsigned bic_exp_h[] = {0x0000, 0x0001, 0x2020, 0x0000,
0x8000, 0xffff, 0x5555, 0xa2aa};
unsigned bic_exp_s[] = {0x00000000, 0x20200000, 0x80005555, 0x5555aaaa};
uint64_t bic_exp_d[] = {0x0000101022003210, 0x0123002201010000,
0x0001200880ff55aa, 0x0000000000000000};
fn = &MacroAssembler::Bic;
IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, bic_exp_b);
IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, bic_exp_h);
IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, bic_exp_s);
IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, bic_exp_d);
unsigned eor_exp_b[] = {0x00, 0xcc, 0xa0, 0xf7, 0x80, 0x55, 0x55, 0xa2};
unsigned eor_exp_h[] = {0x7fff, 0x0001, 0xa020, 0xfff7,
0xd555, 0xffff, 0x5555, 0xa2aa};
unsigned eor_exp_s[] = {0x7fffabcc, 0xa020fff7, 0xd5555555, 0x5555aaaa};
uint64_t eor_exp_d[] = {0x01235476ab89fedc, 0xcdef98ba67453210,
0x0001200880ff55aa, 0x1111111111111111};
fn = &MacroAssembler::Eor;
IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, eor_exp_b);
IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, eor_exp_h);
IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, eor_exp_s);
IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, eor_exp_d);
unsigned orr_exp_b[] = {0x00, 0xcd, 0xa0, 0xff, 0x80, 0xff, 0x55, 0xaa};
unsigned orr_exp_h[] = {0x7fff, 0x0001, 0xa020, 0xffff,
0xd555, 0xffff, 0x5555, 0xaaaa};
unsigned orr_exp_s[] = {0x7fffabcd, 0xa020ffff, 0xd555ffff, 0x5555aaaa};
uint64_t orr_exp_d[] = {0xfffffefeffddfedc, 0xcdefddffefefffff,
0x0001200880ff55aa, 0x1133557799bbddff};
fn = &MacroAssembler::Orr;
IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, orr_exp_b);
IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, orr_exp_h);
IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, orr_exp_s);
IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, orr_exp_d);
// clang-format on
}
TEST_SVE(sve_binary_arithmetic_predicated_sdiv) {
// clang-format off
int zn_s[] = {0, 1, -1, 2468,
INT32_MIN, INT32_MAX, INT32_MIN, INT32_MAX,
-11111111, 87654321, 0, 0};
int zm_s[] = {1, -1, 1, 1234,
-1, INT32_MIN, 1, -1,
22222222, 80000000, -1, 0};
int64_t zn_d[] = {0, 1, -1, 2468,
INT64_MIN, INT64_MAX, INT64_MIN, INT64_MAX,
-11111111, 87654321, 0, 0};
int64_t zm_d[] = {1, -1, 1, 1234,
-1, INT64_MIN, 1, -1,
22222222, 80000000, -1, 0};
int pg_s[] = {1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0};
int pg_d[] = {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1};
int exp_s[] = {0, 1, -1, 2,
INT32_MIN, 0, INT32_MIN, -INT32_MAX,
0, 1, 0, 0};
int64_t exp_d[] = {0, -1, -1, 2,
INT64_MIN, INT64_MAX, INT64_MIN, -INT64_MAX,
0, 1, 0, 0};
ArithPredicatedFn fn = &MacroAssembler::Sdiv;
IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, exp_s);
IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, exp_d);
// clang-format on
}
TEST_SVE(sve_binary_arithmetic_predicated_udiv) {
// clang-format off
unsigned zn_s[] = {0x00000000, 0x00000001, 0xffffffff, 0x80000000,
0xffffffff, 0x80000000, 0xffffffff, 0x0000f000};
unsigned zm_s[] = {0x00000001, 0xffffffff, 0x80000000, 0x00000002,
0x00000000, 0x00000001, 0x00008000, 0xf0000000};
uint64_t zn_d[] = {0x0000000000000000, 0x0000000000000001,
0xffffffffffffffff, 0x8000000000000000,
0xffffffffffffffff, 0x8000000000000000,
0xffffffffffffffff, 0xf0000000f0000000};
uint64_t zm_d[] = {0x0000000000000001, 0xffffffff00000000,
0x8000000000000000, 0x0000000000000002,
0x8888888888888888, 0x0000000000000001,
0x0000000080000000, 0x00000000f0000000};
int pg_s[] = {1, 1, 0, 1, 1, 0, 1, 1};
int pg_d[] = {1, 0, 1, 1, 1, 1, 0, 1};
unsigned exp_s[] = {0x00000000, 0x00000000, 0xffffffff, 0x40000000,
0x00000000, 0x80000000, 0x0001ffff, 0x00000000};
uint64_t exp_d[] = {0x0000000000000000, 0x0000000000000001,
0x0000000000000001, 0x4000000000000000,
0x0000000000000001, 0x8000000000000000,
0xffffffffffffffff, 0x0000000100000001};
ArithPredicatedFn fn = &MacroAssembler::Udiv;
IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, exp_s);
IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, exp_d);
// clang-format on
}
typedef void (MacroAssembler::*ArithFn)(const ZRegister& zd,
const ZRegister& zn,
const ZRegister& zm);
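// Test an unpredicated binary arithmetic macro against per-lane expected
// values.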
template <typename T>
static void IntArithHelper(Test* config,
ArithFn macro,
unsigned lane_size_in_bits,
const T& zn_inputs,
const T& zm_inputs,
const T& zd_expected) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
ZRegister zn = z31.WithLaneSize(lane_size_in_bits);
ZRegister zm = z27.WithLaneSize(lane_size_in_bits);
InsrHelper(&masm, zn, zn_inputs);
InsrHelper(&masm, zm, zm_inputs);
ZRegister zd = z0.WithLaneSize(lane_size_in_bits);
(masm.*macro)(zd, zn, zm);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(zd_expected, zd);
}
}
TEST_SVE(sve_arithmetic_unpredicated_add_sqadd_uqadd) {
// clang-format off
unsigned in_b[] = {0x81, 0x7f, 0x10, 0xaa, 0x55, 0xff, 0xf0};
unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa, 0x5555, 0xffff, 0xf0f0};
unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0x10001010, 0xaaaaaaaa, 0xf000f0f0};
uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f,
0x1000000010001010, 0xf0000000f000f0f0};
ArithFn fn = &MacroAssembler::Add;
unsigned add_exp_b[] = {0x02, 0xfe, 0x20, 0x54, 0xaa, 0xfe, 0xe0};
unsigned add_exp_h[] = {0x0302, 0xfefe, 0x2020, 0x5554, 0xaaaa, 0xfffe, 0xe1e0};
unsigned add_exp_s[] = {0x00030302, 0xfffefefe, 0x20002020, 0x55555554, 0xe001e1e0};
uint64_t add_exp_d[] = {0x0000000300030302, 0xfffffffefffefefe,
0x2000000020002020, 0xe0000001e001e1e0};
IntArithHelper(config, fn, kBRegSize, in_b, in_b, add_exp_b);
IntArithHelper(config, fn, kHRegSize, in_h, in_h, add_exp_h);
IntArithHelper(config, fn, kSRegSize, in_s, in_s, add_exp_s);
IntArithHelper(config, fn, kDRegSize, in_d, in_d, add_exp_d);
fn = &MacroAssembler::Sqadd;
unsigned sqadd_exp_b[] = {0x80, 0x7f, 0x20, 0x80, 0x7f, 0xfe, 0xe0};
unsigned sqadd_exp_h[] = {0x8000, 0x7fff, 0x2020, 0x8000, 0x7fff, 0xfffe, 0xe1e0};
unsigned sqadd_exp_s[] = {0x80000000, 0x7fffffff, 0x20002020, 0x80000000, 0xe001e1e0};
uint64_t sqadd_exp_d[] = {0x8000000000000000, 0x7fffffffffffffff,
0x2000000020002020, 0xe0000001e001e1e0};
IntArithHelper(config, fn, kBRegSize, in_b, in_b, sqadd_exp_b);
IntArithHelper(config, fn, kHRegSize, in_h, in_h, sqadd_exp_h);
IntArithHelper(config, fn, kSRegSize, in_s, in_s, sqadd_exp_s);
IntArithHelper(config, fn, kDRegSize, in_d, in_d, sqadd_exp_d);
fn = &MacroAssembler::Uqadd;
unsigned uqadd_exp_b[] = {0xff, 0xfe, 0x20, 0xff, 0xaa, 0xff, 0xff};
unsigned uqadd_exp_h[] = {0xffff, 0xfefe, 0x2020, 0xffff, 0xaaaa, 0xffff, 0xffff};
unsigned uqadd_exp_s[] = {0xffffffff, 0xfffefefe, 0x20002020, 0xffffffff, 0xffffffff};
uint64_t uqadd_exp_d[] = {0xffffffffffffffff, 0xfffffffefffefefe,
0x2000000020002020, 0xffffffffffffffff};
IntArithHelper(config, fn, kBRegSize, in_b, in_b, uqadd_exp_b);
IntArithHelper(config, fn, kHRegSize, in_h, in_h, uqadd_exp_h);
IntArithHelper(config, fn, kSRegSize, in_s, in_s, uqadd_exp_s);
IntArithHelper(config, fn, kDRegSize, in_d, in_d, uqadd_exp_d);
// clang-format on
}
TEST_SVE(sve_arithmetic_unpredicated_sub_sqsub_uqsub) {
// clang-format off
unsigned ins1_b[] = {0x81, 0x7f, 0x7e, 0xaa};
unsigned ins2_b[] = {0x10, 0xf0, 0xf0, 0x55};
unsigned ins1_h[] = {0x8181, 0x7f7f, 0x7e7e, 0xaaaa};
unsigned ins2_h[] = {0x1010, 0xf0f0, 0xf0f0, 0x5555};
unsigned ins1_s[] = {0x80018181, 0x7fff7f7f, 0x7eee7e7e, 0xaaaaaaaa};
unsigned ins2_s[] = {0x10001010, 0xf000f0f0, 0xf000f0f0, 0x55555555};
uint64_t ins1_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f,
0x7eeeeeee7eee7e7e, 0xaaaaaaaaaaaaaaaa};
uint64_t ins2_d[] = {0x1000000010001010, 0xf0000000f000f0f0,
0xf0000000f000f0f0, 0x5555555555555555};
ArithFn fn = &MacroAssembler::Sub;
unsigned ins1_sub_ins2_exp_b[] = {0x71, 0x8f, 0x8e, 0x55};
unsigned ins1_sub_ins2_exp_h[] = {0x7171, 0x8e8f, 0x8d8e, 0x5555};
unsigned ins1_sub_ins2_exp_s[] = {0x70017171, 0x8ffe8e8f, 0x8eed8d8e, 0x55555555};
uint64_t ins1_sub_ins2_exp_d[] = {0x7000000170017171, 0x8ffffffe8ffe8e8f,
0x8eeeeeed8eed8d8e, 0x5555555555555555};
IntArithHelper(config, fn, kBRegSize, ins1_b, ins2_b, ins1_sub_ins2_exp_b);
IntArithHelper(config, fn, kHRegSize, ins1_h, ins2_h, ins1_sub_ins2_exp_h);
IntArithHelper(config, fn, kSRegSize, ins1_s, ins2_s, ins1_sub_ins2_exp_s);
IntArithHelper(config, fn, kDRegSize, ins1_d, ins2_d, ins1_sub_ins2_exp_d);
unsigned ins2_sub_ins1_exp_b[] = {0x8f, 0x71, 0x72, 0xab};
unsigned ins2_sub_ins1_exp_h[] = {0x8e8f, 0x7171, 0x7272, 0xaaab};
unsigned ins2_sub_ins1_exp_s[] = {0x8ffe8e8f, 0x70017171, 0x71127272, 0xaaaaaaab};
uint64_t ins2_sub_ins1_exp_d[] = {0x8ffffffe8ffe8e8f, 0x7000000170017171,
0x7111111271127272, 0xaaaaaaaaaaaaaaab};
IntArithHelper(config, fn, kBRegSize, ins2_b, ins1_b, ins2_sub_ins1_exp_b);
IntArithHelper(config, fn, kHRegSize, ins2_h, ins1_h, ins2_sub_ins1_exp_h);
IntArithHelper(config, fn, kSRegSize, ins2_s, ins1_s, ins2_sub_ins1_exp_s);
IntArithHelper(config, fn, kDRegSize, ins2_d, ins1_d, ins2_sub_ins1_exp_d);
fn = &MacroAssembler::Sqsub;
unsigned ins1_sqsub_ins2_exp_b[] = {0x80, 0x7f, 0x7f, 0x80};
unsigned ins1_sqsub_ins2_exp_h[] = {0x8000, 0x7fff, 0x7fff, 0x8000};
unsigned ins1_sqsub_ins2_exp_s[] = {0x80000000, 0x7fffffff, 0x7fffffff, 0x80000000};
uint64_t ins1_sqsub_ins2_exp_d[] = {0x8000000000000000, 0x7fffffffffffffff,
0x7fffffffffffffff, 0x8000000000000000};
IntArithHelper(config, fn, kBRegSize, ins1_b, ins2_b, ins1_sqsub_ins2_exp_b);
IntArithHelper(config, fn, kHRegSize, ins1_h, ins2_h, ins1_sqsub_ins2_exp_h);
IntArithHelper(config, fn, kSRegSize, ins1_s, ins2_s, ins1_sqsub_ins2_exp_s);
IntArithHelper(config, fn, kDRegSize, ins1_d, ins2_d, ins1_sqsub_ins2_exp_d);
unsigned ins2_sqsub_ins1_exp_b[] = {0x7f, 0x80, 0x80, 0x7f};
unsigned ins2_sqsub_ins1_exp_h[] = {0x7fff, 0x8000, 0x8000, 0x7fff};
unsigned ins2_sqsub_ins1_exp_s[] = {0x7fffffff, 0x80000000, 0x80000000, 0x7fffffff};
uint64_t ins2_sqsub_ins1_exp_d[] = {0x7fffffffffffffff, 0x8000000000000000,
0x8000000000000000, 0x7fffffffffffffff};
IntArithHelper(config, fn, kBRegSize, ins2_b, ins1_b, ins2_sqsub_ins1_exp_b);
IntArithHelper(config, fn, kHRegSize, ins2_h, ins1_h, ins2_sqsub_ins1_exp_h);
IntArithHelper(config, fn, kSRegSize, ins2_s, ins1_s, ins2_sqsub_ins1_exp_s);
IntArithHelper(config, fn, kDRegSize, ins2_d, ins1_d, ins2_sqsub_ins1_exp_d);
fn = &MacroAssembler::Uqsub;
unsigned ins1_uqsub_ins2_exp_b[] = {0x71, 0x00, 0x00, 0x55};
unsigned ins1_uqsub_ins2_exp_h[] = {0x7171, 0x0000, 0x0000, 0x5555};
unsigned ins1_uqsub_ins2_exp_s[] = {0x70017171, 0x00000000, 0x00000000, 0x55555555};
uint64_t ins1_uqsub_ins2_exp_d[] = {0x7000000170017171, 0x0000000000000000,
0x0000000000000000, 0x5555555555555555};
IntArithHelper(config, fn, kBRegSize, ins1_b, ins2_b, ins1_uqsub_ins2_exp_b);
IntArithHelper(config, fn, kHRegSize, ins1_h, ins2_h, ins1_uqsub_ins2_exp_h);
IntArithHelper(config, fn, kSRegSize, ins1_s, ins2_s, ins1_uqsub_ins2_exp_s);
IntArithHelper(config, fn, kDRegSize, ins1_d, ins2_d, ins1_uqsub_ins2_exp_d);
unsigned ins2_uqsub_ins1_exp_b[] = {0x00, 0x71, 0x72, 0x00};
unsigned ins2_uqsub_ins1_exp_h[] = {0x0000, 0x7171, 0x7272, 0x0000};
unsigned ins2_uqsub_ins1_exp_s[] = {0x00000000, 0x70017171, 0x71127272, 0x00000000};
uint64_t ins2_uqsub_ins1_exp_d[] = {0x0000000000000000, 0x7000000170017171,
0x7111111271127272, 0x0000000000000000};
IntArithHelper(config, fn, kBRegSize, ins2_b, ins1_b, ins2_uqsub_ins1_exp_b);
IntArithHelper(config, fn, kHRegSize, ins2_h, ins1_h, ins2_uqsub_ins1_exp_h);
IntArithHelper(config, fn, kSRegSize, ins2_s, ins1_s, ins2_uqsub_ins1_exp_s);
IntArithHelper(config, fn, kDRegSize, ins2_d, ins1_d, ins2_uqsub_ins1_exp_d);
// clang-format on
}
TEST_SVE(sve_rdvl) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// Encodable multipliers.
__ Rdvl(x0, 0);
__ Rdvl(x1, 1);
__ Rdvl(x2, 2);
__ Rdvl(x3, 31);
__ Rdvl(x4, -1);
__ Rdvl(x5, -2);
__ Rdvl(x6, -32);
// For unencodable multipliers, the MacroAssembler uses a sequence of
// instructions.
__ Rdvl(x10, 32);
__ Rdvl(x11, -33);
__ Rdvl(x12, 42);
__ Rdvl(x13, -42);
// The maximum value of VL is 256 (bytes), so the multiplier is limited to the
// range [INT64_MIN/256, INT64_MAX/256], to ensure that no signed overflow
// occurs in the macro.
__ Rdvl(x14, 0x007fffffffffffff);
__ Rdvl(x15, -0x0080000000000000);
END();
if (CAN_RUN()) {
RUN();
uint64_t vl = config->sve_vl_in_bytes();
ASSERT_EQUAL_64(vl * 0, x0);
ASSERT_EQUAL_64(vl * 1, x1);
ASSERT_EQUAL_64(vl * 2, x2);
ASSERT_EQUAL_64(vl * 31, x3);
ASSERT_EQUAL_64(vl * -1, x4);
ASSERT_EQUAL_64(vl * -2, x5);
ASSERT_EQUAL_64(vl * -32, x6);
ASSERT_EQUAL_64(vl * 32, x10);
ASSERT_EQUAL_64(vl * -33, x11);
ASSERT_EQUAL_64(vl * 42, x12);
ASSERT_EQUAL_64(vl * -42, x13);
ASSERT_EQUAL_64(vl * 0x007fffffffffffff, x14);
ASSERT_EQUAL_64(vl * 0xff80000000000000, x15);
}
}
TEST_SVE(sve_rdpl) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// There is no `rdpl` instruction, so the MacroAssembler maps `Rdpl` onto
// Addpl(xd, xzr, ...).
// Encodable multipliers (as `addvl`).
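// (A multiple of eight PLs is a whole number of VLs, so 8, 248, -8 and
// -256 PLs are 1, 31, -1 and -32 VLs respectively, all within the range
// that `addvl` can encode.)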
__ Rdpl(x0, 0);
__ Rdpl(x1, 8);
__ Rdpl(x2, 248);
__ Rdpl(x3, -8);
__ Rdpl(x4, -256);
// Encodable multipliers (as `movz` + `addpl`).
__ Rdpl(x7, 31);
__ Rdpl(x8, -31);
// For unencodable multipliers, the MacroAssembler uses a sequence of
// instructions.
__ Rdpl(x10, 42);
__ Rdpl(x11, -42);
// The maximum value of VL is 256 (bytes), so the multiplier is limited to the
// range [INT64_MIN/256, INT64_MAX/256], to ensure that no signed overflow
// occurs in the macro.
__ Rdpl(x12, 0x007fffffffffffff);
__ Rdpl(x13, -0x0080000000000000);
END();
if (CAN_RUN()) {
RUN();
uint64_t vl = config->sve_vl_in_bytes();
VIXL_ASSERT((vl % kZRegBitsPerPRegBit) == 0);
uint64_t pl = vl / kZRegBitsPerPRegBit;
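// A P register has one bit for each byte of a Z register, so
// kZRegBitsPerPRegBit is eight and PL is always VL / 8.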
ASSERT_EQUAL_64(pl * 0, x0);
ASSERT_EQUAL_64(pl * 8, x1);
ASSERT_EQUAL_64(pl * 248, x2);
ASSERT_EQUAL_64(pl * -8, x3);
ASSERT_EQUAL_64(pl * -256, x4);
ASSERT_EQUAL_64(pl * 31, x7);
ASSERT_EQUAL_64(pl * -31, x8);
ASSERT_EQUAL_64(pl * 42, x10);
ASSERT_EQUAL_64(pl * -42, x11);
ASSERT_EQUAL_64(pl * 0x007fffffffffffff, x12);
ASSERT_EQUAL_64(pl * 0xff80000000000000, x13);
}
}
TEST_SVE(sve_addvl) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t base = 0x1234567800000000;
__ Mov(x30, base);
// Encodable multipliers.
__ Addvl(x0, x30, 0);
__ Addvl(x1, x30, 1);
__ Addvl(x2, x30, 31);
__ Addvl(x3, x30, -1);
__ Addvl(x4, x30, -32);
// For unencodable multipliers, the MacroAssembler uses `Rdvl` and `Add`.
__ Addvl(x5, x30, 32);
__ Addvl(x6, x30, -33);
// Test the limits of the multiplier supported by the `Rdvl` macro.
__ Addvl(x7, x30, 0x007fffffffffffff);
__ Addvl(x8, x30, -0x0080000000000000);
// Check that xzr behaves correctly.
__ Addvl(x9, xzr, 8);
__ Addvl(x10, xzr, 42);
// Check that sp behaves correctly with encodable and unencodable multipliers.
__ Addvl(sp, sp, -5);
__ Addvl(sp, sp, -37);
__ Addvl(x11, sp, -2);
__ Addvl(sp, x11, 2);
__ Addvl(x12, sp, -42);
// Restore the value of sp.
__ Addvl(sp, x11, 39);
__ Addvl(sp, sp, 5);
// Adjust x11 and x12 to make the test sp-agnostic.
__ Sub(x11, sp, x11);
__ Sub(x12, sp, x12);
// Check cases where xd.Is(xn). This stresses scratch register allocation.
__ Mov(x20, x30);
__ Mov(x21, x30);
__ Mov(x22, x30);
__ Addvl(x20, x20, 4);
__ Addvl(x21, x21, 42);
__ Addvl(x22, x22, -0x0080000000000000);
END();
if (CAN_RUN()) {
RUN();
uint64_t vl = config->sve_vl_in_bytes();
ASSERT_EQUAL_64(base + (vl * 0), x0);
ASSERT_EQUAL_64(base + (vl * 1), x1);
ASSERT_EQUAL_64(base + (vl * 31), x2);
ASSERT_EQUAL_64(base + (vl * -1), x3);
ASSERT_EQUAL_64(base + (vl * -32), x4);
ASSERT_EQUAL_64(base + (vl * 32), x5);
ASSERT_EQUAL_64(base + (vl * -33), x6);
ASSERT_EQUAL_64(base + (vl * 0x007fffffffffffff), x7);
ASSERT_EQUAL_64(base + (vl * 0xff80000000000000), x8);
ASSERT_EQUAL_64(vl * 8, x9);
ASSERT_EQUAL_64(vl * 42, x10);
ASSERT_EQUAL_64(vl * 44, x11);
ASSERT_EQUAL_64(vl * 84, x12);
ASSERT_EQUAL_64(base + (vl * 4), x20);
ASSERT_EQUAL_64(base + (vl * 42), x21);
ASSERT_EQUAL_64(base + (vl * 0xff80000000000000), x22);
ASSERT_EQUAL_64(base, x30);
}
}
TEST_SVE(sve_addpl) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t base = 0x1234567800000000;
__ Mov(x30, base);
// Encodable multipliers.
__ Addpl(x0, x30, 0);
__ Addpl(x1, x30, 1);
__ Addpl(x2, x30, 31);
__ Addpl(x3, x30, -1);
__ Addpl(x4, x30, -32);
// For unencodable multipliers, the MacroAssembler uses `Addvl` if it can, or
// it falls back to `Rdvl` and `Add`.
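// (32 PLs is exactly 4 VLs, so the first case can use `addvl`; -33 PLs is
// not a whole number of VLs, so it needs the full fallback sequence.)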
__ Addpl(x5, x30, 32);
__ Addpl(x6, x30, -33);
// Test the limits of the multiplier supported by the `Rdvl` macro.
__ Addpl(x7, x30, 0x007fffffffffffff);
__ Addpl(x8, x30, -0x0080000000000000);
// Check that xzr behaves correctly.
__ Addpl(x9, xzr, 8);
__ Addpl(x10, xzr, 42);
// Check that sp behaves correctly with encodable and unencodable multipliers.
__ Addpl(sp, sp, -5);
__ Addpl(sp, sp, -37);
__ Addpl(x11, sp, -2);
__ Addpl(sp, x11, 2);
__ Addpl(x12, sp, -42);
// Restore the value of sp.
__ Addpl(sp, x11, 39);
__ Addpl(sp, sp, 5);
// Adjust x11 and x12 to make the test sp-agnostic.
__ Sub(x11, sp, x11);
__ Sub(x12, sp, x12);
// Check cases where xd.Is(xn). This stresses scratch register allocation.
__ Mov(x20, x30);
__ Mov(x21, x30);
__ Mov(x22, x30);
__ Addpl(x20, x20, 4);
__ Addpl(x21, x21, 42);
__ Addpl(x22, x22, -0x0080000000000000);
END();
if (CAN_RUN()) {
RUN();
uint64_t vl = config->sve_vl_in_bytes();
VIXL_ASSERT((vl % kZRegBitsPerPRegBit) == 0);
uint64_t pl = vl / kZRegBitsPerPRegBit;
ASSERT_EQUAL_64(base + (pl * 0), x0);
ASSERT_EQUAL_64(base + (pl * 1), x1);
ASSERT_EQUAL_64(base + (pl * 31), x2);
ASSERT_EQUAL_64(base + (pl * -1), x3);
ASSERT_EQUAL_64(base + (pl * -32), x4);
ASSERT_EQUAL_64(base + (pl * 32), x5);
ASSERT_EQUAL_64(base + (pl * -33), x6);
ASSERT_EQUAL_64(base + (pl * 0x007fffffffffffff), x7);
ASSERT_EQUAL_64(base + (pl * 0xff80000000000000), x8);
ASSERT_EQUAL_64(pl * 8, x9);
ASSERT_EQUAL_64(pl * 42, x10);
ASSERT_EQUAL_64(pl * 44, x11);
ASSERT_EQUAL_64(pl * 84, x12);
ASSERT_EQUAL_64(base + (pl * 4), x20);
ASSERT_EQUAL_64(base + (pl * 42), x21);
ASSERT_EQUAL_64(base + (pl * 0xff80000000000000), x22);
ASSERT_EQUAL_64(base, x30);
}
}
TEST_SVE(sve_calculate_sve_address) {
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wshadow"
// Shadow the `MacroAssembler` type so that the test macros work without
// modification.
typedef CalculateSVEAddressMacroAssembler MacroAssembler;
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START(); // NOLINT(clang-diagnostic-local-type-template-args)
uint64_t base = 0x1234567800000000;
__ Mov(x28, base);
__ Mov(x29, 48);
__ Mov(x30, -48);
// Simple scalar (or equivalent) cases.
__ CalculateSVEAddress(x0, SVEMemOperand(x28));
__ CalculateSVEAddress(x1, SVEMemOperand(x28, 0));
__ CalculateSVEAddress(x2, SVEMemOperand(x28, 0, SVE_MUL_VL));
__ CalculateSVEAddress(x3, SVEMemOperand(x28, 0, SVE_MUL_VL), 3);
__ CalculateSVEAddress(x4, SVEMemOperand(x28, xzr));
__ CalculateSVEAddress(x5, SVEMemOperand(x28, xzr, LSL, 42));
// scalar-plus-immediate
// Unscaled immediates, handled with `Add`.
__ CalculateSVEAddress(x6, SVEMemOperand(x28, 42));
__ CalculateSVEAddress(x7, SVEMemOperand(x28, -42));
// Scaled immediates, handled with `Addvl` or `Addpl`.
__ CalculateSVEAddress(x8, SVEMemOperand(x28, 31, SVE_MUL_VL), 0);
__ CalculateSVEAddress(x9, SVEMemOperand(x28, -32, SVE_MUL_VL), 0);
// Out of `addvl` or `addpl` range.
__ CalculateSVEAddress(x10, SVEMemOperand(x28, 42, SVE_MUL_VL), 0);
__ CalculateSVEAddress(x11, SVEMemOperand(x28, -42, SVE_MUL_VL), 0);
// As above, for VL-based accesses smaller than a Z register.
VIXL_STATIC_ASSERT(kZRegBitsPerPRegBitLog2 == 3);
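// With a shift of three, the `SVE_MUL_VL` offsets here are interpreted in
// units of VL / 8 (one PL), so an offset of -32 * 8 such units is -32 VLs.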
__ CalculateSVEAddress(x12, SVEMemOperand(x28, -32 * 8, SVE_MUL_VL), 3);
__ CalculateSVEAddress(x13, SVEMemOperand(x28, -42 * 8, SVE_MUL_VL), 3);
__ CalculateSVEAddress(x14, SVEMemOperand(x28, -32 * 4, SVE_MUL_VL), 2);
__ CalculateSVEAddress(x15, SVEMemOperand(x28, -42 * 4, SVE_MUL_VL), 2);
__ CalculateSVEAddress(x18, SVEMemOperand(x28, -32 * 2, SVE_MUL_VL), 1);
__ CalculateSVEAddress(x19, SVEMemOperand(x28, -42 * 2, SVE_MUL_VL), 1);
// scalar-plus-scalar
__ CalculateSVEAddress(x20, SVEMemOperand(x28, x29));
__ CalculateSVEAddress(x21, SVEMemOperand(x28, x30));
__ CalculateSVEAddress(x22, SVEMemOperand(x28, x29, LSL, 8));
__ CalculateSVEAddress(x23, SVEMemOperand(x28, x30, LSL, 8));
// In-place updates, to stress scratch register allocation.
__ Mov(x24, 0xabcd000000000000);
__ Mov(x25, 0xabcd101100000000);
__ Mov(x26, 0xabcd202200000000);
__ Mov(x27, 0xabcd303300000000);
__ Mov(x28, 0xabcd404400000000);
__ Mov(x29, 0xabcd505500000000);
__ CalculateSVEAddress(x24, SVEMemOperand(x24));
__ CalculateSVEAddress(x25, SVEMemOperand(x25, 0x42));
__ CalculateSVEAddress(x26, SVEMemOperand(x26, 3, SVE_MUL_VL), 0);
__ CalculateSVEAddress(x27, SVEMemOperand(x27, 0x42, SVE_MUL_VL), 3);
__ CalculateSVEAddress(x28, SVEMemOperand(x28, x30));
__ CalculateSVEAddress(x29, SVEMemOperand(x29, x30, LSL, 4));
END();
if (CAN_RUN()) {
RUN();
uint64_t vl = config->sve_vl_in_bytes();
VIXL_ASSERT((vl % kZRegBitsPerPRegBit) == 0);
uint64_t pl = vl / kZRegBitsPerPRegBit;
// Simple scalar (or equivalent) cases.
ASSERT_EQUAL_64(base, x0);
ASSERT_EQUAL_64(base, x1);
ASSERT_EQUAL_64(base, x2);
ASSERT_EQUAL_64(base, x3);
ASSERT_EQUAL_64(base, x4);
ASSERT_EQUAL_64(base, x5);
// scalar-plus-immediate
ASSERT_EQUAL_64(base + 42, x6);
ASSERT_EQUAL_64(base - 42, x7);
ASSERT_EQUAL_64(base + (31 * vl), x8);
ASSERT_EQUAL_64(base - (32 * vl), x9);
ASSERT_EQUAL_64(base + (42 * vl), x10);
ASSERT_EQUAL_64(base - (42 * vl), x11);
ASSERT_EQUAL_64(base - (32 * vl), x12);
ASSERT_EQUAL_64(base - (42 * vl), x13);
ASSERT_EQUAL_64(base - (32 * vl), x14);
ASSERT_EQUAL_64(base - (42 * vl), x15);
ASSERT_EQUAL_64(base - (32 * vl), x18);
ASSERT_EQUAL_64(base - (42 * vl), x19);
// scalar-plus-scalar
ASSERT_EQUAL_64(base + 48, x20);
ASSERT_EQUAL_64(base - 48, x21);
ASSERT_EQUAL_64(base + (48 << 8), x22);
ASSERT_EQUAL_64(base - (48 << 8), x23);
// In-place updates.
ASSERT_EQUAL_64(0xabcd000000000000, x24);
ASSERT_EQUAL_64(0xabcd101100000000 + 0x42, x25);
ASSERT_EQUAL_64(0xabcd202200000000 + (3 * vl), x26);
ASSERT_EQUAL_64(0xabcd303300000000 + (0x42 * pl), x27);
ASSERT_EQUAL_64(0xabcd404400000000 - 48, x28);
ASSERT_EQUAL_64(0xabcd505500000000 - (48 << 4), x29);
}
#pragma GCC diagnostic pop
}
TEST_SVE(sve_permute_vector_unpredicated) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
START();
// Initialise registers with known values first.
__ Dup(z1.VnB(), 0x11);
__ Dup(z2.VnB(), 0x22);
__ Dup(z3.VnB(), 0x33);
__ Dup(z4.VnB(), 0x44);
__ Mov(x0, 0x0123456789abcdef);
__ Fmov(d0, RawbitsToDouble(0x7ffaaaaa22223456));
__ Insr(z1.VnS(), w0);
__ Insr(z2.VnD(), x0);
__ Insr(z3.VnH(), h0);
__ Insr(z4.VnD(), d0);
uint64_t inputs[] = {0xfedcba9876543210,
0x0123456789abcdef,
0x8f8e8d8c8b8a8988,
0x8786858483828180};
// Initialise a distinguishable value throughout the register first.
__ Dup(z9.VnB(), 0xff);
InsrHelper(&masm, z9.VnD(), inputs);
__ Rev(z5.VnB(), z9.VnB());
__ Rev(z6.VnH(), z9.VnH());
__ Rev(z7.VnS(), z9.VnS());
__ Rev(z8.VnD(), z9.VnD());
int index[7] = {22, 7, 7, 3, 1, 1, 63};
// Broadcasting an element from within the input array.
__ Dup(z10.VnB(), z9.VnB(), index[0]);
__ Dup(z11.VnH(), z9.VnH(), index[1]);
__ Dup(z12.VnS(), z9.VnS(), index[2]);
__ Dup(z13.VnD(), z9.VnD(), index[3]);
__ Dup(z14.VnQ(), z9.VnQ(), index[4]);
// Test dst == src
__ Mov(z15, z9);
__ Dup(z15.VnS(), z15.VnS(), index[5]);
// Selecting an element beyond the input array.
__ Dup(z16.VnB(), z9.VnB(), index[6]);
END();
if (CAN_RUN()) {
RUN();
// Insr
uint64_t z1_expected[] = {0x1111111111111111, 0x1111111189abcdef};
uint64_t z2_expected[] = {0x2222222222222222, 0x0123456789abcdef};
uint64_t z3_expected[] = {0x3333333333333333, 0x3333333333333456};
uint64_t z4_expected[] = {0x4444444444444444, 0x7ffaaaaa22223456};
ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
// Rev
int lane_count = core.GetSVELaneCount(kBRegSize);
for (int i = 0; i < lane_count; i++) {
uint64_t expected =
core.zreg_lane(z5.GetCode(), kBRegSize, lane_count - i - 1);
uint64_t input = core.zreg_lane(z9.GetCode(), kBRegSize, i);
ASSERT_EQUAL_64(expected, input);
}
lane_count = core.GetSVELaneCount(kHRegSize);
for (int i = 0; i < lane_count; i++) {
uint64_t expected =
core.zreg_lane(z6.GetCode(), kHRegSize, lane_count - i - 1);
uint64_t input = core.zreg_lane(z9.GetCode(), kHRegSize, i);
ASSERT_EQUAL_64(expected, input);
}
lane_count = core.GetSVELaneCount(kSRegSize);
for (int i = 0; i < lane_count; i++) {
uint64_t expected =
core.zreg_lane(z7.GetCode(), kSRegSize, lane_count - i - 1);
uint64_t input = core.zreg_lane(z9.GetCode(), kSRegSize, i);
ASSERT_EQUAL_64(expected, input);
}
lane_count = core.GetSVELaneCount(kDRegSize);
for (int i = 0; i < lane_count; i++) {
uint64_t expected =
core.zreg_lane(z8.GetCode(), kDRegSize, lane_count - i - 1);
uint64_t input = core.zreg_lane(z9.GetCode(), kDRegSize, i);
ASSERT_EQUAL_64(expected, input);
}
// Dup
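// An indexed `dup` broadcasts the selected element to every lane, but
// zeroes the destination if the index is beyond the current vector
// length; hence each expected value falls back to zero.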
unsigned vl = config->sve_vl_in_bits();
lane_count = core.GetSVELaneCount(kBRegSize);
uint64_t expected_z10 = (vl > (index[0] * kBRegSize)) ? 0x23 : 0;
for (int i = 0; i < lane_count; i++) {
ASSERT_EQUAL_SVE_LANE(expected_z10, z10.VnB(), i);
}
lane_count = core.GetSVELaneCount(kHRegSize);
uint64_t expected_z11 = (vl > (index[1] * kHRegSize)) ? 0x8f8e : 0;
for (int i = 0; i < lane_count; i++) {
ASSERT_EQUAL_SVE_LANE(expected_z11, z11.VnH(), i);
}
lane_count = core.GetSVELaneCount(kSRegSize);
uint64_t expected_z12 = (vl > (index[2] * kSRegSize)) ? 0xfedcba98 : 0;
for (int i = 0; i < lane_count; i++) {
ASSERT_EQUAL_SVE_LANE(expected_z12, z12.VnS(), i);
}
lane_count = core.GetSVELaneCount(kDRegSize);
uint64_t expected_z13 =
(vl > (index[3] * kDRegSize)) ? 0xfedcba9876543210 : 0;
for (int i = 0; i < lane_count; i++) {
ASSERT_EQUAL_SVE_LANE(expected_z13, z13.VnD(), i);
}
lane_count = core.GetSVELaneCount(kDRegSize);
uint64_t expected_z14_lo = 0;
uint64_t expected_z14_hi = 0;
if (vl > (index[4] * kQRegSize)) {
expected_z14_lo = 0x0123456789abcdef;
expected_z14_hi = 0xfedcba9876543210;
}
for (int i = 0; i < lane_count; i += 2) {
ASSERT_EQUAL_SVE_LANE(expected_z14_lo, z14.VnD(), i);
ASSERT_EQUAL_SVE_LANE(expected_z14_hi, z14.VnD(), i + 1);
}
lane_count = core.GetSVELaneCount(kSRegSize);
uint64_t expected_z15 = (vl > (index[5] * kSRegSize)) ? 0x87868584 : 0;
for (int i = 0; i < lane_count; i++) {
ASSERT_EQUAL_SVE_LANE(expected_z15, z15.VnS(), i);
}
lane_count = core.GetSVELaneCount(kBRegSize);
uint64_t expected_z16 = (vl > (index[6] * kBRegSize)) ? 0xff : 0;
for (int i = 0; i < lane_count; i++) {
ASSERT_EQUAL_SVE_LANE(expected_z16, z16.VnB(), i);
}
}
}
TEST_SVE(sve_permute_vector_unpredicated_unpack_vector_elements) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t z9_inputs[] = {0xfedcba9876543210,
0x0123456789abcdef,
0x8f8e8d8c8b8a8988,
0x8786858483828180};
InsrHelper(&masm, z9.VnD(), z9_inputs);
__ Sunpkhi(z10.VnH(), z9.VnB());
__ Sunpkhi(z11.VnS(), z9.VnH());
__ Sunpkhi(z12.VnD(), z9.VnS());
__ Sunpklo(z13.VnH(), z9.VnB());
__ Sunpklo(z14.VnS(), z9.VnH());
__ Sunpklo(z15.VnD(), z9.VnS());
__ Uunpkhi(z16.VnH(), z9.VnB());
__ Uunpkhi(z17.VnS(), z9.VnH());
__ Uunpkhi(z18.VnD(), z9.VnS());
__ Uunpklo(z19.VnH(), z9.VnB());
__ Uunpklo(z20.VnS(), z9.VnH());
__ Uunpklo(z21.VnD(), z9.VnS());
// Test unpacking with same source and destination.
__ Mov(z22, z9);
__ Sunpklo(z22.VnH(), z22.VnB());
__ Mov(z23, z9);
__ Uunpklo(z23.VnH(), z23.VnB());
END();
if (CAN_RUN()) {
RUN();
// Sunpkhi
int lane_count = core.GetSVELaneCount(kHRegSize);
for (int i = lane_count - 1; i >= 0; i--) {
uint16_t expected = core.zreg_lane<uint16_t>(z10.GetCode(), i);
uint8_t b_lane = core.zreg_lane<uint8_t>(z9.GetCode(), i + lane_count);
uint16_t input = SignExtend<int16_t>(b_lane, kBRegSize);
ASSERT_EQUAL_64(expected, input);
}
lane_count = core.GetSVELaneCount(kSRegSize);
for (int i = lane_count - 1; i >= 0; i--) {
uint32_t expected = core.zreg_lane<uint32_t>(z11.GetCode(), i);
uint16_t h_lane = core.zreg_lane<uint16_t>(z9.GetCode(), i + lane_count);
uint32_t input = SignExtend<int32_t>(h_lane, kHRegSize);
ASSERT_EQUAL_64(expected, input);
}
lane_count = core.GetSVELaneCount(kDRegSize);
for (int i = lane_count - 1; i >= 0; i--) {
uint64_t expected = core.zreg_lane<uint64_t>(z12.GetCode(), i);
uint32_t s_lane = core.zreg_lane<uint32_t>(z9.GetCode(), i + lane_count);
uint64_t input = SignExtend<int64_t>(s_lane, kSRegSize);
ASSERT_EQUAL_64(expected, input);
}
// Sunpklo
lane_count = core.GetSVELaneCount(kHRegSize);
for (int i = lane_count - 1; i >= 0; i--) {
uint16_t expected = core.zreg_lane<uint16_t>(z13.GetCode(), i);
uint8_t b_lane = core.zreg_lane<uint8_t>(z9.GetCode(), i);
uint16_t input = SignExtend<int16_t>(b_lane, kBRegSize);
ASSERT_EQUAL_64(expected, input);
}
lane_count = core.GetSVELaneCount(kSRegSize);
for (int i = lane_count - 1; i >= 0; i--) {
uint32_t expected = core.zreg_lane<uint32_t>(z14.GetCode(), i);
uint16_t h_lane = core.zreg_lane<uint16_t>(z9.GetCode(), i);
uint32_t input = SignExtend<int32_t>(h_lane, kHRegSize);
ASSERT_EQUAL_64(expected, input);
}
lane_count = core.GetSVELaneCount(kDRegSize);
for (int i = lane_count - 1; i >= 0; i--) {
uint64_t expected = core.zreg_lane<uint64_t>(z15.GetCode(), i);
uint32_t s_lane = core.zreg_lane<uint32_t>(z9.GetCode(), i);
uint64_t input = SignExtend<int64_t>(s_lane, kSRegSize);
ASSERT_EQUAL_64(expected, input);
}
// Uunpkhi
lane_count = core.GetSVELaneCount(kHRegSize);
for (int i = lane_count - 1; i >= 0; i--) {
uint16_t expected = core.zreg_lane<uint16_t>(z16.GetCode(), i);
uint16_t input = core.zreg_lane<uint8_t>(z9.GetCode(), i + lane_count);
ASSERT_EQUAL_64(expected, input);
}
lane_count = core.GetSVELaneCount(kSRegSize);
for (int i = lane_count - 1; i >= 0; i--) {
uint32_t expected = core.zreg_lane<uint32_t>(z17.GetCode(), i);
uint32_t input = core.zreg_lane<uint16_t>(z9.GetCode(), i + lane_count);
ASSERT_EQUAL_64(expected, input);
}
lane_count = core.GetSVELaneCount(kDRegSize);
for (int i = lane_count - 1; i >= 0; i--) {
uint64_t expected = core.zreg_lane<uint64_t>(z18.GetCode(), i);
uint64_t input = core.zreg_lane<uint32_t>(z9.GetCode(), i + lane_count);
ASSERT_EQUAL_64(expected, input);
}
// Uunpklo
lane_count = core.GetSVELaneCount(kHRegSize);
for (int i = lane_count - 1; i >= 0; i--) {
uint16_t expected = core.zreg_lane<uint16_t>(z19.GetCode(), i);
uint16_t input = core.zreg_lane<uint8_t>(z9.GetCode(), i);
ASSERT_EQUAL_64(expected, input);
}
lane_count = core.GetSVELaneCount(kSRegSize);
for (int i = lane_count - 1; i >= 0; i--) {
uint32_t expected = core.zreg_lane<uint32_t>(z20.GetCode(), i);
uint32_t input = core.zreg_lane<uint16_t>(z9.GetCode(), i);
ASSERT_EQUAL_64(expected, input);
}
lane_count = core.GetSVELaneCount(kDRegSize);
for (int i = lane_count - 1; i >= 0; i--) {
uint64_t expected = core.zreg_lane<uint64_t>(z21.GetCode(), i);
uint64_t input = core.zreg_lane<uint32_t>(z9.GetCode(), i);
ASSERT_EQUAL_64(expected, input);
}
ASSERT_EQUAL_SVE(z13, z22);
ASSERT_EQUAL_SVE(z19, z23);
}
}
TEST_SVE(sve_cnot_not) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t in[] = {0x0000000000000000, 0x00000000e1c30000, 0x123456789abcdef0};
// For simplicity, we re-use the same pg for various lane sizes.
// For D lanes: 1, 1, 0
// For S lanes: 1, 1, 1, 0, 0
// For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
Initialise(&masm, p0.VnB(), pg_in);
PRegisterM pg = p0.Merging();
// These are merging operations, so we have to initialise the result register.
// We use a mixture of constructive and destructive operations.
InsrHelper(&masm, z31.VnD(), in);
// Make a copy so we can check that constructive operations preserve zn.
__ Mov(z30, z31);
// For constructive operations, use a different initial result value.
__ Index(z29.VnB(), 0, -1);
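// `Cnot` writes 1 to each active lane whose source element is zero and 0
// otherwise; `Not` inverts every bit of each active lane.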
__ Mov(z0, z31);
__ Cnot(z0.VnB(), pg, z0.VnB()); // destructive
__ Mov(z1, z29);
__ Cnot(z1.VnH(), pg, z31.VnH());
__ Mov(z2, z31);
__ Cnot(z2.VnS(), pg, z2.VnS()); // destructive
__ Mov(z3, z29);
__ Cnot(z3.VnD(), pg, z31.VnD());
__ Mov(z4, z29);
__ Not(z4.VnB(), pg, z31.VnB());
__ Mov(z5, z31);
__ Not(z5.VnH(), pg, z5.VnH()); // destructive
__ Mov(z6, z29);
__ Not(z6.VnS(), pg, z31.VnS());
__ Mov(z7, z31);
__ Not(z7.VnD(), pg, z7.VnD()); // destructive
END();
if (CAN_RUN()) {
RUN();
// Check that constructive operations preserve their inputs.
ASSERT_EQUAL_SVE(z30, z31);
// clang-format off
// Cnot (B) destructive
uint64_t expected_z0[] =
// pg: 0 0 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0
{0x0000000001000101, 0x01000001e1000101, 0x12340078000000f0};
ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
// Cnot (H)
uint64_t expected_z1[] =
// pg: 0 0 0 1 0 1 1 1 0 0 1 0
{0xe9eaebecedee0001, 0xf1f2000100000001, 0xf9fafbfc0000ff00};
ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
// Cnot (S) destructive
uint64_t expected_z2[] =
// pg: 0 1 1 1 0 0
{0x0000000000000001, 0x0000000100000000, 0x123456789abcdef0};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
// Cnot (D)
uint64_t expected_z3[] =
// pg: 1 1 0
{0x0000000000000001, 0x0000000000000000, 0xf9fafbfcfdfeff00};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
// Not (B)
uint64_t expected_z4[] =
// pg: 0 0 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0
{0xe9eaebecffeeffff, 0xfff2f3fff53cffff, 0xf9faa9fc65432100};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
// Not (H) destructive
uint64_t expected_z5[] =
// pg: 0 0 0 1 0 1 1 1 0 0 1 0
{0x000000000000ffff, 0x0000ffff1e3cffff, 0x123456786543def0};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
// Not (S)
uint64_t expected_z6[] =
// pg: 0 1 1 1 0 0
{0xe9eaebecffffffff, 0xffffffff1e3cffff, 0xf9fafbfcfdfeff00};
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
// Not (D) destructive
uint64_t expected_z7[] =
// pg: 1 1 0
{0xffffffffffffffff, 0xffffffff1e3cffff, 0x123456789abcdef0};
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
// clang-format on
}
}
TEST_SVE(sve_fabs_fneg) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// Include FP64, FP32 and FP16 signalling NaNs. Most FP operations quieten
// NaNs, but fabs and fneg do not.
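// `fabs` and `fneg` only clear or invert the sign bit, so NaN payloads
// pass through unmodified.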
uint64_t in[] = {0xc04500004228d140, // Recognisable (+/-42) values.
0xfff00000ff80fc01, // Signalling NaNs.
0x123456789abcdef0};
// For simplicity, we re-use the same pg for various lane sizes.
// For D lanes: 1, 1, 0
// For S lanes: 1, 1, 1, 0, 0
// For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
Initialise(&masm, p0.VnB(), pg_in);
PRegisterM pg = p0.Merging();
// These are merging operations, so we have to initialise the result register.
// We use a mixture of constructive and destructive operations.
InsrHelper(&masm, z31.VnD(), in);
// Make a copy so we can check that constructive operations preserve zn.
__ Mov(z30, z31);
// For constructive operations, use a different initial result value.
__ Index(z29.VnB(), 0, -1);
__ Mov(z0, z29);
__ Fabs(z0.VnH(), pg, z31.VnH());
__ Mov(z1, z31);
__ Fabs(z1.VnS(), pg, z1.VnS()); // destructive
__ Mov(z2, z29);
__ Fabs(z2.VnD(), pg, z31.VnD());
__ Mov(z3, z31);
__ Fneg(z3.VnH(), pg, z3.VnH()); // destructive
__ Mov(z4, z29);
__ Fneg(z4.VnS(), pg, z31.VnS());
__ Mov(z5, z31);
__ Fneg(z5.VnD(), pg, z5.VnD()); // destructive
END();
if (CAN_RUN()) {
RUN();
// Check that constructive operations preserve their inputs.
ASSERT_EQUAL_SVE(z30, z31);
// clang-format off
// Fabs (H)
uint64_t expected_z0[] =
// pg: 0 0 0 1 0 1 1 1 0 0 1 0
{0xe9eaebecedee5140, 0xf1f200007f807c01, 0xf9fafbfc1abcff00};
ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
// Fabs (S) destructive
uint64_t expected_z1[] =
// pg: 0 1 1 1 0 0
{0xc04500004228d140, 0x7ff000007f80fc01, 0x123456789abcdef0};
ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
// Fabs (D)
uint64_t expected_z2[] =
// pg: 1 1 0
{0x404500004228d140, 0x7ff00000ff80fc01, 0xf9fafbfcfdfeff00};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
// Fneg (H) destructive
uint64_t expected_z3[] =
// pg: 0 0 0 1 0 1 1 1 0 0 1 0
{0xc045000042285140, 0xfff080007f807c01, 0x123456781abcdef0};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
// Fneg (S)
uint64_t expected_z4[] =
// pg: 0 1 1 1 0 0
{0xe9eaebecc228d140, 0x7ff000007f80fc01, 0xf9fafbfcfdfeff00};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
// Fneg (D) destructive
uint64_t expected_z5[] =
// pg: 1 1 0
{0x404500004228d140, 0x7ff00000ff80fc01, 0x123456789abcdef0};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
// clang-format on
}
}
TEST_SVE(sve_cls_clz_cnt) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t in[] = {0x0000000000000000, 0xfefcf8f0e1c3870f, 0x123456789abcdef0};
// For simplicity, we re-use the same pg for various lane sizes.
// For D lanes: 1, 1, 0
// For S lanes: 1, 1, 1, 0, 0
// For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
Initialise(&masm, p0.VnB(), pg_in);
PRegisterM pg = p0.Merging();
// These are merging operations, so we have to initialise the result register.
// We use a mixture of constructive and destructive operations.
InsrHelper(&masm, z31.VnD(), in);
// Make a copy so we can check that constructive operations preserve zn.
__ Mov(z30, z31);
// For constructive operations, use a different initial result value.
__ Index(z29.VnB(), 0, -1);
__ Mov(z0, z29);
__ Cls(z0.VnB(), pg, z31.VnB());
__ Mov(z1, z31);
__ Clz(z1.VnH(), pg, z1.VnH()); // destructive
__ Mov(z2, z29);
__ Cnt(z2.VnS(), pg, z31.VnS());
__ Mov(z3, z31);
__ Cnt(z3.VnD(), pg, z3.VnD()); // destructive
END();
if (CAN_RUN()) {
RUN();
// Check that constructive operations preserve their inputs.
ASSERT_EQUAL_SVE(z30, z31);
// clang-format off
// cls (B)
uint8_t expected_z0[] =
// pg: 0 0 0 0 1 0 1 1
// pg: 1 0 0 1 0 1 1 1
// pg: 0 0 1 0 1 1 1 0
{0xe9, 0xea, 0xeb, 0xec, 7, 0xee, 7, 7,
6, 0xf2, 0xf3, 3, 0xf5, 1, 0, 3,
0xf9, 0xfa, 0, 0xfc, 0, 0, 1, 0x00};
ASSERT_EQUAL_SVE(expected_z0, z0.VnB());
// clz (H) destructive
uint16_t expected_z1[] =
// pg: 0 0 0 1
// pg: 0 1 1 1
// pg: 0 0 1 0
{0x0000, 0x0000, 0x0000, 16,
0xfefc, 0, 0, 0,
0x1234, 0x5678, 0, 0xdef0};
ASSERT_EQUAL_SVE(expected_z1, z1.VnH());
// cnt (S)
uint32_t expected_z2[] =
// pg: 0 1
// pg: 1 1
// pg: 0 0
{0xe9eaebec, 0,
22, 16,
0xf9fafbfc, 0xfdfeff00};
ASSERT_EQUAL_SVE(expected_z2, z2.VnS());
// cnt (D) destructive
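// For example, popcount(0xfefcf8f0e1c3870f) = 7+6+5+4+4+4+4+4 = 38.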
uint64_t expected_z3[] =
// pg: 1 1 0
{ 0, 38, 0x123456789abcdef0};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
// clang-format on
}
}
TEST_SVE(sve_sxt) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t in[] = {0x01f203f405f607f8, 0xfefcf8f0e1c3870f, 0x123456789abcdef0};
// For simplicity, we re-use the same pg for various lane sizes.
// For D lanes: 1, 1, 0
// For S lanes: 1, 1, 1, 0, 0
// For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
Initialise(&masm, p0.VnB(), pg_in);
PRegisterM pg = p0.Merging();
// These are merging operations, so we have to initialise the result register.
// We use a mixture of constructive and destructive operations.
InsrHelper(&masm, z31.VnD(), in);
// Make a copy so we can check that constructive operations preserve zn.
__ Mov(z30, z31);
// For constructive operations, use a different initial result value.
__ Index(z29.VnB(), 0, -1);
__ Mov(z0, z31);
__ Sxtb(z0.VnH(), pg, z0.VnH()); // destructive
__ Mov(z1, z29);
__ Sxtb(z1.VnS(), pg, z31.VnS());
__ Mov(z2, z31);
__ Sxtb(z2.VnD(), pg, z2.VnD()); // destructive
__ Mov(z3, z29);
__ Sxth(z3.VnS(), pg, z31.VnS());
__ Mov(z4, z31);
__ Sxth(z4.VnD(), pg, z4.VnD()); // destructive
__ Mov(z5, z29);
__ Sxtw(z5.VnD(), pg, z31.VnD());
END();
if (CAN_RUN()) {
RUN();
// Check that constructive operations preserve their inputs.
ASSERT_EQUAL_SVE(z30, z31);
// clang-format off
// Sxtb (H) destructive
uint64_t expected_z0[] =
// pg: 0 0 0 1 0 1 1 1 0 0 1 0
{0x01f203f405f6fff8, 0xfefcfff0ffc3000f, 0x12345678ffbcdef0};
ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
// Sxtb (S)
uint64_t expected_z1[] =
// pg: 0 1 1 1 0 0
{0xe9eaebecfffffff8, 0xfffffff00000000f, 0xf9fafbfcfdfeff00};
ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
// Sxtb (D) destructive
uint64_t expected_z2[] =
// pg: 1 1 0
{0xfffffffffffffff8, 0x000000000000000f, 0x123456789abcdef0};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
// Sxth (S)
uint64_t expected_z3[] =
// pg: 0 1 1 1 0 0
{0xe9eaebec000007f8, 0xfffff8f0ffff870f, 0xf9fafbfcfdfeff00};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
// Sxth (D) destructive
uint64_t expected_z4[] =
// pg: 1 1 0
{0x00000000000007f8, 0xffffffffffff870f, 0x123456789abcdef0};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
// Sxtw (D)
uint64_t expected_z5[] =
// pg: 1 1 0
{0x0000000005f607f8, 0xffffffffe1c3870f, 0xf9fafbfcfdfeff00};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
// clang-format on
}
}
TEST_SVE(sve_uxt) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t in[] = {0x01f203f405f607f8, 0xfefcf8f0e1c3870f, 0x123456789abcdef0};
// For simplicity, we re-use the same pg for various lane sizes.
// For D lanes: 1, 1, 0
// For S lanes: 1, 1, 1, 0, 0
// For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
Initialise(&masm, p0.VnB(), pg_in);
PRegisterM pg = p0.Merging();
// These are merging operations, so we have to initialise the result register.
// We use a mixture of constructive and destructive operations.
InsrHelper(&masm, z31.VnD(), in);
// Make a copy so we can check that constructive operations preserve zn.
__ Mov(z30, z31);
// For constructive operations, use a different initial result value.
__ Index(z29.VnB(), 0, -1);
__ Mov(z0, z29);
__ Uxtb(z0.VnH(), pg, z31.VnH());
__ Mov(z1, z31);
__ Uxtb(z1.VnS(), pg, z1.VnS()); // destructive
__ Mov(z2, z29);
__ Uxtb(z2.VnD(), pg, z31.VnD());
__ Mov(z3, z31);
__ Uxth(z3.VnS(), pg, z3.VnS()); // destructive
__ Mov(z4, z29);
__ Uxth(z4.VnD(), pg, z31.VnD());
__ Mov(z5, z31);
__ Uxtw(z5.VnD(), pg, z5.VnD()); // destructive
END();
if (CAN_RUN()) {
RUN();
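// Check that constructive operations preserve their inputs.
ASSERT_EQUAL_SVE(z30, z31);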
// clang-format off
// Uxtb (H)
uint64_t expected_z0[] =
// pg: 0 0 0 1 0 1 1 1 0 0 1 0
{0xe9eaebecedee00f8, 0xf1f200f000c3000f, 0xf9fafbfc00bcff00};
ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
// Uxtb (S) destructive
uint64_t expected_z1[] =
// pg: 0 1 1 1 0 0
{0x01f203f4000000f8, 0x000000f00000000f, 0x123456789abcdef0};
ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
// Uxtb (D)
uint64_t expected_z2[] =
// pg: 1 1 0
{0x00000000000000f8, 0x000000000000000f, 0xf9fafbfcfdfeff00};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
// Uxth (S) destructive
uint64_t expected_z3[] =
// pg: 0 1 1 1 0 0
{0x01f203f4000007f8, 0x0000f8f00000870f, 0x123456789abcdef0};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
// Uxth (D)
uint64_t expected_z4[] =
// pg: 1 1 0
{0x00000000000007f8, 0x000000000000870f, 0xf9fafbfcfdfeff00};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
// Uxtw (D) destructive
uint64_t expected_z5[] =
// pg: 1 1 0
{0x0000000005f607f8, 0x00000000e1c3870f, 0x123456789abcdef0};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
// clang-format on
}
}
TEST_SVE(sve_abs_neg) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t in[] = {0x01f203f405f607f8, 0xfefcf8f0e1c3870f, 0x123456789abcdef0};
// For simplicity, we re-use the same pg for various lane sizes.
// For D lanes: 1, 1, 0
// For S lanes: 1, 1, 1, 0, 0
// For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
Initialise(&masm, p0.VnB(), pg_in);
PRegisterM pg = p0.Merging();
// These are merging operations, so we have to initialise the result register.
// We use a mixture of constructive and destructive operations.
InsrHelper(&masm, z31.VnD(), in);
// Make a copy so we can check that constructive operations preserve zn.
__ Mov(z30, z31);
// For constructive operations, use a different initial result value.
__ Index(z29.VnB(), 0, -1);
__ Mov(z0, z31);
__ Abs(z0.VnD(), pg, z0.VnD()); // destructive
__ Mov(z1, z29);
__ Abs(z1.VnB(), pg, z31.VnB());
__ Mov(z2, z31);
__ Neg(z2.VnH(), pg, z2.VnH()); // destructive
__ Mov(z3, z29);
__ Neg(z3.VnS(), pg, z31.VnS());
// The unpredicated form of `Neg` is implemented using `subr`.
__ Mov(z4, z31);
__ Neg(z4.VnB(), z4.VnB()); // destructive
__ Mov(z5, z29);
__ Neg(z5.VnD(), z31.VnD());
END();
if (CAN_RUN()) {
RUN();
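// Check that constructive operations preserve their inputs.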
ASSERT_EQUAL_SVE(z30, z31);
// clang-format off
// Abs (D) destructive
uint64_t expected_z0[] =
// pg: 1 1 0
{0x01f203f405f607f8, 0x0103070f1e3c78f1, 0x123456789abcdef0};
ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
// Abs (B)
uint64_t expected_z1[] =
// pg: 0 0 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0
{0xe9eaebec05ee0708, 0x02f2f310f53d790f, 0xf9fa56fc66442200};
ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
// Neg (H) destructive
uint64_t expected_z2[] =
// pg: 0 0 0 1 0 1 1 1 0 0 1 0
{0x01f203f405f6f808, 0xfefc07101e3d78f1, 0x123456786544def0};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
// Neg (S)
uint64_t expected_z3[] =
// pg: 0 1 1 1 0 0
{0xe9eaebecfa09f808, 0x010307101e3c78f1, 0xf9fafbfcfdfeff00};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
// Neg (B) destructive, unpredicated
uint64_t expected_z4[] =
{0xff0efd0cfb0af908, 0x020408101f3d79f1, 0xeeccaa8866442210};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
// Neg (D) unpredicated
uint64_t expected_z5[] =
{0xfe0dfc0bfa09f808, 0x0103070f1e3c78f1, 0xedcba98765432110};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
// clang-format on
}
}
TEST_SVE(sve_cpy) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
START();
// For simplicity, we re-use the same pg for various lane sizes.
// For D lanes: 0, 1, 1
// For S lanes: 0, 1, 1, 0, 1
// For H lanes: 1, 0, 0, 1, 0, 1, 1, 0, 0, 1
int pg_in[] = {1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1};
PRegisterM pg = p7.Merging();
Initialise(&masm, pg.VnB(), pg_in);
// These are merging operations, so we have to initialise the result registers
// for each operation.
for (unsigned i = 0; i < kNumberOfZRegisters; i++) {
__ Index(ZRegister(i, kBRegSize), 0, -1);
}
// Recognisable values to copy.
__ Mov(x0, 0xdeadbeefdeadbe42);
__ Mov(x1, 0xdeadbeefdead8421);
__ Mov(x2, 0xdeadbeef80042001);
__ Mov(x3, 0x8000000420000001);
// Use NEON moves, to avoid testing SVE `cpy` against itself.
__ Dup(v28.V2D(), x0);
__ Dup(v29.V2D(), x1);
__ Dup(v30.V2D(), x2);
__ Dup(v31.V2D(), x3);
// Register forms (CPY_z_p_r)
__ Cpy(z0.VnB(), pg, w0);
__ Cpy(z1.VnH(), pg, x1); // X registers are accepted for small lanes.
__ Cpy(z2.VnS(), pg, w2);
__ Cpy(z3.VnD(), pg, x3);
// VRegister forms (CPY_z_p_v)
__ Cpy(z4.VnB(), pg, b28);
__ Cpy(z5.VnH(), pg, h29);
__ Cpy(z6.VnS(), pg, s30);
__ Cpy(z7.VnD(), pg, d31);
// Check that we can copy the stack pointer.
__ Mov(x10, sp);
__ Mov(sp, 0xabcabcabcabcabca); // Set sp to a known value.
__ Cpy(z16.VnB(), pg, sp);
__ Cpy(z17.VnH(), pg, wsp);
__ Cpy(z18.VnS(), pg, wsp);
__ Cpy(z19.VnD(), pg, sp);
__ Mov(sp, x10); // Restore sp.
END();
if (CAN_RUN()) {
RUN();
// clang-format off
uint64_t expected_b[] =
// pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1
{0xe9eaebec424242f0, 0x42f2f34242f64242, 0xf942fbfcfdfeff42};
ASSERT_EQUAL_SVE(expected_b, z0.VnD());
ASSERT_EQUAL_SVE(expected_b, z4.VnD());
uint64_t expected_h[] =
// pg: 0 0 1 0 0 1 0 1 1 0 0 1
{0xe9eaebec8421eff0, 0xf1f28421f5f68421, 0x8421fbfcfdfe8421};
ASSERT_EQUAL_SVE(expected_h, z1.VnD());
ASSERT_EQUAL_SVE(expected_h, z5.VnD());
uint64_t expected_s[] =
// pg: 0 0 1 1 0 1
{0xe9eaebecedeeeff0, 0x8004200180042001, 0xf9fafbfc80042001};
ASSERT_EQUAL_SVE(expected_s, z2.VnD());
ASSERT_EQUAL_SVE(expected_s, z6.VnD());
uint64_t expected_d[] =
// pg: 0 1 1
{0xe9eaebecedeeeff0, 0x8000000420000001, 0x8000000420000001};
ASSERT_EQUAL_SVE(expected_d, z3.VnD());
ASSERT_EQUAL_SVE(expected_d, z7.VnD());
uint64_t expected_b_sp[] =
// pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1
{0xe9eaebeccacacaf0, 0xcaf2f3cacaf6caca, 0xf9cafbfcfdfeffca};
ASSERT_EQUAL_SVE(expected_b_sp, z16.VnD());
uint64_t expected_h_sp[] =
// pg: 0 0 1 0 0 1 0 1 1 0 0 1
{0xe9eaebecabcaeff0, 0xf1f2abcaf5f6abca, 0xabcafbfcfdfeabca};
ASSERT_EQUAL_SVE(expected_h_sp, z17.VnD());
uint64_t expected_s_sp[] =
// pg: 0 0 1 1 0 1
{0xe9eaebecedeeeff0, 0xcabcabcacabcabca, 0xf9fafbfccabcabca};
ASSERT_EQUAL_SVE(expected_s_sp, z18.VnD());
uint64_t expected_d_sp[] =
// pg: 0 1 1
{0xe9eaebecedeeeff0, 0xabcabcabcabcabca, 0xabcabcabcabcabca};
ASSERT_EQUAL_SVE(expected_d_sp, z19.VnD());
// clang-format on
}
}
TEST_SVE(sve_cpy_imm) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// For simplicity, we re-use the same pg for various lane sizes.
// For D lanes: 0, 1, 1
// For S lanes: 0, 1, 1, 0, 1
// For H lanes: 1, 0, 0, 1, 0, 1, 1, 0, 0, 1
int pg_in[] = {1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1};
PRegister pg = p7;
Initialise(&masm, pg.VnB(), pg_in);
// These are (mostly) merging operations, so we have to initialise the result
// registers for each operation.
for (unsigned i = 0; i < kNumberOfZRegisters; i++) {
__ Index(ZRegister(i, kBRegSize), 0, -1);
}
// Encodable integer forms (CPY_z_p_i)
__ Cpy(z0.VnB(), pg.Merging(), 0);
__ Cpy(z1.VnB(), pg.Zeroing(), 42);
__ Cpy(z2.VnB(), pg.Merging(), -42);
__ Cpy(z3.VnB(), pg.Zeroing(), 0xff);
__ Cpy(z4.VnH(), pg.Merging(), 127);
__ Cpy(z5.VnS(), pg.Zeroing(), -128);
__ Cpy(z6.VnD(), pg.Merging(), -1);
// Forms encodable using fcpy.
__ Cpy(z7.VnH(), pg.Merging(), Float16ToRawbits(Float16(-31.0)));
__ Cpy(z8.VnS(), pg.Zeroing(), FloatToRawbits(2.0f));
__ Cpy(z9.VnD(), pg.Merging(), DoubleToRawbits(-4.0));
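// (The raw encodings are 0xcfc0 for -31.0 as FP16, 0x40000000 for 2.0f
// and 0xc010000000000000 for -4.0, as the expected values below show.)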
// Other forms use a scratch register.
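// (`cpy` encodes an 8-bit signed immediate, optionally shifted left by
// eight: 0xff fits a B lane as -1, but as the H-lane value 255 it is
// unencodable, and 0x0123456789abcdef needs a full 64-bit move.)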
__ Cpy(z10.VnH(), pg.Merging(), 0xff);
__ Cpy(z11.VnD(), pg.Zeroing(), 0x0123456789abcdef);
END();
if (CAN_RUN()) {
RUN();
// clang-format off
uint64_t expected_z0[] =
// pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1
{0xe9eaebec000000f0, 0x00f2f30000f60000, 0xf900fbfcfdfeff00};
ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
uint64_t expected_z1[] =
// pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1
{0x000000002a2a2a00, 0x2a00002a2a002a2a, 0x002a00000000002a};
ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
uint64_t expected_z2[] =
// pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1
{0xe9eaebecd6d6d6f0, 0xd6f2f3d6d6f6d6d6, 0xf9d6fbfcfdfeffd6};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
uint64_t expected_z3[] =
// pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1
{0x00000000ffffff00, 0xff0000ffff00ffff, 0x00ff0000000000ff};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
uint64_t expected_z4[] =
// pg: 0 0 1 0 0 1 0 1 1 0 0 1
{0xe9eaebec007feff0, 0xf1f2007ff5f6007f, 0x007ffbfcfdfe007f};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
uint64_t expected_z5[] =
// pg: 0 0 1 1 0 1
{0x0000000000000000, 0xffffff80ffffff80, 0x00000000ffffff80};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
uint64_t expected_z6[] =
// pg: 0 1 1
{0xe9eaebecedeeeff0, 0xffffffffffffffff, 0xffffffffffffffff};
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
uint64_t expected_z7[] =
// pg: 0 0 1 0 0 1 0 1 1 0 0 1
{0xe9eaebeccfc0eff0, 0xf1f2cfc0f5f6cfc0, 0xcfc0fbfcfdfecfc0};
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
uint64_t expected_z8[] =
// pg: 0 0 1 1 0 1
{0x0000000000000000, 0x4000000040000000, 0x0000000040000000};
ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
uint64_t expected_z9[] =
// pg: 0 1 1
{0xe9eaebecedeeeff0, 0xc010000000000000, 0xc010000000000000};
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
uint64_t expected_z10[] =
// pg: 0 0 1 0 0 1 0 1 1 0 0 1
{0xe9eaebec00ffeff0, 0xf1f200fff5f600ff, 0x00fffbfcfdfe00ff};
ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
uint64_t expected_z11[] =
// pg: 0 1 1
{0x0000000000000000, 0x0123456789abcdef, 0x0123456789abcdef};
ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
// clang-format on
}
}
TEST_SVE(sve_fcpy_imm) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// For simplicity, we re-use the same pg for various lane sizes.
// For D lanes: 0, 1, 1
// For S lanes: 0, 1, 1, 0, 1
// For H lanes: 1, 0, 0, 1, 0, 1, 1, 0, 0, 1
int pg_in[] = {1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1};
PRegister pg = p7;
Initialise(&masm, pg.VnB(), pg_in);
// These are (mostly) merging operations, so we have to initialise the result
// registers for each operation.
for (unsigned i = 0; i < kNumberOfZRegisters; i++) {
__ Index(ZRegister(i, kBRegSize), 0, -1);
}
// Encodable floating-point forms (FCPY_z_p_i)
__ Fcpy(z1.VnH(), pg.Merging(), Float16(1.0));
__ Fcpy(z2.VnH(), pg.Merging(), -2.0f);
__ Fcpy(z3.VnH(), pg.Merging(), 3.0);
__ Fcpy(z4.VnS(), pg.Merging(), Float16(-4.0));
__ Fcpy(z5.VnS(), pg.Merging(), 5.0f);
__ Fcpy(z6.VnS(), pg.Merging(), 6.0);
__ Fcpy(z7.VnD(), pg.Merging(), Float16(7.0));
__ Fcpy(z8.VnD(), pg.Merging(), 8.0f);
__ Fmov(z9.VnD(), pg.Merging(), -9.0);
// Unencodable immediates.
__ Fcpy(z10.VnS(), pg.Merging(), 0.0);
__ Fcpy(z11.VnH(), pg.Merging(), Float16(42.0));
__ Fcpy(z12.VnD(), pg.Merging(), RawbitsToDouble(0x7ff0000012340000)); // NaN
__ Fcpy(z13.VnH(), pg.Merging(), kFP64NegativeInfinity);
// Fmov alias.
__ Fmov(z14.VnS(), pg.Merging(), 0.0);
__ Fmov(z15.VnH(), pg.Merging(), Float16(42.0));
__ Fmov(z16.VnD(), pg.Merging(), RawbitsToDouble(0x7ff0000012340000)); // NaN
__ Fmov(z17.VnH(), pg.Merging(), kFP64NegativeInfinity);
END();
if (CAN_RUN()) {
RUN();
// clang-format off
// 1.0 as FP16: 0x3c00
uint64_t expected_z1[] =
// pg: 0 0 1 0 0 1 0 1 1 0 0 1
{0xe9eaebec3c00eff0, 0xf1f23c00f5f63c00, 0x3c00fbfcfdfe3c00};
ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
// -2.0 as FP16: 0xc000
uint64_t expected_z2[] =
// pg: 0 0 1 0 0 1 0 1 1 0 0 1
{0xe9eaebecc000eff0, 0xf1f2c000f5f6c000, 0xc000fbfcfdfec000};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
// 3.0 as FP16: 0x4200
uint64_t expected_z3[] =
// pg: 0 0 1 0 0 1 0 1 1 0 0 1
{0xe9eaebec4200eff0, 0xf1f24200f5f64200, 0x4200fbfcfdfe4200};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
// -4.0 as FP32: 0xc0800000
uint64_t expected_z4[] =
// pg: 0 0 1 1 0 1
{0xe9eaebecedeeeff0, 0xc0800000c0800000, 0xf9fafbfcc0800000};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
// 5.0 as FP32: 0x40a00000
uint64_t expected_z5[] =
// pg: 0 0 1 1 0 1
{0xe9eaebecedeeeff0, 0x40a0000040a00000, 0xf9fafbfc40a00000};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
// 6.0 as FP32: 0x40c00000
uint64_t expected_z6[] =
// pg: 0 0 1 1 0 1
{0xe9eaebecedeeeff0, 0x40c0000040c00000, 0xf9fafbfc40c00000};
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
// 7.0 as FP64: 0x401c000000000000
uint64_t expected_z7[] =
// pg: 0 1 1
{0xe9eaebecedeeeff0, 0x401c000000000000, 0x401c000000000000};
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
// 8.0 as FP64: 0x4020000000000000
uint64_t expected_z8[] =
// pg: 0 1 1
{0xe9eaebecedeeeff0, 0x4020000000000000, 0x4020000000000000};
ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
// -9.0 as FP64: 0xc022000000000000
uint64_t expected_z9[] =
// pg: 0 1 1
{0xe9eaebecedeeeff0, 0xc022000000000000, 0xc022000000000000};
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
// 0.0 as FP32: 0x00000000
uint64_t expected_z10[] =
// pg: 0 0 1 1 0 1
{0xe9eaebecedeeeff0, 0x0000000000000000, 0xf9fafbfc00000000};
ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
// 42.0 as FP16: 0x5140
uint64_t expected_z11[] =
// pg: 0 0 1 0 0 1 0 1 1 0 0 1
{0xe9eaebec5140eff0, 0xf1f25140f5f65140, 0x5140fbfcfdfe5140};
ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
// Signalling NaN (with payload): 0x7ff0000012340000
uint64_t expected_z12[] =
// pg: 0 1 1
{0xe9eaebecedeeeff0, 0x7ff0000012340000, 0x7ff0000012340000};
ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
// -infinity as FP16: 0xfc00
uint64_t expected_z13[] =
// pg: 0 0 1 0 0 1 0 1 1 0 0 1
{0xe9eaebecfc00eff0, 0xf1f2fc00f5f6fc00, 0xfc00fbfcfdfefc00};
ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
ASSERT_EQUAL_SVE(z10.VnD(), z14.VnD());
ASSERT_EQUAL_SVE(z11.VnD(), z15.VnD());
ASSERT_EQUAL_SVE(z12.VnD(), z16.VnD());
ASSERT_EQUAL_SVE(z13.VnD(), z17.VnD());
// clang-format on
}
}
TEST_SVE(sve_permute_vector_unpredicated_table_lookup) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t table_inputs[] = {0xffeeddccbbaa9988, 0x7766554433221100};
int index_b[] = {255, 255, 11, 10, 15, 14, 13, 12, 1, 0, 4, 3, 7, 6, 5, 4};
int index_h[] = {5, 6, 7, 8, 2, 3, 6, 4};
int index_s[] = {1, 3, 2, 31, -1};
int index_d[] = {31, 1};
// Initialise the register with a value that doesn't exist in the table.
__ Dup(z9.VnB(), 0x1f);
InsrHelper(&masm, z9.VnD(), table_inputs);
ZRegister ind_b = z0.WithLaneSize(kBRegSize);
ZRegister ind_h = z1.WithLaneSize(kHRegSize);
ZRegister ind_s = z2.WithLaneSize(kSRegSize);
ZRegister ind_d = z3.WithLaneSize(kDRegSize);
InsrHelper(&masm, ind_b, index_b);
InsrHelper(&masm, ind_h, index_h);
InsrHelper(&masm, ind_s, index_s);
InsrHelper(&masm, ind_d, index_d);
__ Tbl(z26.VnB(), z9.VnB(), ind_b);
__ Tbl(z27.VnH(), z9.VnH(), ind_h);
__ Tbl(z28.VnS(), z9.VnS(), ind_s);
__ Tbl(z29.VnD(), z9.VnD(), ind_d);
END();
if (CAN_RUN()) {
RUN();
// clang-format off
unsigned z26_expected[] = {0x1f, 0x1f, 0xbb, 0xaa, 0xff, 0xee, 0xdd, 0xcc,
0x11, 0x00, 0x44, 0x33, 0x77, 0x66, 0x55, 0x44};
unsigned z27_expected[] = {0xbbaa, 0xddcc, 0xffee, 0x1f1f,
0x5544, 0x7766, 0xddcc, 0x9988};
unsigned z28_expected[] =
{0x77665544, 0xffeeddcc, 0xbbaa9988, 0x1f1f1f1f, 0x1f1f1f1f};
uint64_t z29_expected[] = {0x1f1f1f1f1f1f1f1f, 0xffeeddccbbaa9988};
// clang-format on
unsigned vl = config->sve_vl_in_bits();
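// `tbl` selects zero for any out-of-range index, so each expected value
// only applies when its index refers to a lane within the current vector.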
for (size_t i = 0; i < ArrayLength(index_b); i++) {
int lane = static_cast<int>(ArrayLength(index_b) - i - 1);
if (!core.HasSVELane(z26.VnB(), lane)) break;
uint64_t expected = (vl > (index_b[i] * kBRegSize)) ? z26_expected[i] : 0;
ASSERT_EQUAL_SVE_LANE(expected, z26.VnB(), lane);
}
for (size_t i = 0; i < ArrayLength(index_h); i++) {
int lane = static_cast<int>(ArrayLength(index_h) - i - 1);
if (!core.HasSVELane(z27.VnH(), lane)) break;
uint64_t expected = (vl > (index_h[i] * kHRegSize)) ? z27_expected[i] : 0;
ASSERT_EQUAL_SVE_LANE(expected, z27.VnH(), lane);
}
for (size_t i = 0; i < ArrayLength(index_s); i++) {
int lane = static_cast<int>(ArrayLength(index_s) - i - 1);
if (!core.HasSVELane(z28.VnS(), lane)) break;
uint64_t expected = (vl > (index_s[i] * kSRegSize)) ? z28_expected[i] : 0;
ASSERT_EQUAL_SVE_LANE(expected, z28.VnS(), lane);
}
for (size_t i = 0; i < ArrayLength(index_d); i++) {
int lane = static_cast<int>(ArrayLength(index_d) - i - 1);
if (!core.HasSVELane(z29.VnD(), lane)) break;
uint64_t expected = (vl > (index_d[i] * kDRegSize)) ? z29_expected[i] : 0;
ASSERT_EQUAL_SVE_LANE(expected, z29.VnD(), lane);
}
}
}
TEST_SVE(ldr_str_z_bi) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int vl = config->sve_vl_in_bytes();
// The immediate can address [-256, 255] times the VL, so allocate enough
// space to exceed that in both directions.
int data_size = vl * 1024;
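// (vl * 1024 leaves 512 VLs on each side of the midpoint, which also
// covers the unencodable +/-314 VL accesses below.)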
uint8_t* data = new uint8_t[data_size];
memset(data, 0, data_size);
// Set the base half-way through the buffer so we can use negative indices.
__ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
__ Index(z1.VnB(), 1, 3);
__ Index(z2.VnB(), 2, 5);
__ Index(z3.VnB(), 3, 7);
__ Index(z4.VnB(), 4, 11);
__ Index(z5.VnB(), 5, 13);
__ Index(z6.VnB(), 6, 2);
__ Index(z7.VnB(), 7, 3);
__ Index(z8.VnB(), 8, 5);
__ Index(z9.VnB(), 9, 7);
// Encodable cases.
__ Str(z1, SVEMemOperand(x0));
__ Str(z2, SVEMemOperand(x0, 2, SVE_MUL_VL));
__ Str(z3, SVEMemOperand(x0, -3, SVE_MUL_VL));
__ Str(z4, SVEMemOperand(x0, 255, SVE_MUL_VL));
__ Str(z5, SVEMemOperand(x0, -256, SVE_MUL_VL));
// Cases that fall back on `CalculateSVEAddress`.
__ Str(z6, SVEMemOperand(x0, 6 * vl));
__ Str(z7, SVEMemOperand(x0, -7 * vl));
__ Str(z8, SVEMemOperand(x0, 314, SVE_MUL_VL));
__ Str(z9, SVEMemOperand(x0, -314, SVE_MUL_VL));
// Corresponding loads.
__ Ldr(z11, SVEMemOperand(x0, xzr)); // Test xzr operand.
__ Ldr(z12, SVEMemOperand(x0, 2, SVE_MUL_VL));
__ Ldr(z13, SVEMemOperand(x0, -3, SVE_MUL_VL));
__ Ldr(z14, SVEMemOperand(x0, 255, SVE_MUL_VL));
__ Ldr(z15, SVEMemOperand(x0, -256, SVE_MUL_VL));
__ Ldr(z16, SVEMemOperand(x0, 6 * vl));
__ Ldr(z17, SVEMemOperand(x0, -7 * vl));
__ Ldr(z18, SVEMemOperand(x0, 314, SVE_MUL_VL));
__ Ldr(z19, SVEMemOperand(x0, -314, SVE_MUL_VL));
END();
if (CAN_RUN()) {
RUN();
uint8_t* expected = new uint8_t[data_size];
memset(expected, 0, data_size);
uint8_t* middle = &expected[data_size / 2];
for (int i = 0; i < vl; i++) {
middle[i] = (1 + (3 * i)) & 0xff; // z1
middle[(2 * vl) + i] = (2 + (5 * i)) & 0xff; // z2
middle[(-3 * vl) + i] = (3 + (7 * i)) & 0xff; // z3
middle[(255 * vl) + i] = (4 + (11 * i)) & 0xff; // z4
middle[(-256 * vl) + i] = (5 + (13 * i)) & 0xff; // z5
middle[(6 * vl) + i] = (6 + (2 * i)) & 0xff; // z6
middle[(-7 * vl) + i] = (7 + (3 * i)) & 0xff; // z7
middle[(314 * vl) + i] = (8 + (5 * i)) & 0xff; // z8
middle[(-314 * vl) + i] = (9 + (7 * i)) & 0xff; // z9
}
ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
ASSERT_EQUAL_SVE(z1, z11);
ASSERT_EQUAL_SVE(z2, z12);
ASSERT_EQUAL_SVE(z3, z13);
ASSERT_EQUAL_SVE(z4, z14);
ASSERT_EQUAL_SVE(z5, z15);
ASSERT_EQUAL_SVE(z6, z16);
ASSERT_EQUAL_SVE(z7, z17);
ASSERT_EQUAL_SVE(z8, z18);
ASSERT_EQUAL_SVE(z9, z19);
delete[] expected;
}
delete[] data;
}
TEST_SVE(ldr_str_p_bi) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int vl = config->sve_vl_in_bytes();
VIXL_ASSERT((vl % kZRegBitsPerPRegBit) == 0);
int pl = vl / kZRegBitsPerPRegBit;
// The immediate can address [-256, 255] times the PL, so allocate enough
// space to exceed that in both directions.
int data_size = pl * 1024;
uint8_t* data = new uint8_t[data_size];
memset(data, 0, data_size);
// Set the base half-way through the buffer so we can use negative indices.
__ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
uint64_t pattern[4] = {0x1010101011101111,
0x0010111011000101,
0x1001101110010110,
0x1010110101100011};
for (int i = 8; i <= 15; i++) {
// Initialise p8-p15 with a conveniently-recognisable, non-zero pattern.
Initialise(&masm,
PRegister(i),
pattern[3] * i,
pattern[2] * i,
pattern[1] * i,
pattern[0] * i);
}
// Encodable cases.
__ Str(p8, SVEMemOperand(x0));
__ Str(p9, SVEMemOperand(x0, 2, SVE_MUL_VL));
__ Str(p10, SVEMemOperand(x0, -3, SVE_MUL_VL));
__ Str(p11, SVEMemOperand(x0, 255, SVE_MUL_VL));
// Cases that fall back on `CalculateSVEAddress`.
__ Str(p12, SVEMemOperand(x0, 6 * pl));
__ Str(p13, SVEMemOperand(x0, -7 * pl));
__ Str(p14, SVEMemOperand(x0, 314, SVE_MUL_VL));
__ Str(p15, SVEMemOperand(x0, -314, SVE_MUL_VL));
// Corresponding loads.
__ Ldr(p0, SVEMemOperand(x0));
__ Ldr(p1, SVEMemOperand(x0, 2, SVE_MUL_VL));
__ Ldr(p2, SVEMemOperand(x0, -3, SVE_MUL_VL));
__ Ldr(p3, SVEMemOperand(x0, 255, SVE_MUL_VL));
__ Ldr(p4, SVEMemOperand(x0, 6 * pl));
__ Ldr(p5, SVEMemOperand(x0, -7 * pl));
__ Ldr(p6, SVEMemOperand(x0, 314, SVE_MUL_VL));
__ Ldr(p7, SVEMemOperand(x0, -314, SVE_MUL_VL));
END();
if (CAN_RUN()) {
RUN();
uint8_t* expected = new uint8_t[data_size];
memset(expected, 0, data_size);
uint8_t* middle = &expected[data_size / 2];
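// Reconstruct the stored P registers byte by byte: each 64-bit element of
// `pattern` supplies eight bytes, and scaling the whole pattern by n
// scales each byte by n without carries (as asserted below).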
for (int i = 0; i < pl; i++) {
int bit_index = (i % sizeof(pattern[0])) * kBitsPerByte;
size_t index = i / sizeof(pattern[0]);
VIXL_ASSERT(index < ArrayLength(pattern));
uint64_t byte = (pattern[index] >> bit_index) & 0xff;
// Each byte of `pattern` can be multiplied by 15 without carry.
VIXL_ASSERT((byte * 15) <= 0xff);
middle[i] = byte * 8; // p8
middle[(2 * pl) + i] = byte * 9; // p9
middle[(-3 * pl) + i] = byte * 10; // p10
middle[(255 * pl) + i] = byte * 11; // p11
middle[(6 * pl) + i] = byte * 12; // p12
middle[(-7 * pl) + i] = byte * 13; // p13
middle[(314 * pl) + i] = byte * 14; // p14
middle[(-314 * pl) + i] = byte * 15; // p15
}
ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
ASSERT_EQUAL_SVE(p0, p8);
ASSERT_EQUAL_SVE(p1, p9);
ASSERT_EQUAL_SVE(p2, p10);
ASSERT_EQUAL_SVE(p3, p11);
ASSERT_EQUAL_SVE(p4, p12);
ASSERT_EQUAL_SVE(p5, p13);
ASSERT_EQUAL_SVE(p6, p14);
ASSERT_EQUAL_SVE(p7, p15);
delete[] expected;
}
delete[] data;
}
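// Write a single element of type T to `base + offset + (index *
// sizeof(T))`, as used to build the expected memory images for the
// contiguous store tests below.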
template <typename T>
static void MemoryWrite(uint8_t* base, int64_t offset, int64_t index, T data) {
memcpy(base + offset + (index * sizeof(data)), &data, sizeof(data));
}
TEST_SVE(sve_ld1_st1_contiguous) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int vl = config->sve_vl_in_bytes();
// The immediate can address [-8, 7] times the VL, so allocate enough space to
// exceed that in both directions.
int data_size = vl * 128;
uint8_t* data = new uint8_t[data_size];
memset(data, 0, data_size);
// Set the base half-way through the buffer so we can use negative indices.
__ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
// Encodable scalar-plus-immediate cases.
__ Index(z1.VnB(), 1, -3);
__ Ptrue(p1.VnB());
__ St1b(z1.VnB(), p1, SVEMemOperand(x0));
__ Index(z2.VnH(), -2, 5);
__ Ptrue(p2.VnH(), SVE_MUL3);
__ St1b(z2.VnH(), p2, SVEMemOperand(x0, 7, SVE_MUL_VL));
__ Index(z3.VnS(), 3, -7);
__ Ptrue(p3.VnS(), SVE_POW2);
__ St1h(z3.VnS(), p3, SVEMemOperand(x0, -8, SVE_MUL_VL));
// Encodable scalar-plus-scalar cases.
__ Index(z4.VnD(), -4, 11);
__ Ptrue(p4.VnD(), SVE_VL3);
__ Addvl(x1, x0, 8); // Try not to overlap with VL-dependent cases.
__ Mov(x2, 17);
__ St1b(z4.VnD(), p4, SVEMemOperand(x1, x2));
__ Index(z5.VnD(), 6, -2);
__ Ptrue(p5.VnD(), SVE_VL16);
__ Addvl(x3, x0, 10); // Try not to overlap with VL-dependent cases.
__ Mov(x4, 6);
__ St1d(z5.VnD(), p5, SVEMemOperand(x3, x4, LSL, 3));
// Unencodable cases fall back on `CalculateSVEAddress`.
__ Index(z6.VnS(), -7, 3);
// Setting SVE_ALL on B lanes checks that the Simulator ignores irrelevant
// predicate bits when handling larger lanes.
__ Ptrue(p6.VnB(), SVE_ALL);
__ St1w(z6.VnS(), p6, SVEMemOperand(x0, 42, SVE_MUL_VL));
__ Index(z7.VnD(), 32, -11);
__ Ptrue(p7.VnD(), SVE_MUL4);
__ St1w(z7.VnD(), p7, SVEMemOperand(x0, 22, SVE_MUL_VL));
// Corresponding loads.
__ Ld1b(z8.VnB(), p1.Zeroing(), SVEMemOperand(x0));
__ Ld1b(z9.VnH(), p2.Zeroing(), SVEMemOperand(x0, 7, SVE_MUL_VL));
__ Ld1h(z10.VnS(), p3.Zeroing(), SVEMemOperand(x0, -8, SVE_MUL_VL));
__ Ld1b(z11.VnD(), p4.Zeroing(), SVEMemOperand(x1, x2));
__ Ld1d(z12.VnD(), p5.Zeroing(), SVEMemOperand(x3, x4, LSL, 3));
__ Ld1w(z13.VnS(), p6.Zeroing(), SVEMemOperand(x0, 42, SVE_MUL_VL));
__ Ld1sb(z14.VnH(), p2.Zeroing(), SVEMemOperand(x0, 7, SVE_MUL_VL));
__ Ld1sh(z15.VnS(), p3.Zeroing(), SVEMemOperand(x0, -8, SVE_MUL_VL));
__ Ld1sb(z16.VnD(), p4.Zeroing(), SVEMemOperand(x1, x2));
__ Ld1sw(z17.VnD(), p7.Zeroing(), SVEMemOperand(x0, 22, SVE_MUL_VL));
// We can test ld1 by comparing the value loaded with the value stored. In
// most cases, there are two complications:
// - Loads have zeroing predication, so we have to clear the inactive
// elements on our reference.
// - We have to replicate any sign- or zero-extension.
// Ld1b(z8.VnB(), ...)
__ Dup(z18.VnB(), 0);
__ Mov(z18.VnB(), p1.Merging(), z1.VnB());
// Ld1b(z9.VnH(), ...)
__ Dup(z19.VnH(), 0);
__ Uxtb(z19.VnH(), p2.Merging(), z2.VnH());
// Ld1h(z10.VnS(), ...)
__ Dup(z20.VnS(), 0);
__ Uxth(z20.VnS(), p3.Merging(), z3.VnS());
// Ld1b(z11.VnD(), ...)
__ Dup(z21.VnD(), 0);
__ Uxtb(z21.VnD(), p4.Merging(), z4.VnD());
// Ld1d(z12.VnD(), ...)
__ Dup(z22.VnD(), 0);
__ Mov(z22.VnD(), p5.Merging(), z5.VnD());
// Ld1w(z13.VnS(), ...)
__ Dup(z23.VnS(), 0);
__ Mov(z23.VnS(), p6.Merging(), z6.VnS());
// Ld1sb(z14.VnH(), ...)
__ Dup(z24.VnH(), 0);
__ Sxtb(z24.VnH(), p2.Merging(), z2.VnH());
// Ld1sh(z15.VnS(), ...)
__ Dup(z25.VnS(), 0);
__ Sxth(z25.VnS(), p3.Merging(), z3.VnS());
// Ld1sb(z16.VnD(), ...)
__ Dup(z26.VnD(), 0);
__ Sxtb(z26.VnD(), p4.Merging(), z4.VnD());
// Ld1sw(z17.VnD(), ...)
__ Dup(z27.VnD(), 0);
__ Sxtw(z27.VnD(), p7.Merging(), z7.VnD());
END();
if (CAN_RUN()) {
RUN();
uint8_t* expected = new uint8_t[data_size];
memset(expected, 0, data_size);
uint8_t* middle = &expected[data_size / 2];
int vl_b = vl / kBRegSizeInBytes;
int vl_h = vl / kHRegSizeInBytes;
int vl_s = vl / kSRegSizeInBytes;
int vl_d = vl / kDRegSizeInBytes;
// Encodable cases.
// st1b { z1.b }, SVE_ALL
for (int i = 0; i < vl_b; i++) {
MemoryWrite(middle, 0, i, static_cast<uint8_t>(1 - (3 * i)));
}
// st1b { z2.h }, SVE_MUL3
int vl_h_mul3 = vl_h - (vl_h % 3);
for (int i = 0; i < vl_h_mul3; i++) {
int64_t offset = 7 * static_cast<int>(vl / (kHRegSize / kBRegSize));
MemoryWrite(middle, offset, i, static_cast<uint8_t>(-2 + (5 * i)));
}
// st1h { z3.s }, SVE_POW2
int vl_s_pow2 = 1 << HighestSetBitPosition(vl_s);
for (int i = 0; i < vl_s_pow2; i++) {
int64_t offset = -8 * static_cast<int>(vl / (kSRegSize / kHRegSize));
MemoryWrite(middle, offset, i, static_cast<uint16_t>(3 - (7 * i)));
}
// st1b { z4.d }, SVE_VL3
if (vl_d >= 3) {
for (int i = 0; i < 3; i++) {
MemoryWrite(middle,
(8 * vl) + 17,
i,
static_cast<uint8_t>(-4 + (11 * i)));
}
}
// st1d { z5.d }, SVE_VL16
if (vl_d >= 16) {
for (int i = 0; i < 16; i++) {
MemoryWrite(middle,
(10 * vl) + (6 * kDRegSizeInBytes),
i,
static_cast<uint64_t>(6 - (2 * i)));
}
}
// Unencodable cases.
// st1w { z6.s }, SVE_ALL
for (int i = 0; i < vl_s; i++) {
MemoryWrite(middle, 42 * vl, i, static_cast<uint32_t>(-7 + (3 * i)));
}
// st1w { z7.d }, SVE_MUL4
int vl_d_mul4 = vl_d - (vl_d % 4);
for (int i = 0; i < vl_d_mul4; i++) {
int64_t offset = 22 * static_cast<int>(vl / (kDRegSize / kWRegSize));
MemoryWrite(middle, offset, i, static_cast<uint32_t>(32 + (-11 * i)));
}
ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
// Check that we loaded back the expected values.
ASSERT_EQUAL_SVE(z18, z8);
ASSERT_EQUAL_SVE(z19, z9);
ASSERT_EQUAL_SVE(z20, z10);
ASSERT_EQUAL_SVE(z21, z11);
ASSERT_EQUAL_SVE(z22, z12);
ASSERT_EQUAL_SVE(z23, z13);
ASSERT_EQUAL_SVE(z24, z14);
ASSERT_EQUAL_SVE(z25, z15);
ASSERT_EQUAL_SVE(z26, z16);
ASSERT_EQUAL_SVE(z27, z17);
delete[] expected;
}
delete[] data;
}
TEST_SVE(sve_ld2_st2_scalar_plus_imm) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int vl = config->sve_vl_in_bytes();
// The immediate can address [-16, 14] times the VL, so allocate enough space
// to exceed that in both directions.
int data_size = vl * 128;
uint8_t* data = new uint8_t[data_size];
memset(data, 0, data_size);
// Set the base half-way through the buffer so we can use negative indices.
__ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
__ Index(z14.VnB(), 1, -3);
__ Index(z15.VnB(), 2, -3);
__ Ptrue(p0.VnB());
__ St2b(z14.VnB(), z15.VnB(), p0, SVEMemOperand(x0));
__ Index(z16.VnH(), -2, 5);
__ Index(z17.VnH(), -3, 5);
__ Ptrue(p1.VnH(), SVE_MUL3);
__ St2h(z16.VnH(), z17.VnH(), p1, SVEMemOperand(x0, 8, SVE_MUL_VL));
// Wrap around from z31 to z0.
__ Index(z31.VnS(), 3, -7);
__ Index(z0.VnS(), 4, -7);
__ Ptrue(p2.VnS(), SVE_POW2);
__ St2w(z31.VnS(), z0.VnS(), p2, SVEMemOperand(x0, -12, SVE_MUL_VL));
__ Index(z18.VnD(), -7, 3);
__ Index(z19.VnD(), -8, 3);
// Sparse predication, including some irrelevant bits (0xe). To make the
// results easy to check, activate each lane <n> where n is a multiple of 5.
Initialise(&masm,
p3,
0xeee10000000001ee,
0xeeeeeee100000000,
0x01eeeeeeeee10000,
0x000001eeeeeeeee1);
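// Each D-sized lane consumes eight predicate bits, of which only the lowest
// is significant; the 0xe nibbles exercise the bits that should be ignored.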
__ St2d(z18.VnD(), z19.VnD(), p3, SVEMemOperand(x0, 14, SVE_MUL_VL));
// We can test ld2 by comparing the values loaded with the values stored.
// There are two complications:
// - Loads have zeroing predication, so we have to clear the inactive
// elements on our reference.
// - We want to test both loads and stores that span { z31, z0 }, so we have
// to move some values around.
//
// Registers z4-z11 will hold as-stored values (with inactive elements
// cleared). Registers z20-z27 will hold the values that were loaded.
// Ld2b(z14.VnB(), z15.VnB(), ...)
__ Dup(z4.VnB(), 0);
__ Dup(z5.VnB(), 0);
__ Mov(z4.VnB(), p0.Merging(), z14.VnB());
__ Mov(z5.VnB(), p0.Merging(), z15.VnB());
// Ld2h(z16.VnH(), z17.VnH(), ...)
__ Dup(z6.VnH(), 0);
__ Dup(z7.VnH(), 0);
__ Mov(z6.VnH(), p1.Merging(), z16.VnH());
__ Mov(z7.VnH(), p1.Merging(), z17.VnH());
// Ld2w(z31.VnS(), z0.VnS(), ...)
__ Dup(z8.VnS(), 0);
__ Dup(z9.VnS(), 0);
__ Mov(z8.VnS(), p2.Merging(), z31.VnS());
__ Mov(z9.VnS(), p2.Merging(), z0.VnS());
// Ld2d(z18.VnD(), z19.VnD(), ...)
__ Dup(z10.VnD(), 0);
__ Dup(z11.VnD(), 0);
__ Mov(z10.VnD(), p3.Merging(), z18.VnD());
__ Mov(z11.VnD(), p3.Merging(), z19.VnD());
// Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
__ Ld2b(z31.VnB(), z0.VnB(), p0.Zeroing(), SVEMemOperand(x0));
__ Mov(z20, z31);
__ Mov(z21, z0);
__ Ld2h(z22.VnH(), z23.VnH(), p1.Zeroing(), SVEMemOperand(x0, 8, SVE_MUL_VL));
__ Ld2w(z24.VnS(),
z25.VnS(),
p2.Zeroing(),
SVEMemOperand(x0, -12, SVE_MUL_VL));
__ Ld2d(z26.VnD(),
z27.VnD(),
p3.Zeroing(),
SVEMemOperand(x0, 14, SVE_MUL_VL));
END();
if (CAN_RUN()) {
RUN();
uint8_t* expected = new uint8_t[data_size];
memset(expected, 0, data_size);
uint8_t* middle = &expected[data_size / 2];
int vl_b = vl / kBRegSizeInBytes;
int vl_h = vl / kHRegSizeInBytes;
int vl_s = vl / kSRegSizeInBytes;
int vl_d = vl / kDRegSizeInBytes;
int reg_count = 2;
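// stN interleaves its registers in memory: lane i of the first register
// lands at element (i * reg_count), lane i of the second at
// (i * reg_count) + 1, and so on.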
// st2b { z14.b, z15.b }, SVE_ALL
for (int i = 0; i < vl_b; i++) {
uint8_t lane0 = 1 - (3 * i);
uint8_t lane1 = 2 - (3 * i);
MemoryWrite(middle, 0, (i * reg_count) + 0, lane0);
MemoryWrite(middle, 0, (i * reg_count) + 1, lane1);
}
// st2h { z16.h, z17.h }, SVE_MUL3
int vl_h_mul3 = vl_h - (vl_h % 3);
for (int i = 0; i < vl_h_mul3; i++) {
int64_t offset = 8 * vl;
uint16_t lane0 = -2 + (5 * i);
uint16_t lane1 = -3 + (5 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
}
// st2w { z31.s, z0.s }, SVE_POW2
int vl_s_pow2 = 1 << HighestSetBitPosition(vl_s);
for (int i = 0; i < vl_s_pow2; i++) {
int64_t offset = -12 * vl;
uint32_t lane0 = 3 - (7 * i);
uint32_t lane1 = 4 - (7 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
}
// st2d { z18.d, z19.d }, ((i % 5) == 0)
for (int i = 0; i < vl_d; i++) {
if ((i % 5) == 0) {
int64_t offset = 14 * vl;
uint64_t lane0 = -7 + (3 * i);
uint64_t lane1 = -8 + (3 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
}
}
ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
// Check that we loaded back the expected values.
// st2b/ld2b
ASSERT_EQUAL_SVE(z4, z20);
ASSERT_EQUAL_SVE(z5, z21);
// st2h/ld2h
ASSERT_EQUAL_SVE(z6, z22);
ASSERT_EQUAL_SVE(z7, z23);
// st2w/ld2w
ASSERT_EQUAL_SVE(z8, z24);
ASSERT_EQUAL_SVE(z9, z25);
// st2d/ld2d
ASSERT_EQUAL_SVE(z10, z26);
ASSERT_EQUAL_SVE(z11, z27);
delete[] expected;
}
delete[] data;
}
TEST_SVE(sve_ld2_st2_scalar_plus_scalar) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int vl = config->sve_vl_in_bytes();
// Allocate plenty of space to enable indexing in both directions.
int data_size = vl * 128;
uint8_t* data = new uint8_t[data_size];
memset(data, 0, data_size);
// Set the base half-way through the buffer so we can use negative indices.
__ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
__ Index(z10.VnB(), -4, 11);
__ Index(z11.VnB(), -5, 11);
__ Ptrue(p7.VnB(), SVE_MUL4);
__ Mov(x1, 0);
__ St2b(z10.VnB(), z11.VnB(), p7, SVEMemOperand(x0, x1));
__ Index(z12.VnH(), 6, -2);
__ Index(z13.VnH(), 7, -2);
__ Ptrue(p6.VnH(), SVE_VL16);
__ Rdvl(x2, 3); // Make offsets VL-dependent so we can avoid overlap.
__ St2h(z12.VnH(), z13.VnH(), p6, SVEMemOperand(x0, x2, LSL, 1));
__ Index(z14.VnS(), -7, 3);
__ Index(z15.VnS(), -8, 3);
// Sparse predication, including some irrelevant bits (0xe). To make the
// results easy to check, activate each lane <n> where n is a multiple of 5.
Initialise(&masm,
p5,
0xeee1000010000100,
0x001eeee100001000,
0x0100001eeee10000,
0x10000100001eeee1);
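// Each S-sized lane consumes four predicate bits, of which only the lowest
// is significant, so this activates lanes 0, 5, 10 and so on.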
__ Rdvl(x3, -3);
__ St2w(z14.VnS(), z15.VnS(), p5, SVEMemOperand(x0, x3, LSL, 2));
// Wrap around from z31 to z0.
__ Index(z31.VnD(), 32, -11);
__ Index(z0.VnD(), 33, -11);
__ Ptrue(p4.VnD(), SVE_MUL3);
__ Rdvl(x4, 1);
__ St2d(z31.VnD(), z0.VnD(), p4, SVEMemOperand(x0, x4, LSL, 3));
// We can test ld2 by comparing the values loaded with the values stored.
// There are two complications:
// - Loads have zeroing predication, so we have to clear the inactive
// elements on our reference.
// - We want to test both loads and stores that span { z31, z0 }, so we have
// to move some values around.
//
// Registers z4-z11 will hold as-stored values (with inactive elements
// cleared). Registers z20-z27 will hold the values that were loaded.
// Ld2b(z20.VnB(), z21.VnB(), ...)
__ Dup(z4.VnB(), 0);
__ Dup(z5.VnB(), 0);
__ Mov(z4.VnB(), p7.Merging(), z10.VnB());
__ Mov(z5.VnB(), p7.Merging(), z11.VnB());
// Ld2h(z22.VnH(), z23.VnH(), ...)
__ Dup(z6.VnH(), 0);
__ Dup(z7.VnH(), 0);
__ Mov(z6.VnH(), p6.Merging(), z12.VnH());
__ Mov(z7.VnH(), p6.Merging(), z13.VnH());
// Ld2w(z24.VnS(), z25.VnS(), ...)
__ Dup(z8.VnS(), 0);
__ Dup(z9.VnS(), 0);
__ Mov(z8.VnS(), p5.Merging(), z14.VnS());
__ Mov(z9.VnS(), p5.Merging(), z15.VnS());
// Ld2d(z31.VnD(), z0.VnD(), ...)
__ Dup(z10.VnD(), 0);
__ Dup(z11.VnD(), 0);
__ Mov(z10.VnD(), p4.Merging(), z31.VnD());
__ Mov(z11.VnD(), p4.Merging(), z0.VnD());
// Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
__ Ld2b(z31.VnB(), z0.VnB(), p7.Zeroing(), SVEMemOperand(x0, x1));
__ Mov(z20, z31);
__ Mov(z21, z0);
__ Ld2h(z22.VnH(), z23.VnH(), p6.Zeroing(), SVEMemOperand(x0, x2, LSL, 1));
__ Ld2w(z24.VnS(), z25.VnS(), p5.Zeroing(), SVEMemOperand(x0, x3, LSL, 2));
__ Ld2d(z26.VnD(), z27.VnD(), p4.Zeroing(), SVEMemOperand(x0, x4, LSL, 3));
END();
if (CAN_RUN()) {
RUN();
uint8_t* expected = new uint8_t[data_size];
memset(expected, 0, data_size);
uint8_t* middle = &expected[data_size / 2];
int vl_b = vl / kBRegSizeInBytes;
int vl_h = vl / kHRegSizeInBytes;
int vl_s = vl / kSRegSizeInBytes;
int vl_d = vl / kDRegSizeInBytes;
int reg_count = 2;
// st2b { z10.b, z11.b }, SVE_MUL4
int vl_b_mul4 = vl_b - (vl_b % 4);
for (int i = 0; i < vl_b_mul4; i++) {
uint8_t lane0 = -4 + (11 * i);
uint8_t lane1 = -5 + (11 * i);
MemoryWrite(middle, 0, (i * reg_count) + 0, lane0);
MemoryWrite(middle, 0, (i * reg_count) + 1, lane1);
}
// st2h { z12.h, z13.h }, SVE_VL16
if (vl_h >= 16) {
for (int i = 0; i < 16; i++) {
int64_t offset = (3 << kHRegSizeInBytesLog2) * vl;
uint16_t lane0 = 6 - (2 * i);
uint16_t lane1 = 7 - (2 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
}
}
// st2w { z14.s, z15.s }, ((i % 5) == 0)
for (int i = 0; i < vl_s; i++) {
if ((i % 5) == 0) {
int64_t offset = -(3 << kSRegSizeInBytesLog2) * vl;
uint32_t lane0 = -7 + (3 * i);
uint32_t lane1 = -8 + (3 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
}
}
// st2d { z31.d, z0.d }, SVE_MUL3
int vl_d_mul3 = vl_d - (vl_d % 3);
for (int i = 0; i < vl_d_mul3; i++) {
int64_t offset = (1 << kDRegSizeInBytesLog2) * vl;
uint64_t lane0 = 32 - (11 * i);
uint64_t lane1 = 33 - (11 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
}
ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
// Check that we loaded back the expected values.
// st2b/ld2b
ASSERT_EQUAL_SVE(z4, z20);
ASSERT_EQUAL_SVE(z5, z21);
// st2h/ld2h
ASSERT_EQUAL_SVE(z6, z22);
ASSERT_EQUAL_SVE(z7, z23);
// st2w/ld2w
ASSERT_EQUAL_SVE(z8, z24);
ASSERT_EQUAL_SVE(z9, z25);
// st2d/ld2d
ASSERT_EQUAL_SVE(z10, z26);
ASSERT_EQUAL_SVE(z11, z27);
delete[] expected;
}
delete[] data;
}
TEST_SVE(sve_ld3_st3_scalar_plus_imm) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int vl = config->sve_vl_in_bytes();
// The immediate can address [-24, 21] times the VL, so allocate enough space
// to exceed that in both directions.
int data_size = vl * 128;
uint8_t* data = new uint8_t[data_size];
memset(data, 0, data_size);
// Set the base half-way through the buffer so we can use negative indices.
__ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
// We can test ld3 by comparing the values loaded with the values stored.
// There are two complications:
// - Loads have zeroing predication, so we have to clear the inactive
// elements on our reference.
// - We want to test both loads and stores that span { z31, z0 }, so we have
// to move some values around.
//
// Registers z4-z15 will hold as-stored values (with inactive elements
// cleared). Registers z16-z27 will hold the values that were loaded.
__ Index(z10.VnB(), 1, -3);
__ Index(z11.VnB(), 2, -3);
__ Index(z12.VnB(), 3, -3);
__ Ptrue(p0.VnB());
__ St3b(z10.VnB(), z11.VnB(), z12.VnB(), p0, SVEMemOperand(x0));
// Save the stored values for ld3 tests.
__ Dup(z4.VnB(), 0);
__ Dup(z5.VnB(), 0);
__ Dup(z6.VnB(), 0);
__ Mov(z4.VnB(), p0.Merging(), z10.VnB());
__ Mov(z5.VnB(), p0.Merging(), z11.VnB());
__ Mov(z6.VnB(), p0.Merging(), z12.VnB());
// Wrap around from z31 to z0.
__ Index(z31.VnH(), -2, 5);
__ Index(z0.VnH(), -3, 5);
__ Index(z1.VnH(), -4, 5);
__ Ptrue(p1.VnH(), SVE_MUL3);
__ St3h(z31.VnH(), z0.VnH(), z1.VnH(), p1, SVEMemOperand(x0, 9, SVE_MUL_VL));
// Save the stored values for ld3 tests.
__ Dup(z7.VnH(), 0);
__ Dup(z8.VnH(), 0);
__ Dup(z9.VnH(), 0);
__ Mov(z7.VnH(), p1.Merging(), z31.VnH());
__ Mov(z8.VnH(), p1.Merging(), z0.VnH());
__ Mov(z9.VnH(), p1.Merging(), z1.VnH());
__ Index(z30.VnS(), 3, -7);
__ Index(z31.VnS(), 4, -7);
__ Index(z0.VnS(), 5, -7);
__ Ptrue(p2.VnS(), SVE_POW2);
__ St3w(z30.VnS(),
z31.VnS(),
z0.VnS(),
p2,
SVEMemOperand(x0, -12, SVE_MUL_VL));
// Save the stored values for ld3 tests.
__ Dup(z10.VnS(), 0);
__ Dup(z11.VnS(), 0);
__ Dup(z12.VnS(), 0);
__ Mov(z10.VnS(), p2.Merging(), z30.VnS());
__ Mov(z11.VnS(), p2.Merging(), z31.VnS());
__ Mov(z12.VnS(), p2.Merging(), z0.VnS());
__ Index(z0.VnD(), -7, 3);
__ Index(z1.VnD(), -8, 3);
__ Index(z2.VnD(), -9, 3);
// Sparse predication, including some irrelevant bits (0xee). To make the
// results easy to check, activate each lane <n> where n is a multiple of 5.
Initialise(&masm,
p3,
0xeee10000000001ee,
0xeeeeeee100000000,
0x01eeeeeeeee10000,
0x000001eeeeeeeee1);
__ St3d(z0.VnD(), z1.VnD(), z2.VnD(), p3, SVEMemOperand(x0, 15, SVE_MUL_VL));
// Save the stored values for ld3 tests.
__ Dup(z13.VnD(), 0);
__ Dup(z14.VnD(), 0);
__ Dup(z15.VnD(), 0);
__ Mov(z13.VnD(), p3.Merging(), z0.VnD());
__ Mov(z14.VnD(), p3.Merging(), z1.VnD());
__ Mov(z15.VnD(), p3.Merging(), z2.VnD());
// Corresponding loads.
// Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
__ Ld3b(z31.VnB(), z0.VnB(), z1.VnB(), p0.Zeroing(), SVEMemOperand(x0));
__ Mov(z16, z31);
__ Mov(z17, z0);
__ Mov(z18, z1);
__ Ld3h(z30.VnH(),
z31.VnH(),
z0.VnH(),
p1.Zeroing(),
SVEMemOperand(x0, 9, SVE_MUL_VL));
__ Mov(z19, z30);
__ Mov(z20, z31);
__ Mov(z21, z0);
__ Ld3w(z22.VnS(),
z23.VnS(),
z24.VnS(),
p2.Zeroing(),
SVEMemOperand(x0, -12, SVE_MUL_VL));
__ Ld3d(z25.VnD(),
z26.VnD(),
z27.VnD(),
p3.Zeroing(),
SVEMemOperand(x0, 15, SVE_MUL_VL));
END();
if (CAN_RUN()) {
RUN();
uint8_t* expected = new uint8_t[data_size];
memset(expected, 0, data_size);
uint8_t* middle = &expected[data_size / 2];
int vl_b = vl / kBRegSizeInBytes;
int vl_h = vl / kHRegSizeInBytes;
int vl_s = vl / kSRegSizeInBytes;
int vl_d = vl / kDRegSizeInBytes;
int reg_count = 3;
// st3b { z10.b, z11.b, z12.b }, SVE_ALL
for (int i = 0; i < vl_b; i++) {
uint8_t lane0 = 1 - (3 * i);
uint8_t lane1 = 2 - (3 * i);
uint8_t lane2 = 3 - (3 * i);
MemoryWrite(middle, 0, (i * reg_count) + 0, lane0);
MemoryWrite(middle, 0, (i * reg_count) + 1, lane1);
MemoryWrite(middle, 0, (i * reg_count) + 2, lane2);
}
// st3h { z31.h, z0.h, z1.h }, SVE_MUL3
int vl_h_mul3 = vl_h - (vl_h % 3);
for (int i = 0; i < vl_h_mul3; i++) {
int64_t offset = 9 * vl;
uint16_t lane0 = -2 + (5 * i);
uint16_t lane1 = -3 + (5 * i);
uint16_t lane2 = -4 + (5 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
}
// st3w { z30.s, z31.s, z0.s }, SVE_POW2
int vl_s_pow2 = 1 << HighestSetBitPosition(vl_s);
for (int i = 0; i < vl_s_pow2; i++) {
int64_t offset = -12 * vl;
uint32_t lane0 = 3 - (7 * i);
uint32_t lane1 = 4 - (7 * i);
uint32_t lane2 = 5 - (7 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
}
// st3d { z0.d, z1.d, z2.d }, ((i % 5) == 0)
for (int i = 0; i < vl_d; i++) {
if ((i % 5) == 0) {
int64_t offset = 15 * vl;
uint64_t lane0 = -7 + (3 * i);
uint64_t lane1 = -8 + (3 * i);
uint64_t lane2 = -9 + (3 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
}
}
ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
// Check that we loaded back the expected values.
// st3b/ld3b
ASSERT_EQUAL_SVE(z4, z16);
ASSERT_EQUAL_SVE(z5, z17);
ASSERT_EQUAL_SVE(z6, z18);
// st3h/ld3h
ASSERT_EQUAL_SVE(z7, z19);
ASSERT_EQUAL_SVE(z8, z20);
ASSERT_EQUAL_SVE(z9, z21);
// st3w/ld3w
ASSERT_EQUAL_SVE(z10, z22);
ASSERT_EQUAL_SVE(z11, z23);
ASSERT_EQUAL_SVE(z12, z24);
// st3d/ld3d
ASSERT_EQUAL_SVE(z13, z25);
ASSERT_EQUAL_SVE(z14, z26);
ASSERT_EQUAL_SVE(z15, z27);
delete[] expected;
}
delete[] data;
}
TEST_SVE(sve_ld3_st3_scalar_plus_scalar) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int vl = config->sve_vl_in_bytes();
// Allocate plenty of space to enable indexing in both directions.
int data_size = vl * 128;
uint8_t* data = new uint8_t[data_size];
memset(data, 0, data_size);
// Set the base half-way through the buffer so we can use negative indices.
__ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
// We can test ld3 by comparing the values loaded with the values stored.
// There are two complications:
// - Loads have zeroing predication, so we have to clear the inactive
// elements on our reference.
// - We want to test both loads and stores that span { z31, z0 }, so we have
// to move some values around.
//
// Registers z4-z15 will hold as-stored values (with inactive elements
// cleared). Registers z16-z27 will hold the values that were loaded.
__ Index(z10.VnB(), -4, 11);
__ Index(z11.VnB(), -5, 11);
__ Index(z12.VnB(), -6, 11);
__ Ptrue(p7.VnB(), SVE_MUL4);
__ Rdvl(x1, -1); // Make offsets VL-dependent so we can avoid overlap.
__ St3b(z10.VnB(), z11.VnB(), z12.VnB(), p7, SVEMemOperand(x0, x1, LSL, 0));
// Save the stored values for ld3 tests.
__ Dup(z4.VnB(), 0);
__ Dup(z5.VnB(), 0);
__ Dup(z6.VnB(), 0);
__ Mov(z4.VnB(), p7.Merging(), z10.VnB());
__ Mov(z5.VnB(), p7.Merging(), z11.VnB());
__ Mov(z6.VnB(), p7.Merging(), z12.VnB());
__ Index(z13.VnH(), 6, -2);
__ Index(z14.VnH(), 7, -2);
__ Index(z15.VnH(), 8, -2);
__ Ptrue(p6.VnH(), SVE_VL16);
__ Rdvl(x2, 5); // (5 * vl) << 1 = 10 * vl
__ St3h(z13.VnH(), z14.VnH(), z15.VnH(), p6, SVEMemOperand(x0, x2, LSL, 1));
// Save the stored values for ld3 tests.
__ Dup(z7.VnH(), 0);
__ Dup(z8.VnH(), 0);
__ Dup(z9.VnH(), 0);
__ Mov(z7.VnH(), p6.Merging(), z13.VnH());
__ Mov(z8.VnH(), p6.Merging(), z14.VnH());
__ Mov(z9.VnH(), p6.Merging(), z15.VnH());
// Wrap around from z31 to z0.
__ Index(z30.VnS(), -7, 3);
__ Index(z31.VnS(), -8, 3);
__ Index(z0.VnS(), -9, 3);
// Sparse predication, including some irrelevant bits (0xe). To make the
// results easy to check, activate each lane <n> where n is a multiple of 5.
Initialise(&masm,
p5,
0xeee1000010000100,
0x001eeee100001000,
0x0100001eeee10000,
0x10000100001eeee1);
__ Rdvl(x3, -5); // -(5 * vl) << 2 = -20 * vl
__ St3w(z30.VnS(), z31.VnS(), z0.VnS(), p5, SVEMemOperand(x0, x3, LSL, 2));
// Save the stored values for ld3 tests.
__ Dup(z10.VnS(), 0);
__ Dup(z11.VnS(), 0);
__ Dup(z12.VnS(), 0);
__ Mov(z10.VnS(), p5.Merging(), z30.VnS());
__ Mov(z11.VnS(), p5.Merging(), z31.VnS());
__ Mov(z12.VnS(), p5.Merging(), z0.VnS());
__ Index(z31.VnD(), 32, -11);
__ Index(z0.VnD(), 33, -11);
__ Index(z1.VnD(), 34, -11);
__ Ptrue(p4.VnD(), SVE_MUL3);
__ Rdvl(x4, -1); // -(1 * vl) << 3 = -8 * vl
__ St3d(z31.VnD(), z0.VnD(), z1.VnD(), p4, SVEMemOperand(x0, x4, LSL, 3));
// Save the stored values for ld3 tests.
__ Dup(z13.VnD(), 0);
__ Dup(z14.VnD(), 0);
__ Dup(z15.VnD(), 0);
__ Mov(z13.VnD(), p4.Merging(), z31.VnD());
__ Mov(z14.VnD(), p4.Merging(), z0.VnD());
__ Mov(z15.VnD(), p4.Merging(), z1.VnD());
// Corresponding loads.
// Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
__ Ld3b(z31.VnB(),
z0.VnB(),
z1.VnB(),
p7.Zeroing(),
SVEMemOperand(x0, x1, LSL, 0));
__ Mov(z16, z31);
__ Mov(z17, z0);
__ Mov(z18, z1);
__ Ld3h(z30.VnH(),
z31.VnH(),
z0.VnH(),
p6.Zeroing(),
SVEMemOperand(x0, x2, LSL, 1));
__ Mov(z19, z30);
__ Mov(z20, z31);
__ Mov(z21, z0);
__ Ld3w(z22.VnS(),
z23.VnS(),
z24.VnS(),
p5.Zeroing(),
SVEMemOperand(x0, x3, LSL, 2));
__ Ld3d(z25.VnD(),
z26.VnD(),
z27.VnD(),
p4.Zeroing(),
SVEMemOperand(x0, x4, LSL, 3));
END();
if (CAN_RUN()) {
RUN();
uint8_t* expected = new uint8_t[data_size];
memset(expected, 0, data_size);
uint8_t* middle = &expected[data_size / 2];
int vl_b = vl / kBRegSizeInBytes;
int vl_h = vl / kHRegSizeInBytes;
int vl_s = vl / kSRegSizeInBytes;
int vl_d = vl / kDRegSizeInBytes;
int reg_count = 3;
// st3b { z10.b, z11.b, z12.b }, SVE_MUL4
int vl_b_mul4 = vl_b - (vl_b % 4);
for (int i = 0; i < vl_b_mul4; i++) {
int64_t offset = -(1 << kBRegSizeInBytesLog2) * vl;
uint8_t lane0 = -4 + (11 * i);
uint8_t lane1 = -5 + (11 * i);
uint8_t lane2 = -6 + (11 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
}
// st3h { z13.h, z14.h, z15.h }, SVE_VL16
if (vl_h >= 16) {
for (int i = 0; i < 16; i++) {
int64_t offset = (5 << kHRegSizeInBytesLog2) * vl;
uint16_t lane0 = 6 - (2 * i);
uint16_t lane1 = 7 - (2 * i);
uint16_t lane2 = 8 - (2 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
}
}
// st3w { z30.s, z31.s, z0.s }, ((i % 5) == 0)
for (int i = 0; i < vl_s; i++) {
if ((i % 5) == 0) {
int64_t offset = -(5 << kSRegSizeInBytesLog2) * vl;
uint32_t lane0 = -7 + (3 * i);
uint32_t lane1 = -8 + (3 * i);
uint32_t lane2 = -9 + (3 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
}
}
// st3d { z31.d, z0.d, z1.d }, SVE_MUL3
int vl_d_mul3 = vl_d - (vl_d % 3);
for (int i = 0; i < vl_d_mul3; i++) {
int64_t offset = -(1 << kDRegSizeInBytesLog2) * vl;
uint64_t lane0 = 32 - (11 * i);
uint64_t lane1 = 33 - (11 * i);
uint64_t lane2 = 34 - (11 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
}
ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
// Check that we loaded back the expected values.
// st3b/ld3b
ASSERT_EQUAL_SVE(z4, z16);
ASSERT_EQUAL_SVE(z5, z17);
ASSERT_EQUAL_SVE(z6, z18);
// st3h/ld3h
ASSERT_EQUAL_SVE(z7, z19);
ASSERT_EQUAL_SVE(z8, z20);
ASSERT_EQUAL_SVE(z9, z21);
// st3w/ld3w
ASSERT_EQUAL_SVE(z10, z22);
ASSERT_EQUAL_SVE(z11, z23);
ASSERT_EQUAL_SVE(z12, z24);
// st3d/ld3d
ASSERT_EQUAL_SVE(z13, z25);
ASSERT_EQUAL_SVE(z14, z26);
ASSERT_EQUAL_SVE(z15, z27);
delete[] expected;
}
delete[] data;
}
TEST_SVE(sve_ld4_st4_scalar_plus_imm) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int vl = config->sve_vl_in_bytes();
// The immediate can address [-32, 28] times the VL, so allocate enough space
// to exceed that in both directions.
int data_size = vl * 128;
uint8_t* data = new uint8_t[data_size];
memset(data, 0, data_size);
// Set the base half-way through the buffer so we can use negative indices.
__ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
// We can test ld4 by comparing the values loaded with the values stored.
// There are two complications:
// - Loads have zeroing predication, so we have to clear the inactive
// elements on our reference.
// - We want to test both loads and stores that span { z31, z0 }, so we have
// to move some values around.
//
// Registers z3-z18 will hold as-stored values (with inactive elements
// cleared). Registers z19-z31 and z0-z2 will hold the values that were
// loaded.
__ Index(z10.VnB(), 1, -7);
__ Index(z11.VnB(), 2, -7);
__ Index(z12.VnB(), 3, -7);
__ Index(z13.VnB(), 4, -7);
__ Ptrue(p0.VnB());
__ St4b(z10.VnB(), z11.VnB(), z12.VnB(), z13.VnB(), p0, SVEMemOperand(x0));
// Save the stored values for ld4 tests.
__ Dup(z3.VnB(), 0);
__ Dup(z4.VnB(), 0);
__ Dup(z5.VnB(), 0);
__ Dup(z6.VnB(), 0);
__ Mov(z3.VnB(), p0.Merging(), z10.VnB());
__ Mov(z4.VnB(), p0.Merging(), z11.VnB());
__ Mov(z5.VnB(), p0.Merging(), z12.VnB());
__ Mov(z6.VnB(), p0.Merging(), z13.VnB());
// Wrap around from z31 to z0.
__ Index(z31.VnH(), -2, 5);
__ Index(z0.VnH(), -3, 5);
__ Index(z1.VnH(), -4, 5);
__ Index(z2.VnH(), -5, 5);
__ Ptrue(p1.VnH(), SVE_MUL3);
__ St4h(z31.VnH(),
z0.VnH(),
z1.VnH(),
z2.VnH(),
p1,
SVEMemOperand(x0, 4, SVE_MUL_VL));
// Save the stored values for ld4 tests.
__ Dup(z7.VnH(), 0);
__ Dup(z8.VnH(), 0);
__ Dup(z9.VnH(), 0);
__ Dup(z10.VnH(), 0);
__ Mov(z7.VnH(), p1.Merging(), z31.VnH());
__ Mov(z8.VnH(), p1.Merging(), z0.VnH());
__ Mov(z9.VnH(), p1.Merging(), z1.VnH());
__ Mov(z10.VnH(), p1.Merging(), z2.VnH());
// Wrap around from z31 to z0.
__ Index(z29.VnS(), 2, -7);
__ Index(z30.VnS(), 3, -7);
__ Index(z31.VnS(), 4, -7);
__ Index(z0.VnS(), 5, -7);
__ Ptrue(p2.VnS(), SVE_POW2);
__ St4w(z29.VnS(),
z30.VnS(),
z31.VnS(),
z0.VnS(),
p2,
SVEMemOperand(x0, -12, SVE_MUL_VL));
// Save the stored values for ld4 tests.
__ Dup(z11.VnS(), 0);
__ Dup(z12.VnS(), 0);
__ Dup(z13.VnS(), 0);
__ Dup(z14.VnS(), 0);
__ Mov(z11.VnS(), p2.Merging(), z29.VnS());
__ Mov(z12.VnS(), p2.Merging(), z30.VnS());
__ Mov(z13.VnS(), p2.Merging(), z31.VnS());
__ Mov(z14.VnS(), p2.Merging(), z0.VnS());
__ Index(z20.VnD(), -7, 8);
__ Index(z21.VnD(), -8, 8);
__ Index(z22.VnD(), -9, 8);
__ Index(z23.VnD(), -10, 8);
// Sparse predication, including some irrelevant bits (0xee). To make the
// results easy to check, activate each lane <n> where n is a multiple of 5.
Initialise(&masm,
p3,
0xeee10000000001ee,
0xeeeeeee100000000,
0x01eeeeeeeee10000,
0x000001eeeeeeeee1);
__ St4d(z20.VnD(),
z21.VnD(),
z22.VnD(),
z23.VnD(),
p3,
SVEMemOperand(x0, 16, SVE_MUL_VL));
// Save the stored values for ld4 tests.
__ Dup(z15.VnD(), 0);
__ Dup(z16.VnD(), 0);
__ Dup(z17.VnD(), 0);
__ Dup(z18.VnD(), 0);
__ Mov(z15.VnD(), p3.Merging(), z20.VnD());
__ Mov(z16.VnD(), p3.Merging(), z21.VnD());
__ Mov(z17.VnD(), p3.Merging(), z22.VnD());
__ Mov(z18.VnD(), p3.Merging(), z23.VnD());
// Corresponding loads.
// Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
__ Ld4b(z31.VnB(),
z0.VnB(),
z1.VnB(),
z2.VnB(),
p0.Zeroing(),
SVEMemOperand(x0));
__ Mov(z19, z31);
__ Mov(z20, z0);
__ Mov(z21, z1);
__ Mov(z22, z2);
__ Ld4h(z23.VnH(),
z24.VnH(),
z25.VnH(),
z26.VnH(),
p1.Zeroing(),
SVEMemOperand(x0, 4, SVE_MUL_VL));
__ Ld4w(z27.VnS(),
z28.VnS(),
z29.VnS(),
z30.VnS(),
p2.Zeroing(),
SVEMemOperand(x0, -12, SVE_MUL_VL));
// Wrap around from z31 to z0.
__ Ld4d(z31.VnD(),
z0.VnD(),
z1.VnD(),
z2.VnD(),
p3.Zeroing(),
SVEMemOperand(x0, 16, SVE_MUL_VL));
END();
if (CAN_RUN()) {
RUN();
uint8_t* expected = new uint8_t[data_size];
memset(expected, 0, data_size);
uint8_t* middle = &expected[data_size / 2];
int vl_b = vl / kBRegSizeInBytes;
int vl_h = vl / kHRegSizeInBytes;
int vl_s = vl / kSRegSizeInBytes;
int vl_d = vl / kDRegSizeInBytes;
int reg_count = 4;
// st4b { z10.b, z11.b, z12.b, z13.b }, SVE_ALL
for (int i = 0; i < vl_b; i++) {
uint8_t lane0 = 1 - (7 * i);
uint8_t lane1 = 2 - (7 * i);
uint8_t lane2 = 3 - (7 * i);
uint8_t lane3 = 4 - (7 * i);
MemoryWrite(middle, 0, (i * reg_count) + 0, lane0);
MemoryWrite(middle, 0, (i * reg_count) + 1, lane1);
MemoryWrite(middle, 0, (i * reg_count) + 2, lane2);
MemoryWrite(middle, 0, (i * reg_count) + 3, lane3);
}
// st4h { z31.h, z0.h, z1.h, z2.h }, SVE_MUL3
int vl_h_mul3 = vl_h - (vl_h % 3);
for (int i = 0; i < vl_h_mul3; i++) {
int64_t offset = 4 * vl;
uint16_t lane0 = -2 + (5 * i);
uint16_t lane1 = -3 + (5 * i);
uint16_t lane2 = -4 + (5 * i);
uint16_t lane3 = -5 + (5 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
}
// st4w { z29.s, z30.s, z31.s, z0.s }, SVE_POW2
int vl_s_pow2 = 1 << HighestSetBitPosition(vl_s);
for (int i = 0; i < vl_s_pow2; i++) {
int64_t offset = -12 * vl;
uint32_t lane0 = 2 - (7 * i);
uint32_t lane1 = 3 - (7 * i);
uint32_t lane2 = 4 - (7 * i);
uint32_t lane3 = 5 - (7 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
}
// st4d { z20.d, z21.d, z22.d, z23.d }, ((i % 5) == 0)
for (int i = 0; i < vl_d; i++) {
if ((i % 5) == 0) {
int64_t offset = 16 * vl;
uint64_t lane0 = -7 + (8 * i);
uint64_t lane1 = -8 + (8 * i);
uint64_t lane2 = -9 + (8 * i);
uint64_t lane3 = -10 + (8 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
}
}
ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
// Check that we loaded back the expected values.
// st4b/ld4b
ASSERT_EQUAL_SVE(z3, z19);
ASSERT_EQUAL_SVE(z4, z20);
ASSERT_EQUAL_SVE(z5, z21);
ASSERT_EQUAL_SVE(z6, z22);
// st4h/ld4h
ASSERT_EQUAL_SVE(z7, z23);
ASSERT_EQUAL_SVE(z8, z24);
ASSERT_EQUAL_SVE(z9, z25);
ASSERT_EQUAL_SVE(z10, z26);
// st4w/ld4w
ASSERT_EQUAL_SVE(z11, z27);
ASSERT_EQUAL_SVE(z12, z28);
ASSERT_EQUAL_SVE(z13, z29);
ASSERT_EQUAL_SVE(z14, z30);
// st4d/ld4d
ASSERT_EQUAL_SVE(z15, z31);
ASSERT_EQUAL_SVE(z16, z0);
ASSERT_EQUAL_SVE(z17, z1);
ASSERT_EQUAL_SVE(z18, z2);
delete[] expected;
}
delete[] data;
}
TEST_SVE(sve_ld4_st4_scalar_plus_scalar) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int vl = config->sve_vl_in_bytes();
// Allocate plenty of space to enable indexing in both directions.
int data_size = vl * 128;
uint8_t* data = new uint8_t[data_size];
memset(data, 0, data_size);
// Set the base half-way through the buffer so we can use negative indices.
__ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
// We can test ld4 by comparing the values loaded with the values stored.
// There are two complications:
// - Loads have zeroing predication, so we have to clear the inactive
// elements on our reference.
// - We want to test both loads and stores that span { z31, z0 }, so we have
// to move some values around.
//
// Registers z3-z18 will hold as-stored values (with inactive elements
// cleared). Registers z19-z31 and z0-z2 will hold the values that were
// loaded.
__ Index(z19.VnB(), -4, 11);
__ Index(z20.VnB(), -5, 11);
__ Index(z21.VnB(), -6, 11);
__ Index(z22.VnB(), -7, 11);
__ Ptrue(p7.VnB(), SVE_MUL4);
__ Rdvl(x1, -1); // Make offsets VL-dependent so we can avoid overlap.
__ St4b(z19.VnB(),
z20.VnB(),
z21.VnB(),
z22.VnB(),
p7,
SVEMemOperand(x0, x1, LSL, 0));
// Save the stored values for ld4 tests.
__ Dup(z3.VnB(), 0);
__ Dup(z4.VnB(), 0);
__ Dup(z5.VnB(), 0);
__ Dup(z6.VnB(), 0);
__ Mov(z3.VnB(), p7.Merging(), z19.VnB());
__ Mov(z4.VnB(), p7.Merging(), z20.VnB());
__ Mov(z5.VnB(), p7.Merging(), z21.VnB());
__ Mov(z6.VnB(), p7.Merging(), z22.VnB());
__ Index(z23.VnH(), 6, -2);
__ Index(z24.VnH(), 7, -2);
__ Index(z25.VnH(), 8, -2);
__ Index(z26.VnH(), 9, -2);
__ Ptrue(p6.VnH(), SVE_VL16);
__ Rdvl(x2, 7); // (7 * vl) << 1 = 14 * vl
__ St4h(z23.VnH(),
z24.VnH(),
z25.VnH(),
z26.VnH(),
p6,
SVEMemOperand(x0, x2, LSL, 1));
// Save the stored values for ld4 tests.
__ Dup(z7.VnH(), 0);
__ Dup(z8.VnH(), 0);
__ Dup(z9.VnH(), 0);
__ Dup(z10.VnH(), 0);
__ Mov(z7.VnH(), p6.Merging(), z23.VnH());
__ Mov(z8.VnH(), p6.Merging(), z24.VnH());
__ Mov(z9.VnH(), p6.Merging(), z25.VnH());
__ Mov(z10.VnH(), p6.Merging(), z26.VnH());
// Wrap around from z31 to z0.
__ Index(z29.VnS(), -6, 7);
__ Index(z30.VnS(), -7, 7);
__ Index(z31.VnS(), -8, 7);
__ Index(z0.VnS(), -9, 7);
// Sparse predication, including some irrelevant bits (0xe). To make the
// results easy to check, activate each lane <n> where n is a multiple of 5.
Initialise(&masm,
p5,
0xeee1000010000100,
0x001eeee100001000,
0x0100001eeee10000,
0x10000100001eeee1);
__ Rdvl(x3, -5); // -(5 * vl) << 2 = -20 * vl
__ St4w(z29.VnS(),
z30.VnS(),
z31.VnS(),
z0.VnS(),
p5,
SVEMemOperand(x0, x3, LSL, 2));
// Save the stored values for ld4 tests.
__ Dup(z11.VnS(), 0);
__ Dup(z12.VnS(), 0);
__ Dup(z13.VnS(), 0);
__ Dup(z14.VnS(), 0);
__ Mov(z11.VnS(), p5.Merging(), z29.VnS());
__ Mov(z12.VnS(), p5.Merging(), z30.VnS());
__ Mov(z13.VnS(), p5.Merging(), z31.VnS());
__ Mov(z14.VnS(), p5.Merging(), z0.VnS());
__ Index(z31.VnD(), 32, -11);
__ Index(z0.VnD(), 33, -11);
__ Index(z1.VnD(), 34, -11);
__ Index(z2.VnD(), 35, -11);
__ Ptrue(p4.VnD(), SVE_MUL3);
__ Rdvl(x4, -1); // -(1 * vl) << 3 = -8 * vl
__ St4d(z31.VnD(),
z0.VnD(),
z1.VnD(),
z2.VnD(),
p4,
SVEMemOperand(x0, x4, LSL, 3));
// Save the stored values for ld4 tests.
__ Dup(z15.VnD(), 0);
__ Dup(z16.VnD(), 0);
__ Dup(z17.VnD(), 0);
__ Dup(z18.VnD(), 0);
__ Mov(z15.VnD(), p4.Merging(), z31.VnD());
__ Mov(z16.VnD(), p4.Merging(), z0.VnD());
__ Mov(z17.VnD(), p4.Merging(), z1.VnD());
__ Mov(z18.VnD(), p4.Merging(), z2.VnD());
// Corresponding loads.
// Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
__ Ld4b(z31.VnB(),
z0.VnB(),
z1.VnB(),
z2.VnB(),
p7.Zeroing(),
SVEMemOperand(x0, x1, LSL, 0));
__ Mov(z19, z31);
__ Mov(z20, z0);
__ Mov(z21, z1);
__ Mov(z22, z2);
__ Ld4h(z23.VnH(),
z24.VnH(),
z25.VnH(),
z26.VnH(),
p6.Zeroing(),
SVEMemOperand(x0, x2, LSL, 1));
__ Ld4w(z27.VnS(),
z28.VnS(),
z29.VnS(),
z30.VnS(),
p5.Zeroing(),
SVEMemOperand(x0, x3, LSL, 2));
// Wrap around from z31 to z0.
__ Ld4d(z31.VnD(),
z0.VnD(),
z1.VnD(),
z2.VnD(),
p4.Zeroing(),
SVEMemOperand(x0, x4, LSL, 3));
END();
if (CAN_RUN()) {
RUN();
uint8_t* expected = new uint8_t[data_size];
memset(expected, 0, data_size);
uint8_t* middle = &expected[data_size / 2];
int vl_b = vl / kBRegSizeInBytes;
int vl_h = vl / kHRegSizeInBytes;
int vl_s = vl / kSRegSizeInBytes;
int vl_d = vl / kDRegSizeInBytes;
int reg_count = 4;
// st4b { z19.b, z20.b, z21.b, z22.b }, SVE_MUL4
int vl_b_mul4 = vl_b - (vl_b % 4);
for (int i = 0; i < vl_b_mul4; i++) {
int64_t offset = -(1 << kBRegSizeInBytesLog2) * vl;
uint8_t lane0 = -4 + (11 * i);
uint8_t lane1 = -5 + (11 * i);
uint8_t lane2 = -6 + (11 * i);
uint8_t lane3 = -7 + (11 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
}
// st4h { z23.h, z24.h, z25.h, z26.h }, SVE_VL16
if (vl_h >= 16) {
for (int i = 0; i < 16; i++) {
int64_t offset = (7 << kHRegSizeInBytesLog2) * vl;
uint16_t lane0 = 6 - (2 * i);
uint16_t lane1 = 7 - (2 * i);
uint16_t lane2 = 8 - (2 * i);
uint16_t lane3 = 9 - (2 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
}
}
// st4w { z29.s, z30.s, z31.s, z0.s }, ((i % 5) == 0)
for (int i = 0; i < vl_s; i++) {
if ((i % 5) == 0) {
int64_t offset = -(5 << kSRegSizeInBytesLog2) * vl;
uint32_t lane0 = -6 + (7 * i);
uint32_t lane1 = -7 + (7 * i);
uint32_t lane2 = -8 + (7 * i);
uint32_t lane3 = -9 + (7 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
}
}
// st4d { z31.d, z0.d, z1.d, z2.d }, SVE_MUL3
int vl_d_mul3 = vl_d - (vl_d % 3);
for (int i = 0; i < vl_d_mul3; i++) {
int64_t offset = -(1 << kDRegSizeInBytesLog2) * vl;
uint64_t lane0 = 32 - (11 * i);
uint64_t lane1 = 33 - (11 * i);
uint64_t lane2 = 34 - (11 * i);
uint64_t lane3 = 35 - (11 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
}
ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
// Check that we loaded back the expected values.
// st4b/ld4b
ASSERT_EQUAL_SVE(z3, z19);
ASSERT_EQUAL_SVE(z4, z20);
ASSERT_EQUAL_SVE(z5, z21);
ASSERT_EQUAL_SVE(z6, z22);
// st4h/ld4h
ASSERT_EQUAL_SVE(z7, z23);
ASSERT_EQUAL_SVE(z8, z24);
ASSERT_EQUAL_SVE(z9, z25);
ASSERT_EQUAL_SVE(z10, z26);
// st4w/ld4w
ASSERT_EQUAL_SVE(z11, z27);
ASSERT_EQUAL_SVE(z12, z28);
ASSERT_EQUAL_SVE(z13, z29);
ASSERT_EQUAL_SVE(z14, z30);
// st4d/ld4d
ASSERT_EQUAL_SVE(z15, z31);
ASSERT_EQUAL_SVE(z16, z0);
ASSERT_EQUAL_SVE(z17, z1);
ASSERT_EQUAL_SVE(z18, z2);
delete[] expected;
}
delete[] data;
}
TEST_SVE(sve_ld234_st234_scalar_plus_scalar_sp) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// Check that the simulator correctly interprets rn == 31 as sp.
// The indexing logic is the same regardless, so we just check one load and
// store of each type.
// There are no pre- or post-indexing modes, so reserve space first.
__ ClaimVL(2 + 3 + 4);
__ Index(z0.VnB(), 42, 2);
__ Index(z1.VnB(), 43, 2);
__ Ptrue(p0.VnB(), SVE_VL7);
__ Rdvl(x0, 0);
__ St2b(z0.VnB(), z1.VnB(), p0, SVEMemOperand(sp, x0));
__ Index(z4.VnH(), 42, 3);
__ Index(z5.VnH(), 43, 3);
__ Index(z6.VnH(), 44, 3);
__ Ptrue(p1.VnH(), SVE_POW2);
__ Rdvl(x1, 2);
__ Lsr(x1, x1, 1);
__ St3h(z4.VnH(), z5.VnH(), z6.VnH(), p1, SVEMemOperand(sp, x1, LSL, 1));
__ Index(z8.VnS(), 42, 4);
__ Index(z9.VnS(), 43, 4);
__ Index(z10.VnS(), 44, 4);
__ Index(z11.VnS(), 45, 4);
__ Ptrue(p2.VnS());
__ Rdvl(x2, 2 + 3);
__ Lsr(x2, x2, 2);
__ St4w(z8.VnS(),
z9.VnS(),
z10.VnS(),
z11.VnS(),
p2,
SVEMemOperand(sp, x2, LSL, 2));
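// With x0 = 0, x1 = (2 * VL) / 2 and x2 = (5 * VL) / 4, the three stores
// land back-to-back: st2b at sp, st3h at sp + (2 * VL) and st4w at
// sp + (5 * VL), filling the nine VLs claimed above.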
// Corresponding loads.
// We have to explicitly zero inactive lanes in the reference values because
// loads have zeroing predication.
__ Dup(z12.VnB(), 0);
__ Dup(z13.VnB(), 0);
__ Mov(z12.VnB(), p0.Merging(), z0.VnB());
__ Mov(z13.VnB(), p0.Merging(), z1.VnB());
__ Ld2b(z0.VnB(), z1.VnB(), p0.Zeroing(), SVEMemOperand(sp, x0));
__ Dup(z16.VnH(), 0);
__ Dup(z17.VnH(), 0);
__ Dup(z18.VnH(), 0);
__ Mov(z16.VnH(), p1.Merging(), z4.VnH());
__ Mov(z17.VnH(), p1.Merging(), z5.VnH());
__ Mov(z18.VnH(), p1.Merging(), z6.VnH());
__ Ld3h(z4.VnH(),
z5.VnH(),
z6.VnH(),
p1.Zeroing(),
SVEMemOperand(sp, x1, LSL, 1));
__ Dup(z20.VnS(), 0);
__ Dup(z21.VnS(), 0);
__ Dup(z22.VnS(), 0);
__ Dup(z23.VnS(), 0);
__ Mov(z20.VnS(), p2.Merging(), z8.VnS());
__ Mov(z21.VnS(), p2.Merging(), z9.VnS());
__ Mov(z22.VnS(), p2.Merging(), z10.VnS());
__ Mov(z23.VnS(), p2.Merging(), z11.VnS());
__ Ld4w(z8.VnS(),
z9.VnS(),
z10.VnS(),
z11.VnS(),
p2.Zeroing(),
SVEMemOperand(sp, x2, LSL, 2));
__ DropVL(2 + 3 + 4);
END();
if (CAN_RUN()) {
RUN();
// The most likely failure mode is that the simulator reads sp as xzr and
// crashes on execution. We already test the address calculations separately
// and sp doesn't change this, so just test that we load the values we
// stored.
// st2b/ld2b
ASSERT_EQUAL_SVE(z0, z12);
ASSERT_EQUAL_SVE(z1, z13);
// st3h/ld3h
ASSERT_EQUAL_SVE(z4, z16);
ASSERT_EQUAL_SVE(z5, z17);
ASSERT_EQUAL_SVE(z6, z18);
// st4w/ld4w
ASSERT_EQUAL_SVE(z8, z20);
ASSERT_EQUAL_SVE(z9, z21);
ASSERT_EQUAL_SVE(z10, z22);
ASSERT_EQUAL_SVE(z11, z23);
}
}
TEST_SVE(sve_ld234_st234_scalar_plus_imm_sp) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// Check that the simulator correctly interprets rn == 31 as sp.
// The indexing logic is the same regardless, so we just check one load and
// store of each type.
// There are no pre- or post-indexing modes, so reserve space first.
// Note that the stores fill in an order that allows each immediate to be a
// multiple of the number of registers.
__ ClaimVL(4 + 2 + 3);
__ Index(z0.VnB(), 42, 2);
__ Index(z1.VnB(), 43, 2);
__ Ptrue(p0.VnB(), SVE_POW2);
__ St2b(z0.VnB(), z1.VnB(), p0, SVEMemOperand(sp, 4, SVE_MUL_VL));
__ Index(z4.VnH(), 42, 3);
__ Index(z5.VnH(), 43, 3);
__ Index(z6.VnH(), 44, 3);
__ Ptrue(p1.VnH(), SVE_VL7);
__ St3h(z4.VnH(), z5.VnH(), z6.VnH(), p1, SVEMemOperand(sp, 6, SVE_MUL_VL));
__ Index(z8.VnS(), 42, 4);
__ Index(z9.VnS(), 43, 4);
__ Index(z10.VnS(), 44, 4);
__ Index(z11.VnS(), 45, 4);
__ Ptrue(p2.VnS());
__ St4w(z8.VnS(), z9.VnS(), z10.VnS(), z11.VnS(), p2, SVEMemOperand(sp));
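// The regions are therefore laid out as: st4w at sp, st2b at sp + (4 * VL)
// and st3h at sp + (6 * VL), filling the nine VLs claimed above.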
// Corresponding loads.
// We have to explicitly zero inactive lanes in the reference values because
// loads have zeroing predication.
__ Dup(z12.VnB(), 0);
__ Dup(z13.VnB(), 0);
__ Mov(z12.VnB(), p0.Merging(), z0.VnB());
__ Mov(z13.VnB(), p0.Merging(), z1.VnB());
__ Ld2b(z0.VnB(), z1.VnB(), p0.Zeroing(), SVEMemOperand(sp, 4, SVE_MUL_VL));
__ Dup(z16.VnH(), 0);
__ Dup(z17.VnH(), 0);
__ Dup(z18.VnH(), 0);
__ Mov(z16.VnH(), p1.Merging(), z4.VnH());
__ Mov(z17.VnH(), p1.Merging(), z5.VnH());
__ Mov(z18.VnH(), p1.Merging(), z6.VnH());
__ Ld3h(z4.VnH(),
z5.VnH(),
z6.VnH(),
p1.Zeroing(),
SVEMemOperand(sp, 6, SVE_MUL_VL));
__ Dup(z20.VnS(), 0);
__ Dup(z21.VnS(), 0);
__ Dup(z22.VnS(), 0);
__ Dup(z23.VnS(), 0);
__ Mov(z20.VnS(), p2.Merging(), z8.VnS());
__ Mov(z21.VnS(), p2.Merging(), z9.VnS());
__ Mov(z22.VnS(), p2.Merging(), z10.VnS());
__ Mov(z23.VnS(), p2.Merging(), z11.VnS());
__ Ld4w(z8.VnS(),
z9.VnS(),
z10.VnS(),
z11.VnS(),
p2.Zeroing(),
SVEMemOperand(sp));
__ DropVL(4 + 2 + 3);
END();
if (CAN_RUN()) {
RUN();
// The most likely failure mode is that the simulator reads sp as xzr and
// crashes on execution. We already test the address calculations separately
// and sp doesn't change this, so just test that we load the values we
// stored.
// st2b/ld2b
ASSERT_EQUAL_SVE(z0, z12);
ASSERT_EQUAL_SVE(z1, z13);
// st3h/ld3h
ASSERT_EQUAL_SVE(z4, z16);
ASSERT_EQUAL_SVE(z5, z17);
ASSERT_EQUAL_SVE(z6, z18);
// st4w/ld4w
ASSERT_EQUAL_SVE(z8, z20);
ASSERT_EQUAL_SVE(z9, z21);
ASSERT_EQUAL_SVE(z10, z22);
ASSERT_EQUAL_SVE(z11, z23);
}
}
// Fill the input buffer with arbitrary data. Also generate random offsets
// from the base address of the buffer and, if the corresponding arguments
// are provided, the matching absolute addresses.
static void BufferFillingHelper(uint64_t data_ptr,
size_t buffer_size,
unsigned lane_size_in_bytes,
int lane_count,
uint64_t* offsets,
uint64_t* addresses = nullptr,
uint64_t* max_address = nullptr) {
// Use a fixed seed for nrand48() so that test runs are reproducible.
unsigned short seed[3] = {1, 2, 3}; // NOLINT(google-runtime-int)
// Fill a buffer with arbitrary data.
for (size_t i = 0; i < buffer_size; i++) {
uint8_t byte = nrand48(seed) & 0xff;
memcpy(reinterpret_cast<void*>(data_ptr + i), &byte, 1);
}
if (max_address != nullptr) {
*max_address = 0;
}
// Vectors of random addresses and offsets into the buffer.
for (int i = 0; i < lane_count; i++) {
uint64_t rnd = nrand48(seed);
// Limit the range to the set of completely-accessible elements in memory.
offsets[i] = rnd % (buffer_size - lane_size_in_bytes);
if ((addresses != nullptr) && (max_address != nullptr)) {
addresses[i] = data_ptr + offsets[i];
*max_address = std::max(*max_address, addresses[i]);
}
}
}
static void ScalarLoadHelper(MacroAssembler* masm,
Register dst,
Register addr,
int msize_in_bits,
bool is_signed) {
if (is_signed) {
switch (msize_in_bits) {
case kBRegSize:
masm->Ldrsb(dst, MemOperand(addr));
break;
case kHRegSize:
masm->Ldrsh(dst, MemOperand(addr));
break;
case kWRegSize:
masm->Ldrsw(dst, MemOperand(addr));
break;
default:
VIXL_UNIMPLEMENTED();
break;
}
} else {
switch (msize_in_bits) {
case kBRegSize:
masm->Ldrb(dst, MemOperand(addr));
break;
case kHRegSize:
masm->Ldrh(dst, MemOperand(addr));
break;
case kWRegSize:
masm->Ldr(dst.W(), MemOperand(addr));
break;
case kXRegSize:
masm->Ldr(dst, MemOperand(addr));
break;
default:
VIXL_UNIMPLEMENTED();
break;
}
}
}
// Generate a reference result using scalar loads.
// For now, this helper doesn't save and restore the caller's registers.
// It clobbers z30, x28, x29 and p7.
template <size_t N>
static void ScalarLoadHelper(MacroAssembler* masm,
int vl,
const uint64_t (&addresses)[N],
const ZRegister& zt_ref,
const PRegisterZ& pg,
unsigned esize_in_bits,
unsigned msize_in_bits,
bool is_signed) {
unsigned esize_in_bytes = esize_in_bits / kBitsPerByte;
ZRegister lane_numbers = z30.WithLaneSize(esize_in_bits);
masm->Index(lane_numbers, 0, 1);
masm->Dup(zt_ref, 0);
for (unsigned i = 0; i < (vl / esize_in_bytes); i++) {
masm->Mov(x29, addresses[N - i - 1]);
Register rt(28, std::min(std::max(esize_in_bits, kSRegSize), kDRegSize));
ScalarLoadHelper(masm, rt, x29, msize_in_bits, is_signed);
// Emulate predication.
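// Cmpeq activates p7 only for lane i (and only if pg is active there), and
// Cpy then merges the loaded scalar into just that lane of the reference.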
masm->Cmpeq(p7.WithLaneSize(esize_in_bits), pg, lane_numbers, i);
masm->Cpy(zt_ref, p7.Merging(), rt);
}
}
typedef void (MacroAssembler::*Ld1Macro)(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr);
template <typename T>
static void Ldff1Helper(Test* config,
uintptr_t data,
unsigned msize_in_bits,
unsigned esize_in_bits,
CPURegister::RegisterType base_type,
Ld1Macro ldff1,
Ld1Macro ld1,
T mod,
bool scale = false) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int vl = config->sve_vl_in_bytes();
size_t page_size = sysconf(_SC_PAGE_SIZE);
VIXL_ASSERT(page_size > static_cast<size_t>(vl));
unsigned esize_in_bytes = esize_in_bits / kBitsPerByte;
unsigned msize_in_bytes = msize_in_bits / kBitsPerByte;
unsigned msize_in_bytes_log2 = std::log2(msize_in_bytes);
VIXL_ASSERT(msize_in_bits <= esize_in_bits);
PRegister all = p7;
__ Ptrue(all.VnB());
size_t offset_modifier = 0;
// The highest address at which a load stopped. Every FF load should fault at
// `data + page_size`, so this value should not exceed that address. However,
// the architecture allows fault-tolerant loads to fault arbitrarily, so the
// real value may be lower.
//
// This is used to check that the `mprotect` applied by the caller really does
// make the second page inaccessible, and that the resulting FFR from each
// load reflects that.
Register limit = x22;
__ Mov(limit, 0);
// If the FFR grows unexpectedly, we increment this register by the
// difference. FFR should never grow, except when explicitly set.
Register ffr_grow_count = x23;
__ Mov(ffr_grow_count, 0);
// Set the offset so that the load is guaranteed to start in the
// accessible page, but end in the inaccessible one.
VIXL_ASSERT((page_size % msize_in_bytes) == 0);
VIXL_ASSERT((vl % msize_in_bytes) == 0);
size_t elements_per_page = page_size / msize_in_bytes;
size_t elements_per_access = vl / esize_in_bytes;
size_t min_offset = (elements_per_page - elements_per_access) + 1;
size_t max_offset = elements_per_page - 1;
size_t offset =
min_offset + (offset_modifier % (max_offset - min_offset + 1));
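// For example, with a 4096-byte page, a 256-bit VL and byte elements
// (msize == esize), elements_per_page is 4096 and elements_per_access is 32,
// so the computed offset falls in [4065, 4095] and the access is certain to
// straddle the page boundary.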
offset_modifier++;
__ Setffr();
__ Mov(x20, data);
__ Mov(x21, offset);
if (base_type == CPURegister::kRegister) {
// Scalar-plus-scalar mode.
VIXL_ASSERT((std::is_same<T, vixl::aarch64::Shift>::value));
VIXL_ASSERT((static_cast<int>(mod) == LSL) ||
(static_cast<int>(mod) == NO_SHIFT));
(masm.*ldff1)(z0.WithLaneSize(esize_in_bits),
all.Zeroing(),
SVEMemOperand(x20, x21, mod, msize_in_bytes_log2));
} else {
VIXL_ASSERT(base_type == CPURegister::kZRegister);
int offs_size;
bool offs_is_unsigned;
if (std::is_same<T, vixl::aarch64::Extend>::value) {
// Scalar-plus-vector mode with a 32-bit packed or unpacked, unscaled or
// scaled offset.
VIXL_ASSERT((static_cast<int>(mod) == SXTW) ||
(static_cast<int>(mod) == UXTW));
if (scale == true) {
// First-fault gather byte loads don't support a scaled offset.
VIXL_ASSERT(msize_in_bits != kBRegSize);
}
offs_is_unsigned = (static_cast<int>(mod) == UXTW) ? true : false;
offs_size = kSRegSize;
} else {
// Scalar-plus-vector mode with 64-bit unscaled or scaled offset.
VIXL_ASSERT((std::is_same<T, vixl::aarch64::Shift>::value));
VIXL_ASSERT((static_cast<int>(mod) == LSL) ||
(static_cast<int>(mod) == NO_SHIFT));
offs_is_unsigned = false;
offs_size = kDRegSize;
}
// Generate the access pattern "base address + (index << shift)". For
// unscaled-offset operations, step the indices by `msize_in_bytes` between
// successive accesses; otherwise, step them by one and let the shift value
// provide the scaling.
int shift = (scale == true) ? msize_in_bytes_log2 : 0;
int index_offset = msize_in_bytes >> shift;
VIXL_ASSERT(index_offset > 0);
uint64_t index = 0;
uint64_t base_address = 0;
if (offs_is_unsigned == true) {
// Base address.
base_address = data;
// Maximum positive index; it addresses the page boundary at
// `data + page_size`.
index = page_size >> shift;
} else {
// Base address.
base_address = data + (2 * page_size);
// Negative index, encoded as an unsigned two's-complement value, that
// addresses the same page boundary from above.
uint64_t uint_e_max =
(esize_in_bits == kDRegSize) ? UINT64_MAX : UINT32_MAX;
index = uint_e_max - (page_size >> shift) + 1;
}
__ Mov(x19, base_address);
if ((offs_size == kSRegSize) && (esize_in_bits == kDRegSize)) {
// In this case, the index values are sign- or zero-extended from 32 to
// 64 bits. Assign an arbitrary value to the top 32 bits to check that
// only the low 32 bits are used as the index.
index |= 0x1234567800000000;
}
index -= index_offset * (elements_per_access - 1);
__ Index(z17.WithLaneSize(esize_in_bits), index, index_offset);
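// z17 now holds indices whose addresses ascend by `msize_in_bytes` per lane,
// with the highest lane addressing exactly `data + page_size` (the first
// byte of the protected page), so that lane is guaranteed to fault.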
// Scalar plus vector mode.
(masm.*
ldff1)(z0.WithLaneSize(esize_in_bits),
all.Zeroing(),
SVEMemOperand(x19, z17.WithLaneSize(esize_in_bits), mod, shift));
}
__ Rdffrs(p0.VnB(), all.Zeroing());
// Execute another Ldff1 with no offset, so that every element can be read.
// It should respect the FFR, and load no more elements than the first load
// did.
(masm.*
ldff1)(z16.WithLaneSize(esize_in_bits), all.Zeroing(), SVEMemOperand(x20));
__ Rdffrs(p1.VnB(), all.Zeroing());
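// Count the lanes active after the second load, then subtract (with
// unsigned saturation) the count from the first load; any remainder means
// that the FFR grew.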
__ Cntp(x0, all, p1.VnB());
__ Uqdecp(x0, p0.VnB());
__ Add(ffr_grow_count, ffr_grow_count, x0);
// Use the FFR to predicate the normal load. If it wasn't properly set,
// the normal load will abort.
(masm.*ld1)(z16.WithLaneSize(esize_in_bits),
p0.Zeroing(),
SVEMemOperand(x20, x21, LSL, msize_in_bytes_log2));
// Work out the address after the one that was just accessed.
__ Incp(x21, p0.WithLaneSize(esize_in_bits));
__ Add(x0, x20, Operand(x21, LSL, msize_in_bytes_log2));
__ Cmp(limit, x0);
__ Csel(limit, limit, x0, hs);
// Clear lanes inactive in FFR. These have an undefined result.
__ Not(p0.VnB(), all.Zeroing(), p0.VnB());
__ Mov(z0.WithLaneSize(esize_in_bits), p0.Merging(), 0);
END();
if (CAN_RUN()) {
RUN();
uintptr_t expected_limit = data + page_size;
uintptr_t measured_limit = core.xreg(limit.GetCode());
VIXL_CHECK(measured_limit <= expected_limit);
if (measured_limit < expected_limit) {
// We can't fail the test for this case, but a warning is helpful for
// manually-run tests.
printf(
"WARNING: All fault-tolerant loads detected faults before the\n"
"expected limit. This is architecturally possible, but improbable,\n"
"and could be a symptom of another problem.\n");
}
ASSERT_EQUAL_64(0, ffr_grow_count);
ASSERT_EQUAL_SVE(z0.WithLaneSize(esize_in_bits),
z16.WithLaneSize(esize_in_bits));
}
}
TEST_SVE(sve_ldff1_scalar_plus_scalar) {
size_t page_size = sysconf(_SC_PAGE_SIZE);
VIXL_ASSERT(page_size > static_cast<size_t>(config->sve_vl_in_bytes()));
// Allocate two pages, then mprotect the second one to make it inaccessible.
uintptr_t data = reinterpret_cast<uintptr_t>(mmap(NULL,
page_size * 2,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS,
-1,
0));
mprotect(reinterpret_cast<void*>(data + page_size), page_size, PROT_NONE);
// Fill the accessible page with arbitrary data.
for (size_t i = 0; i < page_size; i++) {
// Reverse bits so we get a mixture of positive and negative values.
uint8_t byte = ReverseBits(static_cast<uint8_t>(i));
memcpy(reinterpret_cast<void*>(data + i), &byte, 1);
}
auto ldff1_unscaled_offset_helper = std::bind(&Ldff1Helper<Shift>,
config,
data,
std::placeholders::_1,
std::placeholders::_2,
CPURegister::kRegister,
std::placeholders::_3,
std::placeholders::_4,
NO_SHIFT,
false);
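// The placeholder arguments are, in order: msize_in_bits, esize_in_bits,
// the first-fault load and the corresponding normal load.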
Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
Ld1Macro ld1b = &MacroAssembler::Ld1b;
ldff1_unscaled_offset_helper(kBRegSize, kBRegSize, ldff1b, ld1b);
ldff1_unscaled_offset_helper(kBRegSize, kHRegSize, ldff1b, ld1b);
ldff1_unscaled_offset_helper(kBRegSize, kSRegSize, ldff1b, ld1b);
ldff1_unscaled_offset_helper(kBRegSize, kDRegSize, ldff1b, ld1b);
Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
ldff1_unscaled_offset_helper(kBRegSize, kHRegSize, ldff1sb, ld1sb);
ldff1_unscaled_offset_helper(kBRegSize, kSRegSize, ldff1sb, ld1sb);
ldff1_unscaled_offset_helper(kBRegSize, kDRegSize, ldff1sb, ld1sb);
auto ldff1_scaled_offset_helper = std::bind(&Ldff1Helper<Shift>,
config,
data,
std::placeholders::_1,
std::placeholders::_2,
CPURegister::kRegister,
std::placeholders::_3,
std::placeholders::_4,
LSL,
true);
Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
Ld1Macro ld1h = &MacroAssembler::Ld1h;
ldff1_scaled_offset_helper(kHRegSize, kHRegSize, ldff1h, ld1h);
ldff1_scaled_offset_helper(kHRegSize, kSRegSize, ldff1h, ld1h);
ldff1_scaled_offset_helper(kHRegSize, kDRegSize, ldff1h, ld1h);
Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
Ld1Macro ld1w = &MacroAssembler::Ld1w;
ldff1_scaled_offset_helper(kSRegSize, kSRegSize, ldff1w, ld1w);
ldff1_scaled_offset_helper(kSRegSize, kDRegSize, ldff1w, ld1w);
Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
Ld1Macro ld1d = &MacroAssembler::Ld1d;
ldff1_scaled_offset_helper(kDRegSize, kDRegSize, ldff1d, ld1d);
Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
ldff1_scaled_offset_helper(kHRegSize, kSRegSize, ldff1sh, ld1sh);
ldff1_scaled_offset_helper(kHRegSize, kDRegSize, ldff1sh, ld1sh);
Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
ldff1_scaled_offset_helper(kSRegSize, kDRegSize, ldff1sw, ld1sw);
munmap(reinterpret_cast<void*>(data), page_size * 2);
}
static void sve_ldff1_scalar_plus_vector_32_scaled_offset(Test* config,
uintptr_t data) {
auto ldff1_32_scaled_offset_helper = std::bind(&Ldff1Helper<Extend>,
config,
data,
std::placeholders::_1,
kSRegSize,
CPURegister::kZRegister,
std::placeholders::_2,
std::placeholders::_3,
std::placeholders::_4,
true);
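// Here the placeholders supply (msize_in_bits, ldff1, ld1, extend); the
// element size is fixed at kSRegSize.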
Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
Ld1Macro ld1h = &MacroAssembler::Ld1h;
ldff1_32_scaled_offset_helper(kHRegSize, ldff1h, ld1h, UXTW);
ldff1_32_scaled_offset_helper(kHRegSize, ldff1h, ld1h, SXTW);
Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
Ld1Macro ld1w = &MacroAssembler::Ld1w;
ldff1_32_scaled_offset_helper(kSRegSize, ldff1w, ld1w, UXTW);
ldff1_32_scaled_offset_helper(kSRegSize, ldff1w, ld1w, SXTW);
Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
ldff1_32_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh, UXTW);
ldff1_32_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh, SXTW);
}
static void sve_ldff1_scalar_plus_vector_32_unscaled_offset(Test* config,
uintptr_t data) {
auto ldff1_32_unscaled_offset_helper = std::bind(&Ldff1Helper<Extend>,
config,
data,
std::placeholders::_1,
kSRegSize,
CPURegister::kZRegister,
std::placeholders::_2,
std::placeholders::_3,
std::placeholders::_4,
false);
Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
Ld1Macro ld1b = &MacroAssembler::Ld1b;
ldff1_32_unscaled_offset_helper(kBRegSize, ldff1b, ld1b, UXTW);
ldff1_32_unscaled_offset_helper(kBRegSize, ldff1b, ld1b, SXTW);
Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
Ld1Macro ld1h = &MacroAssembler::Ld1h;
ldff1_32_unscaled_offset_helper(kHRegSize, ldff1h, ld1h, UXTW);
ldff1_32_unscaled_offset_helper(kHRegSize, ldff1h, ld1h, SXTW);
Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
Ld1Macro ld1w = &MacroAssembler::Ld1w;
ldff1_32_unscaled_offset_helper(kSRegSize, ldff1w, ld1w, UXTW);
ldff1_32_unscaled_offset_helper(kSRegSize, ldff1w, ld1w, SXTW);
Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
ldff1_32_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb, UXTW);
ldff1_32_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb, SXTW);
Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
ldff1_32_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh, UXTW);
ldff1_32_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh, SXTW);
}
static void sve_ldff1_scalar_plus_vector_32_unpacked_scaled_offset(
Test* config, uintptr_t data) {
auto ldff1_32_unpacked_scaled_offset_helper =
std::bind(&Ldff1Helper<Extend>,
config,
data,
std::placeholders::_1,
kDRegSize,
CPURegister::kZRegister,
std::placeholders::_2,
std::placeholders::_3,
std::placeholders::_4,
true);
Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
Ld1Macro ld1h = &MacroAssembler::Ld1h;
ldff1_32_unpacked_scaled_offset_helper(kHRegSize, ldff1h, ld1h, UXTW);
ldff1_32_unpacked_scaled_offset_helper(kHRegSize, ldff1h, ld1h, SXTW);
Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
Ld1Macro ld1w = &MacroAssembler::Ld1w;
ldff1_32_unpacked_scaled_offset_helper(kSRegSize, ldff1w, ld1w, UXTW);
ldff1_32_unpacked_scaled_offset_helper(kSRegSize, ldff1w, ld1w, SXTW);
Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
Ld1Macro ld1d = &MacroAssembler::Ld1d;
ldff1_32_unpacked_scaled_offset_helper(kDRegSize, ldff1d, ld1d, UXTW);
ldff1_32_unpacked_scaled_offset_helper(kDRegSize, ldff1d, ld1d, SXTW);
Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
ldff1_32_unpacked_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh, UXTW);
ldff1_32_unpacked_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh, SXTW);
Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
ldff1_32_unpacked_scaled_offset_helper(kSRegSize, ldff1sw, ld1sw, UXTW);
ldff1_32_unpacked_scaled_offset_helper(kSRegSize, ldff1sw, ld1sw, SXTW);
}
static void sve_ldff1_scalar_plus_vector_32_unpacked_unscaled_offset(
Test* config, uintptr_t data) {
auto ldff1_32_unpacked_unscaled_offset_helper =
std::bind(&Ldff1Helper<Extend>,
config,
data,
std::placeholders::_1,
kDRegSize,
CPURegister::kZRegister,
std::placeholders::_2,
std::placeholders::_3,
std::placeholders::_4,
false);
Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
Ld1Macro ld1b = &MacroAssembler::Ld1b;
ldff1_32_unpacked_unscaled_offset_helper(kBRegSize, ldff1b, ld1b, UXTW);
ldff1_32_unpacked_unscaled_offset_helper(kBRegSize, ldff1b, ld1b, SXTW);
Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
Ld1Macro ld1h = &MacroAssembler::Ld1h;
ldff1_32_unpacked_unscaled_offset_helper(kHRegSize, ldff1h, ld1h, UXTW);
ldff1_32_unpacked_unscaled_offset_helper(kHRegSize, ldff1h, ld1h, SXTW);
Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
Ld1Macro ld1w = &MacroAssembler::Ld1w;
ldff1_32_unpacked_unscaled_offset_helper(kSRegSize, ldff1w, ld1w, UXTW);
ldff1_32_unpacked_unscaled_offset_helper(kSRegSize, ldff1w, ld1w, SXTW);
Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
Ld1Macro ld1d = &MacroAssembler::Ld1d;
ldff1_32_unpacked_unscaled_offset_helper(kDRegSize, ldff1d, ld1d, UXTW);
ldff1_32_unpacked_unscaled_offset_helper(kDRegSize, ldff1d, ld1d, SXTW);
Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
ldff1_32_unpacked_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb, UXTW);
ldff1_32_unpacked_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb, SXTW);
Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
ldff1_32_unpacked_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh, UXTW);
ldff1_32_unpacked_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh, SXTW);
Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
ldff1_32_unpacked_unscaled_offset_helper(kSRegSize, ldff1sw, ld1sw, UXTW);
ldff1_32_unpacked_unscaled_offset_helper(kSRegSize, ldff1sw, ld1sw, SXTW);
}
static void sve_ldff1_scalar_plus_vector_64_scaled_offset(Test* config,
uintptr_t data) {
auto ldff1_64_scaled_offset_helper = std::bind(&Ldff1Helper<Shift>,
config,
data,
std::placeholders::_1,
kDRegSize,
CPURegister::kZRegister,
std::placeholders::_2,
std::placeholders::_3,
LSL,
true);
Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
Ld1Macro ld1h = &MacroAssembler::Ld1h;
ldff1_64_scaled_offset_helper(kHRegSize, ldff1h, ld1h);
Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
Ld1Macro ld1w = &MacroAssembler::Ld1w;
ldff1_64_scaled_offset_helper(kSRegSize, ldff1w, ld1w);
Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
Ld1Macro ld1d = &MacroAssembler::Ld1d;
ldff1_64_scaled_offset_helper(kDRegSize, ldff1d, ld1d);
Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
ldff1_64_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh);
Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
ldff1_64_scaled_offset_helper(kSRegSize, ldff1sw, ld1sw);
}
static void sve_ldff1_scalar_plus_vector_64_unscaled_offset(Test* config,
uintptr_t data) {
auto ldff1_64_unscaled_offset_helper = std::bind(&Ldff1Helper<Shift>,
config,
data,
std::placeholders::_1,
kDRegSize,
CPURegister::kZRegister,
std::placeholders::_2,
std::placeholders::_3,
NO_SHIFT,
false);
Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
Ld1Macro ld1b = &MacroAssembler::Ld1b;
ldff1_64_unscaled_offset_helper(kBRegSize, ldff1b, ld1b);
Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
Ld1Macro ld1h = &MacroAssembler::Ld1h;
ldff1_64_unscaled_offset_helper(kHRegSize, ldff1h, ld1h);
Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
Ld1Macro ld1w = &MacroAssembler::Ld1w;
ldff1_64_unscaled_offset_helper(kSRegSize, ldff1w, ld1w);
Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
Ld1Macro ld1d = &MacroAssembler::Ld1d;
ldff1_64_unscaled_offset_helper(kDRegSize, ldff1d, ld1d);
Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
ldff1_64_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb);
Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
ldff1_64_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh);
Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
ldff1_64_unscaled_offset_helper(kSRegSize, ldff1sw, ld1sw);
}
TEST_SVE(sve_ldff1_scalar_plus_vector) {
size_t page_size = sysconf(_SC_PAGE_SIZE);
VIXL_ASSERT(page_size > static_cast<size_t>(config->sve_vl_in_bytes()));
// Allocate two pages, then mprotect the second one to make it inaccessible.
uintptr_t data = reinterpret_cast<uintptr_t>(mmap(NULL,
page_size * 2,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS,
-1,
0));
mprotect(reinterpret_cast<void*>(data + page_size), page_size, PROT_NONE);
// Fill the accessible page with arbitrary data.
for (size_t i = 0; i < page_size; i++) {
// Reverse bits so we get a mixture of positive and negative values.
uint8_t byte = ReverseBits(static_cast<uint8_t>(i));
memcpy(reinterpret_cast<void*>(data + i), &byte, 1);
}
sve_ldff1_scalar_plus_vector_32_scaled_offset(config, data);
sve_ldff1_scalar_plus_vector_32_unscaled_offset(config, data);
sve_ldff1_scalar_plus_vector_32_unpacked_scaled_offset(config, data);
sve_ldff1_scalar_plus_vector_32_unpacked_unscaled_offset(config, data);
sve_ldff1_scalar_plus_vector_64_scaled_offset(config, data);
sve_ldff1_scalar_plus_vector_64_unscaled_offset(config, data);
munmap(reinterpret_cast<void*>(data), page_size * 2);
}
TEST_SVE(sve_ldnf1) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
CPUFeatures::kNEON,
CPUFeatures::kFP);
START();
size_t page_size = sysconf(_SC_PAGE_SIZE);
VIXL_ASSERT(page_size > static_cast<size_t>(config->sve_vl_in_bytes()));
// Allocate two pages, fill the first with data, then mprotect the second one
// to make it inaccessible.
uintptr_t data = reinterpret_cast<uintptr_t>(mmap(NULL,
page_size * 2,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS,
-1,
0));
// Fill the first (accessible) page with arbitrary data.
for (size_t i = 0; i < page_size; i++) {
// Reverse bits so we get a mixture of positive and negative values.
uint8_t byte = ReverseBits(static_cast<uint8_t>(i));
memcpy(reinterpret_cast<void*>(data + i), &byte, 1);
}
mprotect(reinterpret_cast<void*>(data + page_size), page_size, PROT_NONE);
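// `Setffr` sets every bit of the FFR; each non-faulting load below then
// clears the FFR bits of any lanes that are not successfully loaded.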
__ Setffr();
__ Ptrue(p0.VnB());
__ Dup(z10.VnB(), 0);
// Point x0 at the last eight unprotected bytes.
__ Mov(x0, data + page_size - (kQRegSizeInBytes / kBRegSizeInBytes) / 2);
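// (kQRegSizeInBytes / kBRegSizeInBytes) / 2 == 8, so x0 is eight bytes below
// the start of the protected page.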
// Perform a non-faulting load of a vector of bytes from x0. At most eight
// bytes can be loaded; the rest lie in the protected page.
__ Ldnf1b(z0.VnB(), p0.Zeroing(), SVEMemOperand(x0));
__ Rdffr(p1.VnB());
__ Setffr();
// Create references using the FFR value in p1 to zero the undefined lanes.
__ Sel(z0.VnB(), p1, z0.VnB(), z10.VnB());
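// z20 is the reference: an ordinary (faulting) load of the same area,
// predicated on p1 so that only the lanes the non-faulting load actually
// filled are compared.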
__ Ld1b(z20.VnB(), p1.Zeroing(), SVEMemOperand(x0));
// Repeat for larger elements and different addresses, giving different FFR
// results.
__ Add(x1, x0, 1);
__ Ldnf1h(z1.VnH(), p0.Zeroing(), SVEMemOperand(x1));
__ Rdffr(p1.VnB());
__ Setffr();
__ Sel(z1.VnH(), p1, z1.VnH(), z10.VnH());
__ Ld1h(z21.VnH(), p1.Zeroing(), SVEMemOperand(x1));
__ Add(x1, x0, 2);
__ Ldnf1w(z2.VnS(), p0.Zeroing(), SVEMemOperand(x1));
__ Rdffr(p1.VnB());
__ Setffr();
__ Sel(z2.VnS(), p1, z2.VnS(), z10.VnS());
__ Ld1w(z22.VnS(), p1.Zeroing(), SVEMemOperand(x1));
__ Sub(x1, x0, 1);
__ Ldnf1d(z3.VnD(), p0.Zeroing(), SVEMemOperand(x1));
__ Rdffr(p1.VnB());
__ Setffr();
__ Sel(z3.VnD(), p1, z3.VnD(), z10.VnD());
__ Ld1d(z23.VnD(), p1.Zeroing(), SVEMemOperand(x1));
// Load from the previous VL-sized area of memory. All of it should be in the
// accessible page.
__ Ldnf1b(z4.VnB(), p0.Zeroing(), SVEMemOperand(x0, -1, SVE_MUL_VL));
__ Rdffr(p1.VnB());
__ Setffr();
__ Sel(z4.VnB(), p1, z4.VnB(), z10.VnB());
__ Ld1b(z24.VnB(), p1.Zeroing(), SVEMemOperand(x0, -1, SVE_MUL_VL));
// Repeat partial load for larger element size.
__ Mov(x0, data + page_size - (kQRegSizeInBytes / kSRegSizeInBytes) / 2);
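// (kQRegSizeInBytes / kSRegSizeInBytes) / 2 == 2, so only the first two
// byte-per-lane loads fall in the accessible page.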
__ Ldnf1b(z5.VnS(), p0.Zeroing(), SVEMemOperand(x0));
__ Rdffr(p1.VnB());
__ Setffr();
__ Sel(z5.VnS(), p1, z5.VnS(), z10.VnS());
__ Ld1b(z25.VnS(), p1.Zeroing(), SVEMemOperand(x0));
// Repeat for sign extension.
__ Mov(x0, data + page_size - (kQRegSizeInBytes / kHRegSizeInBytes) / 2);
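// (kQRegSizeInBytes / kHRegSizeInBytes) / 2 == 4, so only the first four
// sign-extending byte loads fall in the accessible page.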
__ Ldnf1sb(z6.VnH(), p0.Zeroing(), SVEMemOperand(x0));
__ Rdffr(p1.VnB());
__ Setffr();
__ Sel(z6.VnH(), p1, z6.VnH(), z10.VnH());
__ Ld1sb(z26.VnH(), p1.Zeroing(), SVEMemOperand(x0));
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z20, z0);
ASSERT_EQUAL_SVE(z21, z1);
ASSERT_EQUAL_SVE(z22, z2);
ASSERT_EQUAL_SVE(z23, z3);
ASSERT_EQUAL_SVE(z24, z4);
ASSERT_EQUAL_SVE(z25, z5);
ASSERT_EQUAL_SVE(z26, z6);
}
munmap(reinterpret_cast<void*>(data), page_size * 2);
}
// This test emphasises whether the addressing modifiers are propagated and
// simulated correctly.
TEST_SVE(sve_ldff1_regression_test) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
size_t page_size = sysconf(_SC_PAGE_SIZE);
VIXL_ASSERT(page_size > static_cast<size_t>(config->sve_vl_in_bytes()));
uintptr_t data = reinterpret_cast<uintptr_t>(mmap(NULL,
page_size * 2,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS,
-1,
0));
uintptr_t middle = data + page_size;
// Fill the accessible page with arbitrary data.
for (size_t i = 0; i < page_size; i++) {
// Reverse bits so we get a mixture of positive and negative values.
uint8_t byte = ReverseBits(static_cast<uint8_t>(i));
memcpy(reinterpret_cast<void*>(middle + i), &byte, 1);
// Make every byte slightly different from its mirror image, and copy the
// bytes in the reverse direction, which is convenient for verifying loads at
// negative indices.
byte += 1;
memcpy(reinterpret_cast<void*>(middle - i), &byte, 1);
}
PRegister all = p6;
__ Ptrue(all.VnB());
__ Mov(x0, middle);
__ Index(z31.VnS(), 0, 3);
__ Neg(z30.VnS(), z31.VnS());
__ Setffr();
// Scalar plus vector 32 unscaled offset
__ Ldff1b(z1.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
__ Ldff1h(z2.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW));
__ Ldff1w(z3.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
__ Ldff1sb(z4.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW));
__ Ldff1sh(z5.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
// Scalar plus vector 32 scaled offset
__ Ldff1h(z6.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW, 1));
__ Ldff1w(z7.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW, 2));
__ Ldff1sh(z8.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW, 1));
__ Index(z31.VnD(), 0, 3);
__ Neg(z30.VnD(), z31.VnD());
// Ensure only the low 32 bits are used when testing with positive index
// values. This also tests that the indices are treated as unsigned in `uxtw`
// form.
__ Mov(x3, 0x8000000080000000);
__ Dup(z28.VnD(), x3);
__ Sub(x2, x0, 0x80000000);
__ Add(z29.VnD(), z31.VnD(), z28.VnD());
// Scalar plus vector 32 unpacked unscaled offset
__ Ldff1b(z9.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
__ Ldff1h(z10.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW));
__ Ldff1w(z11.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
__ Ldff1sb(z12.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
__ Ldff1sh(z13.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW));
__ Ldff1sw(z14.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
// Scalar plus vector 32 unpacked scaled offset
__ Ldff1h(z15.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 1));
__ Ldff1w(z16.VnD(), all.Zeroing(), SVEMemOperand(x0, z31.VnD(), UXTW, 2));
__ Ldff1d(z17.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 3));
__ Ldff1sh(z18.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 1));
__ Ldff1sw(z19.VnD(), all.Zeroing(), SVEMemOperand(x0, z31.VnD(), UXTW, 2));
__ Sub(x0, x0, x3);
// Note that `0x8000000080000000` has been added to the positive indices. The
// wrong address would be accessed if an index were treated as negative.
// Scalar plus vector 64 unscaled offset
__ Ldff1b(z20.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
__ Ldff1h(z21.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
__ Ldff1w(z22.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
__ Ldff1sh(z23.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
__ Ldff1sw(z24.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
// Scalar plus vector 64 scaled offset
__ Lsr(z29.VnD(), z28.VnD(), 1); // Shift right to 0x4000000040000000
__ Add(z30.VnD(), z31.VnD(), z29.VnD());
__ Ldff1h(z25.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 1));
__ Ldff1sh(z26.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 1));
__ Lsr(z29.VnD(), z29.VnD(), 1); // Shift right to 0x2000000020000000
__ Add(z30.VnD(), z31.VnD(), z29.VnD());
__ Ldff1w(z27.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 2));
__ Ldff1sw(z28.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 2));
__ Lsr(z29.VnD(), z29.VnD(), 1); // Shift right to 0x1000000010000000
__ Add(z30.VnD(), z31.VnD(), z29.VnD());
__ Ldff1d(z29.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 3));
__ Rdffr(p1.VnB());
__ Cntp(x10, all, p1.VnB());
END();
if (CAN_RUN()) {
RUN();
int64_t loaded_data_in_bytes = core.xreg(x10.GetCode());
// Only check 128 bits in this test.
if (loaded_data_in_bytes < kQRegSizeInBytes) {
// Report a warning when the fault-tolerant loads detected faults before all
// the expected loads had been performed.
printf(
"WARNING: Fault-tolerant loads detected faults before the "
"expected loads completed.\n");
return;
}
// Scalar plus vector 32 unscaled offset
uint32_t expected_z1[] = {0x00000090, 0x00000060, 0x000000c0, 0x00000001};
uint32_t expected_z2[] = {0x00001191, 0x0000a161, 0x000041c1, 0x00008001};
uint32_t expected_z3[] = {0x30d05090, 0x9010e060, 0x60a020c0, 0xc0408001};
uint32_t expected_z4[] = {0xffffff91, 0x00000061, 0xffffffc1, 0x00000001};
uint32_t expected_z5[] = {0x00005090, 0xffffe060, 0x000020c0, 0xffff8001};
ASSERT_EQUAL_SVE(expected_z1, z1.VnS());
ASSERT_EQUAL_SVE(expected_z2, z2.VnS());
ASSERT_EQUAL_SVE(expected_z3, z3.VnS());
ASSERT_EQUAL_SVE(expected_z4, z4.VnS());
ASSERT_EQUAL_SVE(expected_z5, z5.VnS());
// Scalar plus vector 32 scaled offset
uint32_t expected_z6[] = {0x0000c848, 0x0000b030, 0x0000e060, 0x00008001};
uint32_t expected_z7[] = {0xe464a424, 0xd8589818, 0xf070b030, 0xc0408001};
uint32_t expected_z8[] = {0xffff8949, 0xffffd131, 0xffffa161, 0xffff8001};
ASSERT_EQUAL_SVE(expected_z6, z6.VnS());
ASSERT_EQUAL_SVE(expected_z7, z7.VnS());
ASSERT_EQUAL_SVE(expected_z8, z8.VnS());
// Scalar plus vector 32 unpacked unscaled offset
uint64_t expected_z9[] = {0x00000000000000c0, 0x0000000000000001};
uint64_t expected_z10[] = {0x00000000000041c1, 0x0000000000008001};
uint64_t expected_z11[] = {0x0000000060a020c0, 0x00000000c0408001};
uint64_t expected_z12[] = {0xffffffffffffffc0, 0x0000000000000001};
uint64_t expected_z13[] = {0x00000000000041c1, 0xffffffffffff8001};
uint64_t expected_z14[] = {0x0000000060a020c0, 0xffffffffc0408001};
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
// Scalar plus vector 32 unpacked scaled offset
uint64_t expected_z15[] = {0x000000000000a161, 0x0000000000008001};
uint64_t expected_z16[] = {0x00000000f070b030, 0x00000000c0408001};
uint64_t expected_z17[] = {0x8949c929a969e919, 0xe060a020c0408001};
uint64_t expected_z18[] = {0xffffffffffffa161, 0xffffffffffff8001};
uint64_t expected_z19[] = {0xfffffffff070b030, 0xffffffffc0408001};
ASSERT_EQUAL_SVE(expected_z15, z15.VnD());
ASSERT_EQUAL_SVE(expected_z16, z16.VnD());
ASSERT_EQUAL_SVE(expected_z17, z17.VnD());
ASSERT_EQUAL_SVE(expected_z18, z18.VnD());
ASSERT_EQUAL_SVE(expected_z19, z19.VnD());
// Scalar plus vector 64 unscaled offset
uint64_t expected_z20[] = {0x00000000000000c0, 0x0000000000000001};
uint64_t expected_z21[] = {0x00000000000020c0, 0x0000000000008001};
uint64_t expected_z22[] = {0x0000000060a020c0, 0x00000000c0408001};
uint64_t expected_z23[] = {0x00000000000020c0, 0xffffffffffff8001};
uint64_t expected_z24[] = {0x0000000060a020c0, 0xffffffffc0408001};
ASSERT_EQUAL_SVE(expected_z20, z20.VnD());
ASSERT_EQUAL_SVE(expected_z21, z21.VnD());
ASSERT_EQUAL_SVE(expected_z22, z22.VnD());
ASSERT_EQUAL_SVE(expected_z23, z23.VnD());
ASSERT_EQUAL_SVE(expected_z24, z24.VnD());
uint64_t expected_z25[] = {0x000000000000e060, 0x0000000000008001};
uint64_t expected_z26[] = {0xffffffffffffe060, 0xffffffffffff8001};
uint64_t expected_z27[] = {0x00000000f070b030, 0x00000000c0408001};
uint64_t expected_z28[] = {0xfffffffff070b030, 0xffffffffc0408001};
uint64_t expected_z29[] = {0xf878b838d8589818, 0xe060a020c0408001};
// Scalar plus vector 64 scaled offset
ASSERT_EQUAL_SVE(expected_z25, z25.VnD());
ASSERT_EQUAL_SVE(expected_z26, z26.VnD());
ASSERT_EQUAL_SVE(expected_z27, z27.VnD());
ASSERT_EQUAL_SVE(expected_z28, z28.VnD());
ASSERT_EQUAL_SVE(expected_z29, z29.VnD());
}
munmap(reinterpret_cast<void*>(data), page_size * 2);
}
// This test emphasises whether the addressing modifiers are propagated and
// simulated correctly.
TEST_SVE(sve_ld1_regression_test) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
size_t page_size = sysconf(_SC_PAGE_SIZE);
VIXL_ASSERT(page_size > static_cast<size_t>(config->sve_vl_in_bytes()));
uintptr_t data = reinterpret_cast<uintptr_t>(mmap(NULL,
page_size * 2,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS,
-1,
0));
uintptr_t middle = data + page_size;
// Fill the accessible page with arbitrary data.
for (size_t i = 0; i < page_size; i++) {
// Reverse bits so we get a mixture of positive and negative values.
uint8_t byte = ReverseBits(static_cast<uint8_t>(i));
memcpy(reinterpret_cast<void*>(middle + i), &byte, 1);
// Make every byte slightly different from its mirror image, and copy the
// bytes in the reverse direction, which is convenient for verifying loads at
// negative indices.
byte += 1;
memcpy(reinterpret_cast<void*>(middle - i), &byte, 1);
}
PRegister all = p6;
__ Ptrue(all.VnB());
__ Mov(x0, middle);
__ Index(z31.VnS(), 0, 3);
__ Neg(z30.VnS(), z31.VnS());
// Scalar plus vector 32 unscaled offset
__ Ld1b(z1.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
__ Ld1h(z2.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW));
__ Ld1w(z3.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
__ Ld1sb(z4.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW));
__ Ld1sh(z5.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
// Scalar plus vector 32 scaled offset
__ Ld1h(z6.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW, 1));
__ Ld1w(z7.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW, 2));
__ Ld1sh(z8.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW, 1));
__ Index(z31.VnD(), 0, 3);
__ Neg(z30.VnD(), z31.VnD());
// Ensure only the low 32 bits are used when testing with positive index
// values. This also tests that the indices are treated as unsigned in `uxtw`
// form.
__ Mov(x3, 0x8000000080000000);
__ Dup(z28.VnD(), x3);
__ Sub(x2, x0, 0x80000000);
__ Add(z29.VnD(), z31.VnD(), z28.VnD());
// Scalar plus vector 32 unpacked unscaled offset
__ Ld1b(z9.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
__ Ld1h(z10.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW));
__ Ld1w(z11.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
__ Ld1sb(z12.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
__ Ld1sh(z13.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW));
__ Ld1sw(z14.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
// Scalar plus vector 32 unpacked scaled offset
__ Ld1h(z15.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 1));
__ Ld1w(z16.VnD(), all.Zeroing(), SVEMemOperand(x0, z31.VnD(), UXTW, 2));
__ Ld1d(z17.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 3));
__ Ld1sh(z18.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 1));
__ Ld1sw(z19.VnD(), all.Zeroing(), SVEMemOperand(x0, z31.VnD(), UXTW, 2));
__ Sub(x0, x0, x3);
// Note that `0x8000000080000000` has been added to the positive indices. The
// wrong address would be accessed if an index were treated as negative.
// Scalar plus vector 64 unscaled offset
__ Ld1b(z20.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
__ Ld1h(z21.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
__ Ld1w(z22.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
__ Ld1sh(z23.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
__ Ld1sw(z24.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
// Scalar plus vector 64 scaled offset
__ Lsr(z29.VnD(), z28.VnD(), 1); // Shift right to 0x4000000040000000
__ Add(z30.VnD(), z31.VnD(), z29.VnD());
__ Ld1h(z25.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 1));
__ Ld1sh(z26.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 1));
__ Lsr(z29.VnD(), z29.VnD(), 1); // Shift right to 0x2000000020000000
__ Add(z30.VnD(), z31.VnD(), z29.VnD());
__ Ld1w(z27.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 2));
__ Ld1sw(z28.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 2));
__ Lsr(z29.VnD(), z29.VnD(), 1); // Shift right to 0x1000000010000000
__ Add(z30.VnD(), z31.VnD(), z29.VnD());
__ Ld1d(z29.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 3));
END();
if (CAN_RUN()) {
RUN();
// Scalar plus vector 32 unscaled offset
uint32_t expected_z1[] = {0x00000090, 0x00000060, 0x000000c0, 0x00000001};
uint32_t expected_z2[] = {0x00001191, 0x0000a161, 0x000041c1, 0x00008001};
uint32_t expected_z3[] = {0x30d05090, 0x9010e060, 0x60a020c0, 0xc0408001};
uint32_t expected_z4[] = {0xffffff91, 0x00000061, 0xffffffc1, 0x00000001};
uint32_t expected_z5[] = {0x00005090, 0xffffe060, 0x000020c0, 0xffff8001};
ASSERT_EQUAL_SVE(expected_z1, z1.VnS());
ASSERT_EQUAL_SVE(expected_z2, z2.VnS());
ASSERT_EQUAL_SVE(expected_z3, z3.VnS());
ASSERT_EQUAL_SVE(expected_z4, z4.VnS());
ASSERT_EQUAL_SVE(expected_z5, z5.VnS());
// Scalar plus vector 32 scaled offset
uint32_t expected_z6[] = {0x0000c848, 0x0000b030, 0x0000e060, 0x00008001};
uint32_t expected_z7[] = {0xe464a424, 0xd8589818, 0xf070b030, 0xc0408001};
uint32_t expected_z8[] = {0xffff8949, 0xffffd131, 0xffffa161, 0xffff8001};
ASSERT_EQUAL_SVE(expected_z6, z6.VnS());
ASSERT_EQUAL_SVE(expected_z7, z7.VnS());
ASSERT_EQUAL_SVE(expected_z8, z8.VnS());
// Scalar plus vector 32 unpacked unscaled offset
uint64_t expected_z9[] = {0x00000000000000c0, 0x0000000000000001};
uint64_t expected_z10[] = {0x00000000000041c1, 0x0000000000008001};
uint64_t expected_z11[] = {0x0000000060a020c0, 0x00000000c0408001};
uint64_t expected_z12[] = {0xffffffffffffffc0, 0x0000000000000001};
uint64_t expected_z13[] = {0x00000000000041c1, 0xffffffffffff8001};
uint64_t expected_z14[] = {0x0000000060a020c0, 0xffffffffc0408001};
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
// Scalar plus vector 32 unpacked scaled offset
uint64_t expected_z15[] = {0x000000000000a161, 0x0000000000008001};
uint64_t expected_z16[] = {0x00000000f070b030, 0x00000000c0408001};
uint64_t expected_z17[] = {0x8949c929a969e919, 0xe060a020c0408001};
uint64_t expected_z18[] = {0xffffffffffffa161, 0xffffffffffff8001};
uint64_t expected_z19[] = {0xfffffffff070b030, 0xffffffffc0408001};
ASSERT_EQUAL_SVE(expected_z15, z15.VnD());
ASSERT_EQUAL_SVE(expected_z16, z16.VnD());
ASSERT_EQUAL_SVE(expected_z17, z17.VnD());
ASSERT_EQUAL_SVE(expected_z18, z18.VnD());
ASSERT_EQUAL_SVE(expected_z19, z19.VnD());
// Scalar plus vector 64 unscaled offset
uint64_t expected_z20[] = {0x00000000000000c0, 0x0000000000000001};
uint64_t expected_z21[] = {0x00000000000020c0, 0x0000000000008001};
uint64_t expected_z22[] = {0x0000000060a020c0, 0x00000000c0408001};
uint64_t expected_z23[] = {0x00000000000020c0, 0xffffffffffff8001};
uint64_t expected_z24[] = {0x0000000060a020c0, 0xffffffffc0408001};
ASSERT_EQUAL_SVE(expected_z20, z20.VnD());
ASSERT_EQUAL_SVE(expected_z21, z21.VnD());
ASSERT_EQUAL_SVE(expected_z22, z22.VnD());
ASSERT_EQUAL_SVE(expected_z23, z23.VnD());
ASSERT_EQUAL_SVE(expected_z24, z24.VnD());
uint64_t expected_z25[] = {0x000000000000e060, 0x0000000000008001};
uint64_t expected_z26[] = {0xffffffffffffe060, 0xffffffffffff8001};
uint64_t expected_z27[] = {0x00000000f070b030, 0x00000000c0408001};
uint64_t expected_z28[] = {0xfffffffff070b030, 0xffffffffc0408001};
uint64_t expected_z29[] = {0xf878b838d8589818, 0xe060a020c0408001};
// Scalar plus vector 64 scaled offset
ASSERT_EQUAL_SVE(expected_z25, z25.VnD());
ASSERT_EQUAL_SVE(expected_z26, z26.VnD());
ASSERT_EQUAL_SVE(expected_z27, z27.VnD());
ASSERT_EQUAL_SVE(expected_z28, z28.VnD());
ASSERT_EQUAL_SVE(expected_z29, z29.VnD());
}
munmap(reinterpret_cast<void*>(data), page_size * 2);
}
// Test gather loads by comparing them with the result of a set of equivalent
// scalar loads.
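// Conceptually, the reference is what the following scalar loop would
// compute. This is only a sketch: `MemoryRead` and `SignExtend` are
// illustrative names, and `ScalarLoadHelper` emits equivalent instructions
// rather than running C++ at test time.
//
//   for (unsigned i = 0; i < lane_count; i++) {
//     uint64_t v = pg[i] ? MemoryRead(addresses[i], msize_in_bits) : 0;
//     zt_ref[i] = is_signed ? SignExtend(v, msize_in_bits) : v;
//   }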
template <typename T>
static void GatherLoadScalarPlusVectorHelper(Test* config,
unsigned msize_in_bits,
unsigned esize_in_bits,
Ld1Macro ld1,
Ld1Macro ldff1,
T mod,
bool is_signed,
bool is_scaled) {
// SVE supports 32- and 64-bit addressing for gather loads.
VIXL_ASSERT((esize_in_bits == kSRegSize) || (esize_in_bits == kDRegSize));
static const unsigned kMaxLaneCount = kZRegMaxSize / kSRegSize;
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
unsigned msize_in_bytes = msize_in_bits / kBitsPerByte;
int vl = config->sve_vl_in_bytes();
uint64_t addresses[kMaxLaneCount];
uint64_t offsets[kMaxLaneCount];
uint64_t max_address = 0;
uint64_t buffer_size = vl * 64;
uint64_t data = reinterpret_cast<uintptr_t>(malloc(buffer_size));
// Fill the buffer with arbitrary data, and generate random offsets into the
// buffer together with the corresponding absolute addresses, written out
// through the array arguments.
BufferFillingHelper(data,
buffer_size,
msize_in_bytes,
kMaxLaneCount,
offsets,
addresses,
&max_address);
ZRegister zn = z0.WithLaneSize(esize_in_bits);
ZRegister zt_ref = z1.WithLaneSize(esize_in_bits);
ZRegister zt = z2.WithLaneSize(esize_in_bits);
ZRegister zt_ff = z3.WithLaneSize(esize_in_bits);
PRegisterWithLaneSize pg_ff = p1.WithLaneSize(esize_in_bits);
PRegisterWithLaneSize pg_diff = p2.WithLaneSize(esize_in_bits);
int shift = 0;
if (is_scaled) {
shift = std::log2(msize_in_bytes);
for (unsigned i = 0; i < kMaxLaneCount; i++) {
// Ensure the offsets are multiples of the scale factor of the operation.
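// For example, a 32-bit access (msize_in_bytes == 4) gives shift == 2, so
// each offset is rounded down to a multiple of four.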
offsets[i] = (offsets[i] >> shift) << shift;
addresses[i] = data + offsets[i];
}
}
PRegister all = p6;
__ Ptrue(all.WithLaneSize(esize_in_bits));
PRegisterZ pg = p0.Zeroing();
Initialise(&masm,
pg,
0x9abcdef012345678,
0xabcdef0123456789,
0xf4f3f1f0fefdfcfa,
0xf9f8f6f5f3f2f1ff);
__ Mov(x0, data);
// Generate a reference result for the scalar-plus-vector form using scalar
// loads.
ScalarLoadHelper(&masm,
vl,
addresses,
zt_ref,
pg,
esize_in_bits,
msize_in_bits,
is_signed);
InsrHelper(&masm, zn, offsets);
if (is_scaled) {
// Scale down the offsets if testing scaled-offset operation.
__ Lsr(zn, zn, shift);
}
(masm.*ld1)(zt, pg, SVEMemOperand(x0, zn, mod, shift));
Register ffr_check_count = x17;
__ Mov(ffr_check_count, 0);
// Test data correctness when the gather load reads from different addresses.
// The first-fault behaviour itself is exercised in `Ldff1Helper`.
__ Setffr();
(masm.*ldff1)(zt_ff, pg, SVEMemOperand(x0, zn, mod, shift));
// Compare the two vector registers, and accumulate the number of mismatching
// lanes in `ffr_check_count`.
__ Rdffrs(pg_ff.VnB(), all.Zeroing());
__ Cmpeq(pg_diff, all.Zeroing(), zt_ref, zt_ff);
__ Eor(pg_diff.VnB(), all.Zeroing(), pg_diff.VnB(), pg_ff.VnB());
__ Incp(ffr_check_count, pg_diff);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(zt_ref, zt);
ASSERT_EQUAL_64(0, ffr_check_count);
}
free(reinterpret_cast<void*>(data));
}
// Test gather loads by comparing them with the result of a set of equivalent
// scalar loads.
template <typename F>
static void GatherLoadScalarPlusScalarOrImmHelper(Test* config,
unsigned msize_in_bits,
unsigned esize_in_bits,
F sve_ld1,
bool is_signed) {
// SVE supports 32- and 64-bit addressing for gather loads.
VIXL_ASSERT((esize_in_bits == kSRegSize) || (esize_in_bits == kDRegSize));
static const unsigned kMaxLaneCount = kZRegMaxSize / kSRegSize;
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
unsigned msize_in_bytes = msize_in_bits / kBitsPerByte;
int vl = config->sve_vl_in_bytes();
uint64_t addresses[kMaxLaneCount];
uint64_t offsets[kMaxLaneCount];
uint64_t max_address = 0;
uint64_t buffer_size = vl * 64;
uint64_t data = reinterpret_cast<uintptr_t>(malloc(buffer_size));
BufferFillingHelper(data,
buffer_size,
msize_in_bytes,
kMaxLaneCount,
offsets,
addresses,
&max_address);
// Use maximised offsets to check that the address calculation is performed
// modulo 2^64, and that the vector addresses are not sign-extended.
uint64_t uint_e_max = (esize_in_bits == kDRegSize) ? UINT64_MAX : UINT32_MAX;
uint64_t maxed_offsets[kMaxLaneCount];
uint64_t maxed_offsets_imm = max_address - uint_e_max;
for (unsigned i = 0; i < kMaxLaneCount; i++) {
maxed_offsets[i] = addresses[i] - maxed_offsets_imm;
}
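// The address calculation wraps modulo 2^64 (for D-sized lanes), so:
//   maxed_offsets[i] + maxed_offsets_imm
//       == (addresses[i] - maxed_offsets_imm) + maxed_offsets_imm
//       == addresses[i]
// even though the intermediate values wrap around.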
ZRegister zn = z0.WithLaneSize(esize_in_bits);
ZRegister zt_addresses = z1.WithLaneSize(esize_in_bits);
ZRegister zt_offsets = z2.WithLaneSize(esize_in_bits);
ZRegister zt_maxed = z3.WithLaneSize(esize_in_bits);
ZRegister zt_ref = z4.WithLaneSize(esize_in_bits);
PRegisterZ pg = p0.Zeroing();
Initialise(&masm,
pg,
0x9abcdef012345678,
0xabcdef0123456789,
0xf4f3f1f0fefdfcfa,
0xf9f8f6f5f3f2f0ff);
// Execute each load.
if (esize_in_bits == kDRegSize) {
// Only test `addresses` if we can use 64-bit pointers. InsrHelper will fail
// if any value won't fit in a lane of zn.
InsrHelper(&masm, zn, addresses);
(masm.*sve_ld1)(zt_addresses, pg, SVEMemOperand(zn));
}
InsrHelper(&masm, zn, offsets);
(masm.*sve_ld1)(zt_offsets, pg, SVEMemOperand(zn, data));
InsrHelper(&masm, zn, maxed_offsets);
(masm.*sve_ld1)(zt_maxed, pg, SVEMemOperand(zn, maxed_offsets_imm));
// Generate a reference result using scalar loads.
ScalarLoadHelper(&masm,
vl,
addresses,
zt_ref,
pg,
esize_in_bits,
msize_in_bits,
is_signed);
END();
if (CAN_RUN()) {
RUN();
if (esize_in_bits == kDRegSize) {
ASSERT_EQUAL_SVE(zt_ref, zt_addresses);
}
ASSERT_EQUAL_SVE(zt_ref, zt_offsets);
ASSERT_EQUAL_SVE(zt_ref, zt_maxed);
}
free(reinterpret_cast<void*>(data));
}
TEST_SVE(sve_ld1b_64bit_vector_plus_immediate) {
GatherLoadScalarPlusScalarOrImmHelper(config,
kBRegSize,
kDRegSize,
&MacroAssembler::Ld1b,
false);
}
TEST_SVE(sve_ld1h_64bit_vector_plus_immediate) {
GatherLoadScalarPlusScalarOrImmHelper(config,
kHRegSize,
kDRegSize,
&MacroAssembler::Ld1h,
false);
}
TEST_SVE(sve_ld1w_64bit_vector_plus_immediate) {
GatherLoadScalarPlusScalarOrImmHelper(config,
kSRegSize,
kDRegSize,
&MacroAssembler::Ld1w,
false);
}
TEST_SVE(sve_ld1d_64bit_vector_plus_immediate) {
GatherLoadScalarPlusScalarOrImmHelper(config,
kDRegSize,
kDRegSize,
&MacroAssembler::Ld1d,
false);
}
TEST_SVE(sve_ld1sb_64bit_vector_plus_immediate) {
GatherLoadScalarPlusScalarOrImmHelper(config,
kBRegSize,
kDRegSize,
&MacroAssembler::Ld1sb,
true);
}
TEST_SVE(sve_ld1sh_64bit_vector_plus_immediate) {
GatherLoadScalarPlusScalarOrImmHelper(config,
kHRegSize,
kDRegSize,
&MacroAssembler::Ld1sh,
true);
}
TEST_SVE(sve_ld1sw_64bit_vector_plus_immediate) {
GatherLoadScalarPlusScalarOrImmHelper(config,
kSRegSize,
kDRegSize,
&MacroAssembler::Ld1sw,
true);
}
TEST_SVE(sve_ld1b_32bit_vector_plus_immediate) {
GatherLoadScalarPlusScalarOrImmHelper(config,
kBRegSize,
kSRegSize,
&MacroAssembler::Ld1b,
false);
}
TEST_SVE(sve_ld1h_32bit_vector_plus_immediate) {
GatherLoadScalarPlusScalarOrImmHelper(config,
kHRegSize,
kSRegSize,
&MacroAssembler::Ld1h,
false);
}
TEST_SVE(sve_ld1w_32bit_vector_plus_immediate) {
GatherLoadScalarPlusScalarOrImmHelper(config,
kSRegSize,
kSRegSize,
&MacroAssembler::Ld1w,
false);
}
TEST_SVE(sve_ld1sb_32bit_vector_plus_immediate) {
GatherLoadScalarPlusScalarOrImmHelper(config,
kBRegSize,
kSRegSize,
&MacroAssembler::Ld1sb,
true);
}
TEST_SVE(sve_ld1sh_32bit_vector_plus_immediate) {
GatherLoadScalarPlusScalarOrImmHelper(config,
kHRegSize,
kSRegSize,
&MacroAssembler::Ld1sh,
true);
}
TEST_SVE(sve_ld1_scalar_plus_vector_32_scaled_offset) {
auto ld1_32_scaled_offset_helper =
std::bind(&GatherLoadScalarPlusVectorHelper<Extend>,
config,
std::placeholders::_1,
kSRegSize,
std::placeholders::_2,
std::placeholders::_3,
std::placeholders::_4,
std::placeholders::_5,
true);
Ld1Macro ld1h = &MacroAssembler::Ld1h;
Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
ld1_32_scaled_offset_helper(kHRegSize, ld1h, ldff1h, UXTW, false);
ld1_32_scaled_offset_helper(kHRegSize, ld1h, ldff1h, SXTW, false);
Ld1Macro ld1w = &MacroAssembler::Ld1w;
Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
ld1_32_scaled_offset_helper(kSRegSize, ld1w, ldff1w, UXTW, false);
ld1_32_scaled_offset_helper(kSRegSize, ld1w, ldff1w, SXTW, false);
Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
ld1_32_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, UXTW, true);
ld1_32_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, SXTW, true);
}
TEST_SVE(sve_ld1_scalar_plus_vector_32_unscaled_offset) {
auto ld1_32_unscaled_offset_helper =
std::bind(&GatherLoadScalarPlusVectorHelper<Extend>,
config,
std::placeholders::_1,
kSRegSize,
std::placeholders::_2,
std::placeholders::_3,
std::placeholders::_4,
std::placeholders::_5,
false);
Ld1Macro ld1b = &MacroAssembler::Ld1b;
Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
ld1_32_unscaled_offset_helper(kBRegSize, ld1b, ldff1b, UXTW, false);
ld1_32_unscaled_offset_helper(kBRegSize, ld1b, ldff1b, SXTW, false);
Ld1Macro ld1h = &MacroAssembler::Ld1h;
Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
ld1_32_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, UXTW, false);
ld1_32_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, SXTW, false);
Ld1Macro ld1w = &MacroAssembler::Ld1w;
Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
ld1_32_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, UXTW, false);
ld1_32_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, SXTW, false);
Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
ld1_32_unscaled_offset_helper(kBRegSize, ld1sb, ldff1sb, UXTW, true);
ld1_32_unscaled_offset_helper(kBRegSize, ld1sb, ldff1sb, SXTW, true);
Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
ld1_32_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, UXTW, true);
ld1_32_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, SXTW, true);
}
TEST_SVE(sve_ld1_scalar_plus_vector_32_unpacked_scaled_offset) {
auto ld1_32_unpacked_scaled_offset_helper =
std::bind(&GatherLoadScalarPlusVectorHelper<Extend>,
config,
std::placeholders::_1,
kDRegSize,
std::placeholders::_2,
std::placeholders::_3,
std::placeholders::_4,
std::placeholders::_5,
true);
Ld1Macro ld1h = &MacroAssembler::Ld1h;
Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
ld1_32_unpacked_scaled_offset_helper(kHRegSize, ld1h, ldff1h, UXTW, false);
ld1_32_unpacked_scaled_offset_helper(kHRegSize, ld1h, ldff1h, SXTW, false);
Ld1Macro ld1w = &MacroAssembler::Ld1w;
Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
ld1_32_unpacked_scaled_offset_helper(kSRegSize, ld1w, ldff1w, UXTW, false);
ld1_32_unpacked_scaled_offset_helper(kSRegSize, ld1w, ldff1w, SXTW, false);
Ld1Macro ld1d = &MacroAssembler::Ld1d;
Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
ld1_32_unpacked_scaled_offset_helper(kDRegSize, ld1d, ldff1d, UXTW, false);
ld1_32_unpacked_scaled_offset_helper(kDRegSize, ld1d, ldff1d, SXTW, false);
Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
ld1_32_unpacked_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, UXTW, true);
ld1_32_unpacked_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, SXTW, true);
Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
ld1_32_unpacked_scaled_offset_helper(kSRegSize, ld1sw, ldff1sw, UXTW, true);
ld1_32_unpacked_scaled_offset_helper(kSRegSize, ld1sw, ldff1sw, SXTW, true);
}
TEST_SVE(sve_ld1_scalar_plus_vector_32_unpacked_unscaled_offset) {
auto ld1_32_unpacked_unscaled_offset_helper =
std::bind(&GatherLoadScalarPlusVectorHelper<Extend>,
config,
std::placeholders::_1,
kDRegSize,
std::placeholders::_2,
std::placeholders::_3,
std::placeholders::_4,
std::placeholders::_5,
false);
Ld1Macro ld1h = &MacroAssembler::Ld1h;
Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
ld1_32_unpacked_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, UXTW, false);
ld1_32_unpacked_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, SXTW, false);
Ld1Macro ld1w = &MacroAssembler::Ld1w;
Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
ld1_32_unpacked_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, UXTW, false);
ld1_32_unpacked_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, SXTW, false);
Ld1Macro ld1d = &MacroAssembler::Ld1d;
Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
ld1_32_unpacked_unscaled_offset_helper(kDRegSize, ld1d, ldff1d, UXTW, false);
ld1_32_unpacked_unscaled_offset_helper(kDRegSize, ld1d, ldff1d, SXTW, false);
Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
ld1_32_unpacked_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, UXTW, true);
ld1_32_unpacked_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, SXTW, true);
Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
ld1_32_unpacked_unscaled_offset_helper(kSRegSize, ld1sw, ldff1sw, UXTW, true);
ld1_32_unpacked_unscaled_offset_helper(kSRegSize, ld1sw, ldff1sw, SXTW, true);
}
TEST_SVE(sve_ld1_scalar_plus_vector_64_scaled_offset) {
auto ld1_64_scaled_offset_helper =
std::bind(&GatherLoadScalarPlusVectorHelper<Shift>,
config,
std::placeholders::_1,
kDRegSize,
std::placeholders::_2,
std::placeholders::_3,
LSL,
std::placeholders::_4,
true);
Ld1Macro ld1h = &MacroAssembler::Ld1h;
Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
ld1_64_scaled_offset_helper(kHRegSize, ld1h, ldff1h, false);
Ld1Macro ld1w = &MacroAssembler::Ld1w;
Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
ld1_64_scaled_offset_helper(kSRegSize, ld1w, ldff1w, false);
Ld1Macro ld1d = &MacroAssembler::Ld1d;
Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
ld1_64_scaled_offset_helper(kDRegSize, ld1d, ldff1d, false);
Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
ld1_64_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, true);
Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
ld1_64_scaled_offset_helper(kSRegSize, ld1sw, ldff1sw, true);
}
TEST_SVE(sve_ld1_scalar_plus_vector_64_unscaled_offset) {
auto ld1_64_unscaled_offset_helper =
std::bind(&GatherLoadScalarPlusVectorHelper<Shift>,
config,
std::placeholders::_1,
kDRegSize,
std::placeholders::_2,
std::placeholders::_3,
NO_SHIFT,
std::placeholders::_4,
false);
Ld1Macro ld1b = &MacroAssembler::Ld1b;
Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
ld1_64_unscaled_offset_helper(kBRegSize, ld1b, ldff1b, false);
Ld1Macro ld1h = &MacroAssembler::Ld1h;
Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
ld1_64_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, false);
Ld1Macro ld1w = &MacroAssembler::Ld1w;
Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
ld1_64_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, false);
Ld1Macro ld1d = &MacroAssembler::Ld1d;
Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
ld1_64_unscaled_offset_helper(kDRegSize, ld1d, ldff1d, false);
Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
ld1_64_unscaled_offset_helper(kBRegSize, ld1sb, ldff1sb, true);
Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
ld1_64_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, true);
Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
ld1_64_unscaled_offset_helper(kSRegSize, ld1sw, ldff1sw, true);
}
TEST_SVE(sve_ldnt1) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int data_size = kZRegMaxSizeInBytes * 16;
uint8_t* data = new uint8_t[data_size];
for (int i = 0; i < data_size; i++) {
data[i] = i & 0xff;
}
// Set the base half-way through the buffer so we can use negative indices.
__ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
__ Ptrue(p0.VnB());
__ Punpklo(p1.VnH(), p0.VnB());
__ Punpklo(p2.VnH(), p1.VnB());
__ Punpklo(p3.VnH(), p2.VnB());
__ Punpklo(p4.VnH(), p3.VnB());
__ Mov(x1, 42);
__ Ld1b(z0.VnB(), p1.Zeroing(), SVEMemOperand(x0, x1));
__ Ldnt1b(z1.VnB(), p1.Zeroing(), SVEMemOperand(x0, x1));
__ Mov(x1, -21);
__ Ld1h(z2.VnH(), p2.Zeroing(), SVEMemOperand(x0, x1, LSL, 1));
__ Ldnt1h(z3.VnH(), p2.Zeroing(), SVEMemOperand(x0, x1, LSL, 1));
__ Mov(x1, 10);
__ Ld1w(z4.VnS(), p3.Zeroing(), SVEMemOperand(x0, x1, LSL, 2));
__ Ldnt1w(z5.VnS(), p3.Zeroing(), SVEMemOperand(x0, x1, LSL, 2));
__ Mov(x1, -5);
__ Ld1d(z6.VnD(), p4.Zeroing(), SVEMemOperand(x0, x1, LSL, 3));
__ Ldnt1d(z7.VnD(), p4.Zeroing(), SVEMemOperand(x0, x1, LSL, 3));
__ Ld1b(z8.VnB(), p1.Zeroing(), SVEMemOperand(x0, 1, SVE_MUL_VL));
__ Ldnt1b(z9.VnB(), p1.Zeroing(), SVEMemOperand(x0, 1, SVE_MUL_VL));
__ Ld1h(z10.VnH(), p2.Zeroing(), SVEMemOperand(x0, -1, SVE_MUL_VL));
__ Ldnt1h(z11.VnH(), p2.Zeroing(), SVEMemOperand(x0, -1, SVE_MUL_VL));
__ Ld1w(z12.VnS(), p3.Zeroing(), SVEMemOperand(x0, 7, SVE_MUL_VL));
__ Ldnt1w(z13.VnS(), p3.Zeroing(), SVEMemOperand(x0, 7, SVE_MUL_VL));
__ Ld1d(z14.VnD(), p4.Zeroing(), SVEMemOperand(x0, -8, SVE_MUL_VL));
__ Ldnt1d(z15.VnD(), p4.Zeroing(), SVEMemOperand(x0, -8, SVE_MUL_VL));
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z0, z1);
ASSERT_EQUAL_SVE(z2, z3);
ASSERT_EQUAL_SVE(z4, z5);
ASSERT_EQUAL_SVE(z6, z7);
ASSERT_EQUAL_SVE(z8, z9);
ASSERT_EQUAL_SVE(z10, z11);
ASSERT_EQUAL_SVE(z12, z13);
ASSERT_EQUAL_SVE(z14, z15);
}
delete[] data;
}
TEST_SVE(sve_stnt1) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int data_size = kZRegMaxSizeInBytes * 16;
uint8_t* data = new uint8_t[data_size];
// Set the base half-way through the buffer so we can use negative indices.
__ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
__ Ptrue(p0.VnB());
__ Punpklo(p1.VnH(), p0.VnB());
__ Punpklo(p2.VnH(), p1.VnB());
__ Punpklo(p3.VnH(), p2.VnB());
__ Punpklo(p4.VnH(), p3.VnB());
__ Dup(z0.VnB(), 0x55);
__ Index(z1.VnB(), 0, 1);
// Store with all-true and patterned predication, load back, and create a
// reference value for later comparison.
__ Rdvl(x1, 1);
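// `Rdvl(x1, 1)` sets x1 to the vector length in bytes, so the
// scalar-plus-scalar store below targets the same region as the
// `SVE_MUL_VL` form: both stores land one vector length above x0.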
__ Stnt1b(z0.VnB(), p0, SVEMemOperand(x0, x1));
__ Stnt1b(z1.VnB(), p1, SVEMemOperand(x0, 1, SVE_MUL_VL));
__ Ld1b(z2.VnB(), p0.Zeroing(), SVEMemOperand(x0, x1));
__ Sel(z3.VnB(), p1, z1.VnB(), z0.VnB());
// Repeated, with wider elements and different offsets.
__ Rdvl(x1, -1);
__ Lsr(x1, x1, 1);
__ Stnt1h(z0.VnH(), p0, SVEMemOperand(x0, x1, LSL, 1));
__ Stnt1h(z1.VnH(), p2, SVEMemOperand(x0, -1, SVE_MUL_VL));
__ Ld1b(z4.VnB(), p0.Zeroing(), SVEMemOperand(x0, x1, LSL, 1));
__ Sel(z5.VnH(), p2, z1.VnH(), z0.VnH());
__ Rdvl(x1, 7);
__ Lsr(x1, x1, 2);
__ Stnt1w(z0.VnS(), p0, SVEMemOperand(x0, x1, LSL, 2));
__ Stnt1w(z1.VnS(), p3, SVEMemOperand(x0, 7, SVE_MUL_VL));
__ Ld1b(z6.VnB(), p0.Zeroing(), SVEMemOperand(x0, x1, LSL, 2));
__ Sel(z7.VnS(), p3, z1.VnS(), z0.VnS());
__ Rdvl(x1, -8);
__ Lsr(x1, x1, 3);
__ Stnt1d(z0.VnD(), p0, SVEMemOperand(x0, x1, LSL, 3));
__ Stnt1d(z1.VnD(), p4, SVEMemOperand(x0, -8, SVE_MUL_VL));
__ Ld1b(z8.VnB(), p0.Zeroing(), SVEMemOperand(x0, x1, LSL, 3));
__ Sel(z9.VnD(), p4, z1.VnD(), z0.VnD());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z2, z3);
ASSERT_EQUAL_SVE(z4, z5);
ASSERT_EQUAL_SVE(z6, z7);
ASSERT_EQUAL_SVE(z8, z9);
}
delete[] data;
}
TEST_SVE(sve_ld1rq) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int data_size = (kQRegSizeInBytes + 128) * 2;
uint8_t* data = new uint8_t[data_size];
for (int i = 0; i < data_size; i++) {
data[i] = i & 0xff;
}
// Set the base half-way through the buffer so we can use negative indices.
__ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
__ Index(z0.VnB(), 0, 1);
__ Ptrue(p0.VnB());
__ Cmplo(p0.VnB(), p0.Zeroing(), z0.VnB(), 4);
__ Pfalse(p1.VnB());
__ Zip1(p1.VnB(), p0.VnB(), p1.VnB());
// Load and broadcast using scalar offsets.
__ Mov(x1, -42);
__ Ld1rqb(z0.VnB(), p1.Zeroing(), SVEMemOperand(x0, x1));
__ Add(x2, x0, 1);
__ Mov(x1, -21);
__ Punpklo(p2.VnH(), p1.VnB());
__ Ld1rqh(z1.VnH(), p2.Zeroing(), SVEMemOperand(x2, x1, LSL, 1));
__ Add(x2, x2, 1);
__ Mov(x1, -10);
__ Punpklo(p3.VnH(), p2.VnB());
__ Ld1rqw(z2.VnS(), p3.Zeroing(), SVEMemOperand(x2, x1, LSL, 2));
__ Add(x2, x2, 1);
__ Mov(x1, 5);
__ Punpklo(p4.VnH(), p3.VnB());
__ Ld1rqd(z3.VnD(), p4.Zeroing(), SVEMemOperand(x2, x1, LSL, 3));
// Check that all segments match by rotating the vector by one segment,
// eoring, and orring across the vector.
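// If every segment holds the same value, rotating by one segment maps each
// segment onto an identical one, so the Eor result is all-zero and the Orv
// reduction yields zero.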
__ Mov(z4, z0);
__ Ext(z4.VnB(), z4.VnB(), z4.VnB(), 16);
__ Eor(z4.VnB(), z4.VnB(), z0.VnB());
__ Orv(b4, p0, z4.VnB());
__ Mov(z5, z1);
__ Ext(z5.VnB(), z5.VnB(), z5.VnB(), 16);
__ Eor(z5.VnB(), z5.VnB(), z1.VnB());
__ Orv(b5, p0, z5.VnB());
__ Orr(z4, z4, z5);
__ Mov(z5, z2);
__ Ext(z5.VnB(), z5.VnB(), z5.VnB(), 16);
__ Eor(z5.VnB(), z5.VnB(), z2.VnB());
__ Orv(b5, p0, z5.VnB());
__ Orr(z4, z4, z5);
__ Mov(z5, z3);
__ Ext(z5.VnB(), z5.VnB(), z5.VnB(), 16);
__ Eor(z5.VnB(), z5.VnB(), z3.VnB());
__ Orv(b5, p0, z5.VnB());
__ Orr(z4, z4, z5);
// Load and broadcast the same values, using immediate offsets.
__ Add(x1, x0, 6);
__ Ld1rqb(z5.VnB(), p1.Zeroing(), SVEMemOperand(x1, -48));
__ Add(x1, x0, -9);
__ Ld1rqh(z6.VnH(), p2.Zeroing(), SVEMemOperand(x1, -32));
__ Add(x1, x0, -70);
__ Ld1rqw(z7.VnS(), p3.Zeroing(), SVEMemOperand(x1, 32));
__ Add(x1, x0, 27);
__ Ld1rqd(z8.VnD(), p4.Zeroing(), SVEMemOperand(x1, 16));
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z0[] = {0x0000000000000000, 0x006c006a00680066};
uint64_t expected_z1[] = {0x000074730000706f, 0x00006c6b00006867};
uint64_t expected_z2[] = {0x0000000075747372, 0x000000006d6c6b6a};
uint64_t expected_z3[] = {0x0000000000000000, 0xc2c1c0bfbebdbcbb};
uint64_t expected_z4[] = {0, 0};
ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
ASSERT_EQUAL_SVE(z0, z5);
ASSERT_EQUAL_SVE(z1, z6);
ASSERT_EQUAL_SVE(z2, z7);
ASSERT_EQUAL_SVE(z3, z8);
}
delete[] data;
}
TEST_SVE(sve_st1_vec_imm) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kNEON, CPUFeatures::kSVE);
START();
// TODO: Use mmap() to request a buffer in the low 4GB, which allows testing
// 32-bit address vectors.
int data_size = kZRegMaxSizeInBytes * 16;
uint8_t* data = new uint8_t[data_size];
// Set the base to 16 bytes from the end of the buffer so we can use negative
// indices.
__ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size - 16]));
__ Ptrue(p0.VnB());
// Store a vector of index values in reverse order, using
// vector-plus-immediate addressing to begin at byte 15, then storing to
// bytes 14, 13, etc.
__ Index(z1.VnD(), x0, -1);
__ Index(z2.VnD(), 0, 1);
// Iterate in order to store at least 16 bytes. The number of iterations
// depends on VL; e.g. VL128 iterates eight times, storing bytes 15 and 14 on
// the first iteration, 13 and 12 on the next, and so on.
uint64_t dlanes = config->sve_vl_in_bytes() / kDRegSizeInBytes;
for (int i = 15; i >= 0; i -= dlanes * kBRegSizeInBytes) {
__ St1b(z2.VnD(), p0, SVEMemOperand(z1.VnD(), i));
__ Incd(z2.VnD());
}
// Reload the stored data, and build a reference for comparison. The reference
// is truncated to a Q register, as only the least-significant 128 bits are
// checked.
__ Ldr(q4, MemOperand(x0));
__ Index(z5.VnB(), 15, -1);
__ Mov(q5, q5);
// Repeat for wider elements.
__ Index(z1.VnD(), x0, -2); // Stepping by -2 for H-sized elements.
__ Index(z2.VnD(), 0, 1);
for (int i = 14; i >= 0; i -= dlanes * kHRegSizeInBytes) {
__ St1h(z2.VnD(), p0, SVEMemOperand(z1.VnD(), i));
__ Incd(z2.VnD());
}
__ Ldr(q6, MemOperand(x0));
__ Index(z7.VnH(), 7, -1);
__ Mov(q7, q7);
__ Index(z1.VnD(), x0, -4); // Stepping by -4 for S-sized elements.
__ Index(z2.VnD(), 0, 1);
for (int i = 12; i >= 0; i -= dlanes * kSRegSizeInBytes) {
__ St1w(z2.VnD(), p0, SVEMemOperand(z1.VnD(), i));
__ Incd(z2.VnD());
}
__ Ldr(q8, MemOperand(x0));
__ Index(z9.VnS(), 3, -1);
__ Mov(q9, q9);
__ Index(z1.VnD(), x0, -8); // Stepping by -8 for D-sized elements.
__ Index(z2.VnD(), 0, 1);
for (int i = 8; i >= 0; i -= dlanes * kDRegSizeInBytes) {
__ St1d(z2.VnD(), p0, SVEMemOperand(z1.VnD(), i));
__ Incd(z2.VnD());
}
__ Ldr(q10, MemOperand(x0));
__ Index(z11.VnD(), 1, -1);
__ Mov(q11, q11);
// Test predication by storing even halfwords to memory (using predication)
// at byte-separated addresses. The result should be the same as storing
// even halfwords contiguously to memory.
__ Pfalse(p1.VnB());
__ Zip1(p1.VnD(), p0.VnD(), p1.VnD());
__ Mov(x0, reinterpret_cast<uintptr_t>(data));
__ Index(z1.VnD(), x0, 1);
__ Index(z2.VnD(), 0x1000, 1);
for (int i = 0; i < 16; i += dlanes) {
__ St1h(z2.VnD(), p1, SVEMemOperand(z1.VnD(), i));
__ Incd(z2.VnD());
}
__ Ldr(q2, MemOperand(x0));
__ Index(z3.VnH(), 0x1000, 2);
__ Mov(q3, q3);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z3, z2);
ASSERT_EQUAL_SVE(z5, z4);
ASSERT_EQUAL_SVE(z7, z6);
ASSERT_EQUAL_SVE(z9, z8);
ASSERT_EQUAL_SVE(z11, z10);
}
delete[] data;
}
template <typename T>
static void sve_st1_scalar_plus_vector_helper(Test* config,
int esize_in_bits,
T mod,
bool is_scaled) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int vl = config->sve_vl_in_bytes();
int data_size = vl * 160;
uint8_t* data = new uint8_t[data_size];
memset(data, 0, data_size);
int vl_per_esize = vl / (esize_in_bits / kBitsPerByte);
ZRegister zn_b = z0.WithLaneSize(esize_in_bits);
ZRegister zn_h = z1.WithLaneSize(esize_in_bits);
ZRegister zn_s = z2.WithLaneSize(esize_in_bits);
ZRegister zn_d = z3.WithLaneSize(esize_in_bits);
ZRegister zn_ld_b = z10.WithLaneSize(esize_in_bits);
ZRegister zn_ld_h = z11.WithLaneSize(esize_in_bits);
ZRegister zn_ld_s = z12.WithLaneSize(esize_in_bits);
ZRegister zn_ld_d = z13.WithLaneSize(esize_in_bits);
ZRegister offsets = z31.WithLaneSize(esize_in_bits);
// Set the base half-way through the buffer so we can use negative indices.
__ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
__ Ptrue(p6.WithLaneSize(esize_in_bits));
__ Pfalse(p7.WithLaneSize(esize_in_bits));
__ Zip1(p0.WithLaneSize(esize_in_bits),
p6.WithLaneSize(esize_in_bits),
p7.WithLaneSize(esize_in_bits));
__ Zip1(p1.WithLaneSize(esize_in_bits),
p7.WithLaneSize(esize_in_bits),
p6.WithLaneSize(esize_in_bits));
// `st1b` doesn't have the scaled-offset forms.
if (!is_scaled) {
// Step the index by 2 to simulate a scatter memory access.
__ Index(offsets, 1, 2);
__ St1b(offsets, p0, SVEMemOperand(x0, offsets, mod));
__ Ld1b(zn_ld_b, p0.Zeroing(), SVEMemOperand(x0, offsets, mod));
__ Dup(zn_b, 0);
__ Mov(zn_b, p0.Merging(), offsets);
}
// Store the values to an isolated range that does not overlap the other
// stores.
int scale = is_scaled ? 1 : 0;
__ Add(x1, x0, vl_per_esize * 4);
__ Index(offsets, 6, 4);
__ St1h(offsets, p0, SVEMemOperand(x1, offsets, mod, scale));
__ Ld1h(zn_ld_h, p0.Zeroing(), SVEMemOperand(x1, offsets, mod, scale));
__ Dup(zn_h, 0);
__ Mov(zn_h, p0.Merging(), offsets);
scale = is_scaled ? 2 : 0;
__ Add(x2, x0, UINT64_MAX + (vl_per_esize * -8) + 1);
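// `UINT64_MAX + n + 1` is congruent to `n` modulo 2^64, so with
// n == vl_per_esize * -8 this computes x2 = x0 - (vl_per_esize * 8). The
// same wrap-around trick is used for x3 below.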
__ Index(offsets, 64, 8);
if ((std::is_same<T, vixl::aarch64::Extend>::value) &&
(static_cast<int>(mod) == SXTW)) {
// Testing negative offsets.
__ Neg(offsets, p6.Merging(), offsets);
}
__ St1w(offsets, p1, SVEMemOperand(x2, offsets, mod, scale));
__ Ld1w(zn_ld_s, p1.Zeroing(), SVEMemOperand(x2, offsets, mod, scale));
__ Dup(zn_s, 0);
__ Mov(zn_s, p1.Merging(), offsets);
if (esize_in_bits == kDRegSize) {
// Test st1w by comparing each 32-bit value loaded with the corresponding
// 32-bit value stored.
__ Lsl(zn_s, zn_s, kSRegSize);
__ Lsr(zn_s, zn_s, kSRegSize);
}
// `st1d` doesn't have the S-sized lane forms.
if (esize_in_bits == kDRegSize) {
scale = is_scaled ? 3 : 0;
__ Add(x3, x0, UINT64_MAX + (vl_per_esize * -16) + 1);
__ Index(offsets, 128, 16);
if ((std::is_same<T, vixl::aarch64::Extend>::value) &&
(static_cast<int>(mod) == SXTW)) {
__ Neg(offsets, p6.Merging(), offsets);
}
__ St1d(offsets, p1, SVEMemOperand(x3, offsets, mod, scale));
__ Ld1d(zn_ld_d, p1.Zeroing(), SVEMemOperand(x3, offsets, mod, scale));
__ Dup(zn_d, 0);
__ Mov(zn_d, p1.Merging(), offsets);
}
END();
if (CAN_RUN()) {
RUN();
if (!is_scaled) {
ASSERT_EQUAL_SVE(zn_ld_b, zn_b);
}
ASSERT_EQUAL_SVE(zn_ld_h, zn_h);
ASSERT_EQUAL_SVE(zn_ld_s, zn_s);
if (esize_in_bits == kDRegSize) {
ASSERT_EQUAL_SVE(zn_ld_d, zn_d);
}
}
delete[] data;
}
TEST_SVE(sve_st1_sca_vec_32_unpacked_unscaled) {
sve_st1_scalar_plus_vector_helper(config, kDRegSize, UXTW, false);
sve_st1_scalar_plus_vector_helper(config, kDRegSize, SXTW, false);
}
TEST_SVE(sve_st1_sca_vec_32_unpacked_scaled) {
sve_st1_scalar_plus_vector_helper(config, kDRegSize, UXTW, true);
sve_st1_scalar_plus_vector_helper(config, kDRegSize, SXTW, true);
}
TEST_SVE(sve_st1_sca_vec_32_unscaled) {
sve_st1_scalar_plus_vector_helper(config, kSRegSize, UXTW, false);
sve_st1_scalar_plus_vector_helper(config, kSRegSize, SXTW, false);
}
TEST_SVE(sve_st1_sca_vec_32_scaled) {
sve_st1_scalar_plus_vector_helper(config, kSRegSize, UXTW, true);
sve_st1_scalar_plus_vector_helper(config, kSRegSize, SXTW, true);
}
TEST_SVE(sve_st1_sca_vec_64_scaled) {
sve_st1_scalar_plus_vector_helper(config, kDRegSize, LSL, true);
}
TEST_SVE(sve_st1_sca_vec_64_unscaled) {
sve_st1_scalar_plus_vector_helper(config, kDRegSize, NO_SHIFT, false);
}
typedef void (MacroAssembler::*IntWideImmFn)(const ZRegister& zd,
const ZRegister& zn,
const IntegerOperand imm);
template <typename F, typename Td, typename Tn>
static void IntWideImmHelper(Test* config,
F macro,
unsigned lane_size_in_bits,
const Tn& zn_inputs,
IntegerOperand imm,
const Td& zd_expected) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
ZRegister zd1 = z0.WithLaneSize(lane_size_in_bits);
InsrHelper(&masm, zd1, zn_inputs);
// Also test with a different zn, to test the movprfx case.
ZRegister zn = z1.WithLaneSize(lane_size_in_bits);
InsrHelper(&masm, zn, zn_inputs);
ZRegister zd2 = z2.WithLaneSize(lane_size_in_bits);
ZRegister zn_copy = z3.WithSameLaneSizeAs(zn);
// Make a copy so we can check that constructive operations preserve zn.
__ Mov(zn_copy, zn);
{
UseScratchRegisterScope temps(&masm);
// The MacroAssembler needs a P scratch register for some of these macros,
// and it doesn't have one by default.
temps.Include(p3);
(masm.*macro)(zd1, zd1, imm);
(masm.*macro)(zd2, zn, imm);
}
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(zd_expected, zd1);
// Check that the result of the constructive form (which may use movprfx) is
// the same as that of the destructive form.
ASSERT_EQUAL_SVE(zd_expected, zd2);
ASSERT_EQUAL_SVE(zn_copy, zn);
}
}
TEST_SVE(sve_int_wide_imm_unpredicated_smax) {
int in_b[] = {0, -128, 127, -127, 126, 1, -1, 55};
int in_h[] = {0, -128, 127, INT16_MIN, INT16_MAX, 1, -1, 5555};
int in_s[] = {0, -128, 127, INT32_MIN, INT32_MAX, 1, -1, 555555};
int64_t in_d[] = {1, 10, 10000, 1000000};
IntWideImmFn fn = &MacroAssembler::Smax;
int exp_b_1[] = {0, -1, 127, -1, 126, 1, -1, 55};
int exp_h_1[] = {127, 127, 127, 127, INT16_MAX, 127, 127, 5555};
int exp_s_1[] = {0, -128, 127, -128, INT32_MAX, 1, -1, 555555};
int64_t exp_d_1[] = {99, 99, 10000, 1000000};
IntWideImmHelper(config, fn, kBRegSize, in_b, -1, exp_b_1);
IntWideImmHelper(config, fn, kHRegSize, in_h, 127, exp_h_1);
IntWideImmHelper(config, fn, kSRegSize, in_s, -128, exp_s_1);
IntWideImmHelper(config, fn, kDRegSize, in_d, 99, exp_d_1);
int exp_h_2[] = {0, -128, 127, -255, INT16_MAX, 1, -1, 5555};
int exp_s_2[] = {2048, 2048, 2048, 2048, INT32_MAX, 2048, 2048, 555555};
int64_t exp_d_2[] = {INT16_MAX, INT16_MAX, INT16_MAX, 1000000};
// The immediate is in the range [-128, 127], but the macro is able to
// synthesise unencodable immediates.
// B-sized lanes cannot take an immediate out of the range [-128, 127].
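// For example, `Smax` with #-255 on H-sized lanes has no direct encoding; the
// MacroAssembler is expected to materialise such an immediate in a scratch
// register and use a register form instead (hence the scratch P register made
// available in `IntWideImmHelper`).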
IntWideImmHelper(config, fn, kHRegSize, in_h, -255, exp_h_2);
IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2);
IntWideImmHelper(config, fn, kDRegSize, in_d, INT16_MAX, exp_d_2);
}
TEST_SVE(sve_int_wide_imm_unpredicated_smin) {
int in_b[] = {0, -128, 127, -127, 126, 1, -1, 55};
int in_h[] = {0, -128, 127, INT16_MIN, INT16_MAX, 1, -1, 5555};
int in_s[] = {0, -128, 127, INT32_MIN, INT32_MAX, 1, -1, 555555};
int64_t in_d[] = {1, 10, 10000, 1000000};
IntWideImmFn fn = &MacroAssembler::Smin;
int exp_b_1[] = {-1, -128, -1, -127, -1, -1, -1, -1};
int exp_h_1[] = {0, -128, 127, INT16_MIN, 127, 1, -1, 127};
int exp_s_1[] = {-128, -128, -128, INT32_MIN, -128, -128, -128, -128};
int64_t exp_d_1[] = {1, 10, 99, 99};
IntWideImmHelper(config, fn, kBRegSize, in_b, -1, exp_b_1);
IntWideImmHelper(config, fn, kHRegSize, in_h, 127, exp_h_1);
IntWideImmHelper(config, fn, kSRegSize, in_s, -128, exp_s_1);
IntWideImmHelper(config, fn, kDRegSize, in_d, 99, exp_d_1);
int exp_h_2[] = {-255, -255, -255, INT16_MIN, -255, -255, -255, -255};
int exp_s_2[] = {0, -128, 127, INT32_MIN, 2048, 1, -1, 2048};
int64_t exp_d_2[] = {1, 10, 10000, INT16_MAX};
// The immediate is in the range [-128, 127], but the macro is able to
// synthesise unencodable immediates.
// B-sized lanes cannot take an immediate out of the range [-128, 127].
IntWideImmHelper(config, fn, kHRegSize, in_h, -255, exp_h_2);
IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2);
IntWideImmHelper(config, fn, kDRegSize, in_d, INT16_MAX, exp_d_2);
}
TEST_SVE(sve_int_wide_imm_unpredicated_umax) {
int in_b[] = {0, 255, 127, 0x80, 1, 55};
int in_h[] = {0, 255, 127, INT16_MAX, 1, 5555};
int in_s[] = {0, 0xff, 0x7f, INT32_MAX, 1, 555555};
int64_t in_d[] = {1, 10, 10000, 1000000};
IntWideImmFn fn = &MacroAssembler::Umax;
int exp_b_1[] = {17, 255, 127, 0x80, 17, 55};
int exp_h_1[] = {127, 255, 127, INT16_MAX, 127, 5555};
int exp_s_1[] = {255, 255, 255, INT32_MAX, 255, 555555};
int64_t exp_d_1[] = {99, 99, 10000, 1000000};
IntWideImmHelper(config, fn, kBRegSize, in_b, 17, exp_b_1);
IntWideImmHelper(config, fn, kHRegSize, in_h, 0x7f, exp_h_1);
IntWideImmHelper(config, fn, kSRegSize, in_s, 0xff, exp_s_1);
IntWideImmHelper(config, fn, kDRegSize, in_d, 99, exp_d_1);
int exp_h_2[] = {511, 511, 511, INT16_MAX, 511, 5555};
int exp_s_2[] = {2048, 2048, 2048, INT32_MAX, 2048, 555555};
int64_t exp_d_2[] = {INT16_MAX, INT16_MAX, INT16_MAX, 1000000};
// The immediate is in the range [0, 255], but the macro is able to
// synthesise unencodable immediates.
// B-sized lanes cannot take an immediate out of the range [0, 255].
IntWideImmHelper(config, fn, kHRegSize, in_h, 511, exp_h_2);
IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2);
IntWideImmHelper(config, fn, kDRegSize, in_d, INT16_MAX, exp_d_2);
}
TEST_SVE(sve_int_wide_imm_unpredicated_umin) {
int in_b[] = {0, 255, 127, 0x80, 1, 55};
int in_h[] = {0, 255, 127, INT16_MAX, 1, 5555};
int in_s[] = {0, 0xff, 0x7f, INT32_MAX, 1, 555555};
int64_t in_d[] = {1, 10, 10000, 1000000};
IntWideImmFn fn = &MacroAssembler::Umin;
int exp_b_1[] = {0, 17, 17, 17, 1, 17};
int exp_h_1[] = {0, 127, 127, 127, 1, 127};
int exp_s_1[] = {0, 255, 127, 255, 1, 255};
int64_t exp_d_1[] = {1, 10, 99, 99};
IntWideImmHelper(config, fn, kBRegSize, in_b, 17, exp_b_1);
IntWideImmHelper(config, fn, kHRegSize, in_h, 0x7f, exp_h_1);
IntWideImmHelper(config, fn, kSRegSize, in_s, 255, exp_s_1);
IntWideImmHelper(config, fn, kDRegSize, in_d, 99, exp_d_1);
int exp_h_2[] = {0, 255, 127, 511, 1, 511};
int exp_s_2[] = {0, 255, 127, 2048, 1, 2048};
int64_t exp_d_2[] = {1, 10, 10000, INT16_MAX};
// The immediate is in the range [0, 255], but the macro is able to
// synthesise unencodable immediates.
// B-sized lanes cannot take an immediate out of the range [0, 255].
IntWideImmHelper(config, fn, kHRegSize, in_h, 511, exp_h_2);
IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2);
IntWideImmHelper(config, fn, kDRegSize, in_d, INT16_MAX, exp_d_2);
}
TEST_SVE(sve_int_wide_imm_unpredicated_mul) {
int in_b[] = {11, -1, 7, -3};
int in_h[] = {111, -1, 17, -123};
int in_s[] = {11111, -1, 117, -12345};
int64_t in_d[] = {0x7fffffff, 0x80000000};
IntWideImmFn fn = &MacroAssembler::Mul;
int exp_b_1[] = {66, -6, 42, -18};
int exp_h_1[] = {-14208, 128, -2176, 15744};
int exp_s_1[] = {11111 * 127, -127, 117 * 127, -12345 * 127};
int64_t exp_d_1[] = {0xfffffffe, 0x100000000};
IntWideImmHelper(config, fn, kBRegSize, in_b, 6, exp_b_1);
IntWideImmHelper(config, fn, kHRegSize, in_h, -128, exp_h_1);
IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
IntWideImmHelper(config, fn, kDRegSize, in_d, 2, exp_d_1);
int exp_h_2[] = {-28305, 255, -4335, 31365};
int exp_s_2[] = {22755328, -2048, 239616, -25282560};
int64_t exp_d_2[] = {0x00000063ffffff38, 0x0000006400000000};
// The immediate is in the range [-128, 127], but the macro is able to
// synthesise unencodable immediates.
// B-sized lanes cannot take an immediate out of the range [-128, 127].
IntWideImmHelper(config, fn, kHRegSize, in_h, -255, exp_h_2);
IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2);
IntWideImmHelper(config, fn, kDRegSize, in_d, 200, exp_d_2);
// Integer overflow on multiplication.
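// For example, 11 * 0x7f = 1397 = 0x575, which truncates to 0x75 in a B lane,
// and -1 * 0x7f = -127 = 0x81.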
unsigned exp_b_3[] = {0x75, 0x81, 0x79, 0x83};
IntWideImmHelper(config, fn, kBRegSize, in_b, 0x7f, exp_b_3);
}
TEST_SVE(sve_int_wide_imm_unpredicated_add) {
unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};
IntWideImmFn fn = &MacroAssembler::Add;
unsigned exp_b_1[] = {0x02, 0x00, 0x91, 0x80};
unsigned exp_h_1[] = {0x8191, 0x7f8f, 0x1020, 0xaaba};
unsigned exp_s_1[] = {0x80018200, 0x7fff7ffe, 0xaaaaab29, 0xf000f16f};
uint64_t exp_d_1[] = {0x8000000180018280, 0x7fffffff7fff807e};
// Encodable with `add` (shift 0).
IntWideImmHelper(config, fn, kBRegSize, in_b, 0x81, exp_b_1);
IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff, exp_d_1);
unsigned exp_h_2[] = {0x9181, 0x8f7f, 0x2010, 0xbaaa};
unsigned exp_s_2[] = {0x80020081, 0x7ffffe7f, 0xaaab29aa, 0xf0016ff0};
uint64_t exp_d_2[] = {0x8000000180028081, 0x7fffffff80007e7f};
// Encodable with `add` (shift 8).
// B-sized lanes cannot take a shift of 8.
IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
unsigned exp_s_3[] = {0x80808181, 0x807e7f7f, 0xab29aaaa, 0xf07ff0f0};
// The macro is able to synthesise unencodable immediates.
IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 16, exp_s_3);
unsigned exp_b_4[] = {0x61, 0x5f, 0xf0, 0xdf};
unsigned exp_h_4[] = {0x6181, 0x5f7f, 0xf010, 0x8aaa};
unsigned exp_s_4[] = {0x00018181, 0xffff7f7f, 0x2aaaaaaa, 0x7000f0f0};
uint64_t exp_d_4[] = {0x8000000180018180, 0x7fffffff7fff7f7e};
// Negative immediates use `sub`.
IntWideImmHelper(config, fn, kBRegSize, in_b, -0x20, exp_b_4);
IntWideImmHelper(config, fn, kHRegSize, in_h, -0x2000, exp_h_4);
IntWideImmHelper(config, fn, kSRegSize, in_s, INT32_MIN, exp_s_4);
IntWideImmHelper(config, fn, kDRegSize, in_d, -1, exp_d_4);
}
TEST_SVE(sve_int_wide_imm_unpredicated_sqadd) {
unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};
IntWideImmFn fn = &MacroAssembler::Sqadd;
unsigned exp_b_1[] = {0x02, 0x7f, 0x7f, 0x7f};
unsigned exp_h_1[] = {0x8191, 0x7f8f, 0x1020, 0xaaba};
unsigned exp_s_1[] = {0x80018200, 0x7fff7ffe, 0xaaaaab29, 0xf000f16f};
uint64_t exp_d_1[] = {0x8000000180018280, 0x7fffffff7fff807e};
// Encodable with `sqadd` (shift 0).
// Note that encodable immediates are unsigned, even for signed saturation.
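// For example, with an immediate of 129: 0x7f (+127) + 129 saturates to 0x7f
// (INT8_MAX), whereas 0x81 (-127) + 129 = +2 = 0x02.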
IntWideImmHelper(config, fn, kBRegSize, in_b, 129, exp_b_1);
IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
IntWideImmHelper(config, fn, kDRegSize, in_d, 255, exp_d_1);
unsigned exp_h_2[] = {0x9181, 0x7fff, 0x2010, 0xbaaa};
unsigned exp_s_2[] = {0x80020081, 0x7ffffe7f, 0xaaab29aa, 0xf0016ff0};
uint64_t exp_d_2[] = {0x8000000180028081, 0x7fffffff80007e7f};
// Encodable with `sqadd` (shift 8).
// B-sized lanes cannot take a shift of 8.
IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
}
TEST_SVE(sve_int_wide_imm_unpredicated_uqadd) {
unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};
IntWideImmFn fn = &MacroAssembler::Uqadd;
unsigned exp_b_1[] = {0xff, 0xff, 0x91, 0xff};
unsigned exp_h_1[] = {0x8191, 0x7f8f, 0x1020, 0xaaba};
unsigned exp_s_1[] = {0x80018200, 0x7fff7ffe, 0xaaaaab29, 0xf000f16f};
uint64_t exp_d_1[] = {0x8000000180018280, 0x7fffffff7fff807e};
// Encodable with `uqadd` (shift 0).
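// For example, 0x81 + 0x81 = 0x102 saturates to 0xff (UINT8_MAX), whereas
// 0x10 + 0x81 = 0x91 does not saturate.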
IntWideImmHelper(config, fn, kBRegSize, in_b, 0x81, exp_b_1);
IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff, exp_d_1);
unsigned exp_h_2[] = {0x9181, 0x8f7f, 0x2010, 0xbaaa};
unsigned exp_s_2[] = {0x80020081, 0x7ffffe7f, 0xaaab29aa, 0xf0016ff0};
uint64_t exp_d_2[] = {0x8000000180028081, 0x7fffffff80007e7f};
// Encodable with `uqadd` (shift 8).
// B-sized lanes cannot take a shift of 8.
IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
}
TEST_SVE(sve_int_wide_imm_unpredicated_sub) {
unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};
IntWideImmFn fn = &MacroAssembler::Sub;
unsigned exp_b_1[] = {0x00, 0xfe, 0x8f, 0x7e};
unsigned exp_h_1[] = {0x8171, 0x7f6f, 0x1000, 0xaa9a};
unsigned exp_s_1[] = {0x80018102, 0x7fff7f00, 0xaaaaaa2b, 0xf000f071};
uint64_t exp_d_1[] = {0x8000000180018082, 0x7fffffff7fff7e80};
// Encodable with `sub` (shift 0).
IntWideImmHelper(config, fn, kBRegSize, in_b, 0x81, exp_b_1);
IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff, exp_d_1);
unsigned exp_h_2[] = {0x7181, 0x6f7f, 0x0010, 0x9aaa};
unsigned exp_s_2[] = {0x80010281, 0x7fff007f, 0xaaaa2baa, 0xf00071f0};
uint64_t exp_d_2[] = {0x8000000180008281, 0x7fffffff7ffe807f};
// Encodable with `sub` (shift 8).
// B-sized lanes cannot take a shift of 8.
IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
unsigned exp_s_3[] = {0x7f828181, 0x7f807f7f, 0xaa2baaaa, 0xef81f0f0};
// The macro is able to synthesise unencodable immediates.
IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 16, exp_s_3);
unsigned exp_b_4[] = {0xa1, 0x9f, 0x30, 0x1f};
unsigned exp_h_4[] = {0xa181, 0x9f7f, 0x3010, 0xcaaa};
unsigned exp_s_4[] = {0x00018181, 0xffff7f7f, 0x2aaaaaaa, 0x7000f0f0};
uint64_t exp_d_4[] = {0x8000000180018182, 0x7fffffff7fff7f80};
// Negative immediates use `add`.
IntWideImmHelper(config, fn, kBRegSize, in_b, -0x20, exp_b_4);
IntWideImmHelper(config, fn, kHRegSize, in_h, -0x2000, exp_h_4);
IntWideImmHelper(config, fn, kSRegSize, in_s, INT32_MIN, exp_s_4);
IntWideImmHelper(config, fn, kDRegSize, in_d, -1, exp_d_4);
}
TEST_SVE(sve_int_wide_imm_unpredicated_sqsub) {
unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};
IntWideImmFn fn = &MacroAssembler::Sqsub;
unsigned exp_b_1[] = {0x80, 0xfe, 0x8f, 0x80};
unsigned exp_h_1[] = {0x8171, 0x7f6f, 0x1000, 0xaa9a};
unsigned exp_s_1[] = {0x80018102, 0x7fff7f00, 0xaaaaaa2b, 0xf000f071};
uint64_t exp_d_1[] = {0x8000000180018082, 0x7fffffff7fff7e80};
// Encodable with `sqsub` (shift 0).
// Note that encodable immediates are unsigned, even for signed saturation.
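// For example, with an immediate of 129: 0x81 (-127) - 129 = -256 saturates
// to 0x80 (INT8_MIN), whereas 0x7f (+127) - 129 = -2 = 0xfe.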
IntWideImmHelper(config, fn, kBRegSize, in_b, 129, exp_b_1);
IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
IntWideImmHelper(config, fn, kDRegSize, in_d, 255, exp_d_1);
unsigned exp_h_2[] = {0x8000, 0x6f7f, 0x0010, 0x9aaa};
unsigned exp_s_2[] = {0x80010281, 0x7fff007f, 0xaaaa2baa, 0xf00071f0};
uint64_t exp_d_2[] = {0x8000000180008281, 0x7fffffff7ffe807f};
// Encodable with `sqsub` (shift 8).
// B-sized lanes cannot take a shift of 8.
IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
}
TEST_SVE(sve_int_wide_imm_unpredicated_uqsub) {
unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};
IntWideImmFn fn = &MacroAssembler::Uqsub;
unsigned exp_b_1[] = {0x00, 0x00, 0x00, 0x7e};
unsigned exp_h_1[] = {0x8171, 0x7f6f, 0x1000, 0xaa9a};
unsigned exp_s_1[] = {0x80018102, 0x7fff7f00, 0xaaaaaa2b, 0xf000f071};
uint64_t exp_d_1[] = {0x8000000180018082, 0x7fffffff7fff7e80};
// Encodable with `uqsub` (shift 0).
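// For example, 0x10 - 0x81 underflows and saturates to 0x00, whereas
// 0xff - 0x81 = 0x7e does not saturate.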
IntWideImmHelper(config, fn, kBRegSize, in_b, 0x81, exp_b_1);
IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff, exp_d_1);
unsigned exp_h_2[] = {0x7181, 0x6f7f, 0x0010, 0x9aaa};
unsigned exp_s_2[] = {0x80010281, 0x7fff007f, 0xaaaa2baa, 0xf00071f0};
uint64_t exp_d_2[] = {0x8000000180008281, 0x7fffffff7ffe807f};
// Encodable with `uqsub` (shift 8).
// B-sized lanes cannot take a shift of 8.
IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
}
TEST_SVE(sve_int_wide_imm_unpredicated_subr) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// Encodable with `subr` (shift 0).
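// With the immediate as the first operand, Sub computes (imm - zn), mapping
// to `subr` where possible. For example, 100 - {1, 2, 3, ...} = {99, 98, 97, ...}.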
__ Index(z0.VnD(), 1, 1);
__ Sub(z0.VnD(), 100, z0.VnD());
__ Index(z1.VnS(), 0x7f, 1);
__ Sub(z1.VnS(), 0xf7, z1.VnS());
__ Index(z2.VnH(), 0xaaaa, 0x2222);
__ Sub(z2.VnH(), 0x80, z2.VnH());
__ Index(z3.VnB(), 133, 1);
__ Sub(z3.VnB(), 255, z3.VnB());
// Encodable with `subr` (shift 8).
__ Index(z4.VnD(), 256, -1);
__ Sub(z4.VnD(), 42 * 256, z4.VnD());
__ Index(z5.VnS(), 0x7878, 1);
__ Sub(z5.VnS(), 0x8000, z5.VnS());
__ Index(z6.VnH(), 0x30f0, -1);
__ Sub(z6.VnH(), 0x7f00, z6.VnH());
// B-sized lanes cannot take a shift of 8.
// Use a distinct destination register so that the macro selects movprfx.
__ Index(z31.VnD(), 256, 4001);
__ Sub(z7.VnD(), 42 * 256, z31.VnD());
// Outside the encodable immediate range of `subr`; the macro synthesises the
// immediate.
__ Index(z30.VnS(), 0x11223344, 1);
__ Sub(z8.VnS(), 0x88776655, z30.VnS());
END();
if (CAN_RUN()) {
RUN();
int expected_z0[] = {87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99};
ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
int expected_z1[] = {0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78};
ASSERT_EQUAL_SVE(expected_z1, z1.VnS());
int expected_z2[] = {0xab2c, 0xcd4e, 0xef70, 0x1192, 0x33b4, 0x55d6};
ASSERT_EQUAL_SVE(expected_z2, z2.VnH());
int expected_z3[] = {0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a};
ASSERT_EQUAL_SVE(expected_z3, z3.VnB());
int expected_z4[] = {10502, 10501, 10500, 10499, 10498, 10497, 10496};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
int expected_z5[] = {0x0783, 0x0784, 0x0785, 0x0786, 0x0787, 0x0788};
ASSERT_EQUAL_SVE(expected_z5, z5.VnS());
int expected_z6[] = {0x4e15, 0x4e14, 0x4e13, 0x4e12, 0x4e11, 0x4e10};
ASSERT_EQUAL_SVE(expected_z6, z6.VnH());
int expected_z7[] = {-13510, -9509, -5508, -1507, 2494, 6495, 10496};
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
int expected_z8[] = {0x7755330e, 0x7755330f, 0x77553310, 0x77553311};
ASSERT_EQUAL_SVE(expected_z8, z8.VnS());
}
}
TEST_SVE(sve_int_wide_imm_unpredicated_fdup) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// Immediates which can be encoded in the instructions.
__ Fdup(z0.VnH(), RawbitsToFloat16(0xc500));
__ Fdup(z1.VnS(), Float16(2.0));
__ Fdup(z2.VnD(), Float16(3.875));
__ Fdup(z3.VnH(), 8.0f);
__ Fdup(z4.VnS(), -4.75f);
__ Fdup(z5.VnD(), 0.5f);
__ Fdup(z6.VnH(), 1.0);
__ Fdup(z7.VnS(), 2.125);
__ Fdup(z8.VnD(), -13.0);
// Immediates which cannot be encoded in the instructions.
__ Fdup(z10.VnH(), Float16(0.0));
__ Fdup(z11.VnH(), kFP16PositiveInfinity);
__ Fdup(z12.VnS(), 255.0f);
__ Fdup(z13.VnS(), kFP32NegativeInfinity);
__ Fdup(z14.VnD(), 12.3456);
__ Fdup(z15.VnD(), kFP64PositiveInfinity);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(0xc500, z0.VnH());
ASSERT_EQUAL_SVE(0x40000000, z1.VnS());
ASSERT_EQUAL_SVE(0x400f000000000000, z2.VnD());
ASSERT_EQUAL_SVE(0x4800, z3.VnH());
ASSERT_EQUAL_SVE(FloatToRawbits(-4.75f), z4.VnS());
ASSERT_EQUAL_SVE(DoubleToRawbits(0.5), z5.VnD());
ASSERT_EQUAL_SVE(0x3c00, z6.VnH());
ASSERT_EQUAL_SVE(FloatToRawbits(2.125f), z7.VnS());
ASSERT_EQUAL_SVE(DoubleToRawbits(-13.0), z8.VnD());
ASSERT_EQUAL_SVE(0x0000, z10.VnH());
ASSERT_EQUAL_SVE(Float16ToRawbits(kFP16PositiveInfinity), z11.VnH());
ASSERT_EQUAL_SVE(FloatToRawbits(255.0), z12.VnS());
ASSERT_EQUAL_SVE(FloatToRawbits(kFP32NegativeInfinity), z13.VnS());
ASSERT_EQUAL_SVE(DoubleToRawbits(12.3456), z14.VnD());
ASSERT_EQUAL_SVE(DoubleToRawbits(kFP64PositiveInfinity), z15.VnD());
}
}
TEST_SVE(sve_andv_eorv_orv) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t in[] = {0x8899aabbccddeeff, 0x7777555533331111, 0x123456789abcdef0};
InsrHelper(&masm, z31.VnD(), in);
// For simplicity, we re-use the same pg for various lane sizes.
// For D lanes: 1, 1, 0
// For S lanes: 1, 1, 1, 0, 0
// For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
Initialise(&masm, p0.VnB(), pg_in);
// Make a copy so we can check that constructive operations preserve zn.
__ Mov(z0, z31);
__ Andv(b0, p0, z0.VnB()); // destructive
__ Andv(h1, p0, z31.VnH());
__ Mov(z2, z31);
__ Andv(s2, p0, z2.VnS()); // destructive
__ Andv(d3, p0, z31.VnD());
__ Eorv(b4, p0, z31.VnB());
__ Mov(z5, z31);
__ Eorv(h5, p0, z5.VnH()); // destructive
__ Eorv(s6, p0, z31.VnS());
__ Mov(z7, z31);
__ Eorv(d7, p0, z7.VnD()); // destructive
__ Mov(z8, z31);
__ Orv(b8, p0, z8.VnB()); // destructive
__ Orv(h9, p0, z31.VnH());
__ Mov(z10, z31);
__ Orv(s10, p0, z10.VnS()); // destructive
__ Orv(d11, p0, z31.VnD());
END();
if (CAN_RUN()) {
RUN();
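// With a 16-byte vector, only the low lanes of the 20-entry pg pattern and
// the 24-byte input apply, giving the first set of expected values; longer
// vectors use the full pattern.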
if (static_cast<int>(ArrayLength(pg_in)) >= config->sve_vl_in_bytes()) {
ASSERT_EQUAL_64(0x10, d0);
ASSERT_EQUAL_64(0x1010, d1);
ASSERT_EQUAL_64(0x33331111, d2);
ASSERT_EQUAL_64(0x7777555533331111, d3);
ASSERT_EQUAL_64(0xbf, d4);
ASSERT_EQUAL_64(0xedcb, d5);
ASSERT_EQUAL_64(0x44444444, d6);
ASSERT_EQUAL_64(0x7777555533331111, d7);
ASSERT_EQUAL_64(0xff, d8);
ASSERT_EQUAL_64(0xffff, d9);
ASSERT_EQUAL_64(0x77775555, d10);
ASSERT_EQUAL_64(0x7777555533331111, d11);
} else {
ASSERT_EQUAL_64(0, d0);
ASSERT_EQUAL_64(0x0010, d1);
ASSERT_EQUAL_64(0x00110011, d2);
ASSERT_EQUAL_64(0x0011001100110011, d3);
ASSERT_EQUAL_64(0x62, d4);
ASSERT_EQUAL_64(0x0334, d5);
ASSERT_EQUAL_64(0x8899aabb, d6);
ASSERT_EQUAL_64(0xffeeffeeffeeffee, d7);
ASSERT_EQUAL_64(0xff, d8);
ASSERT_EQUAL_64(0xffff, d9);
ASSERT_EQUAL_64(0xffffffff, d10);
ASSERT_EQUAL_64(0xffffffffffffffff, d11);
}
// Check the upper lanes above the top of the V register are all clear.
for (int i = 1; i < core.GetSVELaneCount(kDRegSize); i++) {
ASSERT_EQUAL_SVE_LANE(0, z0.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z1.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z2.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z3.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z4.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z5.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z6.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z7.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z8.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z9.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z10.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z11.VnD(), i);
}
}
}
TEST_SVE(sve_saddv_uaddv) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t in[] = {0x8899aabbccddeeff, 0x8182838485868788, 0x0807060504030201};
InsrHelper(&masm, z31.VnD(), in);
// For simplicity, we re-use the same pg for various lane sizes.
// For D lanes: 1, 1, 0
// For S lanes: 1, 1, 1, 0, 0
// For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
Initialise(&masm, p0.VnB(), pg_in);
// Make a copy so we can check that constructive operations preserve zn.
__ Mov(z0, z31);
__ Saddv(b0, p0, z0.VnB()); // destructive
__ Saddv(h1, p0, z31.VnH());
__ Mov(z2, z31);
__ Saddv(s2, p0, z2.VnS()); // destructive
__ Uaddv(b4, p0, z31.VnB());
__ Mov(z5, z31);
__ Uaddv(h5, p0, z5.VnH()); // destructive
__ Uaddv(s6, p0, z31.VnS());
__ Mov(z7, z31);
__ Uaddv(d7, p0, z7.VnD()); // destructive
END();
if (CAN_RUN()) {
RUN();
if (static_cast<int>(ArrayLength(pg_in)) >= config->sve_vl_in_bytes()) {
// Saddv
ASSERT_EQUAL_64(0xfffffffffffffda9, d0);
ASSERT_EQUAL_64(0xfffffffffffe9495, d1);
ASSERT_EQUAL_64(0xffffffff07090b0c, d2);
// Uaddv
ASSERT_EQUAL_64(0x00000000000002a9, d4);
ASSERT_EQUAL_64(0x0000000000019495, d5);
ASSERT_EQUAL_64(0x0000000107090b0c, d6);
ASSERT_EQUAL_64(0x8182838485868788, d7);
} else {
// Saddv
ASSERT_EQUAL_64(0xfffffffffffffd62, d0);
ASSERT_EQUAL_64(0xfffffffffffe8394, d1);
ASSERT_EQUAL_64(0xfffffffed3e6fa0b, d2);
// Uaddv
ASSERT_EQUAL_64(0x0000000000000562, d4);
ASSERT_EQUAL_64(0x0000000000028394, d5);
ASSERT_EQUAL_64(0x00000001d3e6fa0b, d6);
ASSERT_EQUAL_64(0x0a1c2e4052647687, d7);
}
// Check the upper lanes above the top of the V register are all clear.
for (int i = 1; i < core.GetSVELaneCount(kDRegSize); i++) {
ASSERT_EQUAL_SVE_LANE(0, z0.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z1.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z2.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z4.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z5.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z6.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z7.VnD(), i);
}
}
}
TEST_SVE(sve_sminv_uminv) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t in[] = {0xfffa5555aaaaaaaa, 0x0011223344aafe80, 0x00112233aabbfc00};
InsrHelper(&masm, z31.VnD(), in);
// For simplicity, we re-use the same pg for various lane sizes.
// For D lanes: 1, 0, 1
// For S lanes: 1, 1, 0, 0, 1
// For H lanes: 1, 1, 0, 1, 1, 0, 0, 0, 1, 1
int pg_in[] = {1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1};
Initialise(&masm, p0.VnB(), pg_in);
// Make a copy so we can check that constructive operations preserve zn.
__ Mov(z0, z31);
__ Sminv(b0, p0, z0.VnB()); // destructive
__ Sminv(h1, p0, z31.VnH());
__ Mov(z2, z31);
__ Sminv(s2, p0, z2.VnS()); // destructive
__ Sminv(d3, p0, z31.VnD());
__ Uminv(b4, p0, z31.VnB());
__ Mov(z5, z31);
__ Uminv(h5, p0, z5.VnH()); // destructive
__ Uminv(s6, p0, z31.VnS());
__ Mov(z7, z31);
__ Uminv(d7, p0, z7.VnD()); // destructive
END();
if (CAN_RUN()) {
RUN();
if (static_cast<int>(ArrayLength(pg_in)) >= config->sve_vl_in_bytes()) {
// Sminv
ASSERT_EQUAL_64(0xaa, d0);
ASSERT_EQUAL_64(0xaabb, d1);
ASSERT_EQUAL_64(0xaabbfc00, d2);
ASSERT_EQUAL_64(0x00112233aabbfc00, d3); // The lane with the smaller value is inactive.
// Uminv
ASSERT_EQUAL_64(0, d4);
ASSERT_EQUAL_64(0x2233, d5);
ASSERT_EQUAL_64(0x112233, d6);
ASSERT_EQUAL_64(0x00112233aabbfc00, d7); // The lane with the smaller value is inactive.
} else {
// Sminv
ASSERT_EQUAL_64(0xaa, d0);
ASSERT_EQUAL_64(0xaaaa, d1);
ASSERT_EQUAL_64(0xaaaaaaaa, d2);
ASSERT_EQUAL_64(0xfffa5555aaaaaaaa, d3);
// Uminv
ASSERT_EQUAL_64(0, d4);
ASSERT_EQUAL_64(0x2233, d5);
ASSERT_EQUAL_64(0x112233, d6);
ASSERT_EQUAL_64(0x00112233aabbfc00, d7);
}
// Check the upper lanes above the top of the V register are all clear.
for (int i = 1; i < core.GetSVELaneCount(kDRegSize); i++) {
ASSERT_EQUAL_SVE_LANE(0, z0.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z1.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z2.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z3.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z4.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z5.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z6.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z7.VnD(), i);
}
}
}
TEST_SVE(sve_smaxv_umaxv) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t in[] = {0xfffa5555aaaaaaaa, 0x0011223344aafe80, 0x00112233aabbfc00};
InsrHelper(&masm, z31.VnD(), in);
// For simplicity, we re-use the same pg for various lane sizes.
// For D lanes: 1, 0, 1
// For S lanes: 1, 1, 0, 0, 1
// For H lanes: 1, 1, 0, 1, 1, 0, 0, 0, 1, 1
int pg_in[] = {1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1};
Initialise(&masm, p0.VnB(), pg_in);
// Make a copy so we can check that constructive operations preserve zn.
__ Mov(z0, z31);
__ Smaxv(b0, p0, z0.VnB()); // destructive
__ Smaxv(h1, p0, z31.VnH());
__ Mov(z2, z31);
__ Smaxv(s2, p0, z2.VnS()); // destructive
__ Smaxv(d3, p0, z31.VnD());
__ Umaxv(b4, p0, z31.VnB());
__ Mov(z5, z31);
__ Umaxv(h5, p0, z5.VnH()); // destructive
__ Umaxv(s6, p0, z31.VnS());
__ Mov(z7, z31);
__ Umaxv(d7, p0, z7.VnD()); // destructive
END();
if (CAN_RUN()) {
RUN();
if (static_cast<int>(ArrayLength(pg_in)) >= config->sve_vl_in_bytes()) {
// Smaxv
ASSERT_EQUAL_64(0x33, d0);
ASSERT_EQUAL_64(0x44aa, d1);
ASSERT_EQUAL_64(0x112233, d2);
ASSERT_EQUAL_64(0x112233aabbfc00, d3);
// Umaxv
ASSERT_EQUAL_64(0xfe, d4);
ASSERT_EQUAL_64(0xfc00, d5);
ASSERT_EQUAL_64(0xaabbfc00, d6);
ASSERT_EQUAL_64(0x112233aabbfc00, d7);
} else {
// Smaxv
ASSERT_EQUAL_64(0x33, d0);
ASSERT_EQUAL_64(0x44aa, d1);
ASSERT_EQUAL_64(0x112233, d2);
ASSERT_EQUAL_64(0x00112233aabbfc00, d3);
// Umaxv
ASSERT_EQUAL_64(0xfe, d4);
ASSERT_EQUAL_64(0xfc00, d5);
ASSERT_EQUAL_64(0xaabbfc00, d6);
ASSERT_EQUAL_64(0xfffa5555aaaaaaaa, d7);
}
// Check the upper lanes above the top of the V register are all clear.
for (int i = 1; i < core.GetSVELaneCount(kDRegSize); i++) {
ASSERT_EQUAL_SVE_LANE(0, z0.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z1.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z2.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z3.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z4.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z5.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z6.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z7.VnD(), i);
}
}
}
template <typename T, size_t M, size_t N>
static void SdotUdotHelper(Test* config,
unsigned lane_size_in_bits,
const T (&zd_inputs)[M],
const T (&za_inputs)[M],
const T (&zn_inputs)[N],
const T (&zm_inputs)[N],
const T (&zd_expected)[M],
const T (&zdnm_expected)[M],
bool is_signed,
int index = -1) {
VIXL_STATIC_ASSERT(N == (M * 4));
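// Each destination lane accumulates the dot product of four consecutive
// source lanes, so the source arrays must be four times the length of the
// accumulator arrays.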
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
auto dot_fn = [&](const ZRegister& zd,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm,
bool is_signed_fn,
int index_fn) {
if (is_signed_fn) {
if (index_fn < 0) {
__ Sdot(zd, za, zn, zm);
} else {
__ Sdot(zd, za, zn, zm, index_fn);
}
} else {
if (index_fn < 0) {
__ Udot(zd, za, zn, zm);
} else {
__ Udot(zd, za, zn, zm, index_fn);
}
}
};
ZRegister zd = z0.WithLaneSize(lane_size_in_bits);
ZRegister za = z1.WithLaneSize(lane_size_in_bits);
ZRegister zn = z2.WithLaneSize(lane_size_in_bits / 4);
ZRegister zm = z3.WithLaneSize(lane_size_in_bits / 4);
InsrHelper(&masm, zd, zd_inputs);
InsrHelper(&masm, za, za_inputs);
InsrHelper(&masm, zn, zn_inputs);
InsrHelper(&masm, zm, zm_inputs);
// The Dot macro handles arbitrarily-aliased registers in the argument list.
ZRegister dm_result = z4.WithLaneSize(lane_size_in_bits);
ZRegister dnm_result = z5.WithLaneSize(lane_size_in_bits);
ZRegister da_result = z6.WithLaneSize(lane_size_in_bits);
ZRegister dn_result = z7.WithLaneSize(lane_size_in_bits);
ZRegister d_result = z8.WithLaneSize(lane_size_in_bits);
__ Mov(da_result, za);
// zda = zda + (zn . zm)
dot_fn(da_result, da_result, zn, zm, is_signed, index);
__ Mov(dn_result, zn.WithSameLaneSizeAs(dn_result));
// zdn = za + (zdn . zm)
dot_fn(dn_result, za, dn_result.WithSameLaneSizeAs(zn), zm, is_signed, index);
__ Mov(dm_result, zm.WithSameLaneSizeAs(dm_result));
// zdm = za + (zn . zdm)
dot_fn(dm_result, za, zn, dm_result.WithSameLaneSizeAs(zm), is_signed, index);
__ Mov(d_result, zd);
// zd = za + (zn . zm)
dot_fn(d_result, za, zn, zm, is_signed, index);
__ Mov(dnm_result, zn.WithSameLaneSizeAs(dnm_result));
// zdnm = za + (zdnm . zdnm)
dot_fn(dnm_result,
za,
dnm_result.WithSameLaneSizeAs(zn),
dnm_result.WithSameLaneSizeAs(zm),
is_signed,
index);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(za_inputs, z1.WithLaneSize(lane_size_in_bits));
ASSERT_EQUAL_SVE(zn_inputs, z2.WithLaneSize(lane_size_in_bits / 4));
ASSERT_EQUAL_SVE(zm_inputs, z3.WithLaneSize(lane_size_in_bits / 4));
ASSERT_EQUAL_SVE(zd_expected, da_result);
ASSERT_EQUAL_SVE(zd_expected, dn_result);
ASSERT_EQUAL_SVE(zd_expected, dm_result);
ASSERT_EQUAL_SVE(zd_expected, d_result);
ASSERT_EQUAL_SVE(zdnm_expected, dnm_result);
}
}
TEST_SVE(sve_sdot) {
int64_t zd_inputs[] = {0x33, 0xee, 0xff};
int64_t za_inputs[] = {INT32_MAX, -3, 2};
int64_t zn_inputs[] = {-128, -128, -128, -128, 9, -1, 1, 30, -5, -20, 9, 8};
int64_t zm_inputs[] = {-128, -128, -128, -128, -19, 15, 6, 0, 9, -5, 4, 5};
// zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
int64_t zd_expected_s[] = {-2147418113, -183, 133}; // 0x8000ffff
int64_t zd_expected_d[] = {2147549183, -183, 133}; // 0x8000ffff
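// For the first lane shown: INT32_MAX + 4 * (-128 * -128) = 0x8000ffff, which
// is -2147418113 when interpreted as a signed S lane, but 2147549183 when
// accumulated into a wider D lane.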
// zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
int64_t zdnm_expected_s[] = {-2147418113, 980, 572};
int64_t zdnm_expected_d[] = {2147549183, 980, 572};
SdotUdotHelper(config,
kSRegSize,
zd_inputs,
za_inputs,
zn_inputs,
zm_inputs,
zd_expected_s,
zdnm_expected_s,
true);
SdotUdotHelper(config,
kDRegSize,
zd_inputs,
za_inputs,
zn_inputs,
zm_inputs,
zd_expected_d,
zdnm_expected_d,
true);
}
TEST_SVE(sve_udot) {
int64_t zd_inputs[] = {0x33, 0xee, 0xff};
int64_t za_inputs[] = {INT32_MAX, -3, 2};
int64_t zn_inputs[] = {-128, -128, -128, -128, 9, -1, 1, 30, -5, -20, 9, 8};
int64_t zm_inputs[] = {-128, -128, -128, -128, -19, 15, 6, 0, 9, -5, 4, 5};
// zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
int64_t zd_expected_s[] = {0x8000ffff, 0x00001749, 0x0000f085};
int64_t zd_expected_d[] = {0x000000047c00ffff,
0x000000000017ff49,
0x00000000fff00085};
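// For unsigned dot products the -128 inputs are interpreted as their unsigned
// encodings (0x80 = 128 in B lanes, 0xff80 = 65408 in H lanes), hence the
// much larger D-lane results.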
// zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
int64_t zdnm_expected_s[] = {0x8000ffff, 0x000101d4, 0x0001d03c};
int64_t zdnm_expected_d[] = {0x000000047c00ffff,
0x00000000fffe03d4,
0x00000001ffce023c};
SdotUdotHelper(config,
kSRegSize,
zd_inputs,
za_inputs,
zn_inputs,
zm_inputs,
zd_expected_s,
zdnm_expected_s,
false);
SdotUdotHelper(config,
kDRegSize,
zd_inputs,
za_inputs,
zn_inputs,
zm_inputs,
zd_expected_d,
zdnm_expected_d,
false);
}
TEST_SVE(sve_sdot_indexed_s) {
int64_t zd_inputs[] = {0xff, 0xff, 0xff, 0xff};
int64_t za_inputs[] = {0, 1, 2, 3};
int64_t zn_inputs[] =
{-1, -1, -1, -1, -2, -2, -2, -2, -3, -3, -3, -3, -4, -4, -4, -4};
int64_t zm_inputs[] =
{127, 127, 127, 127, -128, -128, -128, -128, -1, -1, -1, -1, 0, 0, 0, 0};
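// Index i selects the i-th 32-bit (four-byte) group of zm within each 128-bit
// segment: group 0 is {0, 0, 0, 0} and group 3 is {127, 127, 127, 127} here,
// so each row of expected values below corresponds to one loop index.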
constexpr int s = kQRegSize / kSRegSize;
// zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
int64_t zd_expected_s[][s] = {{0, 1, 2, 3}, // Generated from zm[0]
{4, 9, 14, 19},
{512, 1025, 1538, 2051},
{-508, -1015, -1522, -2029}};
// zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
int64_t zdnm_expected_s[][s] = {{16, 33, 50, 67},
{12, 25, 38, 51},
{8, 17, 26, 35},
{4, 9, 14, 19}};
for (unsigned i = 0; i < s; i++) {
SdotUdotHelper(config,
kSRegSize,
zd_inputs,
za_inputs,
zn_inputs,
zm_inputs,
zd_expected_s[i],
zdnm_expected_s[i],
true,
i);
}
}
TEST_SVE(sve_sdot_indexed_d) {
int64_t zd_inputs[] = {0xff, 0xff};
int64_t za_inputs[] = {0, 1};
int64_t zn_inputs[] = {-1, -1, -1, -1, -1, -1, -1, -1};
int64_t zm_inputs[] = {-128, -128, -128, -128, 127, 127, 127, 127};
constexpr int d = kQRegSize / kDRegSize;
// zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
int64_t zd_expected_d[][d] = {{-508, -507}, // Generated from zm[0]
{512, 513}};
// zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
int64_t zdnm_expected_d[][d] = {{4, 5}, {4, 5}};
for (unsigned i = 0; i < d; i++) {
SdotUdotHelper(config,
kDRegSize,
zd_inputs,
za_inputs,
zn_inputs,
zm_inputs,
zd_expected_d[i],
zdnm_expected_d[i],
true,
i);
}
}
TEST_SVE(sve_udot_indexed_s) {
int64_t zd_inputs[] = {0xff, 0xff, 0xff, 0xff};
int64_t za_inputs[] = {0, 1, 2, 3};
int64_t zn_inputs[] = {1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4};
int64_t zm_inputs[] =
{127, 127, 127, 127, 255, 255, 255, 255, 1, 1, 1, 1, 0, 0, 0, 0};
constexpr int s = kQRegSize / kSRegSize;
// zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
int64_t zd_expected_s[][s] = {{0, 1, 2, 3},
{4, 9, 14, 19},
{1020, 2041, 3062, 4083},
{508, 1017, 1526, 2035}};
// zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
int64_t zdnm_expected_s[][s] = {{16, 33, 50, 67},
{12, 25, 38, 51},
{8, 17, 26, 35},
{4, 9, 14, 19}};
for (unsigned i = 0; i < s; i++) {
SdotUdotHelper(config,
kSRegSize,
zd_inputs,
za_inputs,
zn_inputs,
zm_inputs,
zd_expected_s[i],
zdnm_expected_s[i],
false,
i);
}
}
TEST_SVE(sve_udot_indexed_d) {
int64_t zd_inputs[] = {0xff, 0xff};
int64_t za_inputs[] = {0, 1};
int64_t zn_inputs[] = {1, 1, 1, 1, 1, 1, 1, 1};
int64_t zm_inputs[] = {255, 255, 255, 255, 127, 127, 127, 127};
constexpr int d = kQRegSize / kDRegSize;
// zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
int64_t zd_expected_d[][d] = {{508, 509}, {1020, 1021}};
// zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
int64_t zdnm_expected_d[][d] = {{4, 5}, {4, 5}};
for (unsigned i = 0; i < d; i++) {
SdotUdotHelper(config,
kDRegSize,
zd_inputs,
za_inputs,
zn_inputs,
zm_inputs,
zd_expected_d[i],
zdnm_expected_d[i],
false,
i);
}
}
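// Add the zero-based 128-bit segment number to each lane of `src`: lane i of
// `dst` becomes src[i] + (i / lanes-per-segment), so the expected values
// differ between segments.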
static void IntSegmentPatternHelper(MacroAssembler* masm,
const ZRegister& dst,
const ZRegister& src) {
VIXL_ASSERT(AreSameLaneSize(dst, src));
UseScratchRegisterScope temps(masm);
ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(dst);
masm->Index(ztmp, 0, 1);
masm->Asr(ztmp, ztmp, kQRegSizeInBytesLog2 - dst.GetLaneSizeInBytesLog2());
masm->Add(dst, src, ztmp);
}
TEST_SVE(sve_sdot_udot_indexed_s) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
const int multiplier = 2;
__ Dup(z9.VnS(), multiplier);
__ Ptrue(p0.VnB());
__ Index(z29.VnS(), 4, 1);
// z29 = [... 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0]
__ And(z29.VnS(), z29.VnS(), 3);
// p7 = [... 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1]
__ Cmple(p7.VnS(), p0.Zeroing(), z29.VnS(), 0);
// p6 = [... 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
__ Cmple(p6.VnS(), p0.Zeroing(), z29.VnS(), 1);
// p5 = [... 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1]
__ Cmple(p5.VnS(), p0.Zeroing(), z29.VnS(), 2);
__ Index(z28.VnB(), 1, 1);
__ Dup(z27.VnS(), z28.VnS(), 0);
// z27 = [... 3, 2, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1]
IntSegmentPatternHelper(&masm, z27.VnB(), z27.VnB());
// z27 = [... 6, 4, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 8, 6, 4, 2]
__ Mul(z27.VnS(), p7.Merging(), z27.VnS(), z9.VnS());
// z27 = [... 12, 8, 4, 3, 2, 1, 4, 3, 2, 1, 8, 6, 4, 2, 16, 12, 8, 4]
__ Mul(z27.VnS(), p6.Merging(), z27.VnS(), z9.VnS());
// 2nd segment | 1st segment |
// v v
// z27 = [... 24, 16, 4, 3, 2, 1, 8, 6, 4, 2, 16, 12, 8, 4, 32, 24, 16, 8]
__ Mul(z27.VnS(), p5.Merging(), z27.VnS(), z9.VnS());
__ Dup(z0.VnS(), 0);
__ Dup(z1.VnS(), 0);
// Skip lanes from the 129th onwards, since their values overflow in the
// number sequence created by `index`.
__ Cmpls(p3.VnB(), p0.Zeroing(), z28.VnB(), 128);
__ Mov(z0.VnB(), p3.Merging(), z27.VnB());
__ Mov(z1.VnB(), p3.Merging(), z28.VnB());
__ Dup(z2.VnS(), 0);
__ Dup(z3.VnS(), 0);
__ Dup(z4.VnS(), 0);
__ Dup(z5.VnS(), 0);
__ Udot(z2.VnS(), z2.VnS(), z1.VnB(), z0.VnB(), 0);
__ Udot(z3.VnS(), z3.VnS(), z1.VnB(), z0.VnB(), 1);
__ Mul(z3.VnS(), z3.VnS(), 2);
__ Udot(z4.VnS(), z4.VnS(), z1.VnB(), z0.VnB(), 2);
__ Mul(z4.VnS(), z4.VnS(), 4);
__ Udot(z5.VnS(), z5.VnS(), z1.VnB(), z0.VnB(), 3);
__ Mul(z5.VnS(), z5.VnS(), 8);
__ Dup(z7.VnS(), 0);
__ Dup(z8.VnS(), 0);
__ Dup(z9.VnS(), 0);
__ Dup(z10.VnS(), 0);
// Negate the all-positive vector to test signed dot.
__ Neg(z6.VnB(), p0.Merging(), z0.VnB());
__ Sdot(z7.VnS(), z7.VnS(), z1.VnB(), z6.VnB(), 0);
__ Sdot(z8.VnS(), z8.VnS(), z1.VnB(), z6.VnB(), 1);
__ Mul(z8.VnS(), z8.VnS(), 2);
__ Sdot(z9.VnS(), z9.VnS(), z1.VnB(), z6.VnB(), 2);
__ Mul(z9.VnS(), z9.VnS(), 4);
__ Sdot(z10.VnS(), z10.VnS(), z1.VnB(), z6.VnB(), 3);
__ Mul(z10.VnS(), z10.VnS(), 8);
END();
if (CAN_RUN()) {
RUN();
// Only compare the first 128-bit segment of the destination register; use
// the other results from the generated instructions to check the remaining
// segments.
// s_lane[0] = (1 * 8) + (2 * 16) + (3 * 24) + (4 * 32) = 240
// ...
// s_lane[3] = (13 * 8) + (14 * 16) + (15 * 24) + (16 * 32) = 1200
int udot_expected[] = {1200, 880, 560, 240};
ASSERT_EQUAL_SVE(udot_expected, z2.VnS());
ASSERT_EQUAL_SVE(z2.VnS(), z3.VnS());
ASSERT_EQUAL_SVE(z2.VnS(), z4.VnS());
ASSERT_EQUAL_SVE(z2.VnS(), z5.VnS());
int sdot_expected[] = {-1200, -880, -560, -240};
ASSERT_EQUAL_SVE(sdot_expected, z7.VnS());
ASSERT_EQUAL_SVE(z7.VnS(), z8.VnS());
ASSERT_EQUAL_SVE(z7.VnS(), z9.VnS());
ASSERT_EQUAL_SVE(z7.VnS(), z10.VnS());
}
}
TEST_SVE(sve_sdot_udot_indexed_d) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
const int multiplier = 2;
__ Dup(z9.VnD(), multiplier);
__ Ptrue(p0.VnD());
__ Pfalse(p1.VnD());
// p2 = [..., 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
__ Zip1(p2.VnD(), p0.VnD(), p1.VnD());
__ Index(z1.VnH(), 1, 1);
__ Dup(z0.VnD(), z1.VnD(), 0);
// z0 = [... 5, 4, 3, 2, 5, 4, 3, 2, 4, 3, 2, 1, 4, 3, 2, 1]
IntSegmentPatternHelper(&masm, z0.VnH(), z0.VnH());
// 2nd segment | 1st segment |
// v v
// z0 = [... 5, 4, 3, 2, 10, 8, 6, 4, 4, 3, 2, 1, 8, 6, 4, 2]
__ Mul(z0.VnD(), p2.Merging(), z0.VnD(), z9.VnD());
__ Dup(z3.VnD(), 0);
__ Dup(z4.VnD(), 0);
__ Udot(z3.VnD(), z3.VnD(), z1.VnH(), z0.VnH(), 0);
__ Udot(z4.VnD(), z4.VnD(), z1.VnH(), z0.VnH(), 1);
__ Mul(z4.VnD(), z4.VnD(), multiplier);
__ Dup(z12.VnD(), 0);
__ Dup(z13.VnD(), 0);
__ Ptrue(p4.VnH());
__ Neg(z10.VnH(), p4.Merging(), z0.VnH());
__ Sdot(z12.VnD(), z12.VnD(), z1.VnH(), z10.VnH(), 0);
__ Sdot(z13.VnD(), z13.VnD(), z1.VnH(), z10.VnH(), 1);
__ Mul(z13.VnD(), z13.VnD(), multiplier);
END();
if (CAN_RUN()) {
RUN();
// Only compare the first 128-bit segment of the destination register; use
// the other results from the generated instructions to check the remaining
// segments.
// d_lane[0] = (1 * 2) + (2 * 4) + (3 * 6) + (4 * 8) = 60
// d_lane[1] = (5 * 2) + (6 * 4) + (7 * 6) + (8 * 8) = 140
uint64_t udot_expected[] = {416, 304, 140, 60};
ASSERT_EQUAL_SVE(udot_expected, z3.VnD());
ASSERT_EQUAL_SVE(z3.VnD(), z4.VnD());
int64_t sdot_expected[] = {-416, -304, -140, -60};
ASSERT_EQUAL_SVE(sdot_expected, z12.VnD());
ASSERT_EQUAL_SVE(z12.VnD(), z13.VnD());
}
}
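// Convert an array of floating-point inputs to raw bit patterns of the
// requested lane size.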
template <typename T, size_t N>
static void FPToRawbitsWithSize(const T (&inputs)[N],
uint64_t* outputs,
unsigned size_in_bits) {
for (size_t i = 0; i < N; i++) {
outputs[i] = vixl::FPToRawbitsWithSize(size_in_bits, inputs[i]);
}
}
template <typename Ti, typename Te, size_t N>
static void FPBinArithHelper(Test* config,
ArithFn macro,
int lane_size_in_bits,
const Ti (&zn_inputs)[N],
const Ti (&zm_inputs)[N],
const Te (&zd_expected)[N]) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
ZRegister zd = z29.WithLaneSize(lane_size_in_bits);
ZRegister zn = z30.WithLaneSize(lane_size_in_bits);
ZRegister zm = z31.WithLaneSize(lane_size_in_bits);
uint64_t zn_rawbits[N];
uint64_t zm_rawbits[N];
FPToRawbitsWithSize(zn_inputs, zn_rawbits, lane_size_in_bits);
FPToRawbitsWithSize(zm_inputs, zm_rawbits, lane_size_in_bits);
InsrHelper(&masm, zn, zn_rawbits);
InsrHelper(&masm, zm, zm_rawbits);
(masm.*macro)(zd, zn, zm);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(zd_expected, zd);
}
}
TEST_SVE(sve_fp_arithmetic_unpredicated_fadd) {
double zn_inputs[] = {24.0,
5.5,
0.0,
3.875,
2.125,
kFP64PositiveInfinity,
kFP64NegativeInfinity};
double zm_inputs[] = {1024.0, 2048.0, 0.1, -4.75, 12.34, 255.0, -13.0};
ArithFn fn = &MacroAssembler::Fadd;
uint16_t expected_h[] = {Float16ToRawbits(Float16(1048.0)),
Float16ToRawbits(Float16(2053.5)),
Float16ToRawbits(Float16(0.1)),
Float16ToRawbits(Float16(-0.875)),
Float16ToRawbits(Float16(14.465)),
Float16ToRawbits(kFP16PositiveInfinity),
Float16ToRawbits(kFP16NegativeInfinity)};
FPBinArithHelper(config, fn, kHRegSize, zn_inputs, zm_inputs, expected_h);
uint32_t expected_s[] = {FloatToRawbits(1048.0f),
FloatToRawbits(2053.5f),
FloatToRawbits(0.1f),
FloatToRawbits(-0.875f),
FloatToRawbits(14.465f),
FloatToRawbits(kFP32PositiveInfinity),
FloatToRawbits(kFP32NegativeInfinity)};
FPBinArithHelper(config, fn, kSRegSize, zn_inputs, zm_inputs, expected_s);
uint64_t expected_d[] = {DoubleToRawbits(1048.0),
DoubleToRawbits(2053.5),
DoubleToRawbits(0.1),
DoubleToRawbits(-0.875),
DoubleToRawbits(14.465),
DoubleToRawbits(kFP64PositiveInfinity),
DoubleToRawbits(kFP64NegativeInfinity)};
FPBinArithHelper(config, fn, kDRegSize, zn_inputs, zm_inputs, expected_d);
}
TEST_SVE(sve_fp_arithmetic_unpredicated_fsub) {
double zn_inputs[] = {24.0,
5.5,
0.0,
3.875,
2.125,
kFP64PositiveInfinity,
kFP64NegativeInfinity};
double zm_inputs[] = {1024.0, 2048.0, 0.1, -4.75, 12.34, 255.0, -13.0};
ArithFn fn = &MacroAssembler::Fsub;
uint16_t expected_h[] = {Float16ToRawbits(Float16(-1000.0)),
Float16ToRawbits(Float16(-2042.5)),
Float16ToRawbits(Float16(-0.1)),
Float16ToRawbits(Float16(8.625)),
Float16ToRawbits(Float16(-10.215)),
Float16ToRawbits(kFP16PositiveInfinity),
Float16ToRawbits(kFP16NegativeInfinity)};
FPBinArithHelper(config, fn, kHRegSize, zn_inputs, zm_inputs, expected_h);
uint32_t expected_s[] = {FloatToRawbits(-1000.0),
FloatToRawbits(-2042.5),
FloatToRawbits(-0.1),
FloatToRawbits(8.625),
FloatToRawbits(-10.215),
FloatToRawbits(kFP32PositiveInfinity),
FloatToRawbits(kFP32NegativeInfinity)};
FPBinArithHelper(config, fn, kSRegSize, zn_inputs, zm_inputs, expected_s);
uint64_t expected_d[] = {DoubleToRawbits(-1000.0),
DoubleToRawbits(-2042.5),
DoubleToRawbits(-0.1),
DoubleToRawbits(8.625),
DoubleToRawbits(-10.215),
DoubleToRawbits(kFP64PositiveInfinity),
DoubleToRawbits(kFP64NegativeInfinity)};
FPBinArithHelper(config, fn, kDRegSize, zn_inputs, zm_inputs, expected_d);
}
TEST_SVE(sve_fp_arithmetic_unpredicated_fmul) {
double zn_inputs[] = {24.0,
5.5,
0.0,
3.875,
2.125,
kFP64PositiveInfinity,
kFP64NegativeInfinity};
double zm_inputs[] = {1024.0, 2048.0, 0.1, -4.75, 12.34, 255.0, -13.0};
ArithFn fn = &MacroAssembler::Fmul;
uint16_t expected_h[] = {Float16ToRawbits(Float16(24576.0)),
Float16ToRawbits(Float16(11264.0)),
Float16ToRawbits(Float16(0.0)),
Float16ToRawbits(Float16(-18.4)),
Float16ToRawbits(Float16(26.23)),
Float16ToRawbits(kFP16PositiveInfinity),
Float16ToRawbits(kFP16PositiveInfinity)};
FPBinArithHelper(config, fn, kHRegSize, zn_inputs, zm_inputs, expected_h);
uint32_t expected_s[] = {FloatToRawbits(24576.0),
FloatToRawbits(11264.0),
FloatToRawbits(0.0),
FloatToRawbits(-18.40625),
FloatToRawbits(26.2225),
FloatToRawbits(kFP32PositiveInfinity),
FloatToRawbits(kFP32PositiveInfinity)};
FPBinArithHelper(config, fn, kSRegSize, zn_inputs, zm_inputs, expected_s);
uint64_t expected_d[] = {DoubleToRawbits(24576.0),
DoubleToRawbits(11264.0),
DoubleToRawbits(0.0),
DoubleToRawbits(-18.40625),
DoubleToRawbits(26.2225),
DoubleToRawbits(kFP64PositiveInfinity),
DoubleToRawbits(kFP64PositiveInfinity)};
FPBinArithHelper(config, fn, kDRegSize, zn_inputs, zm_inputs, expected_d);
}
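// Some predicated FP macros take an FPMacroNaNPropagationOption and others do
// not, so both pointer-to-member signatures are needed by the helper below.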
typedef void (MacroAssembler::*FPArithPredicatedFn)(
const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& zn,
const ZRegister& zm,
FPMacroNaNPropagationOption nan_option);
typedef void (MacroAssembler::*FPArithPredicatedNoNaNOptFn)(
const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& zn,
const ZRegister& zm);
template <typename Ti, typename Te, size_t N>
static void FPBinArithHelper(
Test* config,
FPArithPredicatedFn macro,
FPArithPredicatedNoNaNOptFn macro_nonan,
unsigned lane_size_in_bits,
const Ti (&zd_inputs)[N],
const int (&pg_inputs)[N],
const Ti (&zn_inputs)[N],
const Ti (&zm_inputs)[N],
const Te (&zd_expected)[N],
FPMacroNaNPropagationOption nan_option = FastNaNPropagation) {
VIXL_ASSERT((macro == NULL) ^ (macro_nonan == NULL));
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// Avoid choosing default scratch registers.
ZRegister zd = z26.WithLaneSize(lane_size_in_bits);
ZRegister zn = z27.WithLaneSize(lane_size_in_bits);
ZRegister zm = z28.WithLaneSize(lane_size_in_bits);
uint64_t zn_inputs_rawbits[N];
uint64_t zm_inputs_rawbits[N];
uint64_t zd_inputs_rawbits[N];
FPToRawbitsWithSize(zn_inputs, zn_inputs_rawbits, lane_size_in_bits);
FPToRawbitsWithSize(zm_inputs, zm_inputs_rawbits, lane_size_in_bits);
FPToRawbitsWithSize(zd_inputs, zd_inputs_rawbits, lane_size_in_bits);
InsrHelper(&masm, zn, zn_inputs_rawbits);
InsrHelper(&masm, zm, zm_inputs_rawbits);
InsrHelper(&masm, zd, zd_inputs_rawbits);
PRegisterWithLaneSize pg = p0.WithLaneSize(lane_size_in_bits);
Initialise(&masm, pg, pg_inputs);
// `instr` zdn, pg, zdn, zm
ZRegister dn_result = z0.WithLaneSize(lane_size_in_bits);
__ Mov(dn_result, zn);
if (macro_nonan == NULL) {
(masm.*macro)(dn_result, pg.Merging(), dn_result, zm, nan_option);
} else {
(masm.*macro_nonan)(dn_result, pg.Merging(), dn_result, zm);
}
// If zd and zm are aliased, the macro (`Instr`) swaps the operand order when
// the operation is commutative; otherwise it falls back to the reversed form
// of the instruction, such as fdivr.
// `instr` zdm, pg, zn, zdm
ZRegister dm_result = z1.WithLaneSize(lane_size_in_bits);
__ Mov(dm_result, zm);
if (macro_nonan == NULL) {
(masm.*macro)(dm_result, pg.Merging(), zn, dm_result, nan_option);
} else {
(masm.*macro_nonan)(dm_result, pg.Merging(), zn, dm_result);
}
// The macro (`Instr`) automatically selects between `instr` and
// movprfx + `instr`, based on whether zd and zn are aliased.
// A generated movprfx instruction is predicated, using the same governing
// predicate register. To keep the result deterministic, initialise the
// destination register first.
// `instr` zd, pg, zn, zm
ZRegister d_result = z2.WithLaneSize(lane_size_in_bits);
__ Mov(d_result, zd);
if (macro_nonan == NULL) {
(masm.*macro)(d_result, pg.Merging(), zn, zm, nan_option);
} else {
(masm.*macro_nonan)(d_result, pg.Merging(), zn, zm);
}
END();
if (CAN_RUN()) {
RUN();
for (size_t i = 0; i < ArrayLength(zd_expected); i++) {
int lane = static_cast<int>(ArrayLength(zd_expected) - i - 1);
if (!core.HasSVELane(dn_result, lane)) break;
if ((pg_inputs[i] & 1) != 0) {
ASSERT_EQUAL_SVE_LANE(zd_expected[i], dn_result, lane);
} else {
ASSERT_EQUAL_SVE_LANE(zn_inputs_rawbits[i], dn_result, lane);
}
}
for (size_t i = 0; i < ArrayLength(zd_expected); i++) {
int lane = static_cast<int>(ArrayLength(zd_expected) - i - 1);
if (!core.HasSVELane(dm_result, lane)) break;
if ((pg_inputs[i] & 1) != 0) {
ASSERT_EQUAL_SVE_LANE(zd_expected[i], dm_result, lane);
} else {
ASSERT_EQUAL_SVE_LANE(zm_inputs_rawbits[i], dm_result, lane);
}
}
ASSERT_EQUAL_SVE(zd_expected, d_result);
}
}
TEST_SVE(sve_binary_arithmetic_predicated_fdiv) {
// The same inputs are shared across the different-precision tests.
double zd_in[] = {0.1, 1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9};
double zn_in[] = {24.0,
24.0,
-2.0,
-2.0,
5.5,
5.5,
kFP64PositiveInfinity,
kFP64PositiveInfinity,
kFP64NegativeInfinity,
kFP64NegativeInfinity};
double zm_in[] = {-2.0, -2.0, 24.0, 24.0, 0.5, 0.5, 0.65, 0.65, 24.0, 24.0};
int pg_in[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
uint16_t exp_h[] = {Float16ToRawbits(Float16(0.1)),
Float16ToRawbits(Float16(-12.0)),
Float16ToRawbits(Float16(2.2)),
Float16ToRawbits(Float16(-0.0833)),
Float16ToRawbits(Float16(4.4)),
Float16ToRawbits(Float16(11.0)),
Float16ToRawbits(Float16(6.6)),
Float16ToRawbits(kFP16PositiveInfinity),
Float16ToRawbits(Float16(8.8)),
Float16ToRawbits(kFP16NegativeInfinity)};
FPBinArithHelper(config,
NULL,
&MacroAssembler::Fdiv,
kHRegSize,
zd_in,
pg_in,
zn_in,
zm_in,
exp_h);
uint32_t exp_s[] = {FloatToRawbits(0.1),
FloatToRawbits(-12.0),
FloatToRawbits(2.2),
0xbdaaaaab,
FloatToRawbits(4.4),
FloatToRawbits(11.0),
FloatToRawbits(6.6),
FloatToRawbits(kFP32PositiveInfinity),
FloatToRawbits(8.8),
FloatToRawbits(kFP32NegativeInfinity)};
FPBinArithHelper(config,
NULL,
&MacroAssembler::Fdiv,
kSRegSize,
zd_in,
pg_in,
zn_in,
zm_in,
exp_s);
uint64_t exp_d[] = {DoubleToRawbits(0.1),
DoubleToRawbits(-12.0),
DoubleToRawbits(2.2),
0xbfb5555555555555,
DoubleToRawbits(4.4),
DoubleToRawbits(11.0),
DoubleToRawbits(6.6),
DoubleToRawbits(kFP64PositiveInfinity),
DoubleToRawbits(8.8),
DoubleToRawbits(kFP64NegativeInfinity)};
FPBinArithHelper(config,
NULL,
&MacroAssembler::Fdiv,
kDRegSize,
zd_in,
pg_in,
zn_in,
zm_in,
exp_d);
}
TEST_SVE(sve_select) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t in0[] = {0x01f203f405f607f8, 0xfefcf8f0e1c3870f, 0x123456789abcdef0};
uint64_t in1[] = {0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa};
// For simplicity, we re-use the same pg for various lane sizes.
// For D lanes: 1, 1, 0
// For S lanes: 1, 1, 1, 0, 0
// For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
Initialise(&masm, p0.VnB(), pg_in);
PRegisterM pg = p0.Merging();
InsrHelper(&masm, z30.VnD(), in0);
InsrHelper(&masm, z31.VnD(), in1);
__ Sel(z0.VnB(), pg, z30.VnB(), z31.VnB());
__ Sel(z1.VnH(), pg, z30.VnH(), z31.VnH());
__ Sel(z2.VnS(), pg, z30.VnS(), z31.VnS());
__ Sel(z3.VnD(), pg, z30.VnD(), z31.VnD());
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z0[] = {0xaaaaaaaa05aa07f8,
0xfeaaaaf0aac3870f,
0xaaaa56aa9abcdeaa};
ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
uint64_t expected_z1[] = {0xaaaaaaaaaaaa07f8,
0xaaaaf8f0e1c3870f,
0xaaaaaaaa9abcaaaa};
ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
uint64_t expected_z2[] = {0xaaaaaaaa05f607f8,
0xfefcf8f0e1c3870f,
0xaaaaaaaaaaaaaaaa};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
uint64_t expected_z3[] = {0x01f203f405f607f8,
0xfefcf8f0e1c3870f,
0xaaaaaaaaaaaaaaaa};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
}
}
TEST_SVE(sve_binary_arithmetic_predicated_fmax_fmin_h) {
double zd_inputs[] = {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8};
double zn_inputs[] = {-2.1,
8.5,
225.5,
0.0,
8.8,
-4.75,
kFP64PositiveInfinity,
kFP64NegativeInfinity};
double zm_inputs[] = {-2.0,
-13.0,
24.0,
0.01,
0.5,
300.75,
kFP64NegativeInfinity,
kFP64PositiveInfinity};
int pg_inputs[] = {1, 1, 0, 1, 0, 1, 1, 1};
uint16_t zd_expected_max[] = {Float16ToRawbits(Float16(-2.0)),
Float16ToRawbits(Float16(8.5)),
Float16ToRawbits(Float16(3.3)),
Float16ToRawbits(Float16(0.01)),
Float16ToRawbits(Float16(5.5)),
Float16ToRawbits(Float16(300.75)),
Float16ToRawbits(kFP16PositiveInfinity),
Float16ToRawbits(kFP16PositiveInfinity)};
FPBinArithHelper(config,
&MacroAssembler::Fmax,
NULL,
kHRegSize,
zd_inputs,
pg_inputs,
zn_inputs,
zm_inputs,
zd_expected_max);
uint16_t zd_expected_min[] = {Float16ToRawbits(Float16(-2.1)),
Float16ToRawbits(Float16(-13.0)),
Float16ToRawbits(Float16(3.3)),
Float16ToRawbits(Float16(0.0)),
Float16ToRawbits(Float16(5.5)),
Float16ToRawbits(Float16(-4.75)),
Float16ToRawbits(kFP16NegativeInfinity),
Float16ToRawbits(kFP16NegativeInfinity)};
FPBinArithHelper(config,
&MacroAssembler::Fmin,
NULL,
kHRegSize,
zd_inputs,
pg_inputs,
zn_inputs,
zm_inputs,
zd_expected_min);
}
TEST_SVE(sve_binary_arithmetic_predicated_fmax_fmin_s) {
double zd_inputs[] = {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8};
double zn_inputs[] = {-2.1,
8.5,
225.5,
0.0,
8.8,
-4.75,
kFP64PositiveInfinity,
kFP64NegativeInfinity};
double zm_inputs[] = {-2.0,
-13.0,
24.0,
0.01,
0.5,
300.75,
kFP64NegativeInfinity,
kFP64PositiveInfinity};
int pg_inputs[] = {1, 1, 0, 1, 0, 1, 1, 1};
uint32_t zd_expected_max[] = {FloatToRawbits(-2.0),
FloatToRawbits(8.5),
FloatToRawbits(3.3),
FloatToRawbits(0.01),
FloatToRawbits(5.5),
FloatToRawbits(300.75),
FloatToRawbits(kFP32PositiveInfinity),
FloatToRawbits(kFP32PositiveInfinity)};
FPBinArithHelper(config,
&MacroAssembler::Fmax,
NULL,
kSRegSize,
zd_inputs,
pg_inputs,
zn_inputs,
zm_inputs,
zd_expected_max);
uint32_t zd_expected_min[] = {FloatToRawbits(-2.1),
FloatToRawbits(-13.0),
FloatToRawbits(3.3),
FloatToRawbits(0.0),
FloatToRawbits(5.5),
FloatToRawbits(-4.75),
FloatToRawbits(kFP32NegativeInfinity),
FloatToRawbits(kFP32NegativeInfinity)};
FPBinArithHelper(config,
&MacroAssembler::Fmin,
NULL,
kSRegSize,
zd_inputs,
pg_inputs,
zn_inputs,
zm_inputs,
zd_expected_min);
}
TEST_SVE(sve_binary_arithmetic_predicated_fmax_fmin_d) {
double zd_inputs[] = {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8};
double zn_inputs[] = {-2.1,
8.5,
225.5,
0.0,
8.8,
-4.75,
kFP64PositiveInfinity,
kFP64NegativeInfinity};
double zm_inputs[] = {-2.0,
-13.0,
24.0,
0.01,
0.5,
300.75,
kFP64NegativeInfinity,
kFP64PositiveInfinity};
int pg_inputs[] = {1, 1, 0, 1, 0, 1, 1, 1};
uint64_t zd_expected_max[] = {DoubleToRawbits(-2.0),
DoubleToRawbits(8.5),
DoubleToRawbits(3.3),
DoubleToRawbits(0.01),
DoubleToRawbits(5.5),
DoubleToRawbits(300.75),
DoubleToRawbits(kFP64PositiveInfinity),
DoubleToRawbits(kFP64PositiveInfinity)};
FPBinArithHelper(config,
&MacroAssembler::Fmax,
NULL,
kDRegSize,
zd_inputs,
pg_inputs,
zn_inputs,
zm_inputs,
zd_expected_max);
uint64_t zd_expected_min[] = {DoubleToRawbits(-2.1),
DoubleToRawbits(-13.0),
DoubleToRawbits(3.3),
DoubleToRawbits(0.0),
DoubleToRawbits(5.5),
DoubleToRawbits(-4.75),
DoubleToRawbits(kFP64NegativeInfinity),
DoubleToRawbits(kFP64NegativeInfinity)};
FPBinArithHelper(config,
&MacroAssembler::Fmin,
NULL,
kDRegSize,
zd_inputs,
pg_inputs,
zn_inputs,
zm_inputs,
zd_expected_min);
}
template <typename T, size_t N>
static void BitwiseShiftImmHelper(Test* config,
int lane_size_in_bits,
const T (&zn_inputs)[N],
int shift) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
ZRegister zd_asr = z25.WithLaneSize(lane_size_in_bits);
ZRegister zd_lsr = z26.WithLaneSize(lane_size_in_bits);
ZRegister zd_lsl = z27.WithLaneSize(lane_size_in_bits);
ZRegister zn = z28.WithLaneSize(lane_size_in_bits);
InsrHelper(&masm, zn, zn_inputs);
__ Asr(zd_asr, zn, shift);
__ Lsr(zd_lsr, zn, shift);
__ Lsl(zd_lsl, zn, shift - 1); // Lsl supports shifts in [0, lane_size - 1].
END();
if (CAN_RUN()) {
RUN();
const uint64_t mask = GetUintMask(lane_size_in_bits);
for (int i = 0; i < static_cast<int>(N); i++) {
int lane = N - i - 1;
if (!core.HasSVELane(zd_asr, lane)) break;
bool is_negative = (zn_inputs[i] & GetSignMask(lane_size_in_bits)) != 0;
uint64_t result;
if (shift >= lane_size_in_bits) {
result = is_negative ? mask : 0;
} else {
result = zn_inputs[i] >> shift;
if (is_negative) {
result |= mask << (lane_size_in_bits - shift);
result &= mask;
}
}
ASSERT_EQUAL_SVE_LANE(result, zd_asr, lane);
}
for (int i = 0; i < static_cast<int>(N); i++) {
int lane = N - i - 1;
if (!core.HasSVELane(zd_lsr, lane)) break;
uint64_t result =
(shift >= lane_size_in_bits) ? 0 : zn_inputs[i] >> shift;
ASSERT_EQUAL_SVE_LANE(result, zd_lsr, lane);
}
for (int i = 0; i < static_cast<int>(N); i++) {
int lane = N - i - 1;
if (!core.HasSVELane(zd_lsl, lane)) break;
uint64_t result =
(shift > lane_size_in_bits) ? 0 : zn_inputs[i] << (shift - 1);
ASSERT_EQUAL_SVE_LANE(result & mask, zd_lsl, lane);
}
}
}
TEST_SVE(sve_bitwise_shift_imm_unpredicated) {
uint64_t inputs_b[] = {0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80};
int shift_b[] = {1, 3, 5, 8};
for (size_t i = 0; i < ArrayLength(shift_b); i++) {
BitwiseShiftImmHelper(config, kBRegSize, inputs_b, shift_b[i]);
}
uint64_t inputs_h[] = {0xfedc, 0xfa55, 0x0011, 0x2233};
int shift_h[] = {1, 8, 11, 16};
for (size_t i = 0; i < ArrayLength(shift_h); i++) {
BitwiseShiftImmHelper(config, kHRegSize, inputs_h, shift_h[i]);
}
uint64_t inputs_s[] = {0xfedcba98, 0xfffa55aa, 0x00112233};
int shift_s[] = {1, 9, 17, 32};
for (size_t i = 0; i < ArrayLength(shift_s); i++) {
BitwiseShiftImmHelper(config, kSRegSize, inputs_s, shift_s[i]);
}
uint64_t inputs_d[] = {0xfedcba98fedcba98,
0xfffa5555aaaaaaaa,
0x0011223344aafe80};
int shift_d[] = {1, 23, 45, 64};
for (size_t i = 0; i < ArrayLength(shift_d); i++) {
BitwiseShiftImmHelper(config, kDRegSize, inputs_d, shift_d[i]);
}
}
template <typename T, typename R, size_t N>
static void BitwiseShiftWideElementsHelper(Test* config,
Shift shift_type,
int lane_size_in_bits,
const T (&zn_inputs)[N],
const R& zm_inputs,
const T (&zd_expected)[N]) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
ArithFn macro;
// A logical shift left or right by the lane width yields 0, so initialise
// the array to 0 for convenience.
uint64_t zd_expected_max_shift_amount[N] = {0};
switch (shift_type) {
case ASR: {
macro = &MacroAssembler::Asr;
uint64_t mask = GetUintMask(lane_size_in_bits);
for (size_t i = 0; i < ArrayLength(zn_inputs); i++) {
bool is_negative = (zn_inputs[i] & GetSignMask(lane_size_in_bits)) != 0;
zd_expected_max_shift_amount[i] = is_negative ? mask : 0;
}
break;
}
case LSR:
macro = &MacroAssembler::Lsr;
break;
case LSL:
macro = &MacroAssembler::Lsl;
break;
default:
VIXL_UNIMPLEMENTED();
macro = NULL;
break;
}
ZRegister zd = z26.WithLaneSize(lane_size_in_bits);
ZRegister zn = z27.WithLaneSize(lane_size_in_bits);
ZRegister zm = z28.WithLaneSize(kDRegSize);
InsrHelper(&masm, zn, zn_inputs);
InsrHelper(&masm, zm, zm_inputs);
(masm.*macro)(zd, zn, zm);
ZRegister zm_max_shift_amount = z25.WithLaneSize(kDRegSize);
ZRegister zd_max_shift_amount = z24.WithLaneSize(lane_size_in_bits);
__ Dup(zm_max_shift_amount, lane_size_in_bits);
(masm.*macro)(zd_max_shift_amount, zn, zm_max_shift_amount);
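// Shift amounts of the lane size or more are out of range; they are expected
// to behave like a shift by exactly the lane size.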
ZRegister zm_out_of_range = z23.WithLaneSize(kDRegSize);
ZRegister zd_out_of_range = z22.WithLaneSize(lane_size_in_bits);
__ Dup(zm_out_of_range, GetUintMask(lane_size_in_bits));
(masm.*macro)(zd_out_of_range, zn, zm_out_of_range);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(zd_expected, zd);
ASSERT_EQUAL_SVE(zd_expected_max_shift_amount, zd_max_shift_amount);
ASSERT_EQUAL_SVE(zd_max_shift_amount, zd_out_of_range);
}
}
TEST_SVE(sve_bitwise_shift_wide_elements_unpredicated_asr) {
// clang-format off
uint64_t inputs_b[] = {0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80,
0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80};
int shift_b[] = {1, 3};
uint64_t expected_b[] = {0xff, 0xee, 0xdd, 0xcc, 0xff, 0x2a, 0xd5, 0xc0,
0xff, 0xfb, 0xf7, 0xf3, 0xff, 0x0a, 0xf5, 0xf0};
BitwiseShiftWideElementsHelper(config,
ASR,
kBRegSize,
inputs_b,
shift_b,
expected_b);
uint64_t inputs_h[] = {0xfedc, 0xfa55, 0x0011, 0x2233,
0xfedc, 0xfa55, 0x0011, 0x2233,
0xfedc, 0xfa55, 0x0011, 0x2233};
int shift_h[] = {1, 8, 11};
uint64_t expected_h[] = {0xff6e, 0xfd2a, 0x0008, 0x1119,
0xfffe, 0xfffa, 0x0000, 0x0022,
0xffff, 0xffff, 0x0000, 0x0004};
BitwiseShiftWideElementsHelper(config,
ASR,
kHRegSize,
inputs_h,
shift_h,
expected_h);
uint64_t inputs_s[] =
{0xfedcba98, 0xfffa55aa, 0x00112233, 0x01234567, 0xaaaaaaaa, 0x88888888};
int shift_s[] = {1, 9, 23};
uint64_t expected_s[] =
{0xff6e5d4c, 0xfffd2ad5, 0x00000891, 0x000091a2, 0xffffff55, 0xffffff11};
BitwiseShiftWideElementsHelper(config,
ASR,
kSRegSize,
inputs_s,
shift_s,
expected_s);
// clang-format on
}
TEST_SVE(sve_bitwise_shift_wide_elements_unpredicated_lsr) {
// clang-format off
uint64_t inputs_b[] = {0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80,
0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80};
int shift_b[] = {1, 3};
uint64_t expected_b[] = {0x7f, 0x6e, 0x5d, 0x4c, 0x7f, 0x2a, 0x55, 0x40,
0x1f, 0x1b, 0x17, 0x13, 0x1f, 0x0a, 0x15, 0x10};
BitwiseShiftWideElementsHelper(config,
LSR,
kBRegSize,
inputs_b,
shift_b,
expected_b);
uint64_t inputs_h[] = {0xfedc, 0xfa55, 0x0011, 0x2233,
0xfedc, 0xfa55, 0x0011, 0x2233,
0xfedc, 0xfa55, 0x0011, 0x2233};
int shift_h[] = {1, 8, 11};
uint64_t expected_h[] = {0x7f6e, 0x7d2a, 0x0008, 0x1119,
0x00fe, 0x00fa, 0x0000, 0x0022,
0x001f, 0x001f, 0x0000, 0x0004};
BitwiseShiftWideElementsHelper(config,
LSR,
kHRegSize,
inputs_h,
shift_h,
expected_h);
uint64_t inputs_s[] =
{0xfedcba98, 0xfffa55aa, 0x00112233, 0x01234567, 0xaaaaaaaa, 0x88888888};
int shift_s[] = {1, 9, 23};
uint64_t expected_s[] =
{0x7f6e5d4c, 0x7ffd2ad5, 0x00000891, 0x000091a2, 0x00000155, 0x00000111};
BitwiseShiftWideElementsHelper(config,
LSR,
kSRegSize,
inputs_s,
shift_s,
expected_s);
// clang-format on
}
TEST_SVE(sve_bitwise_shift_wide_elements_unpredicated_lsl) {
// clang-format off
uint64_t inputs_b[] = {0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80,
0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80};
int shift_b[] = {1, 5};
uint64_t expected_b[] = {0xfc, 0xb8, 0x74, 0x30, 0xfe, 0xaa, 0x54, 0x00,
0xc0, 0x80, 0x40, 0x00, 0xe0, 0xa0, 0x40, 0x00};
BitwiseShiftWideElementsHelper(config,
LSL,
kBRegSize,
inputs_b,
shift_b,
expected_b);
uint64_t inputs_h[] = {0xfedc, 0xfa55, 0x0011, 0x2233,
0xfedc, 0xfa55, 0x0011, 0x2233,
0xfedc, 0xfa55, 0x0011, 0x2233};
int shift_h[] = {1, 2, 14};
uint64_t expected_h[] = {0xfdb8, 0xf4aa, 0x0022, 0x4466,
0xfb70, 0xe954, 0x0044, 0x88cc,
0x0000, 0x4000, 0x4000, 0xc000};
BitwiseShiftWideElementsHelper(config,
LSL,
kHRegSize,
inputs_h,
shift_h,
expected_h);
uint64_t inputs_s[] =
{0xfedcba98, 0xfffa55aa, 0x00112233, 0x01234567, 0xaaaaaaaa, 0x88888888};
int shift_s[] = {1, 19, 26};
uint64_t expected_s[] =
{0xfdb97530, 0xfff4ab54, 0x11980000, 0x2b380000, 0xa8000000, 0x20000000};
BitwiseShiftWideElementsHelper(config,
LSL,
kSRegSize,
inputs_s,
shift_s,
expected_s);
// Test shift amounts that do not fit in a 32-bit "unsigned" type.
uint64_t inputs_b2[] = {1, 2, 4, 8, 3, 5, 7, 9,
1, 2, 4, 8, 3, 5, 7, 9};
uint64_t shift_b2[] = {1, 0x1000000001};
uint64_t expected_b2[] = {2, 4, 8, 16, 6, 10, 14, 18,
0, 0, 0, 0, 0, 0, 0, 0};
BitwiseShiftWideElementsHelper(config, LSL, kBRegSize, inputs_b2, shift_b2,
expected_b2);
// clang-format on
}
TEST_SVE(sve_shift_by_vector) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
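// Shift each lane of the source by the amount held in the corresponding lane
// of the shift vector. Amounts of the lane size or more produce zero for LSL
// and LSR, and the sign for ASR; merging predication leaves inactive lanes
// unchanged.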
__ Ptrue(p0.VnB());
__ Pfalse(p1.VnB());
__ Zip1(p2.VnB(), p0.VnB(), p1.VnB());
__ Zip1(p3.VnH(), p0.VnH(), p1.VnH());
__ Zip1(p4.VnS(), p0.VnS(), p1.VnS());
__ Zip1(p5.VnD(), p0.VnD(), p1.VnD());
__ Dup(z31.VnD(), 0x8000000080008080);
__ Dup(z0.VnB(), -1);
__ Index(z1.VnB(), 0, 1);
__ Dup(z2.VnB(), 0x55);
__ Lsr(z2.VnB(), p2.Merging(), z0.VnB(), z1.VnB());
__ Lsl(z3.VnB(), p0.Merging(), z0.VnB(), z1.VnB());
__ Asr(z4.VnB(), p0.Merging(), z31.VnB(), z1.VnB());
__ Index(z1.VnH(), 0, 1);
__ Dup(z6.VnB(), 0x55);
__ Lsr(z5.VnH(), p0.Merging(), z0.VnH(), z1.VnH());
__ Lsl(z6.VnH(), p3.Merging(), z0.VnH(), z1.VnH());
__ Asr(z7.VnH(), p0.Merging(), z31.VnH(), z1.VnH());
__ Index(z1.VnS(), 0, 1);
__ Dup(z10.VnB(), 0x55);
__ Lsr(z8.VnS(), p0.Merging(), z0.VnS(), z1.VnS());
__ Lsl(z9.VnS(), p0.Merging(), z0.VnS(), z1.VnS());
__ Asr(z10.VnS(), p4.Merging(), z31.VnS(), z1.VnS());
__ Index(z1.VnD(), 0, 1);
__ Lsr(z0.VnD(), p5.Merging(), z0.VnD(), z1.VnD());
__ Lsl(z12.VnD(), p0.Merging(), z0.VnD(), z1.VnD());
__ Asr(z13.VnD(), p0.Merging(), z31.VnD(), z1.VnD());
__ Dup(z11.VnD(), 0x100000001);
__ Lsl(z14.VnD(), p0.Merging(), z1.VnD(), z11.VnD());
__ Index(z0.VnH(), 7, -1);
__ Lsr(z0.VnH(), p0.Merging(), z31.VnH(), z0.VnH());
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z0[] = {0x8000000020001010, 0x0800000002000101};
ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
uint64_t expected_z2[] = {0x5500550055005500, 0x5503550f553f55ff};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
uint64_t expected_z3[] = {0x0000000000000000, 0x80c0e0f0f8fcfeff};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
uint64_t expected_z4[] = {0xff000000ff00ffff, 0xff000000f000c080};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
uint64_t expected_z5[] = {0x01ff03ff07ff0fff, 0x1fff3fff7fffffff};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
uint64_t expected_z6[] = {0x5555ffc05555fff0, 0x5555fffc5555ffff};
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
uint64_t expected_z7[] = {0xff000000fc00f808, 0xf0000000c0008080};
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
uint64_t expected_z8[] = {0x1fffffff3fffffff, 0x7fffffffffffffff};
ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
uint64_t expected_z9[] = {0xfffffff8fffffffc, 0xfffffffeffffffff};
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
uint64_t expected_z10[] = {0x55555555e0002020, 0x5555555580008080};
ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
uint64_t expected_z12[] = {0xfffffffffffffffe, 0xffffffffffffffff};
ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
uint64_t expected_z13[] = {0xc000000040004040, 0x8000000080008080};
ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
uint64_t expected_z14[] = {0, 0};
ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
}
}
TEST_SVE(sve_shift_by_wide_vector) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
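// As in sve_shift_by_vector, but each D-sized lane of z1 supplies the shift
// amount for the whole 64-bit group of lanes it overlaps.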
__ Ptrue(p0.VnB());
__ Pfalse(p1.VnB());
__ Zip1(p2.VnB(), p0.VnB(), p1.VnB());
__ Zip1(p3.VnH(), p0.VnH(), p1.VnH());
__ Zip1(p4.VnS(), p0.VnS(), p1.VnS());
__ Dup(z31.VnD(), 0x8000000080008080);
__ Dup(z0.VnB(), -1);
__ Index(z1.VnD(), 1, 5);
__ Dup(z2.VnB(), 0x55);
__ Lsr(z2.VnB(), p2.Merging(), z2.VnB(), z1.VnD());
__ Lsl(z3.VnB(), p0.Merging(), z0.VnB(), z1.VnD());
__ Asr(z4.VnB(), p0.Merging(), z31.VnB(), z1.VnD());
__ Dup(z6.VnB(), 0x55);
__ Lsr(z5.VnH(), p0.Merging(), z0.VnH(), z1.VnD());
__ Lsl(z6.VnH(), p3.Merging(), z6.VnH(), z1.VnD());
__ Asr(z7.VnH(), p0.Merging(), z31.VnH(), z1.VnD());
__ Dup(z10.VnB(), 0x55);
__ Lsr(z8.VnS(), p0.Merging(), z0.VnS(), z1.VnD());
__ Lsl(z9.VnS(), p0.Merging(), z0.VnS(), z1.VnD());
__ Asr(z10.VnS(), p4.Merging(), z31.VnS(), z1.VnD());
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z2[] = {0x5501550155015501, 0x552a552a552a552a};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
uint64_t expected_z3[] = {0xc0c0c0c0c0c0c0c0, 0xfefefefefefefefe};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
uint64_t expected_z4[] = {0xfe000000fe00fefe, 0xc0000000c000c0c0};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
uint64_t expected_z5[] = {0x03ff03ff03ff03ff, 0x7fff7fff7fff7fff};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
uint64_t expected_z6[] = {0x5555554055555540, 0x5555aaaa5555aaaa};
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
uint64_t expected_z7[] = {0xfe000000fe00fe02, 0xc0000000c000c040};
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
uint64_t expected_z8[] = {0x03ffffff03ffffff, 0x7fffffff7fffffff};
ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
uint64_t expected_z9[] = {0xffffffc0ffffffc0, 0xfffffffefffffffe};
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
uint64_t expected_z10[] = {0x55555555fe000202, 0x55555555c0004040};
ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
}
}
TEST_SVE(sve_pred_shift_imm) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
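// Chain predicated immediate shifts, checking that merging predication
// preserves the inactive lanes of each copied source.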
__ Ptrue(p0.VnB());
__ Pfalse(p1.VnB());
__ Zip1(p2.VnB(), p0.VnB(), p1.VnB());
__ Zip1(p3.VnH(), p0.VnH(), p1.VnH());
__ Zip1(p4.VnS(), p0.VnS(), p1.VnS());
__ Zip1(p5.VnD(), p0.VnD(), p1.VnD());
__ Dup(z31.VnD(), 0x8000000080008080);
__ Lsr(z0.VnB(), p0.Merging(), z31.VnB(), 1);
__ Mov(z1, z0);
__ Lsl(z1.VnB(), p2.Merging(), z1.VnB(), 1);
__ Asr(z2.VnB(), p0.Merging(), z1.VnB(), 2);
__ Lsr(z3.VnH(), p0.Merging(), z31.VnH(), 2);
__ Mov(z4, z3);
__ Lsl(z4.VnH(), p3.Merging(), z4.VnH(), 2);
__ Asr(z5.VnH(), p0.Merging(), z4.VnH(), 3);
__ Lsr(z6.VnS(), p0.Merging(), z31.VnS(), 3);
__ Mov(z7, z6);
__ Lsl(z7.VnS(), p4.Merging(), z7.VnS(), 3);
__ Asr(z8.VnS(), p0.Merging(), z7.VnS(), 4);
__ Lsr(z9.VnD(), p0.Merging(), z31.VnD(), 4);
__ Mov(z10, z9);
__ Lsl(z10.VnD(), p5.Merging(), z10.VnD(), 4);
__ Asr(z11.VnD(), p0.Merging(), z10.VnD(), 5);
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z0[] = {0x4000000040004040, 0x4000000040004040};
ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
uint64_t expected_z1[] = {0x4000000040004080, 0x4000000040004080};
ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
uint64_t expected_z2[] = {0x10000000100010e0, 0x10000000100010e0};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
uint64_t expected_z3[] = {0x2000000020002020, 0x2000000020002020};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
uint64_t expected_z4[] = {0x2000000020008080, 0x2000000020008080};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
uint64_t expected_z5[] = {0x040000000400f010, 0x040000000400f010};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
uint64_t expected_z6[] = {0x1000000010001010, 0x1000000010001010};
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
uint64_t expected_z7[] = {0x1000000080008080, 0x1000000080008080};
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
uint64_t expected_z8[] = {0x01000000f8000808, 0x01000000f8000808};
ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
uint64_t expected_z9[] = {0x0800000008000808, 0x0800000008000808};
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
uint64_t expected_z10[] = {0x0800000008000808, 0x8000000080008080};
ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
uint64_t expected_z11[] = {0x0040000000400040, 0xfc00000004000404};
ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
}
}
TEST_SVE(sve_asrd) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
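// Asrd is an arithmetic shift right that rounds towards zero, implementing a
// signed divide by a power of two: for example, -125 asrd 1 gives -62, where
// a plain Asr would give -63.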
__ Ptrue(p0.VnB());
__ Pfalse(p1.VnB());
__ Zip1(p2.VnB(), p0.VnB(), p1.VnB());
__ Zip1(p3.VnH(), p0.VnH(), p1.VnH());
__ Zip1(p4.VnS(), p0.VnS(), p1.VnS());
__ Zip1(p5.VnD(), p0.VnD(), p1.VnD());
__ Index(z31.VnB(), 0x7f - 3, 1);
__ Asrd(z0.VnB(), p0.Merging(), z31.VnB(), 1);
__ Mov(z1, z31);
__ Asrd(z1.VnB(), p2.Merging(), z1.VnB(), 2);
__ Asrd(z2.VnB(), p0.Merging(), z31.VnB(), 7);
__ Asrd(z3.VnB(), p0.Merging(), z31.VnB(), 8);
__ Index(z31.VnH(), 0x7fff - 3, 1);
__ Asrd(z4.VnH(), p0.Merging(), z31.VnH(), 1);
__ Mov(z5, z31);
__ Asrd(z5.VnH(), p3.Merging(), z5.VnH(), 2);
__ Asrd(z6.VnH(), p0.Merging(), z31.VnH(), 15);
__ Asrd(z7.VnH(), p0.Merging(), z31.VnH(), 16);
__ Index(z31.VnS(), 0x7fffffff - 1, 1);
__ Asrd(z8.VnS(), p0.Merging(), z31.VnS(), 1);
__ Mov(z9, z31);
__ Asrd(z9.VnS(), p4.Merging(), z9.VnS(), 2);
__ Asrd(z10.VnS(), p0.Merging(), z31.VnS(), 31);
__ Asrd(z11.VnS(), p0.Merging(), z31.VnS(), 32);
__ Index(z31.VnD(), 0x7fffffffffffffff, 1);
__ Asrd(z12.VnD(), p0.Merging(), z31.VnD(), 1);
__ Mov(z13, z31);
__ Asrd(z13.VnD(), p5.Merging(), z13.VnD(), 2);
__ Asrd(z14.VnD(), p0.Merging(), z31.VnD(), 63);
__ Asrd(z31.VnD(), p0.Merging(), z31.VnD(), 64);
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z0[] = {0xc6c5c5c4c4c3c3c2, 0xc2c1c1c03f3f3e3e};
ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
uint64_t expected_z1[] = {0x8be389e287e285e1, 0x83e181e07f1f7d1f};
ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
uint64_t expected_z2[] = {0x0000000000000000, 0x000000ff00000000};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
uint64_t expected_z3[] = {0x0000000000000000, 0x0000000000000000};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
uint64_t expected_z4[] = {0xc002c001c001c000, 0x3fff3fff3ffe3ffe};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
uint64_t expected_z5[] = {0x8003e0018001e000, 0x7fff1fff7ffd1fff};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
uint64_t expected_z6[] = {0x000000000000ffff, 0x0000000000000000};
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
uint64_t expected_z7[] = {0x0000000000000000, 0x0000000000000000};
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
uint64_t expected_z8[] = {0xc0000001c0000000, 0x3fffffff3fffffff};
ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
uint64_t expected_z9[] = {0x80000001e0000000, 0x7fffffff1fffffff};
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
uint64_t expected_z10[] = {0x00000000ffffffff, 0x0000000000000000};
ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
uint64_t expected_z11[] = {0x0000000000000000, 0x0000000000000000};
ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
uint64_t expected_z12[] = {0xc000000000000000, 0x3fffffffffffffff};
ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
uint64_t expected_z13[] = {0x8000000000000000, 0x1fffffffffffffff};
ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
uint64_t expected_z14[] = {0xffffffffffffffff, 0x0000000000000000};
ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
uint64_t expected_z31[] = {0x0000000000000000, 0x0000000000000000};
ASSERT_EQUAL_SVE(expected_z31, z31.VnD());
}
}
TEST_SVE(sve_setffr) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
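// Setffr sets every FFR bit, so reading the FFR back must give an all-true
// predicate.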
__ Ptrue(p15.VnB());
__ Setffr();
__ Rdffr(p14.VnB());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(p14.VnB(), p15.VnB());
}
}
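// Write a predicate with the lowest `active_lanes` lanes set into the FFR,
// then check that reading the FFR back reproduces it.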
static void WrffrHelper(Test* config, unsigned active_lanes) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int inputs[kPRegMaxSize] = {0};
VIXL_ASSERT(active_lanes <= kPRegMaxSize);
for (unsigned i = 0; i < active_lanes; i++) {
// The rightmost (highest-indexed) array element maps to the lowest-numbered
// lane.
inputs[kPRegMaxSize - i - 1] = 1;
}
Initialise(&masm, p1.VnB(), inputs);
__ Wrffr(p1.VnB());
__ Rdffr(p2.VnB());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(p1.VnB(), p2.VnB());
}
}
TEST_SVE(sve_wrffr) {
int active_lanes_inputs[] = {0, 1, 7, 10, 32, 48, kPRegMaxSize};
for (size_t i = 0; i < ArrayLength(active_lanes_inputs); i++) {
WrffrHelper(config, active_lanes_inputs[i]);
}
}
template <size_t N>
static void RdffrHelper(Test* config,
size_t active_lanes,
const int (&pg_inputs)[N]) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
VIXL_ASSERT(active_lanes <= kPRegMaxSize);
// The rightmost (highest-indexed) array element maps to the lowest-numbered
// lane.
int pd[kPRegMaxSize] = {0};
for (unsigned i = 0; i < active_lanes; i++) {
pd[kPRegMaxSize - i - 1] = 1;
}
int pg[kPRegMaxSize] = {0};
for (unsigned i = 0; i < N; i++) {
pg[kPRegMaxSize - i - 1] = pg_inputs[i];
}
int pd_expected[kPRegMaxSize] = {0};
for (unsigned i = 0; i < std::min(active_lanes, N); i++) {
int lane = kPRegMaxSize - i - 1;
pd_expected[lane] = pd[lane] & pg[lane];
}
Initialise(&masm, p0.VnB(), pg);
Initialise(&masm, p1.VnB(), pd);
// The unpredicated form of rdffr has been tested in `WrffrHelper`.
__ Wrffr(p1.VnB());
__ Rdffr(p14.VnB(), p0.Zeroing());
__ Rdffrs(p13.VnB(), p0.Zeroing());
__ Mrs(x8, NZCV);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(pd_expected, p14.VnB());
ASSERT_EQUAL_SVE(pd_expected, p13.VnB());
StatusFlags nzcv_expected =
GetPredTestFlags(pd_expected, pg, core.GetSVELaneCount(kBRegSize));
ASSERT_EQUAL_64(nzcv_expected, x8);
}
}
TEST_SVE(sve_rdffr_rdffrs) {
// clang-format off
int active_lanes_inputs[] = {0, 1, 15, 26, 39, 47, kPRegMaxSize};
int pg_inputs_0[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int pg_inputs_1[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
int pg_inputs_2[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
int pg_inputs_3[] = {0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1};
int pg_inputs_4[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
// clang-format on
for (size_t i = 0; i < ArrayLength(active_lanes_inputs); i++) {
RdffrHelper(config, active_lanes_inputs[i], pg_inputs_0);
RdffrHelper(config, active_lanes_inputs[i], pg_inputs_1);
RdffrHelper(config, active_lanes_inputs[i], pg_inputs_2);
RdffrHelper(config, active_lanes_inputs[i], pg_inputs_3);
RdffrHelper(config, active_lanes_inputs[i], pg_inputs_4);
}
}
typedef void (MacroAssembler::*BrkpFn)(const PRegisterWithLaneSize& pd,
const PRegisterZ& pg,
const PRegisterWithLaneSize& pn,
const PRegisterWithLaneSize& pm);
template <typename Tg, typename Tn, typename Td>
static void BrkpaBrkpbHelper(Test* config,
BrkpFn macro,
BrkpFn macro_set_flags,
const Tg& pg_inputs,
const Tn& pn_inputs,
const Tn& pm_inputs,
const Td& pd_expected) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
PRegister pg = p15;
PRegister pn = p14;
PRegister pm = p13;
Initialise(&masm, pg.VnB(), pg_inputs);
Initialise(&masm, pn.VnB(), pn_inputs);
Initialise(&masm, pm.VnB(), pm_inputs);
// Initialise NZCV to an impossible value, to check that we actually write it.
__ Mov(x10, NZCVFlag);
__ Msr(NZCV, x10);
(masm.*macro_set_flags)(p0.VnB(), pg.Zeroing(), pn.VnB(), pm.VnB());
__ Mrs(x0, NZCV);
(masm.*macro)(p1.VnB(), pg.Zeroing(), pn.VnB(), pm.VnB());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(pd_expected, p0.VnB());
// Check that the flags were properly set.
StatusFlags nzcv_expected =
GetPredTestFlags(pd_expected,
pg_inputs,
core.GetSVELaneCount(kBRegSize));
ASSERT_EQUAL_64(nzcv_expected, x0);
ASSERT_EQUAL_SVE(p0.VnB(), p1.VnB());
}
}
template <typename Tg, typename Tn, typename Td>
static void BrkpaHelper(Test* config,
const Tg& pg_inputs,
const Tn& pn_inputs,
const Tn& pm_inputs,
const Td& pd_expected) {
BrkpaBrkpbHelper(config,
&MacroAssembler::Brkpa,
&MacroAssembler::Brkpas,
pg_inputs,
pn_inputs,
pm_inputs,
pd_expected);
}
template <typename Tg, typename Tn, typename Td>
static void BrkpbHelper(Test* config,
const Tg& pg_inputs,
const Tn& pn_inputs,
const Tn& pm_inputs,
const Td& pd_expected) {
BrkpaBrkpbHelper(config,
&MacroAssembler::Brkpb,
&MacroAssembler::Brkpbs,
pg_inputs,
pn_inputs,
pm_inputs,
pd_expected);
}
TEST_SVE(sve_brkpb) {
// clang-format off
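// If the last active element of `pn` is true, `Brkpb` sets the active lanes
// of `pd` before the first active true lane of `pm` and clears the rest;
// otherwise `pd` becomes all-false.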
// The last active element of `pn` is `true` in all vector length configurations.
// | boundary of 128-bit VL.
// v
int pg_1[] = {1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
int pg_2[] = {1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
int pg_3[] = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
// | highest-numbered lane lowest-numbered lane |
// v v
int pn_1[] = {1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
int pn_2[] = {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
int pn_3[] = {1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1};
int pm_1[] = {1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
int pm_2[] = {0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int pm_3[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
// | first active
// v
int exp_1_1_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
// | first active
// v
int exp_1_2_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
// | first active
// v
int exp_1_3_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
BrkpbHelper(config, pg_1, pn_1, pm_1, exp_1_1_1);
BrkpbHelper(config, pg_1, pn_2, pm_2, exp_1_2_2);
BrkpbHelper(config, pg_1, pn_3, pm_3, exp_1_3_3);
// | first active
// v
int exp_2_1_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
// | first active
// v
int exp_2_2_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
// | first active
// v
int exp_2_3_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1};
BrkpbHelper(config, pg_2, pn_1, pm_2, exp_2_1_2);
BrkpbHelper(config, pg_2, pn_2, pm_3, exp_2_2_3);
BrkpbHelper(config, pg_2, pn_3, pm_1, exp_2_3_1);
// | first active
// v
int exp_3_1_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
// | first active
// v
int exp_3_2_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
// | first active
// v
int exp_3_3_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
BrkpbHelper(config, pg_3, pn_1, pm_3, exp_3_1_3);
BrkpbHelper(config, pg_3, pn_2, pm_1, exp_3_2_1);
BrkpbHelper(config, pg_3, pn_3, pm_2, exp_3_3_2);
// The last active element of `pn` is `false` in all vector length configurations.
// | last active lane when VL > 128 bits.
// v
// | last active lane when VL == 128 bits.
// v
int pg_4[] = {0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
int exp_4_x_x[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
BrkpbHelper(config, pg_4, pn_1, pm_1, exp_4_x_x);
BrkpbHelper(config, pg_4, pn_2, pm_2, exp_4_x_x);
BrkpbHelper(config, pg_4, pn_3, pm_3, exp_4_x_x);
// clang-format on
}
TEST_SVE(sve_brkpa) {
// clang-format off
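// `Brkpa` is as `Brkpb` above, except that the first active true lane of `pm`
// is included in the result.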
// The last active element of `pn` is `true` in all vector length configurations.
// | boundary of 128-bit VL.
// v
int pg_1[] = {1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
int pg_2[] = {1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
int pg_3[] = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
// | highest-numbered lane lowest-numbered lane |
// v v
int pn_1[] = {1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
int pn_2[] = {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
int pn_3[] = {1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1};
int pm_1[] = {1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
int pm_2[] = {0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int pm_3[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
// | first active
// v
int exp_1_1_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0};
// | first active
// v
int exp_1_2_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
// | first active
// v
int exp_1_3_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0};
BrkpaHelper(config, pg_1, pn_1, pm_1, exp_1_1_1);
BrkpaHelper(config, pg_1, pn_2, pm_2, exp_1_2_2);
BrkpaHelper(config, pg_1, pn_3, pm_3, exp_1_3_3);
// | first active
// v
int exp_2_1_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
// | first active
// v
int exp_2_2_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
// | first active
// v
int exp_2_3_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1};
BrkpaHelper(config, pg_2, pn_1, pm_2, exp_2_1_2);
BrkpaHelper(config, pg_2, pn_2, pm_3, exp_2_2_3);
BrkpaHelper(config, pg_2, pn_3, pm_1, exp_2_3_1);
// | first active
// v
int exp_3_1_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1};
// | first active
// v
int exp_3_2_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1};
// | first active
// v
int exp_3_3_2[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
BrkpaHelper(config, pg_3, pn_1, pm_3, exp_3_1_3);
BrkpaHelper(config, pg_3, pn_2, pm_1, exp_3_2_1);
BrkpaHelper(config, pg_3, pn_3, pm_2, exp_3_3_2);
// The last active element of `pn` is `false` in all vector length configurations.
// | last active lane when VL > 128 bits.
// v
// | last active lane when VL == 128 bits.
// v
int pg_4[] = {0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
int exp_4_x_x[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
BrkpaHelper(config, pg_4, pn_1, pm_1, exp_4_x_x);
BrkpaHelper(config, pg_4, pn_2, pm_2, exp_4_x_x);
BrkpaHelper(config, pg_4, pn_3, pm_3, exp_4_x_x);
// clang-format on
}
TEST_SVE(sve_rbit) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t inputs[] = {0xaaaaaaaa55555555, 0xaaaa5555aa55aa55};
InsrHelper(&masm, z0.VnD(), inputs);
__ Ptrue(p1.VnB());
int pred[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1};
Initialise(&masm, p2.VnB(), pred);
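// Applying Rbit twice restores the original value; z0 is checked against
// `inputs` below.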
__ Rbit(z0.VnB(), p1.Merging(), z0.VnB());
__ Rbit(z0.VnB(), p1.Merging(), z0.VnB());
__ Rbit(z1.VnB(), p1.Merging(), z0.VnB());
__ Rbit(z2.VnH(), p1.Merging(), z0.VnH());
__ Rbit(z3.VnS(), p1.Merging(), z0.VnS());
__ Rbit(z4.VnD(), p1.Merging(), z0.VnD());
__ Dup(z5.VnB(), 0x42);
__ Rbit(z5.VnB(), p2.Merging(), z0.VnB());
__ Dup(z6.VnB(), 0x42);
__ Rbit(z6.VnS(), p2.Merging(), z0.VnS());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(inputs, z0.VnD());
uint64_t expected_z1[] = {0x55555555aaaaaaaa, 0x5555aaaa55aa55aa};
ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
uint64_t expected_z2[] = {0x55555555aaaaaaaa, 0x5555aaaaaa55aa55};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
uint64_t expected_z3[] = {0x55555555aaaaaaaa, 0xaaaa5555aa55aa55};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
uint64_t expected_z4[] = {0xaaaaaaaa55555555, 0xaa55aa55aaaa5555};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
uint64_t expected_z5[] = {0x4255425542aa42aa, 0x4255424242aa42aa};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
uint64_t expected_z6[] = {0x55555555aaaaaaaa, 0x42424242aa55aa55};
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
}
}
TEST_SVE(sve_rev_bhw) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t inputs[] = {0xaaaaaaaa55555555, 0xaaaa5555aa55aa55};
InsrHelper(&masm, z0.VnD(), inputs);
__ Ptrue(p1.VnB());
int pred[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1};
Initialise(&masm, p2.VnB(), pred);
__ Revb(z1.VnH(), p1.Merging(), z0.VnH());
__ Revb(z2.VnS(), p1.Merging(), z0.VnS());
__ Revb(z3.VnD(), p1.Merging(), z0.VnD());
__ Revh(z4.VnS(), p1.Merging(), z0.VnS());
__ Revh(z5.VnD(), p1.Merging(), z0.VnD());
__ Revw(z6.VnD(), p1.Merging(), z0.VnD());
__ Dup(z7.VnB(), 0x42);
__ Revb(z7.VnH(), p2.Merging(), z0.VnH());
__ Dup(z8.VnB(), 0x42);
__ Revh(z8.VnS(), p2.Merging(), z0.VnS());
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z1[] = {0xaaaaaaaa55555555, 0xaaaa555555aa55aa};
ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
uint64_t expected_z2[] = {0xaaaaaaaa55555555, 0x5555aaaa55aa55aa};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
uint64_t expected_z3[] = {0x55555555aaaaaaaa, 0x55aa55aa5555aaaa};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
uint64_t expected_z4[] = {0xaaaaaaaa55555555, 0x5555aaaaaa55aa55};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
uint64_t expected_z5[] = {0x55555555aaaaaaaa, 0xaa55aa555555aaaa};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
uint64_t expected_z6[] = {0x55555555aaaaaaaa, 0xaa55aa55aaaa5555};
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
uint64_t expected_z7[] = {0xaaaaaaaa55555555, 0xaaaa424255aa55aa};
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
uint64_t expected_z8[] = {0xaaaaaaaa55555555, 0x42424242aa55aa55};
ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
}
}
TEST_SVE(sve_ftssel) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t in[] = {0x1111777766665555, 0xaaaabbbbccccdddd};
uint64_t q[] = {0x0001000300000002, 0x0001000200000003};
InsrHelper(&masm, z0.VnD(), in);
InsrHelper(&masm, z1.VnD(), q);
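// For each element, bit 0 of `q` selects 1.0 in place of the corresponding
// `in` element, and bit 1 flips the sign of the result.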
__ Ftssel(z2.VnH(), z0.VnH(), z1.VnH());
__ Ftssel(z3.VnS(), z0.VnS(), z1.VnS());
__ Ftssel(z4.VnD(), z0.VnD(), z1.VnD());
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z2[] = {0x3c00bc006666d555, 0x3c003bbbccccbc00};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
uint64_t expected_z3[] = {0xbf800000e6665555, 0x2aaabbbbbf800000};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
uint64_t expected_z4[] = {0x9111777766665555, 0xbff0000000000000};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
}
}
TEST_SVE(sve_fexpa) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t in0[] = {0x3ff0000000000000, 0x3ff0000000011001};
uint64_t in1[] = {0x3ff000000002200f, 0xbff000000003301f};
uint64_t in2[] = {0xbff000000004403f, 0x3ff0000000055040};
uint64_t in3[] = {0x3f800000bf800001, 0x3f80000f3f80001f};
uint64_t in4[] = {0x3f80002f3f82203f, 0xbf8000403f833041};
uint64_t in5[] = {0x3c003c01bc00bc07, 0x3c08bc0f3c1fbc20};
InsrHelper(&masm, z0.VnD(), in0);
InsrHelper(&masm, z1.VnD(), in1);
InsrHelper(&masm, z2.VnD(), in2);
InsrHelper(&masm, z3.VnD(), in3);
InsrHelper(&masm, z4.VnD(), in4);
InsrHelper(&masm, z5.VnD(), in5);
__ Fexpa(z6.VnD(), z0.VnD());
__ Fexpa(z7.VnD(), z1.VnD());
__ Fexpa(z8.VnD(), z2.VnD());
__ Fexpa(z9.VnS(), z3.VnS());
__ Fexpa(z10.VnS(), z4.VnS());
__ Fexpa(z11.VnH(), z5.VnH());
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z6[] = {0x0000000000000000, 0x44002c9a3e778061};
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
uint64_t expected_z7[] = {0x0802d285a6e4030b, 0x4c06623882552225};
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
uint64_t expected_z8[] = {0x100fa7c1819e90d8, 0x5410000000000000};
ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
uint64_t expected_z9[] = {0x00000000000164d2, 0x0016942d003311c4};
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
uint64_t expected_z10[] = {0x0054f35b407d3e0c, 0x00800000608164d2};
ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
uint64_t expected_z11[] = {0x00000016000000a8, 0x00c2018903d40400};
ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
}
}
TEST_SVE(sve_rev_p) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
Initialise(&masm,
p0.VnB(),
0xabcdabcdabcdabcd,
0xabcdabcdabcdabcd,
0xabcdabcdabcdabcd,
0xabcdabcdabcdabcd);
__ Rev(p1.VnB(), p0.VnB());
__ Rev(p2.VnH(), p0.VnH());
__ Rev(p3.VnS(), p0.VnS());
__ Rev(p4.VnD(), p0.VnD());
END();
if (CAN_RUN()) {
RUN();
int p1_expected[] = {1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1};
ASSERT_EQUAL_SVE(p1_expected, p1.VnB());
int p2_expected[] = {0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0};
ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
int p3_expected[] = {1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0};
ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
int p4_expected[] = {1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1};
ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
}
}
TEST_SVE(sve_trn_p_bh) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
Initialise(&masm, p0.VnB(), 0xa5a55a5a);
__ Pfalse(p1.VnB());
__ Trn1(p2.VnB(), p0.VnB(), p0.VnB());
__ Trn2(p3.VnB(), p0.VnB(), p0.VnB());
__ Trn1(p4.VnB(), p1.VnB(), p0.VnB());
__ Trn2(p5.VnB(), p1.VnB(), p0.VnB());
__ Trn1(p6.VnB(), p0.VnB(), p1.VnB());
__ Trn2(p7.VnB(), p0.VnB(), p1.VnB());
__ Trn1(p8.VnH(), p0.VnH(), p0.VnH());
__ Trn2(p9.VnH(), p0.VnH(), p0.VnH());
__ Trn1(p10.VnH(), p1.VnH(), p0.VnH());
__ Trn2(p11.VnH(), p1.VnH(), p0.VnH());
__ Trn1(p12.VnH(), p0.VnH(), p1.VnH());
__ Trn2(p13.VnH(), p0.VnH(), p1.VnH());
END();
if (CAN_RUN()) {
RUN();
int p2_expected[] = {1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0};
int p3_expected[] = {0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1};
ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
int p4_expected[] = {1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0};
int p5_expected[] = {0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0};
ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
ASSERT_EQUAL_SVE(p5_expected, p5.VnB());
int p6_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0};
int p7_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1};
ASSERT_EQUAL_SVE(p6_expected, p6.VnB());
ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
int p8_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
int p9_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
ASSERT_EQUAL_SVE(p9_expected, p9.VnB());
int p10_expected[] = {0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0};
int p11_expected[] = {0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0};
ASSERT_EQUAL_SVE(p10_expected, p10.VnB());
ASSERT_EQUAL_SVE(p11_expected, p11.VnB());
int p12_expected[] = {0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0};
int p13_expected[] = {0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0};
ASSERT_EQUAL_SVE(p12_expected, p12.VnB());
ASSERT_EQUAL_SVE(p13_expected, p13.VnB());
}
}
TEST_SVE(sve_trn_p_sd) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
Initialise(&masm, p0.VnB(), 0x55a55aaa);
__ Pfalse(p1.VnB());
__ Trn1(p2.VnS(), p0.VnS(), p0.VnS());
__ Trn2(p3.VnS(), p0.VnS(), p0.VnS());
__ Trn1(p4.VnS(), p1.VnS(), p0.VnS());
__ Trn2(p5.VnS(), p1.VnS(), p0.VnS());
__ Trn1(p6.VnS(), p0.VnS(), p1.VnS());
__ Trn2(p7.VnS(), p0.VnS(), p1.VnS());
__ Trn1(p8.VnD(), p0.VnD(), p0.VnD());
__ Trn2(p9.VnD(), p0.VnD(), p0.VnD());
__ Trn1(p10.VnD(), p1.VnD(), p0.VnD());
__ Trn2(p11.VnD(), p1.VnD(), p0.VnD());
__ Trn1(p12.VnD(), p0.VnD(), p1.VnD());
__ Trn2(p13.VnD(), p0.VnD(), p1.VnD());
END();
if (CAN_RUN()) {
RUN();
int p2_expected[] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
int p3_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
int p4_expected[] = {1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0};
int p5_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0};
ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
ASSERT_EQUAL_SVE(p5_expected, p5.VnB());
int p6_expected[] = {0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0};
int p7_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0};
ASSERT_EQUAL_SVE(p6_expected, p6.VnB());
ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
int p8_expected[] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
int p9_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
ASSERT_EQUAL_SVE(p9_expected, p9.VnB());
int p10_expected[] = {1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int p11_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
ASSERT_EQUAL_SVE(p10_expected, p10.VnB());
ASSERT_EQUAL_SVE(p11_expected, p11.VnB());
int p12_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0};
int p13_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0};
ASSERT_EQUAL_SVE(p12_expected, p12.VnB());
ASSERT_EQUAL_SVE(p13_expected, p13.VnB());
}
}
TEST_SVE(sve_zip_p_bh) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
Initialise(&masm,
p0.VnB(),
0x5a5a5a5a5a5a5a5a,
0x5a5a5a5a5a5a5a5a,
0x5a5a5a5a5a5a5a5a,
0x5a5a5a5a5a5a5a5a);
__ Pfalse(p1.VnB());
__ Zip1(p2.VnB(), p0.VnB(), p0.VnB());
__ Zip2(p3.VnB(), p0.VnB(), p0.VnB());
__ Zip1(p4.VnB(), p1.VnB(), p0.VnB());
__ Zip2(p5.VnB(), p1.VnB(), p0.VnB());
__ Zip1(p6.VnB(), p0.VnB(), p1.VnB());
__ Zip2(p7.VnB(), p0.VnB(), p1.VnB());
__ Zip1(p8.VnH(), p0.VnH(), p0.VnH());
__ Zip2(p9.VnH(), p0.VnH(), p0.VnH());
__ Zip1(p10.VnH(), p1.VnH(), p0.VnH());
__ Zip2(p11.VnH(), p1.VnH(), p0.VnH());
__ Zip1(p12.VnH(), p0.VnH(), p1.VnH());
__ Zip2(p13.VnH(), p0.VnH(), p1.VnH());
END();
if (CAN_RUN()) {
RUN();
int p2_expected[] = {0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0};
int p3_expected[] = {0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0};
ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
int p4_expected[] = {0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0};
int p5_expected[] = {0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0};
ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
ASSERT_EQUAL_SVE(p5_expected, p5.VnB());
int p6_expected[] = {0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0};
int p7_expected[] = {0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0};
ASSERT_EQUAL_SVE(p6_expected, p6.VnB());
ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
int p8_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
int p9_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
ASSERT_EQUAL_SVE(p9_expected, p9.VnB());
int p10_expected[] = {0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
int p11_expected[] = {0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
ASSERT_EQUAL_SVE(p10_expected, p10.VnB());
ASSERT_EQUAL_SVE(p11_expected, p11.VnB());
int p12_expected[] = {0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0};
int p13_expected[] = {0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0};
ASSERT_EQUAL_SVE(p12_expected, p12.VnB());
ASSERT_EQUAL_SVE(p13_expected, p13.VnB());
}
}
TEST_SVE(sve_zip_p_sd) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
Initialise(&masm,
p0.VnB(),
0x5a5a5a5a5a5a5a5a,
0x5a5a5a5a5a5a5a5a,
0x5a5a5a5a5a5a5a5a,
0x5a5a5a5a5a5a5a5a);
__ Pfalse(p1.VnB());
__ Zip1(p2.VnS(), p0.VnS(), p0.VnS());
__ Zip2(p3.VnS(), p0.VnS(), p0.VnS());
__ Zip1(p4.VnS(), p1.VnS(), p0.VnS());
__ Zip2(p5.VnS(), p1.VnS(), p0.VnS());
__ Zip1(p6.VnS(), p0.VnS(), p1.VnS());
__ Zip2(p7.VnS(), p0.VnS(), p1.VnS());
__ Zip1(p8.VnD(), p0.VnD(), p0.VnD());
__ Zip2(p9.VnD(), p0.VnD(), p0.VnD());
__ Zip1(p10.VnD(), p1.VnD(), p0.VnD());
__ Zip2(p11.VnD(), p1.VnD(), p0.VnD());
__ Zip1(p12.VnD(), p0.VnD(), p1.VnD());
__ Zip2(p13.VnD(), p0.VnD(), p1.VnD());
END();
if (CAN_RUN()) {
RUN();
int p2_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
int p3_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
int p4_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0};
int p5_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0};
ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
ASSERT_EQUAL_SVE(p5_expected, p5.VnB());
int p6_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0};
int p7_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0};
ASSERT_EQUAL_SVE(p6_expected, p6.VnB());
ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
int p8_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
int p9_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
ASSERT_EQUAL_SVE(p9_expected, p9.VnB());
int p10_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int p11_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
ASSERT_EQUAL_SVE(p10_expected, p10.VnB());
ASSERT_EQUAL_SVE(p11_expected, p11.VnB());
int p12_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0};
int p13_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0};
ASSERT_EQUAL_SVE(p12_expected, p12.VnB());
ASSERT_EQUAL_SVE(p13_expected, p13.VnB());
}
}
TEST_SVE(sve_uzp_p) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
Initialise(&masm,
p0.VnB(),
0xf0f0ff00ffff0000,
0x4242424242424242,
0x5a5a5a5a5a5a5a5a,
0x0123456789abcdef);
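// Interleave p0 with its reverse using Zip1/Zip2, then check that Uzp1/Uzp2
// recover the original operands at each lane size.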
__ Rev(p1.VnB(), p0.VnB());
__ Zip1(p2.VnB(), p0.VnB(), p1.VnB());
__ Zip2(p3.VnB(), p0.VnB(), p1.VnB());
__ Uzp1(p4.VnB(), p2.VnB(), p3.VnB());
__ Uzp2(p5.VnB(), p2.VnB(), p3.VnB());
__ Zip1(p2.VnH(), p0.VnH(), p1.VnH());
__ Zip2(p3.VnH(), p0.VnH(), p1.VnH());
__ Uzp1(p6.VnH(), p2.VnH(), p3.VnH());
__ Uzp2(p7.VnH(), p2.VnH(), p3.VnH());
__ Zip1(p2.VnS(), p0.VnS(), p1.VnS());
__ Zip2(p3.VnS(), p0.VnS(), p1.VnS());
__ Uzp1(p8.VnS(), p2.VnS(), p3.VnS());
__ Uzp2(p9.VnS(), p2.VnS(), p3.VnS());
__ Zip1(p2.VnD(), p0.VnD(), p1.VnD());
__ Zip2(p3.VnD(), p0.VnD(), p1.VnD());
__ Uzp1(p10.VnD(), p2.VnD(), p3.VnD());
__ Uzp2(p11.VnD(), p2.VnD(), p3.VnD());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(p0, p4);
ASSERT_EQUAL_SVE(p1, p5);
ASSERT_EQUAL_SVE(p0, p6);
ASSERT_EQUAL_SVE(p1, p7);
ASSERT_EQUAL_SVE(p0, p8);
ASSERT_EQUAL_SVE(p1, p9);
ASSERT_EQUAL_SVE(p0, p10);
ASSERT_EQUAL_SVE(p1, p11);
}
}
TEST_SVE(sve_punpk) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
auto get_64_bits_at = [](int byte_index) -> uint64_t {
// Each 8-bit chunk has the value 0x50 + the byte index of the chunk.
return 0x5756555453525150 + (0x0101010101010101 * byte_index);
};
Initialise(&masm,
p0.VnB(),
get_64_bits_at(24),
get_64_bits_at(16),
get_64_bits_at(8),
get_64_bits_at(0));
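// Punpklo/Punpkhi widen the low/high half of a byte-granule predicate to
// halfword granules, so each source bit lands in an even-numbered B lane with
// a zero in the odd-numbered lane above it.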
__ Punpklo(p1.VnH(), p0.VnB());
__ Punpkhi(p2.VnH(), p0.VnB());
END();
if (CAN_RUN()) {
RUN();
int pl = config->sve_vl_in_bits() / kZRegBitsPerPRegBit;
// For simplicity, just test the bottom 64 H-sized lanes.
uint64_t p1_h_bits = get_64_bits_at(0);
uint64_t p2_h_bits = get_64_bits_at(pl / (2 * 8));
int p1_expected[64];
int p2_expected[64];
for (size_t i = 0; i < 64; i++) {
p1_expected[63 - i] = (p1_h_bits >> i) & 1;
p2_expected[63 - i] = (p2_h_bits >> i) & 1;
}
// Testing `VnH` ensures that odd-numbered B lanes are zero.
ASSERT_EQUAL_SVE(p1_expected, p1.VnH());
ASSERT_EQUAL_SVE(p2_expected, p2.VnH());
}
}
typedef void (MacroAssembler::*BrkFn)(const PRegisterWithLaneSize& pd,
const PRegister& pg,
const PRegisterWithLaneSize& pn);
typedef void (MacroAssembler::*BrksFn)(const PRegisterWithLaneSize& pd,
const PRegisterZ& pg,
const PRegisterWithLaneSize& pn);
template <typename T, size_t N>
static void BrkaBrkbHelper(Test* config,
BrkFn macro,
BrksFn macro_set_flags,
const T (&pd_inputs)[N],
const T (&pg_inputs)[N],
const T (&pn_inputs)[N],
const T (&pd_z_expected)[N]) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
PRegister pg = p10;
PRegister pn = p9;
PRegister pd_z = p0;
PRegister pd_z_s = p1;
PRegister pd_m = p2;
Initialise(&masm, pg.VnB(), pg_inputs);
Initialise(&masm, pn.VnB(), pn_inputs);
Initialise(&masm, pd_m.VnB(), pd_inputs);
// Initialise NZCV to an impossible value, to check that we actually write it.
__ Mov(x10, NZCVFlag);
__ Msr(NZCV, x10);
(masm.*macro)(pd_z.VnB(), pg.Zeroing(), pn.VnB());
(masm.*macro_set_flags)(pd_z_s.VnB(), pg.Zeroing(), pn.VnB());
__ Mrs(x0, NZCV);
(masm.*macro)(pd_m.VnB(), pg.Merging(), pn.VnB());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(pd_z_expected, pd_z.VnB());
// Check that the flags were properly set.
StatusFlags nzcv_expected =
GetPredTestFlags(pd_z_expected,
pg_inputs,
core.GetSVELaneCount(kBRegSize));
ASSERT_EQUAL_64(nzcv_expected, x0);
ASSERT_EQUAL_SVE(pd_z.VnB(), pd_z_s.VnB());
T pd_m_expected[N];
// Set expected `pd` result on merging predication.
for (size_t i = 0; i < N; i++) {
pd_m_expected[i] = pg_inputs[i] ? pd_z_expected[i] : pd_inputs[i];
}
ASSERT_EQUAL_SVE(pd_m_expected, pd_m.VnB());
}
}
template <typename T>
static void BrkaHelper(Test* config,
const T& pd_inputs,
const T& pg_inputs,
const T& pn_inputs,
const T& pd_expected) {
BrkaBrkbHelper(config,
&MacroAssembler::Brka,
&MacroAssembler::Brkas,
pd_inputs,
pg_inputs,
pn_inputs,
pd_expected);
}
TEST_SVE(sve_brka) {
// clang-format off
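// `Brka` sets the active lanes of `pd` up to and including the first active
// true lane of `pn`, and `Brkb` up to but not including it. With zeroing
// predication the remaining lanes become false; with merging they keep their
// previous values.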
// | boundary of 128-bit VL.
// v
int pd[] = {1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
// | highest-numbered lane lowest-numbered lane |
// v v
int pg_1[] = {1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
int pg_2[] = {1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
int pn_1[] = {1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
int pn_2[] = {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int pn_3[] = {1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1};
// | first break
// v
int exp_1_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0};
// | first break
// v
int exp_1_2[] = {0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
// | first break
// v
int exp_1_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
BrkaHelper(config, pd, pg_1, pn_1, exp_1_1);
BrkaHelper(config, pd, pg_1, pn_2, exp_1_2);
BrkaHelper(config, pd, pg_1, pn_3, exp_1_3);
// | first break
// v
int exp_2_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1};
// | first break
// v
int exp_2_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
// | first break
// v
int exp_2_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
BrkaHelper(config, pd, pg_2, pn_1, exp_2_1);
BrkaHelper(config, pd, pg_2, pn_2, exp_2_2);
BrkaHelper(config, pd, pg_2, pn_3, exp_2_3);
// An all-inactive governing predicate with zeroing predication sets the
// destination predicate to all-false.
int pg_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp_3_x[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
BrkaHelper(config, pd, pg_3, pn_1, exp_3_x);
BrkaHelper(config, pd, pg_3, pn_2, exp_3_x);
BrkaHelper(config, pd, pg_3, pn_3, exp_3_x);
// clang-format on
}
template <typename T>
static void BrkbHelper(Test* config,
const T& pd_inputs,
const T& pg_inputs,
const T& pn_inputs,
const T& pd_expected) {
BrkaBrkbHelper(config,
&MacroAssembler::Brkb,
&MacroAssembler::Brkbs,
pd_inputs,
pg_inputs,
pn_inputs,
pd_expected);
}
TEST_SVE(sve_brkb) {
// clang-format off
// | boundary of 128-bit VL.
// v
int pd[] = {1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
// | highest-numbered lane lowest-numbered lane |
// v v
int pg_1[] = {1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
int pg_2[] = {1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
int pn_1[] = {1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
int pn_2[] = {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int pn_3[] = {1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1};
// | first break
// v
int exp_1_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
// | first break
// v
int exp_1_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
// | first break
// v
int exp_1_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0};
BrkbHelper(config, pd, pg_1, pn_1, exp_1_1);
BrkbHelper(config, pd, pg_1, pn_2, exp_1_2);
BrkbHelper(config, pd, pg_1, pn_3, exp_1_3);
// | first break
// v
int exp_2_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1};
// | first break
// v
int exp_2_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
// | first break
// v
int exp_2_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
BrkbHelper(config, pd, pg_2, pn_1, exp_2_1);
BrkbHelper(config, pd, pg_2, pn_2, exp_2_2);
BrkbHelper(config, pd, pg_2, pn_3, exp_2_3);
// An all-inactive governing predicate with zeroing predication sets the
// destination predicate to all-false.
int pg_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp_3_x[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
BrkbHelper(config, pd, pg_3, pn_1, exp_3_x);
BrkbHelper(config, pd, pg_3, pn_2, exp_3_x);
BrkbHelper(config, pd, pg_3, pn_3, exp_3_x);
// clang-format on
}
typedef void (MacroAssembler::*BrknFn)(const PRegisterWithLaneSize& pd,
const PRegisterZ& pg,
const PRegisterWithLaneSize& pn,
const PRegisterWithLaneSize& pm);
typedef void (MacroAssembler::*BrknsFn)(const PRegisterWithLaneSize& pd,
const PRegisterZ& pg,
const PRegisterWithLaneSize& pn,
const PRegisterWithLaneSize& pm);
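// Brkn propagates a break to the next partition: if the last active element
// of `pn` is true, `pd` is a copy of `pm`; otherwise `pd` becomes all-false.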
enum BrknDstPredicateState { kAllFalse, kUnchanged };
template <typename T, size_t N>
static void BrknHelper(Test* config,
const T (&pd_inputs)[N],
const T (&pg_inputs)[N],
const T (&pn_inputs)[N],
const T (&pm_inputs)[N],
BrknDstPredicateState expected_pd_state) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
PRegister pg = p10;
PRegister pn = p9;
PRegister pm = p8;
PRegister pdm = p0;
PRegister pd = p1;
PRegister pd_s = p2;
Initialise(&masm, pg.VnB(), pg_inputs);
Initialise(&masm, pn.VnB(), pn_inputs);
Initialise(&masm, pm.VnB(), pm_inputs);
Initialise(&masm, pdm.VnB(), pm_inputs);
Initialise(&masm, pd.VnB(), pd_inputs);
Initialise(&masm, pd_s.VnB(), pd_inputs);
// Initialise NZCV to an impossible value, to check that we actually write it.
__ Mov(x10, NZCVFlag);
__ Msr(NZCV, x10);
__ Brkn(pdm.VnB(), pg.Zeroing(), pn.VnB(), pdm.VnB());
// The non-aliased form: `pd` does not alias `pm`.
__ Brkn(pd.VnB(), pg.Zeroing(), pn.VnB(), pm.VnB());
__ Brkns(pd_s.VnB(), pg.Zeroing(), pn.VnB(), pm.VnB());
__ Mrs(x0, NZCV);
END();
if (CAN_RUN()) {
RUN();
T all_false[N] = {0};
if (expected_pd_state == kAllFalse) {
ASSERT_EQUAL_SVE(all_false, pd.VnB());
} else {
ASSERT_EQUAL_SVE(pm_inputs, pd.VnB());
}
ASSERT_EQUAL_SVE(pm_inputs, pm.VnB());
T all_true[N];
for (size_t i = 0; i < ArrayLength(all_true); i++) {
all_true[i] = 1;
}
// Check that the flags were properly set.
StatusFlags nzcv_expected =
GetPredTestFlags((expected_pd_state == kAllFalse) ? all_false
: pm_inputs,
all_true,
core.GetSVELaneCount(kBRegSize));
ASSERT_EQUAL_64(nzcv_expected, x0);
ASSERT_EQUAL_SVE(pd.VnB(), pdm.VnB());
ASSERT_EQUAL_SVE(pd.VnB(), pd_s.VnB());
}
}
TEST_SVE(sve_brkn) {
int pd[] = {1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
int pm[] = {0, 1, 1, 1, 1, 0, 0, 1, 0, 1};
int pg_1[] = {1, 1, 0, 0, 1, 0, 1, 1, 0, 0};
int pg_2[] = {0, 0, 0, 1, 1, 1, 0, 0, 1, 1};
int pg_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int pn_1[] = {1, 0, 0, 0, 0, 1, 1, 0, 0, 0};
int pn_2[] = {0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
int pn_3[] = {0, 0, 0, 0, 1, 1, 0, 0, 1, 1};
BrknHelper(config, pd, pg_1, pn_1, pm, kUnchanged);
BrknHelper(config, pd, pg_1, pn_2, pm, kAllFalse);
BrknHelper(config, pd, pg_1, pn_3, pm, kAllFalse);
BrknHelper(config, pd, pg_2, pn_1, pm, kAllFalse);
BrknHelper(config, pd, pg_2, pn_2, pm, kUnchanged);
BrknHelper(config, pd, pg_2, pn_3, pm, kAllFalse);
BrknHelper(config, pd, pg_3, pn_1, pm, kAllFalse);
BrknHelper(config, pd, pg_3, pn_2, pm, kAllFalse);
BrknHelper(config, pd, pg_3, pn_3, pm, kAllFalse);
}
TEST_SVE(sve_trn) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
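// Trn1 interleaves the even-numbered lanes of the two sources; Trn2 the
// odd-numbered lanes.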
uint64_t in0[] = {0xffeeddccbbaa9988, 0x7766554433221100};
uint64_t in1[] = {0xaa55aa55aa55aa55, 0x55aa55aa55aa55aa};
InsrHelper(&masm, z0.VnD(), in0);
InsrHelper(&masm, z1.VnD(), in1);
__ Trn1(z2.VnB(), z0.VnB(), z1.VnB());
__ Trn2(z3.VnB(), z0.VnB(), z1.VnB());
__ Trn1(z4.VnH(), z0.VnH(), z1.VnH());
__ Trn2(z5.VnH(), z0.VnH(), z1.VnH());
__ Trn1(z6.VnS(), z0.VnS(), z1.VnS());
__ Trn2(z7.VnS(), z0.VnS(), z1.VnS());
__ Trn1(z8.VnD(), z0.VnD(), z1.VnD());
__ Trn2(z9.VnD(), z0.VnD(), z1.VnD());
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z2[] = {0x55ee55cc55aa5588, 0xaa66aa44aa22aa00};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
uint64_t expected_z3[] = {0xaaffaaddaabbaa99, 0x5577555555335511};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
uint64_t expected_z4[] = {0xaa55ddccaa559988, 0x55aa554455aa1100};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
uint64_t expected_z5[] = {0xaa55ffeeaa55bbaa, 0x55aa776655aa3322};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
uint64_t expected_z6[] = {0xaa55aa55bbaa9988, 0x55aa55aa33221100};
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
uint64_t expected_z7[] = {0xaa55aa55ffeeddcc, 0x55aa55aa77665544};
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
uint64_t expected_z8[] = {0x55aa55aa55aa55aa, 0x7766554433221100};
ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
uint64_t expected_z9[] = {0xaa55aa55aa55aa55, 0xffeeddccbbaa9988};
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
}
}
TEST_SVE(sve_zip_uzp) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
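// Zip1/Zip2 interleave the low/high halves of the two sources; Uzp1/Uzp2
// extract the even/odd lanes, undoing the interleave.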
__ Dup(z0.VnD(), 0xffeeddccbbaa9988);
__ Insr(z0.VnD(), 0x7766554433221100);
__ Dup(z1.VnD(), 0xaa55aa55aa55aa55);
__ Insr(z1.VnD(), 0x55aa55aa55aa55aa);
__ Zip1(z2.VnB(), z0.VnB(), z1.VnB());
__ Zip2(z3.VnB(), z0.VnB(), z1.VnB());
__ Zip1(z4.VnH(), z0.VnH(), z1.VnH());
__ Zip2(z5.VnH(), z0.VnH(), z1.VnH());
__ Zip1(z6.VnS(), z0.VnS(), z1.VnS());
__ Zip2(z7.VnS(), z0.VnS(), z1.VnS());
__ Zip1(z8.VnD(), z0.VnD(), z1.VnD());
__ Zip2(z9.VnD(), z0.VnD(), z1.VnD());
__ Uzp1(z10.VnB(), z2.VnB(), z3.VnB());
__ Uzp2(z11.VnB(), z2.VnB(), z3.VnB());
__ Uzp1(z12.VnH(), z4.VnH(), z5.VnH());
__ Uzp2(z13.VnH(), z4.VnH(), z5.VnH());
__ Uzp1(z14.VnS(), z6.VnS(), z7.VnS());
__ Uzp2(z15.VnS(), z6.VnS(), z7.VnS());
__ Uzp1(z16.VnD(), z8.VnD(), z9.VnD());
__ Uzp2(z17.VnD(), z8.VnD(), z9.VnD());
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z2[] = {0x5577aa665555aa44, 0x5533aa225511aa00};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
uint64_t expected_z3[] = {0xaaff55eeaadd55cc, 0xaabb55aaaa995588};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
uint64_t expected_z4[] = {0x55aa776655aa5544, 0x55aa332255aa1100};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
uint64_t expected_z5[] = {0xaa55ffeeaa55ddcc, 0xaa55bbaaaa559988};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
uint64_t expected_z6[] = {0x55aa55aa77665544, 0x55aa55aa33221100};
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
uint64_t expected_z7[] = {0xaa55aa55ffeeddcc, 0xaa55aa55bbaa9988};
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
uint64_t expected_z8[] = {0x55aa55aa55aa55aa, 0x7766554433221100};
ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
uint64_t expected_z9[] = {0xaa55aa55aa55aa55, 0xffeeddccbbaa9988};
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
// Check uzp is the opposite of zip.
ASSERT_EQUAL_SVE(z0.VnD(), z10.VnD());
ASSERT_EQUAL_SVE(z1.VnD(), z11.VnD());
ASSERT_EQUAL_SVE(z0.VnD(), z12.VnD());
ASSERT_EQUAL_SVE(z1.VnD(), z13.VnD());
ASSERT_EQUAL_SVE(z0.VnD(), z14.VnD());
ASSERT_EQUAL_SVE(z1.VnD(), z15.VnD());
ASSERT_EQUAL_SVE(z0.VnD(), z16.VnD());
ASSERT_EQUAL_SVE(z1.VnD(), z17.VnD());
}
}
TEST_SVE(sve_fcadd) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
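// Fcadd adds the second operand rotated by 90 or 270 degrees in the complex
// plane, where adjacent lanes hold the (real, imaginary) parts of each
// complex number.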
__ Dup(z30.VnS(), 0);
__ Ptrue(p0.VnB());
__ Pfalse(p1.VnB());
__ Zip1(p2.VnH(), p0.VnH(), p1.VnH()); // Real elements.
__ Zip1(p3.VnH(), p1.VnH(), p0.VnH()); // Imaginary elements.
__ Fdup(z0.VnH(), 10.0); // 10i + 10
__ Fdup(z1.VnH(), 5.0); // 5i + 5
__ Index(z7.VnH(), 1, 1);
__ Scvtf(z7.VnH(), p0.Merging(), z7.VnH()); // Ai + B
__ Sel(z2.VnH(), p3, z1.VnH(), z30.VnH()); // 5i + 0
__ Sel(z3.VnH(), p2, z1.VnH(), z30.VnH()); // 0i + 5
__ Sel(z7.VnH(), p3, z7.VnH(), z0.VnH()); // Ai + 10
__ Mov(z8, z7);
__ Ext(z8.VnB(), z8.VnB(), z8.VnB(), 2);
__ Sel(z8.VnH(), p2, z8.VnH(), z30.VnH()); // 0i + A
// (10i + 10) + rotate(5i + 0, 90)
// = (10i + 10) + (0i - 5)
// = 10i + 5
__ Fcadd(z4.VnH(), p0.Merging(), z0.VnH(), z2.VnH(), 90);
// (10i + 5) + rotate(0i + 5, 270)
// = (10i + 5) + (-5i + 0)
// = 5i + 5
__ Fcadd(z4.VnH(), p0.Merging(), z4.VnH(), z3.VnH(), 270);
// The same calculation, but selecting real/imaginary using predication.
__ Mov(z5, z0);
__ Fcadd(z5.VnH(), p2.Merging(), z5.VnH(), z1.VnH(), 90);
__ Fcadd(z5.VnH(), p3.Merging(), z5.VnH(), z1.VnH(), 270);
// Reference calculation: (10i + 10) - (5i + 5)
__ Fsub(z6.VnH(), z0.VnH(), z1.VnH());
// Calculation using varying imaginary values.
// (Ai + 10) + rotate(5i + 0, 90)
// = (Ai + 10) + (0i - 5)
// = Ai + 5
__ Fcadd(z7.VnH(), p0.Merging(), z7.VnH(), z2.VnH(), 90);
// (Ai + 5) + rotate(0i + A, 270)
// = (Ai + 5) + (-Ai + 0)
// = 5
__ Fcadd(z7.VnH(), p0.Merging(), z7.VnH(), z8.VnH(), 270);
// Repeated, but for wider elements.
__ Zip1(p2.VnS(), p0.VnS(), p1.VnS());
__ Zip1(p3.VnS(), p1.VnS(), p0.VnS());
__ Fdup(z0.VnS(), 42.0);
__ Fdup(z1.VnS(), 21.0);
__ Index(z11.VnS(), 1, 1);
__ Scvtf(z11.VnS(), p0.Merging(), z11.VnS());
__ Sel(z2.VnS(), p3, z1.VnS(), z30.VnS());
__ Sel(z29.VnS(), p2, z1.VnS(), z30.VnS());
__ Sel(z11.VnS(), p3, z11.VnS(), z0.VnS());
__ Mov(z12, z11);
__ Ext(z12.VnB(), z12.VnB(), z12.VnB(), 4);
__ Sel(z12.VnS(), p2, z12.VnS(), z30.VnS());
__ Fcadd(z8.VnS(), p0.Merging(), z0.VnS(), z2.VnS(), 90);
__ Fcadd(z8.VnS(), p0.Merging(), z8.VnS(), z29.VnS(), 270);
__ Mov(z9, z0);
__ Fcadd(z9.VnS(), p2.Merging(), z9.VnS(), z1.VnS(), 90);
__ Fcadd(z9.VnS(), p3.Merging(), z9.VnS(), z1.VnS(), 270);
__ Fsub(z10.VnS(), z0.VnS(), z1.VnS());
__ Fcadd(z11.VnS(), p0.Merging(), z11.VnS(), z2.VnS(), 90);
__ Fcadd(z11.VnS(), p0.Merging(), z11.VnS(), z12.VnS(), 270);
__ Zip1(p2.VnD(), p0.VnD(), p1.VnD());
__ Zip1(p3.VnD(), p1.VnD(), p0.VnD());
__ Fdup(z0.VnD(), -42.0);
__ Fdup(z1.VnD(), -21.0);
__ Index(z15.VnD(), 1, 1);
__ Scvtf(z15.VnD(), p0.Merging(), z15.VnD());
__ Sel(z2.VnD(), p3, z1.VnD(), z30.VnD());
__ Sel(z28.VnD(), p2, z1.VnD(), z30.VnD());
__ Sel(z15.VnD(), p3, z15.VnD(), z0.VnD());
__ Mov(z16, z15);
__ Ext(z16.VnB(), z16.VnB(), z16.VnB(), 8);
__ Sel(z16.VnD(), p2, z16.VnD(), z30.VnD());
__ Fcadd(z12.VnD(), p0.Merging(), z0.VnD(), z2.VnD(), 90);
__ Fcadd(z12.VnD(), p0.Merging(), z12.VnD(), z28.VnD(), 270);
__ Mov(z13, z0);
__ Fcadd(z13.VnD(), p2.Merging(), z13.VnD(), z1.VnD(), 90);
__ Fcadd(z13.VnD(), p3.Merging(), z13.VnD(), z1.VnD(), 270);
__ Fsub(z14.VnD(), z0.VnD(), z1.VnD());
__ Fcadd(z15.VnD(), p0.Merging(), z15.VnD(), z2.VnD(), 90);
__ Fcadd(z15.VnD(), p0.Merging(), z15.VnD(), z16.VnD(), 270);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z6.VnH(), z4.VnH());
ASSERT_EQUAL_SVE(z6.VnH(), z5.VnH());
ASSERT_EQUAL_SVE(z3.VnH(), z7.VnH());
ASSERT_EQUAL_SVE(z10.VnS(), z8.VnS());
ASSERT_EQUAL_SVE(z10.VnS(), z9.VnS());
ASSERT_EQUAL_SVE(z29.VnS(), z11.VnS());
ASSERT_EQUAL_SVE(z14.VnD(), z12.VnD());
ASSERT_EQUAL_SVE(z14.VnD(), z13.VnD());
ASSERT_EQUAL_SVE(z28.VnS(), z15.VnS());
}
}
TEST_SVE(sve_fcmla_index) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
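// The indexed form of Fcmla multiplies every complex pair by a single
// complex pair selected by index from each 128-bit segment of the second
// source vector.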
__ Ptrue(p0.VnB());
__ Fdup(z0.VnH(), 10.0);
__ Fdup(z2.VnH(), 2.0);
__ Zip1(z0.VnH(), z0.VnH(), z2.VnH());
// Duplicate complex numbers across z2 segments. First segment has 1i+0,
// second has 3i+2, etc.
__ Index(z1.VnH(), 0, 1);
__ Scvtf(z1.VnH(), p0.Merging(), z1.VnH());
__ Zip1(z2.VnS(), z1.VnS(), z1.VnS());
__ Zip1(z2.VnS(), z2.VnS(), z2.VnS());
// Derive a vector from z2 where only the third element in each segment
// contains a complex number, with other elements zero.
__ Index(z3.VnS(), 0, 1);
__ And(z3.VnS(), z3.VnS(), 3);
__ Cmpeq(p2.VnS(), p0.Zeroing(), z3.VnS(), 2);
__ Dup(z3.VnB(), 0);
__ Sel(z3.VnS(), p2, z2.VnS(), z3.VnS());
// Use indexed complex multiply on this vector, indexing the third element.
__ Dup(z4.VnH(), 0);
__ Fcmla(z4.VnH(), z0.VnH(), z3.VnH(), 2, 0);
__ Fcmla(z4.VnH(), z0.VnH(), z3.VnH(), 2, 90);
// Move the complex number to a different position within each segment, then
// repeat the calculation, negated and with a different index.
__ Ext(z3.VnH(), z3.VnH(), z3.VnH(), 4);
__ Dup(z5.VnH(), 0);
__ Fcmla(z5.VnH(), z0.VnH(), z3.VnH(), 1, 180);
__ Fcmla(z5.VnH(), z0.VnH(), z3.VnH(), 1, 270);
__ Fneg(z5.VnH(), p0.Merging(), z5.VnH());
// Create a reference result from a vector complex multiply.
__ Dup(z6.VnH(), 0);
__ Fcmla(z6.VnH(), p0.Merging(), z6.VnH(), z0.VnH(), z2.VnH(), 0);
__ Fcmla(z6.VnH(), p0.Merging(), z6.VnH(), z0.VnH(), z2.VnH(), 90);
// Repeated, but for wider elements.
__ Fdup(z0.VnS(), 42.0);
__ Fdup(z2.VnS(), 24.0);
__ Zip1(z0.VnS(), z0.VnS(), z2.VnS());
__ Index(z1.VnS(), -42, 13);
__ Scvtf(z1.VnS(), p0.Merging(), z1.VnS());
__ Zip1(z2.VnD(), z1.VnD(), z1.VnD());
__ Zip1(z2.VnD(), z2.VnD(), z2.VnD());
__ Index(z3.VnD(), 0, 1);
__ And(z3.VnD(), z3.VnD(), 1);
__ Cmpeq(p2.VnD(), p0.Zeroing(), z3.VnD(), 1);
__ Dup(z3.VnB(), 0);
__ Sel(z3.VnD(), p2, z2.VnD(), z3.VnD());
__ Dup(z7.VnS(), 0);
__ Fcmla(z7.VnS(), z0.VnS(), z3.VnS(), 1, 0);
__ Fcmla(z7.VnS(), z0.VnS(), z3.VnS(), 1, 90);
__ Ext(z3.VnB(), z3.VnB(), z3.VnB(), 8);
__ Dup(z8.VnS(), 0);
__ Fcmla(z8.VnS(), z0.VnS(), z3.VnS(), 0, 180);
__ Fcmla(z8.VnS(), z0.VnS(), z3.VnS(), 0, 270);
__ Fneg(z8.VnS(), p0.Merging(), z8.VnS());
__ Dup(z9.VnS(), 0);
__ Fcmla(z9.VnS(), p0.Merging(), z9.VnS(), z0.VnS(), z2.VnS(), 0);
__ Fcmla(z9.VnS(), p0.Merging(), z9.VnS(), z0.VnS(), z2.VnS(), 90);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z6.VnH(), z4.VnH());
ASSERT_EQUAL_SVE(z6.VnH(), z5.VnH());
ASSERT_EQUAL_SVE(z9.VnS(), z7.VnS());
ASSERT_EQUAL_SVE(z9.VnS(), z8.VnS());
}
}
TEST_SVE(sve_fcmla) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
__ Ptrue(p0.VnB());
__ Pfalse(p1.VnB());
__ Zip1(p2.VnH(), p0.VnH(), p1.VnH()); // Real elements.
__ Zip1(p3.VnH(), p1.VnH(), p0.VnH()); // Imaginary elements.
__ Fdup(z0.VnH(), 10.0);
__ Fdup(z2.VnH(), 2.0);
// Create pairs of complex numbers, Ai + A. A is chosen to be non-zero, as
// the later Fneg would otherwise turn 0.0 into -0.0, failing the bitwise
// result comparison.
__ Index(z1.VnH(), -4, 3);
__ Zip1(z1.VnH(), z1.VnH(), z1.VnH());
__ Zip1(z1.VnH(), z1.VnH(), z1.VnH());
__ Scvtf(z1.VnH(), p0.Merging(), z1.VnH());
__ Sel(z3.VnH(), p2, z0.VnH(), z1.VnH()); // Ai + 10
__ Sel(z4.VnH(), p2, z1.VnH(), z2.VnH()); // 2i + A
__ Zip1(p2.VnS(), p0.VnS(), p1.VnS()); // Even complex numbers.
__ Zip1(p3.VnS(), p1.VnS(), p0.VnS()); // Odd complex numbers.
// Calculate (Ai + 10) * (2i + A) = (20 + A^2)i + 8A, using predication to
// select only the complex numbers in odd-numbered element pairs. This leaves
// results in elements 2/3, 6/7, etc. with zero in elements 0/1, 4/5, etc.
// ... 7 6 5 4 3 2 1 0 <-- element
// ... | 20+A^2 | 8A | 0 | 0 | 20+A^2 | 8A | 0 | 0 | <-- value
__ Dup(z5.VnH(), 0);
__ Fcmla(z5.VnH(), p3.Merging(), z5.VnH(), z4.VnH(), z3.VnH(), 0);
__ Fcmla(z5.VnH(), p3.Merging(), z5.VnH(), z4.VnH(), z3.VnH(), 90);
// Move the odd results to the even result positions.
// ... 7 6 5 4 3 2 1 0 <-- element
// ... | 0 | 0 | 20+A^2 | 8A | 0 | 0 | 20+A^2 | 8A | <-- value
__ Ext(z5.VnB(), z5.VnB(), z5.VnB(), 4);
// Calculate -(Ai + 10) * (2i + A) = -(20 + A^2)i - 8A for the even complex
// numbers.
// ... 7 6 5 4 3 2 1 0 <-- element
// ... | 0 | 0 | -20-A^2 | -8A | 0 | 0 | -20-A^2 | -8A | <-- value
__ Dup(z6.VnH(), 0);
__ Fcmla(z6.VnH(), p2.Merging(), z6.VnH(), z4.VnH(), z3.VnH(), 180);
__ Fcmla(z6.VnH(), p2.Merging(), z6.VnH(), z4.VnH(), z3.VnH(), 270);
// Negate the even results. The results in z6 should now match the results
// computed earlier in z5.
// ... 7 6 5 4 3 2 1 0 <-- element
// ... | 0 | 0 | 20+A^2 | 8A | 0 | 0 | 20+A^2 | 8A | <-- value
__ Fneg(z6.VnH(), p2.Merging(), z6.VnH());
// Similarly, but for wider elements.
__ Zip1(p2.VnS(), p0.VnS(), p1.VnS());
__ Zip1(p3.VnS(), p1.VnS(), p0.VnS());
__ Index(z1.VnS(), -4, 3);
__ Zip1(z1.VnS(), z1.VnS(), z1.VnS());
__ Zip1(z1.VnS(), z1.VnS(), z1.VnS());
__ Scvtf(z1.VnS(), p0.Merging(), z1.VnS());
__ Fdup(z0.VnS(), 20.0);
__ Fdup(z2.VnS(), 21.0);
__ Sel(z3.VnS(), p2, z0.VnS(), z1.VnS());
__ Sel(z4.VnS(), p2, z1.VnS(), z2.VnS());
__ Punpklo(p2.VnH(), p2.VnB());
__ Punpklo(p3.VnH(), p3.VnB());
__ Dup(z7.VnS(), 0);
__ Fcmla(z7.VnS(), p3.Merging(), z7.VnS(), z4.VnS(), z3.VnS(), 0);
__ Fcmla(z7.VnS(), p3.Merging(), z7.VnS(), z4.VnS(), z3.VnS(), 90);
__ Ext(z7.VnB(), z7.VnB(), z7.VnB(), 8);
__ Dup(z8.VnS(), 0);
__ Fcmla(z8.VnS(), p2.Merging(), z8.VnS(), z4.VnS(), z3.VnS(), 180);
__ Fcmla(z8.VnS(), p2.Merging(), z8.VnS(), z4.VnS(), z3.VnS(), 270);
__ Fneg(z8.VnS(), p2.Merging(), z8.VnS());
// Double precision computed for even lanes only.
__ Zip1(p2.VnD(), p0.VnD(), p1.VnD());
__ Index(z1.VnD(), -4, 3);
__ Zip1(z1.VnD(), z1.VnD(), z1.VnD());
__ Zip1(z1.VnD(), z1.VnD(), z1.VnD());
__ Scvtf(z1.VnD(), p0.Merging(), z1.VnD());
__ Fdup(z0.VnD(), 20.0);
__ Fdup(z2.VnD(), 21.0);
__ Sel(z3.VnD(), p2, z0.VnD(), z1.VnD());
__ Sel(z4.VnD(), p2, z1.VnD(), z2.VnD());
__ Punpklo(p2.VnH(), p2.VnB());
__ Dup(z9.VnD(), 0);
__ Fcmla(z9.VnD(), p2.Merging(), z9.VnD(), z4.VnD(), z3.VnD(), 0);
__ Fcmla(z9.VnD(), p2.Merging(), z9.VnD(), z4.VnD(), z3.VnD(), 90);
__ Dup(z10.VnD(), 0);
__ Fcmla(z10.VnD(), p2.Merging(), z10.VnD(), z4.VnD(), z3.VnD(), 180);
__ Fcmla(z10.VnD(), p2.Merging(), z10.VnD(), z4.VnD(), z3.VnD(), 270);
__ Fneg(z10.VnD(), p2.Merging(), z10.VnD());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z5.VnH(), z6.VnH());
ASSERT_EQUAL_SVE(z7.VnS(), z8.VnS());
ASSERT_EQUAL_SVE(z9.VnD(), z10.VnD());
}
}
// Create a pattern in dst where the value of each element in src is incremented
// by the segment number. This allows varying a short input by a predictable
// pattern for each segment.
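//
// For example, with H-sized lanes (eight per 128-bit segment) and
// src = {x0, x1, ...}, the result is {x0+0, ..., x7+0, x8+1, ..., x15+1, ...}.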
static void FPSegmentPatternHelper(MacroAssembler* masm,
const ZRegister& dst,
const PRegisterM& ptrue,
const ZRegister& src) {
VIXL_ASSERT(AreSameLaneSize(dst, src));
UseScratchRegisterScope temps(masm);
ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(dst);
masm->Index(ztmp, 0, 1);
masm->Asr(ztmp, ztmp, kQRegSizeInBytesLog2 - dst.GetLaneSizeInBytesLog2());
masm->Scvtf(ztmp, ptrue, ztmp);
masm->Fadd(dst, src, ztmp);
}
TEST_SVE(sve_fpmul_index) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t in0[] = {0x3ff000003f803c00, 0xbff00000bf80bc00};
uint64_t in1[] = {0x3ff012343ff03c76, 0xbff01234bff0bc76};
__ Ptrue(p0.VnB());
// Repeat indexed vector across up to 2048-bit VL.
for (size_t i = 0; i < (kZRegMaxSize / kDRegSize); i++) {
InsrHelper(&masm, z25.VnD(), in0);
}
InsrHelper(&masm, z1.VnD(), in1);
FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z25.VnH());
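// Fmul (indexed) multiplies each element of z1 by the element of the
// corresponding 128-bit segment of z0 selected by the immediate index.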
__ Fmul(z2.VnH(), z1.VnH(), z0.VnH(), 0);
__ Fmul(z3.VnH(), z1.VnH(), z0.VnH(), 1);
__ Fmul(z4.VnH(), z1.VnH(), z0.VnH(), 4);
__ Fmul(z5.VnH(), z1.VnH(), z0.VnH(), 7);
__ Fmul(z6.VnS(), z1.VnS(), z0.VnS(), 0);
__ Fmul(z7.VnS(), z1.VnS(), z0.VnS(), 1);
__ Fmul(z8.VnS(), z1.VnS(), z0.VnS(), 2);
__ Fmul(z9.VnS(), z1.VnS(), z0.VnS(), 3);
__ Fmul(z10.VnD(), z1.VnD(), z0.VnD(), 0);
__ Fmul(z11.VnD(), z1.VnD(), z0.VnD(), 1);
// Compute the results using other instructions.
__ Dup(z12.VnH(), z25.VnH(), 0);
FPSegmentPatternHelper(&masm, z12.VnH(), p0.Merging(), z12.VnH());
__ Fmul(z12.VnH(), z1.VnH(), z12.VnH());
__ Dup(z13.VnH(), z25.VnH(), 1);
FPSegmentPatternHelper(&masm, z13.VnH(), p0.Merging(), z13.VnH());
__ Fmul(z13.VnH(), z1.VnH(), z13.VnH());
__ Dup(z14.VnH(), z25.VnH(), 4);
FPSegmentPatternHelper(&masm, z14.VnH(), p0.Merging(), z14.VnH());
__ Fmul(z14.VnH(), z1.VnH(), z14.VnH());
__ Dup(z15.VnH(), z25.VnH(), 7);
FPSegmentPatternHelper(&masm, z15.VnH(), p0.Merging(), z15.VnH());
__ Fmul(z15.VnH(), z1.VnH(), z15.VnH());
__ Dup(z16.VnS(), z25.VnS(), 0);
FPSegmentPatternHelper(&masm, z16.VnH(), p0.Merging(), z16.VnH());
__ Fmul(z16.VnS(), z1.VnS(), z16.VnS());
__ Dup(z17.VnS(), z25.VnS(), 1);
FPSegmentPatternHelper(&masm, z17.VnH(), p0.Merging(), z17.VnH());
__ Fmul(z17.VnS(), z1.VnS(), z17.VnS());
__ Dup(z18.VnS(), z25.VnS(), 2);
FPSegmentPatternHelper(&masm, z18.VnH(), p0.Merging(), z18.VnH());
__ Fmul(z18.VnS(), z1.VnS(), z18.VnS());
__ Dup(z19.VnS(), z25.VnS(), 3);
FPSegmentPatternHelper(&masm, z19.VnH(), p0.Merging(), z19.VnH());
__ Fmul(z19.VnS(), z1.VnS(), z19.VnS());
__ Dup(z20.VnD(), z25.VnD(), 0);
FPSegmentPatternHelper(&masm, z20.VnH(), p0.Merging(), z20.VnH());
__ Fmul(z20.VnD(), z1.VnD(), z20.VnD());
__ Dup(z21.VnD(), z25.VnD(), 1);
FPSegmentPatternHelper(&masm, z21.VnH(), p0.Merging(), z21.VnH());
__ Fmul(z21.VnD(), z1.VnD(), z21.VnD());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z12.VnH(), z2.VnH());
ASSERT_EQUAL_SVE(z13.VnH(), z3.VnH());
ASSERT_EQUAL_SVE(z14.VnH(), z4.VnH());
ASSERT_EQUAL_SVE(z15.VnH(), z5.VnH());
ASSERT_EQUAL_SVE(z16.VnS(), z6.VnS());
ASSERT_EQUAL_SVE(z17.VnS(), z7.VnS());
ASSERT_EQUAL_SVE(z18.VnS(), z8.VnS());
ASSERT_EQUAL_SVE(z19.VnS(), z9.VnS());
ASSERT_EQUAL_SVE(z20.VnD(), z10.VnD());
ASSERT_EQUAL_SVE(z21.VnD(), z11.VnD());
}
}
TEST_SVE(sve_ftmad) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t in_h0[] = {0x7c027e01fc02fe01,
0x3c003c00bc00bc00,
0x3c003c00bc00bc00};
uint64_t in_h1[] = {0xfe01fc027e017e01,
0x3c00bc003c00bc00,
0x3c00bc003c00bc00};
uint64_t in_s0[] = {0x7f800002ffc00001,
0x3f8000003f800000,
0xbf800000bf800000};
uint64_t in_s1[] = {0xffc00001ffc00001,
0x3f800000bf800000,
0x3f800000bf800000};
uint64_t in_d0[] = {0x7ff8000000000001,
0x3ff0000000000000,
0xbff0000000000000};
uint64_t in_d1[] = {0xfff0000000000002,
0xbff0000000000000,
0x3ff0000000000000};
InsrHelper(&masm, z0.VnD(), in_h0);
InsrHelper(&masm, z1.VnD(), in_h1);
InsrHelper(&masm, z2.VnD(), in_s0);
InsrHelper(&masm, z3.VnD(), in_s1);
InsrHelper(&masm, z4.VnD(), in_d0);
InsrHelper(&masm, z5.VnD(), in_d1);
__ Mov(z6, z0);
__ Ftmad(z6.VnH(), z6.VnH(), z1.VnH(), 0);
__ Mov(z7, z0);
__ Ftmad(z7.VnH(), z7.VnH(), z1.VnH(), 1);
__ Mov(z8, z0);
__ Ftmad(z8.VnH(), z8.VnH(), z1.VnH(), 2);
__ Mov(z9, z2);
__ Ftmad(z9.VnS(), z9.VnS(), z3.VnS(), 0);
__ Mov(z10, z2);
__ Ftmad(z10.VnS(), z10.VnS(), z3.VnS(), 3);
__ Mov(z11, z2);
__ Ftmad(z11.VnS(), z11.VnS(), z3.VnS(), 4);
__ Mov(z12, z4);
__ Ftmad(z12.VnD(), z12.VnD(), z5.VnD(), 0);
__ Mov(z13, z4);
__ Ftmad(z13.VnD(), z13.VnD(), z5.VnD(), 5);
__ Mov(z14, z4);
__ Ftmad(z14.VnD(), z14.VnD(), z5.VnD(), 7);
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z6[] = {0x7e027e02fe02fe01,
0x4000400000000000,
0x4000400000000000};
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
uint64_t expected_z7[] = {0x7e027e02fe02fe01,
0x3aab3800bcabbe00,
0x3aab3800bcabbe00};
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
uint64_t expected_z8[] = {0x7e027e02fe02fe01,
0x3c083c2abbefbbac,
0x3c083c2abbefbbac};
ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
uint64_t expected_z9[] = {0x7fc00002ffc00001,
0x4000000040000000,
0x0000000000000000};
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
uint64_t expected_z10[] = {0x7fc00002ffc00001,
0x3f7ff2ff3f7fa4fc,
0xbf800680bf802d82};
ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
uint64_t expected_z11[] = {0x7fc00002ffc00001,
0x3f8000173f8000cd,
0xbf7fffd2bf7ffe66};
ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
uint64_t expected_z12[] = {0x7ff8000000000002,
0x4000000000000000,
0x0000000000000000};
ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
uint64_t expected_z13[] = {0x7ff8000000000002,
0x3fefffff6c0d846c,
0xbff0000006b978ae};
ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
uint64_t expected_z14[] = {0x7ff8000000000002,
0x3feffffffffe708a,
0xbff0000000000000};
ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
}
}
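// Apply a set of predicated FP arithmetic macros (Fadd, Fsub, Fabd, Fmul,
// Fmulx, Fminnm and Fmaxnm) to the given inputs, leaving the results in
// z2-z10 for the caller to check.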
static void BasicFPArithHelper(MacroAssembler* masm,
int lane_size_in_bits,
const uint64_t (&inputs)[2],
const uint64_t (&inputs_fmulx)[2],
const uint64_t (&inputs_nans)[2]) {
int ls = lane_size_in_bits;
for (int i = 0; i < 16; i++) {
InsrHelper(masm, z0.VnD(), inputs);
}
ZRegister rvrs = z1.WithLaneSize(ls);
masm->Rev(rvrs, z0.WithLaneSize(ls));
int pred[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1};
Initialise(masm, p2.VnB(), pred);
PRegisterM p2m = p2.Merging();
masm->Mov(z2, z0);
masm->Fadd(z2.WithLaneSize(ls),
p2m,
z2.WithLaneSize(ls),
rvrs,
FastNaNPropagation);
masm->Mov(z3, z0);
masm->Fsub(z3.WithLaneSize(ls), p2m, z3.WithLaneSize(ls), rvrs);
masm->Mov(z4, z0);
masm->Fsub(z4.WithLaneSize(ls), p2m, rvrs, z4.WithLaneSize(ls));
masm->Mov(z5, z0);
masm->Fabd(z5.WithLaneSize(ls),
p2m,
z5.WithLaneSize(ls),
rvrs,
FastNaNPropagation);
masm->Mov(z6, z0);
masm->Fmul(z6.WithLaneSize(ls),
p2m,
z6.WithLaneSize(ls),
rvrs,
FastNaNPropagation);
for (int i = 0; i < 16; i++) {
InsrHelper(masm, z7.VnD(), inputs_fmulx);
}
masm->Rev(z8.WithLaneSize(ls), z7.WithLaneSize(ls));
masm->Fmulx(z7.WithLaneSize(ls),
p2m,
z7.WithLaneSize(ls),
z8.WithLaneSize(ls),
FastNaNPropagation);
InsrHelper(masm, z8.VnD(), inputs_nans);
masm->Mov(z9, z8);
masm->Fminnm(z9.WithLaneSize(ls),
p2m,
z9.WithLaneSize(ls),
rvrs,
FastNaNPropagation);
masm->Mov(z10, z8);
masm->Fmaxnm(z10.WithLaneSize(ls),
p2m,
z10.WithLaneSize(ls),
rvrs,
FastNaNPropagation);
}
TEST_SVE(sve_fp_arith_pred_h) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t inputs[] = {0x4800470046004500, 0x4400420040003c00};
uint64_t inputs_fmulx[] = {0x7c00fc007c00fc00, 0x0000800000008000};
uint64_t inputs_nans[] = {0x7fffffff7fffffff, 0x7bfffbff7fbbfbff};
BasicFPArithHelper(&masm, kHRegSize, inputs, inputs_fmulx, inputs_nans);
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z2[] = {0x4880488048804880, 0x4880420048804880};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
uint64_t expected_z3[] = {0x4700450042003c00, 0xbc004200c500c700};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
uint64_t expected_z4[] = {0xc700c500c200bc00, 0x3c00420045004700};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
uint64_t expected_z5[] = {0x4700450042003c00, 0x3c00420045004700};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
uint64_t expected_z6[] = {0x48004b004c804d00, 0x4d0042004b004800};
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
uint64_t expected_z7[] = {0xc000c000c000c000, 0xc0008000c000c000};
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
uint64_t expected_z9[] = {0x3c00400042004400, 0x4500fbff4700fbff};
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
uint64_t expected_z10[] = {0x3c00400042004400, 0x7bfffbff47004800};
ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
}
}
TEST_SVE(sve_fp_arith_pred_s) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t inputs[] = {0x4080000040400000, 0x400000003f800000};
uint64_t inputs_fmulx[] = {0x7f800000ff800000, 0x0000000080000000};
uint64_t inputs_nans[] = {0x7fffffffffffffff, 0x41000000c1000000};
BasicFPArithHelper(&masm, kSRegSize, inputs, inputs_fmulx, inputs_nans);
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z2[] = {0x40a0000040a00000, 0x4000000040a00000};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
uint64_t expected_z3[] = {0x404000003f800000, 0x40000000c0400000};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
uint64_t expected_z4[] = {0xc0400000bf800000, 0x4000000040400000};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
uint64_t expected_z5[] = {0x404000003f800000, 0x4000000040400000};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
uint64_t expected_z6[] = {0x4080000040c00000, 0x4000000040800000};
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
uint64_t expected_z7[] = {0xc0000000c0000000, 0x00000000c0000000};
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
uint64_t expected_z9[] = {0x3f80000040000000, 0x41000000c1000000};
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
uint64_t expected_z10[] = {0x3f80000040000000, 0x4100000040800000};
ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
}
}
TEST_SVE(sve_fp_arith_pred_d) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t inputs[] = {0x4000000000000000, 0x3ff0000000000000};
uint64_t inputs_fmulx[] = {0x7ff0000000000000, 0x8000000000000000};
uint64_t inputs_nans[] = {0x7fffffffffffffff, 0x4100000000000000};
BasicFPArithHelper(&masm, kDRegSize, inputs, inputs_fmulx, inputs_nans);
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z2[] = {0x4008000000000000, 0x4008000000000000};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
uint64_t expected_z3[] = {0x3ff0000000000000, 0xbff0000000000000};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
uint64_t expected_z4[] = {0xbff0000000000000, 0x3ff0000000000000};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
uint64_t expected_z5[] = {0x3ff0000000000000, 0x3ff0000000000000};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
uint64_t expected_z6[] = {0x4000000000000000, 0x4000000000000000};
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
uint64_t expected_z7[] = {0xc000000000000000, 0xc000000000000000};
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
uint64_t expected_z9[] = {0x3ff0000000000000, 0x4000000000000000};
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
uint64_t expected_z10[] = {0x3ff0000000000000, 0x4100000000000000};
ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
}
}
TEST_SVE(sve_fp_arith_pred_imm) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int pred[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1};
Initialise(&masm, p0.VnB(), pred);
PRegisterM p0m = p0.Merging();
__ Ptrue(p1.VnB());
__ Fdup(z0.VnD(), 0.0);
__ Mov(z1, z0);
__ Fdiv(z1.VnH(), p1.Merging(), z1.VnH(), z1.VnH());
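// 0.0 / 0.0 generates the default NaN, used below to check the NaN handling
// of Fminnm and Fmaxnm.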
__ Mov(z2, z0);
__ Fadd(z2.VnH(), p0m, z2.VnH(), 0.5);
__ Mov(z3, z2);
__ Fsub(z3.VnH(), p0m, z3.VnH(), 1.0);
__ Mov(z4, z3);
__ Fsub(z4.VnH(), p0m, 1.0, z4.VnH());
__ Mov(z5, z4);
__ Fmul(z5.VnH(), p0m, z5.VnH(), 2.0);
__ Mov(z6, z1);
__ Fminnm(z6.VnH(), p0m, z6.VnH(), 0.0);
__ Mov(z7, z1);
__ Fmaxnm(z7.VnH(), p0m, z7.VnH(), 1.0);
__ Mov(z8, z5);
__ Fmin(z8.VnH(), p0m, z8.VnH(), 1.0);
__ Mov(z9, z5);
__ Fmax(z9.VnH(), p0m, z9.VnH(), 0.0);
__ Mov(z11, z0);
__ Fdiv(z11.VnS(), p1.Merging(), z11.VnS(), z11.VnS());
__ Mov(z12, z0);
__ Fadd(z12.VnS(), p0m, z12.VnS(), 0.5);
__ Mov(z13, z12);
__ Fsub(z13.VnS(), p0m, z13.VnS(), 1.0);
__ Mov(z14, z13);
__ Fsub(z14.VnS(), p0m, 1.0, z14.VnS());
__ Mov(z15, z14);
__ Fmul(z15.VnS(), p0m, z15.VnS(), 2.0);
__ Mov(z16, z11);
__ Fminnm(z16.VnS(), p0m, z16.VnS(), 0.0);
__ Mov(z17, z11);
__ Fmaxnm(z17.VnS(), p0m, z17.VnS(), 1.0);
__ Mov(z18, z15);
__ Fmin(z18.VnS(), p0m, z18.VnS(), 1.0);
__ Mov(z19, z15);
__ Fmax(z19.VnS(), p0m, z19.VnS(), 0.0);
__ Mov(z21, z0);
__ Fdiv(z21.VnD(), p1.Merging(), z21.VnD(), z21.VnD());
__ Mov(z22, z0);
__ Fadd(z22.VnD(), p0m, z22.VnD(), 0.5);
__ Mov(z23, z22);
__ Fsub(z23.VnD(), p0m, z23.VnD(), 1.0);
__ Mov(z24, z23);
__ Fsub(z24.VnD(), p0m, 1.0, z24.VnD());
__ Mov(z25, z24);
__ Fmul(z25.VnD(), p0m, z25.VnD(), 2.0);
__ Mov(z26, z21);
__ Fminnm(z26.VnD(), p0m, z26.VnD(), 0.0);
__ Mov(z27, z21);
__ Fmaxnm(z27.VnD(), p0m, z27.VnD(), 1.0);
__ Mov(z28, z25);
__ Fmin(z28.VnD(), p0m, z28.VnD(), 1.0);
__ Mov(z29, z25);
__ Fmax(z29.VnD(), p0m, z29.VnD(), 0.0);
__ Index(z0.VnH(), -3, 1);
__ Scvtf(z0.VnH(), p1.Merging(), z0.VnH());
__ Fmax(z0.VnH(), p1.Merging(), z0.VnH(), 0.0);
__ Index(z1.VnS(), -4, 2);
__ Scvtf(z1.VnS(), p1.Merging(), z1.VnS());
__ Fadd(z1.VnS(), p1.Merging(), z1.VnS(), 1.0);
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z2[] = {0x3800380038003800, 0x3800000038003800};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
uint64_t expected_z3[] = {0xb800b800b800b800, 0xb8000000b800b800};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
uint64_t expected_z4[] = {0x3e003e003e003e00, 0x3e0000003e003e00};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
uint64_t expected_z5[] = {0x4200420042004200, 0x4200000042004200};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
uint64_t expected_z6[] = {0x0000000000000000, 0x00007e0000000000};
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
uint64_t expected_z7[] = {0x3c003c003c003c00, 0x3c007e003c003c00};
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
uint64_t expected_z8[] = {0x3c003c003c003c00, 0x3c0000003c003c00};
ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
uint64_t expected_z9[] = {0x4200420042004200, 0x4200000042004200};
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
uint64_t expected_z12[] = {0x3f0000003f000000, 0x000000003f000000};
ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
uint64_t expected_z13[] = {0xbf000000bf000000, 0x00000000bf000000};
ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
uint64_t expected_z14[] = {0x3fc000003fc00000, 0x000000003fc00000};
ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
uint64_t expected_z15[] = {0x4040000040400000, 0x0000000040400000};
ASSERT_EQUAL_SVE(expected_z15, z15.VnD());
uint64_t expected_z16[] = {0x0000000000000000, 0x7fc0000000000000};
ASSERT_EQUAL_SVE(expected_z16, z16.VnD());
uint64_t expected_z17[] = {0x3f8000003f800000, 0x7fc000003f800000};
ASSERT_EQUAL_SVE(expected_z17, z17.VnD());
uint64_t expected_z18[] = {0x3f8000003f800000, 0x000000003f800000};
ASSERT_EQUAL_SVE(expected_z18, z18.VnD());
uint64_t expected_z19[] = {0x4040000040400000, 0x0000000040400000};
ASSERT_EQUAL_SVE(expected_z19, z19.VnD());
uint64_t expected_z22[] = {0x3fe0000000000000, 0x3fe0000000000000};
ASSERT_EQUAL_SVE(expected_z22, z22.VnD());
uint64_t expected_z23[] = {0xbfe0000000000000, 0xbfe0000000000000};
ASSERT_EQUAL_SVE(expected_z23, z23.VnD());
uint64_t expected_z24[] = {0x3ff8000000000000, 0x3ff8000000000000};
ASSERT_EQUAL_SVE(expected_z24, z24.VnD());
uint64_t expected_z25[] = {0x4008000000000000, 0x4008000000000000};
ASSERT_EQUAL_SVE(expected_z25, z25.VnD());
uint64_t expected_z26[] = {0x0000000000000000, 0x0000000000000000};
ASSERT_EQUAL_SVE(expected_z26, z26.VnD());
uint64_t expected_z27[] = {0x3ff0000000000000, 0x3ff0000000000000};
ASSERT_EQUAL_SVE(expected_z27, z27.VnD());
uint64_t expected_z28[] = {0x3ff0000000000000, 0x3ff0000000000000};
ASSERT_EQUAL_SVE(expected_z28, z28.VnD());
uint64_t expected_z29[] = {0x4008000000000000, 0x4008000000000000};
ASSERT_EQUAL_SVE(expected_z29, z29.VnD());
uint64_t expected_z0[] = {0x4400420040003c00, 0x0000000000000000};
ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
uint64_t expected_z1[] = {0x404000003f800000, 0xbf800000c0400000};
ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
}
}
TEST_SVE(sve_fscale) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t inputs_h[] = {0x4800470046004500, 0x4400420040003c00};
InsrHelper(&masm, z0.VnD(), inputs_h);
uint64_t inputs_s[] = {0x4080000040400000, 0x400000003f800000};
InsrHelper(&masm, z1.VnD(), inputs_s);
uint64_t inputs_d[] = {0x40f0000000000000, 0x4000000000000000};
InsrHelper(&masm, z2.VnD(), inputs_d);
uint64_t scales[] = {0x00080002fff8fffe, 0x00100001fff0ffff};
InsrHelper(&masm, z3.VnD(), scales);
__ Ptrue(p0.VnB());
int pred[] = {0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1};
Initialise(&masm, p1.VnB(), pred);
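// Fscale computes zd = zn * 2^zm, where zm holds signed integer exponents.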
__ Mov(z4, z0);
__ Fscale(z4.VnH(), p0.Merging(), z4.VnH(), z3.VnH());
__ Mov(z5, z0);
__ Fscale(z5.VnH(), p1.Merging(), z5.VnH(), z3.VnH());
__ Sunpklo(z3.VnS(), z3.VnH());
__ Mov(z6, z1);
__ Fscale(z6.VnS(), p0.Merging(), z6.VnS(), z3.VnS());
__ Mov(z7, z1);
__ Fscale(z7.VnS(), p1.Merging(), z7.VnS(), z3.VnS());
__ Sunpklo(z3.VnD(), z3.VnS());
__ Mov(z8, z2);
__ Fscale(z8.VnD(), p0.Merging(), z8.VnD(), z3.VnD());
__ Mov(z9, z2);
__ Fscale(z9.VnD(), p1.Merging(), z9.VnD(), z3.VnD());
// Test full double precision range scaling.
__ Dup(z10.VnD(), 2045);
__ Dup(z11.VnD(), 0x0010000000000000); // 2^-1022
__ Fscale(z11.VnD(), p0.Merging(), z11.VnD(), z10.VnD());
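// The expected result is 2^-1022 * 2^2045 = 2^1023, the largest finite
// double-precision power of two (0x7fe0000000000000).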
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z4[] = {0x68004f0026003d00, 0x7c00460002003800};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
uint64_t expected_z5[] = {0x68004f0026004500, 0x7c00420002003800};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
uint64_t expected_z6[] = {0x4880000040c00000, 0x380000003f000000};
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
uint64_t expected_z7[] = {0x4880000040400000, 0x400000003f000000};
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
uint64_t expected_z8[] = {0x3ff0000000000000, 0x3ff0000000000000};
ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
uint64_t expected_z9[] = {0x40f0000000000000, 0x3ff0000000000000};
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
uint64_t expected_z11[] = {0x7fe0000000000000, 0x7fe0000000000000};
ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
}
}
typedef void (MacroAssembler::*FcvtFrintMFn)(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& zn);
typedef void (MacroAssembler::*FcvtFrintZFn)(const ZRegister& zd,
const PRegisterZ& pg,
const ZRegister& zn);
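// Test a merging (and, optionally, zeroing) conversion or rounding macro.
// The result is checked with an all-active predicate, then with a mixed
// predicate, verifying that inactive lanes are merged or zeroed as required.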
template <typename F, size_t N>
static void TestFcvtFrintHelper(Test* config,
FcvtFrintMFn macro_m,
FcvtFrintZFn macro_z,
int dst_type_size_in_bits,
int src_type_size_in_bits,
const F (&zn_inputs)[N],
const int (&pg_inputs)[N],
const uint64_t (&zd_expected_all_active)[N]) {
VIXL_ASSERT(macro_m != NULL);
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// If the input and result types have different sizes, the instruction
// operates on elements of the largest specified type.
int lane_size_in_bits =
std::max(dst_type_size_in_bits, src_type_size_in_bits);
ZRegister zd_all_active = z25;
ZRegister zd_merging = z26;
ZRegister zn = z27;
uint64_t zn_rawbits[N];
FPToRawbitsWithSize(zn_inputs, zn_rawbits, src_type_size_in_bits);
InsrHelper(&masm, zn.WithLaneSize(lane_size_in_bits), zn_rawbits);
PRegisterWithLaneSize pg_all_active = p0.WithLaneSize(lane_size_in_bits);
__ Ptrue(pg_all_active);
// Test floating-point conversions with all lanes activated.
(masm.*macro_m)(zd_all_active.WithLaneSize(dst_type_size_in_bits),
pg_all_active.Merging(),
zn.WithLaneSize(src_type_size_in_bits));
PRegisterWithLaneSize pg_merging = p1.WithLaneSize(lane_size_in_bits);
Initialise(&masm, pg_merging, pg_inputs);
__ Dup(zd_merging.VnD(), 0x0bad0bad0bad0bad);
// Use the same `zn` inputs to test floating-point conversions, but with some
// lanes set inactive.
(masm.*macro_m)(zd_merging.WithLaneSize(dst_type_size_in_bits),
pg_merging.Merging(),
zn.WithLaneSize(src_type_size_in_bits));
ZRegister zd_zeroing = z24;
PRegisterWithLaneSize pg_zeroing = p1.WithLaneSize(lane_size_in_bits);
Initialise(&masm, pg_zeroing, pg_inputs);
if (macro_z != NULL) {
__ Dup(zd_zeroing.VnD(), 0x0bad0bad0bad0bad);
(masm.*macro_z)(zd_zeroing.WithLaneSize(dst_type_size_in_bits),
pg_zeroing.Zeroing(),
zn.WithLaneSize(src_type_size_in_bits));
}
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(zd_expected_all_active,
zd_all_active.WithLaneSize(lane_size_in_bits));
uint64_t zd_expected_merging[N];
for (unsigned i = 0; i < N; i++) {
zd_expected_merging[i] =
pg_inputs[i] ? zd_expected_all_active[i]
: 0x0bad0bad0bad0bad & GetUintMask(lane_size_in_bits);
}
ASSERT_EQUAL_SVE(zd_expected_merging,
zd_merging.WithLaneSize(lane_size_in_bits));
if (macro_z != NULL) {
uint64_t zd_expected_zeroing[N] = {0};
for (unsigned i = 0; i < N; i++) {
if (pg_inputs[i]) {
zd_expected_zeroing[i] = zd_expected_all_active[i];
}
}
ASSERT_EQUAL_SVE(zd_expected_zeroing,
zd_zeroing.WithLaneSize(lane_size_in_bits));
}
}
}
template <typename F, size_t N>
static void TestFcvtzHelper(Test* config,
FcvtFrintMFn macro_m,
int dst_type_size_in_bits,
int src_type_size_in_bits,
const F (&zn_inputs)[N],
const int (&pg_inputs)[N],
const uint64_t (&zd_expected_all_active)[N]) {
TestFcvtFrintHelper(config,
macro_m,
// Fcvt variants have no zeroing predication form.
NULL,
dst_type_size_in_bits,
src_type_size_in_bits,
zn_inputs,
pg_inputs,
zd_expected_all_active);
}
TEST_SVE(fcvtzs_fcvtzu_float16) {
const double h_max_float16 = 0x7ff0; // Largest float16 < INT16_MAX.
const double h_min_float16 = -h_max_float16; // Smallest float16 > INT16_MIN.
const double largest_float16 = 0xffe0; // 65504
const double smallest_float16 = -largest_float16;
const double h_max_int_add_one = 0x8000;
double zn_inputs[] = {1.0,
1.1,
1.5,
-1.5,
h_max_float16,
h_min_float16,
largest_float16,
smallest_float16,
kFP64PositiveInfinity,
kFP64NegativeInfinity,
h_max_int_add_one};
int pg_inputs[] = {0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1};
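// The conversions saturate: out-of-range inputs, including infinities,
// produce the closest representable integer value.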
uint64_t expected_fcvtzs_fp162h[] =
{1, 1, 1, 0xffff, 0x7ff0, 0x8010, 0x7fff, 0x8000, 0x7fff, 0x8000, 0x7fff};
uint64_t expected_fcvtzu_fp162h[] =
{1, 1, 1, 0, 0x7ff0, 0, 0xffe0, 0, 0xffff, 0, 0x8000};
// Float16 to 16-bit integers.
TestFcvtzHelper(config,
&MacroAssembler::Fcvtzs,
kHRegSize,
kHRegSize,
zn_inputs,
pg_inputs,
expected_fcvtzs_fp162h);
TestFcvtzHelper(config,
&MacroAssembler::Fcvtzu,
kHRegSize,
kHRegSize,
zn_inputs,
pg_inputs,
expected_fcvtzu_fp162h);
uint64_t expected_fcvtzs_fp162w[] = {1,
1,
1,
0xffffffff,
0x7ff0,
0xffff8010,
0xffe0,
0xffff0020,
0x7fffffff,
0x80000000,
0x8000};
uint64_t expected_fcvtzu_fp162w[] =
{1, 1, 1, 0, 0x7ff0, 0, 0xffe0, 0, 0xffffffff, 0, 0x8000};
// Float16 to 32-bit integers.
TestFcvtzHelper(config,
&MacroAssembler::Fcvtzs,
kSRegSize,
kHRegSize,
zn_inputs,
pg_inputs,
expected_fcvtzs_fp162w);
TestFcvtzHelper(config,
&MacroAssembler::Fcvtzu,
kSRegSize,
kHRegSize,
zn_inputs,
pg_inputs,
expected_fcvtzu_fp162w);
uint64_t expected_fcvtzs_fp162x[] = {1,
1,
1,
0xffffffffffffffff,
0x7ff0,
0xffffffffffff8010,
0xffe0,
0xffffffffffff0020,
0x7fffffffffffffff,
0x8000000000000000,
0x8000};
uint64_t expected_fcvtzu_fp162x[] =
{1, 1, 1, 0, 0x7ff0, 0, 0xffe0, 0, 0xffffffffffffffff, 0, 0x8000};
// Float16 to 64-bit integers.
TestFcvtzHelper(config,
&MacroAssembler::Fcvtzs,
kDRegSize,
kHRegSize,
zn_inputs,
pg_inputs,
expected_fcvtzs_fp162x);
TestFcvtzHelper(config,
&MacroAssembler::Fcvtzu,
kDRegSize,
kHRegSize,
zn_inputs,
pg_inputs,
expected_fcvtzu_fp162x);
}
TEST_SVE(fcvtzs_fcvtzu_float) {
const double w_max_float = 0x7fffff80; // Largest float < INT32_MAX.
const double w_min_float = -w_max_float; // Smallest float > INT32_MIN.
const double x_max_float = 0x7fffff8000000000; // Largest float < INT64_MAX.
const double x_min_float = -x_max_float; // Smallest float > INT64_MIN.
const double w_max_int_add_one = 0x80000000;
const double x_max_int_add_one = 0x80000000'00000000;
double zn_inputs[] = {1.0,
1.1,
1.5,
-1.5,
w_max_float,
w_min_float,
x_max_float,
x_min_float,
kFP64PositiveInfinity,
kFP64NegativeInfinity,
w_max_int_add_one,
x_max_int_add_one};
int pg_inputs[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1};
uint64_t expected_fcvtzs_s2w[] = {1,
1,
1,
0xffffffff,
0x7fffff80,
0x80000080,
0x7fffffff,
0x80000000,
0x7fffffff,
0x80000000,
0x7fffffff,
0x7fffffff};
uint64_t expected_fcvtzu_s2w[] = {1,
1,
1,
0,
0x7fffff80,
0,
0xffffffff,
0,
0xffffffff,
0,
0x80000000,
0xffffffff};
// Float to 32-bit integers.
TestFcvtzHelper(config,
&MacroAssembler::Fcvtzs,
kSRegSize,
kSRegSize,
zn_inputs,
pg_inputs,
expected_fcvtzs_s2w);
TestFcvtzHelper(config,
&MacroAssembler::Fcvtzu,
kSRegSize,
kSRegSize,
zn_inputs,
pg_inputs,
expected_fcvtzu_s2w);
uint64_t expected_fcvtzs_s2x[] = {1,
1,
1,
0xffffffffffffffff,
0x7fffff80,
0xffffffff80000080,
0x7fffff8000000000,
0x8000008000000000,
0x7fffffffffffffff,
0x8000000000000000,
0x80000000,
0x7fffffffffffffff};
uint64_t expected_fcvtzu_s2x[] = {1,
1,
1,
0,
0x7fffff80,
0,
0x7fffff8000000000,
0,
0xffffffffffffffff,
0,
0x80000000,
0x8000000000000000};
// Float to 64-bit integers.
TestFcvtzHelper(config,
&MacroAssembler::Fcvtzs,
kDRegSize,
kSRegSize,
zn_inputs,
pg_inputs,
expected_fcvtzs_s2x);
TestFcvtzHelper(config,
&MacroAssembler::Fcvtzu,
kDRegSize,
kSRegSize,
zn_inputs,
pg_inputs,
expected_fcvtzu_s2x);
}
TEST_SVE(fcvtzs_fcvtzu_double) {
const double w_max_float = 0x7fffff80; // Largest float < INT32_MAX.
const double w_min_float = -w_max_float; // Smallest float > INT32_MIN.
const double x_max_float = 0x7fffff8000000000; // Largest float < INT64_MAX.
const double x_min_float = -x_max_float; // Smallest float > INT64_MIN.
const double w_max_double = kWMaxInt; // Largest double == INT32_MAX.
const double w_min_double = -w_max_double; // Smallest double > INT32_MIN.
const double x_max_double =
0x7ffffffffffffc00; // Largest double < INT64_MAX.
const double x_min_double = -x_max_double; // Smallest double > INT64_MIN.
const double w_max_int_sub_one = kWMaxInt - 1;
const double w_min_int_add_one = kWMinInt + 1;
const double w_max_int_add_one = 0x80000000;
const double x_max_int_add_one = 0x80000000'00000000;
double zn_inputs[] = {1.0,
1.1,
1.5,
-1.5,
w_max_float,
w_min_float,
x_max_float,
x_min_float,
w_max_double,
w_min_double,
x_max_double,
x_min_double,
kFP64PositiveInfinity,
kFP64NegativeInfinity,
w_max_int_sub_one,
w_min_int_add_one,
w_max_int_add_one,
x_max_int_add_one};
int pg_inputs[] = {1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0};
uint64_t expected_fcvtzs_d2w[] = {1,
1,
1,
0xffffffffffffffff,
0x7fffff80,
0xffffffff80000080,
0x7fffffff,
0xffffffff80000000,
0x7fffffff,
0xffffffff80000001,
0x7fffffff,
0xffffffff80000000,
0x7fffffff,
0xffffffff80000000,
0x7ffffffe,
0xffffffff80000001,
0x7fffffff,
0x7fffffff};
uint64_t expected_fcvtzu_d2w[] = {1,
1,
1,
0,
0x7fffff80,
0,
0xffffffff,
0,
0x7fffffff,
0,
0xffffffff,
0,
0xffffffff,
0,
0x7ffffffe,
0,
0x80000000,
0xffffffff};
// Double to 32-bit integers.
TestFcvtzHelper(config,
&MacroAssembler::Fcvtzs,
kSRegSize,
kDRegSize,
zn_inputs,
pg_inputs,
expected_fcvtzs_d2w);
TestFcvtzHelper(config,
&MacroAssembler::Fcvtzu,
kSRegSize,
kDRegSize,
zn_inputs,
pg_inputs,
expected_fcvtzu_d2w);
uint64_t expected_fcvtzs_d2x[] = {1,
1,
1,
0xffffffffffffffff,
0x7fffff80,
0xffffffff80000080,
0x7fffff8000000000,
0x8000008000000000,
0x7fffffff,
0xffffffff80000001,
0x7ffffffffffffc00,
0x8000000000000400,
0x7fffffffffffffff,
0x8000000000000000,
0x7ffffffe,
0xffffffff80000001,
0x80000000,
0x7fffffffffffffff};
uint64_t expected_fcvtzu_d2x[] = {1,
1,
1,
0,
0x7fffff80,
0,
0x7fffff8000000000,
0,
0x7fffffff,
0,
0x7ffffffffffffc00,
0,
0xffffffffffffffff,
0,
0x000000007ffffffe,
0,
0x80000000,
0x8000000000000000};
// Double to 64-bit integers.
TestFcvtzHelper(config,
&MacroAssembler::Fcvtzs,
kDRegSize,
kDRegSize,
zn_inputs,
pg_inputs,
expected_fcvtzs_d2x);
TestFcvtzHelper(config,
&MacroAssembler::Fcvtzu,
kDRegSize,
kDRegSize,
zn_inputs,
pg_inputs,
expected_fcvtzu_d2x);
}
template <typename F, size_t N>
static void TestFrintHelper(Test* config,
FcvtFrintMFn macro_m,
FcvtFrintZFn macro_z,
int lane_size_in_bits,
const F (&zn_inputs)[N],
const int (&pg_inputs)[N],
const F (&zd_expected)[N]) {
uint64_t zd_expected_rawbits[N];
FPToRawbitsWithSize(zd_expected, zd_expected_rawbits, lane_size_in_bits);
TestFcvtFrintHelper(config,
macro_m,
macro_z,
lane_size_in_bits,
lane_size_in_bits,
zn_inputs,
pg_inputs,
zd_expected_rawbits);
}
TEST_SVE(frint) {
const double inf_pos = kFP64PositiveInfinity;
const double inf_neg = kFP64NegativeInfinity;
double zn_inputs[] =
{1.1, 1.5, 1.9, 2.5, -1.5, -2.5, 0.0, -0.0, -0.2, inf_pos, inf_neg};
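// Expected results for each rounding mode: a = to nearest, ties away from
// zero; i and x = current FPCR mode (to nearest, ties to even, here);
// m = towards minus infinity; n = to nearest, ties to even; p = towards
// plus infinity; z = towards zero.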
double zd_expected_a[] =
{1.0, 2.0, 2.0, 3.0, -2.0, -3.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
double zd_expected_i[] =
{1.0, 2.0, 2.0, 2.0, -2.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
double zd_expected_m[] =
{1.0, 1.0, 1.0, 2.0, -2.0, -3.0, 0.0, -0.0, -1.0, inf_pos, inf_neg};
double zd_expected_n[] =
{1.0, 2.0, 2.0, 2.0, -2.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
double zd_expected_p[] =
{2.0, 2.0, 2.0, 3.0, -1.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
double zd_expected_x[] =
{1.0, 2.0, 2.0, 2.0, -2.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
double zd_expected_z[] =
{1.0, 1.0, 1.0, 2.0, -1.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
int pg_inputs[] = {0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0};
struct TestDataSet {
FcvtFrintMFn macro_m; // merging form.
FcvtFrintZFn macro_z; // zeroing form.
double (&expected)[11];
};
TestDataSet test_data[] =
{{&MacroAssembler::Frinta, &MacroAssembler::Frinta, zd_expected_a},
{&MacroAssembler::Frinti, &MacroAssembler::Frinti, zd_expected_i},
{&MacroAssembler::Frintm, &MacroAssembler::Frintm, zd_expected_m},
{&MacroAssembler::Frintn, &MacroAssembler::Frintn, zd_expected_n},
{&MacroAssembler::Frintp, &MacroAssembler::Frintp, zd_expected_p},
{&MacroAssembler::Frintx, &MacroAssembler::Frintx, zd_expected_x},
{&MacroAssembler::Frintz, &MacroAssembler::Frintz, zd_expected_z}};
unsigned lane_sizes[] = {kHRegSize, kSRegSize, kDRegSize};
for (size_t i = 0; i < sizeof(test_data) / sizeof(TestDataSet); i++) {
for (size_t j = 0; j < ArrayLength(lane_sizes); j++) {
TestFrintHelper(config,
test_data[i].macro_m,
test_data[i].macro_z,
lane_sizes[j],
zn_inputs,
pg_inputs,
test_data[i].expected);
}
}
}
struct CvtfTestDataSet {
uint64_t int_value;
uint64_t scvtf_result;
uint64_t ucvtf_result;
};
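// Test Scvtf and Ucvtf on the same integer inputs, first with an all-active
// predicate and then with a merging predicate, checking that inactive lanes
// keep their signalling NaN fill value.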
template <size_t N>
static void TestUScvtfHelper(Test* config,
int dst_type_size_in_bits,
int src_type_size_in_bits,
const int (&pg_inputs)[N],
const CvtfTestDataSet (&data_set)[N]) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// Unpack the data from the array of structs into individual arrays, to
// simplify the test.
uint64_t zn_inputs[N];
uint64_t expected_zd_scvtf_all_active[N];
uint64_t expected_zd_ucvtf_all_active[N];
for (size_t i = 0; i < N; i++) {
zn_inputs[i] = data_set[i].int_value;
expected_zd_scvtf_all_active[i] = data_set[i].scvtf_result;
expected_zd_ucvtf_all_active[i] = data_set[i].ucvtf_result;
}
// If the input and result types have a different size, the instruction
// operates on elements of the largest specified type.
int lane_size_in_bits =
std::max(dst_type_size_in_bits, src_type_size_in_bits);
ZRegister zd_scvtf_all_active = z25;
ZRegister zd_ucvtf_all_active = z26;
ZRegister zn = z27;
InsrHelper(&masm, zn.WithLaneSize(lane_size_in_bits), zn_inputs);
PRegisterWithLaneSize pg_all_active = p0.WithLaneSize(lane_size_in_bits);
__ Ptrue(pg_all_active);
// Test integer conversions with all lanes activated.
__ Scvtf(zd_scvtf_all_active.WithLaneSize(dst_type_size_in_bits),
pg_all_active.Merging(),
zn.WithLaneSize(src_type_size_in_bits));
__ Ucvtf(zd_ucvtf_all_active.WithLaneSize(dst_type_size_in_bits),
pg_all_active.Merging(),
zn.WithLaneSize(src_type_size_in_bits));
ZRegister zd_scvtf_merged = z23;
ZRegister zd_ucvtf_merged = z24;
PRegisterWithLaneSize pg_merged = p1.WithLaneSize(lane_size_in_bits);
Initialise(&masm, pg_merged, pg_inputs);
uint64_t snan;
switch (lane_size_in_bits) {
case kHRegSize:
snan = 0x7c11;
break;
case kSRegSize:
snan = 0x7f951111;
break;
case kDRegSize:
snan = 0x7ff5555511111111;
break;
default:
// Unreachable, but keep `snan` initialised.
VIXL_UNREACHABLE();
snan = 0;
break;
}
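// Fill the destinations with a signalling NaN pattern so that lanes left
// inactive by the merging conversions are easy to identify.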
__ Dup(zd_scvtf_merged.WithLaneSize(lane_size_in_bits), snan);
__ Dup(zd_ucvtf_merged.WithLaneSize(lane_size_in_bits), snan);
// Use the same `zn` inputs to test the integer conversions, but with some
// lanes set inactive.
__ Scvtf(zd_scvtf_merged.WithLaneSize(dst_type_size_in_bits),
pg_merged.Merging(),
zn.WithLaneSize(src_type_size_in_bits));
__ Ucvtf(zd_ucvtf_merged.WithLaneSize(dst_type_size_in_bits),
pg_merged.Merging(),
zn.WithLaneSize(src_type_size_in_bits));
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(expected_zd_scvtf_all_active,
zd_scvtf_all_active.WithLaneSize(lane_size_in_bits));
ASSERT_EQUAL_SVE(expected_zd_ucvtf_all_active,
zd_ucvtf_all_active.WithLaneSize(lane_size_in_bits));
uint64_t expected_zd_scvtf_merged[N];
for (size_t i = 0; i < N; i++) {
expected_zd_scvtf_merged[i] =
pg_inputs[i] ? expected_zd_scvtf_all_active[i] : snan;
}
ASSERT_EQUAL_SVE(expected_zd_scvtf_merged,
zd_scvtf_merged.WithLaneSize(lane_size_in_bits));
uint64_t expected_zd_ucvtf_merged[N];
for (size_t i = 0; i < N; i++) {
expected_zd_ucvtf_merged[i] =
pg_inputs[i] ? expected_zd_ucvtf_all_active[i] : snan;
}
ASSERT_EQUAL_SVE(expected_zd_ucvtf_merged,
zd_ucvtf_merged.WithLaneSize(lane_size_in_bits));
}
}
TEST_SVE(scvtf_ucvtf_h_s_d_to_float16) {
// clang-format off
CvtfTestDataSet data_set_1[] = {
// Simple conversions of positive numbers which require no rounding; the
// results should not depend on the rounding mode, and ucvtf and scvtf should
// produce the same result.
{0x0000, 0x0000, 0x0000},
{0x0001, 0x3c00, 0x3c00},
{0x0010, 0x4c00, 0x4c00},
{0x0080, 0x5800, 0x5800},
{0x0400, 0x6400, 0x6400},
// Conversions which require rounding.
{0x4000, 0x7400, 0x7400},
{0x4001, 0x7400, 0x7400},
// Round up to produce a result that's too big for the input to represent.
{0x7ff0, 0x77ff, 0x77ff},
{0x7ff1, 0x77ff, 0x77ff},
{0x7ffe, 0x7800, 0x7800},
{0x7fff, 0x7800, 0x7800}};
int pg_1[] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
TestUScvtfHelper(config, kHRegSize, kDRegSize, pg_1, data_set_1);
TestUScvtfHelper(config, kHRegSize, kSRegSize, pg_1, data_set_1);
TestUScvtfHelper(config, kHRegSize, kHRegSize, pg_1, data_set_1);
CvtfTestDataSet data_set_2[] = {
// Test mantissa extremities.
{0x0401, 0x6401, 0x6401},
{0x4020, 0x7402, 0x7402},
// The largest int16_t that fits in a float16.
{0xffef, 0xcc40, 0x7bff},
// Values that would be negative if treated as an int16_t.
{0xff00, 0xdc00, 0x7bf8},
{0x8000, 0xf800, 0x7800},
{0x8100, 0xf7f0, 0x7808},
// Check for bit pattern reproduction.
{0x0123, 0x5c8c, 0x5c8c},
{0x0cde, 0x6a6f, 0x6a6f},
// Simple conversions of negative int64_t values. These require no rounding,
// and the results should not depend on the rounding mode.
{0xf800, 0xe800, 0x7bc0},
{0xfc00, 0xe400, 0x7be0},
{0xc000, 0xf400, 0x7a00},
// Check rounding of negative int16_t values.
{0x8ffe, 0xf700, 0x7880},
{0x8fff, 0xf700, 0x7880},
{0xffee, 0xcc80, 0x7bff},
{0xffef, 0xcc40, 0x7bff}};
int pg_2[] = {1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1};
// The `32-bit to float16` and `64-bit to float16` conversions of these
// inputs have already been covered by the `16-bit to float16` ucvtf tests
// above.
TestUScvtfHelper(config, kHRegSize, kHRegSize, pg_2, data_set_2);
// clang-format on
}
TEST_SVE(scvtf_ucvtf_s_to_float) {
// clang-format off
int dst_lane_size = kSRegSize;
int src_lane_size = kSRegSize;
// Simple conversions of positive numbers which require no rounding; the
// results should not depend on the rounding mode, and ucvtf and scvtf should
// produce the same result.
CvtfTestDataSet data_set_1[] = {
{0x00000000, 0x00000000, 0x00000000},
{0x00000001, 0x3f800000, 0x3f800000},
{0x00004000, 0x46800000, 0x46800000},
{0x00010000, 0x47800000, 0x47800000},
{0x40000000, 0x4e800000, 0x4e800000}};
int pg_1[] = {1, 0, 1, 0, 0};
TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_1, data_set_1);
CvtfTestDataSet data_set_2[] = {
// Test mantissa extremities.
{0x00800001, 0x4b000001, 0x4b000001},
{0x40400000, 0x4e808000, 0x4e808000},
// The largest int32_t that fits in a float.
{0x7fffff80, 0x4effffff, 0x4effffff},
// Values that would be negative if treated as an int32_t.
{0xffffffff, 0xbf800000, 0x4f800000},
{0xffffff00, 0xc3800000, 0x4f7fffff},
{0x80000000, 0xcf000000, 0x4f000000},
{0x80000001, 0xcf000000, 0x4f000000},
// Check for bit pattern reproduction.
{0x089abcde, 0x4d09abce, 0x4d09abce},
{0x12345678, 0x4d91a2b4, 0x4d91a2b4}};
int pg_2[] = {1, 0, 1, 0, 1, 1, 1, 0, 0};
TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_2, data_set_2);
// Simple conversions of negative int32_t values. These require no rounding,
// and the results should not depend on the rounding mode.
CvtfTestDataSet data_set_3[] = {
{0xffffc000, 0xc6800000, 0x4f7fffc0},
{0xffff0000, 0xc7800000, 0x4f7fff00},
{0xc0000000, 0xce800000, 0x4f400000},
// Conversions which require rounding.
{0x72800000, 0x4ee50000, 0x4ee50000},
{0x72800001, 0x4ee50000, 0x4ee50000},
{0x73000000, 0x4ee60000, 0x4ee60000},
// Check rounding of negative int32_t values.
{0x80000140, 0xcefffffe, 0x4f000001},
{0x80000141, 0xcefffffd, 0x4f000001},
{0x80000180, 0xcefffffd, 0x4f000002},
// Round up to produce a result that's too big for the input to represent.
{0x7fffffc0, 0x4f000000, 0x4f000000},
{0x7fffffff, 0x4f000000, 0x4f000000}};
int pg_3[] = {1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0};
TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_3, data_set_3);
// clang-format on
}
TEST_SVE(scvtf_ucvtf_d_to_float) {
// clang-format off
int dst_lane_size = kSRegSize;
int src_lane_size = kDRegSize;
// Simple conversions of positive numbers which require no rounding; the
// results should not depend on the rounding mode, and ucvtf and scvtf should
// produce the same result.
CvtfTestDataSet data_set_1[] = {
{0x0000000000000000, 0x00000000, 0x00000000},
{0x0000000000000001, 0x3f800000, 0x3f800000},
{0x0000000040000000, 0x4e800000, 0x4e800000},
{0x0000000100000000, 0x4f800000, 0x4f800000},
{0x4000000000000000, 0x5e800000, 0x5e800000}};
int pg_1[] = {1, 1, 0, 1, 0};
TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_1, data_set_1);
CvtfTestDataSet data_set_2[] = {
// Test mantissa extremities.
{0x0010000000000001, 0x59800000, 0x59800000},
{0x4008000000000000, 0x5e801000, 0x5e801000},
// The largest int32_t that fits in a float.
{0x000000007fffff80, 0x4effffff, 0x4effffff},
// Values that would be negative if treated as an int32_t.
{0x00000000ffffffff, 0x4f800000, 0x4f800000},
{0x00000000ffffff00, 0x4f7fffff, 0x4f7fffff},
{0x0000000080000000, 0x4f000000, 0x4f000000},
{0x0000000080000100, 0x4f000001, 0x4f000001},
// The largest int64_t that fits in a float.
{0x7fffff8000000000, 0x5effffff, 0x5effffff},
// Check for bit pattern reproduction.
{0x0123456789abcde0, 0x5b91a2b4, 0x5b91a2b4},
{0x0000000000876543, 0x4b076543, 0x4b076543}};
int pg_2[] = {1, 0, 0, 0, 1, 0, 0, 0, 0, 1};
TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_2, data_set_2);
CvtfTestDataSet data_set_3[] = {
// Simple conversions of negative int64_t values. These require no rounding,
// and the results should not depend on the rounding mode.
{0xffffffffc0000000, 0xce800000, 0x5f800000},
{0xffffffff00000000, 0xcf800000, 0x5f800000},
{0xc000000000000000, 0xde800000, 0x5f400000},
// Conversions which require rounding.
{0x0000800002800000, 0x57000002, 0x57000002},
{0x0000800002800001, 0x57000003, 0x57000003},
{0x0000800003000000, 0x57000003, 0x57000003},
// Check rounding of negative int64_t values.
{0x8000014000000000, 0xdefffffe, 0x5f000001},
{0x8000014000000001, 0xdefffffd, 0x5f000001},
{0x8000018000000000, 0xdefffffd, 0x5f000002},
// Round up to produce a result that's too big for the input to represent.
{0x00000000ffffff80, 0x4f800000, 0x4f800000},
{0x00000000ffffffff, 0x4f800000, 0x4f800000},
{0xffffff8000000000, 0xd3000000, 0x5f800000},
{0xffffffffffffffff, 0xbf800000, 0x5f800000}};
int pg_3[] = {0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1};
TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_3, data_set_3);
// clang-format on
}
TEST_SVE(scvtf_ucvtf_d_to_double) {
// clang-format off
int dst_lane_size = kDRegSize;
int src_lane_size = kDRegSize;
// Simple conversions of positive numbers which require no rounding; the
// results should not depend on the rounding mode, and ucvtf and scvtf should
// produce the same result.
CvtfTestDataSet data_set_1[] = {
{0x0000000000000000, 0x0000000000000000, 0x0000000000000000},
{0x0000000000000001, 0x3ff0000000000000, 0x3ff0000000000000},
{0x0000000040000000, 0x41d0000000000000, 0x41d0000000000000},
{0x0000000100000000, 0x41f0000000000000, 0x41f0000000000000},
{0x4000000000000000, 0x43d0000000000000, 0x43d0000000000000}};
int pg_1[] = {0, 1, 1, 0, 0};
TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_1, data_set_1);
CvtfTestDataSet data_set_2[] = {
// Test mantissa extremities.
{0x0010000000000001, 0x4330000000000001, 0x4330000000000001},
{0x4008000000000000, 0x43d0020000000000, 0x43d0020000000000},
// The largest int32_t that fits in a double.
{0x000000007fffffff, 0x41dfffffffc00000, 0x41dfffffffc00000},
// Values that would be negative if treated as an int32_t.
{0x00000000ffffffff, 0x41efffffffe00000, 0x41efffffffe00000},
{0x0000000080000000, 0x41e0000000000000, 0x41e0000000000000},
{0x0000000080000001, 0x41e0000000200000, 0x41e0000000200000},
// The largest int64_t that fits in a double.
{0x7ffffffffffffc00, 0x43dfffffffffffff, 0x43dfffffffffffff},
// Check for bit pattern reproduction.
{0x0123456789abcde0, 0x43723456789abcde, 0x43723456789abcde},
{0x0000000012345678, 0x41b2345678000000, 0x41b2345678000000}};
int pg_2[] = {1, 1, 1, 1, 1, 0, 0, 0, 0};
TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_2, data_set_2);
CvtfTestDataSet data_set_3[] = {
// Simple conversions of negative int64_t values. These require no rounding,
// and the results should not depend on the rounding mode.
{0xffffffffc0000000, 0xc1d0000000000000, 0x43effffffff80000},
{0xffffffff00000000, 0xc1f0000000000000, 0x43efffffffe00000},
{0xc000000000000000, 0xc3d0000000000000, 0x43e8000000000000},
// Conversions which require rounding.
{0x1000000000000280, 0x43b0000000000002, 0x43b0000000000002},
{0x1000000000000281, 0x43b0000000000003, 0x43b0000000000003},
{0x1000000000000300, 0x43b0000000000003, 0x43b0000000000003},
// Check rounding of negative int64_t values.
{0x8000000000000a00, 0xc3dffffffffffffe, 0x43e0000000000001},
{0x8000000000000a01, 0xc3dffffffffffffd, 0x43e0000000000001},
{0x8000000000000c00, 0xc3dffffffffffffd, 0x43e0000000000002},
// Round up to produce a result that's too big for the input to represent.
{0x7ffffffffffffe00, 0x43e0000000000000, 0x43e0000000000000},
{0x7fffffffffffffff, 0x43e0000000000000, 0x43e0000000000000},
{0xfffffffffffffc00, 0xc090000000000000, 0x43f0000000000000},
{0xffffffffffffffff, 0xbff0000000000000, 0x43f0000000000000}};
int pg_3[] = {1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0};
TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_3, data_set_3);
// clang-format on
}
TEST_SVE(scvtf_ucvtf_s_to_double) {
// clang-format off
int dst_lane_size = kDRegSize;
int src_lane_size = kSRegSize;
// Simple conversions of positive numbers which require no rounding; the
// results should not depend on the rounding mode, and ucvtf and scvtf should
// produce the same result.
CvtfTestDataSet data_set_1[] = {
{0x00000000, 0x0000000000000000, 0x0000000000000000},
{0x00000001, 0x3ff0000000000000, 0x3ff0000000000000},
{0x00004000, 0x40d0000000000000, 0x40d0000000000000},
{0x00010000, 0x40f0000000000000, 0x40f0000000000000},
{0x40000000, 0x41d0000000000000, 0x41d0000000000000}};
int pg_1[] = {1, 0, 0, 0, 1};
TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_1, data_set_1);
CvtfTestDataSet data_set_2[] = {
// Test mantissa extremities.
{0x40000400, 0x41d0000100000000, 0x41d0000100000000},
// The largest int32_t that fits in a double.
{0x7fffffff, 0x41dfffffffc00000, 0x41dfffffffc00000},
// Values that would be negative if treated as an int32_t.
{0xffffffff, 0xbff0000000000000, 0x41efffffffe00000},
{0x80000000, 0xc1e0000000000000, 0x41e0000000000000},
{0x80000001, 0xc1dfffffffc00000, 0x41e0000000200000},
// Check for bit pattern reproduction.
{0x089abcde, 0x41a13579bc000000, 0x41a13579bc000000},
{0x12345678, 0x41b2345678000000, 0x41b2345678000000},
// Simple conversions of negative int32_t values. These require no rounding,
// and the results should not depend on the rounding mode.
{0xffffc000, 0xc0d0000000000000, 0x41effff800000000},
{0xffff0000, 0xc0f0000000000000, 0x41efffe000000000},
{0xc0000000, 0xc1d0000000000000, 0x41e8000000000000}};
int pg_2[] = {1, 0, 1, 0, 0, 1, 1, 0, 1, 1};
TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_2, data_set_2);
// Note that the IEEE 754 double-precision format has a 52-bit fraction, so
// all 32-bit integers are exactly representable in a double.
// clang-format on
}
TEST_SVE(sve_fadda) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kFP);
START();
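// Fadda is a strictly-ordered floating-point add reduction, accumulating
// the active elements in element order.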
__ Ptrue(p0.VnB());
__ Pfalse(p1.VnB());
__ Zip1(p1.VnH(), p0.VnH(), p1.VnH());
__ Index(z0.VnS(), 3, 3);
__ Scvtf(z0.VnS(), p0.Merging(), z0.VnS());
__ Fmov(s2, 2.0);
__ Fadda(s2, p0, s2, z0.VnS());
__ Index(z0.VnD(), -7, -7);
__ Scvtf(z0.VnD(), p0.Merging(), z0.VnD());
__ Fmov(d3, 3.0);
__ Fadda(d3, p0, d3, z0.VnD());
__ Index(z0.VnH(), 1, 1);
__ Scvtf(z0.VnH(), p0.Merging(), z0.VnH());
__ Fmov(h4, 0);
__ Fadda(h4, p1, h4, z0.VnH());
END();
if (CAN_RUN()) {
RUN();
// The sum of 1 .. n is n(n+1)/2. n is even, so (n + 1) * (n / 2) is exact.
int n = core.GetSVELaneCount(kSRegSize);
ASSERT_EQUAL_FP32(2 + 3 * ((n + 1) * (n / 2)), s2);
n /= 2; // Half as many lanes.
ASSERT_EQUAL_FP64(3 + -7 * ((n + 1) * (n / 2)), d3);
// Sum of first n odd numbers is n^2.
n = core.GetSVELaneCount(kHRegSize) / 2; // Half are odd numbers.
ASSERT_EQUAL_FP16(Float16(n * n), h4);
}
}
TEST_SVE(sve_extract) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
__ Index(z0.VnB(), 0, 1);
__ Mov(z1, z0);
__ Mov(z2, z0);
__ Mov(z3, z0);
__ Mov(z4, z0);
__ Mov(z5, z0);
__ Mov(z6, z0);
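// An index greater than or equal to the vector length in bytes behaves as
// an index of zero, leaving the first operand unchanged; the VL-dependent
// checks below rely on this.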
__ Ext(z1, z1, z0, 0);
__ Ext(z2, z2, z0, 1);
__ Ext(z3, z3, z0, 15);
__ Ext(z4, z4, z0, 31);
__ Ext(z5, z5, z0, 47);
__ Ext(z6, z6, z0, 255);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z1, z0);
int lane_count = core.GetSVELaneCount(kBRegSize);
if (lane_count == 16) {
uint64_t z2_expected[] = {0x000f0e0d0c0b0a09, 0x0807060504030201};
ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
} else {
uint64_t z2_expected[] = {0x100f0e0d0c0b0a09, 0x0807060504030201};
ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
}
if (lane_count == 16) {
uint64_t z3_expected[] = {0x0e0d0c0b0a090807, 0x060504030201000f};
ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
} else {
uint64_t z3_expected[] = {0x1e1d1c1b1a191817, 0x161514131211100f};
ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
}
if (lane_count < 32) {
ASSERT_EQUAL_SVE(z4, z0);
} else if (lane_count == 32) {
uint64_t z4_expected[] = {0x0e0d0c0b0a090807, 0x060504030201001f};
ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
} else {
uint64_t z4_expected[] = {0x2e2d2c2b2a292827, 0x262524232221201f};
ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
}
if (lane_count < 48) {
ASSERT_EQUAL_SVE(z5, z0);
} else if (lane_count == 48) {
uint64_t z5_expected[] = {0x0e0d0c0b0a090807, 0x060504030201002f};
ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
} else {
uint64_t z5_expected[] = {0x3e3d3c3b3a393837, 0x363534333231302f};
ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
}
if (lane_count < 256) {
ASSERT_EQUAL_SVE(z6, z0);
} else {
uint64_t z6_expected[] = {0x0e0d0c0b0a090807, 0x06050403020100ff};
ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
}
}
}
TEST_SVE(sve_fp_paired_across) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
__ Ptrue(p0.VnB());
__ Pfalse(p1.VnB());
__ Zip1(p2.VnS(), p0.VnS(), p1.VnS());
__ Zip1(p3.VnD(), p0.VnD(), p1.VnD());
__ Zip1(p4.VnH(), p0.VnH(), p1.VnH());
__ Index(z0.VnS(), 3, 3);
__ Scvtf(z0.VnS(), p0.Merging(), z0.VnS());
__ Faddv(s1, p0, z0.VnS());
__ Fminv(s2, p2, z0.VnS());
__ Fmaxv(s3, p2, z0.VnS());
__ Index(z0.VnD(), -7, -7);
__ Scvtf(z0.VnD(), p0.Merging(), z0.VnD());
__ Faddv(d4, p0, z0.VnD());
__ Fminv(d5, p3, z0.VnD());
__ Fmaxv(d6, p3, z0.VnD());
__ Index(z0.VnH(), 1, 1);
__ Scvtf(z0.VnH(), p0.Merging(), z0.VnH());
__ Faddv(h7, p4, z0.VnH());
__ Fminv(h8, p4, z0.VnH());
__ Fmaxv(h9, p4, z0.VnH());
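// Fill z10 with NaNs (0.0 / 0.0), then insert +42.0 and -42.0 so that the
// Fminnmv and Fmaxnmv reductions have to ignore the NaN lanes.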
__ Dup(z10.VnH(), 0);
__ Fdiv(z10.VnH(), p0.Merging(), z10.VnH(), z10.VnH());
__ Insr(z10.VnH(), 0x5140);
__ Insr(z10.VnH(), 0xd140);
__ Ext(z10.VnB(), z10.VnB(), z10.VnB(), 2);
__ Fmaxnmv(h11, p0, z10.VnH());
__ Fmaxnmv(h12, p4, z10.VnH());
__ Fminnmv(h13, p0, z10.VnH());
__ Fminnmv(h14, p4, z10.VnH());
__ Dup(z10.VnS(), 0);
__ Fdiv(z10.VnS(), p0.Merging(), z10.VnS(), z10.VnS());
__ Insr(z10.VnS(), 0x42280000);
__ Insr(z10.VnS(), 0xc2280000);
__ Ext(z10.VnB(), z10.VnB(), z10.VnB(), 4);
__ Fmaxnmv(s15, p0, z10.VnS());
__ Fmaxnmv(s16, p2, z10.VnS());
__ Fminnmv(s17, p0, z10.VnS());
__ Fminnmv(s18, p2, z10.VnS());
__ Dup(z10.VnD(), 0);
__ Fdiv(z10.VnD(), p0.Merging(), z10.VnD(), z10.VnD());
__ Insr(z10.VnD(), 0x4045000000000000);
__ Insr(z10.VnD(), 0xc045000000000000);
__ Ext(z10.VnB(), z10.VnB(), z10.VnB(), 8);
__ Fmaxnmv(d19, p0, z10.VnD());
__ Fmaxnmv(d20, p3, z10.VnD());
__ Fminnmv(d21, p0, z10.VnD());
__ Fminnmv(d22, p3, z10.VnD());
END();
if (CAN_RUN()) {
RUN();
// The sum of 1 .. n is n(n+1)/2. n is even, so (n + 1) * (n / 2) is exact.
int n = core.GetSVELaneCount(kSRegSize);
ASSERT_EQUAL_FP32(3 * ((n + 1) * (n / 2)), s1);
ASSERT_EQUAL_FP32(3, s2);
ASSERT_EQUAL_FP32(3 * n - 3, s3);
n /= 2; // Half as many lanes.
ASSERT_EQUAL_FP64(-7 * ((n + 1) * (n / 2)), d4);
ASSERT_EQUAL_FP64(-7 * (n - 1), d5);
ASSERT_EQUAL_FP64(-7, d6);
// Sum of first n odd numbers is n^2.
n = core.GetSVELaneCount(kHRegSize) / 2; // Half are odd numbers.
ASSERT_EQUAL_FP16(Float16(n * n), h7);
ASSERT_EQUAL_FP16(Float16(1), h8);
n = core.GetSVELaneCount(kHRegSize);
ASSERT_EQUAL_FP16(Float16(n - 1), h9);
ASSERT_EQUAL_FP16(Float16(42), h11);
ASSERT_EQUAL_FP16(Float16(42), h12);
ASSERT_EQUAL_FP16(Float16(-42), h13);
ASSERT_EQUAL_FP16(Float16(42), h14);
ASSERT_EQUAL_FP32(42, s15);
ASSERT_EQUAL_FP32(42, s16);
ASSERT_EQUAL_FP32(-42, s17);
ASSERT_EQUAL_FP32(42, s18);
ASSERT_EQUAL_FP64(42, d19);
ASSERT_EQUAL_FP64(42, d20);
ASSERT_EQUAL_FP64(-42, d21);
ASSERT_EQUAL_FP64(42, d22);
}
}
TEST_SVE(sve_frecpe_frsqrte) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
__ Ptrue(p0.VnB());
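// Build lanes of ascending powers of two, with zero inserted in the lowest
// lane; the reciprocal and reciprocal square root estimates of zero are
// both positive infinity.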
__ Index(z0.VnH(), 0, 1);
__ Fdup(z1.VnH(), Float16(1));
__ Fscale(z1.VnH(), p0.Merging(), z1.VnH(), z0.VnH());
__ Insr(z1.VnH(), 0);
__ Frsqrte(z2.VnH(), z1.VnH());
__ Frecpe(z1.VnH(), z1.VnH());
__ Index(z0.VnS(), 0, 1);
__ Fdup(z3.VnS(), Float16(1));
__ Fscale(z3.VnS(), p0.Merging(), z3.VnS(), z0.VnS());
__ Insr(z3.VnS(), 0);
__ Frsqrte(z4.VnS(), z3.VnS());
__ Frecpe(z3.VnS(), z3.VnS());
__ Index(z0.VnD(), 0, 1);
__ Fdup(z5.VnD(), Float16(1));
__ Fscale(z5.VnD(), p0.Merging(), z5.VnD(), z0.VnD());
__ Insr(z5.VnD(), 0);
__ Frsqrte(z6.VnD(), z5.VnD());
__ Frecpe(z5.VnD(), z5.VnD());
END();
if (CAN_RUN()) {
RUN();
uint64_t z1_expected[] = {0x23fc27fc2bfc2ffc, 0x33fc37fc3bfc7c00};
ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
uint64_t z2_expected[] = {0x2ffc31a433fc35a4, 0x37fc39a43bfc7c00};
ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
uint64_t z3_expected[] = {0x3e7f80003eff8000, 0x3f7f80007f800000};
ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
uint64_t z4_expected[] = {0x3eff80003f348000, 0x3f7f80007f800000};
ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
uint64_t z5_expected[] = {0x3feff00000000000, 0x7ff0000000000000};
ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
uint64_t z6_expected[] = {0x3feff00000000000, 0x7ff0000000000000};
ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
}
}
TEST_SVE(sve_frecps_frsqrts) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
__ Ptrue(p0.VnB());
__ Index(z0.VnH(), 0, -1);
__ Fdup(z1.VnH(), Float16(1));
__ Fscale(z1.VnH(), p0.Merging(), z1.VnH(), z0.VnH());
__ Scvtf(z0.VnH(), p0.Merging(), z0.VnH());
__ Insr(z1.VnH(), 0);
__ Frsqrts(z2.VnH(), z1.VnH(), z0.VnH());
__ Frecps(z1.VnH(), z1.VnH(), z0.VnH());
__ Index(z0.VnS(), 0, -1);
__ Fdup(z3.VnS(), Float16(1));
__ Fscale(z3.VnS(), p0.Merging(), z3.VnS(), z0.VnS());
__ Scvtf(z0.VnS(), p0.Merging(), z0.VnS());
__ Insr(z3.VnS(), 0);
__ Frsqrts(z4.VnS(), z3.VnS(), z0.VnS());
__ Frecps(z3.VnS(), z3.VnS(), z0.VnS());
__ Index(z0.VnD(), 0, -1);
__ Fdup(z5.VnD(), Float16(1));
__ Fscale(z5.VnD(), p0.Merging(), z5.VnD(), z0.VnD());
__ Scvtf(z0.VnD(), p0.Merging(), z0.VnD());
__ Insr(z5.VnD(), 0);
__ Frsqrts(z6.VnD(), z5.VnD(), z0.VnD());
__ Frecps(z5.VnD(), z5.VnD(), z0.VnD());
END();
if (CAN_RUN()) {
RUN();
uint64_t z1_expected[] = {0x4038406040a04100, 0x4180420042004000};
ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
uint64_t z2_expected[] = {0x3e383e603ea03f00, 0x3f80400040003e00};
ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
uint64_t z3_expected[] = {0x4030000040400000, 0x4040000040000000};
ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
uint64_t z4_expected[] = {0x3ff0000040000000, 0x400000003fc00000};
ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
uint64_t z5_expected[] = {0x4008000000000000, 0x4000000000000000};
ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
uint64_t z6_expected[] = {0x4000000000000000, 0x3ff8000000000000};
ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
}
}
TEST_SVE(sve_ftsmul) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
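// Ftsmul computes zn[i] * zn[i], negating the result when bit 0 of zm[i]
// (interpreted as an integer) is set. For example, zn[i] = 2.0 with
// zm[i] = 1 produces -(2.0 * 2.0) = -4.0.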
__ Ptrue(p0.VnB());
__ Index(z0.VnH(), 0, 1);
__ Rev(z1.VnH(), z0.VnH());
__ Scvtf(z0.VnH(), p0.Merging(), z0.VnH());
__ Dup(z2.VnH(), 0);
__ Fdiv(z2.VnH(), p0.Merging(), z2.VnH(), z2.VnH());
__ Ftsmul(z3.VnH(), z0.VnH(), z1.VnH());
__ Ftsmul(z4.VnH(), z2.VnH(), z1.VnH());
__ Index(z0.VnS(), -7, 1);
__ Rev(z1.VnS(), z0.VnS());
__ Scvtf(z0.VnS(), p0.Merging(), z0.VnS());
__ Dup(z2.VnS(), 0);
__ Fdiv(z2.VnS(), p0.Merging(), z2.VnS(), z2.VnS());
__ Ftsmul(z5.VnS(), z0.VnS(), z1.VnS());
__ Ftsmul(z6.VnS(), z2.VnS(), z1.VnS());
__ Index(z0.VnD(), 2, -1);
__ Rev(z1.VnD(), z0.VnD());
__ Scvtf(z0.VnD(), p0.Merging(), z0.VnD());
__ Dup(z2.VnD(), 0);
__ Fdiv(z2.VnD(), p0.Merging(), z2.VnD(), z2.VnD());
__ Ftsmul(z7.VnD(), z0.VnD(), z1.VnD());
__ Ftsmul(z8.VnD(), z2.VnD(), z1.VnD());
END();
if (CAN_RUN()) {
RUN();
uint64_t z3_expected[] = {0x5220d0804e40cc00, 0x4880c4003c008000};
ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
uint64_t z4_expected[] = {0x7e007e007e007e00, 0x7e007e007e007e00};
ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
uint64_t z5_expected[] = {0xc180000041c80000, 0xc210000042440000};
ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
uint64_t z6_expected[] = {0x7fc000007fc00000, 0x7fc000007fc00000};
ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
uint64_t z7_expected[] = {0x3ff0000000000000, 0xc010000000000000};
ASSERT_EQUAL_SVE(z7_expected, z7.VnD());
uint64_t z8_expected[] = {0x7ff8000000000000, 0x7ff8000000000000};
ASSERT_EQUAL_SVE(z8_expected, z8.VnD());
}
}
typedef void (MacroAssembler::*FPMulAccFn)(
const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm,
FPMacroNaNPropagationOption nan_option);
// `pg_inputs` is used internally to check that predication is handled
// correctly. It has no bearing on the `result` argument: `result` holds the
// expected result under an all-true predicate.
template <typename T, size_t N>
static void FPMulAccHelper(
Test* config,
FPMulAccFn macro,
unsigned lane_size_in_bits,
const int (&pg_inputs)[N],
const T (&za_inputs)[N],
const T (&zn_inputs)[N],
const T (&zm_inputs)[N],
const uint64_t (&result)[N],
FPMacroNaNPropagationOption nan_option = FastNaNPropagation) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
ZRegister zd = z0.WithLaneSize(lane_size_in_bits);
ZRegister za = z1.WithLaneSize(lane_size_in_bits);
ZRegister zn = z2.WithLaneSize(lane_size_in_bits);
ZRegister zm = z3.WithLaneSize(lane_size_in_bits);
uint64_t za_rawbits[N];
uint64_t zn_rawbits[N];
uint64_t zm_rawbits[N];
FPToRawbitsWithSize(za_inputs, za_rawbits, lane_size_in_bits);
FPToRawbitsWithSize(zn_inputs, zn_rawbits, lane_size_in_bits);
FPToRawbitsWithSize(zm_inputs, zm_rawbits, lane_size_in_bits);
InsrHelper(&masm, za, za_rawbits);
InsrHelper(&masm, zn, zn_rawbits);
InsrHelper(&masm, zm, zm_rawbits);
// Initialize `zd` with a signalling NaN.
uint64_t sn = GetSignallingNan(lane_size_in_bits);
__ Mov(x29, sn);
__ Dup(zd, x29);
Initialise(&masm, p0.WithLaneSize(lane_size_in_bits), pg_inputs);
// Based on which registers are aliased, each macro automatically selects
// between the following forms:
//   Fmla:  fmla,  fmad  or movprfx + fmla
//   Fmls:  fmls,  fmsb  or movprfx + fmls
//   Fnmla: fnmla, fnmad or movprfx + fnmla
//   Fnmls: fnmls, fnmsb or movprfx + fnmls
ZRegister da_result = z10.WithLaneSize(lane_size_in_bits);
ZRegister dn_result = z11.WithLaneSize(lane_size_in_bits);
ZRegister dm_result = z12.WithLaneSize(lane_size_in_bits);
ZRegister d_result = z13.WithLaneSize(lane_size_in_bits);
__ Mov(da_result, za);
(masm.*macro)(da_result, p0.Merging(), da_result, zn, zm, nan_option);
__ Mov(dn_result, zn);
(masm.*macro)(dn_result, p0.Merging(), za, dn_result, zm, nan_option);
__ Mov(dm_result, zm);
(masm.*macro)(dm_result, p0.Merging(), za, zn, dm_result, nan_option);
__ Mov(d_result, zd);
(masm.*macro)(d_result, p0.Merging(), za, zn, zm, nan_option);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(za_rawbits, za);
ASSERT_EQUAL_SVE(zn_rawbits, zn);
ASSERT_EQUAL_SVE(zm_rawbits, zm);
uint64_t da_expected[N];
uint64_t dn_expected[N];
uint64_t dm_expected[N];
uint64_t d_expected[N];
for (size_t i = 0; i < N; i++) {
da_expected[i] = ((pg_inputs[i] & 1) != 0) ? result[i] : za_rawbits[i];
dn_expected[i] = ((pg_inputs[i] & 1) != 0) ? result[i] : zn_rawbits[i];
dm_expected[i] = ((pg_inputs[i] & 1) != 0) ? result[i] : zm_rawbits[i];
d_expected[i] = ((pg_inputs[i] & 1) != 0) ? result[i] : sn;
}
ASSERT_EQUAL_SVE(da_expected, da_result);
ASSERT_EQUAL_SVE(dn_expected, dn_result);
ASSERT_EQUAL_SVE(dm_expected, dm_result);
ASSERT_EQUAL_SVE(d_expected, d_result);
}
}
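// For reference: with an all-true predicate, the `result` values passed to
// FPMulAccHelper can be derived on the host with a fused multiply-add, since
// fmla computes za + (zn * zm) with a single rounding. A minimal sketch for
// the double-precision case (illustrative only; not used by the tests):
//
// uint64_t ReferenceFmla(double za, double zn, double zm) {
// return DoubleToRawbits(std::fma(zn, zm, za));
// }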
TEST_SVE(sve_fmla_fmad) {
// fmla : zd = za + zn * zm
double za_inputs[] = {-39.0, 1.0, -3.0, 2.0};
double zn_inputs[] = {-5.0, -20.0, 9.0, 8.0};
double zm_inputs[] = {9.0, -5.0, 4.0, 5.0};
int pg_inputs[] = {1, 1, 0, 1};
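// For example, in lane 0: zd = za + zn * zm = -39.0 + (-5.0 * 9.0) = -84.0.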
uint64_t fmla_result_h[] = {Float16ToRawbits(Float16(-84.0)),
Float16ToRawbits(Float16(101.0)),
Float16ToRawbits(Float16(33.0)),
Float16ToRawbits(Float16(42.0))};
// `fmad` has been tested in the helper.
FPMulAccHelper(config,
&MacroAssembler::Fmla,
kHRegSize,
pg_inputs,
za_inputs,
zn_inputs,
zm_inputs,
fmla_result_h);
uint64_t fmla_result_s[] = {FloatToRawbits(-84.0f),
FloatToRawbits(101.0f),
FloatToRawbits(33.0f),
FloatToRawbits(42.0f)};
FPMulAccHelper(config,
&MacroAssembler::Fmla,
kSRegSize,
pg_inputs,
za_inputs,
zn_inputs,
zm_inputs,
fmla_result_s);
uint64_t fmla_result_d[] = {DoubleToRawbits(-84.0),
DoubleToRawbits(101.0),
DoubleToRawbits(33.0),
DoubleToRawbits(42.0)};
FPMulAccHelper(config,
&MacroAssembler::Fmla,
kDRegSize,
pg_inputs,
za_inputs,
zn_inputs,
zm_inputs,
fmla_result_d);
}
TEST_SVE(sve_fmls_fmsb) {
// fmls : zd = za - zn * zm
double za_inputs[] = {-39.0, 1.0, -3.0, 2.0};
double zn_inputs[] = {-5.0, -20.0, 9.0, 8.0};
double zm_inputs[] = {9.0, -5.0, 4.0, 5.0};
int pg_inputs[] = {1, 0, 1, 1};
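// For example, in lane 0: zd = za - zn * zm = -39.0 - (-5.0 * 9.0) = 6.0.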
uint64_t fmls_result_h[] = {Float16ToRawbits(Float16(6.0)),
Float16ToRawbits(Float16(-99.0)),
Float16ToRawbits(Float16(-39.0)),
Float16ToRawbits(Float16(-38.0))};
// `fmsb` has been tested in the helper.
FPMulAccHelper(config,
&MacroAssembler::Fmls,
kHRegSize,
pg_inputs,
za_inputs,
zn_inputs,
zm_inputs,
fmls_result_h);
uint64_t fmls_result_s[] = {FloatToRawbits(6.0f),
FloatToRawbits(-99.0f),
FloatToRawbits(-39.0f),
FloatToRawbits(-38.0f)};
FPMulAccHelper(config,
&MacroAssembler::Fmls,
kSRegSize,
pg_inputs,
za_inputs,
zn_inputs,
zm_inputs,
fmls_result_s);
uint64_t fmls_result_d[] = {DoubleToRawbits(6.0),
DoubleToRawbits(-99.0),
DoubleToRawbits(-39.0),
DoubleToRawbits(-38.0)};
FPMulAccHelper(config,
&MacroAssembler::Fmls,
kDRegSize,
pg_inputs,
za_inputs,
zn_inputs,
zm_inputs,
fmls_result_d);
}
TEST_SVE(sve_fnmla_fnmad) {
// fnmla : zd = -za - zn * zm
double za_inputs[] = {-39.0, 1.0, -3.0, 2.0};
double zn_inputs[] = {-5.0, -20.0, 9.0, 8.0};
double zm_inputs[] = {9.0, -5.0, 4.0, 5.0};
int pg_inputs[] = {0, 1, 1, 1};
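// For example, in lane 0: zd = -za - zn * zm = 39.0 - (-5.0 * 9.0) = 84.0.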
uint64_t fnmla_result_h[] = {Float16ToRawbits(Float16(84.0)),
Float16ToRawbits(Float16(-101.0)),
Float16ToRawbits(Float16(-33.0)),
Float16ToRawbits(Float16(-42.0))};
// `fnmad` has been tested in the helper.
FPMulAccHelper(config,
&MacroAssembler::Fnmla,
kHRegSize,
pg_inputs,
za_inputs,
zn_inputs,
zm_inputs,
fnmla_result_h);
uint64_t fnmla_result_s[] = {FloatToRawbits(84.0f),
FloatToRawbits(-101.0f),
FloatToRawbits(-33.0f),
FloatToRawbits(-42.0f)};
FPMulAccHelper(config,
&MacroAssembler::Fnmla,
kSRegSize,
pg_inputs,
za_inputs,
zn_inputs,
zm_inputs,
fnmla_result_s);
uint64_t fnmla_result_d[] = {DoubleToRawbits(84.0),
DoubleToRawbits(-101.0),
DoubleToRawbits(-33.0),
DoubleToRawbits(-42.0)};
FPMulAccHelper(config,
&MacroAssembler::Fnmla,
kDRegSize,
pg_inputs,
za_inputs,
zn_inputs,
zm_inputs,
fnmla_result_d);
}
TEST_SVE(sve_fnmls_fnmsb) {
// fnmls : zd = -za + zn * zm
double za_inputs[] = {-39.0, 1.0, -3.0, 2.0};
double zn_inputs[] = {-5.0, -20.0, 9.0, 8.0};
double zm_inputs[] = {9.0, -5.0, 4.0, 5.0};
int pg_inputs[] = {1, 1, 1, 0};
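// For example, in lane 0: zd = -za + zn * zm = 39.0 + (-5.0 * 9.0) = -6.0.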
uint64_t fnmls_result_h[] = {Float16ToRawbits(Float16(-6.0)),
Float16ToRawbits(Float16(99.0)),
Float16ToRawbits(Float16(39.0)),
Float16ToRawbits(Float16(38.0))};
// `fnmsb` has been tested in the helper.
FPMulAccHelper(config,
&MacroAssembler::Fnmls,
kHRegSize,
pg_inputs,
za_inputs,
zn_inputs,
zm_inputs,
fnmls_result_h);
uint64_t fnmls_result_s[] = {FloatToRawbits(-6.0f),
FloatToRawbits(99.0f),
FloatToRawbits(39.0f),
FloatToRawbits(38.0f)};
FPMulAccHelper(config,
&MacroAssembler::Fnmls,
kSRegSize,
pg_inputs,
za_inputs,
zn_inputs,
zm_inputs,
fnmls_result_s);
uint64_t fnmls_result_d[] = {DoubleToRawbits(-6.0),
DoubleToRawbits(99.0),
DoubleToRawbits(39.0),
DoubleToRawbits(38.0)};
FPMulAccHelper(config,
&MacroAssembler::Fnmls,
kDRegSize,
pg_inputs,
za_inputs,
zn_inputs,
zm_inputs,
fnmls_result_d);
}
typedef void (MacroAssembler::*FPMulAccIdxFn)(const ZRegister& zd,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm,
int index);
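// The indexed forms multiply by a single zm element, selected by `index` from
// within each 128-bit segment of the vector rather than from the vector as a
// whole. The helper below uses FPSegmentPatternHelper to build reference
// values with the same per-segment behaviour.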
template <typename T, size_t N>
static void FPMulAccIdxHelper(Test* config,
FPMulAccFn macro,
FPMulAccIdxFn macro_idx,
const T (&za_inputs)[N],
const T (&zn_inputs)[N],
const T (&zm_inputs)[N]) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
__ Ptrue(p0.VnB());
// Repeat indexed vector across up to 2048-bit VL.
for (size_t i = 0; i < (kZRegMaxSize / kDRegSize); i += N) {
InsrHelper(&masm, z30.VnD(), zm_inputs);
}
FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z30.VnH());
InsrHelper(&masm, z1.VnD(), zn_inputs);
InsrHelper(&masm, z2.VnD(), za_inputs);
__ Mov(z3, z0);
(masm.*macro_idx)(z3.VnH(), z2.VnH(), z1.VnH(), z3.VnH(), 0); // zd == zm
__ Mov(z4, z1);
(masm.*macro_idx)(z4.VnH(), z2.VnH(), z4.VnH(), z0.VnH(), 1); // zd == zn
__ Mov(z5, z2);
(masm.*macro_idx)(z5.VnH(), z5.VnH(), z1.VnH(), z0.VnH(), 4); // zd == za
(masm.*macro_idx)(z6.VnH(), z2.VnH(), z1.VnH(), z0.VnH(), 7);
FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z30.VnS());
__ Mov(z7, z0);
(masm.*macro_idx)(z7.VnS(), z2.VnS(), z1.VnS(), z7.VnS(), 0); // zd == zm
__ Mov(z8, z1);
(masm.*macro_idx)(z8.VnS(), z2.VnS(), z8.VnS(), z0.VnS(), 1); // zd == zn
__ Mov(z9, z2);
(masm.*macro_idx)(z9.VnS(), z9.VnS(), z1.VnS(), z0.VnS(), 2); // zd == za
(masm.*macro_idx)(z10.VnS(), z2.VnS(), z1.VnS(), z0.VnS(), 3);
FPSegmentPatternHelper(&masm, z0.VnD(), p0.Merging(), z30.VnD());
__ Mov(z11, z0);
(masm.*macro_idx)(z11.VnD(), z2.VnD(), z1.VnD(), z11.VnD(), 0); // zd == zm
__ Mov(z12, z1);
(masm.*macro_idx)(z12.VnD(), z2.VnD(), z12.VnD(), z0.VnD(), 1); // zd == zn
__ Mov(z13, z2);
(masm.*macro_idx)(z13.VnD(), z13.VnD(), z1.VnD(), z0.VnD(), 0); // zd == za
__ Mov(z14, z0);
// zd == zn == zm
(masm.*macro_idx)(z14.VnD(), z2.VnD(), z14.VnD(), z14.VnD(), 1);
// The indexed forms of Fmla and Fmls never swap their arguments, so pass the
// strict NaN propagation mode to ensure that the vector-form macros below
// never swap arguments either.
FPMacroNaNPropagationOption option = StrictNaNPropagation;
// Compute the results using other instructions.
__ Dup(z0.VnH(), z30.VnH(), 0);
FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z0.VnH());
(masm.*macro)(z15.VnH(), p0.Merging(), z2.VnH(), z1.VnH(), z0.VnH(), option);
__ Dup(z0.VnH(), z30.VnH(), 1);
FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z0.VnH());
(masm.*macro)(z16.VnH(), p0.Merging(), z2.VnH(), z1.VnH(), z0.VnH(), option);
__ Dup(z0.VnH(), z30.VnH(), 4);
FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z0.VnH());
(masm.*macro)(z17.VnH(), p0.Merging(), z2.VnH(), z1.VnH(), z0.VnH(), option);
__ Dup(z0.VnH(), z30.VnH(), 7);
FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z0.VnH());
(masm.*macro)(z18.VnH(), p0.Merging(), z2.VnH(), z1.VnH(), z0.VnH(), option);
__ Dup(z0.VnS(), z30.VnS(), 0);
FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z0.VnS());
(masm.*macro)(z19.VnS(), p0.Merging(), z2.VnS(), z1.VnS(), z0.VnS(), option);
__ Dup(z0.VnS(), z30.VnS(), 1);
FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z0.VnS());
(masm.*macro)(z20.VnS(), p0.Merging(), z2.VnS(), z1.VnS(), z0.VnS(), option);
__ Dup(z0.VnS(), z30.VnS(), 2);
FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z0.VnS());
(masm.*macro)(z21.VnS(), p0.Merging(), z2.VnS(), z1.VnS(), z0.VnS(), option);
__ Dup(z0.VnS(), z30.VnS(), 3);
FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z0.VnS());
(masm.*macro)(z22.VnS(), p0.Merging(), z2.VnS(), z1.VnS(), z0.VnS(), option);
__ Dup(z0.VnD(), z30.VnD(), 0);
FPSegmentPatternHelper(&masm, z0.VnD(), p0.Merging(), z0.VnD());
(masm.*macro)(z23.VnD(), p0.Merging(), z2.VnD(), z1.VnD(), z0.VnD(), option);
__ Dup(z0.VnD(), z30.VnD(), 1);
FPSegmentPatternHelper(&masm, z0.VnD(), p0.Merging(), z0.VnD());
(masm.*macro)(z24.VnD(), p0.Merging(), z2.VnD(), z1.VnD(), z0.VnD(), option);
FPSegmentPatternHelper(&masm, z0.VnD(), p0.Merging(), z30.VnD());
__ Dup(z29.VnD(), z30.VnD(), 1);
FPSegmentPatternHelper(&masm, z29.VnD(), p0.Merging(), z29.VnD());
(masm.*macro)(z25.VnD(), p0.Merging(), z2.VnD(), z0.VnD(), z29.VnD(), option);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z15.VnH(), z3.VnH());
ASSERT_EQUAL_SVE(z16.VnH(), z4.VnH());
ASSERT_EQUAL_SVE(z17.VnH(), z5.VnH());
ASSERT_EQUAL_SVE(z18.VnH(), z6.VnH());
ASSERT_EQUAL_SVE(z19.VnS(), z7.VnS());
ASSERT_EQUAL_SVE(z20.VnS(), z8.VnS());
ASSERT_EQUAL_SVE(z21.VnS(), z9.VnS());
ASSERT_EQUAL_SVE(z22.VnS(), z10.VnS());
ASSERT_EQUAL_SVE(z23.VnD(), z11.VnD());
ASSERT_EQUAL_SVE(z24.VnD(), z12.VnD());
ASSERT_EQUAL_SVE(z11.VnD(), z13.VnD());
ASSERT_EQUAL_SVE(z25.VnD(), z14.VnD());
}
}
TEST_SVE(sve_fmla_fmls_index) {
uint64_t zm_inputs_1[] = {0x3ff000003f803c00, 0xbff00000bf80bc00};
uint64_t zn_inputs_1[] = {0x3ff012343ff03c76, 0xbff01234bff0bc76};
uint64_t za_inputs_1[] = {0x3c004000bc00c000, 0x64006800e400e800};
// Using the vector form of Fmla and Fmls to verify the indexed form.
FPMulAccIdxHelper(config,
&MacroAssembler::Fmla, // vector form
&MacroAssembler::Fmla, // indexed form
za_inputs_1,
zn_inputs_1,
zm_inputs_1);
FPMulAccIdxHelper(config,
&MacroAssembler::Fmls, // vector form
&MacroAssembler::Fmls, // indexed form
za_inputs_1,
zn_inputs_1,
zm_inputs_1);
uint64_t zm_inputs_2[] = {0x7ff5555511111111, // NaN
0xfff0000000000000}; // Infinity
uint64_t zn_inputs_2[] = {0x7f9511117fc00000, // NaN
0x7f800000ff800000}; // Infinity
uint64_t za_inputs_2[] = {0x7c11000000007e00, // NaN
0x000000007c00fc00}; // Infinity
FPMulAccIdxHelper(config,
&MacroAssembler::Fmla, // vector form
&MacroAssembler::Fmla, // indexed form
za_inputs_2,
zn_inputs_2,
zm_inputs_2);
FPMulAccIdxHelper(config,
&MacroAssembler::Fmls, // vector form
&MacroAssembler::Fmls, // indexed form
za_inputs_2,
zn_inputs_2,
zm_inputs_2);
}
// Execute a number of instructions which all use ProcessNaNs, and check that
// they all propagate NaNs correctly.
template <typename Ti, typename Td, size_t N>
static void ProcessNaNsHelper(Test* config,
int lane_size_in_bits,
const Ti (&zn_inputs)[N],
const Ti (&zm_inputs)[N],
const Td (&zd_expected)[N],
FPMacroNaNPropagationOption nan_option) {
ArithFn arith_unpredicated_macro[] = {&MacroAssembler::Fadd,
&MacroAssembler::Fsub,
&MacroAssembler::Fmul};
for (size_t i = 0; i < ArrayLength(arith_unpredicated_macro); i++) {
FPBinArithHelper(config,
arith_unpredicated_macro[i],
lane_size_in_bits,
zn_inputs,
zm_inputs,
zd_expected);
}
FPArithPredicatedFn arith_predicated_macro[] = {&MacroAssembler::Fmax,
&MacroAssembler::Fmin};
int pg_inputs[N];
// This helper uses an all-true predicate; its focus is the handling of
// special values (NaNs), not predication.
for (size_t i = 0; i < N; i++) {
pg_inputs[i] = 1;
}
// fdivr propagates the quotient (Zm) preferentially, so we don't actually
// need any special handling for StrictNaNPropagation.
FPBinArithHelper(config,
NULL,
&MacroAssembler::Fdiv,
lane_size_in_bits,
// With an all-true predicate, the value in zd is
// irrelevant to the operations.
zn_inputs,
pg_inputs,
zn_inputs,
zm_inputs,
zd_expected);
for (size_t i = 0; i < ArrayLength(arith_predicated_macro); i++) {
FPBinArithHelper(config,
arith_predicated_macro[i],
NULL,
lane_size_in_bits,
// With an all-true predicate, the value in zd is
// irrelevant to the operations.
zn_inputs,
pg_inputs,
zn_inputs,
zm_inputs,
zd_expected,
nan_option);
}
}
template <typename Ti, typename Td, size_t N>
static void ProcessNaNsHelper3(Test* config,
int lane_size_in_bits,
const Ti (&za_inputs)[N],
const Ti (&zn_inputs)[N],
const Ti (&zm_inputs)[N],
const Td (&zd_expected_fmla)[N],
const Td (&zd_expected_fmls)[N],
const Td (&zd_expected_fnmla)[N],
const Td (&zd_expected_fnmls)[N],
FPMacroNaNPropagationOption nan_option) {
int pg_inputs[N];
// This helper uses an all-true predicate; its focus is the handling of
// special values (NaNs), not predication.
for (size_t i = 0; i < N; i++) {
pg_inputs[i] = 1;
}
FPMulAccHelper(config,
&MacroAssembler::Fmla,
lane_size_in_bits,
pg_inputs,
za_inputs,
zn_inputs,
zm_inputs,
zd_expected_fmla,
nan_option);
FPMulAccHelper(config,
&MacroAssembler::Fmls,
lane_size_in_bits,
pg_inputs,
za_inputs,
zn_inputs,
zm_inputs,
zd_expected_fmls,
nan_option);
FPMulAccHelper(config,
&MacroAssembler::Fnmla,
lane_size_in_bits,
pg_inputs,
za_inputs,
zn_inputs,
zm_inputs,
zd_expected_fnmla,
nan_option);
FPMulAccHelper(config,
&MacroAssembler::Fnmls,
lane_size_in_bits,
pg_inputs,
za_inputs,
zn_inputs,
zm_inputs,
zd_expected_fnmls,
nan_option);
}
TEST_SVE(sve_process_nans_double) {
// Use non-standard NaNs to check that the payload bits are preserved.
double sa = RawbitsToDouble(0x7ff5555511111111);
double sn = RawbitsToDouble(0x7ff5555522222222);
double sm = RawbitsToDouble(0x7ff5555533333333);
double qa = RawbitsToDouble(0x7ffaaaaa11111111);
double qn = RawbitsToDouble(0x7ffaaaaa22222222);
double qm = RawbitsToDouble(0x7ffaaaaa33333333);
VIXL_ASSERT(IsSignallingNaN(sa));
VIXL_ASSERT(IsSignallingNaN(sn));
VIXL_ASSERT(IsSignallingNaN(sm));
VIXL_ASSERT(IsQuietNaN(qa));
VIXL_ASSERT(IsQuietNaN(qn));
VIXL_ASSERT(IsQuietNaN(qm));
// The input NaNs after passing through ProcessNaN.
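// Quieting a signalling NaN sets the most significant fraction bit (bit 51
// for doubles) and preserves the payload, so 0x7ff5555511111111 becomes
// 0x7ffd555511111111.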
uint64_t sa_proc = 0x7ffd555511111111;
uint64_t sn_proc = 0x7ffd555522222222;
uint64_t sm_proc = 0x7ffd555533333333;
uint64_t qa_proc = DoubleToRawbits(qa);
uint64_t qn_proc = DoubleToRawbits(qn);
uint64_t qm_proc = DoubleToRawbits(qm);
uint64_t sa_proc_n = sa_proc ^ kDSignMask;
uint64_t sn_proc_n = sn_proc ^ kDSignMask;
uint64_t qa_proc_n = qa_proc ^ kDSignMask;
uint64_t qn_proc_n = qn_proc ^ kDSignMask;
// Quiet NaNs are propagated.
double zn_inputs_1[] = {qn, 0.0, 0.0, qm, qn, qm};
double zm_inputs_1[] = {0.0, qn, qm, 0.0, qm, qn};
uint64_t zd_expected_1[] =
{qn_proc, qn_proc, qm_proc, qm_proc, qn_proc, qm_proc};
ProcessNaNsHelper(config,
kDRegSize,
zn_inputs_1,
zm_inputs_1,
zd_expected_1,
StrictNaNPropagation);
// Signalling NaNs are propagated.
double zn_inputs_2[] = {sn, 0.0, 0.0, sm, sn, sm};
double zm_inputs_2[] = {0.0, sn, sm, 0.0, sm, sn};
uint64_t zd_expected_2[] =
{sn_proc, sn_proc, sm_proc, sm_proc, sn_proc, sm_proc};
ProcessNaNsHelper(config,
kDRegSize,
zn_inputs_2,
zm_inputs_2,
zd_expected_2,
StrictNaNPropagation);
// Signalling NaNs take precedence over quiet NaNs.
double zn_inputs_3[] = {sn, qn, sn, sn, qn};
double zm_inputs_3[] = {qm, sm, sm, qn, sn};
uint64_t zd_expected_3[] = {sn_proc, sm_proc, sn_proc, sn_proc, sn_proc};
ProcessNaNsHelper(config,
kDRegSize,
zn_inputs_3,
zm_inputs_3,
zd_expected_3,
StrictNaNPropagation);
double za_inputs_4[] = {qa, qa, 0.0, 0.0, qa, qa};
double zn_inputs_4[] = {qn, 0.0, 0.0, qn, qn, qn};
double zm_inputs_4[] = {0.0, qm, qm, qm, qm, 0.0};
// If `a` is propagated, its sign is inverted by fnmla and fnmls.
// If `n` is propagated, its sign is inverted by fmls and fnmla.
// If `m` is propagated, its sign is never inverted.
uint64_t zd_expected_fmla_4[] =
{qa_proc, qa_proc, qm_proc, qn_proc, qa_proc, qa_proc};
uint64_t zd_expected_fmls_4[] =
{qa_proc, qa_proc, qm_proc, qn_proc_n, qa_proc, qa_proc};
uint64_t zd_expected_fnmla_4[] =
{qa_proc_n, qa_proc_n, qm_proc, qn_proc_n, qa_proc_n, qa_proc_n};
uint64_t zd_expected_fnmls_4[] =
{qa_proc_n, qa_proc_n, qm_proc, qn_proc, qa_proc_n, qa_proc_n};
ProcessNaNsHelper3(config,
kDRegSize,
za_inputs_4,
zn_inputs_4,
zm_inputs_4,
zd_expected_fmla_4,
zd_expected_fmls_4,
zd_expected_fnmla_4,
zd_expected_fnmls_4,
StrictNaNPropagation);
// Signalling NaNs take precedence over quiet NaNs.
double za_inputs_5[] = {qa, qa, sa, sa, sa};
double zn_inputs_5[] = {qn, sn, sn, sn, qn};
double zm_inputs_5[] = {sm, qm, sm, qa, sm};
uint64_t zd_expected_fmla_5[] = {sm_proc, sn_proc, sa_proc, sa_proc, sa_proc};
uint64_t zd_expected_fmls_5[] = {sm_proc,
sn_proc_n,
sa_proc,
sa_proc,
sa_proc};
uint64_t zd_expected_fnmla_5[] = {sm_proc,
sn_proc_n,
sa_proc_n,
sa_proc_n,
sa_proc_n};
uint64_t zd_expected_fnmls_5[] = {sm_proc,
sn_proc,
sa_proc_n,
sa_proc_n,
sa_proc_n};
ProcessNaNsHelper3(config,
kDRegSize,
za_inputs_5,
zn_inputs_5,
zm_inputs_5,
zd_expected_fmla_5,
zd_expected_fmls_5,
zd_expected_fnmla_5,
zd_expected_fnmls_5,
StrictNaNPropagation);
const double inf = kFP64PositiveInfinity;
const double inf_n = kFP64NegativeInfinity;
uint64_t inf_proc = DoubleToRawbits(inf);
uint64_t inf_proc_n = DoubleToRawbits(inf_n);
uint64_t d_inf_proc = DoubleToRawbits(kFP64DefaultNaN);
double za_inputs_6[] = {qa, qa, 0.0, -0.0, qa, sa};
double zn_inputs_6[] = {inf, -0.0, -0.0, inf, inf_n, inf};
double zm_inputs_6[] = {0.0, inf_n, inf, inf, inf, 0.0};
// quiet_nan + (0.0 * inf) produces the default NaN, not quiet_nan. Ditto for
// (inf * 0.0). On the other hand, quiet_nan + (inf * inf) propagates the
// quiet_nan.
uint64_t zd_expected_fmla_6[] =
{d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc, sa_proc};
uint64_t zd_expected_fmls_6[] =
{d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc, sa_proc};
uint64_t zd_expected_fnmla_6[] =
{d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc_n, sa_proc_n};
uint64_t zd_expected_fnmls_6[] =
{d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc_n, sa_proc_n};
ProcessNaNsHelper3(config,
kDRegSize,
za_inputs_6,
zn_inputs_6,
zm_inputs_6,
zd_expected_fmla_6,
zd_expected_fmls_6,
zd_expected_fnmla_6,
zd_expected_fnmls_6,
StrictNaNPropagation);
}
TEST_SVE(sve_process_nans_float) {
// Use non-standard NaNs to check that the payload bits are preserved.
float sa = RawbitsToFloat(0x7f951111);
float sn = RawbitsToFloat(0x7f952222);
float sm = RawbitsToFloat(0x7f953333);
float qa = RawbitsToFloat(0x7fea1111);
float qn = RawbitsToFloat(0x7fea2222);
float qm = RawbitsToFloat(0x7fea3333);
VIXL_ASSERT(IsSignallingNaN(sa));
VIXL_ASSERT(IsSignallingNaN(sn));
VIXL_ASSERT(IsSignallingNaN(sm));
VIXL_ASSERT(IsQuietNaN(qa));
VIXL_ASSERT(IsQuietNaN(qn));
VIXL_ASSERT(IsQuietNaN(qm));
// The input NaNs after passing through ProcessNaN.
uint32_t sa_proc = 0x7fd51111;
uint32_t sn_proc = 0x7fd52222;
uint32_t sm_proc = 0x7fd53333;
uint32_t qa_proc = FloatToRawbits(qa);
uint32_t qn_proc = FloatToRawbits(qn);
uint32_t qm_proc = FloatToRawbits(qm);
uint32_t sa_proc_n = sa_proc ^ kSSignMask;
uint32_t sn_proc_n = sn_proc ^ kSSignMask;
uint32_t qa_proc_n = qa_proc ^ kSSignMask;
uint32_t qn_proc_n = qn_proc ^ kSSignMask;
// Quiet NaNs are propagated.
float zn_inputs_1[] = {qn, 0.0f, 0.0f, qm, qn, qm};
float zm_inputs_1[] = {0.0f, qn, qm, 0.0f, qm, qn};
uint64_t zd_expected_1[] =
{qn_proc, qn_proc, qm_proc, qm_proc, qn_proc, qm_proc};
ProcessNaNsHelper(config,
kSRegSize,
zn_inputs_1,
zm_inputs_1,
zd_expected_1,
StrictNaNPropagation);
// Signalling NaNs are propagated.
float zn_inputs_2[] = {sn, 0.0f, 0.0f, sm, sn, sm};
float zm_inputs_2[] = {0.0f, sn, sm, 0.0f, sm, sn};
uint64_t zd_expected_2[] =
{sn_proc, sn_proc, sm_proc, sm_proc, sn_proc, sm_proc};
ProcessNaNsHelper(config,
kSRegSize,
zn_inputs_2,
zm_inputs_2,
zd_expected_2,
StrictNaNPropagation);
// Signalling NaNs take precedence over quiet NaNs.
float zn_inputs_3[] = {sn, qn, sn, sn, qn};
float zm_inputs_3[] = {qm, sm, sm, qn, sn};
uint64_t zd_expected_3[] = {sn_proc, sm_proc, sn_proc, sn_proc, sn_proc};
ProcessNaNsHelper(config,
kSRegSize,
zn_inputs_3,
zm_inputs_3,
zd_expected_3,
StrictNaNPropagation);
float za_inputs_4[] = {qa, qa, 0.0f, 0.0f, qa, qa};
float zn_inputs_4[] = {qn, 0.0f, 0.0f, qn, qn, qn};
float zm_inputs_4[] = {0.0f, qm, qm, qm, qm, 0.0f};
// If `a` is propagated, its sign is inverted by fnmla and fnmls.
// If `n` is propagated, its sign is inverted by fmls and fnmla.
// If `m` is propagated, its sign is never inverted.
uint64_t zd_expected_fmla_4[] =
{qa_proc, qa_proc, qm_proc, qn_proc, qa_proc, qa_proc};
uint64_t zd_expected_fmls_4[] =
{qa_proc, qa_proc, qm_proc, qn_proc_n, qa_proc, qa_proc};
uint64_t zd_expected_fnmla_4[] =
{qa_proc_n, qa_proc_n, qm_proc, qn_proc_n, qa_proc_n, qa_proc_n};
uint64_t zd_expected_fnmls_4[] =
{qa_proc_n, qa_proc_n, qm_proc, qn_proc, qa_proc_n, qa_proc_n};
ProcessNaNsHelper3(config,
kSRegSize,
za_inputs_4,
zn_inputs_4,
zm_inputs_4,
zd_expected_fmla_4,
zd_expected_fmls_4,
zd_expected_fnmla_4,
zd_expected_fnmls_4,
StrictNaNPropagation);
// Signalling NaNs take precedence over quiet NaNs.
float za_inputs_5[] = {qa, qa, sa, sa, sa};
float zn_inputs_5[] = {qn, sn, sn, sn, qn};
float zm_inputs_5[] = {sm, qm, sm, qa, sm};
uint64_t zd_expected_fmla_5[] = {sm_proc, sn_proc, sa_proc, sa_proc, sa_proc};
uint64_t zd_expected_fmls_5[] = {sm_proc,
sn_proc_n,
sa_proc,
sa_proc,
sa_proc};
uint64_t zd_expected_fnmla_5[] = {sm_proc,
sn_proc_n,
sa_proc_n,
sa_proc_n,
sa_proc_n};
uint64_t zd_expected_fnmls_5[] = {sm_proc,
sn_proc,
sa_proc_n,
sa_proc_n,
sa_proc_n};
ProcessNaNsHelper3(config,
kSRegSize,
za_inputs_5,
zn_inputs_5,
zm_inputs_5,
zd_expected_fmla_5,
zd_expected_fmls_5,
zd_expected_fnmla_5,
zd_expected_fnmls_5,
StrictNaNPropagation);
const float inf = kFP32PositiveInfinity;
const float inf_n = kFP32NegativeInfinity;
uint32_t inf_proc = FloatToRawbits(inf);
uint32_t inf_proc_n = FloatToRawbits(inf_n);
uint32_t d_inf_proc = FloatToRawbits(kFP32DefaultNaN);
float za_inputs_6[] = {qa, qa, 0.0f, 0.0f, qa, sa};
float zn_inputs_6[] = {inf, 0.0f, 0.0f, inf, inf_n, inf};
float zm_inputs_6[] = {0.0f, inf_n, inf, inf, inf, 0.0f};
// quiet_nan + (0.0 * inf) produces the default NaN, not quiet_nan. Ditto for
// (inf * 0.0). On the other hand, quiet_nan + (inf * inf) propagates the
// quiet_nan.
uint64_t zd_expected_fmla_6[] =
{d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc, sa_proc};
uint64_t zd_expected_fmls_6[] =
{d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc, sa_proc};
uint64_t zd_expected_fnmla_6[] =
{d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc_n, sa_proc_n};
uint64_t zd_expected_fnmls_6[] =
{d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc_n, sa_proc_n};
ProcessNaNsHelper3(config,
kSRegSize,
za_inputs_6,
zn_inputs_6,
zm_inputs_6,
zd_expected_fmla_6,
zd_expected_fmls_6,
zd_expected_fnmla_6,
zd_expected_fnmls_6,
StrictNaNPropagation);
}
TEST_SVE(sve_process_nans_half) {
// Use non-standard NaNs to check that the payload bits are preserved.
Float16 sa(RawbitsToFloat16(0x7c11));
Float16 sn(RawbitsToFloat16(0x7c22));
Float16 sm(RawbitsToFloat16(0x7c33));
Float16 qa(RawbitsToFloat16(0x7e44));
Float16 qn(RawbitsToFloat16(0x7e55));
Float16 qm(RawbitsToFloat16(0x7e66));
VIXL_ASSERT(IsSignallingNaN(sa));
VIXL_ASSERT(IsSignallingNaN(sn));
VIXL_ASSERT(IsSignallingNaN(sm));
VIXL_ASSERT(IsQuietNaN(qa));
VIXL_ASSERT(IsQuietNaN(qn));
VIXL_ASSERT(IsQuietNaN(qm));
// The input NaNs after passing through ProcessNaN.
uint16_t sa_proc = 0x7e11;
uint16_t sn_proc = 0x7e22;
uint16_t sm_proc = 0x7e33;
uint16_t qa_proc = Float16ToRawbits(qa);
uint16_t qn_proc = Float16ToRawbits(qn);
uint16_t qm_proc = Float16ToRawbits(qm);
uint16_t sa_proc_n = sa_proc ^ kHSignMask;
uint16_t sn_proc_n = sn_proc ^ kHSignMask;
uint16_t qa_proc_n = qa_proc ^ kHSignMask;
uint16_t qn_proc_n = qn_proc ^ kHSignMask;
Float16 zero(0.0);
// Quiet NaNs are propagated.
Float16 zn_inputs_1[] = {qn, zero, zero, qm, qn, qm};
Float16 zm_inputs_1[] = {zero, qn, qm, zero, qm, qn};
uint64_t zd_expected_1[] =
{qn_proc, qn_proc, qm_proc, qm_proc, qn_proc, qm_proc};
ProcessNaNsHelper(config,
kHRegSize,
zn_inputs_1,
zm_inputs_1,
zd_expected_1,
StrictNaNPropagation);
// Signalling NaNs are propagated.
Float16 zn_inputs_2[] = {sn, zero, zero, sm, sn, sm};
Float16 zm_inputs_2[] = {zero, sn, sm, zero, sm, sn};
uint64_t zd_expected_2[] =
{sn_proc, sn_proc, sm_proc, sm_proc, sn_proc, sm_proc};
ProcessNaNsHelper(config,
kHRegSize,
zn_inputs_2,
zm_inputs_2,
zd_expected_2,
StrictNaNPropagation);
// Signalling NaNs take precedence over quiet NaNs.
Float16 zn_inputs_3[] = {sn, qn, sn, sn, qn};
Float16 zm_inputs_3[] = {qm, sm, sm, qn, sn};
uint64_t zd_expected_3[] = {sn_proc, sm_proc, sn_proc, sn_proc, sn_proc};
ProcessNaNsHelper(config,
kHRegSize,
zn_inputs_3,
zm_inputs_3,
zd_expected_3,
StrictNaNPropagation);
Float16 za_inputs_4[] = {qa, qa, zero, zero, qa, qa};
Float16 zn_inputs_4[] = {qn, zero, zero, qn, qn, qn};
Float16 zm_inputs_4[] = {zero, qm, qm, qm, qm, zero};
// If `a` is propagated, its sign is inverted by fnmla and fnmls.
// If `n` is propagated, its sign is inverted by fmls and fnmla.
// If `m` is propagated, its sign is never inverted.
uint64_t zd_expected_fmla_4[] =
{qa_proc, qa_proc, qm_proc, qn_proc, qa_proc, qa_proc};
uint64_t zd_expected_fmls_4[] =
{qa_proc, qa_proc, qm_proc, qn_proc_n, qa_proc, qa_proc};
uint64_t zd_expected_fnmla_4[] =
{qa_proc_n, qa_proc_n, qm_proc, qn_proc_n, qa_proc_n, qa_proc_n};
uint64_t zd_expected_fnmls_4[] =
{qa_proc_n, qa_proc_n, qm_proc, qn_proc, qa_proc_n, qa_proc_n};
ProcessNaNsHelper3(config,
kHRegSize,
za_inputs_4,
zn_inputs_4,
zm_inputs_4,
zd_expected_fmla_4,
zd_expected_fmls_4,
zd_expected_fnmla_4,
zd_expected_fnmls_4,
StrictNaNPropagation);
// Signalling NaNs take precedence over quiet NaNs.
Float16 za_inputs_5[] = {qa, qa, sa, sa, sa};
Float16 zn_inputs_5[] = {qn, sn, sn, sn, qn};
Float16 zm_inputs_5[] = {sm, qm, sm, qa, sm};
uint64_t zd_expected_fmla_5[] = {sm_proc, sn_proc, sa_proc, sa_proc, sa_proc};
uint64_t zd_expected_fmls_5[] = {sm_proc,
sn_proc_n,
sa_proc,
sa_proc,
sa_proc};
uint64_t zd_expected_fnmla_5[] = {sm_proc,
sn_proc_n,
sa_proc_n,
sa_proc_n,
sa_proc_n};
uint64_t zd_expected_fnmls_5[] = {sm_proc,
sn_proc,
sa_proc_n,
sa_proc_n,
sa_proc_n};
ProcessNaNsHelper3(config,
kHRegSize,
za_inputs_5,
zn_inputs_5,
zm_inputs_5,
zd_expected_fmla_5,
zd_expected_fmls_5,
zd_expected_fnmla_5,
zd_expected_fnmls_5,
StrictNaNPropagation);
const Float16 inf = kFP16PositiveInfinity;
const Float16 inf_n = kFP16NegativeInfinity;
uint64_t inf_proc = Float16ToRawbits(inf);
uint64_t inf_proc_n = Float16ToRawbits(inf_n);
uint64_t d_inf_proc = Float16ToRawbits(kFP16DefaultNaN);
Float16 za_inputs_6[] = {qa, qa, zero, zero, qa, sa};
Float16 zn_inputs_6[] = {inf, zero, zero, inf, inf_n, inf};
Float16 zm_inputs_6[] = {zero, inf_n, inf, inf, inf, zero};
// quiet_nan + (0.0 * inf) produces the default NaN, not quiet_nan. Ditto for
// (inf * 0.0). On the other hand, quiet_nan + (inf * inf) propagates the
// quiet_nan.
uint64_t zd_expected_fmla_6[] =
{d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc, sa_proc};
uint64_t zd_expected_fmls_6[] =
{d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc, sa_proc};
uint64_t zd_expected_fnmla_6[] =
{d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc_n, sa_proc_n};
uint64_t zd_expected_fnmls_6[] =
{d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc_n, sa_proc_n};
ProcessNaNsHelper3(config,
kHRegSize,
za_inputs_6,
zn_inputs_6,
zm_inputs_6,
zd_expected_fmla_6,
zd_expected_fmls_6,
zd_expected_fnmla_6,
zd_expected_fnmls_6,
StrictNaNPropagation);
}
typedef void (MacroAssembler::*FCmpFn)(const PRegisterWithLaneSize& pd,
const PRegisterZ& pg,
const ZRegister& zn,
const ZRegister& zm);
typedef void (MacroAssembler::*FCmpZeroFn)(const PRegisterWithLaneSize& pd,
const PRegisterZ& pg,
const ZRegister& zn,
double zero);
typedef void (MacroAssembler::*CmpFn)(const PRegisterWithLaneSize& pd,
const PRegisterZ& pg,
const ZRegister& zn,
const ZRegister& zm);
static FCmpFn GetFpAbsCompareFn(Condition cond) {
switch (cond) {
case ge:
return &MacroAssembler::Facge;
case gt:
return &MacroAssembler::Facgt;
case le:
return &MacroAssembler::Facle;
case lt:
return &MacroAssembler::Faclt;
default:
VIXL_UNIMPLEMENTED();
return NULL;
}
}
static FCmpFn GetFpCompareFn(Condition cond) {
switch (cond) {
case ge:
return &MacroAssembler::Fcmge;
case gt:
return &MacroAssembler::Fcmgt;
case le:
return &MacroAssembler::Fcmle;
case lt:
return &MacroAssembler::Fcmlt;
case eq:
return &MacroAssembler::Fcmeq;
case ne:
return &MacroAssembler::Fcmne;
case uo:
return &MacroAssembler::Fcmuo;
default:
VIXL_UNIMPLEMENTED();
return NULL;
}
}
static FCmpZeroFn GetFpCompareZeroFn(Condition cond) {
switch (cond) {
case ge:
return &MacroAssembler::Fcmge;
case gt:
return &MacroAssembler::Fcmgt;
case le:
return &MacroAssembler::Fcmle;
case lt:
return &MacroAssembler::Fcmlt;
case eq:
return &MacroAssembler::Fcmeq;
case ne:
return &MacroAssembler::Fcmne;
default:
VIXL_UNIMPLEMENTED();
return NULL;
}
}
static CmpFn GetIntCompareFn(Condition cond) {
switch (cond) {
case ge:
return &MacroAssembler::Cmpge;
case gt:
return &MacroAssembler::Cmpgt;
case le:
return &MacroAssembler::Cmple;
case lt:
return &MacroAssembler::Cmplt;
case eq:
return &MacroAssembler::Cmpeq;
case ne:
return &MacroAssembler::Cmpne;
default:
VIXL_UNIMPLEMENTED();
return NULL;
}
}
template <size_t N>
static void TestFpCompareHelper(Test* config,
int lane_size_in_bits,
Condition cond,
const double (&zn_inputs)[N],
const double (&zm_inputs)[N],
const int (&pd_expected)[N],
bool is_absolute = false) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
ZRegister zt_int_1 = z1.WithLaneSize(lane_size_in_bits);
ZRegister zt_int_2 = z2.WithLaneSize(lane_size_in_bits);
ZRegister zt_int_3 = z3.WithLaneSize(lane_size_in_bits);
ZRegister zt_fp_1 = z11.WithLaneSize(lane_size_in_bits);
ZRegister zt_fp_2 = z12.WithLaneSize(lane_size_in_bits);
ZRegister zt_fp_3 = z13.WithLaneSize(lane_size_in_bits);
ZRegister fp_one = z31.WithLaneSize(lane_size_in_bits);
PRegisterWithLaneSize pd_result_int_1 = p15.WithLaneSize(lane_size_in_bits);
PRegisterWithLaneSize pd_result_fp_1 = p14.WithLaneSize(lane_size_in_bits);
PRegisterWithLaneSize pd_result_int_2 = p13.WithLaneSize(lane_size_in_bits);
PRegisterWithLaneSize pd_result_fp_2 = p12.WithLaneSize(lane_size_in_bits);
FCmpFn fcmp = is_absolute ? GetFpAbsCompareFn(cond) : GetFpCompareFn(cond);
__ Ptrue(p1.VnB());
if (cond != uo) {
int pg_inputs[] = {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1};
Initialise(&masm, p0.WithLaneSize(lane_size_in_bits), pg_inputs);
__ Fdup(fp_one, 0.1f);
__ Index(zt_int_1, 3, 3);
__ Scvtf(zt_fp_1, p0.Merging(), zt_int_1);
__ Fadd(zt_fp_1, zt_fp_1, fp_one);
__ Index(zt_int_2, 3, -10);
__ Scvtf(zt_fp_2, p0.Merging(), zt_int_2);
__ Fadd(zt_fp_2, zt_fp_2, fp_one);
__ Index(zt_int_3, 3, 2);
__ Scvtf(zt_fp_3, p0.Merging(), zt_int_3);
__ Fadd(zt_fp_3, zt_fp_3, fp_one);
// There is no absolute comparison for integers, so use `Abs` with `Cmp<cc>`
// to synthesize the expected result for `fac<cc>`.
if (is_absolute) {
__ Abs(zt_int_2, p1.Merging(), zt_int_2);
}
CmpFn cmp = GetIntCompareFn(cond);
(masm.*cmp)(pd_result_int_1, p0.Zeroing(), zt_int_1, zt_int_2);
(masm.*fcmp)(pd_result_fp_1, p0.Zeroing(), zt_fp_1, zt_fp_2);
(masm.*cmp)(pd_result_int_2, p0.Zeroing(), zt_int_1, zt_int_3);
(masm.*fcmp)(pd_result_fp_2, p0.Zeroing(), zt_fp_1, zt_fp_3);
}
uint64_t zn_inputs_rawbits[N];
uint64_t zm_inputs_rawbits[N];
FPToRawbitsWithSize(zn_inputs, zn_inputs_rawbits, lane_size_in_bits);
FPToRawbitsWithSize(zm_inputs, zm_inputs_rawbits, lane_size_in_bits);
ZRegister zn_fp = z14.WithLaneSize(lane_size_in_bits);
ZRegister zm_fp = z15.WithLaneSize(lane_size_in_bits);
InsrHelper(&masm, zn_fp, zn_inputs_rawbits);
InsrHelper(&masm, zm_fp, zm_inputs_rawbits);
PRegisterWithLaneSize pd_result_fp_3 = p11.WithLaneSize(lane_size_in_bits);
(masm.*fcmp)(pd_result_fp_3, p1.Zeroing(), zn_fp, zm_fp);
END();
if (CAN_RUN()) {
RUN();
if (cond != uo) {
ASSERT_EQUAL_SVE(pd_result_int_1, pd_result_fp_1);
ASSERT_EQUAL_SVE(pd_result_int_2, pd_result_fp_2);
}
ASSERT_EQUAL_SVE(pd_expected, pd_result_fp_3);
}
}
TEST_SVE(sve_fp_compare_vectors) {
double inf_p = kFP64PositiveInfinity;
double inf_n = kFP64NegativeInfinity;
double nan = kFP64DefaultNaN;
// Normal floating-point comparison has been tested in the helper.
double zn[] = {0.0, inf_n, 1.0, inf_p, inf_p, nan, 0.0, nan};
double zm[] = {-0.0, inf_n, inf_n, -2.0, inf_n, nan, nan, inf_p};
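// A comparison with a NaN operand is unordered, so of these only Fcmne and
// Fcmuo produce true lanes for the NaN inputs above.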
int pd_fcm_gt[] = {0, 0, 1, 1, 1, 0, 0, 0};
int pd_fcm_lt[] = {0, 0, 0, 0, 0, 0, 0, 0};
int pd_fcm_ge[] = {1, 1, 1, 1, 1, 0, 0, 0};
int pd_fcm_le[] = {1, 1, 0, 0, 0, 0, 0, 0};
int pd_fcm_eq[] = {1, 1, 0, 0, 0, 0, 0, 0};
int pd_fcm_ne[] = {0, 0, 1, 1, 1, 1, 1, 1};
int pd_fcm_uo[] = {0, 0, 0, 0, 0, 1, 1, 1};
int pd_fac_gt[] = {0, 0, 0, 1, 0, 0, 0, 0};
int pd_fac_lt[] = {0, 0, 1, 0, 0, 0, 0, 0};
int pd_fac_ge[] = {1, 1, 0, 1, 1, 0, 0, 0};
int pd_fac_le[] = {1, 1, 1, 0, 1, 0, 0, 0};
int lane_sizes[] = {kHRegSize, kSRegSize, kDRegSize};
for (size_t i = 0; i < ArrayLength(lane_sizes); i++) {
int lane_size = lane_sizes[i];
// Test floating-point compare vectors.
TestFpCompareHelper(config, lane_size, gt, zn, zm, pd_fcm_gt);
TestFpCompareHelper(config, lane_size, lt, zn, zm, pd_fcm_lt);
TestFpCompareHelper(config, lane_size, ge, zn, zm, pd_fcm_ge);
TestFpCompareHelper(config, lane_size, le, zn, zm, pd_fcm_le);
TestFpCompareHelper(config, lane_size, eq, zn, zm, pd_fcm_eq);
TestFpCompareHelper(config, lane_size, ne, zn, zm, pd_fcm_ne);
TestFpCompareHelper(config, lane_size, uo, zn, zm, pd_fcm_uo);
// Test floating-point absolute compare vectors.
TestFpCompareHelper(config, lane_size, gt, zn, zm, pd_fac_gt, true);
TestFpCompareHelper(config, lane_size, lt, zn, zm, pd_fac_lt, true);
TestFpCompareHelper(config, lane_size, ge, zn, zm, pd_fac_ge, true);
TestFpCompareHelper(config, lane_size, le, zn, zm, pd_fac_le, true);
}
}
template <size_t N, typename T>
static void TestFpCompareZeroHelper(Test* config,
int lane_size_in_bits,
Condition cond,
const T (&zn_inputs)[N],
const int (&pd_expected)[N]) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
ZRegister zn = z28.WithLaneSize(lane_size_in_bits);
PRegisterWithLaneSize pd = p14.WithLaneSize(lane_size_in_bits);
uint64_t zn_rawbits[N];
FPToRawbitsWithSize(zn_inputs, zn_rawbits, lane_size_in_bits);
InsrHelper(&masm, zn, zn_rawbits);
__ Ptrue(p0.VnB());
(masm.*GetFpCompareZeroFn(cond))(pd, p0.Zeroing(), zn, 0.0);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(pd_expected, pd);
}
}
TEST_SVE(sve_fp_compare_vector_zero) {
Float16 fp16_inf_p = kFP16PositiveInfinity;
Float16 fp16_inf_n = kFP16NegativeInfinity;
Float16 fp16_dn = kFP16DefaultNaN;
Float16 fp16_sn = RawbitsToFloat16(0x7c22);
Float16 fp16_qn = RawbitsToFloat16(0x7e55);
float fp32_inf_p = kFP32PositiveInfinity;
float fp32_inf_n = kFP32NegativeInfinity;
float fp32_dn = kFP32DefaultNaN;
float fp32_sn = RawbitsToFloat(0x7f952222);
float fp32_qn = RawbitsToFloat(0x7fea2222);
double fp64_inf_p = kFP64PositiveInfinity;
double fp64_inf_n = kFP64NegativeInfinity;
double fp64_dn = kFP64DefaultNaN;
double fp64_sn = RawbitsToDouble(0x7ff5555511111111);
double fp64_qn = RawbitsToDouble(0x7ffaaaaa11111111);
// Normal floating-point comparison has been tested in the non-zero form.
Float16 zn_inputs_h[] = {Float16(0.0),
Float16(-0.0),
fp16_inf_p,
fp16_inf_n,
fp16_dn,
fp16_sn,
fp16_qn};
float zn_inputs_s[] =
{0.0, -0.0, fp32_inf_p, fp32_inf_n, fp32_dn, fp32_sn, fp32_qn};
double zn_inputs_d[] =
{0.0, -0.0, fp64_inf_p, fp64_inf_n, fp64_dn, fp64_sn, fp64_qn};
int pd_expected_gt[] = {0, 0, 1, 0, 0, 0, 0};
int pd_expected_lt[] = {0, 0, 0, 1, 0, 0, 0};
int pd_expected_ge[] = {1, 1, 1, 0, 0, 0, 0};
int pd_expected_le[] = {1, 1, 0, 1, 0, 0, 0};
int pd_expected_eq[] = {1, 1, 0, 0, 0, 0, 0};
int pd_expected_ne[] = {0, 0, 1, 1, 1, 1, 1};
TestFpCompareZeroHelper(config, kDRegSize, gt, zn_inputs_d, pd_expected_gt);
TestFpCompareZeroHelper(config, kDRegSize, lt, zn_inputs_d, pd_expected_lt);
TestFpCompareZeroHelper(config, kDRegSize, ge, zn_inputs_d, pd_expected_ge);
TestFpCompareZeroHelper(config, kDRegSize, le, zn_inputs_d, pd_expected_le);
TestFpCompareZeroHelper(config, kDRegSize, eq, zn_inputs_d, pd_expected_eq);
TestFpCompareZeroHelper(config, kDRegSize, ne, zn_inputs_d, pd_expected_ne);
TestFpCompareZeroHelper(config, kSRegSize, gt, zn_inputs_s, pd_expected_gt);
TestFpCompareZeroHelper(config, kSRegSize, lt, zn_inputs_s, pd_expected_lt);
TestFpCompareZeroHelper(config, kSRegSize, ge, zn_inputs_s, pd_expected_ge);
TestFpCompareZeroHelper(config, kSRegSize, le, zn_inputs_s, pd_expected_le);
TestFpCompareZeroHelper(config, kSRegSize, eq, zn_inputs_s, pd_expected_eq);
TestFpCompareZeroHelper(config, kSRegSize, ne, zn_inputs_s, pd_expected_ne);
TestFpCompareZeroHelper(config, kHRegSize, gt, zn_inputs_h, pd_expected_gt);
TestFpCompareZeroHelper(config, kHRegSize, lt, zn_inputs_h, pd_expected_lt);
TestFpCompareZeroHelper(config, kHRegSize, ge, zn_inputs_h, pd_expected_ge);
TestFpCompareZeroHelper(config, kHRegSize, le, zn_inputs_h, pd_expected_le);
TestFpCompareZeroHelper(config, kHRegSize, eq, zn_inputs_h, pd_expected_eq);
TestFpCompareZeroHelper(config, kHRegSize, ne, zn_inputs_h, pd_expected_ne);
}
typedef void (MacroAssembler::*FPUnaryMFn)(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& zn);
typedef void (MacroAssembler::*FPUnaryZFn)(const ZRegister& zd,
const PRegisterZ& pg,
const ZRegister& zn);
template <size_t N, size_t M>
static void TestFPUnaryPredicatedHelper(Test* config,
int src_size_in_bits,
int dst_size_in_bits,
uint64_t (&zn_inputs)[N],
const uint64_t (&pg_inputs)[M],
const uint64_t (&zd_expected)[N],
FPUnaryMFn macro_m,
FPUnaryZFn macro_z) {
// Provide the full predicate input.
VIXL_ASSERT(M == (kPRegMaxSize / kDRegSize));
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int ds = dst_size_in_bits;
int ss = src_size_in_bits;
int ls = std::max(ss, ds);
// When the destination type is larger than the source type, fill the high
// parts with noise values, which should be ignored.
if (ds > ss) {
VIXL_ASSERT(ss < 64);
uint64_t zn_inputs_mod[N];
uint64_t sn = GetSignallingNan(ss);
for (unsigned i = 0; i < N; i++) {
zn_inputs_mod[i] = zn_inputs[i] | ((sn + i) << ss);
}
InsrHelper(&masm, z29.WithLaneSize(ls), zn_inputs_mod);
} else {
InsrHelper(&masm, z29.WithLaneSize(ls), zn_inputs);
}
// Make a copy so we can check that constructive operations preserve zn.
__ Mov(z28, z29);
// Run the operation on all lanes.
__ Ptrue(p0.WithLaneSize(ls));
(masm.*macro_m)(z27.WithLaneSize(ds), p0.Merging(), z28.WithLaneSize(ss));
Initialise(&masm,
p1.VnB(),
pg_inputs[3],
pg_inputs[2],
pg_inputs[1],
pg_inputs[0]);
// Clear the irrelevant lanes.
__ Index(z31.WithLaneSize(ls), 0, 1);
__ Cmplt(p1.WithLaneSize(ls), p1.Zeroing(), z31.WithLaneSize(ls), N);
// Check merging predication.
__ Index(z11.WithLaneSize(ls), 42, 1);
// Preserve the base value so we can derive the expected result.
__ Mov(z21, z11);
__ Mov(z9, z11);
(masm.*macro_m)(z11.WithLaneSize(ds), p1.Merging(), z28.WithLaneSize(ss));
// Generate expected values using explicit merging operations.
InsrHelper(&masm, z25.WithLaneSize(ls), zd_expected);
__ Mov(z21.WithLaneSize(ls), p1.Merging(), z25.WithLaneSize(ls));
// Check zeroing predication.
__ Index(z12.WithLaneSize(ds), 42, -1);
(masm.*macro_z)(z12.WithLaneSize(ds), p1.Zeroing(), z28.WithLaneSize(ss));
// Generate expected values using explicit zeroing operations.
InsrHelper(&masm, z30.WithLaneSize(ls), zd_expected);
// Emulate zeroing predication.
__ Dup(z22.WithLaneSize(ls), 0);
__ Mov(z22.WithLaneSize(ls), p1.Merging(), z30.WithLaneSize(ls));
// Check an in-place update.
__ Mov(z9.WithLaneSize(ls), p1.Merging(), z28.WithLaneSize(ls));
(masm.*macro_m)(z9.WithLaneSize(ds), p1.Merging(), z9.WithLaneSize(ss));
END();
if (CAN_RUN()) {
RUN();
// Check all lanes.
ASSERT_EQUAL_SVE(zd_expected, z27.WithLaneSize(ls));
// Check that constructive operations preserve their inputs.
ASSERT_EQUAL_SVE(z28, z29);
// Check merging predication.
ASSERT_EQUAL_SVE(z21.WithLaneSize(ls), z11.WithLaneSize(ls));
// Check zeroing predication.
ASSERT_EQUAL_SVE(z22.WithLaneSize(ls), z12.WithLaneSize(ls));
// Check in-place operation where zd == zn.
ASSERT_EQUAL_SVE(z21.WithLaneSize(ls), z9.WithLaneSize(ls));
}
}
template <size_t N, typename T>
static void TestFPUnaryPredicatedHelper(Test* config,
int src_size_in_bits,
int dst_size_in_bits,
T (&zn_inputs)[N],
const T (&zd_expected)[N],
FPUnaryMFn macro_m,
FPUnaryZFn macro_z) {
uint64_t pg_inputs[] = {0xa55aa55aa55aa55a,
0xa55aa55aa55aa55a,
0xa55aa55aa55aa55a,
0xa55aa55aa55aa55a};
TestFPUnaryPredicatedHelper(config,
src_size_in_bits,
dst_size_in_bits,
zn_inputs,
pg_inputs,
zd_expected,
macro_m,
macro_z);
// The complement of the above predicate, for full input coverage.
uint64_t pg_c_inputs[] = {0x5aa55aa55aa55aa5,
0x5aa55aa55aa55aa5,
0x5aa55aa55aa55aa5,
0x5aa55aa55aa55aa5};
TestFPUnaryPredicatedHelper(config,
src_size_in_bits,
dst_size_in_bits,
zn_inputs,
pg_c_inputs,
zd_expected,
macro_m,
macro_z);
}
template <size_t N, typename T>
static void TestFcvtHelper(Test* config,
int src_size_in_bits,
int dst_size_in_bits,
T (&zn_inputs)[N],
const T (&zd_expected)[N]) {
TestFPUnaryPredicatedHelper(config,
src_size_in_bits,
dst_size_in_bits,
zn_inputs,
zd_expected,
&MacroAssembler::Fcvt, // Merging form.
&MacroAssembler::Fcvt); // Zeroing form.
}
TEST_SVE(sve_fcvt) {
uint64_t h_vals[] = {0x7c00,
0xfc00,
0,
0x8000,
0x7bff, // Max half precision.
0x0400, // Min positive normal.
0x03ff, // Max subnormal.
0x0001}; // Min positive subnormal.
uint64_t s_vals[] = {0x7f800000,
0xff800000,
0,
0x80000000,
0x477fe000,
0x38800000,
0x387fc000,
0x33800000};
uint64_t d_vals[] = {0x7ff0000000000000,
0xfff0000000000000,
0,
0x8000000000000000,
0x40effc0000000000,
0x3f10000000000000,
0x3f0ff80000000000,
0x3e70000000000000};
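// Each array holds the same values at its own precision, so every conversion
// can be checked in both directions. For example, 65504.0 (the largest
// half-precision value) is 0x7bff as a half, 0x477fe000 as a single and
// 0x40effc0000000000 as a double.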
TestFcvtHelper(config, kHRegSize, kSRegSize, h_vals, s_vals);
TestFcvtHelper(config, kSRegSize, kHRegSize, s_vals, h_vals);
TestFcvtHelper(config, kSRegSize, kDRegSize, s_vals, d_vals);
TestFcvtHelper(config, kDRegSize, kSRegSize, d_vals, s_vals);
TestFcvtHelper(config, kHRegSize, kDRegSize, h_vals, d_vals);
TestFcvtHelper(config, kDRegSize, kHRegSize, d_vals, h_vals);
}
TEST_SVE(sve_fcvt_nan) {
uint64_t h_inputs[] = {0x7e55, // Quiet NaN.
0x7c22}; // Signalling NaN.
uint64_t h2s_expected[] = {0x7fcaa000, 0x7fc44000};
uint64_t h2d_expected[] = {0x7ff9540000000000, 0x7ff8880000000000};
uint64_t s_inputs[] = {0x7fc12345, // Quiet NaN.
0x7f812345}; // Signalling NaN.
uint64_t s2h_expected[] = {0x7e09, 0x7e09};
uint64_t s2d_expected[] = {0x7ff82468a0000000, 0x7ff82468a0000000};
uint64_t d_inputs[] = {0x7ffaaaaa22222222, // Quiet NaN.
0x7ff5555511111111}; // Signalling NaN.
uint64_t d2h_expected[] = {0x7eaa, 0x7f55};
uint64_t d2s_expected[] = {0x7fd55551, 0x7feaaaa8};
TestFcvtHelper(config, kHRegSize, kSRegSize, h_inputs, h2s_expected);
TestFcvtHelper(config, kSRegSize, kHRegSize, s_inputs, s2h_expected);
TestFcvtHelper(config, kHRegSize, kDRegSize, h_inputs, h2d_expected);
TestFcvtHelper(config, kDRegSize, kHRegSize, d_inputs, d2h_expected);
TestFcvtHelper(config, kSRegSize, kDRegSize, s_inputs, s2d_expected);
TestFcvtHelper(config, kDRegSize, kSRegSize, d_inputs, d2s_expected);
}
template <size_t N, typename T>
static void TestFrecpxHelper(Test* config,
int lane_size_in_bits,
T (&zn_inputs)[N],
const T (&zd_expected)[N]) {
TestFPUnaryPredicatedHelper(config,
lane_size_in_bits,
lane_size_in_bits,
zn_inputs,
zd_expected,
&MacroAssembler::Frecpx, // Merging form.
&MacroAssembler::Frecpx); // Zeroing form.
}
TEST_SVE(sve_frecpx_h) {
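// Frecpx preserves the sign, clears the fraction and bitwise-inverts the
// exponent field (zero and subnormal inputs produce the largest normal
// exponent). For example, 0x3c01 (exponent 15) maps to 0x4000 (exponent 16,
// i.e. ~15 in five bits).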
uint64_t zn_inputs[] = {Float16ToRawbits(kFP16PositiveInfinity),
Float16ToRawbits(kFP16NegativeInfinity),
Float16ToRawbits(Float16(0.0)),
Float16ToRawbits(Float16(-0.0)),
0x0001, // Smallest positive subnormal number.
0x03ff, // Largest subnormal number.
0x0400, // Smallest positive normal number.
0x7bff, // Largest normal number.
0x3bff, // Largest number less than one.
0x3c01, // Smallest number larger than one.
0x7c22, // Signalling NaN.
0x7e55}; // Quiet NaN.
uint64_t zd_expected[] = {0,
0x8000,
0x7800,
0xf800,
// The exponent of subnormal numbers is zero.
0x7800,
0x7800,
0x7800,
0x0400,
0x4400,
0x4000,
0x7e22, // To quiet NaN.
0x7e55};
TestFrecpxHelper(config, kHRegSize, zn_inputs, zd_expected);
}
TEST_SVE(sve_frecpx_s) {
uint64_t zn_inputs[] = {FloatToRawbits(kFP32PositiveInfinity),
FloatToRawbits(kFP32NegativeInfinity),
FloatToRawbits(65504), // Max half precision.
FloatToRawbits(6.10352e-5), // Min positive normal.
FloatToRawbits(6.09756e-5), // Max subnormal.
FloatToRawbits(
5.96046e-8), // Min positive subnormal.
FloatToRawbits(5e-9), // Not representable -> zero.
FloatToRawbits(-0.0),
FloatToRawbits(0.0),
0x7f952222, // Signalling NaN.
0x7fea2222}; // Quiet NaN.
uint64_t zd_expected[] = {0, // 0.0
0x80000000, // -0.0
0x38800000, // 6.10352e-05
0x47000000, // 32768
0x47800000, // 65536
0x4c800000, // 6.71089e+07
0x4e000000, // 5.36871e+08
0xff000000, // -1.70141e+38
0x7f000000, // 1.70141e+38
0x7fd52222,
0x7fea2222};
TestFrecpxHelper(config, kSRegSize, zn_inputs, zd_expected);
}
TEST_SVE(sve_frecpx_d) {
uint64_t zn_inputs[] = {DoubleToRawbits(kFP64PositiveInfinity),
DoubleToRawbits(kFP64NegativeInfinity),
DoubleToRawbits(65504), // Max half precision.
DoubleToRawbits(6.10352e-5), // Min positive normal.
DoubleToRawbits(6.09756e-5), // Max subnormal.
DoubleToRawbits(
5.96046e-8), // Min positive subnormal.
DoubleToRawbits(5e-9), // Not representable -> zero.
DoubleToRawbits(-0.0),
DoubleToRawbits(0.0),
0x7ff5555511111111, // Signalling NaN.
0x7ffaaaaa11111111}; // Quiet NaN.
uint64_t zd_expected[] = {0, // 0.0
0x8000000000000000, // -0.0
0x3f10000000000000, // 6.10352e-05
0x40e0000000000000, // 32768
0x40f0000000000000, // 65536
0x4190000000000000, // 6.71089e+07
0x41c0000000000000, // 5.36871e+08
0xffe0000000000000, // -1.70141e+38
0x7fe0000000000000, // 1.70141e+38
0x7ffd555511111111,
0x7ffaaaaa11111111};
TestFrecpxHelper(config, kDRegSize, zn_inputs, zd_expected);
}
template <size_t N, typename T>
static void TestFsqrtHelper(Test* config,
int lane_size_in_bits,
T (&zn_inputs)[N],
const T (&zd_expected)[N]) {
TestFPUnaryPredicatedHelper(config,
lane_size_in_bits,
lane_size_in_bits,
zn_inputs,
zd_expected,
&MacroAssembler::Fsqrt, // Merging form.
&MacroAssembler::Fsqrt); // Zeroing form.
}
TEST_SVE(sve_fsqrt_h) {
uint64_t zn_inputs[] =
{Float16ToRawbits(Float16(0.0)),
Float16ToRawbits(Float16(-0.0)),
Float16ToRawbits(Float16(1.0)),
Float16ToRawbits(Float16(65025.0)),
Float16ToRawbits(kFP16PositiveInfinity),
Float16ToRawbits(kFP16NegativeInfinity),
Float16ToRawbits(Float16(6.10352e-5)), // Min normal positive.
Float16ToRawbits(Float16(65504.0)), // Max normal positive float.
Float16ToRawbits(Float16(6.09756e-5)), // Max subnormal.
Float16ToRawbits(Float16(5.96046e-8)), // Min subnormal positive.
0x7c22, // Signalling NaN.
0x7e55}; // Quiet NaN.
uint64_t zd_expected[] = {Float16ToRawbits(Float16(0.0)),
Float16ToRawbits(Float16(-0.0)),
Float16ToRawbits(Float16(1.0)),
Float16ToRawbits(Float16(255.0)),
Float16ToRawbits(kFP16PositiveInfinity),
Float16ToRawbits(kFP16DefaultNaN),
0x2000,
0x5bff,
0x1fff,
0x0c00,
0x7e22, // To quiet NaN.
0x7e55};
TestFsqrtHelper(config, kHRegSize, zn_inputs, zd_expected);
}
TEST_SVE(sve_fsqrt_s) {
uint64_t zn_inputs[] = {FloatToRawbits(0.0f),
FloatToRawbits(-0.0f),
FloatToRawbits(1.0f),
FloatToRawbits(65536.0f),
FloatToRawbits(kFP32PositiveInfinity),
FloatToRawbits(kFP32NegativeInfinity),
0x00800000, // Min normal positive, ~1.17e-38
0x7f7fffff, // Max normal positive, ~3.40e+38
0x00000001, // Min subnormal positive, ~1.40e-45
0x007fffff, // Max subnormal, ~1.17e-38
0x7f951111, // Signalling NaN.
0x7fea1111}; // Quiet NaN.
uint64_t zd_expected[] = {FloatToRawbits(0.0f),
FloatToRawbits(-0.0f),
FloatToRawbits(1.0f),
FloatToRawbits(256.0f),
FloatToRawbits(kFP32PositiveInfinity),
FloatToRawbits(kFP32DefaultNaN),
0x20000000, // ~1.08e-19
0x5f7fffff, // ~1.84e+19
0x1a3504f3, // ~3.74e-23
0x1fffffff, // ~1.08e-19
0x7fd51111, // To quiet NaN.
0x7fea1111};
TestFsqrtHelper(config, kSRegSize, zn_inputs, zd_expected);
}
TEST_SVE(sve_fsqrt_d) {
uint64_t zn_inputs[] =
{DoubleToRawbits(0.0),
DoubleToRawbits(-0.0),
DoubleToRawbits(1.0),
DoubleToRawbits(65536.0),
DoubleToRawbits(kFP64PositiveInfinity),
DoubleToRawbits(kFP64NegativeInfinity),
0x0010000000000000, // Min normal positive, ~2.22e-308
0x7fefffffffffffff, // Max normal positive, ~1.79e+308
0x0000000000000001, // Min subnormal positive, 5e-324
0x000fffffffffffff, // Max subnormal, ~2.22e-308
0x7ff5555511111111, // Signalling NaN.
0x7ffaaaaa11111111}; // Quiet NaN.
uint64_t zd_expected[] = {DoubleToRawbits(0.0),
DoubleToRawbits(-0.0),
DoubleToRawbits(1.0),
DoubleToRawbits(256.0),
DoubleToRawbits(kFP64PositiveInfinity),
DoubleToRawbits(kFP64DefaultNaN),
0x2000000000000000, // ~1.49e-154
0x5fefffffffffffff, // ~1.34e+154
0x1e60000000000000, // ~2.22e-162
0x1fffffffffffffff, // ~1.49e-154
0x7ffd555511111111, // To quiet NaN.
0x7ffaaaaa11111111};
TestFsqrtHelper(config, kDRegSize, zn_inputs, zd_expected);
}
TEST_SVE(sve_adr) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
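// Adr computes one address per lane: zd[i] = zn[i] + (extend(zm[i]) << shift).
// For example, with a base lane of 0x10000000f0000000 and an offset lane of 1,
// the unshifted D-lane form produces 0x10000000f0000001.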
__ Index(z0.VnD(), 0x10000000f0000000, 0x1000);
__ Index(z1.VnD(), 1, 3);
__ Index(z2.VnS(), -1, -1);
__ Adr(z3.VnD(), SVEMemOperand(z0.VnD(), z1.VnD()));
__ Adr(z4.VnD(), SVEMemOperand(z0.VnD(), z1.VnD(), LSL, 1));
__ Adr(z5.VnD(), SVEMemOperand(z0.VnD(), z1.VnD(), LSL, 2));
__ Adr(z6.VnD(), SVEMemOperand(z0.VnD(), z1.VnD(), LSL, 3));
__ Adr(z7.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), UXTW));
__ Adr(z8.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), UXTW, 1));
__ Adr(z9.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), UXTW, 2));
__ Adr(z10.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), UXTW, 3));
__ Adr(z11.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), SXTW));
__ Adr(z12.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), SXTW, 1));
__ Adr(z13.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), SXTW, 2));
__ Adr(z14.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), SXTW, 3));
__ Adr(z15.VnS(), SVEMemOperand(z0.VnS(), z2.VnS()));
__ Adr(z16.VnS(), SVEMemOperand(z0.VnS(), z2.VnS(), LSL, 1));
__ Adr(z17.VnS(), SVEMemOperand(z0.VnS(), z2.VnS(), LSL, 2));
__ Adr(z18.VnS(), SVEMemOperand(z0.VnS(), z2.VnS(), LSL, 3));
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z3[] = {0x10000000f0001004, 0x10000000f0000001};
uint64_t expected_z4[] = {0x10000000f0001008, 0x10000000f0000002};
uint64_t expected_z5[] = {0x10000000f0001010, 0x10000000f0000004};
uint64_t expected_z6[] = {0x10000000f0001020, 0x10000000f0000008};
uint64_t expected_z7[] = {0x10000001f0000ffd, 0x10000001efffffff};
uint64_t expected_z8[] = {0x10000002f0000ffa, 0x10000002effffffe};
uint64_t expected_z9[] = {0x10000004f0000ff4, 0x10000004effffffc};
uint64_t expected_z10[] = {0x10000008f0000fe8, 0x10000008effffff8};
uint64_t expected_z11[] = {0x10000000f0000ffd, 0x10000000efffffff};
uint64_t expected_z12[] = {0x10000000f0000ffa, 0x10000000effffffe};
uint64_t expected_z13[] = {0x10000000f0000ff4, 0x10000000effffffc};
uint64_t expected_z14[] = {0x10000000f0000fe8, 0x10000000effffff8};
uint64_t expected_z15[] = {0x0ffffffcf0000ffd, 0x0ffffffeefffffff};
uint64_t expected_z16[] = {0x0ffffff8f0000ffa, 0x0ffffffceffffffe};
uint64_t expected_z17[] = {0x0ffffff0f0000ff4, 0x0ffffff8effffffc};
uint64_t expected_z18[] = {0x0fffffe0f0000fe8, 0x0ffffff0effffff8};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
ASSERT_EQUAL_SVE(expected_z15, z15.VnD());
ASSERT_EQUAL_SVE(expected_z16, z16.VnD());
ASSERT_EQUAL_SVE(expected_z17, z17.VnD());
ASSERT_EQUAL_SVE(expected_z18, z18.VnD());
}
}
// Test load-and-broadcast instructions by comparing their results with those
// of a set of equivalent scalar loads.
template <typename F>
static void LoadBcastHelper(Test* config,
unsigned msize_in_bits,
unsigned esize_in_bits,
F sve_ld1,
bool is_signed) {
VIXL_ASSERT((esize_in_bits == kBRegSize) || (esize_in_bits == kHRegSize) ||
(esize_in_bits == kSRegSize) || (esize_in_bits == kDRegSize));
static const unsigned kMaxLaneCount = kZRegMaxSize / kBRegSize;
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
unsigned msize_in_bytes = msize_in_bits / kBitsPerByte;
unsigned esize_in_bytes = esize_in_bits / kBitsPerByte;
int vl = config->sve_vl_in_bytes();
uint64_t offsets[kMaxLaneCount];
uint64_t buffer_size = vl * 64;
uint64_t data = reinterpret_cast<uintptr_t>(malloc(buffer_size));
BufferFillingHelper(data,
buffer_size,
msize_in_bytes,
kMaxLaneCount,
offsets);
for (unsigned i = 0; i < (kMaxLaneCount / 2); i++) {
// Assign encodable offsets to the first half of the offset array so that
// both encodable and unencodable offsets are tested. Note that the
// immediate offset is encoded in a six-bit field, scaled by the memory
// access size.
offsets[i] = (offsets[i] % (UINT64_C(1) << 6)) * msize_in_bytes;
}
ZRegister zn = z0.WithLaneSize(esize_in_bits);
ZRegister zn_ref = z4.WithLaneSize(esize_in_bits);
PRegisterZ pg = p0.Zeroing();
Initialise(&masm,
pg,
0x9abcdef012345678,
0xabcdef0123456789,
0xf4f3f1f0fefdfcfa,
0xf9f8f6f5f3f2f0ff);
__ Mov(x2, data);
uint64_t enablable_offset = offsets[0];
// Simple check that the operation is correct for a single offset.
(masm.*sve_ld1)(zn, pg, SVEMemOperand(x2, enablable_offset));
// Generate a reference result using scalar loads.
uint64_t address = data + enablable_offset;
uint64_t duplicated_addresses[kMaxLaneCount];
for (unsigned i = 0; i < kMaxLaneCount; i++) {
duplicated_addresses[i] = address;
}
ScalarLoadHelper(&masm,
vl,
duplicated_addresses,
zn_ref,
pg,
esize_in_bits,
msize_in_bits,
is_signed);
ZRegister zn_agg = z10.WithLaneSize(esize_in_bits);
ZRegister zn_agg_ref = z11.WithLaneSize(esize_in_bits);
ZRegister zn_temp = z12.WithLaneSize(esize_in_bits);
__ Dup(zn_agg, 0);
__ Dup(zn_agg_ref, 0);
// Check that the operation is correct at a variety of offsets: extract one
// lane of each broadcast result with Lastb and collect it into an aggregate
// vector with Insr, alongside a scalar-load reference built the same way.
for (unsigned i = 0; i < (vl / esize_in_bytes); i++) {
(masm.*sve_ld1)(zn_temp, pg, SVEMemOperand(x2, offsets[i]));
__ Lastb(x1, pg, zn_temp);
__ Insr(zn_agg, x1);
__ Mov(x3, data + offsets[i]);
ScalarLoadHelper(&masm, x1, x3, msize_in_bits, is_signed);
__ Insr(zn_agg_ref, x1);
}
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(zn_ref, zn);
ASSERT_EQUAL_SVE(zn_agg_ref, zn_agg);
}
free(reinterpret_cast<void*>(data));
}
TEST_SVE(sve_ld1rb) {
LoadBcastHelper(config, kBRegSize, kBRegSize, &MacroAssembler::Ld1rb, false);
LoadBcastHelper(config, kBRegSize, kHRegSize, &MacroAssembler::Ld1rb, false);
LoadBcastHelper(config, kBRegSize, kSRegSize, &MacroAssembler::Ld1rb, false);
LoadBcastHelper(config, kBRegSize, kDRegSize, &MacroAssembler::Ld1rb, false);
}
TEST_SVE(sve_ld1rh) {
LoadBcastHelper(config, kHRegSize, kHRegSize, &MacroAssembler::Ld1rh, false);
LoadBcastHelper(config, kHRegSize, kSRegSize, &MacroAssembler::Ld1rh, false);
LoadBcastHelper(config, kHRegSize, kDRegSize, &MacroAssembler::Ld1rh, false);
}
TEST_SVE(sve_ld1rw) {
LoadBcastHelper(config, kSRegSize, kSRegSize, &MacroAssembler::Ld1rw, false);
LoadBcastHelper(config, kSRegSize, kDRegSize, &MacroAssembler::Ld1rw, false);
}
TEST_SVE(sve_ld1rd) {
LoadBcastHelper(config, kDRegSize, kDRegSize, &MacroAssembler::Ld1rd, false);
}
TEST_SVE(sve_ld1rsb) {
LoadBcastHelper(config, kBRegSize, kHRegSize, &MacroAssembler::Ld1rsb, true);
LoadBcastHelper(config, kBRegSize, kSRegSize, &MacroAssembler::Ld1rsb, true);
LoadBcastHelper(config, kBRegSize, kDRegSize, &MacroAssembler::Ld1rsb, true);
}
TEST_SVE(sve_ld1rsh) {
LoadBcastHelper(config, kHRegSize, kSRegSize, &MacroAssembler::Ld1rsh, true);
LoadBcastHelper(config, kHRegSize, kDRegSize, &MacroAssembler::Ld1rsh, true);
}
TEST_SVE(sve_ld1rsw) {
LoadBcastHelper(config, kSRegSize, kDRegSize, &MacroAssembler::Ld1rsw, true);
}
TEST_SVE(sve_prefetch_offset) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
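// Prefetch instructions have no architecturally visible effect, so there is
// nothing to assert; this test simply checks that every addressing form can
// be encoded and executed.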
__ Prfb(PLDL1KEEP, p5, SVEMemOperand(z30.VnS(), 0));
__ Prfb(PLDL1STRM, p5, SVEMemOperand(x28, -11, SVE_MUL_VL));
__ Prfb(PLDL2KEEP, p6, SVEMemOperand(x30, x29));
__ Prfb(PLDL2STRM, p6, SVEMemOperand(x7, z12.VnS(), UXTW));
__ Prfh(PSTL2KEEP, p6, SVEMemOperand(z0.VnS(), 28));
__ Prfh(PSTL2STRM, p4, SVEMemOperand(x17, -3, SVE_MUL_VL));
__ Prfh(PSTL3KEEP, p3, SVEMemOperand(x0, x0, LSL, 1));
__ Prfh(PSTL3STRM, p4, SVEMemOperand(x20, z0.VnD(), LSL, 1));
__ Prfw(PLDL1KEEP, p3, SVEMemOperand(z23.VnD(), 5));
__ Prfw(PLDL1STRM, p1, SVEMemOperand(x4, 10, SVE_MUL_VL));
__ Prfw(PLDL2KEEP, p2, SVEMemOperand(x22, x22, LSL, 2));
__ Prfw(PLDL2STRM, p1, SVEMemOperand(x2, z6.VnS(), SXTW, 2));
__ Prfd(PLDL3KEEP, p5, SVEMemOperand(z11.VnD(), 9));
__ Prfd(PLDL3STRM, p3, SVEMemOperand(x0, -24, SVE_MUL_VL));
__ Prfd(PSTL1KEEP, p7, SVEMemOperand(x5, x5, LSL, 3));
__ Prfd(PSTL1STRM, p1, SVEMemOperand(x19, z18.VnS(), SXTW, 3));
END();
if (CAN_RUN()) {
RUN();
}
}
TEST_SVE(sve2_match_nmatch) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
START();
__ Ptrue(p0.VnB());
__ Ptrue(p1.VnH());
__ Ptrue(p2.VnS());
// Vector to search is bytes 0 - 7, repeating every eight bytes.
__ Index(z0.VnB(), 0, 1);
__ Dup(z0.VnD(), z0.VnD(), 0);
// Elements to find are (repeated) bytes 0 - 3 in the first segment, 4 - 7
// in the second, 8 - 11 in the third, etc.
__ Index(z1.VnB(), 0, 1);
__ Lsr(z1.VnB(), z1.VnB(), 2);
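// Match sets a predicate lane when the corresponding element of the first
// source occurs anywhere within the same 128-bit segment of the second
// source; Nmatch is its inverse.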
__ Match(p3.VnB(), p0.Zeroing(), z0.VnB(), z1.VnB());
__ Match(p4.VnB(), p1.Zeroing(), z0.VnB(), z1.VnB());
__ Nmatch(p0.VnB(), p0.Zeroing(), z0.VnB(), z1.VnB());
__ Uunpklo(z0.VnH(), z0.VnB());
__ Uunpklo(z1.VnH(), z1.VnB());
__ Match(p5.VnH(), p1.Zeroing(), z0.VnH(), z1.VnH());
__ Match(p6.VnH(), p2.Zeroing(), z0.VnH(), z1.VnH());
__ Nmatch(p1.VnH(), p1.Zeroing(), z0.VnH(), z1.VnH());
END();
if (CAN_RUN()) {
RUN();
int p3_exp[] = {1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0,
0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1};
ASSERT_EQUAL_SVE(p3_exp, p3.VnB());
int p4_exp[] = {0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1};
ASSERT_EQUAL_SVE(p4_exp, p4.VnB());
int p0_exp[] = {0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1,
1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0};
ASSERT_EQUAL_SVE(p0_exp, p0.VnB());
int p5_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1};
ASSERT_EQUAL_SVE(p5_exp, p5.VnB());
int p6_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
ASSERT_EQUAL_SVE(p6_exp, p6.VnB());
int p1_exp[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0};
ASSERT_EQUAL_SVE(p1_exp, p1.VnB());
}
}
TEST_SVE(sve2_saba_uaba) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
START();
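// Saba and Uaba accumulate the signed/unsigned absolute difference of their
// sources: zd = zd + |zn - zm|. For lane 1 below: 1 + |1 - 0xff| = 0xff.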
__ Index(z0.VnB(), 0, 1);
__ Dup(z1.VnB(), 0xff);
__ Dup(z2.VnB(), 1);
__ Uaba(z2.VnB(), z2.VnB(), z0.VnB(), z1.VnB());
__ Index(z0.VnB(), 0, -1);
__ Index(z3.VnH(), 0, 1);
__ Index(z4.VnH(), 1, 1);
__ Uaba(z3.VnH(), z3.VnH(), z3.VnH(), z4.VnH());
__ Index(z5.VnS(), 3, 6);
__ Index(z6.VnS(), 5, 6);
__ Uaba(z5.VnS(), z5.VnS(), z5.VnS(), z6.VnS());
__ Index(z7.VnD(), 424, 12);
__ Index(z8.VnD(), 4242, 12);
__ Uaba(z7.VnD(), z7.VnD(), z7.VnD(), z8.VnD());
__ Index(z9.VnH(), -1, -1);
__ Dup(z10.VnB(), 0);
__ Saba(z10.VnB(), z10.VnB(), z9.VnB(), z10.VnB());
__ Index(z11.VnH(), 0x0101, 1);
__ Index(z12.VnH(), 0, 1);
__ Index(z13.VnH(), 0, -1);
__ Saba(z13.VnH(), z13.VnH(), z12.VnH(), z13.VnH());
__ Index(z14.VnS(), 0, 2);
__ Index(z15.VnS(), 0, -2);
__ Saba(z15.VnS(), z15.VnS(), z14.VnS(), z15.VnS());
__ Index(z16.VnD(), 0, 42);
__ Index(z17.VnD(), 0, -42);
__ Saba(z17.VnD(), z17.VnD(), z16.VnD(), z17.VnD());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z0, z2);
ASSERT_EQUAL_SVE(z3, z4);
ASSERT_EQUAL_SVE(z5, z6);
ASSERT_EQUAL_SVE(z7, z8);
ASSERT_EQUAL_SVE(z10, z11);
ASSERT_EQUAL_SVE(z12, z13);
ASSERT_EQUAL_SVE(z14, z15);
ASSERT_EQUAL_SVE(z16, z17);
}
}
TEST_SVE(sve2_integer_multiply_long_vector) {
// This test only checks Sqdmull[b|t] and Pmull[b|t]; the other instructions
// in the group operate on their elements in the same way.
int32_t zn_inputs_s[] =
{1, -2, 3, -4, 5, -6, 7, -8, INT32_MIN, INT32_MAX, INT32_MAX, INT32_MIN};
int32_t zm_inputs_s[] =
{1, 2, 3, 4, 5, 6, 7, 8, INT32_MAX, INT32_MIN, INT32_MAX, INT32_MIN};
int64_t sqdmullb_vec_expected_d[] =
{-8, -32, -72, -128, RawbitsToInt64(0x8000000100000000), INT64_MAX};
uint64_t sqdmullt_vec_expected_d[] =
{2, 18, 50, 98, 0x8000000100000000, 0x7ffffffe00000002};
uint64_t pmullb_vec_expected_d[] = {0x00000001fffffffc,
0x00000003fffffff0,
0x000000020000001c,
0x00000007ffffffc0,
0x3fffffff80000000,
0x4000000000000000};
uint64_t pmullt_vec_expected_d[] = {0x05,
0x11,
0x15,
0x3fffffff80000000,
0x1555555555555555};
uint64_t sqdmullb_idx_expected_d[] = {0xfffffffffffffff8,
0xfffffffffffffff0,
0xffffffffffffffb8,
0xffffffffffffffa0,
0x8000000100000000,
INT64_MAX};
uint64_t sqdmullt_idx_expected_d[] =
{8, // 2 * zn[11] * zm[8] = 2 * 4 * 1
24, // 2 * zn[9] * zm[8] = 2 * 4 * 3
80, // 2 * zn[7] * zm[4] = 2 * 8 * 5
112, // 2 * zn[5] * zm[4] = 2 * 8 * 7
0x7fffffffffffffff, // 2 * zn[3] * zm[0]
0x8000000100000000}; // 2 * zn[1] * zm[0]
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
START();
InsrHelper(&masm, z31.VnS(), zn_inputs_s);
InsrHelper(&masm, z30.VnS(), zm_inputs_s);
__ Sqdmullb(z1.VnD(), z31.VnS(), z30.VnS());
__ Sqdmullt(z2.VnD(), z31.VnS(), z30.VnS());
__ Pmullb(z3.VnD(), z31.VnS(), z30.VnS());
__ Pmullt(z4.VnD(), z31.VnS(), z30.VnS());
__ Mov(z7, z30);
__ Mov(z8, z31);
__ Sqdmullb(z5.VnD(), z8.VnS(), z7.VnS(), 2);
__ Sqdmullt(z6.VnD(), z8.VnS(), z7.VnS(), 0);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(sqdmullb_vec_expected_d, z1.VnD());
ASSERT_EQUAL_SVE(sqdmullt_vec_expected_d, z2.VnD());
ASSERT_EQUAL_SVE(pmullb_vec_expected_d, z3.VnD());
ASSERT_EQUAL_SVE(pmullt_vec_expected_d, z4.VnD());
ASSERT_EQUAL_SVE(sqdmullb_idx_expected_d, z5.VnD());
ASSERT_EQUAL_SVE(sqdmullt_idx_expected_d, z6.VnD());
}
}
TEST_SVE(sve2_integer_multiply_add_long_vector) {
int32_t zn_inputs_s[] =
{1, -2, 3, -4, 5, -6, 7, -8, INT32_MIN, INT32_MAX, INT32_MAX, INT32_MIN};
int32_t zm_inputs_s[] =
{1, 2, 3, 4, 5, 6, 7, 8, INT32_MAX, INT32_MIN, INT32_MAX, INT32_MIN};
int64_t sqdmlalb_vec_expected_d[] =
{-3, -28, -69, -126, RawbitsToInt64(0x8000000100000001), INT64_MAX};
int64_t sqdmlalt_vec_expected_d[] = {-3,
14,
47,
96,
RawbitsToInt64(0x80000000ffffffff),
static_cast<int64_t>(
0x7ffffffe00000002)};
int64_t sqdmlalb_idx_expected_d[] =
{-11, // za.d[5] + 2 * zn.s[10] * zm.s[8] = 5 + 2 * -2 * 4
-28, // za.d[4] + 2 * zn.s[8] * zm.s[8] = 4 + 2 * -4 * 4
-93, // za.d[3] + 2 * zn.s[6] * zm.s[4] = 3 + 2 * -6 * 8
-126, // za.d[2] + 2 * zn.s[4] * zm.s[4] = 2 + 2 * -8 * 8
RawbitsToInt64(0x8000000100000001),
INT64_MAX};
int64_t sqdmlalt_idx_expected_d[] =
{1, // za.d[5] + 2 * zn.s[11] * zm.s[9] = -5 + 2 * 1 * 3
14, // za.d[4] + 2 * zn.s[9] * zm.s[9] = -4 + 2 * 3 * 3
67, // za.d[3] + 2 * zn.s[7] * zm.s[5] = -3 + 2 * 5 * 7
96, // za.d[2] + 2 * zn.s[5] * zm.s[5] = -2 + 2 * 7 * 7
RawbitsToInt64(0x80000000ffffffff),
static_cast<int64_t>(0x7ffffffe00000002)};
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
START();
InsrHelper(&masm, z0.VnS(), zn_inputs_s);
InsrHelper(&masm, z1.VnS(), zm_inputs_s);
__ Index(z2.VnD(), 0, 1);
__ Index(z3.VnD(), 0, -1);
__ Mov(z31, z2);
__ Sqdmlalb(z31.VnD(), z31.VnD(), z0.VnS(), z1.VnS());
__ Mov(z30, z3);
__ Sqdmlalt(z30.VnD(), z30.VnD(), z0.VnS(), z1.VnS());
__ Mov(z29, z31);
__ Sqdmlslb(z29.VnD(), z29.VnD(), z0.VnS(), z1.VnS());
__ Mov(z28, z30);
__ Sqdmlslt(z28.VnD(), z28.VnD(), z0.VnS(), z1.VnS());
__ Sqdmlalb(z27.VnD(), z2.VnD(), z0.VnS(), z1.VnS());
__ Sqdmlalt(z26.VnD(), z3.VnD(), z0.VnS(), z1.VnS());
__ Sqdmlslb(z25.VnD(), z27.VnD(), z0.VnS(), z1.VnS());
__ Sqdmlslt(z24.VnD(), z26.VnD(), z0.VnS(), z1.VnS());
__ Mov(z23, z2);
__ Sqdmlalb(z23.VnD(), z23.VnD(), z0.VnS(), z1.VnS(), 0);
__ Mov(z22, z3);
__ Sqdmlalt(z22.VnD(), z22.VnD(), z0.VnS(), z1.VnS(), 1);
__ Mov(z21, z23);
__ Sqdmlslb(z21.VnD(), z21.VnD(), z0.VnS(), z1.VnS(), 0);
__ Mov(z20, z22);
__ Sqdmlslt(z20.VnD(), z20.VnD(), z0.VnS(), z1.VnS(), 1);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(sqdmlalb_vec_expected_d, z31.VnD());
ASSERT_EQUAL_SVE(sqdmlalt_vec_expected_d, z30.VnD());
ASSERT_EQUAL_SVE(z2, z29);
ASSERT_EQUAL_SVE(z3, z28);
ASSERT_EQUAL_SVE(z31, z27);
ASSERT_EQUAL_SVE(z30, z26);
ASSERT_EQUAL_SVE(z29, z25);
ASSERT_EQUAL_SVE(z28, z24);
ASSERT_EQUAL_SVE(sqdmlalb_idx_expected_d, z23.VnD());
ASSERT_EQUAL_SVE(sqdmlalt_idx_expected_d, z22.VnD());
ASSERT_EQUAL_SVE(z2, z21);
ASSERT_EQUAL_SVE(z3, z20);
}
}
TEST_SVE(sve2_ldnt1) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
START();
int data_size = kZRegMaxSizeInBytes * 4;
uint8_t* data = new uint8_t[data_size];
for (int i = 0; i < data_size; i++) {
data[i] = i & 0xff;
}
// Set the base half-way through the buffer so we can use negative indices.
__ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
__ Index(z30.VnD(), x0, 1);
__ Ptrue(p0.VnB());
__ Punpklo(p1.VnH(), p0.VnB());
__ Punpklo(p2.VnH(), p1.VnB());
__ Punpklo(p3.VnH(), p2.VnB());
__ Punpklo(p4.VnH(), p3.VnB());
__ Mov(x1, 1);
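// The SVE2 non-temporal gather form takes a vector of addresses plus a
// scalar offset ([z30.d, x1]); each result is compared with the equivalent
// scalar-plus-vector gather load.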
__ Ldnt1b(z0.VnD(), p1.Zeroing(), SVEMemOperand(z30.VnD(), x1));
__ Ld1b(z1.VnD(), p1.Zeroing(), SVEMemOperand(x1, z30.VnD()));
__ Mov(x1, -4);
__ Ldnt1h(z2.VnD(), p2.Zeroing(), SVEMemOperand(z30.VnD(), x1));
__ Ld1h(z3.VnD(), p2.Zeroing(), SVEMemOperand(x1, z30.VnD()));
__ Mov(x1, 16);
__ Ldnt1w(z4.VnD(), p3.Zeroing(), SVEMemOperand(z30.VnD(), x1));
__ Ld1w(z5.VnD(), p3.Zeroing(), SVEMemOperand(x1, z30.VnD()));
__ Mov(x1, -16);
__ Ldnt1d(z6.VnD(), p4.Zeroing(), SVEMemOperand(z30.VnD(), x1));
__ Ld1d(z7.VnD(), p4.Zeroing(), SVEMemOperand(x1, z30.VnD()));
__ Mov(x1, 1);
__ Ldnt1sb(z8.VnD(), p0.Zeroing(), SVEMemOperand(z30.VnD(), x1));
__ Ld1sb(z9.VnD(), p0.Zeroing(), SVEMemOperand(x1, z30.VnD()));
__ Mov(x1, -4);
__ Ldnt1sh(z10.VnD(), p2.Zeroing(), SVEMemOperand(z30.VnD(), x1));
__ Ld1sh(z11.VnD(), p2.Zeroing(), SVEMemOperand(x1, z30.VnD()));
__ Mov(x1, 16);
__ Ldnt1sw(z12.VnD(), p3.Zeroing(), SVEMemOperand(z30.VnD(), x1));
__ Ld1sw(z13.VnD(), p3.Zeroing(), SVEMemOperand(x1, z30.VnD()));
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z0, z1);
ASSERT_EQUAL_SVE(z2, z3);
ASSERT_EQUAL_SVE(z4, z5);
ASSERT_EQUAL_SVE(z6, z7);
ASSERT_EQUAL_SVE(z8, z9);
ASSERT_EQUAL_SVE(z10, z11);
ASSERT_EQUAL_SVE(z12, z13);
}
delete[] data;
}
TEST_SVE(sve2_stnt1) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
START();
int data_size = kZRegMaxSizeInBytes * 4;
uint8_t* data = new uint8_t[data_size];
// Set the base half-way through the buffer so we can use negative indices.
__ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
__ Ptrue(p0.VnB());
__ Punpklo(p1.VnH(), p0.VnB());
__ Punpklo(p2.VnH(), p1.VnB());
__ Punpklo(p3.VnH(), p2.VnB());
__ Punpklo(p4.VnH(), p3.VnB());
__ Dup(z0.VnB(), 0xaa);
__ Dup(z1.VnB(), 0x55);
__ Rdvl(x1, 1);
__ Mov(x3, 0);
// Put store addresses into z30, and a small offset in x4.
__ Index(z30.VnD(), x0, 1);
__ Mov(x4, 2);
// Store an entire vector of 0xaa to the buffer, then a smaller scatter store
// of 0x55 using Stnt1b.
__ St1b(z0.VnB(), p0, SVEMemOperand(x0, x4));
__ Stnt1b(z1.VnD(), p0, SVEMemOperand(z30.VnD(), x4));
// Load the entire vector back from the buffer.
__ Ld1b(z2.VnB(), p0.Zeroing(), SVEMemOperand(x0, x4));
// Construct a predicate that reflects the number of bytes stored by Stnt1b,
// based on the current VL, and use Sel to obtain a reference vector for
// comparison.
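// x2 = VL / 8: with .D addressing, Stnt1b stores one byte per lane.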
__ Lsr(x2, x1, 3);
__ Whilelo(p5.VnB(), x3, x2);
__ Sel(z3.VnB(), p5.Merging(), z1.VnB(), z0.VnB());
// Repeat for larger element sizes.
__ Mov(x4, -4);
__ Index(z30.VnD(), x0, 2);
__ St1b(z0.VnB(), p0, SVEMemOperand(x0, x4));
__ Stnt1h(z1.VnD(), p0, SVEMemOperand(z30.VnD(), x4));
__ Ld1b(z4.VnB(), p0.Zeroing(), SVEMemOperand(x0, x4));
__ Lsr(x2, x1, 2);
__ Whilelo(p5.VnB(), x3, x2);
__ Sel(z5.VnB(), p5.Merging(), z1.VnB(), z0.VnB());
__ Mov(x4, 16);
__ Index(z30.VnD(), x0, 4);
__ St1b(z0.VnB(), p0, SVEMemOperand(x0, x4));
__ Stnt1w(z1.VnD(), p0, SVEMemOperand(z30.VnD(), x4));
__ Ld1b(z6.VnB(), p0.Zeroing(), SVEMemOperand(x0, x4));
__ Lsr(x2, x1, 1);
__ Whilelo(p5.VnB(), x3, x2);
__ Sel(z7.VnB(), p5.Merging(), z1.VnB(), z0.VnB());
__ Mov(x4, -16);
__ Index(z30.VnD(), x0, 8);
__ St1b(z0.VnB(), p0, SVEMemOperand(x0, x4));
__ Stnt1d(z1.VnD(), p0, SVEMemOperand(z30.VnD(), x4));
__ Ld1b(z8.VnB(), p0.Zeroing(), SVEMemOperand(x0, x4));
__ Whilelo(p5.VnB(), x3, x1);
__ Sel(z9.VnB(), p5.Merging(), z1.VnB(), z0.VnB());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z2, z3);
ASSERT_EQUAL_SVE(z4, z5);
ASSERT_EQUAL_SVE(z6, z7);
ASSERT_EQUAL_SVE(z8, z9);
}
delete[] data;
}
TEST_SVE(sve2_while_simple) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
START();
__ Mov(x0, 1);
__ Mov(x1, 0);
__ Mov(x2, 3);
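// The "while decrementing" forms set lanes for as long as the first
// operand, decremented once per element, remains above (Whilehi/Whilegt) or
// above or equal to (Whilehs/Whilege) the second, using unsigned (hi/hs) or
// signed (gt/ge) comparison. The comparison can wrap: Whilehs with x0 = 1
// and x1 = 0 below produces an all-true predicate.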
__ Whilehi(p0.VnB(), x0, x1);
__ Whilehs(p1.VnB(), x0, x1);
__ Whilehi(p2.VnB(), x2, x1);
__ Whilehs(p3.VnB(), x2, x1);
__ Whilehi(p4.VnB(), x2, x0);
__ Whilehs(p5.VnB(), x2, x0);
__ Whilegt(p6.VnB(), x0, x1);
__ Whilege(p7.VnB(), x0, x1);
__ Whilegt(p8.VnB(), x2, x1);
__ Whilege(p9.VnB(), x2, x1);
__ Whilegt(p10.VnB(), x2, x0);
__ Whilege(p11.VnB(), x2, x0);
__ Mov(x4, 0x80000000);
__ Mov(x5, 0x80000001);
__ Whilege(p12.VnB(), w5, w4);
__ Whilegt(p13.VnB(), w5, w4);
__ Mov(x6, 0x8000000000000000);
__ Mov(x7, 0x8000000000000001);
__ Whilege(p14.VnB(), x7, x6);
__ Whilegt(p15.VnB(), x7, x6);
for (int i = 0; i < 16; i++) {
__ Rev(PRegister(i).VnB(), PRegister(i).VnB());
}
END();
if (CAN_RUN()) {
RUN();
int p0_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
int p1_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
int p2_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
int p3_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
int p4_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1};
int p5_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
int p6_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
int p7_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1};
int p8_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
int p9_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1};
int p10_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1};
int p11_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
int p12_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
int p13_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
int p14_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
int p15_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
ASSERT_EQUAL_SVE(p0_exp, p0.VnB());
ASSERT_EQUAL_SVE(p1_exp, p1.VnB());
ASSERT_EQUAL_SVE(p2_exp, p2.VnB());
ASSERT_EQUAL_SVE(p3_exp, p3.VnB());
ASSERT_EQUAL_SVE(p4_exp, p4.VnB());
ASSERT_EQUAL_SVE(p5_exp, p5.VnB());
ASSERT_EQUAL_SVE(p6_exp, p6.VnB());
ASSERT_EQUAL_SVE(p7_exp, p7.VnB());
ASSERT_EQUAL_SVE(p8_exp, p8.VnB());
ASSERT_EQUAL_SVE(p9_exp, p9.VnB());
ASSERT_EQUAL_SVE(p10_exp, p10.VnB());
ASSERT_EQUAL_SVE(p11_exp, p11.VnB());
ASSERT_EQUAL_SVE(p12_exp, p12.VnB());
ASSERT_EQUAL_SVE(p13_exp, p13.VnB());
ASSERT_EQUAL_SVE(p14_exp, p14.VnB());
ASSERT_EQUAL_SVE(p15_exp, p15.VnB());
}
}
TEST_SVE(sve2_whilerw_whilewr_simple) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
START();
__ Mov(x0, 0);
__ Mov(x1, 1);
__ Mov(x2, 3);
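// Whilewr(pd, xn, xm) sets lanes [0, (xm - xn) / esize), the number of
// elements a loop may safely process per iteration without a
// write-after-read hazard, and sets all lanes when the addresses are equal
// or xm is behind xn. Whilerw is similar, but uses the absolute distance
// between the two addresses.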
__ Whilerw(p0.VnB(), x0, x0);
__ Whilerw(p1.VnB(), x0, x1);
__ Whilerw(p2.VnB(), x1, x0);
__ Whilewr(p3.VnB(), x0, x0);
__ Whilewr(p4.VnB(), x0, x1);
__ Whilewr(p5.VnB(), x1, x0);
__ Whilewr(p6.VnH(), x1, x1);
__ Whilewr(p7.VnH(), x1, x2);
__ Whilewr(p8.VnH(), x2, x1);
END();
if (CAN_RUN()) {
RUN();
int p0_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
ASSERT_EQUAL_SVE(p0_exp, p0.VnB());
int p1_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
ASSERT_EQUAL_SVE(p1_exp, p1.VnB());
int p2_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
ASSERT_EQUAL_SVE(p2_exp, p2.VnB());
int p3_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
ASSERT_EQUAL_SVE(p3_exp, p3.VnB());
int p4_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
ASSERT_EQUAL_SVE(p4_exp, p4.VnB());
int p5_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
ASSERT_EQUAL_SVE(p5_exp, p5.VnB());
int p6_exp[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
ASSERT_EQUAL_SVE(p6_exp, p6.VnB());
int p7_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
ASSERT_EQUAL_SVE(p7_exp, p7.VnB());
int p8_exp[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
ASSERT_EQUAL_SVE(p8_exp, p8.VnB());
}
}
TEST_SVE(sve2_sqrdcmlah) {
int32_t zn_inputs[] = {-1, -2, -3, -4, 1, 2, 3, 4};
int32_t zm_inputs[] = {-1, -2, 3, 4, 1, 2, -3, -4};
int32_t za_inputs[] = {1, 2, 3, 4, 5, 6, 7, 8};
int32_t zd_000_expected[] =
{1025, 2050, -6141, -8188, 1029, 2054, -6137, -8184};
int32_t zd_090_expected[] =
{1025, -510, -6141, 4612, 1029, -506, -6137, 4616};
int32_t zd_180_expected[] =
{-1023, -2046, 6147, 8196, -1019, -2042, 6151, 8200};
int32_t zd_270_expected[] =
{-1023, 514, 6147, -4604, -1019, 518, 6151, -4600};
int32_t zd_0_270_expected[] =
{2049, -1534, 6147, -4604, 2053, -1530, 6151, -4600};
int32_t zd_3_090_expected[] =
{1025, -510, 3075, -1532, 1029, -506, 3079, -1528};
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
START();
InsrHelper(&masm, z0.VnS(), zn_inputs);
InsrHelper(&masm, z1.VnS(), zm_inputs);
InsrHelper(&masm, z31.VnS(), za_inputs);
// The input values are small, so shift them left so that they noticeably
// affect the high (returned) half of the result in the destination.
int shift = 20;
__ Lsl(z0.VnS(), z0.VnS(), shift);
__ Lsl(z1.VnS(), z1.VnS(), shift);
__ Mov(z10, z31);
__ Sqrdcmlah(z10.VnS(), z10.VnS(), z0.VnS(), z1.VnS(), 0);
__ Mov(z11, z31);
__ Sqrdcmlah(z11.VnS(), z11.VnS(), z0.VnS(), z1.VnS(), 90);
__ Mov(z12, z31);
__ Sqrdcmlah(z12.VnS(), z12.VnS(), z0.VnS(), z1.VnS(), 180);
__ Mov(z13, z31);
__ Sqrdcmlah(z13.VnS(), z13.VnS(), z0.VnS(), z1.VnS(), 270);
__ Sqrdcmlah(z14.VnS(), z31.VnS(), z0.VnS(), z1.VnS(), 0);
__ Sqrdcmlah(z15.VnS(), z31.VnS(), z0.VnS(), z1.VnS(), 90);
__ Sqrdcmlah(z16.VnS(), z31.VnS(), z0.VnS(), z1.VnS(), 180);
__ Sqrdcmlah(z17.VnS(), z31.VnS(), z0.VnS(), z1.VnS(), 270);
__ Mov(z18, z31);
__ Sqrdcmlah(z18.VnS(), z18.VnS(), z0.VnS(), z1.VnS(), 0, 270);
__ Mov(z19, z31);
__ Sqrdcmlah(z19.VnS(), z19.VnS(), z0.VnS(), z1.VnS(), 1, 90);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(zd_000_expected, z10.VnS());
ASSERT_EQUAL_SVE(zd_090_expected, z11.VnS());
ASSERT_EQUAL_SVE(zd_180_expected, z12.VnS());
ASSERT_EQUAL_SVE(zd_270_expected, z13.VnS());
ASSERT_EQUAL_SVE(z14, z10);
ASSERT_EQUAL_SVE(z15, z11);
ASSERT_EQUAL_SVE(z16, z12);
ASSERT_EQUAL_SVE(z17, z13);
ASSERT_EQUAL_SVE(zd_0_270_expected, z18.VnS());
ASSERT_EQUAL_SVE(zd_3_090_expected, z19.VnS());
}
}
TEST_SVE(sve2_sqrdmlah) {
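// Sqrdmlah is a signed saturating rounding doubling multiply-accumulate
// returning the high half: roughly zd = sat(za + ((2 * zn * zm + rounding)
// >> esize)), so near-maximal same-sign products saturate to 0x7f..f.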
uint16_t zn_inputs_h[] = {0x7ffe, 0x7ffd, 0x7ffd, 0x7ffd, 0x8000,
0x7fff, 0x7ffe, 0x7ffe, 0x8001, 0x8000,
0x7ffd, 0x7ffd, 0x7ffd, 0x5555, 0x5555,
0x5555, 0x8000, 0x8000, 0xaaaa, 0x8001};
uint16_t zm_inputs_h[] = {0x7ffd, 0x7fff, 0x7ffe, 0x7ffd, 0x8001,
0x7fff, 0x7fff, 0x7ffe, 0x8000, 0x8000,
0xaaaa, 0x0001, 0x0001, 0xaaaa, 0xaaaa,
0xcccc, 0x8000, 0x8000, 0x8000, 0x8001};
uint16_t za_inputs_h[] = {0x1010, 0x1010, 0x1010, 0x1010, 0x1010,
0x1010, 0x1010, 0x1010, 0x8000, 0x8011,
0x8006, 0xff7d, 0xfeff, 0xaabc, 0xaabb,
0x9c72, 0x8000, 0x0000, 0x8000, 0xffff};
uint16_t zd_expected_h[] = {0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff,
0x7fff, 0x7fff, 0x7fff, 0xffff, 0x0011,
0x8000, 0xff7e, 0xff00, 0x8000, 0x8000,
0x8000, 0x0000, 0x7fff, 0xd556, 0x7ffd};
uint32_t zn_inputs_s[] = {0x04000000,
0x80000000,
0x04000000,
0x80000000,
0x80000000,
0x80000001,
0x7fffffff,
0x80000000,
0x7ffffffe,
0x7ffffffd,
0x7ffffffd,
0x7ffffffd};
uint32_t zm_inputs_s[] = {0x00000020,
0x80000000,
0x00000010,
0x80000000,
0x7fffffff,
0x80000000,
0x80000000,
0x80000001,
0x7ffffffd,
0x7fffffff,
0x7ffffffe,
0x7ffffffd};
uint32_t za_inputs_s[] = {0x00000000,
0x00000000,
0x00000020,
0x00108000,
0x00000000,
0x00000001,
0x00000000,
0x00000001,
0x10101010,
0x10101010,
0x10101010,
0x10101010};
uint32_t zd_expected_s[] = {0x00000001,
0x7fffffff,
0x00000021,
0x7fffffff,
0x80000001,
0x7fffffff,
0x80000001,
0x7fffffff,
0x7fffffff,
0x7fffffff,
0x7fffffff,
0x7fffffff};
uint64_t zn_inputs_d[] = {0x0400000000000000, 0x8000000000000000,
0x0400000000000000, 0x8000000000000000,
0x8000000000000000, 0x8000000000000001,
0x7fffffffffffffff, 0x8000000000000000,
0x7ffffffffffffffe, 0x7ffffffffffffffd,
0x7ffffffffffffffd, 0x7ffffffffffffffd,
0xf1299accc9186169, 0xd529d2675ee9da21,
0x1a10b5d60b92dcf9, 0xfb1d358e0e6455b1,
0x8eb7721078bdc589, 0x4171509750ded141,
0x8eb7721078bdc589, 0x4171509750ded141};
uint64_t zm_inputs_d[] = {0x0000000000000020, 0x8000000000000000,
0x0000000000000010, 0x8000000000000000,
0x7fffffffffffffff, 0x8000000000000000,
0x8000000000000000, 0x8000000000000001,
0x7ffffffffffffffd, 0x7fffffffffffffff,
0x7ffffffffffffffe, 0x7ffffffffffffffd,
0x30b940efe73f180e, 0x3bc1ff1e52a99b66,
0x40de5c9793535a5e, 0x24752faf47bdddb6,
0x162663016b07e5ae, 0x1de34b56f3d22006,
0x8eb7721078bdc589, 0x4171509750ded141};
uint64_t za_inputs_d[] = {0x0000000000000000, 0x0000000000000000,
0x0000000000000020, 0x0010108000000000,
0x0000000000000000, 0x0000000000000001,
0x0000000000000000, 0x0000000000000001,
0x1010101010101010, 0x1010101010101010,
0x1010101010101010, 0x1010101010101010,
0xb18253371b2c2c77, 0xa70de31e6645eaef,
0xda817198c0318487, 0x9fd9e6b8e04b42ff,
0xced1f6b7119ab197, 0x01ae051a85509b0f,
0x01a211e9352f7927, 0x7667b70a5b13749f};
uint64_t zd_expected_d[] = {0x0000000000000001, 0x7fffffffffffffff,
0x0000000000000021, 0x7fffffffffffffff,
0x8000000000000001, 0x7fffffffffffffff,
0x8000000000000001, 0x7fffffffffffffff,
0x7fffffffffffffff, 0x7fffffffffffffff,
0x7fffffffffffffff, 0x7fffffffffffffff,
0xabdc73dea0d72a35, 0x930e3dc877301966,
0xe7b7145a059f8a9f, 0x9e75a4a9d10cf8af,
0xbb378528642d2581, 0x10f5e6d693ffddf3,
0x65e455a46adc091c, 0x7fffffffffffffff};
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
START();
InsrHelper(&masm, z0.VnH(), zn_inputs_h);
InsrHelper(&masm, z1.VnH(), zm_inputs_h);
InsrHelper(&masm, z2.VnH(), za_inputs_h);
__ Sqrdmlah(z2.VnH(), z2.VnH(), z0.VnH(), z1.VnH());
InsrHelper(&masm, z3.VnS(), zn_inputs_s);
InsrHelper(&masm, z4.VnS(), zm_inputs_s);
InsrHelper(&masm, z5.VnS(), za_inputs_s);
__ Sqrdmlah(z5.VnS(), z5.VnS(), z3.VnS(), z4.VnS());
InsrHelper(&masm, z6.VnD(), zn_inputs_d);
InsrHelper(&masm, z7.VnD(), zm_inputs_d);
InsrHelper(&masm, z8.VnD(), za_inputs_d);
__ Sqrdmlah(z8.VnD(), z8.VnD(), z6.VnD(), z7.VnD());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(zd_expected_h, z2.VnH());
ASSERT_EQUAL_SVE(zd_expected_s, z5.VnS());
ASSERT_EQUAL_SVE(zd_expected_d, z8.VnD());
}
}
TEST_SVE(sve2_cmla) {
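// Cmla treats each pair of elements as a complex number (even lanes real,
// odd lanes imaginary) and accumulates a partial product of the sources
// selected by the rotation (0, 90, 180 or 270 degrees).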
int32_t zn_inputs_s[] = {-2, -4, -6, -8, 2, 4, 6, 8};
int32_t zm_inputs_s[] = {-2, -4, -6, -8, 2, 4, 6, 8};
int32_t zda_inputs_s[] = {1, 2, 3, 4, 5, 6, 7, 8};
int32_t zd_000_expected[] = {9, 18, 51, 68, 13, 22, 55, 72};
int32_t zd_090_expected[] = {9, -2, 51, -32, 13, 2, 55, -28};
int32_t zd_180_expected[] = {-7, -14, -45, -60, -3, -10, -41, -56};
int32_t zd_270_expected[] = {-7, 6, -45, 40, -3, 10, -41, 44};
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
START();
InsrHelper(&masm, z31.VnS(), zn_inputs_s);
InsrHelper(&masm, z30.VnS(), zm_inputs_s);
InsrHelper(&masm, z0.VnS(), zda_inputs_s);
__ Mov(z29, z0);
__ Cmla(z0.VnS(), z0.VnS(), z31.VnS(), z30.VnS(), 0);
InsrHelper(&masm, z1.VnS(), zda_inputs_s);
__ Mov(z28, z1);
__ Cmla(z1.VnS(), z1.VnS(), z31.VnS(), z30.VnS(), 90);
InsrHelper(&masm, z2.VnS(), zda_inputs_s);
__ Mov(z27, z2);
__ Cmla(z2.VnS(), z2.VnS(), z31.VnS(), z30.VnS(), 180);
InsrHelper(&masm, z3.VnS(), zda_inputs_s);
__ Mov(z26, z3);
__ Cmla(z3.VnS(), z3.VnS(), z31.VnS(), z30.VnS(), 270);
__ Cmla(z4.VnS(), z29.VnS(), z31.VnS(), z30.VnS(), 0);
__ Cmla(z5.VnS(), z28.VnS(), z31.VnS(), z30.VnS(), 90);
__ Cmla(z6.VnS(), z27.VnS(), z31.VnS(), z30.VnS(), 180);
__ Cmla(z7.VnS(), z26.VnS(), z31.VnS(), z30.VnS(), 270);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(zd_000_expected, z0.VnS());
ASSERT_EQUAL_SVE(zd_090_expected, z1.VnS());
ASSERT_EQUAL_SVE(zd_180_expected, z2.VnS());
ASSERT_EQUAL_SVE(zd_270_expected, z3.VnS());
ASSERT_EQUAL_SVE(z4, z0);
ASSERT_EQUAL_SVE(z5, z1);
ASSERT_EQUAL_SVE(z6, z2);
ASSERT_EQUAL_SVE(z7, z3);
}
}
TEST_SVE(sve2_integer_saturating_multiply_add_long) {
int32_t zn_bottom_inputs[] =
{-2, -4, -6, -8, INT32_MAX, INT32_MIN, INT32_MIN};
int32_t zm_top_inputs[] = {1, 3, 5, 7, INT32_MAX, INT32_MAX, INT32_MIN};
int64_t sqdmlalbt_expected[] = {2,
-19,
-56,
-109,
static_cast<int64_t>(0x7ffffffe00000004),
RawbitsToInt64(0x8000000100000001),
INT64_MAX};
int64_t sqdmlslbt_expected[] = {-2,
19,
56,
109,
RawbitsToInt64(0x80000001fffffffc),
static_cast<int64_t>(0x7ffffffeffffffff),
RawbitsToInt64(0x8000000000000001)};
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
START();
InsrHelper(&masm, z31.VnS(), zn_bottom_inputs);
InsrHelper(&masm, z30.VnS(), zm_top_inputs);
__ Dup(z29.VnD(), 0);
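// Interleave the inputs with zero so that the zn values land in the even
// (bottom) elements and the zm values in the odd (top) elements, as the
// bottom-times-top Sqdmlalbt/Sqdmlslbt forms require.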
__ Zip1(z31.VnS(), z31.VnS(), z29.VnS());
__ Zip1(z30.VnS(), z29.VnS(), z30.VnS());
// Initialise inputs for za.
__ Index(z1.VnD(), 0, 1);
__ Index(z2.VnD(), 0, -1);
__ Sqdmlalbt(z1.VnD(), z1.VnD(), z31.VnS(), z30.VnS());
__ Sqdmlslbt(z2.VnD(), z2.VnD(), z31.VnS(), z30.VnS());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(sqdmlalbt_expected, z1.VnD());
ASSERT_EQUAL_SVE(sqdmlslbt_expected, z2.VnD());
}
}
TEST_SVE(sve2_floating_point_multiply_add_long_vector) {
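// Fmlalb/Fmlalt multiply the even (bottom) / odd (top) half-precision
// elements of the sources and accumulate the widened products into the
// overlapping single-precision lanes; Fmlslb/Fmlslt subtract the products.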
uint16_t zn_inputs[] = {Float16ToRawbits(Float16(1000)),
Float16ToRawbits(Float16(2000)),
Float16ToRawbits(Float16(0.5)),
Float16ToRawbits(Float16(-0.5)),
Float16ToRawbits(Float16(14)),
Float16ToRawbits(Float16(-14)),
Float16ToRawbits(kFP16PositiveInfinity),
Float16ToRawbits(kFP16NegativeInfinity)};
uint16_t zm_inputs[] = {Float16ToRawbits(Float16(10)),
Float16ToRawbits(Float16(-10)),
Float16ToRawbits(Float16(10)),
Float16ToRawbits(Float16(-10)),
Float16ToRawbits(Float16(10)),
Float16ToRawbits(Float16(-10)),
Float16ToRawbits(Float16(10)),
Float16ToRawbits(Float16(-10))};
uint32_t za_inputs[] = {FloatToRawbits(1.0f),
FloatToRawbits(-1.0f),
FloatToRawbits(1.0f),
FloatToRawbits(-1.0f)};
uint32_t fmlalb_zd_expected[] = {0xc69c3e00, // -19999
0x40800000, // 4
0x430d0000, // 141
FloatToRawbits(kFP32PositiveInfinity)};
uint32_t fmlalt_zd_expected[] = {0x461c4400, // 10001
0x40800000, // 4
0x430d0000, // 141
FloatToRawbits(kFP32PositiveInfinity)};
uint32_t fmlslb_zd_expected[] = {0x469c4200, // 20001
0xc0c00000, // -6
0xc30b0000, // -139
FloatToRawbits(kFP32NegativeInfinity)};
uint32_t fmlslt_zd_expected[] = {0xc61c3c00, // -9999
0xc0c00000, // -6
0xc30b0000, // -139
FloatToRawbits(kFP32NegativeInfinity)};
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
START();
InsrHelper(&masm, z31.VnH(), zn_inputs);
InsrHelper(&masm, z30.VnH(), zm_inputs);
InsrHelper(&masm, z29.VnS(), za_inputs);
__ Mov(z0, z29);
__ Fmlalb(z0.VnS(), z0.VnS(), z31.VnH(), z30.VnH());
__ Mov(z1, z29);
__ Fmlalt(z1.VnS(), z1.VnS(), z31.VnH(), z30.VnH());
__ Mov(z2, z29);
__ Fmlslb(z2.VnS(), z2.VnS(), z31.VnH(), z30.VnH());
__ Mov(z3, z29);
__ Fmlslt(z3.VnS(), z3.VnS(), z31.VnH(), z30.VnH());
__ Fmlalb(z4.VnS(), z29.VnS(), z31.VnH(), z30.VnH());
__ Fmlalt(z5.VnS(), z29.VnS(), z31.VnH(), z30.VnH());
__ Fmlslb(z6.VnS(), z29.VnS(), z31.VnH(), z30.VnH());
__ Fmlslt(z7.VnS(), z29.VnS(), z31.VnH(), z30.VnH());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(fmlalb_zd_expected, z0.VnS());
ASSERT_EQUAL_SVE(fmlalt_zd_expected, z1.VnS());
ASSERT_EQUAL_SVE(fmlslb_zd_expected, z2.VnS());
ASSERT_EQUAL_SVE(fmlslt_zd_expected, z3.VnS());
ASSERT_EQUAL_SVE(z4, z0);
ASSERT_EQUAL_SVE(z5, z1);
ASSERT_EQUAL_SVE(z6, z2);
ASSERT_EQUAL_SVE(z7, z3);
}
}
TEST_SVE(sve2_flogb_simple) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
START();
__ Ptrue(p0.VnB());
__ Index(z0.VnS(), -4, 1);
__ Mov(z1.VnS(), 0);
__ Mov(z2.VnD(), 0x000fffffffffffff);
__ Mov(z3.VnD(), 0x0010000000000000);
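// Flogb extracts the signed base-2 exponent of each element as an integer:
// zero gives INT_MIN, infinity gives INT_MAX and NaN gives INT_MIN for the
// lane size. The two D-lane inputs are the largest subnormal double
// (exponent -1023, i.e. 0x...fc01) and the smallest normal double
// (exponent -1022).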
__ Scvtf(z0.VnS(), p0.Merging(), z0.VnS());
__ Scvtf(z1.VnS(), p0.Merging(), z1.VnS());
__ Fdiv(z1.VnS(), p0.Merging(), z0.VnS(), z1.VnS());
__ Flogb(z0.VnS(), p0.Merging(), z0.VnS());
__ Flogb(z1.VnS(), p0.Merging(), z1.VnS());
__ Flogb(z2.VnD(), p0.Merging(), z2.VnD());
__ Flogb(z3.VnD(), p0.Merging(), z3.VnD());
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z0[] = {0x0000000200000002,
0x0000000200000002,
0x0000000100000001,
0x0000000080000000,
0x0000000000000001,
0x0000000100000002};
ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
uint64_t expected_z1[] = {0x7fffffff7fffffff,
0x7fffffff7fffffff,
0x7fffffff7fffffff,
0x7fffffff80000000,
0x7fffffff7fffffff,
0x7fffffff7fffffff};
ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
uint64_t expected_z2[] = {0xfffffffffffffc01,
0xfffffffffffffc01,
0xfffffffffffffc01,
0xfffffffffffffc01};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
uint64_t expected_z3[] = {0xfffffffffffffc02,
0xfffffffffffffc02,
0xfffffffffffffc02,
0xfffffffffffffc02};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
}
}
TEST_SVE(neon_matmul) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
CPUFeatures::kSVEI8MM,
CPUFeatures::kNEON,
CPUFeatures::kI8MM);
// Test Neon integer matrix multiply against SVE.
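// Each matmul instruction multiplies a 2x8 matrix of bytes from the first
// source by an 8x2 matrix from the second and accumulates into a 2x2 matrix
// of S lanes, independently in each 128-bit segment; the S/U/US prefixes
// select the signedness of the operands.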
START();
__ Movi(v0.V2D(), 0xffeeddccbbaa9988, 0x77665544332211);
__ Movi(v1.V2D(), 0xaa5555aa55555555, 0x55aaaa55aaaaaa);
__ Movi(v2.V2D(), 0, 0);
__ Movi(v3.V2D(), 0, 0);
__ Movi(v4.V2D(), 0, 0);
__ Movi(v5.V2D(), 0, 0);
__ Movi(v6.V2D(), 0, 0);
__ Movi(v7.V2D(), 0, 0);
__ Smmla(v2.V4S(), v0.V16B(), v1.V16B());
__ Smmla(z3.VnS(), z3.VnS(), z0.VnB(), z1.VnB());
__ Ummla(v4.V4S(), v0.V16B(), v1.V16B());
__ Ummla(z5.VnS(), z5.VnS(), z0.VnB(), z1.VnB());
__ Usmmla(v6.V4S(), v0.V16B(), v1.V16B());
__ Usmmla(z7.VnS(), z7.VnS(), z0.VnB(), z1.VnB());
END();
if (CAN_RUN()) {
RUN();
// The inputs as Z registers are zero beyond the least-significant 128 bits,
// so the Neon and SVE results should be equal for any VL.
ASSERT_EQUAL_SVE(z3, z2);
ASSERT_EQUAL_SVE(z5, z4);
ASSERT_EQUAL_SVE(z7, z6);
}
}
TEST_SVE(sudot_usdot) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
CPUFeatures::kSVE2,
CPUFeatures::kSVEI8MM);
START();
__ Ptrue(p0.VnB());
__ Index(z0.VnS(), -424242, 77777);
__ Index(z1.VnB(), 127, -1);
__ Sqabs(z1.VnB(), p0.Merging(), z1.VnB());
__ Index(z2.VnB(), 0, 1);
__ Sqabs(z2.VnB(), p0.Merging(), z2.VnB());
__ Index(z3.VnB(), -128, 1);
__ Mov(z4.VnD(), 0);
// Test Usdot against Udot/Sdot over the range of inputs where they should be
// equal.
__ Usdot(z5.VnS(), z0.VnS(), z1.VnB(), z2.VnB());
__ Udot(z6.VnS(), z0.VnS(), z1.VnB(), z2.VnB());
__ Usdot(z7.VnS(), z0.VnS(), z1.VnB(), z3.VnB());
__ Sdot(z8.VnS(), z0.VnS(), z1.VnB(), z3.VnB());
// Construct values which, when interpreted correctly as signed/unsigned,
// should give a zero result for dot product.
__ Mov(z10.VnS(), 0x8101ff40); // [-127, 1, -1, 64] as signed bytes.
__ Mov(z11.VnS(), 0x02fe8002); // [2, 254, 128, 2] as unsigned bytes.
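// Taking the bytes of each lane in increasing significance, the zero result
// follows from 2 * 64 + 128 * (-1) + 254 * 1 + 2 * (-127) = 0.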
__ Usdot(z12.VnS(), z4.VnS(), z11.VnB(), z10.VnB());
__ Usdot(z13.VnS(), z4.VnS(), z10.VnB(), z11.VnB());
// Construct a vector with duplicated values across segments. This allows
// testing indexed dot product against the already tested variant.
__ Mov(z14.VnS(), 1);
__ Mul(z15.VnS(), z14.VnS(), z3.VnS(), 1);
__ Usdot(z16.VnS(), z0.VnS(), z3.VnB(), z3.VnB(), 1);
__ Usdot(z17.VnS(), z0.VnS(), z3.VnB(), z15.VnB());
__ Sudot(z18.VnS(), z0.VnS(), z3.VnB(), z3.VnB(), 1);
__ Usdot(z19.VnS(), z0.VnS(), z15.VnB(), z3.VnB());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z6, z5);
ASSERT_EQUAL_SVE(z8, z7);
ASSERT_EQUAL_SVE(z4, z12);
uint64_t z13_expected[] = {0xffff8200ffff8200, 0xffff8200ffff8200};
ASSERT_EQUAL_SVE(z13_expected, z13.VnD());
ASSERT_EQUAL_SVE(z17, z16);
ASSERT_EQUAL_SVE(z19, z18);
}
}
TEST_SVE(neon_ins_zero_high_regression_test) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kNEON, CPUFeatures::kSVE);
START();
__ Movi(v0.V2D(), 0x0f0e0d0c0b0a0908, 0x0706050403020100);
// Check that both forms of Ins zero bits <VL-1:128>.
__ Index(z1.VnB(), 0, 1);
__ Ins(v1.V16B(), 0, wzr);
__ Index(z2.VnB(), 0, 1);
__ Ins(v2.V16B(), 3, v2.V16B(), 3);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z0, z1);
ASSERT_EQUAL_SVE(z0, z2);
}
}
TEST_SVE(neon_fcvt_zero_high_regression_test) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kFP,
CPUFeatures::kNEON,
CPUFeatures::kSVE);
START();
__ Mov(z1.VnD(), 0);
__ Mov(z2.VnD(), 0);
__ Mov(z3.VnD(), 0);
__ Mov(z4.VnD(), 0);
__ Mov(z5.VnD(), 0);
__ Mov(z6.VnD(), 0);
__ Mov(z10.VnD(), 0);
Label done;
// Skip calculations for VL128.
__ Rdvl(x0, 1);
__ Cmp(x0, 16);
__ B(eq, &done);
__ Movi(v0.V2D(), 0x3ff000003f800000);
__ Index(z1.VnB(), 0, 1);
__ Index(z2.VnB(), 0, 1);
__ Index(z3.VnB(), 0, 1);
__ Index(z4.VnB(), 0, 1);
__ Index(z5.VnB(), 0, 1);
__ Index(z6.VnB(), 0, 1);
// Test zeroing bits <VL-1:128> for fcvtl, fcvtn and fcvtxn.
__ Fcvtl(v1.V2D(), v0.V2S());
__ Fcvtl2(v2.V2D(), v0.V4S());
__ Fcvtn(v3.V2S(), v0.V2D());
__ Fcvtn2(v4.V4S(), v0.V2D());
__ Fcvtxn(v5.V2S(), v0.V2D());
__ Fcvtxn2(v6.V4S(), v0.V2D());
// Set the expected non-zero bits to zero.
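// Ext shifts each result right by the number of bytes the instruction
// legitimately writes, filling from z10 (zero), so only the bits
// <VL-1:128> under test remain.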
__ Ext(z1.VnB(), z1.VnB(), z10.VnB(), kDRegSizeInBytes * 2);
__ Ext(z2.VnB(), z2.VnB(), z10.VnB(), kDRegSizeInBytes * 2);
__ Ext(z3.VnB(), z3.VnB(), z10.VnB(), kSRegSizeInBytes * 2);
__ Ext(z4.VnB(), z4.VnB(), z10.VnB(), kSRegSizeInBytes * 4);
__ Ext(z5.VnB(), z5.VnB(), z10.VnB(), kSRegSizeInBytes * 2);
__ Ext(z6.VnB(), z6.VnB(), z10.VnB(), kSRegSizeInBytes * 4);
__ Bind(&done);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z10, z1);
ASSERT_EQUAL_SVE(z10, z2);
ASSERT_EQUAL_SVE(z10, z3);
ASSERT_EQUAL_SVE(z10, z4);
ASSERT_EQUAL_SVE(z10, z5);
ASSERT_EQUAL_SVE(z10, z6);
}
}
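// Fill z0 with a known pattern, execute the NEON instruction (which must
// zero bits <VL-1:128> of its destination), then OR the result into z10 so
// that any stale high bits accumulate and can be checked once at the end.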
#define TEST_ZEROING(INST) \
__ Index(z0.VnB(), 0, 1); \
__ INST; \
__ Orr(z10.VnB(), z10.VnB(), z0.VnB());
TEST_SVE(neon_zero_high) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kFP,
CPUFeatures::kNEON,
CPUFeatures::kNEONHalf,
CPUFeatures::kSVE,
CPUFeatures::kFcma,
CPUFeatures::kFHM,
CPUFeatures::kFrintToFixedSizedInt,
CPUFeatures::kDotProduct,
CPUFeatures::kRDM,
CPUFeatures::kI8MM);
START();
__ Mov(z10.VnD(), 0); // Initialise cumulative result register.
TEST_ZEROING(Abs(v0.V16B(), v0.V16B()));
TEST_ZEROING(Abs(v0.V2S(), v0.V2S()));
TEST_ZEROING(Add(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Add(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Addhn2(v0.V16B(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Addhn(v0.V4H(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Addp(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Addp(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(And(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Bic(v0.V8H(), 0, 0));
TEST_ZEROING(Bic(v0.V2S(), 255, 0));
TEST_ZEROING(Bic(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Bif(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Bit(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Bsl(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Cls(v0.V16B(), v0.V16B()));
TEST_ZEROING(Cls(v0.V2S(), v0.V2S()));
TEST_ZEROING(Clz(v0.V16B(), v0.V16B()));
TEST_ZEROING(Clz(v0.V2S(), v0.V2S()));
TEST_ZEROING(Cmeq(v0.V16B(), v0.V16B(), 0));
TEST_ZEROING(Cmeq(v0.V2S(), v0.V2S(), 0));
TEST_ZEROING(Cmeq(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Cmeq(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Cmge(v0.V16B(), v0.V16B(), 0));
TEST_ZEROING(Cmge(v0.V2S(), v0.V2S(), 0));
TEST_ZEROING(Cmge(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Cmge(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Cmgt(v0.V16B(), v0.V16B(), 0));
TEST_ZEROING(Cmgt(v0.V2S(), v0.V2S(), 0));
TEST_ZEROING(Cmgt(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Cmgt(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Cmhi(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Cmhi(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Cmhs(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Cmhs(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Cmle(v0.V16B(), v0.V16B(), 0));
TEST_ZEROING(Cmle(v0.V2S(), v0.V2S(), 0));
TEST_ZEROING(Cmlt(v0.V16B(), v0.V16B(), 0));
TEST_ZEROING(Cmlt(v0.V2S(), v0.V2S(), 0));
TEST_ZEROING(Cmtst(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Cmtst(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Cnt(v0.V16B(), v0.V16B()));
TEST_ZEROING(Dup(v0.V2S(), w0));
TEST_ZEROING(Dup(v0.V8B(), w0));
TEST_ZEROING(Dup(v0.V2S(), v0.S(), 0));
TEST_ZEROING(Dup(v0.V8B(), v0.B(), 0));
TEST_ZEROING(Eor(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Ext(v0.V16B(), v0.V16B(), v0.V16B(), 0));
TEST_ZEROING(Ext(v0.V8B(), v0.V8B(), v0.V8B(), 4));
TEST_ZEROING(Fabd(v0.V4S(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Fabd(v0.V8H(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Fabs(v0.V4S(), v0.V4S()));
TEST_ZEROING(Fabs(v0.V8H(), v0.V8H()));
TEST_ZEROING(Facge(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Facge(v0.V8H(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Facgt(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Facgt(v0.V8H(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Fadd(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Fadd(v0.V8H(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Faddp(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Faddp(v0.V8H(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Fcadd(v0.V2S(), v0.V2S(), v0.V2S(), 90));
TEST_ZEROING(Fcadd(v0.V8H(), v0.V8H(), v0.V8H(), 90));
TEST_ZEROING(Fcmeq(v0.V2S(), v0.V2S(), 0));
TEST_ZEROING(Fcmeq(v0.V8H(), v0.V8H(), 0));
TEST_ZEROING(Fcmeq(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Fcmeq(v0.V8H(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Fcmge(v0.V2S(), v0.V2S(), 0));
TEST_ZEROING(Fcmge(v0.V8H(), v0.V8H(), 0));
TEST_ZEROING(Fcmge(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Fcmge(v0.V8H(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Fcmgt(v0.V2S(), v0.V2S(), 0));
TEST_ZEROING(Fcmgt(v0.V8H(), v0.V8H(), 0));
TEST_ZEROING(Fcmgt(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Fcmgt(v0.V8H(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Fcmla(v0.V4H(), v0.V4H(), v0.H(), 0, 0));
TEST_ZEROING(Fcmla(v0.V4S(), v0.V4S(), v0.S(), 0, 0));
TEST_ZEROING(Fcmla(v0.V4S(), v0.V4S(), v0.V4S(), 0));
TEST_ZEROING(Fcmla(v0.V4H(), v0.V4H(), v0.V4H(), 0));
TEST_ZEROING(Fcmle(v0.V2S(), v0.V2S(), 0));
TEST_ZEROING(Fcmle(v0.V8H(), v0.V8H(), 0));
TEST_ZEROING(Fcmlt(v0.V2S(), v0.V2S(), 0));
TEST_ZEROING(Fcmlt(v0.V8H(), v0.V8H(), 0));
TEST_ZEROING(Fcvtas(v0.V2S(), v0.V2S()));
TEST_ZEROING(Fcvtas(v0.V8H(), v0.V8H()));
TEST_ZEROING(Fcvtau(v0.V2S(), v0.V2S()));
TEST_ZEROING(Fcvtau(v0.V8H(), v0.V8H()));
TEST_ZEROING(Fcvtl2(v0.V4S(), v0.V8H()));
TEST_ZEROING(Fcvtl(v0.V2D(), v0.V2S()));
TEST_ZEROING(Fcvtms(v0.V2S(), v0.V2S()));
TEST_ZEROING(Fcvtms(v0.V8H(), v0.V8H()));
TEST_ZEROING(Fcvtmu(v0.V2S(), v0.V2S()));
TEST_ZEROING(Fcvtmu(v0.V8H(), v0.V8H()));
TEST_ZEROING(Fcvtn2(v0.V8H(), v0.V4S()));
TEST_ZEROING(Fcvtn(v0.V2S(), v0.V2D()));
TEST_ZEROING(Fcvtns(v0.V2S(), v0.V2S()));
TEST_ZEROING(Fcvtns(v0.V8H(), v0.V8H()));
TEST_ZEROING(Fcvtnu(v0.V2S(), v0.V2S()));
TEST_ZEROING(Fcvtnu(v0.V8H(), v0.V8H()));
TEST_ZEROING(Fcvtps(v0.V2S(), v0.V2S()));
TEST_ZEROING(Fcvtps(v0.V8H(), v0.V8H()));
TEST_ZEROING(Fcvtpu(v0.V2S(), v0.V2S()));
TEST_ZEROING(Fcvtpu(v0.V8H(), v0.V8H()));
TEST_ZEROING(Fcvtxn(v0.V2S(), v0.V2D()));
TEST_ZEROING(Fcvtxn2(v0.V4S(), v0.V2D()));
TEST_ZEROING(Fcvtzs(v0.V2S(), v0.V2S()));
TEST_ZEROING(Fcvtzs(v0.V8H(), v0.V8H()));
TEST_ZEROING(Fcvtzs(v0.V2D(), v0.V2D(), 8));
TEST_ZEROING(Fcvtzu(v0.V2S(), v0.V2S()));
TEST_ZEROING(Fcvtzu(v0.V4H(), v0.V4H()));
TEST_ZEROING(Fcvtzu(v0.V2D(), v0.V2D(), 8));
TEST_ZEROING(Fdiv(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Fdiv(v0.V8H(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Fmax(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Fmax(v0.V8H(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Fmaxnm(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Fmaxnm(v0.V8H(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Fmaxnmp(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Fmaxnmp(v0.V8H(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Fmaxp(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Fmaxp(v0.V8H(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Fmin(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Fmin(v0.V8H(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Fminnm(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Fminnm(v0.V8H(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Fminnmp(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Fminnmp(v0.V8H(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Fminp(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Fminp(v0.V8H(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Fmla(v0.V4S(), v0.V4S(), v0.S(), 0));
TEST_ZEROING(Fmla(v0.V4H(), v0.V4H(), v0.H(), 2));
TEST_ZEROING(Fmla(v0.V4S(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Fmla(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Fmlal2(v0.V4S(), v0.V4H(), v0.H(), 0));
TEST_ZEROING(Fmlal2(v0.V2S(), v0.V2H(), v0.H(), 2));
TEST_ZEROING(Fmlal2(v0.V4S(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Fmlal(v0.V4S(), v0.V4H(), v0.H(), 0));
TEST_ZEROING(Fmlal(v0.V2S(), v0.V2H(), v0.H(), 2));
TEST_ZEROING(Fmlal(v0.V4S(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Fmls(v0.V4S(), v0.V4S(), v0.S(), 0));
TEST_ZEROING(Fmls(v0.V4H(), v0.V4H(), v0.H(), 2));
TEST_ZEROING(Fmls(v0.V4S(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Fmls(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Fmlsl2(v0.V4S(), v0.V4H(), v0.H(), 0));
TEST_ZEROING(Fmlsl2(v0.V2S(), v0.V2H(), v0.H(), 2));
TEST_ZEROING(Fmlsl2(v0.V4S(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Fmlsl(v0.V4S(), v0.V4H(), v0.H(), 0));
TEST_ZEROING(Fmlsl(v0.V2S(), v0.V2H(), v0.H(), 2));
TEST_ZEROING(Fmlsl(v0.V4S(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Fmov(v0.V2D(), 2.0000));
TEST_ZEROING(Fmov(v0.V4H(), 2.0000));
TEST_ZEROING(Fmov(v0.D(), 1, x1));
TEST_ZEROING(Fmul(v0.V4S(), v0.V4S(), v0.S(), 0));
TEST_ZEROING(Fmul(v0.V4H(), v0.V4H(), v0.H(), 2));
TEST_ZEROING(Fmul(v0.V4S(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Fmul(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Fmulx(v0.V4S(), v0.V4S(), v0.S(), 0));
TEST_ZEROING(Fmulx(v0.V4H(), v0.V4H(), v0.H(), 2));
TEST_ZEROING(Fmulx(v0.V4S(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Fmulx(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Fneg(v0.V4S(), v0.V4S()));
TEST_ZEROING(Fneg(v0.V4H(), v0.V4H()));
TEST_ZEROING(Frecpe(v0.V4S(), v0.V4S()));
TEST_ZEROING(Frecpe(v0.V4H(), v0.V4H()));
TEST_ZEROING(Frecps(v0.V4S(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Frecps(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Frint32x(v0.V4S(), v0.V4S()));
TEST_ZEROING(Frint32z(v0.V4S(), v0.V4S()));
TEST_ZEROING(Frint64x(v0.V4S(), v0.V4S()));
TEST_ZEROING(Frint64z(v0.V4S(), v0.V4S()));
TEST_ZEROING(Frinta(v0.V4S(), v0.V4S()));
TEST_ZEROING(Frinta(v0.V4H(), v0.V4H()));
TEST_ZEROING(Frinti(v0.V4S(), v0.V4S()));
TEST_ZEROING(Frinti(v0.V4H(), v0.V4H()));
TEST_ZEROING(Frintm(v0.V4S(), v0.V4S()));
TEST_ZEROING(Frintm(v0.V4H(), v0.V4H()));
TEST_ZEROING(Frintn(v0.V4S(), v0.V4S()));
TEST_ZEROING(Frintn(v0.V4H(), v0.V4H()));
TEST_ZEROING(Frintp(v0.V4S(), v0.V4S()));
TEST_ZEROING(Frintp(v0.V4H(), v0.V4H()));
TEST_ZEROING(Frintx(v0.V4S(), v0.V4S()));
TEST_ZEROING(Frintx(v0.V4H(), v0.V4H()));
TEST_ZEROING(Frintz(v0.V4S(), v0.V4S()));
TEST_ZEROING(Frintz(v0.V4H(), v0.V4H()));
TEST_ZEROING(Frsqrte(v0.V4S(), v0.V4S()));
TEST_ZEROING(Frsqrte(v0.V4H(), v0.V4H()));
TEST_ZEROING(Frsqrts(v0.V4S(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Frsqrts(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Fsqrt(v0.V4S(), v0.V4S()));
TEST_ZEROING(Fsqrt(v0.V4H(), v0.V4H()));
TEST_ZEROING(Fsub(v0.V4S(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Fsub(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Mov(v0.D(), 0, x0));
TEST_ZEROING(Mov(v0.S(), 0, w0));
TEST_ZEROING(Mov(v0.H(), 0, w0));
TEST_ZEROING(Mov(v0.B(), 0, w0));
TEST_ZEROING(Mov(v0.D(), 0, v0.D(), 0));
TEST_ZEROING(Mov(v0.S(), 0, v0.S(), 0));
TEST_ZEROING(Mov(v0.H(), 0, v0.H(), 0));
TEST_ZEROING(Mov(v0.B(), 0, v0.B(), 0));
TEST_ZEROING(Mla(v0.V4S(), v0.V4S(), v0.S(), 0));
TEST_ZEROING(Mla(v0.V4H(), v0.V4H(), v0.H(), 0));
TEST_ZEROING(Mla(v0.V4S(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Mla(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Mls(v0.V4S(), v0.V4S(), v0.S(), 0));
TEST_ZEROING(Mls(v0.V4H(), v0.V4H(), v0.H(), 0));
TEST_ZEROING(Mls(v0.V4S(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Mls(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Movi(v0.V2D(), 0xff));
TEST_ZEROING(Movi(v0.V2S(), 0xff));
TEST_ZEROING(Movi(v0.V4S(), 0x10, LSL, 8));
TEST_ZEROING(Movi(v0.V2S(), 0x10, LSL, 8));
TEST_ZEROING(Mul(v0.V4S(), v0.V4S(), v0.S(), 0));
TEST_ZEROING(Mul(v0.V4H(), v0.V4H(), v0.H(), 0));
TEST_ZEROING(Mul(v0.V4S(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Mul(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Mvni(v0.V8H(), 0x10, LSL, 8));
TEST_ZEROING(Mvni(v0.V4H(), 0x10, LSL, 8));
TEST_ZEROING(Neg(v0.V4S(), v0.V4S()));
TEST_ZEROING(Neg(v0.V4H(), v0.V4H()));
TEST_ZEROING(Mvn(v0.V16B(), v0.V16B()));
TEST_ZEROING(Mvn(v0.V8B(), v0.V8B()));
TEST_ZEROING(Orn(v0.V8B(), v0.V8B(), v0.V8B()));
TEST_ZEROING(Orn(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Orr(v0.V8H(), 0x10, 8));
TEST_ZEROING(Orr(v0.V4H(), 0x10, 8));
TEST_ZEROING(Mov(v0.V8B(), v0.V8B()));
TEST_ZEROING(Mov(v0.V16B(), v0.V16B()));
TEST_ZEROING(Pmul(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Pmull(v0.V8H(), v0.V8B(), v0.V8B()));
TEST_ZEROING(Pmull2(v0.V8H(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Raddhn2(v0.V16B(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Raddhn(v0.V4H(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Rbit(v0.V8B(), v0.V8B()));
TEST_ZEROING(Rbit(v0.V16B(), v0.V16B()));
TEST_ZEROING(Rsubhn2(v0.V16B(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Rsubhn(v0.V4H(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Saba(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Saba(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Saba(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Sabal2(v0.V8H(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Sabal(v0.V4S(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Sabd(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Sabd(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Sabd(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Sabdl2(v0.V8H(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Sabdl(v0.V4S(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Sadalp(v0.V8H(), v0.V16B()));
TEST_ZEROING(Saddl2(v0.V8H(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Saddl(v0.V2D(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Saddl(v0.V4S(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Saddw2(v0.V8H(), v0.V8H(), v0.V16B()));
TEST_ZEROING(Saddw(v0.V4S(), v0.V4S(), v0.V4H()));
TEST_ZEROING(Scvtf(v0.V4S(), v0.V4S()));
TEST_ZEROING(Scvtf(v0.V8H(), v0.V8H()));
TEST_ZEROING(Scvtf(v0.V2D(), v0.V2D(), 8));
TEST_ZEROING(Sdot(v0.V4S(), v0.V16B(), v0.S4B(), 0));
TEST_ZEROING(Sdot(v0.V2S(), v0.V8B(), v0.S4B(), 0));
TEST_ZEROING(Sdot(v0.V4S(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Sdot(v0.V2S(), v0.V8B(), v0.V8B()));
TEST_ZEROING(Shadd(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Shadd(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Shl(v0.V2D(), v0.V2D(), 56));
TEST_ZEROING(Shll2(v0.V8H(), v0.V16B(), 8));
TEST_ZEROING(Shll(v0.V2D(), v0.V2S(), 32));
TEST_ZEROING(Shsub(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Shsub(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Sli(v0.V2D(), v0.V2D(), 56));
TEST_ZEROING(Sli(v0.V2S(), v0.V2S(), 16));
TEST_ZEROING(Smax(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Smax(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Smaxp(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Smaxp(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Smin(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Smin(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Sminp(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Sminp(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Smlal2(v0.V8H(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Smlal(v0.V2D(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Smlal(v0.V2D(), v0.V2S(), v0.S(), 0));
TEST_ZEROING(Smlsl2(v0.V8H(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Smlsl(v0.V2D(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Smlsl(v0.V2D(), v0.V2S(), v0.S(), 0));
TEST_ZEROING(Smull2(v0.V8H(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Smull(v0.V2D(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Smull(v0.V2D(), v0.V2S(), v0.S(), 0));
TEST_ZEROING(Sqabs(v0.V16B(), v0.V16B()));
TEST_ZEROING(Sqabs(v0.V4H(), v0.V4H()));
TEST_ZEROING(Sqadd(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Sqadd(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Sqdmlal2(v0.V4S(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Sqdmlal(v0.V2D(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Sqdmlal(v0.V2D(), v0.V2S(), v0.S(), 0));
TEST_ZEROING(Sqdmlsl2(v0.V4S(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Sqdmlsl(v0.V2D(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Sqdmlsl(v0.V2D(), v0.V2S(), v0.S(), 0));
TEST_ZEROING(Sqdmulh(v0.V4S(), v0.V4S(), v0.S(), 0));
TEST_ZEROING(Sqdmulh(v0.V4H(), v0.V4H(), v0.H(), 0));
TEST_ZEROING(Sqdmulh(v0.V4S(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Sqdmulh(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Sqdmull2(v0.V2D(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Sqdmull(v0.V4S(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Sqdmull2(v0.V2D(), v0.V4S(), v0.S(), 0));
TEST_ZEROING(Sqdmull(v0.V4S(), v0.V4H(), v0.H(), 0));
TEST_ZEROING(Sqneg(v0.V16B(), v0.V16B()));
TEST_ZEROING(Sqneg(v0.V2S(), v0.V2S()));
TEST_ZEROING(Sqrdmlah(v0.V4S(), v0.V4S(), v0.S(), 0));
TEST_ZEROING(Sqrdmlah(v0.V4H(), v0.V4H(), v0.H(), 0));
TEST_ZEROING(Sqrdmlah(v0.V4S(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Sqrdmlah(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Sqrdmlsh(v0.V4S(), v0.V4S(), v0.S(), 0));
TEST_ZEROING(Sqrdmlsh(v0.V4H(), v0.V4H(), v0.H(), 0));
TEST_ZEROING(Sqrdmlsh(v0.V4S(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Sqrdmlsh(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Sqrdmulh(v0.V4S(), v0.V4S(), v0.S(), 0));
TEST_ZEROING(Sqrdmulh(v0.V4H(), v0.V4H(), v0.H(), 0));
TEST_ZEROING(Sqrdmulh(v0.V4S(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Sqrdmulh(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Sqrshl(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Sqrshl(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Sqshl(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Sqshl(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Sqshl(v0.V2D(), v0.V2D(), 56));
TEST_ZEROING(Sqshl(v0.V2S(), v0.V2S(), 16));
TEST_ZEROING(Sqshlu(v0.V2D(), v0.V2D(), 56));
TEST_ZEROING(Sqshlu(v0.V2S(), v0.V2S(), 16));
TEST_ZEROING(Sqsub(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Sqsub(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Sqxtn2(v0.V16B(), v0.V8H()));
TEST_ZEROING(Sqxtn(v0.V2S(), v0.V2D()));
TEST_ZEROING(Sqxtun2(v0.V16B(), v0.V8H()));
TEST_ZEROING(Sqxtun(v0.V2S(), v0.V2D()));
TEST_ZEROING(Srhadd(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Srhadd(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Sri(v0.V2D(), v0.V2D(), 8));
TEST_ZEROING(Sri(v0.V2S(), v0.V2S(), 8));
TEST_ZEROING(Srshl(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Srshl(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Srshr(v0.V2D(), v0.V2D(), 8));
TEST_ZEROING(Srshr(v0.V2S(), v0.V2S(), 8));
TEST_ZEROING(Srsra(v0.V2D(), v0.V2D(), 8));
TEST_ZEROING(Srsra(v0.V2S(), v0.V2S(), 8));
TEST_ZEROING(Sshl(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Sshl(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Sshr(v0.V2D(), v0.V2D(), 8));
TEST_ZEROING(Sshr(v0.V2S(), v0.V2S(), 8));
TEST_ZEROING(Ssra(v0.V2D(), v0.V2D(), 8));
TEST_ZEROING(Ssra(v0.V2S(), v0.V2S(), 8));
TEST_ZEROING(Ssubl2(v0.V8H(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Ssubl(v0.V4S(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Ssubw2(v0.V8H(), v0.V8H(), v0.V16B()));
TEST_ZEROING(Ssubw(v0.V4S(), v0.V4S(), v0.V4H()));
TEST_ZEROING(Sub(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Sub(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Subhn2(v0.V16B(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Subhn(v0.V4H(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Sudot(v0.V4S(), v0.V16B(), v0.S4B(), 0));
TEST_ZEROING(Sudot(v0.V2S(), v0.V8B(), v0.S4B(), 2));
TEST_ZEROING(Suqadd(v0.V16B(), v0.V16B()));
TEST_ZEROING(Suqadd(v0.V4H(), v0.V4H()));
TEST_ZEROING(Tbl(v0.V8B(), {v0.V16B()}, v0.V8B()));
TEST_ZEROING(Tbl(v0.V16B(), {v0.V16B()}, v0.V16B()));
TEST_ZEROING(Tbx(v0.V8B(), {v0.V16B()}, v0.V8B()));
TEST_ZEROING(Tbx(v0.V16B(), {v0.V16B()}, v0.V16B()));
TEST_ZEROING(Trn1(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Trn1(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Trn2(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Trn2(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Uaba(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Uaba(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Uabal2(v0.V8H(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Uabal(v0.V4S(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Uabd(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Uabd(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Uabdl2(v0.V8H(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Uabdl(v0.V4S(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Uadalp(v0.V8H(), v0.V16B()));
TEST_ZEROING(Uadalp(v0.V2S(), v0.V4H()));
TEST_ZEROING(Uaddl2(v0.V8H(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Uaddl(v0.V4S(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Uaddlp(v0.V8H(), v0.V16B()));
TEST_ZEROING(Uaddlp(v0.V2S(), v0.V4H()));
TEST_ZEROING(Uaddw2(v0.V8H(), v0.V8H(), v0.V16B()));
TEST_ZEROING(Uaddw(v0.V4S(), v0.V4S(), v0.V4H()));
TEST_ZEROING(Ucvtf(v0.V4S(), v0.V4S()));
TEST_ZEROING(Ucvtf(v0.V4H(), v0.V4H()));
TEST_ZEROING(Ucvtf(v0.V2D(), v0.V2D(), 8));
TEST_ZEROING(Ucvtf(v0.V2S(), v0.V2S(), 8));
TEST_ZEROING(Udot(v0.V4S(), v0.V16B(), v0.S4B(), 0));
TEST_ZEROING(Udot(v0.V2S(), v0.V8B(), v0.S4B(), 0));
TEST_ZEROING(Udot(v0.V2S(), v0.V8B(), v0.V8B()));
TEST_ZEROING(Udot(v0.V4S(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Uhadd(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Uhadd(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Uhsub(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Uhsub(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Umax(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Umax(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Umaxp(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Umaxp(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Umin(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Umin(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Uminp(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Uminp(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Umlal2(v0.V8H(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Umlal(v0.V4S(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Umlal(v0.V2D(), v0.V2S(), v0.S(), 0));
TEST_ZEROING(Umlal(v0.V4S(), v0.V4H(), v0.H(), 0));
TEST_ZEROING(Umlsl2(v0.V8H(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Umlsl(v0.V4S(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Umlsl(v0.V2D(), v0.V2S(), v0.S(), 0));
TEST_ZEROING(Umlsl(v0.V4S(), v0.V4H(), v0.H(), 0));
TEST_ZEROING(Umull2(v0.V8H(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Umull(v0.V4S(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Umull(v0.V2D(), v0.V2S(), v0.S(), 0));
TEST_ZEROING(Umull(v0.V4S(), v0.V4H(), v0.H(), 0));
TEST_ZEROING(Uqadd(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Uqadd(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Uqrshl(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Uqrshl(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Uqshl(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Uqshl(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Uqsub(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Uqsub(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Uqxtn2(v0.V16B(), v0.V8H()));
TEST_ZEROING(Uqxtn(v0.V2S(), v0.V2D()));
TEST_ZEROING(Urecpe(v0.V2S(), v0.V2S()));
TEST_ZEROING(Urecpe(v0.V4S(), v0.V4S()));
TEST_ZEROING(Urhadd(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Urhadd(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Urshl(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Urshl(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Urshr(v0.V2D(), v0.V2D(), 8));
TEST_ZEROING(Urshr(v0.V2S(), v0.V2S(), 8));
TEST_ZEROING(Ursqrte(v0.V4S(), v0.V4S()));
TEST_ZEROING(Ursqrte(v0.V2S(), v0.V2S()));
TEST_ZEROING(Ursra(v0.V2D(), v0.V2D(), 8));
TEST_ZEROING(Ursra(v0.V2S(), v0.V2S(), 8));
TEST_ZEROING(Usdot(v0.V4S(), v0.V16B(), v0.S4B(), 0));
TEST_ZEROING(Usdot(v0.V2S(), v0.V8B(), v0.S4B(), 1));
TEST_ZEROING(Usdot(v0.V4S(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Usdot(v0.V2S(), v0.V8B(), v0.V8B()));
TEST_ZEROING(Ushl(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Ushl(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Ushr(v0.V2D(), v0.V2D(), 8));
TEST_ZEROING(Ushr(v0.V2S(), v0.V2S(), 8));
TEST_ZEROING(Usqadd(v0.V16B(), v0.V16B()));
TEST_ZEROING(Usqadd(v0.V4H(), v0.V4H()));
TEST_ZEROING(Usra(v0.V2D(), v0.V2D(), 8));
TEST_ZEROING(Usra(v0.V2S(), v0.V2S(), 8));
TEST_ZEROING(Usubl2(v0.V8H(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Usubl(v0.V4S(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Usubw2(v0.V8H(), v0.V8H(), v0.V16B()));
TEST_ZEROING(Usubw(v0.V4S(), v0.V4S(), v0.V4H()));
TEST_ZEROING(Uzp1(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Uzp1(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Uzp2(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Uzp2(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Xtn2(v0.V16B(), v0.V8H()));
TEST_ZEROING(Xtn(v0.V4H(), v0.V4S()));
TEST_ZEROING(Zip1(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Zip1(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Zip2(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Zip2(v0.V4H(), v0.V4H(), v0.V4H()));
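// Check the accumulated result. The low 128 bits of z10 hold legitimate NEON
// results, so they are shifted out below; whatever remains must be zero if
// every instruction above cleared the bits beyond bit 127.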
__ Mov(z11.VnD(), 0);
Label done, zero_127_to_0;
__ Rdvl(x0, 1);
__ Cmp(x0, 16);
__ B(gt, &zero_127_to_0);
// For a 128-bit VL, there are no bits above 127 to test, so zero the whole
// register.
__ Mov(z10.VnD(), 0);
__ B(&done);
// Shift out the low 128 bits, which legitimately hold non-zero results.
__ Bind(&zero_127_to_0);
__ Ext(z10.VnB(), z10.VnB(), z11.VnB(), kDRegSizeInBytes * 2);
__ Bind(&done);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z11, z10);
}
}
#undef TEST_ZEROING
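// As above, but for NEON loads: each macro fills the destination registers
// with a non-zero index pattern, performs the load, then ORs the results into
// the cumulative register z10, where any bits beyond bit 127 that were not
// zeroed will accumulate and be detected by the final check.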
#define TEST_ZEROING_1(INST) \
__ Index(z0.VnB(), 0, 1); \
__ INST; \
__ Orr(z10.VnB(), z10.VnB(), z0.VnB());
#define TEST_ZEROING_2(INST) \
__ Index(z0.VnB(), 0, 1); \
__ Index(z1.VnB(), 0, 1); \
__ INST; \
__ Orr(z10.VnB(), z10.VnB(), z0.VnB()); \
__ Orr(z10.VnB(), z10.VnB(), z1.VnB());
#define TEST_ZEROING_3(INST) \
__ Index(z0.VnB(), 0, 1); \
__ Index(z1.VnB(), 0, 1); \
__ Index(z2.VnB(), 0, 1); \
__ INST; \
__ Orr(z10.VnB(), z10.VnB(), z0.VnB()); \
__ Orr(z10.VnB(), z10.VnB(), z1.VnB()); \
__ Orr(z10.VnB(), z10.VnB(), z2.VnB());
#define TEST_ZEROING_4(INST) \
__ Index(z0.VnB(), 0, 1); \
__ Index(z1.VnB(), 0, 1); \
__ Index(z2.VnB(), 0, 1); \
__ Index(z3.VnB(), 0, 1); \
__ INST; \
__ Orr(z10.VnB(), z10.VnB(), z0.VnB()); \
__ Orr(z10.VnB(), z10.VnB(), z1.VnB()); \
__ Orr(z10.VnB(), z10.VnB(), z2.VnB()); \
__ Orr(z10.VnB(), z10.VnB(), z3.VnB());
TEST_SVE(neon_load_zero_high) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kNEON, CPUFeatures::kSVE);
START();
__ Mov(z10.VnD(), 0); // Initialise cumulative result register.
// Initialise x0 to point to a buffer from which data is loaded. The contents
// of the buffer do not need to be defined.
int data_size = 4 * kQRegSizeInBytes;
uint8_t* data = new uint8_t[data_size];
__ Mov(x0, reinterpret_cast<uintptr_t>(data));
MemOperand mop = MemOperand(x0);
TEST_ZEROING_1(Ld1(v0.V16B(), mop));
TEST_ZEROING_1(Ld1(v0.V4H(), mop));
TEST_ZEROING_1(Ld1(v0.V16B(), v1.V16B(), mop));
TEST_ZEROING_1(Ld1(v0.V4H(), v1.V4H(), mop));
TEST_ZEROING_1(Ld1(v0.V16B(), v1.V16B(), v2.V16B(), mop));
TEST_ZEROING_1(Ld1(v0.V4H(), v1.V4H(), v2.V4H(), mop));
TEST_ZEROING_1(Ld1(v0.V16B(), v1.V16B(), v2.V16B(), v3.V16B(), mop));
TEST_ZEROING_1(Ld1(v0.V4H(), v1.V4H(), v2.V4H(), v3.V4H(), mop));
TEST_ZEROING_1(Ld1(v0.B(), 1, mop));
TEST_ZEROING_1(Ld1(v0.D(), 1, mop));
TEST_ZEROING_1(Ld1(v0.H(), 1, mop));
TEST_ZEROING_1(Ld1(v0.S(), 1, mop));
TEST_ZEROING_1(Ld1r(v0.V16B(), mop));
TEST_ZEROING_1(Ld1r(v0.V4H(), mop));
TEST_ZEROING_2(Ld2(v0.V16B(), v1.V16B(), mop));
TEST_ZEROING_2(Ld2(v0.V4H(), v1.V4H(), mop));
TEST_ZEROING_2(Ld2(v0.B(), v1.B(), 1, mop));
TEST_ZEROING_2(Ld2(v0.D(), v1.D(), 1, mop));
TEST_ZEROING_2(Ld2(v0.H(), v1.H(), 1, mop));
TEST_ZEROING_2(Ld2(v0.S(), v1.S(), 1, mop));
TEST_ZEROING_2(Ld2r(v0.V16B(), v1.V16B(), mop));
TEST_ZEROING_2(Ld2r(v0.V4H(), v1.V4H(), mop));
TEST_ZEROING_3(Ld3(v0.V16B(), v1.V16B(), v2.V16B(), mop));
TEST_ZEROING_3(Ld3(v0.V4H(), v1.V4H(), v2.V4H(), mop));
TEST_ZEROING_3(Ld3(v0.B(), v1.B(), v2.B(), 1, mop));
TEST_ZEROING_3(Ld3(v0.D(), v1.D(), v2.D(), 1, mop));
TEST_ZEROING_3(Ld3(v0.H(), v1.H(), v2.H(), 1, mop));
TEST_ZEROING_3(Ld3(v0.S(), v1.S(), v2.S(), 1, mop));
TEST_ZEROING_3(Ld3r(v0.V16B(), v1.V16B(), v2.V16B(), mop));
TEST_ZEROING_3(Ld3r(v0.V4H(), v1.V4H(), v2.V4H(), mop));
TEST_ZEROING_4(Ld4(v0.V16B(), v1.V16B(), v2.V16B(), v3.V16B(), mop));
TEST_ZEROING_4(Ld4(v0.V4H(), v1.V4H(), v2.V4H(), v3.V4H(), mop));
TEST_ZEROING_4(Ld4(v0.B(), v1.B(), v2.B(), v3.B(), 1, mop));
TEST_ZEROING_4(Ld4(v0.D(), v1.D(), v2.D(), v3.D(), 1, mop));
TEST_ZEROING_4(Ld4(v0.H(), v1.H(), v2.H(), v3.H(), 1, mop));
TEST_ZEROING_4(Ld4(v0.S(), v1.S(), v2.S(), v3.S(), 1, mop));
TEST_ZEROING_4(Ld4r(v0.V16B(), v1.V16B(), v2.V16B(), v3.V16B(), mop));
TEST_ZEROING_4(Ld4r(v0.V4H(), v1.V4H(), v2.V4H(), v3.V4H(), mop));
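// Check the accumulated result, as above: shift out the low 128 bits of z10
// and compare the remainder with zero.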
__ Mov(z11.VnD(), 0);
Label done, zero_127_to_0;
__ Rdvl(x0, 1);
__ Cmp(x0, 16);
__ B(gt, &zero_127_to_0);
// For a 128-bit VL, there are no bits above 127 to test, so zero the whole
// register.
__ Mov(z10.VnD(), 0);
__ B(&done);
// Shift out the low 128 bits, which legitimately hold non-zero results.
__ Bind(&zero_127_to_0);
__ Ext(z10.VnB(), z10.VnB(), z11.VnB(), kDRegSizeInBytes * 2);
__ Bind(&done);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z11, z10);
}
delete[] data;
}
#undef TEST_ZEROING_1
#undef TEST_ZEROING_2
#undef TEST_ZEROING_3
#undef TEST_ZEROING_4
TEST_SVE(sve_load_store_sp_base_regression_test) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
__ Mov(x0, 0);
__ Mov(z0.VnB(), 0);
__ Ptrue(p0.VnB());
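// Push 128 pairs of xzr to reserve (and zero) 2KB of stack below sp, so that
// the [sp]-based accesses below touch valid, owned memory. The matching Drop
// after the scope reclaims this space.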
Label loop;
__ Mov(x1, 128);
__ Bind(&loop);
__ Push(xzr, xzr);
__ Sub(x1, x1, 1);
__ Cbnz(x1, &loop);
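// Every encoding below has its base register field (bits [9:5]) set to
// 0b11111, which must be decoded as sp for addressing purposes, not xzr. Raw
// `dci` encodings are used so that exactly these bit patterns are exercised,
// independently of how the MacroAssembler might encode equivalent
// instructions.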
{
ExactAssemblyScope scope(&masm, 193 * kInstructionSize);
__ dci(0xa420a3e0); // ld1b {z0.h}, p0/z, [sp]
__ dci(0xa440a3e0); // ld1b {z0.s}, p0/z, [sp]
__ dci(0xa460a3e0); // ld1b {z0.d}, p0/z, [sp]
__ dci(0xa400a3e0); // ld1b {z0.b}, p0/z, [sp]
__ dci(0xa42043e0); // ld1b {z0.h}, p0/z, [sp, x0]
__ dci(0xa44043e0); // ld1b {z0.s}, p0/z, [sp, x0]
__ dci(0xa46043e0); // ld1b {z0.d}, p0/z, [sp, x0]
__ dci(0xa40043e0); // ld1b {z0.b}, p0/z, [sp, x0]
__ dci(0xc440c3e0); // ld1b {z0.d}, p0/z, [sp, z0.d]
__ dci(0xa5e0a3e0); // ld1d {z0.d}, p0/z, [sp]
__ dci(0xa5e043e0); // ld1d {z0.d}, p0/z, [sp, x0, lsl #3]
__ dci(0xc5e0c3e0); // ld1d {z0.d}, p0/z, [sp, z0.d, lsl #3]
__ dci(0xc5c0c3e0); // ld1d {z0.d}, p0/z, [sp, z0.d]
__ dci(0xa4a0a3e0); // ld1h {z0.h}, p0/z, [sp]
__ dci(0xa4c0a3e0); // ld1h {z0.s}, p0/z, [sp]
__ dci(0xa4e0a3e0); // ld1h {z0.d}, p0/z, [sp]
__ dci(0xa4a043e0); // ld1h {z0.h}, p0/z, [sp, x0, lsl #1]
__ dci(0xa4c043e0); // ld1h {z0.s}, p0/z, [sp, x0, lsl #1]
__ dci(0xa4e043e0); // ld1h {z0.d}, p0/z, [sp, x0, lsl #1]
__ dci(0xc4e0c3e0); // ld1h {z0.d}, p0/z, [sp, z0.d, lsl #1]
__ dci(0xc4c0c3e0); // ld1h {z0.d}, p0/z, [sp, z0.d]
__ dci(0x8440a3e0); // ld1rb {z0.h}, p0/z, [sp]
__ dci(0x8440c3e0); // ld1rb {z0.s}, p0/z, [sp]
__ dci(0x8440e3e0); // ld1rb {z0.d}, p0/z, [sp]
__ dci(0x844083e0); // ld1rb {z0.b}, p0/z, [sp]
__ dci(0x85c0e3e0); // ld1rd {z0.d}, p0/z, [sp]
__ dci(0x84c0a3e0); // ld1rh {z0.h}, p0/z, [sp]
__ dci(0x84c0c3e0); // ld1rh {z0.s}, p0/z, [sp]
__ dci(0x84c0e3e0); // ld1rh {z0.d}, p0/z, [sp]
__ dci(0xa40023e0); // ld1rqb {z0.b}, p0/z, [sp]
__ dci(0xa40003e0); // ld1rqb {z0.b}, p0/z, [sp, x0]
__ dci(0xa58023e0); // ld1rqd {z0.d}, p0/z, [sp]
__ dci(0xa58003e0); // ld1rqd {z0.d}, p0/z, [sp, x0, lsl #3]
__ dci(0xa48023e0); // ld1rqh {z0.h}, p0/z, [sp]
__ dci(0xa48003e0); // ld1rqh {z0.h}, p0/z, [sp, x0, lsl #1]
__ dci(0xa50023e0); // ld1rqw {z0.s}, p0/z, [sp]
__ dci(0xa50003e0); // ld1rqw {z0.s}, p0/z, [sp, x0, lsl #2]
__ dci(0x85c0c3e0); // ld1rsb {z0.h}, p0/z, [sp]
__ dci(0x85c0a3e0); // ld1rsb {z0.s}, p0/z, [sp]
__ dci(0x85c083e0); // ld1rsb {z0.d}, p0/z, [sp]
__ dci(0x8540a3e0); // ld1rsh {z0.s}, p0/z, [sp]
__ dci(0x854083e0); // ld1rsh {z0.d}, p0/z, [sp]
__ dci(0x84c083e0); // ld1rsw {z0.d}, p0/z, [sp]
__ dci(0x8540c3e0); // ld1rw {z0.s}, p0/z, [sp]
__ dci(0x8540e3e0); // ld1rw {z0.d}, p0/z, [sp]
__ dci(0xa5c0a3e0); // ld1sb {z0.h}, p0/z, [sp]
__ dci(0xa5a0a3e0); // ld1sb {z0.s}, p0/z, [sp]
__ dci(0xa580a3e0); // ld1sb {z0.d}, p0/z, [sp]
__ dci(0xa5c043e0); // ld1sb {z0.h}, p0/z, [sp, x0]
__ dci(0xa5a043e0); // ld1sb {z0.s}, p0/z, [sp, x0]
__ dci(0xa58043e0); // ld1sb {z0.d}, p0/z, [sp, x0]
__ dci(0xc44083e0); // ld1sb {z0.d}, p0/z, [sp, z0.d]
__ dci(0xa520a3e0); // ld1sh {z0.s}, p0/z, [sp]
__ dci(0xa500a3e0); // ld1sh {z0.d}, p0/z, [sp]
__ dci(0xa52043e0); // ld1sh {z0.s}, p0/z, [sp, x0, lsl #1]
__ dci(0xa50043e0); // ld1sh {z0.d}, p0/z, [sp, x0, lsl #1]
__ dci(0xc4e083e0); // ld1sh {z0.d}, p0/z, [sp, z0.d, lsl #1]
__ dci(0xc4c083e0); // ld1sh {z0.d}, p0/z, [sp, z0.d]
__ dci(0xa480a3e0); // ld1sw {z0.d}, p0/z, [sp]
__ dci(0xa48043e0); // ld1sw {z0.d}, p0/z, [sp, x0, lsl #2]
__ dci(0xc56083e0); // ld1sw {z0.d}, p0/z, [sp, z0.d, lsl #2]
__ dci(0xc54083e0); // ld1sw {z0.d}, p0/z, [sp, z0.d]
__ dci(0xa540a3e0); // ld1w {z0.s}, p0/z, [sp]
__ dci(0xa560a3e0); // ld1w {z0.d}, p0/z, [sp]
__ dci(0xa54043e0); // ld1w {z0.s}, p0/z, [sp, x0, lsl #2]
__ dci(0xa56043e0); // ld1w {z0.d}, p0/z, [sp, x0, lsl #2]
__ dci(0xc560c3e0); // ld1w {z0.d}, p0/z, [sp, z0.d, lsl #2]
__ dci(0xc540c3e0); // ld1w {z0.d}, p0/z, [sp, z0.d]
__ dci(0xa420e3e0); // ld2b {z0.b, z1.b}, p0/z, [sp]
__ dci(0xa420c3e0); // ld2b {z0.b, z1.b}, p0/z, [sp, x0]
__ dci(0xa5a0e3e0); // ld2d {z0.d, z1.d}, p0/z, [sp]
__ dci(0xa5a0c3e0); // ld2d {z0.d, z1.d}, p0/z, [sp, x0, lsl #3]
__ dci(0xa4a0e3e0); // ld2h {z0.h, z1.h}, p0/z, [sp]
__ dci(0xa4a0c3e0); // ld2h {z0.h, z1.h}, p0/z, [sp, x0, lsl #1]
__ dci(0xa520e3e0); // ld2w {z0.s, z1.s}, p0/z, [sp]
__ dci(0xa520c3e0); // ld2w {z0.s, z1.s}, p0/z, [sp, x0, lsl #2]
__ dci(0xa440e3e0); // ld3b {z0.b, z1.b, z2.b}, p0/z, [sp]
__ dci(0xa440c3e0); // ld3b {z0.b, z1.b, z2.b}, p0/z, [sp, x0]
__ dci(0xa5c0e3e0); // ld3d {z0.d, z1.d, z2.d}, p0/z, [sp]
__ dci(0xa5c0c3e0); // ld3d {z0.d, z1.d, z2.d}, p0/z, [sp, x0, lsl #3]
__ dci(0xa4c0e3e0); // ld3h {z0.h, z1.h, z2.h}, p0/z, [sp]
__ dci(0xa4c0c3e0); // ld3h {z0.h, z1.h, z2.h}, p0/z, [sp, x0, lsl #1]
__ dci(0xa540e3e0); // ld3w {z0.s, z1.s, z2.s}, p0/z, [sp]
__ dci(0xa540c3e0); // ld3w {z0.s, z1.s, z2.s}, p0/z, [sp, x0, lsl #2]
__ dci(0xa460e3e0); // ld4b {z0.b, z1.b, z2.b, z3.b}, p0/z, [sp]
__ dci(0xa460c3e0); // ld4b {z0.b, z1.b, z2.b, z3.b}, p0/z, [sp, x0]
__ dci(0xa5e0e3e0); // ld4d {z0.d, z1.d, z2.d, z3.d}, p0/z, [sp]
__ dci(
0xa5e0c3e0); // ld4d {z0.d, z1.d, z2.d, z3.d}, p0/z, [sp, x0, lsl #3]
__ dci(0xa4e0e3e0); // ld4h {z0.h, z1.h, z2.h, z3.h}, p0/z, [sp]
__ dci(
0xa4e0c3e0); // ld4h {z0.h, z1.h, z2.h, z3.h}, p0/z, [sp, x0, lsl #1]
__ dci(0xa560e3e0); // ld4w {z0.s, z1.s, z2.s, z3.s}, p0/z, [sp]
__ dci(
0xa560c3e0); // ld4w {z0.s, z1.s, z2.s, z3.s}, p0/z, [sp, x0, lsl #2]
__ dci(0xa42063e0); // ldff1b {z0.h}, p0/z, [sp, x0]
__ dci(0xa44063e0); // ldff1b {z0.s}, p0/z, [sp, x0]
__ dci(0xa46063e0); // ldff1b {z0.d}, p0/z, [sp, x0]
__ dci(0xa40063e0); // ldff1b {z0.b}, p0/z, [sp, x0]
__ dci(0xc440e3e0); // ldff1b {z0.d}, p0/z, [sp, z0.d]
__ dci(0xa5e063e0); // ldff1d {z0.d}, p0/z, [sp, x0, lsl #3]
__ dci(0xc5e0e3e0); // ldff1d {z0.d}, p0/z, [sp, z0.d, lsl #3]
__ dci(0xc5c0e3e0); // ldff1d {z0.d}, p0/z, [sp, z0.d]
__ dci(0xa4a063e0); // ldff1h {z0.h}, p0/z, [sp, x0, lsl #1]
__ dci(0xa4c063e0); // ldff1h {z0.s}, p0/z, [sp, x0, lsl #1]
__ dci(0xa4e063e0); // ldff1h {z0.d}, p0/z, [sp, x0, lsl #1]
__ dci(0xc4e0e3e0); // ldff1h {z0.d}, p0/z, [sp, z0.d, lsl #1]
__ dci(0xc4c0e3e0); // ldff1h {z0.d}, p0/z, [sp, z0.d]
__ dci(0xa5c063e0); // ldff1sb {z0.h}, p0/z, [sp, x0]
__ dci(0xa5a063e0); // ldff1sb {z0.s}, p0/z, [sp, x0]
__ dci(0xa58063e0); // ldff1sb {z0.d}, p0/z, [sp, x0]
__ dci(0xc440a3e0); // ldff1sb {z0.d}, p0/z, [sp, z0.d]
__ dci(0xa52063e0); // ldff1sh {z0.s}, p0/z, [sp, x0, lsl #1]
__ dci(0xa50063e0); // ldff1sh {z0.d}, p0/z, [sp, x0, lsl #1]
__ dci(0xc4e0a3e0); // ldff1sh {z0.d}, p0/z, [sp, z0.d, lsl #1]
__ dci(0xc4c0a3e0); // ldff1sh {z0.d}, p0/z, [sp, z0.d]
__ dci(0xa48063e0); // ldff1sw {z0.d}, p0/z, [sp, x0, lsl #2]
__ dci(0xc560a3e0); // ldff1sw {z0.d}, p0/z, [sp, z0.d, lsl #2]
__ dci(0xc540a3e0); // ldff1sw {z0.d}, p0/z, [sp, z0.d]
__ dci(0xa54063e0); // ldff1w {z0.s}, p0/z, [sp, x0, lsl #2]
__ dci(0xa56063e0); // ldff1w {z0.d}, p0/z, [sp, x0, lsl #2]
__ dci(0xc560e3e0); // ldff1w {z0.d}, p0/z, [sp, z0.d, lsl #2]
__ dci(0xc540e3e0); // ldff1w {z0.d}, p0/z, [sp, z0.d]
__ dci(0xa430a3e0); // ldnf1b {z0.h}, p0/z, [sp]
__ dci(0xa450a3e0); // ldnf1b {z0.s}, p0/z, [sp]
__ dci(0xa470a3e0); // ldnf1b {z0.d}, p0/z, [sp]
__ dci(0xa410a3e0); // ldnf1b {z0.b}, p0/z, [sp]
__ dci(0xa5f0a3e0); // ldnf1d {z0.d}, p0/z, [sp]
__ dci(0xa4b0a3e0); // ldnf1h {z0.h}, p0/z, [sp]
__ dci(0xa4d0a3e0); // ldnf1h {z0.s}, p0/z, [sp]
__ dci(0xa4f0a3e0); // ldnf1h {z0.d}, p0/z, [sp]
__ dci(0xa5d0a3e0); // ldnf1sb {z0.h}, p0/z, [sp]
__ dci(0xa5b0a3e0); // ldnf1sb {z0.s}, p0/z, [sp]
__ dci(0xa590a3e0); // ldnf1sb {z0.d}, p0/z, [sp]
__ dci(0xa530a3e0); // ldnf1sh {z0.s}, p0/z, [sp]
__ dci(0xa510a3e0); // ldnf1sh {z0.d}, p0/z, [sp]
__ dci(0xa490a3e0); // ldnf1sw {z0.d}, p0/z, [sp]
__ dci(0xa550a3e0); // ldnf1w {z0.s}, p0/z, [sp]
__ dci(0xa570a3e0); // ldnf1w {z0.d}, p0/z, [sp]
__ dci(0xa400e3e0); // ldnt1b {z0.b}, p0/z, [sp]
__ dci(0xa400c3e0); // ldnt1b {z0.b}, p0/z, [sp, x0]
__ dci(0xa580e3e0); // ldnt1d {z0.d}, p0/z, [sp]
__ dci(0xa580c3e0); // ldnt1d {z0.d}, p0/z, [sp, x0, lsl #3]
__ dci(0xa480e3e0); // ldnt1h {z0.h}, p0/z, [sp]
__ dci(0xa480c3e0); // ldnt1h {z0.h}, p0/z, [sp, x0, lsl #1]
__ dci(0xa500e3e0); // ldnt1w {z0.s}, p0/z, [sp]
__ dci(0xa500c3e0); // ldnt1w {z0.s}, p0/z, [sp, x0, lsl #2]
__ dci(0x858043e0); // ldr z0, [sp]
__ dci(0xe400e3e0); // st1b {z0.b}, p0, [sp]
__ dci(0xe40043e0); // st1b {z0.b}, p0, [sp, x0]
__ dci(0xe400a3e0); // st1b {z0.d}, p0, [sp, z0.d]
__ dci(0xe5e0e3e0); // st1d {z0.d}, p0, [sp]
__ dci(0xe5e043e0); // st1d {z0.d}, p0, [sp, x0, lsl #3]
__ dci(0xe5a0a3e0); // st1d {z0.d}, p0, [sp, z0.d, lsl #3]
__ dci(0xe580a3e0); // st1d {z0.d}, p0, [sp, z0.d]
__ dci(0xe4e0e3e0); // st1h {z0.d}, p0, [sp]
__ dci(0xe4e043e0); // st1h {z0.d}, p0, [sp, x0, lsl #1]
__ dci(0xe4a0a3e0); // st1h {z0.d}, p0, [sp, z0.d, lsl #1]
__ dci(0xe480a3e0); // st1h {z0.d}, p0, [sp, z0.d]
__ dci(0xe560e3e0); // st1w {z0.d}, p0, [sp]
__ dci(0xe56043e0); // st1w {z0.d}, p0, [sp, x0, lsl #2]
__ dci(0xe430e3e0); // st2b {z0.b, z1.b}, p0, [sp]
__ dci(0xe42063e0); // st2b {z0.b, z1.b}, p0, [sp, x0]
__ dci(0xe5b0e3e0); // st2d {z0.d, z1.d}, p0, [sp]
__ dci(0xe5a063e0); // st2d {z0.d, z1.d}, p0, [sp, x0, lsl #3]
__ dci(0xe4b0e3e0); // st2h {z0.h, z1.h}, p0, [sp]
__ dci(0xe4a063e0); // st2h {z0.h, z1.h}, p0, [sp, x0, lsl #1]
__ dci(0xe530e3e0); // st2w {z0.s, z1.s}, p0, [sp]
__ dci(0xe52063e0); // st2w {z0.s, z1.s}, p0, [sp, x0, lsl #2]
__ dci(0xe450e3e0); // st3b {z0.b, z1.b, z2.b}, p0, [sp]
__ dci(0xe44063e0); // st3b {z0.b, z1.b, z2.b}, p0, [sp, x0]
__ dci(0xe5d0e3e0); // st3d {z0.d, z1.d, z2.d}, p0, [sp]
__ dci(0xe5c063e0); // st3d {z0.d, z1.d, z2.d}, p0, [sp, x0, lsl #3]
__ dci(0xe4d0e3e0); // st3h {z0.h, z1.h, z2.h}, p0, [sp]
__ dci(0xe4c063e0); // st3h {z0.h, z1.h, z2.h}, p0, [sp, x0, lsl #1]
__ dci(0xe550e3e0); // st3w {z0.s, z1.s, z2.s}, p0, [sp]
__ dci(0xe54063e0); // st3w {z0.s, z1.s, z2.s}, p0, [sp, x0, lsl #2]
__ dci(0xe470e3e0); // st4b {z0.b, z1.b, z2.b, z3.b}, p0, [sp]
__ dci(0xe46063e0); // st4b {z0.b, z1.b, z2.b, z3.b}, p0, [sp, x0]
__ dci(0xe5f0e3e0); // st4d {z0.d, z1.d, z2.d, z3.d}, p0, [sp]
__ dci(0xe5e063e0); // st4d {z0.d, z1.d, z2.d, z3.d}, p0, [sp, x0, lsl #3]
__ dci(0xe4f0e3e0); // st4h {z0.h, z1.h, z2.h, z3.h}, p0, [sp]
__ dci(0xe4e063e0); // st4h {z0.h, z1.h, z2.h, z3.h}, p0, [sp, x0, lsl #1]
__ dci(0xe570e3e0); // st4w {z0.s, z1.s, z2.s, z3.s}, p0, [sp]
__ dci(0xe56063e0); // st4w {z0.s, z1.s, z2.s, z3.s}, p0, [sp, x0, lsl #2]
__ dci(0xe410e3e0); // stnt1b {z0.b}, p0, [sp]
__ dci(0xe40063e0); // stnt1b {z0.b}, p0, [sp, x0]
__ dci(0xe590e3e0); // stnt1d {z0.d}, p0, [sp]
__ dci(0xe58063e0); // stnt1d {z0.d}, p0, [sp, x0, lsl #3]
__ dci(0xe490e3e0); // stnt1h {z0.h}, p0, [sp]
__ dci(0xe48063e0); // stnt1h {z0.h}, p0, [sp, x0, lsl #1]
__ dci(0xe510e3e0); // stnt1w {z0.s}, p0, [sp]
__ dci(0xe50063e0); // stnt1w {z0.s}, p0, [sp, x0, lsl #2]
__ dci(0x858003e0); // ldr p0, [sp]
__ dci(0xe58003e0); // str p0, [sp]
__ dci(0xe58043e0); // str z0, [sp]
}
__ Drop(128 * 2 * kXRegSizeInBytes);
END();
if (CAN_RUN()) {
RUN();
// No checks are made here. The test is designed to ensure that the base
// register is interpreted as sp, not xzr. If it were interpreted as xzr,
// the accesses would target addresses near zero, fault, and cause the test
// to fail.
}
}
// Manually constructed simulator test to avoid creating a VL128 variant.
#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
void Test_sve_fmatmul(Test* config) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVEF64MM);
// Only double-precision matrix multiply is tested here. Single-precision is
// tested in the simulator tests using a generated sequence. The (templated)
// code used in the simulator for both cases is the same, which is why the
// tests here don't need to be comprehensive.
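// Fmmla operates on each 256-bit segment independently: the segment of the
// first source is treated as a 2x2 matrix, which is multiplied by the 2x2
// matrix in the corresponding segment of the second source (used transposed,
// as the worked examples in the comments below illustrate), then accumulated
// into the destination segment.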
START();
Label vl_too_short;
__ Rdvl(x0, 1);
__ Cmp(x0, 32);
__ B(lt, &vl_too_short); // Skip testing VL128.
__ Fdup(z0.VnD(), 1.0);
__ Fdup(z1.VnD(), 2.0);
__ Mov(z2.VnD(), 0);
// Build a 2x2 identity matrix in each 256-bit segment of z3.
Label iden_loop;
__ Lsr(x0, x0, 5);  // x0 = the number of 256-bit segments in the VL.
__ Bind(&iden_loop);
__ Insr(z3.VnD(), d0);
__ Insr(z3.VnD(), d2);
__ Insr(z3.VnD(), d2);
__ Insr(z3.VnD(), d0);
__ Sub(x0, x0, 1);
__ Cbnz(x0, &iden_loop);
__ Fmmla(z1.VnD(), z1.VnD(), z0.VnD(), z0.VnD());  // z1 = z1 + (z0 * z0)
__ Fmmla(z2.VnD(), z2.VnD(), z1.VnD(), z3.VnD());  // z2 = 0 + (z1 * identity)
__ Ptrue(p0.VnB());
__ Index(z4.VnD(), -8, 3);  // z4 = {-8, -5, -2, 1, 4, 7, 10, 13, ...}
__ Scvtf(z4.VnD(), p0.Merging(), z4.VnD());  // Convert the indices to doubles.
__ Mov(z5.VnD(), 0);
__ Fmmla(z4.VnD(), z4.VnD(), z4.VnD(), z4.VnD());  // z4 = z4 + (z4 * z4)
__ Fmmla(z5.VnD(), z5.VnD(), z4.VnD(), z3.VnD());  // z5 = 0 + (z4 * identity)
__ Bind(&vl_too_short);
END();
if (CAN_RUN()) {
RUN();
if (core.GetSVELaneCount(kDRegSize) >= 4) { // VL256 or longer.
ASSERT_EQUAL_SVE(z1, z2);
ASSERT_EQUAL_SVE(z4, z5);
// All results are 4.0:
// z0 z0 z1
// (1 1)(1 1) + (2 2) = (4 4)
// (1 1)(1 1) (2 2) (4 4)
uint64_t z1_expected[] =
{0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
0x4010000000000000, 0x4010000000000000};
ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
// The first multiplications (at the highest z4_expected indices) are:
// z4 z4 z4
// (-8 -5)(-8 -2) + (-8 -5) = (81 6)
// (-2 1)(-5 1) (-2 1) ( 9 6)
//
// ( 4 7)( 4 10) + ( 4 7) = ( 69 138)
// (10 13)( 7 13) (10 13) (141 282)
uint64_t z4_expected[] = {
0x40cb690000000000, 0x40c9728000000000, 0x40c9710000000000,
0x40c79e8000000000, 0x40c41f0000000000, 0x40c2708000000000,
0x40c26f0000000000, 0x40c0e48000000000, 0x40bbea0000000000,
0x40b91d0000000000, 0x40b91a0000000000, 0x40b6950000000000,
0x40b1d60000000000, 0x40af320000000000, 0x40af2c0000000000,
0x40ab420000000000, 0x40a4040000000000, 0x40a0aa0000000000,
0x40a0a40000000000, 0x409bb40000000000, 0x4091b80000000000,
0x408a880000000000, 0x408a700000000000, 0x4083c80000000000,
0x4071a00000000000, 0x4061a00000000000, 0x4061400000000000,
0x4051400000000000, 0x4018000000000000, 0x4022000000000000,
0x4018000000000000, 0x4054400000000000,
};
ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
}
}
}
Test* test_sve_fmatmul_list[] =
{Test::MakeSVETest(256, "AARCH64_ASM_sve_fmatmul_vl256", &Test_sve_fmatmul),
Test::MakeSVETest(512, "AARCH64_ASM_sve_fmatmul_vl512", &Test_sve_fmatmul),
Test::MakeSVETest(2048,
"AARCH64_ASM_sve_fmatmul_vl2048",
&Test_sve_fmatmul)};
void Test_sve_ld1ro(Test* config) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVEF64MM);
START();
int data_size = (kQRegSizeInBytes + 128) * 4;
uint8_t* data = new uint8_t[data_size];
for (int i = 0; i < data_size; i++) {
data[i] = i & 0xff;
}
// Set the base just past half-way through the buffer so that negative
// offsets can be used; the extra +7 leaves the base unaligned.
__ Mov(x0, reinterpret_cast<uintptr_t>(&data[7 + data_size / 2]));
__ Index(z0.VnB(), 0, 1);
__ Ptrue(p0.VnB());
__ Cmplo(p0.VnB(), p0.Zeroing(), z0.VnB(), 4);  // p0 = byte lanes 0 to 3.
__ Pfalse(p1.VnB());
__ Zip1(p1.VnB(), p0.VnB(), p1.VnB());  // p1 = byte lanes 0, 2, 4 and 6.
__ Ptrue(p2.VnB());
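// Each even/odd register pair below loads the same 256-bit block twice: once
// using the immediate form (which takes a multiple of 32 in the range
// [-256, 224]) and once using an equivalent scaled register offset, so that
// the pair can be checked for equality after running.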
__ Mov(x1, -32);
__ Ld1rob(z0.VnB(), p1.Zeroing(), SVEMemOperand(x0, -32));
__ Ld1rob(z1.VnB(), p1.Zeroing(), SVEMemOperand(x0, x1));
__ Mov(x1, 64 / 2);
__ Ld1roh(z2.VnH(), p2.Zeroing(), SVEMemOperand(x0, 64));
__ Ld1roh(z3.VnH(), p2.Zeroing(), SVEMemOperand(x0, x1, LSL, 1));
__ Mov(x1, -96 / 4);
__ Ld1row(z4.VnS(), p2.Zeroing(), SVEMemOperand(x0, -96));
__ Ld1row(z5.VnS(), p2.Zeroing(), SVEMemOperand(x0, x1, LSL, 2));
__ Mov(x1, 128 / 8);
__ Ld1rod(z6.VnD(), p2.Zeroing(), SVEMemOperand(x0, 128));
__ Ld1rod(z7.VnD(), p2.Zeroing(), SVEMemOperand(x0, x1, LSL, 3));
// Check that all 256-bit segments match by rotating the vector by one
// segment, eoring, and orring across the vector.
__ Dup(z11.VnQ(), z0.VnQ(), 2);
__ Mov(z8, z0);
__ Ext(z8.VnB(), z8.VnB(), z8.VnB(), 32);
__ Eor(z8.VnB(), z8.VnB(), z0.VnB());
__ Orv(b9, p2, z8.VnB());
__ Mov(z8, z2);
__ Ext(z8.VnB(), z8.VnB(), z8.VnB(), 32);
__ Eor(z8.VnB(), z8.VnB(), z2.VnB());
__ Orv(b8, p2, z8.VnB());
__ Orr(z9, z9, z8);
__ Mov(z8, z4);
__ Ext(z8.VnB(), z8.VnB(), z8.VnB(), 32);
__ Eor(z8.VnB(), z8.VnB(), z4.VnB());
__ Orv(b8, p2, z8.VnB());
__ Orr(z9, z9, z8);
__ Mov(z8, z6);
__ Ext(z8.VnB(), z8.VnB(), z8.VnB(), 32);
__ Eor(z8.VnB(), z8.VnB(), z6.VnB());
__ Orv(b8, p2, z8.VnB());
__ Orr(z9, z9, z8);
END();
if (CAN_RUN()) {
RUN();
int vl = core.GetSVELaneCount(kBRegSize) * 8;
if (vl >= 256) {
ASSERT_EQUAL_SVE(z0, z1);
ASSERT_EQUAL_SVE(z2, z3);
ASSERT_EQUAL_SVE(z4, z5);
ASSERT_EQUAL_SVE(z6, z7);
// Check the result of the rotate/eor sequence.
uint64_t expected_z9[] = {0, 0};
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
}
}
delete[] data;
}
Test* test_sve_ld1ro_list[] =
{Test::MakeSVETest(256, "AARCH64_ASM_sve_ld1ro_vl256", &Test_sve_ld1ro),
Test::MakeSVETest(512, "AARCH64_ASM_sve_ld1ro_vl512", &Test_sve_ld1ro),
Test::MakeSVETest(2048, "AARCH64_ASM_sve_ld1ro_vl2048", &Test_sve_ld1ro)};
#endif
} // namespace aarch64
} // namespace vixl